/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * \file src/runtime/vm/vm.cc
 * \brief The Relay virtual machine runtime.
 */

#include <dmlc/memory_io.h>
#include <tvm/runtime/container/adt.h>
#include <tvm/runtime/data_type.h>
#include <tvm/runtime/debug.h>
#include <tvm/runtime/logging.h>
#include <tvm/runtime/memory.h>
#include <tvm/runtime/object.h>
#include <tvm/runtime/vm/vm.h>

#include <algorithm>
#include <chrono>
#include <iostream>
#include <stdexcept>
#include <vector>

#include "../file_utils.h"

using namespace tvm::runtime;

namespace tvm {
namespace runtime {
namespace vm {
TVM_REGISTER_OBJECT_TYPE(VMClosureObj);

VMClosure::VMClosure(size_t func_index, std::vector<ObjectRef> free_vars) {
  auto ptr = make_object<VMClosureObj>();
  ptr->func_index = func_index;
  ptr->free_vars = std::move(free_vars);
  data_ = std::move(ptr);
}

void VMFunctionPrint(std::ostream& os, const VMFunction& vm_func) {
  os << vm_func.name << ": " << std::endl;
  for (size_t i = 0; i < vm_func.instructions.size(); ++i) {
    os << i << ": " << vm_func.instructions[i] << ";" << std::endl;
  }
}

std::ostream& operator<<(std::ostream& os, const VMFunction& vm_func) {
  VMFunctionPrint(os, vm_func);
  return os;
}

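// Copies an NDArray, or recursively an ADT of NDArrays, to the given device.
// Values already resident on the target device are returned as-is.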
inline ObjectRef CopyTo(ObjectRef src, const DLDevice& dev) {
  if (src->IsInstance<NDArray::ContainerType>()) {
    auto nd_array = Downcast<NDArray>(src);
    // TODO(mbs): Should respect device id also.
    // TODO(vvchernov): it still does not work for different device id
    // due to simple implementation of Get() and AllocDataSpace() methods
    // see tvm/src/runtime/c_runtime_api.cc: L139
    // tvm/src/runtime/cpu_device_api.cc: L47
    if (nd_array->device.device_type != dev.device_type ||
        nd_array->device.device_id != dev.device_id) {
      VLOG(2) << "copying from " << nd_array->device.device_type << "["
              << nd_array->device.device_id << "] to " << dev.device_type << "[" << dev.device_id
              << "]";
      return nd_array.CopyTo(dev);
    }
    return src;
  } else {
    ICHECK(src->IsInstance<ADTObj>())
        << "VM data must be NDArray or a list of NDArray, but received: " << src->_type_key;
    std::vector<ObjectRef> ret;
    ADT adt = Downcast<ADT>(src);
    for (size_t i = 0; i < adt.size(); i++) {
      ret.push_back(CopyTo(adt[i], dev));
    }
    return ADT(adt->tag, ret.begin(), ret.end());
  }
}

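// Converts a rank-0 or rank-1 integer shape tensor into a vector of dimension sizes.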
std::vector<int64_t> ToShape(NDArray shape_tensor) {
  std::vector<int64_t> shape;
  auto rank = shape_tensor.Shape().size();
  auto dtype = shape_tensor.DataType();

  // For a 0-rank shape tensor return an empty shape, which denotes a single scalar allocation.
  if (rank == 0) {
    return shape;
  }

  // Otherwise we should be rank-1, and we will extract the number of dimensions
  // for the output vector.
  ICHECK_EQ(rank, 1U) << "shape tensor should be a k-length vector, found " << rank;
  int64_t ndim = shape_tensor.Shape().at(0);
  shape.resize(ndim);

  const DLTensor* dl_tensor = shape_tensor.operator->();
  if (dtype.is_int() && dtype.bits() == 32 && dtype.lanes() == 1) {
    int32_t* dims = reinterpret_cast<int32_t*>(dl_tensor->data);
    shape.assign(dims, dims + ndim);
  } else if (dtype.is_int() && dtype.bits() == 64 && dtype.lanes() == 1) {
    int64_t* dims = reinterpret_cast<int64_t*>(dl_tensor->data);
    shape.assign(dims, dims + ndim);
  } else {
    LOG(FATAL) << "invalid shape tensor datatype: " << dtype;
  }

  return shape;
}

void VirtualMachine::OpStartHook(Instruction instr) {}
void VirtualMachine::OpStopHook() {}

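// Dispatches the module's packed-function API (invoke, init, set_input, get_output, etc.)
// by name; unknown names are a fatal error.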
PackedFunc VirtualMachine::GetFunction(const std::string& name,
                                       const ObjectPtr<Object>& sptr_to_self) {
  if (name == "invoke") {
    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
      ICHECK(exec_) << "The executable is not created yet.";

      std::string func_name = args[0];
      auto git = exec_->global_map.find(func_name);
      ICHECK(git != exec_->global_map.end())
          << "Cannot find function " << func_name << " in the executable";
      auto func = exec_->functions[git->second];
      if (func.params.empty()) {
        *rv = Invoke(func, {});
      } else {
        auto it = inputs_.find(func_name);
        ICHECK(it != inputs_.end()) << "Input has not been set for function " << func_name;
        const std::vector<ObjectRef>& input_args = it->second;
        if (set_outputs_enabled_.count(func_name) && set_outputs_enabled_[func_name]) {
          ICHECK(outputs_.count(func_name))
              << "Outputs have not been set for function " << func_name;
          *rv = Invoke(func, input_args, outputs_[func_name]);
          outputs_[func_name].clear();
          set_outputs_enabled_[func_name] = false;
        } else {
          *rv = Invoke(func, input_args);
        }
      }
    });
  } else if (name == "invoke_stateful") {
    // TODO(tkonolige, jroesch, tqchen): invoke_stateful and get_output are a
    // stop-gap measure to allow using the VM over a remote connection.
    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
      PackedFunc invoke = GetFunction("invoke", sptr_to_self);
      TVMRetValue rv_;
      invoke.CallPacked(args, &rv_);
    });
  } else if (name == "invoke_return_to_device") {
    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
      Device host{static_cast<DLDeviceType>(args[1].operator int()), args[2].operator int()};

      SetInput(args[0].operator std::string(), args, 3);
      PackedFunc invoke = GetFunction("invoke", sptr_to_self);
      TVMRetValue rv_;
      invoke.CallPacked(args, &rv_);  // Invoke only uses the first arg, so the rest of the args
                                      // should not cause an issue
      if (rv_.type_code() == kTVMObjectHandle) {
        ADT adt = Downcast<ADT>(rv_.operator ObjectRef());
        std::vector<ObjectRef> transferred;
        for (size_t i = 0; i < adt.size(); i++) {
          transferred.push_back(CopyTo(adt[i], host));
        }
        *rv = ADT(adt.tag(), transferred);
      } else {
        *rv = CopyTo(rv_, host);
      }
    });
  } else if (name == "get_output") {
    return TypedPackedFunc<NDArray(int64_t)>([this](int64_t index) {
      if (this->return_register_.as<ADTObj>()) {
        return Downcast<NDArray>(Downcast<ADT>(this->return_register_)[index]);
      } else {
        CHECK_EQ(index, 0) << "VM output contains only one item, but you are trying to get the "
                           << index << "th.";
        return Downcast<NDArray>(this->return_register_);
      }
    });
  } else if (name == "get_num_outputs") {
    return TypedPackedFunc<int64_t(void)>([this]() -> int64_t {
      // A single output is an NDArray, not an ADT.
      if (this->return_register_.as<ADTObj>()) {
        return Downcast<ADT>(this->return_register_).size();
      } else {
        return 1;
      }
    });
  } else if (name == "get_input_index") {
    return TypedPackedFunc<int64_t(std::string, std::string)>(
        [this](std::string input_name, std::string func_name) {
          return GetInputIndexFromVMFunction(func_name, input_name);
        });
  } else if (name == "init") {
    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
      ICHECK_EQ(args.size() % 3, 0);
      std::vector<Device> devices;
      std::vector<AllocatorType> alloc_types;
      for (int i = 0; i < args.size() / 3; ++i) {
        Device dev;
        int device_type = args[i * 3];
        dev.device_type = DLDeviceType(device_type);
        dev.device_id = args[i * 3 + 1];
        int type = args[i * 3 + 2];
        devices.push_back(dev);
        alloc_types.push_back(AllocatorType(type));
      }
      this->Init(devices, alloc_types);
    });
  } else if (name == "set_input") {
    return PackedFunc(
        [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { SetInput(args[0], args, 1); });
  } else if (name == "set_one_input") {
    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
      ICHECK_EQ(args.size(), 3) << "The expected number of arguments is 3 "
                                << "(func_name, index or name, tensor)";
      SetOneInput(args[0], args[1], args[2]);
    });
  } else if (name == "set_outputs") {
    return PackedFunc(
        [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { SetOutputs(args[0], args); });
  } else if (name == "load_late_bound_consts") {
    return PackedFunc([this](TVMArgs args, TVMRetValue* rv) {
      CHECK_EQ(args.size(), 1);
      std::string path = args[0];
      exec_->LoadLateBoundConstantsFromFile(path);
    });
  } else {
    LOG(FATAL) << "Unknown packed function: " << name;
  }
}

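// Records the call arguments starting at `offset` as the inputs of `func_name`, placing each
// tensor on the device required by the corresponding parameter.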
void VirtualMachine::SetInput(std::string func_name, TVMArgs args, int offset) {
  const auto& vm_func = CheckAndGetVMFunction(func_name);
  size_t params_num = vm_func.params.size();
  ICHECK_EQ(args.size() - offset, params_num)
      << "The number of provided arguments doesn't match the number of function parameters";
  std::vector<ObjectRef> func_args(params_num);
  for (int i = offset; i < args.size(); ++i) {
    int index = i - offset;
    Device dev = GetDevice(vm_func.param_device_indexes[index]);
    SetInputTensorWithIndex(func_args, args[i], index, dev);
  }
  inputs_.erase(func_name);
  inputs_.emplace(func_name, func_args);
}

void VirtualMachine::SetOneInput(std::string func_name, const TVMArgValue& tag,
                                 const TVMArgValue& tensor) {
  const auto& vm_func = CheckAndGetVMFunction(func_name);
  size_t params_num = vm_func.params.size();

  int inp_index = 0;
  if (tag.type_code() == kTVMArgInt) {
    inp_index = tag;
  } else if (tag.type_code() == kTVMStr) {
    inp_index = static_cast<int>(GetInputIndexFromName(vm_func.params, tag));
  } else {
    LOG(FATAL) << "The type of the input tensor tag (" << tag.type_code()
               << ") is neither integer nor string";
  }
  ICHECK_LT(inp_index, params_num);

  CreateInputsOrCheckSize(func_name, params_num);
  Device dev = GetDevice(vm_func.param_device_indexes[inp_index]);
  SetInputTensorWithIndex(inputs_[func_name], tensor, inp_index, dev);
}

void VirtualMachine::SetOutputs(std::string func_name, TVMArgs args) {
  set_outputs_enabled_[func_name] = true;
  size_t outputs_size = args.size();
  // The first arg is func_name.
  ICHECK_GT(outputs_size, 1) << "No output arguments were set";

  std::vector<ObjectRef> func_args(outputs_size - 1);
  for (size_t i = 1; i < outputs_size; ++i) {
    // TODO(vvchernov): device?
    func_args[i - 1] = TensorFromTVMArgValueToObjectRef(args[i]);
  }
  outputs_.erase(func_name);
  outputs_.emplace(func_name, func_args);
}

void VirtualMachine::PrintInfoAndSetInputArgs(const VMFunction& func,
                                              const std::vector<ObjectRef>& args) {
  VLOG(2) << "Executing Function: " << std::endl << func;
  for (int i = 0; i < static_cast<int>(devices_.size()); ++i) {
    VLOG(2) << "Device " << i << " has device type " << devices_[i].device_type << " and device id "
            << devices_[i].device_id
            << (i == exec_->host_device_index ? " (using as host device)" : "");
  }

  InvokeGlobal(func, args);
}

void VirtualMachine::SetOutputTensorsToRegister(const std::string& func_name,
                                                const std::vector<ObjectRef>& outputs) {
  size_t size = outputs.size();

  if (output_tensor_reg_indices_[func_name].empty()) {
    output_tensor_reg_indices_[func_name] = GetOutputTensorRegIndices();
  }
  auto& reg_indices = output_tensor_reg_indices_[func_name];
  ICHECK_EQ(reg_indices.size(), size)
      << "The number of external output tensors must equal the number of model outputs";
  size_t i = 0;
  for (auto it = reg_indices.begin(); it != reg_indices.end(); ++it, ++i) {
    WriteRegister(*it, outputs[i]);
  }
}

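// Wraps an output tensor argument (a DLTensor handle or an NDArray) as an ObjectRef without
// copying the underlying data.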
ObjectRef VirtualMachine::TensorFromTVMArgValueToObjectRef(const TVMArgValue& output_tensor) const {
  if (output_tensor.type_code() == kTVMDLTensorHandle) {
    DLTensor* dl_tensor = output_tensor;
    return NDArray::FromExternalDLTensor(*dl_tensor);
  } else if (output_tensor.type_code() == kTVMNDArrayHandle) {
    return output_tensor.AsObjectRef<tvm::runtime::NDArray>();
  } else {
    LOG(FATAL) << "Only tensors of DLTensor or NDArray type are supported! The given type code is "
               << output_tensor.type_code();
  }
  return ObjectRef();
}

int64_t VirtualMachine::GetInputIndexFromVMFunction(const std::string& func_name,
                                                    const std::string& input_name) const {
  const auto& vm_func = CheckAndGetVMFunction(func_name);
  return GetInputIndexFromName(vm_func.params, input_name);
}

int64_t VirtualMachine::GetInputIndexFromName(const std::vector<std::string>& params,
                                              const std::string& input_name) const {
  // TODO(vvchernov): excess integer type?
  for (uint64_t i = 0; i < params.size(); i++) {
    if (input_name == params[i]) {
      return static_cast<int64_t>(i);
    }
  }
  return static_cast<int64_t>(-1);
}

const VMFunction& VirtualMachine::CheckAndGetVMFunction(const std::string& func_name) const {
  ICHECK(exec_) << "The executable is not created yet.";
  return exec_->GetVMFunctionWithName(func_name);
}

void VirtualMachine::CreateInputsOrCheckSize(const std::string& func_name, size_t size) {
  if (inputs_.count(func_name)) {
    ICHECK_EQ(inputs_[func_name].size(), size)
        << "The input size of function " << func_name
        << " doesn't match the number of provided parameters";
  } else {
    std::vector<ObjectRef> func_args(size);
    inputs_.emplace(func_name, func_args);
  }
}

void VirtualMachine::SetInputTensorWithIndex(std::vector<ObjectRef>& tensors,
                                             const TVMArgValue& inp_tensor, int index, Device dev) {
  if (inp_tensor.type_code() == kTVMDLTensorHandle) {
    if (NDArray::AbilityOfZeroCopyForDLTensor(inp_tensor, dev)) {
      tensors[index] = NDArray::FromExternalDLTensor(*inp_tensor);
    } else {
      tensors[index] = NDArray::NewFromDLTensor(inp_tensor, dev);
    }
  } else {
    tensors[index] = CopyTo(inp_tensor, dev);
  }
}

inline Device VirtualMachine::GetDevice(Index device_index) const {
  ICHECK_GT(devices_.size(), device_index) << "invalid device index: " << device_index;
  return devices_[device_index];
}

inline Allocator* VirtualMachine::GetAllocator(Index device_index) const {
  ICHECK_GT(allocators_.size(), device_index) << "invalid device index: " << device_index;
  return allocators_[device_index];
}

void VirtualMachine::PushFrame(Index arg_count, Index ret_pc, const VMFunction& vm_func) {
  auto frame = VMFrame(ret_pc, func_index_, arg_count, code_, vm_func.register_file_size);
  frames_.push_back(frame);
}

Index VirtualMachine::PopFrame() {
  ICHECK_GT(frames_.size(), 0);
  const VMFrame& fr = frames_.back();
  func_index_ = fr.func_index;
  code_ = fr.code;
  pc_ = fr.pc;
  auto call_stack_size = frames_.size();
  frames_.pop_back();
  return call_stack_size;
}

void VirtualMachine::InvokeGlobal(const VMFunction& func, const std::vector<ObjectRef>& args) {
  VLOG(2) << "Invoking global " << func.name << " with " << args.size() << " args";

  PushFrame(func.params.size(), this->pc_ + 1, func);
  for (size_t i = 0; i < args.size(); ++i) {
    WriteRegister(i, args[i]);
    VLOG(2) << "arg " << i << " = "
            << RuntimeObject2String(args[i], GetDevice(exec_->host_device_index));
  }

  code_ = func.instructions.data();
  pc_ = 0;
}

ObjectRef VirtualMachine::Invoke(const VMFunction& func, const std::vector<ObjectRef>& args) {
  PrintInfoAndSetInputArgs(func, args);
  RunLoop();
  return return_register_;
}

ObjectRef VirtualMachine::Invoke(const std::string& name, const std::vector<ObjectRef>& args) {
  ICHECK(exec_) << "The executable has not been created yet.";
  auto it = exec_->global_map.find(name);
  ICHECK(it != exec_->global_map.end()) << "Cannot find function " << name << " in the executable";
  Index func_index = it->second;
  VLOG(2) << "Invoke Global " << name << " at index " << func_index;
  return Invoke(exec_->functions[func_index], args);
}

ObjectRef VirtualMachine::Invoke(const VMFunction& func, const std::vector<ObjectRef>& input_args,
                                 const std::vector<ObjectRef>& output_args) {
  PrintInfoAndSetInputArgs(func, input_args);
  SetOutputTensorsToRegister(func.name, output_args);
  RunLoop(output_tensor_reg_indices_[func.name]);
  return return_register_;
}

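// Flattens any ADT arguments into their constituent NDArrays and calls the packed function.
// The call is skipped when the single output tensor is empty (has a zero-sized dimension).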
void VirtualMachine::InvokePacked(Index packed_index, const PackedFunc& func, Index arg_count,
                                  Index output_size, const std::vector<ObjectRef>& args) {
  size_t arity = 0;
  for (Index i = 0; i < arg_count; i++) {
    if (const auto* obj = args[i].as<ADTObj>()) {
      arity += obj->size;
    } else {
      ++arity;
    }
  }

  std::vector<TVMValue> values(arity);
  std::vector<int> codes(arity);
  runtime::TVMArgsSetter setter(values.data(), codes.data());
  int idx = 0;
  bool is_empty_output = false;
  for (Index i = 0; i < arg_count; i++) {
    if (const auto* dt_cell = args[i].as<ADTObj>()) {
      for (size_t fi = 0; fi < dt_cell->size; ++fi) {
        auto obj = (*dt_cell)[fi];
        auto nd_array = Downcast<NDArray>(obj);
        setter(idx++, nd_array);
      }
    } else {
      auto nd_array = Downcast<NDArray>(args[i]);
      // We can safely skip CallPacked if there is only one
      // output and it is empty.
      if (i == arg_count - 1 && output_size == 1) {
        for (const auto& dim : nd_array.Shape()) {
          if (!dim) {
            is_empty_output = true;
            break;
          }
        }
      }
      setter(idx++, nd_array);
    }
  }

  if (!is_empty_output) {
    TVMRetValue rv;
    func.CallPacked(TVMArgs(values.data(), codes.data(), arity), &rv);
  }
}

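// Attaches the executable to this VM and resolves every primitive function declared in its
// primitive map from the executable's kernel library.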
void VirtualMachine::LoadExecutable(const ObjectPtr<Executable>& exec) {
  ICHECK(exec) << "The executable is not created yet.";
  ICHECK(exec->late_bound_constant_names.empty())
      << "Need to load late-bound-constants before creating VM";
  exec_ = exec;

  runtime::Module lib = exec_->GetLib();

  ICHECK(exec_->primitive_map.empty() || lib.operator->())
      << "If the executable has declared primitive functions, the "
      << "generated kernel library must be non-null.";

  for (const auto& it : exec_->primitive_map) {
    const auto& packed_name = it.first;
    auto packed_index = static_cast<size_t>(it.second);
    if (packed_funcs_.size() <= packed_index) {
      packed_funcs_.resize(packed_index + 1);
    }
    tvm::runtime::PackedFunc pf = lib.GetFunction(packed_name, /*query_imports=*/true);
    ICHECK(pf != nullptr) << "Cannot find function in module: " << packed_name;
    packed_funcs_[packed_index] = pf;
  }
  for (size_t i = 0; i < packed_funcs_.size(); ++i) {
    ICHECK(packed_funcs_[i] != nullptr) << "Packed function " << i << " is not initialized";
  }
}

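// Binds each virtual device required by the executable to one of the given physical devices
// (matched by device type) and creates an allocator of the requested type for it.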
void VirtualMachine::Init(const std::vector<Device>& physical_devices,
                          const std::vector<AllocatorType>& alloc_types) {
  ICHECK_EQ(physical_devices.size(), alloc_types.size());

  // Find a physical device to represent each virtual device the VM code requires.
  // (Recall the VM instructions refer to devices by "device index" into this vector of
  // virtual devices.)
  const size_t num_virtual_devices = exec_->virtual_devices.size();
  devices_.reserve(num_virtual_devices);
  allocators_.reserve(num_virtual_devices);

  for (size_t device_index = 0; device_index < num_virtual_devices; ++device_index) {
    // We'll retain the legacy behaviour and just match by device type.
    // TODO(mbs): Generalize.
    DLDeviceType virtual_device_type = exec_->virtual_devices[device_index].device_type;
    auto itr = std::find_if(physical_devices.begin(), physical_devices.end(),
                            [virtual_device_type](const Device& physical_device) {
                              return physical_device.device_type == virtual_device_type;
                            });
    CHECK(itr != physical_devices.end())
        << "Unable to find a physical device (from among the " << physical_devices.size()
        << " given) to match the virtual device with device type " << virtual_device_type;
    const size_t i = std::distance(physical_devices.begin(), itr);
    devices_.push_back(*itr);
    allocators_.push_back(MemoryManager::GetOrCreateAllocator(*itr, alloc_types[i]));
  }
}

inline void VirtualMachine::WriteRegister(Index r, const ObjectRef& val) {
  frames_.back().register_file[r] = val;
}

ObjectRef VirtualMachine::ReadRegister(Index r) const { return frames_.back().register_file[r]; }

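// Reads register r, copying it to the host device if needed, and interprets it as a scalar
// integer of 1, 8, 16, 32, or 64 bits.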
int64_t VirtualMachine::LoadScalarInt(Index r) const {
  int64_t result = 0;
  const auto& obj = ReadRegister(r);
  NDArray array = Downcast<NDArray>(CopyTo(obj, GetDevice(exec_->host_device_index)));

  switch (array->dtype.bits) {
    case 1: {
      result = reinterpret_cast<bool*>(array->data)[0];
      break;
    }
    case 8: {
      result = reinterpret_cast<int8_t*>(array->data)[0];
      break;
    }
    case 16: {
      result = reinterpret_cast<int16_t*>(array->data)[0];
      break;
    }
    case 32: {
      result = reinterpret_cast<int32_t*>(array->data)[0];
      break;
    }
    case 64: {
      result = reinterpret_cast<int64_t*>(array->data)[0];
      break;
    }
    default:
      LOG(FATAL) << "Unknown scalar int type: " << DLDataType2String(array->dtype);
  }
  return result;
}

Index VirtualMachine::GetResultRegisterIndex() const {
  Index op_index = 0;
  while (code_[op_index].op != Opcode::Ret) {
    ++op_index;
  }

  return code_[op_index].result;
}

void VirtualMachine::CalculatePreResultOpIndex(Index res_index) {
  if (preresult_op_index_ == -1) {
    preresult_op_index_ = 0;
    while (code_[preresult_op_index_].dst != res_index) {
      ++preresult_op_index_;
    }
  }
}

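// Determines which registers hold the output tensors by inspecting the instruction that
// produces the result register (AllocTensor, AllocADT, or ReshapeTensor).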
std::vector<Index> VirtualMachine::GetOutputTensorRegIndices() {
  std::vector<Index> reg_indices;
  Index res_index = GetResultRegisterIndex();
  CalculatePreResultOpIndex(res_index);
  auto& preres_instr = code_[preresult_op_index_];
  auto op_code = preres_instr.op;
  if (op_code == Opcode::AllocTensor) {
    reg_indices.emplace_back(res_index);
  } else if (op_code == Opcode::AllocADT) {
    for (Index i = 0; i < preres_instr.num_fields; ++i) {
      reg_indices.push_back(preres_instr.datatype_fields[i]);
    }
  } else if (op_code == Opcode::ReshapeTensor) {
    reg_indices.push_back(preres_instr.reshape_tensor.tensor);
  } else {
    LOG(FATAL) << "Operation " << size_t(op_code) << " is not supported by the set_outputs method";
  }
  return reg_indices;
}

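// The main dispatch loop: executes instructions from pc_ = 0 until the frame that was on top
// of the stack when the loop started returns. Registers listed in output_tensor_reg_indices
// hold output tensors that were provided from outside via set_outputs.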
void VirtualMachine::RunLoop(const std::vector<Index>& output_tensor_reg_indices) {
  ICHECK(this->exec_);
  ICHECK(this->code_);
  pc_ = 0;
  Index frame_start = frames_.size();
  while (true) {
  main_loop:
    auto const& instr = code_[this->pc_];
    VLOG(2) << "Executing(" << pc_ << "): " << instr;

    switch (instr.op) {
      case Opcode::Move: {
        ObjectRef from_obj;
        from_obj = ReadRegister(instr.from);
        WriteRegister(instr.dst, from_obj);
        pc_++;
        goto main_loop;
      }
      case Opcode::Fatal: {
        throw std::runtime_error("VM encountered fatal error");
      }
      case Opcode::LoadConst: {
        bool is_not_cached = const_pool_.size() <= static_cast<size_t>(instr.const_index) ||
                             !const_pool_[instr.const_index].defined();
        if (is_not_cached) {
          OpStartHook(instr);
        }
        auto constant_obj = exec_->constants[instr.const_index];
        // We cache the allocated object in the constant pool. When measuring, the first
        // iteration sets the pool up; subsequent iterations directly reuse the allocated
        // objects.
        if (const_pool_.size() <= static_cast<size_t>(instr.const_index)) {
          const_pool_.resize(instr.const_index + 1);
        }

        if (!const_pool_[instr.const_index].defined()) {
          Device dev = GetDevice(exec_->const_device_indexes[instr.const_index]);
          const_pool_[instr.const_index] = CopyTo(constant_obj, dev);
        }
        WriteRegister(instr.dst, const_pool_[instr.const_index]);
        if (is_not_cached) {
          OpStopHook();
        }
        pc_++;
        goto main_loop;
      }
      case Opcode::LoadConsti: {
        auto tensor = NDArray::Empty({1}, {kDLInt, 64, 1}, GetDevice(exec_->host_device_index));
        reinterpret_cast<int64_t*>(tensor->data)[0] = instr.load_consti.val;
        WriteRegister(instr.dst, tensor);
        pc_++;
        goto main_loop;
      }
      case Opcode::Invoke: {
        std::vector<ObjectRef> args;
        for (Index i = 0; i < instr.num_args; ++i) {
          args.push_back(ReadRegister(instr.invoke_args_registers[i]));
        }
        InvokeGlobal(exec_->functions[instr.func_index], args);
        frames_.back().caller_return_register = instr.dst;
        goto main_loop;
      }
      case Opcode::InvokePacked: {
        ICHECK_LT(instr.packed_index, packed_funcs_.size());
        const auto& func = packed_funcs_[instr.packed_index];
        const auto& arity = instr.arity;
        std::vector<ObjectRef> args;
        for (Index i = 0; i < arity; ++i) {
          auto arg = ReadRegister(instr.packed_args[i]);
          args.push_back(arg);
#if TVM_LOG_DEBUG
          const bool is_input = i < arity - instr.output_size;
          VLOG(2) << (is_input ? "input" : "placeholder") << " arg " << i << " = "
                  << RuntimeObject2String(arg, GetDevice(exec_->host_device_index),
                                          /*show_contents=*/is_input);
#endif
        }

        // We no longer need to write the registers back; we write directly
        // through the registers mutably.
        InvokePacked(instr.packed_index, func, arity, instr.output_size, args);

#if TVM_LOG_DEBUG
        for (Index i = arity - instr.output_size; i < arity; ++i) {
          auto arg = ReadRegister(instr.packed_args[i]);
          VLOG(2) << "output arg " << i << " = "
                  << RuntimeObject2String(arg, GetDevice(exec_->host_device_index));
        }
#endif

        pc_++;
        goto main_loop;
      }
      case Opcode::InvokeClosure: {
        auto object = ReadRegister(instr.closure);
        const auto* closure = object.as<VMClosureObj>();
        ICHECK(closure);
        std::vector<ObjectRef> args;
        for (auto free_var : closure->free_vars) {
          args.push_back(free_var);
        }
        for (Index i = 0; i < instr.num_closure_args; ++i) {
          args.push_back(ReadRegister(instr.closure_args[i]));
        }
        InvokeGlobal(exec_->functions[closure->func_index], args);
        frames_.back().caller_return_register = instr.dst;
        goto main_loop;
      }
      case Opcode::GetField: {
        auto object = ReadRegister(instr.object);
        const auto& tuple = Downcast<ADT>(object);
        auto field = tuple[instr.field_index];
        WriteRegister(instr.dst, field);
        pc_++;
        goto main_loop;
      }
      case Opcode::GetTag: {
        auto object = ReadRegister(instr.get_tag.object);
        const auto& adt = Downcast<ADT>(object);
        auto tag = adt.tag();
        auto tag_tensor = NDArray::Empty({1}, {kDLInt, 32, 1}, GetDevice(exec_->host_device_index));
        reinterpret_cast<int32_t*>(tag_tensor->data)[0] = tag;
        WriteRegister(instr.dst, tag_tensor);
        pc_++;
        goto main_loop;
      }
      case Opcode::Goto: {
        pc_ += instr.pc_offset;
        goto main_loop;
      }
      case Opcode::If: {
        int32_t test_val = LoadScalarInt(instr.if_op.test);
        int32_t target_val = LoadScalarInt(instr.if_op.target);

        if (test_val == target_val) {
          ICHECK_NE(instr.if_op.true_offset, 0);
          pc_ += instr.if_op.true_offset;
        } else {
          ICHECK_NE(instr.if_op.false_offset, 0);
          pc_ += instr.if_op.false_offset;
        }

        goto main_loop;
      }
      case Opcode::AllocTensor: {
        OpStartHook(instr);
        if (!output_tensor_reg_indices.empty() && FindIndex(output_tensor_reg_indices, instr.dst)) {
          WriteAllocatedTensorFromOutside(instr);
        } else {
          WriteAllocatedTensor(instr);
        }
        OpStopHook();
        pc_++;
        goto main_loop;
      }
      case Opcode::AllocTensorReg: {
        OpStartHook(instr);
        Device cpu_dev = GetDevice(exec_->host_device_index);
        auto shape_obj = ReadRegister(instr.alloc_tensor_reg.shape_register);
        NDArray shape_tensor = Downcast<NDArray>(CopyTo(shape_obj, cpu_dev));
        auto shape = ToShape(shape_tensor);
        auto storage_obj = ReadRegister(instr.alloc_tensor_reg.storage);
        auto storage = Downcast<Storage>(storage_obj);
        auto offset = LoadScalarInt(instr.alloc_tensor.offset);
        auto obj = storage->AllocNDArray(offset, shape, instr.alloc_tensor_reg.dtype);
        VLOG(2) << "allocated "
                << RuntimeObject2String(obj, GetDevice(exec_->host_device_index),
                                        /*show_contents=*/false);

        WriteRegister(instr.dst, obj);
        OpStopHook();
        pc_++;
        goto main_loop;
      }
      case Opcode::AllocADT: {
        std::vector<ObjectRef> fields;
        for (Index i = 0; i < instr.num_fields; ++i) {
          fields.push_back(ReadRegister(instr.datatype_fields[i]));
        }
        ObjectRef obj = ADT(instr.constructor_tag, fields);
        WriteRegister(instr.dst, obj);
        pc_++;
        goto main_loop;
      }
      case Opcode::AllocClosure: {
        std::vector<ObjectRef> free_vars;
        for (Index i = 0; i < instr.num_freevar; i++) {
          free_vars.push_back(ReadRegister(instr.free_vars[i]));
        }
        WriteRegister(instr.dst, VMClosure(instr.func_index, free_vars));
        pc_++;
        goto main_loop;
      }
      case Opcode::AllocStorage: {
        OpStartHook(instr);
        auto size = LoadScalarInt(instr.alloc_storage.allocation_size);
        auto alignment = instr.alloc_storage.alignment;

        auto storage_obj = SimpleObjAllocator().make_object<StorageObj>();
        Allocator* allocator = GetAllocator(instr.alloc_storage.device_index);
        ICHECK(allocator) << "Did you forget to init the VirtualMachine with devices?";
        VLOG(2) << "allocating with allocation_size=" << size << ", alignment=" << alignment
                << ", dtype_hint=" << DLDataType2String(instr.alloc_storage.dtype_hint)
                << ", device_index=" << instr.alloc_storage.device_index;

        storage_obj->buffer = allocator->Alloc(size, alignment, instr.alloc_storage.dtype_hint);
        Storage storage(storage_obj);
        WriteRegister(instr.dst, storage);
        OpStopHook();
        pc_++;
        goto main_loop;
      }
      case Opcode::ShapeOf: {
        auto input = ReadRegister(instr.shape_of.tensor);
        NDArray input_array = Downcast<NDArray>(input);
        int ndim = input_array->ndim;
        auto out_tensor =
            NDArray::Empty({ndim}, {kDLInt, 64, 1}, GetDevice(exec_->host_device_index));
        for (int i = 0; i < ndim; ++i) {
          reinterpret_cast<int64_t*>(out_tensor->data)[i] = input_array->shape[i];
        }
        VLOG(2) << "shape = "
                << RuntimeObject2String(out_tensor, GetDevice(exec_->host_device_index));
        WriteRegister(instr.dst, out_tensor);
        pc_++;
        goto main_loop;
      }
      case Opcode::Ret: {
        // If we have hit the point from which we started
        // running, we should return to the caller, breaking
        // the dispatch loop.
        return_register_ = ReadRegister(instr.result);
        auto caller_return_register = frames_.back().caller_return_register;

        if (PopFrame() == frame_start) {
          return;
        } else {
          // Otherwise we are just returning from a local call.
          WriteRegister(caller_return_register, return_register_);
          goto main_loop;
        }
      }
      case Opcode::ReshapeTensor: {
        OpStartHook(instr);
        Device cpu_dev = GetDevice(exec_->host_device_index);
        auto tensor_obj = ReadRegister(instr.reshape_tensor.tensor);
        NDArray tensor_arr = Downcast<NDArray>(tensor_obj);
        // Read the shape from the shape tensor.
        auto shape_obj = ReadRegister(instr.reshape_tensor.newshape);
        NDArray shape_tensor = Downcast<NDArray>(CopyTo(shape_obj, cpu_dev));
        const DLTensor* dl_tensor = shape_tensor.operator->();
        ICHECK_EQ(dl_tensor->dtype.code, 0u);
        ICHECK_EQ(dl_tensor->dtype.bits, 64u);
        int64_t* dims = reinterpret_cast<int64_t*>(dl_tensor->data);
        int64_t ndim = shape_tensor->shape[0];
        std::vector<int64_t> shape(dims, dims + ndim);
        // Reshape the input tensor.
        auto out_tensor = tensor_arr.CreateView(shape, tensor_arr->dtype);
        VLOG(2) << "reshaped "
                << RuntimeObject2String(tensor_obj, GetDevice(exec_->host_device_index)) << " to "
                << RuntimeObject2String(out_tensor, GetDevice(exec_->host_device_index));
        WriteRegister(instr.dst, out_tensor);
        OpStopHook();
        pc_++;
        goto main_loop;
      }
      case Opcode::DeviceCopy: {
        OpStartHook(instr);
        auto tensor_src = ReadRegister(instr.device_copy.src);
        NDArray src_data = Downcast<NDArray>(tensor_src);
        Device actual_src_dev = src_data->device;
        Device inst_src_dev = GetDevice(instr.device_copy.src_device_index);
        ICHECK_EQ(actual_src_dev.device_type, inst_src_dev.device_type);
        ICHECK_EQ(actual_src_dev.device_id, inst_src_dev.device_id);
        Device dst_dev = GetDevice(instr.device_copy.dst_device_index);

        NDArray dst_data = src_data.CopyTo(dst_dev);
        WriteRegister(instr.dst, dst_data);
        OpStopHook();
        pc_++;
        goto main_loop;
      }
      case Opcode::KillRegister: {
        OpStartHook(instr);
        WriteRegister(instr.dst, ObjectRef());
        OpStopHook();
        pc_++;
        goto main_loop;
      }
      default:
        LOG(FATAL) << "Unknown instruction opcode: " << int(instr.op);
    }
  }
}

void VirtualMachine::WriteAllocatedTensor(const Instruction& instr) {
  auto shape = std::vector<int64_t>(instr.alloc_tensor.ndim);

  for (uint32_t i = 0; i < instr.alloc_tensor.ndim; ++i) {
    shape[i] = instr.alloc_tensor.shape[i];
  }

  auto storage_obj = ReadRegister(instr.alloc_tensor.storage);
  auto offset = LoadScalarInt(instr.alloc_tensor.offset);
  auto storage = Downcast<Storage>(storage_obj);
  auto obj = storage->AllocNDArray(offset, shape, instr.alloc_tensor.dtype);
  VLOG(2) << "allocated "
          << RuntimeObject2String(obj, GetDevice(exec_->host_device_index),
                                  /*show_contents=*/false);

  WriteRegister(instr.dst, obj);
}

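// Checks that the externally provided tensor already stored in instr.dst matches the dtype
// and element count of the tensor the instruction would allocate, creating a reshaped view
// when only the shapes differ and the result comes from a ReshapeTensor instruction.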
void VirtualMachine::WriteAllocatedTensorFromOutside(const Instruction& instr) {
  // The external tensor(s) have already been written to the register (instr.dst).
  auto ex_arr = Downcast<NDArray>(ReadRegister(instr.dst));
  auto ex_shape = ex_arr.Shape();
  auto ex_size = ex_shape.size();
  auto ex_dtype = ex_arr->dtype;

  auto in_size = instr.alloc_tensor.ndim;
  auto in_dtype = instr.alloc_tensor.dtype;
  ICHECK_EQ(TypeEqual(in_dtype, ex_dtype), true)
      << "Data type mismatch between the internal and external output tensors";

  bool size_check = false;
  if (ex_size != in_size) {
    size_check = true;
  } else {
    for (size_t i = 0; i < in_size; ++i) {
      if (ex_shape[i] != instr.alloc_tensor.shape[i]) {
        size_check = true;
        break;
      }
    }
  }

  if (size_check) {
    // Check that the element counts match.
    size_t in_el_num = 1, ex_el_num = 1;
    for (size_t i = 0; i < ex_size; ++i) {
      ex_el_num *= ex_shape[i];
    }
    for (size_t i = 0; i < in_size; ++i) {
      in_el_num *= instr.alloc_tensor.shape[i];
    }
    ICHECK_EQ(in_el_num, ex_el_num)
        << "Element count mismatch between the internal and external output tensors";
    if (code_[preresult_op_index_].op == Opcode::ReshapeTensor) {
      int64_t* dims = instr.alloc_tensor.shape;
      std::vector<int64_t> ref_shape(dims, dims + int64_t(in_size));
      auto reshaped_tensor = ex_arr.CreateView(ref_shape, ex_dtype);
      WriteRegister(instr.dst, reshaped_tensor);
    } else {
      LOG(FATAL) << "The internal and external output tensor shapes are mismatched";
    }
  }
}

bool VirtualMachine::FindIndex(const std::vector<Index>& indices, Index val) const {
  auto it = std::find(indices.begin(), indices.end(), val);
  return it != indices.end();
}

runtime::Module CreateVirtualMachine(Executable* exec) {
  auto vm = make_object<VirtualMachine>();
  vm->LoadExecutable(GetObjectPtr<Executable>(exec));
  return runtime::Module(vm);
}

TVM_REGISTER_GLOBAL("runtime._VirtualMachine").set_body([](TVMArgs args, TVMRetValue* rv) {
  runtime::Module mod = args[0];
  auto* exec = dynamic_cast<Executable*>(mod.operator->());
  *rv = CreateVirtualMachine(exec);
});

}  // namespace vm
}  // namespace runtime
}  // namespace tvm