1 | /* |
2 | * Licensed to the Apache Software Foundation (ASF) under one |
3 | * or more contributor license agreements. See the NOTICE file |
4 | * distributed with this work for additional information |
5 | * regarding copyright ownership. The ASF licenses this file |
6 | * to you under the Apache License, Version 2.0 (the |
7 | * "License"); you may not use this file except in compliance |
8 | * with the License. You may obtain a copy of the License at |
9 | * |
10 | * http://www.apache.org/licenses/LICENSE-2.0 |
11 | * |
12 | * Unless required by applicable law or agreed to in writing, |
13 | * software distributed under the License is distributed on an |
14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
15 | * KIND, either express or implied. See the License for the |
16 | * specific language governing permissions and limitations |
17 | * under the License. |
18 | */ |
19 | |
20 | /*! |
21 | * \file src/runtime/vm/vm.cc |
22 | * \brief The Relay virtual machine runtime. |
23 | */ |
24 | |
25 | #include <dmlc/memory_io.h> |
26 | #include <tvm/runtime/container/adt.h> |
27 | #include <tvm/runtime/data_type.h> |
28 | #include <tvm/runtime/debug.h> |
29 | #include <tvm/runtime/logging.h> |
30 | #include <tvm/runtime/memory.h> |
31 | #include <tvm/runtime/object.h> |
32 | #include <tvm/runtime/vm/vm.h> |
33 | |
34 | #include <algorithm> |
35 | #include <chrono> |
36 | #include <iostream> |
37 | #include <stdexcept> |
38 | #include <vector> |
39 | |
40 | #include "../file_utils.h" |
41 | |
42 | using namespace tvm::runtime; |
43 | |
44 | namespace tvm { |
45 | namespace runtime { |
46 | namespace vm { |
47 | |
48 | TVM_REGISTER_OBJECT_TYPE(VMClosureObj); |
49 | |
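/*! \brief Construct a closure from the index of the VM function it wraps and its captured free variables. */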
50 | VMClosure::VMClosure(size_t func_index, std::vector<ObjectRef> free_vars) { |
51 | auto ptr = make_object<VMClosureObj>(); |
52 | ptr->func_index = func_index; |
53 | ptr->free_vars = std::move(free_vars); |
54 | data_ = std::move(ptr); |
55 | } |
56 | |
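/*! \brief Print a VM function's name followed by its instructions, one per line. */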
57 | void VMFunctionPrint(std::ostream& os, const VMFunction& vm_func) { |
58 | os << vm_func.name << ": " << std::endl; |
59 | for (size_t i = 0; i < vm_func.instructions.size(); ++i) { |
60 | os << i << ": " << vm_func.instructions[i] << ";" << std::endl; |
61 | } |
62 | } |
63 | |
64 | std::ostream& operator<<(std::ostream& os, const VMFunction& vm_func) { |
65 | VMFunctionPrint(os, vm_func); |
66 | return os; |
67 | } |
68 | |
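/*!
* \brief Copy an NDArray, or an ADT of NDArrays, to the given device. An array already on the
* target device is returned as-is; ADTs are copied field by field, recursively.
*/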
69 | inline ObjectRef CopyTo(ObjectRef src, const DLDevice& dev) { |
70 | if (src->IsInstance<NDArray::ContainerType>()) { |
71 | auto nd_array = Downcast<NDArray>(src); |
72 | // TODO(mbs): Should respect device id also. |
73 | // TODO(vvchernov): it still does not work for different device id |
74 | // due to simple implementation of Get() and AllocDataSpace() methods |
75 | // see tvm/src/runtime/c_runtime_api.cc: L139 |
76 | // tvm/src/runtime/cpu_device_api.cc: L47 |
77 | if (nd_array->device.device_type != dev.device_type || |
78 | nd_array->device.device_id != dev.device_id) { |
79 | VLOG(2) << "copying from " << nd_array->device.device_type << "[" |
80 | << nd_array->device.device_id << "] to " << dev.device_type << "[" << dev.device_id |
81 | << "]" ; |
82 | return nd_array.CopyTo(dev); |
83 | } |
84 | return src; |
85 | } else { |
86 | ICHECK(src->IsInstance<ADTObj>()) |
87 | << "VM data must be NDArray or a list of NDArray, but received: " << src->_type_key; |
88 | std::vector<ObjectRef> ret; |
89 | ADT adt = Downcast<ADT>(src); |
90 | for (size_t i = 0; i < adt.size(); i++) { |
91 | ret.push_back(CopyTo(adt[i], dev)); |
92 | } |
93 | return ADT(adt->tag, ret.begin(), ret.end()); |
94 | } |
95 | } |
96 | |
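/*! \brief Convert a rank-0 or rank-1 shape tensor of int32/int64 dims into a shape vector. */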
97 | std::vector<int64_t> ToShape(NDArray shape_tensor) { |
98 | std::vector<int64_t> shape; |
99 | auto rank = shape_tensor.Shape().size(); |
100 | auto dtype = shape_tensor.DataType(); |
101 | |
// A rank-0 shape tensor denotes a scalar, whose shape is the empty vector.
103 | if (rank == 0) { |
104 | return shape; |
105 | } |
106 | |
107 | // Otherwise we should be rank-1, and we will extract the number of dimensions |
108 | // for the output vector. |
ICHECK_EQ(rank, 1U) << "shape tensor should be rank-1, found rank " << rank;
110 | int64_t ndim = shape_tensor.Shape().at(0); |
111 | shape.resize(ndim); |
112 | |
113 | const DLTensor* dl_tensor = shape_tensor.operator->(); |
114 | if (dtype.is_int() && dtype.bits() == 32 && dtype.lanes() == 1) { |
115 | int32_t* dims = reinterpret_cast<int32_t*>(dl_tensor->data); |
116 | shape.assign(dims, dims + ndim); |
117 | } else if (dtype.is_int() && dtype.bits() == 64 && dtype.lanes() == 1) { |
118 | int64_t* dims = reinterpret_cast<int64_t*>(dl_tensor->data); |
119 | shape.assign(dims, dims + ndim); |
120 | } else { |
121 | LOG(FATAL) << "invalid shape tensor datatype: " << dtype; |
122 | } |
123 | |
124 | return shape; |
125 | } |
126 | |
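/*! \brief Hooks called around memory-affecting instructions; no-ops here, overridable by subclasses. */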
127 | void VirtualMachine::OpStartHook(Instruction instr) {} |
128 | void VirtualMachine::OpStopHook() {} |
129 | |
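/*!
* \brief Dispatch table for the VM's packed-function API (invoke, init, set_input, get_output,
* etc.), each returned as a PackedFunc closed over this VM instance.
*/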
130 | PackedFunc VirtualMachine::GetFunction(const std::string& name, |
131 | const ObjectPtr<Object>& sptr_to_self) { |
if (name == "invoke") {
133 | return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { |
ICHECK(exec_) << "The executable has not been created yet.";
135 | |
136 | std::string func_name = args[0]; |
137 | auto git = exec_->global_map.find(func_name); |
138 | ICHECK(git != exec_->global_map.end()) |
139 | << "Cannot find function " << func_name << " in the executable" ; |
140 | auto func = exec_->functions[git->second]; |
141 | if (func.params.empty()) { |
142 | *rv = Invoke(func, {}); |
143 | } else { |
144 | auto it = inputs_.find(func_name); |
145 | ICHECK(it != inputs_.end()) << "Input has not been set for function " << func_name; |
146 | const std::vector<ObjectRef>& input_args = it->second; |
147 | if (set_outputs_enabled_.count(func_name) && set_outputs_enabled_[func_name]) { |
148 | ICHECK(outputs_.count(func_name)) |
149 | << "Outputs have not been set for function " << func_name; |
150 | *rv = Invoke(func, input_args, outputs_[func_name]); |
151 | outputs_[func_name].clear(); |
152 | set_outputs_enabled_[func_name] = false; |
153 | } else { |
154 | *rv = Invoke(func, input_args); |
155 | } |
156 | } |
157 | }); |
} else if (name == "invoke_stateful") {
// TODO(tkonolige, jroesch, tqchen): invoke_stateful and get_output are
// stop-gap measures to allow using the VM over a remote connection.
161 | return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { |
PackedFunc invoke = GetFunction("invoke", sptr_to_self);
163 | TVMRetValue rv_; |
164 | invoke.CallPacked(args, &rv_); |
165 | }); |
} else if (name == "invoke_return_to_device") {
167 | return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { |
168 | Device host{static_cast<DLDeviceType>(args[1].operator int()), args[2].operator int()}; |
169 | |
170 | SetInput(args[0].operator std::string(), args, 3); |
PackedFunc invoke = GetFunction("invoke", sptr_to_self);
172 | TVMRetValue rv_; |
173 | invoke.CallPacked(args, &rv_); // Invoke only uses the first arg, so the rest of the args |
174 | // should not cause an issue |
175 | if (rv_.type_code() == kTVMObjectHandle) { |
176 | ADT adt = Downcast<ADT>(rv_.operator ObjectRef()); |
std::vector<ObjectRef> transferred;
178 | for (size_t i = 0; i < adt.size(); i++) { |
transferred.push_back(CopyTo(adt[i], host));
180 | } |
*rv = ADT(adt.tag(), transferred);
182 | } else { |
183 | *rv = CopyTo(rv_, host); |
184 | } |
185 | }); |
} else if (name == "get_output") {
187 | return TypedPackedFunc<NDArray(int64_t)>([this](int64_t index) { |
188 | if (this->return_register_.as<ADTObj>()) { |
189 | return Downcast<NDArray>(Downcast<ADT>(this->return_register_)[index]); |
190 | } else { |
191 | CHECK_EQ(index, 0) << "VM output contains only one item, but you are trying to get the " |
<< index << "th.";
193 | return Downcast<NDArray>(this->return_register_); |
194 | } |
195 | }); |
} else if (name == "get_num_outputs") {
197 | return TypedPackedFunc<int64_t(void)>([this]() -> int64_t { |
198 | // single output is an NDArray not an ADT |
199 | if (this->return_register_.as<ADTObj>()) { |
200 | return Downcast<ADT>(this->return_register_).size(); |
201 | } else { |
202 | return 1; |
203 | } |
204 | }); |
} else if (name == "get_input_index") {
206 | return TypedPackedFunc<int64_t(std::string, std::string)>( |
207 | [this](std::string input_name, std::string func_name) { |
208 | return GetInputIndexFromVMFunction(func_name, input_name); |
209 | }); |
} else if (name == "init") {
211 | return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { |
212 | ICHECK_EQ(args.size() % 3, 0); |
213 | std::vector<Device> devices; |
214 | std::vector<AllocatorType> alloc_types; |
215 | for (int i = 0; i < args.size() / 3; ++i) { |
216 | Device dev; |
217 | int device_type = args[i * 3]; |
218 | dev.device_type = DLDeviceType(device_type); |
219 | dev.device_id = args[i * 3 + 1]; |
220 | int type = args[i * 3 + 2]; |
221 | devices.push_back(dev); |
222 | alloc_types.push_back(AllocatorType(type)); |
223 | } |
224 | this->Init(devices, alloc_types); |
225 | }); |
} else if (name == "set_input") {
227 | return PackedFunc( |
228 | [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { SetInput(args[0], args, 1); }); |
} else if (name == "set_one_input") {
230 | return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { |
231 | ICHECK_EQ(args.size(), 3) << "The expected number of arguments is 3 " |
232 | << "(func_name, index or name, tensor)" ; |
233 | SetOneInput(args[0], args[1], args[2]); |
234 | }); |
} else if (name == "set_outputs") {
236 | return PackedFunc( |
237 | [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { SetOutputs(args[0], args); }); |
} else if (name == "load_late_bound_consts") {
239 | return PackedFunc([this](TVMArgs args, TVMRetValue* rv) { |
240 | CHECK_EQ(args.size(), 1); |
241 | std::string path = args[0]; |
242 | exec_->LoadLateBoundConstantsFromFile(path); |
243 | }); |
244 | } else { |
245 | LOG(FATAL) << "Unknown packed function: " << name; |
246 | } |
247 | } |
248 | |
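/*!
* \brief Bind args[offset..] as the inputs of func_name, copying each tensor to the device its
* parameter is assigned to, and cache them for the next invocation.
*/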
249 | void VirtualMachine::SetInput(std::string func_name, TVMArgs args, int offset) { |
250 | const auto& vm_func = CheckAndGetVMFunction(func_name); |
251 | size_t params_num = vm_func.params.size(); |
252 | ICHECK_EQ(args.size() - offset, params_num) |
253 | << "The number of provided parameters doesn't match the number of arguments" ; |
254 | std::vector<ObjectRef> func_args(params_num); |
255 | for (int i = offset; i < args.size(); ++i) { |
256 | int index = i - offset; |
257 | Device dev = GetDevice(vm_func.param_device_indexes[index]); |
258 | SetInputTensorWithIndex(func_args, args[i], index, dev); |
259 | } |
260 | inputs_.erase(func_name); |
261 | inputs_.emplace(func_name, func_args); |
262 | } |
263 | |
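/*! \brief Bind a single input of func_name, addressed either by integer index or by parameter name. */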
264 | void VirtualMachine::SetOneInput(std::string func_name, const TVMArgValue& tag, |
265 | const TVMArgValue& tensor) { |
266 | const auto& vm_func = CheckAndGetVMFunction(func_name); |
267 | size_t params_num = vm_func.params.size(); |
268 | |
269 | int inp_index = 0; |
270 | if (tag.type_code() == kTVMArgInt) { |
271 | inp_index = tag; |
272 | } else if (tag.type_code() == kTVMStr) { |
273 | inp_index = static_cast<int>(GetInputIndexFromName(vm_func.params, tag)); |
274 | } else { |
LOG(FATAL) << "The type of the input tensor tag (" << tag.type_code()
<< ") is neither integer nor string";
277 | } |
278 | ICHECK_LT(inp_index, params_num); |
279 | |
280 | CreateInputsOrCheckSize(func_name, params_num); |
281 | Device dev = GetDevice(vm_func.param_device_indexes[inp_index]); |
282 | SetInputTensorWithIndex(inputs_[func_name], tensor, inp_index, dev); |
283 | } |
284 | |
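/*!
* \brief Register caller-provided tensors (args[1..]) to receive the outputs of func_name on its
* next invocation.
*/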
285 | void VirtualMachine::SetOutputs(std::string func_name, TVMArgs args) { |
286 | set_outputs_enabled_[func_name] = true; |
287 | size_t outputs_size = args.size(); |
// The first arg is func_name
ICHECK_GT(outputs_size, 1) << "No output arguments were set";
290 | |
291 | std::vector<ObjectRef> func_args(outputs_size - 1); |
292 | for (size_t i = 1; i < outputs_size; ++i) { |
293 | // TODO(vvchernov): device? |
294 | func_args[i - 1] = TensorFromTVMArgValueToObjectRef(args[i]); |
295 | } |
296 | outputs_.erase(func_name); |
297 | outputs_.emplace(func_name, func_args); |
298 | } |
299 | |
300 | void VirtualMachine::PrintInfoAndSetInputArgs(const VMFunction& func, |
301 | const std::vector<ObjectRef>& args) { |
302 | VLOG(2) << "Executing Function: " << std::endl << func; |
303 | for (int i = 0; i < static_cast<int>(devices_.size()); ++i) { |
304 | VLOG(2) << "Device " << i << " has device type " << devices_[i].device_type << " and device id " |
305 | << devices_[i].device_id |
<< (i == exec_->host_device_index ? " (used as host device)" : "");
307 | } |
308 | |
309 | InvokeGlobal(func, args); |
310 | } |
311 | |
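/*!
* \brief Write caller-provided output tensors into the registers that the output-producing
* instructions target, as identified by GetOutputTensorRegIndices.
*/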
312 | void VirtualMachine::SetOutputTensorsToRegister(const std::string& func_name, |
313 | const std::vector<ObjectRef>& outputs) { |
314 | size_t size = outputs.size(); |
315 | |
316 | if (output_tensor_reg_indices_[func_name].empty()) { |
317 | output_tensor_reg_indices_[func_name] = GetOutputTensorRegIndices(); |
318 | } |
319 | auto& reg_indices = output_tensor_reg_indices_[func_name]; |
ICHECK_EQ(reg_indices.size(), size)
<< "The number of external output tensors must equal the number of model outputs";
322 | size_t i = 0; |
323 | for (auto it = reg_indices.begin(); it != reg_indices.end(); ++it, ++i) { |
324 | WriteRegister(*it, outputs[i]); |
325 | } |
326 | } |
327 | |
328 | ObjectRef VirtualMachine::TensorFromTVMArgValueToObjectRef(const TVMArgValue& output_tensor) const { |
329 | if (output_tensor.type_code() == kTVMDLTensorHandle) { |
330 | DLTensor* dl_tensor = output_tensor; |
331 | return NDArray::FromExternalDLTensor(*dl_tensor); |
332 | } else if (output_tensor.type_code() == kTVMNDArrayHandle) { |
333 | return output_tensor.AsObjectRef<tvm::runtime::NDArray>(); |
334 | } else { |
LOG(FATAL) << "Only tensors of DLTensor or NDArray type are supported! Given type code is "
<< output_tensor.type_code();
337 | } |
338 | return ObjectRef(); |
339 | } |
340 | |
341 | int64_t VirtualMachine::GetInputIndexFromVMFunction(const std::string& func_name, |
342 | const std::string& input_name) const { |
343 | const auto& vm_func = CheckAndGetVMFunction(func_name); |
344 | return GetInputIndexFromName(vm_func.params, input_name); |
345 | } |
346 | |
347 | int64_t VirtualMachine::GetInputIndexFromName(const std::vector<std::string>& params, |
348 | const std::string& input_name) const { |
349 | // TODO(vvchernov): excess integer type? |
350 | for (uint64_t i = 0; i < params.size(); i++) { |
351 | if (input_name == params[i]) { |
352 | return static_cast<int64_t>(i); |
353 | } |
354 | } |
355 | return static_cast<int64_t>(-1); |
356 | } |
357 | |
358 | const VMFunction& VirtualMachine::CheckAndGetVMFunction(const std::string& func_name) const { |
ICHECK(exec_) << "The executable has not been created yet.";
360 | return exec_->GetVMFunctionWithName(func_name); |
361 | } |
362 | |
363 | void VirtualMachine::CreateInputsOrCheckSize(const std::string& func_name, size_t size) { |
364 | if (inputs_.count(func_name)) { |
365 | ICHECK_EQ(inputs_[func_name].size(), size) |
366 | << "The size of function" << func_name |
367 | << " doesn't match the number of provided parameters" ; |
368 | } else { |
369 | std::vector<ObjectRef> func_args(size); |
370 | inputs_.emplace(func_name, func_args); |
371 | } |
372 | } |
373 | |
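/*!
* \brief Store one input tensor at the given index, zero-copying external DLTensors when the
* device allows it and copying otherwise.
*/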
374 | void VirtualMachine::SetInputTensorWithIndex(std::vector<ObjectRef>& tensors, |
375 | const TVMArgValue& inp_tensor, int index, Device dev) { |
376 | if (inp_tensor.type_code() == kTVMDLTensorHandle) { |
377 | if (NDArray::AbilityOfZeroCopyForDLTensor(inp_tensor, dev)) { |
378 | tensors[index] = NDArray::FromExternalDLTensor(*inp_tensor); |
379 | } else { |
380 | tensors[index] = NDArray::NewFromDLTensor(inp_tensor, dev); |
381 | } |
382 | } else { |
383 | tensors[index] = CopyTo(inp_tensor, dev); |
384 | } |
385 | } |
386 | |
387 | inline Device VirtualMachine::GetDevice(Index device_index) const { |
ICHECK_GT(devices_.size(), device_index) << "invalid device index: " << device_index;
389 | return devices_[device_index]; |
390 | } |
391 | |
392 | inline Allocator* VirtualMachine::GetAllocator(Index device_index) const { |
ICHECK_GT(allocators_.size(), device_index) << "invalid device index: " << device_index;
394 | return allocators_[device_index]; |
395 | } |
396 | |
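/*! \brief Push a call frame recording the return pc, the caller's function index, and its code pointer. */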
397 | void VirtualMachine::PushFrame(Index arg_count, Index ret_pc, const VMFunction& vm_func) { |
398 | auto frame = VMFrame(ret_pc, func_index_, arg_count, code_, vm_func.register_file_size); |
399 | frames_.push_back(frame); |
400 | } |
401 | |
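/*! \brief Pop the top frame, restore the caller's pc/code/function index, and return the stack depth before popping. */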
402 | Index VirtualMachine::PopFrame() { |
403 | ICHECK_GT(frames_.size(), 0); |
404 | const VMFrame& fr = frames_.back(); |
405 | func_index_ = fr.func_index; |
406 | code_ = fr.code; |
407 | pc_ = fr.pc; |
408 | auto call_stack_size = frames_.size(); |
409 | frames_.pop_back(); |
410 | return call_stack_size; |
411 | } |
412 | |
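/*!
* \brief Begin executing a global function: push its frame, write the arguments into its
* registers, and point pc_ at its first instruction. RunLoop performs the actual execution.
*/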
413 | void VirtualMachine::InvokeGlobal(const VMFunction& func, const std::vector<ObjectRef>& args) { |
VLOG(2) << "Invoking global " << func.name << " with " << args.size() << " args";
415 | |
416 | PushFrame(func.params.size(), this->pc_ + 1, func); |
417 | for (size_t i = 0; i < args.size(); ++i) { |
418 | WriteRegister(i, args[i]); |
419 | VLOG(2) << "arg " << i << " = " |
420 | << RuntimeObject2String(args[i], GetDevice(exec_->host_device_index)); |
421 | } |
422 | |
423 | code_ = func.instructions.data(); |
424 | pc_ = 0; |
425 | } |
426 | |
427 | ObjectRef VirtualMachine::Invoke(const VMFunction& func, const std::vector<ObjectRef>& args) { |
428 | PrintInfoAndSetInputArgs(func, args); |
429 | RunLoop(); |
430 | return return_register_; |
431 | } |
432 | |
433 | ObjectRef VirtualMachine::Invoke(const std::string& name, const std::vector<ObjectRef>& args) { |
ICHECK(exec_) << "The executable has not been created yet.";
auto it = exec_->global_map.find(name);
ICHECK(it != exec_->global_map.end()) << "Cannot find function " << name << " in the executable";
437 | Index func_index = it->second; |
438 | VLOG(2) << "Invoke Global " << name << " at index " << func_index; |
439 | return Invoke(exec_->functions[func_index], args); |
440 | } |
441 | |
442 | ObjectRef VirtualMachine::Invoke(const VMFunction& func, const std::vector<ObjectRef>& input_args, |
443 | const std::vector<ObjectRef>& output_args) { |
444 | PrintInfoAndSetInputArgs(func, input_args); |
445 | SetOutputTensorsToRegister(func.name, output_args); |
446 | RunLoop(output_tensor_reg_indices_[func.name]); |
447 | return return_register_; |
448 | } |
449 | |
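/*!
* \brief Marshal VM objects into TVM's packed calling convention (flattening ADTs into their
* constituent NDArrays) and call the kernel. The call is skipped when the single output tensor
* has a zero-length dimension, since there is nothing to compute.
*/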
450 | void VirtualMachine::InvokePacked(Index packed_index, const PackedFunc& func, Index arg_count, |
451 | Index output_size, const std::vector<ObjectRef>& args) { |
452 | size_t arity = 0; |
453 | for (Index i = 0; i < arg_count; i++) { |
454 | if (const auto* obj = args[i].as<ADTObj>()) { |
455 | arity += obj->size; |
456 | } else { |
457 | ++arity; |
458 | } |
459 | } |
460 | |
461 | std::vector<TVMValue> values(arity); |
462 | std::vector<int> codes(arity); |
463 | runtime::TVMArgsSetter setter(values.data(), codes.data()); |
464 | int idx = 0; |
465 | bool is_empty_output = false; |
466 | for (Index i = 0; i < arg_count; i++) { |
467 | if (const auto* dt_cell = args[i].as<ADTObj>()) { |
468 | for (size_t fi = 0; fi < dt_cell->size; ++fi) { |
469 | auto obj = (*dt_cell)[fi]; |
470 | auto nd_array = Downcast<NDArray>(obj); |
471 | setter(idx++, nd_array); |
472 | } |
473 | } else { |
474 | auto nd_array = Downcast<NDArray>(args[i]); |
475 | // We can safely skip CallPacked if there is only one |
476 | // output and it is empty. |
477 | if (i == arg_count - 1 && output_size == 1) { |
478 | for (const auto& dim : nd_array.Shape()) { |
479 | if (!dim) { |
480 | is_empty_output = true; |
481 | break; |
482 | } |
483 | } |
484 | } |
485 | setter(idx++, nd_array); |
486 | } |
487 | } |
488 | |
489 | if (!is_empty_output) { |
490 | TVMRetValue rv; |
491 | func.CallPacked(TVMArgs(values.data(), codes.data(), arity), &rv); |
492 | } |
493 | } |
494 | |
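/*!
* \brief Attach an executable and resolve every name in its primitive_map to a PackedFunc from
* the generated kernel library (including its imports).
*/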
495 | void VirtualMachine::LoadExecutable(const ObjectPtr<Executable>& exec) { |
ICHECK(exec) << "The executable has not been created yet.";
ICHECK(exec->late_bound_constant_names.empty())
<< "Late-bound constants must be loaded before creating the VM";
499 | exec_ = exec; |
500 | |
501 | runtime::Module lib = exec_->GetLib(); |
502 | |
503 | ICHECK(exec_->primitive_map.empty() || lib.operator->()) |
504 | << "If the executable has declared primitive functions, the " |
505 | << "generated kernel library must non-be null." ; |
506 | |
507 | for (const auto& it : exec_->primitive_map) { |
508 | const auto& packed_name = it.first; |
509 | auto packed_index = static_cast<size_t>(it.second); |
510 | if (packed_funcs_.size() <= packed_index) { |
511 | packed_funcs_.resize(packed_index + 1); |
512 | } |
513 | tvm::runtime::PackedFunc pf = lib.GetFunction(packed_name, /*query_imports=*/true); |
514 | ICHECK(pf != nullptr) << "Cannot find function in module: " << packed_name; |
515 | packed_funcs_[packed_index] = pf; |
516 | } |
517 | for (size_t i = 0; i < packed_funcs_.size(); ++i) { |
ICHECK(packed_funcs_[i] != nullptr) << "Packed function " << i << " is not initialized";
519 | } |
520 | } |
521 | |
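/*!
* \brief Bind each virtual device required by the executable to one of the given physical
* devices (matched by device type), and create an allocator of the requested type for it.
*/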
522 | void VirtualMachine::Init(const std::vector<Device>& physical_devices, |
523 | const std::vector<AllocatorType>& alloc_types) { |
524 | ICHECK_EQ(physical_devices.size(), alloc_types.size()); |
525 | |
526 | // Find a physical device to represent each virtual device the VM code requires. |
527 | // (Recall the VM instructions refer to devices by "device index" into this vector of |
528 | // virtual devices.) |
529 | const size_t num_virtual_devices = exec_->virtual_devices.size(); |
530 | devices_.reserve(num_virtual_devices); |
531 | allocators_.reserve(num_virtual_devices); |
532 | |
533 | for (size_t device_index = 0; device_index < num_virtual_devices; ++device_index) { |
534 | // We'll retain the legacy behaviour and just match by device type. |
535 | // TODO(mbs): Generalize. |
536 | DLDeviceType virtual_device_type = exec_->virtual_devices[device_index].device_type; |
537 | auto itr = std::find_if(physical_devices.begin(), physical_devices.end(), |
538 | [virtual_device_type](const Device& physical_device) { |
539 | return physical_device.device_type == virtual_device_type; |
540 | }); |
541 | CHECK(itr != physical_devices.end()) |
542 | << "Unable to find a physical device (from among the " << physical_devices.size() |
543 | << " given) to match the virtual device with device type " << virtual_device_type; |
544 | const size_t i = std::distance(physical_devices.begin(), itr); |
545 | devices_.push_back(*itr); |
546 | allocators_.push_back(MemoryManager::GetOrCreateAllocator(*itr, alloc_types[i])); |
547 | } |
548 | } |
549 | |
550 | inline void VirtualMachine::WriteRegister(Index r, const ObjectRef& val) { |
551 | frames_.back().register_file[r] = val; |
552 | } |
553 | |
554 | ObjectRef VirtualMachine::ReadRegister(Index r) const { return frames_.back().register_file[r]; } |
555 | |
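/*!
* \brief Read a scalar integer (1 to 64 bits wide) out of the NDArray held in register r,
* copying it to the host device first if necessary.
*/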
556 | int64_t VirtualMachine::LoadScalarInt(Index r) const { |
557 | int64_t result = 0; |
558 | const auto& obj = ReadRegister(r); |
559 | NDArray array = Downcast<NDArray>(CopyTo(obj, GetDevice(exec_->host_device_index))); |
560 | |
561 | switch (array->dtype.bits) { |
562 | case 1: { |
563 | result = reinterpret_cast<bool*>(array->data)[0]; |
564 | break; |
565 | } |
566 | case 8: { |
567 | result = reinterpret_cast<int8_t*>(array->data)[0]; |
568 | break; |
569 | } |
570 | case 16: { |
571 | result = reinterpret_cast<int16_t*>(array->data)[0]; |
572 | break; |
573 | } |
574 | case 32: { |
575 | result = reinterpret_cast<int32_t*>(array->data)[0]; |
576 | break; |
577 | } |
578 | case 64: { |
579 | result = reinterpret_cast<int64_t*>(array->data)[0]; |
580 | break; |
581 | } |
582 | default: |
583 | LOG(FATAL) << "Unknown scalar int type: " << DLDataType2String(array->dtype); |
584 | } |
585 | return result; |
586 | } |
587 | |
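/*! \brief Scan the current function's code for its Ret instruction and return the register it returns. */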
588 | Index VirtualMachine::GetResultRegisterIndex() const { |
589 | Index op_index = 0; |
590 | while (code_[op_index].op != Opcode::Ret) { |
591 | ++op_index; |
592 | } |
593 | |
594 | return code_[op_index].result; |
595 | } |
596 | |
597 | void VirtualMachine::CalculatePreResultOpIndex(Index res_index) { |
598 | if (preresult_op_index_ == -1) { |
599 | preresult_op_index_ = 0; |
600 | while (code_[preresult_op_index_].dst != res_index) { |
601 | ++preresult_op_index_; |
602 | } |
603 | } |
604 | } |
605 | |
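/*!
* \brief Determine which registers will hold the output tensors by inspecting the instruction
* that produces the result register (AllocTensor, AllocADT, or ReshapeTensor).
*/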
606 | std::vector<Index> VirtualMachine::GetOutputTensorRegIndices() { |
607 | std::vector<Index> reg_indices; |
608 | Index res_index = GetResultRegisterIndex(); |
609 | CalculatePreResultOpIndex(res_index); |
610 | auto& preres_instr = code_[preresult_op_index_]; |
611 | auto op_code = preres_instr.op; |
612 | if (op_code == Opcode::AllocTensor) { |
613 | reg_indices.emplace_back(res_index); |
614 | } else if (op_code == Opcode::AllocADT) { |
615 | for (Index i = 0; i < preres_instr.num_fields; ++i) { |
616 | reg_indices.push_back(preres_instr.datatype_fields[i]); |
617 | } |
618 | } else if (op_code == Opcode::ReshapeTensor) { |
619 | reg_indices.push_back(preres_instr.reshape_tensor.tensor); |
620 | } else { |
LOG(FATAL) << "Operation " << size_t(op_code) << " is not supported for set_outputs method";
622 | } |
623 | return reg_indices; |
624 | } |
625 | |
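/*!
* \brief The VM dispatch loop: execute instructions starting at pc_ until the frame that entered
* the loop returns. When output_tensor_reg_indices is non-empty, allocations into those
* registers are redirected to the caller-provided output tensors.
*/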
626 | void VirtualMachine::RunLoop(const std::vector<Index>& output_tensor_reg_indices) { |
627 | ICHECK(this->exec_); |
628 | ICHECK(this->code_); |
629 | pc_ = 0; |
630 | Index frame_start = frames_.size(); |
631 | while (true) { |
632 | main_loop: |
633 | auto const& instr = code_[this->pc_]; |
634 | VLOG(2) << "Executing(" << pc_ << "): " << instr; |
635 | |
636 | switch (instr.op) { |
637 | case Opcode::Move: { |
638 | ObjectRef from_obj; |
639 | from_obj = ReadRegister(instr.from); |
640 | WriteRegister(instr.dst, from_obj); |
641 | pc_++; |
642 | goto main_loop; |
643 | } |
644 | case Opcode::Fatal: { |
throw std::runtime_error("VM encountered fatal error");
646 | } |
647 | case Opcode::LoadConst: { |
648 | bool is_not_cached = const_pool_.size() <= static_cast<size_t>(instr.const_index) || |
649 | !const_pool_[instr.const_index].defined(); |
650 | if (is_not_cached) { |
651 | OpStartHook(instr); |
652 | } |
653 | auto constant_obj = exec_->constants[instr.const_index]; |
// We cache the allocated object in the constant pool. When measuring
// performance, the first iteration sets the pool up and subsequent
// iterations directly reuse the allocated objects.
657 | if (const_pool_.size() <= static_cast<size_t>(instr.const_index)) { |
658 | const_pool_.resize(instr.const_index + 1); |
659 | } |
660 | |
661 | if (!const_pool_[instr.const_index].defined()) { |
662 | Device dev = GetDevice(exec_->const_device_indexes[instr.const_index]); |
663 | const_pool_[instr.const_index] = CopyTo(constant_obj, dev); |
664 | } |
665 | WriteRegister(instr.dst, const_pool_[instr.const_index]); |
666 | if (is_not_cached) { |
667 | OpStopHook(); |
668 | } |
669 | pc_++; |
670 | goto main_loop; |
671 | } |
672 | case Opcode::LoadConsti: { |
673 | auto tensor = NDArray::Empty({1}, {kDLInt, 64, 1}, GetDevice(exec_->host_device_index)); |
674 | reinterpret_cast<int64_t*>(tensor->data)[0] = instr.load_consti.val; |
675 | WriteRegister(instr.dst, tensor); |
676 | pc_++; |
677 | goto main_loop; |
678 | } |
679 | case Opcode::Invoke: { |
680 | std::vector<ObjectRef> args; |
681 | for (Index i = 0; i < instr.num_args; ++i) { |
682 | args.push_back(ReadRegister(instr.invoke_args_registers[i])); |
683 | } |
684 | InvokeGlobal(exec_->functions[instr.func_index], args); |
685 | frames_.back().caller_return_register = instr.dst; |
686 | goto main_loop; |
687 | } |
688 | case Opcode::InvokePacked: { |
ICHECK_LT(instr.packed_index, packed_funcs_.size());
690 | const auto& func = packed_funcs_[instr.packed_index]; |
691 | const auto& arity = instr.arity; |
692 | std::vector<ObjectRef> args; |
693 | for (Index i = 0; i < arity; ++i) { |
694 | auto arg = ReadRegister(instr.packed_args[i]); |
695 | args.push_back(arg); |
696 | #if TVM_LOG_DEBUG |
const bool is_input = i < arity - instr.output_size;
VLOG(2) << (is_input ? "input" : "placeholder") << " arg " << i << " = "
<< RuntimeObject2String(arg, GetDevice(exec_->host_device_index),
/*show_contents=*/is_input);
703 | #endif |
704 | } |
705 | |
706 | // We no longer need to write the registers back, we write directly |
707 | // through the registers mutably. |
708 | InvokePacked(instr.packed_index, func, arity, instr.output_size, args); |
709 | |
710 | #if TVM_LOG_DEBUG |
711 | for (Index i = arity - instr.output_size; i < arity; ++i) { |
712 | auto arg = ReadRegister(instr.packed_args[i]); |
713 | VLOG(2) << "output arg " << i << " = " |
714 | << RuntimeObject2String(arg, GetDevice(exec_->host_device_index)); |
715 | } |
716 | #endif |
717 | |
718 | pc_++; |
719 | goto main_loop; |
720 | } |
721 | case Opcode::InvokeClosure: { |
722 | auto object = ReadRegister(instr.closure); |
723 | const auto* closure = object.as<VMClosureObj>(); |
724 | ICHECK(closure); |
725 | std::vector<ObjectRef> args; |
726 | for (auto free_var : closure->free_vars) { |
727 | args.push_back(free_var); |
728 | } |
729 | for (Index i = 0; i < instr.num_closure_args; ++i) { |
730 | args.push_back(ReadRegister(instr.closure_args[i])); |
731 | } |
732 | InvokeGlobal(exec_->functions[closure->func_index], args); |
733 | frames_.back().caller_return_register = instr.dst; |
734 | goto main_loop; |
735 | } |
736 | case Opcode::GetField: { |
737 | auto object = ReadRegister(instr.object); |
738 | const auto& tuple = Downcast<ADT>(object); |
739 | auto field = tuple[instr.field_index]; |
740 | WriteRegister(instr.dst, field); |
741 | pc_++; |
742 | goto main_loop; |
743 | } |
744 | case Opcode::GetTag: { |
745 | auto object = ReadRegister(instr.get_tag.object); |
746 | const auto& adt = Downcast<ADT>(object); |
747 | auto tag = adt.tag(); |
748 | auto tag_tensor = NDArray::Empty({1}, {kDLInt, 32, 1}, GetDevice(exec_->host_device_index)); |
749 | reinterpret_cast<int32_t*>(tag_tensor->data)[0] = tag; |
750 | WriteRegister(instr.dst, tag_tensor); |
751 | pc_++; |
752 | goto main_loop; |
753 | } |
754 | case Opcode::Goto: { |
755 | pc_ += instr.pc_offset; |
756 | goto main_loop; |
757 | } |
758 | case Opcode::If: { |
759 | int32_t test_val = LoadScalarInt(instr.if_op.test); |
760 | int32_t target_val = LoadScalarInt(instr.if_op.target); |
761 | |
762 | if (test_val == target_val) { |
763 | ICHECK_NE(instr.if_op.true_offset, 0); |
764 | pc_ += instr.if_op.true_offset; |
765 | } else { |
766 | ICHECK_NE(instr.if_op.false_offset, 0); |
767 | pc_ += instr.if_op.false_offset; |
768 | } |
769 | |
770 | goto main_loop; |
771 | } |
772 | case Opcode::AllocTensor: { |
773 | OpStartHook(instr); |
774 | if (!output_tensor_reg_indices.empty() && FindIndex(output_tensor_reg_indices, instr.dst)) { |
775 | WriteAllocatedTensorFromOutside(instr); |
776 | } else { |
777 | WriteAllocatedTensor(instr); |
778 | } |
779 | OpStopHook(); |
780 | pc_++; |
781 | goto main_loop; |
782 | } |
783 | case Opcode::AllocTensorReg: { |
784 | OpStartHook(instr); |
785 | Device cpu_dev = GetDevice(exec_->host_device_index); |
786 | auto shape_obj = ReadRegister(instr.alloc_tensor_reg.shape_register); |
787 | NDArray shape_tensor = Downcast<NDArray>(CopyTo(shape_obj, cpu_dev)); |
788 | auto shape = ToShape(shape_tensor); |
789 | auto storage_obj = ReadRegister(instr.alloc_tensor_reg.storage); |
790 | auto storage = Downcast<Storage>(storage_obj); |
auto offset = LoadScalarInt(instr.alloc_tensor_reg.offset);
792 | auto obj = storage->AllocNDArray(offset, shape, instr.alloc_tensor_reg.dtype); |
793 | VLOG(2) << "allocated " |
794 | << RuntimeObject2String(obj, GetDevice(exec_->host_device_index), |
795 | /*show_contents=*/false); |
796 | |
797 | WriteRegister(instr.dst, obj); |
798 | OpStopHook(); |
799 | pc_++; |
800 | goto main_loop; |
801 | } |
802 | case Opcode::AllocADT: { |
803 | std::vector<ObjectRef> fields; |
804 | for (Index i = 0; i < instr.num_fields; ++i) { |
805 | fields.push_back(ReadRegister(instr.datatype_fields[i])); |
806 | } |
807 | ObjectRef obj = ADT(instr.constructor_tag, fields); |
808 | WriteRegister(instr.dst, obj); |
809 | pc_++; |
810 | goto main_loop; |
811 | } |
812 | case Opcode::AllocClosure: { |
813 | std::vector<ObjectRef> free_vars; |
814 | for (Index i = 0; i < instr.num_freevar; i++) { |
815 | free_vars.push_back(ReadRegister(instr.free_vars[i])); |
816 | } |
817 | WriteRegister(instr.dst, VMClosure(instr.func_index, free_vars)); |
818 | pc_++; |
819 | goto main_loop; |
820 | } |
821 | case Opcode::AllocStorage: { |
822 | OpStartHook(instr); |
823 | auto size = LoadScalarInt(instr.alloc_storage.allocation_size); |
824 | auto alignment = instr.alloc_storage.alignment; |
825 | |
826 | auto storage_obj = SimpleObjAllocator().make_object<StorageObj>(); |
827 | Allocator* allocator = GetAllocator(instr.alloc_storage.device_index); |
ICHECK(allocator) << "Did you forget to init the VirtualMachine with devices?";
829 | VLOG(2) << "allocating with allocation_size=" << size << ", alignment=" << alignment |
830 | << ", dtype_hint=" << DLDataType2String(instr.alloc_storage.dtype_hint) |
831 | << ", device_index=" << instr.alloc_storage.device_index; |
832 | |
833 | storage_obj->buffer = allocator->Alloc(size, alignment, instr.alloc_storage.dtype_hint); |
834 | Storage storage(storage_obj); |
835 | WriteRegister(instr.dst, storage); |
836 | OpStopHook(); |
837 | pc_++; |
838 | goto main_loop; |
839 | } |
840 | case Opcode::ShapeOf: { |
841 | auto input = ReadRegister(instr.shape_of.tensor); |
842 | NDArray input_array = Downcast<NDArray>(input); |
843 | int ndim = input_array->ndim; |
844 | auto out_tensor = |
845 | NDArray::Empty({ndim}, {kDLInt, 64, 1}, GetDevice(exec_->host_device_index)); |
846 | for (int i = 0; i < ndim; ++i) { |
847 | reinterpret_cast<int64_t*>(out_tensor->data)[i] = input_array->shape[i]; |
848 | } |
849 | VLOG(2) << "shape = " |
850 | << RuntimeObject2String(out_tensor, GetDevice(exec_->host_device_index)); |
851 | WriteRegister(instr.dst, out_tensor); |
852 | pc_++; |
853 | goto main_loop; |
854 | } |
855 | case Opcode::Ret: { |
856 | // If we have hit the point from which we started |
857 | // running, we should return to the caller breaking |
858 | // the dispatch loop. |
859 | return_register_ = ReadRegister(instr.result); |
860 | auto caller_return_register = frames_.back().caller_return_register; |
861 | |
862 | if (PopFrame() == frame_start) { |
863 | return; |
864 | // Otherwise we are just returning from a local call. |
865 | } else { |
866 | WriteRegister(caller_return_register, return_register_); |
867 | goto main_loop; |
868 | } |
869 | } |
870 | case Opcode::ReshapeTensor: { |
871 | OpStartHook(instr); |
872 | Device cpu_dev = GetDevice(exec_->host_device_index); |
873 | auto tensor_obj = ReadRegister(instr.reshape_tensor.tensor); |
874 | NDArray tensor_arr = Downcast<NDArray>(tensor_obj); |
875 | // Read the shape from shape tensor |
876 | auto shape_obj = ReadRegister(instr.reshape_tensor.newshape); |
877 | NDArray shape_tensor = Downcast<NDArray>(CopyTo(shape_obj, cpu_dev)); |
878 | const DLTensor* dl_tensor = shape_tensor.operator->(); |
879 | ICHECK_EQ(dl_tensor->dtype.code, 0u); |
880 | ICHECK_EQ(dl_tensor->dtype.bits, 64u); |
881 | int64_t* dims = reinterpret_cast<int64_t*>(dl_tensor->data); |
882 | int64_t ndim = shape_tensor->shape[0]; |
883 | std::vector<int64_t> shape(dims, dims + ndim); |
884 | // Reshape the input tensor |
885 | auto out_tensor = tensor_arr.CreateView(shape, tensor_arr->dtype); |
886 | VLOG(2) << "reshaped " |
887 | << RuntimeObject2String(tensor_obj, GetDevice(exec_->host_device_index)) << " to " |
888 | << RuntimeObject2String(out_tensor, GetDevice(exec_->host_device_index)); |
889 | WriteRegister(instr.dst, out_tensor); |
890 | OpStopHook(); |
891 | pc_++; |
892 | goto main_loop; |
893 | } |
894 | case Opcode::DeviceCopy: { |
895 | OpStartHook(instr); |
896 | auto tensor_src = ReadRegister(instr.device_copy.src); |
897 | NDArray src_data = Downcast<NDArray>(tensor_src); |
898 | Device actual_src_dev = src_data->device; |
899 | Device inst_src_dev = GetDevice(instr.device_copy.src_device_index); |
900 | ICHECK_EQ(actual_src_dev.device_type, inst_src_dev.device_type); |
901 | ICHECK_EQ(actual_src_dev.device_id, inst_src_dev.device_id); |
902 | Device dst_dev = GetDevice(instr.device_copy.dst_device_index); |
903 | |
904 | NDArray dst_data = src_data.CopyTo(dst_dev); |
905 | WriteRegister(instr.dst, dst_data); |
906 | OpStopHook(); |
907 | pc_++; |
908 | goto main_loop; |
909 | } |
910 | case Opcode::KillRegister: { |
911 | OpStartHook(instr); |
912 | WriteRegister(instr.dst, ObjectRef()); |
913 | OpStopHook(); |
914 | pc_++; |
915 | goto main_loop; |
916 | } |
917 | default: |
918 | LOG(FATAL) << "Unknown instruction opcode: " << int(instr.op); |
919 | } |
920 | } |
921 | } |
922 | |
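/*!
* \brief Allocate an NDArray with the instruction's static shape out of its storage and write it
* to the destination register.
*/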
923 | void VirtualMachine::WriteAllocatedTensor(const Instruction& instr) { |
924 | auto shape = std::vector<int64_t>(instr.alloc_tensor.ndim); |
925 | |
926 | for (uint32_t i = 0; i < instr.alloc_tensor.ndim; ++i) { |
927 | shape[i] = instr.alloc_tensor.shape[i]; |
928 | } |
929 | |
930 | auto storage_obj = ReadRegister(instr.alloc_tensor.storage); |
931 | auto offset = LoadScalarInt(instr.alloc_tensor.offset); |
932 | auto storage = Downcast<Storage>(storage_obj); |
933 | auto obj = storage->AllocNDArray(offset, shape, instr.alloc_tensor.dtype); |
934 | VLOG(2) << "allocated " |
935 | << RuntimeObject2String(obj, GetDevice(exec_->host_device_index), |
936 | /*show_contents=*/false); |
937 | |
938 | WriteRegister(instr.dst, obj); |
939 | } |
940 | |
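/*!
* \brief Validate a caller-provided output tensor against the shape and dtype the instruction
* would have allocated. If only the shapes differ but the element counts match (the
* ReshapeTensor case), replace the register with a reshaped view of the external tensor.
*/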
941 | void VirtualMachine::WriteAllocatedTensorFromOutside(const Instruction& instr) { |
// The external tensor(s) have already been written to the register (instr.dst)
943 | auto ex_arr = Downcast<NDArray>(ReadRegister(instr.dst)); |
944 | auto ex_shape = ex_arr.Shape(); |
945 | auto ex_size = ex_shape.size(); |
946 | auto ex_dtype = ex_arr->dtype; |
947 | |
948 | auto in_size = instr.alloc_tensor.ndim; |
949 | auto in_dtype = instr.alloc_tensor.dtype; |
ICHECK(TypeEqual(in_dtype, ex_dtype))
<< "Data types of internal and external output tensors do not match";
952 | |
953 | bool size_check = false; |
954 | if (ex_size != in_size) { |
955 | size_check = true; |
956 | } else { |
957 | for (size_t i = 0; i < in_size; ++i) { |
958 | if (ex_shape[i] != instr.alloc_tensor.shape[i]) { |
959 | size_check = true; |
960 | break; |
961 | } |
962 | } |
963 | } |
964 | |
965 | if (size_check) { |
// Shapes differ; check that the total element counts still match
967 | size_t in_el_num = 1, ex_el_num = 1; |
968 | for (size_t i = 0; i < ex_size; ++i) { |
969 | ex_el_num *= ex_shape[i]; |
970 | } |
971 | for (size_t i = 0; i < in_size; ++i) { |
972 | in_el_num *= instr.alloc_tensor.shape[i]; |
973 | } |
ICHECK_EQ(in_el_num, ex_el_num)
<< "Element counts of internal and external output tensors do not match";
976 | if (code_[preresult_op_index_].op == Opcode::ReshapeTensor) { |
977 | int64_t* dims = instr.alloc_tensor.shape; |
978 | std::vector<int64_t> ref_shape(dims, dims + int64_t(in_size)); |
979 | auto reshaped_tensor = ex_arr.CreateView(ref_shape, ex_dtype); |
980 | WriteRegister(instr.dst, reshaped_tensor); |
981 | } else { |
LOG(FATAL) << "Internal and external output tensor shapes do not match";
983 | } |
984 | } |
985 | } |
986 | |
987 | bool VirtualMachine::FindIndex(const std::vector<Index>& indices, Index val) const { |
988 | auto it = std::find(indices.begin(), indices.end(), val); |
989 | return it != indices.end(); |
990 | } |
991 | |
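/*! \brief Wrap an Executable in a newly constructed VirtualMachine module. */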
992 | runtime::Module CreateVirtualMachine(Executable* exec) { |
993 | auto vm = make_object<VirtualMachine>(); |
994 | vm->LoadExecutable(GetObjectPtr<Executable>(exec)); |
995 | return runtime::Module(vm); |
996 | } |
997 | |
TVM_REGISTER_GLOBAL("runtime._VirtualMachine").set_body([](TVMArgs args, TVMRetValue* rv) {
999 | runtime::Module mod = args[0]; |
auto* exec = dynamic_cast<Executable*>(mod.operator->());
ICHECK(exec) << "The first argument must be a VM Executable module";
*rv = CreateVirtualMachine(exec);
1002 | }); |
1003 | |
1004 | } // namespace vm |
1005 | } // namespace runtime |
1006 | } // namespace tvm |
1007 | |