1 | /* |
2 | * Licensed to the Apache Software Foundation (ASF) under one |
3 | * or more contributor license agreements. See the NOTICE file |
4 | * distributed with this work for additional information |
5 | * regarding copyright ownership. The ASF licenses this file |
6 | * to you under the Apache License, Version 2.0 (the |
7 | * "License"); you may not use this file except in compliance |
8 | * with the License. You may obtain a copy of the License at |
9 | * |
10 | * http://www.apache.org/licenses/LICENSE-2.0 |
11 | * |
12 | * Unless required by applicable law or agreed to in writing, |
13 | * software distributed under the License is distributed on an |
14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
15 | * KIND, either express or implied. See the License for the |
16 | * specific language governing permissions and limitations |
17 | * under the License. |
18 | */ |
19 | |
20 | /*! |
21 | * \file src/runtime/vm/vm.cc |
22 | * \brief The Relay virtual machine runtime. |
23 | */ |
24 | |
25 | #include <dmlc/memory_io.h> |
26 | #include <tvm/runtime/container/adt.h> |
27 | #include <tvm/runtime/data_type.h> |
28 | #include <tvm/runtime/debug.h> |
29 | #include <tvm/runtime/logging.h> |
30 | #include <tvm/runtime/memory.h> |
31 | #include <tvm/runtime/object.h> |
32 | #include <tvm/runtime/vm/vm.h> |
33 | |
34 | #include <algorithm> |
35 | #include <chrono> |
36 | #include <iostream> |
37 | #include <stdexcept> |
38 | #include <vector> |
39 | |
40 | #include "../file_utils.h" |
41 | |
42 | using namespace tvm::runtime; |
43 | |
44 | namespace tvm { |
45 | namespace runtime { |
46 | namespace vm { |
47 | |
48 | TVM_REGISTER_OBJECT_TYPE(VMClosureObj); |
49 | |
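/*! \brief Construct a closure from the index of the VM function it wraps and its captured free variables. */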
50 | VMClosure::VMClosure(size_t func_index, std::vector<ObjectRef> free_vars) { |
51 | auto ptr = make_object<VMClosureObj>(); |
52 | ptr->func_index = func_index; |
53 | ptr->free_vars = std::move(free_vars); |
54 | data_ = std::move(ptr); |
55 | } |
56 | |
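/*! \brief Print a VM function's name followed by its instructions, one per line. */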
57 | void VMFunctionPrint(std::ostream& os, const VMFunction& vm_func) { |
58 | os << vm_func.name << ": " << std::endl; |
59 | for (size_t i = 0; i < vm_func.instructions.size(); ++i) { |
60 | os << i << ": " << vm_func.instructions[i] << ";" << std::endl; |
61 | } |
62 | } |
63 | |
64 | std::ostream& operator<<(std::ostream& os, const VMFunction& vm_func) { |
65 | VMFunctionPrint(os, vm_func); |
66 | return os; |
67 | } |
68 | |
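/*!
* \brief Copy an NDArray, or an ADT of NDArrays, to the given device. An array already on the
* target device is returned as-is; ADTs are copied field by field, recursively.
*/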
69 | inline ObjectRef CopyTo(ObjectRef src, const DLDevice& dev) { |
70 | if (src->IsInstance<NDArray::ContainerType>()) { |
71 | auto nd_array = Downcast<NDArray>(src); |
72 | // TODO(mbs): Should respect device id also. |
73 | // TODO(vvchernov): it still does not work for different device id |
74 | // due to simple implementation of Get() and AllocDataSpace() methods |
75 | // see tvm/src/runtime/c_runtime_api.cc: L139 |
76 | // tvm/src/runtime/cpu_device_api.cc: L47 |
77 | if (nd_array->device.device_type != dev.device_type || |
78 | nd_array->device.device_id != dev.device_id) { |
79 | VLOG(2) << "copying from " << nd_array->device.device_type << "[" |
80 | << nd_array->device.device_id << "] to " << dev.device_type << "[" << dev.device_id |
81 | << "]" ; |
82 | return nd_array.CopyTo(dev); |
83 | } |
84 | return src; |
85 | } else { |
86 | ICHECK(src->IsInstance<ADTObj>()) |
87 | << "VM data must be NDArray or a list of NDArray, but received: " << src->_type_key; |
88 | std::vector<ObjectRef> ret; |
89 | ADT adt = Downcast<ADT>(src); |
90 | for (size_t i = 0; i < adt.size(); i++) { |
91 | ret.push_back(CopyTo(adt[i], dev)); |
92 | } |
93 | return ADT(adt->tag, ret.begin(), ret.end()); |
94 | } |
95 | } |
96 | |
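/*! \brief Convert a rank-0 or rank-1 shape tensor of int32/int64 dims into a shape vector. */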
97 | std::vector<int64_t> ToShape(NDArray shape_tensor) { |
98 | std::vector<int64_t> shape; |
99 | auto rank = shape_tensor.Shape().size(); |
100 | auto dtype = shape_tensor.DataType(); |
101 | |
// A rank-0 shape tensor denotes a scalar, whose shape is the empty vector.
103 | if (rank == 0) { |
104 | return shape; |
105 | } |
106 | |
107 | // Otherwise we should be rank-1, and we will extract the number of dimensions |
108 | // for the output vector. |
ICHECK_EQ(rank, 1U) << "shape tensor should be rank-1, found rank " << rank;
110 | int64_t ndim = shape_tensor.Shape().at(0); |
111 | shape.resize(ndim); |
112 | |
113 | const DLTensor* dl_tensor = shape_tensor.operator->(); |
114 | if (dtype.is_int() && dtype.bits() == 32 && dtype.lanes() == 1) { |
115 | int32_t* dims = reinterpret_cast<int32_t*>(dl_tensor->data); |
116 | shape.assign(dims, dims + ndim); |
117 | } else if (dtype.is_int() && dtype.bits() == 64 && dtype.lanes() == 1) { |
118 | int64_t* dims = reinterpret_cast<int64_t*>(dl_tensor->data); |
119 | shape.assign(dims, dims + ndim); |
120 | } else { |
121 | LOG(FATAL) << "invalid shape tensor datatype: " << dtype; |
122 | } |
123 | |
124 | return shape; |
125 | } |
126 | |
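/*! \brief Hooks called around memory-affecting instructions; no-ops here, overridable by subclasses. */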
127 | void VirtualMachine::OpStartHook(Instruction instr) {} |
128 | void VirtualMachine::OpStopHook() {} |
129 | |
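/*!
* \brief Dispatch table for the VM's packed-function API (invoke, init, set_input, get_output,
* etc.), each returned as a PackedFunc closed over this VM instance.
*/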
130 | PackedFunc VirtualMachine::GetFunction(const std::string& name, |
131 | const ObjectPtr<Object>& sptr_to_self) { |
if (name == "invoke") {
133 | return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { |
ICHECK(exec_) << "The executable has not been created yet.";
135 | |
136 | std::string func_name = args[0]; |
137 | auto git = exec_->global_map.find(func_name); |
138 | ICHECK(git != exec_->global_map.end()) |
139 | << "Cannot find function " << func_name << " in the executable" ; |
140 | auto func = exec_->functions[git->second]; |
141 | if (func.params.empty()) { |
142 | *rv = Invoke(func, {}); |
143 | } else { |
144 | auto it = inputs_.find(func_name); |
145 | ICHECK(it != inputs_.end()) << "Input has not been set for function " << func_name; |
146 | const std::vector<ObjectRef>& input_args = it->second; |
147 | if (set_outputs_enabled_.count(func_name) && set_outputs_enabled_[func_name]) { |
148 | ICHECK(outputs_.count(func_name)) |
149 | << "Outputs have not been set for function " << func_name; |
150 | *rv = Invoke(func, input_args, outputs_[func_name]); |
151 | outputs_[func_name].clear(); |
152 | set_outputs_enabled_[func_name] = false; |
153 | } else { |
154 | *rv = Invoke(func, input_args); |
155 | } |
156 | } |
157 | }); |
} else if (name == "invoke_stateful") {
// TODO(tkonolige, jroesch, tqchen): invoke_stateful and get_output are
// stop-gap measures to allow using the VM over a remote connection.
161 | return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { |
PackedFunc invoke = GetFunction("invoke", sptr_to_self);
163 | TVMRetValue rv_; |
164 | invoke.CallPacked(args, &rv_); |
165 | }); |
} else if (name == "invoke_return_to_device") {
167 | return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { |
168 | Device host{static_cast<DLDeviceType>(args[1].operator int()), args[2].operator int()}; |
169 | |
170 | SetInput(args[0].operator std::string(), args, 3); |
PackedFunc invoke = GetFunction("invoke", sptr_to_self);
172 | TVMRetValue rv_; |
173 | invoke.CallPacked(args, &rv_); // Invoke only uses the first arg, so the rest of the args |
174 | // should not cause an issue |
175 | if (rv_.type_code() == kTVMObjectHandle) { |
176 | ADT adt = Downcast<ADT>(rv_.operator ObjectRef()); |
std::vector<ObjectRef> transferred;
178 | for (size_t i = 0; i < adt.size(); i++) { |
transferred.push_back(CopyTo(adt[i], host));
180 | } |
*rv = ADT(adt.tag(), transferred);
182 | } else { |
183 | *rv = CopyTo(rv_, host); |
184 | } |
185 | }); |
} else if (name == "get_output") {
187 | return TypedPackedFunc<NDArray(int64_t)>([this](int64_t index) { |
188 | if (this->return_register_.as<ADTObj>()) { |
189 | return Downcast<NDArray>(Downcast<ADT>(this->return_register_)[index]); |
190 | } else { |
191 | CHECK_EQ(index, 0) << "VM output contains only one item, but you are trying to get the " |
<< index << "th.";
193 | return Downcast<NDArray>(this->return_register_); |
194 | } |
195 | }); |
} else if (name == "get_num_outputs") {
197 | return TypedPackedFunc<int64_t(void)>([this]() -> int64_t { |
198 | // single output is an NDArray not an ADT |
199 | if (this->return_register_.as<ADTObj>()) { |
200 | return Downcast<ADT>(this->return_register_).size(); |
201 | } else { |
202 | return 1; |
203 | } |
204 | }); |
} else if (name == "get_input_index") {
206 | return TypedPackedFunc<int64_t(std::string, std::string)>( |
207 | [this](std::string input_name, std::string func_name) { |
208 | return GetInputIndexFromVMFunction(func_name, input_name); |
209 | }); |
} else if (name == "init") {
211 | return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { |
212 | ICHECK_EQ(args.size() % 3, 0); |
213 | std::vector<Device> devices; |
214 | std::vector<AllocatorType> alloc_types; |
215 | for (int i = 0; i < args.size() / 3; ++i) { |
216 | Device dev; |
217 | int device_type = args[i * 3]; |
218 | dev.device_type = DLDeviceType(device_type); |
219 | dev.device_id = args[i * 3 + 1]; |
220 | int type = args[i * 3 + 2]; |
221 | devices.push_back(dev); |
222 | alloc_types.push_back(AllocatorType(type)); |
223 | } |
224 | this->Init(devices, alloc_types); |
225 | }); |
} else if (name == "set_input") {
227 | return PackedFunc( |
228 | [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { SetInput(args[0], args, 1); }); |
} else if (name == "set_one_input") {
230 | return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { |
231 | ICHECK_EQ(args.size(), 3) << "The expected number of arguments is 3 " |
232 | << "(func_name, index or name, tensor)" ; |
233 | SetOneInput(args[0], args[1], args[2]); |
234 | }); |
} else if (name == "set_outputs") {
236 | return PackedFunc( |
237 | [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { SetOutputs(args[0], args); }); |
} else if (name == "load_late_bound_consts") {
239 | return PackedFunc([this](TVMArgs args, TVMRetValue* rv) { |
240 | CHECK_EQ(args.size(), 1); |
241 | std::string path = args[0]; |
242 | exec_->LoadLateBoundConstantsFromFile(path); |
243 | }); |
244 | } else { |
245 | LOG(FATAL) << "Unknown packed function: " << name; |
246 | } |
247 | } |
248 | |
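/*!
* \brief Bind args[offset..] as the inputs of func_name, copying each tensor to the device its
* parameter is assigned to, and cache them for the next invocation.
*/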
249 | void VirtualMachine::SetInput(std::string func_name, TVMArgs args, int offset) { |
250 | const auto& vm_func = CheckAndGetVMFunction(func_name); |
251 | size_t params_num = vm_func.params.size(); |
252 | ICHECK_EQ(args.size() - offset, params_num) |
253 | << "The number of provided parameters doesn't match the number of arguments" ; |
254 | std::vector<ObjectRef> func_args(params_num); |
255 | for (int i = offset; i < args.size(); ++i) { |
256 | int index = i - offset; |
257 | Device dev = GetDevice(vm_func.param_device_indexes[index]); |
258 | SetInputTensorWithIndex(func_args, args[i], index, dev); |
259 | } |
260 | inputs_.erase(func_name); |
261 | inputs_.emplace(func_name, func_args); |
262 | } |
263 | |
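/*! \brief Bind a single input of func_name, addressed either by integer index or by parameter name. */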
264 | void VirtualMachine::SetOneInput(std::string func_name, const TVMArgValue& tag, |
265 | const TVMArgValue& tensor) { |
266 | const auto& vm_func = CheckAndGetVMFunction(func_name); |
267 | size_t params_num = vm_func.params.size(); |
268 | |
269 | int inp_index = 0; |
270 | if (tag.type_code() == kTVMArgInt) { |
271 | inp_index = tag; |
272 | } else if (tag.type_code() == kTVMStr) { |
273 | inp_index = static_cast<int>(GetInputIndexFromName(vm_func.params, tag)); |
274 | } else { |
LOG(FATAL) << "The type of the input tensor tag (" << tag.type_code()
<< ") is neither integer nor string";
277 | } |
278 | ICHECK_LT(inp_index, params_num); |
279 | |
280 | CreateInputsOrCheckSize(func_name, params_num); |
281 | Device dev = GetDevice(vm_func.param_device_indexes[inp_index]); |
282 | SetInputTensorWithIndex(inputs_[func_name], tensor, inp_index, dev); |
283 | } |
284 | |
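/*!
* \brief Register caller-provided tensors (args[1..]) to receive the outputs of func_name on its
* next invocation.
*/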
285 | void VirtualMachine::SetOutputs(std::string func_name, TVMArgs args) { |
286 | set_outputs_enabled_[func_name] = true; |
287 | size_t outputs_size = args.size(); |
// The first arg is func_name
ICHECK_GT(outputs_size, 1) << "No output arguments were set";
290 | |
291 | std::vector<ObjectRef> func_args(outputs_size - 1); |
292 | for (size_t i = 1; i < outputs_size; ++i) { |
293 | // TODO(vvchernov): device? |
294 | func_args[i - 1] = TensorFromTVMArgValueToObjectRef(args[i]); |
295 | } |
296 | outputs_.erase(func_name); |
297 | outputs_.emplace(func_name, func_args); |
298 | } |
299 | |
300 | void VirtualMachine::PrintInfoAndSetInputArgs(const VMFunction& func, |
301 | const std::vector<ObjectRef>& args) { |
302 | VLOG(2) << "Executing Function: " << std::endl << func; |
303 | for (int i = 0; i < static_cast<int>(devices_.size()); ++i) { |
304 | VLOG(2) << "Device " << i << " has device type " << devices_[i].device_type << " and device id " |
305 | << devices_[i].device_id |
<< (i == exec_->host_device_index ? " (used as host device)" : "");
307 | } |
308 | |
309 | InvokeGlobal(func, args); |
310 | } |
311 | |
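/*!
* \brief Write caller-provided output tensors into the registers that the output-producing
* instructions target, as identified by GetOutputTensorRegIndices.
*/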
312 | void VirtualMachine::SetOutputTensorsToRegister(const std::string& func_name, |
313 | const std::vector<ObjectRef>& outputs) { |
314 | size_t size = outputs.size(); |
315 | |
316 | if (output_tensor_reg_indices_[func_name].empty()) { |
317 | output_tensor_reg_indices_[func_name] = GetOutputTensorRegIndices(); |
318 | } |
319 | auto& reg_indices = output_tensor_reg_indices_[func_name]; |
ICHECK_EQ(reg_indices.size(), size)
<< "The number of external output tensors must equal the number of model outputs";
322 | size_t i = 0; |
323 | for (auto it = reg_indices.begin(); it != reg_indices.end(); ++it, ++i) { |
324 | WriteRegister(*it, outputs[i]); |
325 | } |
326 | } |
327 | |
328 | ObjectRef VirtualMachine::TensorFromTVMArgValueToObjectRef(const TVMArgValue& output_tensor) const { |
329 | if (output_tensor.type_code() == kTVMDLTensorHandle) { |
330 | DLTensor* dl_tensor = output_tensor; |
331 | return NDArray::FromExternalDLTensor(*dl_tensor); |
332 | } else if (output_tensor.type_code() == kTVMNDArrayHandle) { |
333 | return output_tensor.AsObjectRef<tvm::runtime::NDArray>(); |
334 | } else { |
LOG(FATAL) << "Only tensors of DLTensor or NDArray type are supported! Given type code is "
<< output_tensor.type_code();
337 | } |
338 | return ObjectRef(); |
339 | } |
340 | |
341 | int64_t VirtualMachine::GetInputIndexFromVMFunction(const std::string& func_name, |
342 | const std::string& input_name) const { |
343 | const auto& vm_func = CheckAndGetVMFunction(func_name); |
344 | return GetInputIndexFromName(vm_func.params, input_name); |
345 | } |
346 | |
347 | int64_t VirtualMachine::GetInputIndexFromName(const std::vector<std::string>& params, |
348 | const std::string& input_name) const { |
349 | // TODO(vvchernov): excess integer type? |
350 | for (uint64_t i = 0; i < params.size(); i++) { |
351 | if (input_name == params[i]) { |
352 | return static_cast<int64_t>(i); |
353 | } |
354 | } |
355 | return static_cast<int64_t>(-1); |
356 | } |
357 | |
358 | const VMFunction& VirtualMachine::CheckAndGetVMFunction(const std::string& func_name) const { |
ICHECK(exec_) << "The executable has not been created yet.";
360 | return exec_->GetVMFunctionWithName(func_name); |
361 | } |
362 | |
363 | void VirtualMachine::CreateInputsOrCheckSize(const std::string& func_name, size_t size) { |
364 | if (inputs_.count(func_name)) { |
365 | ICHECK_EQ(inputs_[func_name].size(), size) |
366 | << "The size of function" << func_name |
367 | << " doesn't match the number of provided parameters" ; |
368 | } else { |
369 | std::vector<ObjectRef> func_args(size); |
370 | inputs_.emplace(func_name, func_args); |
371 | } |
372 | } |
373 | |
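/*!
* \brief Store one input tensor at the given index, zero-copying external DLTensors when the
* device allows it and copying otherwise.
*/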
374 | void VirtualMachine::SetInputTensorWithIndex(std::vector<ObjectRef>& tensors, |
375 | const TVMArgValue& inp_tensor, int index, Device dev) { |
376 | if (inp_tensor.type_code() == kTVMDLTensorHandle) { |
377 | if (NDArray::AbilityOfZeroCopyForDLTensor(inp_tensor, dev)) { |
378 | tensors[index] = NDArray::FromExternalDLTensor(*inp_tensor); |
379 | } else { |
380 | tensors[index] = NDArray::NewFromDLTensor(inp_tensor, dev); |
381 | } |
382 | } else { |
383 | tensors[index] = CopyTo(inp_tensor, dev); |
384 | } |
385 | } |
386 | |
387 | inline Device VirtualMachine::GetDevice(Index device_index) const { |
ICHECK_GT(devices_.size(), device_index) << "invalid device index: " << device_index;
389 | return devices_[device_index]; |
390 | } |
391 | |
392 | inline Allocator* VirtualMachine::GetAllocator(Index device_index) const { |
ICHECK_GT(allocators_.size(), device_index) << "invalid device index: " << device_index;
394 | return allocators_[device_index]; |
395 | } |
396 | |
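/*! \brief Push a call frame recording the return pc, the caller's function index, and its code pointer. */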
397 | void VirtualMachine::PushFrame(Index arg_count, Index ret_pc, const VMFunction& vm_func) { |
398 | auto frame = VMFrame(ret_pc, func_index_, arg_count, code_, vm_func.register_file_size); |
399 | frames_.push_back(frame); |
400 | } |
401 | |
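/*! \brief Pop the top frame, restore the caller's pc/code/function index, and return the stack depth before popping. */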
402 | Index VirtualMachine::PopFrame() { |
403 | ICHECK_GT(frames_.size(), 0); |
404 | const VMFrame& fr = frames_.back(); |
405 | func_index_ = fr.func_index; |
406 | code_ = fr.code; |
407 | pc_ = fr.pc; |
408 | auto call_stack_size = frames_.size(); |
409 | frames_.pop_back(); |
410 | return call_stack_size; |
411 | } |
412 | |
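/*!
* \brief Begin executing a global function: push its frame, write the arguments into its
* registers, and point pc_ at its first instruction. RunLoop performs the actual execution.
*/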
413 | void VirtualMachine::InvokeGlobal(const VMFunction& func, const std::vector<ObjectRef>& args) { |
VLOG(2) << "Invoking global " << func.name << " with " << args.size() << " args";
415 | |
416 | PushFrame(func.params.size(), this->pc_ + 1, func); |
417 | for (size_t i = 0; i < args.size(); ++i) { |
418 | WriteRegister(i, args[i]); |
419 | VLOG(2) << "arg " << i << " = " |
420 | << RuntimeObject2String(args[i], GetDevice(exec_->host_device_index)); |
421 | } |
422 | |
423 | code_ = func.instructions.data(); |
424 | pc_ = 0; |
425 | } |
426 | |
427 | ObjectRef VirtualMachine::Invoke(const VMFunction& func, const std::vector<ObjectRef>& args) { |
428 | PrintInfoAndSetInputArgs(func, args); |
429 | RunLoop(); |
430 | return return_register_; |
431 | } |
432 | |
433 | ObjectRef VirtualMachine::Invoke(const std::string& name, const std::vector<ObjectRef>& args) { |
ICHECK(exec_) << "The executable has not been created yet.";
auto it = exec_->global_map.find(name);
ICHECK(it != exec_->global_map.end()) << "Cannot find function " << name << " in the executable";
437 | Index func_index = it->second; |
438 | VLOG(2) << "Invoke Global " << name << " at index " << func_index; |
439 | return Invoke(exec_->functions[func_index], args); |
440 | } |
441 | |
442 | ObjectRef VirtualMachine::Invoke(const VMFunction& func, const std::vector<ObjectRef>& input_args, |
443 | const std::vector<ObjectRef>& output_args) { |
444 | PrintInfoAndSetInputArgs(func, input_args); |
445 | SetOutputTensorsToRegister(func.name, output_args); |
446 | RunLoop(output_tensor_reg_indices_[func.name]); |
447 | return return_register_; |
448 | } |
449 | |
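/*!
* \brief Marshal VM objects into TVM's packed calling convention (flattening ADTs into their
* constituent NDArrays) and call the kernel. The call is skipped when the single output tensor
* has a zero-length dimension, since there is nothing to compute.
*/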
450 | void VirtualMachine::InvokePacked(Index packed_index, const PackedFunc& func, Index arg_count, |
451 | Index output_size, const std::vector<ObjectRef>& args) { |
452 | size_t arity = 0; |
453 | for (Index i = 0; i < arg_count; i++) { |
454 | if (const auto* obj = args[i].as<ADTObj>()) { |
455 | arity += obj->size; |
456 | } else { |
457 | ++arity; |
458 | } |
459 | } |
460 | |
461 | std::vector<TVMValue> values(arity); |
462 | std::vector<int> codes(arity); |
463 | runtime::TVMArgsSetter setter(values.data(), codes.data()); |
464 | int idx = 0; |
465 | bool is_empty_output = false; |
466 | for (Index i = 0; i < arg_count; i++) { |
467 | if (const auto* dt_cell = args[i].as<ADTObj>()) { |
468 | for (size_t fi = 0; fi < dt_cell->size; ++fi) { |
469 | auto obj = (*dt_cell)[fi]; |
470 | auto nd_array = Downcast<NDArray>(obj); |
471 | setter(idx++, nd_array); |
472 | } |
473 | } else { |
474 | auto nd_array = Downcast<NDArray>(args[i]); |
475 | // We can safely skip CallPacked if there is only one |
476 | // output and it is empty. |
477 | if (i == arg_count - 1 && output_size == 1) { |
478 | for (const auto& dim : nd_array.Shape()) { |
479 | if (!dim) { |
480 | is_empty_output = true; |
481 | break; |
482 | } |
483 | } |
484 | } |
485 | setter(idx++, nd_array); |
486 | } |
487 | } |
488 | |
489 | if (!is_empty_output) { |
490 | TVMRetValue rv; |
491 | func.CallPacked(TVMArgs(values.data(), codes.data(), arity), &rv); |
492 | } |
493 | } |
494 | |
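/*!
* \brief Attach an executable and resolve every name in its primitive_map to a PackedFunc from
* the generated kernel library (including its imports).
*/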
495 | void VirtualMachine::LoadExecutable(const ObjectPtr<Executable>& exec) { |
ICHECK(exec) << "The executable has not been created yet.";
ICHECK(exec->late_bound_constant_names.empty())
<< "Late-bound constants must be loaded before creating the VM";
499 | exec_ = exec; |
500 | |
501 | runtime::Module lib = exec_->GetLib(); |
502 | |
503 | ICHECK(exec_->primitive_map.empty() || lib.operator->()) |
504 | << "If the executable has declared primitive functions, the " |
505 | << "generated kernel library must non-be null." ; |
506 | |
507 | for (const auto& it : exec_->primitive_map) { |
508 | const auto& packed_name = it.first; |
509 | auto packed_index = static_cast<size_t>(it.second); |
510 | if (packed_funcs_.size() <= packed_index) { |
511 | packed_funcs_.resize(packed_index + 1); |
512 | } |
513 | tvm::runtime::PackedFunc pf = lib.GetFunction(packed_name, /*query_imports=*/true); |
514 | ICHECK(pf != nullptr) << "Cannot find function in module: " << packed_name; |
515 | packed_funcs_[packed_index] = pf; |
516 | } |
517 | for (size_t i = 0; i < packed_funcs_.size(); ++i) { |
ICHECK(packed_funcs_[i] != nullptr) << "Packed function " << i << " is not initialized";
519 | } |
520 | } |
521 | |
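/*!
* \brief Bind each virtual device required by the executable to one of the given physical
* devices (matched by device type), and create an allocator of the requested type for it.
*/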
522 | void VirtualMachine::Init(const std::vector<Device>& physical_devices, |
523 | const std::vector<AllocatorType>& alloc_types) { |
524 | ICHECK_EQ(physical_devices.size(), alloc_types.size()); |
525 | |
526 | // Find a physical device to represent each virtual device the VM code requires. |
527 | // (Recall the VM instructions refer to devices by "device index" into this vector of |
528 | // virtual devices.) |
529 | const size_t num_virtual_devices = exec_->virtual_devices.size(); |
530 | devices_.reserve(num_virtual_devices); |
531 | allocators_.reserve(num_virtual_devices); |
532 | |
533 | for (size_t device_index = 0; device_index < num_virtual_devices; ++device_index) { |
534 | // We'll retain the legacy behaviour and just match by device type. |
535 | // TODO(mbs): Generalize. |
536 | DLDeviceType virtual_device_type = exec_->virtual_devices[device_index].device_type; |
537 | auto itr = std::find_if(physical_devices.begin(), physical_devices.end(), |
538 | [virtual_device_type](const Device& physical_device) { |
539 | return physical_device.device_type == virtual_device_type; |
540 | }); |
541 | CHECK(itr != physical_devices.end()) |
542 | << "Unable to find a physical device (from among the " << physical_devices.size() |
543 | << " given) to match the virtual device with device type " << virtual_device_type; |
544 | const size_t i = std::distance(physical_devices.begin(), itr); |
545 | devices_.push_back(*itr); |
546 | allocators_.push_back(MemoryManager::GetOrCreateAllocator(*itr, alloc_types[i])); |
547 | } |
548 | } |
549 | |
550 | inline void VirtualMachine::WriteRegister(Index r, const ObjectRef& val) { |
551 | frames_.back().register_file[r] = val; |
552 | } |
553 | |
554 | ObjectRef VirtualMachine::ReadRegister(Index r) const { return frames_.back().register_file[r]; } |
555 | |
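/*!
* \brief Read a scalar integer (1 to 64 bits wide) out of the NDArray held in register r,
* copying it to the host device first if necessary.
*/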
556 | int64_t VirtualMachine::LoadScalarInt(Index r) const { |
557 | int64_t result = 0; |
558 | const auto& obj = ReadRegister(r); |
559 | NDArray array = Downcast<NDArray>(CopyTo(obj, GetDevice(exec_->host_device_index))); |
560 | |
561 | switch (array->dtype.bits) { |
562 | case 1: { |
563 | result = reinterpret_cast<bool*>(array->data)[0]; |
564 | break; |
565 | } |
566 | case 8: { |
567 | result = reinterpret_cast<int8_t*>(array->data)[0]; |
568 | break; |
569 | } |
570 | case 16: { |
571 | result = reinterpret_cast<int16_t*>(array->data)[0]; |
572 | break; |
573 | } |
574 | case 32: { |
575 | result = reinterpret_cast<int32_t*>(array->data)[0]; |
576 | break; |
577 | } |
578 | case 64: { |
579 | result = reinterpret_cast<int64_t*>(array->data)[0]; |
580 | break; |
581 | } |
582 | default: |
583 | LOG(FATAL) << "Unknown scalar int type: " << DLDataType2String(array->dtype); |
584 | } |
585 | return result; |
586 | } |
587 | |
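/*! \brief Scan the current function's code for its Ret instruction and return the register it returns. */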
588 | Index VirtualMachine::GetResultRegisterIndex() const { |
589 | Index op_index = 0; |
590 | while (code_[op_index].op != Opcode::Ret) { |
591 | ++op_index; |
592 | } |
593 | |
594 | return code_[op_index].result; |
595 | } |
596 | |
597 | void VirtualMachine::CalculatePreResultOpIndex(Index res_index) { |
598 | if (preresult_op_index_ == -1) { |
599 | preresult_op_index_ = 0; |
600 | while (code_[preresult_op_index_].dst != res_index) { |
601 | ++preresult_op_index_; |
602 | } |
603 | } |
604 | } |
605 | |
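/*!
* \brief Determine which registers will hold the output tensors by inspecting the instruction
* that produces the result register (AllocTensor, AllocADT, or ReshapeTensor).
*/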
606 | std::vector<Index> VirtualMachine::GetOutputTensorRegIndices() { |
607 | std::vector<Index> reg_indices; |
608 | Index res_index = GetResultRegisterIndex(); |
609 | CalculatePreResultOpIndex(res_index); |
610 | auto& preres_instr = code_[preresult_op_index_]; |
611 | auto op_code = preres_instr.op; |
612 | if (op_code == Opcode::AllocTensor) { |
613 | reg_indices.emplace_back(res_index); |
614 | } else if (op_code == Opcode::AllocADT) { |
615 | for (Index i = 0; i < preres_instr.num_fields; ++i) { |
616 | reg_indices.push_back(preres_instr.datatype_fields[i]); |
617 | } |
618 | } else if (op_code == Opcode::ReshapeTensor) { |
619 | reg_indices.push_back(preres_instr.reshape_tensor.tensor); |
620 | } else { |
LOG(FATAL) << "Operation " << size_t(op_code) << " is not supported for set_outputs method";
622 | } |
623 | return reg_indices; |
624 | } |
625 | |
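/*!
* \brief The VM dispatch loop: execute instructions starting at pc_ until the frame that entered
* the loop returns. When output_tensor_reg_indices is non-empty, allocations into those
* registers are redirected to the caller-provided output tensors.
*/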
626 | void VirtualMachine::RunLoop(const std::vector<Index>& output_tensor_reg_indices) { |
627 | ICHECK(this->exec_); |
628 | ICHECK(this->code_); |
629 | pc_ = 0; |
630 | Index frame_start = frames_.size(); |
631 | while (true) { |
632 | main_loop: |
633 | auto const& instr = code_[this->pc_]; |
634 | VLOG(2) << "Executing(" << pc_ << "): " << instr; |
635 | |
636 | switch (instr.op) { |
637 | case Opcode::Move: { |
638 | ObjectRef from_obj; |
639 | from_obj = ReadRegister(instr.from); |
640 | WriteRegister(instr.dst, from_obj); |
641 | pc_++; |
642 | goto main_loop; |
643 | } |
644 | case Opcode::Fatal: { |
throw std::runtime_error("VM encountered fatal error");
646 | } |
647 | case Opcode::LoadConst: { |
648 | bool is_not_cached = const_pool_.size() <= static_cast<size_t>(instr.const_index) || |
649 | !const_pool_[instr.const_index].defined(); |
650 | if (is_not_cached) { |
651 | OpStartHook(instr); |
652 | } |
653 | auto constant_obj = exec_->constants[instr.const_index]; |
// We cache the allocated object in the constant pool. When measuring
// performance, the first iteration sets the pool up and subsequent
// iterations directly reuse the allocated objects.
657 | if (const_pool_.size() <= static_cast<size_t>(instr.const_index)) { |
658 | const_pool_.resize(instr.const_index + 1); |
659 | } |
660 | |
661 | if (!const_pool_[instr.const_index].defined()) { |
662 | Device dev = GetDevice(exec_->const_device_indexes[instr.const_index]); |
663 | const_pool_[instr.const_index] = CopyTo(constant_obj, dev); |
664 | } |
665 | WriteRegister(instr.dst, const_pool_[instr.const_index]); |
666 | if (is_not_cached) { |
667 | OpStopHook(); |
668 | } |
669 | pc_++; |
670 | goto main_loop; |
671 | } |
672 | case Opcode::LoadConsti: { |
673 | auto tensor = NDArray::Empty({1}, {kDLInt, 64, 1}, GetDevice(exec_->host_device_index)); |
674 | reinterpret_cast<int64_t*>(tensor->data)[0] = instr.load_consti.val; |
675 | WriteRegister(instr.dst, tensor); |
676 | pc_++; |
677 | goto main_loop; |
678 | } |
679 | case Opcode::Invoke: { |
680 | std::vector<ObjectRef> args; |
681 | for (Index i = 0; i < instr.num_args; ++i) { |
682 | args.push_back(ReadRegister(instr.invoke_args_registers[i])); |
683 | } |
684 | InvokeGlobal(exec_->functions[instr.func_index], args); |
685 | frames_.back().caller_return_register = instr.dst; |
686 | goto main_loop; |
687 | } |
688 | case Opcode::InvokePacked: { |
ICHECK_LT(instr.packed_index, packed_funcs_.size());
690 | const auto& func = packed_funcs_[instr.packed_index]; |
691 | const auto& arity = instr.arity; |
692 | std::vector<ObjectRef> args; |
693 | for (Index i = 0; i < arity; ++i) { |
694 | auto arg = ReadRegister(instr.packed_args[i]); |
695 | args.push_back(arg); |
696 | #if TVM_LOG_DEBUG |
const bool is_input = i < arity - instr.output_size;
VLOG(2) << (is_input ? "input" : "placeholder") << " arg " << i << " = "
<< RuntimeObject2String(arg, GetDevice(exec_->host_device_index),
/*show_contents=*/is_input);
703 | #endif |
704 | } |
705 | |
706 | // We no longer need to write the registers back, we write directly |
707 | // through the registers mutably. |
708 | InvokePacked(instr.packed_index, func, arity, instr.output_size, args); |
709 | |
710 | #if TVM_LOG_DEBUG |
711 | for (Index i = arity - instr.output_size; i < arity; ++i) { |
712 | auto arg = ReadRegister(instr.packed_args[i]); |
713 | VLOG(2) << "output arg " << i << " = " |
714 | << RuntimeObject2String(arg, GetDevice(exec_->host_device_index)); |
715 | } |
716 | #endif |
717 | |
718 | pc_++; |
719 | goto main_loop; |
720 | } |
721 | case Opcode::InvokeClosure: { |
722 | auto object = ReadRegister(instr.closure); |
723 | const auto* closure = object.as<VMClosureObj>(); |
724 | ICHECK(closure); |
725 | std::vector<ObjectRef> args; |
726 | for (auto free_var : closure->free_vars) { |
727 | args.push_back(free_var); |
728 | } |
729 | for (Index i = 0; i < instr.num_closure_args; ++i) { |
730 | args.push_back(ReadRegister(instr.closure_args[i])); |
731 | } |
732 | InvokeGlobal(exec_->functions[closure->func_index], args); |
733 | frames_.back().caller_return_register = instr.dst; |
734 | goto main_loop; |
735 | } |
736 | case Opcode::GetField: { |
737 | auto object = ReadRegister(instr.object); |
738 | const auto& tuple = Downcast<ADT>(object); |
739 | auto field = tuple[instr.field_index]; |
740 | WriteRegister(instr.dst, field); |
741 | pc_++; |
742 | goto main_loop; |
743 | } |
744 | case Opcode::GetTag: { |
745 | auto object = ReadRegister(instr.get_tag.object); |
746 | const auto& adt = Downcast<ADT>(object); |
747 | auto tag = adt.tag(); |
748 | auto tag_tensor = NDArray::Empty({1}, {kDLInt, 32, 1}, GetDevice(exec_->host_device_index)); |
749 | reinterpret_cast<int32_t*>(tag_tensor->data)[0] = tag; |
750 | WriteRegister(instr.dst, tag_tensor); |
751 | pc_++; |
752 | goto main_loop; |
753 | } |
754 | case Opcode::Goto: { |
755 | pc_ += instr.pc_offset; |
756 | goto main_loop; |
757 | } |
758 | case Opcode::If: { |
759 | int32_t test_val = LoadScalarInt(instr.if_op.test); |
760 | int32_t target_val = LoadScalarInt(instr.if_op.target); |
761 | |
762 | if (test_val == target_val) { |
763 | ICHECK_NE(instr.if_op.true_offset, 0); |
764 | pc_ += instr.if_op.true_offset; |
765 | } else { |
766 | ICHECK_NE(instr.if_op.false_offset, 0); |
767 | pc_ += instr.if_op.false_offset; |
768 | } |
769 | |
770 | goto main_loop; |
771 | } |
772 | case Opcode::AllocTensor: { |
773 | OpStartHook(instr); |
774 | if (!output_tensor_reg_indices.empty() && FindIndex(output_tensor_reg_indices, instr.dst)) { |
775 | WriteAllocatedTensorFromOutside(instr); |
776 | } else { |
777 | WriteAllocatedTensor(instr); |
778 | } |
779 | OpStopHook(); |
780 | pc_++; |
781 | goto main_loop; |
782 | } |
783 | case Opcode::AllocTensorReg: { |
784 | OpStartHook(instr); |
785 | Device cpu_dev = GetDevice(exec_->host_device_index); |
786 | auto shape_obj = ReadRegister(instr.alloc_tensor_reg.shape_register); |
787 | NDArray shape_tensor = Downcast<NDArray>(CopyTo(shape_obj, cpu_dev)); |
788 | auto shape = ToShape(shape_tensor); |
789 | auto storage_obj = ReadRegister(instr.alloc_tensor_reg.storage); |
790 | auto storage = Downcast<Storage>(storage_obj); |
auto offset = LoadScalarInt(instr.alloc_tensor_reg.offset);
792 | auto obj = storage->AllocNDArray(offset, shape, instr.alloc_tensor_reg.dtype); |
793 | VLOG(2) << "allocated " |
794 | << RuntimeObject2String(obj, GetDevice(exec_->host_device_index), |
795 | /*show_contents=*/false); |
796 | |
797 | WriteRegister(instr.dst, obj); |
798 | OpStopHook(); |
799 | pc_++; |
800 | goto main_loop; |
801 | } |
802 | case Opcode::AllocADT: { |
803 | std::vector<ObjectRef> fields; |
804 | for (Index i = 0; i < instr.num_fields; ++i) { |
805 | fields.push_back(ReadRegister(instr.datatype_fields[i])); |
806 | } |
807 | ObjectRef obj = ADT(instr.constructor_tag, fields); |
808 | WriteRegister(instr.dst, obj); |
809 | pc_++; |
810 | goto main_loop; |
811 | } |
812 | case Opcode::AllocClosure: { |
813 | std::vector<ObjectRef> free_vars; |
814 | for (Index i = 0; i < instr.num_freevar; i++) { |
815 | free_vars.push_back(ReadRegister(instr.free_vars[i])); |
816 | } |
817 | WriteRegister(instr.dst, VMClosure(instr.func_index, free_vars)); |
818 | pc_++; |
819 | goto main_loop; |
820 | } |
821 | case Opcode::AllocStorage: { |
822 | OpStartHook(instr); |
823 | auto size = LoadScalarInt(instr.alloc_storage.allocation_size); |
824 | auto alignment = instr.alloc_storage.alignment; |
825 | |
826 | auto storage_obj = SimpleObjAllocator().make_object<StorageObj>(); |
827 | Allocator* allocator = GetAllocator(instr.alloc_storage.device_index); |
ICHECK(allocator) << "Did you forget to init the VirtualMachine with devices?";
829 | VLOG(2) << "allocating with allocation_size=" << size << ", alignment=" << alignment |
830 | << ", dtype_hint=" << DLDataType2String(instr.alloc_storage.dtype_hint) |
831 | << ", device_index=" << instr.alloc_storage.device_index; |
832 | |
833 | storage_obj->buffer = allocator->Alloc(size, alignment, instr.alloc_storage.dtype_hint); |
834 | Storage storage(storage_obj); |
835 | WriteRegister(instr.dst, storage); |
836 | OpStopHook(); |
837 | pc_++; |
838 | goto main_loop; |
839 | } |
840 | case Opcode::ShapeOf: { |
841 | auto input = ReadRegister(instr.shape_of.tensor); |
842 | NDArray input_array = Downcast<NDArray>(input); |
843 | int ndim = input_array->ndim; |
844 | auto out_tensor = |
845 | NDArray::Empty({ndim}, {kDLInt, 64, 1}, GetDevice(exec_->host_device_index)); |
846 | for (int i = 0; i < ndim; ++i) { |
847 | reinterpret_cast<int64_t*>(out_tensor->data)[i] = input_array->shape[i]; |
848 | } |
849 | VLOG(2) << "shape = " |
850 | << RuntimeObject2String(out_tensor, GetDevice(exec_->host_device_index)); |
851 | WriteRegister(instr.dst, out_tensor); |
852 | pc_++; |
853 | goto main_loop; |
854 | } |
855 | case Opcode::Ret: { |
856 | // If we have hit the point from which we started |
857 | // running, we should return to the caller breaking |
858 | // the dispatch loop. |
859 | return_register_ = ReadRegister(instr.result); |
860 | auto caller_return_register = frames_.back().caller_return_register; |
861 | |
862 | if (PopFrame() == frame_start) { |
863 | return; |
864 | // Otherwise we are just returning from a local call. |
865 | } else { |
866 | WriteRegister(caller_return_register, return_register_); |
867 | goto main_loop; |
868 | } |
869 | } |
870 | case Opcode::ReshapeTensor: { |
871 | OpStartHook(instr); |
872 | Device cpu_dev = GetDevice(exec_->host_device_index); |
873 | auto tensor_obj = ReadRegister(instr.reshape_tensor.tensor); |
874 | NDArray tensor_arr = Downcast<NDArray>(tensor_obj); |
875 | // Read the shape from shape tensor |
876 | auto shape_obj = ReadRegister(instr.reshape_tensor.newshape); |
877 | NDArray shape_tensor = Downcast<NDArray>(CopyTo(shape_obj, cpu_dev)); |
878 | const DLTensor* dl_tensor = shape_tensor.operator->(); |
879 | ICHECK_EQ(dl_tensor->dtype.code, 0u); |
880 | ICHECK_EQ(dl_tensor->dtype.bits, 64u); |
881 | int64_t* dims = reinterpret_cast<int64_t*>(dl_tensor->data); |
882 | int64_t ndim = shape_tensor->shape[0]; |
883 | std::vector<int64_t> shape(dims, dims + ndim); |
884 | // Reshape the input tensor |
885 | auto out_tensor = tensor_arr.CreateView(shape, tensor_arr->dtype); |
886 | VLOG(2) << "reshaped " |
887 | << RuntimeObject2String(tensor_obj, GetDevice(exec_->host_device_index)) << " to " |
888 | << RuntimeObject2String(out_tensor, GetDevice(exec_->host_device_index)); |
889 | WriteRegister(instr.dst, out_tensor); |
890 | OpStopHook(); |
891 | pc_++; |
892 | goto main_loop; |
893 | } |
894 | case Opcode::DeviceCopy: { |
895 | OpStartHook(instr); |
896 | auto tensor_src = ReadRegister(instr.device_copy.src); |
897 | NDArray src_data = Downcast<NDArray>(tensor_src); |
898 | Device actual_src_dev = src_data->device; |
899 | Device inst_src_dev = GetDevice(instr.device_copy.src_device_index); |
900 | ICHECK_EQ(actual_src_dev.device_type, inst_src_dev.device_type); |
901 | ICHECK_EQ(actual_src_dev.device_id, inst_src_dev.device_id); |
902 | Device dst_dev = GetDevice(instr.device_copy.dst_device_index); |
903 | |
904 | NDArray dst_data = src_data.CopyTo(dst_dev); |
905 | WriteRegister(instr.dst, dst_data); |
906 | OpStopHook(); |
907 | pc_++; |
908 | goto main_loop; |
909 | } |
910 | case Opcode::KillRegister: { |
911 | OpStartHook(instr); |
912 | WriteRegister(instr.dst, ObjectRef()); |
913 | OpStopHook(); |
914 | pc_++; |
915 | goto main_loop; |
916 | } |
917 | default: |
918 | LOG(FATAL) << "Unknown instruction opcode: " << int(instr.op); |
919 | } |
920 | } |
921 | } |
922 | |
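/*!
* \brief Allocate an NDArray with the instruction's static shape out of its storage and write it
* to the destination register.
*/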
923 | void VirtualMachine::WriteAllocatedTensor(const Instruction& instr) { |
924 | auto shape = std::vector<int64_t>(instr.alloc_tensor.ndim); |
925 | |
926 | for (uint32_t i = 0; i < instr.alloc_tensor.ndim; ++i) { |
927 | shape[i] = instr.alloc_tensor.shape[i]; |
928 | } |
929 | |
930 | auto storage_obj = ReadRegister(instr.alloc_tensor.storage); |
931 | auto offset = LoadScalarInt(instr.alloc_tensor.offset); |
932 | auto storage = Downcast<Storage>(storage_obj); |
933 | auto obj = storage->AllocNDArray(offset, shape, instr.alloc_tensor.dtype); |
934 | VLOG(2) << "allocated " |
935 | << RuntimeObject2String(obj, GetDevice(exec_->host_device_index), |
936 | /*show_contents=*/false); |
937 | |
938 | WriteRegister(instr.dst, obj); |
939 | } |
940 | |
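/*!
* \brief Validate a caller-provided output tensor against the shape and dtype the instruction
* would have allocated. If only the shapes differ but the element counts match (the
* ReshapeTensor case), replace the register with a reshaped view of the external tensor.
*/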
941 | void VirtualMachine::WriteAllocatedTensorFromOutside(const Instruction& instr) { |
// The external tensor(s) have already been written to the register (instr.dst)
943 | auto ex_arr = Downcast<NDArray>(ReadRegister(instr.dst)); |
944 | auto ex_shape = ex_arr.Shape(); |
945 | auto ex_size = ex_shape.size(); |
946 | auto ex_dtype = ex_arr->dtype; |
947 | |
948 | auto in_size = instr.alloc_tensor.ndim; |
949 | auto in_dtype = instr.alloc_tensor.dtype; |
ICHECK(TypeEqual(in_dtype, ex_dtype))
<< "Data types of internal and external output tensors do not match";
952 | |
953 | bool size_check = false; |
954 | if (ex_size != in_size) { |
955 | size_check = true; |
956 | } else { |
957 | for (size_t i = 0; i < in_size; ++i) { |
958 | if (ex_shape[i] != instr.alloc_tensor.shape[i]) { |
959 | size_check = true; |
960 | break; |
961 | } |
962 | } |
963 | } |
964 | |
965 | if (size_check) { |
// Shapes differ; check that the total element counts still match
967 | size_t in_el_num = 1, ex_el_num = 1; |
968 | for (size_t i = 0; i < ex_size; ++i) { |
969 | ex_el_num *= ex_shape[i]; |
970 | } |
971 | for (size_t i = 0; i < in_size; ++i) { |
972 | in_el_num *= instr.alloc_tensor.shape[i]; |
973 | } |
ICHECK_EQ(in_el_num, ex_el_num)
<< "Element counts of internal and external output tensors do not match";
976 | if (code_[preresult_op_index_].op == Opcode::ReshapeTensor) { |
977 | int64_t* dims = instr.alloc_tensor.shape; |
978 | std::vector<int64_t> ref_shape(dims, dims + int64_t(in_size)); |
979 | auto reshaped_tensor = ex_arr.CreateView(ref_shape, ex_dtype); |
980 | WriteRegister(instr.dst, reshaped_tensor); |
981 | } else { |
LOG(FATAL) << "Internal and external output tensor shapes do not match";
983 | } |
984 | } |
985 | } |
986 | |
987 | bool VirtualMachine::FindIndex(const std::vector<Index>& indices, Index val) const { |
988 | auto it = std::find(indices.begin(), indices.end(), val); |
989 | return it != indices.end(); |
990 | } |
991 | |
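/*! \brief Wrap an Executable in a newly constructed VirtualMachine module. */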
992 | runtime::Module CreateVirtualMachine(Executable* exec) { |
993 | auto vm = make_object<VirtualMachine>(); |
994 | vm->LoadExecutable(GetObjectPtr<Executable>(exec)); |
995 | return runtime::Module(vm); |
996 | } |
997 | |
TVM_REGISTER_GLOBAL("runtime._VirtualMachine").set_body([](TVMArgs args, TVMRetValue* rv) {
999 | runtime::Module mod = args[0]; |
auto* exec = dynamic_cast<Executable*>(mod.operator->());
ICHECK(exec) << "The first argument must be a VM Executable module";
*rv = CreateVirtualMachine(exec);
1002 | }); |
1003 | |
1004 | } // namespace vm |
1005 | } // namespace runtime |
1006 | } // namespace tvm |
1007 | |