/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * \file src/runtime/vm/profiler/vm.cc
 * \brief The Relay debug virtual machine.
 */

#include "vm.h"

#include <tvm/runtime/container/adt.h>
#include <tvm/runtime/data_type.h>
#include <tvm/runtime/registry.h>

#include <algorithm>
#include <chrono>
#include <iomanip>
#include <memory>
#include <numeric>
#include <optional>
#include <sstream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

namespace tvm {
namespace runtime {
namespace vm {

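/*!
 * \brief Return the PackedFunc implementing the given function name. Handles
 * the profiling-specific "profile" and "profile_rpc" functions here and
 * defers everything else to the base VirtualMachine.
 */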
PackedFunc VirtualMachineDebug::GetFunction(const std::string& name,
                                            const ObjectPtr<Object>& sptr_to_self) {
  if (name == "profile") {
    return TypedPackedFunc<profiling::Report(String, Array<profiling::MetricCollector>)>(
        [sptr_to_self, this](String arg_name, Array<profiling::MetricCollector> collectors) {
          std::vector<Device> devices;
          for (auto dev : devices_) {
            if (dev.device_type > 0) {
              devices.push_back(dev);
            }
          }

          // We cannot send Arrays over RPC, so in order to support profiling
          // on remotes, we accept a nullptr for collectors.
          if (collectors.defined()) {
            std::vector<profiling::MetricCollector> cs(collectors.begin(), collectors.end());
            prof_ = profiling::Profiler(devices, cs, {{String("Executor"), String("VM")}});
          } else {
            prof_ = profiling::Profiler(devices, {}, {{String("Executor"), String("VM")}});
          }

          auto invoke = VirtualMachine::GetFunction("invoke", sptr_to_self);
          // Warm up the VM so that one-time setup costs do not skew the
          // measured run.
          for (int i = 0; i < 3; i++) {
            invoke(arg_name);
          }

          prof_->Start();
          invoke(arg_name);
          prof_->Stop();
          auto report = prof_->Report();
          prof_ = std::nullopt;  // releases hardware counters
          return report;
        });
  } else if (name == "profile_rpc") {
    // We cannot return a Report over RPC because the TVM RPC mechanism only
    // supports a subset of Object classes. Instead we serialize it on the
    // remote (here) and deserialize it on the other end.
    return TypedPackedFunc<std::string(std::string)>([sptr_to_self, this](std::string arg_name) {
      PackedFunc profile = GetFunction("profile", sptr_to_self);
      profiling::Report report = profile(arg_name, Array<profiling::MetricCollector>());
      return report->AsJSON();
    });
  } else {
    return VirtualMachine::GetFunction(name, sptr_to_self);
  }
}

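/*!
 * \brief Load an executable and record the reverse mapping from packed
 * function index back to primitive name, used to label profiler entries.
 */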
void VirtualMachineDebug::LoadExecutable(const ObjectPtr<Executable>& exec) {
  VirtualMachine::LoadExecutable(exec);
  for (auto kv : exec_->primitive_map) {
    packed_index_map_[kv.second] = kv.first;
  }
}

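/*!
 * \brief Open a profiler call record for a VM built-in op (const load, device
 * copy, reshape, tensor/storage allocation) just before it executes.
 */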
void VirtualMachineDebug::OpStartHook(Instruction instr) {
  if (prof_ && prof_->IsRunning()) {
    if (instr.op == Opcode::LoadConst) {
      Device dev = GetDevice(exec_->const_device_indexes[instr.const_index]);
      prof_->StartCall("VM::LoadConst", dev, {});
    } else if (instr.op == Opcode::DeviceCopy) {
      Device dst_dev = GetDevice(instr.device_copy.dst_device_index);
      prof_->StartCall("VM::DeviceCopy", dst_dev, {});
    } else if (instr.op == Opcode::ReshapeTensor) {
      prof_->StartCall("VM::ReshapeTensor", devices_[exec_->host_device_index], {});
    } else if (instr.op == Opcode::AllocTensor) {
      auto shape = std::vector<int64_t>(instr.alloc_tensor.ndim);

      for (uint32_t i = 0; i < instr.alloc_tensor.ndim; ++i) {
        shape[i] = instr.alloc_tensor.shape[i];
      }
      auto storage_obj = ReadRegister(instr.alloc_tensor.storage);
      auto storage = Downcast<Storage>(storage_obj);
      prof_->StartCall(
          "VM::AllocTensor", storage->buffer.device,
          {{"Argument Shapes", profiling::ShapeString(shape, instr.alloc_tensor.dtype)}});
    } else if (instr.op == Opcode::AllocTensorReg) {
      auto storage_obj = ReadRegister(instr.alloc_tensor_reg.storage);
      auto storage = Downcast<Storage>(storage_obj);
      Device cpu_dev = GetDevice(exec_->host_device_index);
      auto shape_obj = ReadRegister(instr.alloc_tensor_reg.shape_register);
      NDArray shape_tensor = Downcast<NDArray>(shape_obj).CopyTo(cpu_dev);
      prof_->StartCall(
          "VM::AllocTensorReg", storage->buffer.device,
          {{"Argument Shapes",
            profiling::ShapeString(shape_tensor, instr.alloc_tensor_reg.dtype)}});
    } else if (instr.op == Opcode::AllocStorage) {
      auto size = LoadScalarInt(instr.alloc_storage.allocation_size);
      std::ostringstream shape;
      shape << DLDataType2String(instr.alloc_storage.dtype_hint) << "[" << size << "]";
      Device dev = GetDevice(instr.alloc_storage.device_index);
      prof_->StartCall("VM::AllocStorage", dev, {{"Argument Shapes", String(shape.str())}});
    } else {
      prof_->StartCall("VM::UnknownOp", GetDevice(exec_->host_device_index), {});
    }
  }
}

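/*! \brief Close the call record opened by OpStartHook, if profiling is active. */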
void VirtualMachineDebug::OpStopHook() {
  if (prof_ && prof_->IsRunning()) {
    prof_->StopCall();
  }
}

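/*!
 * \brief Invoke a packed operator, wrapping the call in a profiler record that
 * captures the operator name, device, argument shapes, and selected attributes.
 */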
void VirtualMachineDebug::InvokePacked(Index packed_index, const PackedFunc& func, Index arg_count,
                                       Index output_size, const std::vector<ObjectRef>& args) {
  ICHECK(exec_);
  ICHECK(!devices_.empty()) << "Device has not been initialized yet.";
  if (prof_ && prof_->IsRunning()) {
    // The device of the first tensor argument is used for synchronization.
    ICHECK_GT(arg_count, 0U);
    ObjectRef arg = args[0];
    while (arg->IsInstance<ADTObj>()) {
      ADT adt = Downcast<ADT>(arg);
      arg = adt[0];
    }
    ICHECK(arg->IsInstance<NDArray::ContainerType>());
    auto nd_array = Downcast<NDArray>(arg);
    auto dev = nd_array->device;

    // Gather the shapes of all arguments for the report, flattening ADTs.
    std::vector<NDArray> shapes;
    for (Index i = 0; i < arg_count; i++) {
      if (const auto* obj = args[i].as<ADTObj>()) {
        for (size_t fi = 0; fi < obj->size; ++fi) {
          auto o = (*obj)[fi];
          shapes.push_back(Downcast<NDArray>(o));
        }
      } else {
        shapes.push_back(Downcast<NDArray>(args[i]));
      }
    }

    std::unordered_map<std::string, ObjectRef> metrics;

    ICHECK(exec_->op_attrs.find(packed_index) != exec_->op_attrs.end())
        << packed_index_map_[packed_index] << " not found in op attrs";

    auto& op_attrs = exec_->op_attrs.at(packed_index);
    for (auto p : op_attrs) {
      if (std::string(p.first).find("layout") != std::string::npos) {
        metrics[p.first] = p.second;
      }
    }
    auto it = op_attrs.find("hash");
    if (it != op_attrs.end()) {
      metrics["Hash"] = Downcast<String>((*it).second);
    }
    metrics["Argument Shapes"] = profiling::ShapeString(shapes);

    prof_->StartCall(packed_index_map_[packed_index], dev, metrics);
  }
  VirtualMachine::InvokePacked(packed_index, func, arg_count, output_size, args);
  if (prof_ && prof_->IsRunning()) {
    prof_->StopCall();
  }
}

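/*! \brief Create a profiling-enabled VM module backed by the given executable. */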
runtime::Module CreateVirtualMachineDebug(Executable* exec) {
  auto vm = make_object<VirtualMachineDebug>();
  vm->LoadExecutable(GetObjectPtr<Executable>(exec));
  return runtime::Module(vm);
}

TVM_REGISTER_GLOBAL("runtime._VirtualMachineDebug").set_body([](TVMArgs args, TVMRetValue* rv) {
  runtime::Module mod = args[0];
  auto* exec = dynamic_cast<Executable*>(mod.operator->());
  ICHECK(exec) << "Expected argument 0 to be a vm Executable module";
  *rv = CreateVirtualMachineDebug(exec);
});
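
// Example: driving the profiler from C++ (a minimal sketch; assumes `exec` is
// an Executable* produced by compilation, that the VM has been initialized via
// its "init" PackedFunc, and that the executable contains a function "main"):
//
//   runtime::Module vm = CreateVirtualMachineDebug(exec);
//   PackedFunc profile = vm.GetFunction("profile");
//   profiling::Report report =
//       profile(String("main"), Array<profiling::MetricCollector>());
//   LOG(INFO) << report->AsTable();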

}  // namespace vm
}  // namespace runtime
}  // namespace tvm