/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
19 | |
/*!
 * \file src/runtime/vm/profiler/vm.cc
 * \brief The Relay debug virtual machine.
 */
24 | |
25 | #include "vm.h" |
26 | |
27 | #include <tvm/runtime/container/adt.h> |
28 | #include <tvm/runtime/data_type.h> |
29 | #include <tvm/runtime/registry.h> |
30 | |
31 | #include <algorithm> |
32 | #include <chrono> |
33 | #include <iomanip> |
34 | #include <memory> |
35 | #include <numeric> |
36 | #include <sstream> |
37 | #include <string> |
38 | #include <utility> |
39 | #include <vector> |
40 | |
41 | namespace tvm { |
42 | namespace runtime { |
43 | namespace vm { |
44 | |
45 | PackedFunc VirtualMachineDebug::GetFunction(const std::string& name, |
46 | const ObjectPtr<Object>& sptr_to_self) { |
47 | if (name == "profile" ) { |
48 | return TypedPackedFunc<profiling::Report(String, Array<profiling::MetricCollector>)>( |
49 | [sptr_to_self, this](String arg_name, Array<profiling::MetricCollector> collectors) { |
50 | std::vector<Device> devices; |
51 | for (auto dev : devices_) { |
52 | if (dev.device_type > 0) { |
53 | devices.push_back(dev); |
54 | } |
55 | } |
56 | |
57 | // We cannot send Arrays over rpc, so in order to support profiling |
58 | // on remotes, we accept a nullptr for collectors. |
59 | if (collectors.defined()) { |
60 | std::vector<profiling::MetricCollector> cs(collectors.begin(), collectors.end()); |
61 | prof_ = profiling::Profiler(devices, cs, {{String("Executor" ), String("VM" )}}); |
62 | } else { |
63 | prof_ = profiling::Profiler(devices, {}, {{String("Executor" ), String("VM" )}}); |
64 | } |
65 | |
66 | auto invoke = VirtualMachine::GetFunction("invoke" , sptr_to_self); |
67 | // warmup |
68 | for (int i = 0; i < 3; i++) { |
69 | invoke(arg_name); |
70 | } |
71 | |
72 | prof_.operator*().Start(); |
73 | invoke(arg_name); |
74 | prof_.operator*().Stop(); |
75 | auto report = prof_.operator*().Report(); |
76 | prof_ = std::nullopt; // releases hardware counters |
77 | return report; |
78 | }); |
79 | } else if (name == "profile_rpc" ) { |
80 | // We cannot return a Report over RPC because TVM RPC mechanism only |
81 | // supports a subset of Object classes. Instead we serialize it on the |
82 | // remote (here) and deserialize it on the other end. |
83 | return TypedPackedFunc<std::string(std::string)>([sptr_to_self, this](std::string arg_name) { |
84 | PackedFunc profile = GetFunction("profile" , sptr_to_self); |
85 | profiling::Report report = profile(arg_name, Array<profiling::MetricCollector>()); |
86 | return report->AsJSON(); |
87 | }); |
88 | } else { |
89 | return VirtualMachine::GetFunction(name, sptr_to_self); |
90 | } |
91 | } |
92 | |
93 | void VirtualMachineDebug::LoadExecutable(const ObjectPtr<Executable>& exec) { |
94 | VirtualMachine::LoadExecutable(exec); |
95 | for (auto kv : exec_->primitive_map) { |
96 | packed_index_map_[kv.second] = kv.first; |
97 | } |
98 | } |
99 | |
100 | void VirtualMachineDebug::OpStartHook(Instruction instr) { |
101 | if (prof_ && prof_.operator*().IsRunning()) { |
102 | if (instr.op == Opcode::LoadConst) { |
103 | Device dev = GetDevice(exec_->const_device_indexes[instr.const_index]); |
104 | prof_.operator*().StartCall("VM::LoadConst" , dev, {}); |
105 | } else if (instr.op == Opcode::DeviceCopy) { |
106 | Device dst_dev = GetDevice(instr.device_copy.dst_device_index); |
107 | prof_.operator*().StartCall("VM::DeviceCopy" , dst_dev, {}); |
108 | } else if (instr.op == Opcode::ReshapeTensor) { |
109 | prof_.operator*().StartCall("VM::ReshapeTensor" , devices_[exec_->host_device_index], {}); |
110 | } else if (instr.op == Opcode::AllocTensor) { |
111 | auto shape = std::vector<int64_t>(instr.alloc_tensor.ndim); |
112 | |
113 | for (uint32_t i = 0; i < instr.alloc_tensor.ndim; ++i) { |
114 | shape[i] = instr.alloc_tensor.shape[i]; |
115 | } |
116 | auto storage_obj = ReadRegister(instr.alloc_tensor.storage); |
117 | auto storage = Downcast<Storage>(storage_obj); |
118 | prof_.operator*().StartCall( |
119 | "VM::AllocTensor" , storage->buffer.device, |
120 | {{"Argument Shapes" , profiling::ShapeString(shape, instr.alloc_tensor.dtype)}}); |
121 | } else if (instr.op == Opcode::AllocTensorReg) { |
122 | auto storage_obj = ReadRegister(instr.alloc_tensor_reg.storage); |
123 | auto storage = Downcast<Storage>(storage_obj); |
124 | Device cpu_dev = GetDevice(exec_->host_device_index); |
125 | auto shape_obj = ReadRegister(instr.alloc_tensor_reg.shape_register); |
126 | NDArray shape_tensor = Downcast<NDArray>(shape_obj).CopyTo(cpu_dev); |
127 | prof_.operator*().StartCall( |
128 | "VM::AllocTensorReg" , storage->buffer.device, |
129 | {{"Argument Shapes" , |
130 | profiling::ShapeString(shape_tensor, instr.alloc_tensor_reg.dtype)}}); |
131 | } else if (instr.op == Opcode::AllocStorage) { |
132 | auto size = LoadScalarInt(instr.alloc_storage.allocation_size); |
133 | std::ostringstream shape; |
134 | shape << DLDataType2String(instr.alloc_storage.dtype_hint) << "[" << size << "]" ; |
135 | Device dev = GetDevice(instr.alloc_storage.device_index); |
136 | prof_.operator*().StartCall("VM::AllocStorage" , dev, |
137 | {{"VM::Argument Shapes" , String(shape.str())}}); |
138 | } else { |
139 | prof_.operator*().StartCall("VM::UnknownOp" , GetDevice(exec_->host_device_index), {}); |
140 | } |
141 | } |
142 | } |
143 | |
144 | void VirtualMachineDebug::OpStopHook() { |
145 | if (prof_ && prof_.operator*().IsRunning()) { |
146 | prof_.operator*().StopCall(); |
147 | } |
148 | } |
149 | |
150 | void VirtualMachineDebug::InvokePacked(Index packed_index, const PackedFunc& func, Index arg_count, |
151 | Index output_size, const std::vector<ObjectRef>& args) { |
152 | ICHECK(exec_); |
153 | ICHECK(!devices_.empty()) << "Device has not been initialized yet." ; |
154 | if (prof_ && prof_.operator*().IsRunning()) { |
155 | // The device of any input of the operator is used for synchronization. |
156 | ICHECK_GT(arg_count, 0U); |
157 | ObjectRef arg = args[0]; |
158 | while (arg->IsInstance<ADTObj>()) { |
159 | ADT adt = Downcast<ADT>(arg); |
160 | arg = adt[0]; |
161 | } |
162 | ICHECK(arg->IsInstance<NDArray::ContainerType>()); |
163 | auto nd_array = Downcast<NDArray>(arg); |
164 | auto dev = nd_array->device; |
165 | |
166 | // get argument sizes |
167 | std::vector<NDArray> shapes; |
168 | for (Index i = 0; i < arg_count; i++) { |
169 | if (const auto* obj = args[i].as<ADTObj>()) { |
170 | for (size_t fi = 0; fi < obj->size; ++fi) { |
171 | auto o = (*obj)[fi]; |
172 | shapes.push_back(Downcast<NDArray>(o)); |
173 | } |
174 | } else { |
175 | shapes.push_back(Downcast<NDArray>(args[i])); |
176 | } |
177 | } |
178 | |
179 | std::unordered_map<std::string, ObjectRef> metrics; |
180 | |
181 | ICHECK(exec_->op_attrs.find(packed_index) != exec_->op_attrs.end()) |
182 | << packed_index_map_[packed_index] << " not found in op attrs" ; |
183 | |
184 | auto& op_attrs = exec_->op_attrs.at(packed_index); |
185 | for (auto p : op_attrs) { |
186 | if (std::string(p.first).find("layout" ) != std::string::npos) { |
187 | metrics[p.first] = p.second; |
188 | } |
189 | } |
190 | auto it = op_attrs.find("hash" ); |
191 | if (it != op_attrs.end()) { |
192 | metrics["Hash" ] = Downcast<String>((*it).second); |
193 | } |
194 | metrics["Argument Shapes" ] = profiling::ShapeString(shapes); |
195 | |
196 | prof_.operator*().StartCall(packed_index_map_[packed_index], dev, metrics); |
197 | } |
198 | VirtualMachine::InvokePacked(packed_index, func, arg_count, output_size, args); |
199 | if (prof_ && prof_.operator*().IsRunning()) { |
200 | prof_.operator*().StopCall(); |
201 | } |
202 | } |
203 | |
204 | runtime::Module CreateVirtualMachineDebug(Executable* exec) { |
205 | auto vm = make_object<VirtualMachineDebug>(); |
206 | vm->LoadExecutable(GetObjectPtr<Executable>(exec)); |
207 | return runtime::Module(vm); |
208 | } |
209 | |
210 | TVM_REGISTER_GLOBAL("runtime._VirtualMachineDebug" ).set_body([](TVMArgs args, TVMRetValue* rv) { |
211 | runtime::Module mod = args[0]; |
212 | auto* exec = dynamic_cast<Executable*>(mod.operator->()); |
213 | *rv = CreateVirtualMachineDebug(exec); |
214 | }); |
215 | |
216 | } // namespace vm |
217 | } // namespace runtime |
218 | } // namespace tvm |
219 | |