/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * \file graph_executor_debug.cc
 */
#include "./graph_executor_debug.h"

#include <tvm/runtime/container/string.h>
#include <tvm/runtime/ndarray.h>
#include <tvm/runtime/packed_func.h>
#include <tvm/runtime/profiling.h>
#include <tvm/runtime/registry.h>

#include <chrono>
#include <cmath>
#include <memory>
#include <numeric>
#include <sstream>

#include "../../rpc/rpc_session.h"

namespace tvm {
namespace runtime {
std::string GraphExecutorDebug::RunIndividual(int number, int repeat, int min_repeat_ms,
                                              int limit_zero_time_iterations,
                                              int cooldown_interval_ms, int repeats_to_cooldown) {
  // warmup run
  GraphExecutor::Run();
  std::string tkey = module_->type_key();
  std::vector<std::vector<double>> time_sec_per_op(op_execs_.size());
  if (tkey == "rpc") {
    // RPC modules rely on remote timing, which implements the logic from the else branch.
    for (size_t index = 0; index < op_execs_.size(); ++index) {
      time_sec_per_op[index] =
          RunOpRPC(index, number, repeat, min_repeat_ms, limit_zero_time_iterations,
                   cooldown_interval_ms, repeats_to_cooldown);
    }
  } else {
    int op = 0;
    for (size_t index = 0; index < op_execs_.size(); ++index) {
      std::string result_str =
          RunIndividualNode(index, number, repeat, min_repeat_ms, limit_zero_time_iterations,
                            cooldown_interval_ms, repeats_to_cooldown);
      const double* blob_ptr = reinterpret_cast<const double*>(result_str.data());
      for (int i = 0; i < repeat; ++i, ++blob_ptr) {
        time_sec_per_op[index].push_back(*blob_ptr);
      }
      if (op_execs_[index]) {
        LOG(INFO) << "Op #" << op << " " << GetNodeName(index) << ":";
        for (size_t cur_repeat = 0; cur_repeat < time_sec_per_op[index].size(); cur_repeat++) {
          const auto& data = time_sec_per_op[index][cur_repeat];
          LOG(INFO) << "Iteration: " << cur_repeat << ": " << (data * 1e6) << " us/iter";
        }
        ++op;
      }
    }
  }

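  // Serialize the measurements as a single blob: an int64 op count followed by
  // `repeat` doubles (seconds) per op, in node order.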
  std::ostringstream os;
  int64_t size = time_sec_per_op.size();
  os.write(reinterpret_cast<char*>(&size), sizeof(int64_t));
  for (size_t index = 0; index < time_sec_per_op.size(); ++index) {
    for (auto& repeat_data : time_sec_per_op[index]) {
      // Replace NaN with 0 so that aggregate statistics (total time, etc.) stay well defined.
      double data = std::isnan(repeat_data) ? 0 : repeat_data;
      os.write(reinterpret_cast<char*>(&data), sizeof(double));
    }
  }
  return os.str();
}

std::string GraphExecutorDebug::RunIndividualNode(int node_index, int number, int repeat,
                                                  int min_repeat_ms,
                                                  int limit_zero_time_iterations,
                                                  int cooldown_interval_ms,
                                                  int repeats_to_cooldown) {
  std::string tkey = module_->type_key();

  if (tkey == "rpc") {
    LOG(FATAL) << "RPC measurements should not use RunIndividualNode!";
  }

  if (!op_execs_[node_index]) {
    // This node has no packed function to run (e.g. an input or parameter), so
    // report zero time for every repeat.
    std::ostringstream os;
    double zero = 0;
    for (int i = 0; i < repeat; ++i) {
      os.write(reinterpret_cast<char*>(&zero), sizeof(double));
    }
    return os.str();
  }

  // Assume the op runs on the host, which is the first device.
  Device& d = devices_[0];
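  // WrapTimeEvaluator returns a PackedFunc whose result is a byte blob of `repeat`
  // doubles, one measured time (in seconds) per repeat.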
  PackedFunc time_evaluator = profiling::WrapTimeEvaluator(
      TypedPackedFunc<void()>([this, node_index]() { this->RunOpHost(node_index); }), d, number,
      repeat, min_repeat_ms, limit_zero_time_iterations, cooldown_interval_ms,
      repeats_to_cooldown);
  return time_evaluator();
}

std::vector<double> GraphExecutorDebug::RunOpRPC(int index, int number, int repeat,
                                                 int min_repeat_ms,
                                                 int limit_zero_time_iterations,
                                                 int cooldown_interval_ms,
                                                 int repeats_to_cooldown) {
  std::vector<double> results(repeat, 0);
  // Right now we expect either "tvm_op" for nodes which run PackedFunc or "null" for nodes
  // which represent inputs/parameters to the graph. Other types may be supported in the
  // future, but consideration would be needed as to how to do that over RPC before we support
  // it here.
  if (nodes_[index].op_type != "tvm_op") {
    CHECK_EQ(nodes_[index].op_type, "null")
        << "Don't know how to run op type " << nodes_[index].op_type
        << " remotely over RPC right now";

    // NOTE: GraphExecutorDebug expects graph nodes to have an "op" attribute of "tvm_op" or
    // "null", and "null" is a placeholder node for a parameter or input.
    return results;
  }

  const Device& dev = data_entry_[entry_id(index, 0)]->device;
  TVMOpParam param = nodes_[index].param;
  std::string name = param.func_name;
  uint32_t num_inputs = param.num_inputs;
  uint32_t num_outputs = param.num_outputs;

  PackedFunc time_eval =
      runtime::Registry::Get("runtime.RPCTimeEvaluator")
          ->
          operator()(module_, name, static_cast<int>(dev.device_type), dev.device_id, number,
                     repeat, min_repeat_ms, limit_zero_time_iterations, cooldown_interval_ms,
                     repeats_to_cooldown, "");

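  // Flatten the op's tensors into a single packed-argument list: inputs first, then outputs.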
  int num_flat_args = num_inputs + num_outputs;
  auto values = std::make_unique<TVMValue[]>(num_flat_args);
  auto type_codes = std::make_unique<int[]>(num_flat_args);
  TVMArgsSetter setter(values.get(), type_codes.get());
  int offs = 0;
  const auto& inode = nodes_[index];
  for (const auto& e : inode.inputs) {
    uint32_t eid = this->entry_id(e);
    DLTensor* arg = const_cast<DLTensor*>(data_entry_[eid].operator->());
    setter(offs, arg);
    offs++;
  }
  for (uint32_t i = 0; i < num_outputs; ++i) {
    uint32_t eid = this->entry_id(index, i);
    DLTensor* arg = const_cast<DLTensor*>(data_entry_[eid].operator->());
    setter(offs, arg);
    offs++;
  }
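  // Call the remote time evaluator; it returns a byte blob of `repeat` doubles
  // (seconds per repeat), which is copied into `results`.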
  TVMRetValue rv;
  time_eval.CallPacked(TVMArgs(values.get(), type_codes.get(), num_flat_args), &rv);
  std::string results_str = rv.operator std::string();
  const double* blob_ptr = reinterpret_cast<const double*>(results_str.data());
  for (int i = 0; i < repeat; ++i, ++blob_ptr) {
    results[i] = *blob_ptr;
  }

  std::ostringstream os;
  for (auto& repeat_data : results) {
    os << std::to_string(repeat_data) << ", ";
  }
  LOG(INFO) << "Got op timing: " << os.str();
  return results;
}

Timer GraphExecutorDebug::RunOpHost(int index) {
  const Device& dev = data_entry_[entry_id(index, 0)]->device;
  Timer t = Timer::Start(dev);
  op_execs_[index]();
  t->Stop();
  return t;
}

/*!
 * \brief Get a member packed function by name.
 * \param name The name of the function to be returned.
 * \param sptr_to_self The ObjectPtr that points to this module node.
 */
PackedFunc GraphExecutorDebug::GetFunction(const std::string& name,
                                           const ObjectPtr<Object>& sptr_to_self) {
  // return member functions during query.
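  // "debug_get_output" accepts either a node name (String) or an integer node index
  // as its first argument.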
  if (name == "debug_get_output") {
    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
      if (String::CanConvertFrom(args[0])) {
        this->DebugGetNodeOutput(this->GetNodeIndex(args[0]), args[1]);
      } else {
        this->DebugGetNodeOutput(args[0], args[1]);
      }
    });
  } else if (name == "execute_node") {
    return PackedFunc(
        [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { this->ExecuteNode(args[0]); });
  } else if (name == "get_node_output") {
    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
      *rv = this->GetNodeOutput(args[0], args[1]);
    });
  } else if (name == "run_individual") {
    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
      int number = args[0];
      int repeat = args[1];
      int min_repeat_ms = args[2];
      int limit_zero_time_iterations = args[3];
      int cooldown_interval_ms = args[4];
      int repeats_to_cooldown = args[5];
      ICHECK_GT(number, 0);
      ICHECK_GT(repeat, 0);
      ICHECK_GE(min_repeat_ms, 0);
      ICHECK_GE(limit_zero_time_iterations, 0);
      ICHECK_GE(cooldown_interval_ms, 0);
      ICHECK_GT(repeats_to_cooldown, 0);
      std::string blob =
          this->RunIndividual(number, repeat, min_repeat_ms, limit_zero_time_iterations,
                              cooldown_interval_ms, repeats_to_cooldown);
      TVMByteArray arr;
      arr.size = blob.length();
      arr.data = blob.data();
      *rv = arr;
    });
  } else if (name == "run_individual_node") {
    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
      int node_index = args[0];
      int number = args[1];
      int repeat = args[2];
      int min_repeat_ms = args[3];
      int limit_zero_time_iterations = args[4];
      int cooldown_interval_ms = args[5];
      int repeats_to_cooldown = args[6];
      ICHECK_GE(node_index, 0);
      ICHECK_LT(node_index, nodes_.size());
      ICHECK_GT(number, 0);
      ICHECK_GT(repeat, 0);
      ICHECK_GE(min_repeat_ms, 0);
      ICHECK_GE(limit_zero_time_iterations, 0);
      ICHECK_GE(cooldown_interval_ms, 0);
      ICHECK_GT(repeats_to_cooldown, 0);
      std::string blob = this->RunIndividualNode(node_index, number, repeat, min_repeat_ms,
                                                 limit_zero_time_iterations, cooldown_interval_ms,
                                                 repeats_to_cooldown);
      TVMByteArray arr;
      arr.size = blob.length();
      arr.data = blob.data();
      *rv = arr;
    });
  } else if (name == "profile") {
    return TypedPackedFunc<profiling::Report(Array<profiling::MetricCollector>)>(
        [sptr_to_self, this](Array<profiling::MetricCollector> collectors) {
          // We cannot send Arrays over rpc, so in order to support profiling
          // on remotes, we accept a nullptr for collectors.
          if (collectors.defined()) {
            return this->Profile(collectors);
          } else {
            return this->Profile({});
          }
        });
  } else if (name == "profile_rpc") {
    // We cannot return a Report over RPC because the TVM RPC mechanism only
    // supports a subset of Object classes. Instead we serialize it on the
    // remote (here) and deserialize it on the other end.
    return TypedPackedFunc<std::string()>([sptr_to_self, this]() {
      PackedFunc profile = GetFunction("profile", sptr_to_self);
      profiling::Report report = profile(Array<profiling::MetricCollector>());
      return report->AsJSON();
    });
  } else {
    return GraphExecutor::GetFunction(name, sptr_to_self);
  }
}

int GraphExecutorDebug::GetNodeIndex(const std::string& name) const {
  for (size_t nid = 0; nid < GetNumOfNodes(); ++nid) {
    if (GetNodeName(nid) == name) {
      return static_cast<int>(nid);
    }
  }
  LOG(FATAL) << "cannot find " << name << " among the graph nodes";
  return -1;
}

void GraphExecutorDebug::ExecuteNode(int node) {
  ICHECK_LT(static_cast<size_t>(node), op_execs_.size());

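  // Determine which nodes still need to run: restart from node 0 if `node` precedes
  // the last executed node, continue from the following node if it comes after, and
  // do nothing if it was already the last node executed.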
  int start_ind;
  int end_ind;
  if (node < last_executed_node_) {
    start_ind = 0;
    end_ind = node;
  } else if (node > last_executed_node_) {
    start_ind = last_executed_node_ + 1;
    end_ind = node;
  } else {
    return;
  }

  for (int i = start_ind; i <= end_ind; i++) {
    if (op_execs_[i]) op_execs_[i]();
  }
  last_executed_node_ = end_ind;
}

void GraphExecutorDebug::DebugGetNodeOutput(int index, DLTensor* data_out) {
  ICHECK_LT(static_cast<size_t>(index), op_execs_.size());
  uint32_t eid = index;

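  // Execute every op up to and including `index`, then copy that entry's data out.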
  for (size_t i = 0; i < op_execs_.size(); ++i) {
    if (op_execs_[i]) op_execs_[i]();
    if (static_cast<int>(i) == index) break;
  }

  data_entry_[eid].CopyTo(data_out);
}

NDArray GraphExecutorDebug::GetNodeOutput(int node, int out_ind) {
  ICHECK_EQ(node, last_executed_node_);
  ICHECK_LT(entry_id(node, out_ind), data_entry_.size());
  return data_entry_[entry_id(node, out_ind)].CopyTo({kDLCPU, 0});
}

profiling::Report GraphExecutorDebug::Profile(Array<profiling::MetricCollector> collectors) {
  std::vector<profiling::MetricCollector> cs(collectors.begin(), collectors.end());
  profiling::Profiler prof(devices_, cs, {{String("Executor"), String("Graph")}});

  // Warm up; a single iteration does not seem to be enough to reach a steady state.
  for (int i = 0; i < 3; i++) {
    GraphExecutor::Run();
  }

  prof.Start();
  for (size_t i = 0; i < op_execs_.size(); ++i) {
    if (op_execs_[i]) {
      // get argument shapes
      std::vector<NDArray> shapes;
      for (const auto& e : nodes_[i].inputs) {
        uint32_t eid = entry_id(e);
        shapes.push_back(data_entry_[eid]);
      }
      for (uint32_t j = 0; j < nodes_[i].param.num_outputs; ++j) {
        uint32_t eid = entry_id(i, j);
        shapes.push_back(data_entry_[eid]);
      }

      uint32_t eid = entry_id(i, 0);
      const Device& dev = data_entry_[eid]->device;

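      // Attach per-call metrics: any layout-related attributes, the kernel hash if
      // present, and the argument shapes collected above.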
      std::unordered_map<std::string, ObjectRef> metrics;
      for (auto p : nodes_[i].param.attrs) {
        if (std::string(p.first).find("layout") != std::string::npos) {
          metrics[p.first] = p.second;
        }
      }
      if (nodes_[i].param.attrs.find("hash") != nodes_[i].param.attrs.end()) {
        metrics["Hash"] = Downcast<String>(nodes_[i].param.attrs.at("hash"));
      }
      metrics["Argument Shapes"] = profiling::ShapeString(shapes);
      prof.StartCall(nodes_[i].param.func_name, dev, metrics);
      op_execs_[i]();
      prof.StopCall();
    }
  }
  prof.Stop();
  return prof.Report();
}

/*!
 * \brief Create a GraphExecutorDebug module.
 * \param sym_json The graph symbol in json format.
 * \param m Compiled module which will be loaded.
 * \param devs All devices.
 * \param lookup_linked_param_func An optional function used to look up linked parameters.
 */
Module GraphExecutorDebugCreate(const std::string& sym_json, const tvm::runtime::Module& m,
                                const std::vector<Device>& devs,
                                PackedFunc lookup_linked_param_func) {
  auto exec = make_object<GraphExecutorDebug>();
  exec->Init(sym_json, m, devs, lookup_linked_param_func);
  return Module(exec);
}

TVM_REGISTER_GLOBAL("tvm.graph_executor_debug.create").set_body([](TVMArgs args, TVMRetValue* rv) {
  ICHECK_GE(args.num_args, 4) << "The expected number of arguments for "
                                 "graph_executor_debug.create is at least 4, but it has "
                              << args.num_args;
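  // The third argument may optionally be a PackedFunc used to look up linked
  // parameters; the device arguments start after it.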
  PackedFunc lookup_linked_param_func;
  int dev_start_arg = 2;
  if (args[2].type_code() == kTVMPackedFuncHandle) {
    lookup_linked_param_func = args[2];
    dev_start_arg++;
  }

  *rv = GraphExecutorDebugCreate(args[0], args[1], GetAllDevice(args, dev_start_arg),
                                 lookup_linked_param_func);
});
}  // namespace runtime
}  // namespace tvm