1 | /* |
2 | * Licensed to the Apache Software Foundation (ASF) under one |
3 | * or more contributor license agreements. See the NOTICE file |
4 | * distributed with this work for additional information |
5 | * regarding copyright ownership. The ASF licenses this file |
6 | * to you under the Apache License, Version 2.0 (the |
7 | * "License"); you may not use this file except in compliance |
8 | * with the License. You may obtain a copy of the License at |
9 | * |
10 | * http://www.apache.org/licenses/LICENSE-2.0 |
11 | * |
12 | * Unless required by applicable law or agreed to in writing, |
13 | * software distributed under the License is distributed on an |
14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
15 | * KIND, either express or implied. See the License for the |
16 | * specific language governing permissions and limitations |
17 | * under the License. |
18 | */ |
19 | |
20 | /*! |
21 | * \file graph_executor_debug.cc |
22 | */ |
23 | #include "./graph_executor_debug.h" |
24 | |
25 | #include <tvm/runtime/container/string.h> |
26 | #include <tvm/runtime/ndarray.h> |
27 | #include <tvm/runtime/packed_func.h> |
28 | #include <tvm/runtime/profiling.h> |
29 | #include <tvm/runtime/registry.h> |
30 | |
31 | #include <chrono> |
32 | #include <cmath> |
33 | #include <numeric> |
34 | #include <sstream> |
35 | |
36 | #include "../../rpc/rpc_session.h" |
37 | |
38 | namespace tvm { |
39 | namespace runtime { |
40 | std::string GraphExecutorDebug::RunIndividual(int number, int repeat, int min_repeat_ms, |
41 | int limit_zero_time_iterations, |
42 | int cooldown_interval_ms, int repeats_to_cooldown) { |
43 | // warmup run |
44 | GraphExecutor::Run(); |
45 | std::string tkey = module_->type_key(); |
46 | std::vector<std::vector<double>> time_sec_per_op(op_execs_.size()); |
47 | if (tkey == "rpc" ) { |
48 | // RPC modules rely on remote timing which implements the logic from the else branch. |
49 | for (size_t index = 0; index < op_execs_.size(); ++index) { |
50 | time_sec_per_op[index] = |
51 | RunOpRPC(index, number, repeat, min_repeat_ms, limit_zero_time_iterations, |
52 | cooldown_interval_ms, repeats_to_cooldown); |
53 | } |
54 | } else { |
55 | int op = 0; |
56 | for (size_t index = 0; index < op_execs_.size(); ++index) { |
57 | std::string result_str = |
58 | RunIndividualNode(index, number, repeat, min_repeat_ms, limit_zero_time_iterations, |
59 | cooldown_interval_ms, repeats_to_cooldown); |
60 | const double* blob_ptr = reinterpret_cast<const double*>(result_str.data()); |
61 | for (int i = 0; i < repeat; ++i, ++blob_ptr) { |
62 | time_sec_per_op[index].push_back(*blob_ptr); |
63 | } |
64 | if (op_execs_[index]) { |
65 | LOG(INFO) << "Op #" << op << " " << GetNodeName(index) << ":" ; |
66 | for (size_t cur_repeat = 0; cur_repeat < time_sec_per_op[index].size(); cur_repeat++) { |
67 | const auto& data = time_sec_per_op[index][cur_repeat]; |
68 | LOG(INFO) << "Iteration: " << cur_repeat << ": " << (data * 1e6) << " us/iter" ; |
69 | } |
70 | ++op; |
71 | } |
72 | } |
73 | } |
74 | |
75 | std::ostringstream os; |
76 | int64_t size = time_sec_per_op.size(); |
77 | os.write(reinterpret_cast<char*>(&size), sizeof(int64_t)); |
78 | for (size_t index = 0; index < time_sec_per_op.size(); ++index) { |
79 | for (auto& repeat_data : time_sec_per_op[index]) { |
80 | // To have good behavior when calculating total time, etc. |
81 | double data = std::isnan(repeat_data) ? 0 : repeat_data; |
82 | os.write(reinterpret_cast<char*>(&data), sizeof(double)); |
83 | } |
84 | } |
85 | return os.str(); |
86 | } |
87 | |
88 | std::string GraphExecutorDebug::RunIndividualNode(int node_index, int number, int repeat, |
89 | int min_repeat_ms, int limit_zero_time_iterations, |
90 | int cooldown_interval_ms, |
91 | int repeats_to_cooldown) { |
92 | std::string tkey = module_->type_key(); |
93 | |
94 | if (tkey == "rpc" ) { |
95 | LOG(FATAL) << "RPC measurements should not use RunIndividualNode!" ; |
96 | } |
97 | |
98 | if (!op_execs_[node_index]) { |
99 | // don't return anything... |
100 | std::ostringstream os; |
101 | double zero = 0; |
102 | for (int i = 0; i < repeat; ++i) { |
103 | os.write(reinterpret_cast<char*>(&zero), sizeof(double)); |
104 | } |
105 | return os.str(); |
106 | } |
107 | |
108 | // assume host runs things which is first device |
109 | Device& d = devices_[0]; |
110 | PackedFunc time_evaluator = profiling::WrapTimeEvaluator( |
111 | TypedPackedFunc<void()>([this, node_index]() { this->RunOpHost(node_index); }), d, number, |
112 | repeat, min_repeat_ms, limit_zero_time_iterations, cooldown_interval_ms, repeats_to_cooldown); |
113 | return time_evaluator(); |
114 | } |
115 | |
116 | std::vector<double> GraphExecutorDebug::RunOpRPC(int index, int number, int repeat, |
117 | int min_repeat_ms, int limit_zero_time_iterations, |
118 | int cooldown_interval_ms, |
119 | int repeats_to_cooldown) { |
120 | std::vector<double> results(repeat, 0); |
121 | // Right now we expect either "tvm_op" for nodes which run PackedFunc or "null" for nodes |
122 | // which represent inputs/parameters to the graph. Other types may be supported in the |
123 | // future, but consideration would be needed as to how to do that over RPC before we support |
124 | // it here. |
125 | if (nodes_[index].op_type != "tvm_op" ) { |
126 | CHECK_EQ(nodes_[index].op_type, "null" ) |
127 | << "Don't know how to run op type " << nodes_[index].op_type |
128 | << " remotely over RPC right now" ; |
129 | |
130 | // NOTE: GraphExecutorDebug expects graph nodes to have an "op" attribute of "tvm_op" or |
131 | // "null" and "null" is a placeholder node for a parameter or input. |
132 | return results; |
133 | } |
134 | |
135 | const Device& dev = data_entry_[entry_id(index, 0)]->device; |
136 | TVMOpParam param = nodes_[index].param; |
137 | std::string name = param.func_name; |
138 | uint32_t num_inputs = param.num_inputs; |
139 | uint32_t num_outputs = param.num_outputs; |
140 | |
141 | PackedFunc time_eval = |
142 | runtime::Registry::Get("runtime.RPCTimeEvaluator" ) |
143 | -> |
144 | operator()(module_, name, static_cast<int>(dev.device_type), dev.device_id, number, |
145 | repeat, min_repeat_ms, limit_zero_time_iterations, cooldown_interval_ms, |
146 | repeats_to_cooldown, "" ); |
147 | |
148 | int num_flat_args = num_inputs + num_outputs; |
149 | auto values = std::make_unique<TVMValue[]>(num_flat_args); |
150 | auto type_codes = std::make_unique<int[]>(num_flat_args); |
151 | TVMArgsSetter setter(values.get(), type_codes.get()); |
152 | int offs = 0; |
153 | const auto& inode = nodes_[index]; |
154 | for (const auto& e : inode.inputs) { |
155 | uint32_t eid = this->entry_id(e); |
156 | DLTensor* arg = const_cast<DLTensor*>(data_entry_[eid].operator->()); |
157 | setter(offs, arg); |
158 | offs++; |
159 | } |
160 | for (uint32_t i = 0; i < num_outputs; ++i) { |
161 | uint32_t eid = this->entry_id(index, i); |
162 | DLTensor* arg = const_cast<DLTensor*>(data_entry_[eid].operator->()); |
163 | setter(offs, arg); |
164 | offs++; |
165 | } |
166 | TVMRetValue rv; |
167 | time_eval.CallPacked(TVMArgs(values.get(), type_codes.get(), num_flat_args), &rv); |
168 | std::string results_str = rv.operator std::string(); |
169 | const double* blob_ptr = reinterpret_cast<const double*>(results_str.data()); |
170 | for (int i = 0; i < repeat; ++i, ++blob_ptr) { |
171 | results[i] = *blob_ptr; |
172 | } |
173 | |
174 | std::ostringstream os; |
175 | for (auto& repeat_data : results) { |
176 | os << std::to_string(repeat_data) << ", " ; |
177 | } |
178 | LOG(INFO) << "Got op timing: " << os.str(); |
179 | return results; |
180 | } |
181 | |
182 | Timer GraphExecutorDebug::RunOpHost(int index) { |
183 | const Device& dev = data_entry_[entry_id(index, 0)]->device; |
184 | Timer t = Timer::Start(dev); |
185 | op_execs_[index](); |
186 | t->Stop(); |
187 | return t; |
188 | } |
189 | |
/*!
 * \brief Look up a member function of this module by name.
 * \param name The name of the function to be returned.
 * \param sptr_to_self The ObjectPtr that keeps this module alive while the
 *        returned function is in use.
 */
PackedFunc GraphExecutorDebug::GetFunction(const std::string& name,
                                           const ObjectPtr<Object>& sptr_to_self) {
  // return member functions during query.
  // Each branch returns a PackedFunc closure; sptr_to_self is captured to keep
  // this module alive for as long as the returned function is held.
  if (name == "debug_get_output") {
    // Run up to a node and copy its output; the node can be identified either
    // by name (string arg) or by integer index.
    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
      if (String::CanConvertFrom(args[0])) {
        this->DebugGetNodeOutput(this->GetNodeIndex(args[0]), args[1]);
      } else {
        this->DebugGetNodeOutput(args[0], args[1]);
      }
    });
  } else if (name == "execute_node") {
    // Incrementally execute the graph up to the given node index.
    return PackedFunc(
        [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { this->ExecuteNode(args[0]); });
  } else if (name == "get_node_output") {
    // Fetch output `args[1]` of node `args[0]` (must be the last executed node).
    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
      *rv = this->GetNodeOutput(args[0], args[1]);
    });
  } else if (name == "run_individual") {
    // Time every op individually; returns the serialized per-op timing blob
    // produced by RunIndividual (int64 count + raw doubles).
    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
      int number = args[0];
      int repeat = args[1];
      int min_repeat_ms = args[2];
      int limit_zero_time_iterations = args[3];
      int cooldown_interval_ms = args[4];
      int repeats_to_cooldown = args[5];
      // Validate measurement parameters before running anything.
      ICHECK_GT(number, 0);
      ICHECK_GT(repeat, 0);
      ICHECK_GE(min_repeat_ms, 0);
      ICHECK_GE(limit_zero_time_iterations, 0);
      ICHECK_GE(cooldown_interval_ms, 0);
      ICHECK_GT(repeats_to_cooldown, 0);
      std::string blob =
          this->RunIndividual(number, repeat, min_repeat_ms, limit_zero_time_iterations,
                              cooldown_interval_ms, repeats_to_cooldown);
      // NOTE: the byte array points into `blob`; the RPC layer copies it
      // before this closure returns.
      TVMByteArray arr;
      arr.size = blob.length();
      arr.data = blob.data();
      *rv = arr;
    });
  } else if (name == "run_individual_node") {
    // Time a single node; returns `repeat` raw doubles as a byte array.
    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
      int node_index = args[0];
      int number = args[1];
      int repeat = args[2];
      int min_repeat_ms = args[3];
      int limit_zero_time_iterations = args[4];
      int cooldown_interval_ms = args[5];
      int repeats_to_cooldown = args[6];
      ICHECK_GE(node_index, 0);
      ICHECK_LT(node_index, nodes_.size());
      ICHECK_GT(number, 0);
      ICHECK_GT(repeat, 0);
      ICHECK_GE(min_repeat_ms, 0);
      ICHECK_GE(limit_zero_time_iterations, 0);
      ICHECK_GE(cooldown_interval_ms, 0);
      ICHECK_GT(repeats_to_cooldown, 0);
      std::string blob = this->RunIndividualNode(node_index, number, repeat, min_repeat_ms,
                                                 limit_zero_time_iterations, cooldown_interval_ms,
                                                 repeats_to_cooldown);
      TVMByteArray arr;
      arr.size = blob.length();
      arr.data = blob.data();
      *rv = arr;
    });
  } else if (name == "profile") {
    return TypedPackedFunc<profiling::Report(Array<profiling::MetricCollector>)>(
        [sptr_to_self, this](Array<profiling::MetricCollector> collectors) {
          // We cannot send Arrays over rpc, so in order to support profiling
          // on remotes, we accept a nullptr for collectors.
          if (collectors.defined()) {
            return this->Profile(collectors);
          } else {
            return this->Profile({});
          }
        });
  } else if (name == "profile_rpc") {
    // We cannot return a Report over RPC because TVM's RPC mechanism only
    // supports a subset of Object classes. Instead we serialize it on the
    // remote (here) and deserialize it on the other end.
    return TypedPackedFunc<std::string()>([sptr_to_self, this]() {
      PackedFunc profile = GetFunction("profile", sptr_to_self);
      profiling::Report report = profile(Array<profiling::MetricCollector>());
      return report->AsJSON();
    });
  } else {
    // Fall back to the plain graph executor's functions (run, set_input, ...).
    return GraphExecutor::GetFunction(name, sptr_to_self);
  }
}
284 | |
285 | int GraphExecutorDebug::GetNodeIndex(const std::string& name) const { |
286 | for (size_t nid = 0; nid < GetNumOfNodes(); ++nid) { |
287 | if (GetNodeName(nid) == name) { |
288 | return static_cast<int>(nid); |
289 | } |
290 | } |
291 | LOG(FATAL) << "cannot find " << name << " among nodex" ; |
292 | return -1; |
293 | } |
294 | |
295 | void GraphExecutorDebug::ExecuteNode(int node) { |
296 | ICHECK_LT(static_cast<size_t>(node), op_execs_.size()); |
297 | |
298 | int start_ind; |
299 | int end_ind; |
300 | if (node < last_executed_node_) { |
301 | start_ind = 0; |
302 | end_ind = node; |
303 | } else if (node > last_executed_node_) { |
304 | start_ind = last_executed_node_ + 1; |
305 | end_ind = node; |
306 | } else { |
307 | return; |
308 | } |
309 | |
310 | for (int i = start_ind; i <= end_ind; i++) { |
311 | if (op_execs_[i]) op_execs_[i](); |
312 | } |
313 | last_executed_node_ = end_ind; |
314 | } |
315 | |
316 | void GraphExecutorDebug::DebugGetNodeOutput(int index, DLTensor* data_out) { |
317 | ICHECK_LT(static_cast<size_t>(index), op_execs_.size()); |
318 | uint32_t eid = index; |
319 | |
320 | for (size_t i = 0; i < op_execs_.size(); ++i) { |
321 | if (op_execs_[i]) op_execs_[i](); |
322 | if (static_cast<int>(i) == index) break; |
323 | } |
324 | |
325 | data_entry_[eid].CopyTo(data_out); |
326 | } |
327 | |
328 | NDArray GraphExecutorDebug::GetNodeOutput(int node, int out_ind) { |
329 | ICHECK_EQ(node, last_executed_node_); |
330 | ICHECK_LT(entry_id(node, out_ind), data_entry_.size()); |
331 | return data_entry_[entry_id(node, out_ind)].CopyTo({kDLCPU, 0}); |
332 | } |
333 | |
profiling::Report GraphExecutorDebug::Profile(Array<profiling::MetricCollector> collectors) {
  // Run every op once under the profiler and attach per-op metadata
  // (argument shapes, layout attrs, kernel hash) to each call record.
  std::vector<profiling::MetricCollector> cs(collectors.begin(), collectors.end());
  profiling::Profiler prof(devices_, cs, {{String("Executor"), String("Graph")}});

  // warm up. 1 iteration does not seem enough.
  for (int i = 0; i < 3; i++) {
    GraphExecutor::Run();
  }

  prof.Start();
  for (size_t i = 0; i < op_execs_.size(); ++i) {
    if (op_execs_[i]) {
      // get argument shapes
      std::vector<NDArray> shapes;
      for (const auto& e : nodes_[i].inputs) {
        uint32_t eid = entry_id(e);
        shapes.push_back(data_entry_[eid]);
      }
      for (uint32_t j = 0; j < nodes_[i].param.num_outputs; ++j) {
        uint32_t eid = entry_id(i, j);
        shapes.push_back(data_entry_[eid]);
      }

      // Attribute the call to the device holding the op's first output.
      uint32_t eid = entry_id(i, 0);
      const Device& dev = data_entry_[eid]->device;

      std::unordered_map<std::string, ObjectRef> metrics;
      // Forward any attribute whose key mentions "layout" (e.g. src_layout)
      // as a per-call metric.
      for (auto p : nodes_[i].param.attrs) {
        if (std::string(p.first).find("layout") != std::string::npos) {
          metrics[p.first] = p.second;
        }
      }
      // The kernel hash, when present, identifies the compiled function.
      if (nodes_[i].param.attrs.find("hash") != nodes_[i].param.attrs.end()) {
        metrics["Hash"] = Downcast<String>(nodes_[i].param.attrs.at("hash"));
      }
      metrics["Argument Shapes"] = profiling::ShapeString(shapes);
      prof.StartCall(nodes_[i].param.func_name, dev, metrics);
      op_execs_[i]();
      prof.StopCall();
    }
  }
  prof.Stop();
  return prof.Report();
}
378 | |
/*!
 * \brief Create a GraphExecutorDebug module.
 * \param sym_json The graph symbol in json format.
 * \param m Compiled module which will be loaded.
 * \param devs All devices.
 * \param lookup_linked_param_func Optional PackedFunc used to look up
 *        parameters linked into the module.
 */
385 | Module GraphExecutorDebugCreate(const std::string& sym_json, const tvm::runtime::Module& m, |
386 | const std::vector<Device>& devs, |
387 | PackedFunc lookup_linked_param_func) { |
388 | auto exec = make_object<GraphExecutorDebug>(); |
389 | exec->Init(sym_json, m, devs, lookup_linked_param_func); |
390 | return Module(exec); |
391 | } |
392 | |
393 | TVM_REGISTER_GLOBAL("tvm.graph_executor_debug.create" ).set_body([](TVMArgs args, TVMRetValue* rv) { |
394 | ICHECK_GE(args.num_args, 4) << "The expected number of arguments for graph_executor.create is " |
395 | "at least 4, but it has " |
396 | << args.num_args; |
397 | PackedFunc lookup_linked_param_func; |
398 | int dev_start_arg = 2; |
399 | if (args[2].type_code() == kTVMPackedFuncHandle) { |
400 | lookup_linked_param_func = args[2]; |
401 | dev_start_arg++; |
402 | } |
403 | |
404 | *rv = GraphExecutorDebugCreate(args[0], args[1], GetAllDevice(args, dev_start_arg), |
405 | lookup_linked_param_func); |
406 | }); |
407 | } // namespace runtime |
408 | } // namespace tvm |
409 | |