/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * \file graph_executor_debug.cc
 */
#include "./graph_executor_debug.h"

#include <tvm/runtime/container/string.h>
#include <tvm/runtime/ndarray.h>
#include <tvm/runtime/packed_func.h>
#include <tvm/runtime/profiling.h>
#include <tvm/runtime/registry.h>

#include <chrono>
#include <cmath>
#include <memory>
#include <numeric>
#include <sstream>

#include "../../rpc/rpc_session.h"

namespace tvm {
namespace runtime {
std::string GraphExecutorDebug::RunIndividual(int number, int repeat, int min_repeat_ms,
                                              int limit_zero_time_iterations,
                                              int cooldown_interval_ms, int repeats_to_cooldown) {
  // warmup run
  GraphExecutor::Run();
  std::string tkey = module_->type_key();
  std::vector<std::vector<double>> time_sec_per_op(op_execs_.size());
  if (tkey == "rpc") {
    // RPC modules rely on remote timing, which implements the logic from the else branch.
    for (size_t index = 0; index < op_execs_.size(); ++index) {
      time_sec_per_op[index] =
          RunOpRPC(index, number, repeat, min_repeat_ms, limit_zero_time_iterations,
                   cooldown_interval_ms, repeats_to_cooldown);
    }
  } else {
    int op = 0;
    for (size_t index = 0; index < op_execs_.size(); ++index) {
      std::string result_str =
          RunIndividualNode(index, number, repeat, min_repeat_ms, limit_zero_time_iterations,
                            cooldown_interval_ms, repeats_to_cooldown);
      const double* blob_ptr = reinterpret_cast<const double*>(result_str.data());
      for (int i = 0; i < repeat; ++i, ++blob_ptr) {
        time_sec_per_op[index].push_back(*blob_ptr);
      }
      if (op_execs_[index]) {
        LOG(INFO) << "Op #" << op << " " << GetNodeName(index) << ":";
        for (size_t cur_repeat = 0; cur_repeat < time_sec_per_op[index].size(); cur_repeat++) {
          const auto& data = time_sec_per_op[index][cur_repeat];
          LOG(INFO) << "Iteration: " << cur_repeat << ": " << (data * 1e6) << " us/iter";
        }
        ++op;
      }
    }
  }

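  // Serialize the measurements as a single blob: an int64 op count followed by
  // `repeat` doubles (seconds) per op, in node order.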
  std::ostringstream os;
  int64_t size = time_sec_per_op.size();
  os.write(reinterpret_cast<char*>(&size), sizeof(int64_t));
  for (size_t index = 0; index < time_sec_per_op.size(); ++index) {
    for (auto& repeat_data : time_sec_per_op[index]) {
      // Replace NaN with 0 so that aggregate statistics (total time, etc.) stay well defined.
      double data = std::isnan(repeat_data) ? 0 : repeat_data;
      os.write(reinterpret_cast<char*>(&data), sizeof(double));
    }
  }
  return os.str();
}

std::string GraphExecutorDebug::RunIndividualNode(int node_index, int number, int repeat,
                                                  int min_repeat_ms,
                                                  int limit_zero_time_iterations,
                                                  int cooldown_interval_ms,
                                                  int repeats_to_cooldown) {
  std::string tkey = module_->type_key();

  if (tkey == "rpc") {
    LOG(FATAL) << "RPC measurements should not use RunIndividualNode!";
  }

  if (!op_execs_[node_index]) {
    // This node has no packed function to run (e.g. an input or parameter), so
    // report zero time for every repeat.
    std::ostringstream os;
    double zero = 0;
    for (int i = 0; i < repeat; ++i) {
      os.write(reinterpret_cast<char*>(&zero), sizeof(double));
    }
    return os.str();
  }

  // Assume the op runs on the host, which is the first device.
  Device& d = devices_[0];
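  // WrapTimeEvaluator returns a PackedFunc whose result is a byte blob of `repeat`
  // doubles, one measured time (in seconds) per repeat.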
  PackedFunc time_evaluator = profiling::WrapTimeEvaluator(
      TypedPackedFunc<void()>([this, node_index]() { this->RunOpHost(node_index); }), d, number,
      repeat, min_repeat_ms, limit_zero_time_iterations, cooldown_interval_ms,
      repeats_to_cooldown);
  return time_evaluator();
}

std::vector<double> GraphExecutorDebug::RunOpRPC(int index, int number, int repeat,
                                                 int min_repeat_ms,
                                                 int limit_zero_time_iterations,
                                                 int cooldown_interval_ms,
                                                 int repeats_to_cooldown) {
  std::vector<double> results(repeat, 0);
  // Right now we expect either "tvm_op" for nodes which run PackedFunc or "null" for nodes
  // which represent inputs/parameters to the graph. Other types may be supported in the
  // future, but consideration would be needed as to how to do that over RPC before we support
  // it here.
  if (nodes_[index].op_type != "tvm_op") {
    CHECK_EQ(nodes_[index].op_type, "null")
        << "Don't know how to run op type " << nodes_[index].op_type
        << " remotely over RPC right now";

    // NOTE: GraphExecutorDebug expects graph nodes to have an "op" attribute of "tvm_op" or
    // "null", and "null" is a placeholder node for a parameter or input.
    return results;
  }

  const Device& dev = data_entry_[entry_id(index, 0)]->device;
  TVMOpParam param = nodes_[index].param;
  std::string name = param.func_name;
  uint32_t num_inputs = param.num_inputs;
  uint32_t num_outputs = param.num_outputs;

  PackedFunc time_eval =
      runtime::Registry::Get("runtime.RPCTimeEvaluator")
          ->
          operator()(module_, name, static_cast<int>(dev.device_type), dev.device_id, number,
                     repeat, min_repeat_ms, limit_zero_time_iterations, cooldown_interval_ms,
                     repeats_to_cooldown, "");

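  // Flatten the op's tensors into a single packed-argument list: inputs first, then outputs.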
  int num_flat_args = num_inputs + num_outputs;
  auto values = std::make_unique<TVMValue[]>(num_flat_args);
  auto type_codes = std::make_unique<int[]>(num_flat_args);
  TVMArgsSetter setter(values.get(), type_codes.get());
  int offs = 0;
  const auto& inode = nodes_[index];
  for (const auto& e : inode.inputs) {
    uint32_t eid = this->entry_id(e);
    DLTensor* arg = const_cast<DLTensor*>(data_entry_[eid].operator->());
    setter(offs, arg);
    offs++;
  }
  for (uint32_t i = 0; i < num_outputs; ++i) {
    uint32_t eid = this->entry_id(index, i);
    DLTensor* arg = const_cast<DLTensor*>(data_entry_[eid].operator->());
    setter(offs, arg);
    offs++;
  }
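  // Call the remote time evaluator; it returns a byte blob of `repeat` doubles
  // (seconds per repeat), which is copied into `results`.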
  TVMRetValue rv;
  time_eval.CallPacked(TVMArgs(values.get(), type_codes.get(), num_flat_args), &rv);
  std::string results_str = rv.operator std::string();
  const double* blob_ptr = reinterpret_cast<const double*>(results_str.data());
  for (int i = 0; i < repeat; ++i, ++blob_ptr) {
    results[i] = *blob_ptr;
  }

  std::ostringstream os;
  for (auto& repeat_data : results) {
    os << std::to_string(repeat_data) << ", ";
  }
  LOG(INFO) << "Got op timing: " << os.str();
  return results;
}

Timer GraphExecutorDebug::RunOpHost(int index) {
  const Device& dev = data_entry_[entry_id(index, 0)]->device;
  Timer t = Timer::Start(dev);
  op_execs_[index]();
  t->Stop();
  return t;
}

/*!
 * \brief Get a member packed function by name.
 * \param name The name of the function to be returned.
 * \param sptr_to_self The ObjectPtr that points to this module node.
 */
PackedFunc GraphExecutorDebug::GetFunction(const std::string& name,
                                           const ObjectPtr<Object>& sptr_to_self) {
  // return member functions during query.
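  // "debug_get_output" accepts either a node name (String) or an integer node index
  // as its first argument.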
  if (name == "debug_get_output") {
    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
      if (String::CanConvertFrom(args[0])) {
        this->DebugGetNodeOutput(this->GetNodeIndex(args[0]), args[1]);
      } else {
        this->DebugGetNodeOutput(args[0], args[1]);
      }
    });
  } else if (name == "execute_node") {
    return PackedFunc(
        [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { this->ExecuteNode(args[0]); });
  } else if (name == "get_node_output") {
    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
      *rv = this->GetNodeOutput(args[0], args[1]);
    });
  } else if (name == "run_individual") {
    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
      int number = args[0];
      int repeat = args[1];
      int min_repeat_ms = args[2];
      int limit_zero_time_iterations = args[3];
      int cooldown_interval_ms = args[4];
      int repeats_to_cooldown = args[5];
      ICHECK_GT(number, 0);
      ICHECK_GT(repeat, 0);
      ICHECK_GE(min_repeat_ms, 0);
      ICHECK_GE(limit_zero_time_iterations, 0);
      ICHECK_GE(cooldown_interval_ms, 0);
      ICHECK_GT(repeats_to_cooldown, 0);
      std::string blob =
          this->RunIndividual(number, repeat, min_repeat_ms, limit_zero_time_iterations,
                              cooldown_interval_ms, repeats_to_cooldown);
      TVMByteArray arr;
      arr.size = blob.length();
      arr.data = blob.data();
      *rv = arr;
    });
  } else if (name == "run_individual_node") {
    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
      int node_index = args[0];
      int number = args[1];
      int repeat = args[2];
      int min_repeat_ms = args[3];
      int limit_zero_time_iterations = args[4];
      int cooldown_interval_ms = args[5];
      int repeats_to_cooldown = args[6];
      ICHECK_GE(node_index, 0);
      ICHECK_LT(node_index, nodes_.size());
      ICHECK_GT(number, 0);
      ICHECK_GT(repeat, 0);
      ICHECK_GE(min_repeat_ms, 0);
      ICHECK_GE(limit_zero_time_iterations, 0);
      ICHECK_GE(cooldown_interval_ms, 0);
      ICHECK_GT(repeats_to_cooldown, 0);
      std::string blob = this->RunIndividualNode(node_index, number, repeat, min_repeat_ms,
                                                 limit_zero_time_iterations, cooldown_interval_ms,
                                                 repeats_to_cooldown);
      TVMByteArray arr;
      arr.size = blob.length();
      arr.data = blob.data();
      *rv = arr;
    });
  } else if (name == "profile") {
    return TypedPackedFunc<profiling::Report(Array<profiling::MetricCollector>)>(
        [sptr_to_self, this](Array<profiling::MetricCollector> collectors) {
          // We cannot send Arrays over rpc, so in order to support profiling
          // on remotes, we accept a nullptr for collectors.
          if (collectors.defined()) {
            return this->Profile(collectors);
          } else {
            return this->Profile({});
          }
        });
  } else if (name == "profile_rpc") {
    // We cannot return a Report over RPC because the TVM RPC mechanism only
    // supports a subset of Object classes. Instead we serialize it on the
    // remote (here) and deserialize it on the other end.
    return TypedPackedFunc<std::string()>([sptr_to_self, this]() {
      PackedFunc profile = GetFunction("profile", sptr_to_self);
      profiling::Report report = profile(Array<profiling::MetricCollector>());
      return report->AsJSON();
    });
  } else {
    return GraphExecutor::GetFunction(name, sptr_to_self);
  }
}

int GraphExecutorDebug::GetNodeIndex(const std::string& name) const {
  for (size_t nid = 0; nid < GetNumOfNodes(); ++nid) {
    if (GetNodeName(nid) == name) {
      return static_cast<int>(nid);
    }
  }
  LOG(FATAL) << "cannot find " << name << " among the graph nodes";
  return -1;
}

void GraphExecutorDebug::ExecuteNode(int node) {
  ICHECK_LT(static_cast<size_t>(node), op_execs_.size());

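  // Determine which nodes still need to run: restart from node 0 if `node` precedes
  // the last executed node, continue from the following node if it comes after, and
  // do nothing if it was already the last node executed.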
  int start_ind;
  int end_ind;
  if (node < last_executed_node_) {
    start_ind = 0;
    end_ind = node;
  } else if (node > last_executed_node_) {
    start_ind = last_executed_node_ + 1;
    end_ind = node;
  } else {
    return;
  }

  for (int i = start_ind; i <= end_ind; i++) {
    if (op_execs_[i]) op_execs_[i]();
  }
  last_executed_node_ = end_ind;
}

void GraphExecutorDebug::DebugGetNodeOutput(int index, DLTensor* data_out) {
  ICHECK_LT(static_cast<size_t>(index), op_execs_.size());
  uint32_t eid = index;

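  // Execute every op up to and including `index`, then copy that entry's data out.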
  for (size_t i = 0; i < op_execs_.size(); ++i) {
    if (op_execs_[i]) op_execs_[i]();
    if (static_cast<int>(i) == index) break;
  }

  data_entry_[eid].CopyTo(data_out);
}

NDArray GraphExecutorDebug::GetNodeOutput(int node, int out_ind) {
  ICHECK_EQ(node, last_executed_node_);
  ICHECK_LT(entry_id(node, out_ind), data_entry_.size());
  return data_entry_[entry_id(node, out_ind)].CopyTo({kDLCPU, 0});
}

profiling::Report GraphExecutorDebug::Profile(Array<profiling::MetricCollector> collectors) {
  std::vector<profiling::MetricCollector> cs(collectors.begin(), collectors.end());
  profiling::Profiler prof(devices_, cs, {{String("Executor"), String("Graph")}});

  // Warm up; a single iteration does not seem to be enough to reach a steady state.
  for (int i = 0; i < 3; i++) {
    GraphExecutor::Run();
  }

  prof.Start();
  for (size_t i = 0; i < op_execs_.size(); ++i) {
    if (op_execs_[i]) {
      // get argument shapes
      std::vector<NDArray> shapes;
      for (const auto& e : nodes_[i].inputs) {
        uint32_t eid = entry_id(e);
        shapes.push_back(data_entry_[eid]);
      }
      for (uint32_t j = 0; j < nodes_[i].param.num_outputs; ++j) {
        uint32_t eid = entry_id(i, j);
        shapes.push_back(data_entry_[eid]);
      }

      uint32_t eid = entry_id(i, 0);
      const Device& dev = data_entry_[eid]->device;

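      // Attach per-call metrics: any layout-related attributes, the kernel hash if
      // present, and the argument shapes collected above.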
      std::unordered_map<std::string, ObjectRef> metrics;
      for (auto p : nodes_[i].param.attrs) {
        if (std::string(p.first).find("layout") != std::string::npos) {
          metrics[p.first] = p.second;
        }
      }
      if (nodes_[i].param.attrs.find("hash") != nodes_[i].param.attrs.end()) {
        metrics["Hash"] = Downcast<String>(nodes_[i].param.attrs.at("hash"));
      }
      metrics["Argument Shapes"] = profiling::ShapeString(shapes);
      prof.StartCall(nodes_[i].param.func_name, dev, metrics);
      op_execs_[i]();
      prof.StopCall();
    }
  }
  prof.Stop();
  return prof.Report();
}

/*!
 * \brief Create a GraphExecutorDebug module.
 * \param sym_json The graph symbol in json format.
 * \param m Compiled module which will be loaded.
 * \param devs All devices.
 * \param lookup_linked_param_func An optional function used to look up linked parameters.
 */
Module GraphExecutorDebugCreate(const std::string& sym_json, const tvm::runtime::Module& m,
                                const std::vector<Device>& devs,
                                PackedFunc lookup_linked_param_func) {
  auto exec = make_object<GraphExecutorDebug>();
  exec->Init(sym_json, m, devs, lookup_linked_param_func);
  return Module(exec);
}

TVM_REGISTER_GLOBAL("tvm.graph_executor_debug.create").set_body([](TVMArgs args, TVMRetValue* rv) {
  ICHECK_GE(args.num_args, 4) << "The expected number of arguments for "
                                 "graph_executor_debug.create is at least 4, but it has "
                              << args.num_args;
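  // The third argument may optionally be a PackedFunc used to look up linked
  // parameters; the device arguments start after it.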
  PackedFunc lookup_linked_param_func;
  int dev_start_arg = 2;
  if (args[2].type_code() == kTVMPackedFuncHandle) {
    lookup_linked_param_func = args[2];
    dev_start_arg++;
  }

  *rv = GraphExecutorDebugCreate(args[0], args[1], GetAllDevice(args, dev_start_arg),
                                 lookup_linked_param_func);
});
}  // namespace runtime
}  // namespace tvm