#include <torch/csrc/jit/runtime/static/impl.h>

#include <ATen/MemoryOverlap.h>
#include <ATen/core/symbol.h>
#include <ATen/record_function.h>
#include <c10/core/CPUAllocator.h>
#include <c10/core/InferenceMode.h>
#include <c10/macros/Macros.h>
#include <c10/util/MaybeOwned.h>
#include <c10/util/irange.h>
#include <caffe2/core/scope_guard.h>
#include <caffe2/core/timer.h>
#include <torch/csrc/jit/ir/alias_analysis.h>
#include <torch/csrc/jit/jit_log.h>
#include <torch/csrc/jit/passes/add_if_then_else.h>
#include <torch/csrc/jit/passes/canonicalize.h>
#include <torch/csrc/jit/passes/dead_code_elimination.h>
#include <torch/csrc/jit/passes/eliminate_no_ops.h>
#include <torch/csrc/jit/passes/freeze_module.h>
#include <torch/csrc/jit/passes/remove_mutation.h>
#include <torch/csrc/jit/passes/subgraph_rewrite.h>
#include <torch/csrc/jit/passes/variadic_ops.h>
#include <torch/csrc/jit/runtime/graph_iterator.h>
#include <torch/csrc/jit/runtime/static/fusion.h>
#include <torch/csrc/jit/runtime/static/memory_planner.h>
#include <torch/csrc/jit/runtime/static/ops.h>
#include <torch/csrc/jit/runtime/static/passes.h>
#include <torch/csrc/jit/runtime/vararg_functions.h>
#include <algorithm>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/clone_native.h>
#endif

#include <iterator>
#include <limits>
#include <sstream>
#include <stdexcept>

#ifdef FBCODE_CAFFE2
#include <common/logging/logging.h>
#include <folly/dynamic.h>
#include <folly/json.h>
#endif

// used in test only
C10_DEFINE_bool(
    static_runtime_disable_debug_memory_overlap_check,
    false,
    "If true, disable the memory overlap check in debug mode in ProcessedNode::run()");

namespace torch {
namespace jit {

namespace {

bool allArgsAreTensors(const Node* node) {
  const auto& inputs = node->inputs();
  return std::all_of(inputs.begin(), inputs.end(), [](const Value* value) {
    return value->type()->kind() == TypeKind::TensorType;
  });
}

} // namespace

// A manually curated set of ops that are disallowed in static runtime.
// These are rarely-used ops. Disallowing them typically eliminates
// corner cases in graph optimizations, allowing for more aggressive
// optimizations and better performance.
bool isUnsupportedOp(const Node* node) {
  auto kind = node->kind();
  if (kind != aten::__is__ && kind != aten::__isnot__) {
    return false;
  }

  // We can't support aten::__is__ (and __isnot__) with tensor arguments.
  // Consider the following graph:
  //   def forward(x):
  //       y = x.detach()
  //       return x is y
  // We have a graph optimization that removes the `detach` node since it is
  // a no-op during inference. But this affects the result - we get true
  // instead of false! There are many other graph passes affected by this
  // issue.
  return allArgsAreTensors(node);
}

namespace {

bool canEnableStaticRuntimeImpl(const Block* block) {
  if (block == nullptr) {
    return false;
  }

  bool can_support = true;
  for (auto* node : block->nodes()) {
    for (auto* subblock : node->blocks()) {
      // The ordering prevents && from short circuiting, which we want -
      // it's useful to see *all* the unsupported ops.
      can_support = canEnableStaticRuntimeImpl(subblock) && can_support;
    }

    const auto kind = node->kind();
    if (kind == prim::Constant) {
      continue;
    }
    // check if can get op from Node
    const Operator* op = node->maybeOperator();
    if (isUnsupportedOp(node) || (!op && !nativeOpIsRegistered(kind))) {
      can_support = false;
      LOG(WARNING) << "Found unsupported op: " << kind.toQualString();
    }
  }
  return can_support;
}

} // namespace

// Graph must be frozen. canEnableStaticRuntime will return false
// if there's any prim::CallMethod ops left in the graph.
bool canEnableStaticRuntime(const std::shared_ptr<torch::jit::Graph>& graph) {
  return canEnableStaticRuntimeImpl(graph->block());
}

namespace {

auto sr_metadata_registerer = torch::class_<StaticRuntimeMetadata>(
    "StaticRuntime",
    "StaticRuntimeMetadata");

} // namespace

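// Debugging helper. Renders a set of IR values by debug name, e.g.
// "managed_tensor_values: {%x, %y, }".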
std::string dumpValueSet(
    const FastSet<const Value*>& value_set,
    const char* set_name) {
  std::ostringstream oss;
  oss << set_name << ": {";
  for (const auto* val : value_set) {
    oss << "%" << val->debugName() << ", ";
  }
  oss << "}";
  return oss.str();
}

namespace {

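// Runs the static-runtime-specific graph rewrites. Which passes run depends
// on the options (out variants, TensorExpr fusion) and on whether the
// fb-internal passes are compiled in (FBCODE_CAFFE2).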
void OptimizeGraph(
    std::shared_ptr<torch::jit::Graph>& graph,
    const StaticModuleOptions& opts,
    std::vector<IValue> sample_inputs) {
  GRAPH_DUMP("Before optimizations: ", graph);
  if (opts.enable_tensorexpr_fusion) {
    if (sample_inputs.empty()) {
      VLOG(1) << "Cannot perform TensorExpr fusion - sample_inputs is empty";
    } else {
      VLOG(1) << "Performing TensorExpr fusion";
      performTensorExprFusion(graph, std::move(sample_inputs));
    }
  }
  Inline(*graph);
  ConstantPropagation(graph);
  Canonicalize(graph);
  ConstantPropagation(graph);
  RemoveTensorMutation(graph);
  ConstantPropagation(graph);
  EliminateNoOpSlice(graph);
  EliminateDeadCode(graph);
  FuseInferenceOpsForSparseNN(graph);
  UseVariadicCat(graph);
  UseVariadicStack(graph);
  EliminateTrivialEquallySplit(graph);
  EliminateExtraPermuteOps(graph);

  if (opts.enable_out_variant) {
    UseVariadicOp(
        graph,
        fromQualString("fb::sigrid_transforms_torch_bind"),
        fromQualString("fb::variadic_sigrid_transforms_torch_bind"));
    UseVariadicOp(
        graph,
        fromQualString("torcharrow::inference_wrapper_run_flat"),
        fromQualString("torcharrow::variadic_inference_wrapper_run_flat"));
    // These fused ops only have out variants - we can't do the fusion when
    // out variants are disabled.
    FuseSignLog1P(graph);
    FuseClampNaNToNum(graph);

#ifdef FBCODE_CAFFE2
    if (opts.use_copy_variants && !opts.enable_tensorexpr_fusion) {
      ReplaceWithCopy(graph);
    } else {
      ReplacePermuteWithCopy(graph);
    }
    if (opts.use_maybe_copy_variants && !opts.enable_tensorexpr_fusion) {
      ReplaceWithMaybeCopy(graph);
    }
    FuseListUnpack(graph);
    RemoveUnnecessaryOutputs(graph);
    PrepackWeights(graph);
#endif
  }

  ConstantPropagation(graph);
  RemoveImmutableInputDictLookups(graph);
  UseVariadicTupleUnpack(graph);
  UseVariadicGroupedAccessor(graph);
  EliminateNoOps(
      graph, /* custom_ops */ {fromQualString("fb::scale_gradient")});
  AddIfThenElseOp(graph);
  UseSplitAndSqueeze(graph);
  UseInPlaceGetRealInputsFromOptionalInputsV2(graph);
  GRAPH_DUMP("Final graph after optimizations: ", graph);
}

bool IsSelfInGraphInput(std::shared_ptr<torch::jit::Graph>& graph) {
  return !graph->inputs().empty() && graph->inputs().at(0)->type()->is_module();
}

// remove unused input 0 from graph
bool removeSelfFromGraphInput(std::shared_ptr<torch::jit::Graph>& graph) {
  if (graph->inputs().at(0)->type()->is_module()) {
    if (graph->inputs().at(0)->hasUses()) {
      return false;
    }
    graph->eraseInput(0);
  }
  return true;
}

std::vector<Value*> valueVecFromFastSet(const FastSet<const Value*>& s) {
  std::vector<Value*> result;
  result.reserve(s.size());
  for (auto* v : s) {
    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
    result.emplace_back(const_cast<Value*>(v));
  }
  return result;
}

bool mayContainAlias(const AliasDb& db, const Value* v1, const Value* v2) {
  // AliasDb is not const-correct here, so we have to const_cast
  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
  return db.mayContainAlias(const_cast<Value*>(v1), const_cast<Value*>(v2));
}

bool mayContainAlias(
    const AliasDb& db,
    const Value* a,
    const FastSet<const Value*>& b) {
  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
  return db.mayContainAlias(const_cast<Value*>(a), valueVecFromFastSet(b));
}

bool escapesScope(const AliasDb& db, const Value* a) {
  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
  return db.escapesScope({const_cast<Value*>(a)});
}

void PrepareGraphForStaticModule(
    std::shared_ptr<torch::jit::Graph> graph,
    const StaticModuleOptions& opts,
    std::vector<IValue> sample_inputs) {
  TORCH_CHECK(canEnableStaticRuntime(graph));
  OptimizeGraph(graph, opts, std::move(sample_inputs));

  // Static runtime moves its outputs out of the runtime
  // by default. In some rare cases, this is not actually safe to
  // do - for example, if the value is a constant, static runtime
  // needs to hold onto a copy. Rather than adding special logic
  // to handle this rare case, we use this pass to detect it and
  // create an owned reference that can be safely moved out of the
  // runtime.
  CreateOwnedRefsForSpecialValues(*graph);

  // We assume that each sub-block has at least one output. If we
  // detect any that have 0, force the sub-block to return None.
  ForceNonEmptyOutputs(*graph);
}

std::pair<std::shared_ptr<Graph>, c10::optional<Module>> PrepareForStaticModule(
    const torch::jit::Module& m,
    bool is_frozen,
    const StaticModuleOptions& opts,
    std::vector<IValue> sample_inputs) {
  LOG(INFO) << "StaticModuleOptions: enable_out_variant "
            << opts.enable_out_variant << ", optimize_memory "
            << opts.optimize_memory << ", manage_output_tensors "
            << opts.manage_output_tensors << ", use_copy_variants "
            << opts.use_copy_variants << ", use_maybe_copy_variants "
            << opts.use_maybe_copy_variants << ", enable_tensorexpr_fusion "
            << opts.enable_tensorexpr_fusion;

  Module module = m.copy();
  if (!is_frozen) {
    module.eval();
    module = freeze_module(module);
  }

  Method method = module.get_method("forward");
  auto graph = module.get_method("forward").graph();

  if (!sample_inputs.empty() && IsSelfInGraphInput(graph)) {
    sample_inputs.insert(sample_inputs.begin(), m._ivalue());
  }
  PrepareGraphForStaticModule(graph, opts, std::move(sample_inputs));

  return std::make_pair(graph, module);
}

std::pair<std::shared_ptr<Graph>, c10::optional<Module>> PrepareForStaticModule(
    std::shared_ptr<torch::jit::Graph> graph,
    const StaticModuleOptions& opts,
    std::vector<IValue> sample_inputs) {
  PrepareGraphForStaticModule(graph, opts, std::move(sample_inputs));
  return std::make_pair(graph, c10::nullopt);
}

} // namespace

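// Partitions the block's values into two (possibly overlapping) alias sets:
// `external_aliases_` holds values that may alias the block's inputs or
// constants, and `output_aliases_` holds values that may alias the block's
// outputs. Both sets get special treatment in
// BlockInfo::prepare_for_memory_planner below.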
void ValueGroup::init(const Block& block, const AliasDb& db) {
  external_aliases_.clear();
  output_aliases_.clear();
  // Build `external_aliases` by walking the nodes forward from the graph's
  // inputs and collecting any values they create that may alias those
  // inputs.
  external_aliases_.insert(block.inputs().begin(), block.inputs().end());
  for (const auto* node : block.nodes()) {
    if (node->kind() == prim::Constant) {
      for (const auto* output : node->outputs()) {
        external_aliases_.insert(output);
      }
    }
  }
  for (const auto* node : block.nodes()) {
    if (node->kind() == prim::Constant) {
      // Constants are already in `external_aliases`.
      continue;
    }
    for (const auto* v : node->outputs()) {
      if (escapesScope(db, v) || mayContainAlias(db, v, external_aliases_)) {
        external_aliases_.insert(v);
      }
    }
  }

  // Build `output_aliases` by walking the nodes in reverse, starting from the
  // output values and following the data flow backward from there.
  output_aliases_.insert(block.outputs().begin(), block.outputs().end());
  for (const auto* node : block.nodes().reverse()) {
    if (node->kind() == prim::Constant) {
      // Constants cannot create any aliases.
      continue;
    }
    for (const auto* v : node->outputs()) {
      if (mayContainAlias(db, v, output_aliases_)) {
        output_aliases_.insert(v);
      }
    }
  }
}

namespace {

bool isTensorList(const Value* value) {
  auto* type = value->type()->castRaw<ListType>();
  if (!type) {
    return false;
  }
  return type->getElementType()->kind() == c10::TypeKind::TensorType;
}

bool containTensorsOnly(at::ArrayRef<Value*> values) {
  // return true only if all outputs are tensors
  return std::all_of(values.begin(), values.end(), [](const Value* value) {
    return value->type()->kind() == c10::TypeKind::TensorType ||
        isTensorList(value);
  });
}

bool isPureFunction(const Node* node) {
  auto* schema = node->maybeSchema();
  return schema &&
      schema->aliasAnalysis() == c10::AliasAnalysisKind::PURE_FUNCTION;
}

} // namespace

ManagedTensorRanges::ManagedTensorRanges(
    Block& block,
    const AliasDb& alias_db,
    const FastSet<const Value*>& managed_tensor_values) {
  const std::vector<Node*> nodes(block.nodes().begin(), block.nodes().end());
  const FastSet<const Value*> graph_inputs(
      block.inputs().begin(), block.inputs().end());

  const auto num_nodes = nodes.size();
  for (const auto i : c10::irange(num_nodes)) {
    auto* node = nodes[i];
    for (auto* input : node->inputs()) {
      auto* lifetime = getLifetime(input);
      if (!lifetime) {
        continue;
      }
      DCHECK(lifetime->end <= i);
      lifetime->end = i;
    }
    for (auto* output : node->outputs()) {
      if (!alias_db.isMutableType(output)) {
        continue;
      }
      value_lifetimes_.emplace(output, Lifetime(i, i));
    }
  }
  for (auto* graph_output : block.outputs()) {
    auto* lifetime = getLifetime(graph_output);
    if (!lifetime) {
      continue;
    }
    lifetime->end = num_nodes;
  }

  // Handle aliases. Aliases may extend a Value*'s lifetime. If a node
  // has an input and output that may alias each other, set the input's
  // lifetime end to max(input.lifetime_end, output.lifetime_end). Iterate
  // backwards to handle chains of aliases.
  for (const auto* node : block.nodes().reverse()) {
    if (isPureFunction(node)) {
      // If the node is a pure function, it doesn't create any aliases,
      // so we can safely skip it.
      continue;
    }

    auto inputs = collectValuesWithTrackedLifetimes(node->inputs());
    auto outputs = collectValuesWithTrackedLifetimes(node->outputs());
    for (auto* input : inputs) {
      auto* input_lifetime = getLifetime(input);
      DCHECK(input_lifetime != nullptr);
      for (auto* output : outputs) {
        if (mayContainAlias(alias_db, input, output)) {
          auto* output_lifetime = getLifetime(output);
          DCHECK(output_lifetime != nullptr);
          input_lifetime->end =
              std::max(output_lifetime->end, input_lifetime->end);
        }
      }
    }
  }
  for (auto* managed_tensor : managed_tensor_values) {
    auto* lifetime = getLifetime(managed_tensor);
    DCHECK(lifetime && lifetime->end <= num_nodes);
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    Node* freeing_node;
    if (lifetime->end == num_nodes) {
      freeing_node = block.return_node();
    } else {
      freeing_node = nodes[lifetime->end];
    }
    node_to_newly_free_tensors_[freeing_node].emplace_back(managed_tensor);
  }
}

bool ManagedTensorRanges::nodeFreesManagedTensors(Node* node) const {
  auto it = node_to_newly_free_tensors_.find(node);
  return it != node_to_newly_free_tensors_.end() && !it->second.empty();
}

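// Precondition: nodeFreesManagedTensors(node) is true. The .at() lookup
// throws if the node frees nothing.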
const std::vector<const Value*>& ManagedTensorRanges::
    availableTensorValuesAfterNode(Node* node) const {
  return node_to_newly_free_tensors_.at(node);
}

bool ManagedTensorRanges::lifetimesOverlap(const Value* v1, const Value* v2)
    const {
  const auto* v1_lifetime = getLifetime(v1);
  const auto* v2_lifetime = getLifetime(v2);
  if (!v1_lifetime || !v2_lifetime) {
    return false;
  }

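  // Two closed intervals overlap iff the one that starts first ends at or
  // after the other's start.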
  if (v1_lifetime->start < v2_lifetime->start) {
    return v1_lifetime->end >= v2_lifetime->start;
  }
  return v2_lifetime->end >= v1_lifetime->start;
}

const ManagedTensorRanges::Lifetime* ManagedTensorRanges::getLifetime(
    const Value* value) const {
  auto it = value_lifetimes_.find(value);
  if (it != value_lifetimes_.end()) {
    return &it->second;
  }
  return nullptr;
}

ManagedTensorRanges::Lifetime* ManagedTensorRanges::getLifetime(
    const Value* value) {
  // const_cast is safe here, this is just a way to avoid code duplication
  // between the const/non-const versions of getLifetime.

  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
  const auto* const_this = const_cast<const ManagedTensorRanges*>(this);

  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
  return const_cast<ManagedTensorRanges::Lifetime*>(
      const_this->getLifetime(value));
}

std::vector<const Value*> ManagedTensorRanges::
    collectValuesWithTrackedLifetimes(at::ArrayRef<const Value*> values) {
  std::vector<const Value*> mutable_values;
  mutable_values.reserve(values.size());
  std::copy_if(
      values.begin(),
      values.end(),
      std::back_inserter(mutable_values),
      [this](const Value* value) { return getLifetime(value) != nullptr; });
  return mutable_values;
}

StaticModule::StaticModule(
    std::shared_ptr<torch::jit::Graph> g,
    const StaticModuleOptions& opts,
    std::vector<IValue> sample_inputs)
    : StaticModule(
          PrepareForStaticModule(g->copy(), opts, std::move(sample_inputs)),
          opts) {}

StaticModule::StaticModule(
    const torch::jit::Module& m,
    bool is_frozen,
    const StaticModuleOptions& opts,
    std::vector<IValue> sample_inputs)
    : StaticModule(
          PrepareForStaticModule(m, is_frozen, opts, std::move(sample_inputs)),
          opts) {}

StaticModule::StaticModule(
    std::pair<std::shared_ptr<torch::jit::Graph>, c10::optional<Module>>
        graph_and_module,
    const StaticModuleOptions& opts)
    : opts_(opts),
      graph_(std::move(graph_and_module.first)),
      module_(std::move(graph_and_module.second)),
      num_inputs_(graph_->inputs().size()) {
  sr_metadata_ = c10::make_intrusive<jit::StaticRuntimeMetadata>(opts_);
  // recursively attach metadata to prim::fork nodes
  attachNodeMetadata(graph_->block());

  // check opt flags
  if (opts.manage_output_tensors) {
    TORCH_CHECK(
        opts_.enable_out_variant,
        "When manage_output_tensors is true, enable_out_variant must be set to true");
  }
  if (opts_.optimize_memory) {
    TORCH_CHECK(
        opts_.enable_out_variant,
        "When optimize_memory is true, enable_out_variant must be set to true");
  }

  // handle schema
  if (module_.has_value()) {
    Method method = module_->get_method("forward");
    schema_ = method.function().getSchema();
    const auto num_schema_args = schema_->arguments().size();
    DCHECK(num_schema_args > 0);
    if (removeSelfFromGraphInput(graph_)) {
      module_ = c10::nullopt;
      num_inputs_ = num_schema_args - 1;
    }
  }

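  // Count non-constant nodes and constants up front so that functions_ and
  // constants_ can each be reserved exactly once.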
  {
    size_t nodes_size = 0, constants_size = 0;
    for (Node* node : graph_->nodes()) {
      ++(node->kind() == prim::Constant ? constants_size : nodes_size);
    }

    constants_.reserve(constants_size);
    functions_.reserve(nodes_size);
  }

  // Create ProcessedFunction instances first to freeze their addresses to pass
  // to ProcessedNode.
  AliasDb alias_db(graph_, /*isFrozen=*/false);
  GRAPH_DEBUG("AliasDb: ", alias_db.toString());

  // Maps each Value* in the graph to its index in the values_ array that will
  // eventually be created by StaticRuntime.
  FastMap<const Value*, uint32_t> value_to_index;
  prepareFunctionsAndConstants(graph_->block(), alias_db, value_to_index);

  const auto constants_index_offset = 0;
  const auto values_index_offset = constants_index_offset + constants().size();
  value_buffer_size_ = values_index_offset;

  value_buffer_size_ +=
      prepareBlockInfo(graph_->block(), values_index_offset, value_to_index);

  prepareStaticNodeInfos(graph_->block(), value_to_index, alias_db);

  for (auto& block_and_info : block_infos_) {
    auto& block_info = block_and_info.second;
    block_info.prepare_for_memory_planner(alias_db, opts);
  }
}

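// Assigns every block input and (non-constant) node output in this block and
// its sub-blocks a slot in the shared values array, starting at start_idx.
// Returns the number of slots consumed.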
size_t StaticModule::prepareBlockInfo(
    Block* block,
    const size_t start_idx,
    FastMap<const Value*, uint32_t>& value_to_index) {
  block_infos_.emplace(block, BlockInfo(start_idx, *block));

  const auto num_inputs = block->inputs().size();
  for (const auto i : c10::irange(num_inputs)) {
    value_to_index.emplace(block->inputs()[i], start_idx + i);
  }
  auto cur_idx = start_idx + num_inputs;

  for (auto* node : block->nodes()) {
    for (auto* sub_block : node->blocks()) {
      cur_idx += prepareBlockInfo(sub_block, cur_idx, value_to_index);
    }

    if (node->kind() == prim::Constant) {
      continue;
    }

    TORCH_CHECK(
        cur_idx < (1 << 16),
        "outputs offset in values table ",
        cur_idx,
        " would overflow 2-byte index storage");

    const auto num_outputs = node->outputs().size();
    for (const auto i : c10::irange(num_outputs)) {
      value_to_index.emplace(node->outputs()[i], cur_idx + i);
    }
    cur_idx += num_outputs;
  }

  std::vector<uint16_t> output_indices;
  output_indices.reserve(block->outputs().size());
  for (auto* output : block->outputs()) {
    const auto output_idx = value_to_index.at(output);
    TORCH_CHECK(
        output_idx < (1 << 16),
        "outputs offset in values table ",
        output_idx,
        " would overflow 2-byte index storage");
    output_indices.push_back(output_idx);
  }

  block_infos_.at(block).set_output_indices(std::move(output_indices));
  return cur_idx - start_idx;
}

void StaticModule::attachNodeMetadata(Block* block) {
  for (auto* node : block->nodes()) {
    if (node->kind() == prim::fork) {
      node->ival_(getStaticRuntimeMetadataSymbol(), IValue(sr_metadata_));
    }
    for (auto* sub_block : node->blocks()) {
      attachNodeMetadata(sub_block);
    }
  }
}

void StaticModule::prepareFunctionsAndConstants(
    Block* block,
    const AliasDb& alias_db,
    FastMap<const Value*, uint32_t>& value_to_index) {
  for (auto* node : block->nodes()) {
    for (auto* sub_block : node->blocks()) {
      prepareFunctionsAndConstants(sub_block, alias_db, value_to_index);
    }

    if (node->kind() == prim::Constant) {
      auto* v = node->output();
      TORCH_CHECK(v->type()->kind() != FunctionType::Kind);
      value_to_index.emplace(v, constants_.size());
      constants_.emplace_back(toIValue(v).value());
      continue;
    }

    // see [Check and correct bad schema alias info at runtime]
    bool check_outputs_for_overlap =
        !alias_db.mayContainAlias(node->inputs(), node->outputs()) &&
        containTensorsOnly(node->outputs());
    // new ProcessedFunction
    functions_.emplace_back(
        node, opts_.enable_out_variant, check_outputs_for_overlap);
  }
}

size_t StaticModule::prepareStaticNodeInfos(
    Block* block,
    const FastMap<const Value*, uint32_t>& value_to_index,
    const AliasDb& alias_db,
    size_t node_idx) {
  const auto node_start = node_idx;

  auto& block_info = block_infos_.at(block);
  std::vector<StaticNodeInfo> nodes;
  FastMap<Node*, bool> node_has_out_variant;

  for (auto* node : block->nodes()) {
    if (node->kind() == prim::Constant) {
      continue;
    }

    for (auto* sub_block : node->blocks()) {
      node_idx +=
          prepareStaticNodeInfos(sub_block, value_to_index, alias_db, node_idx);
    }
    ProcessedNodeInputs input_indices(node->inputs().size());
    for (const auto input_idx : c10::irange(node->inputs().size())) {
      auto* input = node->inputs()[input_idx];
      auto input_ivalue_idx = value_to_index.at(input);
      TORCH_CHECK(
          input_ivalue_idx < (1 << 16),
          "input index in values table ",
          input_ivalue_idx,
          " would overflow 2-byte index storage");
      input_indices[input_idx] = input_ivalue_idx;
    }

    ProcessedFunction* fn = &functions_[node_idx];

    // create a new ProcessedNode
    const auto node_output_idx = node->outputs().empty()
        // The index is unused if there are no outputs, so just create a
        // placeholder value.
        ? std::numeric_limits<uint16_t>::max()
        : value_to_index.at(node->output(0));
    nodes.emplace_back(node, fn, std::move(input_indices), node_output_idx);

    node_has_out_variant.emplace(node, nodes.back().has_out_variant());
    ++node_idx;
  }

  block_info.set_nodes(std::move(nodes), node_has_out_variant);
  block_info.init_value_group(alias_db);

  return node_idx - node_start;
}

void BlockInfo::set_nodes(
    std::vector<StaticNodeInfo> nodes,
    const FastMap<Node*, bool>& node_has_out_variant) {
  nodes_ = std::move(nodes);

  for (auto& node : nodes_) {
    if (node.num_outputs() == 1 &&
        isOptimizableContainerType(node.node(), node_has_out_variant)) {
      node_is_optimizable_container_type_.emplace(node.node());
    }
  }
}

void BlockInfo::prepare_for_memory_planner(
    const AliasDb& alias_db,
    const StaticModuleOptions& opts) {
  if (!opts.enable_out_variant) {
    return;
  }

  // Never manage graph outputs so that we can do std::move(output_ivalue).
  // This does not affect performance if the graph returns a collection object.
  FastSet<const Value*> graph_output_values(
      block_.outputs().begin(), block_.outputs().end());

  // collect register indices of outputs of ops with out variant
  for (StaticNodeInfo& pnode : nodes_) {
    if (!pnode.has_out_variant()) {
      continue;
    }
    auto outputs = pnode.node()->outputs();
    for (const auto i : c10::irange(outputs.size())) {
      const Value* out_v = outputs[i];
      // Types are stored in the underlying TorchScript IR
      bool is_tensor_type = out_v->type()->castRaw<TensorType>();
      if (opts.manage_output_tensors && is_tensor_type &&
          graph_output_values.find(out_v) == graph_output_values.end() &&
          value_group_.isOutputAlias(out_v)) {
        managed_output_tensor_values_.insert(out_v);
        continue;
      }
      if (value_group_.isAlwaysAlive(out_v)) {
        continue;
      }
      if (is_tensor_type) {
        managed_tensor_values_.insert(out_v);
      } else if (node_is_optimizable_container_type(pnode.node())) {
        // We "leak" certain container types because their allocations
        // take a long time
        leaked_values_.insert(out_v);
      }
    }
  }

  for (const Value* output : block_.outputs()) {
    managed_tensor_values_.erase(output);
  }
  GRAPH_DEBUG("managed_tensor_values: ", dumpValueSet(managed_tensor_values_));
  GRAPH_DEBUG(
      "managed_output_tensor_values_: ",
      dumpValueSet(managed_output_tensor_values_));

  managed_tensor_ranges_ =
      ManagedTensorRanges(block_, alias_db, managed_tensor_values_);
}

const StaticModuleOptions& StaticModule::opts() const {
  return opts_;
}

size_t StaticModule::num_outputs() const {
  return graph_->outputs().size();
}

size_t StaticModule::num_inputs() const {
  return num_inputs_;
}

StaticRuntime& StaticModule::runtime() {
  if (!cached_runtime_) {
    cached_runtime_ = std::make_unique<StaticRuntime>(*this);
  }
  return *cached_runtime_;
}

Node* StaticModule::findNodeWithKindForTesting(const std::string& kind) const {
  for (auto& block_and_info : block_infos_) {
    auto& block_info = block_and_info.second;
    for (auto& pnode : block_info.nodes()) {
      if (pnode.node()->kind().toQualString() == kind) {
        return pnode.node();
      }
    }
  }
  return nullptr;
}

c10::IValue StaticModule::operator()(
    const std::vector<c10::IValue>& args,
    const KeywordArgs& kwargs) {
  return runtime()(args, kwargs);
}

c10::IValue StaticModule::operator()(
    std::vector<c10::IValue>&& args,
    const KeywordArgs& kwargs) {
  return runtime()(std::move(args), kwargs);
}

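// `values` points into the IValue array owned by the enclosing StaticRuntime;
// the root BlockRunner and the runners built for its sub-blocks all index into
// that same array using the offsets StaticModule computed above.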
BlockRunner::BlockRunner(
    const StaticModule& sm,
    IValue* values,
    Block* block,
    torch::jit::TaskLauncher* launcher,
    bool is_root_block)
    : static_module_(sm),
      block_info_(static_module_.block_info(block)),
      is_root_block_(is_root_block),
      first_input_is_self_(
          is_root_block_ && static_module_.first_input_is_self()),
      inputs_begin_(block_info_.block_inputs_idx()),
      // TODO(T108633124): Turn on manage output tensors for sub-blocks.
      manage_output_tensors_enabled_(
          is_root_block_ && sm.opts().manage_output_tensors),
      values_(values) {
  nodes_.reserve(block_info_.nodes().size());
  for (auto& pre_pnode : block_info_.nodes()) {
    nodes_.emplace_back(pre_pnode, values_);
  }

  for (auto index : block_info_.block_output_indices()) {
    outputs_.emplace_back(&values_[index]);
  }

  for (auto& pnode : nodes_) {
    auto* node = pnode.node();

    // attach the async taskLauncher to processedNodes
    pnode.set_metadata(launcher);
    auto blocks = node->blocks();
    const auto num_blocks = blocks.size();
    if (num_blocks == 0) {
      continue;
    }
    DCHECK(node->kind() == prim::If || node->kind() == prim::Loop);
    std::vector<BlockRunner> block_runners;
    block_runners.reserve(num_blocks);

    for (auto* b : blocks) {
      block_runners.emplace_back(sm, values_, b, launcher);
    }
    pnode.set_metadata(std::move(block_runners));
  }
}

// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
BlockRunner::BlockRunner(BlockRunner&&) noexcept = default;

BlockRunner::~BlockRunner() = default;

void BlockRunner::set_arg(const size_t idx, std::vector<IValue>&& args) {
  DCHECK(idx < args.size());
  Input(idx + first_input_is_self_) = std::move(args[idx]);
}

void BlockRunner::set_arg(const size_t idx, const std::vector<IValue>& args) {
  DCHECK(idx < args.size());
  Input(idx + first_input_is_self_) = args[idx];
}

void BlockRunner::set_arg(const size_t idx, const IValue& arg) {
  Input(idx + first_input_is_self_) = arg;
}

namespace {
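// Checks that a runtime argument is compatible with the schema argument's
// declared type. The tensor/TensorType case is special-cased to skip the
// more expensive subtype check.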
void check_type(const Argument& schema_arg, const IValue& arg) {
  // Fast path for most common case
  if (arg.isTensor() &&
      schema_arg.type()->kind() == c10::TypeKind::TensorType) {
    return;
  }
  TORCH_CHECK(arg.type()->isSubtypeOf(schema_arg.type()));
}
} // namespace

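// Binds positional args, then kwargs, then schema default values to this
// block's input slots. Only the root block with a known schema accepts
// kwargs; sub-blocks and schema-less graphs take positional args only.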
template <typename IValueList>
void BlockRunner::set_inputs(
    IValueList&& args,
    const std::unordered_map<std::string, c10::IValue>& kwargs) {
  const auto& schema = static_module_.schema();
  if (first_input_is_self_) {
    Input(0) = static_module_.module()._ivalue();
  }

  if (!is_root_block_ || C10_UNLIKELY(!schema)) {
    TORCH_CHECK(
        kwargs.empty(), "Schema is not available, but BlockRunner got kwargs.");

    const auto total_num_inputs = args.size() + first_input_is_self_;
    TORCH_CHECK(total_num_inputs == block_info_.num_inputs());

    for (size_t i = 0; i < args.size(); ++i) {
      set_arg(i, std::forward<IValueList>(args));
    }
    return;
  }

  const auto& schema_args = schema->arguments();
  size_t consumed_kwargs = 0;
  DCHECK(!schema_args.empty());
  TORCH_CHECK(
      args.size() < schema_args.size(),
      "Static runtime got too many arguments");
  for (size_t i = 0; i < schema_args.size() - 1; ++i) {
    // Start at 1 since the schema always contains `self`.
    const auto& schema_arg = schema_args[i + 1];

    if (i < args.size()) {
      check_type(schema_arg, args[i]);
      set_arg(i, std::forward<IValueList>(args));
      continue;
    }

    auto it = kwargs.find(schema_arg.name());
    if (it != kwargs.end()) {
      check_type(schema_arg, it->second);
      set_arg(i, it->second);
      ++consumed_kwargs;
      continue;
    }

    auto maybe_default_val = schema_arg.default_value();
    if (maybe_default_val) {
      set_arg(i, *maybe_default_val);
      continue;
    }

    TORCH_CHECK(
        false, "Static runtime is missing required kwarg ", schema_arg.name());
  }
  TORCH_CHECK(consumed_kwargs == kwargs.size());
}

void BlockRunner::create_memory_planner() {
  if (!planner_) {
    planner_ = std::make_unique<StandardMemoryPlanner>(
        this,
        block_info_,
        static_module_.opts().enable_out_variant,
        manage_output_tensors_enabled_,
        static_module_.opts().optimize_memory);
  }
}

namespace {

void destroyNodeOutputs(ProcessedNode& p_node) {
  const auto borrows_outputs = borrowsOutputs(p_node.node()->kind());
  for (const auto i : c10::irange(p_node.num_outputs())) {
    auto& output = p_node.Output(i);
    if (doesNotHeapAllocateWhenStoredInIValue(*output.type())) {
      continue;
    }

    if (borrows_outputs) {
      // NB: No need to incref here. This codepath is only hit if the run didn't
      // finish, so we shouldn't be returning anything to the client.
      c10::MaybeOwnedTraits<IValue>::destroyBorrow(output);
    } else {
      output = IValue();
    }
  }
}

} // namespace

void BlockRunner::clean_up_intermediate_ivalues() noexcept {
  // We have to iterate in reverse order here due to borrowed
  // IValues - we don't want to destroy a value until all of its
  // borrows are cleaned up!
  for (auto it = nodes_.rbegin(); it != nodes_.rend(); ++it) {
    destroyNodeOutputs(*it);
  }
}

void BlockRunner::resetMemory() noexcept {
  planner_.reset();
  // We must clean up intermediate values before inputs in case
  // there are borrowed inputs and static runtime owns the only
  // reference (e.g. the inputs were std::move'd into the runtime)
  clean_up_intermediate_ivalues();
  clean_up_input_ivalues();
}

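// Packs the block's outputs into a tuple, moving each IValue out of the
// runtime. The 1-3 output cases are unrolled to avoid building a std::vector
// for the most common arities.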
c10::IValue BlockRunner::move_outputs_to_tuple(uint32_t num_outputs) {
  switch (num_outputs) {
    case 1:
      return c10::ivalue::Tuple::create(IValue(std::move(*outputs_[0])));
    case 2:
      return c10::ivalue::Tuple::create(
          IValue(std::move(*outputs_[0])), IValue(std::move(*outputs_[1])));
    case 3:
      return c10::ivalue::Tuple::create(
          IValue(std::move(*outputs_[0])),
          IValue(std::move(*outputs_[1])),
          IValue(std::move(*outputs_[2])));
    default: {
      std::vector<c10::IValue> outputs;
      outputs.reserve(num_outputs);
      for (const auto i : c10::irange(num_outputs)) {
        // use move here. Otherwise, clean up outputs_[i] explicitly
        outputs.emplace_back(std::move(*outputs_[i]));
      }
      return c10::ivalue::Tuple::create(std::move(outputs));
    }
  }
}

/// [Check and correct bad schema alias info at runtime]
/// Static runtime relies on the operator schema's alias info to be correct for
/// memory planning. Because it's hard to enforce the alias info to be correct,
/// we need to do runtime detection for accidental aliases that do not comply
/// with the schema. Only aliases of managed tensors are problematic. To avoid
/// runtime crashes, we can add runtime detection and force the op to comply
/// with its schema by cloning the alias. Because all managed tensors' data_ptrs
/// are part of the internal buffer that the MemoryPlanner allocates, we can
/// check aliases by checking the memory overlap with this internal buffer. But
/// a tensor's storage can be resized during inference, so we need another way
/// to handle the resized case.
///
/// There are two ways for incorrect schema to break memory planning. Let's look
/// at two examples:
///
/// Example 1:
/// @code
///   def forward(x):
///       a = x + x
///       b = bad_op(a)  # b ends up aliasing a incorrectly
///       return (b)
/// @endcode
/// bad_op: its schema says it returns a new Tensor, but it actually returns an
/// alias. In this case, the memory planner would recognize `a` as a managed
/// tensor and clean up its memory before returning `b`. But `b` is actually an
/// alias of `a`; when `a`'s data_ptr gets reset, `b`'s data_ptr gets reset too.
///
/// Example 2:
/// @code
///   def forward(x):
///       a = x + x
///       a2 = bad_op(a)  # a2 ends up aliasing a incorrectly
///       b = a + a
///       c = b * b  # c shares storage with a
///       d = c + 2  # d shares storage with b
///       e = a2 * a2
///       return (d, e)
/// @endcode
/// With the memory reuse algorithm, `c` could end up sharing storage with `a`,
/// but because of bad_op, `a2` now aliases `a`. `c` overwrites `a` and
/// therefore `a2`, leading to the wrong results. We solve this problem with two
/// steps. Note this doesn't happen with the current memory reuse algorithm
/// because of the way it's implemented. Things could change with a different
/// implementation.
///
/// Step 1, annotate the ProcessedNodes with a flag `check_memory_overlap_` set
/// to true if its outputs do not alias its inputs as indicated by the AliasDb
/// and all of its outputs are Tensors. Then at runtime, we check that the
/// nodes' output tensors do not overlap with the internal buffer that the
/// MemoryPlanner allocates. For latency concerns, we only run this check for
/// fallback ops. The schemas of native ops and out variants are vetted and
/// enforced with static runtime unit tests. For the first iteration, we do a
/// full memory overlap check with
/// ProcessedNode::verify_and_correct_memory_overlap() because the internal
/// buffer doesn't exist yet.
///
/// Step 2, if a managed tensor gets resized during inference, it gets a new
/// data_ptr which is not from the buffer. We can tackle this corner case by
/// delaying the deallocation of the managed tensors to after the outputs are no
/// longer used (essentially merging the internal/output buffers into one).
/// Before the merging is implemented, we add another flag `overlap_detected_`
/// to flag any node with overlap detected in Step 1 and do a full memory
/// overlap check if the fast check (by checking memory overlap with internal
/// buffer) fails. There is still a corner case that fails with the added flag.
/// If a resize is triggered at the same time as the op creating an alias, the
/// current checks would fail to detect the alias.
void BlockRunner::verify_and_correct_memory_overlap(ProcessedNode& n) {
  // The slow check can be removed once the internal/output buffers are merged
  if (C10_UNLIKELY(n.check_outputs_for_memory_overlap())) {
    if (C10_UNLIKELY(!planner_)) {
      // slow check, for first iter only
      n.verify_and_correct_memory_overlap();
    } else {
      bool overlap_detected_with_fast_check = false;
      for (size_t i = 0; i < n.outputs().size(); i++) {
        auto& output = n.Output(i);
        if (output.isTensor()) {
          overlap_detected_with_fast_check |=
              fast_check_and_correct_overlap_with(n, output);
        } else if (output.isTensorList()) {
          auto tensor_list = output.toListRef();
          for (auto& ival : tensor_list) {
            overlap_detected_with_fast_check |=
                fast_check_and_correct_overlap_with(
                    n,
                    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
                    const_cast<c10::IValue&>(ival));
          }
        }
      }
      if (n.outputs_memory_overlap_detected() &&
          !overlap_detected_with_fast_check) {
        // slow check. Only run when the fast check fails.
        n.verify_and_correct_memory_overlap();
      }
    }
  }
}

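// Fast path of the check above: a tensor whose storage lives inside the
// memory planner's internal buffer must alias a managed tensor, so it is
// cloned out of the buffer and the node is flagged for future slow checks.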
bool BlockRunner::fast_check_and_correct_overlap_with(
    ProcessedNode& n,
    c10::IValue& tensor_ival) {
  auto& tensor = tensor_ival.toTensor();
  if (planner_->overlapWithInternalBuffer(tensor.data_ptr())) {
    DLOG(INFO) << "Detected alias for node: " << PrintNode(n.node());
    tensor_ival = at::native::clone(tensor, c10::nullopt);
    n.set_outputs_memory_overlap_detected();
    return true;
  }
  return false;
}

BlockRunner::Deallocator::~Deallocator() {
  // Assume cleanup cannot throw.
  cleanupImpl();
#ifndef NDEBUG
  block_runner_.check_for_memory_leak(/*output_returned*/ false);
#endif
}

void BlockRunner::Deallocator::cleanupImpl() {
  // MemoryPlanner is created after the first invocation of `run()`. This
  // is done intentionally because MemoryPlanner uses `Tensor` sizes of
  // the previous `run()` for memory planning of subsequent runs
  if (C10_LIKELY(finished_)) {
    block_runner_.create_memory_planner();
  }

  if (C10_LIKELY(block_runner_.planner_)) {
    block_runner_.planner_->deallocate();
  } else {
    // This is the first run, and it didn't finish, so we can't use a
    // `MemoryPlanner` to deallocate stuff. Just reset everything manually.
1200 | block_runner_.resetMemory(); |
1201 | } |
1202 | // clean up owning refs of input tensors |
1203 | block_runner_.clean_up_input_ivalues(); |
1204 | if (C10_UNLIKELY(!finished_)) { |
1205 | block_runner_.deallocateOutputTensors(); |
1206 | } |
1207 | } |
1208 | |
1209 | template <typename IValueList> |
1210 | c10::IValue BlockRunner::run_impl( |
1211 | IValueList&& args, |
1212 | const KeywordArgs& kwargs) { |
1213 | // We assume inference workloads, so we do not need |
1214 | // autograd. Enabling this is a significant win on dispatcher |
1215 | // overhead because it saves a round of dispatch for at least some |
1216 | // functions, such as resize_ and resize_as_. |
1217 | c10::InferenceMode mode; |
1218 | |
1219 | { |
1220 | auto on_exit = Deallocator(*this); |
1221 | |
1222 | if (planner_) { |
1223 | DCHECK(!manage_output_tensors_enabled_ || checkOutputTensorMemoryLeaks()); |
1224 | planner_->allocate(); |
1225 | } |
1226 | |
1227 | set_inputs(std::forward<IValueList>(args), kwargs); |
1228 | |
1229 | for (auto& n : nodes_) { |
1230 | // LOG(INFO) << "Running node: " << PrintNode(n.node()); |
1231 | n.run(); |
1232 | // Check for incorrect schema alias info. |
1233 | verify_and_correct_memory_overlap(n); |
1234 | } |
1235 | on_exit.setFinished(); |
1236 | } |
1237 | |
1238 | // no need to keep references of outputs in static runtime anymore |
1239 | if (block_info_.num_outputs() > 1) { |
1240 | return move_outputs_to_tuple(block_info_.num_outputs()); |
1241 | } |
1242 | |
1243 | DCHECK(check_for_memory_leak(/*output_returned*/ false)); |
1244 | |
1245 | // use move here. Otherwise, clean up outputs_[0] explicitly |
1246 | return std::move(*outputs_[0]); |
1247 | } |
1248 | |
1249 | template <typename IValueList> |
1250 | c10::IValue BlockRunner::run_impl_record_functions( |
1251 | IValueList&& args, |
1252 | const KeywordArgs& kwargs) { |
1253 | auto step_callbacks = |
1254 | at::getStepCallbacksUnlessEmpty(at::RecordScope::STATIC_RUNTIME_MODEL); |
1255 | if (C10_UNLIKELY(step_callbacks.has_value())) { |
1256 | at::RecordFunction guard(std::move(*step_callbacks)); |
1257 | TORCH_INTERNAL_ASSERT_DEBUG_ONLY(guard.isActive()); |
1258 | guard.needsInputs() |
1259 | ? guard.before( |
1260 | "forward" , c10::ArrayRef<const IValue>(args.data(), args.size())) |
1261 | : guard.before("forward" ); |
1262 | |
1263 | return run_impl(std::forward<IValueList>(args), kwargs); |
1264 | } |
1265 | return run_impl(std::forward<IValueList>(args), kwargs); |
1266 | } |
1267 | |
1268 | template <typename IValueList> |
1269 | c10::intrusive_ptr<c10::ivalue::Future> BlockRunner::run_impl_async( |
1270 | IValueList&& args, |
1271 | const KeywordArgs& kwargs) { |
1272 | // run the graph inline in the caller thread. Async ops will be |
1273 | // executed on taskLauncher attached to the metadata of ProcessedNodes |
1274 | c10::IValue output = run_impl(args, kwargs); |
1275 | |
1276 | // If the output is of type future, return it |
1277 | if (output.isFuture()) { |
1278 | return output.toFuture(); |
1279 | } |
1280 | |
1281 | // wrap the output into future, mark completed and return it |
1282 | TypePtr return_type; |
1283 | if (block_info_.num_outputs() > 1) { |
1284 | return_type = TupleType::create( |
1285 | fmap(outputs(), [](const IValue* v) { return v->type(); })); |
1286 | } else { |
1287 | return_type = outputs().at(0)->type(); |
1288 | } |
1289 | c10::intrusive_ptr<Future> future = c10::make_intrusive<Future>(return_type); |
1290 | future->markCompleted(output); |
1291 | return future; |
1292 | } |
1293 | |
1294 | template <typename IValueList> |
1295 | c10::intrusive_ptr<c10::ivalue::Future> BlockRunner:: |
1296 | run_impl_record_functions_async( |
1297 | IValueList&& args, |
1298 | const KeywordArgs& kwargs) { |
1299 | auto step_callbacks = |
1300 | at::getStepCallbacksUnlessEmpty(at::RecordScope::STATIC_RUNTIME_MODEL); |
1301 | if (C10_UNLIKELY(step_callbacks.has_value())) { |
1302 | at::RecordFunction guard(std::move(*step_callbacks)); |
1303 | TORCH_INTERNAL_ASSERT_DEBUG_ONLY(guard.isActive()); |
1304 | guard.needsInputs() |
1305 | ? guard.before( |
1306 | "forward" , c10::ArrayRef<const IValue>(args.data(), args.size())) |
1307 | : guard.before("forward" ); |
1308 | |
1309 | return run_impl_async(std::forward<IValueList>(args), kwargs); |
1310 | } |
1311 | return run_impl_async(std::forward<IValueList>(args), kwargs); |
1312 | } |
1313 | |
1314 | c10::IValue BlockRunner::operator()( |
1315 | const std::vector<c10::IValue>& args, |
1316 | const KeywordArgs& kwargs) { |
1317 | #ifdef PYTORCH_DISABLE_NET_PROFILING |
1318 | return run_impl(args, kwargs); |
1319 | #else |
1320 | return run_impl_record_functions(args, kwargs); |
1321 | #endif |
1322 | } |
1323 | |
1324 | c10::IValue BlockRunner::operator()( |
1325 | std::vector<c10::IValue>&& args, |
1326 | const KeywordArgs& kwargs) { |
1327 | #ifdef PYTORCH_DISABLE_NET_PROFILING |
1328 | return run_impl(std::move(args), kwargs); |
1329 | #else |
1330 | return run_impl_record_functions(std::move(args), kwargs); |
1331 | #endif |
1332 | } |
1333 | |
1334 | c10::intrusive_ptr<c10::ivalue::Future> BlockRunner::runAsync( |
1335 | const std::vector<c10::IValue>& args, |
1336 | const KeywordArgs& kwargs) { |
1337 | #ifdef PYTORCH_DISABLE_NET_PROFILING |
1338 | return run_impl_async(args, kwargs); |
1339 | #else |
1340 | return run_impl_record_functions_async(args, kwargs); |
1341 | #endif |
1342 | } |
1343 | |
1344 | c10::intrusive_ptr<c10::ivalue::Future> BlockRunner::runAsync( |
1345 | std::vector<c10::IValue>&& args, |
1346 | const KeywordArgs& kwargs) { |
1347 | #ifdef PYTORCH_DISABLE_NET_PROFILING |
1348 | return run_impl_async(std::move(args), kwargs); |
1349 | #else |
1350 | return run_impl_record_functions_async(std::move(args), kwargs); |
1351 | #endif |
1352 | } |
1353 | |
1354 | namespace { |
1355 | |
1356 | std::string generate_latency_json(const std::string& label, double millis) { |
1357 | #ifdef FBCODE_CAFFE2 |
1358 | folly::dynamic json = folly::dynamic::object(); |
1359 | json["type" ] = label; |
1360 | json["metric" ] = "latency" ; |
1361 | json["unit" ] = "ms" ; |
1362 | json["value" ] = millis; |
1363 | return "PyTorchObserver " + folly::toJson(json); |
1364 | #else |
1365 | return "" ; |
1366 | #endif |
1367 | } |
1368 | |
1369 | } // namespace |
1370 | |
1371 | void BlockRunner::benchmark( |
1372 | const std::vector<std::vector<c10::IValue>>& args_list, |
1373 | const std::vector<KeywordArgs>& kwargs_list, |
1374 | const int warmup_runs, |
1375 | const int main_runs, |
1376 | bool print_per_node_time, |
1377 | bool generate_ai_pep_output) { |
1378 | TORCH_CHECK(kwargs_list.empty() || args_list.size() == kwargs_list.size()); |
1379 | std::cout << "Input size: " << args_list.size() << std::endl; |
1380 | float time_per_iter = |
1381 | benchmark_model(args_list, kwargs_list, warmup_runs, main_runs); |
1382 | std::cout << "Static runtime ms per iter: " << time_per_iter |
1383 | << ". Iters per second: " << 1000.0 / time_per_iter << std::endl; |
1384 | |
1385 | IndividualMetrics results = |
1386 | benchmark_individual_ops(args_list, kwargs_list, warmup_runs, main_runs); |
1387 | |
1388 | if (print_per_node_time) { |
1389 | for (const auto i : c10::irange(nodes_.size())) { |
1390 | const Node* node = nodes_[i].node(); |
1391 | std::cout << "Node #" << i << ": " << results.time_per_node[i] |
1392 | << " ms/iter, " ; |
1393 | node->print(std::cout, 0, nullptr, false); |
1394 | } |
1395 | } |
1396 | |
1397 | std::vector<std::pair<std::string, double>> time_per_node_type_vec{ |
1398 | results.time_per_node_type.begin(), results.time_per_node_type.end()}; |
1399 | if (args_list.empty()) { |
1400 | std::sort( |
1401 | time_per_node_type_vec.begin(), |
1402 | time_per_node_type_vec.end(), |
1403 | [&results](auto& left, auto& right) { |
1404 | return results.instances_per_node_type[left.first] > |
1405 | results.instances_per_node_type[right.first]; |
1406 | }); |
1407 | } else { |
1408 | std::sort( |
1409 | time_per_node_type_vec.begin(), |
1410 | time_per_node_type_vec.end(), |
1411 | [](auto& left, auto& right) { return left.second > right.second; }); |
1412 | } |
1413 | std::cout << "Time per node type:" << std::endl; |
1414 | for (const auto& p : time_per_node_type_vec) { |
1415 | const std::string& kind = p.first; |
1416 | const double ms = p.second; |
1417 | std::cout << std::setw(15) << ms << " ms. " << std::setw(10) |
1418 | << results.percent_per_node_type[kind] << "%. " << kind << " (" |
1419 | << results.instances_per_node_type[kind] << " nodes" ; |
1420 | if (results.out_nodes.count(kind)) { |
1421 | std::cout << ", out variant)" << std::endl; |
1422 | } else if (results.native_nodes.count(kind)) { |
1423 | std::cout << ", native)" << std::endl; |
1424 | } else { |
1425 | std::cout << ")" << std::endl; |
1426 | } |
1427 | |
1428 | if (generate_ai_pep_output) { |
1429 | LOG(INFO) << generate_latency_json(kind, ms); |
1430 | } |
1431 | } |
1432 | if (generate_ai_pep_output) { |
1433 | LOG(INFO) << generate_latency_json( |
1434 | "static_runtime_first_iter" , results.first_iter_time); |
1435 | } |
1436 | std::cout << std::setw(15) << results.total_time << " ms. in Total" |
1437 | << std::endl; |
1438 | std::cout << "BlockRunner setup time: " << results.setup_time << " ms" |
1439 | << std::endl; |
1440 | std::cout << "Memory allocation time: " << results.memory_alloc_time |
1441 | << " ms\n" ; |
1442 | std::cout << "Memory deallocation time: " << results.memory_dealloc_time |
1443 | << " ms" << std::endl; |
1444 | std::cout << "Outputs deallocation time: " << results.output_dealloc_time |
1445 | << " ms" << std::endl; |
1446 | std::cout << "First iter time: " << results.first_iter_time << " ms" |
1447 | << std::endl; |
1448 | std::cout << "Number of operators: " << nodes_.size() << std::endl; |
1449 | |
1450 | if (planner_) { |
1451 | std::cout << "Total number of managed tensors: " |
1452 | << planner_->total_num_managed_tensors() << std::endl; |
1453 | std::cout << "Total number of managed output tensors: " |
1454 | << planner_->total_num_managed_output_tensors() << std::endl; |
1455 | std::cout << "Total number of unmanaged values: " |
1456 | << planner_->total_num_unmanaged() << std::endl; |
1457 | std::cout << "Number of unmanaged values requiring cleanup: " |
1458 | << planner_->num_unmanaged_non_scalars() << std::endl; |
1459 | std::cout << "Number of unmanaged values not requiring cleanup: " |
1460 | << planner_->num_unmanaged_scalars() << std::endl; |
1461 | std::cout << "Total memory managed: " << planner_->total_managed() |
1462 | << " bytes" << std::endl; |
1463 | if (static_module_.opts().optimize_memory) { |
1464 | std::cout << "Total number of reused tensors: " |
1465 | << planner_->total_reused_tensors() << std::endl; |
1466 | } |
1467 | } |
1468 | |
1469 | auto unsupported_nodes_count = results.total_nodes_count - |
1470 | results.out_nodes_count - results.native_nodes.size(); |
1471 | std::cout << "Total number of 'out' variant nodes/total number of nodes: " |
1472 | << results.out_nodes_count << "/" << results.total_nodes_count |
1473 | << " (" |
1474 | << 100.0 * (results.out_nodes_count) / |
1475 | static_cast<float>(results.total_nodes_count) |
1476 | << "%)" << std::endl; |
1477 | std::cout << "Total number of nodes not covered by SR/total number of nodes: " |
1478 | << unsupported_nodes_count << "/" << results.total_nodes_count |
1479 | << " (" |
1480 | << 100.0 * (unsupported_nodes_count) / |
1481 | static_cast<float>(results.total_nodes_count) |
1482 | << "%)" << std::endl; |
1483 | |
1484 | check_for_memory_leak(); |
1485 | |
#ifndef NDEBUG
  if (!args_list.empty()) {
    KeywordArgs empty_kwargs;
    display_nodes(
        args_list[0], kwargs_list.empty() ? empty_kwargs : kwargs_list[0]);
  }
#endif
1491 | } |
1492 | |
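// Runs the whole block over every (args, kwargs) pair for `warmup_runs`
// untimed iterations followed by `main_runs` timed iterations, and returns
// the average latency in milliseconds per inference.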
1493 | float BlockRunner::benchmark_model( |
1494 | const std::vector<std::vector<c10::IValue>>& args_list, |
1495 | const std::vector<KeywordArgs>& kwargs_list, |
1496 | const int warmup_runs, |
1497 | const int main_runs) { |
1498 | TORCH_CHECK(warmup_runs >= 0 && main_runs >= 1); |
1499 | TORCH_CHECK(kwargs_list.empty() || args_list.size() == kwargs_list.size()); |
1500 | |
1501 | const bool is_kwargs_empty = kwargs_list.empty(); |
1502 | const KeywordArgs empty_kwargs; |
1503 | for (const auto i : c10::irange(warmup_runs)) { |
1504 | (void)i; // Suppress unused variable warning |
1505 | for (const auto j : c10::irange(args_list.size())) { |
1506 | operator()(args_list[j], is_kwargs_empty ? empty_kwargs : kwargs_list[j]); |
1507 | if (manage_output_tensors_enabled_) { |
1508 | deallocateOutputTensors(); |
1509 | } |
1510 | } |
1511 | } |
1512 | caffe2::Timer timer; |
1513 | for (const auto i : c10::irange(main_runs)) { |
1514 | (void)i; // Suppress unused variable warning |
1515 | for (const auto j : c10::irange(args_list.size())) { |
1516 | operator()(args_list[j], is_kwargs_empty ? empty_kwargs : kwargs_list[j]); |
1517 | if (manage_output_tensors_enabled_) { |
1518 | deallocateOutputTensors(); |
1519 | } |
1520 | } |
1521 | } |
1522 | float millis = timer.MilliSeconds(); |
1523 | return millis / (static_cast<float>(main_runs) * args_list.size()); |
1524 | } |
1525 | |
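// Prints a one-line summary of `iv` (its type plus shape, size, or value) to
// stdout. Returns true if the IValue's type is handled here, false otherwise
// so the caller can fall back to printing the static type instead.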
1526 | bool display_ivalue(const IValue& iv) { |
  if (iv.isTensor()) {
    const auto& t = iv.toTensor();
    const auto sizes = t.sizes();
    std::cout << "Tensor " << t.toString() << " {";
    for (const auto i : c10::irange(sizes.size())) {
      std::cout << sizes[i];
      if (sizes.size() > i + 1) {
        std::cout << ", ";
      }
    }
    std::cout << "}\n";
    return true;
  } else if (iv.isTensorList()) {
    std::cout << "TensorList {" << iv.toTensorList().size() << "}\n";
    return true;
  } else if (iv.isGenericDict()) {
    std::cout << "Dict {" << iv.toGenericDict().size() << "}\n";
    return true;
  } else if (iv.isTuple()) {
    std::cout << "Tuple {" << iv.toTupleRef().elements().size() << "}\n";
    return true;
  } else if (iv.isInt()) {
    std::cout << "int {" << iv.toInt() << "}\n";
    return true;
  } else if (iv.isBool()) {
    std::cout << "bool {" << iv.toBool() << "}\n";
    return true;
  } else if (iv.isDouble()) {
    std::cout << "double {" << iv.toDouble() << "}\n";
    return true;
  }
1556 | return false; |
1557 | } |
1558 | |
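// Prints the node's IR followed by a summary of each of its input and output
// IValues, falling back to the corresponding Value's static type when the
// runtime value is not handled by display_ivalue.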
1559 | void display_pnode_info(const ProcessedNode& pnode) { |
1560 | pnode.node()->print(std::cout, 0, nullptr, false); |
1561 | for (const auto i : c10::irange(pnode.num_inputs())) { |
    std::cout << "\ti" << i << ": ";
1563 | if (!display_ivalue(pnode.Input(i))) { |
1564 | std::cout << *(pnode.node()->inputs()[i]->type()) << '\n'; |
1565 | } |
1566 | } |
1567 | const auto outputs = pnode.outputs(); |
1568 | for (const auto i : c10::irange(outputs.size())) { |
    std::cout << "\to" << i << ": ";
1570 | if (!display_ivalue(outputs[i])) { |
1571 | std::cout << *(pnode.node()->outputs()[i]->type()) << '\n'; |
1572 | } |
1573 | } |
1574 | } |
1575 | |
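// Debugging helper: runs the block one node at a time and dumps each node's
// inputs and outputs immediately after it executes. Memory is allocated via
// the planner and released by the Deallocator scope guard, as in a normal run.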
1576 | void BlockRunner::display_nodes( |
1577 | const std::vector<c10::IValue>& args, |
1578 | const KeywordArgs& kwargs) { |
1579 | c10::InferenceMode mode; |
1580 | |
1581 | auto on_exit = Deallocator(*this); |
1582 | |
1583 | if (planner_) { |
1584 | planner_->allocate(); |
1585 | } |
1586 | set_inputs(args, kwargs); |
1587 | |
1588 | for (auto& node : nodes_) { |
1589 | node.run(); |
1590 | display_pnode_info(node); |
1591 | } |
1592 | on_exit.setFinished(); |
1593 | } |
1594 | |
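// Collects per-node and per-node-type timing statistics. When `args_list` is
// empty, only static statistics (node counts and out/native classification)
// are gathered without executing the graph. Otherwise the block is run for
// `warmup_runs` warmup iterations and `main_runs` timed iterations, and the
// per-node, memory allocation/deallocation, and output deallocation times are
// averaged over all timed iterations.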
1595 | BlockRunner::IndividualMetrics BlockRunner::benchmark_individual_ops( |
1596 | const std::vector<std::vector<c10::IValue>>& args_list, |
1597 | const std::vector<KeywordArgs>& kwargs_list, |
1598 | const int warmup_runs, |
1599 | const int main_runs) { |
1600 | TORCH_CHECK(kwargs_list.empty() || args_list.size() == kwargs_list.size()); |
1601 | TORCH_CHECK(warmup_runs >= 1 && main_runs >= 1); |
1602 | |
1603 | IndividualMetrics results; |
1604 | results.time_per_node.resize(nodes_.size(), 0); |
1605 | if (args_list.empty()) { |
1606 | // When the given input is empty, compute the op statistics from the given |
1607 | // graph without executing it. |
1608 | for (const auto i : c10::irange(nodes_.size())) { |
1609 | const Node* node = nodes_[i].node(); |
1610 | std::string kind(node->kind().toQualString()); |
1611 | // TODO: Collect op statistics from sub-blocks here. |
1612 | results.time_per_node[i] = 0; |
1613 | results.time_per_node_type[kind] = 0; |
1614 | results.instances_per_node_type[kind]++; |
1615 | if (nodes_[i].has_out_variant()) { |
1616 | results.out_nodes.insert(kind); |
1617 | results.out_nodes_count++; |
1618 | } else if (nodes_[i].has_native()) { |
1619 | results.native_nodes.insert(kind); |
1620 | } |
1621 | results.total_time += results.time_per_node[i]; |
1622 | } |
1623 | results.total_nodes_count = nodes_.size(); |
1624 | results.memory_alloc_time = 0; |
1625 | results.memory_dealloc_time = 0; |
1626 | results.output_dealloc_time = 0; |
1627 | for (const auto& p : results.time_per_node_type) { |
1628 | const std::string& kind = p.first; |
1629 | results.percent_per_node_type[kind] = 0; |
1630 | } |
1631 | return results; |
1632 | } |
1633 | |
1634 | const bool is_kwargs_empty = kwargs_list.empty(); |
1635 | const KeywordArgs empty_kwargs; |
1636 | bool manage_output_tensors = static_module_.opts().manage_output_tensors; |
1637 | // See comment on above use of InferenceMode for |
1638 | // explanation. |
1639 | c10::InferenceMode mode; |
1640 | |
1641 | // setup time |
1642 | caffe2::Timer timer; |
1643 | |
1644 | set_inputs(args_list[0], is_kwargs_empty ? empty_kwargs : kwargs_list[0]); |
1645 | |
1646 | results.setup_time = timer.MilliSeconds(); |
1647 | |
1648 | // The first iteration profiles each node's output Tensors' sizes and |
  // initializes the memory planner with the profile information. Following
1650 | // iterations just use the already established memory planning. |
1651 | timer.Start(); |
1652 | operator()(args_list[0], is_kwargs_empty ? empty_kwargs : kwargs_list[0]); |
1653 | if (manage_output_tensors) { |
1654 | deallocateOutputTensors(); |
1655 | } |
1656 | results.first_iter_time = timer.MilliSeconds(); |
1657 | |
1658 | // warmup runs |
1659 | for (const auto i : c10::irange(warmup_runs - 1)) { |
1660 | (void)i; // Suppress unused variable warning |
1661 | for (const auto j : c10::irange(args_list.size())) { |
1662 | operator()(args_list[j], is_kwargs_empty ? empty_kwargs : kwargs_list[j]); |
1663 | if (manage_output_tensors) { |
1664 | deallocateOutputTensors(); |
1665 | } |
1666 | } |
1667 | } |
1668 | |
1669 | // main runs |
1670 | for (const auto i : c10::irange(main_runs)) { |
1671 | (void)i; // Suppress unused variable warning |
1672 | |
1673 | for (const auto j : c10::irange(args_list.size())) { |
1674 | set_inputs(args_list[j], is_kwargs_empty ? empty_kwargs : kwargs_list[j]); |
1675 | |
1676 | timer.Start(); |
1677 | if (planner_) { |
1678 | planner_->allocate(); |
1679 | } |
1680 | float millis = timer.MilliSeconds(); |
1681 | results.memory_alloc_time += millis; |
1682 | |
1683 | for (const auto k : c10::irange(nodes_.size())) { |
1684 | timer.Start(); |
1685 | nodes_[k].run(); |
1686 | millis = timer.MilliSeconds(); |
1687 | results.time_per_node[k] += millis; |
1688 | verify_and_correct_memory_overlap(nodes_[k]); |
1689 | } |
1690 | timer.Start(); |
1691 | create_memory_planner(); |
1692 | planner_->deallocate(); |
1693 | // clean up owning refs of input tensors |
1694 | clean_up_input_ivalues(); |
1695 | if (manage_output_tensors) { |
1696 | deallocateOutputTensors(); |
1697 | } |
1698 | millis = timer.MilliSeconds(); |
1699 | results.memory_dealloc_time += millis; |
1700 | |
1701 | timer.Start(); |
1702 | // no need to keep references of outputs in static runtime anymore |
1703 | c10::IValue output; |
1704 | if (static_module_.num_outputs() > 1) { |
1705 | output = move_outputs_to_tuple(static_module_.num_outputs()); |
1706 | } |
1707 | |
1708 | DCHECK(check_for_memory_leak(/*output_returned*/ false)); |
1709 | |
      // Use move here; otherwise, outputs_[0] would have to be cleaned up explicitly.
1711 | output = std::move(*outputs_[0]); |
1712 | // release outputs explicitly to measure the time it takes |
1713 | output = IValue(); |
1714 | millis = timer.MilliSeconds(); |
1715 | results.output_dealloc_time += millis; |
1716 | } |
1717 | } |
1718 | |
1719 | // post processing |
1720 | const float num_total_iters = |
1721 | (static_cast<float>(main_runs) * args_list.size()); |
1722 | for (const auto i : c10::irange(nodes_.size())) { |
1723 | const Node* node = nodes_[i].node(); |
1724 | std::string kind = std::string(node->kind().toQualString()); |
1725 | results.time_per_node[i] /= num_total_iters; |
1726 | results.time_per_node_type[kind] += results.time_per_node[i]; |
1727 | results.instances_per_node_type[kind]++; |
1728 | if (nodes_[i].has_out_variant()) { |
1729 | results.out_nodes.insert(kind); |
1730 | results.out_nodes_count++; |
1731 | } else if (nodes_[i].has_native()) { |
1732 | results.native_nodes.insert(kind); |
1733 | } |
1734 | results.total_time += results.time_per_node[i]; |
1735 | } |
1736 | results.total_nodes_count = nodes_.size(); |
1737 | results.memory_alloc_time /= num_total_iters; |
1738 | results.memory_dealloc_time /= num_total_iters; |
1739 | results.output_dealloc_time /= num_total_iters; |
1740 | for (const auto& p : results.time_per_node_type) { |
1741 | const std::string& kind = p.first; |
1742 | results.percent_per_node_type[kind] = p.second / results.total_time * 100; |
1743 | } |
1744 | return results; |
1745 | } |
1746 | |
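// Sanity check that inputs and intermediate IValues were reset at the end of
// an iteration. Output IValues only need to be cleaned up once they have been
// returned to the caller (`output_returned`), and managed output tensors are
// exempt because the client releases them via deallocateOutputTensors().
// Violations trigger TORCH_CHECK failures; the function itself always returns
// true.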
1747 | bool BlockRunner::check_for_memory_leak( |
1748 | bool output_returned, |
1749 | bool recurse_on_sub_blocks) { |
1750 | // check for inputs |
1751 | for (const auto i : c10::irange(block_info_.num_inputs())) { |
1752 | TORCH_CHECK( |
1753 | values_[i + block_info_.block_inputs_idx()].isNone(), |
1754 | "Input " , |
1755 | i, |
1756 | " was not cleaned up" ); |
1757 | } |
1758 | FastSet<const IValue*> output_ivalues(outputs_.begin(), outputs_.end()); |
1759 | for (const auto n : c10::irange(nodes_.size())) { |
1760 | auto& pnode = nodes_[n]; |
1761 | for (const auto i : c10::irange(pnode.num_outputs())) { |
1762 | const IValue* ival = &pnode.Output(i); |
1763 | const Value* val = pnode.node()->output(i); |
1764 | // subtlety: isManagedOutputTensorValue may give a false |
1765 | // negative here if an output is an alias of this value, so |
1766 | // check the actual tensor! |
1767 | if (planner_ && |
1768 | (isManagedOutputTensor(*ival) || isManagedOutputTensorValue(val))) { |
1769 | // `ival` contains a managed output tensor that the runtime doesn't |
1770 | // reclaim at the end of an iteration, but the client does so |
1771 | // by explicitly calling |
1772 | // `BlockRunner::deallocateOutputTensors`. |
1773 | continue; |
1774 | } |
1775 | const std::string error_msg = "Output " + c10::to_string(i) + ", %" + |
1776 | val->debugName() + " of node " + c10::to_string(n) + |
1777 | " which has kind " + pnode.node()->kind().toQualString() + |
1778 | " was not cleaned up" ; |
1779 | if (output_ivalues.count(ival) == 0) { |
1780 | // check for intermediates |
1781 | if (!ival->isNone()) { |
1782 | TORCH_CHECK( |
1783 | ival->isTensor() || |
1784 | block_info_.node_is_optimizable_container_type( |
1785 | pnode.node()) || |
1786 | doesNotHeapAllocateWhenStoredInIValue(*val->type()), |
1787 | error_msg); |
1788 | if (ival->isTensor()) { |
1789 | const auto& t = ival->toTensor(); |
1790 | if (t.defined()) { |
1791 | auto* storage_impl = t.storage().unsafeGetStorageImpl(); |
1792 | TORCH_CHECK( |
1793 | storage_impl->data() == nullptr || |
1794 | (planner_ && |
1795 | planner_->isManagedStorageImpl(storage_impl)), |
1796 | error_msg); |
1797 | } |
1798 | } |
1799 | } |
1800 | } else { |
1801 | // check for outputs |
1802 | if (output_returned) { |
1803 | TORCH_CHECK(ival->isNone(), error_msg); |
1804 | } |
1805 | } |
1806 | } |
1807 | auto* metadata = pnode.metadata(); |
1808 | if (recurse_on_sub_blocks && metadata) { |
1809 | auto& block_runners = metadata->block_runners(); |
1810 | for (auto& block_runner : block_runners) { |
1811 | block_runner.check_for_memory_leak( |
1812 | output_returned, recurse_on_sub_blocks); |
1813 | } |
1814 | } |
1815 | } |
  VLOG(1) << "Finished checking for memory leak";
1817 | return true; |
1818 | } |
1819 | |
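// Frees the buffer backing managed output tensors. When manage_output_tensors
// is disabled this only verifies that no output buffer was allocated.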
1820 | void BlockRunner::deallocateOutputTensors() { |
1821 | if (!static_module_.opts().manage_output_tensors) { |
1822 | TORCH_CHECK( |
1823 | !planner_ || planner_->numOutputBufferBytes() == 0, |
1824 | "manage_output_tensors is disabled, but output tensor buffer is not empty." ); |
1825 | return; |
1826 | } |
1827 | if (planner_) { |
1828 | planner_->deallocateOutputTensors(); |
1829 | DCHECK(checkOutputTensorMemoryLeaks()); |
1830 | } |
1831 | } |
1832 | |
1833 | bool BlockRunner::checkOutputTensorMemoryLeaks() { |
1834 | if (!static_module_.opts().manage_output_tensors || !planner_) { |
1835 | return true; |
1836 | } |
1837 | for (const auto n : c10::irange(nodes_.size())) { |
1838 | auto& pnode = nodes_[n]; |
1839 | for (const auto i : c10::irange(pnode.num_outputs())) { |
1840 | const IValue* ival = &pnode.Output(i); |
1841 | const Value* val = pnode.node()->output(i); |
1842 | if (!isManagedOutputTensorValue(val) || !ival->isTensor()) { |
1843 | // ival can not be a tensor if it's being managed by ops like |
1844 | // to_maybe_copy_out; see ReplaceWithMaybeCopy for details. |
1845 | continue; |
1846 | } |
1847 | const auto& t = ival->toTensor(); |
1848 | if (t.defined()) { |
1849 | auto* storage_impl = t.storage().unsafeGetStorageImpl(); |
1850 | const std::string error_msg = "Output " + c10::to_string(i) + ", %" + |
1851 | val->debugName() + " of node " + c10::to_string(n) + |
1852 | " was not cleaned up" ; |
1853 | TORCH_CHECK(storage_impl->data() == nullptr, error_msg); |
1854 | } |
1855 | } |
1856 | } |
  VLOG(1) << "Finished checking for memory leak from output tensors";
1858 | return true; |
1859 | } |
1860 | |
1861 | bool BlockRunner::isManagedOutputTensor(const IValue& ivalue) const { |
1862 | return planner_ && planner_->isManagedOutputTensor(ivalue); |
1863 | } |
1864 | |
1865 | bool BlockRunner::isManagedOutputTensorValue(const Value* value) const { |
1866 | // It's possible that manage_output_tensors_ was disabled after initializing |
1867 | // managed_output_tensor_values, so we have to check that flag here. |
1868 | if (!planner_ || !manage_output_tensors_enabled_) { |
1869 | return false; |
1870 | } |
1871 | const auto& managed_outputs = block_info_.managed_output_tensor_values(); |
1872 | return managed_outputs.find(value) != managed_outputs.end(); |
1873 | } |
1874 | |
1875 | void BlockRunner::disableManageOutputTensors() { |
1876 | if (!manage_output_tensors_enabled_) { |
1877 | return; |
1878 | } |
1879 | manage_output_tensors_enabled_ = false; |
1880 | if (!planner_) { |
1881 | return; |
1882 | } |
1883 | // Reset all IValues and destruct planner_ so that it can be reconstructed in |
1884 | // the next run. |
1885 | for (auto& n : nodes_) { |
1886 | for (const auto i : c10::irange(n.outputs().size())) { |
1887 | n.Output(i) = IValue(); |
1888 | } |
1889 | } |
1890 | planner_.reset(); |
1891 | } |
1892 | |
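// Picks the execution strategy for `node`, in priority order: an out variant
// (when enabled and available), then a native implementation, and finally a
// fallback that packs the inputs onto an IValue stack and invokes the node's
// regular JIT operator.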
1893 | ProcessedFunction::ProcessedFunction( |
1894 | Node* node, |
1895 | bool enable_out_variant, |
1896 | bool check_memory_overlap) |
1897 | : check_memory_overlap_(check_memory_overlap), |
1898 | num_outputs_(node->outputs().size()) { |
1899 | if (enable_out_variant) { |
1900 | f_ = getOutOfPlaceOperation(node); |
1901 | if (f_) { |
1902 | kind_ = ProcessedFunction::Kind::kOutVariant; |
1903 | // do not check memory overlap for out variants |
1904 | check_memory_overlap_ = false; |
1905 | VLOG(1) << "Switch to out variant for node: " << PrintNode(node); |
1906 | return; |
1907 | } |
1908 | } |
1909 | { |
1910 | f_ = getNativeOperation(node); |
1911 | if (f_) { |
1912 | kind_ = ProcessedFunction::Kind::kNativeFunction; |
1913 | #ifdef NDEBUG |
1914 | // skip this check in opt mode because these ops are better vetted |
1915 | check_memory_overlap_ = false; |
1916 | #endif |
1917 | VLOG(1) << "Switch to native impl for node: " << PrintNode(node); |
1918 | return; |
1919 | } |
1920 | } |
1921 | { |
1922 | const Operator& op = node->getOperator(); |
1923 | f_ = [node_op = op.getOperation(node), |
1924 | has_var_args = hasVarArgs(node)](ProcessedNode* pnode) mutable { |
1925 | std::vector<IValue> stack; |
1926 | const size_t size = pnode->num_inputs(); |
1927 | stack.reserve(size + has_var_args); |
1928 | for (const auto i : c10::irange(size)) { |
1929 | stack.emplace_back(pnode->Input(i)); |
1930 | } |
1931 | // Need to store the number of inputs in stack for variadic ops. |
1932 | if (has_var_args) { |
1933 | stack.emplace_back(static_cast<int>(size)); |
1934 | } |
1935 | node_op(stack); |
1936 | TORCH_DCHECK_EQ(stack.size(), pnode->num_outputs()); |
1937 | for (const auto i : c10::irange(pnode->num_outputs())) { |
1938 | pnode->Output(i) = std::move(stack[i]); |
1939 | } |
1940 | }; |
1941 | kind_ = ProcessedFunction::Kind::kInterpreterFallback; |
1942 | VLOG(1) << "Fallback interpreter for node: " << PrintNode(node); |
1943 | } |
1944 | } |
1945 | |
1946 | StaticNodeInfo::StaticNodeInfo( |
1947 | Node* node, |
1948 | ProcessedFunction* fn, |
1949 | ProcessedNodeInputs inputs, |
1950 | uint16_t outputs_offset) |
1951 | : node_(node), |
1952 | fn_(fn), |
1953 | inputs_(std::move(inputs)), |
1954 | outputs_offset_(outputs_offset) { |
1955 | TORCH_CHECK(num_outputs() == node->outputs().size()); |
1956 | } |
1957 | |
1958 | std::vector<IValue> ProcessedNode::inputs_ivalue_vec() const { |
1959 | std::vector<IValue> result; |
1960 | result.reserve(inputs_.size()); |
1961 | for (const auto idx : c10::irange(num_inputs())) { |
1962 | result.emplace_back(Input(idx)); |
1963 | } |
1964 | return result; |
1965 | } |
1966 | |
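// Executes this node's ProcessedFunction, wrapping the call in a
// RecordFunction guard when per-op profiling callbacks are registered. In
// debug builds the memory overlap check runs afterwards; it is enforced
// unless FLAGS_static_runtime_disable_debug_memory_overlap_check is set.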
1967 | void ProcessedNode::run() { |
1968 | #ifndef PYTORCH_DISABLE_PER_OP_PROFILING |
1969 | auto step_callbacks = |
1970 | at::getStepCallbacksUnlessEmpty(at::RecordScope::STATIC_RUNTIME_OP); |
1971 | if (C10_UNLIKELY(step_callbacks.has_value())) { |
1972 | at::RecordFunction guard(std::move(*step_callbacks)); |
1973 | TORCH_INTERNAL_ASSERT_DEBUG_ONLY(guard.isActive()); |
1974 | if (guard.needsInputs()) { |
1975 | const auto inputs = inputs_ivalue_vec(); |
1976 | guard.before( |
1977 | get_op_name(), |
1978 | c10::ArrayRef<const IValue>(inputs.data(), inputs.size())); |
1979 | } else { |
1980 | guard.before(get_op_name()); |
1981 | } |
1982 | if (has_out_variant()) { |
1983 | guard._setStaticRuntimeOutVariant(); |
1984 | } |
1985 | |
1986 | fn_->run(this); |
1987 | } else { |
1988 | fn_->run(this); |
1989 | } |
1990 | #else |
1991 | fn_->run(this); |
1992 | #endif |
1993 | #ifndef NDEBUG |
1994 | if (FLAGS_static_runtime_disable_debug_memory_overlap_check) { |
1995 | // run check but do not enforce |
1996 | verify_no_memory_overlap(); |
1997 | } else { |
1998 | DCHECK(verify_no_memory_overlap()); |
1999 | } |
2000 | #endif |
2001 | } |
2002 | |
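// Returns false only when ATen reports a full or partial overlap between the
// two tensors' memory; a TooHard status is logged but treated as no overlap.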
2003 | static bool checkNoMemoryOverlap(const at::Tensor& a, const at::Tensor& b) { |
2004 | at::MemOverlapStatus status = at::get_overlap_status(a, b); |
2005 | if (status == at::MemOverlapStatus::Full || |
2006 | status == at::MemOverlapStatus::Partial) { |
2007 | return false; |
2008 | } |
2009 | if (status == at::MemOverlapStatus::TooHard) { |
    VLOG(1) << "Detected TOO_HARD memory overlap status";
2011 | } |
2012 | return true; |
2013 | } |
2014 | |
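// Debug check that this node's outputs do not overlap each other or its
// inputs. A small set of ops that intentionally alias or forward their inputs
// (e.g. static_runtime::select_tensor) is skipped unless `force_check` is set.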
2015 | bool ProcessedNode::verify_no_memory_overlap(bool force_check) const { |
2016 | const static std::array<c10::Symbol, 7> special_case_ops = { |
      fromQualString("prim::TypeCheck"),
      fromQualString("prim::IfThenElse"),
      fromQualString("static_runtime::select_tensor"),
      fromQualString("static_runtime::VarTupleUnpack"),
      fromQualString("static_runtime::dict_unpack"),
      fromQualString("static_runtime::fused_split_and_squeeze"),
      fromQualString("static_runtime::create_owned_ref")};
2024 | if (!force_check && |
2025 | std::find( |
2026 | begin(special_case_ops), end(special_case_ops), node()->kind()) != |
2027 | end(special_case_ops)) { |
2028 | return true; |
2029 | } |
2030 | |
2031 | return verify_outputs_dont_overlap_each_other() && |
2032 | verify_inputs_dont_overlap_outputs(force_check); |
2033 | } |
2034 | |
2035 | bool ProcessedNode::verify_outputs_dont_overlap_each_other() const { |
2036 | for (const auto i : c10::irange(num_outputs())) { |
2037 | if (!Output(i).isTensor()) { |
2038 | continue; |
2039 | } |
2040 | const auto& out0_t = Output(i).toTensor(); |
2041 | for (const auto j : c10::irange(i + 1, num_outputs())) { |
2042 | if (!Output(j).isTensor()) { |
2043 | continue; |
2044 | } |
2045 | const auto& out1_t = Output(j).toTensor(); |
2046 | if (!checkNoMemoryOverlap(out0_t, out1_t)) { |
2047 | LOG(INFO) << "Node output " << i << " overlaps with output " << j |
2048 | << ", " << PrintNode(node_); |
2049 | return false; |
2050 | } |
2051 | } |
2052 | } |
2053 | return true; |
2054 | } |
2055 | |
2056 | bool ProcessedNode::verify_inputs_dont_overlap_outputs(bool force_check) const { |
2057 | auto schema = node()->maybeSchema(); |
2058 | // skip memory overlap check for mutable or view ops with only one output |
2059 | bool skip_check = !schema || |
2060 | ((schema->is_mutable() || !fn_->checkMemoryOverlap()) && |
2061 | num_outputs() == 1); |
2062 | if (!schema || (!force_check && skip_check)) { |
2063 | if (!schema) { |
      VLOG(2) << "Detected that op schema is null";
2065 | return true; |
2066 | } |
2067 | VLOG(2) << "schema->is_mutable: " << schema->is_mutable() |
2068 | << ", fn_->checkMemoryOverlap: " << fn_->checkMemoryOverlap() |
2069 | << ", num_outputs_: " << num_outputs(); |
2070 | return true; |
2071 | } |
2072 | |
2073 | for (const auto i : c10::irange(inputs_.size())) { |
2074 | const IValue* in = &Input(i); |
2075 | if (!in->isTensor()) { |
2076 | continue; |
2077 | } |
2078 | const auto& in_t = in->toTensor(); |
2079 | for (const auto j : c10::irange(num_outputs())) { |
2080 | const IValue& out = Output(j); |
2081 | if (!out.isTensor()) { |
2082 | continue; |
2083 | } |
2084 | const auto& out_t = out.toTensor(); |
2085 | if (!checkNoMemoryOverlap(in_t, out_t)) { |
2086 | LOG(INFO) << "Node input " << i << " overlaps with output " << j << ", " |
2087 | << PrintNode(node_); |
2088 | LOG(INFO) << *schema; |
2089 | return false; |
2090 | } |
2091 | } |
2092 | } |
2093 | return true; |
2094 | } |
2095 | |
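// If `output_ival` shares memory with `input`, replaces it with a clone and
// marks the node as having had an output overlap. Returns true when a
// correction was made.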
2096 | bool ProcessedNode::check_and_correct_overlap_with( |
2097 | const at::Tensor& input, |
2098 | c10::IValue& output_ival) { |
2099 | auto& tensor = output_ival.toTensor(); |
2100 | if (!checkNoMemoryOverlap(input, tensor)) { |
2101 | DLOG(INFO) << "Detected alias for node: " << PrintNode(node()); |
2102 | output_ival = at::native::clone(tensor, c10::nullopt); |
2103 | set_outputs_memory_overlap_detected(); |
2104 | return true; |
2105 | } |
2106 | return false; |
2107 | } |
2108 | |
2109 | void ProcessedNode::verify_and_correct_memory_overlap() { |
2110 | for (const auto i : c10::irange(inputs_.size())) { |
2111 | const IValue& in = Input(i); |
2112 | if (!in.isTensor()) { |
2113 | continue; |
2114 | } |
2115 | const auto& in_t = in.toTensor(); |
2116 | for (const auto j : c10::irange(num_outputs())) { |
2117 | auto& output = Output(j); |
2118 | if (output.isTensor()) { |
2119 | check_and_correct_overlap_with(in_t, output); |
2120 | } else if (output.isTensorList()) { |
2121 | auto tensors = output.toListRef(); |
2122 | for (const auto& ival : tensors) { |
2123 | // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) |
2124 | check_and_correct_overlap_with(in_t, const_cast<c10::IValue&>(ival)); |
2125 | } |
2126 | #ifdef FBCODE_CAFFE2 |
2127 | if (outputs_memory_overlap_detected()) { |
2128 | LOG_EVERY_MS(WARNING, 60000) |
2129 | << "Detected alias for node: " << PrintNode(node()); |
2130 | } |
2131 | #endif |
2132 | } |
2133 | } |
2134 | } |
2135 | } |
2136 | |
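// Copies the module's constants into the shared value buffer and builds the
// root BlockRunner. The async task launcher defaults to the inter-op thread
// pool (at::launch) and may be overridden per call through runAsync().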
2137 | StaticRuntime::StaticRuntime(const StaticModule& sm) |
2138 | : values_(sm.value_buffer_size()) { |
2139 | std::copy(sm.constants().begin(), sm.constants().end(), values_.data()); |
2140 | // default task launcher set to inter-op thread pool |
2141 | async_task_launcher_ = at::launch; |
2142 | block_ = std::make_unique<BlockRunner>( |
2143 | sm, |
2144 | values_.data(), |
2145 | sm.root_block(), |
2146 | &async_task_launcher_, |
2147 | true /*is_root_block*/); |
2148 | } |
2149 | |
2150 | c10::IValue StaticRuntime::operator()( |
2151 | const std::vector<c10::IValue>& args, |
2152 | const KeywordArgs& kwargs) { |
2153 | return (*block_)(args, kwargs); |
2154 | } |
2155 | |
2156 | c10::IValue StaticRuntime::operator()( |
2157 | std::vector<c10::IValue>&& args, |
2158 | const KeywordArgs& kwargs) { |
2159 | return (*block_)(std::move(args), kwargs); |
2160 | } |
2161 | |
2162 | c10::intrusive_ptr<c10::ivalue::Future> StaticRuntime::runAsync( |
2163 | const std::vector<c10::IValue>& args, |
2164 | const KeywordArgs& kwargs, |
2165 | torch::jit::TaskLauncher taskLauncher) { |
2166 | async_task_launcher_ = std::move(taskLauncher); |
2167 | return block_->runAsync(args, kwargs); |
2168 | } |
2169 | |
2170 | c10::intrusive_ptr<c10::ivalue::Future> StaticRuntime::runAsync( |
2171 | std::vector<c10::IValue>&& args, |
2172 | const KeywordArgs& kwargs, |
2173 | torch::jit::TaskLauncher taskLauncher) { |
2174 | async_task_launcher_ = std::move(taskLauncher); |
2175 | return block_->runAsync(std::move(args), kwargs); |
2176 | } |
2177 | |
2178 | bool StaticRuntime::check_for_memory_leak(bool output_returned) { |
2179 | return block_->check_for_memory_leak( |
2180 | output_returned, /* recurse_on_sub_blocks */ true); |
2181 | } |
2182 | |
2183 | bool StaticRuntime::checkOutputTensorMemoryLeaks() { |
2184 | return block_->checkOutputTensorMemoryLeaks(); |
2185 | } |
2186 | |
2187 | void StaticRuntime::deallocateOutputTensors() { |
2188 | block_->deallocateOutputTensors(); |
2189 | } |
2190 | |
2191 | bool StaticRuntime::isManagedOutputTensor(const IValue& ivalue) const { |
2192 | return block_->isManagedOutputTensor(ivalue); |
2193 | } |
2194 | |
2195 | void StaticRuntime::disableManageOutputTensors() { |
2196 | block_->disableManageOutputTensors(); |
2197 | } |
2198 | |
2199 | const MemoryPlanner* StaticRuntime::get_memory_planner() const { |
2200 | return block_->get_memory_planner(); |
2201 | } |
2202 | |
2203 | } // namespace jit |
2204 | } // namespace torch |
2205 | |