1 | |
2 | #include <executor.h> |
3 | |
4 | #include <codegen.h> |
5 | #include <executor_kernel_arg.h> |
6 | #include <executor_utils.h> |
7 | #include <instrumentation.h> |
8 | #include <ir_all_nodes.h> |
9 | #include <ir_utils.h> |
10 | #include <iter_visitor.h> |
11 | #include <kernel_ir.h> |
12 | #include <lower_bank_conflict.h> |
13 | #include <utils.h> |
14 | |
15 | #include <ATen/core/LegacyTypeDispatch.h> |
16 | #include <ATen/cuda/CUDAContext.h> |
17 | #include <ATen/cuda/llvm_jit_strings.h> |
18 | #include <ATen/cuda/nvrtc_stub/ATenNVRTC.h> |
19 | #include <ATen/native/cuda/jit_utils.h> |
20 | #include <c10/core/DeviceGuard.h> |
21 | #include <c10/cuda/CUDAFunctions.h> |
22 | #include <c10/cuda/CUDAStream.h> |
23 | #include <c10/util/irange.h> |
24 | |
25 | #include <cmath> |
26 | #include <fstream> |
27 | |
28 | namespace torch { |
29 | namespace jit { |
30 | namespace fuser { |
31 | namespace cuda { |
32 | |
33 | int FusionExecutor::fusion_id_counter_ = 0; // NOLINT |
34 | |
35 | bool fill_allocation_with_nan_ = false; |
36 | |
37 | bool shouldFillAllocationWithNan() { |
38 | return fill_allocation_with_nan_; |
39 | } |
40 | |
41 | void setFillAllocationWithNan(bool value) { |
42 | fill_allocation_with_nan_ = value; |
43 | } |
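// Note: this flag is primarily a debugging aid. When enabled (e.g. a test
// calling setFillAllocationWithNan(true)), freshly allocated outputs and
// intermediate buffers are filled with NaN (or a max-value sentinel for
// integral types, see fillTensorWithNan below), so reads of uninitialized
// memory show up as obviously wrong values instead of silently reusing
// stale data.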
44 | |
45 | namespace { |
46 | |
47 | static const char* defineIndexMode(KernelIndexMode index_mode) { |
48 | switch (index_mode) { |
49 | case KernelIndexMode::INT32: |
50 | return "typedef int nvfuser_index_t;\n" ; |
51 | case KernelIndexMode::INT64: |
52 | return "typedef int64_t nvfuser_index_t;\n" ; |
53 | default: |
54 | break; |
55 | } |
56 | |
TORCH_INTERNAL_ASSERT(false, "unknown indexing mode");
58 | return "" ; |
59 | } |
60 | |
61 | static const char* defineIntegerTypes() { |
62 | return R"( |
63 | typedef signed char int8_t; |
64 | typedef unsigned char uint8_t; |
65 | typedef short int int16_t; |
66 | typedef unsigned short int uint16_t; |
67 | typedef int int32_t; |
68 | typedef unsigned int uint32_t; |
69 | typedef long long int int64_t; |
70 | typedef unsigned long long int uint64_t; |
71 | )" ; |
72 | } |
73 | |
74 | static const std::string& defineComplexTypes() { |
75 | static std::string result = std::string(R"ESCAPE( |
76 | #define POS_INFINITY __int_as_float(0x7f800000) |
77 | #define INFINITY POS_INFINITY |
78 | #define NEG_INFINITY __int_as_float(0xff800000) |
79 | #define NAN __int_as_float(0x7fffffff) |
80 | )ESCAPE" ) + |
81 | at::cuda::get_traits_string() + at::cuda::get_complex_body_string() + |
82 | at::cuda::get_cmath_string() + at::cuda::get_complex_math_string(); |
83 | return result; |
84 | } |
85 | |
86 | } // namespace |
87 | |
88 | std::string FusionExecutor::getStructuredCode(const std::string& kernel) { |
// Assemble the full CUDA source string for this kernel.
90 | std::string code = "" ; |
91 | #ifdef USE_ROCM |
92 | #if ROCM_VERSION < 40200 |
93 | code += std::string("#include <hip/hip_runtime.h>\n" ) + |
94 | std::string("#include <hip/hip_bf16.h>\n" ) + |
95 | std::string("#include <hip/hip_fp16.h>\n" ); |
96 | #endif |
97 | code += std::string("#pragma clang force_cuda_host_device begin\n" ); |
98 | #endif |
99 | code += std::string("namespace " ) + FusionExecutor::kernelNamespace() + |
100 | " {\n" + defineIntegerTypes() + defineIndexMode(options_.index_mode) + |
101 | defineComplexTypes() + executor_utils::kernelPreamble() + kernel + "}\n" ; |
102 | #ifdef USE_ROCM |
103 | code += std::string("#pragma clang force_cuda_host_device end\n" ); |
104 | #endif |
105 | |
106 | if (isDebugDumpEnabled(DebugDumpOption::CudaKernel)) { |
107 | std::cout << "\n======= Codegen output for kernel: " << kernelName() |
108 | << " =======\n\n" |
109 | << kernel << "\n======================================\n\n" ; |
110 | } else if (isDebugDumpEnabled(DebugDumpOption::CudaFull)) { |
111 | std::cout << "\n======= Codegen output for kernel: " << kernelName() |
112 | << " =======\n\n" |
113 | << code << "\n======================================\n\n" ; |
114 | } |
115 | if (isDebugDumpEnabled(DebugDumpOption::CudaToFile) || |
116 | isDebugDumpEnabled(DebugDumpOption::DebugInfo)) { |
117 | std::stringstream file_name; |
118 | file_name << "__tmp_kernel" << fusion_id_ << ".cu" ; |
119 | std::cout << "PRINTING: " << file_name.str() << std::endl; |
120 | std::ofstream out(file_name.str()); |
121 | out << code << std::endl; |
122 | out.close(); |
123 | } |
124 | |
125 | return code; |
126 | } |
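// For reference, the assembled translation unit roughly looks like the
// following sketch (illustrative; assuming kernelNamespace() returns
// "CudaCodeGen" and the kernel is named "kernel1"):
//
//   namespace CudaCodeGen {
//   typedef signed char int8_t; ...          // defineIntegerTypes()
//   typedef int nvfuser_index_t;             // defineIndexMode(...)
//   /* complex number & math helpers */      // defineComplexTypes()
//   /* runtime preamble */                   // executor_utils::kernelPreamble()
//   __global__ void kernel1(...) { ... }     // the generated kernel code
//   } // namespace CudaCodeGen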
127 | |
// TODO: come up with a more user-friendly interface
129 | void FusionExecutor::debugCompileFusionFromStr( |
130 | Fusion* fusion, |
131 | const std::string& code, |
132 | const std::string& name, |
133 | int id, |
134 | CompileOptions options) { |
135 | options_ = options; |
136 | |
137 | if (isDebugDumpEnabled(DebugDumpOption::FusionIr)) { |
138 | fusion->print(); |
139 | } else if (isDebugDumpEnabled(DebugDumpOption::FusionIrMath)) { |
140 | fusion->printMath(); |
141 | } |
142 | |
143 | if (isDebugDumpEnabled(DebugDumpOption::CudaFull)) { |
144 | std::cout << "\n==== codegen output for kernel: " << kernelName() |
145 | << " ====" << std::endl |
146 | << code << std::endl |
147 | << "======================================\n" |
148 | << std::endl; |
149 | } |
150 | |
151 | lowered_ = std::make_unique<GpuLower>(fusion); |
152 | const auto kernel = lowered_->kernel(); |
153 | fusion_ = lowered_->kernel(); |
154 | |
155 | fusion_id_ = id; |
156 | setUsedTVs(); |
157 | |
158 | if (isDebugDumpEnabled(DebugDumpOption::KernelIr)) { |
159 | kernel->print(); |
160 | } |
161 | |
162 | const auto& kernel_summary = kernel->summary(); |
163 | |
164 | if (!kernel_summary.static_smem_allocations.empty()) { |
165 | kir::ExpressionEvaluator static_evaluator; |
166 | // NOLINTNEXTLINE(cppcoreguidelines-init-variables) |
167 | const auto static_smem_size = computeSharedMemory( |
168 | static_evaluator, kernel_summary.static_smem_allocations); |
169 | TORCH_INTERNAL_ASSERT( |
170 | static_smem_size < max_static_smem_, |
171 | "The static shared memory allocation is larger than available memory." ); |
172 | } |
173 | |
174 | std::tie(compiled_kernel_, last_compiler_log_) = |
175 | executor_utils::nvrtcCompile(code, name, fusion_id_); |
176 | TORCH_INTERNAL_ASSERT( |
fusion_id_ > 0, "assigning a fusion_id_ <= 0 is not accepted.");
178 | } |
179 | |
180 | void FusionExecutor::compileFusion( |
181 | Fusion* fusion, |
182 | const KernelArgumentHolder& args, |
183 | const LaunchParams& launch_constraints) { |
184 | FUSER_PERF_SCOPE("compileFusion" ); |
185 | |
186 | TORCH_INTERNAL_ASSERT( |
187 | !fusion->outputs().empty(), "No output found for this kernel, aborting." ); |
188 | |
189 | for (auto out : fusion->outputs()) { |
190 | TORCH_INTERNAL_ASSERT( |
191 | out->getValType() == ValType::TensorView, |
192 | "Output types from fusions that are not tensors are not supported at this point." ); |
193 | |
194 | const auto maybe_rfactor_domain = |
195 | out->as<TensorView>()->getMaybeRFactorDomain(); |
// Walk through the outputs to see if their shapes depend on non-tensor
// inputs. In that case output allocation caching has to be disabled,
// since the caching id only looks at tensor shapes.
// See issue https://github.com/csarofeen/pytorch/issues/2002
200 | std::vector<Val*> output_extents; |
201 | for (const auto id : maybe_rfactor_domain) { |
202 | Val* extent = nullptr; |
203 | if (id->isReduction() || id->isStride()) { |
204 | continue; |
205 | } else if (id->isBroadcast() && id->hasExpandedExtent()) { |
206 | extent = id->expandedExtent(); |
207 | } else { |
208 | extent = id->extent(); |
209 | } |
210 | output_extents.emplace_back(extent); |
211 | } |
212 | auto dependencies = InputsOf::outputs(fusion, output_extents); |
213 | if (std::any_of(dependencies.begin(), dependencies.end(), [](Val* val) { |
214 | return val->isFusionInput(); |
215 | })) { |
// TODO: the parameter cache is too big a hammer here. We should consider
// separating the caching logic for output sizes & launch params, since an
// output size dependency should only invalidate the output sizes.
219 | disable_parameter_cache_ = true; |
220 | break; |
221 | } |
222 | } |
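// Illustrative example of the check above: if a fusion takes a scalar input
// `s` and produces an output expanded to extent `s`, that output extent is a
// fusion input, so the parameter cache is disabled and output sizes are
// re-evaluated on every run instead of being reused from the cache.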
223 | |
224 | if (isDebugDumpEnabled(DebugDumpOption::FusionIr)) { |
225 | fusion->print(); |
226 | } else if (isDebugDumpEnabled(DebugDumpOption::FusionIrMath)) { |
227 | fusion->printMath(); |
228 | } |
229 | |
230 | // TODO: refactor the options_ passed through |
231 | options_.device = c10::Device(c10::DeviceType::CUDA, args.getDeviceIndex()); |
232 | options_.index_mode = args.getIndexMode(); |
233 | c10::DeviceGuard dg(options_.device); |
234 | |
235 | TORCH_INTERNAL_ASSERT( |
options_.device.is_cuda(), "The device provided to the CUDA fuser must be a CUDA device.");
237 | auto properties = at::cuda::getDeviceProperties(options_.device.index()); |
238 | configured_device_smem_ = properties->sharedMemPerBlock; |
239 | #ifndef USE_ROCM |
240 | device_smem_limit_ = properties->sharedMemPerBlockOptin; |
241 | #else |
242 | // don't know if rocm supports opt-in shared memory reconfiguration |
243 | device_smem_limit_ = properties->sharedMemPerBlock; |
244 | #endif |
245 | warp_size_ = properties->warpSize; |
246 | |
247 | lowered_ = std::make_unique<GpuLower>( |
248 | fusion, |
249 | options_.index_mode == KernelIndexMode::INT64 ? DataType::Int |
250 | : DataType::Int32); |
251 | const auto kernel = lowered_->kernel(); |
252 | fusion_ = lowered_->kernel()->as<Fusion>(); |
253 | |
254 | fusion_id_ = ++fusion_id_counter_; |
255 | setUsedTVs(); |
256 | |
257 | if (isDebugDumpEnabled(DebugDumpOption::KernelIr)) { |
258 | kernel->print(); |
259 | } |
260 | |
261 | if (isDebugDumpEnabled(DebugDumpOption::BankConflictInfo)) { |
262 | auto bank_conflict_info = getBankConflictInfo(kernel); |
263 | if (bank_conflict_info.empty()) { |
264 | std::cout << "===== No bank confliction =====" << std::endl; |
265 | } else { |
266 | std::cout << "======= Bank confliction =======" << std::endl; |
267 | for (auto info : bank_conflict_info) { |
268 | std::cout << "Expr: " << info.first->toString() << std::endl; |
269 | auto conflict = info.second; |
270 | if (conflict.first > 1) { |
271 | std::cout << "input conflict: " << conflict.first << " way, " ; |
272 | } |
273 | if (conflict.second > 1) { |
274 | std::cout << "output conflict: " << conflict.second << " way" ; |
275 | } |
276 | std::cout << std::endl; |
277 | } |
278 | std::cout << "================================" << std::endl; |
279 | } |
280 | } |
281 | |
282 | kernel_code_ = codegen::generateCudaKernel(kernel, kernelName()); |
283 | const auto structured_code = getStructuredCode(kernel_code_); |
284 | |
285 | const auto& kernel_summary = kernel->summary(); |
286 | |
// We currently shouldn't be allocating any shared memory tensors
// statically, but this path is kept in case it is needed in later
// development.
290 | if (!kernel_summary.static_smem_allocations.empty()) { |
291 | kir::ExpressionEvaluator static_evaluator; |
292 | // NOLINTNEXTLINE(cppcoreguidelines-init-variables) |
293 | const auto static_smem_size = computeSharedMemory( |
294 | static_evaluator, kernel_summary.static_smem_allocations); |
295 | TORCH_INTERNAL_ASSERT( |
296 | static_smem_size < max_static_smem_, |
297 | "The static shared memory allocation is larger than available memory." ); |
298 | } |
299 | |
300 | if (kernel_summary.has_dynamic_local_memory_allocations) { |
301 | std::stringstream ss; |
302 | ss << "Allocations must be based on constant integers for local memory. However, found: " ; |
303 | for (auto alloc : kernel_summary.dynamic_lmem_allocations) { |
304 | ss << alloc->buffer()->toString() << ", " ; |
305 | } |
306 | ss << " have dynamic allocations but are placed in local memory." ; |
307 | TORCH_INTERNAL_ASSERT(false, ss.str()); |
308 | } |
309 | |
310 | // TODO: pass block_size here; |
311 | c10::optional<int> block_size = c10::nullopt; |
312 | if (!args.empty()) { |
313 | auto expr_eval = executor_utils::bindKernelInputs(args, kernel); |
314 | auto launch_params = |
315 | computeLaunchParams(launch_constraints, expr_eval, warp_size_); |
316 | block_size = launch_params.nThreads(); |
317 | TORCH_INTERNAL_ASSERT( |
block_size > 0, "inferred launch param block size must be > 0");
319 | } |
320 | |
321 | // TODO: high water mark should be computed via occupancy API after |
322 | // compilation. |
323 | |
// When no args are provided for compilation, the high water mark is set to
// 1, so the resulting kernel just gets ditched at the first run - not
// great. We should have better heuristics.
327 | block_size_high_water_mark = std::max<int64_t>( |
328 | (block_size.has_value() ? block_size.value() : 1), |
329 | block_size_high_water_mark); |
330 | std::tie(compiled_kernel_, last_compiler_log_) = executor_utils::nvrtcCompile( |
331 | structured_code, |
332 | (kernelNamespace() + "::" + kernelName()).c_str(), |
333 | fusion_id_, |
334 | block_size); |
335 | TORCH_INTERNAL_ASSERT( |
336 | fusion_id_ > 0, "failed to assign a fusion_id_ after compilation." ); |
337 | |
338 | #ifndef USE_ROCM |
339 | // The driver API call requires an int argument. |
340 | int max_dynamic_smem = 0; |
341 | AT_CUDA_DRIVER_CHECK(at::globalContext().getNVRTC().cuFuncGetAttribute( |
342 | &max_dynamic_smem, |
343 | CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, |
344 | compiled_kernel_.function)); |
345 | maybe_available_dynamic_smem_ = max_dynamic_smem; |
346 | #endif |
347 | } |
348 | |
349 | namespace { |
350 | |
351 | void fillTensorWithNan(at::Tensor& t) { |
352 | switch (t.scalar_type()) { |
353 | case at::ScalarType::Byte: |
354 | t.fill_(0xFF); |
355 | break; |
356 | case at::ScalarType::Char: |
357 | t.fill_(0x7F); |
358 | break; |
359 | case at::ScalarType::Short: |
360 | t.fill_(0x7FFF); |
361 | break; |
362 | case at::ScalarType::Int: |
363 | t.fill_(0x7FFFFFFF); |
364 | break; |
365 | case at::ScalarType::Long: |
366 | t.fill_(0x7FFFFFFFFFFFFFFFL); |
367 | break; |
368 | case at::ScalarType::Bool: |
369 | t.fill_(true); |
370 | break; |
371 | case at::ScalarType::Half: |
372 | case at::ScalarType::Float: |
373 | case at::ScalarType::Double: |
374 | case at::ScalarType::BFloat16: |
375 | t.fill_(std::nan("" )); |
376 | break; |
377 | case at::ScalarType::ComplexHalf: |
378 | case at::ScalarType::ComplexFloat: |
379 | case at::ScalarType::ComplexDouble: |
380 | t.fill_(c10::complex<double>(std::nan("" ), std::nan("" ))); |
381 | break; |
382 | default: |
383 | TORCH_INTERNAL_ASSERT(false, "Unknown dtype" ); |
384 | } |
385 | } |
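// Note on the integral cases above: integer types have no NaN, so the fill
// uses the maximum representable value of the type (e.g. 0x7FFFFFFF for Int)
// as an "obviously wrong" sentinel instead.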
386 | |
387 | at::Tensor inferAndAlloc( |
388 | const TensorView* tv, |
389 | const std::vector<Val*>& sizes, |
390 | kir::ExpressionEvaluator& expr_eval, |
391 | // Map from dim -> expanded size of TV if any expanded broadcast dimensions |
392 | // exist |
393 | std::unordered_map<int, Val*> expanded_map, |
394 | const CompileOptions& options, |
395 | bool zero_init = false) { |
396 | FUSER_PERF_SCOPE("inferAndAlloc" ); |
397 | |
398 | // NOLINTNEXTLINE(cppcoreguidelines-init-variables) |
399 | // Going to infer all the sizes of the TensorView |
400 | std::vector<int64_t> inferred_sizes; |
// expanded_sizes is at most the same length as inferred_sizes, as even a
// fully broadcast tensor could be expanded in every dimension.
403 | std::vector<int64_t> expanded_sizes; |
404 | bool expanded_dim = false; |
405 | for (const auto size : sizes) { |
406 | const auto inferred_val = expr_eval.evaluate(size); |
407 | TORCH_INTERNAL_ASSERT( |
408 | inferred_val.has_value(), |
409 | "Could not launch kernel as program could not infer " , |
410 | size->toString(), |
411 | "(" , |
412 | size->name(), |
413 | ") for the buffer " , |
414 | tv->toString()); |
415 | inferred_sizes.push_back(inferred_val->as<int64_t>()); |
416 | if (expanded_map.count(expanded_sizes.size())) { |
417 | auto expanded_size = expanded_map.at(expanded_sizes.size()); |
418 | const auto inferred_expanded_size = expr_eval.evaluate(expanded_size); |
419 | TORCH_INTERNAL_ASSERT( |
420 | inferred_expanded_size.has_value(), |
421 | "Could not launch kernel as program could not infer the expanded extent " , |
422 | expanded_size->toString(), |
423 | "(" , |
424 | expanded_size->name(), |
425 | ") for the buffer " , |
426 | tv->toString()); |
427 | if (inferred_val.value() != 1) { |
428 | TORCH_INTERNAL_ASSERT( |
429 | inferred_val.value() == inferred_expanded_size.value(), |
430 | "Attempted an expand on a non-broadcasted dimension," , |
431 | " but the expand doesn't match the dimensions size." ); |
432 | } else { |
433 | expanded_dim = true; |
434 | } |
435 | expanded_sizes.push_back(inferred_expanded_size->as<int64_t>()); |
436 | } else { |
437 | expanded_sizes.push_back(inferred_val->as<int64_t>()); |
438 | } |
439 | } |
440 | |
441 | const auto at_type = data_type_to_aten(tv->dtype()); |
442 | const auto tensor_options = |
443 | at::TensorOptions().dtype(at_type).device(options.device); |
444 | c10::IntArrayRef isizes(inferred_sizes); |
445 | |
446 | if (zero_init) { |
447 | auto zeros = at::zeros(isizes, tensor_options); |
448 | if (expanded_dim) { |
449 | return zeros.expand(expanded_sizes); |
450 | } |
451 | return zeros; |
452 | } else { |
453 | // Non Variable type guard for empty_cuda call |
454 | at::AutoDispatchBelowADInplaceOrView non_variable_type_mode; |
455 | auto empty = at::empty(isizes, tensor_options); |
456 | if (shouldFillAllocationWithNan()) { |
457 | fillTensorWithNan(empty); |
458 | } |
459 | if (expanded_dim) { |
460 | return empty.expand(expanded_sizes); |
461 | } |
462 | return empty; |
463 | } |
464 | } |
465 | |
466 | at::Tensor inferAndAllocOutput( |
467 | const TensorView* tv, |
468 | kir::ExpressionEvaluator& expr_eval, |
469 | const CompileOptions& options, |
470 | bool zero_init = false) { |
471 | const auto domain = tv->domain(); |
472 | const auto maybe_rfactor_domain = domain->hasRFactor() |
473 | ? domain->getRFactorDomain() |
474 | : domain->getRootDomain(); |
475 | |
476 | std::vector<Val*> sizes; |
477 | std::unordered_map<int, Val*> expand_map; |
478 | |
479 | for (const auto id : maybe_rfactor_domain) { |
480 | if (id->isReduction() || id->isStride()) { |
481 | continue; |
482 | } |
483 | sizes.push_back(id->extent()); |
484 | if (id->isBroadcast() && id->hasExpandedExtent()) { |
485 | expand_map[sizes.size() - 1] = id->expandedExtent(); |
486 | } |
487 | } |
488 | return inferAndAlloc(tv, sizes, expr_eval, expand_map, options, zero_init); |
489 | } |
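// Worked example (illustrative): for an output whose rfactor domain is
// [iS0{4}, bS1{1} expanded to 8], the loop above collects sizes = {4, 1} and
// expand_map = {1 -> 8}. inferAndAlloc then allocates a contiguous {4, 1}
// tensor and returns tensor.expand({4, 8}), so the expanded broadcast
// dimension gets stride 0 and no additional memory is allocated for it.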
490 | |
491 | } // namespace |
492 | |
493 | uint64_t FusionExecutor::computeSharedMemory( |
494 | kir::ExpressionEvaluator& expr_eval, |
495 | const std::vector<const kir::Allocate*>& buffers, |
496 | bool align_padding, |
497 | uint64_t total) { |
498 | FUSER_PERF_SCOPE("computeSharedMemory" ); |
499 | for (auto smem_alloc : buffers) { |
500 | // If this buffer aliases another buffer, |
501 | // then do not allocate memory for this buffer. |
502 | if (smem_alloc->alias() == nullptr) { |
503 | const auto inferred_val = expr_eval.evaluate(smem_alloc->size()); |
504 | if (inferred_val.has_value()) { |
505 | const uint64_t data_size = dataTypeSize(smem_alloc->buffer()->dtype()); |
506 | // Add padding to align dynamic shared memory |
507 | if (align_padding) { |
508 | #ifndef USE_ROCM |
509 | const int align_size = 16; // always align to 16B/128b. |
510 | #else |
511 | const int align_size = 8; // see codegen.cpp for HIP |
512 | #endif |
513 | total = ceilDiv(total, align_size) * align_size; |
514 | } |
515 | total += inferred_val->as<int64_t>() * data_size; |
516 | } else { |
517 | TORCH_INTERNAL_ASSERT( |
518 | false, |
519 | "Failed to evaluate the size " , |
520 | smem_alloc->size(), |
521 | " of shared memory buffer - T" , |
522 | smem_alloc->buffer()->name()); |
523 | } |
524 | } |
525 | } |
526 | return total; |
527 | } |
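// Worked example (illustrative): with align_padding = true on CUDA
// (align_size = 16) and total starting at 0, a __half buffer of 9 elements
// adds 18 bytes (total = 18); a following double buffer of 4 elements first
// pads total up to 32 and then adds 32 bytes, for a final total of 64 bytes
// of dynamic shared memory.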
528 | |
529 | LaunchParams FusionExecutor::computeLaunchParams( |
530 | const LaunchParams& launch_constraints, |
531 | kir::ExpressionEvaluator& expr_eval, |
532 | const int warp_size) { |
533 | FUSER_PERF_SCOPE("FusionExecutor::ComputeLaunchParams" ); |
534 | TORCH_INTERNAL_ASSERT(warp_size > 0, "WARP_SIZE should be larger than 0" ); |
535 | |
536 | LaunchParams launch_params; |
537 | |
538 | auto data_cache = compileTimeDataCache(); |
539 | |
540 | auto lower = lowered_.get(); |
541 | auto& used_tvs = getUsedTVs(); |
542 | auto parallel_binding_ids_entry = |
543 | executor_utils::caching::ExecutorCompileTimeEntry< |
544 | executor_utils::caching::ParallelBindingIterDomains>( |
545 | data_cache, [&used_tvs, &lower]() { |
546 | return std::make_unique<std::vector<IterDomain*>>( |
547 | executor_utils::getParallelBindingsIterDomains( |
548 | lower, used_tvs)); |
549 | }); |
550 | auto& parallel_binding_ids = parallel_binding_ids_entry.get(); |
551 | |
552 | auto parallel_iter_extent_entry = |
553 | executor_utils::caching::ExecutorCompileTimeEntry< |
554 | executor_utils::caching::ParallelIterExtentMap>( |
555 | data_cache, [¶llel_binding_ids]() { |
556 | return executor_utils::getParallelIterExtents(parallel_binding_ids); |
557 | }); |
558 | auto& parallel_iter_extents = parallel_iter_extent_entry.get(); |
559 | |
560 | auto simplified_parallel_iter_extent_entry = |
561 | executor_utils::caching::ExecutorCompileTimeEntry< |
562 | executor_utils::caching::SimplifiedParallelIterExtentMap>( |
563 | data_cache, [¶llel_binding_ids, &lower]() { |
564 | return executor_utils::getSimplifiedParallelIterExtents( |
565 | lower, parallel_binding_ids); |
566 | }); |
567 | auto& simplified_parallel_iter_extents = |
568 | simplified_parallel_iter_extent_entry.get(); |
569 | |
570 | auto warp_padded_parallel_entry = |
571 | executor_utils::caching::ExecutorCompileTimeEntry< |
572 | executor_utils::caching::WarpPaddedParallelExtents>( |
573 | data_cache, [¶llel_binding_ids, &lower]() { |
574 | return executor_utils::getWarpPaddedExtentsInfo( |
575 | lower->kernel(), parallel_binding_ids); |
576 | }); |
577 | auto& warp_padded_extent_set = |
578 | warp_padded_parallel_entry.get().warp_padded_extent_set; |
579 | auto& warp_padded_constant = |
580 | warp_padded_parallel_entry.get().warp_padded_constant; |
581 | |
582 | // TODO: Need to redesign this part a bit to |
583 | // find the right place to trigger evaluate |
584 | if (expr_eval.precomputedValues()) { |
585 | expr_eval.precomputedValues()->bindParallelExtents( |
586 | parallel_iter_extents, launch_constraints); |
587 | expr_eval.precomputedValues()->evaluate(); |
588 | } |
589 | |
// If any dimension was set in the launch constraints we need to run through
// the parallelized IterDomains and bind those values, or, if the values
// could be inferred, make sure the inference matches what was set.
593 | for (auto& entry : parallel_iter_extents) { |
594 | auto p_type = entry.first; |
595 | if (launch_constraints.hasDim(p_type)) { |
596 | auto parallel_extents = entry.second; |
597 | for (auto extent : parallel_extents) { |
598 | auto inferred_val = expr_eval.evaluate(extent); |
599 | if (inferred_val.has_value()) { |
600 | // This value could have been inferred, make sure it was set right. |
601 | bool valid = |
602 | inferred_val.value() == launch_constraints.getDim(p_type) || |
603 | launch_constraints.getRawVal(p_type) == -1; |
604 | if (!useFallback() && !valid) { |
605 | TORCH_WARN_ONCE( |
606 | "Cannot validate parallelization scheme, " |
607 | "this may be due to mixed broadcast axes that are parallelized." ); |
608 | } |
609 | } else if (!expr_eval.precomputedValues()) { |
610 | expr_eval.bind(extent, launch_constraints.getDim(p_type)); |
611 | } |
612 | if (!launch_params.hasDim(p_type)) { |
613 | // Bind the launch constraint into our evaluation context |
614 | launch_params.bind(launch_constraints.getDim(p_type), p_type); |
615 | // Makes sure the p-types bound to evaluators are the |
616 | // final values that will become the actual launch |
617 | // param size to ensure accurate smem buffer size |
618 | // computation. |
619 | expr_eval.bind(p_type, launch_constraints.getDim(p_type)); |
620 | } |
621 | } |
622 | } |
623 | } |
624 | |
625 | // Run through the rest of the parallel IterDomains and infer their size |
626 | for (auto& entry : simplified_parallel_iter_extents) { |
627 | FUSER_PERF_SCOPE("FusionExecutor::ParallelBindingResolution" ); |
628 | auto p_type = entry.first; |
629 | auto parallel_extents = entry.second; |
// Select the maximum value out of all the parallel extents
631 | int64_t maximum_value = std::numeric_limits<int64_t>::min(); |
632 | for (auto extent : parallel_extents) { |
633 | auto val = expr_eval.evaluate(extent); |
634 | TORCH_INTERNAL_ASSERT( |
635 | val.has_value(), |
636 | "Tried to evaluate the extent, " , |
637 | extent->toInlineString(), |
638 | " for the ptype: " , |
639 | p_type, |
640 | " to set launch bounds but could not." ); |
641 | |
642 | // apply padding to the extent if needed |
643 | if (warp_padded_extent_set.count(extent)) { |
644 | // Check if the extent has const value |
645 | auto padded_constant_it = warp_padded_constant.find(extent); |
646 | |
647 | if (padded_constant_it != warp_padded_constant.end()) { |
// If the padding was already specified as a constant, check that the
// runtime value does not exceed that constant bound.
650 | TORCH_INTERNAL_ASSERT(*val <= padded_constant_it->second); |
651 | *val = padded_constant_it->second; |
652 | } else { |
// If no constant was specified, pad to the smallest multiple of the
// warp size at or above the value.
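// e.g. with warp_size = 32 and *val = 70: (70 + 31) / 32 = 3 warps, so the
// extent is padded up to 96.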
655 | auto padded_number_of_warps = (*val + warp_size - 1) / warp_size; |
656 | *val = warp_size * padded_number_of_warps; |
657 | } |
658 | TORCH_INTERNAL_ASSERT( |
659 | *val <= 1024, "padded dimension larger than max block size" ); |
660 | } |
661 | maximum_value = std::max(maximum_value, val->as<int64_t>()); |
662 | } |
// Protect against size-0 tensors; they still produce a value, but we
// prefer to bind nothing rather than 0.
665 | if (maximum_value > 0) { |
666 | expr_eval.bind(p_type, maximum_value); |
667 | launch_params.bind(maximum_value, p_type); |
668 | } |
669 | } |
670 | |
671 | // Re-run the integer machine with all |
672 | // the thread sizes now determined. |
673 | if (expr_eval.precomputedValues()) { |
674 | expr_eval.precomputedValues()->evaluate(); |
675 | } |
676 | |
677 | const auto kernel = lowered_->kernel(); |
678 | const auto& kernel_summary = kernel->summary(); |
679 | |
680 | // Calculate Dynamic Shared Memory Size |
681 | // Add workspace for reduction and broadcast |
682 | uint64_t reduction_broadcast_workspace = 0; |
683 | const bool has_workspace = kernel_summary.has_block_reductions || |
684 | kernel_summary.has_grid_reductions || |
685 | kernel_summary.has_block_broadcasts || kernel_summary.has_grid_broadcasts; |
686 | if (has_workspace && |
687 | kernel_summary.largest_smem_data_type != DataType::Null) { |
688 | // Not using nThreads here since it does not handle uninitialized value |
689 | |
// TODO: here is an optimization opportunity since welford uses int64_t for
// N while the data type is not necessarily double. But it may need more
// work on the alignment.
693 | const int welford_factor = |
694 | kernel_summary.has_block_welford || kernel_summary.has_grid_welford ? 3 |
695 | : 1; |
696 | reduction_broadcast_workspace = |
697 | dataTypeSize(kernel_summary.largest_smem_data_type) * welford_factor * |
698 | launch_params.bdimx() * launch_params.bdimy() * launch_params.bdimz(); |
699 | } |
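// Worked example (illustrative): a block reduction on float data with a
// 128 x 2 x 1 block and no welford ops reserves
// 4 bytes * 1 * 128 * 2 * 1 = 1024 bytes of workspace, which is then used as
// the starting offset for computeSharedMemory below.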
700 | |
701 | // NOLINTNEXTLINE(cppcoreguidelines-init-variables) |
702 | const uint64_t dynamic_smem_size = computeSharedMemory( |
703 | expr_eval, |
704 | kernel_summary.dynamic_smem_allocations, |
705 | true, |
706 | reduction_broadcast_workspace); |
707 | |
708 | // Check that requested smem size can be dynamically allocated. |
709 | // This check is only done once a kernel has been compiled, since |
710 | // maybe_available_dynamic_smem_ needs to be evaluated on |
711 | // a compiled kernel. |
712 | if (maybe_available_dynamic_smem_.has_value()) { |
713 | // Dynamic shared memory space that we can allocate without |
714 | // carving more space from L1. |
715 | const uint64_t available_dynamic_smem_without_reconfiguration = |
716 | maybe_available_dynamic_smem_.value(); |
717 | // Maximum additional shared memory size we could request |
718 | // if we do re-configuration. |
719 | const uint64_t additional_dynamic_smem_available_through_reconfiguration = |
720 | device_smem_limit_ - configured_device_smem_; |
721 | |
722 | TORCH_INTERNAL_ASSERT( |
723 | (dynamic_smem_size) < |
724 | (available_dynamic_smem_without_reconfiguration + |
725 | additional_dynamic_smem_available_through_reconfiguration), |
726 | "The total shared memory allocation is larger than available memory." , |
727 | " Dynamic size: " , |
728 | dynamic_smem_size, |
729 | ". Available size: " , |
730 | maybe_available_dynamic_smem_.value(), |
731 | ". Configured smem size: " , |
732 | configured_device_smem_, |
733 | ". Device limit size: " , |
734 | device_smem_limit_); |
735 | } |
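// Worked example with hypothetical numbers: if 46 KB of dynamic shared
// memory is available without reconfiguration, the configured size is 48 KB
// and the opt-in device limit is 96 KB, then requests up to
// 46 KB + (96 KB - 48 KB) = 94 KB pass this check; anything larger asserts.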
736 | |
737 | launch_params.setSmem(dynamic_smem_size); |
738 | |
739 | return launch_params; |
740 | } |
741 | |
742 | FusionExecutor::GlobalBuffers FusionExecutor::allocGlobalVals( |
743 | kir::ExpressionEvaluator& expr_eval) { |
744 | FUSER_PERF_SCOPE("FusionExecutor::AllocGlobalVals" ); |
745 | GlobalBuffers global_buffers; |
746 | const auto kernel = lowered_->kernel(); |
747 | const auto& kernel_summary = kernel->summary(); |
748 | for (auto alloc : kernel_summary.global_allocations) { |
749 | TORCH_INTERNAL_ASSERT( |
750 | alloc->buffer()->isA<TensorView>(), |
751 | "Cannot allocate global buffers that are not tensors." ); |
752 | auto tv = alloc->buffer()->as<TensorView>(); |
753 | if (tv->isFusionOutput()) { |
754 | continue; |
755 | } |
756 | if (alloc->zeroInit()) { |
757 | global_buffers.buffers.push_back( |
758 | inferAndAlloc(tv, alloc->shape(), expr_eval, {}, options_, true)); |
759 | global_buffers.zero_init.push_back(true); |
760 | } else { |
761 | global_buffers.buffers.push_back( |
762 | inferAndAlloc(tv, alloc->shape(), expr_eval, {}, options_, false)); |
763 | global_buffers.zero_init.push_back(false); |
764 | } |
765 | // Remember the tensor buffer used for storing kernel profile |
766 | if (isOptionEnabled(EnableOption::KernelProfile) && |
767 | tv == kernel->profile().getBuffer()) { |
768 | global_buffers.profile_buffer = global_buffers.buffers.back(); |
769 | } |
770 | } |
771 | |
772 | return global_buffers; |
773 | } |
774 | |
775 | std::vector<at::Tensor> FusionExecutor::allocOutputs( |
776 | const KernelArgumentHolder& args, |
777 | kir::ExpressionEvaluator& expr_eval, |
778 | const std::unordered_set<int>& alias_indices) { |
779 | FUSER_PERF_SCOPE("FusionExecutor::AllocOutputs" ); |
780 | const auto kernel = lowered_->kernel(); |
781 | // NOLINTNEXTLINE(cppcoreguidelines-init-variables) |
782 | std::vector<at::Tensor> outputs; |
783 | TORCH_INTERNAL_ASSERT( |
784 | args.size() == kernel->inputs().size(), |
785 | "kernel arguments length does not match runtime arguments." ); |
786 | for (const auto out_i : c10::irange(kernel->outputs().size())) { |
787 | if (kernel->outputs()[out_i]->isFusionInput()) { |
// Push an empty tensor for trivial forwarding, since this is handled in
// integration; see step 1 of note [trivial forwarding].
790 | c10::Device device(c10::DeviceType::CUDA, args.getDeviceIndex()); |
791 | const auto tensor_options = |
792 | at::TensorOptions().dtype(at::kFloat).device(device); |
793 | outputs.emplace_back(at::empty({0}, tensor_options)); |
794 | } else { |
795 | TORCH_INTERNAL_ASSERT( |
796 | kernel->outputs()[out_i]->isA<TensorView>(), |
797 | "Cannot allocate outputs that are not tensors." ); |
798 | auto output = kernel->outputs()[out_i]->as<TensorView>(); |
799 | if (alias_indices.count(out_i) != 0) { |
800 | // aliasing to inputs, no need to allocate real output, just push empty |
801 | // tensor here. |
802 | outputs.emplace_back(); |
803 | } else { |
804 | outputs.push_back( |
805 | inferAndAllocOutput(output, expr_eval, options_, false)); |
806 | } |
807 | } |
808 | } |
809 | return outputs; |
810 | } |
811 | |
812 | void FusionExecutor::setUsedTVs() { |
813 | auto used_vals = fusion_->usedMathVals(); |
814 | auto used_tvs = ir_utils::filterByType<TensorView>(used_vals); |
815 | used_tvs_.clear(); |
816 | used_tvs_.insert(used_tvs_.begin(), used_tvs.begin(), used_tvs.end()); |
817 | } |
818 | |
819 | KernelArgumentHolder FusionExecutor::evaluateOutputSizes( |
820 | const KernelArgumentHolder& args, |
821 | kir::ExpressionEvaluator& expr_eval, |
822 | const std::unordered_set<int>& alias_indices) { |
823 | FUSER_PERF_SCOPE("FusionExecutor::AllocOutputs" ); |
824 | const auto kernel = lowered_->kernel(); |
825 | |
826 | KernelArgumentHolder ret(args.getIndexMode()); |
827 | ret.setDeviceIndex(args.getDeviceIndex()); |
828 | |
829 | CompileOptions meta_options = options_; |
830 | meta_options.device = c10::Device(DeviceType::Meta, 0); |
831 | |
832 | for (const auto out_i : c10::irange(kernel->outputs().size())) { |
// If the output is trivially the input, just "copy" it over; see note
// [trivial forwarding].
835 | if (kernel->outputs()[out_i]->isFusionInput()) { |
836 | for (auto inp_i : c10::irange(kernel->inputs().size())) { |
837 | if (kernel->inputs()[inp_i] == kernel->outputs()[out_i]) { |
838 | TORCH_INTERNAL_ASSERT( |
839 | inp_i < args.size(), |
840 | "Issue with an input showing up as output, couldn't find input." ); |
841 | |
842 | auto tensor_arg_abstract = |
843 | dynamic_cast<const TensorArgAbstract*>(args[inp_i]); |
844 | TORCH_INTERNAL_ASSERT( |
845 | tensor_arg_abstract, |
846 | "Cannot register a scalar as an output in a fusion." ); |
847 | ret.push(tensor_arg_abstract); |
848 | break; |
849 | } |
850 | } |
851 | } else { |
852 | TORCH_INTERNAL_ASSERT( |
853 | kernel->outputs()[out_i]->isA<TensorView>(), |
854 | "Cannot allocate outputs that are not tensors." ); |
855 | auto output = kernel->outputs()[out_i]->as<TensorView>(); |
856 | if (alias_indices.count(out_i) != 0) { |
857 | // aliasing to inputs, no need to allocate real output |
858 | // but we still need to push an entry here. |
859 | ret.push(int64_t(0)); |
860 | } else { |
861 | // TODO: we are using meta here, which is bad since it doesn't account |
862 | // for devices. Switch to fake tensor instead |
863 | ret.push(inferAndAllocOutput(output, expr_eval, meta_options, false)); |
864 | } |
865 | } |
866 | } |
867 | return ret; |
868 | } |
869 | |
870 | KernelArgumentHolder FusionExecutor::inferOutputSizes( |
871 | const KernelArgumentHolder& args, |
872 | const LaunchParams& launch_constraints) { |
873 | FUSER_PERF_SCOPE("FusionExecutor::RunFusion" ); |
874 | |
875 | ExecutorEntry* executor_entry = nullptr; |
876 | c10::optional<size_t> opt_code = args.getCacheId(); |
877 | if (opt_code.has_value()) { |
878 | executor_entry = &executor_entry_lookup_[*opt_code]; |
879 | } |
880 | |
881 | at::cuda::jit::initializeCudaContext(); |
882 | TORCH_INTERNAL_ASSERT(lowered_); |
883 | |
884 | TORCH_INTERNAL_ASSERT( |
885 | !executor_entry || !executor_entry->init, |
886 | "compile kernel shouldn't hit a pre-existing cache" ); |
887 | FUSER_PERF_SCOPE("ExecutorRunFusion::ValidateAndInitialize" ); |
// TODO: validateKernelInputs currently won't be happy, since our fusion
// args are mapped with `meta` tensors instead of `cuda` tensors; check if
// this would be resolved with FakeTensor.
891 | // executor_utils::validateKernelInputs(fusion_, args, options_.device); |
892 | |
893 | if (!evaluator_precomputed_values_) { |
894 | evaluator_precomputed_values_ = |
895 | std::make_unique<KernelPrecomputedValues>(lowered_->kernel()); |
896 | } |
897 | |
898 | kir::ExpressionEvaluator expr_eval; |
899 | evaluator_precomputed_values_->bindKernelInputs(lowered_->kernel(), args); |
900 | expr_eval.precomputedValues() = evaluator_precomputed_values_.get(); |
901 | |
// computeLaunchParams binds the inferred extents into expr_eval, so even
// though we are not using launch_params_ here, we still need this call in
// order to infer output shapes.
904 | launch_params_ = |
905 | computeLaunchParams(launch_constraints, expr_eval, warp_size_); |
906 | |
907 | executor_utils::validateVectorizedTensors( |
908 | lowered_.get()->kernel(), args, {}, compileTimeDataCache(), expr_eval); |
909 | |
910 | auto alias_indices_entry = executor_utils::caching::ExecutorCompileTimeEntry< |
911 | executor_utils::caching::InputAliasIndices>( |
912 | compileTimeDataCache(), [&]() { |
913 | return std::make_unique<std::vector<std::pair<int, int>>>( |
914 | fusion_->getInputAliasIndices()); |
915 | }); |
916 | |
917 | auto& alias_indices = alias_indices_entry.get(); |
918 | |
919 | // NOLINTNEXTLINE(bugprone-branch-clone) |
920 | auto output_alias_indices_entry = |
921 | executor_utils::caching::ExecutorCompileTimeEntry< |
922 | executor_utils::caching::OutputAliasIndices>( |
923 | compileTimeDataCache(), [&]() { |
924 | return std::make_unique<std::unordered_set<int>>( |
925 | fusion_->getOutputAliasIndices()); |
926 | }); |
927 | |
928 | auto& output_alias_indices = output_alias_indices_entry.get(); |
929 | |
930 | auto ret = evaluateOutputSizes(args, expr_eval, output_alias_indices); |
931 | |
932 | for (const auto& entry : alias_indices) { |
933 | auto aliased_output_index = entry.first; |
934 | auto aliased_input_index = entry.second; |
935 | TORCH_INTERNAL_ASSERT( |
936 | args[aliased_input_index]->isType(ArgType::Tensor), |
937 | "alias io only supports tensor" ); |
938 | ret.swap(aliased_output_index, args[aliased_input_index]); |
939 | } |
940 | |
941 | return ret; |
942 | } |
943 | |
944 | std::vector<at::Tensor> FusionExecutor::runFusion( |
945 | KernelArgumentHolder& args, |
946 | const LaunchParams& launch_constraints, |
947 | const std::vector<at::Tensor>& outputs) { |
948 | FUSER_PERF_SCOPE("FusionExecutor::RunFusion" ); |
949 | TORCH_INTERNAL_ASSERT(compiled()); |
950 | TORCH_INTERNAL_ASSERT( |
951 | fusion_id_ > 0, "Cannot run fusion, it was not compiled." ); |
952 | TORCH_INTERNAL_ASSERT( |
953 | !args.getCacheId().has_value() || outputs.empty(), |
954 | "short cut input cache is not compatible with pre-allocated output" ); |
955 | |
956 | size_t num_inputs = args.size(); |
957 | |
958 | if (isDebugDumpEnabled(DebugDumpOption::FusionArgs)) { |
959 | std::cout << "Arguments for fusion" << fusion_id_ << ":" << std::endl |
960 | << "Inputs:" << std::endl; |
961 | for (auto i : c10::irange(args.size())) { |
962 | args[i]->print(); |
963 | } |
964 | std::cout << "Outputs:" << std::endl; |
965 | for (const auto& output : outputs) { |
966 | std::cout << " " << output.scalar_type() << " " << output.sizes() |
967 | << " (strides = " << output.strides() << ")" << std::endl; |
968 | } |
969 | std::cout << launch_constraints.toString(); |
970 | } |
971 | |
972 | ExecutorEntry* executor_entry = nullptr; |
973 | if (args.getCacheId().has_value()) { |
974 | executor_entry = &executor_entry_lookup_[*args.getCacheId()]; |
975 | } |
976 | |
977 | c10::DeviceGuard dg(options_.device); |
978 | auto stream = at::cuda::getCurrentCUDAStream(); |
979 | at::cuda::jit::initializeCudaContext(); |
980 | TORCH_INTERNAL_ASSERT(lowered_); |
981 | launch_params_ = LaunchParams(); |
982 | // NOLINTNEXTLINE(cppcoreguidelines-init-variables) |
983 | std::vector<at::Tensor> allocated_outputs; |
984 | GlobalBuffers global_buffers; |
985 | uint64_t rand_offset = 0; |
986 | |
987 | if (executor_entry && executor_entry->init && !disable_parameter_cache_) { |
988 | { |
// RAII guard to disable autograd for the `empty_cuda` calls below
990 | at::AutoDispatchBelowADInplaceOrView non_variable_type_mode; |
991 | // take the short-cut for launch if we see a recorded input set again |
992 | launch_params_ = executor_entry->launch_params; |
993 | // only allocate outputs when not given |
994 | if (outputs.empty()) { |
995 | FUSER_PERF_SCOPE("ExecutorRunFusion::OutputAlloc" ); |
996 | for (const auto i : c10::irange(executor_entry->output_sizes.size())) { |
997 | allocated_outputs.push_back(at::native::empty_strided_cuda( |
998 | executor_entry->output_sizes[i], |
999 | executor_entry->output_strides[i], |
1000 | executor_entry->output_types[i], |
1001 | c10::nullopt, |
1002 | options_.device, |
1003 | c10::nullopt)); |
1004 | if (shouldFillAllocationWithNan()) { |
1005 | fillTensorWithNan(allocated_outputs.back()); |
1006 | } |
1007 | } |
// Note: aliased outputs are not returned as outputs, but they are still
// needed for kernel execution, so they need to be pushed to args.
1010 | for (const auto& entry : executor_entry->io_alias_indices) { |
1011 | auto aliased_output_index = entry.first; |
1012 | auto aliased_input_index = entry.second; |
1013 | auto tensor_arg_abstract = |
1014 | dynamic_cast<const TensorArgAbstract*>(args[aliased_input_index]); |
1015 | TORCH_INTERNAL_ASSERT( |
1016 | tensor_arg_abstract, "alias io only supports tensor" ); |
1017 | allocated_outputs[aliased_output_index] = |
1018 | tensor_arg_abstract->getTensor(); |
1019 | } |
1020 | args.push(allocated_outputs); |
1021 | } else { |
1022 | TORCH_INTERNAL_ASSERT( |
1023 | outputs.size() == fusion_->outputs().size(), |
1024 | __func__, |
1025 | " provided number of outputs does match fusion output" ); |
1026 | allocated_outputs = outputs; |
1027 | args.push(outputs); |
1028 | } |
1029 | |
1030 | { |
1031 | FUSER_PERF_SCOPE("ExecutorRunFusion::IntermediateBufferAlloc" ); |
1032 | for (const auto i : c10::irange(executor_entry->buffer_sizes.size())) { |
1033 | if (executor_entry->buffer_zero_init[i]) { |
1034 | global_buffers.buffers.push_back(at::zeros( |
1035 | executor_entry->buffer_sizes[i], |
1036 | at::TensorOptions() |
1037 | .dtype(executor_entry->buffer_types[i]) |
1038 | .device(options_.device))); |
1039 | global_buffers.zero_init.push_back(true); |
1040 | } else { |
1041 | global_buffers.buffers.push_back(at::native::empty_cuda( |
1042 | executor_entry->buffer_sizes[i], |
1043 | executor_entry->buffer_types[i], |
1044 | c10::nullopt, |
1045 | options_.device, |
1046 | c10::nullopt)); |
1047 | if (shouldFillAllocationWithNan()) { |
1048 | fillTensorWithNan(global_buffers.buffers.back()); |
1049 | } |
1050 | global_buffers.zero_init.push_back(false); |
1051 | } |
1052 | } |
1053 | } |
1054 | } |
1055 | rand_offset = executor_entry->rand_offset; |
1056 | } else { |
1057 | FUSER_PERF_SCOPE("ExecutorRunFusion::ValidateAndInitialize" ); |
1058 | // code path to take when either: |
1059 | // 1. no opt_code is provided or |
1060 | // 2. `executor_entry` is not initialized |
1061 | executor_utils::validateKernelInputs(fusion_, args, options_.device); |
1062 | |
1063 | if (!evaluator_precomputed_values_) { |
1064 | evaluator_precomputed_values_ = |
1065 | std::make_unique<KernelPrecomputedValues>(lowered_->kernel()); |
1066 | } |
1067 | |
1068 | kir::ExpressionEvaluator expr_eval; |
1069 | evaluator_precomputed_values_->bindKernelInputs(lowered_->kernel(), args); |
1070 | expr_eval.precomputedValues() = evaluator_precomputed_values_.get(); |
1071 | |
1072 | launch_params_ = |
1073 | computeLaunchParams(launch_constraints, expr_eval, warp_size_); |
1074 | |
1075 | // Recompile the kernel if the number of threads in the block has increased |
1076 | if (launch_params_.nThreads() > block_size_high_water_mark) { |
1077 | const auto kernel = lowered_->kernel(); |
1078 | kernel_code_ = codegen::generateCudaKernel(kernel, kernelName()); |
1079 | const auto structured_code = getStructuredCode(kernel_code_); |
1080 | block_size_high_water_mark = launch_params_.nThreads(); |
1081 | |
1082 | std::tie(compiled_kernel_, last_compiler_log_) = |
1083 | executor_utils::nvrtcCompile( |
1084 | structured_code, |
1085 | (kernelNamespace() + "::" + kernelName()).c_str(), |
1086 | fusion_id_, |
1087 | block_size_high_water_mark); |
1088 | } |
1089 | |
1090 | if (kernel()->summary().has_cooperative_grid_reduction) { |
1091 | #ifndef USE_ROCM |
1092 | int num_blocks_per_SM = -1; |
1093 | at::globalContext().getNVRTC().cuOccupancyMaxActiveBlocksPerMultiprocessor( |
1094 | &num_blocks_per_SM, |
1095 | compiled_kernel_.function, |
1096 | (int)(launch_params_.bdimx() * launch_params_.bdimy() * launch_params_.bdimz()), |
1097 | (size_t)launch_params_.smem()); |
1098 | |
1099 | TORCH_INTERNAL_ASSERT( |
1100 | (int64_t)( |
1101 | num_blocks_per_SM * |
1102 | at::cuda::getDeviceProperties(options_.device.index()) |
1103 | ->multiProcessorCount) >= launch_params_.gdimx() * |
1104 | launch_params_.gdimy() * launch_params_.gdimz(), |
1105 | "Wanted to launch a cooperative kernel, however the number of blocks is greater than " , |
1106 | "what can be resident on the GPU at once. Need: " , |
1107 | launch_params_.gdimx() * launch_params_.gdimy() * |
1108 | launch_params_.gdimz(), |
1109 | " (" , |
1110 | launch_params_.gdimx(), |
1111 | " * " , |
1112 | launch_params_.gdimy(), |
1113 | " * " , |
1114 | launch_params_.gdimz(), |
1115 | ") but limited to " , |
1116 | num_blocks_per_SM, |
1117 | " * " , |
1118 | at::cuda::getDeviceProperties(options_.device.index()) |
1119 | ->multiProcessorCount); |
1120 | #else |
1121 | TORCH_INTERNAL_ASSERT( |
1122 | false, "Cross grid communication not supported with HIP." ); |
1123 | #endif |
1124 | } |
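// e.g. if the occupancy query reports 2 resident blocks per SM on an 80-SM
// device, the check above allows a cooperative launch of at most
// 2 * 80 = 160 blocks in total.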
1125 | |
1126 | executor_utils::validateVectorizedTensors( |
1127 | lowered_.get()->kernel(), |
1128 | args, |
1129 | outputs, |
1130 | compileTimeDataCache(), |
1131 | expr_eval); |
1132 | |
1133 | auto alias_indices_entry = |
1134 | executor_utils::caching::ExecutorCompileTimeEntry< |
1135 | executor_utils::caching::InputAliasIndices>( |
1136 | compileTimeDataCache(), [&]() { |
1137 | return std::make_unique<std::vector<std::pair<int, int>>>( |
1138 | fusion_->getInputAliasIndices()); |
1139 | }); |
1140 | |
1141 | auto& alias_indices = alias_indices_entry.get(); |
1142 | |
1143 | // NOLINTNEXTLINE(bugprone-branch-clone) |
1144 | if (outputs.empty()) { |
1145 | auto output_alias_indices_entry = |
1146 | executor_utils::caching::ExecutorCompileTimeEntry< |
1147 | executor_utils::caching::OutputAliasIndices>( |
1148 | compileTimeDataCache(), [&]() { |
1149 | return std::make_unique<std::unordered_set<int>>( |
1150 | fusion_->getOutputAliasIndices()); |
1151 | }); |
1152 | |
1153 | auto& output_alias_indices = output_alias_indices_entry.get(); |
1154 | |
1155 | allocated_outputs = allocOutputs(args, expr_eval, output_alias_indices); |
1156 | |
1157 | for (const auto& entry : alias_indices) { |
1158 | auto aliased_output_index = entry.first; |
1159 | auto aliased_input_index = entry.second; |
1160 | auto tensor_arg_abstract = |
1161 | dynamic_cast<const TensorArgAbstract*>(args[aliased_input_index]); |
1162 | TORCH_INTERNAL_ASSERT( |
1163 | tensor_arg_abstract, "alias io only supports tensor" ); |
1164 | allocated_outputs[aliased_output_index] = |
1165 | tensor_arg_abstract->getTensor(); |
1166 | } |
1167 | args.push(allocated_outputs); |
1168 | } else { |
1169 | allocated_outputs = outputs; |
1170 | args.push(outputs); |
1171 | executor_utils::validateKernelOutputs( |
1172 | fusion_, allocated_outputs, options_.device); |
1173 | } |
1174 | |
1175 | global_buffers = allocGlobalVals(expr_eval); |
1176 | |
1177 | if (kernel()->summary().max_rng_offsets >= 0) { |
// NOTE: this is how we map the offset for pointwise kernels in order to
// have random number generation identical to native PyTorch results.
// It doesn't really hold up, though, as it assumes a particular thread
// binding that is not generally how the scheduler handles it.
// Refer to `Philox` in the generated kernel to understand how the mapping
// works.
1184 | rand_offset = (kernel()->summary().max_rng_offsets + 1) * 4; |
1185 | } |
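// e.g. a kernel whose largest RNG offset is 0 (a single RNG op) advances the
// offset by (0 + 1) * 4 = 4, presumably one slot for each of the four 32-bit
// values a Philox call produces.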
1186 | |
// This is the case where a cache id was provided but the entry has not
// been initialized yet.
1189 | if (executor_entry) { |
1190 | FUSER_PERF_SCOPE("ExecutorRunFusion::FillCacheEntry" ); |
// record the short-cut executor entry for the given input set
1192 | executor_entry->launch_params = launch_params_; |
1193 | executor_entry->io_alias_indices = alias_indices; |
1194 | for (const auto& output : allocated_outputs) { |
1195 | executor_entry->output_sizes.push_back(output.sizes().vec()); |
1196 | executor_entry->output_strides.push_back(output.strides().vec()); |
1197 | executor_entry->output_types.push_back(output.scalar_type()); |
1198 | } |
1199 | |
1200 | for (const auto& i : c10::irange(global_buffers.buffers.size())) { |
1201 | executor_entry->buffer_sizes.push_back( |
1202 | global_buffers.buffers[i].sizes().vec()); |
1203 | executor_entry->buffer_types.push_back( |
1204 | global_buffers.buffers[i].scalar_type()); |
1205 | executor_entry->buffer_zero_init.push_back(global_buffers.zero_init[i]); |
1206 | } |
1207 | executor_entry->rand_offset = rand_offset; |
1208 | executor_entry->init = true; |
1209 | } |
1210 | } |
1211 | |
1212 | // push back global buffers |
1213 | args.push(global_buffers.buffers); |
1214 | |
1215 | // push back RNG state if needed |
1216 | if (lowered_->kernel()->summary().max_rng_offsets >= 0) { |
1217 | args.appendPhiloxRNGSeed(rand_offset); |
1218 | } |
1219 | |
1220 | if (isDebugDumpEnabled(DebugDumpOption::LaunchParam)) { |
1221 | launch_params_.print(); |
1222 | } |
1223 | |
1224 | if (isDebugDumpEnabled(DebugDumpOption::KernelArgs)) { |
1225 | std::cout << "Arguments for kernel" << fusion_id_ << ":" << std::endl |
1226 | << "Inputs:" << std::endl; |
1227 | for (auto i : c10::irange(args.size())) { |
1228 | args[i]->print(); |
1229 | } |
1230 | std::cout << "Outputs:" << std::endl; |
1231 | // note: add aliased outputs here. |
1232 | for (const auto& output : allocated_outputs) { |
1233 | std::cout << " " << output.scalar_type() << " " << output.sizes() |
1234 | << " (strides = " << output.strides() |
1235 | << ", address = " << output.data_ptr() << ")" << std::endl; |
1236 | } |
1237 | std::cout << "Reduction and semaphore buffers:" << std::endl; |
1238 | TORCH_INTERNAL_ASSERT( |
1239 | global_buffers.buffers.size() == global_buffers.zero_init.size(), |
1240 | "global_buffer buffer & zero_init container should have identical sizes" ); |
1241 | for (const auto i : c10::irange(global_buffers.buffers.size())) { |
1242 | const auto& buffer = global_buffers.buffers[i]; |
1243 | const auto& zero_init = global_buffers.zero_init[i]; |
1244 | std::cout << " " << buffer.scalar_type() << " " << buffer.sizes() |
1245 | << " is_zero_initialized: " << zero_init << std::endl; |
1246 | } |
1247 | } |
1248 | |
1249 | cudaEvent_t start_event = {}; |
1250 | cudaEvent_t finish_event = {}; |
1251 | |
1252 | if (measure_kernel_time_ || |
1253 | isDebugDumpEnabled(DebugDumpOption::EffectiveBandwidth) || |
1254 | isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) { |
1255 | C10_CUDA_CHECK(cudaEventCreate(&start_event)); |
1256 | C10_CUDA_CHECK(cudaEventCreate(&finish_event)); |
1257 | C10_CUDA_CHECK(cudaEventRecord(start_event)); |
1258 | } |
1259 | |
1260 | if (execute_kernel_) { |
1261 | if (maybe_available_dynamic_smem_.has_value() && |
1262 | size_t(launch_params_.smem()) > maybe_available_dynamic_smem_.value()) { |
1263 | #ifndef USE_ROCM |
1264 | // Increase limit of dynamic shared memory if needed. |
1265 | AT_CUDA_DRIVER_CHECK(at::globalContext().getNVRTC().cuFuncSetAttribute( |
1266 | compiled_kernel_.function, |
1267 | CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, |
1268 | launch_params_.smem())); |
1269 | #else |
1270 | TORCH_INTERNAL_ASSERT( |
1271 | false, "cuFuncSetAttribute not supported with HIP." ); |
1272 | #endif |
1273 | } |
1274 | if (!kernel()->summary().has_cooperative_grid_reduction) { |
1275 | FUSER_PERF_SCOPE("ExecutorRunFusion::cuLaunchKernel" ); |
1276 | AT_CUDA_DRIVER_CHECK(at::globalContext().getNVRTC().cuLaunchKernel( |
1277 | compiled_kernel_.function, |
1278 | launch_params_.gdimx(), |
1279 | launch_params_.gdimy(), |
1280 | launch_params_.gdimz(), |
1281 | launch_params_.bdimx(), |
1282 | launch_params_.bdimy(), |
1283 | launch_params_.bdimz(), |
1284 | launch_params_.smem(), |
1285 | stream, |
1286 | args.getBuffer(), |
1287 | nullptr)); |
1288 | } else { |
1289 | #ifndef USE_ROCM |
1290 | FUSER_PERF_SCOPE("ExecutorRunFusion::cuLaunchCooperativeKernel" ); |
1291 | AT_CUDA_DRIVER_CHECK( |
1292 | at::globalContext().getNVRTC().cuLaunchCooperativeKernel( |
1293 | compiled_kernel_.function, |
1294 | launch_params_.gdimx(), |
1295 | launch_params_.gdimy(), |
1296 | launch_params_.gdimz(), |
1297 | launch_params_.bdimx(), |
1298 | launch_params_.bdimy(), |
1299 | launch_params_.bdimz(), |
1300 | launch_params_.smem(), |
1301 | stream, |
1302 | args.getBuffer())); |
1303 | #else |
1304 | TORCH_INTERNAL_ASSERT( |
1305 | false, "Cross grid communication not supported with HIP." ); |
1306 | #endif |
1307 | } |
1308 | } |
1309 | |
1310 | if (measure_kernel_time_ || |
1311 | isDebugDumpEnabled(DebugDumpOption::EffectiveBandwidth) || |
1312 | isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) { |
1313 | C10_CUDA_CHECK(cudaEventRecord(finish_event)); |
1314 | C10_CUDA_CHECK(cudaEventSynchronize(start_event)); |
1315 | C10_CUDA_CHECK(cudaEventSynchronize(finish_event)); |
1316 | C10_CUDA_CHECK( |
1317 | cudaEventElapsedTime(&kernel_time_ms_, start_event, finish_event)); |
1318 | C10_CUDA_CHECK(cudaEventDestroy(start_event)); |
1319 | C10_CUDA_CHECK(cudaEventDestroy(finish_event)); |
1320 | |
1321 | bytes_processed_ = 0; |
// Figure out how many bytes were processed by the inputs and outputs
1323 | for (auto i : c10::irange(num_inputs)) { |
1324 | if (auto tensor_arg_abstract = |
1325 | dynamic_cast<const TensorArgAbstract*>(args[i])) { |
1326 | bytes_processed_ += tensor_arg_abstract->numel() * |
1327 | dataTypeSize(tensor_arg_abstract->getDataType()); |
1328 | } |
1329 | } |
1330 | for (const auto& output : allocated_outputs) { |
1331 | bytes_processed_ += output.numel() * |
1332 | dataTypeSize(aten_to_data_type(output.scalar_type())); |
1333 | } |
1334 | |
1335 | if (isDebugDumpEnabled(DebugDumpOption::EffectiveBandwidth)) { |
1336 | double gb_per_s = |
1337 | ((double)bytes_processed_ / ((double)kernel_time_ms_ / 1000)) / |
1338 | (double)1.0e9; |
1339 | std::cout << "kernel" << fusion_id_ << " run in " << kernel_time_ms_ |
1340 | << " ms, achieved: " << gb_per_s << " GB/s" << std::endl; |
1341 | } |
1342 | } |
1343 | |
1344 | if (isOptionEnabled(EnableOption::KernelProfile)) { |
1345 | std::cout << kernel()->profile().toString(global_buffers.profile_buffer); |
1346 | } |
1347 | |
1348 | return allocated_outputs; |
1349 | } |
1350 | |
1351 | void FusionExecutor::compileRtc( |
1352 | const std::string& code, |
1353 | const std::string& name, |
1354 | bool structured, |
1355 | CompileOptions options) { |
1356 | FUSER_PERF_SCOPE("ExecutorRunFusion::compileRtc" ); |
1357 | std::string scode; |
1358 | if (!structured) { |
1359 | scode = getStructuredCode(code); |
1360 | } else { |
1361 | scode = code; |
1362 | } |
1363 | fusion_id_ = 1; |
1364 | options_ = options; |
1365 | |
1366 | std::tie(compiled_kernel_, last_compiler_log_) = |
1367 | executor_utils::nvrtcCompile(scode, name, fusion_id_); |
1368 | } |
1369 | |
1370 | void FusionExecutor::runRtc( |
1371 | const LaunchParams& launch_params, |
1372 | const std::vector<at::Tensor>& args) { |
1373 | FUSER_PERF_SCOPE("runFusion" ); |
1374 | |
1375 | c10::DeviceGuard dg(options_.device); |
1376 | auto stream = at::cuda::getCurrentCUDAStream(); |
1377 | |
1378 | KernelArgumentHolder kernel_arguments(options_.index_mode); |
1379 | kernel_arguments.push(args); |
1380 | AT_CUDA_DRIVER_CHECK(at::globalContext().getNVRTC().cuLaunchKernel( |
1381 | compiled_kernel_.function, |
1382 | launch_params.gdimx(), |
1383 | launch_params.gdimy(), |
1384 | launch_params.gdimz(), |
1385 | launch_params.bdimx(), |
1386 | launch_params.bdimy(), |
1387 | launch_params.bdimz(), |
1388 | launch_params.smem(), |
1389 | stream, |
1390 | kernel_arguments.getBuffer(), |
1391 | nullptr)); |
1392 | } |
1393 | |
1394 | } // namespace cuda |
1395 | } // namespace fuser |
1396 | } // namespace jit |
1397 | } // namespace torch |
1398 | |