#pragma once

#include <c10/macros/Export.h>

#include <fusion.h>
#include <ir_base_nodes.h>
#include <ir_internal_nodes.h>
#include <mma_type.h>

#include <torch/csrc/jit/ir/ir.h>

//! Nodes in here are intended to be "user facing": users in this sense being
//! those that want to be able to generate CUDA code.

namespace torch {
namespace jit {
namespace fuser {
namespace cuda {

class WelfordResult;
class ViewTransform;

class IrCloner;
class IrBuilderPasskey;

//! A Bool value
//!
//! This value can be a symbolic value (defined after the kernel
//! is compiled) or a constant value (inlined into the kernel definition).
//!
class TORCH_CUDA_CU_API Bool : public Val {
 public:
  Bool(IrBuilderPasskey passkey);

  explicit Bool(IrBuilderPasskey passkey, bool value);

  explicit Bool(IrBuilderPasskey passkey, c10::optional<bool> value);

  Bool(const Bool* src, IrCloner* ir_cloner);

  bool isSymbolic() const {
    return !(maybe_value_.has_value());
  }
  bool isConst() const final {
    return maybe_value_.has_value();
  }
  c10::optional<bool> value() const {
    return maybe_value_;
  }

  bool sameAs(const Statement* other) const override;

 private:
  const c10::optional<bool> maybe_value_;
};
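
//! Example (a hedged sketch, not part of the original header): assuming
//! IrBuilder::create<> is the usual construction path that supplies the
//! IrBuilderPasskey, a symbolic and a constant Bool would be created as:
//!
//!   Bool* symbolic = IrBuilder::create<Bool>();     // value bound at runtime
//!   Bool* constant = IrBuilder::create<Bool>(true); // inlined into the kernel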

//! A Float64 value. This value can be a symbolic value (defined after the
//! kernel is compiled) or a constant value (inlined into the kernel
//! definition).
class TORCH_CUDA_CU_API Double : public Val {
 public:
  using ScalarType = double;

  Double(IrBuilderPasskey passkey);

  explicit Double(IrBuilderPasskey passkey, ScalarType value);

  explicit Double(IrBuilderPasskey passkey, c10::optional<ScalarType> value);

  Double(const Double* src, IrCloner* ir_cloner);

  bool isSymbolic() const {
    return !(maybe_value_.has_value());
  }
  bool isConst() const final {
    return maybe_value_.has_value();
  }
  c10::optional<ScalarType> value() const {
    return maybe_value_;
  }

  bool sameAs(const Statement* other) const override;

 private:
  const c10::optional<ScalarType> maybe_value_;
};

//! An Int64 value. If used for indexing it's set as size_t. Otherwise it's an
//! inlined literal in the kernel.
class TORCH_CUDA_CU_API Int : public Val {
 public:
  using ScalarType = int64_t;

  Int(IrBuilderPasskey passkey);

  explicit Int(IrBuilderPasskey passkey, ScalarType value);

  explicit Int(IrBuilderPasskey passkey, c10::optional<ScalarType> value);

  Int(const Int* src, IrCloner* ir_cloner);

  bool isSymbolic() const {
    return !(maybe_value_.has_value());
  }
  bool isConst() const final {
    return maybe_value_.has_value();
  }
  c10::optional<ScalarType> value() const {
    return maybe_value_;
  }

  bool sameAs(const Statement* other) const override;

 private:
  const c10::optional<ScalarType> maybe_value_;
};

//! A c10::complex<double> value. This value can be a symbolic value (defined
//! after the kernel is compiled) or a constant value (inlined into the kernel
//! definition).
class TORCH_CUDA_CU_API ComplexDouble : public Val {
 public:
  using ScalarType = c10::complex<double>;

  ComplexDouble(IrBuilderPasskey passkey);

  explicit ComplexDouble(IrBuilderPasskey passkey, ScalarType value);

  explicit ComplexDouble(
      IrBuilderPasskey passkey,
      c10::optional<ScalarType> value);

  ComplexDouble(const ComplexDouble* src, IrCloner* ir_cloner);

  bool isSymbolic() const {
    return !(maybe_value_.has_value());
  }
  bool isConst() const final {
    return maybe_value_.has_value();
  }
  c10::optional<ScalarType> value() const {
    return maybe_value_;
  }

  bool sameAs(const Statement* other) const override;

 private:
  const c10::optional<ScalarType> maybe_value_;
};

//! Mode during propagation of computeAt. Standard will throw an error if the
//! provided computeAt position can't be satisfied, best effort will lower the
//! computeAt position as needed during traversal, and most inlined will
//! increase the computeAt position to the maximum possible through traversal.
enum class ComputeAtMode { Standard, BestEffort, MostInlined };

class TransformPropagator;
struct MostInlinedTransformPropagator;
class TransformIter;
class TransformReplay;
class OptOutMutator;
class TensorDomain;

class MaxPosCalculator;

namespace ir_utils {
class TVDomainGuard;
}

//! TensorView is our primitive Tensor Type used in code generation. It can be
//! thought of as representing physical memory, however, its dimensionality is
//! modified as split/merge/computeAt functions are called. The history of
//! these transformations is kept and used for generating actual code
//! referencing physical memory. Generally when users are thinking of code
//! generation in reference to a Tensor, this is the class they should be
//! interacting with.
//!
//! The reason we need both TensorView and TensorDomain is that we need to have
//! a record of both what is being computed and how it is being computed. For
//! example we may have the operation:
//!
//!   TV3[I, J, K] = TV2[I, J, K] + TV1[I, J, K]
//!
//! The mathematical operations here are on the tensor views TV1, TV2, and
//! TV3. This operation is a pointwise operation. To compute this pointwise
//! operation we iterate over the 3D TensorDomain [I, J, K], where K is the
//! fastest changing dimension.
//!
//! \todo Need to work on the const model for TensorView, making all functions
//! that should be const, const. Gave this a try but expanded really quickly.
//! getComputeAtAxis not being const because it can return a TV that some
//! expect to be non-const is the biggest headache.
//!
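//! A hedged illustration of the pointwise example above (assuming the
//! arithmetic helpers declared elsewhere in this fuser, e.g. add()):
//!
//!   TensorView* tv3 = add(tv2, tv1); // TV3[I, J, K] = TV2[I, J, K] + TV1[I, J, K]
//!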
class TORCH_CUDA_CU_API TensorView : public Val {
 public:
  TensorView(
      IrBuilderPasskey passkey,
      TensorDomain* domain,
      DataType dtype,
      MemoryType mtype = MemoryType::Local);

  explicit TensorView(
      IrBuilderPasskey passkey,
      const std::shared_ptr<c10::TensorType>& tensor_type);

  explicit TensorView(
      IrBuilderPasskey passkey,
      const std::shared_ptr<Value>& jit_value);

  TensorView(const TensorView* src, IrCloner* ir_cloner);

  TensorDomain* domain() const {
    return domain_;
  }

  //! This is for a TensorView with an rFactor domain that is an input to a
  //! fusion segment. We convert the rfactor domain into a new root domain.
  //! Any dynamic-sized rfactor iterDomains are given a new symbolic extent.
  //! Concrete integer extents are kept. Output TensorViews of any subsequent
  //! expressions that use this TensorView are also updated.
  void convertRfactorToRootDomain();

  void setContiguity(const std::vector<bool>& contig) {
    domain()->setContiguity(contig);
  }

  void setContiguity(bool contig) {
    setContiguity(std::vector<bool>(domain()->contiguity().size(), contig));
  }

  bool hasReduction() const;
  bool hasBlockReduction() const;
  bool hasGridReduction() const;
  bool hasBroadcast() const;
  bool hasRFactor() const;

  //! This is the previous hasReduction logic, kept here exclusively for the
  //! lower loop pass; it will be deprecated once the Fusion IR pass can
  //! convert trivial reductions.
  bool hasAnyReduction() const;

  //! Returns true if this tensor is zero dimensional,
  //! i.e. a wrapped scalar or an empty placeholder.
  bool isZeroDim() const {
    return nDims() == 0;
  }

  //! Returns true if this tensor does not contain
  //! any value.
  bool isEmptyTensor() const;

  c10::optional<unsigned int> getReductionAxis() const;

  const std::vector<IterDomain*>& getRootDomain() const;

  const std::vector<IterDomain*>& getRFactorDomain() const;

  // If rfactor domain exists in domain() return it, otherwise return root
  // domain.
  const std::vector<IterDomain*>& getMaybeRFactorDomain() const;

  IterDomain* axis(int pos) const;

  // Does it share outer axes with other tensors?
  bool hasComputeAt() const {
    return compute_at_pos_ > 0;
  }

  bool hasMaxProducerPosition() const {
    return max_producer_pos_ > 0;
  }

  size_t nDims() const;

  // Sets the cpu_scalar_ value, which provides special handling for CPU based
  // zero-dim tensors (i.e. CPU Tensors that only have one value). This is only
  // used on input values, otherwise it is ignored. The special handling is
  // important because these "scalars" should be type promoted as a tensor, but
  // we want to avoid explicitly copying the data, so we want to pass the data
  // value as a standard kernel argument value.
  void setCpuScalar(bool is_cpu_scalar);

  // Returns the cpu_scalar_ value, which provides special handling for CPU
  // based zero-dim tensors (i.e. CPU Tensors that only have one value). This
  // is only used on input values, otherwise it is ignored. The special
  // handling is important because these "scalars" should be type promoted as
  // a tensor, but we want to avoid explicitly copying the data, so we want to
  // pass the data value as a standard kernel argument value.
  bool isCpuScalar() const {
    return cpu_scalar_;
  }

  // Returns the position that this tensor is produced at relative to its axes.
  unsigned int getComputeAtPosition() const {
    return compute_at_pos_;
  }

  // Returns the maximum position at which producers are computed relative to
  // this tensor. This position dictates the clear expectations of producers.
  unsigned int getMaxProducerPosition() const {
    return max_producer_pos_;
  }

  //! This is used when we disconnect a tensorview from a reduction
  //! operation and connect it to a non-reduction operator. We need
  //! to remove the reduction ids on the tv in this case.
  //! Currently only used in translate welford, and this function may
  //! be refactored or extended if any more use cases appear.
  void clearReductionIterDomains();

  //! Compute this TensorView relative to a consumer position, -1 will
  //! compute tensors inline with each other, 0 doesn't share
  //! any loop nests between the tensors. It's an error when the given
  //! position is not legally viable. Alternatively, when the mode
  //! parameter is ComputeAtMode::BestEffort, the position is lowered
  //! one by one until a valid position is found. When
  //! ComputeAtMode::MostInlined is given, the position parameter is
  //! ignored, and the deepest possible position is searched.
  TensorView* computeAt(
      TensorView* consumer,
      int position,
      ComputeAtMode mode = ComputeAtMode::Standard);
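
  // A hedged usage sketch of the three modes (assuming producer/consumer are
  // already connected TensorViews in the fusion):
  //   producer->computeAt(consumer, 2); // error if position 2 is not viable
  //   producer->computeAt(consumer, 2, ComputeAtMode::BestEffort);   // lowers 2 as needed
  //   producer->computeAt(consumer, -1, ComputeAtMode::MostInlined); // position ignored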

  //! Compute this tensor to consumer, at local position, -1 will compute
  //! tensors inline with each other, 0 doesn't share any loop nests between
  //! the tensors. The mode parameter can be used in the same manner as
  //! computeAt.
  TensorView* computeWith(
      TensorView* consumer,
      int position,
      ComputeAtMode mode = ComputeAtMode::Standard);

  // Split "axis" into 2 axes
  //! inner_split dictates if the factor section of the split should be inside
  //! the remainder or outside.
  //! e.g. split(0, 4, inner_split = true) will result in:
  //! tv[id{extent}] -> tv[id{ceilDiv(extent, factor)}, id{factor}]
  //! e.g. split(0, 4, inner_split = false) will result in:
  //! tv[id{extent}] -> tv[id{factor}, id{ceilDiv(extent, factor)}]
  //!
  //! When trim_out_of_bounds is true, only the inner domain defined by the
  //! start and stop positions is split.
  TensorView* split(
      int axis,
      unsigned int factor,
      bool inner_split = true,
      bool trim_out_of_bounds = false);
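
  // A minimal sketch: starting from a 1-D tv with extent 128, either of
  //   tv->split(0, 4);        // -> tv[ceilDiv(128, 4), 4]
  //   tv->split(0, 4, false); // -> tv[4, ceilDiv(128, 4)]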

  // Split "axis" into 2 axes where the inner axis is of size "factor"
  // and the outer axis is of size axis.size() / factor. Factor can be a
  // symbolic value instead of a constant. This requires setting the symbolic
  // value as an input, or using a parallel dim from
  // NamedScalar::getParallelDim
  TensorView* split(
      int axis,
      Val* factor,
      bool inner_split = true,
      bool trim_out_of_bounds = false);

  // Merge axis_o and axis_i into 1 IterDomain
  TensorView* merge(int axis_o, int axis_i);

  // Merge axis and axis+1 into 1 IterDomain
  TensorView* merge(int axis) {
    return merge(axis, axis + 1);
  }
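
  // e.g. (a hedged sketch) starting from tv[I0, I1, I2]:
  //   tv->merge(0); // -> tv[I0*I1, I2]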

  // Reorder axes according to old2new[old_pos] = new_pos
  TensorView* reorder(const std::unordered_map<int, int>& old2new);
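
  // e.g. (a hedged sketch) tv[I0, I1, I2] with old2new = {{0, 2}}, which moves
  // axis 0 to position 2 and keeps the relative order of the other axes:
  //   tv->reorder({{0, 2}}); // -> tv[I1, I2, I0]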

  //! Swizzle indices to improve memory access efficiency.
  //!
  //! Swizzle::Transpose is a pattern commonly used to avoid bank
  //! conflicts in shared memory. It takes two axes and shifts the
  //! second axis by the first axis as ((axis1 + axis2) % extent). The
  //! memory type must be Shared.
  //!
  //! \param type Swizzle pattern such as transpose.
  //! \param axes Axes to swizzle
  TensorView* swizzle(SwizzleType type, const std::vector<int>& axes);
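
  //! e.g. (a hedged sketch, assuming tv is in shared memory and Transpose is
  //! the SwizzleType entry for the pattern described above):
  //!   tv->swizzle(SwizzleType::Transpose, {1, 2});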

  //! Swizzle the rectangular tile defined by the iterdomains corresponding
  //! to the 2 given indices.
  TensorView* swizzle(
      Swizzle2DType swizzle_type,
      int x,
      int y,
      SwizzleMode swizzle_mode = SwizzleMode::Data);

  // WARNING: rFactor does not return this TensorView, it returns a new
  //  TensorView consumed by this!
  //
  // Take reduction axes out of this domain, and create a new
  // domain. New domain will be used to create this domain.
  //
  // For example:
  //  TV1[I0, R1, R2, I3] = TV0[I0, I1, I2, I3]
  //
  // After:
  //  TV1->rFactor({1}), TV1 is transformed to -> TV1[I0, R2, I3]
  //
  // The TensorView returned is: TV2[I0, R1, I2, I3]
  //
  // The reduction will now be set as:
  //  TV2[I0, R1, I2, I3] = TV0[I0, I1, I2, I3]
  //  TV1[I0, R2, I3] = TV2[I0, R1, I2, I3]
  //
  TensorView* rFactor(const std::vector<int>& axes);

  //! Multi-output version of rFactor, semantically similar to
  //! the reduction version except that the rfactor is done
  //! for all outputs in a consistent way
  std::vector<TensorView*> rFactor(
      const std::vector<int>& axes,
      const std::vector<TensorView*>& tvs);

  //! Create a TensorView before the original tensor. A common use case is to
  //! write results into shared memory or registers before moving to global
  //! memory. Analogous to TVM Cache_Write
  //!
  //! @param cache_op: memory operator to use for the inserted op between
  //!   the data tensor and the cache tensor
  TensorView* cacheBefore(
      c10::optional<LoadStoreOpType> cache_op = c10::nullopt);

  //! Create a TensorView after the original tensor. A common use case is to
  //! read tensor into shared memory or registers. Analogous to TVM Cache_Read
  //!
  //! @param cache_op: memory operator to use for the inserted op between
  //!   the data tensor and the cache tensor
  TensorView* cacheAfter(
      c10::optional<LoadStoreOpType> cache_op = c10::nullopt);
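
  // A hedged usage sketch: staging a global-memory input in shared memory
  // before it is consumed (assuming tv is a fusion input with consumers):
  //   TensorView* tv_smem = tv->cacheAfter();
  //   tv_smem->setMemoryType(MemoryType::Shared);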

  // For a fusion output with other uses, we want to avoid writing to global
  // memory and then reading the output again. We write to global memory
  // separately after an operation. We replace this fusion output with the
  // direct write TensorView.
  TensorView* cacheFork();

  MemoryType getMemoryType() const {
    return memory_type_;
  }

  void setMemoryType(MemoryType mt);

  SwizzleType swizzleType() const {
    return swizzle_type_;
  }

  const std::vector<IterDomain*>& axesToSwizzle() const {
    return axes_to_swizzle_;
  }

  // Apply double buffering transformation
  void doubleBuffer();

  // Apply circular buffering transformation
  void circularBuffer(unsigned int number_of_stage);
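
  // A hedged usage sketch (assuming tv is a shared-memory tensor produced by
  // a copy that can be pipelined), use one of:
  //   tv->doubleBuffer();    // two buffers
  //   tv->circularBuffer(4); // four-stage pipeline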

  // Returns true if this tensor is double buffered.
  bool isDoubleBuffered() const {
    return is_double_buffered_;
  }

  // Returns true if this tensor is circular buffered.
  bool isCircularBuffered() const {
    return is_circular_buffered_;
  }

  // Returns the depth of circular buffering if applicable.
  unsigned int circularBufferDepth() const {
    TORCH_INTERNAL_ASSERT(
        is_circular_buffered_, toString(), "not circular buffered");
    return circular_buffer_stage_;
  }

  //! Transforms the innermost iterdomains according to the given mma swizzle,
  //! this should be used on the tvs that are either inputs/outputs of an
  //! MmaOp, or any tvs that are involved in prolog/epilog fusions and need to
  //! have a matching thread swizzle with the mma operand/result.
  //! For more detail on usage, see [WarpMmaSwizzler] in scheduler/mma_utils.h.
  void applyMmaSwizzle(MmaOptions options);

  //! Returns whether this tensor view has a swizzle operator on its tensor
  //! domain. This is a temporary flag for indicating that the new swizzle
  //! implementation is used and will be removed in follow ups.
  bool hasSwizzleOp() const {
    return has_swizzle_op_;
  }

  friend TORCH_CUDA_CU_API TransformPropagator;
  friend TORCH_CUDA_CU_API MostInlinedTransformPropagator;
  friend TORCH_CUDA_CU_API TransformReplay;
  friend TORCH_CUDA_CU_API OptOutMutator;
  friend class InlineBatchingGuard;
  friend class ir_utils::TVDomainGuard;

  // Inline the computation of this tensor into its consumer at the given
  // position. If this tensor is already inlined at a higher position, then
  // this call is a no-op. If the rightmost dimensions before `pos` are
  // broadcast dimensions, inlining will not go into them. If best_effort is
  // set, the tensor is inlined at the highest allowed position that is <=
  // `pos`.
  void inlineAt(
      int64_t pos,
      bool best_effort = false,
      MaxPosCalculator* calc = nullptr);
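
  // A hedged usage sketch: inline tv into its consumer at position 2, or at
  // the deepest legal position at or below 2 if 2 itself is not allowed:
  //   tv->inlineAt(2, /*best_effort=*/true);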

  // Update the max producer position of the current tensor. This is required
  // when we modify producer-consumer relationship of a scheduled tensor, for
  // example, grouping multiple reductions.
  void updateMaxProducerPosition();

 protected:
  void setDomain(TensorDomain* td) {
    domain_ = td;
  }

 private:
  int normalizeAxisPos(int pos) const {
    if (pos < 0) {
      pos += nDims();
    }
    return pos;
  }

  //! A helper function to maintain the consistency of schedules of
  //! multiple outputs when doing rfactor on multi-output reduction ops.
  TensorView* multiOutputRfactorHelper(
      TensorView* tv,
      const std::vector<int>& axes);

 private:
  TensorDomain* domain_ = nullptr;
  unsigned int compute_at_pos_ = 0;
  unsigned int max_producer_pos_ = 0;
  MemoryType memory_type_ = MemoryType::Local;
  SwizzleType swizzle_type_ = SwizzleType::NoSwizzle;
  std::vector<IterDomain*> axes_to_swizzle_;
  bool is_double_buffered_ = false;

  //! Indicates if the tensor is circular buffered.
  bool is_circular_buffered_ = false;

  //! Indicates the circular buffering stage depth if applicable.
  unsigned int circular_buffer_stage_ = 0;

  // Special handling for CPU based zero-dim tensors (i.e. CPU Tensors that
  // only have one value). This is only used on input values, otherwise it is
  // ignored. The special handling is important because these "scalars" should
  // be type promoted as a tensor, but we want to avoid explicitly copying the
  // data, so we want to pass the data value as a standard kernel argument
  // value.
  bool cpu_scalar_ = false;

  //! Indicates if this tensor view has a swizzle operator on its tensor
  //! domain. This is a temporary flag for indicating that the new swizzle
  //! implementation is used and will be removed in follow ups.
  bool has_swizzle_op_ = false;
};

//! A simple TensorView builder
//!
//! Example usage:
//!
//!   auto tv = TensorViewBuilder()
//!                 .ndims(ndims)
//!                 .dtype(dtype)
//!                 .contiguity(contiguity)
//!                 .build();
//!
class TORCH_CUDA_CU_API TensorViewBuilder {
 public:
  //! Set the number of dimensions of the tensor (default 0, meaning scalar)
  TensorViewBuilder& ndims(size_t ndims);

  //! Set the data type of the tensor (default DataType::Float)
  TensorViewBuilder& dtype(DataType dtype);

  //! Set the contiguity information (default non-contiguous)
  TensorViewBuilder& contiguity(std::vector<bool> contiguity);

  //! Set the shape (default 0 dimensional, i.e. scalar)
  TensorViewBuilder& shape(std::vector<Val*> shape);
  TensorViewBuilder& shape(const std::vector<int64_t>& shape);
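
  //! e.g. (a hedged sketch): a contiguous 2-D tensor with concrete extents:
  //!   auto tv = TensorViewBuilder()
  //!                 .shape({16, 32})
  //!                 .contiguity({true, true})
  //!                 .build();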

  //! Creates a new TensorView with the specified options
  TensorView* build() const;

 private:
  size_t ndims_ = 0;
  DataType dtype_ = DataType::Float;
  std::vector<bool> contiguity_;
  std::vector<Val*> shape_;
};

} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch