#pragma once

#include <c10/macros/Export.h>

#include <ir_all_nodes.h>
#include <kernel_ir.h>
#include <kernel_ir_dispatch.h>

// Double buffering a tensor doubles its allocation size and uses the
// two buffers to overlap computation with memory accesses. The basic
// form of the code looks as follows:
//
// Before:
//   for i
//     x[S]; // allocation
//     for j:
//       x[j] = y[i, j]
//     for j:
//       ... = x[j]
//
// After:
//   x[S * 2]; // allocation
//   for i in 0 to 1: // Prologue
//     for j:
//       x[j] = y[i, j]
//
//   for i in 0 to N-1: // Main
//     for j:
//       x[j + (1 - i % 2) * S] = y[i + 1, j]
//     for j:
//       ... = x[j + (i % 2) * S]
//
//   for i in N-1 to N: // Epilogue
//     for j:
//       ... = x[j + (i % 2) * S]
//
// Here, S is the original size of tensor x.
//
// The i loop is the double buffer loop of tensor x, where double
// buffering is applied to the tensor. The first step of lowering is
// to find the double buffering axis for each double buffered
// tensor. It must not be parallelized, as it isn't possible to double
// buffer parallelized loops. Also, an unrolled axis expands the
// allocation and is intended to make the loop completely unrolled,
// which also conflicts with double buffering. So, basically, the
// double buffering axis is the inner-most axis among the axes left of
// the CA position. However, when that axis is parallelized or
// unrolled, an axis further to the left is picked, as illustrated
// below.
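//
// For example (a hypothetical schedule, purely for illustration), if a
// double buffered tensor has axes
//
//   [ i0 (blockIdx.x), i1 (serial), i2 (threadIdx.x) | i3 ]
//                                                    ^ CA position
//
// the inner-most axis left of the CA position is i2, but it is
// parallelized, so i1 becomes the double buffer axis.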
//
// Once the double buffer axis is determined, the main task is to
// replicate the corresponding double buffer loop as illustrated
// above. The Prologue loop just fetches the first element to
// populate the buffer. The Main loop is mostly the same as the
// original loop, except for the indexing change to switch between the
// two buffers. When x is used as a consumer (i.e., written by the
// load), an offset of (1 - i % 2) * S is added, whereas (i % 2) * S is
// added when it is used as a producer (i.e., read by a subsequent
// expression). Here, i is the index of the double buffer loop. The
// Epilogue loop is just for the last iteration of the loop. Since the
// Main loop reads one element ahead of the producer of the double
// buffered tensor, it would require an additional guard to prevent
// buffer overruns with the producer if the Main loop were also used
// for the last iteration. However, the value loaded by the invalid
// load would not be used, so instead of adding the additional
// predicate, the Epilogue loop is replicated from the original loop,
// except for the load expression, which is omitted since its result is
// not used. Note that this overrun does not happen when the producer
// is on gmem, so in that case this additional replication is not done.
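//
// As a concrete walk-through of the Main loop indexing above:
//
//   i = 0: writes x[j + S] with y[1, j] (offset (1 - 0 % 2) * S = S),
//          reads  x[j]                  (offset (0 % 2) * S = 0)
//   i = 1: writes x[j]     with y[2, j] (offset (1 - 1 % 2) * S = 0),
//          reads  x[j + S]              (offset (1 % 2) * S = S)
//
// i.e., each iteration consumes the half filled by the previous
// iteration (or the Prologue) while prefetching into the other half.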
//
// When creating those three types of loops, additional care must be
// taken when multiple tensors are double buffered. When multiple
// tensors use the same loop as their double buffer loop, one pass of
// replication takes care of them at once, meaning the same Prologue,
// Main, and Epilogue loops are used for the multiple tensors.
//
// Other tasks to do for a double buffer tensor include:
// - Move allocation to outside of the double buffer loop
// - Double the allocation size
// - Omit the RAW sync in the Main and Epilogue loops

// [Circular buffer] A generalization of double buffering.
// On sm80+ hardware there is asynchronous copy infrastructure that
// motivates a circular buffering generalization of double buffering.
// Almost all analyses previously done for double buffering apply
// unchanged to circular buffering, except for the introduction of a
// new concept: `stage depth`.
//
// The `stage depth` is defined as the multiplier of extra buffering
// space used. In the case of double buffering, the stage depth would
// be 2.
//
// A circular buffered loop structure looks like the following, which
// exactly parallels the double buffered loop structure above, since it
// is an exact generalization serving the same purpose.
//
// Here S is the original allocation size as above, and
// D is the stage depth. With D=2, the loop structure below becomes
// exactly the same as the double buffering case.
//
// allocate X[S*D] // allocation
// for i in 0..D-1: // prologue
//   for j in ...
//     if pred:
//       x[i*S+j] = y[i, j];
//
// for i in 0..N: // main loop
//   for j in ...
//     if pred:
//       x[((i+D-1)%D)*S+j] = y[i+D-1, j];
//   for j in ...
//     .. = x[(i%D)*S+j]
//
// (Epilogue omitted, since circular buffering only makes sense when
// using cp.async, where the producer is in global memory and the
// consumer in shared memory, so no epilogue replication is needed.)
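//
// As a concrete illustration (D = 3 chosen arbitrarily), the main loop
// rotates through three buffer slots, where slot k spans
// x[k*S .. k*S + S - 1]:
//
//   i = 0: loads into slot (0+2)%3 = 2, reads from slot 0%3 = 0
//   i = 1: loads into slot (1+2)%3 = 0, reads from slot 1%3 = 1
//   i = 2: loads into slot (2+2)%3 = 1, reads from slot 2%3 = 2
//
// Slots 0 and 1 are filled by the prologue, so at iteration i the load
// targeting slot (i+2)%3 is always D-1 = 2 iterations ahead of the read.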
//
// The profitability of this optimization comes from the extra
// tolerance of global memory pipeline latency: at the expression
// `.. = x[(i%D)*S+j]` we only need the data for the current iteration
// to have arrived, while the remaining D-2 in-flight loads can still
// be in progress and overlap with the compute of the current
// iteration.
//
// To express this pattern on sm80+ hardware we can group the loads
// issued in each iteration of the circular buffered loop into one
// "transaction", and specify how many transactions must have completed
// when we insert the async barriers.
//
// allocate X[S*D] // allocation
// for i in 0..D-1: // prologue
//   for j in ...
//     if pred:
//       x[i*S+j] = y[i, j];
//   cp.async.commit; // mark the transaction boundary
//
// # At this point we have D-1 transactions in flight. For the first
// # iteration of the main loop we need one transaction completed, so
// # we leave D-2 transactions in flight, which is the operand of the
// # barrier instruction below.
//
// cp.async.wait D-2 // ensure all but the last D-2 transactions complete
//
// for i in 0..N: // main loop
//   # At this point we always have D-2 transactions in flight
//   # and one completed.
//   for j in ...
//     if pred:
//       x[((i+D-1)%D)*S+j] = y[i+D-1, j];
//   for j in ...
//     .. = x[(i%D)*S+j]
//   cp.async.commit;   // mark the transaction boundary for the load
//                      // issued in this iteration
//   # At this point we have D-1 transactions in flight,
//   # none of which is guaranteed complete yet.
//   cp.async.wait D-2; // ensure all but the last D-2 transactions complete
//   __syncthreads();   // needed because each thread only waits for the
//                      // completion of its own async copies, so the block
//                      // must sync here to ensure the whole tile is ready
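//
// As a minimal, self-contained sketch of this pattern using the CUDA
// pipeline primitives (the kernel name, the elementwise compute, and
// the 1D tiling are made up for illustration; only the
// __pipeline_memcpy_async / __pipeline_commit / __pipeline_wait_prior
// calls correspond to the cp.async instructions above):
//
//   #include <cuda_pipeline.h>
//
//   template <int S, int D>
//   __global__ void circularBufferedCopy(const float* y, float* out, int n) {
//     __shared__ float x[S * D];
//     // Prologue: issue the first D-1 tiles, one commit per tile.
//     for (int i = 0; i < D - 1; ++i) {
//       for (int j = threadIdx.x; j < S; j += blockDim.x) {
//         __pipeline_memcpy_async(&x[i * S + j], &y[i * S + j], sizeof(float));
//       }
//       __pipeline_commit();
//     }
//     // D-1 transactions are in flight; wait until only D-2 remain so
//     // the data for the first iteration is available to the block.
//     __pipeline_wait_prior(D - 2);
//     __syncthreads();
//     for (int i = 0; i < n; ++i) {
//       // Prefetch tile i+D-1 into slot (i+D-1)%D, which is either
//       // still empty (i = 0) or was consumed in the previous iteration
//       // (safe because of the trailing __syncthreads below).
//       if (i + D - 1 < n) {
//         for (int j = threadIdx.x; j < S; j += blockDim.x) {
//           __pipeline_memcpy_async(
//               &x[((i + D - 1) % D) * S + j],
//               &y[(i + D - 1) * S + j],
//               sizeof(float));
//         }
//       }
//       // Consume tile i, whose completion was guaranteed by the wait
//       // at the end of the previous iteration (or the prologue wait).
//       for (int j = threadIdx.x; j < S; j += blockDim.x) {
//         out[i * S + j] = x[(i % D) * S + j] * 2.0f;
//       }
//       __pipeline_commit();          // transaction boundary for tile i+D-1
//       __pipeline_wait_prior(D - 2); // all but the last D-2 commits done
//       __syncthreads();              // whole block sees the completed tile
//     }
//   }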

namespace torch {
namespace jit {
namespace fuser {
namespace cuda {

//! Find the position of the double buffer axis within tv's axes.
unsigned int getDoubleBufferAxisPosition(const TensorView* tv);

//! Return the IterDomain used as the double buffer axis of tv.
IterDomain* getDoubleBufferAxis(const TensorView* tv);

//! Check that tv can legally be double buffered.
void validateDoubleBufferedTensor(const TensorView* tv);

class TORCH_CUDA_CU_API DoubleBufferPass {
 public:
  //! Apply double buffering transformations
  static std::vector<Expr*> run(const std::vector<Expr*>& exprs);
};
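
// A hypothetical call site in the lowering pipeline (the vector name
// `exprs` is made up; run() returns the transformed expression list,
// per the declaration above):
//
//   std::vector<Expr*> exprs = ...; // lowered expressions so far
//   exprs = DoubleBufferPass::run(exprs);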

class TORCH_CUDA_CU_API DoubleBufferInfo {
  // Lowering information of double buffered tensors.
  struct TvInfo {
    IterDomain* double_buffer_axis = nullptr;
    Val* original_alloc_size = nullptr;
  };

 public:
  void build(Fusion* fusion);

  void setDoubleBufferAxis(const TensorView* tv, IterDomain* id);

  IterDomain* getDoubleBufferAxis(const TensorView* tv);

  //! Get a loop that matches with a given double-buffer axis. If
  //! ignore_prologue is true, a matched loop is ignored if it's a
  //! prologue loop.
  static kir::ForLoop* getDoubleBufferLoop(
      IterDomain* axis,
      const std::vector<kir::ForLoop*>& loops,
      bool ignore_prologue = false);

  //! Get a loop that matches with the double-buffer axis of a given
  //! double-buffered tensor. If ignore_prologue is true, a matched
  //! loop is ignored if it's a prologue loop.
  kir::ForLoop* getDoubleBufferLoop(
      const TensorView* tv,
      const std::vector<kir::ForLoop*>& loops,
      bool ignore_prologue = false);

  void setOriginalAllocSize(const TensorView* tv, Val* size);

  Val* getOriginalAllocSize(const TensorView* tv);

  //! Returns true if the iterdomain will be realized
  //! as a double buffer loop.
  bool isDoubleBufferedIterDomain(IterDomain* id);

  //! Get the number of circular buffer stages for the given axis. The
  //! number of stages is 2 in the case of a double buffer loop.
  unsigned int getStageDepthFor(IterDomain* circular_buffered_id);

 private:
  TvInfo& getTvInfo(const TensorView* tv);

  //! Set the number of circular buffer stages for the given
  //! circular_buffered_id. Current code generation only supports a
  //! single stage depth per loop disjoint set, so this function throws
  //! an error when different stage depths are requested for
  //! iterdomains that are loop mapped.
  void setStageDepth(
      IterDomain* circular_buffered_id,
      unsigned int stage_depth);

 private:
  //! Keeps track of information for lowering double buffered tensors
  std::unordered_map<const TensorView*, TvInfo> map_;

  //! Keeps track of the loop-map concrete iterdomains that will be
  //! realized as double buffer loops.
  std::unordered_set<const IterDomain*> concrete_double_buffered_loop_id_;

  //! Keeps track of the double buffer loop stage depth.
  //! Currently only one stage depth is supported for each disjoint set
  //! of loop mapped iterdomains, so that the loops can indeed be shared
  //! with the same prologue extent and main loop offset.
  std::unordered_map<IterDomain*, unsigned int> stage_depth_;
};
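
// A hypothetical usage sketch from the lowering side (the surrounding
// variables `fusion`, `tv`, and `loops` are made up; only the
// DoubleBufferInfo members used below are declared in this header):
//
//   DoubleBufferInfo info;
//   info.build(fusion);
//   if (auto* db_axis = info.getDoubleBufferAxis(tv)) {
//     // Find the loop realizing the double buffer axis, skipping
//     // prologue clones so only main/epilogue indexing is offset.
//     kir::ForLoop* db_loop = DoubleBufferInfo::getDoubleBufferLoop(
//         db_axis, loops, /*ignore_prologue=*/true);
//     unsigned int depth = info.getStageDepthFor(db_axis);
//     Val* alloc_size = info.getOriginalAllocSize(tv);
//     // ... combine db_loop's index, depth, and alloc_size into the
//     // (i % depth) * alloc_size read offset described above ...
//   }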

} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch