1 | #pragma once |
2 | |
3 | #include <c10/macros/Export.h> |
4 | |
5 | #include <ir_all_nodes.h> |
6 | #include <kernel_ir.h> |
7 | #include <kernel_ir_dispatch.h> |
8 | |
// Double buffering a tensor doubles its allocation size and uses two
// buffers to overlap computation and memory access. The basic form of
// the generated code looks as follows:
12 | // |
13 | // Before: |
14 | // for i |
15 | // x[S]; // allocation |
16 | // for j: |
17 | // x[j] = y[i, j] |
18 | // for j: |
19 | // ... = x[j] |
20 | // |
21 | // After: |
// x[S * 2]; // allocation
23 | // for i in 0 to 1: // Prologue |
24 | // for j: |
25 | // x[j] = y[i, j] |
26 | // |
27 | // for i in 0 to N-1: // Main |
28 | // for j: |
29 | // x[j + (1 - i % 2) * S] = y[i + 1, j] |
30 | // for j: |
31 | // ... = x[j + (i % 2) * S] |
32 | // |
33 | // for i in N-1 to N: // Epilogue |
34 | // for j: |
35 | // ... = x[j + (i % 2) * S] |
36 | // |
37 | // Here, S is the original size of tensor x. |
38 | // |
39 | // The i loop is the double buffer loop of tensor x, where double |
40 | // buffering is applied to the tensor. The first step of lowering is |
41 | // to find the double buffering axis for each double buffered |
// tensor. It must not be parallelized, as it isn't possible to double
// buffer parallelized loops. Also, an unrolled axis expands the
// allocation and is meant to be completely unrolled, which also
// conflicts with double buffering. So, basically, the double
// buffering axis is the inner-most axis among the axes left
// of the CA position. However, when that axis is parallelized or
// unrolled, an axis further to the left is picked instead.
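//
// For example (a hypothetical schedule), consider a tensor whose
// loop domain is [iBlockIdx.x, iS1, iUnroll2, iS3] with a CA position
// of 3. The axes left of the CA position are the first three; the
// inner-most of them, iUnroll2, is unrolled, so the next axis to the
// left, iS1, is picked as the double buffering axis.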
49 | // |
50 | // Once the double buffer axis is determined, the main task is to |
51 | // replicate the corresponding double buffer loop as illustrated |
// above. The Prologue loop just fetches the first element to
// populate the buffer. The Main loop is mostly the same as the
54 | // original loop, except for the indexing change to switch the two |
// buffers. When the tensor is used as a consumer (i.e., written to),
// an offset of (1 - i % 2) * S is added, whereas (i % 2) * S is added
// when it is used as a producer (i.e., read from). Here,
57 | // i is the index of the double buffer loop. The Epilogue loop is just |
// for the last iteration of the loop. Since the main loop reads one
// iteration ahead of the producer of the double buffered tensor, it
// would require an additional guard to prevent buffer overruns on
// the producer if the main loop were also used for the last
62 | // iteration. However, the value loaded by the invalid load would not |
63 | // be used, so instead of adding the additional predicate, the Epilogue |
64 | // loop is replicated from the original loop, except for the load |
65 | // expression since it's not used. Note that this overrun does not |
66 | // happen when the producer is on gmem, so in that case, this |
67 | // additional replication is not done. |
68 | // |
69 | // When creating those three types of loops, additional care must be |
70 | // taken when multiple tensors are double buffered. When multiple |
71 | // tensors use the same loop as their double buffer loop, one pass of |
72 | // replication takes care of them at once, meaning the same Prologue, |
73 | // Main, Epilogue loops are used for the multiple tensors. |
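//
// For example, if two tensors x1 and x2 (hypothetical names) both use
// the i loop as their double buffer loop, a single Prologue is
// generated covering both, and likewise for the Main and Epilogue
// loops:
//
// for i in 0 to 1: // Shared Prologue
//   for j:
//     x1[j] = y1[i, j]
//   for j:
//     x2[j] = y2[i, j]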
74 | // |
75 | // Other tasks to do for a double buffer tensor include: |
76 | // - Move allocation to outside of the double buffer loop |
77 | // - Double the allocation size |
78 | // - Omit the RAW sync in the Main and Epilogue loops |
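//
// For example, with x in shared memory, the usual lowering would
// insert a RAW __syncthreads() between the write and the read of x in
// the same iteration. With double buffering, the write targets the
// buffer for iteration i+1 while the read uses the buffer for
// iteration i, so that sync is omitted (a sketch):
//
// for i in 0 to N-1: // Main
//   for j:
//     x[j + (1 - i % 2) * S] = y[i + 1, j]
//   // No RAW __syncthreads() here: the write above and the read
//   // below touch different halves of the buffer.
//   for j:
//     ... = x[j + (i % 2) * S]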
79 | |
// [Circular buffer] A generalization of double buffering.
// On sm80+ hardware there is asynchronous copy infrastructure that
// motivates a circular buffering generalization of double buffering.
// Almost all analyses previously done for double buffering apply
// unchanged to circular buffering, except for the introduction of a
// new concept: the `stage depth`.
86 | // |
87 | // The `stage depth` is defined as the multiplier of extra buffering |
88 | // space used. In the case of double buffering, the stage depth would |
89 | // be 2. |
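//
// For example, with a stage depth of 3, the tensor is allocated as
// x[S * 3], and the main loop prefetches the data for iteration
// i + 2 while computing iteration i.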
90 | // |
// A circular buffered loop structure looks as follows, which exactly
// parallels the double buffered loop structure above, since circular
// buffering is an exact generalization serving the same purpose.
94 | // |
// Here S is the original allocation size as above and D is the stage
// depth. With D=2, the loop structure below becomes exactly the
// double buffering case above.
98 | // |
99 | // allocate X[S*D] // allocation |
100 | // for i in 0..D-1: // prolog |
101 | // for j in ... |
102 | // if pred: |
103 | // x[i*S+j] = y[i, j]; |
104 | // |
105 | // for i in 0..N: // main loop |
106 | // for j in ... |
107 | // if pred: |
108 | // x[((i+D-1)%D)*S+j] = y[i+D-1, j]; |
109 | // for j in ... |
110 | // .. = x[(i%D)*S+j] |
111 | // |
// (Epilogue omitted since this only makes sense when using
// cp.async, where the producer is in global memory and the consumer
// in shared memory.)
115 | // |
// The profitability of this optimization comes from the extra tolerance
// of global memory pipeline latency: at the expression `.. = x[(i%D)*S+j]`
// we only need the data for the current iteration to be complete, while
// the remaining D-2 load iterations can still be in flight and overlap
// with the compute of the current iteration.
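//
// For example, with D = 4, the prologue issues 3 transactions. At
// each main loop iteration i, the preceding wait has ensured that the
// load for iteration i is complete, while the loads for iterations
// i+1 and i+2 (the D-2 = 2 outstanding transactions) may still be in
// flight, overlapping with the compute of iteration i.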
121 | // |
122 | // To express this pattern on sm80+ hardware we can group the loads |
123 | // in each iteration of the circular buffered loop as one "transaction", |
124 | // and specify how many transactions we want to ensure completion when |
125 | // we insert the async barriers. |
126 | // |
127 | // allocate X[S*D] // allocation |
128 | // for i in 0..D-1: // prolog |
129 | // for j in ... |
130 | // if pred: |
131 | // x[i*S+j] = y[i, j]; |
132 | // cp.async.commit; // mark the transaction boundary |
133 | // |
// # At this point we have D-1 transactions in flight. For the
// # first iteration of the main loop we need one transaction
// # completed, so we wait until only D-2 transactions remain in
// # flight; that count is the operand of the barrier instruction.
138 | // |
139 | // cp.async.wait D-2 // ensure all but the last D-2 transactions complete. |
140 | // |
141 | // for i in 0..N: // main loop |
// # At this point we always have D-2 transactions in flight
// # and one completed.
144 | // for j in ... |
145 | // if pred: |
146 | // x[((i+D-1)%D)*S+j] = y[i+D-1, j]; |
147 | // for j in ... |
148 | // .. = x[(i%D)*S+j] |
149 | // cp.async.commit; // mark the transaction boundary for the |
150 | // load issued in this iteration. |
// # At this point we have D-1 transactions in flight,
// # and none completed.
153 | // cp.async.wait D-2; // Ensure all but the last D-2 transactions complete. |
// __syncthreads(); // Needed because each thread only ensures
// completion of its own async copies, so a block-wide
// sync is required at this point to ensure completion
// of the whole tile.
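//
// As a concrete illustration (hand-written here, not what this pass
// emits), the same pattern can be expressed with the CUDA pipeline
// primitives from <cuda_pipeline.h>. S, D, n, y, and out are
// hypothetical, and one thread per element of a stage
// (blockDim.x == S) is assumed:
//
//   template <int S, int D>
//   __global__ void circularBufferSketch(const float* y, float* out, int n) {
//     __shared__ float x[S * D];
//     const int j = threadIdx.x;
//     // Prologue: issue the first D-1 stages without waiting.
//     for (int i = 0; i < D - 1; ++i) {
//       if (i < n) {
//         __pipeline_memcpy_async(&x[i * S + j], &y[i * S + j], sizeof(float));
//       }
//       __pipeline_commit();
//     }
//     // All but the newest D-2 transactions are complete: stage 0 is ready.
//     __pipeline_wait_prior(D - 2);
//     __syncthreads();
//     for (int i = 0; i < n; ++i) {
//       // Issue the load for iteration i + D - 1, predicated at the tail.
//       if (i + D - 1 < n) {
//         __pipeline_memcpy_async(&x[((i + D - 1) % D) * S + j],
//                                 &y[(i + D - 1) * S + j], sizeof(float));
//       }
//       __pipeline_commit();
//       // Compute with the stage for iteration i, which is already complete.
//       out[i * S + j] = x[(i % D) * S + j] * 2.0f;
//       // Leave at most D-2 transactions in flight, then block-sync so
//       // every thread sees the completed tile for iteration i + 1.
//       __pipeline_wait_prior(D - 2);
//       __syncthreads();
//     }
//   }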
158 | |
159 | namespace torch { |
160 | namespace jit { |
161 | namespace fuser { |
162 | namespace cuda { |
163 | |
164 | unsigned int getDoubleBufferAxisPosition(const TensorView* tv); |
165 | |
166 | IterDomain* getDoubleBufferAxis(const TensorView* tv); |
167 | |
168 | void validateDoubleBufferedTensor(const TensorView* tv); |
169 | |
170 | class TORCH_CUDA_CU_API DoubleBufferPass { |
171 | public: |
172 | //! Apply double buffering transformations |
173 | static std::vector<Expr*> run(const std::vector<Expr*>& exprs); |
174 | }; |
175 | |
176 | class TORCH_CUDA_CU_API DoubleBufferInfo { |
177 | // Lowering information of double buffered tensors. |
178 | struct TvInfo { |
179 | IterDomain* double_buffer_axis = nullptr; |
180 | Val* original_alloc_size = nullptr; |
181 | }; |
182 | |
183 | public: |
184 | void build(Fusion* fusion); |
185 | |
186 | void setDoubleBufferAxis(const TensorView* tv, IterDomain* id); |
187 | |
188 | IterDomain* getDoubleBufferAxis(const TensorView* tv); |
189 | |
190 | //! Get a loop that matches with a given double-buffer axis. If |
191 | //! ignore_prologue is true, a matched loop is ignored if it's a |
192 | //! prologue loop. |
193 | static kir::ForLoop* getDoubleBufferLoop( |
194 | IterDomain* axis, |
195 | const std::vector<kir::ForLoop*>& loops, |
196 | bool ignore_prologue = false); |
197 | |
198 | //! Get a loop that matches with the double-buffer axis of a given |
199 | //! double-buffered tensor. If ignore_prologue is true, a matched |
200 | //! loop is ignored if it's a prologue loop. |
201 | kir::ForLoop* getDoubleBufferLoop( |
202 | const TensorView* tv, |
203 | const std::vector<kir::ForLoop*>& loops, |
204 | bool ignore_prologue = false); |
205 | |
206 | void setOriginalAllocSize(const TensorView* tv, Val* size); |
207 | |
208 | Val* getOriginalAllocSize(const TensorView* tv); |
209 | |
  //! Returns true if the IterDomain will be realized
  //! as a double buffer loop.
212 | bool isDoubleBufferedIterDomain(IterDomain* id); |
213 | |
  //! Get the number of circular buffer stages for the given axis. The
  //! number of stages is 2 in the case of a double buffer loop.
216 | unsigned int getStageDepthFor(IterDomain* circular_buffered_id); |
217 | |
218 | private: |
219 | TvInfo& getTvInfo(const TensorView* tv); |
220 | |
  //! Set the number of circular buffer stages for the given
  //! circular_buffered_id.
  //! Current code generation only supports one stage depth per loop
  //! disjoint set, so this function will throw an error when trying to
  //! set different stage numbers on IterDomains that are loop mapped.
227 | void setStageDepth( |
228 | IterDomain* circular_buffered_id, |
229 | unsigned int stage_depth); |
230 | |
231 | private: |
232 | //! Keeps track of information for lowering double buffered tensors |
233 | std::unordered_map<const TensorView*, TvInfo> map_; |
234 | |
  //! Keeps track of the concrete loop-mapped IterDomains that realize
  //! double buffer loops.
237 | std::unordered_set<const IterDomain*> concrete_double_buffered_loop_id_; |
238 | |
  //! Keeps track of double buffer loop stage depth.
  //! Currently, for each disjoint set of loop-mapped IterDomains,
  //! only one stage depth is supported, so that the loops can indeed
  //! be shared with the same prologue extent and main loop offset.
243 | std::unordered_map<IterDomain*, unsigned int> stage_depth_; |
244 | }; |
245 | |
246 | } // namespace cuda |
247 | } // namespace fuser |
248 | } // namespace jit |
249 | } // namespace torch |
250 | |