#include <instrumentation.h>
#include <ir_iostream.h>
#include <kernel.h>
#include <kernel_expr_evaluator.h>
#include <kernel_ir_dispatch.h>
#include <lower2device.h>

#include <ATen/cuda/CUDAContext.h>

#include <iostream>
#include <unordered_set>

namespace torch {
namespace jit {
namespace fuser {
namespace cuda {

IrBuilderPasskey::IrBuilderPasskey(IrContainer* ir_container)
    : ir_container_(ir_container) {}

namespace kir {

namespace {

//! Scan all primary expressions in the Kernel IR and build
//! lists of specialized nodes and other interesting information
class KernelIrScanner : private IrVisitor {
 public:
  explicit KernelIrScanner(const Kernel* kernel) {
    IrVisitor::handle(kernel->topLevelExprs());
    const auto gpu_lower = GpuLower::current();
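    // Record the extent and factor of each split that needs a run-time
    // divisibility check.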
    for (auto split : gpu_lower->nonDivisibleSplitInfo().splitsToValidate()) {
      auto extent = split->in()->extent();
      auto factor = split->factor();
      summary_.splits_to_validate.emplace_back(extent, factor);
    }
  }

  const auto& summary() const {
    return summary_;
  }

 private:
  using IrVisitor::handle;
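  // Visit the expression itself, then its inputs and outputs so that
  // Vals such as TensorIndex are scanned as well.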
  void handle(Expr* expr) final {
    IrVisitor::handle(expr);
    for (auto inp : expr->inputs()) {
      handle(inp);
    }
    for (auto out : expr->outputs()) {
      handle(out);
    }
  }

  void handle(BlockSync* sync) final {
    // TODO: Move to a dedicated validation pass
    // which is not on the common execution/compilation path
    if (sync->isWarHazardSync()) {
      ++summary_.war_hazard_syncs_count;
    }
  }

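  // A grid sync implies a cooperative grid launch.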
  void handle(GridSync* sync) final {
    summary_.has_cooperative_grid_reduction = true;
  }

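  // Categorize allocations by memory type. Local allocations with a
  // non-constant size are tracked separately as dynamic local memory.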
  void handle(Allocate* allocate) final {
    switch (allocate->memoryType()) {
      case MemoryType::Global:
        summary_.global_allocations.push_back(allocate);
        break;
      case MemoryType::Shared:
        summary_.dynamic_smem_allocations.push_back(allocate);
        break;
      case MemoryType::Local:
        if (!ExpressionEvaluator::isConst(allocate->size())) {
          summary_.has_dynamic_local_memory_allocations = true;
          summary_.dynamic_lmem_allocations.emplace_back(allocate);
        }
        break;
    }
  }

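  // Track the maximum RNG offset used by any RNGOp in the kernel.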
  void handle(RNGOp* rng_op) final {
    summary_.max_rng_offsets =
        std::max<int>(summary_.max_rng_offsets, rng_op->getRNGOffset());
  }

  void handle(TensorIndex* tensor_index) final {
    const auto tv = tensor_index->view();
    const auto domain = tv->domain();
    // Do we have any reductions?
    summary_.has_block_reductions =
        summary_.has_block_reductions || domain->hasBlockReduction();

    // Update the largest smem data type
    if (domain->hasBlockReduction() || domain->hasGridReduction() ||
        tv->getMemoryType() == MemoryType::Shared) {
      const auto data_type = tv->dtype();
      const size_t type_size = dataTypeSize(data_type);
      if (type_size > max_smem_type_size_) {
        max_smem_type_size_ = type_size;
        summary_.largest_smem_data_type = data_type;
      }
    }
  }

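  // Record Welford usage and whether any Welford output requires a
  // block reduction.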
  void handle(WelfordOp* welford_op) final {
    summary_.has_welford = true;
    TORCH_INTERNAL_ASSERT(welford_op->outAvg()->isA<TensorIndex>());
    auto out_dom = welford_op->outAvg()->as<TensorIndex>()->view()->domain();
    summary_.has_block_welford =
        summary_.has_block_welford || out_dom->hasBlockReduction();
  }

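  // Grid Welford implies a grid reduction; the allreduce variant also
  // requires a cooperative grid launch.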
  void handle(GridWelford* grid_welford) final {
    summary_.has_welford = true;
    summary_.has_grid_welford = true;
    summary_.has_grid_reductions = true;
    if (grid_welford->welford_op()->isAllreduce()) {
      summary_.has_cooperative_grid_reduction = true;
    }
  }

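  // Allreduce grid reductions require a cooperative grid launch.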
  void handle(GridReduction* grid_reduction) final {
    summary_.has_grid_reductions = true;
    if (grid_reduction->isAllreduce()) {
      summary_.has_cooperative_grid_reduction = true;
    }
  }

  void handle(GroupedGridReduction* grid_reduction) final {
    summary_.has_grid_reductions = true;
    if (grid_reduction->isAllreduce()) {
      summary_.has_cooperative_grid_reduction = true;
    }
  }

  void handle(GroupedGridWelford* grid_welford) final {
    summary_.has_welford = true;
    summary_.has_grid_welford = true;
    summary_.has_grid_reductions = true;
    if (grid_welford->isAllreduce()) {
      summary_.has_cooperative_grid_reduction = true;
    }
  }

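  // A grid broadcast relies on grid synchronization, so it also needs a
  // cooperative launch; the wrapped BroadcastOp is scanned as well.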
  void handle(GridBroadcast* grid_broadcast) final {
    summary_.has_cooperative_grid_reduction = true;
    handle(grid_broadcast->broadcast_op());
  }

  void handle(BroadcastOp* bop) final {
    const ParallelTypeBitmap parallel_types =
        GpuLower::current()->threadPredMap().getParallelBroadcastDomains(
            bop->out()->as<TensorIndex>()->view());
    summary_.broadcast_parallel_types.emplace(bop, parallel_types);
    // Do we have block broadcasts?
    summary_.has_block_broadcasts =
        summary_.has_block_broadcasts || parallel_types.hasTID();
    // Do we have grid broadcasts?
    summary_.has_grid_broadcasts =
        summary_.has_grid_broadcasts || parallel_types.hasBID();
  }

 private:
  size_t max_smem_type_size_ = 0;
  KernelSummary summary_;
};

//! Make sure tensors have valid allocations even when parallelized
//! loops potentially have larger iteration counts than the number of
//! threads.
//!
//! When an IterDomain of a tensor is parallelized, the IterDomain
//! may not contribute to the allocation of the tensor. For example,
//! it is assumed that the allocation of a local-memory tensor does not
//! need to account for a parallelized IterDomain. This is true
//! when it is guaranteed that each thread only needs to execute the
//! loop body once. Otherwise, the allocation is invalid as it
//! only has space for one value per thread.
//!
//! ValidateAllocation checks all tensor allocations and sees if any
//! tensor may have a parallelized loop whose iteration count may
//! be larger than the number of threads. If so, an error is thrown
//! unless the tensor is allocated on thread-shared memory. Note that
//! when allocated on shared memory (i.e., MemoryType::Shared or
//! MemoryType::Global for tensors parallelized with threadIdx, or
//! MemoryType::Global for tensors parallelized with blockIdx), the
//! allocation is assumed to be properly extended for the iteration
//! count.
class ValidateAllocation : private OptOutConstDispatch {
 public:
  static void validate(const Kernel* kernel) {
    ValidateAllocation validate_allocation(kernel);
  }

 private:
  explicit ValidateAllocation(const Kernel* kernel) {
    live_allocations_.emplace_back(std::vector<const Allocate*>());
    for (const auto& expr : kernel->topLevelExprs()) {
      OptOutConstDispatch::handle(expr);
    }
    live_allocations_.pop_back();
    TORCH_INTERNAL_ASSERT(live_allocations_.empty());
  }

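  // Track each allocation as live within the current scope.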
  void handle(const Allocate* allocate) final {
    TORCH_INTERNAL_ASSERT(!live_allocations_.empty());
    live_allocations_.back().push_back(allocate);
  }

  // for_loop is parallelized and its stop value is not guaranteed to
  // be <= the number of threads, which breaks an assumption made
  // during allocation lowering if the tensor is thread-parallel and not
  // allocated on shared or global memory, or if it is block-parallel
  // and not allocated on global memory.
  void validate(const ForLoop* for_loop) {
    const auto loop_id = for_loop->iter_domain();
    for (const auto& allocations : live_allocations_) {
      for (const auto& allocate : allocations) {
        const auto tv = dynamic_cast<TensorView*>(allocate->buffer());
        if (tv == nullptr) {
          continue;
        }
        for (const auto& axis : tv->domain()->domain()) {
          if (!GpuLower::current()->caMap()->areMapped(
                  loop_id, axis, IdMappingMode::LOOP)) {
            continue;
          }
          if (isParallelTypeThreadDim(loop_id->getParallelType())) {
            TORCH_INTERNAL_ASSERT(
                tv->getMemoryType() == MemoryType::Shared ||
                    tv->getMemoryType() == MemoryType::Global,
                "Tensor t",
                tv->name(),
                " must be allocated on SMEM or GMEM.");
          } else if (isParallelTypeBlockDim(loop_id->getParallelType())) {
            TORCH_INTERNAL_ASSERT(tv->getMemoryType() == MemoryType::Global);
          }
        }
      }
    }
  }

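  // Validate allocations against parallelized loops whose stop value differs
  // from the IterDomain extent (see validate above), then recurse into the
  // loop body with a new allocation scope.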
  void handle(const ForLoop* for_loop) final {
    if (for_loop->stop() != for_loop->iter_domain()->extent() &&
        isParallelTypeThread(for_loop->iter_domain()->getParallelType())) {
      validate(for_loop);
    }

    live_allocations_.emplace_back(std::vector<const Allocate*>());
    for (const auto& expr : for_loop->body().exprs()) {
      OptOutConstDispatch::handle(expr);
    }
    live_allocations_.pop_back();
  }

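  // Recurse into both branches; IfThenElse does not open a new
  // allocation scope.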
  void handle(const IfThenElse* ite) final {
    for (const auto& expr : ite->thenBody().exprs()) {
      OptOutConstDispatch::handle(expr);
    }
    for (const auto& expr : ite->elseBody().exprs()) {
      OptOutConstDispatch::handle(expr);
    }
  }

 private:
  std::vector<std::vector<const Allocate*>> live_allocations_;
};

} // namespace

// TODO(kir): Kernel IR validation
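//! Take ownership of the lowered top-level expressions, validate
//! allocations, and populate the kernel summary.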
void Kernel::finalize(std::vector<Expr*> top_level_exprs) {
  TORCH_INTERNAL_ASSERT(top_level_exprs_.empty());
  top_level_exprs_ = std::move(top_level_exprs);
  warp_padded_parallel_info_ = GpuLower::current()->getWarpPaddedParallelInfo();
  profile_ = GpuLower::current()->profile();
  ValidateAllocation::validate(this);
  analyze();
  // Make sure this is after analyze as it sets summary_
  summary_.vectorized_accesses = GpuLower::current()->vectorizedAccesses();
  summary_.vectorized_set_info = GpuLower::current()->vectorizedSetInfo();
  summary_.sync_map = GpuLower::current()->syncMap();
  summary_.parallel_dimension_map_ =
      GpuLower::current()->parallelDimensionMap();
}

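//! Populate summary_ by scanning the lowered Kernel IR.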
void Kernel::analyze() {
  FUSER_PERF_SCOPE("Kernel::analyze");

  const KernelIrScanner ir_scanner(this);
  summary_ = ir_scanner.summary();
}

void Kernel::print() const {
  IrPrinter ir_printer(std::cout);
  ir_printer.handle(this);
}

//! Register the Val with this fusion
void Kernel::registerVal(Val* val) {
  if (inContainer(val)) {
    return;
  }
  if (val->kernel()) {
    TORCH_CHECK(
        val->kernel() == this,
        val->toString(),
        " was not found in the active kernel.");
  }

  Fusion::registerVal(val);
}

//! Register expr with this fusion.
//! When we register an expression, we want to update the dependency tracking
//! of Vals. We add expr to our general expr_set_.
void Kernel::registerExpr(Expr* expr) {
  if (inContainer(expr)) {
    return;
  }

  if (expr->kernel()) {
    TORCH_CHECK(
        expr->kernel() == this,
        expr->toString(),
        " was not found in the active kernel.");
  }

  for (Val* input : expr->inputs()) {
    TORCH_INTERNAL_ASSERT(
        inContainer(input),
        "Input\n",
        input->toString(),
        " to expr,\n",
        expr->toString(),
        ",\n is invalid because it is not in the same kernel.");
  }

  for (Val* output : expr->outputs()) {
    TORCH_INTERNAL_ASSERT(
        inContainer(output),
        "Output\n",
        output->toString(),
        " to expr,\n",
        expr->toString(),
        ",\n is invalid because it is not in the same kernel.");
  }

  // Registering an expr coming from a kernel is explicitly non-SSA. This is
  // detected inside Fusion::registerExpr.
  Fusion::registerExpr(expr);
}

std::vector<Expr*>& KernelInternalProxy::topLevelExprs() {
  return kernel_->top_level_exprs_;
}

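// Assign a profiling slot to the expression if it does not already have one.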
void KernelPerformanceProfile::registerExpr(const Expr* expr) {
  if (expr_entry_map_.find(expr) != expr_entry_map_.end()) {
    return;
  }

  auto slot = getNewIndex();
  expr_entry_map_.emplace(expr, slot);
}

int KernelPerformanceProfile::getNewIndex() {
  return num_profile_entries_++;
}

bool KernelPerformanceProfile::isProfiled(const Expr* expr) const {
  return expr_entry_map_.find(expr) != expr_entry_map_.end();
}

c10::optional<int> KernelPerformanceProfile::getIndex(const Expr* expr) const {
  auto it = expr_entry_map_.find(expr);
  if (it == expr_entry_map_.end()) {
    return c10::optional<int>();
  } else {
    return it->second;
  }
}

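// Each profiled expression occupies two consecutive entries in the profile
// buffer: a cycle counter followed by a call count.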
std::array<int, 2> KernelPerformanceProfile::getIndicesInProfileBuffer(
    const Expr* expr) const {
  TORCH_INTERNAL_ASSERT(
      isProfiled(expr), "Not a profiled expression: ", expr->toString());

  int cycle_index = getIndex(expr).value() * 2;
  int count_index = cycle_index + 1;

  return {cycle_index, count_index};
}

std::string KernelPerformanceProfile::toString(const at::Tensor& buffer) const {
  std::stringstream ss;
  ss << "Kernel performance profile:\n";
  if (!buffer.defined()) {
    ss << "No profile found\n";
    return ss.str();
  }

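  // clockRate is reported in kHz, so cycles / kHz * 1000 yields microseconds.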
  double kilo_freq = at::cuda::getCurrentDeviceProperties()->clockRate;

  ss << std::setprecision(3) << std::fixed;

  for (const auto& kv : expr_entry_map_) {
    auto expr = kv.first;
    auto index = kv.second;
    auto out_tv = ir_utils::getTvOutput(expr);
    double cycles = static_cast<double>(buffer[index][0].item<int64_t>());
    auto count = buffer[index][1].item<int64_t>();
    auto cycles_per_call = count == 0 ? 0.0 : cycles / count;
    auto us_per_call = cycles_per_call / kilo_freq * 1000.0;
    ss << expr->getExprType().value() << ", T" << out_tv->name() << ", "
       << us_per_call << " us, " << count << "\n";
  }

  return ss.str();
}

} // namespace kir
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch