1 | #pragma once |
2 | |
3 | #include <dynamic_type.h> |
4 | #include <executor_launch_params.h> |
5 | #include <ir_base_nodes.h> |
6 | #include <kernel.h> |
7 | |
8 | #include <unordered_map> |
9 | #include <utility> |
10 | |
11 | namespace torch { |
12 | namespace jit { |
13 | namespace fuser { |
14 | namespace cuda { |
15 | |
// For more info on shared memory access, see pages 54-72 of:
17 | // https://on-demand.gputechconf.com/gtc/2018/presentation/s81006-volta-architecture-and-performance-optimization.pdf |
18 | |
// Warning: The bank conflict checking utility here is not a replacement for
// Nsight Compute. This utility currently has the following assumptions and
// limitations:
//
// 1. This utility assumes that the data of the tensor is accessed by
//    `T0[index]`, where `index` is the one stored in the `TensorIndex`
//    object.
// 2. This utility only checks the first iteration. If we have something like
//    `T1_s[tidx, 5]`, then different iterations can have different bank
//    conflicts, and those are not all evaluated.
// 3. This utility assumes that all tensors are independent, which means:
//    3.1 All shared memory tensors are allocated starting from a multiple of
//        4*32 bytes.
//    3.2 The only source of bank conflicts is from within a tensor.
//        There is no bank conflict between different tensors.
34 | // |
// Also note that this utility will not provide an accurate estimation if the
// above assumptions are not satisfied.
37 | |
38 | std::unordered_map<const Expr*, std::pair<int, int>> getBankConflictInfo( |
39 | kir::Kernel* kernel, |
40 | c10::optional<LaunchParams> launch_params = c10::nullopt, |
41 | const std::unordered_map<std::string, IntOrDouble>& known_values = {}); |
42 | |
43 | } // namespace cuda |
44 | } // namespace fuser |
45 | } // namespace jit |
46 | } // namespace torch |
47 | |