1#pragma once
2
3#include <dynamic_type.h>
4#include <executor_launch_params.h>
5#include <ir_base_nodes.h>
6#include <kernel.h>
7
8#include <unordered_map>
9#include <utility>
10
11namespace torch {
12namespace jit {
13namespace fuser {
14namespace cuda {
15
16// For more info on shared memory access, see pages 54-72 of:
17// https://on-demand.gputechconf.com/gtc/2018/presentation/s81006-volta-architecture-and-performance-optimization.pdf
18
19// Warning: The bank conflict checking utility here is not a replacement for
20// Nsight Compute. This utility currently has the following assumptions and
21// limitations:
22//
23// 1. This utility assumes that the data of the tensor is accessed by
24// `T0[index]`, where `index` is the one stored in the `TensorIndex`
25// object.
26// 2. This utility only checks the first iteration. If we have something like
27// `T1_s[tidx, 5]`, then different iterations should have different
28// bank conflicts, and they will not all be evaluated.
29// 3. This utility assumes that all tensors are independent, which means:
30// 3.1 All shared memory tensors are allocated starting from a multiple of
31// 4*32 bytes
32// 3.2 The only source of bank conflicts is from within a tensor.
33// There is no bank conflict between different tensors.
34//
35// Also note that this utility will not provide accurate estimation if the above
36// assumptions are not satisfied.
37
// Reports bank conflict information for `kernel`, subject to the assumptions
// and limitations listed in the warning above. The result is keyed by
// `const Expr*`, with a pair of ints per entry.
// NOTE(review): the meaning of the two ints (presumably the conflict counts
// for the expression's reads and writes) is not stated in this header --
// confirm against the implementation.
//
// `launch_params` is optional and defaults to c10::nullopt; `known_values`
// defaults to an empty map. NOTE(review): presumably these supply the launch
// dimensions and named scalar values needed to evaluate indices -- confirm.
//
// NOTE(review): `std::string` is used below, but <string> is not among the
// includes visible in this header; it appears to rely on a transitive include.
38std::unordered_map<const Expr*, std::pair<int, int>> getBankConflictInfo(
39 kir::Kernel* kernel,
40 c10::optional<LaunchParams> launch_params = c10::nullopt,
41 const std::unordered_map<std::string, IntOrDouble>& known_values = {});
42
43} // namespace cuda
44} // namespace fuser
45} // namespace jit
46} // namespace torch
47