lower_bank_conflict.h source code [pytorch/third_party/nvfuser/csrc/lower_bank_conflict.h]

1	#pragma once
2
3	#include <dynamic_type.h>
4	#include <executor_launch_params.h>
5	#include <ir_base_nodes.h>
6	#include <kernel.h>
7
8	#include <unordered_map>
9	#include <utility>
10
11	namespace torch {
12	namespace jit {
13	namespace fuser {
14	namespace cuda {
15
16	// for more info on shared memory access see page 54-72 of:
17	// https://on-demand.gputechconf.com/gtc/2018/presentation/s81006-volta-architecture-and-performance-optimization.pdf
18
// Warning: The bank conflict checking utility here is not a replacement for
// Nsight Compute. This utility currently has the following assumptions and
// limitations:
22	//
23	// 1. This utility assumes that the data of the tensor is accessed by
24	// `T0[index]`, where `index` is the one stored in the `TensorIndex`
25	// object.
// 2. This utility only checks the first iteration. If we have something like
// `T1_s[tidx, 5]`, then different iterations may have different bank
// conflicts, and those later iterations are not evaluated.
29	// 3. This utility assumes that all tensors are independent, which means:
// 3.1 All shared memory tensors are allocated starting from a multiple of
// 4*32 bytes
// 3.2 The only source of bank conflicts is from within a tensor.
// There is no bank conflict between different tensors.
34	//
// Also note that this utility will not provide an accurate estimation if the
// above assumptions are not satisfied
37
38	std::unordered_map<const Expr, std::pair<int, int*>> getBankConflictInfo(
39	kir::Kernel* kernel,
40	c10::optional<LaunchParams> launch_params = c10::nullopt,
41	const std::unordered_map<std::string, IntOrDouble>& known_values = {});
42
43	} // namespace cuda
44	} // namespace fuser
45	} // namespace jit
46	} // namespace torch
47

Browse the source code of pytorch/third_party/nvfuser/csrc/lower_bank_conflict.h