1 | #include <iter_visitor.h> |
2 | #include <kernel_ir_dispatch.h> |
3 | #include <lower2device.h> |
4 | #include <lower_magic_zero.h> |
5 | |
6 | #include <lower_instrument.h> |
7 | |
8 | namespace torch { |
9 | namespace jit { |
10 | namespace fuser { |
11 | namespace cuda { |
12 | |
13 | namespace { |
14 | |
15 | class Instrumentor : private kir::IrVisitor { |
16 | public: |
17 | Instrumentor(const std::vector<Expr*>& exprs) { |
18 | IrVisitor::handle(exprs); |
19 | |
20 | if (profile_.getNumberOfProfileEntries() == 0) { |
21 | exprs_ = exprs; |
22 | return; |
23 | } |
24 | |
25 | // Allocate a new TensorView as a backing buffer |
26 | allocateBuffer(); |
27 | profile_.setBuffer(buffer_); |
28 | |
29 | // Insert the allocation expression at the beginning of the |
30 | // top-level expressions |
31 | exprs_.push_back(buffer_alloc_); |
32 | exprs_.insert(exprs_.end(), exprs.begin(), exprs.end()); |
33 | } |
34 | |
35 | const kir::KernelPerformanceProfile& profile() const { |
36 | return profile_; |
37 | } |
38 | |
39 | const std::vector<Expr*>& exprs() const { |
40 | return exprs_; |
41 | } |
42 | |
43 | private: |
44 | using IrVisitor::handle; |
45 | |
46 | //! Profile all grid reductions. |
47 | //! TODO: support other variants of grid reductions (e.g., |
48 | //! GroupedGridReduction) |
49 | void handle(kir::GridReduction* expr) final { |
50 | profile_.registerExpr(expr); |
51 | } |
52 | |
53 | void handle(kir::GroupedGridReduction* expr) final { |
54 | profile_.registerExpr(expr); |
55 | } |
56 | |
57 | void allocateBuffer() { |
58 | const auto num_profile_entries = profile_.getNumberOfProfileEntries(); |
59 | |
60 | // If nothing to profile, do not allocate anything |
61 | if (num_profile_entries == 0) { |
62 | return; |
63 | } |
64 | |
65 | // Allocate two integers for each entry. One is used for accumulating |
66 | // cycles, and another for couting the number of hits |
67 | const std::vector<IterDomain*> new_buffer_ids = { |
68 | IterDomainBuilder( |
69 | GpuLower::current()->kernel()->zeroVal(), |
70 | IrBuilder::create<Int>(num_profile_entries)) |
71 | .build(), |
72 | IterDomainBuilder( |
73 | GpuLower::current()->kernel()->zeroVal(), IrBuilder::create<Int>(2)) |
74 | .build()}; |
75 | |
76 | const auto buffer_domain = IrBuilder::create<TensorDomain>(new_buffer_ids); |
77 | |
78 | buffer_ = IrBuilder::create<TensorView>( |
79 | buffer_domain, DataType::Int, MemoryType::Global); |
80 | |
81 | buffer_alloc_ = IrBuilder::create<kir::Allocate>( |
82 | buffer_, buffer_->getMemoryType(), nullptr, true); |
83 | } |
84 | |
85 | private: |
86 | std::vector<Expr*> exprs_; |
87 | kir::KernelPerformanceProfile profile_; |
88 | TensorView* buffer_ = nullptr; |
89 | kir::Allocate* buffer_alloc_ = nullptr; |
90 | }; |
91 | |
92 | } // namespace |
93 | |
94 | std::vector<Expr*> instrumentKernel(const std::vector<Expr*>& exprs) { |
95 | if (!isOptionEnabled(EnableOption::KernelProfile)) { |
96 | return exprs; |
97 | } |
98 | |
99 | Instrumentor inst(exprs); |
100 | |
101 | GpuLower::current()->profile() = inst.profile(); |
102 | |
103 | return inst.exprs(); |
104 | } |
105 | |
106 | } // namespace cuda |
107 | } // namespace fuser |
108 | } // namespace jit |
109 | } // namespace torch |
110 | |