#include <ir_builder.h>
#include <kernel_ir_dispatch.h>
#include <lower2device.h>
#include <utils.h>

#include <lower_instrument.h>

namespace torch {
namespace jit {
namespace fuser {
namespace cuda {

namespace {

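//! Inserts kernel profiling instrumentation. Registers the expressions to
//! be profiled in a KernelPerformanceProfile and prepends the allocation
//! of a global buffer backing the profile.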
class Instrumentor : private kir::IrVisitor {
 public:
  Instrumentor(const std::vector<Expr*>& exprs) {
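    // Traverse the top-level expressions and register the ones to profile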
    IrVisitor::handle(exprs);

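    // Nothing to profile; leave the expressions unchanged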
    if (profile_.getNumberOfProfileEntries() == 0) {
      exprs_ = exprs;
      return;
    }

    // Allocate a new TensorView as a backing buffer
    allocateBuffer();
    profile_.setBuffer(buffer_);

    // Insert the allocation expression at the beginning of the
    // top-level expressions
    exprs_.push_back(buffer_alloc_);
    exprs_.insert(exprs_.end(), exprs.begin(), exprs.end());
  }

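  //! Performance profile gathered from the given expressions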
  const kir::KernelPerformanceProfile& profile() const {
    return profile_;
  }

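  //! Top-level expressions with instrumentation applied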
  const std::vector<Expr*>& exprs() const {
    return exprs_;
  }

 private:
  using IrVisitor::handle;

  //! Profile grid reductions, including the grouped variant handled below.
  //! TODO: support other grid operations (e.g., grid welford)
  void handle(kir::GridReduction* expr) final {
    profile_.registerExpr(expr);
  }

  void handle(kir::GroupedGridReduction* expr) final {
    profile_.registerExpr(expr);
  }

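  //! Allocate the global buffer backing the performance profile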
  void allocateBuffer() {
    const auto num_profile_entries = profile_.getNumberOfProfileEntries();

    // If there is nothing to profile, do not allocate anything
    if (num_profile_entries == 0) {
      return;
    }

    // Allocate two integers for each entry. One is used for accumulating
    // cycles, and the other for counting the number of hits.
    const std::vector<IterDomain*> new_buffer_ids = {
        IterDomainBuilder(
            GpuLower::current()->kernel()->zeroVal(),
            IrBuilder::create<Int>(num_profile_entries))
            .build(),
        IterDomainBuilder(
            GpuLower::current()->kernel()->zeroVal(), IrBuilder::create<Int>(2))
            .build()};

    const auto buffer_domain = IrBuilder::create<TensorDomain>(new_buffer_ids);

    buffer_ = IrBuilder::create<TensorView>(
        buffer_domain, DataType::Int, MemoryType::Global);

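    // Allocation of the buffer in global memory (zero-initialized)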
    buffer_alloc_ = IrBuilder::create<kir::Allocate>(
        buffer_, buffer_->getMemoryType(), nullptr, true);
  }

 private:
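  //! Resulting top-level expressions, with the buffer allocation prepended
  //! when anything is profiled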
  std::vector<Expr*> exprs_;
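  //! Profile entries registered during the traversal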
  kir::KernelPerformanceProfile profile_;
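  //! Global int buffer of shape [num_profile_entries, 2] backing the profile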
  TensorView* buffer_ = nullptr;
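  //! Allocation expression for buffer_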
  kir::Allocate* buffer_alloc_ = nullptr;
};

} // namespace

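//! Add performance-profiling instrumentation to the kernel top-level
//! expressions. This is a no-op unless EnableOption::KernelProfile is set.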
std::vector<Expr*> instrumentKernel(const std::vector<Expr*>& exprs) {
  if (!isOptionEnabled(EnableOption::KernelProfile)) {
    return exprs;
  }

  Instrumentor inst(exprs);

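  // Keep the gathered profile on the active GpuLower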
  GpuLower::current()->profile() = inst.profile();

  return inst.exprs();
}

} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch