lower_instrument.cpp source code [pytorch/third_party/nvfuser/csrc/lower_instrument.cpp]

1	#include <iter_visitor.h>
2	#include <kernel_ir_dispatch.h>
3	#include <lower2device.h>
4	#include <lower_magic_zero.h>
5
6	#include <lower_instrument.h>
7
8	namespace torch {
9	namespace jit {
10	namespace fuser {
11	namespace cuda {
12
13	namespace {
14
15	class Instrumentor : private kir::IrVisitor {
16	public:
17	Instrumentor(const std::vector<Expr*>& exprs) {
18	IrVisitor::handle(exprs);
19
20	if (profile_.getNumberOfProfileEntries() == `0`) {
21	exprs_ = exprs;
22	return;
23	}
24
25	// Allocate a new TensorView as a backing buffer
26	allocateBuffer();
27	profile_.setBuffer(buffer_);
28
29	// Insert the allocation expression at the beginning of the
30	// top-level expressions
31	exprs_.push_back(buffer_alloc_);
32	exprs_.insert(exprs_.end(), exprs.begin(), exprs.end());
33	}
34
35	const kir::KernelPerformanceProfile& profile() const {
36	return profile_;
37	}
38
39	const std::vector<Expr>& exprs() const* {
40	return exprs_;
41	}
42
43	private:
44	using IrVisitor::handle;
45
46	//! Profile all grid reductions.
47	//! TODO: support other variants of grid reductions (e.g.,
48	//! GroupedGridReduction)
49	void handle(kir::GridReduction* expr) final {
50	profile_.registerExpr(expr);
51	}
52
53	void handle(kir::GroupedGridReduction* expr) final {
54	profile_.registerExpr(expr);
55	}
56
57	void allocateBuffer() {
58	const auto num_profile_entries = profile_.getNumberOfProfileEntries();
59
60	// If nothing to profile, do not allocate anything
61	if (num_profile_entries == `0`) {
62	return;
63	}
64
65	// Allocate two integers for each entry. One is used for accumulating
66	// cycles, and another for couting the number of hits
67	const std::vector<IterDomain*> new_buffer_ids = {
68	IterDomainBuilder (
69	GpuLower::current()->kernel()->zeroVal(),
70	IrBuilder::create<Int>(num_profile_entries))
71	.build(),
72	IterDomainBuilder (
73	GpuLower::current()->kernel()->zeroVal(), IrBuilder::create<Int>(`2`))
74	.build()};
75
76	const auto buffer_domain = IrBuilder::create<TensorDomain>(new_buffer_ids);
77
78	buffer_ = IrBuilder::create<TensorView>(
79	buffer_domain, DataType::Int, MemoryType::Global);
80
81	buffer_alloc_ = IrBuilder::create<kir::Allocate>(
82	buffer_, buffer_->getMemoryType(), nullptr, true);
83	}
84
85	private:
86	std::vector<Expr*> exprs_;
87	kir::KernelPerformanceProfile profile_;
88	TensorView* buffer_ = nullptr;
89	kir::Allocate* buffer_alloc_ = nullptr;
90	};
91
92	} // namespace
93
94	std::vector<Expr> instrumentKernel(const* std::vector<Expr*>& exprs) {
95	if (!isOptionEnabled(EnableOption::KernelProfile)) {
96	return exprs;
97	}
98
99	Instrumentor inst(exprs);
100
101	GpuLower::current()->profile() = inst.profile();
102
103	return inst.exprs();
104	}
105
106	} // namespace cuda
107	} // namespace fuser
108	} // namespace jit
109	} // namespace torch
110

Browse the source code of pytorch/third_party/nvfuser/csrc/lower_instrument.cpp