1 | #pragma once |
2 | |
3 | #include <ir_all_nodes.h> |
4 | |
5 | namespace torch { |
6 | namespace jit { |
7 | namespace fuser { |
8 | namespace cuda { |
9 | |
10 | //! Set up KernelPerformanceProfile of GpuLower when enabled, which keeps |
11 | //! track of expressions to profile. A new TensorView is added for storing |
12 | //! profiling results, and the expression list is prepended with a |
13 | //! kir::Allocate node to allocate that profile buffer. Expressions added |
14 | //! after this pass are not profiled, so call it after all expressions are |
15 | //! lowered. KernelPerformanceProfile is copied to Kernel after lowering. |
16 | //! \param exprs Expression list to instrument. |
17 | //! \return Presumably the list with the allocation prepended (TODO confirm). |
18 | std::vector<Expr*> instrumentKernel(const std::vector<Expr*>& exprs); |
19 | |
20 | } // namespace cuda |
21 | } // namespace fuser |
22 | } // namespace jit |
23 | } // namespace torch |
24 | |