1 | #pragma once |
2 | |
3 | #include <c10/macros/Export.h> |
4 | |
5 | #include <dispatch.h> |
6 | #include <ir_all_nodes.h> |
7 | |
8 | #include <vector> |
9 | |
10 | namespace torch { |
11 | namespace jit { |
12 | namespace fuser { |
13 | namespace cuda { |
14 | |
15 | //! Reuse Allocation nodes via pointer aliasing. |
16 | //! |
17 | //! First pass: find the candidate TensorViews. |
18 | //! A candidate TensorView is anything in shared memory, OR anything |
19 | //! in local memory with a static size larger than register_size_threshold. |
20 | //! |
21 | //! Second pass: find an appropriate input Allocate node |
22 | //! among the candidate TensorViews. |
23 | //! |
24 | //! Alias criteria -- the output Allocate is aliased to the input Allocate when: |
25 | //!   - the input is a candidate TensorView, |
26 | //!   - the input allocation has the same size as the output allocation, |
27 | //!   - the thread bindings match, and |
28 | //!   - the input is not used after this op. |
29 | //! \param exprs input expression list, taken by const reference. |
30 | //! \return a vector of Expr* by value (presumably the list with aliasing applied -- TODO confirm in the definition). |
31 | std::vector<Expr*> reuseMemoryAllocations(const std::vector<Expr*>& exprs); |
32 | |
33 | } // namespace cuda |
34 | } // namespace fuser |
35 | } // namespace jit |
36 | } // namespace torch |
37 | |