1 | #pragma once |
2 | |
3 | #include <ir_all_nodes.h> |
4 | |
5 | namespace torch { |
6 | namespace jit { |
7 | namespace fuser { |
8 | namespace cuda { |
9 | |
//! Horizontally fuse multiple reductions.
//!
//! Given a list of tensors produced by ReductionOp, create a new
//! GroupedReductionOp expression that takes the input tensors of the
//! original reductions and produces the given tensors, replacing
//! their defining expressions.
//!
//! GroupedReductionOp works just like ReductionOp with a potential
//! benefit of aggregating synchronizations across individual
//! reductions. See the reduction::gridReduce2 runtime function for a
//! two-input version of grid reduction.
//!
//! The grouped reductions must follow several constraints, which
//! include:
//! - There must not exist any data dependency between individual
//!   reductions.
//! - All reduction output tensors must have the same number of
//!   dimensions, the same transformations and the same axes to
//!   reduce.
//!
//! Note that Welford is not allowed yet, though it should be
//! technically straightforward to support horizontal fusions of
//! welford ops. Unclear how common it would be in practice, though.
//!
//! \param reduction_outputs Tensors produced by ReductionOp
//!
//! NOTE(review): the error behavior when the constraints above are
//! violated is not visible from this header — presumably the
//! implementation raises a TORCH_* check failure; confirm in the
//! corresponding .cpp before relying on it.
TORCH_CUDA_CU_API void groupReductions(
    const std::vector<TensorView*>& reduction_outputs);
37 | |
38 | } // namespace cuda |
39 | } // namespace fuser |
40 | } // namespace jit |
41 | } // namespace torch |
42 | |