1#pragma once
2
3#include <ir_all_nodes.h>
4
5namespace torch {
6namespace jit {
7namespace fuser {
8namespace cuda {
9
//! Horizontally fuse multiple independent reductions.
//!
//! Given a list of tensors produced by ReductionOp, create a new
//! GroupedReductionOp expression that takes the input tensors of the
//! original reductions and produces the given tensors, replacing
//! their defining expressions. Note that this mutates the fusion IR
//! in place: after the call, each tensor in \p reduction_outputs is
//! defined by the single GroupedReductionOp rather than its original
//! ReductionOp.
//!
//! GroupedReductionOp works just like ReductionOp with a potential
//! benefit of aggregating synchronizations across individual
//! reductions. See the reduction::gridReduce2 runtime function for a
//! two-input version of grid reduction.
//!
//! The grouped reductions must follow several constraints, which
//! include:
//! - There must not exist any data dependency between individual
//! reductions (i.e., no reduction may consume, directly or
//! transitively, the output of another reduction in the group).
//! - All reduction output tensors must have the same number of
//! dimensions, the same transformations and the same axes to
//! reduce.
//!
//! Note that Welford is not allowed yet, though it should be
//! technically straightforward to support horizontal fusions of
//! welford ops. Unclear how common it would be in practice, though.
//!
//! \param reduction_outputs Tensors produced by ReductionOp whose
//! definitions will be replaced by one GroupedReductionOp
TORCH_CUDA_CU_API void groupReductions(
    const std::vector<TensorView*>& reduction_outputs);
37
38} // namespace cuda
39} // namespace fuser
40} // namespace jit
41} // namespace torch
42