1#pragma once
2
3#include <ir_all_nodes.h>
4
5namespace torch {
6namespace jit {
7namespace fuser {
8namespace cuda {
9
//! Horizontally fuse multiple independent reductions.
//!
//! Given a list of tensors produced by ReductionOp, create a new
//! GroupedReductionOp expression that takes the input tensors of the
//! original reductions and produces the given tensors, replacing
//! their defining expressions. Note that this mutates the fusion IR
//! in place: after the call, each tensor in \p reduction_outputs is
//! defined by the single GroupedReductionOp rather than its original
//! ReductionOp.
//!
//! GroupedReductionOp works just like ReductionOp with a potential
//! benefit of aggregating synchronizations across individual
//! reductions. See the reduction::gridReduce2 runtime function for a
//! two-input version of grid reduction.
//!
//! The grouped reductions must follow several constraints, which
//! include:
//! - There must not exist any data dependency between individual
//! reductions (i.e., no reduction may consume, directly or
//! transitively, the output of another reduction in the group).
//! - All reduction output tensors must have the same number of
//! dimensions, the same transformations and the same axes to
//! reduce.
//!
//! Note that Welford is not allowed yet, though it should be
//! technically straightforward to support horizontal fusions of
//! welford ops. Unclear how common it would be in practice, though.
//!
//! \param reduction_outputs Tensors produced by ReductionOp whose
//! definitions will be replaced by one GroupedReductionOp
TORCH_CUDA_CU_API void groupReductions(
    const std::vector<TensorView*>& reduction_outputs);
37
38} // namespace cuda
39} // namespace fuser
40} // namespace jit
41} // namespace torch
42