/*******************************************************************************
* Copyright 2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_JIT_CODEGEN_REDUCE_HPP
#define GPU_JIT_CODEGEN_REDUCE_HPP

#include "gpu/jit/codegen/register_scope.hpp"
#include "gpu/jit/codegen/reorder.hpp"
#include "gpu/jit/ir/reduce.hpp"
#include "gpu/jit/ngen/ngen.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace jit {

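// Emits element-wise accumulation of a source register buffer into a
// destination register buffer: dst(tile) += src(tile) for every tile, where
// destination dimensions of size 1 are the dimensions being reduced.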
class reduce_impl_t {
public:
    reduce_impl_t(ngen::HW hw, const reduce_t &reduce, int simd_size)
        : hw_(hw)
        , src_layout_(reduce.src_layout)
        , dst_layout_(reduce.dst_layout)
        , simd_size_(simd_size) {}

    template <typename GeneratorT>
    void emit(GeneratorT *host, ngen_register_scope_t &scope,
            const reg_buf_data_t &src_rd, const reg_buf_data_t &dst_rd) {
        auto &src_type = src_layout_.type();
        auto &dst_type = dst_layout_.type();

        bool is_inplace = (src_rd.base() == dst_rd.base()
                && src_rd.byte_offset() == dst_rd.byte_offset());
        if (is_inplace) {
            ir_assert(src_type == dst_type)
                    << "Inplace operation is supported for the same type only.";
        }

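        // For in-place reduction, track which destination offsets have already
        // been visited: the first update of a destination element must come
        // from the source element at the same offset.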
        std::vector<bool> seen(src_layout_.size() * src_type.size());

        tensor_t tile = find_1d_tile(src_layout_, dst_layout_);
        int tile_elems = (int)tile.elems();
        auto src_tile_layout = src_layout_.map(tile);
        auto src_tile_blocks = src_tile_layout.blocks();
        ir_assert(src_tile_blocks.size() <= 1);
        ngen_register_scope_t block_scope(scope.register_allocator());
        int src_stride
                = src_tile_blocks.empty() ? 1 : (int)src_tile_blocks[0].stride;
        int grf_size = ngen::GRF::bytes(hw_);
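        // Accumulate the source into the destination one 1D tile at a time.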
        src_layout_.for_each_tile(
                tile, [&](const std::vector<dim_t> &src_start) {
                    ngen_register_scope_t tile_scope(
                            scope.register_allocator());
                    auto dst_start = src_start;
                    for (int i = 0; i < dst_layout_.ndims(); i++) {
                        if (dst_layout_.dims()[i] == 1) dst_start[i] = 0;
                    }
                    int src_off = int(src_layout_(src_start) * src_type.size());
                    int dst_off = int(dst_layout_(dst_start) * dst_type.size());

                    if (is_inplace) {
                        bool same_src_dst = (dst_off == src_off);
                        if (!seen[dst_off] && !same_src_dst) {
                            ir_error_not_expected()
                                    << "Invalid inplace reduction.";
                        }
                        seen[dst_off] = true;
                        if (same_src_dst) return;
                    }

                    auto d = dst_rd.format(
                            dst_off, to_ngen(dst_type), tile_elems, 1);
                    auto s = src_rd.format(
                            src_off, to_ngen(src_type), tile_elems, src_stride);
                    bool s_half_grf_aligned
                            = utils::one_of(s.byte_offset(), 0, grf_size / 2);
                    bool s_is_bf = src_type.is_bf16();

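                    // The source must be dense, and a bf16 source must start
                    // at a GRF or half-GRF boundary; otherwise reorder it into
                    // a packed temporary first.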
                    if (src_stride != 1 || (s_is_bf && !s_half_grf_aligned)) {
                        auto tmp_type = src_type;
                        if ((d.offset() != 0 || !s_half_grf_aligned)
                                && s_is_bf) {
                            tmp_type = type_t::f32();
                        }
                        auto tmp = tile_scope.alloc_reg_data(
                                tmp_type.with_elems(tile_elems));
                        emit_reorder_1d_tile(hw_, host, tile_scope, tile_elems,
                                s, src_stride, tmp, 1);
                        s = tmp.format(0, to_ngen(tmp_type), tile_elems, 1);
                    }
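                    // Align source and destination sub-register offsets, then
                    // accumulate: d += s.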
                    align_src_dst_offset(host, tile_scope, tile_elems, d, s);
                    host->add(tile_elems, d.reg_data(), d.reg_data(),
                            s.reg_data());
                });
    }

private:
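    // Finds the largest innermost 1D tile common to both layouts that a
    // single add instruction can process per step.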
    tensor_t find_1d_tile(layout_t a, layout_t b) const {
        layout_t::align_layouts(a, b);

        ir_assert(!a.blocks().empty());
        ir_assert(!b.blocks().empty());

        auto &a0 = a.blocks()[0];
        auto &b0 = b.blocks()[0];

        bool ok = (a0.dim_idx == b0.dim_idx && a0.block == b0.block);
        if (!ok) {
            // Try to match strided layout.
            if (a0.block == 2) {
                auto a_blocks = a.blocks();
                a_blocks.erase(a_blocks.begin());
                a = layout_t(a.type(), a.ndims(), 0, a_blocks);
                return find_1d_tile(a, b);
            }
        }

        ir_assert(ok) << "Incompatible layouts for reduction.";
        ir_assert(dim_t(b0.stride) == 1)
                << "Reduction is not supported for non-unit dst stride.";

        int grf_size = ngen::GRF::bytes(hw_);
        int a_grf_elems = grf_size / a.type().size();
        int b_grf_elems = grf_size / b.type().size();

        int min_step = std::min(a_grf_elems, b_grf_elems);
        int max_step = 2 * min_step;

        min_step = std::min(std::min(simd_size_, min_step), (int)a0.block);

        ir_assert(a0.block % min_step == 0) << "Reduction is not supported.";

        std::vector<dim_t> tile_dims(src_layout_.ndims(), 1);
        tile_dims[a0.dim_idx]
                = ir_utils::max_divisor(int(a0.block), {min_step, max_step});

        return tensor_t(tile_dims);
    }

    ngen::HW hw_;
    layout_t src_layout_;
    layout_t dst_layout_;
    int simd_size_;
};

} // namespace jit
} // namespace gpu
} // namespace impl
} // namespace dnnl

#endif