/*******************************************************************************
* Copyright 2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_JIT_CODEGEN_REDUCE_HPP
#define GPU_JIT_CODEGEN_REDUCE_HPP

#include "gpu/jit/codegen/register_scope.hpp"
#include "gpu/jit/codegen/reorder.hpp"
#include "gpu/jit/ir/reduce.hpp"
#include "gpu/jit/ngen/ngen.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace jit {

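// Emits element-wise accumulation of a source register buffer into a
// destination register buffer: dst(tile) += src(tile) for every tile, where
// destination dimensions of size 1 are the dimensions being reduced.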
class reduce_impl_t {
public:
    reduce_impl_t(ngen::HW hw, const reduce_t &reduce, int simd_size)
        : hw_(hw)
        , src_layout_(reduce.src_layout)
        , dst_layout_(reduce.dst_layout)
        , simd_size_(simd_size) {}

    template <typename GeneratorT>
    void emit(GeneratorT *host, ngen_register_scope_t &scope,
            const reg_buf_data_t &src_rd, const reg_buf_data_t &dst_rd) {
        auto &src_type = src_layout_.type();
        auto &dst_type = dst_layout_.type();

        bool is_inplace = (src_rd.base() == dst_rd.base()
                && src_rd.byte_offset() == dst_rd.byte_offset());
        if (is_inplace) {
            ir_assert(src_type == dst_type)
                    << "Inplace operation is supported for the same type only.";
        }

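        // For in-place reduction, track which destination offsets have already
        // been visited: the first update of a destination element must come
        // from the source element at the same offset.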
        std::vector<bool> seen(src_layout_.size() * src_type.size());

        tensor_t tile = find_1d_tile(src_layout_, dst_layout_);
        int tile_elems = (int)tile.elems();
        auto src_tile_layout = src_layout_.map(tile);
        auto src_tile_blocks = src_tile_layout.blocks();
        ir_assert(src_tile_blocks.size() <= 1);
        ngen_register_scope_t block_scope(scope.register_allocator());
        int src_stride
                = src_tile_blocks.empty() ? 1 : (int)src_tile_blocks[0].stride;
        int grf_size = ngen::GRF::bytes(hw_);
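        // Accumulate the source into the destination one 1D tile at a time.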
        src_layout_.for_each_tile(
                tile, [&](const std::vector<dim_t> &src_start) {
                    ngen_register_scope_t tile_scope(
                            scope.register_allocator());
                    auto dst_start = src_start;
                    for (int i = 0; i < dst_layout_.ndims(); i++) {
                        if (dst_layout_.dims()[i] == 1) dst_start[i] = 0;
                    }
                    int src_off = int(src_layout_(src_start) * src_type.size());
                    int dst_off = int(dst_layout_(dst_start) * dst_type.size());

                    if (is_inplace) {
                        bool same_src_dst = (dst_off == src_off);
                        if (!seen[dst_off] && !same_src_dst) {
                            ir_error_not_expected()
                                    << "Invalid inplace reduction.";
                        }
                        seen[dst_off] = true;
                        if (same_src_dst) return;
                    }

                    auto d = dst_rd.format(
                            dst_off, to_ngen(dst_type), tile_elems, 1);
                    auto s = src_rd.format(
                            src_off, to_ngen(src_type), tile_elems, src_stride);
                    bool s_half_grf_aligned
                            = utils::one_of(s.byte_offset(), 0, grf_size / 2);
                    bool s_is_bf = src_type.is_bf16();

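                    // The source must be dense, and a bf16 source must start
                    // at a GRF or half-GRF boundary; otherwise reorder it into
                    // a packed temporary first.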
                    if (src_stride != 1 || (s_is_bf && !s_half_grf_aligned)) {
                        auto tmp_type = src_type;
                        if ((d.offset() != 0 || !s_half_grf_aligned)
                                && s_is_bf) {
                            tmp_type = type_t::f32();
                        }
                        auto tmp = tile_scope.alloc_reg_data(
                                tmp_type.with_elems(tile_elems));
                        emit_reorder_1d_tile(hw_, host, tile_scope, tile_elems,
                                s, src_stride, tmp, 1);
                        s = tmp.format(0, to_ngen(tmp_type), tile_elems, 1);
                    }
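                    // Align source and destination sub-register offsets, then
                    // accumulate: d += s.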
                    align_src_dst_offset(host, tile_scope, tile_elems, d, s);
                    host->add(tile_elems, d.reg_data(), d.reg_data(),
                            s.reg_data());
                });
    }

private:
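    // Finds the largest innermost 1D tile common to both layouts that a
    // single add instruction can process per step.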
    tensor_t find_1d_tile(layout_t a, layout_t b) const {
        layout_t::align_layouts(a, b);

        ir_assert(!a.blocks().empty());
        ir_assert(!b.blocks().empty());

        auto &a0 = a.blocks()[0];
        auto &b0 = b.blocks()[0];

        bool ok = (a0.dim_idx == b0.dim_idx && a0.block == b0.block);
        if (!ok) {
            // Try to match strided layout.
            if (a0.block == 2) {
                auto a_blocks = a.blocks();
                a_blocks.erase(a_blocks.begin());
                a = layout_t(a.type(), a.ndims(), 0, a_blocks);
                return find_1d_tile(a, b);
            }
        }

        ir_assert(ok) << "Incompatible layouts for reduction.";
        ir_assert(dim_t(b0.stride) == 1)
                << "Reduction is not supported for non-unit dst stride.";

        int grf_size = ngen::GRF::bytes(hw_);
        int a_grf_elems = grf_size / a.type().size();
        int b_grf_elems = grf_size / b.type().size();

        int min_step = std::min(a_grf_elems, b_grf_elems);
        int max_step = 2 * min_step;

        min_step = std::min(std::min(simd_size_, min_step), (int)a0.block);

        ir_assert(a0.block % min_step == 0) << "Reduction is not supported.";

        std::vector<dim_t> tile_dims(src_layout_.ndims(), 1);
        tile_dims[a0.dim_idx]
                = ir_utils::max_divisor(int(a0.block), {min_step, max_step});

        return tensor_t(tile_dims);
    }

    ngen::HW hw_;
    layout_t src_layout_;
    layout_t dst_layout_;
    int simd_size_;
};

} // namespace jit
} // namespace gpu
} // namespace impl
} // namespace dnnl

#endif