1 | /******************************************************************************* |
2 | * Copyright 2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | #ifndef GPU_JIT_CODEGEN_REDUCE_HPP |
18 | #define GPU_JIT_CODEGEN_REDUCE_HPP |
19 | |
20 | #include "gpu/jit/codegen/register_scope.hpp" |
21 | #include "gpu/jit/codegen/reorder.hpp" |
22 | #include "gpu/jit/ir/reduce.hpp" |
23 | #include "gpu/jit/ngen/ngen.hpp" |
24 | |
25 | namespace dnnl { |
26 | namespace impl { |
27 | namespace gpu { |
28 | namespace jit { |
29 | |
// Emits GPU code for a reduction: accumulates a source tensor into a
// (smaller) destination tensor element-wise (dst += src), where reduced
// dimensions are the ones with size 1 in the destination layout.
class reduce_impl_t {
public:
    // hw        - target GPU generation (used to query the GRF byte size).
    // reduce    - IR reduce operation carrying the src/dst layouts.
    // simd_size - SIMD width; upper-bounds the per-instruction tile size.
    reduce_impl_t(ngen::HW hw, const reduce_t &reduce, int simd_size)
        : hw_(hw)
        , src_layout_(reduce.src_layout)
        , dst_layout_(reduce.dst_layout)
        , simd_size_(simd_size) {}

    // Walks the source layout in 1D tiles and emits one `add` per tile,
    // accumulating into the destination register buffer.
    // host   - nGen-based generator the instructions are emitted through.
    // scope  - register scope used to allocate temporaries (per-tile
    //          sub-scopes release temporaries after each tile).
    // src_rd - register buffer holding the source data.
    // dst_rd - register buffer holding the destination accumulator; may
    //          alias src_rd (in-place reduction), see below.
    template <typename GeneratorT>
    void emit(GeneratorT *host, ngen_register_scope_t &scope,
            const reg_buf_data_t &src_rd, const reg_buf_data_t &dst_rd) {
        auto &src_type = src_layout_.type();
        auto &dst_type = dst_layout_.type();

        // In-place reduction: src and dst share the same registers.
        bool is_inplace = (src_rd.base() == dst_rd.base()
                && src_rd.byte_offset() == dst_rd.byte_offset());
        if (is_inplace) {
            ir_assert(src_type == dst_type)
                    << "Inplace operation is supported for the same type only.";
        }

        // Tracks which destination byte offsets were already initialized by
        // their identity (src_off == dst_off) tile, indexed by dst_off.
        // NOTE(review): sized as src_layout_.size() * src_type.size() — this
        // assumes it covers every possible dst_off byte offset (dst being no
        // larger than src in a reduction); confirm the units of
        // layout_t::size() against its definition.
        std::vector<bool> seen(src_layout_.size() * src_type.size());

        // Choose the widest 1D tile that both layouts can be iterated with.
        tensor_t tile = find_1d_tile(src_layout_, dst_layout_);
        int tile_elems = (int)tile.elems();
        auto src_tile_layout = src_layout_.map(tile);
        auto src_tile_blocks = src_tile_layout.blocks();
        ir_assert(src_tile_blocks.size() <= 1);
        ngen_register_scope_t block_scope(scope.register_allocator());
        // Element stride of the source within one tile (1 if dense).
        int src_stride
                = src_tile_blocks.empty() ? 1 : (int)src_tile_blocks[0].stride;
        int grf_size = ngen::GRF::bytes(hw_);
        src_layout_.for_each_tile(
                tile, [&](const std::vector<dim_t> &src_start) {
                    ngen_register_scope_t tile_scope(
                            scope.register_allocator());
                    // Project the source coordinates onto the destination:
                    // reduced dimensions (size 1 in dst) collapse to 0.
                    auto dst_start = src_start;
                    for (int i = 0; i < dst_layout_.ndims(); i++) {
                        if (dst_layout_.dims()[i] == 1) dst_start[i] = 0;
                    }
                    int src_off = int(src_layout_(src_start) * src_type.size());
                    int dst_off = int(dst_layout_(dst_start) * dst_type.size());

                    if (is_inplace) {
                        // For in-place reduction the first tile hitting a
                        // destination offset must be the identity tile
                        // (src_off == dst_off); that tile is skipped (the
                        // data is already in place) and later tiles
                        // accumulate on top of it.
                        bool same_src_dst = (dst_off == src_off);
                        if (!seen[dst_off] && !same_src_dst) {
                            ir_error_not_expected()
                                    << "Invalid inplace reduction.";
                        }
                        seen[dst_off] = true;
                        if (same_src_dst) return;
                    }

                    // Region descriptors for this tile's operands.
                    auto d = dst_rd.format(
                            dst_off, to_ngen(dst_type), tile_elems, 1);
                    auto s = src_rd.format(
                            src_off, to_ngen(src_type), tile_elems, src_stride);
                    // bf16 source must start at a GRF or half-GRF boundary;
                    // presumably a hardware region restriction — otherwise it
                    // is reordered into a temporary first.
                    bool s_half_grf_aligned
                            = utils::one_of(s.byte_offset(), 0, grf_size / 2);
                    bool s_is_bf = src_type.is_bf16();

                    if (src_stride != 1 || (s_is_bf && !s_half_grf_aligned)) {
                        // Densify (stride 1) and/or realign the source via a
                        // reorder into a temporary. Misaligned bf16 is
                        // up-converted to f32 in the process.
                        auto tmp_type = src_type;
                        if ((d.offset() != 0 || !s_half_grf_aligned)
                                && s_is_bf) {
                            tmp_type = type_t::f32();
                        }
                        auto tmp = tile_scope.alloc_reg_data(
                                tmp_type.with_elems(tile_elems));
                        emit_reorder_1d_tile(hw_, host, tile_scope, tile_elems,
                                s, src_stride, tmp, 1);
                        s = tmp.format(0, to_ngen(tmp_type), tile_elems, 1);
                    }
                    // Make src/dst sub-register offsets compatible for the
                    // add, then accumulate: d += s.
                    align_src_dst_offset(host, tile_scope, tile_elems, d, s);
                    host->add(tile_elems, d.reg_data(), d.reg_data(),
                            s.reg_data());
                });
    }

private:
    // Returns the widest 1D tile (along the shared innermost blocked
    // dimension) that both layouts a (source) and b (destination) can be
    // traversed with. Takes copies because the layouts are normalized and
    // possibly modified in the strided fallback below.
    tensor_t find_1d_tile(layout_t a, layout_t b) const {
        layout_t::align_layouts(a, b);

        ir_assert(!a.blocks().empty());
        ir_assert(!b.blocks().empty());

        auto &a0 = a.blocks()[0];
        auto &b0 = b.blocks()[0];

        // The innermost blocks must describe the same dimension and size.
        bool ok = (a0.dim_idx == b0.dim_idx && a0.block == b0.block);
        if (!ok) {
            // Try to match strided layout.
            // Drop a leading 2-element block of the source and retry; the
            // resulting tile then walks the source with a non-unit stride
            // (handled by the reorder path in emit()).
            if (a0.block == 2) {
                auto a_blocks = a.blocks();
                a_blocks.erase(a_blocks.begin());
                a = layout_t(a.type(), a.ndims(), 0, a_blocks);
                return find_1d_tile(a, b);
            }
        }

        ir_assert(ok) << "Incompatible layouts for reduction.";
        ir_assert(dim_t(b0.stride) == 1)
                << "Reduction is not supported for non-unit dst stride.";

        // Bound the tile so each operand spans at most two GRFs.
        int grf_size = ngen::GRF::bytes(hw_);
        int a_grf_elems = grf_size / a.type().size();
        int b_grf_elems = grf_size / b.type().size();

        int min_step = std::min(a_grf_elems, b_grf_elems);
        int max_step = 2 * min_step;

        min_step = std::min(std::min(simd_size_, min_step), (int)a0.block);

        ir_assert(a0.block % min_step == 0) << "Reduction is not supported.";

        // Largest divisor of the innermost block within [min_step, max_step].
        std::vector<dim_t> tile_dims(src_layout_.ndims(), 1);
        tile_dims[a0.dim_idx]
                = ir_utils::max_divisor(int(a0.block), {min_step, max_step});

        return tensor_t(tile_dims);
    }

    ngen::HW hw_; // Target hardware generation.
    layout_t src_layout_; // Layout of the tensor being reduced.
    layout_t dst_layout_; // Layout of the accumulator (reduced dims == 1).
    int simd_size_; // SIMD width bound for per-instruction tiles.
};
157 | |
158 | } // namespace jit |
159 | } // namespace gpu |
160 | } // namespace impl |
161 | } // namespace dnnl |
162 | |
163 | #endif |
164 | |