/*******************************************************************************
* Copyright 2017-2020 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef CPU_X64_CPU_REDUCER_HPP
#define CPU_X64_CPU_REDUCER_HPP

#include <assert.h>

#include "common/c_types_map.hpp"
#include "common/dnnl_thread.hpp"
#include "common/memory_tracking.hpp"
#include "common/nstl.hpp"
#include "common/type_helpers.hpp"
#include "common/utils.hpp"
#include "oneapi/dnnl/dnnl_types.h"

#include "cpu/x64/cpu_barrier.hpp"

namespace dnnl {
namespace impl {
namespace cpu {
namespace x64 {

/** class to perform balancing over a 3D array
 *
 * Conceptually the reduction happens according to the picture below:
 *
 *    <--job_size->
 *    +-----------+   +-----------+           +-----------+  ^
 *    |           |   |           |           |           |  |
 *    |           |   |           |           |           |  |
 *    |     1     |   |     2     |   . . .   |   njobs   |  |  reduction_size
 *    |           |   |           |           |           |  |
 *    |           |   |           |           |           |  |
 *    +-----------+   +-----------+           +-----------+  v
 *
 *      |   |   |       |   |   |               |   |   |
 *      v   v   v       v   v   v               v   v   v
 *    =====================================================  vertical reduction
 *
 *    +-----------+   +-----------+   . . .   +-----------+  result
 *
 * In the simple case the result must be contiguous in memory;
 * @class cpu_reducer_t is an implementation of that case.
 *
 * Threads are divided into groups. The groups are independent of each other.
 * Each group may work on several jobs (the distribution is not uniform, since
 * njobs might not be a multiple of the number of groups). Threads within a
 * group work on different parts of the reduction dimension. Thread 0 in each
 * group is called the master (@sa reduce_balancer_t::master()).
 *
 * If the threading driver does not allow synchronization among a sub-group of
 * threads (e.g. TBB), the number of threads per group is forced to be 1.
 */
struct reduce_balancer_t {
    reduce_balancer_t() { init(1, 1, 1, 1, 0); } /* trivial balance */
    reduce_balancer_t(int nthr, int job_size, int njobs, int reduction_size,
            size_t max_buffer_size, bool lock_free = false) {
        init(nthr, job_size, njobs, reduction_size, max_buffer_size,
                lock_free);
    }

    reduce_balancer_t &init(int nthr, int job_size, int njobs,
            int reduction_size, size_t max_buffer_size,
            bool lock_free = false) {
        allow_nthr_in_group_ = lock_free ? true : dnnl_thr_syncable();
        nthr_ = nthr;
        job_size_ = job_size;
        njobs_ = njobs;
        reduction_size_ = reduction_size;
        max_buffer_size_ = max_buffer_size;
        balance();
        return *this;
    }

    bool allow_nthr_in_group_;
    int nthr_;
    int job_size_, njobs_, reduction_size_;

    int ngroups_; /**< number of independent work (thread) groups */
    int nthr_per_group_; /**< number of threads within a single work group */
    int njobs_per_group_ub_; /**< the max # of jobs within a work group */

    bool master(int ithr) const { return id_in_group(ithr) == 0; }
    bool idle(int ithr) const { return ithr >= nthr_per_group_ * ngroups_; }

    int group_id(int ithr) const { return ithr / nthr_per_group_; }
    int id_in_group(int ithr) const { return ithr % nthr_per_group_; }

    int grp_njobs(int grp) const {
        if (grp >= ngroups_) return 0;
        return njobs_ / ngroups_ + (grp < njobs_ % ngroups_);
    }
    int grp_job_off(int grp) const {
        if (grp >= ngroups_) return njobs_;
        return njobs_ / ngroups_ * grp + nstl::min(grp, njobs_ % ngroups_);
    }
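
    /* Example (illustrative numbers only): with njobs_ = 7 and ngroups_ = 3,
     * the njobs_ % ngroups_ = 1 extra job goes to the first group, so
     * grp_njobs() yields {3, 2, 2} and grp_job_off() yields {0, 3, 5}:
     * each group owns a contiguous range of jobs. */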

    int ithr_njobs(int ithr) const { return grp_njobs(group_id(ithr)); }
    int ithr_job_off(int ithr) const { return grp_job_off(group_id(ithr)); }

private:
    size_t max_buffer_size_;
    void balance();
};
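
/* A minimal usage sketch (illustrative only: `nthr`, `ithr`, `job_size`,
 * `njobs`, `reduction_size`, `max_buffer_size`, and `process_job` are
 * hypothetical and not part of this interface). A thread locates its group
 * and the contiguous job range that the group owns:
 *
 *     reduce_balancer_t balancer(
 *             nthr, job_size, njobs, reduction_size, max_buffer_size);
 *     if (!balancer.idle(ithr)) {
 *         const int grp = balancer.group_id(ithr);
 *         const int job_off = balancer.grp_job_off(grp);
 *         for (int j = 0; j < balancer.grp_njobs(grp); ++j)
 *             process_job(job_off + j); // the reduction dimension of each
 *                                       // job is split among the group
 *     }
 */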

/** forward declaration of reduce driver */
template <impl::data_type_t data_type>
struct reducer_2d_driver_t;

/** class to perform a reduction over a 3D array
 *
 * Balancing is based on @class reduce_balancer_t.
 * Restriction: the result of the reduction must be contiguous in memory.
 *
 * The reduction happens according to the picture below (once more):
 *
 *    <--job_size->
 *    +-----------+   +-----------+           +-----------+  ^
 *    |           |   |           |           |           |  |
 *    |           |   |           |           |           |  |
 *    |     1     |   |     2     |   . . .   |   njobs   |  |  reduction_size
 *    |           |   |           |           |           |  |
 *    |           |   |           |           |           |  |
 *    +-----------+   +-----------+           +-----------+  v
 *
 *      |   |   |       |   |   |               |   |   |
 *      v   v   v       v   v   v               v   v   v
 *    =====================================================  vertical reduction
 *
 *    +-----------+   +-----------+   . . .   +-----------+  (contiguous) result
 *
 * An example of how work might be shared is shown below.
 *
 * In this example group 0 owns 2 (independent) jobs -- 2 big squares.
 * The number of threads per group is also 2 (thread 0 of group 0 and thread 1
 * of group 0). Master threads (i.e. threads with id 0 in the corresponding
 * group) from each group put their partial results directly into the
 * destination memory, while all the other threads within the group use the
 * workspace (in the picture, only thread 1). Once the intermediate results
 * are obtained, each group reduces its own part (its own jobs) into the
 * destination memory.
 *
 *    <------- group 0 ------->
 *
 *    +-----------+   +-----------+  ^
 *    |           |   |           |  |  thread 0 of   reduces to the dest-memory
 *    |           |   |           |  |  group 0       +-----------+   +-----------+
 *    |- - - - - -|   |- - - - - -|  X
 *    |           |   |           |  |  thread 1 of   reduces to workspace[tid=1]:
 *    |           |   |           |  |  group 0       +-----------+   +-----------+
 *    +-----------+   +-----------+  v
 *      |   |   |       |   |   |
 *      v   v   v       v   v   v
 *    ((barrier))       =============================
 *
 *    dest-memory:      +-----------+   +-----------+
 */
template <impl::data_type_t data_type>
struct cpu_reducer_t {
    typedef typename prec_traits<data_type>::type data_t;

    struct conf_t {
        conf_t() = default;
        conf_t &init(const reduce_balancer_t &balancer) {
            balancer_ = balancer;
            return *this;
        }

        void init_scratchpad(memory_tracking::registrar_t &scratchpad) const;

        reduce_balancer_t balancer_;
    };

    status_t create_kernel();

    cpu_reducer_t(const conf_t &conf);
    ~cpu_reducer_t();

    /** initializes the reducer.
     * Must be called from a single thread prior to actual usage. */
    void init(const memory_tracking::grantor_t &scratchpad) const {
        if (balancer().nthr_per_group_ == 1 || !dnnl_thr_syncable()) return;

        auto bctx = scratchpad.template get<simple_barrier::ctx_t>(
                memory_tracking::names::key_reducer_space_bctx);
        for (int i = 0; i < balancer().ngroups_; ++i)
            simple_barrier::ctx_init(&bctx[i]);
    }

    /** for a given thread returns the pointer where to put partial results.
     * The reduction destination @p dst must be provided as well (master
     * threads from each group use it for their partial results to reduce
     * memory pressure).
     *
     * @note the job offset is already applied by get_local_ptr(), which means
     *       all threads should start writing from the very beginning of the
     *       returned address.
     */
    data_t *get_local_ptr(int ithr, data_t *dst,
            const memory_tracking::grantor_t &scratchpad) const;

    /** performs the reduction with built-in synchronization. */
    void reduce(int ithr, data_t *dst,
            const memory_tracking::grantor_t &scratchpad) const {
        bool redundant_reduction
                = balancer().nthr_per_group_ == 1 || balancer().idle(ithr);
        if (redundant_reduction) return;

        auto bctx = scratchpad.template get<simple_barrier::ctx_t>(
                memory_tracking::names::key_reducer_space_bctx);
        simple_barrier::barrier(
                &bctx[balancer().group_id(ithr)], balancer().nthr_per_group_);

        reduce_nolock(ithr, dst, scratchpad);
    }

    void reduce_nolock(int ithr, data_t *dst,
            const memory_tracking::grantor_t &scratchpad) const;

    const reduce_balancer_t &balancer() const { return conf_.balancer_; }

private:
    static size_t space_per_thread(const reduce_balancer_t &balancer) {
        return balancer.njobs_per_group_ub_ * balancer.job_size_;
    }

    /* The scratchpad is organized as follows:
     *
     *   data_t space[nthr_][njobs_per_group_ub_][job_size_];
     *   simple_barrier::ctx_t barriers[ngroups_]; */

    const conf_t conf_;
    reducer_2d_driver_t<data_type> *drv_;

    DNNL_DISALLOW_COPY_AND_ASSIGN(cpu_reducer_t);
};
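
/* A minimal usage sketch (illustrative only: `scratchpad`, `dst`, `nthr`,
 * and `compute_partial` are hypothetical, and the per-job workspace layout
 * of job_size_-strided chunks is an assumption based on space_per_thread()):
 *
 *     cpu_reducer_t<data_type::f32> reducer(conf);
 *     reducer.create_kernel();
 *     reducer.init(scratchpad); // single thread, before the parallel region
 *     parallel(nthr, [&](int ithr, int nthr) {
 *         float *ws = reducer.get_local_ptr(ithr, dst, scratchpad);
 *         const int njobs = reducer.balancer().ithr_njobs(ithr);
 *         for (int j = 0; j < njobs; ++j) // local index; offset pre-applied
 *             compute_partial(ithr, j, ws + j * reducer.balancer().job_size_);
 *         reducer.reduce(ithr, dst, scratchpad); // barrier + reduction
 *     });
 */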

template <impl::data_type_t data_type>
struct cpu_reducer_2d_t {
    typedef typename prec_traits<data_type>::type data_t;

    struct conf_t {
        conf_t() = default;
        conf_t &init(const reduce_balancer_t &balancer, int job_size_x,
                int job_size_y, int x_block, int dst_x, int dst_y) {
            balancer_ = balancer;
            job_size_x_ = job_size_x;
            job_size_y_ = job_size_y;
            x_block_ = x_block;
            dst_x_ = dst_x;
            dst_y_ = dst_y;
            return *this;
        }

        void init_scratchpad(memory_tracking::registrar_t &scratchpad) const;

        reduce_balancer_t balancer_;
        int job_size_x_, job_size_y_, x_block_, dst_x_, dst_y_;
    };

    status_t create_kernel();

    cpu_reducer_2d_t(const conf_t &conf);
    ~cpu_reducer_2d_t();

    /** initializes the reducer.
     * Must be called from a single thread prior to actual usage. */
    void init(const memory_tracking::grantor_t &scratchpad) const {
        if (balancer().nthr_per_group_ == 1 || !dnnl_thr_syncable()) return;

        auto bctx = scratchpad.template get<simple_barrier::ctx_t>(
                memory_tracking::names::key_reducer_space_bctx);
        for (int i = 0; i < balancer().ngroups_; ++i)
            simple_barrier::ctx_init(&bctx[i]);
    }

    /** for a given thread returns the pointer where to put partial results */
    data_t *get_local_ptr(
            int ithr, const memory_tracking::grantor_t &scratchpad) const;

    /** performs the reduction with built-in synchronization. */
    void reduce(int ithr, data_t *dst,
            const memory_tracking::grantor_t &scratchpad) const {
        bool redundant_reduction
                = balancer().nthr_per_group_ == 1 || balancer().idle(ithr);
        if (redundant_reduction) return;

        auto bctx = scratchpad.template get<simple_barrier::ctx_t>(
                memory_tracking::names::key_reducer_space_bctx);
        simple_barrier::barrier(
                &bctx[balancer().group_id(ithr)], balancer().nthr_per_group_);

        reduce_nolock(ithr, dst, scratchpad);
    }

    void reduce_nolock(int ithr, data_t *dst,
            const memory_tracking::grantor_t &scratchpad) const;

    const reduce_balancer_t &balancer() const { return conf_.balancer_; }

private:
    static size_t space_per_thread(const reduce_balancer_t &balancer) {
        return balancer.njobs_per_group_ub_ * balancer.job_size_;
    }

    /* The scratchpad is organized as follows:
     *
     *   data_t space[nthr_][njobs_per_group_ub_][job_size_];
     *   simple_barrier::ctx_t barriers[ngroups_]; */

    const conf_t conf_;
    reducer_2d_driver_t<data_type> *drv_;

    int choose_x_blocking(int nx, int ny, int nthr_per_grp) const;
    void reduce_block(const data_t *space_base, data_t *dst, int job,
            int start_y, int start_x, int ny_start, int nx_start, int ny_step,
            int nx_step) const;

    DNNL_DISALLOW_COPY_AND_ASSIGN(cpu_reducer_2d_t);
};
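
/* A configuration sketch for the 2D variant (illustrative values only; the
 * relation job_size == job_size_x * job_size_y is an assumption inferred
 * from the parameter names, not stated by this header). Each job is then a
 * job_size_y x job_size_x tile, processed in x_block-wide chunks and reduced
 * into a dst_y x dst_x destination:
 *
 *     typename cpu_reducer_2d_t<data_type::f32>::conf_t conf;
 *     conf.init(reduce_balancer_t(nthr, job_size_x * job_size_y, njobs,
 *                       reduction_size, max_buffer_size),
 *             job_size_x, job_size_y, x_block, dst_x, dst_y);
 *     cpu_reducer_2d_t<data_type::f32> reducer(conf);
 */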

/** simple 1d accumulator: y[:] += x[:] */
template <impl::data_type_t data_type>
struct cpu_accumulator_1d_t {
    typedef typename prec_traits<data_type>::type data_t;

    cpu_accumulator_1d_t();
    ~cpu_accumulator_1d_t();
    void accumulate(data_t *dst, const data_t *src, size_t size);

    status_t create_kernel();

    reducer_2d_driver_t<data_type> *drv_;

    DNNL_DISALLOW_COPY_AND_ASSIGN(cpu_accumulator_1d_t);
};
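
/* A minimal usage sketch (illustrative only: `dst`, `src`, and `size` are
 * hypothetical buffers and a length supplied by the caller):
 *
 *     cpu_accumulator_1d_t<data_type::f32> acc;
 *     acc.create_kernel(); // generate the kernel before first use
 *     acc.accumulate(dst, src, size); // dst[0:size] += src[0:size]
 */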

} // namespace x64
} // namespace cpu
} // namespace impl
} // namespace dnnl

#endif

// vim: et ts=4 sw=4 cindent cino+=l0,\:4,N-s