/*******************************************************************************
* Copyright 2017-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include <assert.h>

#include "oneapi/dnnl/dnnl_types.h"

#include "common/dnnl_thread.hpp"
#include "common/nstl.hpp"
#include "common/utils.hpp"

#include "cpu/platform.hpp"

#include "cpu/x64/cpu_reducer.hpp"

namespace dnnl {
namespace impl {
namespace cpu {
namespace x64 {

using namespace memory_tracking::names;
void reduce_balancer_t::balance() {
    using namespace nstl;
    using namespace utils;

    assert(nthr_ > 0 && job_size_ > 0 && njobs_ > 0 && reduction_size_ > 0);

    const int job_complexity = 1;

    const int min_njobs_per_group = max(1, njobs_ / nthr_);
    const int max_njobs_per_group
            = max(1, static_cast<int>(max_buffer_size_ / (nthr_ * job_size_)));

    /* initial guess */
    int ngroups = min(njobs_ / min_njobs_per_group, nthr_);
    int nthr_per_group
            = allow_nthr_in_group_ ? min(nthr_ / ngroups, reduction_size_) : 1;
    int njobs_per_group_ub = div_up(njobs_, ngroups);

    /* rough upper-bound estimate; refined by the brute-force search below */
    size_t thread_complexity_ub = (size_t)njobs_ * job_size_ * reduction_size_;

    /* brute-force search over the parameters for the best balance... */
    for (int c_njobs_per_group = min_njobs_per_group;
            c_njobs_per_group < njobs_; ++c_njobs_per_group) {
        /* current candidate */
        int c_ngroups = min(njobs_ / c_njobs_per_group, nthr_);
        int c_nthr_per_group = allow_nthr_in_group_
                ? min(nthr_ / c_ngroups, reduction_size_)
                : 1;
        int c_njobs_per_group_ub = div_up(njobs_, c_ngroups);

        if (c_nthr_per_group > 1 && c_njobs_per_group_ub > max_njobs_per_group)
            continue;

        int c_thread_reduction_ub = div_up(reduction_size_, c_nthr_per_group);
        size_t c_group_size_ub = (size_t)job_size_ * c_njobs_per_group_ub;
        size_t c_thread_complexity_ub = c_group_size_ub
                * (job_complexity * c_thread_reduction_ub
                        + (c_nthr_per_group != 1));

        if (c_thread_complexity_ub < thread_complexity_ub) {
            ngroups = c_ngroups;
            nthr_per_group = c_nthr_per_group;
            njobs_per_group_ub = c_njobs_per_group_ub;
            thread_complexity_ub = c_thread_complexity_ub;
        }
    }

    assert(njobs_per_group_ub <= max_njobs_per_group || nthr_per_group == 1);
    assert(ngroups * nthr_per_group <= nthr_);
    assert((size_t)njobs_per_group_ub * job_size_ * nthr_ <= max_buffer_size_
            || nthr_per_group == 1); /* no reduction buffer overflow */
    assert(IMPLICATION(!allow_nthr_in_group_, nthr_per_group == 1));

    ngroups_ = ngroups;
    nthr_per_group_ = nthr_per_group;
    njobs_per_group_ub_ = njobs_per_group_ub;
}
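
/* illustration (editorial, not from the original source): with nthr_ = 8,
 * njobs_ = 4, a large reduction_size_, and ample max_buffer_size_, the
 * search settles on ngroups_ = 4 groups of nthr_per_group_ = 2 threads,
 * i.e. one job per group with the reduction dimension split in two */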

/* reducer jit-ted driver */

using namespace Xbyak;
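
/* reducer_2d_driver_t jit kernel contract: for a tile of ny rows by nx
 * elements, compute dst = (or +=, depending on nullify_dst) the sum over
 * s in [0, n_src) of the s-th source, where consecutive sources are
 * src_ld elements apart and dst/src advance by dst_step/src_step elements
 * between consecutive rows */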
template <impl::data_type_t data_type>
struct reducer_2d_driver_t : public jit_generator {
    using data_t = typename prec_traits<data_type>::type;

    reducer_2d_driver_t(int n_src, size_t src_ld, size_t src_step,
            size_t dst_step, bool nullify_dst, const char *name)
        : jit_generator(name)
        , n_src_(n_src)
        , src_ld_(src_ld)
        , src_step_(src_step)
        , dst_step_(dst_step)
        , nullify_dst_(nullify_dst) {}
    virtual void operator()(
            data_t *dst, const data_t *srcs, size_t ny, size_t nx)
            = 0;

protected:
    int n_src_;
    size_t src_ld_, src_step_, dst_step_;
    bool nullify_dst_;
};

template <impl::data_type_t data_type, cpu_isa_t isa>
struct reducer_2d_driver_f_s_32_t : public reducer_2d_driver_t<data_type> {
    DECLARE_CPU_JIT_AUX_FUNCTIONS(reducer_2d_driver_f_s_32_t)

    using data_t = typename prec_traits<data_type>::type;

    void operator()(
            data_t *dst, const data_t *srcs, size_t ny, size_t nx) override {
        jit_generator::operator()(dst, srcs, ny, nx);
    }

    /* cpu specific part */
    using Vmm = typename utils::conditional<isa == avx2, Ymm, Zmm>::type;
    const AddressFrame &vmmword = (isa == avx2) ? this->yword : this->zword;
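    /* data-type dispatched adds: f32 -> vaddps / addss, s32 -> vpaddd /
     * paddd (vector and scalar variants respectively) */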
    void uni_vadd(const Xmm &x1, const Xmm &x2, const Operand &op) {
        if (data_type == data_type::f32)
            this->vaddps(x1, x2, op);
        else
            this->vpaddd(x1, x2, op);
    }
    void uni_add(const Xmm &x1, const Operand &op) {
        if (data_type == data_type::f32)
            this->addss(x1, op);
        else
            this->paddd(x1, op);
    }

    const int vlen = cpu_isa_traits<isa>::vlen;
    const int typesize
            = sizeof(typename dnnl::impl::prec_traits<data_type>::type);
    Xbyak::Reg64 reg_dst = abi_param1;
    Xbyak::Reg64 reg_src = abi_param2;
    Xbyak::Reg64 reg_ny = abi_param3;
    Xbyak::Reg64 reg_nx = abi_param4;

    Xbyak::Reg64 reg_x = this->rax;
    Xbyak::Reg64 reg_src_id = this->r10;
    Xbyak::Reg64 reg_long_offt = this->r11;

    reducer_2d_driver_f_s_32_t(int n_src, size_t src_ld, size_t src_step,
            size_t dst_step, bool nullify_dst)
        : reducer_2d_driver_t<data_type>(
                n_src, src_ld, src_step, dst_step, nullify_dst, jit_name()) {}

    void nullify_dst(int nloads, int load_len) {
        UNUSED(load_len);
        for (int i = 0; i < nloads; ++i)
            this->uni_vpxor(Vmm(i), Vmm(i), Vmm(i));
        /* prefetches[dst] ? */
    }
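
    /* load_dst/store_dst move either one full vector register per
     * accumulator (load_len == vlen) or a single element
     * (load_len == typesize) */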
    void load_dst(int nloads, int load_len) {
        for (int i = 0; i < nloads; ++i) {
            if (load_len == typesize)
                this->movd(Xmm(i), this->ptr[reg_dst + i * load_len]);
            else if (load_len == vlen)
                this->vmovups(Vmm(i), this->ptr[reg_dst + i * load_len]);
            else
                assert(!"unsupported");
        }
    }

    void store_dst(int nloads, int load_len) {
        for (int i = 0; i < nloads; ++i) {
            if (load_len == typesize)
                this->movd(this->ptr[reg_dst + i * load_len], Xmm(i));
            else if (load_len == vlen)
                this->vmovups(this->ptr[reg_dst + i * load_len], Vmm(i));
            else
                assert(!"unsupported");
        }
    }

    void accumulate(int nloads, int load_len, size_t base_off) {
        for (int i = 0; i < nloads; ++i) {
            size_t off = base_off + i * load_len;

            if (load_len == typesize)
                this->uni_add(Xmm(i), this->ptr[reg_src + off]);
            else if (load_len == vlen)
                this->uni_vadd(Vmm(i), Vmm(i), vmmword[reg_src + off]);
            else
                assert(!"unsupported");
        }
    }
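
    /* loop_x walks one row of nx bytes in three progressively finer
     * passes: full blocks of n_vregs vectors, then single vectors, then
     * single elements, so an arbitrary nx needs no separate scalar tail
     * handling elsewhere */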
    void loop_x() {
        const int nloads[] = {cpu_isa_traits<isa>::n_vregs, 1, 1};
        const int nbranches = sizeof(nloads) / sizeof(nloads[0]);

        const int load_len[nbranches] = {vlen, vlen, typesize};
        Label loop_x_label[nbranches + 1];

        this->mov(reg_x, reg_nx);

        for (int id = 0; id < nbranches; ++id) {
            this->L(loop_x_label[id]);

            this->cmp(reg_x, nloads[id] * load_len[id]);
            this->jl(loop_x_label[id + 1], this->T_NEAR);

            if (this->nullify_dst_)
                nullify_dst(nloads[id], load_len[id]);
            else
                load_dst(nloads[id], load_len[id]);

            if (nloads[id] > 1) {
                Label loop_srcs;
                this->mov(reg_src_id, this->n_src_);
                this->L(loop_srcs);

                accumulate(nloads[id], load_len[id], 0);
                this->add(reg_src, this->src_ld_ * typesize);

                this->dec(reg_src_id);
                this->jnz(loop_srcs, this->T_NEAR);

                size_t base_off
                        = (size_t)this->n_src_ * this->src_ld_ * typesize;
                this->safe_sub(reg_src, base_off, reg_long_offt);
            } else {
                for (int src_id = 0; src_id < this->n_src_; ++src_id) {
                    const size_t base_off
                            = (size_t)src_id * this->src_ld_ * typesize;
                    accumulate(nloads[id], load_len[id], base_off);
                }
            }

            store_dst(nloads[id], load_len[id]);

            this->add(reg_src, nloads[id] * load_len[id]);
            this->add(reg_dst, nloads[id] * load_len[id]);

            this->sub(reg_x, nloads[id] * load_len[id]);

            this->jmp(loop_x_label[id], this->T_NEAR);
        }

        this->L(loop_x_label[nbranches]);

        /* restore address registers */
        this->sub(reg_src, reg_nx);
        this->sub(reg_dst, reg_nx);
    }

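    /* generate() emits the kernel: an outer loop over ny rows, each row
     * reduced by loop_x(), with dst and src advanced by dst_step_ and
     * src_step_ elements between rows */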
    void generate() override {
        static_assert(isa == avx2 || isa == avx512_core, "unsupported CPU ISA");

        this->preamble();

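        /* reg_nx: element count -> byte count (typesize == 4 for both
         * f32 and s32) */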
        this->shl(reg_nx, 2);

        Label ny_loop;
        this->L(ny_loop);

        loop_x();

        this->add(reg_dst, this->dst_step_ * typesize);
        this->add(reg_src, this->src_step_ * typesize);

        this->dec(reg_ny);
        this->jnz(ny_loop, this->T_NEAR);

        this->postamble();
    }
};

template <impl::data_type_t data_type>
inline reducer_2d_driver_t<data_type> *create_reduce_2d_drv(int n_src,
        size_t src_ld, size_t src_step, size_t dst_step, bool nullify_dst) {
    if (mayiuse(avx512_core))
        return new reducer_2d_driver_f_s_32_t<data_type, avx512_core>(
                n_src, src_ld, src_step, dst_step, nullify_dst);
    else if (mayiuse(avx2))
        return new reducer_2d_driver_f_s_32_t<data_type, avx2>(
                n_src, src_ld, src_step, dst_step, nullify_dst);
    assert(!"unimplemented");
    return nullptr;
}

/* cpu_reducer_t */

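/* cpu_reducer_t: each thread group folds its partial results into the
 * jobs it owns. Thread 0 of a group accumulates directly in dst, so the
 * scratchpad only holds the (nthr_per_group_ - 1) remaining partial
 * buffers per group, plus one barrier context per group */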
template <impl::data_type_t data_type>
void cpu_reducer_t<data_type>::conf_t::init_scratchpad(
        memory_tracking::registrar_t &scratchpad) const {
    if (balancer_.nthr_per_group_ == 1) return;

    const size_t space_size = balancer_.ngroups_
            * (balancer_.nthr_per_group_ - 1)
            * cpu_reducer_t<data_type>::space_per_thread(balancer_);
    scratchpad.book<data_t>(key_reducer_space, space_size, PAGE_4K);
    scratchpad.book<simple_barrier::ctx_t>(
            key_reducer_space_bctx, balancer_.ngroups_);
}

template <impl::data_type_t data_type>
cpu_reducer_t<data_type>::cpu_reducer_t(const conf_t &conf)
    : conf_(conf), drv_(nullptr) {
    if (balancer().nthr_per_group_ == 1) return;

    drv_ = create_reduce_2d_drv<data_type>(balancer().nthr_per_group_ - 1,
            space_per_thread(balancer()), 0, 0, false);
}

template <impl::data_type_t data_type>
cpu_reducer_t<data_type>::~cpu_reducer_t() {
    delete drv_;
}

template <impl::data_type_t data_type>
status_t cpu_reducer_t<data_type>::create_kernel() {
    return (drv_) ? drv_->create_kernel() : status::success;
}

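/* scratchpad layout: the per-thread partial buffers are contiguous,
 * indexed by group id and by (id_in_group - 1); thread 0 of each group
 * needs no scratch buffer of its own */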
template <impl::data_type_t data_type>
typename cpu_reducer_t<data_type>::data_t *
cpu_reducer_t<data_type>::get_local_ptr(int ithr, data_t *dst,
        const memory_tracking::grantor_t &scratchpad) const {
    const int id_in_grp = balancer().id_in_group(ithr);

    /* thread 0 of each group writes directly to the destination */
    if (id_in_grp == 0)
        return dst + balancer().ithr_job_off(ithr) * balancer().job_size_;

    const int grp_id = balancer().group_id(ithr);
    const int offset_factor
            = grp_id * (balancer().nthr_per_group_ - 1) + (id_in_grp - 1);

    auto space = scratchpad.template get<data_t>(key_reducer_space);
    return space + offset_factor * space_per_thread(balancer());
}

template <impl::data_type_t data_type>
void cpu_reducer_t<data_type>::reduce_nolock(int ithr, data_t *dst,
        const memory_tracking::grantor_t &scratchpad) const {
    bool redundant_reduction
            = balancer().nthr_per_group_ == 1 || balancer().idle(ithr);
    if (redundant_reduction) return;

#ifdef SIMPLE_IMPL
    if (balancer().id_in_group(ithr) != 0)
        return; /* only thread 0 of each group does the reduction */

    const int njobs_in_grp = balancer().ithr_njobs(ithr);
    data_t *d = get_local_ptr(ithr, dst, scratchpad);
    for (int id_in_grp = 1; id_in_grp < balancer().nthr_per_group_;
            ++id_in_grp) {
        const data_t *space = get_local_ptr(ithr + id_in_grp, dst, scratchpad);
        for (size_t i = 0; i < (size_t)njobs_in_grp * balancer().job_size_; ++i)
            d[i] += space[i];
    }
#else
    using namespace utils;
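
    /* jitted path: every thread in the group takes a cache-line-aligned
     * slice of the group's output and sums all (nthr_per_group_ - 1)
     * scratch buffers into thread 0's destination over that slice */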
    const int id_in_grp = balancer().id_in_group(ithr);
    const int njobs_in_grp = balancer().ithr_njobs(ithr);
    const size_t cl = 64 / sizeof(data_t);

    const size_t reduction_size = njobs_in_grp * balancer().job_size_;
    size_t start {0}, end {0};
    balance211(div_up(reduction_size, cl), balancer().nthr_per_group_,
            id_in_grp, start, end);

    if (start == end) return;

    data_t *d = get_local_ptr(ithr - id_in_grp, dst, scratchpad) + start * cl;
    const data_t *space
            = get_local_ptr(ithr - id_in_grp + 1, dst, scratchpad) + start * cl;
    const size_t len = nstl::min(end * cl, reduction_size) - start * cl;

    (*drv_)(d, space, 1, len);
#endif
}

template struct cpu_reducer_t<data_type::f32>;
template struct cpu_reducer_t<data_type::s32>;

/* cpu_reducer_2d_t */

template <impl::data_type_t data_type>
void cpu_reducer_2d_t<data_type>::conf_t::init_scratchpad(
        memory_tracking::registrar_t &scratchpad) const {
    if (balancer_.nthr_per_group_ == 1) return;

    const size_t space_size = balancer_.ngroups_ * balancer_.nthr_per_group_
            * cpu_reducer_2d_t<data_type>::space_per_thread(balancer_);
    scratchpad.book<data_t>(key_reducer_space, space_size);
    scratchpad.book<simple_barrier::ctx_t>(
            key_reducer_space_bctx, balancer_.ngroups_);
}

template <impl::data_type_t data_type>
cpu_reducer_2d_t<data_type>::cpu_reducer_2d_t(const conf_t &conf)
    : conf_(conf), drv_(nullptr) {
    if (balancer().nthr_per_group_ == 1) return;

    drv_ = create_reduce_2d_drv<data_type>(balancer().nthr_per_group_,
            space_per_thread(balancer()), conf_.job_size_x_, conf_.dst_x_,
            true);
}

template <impl::data_type_t data_type>
cpu_reducer_2d_t<data_type>::~cpu_reducer_2d_t() {
    delete drv_;
}

template <impl::data_type_t data_type>
status_t cpu_reducer_2d_t<data_type>::create_kernel() {
    return (drv_) ? drv_->create_kernel() : status::success;
}

template <impl::data_type_t data_type>
typename cpu_reducer_2d_t<data_type>::data_t *
cpu_reducer_2d_t<data_type>::get_local_ptr(
        int ithr, const memory_tracking::grantor_t &scratchpad) const {
    const int id_in_grp = balancer().id_in_group(ithr);
    const int grp_id = balancer().group_id(ithr);
    const int offset_factor = grp_id * balancer().nthr_per_group_ + id_in_grp;
    auto space = scratchpad.template get<data_t>(key_reducer_space);
    return space + offset_factor * space_per_thread(balancer());
}

template <impl::data_type_t data_type>
int cpu_reducer_2d_t<data_type>::choose_x_blocking(
        int nx, int ny, int nthr_per_grp) const {
    // choose an x blocking that balances the reduction work across threads
    assert(conf_.x_block_ > 0 && nx > conf_.x_block_
            && nx % conf_.x_block_ == 0);
    int x_blocking = nx / conf_.x_block_;
    int min_x_blocking
            = utils::div_up(x_blocking, nstl::max(1, nthr_per_grp / ny));
    while (true) {
        if (x_blocking % 2 == 0 && x_blocking >= min_x_blocking * 2)
            x_blocking /= 2;
        else if (x_blocking % 3 == 0 && x_blocking >= min_x_blocking * 3)
            x_blocking /= 3;
        else
            break;
    }
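    /* if the remaining factor is still far from the balance target, give
     * up on even division and fall back to the finest granularity */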
    if (x_blocking >= min_x_blocking * 4) x_blocking = 1;
    x_blocking *= conf_.x_block_;
    return x_blocking;
}

template <impl::data_type_t data_type>
void cpu_reducer_2d_t<data_type>::reduce_block(const data_t *space_base,
        data_t *dst, int job, int start_y, int start_x, int ny_start,
        int nx_start, int ny_step, int nx_step) const {
    data_t *d = dst + (start_y + ny_start) * conf_.dst_x_ + start_x + nx_start;
    const data_t *space = space_base + (size_t)job * balancer().job_size_
            + (size_t)ny_start * conf_.job_size_x_ + nx_start;
#ifdef SIMPLE_IMPL
    for (int idg = 0; idg < balancer().nthr_per_group_; ++idg) {
        const data_t *w = &space[idg * space_per_thread(balancer())];
        for (int y = 0; y < ny_step; ++y)
            for (int x = 0; x < nx_step; ++x) {
                d[y * conf_.dst_x_ + x]
                        = (idg == 0 ? 0 : d[y * conf_.dst_x_ + x])
                        + w[y * conf_.job_size_x_ + x];
            }
    }
#else
    (*drv_)(d, space, ny_step, nx_step);
#endif
}

template <impl::data_type_t data_type>
void cpu_reducer_2d_t<data_type>::reduce_nolock(int ithr, data_t *dst,
        const memory_tracking::grantor_t &scratchpad) const {
    bool redundant_reduction
            = balancer().nthr_per_group_ == 1 || balancer().idle(ithr);
    if (redundant_reduction) return;

    const int id_in_grp = balancer().id_in_group(ithr);
    const int njobs_in_grp = balancer().ithr_njobs(ithr);
    const int njobs_x = utils::div_up(conf_.dst_x_, conf_.job_size_x_);
    const int global_job_start = balancer().ithr_job_off(ithr);

    const data_t *space_base = get_local_ptr(ithr - id_in_grp, scratchpad);

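    /* re-partition the group's threads for the reduction phase: pr_grps
     * sub-groups (at most one per job) of pr_nthr_per_grp threads each.
     * A sub-group handles a contiguous range of jobs; within a job, each
     * thread's [nxy_start, nxy_end) range of the ny x nx tile is covered
     * by up to three reduce_block calls: a partial leading row, a block
     * of whole rows, and a partial trailing row */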
    const int pr_grps = nstl::min(njobs_in_grp, balancer().nthr_per_group_);
    const int pr_nthr_per_grp = balancer().nthr_per_group_ / pr_grps;

    if (id_in_grp >= pr_grps * pr_nthr_per_grp) return; /* idle */

    const int pr_my_grp = id_in_grp / pr_nthr_per_grp;
    const int pr_my_id = id_in_grp % pr_nthr_per_grp;

    int pr_job_start {0}, pr_job_end {0};
    balance211(njobs_in_grp, pr_grps, pr_my_grp, pr_job_start, pr_job_end);

    for (int j = pr_job_start; j < pr_job_end; ++j) {
        const int global_job = global_job_start + j;
        const int j_y = global_job / njobs_x;
        const int j_x = global_job % njobs_x;
        const int start_y = j_y * conf_.job_size_y_;
        const int start_x = j_x * conf_.job_size_x_;
        const int ny = nstl::min(conf_.dst_y_ - start_y, conf_.job_size_y_);
        const int nx = nstl::min(conf_.dst_x_ - start_x, conf_.job_size_x_);
        int x_blocking = choose_x_blocking(nx, ny, pr_nthr_per_grp);

        int nxy_start {0}, nxy_end {0};
        balance211(ny * nx / x_blocking, pr_nthr_per_grp, pr_my_id, nxy_start,
                nxy_end);
        if (nxy_start == nxy_end) continue;
        nxy_start *= x_blocking;
        nxy_end *= x_blocking;

        int nxy = nxy_start;
        if (nxy % nx != 0) {
            int nx_step = nstl::min(nx - nxy % nx, nxy_end - nxy);
            reduce_block(space_base, dst, j, start_y, start_x, nxy / nx,
                    nxy % nx, 1, nx_step);
            nxy += nx_step;
        }
        if ((nxy_end - nxy) > nx) {
            int ny_step = (nxy_end - nxy) / nx;
            reduce_block(space_base, dst, j, start_y, start_x, nxy / nx,
                    nxy % nx, ny_step, nx);
            nxy += nx * ny_step;
        }
        if ((nxy_end - nxy) > 0) {
            reduce_block(space_base, dst, j, start_y, start_x, nxy / nx,
                    nxy % nx, 1, nxy_end - nxy);
        }
    }
}

template struct cpu_reducer_2d_t<data_type::f32>;
template struct cpu_reducer_2d_t<data_type::s32>;

/* accumulator section */

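/* cpu_accumulator_1d_t is a thin wrapper over a single-source driver:
 * accumulate() performs dst[0..size) += src[0..size) in one jitted call */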
template <impl::data_type_t data_type>
cpu_accumulator_1d_t<data_type>::cpu_accumulator_1d_t() : drv_(nullptr) {
    drv_ = create_reduce_2d_drv<data_type>(1, 0, 0, 0, false);
}

template <impl::data_type_t data_type>
cpu_accumulator_1d_t<data_type>::~cpu_accumulator_1d_t() {
    delete drv_;
}

template <impl::data_type_t data_type>
status_t cpu_accumulator_1d_t<data_type>::create_kernel() {
    return drv_->create_kernel();
}

template <impl::data_type_t data_type>
void cpu_accumulator_1d_t<data_type>::accumulate(
        data_t *dst, const data_t *src, size_t size) {
    (*drv_)(dst, src, 1, size);
}

template struct cpu_accumulator_1d_t<data_type::f32>;
template struct cpu_accumulator_1d_t<data_type::s32>;

} // namespace x64
} // namespace cpu
} // namespace impl
} // namespace dnnl

// vim: et ts=4 sw=4 cindent cino+=l0,\:4,N-s