1 | /******************************************************************************* |
2 | * Copyright 2017-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | #include <assert.h> |
18 | |
19 | #include "oneapi/dnnl/dnnl_types.h" |
20 | |
21 | #include "common/dnnl_thread.hpp" |
22 | #include "common/nstl.hpp" |
23 | #include "common/utils.hpp" |
24 | |
25 | #include "cpu/platform.hpp" |
26 | |
27 | #include "cpu/x64/cpu_reducer.hpp" |
28 | |
29 | namespace dnnl { |
30 | namespace impl { |
31 | namespace cpu { |
32 | namespace x64 { |
33 | |
34 | using namespace memory_tracking::names; |
35 | |
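/* Splits nthr_ threads into ngroups_ groups of nthr_per_group_ threads each.
 * A group is assigned at most njobs_per_group_ub_ jobs; the threads of a
 * group split the reduction dimension among themselves and later sum their
 * partial results, whose total scratch size is kept under max_buffer_size_. */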
36 | void reduce_balancer_t::balance() { |
37 | using namespace nstl; |
38 | using namespace utils; |
39 | |
40 | assert(nthr_ > 0 && job_size_ > 0 && njobs_ > 0 && reduction_size_ > 0); |
41 | |
42 | const int job_complexity = 1; |
43 | |
44 | const int min_njobs_per_group = max(1, njobs_ / nthr_); |
45 | const int max_njobs_per_group |
46 | = max(1, static_cast<int>(max_buffer_size_ / (nthr_ * job_size_))); |
47 | |
48 | /* initial guess */ |
49 | int ngroups = min(njobs_ / min_njobs_per_group, nthr_); |
50 | int nthr_per_group |
51 | = allow_nthr_in_group_ ? min(nthr_ / ngroups, reduction_size_) : 1; |
52 | int njobs_per_group_ub = div_up(njobs_, ngroups); |
53 | |
    /* rough upper-bound estimate; refined by the brute-force search below */
55 | size_t thread_complexity_ub = (size_t)njobs_ * job_size_ * reduction_size_; |
56 | |
    /* brute-force search over the parameters for the best balance */
58 | for (int c_njobs_per_group = min_njobs_per_group; |
59 | c_njobs_per_group < njobs_; ++c_njobs_per_group) { |
60 | /* current assumption */ |
61 | int c_ngroups = min(njobs_ / c_njobs_per_group, nthr_); |
62 | int c_nthr_per_group = allow_nthr_in_group_ |
63 | ? min(nthr_ / c_ngroups, reduction_size_) |
64 | : 1; |
65 | int c_njobs_per_group_ub = div_up(njobs_, c_ngroups); |
66 | |
67 | if (c_nthr_per_group > 1 && c_njobs_per_group_ub > max_njobs_per_group) |
68 | continue; |
69 | |
70 | int c_thread_reduction_ub = div_up(reduction_size_, c_nthr_per_group); |
71 | size_t c_group_size_ub = (size_t)job_size_ * c_njobs_per_group_ub; |
72 | size_t c_thread_complexity_ub = c_group_size_ub |
73 | * (job_complexity * c_thread_reduction_ub |
74 | + (c_nthr_per_group != 1)); |
75 | |
76 | if (c_thread_complexity_ub < thread_complexity_ub) { |
77 | ngroups = c_ngroups; |
78 | nthr_per_group = c_nthr_per_group; |
79 | njobs_per_group_ub = c_njobs_per_group_ub; |
80 | thread_complexity_ub = c_thread_complexity_ub; |
81 | } |
82 | } |
83 | |
84 | assert(njobs_per_group_ub <= max_njobs_per_group || nthr_per_group == 1); |
85 | assert(ngroups * nthr_per_group <= nthr_); |
86 | assert((size_t)njobs_per_group_ub * job_size_ * nthr_ <= max_buffer_size_ |
87 | || nthr_per_group == 1); /* no reduction buffer overflow */ |
88 | assert(IMPLICATION(!allow_nthr_in_group_, nthr_per_group == 1)); |
89 | |
90 | ngroups_ = ngroups; |
91 | nthr_per_group_ = nthr_per_group; |
92 | njobs_per_group_ub_ = njobs_per_group_ub; |
93 | } |
94 | |
95 | /* reducer jit-ted driver */ |
96 | |
97 | using namespace Xbyak; |
98 | |
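/* reducer_2d_driver_t generates a kernel with the signature
 *   (*this)(dst, srcs, ny, nx)
 * that accumulates n_src_ source blocks into dst: for each of the ny rows it
 * sums n_src_ rows of nx elements (consecutive sources are src_ld_ elements
 * apart) into the corresponding dst row, optionally zeroing dst first
 * (nullify_dst_); dst_step_ / src_step_ advance the row pointers. */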
99 | template <impl::data_type_t data_type> |
100 | struct reducer_2d_driver_t : public jit_generator { |
101 | using data_t = typename prec_traits<data_type>::type; |
102 | |
103 | reducer_2d_driver_t(int n_src, size_t src_ld, size_t src_step, |
104 | size_t dst_step, bool nullify_dst, const char *name) |
105 | : jit_generator(name) |
106 | , n_src_(n_src) |
107 | , src_ld_(src_ld) |
108 | , src_step_(src_step) |
109 | , dst_step_(dst_step) |
110 | , nullify_dst_(nullify_dst) {} |
111 | virtual void operator()( |
112 | data_t *dst, const data_t *srcs, size_t ny, size_t nx) |
113 | = 0; |
114 | |
115 | protected: |
116 | int n_src_; |
117 | size_t src_ld_, src_step_, dst_step_; |
118 | bool nullify_dst_; |
119 | }; |
120 | |
121 | template <impl::data_type_t data_type, cpu_isa_t isa> |
122 | struct reducer_2d_driver_f_s_32_t : public reducer_2d_driver_t<data_type> { |
123 | DECLARE_CPU_JIT_AUX_FUNCTIONS(reducer_2d_driver_f_s_32_t) |
124 | |
125 | using data_t = typename prec_traits<data_type>::type; |
126 | |
127 | void operator()( |
128 | data_t *dst, const data_t *srcs, size_t ny, size_t nx) override { |
129 | jit_generator::operator()(dst, srcs, ny, nx); |
130 | } |
131 | |
    /* CPU-specific part */
133 | using Vmm = typename utils::conditional<isa == avx2, Ymm, Zmm>::type; |
134 | const AddressFrame &vmmword = (isa == avx2) ? this->yword : this->zword; |
135 | void uni_vadd(const Xmm &x1, const Xmm &x2, const Operand &op) { |
136 | if (data_type == data_type::f32) |
137 | this->vaddps(x1, x2, op); |
138 | else |
139 | this->vpaddd(x1, x2, op); |
140 | } |
141 | void uni_add(const Xmm &x1, const Operand &op) { |
142 | if (data_type == data_type::f32) |
143 | this->addss(x1, op); |
144 | else |
145 | this->paddd(x1, op); |
146 | } |
147 | |
148 | const int vlen = cpu_isa_traits<isa>::vlen; |
149 | const int typesize |
150 | = sizeof(typename dnnl::impl::prec_traits<data_type>::type); |
151 | Xbyak::Reg64 reg_dst = abi_param1; |
152 | Xbyak::Reg64 reg_src = abi_param2; |
153 | Xbyak::Reg64 reg_ny = abi_param3; |
154 | Xbyak::Reg64 reg_nx = abi_param4; |
155 | |
156 | Xbyak::Reg64 reg_x = this->rax; |
157 | Xbyak::Reg64 reg_src_id = this->r10; |
158 | Xbyak::Reg64 reg_long_offt = this->r11; |
159 | |
160 | reducer_2d_driver_f_s_32_t(int n_src, size_t src_ld, size_t src_step, |
161 | size_t dst_step, bool nullify_dst) |
162 | : reducer_2d_driver_t<data_type>( |
163 | n_src, src_ld, src_step, dst_step, nullify_dst, jit_name()) {} |
164 | |
165 | void nullify_dst(int nloads, int load_len) { |
166 | UNUSED(load_len); |
167 | for (int i = 0; i < nloads; ++i) |
168 | this->uni_vpxor(Vmm(i), Vmm(i), Vmm(i)); |
169 | /* prefetches[dst] ? */ |
170 | } |
171 | |
172 | void load_dst(int nloads, int load_len) { |
173 | for (int i = 0; i < nloads; ++i) { |
174 | if (load_len == typesize) |
175 | this->movd(Xmm(i), this->ptr[reg_dst + i * load_len]); |
176 | else if (load_len == vlen) |
177 | this->vmovups(Vmm(i), this->ptr[reg_dst + i * load_len]); |
178 | else |
179 | assert(!"unsupported" ); |
180 | } |
181 | } |
182 | |
183 | void store_dst(int nloads, int load_len) { |
184 | for (int i = 0; i < nloads; ++i) { |
185 | if (load_len == typesize) |
186 | this->movd(this->ptr[reg_dst + i * load_len], Xmm(i)); |
187 | else if (load_len == vlen) |
188 | this->vmovups(this->ptr[reg_dst + i * load_len], Vmm(i)); |
189 | else |
190 | assert(!"unsupported" ); |
191 | } |
192 | } |
193 | |
194 | void accumulate(int nloads, int load_len, size_t base_off) { |
195 | for (int i = 0; i < nloads; ++i) { |
196 | size_t off = base_off + i * load_len; |
197 | |
198 | if (load_len == typesize) |
199 | this->uni_add(Xmm(i), this->ptr[reg_src + off]); |
200 | else if (load_len == vlen) |
201 | this->uni_vadd(Vmm(i), Vmm(i), vmmword[reg_src + off]); |
202 | else |
203 | assert(!"unsupported" ); |
204 | } |
205 | } |
206 | |
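    /* Processes the nx elements of a row in three stages: chunks of n_vregs
     * full vectors, then single vectors, then a scalar tail (see nloads[]
     * and load_len[] below). */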
207 | void loop_x() { |
208 | const int nloads[] = {cpu_isa_traits<isa>::n_vregs, 1, 1}; |
209 | const int nbranches = sizeof(nloads) / sizeof(nloads[0]); |
210 | |
211 | const int load_len[nbranches] = {vlen, vlen, typesize}; |
212 | Label loop_x_label[nbranches + 1]; |
213 | |
214 | this->mov(reg_x, reg_nx); |
215 | |
216 | for (int id = 0; id < nbranches; ++id) { |
217 | this->L(loop_x_label[id]); |
218 | |
219 | this->cmp(reg_x, nloads[id] * load_len[id]); |
220 | this->jl(loop_x_label[id + 1], this->T_NEAR); |
221 | |
222 | if (this->nullify_dst_) |
223 | nullify_dst(nloads[id], load_len[id]); |
224 | else |
225 | load_dst(nloads[id], load_len[id]); |
226 | |
227 | if (nloads[id] > 1) { |
228 | Label loop_srcs; |
229 | this->mov(reg_src_id, this->n_src_); |
230 | this->L(loop_srcs); |
231 | |
232 | accumulate(nloads[id], load_len[id], 0); |
233 | this->add(reg_src, this->src_ld_ * typesize); |
234 | |
235 | this->dec(reg_src_id); |
236 | this->jnz(loop_srcs, this->T_NEAR); |
237 | |
238 | size_t base_off |
239 | = (size_t)this->n_src_ * this->src_ld_ * typesize; |
240 | this->safe_sub(reg_src, base_off, reg_long_offt); |
241 | } else { |
242 | for (int src_id = 0; src_id < this->n_src_; ++src_id) { |
243 | const size_t base_off |
244 | = (size_t)src_id * this->src_ld_ * typesize; |
245 | accumulate(nloads[id], load_len[id], base_off); |
246 | } |
247 | } |
248 | |
249 | store_dst(nloads[id], load_len[id]); |
250 | |
251 | this->add(reg_src, nloads[id] * load_len[id]); |
252 | this->add(reg_dst, nloads[id] * load_len[id]); |
253 | |
254 | this->sub(reg_x, nloads[id] * load_len[id]); |
255 | |
256 | this->jmp(loop_x_label[id], this->T_NEAR); |
257 | } |
258 | |
259 | this->L(loop_x_label[nbranches]); |
260 | |
261 | /* restore address registers */ |
262 | this->sub(reg_src, reg_nx); |
263 | this->sub(reg_dst, reg_nx); |
264 | } |
265 | |
266 | void generate() override { |
        static_assert(isa == avx2 || isa == avx512_core, "unsupported CPU ISA");
268 | |
269 | this->preamble(); |
270 | |
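        /* reg_nx arrives in elements; convert it to bytes
         * (typesize == 4 for both f32 and s32) */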
271 | this->shl(reg_nx, 2); |
272 | |
273 | Label ny_loop; |
274 | this->L(ny_loop); |
275 | |
276 | loop_x(); |
277 | |
278 | this->add(reg_dst, this->dst_step_ * typesize); |
279 | this->add(reg_src, this->src_step_ * typesize); |
280 | |
281 | this->dec(reg_ny); |
282 | this->jnz(ny_loop, this->T_NEAR); |
283 | |
284 | this->postamble(); |
285 | } |
286 | }; |
287 | |
288 | template <impl::data_type_t data_type> |
289 | inline reducer_2d_driver_t<data_type> *create_reduce_2d_drv(int n_src, |
290 | size_t src_ld, size_t src_step, size_t dst_step, bool nullify_dst) { |
291 | if (mayiuse(avx512_core)) |
292 | return new reducer_2d_driver_f_s_32_t<data_type, avx512_core>( |
293 | n_src, src_ld, src_step, dst_step, nullify_dst); |
294 | else if (mayiuse(avx2)) |
295 | return new reducer_2d_driver_f_s_32_t<data_type, avx2>( |
296 | n_src, src_ld, src_step, dst_step, nullify_dst); |
297 | assert(!"unimplemented" ); |
298 | return nullptr; |
299 | } |
300 | |
301 | /* cpu_reducer_t */ |
302 | |
303 | template <impl::data_type_t data_type> |
304 | void cpu_reducer_t<data_type>::conf_t::init_scratchpad( |
305 | memory_tracking::registrar_t &scratchpad) const { |
306 | if (balancer_.nthr_per_group_ == 1) return; |
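    /* thread 0 of each group accumulates directly into dst, so only the
     * remaining nthr_per_group_ - 1 threads need private space per group
     * (see get_local_ptr) */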
307 | |
308 | const size_t space_size = balancer_.ngroups_ |
309 | * (balancer_.nthr_per_group_ - 1) |
310 | * cpu_reducer_t<data_type>::space_per_thread(balancer_); |
311 | scratchpad.book<data_t>(key_reducer_space, space_size, PAGE_4K); |
312 | scratchpad.book<simple_barrier::ctx_t>( |
313 | key_reducer_space_bctx, balancer_.ngroups_); |
314 | } |
315 | |
316 | template <impl::data_type_t data_type> |
317 | cpu_reducer_t<data_type>::cpu_reducer_t(const conf_t &conf) |
318 | : conf_(conf), drv_(nullptr) { |
319 | if (balancer().nthr_per_group_ == 1) return; |
320 | |
321 | drv_ = create_reduce_2d_drv<data_type>(balancer().nthr_per_group_ - 1, |
322 | space_per_thread(balancer()), 0, 0, false); |
323 | } |
324 | |
325 | template <impl::data_type_t data_type> |
326 | cpu_reducer_t<data_type>::~cpu_reducer_t() { |
327 | delete drv_; |
328 | } |
329 | |
330 | template <impl::data_type_t data_type> |
331 | status_t cpu_reducer_t<data_type>::create_kernel() { |
332 | return (drv_) ? drv_->create_kernel() : status::success; |
333 | } |
334 | |
335 | template <impl::data_type_t data_type> |
336 | typename cpu_reducer_t<data_type>::data_t * |
337 | cpu_reducer_t<data_type>::get_local_ptr(int ithr, data_t *dst, |
338 | const memory_tracking::grantor_t &scratchpad) const { |
339 | const int id_in_grp = balancer().id_in_group(ithr); |
340 | |
    /* thread 0 of each group writes directly to the destination */
342 | if (id_in_grp == 0) |
343 | return dst + balancer().ithr_job_off(ithr) * balancer().job_size_; |
344 | |
345 | const int grp_id = balancer().group_id(ithr); |
346 | const int offset_factor |
347 | = grp_id * (balancer().nthr_per_group_ - 1) + (id_in_grp - 1); |
348 | |
349 | auto space = scratchpad.template get<data_t>(key_reducer_space); |
350 | return space + offset_factor * space_per_thread(balancer()); |
351 | } |
352 | |
353 | template <impl::data_type_t data_type> |
354 | void cpu_reducer_t<data_type>::reduce_nolock(int ithr, data_t *dst, |
355 | const memory_tracking::grantor_t &scratchpad) const { |
356 | bool redundant_reduction |
357 | = balancer().nthr_per_group_ == 1 || balancer().idle(ithr); |
358 | if (redundant_reduction) return; |
359 | |
360 | #ifdef SIMPLE_IMPL |
361 | if (balancer().id_in_group(ithr) != 0) |
        return; /* only thread 0 of each group does the reduction */
363 | |
364 | const int njobs_in_grp = balancer().ithr_njobs(ithr); |
365 | data_t *d = get_local_ptr(ithr, dst, scratchpad); |
366 | for (int id_in_grp = 1; id_in_grp < balancer().nthr_per_group_; |
367 | ++id_in_grp) { |
368 | const data_t *space = get_local_ptr(ithr + id_in_grp, dst, scratchpad); |
369 | for (size_t i = 0; i < (size_t)njobs_in_grp * balancer().job_size_; ++i) |
370 | d[i] += space[i]; |
371 | } |
372 | #else |
373 | using namespace utils; |
374 | |
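    /* Parallel variant: the group's output range (njobs_in_grp * job_size_
     * elements) is split across the group's threads in cache-line chunks;
     * each thread then makes one driver call that adds the group's
     * nthr_per_group_ - 1 partial buffers on top of thread 0's contribution
     * already stored in dst. */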
375 | const int id_in_grp = balancer().id_in_group(ithr); |
376 | const int njobs_in_grp = balancer().ithr_njobs(ithr); |
377 | const size_t cl = 64 / sizeof(data_t); |
378 | |
379 | const size_t reduction_size = njobs_in_grp * balancer().job_size_; |
380 | size_t start {0}, end {0}; |
381 | balance211(div_up(reduction_size, cl), balancer().nthr_per_group_, |
382 | id_in_grp, start, end); |
383 | |
384 | if (start == end) return; |
385 | |
386 | data_t *d = get_local_ptr(ithr - id_in_grp, dst, scratchpad) + start * cl; |
387 | const data_t *space |
388 | = get_local_ptr(ithr - id_in_grp + 1, dst, scratchpad) + start * cl; |
389 | const size_t len = nstl::min(end * cl, reduction_size) - start * cl; |
390 | |
391 | (*drv_)(d, space, 1, len); |
392 | #endif |
393 | } |
394 | |
395 | template struct cpu_reducer_t<data_type::f32>; |
396 | template struct cpu_reducer_t<data_type::s32>; |
397 | |
398 | /* cpu_reducer_2d_t */ |
399 | |
400 | template <impl::data_type_t data_type> |
401 | void cpu_reducer_2d_t<data_type>::conf_t::init_scratchpad( |
402 | memory_tracking::registrar_t &scratchpad) const { |
403 | if (balancer_.nthr_per_group_ == 1) return; |
404 | |
405 | const size_t space_size = balancer_.ngroups_ * balancer_.nthr_per_group_ |
406 | * cpu_reducer_2d_t<data_type>::space_per_thread(balancer_); |
407 | scratchpad.book<data_t>(key_reducer_space, space_size); |
408 | scratchpad.book<simple_barrier::ctx_t>( |
409 | key_reducer_space_bctx, balancer_.ngroups_); |
410 | } |
411 | |
412 | template <impl::data_type_t data_type> |
413 | cpu_reducer_2d_t<data_type>::cpu_reducer_2d_t(const conf_t &conf) |
414 | : conf_(conf), drv_(nullptr) { |
415 | if (balancer().nthr_per_group_ == 1) return; |
416 | |
417 | drv_ = create_reduce_2d_drv<data_type>(balancer().nthr_per_group_, |
418 | space_per_thread(balancer()), conf_.job_size_x_, conf_.dst_x_, |
419 | true); |
420 | } |
421 | |
422 | template <impl::data_type_t data_type> |
423 | cpu_reducer_2d_t<data_type>::~cpu_reducer_2d_t() { |
424 | delete drv_; |
425 | } |
426 | |
427 | template <impl::data_type_t data_type> |
428 | status_t cpu_reducer_2d_t<data_type>::create_kernel() { |
429 | return (drv_) ? drv_->create_kernel() : status::success; |
430 | } |
431 | |
432 | template <impl::data_type_t data_type> |
433 | typename cpu_reducer_2d_t<data_type>::data_t * |
434 | cpu_reducer_2d_t<data_type>::get_local_ptr( |
435 | int ithr, const memory_tracking::grantor_t &scratchpad) const { |
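    /* unlike cpu_reducer_t, every thread of a group (thread 0 included) has
     * a private buffer here: the driver zeroes dst and sums all
     * nthr_per_group_ partial results (see conf_t::init_scratchpad) */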
436 | const int id_in_grp = balancer().id_in_group(ithr); |
437 | const int grp_id = balancer().group_id(ithr); |
438 | const int offset_factor = grp_id * balancer().nthr_per_group_ + id_in_grp; |
439 | auto space = scratchpad.template get<data_t>(key_reducer_space); |
440 | return space + offset_factor * space_per_thread(balancer()); |
441 | } |
442 | |
443 | template <impl::data_type_t data_type> |
444 | int cpu_reducer_2d_t<data_type>::choose_x_blocking( |
445 | int nx, int ny, int nthr_per_grp) const { |
    // pick an x blocking that balances the reduction work across threads
447 | assert(conf_.x_block_ > 0 && nx > conf_.x_block_ |
448 | && nx % conf_.x_block_ == 0); |
449 | int x_blocking = nx / conf_.x_block_; |
450 | int min_x_blocking |
451 | = utils::div_up(x_blocking, nstl::max(1, nthr_per_grp / ny)); |
452 | while (true) { |
453 | if (x_blocking % 2 == 0 && x_blocking >= min_x_blocking * 2) |
454 | x_blocking /= 2; |
455 | else if (x_blocking % 3 == 0 && x_blocking >= min_x_blocking * 3) |
456 | x_blocking /= 3; |
457 | else |
458 | break; |
459 | } |
460 | if (x_blocking >= min_x_blocking * 4) x_blocking = 1; |
461 | x_blocking *= conf_.x_block_; |
462 | return x_blocking; |
463 | } |
464 | |
465 | template <impl::data_type_t data_type> |
466 | void cpu_reducer_2d_t<data_type>::reduce_block(const data_t *space_base, |
467 | data_t *dst, int job, int start_y, int start_x, int ny_start, |
468 | int nx_start, int ny_step, int nx_step) const { |
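    /* sums the group's nthr_per_group_ partial blocks of one ny_step x
     * nx_step tile of job `job` into dst (the JIT driver zeroes dst first,
     * matching the idg == 0 case of the SIMPLE_IMPL reference loop) */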
469 | data_t *d = dst + (start_y + ny_start) * conf_.dst_x_ + start_x + nx_start; |
470 | const data_t *space = space_base + (size_t)job * balancer().job_size_ |
471 | + (size_t)ny_start * conf_.job_size_x_ + nx_start; |
472 | #ifdef SIMPLE_IMPL |
473 | for (int idg = 0; idg < balancer().nthr_per_group_; ++idg) { |
474 | const data_t *w = &space[idg * space_per_thread(balancer())]; |
475 | for (int y = 0; y < ny_step; ++y) |
476 | for (int x = 0; x < nx_step; ++x) { |
477 | d[y * conf_.dst_x_ + x] |
478 | = (idg == 0 ? 0 : d[y * conf_.dst_x_ + x]) |
479 | + w[y * conf_.job_size_x_ + x]; |
480 | } |
481 | } |
482 | #else |
483 | (*drv_)(d, space, ny_step, nx_step); |
484 | #endif |
485 | } |
486 | |
487 | template <impl::data_type_t data_type> |
488 | void cpu_reducer_2d_t<data_type>::reduce_nolock(int ithr, data_t *dst, |
489 | const memory_tracking::grantor_t &scratchpad) const { |
490 | bool redundant_reduction |
491 | = balancer().nthr_per_group_ == 1 || balancer().idle(ithr); |
492 | if (redundant_reduction) return; |
493 | |
494 | const int id_in_grp = balancer().id_in_group(ithr); |
495 | const int njobs_in_grp = balancer().ithr_njobs(ithr); |
496 | const int njobs_x = utils::div_up(conf_.dst_x_, conf_.job_size_x_); |
497 | const int global_job_start = balancer().ithr_job_off(ithr); |
498 | |
499 | const data_t *space_base = get_local_ptr(ithr - id_in_grp, scratchpad); |
500 | |
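    /* Re-partition the group's threads for the reduction itself: pr_grps
     * sub-groups of pr_nthr_per_grp threads each. Every sub-group takes a
     * contiguous slice of the group's jobs (balance211 below), and within a
     * job its ny x nx area is split across the sub-group's threads in
     * multiples of the x blocking. */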
501 | const int pr_grps = nstl::min(njobs_in_grp, balancer().nthr_per_group_); |
502 | const int pr_nthr_per_grp = balancer().nthr_per_group_ / pr_grps; |
503 | |
504 | if (id_in_grp >= pr_grps * pr_nthr_per_grp) return; /* idle */ |
505 | |
506 | const int pr_my_grp = id_in_grp / pr_nthr_per_grp; |
507 | const int pr_my_id = id_in_grp % pr_nthr_per_grp; |
508 | |
509 | int pr_job_start {0}, pr_job_end {0}; |
510 | balance211(njobs_in_grp, pr_grps, pr_my_grp, pr_job_start, pr_job_end); |
511 | |
512 | for (int j = pr_job_start; j < pr_job_end; ++j) { |
513 | const int global_job = global_job_start + j; |
514 | const int j_y = global_job / njobs_x; |
515 | const int j_x = global_job % njobs_x; |
516 | const int start_y = j_y * conf_.job_size_y_; |
517 | const int start_x = j_x * conf_.job_size_x_; |
518 | const int ny = nstl::min(conf_.dst_y_ - start_y, conf_.job_size_y_); |
519 | const int nx = nstl::min(conf_.dst_x_ - start_x, conf_.job_size_x_); |
520 | int x_blocking = choose_x_blocking(nx, ny, pr_nthr_per_grp); |
521 | |
522 | int nxy_start {0}, nxy_end {0}; |
523 | balance211(ny * nx / x_blocking, pr_nthr_per_grp, pr_my_id, nxy_start, |
524 | nxy_end); |
525 | if (nxy_start == nxy_end) continue; |
526 | nxy_start *= x_blocking; |
527 | nxy_end *= x_blocking; |
528 | |
529 | int nxy = nxy_start; |
530 | if (nxy % nx != 0) { |
531 | int nx_step = nstl::min(nx - nxy % nx, nxy_end - nxy); |
532 | reduce_block(space_base, dst, j, start_y, start_x, nxy / nx, |
533 | nxy % nx, 1, nx_step); |
534 | nxy += nx_step; |
535 | } |
536 | if ((nxy_end - nxy) > nx) { |
537 | int ny_step = (nxy_end - nxy) / nx; |
538 | reduce_block(space_base, dst, j, start_y, start_x, nxy / nx, |
539 | nxy % nx, ny_step, nx); |
540 | nxy += nx * ny_step; |
541 | } |
542 | if ((nxy_end - nxy) > 0) { |
543 | reduce_block(space_base, dst, j, start_y, start_x, nxy / nx, |
544 | nxy % nx, 1, nxy_end - nxy); |
545 | } |
546 | } |
547 | } |
548 | |
549 | template struct cpu_reducer_2d_t<data_type::f32>; |
550 | template struct cpu_reducer_2d_t<data_type::s32>; |
551 | |
552 | /* accumulator section */ |
553 | |
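/* cpu_accumulator_1d_t wraps a single-source (n_src == 1) driver, so
 * accumulate(dst, src, size) performs dst[i] += src[i] for i in [0, size). */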
554 | template <impl::data_type_t data_type> |
555 | cpu_accumulator_1d_t<data_type>::cpu_accumulator_1d_t() : drv_(nullptr) { |
556 | drv_ = create_reduce_2d_drv<data_type>(1, 0, 0, 0, false); |
557 | } |
558 | |
559 | template <impl::data_type_t data_type> |
560 | cpu_accumulator_1d_t<data_type>::~cpu_accumulator_1d_t() { |
561 | delete drv_; |
562 | } |
563 | |
564 | template <impl::data_type_t data_type> |
565 | status_t cpu_accumulator_1d_t<data_type>::create_kernel() { |
566 | return drv_->create_kernel(); |
567 | } |
568 | |
569 | template <impl::data_type_t data_type> |
570 | void cpu_accumulator_1d_t<data_type>::accumulate( |
571 | data_t *dst, const data_t *src, size_t size) { |
572 | (*drv_)(dst, src, 1, size); |
573 | } |
574 | |
575 | template struct cpu_accumulator_1d_t<data_type::f32>; |
576 | template struct cpu_accumulator_1d_t<data_type::s32>; |
577 | |
578 | } // namespace x64 |
579 | } // namespace cpu |
580 | } // namespace impl |
581 | } // namespace dnnl |
582 | |
583 | // vim: et ts=4 sw=4 cindent cino+=l0,\:4,N-s |
584 | |