/*******************************************************************************
* Copyright 2017-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include <assert.h>

#include "oneapi/dnnl/dnnl_types.h"

#include "common/dnnl_thread.hpp"
#include "common/nstl.hpp"
#include "common/utils.hpp"

#include "cpu/platform.hpp"

#include "cpu/x64/cpu_reducer.hpp"

namespace dnnl {
namespace impl {
namespace cpu {
namespace x64 {

using namespace memory_tracking::names;
void reduce_balancer_t::balance() {
    using namespace nstl;
    using namespace utils;

    assert(nthr_ > 0 && job_size_ > 0 && njobs_ > 0 && reduction_size_ > 0);

    const int job_complexity = 1;

    const int min_njobs_per_group = max(1, njobs_ / nthr_);
    const int max_njobs_per_group
            = max(1, static_cast<int>(max_buffer_size_ / (nthr_ * job_size_)));

    /* initial guess */
    int ngroups = min(njobs_ / min_njobs_per_group, nthr_);
    int nthr_per_group
            = allow_nthr_in_group_ ? min(nthr_ / ngroups, reduction_size_) : 1;
    int njobs_per_group_ub = div_up(njobs_, ngroups);

    /* rough upper-bound estimate; refined by the brute-force search below */
    size_t thread_complexity_ub = (size_t)njobs_ * job_size_ * reduction_size_;

    /* brute-force search over the parameters for the best balance... */
    for (int c_njobs_per_group = min_njobs_per_group;
            c_njobs_per_group < njobs_; ++c_njobs_per_group) {
        /* current candidate */
        int c_ngroups = min(njobs_ / c_njobs_per_group, nthr_);
        int c_nthr_per_group = allow_nthr_in_group_
                ? min(nthr_ / c_ngroups, reduction_size_)
                : 1;
        int c_njobs_per_group_ub = div_up(njobs_, c_ngroups);

        if (c_nthr_per_group > 1 && c_njobs_per_group_ub > max_njobs_per_group)
            continue;

        int c_thread_reduction_ub = div_up(reduction_size_, c_nthr_per_group);
        size_t c_group_size_ub = (size_t)job_size_ * c_njobs_per_group_ub;
        size_t c_thread_complexity_ub = c_group_size_ub
                * (job_complexity * c_thread_reduction_ub
                        + (c_nthr_per_group != 1));

        if (c_thread_complexity_ub < thread_complexity_ub) {
            ngroups = c_ngroups;
            nthr_per_group = c_nthr_per_group;
            njobs_per_group_ub = c_njobs_per_group_ub;
            thread_complexity_ub = c_thread_complexity_ub;
        }
    }

    assert(njobs_per_group_ub <= max_njobs_per_group || nthr_per_group == 1);
    assert(ngroups * nthr_per_group <= nthr_);
    assert((size_t)njobs_per_group_ub * job_size_ * nthr_ <= max_buffer_size_
            || nthr_per_group == 1); /* no reduction buffer overflow */
    assert(IMPLICATION(!allow_nthr_in_group_, nthr_per_group == 1));

    ngroups_ = ngroups;
    nthr_per_group_ = nthr_per_group;
    njobs_per_group_ub_ = njobs_per_group_ub;
}
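
/* illustration (editorial, not from the original source): with nthr_ = 8,
 * njobs_ = 4, a large reduction_size_, and ample max_buffer_size_, the
 * search settles on ngroups_ = 4 groups of nthr_per_group_ = 2 threads,
 * i.e. one job per group with the reduction dimension split in two */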

/* reducer jit-ted driver */

using namespace Xbyak;
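
/* reducer_2d_driver_t jit kernel contract: for a tile of ny rows by nx
 * elements, compute dst = (or +=, depending on nullify_dst) the sum over
 * s in [0, n_src) of the s-th source, where consecutive sources are
 * src_ld elements apart and dst/src advance by dst_step/src_step elements
 * between consecutive rows */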
template <impl::data_type_t data_type>
struct reducer_2d_driver_t : public jit_generator {
    using data_t = typename prec_traits<data_type>::type;

    reducer_2d_driver_t(int n_src, size_t src_ld, size_t src_step,
            size_t dst_step, bool nullify_dst, const char *name)
        : jit_generator(name)
        , n_src_(n_src)
        , src_ld_(src_ld)
        , src_step_(src_step)
        , dst_step_(dst_step)
        , nullify_dst_(nullify_dst) {}
    virtual void operator()(
            data_t *dst, const data_t *srcs, size_t ny, size_t nx)
            = 0;

protected:
    int n_src_;
    size_t src_ld_, src_step_, dst_step_;
    bool nullify_dst_;
};

template <impl::data_type_t data_type, cpu_isa_t isa>
struct reducer_2d_driver_f_s_32_t : public reducer_2d_driver_t<data_type> {
    DECLARE_CPU_JIT_AUX_FUNCTIONS(reducer_2d_driver_f_s_32_t)

    using data_t = typename prec_traits<data_type>::type;

    void operator()(
            data_t *dst, const data_t *srcs, size_t ny, size_t nx) override {
        jit_generator::operator()(dst, srcs, ny, nx);
    }

    /* cpu specific part */
    using Vmm = typename utils::conditional<isa == avx2, Ymm, Zmm>::type;
    const AddressFrame &vmmword = (isa == avx2) ? this->yword : this->zword;
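    /* data-type dispatched adds: f32 -> vaddps / addss, s32 -> vpaddd /
     * paddd (vector and scalar variants respectively) */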
    void uni_vadd(const Xmm &x1, const Xmm &x2, const Operand &op) {
        if (data_type == data_type::f32)
            this->vaddps(x1, x2, op);
        else
            this->vpaddd(x1, x2, op);
    }
    void uni_add(const Xmm &x1, const Operand &op) {
        if (data_type == data_type::f32)
            this->addss(x1, op);
        else
            this->paddd(x1, op);
    }

    const int vlen = cpu_isa_traits<isa>::vlen;
    const int typesize
            = sizeof(typename dnnl::impl::prec_traits<data_type>::type);
    Xbyak::Reg64 reg_dst = abi_param1;
    Xbyak::Reg64 reg_src = abi_param2;
    Xbyak::Reg64 reg_ny = abi_param3;
    Xbyak::Reg64 reg_nx = abi_param4;

    Xbyak::Reg64 reg_x = this->rax;
    Xbyak::Reg64 reg_src_id = this->r10;
    Xbyak::Reg64 reg_long_offt = this->r11;

    reducer_2d_driver_f_s_32_t(int n_src, size_t src_ld, size_t src_step,
            size_t dst_step, bool nullify_dst)
        : reducer_2d_driver_t<data_type>(
                n_src, src_ld, src_step, dst_step, nullify_dst, jit_name()) {}

    void nullify_dst(int nloads, int load_len) {
        UNUSED(load_len);
        for (int i = 0; i < nloads; ++i)
            this->uni_vpxor(Vmm(i), Vmm(i), Vmm(i));
        /* prefetches[dst] ? */
    }
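
    /* load_dst/store_dst move either one full vector register per
     * accumulator (load_len == vlen) or a single element
     * (load_len == typesize) */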
    void load_dst(int nloads, int load_len) {
        for (int i = 0; i < nloads; ++i) {
            if (load_len == typesize)
                this->movd(Xmm(i), this->ptr[reg_dst + i * load_len]);
            else if (load_len == vlen)
                this->vmovups(Vmm(i), this->ptr[reg_dst + i * load_len]);
            else
                assert(!"unsupported");
        }
    }

    void store_dst(int nloads, int load_len) {
        for (int i = 0; i < nloads; ++i) {
            if (load_len == typesize)
                this->movd(this->ptr[reg_dst + i * load_len], Xmm(i));
            else if (load_len == vlen)
                this->vmovups(this->ptr[reg_dst + i * load_len], Vmm(i));
            else
                assert(!"unsupported");
        }
    }

    void accumulate(int nloads, int load_len, size_t base_off) {
        for (int i = 0; i < nloads; ++i) {
            size_t off = base_off + i * load_len;

            if (load_len == typesize)
                this->uni_add(Xmm(i), this->ptr[reg_src + off]);
            else if (load_len == vlen)
                this->uni_vadd(Vmm(i), Vmm(i), vmmword[reg_src + off]);
            else
                assert(!"unsupported");
        }
    }
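
    /* loop_x walks one row of nx bytes in three progressively finer
     * passes: full blocks of n_vregs vectors, then single vectors, then
     * single elements, so an arbitrary nx needs no separate scalar tail
     * handling elsewhere */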
    void loop_x() {
        const int nloads[] = {cpu_isa_traits<isa>::n_vregs, 1, 1};
        const int nbranches = sizeof(nloads) / sizeof(nloads[0]);

        const int load_len[nbranches] = {vlen, vlen, typesize};
        Label loop_x_label[nbranches + 1];

        this->mov(reg_x, reg_nx);

        for (int id = 0; id < nbranches; ++id) {
            this->L(loop_x_label[id]);

            this->cmp(reg_x, nloads[id] * load_len[id]);
            this->jl(loop_x_label[id + 1], this->T_NEAR);

            if (this->nullify_dst_)
                nullify_dst(nloads[id], load_len[id]);
            else
                load_dst(nloads[id], load_len[id]);

            if (nloads[id] > 1) {
                Label loop_srcs;
                this->mov(reg_src_id, this->n_src_);
                this->L(loop_srcs);

                accumulate(nloads[id], load_len[id], 0);
                this->add(reg_src, this->src_ld_ * typesize);

                this->dec(reg_src_id);
                this->jnz(loop_srcs, this->T_NEAR);

                size_t base_off
                        = (size_t)this->n_src_ * this->src_ld_ * typesize;
                this->safe_sub(reg_src, base_off, reg_long_offt);
            } else {
                for (int src_id = 0; src_id < this->n_src_; ++src_id) {
                    const size_t base_off
                            = (size_t)src_id * this->src_ld_ * typesize;
                    accumulate(nloads[id], load_len[id], base_off);
                }
            }

            store_dst(nloads[id], load_len[id]);

            this->add(reg_src, nloads[id] * load_len[id]);
            this->add(reg_dst, nloads[id] * load_len[id]);

            this->sub(reg_x, nloads[id] * load_len[id]);

            this->jmp(loop_x_label[id], this->T_NEAR);
        }

        this->L(loop_x_label[nbranches]);

        /* restore address registers */
        this->sub(reg_src, reg_nx);
        this->sub(reg_dst, reg_nx);
    }

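    /* generate() emits the kernel: an outer loop over ny rows, each row
     * reduced by loop_x(), with dst and src advanced by dst_step_ and
     * src_step_ elements between rows */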
    void generate() override {
        static_assert(isa == avx2 || isa == avx512_core, "unsupported CPU ISA");

        this->preamble();

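        /* reg_nx: element count -> byte count (typesize == 4 for both
         * f32 and s32) */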
        this->shl(reg_nx, 2);

        Label ny_loop;
        this->L(ny_loop);

        loop_x();

        this->add(reg_dst, this->dst_step_ * typesize);
        this->add(reg_src, this->src_step_ * typesize);

        this->dec(reg_ny);
        this->jnz(ny_loop, this->T_NEAR);

        this->postamble();
    }
};

template <impl::data_type_t data_type>
inline reducer_2d_driver_t<data_type> *create_reduce_2d_drv(int n_src,
        size_t src_ld, size_t src_step, size_t dst_step, bool nullify_dst) {
    if (mayiuse(avx512_core))
        return new reducer_2d_driver_f_s_32_t<data_type, avx512_core>(
                n_src, src_ld, src_step, dst_step, nullify_dst);
    else if (mayiuse(avx2))
        return new reducer_2d_driver_f_s_32_t<data_type, avx2>(
                n_src, src_ld, src_step, dst_step, nullify_dst);
    assert(!"unimplemented");
    return nullptr;
}

/* cpu_reducer_t */

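/* cpu_reducer_t: each thread group folds its partial results into the
 * jobs it owns. Thread 0 of a group accumulates directly in dst, so the
 * scratchpad only holds the (nthr_per_group_ - 1) remaining partial
 * buffers per group, plus one barrier context per group */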
template <impl::data_type_t data_type>
void cpu_reducer_t<data_type>::conf_t::init_scratchpad(
        memory_tracking::registrar_t &scratchpad) const {
    if (balancer_.nthr_per_group_ == 1) return;

    const size_t space_size = balancer_.ngroups_
            * (balancer_.nthr_per_group_ - 1)
            * cpu_reducer_t<data_type>::space_per_thread(balancer_);
    scratchpad.book<data_t>(key_reducer_space, space_size, PAGE_4K);
    scratchpad.book<simple_barrier::ctx_t>(
            key_reducer_space_bctx, balancer_.ngroups_);
}

template <impl::data_type_t data_type>
cpu_reducer_t<data_type>::cpu_reducer_t(const conf_t &conf)
    : conf_(conf), drv_(nullptr) {
    if (balancer().nthr_per_group_ == 1) return;

    drv_ = create_reduce_2d_drv<data_type>(balancer().nthr_per_group_ - 1,
            space_per_thread(balancer()), 0, 0, false);
}

template <impl::data_type_t data_type>
cpu_reducer_t<data_type>::~cpu_reducer_t() {
    delete drv_;
}

template <impl::data_type_t data_type>
status_t cpu_reducer_t<data_type>::create_kernel() {
    return (drv_) ? drv_->create_kernel() : status::success;
}

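/* scratchpad layout: the per-thread partial buffers are contiguous,
 * indexed by group id and by (id_in_group - 1); thread 0 of each group
 * needs no scratch buffer of its own */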
template <impl::data_type_t data_type>
typename cpu_reducer_t<data_type>::data_t *
cpu_reducer_t<data_type>::get_local_ptr(int ithr, data_t *dst,
        const memory_tracking::grantor_t &scratchpad) const {
    const int id_in_grp = balancer().id_in_group(ithr);

    /* thread 0 of each group writes directly to the destination */
    if (id_in_grp == 0)
        return dst + balancer().ithr_job_off(ithr) * balancer().job_size_;

    const int grp_id = balancer().group_id(ithr);
    const int offset_factor
            = grp_id * (balancer().nthr_per_group_ - 1) + (id_in_grp - 1);

    auto space = scratchpad.template get<data_t>(key_reducer_space);
    return space + offset_factor * space_per_thread(balancer());
}

template <impl::data_type_t data_type>
void cpu_reducer_t<data_type>::reduce_nolock(int ithr, data_t *dst,
        const memory_tracking::grantor_t &scratchpad) const {
    bool redundant_reduction
            = balancer().nthr_per_group_ == 1 || balancer().idle(ithr);
    if (redundant_reduction) return;

#ifdef SIMPLE_IMPL
    if (balancer().id_in_group(ithr) != 0)
        return; /* only thread 0 of each group does the reduction */

    const int njobs_in_grp = balancer().ithr_njobs(ithr);
    data_t *d = get_local_ptr(ithr, dst, scratchpad);
    for (int id_in_grp = 1; id_in_grp < balancer().nthr_per_group_;
            ++id_in_grp) {
        const data_t *space = get_local_ptr(ithr + id_in_grp, dst, scratchpad);
        for (size_t i = 0; i < (size_t)njobs_in_grp * balancer().job_size_; ++i)
            d[i] += space[i];
    }
#else
    using namespace utils;
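
    /* jitted path: every thread in the group takes a cache-line-aligned
     * slice of the group's output and sums all (nthr_per_group_ - 1)
     * scratch buffers into thread 0's destination over that slice */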
    const int id_in_grp = balancer().id_in_group(ithr);
    const int njobs_in_grp = balancer().ithr_njobs(ithr);
    const size_t cl = 64 / sizeof(data_t);

    const size_t reduction_size = njobs_in_grp * balancer().job_size_;
    size_t start {0}, end {0};
    balance211(div_up(reduction_size, cl), balancer().nthr_per_group_,
            id_in_grp, start, end);

    if (start == end) return;

    data_t *d = get_local_ptr(ithr - id_in_grp, dst, scratchpad) + start * cl;
    const data_t *space
            = get_local_ptr(ithr - id_in_grp + 1, dst, scratchpad) + start * cl;
    const size_t len = nstl::min(end * cl, reduction_size) - start * cl;

    (*drv_)(d, space, 1, len);
#endif
}

template struct cpu_reducer_t<data_type::f32>;
template struct cpu_reducer_t<data_type::s32>;

/* cpu_reducer_2d_t */

template <impl::data_type_t data_type>
void cpu_reducer_2d_t<data_type>::conf_t::init_scratchpad(
        memory_tracking::registrar_t &scratchpad) const {
    if (balancer_.nthr_per_group_ == 1) return;

    const size_t space_size = balancer_.ngroups_ * balancer_.nthr_per_group_
            * cpu_reducer_2d_t<data_type>::space_per_thread(balancer_);
    scratchpad.book<data_t>(key_reducer_space, space_size);
    scratchpad.book<simple_barrier::ctx_t>(
            key_reducer_space_bctx, balancer_.ngroups_);
}

template <impl::data_type_t data_type>
cpu_reducer_2d_t<data_type>::cpu_reducer_2d_t(const conf_t &conf)
    : conf_(conf), drv_(nullptr) {
    if (balancer().nthr_per_group_ == 1) return;

    drv_ = create_reduce_2d_drv<data_type>(balancer().nthr_per_group_,
            space_per_thread(balancer()), conf_.job_size_x_, conf_.dst_x_,
            true);
}

template <impl::data_type_t data_type>
cpu_reducer_2d_t<data_type>::~cpu_reducer_2d_t() {
    delete drv_;
}

template <impl::data_type_t data_type>
status_t cpu_reducer_2d_t<data_type>::create_kernel() {
    return (drv_) ? drv_->create_kernel() : status::success;
}

template <impl::data_type_t data_type>
typename cpu_reducer_2d_t<data_type>::data_t *
cpu_reducer_2d_t<data_type>::get_local_ptr(
        int ithr, const memory_tracking::grantor_t &scratchpad) const {
    const int id_in_grp = balancer().id_in_group(ithr);
    const int grp_id = balancer().group_id(ithr);
    const int offset_factor = grp_id * balancer().nthr_per_group_ + id_in_grp;
    auto space = scratchpad.template get<data_t>(key_reducer_space);
    return space + offset_factor * space_per_thread(balancer());
}

template <impl::data_type_t data_type>
int cpu_reducer_2d_t<data_type>::choose_x_blocking(
        int nx, int ny, int nthr_per_grp) const {
    // choose an x blocking that balances the reduction work across threads
    assert(conf_.x_block_ > 0 && nx > conf_.x_block_
            && nx % conf_.x_block_ == 0);
    int x_blocking = nx / conf_.x_block_;
    int min_x_blocking
            = utils::div_up(x_blocking, nstl::max(1, nthr_per_grp / ny));
    while (true) {
        if (x_blocking % 2 == 0 && x_blocking >= min_x_blocking * 2)
            x_blocking /= 2;
        else if (x_blocking % 3 == 0 && x_blocking >= min_x_blocking * 3)
            x_blocking /= 3;
        else
            break;
    }
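    /* if the remaining factor is still far from the balance target, give
     * up on even division and fall back to the finest granularity */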
    if (x_blocking >= min_x_blocking * 4) x_blocking = 1;
    x_blocking *= conf_.x_block_;
    return x_blocking;
}

template <impl::data_type_t data_type>
void cpu_reducer_2d_t<data_type>::reduce_block(const data_t *space_base,
        data_t *dst, int job, int start_y, int start_x, int ny_start,
        int nx_start, int ny_step, int nx_step) const {
    data_t *d = dst + (start_y + ny_start) * conf_.dst_x_ + start_x + nx_start;
    const data_t *space = space_base + (size_t)job * balancer().job_size_
            + (size_t)ny_start * conf_.job_size_x_ + nx_start;
#ifdef SIMPLE_IMPL
    for (int idg = 0; idg < balancer().nthr_per_group_; ++idg) {
        const data_t *w = &space[idg * space_per_thread(balancer())];
        for (int y = 0; y < ny_step; ++y)
            for (int x = 0; x < nx_step; ++x) {
                d[y * conf_.dst_x_ + x]
                        = (idg == 0 ? 0 : d[y * conf_.dst_x_ + x])
                        + w[y * conf_.job_size_x_ + x];
            }
    }
#else
    (*drv_)(d, space, ny_step, nx_step);
#endif
}

template <impl::data_type_t data_type>
void cpu_reducer_2d_t<data_type>::reduce_nolock(int ithr, data_t *dst,
        const memory_tracking::grantor_t &scratchpad) const {
    bool redundant_reduction
            = balancer().nthr_per_group_ == 1 || balancer().idle(ithr);
    if (redundant_reduction) return;

    const int id_in_grp = balancer().id_in_group(ithr);
    const int njobs_in_grp = balancer().ithr_njobs(ithr);
    const int njobs_x = utils::div_up(conf_.dst_x_, conf_.job_size_x_);
    const int global_job_start = balancer().ithr_job_off(ithr);

    const data_t *space_base = get_local_ptr(ithr - id_in_grp, scratchpad);

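    /* re-partition the group's threads for the reduction phase: pr_grps
     * sub-groups (at most one per job) of pr_nthr_per_grp threads each.
     * A sub-group handles a contiguous range of jobs; within a job, each
     * thread's [nxy_start, nxy_end) range of the ny x nx tile is covered
     * by up to three reduce_block calls: a partial leading row, a block
     * of whole rows, and a partial trailing row */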
    const int pr_grps = nstl::min(njobs_in_grp, balancer().nthr_per_group_);
    const int pr_nthr_per_grp = balancer().nthr_per_group_ / pr_grps;

    if (id_in_grp >= pr_grps * pr_nthr_per_grp) return; /* idle */

    const int pr_my_grp = id_in_grp / pr_nthr_per_grp;
    const int pr_my_id = id_in_grp % pr_nthr_per_grp;

    int pr_job_start {0}, pr_job_end {0};
    balance211(njobs_in_grp, pr_grps, pr_my_grp, pr_job_start, pr_job_end);

    for (int j = pr_job_start; j < pr_job_end; ++j) {
        const int global_job = global_job_start + j;
        const int j_y = global_job / njobs_x;
        const int j_x = global_job % njobs_x;
        const int start_y = j_y * conf_.job_size_y_;
        const int start_x = j_x * conf_.job_size_x_;
        const int ny = nstl::min(conf_.dst_y_ - start_y, conf_.job_size_y_);
        const int nx = nstl::min(conf_.dst_x_ - start_x, conf_.job_size_x_);
        int x_blocking = choose_x_blocking(nx, ny, pr_nthr_per_grp);

        int nxy_start {0}, nxy_end {0};
        balance211(ny * nx / x_blocking, pr_nthr_per_grp, pr_my_id, nxy_start,
                nxy_end);
        if (nxy_start == nxy_end) continue;
        nxy_start *= x_blocking;
        nxy_end *= x_blocking;

        int nxy = nxy_start;
        if (nxy % nx != 0) {
            int nx_step = nstl::min(nx - nxy % nx, nxy_end - nxy);
            reduce_block(space_base, dst, j, start_y, start_x, nxy / nx,
                    nxy % nx, 1, nx_step);
            nxy += nx_step;
        }
        if ((nxy_end - nxy) > nx) {
            int ny_step = (nxy_end - nxy) / nx;
            reduce_block(space_base, dst, j, start_y, start_x, nxy / nx,
                    nxy % nx, ny_step, nx);
            nxy += nx * ny_step;
        }
        if ((nxy_end - nxy) > 0) {
            reduce_block(space_base, dst, j, start_y, start_x, nxy / nx,
                    nxy % nx, 1, nxy_end - nxy);
        }
    }
}

template struct cpu_reducer_2d_t<data_type::f32>;
template struct cpu_reducer_2d_t<data_type::s32>;

/* accumulator section */

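/* cpu_accumulator_1d_t is a thin wrapper over a single-source driver:
 * accumulate() performs dst[0..size) += src[0..size) in one jitted call */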
template <impl::data_type_t data_type>
cpu_accumulator_1d_t<data_type>::cpu_accumulator_1d_t() : drv_(nullptr) {
    drv_ = create_reduce_2d_drv<data_type>(1, 0, 0, 0, false);
}

template <impl::data_type_t data_type>
cpu_accumulator_1d_t<data_type>::~cpu_accumulator_1d_t() {
    delete drv_;
}

template <impl::data_type_t data_type>
status_t cpu_accumulator_1d_t<data_type>::create_kernel() {
    return drv_->create_kernel();
}

template <impl::data_type_t data_type>
void cpu_accumulator_1d_t<data_type>::accumulate(
        data_t *dst, const data_t *src, size_t size) {
    (*drv_)(dst, src, 1, size);
}

template struct cpu_accumulator_1d_t<data_type::f32>;
template struct cpu_accumulator_1d_t<data_type::s32>;

} // namespace x64
} // namespace cpu
} // namespace impl
} // namespace dnnl

// vim: et ts=4 sw=4 cindent cino+=l0,\:4,N-s