/*******************************************************************************
* Copyright 2017-2020 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef CPU_X64_CPU_REDUCER_HPP
#define CPU_X64_CPU_REDUCER_HPP

#include <assert.h>

#include "common/c_types_map.hpp"
#include "common/dnnl_thread.hpp"
#include "common/memory_tracking.hpp"
#include "common/nstl.hpp"
#include "common/type_helpers.hpp"
#include "common/utils.hpp"
#include "oneapi/dnnl/dnnl_types.h"

#include "cpu/x64/cpu_barrier.hpp"

namespace dnnl {
namespace impl {
namespace cpu {
namespace x64 {

/** class to perform balancing over a 3D array
 *
 * Conceptually the reduction happens according to the picture below:
 *
 *    <--job_size->
 *    +-----------+   +-----------+           +-----------+  ^
 *    |           |   |           |           |           |  |
 *    |           |   |           |           |           |  |
 *    |     1     |   |     2     |   . . .   |   njobs   |  |  reduction_size
 *    |           |   |           |           |           |  |
 *    |           |   |           |           |           |  |
 *    +-----------+   +-----------+           +-----------+  v
 *
 *      |   |   |       |   |   |               |   |   |
 *      v   v   v       v   v   v               v   v   v
 *    =====================================================  vertical reduction
 *
 *    +-----------+   +-----------+   . . .   +-----------+  result
 *
 * In the simple case the result must be contiguous in memory;
 * @class cpu_reducer_t is an implementation of that case.
 *
 * Threads are divided into groups. The groups are independent of each other.
 * Each group may work on several jobs (the distribution is not uniform, since
 * njobs might not be a multiple of the number of groups). Threads within a
 * group work on different parts of the reduction dimension. Thread 0 in each
 * group is called the master (@sa reduce_balancer_t::master()).
 *
 * If the threading driver does not allow synchronization among a sub-group of
 * threads (e.g. TBB), the number of threads per group is forced to be 1.
 */
struct reduce_balancer_t {
    reduce_balancer_t() { init(1, 1, 1, 1, 0); } /* trivial balance */
    reduce_balancer_t(int nthr, int job_size, int njobs, int reduction_size,
            size_t max_buffer_size, bool lock_free = false) {
        init(nthr, job_size, njobs, reduction_size, max_buffer_size,
                lock_free);
    }

    reduce_balancer_t &init(int nthr, int job_size, int njobs,
            int reduction_size, size_t max_buffer_size,
            bool lock_free = false) {
        allow_nthr_in_group_ = lock_free ? true : dnnl_thr_syncable();
        nthr_ = nthr;
        job_size_ = job_size;
        njobs_ = njobs;
        reduction_size_ = reduction_size;
        max_buffer_size_ = max_buffer_size;
        balance();
        return *this;
    }

    bool allow_nthr_in_group_;
    int nthr_;
    int job_size_, njobs_, reduction_size_;

    int ngroups_; /**< number of independent work (thread) groups */
    int nthr_per_group_; /**< number of threads within a single work group */
    int njobs_per_group_ub_; /**< the max # of jobs within a work group */

    bool master(int ithr) const { return id_in_group(ithr) == 0; }
    bool idle(int ithr) const { return ithr >= nthr_per_group_ * ngroups_; }

    int group_id(int ithr) const { return ithr / nthr_per_group_; }
    int id_in_group(int ithr) const { return ithr % nthr_per_group_; }

    int grp_njobs(int grp) const {
        if (grp >= ngroups_) return 0;
        return njobs_ / ngroups_ + (grp < njobs_ % ngroups_);
    }
    int grp_job_off(int grp) const {
        if (grp >= ngroups_) return njobs_;
        return njobs_ / ngroups_ * grp + nstl::min(grp, njobs_ % ngroups_);
    }
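
    /* Example (illustrative numbers only): with njobs_ = 7 and ngroups_ = 3,
     * the njobs_ % ngroups_ = 1 extra job goes to the first group, so
     * grp_njobs() yields {3, 2, 2} and grp_job_off() yields {0, 3, 5}:
     * each group owns a contiguous range of jobs. */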

    int ithr_njobs(int ithr) const { return grp_njobs(group_id(ithr)); }
    int ithr_job_off(int ithr) const { return grp_job_off(group_id(ithr)); }

private:
    size_t max_buffer_size_;
    void balance();
};
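
/* A minimal usage sketch (illustrative only: `nthr`, `ithr`, `job_size`,
 * `njobs`, `reduction_size`, `max_buffer_size`, and `process_job` are
 * hypothetical and not part of this interface). A thread locates its group
 * and the contiguous job range that the group owns:
 *
 *     reduce_balancer_t balancer(
 *             nthr, job_size, njobs, reduction_size, max_buffer_size);
 *     if (!balancer.idle(ithr)) {
 *         const int grp = balancer.group_id(ithr);
 *         const int job_off = balancer.grp_job_off(grp);
 *         for (int j = 0; j < balancer.grp_njobs(grp); ++j)
 *             process_job(job_off + j); // the reduction dimension of each
 *                                       // job is split among the group
 *     }
 */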

/** forward declaration of reduce driver */
template <impl::data_type_t data_type>
struct reducer_2d_driver_t;

/** class to perform a reduction over a 3D array
 *
 * Balancing is based on @class reduce_balancer_t.
 * Restriction: the result of the reduction must be contiguous in memory.
 *
 * The reduction happens according to the picture below (once more):
 *
 *    <--job_size->
 *    +-----------+   +-----------+           +-----------+  ^
 *    |           |   |           |           |           |  |
 *    |           |   |           |           |           |  |
 *    |     1     |   |     2     |   . . .   |   njobs   |  |  reduction_size
 *    |           |   |           |           |           |  |
 *    |           |   |           |           |           |  |
 *    +-----------+   +-----------+           +-----------+  v
 *
 *      |   |   |       |   |   |               |   |   |
 *      v   v   v       v   v   v               v   v   v
 *    =====================================================  vertical reduction
 *
 *    +-----------+   +-----------+   . . .   +-----------+  (contiguous) result
 *
 * An example of how work might be shared is shown below.
 *
 * In this example group 0 owns 2 (independent) jobs -- 2 big squares.
 * The number of threads per group is also 2 (thread 0 of group 0 and thread 1
 * of group 0). Master threads (i.e. threads with id 0 in the corresponding
 * group) from each group put their partial results directly into the
 * destination memory, while all the other threads within the group use the
 * workspace (in the picture, only thread 1). Once the intermediate results
 * are obtained, each group reduces its own part (its own jobs) into the
 * destination memory.
 *
 *    <------- group 0 ------->
 *
 *    +-----------+   +-----------+  ^
 *    |           |   |           |  |  thread 0 of   reduces to the dest-memory
 *    |           |   |           |  |  group 0       +-----------+   +-----------+
 *    |- - - - - -|   |- - - - - -|  X
 *    |           |   |           |  |  thread 1 of   reduces to workspace[tid=1]:
 *    |           |   |           |  |  group 0       +-----------+   +-----------+
 *    +-----------+   +-----------+  v
 *      |   |   |       |   |   |
 *      v   v   v       v   v   v
 *    ((barrier))       =============================
 *
 *    dest-memory:      +-----------+   +-----------+
 */
template <impl::data_type_t data_type>
struct cpu_reducer_t {
    typedef typename prec_traits<data_type>::type data_t;

    struct conf_t {
        conf_t() = default;
        conf_t &init(const reduce_balancer_t &balancer) {
            balancer_ = balancer;
            return *this;
        }

        void init_scratchpad(memory_tracking::registrar_t &scratchpad) const;

        reduce_balancer_t balancer_;
    };

    status_t create_kernel();

    cpu_reducer_t(const conf_t &conf);
    ~cpu_reducer_t();

    /** initializes the reducer.
     * Must be called from a single thread prior to actual usage. */
    void init(const memory_tracking::grantor_t &scratchpad) const {
        if (balancer().nthr_per_group_ == 1 || !dnnl_thr_syncable()) return;

        auto bctx = scratchpad.template get<simple_barrier::ctx_t>(
                memory_tracking::names::key_reducer_space_bctx);
        for (int i = 0; i < balancer().ngroups_; ++i)
            simple_barrier::ctx_init(&bctx[i]);
    }

    /** for a given thread returns the pointer where to put partial results.
     * The reduction destination @p dst must be provided as well (master
     * threads from each group use it for their partial results to reduce
     * memory pressure).
     *
     * @note the job offset is already applied by get_local_ptr(), which means
     *       all threads should start writing from the very beginning of the
     *       returned address.
     */
    data_t *get_local_ptr(int ithr, data_t *dst,
            const memory_tracking::grantor_t &scratchpad) const;

    /** performs the reduction with built-in synchronization. */
    void reduce(int ithr, data_t *dst,
            const memory_tracking::grantor_t &scratchpad) const {
        bool redundant_reduction
                = balancer().nthr_per_group_ == 1 || balancer().idle(ithr);
        if (redundant_reduction) return;

        auto bctx = scratchpad.template get<simple_barrier::ctx_t>(
                memory_tracking::names::key_reducer_space_bctx);
        simple_barrier::barrier(
                &bctx[balancer().group_id(ithr)], balancer().nthr_per_group_);

        reduce_nolock(ithr, dst, scratchpad);
    }

    void reduce_nolock(int ithr, data_t *dst,
            const memory_tracking::grantor_t &scratchpad) const;

    const reduce_balancer_t &balancer() const { return conf_.balancer_; }

private:
    static size_t space_per_thread(const reduce_balancer_t &balancer) {
        return balancer.njobs_per_group_ub_ * balancer.job_size_;
    }

    /* The scratchpad is organized as follows:
     *
     *   data_t space[nthr_][njobs_per_group_ub_][job_size_];
     *   simple_barrier::ctx_t barriers[ngroups_]; */

    const conf_t conf_;
    reducer_2d_driver_t<data_type> *drv_;

    DNNL_DISALLOW_COPY_AND_ASSIGN(cpu_reducer_t);
};
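
/* A minimal usage sketch (illustrative only: `scratchpad`, `dst`, `nthr`,
 * and `compute_partial` are hypothetical, and the per-job workspace layout
 * of job_size_-strided chunks is an assumption based on space_per_thread()):
 *
 *     cpu_reducer_t<data_type::f32> reducer(conf);
 *     reducer.create_kernel();
 *     reducer.init(scratchpad); // single thread, before the parallel region
 *     parallel(nthr, [&](int ithr, int nthr) {
 *         float *ws = reducer.get_local_ptr(ithr, dst, scratchpad);
 *         const int njobs = reducer.balancer().ithr_njobs(ithr);
 *         for (int j = 0; j < njobs; ++j) // local index; offset pre-applied
 *             compute_partial(ithr, j, ws + j * reducer.balancer().job_size_);
 *         reducer.reduce(ithr, dst, scratchpad); // barrier + reduction
 *     });
 */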

template <impl::data_type_t data_type>
struct cpu_reducer_2d_t {
    typedef typename prec_traits<data_type>::type data_t;

    struct conf_t {
        conf_t() = default;
        conf_t &init(const reduce_balancer_t &balancer, int job_size_x,
                int job_size_y, int x_block, int dst_x, int dst_y) {
            balancer_ = balancer;
            job_size_x_ = job_size_x;
            job_size_y_ = job_size_y;
            x_block_ = x_block;
            dst_x_ = dst_x;
            dst_y_ = dst_y;
            return *this;
        }

        void init_scratchpad(memory_tracking::registrar_t &scratchpad) const;

        reduce_balancer_t balancer_;
        int job_size_x_, job_size_y_, x_block_, dst_x_, dst_y_;
    };

    status_t create_kernel();

    cpu_reducer_2d_t(const conf_t &conf);
    ~cpu_reducer_2d_t();

    /** initializes the reducer.
     * Must be called from a single thread prior to actual usage. */
    void init(const memory_tracking::grantor_t &scratchpad) const {
        if (balancer().nthr_per_group_ == 1 || !dnnl_thr_syncable()) return;

        auto bctx = scratchpad.template get<simple_barrier::ctx_t>(
                memory_tracking::names::key_reducer_space_bctx);
        for (int i = 0; i < balancer().ngroups_; ++i)
            simple_barrier::ctx_init(&bctx[i]);
    }

    /** for a given thread returns the pointer where to put partial results */
    data_t *get_local_ptr(
            int ithr, const memory_tracking::grantor_t &scratchpad) const;

    /** performs the reduction with built-in synchronization. */
    void reduce(int ithr, data_t *dst,
            const memory_tracking::grantor_t &scratchpad) const {
        bool redundant_reduction
                = balancer().nthr_per_group_ == 1 || balancer().idle(ithr);
        if (redundant_reduction) return;

        auto bctx = scratchpad.template get<simple_barrier::ctx_t>(
                memory_tracking::names::key_reducer_space_bctx);
        simple_barrier::barrier(
                &bctx[balancer().group_id(ithr)], balancer().nthr_per_group_);

        reduce_nolock(ithr, dst, scratchpad);
    }

    void reduce_nolock(int ithr, data_t *dst,
            const memory_tracking::grantor_t &scratchpad) const;

    const reduce_balancer_t &balancer() const { return conf_.balancer_; }

private:
    static size_t space_per_thread(const reduce_balancer_t &balancer) {
        return balancer.njobs_per_group_ub_ * balancer.job_size_;
    }

    /* The scratchpad is organized as follows:
     *
     *   data_t space[nthr_][njobs_per_group_ub_][job_size_];
     *   simple_barrier::ctx_t barriers[ngroups_]; */

    const conf_t conf_;
    reducer_2d_driver_t<data_type> *drv_;

    int choose_x_blocking(int nx, int ny, int nthr_per_grp) const;
    void reduce_block(const data_t *space_base, data_t *dst, int job,
            int start_y, int start_x, int ny_start, int nx_start, int ny_step,
            int nx_step) const;

    DNNL_DISALLOW_COPY_AND_ASSIGN(cpu_reducer_2d_t);
};
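
/* A configuration sketch for the 2D variant (illustrative values only; the
 * relation job_size == job_size_x * job_size_y is an assumption inferred
 * from the parameter names, not stated by this header). Each job is then a
 * job_size_y x job_size_x tile, processed in x_block-wide chunks and reduced
 * into a dst_y x dst_x destination:
 *
 *     typename cpu_reducer_2d_t<data_type::f32>::conf_t conf;
 *     conf.init(reduce_balancer_t(nthr, job_size_x * job_size_y, njobs,
 *                       reduction_size, max_buffer_size),
 *             job_size_x, job_size_y, x_block, dst_x, dst_y);
 *     cpu_reducer_2d_t<data_type::f32> reducer(conf);
 */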

/** simple 1d accumulator: y[:] += x[:] */
template <impl::data_type_t data_type>
struct cpu_accumulator_1d_t {
    typedef typename prec_traits<data_type>::type data_t;

    cpu_accumulator_1d_t();
    ~cpu_accumulator_1d_t();
    void accumulate(data_t *dst, const data_t *src, size_t size);

    status_t create_kernel();

    reducer_2d_driver_t<data_type> *drv_;

    DNNL_DISALLOW_COPY_AND_ASSIGN(cpu_accumulator_1d_t);
};
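
/* A minimal usage sketch (illustrative only: `dst`, `src`, and `size` are
 * hypothetical buffers and a length supplied by the caller):
 *
 *     cpu_accumulator_1d_t<data_type::f32> acc;
 *     acc.create_kernel(); // generate the kernel before first use
 *     acc.accumulate(dst, src, size); // dst[0:size] += src[0:size]
 */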

} // namespace x64
} // namespace cpu
} // namespace impl
} // namespace dnnl

#endif

// vim: et ts=4 sw=4 cindent cino+=l0,\:4,N-s