/*******************************************************************************
* Copyright 2017-2020 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef CPU_X64_CPU_REDUCER_HPP
#define CPU_X64_CPU_REDUCER_HPP

#include <assert.h>

#include "common/c_types_map.hpp"
#include "common/dnnl_thread.hpp"
#include "common/memory_tracking.hpp"
#include "common/nstl.hpp"
#include "common/type_helpers.hpp"
#include "common/utils.hpp"
#include "oneapi/dnnl/dnnl_types.h"

#include "cpu/x64/cpu_barrier.hpp"

namespace dnnl {
namespace impl {
namespace cpu {
namespace x64 {
/** class to perform balancing over a 3D array
 *
 * Conceptually the reduction happens according to the picture below:
 *
 *    <--job_size->
 *    +-----------+ +-----------+           +-----------+  ^
 *    |           | |           |           |           |  |
 *    |           | |           |           |           |  |
 *    |     1     | |     2     |   . . .   |   njobs   |  | reduction_size
 *    |           | |           |           |           |  |
 *    |           | |           |           |           |  |
 *    +-----------+ +-----------+           +-----------+  v
 *
 *    |   |   |   |   |   |   |   |   |
 *    v   v   v   v   v   v   v   v   v
 *    ===================================================== vertical reduction
 *
 *    +-----------+ +-----------+   . . .   +-----------+ result
 *
 * In the simple case the result must be contiguous in memory;
 * @class cpu_reducer_t implements exactly this case.
 *
 * Threads are divided into groups. The groups are independent of each
 * other. Each group may work on several jobs (the distribution is not
 * necessarily uniform, since njobs might not be a multiple of the number of
 * groups). Threads within a group work on different parts of the reduction
 * dimension. Thread 0 in each group is called the master
 * (@sa reduce_balancer_t::master()).
 *
 * If the threading driver does not allow synchronization between a
 * sub-group of threads (e.g. TBB), the number of threads per group is
 * enforced to be 1.
 */
struct reduce_balancer_t {
    reduce_balancer_t() { init(1, 1, 1, 1, 0); } /* trivial balance */
    reduce_balancer_t(int nthr, int job_size, int njobs, int reduction_size,
            size_t max_buffer_size, bool lock_free = false) {
        init(nthr, job_size, njobs, reduction_size, max_buffer_size,
                lock_free);
    }

    reduce_balancer_t &init(int nthr, int job_size, int njobs,
            int reduction_size, size_t max_buffer_size,
            bool lock_free = false) {
        allow_nthr_in_group_ = lock_free ? true : dnnl_thr_syncable();
        nthr_ = nthr;
        job_size_ = job_size;
        njobs_ = njobs;
        reduction_size_ = reduction_size;
        max_buffer_size_ = max_buffer_size;
        balance();
        return *this;
    }

    bool allow_nthr_in_group_;
    int nthr_;
    int job_size_, njobs_, reduction_size_;

    int ngroups_; /**< number of independent work (thread) groups */
    int nthr_per_group_; /**< number of threads within a single work group */
    int njobs_per_group_ub_; /**< the max # of jobs within a work group */

    bool master(int ithr) const { return id_in_group(ithr) == 0; }
    bool idle(int ithr) const { return ithr >= nthr_per_group_ * ngroups_; }

    int group_id(int ithr) const { return ithr / nthr_per_group_; }
    int id_in_group(int ithr) const { return ithr % nthr_per_group_; }

    int grp_njobs(int grp) const {
        if (grp >= ngroups_) return 0;
        return njobs_ / ngroups_ + (grp < njobs_ % ngroups_);
    }
    int grp_job_off(int grp) const {
        if (grp >= ngroups_) return njobs_;
        return njobs_ / ngroups_ * grp + nstl::min(grp, njobs_ % ngroups_);
    }
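
    /* Worked example of the distribution above (illustrative numbers only):
     * with njobs_ = 10 and ngroups_ = 4 the four groups own 3, 3, 2, 2 jobs
     * and start at job offsets 0, 3, 6, and 8, i.e.
     *     grp_njobs(0..3)   -> 3, 3, 2, 2
     *     grp_job_off(0..3) -> 0, 3, 6, 8
     */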

    int ithr_njobs(int ithr) const { return grp_njobs(group_id(ithr)); }
    int ithr_job_off(int ithr) const { return grp_job_off(group_id(ithr)); }

private:
    size_t max_buffer_size_;
    void balance();
};
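
/* Usage sketch for reduce_balancer_t (illustrative only: the sizes below are
 * hypothetical, and the plain loop stands in for a parallel region). Each
 * thread can query whether it has work, which group it belongs to, and the
 * job range its group owns:
 *
 *     // nthr = 8, job_size = 256, njobs = 10, reduction_size = 1024,
 *     // max_buffer_size = 4 MiB
 *     reduce_balancer_t balancer(8, 256, 10, 1024, (size_t)4 << 20);
 *     for (int ithr = 0; ithr < 8; ++ithr) {
 *         if (balancer.idle(ithr)) continue; // thread got no work at all
 *         int grp = balancer.group_id(ithr);
 *         int job_off = balancer.ithr_job_off(ithr); // group's first job
 *         int njobs = balancer.ithr_njobs(ithr); // jobs owned by the group
 *         bool is_master = balancer.master(ithr); // id 0 within the group
 *     }
 */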

/** forward declaration of reduce driver */
template <impl::data_type_t data_type>
struct reducer_2d_driver_t;

/** class to perform a reduction over a 3D array
 *
 * Balancing is based on @class reduce_balancer_t.
 * Restriction: the result of the reduction must be contiguous in memory.
 * The reduction happens according to the picture below (once more):
 *
 *    <--job_size->
 *    +-----------+ +-----------+           +-----------+  ^
 *    |           | |           |           |           |  |
 *    |           | |           |           |           |  |
 *    |     1     | |     2     |   . . .   |   njobs   |  | reduction_size
 *    |           | |           |           |           |  |
 *    |           | |           |           |           |  |
 *    +-----------+ +-----------+           +-----------+  v
 *
 *    |   |   |   |   |   |   |   |   |
 *    v   v   v   v   v   v   v   v   v
 *    ===================================================== vertical reduction
 *
 *    +-----------+ +-----------+   . . .   +-----------+ (contiguous) result
 *
 * An example of how the work might be shared is shown below.
 *
 * In this example group 0 owns 2 (independent) jobs -- the 2 big squares.
 * The number of threads per group is also 2 (thread 0 and thread 1 of
 * group 0). Master threads (i.e. threads with id 0 in the corresponding
 * group) put their partial results directly into the destination memory,
 * while all the other threads within the group use the workspace (in the
 * picture, only thread 1). Once the intermediate results are obtained, each
 * group reduces its own part (its own jobs) into the destination memory.
 *
 *    <------- group 0 ------->
 *
 *    +-----------+ +-----------+  ^
 *    |           | |           |  | thread 0 of  reduces to the dest-memory
 *    |           | |           |  | group 0      +-----------+ +-----------+
 *    |- - - - - -| |- - - - - -|  X
 *    |           | |           |  | thread 1 of  reduces to workspace[tid=1]:
 *    |           | |           |  | group 0      +-----------+ +-----------+
 *    +-----------+ +-----------+  v
 *      |   |   |     |   |   |
 *      v   v   v     v   v   v
 *    ((barrier)) =============================
 *
 *    dest-memory: +-----------+ +-----------+
 */
template <impl::data_type_t data_type>
struct cpu_reducer_t {
    typedef typename prec_traits<data_type>::type data_t;

    struct conf_t {
        conf_t() = default;
        conf_t &init(const reduce_balancer_t &balancer) {
            balancer_ = balancer;
            return *this;
        }

        void init_scratchpad(memory_tracking::registrar_t &scratchpad) const;

        reduce_balancer_t balancer_;
    };

    status_t create_kernel();

    cpu_reducer_t(const conf_t &conf);
    ~cpu_reducer_t();

    /** initializes the reducer.
     * Must be called from a single thread prior to actual usage. */
    void init(const memory_tracking::grantor_t &scratchpad) const {
        if (balancer().nthr_per_group_ == 1 || !dnnl_thr_syncable()) return;

        auto bctx = scratchpad.template get<simple_barrier::ctx_t>(
                memory_tracking::names::key_reducer_space_bctx);
        for (int i = 0; i < balancer().ngroups_; ++i)
            simple_barrier::ctx_init(&bctx[i]);
    }

    /** for the given thread returns the pointer where to put its partial
     * results. The reduction destination @p dst must be provided as well
     * (master threads from each group use it directly for their partial
     * results to reduce memory pressure).
     *
     * @note The job offset is already applied by get_local_ptr(), which
     *       means all threads should start writing from the very beginning
     *       of the returned address.
     */
    data_t *get_local_ptr(int ithr, data_t *dst,
            const memory_tracking::grantor_t &scratchpad) const;

    /** performs the reduction with built-in synchronization. */
    void reduce(int ithr, data_t *dst,
            const memory_tracking::grantor_t &scratchpad) const {
        bool redundant_reduction
                = balancer().nthr_per_group_ == 1 || balancer().idle(ithr);
        if (redundant_reduction) return;

        auto bctx = scratchpad.template get<simple_barrier::ctx_t>(
                memory_tracking::names::key_reducer_space_bctx);
        simple_barrier::barrier(
                &bctx[balancer().group_id(ithr)], balancer().nthr_per_group_);

        reduce_nolock(ithr, dst, scratchpad);
    }

    void reduce_nolock(int ithr, data_t *dst,
            const memory_tracking::grantor_t &scratchpad) const;

    const reduce_balancer_t &balancer() const { return conf_.balancer_; }

private:
    static size_t space_per_thread(const reduce_balancer_t &balancer) {
        return balancer.njobs_per_group_ub_ * balancer.job_size_;
    }

    /* The scratchpad is organized as follows:
     *
     * data_t space[nthr_][njobs_per_group_ub_][job_size_];
     * simple_barrier::ctx_t barriers[ngroups_]; */
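    /* For example (hypothetical sizes): with nthr_ = 8,
     * njobs_per_group_ub_ = 3, and job_size_ = 256, each thread owns
     * space_per_thread() = 3 * 256 = 768 data_t elements, so the space
     * array holds 8 * 768 = 6144 elements in total. */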

    const conf_t conf_;
    reducer_2d_driver_t<data_type> *drv_;

    DNNL_DISALLOW_COPY_AND_ASSIGN(cpu_reducer_t);
};
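
/* Usage sketch for cpu_reducer_t (illustrative only: `dst`, `scratchpad`,
 * `balancer`, and `nthr` are assumed to be set up by the caller, and
 * `parallel` stands in for the threading driver's parallel loop):
 *
 *     cpu_reducer_t<data_type::f32>::conf_t conf;
 *     conf.init(balancer);
 *     cpu_reducer_t<data_type::f32> reducer(conf);
 *     if (reducer.create_kernel() != status::success) return;
 *
 *     reducer.init(scratchpad); // once, from a single thread
 *     parallel(nthr, [&](int ithr, int nthr) {
 *         float *local = reducer.get_local_ptr(ithr, dst, scratchpad);
 *         // ... write this thread's partial results starting at local ...
 *         reducer.reduce(ithr, dst, scratchpad); // barrier + reduction
 *     });
 */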

template <impl::data_type_t data_type>
struct cpu_reducer_2d_t {
    typedef typename prec_traits<data_type>::type data_t;

    struct conf_t {
        conf_t() = default;
        conf_t &init(const reduce_balancer_t &balancer, int job_size_x,
                int job_size_y, int x_block, int dst_x, int dst_y) {
            balancer_ = balancer;
            job_size_x_ = job_size_x;
            job_size_y_ = job_size_y;
            x_block_ = x_block;
            dst_x_ = dst_x;
            dst_y_ = dst_y;
            return *this;
        }

        void init_scratchpad(memory_tracking::registrar_t &scratchpad) const;

        reduce_balancer_t balancer_;
        int job_size_x_, job_size_y_, x_block_, dst_x_, dst_y_;
    };

    status_t create_kernel();

    cpu_reducer_2d_t(const conf_t &conf);
    ~cpu_reducer_2d_t();

    /** initializes the reducer.
     * Must be called from a single thread prior to actual usage. */
    void init(const memory_tracking::grantor_t &scratchpad) const {
        if (balancer().nthr_per_group_ == 1 || !dnnl_thr_syncable()) return;

        auto bctx = scratchpad.template get<simple_barrier::ctx_t>(
                memory_tracking::names::key_reducer_space_bctx);
        for (int i = 0; i < balancer().ngroups_; ++i)
            simple_barrier::ctx_init(&bctx[i]);
    }

    /** for the given thread returns the pointer where to put its partial
     * results */
    data_t *get_local_ptr(
            int ithr, const memory_tracking::grantor_t &scratchpad) const;

    /** performs the reduction with built-in synchronization. */
    void reduce(int ithr, data_t *dst,
            const memory_tracking::grantor_t &scratchpad) const {
        bool redundant_reduction
                = balancer().nthr_per_group_ == 1 || balancer().idle(ithr);
        if (redundant_reduction) return;

        auto bctx = scratchpad.template get<simple_barrier::ctx_t>(
                memory_tracking::names::key_reducer_space_bctx);
        simple_barrier::barrier(
                &bctx[balancer().group_id(ithr)], balancer().nthr_per_group_);

        reduce_nolock(ithr, dst, scratchpad);
    }

    void reduce_nolock(int ithr, data_t *dst,
            const memory_tracking::grantor_t &scratchpad) const;

    const reduce_balancer_t &balancer() const { return conf_.balancer_; }

private:
    static size_t space_per_thread(const reduce_balancer_t &balancer) {
        return balancer.njobs_per_group_ub_ * balancer.job_size_;
    }

    /* The scratchpad is organized as follows:
     *
     * data_t space[nthr_][njobs_per_group_ub_][job_size_];
     * simple_barrier::ctx_t barriers[ngroups_]; */

    const conf_t conf_;
    reducer_2d_driver_t<data_type> *drv_;

    int choose_x_blocking(int nx, int ny, int nthr_per_grp) const;
    void reduce_block(const data_t *space_base, data_t *dst, int job,
            int start_y, int start_x, int ny_start, int nx_start, int ny_step,
            int nx_step) const;

    DNNL_DISALLOW_COPY_AND_ASSIGN(cpu_reducer_2d_t);
};
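
/* Configuration sketch for the 2D variant (illustrative only: the geometry
 * values are hypothetical and would normally be derived from the destination
 * shape; `balancer` is assumed to exist). Note that, unlike cpu_reducer_t,
 * get_local_ptr() takes no destination pointer here: every thread writes its
 * partial results to the workspace.
 *
 *     const int job_size_x = 16, job_size_y = 4;
 *     const int x_block = 4, dst_x = 64, dst_y = 32;
 *     cpu_reducer_2d_t<data_type::f32>::conf_t conf;
 *     conf.init(balancer, job_size_x, job_size_y, x_block, dst_x, dst_y);
 *     cpu_reducer_2d_t<data_type::f32> reducer(conf);
 *     if (reducer.create_kernel() != status::success) return;
 */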

/** simple 1D accumulator: y[:] += x[:] */
template <impl::data_type_t data_type>
struct cpu_accumulator_1d_t {
    typedef typename prec_traits<data_type>::type data_t;

    cpu_accumulator_1d_t();
    ~cpu_accumulator_1d_t();
    void accumulate(data_t *dst, const data_t *src, size_t size);

    status_t create_kernel();

    reducer_2d_driver_t<data_type> *drv_;

    DNNL_DISALLOW_COPY_AND_ASSIGN(cpu_accumulator_1d_t);
};
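
/* Usage sketch for cpu_accumulator_1d_t (illustrative only: `dst`, `src`,
 * and `size` are assumed to be caller-provided buffers of matching length):
 *
 *     cpu_accumulator_1d_t<data_type::f32> acc;
 *     if (acc.create_kernel() != status::success) return;
 *     acc.accumulate(dst, src, size); // dst[i] += src[i] for i in [0, size)
 */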

} // namespace x64
} // namespace cpu
} // namespace impl
} // namespace dnnl

#endif

// vim: et ts=4 sw=4 cindent cino+=l0,\:4,N-s
