1 | /* Standard C headers */ |
2 | #include <assert.h> |
3 | #include <stdbool.h> |
4 | #include <stdint.h> |
5 | #include <stdlib.h> |
6 | #include <string.h> |
7 | |
8 | #if PTHREADPOOL_USE_CPUINFO |
9 | #include <cpuinfo.h> |
10 | #endif |
11 | |
12 | /* Dependencies */ |
13 | #include <fxdiv.h> |
14 | |
15 | /* Public library header */ |
16 | #include <pthreadpool.h> |
17 | |
18 | /* Internal library headers */ |
19 | #include "threadpool-atomics.h" |
20 | #include "threadpool-object.h" |
21 | #include "threadpool-utils.h" |
22 | |
23 | |
24 | size_t pthreadpool_get_threads_count(struct pthreadpool* threadpool) { |
25 | if (threadpool == NULL) { |
26 | return 1; |
27 | } |
28 | |
29 | return threadpool->threads_count.value; |
30 | } |
31 | |
/*
 * Worker entry for a 1D parallelize call: drain this thread's own slice of
 * the index range, then steal leftover items from other threads, and finally
 * publish this thread's writes with a release fence.
 */
static void thread_parallelize_1d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	/* Relaxed loads: synchronization with the thread that stored task/argument
	 * happens outside this function (in the dispatch path — not visible here). */
	const pthreadpool_task_1d_t task = (pthreadpool_task_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items.
	 * The owner consumes from the FRONT (range_start++); the atomic
	 * try-decrement of range_length is the single arbiter of remaining work,
	 * shared with any thieves below. */
	size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, range_start++);
	}

	/* There still may be other threads with work.
	 * Visit victims in decreasing thread-id order starting from the preceding
	 * thread (wrapping modulo threads_count), and steal from the BACK of each
	 * victim's range (decrement-fetch of range_end) so thief and owner never
	 * claim the same index. */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
	     tid != thread_number;
	     tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			task(argument, index);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
62 | |
/*
 * Worker entry for a 1D parallelize call whose task also receives a CPU
 * microarchitecture index. Identical work-stealing scheme to
 * thread_parallelize_1d; only the task signature differs.
 */
static void thread_parallelize_1d_with_uarch(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	/* Relaxed loads: synchronization with the dispatcher that stored
	 * task/argument happens outside this function. */
	const pthreadpool_task_1d_with_id_t task = (pthreadpool_task_1d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const uint32_t default_uarch_index = threadpool->params.parallelize_1d_with_uarch.default_uarch_index;
	uint32_t uarch_index = default_uarch_index;
#if PTHREADPOOL_USE_CPUINFO
	/* Query the uarch of the core this worker currently runs on; clamp to the
	 * caller-provided maximum so the index stays within the caller's range
	 * (presumably used to select per-uarch task variants — confirm in caller). */
	uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
	if (uarch_index > threadpool->params.parallelize_1d_with_uarch.max_uarch_index) {
		uarch_index = default_uarch_index;
	}
#endif

	/* Process thread's own range of items: owner consumes from the front,
	 * gated by the atomic try-decrement of range_length. */
	size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, uarch_index, range_start++);
	}

	/* There still may be other threads with work: steal from the back of each
	 * victim's range (decrement range_end), visiting victims in decreasing
	 * id order starting from the preceding thread. */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
	     tid != thread_number;
	     tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			task(argument, uarch_index, index);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
102 | |
/*
 * Worker entry for a tiled 1D parallelize call: each work item is a tile of
 * up to `tile` consecutive indices. The linear work-item index is a tile
 * number; it is scaled by `tile` to get the tile's starting element, and
 * min() clamps the final, possibly partial tile to the range end.
 */
static void thread_parallelize_1d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	/* Relaxed loads: synchronization with the dispatcher happens outside
	 * this function. */
	const pthreadpool_task_1d_tile_1d_t task = (pthreadpool_task_1d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items: owner walks tiles from the front,
	 * gated by the atomic try-decrement of range_length. */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const size_t tile = threadpool->params.parallelize_1d_tile_1d.tile;
	size_t tile_start = range_start * tile;

	const size_t range = threadpool->params.parallelize_1d_tile_1d.range;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, tile_start, min(range - tile_start, tile));
		tile_start += tile;
	}

	/* There still may be other threads with work: steal tiles from the back
	 * of each victim's range (decrement range_end), visiting victims in
	 * decreasing id order starting from the preceding thread. */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
	     tid != thread_number;
	     tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t tile_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const size_t tile_start = tile_index * tile;
			task(argument, tile_start, min(range - tile_start, tile));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
139 | |
/*
 * Worker entry for a 2D parallelize call. The 2D index space is flattened to
 * a linear index (i * range_j + j); fxdiv divides by the precomputed divisor
 * range_j to recover (i, j) without a hardware division.
 */
static void thread_parallelize_2d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	/* Relaxed loads: synchronization with the dispatcher happens outside
	 * this function. */
	const pthreadpool_task_2d_t task = (pthreadpool_task_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items: decompose the starting linear
	 * index once, then carry (i, j) forward incrementally in the loop. */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_2d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(range_start, range_j);
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;

	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j);
		/* Advance in row-major order: j is the fast axis, i the slow axis. */
		if (++j == range_j.value) {
			j = 0;
			i += 1;
		}
	}

	/* There still may be other threads with work: steal from the back of each
	 * victim's range (decrement range_end); stolen indices are not contiguous
	 * with our own, so each one is decomposed from scratch. */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
	     tid != thread_number;
	     tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(linear_index, range_j);
			task(argument, index_i_j.quotient, index_i_j.remainder);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
180 | |
/*
 * Worker entry for a 2D parallelize call tiled along the j axis. Each linear
 * work item is an (i, tile-of-j) pair: fxdiv splits it by tile_range_j (the
 * number of j tiles per row), the j-tile index is scaled by tile_j, and
 * min() clamps the last, possibly partial tile of each row.
 */
static void thread_parallelize_2d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	/* Relaxed loads: synchronization with the dispatcher happens outside
	 * this function. */
	const pthreadpool_task_2d_tile_1d_t task = (pthreadpool_task_2d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items: decompose the starting index once,
	 * then carry (i, start_j) forward incrementally. */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_2d_tile_1d.tile_range_j;
	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(range_start, tile_range_j);
	const size_t tile_j = threadpool->params.parallelize_2d_tile_1d.tile_j;
	size_t i = tile_index_i_j.quotient;
	size_t start_j = tile_index_i_j.remainder * tile_j;

	const size_t range_j = threadpool->params.parallelize_2d_tile_1d.range_j;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, start_j, min(range_j - start_j, tile_j));
		/* Advance to the next j tile; wrap to the next row when past range_j. */
		start_j += tile_j;
		if (start_j >= range_j) {
			start_j = 0;
			i += 1;
		}
	}

	/* There still may be other threads with work: steal from the back of each
	 * victim's range (decrement range_end), decomposing each stolen linear
	 * index from scratch. */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
	     tid != thread_number;
	     tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(linear_index, tile_range_j);
			const size_t start_j = tile_index_i_j.remainder * tile_j;
			task(argument, tile_index_i_j.quotient, start_j, min(range_j - start_j, tile_j));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
225 | |
/*
 * Worker entry for a 2D parallelize call tiled along both axes. Each linear
 * work item is a (tile-of-i, tile-of-j) pair: fxdiv splits it by
 * tile_range_j, both tile indices are scaled by their tile sizes, and min()
 * clamps the trailing partial tiles on each axis.
 */
static void thread_parallelize_2d_tile_2d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	/* Relaxed loads: synchronization with the dispatcher happens outside
	 * this function. */
	const pthreadpool_task_2d_tile_2d_t task = (pthreadpool_task_2d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items: decompose the starting index once,
	 * then carry (start_i, start_j) forward incrementally. */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_2d_tile_2d.tile_range_j;
	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(range_start, tile_range_j);
	const size_t tile_i = threadpool->params.parallelize_2d_tile_2d.tile_i;
	const size_t tile_j = threadpool->params.parallelize_2d_tile_2d.tile_j;
	size_t start_i = tile_index_i_j.quotient * tile_i;
	size_t start_j = tile_index_i_j.remainder * tile_j;

	const size_t range_i = threadpool->params.parallelize_2d_tile_2d.range_i;
	const size_t range_j = threadpool->params.parallelize_2d_tile_2d.range_j;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
		/* j tiles are the fast axis; wrap to the next row of i tiles. */
		start_j += tile_j;
		if (start_j >= range_j) {
			start_j = 0;
			start_i += tile_i;
		}
	}

	/* There still may be other threads with work: steal from the back of each
	 * victim's range (decrement range_end), decomposing each stolen linear
	 * index from scratch. */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
	     tid != thread_number;
	     tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(linear_index, tile_range_j);
			const size_t start_i = tile_index_i_j.quotient * tile_i;
			const size_t start_j = tile_index_i_j.remainder * tile_j;
			task(argument, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
273 | |
/*
 * Worker entry for a 2D, both-axes-tiled parallelize call whose task also
 * receives a CPU microarchitecture index. Same tiling and work-stealing
 * scheme as thread_parallelize_2d_tile_2d.
 */
static void thread_parallelize_2d_tile_2d_with_uarch(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	/* Relaxed loads: synchronization with the dispatcher happens outside
	 * this function. */
	const pthreadpool_task_2d_tile_2d_with_id_t task = (pthreadpool_task_2d_tile_2d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const uint32_t default_uarch_index = threadpool->params.parallelize_2d_tile_2d_with_uarch.default_uarch_index;
	uint32_t uarch_index = default_uarch_index;
#if PTHREADPOOL_USE_CPUINFO
	/* Query the uarch of the core this worker currently runs on; clamp to the
	 * caller-provided maximum (presumably used to select per-uarch task
	 * variants — confirm in caller). */
	uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
	if (uarch_index > threadpool->params.parallelize_2d_tile_2d_with_uarch.max_uarch_index) {
		uarch_index = default_uarch_index;
	}
#endif

	/* Process thread's own range of items: decompose the starting linear index
	 * once via fxdiv, then carry (start_i, start_j) forward incrementally. */
	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_range_j;
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_result_size_t index = fxdiv_divide_size_t(range_start, tile_range_j);
	const size_t range_i = threadpool->params.parallelize_2d_tile_2d_with_uarch.range_i;
	const size_t tile_i = threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_i;
	const size_t range_j = threadpool->params.parallelize_2d_tile_2d_with_uarch.range_j;
	const size_t tile_j = threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_j;
	size_t start_i = index.quotient * tile_i;
	size_t start_j = index.remainder * tile_j;

	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		/* min() clamps the trailing partial tile on each axis. */
		task(argument, uarch_index, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
		start_j += tile_j;
		if (start_j >= range_j) {
			start_j = 0;
			start_i += tile_i;
		}
	}

	/* There still may be other threads with work: steal from the back of each
	 * victim's range (decrement range_end), decomposing each stolen linear
	 * index from scratch. */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
	     tid != thread_number;
	     tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(linear_index, tile_range_j);
			const size_t start_i = tile_index_i_j.quotient * tile_i;
			const size_t start_j = tile_index_i_j.remainder * tile_j;
			task(argument, uarch_index, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
330 | |
/*
 * Worker entry for a 3D parallelize call. The linear index encodes
 * ((i * range_j) + j) * range_k + k; two fxdiv divisions (by range_k, then
 * range_j) recover (i, j, k) without hardware division.
 */
static void thread_parallelize_3d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	/* Relaxed loads: synchronization with the dispatcher happens outside
	 * this function. */
	const pthreadpool_task_3d_t task = (pthreadpool_task_3d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items: decompose the starting index once,
	 * then carry (i, j, k) forward incrementally. */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_3d.range_k;
	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(range_start, range_k);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_3d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_ij_k.remainder;

	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, k);
		/* Odometer-style advance: k fastest, then j, then i. */
		if (++k == range_k.value) {
			k = 0;
			if (++j == range_j.value) {
				j = 0;
				i += 1;
			}
		}
	}

	/* There still may be other threads with work: steal from the back of each
	 * victim's range (decrement range_end), decomposing each stolen linear
	 * index from scratch. */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
	     tid != thread_number;
	     tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(linear_index, range_k);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
378 | |
/*
 * Worker entry for a 3D parallelize call tiled along the k axis. Each linear
 * work item is an (i, j, tile-of-k) triple: fxdiv splits it by tile_range_k
 * (k tiles per (i, j)) and then by range_j; the k-tile index is scaled by
 * tile_k, and min() clamps the trailing partial tile.
 */
static void thread_parallelize_3d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	/* Relaxed loads: synchronization with the dispatcher happens outside
	 * this function. */
	const pthreadpool_task_3d_tile_1d_t task = (pthreadpool_task_3d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items: decompose the starting index once,
	 * then carry (i, j, start_k) forward incrementally. */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_k = threadpool->params.parallelize_3d_tile_1d.tile_range_k;
	const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_3d_tile_1d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j);
	const size_t tile_k = threadpool->params.parallelize_3d_tile_1d.tile_k;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t start_k = tile_index_ij_k.remainder * tile_k;

	const size_t range_k = threadpool->params.parallelize_3d_tile_1d.range_k;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, start_k, min(range_k - start_k, tile_k));
		/* Odometer-style advance: k tiles fastest, then j, then i. */
		start_k += tile_k;
		if (start_k >= range_k) {
			start_k = 0;
			if (++j == range_j.value) {
				j = 0;
				i += 1;
			}
		}
	}

	/* There still may be other threads with work: steal from the back of each
	 * victim's range (decrement range_end), decomposing each stolen linear
	 * index from scratch. */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
	     tid != thread_number;
	     tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j);
			const size_t start_k = tile_index_ij_k.remainder * tile_k;
			task(argument, index_i_j.quotient, index_i_j.remainder, start_k, min(range_k - start_k, tile_k));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
430 | |
/*
 * Worker entry for a 3D parallelize call tiled along the j and k axes. Each
 * linear work item is an (i, tile-of-j, tile-of-k) triple: fxdiv splits it
 * by tile_range_k and then tile_range_j; tile indices are scaled by their
 * tile sizes, and min() clamps the trailing partial tiles.
 */
static void thread_parallelize_3d_tile_2d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	/* Relaxed loads: synchronization with the dispatcher happens outside
	 * this function. */
	const pthreadpool_task_3d_tile_2d_t task = (pthreadpool_task_3d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items: decompose the starting index once,
	 * then carry (i, start_j, start_k) forward incrementally. */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_k = threadpool->params.parallelize_3d_tile_2d.tile_range_k;
	const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k);
	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_3d_tile_2d.tile_range_j;
	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
	const size_t tile_j = threadpool->params.parallelize_3d_tile_2d.tile_j;
	const size_t tile_k = threadpool->params.parallelize_3d_tile_2d.tile_k;
	size_t i = tile_index_i_j.quotient;
	size_t start_j = tile_index_i_j.remainder * tile_j;
	size_t start_k = tile_index_ij_k.remainder * tile_k;

	const size_t range_k = threadpool->params.parallelize_3d_tile_2d.range_k;
	const size_t range_j = threadpool->params.parallelize_3d_tile_2d.range_j;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
		/* Odometer-style advance: k tiles fastest, then j tiles, then i. */
		start_k += tile_k;
		if (start_k >= range_k) {
			start_k = 0;
			start_j += tile_j;
			if (start_j >= range_j) {
				start_j = 0;
				i += 1;
			}
		}
	}

	/* There still may be other threads with work: steal from the back of each
	 * victim's range (decrement range_end), decomposing each stolen linear
	 * index from scratch. */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
	     tid != thread_number;
	     tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k);
			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
			const size_t start_j = tile_index_i_j.remainder * tile_j;
			const size_t start_k = tile_index_ij_k.remainder * tile_k;
			task(argument, tile_index_i_j.quotient, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
486 | |
/*
 * Worker entry for a 3D, (j, k)-tiled parallelize call whose task also
 * receives a CPU microarchitecture index. Same tiling and work-stealing
 * scheme as thread_parallelize_3d_tile_2d.
 */
static void thread_parallelize_3d_tile_2d_with_uarch(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	/* Relaxed loads: synchronization with the dispatcher happens outside
	 * this function. */
	const pthreadpool_task_3d_tile_2d_with_id_t task = (pthreadpool_task_3d_tile_2d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const uint32_t default_uarch_index = threadpool->params.parallelize_3d_tile_2d_with_uarch.default_uarch_index;
	uint32_t uarch_index = default_uarch_index;
#if PTHREADPOOL_USE_CPUINFO
	/* Query the uarch of the core this worker currently runs on; clamp to the
	 * caller-provided maximum (presumably used to select per-uarch task
	 * variants — confirm in caller). */
	uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
	if (uarch_index > threadpool->params.parallelize_3d_tile_2d_with_uarch.max_uarch_index) {
		uarch_index = default_uarch_index;
	}
#endif

	/* Process thread's own range of items: decompose the starting index once
	 * (fxdiv by tile_range_k, then tile_range_j), then carry
	 * (i, start_j, start_k) forward incrementally. */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_k = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_range_k;
	const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k);
	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_range_j;
	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
	const size_t tile_j = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_j;
	const size_t tile_k = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_k;
	size_t i = tile_index_i_j.quotient;
	size_t start_j = tile_index_i_j.remainder * tile_j;
	size_t start_k = tile_index_ij_k.remainder * tile_k;

	const size_t range_k = threadpool->params.parallelize_3d_tile_2d_with_uarch.range_k;
	const size_t range_j = threadpool->params.parallelize_3d_tile_2d_with_uarch.range_j;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		/* min() clamps the trailing partial tile on each tiled axis. */
		task(argument, uarch_index, i, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
		start_k += tile_k;
		if (start_k >= range_k) {
			start_k = 0;
			start_j += tile_j;
			if (start_j >= range_j) {
				start_j = 0;
				i += 1;
			}
		}
	}

	/* There still may be other threads with work: steal from the back of each
	 * victim's range (decrement range_end), decomposing each stolen linear
	 * index from scratch. */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
	     tid != thread_number;
	     tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k);
			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
			const size_t start_j = tile_index_i_j.remainder * tile_j;
			const size_t start_k = tile_index_ij_k.remainder * tile_k;
			task(argument, uarch_index, tile_index_i_j.quotient, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
551 | |
/*
 * Worker entry for a 4D parallelize call. The linear index is decomposed in
 * two stages: fxdiv by range_kl (= range_k * range_l, precomputed by the
 * dispatcher — confirm there) yields an (i, j) part and a (k, l) part, which
 * are then split by range_j and range_l respectively.
 */
static void thread_parallelize_4d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	/* Relaxed loads: synchronization with the dispatcher happens outside
	 * this function. */
	const pthreadpool_task_4d_t task = (pthreadpool_task_4d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items: decompose the starting index once,
	 * then carry (i, j, k, l) forward incrementally. */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t range_kl = threadpool->params.parallelize_4d.range_kl;
	const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(range_start, range_kl);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t range_l = threadpool->params.parallelize_4d.range_l;
	const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_k_l.quotient;
	size_t l = index_k_l.remainder;

	/* Plain (non-divisor) range_k: only compared against, never divided by. */
	const size_t range_k = threadpool->params.parallelize_4d.range_k;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, k, l);
		/* Odometer-style advance: l fastest, then k, then j, then i. */
		if (++l == range_l.value) {
			l = 0;
			if (++k == range_k) {
				k = 0;
				if (++j == range_j.value) {
					j = 0;
					i += 1;
				}
			}
		}
	}

	/* There still may be other threads with work: steal from the back of each
	 * victim's range (decrement range_end), decomposing each stolen linear
	 * index from scratch. */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
	     tid != thread_number;
	     tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(linear_index, range_kl);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
			task(argument, index_i_j.quotient, index_i_j.remainder, index_k_l.quotient, index_k_l.remainder);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
607 | |
/*
 * Per-thread worker for pthreadpool_parallelize_4d_tile_1d.
 *
 * Each linear work item maps to a (i, j, k, tile-of-l) tuple:
 *   linear = (i * range_j + j) * tile_range_kl + (k * tile_range_l + l / tile_l)
 * The thread first drains its own [range_start, range_end) slice, tracking the
 * 4D indices incrementally, then steals leftover items from other threads'
 * ranges, decoding each stolen linear index from scratch with fxdiv.
 */
static void thread_parallelize_4d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	/* Load the user task pointer and its argument (relaxed atomic loads). */
	const pthreadpool_task_4d_tile_1d_t task = (pthreadpool_task_4d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Decode the starting linear index into (i, j, k, start_l) using the
	 * precomputed fxdiv divisors (avoids hardware integer division). */
	const struct fxdiv_divisor_size_t tile_range_kl = threadpool->params.parallelize_4d_tile_1d.tile_range_kl;
	const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(range_start, tile_range_kl);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d_tile_1d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t tile_range_l = threadpool->params.parallelize_4d_tile_1d.tile_range_l;
	const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
	const size_t tile_l = threadpool->params.parallelize_4d_tile_1d.tile_l;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = tile_index_k_l.quotient;
	size_t start_l = tile_index_k_l.remainder * tile_l;

	const size_t range_k = threadpool->params.parallelize_4d_tile_1d.range_k;
	const size_t range_l = threadpool->params.parallelize_4d_tile_1d.range_l;
	/* Consume the thread's own items from the front; the tile length passed to
	 * the task is clamped so the last tile of the l dimension is partial. */
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, k, start_l, min(range_l - start_l, tile_l));
		/* Advance the (i, j, k, start_l) tuple like a mixed-radix counter. */
		start_l += tile_l;
		if (start_l >= range_l) {
			start_l = 0;
			if (++k == range_k) {
				k = 0;
				if (++j == range_j.value) {
					j = 0;
					i += 1;
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	/* Steal from other threads, scanning from thread_number-1 downwards
	 * (modulo threads_count) until wrapping back to this thread. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			/* Steal from the tail (range_end) while the owner consumes from
			 * the head; each stolen linear index is decoded independently. */
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
			const size_t start_l = tile_index_k_l.remainder * tile_l;
			task(argument, index_i_j.quotient, index_i_j.remainder, tile_index_k_l.quotient, start_l, min(range_l - start_l, tile_l));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
667 | |
/*
 * Per-thread worker for pthreadpool_parallelize_4d_tile_2d.
 *
 * Like thread_parallelize_4d_tile_1d, but both the k and l dimensions are
 * tiled: each linear work item maps to (i, j, tile-of-k, tile-of-l).
 * The thread drains its own range tracking indices incrementally, then steals
 * leftover items from other threads, decoding each stolen index with fxdiv.
 */
static void thread_parallelize_4d_tile_2d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	/* Load the user task pointer and its argument (relaxed atomic loads). */
	const pthreadpool_task_4d_tile_2d_t task = (pthreadpool_task_4d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Decode the starting linear index into (i, j, start_k, start_l) using the
	 * precomputed fxdiv divisors. */
	const struct fxdiv_divisor_size_t tile_range_kl = threadpool->params.parallelize_4d_tile_2d.tile_range_kl;
	const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(range_start, tile_range_kl);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d_tile_2d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t tile_range_l = threadpool->params.parallelize_4d_tile_2d.tile_range_l;
	const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
	const size_t tile_k = threadpool->params.parallelize_4d_tile_2d.tile_k;
	const size_t tile_l = threadpool->params.parallelize_4d_tile_2d.tile_l;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t start_k = tile_index_k_l.quotient * tile_k;
	size_t start_l = tile_index_k_l.remainder * tile_l;

	const size_t range_l = threadpool->params.parallelize_4d_tile_2d.range_l;
	const size_t range_k = threadpool->params.parallelize_4d_tile_2d.range_k;
	/* Consume the thread's own items; tile extents passed to the task are
	 * clamped so trailing tiles in k and l may be partial. */
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
		/* Advance (i, j, start_k, start_l) like a mixed-radix counter. */
		start_l += tile_l;
		if (start_l >= range_l) {
			start_l = 0;
			start_k += tile_k;
			if (start_k >= range_k) {
				start_k = 0;
				if (++j == range_j.value) {
					j = 0;
					i += 1;
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	/* Steal from other threads, scanning downwards modulo threads_count. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			/* Steal from the tail (range_end) while the owner works from the head. */
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
			const size_t start_k = tile_index_k_l.quotient * tile_k;
			const size_t start_l = tile_index_k_l.remainder * tile_l;
			task(argument, index_i_j.quotient, index_i_j.remainder, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
730 | |
/*
 * Per-thread worker for pthreadpool_parallelize_4d_tile_2d_with_uarch.
 *
 * Same index arithmetic as thread_parallelize_4d_tile_2d, but additionally
 * resolves a microarchitecture index (via cpuinfo when available) that is
 * passed through to the task, so the task can pick a uarch-specific kernel.
 */
static void thread_parallelize_4d_tile_2d_with_uarch(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	/* Load the user task pointer and its argument (relaxed atomic loads). */
	const pthreadpool_task_4d_tile_2d_with_id_t task = (pthreadpool_task_4d_tile_2d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const uint32_t default_uarch_index = threadpool->params.parallelize_4d_tile_2d_with_uarch.default_uarch_index;
	uint32_t uarch_index = default_uarch_index;
#if PTHREADPOOL_USE_CPUINFO
	/* Query the current core's uarch; fall back to the default if cpuinfo
	 * reports an index above the caller-provided maximum. */
	uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
	if (uarch_index > threadpool->params.parallelize_4d_tile_2d_with_uarch.max_uarch_index) {
		uarch_index = default_uarch_index;
	}
#endif

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Decode the starting linear index into (i, j, start_k, start_l) using the
	 * precomputed fxdiv divisors. */
	const struct fxdiv_divisor_size_t tile_range_kl = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_range_kl;
	const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(range_start, tile_range_kl);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d_tile_2d_with_uarch.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t tile_range_l = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_range_l;
	const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
	const size_t tile_k = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_k;
	const size_t tile_l = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_l;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t start_k = tile_index_k_l.quotient * tile_k;
	size_t start_l = tile_index_k_l.remainder * tile_l;

	const size_t range_l = threadpool->params.parallelize_4d_tile_2d_with_uarch.range_l;
	const size_t range_k = threadpool->params.parallelize_4d_tile_2d_with_uarch.range_k;
	/* Consume the thread's own items; tile extents are clamped at the ends of
	 * the k and l dimensions. */
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, uarch_index, i, j, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
		/* Advance (i, j, start_k, start_l) like a mixed-radix counter. */
		start_l += tile_l;
		if (start_l >= range_l) {
			start_l = 0;
			start_k += tile_k;
			if (start_k >= range_k) {
				start_k = 0;
				if (++j == range_j.value) {
					j = 0;
					i += 1;
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	/* Steal from other threads, scanning downwards modulo threads_count. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			/* Steal from the tail (range_end) while the owner works from the head. */
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
			const size_t start_k = tile_index_k_l.quotient * tile_k;
			const size_t start_l = tile_index_k_l.remainder * tile_l;
			task(argument, uarch_index, index_i_j.quotient, index_i_j.remainder, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
802 | |
/*
 * Per-thread worker for pthreadpool_parallelize_5d.
 *
 * Each linear work item maps to a 5D tuple:
 *   linear = ((i * range_j + j) * range_k + k) * range_lm + (l * range_m + m)
 * The thread drains its own range while advancing (i, j, k, l, m) as a
 * mixed-radix counter, then steals leftover items from other threads.
 */
static void thread_parallelize_5d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	/* Load the user task pointer and its argument (relaxed atomic loads). */
	const pthreadpool_task_5d_t task = (pthreadpool_task_5d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Decode the starting linear index into (i, j, k, l, m) using the
	 * precomputed fxdiv divisors (avoids hardware integer division). */
	const struct fxdiv_divisor_size_t range_lm = threadpool->params.parallelize_5d.range_lm;
	const struct fxdiv_result_size_t index_ijk_lm = fxdiv_divide_size_t(range_start, range_lm);
	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_5d.range_k;
	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lm.quotient, range_k);
	const struct fxdiv_divisor_size_t range_m = threadpool->params.parallelize_5d.range_m;
	const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_ijk_lm.remainder, range_m);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_5d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_ij_k.remainder;
	size_t l = index_l_m.quotient;
	size_t m = index_l_m.remainder;

	const size_t range_l = threadpool->params.parallelize_5d.range_l;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, k, l, m);
		/* Advance (i, j, k, l, m) like a mixed-radix counter, m fastest. */
		if (++m == range_m.value) {
			m = 0;
			if (++l == range_l) {
				l = 0;
				if (++k == range_k.value) {
					k = 0;
					if (++j == range_j.value) {
						j = 0;
						i += 1;
					}
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	/* Steal from other threads, scanning downwards modulo threads_count. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			/* Steal from the tail (range_end) while the owner works from the head;
			 * each stolen linear index is decoded independently. */
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t index_ijk_lm = fxdiv_divide_size_t(linear_index, range_lm);
			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lm.quotient, range_k);
			const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_ijk_lm.remainder, range_m);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
865 | |
/*
 * Per-thread worker for pthreadpool_parallelize_5d_tile_1d.
 *
 * 5D iteration space with the innermost (m) dimension tiled: each linear work
 * item maps to (i, j, k, l, tile-of-m). The thread drains its own range while
 * advancing the tuple incrementally, then steals leftover items from other
 * threads, decoding each stolen linear index from scratch with fxdiv.
 */
static void thread_parallelize_5d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	/* Load the user task pointer and its argument (relaxed atomic loads). */
	const pthreadpool_task_5d_tile_1d_t task = (pthreadpool_task_5d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Decode the starting linear index into (i, j, k, l, start_m) using the
	 * precomputed fxdiv divisors. */
	const struct fxdiv_divisor_size_t tile_range_m = threadpool->params.parallelize_5d_tile_1d.tile_range_m;
	const struct fxdiv_result_size_t tile_index_ijkl_m = fxdiv_divide_size_t(range_start, tile_range_m);
	const struct fxdiv_divisor_size_t range_kl = threadpool->params.parallelize_5d_tile_1d.range_kl;
	const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_m.quotient, range_kl);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_5d_tile_1d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t range_l = threadpool->params.parallelize_5d_tile_1d.range_l;
	const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
	const size_t tile_m = threadpool->params.parallelize_5d_tile_1d.tile_m;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_k_l.quotient;
	size_t l = index_k_l.remainder;
	size_t start_m = tile_index_ijkl_m.remainder * tile_m;

	const size_t range_m = threadpool->params.parallelize_5d_tile_1d.range_m;
	const size_t range_k = threadpool->params.parallelize_5d_tile_1d.range_k;
	/* Consume the thread's own items; the m tile length is clamped so the
	 * last tile of the m dimension may be partial. */
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, k, l, start_m, min(range_m - start_m, tile_m));
		/* Advance (i, j, k, l, start_m) like a mixed-radix counter. */
		start_m += tile_m;
		if (start_m >= range_m) {
			start_m = 0;
			if (++l == range_l.value) {
				l = 0;
				if (++k == range_k) {
					k = 0;
					if (++j == range_j.value) {
						j = 0;
						i += 1;
					}
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	/* Steal from other threads, scanning downwards modulo threads_count. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			/* Steal from the tail (range_end) while the owner works from the head. */
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ijkl_m = fxdiv_divide_size_t(linear_index, tile_range_m);
			const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_m.quotient, range_kl);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
			size_t start_m = tile_index_ijkl_m.remainder * tile_m;
			task(argument, index_i_j.quotient, index_i_j.remainder, index_k_l.quotient, index_k_l.remainder, start_m,
				min(range_m - start_m, tile_m));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
933 | |
/*
 * Per-thread worker for pthreadpool_parallelize_5d_tile_2d.
 *
 * 5D iteration space with the two innermost dimensions (l, m) tiled: each
 * linear work item maps to (i, j, k, tile-of-l, tile-of-m). The thread drains
 * its own range while advancing the tuple incrementally, then steals leftover
 * items from other threads, decoding each stolen index with fxdiv.
 */
static void thread_parallelize_5d_tile_2d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	/* Load the user task pointer and its argument (relaxed atomic loads). */
	const pthreadpool_task_5d_tile_2d_t task = (pthreadpool_task_5d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Decode the starting linear index into (i, j, k, start_l, start_m) using
	 * the precomputed fxdiv divisors. */
	const struct fxdiv_divisor_size_t tile_range_lm = threadpool->params.parallelize_5d_tile_2d.tile_range_lm;
	const struct fxdiv_result_size_t tile_index_ijk_lm = fxdiv_divide_size_t(range_start, tile_range_lm);
	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_5d_tile_2d.range_k;
	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lm.quotient, range_k);
	const struct fxdiv_divisor_size_t tile_range_m = threadpool->params.parallelize_5d_tile_2d.tile_range_m;
	const struct fxdiv_result_size_t tile_index_l_m = fxdiv_divide_size_t(tile_index_ijk_lm.remainder, tile_range_m);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_5d_tile_2d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
	const size_t tile_l = threadpool->params.parallelize_5d_tile_2d.tile_l;
	const size_t tile_m = threadpool->params.parallelize_5d_tile_2d.tile_m;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_ij_k.remainder;
	size_t start_l = tile_index_l_m.quotient * tile_l;
	size_t start_m = tile_index_l_m.remainder * tile_m;

	const size_t range_m = threadpool->params.parallelize_5d_tile_2d.range_m;
	const size_t range_l = threadpool->params.parallelize_5d_tile_2d.range_l;
	/* Consume the thread's own items; tile extents passed to the task are
	 * clamped so trailing tiles in l and m may be partial. */
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, k, start_l, start_m, min(range_l - start_l, tile_l), min(range_m - start_m, tile_m));
		/* Advance (i, j, k, start_l, start_m) like a mixed-radix counter. */
		start_m += tile_m;
		if (start_m >= range_m) {
			start_m = 0;
			start_l += tile_l;
			if (start_l >= range_l) {
				start_l = 0;
				if (++k == range_k.value) {
					k = 0;
					if (++j == range_j.value) {
						j = 0;
						i += 1;
					}
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	/* Steal from other threads, scanning downwards modulo threads_count. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			/* Steal from the tail (range_end) while the owner works from the head. */
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ijk_lm = fxdiv_divide_size_t(linear_index, tile_range_lm);
			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lm.quotient, range_k);
			const struct fxdiv_result_size_t tile_index_l_m = fxdiv_divide_size_t(tile_index_ijk_lm.remainder, tile_range_m);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
			const size_t start_l = tile_index_l_m.quotient * tile_l;
			const size_t start_m = tile_index_l_m.remainder * tile_m;
			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder,
				start_l, start_m, min(range_l - start_l, tile_l), min(range_m - start_m, tile_m));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
1004 | |
/*
 * Per-thread worker for pthreadpool_parallelize_6d.
 *
 * Each linear work item maps to a 6D tuple:
 *   linear = ((i * range_j + j) * range_k + k) * range_lmn
 *            + ((l * range_m + m) * range_n + n)
 * The thread drains its own range while advancing (i, j, k, l, m, n) as a
 * mixed-radix counter, then steals leftover items from other threads.
 */
static void thread_parallelize_6d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	/* Load the user task pointer and its argument (relaxed atomic loads). */
	const pthreadpool_task_6d_t task = (pthreadpool_task_6d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Decode the starting linear index into (i, j, k, l, m, n) using the
	 * precomputed fxdiv divisors (avoids hardware integer division). */
	const struct fxdiv_divisor_size_t range_lmn = threadpool->params.parallelize_6d.range_lmn;
	const struct fxdiv_result_size_t index_ijk_lmn = fxdiv_divide_size_t(range_start, range_lmn);
	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_6d.range_k;
	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lmn.quotient, range_k);
	const struct fxdiv_divisor_size_t range_n = threadpool->params.parallelize_6d.range_n;
	const struct fxdiv_result_size_t index_lm_n = fxdiv_divide_size_t(index_ijk_lmn.remainder, range_n);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_6d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
	const struct fxdiv_divisor_size_t range_m = threadpool->params.parallelize_6d.range_m;
	const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_lm_n.quotient, range_m);
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_ij_k.remainder;
	size_t l = index_l_m.quotient;
	size_t m = index_l_m.remainder;
	size_t n = index_lm_n.remainder;

	const size_t range_l = threadpool->params.parallelize_6d.range_l;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, k, l, m, n);
		/* Advance (i, j, k, l, m, n) like a mixed-radix counter, n fastest. */
		if (++n == range_n.value) {
			n = 0;
			if (++m == range_m.value) {
				m = 0;
				if (++l == range_l) {
					l = 0;
					if (++k == range_k.value) {
						k = 0;
						if (++j == range_j.value) {
							j = 0;
							i += 1;
						}
					}
				}
			}
		}
	}


	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	/* Steal from other threads, scanning downwards modulo threads_count. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			/* Steal from the tail (range_end) while the owner works from the head;
			 * each stolen linear index is decoded independently. */
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t index_ijk_lmn = fxdiv_divide_size_t(linear_index, range_lmn);
			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lmn.quotient, range_k);
			const struct fxdiv_result_size_t index_lm_n = fxdiv_divide_size_t(index_ijk_lmn.remainder, range_n);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
			const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_lm_n.quotient, range_m);
			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder, index_lm_n.remainder);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
1075 | |
/*
 * Per-thread worker for pthreadpool_parallelize_6d_tile_1d.
 *
 * 6D iteration space with the innermost (n) dimension tiled: each linear work
 * item maps to (i, j, k, l, m, tile-of-n). The thread drains its own range
 * while advancing the tuple incrementally, then steals leftover items from
 * other threads, decoding each stolen linear index from scratch with fxdiv.
 */
static void thread_parallelize_6d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	/* Load the user task pointer and its argument (relaxed atomic loads). */
	const pthreadpool_task_6d_tile_1d_t task = (pthreadpool_task_6d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Decode the starting linear index into (i, j, k, l, m, start_n) using the
	 * precomputed fxdiv divisors. */
	const struct fxdiv_divisor_size_t tile_range_lmn = threadpool->params.parallelize_6d_tile_1d.tile_range_lmn;
	const struct fxdiv_result_size_t tile_index_ijk_lmn = fxdiv_divide_size_t(range_start, tile_range_lmn);
	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_6d_tile_1d.range_k;
	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lmn.quotient, range_k);
	const struct fxdiv_divisor_size_t tile_range_n = threadpool->params.parallelize_6d_tile_1d.tile_range_n;
	const struct fxdiv_result_size_t tile_index_lm_n = fxdiv_divide_size_t(tile_index_ijk_lmn.remainder, tile_range_n);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_6d_tile_1d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
	const struct fxdiv_divisor_size_t range_m = threadpool->params.parallelize_6d_tile_1d.range_m;
	const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(tile_index_lm_n.quotient, range_m);
	const size_t tile_n = threadpool->params.parallelize_6d_tile_1d.tile_n;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_ij_k.remainder;
	size_t l = index_l_m.quotient;
	size_t m = index_l_m.remainder;
	size_t start_n = tile_index_lm_n.remainder * tile_n;

	const size_t range_n = threadpool->params.parallelize_6d_tile_1d.range_n;
	const size_t range_l = threadpool->params.parallelize_6d_tile_1d.range_l;
	/* Consume the thread's own items; the n tile length is clamped so the
	 * last tile of the n dimension may be partial. */
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, k, l, m, start_n, min(range_n - start_n, tile_n));
		/* Advance (i, j, k, l, m, start_n) like a mixed-radix counter. */
		start_n += tile_n;
		if (start_n >= range_n) {
			start_n = 0;
			if (++m == range_m.value) {
				m = 0;
				if (++l == range_l) {
					l = 0;
					if (++k == range_k.value) {
						k = 0;
						if (++j == range_j.value) {
							j = 0;
							i += 1;
						}
					}
				}
			}
		}
	}


	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	/* Steal from other threads, scanning downwards modulo threads_count. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			/* Steal from the tail (range_end) while the owner works from the head. */
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ijk_lmn = fxdiv_divide_size_t(linear_index, tile_range_lmn);
			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lmn.quotient, range_k);
			const struct fxdiv_result_size_t tile_index_lm_n = fxdiv_divide_size_t(tile_index_ijk_lmn.remainder, tile_range_n);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
			const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(tile_index_lm_n.quotient, range_m);
			const size_t start_n = tile_index_lm_n.remainder * tile_n;
			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder,
				start_n, min(range_n - start_n, tile_n));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
1151 | |
1152 | static void thread_parallelize_6d_tile_2d(struct pthreadpool* threadpool, struct thread_info* thread) { |
1153 | assert(threadpool != NULL); |
1154 | assert(thread != NULL); |
1155 | |
1156 | const pthreadpool_task_6d_tile_2d_t task = (pthreadpool_task_6d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task); |
1157 | void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); |
1158 | |
1159 | /* Process thread's own range of items */ |
1160 | const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); |
1161 | const struct fxdiv_divisor_size_t tile_range_mn = threadpool->params.parallelize_6d_tile_2d.tile_range_mn; |
1162 | const struct fxdiv_result_size_t tile_index_ijkl_mn = fxdiv_divide_size_t(range_start, tile_range_mn); |
1163 | const struct fxdiv_divisor_size_t range_kl = threadpool->params.parallelize_6d_tile_2d.range_kl; |
1164 | const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_mn.quotient, range_kl); |
1165 | const struct fxdiv_divisor_size_t tile_range_n = threadpool->params.parallelize_6d_tile_2d.tile_range_n; |
1166 | const struct fxdiv_result_size_t tile_index_m_n = fxdiv_divide_size_t(tile_index_ijkl_mn.remainder, tile_range_n); |
1167 | const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_6d_tile_2d.range_j; |
1168 | const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j); |
1169 | const struct fxdiv_divisor_size_t range_l = threadpool->params.parallelize_6d_tile_2d.range_l; |
1170 | const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l); |
1171 | const size_t tile_m = threadpool->params.parallelize_6d_tile_2d.tile_m; |
1172 | const size_t tile_n = threadpool->params.parallelize_6d_tile_2d.tile_n; |
1173 | size_t i = index_i_j.quotient; |
1174 | size_t j = index_i_j.remainder; |
1175 | size_t k = index_k_l.quotient; |
1176 | size_t l = index_k_l.remainder; |
1177 | size_t start_m = tile_index_m_n.quotient * tile_m; |
1178 | size_t start_n = tile_index_m_n.remainder * tile_n; |
1179 | |
1180 | const size_t range_n = threadpool->params.parallelize_6d_tile_2d.range_n; |
1181 | const size_t range_m = threadpool->params.parallelize_6d_tile_2d.range_m; |
1182 | const size_t range_k = threadpool->params.parallelize_6d_tile_2d.range_k; |
1183 | while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { |
1184 | task(argument, i, j, k, l, start_m, start_n, min(range_m - start_m, tile_m), min(range_n - start_n, tile_n)); |
1185 | start_n += tile_n; |
1186 | if (start_n >= range_n) { |
1187 | start_n = 0; |
1188 | start_m += tile_m; |
1189 | if (start_m >= range_m) { |
1190 | start_m = 0; |
1191 | if (++l == range_l.value) { |
1192 | l = 0; |
1193 | if (++k == range_k) { |
1194 | k = 0; |
1195 | if (++j == range_j.value) { |
1196 | j = 0; |
1197 | i += 1; |
1198 | } |
1199 | } |
1200 | } |
1201 | } |
1202 | } |
1203 | } |
1204 | |
1205 | /* There still may be other threads with work */ |
1206 | const size_t thread_number = thread->thread_number; |
1207 | const size_t threads_count = threadpool->threads_count.value; |
1208 | for (size_t tid = modulo_decrement(thread_number, threads_count); |
1209 | tid != thread_number; |
1210 | tid = modulo_decrement(tid, threads_count)) |
1211 | { |
1212 | struct thread_info* other_thread = &threadpool->threads[tid]; |
1213 | while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { |
1214 | const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); |
1215 | const struct fxdiv_result_size_t tile_index_ijkl_mn = fxdiv_divide_size_t(linear_index, tile_range_mn); |
1216 | const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_mn.quotient, range_kl); |
1217 | const struct fxdiv_result_size_t tile_index_m_n = fxdiv_divide_size_t(tile_index_ijkl_mn.remainder, tile_range_n); |
1218 | const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j); |
1219 | const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l); |
1220 | const size_t start_m = tile_index_m_n.quotient * tile_m; |
1221 | const size_t start_n = tile_index_m_n.remainder * tile_n; |
1222 | task(argument, index_i_j.quotient, index_i_j.remainder, index_k_l.quotient, index_k_l.remainder, |
1223 | start_m, start_n, min(range_m - start_m, tile_m), min(range_n - start_n, tile_n)); |
1224 | } |
1225 | } |
1226 | |
1227 | /* Make changes by this thread visible to other threads */ |
1228 | pthreadpool_fence_release(); |
1229 | } |
1230 | |
1231 | void pthreadpool_parallelize_1d( |
1232 | struct pthreadpool* threadpool, |
1233 | pthreadpool_task_1d_t task, |
1234 | void* argument, |
1235 | size_t range, |
1236 | uint32_t flags) |
1237 | { |
1238 | size_t threads_count; |
1239 | if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || range <= 1) { |
1240 | /* No thread pool used: execute task sequentially on the calling thread */ |
1241 | struct fpu_state saved_fpu_state = { 0 }; |
1242 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
1243 | saved_fpu_state = get_fpu_state(); |
1244 | disable_fpu_denormals(); |
1245 | } |
1246 | for (size_t i = 0; i < range; i++) { |
1247 | task(argument, i); |
1248 | } |
1249 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
1250 | set_fpu_state(saved_fpu_state); |
1251 | } |
1252 | } else { |
1253 | thread_function_t parallelize_1d = &thread_parallelize_1d; |
1254 | #if PTHREADPOOL_USE_FASTPATH |
1255 | const size_t range_threshold = -threads_count; |
1256 | if (range < range_threshold) { |
1257 | parallelize_1d = &pthreadpool_thread_parallelize_1d_fastpath; |
1258 | } |
1259 | #endif |
1260 | pthreadpool_parallelize( |
1261 | threadpool, parallelize_1d, NULL, 0, |
1262 | (void*) task, argument, range, flags); |
1263 | } |
1264 | } |
1265 | |
1266 | void pthreadpool_parallelize_1d_with_uarch( |
1267 | pthreadpool_t threadpool, |
1268 | pthreadpool_task_1d_with_id_t task, |
1269 | void* argument, |
1270 | uint32_t default_uarch_index, |
1271 | uint32_t max_uarch_index, |
1272 | size_t range, |
1273 | uint32_t flags) |
1274 | { |
1275 | size_t threads_count; |
1276 | if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || range <= 1) { |
1277 | /* No thread pool used: execute task sequentially on the calling thread */ |
1278 | |
1279 | uint32_t uarch_index = default_uarch_index; |
1280 | #if PTHREADPOOL_USE_CPUINFO |
1281 | uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index); |
1282 | if (uarch_index > max_uarch_index) { |
1283 | uarch_index = default_uarch_index; |
1284 | } |
1285 | #endif |
1286 | |
1287 | struct fpu_state saved_fpu_state = { 0 }; |
1288 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
1289 | saved_fpu_state = get_fpu_state(); |
1290 | disable_fpu_denormals(); |
1291 | } |
1292 | for (size_t i = 0; i < range; i++) { |
1293 | task(argument, uarch_index, i); |
1294 | } |
1295 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
1296 | set_fpu_state(saved_fpu_state); |
1297 | } |
1298 | } else { |
1299 | const struct pthreadpool_1d_with_uarch_params params = { |
1300 | .default_uarch_index = default_uarch_index, |
1301 | .max_uarch_index = max_uarch_index, |
1302 | }; |
1303 | thread_function_t parallelize_1d_with_uarch = &thread_parallelize_1d_with_uarch; |
1304 | #if PTHREADPOOL_USE_FASTPATH |
1305 | const size_t range_threshold = -threads_count; |
1306 | if (range < range_threshold) { |
1307 | parallelize_1d_with_uarch = &pthreadpool_thread_parallelize_1d_with_uarch_fastpath; |
1308 | } |
1309 | #endif |
1310 | pthreadpool_parallelize( |
1311 | threadpool, parallelize_1d_with_uarch, ¶ms, sizeof(params), |
1312 | task, argument, range, flags); |
1313 | } |
1314 | } |
1315 | |
1316 | void pthreadpool_parallelize_1d_tile_1d( |
1317 | pthreadpool_t threadpool, |
1318 | pthreadpool_task_1d_tile_1d_t task, |
1319 | void* argument, |
1320 | size_t range, |
1321 | size_t tile, |
1322 | uint32_t flags) |
1323 | { |
1324 | size_t threads_count; |
1325 | if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || range <= tile) { |
1326 | /* No thread pool used: execute task sequentially on the calling thread */ |
1327 | struct fpu_state saved_fpu_state = { 0 }; |
1328 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
1329 | saved_fpu_state = get_fpu_state(); |
1330 | disable_fpu_denormals(); |
1331 | } |
1332 | for (size_t i = 0; i < range; i += tile) { |
1333 | task(argument, i, min(range - i, tile)); |
1334 | } |
1335 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
1336 | set_fpu_state(saved_fpu_state); |
1337 | } |
1338 | } else { |
1339 | const size_t tile_range = divide_round_up(range, tile); |
1340 | const struct pthreadpool_1d_tile_1d_params params = { |
1341 | .range = range, |
1342 | .tile = tile, |
1343 | }; |
1344 | thread_function_t parallelize_1d_tile_1d = &thread_parallelize_1d_tile_1d; |
1345 | #if PTHREADPOOL_USE_FASTPATH |
1346 | const size_t range_threshold = -threads_count; |
1347 | if (range < range_threshold) { |
1348 | parallelize_1d_tile_1d = &pthreadpool_thread_parallelize_1d_tile_1d_fastpath; |
1349 | } |
1350 | #endif |
1351 | pthreadpool_parallelize( |
1352 | threadpool, parallelize_1d_tile_1d, ¶ms, sizeof(params), |
1353 | task, argument, tile_range, flags); |
1354 | } |
1355 | } |
1356 | |
1357 | void pthreadpool_parallelize_2d( |
1358 | pthreadpool_t threadpool, |
1359 | pthreadpool_task_2d_t task, |
1360 | void* argument, |
1361 | size_t range_i, |
1362 | size_t range_j, |
1363 | uint32_t flags) |
1364 | { |
1365 | size_t threads_count; |
1366 | if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i | range_j) <= 1) { |
1367 | /* No thread pool used: execute task sequentially on the calling thread */ |
1368 | struct fpu_state saved_fpu_state = { 0 }; |
1369 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
1370 | saved_fpu_state = get_fpu_state(); |
1371 | disable_fpu_denormals(); |
1372 | } |
1373 | for (size_t i = 0; i < range_i; i++) { |
1374 | for (size_t j = 0; j < range_j; j++) { |
1375 | task(argument, i, j); |
1376 | } |
1377 | } |
1378 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
1379 | set_fpu_state(saved_fpu_state); |
1380 | } |
1381 | } else { |
1382 | const size_t range = range_i * range_j; |
1383 | const struct pthreadpool_2d_params params = { |
1384 | .range_j = fxdiv_init_size_t(range_j), |
1385 | }; |
1386 | thread_function_t parallelize_2d = &thread_parallelize_2d; |
1387 | #if PTHREADPOOL_USE_FASTPATH |
1388 | const size_t range_threshold = -threads_count; |
1389 | if (range < range_threshold) { |
1390 | parallelize_2d = &pthreadpool_thread_parallelize_2d_fastpath; |
1391 | } |
1392 | #endif |
1393 | pthreadpool_parallelize( |
1394 | threadpool, parallelize_2d, ¶ms, sizeof(params), |
1395 | task, argument, range, flags); |
1396 | } |
1397 | } |
1398 | |
1399 | void pthreadpool_parallelize_2d_tile_1d( |
1400 | pthreadpool_t threadpool, |
1401 | pthreadpool_task_2d_tile_1d_t task, |
1402 | void* argument, |
1403 | size_t range_i, |
1404 | size_t range_j, |
1405 | size_t tile_j, |
1406 | uint32_t flags) |
1407 | { |
1408 | size_t threads_count; |
1409 | if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i <= 1 && range_j <= tile_j)) { |
1410 | /* No thread pool used: execute task sequentially on the calling thread */ |
1411 | struct fpu_state saved_fpu_state = { 0 }; |
1412 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
1413 | saved_fpu_state = get_fpu_state(); |
1414 | disable_fpu_denormals(); |
1415 | } |
1416 | for (size_t i = 0; i < range_i; i++) { |
1417 | for (size_t j = 0; j < range_j; j += tile_j) { |
1418 | task(argument, i, j, min(range_j - j, tile_j)); |
1419 | } |
1420 | } |
1421 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
1422 | set_fpu_state(saved_fpu_state); |
1423 | } |
1424 | } else { |
1425 | const size_t tile_range_j = divide_round_up(range_j, tile_j); |
1426 | const size_t tile_range = range_i * tile_range_j; |
1427 | const struct pthreadpool_2d_tile_1d_params params = { |
1428 | .range_j = range_j, |
1429 | .tile_j = tile_j, |
1430 | .tile_range_j = fxdiv_init_size_t(tile_range_j), |
1431 | }; |
1432 | thread_function_t parallelize_2d_tile_1d = &thread_parallelize_2d_tile_1d; |
1433 | #if PTHREADPOOL_USE_FASTPATH |
1434 | const size_t range_threshold = -threads_count; |
1435 | if (tile_range < range_threshold) { |
1436 | parallelize_2d_tile_1d = &pthreadpool_thread_parallelize_2d_tile_1d_fastpath; |
1437 | } |
1438 | #endif |
1439 | pthreadpool_parallelize( |
1440 | threadpool, parallelize_2d_tile_1d, ¶ms, sizeof(params), |
1441 | task, argument, tile_range, flags); |
1442 | } |
1443 | } |
1444 | |
1445 | void pthreadpool_parallelize_2d_tile_2d( |
1446 | pthreadpool_t threadpool, |
1447 | pthreadpool_task_2d_tile_2d_t task, |
1448 | void* argument, |
1449 | size_t range_i, |
1450 | size_t range_j, |
1451 | size_t tile_i, |
1452 | size_t tile_j, |
1453 | uint32_t flags) |
1454 | { |
1455 | size_t threads_count; |
1456 | if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i <= tile_i && range_j <= tile_j)) { |
1457 | /* No thread pool used: execute task sequentially on the calling thread */ |
1458 | struct fpu_state saved_fpu_state = { 0 }; |
1459 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
1460 | saved_fpu_state = get_fpu_state(); |
1461 | disable_fpu_denormals(); |
1462 | } |
1463 | for (size_t i = 0; i < range_i; i += tile_i) { |
1464 | for (size_t j = 0; j < range_j; j += tile_j) { |
1465 | task(argument, i, j, min(range_i - i, tile_i), min(range_j - j, tile_j)); |
1466 | } |
1467 | } |
1468 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
1469 | set_fpu_state(saved_fpu_state); |
1470 | } |
1471 | } else { |
1472 | const size_t tile_range_i = divide_round_up(range_i, tile_i); |
1473 | const size_t tile_range_j = divide_round_up(range_j, tile_j); |
1474 | const size_t tile_range = tile_range_i * tile_range_j; |
1475 | const struct pthreadpool_2d_tile_2d_params params = { |
1476 | .range_i = range_i, |
1477 | .tile_i = tile_i, |
1478 | .range_j = range_j, |
1479 | .tile_j = tile_j, |
1480 | .tile_range_j = fxdiv_init_size_t(tile_range_j), |
1481 | }; |
1482 | thread_function_t parallelize_2d_tile_2d = &thread_parallelize_2d_tile_2d; |
1483 | #if PTHREADPOOL_USE_FASTPATH |
1484 | const size_t range_threshold = -threads_count; |
1485 | if (tile_range < range_threshold) { |
1486 | parallelize_2d_tile_2d = &pthreadpool_thread_parallelize_2d_tile_2d_fastpath; |
1487 | } |
1488 | #endif |
1489 | pthreadpool_parallelize( |
1490 | threadpool, parallelize_2d_tile_2d, ¶ms, sizeof(params), |
1491 | task, argument, tile_range, flags); |
1492 | } |
1493 | } |
1494 | |
1495 | void pthreadpool_parallelize_2d_tile_2d_with_uarch( |
1496 | pthreadpool_t threadpool, |
1497 | pthreadpool_task_2d_tile_2d_with_id_t task, |
1498 | void* argument, |
1499 | uint32_t default_uarch_index, |
1500 | uint32_t max_uarch_index, |
1501 | size_t range_i, |
1502 | size_t range_j, |
1503 | size_t tile_i, |
1504 | size_t tile_j, |
1505 | uint32_t flags) |
1506 | { |
1507 | size_t threads_count; |
1508 | if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i <= tile_i && range_j <= tile_j)) { |
1509 | /* No thread pool used: execute task sequentially on the calling thread */ |
1510 | |
1511 | uint32_t uarch_index = default_uarch_index; |
1512 | #if PTHREADPOOL_USE_CPUINFO |
1513 | uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index); |
1514 | if (uarch_index > max_uarch_index) { |
1515 | uarch_index = default_uarch_index; |
1516 | } |
1517 | #endif |
1518 | |
1519 | struct fpu_state saved_fpu_state = { 0 }; |
1520 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
1521 | saved_fpu_state = get_fpu_state(); |
1522 | disable_fpu_denormals(); |
1523 | } |
1524 | for (size_t i = 0; i < range_i; i += tile_i) { |
1525 | for (size_t j = 0; j < range_j; j += tile_j) { |
1526 | task(argument, uarch_index, i, j, min(range_i - i, tile_i), min(range_j - j, tile_j)); |
1527 | } |
1528 | } |
1529 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
1530 | set_fpu_state(saved_fpu_state); |
1531 | } |
1532 | } else { |
1533 | const size_t tile_range_i = divide_round_up(range_i, tile_i); |
1534 | const size_t tile_range_j = divide_round_up(range_j, tile_j); |
1535 | const size_t tile_range = tile_range_i * tile_range_j; |
1536 | const struct pthreadpool_2d_tile_2d_with_uarch_params params = { |
1537 | .default_uarch_index = default_uarch_index, |
1538 | .max_uarch_index = max_uarch_index, |
1539 | .range_i = range_i, |
1540 | .tile_i = tile_i, |
1541 | .range_j = range_j, |
1542 | .tile_j = tile_j, |
1543 | .tile_range_j = fxdiv_init_size_t(tile_range_j), |
1544 | }; |
1545 | thread_function_t parallelize_2d_tile_2d_with_uarch = &thread_parallelize_2d_tile_2d_with_uarch; |
1546 | #if PTHREADPOOL_USE_FASTPATH |
1547 | const size_t range_threshold = -threads_count; |
1548 | if (tile_range < range_threshold) { |
1549 | parallelize_2d_tile_2d_with_uarch = &pthreadpool_thread_parallelize_2d_tile_2d_with_uarch_fastpath; |
1550 | } |
1551 | #endif |
1552 | pthreadpool_parallelize( |
1553 | threadpool, parallelize_2d_tile_2d_with_uarch, ¶ms, sizeof(params), |
1554 | task, argument, tile_range, flags); |
1555 | } |
1556 | } |
1557 | |
1558 | void pthreadpool_parallelize_3d( |
1559 | pthreadpool_t threadpool, |
1560 | pthreadpool_task_3d_t task, |
1561 | void* argument, |
1562 | size_t range_i, |
1563 | size_t range_j, |
1564 | size_t range_k, |
1565 | uint32_t flags) |
1566 | { |
1567 | size_t threads_count; |
1568 | if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i | range_j | range_k) <= 1) { |
1569 | /* No thread pool used: execute task sequentially on the calling thread */ |
1570 | struct fpu_state saved_fpu_state = { 0 }; |
1571 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
1572 | saved_fpu_state = get_fpu_state(); |
1573 | disable_fpu_denormals(); |
1574 | } |
1575 | for (size_t i = 0; i < range_i; i++) { |
1576 | for (size_t j = 0; j < range_j; j++) { |
1577 | for (size_t k = 0; k < range_k; k++) { |
1578 | task(argument, i, j, k); |
1579 | } |
1580 | } |
1581 | } |
1582 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
1583 | set_fpu_state(saved_fpu_state); |
1584 | } |
1585 | } else { |
1586 | const size_t range = range_i * range_j * range_k; |
1587 | const struct pthreadpool_3d_params params = { |
1588 | .range_j = fxdiv_init_size_t(range_j), |
1589 | .range_k = fxdiv_init_size_t(range_k), |
1590 | }; |
1591 | thread_function_t parallelize_3d = &thread_parallelize_3d; |
1592 | #if PTHREADPOOL_USE_FASTPATH |
1593 | const size_t range_threshold = -threads_count; |
1594 | if (range < range_threshold) { |
1595 | parallelize_3d = &pthreadpool_thread_parallelize_3d_fastpath; |
1596 | } |
1597 | #endif |
1598 | pthreadpool_parallelize( |
1599 | threadpool, parallelize_3d, ¶ms, sizeof(params), |
1600 | task, argument, range, flags); |
1601 | } |
1602 | } |
1603 | |
1604 | void pthreadpool_parallelize_3d_tile_1d( |
1605 | pthreadpool_t threadpool, |
1606 | pthreadpool_task_3d_tile_1d_t task, |
1607 | void* argument, |
1608 | size_t range_i, |
1609 | size_t range_j, |
1610 | size_t range_k, |
1611 | size_t tile_k, |
1612 | uint32_t flags) |
1613 | { |
1614 | size_t threads_count; |
1615 | if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j) <= 1 && range_k <= tile_k)) { |
1616 | /* No thread pool used: execute task sequentially on the calling thread */ |
1617 | struct fpu_state saved_fpu_state = { 0 }; |
1618 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
1619 | saved_fpu_state = get_fpu_state(); |
1620 | disable_fpu_denormals(); |
1621 | } |
1622 | for (size_t i = 0; i < range_i; i++) { |
1623 | for (size_t j = 0; j < range_j; j++) { |
1624 | for (size_t k = 0; k < range_k; k += tile_k) { |
1625 | task(argument, i, j, k, min(range_k - k, tile_k)); |
1626 | } |
1627 | } |
1628 | } |
1629 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
1630 | set_fpu_state(saved_fpu_state); |
1631 | } |
1632 | } else { |
1633 | const size_t tile_range_k = divide_round_up(range_k, tile_k); |
1634 | const size_t tile_range = range_i * range_j * tile_range_k; |
1635 | const struct pthreadpool_3d_tile_1d_params params = { |
1636 | .range_k = range_k, |
1637 | .tile_k = tile_k, |
1638 | .range_j = fxdiv_init_size_t(range_j), |
1639 | .tile_range_k = fxdiv_init_size_t(tile_range_k), |
1640 | }; |
1641 | thread_function_t parallelize_3d_tile_1d = &thread_parallelize_3d_tile_1d; |
1642 | #if PTHREADPOOL_USE_FASTPATH |
1643 | const size_t range_threshold = -threads_count; |
1644 | if (tile_range < range_threshold) { |
1645 | parallelize_3d_tile_1d = &pthreadpool_thread_parallelize_3d_tile_1d_fastpath; |
1646 | } |
1647 | #endif |
1648 | pthreadpool_parallelize( |
1649 | threadpool, parallelize_3d_tile_1d, ¶ms, sizeof(params), |
1650 | task, argument, tile_range, flags); |
1651 | } |
1652 | } |
1653 | |
1654 | void pthreadpool_parallelize_3d_tile_2d( |
1655 | pthreadpool_t threadpool, |
1656 | pthreadpool_task_3d_tile_2d_t task, |
1657 | void* argument, |
1658 | size_t range_i, |
1659 | size_t range_j, |
1660 | size_t range_k, |
1661 | size_t tile_j, |
1662 | size_t tile_k, |
1663 | uint32_t flags) |
1664 | { |
1665 | size_t threads_count; |
1666 | if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i <= 1 && range_j <= tile_j && range_k <= tile_k)) { |
1667 | /* No thread pool used: execute task sequentially on the calling thread */ |
1668 | struct fpu_state saved_fpu_state = { 0 }; |
1669 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
1670 | saved_fpu_state = get_fpu_state(); |
1671 | disable_fpu_denormals(); |
1672 | } |
1673 | for (size_t i = 0; i < range_i; i++) { |
1674 | for (size_t j = 0; j < range_j; j += tile_j) { |
1675 | for (size_t k = 0; k < range_k; k += tile_k) { |
1676 | task(argument, i, j, k, min(range_j - j, tile_j), min(range_k - k, tile_k)); |
1677 | } |
1678 | } |
1679 | } |
1680 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
1681 | set_fpu_state(saved_fpu_state); |
1682 | } |
1683 | } else { |
1684 | const size_t tile_range_j = divide_round_up(range_j, tile_j); |
1685 | const size_t tile_range_k = divide_round_up(range_k, tile_k); |
1686 | const size_t tile_range = range_i * tile_range_j * tile_range_k; |
1687 | const struct pthreadpool_3d_tile_2d_params params = { |
1688 | .range_j = range_j, |
1689 | .tile_j = tile_j, |
1690 | .range_k = range_k, |
1691 | .tile_k = tile_k, |
1692 | .tile_range_j = fxdiv_init_size_t(tile_range_j), |
1693 | .tile_range_k = fxdiv_init_size_t(tile_range_k), |
1694 | }; |
1695 | thread_function_t parallelize_3d_tile_2d = &thread_parallelize_3d_tile_2d; |
1696 | #if PTHREADPOOL_USE_FASTPATH |
1697 | const size_t range_threshold = -threads_count; |
1698 | if (tile_range < range_threshold) { |
1699 | parallelize_3d_tile_2d = &pthreadpool_thread_parallelize_3d_tile_2d_fastpath; |
1700 | } |
1701 | #endif |
1702 | pthreadpool_parallelize( |
1703 | threadpool, parallelize_3d_tile_2d, ¶ms, sizeof(params), |
1704 | task, argument, tile_range, flags); |
1705 | } |
1706 | } |
1707 | |
1708 | void pthreadpool_parallelize_3d_tile_2d_with_uarch( |
1709 | pthreadpool_t threadpool, |
1710 | pthreadpool_task_3d_tile_2d_with_id_t task, |
1711 | void* argument, |
1712 | uint32_t default_uarch_index, |
1713 | uint32_t max_uarch_index, |
1714 | size_t range_i, |
1715 | size_t range_j, |
1716 | size_t range_k, |
1717 | size_t tile_j, |
1718 | size_t tile_k, |
1719 | uint32_t flags) |
1720 | { |
1721 | size_t threads_count; |
1722 | if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i <= 1 && range_j <= tile_j && range_k <= tile_k)) { |
1723 | /* No thread pool used: execute task sequentially on the calling thread */ |
1724 | |
1725 | uint32_t uarch_index = default_uarch_index; |
1726 | #if PTHREADPOOL_USE_CPUINFO |
1727 | uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index); |
1728 | if (uarch_index > max_uarch_index) { |
1729 | uarch_index = default_uarch_index; |
1730 | } |
1731 | #endif |
1732 | |
1733 | struct fpu_state saved_fpu_state = { 0 }; |
1734 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
1735 | saved_fpu_state = get_fpu_state(); |
1736 | disable_fpu_denormals(); |
1737 | } |
1738 | for (size_t i = 0; i < range_i; i++) { |
1739 | for (size_t j = 0; j < range_j; j += tile_j) { |
1740 | for (size_t k = 0; k < range_k; k += tile_k) { |
1741 | task(argument, uarch_index, i, j, k, min(range_j - j, tile_j), min(range_k - k, tile_k)); |
1742 | } |
1743 | } |
1744 | } |
1745 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
1746 | set_fpu_state(saved_fpu_state); |
1747 | } |
1748 | } else { |
1749 | const size_t tile_range_j = divide_round_up(range_j, tile_j); |
1750 | const size_t tile_range_k = divide_round_up(range_k, tile_k); |
1751 | const size_t tile_range = range_i * tile_range_j * tile_range_k; |
1752 | const struct pthreadpool_3d_tile_2d_with_uarch_params params = { |
1753 | .default_uarch_index = default_uarch_index, |
1754 | .max_uarch_index = max_uarch_index, |
1755 | .range_j = range_j, |
1756 | .tile_j = tile_j, |
1757 | .range_k = range_k, |
1758 | .tile_k = tile_k, |
1759 | .tile_range_j = fxdiv_init_size_t(tile_range_j), |
1760 | .tile_range_k = fxdiv_init_size_t(tile_range_k), |
1761 | }; |
1762 | thread_function_t parallelize_3d_tile_2d_with_uarch = &thread_parallelize_3d_tile_2d_with_uarch; |
1763 | #if PTHREADPOOL_USE_FASTPATH |
1764 | const size_t range_threshold = -threads_count; |
1765 | if (tile_range < range_threshold) { |
1766 | parallelize_3d_tile_2d_with_uarch = &pthreadpool_thread_parallelize_3d_tile_2d_with_uarch_fastpath; |
1767 | } |
1768 | #endif |
1769 | pthreadpool_parallelize( |
1770 | threadpool, parallelize_3d_tile_2d_with_uarch, ¶ms, sizeof(params), |
1771 | task, argument, tile_range, flags); |
1772 | } |
1773 | } |
1774 | |
1775 | void pthreadpool_parallelize_4d( |
1776 | pthreadpool_t threadpool, |
1777 | pthreadpool_task_4d_t task, |
1778 | void* argument, |
1779 | size_t range_i, |
1780 | size_t range_j, |
1781 | size_t range_k, |
1782 | size_t range_l, |
1783 | uint32_t flags) |
1784 | { |
1785 | size_t threads_count; |
1786 | if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i | range_j | range_k | range_l) <= 1) { |
1787 | /* No thread pool used: execute task sequentially on the calling thread */ |
1788 | struct fpu_state saved_fpu_state = { 0 }; |
1789 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
1790 | saved_fpu_state = get_fpu_state(); |
1791 | disable_fpu_denormals(); |
1792 | } |
1793 | for (size_t i = 0; i < range_i; i++) { |
1794 | for (size_t j = 0; j < range_j; j++) { |
1795 | for (size_t k = 0; k < range_k; k++) { |
1796 | for (size_t l = 0; l < range_l; l++) { |
1797 | task(argument, i, j, k, l); |
1798 | } |
1799 | } |
1800 | } |
1801 | } |
1802 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
1803 | set_fpu_state(saved_fpu_state); |
1804 | } |
1805 | } else { |
1806 | const size_t range_kl = range_k * range_l; |
1807 | const size_t range = range_i * range_j * range_kl; |
1808 | const struct pthreadpool_4d_params params = { |
1809 | .range_k = range_k, |
1810 | .range_j = fxdiv_init_size_t(range_j), |
1811 | .range_kl = fxdiv_init_size_t(range_kl), |
1812 | .range_l = fxdiv_init_size_t(range_l), |
1813 | }; |
1814 | thread_function_t parallelize_4d = &thread_parallelize_4d; |
1815 | #if PTHREADPOOL_USE_FASTPATH |
1816 | const size_t range_threshold = -threads_count; |
1817 | if (range < range_threshold) { |
1818 | parallelize_4d = &pthreadpool_thread_parallelize_4d_fastpath; |
1819 | } |
1820 | #endif |
1821 | pthreadpool_parallelize( |
1822 | threadpool, parallelize_4d, ¶ms, sizeof(params), |
1823 | task, argument, range, flags); |
1824 | } |
1825 | } |
1826 | |
1827 | void pthreadpool_parallelize_4d_tile_1d( |
1828 | pthreadpool_t threadpool, |
1829 | pthreadpool_task_4d_tile_1d_t task, |
1830 | void* argument, |
1831 | size_t range_i, |
1832 | size_t range_j, |
1833 | size_t range_k, |
1834 | size_t range_l, |
1835 | size_t tile_l, |
1836 | uint32_t flags) |
1837 | { |
1838 | size_t threads_count; |
1839 | if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j | range_k) <= 1 && range_l <= tile_l)) { |
1840 | /* No thread pool used: execute task sequentially on the calling thread */ |
1841 | struct fpu_state saved_fpu_state = { 0 }; |
1842 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
1843 | saved_fpu_state = get_fpu_state(); |
1844 | disable_fpu_denormals(); |
1845 | } |
1846 | for (size_t i = 0; i < range_i; i++) { |
1847 | for (size_t j = 0; j < range_j; j++) { |
1848 | for (size_t k = 0; k < range_k; k++) { |
1849 | for (size_t l = 0; l < range_l; l += tile_l) { |
1850 | task(argument, i, j, k, l, min(range_l - l, tile_l)); |
1851 | } |
1852 | } |
1853 | } |
1854 | } |
1855 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
1856 | set_fpu_state(saved_fpu_state); |
1857 | } |
1858 | } else { |
1859 | const size_t tile_range_l = divide_round_up(range_l, tile_l); |
1860 | const size_t tile_range_kl = range_k * tile_range_l; |
1861 | const size_t tile_range = range_i * range_j * tile_range_kl; |
1862 | const struct pthreadpool_4d_tile_1d_params params = { |
1863 | .range_k = range_k, |
1864 | .range_l = range_l, |
1865 | .tile_l = tile_l, |
1866 | .range_j = fxdiv_init_size_t(range_j), |
1867 | .tile_range_kl = fxdiv_init_size_t(tile_range_kl), |
1868 | .tile_range_l = fxdiv_init_size_t(tile_range_l), |
1869 | }; |
1870 | thread_function_t parallelize_4d_tile_1d = &thread_parallelize_4d_tile_1d; |
1871 | #if PTHREADPOOL_USE_FASTPATH |
1872 | const size_t range_threshold = -threads_count; |
1873 | if (tile_range < range_threshold) { |
1874 | parallelize_4d_tile_1d = &pthreadpool_thread_parallelize_4d_tile_1d_fastpath; |
1875 | } |
1876 | #endif |
1877 | pthreadpool_parallelize( |
1878 | threadpool, parallelize_4d_tile_1d, ¶ms, sizeof(params), |
1879 | task, argument, tile_range, flags); |
1880 | } |
1881 | } |
1882 | |
1883 | void pthreadpool_parallelize_4d_tile_2d( |
1884 | pthreadpool_t threadpool, |
1885 | pthreadpool_task_4d_tile_2d_t task, |
1886 | void* argument, |
1887 | size_t range_i, |
1888 | size_t range_j, |
1889 | size_t range_k, |
1890 | size_t range_l, |
1891 | size_t tile_k, |
1892 | size_t tile_l, |
1893 | uint32_t flags) |
1894 | { |
1895 | size_t threads_count; |
1896 | if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j) <= 1 && range_k <= tile_k && range_l <= tile_l)) { |
1897 | /* No thread pool used: execute task sequentially on the calling thread */ |
1898 | struct fpu_state saved_fpu_state = { 0 }; |
1899 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
1900 | saved_fpu_state = get_fpu_state(); |
1901 | disable_fpu_denormals(); |
1902 | } |
1903 | for (size_t i = 0; i < range_i; i++) { |
1904 | for (size_t j = 0; j < range_j; j++) { |
1905 | for (size_t k = 0; k < range_k; k += tile_k) { |
1906 | for (size_t l = 0; l < range_l; l += tile_l) { |
1907 | task(argument, i, j, k, l, |
1908 | min(range_k - k, tile_k), min(range_l - l, tile_l)); |
1909 | } |
1910 | } |
1911 | } |
1912 | } |
1913 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
1914 | set_fpu_state(saved_fpu_state); |
1915 | } |
1916 | } else { |
1917 | const size_t tile_range_l = divide_round_up(range_l, tile_l); |
1918 | const size_t tile_range_kl = divide_round_up(range_k, tile_k) * tile_range_l; |
1919 | const size_t tile_range = range_i * range_j * tile_range_kl; |
1920 | const struct pthreadpool_4d_tile_2d_params params = { |
1921 | .range_k = range_k, |
1922 | .tile_k = tile_k, |
1923 | .range_l = range_l, |
1924 | .tile_l = tile_l, |
1925 | .range_j = fxdiv_init_size_t(range_j), |
1926 | .tile_range_kl = fxdiv_init_size_t(tile_range_kl), |
1927 | .tile_range_l = fxdiv_init_size_t(tile_range_l), |
1928 | }; |
1929 | thread_function_t parallelize_4d_tile_2d = &thread_parallelize_4d_tile_2d; |
1930 | #if PTHREADPOOL_USE_FASTPATH |
1931 | const size_t range_threshold = -threads_count; |
1932 | if (tile_range < range_threshold) { |
1933 | parallelize_4d_tile_2d = &pthreadpool_thread_parallelize_4d_tile_2d_fastpath; |
1934 | } |
1935 | #endif |
1936 | pthreadpool_parallelize( |
1937 | threadpool, parallelize_4d_tile_2d, ¶ms, sizeof(params), |
1938 | task, argument, tile_range, flags); |
1939 | } |
1940 | } |
1941 | |
/*
 * Parallelizes a 4D loop nest with 2D tiling of the two innermost dimensions
 * (k, l), passing a microarchitecture index to the task so it can dispatch to
 * a uarch-specialized kernel.
 *
 * Falls back to sequential execution on the calling thread when there is no
 * thread pool, the pool has at most one thread, or the whole iteration space
 * amounts to at most a single tile.
 */
void pthreadpool_parallelize_4d_tile_2d_with_uarch(
	pthreadpool_t threadpool,
	pthreadpool_task_4d_tile_2d_with_id_t task,
	void* argument,
	uint32_t default_uarch_index,
	uint32_t max_uarch_index,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	size_t tile_k,
	size_t tile_l,
	uint32_t flags)
{
	size_t threads_count;
	/* Serial path if: no pool, single-threaded pool, or (range_i and range_j
	 * are each 0 or 1) and k/l fit in one tile each. Note threads_count is
	 * only assigned when threadpool != NULL (short-circuit evaluation). */
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j) <= 1 && range_k <= tile_k && range_l <= tile_l)) {
		/* No thread pool used: execute task sequentially on the calling thread */

		uint32_t uarch_index = default_uarch_index;
		#if PTHREADPOOL_USE_CPUINFO
			/* Query the uarch of the core we are currently running on; fall
			 * back to the default when it exceeds the caller's kernel table. */
			uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
			if (uarch_index > max_uarch_index) {
				uarch_index = default_uarch_index;
			}
		#endif

		/* Optionally flush denormals to zero for the duration of the loop,
		 * restoring the FPU state afterwards. */
		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				for (size_t k = 0; k < range_k; k += tile_k) {
					for (size_t l = 0; l < range_l; l += tile_l) {
						/* The last tile in each dimension may be partial. */
						task(argument, uarch_index, i, j, k, l,
							min(range_k - k, tile_k), min(range_l - l, tile_l));
					}
				}
			}
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		/* Flatten the tiled 4D space into one linear range; worker threads
		 * recover (i, j, tile_k, tile_l) coordinates via fxdiv divisions. */
		const size_t tile_range_l = divide_round_up(range_l, tile_l);
		const size_t tile_range_kl = divide_round_up(range_k, tile_k) * tile_range_l;
		const size_t tile_range = range_i * range_j * tile_range_kl;
		const struct pthreadpool_4d_tile_2d_with_uarch_params params = {
			.default_uarch_index = default_uarch_index,
			.max_uarch_index = max_uarch_index,
			.range_k = range_k,
			.tile_k = tile_k,
			.range_l = range_l,
			.tile_l = tile_l,
			.range_j = fxdiv_init_size_t(range_j),
			.tile_range_kl = fxdiv_init_size_t(tile_range_kl),
			.tile_range_l = fxdiv_init_size_t(tile_range_l),
		};
		thread_function_t parallelize_4d_tile_2d_with_uarch = &thread_parallelize_4d_tile_2d_with_uarch;
		#if PTHREADPOOL_USE_FASTPATH
			/* -threads_count wraps around to SIZE_MAX - threads_count + 1;
			 * the fastpath is used only below this threshold. */
			const size_t range_threshold = -threads_count;
			if (tile_range < range_threshold) {
				parallelize_4d_tile_2d_with_uarch = &pthreadpool_thread_parallelize_4d_tile_2d_with_uarch_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_4d_tile_2d_with_uarch, &params, sizeof(params),
			task, argument, tile_range, flags);
	}
}
2013 | |
2014 | void pthreadpool_parallelize_5d( |
2015 | pthreadpool_t threadpool, |
2016 | pthreadpool_task_5d_t task, |
2017 | void* argument, |
2018 | size_t range_i, |
2019 | size_t range_j, |
2020 | size_t range_k, |
2021 | size_t range_l, |
2022 | size_t range_m, |
2023 | uint32_t flags) |
2024 | { |
2025 | size_t threads_count; |
2026 | if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i | range_j | range_k | range_l | range_m) <= 1) { |
2027 | /* No thread pool used: execute task sequentially on the calling thread */ |
2028 | struct fpu_state saved_fpu_state = { 0 }; |
2029 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
2030 | saved_fpu_state = get_fpu_state(); |
2031 | disable_fpu_denormals(); |
2032 | } |
2033 | for (size_t i = 0; i < range_i; i++) { |
2034 | for (size_t j = 0; j < range_j; j++) { |
2035 | for (size_t k = 0; k < range_k; k++) { |
2036 | for (size_t l = 0; l < range_l; l++) { |
2037 | for (size_t m = 0; m < range_m; m++) { |
2038 | task(argument, i, j, k, l, m); |
2039 | } |
2040 | } |
2041 | } |
2042 | } |
2043 | } |
2044 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
2045 | set_fpu_state(saved_fpu_state); |
2046 | } |
2047 | } else { |
2048 | const size_t range_lm = range_l * range_m; |
2049 | const size_t range = range_i * range_j * range_k * range_lm; |
2050 | const struct pthreadpool_5d_params params = { |
2051 | .range_l = range_l, |
2052 | .range_j = fxdiv_init_size_t(range_j), |
2053 | .range_k = fxdiv_init_size_t(range_k), |
2054 | .range_lm = fxdiv_init_size_t(range_lm), |
2055 | .range_m = fxdiv_init_size_t(range_m), |
2056 | }; |
2057 | thread_function_t parallelize_5d = &thread_parallelize_5d; |
2058 | #if PTHREADPOOL_USE_FASTPATH |
2059 | const size_t range_threshold = -threads_count; |
2060 | if (range < range_threshold) { |
2061 | parallelize_5d = &pthreadpool_thread_parallelize_5d_fastpath; |
2062 | } |
2063 | #endif |
2064 | pthreadpool_parallelize( |
2065 | threadpool, parallelize_5d, ¶ms, sizeof(params), |
2066 | task, argument, range, flags); |
2067 | } |
2068 | } |
2069 | |
2070 | void pthreadpool_parallelize_5d_tile_1d( |
2071 | pthreadpool_t threadpool, |
2072 | pthreadpool_task_5d_tile_1d_t task, |
2073 | void* argument, |
2074 | size_t range_i, |
2075 | size_t range_j, |
2076 | size_t range_k, |
2077 | size_t range_l, |
2078 | size_t range_m, |
2079 | size_t tile_m, |
2080 | uint32_t flags) |
2081 | { |
2082 | size_t threads_count; |
2083 | if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j | range_k | range_l) <= 1 && range_m <= tile_m)) { |
2084 | /* No thread pool used: execute task sequentially on the calling thread */ |
2085 | struct fpu_state saved_fpu_state = { 0 }; |
2086 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
2087 | saved_fpu_state = get_fpu_state(); |
2088 | disable_fpu_denormals(); |
2089 | } |
2090 | for (size_t i = 0; i < range_i; i++) { |
2091 | for (size_t j = 0; j < range_j; j++) { |
2092 | for (size_t k = 0; k < range_k; k++) { |
2093 | for (size_t l = 0; l < range_l; l++) { |
2094 | for (size_t m = 0; m < range_m; m += tile_m) { |
2095 | task(argument, i, j, k, l, m, min(range_m - m, tile_m)); |
2096 | } |
2097 | } |
2098 | } |
2099 | } |
2100 | } |
2101 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
2102 | set_fpu_state(saved_fpu_state); |
2103 | } |
2104 | } else { |
2105 | const size_t tile_range_m = divide_round_up(range_m, tile_m); |
2106 | const size_t range_kl = range_k * range_l; |
2107 | const size_t tile_range = range_i * range_j * range_kl * tile_range_m; |
2108 | const struct pthreadpool_5d_tile_1d_params params = { |
2109 | .range_k = range_k, |
2110 | .range_m = range_m, |
2111 | .tile_m = tile_m, |
2112 | .range_j = fxdiv_init_size_t(range_j), |
2113 | .range_kl = fxdiv_init_size_t(range_kl), |
2114 | .range_l = fxdiv_init_size_t(range_l), |
2115 | .tile_range_m = fxdiv_init_size_t(tile_range_m), |
2116 | }; |
2117 | thread_function_t parallelize_5d_tile_1d = &thread_parallelize_5d_tile_1d; |
2118 | #if PTHREADPOOL_USE_FASTPATH |
2119 | const size_t range_threshold = -threads_count; |
2120 | if (tile_range < range_threshold) { |
2121 | parallelize_5d_tile_1d = &pthreadpool_thread_parallelize_5d_tile_1d_fastpath; |
2122 | } |
2123 | #endif |
2124 | pthreadpool_parallelize( |
2125 | threadpool, parallelize_5d_tile_1d, ¶ms, sizeof(params), |
2126 | task, argument, tile_range, flags); |
2127 | } |
2128 | } |
2129 | |
2130 | void pthreadpool_parallelize_5d_tile_2d( |
2131 | pthreadpool_t threadpool, |
2132 | pthreadpool_task_5d_tile_2d_t task, |
2133 | void* argument, |
2134 | size_t range_i, |
2135 | size_t range_j, |
2136 | size_t range_k, |
2137 | size_t range_l, |
2138 | size_t range_m, |
2139 | size_t tile_l, |
2140 | size_t tile_m, |
2141 | uint32_t flags) |
2142 | { |
2143 | size_t threads_count; |
2144 | if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j | range_k) <= 1 && range_l <= tile_l && range_m <= tile_m)) { |
2145 | /* No thread pool used: execute task sequentially on the calling thread */ |
2146 | struct fpu_state saved_fpu_state = { 0 }; |
2147 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
2148 | saved_fpu_state = get_fpu_state(); |
2149 | disable_fpu_denormals(); |
2150 | } |
2151 | for (size_t i = 0; i < range_i; i++) { |
2152 | for (size_t j = 0; j < range_j; j++) { |
2153 | for (size_t k = 0; k < range_k; k++) { |
2154 | for (size_t l = 0; l < range_l; l += tile_l) { |
2155 | for (size_t m = 0; m < range_m; m += tile_m) { |
2156 | task(argument, i, j, k, l, m, |
2157 | min(range_l - l, tile_l), min(range_m - m, tile_m)); |
2158 | } |
2159 | } |
2160 | } |
2161 | } |
2162 | } |
2163 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
2164 | set_fpu_state(saved_fpu_state); |
2165 | } |
2166 | } else { |
2167 | const size_t tile_range_m = divide_round_up(range_m, tile_m); |
2168 | const size_t tile_range_lm = divide_round_up(range_l, tile_l) * tile_range_m; |
2169 | const size_t tile_range = range_i * range_j * range_k * tile_range_lm; |
2170 | const struct pthreadpool_5d_tile_2d_params params = { |
2171 | .range_l = range_l, |
2172 | .tile_l = tile_l, |
2173 | .range_m = range_m, |
2174 | .tile_m = tile_m, |
2175 | .range_j = fxdiv_init_size_t(range_j), |
2176 | .range_k = fxdiv_init_size_t(range_k), |
2177 | .tile_range_lm = fxdiv_init_size_t(tile_range_lm), |
2178 | .tile_range_m = fxdiv_init_size_t(tile_range_m), |
2179 | }; |
2180 | thread_function_t parallelize_5d_tile_2d = &thread_parallelize_5d_tile_2d; |
2181 | #if PTHREADPOOL_USE_FASTPATH |
2182 | const size_t range_threshold = -threads_count; |
2183 | if (tile_range < range_threshold) { |
2184 | parallelize_5d_tile_2d = &pthreadpool_thread_parallelize_5d_tile_2d_fastpath; |
2185 | } |
2186 | #endif |
2187 | pthreadpool_parallelize( |
2188 | threadpool, parallelize_5d_tile_2d, ¶ms, sizeof(params), |
2189 | task, argument, tile_range, flags); |
2190 | } |
2191 | } |
2192 | |
2193 | void pthreadpool_parallelize_6d( |
2194 | pthreadpool_t threadpool, |
2195 | pthreadpool_task_6d_t task, |
2196 | void* argument, |
2197 | size_t range_i, |
2198 | size_t range_j, |
2199 | size_t range_k, |
2200 | size_t range_l, |
2201 | size_t range_m, |
2202 | size_t range_n, |
2203 | uint32_t flags) |
2204 | { |
2205 | size_t threads_count; |
2206 | if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i | range_j | range_k | range_l | range_m | range_n) <= 1) { |
2207 | /* No thread pool used: execute task sequentially on the calling thread */ |
2208 | struct fpu_state saved_fpu_state = { 0 }; |
2209 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
2210 | saved_fpu_state = get_fpu_state(); |
2211 | disable_fpu_denormals(); |
2212 | } |
2213 | for (size_t i = 0; i < range_i; i++) { |
2214 | for (size_t j = 0; j < range_j; j++) { |
2215 | for (size_t k = 0; k < range_k; k++) { |
2216 | for (size_t l = 0; l < range_l; l++) { |
2217 | for (size_t m = 0; m < range_m; m++) { |
2218 | for (size_t n = 0; n < range_n; n++) { |
2219 | task(argument, i, j, k, l, m, n); |
2220 | } |
2221 | } |
2222 | } |
2223 | } |
2224 | } |
2225 | } |
2226 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
2227 | set_fpu_state(saved_fpu_state); |
2228 | } |
2229 | } else { |
2230 | const size_t range_lmn = range_l * range_m * range_n; |
2231 | const size_t range = range_i * range_j * range_k * range_lmn; |
2232 | const struct pthreadpool_6d_params params = { |
2233 | .range_l = range_l, |
2234 | .range_j = fxdiv_init_size_t(range_j), |
2235 | .range_k = fxdiv_init_size_t(range_k), |
2236 | .range_lmn = fxdiv_init_size_t(range_lmn), |
2237 | .range_m = fxdiv_init_size_t(range_m), |
2238 | .range_n = fxdiv_init_size_t(range_n), |
2239 | }; |
2240 | thread_function_t parallelize_6d = &thread_parallelize_6d; |
2241 | #if PTHREADPOOL_USE_FASTPATH |
2242 | const size_t range_threshold = -threads_count; |
2243 | if (range < range_threshold) { |
2244 | parallelize_6d = &pthreadpool_thread_parallelize_6d_fastpath; |
2245 | } |
2246 | #endif |
2247 | pthreadpool_parallelize( |
2248 | threadpool, parallelize_6d, ¶ms, sizeof(params), |
2249 | task, argument, range, flags); |
2250 | } |
2251 | } |
2252 | |
2253 | void pthreadpool_parallelize_6d_tile_1d( |
2254 | pthreadpool_t threadpool, |
2255 | pthreadpool_task_6d_tile_1d_t task, |
2256 | void* argument, |
2257 | size_t range_i, |
2258 | size_t range_j, |
2259 | size_t range_k, |
2260 | size_t range_l, |
2261 | size_t range_m, |
2262 | size_t range_n, |
2263 | size_t tile_n, |
2264 | uint32_t flags) |
2265 | { |
2266 | size_t threads_count; |
2267 | if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j | range_k | range_l | range_m) <= 1 && range_n <= tile_n)) { |
2268 | /* No thread pool used: execute task sequentially on the calling thread */ |
2269 | struct fpu_state saved_fpu_state = { 0 }; |
2270 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
2271 | saved_fpu_state = get_fpu_state(); |
2272 | disable_fpu_denormals(); |
2273 | } |
2274 | for (size_t i = 0; i < range_i; i++) { |
2275 | for (size_t j = 0; j < range_j; j++) { |
2276 | for (size_t k = 0; k < range_k; k++) { |
2277 | for (size_t l = 0; l < range_l; l++) { |
2278 | for (size_t m = 0; m < range_m; m++) { |
2279 | for (size_t n = 0; n < range_n; n += tile_n) { |
2280 | task(argument, i, j, k, l, m, n, min(range_n - n, tile_n)); |
2281 | } |
2282 | } |
2283 | } |
2284 | } |
2285 | } |
2286 | } |
2287 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
2288 | set_fpu_state(saved_fpu_state); |
2289 | } |
2290 | } else { |
2291 | const size_t tile_range_n = divide_round_up(range_n, tile_n); |
2292 | const size_t tile_range_lmn = range_l * range_m * tile_range_n; |
2293 | const size_t tile_range = range_i * range_j * range_k * tile_range_lmn; |
2294 | const struct pthreadpool_6d_tile_1d_params params = { |
2295 | .range_l = range_l, |
2296 | .range_n = range_n, |
2297 | .tile_n = tile_n, |
2298 | .range_j = fxdiv_init_size_t(range_j), |
2299 | .range_k = fxdiv_init_size_t(range_k), |
2300 | .tile_range_lmn = fxdiv_init_size_t(tile_range_lmn), |
2301 | .range_m = fxdiv_init_size_t(range_m), |
2302 | .tile_range_n = fxdiv_init_size_t(tile_range_n), |
2303 | }; |
2304 | thread_function_t parallelize_6d_tile_1d = &thread_parallelize_6d_tile_1d; |
2305 | #if PTHREADPOOL_USE_FASTPATH |
2306 | const size_t range_threshold = -threads_count; |
2307 | if (tile_range < range_threshold) { |
2308 | parallelize_6d_tile_1d = &pthreadpool_thread_parallelize_6d_tile_1d_fastpath; |
2309 | } |
2310 | #endif |
2311 | pthreadpool_parallelize( |
2312 | threadpool, parallelize_6d_tile_1d, ¶ms, sizeof(params), |
2313 | task, argument, tile_range, flags); |
2314 | } |
2315 | } |
2316 | |
2317 | void pthreadpool_parallelize_6d_tile_2d( |
2318 | pthreadpool_t threadpool, |
2319 | pthreadpool_task_6d_tile_2d_t task, |
2320 | void* argument, |
2321 | size_t range_i, |
2322 | size_t range_j, |
2323 | size_t range_k, |
2324 | size_t range_l, |
2325 | size_t range_m, |
2326 | size_t range_n, |
2327 | size_t tile_m, |
2328 | size_t tile_n, |
2329 | uint32_t flags) |
2330 | { |
2331 | size_t threads_count; |
2332 | if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j | range_k | range_l) <= 1 && range_m <= tile_m && range_n <= tile_n)) { |
2333 | /* No thread pool used: execute task sequentially on the calling thread */ |
2334 | struct fpu_state saved_fpu_state = { 0 }; |
2335 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
2336 | saved_fpu_state = get_fpu_state(); |
2337 | disable_fpu_denormals(); |
2338 | } |
2339 | for (size_t i = 0; i < range_i; i++) { |
2340 | for (size_t j = 0; j < range_j; j++) { |
2341 | for (size_t k = 0; k < range_k; k++) { |
2342 | for (size_t l = 0; l < range_l; l++) { |
2343 | for (size_t m = 0; m < range_m; m += tile_m) { |
2344 | for (size_t n = 0; n < range_n; n += tile_n) { |
2345 | task(argument, i, j, k, l, m, n, |
2346 | min(range_m - m, tile_m), min(range_n - n, tile_n)); |
2347 | } |
2348 | } |
2349 | } |
2350 | } |
2351 | } |
2352 | } |
2353 | if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { |
2354 | set_fpu_state(saved_fpu_state); |
2355 | } |
2356 | } else { |
2357 | const size_t range_kl = range_k * range_l; |
2358 | const size_t tile_range_n = divide_round_up(range_n, tile_n); |
2359 | const size_t tile_range_mn = divide_round_up(range_m, tile_m) * tile_range_n; |
2360 | const size_t tile_range = range_i * range_j * range_kl * tile_range_mn; |
2361 | const struct pthreadpool_6d_tile_2d_params params = { |
2362 | .range_k = range_k, |
2363 | .range_m = range_m, |
2364 | .tile_m = tile_m, |
2365 | .range_n = range_n, |
2366 | .tile_n = tile_n, |
2367 | .range_j = fxdiv_init_size_t(range_j), |
2368 | .range_kl = fxdiv_init_size_t(range_kl), |
2369 | .range_l = fxdiv_init_size_t(range_l), |
2370 | .tile_range_mn = fxdiv_init_size_t(tile_range_mn), |
2371 | .tile_range_n = fxdiv_init_size_t(tile_range_n), |
2372 | }; |
2373 | thread_function_t parallelize_6d_tile_2d = &thread_parallelize_6d_tile_2d; |
2374 | #if PTHREADPOOL_USE_FASTPATH |
2375 | const size_t range_threshold = -threads_count; |
2376 | if (tile_range < range_threshold) { |
2377 | parallelize_6d_tile_2d = &pthreadpool_thread_parallelize_6d_tile_2d_fastpath; |
2378 | } |
2379 | #endif |
2380 | pthreadpool_parallelize( |
2381 | threadpool, parallelize_6d_tile_2d, ¶ms, sizeof(params), |
2382 | task, argument, tile_range, flags); |
2383 | } |
2384 | } |
2385 | |