/* Standard C headers */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#if PTHREADPOOL_USE_CPUINFO
	#include <cpuinfo.h>
#endif

/* Dependencies */
#include <fxdiv.h>

/* Public library header */
#include <pthreadpool.h>

/* Internal library headers */
#include "threadpool-atomics.h"
#include "threadpool-object.h"
#include "threadpool-utils.h"

size_t pthreadpool_get_threads_count(struct pthreadpool* threadpool) {
	if (threadpool == NULL) {
		return 1;
	}

	return threadpool->threads_count.value;
}

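/*
 * Note: a NULL threadpool is treated as an implicit single-threaded pool by
 * the public API (work then runs serially on the calling thread), which is
 * why the count reported here is 1 rather than 0:
 *
 *   assert(pthreadpool_get_threads_count(NULL) == 1);
 */
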
static void thread_parallelize_1d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_1d_t task = (pthreadpool_task_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, range_start++);
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			task(argument, index);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

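/*
 * A minimal caller-side sketch of the scheme above (illustrative, not part
 * of this file): each worker drains its own [range_start, range_end) slice
 * front-to-back, then steals from other threads' slices back-to-front via
 * the atomic decrement of range_end; the shared range_length counter is the
 * budget that keeps owner and thief from claiming the same index twice. The
 * task function add_one below is hypothetical.
 *
 *   static void add_one(void* argument, size_t i) {
 *     ((int*) argument)[i] += 1;
 *   }
 *
 *   int data[100] = { 0 };
 *   pthreadpool_parallelize_1d(threadpool, add_one, data, 100, 0);
 */
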
static void thread_parallelize_1d_with_uarch(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_1d_with_id_t task = (pthreadpool_task_1d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const uint32_t default_uarch_index = threadpool->params.parallelize_1d_with_uarch.default_uarch_index;
	uint32_t uarch_index = default_uarch_index;
	#if PTHREADPOOL_USE_CPUINFO
		uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
		if (uarch_index > threadpool->params.parallelize_1d_with_uarch.max_uarch_index) {
			uarch_index = default_uarch_index;
		}
	#endif

	/* Process thread's own range of items */
	size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, uarch_index, range_start++);
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			task(argument, uarch_index, index);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

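/*
 * The uarch_index forwarded to the task identifies the microarchitecture of
 * the core this worker happens to be running on (queried through cpuinfo),
 * clamped back to default_uarch_index whenever it exceeds max_uarch_index.
 * A typical use is selecting a per-microarchitecture kernel; a hypothetical
 * sketch, assuming a caller-defined table of function pointers:
 *
 *   static void run_kernel(void* context, uint32_t uarch_index, size_t i) {
 *     const struct kernel_table* table = (const struct kernel_table*) context;
 *     table->kernel_for_uarch[uarch_index](table->data, i);
 *   }
 */
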
static void thread_parallelize_1d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_1d_tile_1d_t task = (pthreadpool_task_1d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const size_t tile = threadpool->params.parallelize_1d_tile_1d.tile;
	size_t tile_start = range_start * tile;

	const size_t range = threadpool->params.parallelize_1d_tile_1d.range;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, tile_start, min(range - tile_start, tile));
		tile_start += tile;
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t tile_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const size_t tile_start = tile_index * tile;
			task(argument, tile_start, min(range - tile_start, tile));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

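/*
 * Tile arithmetic above, by example: with range = 10 and tile = 4, linear
 * tile indices 0, 1, 2 map to tile_start = 0, 4, 8, and
 * min(range - tile_start, tile) clamps the tile sizes to 4, 4, 2, so the
 * final partial tile never extends past the range.
 */
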
static void thread_parallelize_2d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_2d_t task = (pthreadpool_task_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_2d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(range_start, range_j);
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;

	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j);
		if (++j == range_j.value) {
			j = 0;
			i += 1;
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(linear_index, range_j);
			task(argument, index_i_j.quotient, index_i_j.remainder);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

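/*
 * The 2D iteration space is flattened as linear_index = i * range_j + j, so
 * a single fxdiv division by range_j recovers the pair: i is the quotient
 * and j is the remainder. fxdiv precomputes a magic multiplier for range_j,
 * keeping hardware division out of the stealing loop. For example, with
 * range_j = 7, linear_index = 17 decomposes to (i, j) = (2, 3).
 */
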
static void thread_parallelize_2d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_2d_tile_1d_t task = (pthreadpool_task_2d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_2d_tile_1d.tile_range_j;
	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(range_start, tile_range_j);
	const size_t tile_j = threadpool->params.parallelize_2d_tile_1d.tile_j;
	size_t i = tile_index_i_j.quotient;
	size_t start_j = tile_index_i_j.remainder * tile_j;

	const size_t range_j = threadpool->params.parallelize_2d_tile_1d.range_j;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, start_j, min(range_j - start_j, tile_j));
		start_j += tile_j;
		if (start_j >= range_j) {
			start_j = 0;
			i += 1;
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(linear_index, tile_range_j);
			const size_t start_j = tile_index_i_j.remainder * tile_j;
			task(argument, tile_index_i_j.quotient, start_j, min(range_j - start_j, tile_j));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

static void thread_parallelize_2d_tile_2d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_2d_tile_2d_t task = (pthreadpool_task_2d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_2d_tile_2d.tile_range_j;
	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(range_start, tile_range_j);
	const size_t tile_i = threadpool->params.parallelize_2d_tile_2d.tile_i;
	const size_t tile_j = threadpool->params.parallelize_2d_tile_2d.tile_j;
	size_t start_i = tile_index_i_j.quotient * tile_i;
	size_t start_j = tile_index_i_j.remainder * tile_j;

	const size_t range_i = threadpool->params.parallelize_2d_tile_2d.range_i;
	const size_t range_j = threadpool->params.parallelize_2d_tile_2d.range_j;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
		start_j += tile_j;
		if (start_j >= range_j) {
			start_j = 0;
			start_i += tile_i;
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(linear_index, tile_range_j);
			const size_t start_i = tile_index_i_j.quotient * tile_i;
			const size_t start_j = tile_index_i_j.remainder * tile_j;
			task(argument, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

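/*
 * With both dimensions tiled, the linear space enumerates tile pairs:
 * tile_range_j is the number of tiles along j (range_j divided by tile_j,
 * rounded up), so the quotient/remainder give the tile coordinates, which
 * are scaled back to element coordinates by tile_i and tile_j. For example,
 * range_i = range_j = 5 with tile_i = tile_j = 2 yields a 3x3 tile grid;
 * linear tile index 4 decomposes to tile (1, 1), i.e. start_i = start_j = 2
 * with both extents min(5 - 2, 2) = 2.
 */
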
static void thread_parallelize_2d_tile_2d_with_uarch(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_2d_tile_2d_with_id_t task = (pthreadpool_task_2d_tile_2d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const uint32_t default_uarch_index = threadpool->params.parallelize_2d_tile_2d_with_uarch.default_uarch_index;
	uint32_t uarch_index = default_uarch_index;
	#if PTHREADPOOL_USE_CPUINFO
		uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
		if (uarch_index > threadpool->params.parallelize_2d_tile_2d_with_uarch.max_uarch_index) {
			uarch_index = default_uarch_index;
		}
	#endif

	/* Process thread's own range of items */
	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_range_j;
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_result_size_t index = fxdiv_divide_size_t(range_start, tile_range_j);
	const size_t range_i = threadpool->params.parallelize_2d_tile_2d_with_uarch.range_i;
	const size_t tile_i = threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_i;
	const size_t range_j = threadpool->params.parallelize_2d_tile_2d_with_uarch.range_j;
	const size_t tile_j = threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_j;
	size_t start_i = index.quotient * tile_i;
	size_t start_j = index.remainder * tile_j;

	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, uarch_index, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
		start_j += tile_j;
		if (start_j >= range_j) {
			start_j = 0;
			start_i += tile_i;
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(linear_index, tile_range_j);
			const size_t start_i = tile_index_i_j.quotient * tile_i;
			const size_t start_j = tile_index_i_j.remainder * tile_j;
			task(argument, uarch_index, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

static void thread_parallelize_3d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_3d_t task = (pthreadpool_task_3d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_3d.range_k;
	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(range_start, range_k);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_3d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_ij_k.remainder;

	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, k);
		if (++k == range_k.value) {
			k = 0;
			if (++j == range_j.value) {
				j = 0;
				i += 1;
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(linear_index, range_k);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

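/*
 * 3D flattening nests the 2D case: linear_index = (i * range_j + j) * range_k + k.
 * Dividing by range_k first peels off k (remainder) and leaves the flattened
 * ij index (quotient); dividing that by range_j splits it into i and j. For
 * example, with range_j = 3 and range_k = 4, linear_index = 23 gives ij = 5
 * and k = 3, and then i = 1, j = 2.
 */
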
static void thread_parallelize_3d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_3d_tile_1d_t task = (pthreadpool_task_3d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_k = threadpool->params.parallelize_3d_tile_1d.tile_range_k;
	const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_3d_tile_1d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j);
	const size_t tile_k = threadpool->params.parallelize_3d_tile_1d.tile_k;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t start_k = tile_index_ij_k.remainder * tile_k;

	const size_t range_k = threadpool->params.parallelize_3d_tile_1d.range_k;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, start_k, min(range_k - start_k, tile_k));
		start_k += tile_k;
		if (start_k >= range_k) {
			start_k = 0;
			if (++j == range_j.value) {
				j = 0;
				i += 1;
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j);
			const size_t start_k = tile_index_ij_k.remainder * tile_k;
			task(argument, index_i_j.quotient, index_i_j.remainder, start_k, min(range_k - start_k, tile_k));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

static void thread_parallelize_3d_tile_2d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_3d_tile_2d_t task = (pthreadpool_task_3d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_k = threadpool->params.parallelize_3d_tile_2d.tile_range_k;
	const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k);
	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_3d_tile_2d.tile_range_j;
	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
	const size_t tile_j = threadpool->params.parallelize_3d_tile_2d.tile_j;
	const size_t tile_k = threadpool->params.parallelize_3d_tile_2d.tile_k;
	size_t i = tile_index_i_j.quotient;
	size_t start_j = tile_index_i_j.remainder * tile_j;
	size_t start_k = tile_index_ij_k.remainder * tile_k;

	const size_t range_k = threadpool->params.parallelize_3d_tile_2d.range_k;
	const size_t range_j = threadpool->params.parallelize_3d_tile_2d.range_j;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
		start_k += tile_k;
		if (start_k >= range_k) {
			start_k = 0;
			start_j += tile_j;
			if (start_j >= range_j) {
				start_j = 0;
				i += 1;
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k);
			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
			const size_t start_j = tile_index_i_j.remainder * tile_j;
			const size_t start_k = tile_index_ij_k.remainder * tile_k;
			task(argument, tile_index_i_j.quotient, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

static void thread_parallelize_3d_tile_2d_with_uarch(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_3d_tile_2d_with_id_t task = (pthreadpool_task_3d_tile_2d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const uint32_t default_uarch_index = threadpool->params.parallelize_3d_tile_2d_with_uarch.default_uarch_index;
	uint32_t uarch_index = default_uarch_index;
	#if PTHREADPOOL_USE_CPUINFO
		uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
		if (uarch_index > threadpool->params.parallelize_3d_tile_2d_with_uarch.max_uarch_index) {
			uarch_index = default_uarch_index;
		}
	#endif

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_k = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_range_k;
	const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k);
	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_range_j;
	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
	const size_t tile_j = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_j;
	const size_t tile_k = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_k;
	size_t i = tile_index_i_j.quotient;
	size_t start_j = tile_index_i_j.remainder * tile_j;
	size_t start_k = tile_index_ij_k.remainder * tile_k;

	const size_t range_k = threadpool->params.parallelize_3d_tile_2d_with_uarch.range_k;
	const size_t range_j = threadpool->params.parallelize_3d_tile_2d_with_uarch.range_j;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, uarch_index, i, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
		start_k += tile_k;
		if (start_k >= range_k) {
			start_k = 0;
			start_j += tile_j;
			if (start_j >= range_j) {
				start_j = 0;
				i += 1;
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k);
			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
			const size_t start_j = tile_index_i_j.remainder * tile_j;
			const size_t start_k = tile_index_ij_k.remainder * tile_k;
			task(argument, uarch_index, tile_index_i_j.quotient, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

static void thread_parallelize_4d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_4d_t task = (pthreadpool_task_4d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t range_kl = threadpool->params.parallelize_4d.range_kl;
	const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(range_start, range_kl);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t range_l = threadpool->params.parallelize_4d.range_l;
	const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_k_l.quotient;
	size_t l = index_k_l.remainder;

	const size_t range_k = threadpool->params.parallelize_4d.range_k;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, k, l);
		if (++l == range_l.value) {
			l = 0;
			if (++k == range_k) {
				k = 0;
				if (++j == range_j.value) {
					j = 0;
					i += 1;
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(linear_index, range_kl);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
			task(argument, index_i_j.quotient, index_i_j.remainder, index_k_l.quotient, index_k_l.remainder);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

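/*
 * For the 4D case the two innermost extents are pre-multiplied into one
 * fused divisor (range_kl covers range_k * range_l): dividing a linear index
 * by range_kl separates the flattened ij part (quotient) from the flattened
 * kl part (remainder), and each part is then split by a single further
 * division, by range_j and range_l respectively. range_k itself only
 * appears as a comparison bound in the mixed-radix increment chain, so it is
 * kept as a plain size_t rather than an fxdiv divisor.
 */
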
static void thread_parallelize_4d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_4d_tile_1d_t task = (pthreadpool_task_4d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_kl = threadpool->params.parallelize_4d_tile_1d.tile_range_kl;
	const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(range_start, tile_range_kl);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d_tile_1d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t tile_range_l = threadpool->params.parallelize_4d_tile_1d.tile_range_l;
	const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
	const size_t tile_l = threadpool->params.parallelize_4d_tile_1d.tile_l;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = tile_index_k_l.quotient;
	size_t start_l = tile_index_k_l.remainder * tile_l;

	const size_t range_k = threadpool->params.parallelize_4d_tile_1d.range_k;
	const size_t range_l = threadpool->params.parallelize_4d_tile_1d.range_l;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, k, start_l, min(range_l - start_l, tile_l));
		start_l += tile_l;
		if (start_l >= range_l) {
			start_l = 0;
			if (++k == range_k) {
				k = 0;
				if (++j == range_j.value) {
					j = 0;
					i += 1;
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
			const size_t start_l = tile_index_k_l.remainder * tile_l;
			task(argument, index_i_j.quotient, index_i_j.remainder, tile_index_k_l.quotient, start_l, min(range_l - start_l, tile_l));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

static void thread_parallelize_4d_tile_2d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_4d_tile_2d_t task = (pthreadpool_task_4d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_kl = threadpool->params.parallelize_4d_tile_2d.tile_range_kl;
	const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(range_start, tile_range_kl);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d_tile_2d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t tile_range_l = threadpool->params.parallelize_4d_tile_2d.tile_range_l;
	const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
	const size_t tile_k = threadpool->params.parallelize_4d_tile_2d.tile_k;
	const size_t tile_l = threadpool->params.parallelize_4d_tile_2d.tile_l;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t start_k = tile_index_k_l.quotient * tile_k;
	size_t start_l = tile_index_k_l.remainder * tile_l;

	const size_t range_l = threadpool->params.parallelize_4d_tile_2d.range_l;
	const size_t range_k = threadpool->params.parallelize_4d_tile_2d.range_k;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
		start_l += tile_l;
		if (start_l >= range_l) {
			start_l = 0;
			start_k += tile_k;
			if (start_k >= range_k) {
				start_k = 0;
				if (++j == range_j.value) {
					j = 0;
					i += 1;
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
			const size_t start_k = tile_index_k_l.quotient * tile_k;
			const size_t start_l = tile_index_k_l.remainder * tile_l;
			task(argument, index_i_j.quotient, index_i_j.remainder, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

static void thread_parallelize_4d_tile_2d_with_uarch(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_4d_tile_2d_with_id_t task = (pthreadpool_task_4d_tile_2d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const uint32_t default_uarch_index = threadpool->params.parallelize_4d_tile_2d_with_uarch.default_uarch_index;
	uint32_t uarch_index = default_uarch_index;
	#if PTHREADPOOL_USE_CPUINFO
		uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
		if (uarch_index > threadpool->params.parallelize_4d_tile_2d_with_uarch.max_uarch_index) {
			uarch_index = default_uarch_index;
		}
	#endif

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_kl = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_range_kl;
	const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(range_start, tile_range_kl);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d_tile_2d_with_uarch.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t tile_range_l = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_range_l;
	const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
	const size_t tile_k = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_k;
	const size_t tile_l = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_l;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t start_k = tile_index_k_l.quotient * tile_k;
	size_t start_l = tile_index_k_l.remainder * tile_l;

	const size_t range_l = threadpool->params.parallelize_4d_tile_2d_with_uarch.range_l;
	const size_t range_k = threadpool->params.parallelize_4d_tile_2d_with_uarch.range_k;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, uarch_index, i, j, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
		start_l += tile_l;
		if (start_l >= range_l) {
			start_l = 0;
			start_k += tile_k;
			if (start_k >= range_k) {
				start_k = 0;
				if (++j == range_j.value) {
					j = 0;
					i += 1;
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
			const size_t start_k = tile_index_k_l.quotient * tile_k;
			const size_t start_l = tile_index_k_l.remainder * tile_l;
			task(argument, uarch_index, index_i_j.quotient, index_i_j.remainder, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

static void thread_parallelize_5d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_5d_t task = (pthreadpool_task_5d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t range_lm = threadpool->params.parallelize_5d.range_lm;
	const struct fxdiv_result_size_t index_ijk_lm = fxdiv_divide_size_t(range_start, range_lm);
	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_5d.range_k;
	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lm.quotient, range_k);
	const struct fxdiv_divisor_size_t range_m = threadpool->params.parallelize_5d.range_m;
	const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_ijk_lm.remainder, range_m);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_5d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_ij_k.remainder;
	size_t l = index_l_m.quotient;
	size_t m = index_l_m.remainder;

	const size_t range_l = threadpool->params.parallelize_5d.range_l;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, k, l, m);
		if (++m == range_m.value) {
			m = 0;
			if (++l == range_l) {
				l = 0;
				if (++k == range_k.value) {
					k = 0;
					if (++j == range_j.value) {
						j = 0;
						i += 1;
					}
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t index_ijk_lm = fxdiv_divide_size_t(linear_index, range_lm);
			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lm.quotient, range_k);
			const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_ijk_lm.remainder, range_m);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

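/*
 * The same pattern generalizes to 5D and 6D: only extents that serve as
 * division radices (here range_j, range_k, range_m and the fused range_lm)
 * are stored as fxdiv divisors, while range_l, used purely as a comparison
 * bound, stays a plain size_t. After each task call the indices advance by a
 * mixed-radix +1 over (i, j, k, l, m), so a thread walking its own
 * contiguous slice divides only once, to locate its starting point; fxdiv is
 * otherwise needed only to decompose linear indices stolen from other
 * threads.
 */
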
static void thread_parallelize_5d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_5d_tile_1d_t task = (pthreadpool_task_5d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_m = threadpool->params.parallelize_5d_tile_1d.tile_range_m;
	const struct fxdiv_result_size_t tile_index_ijkl_m = fxdiv_divide_size_t(range_start, tile_range_m);
	const struct fxdiv_divisor_size_t range_kl = threadpool->params.parallelize_5d_tile_1d.range_kl;
	const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_m.quotient, range_kl);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_5d_tile_1d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t range_l = threadpool->params.parallelize_5d_tile_1d.range_l;
	const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
	const size_t tile_m = threadpool->params.parallelize_5d_tile_1d.tile_m;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_k_l.quotient;
	size_t l = index_k_l.remainder;
	size_t start_m = tile_index_ijkl_m.remainder * tile_m;

	const size_t range_m = threadpool->params.parallelize_5d_tile_1d.range_m;
	const size_t range_k = threadpool->params.parallelize_5d_tile_1d.range_k;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, k, l, start_m, min(range_m - start_m, tile_m));
		start_m += tile_m;
		if (start_m >= range_m) {
			start_m = 0;
			if (++l == range_l.value) {
				l = 0;
				if (++k == range_k) {
					k = 0;
					if (++j == range_j.value) {
						j = 0;
						i += 1;
					}
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ijkl_m = fxdiv_divide_size_t(linear_index, tile_range_m);
			const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_m.quotient, range_kl);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
			size_t start_m = tile_index_ijkl_m.remainder * tile_m;
			task(argument, index_i_j.quotient, index_i_j.remainder, index_k_l.quotient, index_k_l.remainder, start_m,
				min(range_m - start_m, tile_m));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

static void thread_parallelize_5d_tile_2d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_5d_tile_2d_t task = (pthreadpool_task_5d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_lm = threadpool->params.parallelize_5d_tile_2d.tile_range_lm;
	const struct fxdiv_result_size_t tile_index_ijk_lm = fxdiv_divide_size_t(range_start, tile_range_lm);
	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_5d_tile_2d.range_k;
	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lm.quotient, range_k);
	const struct fxdiv_divisor_size_t tile_range_m = threadpool->params.parallelize_5d_tile_2d.tile_range_m;
	const struct fxdiv_result_size_t tile_index_l_m = fxdiv_divide_size_t(tile_index_ijk_lm.remainder, tile_range_m);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_5d_tile_2d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
	const size_t tile_l = threadpool->params.parallelize_5d_tile_2d.tile_l;
	const size_t tile_m = threadpool->params.parallelize_5d_tile_2d.tile_m;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_ij_k.remainder;
	size_t start_l = tile_index_l_m.quotient * tile_l;
	size_t start_m = tile_index_l_m.remainder * tile_m;

	const size_t range_m = threadpool->params.parallelize_5d_tile_2d.range_m;
	const size_t range_l = threadpool->params.parallelize_5d_tile_2d.range_l;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, k, start_l, start_m, min(range_l - start_l, tile_l), min(range_m - start_m, tile_m));
		start_m += tile_m;
		if (start_m >= range_m) {
			start_m = 0;
			start_l += tile_l;
			if (start_l >= range_l) {
				start_l = 0;
				if (++k == range_k.value) {
					k = 0;
					if (++j == range_j.value) {
						j = 0;
						i += 1;
					}
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ijk_lm = fxdiv_divide_size_t(linear_index, tile_range_lm);
			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lm.quotient, range_k);
			const struct fxdiv_result_size_t tile_index_l_m = fxdiv_divide_size_t(tile_index_ijk_lm.remainder, tile_range_m);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
			const size_t start_l = tile_index_l_m.quotient * tile_l;
			const size_t start_m = tile_index_l_m.remainder * tile_m;
			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder,
				start_l, start_m, min(range_l - start_l, tile_l), min(range_m - start_m, tile_m));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

static void thread_parallelize_6d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_6d_t task = (pthreadpool_task_6d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t range_lmn = threadpool->params.parallelize_6d.range_lmn;
	const struct fxdiv_result_size_t index_ijk_lmn = fxdiv_divide_size_t(range_start, range_lmn);
	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_6d.range_k;
	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lmn.quotient, range_k);
	const struct fxdiv_divisor_size_t range_n = threadpool->params.parallelize_6d.range_n;
	const struct fxdiv_result_size_t index_lm_n = fxdiv_divide_size_t(index_ijk_lmn.remainder, range_n);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_6d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
	const struct fxdiv_divisor_size_t range_m = threadpool->params.parallelize_6d.range_m;
	const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_lm_n.quotient, range_m);
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_ij_k.remainder;
	size_t l = index_l_m.quotient;
	size_t m = index_l_m.remainder;
	size_t n = index_lm_n.remainder;

	const size_t range_l = threadpool->params.parallelize_6d.range_l;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, k, l, m, n);
		if (++n == range_n.value) {
			n = 0;
			if (++m == range_m.value) {
				m = 0;
				if (++l == range_l) {
					l = 0;
					if (++k == range_k.value) {
						k = 0;
						if (++j == range_j.value) {
							j = 0;
							i += 1;
						}
					}
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t index_ijk_lmn = fxdiv_divide_size_t(linear_index, range_lmn);
			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lmn.quotient, range_k);
			const struct fxdiv_result_size_t index_lm_n = fxdiv_divide_size_t(index_ijk_lmn.remainder, range_n);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
			const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_lm_n.quotient, range_m);
			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder, index_lm_n.remainder);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

1076static void thread_parallelize_6d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) {
1077 assert(threadpool != NULL);
1078 assert(thread != NULL);
1079
1080 const pthreadpool_task_6d_tile_1d_t task = (pthreadpool_task_6d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
1081 void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
1082
1083 /* Process thread's own range of items */
1084 const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
1085 const struct fxdiv_divisor_size_t tile_range_lmn = threadpool->params.parallelize_6d_tile_1d.tile_range_lmn;
1086 const struct fxdiv_result_size_t tile_index_ijk_lmn = fxdiv_divide_size_t(range_start, tile_range_lmn);
1087 const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_6d_tile_1d.range_k;
1088 const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lmn.quotient, range_k);
1089 const struct fxdiv_divisor_size_t tile_range_n = threadpool->params.parallelize_6d_tile_1d.tile_range_n;
1090 const struct fxdiv_result_size_t tile_index_lm_n = fxdiv_divide_size_t(tile_index_ijk_lmn.remainder, tile_range_n);
1091 const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_6d_tile_1d.range_j;
1092 const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
1093 const struct fxdiv_divisor_size_t range_m = threadpool->params.parallelize_6d_tile_1d.range_m;
1094 const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(tile_index_lm_n.quotient, range_m);
1095 const size_t tile_n = threadpool->params.parallelize_6d_tile_1d.tile_n;
1096 size_t i = index_i_j.quotient;
1097 size_t j = index_i_j.remainder;
1098 size_t k = index_ij_k.remainder;
1099 size_t l = index_l_m.quotient;
1100 size_t m = index_l_m.remainder;
1101 size_t start_n = tile_index_lm_n.remainder * tile_n;
1102
1103 const size_t range_n = threadpool->params.parallelize_6d_tile_1d.range_n;
1104 const size_t range_l = threadpool->params.parallelize_6d_tile_1d.range_l;
1105 while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
1106 task(argument, i, j, k, l, m, start_n, min(range_n - start_n, tile_n));
1107 start_n += tile_n;
1108 if (start_n >= range_n) {
1109 start_n = 0;
1110 if (++m == range_m.value) {
1111 m = 0;
1112 if (++l == range_l) {
1113 l = 0;
1114 if (++k == range_k.value) {
1115 k = 0;
1116 if (++j == range_j.value) {
1117 j = 0;
1118 i += 1;
1119 }
1120 }
1121 }
1122 }
1123 }
1124 }
1125
1126
1127 /* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ijk_lmn = fxdiv_divide_size_t(linear_index, tile_range_lmn);
			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lmn.quotient, range_k);
			const struct fxdiv_result_size_t tile_index_lm_n = fxdiv_divide_size_t(tile_index_ijk_lmn.remainder, tile_range_n);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
			const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(tile_index_lm_n.quotient, range_m);
			const size_t start_n = tile_index_lm_n.remainder * tile_n;
			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder,
				start_n, min(range_n - start_n, tile_n));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
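
/*
 * Worked example of the stolen-index decomposition above (illustrative
 * only, with made-up ranges): let range_i = range_j = range_k = range_l =
 * range_m = 2, range_n = 5, tile_n = 2. Then tile_range_n =
 * divide_round_up(5, 2) = 3 and tile_range_lmn = 2 * 2 * 3 = 12.
 * Decoding the stolen linear index 29:
 *
 *   29 / 12 = 2 rem 5   (tile_index_ijk_lmn)
 *    2 /  2 = 1 rem 0   (index_ij_k: k = 0)
 *    5 /  3 = 1 rem 2   (tile_index_lm_n: n-tile 2)
 *    1 /  2 = 0 rem 1   (index_i_j: i = 0, j = 1)
 *    1 /  2 = 0 rem 1   (index_l_m: l = 0, m = 1)
 *
 * so start_n = 2 * tile_n = 4 and the call is
 * task(argument, 0, 1, 0, 0, 1, 4, min(5 - 4, 2) = 1).
 */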

static void thread_parallelize_6d_tile_2d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_6d_tile_2d_t task = (pthreadpool_task_6d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_mn = threadpool->params.parallelize_6d_tile_2d.tile_range_mn;
	const struct fxdiv_result_size_t tile_index_ijkl_mn = fxdiv_divide_size_t(range_start, tile_range_mn);
	const struct fxdiv_divisor_size_t range_kl = threadpool->params.parallelize_6d_tile_2d.range_kl;
	const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_mn.quotient, range_kl);
	const struct fxdiv_divisor_size_t tile_range_n = threadpool->params.parallelize_6d_tile_2d.tile_range_n;
	const struct fxdiv_result_size_t tile_index_m_n = fxdiv_divide_size_t(tile_index_ijkl_mn.remainder, tile_range_n);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_6d_tile_2d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t range_l = threadpool->params.parallelize_6d_tile_2d.range_l;
	const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
	const size_t tile_m = threadpool->params.parallelize_6d_tile_2d.tile_m;
	const size_t tile_n = threadpool->params.parallelize_6d_tile_2d.tile_n;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_k_l.quotient;
	size_t l = index_k_l.remainder;
	size_t start_m = tile_index_m_n.quotient * tile_m;
	size_t start_n = tile_index_m_n.remainder * tile_n;

	const size_t range_n = threadpool->params.parallelize_6d_tile_2d.range_n;
	const size_t range_m = threadpool->params.parallelize_6d_tile_2d.range_m;
	const size_t range_k = threadpool->params.parallelize_6d_tile_2d.range_k;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, k, l, start_m, start_n, min(range_m - start_m, tile_m), min(range_n - start_n, tile_n));
		start_n += tile_n;
		if (start_n >= range_n) {
			start_n = 0;
			start_m += tile_m;
			if (start_m >= range_m) {
				start_m = 0;
				if (++l == range_l.value) {
					l = 0;
					if (++k == range_k) {
						k = 0;
						if (++j == range_j.value) {
							j = 0;
							i += 1;
						}
					}
				}
			}
		}
	}

	/* There may still be other threads with work to steal */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ijkl_mn = fxdiv_divide_size_t(linear_index, tile_range_mn);
			const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_mn.quotient, range_kl);
			const struct fxdiv_result_size_t tile_index_m_n = fxdiv_divide_size_t(tile_index_ijkl_mn.remainder, tile_range_n);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
			const size_t start_m = tile_index_m_n.quotient * tile_m;
			const size_t start_n = tile_index_m_n.remainder * tile_n;
			task(argument, index_i_j.quotient, index_i_j.remainder, index_k_l.quotient, index_k_l.remainder,
				start_m, start_n, min(range_m - start_m, tile_m), min(range_n - start_n, tile_n));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
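
/*
 * Note the asymmetry shared by all thread_parallelize_* functions above: a
 * thread walks its own range with carry-propagating counters (one add and
 * a few compares per item), but every stolen item is decoded from its
 * linear index from scratch, because steals land at arbitrary positions in
 * another thread's range.
 */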

void pthreadpool_parallelize_1d(
	pthreadpool_t threadpool,
	pthreadpool_task_1d_t task,
	void* argument,
	size_t range,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || range <= 1) {
		/* No thread pool used: execute task sequentially on the calling thread */
		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range; i++) {
			task(argument, i);
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		thread_function_t parallelize_1d = &thread_parallelize_1d;
		#if PTHREADPOOL_USE_FASTPATH
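			/*
			 * -threads_count wraps around to SIZE_MAX - threads_count + 1, so
			 * this guard keeps range + threads_count representable in size_t,
			 * which the fast path relies on. The same threshold recurs in
			 * every function below.
			 */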
			const size_t range_threshold = -threads_count;
			if (range < range_threshold) {
				parallelize_1d = &pthreadpool_thread_parallelize_1d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_1d, NULL, 0,
			(void*) task, argument, range, flags);
	}
}
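
/*
 * Usage sketch (illustrative only, not part of the library): square each
 * element of an array in parallel. The array type and helper names are
 * hypothetical; the pthreadpool calls are the public API above.
 *
 *   #include <pthreadpool.h>
 *
 *   struct array { float* data; };
 *
 *   static void square_item(void* context, size_t i) {
 *       float* data = ((struct array*) context)->data;
 *       data[i] *= data[i];
 *   }
 *
 *   void square_all(pthreadpool_t pool, struct array* array, size_t size) {
 *       // pool may be NULL: the task then runs on the calling thread
 *       pthreadpool_parallelize_1d(pool, square_item, array, size, 0);
 *   }
 */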

void pthreadpool_parallelize_1d_with_uarch(
	pthreadpool_t threadpool,
	pthreadpool_task_1d_with_id_t task,
	void* argument,
	uint32_t default_uarch_index,
	uint32_t max_uarch_index,
	size_t range,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || range <= 1) {
		/* No thread pool used: execute task sequentially on the calling thread */

		uint32_t uarch_index = default_uarch_index;
		#if PTHREADPOOL_USE_CPUINFO
			uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
			if (uarch_index > max_uarch_index) {
				uarch_index = default_uarch_index;
			}
		#endif

		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range; i++) {
			task(argument, uarch_index, i);
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const struct pthreadpool_1d_with_uarch_params params = {
			.default_uarch_index = default_uarch_index,
			.max_uarch_index = max_uarch_index,
		};
		thread_function_t parallelize_1d_with_uarch = &thread_parallelize_1d_with_uarch;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (range < range_threshold) {
				parallelize_1d_with_uarch = &pthreadpool_thread_parallelize_1d_with_uarch_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_1d_with_uarch, &params, sizeof(params),
			(void*) task, argument, range, flags);
	}
}
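
/*
 * Usage sketch for the _with_uarch variant (illustrative only): the extra
 * uint32_t passed to the task identifies the microarchitecture of the core
 * the item runs on, so per-core kernel selection needs no runtime query in
 * the hot loop. All names below except the pthreadpool call are
 * hypothetical.
 *
 *   static void (*const kernels[])(void*, size_t) = {
 *       kernel_generic,   // uarch index 0 (default)
 *       kernel_big_core,  // uarch index 1
 *   };
 *
 *   static void dispatch_item(void* context, uint32_t uarch, size_t i) {
 *       kernels[uarch](context, i);
 *   }
 *
 *   // default_uarch_index = 0, max_uarch_index = 1: any reported index
 *   // above 1 falls back to the default (see the clamping logic above).
 *   pthreadpool_parallelize_1d_with_uarch(
 *       pool, dispatch_item, context, 0, 1, size, 0);
 */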

void pthreadpool_parallelize_1d_tile_1d(
	pthreadpool_t threadpool,
	pthreadpool_task_1d_tile_1d_t task,
	void* argument,
	size_t range,
	size_t tile,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || range <= tile) {
		/* No thread pool used: execute task sequentially on the calling thread */
		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range; i += tile) {
			task(argument, i, min(range - i, tile));
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const size_t tile_range = divide_round_up(range, tile);
		const struct pthreadpool_1d_tile_1d_params params = {
			.range = range,
			.tile = tile,
		};
		thread_function_t parallelize_1d_tile_1d = &thread_parallelize_1d_tile_1d;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (range < range_threshold) {
				parallelize_1d_tile_1d = &pthreadpool_thread_parallelize_1d_tile_1d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_1d_tile_1d, &params, sizeof(params),
			(void*) task, argument, tile_range, flags);
	}
}
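
/*
 * Usage sketch for the tiled variant (illustrative only): the task
 * receives the tile start and the tile size, which equals tile except
 * possibly for the last, partial tile (min(range - i, tile) above). For
 * range = 10 and tile = 4 the calls are (0, 4), (4, 4), (8, 2).
 *
 *   static void process_block(void* context, size_t start, size_t count) {
 *       // process elements [start, start + count)
 *   }
 *
 *   pthreadpool_parallelize_1d_tile_1d(pool, process_block, context, 10, 4, 0);
 */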

void pthreadpool_parallelize_2d(
	pthreadpool_t threadpool,
	pthreadpool_task_2d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i | range_j) <= 1) {
		/* No thread pool used: execute task sequentially on the calling thread */
		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				task(argument, i, j);
			}
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const size_t range = range_i * range_j;
		const struct pthreadpool_2d_params params = {
			.range_j = fxdiv_init_size_t(range_j),
		};
		thread_function_t parallelize_2d = &thread_parallelize_2d;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (range < range_threshold) {
				parallelize_2d = &pthreadpool_thread_parallelize_2d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_2d, &params, sizeof(params),
			(void*) task, argument, range, flags);
	}
}
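
/*
 * The 2-d space is flattened to a single linear range: index = i * range_j
 * + j, with range = range_i * range_j items in total. Worker threads
 * invert this with one fxdiv division, e.g. for range_j = 3 the linear
 * index 7 decodes as 7 / 3 = 2 rem 1, i.e. task(argument, 2, 1). The same
 * scheme, with one extra division per extra dimension, underlies every
 * function below. Note that (range_i | range_j) <= 1 in the guard above is
 * a branch-free way of testing that both ranges are at most 1.
 */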

void pthreadpool_parallelize_2d_tile_1d(
	pthreadpool_t threadpool,
	pthreadpool_task_2d_tile_1d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t tile_j,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i <= 1 && range_j <= tile_j)) {
		/* No thread pool used: execute task sequentially on the calling thread */
		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j += tile_j) {
				task(argument, i, j, min(range_j - j, tile_j));
			}
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const size_t tile_range_j = divide_round_up(range_j, tile_j);
		const size_t tile_range = range_i * tile_range_j;
		const struct pthreadpool_2d_tile_1d_params params = {
			.range_j = range_j,
			.tile_j = tile_j,
			.tile_range_j = fxdiv_init_size_t(tile_range_j),
		};
		thread_function_t parallelize_2d_tile_1d = &thread_parallelize_2d_tile_1d;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (tile_range < range_threshold) {
				parallelize_2d_tile_1d = &pthreadpool_thread_parallelize_2d_tile_1d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_2d_tile_1d, &params, sizeof(params),
			(void*) task, argument, tile_range, flags);
	}
}

void pthreadpool_parallelize_2d_tile_2d(
	pthreadpool_t threadpool,
	pthreadpool_task_2d_tile_2d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t tile_i,
	size_t tile_j,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i <= tile_i && range_j <= tile_j)) {
		/* No thread pool used: execute task sequentially on the calling thread */
		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i += tile_i) {
			for (size_t j = 0; j < range_j; j += tile_j) {
				task(argument, i, j, min(range_i - i, tile_i), min(range_j - j, tile_j));
			}
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const size_t tile_range_i = divide_round_up(range_i, tile_i);
		const size_t tile_range_j = divide_round_up(range_j, tile_j);
		const size_t tile_range = tile_range_i * tile_range_j;
		const struct pthreadpool_2d_tile_2d_params params = {
			.range_i = range_i,
			.tile_i = tile_i,
			.range_j = range_j,
			.tile_j = tile_j,
			.tile_range_j = fxdiv_init_size_t(tile_range_j),
		};
		thread_function_t parallelize_2d_tile_2d = &thread_parallelize_2d_tile_2d;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (tile_range < range_threshold) {
				parallelize_2d_tile_2d = &pthreadpool_thread_parallelize_2d_tile_2d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_2d_tile_2d, &params, sizeof(params),
			(void*) task, argument, tile_range, flags);
	}
}
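
/*
 * Worked example of the tile grid (illustrative only): for a 100 x 50
 * matrix processed in 32 x 32 tiles, tile_range_i = divide_round_up(100,
 * 32) = 4 and tile_range_j = divide_round_up(50, 32) = 2, so the linear
 * range is 4 * 2 = 8 tile units. Edge tiles shrink: the unit covering
 * (96, 32) is passed tile sizes min(100 - 96, 32) = 4 and
 * min(50 - 32, 32) = 18.
 */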

void pthreadpool_parallelize_2d_tile_2d_with_uarch(
	pthreadpool_t threadpool,
	pthreadpool_task_2d_tile_2d_with_id_t task,
	void* argument,
	uint32_t default_uarch_index,
	uint32_t max_uarch_index,
	size_t range_i,
	size_t range_j,
	size_t tile_i,
	size_t tile_j,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i <= tile_i && range_j <= tile_j)) {
		/* No thread pool used: execute task sequentially on the calling thread */

		uint32_t uarch_index = default_uarch_index;
		#if PTHREADPOOL_USE_CPUINFO
			uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
			if (uarch_index > max_uarch_index) {
				uarch_index = default_uarch_index;
			}
		#endif

		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i += tile_i) {
			for (size_t j = 0; j < range_j; j += tile_j) {
				task(argument, uarch_index, i, j, min(range_i - i, tile_i), min(range_j - j, tile_j));
			}
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const size_t tile_range_i = divide_round_up(range_i, tile_i);
		const size_t tile_range_j = divide_round_up(range_j, tile_j);
		const size_t tile_range = tile_range_i * tile_range_j;
		const struct pthreadpool_2d_tile_2d_with_uarch_params params = {
			.default_uarch_index = default_uarch_index,
			.max_uarch_index = max_uarch_index,
			.range_i = range_i,
			.tile_i = tile_i,
			.range_j = range_j,
			.tile_j = tile_j,
			.tile_range_j = fxdiv_init_size_t(tile_range_j),
		};
		thread_function_t parallelize_2d_tile_2d_with_uarch = &thread_parallelize_2d_tile_2d_with_uarch;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (tile_range < range_threshold) {
				parallelize_2d_tile_2d_with_uarch = &pthreadpool_thread_parallelize_2d_tile_2d_with_uarch_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_2d_tile_2d_with_uarch, &params, sizeof(params),
			(void*) task, argument, tile_range, flags);
	}
}

void pthreadpool_parallelize_3d(
	pthreadpool_t threadpool,
	pthreadpool_task_3d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i | range_j | range_k) <= 1) {
		/* No thread pool used: execute task sequentially on the calling thread */
		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				for (size_t k = 0; k < range_k; k++) {
					task(argument, i, j, k);
				}
			}
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const size_t range = range_i * range_j * range_k;
		const struct pthreadpool_3d_params params = {
			.range_j = fxdiv_init_size_t(range_j),
			.range_k = fxdiv_init_size_t(range_k),
		};
		thread_function_t parallelize_3d = &thread_parallelize_3d;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (range < range_threshold) {
				parallelize_3d = &pthreadpool_thread_parallelize_3d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_3d, &params, sizeof(params),
			(void*) task, argument, range, flags);
	}
}
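
/*
 * For three dimensions the linear index is (i * range_j + j) * range_k + k,
 * and workers decode it with two fxdiv divisions: dividing by range_k
 * yields k as the remainder, and dividing the quotient by range_j yields i
 * and j. Only the inner ranges are stored as fxdiv divisors in params;
 * range_i is never divided by, so it is not needed there.
 */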

void pthreadpool_parallelize_3d_tile_1d(
	pthreadpool_t threadpool,
	pthreadpool_task_3d_tile_1d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t tile_k,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j) <= 1 && range_k <= tile_k)) {
		/* No thread pool used: execute task sequentially on the calling thread */
		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				for (size_t k = 0; k < range_k; k += tile_k) {
					task(argument, i, j, k, min(range_k - k, tile_k));
				}
			}
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const size_t tile_range_k = divide_round_up(range_k, tile_k);
		const size_t tile_range = range_i * range_j * tile_range_k;
		const struct pthreadpool_3d_tile_1d_params params = {
			.range_k = range_k,
			.tile_k = tile_k,
			.range_j = fxdiv_init_size_t(range_j),
			.tile_range_k = fxdiv_init_size_t(tile_range_k),
		};
		thread_function_t parallelize_3d_tile_1d = &thread_parallelize_3d_tile_1d;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (tile_range < range_threshold) {
				parallelize_3d_tile_1d = &pthreadpool_thread_parallelize_3d_tile_1d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_3d_tile_1d, &params, sizeof(params),
			(void*) task, argument, tile_range, flags);
	}
}

void pthreadpool_parallelize_3d_tile_2d(
	pthreadpool_t threadpool,
	pthreadpool_task_3d_tile_2d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t tile_j,
	size_t tile_k,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i <= 1 && range_j <= tile_j && range_k <= tile_k)) {
		/* No thread pool used: execute task sequentially on the calling thread */
		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j += tile_j) {
				for (size_t k = 0; k < range_k; k += tile_k) {
					task(argument, i, j, k, min(range_j - j, tile_j), min(range_k - k, tile_k));
				}
			}
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const size_t tile_range_j = divide_round_up(range_j, tile_j);
		const size_t tile_range_k = divide_round_up(range_k, tile_k);
		const size_t tile_range = range_i * tile_range_j * tile_range_k;
		const struct pthreadpool_3d_tile_2d_params params = {
			.range_j = range_j,
			.tile_j = tile_j,
			.range_k = range_k,
			.tile_k = tile_k,
			.tile_range_j = fxdiv_init_size_t(tile_range_j),
			.tile_range_k = fxdiv_init_size_t(tile_range_k),
		};
		thread_function_t parallelize_3d_tile_2d = &thread_parallelize_3d_tile_2d;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (tile_range < range_threshold) {
				parallelize_3d_tile_2d = &pthreadpool_thread_parallelize_3d_tile_2d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_3d_tile_2d, &params, sizeof(params),
			(void*) task, argument, tile_range, flags);
	}
}

void pthreadpool_parallelize_3d_tile_2d_with_uarch(
	pthreadpool_t threadpool,
	pthreadpool_task_3d_tile_2d_with_id_t task,
	void* argument,
	uint32_t default_uarch_index,
	uint32_t max_uarch_index,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t tile_j,
	size_t tile_k,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i <= 1 && range_j <= tile_j && range_k <= tile_k)) {
		/* No thread pool used: execute task sequentially on the calling thread */

		uint32_t uarch_index = default_uarch_index;
		#if PTHREADPOOL_USE_CPUINFO
			uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
			if (uarch_index > max_uarch_index) {
				uarch_index = default_uarch_index;
			}
		#endif

		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j += tile_j) {
				for (size_t k = 0; k < range_k; k += tile_k) {
					task(argument, uarch_index, i, j, k, min(range_j - j, tile_j), min(range_k - k, tile_k));
				}
			}
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const size_t tile_range_j = divide_round_up(range_j, tile_j);
		const size_t tile_range_k = divide_round_up(range_k, tile_k);
		const size_t tile_range = range_i * tile_range_j * tile_range_k;
		const struct pthreadpool_3d_tile_2d_with_uarch_params params = {
			.default_uarch_index = default_uarch_index,
			.max_uarch_index = max_uarch_index,
			.range_j = range_j,
			.tile_j = tile_j,
			.range_k = range_k,
			.tile_k = tile_k,
			.tile_range_j = fxdiv_init_size_t(tile_range_j),
			.tile_range_k = fxdiv_init_size_t(tile_range_k),
		};
		thread_function_t parallelize_3d_tile_2d_with_uarch = &thread_parallelize_3d_tile_2d_with_uarch;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (tile_range < range_threshold) {
				parallelize_3d_tile_2d_with_uarch = &pthreadpool_thread_parallelize_3d_tile_2d_with_uarch_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_3d_tile_2d_with_uarch, &params, sizeof(params),
			(void*) task, argument, tile_range, flags);
	}
}

void pthreadpool_parallelize_4d(
	pthreadpool_t threadpool,
	pthreadpool_task_4d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i | range_j | range_k | range_l) <= 1) {
		/* No thread pool used: execute task sequentially on the calling thread */
		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				for (size_t k = 0; k < range_k; k++) {
					for (size_t l = 0; l < range_l; l++) {
						task(argument, i, j, k, l);
					}
				}
			}
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const size_t range_kl = range_k * range_l;
		const size_t range = range_i * range_j * range_kl;
		const struct pthreadpool_4d_params params = {
			.range_k = range_k,
			.range_j = fxdiv_init_size_t(range_j),
			.range_kl = fxdiv_init_size_t(range_kl),
			.range_l = fxdiv_init_size_t(range_l),
		};
		thread_function_t parallelize_4d = &thread_parallelize_4d;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (range < range_threshold) {
				parallelize_4d = &pthreadpool_thread_parallelize_4d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_4d, &params, sizeof(params),
			(void*) task, argument, range, flags);
	}
}
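
/*
 * From four dimensions up, an extra fused divisor shortens the decode
 * chain: the linear index is ((i * range_j + j) * range_kl) + (k * range_l
 * + l) with range_kl = range_k * range_l, so a single division by range_kl
 * splits (i, j) from (k, l) and the two halves are then decoded
 * independently. This is why params stores range_kl as an fxdiv divisor
 * alongside range_j and range_l, while range_k is kept only as a plain
 * size_t: it is needed as a loop bound, never as a divisor.
 */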

void pthreadpool_parallelize_4d_tile_1d(
	pthreadpool_t threadpool,
	pthreadpool_task_4d_tile_1d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	size_t tile_l,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j | range_k) <= 1 && range_l <= tile_l)) {
		/* No thread pool used: execute task sequentially on the calling thread */
		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				for (size_t k = 0; k < range_k; k++) {
					for (size_t l = 0; l < range_l; l += tile_l) {
						task(argument, i, j, k, l, min(range_l - l, tile_l));
					}
				}
			}
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const size_t tile_range_l = divide_round_up(range_l, tile_l);
		const size_t tile_range_kl = range_k * tile_range_l;
		const size_t tile_range = range_i * range_j * tile_range_kl;
		const struct pthreadpool_4d_tile_1d_params params = {
			.range_k = range_k,
			.range_l = range_l,
			.tile_l = tile_l,
			.range_j = fxdiv_init_size_t(range_j),
			.tile_range_kl = fxdiv_init_size_t(tile_range_kl),
			.tile_range_l = fxdiv_init_size_t(tile_range_l),
		};
		thread_function_t parallelize_4d_tile_1d = &thread_parallelize_4d_tile_1d;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (tile_range < range_threshold) {
				parallelize_4d_tile_1d = &pthreadpool_thread_parallelize_4d_tile_1d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_4d_tile_1d, &params, sizeof(params),
			(void*) task, argument, tile_range, flags);
	}
}

void pthreadpool_parallelize_4d_tile_2d(
	pthreadpool_t threadpool,
	pthreadpool_task_4d_tile_2d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	size_t tile_k,
	size_t tile_l,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j) <= 1 && range_k <= tile_k && range_l <= tile_l)) {
		/* No thread pool used: execute task sequentially on the calling thread */
		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				for (size_t k = 0; k < range_k; k += tile_k) {
					for (size_t l = 0; l < range_l; l += tile_l) {
						task(argument, i, j, k, l,
							min(range_k - k, tile_k), min(range_l - l, tile_l));
					}
				}
			}
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const size_t tile_range_l = divide_round_up(range_l, tile_l);
		const size_t tile_range_kl = divide_round_up(range_k, tile_k) * tile_range_l;
		const size_t tile_range = range_i * range_j * tile_range_kl;
		const struct pthreadpool_4d_tile_2d_params params = {
			.range_k = range_k,
			.tile_k = tile_k,
			.range_l = range_l,
			.tile_l = tile_l,
			.range_j = fxdiv_init_size_t(range_j),
			.tile_range_kl = fxdiv_init_size_t(tile_range_kl),
			.tile_range_l = fxdiv_init_size_t(tile_range_l),
		};
		thread_function_t parallelize_4d_tile_2d = &thread_parallelize_4d_tile_2d;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (tile_range < range_threshold) {
				parallelize_4d_tile_2d = &pthreadpool_thread_parallelize_4d_tile_2d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_4d_tile_2d, &params, sizeof(params),
			(void*) task, argument, tile_range, flags);
	}
}

void pthreadpool_parallelize_4d_tile_2d_with_uarch(
	pthreadpool_t threadpool,
	pthreadpool_task_4d_tile_2d_with_id_t task,
	void* argument,
	uint32_t default_uarch_index,
	uint32_t max_uarch_index,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	size_t tile_k,
	size_t tile_l,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j) <= 1 && range_k <= tile_k && range_l <= tile_l)) {
		/* No thread pool used: execute task sequentially on the calling thread */

		uint32_t uarch_index = default_uarch_index;
		#if PTHREADPOOL_USE_CPUINFO
			uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
			if (uarch_index > max_uarch_index) {
				uarch_index = default_uarch_index;
			}
		#endif

		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				for (size_t k = 0; k < range_k; k += tile_k) {
					for (size_t l = 0; l < range_l; l += tile_l) {
						task(argument, uarch_index, i, j, k, l,
							min(range_k - k, tile_k), min(range_l - l, tile_l));
					}
				}
			}
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const size_t tile_range_l = divide_round_up(range_l, tile_l);
		const size_t tile_range_kl = divide_round_up(range_k, tile_k) * tile_range_l;
		const size_t tile_range = range_i * range_j * tile_range_kl;
		const struct pthreadpool_4d_tile_2d_with_uarch_params params = {
			.default_uarch_index = default_uarch_index,
			.max_uarch_index = max_uarch_index,
			.range_k = range_k,
			.tile_k = tile_k,
			.range_l = range_l,
			.tile_l = tile_l,
			.range_j = fxdiv_init_size_t(range_j),
			.tile_range_kl = fxdiv_init_size_t(tile_range_kl),
			.tile_range_l = fxdiv_init_size_t(tile_range_l),
		};
		thread_function_t parallelize_4d_tile_2d_with_uarch = &thread_parallelize_4d_tile_2d_with_uarch;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (tile_range < range_threshold) {
				parallelize_4d_tile_2d_with_uarch = &pthreadpool_thread_parallelize_4d_tile_2d_with_uarch_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_4d_tile_2d_with_uarch, &params, sizeof(params),
			(void*) task, argument, tile_range, flags);
	}
}

void pthreadpool_parallelize_5d(
	pthreadpool_t threadpool,
	pthreadpool_task_5d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	size_t range_m,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i | range_j | range_k | range_l | range_m) <= 1) {
		/* No thread pool used: execute task sequentially on the calling thread */
		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				for (size_t k = 0; k < range_k; k++) {
					for (size_t l = 0; l < range_l; l++) {
						for (size_t m = 0; m < range_m; m++) {
							task(argument, i, j, k, l, m);
						}
					}
				}
			}
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const size_t range_lm = range_l * range_m;
		const size_t range = range_i * range_j * range_k * range_lm;
		const struct pthreadpool_5d_params params = {
			.range_l = range_l,
			.range_j = fxdiv_init_size_t(range_j),
			.range_k = fxdiv_init_size_t(range_k),
			.range_lm = fxdiv_init_size_t(range_lm),
			.range_m = fxdiv_init_size_t(range_m),
		};
		thread_function_t parallelize_5d = &thread_parallelize_5d;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (range < range_threshold) {
				parallelize_5d = &pthreadpool_thread_parallelize_5d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_5d, &params, sizeof(params),
			(void*) task, argument, range, flags);
	}
}

void pthreadpool_parallelize_5d_tile_1d(
	pthreadpool_t threadpool,
	pthreadpool_task_5d_tile_1d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	size_t range_m,
	size_t tile_m,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j | range_k | range_l) <= 1 && range_m <= tile_m)) {
		/* No thread pool used: execute task sequentially on the calling thread */
		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				for (size_t k = 0; k < range_k; k++) {
					for (size_t l = 0; l < range_l; l++) {
						for (size_t m = 0; m < range_m; m += tile_m) {
							task(argument, i, j, k, l, m, min(range_m - m, tile_m));
						}
					}
				}
			}
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const size_t tile_range_m = divide_round_up(range_m, tile_m);
		const size_t range_kl = range_k * range_l;
		const size_t tile_range = range_i * range_j * range_kl * tile_range_m;
		const struct pthreadpool_5d_tile_1d_params params = {
			.range_k = range_k,
			.range_m = range_m,
			.tile_m = tile_m,
			.range_j = fxdiv_init_size_t(range_j),
			.range_kl = fxdiv_init_size_t(range_kl),
			.range_l = fxdiv_init_size_t(range_l),
			.tile_range_m = fxdiv_init_size_t(tile_range_m),
		};
		thread_function_t parallelize_5d_tile_1d = &thread_parallelize_5d_tile_1d;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (tile_range < range_threshold) {
				parallelize_5d_tile_1d = &pthreadpool_thread_parallelize_5d_tile_1d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_5d_tile_1d, &params, sizeof(params),
			(void*) task, argument, tile_range, flags);
	}
}

void pthreadpool_parallelize_5d_tile_2d(
	pthreadpool_t threadpool,
	pthreadpool_task_5d_tile_2d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	size_t range_m,
	size_t tile_l,
	size_t tile_m,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j | range_k) <= 1 && range_l <= tile_l && range_m <= tile_m)) {
		/* No thread pool used: execute task sequentially on the calling thread */
		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				for (size_t k = 0; k < range_k; k++) {
					for (size_t l = 0; l < range_l; l += tile_l) {
						for (size_t m = 0; m < range_m; m += tile_m) {
							task(argument, i, j, k, l, m,
								min(range_l - l, tile_l), min(range_m - m, tile_m));
						}
					}
				}
			}
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const size_t tile_range_m = divide_round_up(range_m, tile_m);
		const size_t tile_range_lm = divide_round_up(range_l, tile_l) * tile_range_m;
		const size_t tile_range = range_i * range_j * range_k * tile_range_lm;
		const struct pthreadpool_5d_tile_2d_params params = {
			.range_l = range_l,
			.tile_l = tile_l,
			.range_m = range_m,
			.tile_m = tile_m,
			.range_j = fxdiv_init_size_t(range_j),
			.range_k = fxdiv_init_size_t(range_k),
			.tile_range_lm = fxdiv_init_size_t(tile_range_lm),
			.tile_range_m = fxdiv_init_size_t(tile_range_m),
		};
		thread_function_t parallelize_5d_tile_2d = &thread_parallelize_5d_tile_2d;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (tile_range < range_threshold) {
				parallelize_5d_tile_2d = &pthreadpool_thread_parallelize_5d_tile_2d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_5d_tile_2d, &params, sizeof(params),
			(void*) task, argument, tile_range, flags);
	}
}

void pthreadpool_parallelize_6d(
	pthreadpool_t threadpool,
	pthreadpool_task_6d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	size_t range_m,
	size_t range_n,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i | range_j | range_k | range_l | range_m | range_n) <= 1) {
		/* No thread pool used: execute task sequentially on the calling thread */
		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				for (size_t k = 0; k < range_k; k++) {
					for (size_t l = 0; l < range_l; l++) {
						for (size_t m = 0; m < range_m; m++) {
							for (size_t n = 0; n < range_n; n++) {
								task(argument, i, j, k, l, m, n);
							}
						}
					}
				}
			}
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const size_t range_lmn = range_l * range_m * range_n;
		const size_t range = range_i * range_j * range_k * range_lmn;
		const struct pthreadpool_6d_params params = {
			.range_l = range_l,
			.range_j = fxdiv_init_size_t(range_j),
			.range_k = fxdiv_init_size_t(range_k),
			.range_lmn = fxdiv_init_size_t(range_lmn),
			.range_m = fxdiv_init_size_t(range_m),
			.range_n = fxdiv_init_size_t(range_n),
		};
		thread_function_t parallelize_6d = &thread_parallelize_6d;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (range < range_threshold) {
				parallelize_6d = &pthreadpool_thread_parallelize_6d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_6d, &params, sizeof(params),
			(void*) task, argument, range, flags);
	}
}

void pthreadpool_parallelize_6d_tile_1d(
	pthreadpool_t threadpool,
	pthreadpool_task_6d_tile_1d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	size_t range_m,
	size_t range_n,
	size_t tile_n,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j | range_k | range_l | range_m) <= 1 && range_n <= tile_n)) {
		/* No thread pool used: execute task sequentially on the calling thread */
		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				for (size_t k = 0; k < range_k; k++) {
					for (size_t l = 0; l < range_l; l++) {
						for (size_t m = 0; m < range_m; m++) {
							for (size_t n = 0; n < range_n; n += tile_n) {
								task(argument, i, j, k, l, m, n, min(range_n - n, tile_n));
							}
						}
					}
				}
			}
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const size_t tile_range_n = divide_round_up(range_n, tile_n);
		const size_t tile_range_lmn = range_l * range_m * tile_range_n;
		const size_t tile_range = range_i * range_j * range_k * tile_range_lmn;
		const struct pthreadpool_6d_tile_1d_params params = {
			.range_l = range_l,
			.range_n = range_n,
			.tile_n = tile_n,
			.range_j = fxdiv_init_size_t(range_j),
			.range_k = fxdiv_init_size_t(range_k),
			.tile_range_lmn = fxdiv_init_size_t(tile_range_lmn),
			.range_m = fxdiv_init_size_t(range_m),
			.tile_range_n = fxdiv_init_size_t(tile_range_n),
		};
		thread_function_t parallelize_6d_tile_1d = &thread_parallelize_6d_tile_1d;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (tile_range < range_threshold) {
				parallelize_6d_tile_1d = &pthreadpool_thread_parallelize_6d_tile_1d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_6d_tile_1d, &params, sizeof(params),
			(void*) task, argument, tile_range, flags);
	}
}

void pthreadpool_parallelize_6d_tile_2d(
	pthreadpool_t threadpool,
	pthreadpool_task_6d_tile_2d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	size_t range_m,
	size_t range_n,
	size_t tile_m,
	size_t tile_n,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j | range_k | range_l) <= 1 && range_m <= tile_m && range_n <= tile_n)) {
		/* No thread pool used: execute task sequentially on the calling thread */
		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				for (size_t k = 0; k < range_k; k++) {
					for (size_t l = 0; l < range_l; l++) {
						for (size_t m = 0; m < range_m; m += tile_m) {
							for (size_t n = 0; n < range_n; n += tile_n) {
								task(argument, i, j, k, l, m, n,
									min(range_m - m, tile_m), min(range_n - n, tile_n));
							}
						}
					}
				}
			}
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const size_t range_kl = range_k * range_l;
		const size_t tile_range_n = divide_round_up(range_n, tile_n);
		const size_t tile_range_mn = divide_round_up(range_m, tile_m) * tile_range_n;
		const size_t tile_range = range_i * range_j * range_kl * tile_range_mn;
		const struct pthreadpool_6d_tile_2d_params params = {
			.range_k = range_k,
			.range_m = range_m,
			.tile_m = tile_m,
			.range_n = range_n,
			.tile_n = tile_n,
			.range_j = fxdiv_init_size_t(range_j),
			.range_kl = fxdiv_init_size_t(range_kl),
			.range_l = fxdiv_init_size_t(range_l),
			.tile_range_mn = fxdiv_init_size_t(tile_range_mn),
			.tile_range_n = fxdiv_init_size_t(tile_range_n),
		};
		thread_function_t parallelize_6d_tile_2d = &thread_parallelize_6d_tile_2d;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (tile_range < range_threshold) {
				parallelize_6d_tile_2d = &pthreadpool_thread_parallelize_6d_tile_2d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_6d_tile_2d, &params, sizeof(params),
			(void*) task, argument, tile_range, flags);
	}
}
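
/*
 * Usage sketch for the most general variant (illustrative only; all names
 * except the pthreadpool call are hypothetical): a 6-d loop nest tiled in
 * its two innermost dimensions, as might arise for a batched, grouped
 * convolution. The task receives four untiled indices, the (m, n) tile
 * origin, and the actual tile extents.
 *
 *   static void conv_tile(void* ctx, size_t batch, size_t group,
 *       size_t out_c, size_t in_c, size_t y, size_t x,
 *       size_t tile_y, size_t tile_x) {
 *       // process the tile_y x tile_x block at (y, x)
 *   }
 *
 *   pthreadpool_parallelize_6d_tile_2d(pool, conv_tile, ctx,
 *       batch_size, groups, out_channels, in_channels, height, width,
 *       8, 8, 0);
 */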