1#ifndef PTHREADPOOL_H_
2#define PTHREADPOOL_H_
3
4#include <stddef.h>
5#include <stdint.h>
6
/* Handle to a thread pool; pthreadpool_create() returns it as a pointer to an opaque object. */
7typedef struct pthreadpool* pthreadpool_t;
8
/*
 * Task callback types. Every callback receives the caller-supplied context
 * pointer first. Where a trailing comment is given, it lists the arguments in
 * the order shown by the snippet of the pthreadpool_parallelize_* function
 * that takes the type: grid indices (i, j, k, l), then one size per tiled
 * dimension. A tile_x argument is the extent actually passed, i.e.
 * min(range_x - x, tile_x), so it can be smaller than the requested tile.
 * The 5D and 6D types follow the same naming pattern; presumably their
 * arguments follow the same order -- confirm against the functions that take
 * them.
 */
9typedef void (*pthreadpool_task_1d_t)(void*, size_t); /* (context, i) */
10typedef void (*pthreadpool_task_1d_tile_1d_t)(void*, size_t, size_t); /* (context, i, tile) */
11typedef void (*pthreadpool_task_2d_t)(void*, size_t, size_t); /* (context, i, j) */
12typedef void (*pthreadpool_task_2d_tile_1d_t)(void*, size_t, size_t, size_t); /* (context, i, j, tile_j) */
13typedef void (*pthreadpool_task_2d_tile_2d_t)(void*, size_t, size_t, size_t, size_t); /* (context, i, j, tile_i, tile_j) */
14typedef void (*pthreadpool_task_3d_t)(void*, size_t, size_t, size_t); /* (context, i, j, k) */
15typedef void (*pthreadpool_task_3d_tile_1d_t)(void*, size_t, size_t, size_t, size_t); /* (context, i, j, k, tile_k) */
16typedef void (*pthreadpool_task_3d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t); /* (context, i, j, k, tile_j, tile_k) */
17typedef void (*pthreadpool_task_4d_t)(void*, size_t, size_t, size_t, size_t); /* (context, i, j, k, l) */
18typedef void (*pthreadpool_task_4d_tile_1d_t)(void*, size_t, size_t, size_t, size_t, size_t); /* (context, i, j, k, l, tile_l) */
19typedef void (*pthreadpool_task_4d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t); /* (context, i, j, k, l, tile_k, tile_l) */
20typedef void (*pthreadpool_task_5d_t)(void*, size_t, size_t, size_t, size_t, size_t);
21typedef void (*pthreadpool_task_5d_tile_1d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t);
22typedef void (*pthreadpool_task_5d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
23typedef void (*pthreadpool_task_6d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t);
24typedef void (*pthreadpool_task_6d_tile_1d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
25typedef void (*pthreadpool_task_6d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
26
/*
 * Microarchitecture-aware variants: the uint32_t that follows context is the
 * microarchitecture index (uarch_index) selected by the matching *_with_uarch
 * function.
 */
27typedef void (*pthreadpool_task_1d_with_id_t)(void*, uint32_t, size_t); /* (context, uarch_index, i) */
28typedef void (*pthreadpool_task_2d_tile_2d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t); /* (context, uarch_index, i, j, tile_i, tile_j) */
29typedef void (*pthreadpool_task_3d_tile_2d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t, size_t); /* (context, uarch_index, i, j, k, tile_j, tile_k) */
30typedef void (*pthreadpool_task_4d_tile_2d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t, size_t, size_t); /* (context, uarch_index, i, j, k, l, tile_k, tile_l) */
31
32
33/**
34 * Disable support for denormalized numbers to the maximum extent possible for
35 * the duration of the computation.
36 *
37 * Handling denormalized floating-point numbers is often implemented in
38 * microcode, and incurs significant performance degradation. This hint
39 * instructs the thread pool to disable support for denormalized numbers before
40 * running the computation by manipulating architecture-specific control
41 * registers, and restore the initial value of control registers after the
42 * computation is complete. The thread pool temporarily disables denormalized
43 * numbers on all threads involved in the computation (i.e. the caller threads,
44 * and potentially worker threads).
45 *
46 * Disabling denormalized numbers may have a small negative effect on results'
47 * accuracy. As various architectures differ in capabilities to control
48 * processing of denormalized numbers, using this flag may also hurt results'
49 * reproducibility across different instruction set architectures.
50 */
51#define PTHREADPOOL_FLAG_DISABLE_DENORMALS 0x00000001
52
53/**
54 * Yield worker threads to the system scheduler after the operation is finished.
55 *
56 * Force workers to use kernel wait (instead of the default active spin-wait)
57 * for new commands after this command is processed. This flag affects only the
58 * immediate next operation on this thread pool. To make the thread pool always
59 * use kernel wait, pass this flag to all parallelization functions.
60 */
61#define PTHREADPOOL_FLAG_YIELD_WORKERS 0x00000002
62
63#ifdef __cplusplus
64extern "C" {
65#endif
66
67/**
68 * Create a thread pool with the specified number of threads.
69 *
70 * @param threads_count the number of threads in the thread pool.
71 * A value of 0 has a special interpretation: it creates a thread pool with as
72 * many threads as there are logical processors in the system.
73 *
74 * @returns A pointer to an opaque thread pool object if the call is
75 * successful, or a NULL pointer if the call failed.
76 */
77pthreadpool_t pthreadpool_create(size_t threads_count);
78
79/**
80 * Query the number of threads in a thread pool.
81 *
82 * @param threadpool the thread pool to query.
 * NOTE(review): unlike the pthreadpool_parallelize_* functions, NULL is not
 * documented as an accepted value here -- confirm against the implementation.
83 *
84 * @returns The number of threads in the thread pool.
85 */
86size_t pthreadpool_get_threads_count(pthreadpool_t threadpool);
87
88/**
89 * Process items on a 1D grid.
90 *
91 * The function implements a parallel version of the following snippet:
92 *
93 *   for (size_t i = 0; i < range; i++)
94 *     function(context, i);
95 *
96 * When the function returns, all items have been processed and the thread pool
97 * is ready for a new task.
98 *
99 * @note If multiple threads call this function with the same thread pool, the
100 * calls are serialized.
101 *
102 * @param threadpool the thread pool to use for parallelisation. If threadpool
103 * is NULL, all items are processed serially on the calling thread.
104 * @param function the function to call for each item.
105 * @param context the first argument passed to the specified function.
106 * @param range the number of items on the 1D grid to process. The
107 * specified function will be called once for each item.
108 * @param flags a bitwise combination of zero or more optional flags
109 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
110 */
111void pthreadpool_parallelize_1d(
112 pthreadpool_t threadpool,
113 pthreadpool_task_1d_t function,
114 void* context,
115 size_t range,
116 uint32_t flags);
117
118/**
119 * Process items on a 1D grid using a microarchitecture-aware task function.
120 *
121 * The function implements a parallel version of the following snippet:
122 *
123 *   uint32_t uarch_index = cpuinfo_initialize() ?
124 *       cpuinfo_get_current_uarch_index() : default_uarch_index;
125 *   if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
126 *   for (size_t i = 0; i < range; i++)
127 *     function(context, uarch_index, i);
128 *
129 * When the function returns, all items have been processed and the thread pool
130 * is ready for a new task.
131 *
132 * @note If multiple threads call this function with the same thread pool, the
133 * calls are serialized.
134 *
135 * @param threadpool the thread pool to use for parallelisation. If
136 * threadpool is NULL, all items are processed serially on the calling
137 * thread.
138 * @param function the function to call for each item.
139 * @param context the first argument passed to the specified
140 * function.
141 * @param default_uarch_index the microarchitecture index to use when
142 * pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
143 * or the index returned by cpuinfo_get_current_uarch_index() exceeds the
144 * max_uarch_index value.
145 * @param max_uarch_index the maximum microarchitecture index expected by
146 * the specified function. If the index returned by
147 * cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
148 * will be used instead. default_uarch_index can exceed max_uarch_index.
149 * @param range the number of items on the 1D grid to process.
150 * The specified function will be called once for each item.
151 * @param flags a bitwise combination of zero or more optional
152 * flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
153 * PTHREADPOOL_FLAG_YIELD_WORKERS)
154 */
155void pthreadpool_parallelize_1d_with_uarch(
156 pthreadpool_t threadpool,
157 pthreadpool_task_1d_with_id_t function,
158 void* context,
159 uint32_t default_uarch_index,
160 uint32_t max_uarch_index,
161 size_t range,
162 uint32_t flags);
163
164/**
165 * Process items on a 1D grid with specified maximum tile size.
166 *
167 * The function implements a parallel version of the following snippet:
168 *
169 *   for (size_t i = 0; i < range; i += tile)
170 *     function(context, i, min(range - i, tile));
171 *
172 * When the function returns, all items have been processed and the thread pool
173 * is ready for a new task.
174 *
175 * @note If multiple threads call this function with the same thread pool,
176 * the calls are serialized.
177 *
178 * @param threadpool the thread pool to use for parallelisation. If threadpool
179 * is NULL, all items are processed serially on the calling thread.
180 * @param function the function to call for each tile.
181 * @param context the first argument passed to the specified function.
182 * @param range the number of items on the 1D grid to process.
183 * @param tile the maximum number of items on the 1D grid to process in
184 * one function call.
185 * @param flags a bitwise combination of zero or more optional flags
186 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
187 */
188void pthreadpool_parallelize_1d_tile_1d(
189 pthreadpool_t threadpool,
190 pthreadpool_task_1d_tile_1d_t function,
191 void* context,
192 size_t range,
193 size_t tile,
194 uint32_t flags);
195
196/**
197 * Process items on a 2D grid.
198 *
199 * The function implements a parallel version of the following snippet:
200 *
201 *   for (size_t i = 0; i < range_i; i++)
202 *     for (size_t j = 0; j < range_j; j++)
203 *       function(context, i, j);
204 *
205 * When the function returns, all items have been processed and the thread pool
206 * is ready for a new task.
207 *
208 * @note If multiple threads call this function with the same thread pool, the
209 * calls are serialized.
210 *
211 * @param threadpool the thread pool to use for parallelisation. If threadpool
212 * is NULL, all items are processed serially on the calling thread.
213 * @param function the function to call for each item.
214 * @param context the first argument passed to the specified function.
215 * @param range_i the number of items to process along the first dimension
216 * of the 2D grid.
217 * @param range_j the number of items to process along the second dimension
218 * of the 2D grid.
219 * @param flags a bitwise combination of zero or more optional flags
220 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
221 */
222void pthreadpool_parallelize_2d(
223 pthreadpool_t threadpool,
224 pthreadpool_task_2d_t function,
225 void* context,
226 size_t range_i,
227 size_t range_j,
228 uint32_t flags);
229
230/**
231 * Process items on a 2D grid with the specified maximum tile size along the
232 * last grid dimension.
233 *
234 * The function implements a parallel version of the following snippet:
235 *
236 *   for (size_t i = 0; i < range_i; i++)
237 *     for (size_t j = 0; j < range_j; j += tile_j)
238 *       function(context, i, j, min(range_j - j, tile_j));
239 *
240 * When the function returns, all items have been processed and the thread pool
241 * is ready for a new task.
242 *
243 * @note If multiple threads call this function with the same thread pool, the
244 * calls are serialized.
245 *
246 * @param threadpool the thread pool to use for parallelisation. If threadpool
247 * is NULL, all items are processed serially on the calling thread.
248 * @param function the function to call for each tile.
249 * @param context the first argument passed to the specified function.
250 * @param range_i the number of items to process along the first dimension
251 * of the 2D grid.
252 * @param range_j the number of items to process along the second dimension
253 * of the 2D grid.
254 * @param tile_j the maximum number of items along the second dimension of
255 * the 2D grid to process in one function call.
256 * @param flags a bitwise combination of zero or more optional flags
257 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
258 */
259void pthreadpool_parallelize_2d_tile_1d(
260 pthreadpool_t threadpool,
261 pthreadpool_task_2d_tile_1d_t function,
262 void* context,
263 size_t range_i,
264 size_t range_j,
265 size_t tile_j,
266 uint32_t flags);
267
268/**
269 * Process items on a 2D grid with the specified maximum tile size along each
270 * grid dimension.
271 *
272 * The function implements a parallel version of the following snippet:
273 *
274 *   for (size_t i = 0; i < range_i; i += tile_i)
275 *     for (size_t j = 0; j < range_j; j += tile_j)
276 *       function(context, i, j,
277 *         min(range_i - i, tile_i), min(range_j - j, tile_j));
278 *
279 * When the function returns, all items have been processed and the thread pool
280 * is ready for a new task.
281 *
282 * @note If multiple threads call this function with the same thread pool, the
283 * calls are serialized.
284 *
285 * @param threadpool the thread pool to use for parallelisation. If threadpool
286 * is NULL, all items are processed serially on the calling thread.
287 * @param function the function to call for each tile.
288 * @param context the first argument passed to the specified function.
289 * @param range_i the number of items to process along the first dimension
290 * of the 2D grid.
291 * @param range_j the number of items to process along the second dimension
292 * of the 2D grid.
293 * @param tile_i the maximum number of items along the first dimension of
294 * the 2D grid to process in one function call.
295 * @param tile_j the maximum number of items along the second dimension of
296 * the 2D grid to process in one function call.
297 * @param flags a bitwise combination of zero or more optional flags
298 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
299 */
300void pthreadpool_parallelize_2d_tile_2d(
301 pthreadpool_t threadpool,
302 pthreadpool_task_2d_tile_2d_t function,
303 void* context,
304 size_t range_i,
305 size_t range_j,
306 size_t tile_i,
307 size_t tile_j,
308 uint32_t flags);
309
310/**
311 * Process items on a 2D grid with the specified maximum tile size along each
312 * grid dimension using a microarchitecture-aware task function.
313 *
314 * The function implements a parallel version of the following snippet:
315 *
316 *   uint32_t uarch_index = cpuinfo_initialize() ?
317 *       cpuinfo_get_current_uarch_index() : default_uarch_index;
318 *   if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
319 *   for (size_t i = 0; i < range_i; i += tile_i)
320 *     for (size_t j = 0; j < range_j; j += tile_j)
321 *       function(context, uarch_index, i, j,
322 *         min(range_i - i, tile_i), min(range_j - j, tile_j));
323 *
324 * When the function returns, all items have been processed and the thread pool
325 * is ready for a new task.
326 *
327 * @note If multiple threads call this function with the same thread pool, the
328 * calls are serialized.
329 *
330 * @param threadpool the thread pool to use for parallelisation. If
331 * threadpool is NULL, all items are processed serially on the calling
332 * thread.
333 * @param function the function to call for each tile.
334 * @param context the first argument passed to the specified
335 * function.
336 * @param default_uarch_index the microarchitecture index to use when
337 * pthreadpool is configured without cpuinfo,
338 * cpuinfo initialization failed, or the index returned
339 * by cpuinfo_get_current_uarch_index() exceeds
340 * the max_uarch_index value.
341 * @param max_uarch_index the maximum microarchitecture index expected
342 * by the specified function. If the index returned
343 * by cpuinfo_get_current_uarch_index() exceeds this
344 * value, default_uarch_index will be used instead.
345 * default_uarch_index can exceed max_uarch_index.
346 * @param range_i the number of items to process along the first
347 * dimension of the 2D grid.
348 * @param range_j the number of items to process along the second
349 * dimension of the 2D grid.
350 * @param tile_i the maximum number of items along the first
351 * dimension of the 2D grid to process in one function call.
352 * @param tile_j the maximum number of items along the second
353 * dimension of the 2D grid to process in one function call.
354 * @param flags a bitwise combination of zero or more optional
355 * flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
356 * PTHREADPOOL_FLAG_YIELD_WORKERS)
357 */
358void pthreadpool_parallelize_2d_tile_2d_with_uarch(
359 pthreadpool_t threadpool,
360 pthreadpool_task_2d_tile_2d_with_id_t function,
361 void* context,
362 uint32_t default_uarch_index,
363 uint32_t max_uarch_index,
364 size_t range_i,
365 size_t range_j,
366 size_t tile_i,
367 size_t tile_j,
368 uint32_t flags);
369
370/**
371 * Process items on a 3D grid.
372 *
373 * The function implements a parallel version of the following snippet:
374 *
375 *   for (size_t i = 0; i < range_i; i++)
376 *     for (size_t j = 0; j < range_j; j++)
377 *       for (size_t k = 0; k < range_k; k++)
378 *         function(context, i, j, k);
379 *
380 * When the function returns, all items have been processed and the thread pool
381 * is ready for a new task.
382 *
383 * @note If multiple threads call this function with the same thread pool, the
384 * calls are serialized.
385 *
386 * @param threadpool the thread pool to use for parallelisation. If threadpool
387 * is NULL, all items are processed serially on the calling thread.
388 * @param function the function to call for each item.
389 * @param context the first argument passed to the specified function.
390 * @param range_i the number of items to process along the first dimension
391 * of the 3D grid.
392 * @param range_j the number of items to process along the second dimension
393 * of the 3D grid.
394 * @param range_k the number of items to process along the third dimension
395 * of the 3D grid.
396 * @param flags a bitwise combination of zero or more optional flags
397 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
398 */
399void pthreadpool_parallelize_3d(
400 pthreadpool_t threadpool,
401 pthreadpool_task_3d_t function,
402 void* context,
403 size_t range_i,
404 size_t range_j,
405 size_t range_k,
406 uint32_t flags);
407
408/**
409 * Process items on a 3D grid with the specified maximum tile size along the
410 * last grid dimension.
411 *
412 * The function implements a parallel version of the following snippet:
413 *
414 *   for (size_t i = 0; i < range_i; i++)
415 *     for (size_t j = 0; j < range_j; j++)
416 *       for (size_t k = 0; k < range_k; k += tile_k)
417 *         function(context, i, j, k, min(range_k - k, tile_k));
418 *
419 * When the function returns, all items have been processed and the thread pool
420 * is ready for a new task.
421 *
422 * @note If multiple threads call this function with the same thread pool, the
423 * calls are serialized.
424 *
425 * @param threadpool the thread pool to use for parallelisation. If threadpool
426 * is NULL, all items are processed serially on the calling thread.
427 * @param function the function to call for each tile.
428 * @param context the first argument passed to the specified function.
429 * @param range_i the number of items to process along the first dimension
430 * of the 3D grid.
431 * @param range_j the number of items to process along the second dimension
432 * of the 3D grid.
433 * @param range_k the number of items to process along the third dimension
434 * of the 3D grid.
435 * @param tile_k the maximum number of items along the third dimension of
436 * the 3D grid to process in one function call.
437 * @param flags a bitwise combination of zero or more optional flags
438 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
439 */
440void pthreadpool_parallelize_3d_tile_1d(
441 pthreadpool_t threadpool,
442 pthreadpool_task_3d_tile_1d_t function,
443 void* context,
444 size_t range_i,
445 size_t range_j,
446 size_t range_k,
447 size_t tile_k,
448 uint32_t flags);
449
450/**
451 * Process items on a 3D grid with the specified maximum tile size along the
452 * last two grid dimensions.
453 *
454 * The function implements a parallel version of the following snippet:
455 *
456 *   for (size_t i = 0; i < range_i; i++)
457 *     for (size_t j = 0; j < range_j; j += tile_j)
458 *       for (size_t k = 0; k < range_k; k += tile_k)
459 *         function(context, i, j, k,
460 *           min(range_j - j, tile_j), min(range_k - k, tile_k));
461 *
462 * When the function returns, all items have been processed and the thread pool
463 * is ready for a new task.
464 *
465 * @note If multiple threads call this function with the same thread pool, the
466 * calls are serialized.
467 *
468 * @param threadpool the thread pool to use for parallelisation. If threadpool
469 * is NULL, all items are processed serially on the calling thread.
470 * @param function the function to call for each tile.
471 * @param context the first argument passed to the specified function.
472 * @param range_i the number of items to process along the first dimension
473 * of the 3D grid.
474 * @param range_j the number of items to process along the second dimension
475 * of the 3D grid.
476 * @param range_k the number of items to process along the third dimension
477 * of the 3D grid.
478 * @param tile_j the maximum number of items along the second dimension of
479 * the 3D grid to process in one function call.
480 * @param tile_k the maximum number of items along the third dimension of
481 * the 3D grid to process in one function call.
482 * @param flags a bitwise combination of zero or more optional flags
483 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
484 */
485void pthreadpool_parallelize_3d_tile_2d(
486 pthreadpool_t threadpool,
487 pthreadpool_task_3d_tile_2d_t function,
488 void* context,
489 size_t range_i,
490 size_t range_j,
491 size_t range_k,
492 size_t tile_j,
493 size_t tile_k,
494 uint32_t flags);
495
496/**
497 * Process items on a 3D grid with the specified maximum tile size along the
498 * last two grid dimensions using a microarchitecture-aware task function.
499 *
500 * The function implements a parallel version of the following snippet:
501 *
502 *   uint32_t uarch_index = cpuinfo_initialize() ?
503 *       cpuinfo_get_current_uarch_index() : default_uarch_index;
504 *   if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
505 *   for (size_t i = 0; i < range_i; i++)
506 *     for (size_t j = 0; j < range_j; j += tile_j)
507 *       for (size_t k = 0; k < range_k; k += tile_k)
508 *         function(context, uarch_index, i, j, k,
509 *           min(range_j - j, tile_j), min(range_k - k, tile_k));
510 *
511 * When the function returns, all items have been processed and the thread pool
512 * is ready for a new task.
513 *
514 * @note If multiple threads call this function with the same thread pool, the
515 * calls are serialized.
516 *
517 * @param threadpool the thread pool to use for parallelisation. If
518 * threadpool is NULL, all items are processed serially on the calling
519 * thread.
520 * @param function the function to call for each tile.
521 * @param context the first argument passed to the specified
522 * function.
523 * @param default_uarch_index the microarchitecture index to use when
524 * pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
525 * or the index returned by cpuinfo_get_current_uarch_index() exceeds the
526 * max_uarch_index value.
527 * @param max_uarch_index the maximum microarchitecture index expected by
528 * the specified function. If the index returned by
529 * cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
530 * will be used instead. default_uarch_index can exceed max_uarch_index.
531 * @param range_i the number of items to process along the first
532 * dimension of the 3D grid.
533 * @param range_j the number of items to process along the second
534 * dimension of the 3D grid.
535 * @param range_k the number of items to process along the third
536 * dimension of the 3D grid.
537 * @param tile_j the maximum number of items along the second
538 * dimension of the 3D grid to process in one function call.
539 * @param tile_k the maximum number of items along the third
540 * dimension of the 3D grid to process in one function call.
541 * @param flags a bitwise combination of zero or more optional
542 * flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
543 * PTHREADPOOL_FLAG_YIELD_WORKERS)
544 */
545void pthreadpool_parallelize_3d_tile_2d_with_uarch(
546 pthreadpool_t threadpool,
547 pthreadpool_task_3d_tile_2d_with_id_t function,
548 void* context,
549 uint32_t default_uarch_index,
550 uint32_t max_uarch_index,
551 size_t range_i,
552 size_t range_j,
553 size_t range_k,
554 size_t tile_j,
555 size_t tile_k,
556 uint32_t flags);
557
558/**
559 * Process items on a 4D grid.
560 *
561 * The function implements a parallel version of the following snippet:
562 *
563 *   for (size_t i = 0; i < range_i; i++)
564 *     for (size_t j = 0; j < range_j; j++)
565 *       for (size_t k = 0; k < range_k; k++)
566 *         for (size_t l = 0; l < range_l; l++)
567 *           function(context, i, j, k, l);
568 *
569 * When the function returns, all items have been processed and the thread pool
570 * is ready for a new task.
571 *
572 * @note If multiple threads call this function with the same thread pool, the
573 * calls are serialized.
574 *
575 * @param threadpool the thread pool to use for parallelisation. If threadpool
576 * is NULL, all items are processed serially on the calling thread.
577 * @param function the function to call for each item.
578 * @param context the first argument passed to the specified function.
579 * @param range_i the number of items to process along the first dimension
580 * of the 4D grid.
581 * @param range_j the number of items to process along the second dimension
582 * of the 4D grid.
583 * @param range_k the number of items to process along the third dimension
584 * of the 4D grid.
585 * @param range_l the number of items to process along the fourth dimension
586 * of the 4D grid.
587 * @param flags a bitwise combination of zero or more optional flags
588 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
589 */
590void pthreadpool_parallelize_4d(
591 pthreadpool_t threadpool,
592 pthreadpool_task_4d_t function,
593 void* context,
594 size_t range_i,
595 size_t range_j,
596 size_t range_k,
597 size_t range_l,
598 uint32_t flags);
599
600/**
601 * Process items on a 4D grid with the specified maximum tile size along the
602 * last grid dimension.
603 *
604 * The function implements a parallel version of the following snippet:
605 *
606 *   for (size_t i = 0; i < range_i; i++)
607 *     for (size_t j = 0; j < range_j; j++)
608 *       for (size_t k = 0; k < range_k; k++)
609 *         for (size_t l = 0; l < range_l; l += tile_l)
610 *           function(context, i, j, k, l, min(range_l - l, tile_l));
611 *
612 * When the function returns, all items have been processed and the thread pool
613 * is ready for a new task.
614 *
615 * @note If multiple threads call this function with the same thread pool, the
616 * calls are serialized.
617 *
618 * @param threadpool the thread pool to use for parallelisation. If threadpool
619 * is NULL, all items are processed serially on the calling thread.
620 * @param function the function to call for each tile.
621 * @param context the first argument passed to the specified function.
622 * @param range_i the number of items to process along the first dimension
623 * of the 4D grid.
624 * @param range_j the number of items to process along the second dimension
625 * of the 4D grid.
626 * @param range_k the number of items to process along the third dimension
627 * of the 4D grid.
628 * @param range_l the number of items to process along the fourth dimension
629 * of the 4D grid.
630 * @param tile_l the maximum number of items along the fourth dimension of
631 * the 4D grid to process in one function call.
632 * @param flags a bitwise combination of zero or more optional flags
633 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
634 */
635void pthreadpool_parallelize_4d_tile_1d(
636 pthreadpool_t threadpool,
637 pthreadpool_task_4d_tile_1d_t function,
638 void* context,
639 size_t range_i,
640 size_t range_j,
641 size_t range_k,
642 size_t range_l,
643 size_t tile_l,
644 uint32_t flags);
645
646/**
647 * Process items on a 4D grid with the specified maximum tile size along the
648 * last two grid dimensions.
649 *
650 * The function implements a parallel version of the following snippet:
651 *
652 *   for (size_t i = 0; i < range_i; i++)
653 *     for (size_t j = 0; j < range_j; j++)
654 *       for (size_t k = 0; k < range_k; k += tile_k)
655 *         for (size_t l = 0; l < range_l; l += tile_l)
656 *           function(context, i, j, k, l,
657 *             min(range_k - k, tile_k), min(range_l - l, tile_l));
658 *
659 * When the function returns, all items have been processed and the thread pool
660 * is ready for a new task.
661 *
662 * @note If multiple threads call this function with the same thread pool, the
663 * calls are serialized.
664 *
665 * @param threadpool the thread pool to use for parallelisation. If threadpool
666 * is NULL, all items are processed serially on the calling thread.
667 * @param function the function to call for each tile.
668 * @param context the first argument passed to the specified function.
669 * @param range_i the number of items to process along the first dimension
670 * of the 4D grid.
671 * @param range_j the number of items to process along the second dimension
672 * of the 4D grid.
673 * @param range_k the number of items to process along the third dimension
674 * of the 4D grid.
675 * @param range_l the number of items to process along the fourth dimension
676 * of the 4D grid.
677 * @param tile_k the maximum number of items along the third dimension of
678 * the 4D grid to process in one function call.
679 * @param tile_l the maximum number of items along the fourth dimension of
680 * the 4D grid to process in one function call.
681 * @param flags a bitwise combination of zero or more optional flags
682 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
683 */
684void pthreadpool_parallelize_4d_tile_2d(
685 pthreadpool_t threadpool,
686 pthreadpool_task_4d_tile_2d_t function,
687 void* context,
688 size_t range_i,
689 size_t range_j,
690 size_t range_k,
691 size_t range_l,
692 size_t tile_k,
693 size_t tile_l,
694 uint32_t flags);
695
696/**
697 * Process items on a 4D grid with the specified maximum tile size along the
698 * last two grid dimensions using a microarchitecture-aware task function.
699 *
700 * The function implements a parallel version of the following snippet:
701 *
702 *   uint32_t uarch_index = cpuinfo_initialize() ?
703 *       cpuinfo_get_current_uarch_index() : default_uarch_index;
704 *   if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
705 *   for (size_t i = 0; i < range_i; i++)
706 *     for (size_t j = 0; j < range_j; j++)
707 *       for (size_t k = 0; k < range_k; k += tile_k)
708 *         for (size_t l = 0; l < range_l; l += tile_l)
709 *           function(context, uarch_index, i, j, k, l,
710 *             min(range_k - k, tile_k), min(range_l - l, tile_l));
711 *
712 * When the function returns, all items have been processed and the thread pool
713 * is ready for a new task.
714 *
715 * @note If multiple threads call this function with the same thread pool, the
716 * calls are serialized.
717 *
718 * @param threadpool the thread pool to use for parallelisation. If
719 * threadpool is NULL, all items are processed serially on the calling
720 * thread.
721 * @param function the function to call for each tile.
722 * @param context the first argument passed to the specified
723 * function.
724 * @param default_uarch_index the microarchitecture index to use when
725 * pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
726 * or the index returned by cpuinfo_get_current_uarch_index() exceeds the
727 * max_uarch_index value.
728 * @param max_uarch_index the maximum microarchitecture index expected by
729 * the specified function. If the index returned by
730 * cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
731 * will be used instead. default_uarch_index can exceed max_uarch_index.
732 * @param range_i the number of items to process along the first
733 * dimension of the 4D grid.
734 * @param range_j the number of items to process along the second
735 * dimension of the 4D grid.
736 * @param range_k the number of items to process along the third
737 * dimension of the 4D grid.
738 * @param range_l the number of items to process along the fourth
739 * dimension of the 4D grid.
740 * @param tile_k the maximum number of items along the third
741 * dimension of the 4D grid to process in one function call.
742 * @param tile_l the maximum number of items along the fourth
743 * dimension of the 4D grid to process in one function call.
744 * @param flags a bitwise combination of zero or more optional
745 * flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
746 * PTHREADPOOL_FLAG_YIELD_WORKERS)
747 */
748void pthreadpool_parallelize_4d_tile_2d_with_uarch(
749 pthreadpool_t threadpool,
750 pthreadpool_task_4d_tile_2d_with_id_t function,
751 void* context,
752 uint32_t default_uarch_index,
753 uint32_t max_uarch_index,
754 size_t range_i,
755 size_t range_j,
756 size_t range_k,
757 size_t range_l,
758 size_t tile_k,
759 size_t tile_l,
760 uint32_t flags);
761
762/**
763 * Process items on a 5D grid.
764 *
765 * The function implements a parallel version of the following snippet:
766 *
767 * for (size_t i = 0; i < range_i; i++)
768 * for (size_t j = 0; j < range_j; j++)
769 * for (size_t k = 0; k < range_k; k++)
770 * for (size_t l = 0; l < range_l; l++)
771 * for (size_t m = 0; m < range_m; m++)
772 * function(context, i, j, k, l, m);
773 *
774 * When the function returns, all items have been processed and the thread pool
775 * is ready for a new task.
776 *
777 * @note If multiple threads call this function with the same thread pool, the
778 * calls are serialized.
779 *
780 * @param threadpool the thread pool to use for parallelisation. If threadpool
781 * is NULL, all items are processed serially on the calling thread.
782 * @param function the function to call for each tile.
783 * @param context the first argument passed to the specified function.
784 * @param range_i the number of items to process along the first dimension
785 * of the 5D grid.
786 * @param range_j the number of items to process along the second dimension
787 * of the 5D grid.
788 * @param range_k the number of items to process along the third dimension
789 * of the 5D grid.
790 * @param range_l the number of items to process along the fourth dimension
791 * of the 5D grid.
792 * @param range_m the number of items to process along the fifth dimension
793 * of the 5D grid.
794 * @param flags a bitwise combination of zero or more optional flags
795 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
796 */
797void pthreadpool_parallelize_5d(
798 pthreadpool_t threadpool,
799 pthreadpool_task_5d_t function,
800 void* context,
801 size_t range_i,
802 size_t range_j,
803 size_t range_k,
804 size_t range_l,
805 size_t range_m,
806 uint32_t flags);
807
808/**
809 * Process items on a 5D grid with the specified maximum tile size along the
810 * last grid dimension.
811 *
812 * The function implements a parallel version of the following snippet:
813 *
814 * for (size_t i = 0; i < range_i; i++)
815 * for (size_t j = 0; j < range_j; j++)
816 * for (size_t k = 0; k < range_k; k++)
817 * for (size_t l = 0; l < range_l; l++)
818 * for (size_t m = 0; m < range_m; m += tile_m)
819 * function(context, i, j, k, l, m, min(range_m - m, tile_m));
820 *
821 * When the function returns, all items have been processed and the thread pool
822 * is ready for a new task.
823 *
824 * @note If multiple threads call this function with the same thread pool, the
825 * calls are serialized.
826 *
827 * @param threadpool the thread pool to use for parallelisation. If threadpool
828 * is NULL, all items are processed serially on the calling thread.
829 * @param function the function to call for each tile.
830 * @param context the first argument passed to the specified function.
831 * @param range_i the number of items to process along the first dimension
832 * of the 5D grid.
833 * @param range_j the number of items to process along the second dimension
834 * of the 5D grid.
835 * @param range_k the number of items to process along the third dimension
836 * of the 5D grid.
837 * @param range_l the number of items to process along the fourth dimension
838 * of the 5D grid.
839 * @param range_m the number of items to process along the fifth dimension
840 * of the 5D grid.
841 * @param tile_m the maximum number of items along the fifth dimension of
842 * the 5D grid to process in one function call.
843 * @param flags a bitwise combination of zero or more optional flags
844 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
845 */
846void pthreadpool_parallelize_5d_tile_1d(
847 pthreadpool_t threadpool,
848 pthreadpool_task_5d_tile_1d_t function,
849 void* context,
850 size_t range_i,
851 size_t range_j,
852 size_t range_k,
853 size_t range_l,
854 size_t range_m,
855 size_t tile_m,
856 uint32_t flags);
857
858/**
859 * Process items on a 5D grid with the specified maximum tile size along the
860 * last two grid dimensions.
861 *
862 * The function implements a parallel version of the following snippet:
863 *
864 * for (size_t i = 0; i < range_i; i++)
865 * for (size_t j = 0; j < range_j; j++)
866 * for (size_t k = 0; k < range_k; k++)
867 * for (size_t l = 0; l < range_l; l += tile_l)
868 * for (size_t m = 0; m < range_m; m += tile_m)
869 * function(context, i, j, k, l, m,
870 * min(range_l - l, tile_l), min(range_m - m, tile_m));
871 *
872 * When the function returns, all items have been processed and the thread pool
873 * is ready for a new task.
874 *
875 * @note If multiple threads call this function with the same thread pool, the
876 * calls are serialized.
877 *
878 * @param threadpool the thread pool to use for parallelisation. If threadpool
879 * is NULL, all items are processed serially on the calling thread.
880 * @param function the function to call for each tile.
881 * @param context the first argument passed to the specified function.
882 * @param range_i the number of items to process along the first dimension
883 * of the 5D grid.
884 * @param range_j the number of items to process along the second dimension
885 * of the 5D grid.
886 * @param range_k the number of items to process along the third dimension
887 * of the 5D grid.
888 * @param range_l the number of items to process along the fourth dimension
889 * of the 5D grid.
890 * @param range_m the number of items to process along the fifth dimension
891 * of the 5D grid.
892 * @param tile_l the maximum number of items along the fourth dimension of
893 * the 5D grid to process in one function call.
894 * @param tile_m the maximum number of items along the fifth dimension of
895 * the 5D grid to process in one function call.
896 * @param flags a bitwise combination of zero or more optional flags
897 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
898 */
899void pthreadpool_parallelize_5d_tile_2d(
900 pthreadpool_t threadpool,
901 pthreadpool_task_5d_tile_2d_t function,
902 void* context,
903 size_t range_i,
904 size_t range_j,
905 size_t range_k,
906 size_t range_l,
907 size_t range_m,
908 size_t tile_l,
909 size_t tile_m,
910 uint32_t flags);
911
912/**
913 * Process items on a 6D grid.
914 *
915 * The function implements a parallel version of the following snippet:
916 *
917 * for (size_t i = 0; i < range_i; i++)
918 * for (size_t j = 0; j < range_j; j++)
919 * for (size_t k = 0; k < range_k; k++)
920 * for (size_t l = 0; l < range_l; l++)
921 * for (size_t m = 0; m < range_m; m++)
922 * for (size_t n = 0; n < range_n; n++)
923 * function(context, i, j, k, l, m, n);
924 *
925 * When the function returns, all items have been processed and the thread pool
926 * is ready for a new task.
927 *
928 * @note If multiple threads call this function with the same thread pool, the
929 * calls are serialized.
930 *
931 * @param threadpool the thread pool to use for parallelisation. If threadpool
932 * is NULL, all items are processed serially on the calling thread.
933 * @param function the function to call for each tile.
934 * @param context the first argument passed to the specified function.
935 * @param range_i the number of items to process along the first dimension
936 * of the 6D grid.
937 * @param range_j the number of items to process along the second dimension
938 * of the 6D grid.
939 * @param range_k the number of items to process along the third dimension
940 * of the 6D grid.
941 * @param range_l the number of items to process along the fourth dimension
942 * of the 6D grid.
943 * @param range_m the number of items to process along the fifth dimension
944 * of the 6D grid.
945 * @param range_n the number of items to process along the sixth dimension
946 * of the 6D grid.
947 * @param tile_n the maximum number of items along the sixth dimension of
948 * the 6D grid to process in one function call.
949 * @param flags a bitwise combination of zero or more optional flags
950 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
951 */
952void pthreadpool_parallelize_6d(
953 pthreadpool_t threadpool,
954 pthreadpool_task_6d_t function,
955 void* context,
956 size_t range_i,
957 size_t range_j,
958 size_t range_k,
959 size_t range_l,
960 size_t range_m,
961 size_t range_n,
962 uint32_t flags);
963
964/**
965 * Process items on a 6D grid with the specified maximum tile size along the
966 * last grid dimension.
967 *
968 * The function implements a parallel version of the following snippet:
969 *
970 * for (size_t i = 0; i < range_i; i++)
971 * for (size_t j = 0; j < range_j; j++)
972 * for (size_t k = 0; k < range_k; k++)
973 * for (size_t l = 0; l < range_l; l++)
974 * for (size_t m = 0; m < range_m; m++)
975 * for (size_t n = 0; n < range_n; n += tile_n)
976 * function(context, i, j, k, l, m, n, min(range_n - n, tile_n));
977 *
978 * When the function returns, all items have been processed and the thread pool
979 * is ready for a new task.
980 *
981 * @note If multiple threads call this function with the same thread pool, the
982 * calls are serialized.
983 *
984 * @param threadpool the thread pool to use for parallelisation. If threadpool
985 * is NULL, all items are processed serially on the calling thread.
986 * @param function the function to call for each tile.
987 * @param context the first argument passed to the specified function.
988 * @param range_i the number of items to process along the first dimension
989 * of the 6D grid.
990 * @param range_j the number of items to process along the second dimension
991 * of the 6D grid.
992 * @param range_k the number of items to process along the third dimension
993 * of the 6D grid.
994 * @param range_l the number of items to process along the fourth dimension
995 * of the 6D grid.
996 * @param range_m the number of items to process along the fifth dimension
997 * of the 6D grid.
998 * @param range_n the number of items to process along the sixth dimension
999 * of the 6D grid.
1000 * @param tile_n the maximum number of items along the sixth dimension of
1001 * the 6D grid to process in one function call.
1002 * @param flags a bitwise combination of zero or more optional flags
1003 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1004 */
1005void pthreadpool_parallelize_6d_tile_1d(
1006 pthreadpool_t threadpool,
1007 pthreadpool_task_6d_tile_1d_t function,
1008 void* context,
1009 size_t range_i,
1010 size_t range_j,
1011 size_t range_k,
1012 size_t range_l,
1013 size_t range_m,
1014 size_t range_n,
1015 size_t tile_n,
1016 uint32_t flags);
1017
1018/**
1019 * Process items on a 6D grid with the specified maximum tile size along the
1020 * last two grid dimensions.
1021 *
1022 * The function implements a parallel version of the following snippet:
1023 *
1024 * for (size_t i = 0; i < range_i; i++)
1025 * for (size_t j = 0; j < range_j; j++)
1026 * for (size_t k = 0; k < range_k; k++)
1027 * for (size_t l = 0; l < range_l; l++)
1028 * for (size_t m = 0; m < range_m; m += tile_m)
1029 * for (size_t n = 0; n < range_n; n += tile_n)
1030 * function(context, i, j, k, l, m, n,
1031 * min(range_m - m, tile_m), min(range_n - n, tile_n));
1032 *
1033 * When the function returns, all items have been processed and the thread pool
1034 * is ready for a new task.
1035 *
1036 * @note If multiple threads call this function with the same thread pool, the
1037 * calls are serialized.
1038 *
1039 * @param threadpool the thread pool to use for parallelisation. If threadpool
1040 * is NULL, all items are processed serially on the calling thread.
1041 * @param function the function to call for each tile.
1042 * @param context the first argument passed to the specified function.
1043 * @param range_i the number of items to process along the first dimension
1044 * of the 6D grid.
1045 * @param range_j the number of items to process along the second dimension
1046 * of the 6D grid.
1047 * @param range_k the number of items to process along the third dimension
1048 * of the 6D grid.
1049 * @param range_l the number of items to process along the fourth dimension
1050 * of the 6D grid.
1051 * @param range_m the number of items to process along the fifth dimension
1052 * of the 6D grid.
1053 * @param range_n the number of items to process along the sixth dimension
1054 * of the 6D grid.
1055 * @param tile_m the maximum number of items along the fifth dimension of
1056 * the 6D grid to process in one function call.
1057 * @param tile_n the maximum number of items along the sixth dimension of
1058 * the 6D grid to process in one function call.
1059 * @param flags a bitwise combination of zero or more optional flags
1060 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1061 */
1062void pthreadpool_parallelize_6d_tile_2d(
1063 pthreadpool_t threadpool,
1064 pthreadpool_task_6d_tile_2d_t function,
1065 void* context,
1066 size_t range_i,
1067 size_t range_j,
1068 size_t range_k,
1069 size_t range_l,
1070 size_t range_m,
1071 size_t range_n,
1072 size_t tile_m,
1073 size_t tile_n,
1074 uint32_t flags);
1075
1076/**
1077 * Terminates threads in the thread pool and releases associated resources.
1078 *
1079 * @warning Accessing the thread pool after a call to this function constitutes
1080 * undefined behaviour and may cause data corruption.
1081 *
1082 * @param[in,out] threadpool The thread pool to destroy.
1083 */
1084void pthreadpool_destroy(pthreadpool_t threadpool);
1085
1086
1087#ifndef PTHREADPOOL_NO_DEPRECATED_API
1088
/*
 * Legacy API, kept for compatibility with pre-existing users (e.g. NNPACK).
 *
 * PTHREADPOOL_DEPRECATED is appended to every legacy declaration. On
 * compilers that define __GNUC__ it expands to the `deprecated` attribute;
 * on all other compilers it expands to nothing.
 */
#ifdef __GNUC__
	#define PTHREADPOOL_DEPRECATED __attribute__((__deprecated__))
#else
	#define PTHREADPOOL_DEPRECATED
#endif

/*
 * Callback types of the legacy API. Each takes a void* followed by a fixed
 * number of size_t values.
 */

/* void* + 1 size_t; accepted by pthreadpool_compute_1d. */
typedef void (*pthreadpool_function_1d_t)(
	void *, size_t) PTHREADPOOL_DEPRECATED;

/* void* + 2 size_t; accepted by pthreadpool_compute_1d_tiled. */
typedef void (*pthreadpool_function_1d_tiled_t)(
	void *, size_t, size_t) PTHREADPOOL_DEPRECATED;

/* void* + 2 size_t; accepted by pthreadpool_compute_2d. */
typedef void (*pthreadpool_function_2d_t)(
	void *, size_t, size_t) PTHREADPOOL_DEPRECATED;

/* void* + 4 size_t; accepted by pthreadpool_compute_2d_tiled. */
typedef void (*pthreadpool_function_2d_tiled_t)(
	void *, size_t, size_t, size_t, size_t) PTHREADPOOL_DEPRECATED;

/* void* + 6 size_t; accepted by pthreadpool_compute_3d_tiled. */
typedef void (*pthreadpool_function_3d_tiled_t)(
	void *, size_t, size_t, size_t,
	size_t, size_t, size_t) PTHREADPOOL_DEPRECATED;

/* void* + 8 size_t; accepted by pthreadpool_compute_4d_tiled. */
typedef void (*pthreadpool_function_4d_tiled_t)(
	void *, size_t, size_t, size_t, size_t,
	size_t, size_t, size_t, size_t) PTHREADPOOL_DEPRECATED;
1102
1103void pthreadpool_compute_1d(
1104 pthreadpool_t threadpool,
1105 pthreadpool_function_1d_t function,
1106 void* argument,
1107 size_t range) PTHREADPOOL_DEPRECATED;
1108
1109void pthreadpool_compute_1d_tiled(
1110 pthreadpool_t threadpool,
1111 pthreadpool_function_1d_tiled_t function,
1112 void* argument,
1113 size_t range,
1114 size_t tile) PTHREADPOOL_DEPRECATED;
1115
1116void pthreadpool_compute_2d(
1117 pthreadpool_t threadpool,
1118 pthreadpool_function_2d_t function,
1119 void* argument,
1120 size_t range_i,
1121 size_t range_j) PTHREADPOOL_DEPRECATED;
1122
1123void pthreadpool_compute_2d_tiled(
1124 pthreadpool_t threadpool,
1125 pthreadpool_function_2d_tiled_t function,
1126 void* argument,
1127 size_t range_i,
1128 size_t range_j,
1129 size_t tile_i,
1130 size_t tile_j) PTHREADPOOL_DEPRECATED;
1131
1132void pthreadpool_compute_3d_tiled(
1133 pthreadpool_t threadpool,
1134 pthreadpool_function_3d_tiled_t function,
1135 void* argument,
1136 size_t range_i,
1137 size_t range_j,
1138 size_t range_k,
1139 size_t tile_i,
1140 size_t tile_j,
1141 size_t tile_k) PTHREADPOOL_DEPRECATED;
1142
1143void pthreadpool_compute_4d_tiled(
1144 pthreadpool_t threadpool,
1145 pthreadpool_function_4d_tiled_t function,
1146 void* argument,
1147 size_t range_i,
1148 size_t range_j,
1149 size_t range_k,
1150 size_t range_l,
1151 size_t tile_i,
1152 size_t tile_j,
1153 size_t tile_k,
1154 size_t tile_l) PTHREADPOOL_DEPRECATED;
1155
1156#endif /* PTHREADPOOL_NO_DEPRECATED_API */
1157
1158#ifdef __cplusplus
1159} /* extern "C" */
1160#endif
1161
1162#endif /* PTHREADPOOL_H_ */
1163