/* Standard C headers */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#if PTHREADPOOL_USE_CPUINFO
	#include <cpuinfo.h>
#endif

/* Dependencies */
#include <fxdiv.h>

/* Public library header */
#include <pthreadpool.h>

/* Internal library headers */
#include "threadpool-atomics.h"
#include "threadpool-object.h"
#include "threadpool-utils.h"

size_t pthreadpool_get_threads_count(struct pthreadpool* threadpool) {
	if (threadpool == NULL) {
		return 1;
	}

	return threadpool->threads_count.value;
}

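/*
 * Note: a NULL threadpool is treated as an implicit single-threaded pool by
 * the public API (work then runs serially on the calling thread), which is
 * why the count reported here is 1 rather than 0:
 *
 *   assert(pthreadpool_get_threads_count(NULL) == 1);
 */
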
static void thread_parallelize_1d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_1d_t task = (pthreadpool_task_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, range_start++);
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			task(argument, index);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

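/*
 * A minimal caller-side sketch of the scheme above (illustrative, not part
 * of this file): each worker drains its own [range_start, range_end) slice
 * front-to-back, then steals from other threads' slices back-to-front via
 * the atomic decrement of range_end; the shared range_length counter is the
 * budget that keeps owner and thief from claiming the same index twice. The
 * task function add_one below is hypothetical.
 *
 *   static void add_one(void* argument, size_t i) {
 *     ((int*) argument)[i] += 1;
 *   }
 *
 *   int data[100] = { 0 };
 *   pthreadpool_parallelize_1d(threadpool, add_one, data, 100, 0);
 */
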
static void thread_parallelize_1d_with_uarch(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_1d_with_id_t task = (pthreadpool_task_1d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const uint32_t default_uarch_index = threadpool->params.parallelize_1d_with_uarch.default_uarch_index;
	uint32_t uarch_index = default_uarch_index;
	#if PTHREADPOOL_USE_CPUINFO
		uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
		if (uarch_index > threadpool->params.parallelize_1d_with_uarch.max_uarch_index) {
			uarch_index = default_uarch_index;
		}
	#endif

	/* Process thread's own range of items */
	size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, uarch_index, range_start++);
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			task(argument, uarch_index, index);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

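/*
 * The uarch_index forwarded to the task identifies the microarchitecture of
 * the core this worker happens to be running on (queried through cpuinfo),
 * clamped back to default_uarch_index whenever it exceeds max_uarch_index.
 * A typical use is selecting a per-microarchitecture kernel; a hypothetical
 * sketch, assuming a caller-defined table of function pointers:
 *
 *   static void run_kernel(void* context, uint32_t uarch_index, size_t i) {
 *     const struct kernel_table* table = (const struct kernel_table*) context;
 *     table->kernel_for_uarch[uarch_index](table->data, i);
 *   }
 */
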
static void thread_parallelize_1d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_1d_tile_1d_t task = (pthreadpool_task_1d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const size_t tile = threadpool->params.parallelize_1d_tile_1d.tile;
	size_t tile_start = range_start * tile;

	const size_t range = threadpool->params.parallelize_1d_tile_1d.range;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, tile_start, min(range - tile_start, tile));
		tile_start += tile;
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t tile_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const size_t tile_start = tile_index * tile;
			task(argument, tile_start, min(range - tile_start, tile));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

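/*
 * Tile arithmetic above, by example: with range = 10 and tile = 4, linear
 * tile indices 0, 1, 2 map to tile_start = 0, 4, 8, and
 * min(range - tile_start, tile) clamps the tile sizes to 4, 4, 2, so the
 * final partial tile never extends past the range.
 */
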
static void thread_parallelize_2d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_2d_t task = (pthreadpool_task_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_2d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(range_start, range_j);
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;

	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j);
		if (++j == range_j.value) {
			j = 0;
			i += 1;
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(linear_index, range_j);
			task(argument, index_i_j.quotient, index_i_j.remainder);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

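/*
 * The 2D iteration space is flattened as linear_index = i * range_j + j, so
 * a single fxdiv division by range_j recovers the pair: i is the quotient
 * and j is the remainder. fxdiv precomputes a magic multiplier for range_j,
 * keeping hardware division out of the stealing loop. For example, with
 * range_j = 7, linear_index = 17 decomposes to (i, j) = (2, 3).
 */
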
static void thread_parallelize_2d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_2d_tile_1d_t task = (pthreadpool_task_2d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_2d_tile_1d.tile_range_j;
	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(range_start, tile_range_j);
	const size_t tile_j = threadpool->params.parallelize_2d_tile_1d.tile_j;
	size_t i = tile_index_i_j.quotient;
	size_t start_j = tile_index_i_j.remainder * tile_j;

	const size_t range_j = threadpool->params.parallelize_2d_tile_1d.range_j;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, start_j, min(range_j - start_j, tile_j));
		start_j += tile_j;
		if (start_j >= range_j) {
			start_j = 0;
			i += 1;
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(linear_index, tile_range_j);
			const size_t start_j = tile_index_i_j.remainder * tile_j;
			task(argument, tile_index_i_j.quotient, start_j, min(range_j - start_j, tile_j));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

static void thread_parallelize_2d_tile_2d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_2d_tile_2d_t task = (pthreadpool_task_2d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_2d_tile_2d.tile_range_j;
	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(range_start, tile_range_j);
	const size_t tile_i = threadpool->params.parallelize_2d_tile_2d.tile_i;
	const size_t tile_j = threadpool->params.parallelize_2d_tile_2d.tile_j;
	size_t start_i = tile_index_i_j.quotient * tile_i;
	size_t start_j = tile_index_i_j.remainder * tile_j;

	const size_t range_i = threadpool->params.parallelize_2d_tile_2d.range_i;
	const size_t range_j = threadpool->params.parallelize_2d_tile_2d.range_j;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
		start_j += tile_j;
		if (start_j >= range_j) {
			start_j = 0;
			start_i += tile_i;
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(linear_index, tile_range_j);
			const size_t start_i = tile_index_i_j.quotient * tile_i;
			const size_t start_j = tile_index_i_j.remainder * tile_j;
			task(argument, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

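/*
 * With both dimensions tiled, the linear space enumerates tile pairs:
 * tile_range_j is the number of tiles along j (range_j divided by tile_j,
 * rounded up), so the quotient/remainder give the tile coordinates, which
 * are scaled back to element coordinates by tile_i and tile_j. For example,
 * range_i = range_j = 5 with tile_i = tile_j = 2 yields a 3x3 tile grid;
 * linear tile index 4 decomposes to tile (1, 1), i.e. start_i = start_j = 2
 * with both extents min(5 - 2, 2) = 2.
 */
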
static void thread_parallelize_2d_tile_2d_with_uarch(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_2d_tile_2d_with_id_t task = (pthreadpool_task_2d_tile_2d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const uint32_t default_uarch_index = threadpool->params.parallelize_2d_tile_2d_with_uarch.default_uarch_index;
	uint32_t uarch_index = default_uarch_index;
	#if PTHREADPOOL_USE_CPUINFO
		uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
		if (uarch_index > threadpool->params.parallelize_2d_tile_2d_with_uarch.max_uarch_index) {
			uarch_index = default_uarch_index;
		}
	#endif

	/* Process thread's own range of items */
	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_range_j;
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_result_size_t index = fxdiv_divide_size_t(range_start, tile_range_j);
	const size_t range_i = threadpool->params.parallelize_2d_tile_2d_with_uarch.range_i;
	const size_t tile_i = threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_i;
	const size_t range_j = threadpool->params.parallelize_2d_tile_2d_with_uarch.range_j;
	const size_t tile_j = threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_j;
	size_t start_i = index.quotient * tile_i;
	size_t start_j = index.remainder * tile_j;

	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, uarch_index, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
		start_j += tile_j;
		if (start_j >= range_j) {
			start_j = 0;
			start_i += tile_i;
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(linear_index, tile_range_j);
			const size_t start_i = tile_index_i_j.quotient * tile_i;
			const size_t start_j = tile_index_i_j.remainder * tile_j;
			task(argument, uarch_index, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

static void thread_parallelize_3d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_3d_t task = (pthreadpool_task_3d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_3d.range_k;
	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(range_start, range_k);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_3d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_ij_k.remainder;

	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, k);
		if (++k == range_k.value) {
			k = 0;
			if (++j == range_j.value) {
				j = 0;
				i += 1;
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(linear_index, range_k);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

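/*
 * 3D flattening nests the 2D case: linear_index = (i * range_j + j) * range_k + k.
 * Dividing by range_k first peels off k (remainder) and leaves the flattened
 * ij index (quotient); dividing that by range_j splits it into i and j. For
 * example, with range_j = 3 and range_k = 4, linear_index = 23 gives ij = 5
 * and k = 3, and then i = 1, j = 2.
 */
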
static void thread_parallelize_3d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_3d_tile_1d_t task = (pthreadpool_task_3d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_k = threadpool->params.parallelize_3d_tile_1d.tile_range_k;
	const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_3d_tile_1d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j);
	const size_t tile_k = threadpool->params.parallelize_3d_tile_1d.tile_k;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t start_k = tile_index_ij_k.remainder * tile_k;

	const size_t range_k = threadpool->params.parallelize_3d_tile_1d.range_k;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, start_k, min(range_k - start_k, tile_k));
		start_k += tile_k;
		if (start_k >= range_k) {
			start_k = 0;
			if (++j == range_j.value) {
				j = 0;
				i += 1;
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j);
			const size_t start_k = tile_index_ij_k.remainder * tile_k;
			task(argument, index_i_j.quotient, index_i_j.remainder, start_k, min(range_k - start_k, tile_k));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

static void thread_parallelize_3d_tile_2d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_3d_tile_2d_t task = (pthreadpool_task_3d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_k = threadpool->params.parallelize_3d_tile_2d.tile_range_k;
	const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k);
	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_3d_tile_2d.tile_range_j;
	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
	const size_t tile_j = threadpool->params.parallelize_3d_tile_2d.tile_j;
	const size_t tile_k = threadpool->params.parallelize_3d_tile_2d.tile_k;
	size_t i = tile_index_i_j.quotient;
	size_t start_j = tile_index_i_j.remainder * tile_j;
	size_t start_k = tile_index_ij_k.remainder * tile_k;

	const size_t range_k = threadpool->params.parallelize_3d_tile_2d.range_k;
	const size_t range_j = threadpool->params.parallelize_3d_tile_2d.range_j;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
		start_k += tile_k;
		if (start_k >= range_k) {
			start_k = 0;
			start_j += tile_j;
			if (start_j >= range_j) {
				start_j = 0;
				i += 1;
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k);
			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
			const size_t start_j = tile_index_i_j.remainder * tile_j;
			const size_t start_k = tile_index_ij_k.remainder * tile_k;
			task(argument, tile_index_i_j.quotient, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

static void thread_parallelize_3d_tile_2d_with_uarch(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_3d_tile_2d_with_id_t task = (pthreadpool_task_3d_tile_2d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const uint32_t default_uarch_index = threadpool->params.parallelize_3d_tile_2d_with_uarch.default_uarch_index;
	uint32_t uarch_index = default_uarch_index;
	#if PTHREADPOOL_USE_CPUINFO
		uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
		if (uarch_index > threadpool->params.parallelize_3d_tile_2d_with_uarch.max_uarch_index) {
			uarch_index = default_uarch_index;
		}
	#endif

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_k = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_range_k;
	const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k);
	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_range_j;
	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
	const size_t tile_j = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_j;
	const size_t tile_k = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_k;
	size_t i = tile_index_i_j.quotient;
	size_t start_j = tile_index_i_j.remainder * tile_j;
	size_t start_k = tile_index_ij_k.remainder * tile_k;

	const size_t range_k = threadpool->params.parallelize_3d_tile_2d_with_uarch.range_k;
	const size_t range_j = threadpool->params.parallelize_3d_tile_2d_with_uarch.range_j;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, uarch_index, i, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
		start_k += tile_k;
		if (start_k >= range_k) {
			start_k = 0;
			start_j += tile_j;
			if (start_j >= range_j) {
				start_j = 0;
				i += 1;
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k);
			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
			const size_t start_j = tile_index_i_j.remainder * tile_j;
			const size_t start_k = tile_index_ij_k.remainder * tile_k;
			task(argument, uarch_index, tile_index_i_j.quotient, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

static void thread_parallelize_4d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_4d_t task = (pthreadpool_task_4d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t range_kl = threadpool->params.parallelize_4d.range_kl;
	const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(range_start, range_kl);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t range_l = threadpool->params.parallelize_4d.range_l;
	const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_k_l.quotient;
	size_t l = index_k_l.remainder;

	const size_t range_k = threadpool->params.parallelize_4d.range_k;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, k, l);
		if (++l == range_l.value) {
			l = 0;
			if (++k == range_k) {
				k = 0;
				if (++j == range_j.value) {
					j = 0;
					i += 1;
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(linear_index, range_kl);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
			task(argument, index_i_j.quotient, index_i_j.remainder, index_k_l.quotient, index_k_l.remainder);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

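/*
 * For the 4D case the two innermost extents are pre-multiplied into one
 * fused divisor (range_kl covers range_k * range_l): dividing a linear index
 * by range_kl separates the flattened ij part (quotient) from the flattened
 * kl part (remainder), and each part is then split by a single further
 * division, by range_j and range_l respectively. range_k itself only
 * appears as a comparison bound in the mixed-radix increment chain, so it is
 * kept as a plain size_t rather than an fxdiv divisor.
 */
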
static void thread_parallelize_4d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_4d_tile_1d_t task = (pthreadpool_task_4d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_kl = threadpool->params.parallelize_4d_tile_1d.tile_range_kl;
	const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(range_start, tile_range_kl);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d_tile_1d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t tile_range_l = threadpool->params.parallelize_4d_tile_1d.tile_range_l;
	const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
	const size_t tile_l = threadpool->params.parallelize_4d_tile_1d.tile_l;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = tile_index_k_l.quotient;
	size_t start_l = tile_index_k_l.remainder * tile_l;

	const size_t range_k = threadpool->params.parallelize_4d_tile_1d.range_k;
	const size_t range_l = threadpool->params.parallelize_4d_tile_1d.range_l;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, k, start_l, min(range_l - start_l, tile_l));
		start_l += tile_l;
		if (start_l >= range_l) {
			start_l = 0;
			if (++k == range_k) {
				k = 0;
				if (++j == range_j.value) {
					j = 0;
					i += 1;
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
			const size_t start_l = tile_index_k_l.remainder * tile_l;
			task(argument, index_i_j.quotient, index_i_j.remainder, tile_index_k_l.quotient, start_l, min(range_l - start_l, tile_l));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

static void thread_parallelize_4d_tile_2d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_4d_tile_2d_t task = (pthreadpool_task_4d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_kl = threadpool->params.parallelize_4d_tile_2d.tile_range_kl;
	const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(range_start, tile_range_kl);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d_tile_2d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t tile_range_l = threadpool->params.parallelize_4d_tile_2d.tile_range_l;
	const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
	const size_t tile_k = threadpool->params.parallelize_4d_tile_2d.tile_k;
	const size_t tile_l = threadpool->params.parallelize_4d_tile_2d.tile_l;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t start_k = tile_index_k_l.quotient * tile_k;
	size_t start_l = tile_index_k_l.remainder * tile_l;

	const size_t range_l = threadpool->params.parallelize_4d_tile_2d.range_l;
	const size_t range_k = threadpool->params.parallelize_4d_tile_2d.range_k;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
		start_l += tile_l;
		if (start_l >= range_l) {
			start_l = 0;
			start_k += tile_k;
			if (start_k >= range_k) {
				start_k = 0;
				if (++j == range_j.value) {
					j = 0;
					i += 1;
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
			const size_t start_k = tile_index_k_l.quotient * tile_k;
			const size_t start_l = tile_index_k_l.remainder * tile_l;
			task(argument, index_i_j.quotient, index_i_j.remainder, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

static void thread_parallelize_4d_tile_2d_with_uarch(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_4d_tile_2d_with_id_t task = (pthreadpool_task_4d_tile_2d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const uint32_t default_uarch_index = threadpool->params.parallelize_4d_tile_2d_with_uarch.default_uarch_index;
	uint32_t uarch_index = default_uarch_index;
	#if PTHREADPOOL_USE_CPUINFO
		uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
		if (uarch_index > threadpool->params.parallelize_4d_tile_2d_with_uarch.max_uarch_index) {
			uarch_index = default_uarch_index;
		}
	#endif

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_kl = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_range_kl;
	const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(range_start, tile_range_kl);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d_tile_2d_with_uarch.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t tile_range_l = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_range_l;
	const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
	const size_t tile_k = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_k;
	const size_t tile_l = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_l;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t start_k = tile_index_k_l.quotient * tile_k;
	size_t start_l = tile_index_k_l.remainder * tile_l;

	const size_t range_l = threadpool->params.parallelize_4d_tile_2d_with_uarch.range_l;
	const size_t range_k = threadpool->params.parallelize_4d_tile_2d_with_uarch.range_k;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, uarch_index, i, j, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
		start_l += tile_l;
		if (start_l >= range_l) {
			start_l = 0;
			start_k += tile_k;
			if (start_k >= range_k) {
				start_k = 0;
				if (++j == range_j.value) {
					j = 0;
					i += 1;
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
			const size_t start_k = tile_index_k_l.quotient * tile_k;
			const size_t start_l = tile_index_k_l.remainder * tile_l;
			task(argument, uarch_index, index_i_j.quotient, index_i_j.remainder, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

static void thread_parallelize_5d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_5d_t task = (pthreadpool_task_5d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t range_lm = threadpool->params.parallelize_5d.range_lm;
	const struct fxdiv_result_size_t index_ijk_lm = fxdiv_divide_size_t(range_start, range_lm);
	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_5d.range_k;
	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lm.quotient, range_k);
	const struct fxdiv_divisor_size_t range_m = threadpool->params.parallelize_5d.range_m;
	const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_ijk_lm.remainder, range_m);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_5d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_ij_k.remainder;
	size_t l = index_l_m.quotient;
	size_t m = index_l_m.remainder;

	const size_t range_l = threadpool->params.parallelize_5d.range_l;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, k, l, m);
		if (++m == range_m.value) {
			m = 0;
			if (++l == range_l) {
				l = 0;
				if (++k == range_k.value) {
					k = 0;
					if (++j == range_j.value) {
						j = 0;
						i += 1;
					}
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t index_ijk_lm = fxdiv_divide_size_t(linear_index, range_lm);
			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lm.quotient, range_k);
			const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_ijk_lm.remainder, range_m);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

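/*
 * The same pattern generalizes to 5D and 6D: only extents that serve as
 * division radices (here range_j, range_k, range_m and the fused range_lm)
 * are stored as fxdiv divisors, while range_l, used purely as a comparison
 * bound, stays a plain size_t. After each task call the indices advance by a
 * mixed-radix +1 over (i, j, k, l, m), so a thread walking its own
 * contiguous slice divides only once, to locate its starting point; fxdiv is
 * otherwise needed only to decompose linear indices stolen from other
 * threads.
 */
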
static void thread_parallelize_5d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_5d_tile_1d_t task = (pthreadpool_task_5d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_m = threadpool->params.parallelize_5d_tile_1d.tile_range_m;
	const struct fxdiv_result_size_t tile_index_ijkl_m = fxdiv_divide_size_t(range_start, tile_range_m);
	const struct fxdiv_divisor_size_t range_kl = threadpool->params.parallelize_5d_tile_1d.range_kl;
	const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_m.quotient, range_kl);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_5d_tile_1d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t range_l = threadpool->params.parallelize_5d_tile_1d.range_l;
	const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
	const size_t tile_m = threadpool->params.parallelize_5d_tile_1d.tile_m;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_k_l.quotient;
	size_t l = index_k_l.remainder;
	size_t start_m = tile_index_ijkl_m.remainder * tile_m;

	const size_t range_m = threadpool->params.parallelize_5d_tile_1d.range_m;
	const size_t range_k = threadpool->params.parallelize_5d_tile_1d.range_k;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, k, l, start_m, min(range_m - start_m, tile_m));
		start_m += tile_m;
		if (start_m >= range_m) {
			start_m = 0;
			if (++l == range_l.value) {
				l = 0;
				if (++k == range_k) {
					k = 0;
					if (++j == range_j.value) {
						j = 0;
						i += 1;
					}
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ijkl_m = fxdiv_divide_size_t(linear_index, tile_range_m);
			const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_m.quotient, range_kl);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
			size_t start_m = tile_index_ijkl_m.remainder * tile_m;
			task(argument, index_i_j.quotient, index_i_j.remainder, index_k_l.quotient, index_k_l.remainder, start_m,
				min(range_m - start_m, tile_m));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

static void thread_parallelize_5d_tile_2d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_5d_tile_2d_t task = (pthreadpool_task_5d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_lm = threadpool->params.parallelize_5d_tile_2d.tile_range_lm;
	const struct fxdiv_result_size_t tile_index_ijk_lm = fxdiv_divide_size_t(range_start, tile_range_lm);
	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_5d_tile_2d.range_k;
	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lm.quotient, range_k);
	const struct fxdiv_divisor_size_t tile_range_m = threadpool->params.parallelize_5d_tile_2d.tile_range_m;
	const struct fxdiv_result_size_t tile_index_l_m = fxdiv_divide_size_t(tile_index_ijk_lm.remainder, tile_range_m);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_5d_tile_2d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
	const size_t tile_l = threadpool->params.parallelize_5d_tile_2d.tile_l;
	const size_t tile_m = threadpool->params.parallelize_5d_tile_2d.tile_m;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_ij_k.remainder;
	size_t start_l = tile_index_l_m.quotient * tile_l;
	size_t start_m = tile_index_l_m.remainder * tile_m;

	const size_t range_m = threadpool->params.parallelize_5d_tile_2d.range_m;
	const size_t range_l = threadpool->params.parallelize_5d_tile_2d.range_l;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, k, start_l, start_m, min(range_l - start_l, tile_l), min(range_m - start_m, tile_m));
		start_m += tile_m;
		if (start_m >= range_m) {
			start_m = 0;
			start_l += tile_l;
			if (start_l >= range_l) {
				start_l = 0;
				if (++k == range_k.value) {
					k = 0;
					if (++j == range_j.value) {
						j = 0;
						i += 1;
					}
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ijk_lm = fxdiv_divide_size_t(linear_index, tile_range_lm);
			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lm.quotient, range_k);
			const struct fxdiv_result_size_t tile_index_l_m = fxdiv_divide_size_t(tile_index_ijk_lm.remainder, tile_range_m);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
			const size_t start_l = tile_index_l_m.quotient * tile_l;
			const size_t start_m = tile_index_l_m.remainder * tile_m;
			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder,
				start_l, start_m, min(range_l - start_l, tile_l), min(range_m - start_m, tile_m));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

static void thread_parallelize_6d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_6d_t task = (pthreadpool_task_6d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t range_lmn = threadpool->params.parallelize_6d.range_lmn;
	const struct fxdiv_result_size_t index_ijk_lmn = fxdiv_divide_size_t(range_start, range_lmn);
	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_6d.range_k;
	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lmn.quotient, range_k);
	const struct fxdiv_divisor_size_t range_n = threadpool->params.parallelize_6d.range_n;
	const struct fxdiv_result_size_t index_lm_n = fxdiv_divide_size_t(index_ijk_lmn.remainder, range_n);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_6d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
	const struct fxdiv_divisor_size_t range_m = threadpool->params.parallelize_6d.range_m;
	const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_lm_n.quotient, range_m);
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_ij_k.remainder;
	size_t l = index_l_m.quotient;
	size_t m = index_l_m.remainder;
	size_t n = index_lm_n.remainder;

	const size_t range_l = threadpool->params.parallelize_6d.range_l;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, k, l, m, n);
		if (++n == range_n.value) {
			n = 0;
			if (++m == range_m.value) {
				m = 0;
				if (++l == range_l) {
					l = 0;
					if (++k == range_k.value) {
						k = 0;
						if (++j == range_j.value) {
							j = 0;
							i += 1;
						}
					}
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t index_ijk_lmn = fxdiv_divide_size_t(linear_index, range_lmn);
			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lmn.quotient, range_k);
			const struct fxdiv_result_size_t index_lm_n = fxdiv_divide_size_t(index_ijk_lmn.remainder, range_n);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
			const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_lm_n.quotient, range_m);
			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder, index_lm_n.remainder);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

1076static void thread_parallelize_6d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) {
1077 assert(threadpool != NULL);
1078 assert(thread != NULL);
1079
1080 const pthreadpool_task_6d_tile_1d_t task = (pthreadpool_task_6d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
1081 void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
1082
1083 /* Process thread's own range of items */
1084 const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
1085 const struct fxdiv_divisor_size_t tile_range_lmn = threadpool->params.parallelize_6d_tile_1d.tile_range_lmn;
1086 const struct fxdiv_result_size_t tile_index_ijk_lmn = fxdiv_divide_size_t(range_start, tile_range_lmn);
1087 const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_6d_tile_1d.range_k;
1088 const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lmn.quotient, range_k);
1089 const struct fxdiv_divisor_size_t tile_range_n = threadpool->params.parallelize_6d_tile_1d.tile_range_n;
1090 const struct fxdiv_result_size_t tile_index_lm_n = fxdiv_divide_size_t(tile_index_ijk_lmn.remainder, tile_range_n);
1091 const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_6d_tile_1d.range_j;
1092 const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
1093 const struct fxdiv_divisor_size_t range_m = threadpool->params.parallelize_6d_tile_1d.range_m;
1094 const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(tile_index_lm_n.quotient, range_m);
1095 const size_t tile_n = threadpool->params.parallelize_6d_tile_1d.tile_n;
1096 size_t i = index_i_j.quotient;
1097 size_t j = index_i_j.remainder;
1098 size_t k = index_ij_k.remainder;
1099 size_t l = index_l_m.quotient;
1100 size_t m = index_l_m.remainder;
1101 size_t start_n = tile_index_lm_n.remainder * tile_n;
1102
1103 const size_t range_n = threadpool->params.parallelize_6d_tile_1d.range_n;
1104 const size_t range_l = threadpool->params.parallelize_6d_tile_1d.range_l;
1105 while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
1106 task(argument, i, j, k, l, m, start_n, min(range_n - start_n, tile_n));
1107 start_n += tile_n;
1108 if (start_n >= range_n) {
1109 start_n = 0;
1110 if (++m == range_m.value) {
1111 m = 0;
1112 if (++l == range_l) {
1113 l = 0;
1114 if (++k == range_k.value) {
1115 k = 0;
1116 if (++j == range_j.value) {
1117 j = 0;
1118 i += 1;
1119 }
1120 }
1121 }
1122 }
1123 }
1124 }
1125
1126
1127 /* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ijk_lmn = fxdiv_divide_size_t(linear_index, tile_range_lmn);
			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lmn.quotient, range_k);
			const struct fxdiv_result_size_t tile_index_lm_n = fxdiv_divide_size_t(tile_index_ijk_lmn.remainder, tile_range_n);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
			const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(tile_index_lm_n.quotient, range_m);
			const size_t start_n = tile_index_lm_n.remainder * tile_n;
			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder,
				start_n, min(range_n - start_n, tile_n));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
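
/*
 * Worked example of the stolen-index decomposition above (illustrative
 * only, with made-up ranges): let range_i = range_j = range_k = range_l =
 * range_m = 2, range_n = 5, tile_n = 2. Then tile_range_n =
 * divide_round_up(5, 2) = 3 and tile_range_lmn = 2 * 2 * 3 = 12.
 * Decoding the stolen linear index 29:
 *
 *   29 / 12 = 2 rem 5   (tile_index_ijk_lmn)
 *    2 /  2 = 1 rem 0   (index_ij_k: k = 0)
 *    5 /  3 = 1 rem 2   (tile_index_lm_n: n-tile 2)
 *    1 /  2 = 0 rem 1   (index_i_j: i = 0, j = 1)
 *    1 /  2 = 0 rem 1   (index_l_m: l = 0, m = 1)
 *
 * so start_n = 2 * tile_n = 4 and the call is
 * task(argument, 0, 1, 0, 0, 1, 4, min(5 - 4, 2) = 1).
 */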

static void thread_parallelize_6d_tile_2d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_6d_tile_2d_t task = (pthreadpool_task_6d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_mn = threadpool->params.parallelize_6d_tile_2d.tile_range_mn;
	const struct fxdiv_result_size_t tile_index_ijkl_mn = fxdiv_divide_size_t(range_start, tile_range_mn);
	const struct fxdiv_divisor_size_t range_kl = threadpool->params.parallelize_6d_tile_2d.range_kl;
	const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_mn.quotient, range_kl);
	const struct fxdiv_divisor_size_t tile_range_n = threadpool->params.parallelize_6d_tile_2d.tile_range_n;
	const struct fxdiv_result_size_t tile_index_m_n = fxdiv_divide_size_t(tile_index_ijkl_mn.remainder, tile_range_n);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_6d_tile_2d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t range_l = threadpool->params.parallelize_6d_tile_2d.range_l;
	const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
	const size_t tile_m = threadpool->params.parallelize_6d_tile_2d.tile_m;
	const size_t tile_n = threadpool->params.parallelize_6d_tile_2d.tile_n;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_k_l.quotient;
	size_t l = index_k_l.remainder;
	size_t start_m = tile_index_m_n.quotient * tile_m;
	size_t start_n = tile_index_m_n.remainder * tile_n;

	const size_t range_n = threadpool->params.parallelize_6d_tile_2d.range_n;
	const size_t range_m = threadpool->params.parallelize_6d_tile_2d.range_m;
	const size_t range_k = threadpool->params.parallelize_6d_tile_2d.range_k;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, k, l, start_m, start_n, min(range_m - start_m, tile_m), min(range_n - start_n, tile_n));
		start_n += tile_n;
		if (start_n >= range_n) {
			start_n = 0;
			start_m += tile_m;
			if (start_m >= range_m) {
				start_m = 0;
				if (++l == range_l.value) {
					l = 0;
					if (++k == range_k) {
						k = 0;
						if (++j == range_j.value) {
							j = 0;
							i += 1;
						}
					}
				}
			}
		}
	}

	/* There may still be other threads with work to steal */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ijkl_mn = fxdiv_divide_size_t(linear_index, tile_range_mn);
			const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_mn.quotient, range_kl);
			const struct fxdiv_result_size_t tile_index_m_n = fxdiv_divide_size_t(tile_index_ijkl_mn.remainder, tile_range_n);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
			const size_t start_m = tile_index_m_n.quotient * tile_m;
			const size_t start_n = tile_index_m_n.remainder * tile_n;
			task(argument, index_i_j.quotient, index_i_j.remainder, index_k_l.quotient, index_k_l.remainder,
				start_m, start_n, min(range_m - start_m, tile_m), min(range_n - start_n, tile_n));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
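
/*
 * Note the asymmetry shared by all thread_parallelize_* functions above: a
 * thread walks its own range with carry-propagating counters (one add and
 * a few compares per item), but every stolen item is decoded from its
 * linear index from scratch, because steals land at arbitrary positions in
 * another thread's range.
 */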

void pthreadpool_parallelize_1d(
	pthreadpool_t threadpool,
	pthreadpool_task_1d_t task,
	void* argument,
	size_t range,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || range <= 1) {
		/* No thread pool used: execute task sequentially on the calling thread */
		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range; i++) {
			task(argument, i);
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		thread_function_t parallelize_1d = &thread_parallelize_1d;
		#if PTHREADPOOL_USE_FASTPATH
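			/*
			 * -threads_count wraps around to SIZE_MAX - threads_count + 1, so
			 * this guard keeps range + threads_count representable in size_t,
			 * which the fast path relies on. The same threshold recurs in
			 * every function below.
			 */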
			const size_t range_threshold = -threads_count;
			if (range < range_threshold) {
				parallelize_1d = &pthreadpool_thread_parallelize_1d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_1d, NULL, 0,
			(void*) task, argument, range, flags);
	}
}
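
/*
 * Usage sketch (illustrative only, not part of the library): square each
 * element of an array in parallel. The array type and helper names are
 * hypothetical; the pthreadpool calls are the public API above.
 *
 *   #include <pthreadpool.h>
 *
 *   struct array { float* data; };
 *
 *   static void square_item(void* context, size_t i) {
 *       float* data = ((struct array*) context)->data;
 *       data[i] *= data[i];
 *   }
 *
 *   void square_all(pthreadpool_t pool, struct array* array, size_t size) {
 *       // pool may be NULL: the task then runs on the calling thread
 *       pthreadpool_parallelize_1d(pool, square_item, array, size, 0);
 *   }
 */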

void pthreadpool_parallelize_1d_with_uarch(
	pthreadpool_t threadpool,
	pthreadpool_task_1d_with_id_t task,
	void* argument,
	uint32_t default_uarch_index,
	uint32_t max_uarch_index,
	size_t range,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || range <= 1) {
		/* No thread pool used: execute task sequentially on the calling thread */

		uint32_t uarch_index = default_uarch_index;
		#if PTHREADPOOL_USE_CPUINFO
			uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
			if (uarch_index > max_uarch_index) {
				uarch_index = default_uarch_index;
			}
		#endif

		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range; i++) {
			task(argument, uarch_index, i);
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const struct pthreadpool_1d_with_uarch_params params = {
			.default_uarch_index = default_uarch_index,
			.max_uarch_index = max_uarch_index,
		};
		thread_function_t parallelize_1d_with_uarch = &thread_parallelize_1d_with_uarch;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (range < range_threshold) {
				parallelize_1d_with_uarch = &pthreadpool_thread_parallelize_1d_with_uarch_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_1d_with_uarch, &params, sizeof(params),
			(void*) task, argument, range, flags);
	}
}
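
/*
 * Usage sketch for the _with_uarch variant (illustrative only): the extra
 * uint32_t passed to the task identifies the microarchitecture of the core
 * the item runs on, so per-core kernel selection needs no runtime query in
 * the hot loop. All names below except the pthreadpool call are
 * hypothetical.
 *
 *   static void (*const kernels[])(void*, size_t) = {
 *       kernel_generic,   // uarch index 0 (default)
 *       kernel_big_core,  // uarch index 1
 *   };
 *
 *   static void dispatch_item(void* context, uint32_t uarch, size_t i) {
 *       kernels[uarch](context, i);
 *   }
 *
 *   // default_uarch_index = 0, max_uarch_index = 1: any reported index
 *   // above 1 falls back to the default (see the clamping logic above).
 *   pthreadpool_parallelize_1d_with_uarch(
 *       pool, dispatch_item, context, 0, 1, size, 0);
 */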

void pthreadpool_parallelize_1d_tile_1d(
	pthreadpool_t threadpool,
	pthreadpool_task_1d_tile_1d_t task,
	void* argument,
	size_t range,
	size_t tile,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || range <= tile) {
		/* No thread pool used: execute task sequentially on the calling thread */
		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range; i += tile) {
			task(argument, i, min(range - i, tile));
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const size_t tile_range = divide_round_up(range, tile);
		const struct pthreadpool_1d_tile_1d_params params = {
			.range = range,
			.tile = tile,
		};
		thread_function_t parallelize_1d_tile_1d = &thread_parallelize_1d_tile_1d;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (range < range_threshold) {
				parallelize_1d_tile_1d = &pthreadpool_thread_parallelize_1d_tile_1d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_1d_tile_1d, &params, sizeof(params),
			(void*) task, argument, tile_range, flags);
	}
}
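
/*
 * Usage sketch for the tiled variant (illustrative only): the task
 * receives the tile start and the tile size, which equals tile except
 * possibly for the last, partial tile (min(range - i, tile) above). For
 * range = 10 and tile = 4 the calls are (0, 4), (4, 4), (8, 2).
 *
 *   static void process_block(void* context, size_t start, size_t count) {
 *       // process elements [start, start + count)
 *   }
 *
 *   pthreadpool_parallelize_1d_tile_1d(pool, process_block, context, 10, 4, 0);
 */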

void pthreadpool_parallelize_2d(
	pthreadpool_t threadpool,
	pthreadpool_task_2d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i | range_j) <= 1) {
		/* No thread pool used: execute task sequentially on the calling thread */
		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				task(argument, i, j);
			}
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const size_t range = range_i * range_j;
		const struct pthreadpool_2d_params params = {
			.range_j = fxdiv_init_size_t(range_j),
		};
		thread_function_t parallelize_2d = &thread_parallelize_2d;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (range < range_threshold) {
				parallelize_2d = &pthreadpool_thread_parallelize_2d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_2d, &params, sizeof(params),
			(void*) task, argument, range, flags);
	}
}
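
/*
 * The 2-d space is flattened to a single linear range: index = i * range_j
 * + j, with range = range_i * range_j items in total. Worker threads
 * invert this with one fxdiv division, e.g. for range_j = 3 the linear
 * index 7 decodes as 7 / 3 = 2 rem 1, i.e. task(argument, 2, 1). The same
 * scheme, with one extra division per extra dimension, underlies every
 * function below. Note that (range_i | range_j) <= 1 in the guard above is
 * a branch-free way of testing that both ranges are at most 1.
 */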

void pthreadpool_parallelize_2d_tile_1d(
	pthreadpool_t threadpool,
	pthreadpool_task_2d_tile_1d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t tile_j,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i <= 1 && range_j <= tile_j)) {
		/* No thread pool used: execute task sequentially on the calling thread */
		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j += tile_j) {
				task(argument, i, j, min(range_j - j, tile_j));
			}
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const size_t tile_range_j = divide_round_up(range_j, tile_j);
		const size_t tile_range = range_i * tile_range_j;
		const struct pthreadpool_2d_tile_1d_params params = {
			.range_j = range_j,
			.tile_j = tile_j,
			.tile_range_j = fxdiv_init_size_t(tile_range_j),
		};
		thread_function_t parallelize_2d_tile_1d = &thread_parallelize_2d_tile_1d;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (tile_range < range_threshold) {
				parallelize_2d_tile_1d = &pthreadpool_thread_parallelize_2d_tile_1d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_2d_tile_1d, &params, sizeof(params),
			(void*) task, argument, tile_range, flags);
	}
}

void pthreadpool_parallelize_2d_tile_2d(
	pthreadpool_t threadpool,
	pthreadpool_task_2d_tile_2d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t tile_i,
	size_t tile_j,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i <= tile_i && range_j <= tile_j)) {
		/* No thread pool used: execute task sequentially on the calling thread */
		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i += tile_i) {
			for (size_t j = 0; j < range_j; j += tile_j) {
				task(argument, i, j, min(range_i - i, tile_i), min(range_j - j, tile_j));
			}
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const size_t tile_range_i = divide_round_up(range_i, tile_i);
		const size_t tile_range_j = divide_round_up(range_j, tile_j);
		const size_t tile_range = tile_range_i * tile_range_j;
		const struct pthreadpool_2d_tile_2d_params params = {
			.range_i = range_i,
			.tile_i = tile_i,
			.range_j = range_j,
			.tile_j = tile_j,
			.tile_range_j = fxdiv_init_size_t(tile_range_j),
		};
		thread_function_t parallelize_2d_tile_2d = &thread_parallelize_2d_tile_2d;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (tile_range < range_threshold) {
				parallelize_2d_tile_2d = &pthreadpool_thread_parallelize_2d_tile_2d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_2d_tile_2d, &params, sizeof(params),
			(void*) task, argument, tile_range, flags);
	}
}
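
/*
 * Worked example of the tile grid (illustrative only): for a 100 x 50
 * matrix processed in 32 x 32 tiles, tile_range_i = divide_round_up(100,
 * 32) = 4 and tile_range_j = divide_round_up(50, 32) = 2, so the linear
 * range is 4 * 2 = 8 tile units. Edge tiles shrink: the unit covering
 * (96, 32) is passed tile sizes min(100 - 96, 32) = 4 and
 * min(50 - 32, 32) = 18.
 */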

void pthreadpool_parallelize_2d_tile_2d_with_uarch(
	pthreadpool_t threadpool,
	pthreadpool_task_2d_tile_2d_with_id_t task,
	void* argument,
	uint32_t default_uarch_index,
	uint32_t max_uarch_index,
	size_t range_i,
	size_t range_j,
	size_t tile_i,
	size_t tile_j,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i <= tile_i && range_j <= tile_j)) {
		/* No thread pool used: execute task sequentially on the calling thread */

		uint32_t uarch_index = default_uarch_index;
		#if PTHREADPOOL_USE_CPUINFO
			uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
			if (uarch_index > max_uarch_index) {
				uarch_index = default_uarch_index;
			}
		#endif

		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i += tile_i) {
			for (size_t j = 0; j < range_j; j += tile_j) {
				task(argument, uarch_index, i, j, min(range_i - i, tile_i), min(range_j - j, tile_j));
			}
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const size_t tile_range_i = divide_round_up(range_i, tile_i);
		const size_t tile_range_j = divide_round_up(range_j, tile_j);
		const size_t tile_range = tile_range_i * tile_range_j;
		const struct pthreadpool_2d_tile_2d_with_uarch_params params = {
			.default_uarch_index = default_uarch_index,
			.max_uarch_index = max_uarch_index,
			.range_i = range_i,
			.tile_i = tile_i,
			.range_j = range_j,
			.tile_j = tile_j,
			.tile_range_j = fxdiv_init_size_t(tile_range_j),
		};
		thread_function_t parallelize_2d_tile_2d_with_uarch = &thread_parallelize_2d_tile_2d_with_uarch;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (tile_range < range_threshold) {
				parallelize_2d_tile_2d_with_uarch = &pthreadpool_thread_parallelize_2d_tile_2d_with_uarch_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_2d_tile_2d_with_uarch, &params, sizeof(params),
			(void*) task, argument, tile_range, flags);
	}
}

void pthreadpool_parallelize_3d(
	pthreadpool_t threadpool,
	pthreadpool_task_3d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i | range_j | range_k) <= 1) {
		/* No thread pool used: execute task sequentially on the calling thread */
		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				for (size_t k = 0; k < range_k; k++) {
					task(argument, i, j, k);
				}
			}
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const size_t range = range_i * range_j * range_k;
		const struct pthreadpool_3d_params params = {
			.range_j = fxdiv_init_size_t(range_j),
			.range_k = fxdiv_init_size_t(range_k),
		};
		thread_function_t parallelize_3d = &thread_parallelize_3d;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (range < range_threshold) {
				parallelize_3d = &pthreadpool_thread_parallelize_3d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_3d, &params, sizeof(params),
			(void*) task, argument, range, flags);
	}
}
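
/*
 * For three dimensions the linear index is (i * range_j + j) * range_k + k,
 * and workers decode it with two fxdiv divisions: dividing by range_k
 * yields k as the remainder, and dividing the quotient by range_j yields i
 * and j. Only the inner ranges are stored as fxdiv divisors in params;
 * range_i is never divided by, so it is not needed there.
 */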

void pthreadpool_parallelize_3d_tile_1d(
	pthreadpool_t threadpool,
	pthreadpool_task_3d_tile_1d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t tile_k,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j) <= 1 && range_k <= tile_k)) {
		/* No thread pool used: execute task sequentially on the calling thread */
		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				for (size_t k = 0; k < range_k; k += tile_k) {
					task(argument, i, j, k, min(range_k - k, tile_k));
				}
			}
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const size_t tile_range_k = divide_round_up(range_k, tile_k);
		const size_t tile_range = range_i * range_j * tile_range_k;
		const struct pthreadpool_3d_tile_1d_params params = {
			.range_k = range_k,
			.tile_k = tile_k,
			.range_j = fxdiv_init_size_t(range_j),
			.tile_range_k = fxdiv_init_size_t(tile_range_k),
		};
		thread_function_t parallelize_3d_tile_1d = &thread_parallelize_3d_tile_1d;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (tile_range < range_threshold) {
				parallelize_3d_tile_1d = &pthreadpool_thread_parallelize_3d_tile_1d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_3d_tile_1d, &params, sizeof(params),
			(void*) task, argument, tile_range, flags);
	}
}

void pthreadpool_parallelize_3d_tile_2d(
	pthreadpool_t threadpool,
	pthreadpool_task_3d_tile_2d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t tile_j,
	size_t tile_k,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i <= 1 && range_j <= tile_j && range_k <= tile_k)) {
		/* No thread pool used: execute task sequentially on the calling thread */
		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j += tile_j) {
				for (size_t k = 0; k < range_k; k += tile_k) {
					task(argument, i, j, k, min(range_j - j, tile_j), min(range_k - k, tile_k));
				}
			}
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const size_t tile_range_j = divide_round_up(range_j, tile_j);
		const size_t tile_range_k = divide_round_up(range_k, tile_k);
		const size_t tile_range = range_i * tile_range_j * tile_range_k;
		const struct pthreadpool_3d_tile_2d_params params = {
			.range_j = range_j,
			.tile_j = tile_j,
			.range_k = range_k,
			.tile_k = tile_k,
			.tile_range_j = fxdiv_init_size_t(tile_range_j),
			.tile_range_k = fxdiv_init_size_t(tile_range_k),
		};
		thread_function_t parallelize_3d_tile_2d = &thread_parallelize_3d_tile_2d;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (tile_range < range_threshold) {
				parallelize_3d_tile_2d = &pthreadpool_thread_parallelize_3d_tile_2d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_3d_tile_2d, &params, sizeof(params),
			(void*) task, argument, tile_range, flags);
	}
}

void pthreadpool_parallelize_3d_tile_2d_with_uarch(
	pthreadpool_t threadpool,
	pthreadpool_task_3d_tile_2d_with_id_t task,
	void* argument,
	uint32_t default_uarch_index,
	uint32_t max_uarch_index,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t tile_j,
	size_t tile_k,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i <= 1 && range_j <= tile_j && range_k <= tile_k)) {
		/* No thread pool used: execute task sequentially on the calling thread */

		uint32_t uarch_index = default_uarch_index;
		#if PTHREADPOOL_USE_CPUINFO
			uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
			if (uarch_index > max_uarch_index) {
				uarch_index = default_uarch_index;
			}
		#endif

		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j += tile_j) {
				for (size_t k = 0; k < range_k; k += tile_k) {
					task(argument, uarch_index, i, j, k, min(range_j - j, tile_j), min(range_k - k, tile_k));
				}
			}
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const size_t tile_range_j = divide_round_up(range_j, tile_j);
		const size_t tile_range_k = divide_round_up(range_k, tile_k);
		const size_t tile_range = range_i * tile_range_j * tile_range_k;
		const struct pthreadpool_3d_tile_2d_with_uarch_params params = {
			.default_uarch_index = default_uarch_index,
			.max_uarch_index = max_uarch_index,
			.range_j = range_j,
			.tile_j = tile_j,
			.range_k = range_k,
			.tile_k = tile_k,
			.tile_range_j = fxdiv_init_size_t(tile_range_j),
			.tile_range_k = fxdiv_init_size_t(tile_range_k),
		};
		thread_function_t parallelize_3d_tile_2d_with_uarch = &thread_parallelize_3d_tile_2d_with_uarch;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (tile_range < range_threshold) {
				parallelize_3d_tile_2d_with_uarch = &pthreadpool_thread_parallelize_3d_tile_2d_with_uarch_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_3d_tile_2d_with_uarch, &params, sizeof(params),
			(void*) task, argument, tile_range, flags);
	}
}

void pthreadpool_parallelize_4d(
	pthreadpool_t threadpool,
	pthreadpool_task_4d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i | range_j | range_k | range_l) <= 1) {
		/* No thread pool used: execute task sequentially on the calling thread */
		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				for (size_t k = 0; k < range_k; k++) {
					for (size_t l = 0; l < range_l; l++) {
						task(argument, i, j, k, l);
					}
				}
			}
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const size_t range_kl = range_k * range_l;
		const size_t range = range_i * range_j * range_kl;
		const struct pthreadpool_4d_params params = {
			.range_k = range_k,
			.range_j = fxdiv_init_size_t(range_j),
			.range_kl = fxdiv_init_size_t(range_kl),
			.range_l = fxdiv_init_size_t(range_l),
		};
		thread_function_t parallelize_4d = &thread_parallelize_4d;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (range < range_threshold) {
				parallelize_4d = &pthreadpool_thread_parallelize_4d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_4d, &params, sizeof(params),
			(void*) task, argument, range, flags);
	}
}
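
/*
 * From four dimensions up, an extra fused divisor shortens the decode
 * chain: the linear index is ((i * range_j + j) * range_kl) + (k * range_l
 * + l) with range_kl = range_k * range_l, so a single division by range_kl
 * splits (i, j) from (k, l) and the two halves are then decoded
 * independently. This is why params stores range_kl as an fxdiv divisor
 * alongside range_j and range_l, while range_k is kept only as a plain
 * size_t: it is needed as a loop bound, never as a divisor.
 */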

void pthreadpool_parallelize_4d_tile_1d(
	pthreadpool_t threadpool,
	pthreadpool_task_4d_tile_1d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	size_t tile_l,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j | range_k) <= 1 && range_l <= tile_l)) {
		/* No thread pool used: execute task sequentially on the calling thread */
		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				for (size_t k = 0; k < range_k; k++) {
					for (size_t l = 0; l < range_l; l += tile_l) {
						task(argument, i, j, k, l, min(range_l - l, tile_l));
					}
				}
			}
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const size_t tile_range_l = divide_round_up(range_l, tile_l);
		const size_t tile_range_kl = range_k * tile_range_l;
		const size_t tile_range = range_i * range_j * tile_range_kl;
		const struct pthreadpool_4d_tile_1d_params params = {
			.range_k = range_k,
			.range_l = range_l,
			.tile_l = tile_l,
			.range_j = fxdiv_init_size_t(range_j),
			.tile_range_kl = fxdiv_init_size_t(tile_range_kl),
			.tile_range_l = fxdiv_init_size_t(tile_range_l),
		};
		thread_function_t parallelize_4d_tile_1d = &thread_parallelize_4d_tile_1d;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (tile_range < range_threshold) {
				parallelize_4d_tile_1d = &pthreadpool_thread_parallelize_4d_tile_1d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_4d_tile_1d, &params, sizeof(params),
			(void*) task, argument, tile_range, flags);
	}
}

void pthreadpool_parallelize_4d_tile_2d(
	pthreadpool_t threadpool,
	pthreadpool_task_4d_tile_2d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	size_t tile_k,
	size_t tile_l,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j) <= 1 && range_k <= tile_k && range_l <= tile_l)) {
		/* No thread pool used: execute task sequentially on the calling thread */
		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				for (size_t k = 0; k < range_k; k += tile_k) {
					for (size_t l = 0; l < range_l; l += tile_l) {
						task(argument, i, j, k, l,
							min(range_k - k, tile_k), min(range_l - l, tile_l));
					}
				}
			}
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const size_t tile_range_l = divide_round_up(range_l, tile_l);
		const size_t tile_range_kl = divide_round_up(range_k, tile_k) * tile_range_l;
		const size_t tile_range = range_i * range_j * tile_range_kl;
		const struct pthreadpool_4d_tile_2d_params params = {
			.range_k = range_k,
			.tile_k = tile_k,
			.range_l = range_l,
			.tile_l = tile_l,
			.range_j = fxdiv_init_size_t(range_j),
			.tile_range_kl = fxdiv_init_size_t(tile_range_kl),
			.tile_range_l = fxdiv_init_size_t(tile_range_l),
		};
		thread_function_t parallelize_4d_tile_2d = &thread_parallelize_4d_tile_2d;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (tile_range < range_threshold) {
				parallelize_4d_tile_2d = &pthreadpool_thread_parallelize_4d_tile_2d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_4d_tile_2d, &params, sizeof(params),
			(void*) task, argument, tile_range, flags);
	}
}

void pthreadpool_parallelize_4d_tile_2d_with_uarch(
	pthreadpool_t threadpool,
	pthreadpool_task_4d_tile_2d_with_id_t task,
	void* argument,
	uint32_t default_uarch_index,
	uint32_t max_uarch_index,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	size_t tile_k,
	size_t tile_l,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j) <= 1 && range_k <= tile_k && range_l <= tile_l)) {
		/* No thread pool used: execute task sequentially on the calling thread */

		uint32_t uarch_index = default_uarch_index;
		#if PTHREADPOOL_USE_CPUINFO
			uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
			if (uarch_index > max_uarch_index) {
				uarch_index = default_uarch_index;
			}
		#endif

		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				for (size_t k = 0; k < range_k; k += tile_k) {
					for (size_t l = 0; l < range_l; l += tile_l) {
						task(argument, uarch_index, i, j, k, l,
							min(range_k - k, tile_k), min(range_l - l, tile_l));
					}
				}
			}
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const size_t tile_range_l = divide_round_up(range_l, tile_l);
		const size_t tile_range_kl = divide_round_up(range_k, tile_k) * tile_range_l;
		const size_t tile_range = range_i * range_j * tile_range_kl;
		const struct pthreadpool_4d_tile_2d_with_uarch_params params = {
			.default_uarch_index = default_uarch_index,
			.max_uarch_index = max_uarch_index,
			.range_k = range_k,
			.tile_k = tile_k,
			.range_l = range_l,
			.tile_l = tile_l,
			.range_j = fxdiv_init_size_t(range_j),
			.tile_range_kl = fxdiv_init_size_t(tile_range_kl),
			.tile_range_l = fxdiv_init_size_t(tile_range_l),
		};
		thread_function_t parallelize_4d_tile_2d_with_uarch = &thread_parallelize_4d_tile_2d_with_uarch;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (tile_range < range_threshold) {
				parallelize_4d_tile_2d_with_uarch = &pthreadpool_thread_parallelize_4d_tile_2d_with_uarch_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_4d_tile_2d_with_uarch, &params, sizeof(params),
			(void*) task, argument, tile_range, flags);
	}
}

void pthreadpool_parallelize_5d(
	pthreadpool_t threadpool,
	pthreadpool_task_5d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	size_t range_m,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i | range_j | range_k | range_l | range_m) <= 1) {
		/* No thread pool used: execute task sequentially on the calling thread */
		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				for (size_t k = 0; k < range_k; k++) {
					for (size_t l = 0; l < range_l; l++) {
						for (size_t m = 0; m < range_m; m++) {
							task(argument, i, j, k, l, m);
						}
					}
				}
			}
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const size_t range_lm = range_l * range_m;
		const size_t range = range_i * range_j * range_k * range_lm;
		const struct pthreadpool_5d_params params = {
			.range_l = range_l,
			.range_j = fxdiv_init_size_t(range_j),
			.range_k = fxdiv_init_size_t(range_k),
			.range_lm = fxdiv_init_size_t(range_lm),
			.range_m = fxdiv_init_size_t(range_m),
		};
		thread_function_t parallelize_5d = &thread_parallelize_5d;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (range < range_threshold) {
				parallelize_5d = &pthreadpool_thread_parallelize_5d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_5d, &params, sizeof(params),
			(void*) task, argument, range, flags);
	}
}

void pthreadpool_parallelize_5d_tile_1d(
	pthreadpool_t threadpool,
	pthreadpool_task_5d_tile_1d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	size_t range_m,
	size_t tile_m,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j | range_k | range_l) <= 1 && range_m <= tile_m)) {
		/* No thread pool used: execute task sequentially on the calling thread */
		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				for (size_t k = 0; k < range_k; k++) {
					for (size_t l = 0; l < range_l; l++) {
						for (size_t m = 0; m < range_m; m += tile_m) {
							task(argument, i, j, k, l, m, min(range_m - m, tile_m));
						}
					}
				}
			}
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const size_t tile_range_m = divide_round_up(range_m, tile_m);
		const size_t range_kl = range_k * range_l;
		const size_t tile_range = range_i * range_j * range_kl * tile_range_m;
		const struct pthreadpool_5d_tile_1d_params params = {
			.range_k = range_k,
			.range_m = range_m,
			.tile_m = tile_m,
			.range_j = fxdiv_init_size_t(range_j),
			.range_kl = fxdiv_init_size_t(range_kl),
			.range_l = fxdiv_init_size_t(range_l),
			.tile_range_m = fxdiv_init_size_t(tile_range_m),
		};
		thread_function_t parallelize_5d_tile_1d = &thread_parallelize_5d_tile_1d;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (tile_range < range_threshold) {
				parallelize_5d_tile_1d = &pthreadpool_thread_parallelize_5d_tile_1d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_5d_tile_1d, &params, sizeof(params),
			(void*) task, argument, tile_range, flags);
	}
}

void pthreadpool_parallelize_5d_tile_2d(
	pthreadpool_t threadpool,
	pthreadpool_task_5d_tile_2d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	size_t range_m,
	size_t tile_l,
	size_t tile_m,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j | range_k) <= 1 && range_l <= tile_l && range_m <= tile_m)) {
		/* No thread pool used: execute task sequentially on the calling thread */
		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				for (size_t k = 0; k < range_k; k++) {
					for (size_t l = 0; l < range_l; l += tile_l) {
						for (size_t m = 0; m < range_m; m += tile_m) {
							task(argument, i, j, k, l, m,
								min(range_l - l, tile_l), min(range_m - m, tile_m));
						}
					}
				}
			}
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const size_t tile_range_m = divide_round_up(range_m, tile_m);
		const size_t tile_range_lm = divide_round_up(range_l, tile_l) * tile_range_m;
		const size_t tile_range = range_i * range_j * range_k * tile_range_lm;
		const struct pthreadpool_5d_tile_2d_params params = {
			.range_l = range_l,
			.tile_l = tile_l,
			.range_m = range_m,
			.tile_m = tile_m,
			.range_j = fxdiv_init_size_t(range_j),
			.range_k = fxdiv_init_size_t(range_k),
			.tile_range_lm = fxdiv_init_size_t(tile_range_lm),
			.tile_range_m = fxdiv_init_size_t(tile_range_m),
		};
		thread_function_t parallelize_5d_tile_2d = &thread_parallelize_5d_tile_2d;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (tile_range < range_threshold) {
				parallelize_5d_tile_2d = &pthreadpool_thread_parallelize_5d_tile_2d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_5d_tile_2d, &params, sizeof(params),
			(void*) task, argument, tile_range, flags);
	}
}

void pthreadpool_parallelize_6d(
	pthreadpool_t threadpool,
	pthreadpool_task_6d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	size_t range_m,
	size_t range_n,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i | range_j | range_k | range_l | range_m | range_n) <= 1) {
		/* No thread pool used: execute task sequentially on the calling thread */
		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				for (size_t k = 0; k < range_k; k++) {
					for (size_t l = 0; l < range_l; l++) {
						for (size_t m = 0; m < range_m; m++) {
							for (size_t n = 0; n < range_n; n++) {
								task(argument, i, j, k, l, m, n);
							}
						}
					}
				}
			}
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const size_t range_lmn = range_l * range_m * range_n;
		const size_t range = range_i * range_j * range_k * range_lmn;
		const struct pthreadpool_6d_params params = {
			.range_l = range_l,
			.range_j = fxdiv_init_size_t(range_j),
			.range_k = fxdiv_init_size_t(range_k),
			.range_lmn = fxdiv_init_size_t(range_lmn),
			.range_m = fxdiv_init_size_t(range_m),
			.range_n = fxdiv_init_size_t(range_n),
		};
		thread_function_t parallelize_6d = &thread_parallelize_6d;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (range < range_threshold) {
				parallelize_6d = &pthreadpool_thread_parallelize_6d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_6d, &params, sizeof(params),
			(void*) task, argument, range, flags);
	}
}

void pthreadpool_parallelize_6d_tile_1d(
	pthreadpool_t threadpool,
	pthreadpool_task_6d_tile_1d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	size_t range_m,
	size_t range_n,
	size_t tile_n,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j | range_k | range_l | range_m) <= 1 && range_n <= tile_n)) {
		/* No thread pool used: execute task sequentially on the calling thread */
		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				for (size_t k = 0; k < range_k; k++) {
					for (size_t l = 0; l < range_l; l++) {
						for (size_t m = 0; m < range_m; m++) {
							for (size_t n = 0; n < range_n; n += tile_n) {
								task(argument, i, j, k, l, m, n, min(range_n - n, tile_n));
							}
						}
					}
				}
			}
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const size_t tile_range_n = divide_round_up(range_n, tile_n);
		const size_t tile_range_lmn = range_l * range_m * tile_range_n;
		const size_t tile_range = range_i * range_j * range_k * tile_range_lmn;
		const struct pthreadpool_6d_tile_1d_params params = {
			.range_l = range_l,
			.range_n = range_n,
			.tile_n = tile_n,
			.range_j = fxdiv_init_size_t(range_j),
			.range_k = fxdiv_init_size_t(range_k),
			.tile_range_lmn = fxdiv_init_size_t(tile_range_lmn),
			.range_m = fxdiv_init_size_t(range_m),
			.tile_range_n = fxdiv_init_size_t(tile_range_n),
		};
		thread_function_t parallelize_6d_tile_1d = &thread_parallelize_6d_tile_1d;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (tile_range < range_threshold) {
				parallelize_6d_tile_1d = &pthreadpool_thread_parallelize_6d_tile_1d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_6d_tile_1d, &params, sizeof(params),
			(void*) task, argument, tile_range, flags);
	}
}

void pthreadpool_parallelize_6d_tile_2d(
	pthreadpool_t threadpool,
	pthreadpool_task_6d_tile_2d_t task,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	size_t range_m,
	size_t range_n,
	size_t tile_m,
	size_t tile_n,
	uint32_t flags)
{
	size_t threads_count;
	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j | range_k | range_l) <= 1 && range_m <= tile_m && range_n <= tile_n)) {
		/* No thread pool used: execute task sequentially on the calling thread */
		struct fpu_state saved_fpu_state = { 0 };
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			saved_fpu_state = get_fpu_state();
			disable_fpu_denormals();
		}
		for (size_t i = 0; i < range_i; i++) {
			for (size_t j = 0; j < range_j; j++) {
				for (size_t k = 0; k < range_k; k++) {
					for (size_t l = 0; l < range_l; l++) {
						for (size_t m = 0; m < range_m; m += tile_m) {
							for (size_t n = 0; n < range_n; n += tile_n) {
								task(argument, i, j, k, l, m, n,
									min(range_m - m, tile_m), min(range_n - n, tile_n));
							}
						}
					}
				}
			}
		}
		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
			set_fpu_state(saved_fpu_state);
		}
	} else {
		const size_t range_kl = range_k * range_l;
		const size_t tile_range_n = divide_round_up(range_n, tile_n);
		const size_t tile_range_mn = divide_round_up(range_m, tile_m) * tile_range_n;
		const size_t tile_range = range_i * range_j * range_kl * tile_range_mn;
		const struct pthreadpool_6d_tile_2d_params params = {
			.range_k = range_k,
			.range_m = range_m,
			.tile_m = tile_m,
			.range_n = range_n,
			.tile_n = tile_n,
			.range_j = fxdiv_init_size_t(range_j),
			.range_kl = fxdiv_init_size_t(range_kl),
			.range_l = fxdiv_init_size_t(range_l),
			.tile_range_mn = fxdiv_init_size_t(tile_range_mn),
			.tile_range_n = fxdiv_init_size_t(tile_range_n),
		};
		thread_function_t parallelize_6d_tile_2d = &thread_parallelize_6d_tile_2d;
		#if PTHREADPOOL_USE_FASTPATH
			const size_t range_threshold = -threads_count;
			if (tile_range < range_threshold) {
				parallelize_6d_tile_2d = &pthreadpool_thread_parallelize_6d_tile_2d_fastpath;
			}
		#endif
		pthreadpool_parallelize(
			threadpool, parallelize_6d_tile_2d, &params, sizeof(params),
			(void*) task, argument, tile_range, flags);
	}
}
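
/*
 * Usage sketch for the most general variant (illustrative only; all names
 * except the pthreadpool call are hypothetical): a 6-d loop nest tiled in
 * its two innermost dimensions, as might arise for a batched, grouped
 * convolution. The task receives four untiled indices, the (m, n) tile
 * origin, and the actual tile extents.
 *
 *   static void conv_tile(void* ctx, size_t batch, size_t group,
 *       size_t out_c, size_t in_c, size_t y, size_t x,
 *       size_t tile_y, size_t tile_x) {
 *       // process the tile_y x tile_x block at (y, x)
 *   }
 *
 *   pthreadpool_parallelize_6d_tile_2d(pool, conv_tile, ctx,
 *       batch_size, groups, out_channels, in_channels, height, width,
 *       8, 8, 0);
 */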