1 | /* Standard C headers */ |
2 | #include <assert.h> |
3 | #include <stdbool.h> |
4 | #include <stdint.h> |
5 | #include <stdlib.h> |
6 | #include <string.h> |
7 | |
8 | #if PTHREADPOOL_USE_CPUINFO |
9 | #include <cpuinfo.h> |
10 | #endif |
11 | |
12 | /* Dependencies */ |
13 | #include <fxdiv.h> |
14 | |
15 | /* Public library header */ |
16 | #include <pthreadpool.h> |
17 | |
18 | /* Internal library headers */ |
19 | #include "threadpool-atomics.h" |
20 | #include "threadpool-common.h" |
21 | #include "threadpool-object.h" |
22 | #include "threadpool-utils.h" |
23 | |
24 | |
/*
 * Fast-path worker for a 1D parallel loop: invokes task(argument, i) for the
 * indices assigned to this thread, then steals leftover indices from the
 * ranges of other threads until all work is done.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_1d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	/* Relaxed loads suffice here: task/argument are presumably published with
	 * the required ordering by the dispatch code before workers start
	 * (NOTE(review): confirm against the synchronization in pthreadpool.c). */
	const pthreadpool_task_1d_t task = (pthreadpool_task_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/*
	 * Unsigned negation: range_threshold == SIZE_MAX - threads_count + 1.
	 * range_length counters are decremented without first checking for zero,
	 * so an exhausted counter wraps past zero. Each of the threads_count
	 * threads performs at most one decrement on an already-exhausted counter
	 * before leaving its claim loop, so wrapped values stay in
	 * [range_threshold, SIZE_MAX], and "post-decrement value < range_threshold"
	 * reliably distinguishes a successful claim from an exhausted range.
	 */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* decrement-fetch returns the post-decrement value; the owner consumes
	 * its range from the front (range_start upwards). */
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		task(argument, range_start++);
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/* Visit the other threads in decreasing circular order, starting just
	 * below this thread's own index, and steal their remaining items. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			/* Stealers take items from the back (range_end downwards), so they
			 * do not collide with the owner consuming from the front. */
			const size_t index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			task(argument, index);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
60 | |
/*
 * Fast-path worker for a 1D parallel loop whose task also receives a CPU
 * microarchitecture index: runs task(argument, uarch_index, i) over this
 * thread's range, then steals leftover items from other threads.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_1d_with_uarch_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_1d_with_id_t task = (pthreadpool_task_1d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const uint32_t default_uarch_index = threadpool->params.parallelize_1d_with_uarch.default_uarch_index;
	uint32_t uarch_index = default_uarch_index;
#if PTHREADPOOL_USE_CPUINFO
	/* Query the uarch index of the core this thread currently runs on; fall
	 * back to the default if cpuinfo reports an index above the caller's
	 * declared maximum (presumably a core type the task has no specialization
	 * for -- NOTE(review): confirm intended semantics of max_uarch_index). */
	uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
	if (uarch_index > threadpool->params.parallelize_1d_with_uarch.max_uarch_index) {
		uarch_index = default_uarch_index;
	}
#endif

	const size_t threads_count = threadpool->threads_count.value;
	/* == SIZE_MAX - threads_count + 1 (unsigned negation). range_length
	 * counters wrap past zero when exhausted; at most threads_count stray
	 * decrements keep wrapped values at or above this threshold, so
	 * "post-decrement value < range_threshold" means a successful claim. */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Owner consumes its range from the front (range_start upwards). */
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		task(argument, uarch_index, range_start++);
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/* Steal from other threads in decreasing circular order. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			/* Stealers take items from the back (range_end downwards) to avoid
			 * colliding with the owner consuming from the front. */
			const size_t index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			task(argument, uarch_index, index);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
105 | |
/*
 * Fast-path worker for a tiled 1D parallel loop: each work item is one tile
 * of up to `tile` consecutive indices; invokes task(argument, tile_start,
 * tile_size) over this thread's tiles, then steals tiles from other threads.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_1d_tile_1d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_1d_tile_1d_t task = (pthreadpool_task_1d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/* == SIZE_MAX - threads_count + 1 (unsigned negation). range_length
	 * counters wrap past zero when exhausted; at most threads_count stray
	 * decrements keep wrapped values at or above this threshold, so
	 * "post-decrement value < range_threshold" means a successful claim. */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	/* range_start is a tile index; convert to an element index. */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const size_t tile = threadpool->params.parallelize_1d_tile_1d.tile;
	size_t tile_start = range_start * tile;

	const size_t range = threadpool->params.parallelize_1d_tile_1d.range;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* min(...) clamps the last tile, which may be partial when the total
		 * range is not a multiple of the tile size. */
		task(argument, tile_start, min(range - tile_start, tile));
		tile_start += tile;
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/* Steal tiles from other threads in decreasing circular order; stolen
	 * tiles are taken from the back (range_end downwards) to avoid colliding
	 * with the owner consuming from the front. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t tile_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const size_t tile_start = tile_index * tile;
			task(argument, tile_start, min(range - tile_start, tile));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
147 | |
/*
 * Fast-path worker for a 2D parallel loop: work items are linearized as
 * i * range_j + j; invokes task(argument, i, j) over this thread's range,
 * then steals leftover items from other threads.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_2d_t task = (pthreadpool_task_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/* == SIZE_MAX - threads_count + 1 (unsigned negation). range_length
	 * counters wrap past zero when exhausted; at most threads_count stray
	 * decrements keep wrapped values at or above this threshold, so
	 * "post-decrement value < range_threshold" means a successful claim. */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* fxdiv divides by a precomputed magic-constant divisor, avoiding a
	 * hardware division per item; decompose the linear start index as
	 * (i, j) = (range_start / range_j, range_start % range_j). */
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_2d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(range_start, range_j);
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;

	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		task(argument, i, j);
		/* Advance (i, j) in row-major order, carrying j into i. */
		if (++j == range_j.value) {
			j = 0;
			i += 1;
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/* Steal from other threads in decreasing circular order; stolen items are
	 * taken from the back (range_end downwards) and decomposed per item. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(linear_index, range_j);
			task(argument, index_i_j.quotient, index_i_j.remainder);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
193 | |
/*
 * Fast-path worker for a 2D parallel loop tiled along the j dimension: work
 * items are (i, j-tile) pairs linearized as i * tile_range_j + tile_index_j;
 * invokes task(argument, i, start_j, tile_size_j) over this thread's range,
 * then steals leftover items from other threads.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_tile_1d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_2d_tile_1d_t task = (pthreadpool_task_2d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/* == SIZE_MAX - threads_count + 1 (unsigned negation). range_length
	 * counters wrap past zero when exhausted; at most threads_count stray
	 * decrements keep wrapped values at or above this threshold, so
	 * "post-decrement value < range_threshold" means a successful claim. */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Decompose the linear start index into (i, tile index along j) using a
	 * precomputed fxdiv divisor (no hardware division per item). */
	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_2d_tile_1d.tile_range_j;
	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(range_start, tile_range_j);
	const size_t tile_j = threadpool->params.parallelize_2d_tile_1d.tile_j;
	size_t i = tile_index_i_j.quotient;
	size_t start_j = tile_index_i_j.remainder * tile_j;

	const size_t range_j = threadpool->params.parallelize_2d_tile_1d.range_j;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* min(...) clamps the last (possibly partial) tile in each row. */
		task(argument, i, start_j, min(range_j - start_j, tile_j));
		/* Advance to the next tile, carrying into the next row at row end. */
		start_j += tile_j;
		if (start_j >= range_j) {
			start_j = 0;
			i += 1;
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/* Steal from other threads in decreasing circular order; stolen tiles are
	 * taken from the back (range_end downwards) and decomposed per item. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(linear_index, tile_range_j);
			const size_t start_j = tile_index_i_j.remainder * tile_j;
			task(argument, tile_index_i_j.quotient, start_j, min(range_j - start_j, tile_j));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
243 | |
/*
 * Fast-path worker for a 2D parallel loop tiled along both dimensions: work
 * items are (i-tile, j-tile) pairs linearized as
 * tile_index_i * tile_range_j + tile_index_j; invokes
 * task(argument, start_i, start_j, tile_size_i, tile_size_j) over this
 * thread's range, then steals leftover tiles from other threads.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_tile_2d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_2d_tile_2d_t task = (pthreadpool_task_2d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/* == SIZE_MAX - threads_count + 1 (unsigned negation). range_length
	 * counters wrap past zero when exhausted; at most threads_count stray
	 * decrements keep wrapped values at or above this threshold, so
	 * "post-decrement value < range_threshold" means a successful claim. */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Decompose the linear start index into tile coordinates using a
	 * precomputed fxdiv divisor (no hardware division per item). */
	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_2d_tile_2d.tile_range_j;
	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(range_start, tile_range_j);
	const size_t tile_i = threadpool->params.parallelize_2d_tile_2d.tile_i;
	const size_t tile_j = threadpool->params.parallelize_2d_tile_2d.tile_j;
	size_t start_i = tile_index_i_j.quotient * tile_i;
	size_t start_j = tile_index_i_j.remainder * tile_j;

	const size_t range_i = threadpool->params.parallelize_2d_tile_2d.range_i;
	const size_t range_j = threadpool->params.parallelize_2d_tile_2d.range_j;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* min(...) clamps partial edge tiles in either dimension. */
		task(argument, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
		/* Advance tile-by-tile along j, carrying into the next i strip. */
		start_j += tile_j;
		if (start_j >= range_j) {
			start_j = 0;
			start_i += tile_i;
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/* Steal from other threads in decreasing circular order; stolen tiles are
	 * taken from the back (range_end downwards) and decomposed per item. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(linear_index, tile_range_j);
			const size_t start_i = tile_index_i_j.quotient * tile_i;
			const size_t start_j = tile_index_i_j.remainder * tile_j;
			task(argument, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
296 | |
/*
 * Fast-path worker for a 2D doubly-tiled parallel loop whose task also
 * receives a CPU microarchitecture index: invokes
 * task(argument, uarch_index, start_i, start_j, tile_size_i, tile_size_j)
 * over this thread's tiles, then steals leftover tiles from other threads.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_tile_2d_with_uarch_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_2d_tile_2d_with_id_t task = (pthreadpool_task_2d_tile_2d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const uint32_t default_uarch_index = threadpool->params.parallelize_2d_tile_2d_with_uarch.default_uarch_index;
	uint32_t uarch_index = default_uarch_index;
#if PTHREADPOOL_USE_CPUINFO
	/* Query the uarch index of the current core; fall back to the default if
	 * it exceeds the caller's declared maximum. */
	uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
	if (uarch_index > threadpool->params.parallelize_2d_tile_2d_with_uarch.max_uarch_index) {
		uarch_index = default_uarch_index;
	}
#endif

	const size_t threads_count = threadpool->threads_count.value;
	/* == SIZE_MAX - threads_count + 1 (unsigned negation). range_length
	 * counters wrap past zero when exhausted; at most threads_count stray
	 * decrements keep wrapped values at or above this threshold, so
	 * "post-decrement value < range_threshold" means a successful claim. */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	/* Decompose the linear start index into tile coordinates using a
	 * precomputed fxdiv divisor (no hardware division per item). */
	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_range_j;
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_result_size_t index = fxdiv_divide_size_t(range_start, tile_range_j);
	const size_t range_i = threadpool->params.parallelize_2d_tile_2d_with_uarch.range_i;
	const size_t tile_i = threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_i;
	const size_t range_j = threadpool->params.parallelize_2d_tile_2d_with_uarch.range_j;
	const size_t tile_j = threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_j;
	size_t start_i = index.quotient * tile_i;
	size_t start_j = index.remainder * tile_j;

	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* min(...) clamps partial edge tiles in either dimension. */
		task(argument, uarch_index, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
		/* Advance tile-by-tile along j, carrying into the next i strip. */
		start_j += tile_j;
		if (start_j >= range_j) {
			start_j = 0;
			start_i += tile_i;
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/* Steal from other threads in decreasing circular order; stolen tiles are
	 * taken from the back (range_end downwards) and decomposed per item. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(linear_index, tile_range_j);
			const size_t start_i = tile_index_i_j.quotient * tile_i;
			const size_t start_j = tile_index_i_j.remainder * tile_j;
			task(argument, uarch_index, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
358 | |
/*
 * Fast-path worker for a 3D parallel loop: work items are linearized as
 * (i * range_j + j) * range_k + k; invokes task(argument, i, j, k) over this
 * thread's range, then steals leftover items from other threads.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_3d_t task = (pthreadpool_task_3d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/* == SIZE_MAX - threads_count + 1 (unsigned negation). range_length
	 * counters wrap past zero when exhausted; at most threads_count stray
	 * decrements keep wrapped values at or above this threshold, so
	 * "post-decrement value < range_threshold" means a successful claim. */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Two fxdiv divisions (precomputed magic-constant divisors) peel off the
	 * k and then the j coordinate from the linear index. */
	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_3d.range_k;
	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(range_start, range_k);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_3d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_ij_k.remainder;

	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		task(argument, i, j, k);
		/* Advance (i, j, k) in row-major order with carry propagation. */
		if (++k == range_k.value) {
			k = 0;
			if (++j == range_j.value) {
				j = 0;
				i += 1;
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/* Steal from other threads in decreasing circular order; stolen items are
	 * taken from the back (range_end downwards) and decomposed per item. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(linear_index, range_k);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
411 | |
/*
 * Fast-path worker for a 3D parallel loop tiled along the k dimension: work
 * items are (i, j, k-tile) triples linearized as
 * (i * range_j + j) * tile_range_k + tile_index_k; invokes
 * task(argument, i, j, start_k, tile_size_k) over this thread's range, then
 * steals leftover items from other threads.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_tile_1d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_3d_tile_1d_t task = (pthreadpool_task_3d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/* == SIZE_MAX - threads_count + 1 (unsigned negation). range_length
	 * counters wrap past zero when exhausted; at most threads_count stray
	 * decrements keep wrapped values at or above this threshold, so
	 * "post-decrement value < range_threshold" means a successful claim. */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Two fxdiv divisions peel off the k-tile index and then the j coordinate
	 * from the linear index (precomputed divisors, no hardware division). */
	const struct fxdiv_divisor_size_t tile_range_k = threadpool->params.parallelize_3d_tile_1d.tile_range_k;
	const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_3d_tile_1d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j);
	const size_t tile_k = threadpool->params.parallelize_3d_tile_1d.tile_k;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t start_k = tile_index_ij_k.remainder * tile_k;

	const size_t range_k = threadpool->params.parallelize_3d_tile_1d.range_k;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* min(...) clamps the last (possibly partial) k tile. */
		task(argument, i, j, start_k, min(range_k - start_k, tile_k));
		/* Advance tile-by-tile along k, carrying into j and then i. */
		start_k += tile_k;
		if (start_k >= range_k) {
			start_k = 0;
			if (++j == range_j.value) {
				j = 0;
				i += 1;
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/* Steal from other threads in decreasing circular order; stolen items are
	 * taken from the back (range_end downwards) and decomposed per item. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j);
			const size_t start_k = tile_index_ij_k.remainder * tile_k;
			task(argument, index_i_j.quotient, index_i_j.remainder, start_k, min(range_k - start_k, tile_k));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
468 | |
/*
 * Fast-path worker for a 3D parallel loop tiled along the j and k
 * dimensions: work items are (i, j-tile, k-tile) triples linearized as
 * (i * tile_range_j + tile_index_j) * tile_range_k + tile_index_k; invokes
 * task(argument, i, start_j, start_k, tile_size_j, tile_size_k) over this
 * thread's range, then steals leftover items from other threads.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_tile_2d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_3d_tile_2d_t task = (pthreadpool_task_3d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/* == SIZE_MAX - threads_count + 1 (unsigned negation). range_length
	 * counters wrap past zero when exhausted; at most threads_count stray
	 * decrements keep wrapped values at or above this threshold, so
	 * "post-decrement value < range_threshold" means a successful claim. */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Two fxdiv divisions peel off the k-tile and then the j-tile indices
	 * from the linear index (precomputed divisors, no hardware division). */
	const struct fxdiv_divisor_size_t tile_range_k = threadpool->params.parallelize_3d_tile_2d.tile_range_k;
	const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k);
	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_3d_tile_2d.tile_range_j;
	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
	const size_t tile_j = threadpool->params.parallelize_3d_tile_2d.tile_j;
	const size_t tile_k = threadpool->params.parallelize_3d_tile_2d.tile_k;
	size_t i = tile_index_i_j.quotient;
	size_t start_j = tile_index_i_j.remainder * tile_j;
	size_t start_k = tile_index_ij_k.remainder * tile_k;

	const size_t range_k = threadpool->params.parallelize_3d_tile_2d.range_k;
	const size_t range_j = threadpool->params.parallelize_3d_tile_2d.range_j;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* min(...) clamps partial edge tiles in either tiled dimension. */
		task(argument, i, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
		/* Advance tile-by-tile along k, carrying into j tiles and then i. */
		start_k += tile_k;
		if (start_k >= range_k) {
			start_k = 0;
			start_j += tile_j;
			if (start_j >= range_j) {
				start_j = 0;
				i += 1;
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/* Steal from other threads in decreasing circular order; stolen items are
	 * taken from the back (range_end downwards) and decomposed per item. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k);
			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
			const size_t start_j = tile_index_i_j.remainder * tile_j;
			const size_t start_k = tile_index_ij_k.remainder * tile_k;
			task(argument, tile_index_i_j.quotient, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
529 | |
/*
 * Fast-path worker for a 3D parallel loop tiled along j and k whose task
 * also receives a CPU microarchitecture index: invokes
 * task(argument, uarch_index, i, start_j, start_k, tile_size_j, tile_size_k)
 * over this thread's range, then steals leftover items from other threads.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_tile_2d_with_uarch_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_3d_tile_2d_with_id_t task = (pthreadpool_task_3d_tile_2d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const uint32_t default_uarch_index = threadpool->params.parallelize_3d_tile_2d_with_uarch.default_uarch_index;
	uint32_t uarch_index = default_uarch_index;
#if PTHREADPOOL_USE_CPUINFO
	/* Query the uarch index of the current core; fall back to the default if
	 * it exceeds the caller's declared maximum. */
	uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
	if (uarch_index > threadpool->params.parallelize_3d_tile_2d_with_uarch.max_uarch_index) {
		uarch_index = default_uarch_index;
	}
#endif

	const size_t threads_count = threadpool->threads_count.value;
	/* == SIZE_MAX - threads_count + 1 (unsigned negation). range_length
	 * counters wrap past zero when exhausted; at most threads_count stray
	 * decrements keep wrapped values at or above this threshold, so
	 * "post-decrement value < range_threshold" means a successful claim. */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Two fxdiv divisions peel off the k-tile and then the j-tile indices
	 * from the linear index (precomputed divisors, no hardware division). */
	const struct fxdiv_divisor_size_t tile_range_k = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_range_k;
	const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k);
	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_range_j;
	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
	const size_t tile_j = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_j;
	const size_t tile_k = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_k;
	size_t i = tile_index_i_j.quotient;
	size_t start_j = tile_index_i_j.remainder * tile_j;
	size_t start_k = tile_index_ij_k.remainder * tile_k;

	const size_t range_k = threadpool->params.parallelize_3d_tile_2d_with_uarch.range_k;
	const size_t range_j = threadpool->params.parallelize_3d_tile_2d_with_uarch.range_j;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* min(...) clamps partial edge tiles in either tiled dimension. */
		task(argument, uarch_index, i, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
		/* Advance tile-by-tile along k, carrying into j tiles and then i. */
		start_k += tile_k;
		if (start_k >= range_k) {
			start_k = 0;
			start_j += tile_j;
			if (start_j >= range_j) {
				start_j = 0;
				i += 1;
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/* Steal from other threads in decreasing circular order; stolen items are
	 * taken from the back (range_end downwards) and decomposed per item. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k);
			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
			const size_t start_j = tile_index_i_j.remainder * tile_j;
			const size_t start_k = tile_index_ij_k.remainder * tile_k;
			task(argument, uarch_index, tile_index_i_j.quotient, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
599 | |
/*
 * Fast-path worker for a 4D parallel loop: work items are linearized as
 * (i * range_j + j) * range_kl + (k * range_l + l), where
 * range_kl == range_k * range_l (NOTE(review): inferred from the two-level
 * decomposition below -- confirm against the dispatch code that fills in
 * params.parallelize_4d). Invokes task(argument, i, j, k, l) over this
 * thread's range, then steals leftover items from other threads.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_4d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_4d_t task = (pthreadpool_task_4d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/* == SIZE_MAX - threads_count + 1 (unsigned negation). range_length
	 * counters wrap past zero when exhausted; at most threads_count stray
	 * decrements keep wrapped values at or above this threshold, so
	 * "post-decrement value < range_threshold" means a successful claim. */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Three fxdiv divisions (precomputed divisors, no hardware division)
	 * split the linear index first into (ij, kl), then ij into (i, j) and
	 * kl into (k, l). */
	const struct fxdiv_divisor_size_t range_kl = threadpool->params.parallelize_4d.range_kl;
	const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(range_start, range_kl);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t range_l = threadpool->params.parallelize_4d.range_l;
	const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_k_l.quotient;
	size_t l = index_k_l.remainder;

	/* range_k is stored as a plain size_t (not an fxdiv divisor): it is only
	 * compared against, never divided by, in this function. */
	const size_t range_k = threadpool->params.parallelize_4d.range_k;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		task(argument, i, j, k, l);
		/* Advance (i, j, k, l) in row-major order with carry propagation. */
		if (++l == range_l.value) {
			l = 0;
			if (++k == range_k) {
				k = 0;
				if (++j == range_j.value) {
					j = 0;
					i += 1;
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/* Steal from other threads in decreasing circular order; stolen items are
	 * taken from the back (range_end downwards) and decomposed per item. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(linear_index, range_kl);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
			task(argument, index_i_j.quotient, index_i_j.remainder, index_k_l.quotient, index_k_l.remainder);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
660 | |
/*
 * Fast-path per-thread worker for pthreadpool_parallelize_4d_tile_1d.
 *
 * Each work item is one tile: a full (i, j, k) coordinate plus up to tile_l
 * consecutive values along the l dimension (the last tile along l may be
 * partial).  Items are numbered linearly; fxdiv-based divisions decompose a
 * linear index into coordinates.  The thread first drains its own
 * pre-assigned slice, then steals leftover items from other threads.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_4d_tile_1d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_4d_tile_1d_t task = (pthreadpool_task_4d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/*
	 * range_length wraps around (underflows) once a range is exhausted.  At
	 * most threads_count threads can each decrement an already-empty range
	 * once before observing the wrap, so any wrapped value is >= -threads_count
	 * (as size_t); this threshold separates "item claimed" from "range empty".
	 */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	/* Decompose the linear start position into (i, j, k, l-tile) coordinates */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_kl = threadpool->params.parallelize_4d_tile_1d.tile_range_kl;
	const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(range_start, tile_range_kl);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d_tile_1d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t tile_range_l = threadpool->params.parallelize_4d_tile_1d.tile_range_l;
	const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
	const size_t tile_l = threadpool->params.parallelize_4d_tile_1d.tile_l;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = tile_index_k_l.quotient;
	size_t start_l = tile_index_k_l.remainder * tile_l;

	const size_t range_l = threadpool->params.parallelize_4d_tile_1d.range_l;
	const size_t range_k = threadpool->params.parallelize_4d_tile_1d.range_k;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* min() clips the last, possibly partial, tile along l */
		task(argument, i, j, k, start_l, min(range_l - start_l, tile_l));
		/* Advance to the next tile along l, carrying into k, j, i on wrap-around */
		start_l += tile_l;
		if (start_l >= range_l) {
			start_l = 0;
			if (++k == range_k) {
				k = 0;
				if (++j == range_j.value) {
					j = 0;
					i += 1;
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/*
	 * Visit the other threads in decreasing id order (with wrap-around) and
	 * steal items one at a time from the tail (range_end) of their ranges,
	 * while the owners consume from the head.
	 */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
			const size_t start_l = tile_index_k_l.remainder * tile_l;
			task(argument, index_i_j.quotient, index_i_j.remainder, tile_index_k_l.quotient, start_l, min(range_l - start_l, tile_l));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
725 | |
/*
 * Fast-path per-thread worker for pthreadpool_parallelize_4d_tile_2d.
 *
 * Each work item is a 2D tile: a full (i, j) coordinate plus up to
 * tile_k x tile_l values along the k and l dimensions (edge tiles may be
 * partial).  Items are numbered linearly; fxdiv-based divisions decompose a
 * linear index into coordinates.  The thread first drains its own
 * pre-assigned slice, then steals leftover items from other threads.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_4d_tile_2d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_4d_tile_2d_t task = (pthreadpool_task_4d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/*
	 * range_length wraps around (underflows) once a range is exhausted; at
	 * most threads_count threads decrement an empty range before noticing,
	 * so any value >= -threads_count (as size_t) marks exhaustion.
	 */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	/* Decompose the linear start position into (i, j, k-tile, l-tile) coordinates */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_kl = threadpool->params.parallelize_4d_tile_2d.tile_range_kl;
	const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(range_start, tile_range_kl);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d_tile_2d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t tile_range_l = threadpool->params.parallelize_4d_tile_2d.tile_range_l;
	const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
	const size_t tile_k = threadpool->params.parallelize_4d_tile_2d.tile_k;
	const size_t tile_l = threadpool->params.parallelize_4d_tile_2d.tile_l;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t start_k = tile_index_k_l.quotient * tile_k;
	size_t start_l = tile_index_k_l.remainder * tile_l;

	const size_t range_l = threadpool->params.parallelize_4d_tile_2d.range_l;
	const size_t range_k = threadpool->params.parallelize_4d_tile_2d.range_k;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* min() clips partial edge tiles along k and l */
		task(argument, i, j, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
		/* Advance to the next tile along l, carrying into k, j, i on wrap-around */
		start_l += tile_l;
		if (start_l >= range_l) {
			start_l = 0;
			start_k += tile_k;
			if (start_k >= range_k) {
				start_k = 0;
				if (++j == range_j.value) {
					j = 0;
					i += 1;
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/*
	 * Visit the other threads in decreasing id order (with wrap-around) and
	 * steal items one at a time from the tail (range_end) of their ranges,
	 * while the owners consume from the head.
	 */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
			const size_t start_k = tile_index_k_l.quotient * tile_k;
			const size_t start_l = tile_index_k_l.remainder * tile_l;
			task(argument, index_i_j.quotient, index_i_j.remainder, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
793 | |
/*
 * Fast-path per-thread worker for pthreadpool_parallelize_4d_tile_2d_with_uarch.
 *
 * Identical work distribution to the plain 4d_tile_2d fast path, but each
 * task invocation additionally receives the microarchitecture index of the
 * core the thread currently runs on (queried via cpuinfo when available,
 * otherwise the caller-provided default).
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_4d_tile_2d_with_uarch_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_4d_tile_2d_with_id_t task = (pthreadpool_task_4d_tile_2d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const uint32_t default_uarch_index = threadpool->params.parallelize_4d_tile_2d_with_uarch.default_uarch_index;
	uint32_t uarch_index = default_uarch_index;
#if PTHREADPOOL_USE_CPUINFO
	/* Fall back to the default index if cpuinfo reports an out-of-range uarch */
	uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
	if (uarch_index > threadpool->params.parallelize_4d_tile_2d_with_uarch.max_uarch_index) {
		uarch_index = default_uarch_index;
	}
#endif

	const size_t threads_count = threadpool->threads_count.value;
	/*
	 * range_length wraps around (underflows) once a range is exhausted; at
	 * most threads_count threads decrement an empty range before noticing,
	 * so any value >= -threads_count (as size_t) marks exhaustion.
	 */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	/* Decompose the linear start position into (i, j, k-tile, l-tile) coordinates */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_kl = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_range_kl;
	const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(range_start, tile_range_kl);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d_tile_2d_with_uarch.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t tile_range_l = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_range_l;
	const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
	const size_t tile_k = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_k;
	const size_t tile_l = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_l;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t start_k = tile_index_k_l.quotient * tile_k;
	size_t start_l = tile_index_k_l.remainder * tile_l;

	const size_t range_l = threadpool->params.parallelize_4d_tile_2d_with_uarch.range_l;
	const size_t range_k = threadpool->params.parallelize_4d_tile_2d_with_uarch.range_k;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* min() clips partial edge tiles along k and l */
		task(argument, uarch_index, i, j, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
		/* Advance to the next tile along l, carrying into k, j, i on wrap-around */
		start_l += tile_l;
		if (start_l >= range_l) {
			start_l = 0;
			start_k += tile_k;
			if (start_k >= range_k) {
				start_k = 0;
				if (++j == range_j.value) {
					j = 0;
					i += 1;
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/*
	 * Visit the other threads in decreasing id order (with wrap-around) and
	 * steal items one at a time from the tail (range_end) of their ranges,
	 * while the owners consume from the head.
	 */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
			const size_t start_k = tile_index_k_l.quotient * tile_k;
			const size_t start_l = tile_index_k_l.remainder * tile_l;
			task(argument, uarch_index, index_i_j.quotient, index_i_j.remainder, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
870 | |
/*
 * Fast-path per-thread worker for pthreadpool_parallelize_5d.
 *
 * Each work item is a single (i, j, k, l, m) point.  Items are numbered
 * linearly; fxdiv-based divisions decompose a linear index into coordinates
 * (first splitting off the combined l*m block, then k, m, and j).  The thread
 * first drains its own pre-assigned slice, then steals leftover items from
 * other threads.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_5d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_5d_t task = (pthreadpool_task_5d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/*
	 * range_length wraps around (underflows) once a range is exhausted; at
	 * most threads_count threads decrement an empty range before noticing,
	 * so any value >= -threads_count (as size_t) marks exhaustion.
	 */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	/* Decompose the linear start position into (i, j, k, l, m) coordinates */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t range_lm = threadpool->params.parallelize_5d.range_lm;
	const struct fxdiv_result_size_t index_ijk_lm = fxdiv_divide_size_t(range_start, range_lm);
	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_5d.range_k;
	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lm.quotient, range_k);
	const struct fxdiv_divisor_size_t range_m = threadpool->params.parallelize_5d.range_m;
	const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_ijk_lm.remainder, range_m);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_5d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_ij_k.remainder;
	size_t l = index_l_m.quotient;
	size_t m = index_l_m.remainder;

	const size_t range_l = threadpool->params.parallelize_5d.range_l;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		task(argument, i, j, k, l, m);
		/* Increment the innermost index, carrying into l, k, j, i on wrap-around */
		if (++m == range_m.value) {
			m = 0;
			if (++l == range_l) {
				l = 0;
				if (++k == range_k.value) {
					k = 0;
					if (++j == range_j.value) {
						j = 0;
						i += 1;
					}
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/*
	 * Visit the other threads in decreasing id order (with wrap-around) and
	 * steal items one at a time from the tail (range_end) of their ranges,
	 * while the owners consume from the head.
	 */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t index_ijk_lm = fxdiv_divide_size_t(linear_index, range_lm);
			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lm.quotient, range_k);
			const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_ijk_lm.remainder, range_m);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
938 | |
/*
 * Fast-path per-thread worker for pthreadpool_parallelize_5d_tile_1d.
 *
 * Each work item is one tile: a full (i, j, k, l) coordinate plus up to
 * tile_m consecutive values along the m dimension (the last tile along m may
 * be partial).  Items are numbered linearly; fxdiv-based divisions decompose
 * a linear index into coordinates.  The thread first drains its own
 * pre-assigned slice, then steals leftover items from other threads.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_5d_tile_1d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_5d_tile_1d_t task = (pthreadpool_task_5d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/*
	 * range_length wraps around (underflows) once a range is exhausted; at
	 * most threads_count threads decrement an empty range before noticing,
	 * so any value >= -threads_count (as size_t) marks exhaustion.
	 */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	/* Decompose the linear start position into (i, j, k, l, m-tile) coordinates */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_m = threadpool->params.parallelize_5d_tile_1d.tile_range_m;
	const struct fxdiv_result_size_t tile_index_ijkl_m = fxdiv_divide_size_t(range_start, tile_range_m);
	const struct fxdiv_divisor_size_t range_kl = threadpool->params.parallelize_5d_tile_1d.range_kl;
	const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_m.quotient, range_kl);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_5d_tile_1d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t range_l = threadpool->params.parallelize_5d_tile_1d.range_l;
	const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
	const size_t tile_m = threadpool->params.parallelize_5d_tile_1d.tile_m;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_k_l.quotient;
	size_t l = index_k_l.remainder;
	size_t start_m = tile_index_ijkl_m.remainder * tile_m;

	const size_t range_m = threadpool->params.parallelize_5d_tile_1d.range_m;
	const size_t range_k = threadpool->params.parallelize_5d_tile_1d.range_k;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* min() clips the last, possibly partial, tile along m */
		task(argument, i, j, k, l, start_m, min(range_m - start_m, tile_m));
		/* Advance to the next tile along m, carrying into l, k, j, i on wrap-around */
		start_m += tile_m;
		if (start_m >= range_m) {
			start_m = 0;
			if (++l == range_l.value) {
				l = 0;
				if (++k == range_k) {
					k = 0;
					if (++j == range_j.value) {
						j = 0;
						i += 1;
					}
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/*
	 * Visit the other threads in decreasing id order (with wrap-around) and
	 * steal items one at a time from the tail (range_end) of their ranges,
	 * while the owners consume from the head.
	 */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ijkl_m = fxdiv_divide_size_t(linear_index, tile_range_m);
			const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_m.quotient, range_kl);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
			size_t start_m = tile_index_ijkl_m.remainder * tile_m;
			task(argument, index_i_j.quotient, index_i_j.remainder, index_k_l.quotient, index_k_l.remainder, start_m,
				min(range_m - start_m, tile_m));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
1011 | |
/*
 * Fast-path per-thread worker for pthreadpool_parallelize_5d_tile_2d.
 *
 * Each work item is a 2D tile: a full (i, j, k) coordinate plus up to
 * tile_l x tile_m values along the l and m dimensions (edge tiles may be
 * partial).  Items are numbered linearly; fxdiv-based divisions decompose a
 * linear index into coordinates.  The thread first drains its own
 * pre-assigned slice, then steals leftover items from other threads.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_5d_tile_2d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_5d_tile_2d_t task = (pthreadpool_task_5d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/*
	 * range_length wraps around (underflows) once a range is exhausted; at
	 * most threads_count threads decrement an empty range before noticing,
	 * so any value >= -threads_count (as size_t) marks exhaustion.
	 */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	/* Decompose the linear start position into (i, j, k, l-tile, m-tile) coordinates */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_lm = threadpool->params.parallelize_5d_tile_2d.tile_range_lm;
	const struct fxdiv_result_size_t tile_index_ijk_lm = fxdiv_divide_size_t(range_start, tile_range_lm);
	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_5d_tile_2d.range_k;
	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lm.quotient, range_k);
	const struct fxdiv_divisor_size_t tile_range_m = threadpool->params.parallelize_5d_tile_2d.tile_range_m;
	const struct fxdiv_result_size_t tile_index_l_m = fxdiv_divide_size_t(tile_index_ijk_lm.remainder, tile_range_m);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_5d_tile_2d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
	const size_t tile_l = threadpool->params.parallelize_5d_tile_2d.tile_l;
	const size_t tile_m = threadpool->params.parallelize_5d_tile_2d.tile_m;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_ij_k.remainder;
	size_t start_l = tile_index_l_m.quotient * tile_l;
	size_t start_m = tile_index_l_m.remainder * tile_m;

	const size_t range_m = threadpool->params.parallelize_5d_tile_2d.range_m;
	const size_t range_l = threadpool->params.parallelize_5d_tile_2d.range_l;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* min() clips partial edge tiles along l and m */
		task(argument, i, j, k, start_l, start_m, min(range_l - start_l, tile_l), min(range_m - start_m, tile_m));
		/* Advance to the next tile along m, carrying into l, k, j, i on wrap-around */
		start_m += tile_m;
		if (start_m >= range_m) {
			start_m = 0;
			start_l += tile_l;
			if (start_l >= range_l) {
				start_l = 0;
				if (++k == range_k.value) {
					k = 0;
					if (++j == range_j.value) {
						j = 0;
						i += 1;
					}
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/*
	 * Visit the other threads in decreasing id order (with wrap-around) and
	 * steal items one at a time from the tail (range_end) of their ranges,
	 * while the owners consume from the head.
	 */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ijk_lm = fxdiv_divide_size_t(linear_index, tile_range_lm);
			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lm.quotient, range_k);
			const struct fxdiv_result_size_t tile_index_l_m = fxdiv_divide_size_t(tile_index_ijk_lm.remainder, tile_range_m);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
			const size_t start_l = tile_index_l_m.quotient * tile_l;
			const size_t start_m = tile_index_l_m.remainder * tile_m;
			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder,
				start_l, start_m, min(range_l - start_l, tile_l), min(range_m - start_m, tile_m));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
1087 | |
/*
 * Fast-path per-thread worker for pthreadpool_parallelize_6d.
 *
 * Each work item is a single (i, j, k, l, m, n) point.  Items are numbered
 * linearly; fxdiv-based divisions decompose a linear index into coordinates
 * (first splitting off the combined l*m*n block, then k, n, j, and m).  The
 * thread first drains its own pre-assigned slice, then steals leftover items
 * from other threads.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_6d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_6d_t task = (pthreadpool_task_6d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/*
	 * range_length wraps around (underflows) once a range is exhausted; at
	 * most threads_count threads decrement an empty range before noticing,
	 * so any value >= -threads_count (as size_t) marks exhaustion.
	 */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	/* Decompose the linear start position into (i, j, k, l, m, n) coordinates */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t range_lmn = threadpool->params.parallelize_6d.range_lmn;
	const struct fxdiv_result_size_t index_ijk_lmn = fxdiv_divide_size_t(range_start, range_lmn);
	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_6d.range_k;
	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lmn.quotient, range_k);
	const struct fxdiv_divisor_size_t range_n = threadpool->params.parallelize_6d.range_n;
	const struct fxdiv_result_size_t index_lm_n = fxdiv_divide_size_t(index_ijk_lmn.remainder, range_n);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_6d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
	const struct fxdiv_divisor_size_t range_m = threadpool->params.parallelize_6d.range_m;
	const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_lm_n.quotient, range_m);
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_ij_k.remainder;
	size_t l = index_l_m.quotient;
	size_t m = index_l_m.remainder;
	size_t n = index_lm_n.remainder;

	const size_t range_l = threadpool->params.parallelize_6d.range_l;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		task(argument, i, j, k, l, m, n);
		/* Increment the innermost index, carrying into m, l, k, j, i on wrap-around */
		if (++n == range_n.value) {
			n = 0;
			if (++m == range_m.value) {
				m = 0;
				if (++l == range_l) {
					l = 0;
					if (++k == range_k.value) {
						k = 0;
						if (++j == range_j.value) {
							j = 0;
							i += 1;
						}
					}
				}
			}
		}
	}


	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/*
	 * Visit the other threads in decreasing id order (with wrap-around) and
	 * steal items one at a time from the tail (range_end) of their ranges,
	 * while the owners consume from the head.
	 */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t index_ijk_lmn = fxdiv_divide_size_t(linear_index, range_lmn);
			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lmn.quotient, range_k);
			const struct fxdiv_result_size_t index_lm_n = fxdiv_divide_size_t(index_ijk_lmn.remainder, range_n);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
			const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_lm_n.quotient, range_m);
			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder, index_lm_n.remainder);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
1163 | |
/*
 * Fast-path per-thread worker for pthreadpool_parallelize_6d_tile_1d.
 *
 * Each work item is one tile: a full (i, j, k, l, m) coordinate plus up to
 * tile_n consecutive values along the n dimension (the last tile along n may
 * be partial).  Items are numbered linearly; fxdiv-based divisions decompose
 * a linear index into coordinates.  The thread first drains its own
 * pre-assigned slice, then steals leftover items from other threads.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_6d_tile_1d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_6d_tile_1d_t task = (pthreadpool_task_6d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/*
	 * range_length wraps around (underflows) once a range is exhausted; at
	 * most threads_count threads decrement an empty range before noticing,
	 * so any value >= -threads_count (as size_t) marks exhaustion.
	 */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	/* Decompose the linear start position into (i, j, k, l, m, n-tile) coordinates */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_lmn = threadpool->params.parallelize_6d_tile_1d.tile_range_lmn;
	const struct fxdiv_result_size_t tile_index_ijk_lmn = fxdiv_divide_size_t(range_start, tile_range_lmn);
	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_6d_tile_1d.range_k;
	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lmn.quotient, range_k);
	const struct fxdiv_divisor_size_t tile_range_n = threadpool->params.parallelize_6d_tile_1d.tile_range_n;
	const struct fxdiv_result_size_t tile_index_lm_n = fxdiv_divide_size_t(tile_index_ijk_lmn.remainder, tile_range_n);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_6d_tile_1d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
	const struct fxdiv_divisor_size_t range_m = threadpool->params.parallelize_6d_tile_1d.range_m;
	const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(tile_index_lm_n.quotient, range_m);
	const size_t tile_n = threadpool->params.parallelize_6d_tile_1d.tile_n;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_ij_k.remainder;
	size_t l = index_l_m.quotient;
	size_t m = index_l_m.remainder;
	size_t start_n = tile_index_lm_n.remainder * tile_n;

	const size_t range_n = threadpool->params.parallelize_6d_tile_1d.range_n;
	const size_t range_l = threadpool->params.parallelize_6d_tile_1d.range_l;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* min() clips the last, possibly partial, tile along n */
		task(argument, i, j, k, l, m, start_n, min(range_n - start_n, tile_n));
		/* Advance to the next tile along n, carrying into m, l, k, j, i on wrap-around */
		start_n += tile_n;
		if (start_n >= range_n) {
			start_n = 0;
			if (++m == range_m.value) {
				m = 0;
				if (++l == range_l) {
					l = 0;
					if (++k == range_k.value) {
						k = 0;
						if (++j == range_j.value) {
							j = 0;
							i += 1;
						}
					}
				}
			}
		}
	}


	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/*
	 * Visit the other threads in decreasing id order (with wrap-around) and
	 * steal items one at a time from the tail (range_end) of their ranges,
	 * while the owners consume from the head.
	 */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ijk_lmn = fxdiv_divide_size_t(linear_index, tile_range_lmn);
			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lmn.quotient, range_k);
			const struct fxdiv_result_size_t tile_index_lm_n = fxdiv_divide_size_t(tile_index_ijk_lmn.remainder, tile_range_n);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
			const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(tile_index_lm_n.quotient, range_m);
			const size_t start_n = tile_index_lm_n.remainder * tile_n;
			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder,
				start_n, min(range_n - start_n, tile_n));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
1244 | |
/*
 * Fast-path worker for a 6D parallelization whose two innermost dimensions
 * (m, n) are tiled by tile_m x tile_n.  Each linear work item maps to one
 * (i, j, k, l, start_m, start_n) tile coordinate.  The thread first drains
 * its own pre-assigned range — decoding only the starting linear index with
 * fxdiv, then advancing the coordinate incrementally like a mixed-radix
 * odometer — and afterwards steals leftover items from the tails of other
 * threads' ranges, decoding each stolen index from scratch.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_6d_tile_2d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	/* Task pointer and argument were published before threads were woken;
	 * relaxed loads suffice here (ordering is established elsewhere). */
	const pthreadpool_task_6d_tile_2d_t task = (pthreadpool_task_6d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/*
	 * range_length is decremented (fetch-after) once per claimed item.  When
	 * the counter underflows past zero it wraps around to a huge value; with
	 * at most threads_count workers racing on one counter, any wrapped value
	 * is >= -threads_count == range_threshold.  Thus
	 * "decrement_fetch(...) < range_threshold" means "an item was claimed".
	 */
	const size_t threads_count = threadpool->threads_count.value;
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/*
	 * Decode the starting linear index layer by layer:
	 *   linear     -> (ijkl, mn)          via / tile_range_mn
	 *   ijkl       -> (ij, kl)            via / range_kl
	 *   mn         -> (tile_m_idx, tile_n_idx) via / tile_range_n
	 *   ij         -> (i, j)              via / range_j
	 *   kl         -> (k, l)              via / range_l
	 * fxdiv replaces the runtime divisions with precomputed multiply-shift.
	 */
	const struct fxdiv_divisor_size_t tile_range_mn = threadpool->params.parallelize_6d_tile_2d.tile_range_mn;
	const struct fxdiv_result_size_t tile_index_ijkl_mn = fxdiv_divide_size_t(range_start, tile_range_mn);
	const struct fxdiv_divisor_size_t range_kl = threadpool->params.parallelize_6d_tile_2d.range_kl;
	const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_mn.quotient, range_kl);
	const struct fxdiv_divisor_size_t tile_range_n = threadpool->params.parallelize_6d_tile_2d.tile_range_n;
	const struct fxdiv_result_size_t tile_index_m_n = fxdiv_divide_size_t(tile_index_ijkl_mn.remainder, tile_range_n);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_6d_tile_2d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t range_l = threadpool->params.parallelize_6d_tile_2d.range_l;
	const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
	const size_t tile_m = threadpool->params.parallelize_6d_tile_2d.tile_m;
	const size_t tile_n = threadpool->params.parallelize_6d_tile_2d.tile_n;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_k_l.quotient;
	size_t l = index_k_l.remainder;
	size_t start_m = tile_index_m_n.quotient * tile_m;
	size_t start_n = tile_index_m_n.remainder * tile_n;

	const size_t range_n = threadpool->params.parallelize_6d_tile_2d.range_n;
	const size_t range_m = threadpool->params.parallelize_6d_tile_2d.range_m;
	const size_t range_k = threadpool->params.parallelize_6d_tile_2d.range_k;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* min(...) clamps the trailing partial tile in each tiled dimension. */
		task(argument, i, j, k, l, start_m, start_n, min(range_m - start_m, tile_m), min(range_n - start_n, tile_n));
		/* Advance (start_n, start_m, l, k, j, i) as a mixed-radix counter,
		 * carrying into the next dimension on overflow — avoids re-dividing
		 * a linear index for every item. */
		start_n += tile_n;
		if (start_n >= range_n) {
			start_n = 0;
			start_m += tile_m;
			if (start_m >= range_m) {
				start_m = 0;
				if (++l == range_l.value) {
					l = 0;
					if (++k == range_k) {
						k = 0;
						if (++j == range_j.value) {
							j = 0;
							i += 1;
						}
					}
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/* Scan the other threads in decreasing ring order, stealing items from
	 * the tail (range_end) of each victim's range.  Stolen indices are not
	 * sequential, so each one is fully decoded with fxdiv. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ijkl_mn = fxdiv_divide_size_t(linear_index, tile_range_mn);
			const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_mn.quotient, range_kl);
			const struct fxdiv_result_size_t tile_index_m_n = fxdiv_divide_size_t(tile_index_ijkl_mn.remainder, tile_range_n);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
			const size_t start_m = tile_index_m_n.quotient * tile_m;
			const size_t start_n = tile_index_m_n.remainder * tile_n;
			task(argument, index_i_j.quotient, index_i_j.remainder, index_k_l.quotient, index_k_l.remainder,
				start_m, start_n, min(range_m - start_m, tile_m), min(range_n - start_n, tile_n));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
1328 | |