1/* Standard C headers */
2#include <stddef.h>
3
4/* Dependencies */
5#include <fxdiv.h>
6
7/* Public library header */
8#include <pthreadpool.h>
9
10/* Internal library headers */
11#include "threadpool-utils.h"
12
13
14void pthreadpool_compute_1d(
15 pthreadpool_t threadpool,
16 pthreadpool_function_1d_t function,
17 void* argument,
18 size_t range)
19{
20 pthreadpool_parallelize_1d(threadpool,
21 (pthreadpool_task_1d_t) function, argument,
22 range, 0 /* flags */);
23}
24
25void pthreadpool_compute_1d_tiled(
26 pthreadpool_t threadpool,
27 pthreadpool_function_1d_tiled_t function,
28 void* argument,
29 size_t range,
30 size_t tile)
31{
32 pthreadpool_parallelize_1d_tile_1d(threadpool,
33 (pthreadpool_task_1d_tile_1d_t) function, argument,
34 range, tile, 0 /* flags */);
35}
36
37void pthreadpool_compute_2d(
38 pthreadpool_t threadpool,
39 pthreadpool_function_2d_t function,
40 void* argument,
41 size_t range_i,
42 size_t range_j)
43{
44 pthreadpool_parallelize_2d(threadpool,
45 (pthreadpool_task_2d_t) function, argument,
46 range_i, range_j, 0 /* flags */);
47}
48
49void pthreadpool_compute_2d_tiled(
50 pthreadpool_t threadpool,
51 pthreadpool_function_2d_tiled_t function,
52 void* argument,
53 size_t range_i,
54 size_t range_j,
55 size_t tile_i,
56 size_t tile_j)
57{
58 pthreadpool_parallelize_2d_tile_2d(threadpool,
59 (pthreadpool_task_2d_tile_2d_t) function, argument,
60 range_i, range_j, tile_i, tile_j, 0 /* flags */);
61}
62
63struct compute_3d_tiled_context {
64 pthreadpool_function_3d_tiled_t function;
65 void* argument;
66 struct fxdiv_divisor_size_t tile_range_j;
67 struct fxdiv_divisor_size_t tile_range_k;
68 size_t range_i;
69 size_t range_j;
70 size_t range_k;
71 size_t tile_i;
72 size_t tile_j;
73 size_t tile_k;
74};
75
76static void compute_3d_tiled(const struct compute_3d_tiled_context* context, size_t linear_index) {
77 const struct fxdiv_divisor_size_t tile_range_k = context->tile_range_k;
78 const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k);
79 const struct fxdiv_divisor_size_t tile_range_j = context->tile_range_j;
80 const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
81 const size_t max_tile_i = context->tile_i;
82 const size_t max_tile_j = context->tile_j;
83 const size_t max_tile_k = context->tile_k;
84 const size_t index_i = tile_index_i_j.quotient * max_tile_i;
85 const size_t index_j = tile_index_i_j.remainder * max_tile_j;
86 const size_t index_k = tile_index_ij_k.remainder * max_tile_k;
87 const size_t tile_i = min(max_tile_i, context->range_i - index_i);
88 const size_t tile_j = min(max_tile_j, context->range_j - index_j);
89 const size_t tile_k = min(max_tile_k, context->range_k - index_k);
90 context->function(context->argument, index_i, index_j, index_k, tile_i, tile_j, tile_k);
91}
92
93void pthreadpool_compute_3d_tiled(
94 pthreadpool_t threadpool,
95 pthreadpool_function_3d_tiled_t function,
96 void* argument,
97 size_t range_i,
98 size_t range_j,
99 size_t range_k,
100 size_t tile_i,
101 size_t tile_j,
102 size_t tile_k)
103{
104 if (pthreadpool_get_threads_count(threadpool) <= 1) {
105 /* No thread pool used: execute function sequentially on the calling thread */
106 for (size_t i = 0; i < range_i; i += tile_i) {
107 for (size_t j = 0; j < range_j; j += tile_j) {
108 for (size_t k = 0; k < range_k; k += tile_k) {
109 function(argument, i, j, k, min(range_i - i, tile_i), min(range_j - j, tile_j), min(range_k - k, tile_k));
110 }
111 }
112 }
113 } else {
114 /* Execute in parallel on the thread pool using linearized index */
115 const size_t tile_range_i = divide_round_up(range_i, tile_i);
116 const size_t tile_range_j = divide_round_up(range_j, tile_j);
117 const size_t tile_range_k = divide_round_up(range_k, tile_k);
118 struct compute_3d_tiled_context context = {
119 .function = function,
120 .argument = argument,
121 .tile_range_j = fxdiv_init_size_t(tile_range_j),
122 .tile_range_k = fxdiv_init_size_t(tile_range_k),
123 .range_i = range_i,
124 .range_j = range_j,
125 .range_k = range_k,
126 .tile_i = tile_i,
127 .tile_j = tile_j,
128 .tile_k = tile_k
129 };
130 pthreadpool_parallelize_1d(threadpool,
131 (pthreadpool_task_1d_t) compute_3d_tiled, &context,
132 tile_range_i * tile_range_j * tile_range_k,
133 0 /* flags */);
134 }
135}
136
137struct compute_4d_tiled_context {
138 pthreadpool_function_4d_tiled_t function;
139 void* argument;
140 struct fxdiv_divisor_size_t tile_range_kl;
141 struct fxdiv_divisor_size_t tile_range_j;
142 struct fxdiv_divisor_size_t tile_range_l;
143 size_t range_i;
144 size_t range_j;
145 size_t range_k;
146 size_t range_l;
147 size_t tile_i;
148 size_t tile_j;
149 size_t tile_k;
150 size_t tile_l;
151};
152
153static void compute_4d_tiled(const struct compute_4d_tiled_context* context, size_t linear_index) {
154 const struct fxdiv_divisor_size_t tile_range_kl = context->tile_range_kl;
155 const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl);
156 const struct fxdiv_divisor_size_t tile_range_j = context->tile_range_j;
157 const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, tile_range_j);
158 const struct fxdiv_divisor_size_t tile_range_l = context->tile_range_l;
159 const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
160 const size_t max_tile_i = context->tile_i;
161 const size_t max_tile_j = context->tile_j;
162 const size_t max_tile_k = context->tile_k;
163 const size_t max_tile_l = context->tile_l;
164 const size_t index_i = tile_index_i_j.quotient * max_tile_i;
165 const size_t index_j = tile_index_i_j.remainder * max_tile_j;
166 const size_t index_k = tile_index_k_l.quotient * max_tile_k;
167 const size_t index_l = tile_index_k_l.remainder * max_tile_l;
168 const size_t tile_i = min(max_tile_i, context->range_i - index_i);
169 const size_t tile_j = min(max_tile_j, context->range_j - index_j);
170 const size_t tile_k = min(max_tile_k, context->range_k - index_k);
171 const size_t tile_l = min(max_tile_l, context->range_l - index_l);
172 context->function(context->argument, index_i, index_j, index_k, index_l, tile_i, tile_j, tile_k, tile_l);
173}
174
175void pthreadpool_compute_4d_tiled(
176 pthreadpool_t threadpool,
177 pthreadpool_function_4d_tiled_t function,
178 void* argument,
179 size_t range_i,
180 size_t range_j,
181 size_t range_k,
182 size_t range_l,
183 size_t tile_i,
184 size_t tile_j,
185 size_t tile_k,
186 size_t tile_l)
187{
188 if (pthreadpool_get_threads_count(threadpool) <= 1) {
189 /* No thread pool used: execute function sequentially on the calling thread */
190 for (size_t i = 0; i < range_i; i += tile_i) {
191 for (size_t j = 0; j < range_j; j += tile_j) {
192 for (size_t k = 0; k < range_k; k += tile_k) {
193 for (size_t l = 0; l < range_l; l += tile_l) {
194 function(argument, i, j, k, l,
195 min(range_i - i, tile_i), min(range_j - j, tile_j), min(range_k - k, tile_k), min(range_l - l, tile_l));
196 }
197 }
198 }
199 }
200 } else {
201 /* Execute in parallel on the thread pool using linearized index */
202 const size_t tile_range_i = divide_round_up(range_i, tile_i);
203 const size_t tile_range_j = divide_round_up(range_j, tile_j);
204 const size_t tile_range_k = divide_round_up(range_k, tile_k);
205 const size_t tile_range_l = divide_round_up(range_l, tile_l);
206 struct compute_4d_tiled_context context = {
207 .function = function,
208 .argument = argument,
209 .tile_range_kl = fxdiv_init_size_t(tile_range_k * tile_range_l),
210 .tile_range_j = fxdiv_init_size_t(tile_range_j),
211 .tile_range_l = fxdiv_init_size_t(tile_range_l),
212 .range_i = range_i,
213 .range_j = range_j,
214 .range_k = range_k,
215 .range_l = range_l,
216 .tile_i = tile_i,
217 .tile_j = tile_j,
218 .tile_k = tile_k,
219 .tile_l = tile_l
220 };
221 pthreadpool_parallelize_1d(threadpool,
222 (pthreadpool_task_1d_t) compute_4d_tiled, &context,
223 tile_range_i * tile_range_j * tile_range_k * tile_range_l,
224 0 /* flags */);
225 }
226}
227