1 | /* Standard C headers */ |
2 | #include <stddef.h> |
3 | |
4 | /* Dependencies */ |
5 | #include <fxdiv.h> |
6 | |
7 | /* Public library header */ |
8 | #include <pthreadpool.h> |
9 | |
10 | /* Internal library headers */ |
11 | #include "threadpool-utils.h" |
12 | |
13 | |
14 | void pthreadpool_compute_1d( |
15 | pthreadpool_t threadpool, |
16 | pthreadpool_function_1d_t function, |
17 | void* argument, |
18 | size_t range) |
19 | { |
20 | pthreadpool_parallelize_1d(threadpool, |
21 | (pthreadpool_task_1d_t) function, argument, |
22 | range, 0 /* flags */); |
23 | } |
24 | |
25 | void pthreadpool_compute_1d_tiled( |
26 | pthreadpool_t threadpool, |
27 | pthreadpool_function_1d_tiled_t function, |
28 | void* argument, |
29 | size_t range, |
30 | size_t tile) |
31 | { |
32 | pthreadpool_parallelize_1d_tile_1d(threadpool, |
33 | (pthreadpool_task_1d_tile_1d_t) function, argument, |
34 | range, tile, 0 /* flags */); |
35 | } |
36 | |
37 | void pthreadpool_compute_2d( |
38 | pthreadpool_t threadpool, |
39 | pthreadpool_function_2d_t function, |
40 | void* argument, |
41 | size_t range_i, |
42 | size_t range_j) |
43 | { |
44 | pthreadpool_parallelize_2d(threadpool, |
45 | (pthreadpool_task_2d_t) function, argument, |
46 | range_i, range_j, 0 /* flags */); |
47 | } |
48 | |
49 | void pthreadpool_compute_2d_tiled( |
50 | pthreadpool_t threadpool, |
51 | pthreadpool_function_2d_tiled_t function, |
52 | void* argument, |
53 | size_t range_i, |
54 | size_t range_j, |
55 | size_t tile_i, |
56 | size_t tile_j) |
57 | { |
58 | pthreadpool_parallelize_2d_tile_2d(threadpool, |
59 | (pthreadpool_task_2d_tile_2d_t) function, argument, |
60 | range_i, range_j, tile_i, tile_j, 0 /* flags */); |
61 | } |
62 | |
63 | struct compute_3d_tiled_context { |
64 | pthreadpool_function_3d_tiled_t function; |
65 | void* argument; |
66 | struct fxdiv_divisor_size_t tile_range_j; |
67 | struct fxdiv_divisor_size_t tile_range_k; |
68 | size_t range_i; |
69 | size_t range_j; |
70 | size_t range_k; |
71 | size_t tile_i; |
72 | size_t tile_j; |
73 | size_t tile_k; |
74 | }; |
75 | |
76 | static void compute_3d_tiled(const struct compute_3d_tiled_context* context, size_t linear_index) { |
77 | const struct fxdiv_divisor_size_t tile_range_k = context->tile_range_k; |
78 | const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k); |
79 | const struct fxdiv_divisor_size_t tile_range_j = context->tile_range_j; |
80 | const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j); |
81 | const size_t max_tile_i = context->tile_i; |
82 | const size_t max_tile_j = context->tile_j; |
83 | const size_t max_tile_k = context->tile_k; |
84 | const size_t index_i = tile_index_i_j.quotient * max_tile_i; |
85 | const size_t index_j = tile_index_i_j.remainder * max_tile_j; |
86 | const size_t index_k = tile_index_ij_k.remainder * max_tile_k; |
87 | const size_t tile_i = min(max_tile_i, context->range_i - index_i); |
88 | const size_t tile_j = min(max_tile_j, context->range_j - index_j); |
89 | const size_t tile_k = min(max_tile_k, context->range_k - index_k); |
90 | context->function(context->argument, index_i, index_j, index_k, tile_i, tile_j, tile_k); |
91 | } |
92 | |
93 | void pthreadpool_compute_3d_tiled( |
94 | pthreadpool_t threadpool, |
95 | pthreadpool_function_3d_tiled_t function, |
96 | void* argument, |
97 | size_t range_i, |
98 | size_t range_j, |
99 | size_t range_k, |
100 | size_t tile_i, |
101 | size_t tile_j, |
102 | size_t tile_k) |
103 | { |
104 | if (pthreadpool_get_threads_count(threadpool) <= 1) { |
105 | /* No thread pool used: execute function sequentially on the calling thread */ |
106 | for (size_t i = 0; i < range_i; i += tile_i) { |
107 | for (size_t j = 0; j < range_j; j += tile_j) { |
108 | for (size_t k = 0; k < range_k; k += tile_k) { |
109 | function(argument, i, j, k, min(range_i - i, tile_i), min(range_j - j, tile_j), min(range_k - k, tile_k)); |
110 | } |
111 | } |
112 | } |
113 | } else { |
114 | /* Execute in parallel on the thread pool using linearized index */ |
115 | const size_t tile_range_i = divide_round_up(range_i, tile_i); |
116 | const size_t tile_range_j = divide_round_up(range_j, tile_j); |
117 | const size_t tile_range_k = divide_round_up(range_k, tile_k); |
118 | struct compute_3d_tiled_context context = { |
119 | .function = function, |
120 | .argument = argument, |
121 | .tile_range_j = fxdiv_init_size_t(tile_range_j), |
122 | .tile_range_k = fxdiv_init_size_t(tile_range_k), |
123 | .range_i = range_i, |
124 | .range_j = range_j, |
125 | .range_k = range_k, |
126 | .tile_i = tile_i, |
127 | .tile_j = tile_j, |
128 | .tile_k = tile_k |
129 | }; |
130 | pthreadpool_parallelize_1d(threadpool, |
131 | (pthreadpool_task_1d_t) compute_3d_tiled, &context, |
132 | tile_range_i * tile_range_j * tile_range_k, |
133 | 0 /* flags */); |
134 | } |
135 | } |
136 | |
137 | struct compute_4d_tiled_context { |
138 | pthreadpool_function_4d_tiled_t function; |
139 | void* argument; |
140 | struct fxdiv_divisor_size_t tile_range_kl; |
141 | struct fxdiv_divisor_size_t tile_range_j; |
142 | struct fxdiv_divisor_size_t tile_range_l; |
143 | size_t range_i; |
144 | size_t range_j; |
145 | size_t range_k; |
146 | size_t range_l; |
147 | size_t tile_i; |
148 | size_t tile_j; |
149 | size_t tile_k; |
150 | size_t tile_l; |
151 | }; |
152 | |
153 | static void compute_4d_tiled(const struct compute_4d_tiled_context* context, size_t linear_index) { |
154 | const struct fxdiv_divisor_size_t tile_range_kl = context->tile_range_kl; |
155 | const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl); |
156 | const struct fxdiv_divisor_size_t tile_range_j = context->tile_range_j; |
157 | const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, tile_range_j); |
158 | const struct fxdiv_divisor_size_t tile_range_l = context->tile_range_l; |
159 | const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l); |
160 | const size_t max_tile_i = context->tile_i; |
161 | const size_t max_tile_j = context->tile_j; |
162 | const size_t max_tile_k = context->tile_k; |
163 | const size_t max_tile_l = context->tile_l; |
164 | const size_t index_i = tile_index_i_j.quotient * max_tile_i; |
165 | const size_t index_j = tile_index_i_j.remainder * max_tile_j; |
166 | const size_t index_k = tile_index_k_l.quotient * max_tile_k; |
167 | const size_t index_l = tile_index_k_l.remainder * max_tile_l; |
168 | const size_t tile_i = min(max_tile_i, context->range_i - index_i); |
169 | const size_t tile_j = min(max_tile_j, context->range_j - index_j); |
170 | const size_t tile_k = min(max_tile_k, context->range_k - index_k); |
171 | const size_t tile_l = min(max_tile_l, context->range_l - index_l); |
172 | context->function(context->argument, index_i, index_j, index_k, index_l, tile_i, tile_j, tile_k, tile_l); |
173 | } |
174 | |
175 | void pthreadpool_compute_4d_tiled( |
176 | pthreadpool_t threadpool, |
177 | pthreadpool_function_4d_tiled_t function, |
178 | void* argument, |
179 | size_t range_i, |
180 | size_t range_j, |
181 | size_t range_k, |
182 | size_t range_l, |
183 | size_t tile_i, |
184 | size_t tile_j, |
185 | size_t tile_k, |
186 | size_t tile_l) |
187 | { |
188 | if (pthreadpool_get_threads_count(threadpool) <= 1) { |
189 | /* No thread pool used: execute function sequentially on the calling thread */ |
190 | for (size_t i = 0; i < range_i; i += tile_i) { |
191 | for (size_t j = 0; j < range_j; j += tile_j) { |
192 | for (size_t k = 0; k < range_k; k += tile_k) { |
193 | for (size_t l = 0; l < range_l; l += tile_l) { |
194 | function(argument, i, j, k, l, |
195 | min(range_i - i, tile_i), min(range_j - j, tile_j), min(range_k - k, tile_k), min(range_l - l, tile_l)); |
196 | } |
197 | } |
198 | } |
199 | } |
200 | } else { |
201 | /* Execute in parallel on the thread pool using linearized index */ |
202 | const size_t tile_range_i = divide_round_up(range_i, tile_i); |
203 | const size_t tile_range_j = divide_round_up(range_j, tile_j); |
204 | const size_t tile_range_k = divide_round_up(range_k, tile_k); |
205 | const size_t tile_range_l = divide_round_up(range_l, tile_l); |
206 | struct compute_4d_tiled_context context = { |
207 | .function = function, |
208 | .argument = argument, |
209 | .tile_range_kl = fxdiv_init_size_t(tile_range_k * tile_range_l), |
210 | .tile_range_j = fxdiv_init_size_t(tile_range_j), |
211 | .tile_range_l = fxdiv_init_size_t(tile_range_l), |
212 | .range_i = range_i, |
213 | .range_j = range_j, |
214 | .range_k = range_k, |
215 | .range_l = range_l, |
216 | .tile_i = tile_i, |
217 | .tile_j = tile_j, |
218 | .tile_k = tile_k, |
219 | .tile_l = tile_l |
220 | }; |
221 | pthreadpool_parallelize_1d(threadpool, |
222 | (pthreadpool_task_1d_t) compute_4d_tiled, &context, |
223 | tile_range_i * tile_range_j * tile_range_k * tile_range_l, |
224 | 0 /* flags */); |
225 | } |
226 | } |
227 | |