1 | #include <stdbool.h> |
2 | #include <stdint.h> |
3 | #include <stddef.h> |
4 | #include <string.h> |
5 | |
6 | #include <fxdiv.h> |
7 | |
8 | #include <nnpack.h> |
9 | #include <nnpack/macros.h> |
10 | #include <nnpack/utils.h> |
11 | #include <nnpack/system.h> |
12 | |
13 | #include <nnpack/hwinfo.h> |
14 | #include <nnpack/activations.h> |
15 | #include <nnpack/validation.h> |
16 | |
17 | |
/*
 * Per-pass parameters for compute_kernel_transform: converts a block of
 * convolution kernels into the transform domain via transform_function.
 */
struct NNP_CACHE_ALIGN kernel_transform_context {
	nnp_transform_2d_with_offset transform_function; /* 2D tile transform applied to each kernel */
	const float* kernel;      /* source kernels, laid out [output_channels][input_channels][kernel h * w] */
	void* kernel_transform;   /* destination buffer for transformed kernel tuples */

	size_t tuple_size;                /* bytes per tuple of transform coefficients */
	size_t input_channels;            /* total input channels (middle dimension of kernel array) */
	size_t input_channels_block_size; /* input channels in the block currently being transformed */
	size_t output_channels;           /* total output channels (used in the output stride) */
	struct nnp_size kernel_size;      /* kernel height/width in elements */
};
29 | |
30 | static void compute_kernel_transform( |
31 | const struct kernel_transform_context context[restrict static 1], |
32 | size_t output_channels_subblock_start, size_t input_channels_block_offset, |
33 | size_t output_channels_subblock_size, size_t input_channels_block_increment) |
34 | { |
35 | const size_t tuple_size = context->tuple_size; |
36 | const size_t input_channels = context->input_channels; |
37 | const size_t input_channels_block_size = context->input_channels_block_size; |
38 | const size_t output_channels = context->output_channels; |
39 | const struct nnp_size kernel_size = context->kernel_size; |
40 | |
41 | const float (*kernel)[input_channels][kernel_size.width * kernel_size.height] = |
42 | (const float(*)[input_channels][kernel_size.width * kernel_size.height]) context->kernel; |
43 | void* kernel_transform = context->kernel_transform; |
44 | nnp_transform_2d_with_offset transform_function = context->transform_function; |
45 | |
46 | for (size_t output_channels_subblock_offset = 0; output_channels_subblock_offset < output_channels_subblock_size; output_channels_subblock_offset += 1) { |
47 | const size_t output_channel = output_channels_subblock_start + output_channels_subblock_offset; |
48 | transform_function( |
49 | kernel[output_channel][input_channels_block_offset], |
50 | kernel_transform + |
51 | (output_channels_subblock_start * input_channels_block_size + input_channels_block_offset * output_channels_subblock_size + output_channels_subblock_offset) * tuple_size, |
52 | kernel_size.width, |
53 | input_channels_block_size * output_channels * tuple_size, |
54 | kernel_size.height, kernel_size.width, 0, 0); |
55 | } |
56 | } |
57 | |
/*
 * Per-pass parameters for compute_input_transform: converts input image tiles
 * (one per output tile position) into the transform domain.
 */
struct NNP_CACHE_ALIGN input_transform_context {
	const float* input;       /* input image, laid out [channels][height][width] */
	void* input_transform;    /* destination buffer for transformed input tuples */
	nnp_transform_2d_with_offset transform_function; /* 2D tile transform to apply */

	const size_t tuple_size;  /* bytes per tuple of transform coefficients */
	const size_t tiles_count; /* total number of tiles (tiles_x_count * tiles_y_count) */
	const struct fxdiv_divisor_size_t tiles_x_count; /* precomputed divisor: tiles per row */
	const size_t input_channels_block_start; /* first channel of the current block */
	const size_t input_channels_block_size;  /* channels in the current block */
	const struct nnp_size input_size;        /* input image height/width */
	const size_t input_padding_left;
	const size_t input_padding_top;
	const struct nnp_size input_tile;        /* transform tile size */
	const struct nnp_size input_tile_step;   /* stride between consecutive tiles in the input */
};
74 | |
/*
 * Transforms a subblock of input tiles for one input channel within the
 * current input-channel block.
 *
 * Each linear tile index is split into (tile_y, tile_x) with the precomputed
 * fxdiv divisor. Implicit zero-padding is handled by clamping the source
 * window against the image bounds and passing the resulting row/column
 * offsets and counts to the transform function, which zero-fills the rest.
 */
static void compute_input_transform(
	const struct input_transform_context context[restrict static 1],
	size_t input_channels_block_offset, size_t tiles_subblock_start,
	size_t input_channels_block_range, size_t tiles_subblock_size)
{
	const size_t tuple_size = context->tuple_size;
	const size_t tiles_count = context->tiles_count;
	const struct fxdiv_divisor_size_t tiles_x_count = context->tiles_x_count;
	const size_t input_channels_block_start = context->input_channels_block_start;
	const size_t input_channels_block_size = context->input_channels_block_size;
	const struct nnp_size input_size = context->input_size;
	const size_t input_padding_left = context->input_padding_left;
	const size_t input_padding_top = context->input_padding_top;
	const struct nnp_size input_tile = context->input_tile;
	const struct nnp_size input_tile_step = context->input_tile_step;

	/* View the flat input pointer as [channels][height][width]. */
	const float (*input)[input_size.height][input_size.width] =
		(const float(*)[input_size.height][input_size.width]) context->input;
	void* input_transform = context->input_transform;
	nnp_transform_2d_with_offset transform_function = context->transform_function;

	const size_t input_channel = input_channels_block_start + input_channels_block_offset;
	for (size_t tiles_subblock_offset = 0; tiles_subblock_offset < tiles_subblock_size; tiles_subblock_offset += 1) {
		const size_t tile = tiles_subblock_start + tiles_subblock_offset;
		/* Split the linear tile index into row (quotient) and column (remainder). */
		const struct fxdiv_result_size_t tile_xy = fxdiv_divide_size_t(tile, tiles_x_count);
		const size_t tile_x = tile_xy.remainder;
		const size_t tile_y = tile_xy.quotient;

		/* Top-left corner of this tile in padded-input coordinates. */
		const size_t output_x = tile_x * input_tile_step.width;
		const size_t output_y = tile_y * input_tile_step.height;

		/* doz(a, b) = max(a - b, 0): clamp the source position at the image edge. */
		const size_t input_x = min(doz(output_x, input_padding_left), input_size.width);
		const size_t input_y = min(doz(output_y, input_padding_top), input_size.height);

		/* How many padding rows/columns precede the real data, and how much real data fits. */
		const size_t row_offset = doz(input_padding_top, output_y);
		const size_t row_count = min(input_size.height - input_y, input_tile.height - row_offset);
		const size_t column_offset = doz(input_padding_left, output_x);
		const size_t column_count = min(input_size.width - input_x, input_tile.width - column_offset);

		transform_function(
			&input[input_channel][input_y][input_x],
			/* Packed layout: tiles subblock-major, channel within subblock. */
			input_transform + (tiles_subblock_start * input_channels_block_size + input_channels_block_offset * tiles_subblock_size + tiles_subblock_offset) * tuple_size,
			input_size.width,
			/* Distance between consecutive tuples of the same transformed tile. */
			input_channels_block_size * tiles_count * tuple_size,
			row_count, column_count, row_offset, column_offset);
	}
}
122 | |
/*
 * Per-pass parameters for compute_output_transform: converts accumulated
 * transform-domain results back to the spatial domain and adds the bias.
 */
struct NNP_CACHE_ALIGN output_transform_context {
	nnp_transform_2d_with_bias transform_function; /* inverse 2D tile transform (applies bias) */
	float* output;                 /* output image, laid out [channels][height][width] */
	const void* output_transform;  /* source buffer of transform-domain tuples */
	const float* bias;             /* one bias value per output channel */

	size_t tuple_size;   /* bytes per tuple of transform coefficients */
	size_t tiles_count;  /* total number of output tiles */
	struct fxdiv_divisor_size_t tiles_x_count;   /* precomputed divisor: tiles per row */
	struct fxdiv_divisor_size_t tiles_block_max; /* tiles block size used by the GEMM packing */
	size_t output_channels;
	struct nnp_size output_size;  /* output image height/width */
	struct nnp_size output_tile;  /* spatial size of one output tile */
};
137 | |
/*
 * Inverse-transforms a subblock of output tiles for a subblock of output
 * channels, writing spatial-domain results (plus bias) into the output image.
 *
 * The source offset mirrors the packed layout produced by
 * compute_tuple_multiplication: tiles are grouped into blocks of
 * tiles_block_max, with channel subblocks interleaved inside each block.
 */
static void compute_output_transform(
	const struct output_transform_context context[restrict static 1],
	size_t output_channels_subblock_start, size_t tiles_subblock_start,
	size_t output_channels_subblock_size, size_t tiles_subblock_size)
{
	const size_t tuple_size = context->tuple_size;
	const size_t tiles_count = context->tiles_count;
	const struct fxdiv_divisor_size_t tiles_x_count = context->tiles_x_count;
	const struct fxdiv_divisor_size_t tiles_block_max = context->tiles_block_max;
	const size_t output_channels = context->output_channels;
	const struct nnp_size output_size = context->output_size;
	const struct nnp_size output_tile = context->output_tile;

	/* Recover which tiles block this subblock belongs to, and that block's actual size. */
	const size_t tiles_block_start = fxdiv_round_down_size_t(tiles_subblock_start, tiles_block_max);
	const size_t tiles_block_size = min(tiles_count - tiles_block_start, tiles_block_max.value);

	/* View the flat output pointer as [channels][height][width]. */
	float (*output)[output_size.height][output_size.width] =
		(float(*)[output_size.height][output_size.width]) context->output;
	const void* output_transform = context->output_transform;
	const float* bias = context->bias;
	nnp_transform_2d_with_bias transform_function = context->transform_function;

	for (size_t tiles_subblock_offset = 0; tiles_subblock_offset < tiles_subblock_size; tiles_subblock_offset += 1) {
		const size_t tile = tiles_subblock_start + tiles_subblock_offset;
		/* Split the linear tile index into row (quotient) and column (remainder). */
		const struct fxdiv_result_size_t tile_xy = fxdiv_divide_size_t(tile, tiles_x_count);
		const size_t tile_x = tile_xy.remainder;
		const size_t tile_y = tile_xy.quotient;

		/* Top-left corner of this tile in the output image. */
		const size_t output_x = tile_x * output_tile.width;
		const size_t output_y = tile_y * output_tile.height;

		for (size_t output_channels_subblock_offset = 0; output_channels_subblock_offset < output_channels_subblock_size; output_channels_subblock_offset += 1) {
			const size_t output_channel = output_channels_subblock_start + output_channels_subblock_offset;
			transform_function(
				/* Packed GEMM-output layout: tiles block, then channel subblock, then tile within subblock. */
				output_transform +
					(tiles_block_start * output_channels + output_channels_subblock_start * tiles_block_size + ((tiles_subblock_start - tiles_block_start) + tiles_subblock_offset) * output_channels_subblock_size + output_channels_subblock_offset) * tuple_size,
				&output[output_channel][output_y][output_x],
				&bias[output_channel],
				/* Distance between consecutive tuples of the same tile/channel pair. */
				tiles_count * output_channels * tuple_size,
				output_size.width,
				/* Clip the last row/column of tiles to the image boundary. */
				min(output_tile.height, output_size.height - output_y),
				min(output_tile.width, output_size.width - output_x));
		}
	}
}
183 | |
/*
 * Per-pass parameters for compute_tuple_multiplication: transform-domain
 * tuple GEMM between transformed input tiles and transformed kernels.
 */
struct NNP_CACHE_ALIGN tuple_multiplication_context {
	size_t tuple_elements; /* elements per tuple (simd_width, doubled for complex FFT tuples) */
	size_t tuple_size;     /* bytes per tuple (tuple_elements * element size) */
	size_t tiles_subblock_max;        /* GEMM micro-kernel MR */
	size_t input_channels_block_size; /* reduction (K) dimension of this block */
	size_t input_channels_block_start;
	size_t output_channels;
	size_t output_channels_subblock_max; /* GEMM micro-kernel NR */
	size_t output_channels_block_start;

	const void* input_transform;  /* transformed input tiles (A panel) */
	const void* kernel_transform; /* transformed kernels (B panel) */
	void* output_transform;       /* transform-domain accumulators (C panel) */

	nnp_fast_tuple_gemm_function fast_gemm; /* fixed MRxNR micro-kernel */
	nnp_full_tuple_gemm_function full_gemm; /* variable-size (up to MRxNR) micro-kernel */
};
201 | |
/*
 * Multiplies a block of transformed input tiles by a subblock of transformed
 * kernels, accumulating into the transform-domain output buffer.
 *
 * When the channel subblock is full-width (== NR), full MR-sized tile
 * subblocks are dispatched to the fast fixed-size micro-kernel; any remainder
 * (and all partial channel subblocks) go through the variable-size kernel.
 * tiles_block_size is consumed as a running count of unprocessed tiles.
 */
static void compute_tuple_multiplication(
	const struct tuple_multiplication_context context[restrict static 1],
	size_t tiles_block_start, size_t output_channels_subblock_start,
	size_t tiles_block_size, size_t output_channels_subblock_size)
{
	const size_t tuple_elements = context->tuple_elements;
	const size_t tuple_size = context->tuple_size;
	const size_t tiles_subblock_max = context->tiles_subblock_max;
	const size_t input_channels_block_size = context->input_channels_block_size;
	const size_t input_channels_block_start = context->input_channels_block_start;
	const size_t output_channels = context->output_channels;
	const size_t output_channels_subblock_max = context->output_channels_subblock_max;
	const size_t output_channels_block_start = context->output_channels_block_start;

	/* A panel: first tile of this tiles block. */
	const void* input_transform = context->input_transform +
		tiles_block_start * input_channels_block_size * tuple_size;
	/* B panel: first kernel of this channel subblock. */
	const void* kernel_transform = context->kernel_transform +
		(output_channels_block_start + output_channels_subblock_start) * input_channels_block_size * tuple_size;
	/* C panel: block-major packed layout (see compute_output_transform for the reader side). */
	void* output_transform = context->output_transform +
		(tiles_block_start * output_channels + (output_channels_block_start + output_channels_subblock_start) * tiles_block_size) * tuple_size;

	if (output_channels_subblock_size == output_channels_subblock_max) {
		/* Fast path: full NR-wide subblock, consume full MR-sized tile subblocks. */
		const nnp_fast_tuple_gemm_function fast_gemm = context->fast_gemm;
		while (tiles_block_size >= tiles_subblock_max) {
			tiles_block_size -= tiles_subblock_max;

			fast_gemm(
				input_channels_block_size, input_channels_block_start,
				input_transform, kernel_transform, output_transform,
				output_channels_subblock_size * tuple_elements);

			input_transform += tiles_subblock_max * input_channels_block_size * tuple_size;
			output_transform += tiles_subblock_max * output_channels_subblock_size * tuple_size;
		}
	}

	/* Generic path: remaining (possibly partial) tile subblocks. */
	const nnp_full_tuple_gemm_function full_gemm = context->full_gemm;
	while (tiles_block_size != 0) {
		const size_t tiles_subblock_size = min(tiles_block_size, tiles_subblock_max);
		tiles_block_size -= tiles_subblock_size;

		full_gemm(
			tiles_subblock_size, output_channels_subblock_size,
			input_channels_block_size, input_channels_block_start,
			input_transform, kernel_transform, output_transform,
			output_channels_subblock_size * tuple_elements);

		/* Advance by the full subblock stride; only the last iteration is partial. */
		input_transform += tiles_subblock_max * input_channels_block_size * tuple_size;
		output_transform += tiles_subblock_max * output_channels_subblock_size * tuple_size;
	}
}
253 | |
/*
 * Per-pass parameters for compute_kernel_packing (implicit-GEMM path):
 * repacks one reduction block of the kernel matrix for the SGEMM micro-kernel.
 */
struct NNP_CACHE_ALIGN kernel_packing_context {
	const float* kernel;  /* kernel matrix, [output_channels][reduction_size] */
	float* packed_kernel; /* destination panel for the current reduction block */

	size_t reduction_size;        /* full reduction length (input_channels * kernel h * w) */
	size_t reduction_block_start; /* first reduction index of the current block */
	size_t reduction_block_size;  /* reduction elements in the current block */
};
262 | |
263 | static void compute_kernel_packing( |
264 | const struct kernel_packing_context context[restrict static 1], |
265 | size_t output_channels_subblock_start, size_t reduction_block_offset, |
266 | size_t output_channels_subblock_size, size_t reduction_block_range) |
267 | { |
268 | const size_t reduction_size = context->reduction_size; |
269 | const size_t reduction_block_start = context->reduction_block_start; |
270 | const size_t reduction_block_size = context->reduction_block_size; |
271 | |
272 | const float* kernel = context->kernel + |
273 | output_channels_subblock_start * reduction_size + reduction_block_offset; |
274 | float* packed_kernel = context->packed_kernel + |
275 | output_channels_subblock_start * reduction_block_size + reduction_block_offset * output_channels_subblock_size; |
276 | |
277 | for (size_t output_channels_subblock_offset = 0; output_channels_subblock_offset < output_channels_subblock_size; output_channels_subblock_offset += 1) { |
278 | packed_kernel[output_channels_subblock_offset] = kernel[output_channels_subblock_offset * reduction_size]; |
279 | } |
280 | } |
281 | |
/*
 * Per-pass parameters for compute_input_packing (implicit-GEMM path):
 * gathers input pixels, with implicit zero padding, into a packed panel.
 */
struct NNP_CACHE_ALIGN input_packing_context {
	const float* input;  /* input image, laid out [channels][height][width] */
	float* packed_input; /* destination panel for the current reduction/image block */

	size_t simd_width;            /* SIMD lane count; subblocks are padded to this stride */
	size_t reduction_block_start; /* first reduction index of the current block */
	size_t reduction_block_size;  /* reduction elements in the current block */
	size_t output_image_block_start; /* first output pixel of the current image block */
	struct nnp_size input_size;      /* input image height/width */
	size_t input_padding_top;
	size_t input_padding_left;
	struct fxdiv_divisor_size_t kernel_elements; /* divisor: kernel h * w (splits reduction index) */
	struct fxdiv_divisor_size_t kernel_width;    /* divisor: splits kernel element into (y, x) */
	struct fxdiv_divisor_size_t output_width;    /* divisor: splits output pixel into (y, x) */
	struct nnp_size output_subsampling;          /* convolution stride */
};
298 | |
/*
 * Packs one reduction element for a subblock of output pixels.
 *
 * The absolute reduction index decodes to (input_channel, kernel_y, kernel_x);
 * each output pixel decodes to (output_y, output_x). The corresponding input
 * coordinate may fall in the padding region: the subtraction of the padding is
 * done in unsigned arithmetic, so out-of-range coordinates wrap to huge values
 * and fail the bounds check, storing zero instead.
 */
static void compute_input_packing(
	const struct input_packing_context context[restrict static 1],
	size_t reduction_block_offset, size_t output_image_subblock_start,
	size_t reduction_block_range, size_t output_image_subblock_size)
{
	const size_t simd_width = context->simd_width;
	const size_t reduction_block_start = context->reduction_block_start;
	const size_t reduction_block_size = context->reduction_block_size;
	const size_t output_image_block_start = context->output_image_block_start;
	const struct nnp_size input_size = context->input_size;
	const size_t input_padding_top = context->input_padding_top;
	const size_t input_padding_left = context->input_padding_left;
	const struct fxdiv_divisor_size_t kernel_elements = context->kernel_elements;
	const struct fxdiv_divisor_size_t kernel_width = context->kernel_width;
	const struct fxdiv_divisor_size_t output_width = context->output_width;
	const struct nnp_size output_subsampling = context->output_subsampling;

	/* View the flat input pointer as [channels][height][width]. */
	const float (*input)[input_size.height][input_size.width] =
		(const float(*)[input_size.height][input_size.width]) context->input;
	float* packed_input = context->packed_input;

	/* Rows in the packed panel are padded to a SIMD-friendly stride. */
	const size_t output_image_subblock_stride = round_up_by_power_of_2(output_image_subblock_size, simd_width);

	/* Decode the absolute reduction index into (channel, kernel_y, kernel_x). */
	const size_t reduction_index = reduction_block_start + reduction_block_offset;
	const struct fxdiv_result_size_t reduction_index_divmod = fxdiv_divide_size_t(reduction_index, kernel_elements);
	const size_t input_channel = reduction_index_divmod.quotient;
	const struct fxdiv_result_size_t kernel_xy = fxdiv_divide_size_t(reduction_index_divmod.remainder, kernel_width);
	const size_t kernel_y = kernel_xy.quotient;
	const size_t kernel_x = kernel_xy.remainder;

	for (size_t output_image_subblock_offset = 0; output_image_subblock_offset < output_image_subblock_size; output_image_subblock_offset += 1) {
		/* Decode the linear output pixel index into (output_y, output_x). */
		const size_t output_image_index = output_image_block_start + output_image_subblock_start + output_image_subblock_offset;
		const struct fxdiv_result_size_t output_xy = fxdiv_divide_size_t(output_image_index, output_width);
		const size_t output_y = output_xy.quotient;
		const size_t output_x = output_xy.remainder;

		/* Unsigned wraparound: positions in the padding become values >= input_size. */
		const size_t input_y = output_y * output_subsampling.height + kernel_y - input_padding_top;
		const size_t input_x = output_x * output_subsampling.width + kernel_x - input_padding_left;

		const size_t packed_index = output_image_subblock_start * reduction_block_size +
			reduction_block_offset * output_image_subblock_stride + output_image_subblock_offset;
		if ((input_x < input_size.width) && (input_y < input_size.height)) {
			packed_input[packed_index] = input[input_channel][input_y][input_x];
		} else {
			/* Padding region: implicit zero. */
			packed_input[packed_index] = 0.0f;
		}
	}
}
347 | |
/*
 * Per-pass parameters for compute_matrix_multiplication (implicit-GEMM path):
 * packed-kernel x packed-input SGEMM accumulating into the output image.
 */
struct NNP_CACHE_ALIGN matrix_multiplication_context {
	const float* packed_kernel; /* A panels (packed by compute_kernel_packing) */
	const float* packed_input;  /* B panels (packed by compute_input_packing) */
	float* output;              /* output image, [output_channels][output_image_size] */

	size_t reduction_block_start; /* first reduction index of the current block (for accumulate flag) */
	size_t reduction_block_size;  /* K dimension of this GEMM */
	size_t output_image_size;     /* pixels per output channel (row stride of output) */
	size_t output_image_block_start;
	size_t output_image_subblock_max;    /* SGEMM micro-kernel NR */
	size_t output_channels_subblock_max; /* SGEMM micro-kernel MR */
};
360 | |
361 | static void compute_matrix_multiplication( |
362 | const struct matrix_multiplication_context context[restrict static 1], |
363 | size_t output_channels_block_start, size_t output_image_subblock_start, |
364 | size_t output_channels_block_size, size_t output_image_subblock_size) |
365 | { |
366 | const size_t reduction_block_start = context->reduction_block_start; |
367 | const size_t reduction_block_size = context->reduction_block_size; |
368 | const size_t output_image_size = context->output_image_size; |
369 | const size_t output_image_block_start = context->output_image_block_start; |
370 | const size_t output_image_subblock_max = context->output_image_subblock_max; |
371 | const size_t output_channels_subblock_max = context->output_channels_subblock_max; |
372 | |
373 | const float* packed_kernel = context->packed_kernel + |
374 | output_channels_block_start * reduction_block_size; |
375 | const float* packed_input = context->packed_input + |
376 | output_image_subblock_start * reduction_block_size; |
377 | float* output = context->output + |
378 | output_channels_block_start * output_image_size + output_image_block_start + output_image_subblock_start; |
379 | |
380 | if (output_image_subblock_size == output_image_subblock_max) { |
381 | const nnp_fast_sgemm_function fast_gemm = nnp_hwinfo.sgemm.only_mr_x_nr; |
382 | while (output_channels_block_size >= output_channels_subblock_max) { |
383 | output_channels_block_size -= output_channels_subblock_max; |
384 | |
385 | fast_gemm( |
386 | reduction_block_size, reduction_block_start, |
387 | packed_kernel, packed_input, output, |
388 | output_image_size); |
389 | |
390 | packed_kernel += reduction_block_size * output_channels_subblock_max; |
391 | output += output_image_size * output_channels_subblock_max; |
392 | } |
393 | } |
394 | |
395 | const nnp_full_sgemm_function full_gemm = nnp_hwinfo.sgemm.upto_mr_x_nr; |
396 | while (output_channels_block_size != 0) { |
397 | const size_t output_channels_subblock_size = min(output_channels_block_size, output_channels_subblock_max); |
398 | output_channels_block_size -= output_channels_subblock_size; |
399 | |
400 | full_gemm( |
401 | output_channels_subblock_size, output_image_subblock_size, |
402 | reduction_block_size, reduction_block_start, |
403 | packed_kernel, packed_input, output, |
404 | output_image_size); |
405 | |
406 | packed_kernel += reduction_block_size * output_channels_subblock_max; |
407 | output += output_image_size * output_channels_subblock_max; |
408 | } |
409 | } |
410 | |
/*
 * Per-pass parameters for compute_direct_convolution: 1x1 direct convolution
 * over whole channel images, no transforms or packing.
 */
struct NNP_CACHE_ALIGN direct_convolution_context {
	const float* input;  /* input, [input_channels][image_elements] */
	const float* kernel; /* kernel, [output_channels][input_channels] */
	float* output;       /* output, [output_channels][image_elements] */

	size_t image_elements; /* pixels per channel image */
	size_t input_channels;
	size_t input_channels_block_max;  /* channel block size for the conv micro-kernels */
	size_t output_channels_block_max; /* full output block size (enables the fast kernel) */

	nnp_fast_conv_function fast_conv; /* fixed-size micro-kernel */
	nnp_full_conv_function full_conv; /* variable-size micro-kernel */
};
424 | |
425 | static void compute_direct_convolution( |
426 | const struct direct_convolution_context context[restrict static 1], |
427 | size_t output_channels_block_start, size_t output_channels_block_size) |
428 | { |
429 | const size_t image_elements = context->image_elements; |
430 | const size_t input_channels = context->input_channels; |
431 | const size_t input_channels_block_max = context->input_channels_block_max; |
432 | const size_t output_channels_block_max = context->output_channels_block_max; |
433 | |
434 | const float* input = context->input; |
435 | const float* kernel = context->kernel + output_channels_block_start * input_channels; |
436 | float* output = context->output + output_channels_block_start * image_elements; |
437 | |
438 | memset(output, 0, sizeof(float) * output_channels_block_size * image_elements); |
439 | |
440 | size_t input_channels_unprocessed = input_channels; |
441 | if (output_channels_block_size == output_channels_block_max) { |
442 | const nnp_fast_conv_function fast_conv = context->fast_conv; |
443 | while (input_channels_unprocessed >= input_channels_block_max) { |
444 | input_channels_unprocessed -= input_channels_block_max; |
445 | |
446 | fast_conv( |
447 | input_channels, image_elements, |
448 | input, kernel, output); |
449 | |
450 | input += input_channels_block_max * image_elements; |
451 | kernel += input_channels_block_max; |
452 | } |
453 | } |
454 | |
455 | const nnp_full_conv_function full_conv = context->full_conv; |
456 | while (input_channels_unprocessed != 0) { |
457 | const size_t input_channels_block_size = min(input_channels_unprocessed, input_channels_block_max); |
458 | input_channels_unprocessed -= input_channels_block_size; |
459 | |
460 | full_conv( |
461 | input_channels_block_size, output_channels_block_size, |
462 | input_channels, image_elements, |
463 | input, kernel, output); |
464 | |
465 | input += input_channels_block_max * image_elements; |
466 | kernel += input_channels_block_max; |
467 | } |
468 | } |
469 | |
470 | static enum nnp_status compute_fast_convolution_inference( |
471 | const bool fourier_transform, |
472 | const enum nnp_convolution_transform_strategy transform_strategy, |
473 | const size_t transform_element_size, |
474 | const size_t input_channels, |
475 | const size_t output_channels, |
476 | const struct nnp_size tile_size, |
477 | const struct nnp_size input_size, |
478 | const struct nnp_padding input_padding, |
479 | const struct nnp_size kernel_size, |
480 | const struct nnp_size output_size, |
481 | const struct nnp_size output_subsampling, |
482 | const float* input, |
483 | const float* kernel, |
484 | const float* bias, |
485 | float* output, |
486 | void* workspace_buffer, |
487 | size_t* workspace_size, |
488 | const nnp_transform_2d_with_offset input_transform_function, |
489 | const nnp_transform_2d_with_offset kernel_transform_function, |
490 | const nnp_transform_2d_with_bias output_transform_function, |
491 | pthreadpool_t threadpool, |
492 | struct nnp_profile* profile) |
493 | { |
494 | void* memory_block = NULL; |
495 | size_t memory_size = 0; |
496 | const size_t simd_width = nnp_hwinfo.simd_width; |
497 | const size_t tuple_elements = (fourier_transform ? simd_width * 2 : simd_width); |
498 | const size_t tuple_size = tuple_elements * transform_element_size; |
499 | const size_t tile_elements = tile_size.height * tile_size.width; |
500 | const size_t tuple_count = tile_elements / tuple_elements; |
501 | |
502 | const struct nnp_size output_tile_size = { |
503 | .width = (tile_size.width - kernel_size.width) / output_subsampling.width + 1, |
504 | .height = (tile_size.height - kernel_size.height) / output_subsampling.height + 1 |
505 | }; |
506 | const struct nnp_size tile_step = { |
507 | .width = tile_size.width - kernel_size.width + 1, |
508 | .height = tile_size.height - kernel_size.height + 1 |
509 | }; |
510 | |
511 | const size_t tiles_y_count = divide_round_up(output_size.height, output_tile_size.height); |
512 | const size_t tiles_x_count = divide_round_up(output_size.width, output_tile_size.width); |
513 | const size_t tiles_count = tiles_x_count * tiles_y_count; |
514 | |
515 | /* Calculate cache blocking parameters */ |
516 | const size_t cache_elements_l1 = nnp_hwinfo.blocking.l1 / tuple_size; |
517 | const size_t cache_elements_l2 = nnp_hwinfo.blocking.l2 / tuple_size; |
518 | const size_t cache_elements_l3 = nnp_hwinfo.blocking.l3 / tuple_size; |
519 | |
520 | const size_t tiles_subblock_max = (fourier_transform ? nnp_hwinfo.cxgemm.mr : nnp_hwinfo.sxgemm.mr); |
521 | const size_t output_channels_subblock_max = (fourier_transform ? nnp_hwinfo.cxgemm.nr : nnp_hwinfo.sxgemm.nr); |
522 | |
523 | const size_t input_channels_block_max = |
524 | round_down(cache_elements_l1 / (tiles_subblock_max + output_channels_subblock_max), 2); |
525 | const size_t tiles_block_max = |
526 | round_down(cache_elements_l2 / input_channels_block_max, tiles_subblock_max); |
527 | const size_t output_channels_block_max = |
528 | round_down(cache_elements_l3 / input_channels_block_max, output_channels_subblock_max); |
529 | |
530 | const size_t transform_tile_size = tile_elements * transform_element_size; |
531 | const size_t input_transform_size = tiles_count * min(input_channels, input_channels_block_max) * transform_tile_size; |
532 | const size_t output_transform_size = tiles_count * output_channels * transform_tile_size; |
533 | switch (transform_strategy) { |
534 | case nnp_convolution_transform_strategy_compute: |
535 | case nnp_convolution_transform_strategy_reuse: |
536 | { |
537 | memory_size = input_transform_size + output_transform_size; |
538 | const size_t kernel_transform_size = output_channels * min(input_channels, input_channels_block_max) * transform_tile_size; |
539 | if (transform_strategy == nnp_convolution_transform_strategy_compute) { |
540 | memory_size += kernel_transform_size; |
541 | } |
542 | if (workspace_buffer == NULL) { |
543 | if (workspace_size == NULL) { |
544 | memory_block = allocate_memory(memory_size); |
545 | if (memory_block == NULL) { |
546 | return nnp_status_out_of_memory; |
547 | } |
548 | } else { |
549 | *workspace_size = memory_size; |
550 | return nnp_status_success; |
551 | } |
552 | } else { |
553 | if (*workspace_size < memory_size) { |
554 | return nnp_status_insufficient_buffer; |
555 | } |
556 | memory_block = workspace_buffer; |
557 | } |
558 | |
559 | void* input_transform = memory_block; |
560 | void* output_transform = memory_block + input_transform_size; |
561 | void* kernel_transform = memory_block + input_transform_size + output_transform_size; |
562 | |
563 | for (size_t input_channels_block_start = 0; input_channels_block_start < input_channels; input_channels_block_start += input_channels_block_max) { |
564 | const size_t input_channels_block_size = min(input_channels - input_channels_block_start, input_channels_block_max); |
565 | |
566 | if (transform_strategy == nnp_convolution_transform_strategy_compute) { |
567 | NNP_KERNEL_TRANSFORM_START(profile) |
568 | struct kernel_transform_context kernel_transform_context = { |
569 | .transform_function = kernel_transform_function, |
570 | .kernel = kernel + input_channels_block_start * kernel_size.height * kernel_size.width, |
571 | .kernel_transform = kernel_transform, |
572 | .tuple_size = tuple_size, |
573 | .input_channels = input_channels, |
574 | .input_channels_block_size = input_channels_block_size, |
575 | .output_channels = output_channels, |
576 | .kernel_size = kernel_size, |
577 | }; |
578 | pthreadpool_parallelize_2d_tile_2d(threadpool, |
579 | (pthreadpool_task_2d_tile_2d_t) compute_kernel_transform, |
580 | &kernel_transform_context, |
581 | output_channels, input_channels_block_size, |
582 | output_channels_subblock_max, 1, |
583 | PTHREADPOOL_FLAG_DISABLE_DENORMALS); |
584 | NNP_KERNEL_TRANSFORM_END(profile) |
585 | } else { |
586 | kernel_transform = (void*) kernel + input_channels_block_start * output_channels * transform_tile_size; |
587 | } |
588 | |
589 | NNP_INPUT_TRANSFORM_START(profile) |
590 | struct input_transform_context input_transform_context = { |
591 | .input = input, |
592 | .input_transform = input_transform, |
593 | .transform_function = input_transform_function, |
594 | .tuple_size = tuple_size, |
595 | .tiles_count = tiles_count, |
596 | .tiles_x_count = fxdiv_init_size_t(tiles_x_count), |
597 | .input_channels_block_start = input_channels_block_start, |
598 | .input_channels_block_size = input_channels_block_size, |
599 | .input_size = input_size, |
600 | .input_padding_left = input_padding.left, |
601 | .input_padding_top = input_padding.top, |
602 | .input_tile = tile_size, |
603 | .input_tile_step = tile_step, |
604 | }; |
605 | pthreadpool_parallelize_2d_tile_2d(threadpool, |
606 | (pthreadpool_task_2d_tile_2d_t) compute_input_transform, |
607 | &input_transform_context, |
608 | input_channels_block_size, tiles_count, |
609 | 1, tiles_subblock_max, |
610 | PTHREADPOOL_FLAG_DISABLE_DENORMALS); |
611 | NNP_INPUT_TRANSFORM_END(profile) |
612 | |
613 | NNP_BLOCK_MULTIPLICATION_START(profile) |
614 | for (size_t tuple_index = 0; tuple_index < tuple_count; tuple_index += 1) { |
615 | nnp_full_tuple_gemm_function full_gemm_function; |
616 | nnp_fast_tuple_gemm_function fast_gemm_function; |
617 | if (fourier_transform) { |
618 | if (tuple_index < NNP_COMPLEX_TUPLE_INDEX) { |
619 | fast_gemm_function = nnp_hwinfo.cxgemm.s4cX_conjb_only_mr_x_nr; |
620 | full_gemm_function = nnp_hwinfo.cxgemm.s4cX_conjb_upto_mr_x_nr; |
621 | } else { |
622 | fast_gemm_function = nnp_hwinfo.cxgemm.cX_conjb_only_mr_x_nr; |
623 | full_gemm_function = nnp_hwinfo.cxgemm.cX_conjb_upto_mr_x_nr; |
624 | } |
625 | } else { |
626 | if NNP_LIKELY(transform_element_size == sizeof(float)) { |
627 | fast_gemm_function = nnp_hwinfo.sxgemm.only_mr_x_nr; |
628 | full_gemm_function = nnp_hwinfo.sxgemm.upto_mr_x_nr; |
629 | } else { |
630 | #if NNP_BACKEND_ARM |
631 | fast_gemm_function = nnp_hwinfo.hxgemm.only_mr_x_nr; |
632 | full_gemm_function = nnp_hwinfo.hxgemm.upto_mr_x_nr; |
633 | #endif /* NNP_BACKEND_ARM */ |
634 | } |
635 | } |
636 | for (size_t output_channels_block_start = 0; output_channels_block_start < output_channels; output_channels_block_start += output_channels_block_max) { |
637 | const size_t output_channels_block_size = min(output_channels - output_channels_block_start, output_channels_block_max); |
638 | struct tuple_multiplication_context tuple_multiplication_context = { |
639 | .tuple_elements = tuple_elements, |
640 | .tuple_size = tuple_size, |
641 | .tiles_subblock_max = tiles_subblock_max, |
642 | .input_channels_block_start = input_channels_block_start, |
643 | .input_channels_block_size = input_channels_block_size, |
644 | .output_channels = output_channels, |
645 | .output_channels_subblock_max = output_channels_subblock_max, |
646 | .output_channels_block_start = output_channels_block_start, |
647 | .input_transform = input_transform + |
648 | tuple_index * tiles_count * input_channels_block_size * tuple_size, |
649 | .kernel_transform = kernel_transform + |
650 | tuple_index * output_channels * input_channels_block_size * tuple_size, |
651 | .output_transform = output_transform + |
652 | tuple_index * tiles_count * output_channels * tuple_size, |
653 | .fast_gemm = fast_gemm_function, |
654 | .full_gemm = full_gemm_function, |
655 | }; |
656 | pthreadpool_parallelize_2d_tile_2d(threadpool, |
657 | (pthreadpool_task_2d_tile_2d_t) compute_tuple_multiplication, |
658 | &tuple_multiplication_context, |
659 | tiles_count, output_channels_block_size, |
660 | tiles_block_max, output_channels_subblock_max, |
661 | PTHREADPOOL_FLAG_DISABLE_DENORMALS); |
662 | } |
663 | } |
664 | NNP_BLOCK_MULTIPLICATION_END(profile) |
665 | } |
666 | NNP_OUTPUT_TRANSFORM_START(profile) |
667 | struct output_transform_context output_transform_context = { |
668 | .transform_function = output_transform_function, |
669 | .output = output, |
670 | .output_transform = output_transform, |
671 | .bias = bias, |
672 | .tuple_size = tuple_size, |
673 | .tiles_count = tiles_count, |
674 | .tiles_x_count = fxdiv_init_size_t(tiles_x_count), |
675 | .tiles_block_max = fxdiv_init_size_t(tiles_block_max), |
676 | .output_channels = output_channels, |
677 | .output_size = output_size, |
678 | .output_tile = output_tile_size, |
679 | }; |
680 | pthreadpool_parallelize_2d_tile_2d(threadpool, |
681 | (pthreadpool_task_2d_tile_2d_t) compute_output_transform, |
682 | &output_transform_context, |
683 | output_channels, tiles_count, |
684 | output_channels_subblock_max, tiles_subblock_max, |
685 | PTHREADPOOL_FLAG_DISABLE_DENORMALS); |
686 | NNP_OUTPUT_TRANSFORM_END(profile) |
687 | break; |
688 | } |
689 | case nnp_convolution_transform_strategy_precompute: |
690 | { |
691 | const size_t kernel_transform_size = output_channels * input_channels * transform_tile_size; |
692 | if (workspace_buffer == NULL) { |
693 | *workspace_size = kernel_transform_size; |
694 | return nnp_status_success; |
695 | } else { |
696 | if (*workspace_size < kernel_transform_size) { |
697 | return nnp_status_insufficient_buffer; |
698 | } |
699 | memory_block = workspace_buffer; |
700 | } |
701 | |
702 | for (size_t input_channels_block_start = 0; input_channels_block_start < input_channels; input_channels_block_start += input_channels_block_max) { |
703 | const size_t input_channels_block_size = min(input_channels - input_channels_block_start, input_channels_block_max); |
704 | |
705 | NNP_KERNEL_TRANSFORM_START(profile) |
706 | struct kernel_transform_context kernel_transform_context = { |
707 | .transform_function = kernel_transform_function, |
708 | .kernel = kernel + input_channels_block_start * kernel_size.height * kernel_size.width, |
709 | .kernel_transform = (void*) workspace_buffer + input_channels_block_start * output_channels * transform_tile_size, |
710 | .tuple_size = tuple_size, |
711 | .input_channels = input_channels, |
712 | .input_channels_block_size = input_channels_block_size, |
713 | .output_channels = output_channels, |
714 | .kernel_size = kernel_size, |
715 | }; |
716 | pthreadpool_parallelize_2d_tile_2d(threadpool, |
717 | (pthreadpool_task_2d_tile_2d_t) compute_kernel_transform, |
718 | &kernel_transform_context, |
719 | output_channels, input_channels_block_size, |
720 | output_channels_subblock_max, 1, |
721 | PTHREADPOOL_FLAG_DISABLE_DENORMALS); |
722 | NNP_KERNEL_TRANSFORM_END(profile) |
723 | } |
724 | break; |
725 | } |
726 | default: |
727 | return nnp_status_invalid_transform_strategy; |
728 | } |
729 | |
730 | if (memory_block != workspace_buffer) { |
731 | release_memory(memory_block, memory_size); |
732 | } |
733 | return nnp_status_success; |
734 | } |
735 | |
/*
 * Convolution inference via explicit GEMM over an im2col-style packing.
 *
 * The kernel is packed into cache-friendly panels (or treated as pre-packed,
 * depending on transform_strategy), input patches are packed into an L3-sized
 * block, the two are multiplied by blocked matrix multiplication, and finally
 * bias plus the requested activation are applied to the output in place.
 *
 * Workspace protocol (compute/reuse strategies):
 *  - workspace_buffer == NULL && workspace_size != NULL: report the required
 *    size in *workspace_size and return success without computing.
 *  - workspace_buffer == NULL && workspace_size == NULL: allocate internally
 *    and release the allocation before returning.
 *  - workspace_buffer != NULL: validate *workspace_size and use the buffer.
 */
static enum nnp_status compute_gemm_convolution_inference(
	const enum nnp_convolution_transform_strategy transform_strategy,
	const size_t input_channels,
	const size_t output_channels,
	const struct nnp_size input_size,
	const struct nnp_padding input_padding,
	const struct nnp_size kernel_size,
	const struct nnp_size output_size,
	const struct nnp_size output_subsampling,
	const float* input,
	const float* kernel,
	const float* bias,
	float* output,
	void* workspace_buffer,
	size_t* workspace_size,
	enum nnp_activation activation,
	pthreadpool_t threadpool,
	struct nnp_profile* profile)
{
	enum nnp_status status = nnp_status_success;
	void* memory_block = NULL;
	size_t memory_size = 0;
	const size_t simd_width = nnp_hwinfo.simd_width;

	/* Calculate cache blocking parameters */
	const size_t cache_elements_l1 = nnp_hwinfo.blocking.l1 / sizeof(float);
	const size_t cache_elements_l2 = nnp_hwinfo.blocking.l2 / sizeof(float);
	const size_t cache_elements_l3 = nnp_hwinfo.blocking.l3 / sizeof(float);

	/* GEMM micro-kernel register-tile dimensions. */
	const size_t output_channels_subblock_max = nnp_hwinfo.sgemm.mr;
	const size_t output_image_subblock_max = nnp_hwinfo.sgemm.nr;

	/* One GEMM reduction element per (input channel, kernel position) pair. */
	const size_t reduction_size = input_channels * kernel_size.height * kernel_size.width;
	const size_t output_image_size = output_size.height * output_size.width;
	/* Block the reduction so an (mr + nr)-wide slice fits in L1, a kernel
	 * panel fits in L2, and a packed input block fits in L3. */
	const size_t reduction_block_max =
		round_down(cache_elements_l1 / (output_channels_subblock_max + output_image_subblock_max), 2);
	const size_t output_channels_block_max =
		round_down(cache_elements_l2 / reduction_block_max, output_channels_subblock_max);
	const size_t output_image_block_max =
		round_down(cache_elements_l3 / reduction_block_max, output_image_subblock_max);

	switch (transform_strategy) {
		case nnp_convolution_transform_strategy_compute:
		case nnp_convolution_transform_strategy_reuse:
		{
			/* Scratch layout: [packed input | packed kernel], sizes capped by the block maxima. */
			const size_t packed_kernel_size = output_channels *
				min(reduction_block_max, reduction_size) * sizeof(float);
			const size_t packed_input_size = min(output_image_block_max, round_up(output_image_size, simd_width)) *
				min(reduction_block_max, reduction_size) * sizeof(float);
			memory_size = packed_kernel_size + packed_input_size;
			if (workspace_buffer == NULL) {
				if (workspace_size == NULL) {
					memory_block = allocate_memory(memory_size);
					if (memory_block == NULL) {
						return nnp_status_out_of_memory;
					}
				} else {
					/* Size query only: no computation is performed. */
					*workspace_size = memory_size;
					return nnp_status_success;
				}
			} else {
				if (*workspace_size < memory_size) {
					return nnp_status_insufficient_buffer;
				}
				memory_block = workspace_buffer;
			}

			float* packed_input = memory_block;
			float* packed_kernel = memory_block + packed_input_size;

			/* Outer loop over reduction (K) blocks; each iteration packs (or locates)
			 * a kernel panel, then streams all output-image blocks through it. */
			for (size_t reduction_block_start = 0; reduction_block_start < reduction_size; reduction_block_start += reduction_block_max) {
				const size_t reduction_block_size = min(reduction_size - reduction_block_start, reduction_block_max);

				if (transform_strategy == nnp_convolution_transform_strategy_compute) {
					/* Pack kernel into memory block */
					NNP_KERNEL_TRANSFORM_START(profile)
					struct kernel_packing_context kernel_packing_context = {
						.kernel = kernel + reduction_block_start,
						.packed_kernel = packed_kernel,
						.reduction_size = reduction_size,
						.reduction_block_start = reduction_block_start,
						.reduction_block_size = reduction_block_size,
					};
					pthreadpool_parallelize_2d_tile_2d(threadpool,
						(pthreadpool_task_2d_tile_2d_t) compute_kernel_packing,
						&kernel_packing_context,
						output_channels, reduction_block_size,
						output_channels_subblock_max, 1,
						PTHREADPOOL_FLAG_DISABLE_DENORMALS);
					NNP_KERNEL_TRANSFORM_END(profile)
				} else {
					/* Reuse strategy: the caller supplies the kernel already packed;
					 * point directly into it at the current reduction block.
					 * (void* arithmetic is a GNU C extension used throughout this file.) */
					packed_kernel = (void*) kernel + output_channels * reduction_block_start * sizeof(float);
				}

				/* Precomputed divisors for the inner packing routine's index decomposition. */
				const struct fxdiv_divisor_size_t kernel_elements_divisor = fxdiv_init_size_t(kernel_size.height * kernel_size.width);
				const struct fxdiv_divisor_size_t kernel_width_divisor = fxdiv_init_size_t(kernel_size.width);
				const struct fxdiv_divisor_size_t output_width_divisor = fxdiv_init_size_t(output_size.width);
				for (size_t output_image_block_start = 0; output_image_block_start < output_image_size; output_image_block_start += output_image_block_max) {
					const size_t output_image_block_size = min(output_image_size - output_image_block_start, output_image_block_max);

					/* Pack image into L3 block */
					NNP_INPUT_TRANSFORM_START(profile)
					struct input_packing_context input_packing_context = {
						.input = input,
						.packed_input = packed_input,
						.simd_width = simd_width,
						.reduction_block_start = reduction_block_start,
						.reduction_block_size = reduction_block_size,
						.output_image_block_start = output_image_block_start,
						.input_size = input_size,
						.input_padding_top = input_padding.top,
						.input_padding_left = input_padding.left,
						.kernel_elements = kernel_elements_divisor,
						.kernel_width = kernel_width_divisor,
						.output_width = output_width_divisor,
						.output_subsampling = output_subsampling,
					};
					pthreadpool_parallelize_2d_tile_2d(threadpool,
						(pthreadpool_task_2d_tile_2d_t) compute_input_packing,
						&input_packing_context,
						reduction_block_size, output_image_block_size,
						1, output_image_subblock_max,
						PTHREADPOOL_FLAG_DISABLE_DENORMALS);
					NNP_INPUT_TRANSFORM_END(profile)

					/* Multiply the packed kernel panel by the packed input block,
					 * accumulating into the output (across reduction blocks). */
					NNP_BLOCK_MULTIPLICATION_START(profile)
					struct matrix_multiplication_context matrix_multiplication_context = {
						.packed_kernel = packed_kernel,
						.packed_input = packed_input,
						.output = output,
						.reduction_block_start = reduction_block_start,
						.reduction_block_size = reduction_block_size,
						.output_image_size = output_image_size,
						.output_image_block_start = output_image_block_start,
						.output_image_subblock_max = output_image_subblock_max,
						.output_channels_subblock_max = output_channels_subblock_max,
					};
					pthreadpool_parallelize_2d_tile_2d(threadpool,
						(pthreadpool_task_2d_tile_2d_t) compute_matrix_multiplication,
						&matrix_multiplication_context,
						output_channels, output_image_block_size,
						output_channels_block_max, output_image_subblock_max,
						PTHREADPOOL_FLAG_DISABLE_DENORMALS);
					NNP_BLOCK_MULTIPLICATION_END(profile)
				}
			}
			/* Add bias */
			NNP_OUTPUT_TRANSFORM_START(profile)
			switch (activation) {
				case nnp_activation_identity:
					for (size_t output_channel = 0; output_channel < output_channels; output_channel += 1) {
						const float bias_value = bias[output_channel];
						for (size_t index = 0; index < output_image_size; index += 1) {
							output[output_channel * output_image_size + index] += bias_value;
						}
					}
					break;
				case nnp_activation_relu:
					for (size_t output_channel = 0; output_channel < output_channels; output_channel += 1) {
						const float bias_value = bias[output_channel];
						for (size_t index = 0; index < output_image_size; index += 1) {
							output[output_channel * output_image_size + index] =
								relu(output[output_channel * output_image_size + index] + bias_value, 0.0f);
						}
					}
					break;
				default:
					/* Arguments were validated upstream; other activations cannot reach here. */
					NNP_UNREACHABLE;
			}
			NNP_OUTPUT_TRANSFORM_END(profile)
			break;
		}
		case nnp_convolution_transform_strategy_precompute:
		{
			/* Precompute strategy: only pack the whole kernel into the caller's
			 * workspace for later use with the reuse strategy; no convolution here. */
			const size_t packed_kernel_size = output_channels * reduction_size * sizeof(float);
			if (workspace_buffer == NULL) {
				/* NOTE(review): unlike the compute path above, this dereferences
				 * workspace_size without a NULL check — a size query on this path
				 * requires workspace_size != NULL; confirm callers guarantee it. */
				*workspace_size = packed_kernel_size;
				return nnp_status_success;
			} else {
				if (*workspace_size < packed_kernel_size) {
					return nnp_status_insufficient_buffer;
				}
				memory_block = workspace_buffer;
			}

			for (size_t reduction_block_start = 0; reduction_block_start < reduction_size; reduction_block_start += reduction_block_max) {
				const size_t reduction_block_size = min(reduction_size - reduction_block_start, reduction_block_max);

				/* Pack kernel into memory block */
				NNP_KERNEL_TRANSFORM_START(profile)
				struct kernel_packing_context kernel_packing_context = {
					.kernel = kernel + reduction_block_start,
					.packed_kernel = (void*) workspace_buffer + output_channels * reduction_block_start * sizeof(float),
					.reduction_size = reduction_size,
					.reduction_block_start = reduction_block_start,
					.reduction_block_size = reduction_block_size,
				};
				pthreadpool_parallelize_2d_tile_2d(threadpool,
					(pthreadpool_task_2d_tile_2d_t) compute_kernel_packing,
					&kernel_packing_context,
					output_channels, reduction_block_size,
					output_channels_subblock_max, 1,
					PTHREADPOOL_FLAG_DISABLE_DENORMALS);
				NNP_KERNEL_TRANSFORM_END(profile)
			}
			break;
		}
		default:
			return nnp_status_invalid_transform_strategy;
	}

	/* Release scratch memory only if it was allocated internally. */
	if (memory_block != workspace_buffer) {
		release_memory(memory_block, memory_size);
	}
	return status;
}
952 | |
953 | static enum nnp_status compute_direct_convolution_inference( |
954 | const size_t input_channels, |
955 | const size_t output_channels, |
956 | const struct nnp_size image_size, |
957 | const struct nnp_size kernel_size, |
958 | const float* input, |
959 | const float* kernel, |
960 | const float* bias, |
961 | float* output, |
962 | void* workspace_buffer, |
963 | size_t* workspace_size, |
964 | enum nnp_activation activation, |
965 | pthreadpool_t threadpool, |
966 | struct nnp_profile* profile) |
967 | { |
968 | const size_t image_elements = image_size.height * image_size.width; |
969 | |
970 | if (workspace_buffer == NULL && workspace_size != NULL) { |
971 | *workspace_size = 0; |
972 | return nnp_status_success; |
973 | } |
974 | |
975 | NNP_BLOCK_MULTIPLICATION_START(profile) |
976 | struct direct_convolution_context direct_convolution_context = { |
977 | .input = input, |
978 | .kernel = kernel, |
979 | .output = output, |
980 | .image_elements = image_elements, |
981 | .input_channels = input_channels, |
982 | .input_channels_block_max = nnp_hwinfo.conv1x1.mr, |
983 | .output_channels_block_max = nnp_hwinfo.conv1x1.nr, |
984 | .fast_conv = nnp_hwinfo.conv1x1.only_mr_x_nr, |
985 | .full_conv = nnp_hwinfo.conv1x1.upto_mr_x_nr, |
986 | }; |
987 | pthreadpool_parallelize_1d_tile_1d(threadpool, |
988 | (pthreadpool_task_1d_tile_1d_t) compute_direct_convolution, |
989 | &direct_convolution_context, |
990 | output_channels, nnp_hwinfo.conv1x1.nr, |
991 | PTHREADPOOL_FLAG_DISABLE_DENORMALS); |
992 | NNP_BLOCK_MULTIPLICATION_END(profile) |
993 | |
994 | /* Add bias */ |
995 | NNP_OUTPUT_TRANSFORM_START(profile) |
996 | switch (activation) { |
997 | case nnp_activation_identity: |
998 | for (size_t output_channel = 0; output_channel < output_channels; output_channel += 1) { |
999 | const float bias_value = bias[output_channel]; |
1000 | for (size_t index = 0; index < image_elements; index += 1) { |
1001 | output[output_channel * image_elements + index] += bias_value; |
1002 | } |
1003 | } |
1004 | break; |
1005 | case nnp_activation_relu: |
1006 | for (size_t output_channel = 0; output_channel < output_channels; output_channel += 1) { |
1007 | const float bias_value = bias[output_channel]; |
1008 | for (size_t index = 0; index < image_elements; index += 1) { |
1009 | output[output_channel * image_elements + index] = |
1010 | relu(output[output_channel * image_elements + index] + bias_value, 0.0f); |
1011 | } |
1012 | } |
1013 | break; |
1014 | default: |
1015 | NNP_UNREACHABLE; |
1016 | } |
1017 | NNP_OUTPUT_TRANSFORM_END(profile) |
1018 | |
1019 | return nnp_status_success; |
1020 | } |
1021 | |
1022 | static inline enum nnp_convolution_algorithm select_algorithm( |
1023 | struct nnp_size kernel_size, |
1024 | struct nnp_size output_subsampling, |
1025 | struct nnp_size output_size) |
1026 | { |
1027 | if (max(output_subsampling.height, output_subsampling.width) == 1) { |
1028 | /* Stride-1 convolution: consider fast convolution algorithm and direct 1x1 */ |
1029 | if (max(kernel_size.height, kernel_size.width) == 1) { |
1030 | return nnp_convolution_algorithm_direct; |
1031 | } else if (kernel_size.height == 3 && kernel_size.width == 3) { |
1032 | return nnp_convolution_algorithm_wt8x8; |
1033 | } else if (min(kernel_size.height, kernel_size.width) >= 2) { |
1034 | /* Consider FFT-based fast convolution */ |
1035 | if (max(kernel_size.height, kernel_size.width) <= 8) { |
1036 | /* Decide between FFT 8x8 and FFT 16x16 */ |
1037 | const size_t tile_count_8x8 = |
1038 | divide_round_up(output_size.height, 8 - kernel_size.height + 1) * |
1039 | divide_round_up(output_size.width, 8 - kernel_size.width + 1); |
1040 | const size_t tile_count_16x16 = |
1041 | divide_round_up(output_size.height, 16 - kernel_size.height + 1) * |
1042 | divide_round_up(output_size.width, 16 - kernel_size.width + 1); |
1043 | if (tile_count_8x8 <= 4 * tile_count_16x16) { |
1044 | /* 8x8 tiles are more efficient */ |
1045 | return nnp_convolution_algorithm_ft8x8; |
1046 | } else { |
1047 | return nnp_convolution_algorithm_ft16x16; |
1048 | } |
1049 | } else if (max(kernel_size.height, kernel_size.width) <= 16) { |
1050 | return nnp_convolution_algorithm_ft16x16; |
1051 | } |
1052 | } |
1053 | } |
1054 | |
1055 | /* Fall-back algorithm */ |
1056 | return nnp_convolution_algorithm_implicit_gemm; |
1057 | } |
1058 | |
/*
 * Public entry point for single-image convolution inference.
 *
 * Validates arguments, derives the output size from input size, padding,
 * kernel size and stride, auto-selects an algorithm if requested, configures
 * the transform functions for the chosen algorithm, and dispatches to one of
 * the specialized compute_* routines. The workspace_buffer/workspace_size
 * protocol is handled by those routines.
 */
enum nnp_status nnp_convolution_inference(
	enum nnp_convolution_algorithm algorithm,
	enum nnp_convolution_transform_strategy transform_strategy,
	size_t input_channels,
	size_t output_channels,
	struct nnp_size input_size,
	struct nnp_padding input_padding,
	struct nnp_size kernel_size,
	struct nnp_size output_subsampling,
	const float* input,
	const float* kernel,
	const float* bias,
	float* output,
	void* workspace_buffer,
	size_t* workspace_size,
	enum nnp_activation activation,
	const void* activation_parameters,
	pthreadpool_t threadpool,
	struct nnp_profile* profile)
{
	NNP_TOTAL_START(profile)

	/* Basic validation of parameters. This check detects invalid, but not unsupported parameters. */
	enum nnp_status status = validate_convolution_arguments(
		1, input_channels, output_channels,
		input_size, input_padding, kernel_size, output_subsampling,
		activation, activation_parameters);
	if (status != nnp_status_success) {
		goto cleanup;
	}

	/* No currently supported activation takes extra parameters. */
	if (activation_parameters != NULL) {
		status = nnp_status_unsupported_activation_parameters;
		goto cleanup;
	}

	/* Standard convolution output-size formula: (padded size - kernel) / stride + 1. */
	const struct nnp_size output_size = {
		.width = (input_padding.left + input_size.width + input_padding.right - kernel_size.width) / output_subsampling.width + 1,
		.height = (input_padding.top + input_size.height + input_padding.bottom - kernel_size.height) / output_subsampling.height + 1
	};

	if (algorithm == nnp_convolution_algorithm_auto) {
		algorithm = select_algorithm(kernel_size, output_subsampling, output_size);
	}

	/* Per-algorithm configuration: tile geometry, transform element size,
	 * and the forward/kernel/inverse transform functions. The transform
	 * function pointers stay NULL for algorithms that do not use them
	 * (implicit GEMM, direct 1x1). */
	struct nnp_size tile_size;
	size_t transform_element_size;
	bool fourier_transform;
	nnp_transform_2d_with_offset input_transform_function = NULL;
	nnp_transform_2d_with_offset kernel_transform_function = NULL;
	nnp_transform_2d_with_bias output_transform_function = NULL;
	switch (algorithm) {
		case nnp_convolution_algorithm_wt8x8_fp16:
#if NNP_BACKEND_ARM
			/* Winograd F(6x6, 3x3) with fp16 transform storage: only valid for
			 * 3x3 kernels with unit stride, and only if the backend provides
			 * all three fp16 transform functions. */
			if (kernel_size.height != 3 || kernel_size.width != 3) {
				status = nnp_status_unsupported_algorithm;
				goto cleanup;
			}
			if (max(output_subsampling.height, output_subsampling.width) > 1) {
				status = nnp_status_unsupported_algorithm;
				goto cleanup;
			}
			tile_size = (struct nnp_size) { .height = 8, .width = 8 };
			transform_element_size = sizeof(uint16_t);
			fourier_transform = false;

			input_transform_function = nnp_hwinfo.transforms.iwt_f6x6_3x3_fp16_with_offset;
			kernel_transform_function = nnp_hwinfo.transforms.kwt_f6x6_3x3_fp16;
			switch (activation) {
				case nnp_activation_identity:
					output_transform_function = nnp_hwinfo.transforms.owt_f6x6_3x3_fp16_with_bias;
					break;
				case nnp_activation_relu:
					output_transform_function = nnp_hwinfo.transforms.owt_f6x6_3x3_fp16_with_bias_with_relu;
					break;
				default:
					NNP_UNREACHABLE;
			}
			if (input_transform_function != NULL && kernel_transform_function != NULL && output_transform_function != NULL) {
				break;
			}
#endif
			/*
			 * Fallthrough otherwise. The rationale here is that only some backends have fp16 storage natively implemented
			 * (e.g. ARM NEON + VFP_FP16 currently), while configuration is (currently) fairly platform-independent.
			 * Thus silently falling back to the baseline Winograd implementation is reasonable.
			 */
		case nnp_convolution_algorithm_wt8x8:
			/* Winograd F(6x6, 3x3) in fp32: requires a 3x3 kernel; stride 1 or
			 * (for specific activations) stride 2 via dedicated output transforms. */
			if (kernel_size.height != 3 || kernel_size.width != 3) {
				status = nnp_status_unsupported_algorithm;
				goto cleanup;
			}
			tile_size = (struct nnp_size) { .height = 8, .width = 8 };
			transform_element_size = sizeof(float);
			fourier_transform = false;

			input_transform_function = nnp_hwinfo.transforms.iwt_f6x6_3x3_with_offset_and_stream;
			kernel_transform_function = nnp_hwinfo.transforms.kwt_f6x6_3x3;
			switch (activation) {
				case nnp_activation_identity:
					if (output_subsampling.height == 1 && output_subsampling.width == 1) {
						output_transform_function = nnp_hwinfo.transforms.owt_f6x6_3x3_with_bias;
					} else if (output_subsampling.height == 2 && output_subsampling.width == 2) {
						output_transform_function = nnp_hwinfo.transforms.owt_f6x6_3x3s2_with_bias;
					}
					break;
				case nnp_activation_relu:
					if (output_subsampling.height == 1 && output_subsampling.width == 1) {
						output_transform_function = nnp_hwinfo.transforms.owt_f6x6_3x3_with_bias_with_relu;
					} else if (output_subsampling.height == 2 && output_subsampling.width == 2) {
						output_transform_function = nnp_hwinfo.transforms.owt_f6x6_3x3s2_with_bias_with_relu;
					}
					break;
				default:
					NNP_UNREACHABLE;
			}
			/* Note: output_transform_function may still be NULL here (e.g. for
			 * unsupported strides); the NULL check before dispatch catches it. */
			break;
		case nnp_convolution_algorithm_ft8x8:
			/* FFT with 8x8 tiles: kernel must fit in the tile; unit stride only. */
			if (max(kernel_size.height, kernel_size.width) > 8) {
				status = nnp_status_unsupported_algorithm;
				goto cleanup;
			}
			if (max(output_subsampling.height, output_subsampling.width) > 1) {
				status = nnp_status_unsupported_algorithm;
				goto cleanup;
			}
			tile_size = (struct nnp_size) { .height = 8, .width = 8 };
			transform_element_size = sizeof(float);
			fourier_transform = true;

			input_transform_function = nnp_hwinfo.transforms.fft8x8_with_offset_and_stream;
			kernel_transform_function = nnp_hwinfo.transforms.fft8x8_with_offset_and_stream;
			switch (activation) {
				case nnp_activation_identity:
					output_transform_function = nnp_hwinfo.transforms.ifft8x8_with_bias;
					break;
				case nnp_activation_relu:
					output_transform_function = nnp_hwinfo.transforms.ifft8x8_with_bias_with_relu;
					break;
				default:
					NNP_UNREACHABLE;
			}
			break;
		case nnp_convolution_algorithm_ft16x16:
			/* FFT with 16x16 tiles: kernel must fit in the tile; unit stride only. */
			if (max(kernel_size.height, kernel_size.width) > 16) {
				status = nnp_status_unsupported_algorithm;
				goto cleanup;
			}
			if (max(output_subsampling.height, output_subsampling.width) > 1) {
				status = nnp_status_unsupported_algorithm;
				goto cleanup;
			}
			tile_size = (struct nnp_size) { .height = 16, .width = 16 };
			transform_element_size = sizeof(float);
			fourier_transform = true;

			input_transform_function = nnp_hwinfo.transforms.fft16x16_with_offset_and_stream;
			kernel_transform_function = nnp_hwinfo.transforms.fft16x16_with_offset_and_stream;
			switch (activation) {
				case nnp_activation_identity:
					output_transform_function = nnp_hwinfo.transforms.ifft16x16_with_bias;
					break;
				case nnp_activation_relu:
					output_transform_function = nnp_hwinfo.transforms.ifft16x16_with_bias_with_relu;
					break;
				default:
					NNP_UNREACHABLE;
			}
			break;
		case nnp_convolution_algorithm_implicit_gemm:
			/* Implicit GEMM needs no transform configuration. */
			break;
		case nnp_convolution_algorithm_direct:
			/* Direct algorithm only supports 1x1 kernels with unit stride. */
			if (max(kernel_size.height, kernel_size.width) > 1) {
				status = nnp_status_unsupported_algorithm;
				goto cleanup;
			}
			if (max(output_subsampling.height, output_subsampling.width) > 1) {
				status = nnp_status_unsupported_algorithm;
				goto cleanup;
			}
			break;
		case nnp_convolution_algorithm_auto:
			/* Replaced by select_algorithm() above. */
			NNP_UNREACHABLE;
		default:
			status = nnp_status_invalid_algorithm;
			goto cleanup;
	}

	/* Dispatch to the compute routine matching the (possibly auto-selected) algorithm. */
	switch (algorithm) {
		case nnp_convolution_algorithm_wt8x8:
		case nnp_convolution_algorithm_wt8x8_fp16:
		case nnp_convolution_algorithm_ft8x8:
		case nnp_convolution_algorithm_ft16x16:
			if (input_transform_function == NULL || kernel_transform_function == NULL || output_transform_function == NULL) {
				status = nnp_status_unsupported_algorithm;
				goto cleanup;
			}
			status = compute_fast_convolution_inference(
				fourier_transform, transform_strategy, transform_element_size,
				input_channels, output_channels,
				tile_size, input_size, input_padding, kernel_size, output_size, output_subsampling,
				input, kernel, bias, output, workspace_buffer, workspace_size,
				input_transform_function, kernel_transform_function, output_transform_function,
				threadpool, profile);
			break;
		case nnp_convolution_algorithm_implicit_gemm:
			status = compute_gemm_convolution_inference(
				transform_strategy,
				input_channels, output_channels,
				input_size, input_padding, kernel_size, output_size, output_subsampling,
				input, kernel, bias, output, workspace_buffer, workspace_size,
				activation,
				threadpool, profile);
			break;
		case nnp_convolution_algorithm_direct:
			if (transform_strategy != nnp_convolution_transform_strategy_compute) {
				status = nnp_status_unsupported_transform_strategy;
				goto cleanup;
			}
			status = compute_direct_convolution_inference(
				input_channels, output_channels, input_size, kernel_size,
				input, kernel, bias, output, workspace_buffer, workspace_size,
				activation,
				threadpool, profile);
			break;
		case nnp_convolution_algorithm_auto:
			NNP_UNREACHABLE;
	}

cleanup:
	NNP_TOTAL_END(profile)
	return status;
}
1292 | |