1 | #pragma once |
2 | |
3 | #include <stddef.h> |
4 | #include <stdint.h> |
5 | #include <stdbool.h> |
6 | |
7 | #include <pthreadpool.h> |
8 | |
9 | #ifdef __cplusplus |
10 | extern "C" { |
11 | #endif |
12 | |
13 | /** |
14 | * @brief Status code for any NNPACK function call. |
15 | */ |
16 | enum nnp_status { |
17 | /** The call succeeded, and all output arguments now contain valid data. */ |
18 | nnp_status_success = 0, |
19 | /** NNPACK function was called with batch_size == 0. */ |
20 | nnp_status_invalid_batch_size = 2, |
21 | /** NNPACK function was called with channels == 0. */ |
22 | nnp_status_invalid_channels = 3, |
23 | /** NNPACK function was called with input_channels == 0. */ |
24 | nnp_status_invalid_input_channels = 4, |
25 | /** NNPACK function was called with output_channels == 0. */ |
26 | nnp_status_invalid_output_channels = 5, |
27 | /** NNPACK function was called with input_size.height == 0 or input_size.width == 0 */ |
28 | nnp_status_invalid_input_size = 10, |
29 | /** NNPACK function was called with input_stride.height == 0 or input_stride.width == 0 */ |
30 | nnp_status_invalid_input_stride = 11, |
31 | /** NNPACK function was called with input_padding not less than respective kernel (or pooling) size, i.e.: |
32 | * |
33 | * - input_padding.left >= kernel_size.width (>= pooling_size.width) |
34 | * - input_padding.right >= kernel_size.width (>= pooling_size.width) |
35 | * - input_padding.top >= kernel_size.height (>= pooling_size.height) |
36 | * - input_padding.bottom >= kernel_size.height (>= pooling_size.height) |
37 | */ |
38 | nnp_status_invalid_input_padding = 12, |
39 | /** NNPACK function was called with kernel_size.height == 0 or kernel_size.width == 0 */ |
40 | nnp_status_invalid_kernel_size = 13, |
41 | /** NNPACK function was called with pooling_size.height == 0 or pooling_size.width == 0 */ |
42 | nnp_status_invalid_pooling_size = 14, |
43 | /** NNPACK function was called with pooling_stride.height == 0 or pooling_stride.width == 0 */ |
44 | nnp_status_invalid_pooling_stride = 15, |
45 | /** NNPACK function was called with convolution algorithm not in nnp_convolution_algorithm enumeration */ |
46 | nnp_status_invalid_algorithm = 16, |
47 | /** NNPACK function was called with convolution transform strategy not in nnp_convolution_transform_strategy enum */ |
48 | nnp_status_invalid_transform_strategy = 17, |
49 | /** NNPACK function was called with output_subsampling.height == 0 or output_subsampling.width == 0 */ |
50 | nnp_status_invalid_output_subsampling = 13, |
51 | /** NNPACK function was called with activation not in nnp_activation enum */ |
52 | nnp_status_invalid_activation = 14, |
53 | /** NNPACK function was called with invalid activation parameters */ |
54 | nnp_status_invalid_activation_parameters = 15, |
55 | |
56 | /** NNPACK does not support the particular input size for the function */ |
57 | nnp_status_unsupported_input_size = 20, |
58 | /** NNPACK does not support the particular input stride for the function */ |
59 | nnp_status_unsupported_input_stride = 21, |
60 | /** NNPACK does not support the particular input padding for the function */ |
61 | nnp_status_unsupported_input_padding = 22, |
62 | /** NNPACK does not support the particular kernel size for the function */ |
63 | nnp_status_unsupported_kernel_size = 23, |
64 | /** NNPACK does not support the particular pooling size for the function */ |
65 | nnp_status_unsupported_pooling_size = 24, |
66 | /** NNPACK does not support the particular pooling stride for the function */ |
67 | nnp_status_unsupported_pooling_stride = 25, |
68 | /** NNPACK does not support the particular convolution algorithm for the function */ |
69 | nnp_status_unsupported_algorithm = 26, |
70 | /** NNPACK does not support the particular convolution transform strategy for the algorithm */ |
71 | nnp_status_unsupported_transform_strategy = 27, |
72 | /** NNPACK does not support the particular activation function for the function */ |
73 | nnp_status_unsupported_activation = 28, |
74 | /** NNPACK does not support the particular activation function parameters for the function */ |
75 | nnp_status_unsupported_activation_parameters = 29, |
76 | |
77 | /** NNPACK function was called before the library was initialized */ |
78 | nnp_status_uninitialized = 50, |
79 | /** NNPACK does not implement this function for the host CPU */ |
80 | nnp_status_unsupported_hardware = 51, |
81 | /** NNPACK failed to allocate memory for temporary buffers */ |
82 | nnp_status_out_of_memory = 52, |
83 | /** Scratch space buffer is too small */ |
84 | nnp_status_insufficient_buffer = 53, |
85 | /** Scratch space buffer is not properly aligned */ |
86 | nnp_status_misaligned_buffer = 54 |
87 | }; |
88 | |
89 | /** |
90 | * @brief Activation applied applied after a convolutional or fully-connected layer. |
91 | */ |
92 | enum nnp_activation { |
93 | /** Identity activation f(x) := x, i.e. no transformation */ |
94 | nnp_activation_identity = 0, |
95 | /** ReLU activation f(x) := max(0, x) */ |
96 | nnp_activation_relu = 1, |
97 | }; |
98 | |
99 | /** |
100 | * @brief Algorithm for computing convolutional layers. |
101 | */ |
102 | enum nnp_convolution_algorithm { |
103 | /** Let NNPACK choose the algorithm depending on layer parameters */ |
104 | nnp_convolution_algorithm_auto = 0, |
105 | /** Tiled convolution based on 2D Fourier transform with 8x8 blocks. Supports kernels up to 8x8. */ |
106 | nnp_convolution_algorithm_ft8x8 = 1, |
107 | /** Tiled convolution based on 2D Fourier transform with 16x16 blocks. Supports kernels up to 16x16. */ |
108 | nnp_convolution_algorithm_ft16x16 = 2, |
109 | /** Tiled convolution based on 2D Winograd transform F(3x3, 6x6) with 8x8 blocks. Supports only 3x3 kernels. */ |
110 | nnp_convolution_algorithm_wt8x8 = 3, |
111 | /** Direct convolution via implicit GEMM. */ |
112 | nnp_convolution_algorithm_implicit_gemm = 4, |
113 | /** Direct convolution implementation. */ |
114 | nnp_convolution_algorithm_direct = 5, |
115 | /** |
116 | * Tiled convolution based on 2D Winograd transform F(3x3, 6x6) with 8x8 blocks in FP16. |
117 | * Supports only 3x3 kernels. Implemented only for new ARM processors (with NEON-HP), |
118 | * on non-supported processors falls back to nnp_convolution_algorithm_wt8x8. |
119 | */ |
120 | nnp_convolution_algorithm_wt8x8_fp16 = 6, |
121 | }; |
122 | |
/**
 * @brief Strategy for handling transformed kernel coefficients in nnp_convolution_inference.
 *
 * NOTE(review): member semantics below are inferred from the names and from the
 * workspace_buffer/workspace_size contract of nnp_convolution_inference — confirm
 * against the implementation.
 */
enum nnp_convolution_transform_strategy {
	/** Compute the kernel transform as part of the convolution call. */
	nnp_convolution_transform_strategy_compute = 1,
	/** Only precompute transformed kernel coefficients (presumably into the workspace buffer). */
	nnp_convolution_transform_strategy_precompute = 2,
	/** Reuse kernel coefficients precomputed by a previous call. */
	nnp_convolution_transform_strategy_reuse = 3
};
128 | |
129 | /* For backward compatibility */ |
130 | #define nnp_convolution_transform_strategy_block_based nnp_convolution_transform_strategy_compute |
131 | #define nnp_convolution_transform_strategy_tuple_based nnp_convolution_transform_strategy_compute |
132 | |
133 | /** |
134 | * @brief Size of images, kernels, and pooling filters in NNPACK. |
135 | */ |
136 | struct nnp_size { |
137 | /** Width (horizontal size) of an image, kernel, or pooling filter. */ |
138 | size_t width; |
139 | /** Height (vertical size) of an image, kernel, or pooling filter. */ |
140 | size_t height; |
141 | }; |
142 | |
143 | /** |
144 | * @brief Padding of images in NNPACK. |
145 | */ |
146 | struct nnp_padding { |
147 | /** Padding above the image data */ |
148 | size_t top; |
149 | /** Padding on the right of image data */ |
150 | size_t right; |
151 | /** Padding below the image data */ |
152 | size_t bottom; |
153 | /** Padding on the left of image data */ |
154 | size_t left; |
155 | }; |
156 | |
157 | /** |
158 | * @brief Profiling information about time spent in different phases of a function call. |
159 | */ |
160 | struct nnp_profile { |
161 | /** Time spent inside the function call, in seconds. */ |
162 | double total; |
163 | /** Time spend on transformation of the input or input gradient tensor, in seconds. */ |
164 | double input_transform; |
165 | /** Time spend on transformation of the kernel or kernel gradient tensor, in seconds. */ |
166 | double kernel_transform; |
167 | /** Time spend on transformation of the output or output gradient tensor, in seconds. */ |
168 | double output_transform; |
169 | /** Time spend on multiplication-accumulation of transformed coefficients, in seconds. */ |
170 | double block_multiplication; |
171 | }; |
172 | |
/**
 * @brief Initializes the NNPACK library.
 * @details Must be called before any other NNPACK function; calling other functions first
 *          yields nnp_status_uninitialized (see enum nnp_status).
 */
enum nnp_status nnp_initialize(void);
174 | |
/**
 * @brief Deinitializes the NNPACK library, releasing resources acquired by nnp_initialize.
 */
enum nnp_status nnp_deinitialize(void);
176 | |
177 | /** |
178 | * @brief Computes output of a 2D convolutional layer from input and kernel tensors. |
179 | * @details This function targets training of convolutional neural networks and performs forward propagation. |
180 | * It is optimized for moderate minibatch sizes (64-128) and can be inefficient on a small minibatch. |
181 | * For minibatch size 1, use nnp_convolution_inference for optimal performance. |
182 | * @param algorithm The type of algorithm to use for convolution. Possible values are: |
183 | * |
184 | * - nnp_convolution_algorithm_auto -- let the function choose the algorithm. |
185 | * - nnp_convolution_algorithm_ft8x8 -- tiled convolution based on 2D Fourier transform with 8x8 blocks. |
186 | * Supports kernels up to 8x8. |
187 | * - nnp_convolution_algorithm_ft16x16 -- tiled convolution based on 2D Fourier transform with 16x16 blocks. |
188 | * Supports kernels up to 16x16. |
189 | * - nnp_convolution_algorithm_wt8x8 -- tiled convolution based on 2D Winograd transform F(3x3, 6x6). |
190 | * Supports only 3x3 kernels. |
191 | * |
192 | * @param batch_size The number of images on the input and output of the convolutional layer. |
193 | * @param input_channels The number of channels (AKA features, dimensions) in the input images. |
194 | * @param output_channels The number of channels (AKA features, dimensions) in the output images. |
195 | * @param input_size Size of input images, excluding implicit zero-padding. |
196 | * @param input_padding Implicit zero-padding of input images. |
197 | * @param kernel_size Kernel size. |
198 | * @param[in] input A 4D tensor input[batch_size][input_channels][input_size.height][input_size.width]. |
199 | * @param[in] kernel A 4D tensor kernel[output_channels][input_channels][kernel_size.height][kernel_size.width]. |
200 | * @param[in] bias A 1D array bias[output_channels]. |
201 | * @param[out] output A 4D tensor output[batch_size][output_channels][output_size.height][output_size.width] where |
202 | * output_size.height = (input_padding.top + input_size.height + input_padding.bottom) - |
203 | * (kernel_size.height - 1) |
204 | * output_size.width = (input_padding.left + input_size.width + input_padding.right) - |
205 | * (kernel_size.width - 1) |
206 | * @param threadpool A thread pool for parallelization of the computation. |
207 | * If threadpool is NULL, the computation would run on the caller thread without parallelization. |
208 | * @param[out] profile An optional pointer to profiling structure. |
209 | * If provided, the structure would record time spent in different phases of the computation. |
210 | */ |
211 | |
212 | enum nnp_status nnp_convolution_output( |
213 | enum nnp_convolution_algorithm algorithm, |
214 | size_t batch_size, |
215 | size_t input_channels, |
216 | size_t output_channels, |
217 | struct nnp_size input_size, |
218 | struct nnp_padding input_padding, |
219 | struct nnp_size kernel_size, |
220 | const float* input, |
221 | const float* kernel, |
222 | const float* bias, |
223 | float* output, |
224 | void* workspace_buffer, |
225 | size_t* workspace_size, |
226 | enum nnp_activation activation, |
227 | const void* activation_parameters, |
228 | pthreadpool_t threadpool, |
229 | struct nnp_profile* profile); |
230 | |
231 | /** |
232 | * @brief Computes gradient of input of a 2D convolutional layer from gradient of output and kernel tensors. |
233 | * @details This function targets training of convolutional neural networks and performs backward propagation. |
234 | * It is optimized for moderate minibatch sizes (64-128) and can be inefficient on a small minibatch. |
235 | * @param algorithm The type of algorithm to use for convolution. Possible values are: |
236 | * |
237 | * - nnp_convolution_algorithm_auto -- let the function choose the algorithm. |
238 | * - nnp_convolution_algorithm_ft8x8 -- tiled convolution based on 2D Fourier transform with 8x8 blocks. |
239 | * Supports kernels up to 8x8. |
240 | * - nnp_convolution_algorithm_ft16x16 -- tiled convolution based on 2D Fourier transform with 16x16 blocks. |
241 | * Supports kernels up to 16x16. |
242 | * - nnp_convolution_algorithm_wt8x8 -- tiled convolution based on 2D Winograd transform F(3x3, 6x6). |
243 | * Supports only 3x3 kernels. |
244 | * |
245 | * @param batch_size The number of images (and their gradients) on the input and output of the convolutional layer. |
246 | * @param input_channels The number of channels (AKA features, dimensions) in the input images (and gradients). |
247 | * @param output_channels The number of channels (AKA features, dimensions) in the output images (and gradients). |
248 | * @param input_size Size of input images and their gradients, excluding implicit zero-padding. |
249 | * @param input_padding Implicit zero-padding of input images. |
250 | * @param kernel_size Kernel size. |
251 | * @param[in] grad_output A 4D tensor grad_output[batch_size][output_channels][output_size.height][output_size.width] |
252 | * where |
253 | * output_size.height = (input_padding.top + input_size.height + input_padding.bottom) - |
254 | * (kernel_size.height - 1) |
255 | * output_size.width = (input_padding.left + input_size.width + input_padding.right) - |
256 | * (kernel_size.width - 1) |
257 | * @param[in] kernel A 4D tensor kernel[output_channels][input_channels][kernel_size.height][kernel_size.width]. |
258 | * @param[out] grad_input A 4D tensor grad_input[batch_size][input_channels][input_size.height][input_size.width]. |
259 | * @param threadpool A thread pool for parallelization of the computation. |
260 | * If threadpool is NULL, the computation would run on the caller thread without parallelization. |
261 | * @param[out] profile An optional pointer to profiling structure. |
262 | * If provided, the structure would record time spent in different phases of the computation. |
263 | */ |
264 | enum nnp_status nnp_convolution_input_gradient( |
265 | enum nnp_convolution_algorithm algorithm, |
266 | size_t batch_size, |
267 | size_t input_channels, |
268 | size_t output_channels, |
269 | struct nnp_size input_size, |
270 | struct nnp_padding input_padding, |
271 | struct nnp_size kernel_size, |
272 | const float* grad_output, |
273 | const float* kernel, |
274 | float* grad_input, |
275 | void* workspace_buffer, |
276 | size_t* workspace_size, |
277 | enum nnp_activation activation, |
278 | const void* activation_parameters, |
279 | pthreadpool_t threadpool, |
280 | struct nnp_profile* profile); |
281 | |
282 | /** |
283 | * @brief Computes gradient of kernel of a 2D convolutional layer from gradient of output and input tensors. |
284 | * @details This function targets training of convolutional neural networks and performs backward propagation. |
285 | * It is optimized for moderate minibatch sizes (64-128) and can be inefficient on a small minibatch. |
286 | * @param algorithm The type of algorithm to use for convolution. Possible values are: |
287 | * |
288 | * - nnp_convolution_algorithm_auto -- let the function choose the algorithm. |
289 | * - nnp_convolution_algorithm_ft8x8 -- tiled convolution based on 2D Fourier transform with 8x8 blocks. |
290 | * Supports kernels up to 8x8. |
291 | * - nnp_convolution_algorithm_ft16x16 -- tiled convolution based on 2D Fourier transform with 16x16 blocks. |
292 | * Supports kernels up to 16x16. |
293 | * |
294 | * @param batch_size The number of images (and their gradients) on the input and output of the convolutional layer. |
295 | * @param input_channels The number of channels (AKA features, dimensions) in the input images. |
296 | * @param output_channels The number of channels (AKA features, dimensions) in the output images (and gradients). |
297 | * @param input_size Size of input images and their gradients, excluding implicit zero-padding. |
298 | * @param input_padding Implicit zero-padding of input images. |
299 | * @param kernel_size Kernel size. |
300 | * @param[in] input A 4D tensor input[batch_size][input_channels][input_size.height][input_size.width]. |
301 | * @param[in] grad_output A 4D tensor grad_output[batch_size][output_channels][output_size.height][output_size.width] |
302 | * where |
303 | * output_size.height = (input_padding.top + input_size.height + input_padding.bottom) - |
304 | * (kernel_size.height - 1) |
305 | * output_size.width = (input_padding.left + input_size.width + input_padding.right) - |
306 | * (kernel_size.width - 1) |
307 | * @param[out] grad_kernel A 4D tensor |
308 | * grad_kernel[output_channels][input_channels][kernel_size.height][kernel_size.width]. |
309 | * @param threadpool A thread pool for parallelization of the computation. |
310 | * If threadpool is NULL, the computation would run on the caller thread without parallelization. |
311 | * @param[out] profile An optional pointer to profiling structure. |
312 | * If provided, the structure would record time spent in different phases of the computation. |
313 | */ |
314 | enum nnp_status nnp_convolution_kernel_gradient( |
315 | enum nnp_convolution_algorithm algorithm, |
316 | size_t batch_size, |
317 | size_t input_channels, |
318 | size_t output_channels, |
319 | struct nnp_size input_size, |
320 | struct nnp_padding input_padding, |
321 | struct nnp_size kernel_size, |
322 | const float* input, |
323 | const float* grad_output, |
324 | float* grad_kernel, |
325 | void* workspace_buffer, |
326 | size_t* workspace_size, |
327 | enum nnp_activation activation, |
328 | const void* activation_parameters, |
329 | pthreadpool_t threadpool, |
330 | struct nnp_profile* profile); |
331 | |
332 | /** |
333 | * @brief Computes output of a 2D convolutional layer for a single input image and a kernel tensor. |
334 | * @details This function targets prediction with convolutional neural networks and performs forward propagation. |
335 | * @param algorithm The type of algorithm to use for convolution. Possible values are: |
336 | * |
337 | * - nnp_convolution_algorithm_auto -- let the function choose the algorithm. |
338 | * - nnp_convolution_algorithm_ft8x8 -- tiled convolution based on 2D Fourier transform with 8x8 blocks. |
339 | * Supports kernels up to 8x8. |
340 | * - nnp_convolution_algorithm_ft16x16 -- tiled convolution based on 2D Fourier transform with 16x16 blocks. |
341 | * Supports kernels up to 16x16. |
342 | * - nnp_convolution_algorithm_wt8x8 -- tiled convolution based on 2D Winograd transform F(3x3, 6x6). |
343 | * Supports only 3x3 kernels. |
344 | * |
345 | * @param transform_strategy A strategy that guides computation of kernel transforms coefficients. |
346 | * Possible values are: |
347 | * |
348 | * - nnp_convolution_transform_strategy_block_based -- do multiplication-accumulations on blocks of transformed |
349 | * coefficients. |
350 | * - nnp_convolution_transform_strategy_tuple_based -- do multiplication-accumulations on tuples of transformed |
351 | * coefficients. |
352 | * |
353 | * @param input_channels The number of channels (AKA features, dimensions) in the input image. |
354 | * @param output_channels The number of channels (AKA features, dimensions) in the output image. |
355 | * @param input_size Size of input image, excluding implicit zero-padding. |
356 | * @param input_padding Implicit zero-padding of input image. |
357 | * @param kernel_size Kernel size. |
358 | * @param output_subsampling Subsample region for output, also known as convolution stride. |
359 | * @param[in] input A 3D tensor input[input_channels][input_size.height][input_size.width]. |
360 | * @param[in] kernel A 4D tensor kernel[output_channels][input_channels][kernel_size.height][kernel_size.width]. |
361 | * @param[in] bias A 1D array bias[output_channels]. |
362 | * @param[out] output A 3D tensor output[output_channels][output_size.height][output_size.width] where |
363 | * output_size.height = (input_padding.top + input_size.height + input_padding.bottom) - |
364 | * (kernel_size.height - 1) |
365 | * output_size.width = (input_padding.left + input_size.width + input_padding.right) - |
366 | * (kernel_size.width - 1) |
367 | * @param[in] workspace_buffer Buffer for scratch memory used during computation. Buffer must be aligned on 64 bytes. |
368 | * If workspace_buffer is NULL and workspace_size is non-NULL, NNPACK would store the size |
369 | * of required workspace memory at the workspace_size location, and exit without |
370 | * computations. |
371 | * If workspace_buffer is NULL and workspace_size is NULL, NNPACK would allocate memory |
372 | * before and deallocate after this computation, potentially at significant runtime cost. |
373 | * @param[in,out] workspace_size Pointer to the size of workspace buffer. |
374 | * If workspace_buffer is NULL, NNPACK will write the size of required scratch memory to |
375 | * the location specified by this pointer. |
376 | * If workspace_buffer is non-NULL, NNPACK expects workspace_size to specify the size of |
377 | * the buffer, in bytes. |
378 | * If workspace_size is NULL, workspace_buffer must be NULL as well. In this case NNPACK |
379 | * would allocate memory before and deallocate after this computation, potentially at |
380 | * significant runtime cost. |
381 | * @param threadpool A thread pool for parallelization of the computation. |
382 | * If threadpool is NULL, the computation would run on the caller thread without parallelization. |
383 | * @param[out] profile An optional pointer to profiling structure. |
384 | * If provided, the structure would record time spent in different phases of the computation. |
385 | */ |
386 | enum nnp_status nnp_convolution_inference( |
387 | enum nnp_convolution_algorithm algorithm, |
388 | enum nnp_convolution_transform_strategy transform_strategy, |
389 | size_t input_channels, |
390 | size_t output_channels, |
391 | struct nnp_size input_size, |
392 | struct nnp_padding input_padding, |
393 | struct nnp_size kernel_size, |
394 | struct nnp_size output_subsampling, |
395 | const float* input, |
396 | const float* kernel, |
397 | const float* bias, |
398 | float* output, |
399 | void* workspace_buffer, |
400 | size_t* workspace_size, |
401 | enum nnp_activation activation, |
402 | const void* activation_parameters, |
403 | pthreadpool_t threadpool, |
404 | struct nnp_profile* profile); |
405 | |
406 | /** |
407 | * @brief Computes output of a fully connected layer from input and kernel matrices. |
408 | * @details This function targets training of convolutional neural networks and performs forward propagation. |
409 | * It is optimized for moderate minibatch sizes (64-128) and can be inefficient on a small minibatch. |
410 | * For minibatch size 1, use nnp_fully_connected_inference for optimal performance. |
411 | * @param batch_size The number of vectors on the input and output of the fully connected layer. |
412 | * @param input_channels The number of channels (AKA features, dimensions) in the input matrix. |
413 | * @param output_channels The number of channels (AKA features, dimensions) in the output matrix. |
414 | * @param[in] input A 2D matrix input[batch_size][input_channels]. |
415 | * @param[in] kernel A 2D matrix kernel[output_channels][input_channels]. |
416 | * @param[out] output A 2D matrix output[batch_size][output_channels]. |
417 | * @param threadpool A thread pool for parallelization of the computation. |
418 | * If threadpool is NULL, the computation would run on the caller thread without parallelization. |
419 | */ |
420 | enum nnp_status nnp_fully_connected_output( |
421 | size_t batch_size, |
422 | size_t input_channels, |
423 | size_t output_channels, |
424 | const float input[], |
425 | const float kernel[], |
426 | float output[], |
427 | pthreadpool_t threadpool, |
428 | struct nnp_profile* profile); |
429 | |
430 | /** |
431 | * @brief Computes output of a fully connected layer for a single input vector and a kernel matrix. |
432 | * @details This function targets prediction with convolutional neural networks and performs forward propagation. |
433 | * @param input_channels The number of channels (AKA features, dimensions) in the input vector. |
434 | * @param output_channels The number of channels (AKA features, dimensions) in the output vector. |
435 | * @param[in] input A 1D array input[input_channels] of FP32 elements. |
436 | * @param[in] kernel A 2D matrix kernel[output_channels][input_channels] of FP32 elements. |
437 | * @param[out] output A 1D array output[output_channels] of FP32 elements. |
438 | * @param threadpool A thread pool for parallelization of the computation. |
439 | * If threadpool is NULL, the computation would run on the caller thread without parallelization. |
440 | */ |
441 | enum nnp_status nnp_fully_connected_inference( |
442 | size_t input_channels, |
443 | size_t output_channels, |
444 | const float* input, |
445 | const float* kernel, |
446 | float* output, |
447 | pthreadpool_t threadpool); |
448 | |
449 | /** |
450 | * @brief Computes output of a fully connected layer for a single input vector and a kernel matrix. |
451 | * @details This function targets prediction with convolutional neural networks and performs forward propagation. |
452 | * @param input_channels The number of channels (AKA features, dimensions) in the input vector. |
453 | * @param output_channels The number of channels (AKA features, dimensions) in the output vector. |
454 | * @param[in] input A 1D array input[input_channels] of FP32 elements. |
455 | * @param[in] kernel A 2D matrix kernel[output_channels][input_channels] of FP16 (ARM alternative format) elements. |
456 | * @param[out] output A 1D array output[output_channels] of FP32 elements. |
457 | * @param threadpool A thread pool for parallelization of the computation. |
458 | * If threadpool is NULL, the computation would run on the caller thread without parallelization. |
459 | */ |
460 | enum nnp_status nnp_fully_connected_inference_f16f32( |
461 | size_t input_channels, |
462 | size_t output_channels, |
463 | const float* input, |
464 | const void* kernel, |
465 | float* output, |
466 | pthreadpool_t threadpool); |
467 | |
468 | /** |
469 | * @brief Computes output of a max-pooling layer for an input tensor. |
470 | * @details This function targets both prediction and training of convolutional neural networks and performs forward |
471 | * propagation. Is is optimized for both large and small minibatch sizes. |
472 | * @param batch_size The number of images on the input and output of the max-pooling layer. |
473 | * @param channels The number of channels (AKA features, dimensions) in both input and output images. |
474 | * @param input_size Size of input images, excluding implicit zero-padding. |
475 | * @param input_padding Implicit padding of input images. The padding pixels are ignored by the pooling filter, but |
476 | * affect the output size. |
477 | * @param pooling_size Size of the pooling filter. Only 2x2 filter are currently supported. |
478 | * @param pooling_stride Stride of the pooling filter. Only 2x2 strides are currently supported. |
479 | * @param[in] input A 4D tensor input[batch_size][channels][input_size.height][input_size.width]. |
480 | * @param[out] output A 4D tensor output[batch_size][channels][output_size.height][output_size.width] where |
481 | * output_size.height = ceil( |
482 | * (input_padding.top + input_size.height + input_padding.bottom - pooling_size.height) / |
483 | * pooling_stride.height) + 1 |
484 | * output_size.width = ceil( |
485 | * (input_padding.left + input_size.width + input_padding.right - pooling_size.width) / |
486 | * pooling_stride.width) + 1 |
487 | * @param threadpool A thread pool for parallelization of the computation. |
488 | * If threadpool is NULL, the computation would run on the caller thread without parallelization. |
489 | */ |
490 | enum nnp_status nnp_max_pooling_output( |
491 | size_t batch_size, |
492 | size_t channels, |
493 | struct nnp_size input_size, |
494 | struct nnp_padding input_padding, |
495 | struct nnp_size pooling_size, |
496 | struct nnp_size pooling_stride, |
497 | const float input[], |
498 | float output[], |
499 | pthreadpool_t threadpool); |
500 | |
501 | /** |
502 | * @brief Computes output of a softmax layer for an input matrix. |
503 | * @details This function targets both prediction and training of convolutional neural networks and performs forward |
504 | * propagation. Is is optimized for both large and small minibatch sizes. |
505 | * @param batch_size The number of vectors on the input and output of the softmax layer. |
506 | * @param channels The number of channels (AKA features, dimensions) in both input and output vectors. |
507 | * @param[in] input A 2D matrix input[batch_size][channels]. |
508 | * @param[out] output A 2D matrix output[batch_size][channels]. |
509 | * @param threadpool A thread pool for parallelization of the computation. |
510 | * If threadpool is NULL, the computation would run on the caller thread without parallelization. |
511 | */ |
512 | enum nnp_status nnp_softmax_output( |
513 | size_t batch_size, |
514 | size_t channels, |
515 | const float input[], |
516 | float output[], |
517 | pthreadpool_t threadpool); |
518 | |
519 | /** |
520 | * @brief Computes output of a rectified linear unit (ReLU) layer for an input matrix. |
521 | * @details This function targets both prediction and training of convolutional neural networks and performs forward |
522 | * propagation. Is is optimized for both large and small minibatch sizes. |
523 | * @param batch_size The number of vectors on the input and output of the ReLU layer. |
524 | * @param channels The number of channels (AKA features, dimensions) in both input and output matrices. |
525 | * @param[in] input A 2D matrix input[batch_size][channels]. |
526 | * @param[out] output A 2D matrix output[batch_size][channels]. |
527 | * @param threadpool A thread pool for parallelization of the computation. |
528 | * If threadpool is NULL, the computation would run on the caller thread without parallelization. |
529 | */ |
530 | enum nnp_status nnp_relu_output( |
531 | size_t batch_size, |
532 | size_t channels, |
533 | const float input[], |
534 | float output[], |
535 | float negative_slope, |
536 | pthreadpool_t threadpool); |
537 | |
538 | /** |
539 | * @brief Computes gradient of input of a rectified linear unit (ReLU) layer from gradient of output and input matrices. |
540 | * @details This function targets training of convolutional neural networks and performs backward propagation. |
541 | * Is is optimized for both large and small minibatch sizes. |
542 | * @param batch_size The number of vectors on the input and output of the ReLU layer. |
543 | * @param channels The number of channels (AKA features, dimensions) in both input and output matrices. |
544 | * @param[in] input A 2D matrix input[batch_size][channels]. |
545 | * @param[out] output A 2D matrix output[batch_size][channels]. |
546 | * @param threadpool A thread pool for parallelization of the computation. |
547 | * If threadpool is NULL, the computation would run on the caller thread without parallelization. |
548 | */ |
549 | enum nnp_status nnp_relu_input_gradient( |
550 | size_t batch_size, |
551 | size_t channels, |
552 | const float grad_output[], |
553 | const float input[], |
554 | float grad_input[], |
555 | float negative_slope, |
556 | pthreadpool_t threadpool); |
557 | |
558 | #ifdef __cplusplus |
559 | } /* extern "C" */ |
560 | #endif |
561 | |
562 | #ifdef __cplusplus |
563 | // Backward compatible implementations for nnp_convolution_*, if we are in C++ |
564 | // mode. |
565 | inline enum nnp_status nnp_convolution_output( |
566 | enum nnp_convolution_algorithm algorithm, |
567 | size_t batch_size, |
568 | size_t input_channels, |
569 | size_t output_channels, |
570 | struct nnp_size input_size, |
571 | struct nnp_padding input_padding, |
572 | struct nnp_size kernel_size, |
573 | const float input[], |
574 | const float kernel[], |
575 | const float bias[], |
576 | float output[], |
577 | pthreadpool_t threadpool, |
578 | struct nnp_profile* profile) |
579 | { |
580 | return nnp_convolution_output( |
581 | algorithm, |
582 | batch_size, input_channels, output_channels, |
583 | input_size, input_padding, kernel_size, |
584 | input, kernel, bias, output, |
585 | NULL, NULL, |
586 | nnp_activation_identity, NULL, threadpool, profile); |
587 | } |
588 | |
589 | inline enum nnp_status nnp_convolution_input_gradient( |
590 | enum nnp_convolution_algorithm algorithm, |
591 | size_t batch_size, |
592 | size_t input_channels, |
593 | size_t output_channels, |
594 | struct nnp_size input_size, |
595 | struct nnp_padding input_padding, |
596 | struct nnp_size kernel_size, |
597 | const float grad_output[], |
598 | const float kernel[], |
599 | float grad_input[], |
600 | pthreadpool_t threadpool, |
601 | struct nnp_profile* profile) |
602 | { |
603 | return nnp_convolution_input_gradient( |
604 | algorithm, |
605 | batch_size, input_channels, output_channels, |
606 | input_size, input_padding, kernel_size, |
607 | grad_output, kernel, grad_input, |
608 | NULL, NULL, |
609 | nnp_activation_identity, NULL, threadpool, profile); |
610 | } |
611 | |
612 | inline enum nnp_status nnp_convolution_kernel_gradient( |
613 | enum nnp_convolution_algorithm algorithm, |
614 | size_t batch_size, |
615 | size_t input_channels, |
616 | size_t output_channels, |
617 | struct nnp_size input_size, |
618 | struct nnp_padding input_padding, |
619 | struct nnp_size kernel_size, |
620 | const float input[], |
621 | const float grad_output[], |
622 | float grad_kernel[], |
623 | pthreadpool_t threadpool, |
624 | struct nnp_profile* profile) |
625 | { |
626 | return nnp_convolution_kernel_gradient( |
627 | algorithm, |
628 | batch_size, input_channels, output_channels, |
629 | input_size, input_padding, kernel_size, |
630 | input, grad_output, grad_kernel, |
631 | NULL, NULL, |
632 | nnp_activation_identity, NULL, threadpool, profile); |
633 | } |
634 | |
635 | inline enum nnp_status nnp_convolution_inference( |
636 | enum nnp_convolution_algorithm algorithm, |
637 | enum nnp_convolution_transform_strategy transform_strategy, |
638 | size_t input_channels, |
639 | size_t output_channels, |
640 | struct nnp_size input_size, |
641 | struct nnp_padding input_padding, |
642 | struct nnp_size kernel_size, |
643 | struct nnp_size output_subsampling, |
644 | const float input[], |
645 | const float kernel[], |
646 | const float bias[], |
647 | float output[], |
648 | pthreadpool_t threadpool, |
649 | struct nnp_profile* profile) { |
650 | return nnp_convolution_inference( |
651 | algorithm, transform_strategy, |
652 | input_channels, output_channels, |
653 | input_size, input_padding, kernel_size, output_subsampling, |
654 | input, kernel, bias, output, NULL, NULL, |
655 | nnp_activation_identity, NULL, |
656 | threadpool, profile); |
657 | } |
658 | |
659 | #endif // __cplusplus |
660 | |