1#pragma once
2
3#include <stddef.h>
4#include <stdint.h>
5#include <stdbool.h>
6
7#include <pthreadpool.h>
8
9#ifdef __cplusplus
10extern "C" {
11#endif
12
13/**
14 * @brief Status code for any NNPACK function call.
15 */
16enum nnp_status {
17 /** The call succeeded, and all output arguments now contain valid data. */
18 nnp_status_success = 0,
19 /** NNPACK function was called with batch_size == 0. */
20 nnp_status_invalid_batch_size = 2,
21 /** NNPACK function was called with channels == 0. */
22 nnp_status_invalid_channels = 3,
23 /** NNPACK function was called with input_channels == 0. */
24 nnp_status_invalid_input_channels = 4,
25 /** NNPACK function was called with output_channels == 0. */
26 nnp_status_invalid_output_channels = 5,
27 /** NNPACK function was called with input_size.height == 0 or input_size.width == 0 */
28 nnp_status_invalid_input_size = 10,
29 /** NNPACK function was called with input_stride.height == 0 or input_stride.width == 0 */
30 nnp_status_invalid_input_stride = 11,
31 /** NNPACK function was called with input_padding not less than respective kernel (or pooling) size, i.e.:
32 *
33 * - input_padding.left >= kernel_size.width (>= pooling_size.width)
34 * - input_padding.right >= kernel_size.width (>= pooling_size.width)
35 * - input_padding.top >= kernel_size.height (>= pooling_size.height)
36 * - input_padding.bottom >= kernel_size.height (>= pooling_size.height)
37 */
38 nnp_status_invalid_input_padding = 12,
39 /** NNPACK function was called with kernel_size.height == 0 or kernel_size.width == 0 */
40 nnp_status_invalid_kernel_size = 13,
41 /** NNPACK function was called with pooling_size.height == 0 or pooling_size.width == 0 */
42 nnp_status_invalid_pooling_size = 14,
43 /** NNPACK function was called with pooling_stride.height == 0 or pooling_stride.width == 0 */
44 nnp_status_invalid_pooling_stride = 15,
45 /** NNPACK function was called with convolution algorithm not in nnp_convolution_algorithm enumeration */
46 nnp_status_invalid_algorithm = 16,
47 /** NNPACK function was called with convolution transform strategy not in nnp_convolution_transform_strategy enum */
48 nnp_status_invalid_transform_strategy = 17,
49 /** NNPACK function was called with output_subsampling.height == 0 or output_subsampling.width == 0 */
50 nnp_status_invalid_output_subsampling = 13,
51 /** NNPACK function was called with activation not in nnp_activation enum */
52 nnp_status_invalid_activation = 14,
53 /** NNPACK function was called with invalid activation parameters */
54 nnp_status_invalid_activation_parameters = 15,
55
56 /** NNPACK does not support the particular input size for the function */
57 nnp_status_unsupported_input_size = 20,
58 /** NNPACK does not support the particular input stride for the function */
59 nnp_status_unsupported_input_stride = 21,
60 /** NNPACK does not support the particular input padding for the function */
61 nnp_status_unsupported_input_padding = 22,
62 /** NNPACK does not support the particular kernel size for the function */
63 nnp_status_unsupported_kernel_size = 23,
64 /** NNPACK does not support the particular pooling size for the function */
65 nnp_status_unsupported_pooling_size = 24,
66 /** NNPACK does not support the particular pooling stride for the function */
67 nnp_status_unsupported_pooling_stride = 25,
68 /** NNPACK does not support the particular convolution algorithm for the function */
69 nnp_status_unsupported_algorithm = 26,
70 /** NNPACK does not support the particular convolution transform strategy for the algorithm */
71 nnp_status_unsupported_transform_strategy = 27,
72 /** NNPACK does not support the particular activation function for the function */
73 nnp_status_unsupported_activation = 28,
74 /** NNPACK does not support the particular activation function parameters for the function */
75 nnp_status_unsupported_activation_parameters = 29,
76
77 /** NNPACK function was called before the library was initialized */
78 nnp_status_uninitialized = 50,
79 /** NNPACK does not implement this function for the host CPU */
80 nnp_status_unsupported_hardware = 51,
81 /** NNPACK failed to allocate memory for temporary buffers */
82 nnp_status_out_of_memory = 52,
83 /** Scratch space buffer is too small */
84 nnp_status_insufficient_buffer = 53,
85 /** Scratch space buffer is not properly aligned */
86 nnp_status_misaligned_buffer = 54
87};
88
89/**
90 * @brief Activation applied applied after a convolutional or fully-connected layer.
91 */
92enum nnp_activation {
93 /** Identity activation f(x) := x, i.e. no transformation */
94 nnp_activation_identity = 0,
95 /** ReLU activation f(x) := max(0, x) */
96 nnp_activation_relu = 1,
97};
98
99/**
100 * @brief Algorithm for computing convolutional layers.
101 */
102enum nnp_convolution_algorithm {
103 /** Let NNPACK choose the algorithm depending on layer parameters */
104 nnp_convolution_algorithm_auto = 0,
105 /** Tiled convolution based on 2D Fourier transform with 8x8 blocks. Supports kernels up to 8x8. */
106 nnp_convolution_algorithm_ft8x8 = 1,
107 /** Tiled convolution based on 2D Fourier transform with 16x16 blocks. Supports kernels up to 16x16. */
108 nnp_convolution_algorithm_ft16x16 = 2,
109 /** Tiled convolution based on 2D Winograd transform F(3x3, 6x6) with 8x8 blocks. Supports only 3x3 kernels. */
110 nnp_convolution_algorithm_wt8x8 = 3,
111 /** Direct convolution via implicit GEMM. */
112 nnp_convolution_algorithm_implicit_gemm = 4,
113 /** Direct convolution implementation. */
114 nnp_convolution_algorithm_direct = 5,
115 /**
116 * Tiled convolution based on 2D Winograd transform F(3x3, 6x6) with 8x8 blocks in FP16.
117 * Supports only 3x3 kernels. Implemented only for new ARM processors (with NEON-HP),
118 * on non-supported processors falls back to nnp_convolution_algorithm_wt8x8.
119 */
120 nnp_convolution_algorithm_wt8x8_fp16 = 6,
121};
122
/**
 * @brief Strategy for handling transformed kernel coefficients in convolution functions.
 *
 * NOTE(review): enumerator semantics inferred from names — confirm against the implementation:
 * compute presumably transforms kernels during the call, precompute presumably only produces
 * transformed coefficients for later calls, and reuse presumably consumes previously
 * precomputed coefficients.
 */
enum nnp_convolution_transform_strategy {
	nnp_convolution_transform_strategy_compute = 1,
	nnp_convolution_transform_strategy_precompute = 2,
	nnp_convolution_transform_strategy_reuse = 3
};

/* For backward compatibility: older releases exposed block_based/tuple_based strategies,
 * which now both map to the compute strategy. */
#define nnp_convolution_transform_strategy_block_based nnp_convolution_transform_strategy_compute
#define nnp_convolution_transform_strategy_tuple_based nnp_convolution_transform_strategy_compute
132
133/**
134 * @brief Size of images, kernels, and pooling filters in NNPACK.
135 */
136struct nnp_size {
137 /** Width (horizontal size) of an image, kernel, or pooling filter. */
138 size_t width;
139 /** Height (vertical size) of an image, kernel, or pooling filter. */
140 size_t height;
141};
142
143/**
144 * @brief Padding of images in NNPACK.
145 */
146struct nnp_padding {
147 /** Padding above the image data */
148 size_t top;
149 /** Padding on the right of image data */
150 size_t right;
151 /** Padding below the image data */
152 size_t bottom;
153 /** Padding on the left of image data */
154 size_t left;
155};
156
157/**
158 * @brief Profiling information about time spent in different phases of a function call.
159 */
160struct nnp_profile {
161 /** Time spent inside the function call, in seconds. */
162 double total;
163 /** Time spend on transformation of the input or input gradient tensor, in seconds. */
164 double input_transform;
165 /** Time spend on transformation of the kernel or kernel gradient tensor, in seconds. */
166 double kernel_transform;
167 /** Time spend on transformation of the output or output gradient tensor, in seconds. */
168 double output_transform;
169 /** Time spend on multiplication-accumulation of transformed coefficients, in seconds. */
170 double block_multiplication;
171};
172
/**
 * @brief Initializes the NNPACK library.
 * @details Must be called before any other NNPACK function; otherwise those functions
 *          report nnp_status_uninitialized (see enum nnp_status).
 * @return A status code from enum nnp_status (nnp_status_success on success).
 */
enum nnp_status nnp_initialize(void);

/**
 * @brief Deinitializes the NNPACK library, presumably releasing resources acquired by
 *        nnp_initialize (NOTE(review): confirm against implementation).
 * @return A status code from enum nnp_status.
 */
enum nnp_status nnp_deinitialize(void);
176
177/**
178 * @brief Computes output of a 2D convolutional layer from input and kernel tensors.
179 * @details This function targets training of convolutional neural networks and performs forward propagation.
180 * It is optimized for moderate minibatch sizes (64-128) and can be inefficient on a small minibatch.
181 * For minibatch size 1, use nnp_convolution_inference for optimal performance.
182 * @param algorithm The type of algorithm to use for convolution. Possible values are:
183 *
184 * - nnp_convolution_algorithm_auto -- let the function choose the algorithm.
185 * - nnp_convolution_algorithm_ft8x8 -- tiled convolution based on 2D Fourier transform with 8x8 blocks.
186 * Supports kernels up to 8x8.
187 * - nnp_convolution_algorithm_ft16x16 -- tiled convolution based on 2D Fourier transform with 16x16 blocks.
188 * Supports kernels up to 16x16.
189 * - nnp_convolution_algorithm_wt8x8 -- tiled convolution based on 2D Winograd transform F(3x3, 6x6).
190 * Supports only 3x3 kernels.
191 *
192 * @param batch_size The number of images on the input and output of the convolutional layer.
193 * @param input_channels The number of channels (AKA features, dimensions) in the input images.
194 * @param output_channels The number of channels (AKA features, dimensions) in the output images.
195 * @param input_size Size of input images, excluding implicit zero-padding.
196 * @param input_padding Implicit zero-padding of input images.
197 * @param kernel_size Kernel size.
198 * @param[in] input A 4D tensor input[batch_size][input_channels][input_size.height][input_size.width].
199 * @param[in] kernel A 4D tensor kernel[output_channels][input_channels][kernel_size.height][kernel_size.width].
200 * @param[in] bias A 1D array bias[output_channels].
201 * @param[out] output A 4D tensor output[batch_size][output_channels][output_size.height][output_size.width] where
202 * output_size.height = (input_padding.top + input_size.height + input_padding.bottom) -
203 * (kernel_size.height - 1)
204 * output_size.width = (input_padding.left + input_size.width + input_padding.right) -
205 * (kernel_size.width - 1)
206 * @param threadpool A thread pool for parallelization of the computation.
207 * If threadpool is NULL, the computation would run on the caller thread without parallelization.
208 * @param[out] profile An optional pointer to profiling structure.
209 * If provided, the structure would record time spent in different phases of the computation.
210 */
211
212enum nnp_status nnp_convolution_output(
213 enum nnp_convolution_algorithm algorithm,
214 size_t batch_size,
215 size_t input_channels,
216 size_t output_channels,
217 struct nnp_size input_size,
218 struct nnp_padding input_padding,
219 struct nnp_size kernel_size,
220 const float* input,
221 const float* kernel,
222 const float* bias,
223 float* output,
224 void* workspace_buffer,
225 size_t* workspace_size,
226 enum nnp_activation activation,
227 const void* activation_parameters,
228 pthreadpool_t threadpool,
229 struct nnp_profile* profile);
230
231/**
232 * @brief Computes gradient of input of a 2D convolutional layer from gradient of output and kernel tensors.
233 * @details This function targets training of convolutional neural networks and performs backward propagation.
234 * It is optimized for moderate minibatch sizes (64-128) and can be inefficient on a small minibatch.
235 * @param algorithm The type of algorithm to use for convolution. Possible values are:
236 *
237 * - nnp_convolution_algorithm_auto -- let the function choose the algorithm.
238 * - nnp_convolution_algorithm_ft8x8 -- tiled convolution based on 2D Fourier transform with 8x8 blocks.
239 * Supports kernels up to 8x8.
240 * - nnp_convolution_algorithm_ft16x16 -- tiled convolution based on 2D Fourier transform with 16x16 blocks.
241 * Supports kernels up to 16x16.
242 * - nnp_convolution_algorithm_wt8x8 -- tiled convolution based on 2D Winograd transform F(3x3, 6x6).
243 * Supports only 3x3 kernels.
244 *
245 * @param batch_size The number of images (and their gradients) on the input and output of the convolutional layer.
246 * @param input_channels The number of channels (AKA features, dimensions) in the input images (and gradients).
247 * @param output_channels The number of channels (AKA features, dimensions) in the output images (and gradients).
248 * @param input_size Size of input images and their gradients, excluding implicit zero-padding.
249 * @param input_padding Implicit zero-padding of input images.
250 * @param kernel_size Kernel size.
251 * @param[in] grad_output A 4D tensor grad_output[batch_size][output_channels][output_size.height][output_size.width]
252 * where
253 * output_size.height = (input_padding.top + input_size.height + input_padding.bottom) -
254 * (kernel_size.height - 1)
255 * output_size.width = (input_padding.left + input_size.width + input_padding.right) -
256 * (kernel_size.width - 1)
257 * @param[in] kernel A 4D tensor kernel[output_channels][input_channels][kernel_size.height][kernel_size.width].
258 * @param[out] grad_input A 4D tensor grad_input[batch_size][input_channels][input_size.height][input_size.width].
259 * @param threadpool A thread pool for parallelization of the computation.
260 * If threadpool is NULL, the computation would run on the caller thread without parallelization.
261 * @param[out] profile An optional pointer to profiling structure.
262 * If provided, the structure would record time spent in different phases of the computation.
263 */
264enum nnp_status nnp_convolution_input_gradient(
265 enum nnp_convolution_algorithm algorithm,
266 size_t batch_size,
267 size_t input_channels,
268 size_t output_channels,
269 struct nnp_size input_size,
270 struct nnp_padding input_padding,
271 struct nnp_size kernel_size,
272 const float* grad_output,
273 const float* kernel,
274 float* grad_input,
275 void* workspace_buffer,
276 size_t* workspace_size,
277 enum nnp_activation activation,
278 const void* activation_parameters,
279 pthreadpool_t threadpool,
280 struct nnp_profile* profile);
281
282/**
283 * @brief Computes gradient of kernel of a 2D convolutional layer from gradient of output and input tensors.
284 * @details This function targets training of convolutional neural networks and performs backward propagation.
285 * It is optimized for moderate minibatch sizes (64-128) and can be inefficient on a small minibatch.
286 * @param algorithm The type of algorithm to use for convolution. Possible values are:
287 *
288 * - nnp_convolution_algorithm_auto -- let the function choose the algorithm.
289 * - nnp_convolution_algorithm_ft8x8 -- tiled convolution based on 2D Fourier transform with 8x8 blocks.
290 * Supports kernels up to 8x8.
291 * - nnp_convolution_algorithm_ft16x16 -- tiled convolution based on 2D Fourier transform with 16x16 blocks.
292 * Supports kernels up to 16x16.
293 *
294 * @param batch_size The number of images (and their gradients) on the input and output of the convolutional layer.
295 * @param input_channels The number of channels (AKA features, dimensions) in the input images.
296 * @param output_channels The number of channels (AKA features, dimensions) in the output images (and gradients).
297 * @param input_size Size of input images and their gradients, excluding implicit zero-padding.
298 * @param input_padding Implicit zero-padding of input images.
299 * @param kernel_size Kernel size.
300 * @param[in] input A 4D tensor input[batch_size][input_channels][input_size.height][input_size.width].
301 * @param[in] grad_output A 4D tensor grad_output[batch_size][output_channels][output_size.height][output_size.width]
302 * where
303 * output_size.height = (input_padding.top + input_size.height + input_padding.bottom) -
304 * (kernel_size.height - 1)
305 * output_size.width = (input_padding.left + input_size.width + input_padding.right) -
306 * (kernel_size.width - 1)
307 * @param[out] grad_kernel A 4D tensor
308 * grad_kernel[output_channels][input_channels][kernel_size.height][kernel_size.width].
309 * @param threadpool A thread pool for parallelization of the computation.
310 * If threadpool is NULL, the computation would run on the caller thread without parallelization.
311 * @param[out] profile An optional pointer to profiling structure.
312 * If provided, the structure would record time spent in different phases of the computation.
313 */
314enum nnp_status nnp_convolution_kernel_gradient(
315 enum nnp_convolution_algorithm algorithm,
316 size_t batch_size,
317 size_t input_channels,
318 size_t output_channels,
319 struct nnp_size input_size,
320 struct nnp_padding input_padding,
321 struct nnp_size kernel_size,
322 const float* input,
323 const float* grad_output,
324 float* grad_kernel,
325 void* workspace_buffer,
326 size_t* workspace_size,
327 enum nnp_activation activation,
328 const void* activation_parameters,
329 pthreadpool_t threadpool,
330 struct nnp_profile* profile);
331
332/**
333 * @brief Computes output of a 2D convolutional layer for a single input image and a kernel tensor.
334 * @details This function targets prediction with convolutional neural networks and performs forward propagation.
335 * @param algorithm The type of algorithm to use for convolution. Possible values are:
336 *
337 * - nnp_convolution_algorithm_auto -- let the function choose the algorithm.
338 * - nnp_convolution_algorithm_ft8x8 -- tiled convolution based on 2D Fourier transform with 8x8 blocks.
339 * Supports kernels up to 8x8.
340 * - nnp_convolution_algorithm_ft16x16 -- tiled convolution based on 2D Fourier transform with 16x16 blocks.
341 * Supports kernels up to 16x16.
342 * - nnp_convolution_algorithm_wt8x8 -- tiled convolution based on 2D Winograd transform F(3x3, 6x6).
343 * Supports only 3x3 kernels.
344 *
345 * @param transform_strategy A strategy that guides computation of kernel transforms coefficients.
346 * Possible values are:
347 *
348 * - nnp_convolution_transform_strategy_block_based -- do multiplication-accumulations on blocks of transformed
349 * coefficients.
350 * - nnp_convolution_transform_strategy_tuple_based -- do multiplication-accumulations on tuples of transformed
351 * coefficients.
352 *
353 * @param input_channels The number of channels (AKA features, dimensions) in the input image.
354 * @param output_channels The number of channels (AKA features, dimensions) in the output image.
355 * @param input_size Size of input image, excluding implicit zero-padding.
356 * @param input_padding Implicit zero-padding of input image.
357 * @param kernel_size Kernel size.
358 * @param output_subsampling Subsample region for output, also known as convolution stride.
359 * @param[in] input A 3D tensor input[input_channels][input_size.height][input_size.width].
360 * @param[in] kernel A 4D tensor kernel[output_channels][input_channels][kernel_size.height][kernel_size.width].
361 * @param[in] bias A 1D array bias[output_channels].
362 * @param[out] output A 3D tensor output[output_channels][output_size.height][output_size.width] where
363 * output_size.height = (input_padding.top + input_size.height + input_padding.bottom) -
364 * (kernel_size.height - 1)
365 * output_size.width = (input_padding.left + input_size.width + input_padding.right) -
366 * (kernel_size.width - 1)
367 * @param[in] workspace_buffer Buffer for scratch memory used during computation. Buffer must be aligned on 64 bytes.
368 * If workspace_buffer is NULL and workspace_size is non-NULL, NNPACK would store the size
369 * of required workspace memory at the workspace_size location, and exit without
370 * computations.
371 * If workspace_buffer is NULL and workspace_size is NULL, NNPACK would allocate memory
372 * before and deallocate after this computation, potentially at significant runtime cost.
373 * @param[in,out] workspace_size Pointer to the size of workspace buffer.
374 * If workspace_buffer is NULL, NNPACK will write the size of required scratch memory to
375 * the location specified by this pointer.
376 * If workspace_buffer is non-NULL, NNPACK expects workspace_size to specify the size of
377 * the buffer, in bytes.
378 * If workspace_size is NULL, workspace_buffer must be NULL as well. In this case NNPACK
379 * would allocate memory before and deallocate after this computation, potentially at
380 * significant runtime cost.
381 * @param threadpool A thread pool for parallelization of the computation.
382 * If threadpool is NULL, the computation would run on the caller thread without parallelization.
383 * @param[out] profile An optional pointer to profiling structure.
384 * If provided, the structure would record time spent in different phases of the computation.
385 */
386enum nnp_status nnp_convolution_inference(
387 enum nnp_convolution_algorithm algorithm,
388 enum nnp_convolution_transform_strategy transform_strategy,
389 size_t input_channels,
390 size_t output_channels,
391 struct nnp_size input_size,
392 struct nnp_padding input_padding,
393 struct nnp_size kernel_size,
394 struct nnp_size output_subsampling,
395 const float* input,
396 const float* kernel,
397 const float* bias,
398 float* output,
399 void* workspace_buffer,
400 size_t* workspace_size,
401 enum nnp_activation activation,
402 const void* activation_parameters,
403 pthreadpool_t threadpool,
404 struct nnp_profile* profile);
405
406/**
407 * @brief Computes output of a fully connected layer from input and kernel matrices.
408 * @details This function targets training of convolutional neural networks and performs forward propagation.
409 * It is optimized for moderate minibatch sizes (64-128) and can be inefficient on a small minibatch.
410 * For minibatch size 1, use nnp_fully_connected_inference for optimal performance.
411 * @param batch_size The number of vectors on the input and output of the fully connected layer.
412 * @param input_channels The number of channels (AKA features, dimensions) in the input matrix.
413 * @param output_channels The number of channels (AKA features, dimensions) in the output matrix.
414 * @param[in] input A 2D matrix input[batch_size][input_channels].
415 * @param[in] kernel A 2D matrix kernel[output_channels][input_channels].
416 * @param[out] output A 2D matrix output[batch_size][output_channels].
417 * @param threadpool A thread pool for parallelization of the computation.
418 * If threadpool is NULL, the computation would run on the caller thread without parallelization.
419 */
420enum nnp_status nnp_fully_connected_output(
421 size_t batch_size,
422 size_t input_channels,
423 size_t output_channels,
424 const float input[],
425 const float kernel[],
426 float output[],
427 pthreadpool_t threadpool,
428 struct nnp_profile* profile);
429
430/**
431 * @brief Computes output of a fully connected layer for a single input vector and a kernel matrix.
432 * @details This function targets prediction with convolutional neural networks and performs forward propagation.
433 * @param input_channels The number of channels (AKA features, dimensions) in the input vector.
434 * @param output_channels The number of channels (AKA features, dimensions) in the output vector.
435 * @param[in] input A 1D array input[input_channels] of FP32 elements.
436 * @param[in] kernel A 2D matrix kernel[output_channels][input_channels] of FP32 elements.
437 * @param[out] output A 1D array output[output_channels] of FP32 elements.
438 * @param threadpool A thread pool for parallelization of the computation.
439 * If threadpool is NULL, the computation would run on the caller thread without parallelization.
440 */
441enum nnp_status nnp_fully_connected_inference(
442 size_t input_channels,
443 size_t output_channels,
444 const float* input,
445 const float* kernel,
446 float* output,
447 pthreadpool_t threadpool);
448
449/**
450 * @brief Computes output of a fully connected layer for a single input vector and a kernel matrix.
451 * @details This function targets prediction with convolutional neural networks and performs forward propagation.
452 * @param input_channels The number of channels (AKA features, dimensions) in the input vector.
453 * @param output_channels The number of channels (AKA features, dimensions) in the output vector.
454 * @param[in] input A 1D array input[input_channels] of FP32 elements.
455 * @param[in] kernel A 2D matrix kernel[output_channels][input_channels] of FP16 (ARM alternative format) elements.
456 * @param[out] output A 1D array output[output_channels] of FP32 elements.
457 * @param threadpool A thread pool for parallelization of the computation.
458 * If threadpool is NULL, the computation would run on the caller thread without parallelization.
459 */
460enum nnp_status nnp_fully_connected_inference_f16f32(
461 size_t input_channels,
462 size_t output_channels,
463 const float* input,
464 const void* kernel,
465 float* output,
466 pthreadpool_t threadpool);
467
468/**
469 * @brief Computes output of a max-pooling layer for an input tensor.
470 * @details This function targets both prediction and training of convolutional neural networks and performs forward
471 * propagation. Is is optimized for both large and small minibatch sizes.
472 * @param batch_size The number of images on the input and output of the max-pooling layer.
473 * @param channels The number of channels (AKA features, dimensions) in both input and output images.
474 * @param input_size Size of input images, excluding implicit zero-padding.
475 * @param input_padding Implicit padding of input images. The padding pixels are ignored by the pooling filter, but
476 * affect the output size.
477 * @param pooling_size Size of the pooling filter. Only 2x2 filter are currently supported.
478 * @param pooling_stride Stride of the pooling filter. Only 2x2 strides are currently supported.
479 * @param[in] input A 4D tensor input[batch_size][channels][input_size.height][input_size.width].
480 * @param[out] output A 4D tensor output[batch_size][channels][output_size.height][output_size.width] where
481 * output_size.height = ceil(
482 * (input_padding.top + input_size.height + input_padding.bottom - pooling_size.height) /
483 * pooling_stride.height) + 1
484 * output_size.width = ceil(
485 * (input_padding.left + input_size.width + input_padding.right - pooling_size.width) /
486 * pooling_stride.width) + 1
487 * @param threadpool A thread pool for parallelization of the computation.
488 * If threadpool is NULL, the computation would run on the caller thread without parallelization.
489 */
490enum nnp_status nnp_max_pooling_output(
491 size_t batch_size,
492 size_t channels,
493 struct nnp_size input_size,
494 struct nnp_padding input_padding,
495 struct nnp_size pooling_size,
496 struct nnp_size pooling_stride,
497 const float input[],
498 float output[],
499 pthreadpool_t threadpool);
500
501/**
502 * @brief Computes output of a softmax layer for an input matrix.
503 * @details This function targets both prediction and training of convolutional neural networks and performs forward
504 * propagation. Is is optimized for both large and small minibatch sizes.
505 * @param batch_size The number of vectors on the input and output of the softmax layer.
506 * @param channels The number of channels (AKA features, dimensions) in both input and output vectors.
507 * @param[in] input A 2D matrix input[batch_size][channels].
508 * @param[out] output A 2D matrix output[batch_size][channels].
509 * @param threadpool A thread pool for parallelization of the computation.
510 * If threadpool is NULL, the computation would run on the caller thread without parallelization.
511 */
512enum nnp_status nnp_softmax_output(
513 size_t batch_size,
514 size_t channels,
515 const float input[],
516 float output[],
517 pthreadpool_t threadpool);
518
519/**
520 * @brief Computes output of a rectified linear unit (ReLU) layer for an input matrix.
521 * @details This function targets both prediction and training of convolutional neural networks and performs forward
522 * propagation. Is is optimized for both large and small minibatch sizes.
523 * @param batch_size The number of vectors on the input and output of the ReLU layer.
524 * @param channels The number of channels (AKA features, dimensions) in both input and output matrices.
525 * @param[in] input A 2D matrix input[batch_size][channels].
526 * @param[out] output A 2D matrix output[batch_size][channels].
527 * @param threadpool A thread pool for parallelization of the computation.
528 * If threadpool is NULL, the computation would run on the caller thread without parallelization.
529 */
530enum nnp_status nnp_relu_output(
531 size_t batch_size,
532 size_t channels,
533 const float input[],
534 float output[],
535 float negative_slope,
536 pthreadpool_t threadpool);
537
538/**
539 * @brief Computes gradient of input of a rectified linear unit (ReLU) layer from gradient of output and input matrices.
540 * @details This function targets training of convolutional neural networks and performs backward propagation.
541 * Is is optimized for both large and small minibatch sizes.
542 * @param batch_size The number of vectors on the input and output of the ReLU layer.
543 * @param channels The number of channels (AKA features, dimensions) in both input and output matrices.
544 * @param[in] input A 2D matrix input[batch_size][channels].
545 * @param[out] output A 2D matrix output[batch_size][channels].
546 * @param threadpool A thread pool for parallelization of the computation.
547 * If threadpool is NULL, the computation would run on the caller thread without parallelization.
548 */
549enum nnp_status nnp_relu_input_gradient(
550 size_t batch_size,
551 size_t channels,
552 const float grad_output[],
553 const float input[],
554 float grad_input[],
555 float negative_slope,
556 pthreadpool_t threadpool);
557
558#ifdef __cplusplus
559} /* extern "C" */
560#endif
561
562#ifdef __cplusplus
563// Backward compatible implementations for nnp_convolution_*, if we are in C++
564// mode.
565inline enum nnp_status nnp_convolution_output(
566 enum nnp_convolution_algorithm algorithm,
567 size_t batch_size,
568 size_t input_channels,
569 size_t output_channels,
570 struct nnp_size input_size,
571 struct nnp_padding input_padding,
572 struct nnp_size kernel_size,
573 const float input[],
574 const float kernel[],
575 const float bias[],
576 float output[],
577 pthreadpool_t threadpool,
578 struct nnp_profile* profile)
579{
580 return nnp_convolution_output(
581 algorithm,
582 batch_size, input_channels, output_channels,
583 input_size, input_padding, kernel_size,
584 input, kernel, bias, output,
585 NULL, NULL,
586 nnp_activation_identity, NULL, threadpool, profile);
587}
588
589inline enum nnp_status nnp_convolution_input_gradient(
590 enum nnp_convolution_algorithm algorithm,
591 size_t batch_size,
592 size_t input_channels,
593 size_t output_channels,
594 struct nnp_size input_size,
595 struct nnp_padding input_padding,
596 struct nnp_size kernel_size,
597 const float grad_output[],
598 const float kernel[],
599 float grad_input[],
600 pthreadpool_t threadpool,
601 struct nnp_profile* profile)
602{
603 return nnp_convolution_input_gradient(
604 algorithm,
605 batch_size, input_channels, output_channels,
606 input_size, input_padding, kernel_size,
607 grad_output, kernel, grad_input,
608 NULL, NULL,
609 nnp_activation_identity, NULL, threadpool, profile);
610}
611
612inline enum nnp_status nnp_convolution_kernel_gradient(
613 enum nnp_convolution_algorithm algorithm,
614 size_t batch_size,
615 size_t input_channels,
616 size_t output_channels,
617 struct nnp_size input_size,
618 struct nnp_padding input_padding,
619 struct nnp_size kernel_size,
620 const float input[],
621 const float grad_output[],
622 float grad_kernel[],
623 pthreadpool_t threadpool,
624 struct nnp_profile* profile)
625{
626 return nnp_convolution_kernel_gradient(
627 algorithm,
628 batch_size, input_channels, output_channels,
629 input_size, input_padding, kernel_size,
630 input, grad_output, grad_kernel,
631 NULL, NULL,
632 nnp_activation_identity, NULL, threadpool, profile);
633}
634
635inline enum nnp_status nnp_convolution_inference(
636 enum nnp_convolution_algorithm algorithm,
637 enum nnp_convolution_transform_strategy transform_strategy,
638 size_t input_channels,
639 size_t output_channels,
640 struct nnp_size input_size,
641 struct nnp_padding input_padding,
642 struct nnp_size kernel_size,
643 struct nnp_size output_subsampling,
644 const float input[],
645 const float kernel[],
646 const float bias[],
647 float output[],
648 pthreadpool_t threadpool,
649 struct nnp_profile* profile) {
650 return nnp_convolution_inference(
651 algorithm, transform_strategy,
652 input_channels, output_channels,
653 input_size, input_padding, kernel_size, output_subsampling,
654 input, kernel, bias, output, NULL, NULL,
655 nnp_activation_identity, NULL,
656 threadpool, profile);
657}
658
659#endif // __cplusplus
660