1#pragma once
2
3#include <stddef.h>
4#include <stdint.h>
5#include <stdbool.h>
6
7#include <pthreadpool.h>
8
9#ifdef __cplusplus
10extern "C" {
11#endif
12
13/**
14 * @brief Status code for any NNPACK function call.
15 */
16enum nnp_status {
17 /** The call succeeded, and all output arguments now contain valid data. */
18 nnp_status_success = 0,
19 /** NNPACK function was called with batch_size == 0. */
20 nnp_status_invalid_batch_size = 2,
21 /** NNPACK function was called with channels == 0. */
22 nnp_status_invalid_channels = 3,
23 /** NNPACK function was called with input_channels == 0. */
24 nnp_status_invalid_input_channels = 4,
25 /** NNPACK function was called with output_channels == 0. */
26 nnp_status_invalid_output_channels = 5,
27 /** NNPACK function was called with input_size.height == 0 or input_size.width == 0 */
28 nnp_status_invalid_input_size = 10,
29 /** NNPACK function was called with input_stride.height == 0 or input_stride.width == 0 */
30 nnp_status_invalid_input_stride = 11,
31 /** NNPACK function was called with input_padding not less than respective kernel (or pooling) size, i.e.:
32 *
33 * - input_padding.left >= kernel_size.width (>= pooling_size.width)
34 * - input_padding.right >= kernel_size.width (>= pooling_size.width)
35 * - input_padding.top >= kernel_size.height (>= pooling_size.height)
36 * - input_padding.bottom >= kernel_size.height (>= pooling_size.height)
37 */
38 nnp_status_invalid_input_padding = 12,
39 /** NNPACK function was called with kernel_size.height == 0 or kernel_size.width == 0 */
40 nnp_status_invalid_kernel_size = 13,
41 /** NNPACK function was called with pooling_size.height == 0 or pooling_size.width == 0 */
42 nnp_status_invalid_pooling_size = 14,
43 /** NNPACK function was called with pooling_stride.height == 0 or pooling_stride.width == 0 */
44 nnp_status_invalid_pooling_stride = 15,
45 /** NNPACK function was called with convolution algorithm not in nnp_convolution_algorithm enumeration */
46 nnp_status_invalid_algorithm = 16,
47 /** NNPACK function was called with convolution transform strategy not in nnp_convolution_transform_strategy enum */
48 nnp_status_invalid_transform_strategy = 17,
49 /** NNPACK function was called with output_subsampling.height == 0 or output_subsampling.width == 0 */
50 nnp_status_invalid_output_subsampling = 13,
51 /** NNPACK function was called with activation not in nnp_activation enum */
52 nnp_status_invalid_activation = 14,
53 /** NNPACK function was called with invalid activation parameters */
54 nnp_status_invalid_activation_parameters = 15,
55
56 /** NNPACK does not support the particular input size for the function */
57 nnp_status_unsupported_input_size = 20,
58 /** NNPACK does not support the particular input stride for the function */
59 nnp_status_unsupported_input_stride = 21,
60 /** NNPACK does not support the particular input padding for the function */
61 nnp_status_unsupported_input_padding = 22,
62 /** NNPACK does not support the particular kernel size for the function */
63 nnp_status_unsupported_kernel_size = 23,
64 /** NNPACK does not support the particular pooling size for the function */
65 nnp_status_unsupported_pooling_size = 24,
66 /** NNPACK does not support the particular pooling stride for the function */
67 nnp_status_unsupported_pooling_stride = 25,
68 /** NNPACK does not support the particular convolution algorithm for the function */
69 nnp_status_unsupported_algorithm = 26,
70 /** NNPACK does not support the particular convolution transform strategy for the algorithm */
71 nnp_status_unsupported_transform_strategy = 27,
72 /** NNPACK does not support the particular activation function for the function */
73 nnp_status_unsupported_activation = 28,
74 /** NNPACK does not support the particular activation function parameters for the function */
75 nnp_status_unsupported_activation_parameters = 29,
76
77 /** NNPACK function was called before the library was initialized */
78 nnp_status_uninitialized = 50,
79 /** NNPACK does not implement this function for the host CPU */
80 nnp_status_unsupported_hardware = 51,
81 /** NNPACK failed to allocate memory for temporary buffers */
82 nnp_status_out_of_memory = 52,
83 /** Scratch space buffer is too small */
84 nnp_status_insufficient_buffer = 53,
85 /** Scratch space buffer is not properly aligned */
86 nnp_status_misaligned_buffer = 54
87};
88
89/**
90 * @brief Activation applied applied after a convolutional or fully-connected layer.
91 */
92enum nnp_activation {
93 /** Identity activation f(x) := x, i.e. no transformation */
94 nnp_activation_identity = 0,
95 /** ReLU activation f(x) := max(0, x) */
96 nnp_activation_relu = 1,
97};
98
99/**
100 * @brief Algorithm for computing convolutional layers.
101 */
102enum nnp_convolution_algorithm {
103 /** Let NNPACK choose the algorithm depending on layer parameters */
104 nnp_convolution_algorithm_auto = 0,
105 /** Tiled convolution based on 2D Fourier transform with 8x8 blocks. Supports kernels up to 8x8. */
106 nnp_convolution_algorithm_ft8x8 = 1,
107 /** Tiled convolution based on 2D Fourier transform with 16x16 blocks. Supports kernels up to 16x16. */
108 nnp_convolution_algorithm_ft16x16 = 2,
109 /** Tiled convolution based on 2D Winograd transform F(3x3, 6x6) with 8x8 blocks. Supports only 3x3 kernels. */
110 nnp_convolution_algorithm_wt8x8 = 3,
111 /** Direct convolution via implicit GEMM. */
112 nnp_convolution_algorithm_implicit_gemm = 4,
113 /** Direct convolution implementation. */
114 nnp_convolution_algorithm_direct = 5,
115 /**
116 * Tiled convolution based on 2D Winograd transform F(3x3, 6x6) with 8x8 blocks in FP16.
117 * Supports only 3x3 kernels. Implemented only for new ARM processors (with NEON-HP),
118 * on non-supported processors falls back to nnp_convolution_algorithm_wt8x8.
119 */
120 nnp_convolution_algorithm_wt8x8_fp16 = 6,
121};
122
/**
 * @brief Strategy for handling transformed kernel coefficients in convolution functions.
 *
 * NOTE(review): enumerator semantics inferred from names — confirm against the implementation:
 * compute presumably transforms kernels during the call, precompute presumably only produces
 * transformed coefficients for later calls, and reuse presumably consumes previously
 * precomputed coefficients.
 */
enum nnp_convolution_transform_strategy {
	nnp_convolution_transform_strategy_compute = 1,
	nnp_convolution_transform_strategy_precompute = 2,
	nnp_convolution_transform_strategy_reuse = 3
};

/* For backward compatibility: older releases exposed block_based/tuple_based strategies,
 * which now both map to the compute strategy. */
#define nnp_convolution_transform_strategy_block_based nnp_convolution_transform_strategy_compute
#define nnp_convolution_transform_strategy_tuple_based nnp_convolution_transform_strategy_compute
132
133/**
134 * @brief Size of images, kernels, and pooling filters in NNPACK.
135 */
136struct nnp_size {
137 /** Width (horizontal size) of an image, kernel, or pooling filter. */
138 size_t width;
139 /** Height (vertical size) of an image, kernel, or pooling filter. */
140 size_t height;
141};
142
143/**
144 * @brief Padding of images in NNPACK.
145 */
146struct nnp_padding {
147 /** Padding above the image data */
148 size_t top;
149 /** Padding on the right of image data */
150 size_t right;
151 /** Padding below the image data */
152 size_t bottom;
153 /** Padding on the left of image data */
154 size_t left;
155};
156
157/**
158 * @brief Profiling information about time spent in different phases of a function call.
159 */
160struct nnp_profile {
161 /** Time spent inside the function call, in seconds. */
162 double total;
163 /** Time spend on transformation of the input or input gradient tensor, in seconds. */
164 double input_transform;
165 /** Time spend on transformation of the kernel or kernel gradient tensor, in seconds. */
166 double kernel_transform;
167 /** Time spend on transformation of the output or output gradient tensor, in seconds. */
168 double output_transform;
169 /** Time spend on multiplication-accumulation of transformed coefficients, in seconds. */
170 double block_multiplication;
171};
172
/**
 * @brief Initializes the NNPACK library.
 * @details Must be called before any other NNPACK function; otherwise those functions
 *          report nnp_status_uninitialized (see enum nnp_status).
 * @return A status code from enum nnp_status (nnp_status_success on success).
 */
enum nnp_status nnp_initialize(void);

/**
 * @brief Deinitializes the NNPACK library, presumably releasing resources acquired by
 *        nnp_initialize (NOTE(review): confirm against implementation).
 * @return A status code from enum nnp_status.
 */
enum nnp_status nnp_deinitialize(void);
176
177/**
178 * @brief Computes output of a 2D convolutional layer from input and kernel tensors.
179 * @details This function targets training of convolutional neural networks and performs forward propagation.
180 * It is optimized for moderate minibatch sizes (64-128) and can be inefficient on a small minibatch.
181 * For minibatch size 1, use nnp_convolution_inference for optimal performance.
182 * @param algorithm The type of algorithm to use for convolution. Possible values are:
183 *
184 * - nnp_convolution_algorithm_auto -- let the function choose the algorithm.
185 * - nnp_convolution_algorithm_ft8x8 -- tiled convolution based on 2D Fourier transform with 8x8 blocks.
186 * Supports kernels up to 8x8.
187 * - nnp_convolution_algorithm_ft16x16 -- tiled convolution based on 2D Fourier transform with 16x16 blocks.
188 * Supports kernels up to 16x16.
189 * - nnp_convolution_algorithm_wt8x8 -- tiled convolution based on 2D Winograd transform F(3x3, 6x6).
190 * Supports only 3x3 kernels.
191 *
192 * @param batch_size The number of images on the input and output of the convolutional layer.
193 * @param input_channels The number of channels (AKA features, dimensions) in the input images.
194 * @param output_channels The number of channels (AKA features, dimensions) in the output images.
195 * @param input_size Size of input images, excluding implicit zero-padding.
196 * @param input_padding Implicit zero-padding of input images.
197 * @param kernel_size Kernel size.
198 * @param[in] input A 4D tensor input[batch_size][input_channels][input_size.height][input_size.width].
199 * @param[in] kernel A 4D tensor kernel[output_channels][input_channels][kernel_size.height][kernel_size.width].
200 * @param[in] bias A 1D array bias[output_channels].
201 * @param[out] output A 4D tensor output[batch_size][output_channels][output_size.height][output_size.width] where
202 * output_size.height = (input_padding.top + input_size.height + input_padding.bottom) -
203 * (kernel_size.height - 1)
204 * output_size.width = (input_padding.left + input_size.width + input_padding.right) -
205 * (kernel_size.width - 1)
206 * @param threadpool A thread pool for parallelization of the computation.
207 * If threadpool is NULL, the computation would run on the caller thread without parallelization.
208 * @param[out] profile An optional pointer to profiling structure.
209 * If provided, the structure would record time spent in different phases of the computation.
210 */
211
212enum nnp_status nnp_convolution_output(
213 enum nnp_convolution_algorithm algorithm,
214 size_t batch_size,
215 size_t input_channels,
216 size_t output_channels,
217 struct nnp_size input_size,
218 struct nnp_padding input_padding,
219 struct nnp_size kernel_size,
220 const float* input,
221 const float* kernel,
222 const float* bias,
223 float* output,
224 void* workspace_buffer,
225 size_t* workspace_size,
226 enum nnp_activation activation,
227 const void* activation_parameters,
228 pthreadpool_t threadpool,
229 struct nnp_profile* profile);
230
231/**
232 * @brief Computes gradient of input of a 2D convolutional layer from gradient of output and kernel tensors.
233 * @details This function targets training of convolutional neural networks and performs backward propagation.
234 * It is optimized for moderate minibatch sizes (64-128) and can be inefficient on a small minibatch.
235 * @param algorithm The type of algorithm to use for convolution. Possible values are:
236 *
237 * - nnp_convolution_algorithm_auto -- let the function choose the algorithm.
238 * - nnp_convolution_algorithm_ft8x8 -- tiled convolution based on 2D Fourier transform with 8x8 blocks.
239 * Supports kernels up to 8x8.
240 * - nnp_convolution_algorithm_ft16x16 -- tiled convolution based on 2D Fourier transform with 16x16 blocks.
241 * Supports kernels up to 16x16.
242 * - nnp_convolution_algorithm_wt8x8 -- tiled convolution based on 2D Winograd transform F(3x3, 6x6).
243 * Supports only 3x3 kernels.
244 *
245 * @param batch_size The number of images (and their gradients) on the input and output of the convolutional layer.
246 * @param input_channels The number of channels (AKA features, dimensions) in the input images (and gradients).
247 * @param output_channels The number of channels (AKA features, dimensions) in the output images (and gradients).
248 * @param input_size Size of input images and their gradients, excluding implicit zero-padding.
249 * @param input_padding Implicit zero-padding of input images.
250 * @param kernel_size Kernel size.
251 * @param[in] grad_output A 4D tensor grad_output[batch_size][output_channels][output_size.height][output_size.width]
252 * where
253 * output_size.height = (input_padding.top + input_size.height + input_padding.bottom) -
254 * (kernel_size.height - 1)
255 * output_size.width = (input_padding.left + input_size.width + input_padding.right) -
256 * (kernel_size.width - 1)
257 * @param[in] kernel A 4D tensor kernel[output_channels][input_channels][kernel_size.height][kernel_size.width].
258 * @param[out] grad_input A 4D tensor grad_input[batch_size][input_channels][input_size.height][input_size.width].
259 * @param threadpool A thread pool for parallelization of the computation.
260 * If threadpool is NULL, the computation would run on the caller thread without parallelization.
261 * @param[out] profile An optional pointer to profiling structure.
262 * If provided, the structure would record time spent in different phases of the computation.
263 */
264enum nnp_status nnp_convolution_input_gradient(
265 enum nnp_convolution_algorithm algorithm,
266 size_t batch_size,
267 size_t input_channels,
268 size_t output_channels,
269 struct nnp_size input_size,
270 struct nnp_padding input_padding,
271 struct nnp_size kernel_size,
272 const float* grad_output,
273 const float* kernel,
274 float* grad_input,
275 void* workspace_buffer,
276 size_t* workspace_size,
277 enum nnp_activation activation,
278 const void* activation_parameters,
279 pthreadpool_t threadpool,
280 struct nnp_profile* profile);
281
282/**
283 * @brief Computes gradient of kernel of a 2D convolutional layer from gradient of output and input tensors.
284 * @details This function targets training of convolutional neural networks and performs backward propagation.
285 * It is optimized for moderate minibatch sizes (64-128) and can be inefficient on a small minibatch.
286 * @param algorithm The type of algorithm to use for convolution. Possible values are:
287 *
288 * - nnp_convolution_algorithm_auto -- let the function choose the algorithm.
289 * - nnp_convolution_algorithm_ft8x8 -- tiled convolution based on 2D Fourier transform with 8x8 blocks.
290 * Supports kernels up to 8x8.
291 * - nnp_convolution_algorithm_ft16x16 -- tiled convolution based on 2D Fourier transform with 16x16 blocks.
292 * Supports kernels up to 16x16.
293 *
294 * @param batch_size The number of images (and their gradients) on the input and output of the convolutional layer.
295 * @param input_channels The number of channels (AKA features, dimensions) in the input images.
296 * @param output_channels The number of channels (AKA features, dimensions) in the output images (and gradients).
297 * @param input_size Size of input images and their gradients, excluding implicit zero-padding.
298 * @param input_padding Implicit zero-padding of input images.
299 * @param kernel_size Kernel size.
300 * @param[in] input A 4D tensor input[batch_size][input_channels][input_size.height][input_size.width].
301 * @param[in] grad_output A 4D tensor grad_output[batch_size][output_channels][output_size.height][output_size.width]
302 * where
303 * output_size.height = (input_padding.top + input_size.height + input_padding.bottom) -
304 * (kernel_size.height - 1)
305 * output_size.width = (input_padding.left + input_size.width + input_padding.right) -
306 * (kernel_size.width - 1)
307 * @param[out] grad_kernel A 4D tensor
308 * grad_kernel[output_channels][input_channels][kernel_size.height][kernel_size.width].
309 * @param threadpool A thread pool for parallelization of the computation.
310 * If threadpool is NULL, the computation would run on the caller thread without parallelization.
311 * @param[out] profile An optional pointer to profiling structure.
312 * If provided, the structure would record time spent in different phases of the computation.
313 */
314enum nnp_status nnp_convolution_kernel_gradient(
315 enum nnp_convolution_algorithm algorithm,
316 size_t batch_size,
317 size_t input_channels,
318 size_t output_channels,
319 struct nnp_size input_size,
320 struct nnp_padding input_padding,
321 struct nnp_size kernel_size,
322 const float* input,
323 const float* grad_output,
324 float* grad_kernel,
325 void* workspace_buffer,
326 size_t* workspace_size,
327 enum nnp_activation activation,
328 const void* activation_parameters,
329 pthreadpool_t threadpool,
330 struct nnp_profile* profile);
331
332/**
333 * @brief Computes output of a 2D convolutional layer for a single input image and a kernel tensor.
334 * @details This function targets prediction with convolutional neural networks and performs forward propagation.
335 * @param algorithm The type of algorithm to use for convolution. Possible values are:
336 *
337 * - nnp_convolution_algorithm_auto -- let the function choose the algorithm.
338 * - nnp_convolution_algorithm_ft8x8 -- tiled convolution based on 2D Fourier transform with 8x8 blocks.
339 * Supports kernels up to 8x8.
340 * - nnp_convolution_algorithm_ft16x16 -- tiled convolution based on 2D Fourier transform with 16x16 blocks.
341 * Supports kernels up to 16x16.
342 * - nnp_convolution_algorithm_wt8x8 -- tiled convolution based on 2D Winograd transform F(3x3, 6x6).
343 * Supports only 3x3 kernels.
344 *
345 * @param transform_strategy A strategy that guides computation of kernel transforms coefficients.
346 * Possible values are:
347 *
348 * - nnp_convolution_transform_strategy_block_based -- do multiplication-accumulations on blocks of transformed
349 * coefficients.
350 * - nnp_convolution_transform_strategy_tuple_based -- do multiplication-accumulations on tuples of transformed
351 * coefficients.
352 *
353 * @param input_channels The number of channels (AKA features, dimensions) in the input image.
354 * @param output_channels The number of channels (AKA features, dimensions) in the output image.
355 * @param input_size Size of input image, excluding implicit zero-padding.
356 * @param input_padding Implicit zero-padding of input image.
357 * @param kernel_size Kernel size.
358 * @param output_subsampling Subsample region for output, also known as convolution stride.
359 * @param[in] input A 3D tensor input[input_channels][input_size.height][input_size.width].
360 * @param[in] kernel A 4D tensor kernel[output_channels][input_channels][kernel_size.height][kernel_size.width].
361 * @param[in] bias A 1D array bias[output_channels].
362 * @param[out] output A 3D tensor output[output_channels][output_size.height][output_size.width] where
363 * output_size.height = (input_padding.top + input_size.height + input_padding.bottom) -
364 * (kernel_size.height - 1)
365 * output_size.width = (input_padding.left + input_size.width + input_padding.right) -
366 * (kernel_size.width - 1)
367 * @param[in] workspace_buffer Buffer for scratch memory used during computation. Buffer must be aligned on 64 bytes.
368 * If workspace_buffer is NULL and workspace_size is non-NULL, NNPACK would store the size
369 * of required workspace memory at the workspace_size location, and exit without
370 * computations.
371 * If workspace_buffer is NULL and workspace_size is NULL, NNPACK would allocate memory
372 * before and deallocate after this computation, potentially at significant runtime cost.
373 * @param[in,out] workspace_size Pointer to the size of workspace buffer.
374 * If workspace_buffer is NULL, NNPACK will write the size of required scratch memory to
375 * the location specified by this pointer.
376 * If workspace_buffer is non-NULL, NNPACK expects workspace_size to specify the size of
377 * the buffer, in bytes.
378 * If workspace_size is NULL, workspace_buffer must be NULL as well. In this case NNPACK
379 * would allocate memory before and deallocate after this computation, potentially at
380 * significant runtime cost.
381 * @param threadpool A thread pool for parallelization of the computation.
382 * If threadpool is NULL, the computation would run on the caller thread without parallelization.
383 * @param[out] profile An optional pointer to profiling structure.
384 * If provided, the structure would record time spent in different phases of the computation.
385 */
386enum nnp_status nnp_convolution_inference(
387 enum nnp_convolution_algorithm algorithm,
388 enum nnp_convolution_transform_strategy transform_strategy,
389 size_t input_channels,
390 size_t output_channels,
391 struct nnp_size input_size,
392 struct nnp_padding input_padding,
393 struct nnp_size kernel_size,
394 struct nnp_size output_subsampling,
395 const float* input,
396 const float* kernel,
397 const float* bias,
398 float* output,
399 void* workspace_buffer,
400 size_t* workspace_size,
401 enum nnp_activation activation,
402 const void* activation_parameters,
403 pthreadpool_t threadpool,
404 struct nnp_profile* profile);
405
406/**
407 * @brief Computes output of a fully connected layer from input and kernel matrices.
408 * @details This function targets training of convolutional neural networks and performs forward propagation.
409 * It is optimized for moderate minibatch sizes (64-128) and can be inefficient on a small minibatch.
410 * For minibatch size 1, use nnp_fully_connected_inference for optimal performance.
411 * @param batch_size The number of vectors on the input and output of the fully connected layer.
412 * @param input_channels The number of channels (AKA features, dimensions) in the input matrix.
413 * @param output_channels The number of channels (AKA features, dimensions) in the output matrix.
414 * @param[in] input A 2D matrix input[batch_size][input_channels].
415 * @param[in] kernel A 2D matrix kernel[output_channels][input_channels].
416 * @param[out] output A 2D matrix output[batch_size][output_channels].
417 * @param threadpool A thread pool for parallelization of the computation.
418 * If threadpool is NULL, the computation would run on the caller thread without parallelization.
419 */
420enum nnp_status nnp_fully_connected_output(
421 size_t batch_size,
422 size_t input_channels,
423 size_t output_channels,
424 const float input[],
425 const float kernel[],
426 float output[],
427 pthreadpool_t threadpool,
428 struct nnp_profile* profile);
429
430/**
431 * @brief Computes output of a fully connected layer for a single input vector and a kernel matrix.
432 * @details This function targets prediction with convolutional neural networks and performs forward propagation.
433 * @param input_channels The number of channels (AKA features, dimensions) in the input vector.
434 * @param output_channels The number of channels (AKA features, dimensions) in the output vector.
435 * @param[in] input A 1D array input[input_channels] of FP32 elements.
436 * @param[in] kernel A 2D matrix kernel[output_channels][input_channels] of FP32 elements.
437 * @param[out] output A 1D array output[output_channels] of FP32 elements.
438 * @param threadpool A thread pool for parallelization of the computation.
439 * If threadpool is NULL, the computation would run on the caller thread without parallelization.
440 */
441enum nnp_status nnp_fully_connected_inference(
442 size_t input_channels,
443 size_t output_channels,
444 const float* input,
445 const float* kernel,
446 float* output,
447 pthreadpool_t threadpool);
448
449/**
450 * @brief Computes output of a fully connected layer for a single input vector and a kernel matrix.
451 * @details This function targets prediction with convolutional neural networks and performs forward propagation.
452 * @param input_channels The number of channels (AKA features, dimensions) in the input vector.
453 * @param output_channels The number of channels (AKA features, dimensions) in the output vector.
454 * @param[in] input A 1D array input[input_channels] of FP32 elements.
455 * @param[in] kernel A 2D matrix kernel[output_channels][input_channels] of FP16 (ARM alternative format) elements.
456 * @param[out] output A 1D array output[output_channels] of FP32 elements.
457 * @param threadpool A thread pool for parallelization of the computation.
458 * If threadpool is NULL, the computation would run on the caller thread without parallelization.
459 */
460enum nnp_status nnp_fully_connected_inference_f16f32(
461 size_t input_channels,
462 size_t output_channels,
463 const float* input,
464 const void* kernel,
465 float* output,
466 pthreadpool_t threadpool);
467
468/**
469 * @brief Computes output of a max-pooling layer for an input tensor.
470 * @details This function targets both prediction and training of convolutional neural networks and performs forward
471 * propagation. Is is optimized for both large and small minibatch sizes.
472 * @param batch_size The number of images on the input and output of the max-pooling layer.
473 * @param channels The number of channels (AKA features, dimensions) in both input and output images.
474 * @param input_size Size of input images, excluding implicit zero-padding.
475 * @param input_padding Implicit padding of input images. The padding pixels are ignored by the pooling filter, but
476 * affect the output size.
477 * @param pooling_size Size of the pooling filter. Only 2x2 filter are currently supported.
478 * @param pooling_stride Stride of the pooling filter. Only 2x2 strides are currently supported.
479 * @param[in] input A 4D tensor input[batch_size][channels][input_size.height][input_size.width].
480 * @param[out] output A 4D tensor output[batch_size][channels][output_size.height][output_size.width] where
481 * output_size.height = ceil(
482 * (input_padding.top + input_size.height + input_padding.bottom - pooling_size.height) /
483 * pooling_stride.height) + 1
484 * output_size.width = ceil(
485 * (input_padding.left + input_size.width + input_padding.right - pooling_size.width) /
486 * pooling_stride.width) + 1
487 * @param threadpool A thread pool for parallelization of the computation.
488 * If threadpool is NULL, the computation would run on the caller thread without parallelization.
489 */
490enum nnp_status nnp_max_pooling_output(
491 size_t batch_size,
492 size_t channels,
493 struct nnp_size input_size,
494 struct nnp_padding input_padding,
495 struct nnp_size pooling_size,
496 struct nnp_size pooling_stride,
497 const float input[],
498 float output[],
499 pthreadpool_t threadpool);
500
501/**
502 * @brief Computes output of a softmax layer for an input matrix.
503 * @details This function targets both prediction and training of convolutional neural networks and performs forward
504 * propagation. Is is optimized for both large and small minibatch sizes.
505 * @param batch_size The number of vectors on the input and output of the softmax layer.
506 * @param channels The number of channels (AKA features, dimensions) in both input and output vectors.
507 * @param[in] input A 2D matrix input[batch_size][channels].
508 * @param[out] output A 2D matrix output[batch_size][channels].
509 * @param threadpool A thread pool for parallelization of the computation.
510 * If threadpool is NULL, the computation would run on the caller thread without parallelization.
511 */
512enum nnp_status nnp_softmax_output(
513 size_t batch_size,
514 size_t channels,
515 const float input[],
516 float output[],
517 pthreadpool_t threadpool);
518
519/**
520 * @brief Computes output of a rectified linear unit (ReLU) layer for an input matrix.
521 * @details This function targets both prediction and training of convolutional neural networks and performs forward
522 * propagation. Is is optimized for both large and small minibatch sizes.
523 * @param batch_size The number of vectors on the input and output of the ReLU layer.
524 * @param channels The number of channels (AKA features, dimensions) in both input and output matrices.
525 * @param[in] input A 2D matrix input[batch_size][channels].
526 * @param[out] output A 2D matrix output[batch_size][channels].
527 * @param threadpool A thread pool for parallelization of the computation.
528 * If threadpool is NULL, the computation would run on the caller thread without parallelization.
529 */
530enum nnp_status nnp_relu_output(
531 size_t batch_size,
532 size_t channels,
533 const float input[],
534 float output[],
535 float negative_slope,
536 pthreadpool_t threadpool);
537
538/**
539 * @brief Computes gradient of input of a rectified linear unit (ReLU) layer from gradient of output and input matrices.
540 * @details This function targets training of convolutional neural networks and performs backward propagation.
541 * Is is optimized for both large and small minibatch sizes.
542 * @param batch_size The number of vectors on the input and output of the ReLU layer.
543 * @param channels The number of channels (AKA features, dimensions) in both input and output matrices.
544 * @param[in] input A 2D matrix input[batch_size][channels].
545 * @param[out] output A 2D matrix output[batch_size][channels].
546 * @param threadpool A thread pool for parallelization of the computation.
547 * If threadpool is NULL, the computation would run on the caller thread without parallelization.
548 */
549enum nnp_status nnp_relu_input_gradient(
550 size_t batch_size,
551 size_t channels,
552 const float grad_output[],
553 const float input[],
554 float grad_input[],
555 float negative_slope,
556 pthreadpool_t threadpool);
557
558#ifdef __cplusplus
559} /* extern "C" */
560#endif
561
562#ifdef __cplusplus
563// Backward compatible implementations for nnp_convolution_*, if we are in C++
564// mode.
565inline enum nnp_status nnp_convolution_output(
566 enum nnp_convolution_algorithm algorithm,
567 size_t batch_size,
568 size_t input_channels,
569 size_t output_channels,
570 struct nnp_size input_size,
571 struct nnp_padding input_padding,
572 struct nnp_size kernel_size,
573 const float input[],
574 const float kernel[],
575 const float bias[],
576 float output[],
577 pthreadpool_t threadpool,
578 struct nnp_profile* profile)
579{
580 return nnp_convolution_output(
581 algorithm,
582 batch_size, input_channels, output_channels,
583 input_size, input_padding, kernel_size,
584 input, kernel, bias, output,
585 NULL, NULL,
586 nnp_activation_identity, NULL, threadpool, profile);
587}
588
589inline enum nnp_status nnp_convolution_input_gradient(
590 enum nnp_convolution_algorithm algorithm,
591 size_t batch_size,
592 size_t input_channels,
593 size_t output_channels,
594 struct nnp_size input_size,
595 struct nnp_padding input_padding,
596 struct nnp_size kernel_size,
597 const float grad_output[],
598 const float kernel[],
599 float grad_input[],
600 pthreadpool_t threadpool,
601 struct nnp_profile* profile)
602{
603 return nnp_convolution_input_gradient(
604 algorithm,
605 batch_size, input_channels, output_channels,
606 input_size, input_padding, kernel_size,
607 grad_output, kernel, grad_input,
608 NULL, NULL,
609 nnp_activation_identity, NULL, threadpool, profile);
610}
611
612inline enum nnp_status nnp_convolution_kernel_gradient(
613 enum nnp_convolution_algorithm algorithm,
614 size_t batch_size,
615 size_t input_channels,
616 size_t output_channels,
617 struct nnp_size input_size,
618 struct nnp_padding input_padding,
619 struct nnp_size kernel_size,
620 const float input[],
621 const float grad_output[],
622 float grad_kernel[],
623 pthreadpool_t threadpool,
624 struct nnp_profile* profile)
625{
626 return nnp_convolution_kernel_gradient(
627 algorithm,
628 batch_size, input_channels, output_channels,
629 input_size, input_padding, kernel_size,
630 input, grad_output, grad_kernel,
631 NULL, NULL,
632 nnp_activation_identity, NULL, threadpool, profile);
633}
634
635inline enum nnp_status nnp_convolution_inference(
636 enum nnp_convolution_algorithm algorithm,
637 enum nnp_convolution_transform_strategy transform_strategy,
638 size_t input_channels,
639 size_t output_channels,
640 struct nnp_size input_size,
641 struct nnp_padding input_padding,
642 struct nnp_size kernel_size,
643 struct nnp_size output_subsampling,
644 const float input[],
645 const float kernel[],
646 const float bias[],
647 float output[],
648 pthreadpool_t threadpool,
649 struct nnp_profile* profile) {
650 return nnp_convolution_inference(
651 algorithm, transform_strategy,
652 input_channels, output_channels,
653 input_size, input_padding, kernel_size, output_subsampling,
654 input, kernel, bias, output, NULL, NULL,
655 nnp_activation_identity, NULL,
656 threadpool, profile);
657}
658
659#endif // __cplusplus
660