xnnpack.h source code [pytorch/third_party/XNNPACK/include/xnnpack.h]

1	// Copyright (c) Facebook, Inc. and its affiliates.
2	// All rights reserved.
3	//
4	// Copyright 2019 Google LLC
5	//
6	// This source code is licensed under the BSD-style license found in the
7	// LICENSE file in the root directory of this source tree.
8
9	#pragma once
10
11	#include <stdbool.h>
12	#include <stddef.h>
13	#include <stdint.h>
14
15	#include <pthreadpool.h>
16
17	#ifdef __cplusplus
18	extern "C" {
19	#endif
20
21	/// The number of bytes XNNPACK may read beyond array bounds.
22	/// The caller must allocate at least this many extra bytes after the tensor data passed to XNNPACK.
23	///
24	/// Note: XNNPACK reads, but never writes beyond array bounds.
25	#define XNN_EXTRA_BYTES 16
26
27	/// Maximum number of dimensions in tensor shape.
28	#define XNN_MAX_TENSOR_DIMS 6
29
30	/// Allow sparse inference in a Runtime.
31	///
32	/// Note: this flag hints XNNPACK to consider sparse inference, but does not guarantee it.
33	#define XNN_FLAG_SPARSE_INFERENCE 0x00000001
34	#define XNN_FLAG_HINT_SPARSE_INFERENCE XNN_FLAG_SPARSE_INFERENCE
35
36	/// Allow IEEE FP16 inference in a Runtime.
37	///
38	/// Note: this flag hints XNNPACK to consider IEEE FP16 inference, but does not guarantee it.
39	#define XNN_FLAG_FP16_INFERENCE 0x00000002
40	#define XNN_FLAG_HINT_FP16_INFERENCE XNN_FLAG_FP16_INFERENCE
41
42	/// Force IEEE FP16 inference in a Runtime, and fail if FP16 inference is not possible.
43	///
44	/// Note: this flag guarantees that XNNPACK will use IEEE FP16 inference, or fail to create the Runtime object.
45	/// Warning: on x86 systems FP16 computations will be emulated at a substantial performance cost.
46	#define XNN_FLAG_FORCE_FP16_INFERENCE 0x00000004
47
48	/// Enable timing of each operator's runtime.
49	#define XNN_FLAG_BASIC_PROFILING 0x00000008
50
51	/// The convolution operator represents a depthwise convolution, and uses HWGo layout for filters.
52	#define XNN_FLAG_DEPTHWISE_CONVOLUTION 0x00000001
53
54	/// Assume transposed weights in a fully connected operator.
55	#define XNN_FLAG_TRANSPOSE_WEIGHTS 0x00000001
56
57	/// The operator assumes NHWC layout for the input, regardless of the output layout.
58	#define XNN_FLAG_INPUT_NHWC 0x00000002
59
60	/// Match "SAME" padding in TensorFlow. Exact padding values are computed dynamically depending on input size.
61	#define XNN_FLAG_TENSORFLOW_SAME_PADDING 0x00000004
62
63	/// Implicitly flatten and reshape input of a Fully Connected operator into a 2D tensor.
64	#define XNN_FLAG_TENSORFLOW_RESHAPE_2D 0x00000004
65
66	/// Match behaviour of TensorFlow 1.x.
67	#define XNN_FLAG_TENSORFLOW_LEGACY_MODE 0x00000004
68
69	/// Static weights of the FP16 operator are in FP32 format.
70	#define XNN_FLAG_FP32_STATIC_WEIGHTS 0x00000008
71
72	/// Align corners of input and output images in resize operations.
73	#define XNN_FLAG_ALIGN_CORNERS 0x00000008
74
75	/// Yield worker threads of the thread pool to the system scheduler after the inference.
76	#define XNN_FLAG_YIELD_WORKERS 0x00000010
77
/// Status code for any XNNPACK function call.
enum xnn_status {
  /// The call succeeded, and all output arguments now contain valid data.
  xnn_status_success = 0,
  /// NOTE(review): presumably XNNPACK was not successfully initialized (see xnn_initialize) before the call - confirm.
  xnn_status_uninitialized = 1,
  /// NOTE(review): presumably one of the arguments has an invalid value - confirm against the implementation.
  xnn_status_invalid_parameter = 2,
  /// NOTE(review): presumably the object is not in a state that permits the requested call - confirm.
  xnn_status_invalid_state = 3,
  /// NOTE(review): presumably the arguments are valid, but the combination is not supported - confirm.
  xnn_status_unsupported_parameter = 4,
  /// The host processor does not satisfy the minimum hardware requirements for XNNPACK (see xnn_initialize).
  xnn_status_unsupported_hardware = 5,
  /// The call failed due to an out-of-memory condition.
  xnn_status_out_of_memory = 6,
};
89
/// Set of user-supplied memory management callbacks that can be passed to xnn_initialize.
///
/// Every callback receives the @ref context member as its first argument.
struct xnn_allocator {
  /// User-specified pointer that will be passed as-is to all functions in this structure.
  void* context;
  /// Pointer to a function to be called for general memory allocation.
  ///
  /// @param context - The user-specified pointer from xnn_allocator structure.
  /// @param size - The size of the memory block to allocate, in bytes.
  ///
  /// @returns Pointer to the allocated memory block of at least @ref size bytes.
  ///          If allocation fails, the function must return NULL.
  void* (*allocate)(void* context, size_t size);
  /// Pointer to a function to be called for general memory re-allocation, i.e. to increase or shrink a previously
  /// allocated memory block. The content of the old memory block is copied to the new memory block.
  ///
  /// @param context - The user-specified pointer from xnn_allocator structure.
  /// @param pointer - Pointer to a memory block allocated by @ref allocate or @ref reallocate functions. Can be NULL.
  ///                  If the pointer is NULL, the @ref reallocate call is equivalent to an @ref allocate call.
  /// @param size - The new size of the memory block to allocate, in bytes.
  ///
  /// @returns Pointer to the newly allocated memory block of at least @ref size bytes with the content of the previous
  ///          memory block.
  ///          If allocation fails, the function must return NULL, but must not release the previous memory block.
  void* (*reallocate)(void* context, void* pointer, size_t size);
  /// Pointer to a function to be called for general memory de-allocation.
  ///
  /// @param context - The user-specified pointer from xnn_allocator structure.
  /// @param pointer - Pointer to a memory block allocated by @ref allocate or @ref reallocate functions. Can be NULL.
  ///                  If the pointer is NULL, the @ref deallocate call is a no-op.
  void (*deallocate)(void* context, void* pointer);
  /// Pointer to a function to be called for aligned memory allocation.
  ///
  /// @param context - The user-specified pointer from xnn_allocator structure.
  /// @param alignment - The alignment of the memory block to allocate, in bytes. Alignment is always a power-of-2.
  /// @param size - The size of the memory block to allocate, in bytes.
  ///
  /// @returns Pointer to the allocated memory block of at least @ref size bytes.
  ///          If allocation fails, the function must return NULL.
  void* (*aligned_allocate)(void* context, size_t alignment, size_t size);
  /// Pointer to a function to be called for aligned memory de-allocation.
  ///
  /// @param context - The user-specified pointer from xnn_allocator structure.
  /// @param pointer - Pointer to a memory block allocated by @ref aligned_allocate function. Can be NULL.
  ///                  If the pointer is NULL, the @ref aligned_deallocate call is a no-op.
  void (*aligned_deallocate)(void* context, void* pointer);
};
135
136	/// Initialize XNNPACK library.
137	///
138	/// XNNPACK must be successfully initialized before use. During initialization, XNNPACK populates internal structures
139	/// depending on the host processor. Initialization can be time-consuming.
140	///
141	/// @param[in] allocator - structure with function pointers to be used for memory allocation and de-allocation.
142	/// If this argument is NULL, system-provided memory management functions (e.g. malloc/free)
143	/// will be used.
144	///
145	/// @retval xnn_status_success - XNNPACK is successfully initialized and ready to use.
146	/// @retval xnn_status_out_of_memory - initialization failed due to out-of-memory condition.
147	/// @retval xnn_status_unsupported_hardware - initialization failed because the host processor does not satisfy the
148	/// minimum hardware requirements for XNNPACK. E.g. this may happen on x86
149	/// processors without SSE2 extension, or on 32-bit ARM processors without
150	/// the NEON SIMD extension.
151	enum xnn_status xnn_initialize(const struct xnn_allocator* allocator);
152
153	/// Deinitialize XNNPACK library.
154	///
155	/// To avoid memory and resource leaks, users must call xnn_deinitialize once for each successful xnn_initialize call.
156	///
157	/// @retval xnn_status_success - deinitialization call succeeded.
158	enum xnn_status xnn_deinitialize(void);
159
160	/// Subgraph is an abstract representation of a neural network model.
161	/// Subgraph objects are used to define Values (tensors) and Nodes (operators) comprising the model.
162	typedef struct xnn_subgraph* xnn_subgraph_t;
163
164	/// Create an empty Subgraph object.
165	///
166	/// @param external_value_ids - number of Value IDs to reserve for communication with external graph representation.
167	/// The Subgraph object would avoid creating internal Value IDs in the
168	/// [0, external_value_ids-1] range.
169	/// @param flags - binary features of the subgraph. No supported flags are currently defined.
170	/// @param subgraph_out - pointer to the variable that will be initialized with a handle to the Subgraph object upon
171	/// successful return.
172	enum xnn_status xnn_create_subgraph(
173	uint32_t external_value_ids,
174	uint32_t flags,
175	xnn_subgraph_t* subgraph_out);
176
177	/// Destroy a Subgraph object, as well as Values, and Nodes associated with the subgraph.
178	///
179	/// @param subgraph - the Subgraph object to destroy.
180	enum xnn_status xnn_delete_subgraph(
181	xnn_subgraph_t subgraph);
182
183	#define XNN_VALUE_FLAG_EXTERNAL_INPUT 0x00000001
184	#define XNN_VALUE_FLAG_EXTERNAL_OUTPUT 0x00000002
185	#define XNN_VALUE_FLAG_PERSISTENT 0x00000004
186
187	#define XNN_INVALID_VALUE_ID UINT32_MAX
188
/// Type of elements in a Value object.
enum xnn_datatype {
  /// Invalid data type. Valid Values never have this datatype.
  xnn_datatype_invalid = 0,
  /// IEEE754 single-precision floating-point.
  xnn_datatype_fp32 = 1,
  /// IEEE754 half-precision floating-point.
  xnn_datatype_fp16 = 2,
  /// Quantized 8-bit signed integer with shared per-Value quantization parameters.
  xnn_datatype_qint8 = 3,
  /// Quantized 8-bit unsigned integer with shared per-Value quantization parameters.
  xnn_datatype_quint8 = 4,
  /// Quantized 32-bit signed integer with shared per-Value quantization parameters.
  xnn_datatype_qint32 = 5,
  /// Quantized 8-bit signed integer with shared per-channel quantization parameters.
  xnn_datatype_qcint8 = 6,
  /// Quantized 32-bit signed integer with shared per-channel quantization parameters.
  xnn_datatype_qcint32 = 7,
};
208
209	/// Define a tensor-type Value and add it to a Subgraph.
210	///
211	/// @param subgraph - a Subgraph object that will own the created Value.
212	/// @param datatype - type of the tensor elements.
213	/// @param num_dims - number of dimensions in the shape.
214	/// @param dims - pointer to an array of @a num_dims shape dimensions. If num_dims is 0, this pointer can be NULL.
215	/// XNNPACK does not keep any pointers to this array after the function returns.
216	/// @param data - pointer to static data used for tensor initialization. If the tensor is not statically initialized,
217	/// this pointer must be NULL. If non-NULL, the life-time of the static data must exceed the life-time
218	/// of the Subgraph object, and of any Runtime objects created from the Subgraph.
219	/// @param external_id - external ID for the Value. The ID must be within the range of reserved Value IDs specified on
220	/// the Subgraph creation. If the external ID is XNN_INVALID_VALUE_ID, an internal ID will be
221	/// created for the Value.
222	/// @param flags - binary features of the Value. Supported values are any combination of XNN_VALUE_FLAG_EXTERNAL_INPUT
223	/// and XNN_VALUE_FLAG_EXTERNAL_OUTPUT.
224	/// @param id_out - pointer to the variable that will be initialized with the Value ID upon successful return. If a
225	/// valid @a external_id was provided, the variable will be initialized with the @a external_id value.
226	enum xnn_status xnn_define_tensor_value(
227	xnn_subgraph_t subgraph,
228	enum xnn_datatype datatype,
229	size_t num_dims,
230	const size_t* dims,
231	const void* data,
232	uint32_t external_id,
233	uint32_t flags,
234	uint32_t* id_out);
235
236	/// Define a quantized tensor-type Value and add it to a Subgraph.
237	///
238	/// @param subgraph - a Subgraph object that will own the created Value.
239	/// @param datatype - type of the tensor elements.
240	/// @param zero_point - offset from zero to subtract from the quantized elements in the Value.
241	/// @param scale - multiplication factor to convert quantized elements to real representation.
242	/// @param num_dims - number of dimensions in the shape.
243	/// @param dims - pointer to an array of @a num_dims shape dimensions. If num_dims is 0, this pointer can be NULL.
244	/// XNNPACK does not keep any pointers to this array after the function returns.
245	/// @param data - pointer to static data used for tensor initialization. If the tensor is not statically initialized,
246	/// this pointer must be NULL. If non-NULL, the life-time of the static data must exceed the life-time
247	/// of the Subgraph object, and of any Runtime objects created from the Subgraph.
248	/// @param external_id - external ID for the Value. The ID must be within the range of reserved Value IDs specified on
249	/// the Subgraph creation. If the external ID is XNN_INVALID_VALUE_ID, an internal ID will be
250	/// created for the Value.
251	/// @param flags - binary features of the Value. Supported values are any combination of XNN_VALUE_FLAG_EXTERNAL_INPUT
252	/// and XNN_VALUE_FLAG_EXTERNAL_OUTPUT.
253	/// @param id_out - pointer to the variable that will be initialized with the Value ID upon successful return. If a
254	/// valid @a external_id was provided, the variable will be initialized with the @a external_id value.
255	enum xnn_status xnn_define_quantized_tensor_value(
256	xnn_subgraph_t subgraph,
257	enum xnn_datatype datatype,
258	int32_t zero_point,
259	float scale,
260	size_t num_dims,
261	const size_t* dims,
262	const void* data,
263	uint32_t external_id,
264	uint32_t flags,
265	uint32_t* id_out);
266
267	/// Define a channelwise quantized tensor-type Value and add it to a Subgraph.
268	///
269	/// @param subgraph - a Subgraph object that will own the created Value.
270	/// @param datatype - type of the tensor elements.
271	/// @param scale - per-channel multiplication factors to convert quantized elements to real representation.
272	/// @param num_dims - number of dimensions in the shape.
273	/// @param channel_dim - index of the channel dimension in the tensor with per-channel quantization parameters.
274	/// Typically this is the first dimension (dimension #0) of the filter tensors in the Convolution,
275	/// Deconvolution, and Fully Connected operators and the last dimension of the filter tensors in
276	/// the Depthwise Convolution operators.
277	/// @param dims - pointer to an array of @a num_dims shape dimensions. If num_dims is 0, this pointer can be NULL.
278	/// XNNPACK does not keep any pointers to this array after the function returns.
279	/// @param data - pointer to static data used for tensor initialization. If the tensor is not statically initialized,
280	/// this pointer must be NULL. If non-NULL, the life-time of the static data must exceed the life-time
281	/// of the Subgraph object, and of any Runtime objects created from the Subgraph.
282	/// @param external_id - external ID for the Value. The ID must be within the range of reserved Value IDs specified on
283	/// the Subgraph creation. If the external ID is XNN_INVALID_VALUE_ID, an internal ID will be
284	/// created for the Value.
285	/// @param flags - binary features of the Value. Supported values are any combination of XNN_VALUE_FLAG_EXTERNAL_INPUT
286	/// and XNN_VALUE_FLAG_EXTERNAL_OUTPUT.
287	/// @param id_out - pointer to the variable that will be initialized with the Value ID upon successful return. If a
288	/// valid @a external_id was provided, the variable will be initialized with the @a external_id value.
289	enum xnn_status xnn_define_channelwise_quantized_tensor_value(
290	xnn_subgraph_t subgraph,
291	enum xnn_datatype datatype,
292	const float* scale,
293	size_t num_dims,
294	size_t channel_dim,
295	const size_t* dims,
296	const void* data,
297	uint32_t external_id,
298	uint32_t flags,
299	uint32_t* id_out);
300
301	/// Define a Convert Node and add it to a Subgraph.
302	///
303	/// @param subgraph - a Subgraph object that will own the created Node.
304	/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
305	/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
306	/// shape must match the shape of the input tensor.
307	/// @param flags - binary features of the Convert Node. No supported flags are currently defined.
308	enum xnn_status xnn_define_convert(
309	xnn_subgraph_t subgraph,
310	uint32_t input_id,
311	uint32_t output_id,
312	uint32_t flags);
313
314	/// Define a 2D Convolution Node and add it to a Subgraph.
315	///
316	/// @param subgraph - a Subgraph object that will own the created Node.
317	/// @param input_padding_top - implicit zero-padding above 2D input data. Must be 0 if XNN_FLAG_TENSORFLOW_SAME_PADDING
318	/// flag is specified.
319	/// @param input_padding_right - implicit zero-padding to the right of 2D input data. Must be 0 if
320	/// XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
321	/// @param input_padding_bottom - implicit zero-padding below 2D input data. Must be 0 if
322	/// XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
323	/// @param input_padding_left - implicit zero-padding to the left of 2D input data. Must be 0 if
324	/// XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
325	/// @param kernel_height - kernel (filter) height.
326	/// @param kernel_width - kernel (filter) width.
327	/// @param subsampling_height - height of subsampling region for convolution output (convolution height stride).
328	/// @param subsampling_width - width of subsampling region for convolution output (convolution width stride).
329	/// @param dilation_height - dilation of kernel elements along the height dimension.
330	/// @param dilation_width - dilation of kernel elements along the width dimension.
331	/// @param groups - number of convolution groups.
332	/// @param group_input_channels - number of input channels per group.
333	/// @param group_output_channels - number of output channels per group.
334	/// @param output_min - lower bound for clipping output values.
335	/// @param output_max - upper bound for clipping output values.
336	/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
337	/// with [N, IH, IW, groups * group_input_channels] dimensions.
338	/// @param filter_id - Value ID for the filter tensor. The filter tensor must be a 4D tensor defined in the @a subgraph
339	/// with [groups * group_output_channels, kernel_height, kernel_width, group_input_channels]
340	/// dimensions.
341	/// @param bias_id - Value ID for the bias tensor, or XNN_INVALID_VALUE_ID for a 2D Convolution Node without a bias. If
342	/// present, the bias tensor must be a 1D tensor defined in the @a subgraph with [groups *
343	/// group_output_channels] dimensions.
344	/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
345	/// with [N, OH, OW, groups * group_output_channels] dimensions.
346	/// @param flags - binary features of the 2D Convolution Node. The only currently supported value is
347	/// XNN_FLAG_TENSORFLOW_SAME_PADDING.
348	enum xnn_status xnn_define_convolution_2d(
349	xnn_subgraph_t subgraph,
350	uint32_t input_padding_top,
351	uint32_t input_padding_right,
352	uint32_t input_padding_bottom,
353	uint32_t input_padding_left,
354	uint32_t kernel_height,
355	uint32_t kernel_width,
356	uint32_t subsampling_height,
357	uint32_t subsampling_width,
358	uint32_t dilation_height,
359	uint32_t dilation_width,
360	uint32_t groups,
361	size_t group_input_channels,
362	size_t group_output_channels,
363	float output_min,
364	float output_max,
365	uint32_t input_id,
366	uint32_t filter_id,
367	uint32_t bias_id,
368	uint32_t output_id,
369	uint32_t flags);
370
371	/// Define a 2D Deconvolution (Transposed Convolution) Node and add it to a Subgraph.
372	///
373	/// @param subgraph - a Subgraph object that will own the created Node.
374	/// @param padding_top - implicit padding above 2D output data.
375	/// @param padding_right - implicit padding to the right of 2D output data.
376	/// @param padding_bottom - implicit padding below 2D output data.
377	/// @param padding_left - implicit padding to the left of 2D output data.
378	/// @param adjustment_height - additional elements in the bottom of the 2D output data.
379	/// @param adjustment_width - additional elements to the right of the 2D output data.
380	/// @param kernel_height - kernel (filter) height.
381	/// @param kernel_width - kernel (filter) width.
382	/// @param upsampling_height - height of upsampling region for deconvolution input (deconvolution height stride).
383	/// @param upsampling_width - width of upsampling region for deconvolution input (deconvolution width stride).
384	/// @param dilation_height - dilation of kernel elements along the height dimension.
385	/// @param dilation_width - dilation of kernel elements along the width dimension.
386	/// @param groups - number of convolution groups.
387	/// @param group_input_channels - number of input channels per group.
388	/// @param group_output_channels - number of output channels per group.
389	/// @param output_min - lower bound for clipping output values.
390	/// @param output_max - upper bound for clipping output values.
391	/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
392	/// with [N, IH, IW, groups * group_input_channels] dimensions.
393	/// @param filter_id - Value ID for the filter tensor. The filter tensor must be a 4D tensor defined in the @a subgraph
394	/// with [groups * group_output_channels, kernel_height, kernel_width, group_input_channels]
395	/// dimensions.
396	/// @param bias_id - Value ID for the bias tensor, or XNN_INVALID_VALUE_ID for a 2D Deconvolution Node without a bias. If
397	/// present, the bias tensor must be a 1D tensor defined in the @a subgraph with
398	/// [groups * group_output_channels] dimensions.
399	/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
400	/// with [N, OH, OW, groups * group_output_channels] dimensions.
401	/// @param flags - binary features of the 2D Deconvolution Node. No supported flags are currently defined.
402	enum xnn_status xnn_define_deconvolution_2d(
403	xnn_subgraph_t subgraph,
404	uint32_t padding_top,
405	uint32_t padding_right,
406	uint32_t padding_bottom,
407	uint32_t padding_left,
408	uint32_t adjustment_height,
409	uint32_t adjustment_width,
410	uint32_t kernel_height,
411	uint32_t kernel_width,
412	uint32_t upsampling_height,
413	uint32_t upsampling_width,
414	uint32_t dilation_height,
415	uint32_t dilation_width,
416	uint32_t groups,
417	size_t group_input_channels,
418	size_t group_output_channels,
419	float output_min,
420	float output_max,
421	uint32_t input_id,
422	uint32_t filter_id,
423	uint32_t bias_id,
424	uint32_t output_id,
425	uint32_t flags);
426
427	/// Define a 2D Depthwise Convolution Node and add it to a Subgraph.
428	///
429	/// @param subgraph - a Subgraph object that will own the created Node.
430	/// @param input_padding_top - implicit zero-padding above 2D input data. Must be 0 if XNN_FLAG_TENSORFLOW_SAME_PADDING
431	/// flag is specified.
432	/// @param input_padding_right - implicit zero-padding to the right of 2D input data. Must be 0 if
433	/// XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
434	/// @param input_padding_bottom - implicit zero-padding below 2D input data. Must be 0 if
435	/// XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
436	/// @param input_padding_left - implicit zero-padding to the left of 2D input data. Must be 0 if
437	/// XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
438	/// @param kernel_height - kernel (filter) height.
439	/// @param kernel_width - kernel (filter) width.
440	/// @param subsampling_height - height of subsampling region for convolution output (convolution height stride).
441	/// @param subsampling_width - width of subsampling region for convolution output (convolution width stride).
442	/// @param dilation_height - dilation of kernel elements along the height dimension.
443	/// @param dilation_width - dilation of kernel elements along the width dimension.
444	/// @param depth_multiplier - ratio of output channels to input channels.
445	/// @param input_channels - number of input channels.
446	/// @param output_min - lower bound for clipping output values.
447	/// @param output_max - upper bound for clipping output values.
448	/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
449	/// with [N, IH, IW, input_channels] dimensions
450	/// @param filter_id - Value ID for the filter tensor. The filter tensor must be a 4D tensor defined in the @a subgraph
451	/// with [1, kernel_height, kernel_width, input_channels * depth_multiplier] dimensions.
452	/// @param bias_id - Value ID for the bias tensor, or XNN_INVALID_VALUE_ID for a 2D Depthwise Convolution Node without
453	/// a bias. If present, the bias tensor must be a 1D tensor defined in the @a subgraph with
454	/// [input_channels * depth_multiplier] dimensions.
455	/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
456	/// with [N, OH, OW, input_channels * depth_multiplier] dimensions.
457	/// @param flags - binary features of the 2D Depthwise Convolution Node. The only currently supported value is
458	/// XNN_FLAG_TENSORFLOW_SAME_PADDING.
459	enum xnn_status xnn_define_depthwise_convolution_2d(
460	xnn_subgraph_t subgraph,
461	uint32_t input_padding_top,
462	uint32_t input_padding_right,
463	uint32_t input_padding_bottom,
464	uint32_t input_padding_left,
465	uint32_t kernel_height,
466	uint32_t kernel_width,
467	uint32_t subsampling_height,
468	uint32_t subsampling_width,
469	uint32_t dilation_height,
470	uint32_t dilation_width,
471	uint32_t depth_multiplier,
472	size_t input_channels,
473	float output_min,
474	float output_max,
475	uint32_t input_id,
476	uint32_t filter_id,
477	uint32_t bias_id,
478	uint32_t output_id,
479	uint32_t flags);
480
481	/// Define a Depth To Space Node and add it to a Subgraph.
482	///
483	/// The Depth To Space Node rearranges data from depth into blocks of spatial data (a reverse transform to
484	/// Space To Depth). For a given input pixel, an output square of pixels with side @a block_size is formed from values
485	/// in the corresponding number of its channels. The output depth is therefore @a block_size x @a block_size times
486	/// smaller than that of the input.
487	///
488	/// @param subgraph - a Subgraph object that will own the created Node.
489	/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
490	/// with [N, IH, IW, OC * block_size * block_size] dimensions.
491	/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
492	/// with [N, IH * block_size, IW * block_size, OC] dimensions.
493	/// @param block_size - the size of the spatial block.
494	/// @param flags - binary features of the Depth To Space Node. No supported flags are currently defined.
495	enum xnn_status xnn_define_depth_to_space(
496	xnn_subgraph_t subgraph,
497	uint32_t input_id,
498	uint32_t output_id,
499	uint32_t block_size,
500	uint32_t flags);
501
502	/// Define a 1D Global Average Pooling Node and add it to a Subgraph.
503	///
504	/// @param subgraph - a Subgraph object that will own the created Node.
505	/// @param output_min - lower bound for clipping output values.
506	/// @param output_max - upper bound for clipping output values.
507	/// @param input_id - Value ID for the input tensor. The input tensor must be a dense tensor with 2 or more dimensions
508	/// defined in the @a subgraph. Averaging is performed across the second-innermost dimension.
509	/// @param output_id - Value ID for the output tensor. The output tensor must be a dense tensor with 2 or more
510	/// dimensions defined in the @a subgraph.
511	/// @param flags - binary features of the 1D Global Average Pooling Node. No supported flags are currently defined.
512	enum xnn_status xnn_define_global_average_pooling_1d(
513	xnn_subgraph_t subgraph,
514	float output_min,
515	float output_max,
516	uint32_t input_id,
517	uint32_t output_id,
518	uint32_t flags);
519
520	/// Define a 2D Global Average Pooling Node and add it to a Subgraph.
521	///
522	/// @param subgraph - a Subgraph object that will own the created Node.
523	/// @param output_min - lower bound for clipping output values.
524	/// @param output_max - upper bound for clipping output values.
525	/// @param input_id - Value ID for the input tensor. The input tensor must be a dense tensor with 3 or more dimensions
526	/// defined in the @a subgraph. Averaging is performed across the second- and third-innermost
527	/// dimensions.
528	/// @param output_id - Value ID for the output tensor. The output tensor must be a dense tensor with 3 or more
529	/// dimensions defined in the @a subgraph.
530	/// @param flags - binary features of the 2D Global Average Pooling Node. No supported flags are currently defined.
531	enum xnn_status xnn_define_global_average_pooling_2d(
532	xnn_subgraph_t subgraph,
533	float output_min,
534	float output_max,
535	uint32_t input_id,
536	uint32_t output_id,
537	uint32_t flags);
538
539	/// Define a 2D Average Pooling Node and add it to a Subgraph.
540	///
541	/// @param subgraph - a Subgraph object that will own the created Node.
542	/// @param input_padding_top - implicit zero-padding above 2D input data. Must be 0 if XNN_FLAG_TENSORFLOW_SAME_PADDING
543	/// flag is specified.
544	/// @param input_padding_right - implicit zero-padding to the right of 2D input data. Must be 0 if
545	/// XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
546	/// @param input_padding_bottom - implicit zero-padding below 2D input data. Must be 0 if
547	/// XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
548	/// @param input_padding_left - implicit zero-padding to the left of 2D input data. Must be 0 if
549	/// XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
550	/// @param pooling_height - pooling (kernel) height.
551	/// @param pooling_width - pooling (kernel) width.
552	/// @param stride_height - displacing of the pooling window in the vertical dimension of the input pixels corresponding
553	/// to vertically adjacent output pixels.
554	/// @param stride_width - displacing of the pooling window in the horizontal dimension of the input pixels corresponding
555	/// to horizontally adjacent output pixels.
556	/// @param output_min - lower bound for clipping output values.
557	/// @param output_max - upper bound for clipping output values.
558	/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
559	/// with [N, IH, IW, channels] dimensions
560	/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
561	/// with [N, OH, OW, channels] dimensions.
562	/// @param flags - binary features of the 2D Average Pooling Node. The only currently supported value is
563	/// XNN_FLAG_TENSORFLOW_SAME_PADDING.
564	enum xnn_status xnn_define_average_pooling_2d(
565	xnn_subgraph_t subgraph,
566	uint32_t input_padding_top,
567	uint32_t input_padding_right,
568	uint32_t input_padding_bottom,
569	uint32_t input_padding_left,
570	uint32_t pooling_height,
571	uint32_t pooling_width,
572	uint32_t stride_height,
573	uint32_t stride_width,
574	float output_min,
575	float output_max,
576	uint32_t input_id,
577	uint32_t output_id,
578	uint32_t flags);
579
580	/// Define a Fully Connected Node and add it to a Subgraph.
581	///
582	/// @param subgraph - a Subgraph object that will own the created Node.
583	/// @param output_min - lower bound for clipping output values.
584	/// @param output_max - upper bound for clipping output values.
585	/// @param input_id - Value ID for the input tensor. The input tensor must be an N-dimensional tensor defined in the
586	/// @a subgraph. If XNN_FLAG_TENSORFLOW_RESHAPE_2D is not specified, the input tensor must be at least
587	/// 1D and its last dimension must match the last dimension of the filter tensor. In particular, if
588	/// input is a 2D tensor, it must have [batch_size, input_channels] dimensions.
589	/// If XNN_FLAG_TENSORFLOW_RESHAPE_2D is specified, the number of elements in the input tensor must be
590	/// divisible by the input_channels. The tensor will be first flattened into a 1D tensor of
591	/// [num_input_elements] dimensions, then reshaped into a 2D tensor of
592	/// [num_input_elements / input_channels, input_channels] dimensions where num_input_elements is the
593	/// total number of elements in the input tensor.
594	/// @param filter_id - Value ID for the filter tensor. The filter tensor must be a 2D tensor defined in the @a subgraph.
595	/// If the XNN_FLAG_TRANSPOSE_WEIGHTS flag is not specified, the filter tensor must have
596	/// [output_channels, input_channels] dimensions. If the XNN_FLAG_TRANSPOSE_WEIGHTS flag is
597	/// specified, the filter tensor must have [input_channels, output_channels] dimensions.
598	/// @param bias_id - Value ID for the bias tensor, or XNN_INVALID_VALUE_ID for a Fully Connected Node without a bias.
599	/// If present, the bias tensor must be a 1D tensor defined in the @a subgraph with [output_channels]
600	/// dimensions.
601	/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph.
602	/// If XNN_FLAG_TENSORFLOW_RESHAPE_2D is not specified, the output tensor must have the same
603	/// dimensionality as the input tensor, all its dimensions but the last one must match the
604	/// corresponding dimensions of the input tensor, and the last dimension of the output tensor must
605	/// match the first dimension of the filter tensor. In particular, if input is a 2D tensor, output
606	/// must be a 2D tensor of [batch_size, output_channels] dimensions.
607	/// If XNN_FLAG_TENSORFLOW_RESHAPE_2D is specified, output must be a 2D tensor of
608	/// [num_input_elements / input_channels, output_channels] dimensions where num_input_elements is the
609	/// total number of elements in the input tensor.
610	/// @param flags - binary features of the Fully Connected Node. The only currently supported values are
611	/// XNN_FLAG_TENSORFLOW_RESHAPE_2D and XNN_FLAG_TRANSPOSE_WEIGHTS.
612	enum xnn_status xnn_define_fully_connected(
613	xnn_subgraph_t subgraph,
614	float output_min,
615	float output_max,
616	uint32_t input_id,
617	uint32_t filter_id,
618	uint32_t bias_id,
619	uint32_t output_id,
620	uint32_t flags);
621
622	/// Define a 2D Max Pooling Node and add it to a Subgraph.
623	///
624	/// @param subgraph - a Subgraph object that will own the created Node.
625	/// @param input_padding_top - implicit zero-padding above 2D input data. Must be 0 if XNN_FLAG_TENSORFLOW_SAME_PADDING
626	/// flag is specified.
627	/// @param input_padding_right - implicit zero-padding to the right of 2D input data. Must be 0 if
628	/// XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
629	/// @param input_padding_bottom - implicit zero-padding below 2D input data. Must be 0 if
630	/// XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
631	/// @param input_padding_left - implicit zero-padding to the left of 2D input data. Must be 0 if
632	/// XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
633	/// @param pooling_height - pooling (kernel) height.
634	/// @param pooling_width - pooling (kernel) width.
635	/// @param stride_height - displacing of the pooling window in the vertical dimension of the input pixels corresponding
636	/// to vertically adjacent output pixels.
637	/// @param stride_width - displacing of the pooling window in the horizontal dimension of the input pixels corresponding
638	/// to horizontally adjacent output pixels.
639	/// @param dilation_height - dilation of pooling elements along the height dimension.
640	/// @param dilation_width - dilation of pooling elements along the width dimension.
641	/// @param output_min - lower bound for clipping output values.
642	/// @param output_max - upper bound for clipping output values.
643	/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
644	/// with [N, IH, IW, channels] dimensions.
645	/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
646	/// with [N, OH, OW, channels] dimensions.
647	/// @param flags - binary features of the 2D Max Pooling Node. The only currently supported value is
648	/// XNN_FLAG_TENSORFLOW_SAME_PADDING.
649	enum xnn_status xnn_define_max_pooling_2d(
650	xnn_subgraph_t subgraph,
651	uint32_t input_padding_top,
652	uint32_t input_padding_right,
653	uint32_t input_padding_bottom,
654	uint32_t input_padding_left,
655	uint32_t pooling_height,
656	uint32_t pooling_width,
657	uint32_t stride_height,
658	uint32_t stride_width,
659	uint32_t dilation_height,
660	uint32_t dilation_width,
661	float output_min,
662	float output_max,
663	uint32_t input_id,
664	uint32_t output_id,
665	uint32_t flags);
666
667	/// Define a 2D ArgMax Pooling Node and add it to a Subgraph.
668	///
669	/// @param subgraph - a Subgraph object that will own the created Node.
670	/// @param input_padding_top - implicit zero-padding above 2D input data.
671	/// @param input_padding_right - implicit zero-padding to the right of 2D input data.
672	/// @param input_padding_bottom - implicit zero-padding below 2D input data.
673	/// @param input_padding_left - implicit zero-padding to the left of 2D input data.
674	/// @param pooling_height - pooling (kernel) height. Vertical stride between pooling regions matches this value.
675	/// @param pooling_width - pooling (kernel) width. Horizontal stride between pooling regions matches this value.
676	/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
677	/// with [N, IH, IW, channels] dimensions.
678	/// @param output_value_id - Value ID for the output tensor with the maximum values in the pools. The output tensor must
679	/// be a 4D tensor defined in the @a subgraph with [N, OH, OW, channels] dimensions.
680	/// @param output_index_id - Value ID for the output tensor with the indexes of the maximum values in the pools. The
681	/// output tensor must be a 4D tensor defined in the @a subgraph with [N, OH, OW, channels]
682	/// dimensions.
683	/// @param flags - binary features of the 2D ArgMax Pooling Node. No supported flags are currently defined.
684	enum xnn_status xnn_define_argmax_pooling_2d(
685	xnn_subgraph_t subgraph,
686	uint32_t input_padding_top,
687	uint32_t input_padding_right,
688	uint32_t input_padding_bottom,
689	uint32_t input_padding_left,
690	uint32_t pooling_height,
691	uint32_t pooling_width,
692	uint32_t input_id,
693	uint32_t output_value_id,
694	uint32_t output_index_id,
695	uint32_t flags);
696
697	/// Define a 2D UnPooling Node and add it to a Subgraph.
698	///
699	/// @param subgraph - a Subgraph object that will own the created Node.
700	/// @param padding_top - implicit padding above 2D output data.
701	/// @param padding_right - implicit padding to the right of 2D output data.
702	/// @param padding_bottom - implicit padding below 2D output data.
703	/// @param padding_left - implicit padding to the left of 2D output data.
704	/// @param pooling_height - height of the pooling window.
705	/// @param pooling_width - width of the pooling window.
706	/// @param input_value_id - Value ID for the input tensor with the max-pooling values to invert. The input value tensor
707	/// must be a 4D tensor defined in the @a subgraph with [N, IH, IW, channels] dimensions.
708	/// @param input_index_id - Value ID for the input tensor with the indices of the per-pool maximum values produced by
709	/// a 2D ArgMax Pooling Node. The input tensor must be a 4D tensor defined in the @a subgraph with
710	/// [N, IH, IW, channels] dimensions.
711	/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
712	/// with [N, OH, OW, channels] dimensions.
713	/// @param flags - binary features of the 2D UnPooling Node. No supported flags are currently defined.
714	enum xnn_status xnn_define_unpooling_2d(
715	xnn_subgraph_t subgraph,
716	uint32_t padding_top,
717	uint32_t padding_right,
718	uint32_t padding_bottom,
719	uint32_t padding_left,
720	uint32_t pooling_height,
721	uint32_t pooling_width,
722	uint32_t input_value_id,
723	uint32_t input_index_id,
724	uint32_t output_id,
725	uint32_t flags);
726
727	/// Define a 2-Input Add Node and add it to a Subgraph.
728	///
729	/// The 2-Input Add Node computes elementwise addition of two tensor inputs with numpy broadcasting rules.
730	///
731	/// @param subgraph - a Subgraph object that will own the created Node.
732	/// @param output_min - lower bound for clipping output values.
733	/// @param output_max - upper bound for clipping output values.
734	/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
735	/// the @a subgraph with each dimension either equal to the corresponding dimension of the second
736	/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
737	/// that dimension.
738	/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in
739	/// the @a subgraph with each dimension either equal to the corresponding dimension of the first
740	/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
741	/// that dimension.
742	/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined
743	/// in the @a subgraph with each dimension equal to the maximum of the corresponding dimensions
744	/// of the two inputs.
745	/// @param flags - binary features of the Add Node. No supported flags are currently defined.
746	enum xnn_status xnn_define_add2(
747	xnn_subgraph_t subgraph,
748	float output_min,
749	float output_max,
750	uint32_t input1_id,
751	uint32_t input2_id,
752	uint32_t output_id,
753	uint32_t flags);
754
755	/// Define a 2-Input Multiply Node and add it to a Subgraph.
756	///
757	/// The 2-Input Multiply Node computes elementwise multiplication of two tensor inputs with numpy broadcasting rules.
758	///
759	/// @param subgraph - a Subgraph object that will own the created Node.
760	/// @param output_min - lower bound for clipping output values.
761	/// @param output_max - upper bound for clipping output values.
762	/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
763	/// the @a subgraph with each dimension either equal to the corresponding dimension of the second
764	/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
765	/// that dimension.
766	/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in
767	/// the @a subgraph with each dimension either equal to the corresponding dimension of the first
768	/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
769	/// that dimension.
770	/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined
771	/// in the @a subgraph with each dimension equal to the maximum of the corresponding dimensions
772	/// of the two inputs.
773	/// @param flags - binary features of the Multiply Node. No supported flags are currently defined.
774	enum xnn_status xnn_define_multiply2(
775	xnn_subgraph_t subgraph,
776	float output_min,
777	float output_max,
778	uint32_t input1_id,
779	uint32_t input2_id,
780	uint32_t output_id,
781	uint32_t flags);
782
783	/// Define a Subtract Node and add it to a Subgraph.
784	///
785	/// The Subtract Node computes elementwise subtraction of two tensor inputs with numpy broadcasting rules.
786	///
787	/// @param subgraph - a Subgraph object that will own the created Node.
788	/// @param output_min - lower bound for clipping output values.
789	/// @param output_max - upper bound for clipping output values.
790	/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
791	/// the @a subgraph with each dimension either equal to the corresponding dimension of the second
792	/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
793	/// that dimension.
794	/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in
795	/// the @a subgraph with each dimension either equal to the corresponding dimension of the first
796	/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
797	/// that dimension.
798	/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined
799	/// in the @a subgraph with each dimension equal to the maximum of the corresponding dimensions
800	/// of the two inputs.
801	/// @param flags - binary features of the Subtract Node. No supported flags are currently defined.
802	enum xnn_status xnn_define_subtract(
803	xnn_subgraph_t subgraph,
804	float output_min,
805	float output_max,
806	uint32_t input1_id,
807	uint32_t input2_id,
808	uint32_t output_id,
809	uint32_t flags);
810
811	/// Define a Divide Node and add it to a Subgraph.
812	///
813	/// The Divide Node computes elementwise division of two tensor inputs with numpy broadcasting rules.
814	///
815	/// @param subgraph - a Subgraph object that will own the created Node.
816	/// @param output_min - lower bound for clipping output values.
817	/// @param output_max - upper bound for clipping output values.
818	/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
819	/// the @a subgraph with each dimension either equal to the corresponding dimension of the second
820	/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
821	/// that dimension.
822	/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in
823	/// the @a subgraph with each dimension either equal to the corresponding dimension of the first
824	/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
825	/// that dimension.
826	/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined
827	/// in the @a subgraph with each dimension equal to the maximum of the corresponding dimensions
828	/// of the two inputs.
829	/// @param flags - binary features of the Divide Node. No supported flags are currently defined.
830	enum xnn_status xnn_define_divide(
831	xnn_subgraph_t subgraph,
832	float output_min,
833	float output_max,
834	uint32_t input1_id,
835	uint32_t input2_id,
836	uint32_t output_id,
837	uint32_t flags);
838
839	/// Define a 2-Input Maximum Node and add it to a Subgraph.
840	///
841	/// The 2-Input Maximum Node computes elementwise maximum of two tensor inputs with numpy broadcasting rules.
842	///
843	/// @param subgraph - a Subgraph object that will own the created Node.
844	/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
845	/// the @a subgraph with each dimension either equal to the corresponding dimension of the second
846	/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
847	/// that dimension.
848	/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in
849	/// the @a subgraph with each dimension either equal to the corresponding dimension of the first
850	/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
851	/// that dimension.
852	/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined
853	/// in the @a subgraph with each dimension equal to the maximum of the corresponding dimensions
854	/// of the two inputs.
855	/// @param flags - binary features of the Maximum Node. No supported flags are currently defined.
856	enum xnn_status xnn_define_maximum2(
857	xnn_subgraph_t subgraph,
858	uint32_t input1_id,
859	uint32_t input2_id,
860	uint32_t output_id,
861	uint32_t flags);
862
863	/// Define a 2-Input Minimum Node and add it to a Subgraph.
864	///
865	/// The 2-Input Minimum Node computes elementwise minimum of two tensor inputs with numpy broadcasting rules.
866	///
867	/// @param subgraph - a Subgraph object that will own the created Node.
868	/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
869	/// the @a subgraph with each dimension either equal to the corresponding dimension of the second
870	/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
871	/// that dimension.
872	/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in
873	/// the @a subgraph with each dimension either equal to the corresponding dimension of the first
874	/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
875	/// that dimension.
876	/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined
877	/// in the @a subgraph with each dimension equal to the maximum of the corresponding dimensions
878	/// of the two inputs.
879	/// @param flags - binary features of the Minimum Node. No supported flags are currently defined.
880	enum xnn_status xnn_define_minimum2(
881	xnn_subgraph_t subgraph,
882	uint32_t input1_id,
883	uint32_t input2_id,
884	uint32_t output_id,
885	uint32_t flags);
886
887	/// Define a Squared Difference Node and add it to a Subgraph.
888	///
889	/// The Squared Difference Node computes elementwise squared difference of two tensor inputs with numpy broadcasting
890	/// rules.
891	///
892	/// @param subgraph - a Subgraph object that will own the created Node.
893	/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
894	/// the @a subgraph with each dimension either equal to the corresponding dimension of the second
895	/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
896	/// that dimension.
897	/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in
898	/// the @a subgraph with each dimension either equal to the corresponding dimension of the first
899	/// input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
900	/// that dimension.
901	/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined
902	/// in the @a subgraph with each dimension equal to the maximum of the corresponding dimensions
903	/// of the two inputs.
904	/// @param flags - binary features of the Squared Difference Node. No supported flags are currently defined.
905	enum xnn_status xnn_define_squared_difference(
906	xnn_subgraph_t subgraph,
907	uint32_t input1_id,
908	uint32_t input2_id,
909	uint32_t output_id,
910	uint32_t flags);
911
912	/// Define a Constant Pad Node with static padding specification and add it to a Subgraph.
913	///
914	/// @param subgraph - a Subgraph object that will own the created Node.
915	/// @param pre_paddings - number of padding elements to insert before input elements for every dimension. This array
916	/// must have as many elements as the number of dimensions in the input tensor.
917	/// @param post_paddings - number of padding elements to insert after input elements for every dimension. This array
918	/// must have as many elements as the number of dimensions in the input tensor.
919	/// @param padding_value - constant value used to initialize padding elements.
920	/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
921	/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
922	/// shape must match the shape of the input tensor after @a pre_paddings and @a post_paddings are applied.
923	/// @param flags - binary features of the Constant Pad Node. No supported flags are currently defined.
924	enum xnn_status xnn_define_static_constant_pad(
925	xnn_subgraph_t subgraph,
926	const size_t* pre_paddings,
927	const size_t* post_paddings,
928	float padding_value,
929	uint32_t input_id,
930	uint32_t output_id,
931	uint32_t flags);
932
933	/// Define a 2-Input Concatenate Node and add it to a Subgraph.
934	///
935	/// The 2-Input Concatenate Node concatenates two tensors along a specified axis.
936	///
937	/// @param subgraph - a Subgraph object that will own the created Node.
938	/// @param axis - the axis to concatenate the two input tensors along.
939	/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
940	/// the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the
941	/// second input.
942	/// @param input2_id - Value ID for the second input tensor. The input tensor must be an N-dimensional tensor defined in
943	/// the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the
944	/// first input.
945	/// @param output_id - Value ID for the output tensor. The output tensor must be an N-dimensional tensor defined
946	/// in the @a subgraph with each dimension equal to the dimension of both inputs, except the axis
947	/// dimension, where it is the sum of the corresponding dimensions of both inputs.
948	/// @param flags - binary features of the Concatenate Node. No supported flags are currently defined.
949	enum xnn_status xnn_define_concatenate2(
950	xnn_subgraph_t subgraph,
951	size_t axis,
952	uint32_t input1_id,
953	uint32_t input2_id,
954	uint32_t output_id,
955	uint32_t flags);
956
957	/// Define a 3-Input Concatenate Node and add it to a Subgraph.
958	///
959	/// The 3-Input Concatenate Node concatenates three tensors along a specified axis.
960	///
961	/// @param subgraph - a Subgraph object that will own the created Node.
962	/// @param axis - the axis to concatenate the three input tensors along.
963	/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
964	/// the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the
965	/// other inputs.
966	/// @param input2_id - Value ID for the second input tensor. The input tensor must be an N-dimensional tensor defined in
967	/// the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the
968	/// other inputs.
969	/// @param input3_id - Value ID for the third input tensor. The input tensor must be an N-dimensional tensor defined in
970	/// the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the
971	/// other inputs.
972	/// @param output_id - Value ID for the output tensor. The output tensor must be an N-dimensional tensor defined
973	/// in the @a subgraph with each dimension equal to the dimension of all inputs, except the axis
974	/// dimension, where it is the sum of the corresponding dimensions of all inputs.
975	/// @param flags - binary features of the Concatenate Node. No supported flags are currently defined.
976	enum xnn_status xnn_define_concatenate3(
977	xnn_subgraph_t subgraph,
978	size_t axis,
979	uint32_t input1_id,
980	uint32_t input2_id,
981	uint32_t input3_id,
982	uint32_t output_id,
983	uint32_t flags);
984
985	/// Define a 4-Input Concatenate Node and add it to a Subgraph.
986	///
987	/// The 4-Input Concatenate Node concatenates four tensors along a specified axis.
988	///
989	/// @param subgraph - a Subgraph object that will own the created Node.
990	/// @param axis - the axis to concatenate the four input tensors along.
991	/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
992	/// the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the
993	/// other inputs.
994	/// @param input2_id - Value ID for the second input tensor. The input tensor must be an N-dimensional tensor defined in
995	/// the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the
996	/// other inputs.
997	/// @param input3_id - Value ID for the third input tensor. The input tensor must be an N-dimensional tensor defined in
998	/// the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the
999	/// other inputs.
1000	/// @param input4_id - Value ID for the fourth input tensor. The input tensor must be an N-dimensional tensor defined in
1001	/// the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the
1002	/// other inputs.
1003	/// @param output_id - Value ID for the output tensor. The output tensor must be an N-dimensional tensor defined
1004	/// in the @a subgraph with each dimension equal to the dimension of all inputs, except the axis
1005	/// dimension, where it is the sum of the corresponding dimensions of all inputs.
1006	/// @param flags - binary features of the Concatenate Node. No supported flags are currently defined.
1007	enum xnn_status xnn_define_concatenate4(
1008	xnn_subgraph_t subgraph,
1009	size_t axis,
1010	uint32_t input1_id,
1011	uint32_t input2_id,
1012	uint32_t input3_id,
1013	uint32_t input4_id,
1014	uint32_t output_id,
1015	uint32_t flags);
1016
1017	/// Define a Copy Node and add it to a Subgraph.
1018	///
1019	/// The Copy Node copies an input tensor to an output tensor.
1020	///
1021	/// @param subgraph - a Subgraph object that will own the created Node.
1022	/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
1023	/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1024	/// shape must match the shape of the input tensor.
1025	/// @param flags - binary features of the Copy Node. No supported flags are currently defined.
1026	enum xnn_status xnn_define_copy(
1027	xnn_subgraph_t subgraph,
1028	uint32_t input_id,
1029	uint32_t output_id,
1030	uint32_t flags);
1031
1032	/// Define a 2-Output Split Node and add it to a Subgraph.
1033	///
1034	/// The 2-Output Split Node splits an input tensor into two output tensors along a specified axis evenly.
1035	///
1036	/// @param subgraph - a Subgraph object that will own the created Node.
1037	/// @param split_dim - the dimension to split the input tensor along.
1038	/// @param input_id - Value ID for the input tensor. The input tensor must be an N-dimensional tensor defined in the @a
1039	/// subgraph.
1040	/// @param output1_id - Value ID for the first output tensor. The output tensor must be an N-dimensional tensor defined
1041	/// in the @a subgraph with each dimension, except split_dim, equal to the corresponding dimension
1042	/// of the second output. The split_dim dimension is half of the input's split_dim.
1043	/// @param output2_id - Value ID for the second output tensor. The output tensor must be an N-dimensional tensor
1044	/// defined in the @a subgraph with each dimension, except split_dim, equal to the corresponding
1045	/// dimension of the first output. The split_dim dimension is half of the input's split_dim.
1046	/// @param flags - binary features of the Split Node. No supported flags are currently defined.
1047	enum xnn_status xnn_define_even_split2(
1048	xnn_subgraph_t subgraph,
1049	size_t split_dim,
1050	uint32_t input_id,
1051	uint32_t output1_id,
1052	uint32_t output2_id,
1053	uint32_t flags);
1054
1055	/// Define a 3-Output Split Node and add it to a Subgraph.
1056	///
1057	/// The 3-Output Split Node splits an input tensor into three output tensors along a specified axis evenly.
1058	///
1059	/// @param subgraph - a Subgraph object that will own the created Node.
1060	/// @param split_dim - the dimension to split the input tensor along.
1061	/// @param input_id - Value ID for the input tensor. The input tensor must be an N-dimensional tensor defined in the @a
1062	/// subgraph.
1063	/// @param output1_id - Value ID for the first output tensor. The output tensor must be an N-dimensional tensor defined
1064	/// in the @a subgraph with each dimension, except split_dim, equal to the corresponding dimension
1065	/// of the second and third outputs. The split_dim dimension is one third of the input's split_dim.
1066	/// @param output2_id - Value ID for the second output tensor. The output tensor must be an N-dimensional tensor
1067	/// defined in the @a subgraph with each dimension, except split_dim, equal to the corresponding
1068	/// dimension of the first and third outputs. The split_dim dimension is one third of the input's
1069	/// split_dim.
1070	/// @param output3_id - Value ID for the third output tensor. The output tensor must be an N-dimensional tensor
1071	/// defined in the @a subgraph with each dimension, except split_dim, equal to the corresponding
1072	/// dimension of the first and second outputs. The split_dim dimension is one third of the input's
1073	/// split_dim.
1074	/// @param flags - binary features of the Split Node. No supported flags are currently defined.
1075	enum xnn_status xnn_define_even_split3(
1076	xnn_subgraph_t subgraph,
1077	size_t split_dim,
1078	uint32_t input_id,
1079	uint32_t output1_id,
1080	uint32_t output2_id,
1081	uint32_t output3_id,
1082	uint32_t flags);
1083
1084	/// Define a 4-Output Split Node and add it to a Subgraph.
1085	///
1086	/// The 4-Output Split Node splits an input tensor into four output tensors along a specified axis evenly.
1087	///
1088	/// @param subgraph - a Subgraph object that will own the created Node.
1089	/// @param split_dim - the dimension to split the input tensor along.
1090	/// @param input_id - Value ID for the input tensor. The input tensor must be an N-dimensional tensor defined in the @a
1091	/// subgraph.
1092	/// @param output1_id - Value ID for the first output tensor. The output tensor must be an N-dimensional tensor defined
1093	/// in the @a subgraph with each dimension, except split_dim, equal to the corresponding dimension
1094	/// of the other output tensors. The split_dim dimension is one fourth of the input's split_dim.
1095	/// @param output2_id - Value ID for the second output tensor. The output tensor must be an N-dimensional tensor
1096	/// defined in the @a subgraph with each dimension, except split_dim, equal to the corresponding
1097	/// dimension of the other output tensors. The split_dim dimension is one fourth of the input's
1098	/// split_dim.
1099	/// @param output3_id - Value ID for the third output tensor. The output tensor must be an N-dimensional tensor
1100	/// defined in the @a subgraph with each dimension, except split_dim, equal to the corresponding
1101	/// dimension of the other output tensors. The split_dim dimension is one fourth of the input's
1102	/// split_dim.
1103	/// @param output4_id - Value ID for the fourth output tensor. The output tensor must be an N-dimensional tensor
1104	/// defined in the @a subgraph with each dimension, except split_dim, equal to the corresponding
1105	/// dimension of the other output tensors. The split_dim dimension is one fourth of the input's
1106	/// split_dim.
1107	/// @param flags - binary features of the Split Node. No supported flags are currently defined.
1108	enum xnn_status xnn_define_even_split4(
1109	xnn_subgraph_t subgraph,
1110	size_t split_dim,
1111	uint32_t input_id,
1112	uint32_t output1_id,
1113	uint32_t output2_id,
1114	uint32_t output3_id,
1115	uint32_t output4_id,
1116	uint32_t flags);
1117
1118	/// Define a Reshape Node with static shape specification and add it to a Subgraph.
1119	///
1120	/// @param subgraph - a Subgraph object that will own the created Node.
1121	/// @param num_dims - number of shape dimensions in the output tensor.
1122	/// @param new_shape - shape dimensions of the output tensor.
1123	/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
1124	/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1125	/// shape must match the @a new_shape specification.
1126	/// @param flags - binary features of the Reshape Node. No supported flags are currently defined.
1127	enum xnn_status xnn_define_static_reshape(
1128	xnn_subgraph_t subgraph,
1129	size_t num_dims,
1130	const size_t* new_shape,
1131	uint32_t input_id,
1132	uint32_t output_id,
1133	uint32_t flags);
1134
1135	/// Define a 2D Resize Bilinear Node with static output height & width specification and add it to a Subgraph.
1136	///
1137	/// @param subgraph - a Subgraph object that will own the created Node.
1138	/// @param new_height - height dimension of the output tensor.
1139	/// @param new_width - width dimension of the output tensor.
1140	/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
1141	/// with [N, H, W, C] dimensions.
1142	/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
1143	/// with [N, new_height, new_width, C] dimensions.
1144	/// @param flags - binary features of the 2D Resize Bilinear Node. The only currently supported values are
1145	/// XNN_FLAG_TENSORFLOW_LEGACY_MODE and XNN_FLAG_ALIGN_CORNERS, which are mutually exclusive.
1146	enum xnn_status xnn_define_static_resize_bilinear_2d(
1147	xnn_subgraph_t subgraph,
1148	size_t new_height,
1149	size_t new_width,
1150	uint32_t input_id,
1151	uint32_t output_id,
1152	uint32_t flags);
1153
1154	/// Define a PReLU (Parametric ReLU) Node and add it to a Subgraph.
1155	///
1156	/// @param subgraph - a Subgraph object that will own the created Node.
1157	/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
1158	/// with [N, H, W, channels] dimensions.
1159	/// @param slope_id - Value ID for the slope tensor. The slope tensor must be a 1D tensor defined in the @a subgraph with
1160	/// [channels] dimensions.
1161	/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
1162	/// with [N, H, W, channels] dimensions.
1163	/// @param flags - binary features of the PReLU Node. No supported flags are currently defined.
1164	enum xnn_status xnn_define_prelu(
1165	xnn_subgraph_t subgraph,
1166	uint32_t input_id,
1167	uint32_t slope_id,
1168	uint32_t output_id,
1169	uint32_t flags);
1170
1171	/// Define an Abs Node and add it to a Subgraph.
1172	///
1173	/// @param subgraph - a Subgraph object that will own the created Node.
1174	/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
1175	/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1176	/// shape must match the shape of the input tensor.
1177	/// @param flags - binary features of the Abs Node. No supported flags are currently defined.
1178	enum xnn_status xnn_define_abs(
1179	xnn_subgraph_t subgraph,
1180	uint32_t input_id,
1181	uint32_t output_id,
1182	uint32_t flags);
1183
1184	/// Define a Bankers' Rounding Node and add it to a Subgraph.
1185	///
1186	/// @param subgraph - a Subgraph object that will own the created Node.
1187	/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
1188	/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1189	/// shape must match the shape of the input tensor.
1190	/// @param flags - binary features of the Bankers' Rounding Node. No supported flags are currently defined.
1191	enum xnn_status xnn_define_bankers_rounding(
1192	xnn_subgraph_t subgraph,
1193	uint32_t input_id,
1194	uint32_t output_id,
1195	uint32_t flags);
1196
1197	/// Define a Ceiling Node and add it to a Subgraph.
1198	///
1199	/// @param subgraph - a Subgraph object that will own the created Node.
1200	/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
1201	/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1202	/// shape must match the shape of the input tensor.
1203	/// @param flags - binary features of the Ceiling Node. No supported flags are currently defined.
1204	enum xnn_status xnn_define_ceiling(
1205	xnn_subgraph_t subgraph,
1206	uint32_t input_id,
1207	uint32_t output_id,
1208	uint32_t flags);
1209
1210	/// Define a Clamp Node and add it to a Subgraph.
1211	///
1212	/// @param subgraph - a Subgraph object that will own the created Node.
1213	/// @param output_min - lower bound for clipping output values.
1214	/// @param output_max - upper bound for clipping output values.
1215	/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
1216	/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1217	/// shape must match the shape of the input tensor.
1218	/// @param flags - binary features of the Clamp Node. No supported flags are currently defined.
1219	enum xnn_status xnn_define_clamp(
1220	xnn_subgraph_t subgraph,
1221	float output_min,
1222	float output_max,
1223	uint32_t input_id,
1224	uint32_t output_id,
1225	uint32_t flags);
1226
1227	/// Define an ELU (Exponential Linear Unit) Node and add it to a Subgraph.
1228	///
1229	/// @param subgraph - a Subgraph object that will own the created Node.
1230	/// @param alpha - scale factor for negative output elements.
1231	/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
1232	/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1233	/// shape must match the shape of the input tensor.
1234	/// @param flags - binary features of the ELU Node. No supported flags are currently defined.
1235	enum xnn_status xnn_define_elu(
1236	xnn_subgraph_t subgraph,
1237	float alpha,
1238	uint32_t input_id,
1239	uint32_t output_id,
1240	uint32_t flags);
1241
1242	/// Define a Floor Node and add it to a Subgraph.
1243	///
1244	/// @param subgraph - a Subgraph object that will own the created Node.
1245	/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
1246	/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1247	/// shape must match the shape of the input tensor.
1248	/// @param flags - binary features of the Floor Node. No supported flags are currently defined.
1249	enum xnn_status xnn_define_floor(
1250	xnn_subgraph_t subgraph,
1251	uint32_t input_id,
1252	uint32_t output_id,
1253	uint32_t flags);
1254
1255	/// Define a HardSwish Node and add it to a Subgraph.
1256	///
1257	/// @param subgraph - a Subgraph object that will own the created Node.
1258	/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
1259	/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1260	/// shape must match the shape of the input tensor.
1261	/// @param flags - binary features of the HardSwish Node. No supported flags are currently defined.
1262	enum xnn_status xnn_define_hardswish(
1263	xnn_subgraph_t subgraph,
1264	uint32_t input_id,
1265	uint32_t output_id,
1266	uint32_t flags);
1267
1268	/// Define a Leaky ReLU Node and add it to a Subgraph.
1269	///
1270	/// @param subgraph - a Subgraph object that will own the created Node.
1271	/// @param negative_slope - scale factor for negative input elements.
1272	/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
1273	/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1274	/// shape must match the shape of the input tensor.
1275	/// @param flags - binary features of the Leaky ReLU Node. No supported flags are currently defined.
1276	enum xnn_status xnn_define_leaky_relu(
1277	xnn_subgraph_t subgraph,
1278	float negative_slope,
1279	uint32_t input_id,
1280	uint32_t output_id,
1281	uint32_t flags);
1282
1283	/// Define a Negate Node and add it to a Subgraph.
1284	///
1285	/// @param subgraph - a Subgraph object that will own the created Node.
1286	/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
1287	/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1288	/// shape must match the shape of the input tensor.
1289	/// @param flags - binary features of the Negate Node. No supported flags are currently defined.
1290	enum xnn_status xnn_define_negate(
1291	xnn_subgraph_t subgraph,
1292	uint32_t input_id,
1293	uint32_t output_id,
1294	uint32_t flags);
1295
1296	/// Define a Sigmoid Node and add it to a Subgraph.
1297	///
1298	/// @param subgraph - a Subgraph object that will own the created Node.
1299	/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
1300	/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1301	/// shape must match the shape of the input tensor.
1302	/// @param flags - binary features of the Sigmoid Node. No supported flags are currently defined.
1303	enum xnn_status xnn_define_sigmoid(
1304	xnn_subgraph_t subgraph,
1305	uint32_t input_id,
1306	uint32_t output_id,
1307	uint32_t flags);
1308
1309	/// Define a SoftMax Node and add it to a Subgraph.
1310	///
1311	/// @param subgraph - a Subgraph object that will own the created Node.
1312	/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph, and have at
1313	/// least one dimension.
1314	/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1315	/// shape must match the shape of the input tensor.
1316	/// @param flags - binary features of the SoftMax Node. No supported flags are currently defined.
1317	enum xnn_status xnn_define_softmax(
1318	xnn_subgraph_t subgraph,
1319	uint32_t input_id,
1320	uint32_t output_id,
1321	uint32_t flags);
1322
1323	/// Define a Space To Depth 2D Node and add it to a Subgraph.
1324	///
1325	/// The Space To Depth 2D Node rearranges blocks of spatial data into depth (a reverse transform to Depth To Space 2D).
1326	/// For a given output pixel, its channels are formed from the values of an input square of pixels with side
1327	/// @a block_size. The output depth is therefore @a block_size x @a block_size times greater
1328	/// than that of the input.
1329	///
1330	/// @param subgraph - a Subgraph object that will own the created Node.
1331	/// @param block_size - the size of the spatial block.
1332	/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
1333	/// with [N, IH * block_size, IW * block_size, OC] dimensions.
1334	/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
1335	/// with [N, IH, IW, OC * block_size * block_size] dimensions.
1336	/// @param flags - binary features of the Space To Depth 2D Node. No supported flags are currently defined.
1337	enum xnn_status xnn_define_space_to_depth_2d(
1338	xnn_subgraph_t subgraph,
1339	uint32_t block_size,
1340	uint32_t input_id,
1341	uint32_t output_id,
1342	uint32_t flags);
1343
1344	/// Define a Square Node and add it to a Subgraph.
1345	///
1346	/// @param subgraph - a Subgraph object that will own the created Node.
1347	/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
1348	/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1349	/// shape must match the shape of the input tensor.
1350	/// @param flags - binary features of the Square Node. No supported flags are currently defined.
1351	enum xnn_status xnn_define_square(
1352	xnn_subgraph_t subgraph,
1353	uint32_t input_id,
1354	uint32_t output_id,
1355	uint32_t flags);
1356
1357	/// Define a Square Root Node and add it to a Subgraph.
1358	///
1359	/// @param subgraph - a Subgraph object that will own the created Node.
1360	/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
1361	/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1362	/// shape must match the shape of the input tensor.
1363	/// @param flags - binary features of the Square Root Node. No supported flags are currently defined.
1364	enum xnn_status xnn_define_square_root(
1365	xnn_subgraph_t subgraph,
1366	uint32_t input_id,
1367	uint32_t output_id,
1368	uint32_t flags);
1369
1370	/// Define a Static Slice Node and add it to a Subgraph.
1371	///
1372	/// @param subgraph - a Subgraph object that will own the created Node.
1373	/// @param num_dims - number of shape dimensions in the input and output tensor.
1374	/// @param offsets - offsets in each dimension of the input tensor. This array must have @a num_dims elements.
1375	/// @param sizes - size of each dimension in output tensor. This array must have @a num_dims elements.
1376	/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
1377	/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1378	/// dimensions must match @a sizes.
1379	/// @param flags - binary features of the Static Slice Node. No supported flags are currently defined.
1380	enum xnn_status xnn_define_static_slice(
1381	xnn_subgraph_t subgraph,
1382	size_t num_dims,
1383	const size_t* offsets,
1384	const size_t* sizes,
1385	uint32_t input_id,
1386	uint32_t output_id,
1387	uint32_t flags);
1388
1389	/// Define a Static Transpose Node and add it to a Subgraph.
1390	///
1391	/// The Static Transpose Node applies a generalized transpose to the input tensor using the permutation in @a perm.
1392	///
1393	/// @param subgraph - a Subgraph object that will own the created Node.
1394	/// @param input_id - Value ID for the input tensor. The input tensor must be an N-dimensional tensor defined in
1395	/// the @a subgraph.
1396	/// @param output_id - Value ID for the output tensor. The output tensor must be an N-dimensional tensor defined
1397	/// in the @a subgraph with each dimension equal to its corresponding permuted input dimension.
1398	/// @param num_dims - the number of permutation dimensions. This must be equal to the number of input dimensions.
1399	/// @param perm - The permutation of the axes of the input tensor. The perm array must contain 0 to N-1 in the
1400	/// permuted order.
1401	/// @param flags - binary features of the Static Transpose Node. No supported flags are currently defined.
1402	enum xnn_status xnn_define_static_transpose(
1403	xnn_subgraph_t subgraph,
1404	size_t num_dims,
1405	const size_t* perm,
1406	uint32_t input_id,
1407	uint32_t output_id,
1408	uint32_t flags);
1409
1410	/// Weights cache is a cache for packed weights. It can be reused between runtimes.
1411	typedef struct xnn_weights_cache* xnn_weights_cache_t;
1412
1413	enum xnn_status xnn_create_weights_cache(xnn_weights_cache_t* weights_cache_out);
1414
1415	/// Create a weights cache object specifying the initial size of weights cache (in bytes).
1416	/// @param size - initial capacity of the weights cache (in bytes), i.e. it can hold @a size bytes without growing.
1417	/// @param weights_cache_out - pointer to the variable that will be initialized to a handle to the weights cache object
1418	/// upon successful return. Once created, the weights cache object can be shared between
1419	/// different Runtime objects.
1420	enum xnn_status xnn_create_weights_cache_with_size(size_t size, xnn_weights_cache_t* weights_cache_out);
1421
1422
1423	/// Weights cache can be finalized in these ways:
1424	enum xnn_weights_cache_finalization_kind {
1425	/// Weights cache is finalized, no insert operations into the weights cache are allowed, even if the "inserted"
1426	/// weights already exist in the cache. Weights cache memory will also be trimmed to page boundary and set to
1427	/// read-only (to prevent writes).
1428	xnn_weights_cache_finalization_kind_hard,
1429	/// Weights cache will be finalized with some extra space at the end; this allows for "inserting" into the cache only
1430	/// if the weights are already in the cache, and errors on inserting uncached weights. There is memory overhead.
1431	xnn_weights_cache_finalization_kind_soft,
1432	};
1433
1434	/// Finalizes the weights cache. The kind of finalization is specified by `finalization_kind`.
1435	/// @param weights_cache - the weights cache object to finalize.
1436	/// @param finalization_kind - the kind of finalization.
1437	enum xnn_status xnn_finalize_weights_cache(
1438	xnn_weights_cache_t weights_cache,
1439	enum xnn_weights_cache_finalization_kind finalization_kind);
1440
1441	/// Destroy a weights cache object, as well as memory used for the cache.
1442	/// @param weights_cache - the weights cache object to destroy.
1443	enum xnn_status xnn_delete_weights_cache(xnn_weights_cache_t weights_cache);
1444
1445	typedef struct xnn_workspace* xnn_workspace_t;
1446
1447	/// Create a workspace object.
1448	/// @param workspace_out - pointer to the variable that will be initialized to a handle to the workspace object upon
1449	/// successful return. Once created, the workspace can be shared between different Runtime
1450	/// objects.
1451	enum xnn_status xnn_create_workspace(xnn_workspace_t* workspace_out);
1452	/// Destroy a workspace object, as well as memory used by the workspace. Object destruction can be deferred until all
1453	/// Runtime objects created with this workspace are destroyed.
1454	/// @param workspace - the workspace object to destroy.
1455	enum xnn_status xnn_release_workspace(xnn_workspace_t workspace);
1456
1457	/// Runtime is a combination of an execution plan for subgraph Nodes and a memory manager for subgraph Values.
1458	typedef struct xnn_runtime* xnn_runtime_t;
1459
1460	enum xnn_profile_info {  // Selects what xnn_get_runtime_profiling_info reports (passed as its param_name).
1461	/// Query the number of operators; the result is a size_t.
1462	xnn_profile_info_num_operators,
1463	/// Query the names of all operators; the result is a char[] holding the names separated by null characters.
1464	xnn_profile_info_operator_name,
1465	/// Query the runtimes of all operators; the result is a uint64_t[] in the same order as xnn_profile_info_operator_name.
1466	xnn_profile_info_operator_timing,
1467	};
1468
1469	/// Return profile information for all operators.
1470	///
1471	/// @param runtime - a Runtime object created with @ref xnn_create_runtime, @ref xnn_create_runtime_v2,
1472	/// @ref xnn_create_runtime_v3 or @ref xnn_create_runtime_v4.
1473	/// @param param_name - type of profile information required.
1474	/// @param param_value_size - the size in bytes of memory pointed to by param_value. If this is not sufficient then
1475	/// param_value_size_ret will be set to the required size and xnn_status_out_of_memory will be
1476	/// returned.
1477	/// @param param_value - a pointer to memory location where appropriate values for a given param_name will be written.
1478	/// @param param_value_size_ret - returns number of bytes required to write the result if param_value_size is not
1479	/// sufficient.
1480	enum xnn_status xnn_get_runtime_profiling_info(xnn_runtime_t runtime,
1481	enum xnn_profile_info param_name,
1482	size_t param_value_size,
1483	void* param_value,
1484	size_t* param_value_size_ret);
1485
1486	/// Create a Runtime object from a subgraph.
1487	///
1488	/// @param subgraph - a Subgraph object with all Values and Nodes that would be handled by the runtime. No Values or
1489	/// Nodes can be added to the runtime once it is constructed.
1490	/// @param weights_cache - a cache for packed weights. The runtime will look up and reuse packed weights in this cache,
1491	/// this will reduce memory allocated for packed weights.
1492	/// @param workspace - a workspace to hold internal tensors. The runtime will allocate space used for internal tensors
1493	/// and track them using workspace. Workspace can be shared and reused across different runtimes. If
1494	/// workspace is NULL, there will be no sharing: each runtime has its own workspace.
1495	/// @param threadpool - the thread pool to be used for parallelisation of computations in the runtime. If the thread
1496	/// pool is NULL, the computation would run on the caller thread without parallelization.
1497	/// @param flags - binary features of the runtime. The only currently supported values are
1498	/// XNN_FLAG_HINT_SPARSE_INFERENCE, XNN_FLAG_HINT_FP16_INFERENCE, XNN_FLAG_FORCE_FP16_INFERENCE, and
1499	/// XNN_FLAG_YIELD_WORKERS. If XNN_FLAG_YIELD_WORKERS is specified, worker threads would be yielded to
1500	/// the system scheduler after processing the last operator in the Runtime.
1501	/// @param runtime_out - pointer to the variable that will be initialized with a handle to the Runtime object upon
1502	/// successful return. Once constructed, the Runtime object is independent of the Subgraph object
1503	/// used to create it.
1504	enum xnn_status xnn_create_runtime_v4(
1505	xnn_subgraph_t subgraph,
1506	xnn_weights_cache_t weights_cache,
1507	xnn_workspace_t workspace,
1508	pthreadpool_t threadpool,
1509	uint32_t flags,
1510	xnn_runtime_t* runtime_out);
1511
1512	enum xnn_status xnn_create_runtime_v3(
1513	xnn_subgraph_t subgraph,
1514	xnn_weights_cache_t weights_cache,
1515	pthreadpool_t threadpool,
1516	uint32_t flags,
1517	xnn_runtime_t* runtime_out);
1518
1519	enum xnn_status xnn_create_runtime_v2(
1520	xnn_subgraph_t subgraph,
1521	pthreadpool_t threadpool,
1522	uint32_t flags,
1523	xnn_runtime_t* runtime_out);
1524
1525	enum xnn_status xnn_create_runtime(
1526	xnn_subgraph_t subgraph,
1527	xnn_runtime_t* runtime_out);
1528
1529	struct xnn_external_value {  // Element type of the external_values array passed to xnn_setup_runtime.
1530	uint32_t id;  // presumably the Value ID of an external input or output in the runtime -- TODO confirm
1531	void* data;  // presumably the caller-provided memory backing that Value -- TODO confirm
1532	};
1533
1534	/// Setup data pointers for external inputs and outputs in a Runtime object.
1535	///
1536	/// @param runtime - a Runtime object created with @ref xnn_create_runtime or one of its _v2, _v3, _v4 variants.
1537	/// @param num_external_values - the number of external inputs and outputs specified in this call. This number must
1538	/// match the number of external inputs and outputs in the runtime, i.e. all external
1539	/// inputs and outputs in the runtime must be specified in one call.
1540	/// @param external_values - array with location information for all external inputs and outputs in the runtime.
1541	enum xnn_status xnn_setup_runtime(
1542	xnn_runtime_t runtime,
1543	size_t num_external_values,
1544	const struct xnn_external_value* external_values);
1545
1546	/// Execute forward pass for all operators in the runtime.
1547	///
1548	/// @param runtime - the Runtime object with the execution plan to invoke.
1549	enum xnn_status xnn_invoke_runtime(
1550	xnn_runtime_t runtime);
1551
1552	/// Destroy a Runtime object, as well as operators and memory associated with it.
1553	///
1554	/// @param runtime - the Runtime object to destroy.
1555	enum xnn_status xnn_delete_runtime(
1556	xnn_runtime_t runtime);
1557
1558	typedef struct xnn_operator* xnn_operator_t;
1559
1560	enum xnn_status xnn_run_operator(
1561	xnn_operator_t op,
1562	pthreadpool_t threadpool);
1563
1564	enum xnn_status xnn_delete_operator(
1565	xnn_operator_t op);
1566
1567	#ifndef XNN_NO_F32_OPERATORS
1568
1569	enum xnn_status xnn_create_abs_nc_f32(
1570	size_t channels,
1571	size_t input_stride,
1572	size_t output_stride,
1573	uint32_t flags,
1574	xnn_operator_t* abs_op_out);
1575
1576	enum xnn_status xnn_setup_abs_nc_f32(
1577	xnn_operator_t abs_op,
1578	size_t batch_size,
1579	const float* input,
1580	float* output,
1581	pthreadpool_t threadpool);
1582
1583	enum xnn_status xnn_run_abs_nc_f32(
1584	size_t channels,
1585	size_t input_stride,
1586	size_t output_stride,
1587	size_t batch_size,
1588	const float* input,
1589	float* output,
1590	uint32_t flags,
1591	pthreadpool_t threadpool);
1592
1593	enum xnn_status xnn_create_add_nd_f32(
1594	float output_min,
1595	float output_max,
1596	uint32_t flags,
1597	xnn_operator_t* add_op_out);
1598
1599	enum xnn_status xnn_setup_add_nd_f32(
1600	xnn_operator_t add_op,
1601	size_t num_input1_dims,
1602	const size_t* input1_shape,
1603	size_t num_input2_dims,
1604	const size_t* input2_shape,
1605	const float* input1,
1606	const float* input2,
1607	float* output,
1608	pthreadpool_t threadpool);
1609
1610	enum xnn_status xnn_run_add_nd_f32(
1611	size_t num_input1_dims,
1612	const size_t* input1_shape,
1613	size_t num_input2_dims,
1614	const size_t* input2_shape,
1615	const float* input1,
1616	const float* input2,
1617	float* output,
1618	float output_min,
1619	float output_max,
1620	uint32_t flags,
1621	pthreadpool_t threadpool);
1622
1623	enum xnn_status xnn_create_argmax_pooling2d_nhwc_f32(
1624	uint32_t input_padding_top,
1625	uint32_t input_padding_right,
1626	uint32_t input_padding_bottom,
1627	uint32_t input_padding_left,
1628	uint32_t pooling_height,
1629	uint32_t pooling_width,
1630	size_t channels,
1631	size_t input_pixel_stride,
1632	size_t output_pixel_stride,
1633	uint32_t flags,
1634	xnn_operator_t* argmax_pooling_op_out);
1635
1636	enum xnn_status xnn_setup_argmax_pooling2d_nhwc_f32(
1637	xnn_operator_t argmax_pooling_op,
1638	size_t batch_size,
1639	size_t input_height,
1640	size_t input_width,
1641	const float* input,
1642	float* output,
1643	uint32_t* index,
1644	pthreadpool_t threadpool);
1645
1646	enum xnn_status xnn_create_average_pooling2d_nhwc_f32(
1647	uint32_t input_padding_top,
1648	uint32_t input_padding_right,
1649	uint32_t input_padding_bottom,
1650	uint32_t input_padding_left,
1651	uint32_t pooling_height,
1652	uint32_t pooling_width,
1653	uint32_t stride_height,
1654	uint32_t stride_width,
1655	size_t channels,
1656	size_t input_pixel_stride,
1657	size_t output_pixel_stride,
1658	float output_min,
1659	float output_max,
1660	uint32_t flags,
1661	xnn_operator_t* average_pooling_op_out);
1662
1663	enum xnn_status xnn_setup_average_pooling2d_nhwc_f32(
1664	xnn_operator_t average_pooling_op,
1665	size_t batch_size,
1666	size_t input_height,
1667	size_t input_width,
1668	const float* input,
1669	float* output,
1670	pthreadpool_t threadpool);
1671
1672	enum xnn_status xnn_create_bankers_rounding_nc_f32(
1673	size_t channels,
1674	size_t input_stride,
1675	size_t output_stride,
1676	uint32_t flags,
1677	xnn_operator_t* rounding_op_out);
1678
1679	enum xnn_status xnn_setup_bankers_rounding_nc_f32(
1680	xnn_operator_t rounding_op,
1681	size_t batch_size,
1682	const float* input,
1683	float* output,
1684	pthreadpool_t threadpool);
1685
1686	enum xnn_status xnn_run_bankers_rounding_nc_f32(
1687	size_t channels,
1688	size_t input_stride,
1689	size_t output_stride,
1690	size_t batch_size,
1691	const float* input,
1692	float* output,
1693	uint32_t flags,
1694	pthreadpool_t threadpool);
1695
1696	enum xnn_status xnn_create_ceiling_nc_f32(
1697	size_t channels,
1698	size_t input_stride,
1699	size_t output_stride,
1700	uint32_t flags,
1701	xnn_operator_t* ceiling_op_out);
1702
1703	enum xnn_status xnn_run_ceiling_nc_f32(
1704	size_t channels,
1705	size_t input_stride,
1706	size_t output_stride,
1707	size_t batch_size,
1708	const float* input,
1709	float* output,
1710	uint32_t flags,
1711	pthreadpool_t threadpool);
1712
1713	enum xnn_status xnn_setup_ceiling_nc_f32(
1714	xnn_operator_t ceiling_op,
1715	size_t batch_size,
1716	const float* input,
1717	float* output,
1718	pthreadpool_t threadpool);
1719
1720	enum xnn_status xnn_create_clamp_nc_f32(
1721	size_t channels,
1722	size_t input_stride,
1723	size_t output_stride,
1724	float output_min,
1725	float output_max,
1726	uint32_t flags,
1727	xnn_operator_t* clamp_op_out);
1728
1729	enum xnn_status xnn_setup_clamp_nc_f32(
1730	xnn_operator_t clamp_op,
1731	size_t batch_size,
1732	const float* input,
1733	float* output,
1734	pthreadpool_t threadpool);
1735
1736	enum xnn_status xnn_run_clamp_nc_f32(
1737	size_t channels,
1738	size_t input_stride,
1739	size_t output_stride,
1740	size_t batch_size,
1741	const float* input,
1742	float* output,
1743	float output_min,
1744	float output_max,
1745	uint32_t flags,
1746	pthreadpool_t threadpool);
1747
1748	typedef const struct xnn_caches* xnn_caches_t;
1749
1750	enum xnn_status xnn_create_convolution2d_nhwc_f32(
1751	uint32_t input_padding_top,
1752	uint32_t input_padding_right,
1753	uint32_t input_padding_bottom,
1754	uint32_t input_padding_left,
1755	uint32_t kernel_height,
1756	uint32_t kernel_width,
1757	uint32_t subsampling_height,
1758	uint32_t subsampling_width,
1759	uint32_t dilation_height,
1760	uint32_t dilation_width,
1761	uint32_t groups,
1762	size_t group_input_channels,
1763	size_t group_output_channels,
1764	size_t input_channel_stride,
1765	size_t output_channel_stride,
1766	const float* kernel,
1767	const float* bias,
1768	float output_min,
1769	float output_max,
1770	uint32_t flags,
1771	xnn_caches_t caches,
1772	xnn_operator_t* convolution_op_out);
1773
1774	// Forward declare.
1775	struct xnn_post_operation;
1776
1777	/// Create a convolution operator with a number of post operations. The
1778	/// convolution operator created using this function does not have output_min
1779	/// and output_max. The list of operators in post_operations will be applied in
1780	/// order. Convolution with post operations is only supported on JIT platforms
1781	/// and when JIT is enabled.
1782	enum xnn_status xnn_create_fused_convolution2d_nhwc_f32(
1783	uint32_t input_padding_top,
1784	uint32_t input_padding_right,
1785	uint32_t input_padding_bottom,
1786	uint32_t input_padding_left,
1787	uint32_t kernel_height,
1788	uint32_t kernel_width,
1789	uint32_t subsampling_height,
1790	uint32_t subsampling_width,
1791	uint32_t dilation_height,
1792	uint32_t dilation_width,
1793	uint32_t groups,
1794	size_t group_input_channels,
1795	size_t group_output_channels,
1796	size_t input_channel_stride,
1797	size_t output_channel_stride,
1798	const float* kernel,
1799	const float* bias,
1800	size_t num_post_operations,
1801	struct xnn_post_operation* post_operations,
1802	uint32_t flags,
1803	xnn_caches_t caches,
1804	xnn_operator_t* convolution_op_out);
1805
1806	enum xnn_status xnn_setup_convolution2d_nhwc_f32(
1807	xnn_operator_t convolution_op,
1808	size_t batch_size,
1809	size_t input_height,
1810	size_t input_width,
1811	const float* input,
1812	float* output,
1813	pthreadpool_t threadpool);
1814
/// Create-stage prototype for the F32 NHWC 2D deconvolution operator. Unlike the convolution prototypes it
/// names its padding `output_padding_*` and its strides `stride_*` / `*_pixel_stride`.
/// Returns an xnn_status; the handle is presumably written through `deconvolution_op_out`.
1815	enum xnn_status xnn_create_deconvolution2d_nhwc_f32(
1816	uint32_t output_padding_top,
1817	uint32_t output_padding_right,
1818	uint32_t output_padding_bottom,
1819	uint32_t output_padding_left,
1820	uint32_t kernel_height,
1821	uint32_t kernel_width,
1822	uint32_t stride_height,
1823	uint32_t stride_width,
1824	uint32_t dilation_height,
1825	uint32_t dilation_width,
1826	uint32_t groups,
1827	size_t group_input_channels,
1828	size_t group_output_channels,
1829	size_t input_pixel_stride,
1830	size_t output_pixel_stride,
1831	const float* kernel,
1832	const float* bias,
1833	float output_min,
1834	float output_max,
1835	uint32_t flags,
1836	xnn_caches_t caches,
1837	xnn_operator_t* deconvolution_op_out);
1838
/// Setup-stage prototype for the F32 NHWC 2D deconvolution operator. Besides batch size, input extent and
/// buffers it takes `adjustment_height` / `adjustment_width` (their meaning is not visible in this header).
/// Returns an xnn_status.
1839	enum xnn_status xnn_setup_deconvolution2d_nhwc_f32(
1840	xnn_operator_t deconvolution_op,
1841	size_t batch_size,
1842	size_t input_height,
1843	size_t input_width,
1844	uint32_t adjustment_height,
1845	uint32_t adjustment_width,
1846	const float* input,
1847	float* output,
1848	pthreadpool_t threadpool);
1849
/// Create-stage prototype for the F32 N-dimensional divide operator, parameterised by an output range and flags.
/// Returns an xnn_status; the handle is presumably written through `divide_op_out`.
1850	enum xnn_status xnn_create_divide_nd_f32(
1851	float output_min,
1852	float output_max,
1853	uint32_t flags,
1854	xnn_operator_t* divide_op_out);
1855
/// Setup-stage prototype for F32 N-dimensional divide: each input has its own dimension count and shape array
/// (presumably to allow broadcasting — confirm), followed by the two inputs, the output and a thread pool.
/// Returns an xnn_status.
1856	enum xnn_status xnn_setup_divide_nd_f32(
1857	xnn_operator_t divide_op,
1858	size_t num_input1_dims,
1859	const size_t* input1_shape,
1860	size_t num_input2_dims,
1861	const size_t* input2_shape,
1862	const float* input1,
1863	const float* input2,
1864	float* output,
1865	pthreadpool_t threadpool);
1866
/// One-call prototype for F32 N-dimensional divide: combines the shape/buffer arguments of the setup prototype
/// with the output range and flags of the create prototype; no operator handle is involved. Returns an xnn_status.
1867	enum xnn_status xnn_run_divide_nd_f32(
1868	size_t num_input1_dims,
1869	const size_t* input1_shape,
1870	size_t num_input2_dims,
1871	const size_t* input2_shape,
1872	const float* input1,
1873	const float* input2,
1874	float* output,
1875	float output_min,
1876	float output_max,
1877	uint32_t flags,
1878	pthreadpool_t threadpool);
1879
/// Create-stage prototype for the F32 ELU operator (NC layout per the name): channel count, input/output
/// strides, the scalar `alpha` and flags. Returns an xnn_status; handle presumably written through `elu_op_out`.
1880	enum xnn_status xnn_create_elu_nc_f32(
1881	size_t channels,
1882	size_t input_stride,
1883	size_t output_stride,
1884	float alpha,
1885	uint32_t flags,
1886	xnn_operator_t* elu_op_out);
1887
/// Setup-stage prototype for the F32 ELU operator: operator handle, batch size, read-only input, output and
/// a thread pool. Returns an xnn_status.
1888	enum xnn_status xnn_setup_elu_nc_f32(
1889	xnn_operator_t elu_op,
1890	size_t batch_size,
1891	const float* input,
1892	float* output,
1893	pthreadpool_t threadpool);
1894
/// One-call prototype for F32 ELU: takes the create-stage arguments (channels, strides, alpha, flags) and the
/// setup-stage arguments (batch size, buffers, thread pool) together, with no operator handle. Returns an xnn_status.
1895	enum xnn_status xnn_run_elu_nc_f32(
1896	size_t channels,
1897	size_t input_stride,
1898	size_t output_stride,
1899	size_t batch_size,
1900	const float* input,
1901	float* output,
1902	float alpha,
1903	uint32_t flags,
1904	pthreadpool_t threadpool);
1905
/// Create-stage prototype for the F32 floor operator (NC layout per the name): channel count, strides and flags.
/// Returns an xnn_status; handle presumably written through `floor_op_out`.
1906	enum xnn_status xnn_create_floor_nc_f32(
1907	size_t channels,
1908	size_t input_stride,
1909	size_t output_stride,
1910	uint32_t flags,
1911	xnn_operator_t* floor_op_out);
1912
/// Setup-stage prototype for the F32 floor operator: operator handle, batch size, read-only input, output and
/// a thread pool. Returns an xnn_status.
1913	enum xnn_status xnn_setup_floor_nc_f32(
1914	xnn_operator_t floor_op,
1915	size_t batch_size,
1916	const float* input,
1917	float* output,
1918	pthreadpool_t threadpool);
1919
/// One-call prototype for F32 floor: create-stage and setup-stage arguments in a single call, with no operator
/// handle. Returns an xnn_status.
1920	enum xnn_status xnn_run_floor_nc_f32(
1921	size_t channels,
1922	size_t input_stride,
1923	size_t output_stride,
1924	size_t batch_size,
1925	const float* input,
1926	float* output,
1927	uint32_t flags,
1928	pthreadpool_t threadpool);
1929
/// Create-stage prototype for the F32 fully connected operator (NC layout per the name): input/output channel
/// counts and strides, read-only kernel and bias, an output range, flags and a caches handle.
/// `caches` is passed by value, so it carries no top-level `const` here; that qualifier never affects the
/// function type and the other create prototypes in this header omit it.
/// Returns an xnn_status; handle presumably written through `fully_connected_op_out`.
1930	enum xnn_status xnn_create_fully_connected_nc_f32(
1931	size_t input_channels,
1932	size_t output_channels,
1933	size_t input_stride,
1934	size_t output_stride,
1935	const float* kernel,
1936	const float* bias,
1937	float output_min,
1938	float output_max,
1939	uint32_t flags,
1940	xnn_caches_t caches,
1941	xnn_operator_t* fully_connected_op_out);
1942
/// Setup-stage prototype for the F32 fully connected operator: operator handle, batch size, read-only input,
/// output and a thread pool. Returns an xnn_status.
1943	enum xnn_status xnn_setup_fully_connected_nc_f32(
1944	xnn_operator_t fully_connected_op,
1945	size_t batch_size,
1946	const float* input,
1947	float* output,
1948	pthreadpool_t threadpool);
1949
/// Create-stage prototype for F32 global average pooling (NWC layout per the name): channel count, strides,
/// output range and flags. Returns an xnn_status; handle presumably written through `global_average_pooling_op_out`.
1950	enum xnn_status xnn_create_global_average_pooling_nwc_f32(
1951	size_t channels,
1952	size_t input_stride,
1953	size_t output_stride,
1954	float output_min,
1955	float output_max,
1956	uint32_t flags,
1957	xnn_operator_t* global_average_pooling_op_out);
1958
/// Setup-stage prototype for F32 NWC global average pooling: operator handle, batch size, `width`, read-only
/// input, output and a thread pool. Returns an xnn_status.
1959	enum xnn_status xnn_setup_global_average_pooling_nwc_f32(
1960	xnn_operator_t global_average_pooling_op,
1961	size_t batch_size,
1962	size_t width,
1963	const float* input,
1964	float* output,
1965	pthreadpool_t threadpool);
1966
/// Create-stage prototype for the F32 hardswish operator (NC layout per the name): channel count, strides and
/// flags. Returns an xnn_status; handle presumably written through `hardswish_op_out`.
1967	enum xnn_status xnn_create_hardswish_nc_f32(
1968	size_t channels,
1969	size_t input_stride,
1970	size_t output_stride,
1971	uint32_t flags,
1972	xnn_operator_t* hardswish_op_out);
1973
/// Setup-stage prototype for the F32 hardswish operator: operator handle, batch size, read-only input, output
/// and a thread pool. Returns an xnn_status.
1974	enum xnn_status xnn_setup_hardswish_nc_f32(
1975	xnn_operator_t hardswish_op,
1976	size_t batch_size,
1977	const float* input,
1978	float* output,
1979	pthreadpool_t threadpool);
1980
/// One-call prototype for F32 hardswish: create-stage and setup-stage arguments in a single call, with no
/// operator handle. Returns an xnn_status.
1981	enum xnn_status xnn_run_hardswish_nc_f32(
1982	size_t channels,
1983	size_t input_stride,
1984	size_t output_stride,
1985	size_t batch_size,
1986	const float* input,
1987	float* output,
1988	uint32_t flags,
1989	pthreadpool_t threadpool);
1990
/// Create-stage prototype for the F32 leaky ReLU operator (NC layout per the name): channel count, strides,
/// a scalar `negative_slope` and flags. Returns an xnn_status; handle presumably written through `leaky_relu_op_out`.
1991	enum xnn_status xnn_create_leaky_relu_nc_f32(
1992	size_t channels,
1993	size_t input_stride,
1994	size_t output_stride,
1995	float negative_slope,
1996	uint32_t flags,
1997	xnn_operator_t* leaky_relu_op_out);
1998
/// Setup-stage prototype for the F32 leaky ReLU operator: operator handle, batch size, read-only input, output
/// and a thread pool. Returns an xnn_status.
1999	enum xnn_status xnn_setup_leaky_relu_nc_f32(
2000	xnn_operator_t leaky_relu_op,
2001	size_t batch_size,
2002	const float* input,
2003	float* output,
2004	pthreadpool_t threadpool);
2005
/// One-call prototype for F32 leaky ReLU: create-stage arguments (channels, strides, negative_slope, flags) and
/// setup-stage arguments (batch size, buffers, thread pool) together, with no operator handle. Returns an xnn_status.
2006	enum xnn_status xnn_run_leaky_relu_nc_f32(
2007	size_t channels,
2008	size_t input_stride,
2009	size_t output_stride,
2010	size_t batch_size,
2011	const float* input,
2012	float* output,
2013	float negative_slope,
2014	uint32_t flags,
2015	pthreadpool_t threadpool);
2016
/// Create-stage prototype for F32 NHWC 2D max pooling: four input paddings, pooling window size, strides,
/// dilations, channel count, pixel strides, an output range and flags.
/// Returns an xnn_status; handle presumably written through `max_pooling_op_out`.
2017	enum xnn_status xnn_create_max_pooling2d_nhwc_f32(
2018	uint32_t input_padding_top,
2019	uint32_t input_padding_right,
2020	uint32_t input_padding_bottom,
2021	uint32_t input_padding_left,
2022	uint32_t pooling_height,
2023	uint32_t pooling_width,
2024	uint32_t stride_height,
2025	uint32_t stride_width,
2026	uint32_t dilation_height,
2027	uint32_t dilation_width,
2028	size_t channels,
2029	size_t input_pixel_stride,
2030	size_t output_pixel_stride,
2031	float output_min,
2032	float output_max,
2033	uint32_t flags,
2034	xnn_operator_t* max_pooling_op_out);
2035
/// Setup-stage prototype for F32 NHWC 2D max pooling: operator handle, batch size, input height/width,
/// read-only input, output and a thread pool. Returns an xnn_status.
2036	enum xnn_status xnn_setup_max_pooling2d_nhwc_f32(
2037	xnn_operator_t max_pooling_op,
2038	size_t batch_size,
2039	size_t input_height,
2040	size_t input_width,
2041	const float* input,
2042	float* output,
2043	pthreadpool_t threadpool);
2044
/// Create-stage prototype for the F32 N-dimensional maximum operator; it takes only flags (no output range).
/// Returns an xnn_status; handle presumably written through `maximum_op_out`.
2045	enum xnn_status xnn_create_maximum_nd_f32(
2046	uint32_t flags,
2047	xnn_operator_t* maximum_op_out);
2048
/// Setup-stage prototype for F32 N-dimensional maximum: per-input dimension counts and shape arrays
/// (presumably to allow broadcasting — confirm), the two inputs, the output and a thread pool. Returns an xnn_status.
2049	enum xnn_status xnn_setup_maximum_nd_f32(
2050	xnn_operator_t maximum_op,
2051	size_t num_input1_dims,
2052	const size_t* input1_shape,
2053	size_t num_input2_dims,
2054	const size_t* input2_shape,
2055	const float* input1,
2056	const float* input2,
2057	float* output,
2058	pthreadpool_t threadpool);
2059
/// One-call prototype for F32 N-dimensional maximum, with no operator handle. Returns an xnn_status.
/// NOTE(review): this takes `output_min` / `output_max` although the create prototype for the same operator
/// has no output range — confirm how the definition uses them.
2060	enum xnn_status xnn_run_maximum_nd_f32(
2061	size_t num_input1_dims,
2062	const size_t* input1_shape,
2063	size_t num_input2_dims,
2064	const size_t* input2_shape,
2065	const float* input1,
2066	const float* input2,
2067	float* output,
2068	float output_min,
2069	float output_max,
2070	uint32_t flags,
2071	pthreadpool_t threadpool);
2072
/// Create-stage prototype for the F32 N-dimensional minimum operator; it takes only flags (no output range).
/// Returns an xnn_status; handle presumably written through `minimum_op_out`.
2073	enum xnn_status xnn_create_minimum_nd_f32(
2074	uint32_t flags,
2075	xnn_operator_t* minimum_op_out);
2076
/// Setup-stage prototype for F32 N-dimensional minimum: per-input dimension counts and shape arrays
/// (presumably to allow broadcasting — confirm), the two inputs, the output and a thread pool. Returns an xnn_status.
2077	enum xnn_status xnn_setup_minimum_nd_f32(
2078	xnn_operator_t minimum_op,
2079	size_t num_input1_dims,
2080	const size_t* input1_shape,
2081	size_t num_input2_dims,
2082	const size_t* input2_shape,
2083	const float* input1,
2084	const float* input2,
2085	float* output,
2086	pthreadpool_t threadpool);
2087
/// One-call prototype for F32 N-dimensional minimum, with no operator handle. Returns an xnn_status.
/// NOTE(review): this takes `output_min` / `output_max` although the create prototype for the same operator
/// has no output range — confirm how the definition uses them.
2088	enum xnn_status xnn_run_minimum_nd_f32(
2089	size_t num_input1_dims,
2090	const size_t* input1_shape,
2091	size_t num_input2_dims,
2092	const size_t* input2_shape,
2093	const float* input1,
2094	const float* input2,
2095	float* output,
2096	float output_min,
2097	float output_max,
2098	uint32_t flags,
2099	pthreadpool_t threadpool);
2100
/// Create-stage prototype for the F32 N-dimensional multiply operator, parameterised by an output range and flags.
/// Returns an xnn_status; handle presumably written through `multiply_op_out`.
2101	enum xnn_status xnn_create_multiply_nd_f32(
2102	float output_min,
2103	float output_max,
2104	uint32_t flags,
2105	xnn_operator_t* multiply_op_out);
2106
/// Setup-stage prototype for F32 N-dimensional multiply: per-input dimension counts and shape arrays
/// (presumably to allow broadcasting — confirm), the two inputs, the output and a thread pool. Returns an xnn_status.
2107	enum xnn_status xnn_setup_multiply_nd_f32(
2108	xnn_operator_t multiply_op,
2109	size_t num_input1_dims,
2110	const size_t* input1_shape,
2111	size_t num_input2_dims,
2112	const size_t* input2_shape,
2113	const float* input1,
2114	const float* input2,
2115	float* output,
2116	pthreadpool_t threadpool);
2117
/// One-call prototype for F32 N-dimensional multiply: the setup prototype's shape/buffer arguments plus the
/// create prototype's output range and flags, with no operator handle. Returns an xnn_status.
2118	enum xnn_status xnn_run_multiply_nd_f32(
2119	size_t num_input1_dims,
2120	const size_t* input1_shape,
2121	size_t num_input2_dims,
2122	const size_t* input2_shape,
2123	const float* input1,
2124	const float* input2,
2125	float* output,
2126	float output_min,
2127	float output_max,
2128	uint32_t flags,
2129	pthreadpool_t threadpool);
2130
/// Create-stage prototype for the F32 negate operator (NC layout per the name): channel count, strides and flags.
/// Returns an xnn_status; handle presumably written through `negate_op_out`.
2131	enum xnn_status xnn_create_negate_nc_f32(
2132	size_t channels,
2133	size_t input_stride,
2134	size_t output_stride,
2135	uint32_t flags,
2136	xnn_operator_t* negate_op_out);
2137
/// Setup-stage prototype for the F32 negate operator: operator handle, batch size, read-only input, output and
/// a thread pool. Returns an xnn_status.
2138	enum xnn_status xnn_setup_negate_nc_f32(
2139	xnn_operator_t negate_op,
2140	size_t batch_size,
2141	const float* input,
2142	float* output,
2143	pthreadpool_t threadpool);
2144
/// One-call prototype for F32 negate: create-stage and setup-stage arguments in a single call, with no operator
/// handle. Returns an xnn_status.
2145	enum xnn_status xnn_run_negate_nc_f32(
2146	size_t channels,
2147	size_t input_stride,
2148	size_t output_stride,
2149	size_t batch_size,
2150	const float* input,
2151	float* output,
2152	uint32_t flags,
2153	pthreadpool_t threadpool);
2154
/// Create-stage prototype for the F32 PReLU operator (NC layout per the name). Here `negative_slope` is a
/// pointer rather than a scalar (presumably one value per channel — confirm), and a caches handle is accepted.
/// Returns an xnn_status; handle presumably written through `prelu_op_out`.
2155	enum xnn_status xnn_create_prelu_nc_f32(
2156	size_t channels,
2157	size_t input_stride,
2158	size_t output_stride,
2159	const float* negative_slope,
2160	uint32_t flags,
2161	xnn_caches_t caches,
2162	xnn_operator_t* prelu_op_out);
2163
/// Setup-stage prototype for the F32 PReLU operator: operator handle, batch size, read-only input, output and
/// a thread pool. Returns an xnn_status.
2164	enum xnn_status xnn_setup_prelu_nc_f32(
2165	xnn_operator_t prelu_op,
2166	size_t batch_size,
2167	const float* input,
2168	float* output,
2169	pthreadpool_t threadpool);
2170
/// Create-stage prototype for F32 NHWC 2D bilinear resize: channel count, input/output pixel strides and flags.
/// Returns an xnn_status; handle presumably written through `resize_op_out`.
2171	enum xnn_status xnn_create_resize_bilinear2d_nhwc_f32(
2172	size_t channels,
2173	size_t input_pixel_stride,
2174	size_t output_pixel_stride,
2175	uint32_t flags,
2176	xnn_operator_t* resize_op_out);
2177
/// Setup-stage prototype for F32 NHWC 2D bilinear resize: the input and output spatial sizes are both supplied
/// here, along with batch size, buffers and a thread pool. Returns an xnn_status.
2178	enum xnn_status xnn_setup_resize_bilinear2d_nhwc_f32(
2179	xnn_operator_t resize_op,
2180	size_t batch_size,
2181	size_t input_height,
2182	size_t input_width,
2183	size_t output_height,
2184	size_t output_width,
2185	const float* input,
2186	float* output,
2187	pthreadpool_t threadpool);
2188
/// Create-stage prototype for the F32 sigmoid operator (NC layout per the name): channel count, strides and flags.
/// Returns an xnn_status; handle presumably written through `sigmoid_op_out`.
2189	enum xnn_status xnn_create_sigmoid_nc_f32(
2190	size_t channels,
2191	size_t input_stride,
2192	size_t output_stride,
2193	uint32_t flags,
2194	xnn_operator_t* sigmoid_op_out);
2195
/// Setup-stage prototype for the F32 sigmoid operator: operator handle, batch size, read-only input, output and
/// a thread pool. Returns an xnn_status.
2196	enum xnn_status xnn_setup_sigmoid_nc_f32(
2197	xnn_operator_t sigmoid_op,
2198	size_t batch_size,
2199	const float* input,
2200	float* output,
2201	pthreadpool_t threadpool);
2202
/// One-call prototype for F32 sigmoid: create-stage and setup-stage arguments in a single call, with no operator
/// handle. Returns an xnn_status.
2203	enum xnn_status xnn_run_sigmoid_nc_f32(
2204	size_t channels,
2205	size_t input_stride,
2206	size_t output_stride,
2207	size_t batch_size,
2208	const float* input,
2209	float* output,
2210	uint32_t flags,
2211	pthreadpool_t threadpool);
2212
/// Create-stage prototype for the F32 softmax operator (NC layout per the name): channel count, strides and flags.
/// Returns an xnn_status; handle presumably written through `softmax_op_out`.
2213	enum xnn_status xnn_create_softmax_nc_f32(
2214	size_t channels,
2215	size_t input_stride,
2216	size_t output_stride,
2217	uint32_t flags,
2218	xnn_operator_t* softmax_op_out);
2219
/// Setup-stage prototype for the F32 softmax operator: operator handle, batch size, read-only input, output and
/// a thread pool. Returns an xnn_status.
2220	enum xnn_status xnn_setup_softmax_nc_f32(
2221	xnn_operator_t softmax_op,
2222	size_t batch_size,
2223	const float* input,
2224	float* output,
2225	pthreadpool_t threadpool);
2226
/// Create-stage prototype for the F32 square operator (NC layout per the name): channel count, strides and flags.
/// Returns an xnn_status; handle presumably written through `square_op_out`.
2227	enum xnn_status xnn_create_square_nc_f32(
2228	size_t channels,
2229	size_t input_stride,
2230	size_t output_stride,
2231	uint32_t flags,
2232	xnn_operator_t* square_op_out);
2233
/// Setup-stage prototype for the F32 square operator: operator handle, batch size, read-only input, output and
/// a thread pool. Returns an xnn_status.
2234	enum xnn_status xnn_setup_square_nc_f32(
2235	xnn_operator_t square_op,
2236	size_t batch_size,
2237	const float* input,
2238	float* output,
2239	pthreadpool_t threadpool);
2240
/// One-call prototype for F32 square: create-stage and setup-stage arguments in a single call, with no operator
/// handle. Returns an xnn_status.
2241	enum xnn_status xnn_run_square_nc_f32(
2242	size_t channels,
2243	size_t input_stride,
2244	size_t output_stride,
2245	size_t batch_size,
2246	const float* input,
2247	float* output,
2248	uint32_t flags,
2249	pthreadpool_t threadpool);
2250
/// Create-stage prototype for the F32 square-root operator (NC layout per the name): channel count, strides and
/// flags. Returns an xnn_status; handle presumably written through `sqrt_op_out`.
2251	enum xnn_status xnn_create_square_root_nc_f32(
2252	size_t channels,
2253	size_t input_stride,
2254	size_t output_stride,
2255	uint32_t flags,
2256	xnn_operator_t* sqrt_op_out);
2257
/// Setup-stage prototype for the F32 square-root operator: operator handle, batch size, read-only input, output
/// and a thread pool. Returns an xnn_status.
2258	enum xnn_status xnn_setup_square_root_nc_f32(
2259	xnn_operator_t sqrt_op,
2260	size_t batch_size,
2261	const float* input,
2262	float* output,
2263	pthreadpool_t threadpool);
2264
/// One-call prototype for F32 square root: create-stage and setup-stage arguments in a single call, with no
/// operator handle. Returns an xnn_status.
2265	enum xnn_status xnn_run_square_root_nc_f32(
2266	size_t channels,
2267	size_t input_stride,
2268	size_t output_stride,
2269	size_t batch_size,
2270	const float* input,
2271	float* output,
2272	uint32_t flags,
2273	pthreadpool_t threadpool);
2274
/// Create-stage prototype for the F32 N-dimensional squared-difference operator; it takes only flags.
/// Returns an xnn_status; handle presumably written through `squared_difference_op_out`.
2275	enum xnn_status xnn_create_squared_difference_nd_f32(
2276	uint32_t flags,
2277	xnn_operator_t* squared_difference_op_out);
2278
/// Setup-stage prototype for F32 N-dimensional squared difference: per-input dimension counts and shape arrays
/// (presumably to allow broadcasting — confirm), the two inputs, the output and a thread pool. Returns an xnn_status.
2279	enum xnn_status xnn_setup_squared_difference_nd_f32(
2280	xnn_operator_t squared_difference_op,
2281	size_t num_input1_dims,
2282	const size_t* input1_shape,
2283	size_t num_input2_dims,
2284	const size_t* input2_shape,
2285	const float* input1,
2286	const float* input2,
2287	float* output,
2288	pthreadpool_t threadpool);
2289
/// One-call prototype for F32 N-dimensional squared difference, with no operator handle. Returns an xnn_status.
/// NOTE(review): this takes `output_min` / `output_max` although the create prototype for the same operator
/// has no output range — confirm how the definition uses them.
2290	enum xnn_status xnn_run_squared_difference_nd_f32(
2291	size_t num_input1_dims,
2292	const size_t* input1_shape,
2293	size_t num_input2_dims,
2294	const size_t* input2_shape,
2295	const float* input1,
2296	const float* input2,
2297	float* output,
2298	float output_min,
2299	float output_max,
2300	uint32_t flags,
2301	pthreadpool_t threadpool);
2302
/// Create-stage prototype for the F32 N-dimensional subtract operator, parameterised by an output range and flags.
/// Returns an xnn_status; handle presumably written through `subtract_op_out`.
2303	enum xnn_status xnn_create_subtract_nd_f32(
2304	float output_min,
2305	float output_max,
2306	uint32_t flags,
2307	xnn_operator_t* subtract_op_out);
2308
/// Setup-stage prototype for F32 N-dimensional subtract: per-input dimension counts and shape arrays
/// (presumably to allow broadcasting — confirm), the two inputs, the output and a thread pool. Returns an xnn_status.
2309	enum xnn_status xnn_setup_subtract_nd_f32(
2310	xnn_operator_t subtract_op,
2311	size_t num_input1_dims,
2312	const size_t* input1_shape,
2313	size_t num_input2_dims,
2314	const size_t* input2_shape,
2315	const float* input1,
2316	const float* input2,
2317	float* output,
2318	pthreadpool_t threadpool);
2319
/// One-call prototype for F32 N-dimensional subtract: the setup prototype's shape/buffer arguments plus the
/// create prototype's output range and flags, with no operator handle. Returns an xnn_status.
2320	enum xnn_status xnn_run_subtract_nd_f32(
2321	size_t num_input1_dims,
2322	const size_t* input1_shape,
2323	size_t num_input2_dims,
2324	const size_t* input2_shape,
2325	const float* input1,
2326	const float* input2,
2327	float* output,
2328	float output_min,
2329	float output_max,
2330	uint32_t flags,
2331	pthreadpool_t threadpool);
2332
/// Create-stage prototype for the F32 truncation operator (NC layout per the name): channel count, strides and
/// flags. Returns an xnn_status; handle presumably written through `truncation_op_out`.
2333	enum xnn_status xnn_create_truncation_nc_f32(
2334	size_t channels,
2335	size_t input_stride,
2336	size_t output_stride,
2337	uint32_t flags,
2338	xnn_operator_t* truncation_op_out);
2339
/// Setup-stage prototype for the F32 truncation operator: operator handle, batch size, read-only input, output
/// and a thread pool. Returns an xnn_status.
2340	enum xnn_status xnn_setup_truncation_nc_f32(
2341	xnn_operator_t truncation_op,
2342	size_t batch_size,
2343	const float* input,
2344	float* output,
2345	pthreadpool_t threadpool);
2346
/// One-call prototype for F32 truncation: create-stage and setup-stage arguments in a single call, with no
/// operator handle. Returns an xnn_status.
2347	enum xnn_status xnn_run_truncation_nc_f32(
2348	size_t channels,
2349	size_t input_stride,
2350	size_t output_stride,
2351	size_t batch_size,
2352	const float* input,
2353	float* output,
2354	uint32_t flags,
2355	pthreadpool_t threadpool);
2356
2357	#ifndef XNN_NO_NCHW_OPERATORS
2358
/// Create-stage prototype for the X32 depth-to-space operator that (per the name) goes from NCHW to NHWC:
/// output channel count, channel strides, `block_size` and flags.
/// Returns an xnn_status; handle presumably written through `depth_to_space_op_out`.
2359	enum xnn_status xnn_create_depth_to_space_nchw2nhwc_x32(
2360	size_t output_channels,
2361	size_t input_channel_stride,
2362	size_t output_channel_stride,
2363	uint32_t block_size,
2364	uint32_t flags,
2365	xnn_operator_t* depth_to_space_op_out);
2366
/// Setup-stage prototype for the X32 NCHW-to-NHWC depth-to-space operator: operator handle, batch size, input
/// height/width, untyped (`void*`) input and output buffers and a thread pool. Returns an xnn_status.
2367	enum xnn_status xnn_setup_depth_to_space_nchw2nhwc_x32(
2368	xnn_operator_t depth_to_space_op,
2369	size_t batch_size,
2370	size_t input_height,
2371	size_t input_width,
2372	const void* input,
2373	void* output,
2374	pthreadpool_t threadpool);
2375
/// Create-stage prototype for the F32 NCHW 2D convolution operator: paddings, kernel size, subsampling,
/// dilation, grouping, channel strides, read-only kernel and bias, an output range, flags and a caches handle.
/// Returns an xnn_status; handle presumably written through `convolution_op_out`.
2376	enum xnn_status xnn_create_convolution2d_nchw_f32(
2377	uint32_t input_padding_top,
2378	uint32_t input_padding_right,
2379	uint32_t input_padding_bottom,
2380	uint32_t input_padding_left,
2381	uint32_t kernel_height,
2382	uint32_t kernel_width,
2383	uint32_t subsampling_height,
2384	uint32_t subsampling_width,
2385	uint32_t dilation_height,
2386	uint32_t dilation_width,
2387	uint32_t groups,
2388	size_t group_input_channels,
2389	size_t group_output_channels,
2390	size_t input_channel_stride,
2391	size_t output_channel_stride,
2392	const float* kernel,
2393	const float* bias,
2394	float output_min,
2395	float output_max,
2396	uint32_t flags,
2397	xnn_caches_t caches,
2398	xnn_operator_t* convolution_op_out);
2399
/// Setup-stage prototype for the F32 NCHW 2D convolution operator: operator handle, batch size, input
/// height/width, read-only input, output and a thread pool. Returns an xnn_status.
2400	enum xnn_status xnn_setup_convolution2d_nchw_f32(
2401	xnn_operator_t convolution_op,
2402	size_t batch_size,
2403	size_t input_height,
2404	size_t input_width,
2405	const float* input,
2406	float* output,
2407	pthreadpool_t threadpool);
2408
/// Create-stage prototype for F32 global average pooling (NCW layout per the name). Unlike the NWC variant in
/// this header it takes no stride parameters: only channel count, output range and flags.
/// Returns an xnn_status; handle presumably written through `global_average_pooling_op_out`.
2409	enum xnn_status xnn_create_global_average_pooling_ncw_f32(
2410	size_t channels,
2411	float output_min,
2412	float output_max,
2413	uint32_t flags,
2414	xnn_operator_t* global_average_pooling_op_out);
2415
/// Setup-stage prototype for F32 NCW global average pooling: operator handle, batch size, `width`, read-only
/// input, output and a thread pool. Returns an xnn_status.
2416	enum xnn_status xnn_setup_global_average_pooling_ncw_f32(
2417	xnn_operator_t global_average_pooling_op,
2418	size_t batch_size,
2419	size_t width,
2420	const float* input,
2421	float* output,
2422	pthreadpool_t threadpool);
2423
/// Create-stage prototype for F32 NCHW 2D bilinear resize: channel count, input/output pixel strides and flags.
/// Returns an xnn_status; handle presumably written through `resize_op_out`.
2424	enum xnn_status xnn_create_resize_bilinear2d_nchw_f32(
2425	size_t channels,
2426	size_t input_pixel_stride,
2427	size_t output_pixel_stride,
2428	uint32_t flags,
2429	xnn_operator_t* resize_op_out);
2430
/// Setup-stage prototype for F32 NCHW 2D bilinear resize: input and output spatial sizes, batch size, buffers
/// and a thread pool. Returns an xnn_status.
2431	enum xnn_status xnn_setup_resize_bilinear2d_nchw_f32(
2432	xnn_operator_t resize_op,
2433	size_t batch_size,
2434	size_t input_height,
2435	size_t input_width,
2436	size_t output_height,
2437	size_t output_width,
2438	const float* input,
2439	float* output,
2440	pthreadpool_t threadpool);
2441
2442	#endif // XNN_NO_NCHW_OPERATORS
2443
2444	#endif // XNN_NO_F32_OPERATORS
2445
2446	#ifndef XNN_NO_X32_OPERATORS
2447
/// Create-stage prototype for the X32 channel-shuffle operator (NC layout per the name): group count, channels
/// per group, strides and flags. Returns an xnn_status; handle presumably written through `channel_shuffle_op_out`.
2448	enum xnn_status xnn_create_channel_shuffle_nc_x32(
2449	size_t groups,
2450	size_t group_channels,
2451	size_t input_stride,
2452	size_t output_stride,
2453	uint32_t flags,
2454	xnn_operator_t* channel_shuffle_op_out);
2455
/// Setup-stage prototype for the X32 channel-shuffle operator: operator handle, batch size, untyped (`void*`)
/// input and output buffers and a thread pool. Returns an xnn_status.
2456	enum xnn_status xnn_setup_channel_shuffle_nc_x32(
2457	xnn_operator_t channel_shuffle_op,
2458	size_t batch_size,
2459	const void* input,
2460	void* output,
2461	pthreadpool_t threadpool);
2462
/// Create-stage prototype for the X32 N-dimensional constant-pad operator. `padding_value` is an untyped
/// pointer (presumably to a single 32-bit element, given the x32 suffix — confirm).
/// Returns an xnn_status; handle presumably written through `constant_pad_op_out`.
2463	enum xnn_status xnn_create_constant_pad_nd_x32(
2464	const void* padding_value,
2465	uint32_t flags,
2466	xnn_operator_t* constant_pad_op_out);
2467
/// Setup-stage prototype for X32 N-dimensional constant pad: dimension count, input shape, and `pre_padding` /
/// `post_padding` arrays (presumably `num_dims` entries each — confirm), plus buffers and a thread pool.
/// Returns an xnn_status.
2468	enum xnn_status xnn_setup_constant_pad_nd_x32(
2469	xnn_operator_t constant_pad_op,
2470	size_t num_dims,
2471	const size_t* input_shape,
2472	const size_t* pre_padding,
2473	const size_t* post_padding,
2474	const void* input,
2475	void* output,
2476	pthreadpool_t threadpool);
2477
/// One-call prototype for X32 N-dimensional constant pad, with no operator handle. Returns an xnn_status.
/// NOTE(review): unlike the other run prototypes, `flags` is the first parameter here, and the padding arrays
/// are named `pre_paddings` / `post_paddings` (plural) versus `pre_padding` / `post_padding` in the setup prototype.
2478	enum xnn_status xnn_run_constant_pad_nd_x32(
2479	uint32_t flags,
2480	size_t num_dims,
2481	const size_t* input_shape,
2482	const size_t* pre_paddings,
2483	const size_t* post_paddings,
2484	const void* input,
2485	void* output,
2486	const void* padding_value,
2487	pthreadpool_t threadpool);
2488
/// Create-stage prototype for the X32 copy operator (NC layout per the name): channel count, strides and flags.
/// Returns an xnn_status; handle presumably written through `copy_op_out`.
2489	enum xnn_status xnn_create_copy_nc_x32(
2490	size_t channels,
2491	size_t input_stride,
2492	size_t output_stride,
2493	uint32_t flags,
2494	xnn_operator_t* copy_op_out);
2495
/// Setup-stage prototype for the X32 copy operator: operator handle, batch size, untyped (`void*`) input and
/// output buffers and a thread pool. Returns an xnn_status.
2496	enum xnn_status xnn_setup_copy_nc_x32(
2497	xnn_operator_t copy_op,
2498	size_t batch_size,
2499	const void* input,
2500	void* output,
2501	pthreadpool_t threadpool);
2502
/// One-call prototype for X32 copy, with no operator handle. Returns an xnn_status.
/// NOTE(review): buffers are typed `uint32_t*` here whereas the setup prototype takes `void*`.
2503	enum xnn_status xnn_run_copy_nc_x32(
2504	size_t channels,
2505	size_t input_stride,
2506	size_t output_stride,
2507	size_t batch_size,
2508	const uint32_t* input,
2509	uint32_t* output,
2510	uint32_t flags,
2511	pthreadpool_t threadpool);
2512
/// Create-stage prototype for the X32 NHWC depth-to-space operator: output channel count, channel strides,
/// `block_size` and flags. Returns an xnn_status; handle presumably written through `depth_to_space_op_out`.
2513	enum xnn_status xnn_create_depth_to_space_nhwc_x32(
2514	size_t output_channels,
2515	size_t input_channel_stride,
2516	size_t output_channel_stride,
2517	uint32_t block_size,
2518	uint32_t flags,
2519	xnn_operator_t* depth_to_space_op_out);
2520
/// Setup-stage prototype for the X32 NHWC depth-to-space operator: operator handle, batch size, input
/// height/width, untyped (`void*`) buffers and a thread pool. Returns an xnn_status.
2521	enum xnn_status xnn_setup_depth_to_space_nhwc_x32(
2522	xnn_operator_t depth_to_space_op,
2523	size_t batch_size,
2524	size_t input_height,
2525	size_t input_width,
2526	const void* input,
2527	void* output,
2528	pthreadpool_t threadpool);
2529
/// Create-stage prototype for the X32 N-dimensional slice operator; it takes only flags.
/// Returns an xnn_status; handle presumably written through `slice_op_out`.
2530	enum xnn_status xnn_create_slice_nd_x32(
2531	uint32_t flags,
2532	xnn_operator_t* slice_op_out);
2533
/// Setup-stage prototype for X32 N-dimensional slice: dimension count, input shape, and `offsets` / `sizes`
/// arrays (presumably `num_dims` entries each — confirm), plus untyped buffers and a thread pool.
/// Returns an xnn_status.
2534	enum xnn_status xnn_setup_slice_nd_x32(
2535	xnn_operator_t slice_op,
2536	size_t num_dims,
2537	const size_t* input_shape,
2538	const size_t* offsets,
2539	const size_t* sizes,
2540	const void* input,
2541	void* output,
2542	pthreadpool_t threadpool);
2543
/// One-call prototype for X32 N-dimensional slice: the setup prototype's arguments plus flags, with no
/// operator handle. Returns an xnn_status.
2544	enum xnn_status xnn_run_slice_nd_x32(
2545	size_t num_dims,
2546	const size_t* input_shape,
2547	const size_t* offsets,
2548	const size_t* sizes,
2549	const void* input,
2550	void* output,
2551	uint32_t flags,
2552	pthreadpool_t threadpool);
2553
/// Create-stage prototype for the X32 NHWC space-to-depth operator: input channel count (the depth-to-space
/// prototypes take output channels instead), channel strides, `block_size` and flags.
/// Returns an xnn_status; handle presumably written through `space_to_depth_op_out`.
2554	enum xnn_status xnn_create_space_to_depth_nhwc_x32(
2555	size_t input_channels,
2556	size_t input_channel_stride,
2557	size_t output_channel_stride,
2558	uint32_t block_size,
2559	uint32_t flags,
2560	xnn_operator_t* space_to_depth_op_out);
2561
/// Setup-stage prototype for the X32 NHWC space-to-depth operator: operator handle, batch size, input
/// height/width, untyped (`void*`) buffers and a thread pool. Returns an xnn_status.
2562	enum xnn_status xnn_setup_space_to_depth_nhwc_x32(
2563	xnn_operator_t space_to_depth_op,
2564	size_t batch_size,
2565	size_t input_height,
2566	size_t input_width,
2567	const void* input,
2568	void* output,
2569	pthreadpool_t threadpool);
2570
/// Create-stage prototype for the X32 N-dimensional transpose operator; it takes only flags.
/// Returns an xnn_status; handle presumably written through `transpose_op_out`.
2571	enum xnn_status xnn_create_transpose_nd_x32(
2572	uint32_t flags,
2573	xnn_operator_t* transpose_op_out);
2574
/// Setup-stage prototype for X32 N-dimensional transpose: operator handle, untyped input/output buffers,
/// dimension count, input shape and `output_perm` (presumably `num_dims` entries each — confirm), and a thread pool.
/// `num_dims` is passed by value, so it carries no top-level `const` in this prototype; the qualifier does not
/// affect the function type. Returns an xnn_status.
2575	enum xnn_status xnn_setup_transpose_nd_x32(
2576	xnn_operator_t transpose_op,
2577	const void* input,
2578	void* output,
2579	size_t num_dims,
2580	const size_t* input_shape,
2581	const size_t* output_perm,
2582	pthreadpool_t threadpool);
2583
/// One-call prototype for X32 N-dimensional transpose: untyped input/output buffers, dimension count, input
/// shape, `output_perm`, flags and a thread pool, with no operator handle.
/// `num_dims` is passed by value, so it carries no top-level `const` in this prototype; the qualifier does not
/// affect the function type. Returns an xnn_status.
2584	enum xnn_status xnn_run_transpose_nd_x32(
2585	const void* input,
2586	void* output,
2587	size_t num_dims,
2588	const size_t* input_shape,
2589	const size_t* output_perm,
2590	uint32_t flags,
2591	pthreadpool_t threadpool);
2592
/// Create-stage prototype for the X32 NHWC 2D unpooling operator: four input paddings, pooling window size,
/// channel count, pixel strides and flags. Returns an xnn_status; handle presumably written through `unpooling_op_out`.
2593	enum xnn_status xnn_create_unpooling2d_nhwc_x32(
2594	uint32_t input_padding_top,
2595	uint32_t input_padding_right,
2596	uint32_t input_padding_bottom,
2597	uint32_t input_padding_left,
2598	uint32_t pooling_height,
2599	uint32_t pooling_width,
2600	size_t channels,
2601	size_t input_pixel_stride,
2602	size_t output_pixel_stride,
2603	uint32_t flags,
2604	xnn_operator_t* unpooling_op_out);
2605
/// Setup-stage prototype for the X32 NHWC 2D unpooling operator. In addition to the untyped input and output
/// buffers it takes a read-only `uint32_t` `index` buffer (its contents are not described in this header).
/// Returns an xnn_status.
2606	enum xnn_status xnn_setup_unpooling2d_nhwc_x32(
2607	xnn_operator_t unpooling_op,
2608	size_t batch_size,
2609	size_t input_height,
2610	size_t input_width,
2611	const void* input,
2612	const uint32_t* index,
2613	void* output,
2614	pthreadpool_t threadpool);
2615
2616	#endif // XNN_NO_X32_OPERATORS
2617
2618	#ifndef XNN_NO_F16_OPERATORS
2619
/// Create-stage prototype for the F16 abs operator (NC layout per the name): channel count, strides and flags.
/// Returns an xnn_status; handle presumably written through `abs_op_out`.
2620	enum xnn_status xnn_create_abs_nc_f16(
2621	size_t channels,
2622	size_t input_stride,
2623	size_t output_stride,
2624	uint32_t flags,
2625	xnn_operator_t* abs_op_out);
2626
/// Setup-stage prototype for the F16 abs operator: operator handle, batch size, untyped (`void*`) input and
/// output buffers and a thread pool. Returns an xnn_status.
2627	enum xnn_status xnn_setup_abs_nc_f16(
2628	xnn_operator_t abs_op,
2629	size_t batch_size,
2630	const void* input,
2631	void* output,
2632	pthreadpool_t threadpool);
2633
/// Create-stage prototype for the F16 N-dimensional add operator. The output range is passed as `float` even
/// though this is the F16 variant. Returns an xnn_status; handle presumably written through `add_op_out`.
2634	enum xnn_status xnn_create_add_nd_f16(
2635	float output_min,
2636	float output_max,
2637	uint32_t flags,
2638	xnn_operator_t* add_op_out);
2639
/// Setup-stage prototype for F16 N-dimensional add: per-input dimension counts and shape arrays (presumably to
/// allow broadcasting — confirm), untyped input/output buffers and a thread pool. Returns an xnn_status.
2640	enum xnn_status xnn_setup_add_nd_f16(
2641	xnn_operator_t add_op,
2642	size_t num_input1_dims,
2643	const size_t* input1_shape,
2644	size_t num_input2_dims,
2645	const size_t* input2_shape,
2646	const void* input1,
2647	const void* input2,
2648	void* output,
2649	pthreadpool_t threadpool);
2650
/// Create-stage prototype for F16 NHWC 2D average pooling: four input paddings, pooling window size, strides,
/// channel count, pixel strides, a `float` output range and flags (no dilation parameters).
/// Returns an xnn_status; handle presumably written through `average_pooling_op_out`.
2651	enum xnn_status xnn_create_average_pooling2d_nhwc_f16(
2652	uint32_t input_padding_top,
2653	uint32_t input_padding_right,
2654	uint32_t input_padding_bottom,
2655	uint32_t input_padding_left,
2656	uint32_t pooling_height,
2657	uint32_t pooling_width,
2658	uint32_t stride_height,
2659	uint32_t stride_width,
2660	size_t channels,
2661	size_t input_pixel_stride,
2662	size_t output_pixel_stride,
2663	float output_min,
2664	float output_max,
2665	uint32_t flags,
2666	xnn_operator_t* average_pooling_op_out);
2667
/// Setup-stage prototype for F16 NHWC 2D average pooling: operator handle, batch size, input height/width,
/// untyped (`void*`) buffers and a thread pool. Returns an xnn_status.
2668	enum xnn_status xnn_setup_average_pooling2d_nhwc_f16(
2669	xnn_operator_t average_pooling_op,
2670	size_t batch_size,
2671	size_t input_height,
2672	size_t input_width,
2673	const void* input,
2674	void* output,
2675	pthreadpool_t threadpool);
2676
/// Create-stage prototype for the F16 bankers'-rounding operator (NC layout per the name): channel count,
/// strides and flags. Returns an xnn_status; handle presumably written through `rounding_op_out`.
2677	enum xnn_status xnn_create_bankers_rounding_nc_f16(
2678	size_t channels,
2679	size_t input_stride,
2680	size_t output_stride,
2681	uint32_t flags,
2682	xnn_operator_t* rounding_op_out);
2683
/// Setup-stage prototype for the F16 bankers'-rounding operator: operator handle, batch size, untyped (`void*`)
/// buffers and a thread pool. Returns an xnn_status.
2684	enum xnn_status xnn_setup_bankers_rounding_nc_f16(
2685	xnn_operator_t rounding_op,
2686	size_t batch_size,
2687	const void* input,
2688	void* output,
2689	pthreadpool_t threadpool);
2690
/// Create-stage prototype for the F16 ceiling operator (NC layout per the name): channel count, strides and flags.
/// Returns an xnn_status; handle presumably written through `ceiling_op_out`.
2691	enum xnn_status xnn_create_ceiling_nc_f16(
2692	size_t channels,
2693	size_t input_stride,
2694	size_t output_stride,
2695	uint32_t flags,
2696	xnn_operator_t* ceiling_op_out);
2697
/// Setup-stage prototype for the F16 ceiling operator: operator handle, batch size, untyped (`void*`) buffers
/// and a thread pool. Returns an xnn_status.
2698	enum xnn_status xnn_setup_ceiling_nc_f16(
2699	xnn_operator_t ceiling_op,
2700	size_t batch_size,
2701	const void* input,
2702	void* output,
2703	pthreadpool_t threadpool);
2704
/// Create-stage prototype for the F16 clamp operator (NC layout per the name): channel count, strides, a
/// `float` output range and flags. Returns an xnn_status; handle presumably written through `clamp_op_out`.
2705	enum xnn_status xnn_create_clamp_nc_f16(
2706	size_t channels,
2707	size_t input_stride,
2708	size_t output_stride,
2709	float output_min,
2710	float output_max,
2711	uint32_t flags,
2712	xnn_operator_t* clamp_op_out);
2713
/// Setup-stage prototype for the F16 clamp operator: operator handle, batch size, untyped (`void*`) buffers
/// and a thread pool. Returns an xnn_status.
2714	enum xnn_status xnn_setup_clamp_nc_f16(
2715	xnn_operator_t clamp_op,
2716	size_t batch_size,
2717	const void* input,
2718	void* output,
2719	pthreadpool_t threadpool);
2720
/// Create-stage prototype for the F16 NHWC 2D convolution operator. Same parameter list as the F32 NCHW
/// variant in this header, except `kernel` and `bias` are untyped `const void*` (their element type is not
/// stated here); the output range stays `float`.
/// Returns an xnn_status; handle presumably written through `convolution_op_out`.
2721	enum xnn_status xnn_create_convolution2d_nhwc_f16(
2722	uint32_t input_padding_top,
2723	uint32_t input_padding_right,
2724	uint32_t input_padding_bottom,
2725	uint32_t input_padding_left,
2726	uint32_t kernel_height,
2727	uint32_t kernel_width,
2728	uint32_t subsampling_height,
2729	uint32_t subsampling_width,
2730	uint32_t dilation_height,
2731	uint32_t dilation_width,
2732	uint32_t groups,
2733	size_t group_input_channels,
2734	size_t group_output_channels,
2735	size_t input_channel_stride,
2736	size_t output_channel_stride,
2737	const void* kernel,
2738	const void* bias,
2739	float output_min,
2740	float output_max,
2741	uint32_t flags,
2742	xnn_caches_t caches,
2743	xnn_operator_t* convolution_op_out);
2744
/// Setup-stage prototype for the F16 NHWC 2D convolution operator: operator handle, batch size, input
/// height/width, untyped (`void*`) buffers and a thread pool. Returns an xnn_status.
2745	enum xnn_status xnn_setup_convolution2d_nhwc_f16(
2746	xnn_operator_t convolution_op,
2747	size_t batch_size,
2748	size_t input_height,
2749	size_t input_width,
2750	const void* input,
2751	void* output,
2752	pthreadpool_t threadpool);
2753
/// Create-stage prototype for the F16 NHWC 2D deconvolution operator. Mirrors the F32 deconvolution prototype
/// except `kernel` and `bias` are untyped `const void*`; the output range stays `float`.
/// Returns an xnn_status; handle presumably written through `deconvolution_op_out`.
2754	enum xnn_status xnn_create_deconvolution2d_nhwc_f16(
2755	uint32_t output_padding_top,
2756	uint32_t output_padding_right,
2757	uint32_t output_padding_bottom,
2758	uint32_t output_padding_left,
2759	uint32_t kernel_height,
2760	uint32_t kernel_width,
2761	uint32_t stride_height,
2762	uint32_t stride_width,
2763	uint32_t dilation_height,
2764	uint32_t dilation_width,
2765	uint32_t groups,
2766	size_t group_input_channels,
2767	size_t group_output_channels,
2768	size_t input_pixel_stride,
2769	size_t output_pixel_stride,
2770	const void* kernel,
2771	const void* bias,
2772	float output_min,
2773	float output_max,
2774	uint32_t flags,
2775	xnn_caches_t caches,
2776	xnn_operator_t* deconvolution_op_out);
2777
/// Setup-stage prototype for the F16 NHWC 2D deconvolution operator: batch size, input extent,
/// `adjustment_height` / `adjustment_width` (meaning not visible in this header), untyped buffers and a thread
/// pool. Returns an xnn_status.
2778	enum xnn_status xnn_setup_deconvolution2d_nhwc_f16(
2779	xnn_operator_t deconvolution_op,
2780	size_t batch_size,
2781	size_t input_height,
2782	size_t input_width,
2783	uint32_t adjustment_height,
2784	uint32_t adjustment_width,
2785	const void* input,
2786	void* output,
2787	pthreadpool_t threadpool);
2788
/// Create-stage prototype for the F16 N-dimensional divide operator, parameterised by a `float` output range
/// and flags. Returns an xnn_status; handle presumably written through `divide_op_out`.
2789	enum xnn_status xnn_create_divide_nd_f16(
2790	float output_min,
2791	float output_max,
2792	uint32_t flags,
2793	xnn_operator_t* divide_op_out);
2794
/// Setup-stage prototype for F16 N-dimensional divide: per-input dimension counts and shape arrays (presumably
/// to allow broadcasting — confirm), untyped input/output buffers and a thread pool. Returns an xnn_status.
2795	enum xnn_status xnn_setup_divide_nd_f16(
2796	xnn_operator_t divide_op,
2797	size_t num_input1_dims,
2798	const size_t* input1_shape,
2799	size_t num_input2_dims,
2800	const size_t* input2_shape,
2801	const void* input1,
2802	const void* input2,
2803	void* output,
2804	pthreadpool_t threadpool);
2805
/// Create-stage prototype for the F16 ELU operator (NC layout per the name): channel count, strides, a `float`
/// `alpha` and flags. Returns an xnn_status; handle presumably written through `elu_op_out`.
2806	enum xnn_status xnn_create_elu_nc_f16(
2807	size_t channels,
2808	size_t input_stride,
2809	size_t output_stride,
2810	float alpha,
2811	uint32_t flags,
2812	xnn_operator_t* elu_op_out);
2813
/// Setup-stage prototype for the F16 ELU operator: operator handle, batch size, untyped (`void*`) buffers and
/// a thread pool. Returns an xnn_status.
2814	enum xnn_status xnn_setup_elu_nc_f16(
2815	xnn_operator_t elu_op,
2816	size_t batch_size,
2817	const void* input,
2818	void* output,
2819	pthreadpool_t threadpool);
2820
/// Create-stage prototype for the F16 floor operator (NC layout per the name): channel count, strides and flags.
/// Returns an xnn_status; handle presumably written through `floor_op_out`.
2821	enum xnn_status xnn_create_floor_nc_f16(
2822	size_t channels,
2823	size_t input_stride,
2824	size_t output_stride,
2825	uint32_t flags,
2826	xnn_operator_t* floor_op_out);
2827
/// Setup-stage prototype for the F16 floor operator: operator handle, batch size, untyped (`void*`) buffers
/// and a thread pool. Returns an xnn_status.
2828	enum xnn_status xnn_setup_floor_nc_f16(
2829	xnn_operator_t floor_op,
2830	size_t batch_size,
2831	const void* input,
2832	void* output,
2833	pthreadpool_t threadpool);
2834
/// Create-stage prototype for the F16 fully connected operator (NC layout per the name): channel counts,
/// strides, untyped `const void*` kernel and bias, a `float` output range, flags and a caches handle.
/// Returns an xnn_status; handle presumably written through `fully_connected_op_out`.
2835	enum xnn_status xnn_create_fully_connected_nc_f16(
2836	size_t input_channels,
2837	size_t output_channels,
2838	size_t input_stride,
2839	size_t output_stride,
2840	const void* kernel,
2841	const void* bias,
2842	float output_min,
2843	float output_max,
2844	uint32_t flags,
2845	xnn_caches_t caches,
2846	xnn_operator_t* fully_connected_op_out);
2847
/// Setup-stage prototype for the F16 fully connected operator: operator handle, batch size, untyped (`void*`)
/// buffers and a thread pool. Returns an xnn_status.
2848	enum xnn_status xnn_setup_fully_connected_nc_f16(
2849	xnn_operator_t fully_connected_op,
2850	size_t batch_size,
2851	const void* input,
2852	void* output,
2853	pthreadpool_t threadpool);
2854
/// Create-stage prototype for F16 global average pooling (NWC layout per the name): channel count, strides,
/// a `float` output range and flags.
/// Returns an xnn_status; handle presumably written through `global_average_pooling_op_out`.
2855	enum xnn_status xnn_create_global_average_pooling_nwc_f16(
2856	size_t channels,
2857	size_t input_stride,
2858	size_t output_stride,
2859	float output_min,
2860	float output_max,
2861	uint32_t flags,
2862	xnn_operator_t* global_average_pooling_op_out);
2863
/// Setup-stage prototype for F16 NWC global average pooling: operator handle, batch size, `width`, untyped
/// (`void*`) buffers and a thread pool. Returns an xnn_status.
2864	enum xnn_status xnn_setup_global_average_pooling_nwc_f16(
2865	xnn_operator_t global_average_pooling_op,
2866	size_t batch_size,
2867	size_t width,
2868	const void* input,
2869	void* output,
2870	pthreadpool_t threadpool);
2871
/// Create-stage prototype for the F16 hardswish operator (NC layout per the name): channel count, strides and
/// flags. Returns an xnn_status; handle presumably written through `hardswish_op_out`.
2872	enum xnn_status xnn_create_hardswish_nc_f16(
2873	size_t channels,
2874	size_t input_stride,
2875	size_t output_stride,
2876	uint32_t flags,
2877	xnn_operator_t* hardswish_op_out);
2878
/// Setup-stage prototype for the F16 hardswish operator: operator handle, batch size, untyped (`void*`)
/// buffers and a thread pool. Returns an xnn_status.
2879	enum xnn_status xnn_setup_hardswish_nc_f16(
2880	xnn_operator_t hardswish_op,
2881	size_t batch_size,
2882	const void* input,
2883	void* output,
2884	pthreadpool_t threadpool);
2885
/// Create-stage prototype for the F16 leaky ReLU operator (NC layout per the name): channel count, strides, a
/// `float` `negative_slope` and flags. Returns an xnn_status; handle presumably written through `leaky_relu_op_out`.
2886	enum xnn_status xnn_create_leaky_relu_nc_f16(
2887	size_t channels,
2888	size_t input_stride,
2889	size_t output_stride,
2890	float negative_slope,
2891	uint32_t flags,
2892	xnn_operator_t* leaky_relu_op_out);
2893
/// Setup-stage prototype for the F16 leaky ReLU operator: operator handle, batch size, untyped (`void*`)
/// buffers and a thread pool. Returns an xnn_status.
2894	enum xnn_status xnn_setup_leaky_relu_nc_f16(
2895	xnn_operator_t leaky_relu_op,
2896	size_t batch_size,
2897	const void* input,
2898	void* output,
2899	pthreadpool_t threadpool);
2900
/// Create-stage prototype for F16 NHWC 2D max pooling: four input paddings, pooling window size, strides,
/// dilations, channel count, pixel strides, a `float` output range and flags.
/// Returns an xnn_status; handle presumably written through `max_pooling_op_out`.
2901	enum xnn_status xnn_create_max_pooling2d_nhwc_f16(
2902	uint32_t input_padding_top,
2903	uint32_t input_padding_right,
2904	uint32_t input_padding_bottom,
2905	uint32_t input_padding_left,
2906	uint32_t pooling_height,
2907	uint32_t pooling_width,
2908	uint32_t stride_height,
2909	uint32_t stride_width,
2910	uint32_t dilation_height,
2911	uint32_t dilation_width,
2912	size_t channels,
2913	size_t input_pixel_stride,
2914	size_t output_pixel_stride,
2915	float output_min,
2916	float output_max,
2917	uint32_t flags,
2918	xnn_operator_t* max_pooling_op_out);
2919
/// Setup-stage prototype for F16 NHWC 2D max pooling: operator handle, batch size, input height/width, untyped
/// (`void*`) buffers and a thread pool. Returns an xnn_status.
2920	enum xnn_status xnn_setup_max_pooling2d_nhwc_f16(
2921	xnn_operator_t max_pooling_op,
2922	size_t batch_size,
2923	size_t input_height,
2924	size_t input_width,
2925	const void* input,
2926	void* output,
2927	pthreadpool_t threadpool);
2928
/// Create-stage prototype for the F16 N-dimensional maximum operator; it takes only flags.
/// Returns an xnn_status; handle presumably written through `maximum_op_out`.
2929	enum xnn_status xnn_create_maximum_nd_f16(
2930	uint32_t flags,
2931	xnn_operator_t* maximum_op_out);
2932
/// Setup-stage prototype for F16 N-dimensional maximum: per-input dimension counts and shape arrays (presumably
/// to allow broadcasting — confirm), untyped input/output buffers and a thread pool. Returns an xnn_status.
2933	enum xnn_status xnn_setup_maximum_nd_f16(
2934	xnn_operator_t maximum_op,
2935	size_t num_input1_dims,
2936	const size_t* input1_shape,
2937	size_t num_input2_dims,
2938	const size_t* input2_shape,
2939	const void* input1,
2940	const void* input2,
2941	void* output,
2942	pthreadpool_t threadpool);
2943
/// Create-stage prototype for the F16 N-dimensional minimum operator; it takes only flags.
/// Returns an xnn_status; handle presumably written through `minimum_op_out`.
2944	enum xnn_status xnn_create_minimum_nd_f16(
2945	uint32_t flags,
2946	xnn_operator_t* minimum_op_out);
2947
/// Setup-stage prototype for F16 N-dimensional minimum: per-input dimension counts and shape arrays (presumably
/// to allow broadcasting — confirm), untyped input/output buffers and a thread pool. Returns an xnn_status.
2948	enum xnn_status xnn_setup_minimum_nd_f16(
2949	xnn_operator_t minimum_op,
2950	size_t num_input1_dims,
2951	const size_t* input1_shape,
2952	size_t num_input2_dims,
2953	const size_t* input2_shape,
2954	const void* input1,
2955	const void* input2,
2956	void* output,
2957	pthreadpool_t threadpool);
2958
/// Create-stage prototype for the F16 N-dimensional multiply operator, parameterised by a `float` output range
/// and flags. Returns an xnn_status; handle presumably written through `multiply_op_out`.
2959	enum xnn_status xnn_create_multiply_nd_f16(
2960	float output_min,
2961	float output_max,
2962	uint32_t flags,
2963	xnn_operator_t* multiply_op_out);
2964
/// Setup-stage prototype for F16 N-dimensional multiply: per-input dimension counts and shape arrays (presumably
/// to allow broadcasting — confirm), untyped input/output buffers and a thread pool. Returns an xnn_status.
2965	enum xnn_status xnn_setup_multiply_nd_f16(
2966	xnn_operator_t multiply_op,
2967	size_t num_input1_dims,
2968	const size_t* input1_shape,
2969	size_t num_input2_dims,
2970	const size_t* input2_shape,
2971	const void* input1,
2972	const void* input2,
2973	void* output,
2974	pthreadpool_t threadpool);
2975
2976	enum xnn_status xnn_create_negate_nc_f16(  // Constructor prototype for the f16 NC "negate" operator; returns enum xnn_status.
2977	size_t channels,
2978	size_t input_stride,  // NOTE(review): stride units (elements vs. bytes) are not visible in this header section - confirm.
2979	size_t output_stride,
2980	uint32_t flags,
2981	xnn_operator_t* negate_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
2982
2983	enum xnn_status xnn_setup_negate_nc_f16(  // Setup prototype for a "negate" operator handle; returns enum xnn_status.
2984	xnn_operator_t negate_op,
2985	size_t batch_size,
2986	const void* input,  // Untyped read-only pointer; presumably half-precision elements given the _f16 suffix (confirm).
2987	void* output,
2988	pthreadpool_t threadpool);
2989
2990	enum xnn_status xnn_create_prelu_nc_f16(  // Constructor prototype for the f16 NC "prelu" operator; returns enum xnn_status.
2991	size_t channels,
2992	size_t input_stride,
2993	size_t output_stride,
2994	const void* negative_slope,  // Passed by pointer, not by value; presumably an array of per-channel f16 slopes (confirm).
2995	uint32_t flags,
2996	xnn_caches_t caches,  // Unlike the other unary f16 constructors here, this one also takes an xnn_caches_t argument.
2997	xnn_operator_t* prelu_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
2998
2999	enum xnn_status xnn_setup_prelu_nc_f16(  // Setup prototype for a "prelu" operator handle; returns enum xnn_status.
3000	xnn_operator_t prelu_op,
3001	size_t batch_size,
3002	const void* input,  // Untyped read-only pointer; presumably half-precision elements given the _f16 suffix (confirm).
3003	void* output,
3004	pthreadpool_t threadpool);
3005
3006	enum xnn_status xnn_create_resize_bilinear2d_nhwc_f16(  // Constructor prototype for the f16 NHWC 2D bilinear-resize operator; returns enum xnn_status.
3007	size_t channels,
3008	size_t input_pixel_stride,  // NOTE(review): stride units (elements vs. bytes) are not visible in this header section - confirm.
3009	size_t output_pixel_stride,
3010	uint32_t flags,
3011	xnn_operator_t* resize_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3012
3013	enum xnn_status xnn_setup_resize_bilinear2d_nhwc_f16(  // Setup prototype for a bilinear-resize operator handle; returns enum xnn_status.
3014	xnn_operator_t resize_op,
3015	size_t batch_size,
3016	size_t input_height,
3017	size_t input_width,
3018	size_t output_height,  // Output spatial size is supplied at setup time rather than at creation time.
3019	size_t output_width,
3020	const void* input,
3021	void* output,
3022	pthreadpool_t threadpool);
3023
3024	enum xnn_status xnn_create_sigmoid_nc_f16(  // Constructor prototype for the f16 NC "sigmoid" operator; returns enum xnn_status.
3025	size_t channels,
3026	size_t input_stride,  // NOTE(review): stride units (elements vs. bytes) are not visible in this header section - confirm.
3027	size_t output_stride,
3028	uint32_t flags,
3029	xnn_operator_t* sigmoid_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3030
3031	enum xnn_status xnn_setup_sigmoid_nc_f16(  // Setup prototype for a "sigmoid" operator handle; returns enum xnn_status.
3032	xnn_operator_t sigmoid_op,
3033	size_t batch_size,
3034	const void* input,  // Untyped read-only pointer; presumably half-precision elements given the _f16 suffix (confirm).
3035	void* output,
3036	pthreadpool_t threadpool);
3037
3038	enum xnn_status xnn_create_softmax_nc_f16(  // Constructor prototype for the f16 NC "softmax" operator; returns enum xnn_status.
3039	size_t channels,
3040	size_t input_stride,  // NOTE(review): stride units (elements vs. bytes) are not visible in this header section - confirm.
3041	size_t output_stride,
3042	uint32_t flags,
3043	xnn_operator_t* softmax_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3044
3045	enum xnn_status xnn_setup_softmax_nc_f16(  // Setup prototype for a "softmax" operator handle; returns enum xnn_status.
3046	xnn_operator_t softmax_op,
3047	size_t batch_size,
3048	const void* input,  // Untyped read-only pointer; presumably half-precision elements given the _f16 suffix (confirm).
3049	void* output,
3050	pthreadpool_t threadpool);
3051
3052	enum xnn_status xnn_create_square_nc_f16(  // Constructor prototype for the f16 NC "square" operator; returns enum xnn_status.
3053	size_t channels,
3054	size_t input_stride,  // NOTE(review): stride units (elements vs. bytes) are not visible in this header section - confirm.
3055	size_t output_stride,
3056	uint32_t flags,
3057	xnn_operator_t* square_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3058
3059	enum xnn_status xnn_setup_square_nc_f16(  // Setup prototype for a "square" operator handle; returns enum xnn_status.
3060	xnn_operator_t square_op,
3061	size_t batch_size,
3062	const void* input,  // Untyped read-only pointer; presumably half-precision elements given the _f16 suffix (confirm).
3063	void* output,
3064	pthreadpool_t threadpool);
3065
3066	enum xnn_status xnn_create_square_root_nc_f16(  // Constructor prototype for the f16 NC "square_root" operator; returns enum xnn_status.
3067	size_t channels,
3068	size_t input_stride,  // NOTE(review): stride units (elements vs. bytes) are not visible in this header section - confirm.
3069	size_t output_stride,
3070	uint32_t flags,
3071	xnn_operator_t* sqrt_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3072
3073	enum xnn_status xnn_setup_square_root_nc_f16(  // Setup prototype for a "square_root" operator handle; returns enum xnn_status.
3074	xnn_operator_t sqrt_op,
3075	size_t batch_size,
3076	const void* input,  // Untyped read-only pointer; presumably half-precision elements given the _f16 suffix (confirm).
3077	void* output,
3078	pthreadpool_t threadpool);
3079
3080	enum xnn_status xnn_create_squared_difference_nd_f16(  // Constructor prototype for the f16 N-D "squared_difference" operator; takes only flags.
3081	uint32_t flags,
3082	xnn_operator_t* squared_difference_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3083
3084	enum xnn_status xnn_setup_squared_difference_nd_f16(  // Setup prototype for a "squared_difference" operator handle: two shaped inputs, one output.
3085	xnn_operator_t squared_difference_op,
3086	size_t num_input1_dims,  // Presumably the number of entries in input1_shape (bounded by XNN_MAX_TENSOR_DIMS?) - TODO confirm.
3087	const size_t* input1_shape,
3088	size_t num_input2_dims,  // Presumably the number of entries in input2_shape - TODO confirm.
3089	const size_t* input2_shape,
3090	const void* input1,  // Untyped pointers in the f16 API; element type is not expressed in the signature.
3091	const void* input2,
3092	void* output,
3093	pthreadpool_t threadpool);
3094
3095	enum xnn_status xnn_create_subtract_nd_f16(  // Constructor prototype for the f16 N-D "subtract" operator; returns enum xnn_status.
3096	float output_min,  // Bounds are passed as float even in this f16 variant; presumably an output clamp range (confirm).
3097	float output_max,
3098	uint32_t flags,
3099	xnn_operator_t* subtract_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3100
3101	enum xnn_status xnn_setup_subtract_nd_f16(  // Setup prototype for a "subtract" operator handle: two separately shaped inputs, one output.
3102	xnn_operator_t subtract_op,
3103	size_t num_input1_dims,  // Presumably the number of entries in input1_shape (bounded by XNN_MAX_TENSOR_DIMS?) - TODO confirm.
3104	const size_t* input1_shape,
3105	size_t num_input2_dims,  // Presumably the number of entries in input2_shape - TODO confirm.
3106	const size_t* input2_shape,
3107	const void* input1,  // NOTE(review): operand order matters for subtraction; presumably output = input1 - input2 (confirm).
3108	const void* input2,
3109	void* output,
3110	pthreadpool_t threadpool);
3111
3112	enum xnn_status xnn_create_truncation_nc_f16(  // Constructor prototype for the f16 NC "truncation" operator; returns enum xnn_status.
3113	size_t channels,
3114	size_t input_stride,  // NOTE(review): stride units (elements vs. bytes) are not visible in this header section - confirm.
3115	size_t output_stride,
3116	uint32_t flags,
3117	xnn_operator_t* truncation_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3118
3119	enum xnn_status xnn_setup_truncation_nc_f16(  // Setup prototype for a "truncation" operator handle; returns enum xnn_status.
3120	xnn_operator_t truncation_op,
3121	size_t batch_size,
3122	const void* input,  // Untyped read-only pointer; presumably half-precision elements given the _f16 suffix (confirm).
3123	void* output,
3124	pthreadpool_t threadpool);
3125
3126	#ifndef XNN_NO_NCHW_OPERATORS
3127
3128	enum xnn_status xnn_create_convolution2d_nchw_f16(  // Constructor prototype for the f16 NCHW 2D convolution operator; only declared when XNN_NO_NCHW_OPERATORS is not defined.
3129	uint32_t input_padding_top,  // Padding is given per edge in top, right, bottom, left order.
3130	uint32_t input_padding_right,
3131	uint32_t input_padding_bottom,
3132	uint32_t input_padding_left,
3133	uint32_t kernel_height,
3134	uint32_t kernel_width,
3135	uint32_t subsampling_height,
3136	uint32_t subsampling_width,
3137	uint32_t dilation_height,
3138	uint32_t dilation_width,
3139	uint32_t groups,
3140	size_t group_input_channels,  // Channel counts are expressed per group, not as totals.
3141	size_t group_output_channels,
3142	size_t input_channel_stride,
3143	size_t output_channel_stride,
3144	const void* kernel,  // Untyped weights; NOTE(review): expected layout and element type are not visible here - confirm.
3145	const void* bias,
3146	float output_min,
3147	float output_max,
3148	uint32_t flags,
3149	xnn_caches_t caches,
3150	xnn_operator_t* convolution_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3151
3152	enum xnn_status xnn_setup_convolution2d_nchw_f16(  // Setup prototype for an f16 NCHW convolution operator handle; returns enum xnn_status.
3153	xnn_operator_t convolution_op,
3154	size_t batch_size,
3155	size_t input_height,
3156	size_t input_width,
3157	const void* input,  // Untyped read-only pointer; presumably half-precision elements given the _f16 suffix (confirm).
3158	void* output,
3159	pthreadpool_t threadpool);
3160
3161	enum xnn_status xnn_create_depth_to_space_nchw2nhwc_x16(  // Constructor prototype for the x16 depth-to-space operator; the name indicates an NCHW-to-NHWC layout change.
3162	size_t output_channels,  // Parameterized by output (not input) channel count.
3163	size_t input_channel_stride,
3164	size_t output_channel_stride,
3165	uint32_t block_size,
3166	uint32_t flags,
3167	xnn_operator_t* depth_to_space_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3168
3169	enum xnn_status xnn_setup_depth_to_space_nchw2nhwc_x16(  // Setup prototype for an x16 NCHW-to-NHWC depth-to-space operator handle.
3170	xnn_operator_t depth_to_space_op,
3171	size_t batch_size,
3172	size_t input_height,
3173	size_t input_width,
3174	const void* input,  // Untyped pointer; presumably 16-bit elements given the _x16 suffix (confirm).
3175	void* output,
3176	pthreadpool_t threadpool);
3177
3178	enum xnn_status xnn_create_global_average_pooling_ncw_f16(  // Constructor prototype for the f16 NCW global-average-pooling operator; returns enum xnn_status.
3179	size_t channels,  // Unlike the NWC variants in this file, no input/output stride parameters are taken here.
3180	float output_min,
3181	float output_max,
3182	uint32_t flags,
3183	xnn_operator_t* global_average_pooling_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3184
3185	enum xnn_status xnn_setup_global_average_pooling_ncw_f16(  // Setup prototype for an f16 NCW global-average-pooling operator handle.
3186	xnn_operator_t global_average_pooling_op,
3187	size_t batch_size,
3188	size_t width,  // Presumably the extent of the W axis that is averaged over (confirm).
3189	const void* input,
3190	void* output,
3191	pthreadpool_t threadpool);
3192
3193	enum xnn_status xnn_create_resize_bilinear2d_nchw_f16(  // Constructor prototype for the f16 NCHW 2D bilinear-resize operator; returns enum xnn_status.
3194	size_t channels,
3195	size_t input_pixel_stride,  // NOTE(review): named "pixel stride" although the layout is NCHW - confirm intended meaning.
3196	size_t output_pixel_stride,
3197	uint32_t flags,
3198	xnn_operator_t* resize_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3199
3200	enum xnn_status xnn_setup_resize_bilinear2d_nchw_f16(  // Setup prototype for an f16 NCHW bilinear-resize operator handle; returns enum xnn_status.
3201	xnn_operator_t resize_op,
3202	size_t batch_size,
3203	size_t input_height,
3204	size_t input_width,
3205	size_t output_height,  // Output spatial size is supplied at setup time rather than at creation time.
3206	size_t output_width,
3207	const void* input,
3208	void* output,
3209	pthreadpool_t threadpool);
3210
3211	#endif // XNN_NO_NCHW_OPERATORS
3212
3213	#endif // XNN_NO_F16_OPERATORS
3214
3215	#ifndef XNN_NO_X16_OPERATORS
3216
3217	enum xnn_status xnn_create_constant_pad_nd_x16(  // Constructor prototype for the x16 N-D constant-pad operator; only declared when XNN_NO_X16_OPERATORS is not defined.
3218	const void* padding_value,  // Passed by pointer; presumably points at a single 16-bit fill value (confirm).
3219	uint32_t flags,
3220	xnn_operator_t* constant_pad_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3221
3222	enum xnn_status xnn_setup_constant_pad_nd_x16(  // Setup prototype for an x16 constant-pad operator handle; returns enum xnn_status.
3223	xnn_operator_t constant_pad_op,
3224	size_t num_dims,  // Presumably the length of input_shape, pre_padding and post_padding alike - TODO confirm.
3225	const size_t* input_shape,
3226	const size_t* pre_padding,
3227	const size_t* post_padding,
3228	const void* input,
3229	void* output,
3230	pthreadpool_t threadpool);
3231
3232	enum xnn_status xnn_run_constant_pad_nd_x16(  // "run" prototype: takes no operator handle, only flags, shapes, buffers and the padding value; presumably a one-shot variant (confirm).
3233	uint32_t flags,
3234	size_t num_dims,
3235	const size_t* input_shape,
3236	const size_t* pre_paddings,  // NOTE(review): plural here, but named pre_padding/post_padding in xnn_setup_constant_pad_nd_x16.
3237	const size_t* post_paddings,
3238	const void* input,
3239	void* output,
3240	const void* padding_value,
3241	pthreadpool_t threadpool);
3242
3243	enum xnn_status xnn_create_copy_nc_x16(  // Constructor prototype for the x16 NC "copy" operator; returns enum xnn_status.
3244	size_t channels,
3245	size_t input_stride,  // NOTE(review): stride units (elements vs. bytes) are not visible in this header section - confirm.
3246	size_t output_stride,
3247	uint32_t flags,
3248	xnn_operator_t* copy_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3249
3250	enum xnn_status xnn_setup_copy_nc_x16(  // Setup prototype for an x16 "copy" operator handle; returns enum xnn_status.
3251	xnn_operator_t copy_op,
3252	size_t batch_size,
3253	const void* input,  // Untyped pointer; presumably 16-bit elements given the _x16 suffix (confirm).
3254	void* output,
3255	pthreadpool_t threadpool);
3256
3257	enum xnn_status xnn_create_depth_to_space_nhwc_x16(  // Constructor prototype for the x16 NHWC depth-to-space operator; returns enum xnn_status.
3258	size_t output_channels,  // Parameterized by output channel count (the space_to_depth counterpart takes input_channels).
3259	size_t input_channel_stride,
3260	size_t output_channel_stride,
3261	uint32_t block_size,
3262	uint32_t flags,
3263	xnn_operator_t* depth_to_space_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3264
3265	enum xnn_status xnn_setup_depth_to_space_nhwc_x16(  // Setup prototype for an x16 NHWC depth-to-space operator handle.
3266	xnn_operator_t depth_to_space_op,
3267	size_t batch_size,
3268	size_t input_height,
3269	size_t input_width,
3270	const void* input,  // Untyped pointer; presumably 16-bit elements given the _x16 suffix (confirm).
3271	void* output,
3272	pthreadpool_t threadpool);
3273
3274	enum xnn_status xnn_create_slice_nd_x16(  // Constructor prototype for the x16 N-D "slice" operator; takes only flags and returns enum xnn_status.
3275	uint32_t flags,
3276	xnn_operator_t* slice_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3277
3278	enum xnn_status xnn_setup_slice_nd_x16(  // Setup prototype for an x16 "slice" operator handle; returns enum xnn_status.
3279	xnn_operator_t slice_op,
3280	size_t num_dims,  // Presumably the length of input_shape, offsets and sizes alike - TODO confirm.
3281	const size_t* input_shape,
3282	const size_t* offsets,  // Presumably the per-dimension start index of the slice (confirm).
3283	const size_t* sizes,  // Presumably the per-dimension extent of the slice (confirm).
3284	const void* input,
3285	void* output,
3286	pthreadpool_t threadpool);
3287
3288	enum xnn_status xnn_create_space_to_depth_nhwc_x16(  // Constructor prototype for the x16 NHWC space-to-depth operator; returns enum xnn_status.
3289	size_t input_channels,  // Parameterized by input channel count (the depth_to_space counterpart takes output_channels).
3290	size_t input_channel_stride,
3291	size_t output_channel_stride,
3292	uint32_t block_size,
3293	uint32_t flags,
3294	xnn_operator_t* space_to_depth_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3295
3296	enum xnn_status xnn_setup_space_to_depth_nhwc_x16(  // Setup prototype for an x16 NHWC space-to-depth operator handle.
3297	xnn_operator_t space_to_depth_op,
3298	size_t batch_size,
3299	size_t input_height,
3300	size_t input_width,
3301	const void* input,  // Untyped pointer; presumably 16-bit elements given the _x16 suffix (confirm).
3302	void* output,
3303	pthreadpool_t threadpool);
3304
3305	enum xnn_status xnn_create_transpose_nd_x16(  // Constructor prototype for the x16 N-D "transpose" operator; takes only flags and returns enum xnn_status.
3306	uint32_t flags,
3307	xnn_operator_t* transpose_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3308
3309	enum xnn_status xnn_setup_transpose_nd_x16(  // Setup prototype for an x16 "transpose" operator handle; returns enum xnn_status.
3310	xnn_operator_t transpose_op,
3311	const void* input,  // Note the argument order: buffers come before the shape arguments in this prototype.
3312	void* output,
3313	size_t num_dims,  // By-value parameter, so it carries no const qualifier (same as num_dims in the slice/constant_pad prototypes).
3314	const size_t* input_shape,
3315	const size_t* output_perm,  // Presumably a permutation of the dimension indices 0..num_dims-1 (confirm).
3316	pthreadpool_t threadpool);
3317
3318	enum xnn_status xnn_run_transpose_nd_x16(  // "run" prototype: takes no operator handle, only buffers, shape, permutation and flags; presumably a one-shot variant (confirm).
3319	const void* input,
3320	void* output,
3321	size_t num_dims,  // By-value parameter, so it carries no const qualifier (same as num_dims in the slice/constant_pad prototypes).
3322	const size_t* input_shape,
3323	const size_t* output_perm,  // Presumably a permutation of the dimension indices 0..num_dims-1 (confirm).
3324	uint32_t flags,
3325	pthreadpool_t threadpool);
3326
3327	#endif // XNN_NO_X16_OPERATORS
3328
3329	#ifndef XNN_NO_QC8_OPERATORS
3330
3331	enum xnn_status xnn_create_convolution2d_nhwc_qc8(  // Constructor prototype for the qc8 NHWC 2D convolution operator; only declared when XNN_NO_QC8_OPERATORS is not defined.
3332	uint32_t input_padding_top,  // Padding is given per edge in top, right, bottom, left order.
3333	uint32_t input_padding_right,
3334	uint32_t input_padding_bottom,
3335	uint32_t input_padding_left,
3336	uint32_t kernel_height,
3337	uint32_t kernel_width,
3338	uint32_t subsampling_height,
3339	uint32_t subsampling_width,
3340	uint32_t dilation_height,
3341	uint32_t dilation_width,
3342	uint32_t groups,
3343	size_t group_input_channels,  // Channel counts are expressed per group, not as totals.
3344	size_t group_output_channels,
3345	size_t input_channel_stride,
3346	size_t output_channel_stride,
3347	int8_t input_zero_point,
3348	float input_scale,
3349	const float* kernel_scale,  // A pointer here, unlike the scalar float kernel_scale of xnn_create_convolution2d_nhwc_qs8; presumably one scale per output channel (confirm).
3350	const int8_t* kernel,
3351	const int32_t* bias,
3352	int8_t output_zero_point,
3353	float output_scale,
3354	int8_t output_min,
3355	int8_t output_max,
3356	uint32_t flags,
3357	xnn_caches_t caches,
3358	xnn_operator_t* convolution_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3359
3360	enum xnn_status xnn_setup_convolution2d_nhwc_qc8(  // Setup prototype for a qc8 NHWC convolution operator handle; returns enum xnn_status.
3361	xnn_operator_t convolution_op,
3362	size_t batch_size,
3363	size_t input_height,
3364	size_t input_width,
3365	const int8_t* input,  // Signed 8-bit data on both sides.
3366	int8_t* output,
3367	pthreadpool_t threadpool);
3368
3369	#endif // XNN_NO_QC8_OPERATORS
3370
3371	#ifndef XNN_NO_QS8_OPERATORS
3372
3373	enum xnn_status xnn_create_add_nd_qs8(  // Constructor prototype for the qs8 N-D "add" operator; only declared when XNN_NO_QS8_OPERATORS is not defined.
3374	int8_t input1_zero_point,  // Each input and the output carries its own (zero_point, scale) pair.
3375	float input1_scale,
3376	int8_t input2_zero_point,
3377	float input2_scale,
3378	int8_t output_zero_point,
3379	float output_scale,
3380	int8_t output_min,  // Bounds are given in the int8_t output domain; presumably a clamp range (confirm).
3381	int8_t output_max,
3382	uint32_t flags,
3383	xnn_operator_t* add_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3384
3385	enum xnn_status xnn_setup_add_nd_qs8(  // Setup prototype for a qs8 "add" operator handle: two separately shaped int8_t inputs, one int8_t output.
3386	xnn_operator_t add_op,
3387	size_t num_input1_dims,  // Presumably the number of entries in input1_shape (bounded by XNN_MAX_TENSOR_DIMS?) - TODO confirm.
3388	const size_t* input1_shape,
3389	size_t num_input2_dims,  // Presumably the number of entries in input2_shape - TODO confirm.
3390	const size_t* input2_shape,
3391	const int8_t* input1,
3392	const int8_t* input2,
3393	int8_t* output,
3394	pthreadpool_t threadpool);
3395
3396	enum xnn_status xnn_run_add_nd_qs8(  // "run" prototype: takes no operator handle; combines the quantization arguments of create with the shape/buffer arguments of setup. Presumably a one-shot variant (confirm).
3397	size_t num_input1_dims,
3398	const size_t* input1_shape,
3399	int8_t input1_zero_point,  // Quantization parameters are interleaved with the matching input's shape in this prototype.
3400	float input1_scale,
3401	size_t num_input2_dims,
3402	const size_t* input2_shape,
3403	int8_t input2_zero_point,
3404	float input2_scale,
3405	const int8_t* input1,
3406	const int8_t* input2,
3407	int8_t* output,
3408	int8_t output_zero_point,
3409	float output_scale,
3410	int8_t output_min,
3411	int8_t output_max,
3412	uint32_t flags,
3413	pthreadpool_t threadpool);
3414
3415	enum xnn_status xnn_create_convolution2d_nhwc_qs8(  // Constructor prototype for the qs8 NHWC 2D convolution operator; returns enum xnn_status.
3416	uint32_t input_padding_top,  // Padding is given per edge in top, right, bottom, left order.
3417	uint32_t input_padding_right,
3418	uint32_t input_padding_bottom,
3419	uint32_t input_padding_left,
3420	uint32_t kernel_height,
3421	uint32_t kernel_width,
3422	uint32_t subsampling_height,
3423	uint32_t subsampling_width,
3424	uint32_t dilation_height,
3425	uint32_t dilation_width,
3426	uint32_t groups,
3427	size_t group_input_channels,  // Channel counts are expressed per group, not as totals.
3428	size_t group_output_channels,
3429	size_t input_channel_stride,
3430	size_t output_channel_stride,
3431	int8_t input_zero_point,
3432	float input_scale,
3433	float kernel_scale,  // A single scalar scale; there is no kernel_zero_point parameter in the qs8 variant.
3434	const int8_t* kernel,
3435	const int32_t* bias,
3436	int8_t output_zero_point,
3437	float output_scale,
3438	int8_t output_min,
3439	int8_t output_max,
3440	uint32_t flags,
3441	xnn_caches_t caches,
3442	xnn_operator_t* convolution_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3443
3444	enum xnn_status xnn_setup_convolution2d_nhwc_qs8(  // Setup prototype for a qs8 NHWC convolution operator handle; returns enum xnn_status.
3445	xnn_operator_t convolution_op,
3446	size_t batch_size,
3447	size_t input_height,
3448	size_t input_width,
3449	const int8_t* input,  // Signed 8-bit data on both sides.
3450	int8_t* output,
3451	pthreadpool_t threadpool);
3452
3453	enum xnn_status xnn_create_deconvolution2d_nhwc_qs8(  // Constructor prototype for the qs8 NHWC 2D deconvolution operator; returns enum xnn_status.
3454	uint32_t output_padding_top,  // Padding is named output_padding_* here, whereas the convolution prototypes use input_padding_*.
3455	uint32_t output_padding_right,
3456	uint32_t output_padding_bottom,
3457	uint32_t output_padding_left,
3458	uint32_t kernel_height,
3459	uint32_t kernel_width,
3460	uint32_t stride_height,
3461	uint32_t stride_width,
3462	uint32_t dilation_height,
3463	uint32_t dilation_width,
3464	uint32_t groups,
3465	size_t group_input_channels,  // Channel counts are expressed per group, not as totals.
3466	size_t group_output_channels,
3467	size_t input_pixel_stride,
3468	size_t output_pixel_stride,
3469	int8_t input_zero_point,
3470	float input_scale,
3471	float kernel_scale,
3472	const int8_t* kernel,
3473	const int32_t* bias,
3474	int8_t output_zero_point,
3475	float output_scale,
3476	int8_t output_min,
3477	int8_t output_max,
3478	uint32_t flags,
3479	xnn_caches_t caches,
3480	xnn_operator_t* deconvolution_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3481
3482	enum xnn_status xnn_setup_deconvolution2d_nhwc_qs8(  // Setup prototype for a qs8 NHWC deconvolution operator handle; returns enum xnn_status.
3483	xnn_operator_t deconvolution_op,
3484	size_t batch_size,
3485	size_t input_height,
3486	size_t input_width,
3487	uint32_t adjustment_height,  // NOTE(review): presumably extra output rows/columns added to the computed output size - confirm.
3488	uint32_t adjustment_width,
3489	const int8_t* input,
3490	int8_t* output,
3491	pthreadpool_t threadpool);
3492
3493	enum xnn_status xnn_create_elu_nc_qs8(  // Constructor prototype for the qs8 NC "elu" operator; returns enum xnn_status.
3494	size_t channels,
3495	size_t input_stride,
3496	size_t output_stride,
3497	float alpha,  // ELU parameter is given as a float even though the data is quantized int8_t.
3498	int8_t input_zero_point,
3499	float input_scale,
3500	int8_t output_zero_point,
3501	float output_scale,
3502	int8_t output_min,
3503	int8_t output_max,
3504	uint32_t flags,
3505	xnn_operator_t* elu_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3506
3507	enum xnn_status xnn_setup_elu_nc_qs8(  // Setup prototype for a qs8 "elu" operator handle; returns enum xnn_status.
3508	xnn_operator_t elu_op,
3509	size_t batch_size,
3510	const int8_t* input,  // Signed 8-bit data on both sides.
3511	int8_t* output,
3512	pthreadpool_t threadpool);
3513
3514	enum xnn_status xnn_create_fully_connected_nc_qs8(  // Constructor prototype for the qs8 NC fully-connected operator; returns enum xnn_status.
3515	size_t input_channels,
3516	size_t output_channels,
3517	size_t input_stride,
3518	size_t output_stride,
3519	int8_t input_zero_point,
3520	float input_scale,
3521	float kernel_scale,  // A single scalar scale; there is no kernel_zero_point parameter in the qs8 variant.
3522	const int8_t* kernel,  // NOTE(review): weight layout is not visible in this header section - confirm.
3523	const int32_t* bias,
3524	int8_t output_zero_point,
3525	float output_scale,
3526	int8_t output_min,
3527	int8_t output_max,
3528	uint32_t flags,
3529	xnn_caches_t caches,
3530	xnn_operator_t* fully_connected_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3531
3532	enum xnn_status xnn_setup_fully_connected_nc_qs8(  // Setup prototype for a qs8 fully-connected operator handle; returns enum xnn_status.
3533	xnn_operator_t fully_connected_op,
3534	size_t batch_size,
3535	const int8_t* input,  // Signed 8-bit data on both sides.
3536	int8_t* output,
3537	pthreadpool_t threadpool);
3538
3539	enum xnn_status xnn_create_global_average_pooling_nwc_qs8(  // Constructor prototype for the qs8 NWC global-average-pooling operator; returns enum xnn_status.
3540	size_t channels,
3541	size_t input_stride,
3542	size_t output_stride,
3543	int8_t input_zero_point,
3544	float input_scale,
3545	int8_t output_zero_point,
3546	float output_scale,
3547	int8_t output_min,  // Bounds are given in the int8_t output domain; presumably a clamp range (confirm).
3548	int8_t output_max,
3549	uint32_t flags,
3550	xnn_operator_t* global_average_pooling_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3551
3552	enum xnn_status xnn_setup_global_average_pooling_nwc_qs8(  // Setup prototype for a qs8 NWC global-average-pooling operator handle.
3553	xnn_operator_t global_average_pooling_op,
3554	size_t batch_size,
3555	size_t width,  // Presumably the extent of the W axis that is averaged over (confirm).
3556	const int8_t* input,
3557	int8_t* output,
3558	pthreadpool_t threadpool);
3559
3560	enum xnn_status xnn_create_multiply_nd_qs8(  // Constructor prototype for the qs8 N-D "multiply" operator; returns enum xnn_status.
3561	int8_t input1_zero_point,  // Each input and the output carries its own (zero_point, scale) pair.
3562	float input1_scale,
3563	int8_t input2_zero_point,
3564	float input2_scale,
3565	int8_t output_zero_point,
3566	float output_scale,
3567	int8_t output_min,  // Bounds are given in the int8_t output domain; presumably a clamp range (confirm).
3568	int8_t output_max,
3569	uint32_t flags,
3570	xnn_operator_t* multiply_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3571
3572	enum xnn_status xnn_setup_multiply_nd_qs8(  // Setup prototype for a qs8 "multiply" operator handle: two separately shaped int8_t inputs, one int8_t output.
3573	xnn_operator_t multiply_op,
3574	size_t num_input1_dims,  // Presumably the number of entries in input1_shape (bounded by XNN_MAX_TENSOR_DIMS?) - TODO confirm.
3575	const size_t* input1_shape,
3576	size_t num_input2_dims,  // Presumably the number of entries in input2_shape - TODO confirm.
3577	const size_t* input2_shape,
3578	const int8_t* input1,
3579	const int8_t* input2,
3580	int8_t* output,
3581	pthreadpool_t threadpool);
3582
3583	enum xnn_status xnn_run_multiply_nd_qs8(  // "run" prototype: takes no operator handle; combines the quantization arguments of create with the shape/buffer arguments of setup. Presumably a one-shot variant (confirm).
3584	size_t num_input1_dims,
3585	const size_t* input1_shape,
3586	int8_t input1_zero_point,  // Quantization parameters are interleaved with the matching input's shape in this prototype.
3587	float input1_scale,
3588	size_t num_input2_dims,
3589	const size_t* input2_shape,
3590	int8_t input2_zero_point,
3591	float input2_scale,
3592	const int8_t* input1,
3593	const int8_t* input2,
3594	int8_t* output,
3595	int8_t output_zero_point,
3596	float output_scale,
3597	int8_t output_min,
3598	int8_t output_max,
3599	uint32_t flags,
3600	pthreadpool_t threadpool);
3601
3602	enum xnn_status xnn_create_leaky_relu_nc_qs8(  // Constructor prototype for the qs8 NC "leaky_relu" operator; returns enum xnn_status.
3603	size_t channels,
3604	size_t input_stride,
3605	size_t output_stride,
3606	float negative_slope,  // A single scalar slope passed by value.
3607	int8_t input_zero_point,
3608	float input_scale,
3609	int8_t output_zero_point,
3610	float output_scale,  // Unlike the sigmoid/tanh/elu qs8 constructors, no output_min/output_max follow.
3611	uint32_t flags,
3612	xnn_operator_t* leaky_relu_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3613
3614	enum xnn_status xnn_setup_leaky_relu_nc_qs8(  // Setup prototype for a qs8 "leaky_relu" operator handle; returns enum xnn_status.
3615	xnn_operator_t leaky_relu_op,
3616	size_t batch_size,
3617	const int8_t* input,  // Signed 8-bit data on both sides.
3618	int8_t* output,
3619	pthreadpool_t threadpool);
3620
3621	enum xnn_status xnn_create_sigmoid_nc_qs8(  // Constructor prototype for the qs8 NC "sigmoid" operator; returns enum xnn_status.
3622	size_t channels,
3623	size_t input_stride,
3624	size_t output_stride,
3625	int8_t input_zero_point,
3626	float input_scale,
3627	int8_t output_zero_point,  // NOTE(review): whether only specific output quantization values are accepted is not visible here - confirm.
3628	float output_scale,
3629	int8_t output_min,
3630	int8_t output_max,
3631	uint32_t flags,
3632	xnn_operator_t* sigmoid_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3633
3634	enum xnn_status xnn_setup_sigmoid_nc_qs8(  // Setup prototype for a qs8 "sigmoid" operator handle; returns enum xnn_status.
3635	xnn_operator_t sigmoid_op,
3636	size_t batch_size,
3637	const int8_t* input,  // Signed 8-bit data on both sides.
3638	int8_t* output,
3639	pthreadpool_t threadpool);
3640
3641	enum xnn_status xnn_create_subtract_nd_qs8(  // Constructor prototype for the qs8 N-D "subtract" operator; returns enum xnn_status.
3642	int8_t input1_zero_point,  // Each input and the output carries its own (zero_point, scale) pair.
3643	float input1_scale,
3644	int8_t input2_zero_point,
3645	float input2_scale,
3646	int8_t output_zero_point,
3647	float output_scale,
3648	int8_t output_min,  // Bounds are given in the int8_t output domain; presumably a clamp range (confirm).
3649	int8_t output_max,
3650	uint32_t flags,
3651	xnn_operator_t* subtract_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3652
3653	enum xnn_status xnn_setup_subtract_nd_qs8(  // Setup prototype for a qs8 "subtract" operator handle: two separately shaped int8_t inputs, one int8_t output.
3654	xnn_operator_t subtract_op,
3655	size_t num_input1_dims,  // Presumably the number of entries in input1_shape (bounded by XNN_MAX_TENSOR_DIMS?) - TODO confirm.
3656	const size_t* input1_shape,
3657	size_t num_input2_dims,  // Presumably the number of entries in input2_shape - TODO confirm.
3658	const size_t* input2_shape,
3659	const int8_t* input1,  // NOTE(review): operand order matters for subtraction; presumably output = input1 - input2 (confirm).
3660	const int8_t* input2,
3661	int8_t* output,
3662	pthreadpool_t threadpool);
3663
3664	enum xnn_status xnn_run_subtract_nd_qs8(  // "run" prototype: takes no operator handle; combines the quantization arguments of create with the shape/buffer arguments of setup. Presumably a one-shot variant (confirm).
3665	size_t num_input1_dims,
3666	const size_t* input1_shape,
3667	int8_t input1_zero_point,  // Quantization parameters are interleaved with the matching input's shape in this prototype.
3668	float input1_scale,
3669	size_t num_input2_dims,
3670	const size_t* input2_shape,
3671	int8_t input2_zero_point,
3672	float input2_scale,
3673	const int8_t* input1,
3674	const int8_t* input2,
3675	int8_t* output,
3676	int8_t output_zero_point,
3677	float output_scale,
3678	int8_t output_min,
3679	int8_t output_max,
3680	uint32_t flags,
3681	pthreadpool_t threadpool);
3682
3683	enum xnn_status xnn_create_tanh_nc_qs8(  // Constructor prototype for the qs8 NC "tanh" operator; returns enum xnn_status.
3684	size_t channels,
3685	size_t input_stride,
3686	size_t output_stride,
3687	int8_t input_zero_point,
3688	float input_scale,
3689	int8_t output_zero_point,  // NOTE(review): whether only specific output quantization values are accepted is not visible here - confirm.
3690	float output_scale,
3691	int8_t output_min,
3692	int8_t output_max,
3693	uint32_t flags,
3694	xnn_operator_t* tanh_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3695
3696	enum xnn_status xnn_setup_tanh_nc_qs8(  // Setup prototype for a qs8 "tanh" operator handle; returns enum xnn_status.
3697	xnn_operator_t tanh_op,
3698	size_t batch_size,
3699	const int8_t* input,  // Signed 8-bit data on both sides.
3700	int8_t* output,
3701	pthreadpool_t threadpool);
3702
3703	#endif // XNN_NO_QS8_OPERATORS
3704
3705	#ifndef XNN_NO_QU8_OPERATORS
3706
3707	enum xnn_status xnn_create_add_nd_qu8(  // Constructor prototype for the qu8 N-D "add" operator; only declared when XNN_NO_QU8_OPERATORS is not defined.
3708	uint8_t input1_zero_point,  // Each input and the output carries its own (zero_point, scale) pair; zero points are unsigned here.
3709	float input1_scale,
3710	uint8_t input2_zero_point,
3711	float input2_scale,
3712	uint8_t output_zero_point,
3713	float output_scale,
3714	uint8_t output_min,  // Bounds are given in the uint8_t output domain; presumably a clamp range (confirm).
3715	uint8_t output_max,
3716	uint32_t flags,
3717	xnn_operator_t* add_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3718
3719	enum xnn_status xnn_setup_add_nd_qu8(  // Setup prototype for a qu8 "add" operator handle: two separately shaped uint8_t inputs, one uint8_t output.
3720	xnn_operator_t add_op,
3721	size_t num_input1_dims,  // Presumably the number of entries in input1_shape (bounded by XNN_MAX_TENSOR_DIMS?) - TODO confirm.
3722	const size_t* input1_shape,
3723	size_t num_input2_dims,  // Presumably the number of entries in input2_shape - TODO confirm.
3724	const size_t* input2_shape,
3725	const uint8_t* input1,
3726	const uint8_t* input2,
3727	uint8_t* output,
3728	pthreadpool_t threadpool);
3729
3730	enum xnn_status xnn_run_add_nd_qu8(  // "run" prototype: takes no operator handle; combines the quantization arguments of create with the shape/buffer arguments of setup. Presumably a one-shot variant (confirm).
3731	size_t num_input1_dims,
3732	const size_t* input1_shape,
3733	uint8_t input1_zero_point,  // Quantization parameters are interleaved with the matching input's shape in this prototype.
3734	float input1_scale,
3735	size_t num_input2_dims,
3736	const size_t* input2_shape,
3737	uint8_t input2_zero_point,
3738	float input2_scale,
3739	const uint8_t* input1,
3740	const uint8_t* input2,
3741	uint8_t* output,
3742	uint8_t output_zero_point,
3743	float output_scale,
3744	uint8_t output_min,
3745	uint8_t output_max,
3746	uint32_t flags,
3747	pthreadpool_t threadpool);
3748
3749	enum xnn_status xnn_create_average_pooling2d_nhwc_qu8(  // Constructor prototype for the qu8 NHWC 2D average-pooling operator; returns enum xnn_status.
3750	uint32_t input_padding_top,  // Padding is given per edge in top, right, bottom, left order.
3751	uint32_t input_padding_right,
3752	uint32_t input_padding_bottom,
3753	uint32_t input_padding_left,
3754	uint32_t pooling_height,
3755	uint32_t pooling_width,
3756	uint32_t stride_height,
3757	uint32_t stride_width,  // No dilation parameters follow, unlike the max_pooling2d constructors in this file.
3758	size_t channels,
3759	size_t input_pixel_stride,
3760	size_t output_pixel_stride,
3761	uint8_t input_zero_point,
3762	float input_scale,
3763	uint8_t output_zero_point,
3764	float output_scale,
3765	uint8_t output_min,
3766	uint8_t output_max,
3767	uint32_t flags,
3768	xnn_operator_t* average_pooling_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3769
3770	enum xnn_status xnn_setup_average_pooling2d_nhwc_qu8(  // Setup prototype for a qu8 NHWC average-pooling operator handle; returns enum xnn_status.
3771	xnn_operator_t average_pooling_op,
3772	size_t batch_size,
3773	size_t input_height,
3774	size_t input_width,
3775	const uint8_t* input,  // Unsigned 8-bit data on both sides.
3776	uint8_t* output,
3777	pthreadpool_t threadpool);
3778
3779	enum xnn_status xnn_create_convolution2d_nhwc_qu8(  // Constructor prototype for the qu8 NHWC 2D convolution operator; returns enum xnn_status.
3780	uint32_t input_padding_top,  // Padding is given per edge in top, right, bottom, left order.
3781	uint32_t input_padding_right,
3782	uint32_t input_padding_bottom,
3783	uint32_t input_padding_left,
3784	uint32_t kernel_height,
3785	uint32_t kernel_width,
3786	uint32_t subsampling_height,
3787	uint32_t subsampling_width,
3788	uint32_t dilation_height,
3789	uint32_t dilation_width,
3790	uint32_t groups,
3791	size_t group_input_channels,  // Channel counts are expressed per group, not as totals.
3792	size_t group_output_channels,
3793	size_t input_channel_stride,
3794	size_t output_channel_stride,
3795	uint8_t input_zero_point,
3796	float input_scale,
3797	uint8_t kernel_zero_point,  // The qu8 variant takes a kernel zero point; the qs8 convolution constructor has no such parameter.
3798	float kernel_scale,
3799	const uint8_t* kernel,
3800	const int32_t* bias,
3801	uint8_t output_zero_point,
3802	float output_scale,
3803	uint8_t output_min,
3804	uint8_t output_max,
3805	uint32_t flags,
3806	xnn_caches_t caches,
3807	xnn_operator_t* convolution_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3808
3809	enum xnn_status xnn_setup_convolution2d_nhwc_qu8(  // Setup prototype for a qu8 NHWC convolution operator handle; returns enum xnn_status.
3810	xnn_operator_t convolution_op,
3811	size_t batch_size,
3812	size_t input_height,
3813	size_t input_width,
3814	const uint8_t* input,  // Unsigned 8-bit data on both sides.
3815	uint8_t* output,
3816	pthreadpool_t threadpool);
3817
3818	enum xnn_status xnn_create_deconvolution2d_nhwc_qu8(  // Constructor prototype for the qu8 NHWC 2D deconvolution operator; returns enum xnn_status.
3819	uint32_t output_padding_top,  // Padding is named output_padding_* here, whereas the convolution prototypes use input_padding_*.
3820	uint32_t output_padding_right,
3821	uint32_t output_padding_bottom,
3822	uint32_t output_padding_left,
3823	uint32_t kernel_height,
3824	uint32_t kernel_width,
3825	uint32_t stride_height,
3826	uint32_t stride_width,
3827	uint32_t dilation_height,
3828	uint32_t dilation_width,
3829	uint32_t groups,
3830	size_t group_input_channels,  // Channel counts are expressed per group, not as totals.
3831	size_t group_output_channels,
3832	size_t input_pixel_stride,
3833	size_t output_pixel_stride,
3834	uint8_t input_zero_point,
3835	float input_scale,
3836	uint8_t kernel_zero_point,  // The qu8 variant takes a kernel zero point; the qs8 deconvolution constructor has no such parameter.
3837	float kernel_scale,
3838	const uint8_t* kernel,
3839	const int32_t* bias,
3840	uint8_t output_zero_point,
3841	float output_scale,
3842	uint8_t output_min,
3843	uint8_t output_max,
3844	uint32_t flags,
3845	xnn_caches_t caches,
3846	xnn_operator_t* deconvolution_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3847
3848	enum xnn_status xnn_setup_deconvolution2d_nhwc_qu8(  // Setup prototype for a qu8 NHWC deconvolution operator handle; returns enum xnn_status.
3849	xnn_operator_t deconvolution_op,
3850	size_t batch_size,
3851	size_t input_height,
3852	size_t input_width,
3853	uint32_t adjustment_height,  // NOTE(review): presumably extra output rows/columns added to the computed output size - confirm.
3854	uint32_t adjustment_width,
3855	const uint8_t* input,
3856	uint8_t* output,
3857	pthreadpool_t threadpool);
3858
3859	enum xnn_status xnn_create_fully_connected_nc_qu8(  // Constructor prototype for the qu8 NC fully-connected operator; returns enum xnn_status.
3860	size_t input_channels,
3861	size_t output_channels,
3862	size_t input_stride,
3863	size_t output_stride,
3864	uint8_t input_zero_point,
3865	float input_scale,
3866	uint8_t kernel_zero_point,  // The qu8 variant takes a kernel zero point; the qs8 fully-connected constructor has no such parameter.
3867	float kernel_scale,
3868	const uint8_t* kernel,  // NOTE(review): weight layout is not visible in this header section - confirm.
3869	const int32_t* bias,
3870	uint8_t output_zero_point,
3871	float output_scale,
3872	uint8_t output_min,
3873	uint8_t output_max,
3874	uint32_t flags,
3875	xnn_caches_t caches,
3876	xnn_operator_t* fully_connected_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3877
3878	enum xnn_status xnn_setup_fully_connected_nc_qu8(  // Setup prototype for a qu8 fully-connected operator handle; returns enum xnn_status.
3879	xnn_operator_t fully_connected_op,
3880	size_t batch_size,
3881	const uint8_t* input,  // Unsigned 8-bit data on both sides.
3882	uint8_t* output,
3883	pthreadpool_t threadpool);
3884
3885	enum xnn_status xnn_create_global_average_pooling_nwc_qu8(  // Constructor prototype for the qu8 NWC global-average-pooling operator; returns enum xnn_status.
3886	size_t channels,
3887	size_t input_stride,
3888	size_t output_stride,
3889	uint8_t input_zero_point,
3890	float input_scale,
3891	uint8_t output_zero_point,
3892	float output_scale,
3893	uint8_t output_min,  // Bounds are given in the uint8_t output domain; presumably a clamp range (confirm).
3894	uint8_t output_max,
3895	uint32_t flags,
3896	xnn_operator_t* global_average_pooling_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3897
3898	enum xnn_status xnn_setup_global_average_pooling_nwc_qu8(  // Setup prototype for a qu8 NWC global-average-pooling operator handle.
3899	xnn_operator_t global_average_pooling_op,
3900	size_t batch_size,
3901	size_t width,  // Presumably the extent of the W axis that is averaged over (confirm).
3902	const uint8_t* input,
3903	uint8_t* output,
3904	pthreadpool_t threadpool);
3905
3906	enum xnn_status xnn_create_leaky_relu_nc_qu8(  // Constructor prototype for the qu8 NC "leaky_relu" operator; returns enum xnn_status.
3907	size_t channels,
3908	size_t input_stride,
3909	size_t output_stride,
3910	float negative_slope,  // A single scalar slope passed by value.
3911	uint8_t input_zero_point,
3912	float input_scale,
3913	uint8_t output_zero_point,
3914	float output_scale,  // Unlike the sigmoid/tanh qu8 constructors, no output_min/output_max follow.
3915	uint32_t flags,
3916	xnn_operator_t* leaky_relu_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3917
3918	enum xnn_status xnn_setup_leaky_relu_nc_qu8(  // Setup prototype for a qu8 "leaky_relu" operator handle; returns enum xnn_status.
3919	xnn_operator_t leaky_relu_op,
3920	size_t batch_size,
3921	const uint8_t* input,  // Unsigned 8-bit data on both sides.
3922	uint8_t* output,
3923	pthreadpool_t threadpool);
3924
3925	enum xnn_status xnn_create_multiply_nd_qu8(  // Constructor prototype for the qu8 N-D "multiply" operator; returns enum xnn_status.
3926	uint8_t input1_zero_point,  // Each input and the output carries its own (zero_point, scale) pair; zero points are unsigned here.
3927	float input1_scale,
3928	uint8_t input2_zero_point,
3929	float input2_scale,
3930	uint8_t output_zero_point,
3931	float output_scale,
3932	uint8_t output_min,  // Bounds are given in the uint8_t output domain; presumably a clamp range (confirm).
3933	uint8_t output_max,
3934	uint32_t flags,
3935	xnn_operator_t* multiply_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3936
3937	enum xnn_status xnn_setup_multiply_nd_qu8(  // Setup prototype for a qu8 "multiply" operator handle: two separately shaped uint8_t inputs, one uint8_t output.
3938	xnn_operator_t multiply_op,
3939	size_t num_input1_dims,  // Presumably the number of entries in input1_shape (bounded by XNN_MAX_TENSOR_DIMS?) - TODO confirm.
3940	const size_t* input1_shape,
3941	size_t num_input2_dims,  // Presumably the number of entries in input2_shape - TODO confirm.
3942	const size_t* input2_shape,
3943	const uint8_t* input1,
3944	const uint8_t* input2,
3945	uint8_t* output,
3946	pthreadpool_t threadpool);
3947
3948	enum xnn_status xnn_run_multiply_nd_qu8(  // "run" prototype: takes no operator handle; combines the quantization arguments of create with the shape/buffer arguments of setup. Presumably a one-shot variant (confirm).
3949	size_t num_input1_dims,
3950	const size_t* input1_shape,
3951	uint8_t input1_zero_point,  // Quantization parameters are interleaved with the matching input's shape in this prototype.
3952	float input1_scale,
3953	size_t num_input2_dims,
3954	const size_t* input2_shape,
3955	uint8_t input2_zero_point,
3956	float input2_scale,
3957	const uint8_t* input1,
3958	const uint8_t* input2,
3959	uint8_t* output,
3960	uint8_t output_zero_point,
3961	float output_scale,
3962	uint8_t output_min,
3963	uint8_t output_max,
3964	uint32_t flags,
3965	pthreadpool_t threadpool);
3966
3967	enum xnn_status xnn_create_sigmoid_nc_qu8(  // Constructor prototype for the qu8 NC "sigmoid" operator; returns enum xnn_status.
3968	size_t channels,
3969	size_t input_stride,
3970	size_t output_stride,
3971	uint8_t input_zero_point,
3972	float input_scale,
3973	uint8_t output_zero_point,  // NOTE(review): whether only specific output quantization values are accepted is not visible here - confirm.
3974	float output_scale,
3975	uint8_t output_min,
3976	uint8_t output_max,
3977	uint32_t flags,
3978	xnn_operator_t* sigmoid_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3979
3980	enum xnn_status xnn_setup_sigmoid_nc_qu8(  // Setup prototype for a qu8 "sigmoid" operator handle; returns enum xnn_status.
3981	xnn_operator_t sigmoid_op,
3982	size_t batch_size,
3983	const uint8_t* input,  // Unsigned 8-bit data on both sides.
3984	uint8_t* output,
3985	pthreadpool_t threadpool);
3986
3987	enum xnn_status xnn_create_softmax_nc_qu8(  // Constructor prototype for the qu8 NC "softmax" operator; returns enum xnn_status.
3988	size_t channels,
3989	size_t input_stride,
3990	size_t output_stride,
3991	float input_scale,  // Only a scale is taken for the input; this prototype has no input_zero_point parameter.
3992	uint8_t output_zero_point,
3993	float output_scale,  // No output_min/output_max parameters follow in this prototype.
3994	uint32_t flags,
3995	xnn_operator_t* softmax_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
3996
3997	enum xnn_status xnn_setup_softmax_nc_qu8(  // Setup prototype for a qu8 "softmax" operator handle; returns enum xnn_status.
3998	xnn_operator_t softmax_op,
3999	size_t batch_size,
4000	const uint8_t* input,  // Unsigned 8-bit data on both sides.
4001	uint8_t* output,
4002	pthreadpool_t threadpool);
4003
4004	enum xnn_status xnn_create_subtract_nd_qu8(  // Constructor prototype for the qu8 N-D "subtract" operator; returns enum xnn_status.
4005	uint8_t input1_zero_point,  // Each input and the output carries its own (zero_point, scale) pair; zero points are unsigned here.
4006	float input1_scale,
4007	uint8_t input2_zero_point,
4008	float input2_scale,
4009	uint8_t output_zero_point,
4010	float output_scale,
4011	uint8_t output_min,  // Bounds are given in the uint8_t output domain; presumably a clamp range (confirm).
4012	uint8_t output_max,
4013	uint32_t flags,
4014	xnn_operator_t* subtract_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
4015
4016	enum xnn_status xnn_setup_subtract_nd_qu8(  // Setup prototype for a qu8 "subtract" operator handle: two separately shaped uint8_t inputs, one uint8_t output.
4017	xnn_operator_t subtract_op,
4018	size_t num_input1_dims,  // Presumably the number of entries in input1_shape (bounded by XNN_MAX_TENSOR_DIMS?) - TODO confirm.
4019	const size_t* input1_shape,
4020	size_t num_input2_dims,  // Presumably the number of entries in input2_shape - TODO confirm.
4021	const size_t* input2_shape,
4022	const uint8_t* input1,  // NOTE(review): operand order matters for subtraction; presumably output = input1 - input2 (confirm).
4023	const uint8_t* input2,
4024	uint8_t* output,
4025	pthreadpool_t threadpool);
4026
4027	enum xnn_status xnn_run_subtract_nd_qu8(  // "run" prototype: takes no operator handle; combines the quantization arguments of create with the shape/buffer arguments of setup. Presumably a one-shot variant (confirm).
4028	size_t num_input1_dims,
4029	const size_t* input1_shape,
4030	uint8_t input1_zero_point,  // Quantization parameters are interleaved with the matching input's shape in this prototype.
4031	float input1_scale,
4032	size_t num_input2_dims,
4033	const size_t* input2_shape,
4034	uint8_t input2_zero_point,
4035	float input2_scale,
4036	const uint8_t* input1,
4037	const uint8_t* input2,
4038	uint8_t* output,
4039	uint8_t output_zero_point,
4040	float output_scale,
4041	uint8_t output_min,
4042	uint8_t output_max,
4043	uint32_t flags,
4044	pthreadpool_t threadpool);
4045
4046	enum xnn_status xnn_create_tanh_nc_qu8(  // Constructor prototype for the qu8 NC "tanh" operator; returns enum xnn_status.
4047	size_t channels,
4048	size_t input_stride,
4049	size_t output_stride,
4050	uint8_t input_zero_point,
4051	float input_scale,
4052	uint8_t output_zero_point,  // NOTE(review): whether only specific output quantization values are accepted is not visible here - confirm.
4053	float output_scale,
4054	uint8_t output_min,
4055	uint8_t output_max,
4056	uint32_t flags,
4057	xnn_operator_t* tanh_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
4058
4059	enum xnn_status xnn_setup_tanh_nc_qu8(  // Setup prototype for a qu8 "tanh" operator handle; returns enum xnn_status.
4060	xnn_operator_t tanh_op,
4061	size_t batch_size,
4062	const uint8_t* input,  // Unsigned 8-bit data on both sides.
4063	uint8_t* output,
4064	pthreadpool_t threadpool);
4065
4066	#endif // XNN_NO_QU8_OPERATORS
4067
4068	#ifndef XNN_NO_S8_OPERATORS
4069
4070	enum xnn_status xnn_create_clamp_nc_s8(  // Constructor prototype for the s8 NC "clamp" operator; only declared when XNN_NO_S8_OPERATORS is not defined.
4071	size_t channels,
4072	size_t input_stride,
4073	size_t output_stride,
4074	int8_t output_min,  // Bounds are raw int8_t values; no scale/zero-point parameters are taken by the s8 API.
4075	int8_t output_max,
4076	uint32_t flags,
4077	xnn_operator_t* clamp_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
4078
4079	enum xnn_status xnn_setup_clamp_nc_s8(  // Setup prototype for an s8 "clamp" operator handle; returns enum xnn_status.
4080	xnn_operator_t clamp_op,
4081	size_t batch_size,
4082	const int8_t* input,  // Signed 8-bit data on both sides.
4083	int8_t* output,
4084	pthreadpool_t threadpool);
4085
4086	enum xnn_status xnn_create_max_pooling2d_nhwc_s8(  // Constructor prototype for the s8 NHWC 2D max-pooling operator; returns enum xnn_status.
4087	uint32_t input_padding_top,  // Padding is given per edge in top, right, bottom, left order.
4088	uint32_t input_padding_right,
4089	uint32_t input_padding_bottom,
4090	uint32_t input_padding_left,
4091	uint32_t pooling_height,
4092	uint32_t pooling_width,
4093	uint32_t stride_height,
4094	uint32_t stride_width,
4095	uint32_t dilation_height,  // Max pooling takes dilation parameters; the average-pooling constructor in this file does not.
4096	uint32_t dilation_width,
4097	size_t channels,
4098	size_t input_pixel_stride,
4099	size_t output_pixel_stride,
4100	int8_t output_min,
4101	int8_t output_max,
4102	uint32_t flags,
4103	xnn_operator_t* max_pooling_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
4104
4105	enum xnn_status xnn_setup_max_pooling2d_nhwc_s8(  // Setup prototype for an s8 NHWC max-pooling operator handle; returns enum xnn_status.
4106	xnn_operator_t max_pooling_op,
4107	size_t batch_size,
4108	size_t input_height,
4109	size_t input_width,
4110	const int8_t* input,  // Signed 8-bit data on both sides.
4111	int8_t* output,
4112	pthreadpool_t threadpool);
4113
4114	enum xnn_status xnn_create_resize_bilinear2d_nhwc_s8(  // Constructor prototype for the s8 NHWC 2D bilinear-resize operator; returns enum xnn_status.
4115	size_t channels,
4116	size_t input_pixel_stride,  // NOTE(review): stride units (elements vs. bytes) are not visible in this header section - confirm.
4117	size_t output_pixel_stride,
4118	uint32_t flags,
4119	xnn_operator_t* resize_op_out);  // Out-parameter; presumably receives the created operator handle (confirm in implementation).
4120
/// Prototype for the S8 2D bilinear-resize (NHWC) "setup" call; returns an enum xnn_status.
/// Takes an operator handle, batch size, input height/width, output height/width,
/// a read-only int8_t input pointer, a writable int8_t output pointer, and a pthreadpool_t.
/// NOTE(review): presumably binds shapes and buffers to resize_op for a later run — confirm.
4121	enum xnn_status xnn_setup_resize_bilinear2d_nhwc_s8(
4122	xnn_operator_t resize_op,
4123	size_t batch_size,
4124	size_t input_height,
4125	size_t input_width,
4126	size_t output_height,
4127	size_t output_width,
4128	const int8_t* input,
4129	int8_t* output,
4130	pthreadpool_t threadpool);
4131
4132	#endif // XNN_NO_S8_OPERATORS
4133
4134	#ifndef XNN_NO_U8_OPERATORS
4135
/// Prototype for the U8 clamp "create" call; returns an enum xnn_status.
/// Arguments: channel count, input/output strides, uint8_t output_min/output_max bounds,
/// a flags word, and an xnn_operator_t* out-pointer.
/// NOTE(review): presumably writes the new operator to *clamp_op_out — confirm.
4136	enum xnn_status xnn_create_clamp_nc_u8(
4137	size_t channels,
4138	size_t input_stride,
4139	size_t output_stride,
4140	uint8_t output_min,
4141	uint8_t output_max,
4142	uint32_t flags,
4143	xnn_operator_t* clamp_op_out);
4144
/// Prototype for the U8 clamp "setup" call; returns an enum xnn_status.
/// Takes an operator handle, a batch size, a read-only uint8_t input pointer,
/// a writable uint8_t output pointer, and a pthreadpool_t.
/// NOTE(review): presumably binds the buffers to clamp_op for a later run — confirm.
4145	enum xnn_status xnn_setup_clamp_nc_u8(
4146	xnn_operator_t clamp_op,
4147	size_t batch_size,
4148	const uint8_t* input,
4149	uint8_t* output,
4150	pthreadpool_t threadpool);
4151
/// Prototype for the U8 2D max-pooling (NHWC) "create" call; returns an enum xnn_status.
/// Arguments, in order: four input paddings (top, right, bottom, left), pooling window
/// height/width, stride height/width, dilation height/width, channel count, input/output
/// pixel strides, uint8_t output_min/output_max bounds, flags, and an operator out-pointer.
/// NOTE(review): presumably writes the new operator to *max_pooling_op_out; valid ranges
/// for the geometry arguments are not visible in this declaration — confirm.
4152	enum xnn_status xnn_create_max_pooling2d_nhwc_u8(
4153	uint32_t input_padding_top,
4154	uint32_t input_padding_right,
4155	uint32_t input_padding_bottom,
4156	uint32_t input_padding_left,
4157	uint32_t pooling_height,
4158	uint32_t pooling_width,
4159	uint32_t stride_height,
4160	uint32_t stride_width,
4161	uint32_t dilation_height,
4162	uint32_t dilation_width,
4163	size_t channels,
4164	size_t input_pixel_stride,
4165	size_t output_pixel_stride,
4166	uint8_t output_min,
4167	uint8_t output_max,
4168	uint32_t flags,
4169	xnn_operator_t* max_pooling_op_out);
4170
/// Prototype for the U8 2D max-pooling (NHWC) "setup" call; returns an enum xnn_status.
/// Takes an operator handle, batch size, input height and width, a read-only uint8_t
/// input pointer, a writable uint8_t output pointer, and a pthreadpool_t.
/// NOTE(review): output dimensions are not passed here; presumably derived from the
/// input size and the create-time geometry — confirm.
4171	enum xnn_status xnn_setup_max_pooling2d_nhwc_u8(
4172	xnn_operator_t max_pooling_op,
4173	size_t batch_size,
4174	size_t input_height,
4175	size_t input_width,
4176	const uint8_t* input,
4177	uint8_t* output,
4178	pthreadpool_t threadpool);
4179
/// Prototype for the U8 2D bilinear-resize (NHWC) "create" call; returns an enum xnn_status.
/// Arguments: channel count, input/output pixel strides, a flags word, and an
/// xnn_operator_t* out-pointer.
/// NOTE(review): presumably writes the new operator to *resize_op_out — confirm.
4180	enum xnn_status xnn_create_resize_bilinear2d_nhwc_u8(
4181	size_t channels,
4182	size_t input_pixel_stride,
4183	size_t output_pixel_stride,
4184	uint32_t flags,
4185	xnn_operator_t* resize_op_out);
4186
/// Prototype for the U8 2D bilinear-resize (NHWC) "setup" call; returns an enum xnn_status.
/// Takes an operator handle, batch size, input height/width, output height/width,
/// a read-only uint8_t input pointer, a writable uint8_t output pointer, and a pthreadpool_t.
/// NOTE(review): presumably binds shapes and buffers to resize_op for a later run — confirm.
4187	enum xnn_status xnn_setup_resize_bilinear2d_nhwc_u8(
4188	xnn_operator_t resize_op,
4189	size_t batch_size,
4190	size_t input_height,
4191	size_t input_width,
4192	size_t output_height,
4193	size_t output_width,
4194	const uint8_t* input,
4195	uint8_t* output,
4196	pthreadpool_t threadpool);
4197
4198	#endif // XNN_NO_U8_OPERATORS
4199
4200	#ifndef XNN_NO_X8_OPERATORS
4201
/// Prototype for the X8 copy "create" call; returns an enum xnn_status.
/// Arguments: channel count, input/output strides, a flags word, and an
/// xnn_operator_t* out-pointer.
/// NOTE(review): presumably writes the new operator to *copy_op_out — confirm.
4202	enum xnn_status xnn_create_copy_nc_x8(
4203	size_t channels,
4204	size_t input_stride,
4205	size_t output_stride,
4206	uint32_t flags,
4207	xnn_operator_t* copy_op_out);
4208
/// Prototype for the X8 copy "setup" call; returns an enum xnn_status.
/// Takes an operator handle, a batch size, untyped (void*) read-only input and
/// writable output pointers, and a pthreadpool_t.
/// NOTE(review): "x8" presumably means opaque 8-bit elements — confirm.
4209	enum xnn_status xnn_setup_copy_nc_x8(
4210	xnn_operator_t copy_op,
4211	size_t batch_size,
4212	const void* input,
4213	void* output,
4214	pthreadpool_t threadpool);
4215
/// Prototype for the X8 channel-shuffle "create" call; returns an enum xnn_status.
/// Arguments: number of groups, channels per group, input/output strides, a flags
/// word, and an xnn_operator_t* out-pointer.
/// NOTE(review): presumably writes the new operator to *channel_shuffle_op_out — confirm.
4216	enum xnn_status xnn_create_channel_shuffle_nc_x8(
4217	size_t groups,
4218	size_t group_channels,
4219	size_t input_stride,
4220	size_t output_stride,
4221	uint32_t flags,
4222	xnn_operator_t* channel_shuffle_op_out);
4223
/// Prototype for the X8 channel-shuffle "setup" call; returns an enum xnn_status.
/// Takes an operator handle, a batch size, untyped (void*) read-only input and
/// writable output pointers, and a pthreadpool_t.
/// NOTE(review): presumably binds the buffers to channel_shuffle_op — confirm.
4224	enum xnn_status xnn_setup_channel_shuffle_nc_x8(
4225	xnn_operator_t channel_shuffle_op,
4226	size_t batch_size,
4227	const void* input,
4228	void* output,
4229	pthreadpool_t threadpool);
4230
/// Prototype for the X8 N-dimensional constant-pad "create" call; returns an enum xnn_status.
/// Arguments: a read-only untyped pointer to the padding value, a flags word, and an
/// xnn_operator_t* out-pointer.
/// NOTE(review): presumably a single 8-bit value is read through padding_value — confirm.
4231	enum xnn_status xnn_create_constant_pad_nd_x8(
4232	const void* padding_value,
4233	uint32_t flags,
4234	xnn_operator_t* constant_pad_op_out);
4235
/// Prototype for the X8 N-dimensional constant-pad "setup" call; returns an enum xnn_status.
/// Takes an operator handle, the dimension count, read-only size_t arrays for the input
/// shape and the pre/post padding, untyped input/output pointers, and a pthreadpool_t.
/// NOTE(review): the arrays presumably hold num_dims entries each — confirm. The
/// xnn_run_constant_pad_nd_x8 declaration names the same arguments pre_paddings and
/// post_paddings (plural).
4236	enum xnn_status xnn_setup_constant_pad_nd_x8(
4237	xnn_operator_t constant_pad_op,
4238	size_t num_dims,
4239	const size_t* input_shape,
4240	const size_t* pre_padding,
4241	const size_t* post_padding,
4242	const void* input,
4243	void* output,
4244	pthreadpool_t threadpool);
4245
/// Prototype for the X8 N-dimensional constant-pad "run" call; returns an enum xnn_status.
/// Takes no operator handle: flags, dimension count, input shape, pre/post paddings,
/// input/output pointers, the padding value pointer, and a pthreadpool_t are all passed
/// in one call.
/// NOTE(review): presumably a one-shot create+setup+run convenience — confirm.
4246	enum xnn_status xnn_run_constant_pad_nd_x8(
4247	uint32_t flags,
4248	size_t num_dims,
4249	const size_t* input_shape,
4250	const size_t* pre_paddings,
4251	const size_t* post_paddings,
4252	const void* input,
4253	void* output,
4254	const void* padding_value,
4255	pthreadpool_t threadpool);
4256
/// Prototype for the X8 depth-to-space (NHWC) "create" call; returns an enum xnn_status.
/// Arguments: output channel count, input/output channel strides, a uint32_t block
/// size, a flags word, and an xnn_operator_t* out-pointer.
/// NOTE(review): note the channel count is the OUTPUT one here; presumably input
/// channels = output_channels * block_size^2 — confirm.
4257	enum xnn_status xnn_create_depth_to_space_nhwc_x8(
4258	size_t output_channels,
4259	size_t input_channel_stride,
4260	size_t output_channel_stride,
4261	uint32_t block_size,
4262	uint32_t flags,
4263	xnn_operator_t* depth_to_space_op_out);
4264
/// Prototype for the X8 depth-to-space (NHWC) "setup" call; returns an enum xnn_status.
/// Takes an operator handle, batch size, input height and width, untyped (void*)
/// read-only input and writable output pointers, and a pthreadpool_t.
/// NOTE(review): presumably binds shape and buffers to depth_to_space_op — confirm.
4265	enum xnn_status xnn_setup_depth_to_space_nhwc_x8(
4266	xnn_operator_t depth_to_space_op,
4267	size_t batch_size,
4268	size_t input_height,
4269	size_t input_width,
4270	const void* input,
4271	void* output,
4272	pthreadpool_t threadpool);
4273
/// Prototype for the X8 N-dimensional slice "create" call; returns an enum xnn_status.
/// Takes only a flags word and an xnn_operator_t* out-pointer; no shape information
/// is passed at this stage.
/// NOTE(review): presumably writes the new operator to *slice_op_out — confirm.
4274	enum xnn_status xnn_create_slice_nd_x8(
4275	uint32_t flags,
4276	xnn_operator_t* slice_op_out);
4277
/// Prototype for the X8 N-dimensional slice "setup" call; returns an enum xnn_status.
/// Takes an operator handle, the dimension count, read-only size_t arrays for the input
/// shape, slice offsets and slice sizes, untyped input/output pointers, and a pthreadpool_t.
/// NOTE(review): the arrays presumably hold num_dims entries each — confirm.
4278	enum xnn_status xnn_setup_slice_nd_x8(
4279	xnn_operator_t slice_op,
4280	size_t num_dims,
4281	const size_t* input_shape,
4282	const size_t* offsets,
4283	const size_t* sizes,
4284	const void* input,
4285	void* output,
4286	pthreadpool_t threadpool);
4287
/// Prototype for the X8 space-to-depth (NHWC) "create" call; returns an enum xnn_status.
/// Arguments: input channel count, input/output channel strides, a uint32_t block
/// size, a flags word, and an xnn_operator_t* out-pointer.
/// NOTE(review): note the channel count is the INPUT one here; presumably output
/// channels = input_channels * block_size^2 — confirm.
4288	enum xnn_status xnn_create_space_to_depth_nhwc_x8(
4289	size_t input_channels,
4290	size_t input_channel_stride,
4291	size_t output_channel_stride,
4292	uint32_t block_size,
4293	uint32_t flags,
4294	xnn_operator_t* space_to_depth_op_out);
4295
/// Prototype for the X8 space-to-depth (NHWC) "setup" call; returns an enum xnn_status.
/// Takes an operator handle, batch size, input height and width, untyped (void*)
/// read-only input and writable output pointers, and a pthreadpool_t.
/// NOTE(review): presumably binds shape and buffers to space_to_depth_op — confirm.
4296	enum xnn_status xnn_setup_space_to_depth_nhwc_x8(
4297	xnn_operator_t space_to_depth_op,
4298	size_t batch_size,
4299	size_t input_height,
4300	size_t input_width,
4301	const void* input,
4302	void* output,
4303	pthreadpool_t threadpool);
4304
/// Prototype for the X8 N-dimensional transpose "create" call; returns an enum xnn_status.
/// Takes only a flags word and an xnn_operator_t* out-pointer; no shape or permutation
/// is passed at this stage.
/// NOTE(review): presumably writes the new operator to *transpose_op_out — confirm.
4305	enum xnn_status xnn_create_transpose_nd_x8(
4306	uint32_t flags,
4307	xnn_operator_t* transpose_op_out);
4308
/// Prototype for the X8 N-dimensional transpose "setup" call; returns an enum xnn_status.
/// Unlike most setup declarations in this header, the input/output pointers come
/// directly after the operator handle, followed by the dimension count, the input
/// shape array, the output permutation array, and a pthreadpool_t.
/// NOTE(review): the top-level const on the by-value num_dims has no effect on the
/// function's type for callers. The arrays presumably hold num_dims entries — confirm.
4309	enum xnn_status xnn_setup_transpose_nd_x8(
4310	xnn_operator_t transpose_op,
4311	const void* input,
4312	void* output,
4313	const size_t num_dims,
4314	const size_t* input_shape,
4315	const size_t* output_perm,
4316	pthreadpool_t threadpool);
4317
/// Prototype for the X8 N-dimensional transpose "run" call; returns an enum xnn_status.
/// Takes no operator handle: input/output pointers, dimension count, input shape,
/// output permutation, flags, and a pthreadpool_t are all passed in one call.
/// NOTE(review): the top-level const on the by-value num_dims has no effect on the
/// function's type for callers. Presumably a one-shot convenience entry point — confirm.
4318	enum xnn_status xnn_run_transpose_nd_x8(
4319	const void* input,
4320	void* output,
4321	const size_t num_dims,
4322	const size_t* input_shape,
4323	const size_t* output_perm,
4324	uint32_t flags,
4325	pthreadpool_t threadpool);
4326
4327	#endif // XNN_NO_X8_OPERATORS
4328
4329	#ifndef XNN_NO_CVT_OPERATORS
4330
/// Prototype for the F16-to-F32 convert "create" call; returns an enum xnn_status.
/// Arguments: channel count, input/output strides, a flags word, and an
/// xnn_operator_t* out-pointer.
/// NOTE(review): presumably writes the new operator to *convert_op_out — confirm.
4331	enum xnn_status xnn_create_convert_nc_f16_f32(
4332	size_t channels,
4333	size_t input_stride,
4334	size_t output_stride,
4335	uint32_t flags,
4336	xnn_operator_t* convert_op_out);
4337
/// Prototype for the F16-to-F32 convert "setup" call; returns an enum xnn_status.
/// Takes an operator handle, a batch size, a read-only void* input, a float* output,
/// and a pthreadpool_t.
/// NOTE(review): the input is untyped, presumably because no half-precision C type is
/// used in this header; it presumably points at 16-bit half floats — confirm.
4338	enum xnn_status xnn_setup_convert_nc_f16_f32(
4339	xnn_operator_t convert_op,
4340	size_t batch_size,
4341	const void* input,
4342	float* output,
4343	pthreadpool_t threadpool);
4344
/// Prototype for the F16-to-F32 convert "run" call; returns an enum xnn_status.
/// Takes no operator handle: channel count, strides, batch size, a read-only void*
/// input, a float* output, flags, and a pthreadpool_t are all passed in one call.
/// NOTE(review): presumably a one-shot convenience entry point — confirm.
4345	enum xnn_status xnn_run_convert_nc_f16_f32(
4346	size_t channels,
4347	size_t input_stride,
4348	size_t output_stride,
4349	size_t batch_size,
4350	const void* input,
4351	float* output,
4352	uint32_t flags,
4353	pthreadpool_t threadpool);
4354
/// Prototype for the F32-to-F16 convert "create" call; returns an enum xnn_status.
/// Arguments: channel count, input/output strides, a flags word, and an
/// xnn_operator_t* out-pointer.
/// NOTE(review): presumably writes the new operator to *convert_op_out — confirm.
4355	enum xnn_status xnn_create_convert_nc_f32_f16(
4356	size_t channels,
4357	size_t input_stride,
4358	size_t output_stride,
4359	uint32_t flags,
4360	xnn_operator_t* convert_op_out);
4361
/// Prototype for the F32-to-F16 convert "setup" call; returns an enum xnn_status.
/// Takes an operator handle, a batch size, a read-only float* input, a void* output,
/// and a pthreadpool_t.
/// NOTE(review): the output is untyped, presumably because no half-precision C type is
/// used in this header; it presumably receives 16-bit half floats — confirm.
4362	enum xnn_status xnn_setup_convert_nc_f32_f16(
4363	xnn_operator_t convert_op,
4364	size_t batch_size,
4365	const float* input,
4366	void* output,
4367	pthreadpool_t threadpool);
4368
/// Prototype for the F32-to-F16 convert "run" call; returns an enum xnn_status.
/// Takes no operator handle: channel count, strides, batch size, a read-only float*
/// input, a void* output, flags, and a pthreadpool_t are all passed in one call.
/// NOTE(review): presumably a one-shot convenience entry point — confirm.
4369	enum xnn_status xnn_run_convert_nc_f32_f16(
4370	size_t channels,
4371	size_t input_stride,
4372	size_t output_stride,
4373	size_t batch_size,
4374	const float* input,
4375	void* output,
4376	uint32_t flags,
4377	pthreadpool_t threadpool);
4378
/// Prototype for the F32-to-QS8 convert "create" call; returns an enum xnn_status.
/// Arguments: channel count, input/output strides, output scale, int8_t output zero
/// point, int8_t output_min/output_max bounds, flags, and an xnn_operator_t* out-pointer.
/// NOTE(review): presumably quantizes as round(x / output_scale) + output_zero_point,
/// clamped to [output_min, output_max] — confirm against the implementation.
4379	enum xnn_status xnn_create_convert_nc_f32_qs8(
4380	size_t channels,
4381	size_t input_stride,
4382	size_t output_stride,
4383	float output_scale,
4384	int8_t output_zero_point,
4385	int8_t output_min,
4386	int8_t output_max,
4387	uint32_t flags,
4388	xnn_operator_t* convert_op_out);
4389
/// Prototype for the F32-to-QS8 convert "setup" call; returns an enum xnn_status.
/// Takes an operator handle, a batch size, a read-only float* input, an int8_t*
/// output, and a pthreadpool_t.
/// NOTE(review): presumably binds the buffers to convert_op for a later run — confirm.
4390	enum xnn_status xnn_setup_convert_nc_f32_qs8(
4391	xnn_operator_t convert_op,
4392	size_t batch_size,
4393	const float* input,
4394	int8_t* output,
4395	pthreadpool_t threadpool);
4396
/// Prototype for the F32-to-QS8 convert "run" call; returns an enum xnn_status.
/// Takes no operator handle: channel count, strides, batch size, float* input, int8_t*
/// output, output scale and zero point, flags, and a pthreadpool_t in one call.
/// Unlike xnn_create_convert_nc_f32_qs8, this signature has no output_min/output_max.
/// NOTE(review): presumably the full int8_t range is used as the clamp — confirm.
4397	enum xnn_status xnn_run_convert_nc_f32_qs8(
4398	size_t channels,
4399	size_t input_stride,
4400	size_t output_stride,
4401	size_t batch_size,
4402	const float* input,
4403	int8_t* output,
4404	float output_scale,
4405	int8_t output_zero_point,
4406	uint32_t flags,
4407	pthreadpool_t threadpool);
4408
/// Prototype for the F32-to-QU8 convert "create" call; returns an enum xnn_status.
/// Arguments: channel count, input/output strides, output scale, uint8_t output zero
/// point, uint8_t output_min/output_max bounds, flags, and an xnn_operator_t* out-pointer.
/// NOTE(review): presumably quantizes as round(x / output_scale) + output_zero_point,
/// clamped to [output_min, output_max] — confirm against the implementation.
4409	enum xnn_status xnn_create_convert_nc_f32_qu8(
4410	size_t channels,
4411	size_t input_stride,
4412	size_t output_stride,
4413	float output_scale,
4414	uint8_t output_zero_point,
4415	uint8_t output_min,
4416	uint8_t output_max,
4417	uint32_t flags,
4418	xnn_operator_t* convert_op_out);
4419
/// Prototype for the F32-to-QU8 convert "setup" call; returns an enum xnn_status.
/// Takes an operator handle, a batch size, a read-only float* input, a uint8_t*
/// output, and a pthreadpool_t.
/// NOTE(review): presumably binds the buffers to convert_op for a later run — confirm.
4420	enum xnn_status xnn_setup_convert_nc_f32_qu8(
4421	xnn_operator_t convert_op,
4422	size_t batch_size,
4423	const float* input,
4424	uint8_t* output,
4425	pthreadpool_t threadpool);
4426
/// Prototype for the F32-to-QU8 convert "run" call; returns an enum xnn_status.
/// Takes no operator handle: channel count, strides, batch size, float* input, uint8_t*
/// output, output scale and zero point, flags, and a pthreadpool_t in one call.
/// Unlike xnn_create_convert_nc_f32_qu8, this signature has no output_min/output_max.
/// NOTE(review): presumably the full uint8_t range is used as the clamp — confirm.
4427	enum xnn_status xnn_run_convert_nc_f32_qu8(
4428	size_t channels,
4429	size_t input_stride,
4430	size_t output_stride,
4431	size_t batch_size,
4432	const float* input,
4433	uint8_t* output,
4434	float output_scale,
4435	uint8_t output_zero_point,
4436	uint32_t flags,
4437	pthreadpool_t threadpool);
4438
/// Prototype for the QS8-to-QS8 convert "create" call; returns an enum xnn_status.
/// Arguments: channel count, input/output strides, input scale and int8_t zero point,
/// output scale and int8_t zero point, flags, and an xnn_operator_t* out-pointer.
/// NOTE(review): presumably requantizes between two QS8 parameter sets — confirm.
4439	enum xnn_status xnn_create_convert_nc_qs8(
4440	size_t channels,
4441	size_t input_stride,
4442	size_t output_stride,
4443	float input_scale,
4444	int8_t input_zero_point,
4445	float output_scale,
4446	int8_t output_zero_point,
4447	uint32_t flags,
4448	xnn_operator_t* convert_op_out);
4449
/// Prototype for the QS8-to-QS8 convert "setup" call; returns an enum xnn_status.
/// Takes an operator handle, a batch size, a read-only int8_t* input, an int8_t*
/// output, and a pthreadpool_t.
/// NOTE(review): presumably binds the buffers to convert_op for a later run — confirm.
4450	enum xnn_status xnn_setup_convert_nc_qs8(
4451	xnn_operator_t convert_op,
4452	size_t batch_size,
4453	const int8_t* input,
4454	int8_t* output,
4455	pthreadpool_t threadpool);
4456
/// Prototype for the QS8-to-F32 convert "create" call; returns an enum xnn_status.
/// Arguments: channel count, input/output strides, input scale, int8_t input zero
/// point, flags, and an xnn_operator_t* out-pointer.
/// NOTE(review): presumably dequantizes as (q - input_zero_point) * input_scale — confirm.
4457	enum xnn_status xnn_create_convert_nc_qs8_f32(
4458	size_t channels,
4459	size_t input_stride,
4460	size_t output_stride,
4461	float input_scale,
4462	int8_t input_zero_point,
4463	uint32_t flags,
4464	xnn_operator_t* convert_op_out);
4465
/// Prototype for the QS8-to-F32 convert "setup" call; returns an enum xnn_status.
/// Takes an operator handle, a batch size, a read-only int8_t* input, a float*
/// output, and a pthreadpool_t.
/// NOTE(review): presumably binds the buffers to convert_op for a later run — confirm.
4466	enum xnn_status xnn_setup_convert_nc_qs8_f32(
4467	xnn_operator_t convert_op,
4468	size_t batch_size,
4469	const int8_t* input,
4470	float* output,
4471	pthreadpool_t threadpool);
4472
/// Prototype for the QS8-to-F32 convert "run" call; returns an enum xnn_status.
/// Takes no operator handle: channel count, strides, batch size, int8_t* input, float*
/// output, input scale and zero point, flags, and a pthreadpool_t in one call.
/// NOTE(review): presumably a one-shot convenience entry point — confirm.
4473	enum xnn_status xnn_run_convert_nc_qs8_f32(
4474	size_t channels,
4475	size_t input_stride,
4476	size_t output_stride,
4477	size_t batch_size,
4478	const int8_t* input,
4479	float* output,
4480	float input_scale,
4481	int8_t input_zero_point,
4482	uint32_t flags,
4483	pthreadpool_t threadpool);
4484
/// Prototype for the QU8-to-QU8 convert "create" call; returns an enum xnn_status.
/// Arguments: channel count, input/output strides, input scale and uint8_t zero point,
/// output scale and uint8_t zero point, flags, and an xnn_operator_t* out-pointer.
/// NOTE(review): presumably requantizes between two QU8 parameter sets — confirm.
4485	enum xnn_status xnn_create_convert_nc_qu8(
4486	size_t channels,
4487	size_t input_stride,
4488	size_t output_stride,
4489	float input_scale,
4490	uint8_t input_zero_point,
4491	float output_scale,
4492	uint8_t output_zero_point,
4493	uint32_t flags,
4494	xnn_operator_t* convert_op_out);
4495
/// Prototype for the QU8-to-QU8 convert "setup" call; returns an enum xnn_status.
/// Takes an operator handle, a batch size, a read-only uint8_t* input, a uint8_t*
/// output, and a pthreadpool_t.
/// NOTE(review): presumably binds the buffers to convert_op for a later run — confirm.
4496	enum xnn_status xnn_setup_convert_nc_qu8(
4497	xnn_operator_t convert_op,
4498	size_t batch_size,
4499	const uint8_t* input,
4500	uint8_t* output,
4501	pthreadpool_t threadpool);
4502
/// Prototype for the QU8-to-F32 convert "create" call; returns an enum xnn_status.
/// Arguments: channel count, input/output strides, input scale, uint8_t input zero
/// point, flags, and an xnn_operator_t* out-pointer.
/// NOTE(review): presumably dequantizes as (q - input_zero_point) * input_scale — confirm.
4503	enum xnn_status xnn_create_convert_nc_qu8_f32(
4504	size_t channels,
4505	size_t input_stride,
4506	size_t output_stride,
4507	float input_scale,
4508	uint8_t input_zero_point,
4509	uint32_t flags,
4510	xnn_operator_t* convert_op_out);
4511
/// Prototype for the QU8-to-F32 convert "setup" call; returns an enum xnn_status.
/// Takes an operator handle, a batch size, a read-only uint8_t* input, a float*
/// output, and a pthreadpool_t.
/// NOTE(review): presumably binds the buffers to convert_op for a later run — confirm.
4512	enum xnn_status xnn_setup_convert_nc_qu8_f32(
4513	xnn_operator_t convert_op,
4514	size_t batch_size,
4515	const uint8_t* input,
4516	float* output,
4517	pthreadpool_t threadpool);
4518
/// Prototype for the QU8-to-F32 convert "run" call; returns an enum xnn_status.
/// Takes no operator handle: channel count, strides, batch size, uint8_t* input, float*
/// output, input scale and zero point, flags, and a pthreadpool_t in one call.
/// NOTE(review): presumably a one-shot convenience entry point — confirm.
4519	enum xnn_status xnn_run_convert_nc_qu8_f32(
4520	size_t channels,
4521	size_t input_stride,
4522	size_t output_stride,
4523	size_t batch_size,
4524	const uint8_t* input,
4525	float* output,
4526	float input_scale,
4527	uint8_t input_zero_point,
4528	uint32_t flags,
4529	pthreadpool_t threadpool);
4530
4531	#endif // XNN_NO_CVT_OPERATORS
4532
4533	#ifdef __cplusplus
4534	} // extern "C"
4535	#endif
4536

Browse the source code of pytorch/third_party/XNNPACK/include/xnnpack.h