1// Copyright (c) Facebook, Inc. and its affiliates.
2// All rights reserved.
3//
4// Copyright 2019 Google LLC
5//
6// This source code is licensed under the BSD-style license found in the
7// LICENSE file in the root directory of this source tree.
8
9#pragma once
10
11#include <stdbool.h>
12#include <stddef.h>
13#include <stdint.h>
14
15#include <pthreadpool.h>
16
17#ifdef __cplusplus
18extern "C" {
19#endif
20
21/// The number of bytes XNNPACK may read beyond array bounds.
22/// The caller must allocate at least this many extra bytes after the tensor data passed to XNNPACK.
23///
24/// Note: XNNPACK reads, but never writes beyond array bounds.
25#define XNN_EXTRA_BYTES 16
26
27/// Maximum number of dimensions in tensor shape.
28#define XNN_MAX_TENSOR_DIMS 6
29
30/// Allow sparse inference in a Runtime.
31///
32/// Note: this flag hints XNNPACK to consider sparse inference, but does not guarantee it.
33#define XNN_FLAG_SPARSE_INFERENCE 0x00000001
34#define XNN_FLAG_HINT_SPARSE_INFERENCE XNN_FLAG_SPARSE_INFERENCE
35
36/// Allow IEEE FP16 inference in a Runtime.
37///
38/// Note: this flag hints XNNPACK to consider IEEE FP16 inference, but does not guarantee it.
39#define XNN_FLAG_FP16_INFERENCE 0x00000002
40#define XNN_FLAG_HINT_FP16_INFERENCE XNN_FLAG_FP16_INFERENCE
41
42/// Force IEEE FP16 inference in a Runtime, and fail if FP16 inference is not possible.
43///
44/// Note: this flag guarantees that XNNPACK will use IEEE FP16 inference, or fail to create the Runtime object.
45/// Warning: on x86 systems FP16 computations will be emulated at a substantial performance cost.
46#define XNN_FLAG_FORCE_FP16_INFERENCE 0x00000004
47
48/// Enable timing of each operator's runtime.
49#define XNN_FLAG_BASIC_PROFILING 0x00000008
50
/// The convolution operator represents a depthwise convolution, and uses the HWGo layout for filters.
52#define XNN_FLAG_DEPTHWISE_CONVOLUTION 0x00000001
53
54/// Assume transposed weights in a fully connected operator.
55#define XNN_FLAG_TRANSPOSE_WEIGHTS 0x00000001
56
57/// The operator assumes NHWC layout for the input, regardless of the output layout.
58#define XNN_FLAG_INPUT_NHWC 0x00000002
59
60/// Match "SAME" padding in TensorFlow. Exact padding values are computed dynamically depending on input size.
61#define XNN_FLAG_TENSORFLOW_SAME_PADDING 0x00000004
62
63/// Implicitly flatten and reshape input of a Fully Connected operator into a 2D tensor.
64#define XNN_FLAG_TENSORFLOW_RESHAPE_2D 0x00000004
65
66/// Match behaviour of TensorFlow 1.x.
67#define XNN_FLAG_TENSORFLOW_LEGACY_MODE 0x00000004
68
69/// Static weights of the FP16 operator are in FP32 format.
70#define XNN_FLAG_FP32_STATIC_WEIGHTS 0x00000008
71
72/// Align corners of input and output images in resize operations.
73#define XNN_FLAG_ALIGN_CORNERS 0x00000008
74
75/// Yield worker threads of the thread pool to the system scheduler after the inference.
76#define XNN_FLAG_YIELD_WORKERS 0x00000010
77
78/// Status code for any XNNPACK function call.
/// Status code for any XNNPACK function call.
enum xnn_status {
  /// The call succeeded, and all output arguments now contain valid data.
  xnn_status_success = 0,
  /// The call failed because XNNPACK was not initialized (see xnn_initialize).
  xnn_status_uninitialized = 1,
  /// The call failed because one of the input arguments has an invalid value.
  xnn_status_invalid_parameter = 2,
  /// The call failed because an object passed to the call is in a state that does not permit the operation.
  xnn_status_invalid_state = 3,
  /// The call failed because the requested parameter combination is not supported by the implementation.
  xnn_status_unsupported_parameter = 4,
  /// The call failed because the host hardware does not support the requested operation.
  xnn_status_unsupported_hardware = 5,
  /// The call failed due to an out-of-memory condition.
  xnn_status_out_of_memory = 6,
};
89
/// User-provided memory allocation callbacks.
///
/// A pointer to this structure may be passed to xnn_initialize; when it is, XNNPACK uses these callbacks instead of
/// the system-provided memory management functions (e.g. malloc/free).
struct xnn_allocator {
  /// User-specified pointer that will be passed as-is to all functions in this structure.
  void* context;
  /// Pointer to a function to be called for general memory allocation.
  ///
  /// @param context - The user-specified pointer from xnn_allocator structure.
  /// @param size - The size of the memory block to allocate, in bytes.
  ///
  /// @returns Pointer to the allocated memory block of at least @ref size bytes.
  ///          If allocation fails, the function must return NULL.
  void* (*allocate)(void* context, size_t size);
  /// Pointer to a function to be called for general memory re-allocation, i.e. to increase or shrink a previously
  /// allocated memory block. The content of the old memory block is copied to the new memory block.
  ///
  /// @param context - The user-specified pointer from xnn_allocator structure.
  /// @param pointer - Pointer to a memory block allocated by @ref allocate or @ref reallocate functions. Can be NULL.
  ///                  If the pointer is NULL, the @ref reallocate call is equivalent to an @ref allocate call.
  /// @param size - The new size of the memory block to allocate, in bytes.
  ///
  /// @returns Pointer to the newly allocated memory block of at least @ref size bytes with the content of the previous
  ///          memory block.
  ///          If allocation fails, the function must return NULL, but must not release the previous memory block.
  void* (*reallocate)(void* context, void* pointer, size_t size);
  /// Pointer to a function to be called for general memory de-allocation.
  ///
  /// @param context - The user-specified pointer from xnn_allocator structure.
  /// @param pointer - Pointer to a memory block allocated by @ref allocate or @ref reallocate functions. Can be NULL.
  ///                  If the pointer is NULL, the @ref deallocate call is a no-op.
  void (*deallocate)(void* context, void* pointer);
  /// Pointer to a function to be called for aligned memory allocation.
  ///
  /// @param context - The user-specified pointer from xnn_allocator structure.
  /// @param alignment - The alignment of the memory block to allocate, in bytes. Alignment is always a power-of-2.
  /// @param size - The size of the memory block to allocate, in bytes.
  ///
  /// @returns Pointer to the allocated memory block of at least @ref size bytes.
  ///          If allocation fails, the function must return NULL.
  void* (*aligned_allocate)(void* context, size_t alignment, size_t size);
  /// Pointer to a function to be called for aligned memory de-allocation.
  ///
  /// @param context - The user-specified pointer from xnn_allocator structure.
  /// @param pointer - Pointer to a memory block allocated by @ref aligned_allocate function. Can be NULL.
  ///                  If the pointer is NULL, the @ref aligned_deallocate call is a no-op.
  void (*aligned_deallocate)(void* context, void* pointer);
};
135
/// Initialize XNNPACK library.
///
/// XNNPACK must be successfully initialized before use. During initialization, XNNPACK populates internal structures
/// depending on the host processor. Initialization can be time-consuming.
///
/// @param[in] allocator - structure with function pointers to be used for memory allocation and de-allocation.
///                        If this argument is NULL, system-provided memory management functions (e.g. malloc/free)
///                        will be used.
///
/// @retval xnn_status_success - XNNPACK is successfully initialized and ready to use.
/// @retval xnn_status_out_of_memory - initialization failed due to out-of-memory condition.
/// @retval xnn_status_unsupported_hardware - initialization failed because the host processor does not satisfy the
///                                           minimum hardware requirements for XNNPACK. E.g. this may happen on x86
///                                           processors without SSE2 extension, or on 32-bit ARM processors without
///                                           the NEON SIMD extension.
enum xnn_status xnn_initialize(const struct xnn_allocator* allocator);

/// Deinitialize XNNPACK library.
///
/// To avoid memory and resource leaks, users must call xnn_deinitialize once for each successful xnn_initialize call.
///
/// @retval xnn_status_success - deinitialization call succeeded.
enum xnn_status xnn_deinitialize(void);
159
/// Subgraph is an abstract representation of a neural network model.
/// Subgraph objects are used to define Values (tensors) and Nodes (operators) comprising the model.
typedef struct xnn_subgraph* xnn_subgraph_t;

/// Create an empty Subgraph object.
///
/// @param external_value_ids - number of Value IDs to reserve for communication with external graph representation.
///                             The Subgraph object would avoid creating internal Value IDs in the
///                             [0, external_value_ids-1] range.
/// @param flags - binary features of the subgraph. No supported flags are currently defined.
/// @param subgraph_out - pointer to the variable that will be initialized with a handle to the Subgraph object upon
///                       successful return.
enum xnn_status xnn_create_subgraph(
  uint32_t external_value_ids,
  uint32_t flags,
  xnn_subgraph_t* subgraph_out);

/// Destroy a Subgraph object, as well as Values, and Nodes associated with the subgraph.
///
/// @param subgraph - the Subgraph object to destroy.
enum xnn_status xnn_delete_subgraph(
  xnn_subgraph_t subgraph);
182
/// Mark a Value as an external input of the Subgraph (supplied by the caller at inference time).
#define XNN_VALUE_FLAG_EXTERNAL_INPUT 0x00000001
/// Mark a Value as an external output of the Subgraph (read by the caller after inference).
#define XNN_VALUE_FLAG_EXTERNAL_OUTPUT 0x00000002
/// NOTE(review): presumably keeps the Value's storage alive across Runtime invocations -- confirm semantics.
#define XNN_VALUE_FLAG_PERSISTENT 0x00000004

/// Sentinel Value ID: passing it as an external ID requests creation of an internal ID instead.
#define XNN_INVALID_VALUE_ID UINT32_MAX

/// Type of elements in a Value object.
enum xnn_datatype {
  /// Invalid data type. Valid Values never have this datatype.
  xnn_datatype_invalid = 0,
  /// IEEE754 single-precision floating-point.
  xnn_datatype_fp32 = 1,
  /// IEEE754 half-precision floating-point.
  xnn_datatype_fp16 = 2,
  /// Quantized 8-bit signed integer with shared per-Value quantization parameters.
  xnn_datatype_qint8 = 3,
  /// Quantized 8-bit unsigned integer with shared per-Value quantization parameters.
  xnn_datatype_quint8 = 4,
  /// Quantized 32-bit signed integer with shared per-Value quantization parameters.
  xnn_datatype_qint32 = 5,
  /// Quantized 8-bit signed integer with shared per-channel quantization parameters.
  xnn_datatype_qcint8 = 6,
  /// Quantized 32-bit signed integer with shared per-channel quantization parameters.
  xnn_datatype_qcint32 = 7,
};
208
/// Define a tensor-type Value and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Value.
/// @param datatype - type of the tensor elements.
/// @param num_dims - number of dimensions in the shape.
/// @param dims - pointer to an array of @a num_dims shape dimensions. If num_dims is 0, this pointer can be NULL.
///               XNNPACK does not keep any pointers to this array after the function returns.
/// @param data - pointer to static data used for tensor initialization. If the tensor is not statically initialized,
///               this pointer must be NULL. If non-NULL, the life-time of the static data must exceed the life-time
///               of the Subgraph object, and of any Runtime objects created from the Subgraph.
/// @param external_id - external ID for the Value. The ID must be within the range of reserved Value IDs specified on
///                      the Subgraph creation. If the external ID is XNN_INVALID_VALUE_ID, an internal ID will be
///                      created for the Value.
/// @param flags - binary features of the Value. Supported values are any combination of XNN_VALUE_FLAG_EXTERNAL_INPUT
///                and XNN_VALUE_FLAG_EXTERNAL_OUTPUT.
/// @param id_out - pointer to the variable that will be initialized with the Value ID upon successful return. If a
///                 valid @a external_id was provided, the variable will be initialized with the @a external_id value.
enum xnn_status xnn_define_tensor_value(
  xnn_subgraph_t subgraph,
  enum xnn_datatype datatype,
  size_t num_dims,
  const size_t* dims,
  const void* data,
  uint32_t external_id,
  uint32_t flags,
  uint32_t* id_out);
235
/// Define a quantized tensor-type Value and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Value.
/// @param datatype - type of the tensor elements.
/// @param zero_point - offset from zero to subtract from the quantized elements in the Value.
/// @param scale - multiplication factor to convert quantized elements to real representation.
/// @param num_dims - number of dimensions in the shape.
/// @param dims - pointer to an array of @a num_dims shape dimensions. If num_dims is 0, this pointer can be NULL.
///               XNNPACK does not keep any pointers to this array after the function returns.
/// @param data - pointer to static data used for tensor initialization. If the tensor is not statically initialized,
///               this pointer must be NULL. If non-NULL, the life-time of the static data must exceed the life-time
///               of the Subgraph object, and of any Runtime objects created from the Subgraph.
/// @param external_id - external ID for the Value. The ID must be within the range of reserved Value IDs specified on
///                      the Subgraph creation. If the external ID is XNN_INVALID_VALUE_ID, an internal ID will be
///                      created for the Value.
/// @param flags - binary features of the Value. Supported values are any combination of XNN_VALUE_FLAG_EXTERNAL_INPUT
///                and XNN_VALUE_FLAG_EXTERNAL_OUTPUT.
/// @param id_out - pointer to the variable that will be initialized with the Value ID upon successful return. If a
///                 valid @a external_id was provided, the variable will be initialized with the @a external_id value.
enum xnn_status xnn_define_quantized_tensor_value(
  xnn_subgraph_t subgraph,
  enum xnn_datatype datatype,
  int32_t zero_point,
  float scale,
  size_t num_dims,
  const size_t* dims,
  const void* data,
  uint32_t external_id,
  uint32_t flags,
  uint32_t* id_out);
266
/// Define a channelwise quantized tensor-type Value and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Value.
/// @param datatype - type of the tensor elements.
/// @param scale - per-channel multiplication factors to convert quantized elements to real representation.
///                NOTE(review): presumably one factor per element of the channel dimension (i.e. dims[channel_dim]
///                entries) -- confirm against the implementation.
/// @param num_dims - number of dimensions in the shape.
/// @param channel_dim - index of the channel dimension in the tensor with per-channel quantization parameters.
///                      Typically this is the first dimension (dimension #0) of the filter tensors in the Convolution,
///                      Deconvolution, and Fully Connected operators and the last dimension of the filter tensors in
///                      the Depthwise Convolution operators.
/// @param dims - pointer to an array of @a num_dims shape dimensions. If num_dims is 0, this pointer can be NULL.
///               XNNPACK does not keep any pointers to this array after the function returns.
/// @param data - pointer to static data used for tensor initialization. If the tensor is not statically initialized,
///               this pointer must be NULL. If non-NULL, the life-time of the static data must exceed the life-time
///               of the Subgraph object, and of any Runtime objects created from the Subgraph.
/// @param external_id - external ID for the Value. The ID must be within the range of reserved Value IDs specified on
///                      the Subgraph creation. If the external ID is XNN_INVALID_VALUE_ID, an internal ID will be
///                      created for the Value.
/// @param flags - binary features of the Value. Supported values are any combination of XNN_VALUE_FLAG_EXTERNAL_INPUT
///                and XNN_VALUE_FLAG_EXTERNAL_OUTPUT.
/// @param id_out - pointer to the variable that will be initialized with the Value ID upon successful return. If a
///                 valid @a external_id was provided, the variable will be initialized with the @a external_id value.
enum xnn_status xnn_define_channelwise_quantized_tensor_value(
  xnn_subgraph_t subgraph,
  enum xnn_datatype datatype,
  const float* scale,
  size_t num_dims,
  size_t channel_dim,
  const size_t* dims,
  const void* data,
  uint32_t external_id,
  uint32_t flags,
  uint32_t* id_out);
300
/// Define a Convert Node and add it to a Subgraph.
///
/// NOTE(review): a Convert Node presumably performs an element-wise datatype conversion between the input and output
/// tensors (inferred from the Node name; the shapes are required to match below) -- confirm against the implementation.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the shape of the input tensor.
/// @param flags - binary features of the Convert Node. No supported flags are currently defined.
enum xnn_status xnn_define_convert(
  xnn_subgraph_t subgraph,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
313
/// Define a 2D Convolution Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_padding_top - implicit zero-padding above 2D input data. Must be 0 if XNN_FLAG_TENSORFLOW_SAME_PADDING
///                            flag is specified.
/// @param input_padding_right - implicit zero-padding to the right of 2D input data. Must be 0 if
///                              XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
/// @param input_padding_bottom - implicit zero-padding below 2D input data. Must be 0 if
///                               XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
/// @param input_padding_left - implicit zero-padding to the left of 2D input data. Must be 0 if
///                             XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
/// @param kernel_height - kernel (filter) height.
/// @param kernel_width - kernel (filter) width.
/// @param subsampling_height - height of subsampling region for convolution output (convolution height stride).
/// @param subsampling_width - width of subsampling region for convolution output (convolution width stride).
/// @param dilation_height - dilation of kernel elements along the height dimension.
/// @param dilation_width - dilation of kernel elements along the width dimension.
/// @param groups - number of convolution groups.
/// @param group_input_channels - number of input channels per group.
/// @param group_output_channels - number of output channels per group.
/// @param output_min - lower bound for clipping output values.
/// @param output_max - upper bound for clipping output values.
/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
///                   with [N, IH, IW, groups * group_input_channels] dimensions.
/// @param filter_id - Value ID for the filter tensor. The filter tensor must be a 4D tensor defined in the @a subgraph
///                    with [groups * group_output_channels, kernel_height, kernel_width, group_input_channels]
///                    dimensions.
/// @param bias_id - Value ID for the bias tensor, or XNN_INVALID_VALUE_ID for a 2D Convolution Node without a bias. If
///                  present, the bias tensor must be a 1D tensor defined in the @a subgraph with [groups *
///                  group_output_channels] dimensions.
/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
///                    with [N, OH, OW, groups * group_output_channels] dimensions.
/// @param flags - binary features of the 2D Convolution Node. The only currently supported value is
///                XNN_FLAG_TENSORFLOW_SAME_PADDING.
enum xnn_status xnn_define_convolution_2d(
  xnn_subgraph_t subgraph,
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t subsampling_height,
  uint32_t subsampling_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  float output_min,
  float output_max,
  uint32_t input_id,
  uint32_t filter_id,
  uint32_t bias_id,
  uint32_t output_id,
  uint32_t flags);
370
/// Define a 2D Deconvolution (Transposed Convolution) Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param padding_top - implicit padding above 2D output data.
/// @param padding_right - implicit padding to the right of 2D output data.
/// @param padding_bottom - implicit padding below 2D output data.
/// @param padding_left - implicit padding to the left of 2D output data.
/// @param adjustment_height - additional elements in the bottom of the 2D output data.
/// @param adjustment_width - additional elements to the right of the 2D output data.
/// @param kernel_height - kernel (filter) height.
/// @param kernel_width - kernel (filter) width.
/// @param upsampling_height - height of upsampling region for deconvolution input (deconvolution height stride).
/// @param upsampling_width - width of upsampling region for deconvolution input (deconvolution width stride).
/// @param dilation_height - dilation of kernel elements along the height dimension.
/// @param dilation_width - dilation of kernel elements along the width dimension.
/// @param groups - number of convolution groups.
/// @param group_input_channels - number of input channels per group.
/// @param group_output_channels - number of output channels per group.
/// @param output_min - lower bound for clipping output values.
/// @param output_max - upper bound for clipping output values.
/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
///                   with [N, IH, IW, groups * group_input_channels] dimensions.
/// @param filter_id - Value ID for the filter tensor. The filter tensor must be a 4D tensor defined in the @a subgraph
///                    with [groups * group_output_channels, kernel_height, kernel_width, group_input_channels]
///                    dimensions.
/// @param bias_id - Value ID for the bias tensor, or XNN_INVALID_VALUE_ID for a 2D Deconvolution Node without a bias.
///                  If present, the bias tensor must be a 1D tensor defined in the @a subgraph with
///                  [groups * group_output_channels] dimensions.
/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
///                    with [N, OH, OW, groups * group_output_channels] dimensions.
/// @param flags - binary features of the 2D Deconvolution Node. No supported flags are currently defined.
enum xnn_status xnn_define_deconvolution_2d(
  xnn_subgraph_t subgraph,
  uint32_t padding_top,
  uint32_t padding_right,
  uint32_t padding_bottom,
  uint32_t padding_left,
  uint32_t adjustment_height,
  uint32_t adjustment_width,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t upsampling_height,
  uint32_t upsampling_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  float output_min,
  float output_max,
  uint32_t input_id,
  uint32_t filter_id,
  uint32_t bias_id,
  uint32_t output_id,
  uint32_t flags);
426
/// Define a 2D Depthwise Convolution Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_padding_top - implicit zero-padding above 2D input data. Must be 0 if XNN_FLAG_TENSORFLOW_SAME_PADDING
///                            flag is specified.
/// @param input_padding_right - implicit zero-padding to the right of 2D input data. Must be 0 if
///                              XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
/// @param input_padding_bottom - implicit zero-padding below 2D input data. Must be 0 if
///                               XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
/// @param input_padding_left - implicit zero-padding to the left of 2D input data. Must be 0 if
///                             XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
/// @param kernel_height - kernel (filter) height.
/// @param kernel_width - kernel (filter) width.
/// @param subsampling_height - height of subsampling region for convolution output (convolution height stride).
/// @param subsampling_width - width of subsampling region for convolution output (convolution width stride).
/// @param dilation_height - dilation of kernel elements along the height dimension.
/// @param dilation_width - dilation of kernel elements along the width dimension.
/// @param depth_multiplier - ratio of output channels to input channels.
/// @param input_channels - number of input channels.
/// @param output_min - lower bound for clipping output values.
/// @param output_max - upper bound for clipping output values.
/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
///                   with [N, IH, IW, input_channels] dimensions.
/// @param filter_id - Value ID for the filter tensor. The filter tensor must be a 4D tensor defined in the @a subgraph
///                    with [1, kernel_height, kernel_width, input_channels * depth_multiplier] dimensions.
/// @param bias_id - Value ID for the bias tensor, or XNN_INVALID_VALUE_ID for a 2D Depthwise Convolution Node without
///                  a bias. If present, the bias tensor must be a 1D tensor defined in the @a subgraph with
///                  [input_channels * depth_multiplier] dimensions.
/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
///                    with [N, OH, OW, input_channels * depth_multiplier] dimensions.
/// @param flags - binary features of the 2D Depthwise Convolution Node. The only currently supported value is
///                XNN_FLAG_TENSORFLOW_SAME_PADDING.
enum xnn_status xnn_define_depthwise_convolution_2d(
  xnn_subgraph_t subgraph,
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t subsampling_height,
  uint32_t subsampling_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t depth_multiplier,
  size_t input_channels,
  float output_min,
  float output_max,
  uint32_t input_id,
  uint32_t filter_id,
  uint32_t bias_id,
  uint32_t output_id,
  uint32_t flags);
480
/// Define a Depth To Space Node and add it to a Subgraph.
///
/// The Depth To Space Node rearranges data from depth into blocks of spatial data (a reverse transform to
/// Space To Depth). For a given input pixel, an output square of pixels with side @a block_size is formed from values
/// in the corresponding number of its channels. The output depth is therefore @a block_size x @a block_size times
/// smaller than that of the input.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
///                   with [N, IH, IW, OC * block_size * block_size] dimensions.
/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
///                    with [N, IH * block_size, IW * block_size, OC] dimensions.
/// @param block_size - the size of the spatial block.
/// @param flags - binary features of the Depth To Space Node. No supported flags are currently defined.
enum xnn_status xnn_define_depth_to_space(
  xnn_subgraph_t subgraph,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t block_size,
  uint32_t flags);
501
/// Define a 1D Global Average Pooling Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param output_min - lower bound for clipping output values.
/// @param output_max - upper bound for clipping output values.
/// @param input_id - Value ID for the input tensor. The input tensor must be a dense tensor with 2 or more dimensions
///                   defined in the @a subgraph. Averaging is performed across the second-innermost dimension.
/// @param output_id - Value ID for the output tensor. The output tensor must be a dense tensor with 2 or more
///                    dimensions defined in the @a subgraph.
///                    NOTE(review): presumably the pooled (second-innermost) dimension has size 1 in the output --
///                    confirm against the implementation.
/// @param flags - binary features of the 1D Global Average Pooling Node. No supported flags are currently defined.
enum xnn_status xnn_define_global_average_pooling_1d(
  xnn_subgraph_t subgraph,
  float output_min,
  float output_max,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
519
/// Define a 2D Global Average Pooling Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param output_min - lower bound for clipping output values.
/// @param output_max - upper bound for clipping output values.
/// @param input_id - Value ID for the input tensor. The input tensor must be a dense tensor with 3 or more dimensions
///                   defined in the @a subgraph. Averaging is performed across the second- and third-innermost
///                   dimensions.
/// @param output_id - Value ID for the output tensor. The output tensor must be a dense tensor with 3 or more
///                    dimensions defined in the @a subgraph.
///                    NOTE(review): presumably the pooled (second- and third-innermost) dimensions have size 1 in the
///                    output -- confirm against the implementation.
/// @param flags - binary features of the 2D Global Average Pooling Node. No supported flags are currently defined.
enum xnn_status xnn_define_global_average_pooling_2d(
  xnn_subgraph_t subgraph,
  float output_min,
  float output_max,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
538
/// Define a 2D Average Pooling Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_padding_top - implicit zero-padding above 2D input data. Must be 0 if XNN_FLAG_TENSORFLOW_SAME_PADDING
///                            flag is specified.
/// @param input_padding_right - implicit zero-padding to the right of 2D input data. Must be 0 if
///                              XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
/// @param input_padding_bottom - implicit zero-padding below 2D input data. Must be 0 if
///                               XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
/// @param input_padding_left - implicit zero-padding to the left of 2D input data. Must be 0 if
///                             XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
/// @param pooling_height - pooling (kernel) height.
/// @param pooling_width - pooling (kernel) width.
/// @param stride_height - displacing of the pooling window in the vertical dimension of the input pixels corresponding
///                        to vertically adjacent output pixels.
/// @param stride_width - displacing of the pooling window in the horizontal dimension of the input pixels corresponding
///                       to horizontally adjacent output pixels.
/// @param output_min - lower bound for clipping output values.
/// @param output_max - upper bound for clipping output values.
/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
///                   with [N, IH, IW, channels] dimensions.
/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
///                    with [N, OH, OW, channels] dimensions.
/// @param flags - binary features of the 2D Average Pooling Node. The only currently supported value is
///                XNN_FLAG_TENSORFLOW_SAME_PADDING.
enum xnn_status xnn_define_average_pooling_2d(
  xnn_subgraph_t subgraph,
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  uint32_t stride_height,
  uint32_t stride_width,
  float output_min,
  float output_max,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
579
/// Define a Fully Connected Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param output_min - lower bound for clipping output values.
/// @param output_max - upper bound for clipping output values.
/// @param input_id - Value ID for the input tensor. The input tensor must be an N-dimensional tensor defined in the
///                   @a subgraph. If XNN_FLAG_TENSORFLOW_RESHAPE_2D is not specified, the input tensor must be at least
///                   1D and its last dimension must match the last dimension of the filter tensor. In particular, if
///                   input is a 2D tensor, it must have [batch_size, input_channels] dimensions.
///                   If XNN_FLAG_TENSORFLOW_RESHAPE_2D is specified, the number of elements in the input tensor must be
///                   divisible by the input_channels. The tensor will be first flattened into a 1D tensor of
///                   [num_input_elements] dimensions, then reshaped into a 2D tensor of
///                   [num_input_elements / input_channels, input_channels] dimensions where num_input_elements is the
///                   total number of elements in the input tensor.
/// @param filter_id - Value ID for the filter tensor. The filter tensor must be a 2D tensor defined in the @a subgraph.
///                    If the XNN_FLAG_TRANSPOSE_WEIGHTS flag is not specified, the filter tensor must have
///                    [output_channels, input_channels] dimensions. If the XNN_FLAG_TRANSPOSE_WEIGHTS flag is
///                    specified, the filter tensor must have [input_channels, output_channels] dimensions.
/// @param bias_id - Value ID for the bias tensor, or XNN_INVALID_VALUE_ID for a Fully Connected Node without a bias.
///                  If present, the bias tensor must be a 1D tensor defined in the @a subgraph with [output_channels]
///                  dimensions.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph.
///                    If XNN_FLAG_TENSORFLOW_RESHAPE_2D is not specified, the output tensor must have the same
///                    dimensionality as the input tensor, all its dimensions but the last one must match the
///                    corresponding dimensions of the input tensor, and the last dimension of the output tensor must
///                    match the first dimension of the filter tensor. In particular, if input is a 2D tensor, output
///                    must be a 2D tensor of [batch_size, output_channels] dimensions.
///                    If XNN_FLAG_TENSORFLOW_RESHAPE_2D is specified, output must be a 2D tensor of
///                    [num_input_elements / input_channels, output_channels] dimensions where num_input_elements is the
///                    total number of elements in the input tensor.
/// @param flags - binary features of the Fully Connected Node. The only currently supported values are
///                XNN_FLAG_TENSORFLOW_RESHAPE_2D and XNN_FLAG_TRANSPOSE_WEIGHTS.
enum xnn_status xnn_define_fully_connected(
  xnn_subgraph_t subgraph,
  float output_min,
  float output_max,
  uint32_t input_id,
  uint32_t filter_id,
  uint32_t bias_id,
  uint32_t output_id,
  uint32_t flags);
621
/// Define a 2D Max Pooling Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_padding_top - implicit zero-padding above 2D input data. Must be 0 if XNN_FLAG_TENSORFLOW_SAME_PADDING
///                            flag is specified.
/// @param input_padding_right - implicit zero-padding to the right of 2D input data. Must be 0 if
///                              XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
/// @param input_padding_bottom - implicit zero-padding below 2D input data. Must be 0 if
///                               XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
/// @param input_padding_left - implicit zero-padding to the left of 2D input data. Must be 0 if
///                             XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
/// @param pooling_height - pooling (kernel) height.
/// @param pooling_width - pooling (kernel) width.
/// @param stride_height - displacing of the pooling window in the vertical dimension of the input pixels corresponding
///                        to vertically adjacent output pixels.
/// @param stride_width - displacing of the pooling window in the horizontal dimension of the input pixels corresponding
///                       to horizontally adjacent output pixels.
/// @param dilation_height - dilation of pooling elements along the height dimension.
/// @param dilation_width - dilation of pooling elements along the width dimension.
/// @param output_min - lower bound for clipping output values.
/// @param output_max - upper bound for clipping output values.
/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
///                   with [N, IH, IW, channels] dimensions.
/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
///                    with [N, OH, OW, channels] dimensions.
/// @param flags - binary features of the 2D Max Pooling Node. The only currently supported value is
///                XNN_FLAG_TENSORFLOW_SAME_PADDING.
enum xnn_status xnn_define_max_pooling_2d(
  xnn_subgraph_t subgraph,
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  uint32_t stride_height,
  uint32_t stride_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  float output_min,
  float output_max,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
666
/// Define a 2D ArgMax Pooling Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_padding_top - implicit zero-padding above 2D input data.
/// @param input_padding_right - implicit zero-padding to the right of 2D input data.
/// @param input_padding_bottom - implicit zero-padding below 2D input data.
/// @param input_padding_left - implicit zero-padding to the left of 2D input data.
/// @param pooling_height - pooling (kernel) height. Vertical stride between pooling regions match this value.
/// @param pooling_width - pooling (kernel) width. Horizontal stride between pooling regions match this value.
/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
///                   with [N, IH, IW, channels] dimensions.
/// @param output_value_id - Value ID for the output tensor with the maximum values in the pools. The output tensor must
///                          be a 4D tensor defined in the @a subgraph with [N, OH, OW, channels] dimensions.
/// @param output_index_id - Value ID for the output tensor with the indexes of the maximum values in the pools. The
///                          output tensor must be a 4D tensor defined in the @a subgraph with [N, OH, OW, channels]
///                          dimensions.
/// @param flags - binary features of the 2D ArgMax Pooling Node. No supported flags are currently defined.
enum xnn_status xnn_define_argmax_pooling_2d(
  xnn_subgraph_t subgraph,
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  uint32_t input_id,
  uint32_t output_value_id,
  uint32_t output_index_id,
  uint32_t flags);
696
/// Define a 2D UnPooling Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param padding_top - implicit padding above 2D output data.
/// @param padding_right - implicit padding to the right of 2D output data.
/// @param padding_bottom - implicit padding below 2D output data.
/// @param padding_left - implicit padding to the left of 2D output data.
/// @param pooling_height - height of the pooling window.
/// @param pooling_width - width of the pooling window.
/// @param input_value_id - Value ID for the input tensor with the max-pooling values to invert. The input value tensor
///                         must be a 4D tensor defined in the @a subgraph with [N, IH, IW, channels] dimensions.
/// @param input_index_id - Value ID for the input tensor with the indices of the per-pool maximum values produced by
///                         a 2D ArgMax Pooling Node. The input tensor must be a 4D tensor defined in the @a subgraph
///                         with [N, IH, IW, channels] dimensions.
/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
///                    with [N, OH, OW, channels] dimensions.
/// @param flags - binary features of the 2D UnPooling Node. No supported flags are currently defined.
enum xnn_status xnn_define_unpooling_2d(
  xnn_subgraph_t subgraph,
  uint32_t padding_top,
  uint32_t padding_right,
  uint32_t padding_bottom,
  uint32_t padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  uint32_t input_value_id,
  uint32_t input_index_id,
  uint32_t output_id,
  uint32_t flags);
726
/// Define a 2-Input Add Node and add it to a Subgraph.
///
/// The 2-Input Add Node computes elementwise addition of two tensor inputs with numpy broadcasting rules.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param output_min - lower bound for clipping output values.
/// @param output_max - upper bound for clipping output values.
/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the second
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
///                    that dimension.
/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the first
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
///                    that dimension.
/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined
///                    in the @a subgraph with each dimension equal to the maximum between the corresponding dimension
///                    of the two inputs.
/// @param flags - binary features of the Add Node. No supported flags are currently defined.
enum xnn_status xnn_define_add2(
  xnn_subgraph_t subgraph,
  float output_min,
  float output_max,
  uint32_t input1_id,
  uint32_t input2_id,
  uint32_t output_id,
  uint32_t flags);
754
/// Define a 2-Input Multiply Node and add it to a Subgraph.
///
/// The 2-Input Multiply Node computes elementwise multiplication of two tensor inputs with numpy broadcasting rules.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param output_min - lower bound for clipping output values.
/// @param output_max - upper bound for clipping output values.
/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the second
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
///                    that dimension.
/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the first
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
///                    that dimension.
/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined
///                    in the @a subgraph with each dimension equal to the maximum between the corresponding dimension
///                    of the two inputs.
/// @param flags - binary features of the Multiply Node. No supported flags are currently defined.
enum xnn_status xnn_define_multiply2(
  xnn_subgraph_t subgraph,
  float output_min,
  float output_max,
  uint32_t input1_id,
  uint32_t input2_id,
  uint32_t output_id,
  uint32_t flags);
782
/// Define a Subtract Node and add it to a Subgraph.
///
/// The Subtract Node computes elementwise subtraction of two tensor inputs with numpy broadcasting rules.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param output_min - lower bound for clipping output values.
/// @param output_max - upper bound for clipping output values.
/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the second
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
///                    that dimension.
/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the first
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
///                    that dimension.
/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined
///                    in the @a subgraph with each dimension equal to the maximum between the corresponding dimension
///                    of the two inputs.
/// @param flags - binary features of the Subtract Node. No supported flags are currently defined.
enum xnn_status xnn_define_subtract(
  xnn_subgraph_t subgraph,
  float output_min,
  float output_max,
  uint32_t input1_id,
  uint32_t input2_id,
  uint32_t output_id,
  uint32_t flags);
810
/// Define a Divide Node and add it to a Subgraph.
///
/// The Divide Node computes elementwise division of two tensor inputs with numpy broadcasting rules.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param output_min - lower bound for clipping output values.
/// @param output_max - upper bound for clipping output values.
/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the second
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
///                    that dimension.
/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the first
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
///                    that dimension.
/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined
///                    in the @a subgraph with each dimension equal to the maximum between the corresponding dimension
///                    of the two inputs.
/// @param flags - binary features of the Divide Node. No supported flags are currently defined.
enum xnn_status xnn_define_divide(
  xnn_subgraph_t subgraph,
  float output_min,
  float output_max,
  uint32_t input1_id,
  uint32_t input2_id,
  uint32_t output_id,
  uint32_t flags);
838
/// Define a 2-Input Maximum Node and add it to a Subgraph.
///
/// The 2-Input Maximum Node computes elementwise maximum of two tensor inputs with numpy broadcasting rules.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the second
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
///                    that dimension.
/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the first
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
///                    that dimension.
/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined
///                    in the @a subgraph with each dimension equal to the maximum between the corresponding dimension
///                    of the two inputs.
/// @param flags - binary features of the Maximum Node. No supported flags are currently defined.
enum xnn_status xnn_define_maximum2(
  xnn_subgraph_t subgraph,
  uint32_t input1_id,
  uint32_t input2_id,
  uint32_t output_id,
  uint32_t flags);
862
/// Define a 2-Input Minimum Node and add it to a Subgraph.
///
/// The 2-Input Minimum Node computes elementwise minimum of two tensor inputs with numpy broadcasting rules.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the second
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
///                    that dimension.
/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the first
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
///                    that dimension.
/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined
///                    in the @a subgraph with each dimension equal to the maximum between the corresponding dimension
///                    of the two inputs.
/// @param flags - binary features of the Minimum Node. No supported flags are currently defined.
enum xnn_status xnn_define_minimum2(
  xnn_subgraph_t subgraph,
  uint32_t input1_id,
  uint32_t input2_id,
  uint32_t output_id,
  uint32_t flags);
886
/// Define a Squared Difference Node and add it to a Subgraph.
///
/// The Squared Difference Node computes elementwise squared difference of two tensor inputs with numpy broadcasting
/// rules.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the second
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
///                    that dimension.
/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the first
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
///                    that dimension.
/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined
///                    in the @a subgraph with each dimension equal to the maximum between the corresponding dimension
///                    of the two inputs.
/// @param flags - binary features of the Squared Difference Node. No supported flags are currently defined.
enum xnn_status xnn_define_squared_difference(
  xnn_subgraph_t subgraph,
  uint32_t input1_id,
  uint32_t input2_id,
  uint32_t output_id,
  uint32_t flags);
911
/// Define a Constant Pad Node with static padding specification and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param pre_paddings - number of padding elements to insert before input elements for every dimension. This array
///                       must have as many elements as the number of dimensions in the input tensor.
/// @param post_paddings - number of padding elements to insert after input elements for every dimension. This array
///                        must have as many elements as the number of dimensions in the input tensor.
/// @param padding_value - constant value used to initialize padding elements.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the shape of the input tensor with padding.
/// @param flags - binary features of the Constant Pad Node. No supported flags are currently defined.
enum xnn_status xnn_define_static_constant_pad(
  xnn_subgraph_t subgraph,
  const size_t* pre_paddings,
  const size_t* post_paddings,
  float padding_value,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
932
/// Define a 2-Input Concatenate Node and add it to a Subgraph.
///
/// The 2-Input Concatenate Node concatenates two tensors along a specified axis.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param axis - the axis to concatenate the two input tensors along.
/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the
///                    second input.
/// @param input2_id - Value ID for the second input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the
///                    first input.
/// @param output_id - Value ID for the output tensor. The output tensor must be an N-dimensional tensor defined
///                    in the @a subgraph with each dimension equal to the dimension of both inputs, except the axis
///                    dimension, where it is the sum of the corresponding dimensions of both inputs.
/// @param flags - binary features of the Concatenate Node. No supported flags are currently defined.
enum xnn_status xnn_define_concatenate2(
  xnn_subgraph_t subgraph,
  size_t axis,
  uint32_t input1_id,
  uint32_t input2_id,
  uint32_t output_id,
  uint32_t flags);
956
/// Define a 3-Input Concatenate Node and add it to a Subgraph.
///
/// The 3-Input Concatenate Node concatenates three tensors along a specified axis.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param axis - the axis to concatenate the three input tensors along.
/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the
///                    other inputs.
/// @param input2_id - Value ID for the second input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the
///                    other inputs.
/// @param input3_id - Value ID for the third input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the
///                    other inputs.
/// @param output_id - Value ID for the output tensor. The output tensor must be an N-dimensional tensor defined
///                    in the @a subgraph with each dimension equal to the dimension of all inputs, except the axis
///                    dimension, where it is the sum of the corresponding dimensions of all inputs.
/// @param flags - binary features of the Concatenate Node. No supported flags are currently defined.
enum xnn_status xnn_define_concatenate3(
  xnn_subgraph_t subgraph,
  size_t axis,
  uint32_t input1_id,
  uint32_t input2_id,
  uint32_t input3_id,
  uint32_t output_id,
  uint32_t flags);
984
/// Define a 4-Input Concatenate Node and add it to a Subgraph.
///
/// The 4-Input Concatenate Node concatenates four tensors along a specified axis.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param axis - the axis to concatenate the four input tensors along.
/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the
///                    other inputs.
/// @param input2_id - Value ID for the second input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the
///                    other inputs.
/// @param input3_id - Value ID for the third input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the
///                    other inputs.
/// @param input4_id - Value ID for the fourth input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the
///                    other inputs.
/// @param output_id - Value ID for the output tensor. The output tensor must be an N-dimensional tensor defined
///                    in the @a subgraph with each dimension equal to the dimension of all inputs, except the axis
///                    dimension, where it is the sum of the corresponding dimensions of all inputs.
/// @param flags - binary features of the Concatenate Node. No supported flags are currently defined.
enum xnn_status xnn_define_concatenate4(
  xnn_subgraph_t subgraph,
  size_t axis,
  uint32_t input1_id,
  uint32_t input2_id,
  uint32_t input3_id,
  uint32_t input4_id,
  uint32_t output_id,
  uint32_t flags);
1016
/// Define a Copy Node and add it to a Subgraph.
///
/// The Copy Node copies an input tensor to an output tensor.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the shape of the input tensor.
/// @param flags - binary features of the Copy Node. No supported flags are currently defined.
enum xnn_status xnn_define_copy(
  xnn_subgraph_t subgraph,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
1031
/// Define a 2-Output Split Node and add it to a Subgraph.
///
/// The 2-Output Split Node splits an input tensor into two output tensors along a specified axis evenly.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param split_dim - the dimension to split the input tensor along.
/// @param input_id - Value ID for the input tensor. The input tensor must be an N-dimensional tensor defined in the @a
///                   subgraph.
/// @param output1_id - Value ID for the first output tensor. The output tensor must be an N-dimensional tensor defined
///                     in the @a subgraph with each dimension, except the axis, equal to the corresponding dimension
///                     of the second output. The split_dim dimension is half of the input's split_dim.
/// @param output2_id - Value ID for the second output tensor. The output tensor must be an N-dimensional tensor
///                     defined in the @a subgraph with each dimension, except the axis, equal to the corresponding
///                     dimension of the first output. The split_dim dimension is half of the input's split_dim.
/// @param flags - binary features of the Split Node. No supported flags are currently defined.
enum xnn_status xnn_define_even_split2(
  xnn_subgraph_t subgraph,
  size_t split_dim,
  uint32_t input_id,
  uint32_t output1_id,
  uint32_t output2_id,
  uint32_t flags);
1054
/// Define a 3-Output Split Node and add it to a Subgraph.
///
/// The 3-Output Split Node splits an input tensor into three output tensors along a specified axis evenly.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param split_dim - the dimension to split the input tensor along.
/// @param input_id - Value ID for the input tensor. The input tensor must be an N-dimensional tensor defined in the @a
///                   subgraph.
/// @param output1_id - Value ID for the first output tensor. The output tensor must be an N-dimensional tensor defined
///                     in the @a subgraph with each dimension, except the axis, equal to the corresponding dimension
///                     of the second and third output. The split_dim dimension is one third of the input's split_dim.
/// @param output2_id - Value ID for the second output tensor. The output tensor must be an N-dimensional tensor
///                     defined in the @a subgraph with each dimension, except the axis, equal to the corresponding
///                     dimension of the first and third output. The split_dim dimension is one third of the input's
///                     split_dim.
/// @param output3_id - Value ID for the third output tensor. The output tensor must be an N-dimensional tensor
///                     defined in the @a subgraph with each dimension, except the axis, equal to the corresponding
///                     dimension of the first and second output. The split_dim dimension is one third of the input's
///                     split_dim.
/// @param flags - binary features of the Split Node. No supported flags are currently defined.
enum xnn_status xnn_define_even_split3(
  xnn_subgraph_t subgraph,
  size_t split_dim,
  uint32_t input_id,
  uint32_t output1_id,
  uint32_t output2_id,
  uint32_t output3_id,
  uint32_t flags);
1083
/// Define a 4-Output Split Node and add it to a Subgraph.
///
/// The 4-Output Split Node splits an input tensor into four output tensors along a specified axis evenly.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param split_dim - the dimension to split the input tensor along.
/// @param input_id - Value ID for the input tensor. The input tensor must be an N-dimensional tensor defined in the @a
///                   subgraph.
/// @param output1_id - Value ID for the first output tensor. The output tensor must be an N-dimensional tensor defined
///                     in the @a subgraph with each dimension, except the axis, equal to the corresponding dimension
///                     of the other output tensors. The split_dim dimension is one fourth of the input's split_dim.
/// @param output2_id - Value ID for the second output tensor. The output tensor must be an N-dimensional tensor
///                     defined in the @a subgraph with each dimension, except the axis, equal to the corresponding
///                     dimension of the other output tensors. The split_dim dimension is one fourth of the input's
///                     split_dim.
/// @param output3_id - Value ID for the third output tensor. The output tensor must be an N-dimensional tensor
///                     defined in the @a subgraph with each dimension, except the axis, equal to the corresponding
///                     dimension of the other output tensors. The split_dim dimension is one fourth of the input's
///                     split_dim.
/// @param output4_id - Value ID for the fourth output tensor. The output tensor must be an N-dimensional tensor
///                     defined in the @a subgraph with each dimension, except the axis, equal to the corresponding
///                     dimension of the other output tensors. The split_dim dimension is one fourth of the input's
///                     split_dim.
/// @param flags - binary features of the Split Node. No supported flags are currently defined.
enum xnn_status xnn_define_even_split4(
  xnn_subgraph_t subgraph,
  size_t split_dim,
  uint32_t input_id,
  uint32_t output1_id,
  uint32_t output2_id,
  uint32_t output3_id,
  uint32_t output4_id,
  uint32_t flags);
1117
/// Define a Reshape Node with static shape specification and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param num_dims - number of shape dimensions in the output tensor.
/// @param new_shape - shape dimensions of the output tensor.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the @a new_shape dimensions. The output tensor must have the same total number
///                    of elements as the input tensor.
/// @param flags - binary features of the Reshape Node. No supported flags are currently defined.
enum xnn_status xnn_define_static_reshape(
  xnn_subgraph_t subgraph,
  size_t num_dims,
  const size_t* new_shape,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
1134
1135/// Define a 2D Resize Bilinear Node with static output height & width specification and add it to a Subgraph.
1136///
1137/// @param subgraph - a Subgraph object that will own the created Node.
1138/// @param new_height - height dimension of the output tensor.
1139/// @param new_width - width dimension of the output tensor.
1140/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
1141/// with [N, H, W, C] dimensions.
1142/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
1143/// with [N, new_height, new_width, C] dimensions.
1144/// @param flags - binary features of the 2D Resize Bilinear Node. The only currently supported values are
1145/// XNN_FLAG_TENSORFLOW_LEGACY_MODE and XNN_FLAG_ALIGN_CORNERS, which are mutually exclusive.
1146enum xnn_status xnn_define_static_resize_bilinear_2d(
1147 xnn_subgraph_t subgraph,
1148 size_t new_height,
1149 size_t new_width,
1150 uint32_t input_id,
1151 uint32_t output_id,
1152 uint32_t flags);
1153
/// Define a PReLU (Parametric ReLU) Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
///                   with [N, H, W, channels] dimensions.
/// @param slope_id - Value ID for the slope tensor. The slope tensor must be a 1D tensor defined in the @a subgraph
///                   with [channels] dimensions.
/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
///                    with [N, H, W, channels] dimensions.
/// @param flags - binary features of the PReLU Node. No supported flags are currently defined.
enum xnn_status xnn_define_prelu(
  xnn_subgraph_t subgraph,
  uint32_t input_id,
  uint32_t slope_id,
  uint32_t output_id,
  uint32_t flags);
1170
/// Define an Abs Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the shape of the input tensor.
/// @param flags - binary features of the Abs Node. No supported flags are currently defined.
enum xnn_status xnn_define_abs(
  xnn_subgraph_t subgraph,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
1183
1184/// Define a Bankers' Rounding Node and add it to a Subgraph.
1185///
1186/// @param subgraph - a Subgraph object that will own the created Node.
1187/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
1188/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1189/// shape must match the shape of the input tensor.
1190/// @param flags - binary features of the Bankers' Rounding Node. No supported flags are currently defined.
1191enum xnn_status xnn_define_bankers_rounding(
1192 xnn_subgraph_t subgraph,
1193 uint32_t input_id,
1194 uint32_t output_id,
1195 uint32_t flags);
1196
1197/// Define a Ceiling Node and add it to a Subgraph.
1198///
1199/// @param subgraph - a Subgraph object that will own the created Node.
1200/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
1201/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1202/// shape must match the shape of the input tensor.
1203/// @param flags - binary features of the Ceiling Node. No supported flags are currently defined.
1204enum xnn_status xnn_define_ceiling(
1205 xnn_subgraph_t subgraph,
1206 uint32_t input_id,
1207 uint32_t output_id,
1208 uint32_t flags);
1209
1210/// Define a Clamp Node and add it to a Subgraph.
1211///
1212/// @param subgraph - a Subgraph object that will own the created Node.
1213/// @param output_min - lower bound for clipping output values.
1214/// @param output_max - upper bound for clipping output values.
1215/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
1216/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1217/// shape must match the shape of the input tensor.
1218/// @param flags - binary features of the Clamp Node. No supported flags are currently defined.
1219enum xnn_status xnn_define_clamp(
1220 xnn_subgraph_t subgraph,
1221 float output_min,
1222 float output_max,
1223 uint32_t input_id,
1224 uint32_t output_id,
1225 uint32_t flags);
1226
1227/// Define an ELU (Exponential Linear Unit) Node and add it to a Subgraph.
1228///
1229/// @param subgraph - a Subgraph object that will own the created Node.
1230/// @param alpha - scale factor for negative output elements.
1231/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
1232/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1233/// shape must match the shape of the input tensor.
1234/// @param flags - binary features of the ELU Node. No supported flags are currently defined.
1235enum xnn_status xnn_define_elu(
1236 xnn_subgraph_t subgraph,
1237 float alpha,
1238 uint32_t input_id,
1239 uint32_t output_id,
1240 uint32_t flags);
1241
1242/// Define a Floor Node and add it to a Subgraph.
1243///
1244/// @param subgraph - a Subgraph object that will own the created Node.
1245/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
1246/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1247/// shape must match the shape of the input tensor.
1248/// @param flags - binary features of the Floor Node. No supported flags are currently defined.
1249enum xnn_status xnn_define_floor(
1250 xnn_subgraph_t subgraph,
1251 uint32_t input_id,
1252 uint32_t output_id,
1253 uint32_t flags);
1254
1255/// Define a HardSwish Node and add it to a Subgraph.
1256///
1257/// @param subgraph - a Subgraph object that will own the created Node.
1258/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
1259/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1260/// shape must match the shape of the input tensor.
1261/// @param flags - binary features of the HardSwish Node. No supported flags are currently defined.
1262enum xnn_status xnn_define_hardswish(
1263 xnn_subgraph_t subgraph,
1264 uint32_t input_id,
1265 uint32_t output_id,
1266 uint32_t flags);
1267
1268/// Define a Leaky ReLU Node and add it to a Subgraph.
1269///
1270/// @param subgraph - a Subgraph object that will own the created Node.
1271/// @param negative_slope - scale factor for negative input elements.
1272/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
1273/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1274/// shape must match the shape of the input tensor.
1275/// @param flags - binary features of the Leaky ReLU Node. No supported flags are currently defined.
1276enum xnn_status xnn_define_leaky_relu(
1277 xnn_subgraph_t subgraph,
1278 float negative_slope,
1279 uint32_t input_id,
1280 uint32_t output_id,
1281 uint32_t flags);
1282
1283/// Define a Negate Node and add it to a Subgraph.
1284///
1285/// @param subgraph - a Subgraph object that will own the created Node.
1286/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
1287/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1288/// shape must match the shape of the input tensor.
1289/// @param flags - binary features of the Negate Node. No supported flags are currently defined.
1290enum xnn_status xnn_define_negate(
1291 xnn_subgraph_t subgraph,
1292 uint32_t input_id,
1293 uint32_t output_id,
1294 uint32_t flags);
1295
1296/// Define a Sigmoid Node and add it to a Subgraph.
1297///
1298/// @param subgraph - a Subgraph object that will own the created Node.
1299/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
1300/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1301/// shape must match the shape of the input tensor.
1302/// @param flags - binary features of the Sigmoid Node. No supported flags are currently defined.
1303enum xnn_status xnn_define_sigmoid(
1304 xnn_subgraph_t subgraph,
1305 uint32_t input_id,
1306 uint32_t output_id,
1307 uint32_t flags);
1308
1309/// Define a SoftMax Node and add it to a Subgraph.
1310///
1311/// @param subgraph - a Subgraph object that will own the created Node.
1312/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph, and have at
1313/// least one dimension.
1314/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1315/// shape must match the shape of the input tensor.
1316/// @param flags - binary features of the SoftMax Node. No supported flags are currently defined.
1317enum xnn_status xnn_define_softmax(
1318 xnn_subgraph_t subgraph,
1319 uint32_t input_id,
1320 uint32_t output_id,
1321 uint32_t flags);
1322
1323/// Define a Square Node and add it to a Subgraph.
1324///
1325/// @param subgraph - a Subgraph object that will own the created Node.
1326/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
1327/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1328/// shape must match the shape of the input tensor.
1329/// @param flags - binary features of the Square Node. No supported flags are currently defined.
1330enum xnn_status xnn_define_square(
1331 xnn_subgraph_t subgraph,
1332 uint32_t input_id,
1333 uint32_t output_id,
1334 uint32_t flags);
1335
1336/// Define a Square Root Node and add it to a Subgraph.
1337///
1338/// @param subgraph - a Subgraph object that will own the created Node.
1339/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
1340/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1341/// shape must match the shape of the input tensor.
1342/// @param flags - binary features of the Square Root Node. No supported flags are currently defined.
1343enum xnn_status xnn_define_square_root(
1344 xnn_subgraph_t subgraph,
1345 uint32_t input_id,
1346 uint32_t output_id,
1347 uint32_t flags);
1348
/// Define a Static Transpose Node and add it to a Subgraph.
///
/// The Static Transpose Node applies a generalized transpose to the input tensor using the permutation in perm.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_id - Value ID for the input tensor. The input tensor must be an N-dimensional tensor defined in
///                   the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be an N-dimensional tensor defined
///                    in the @a subgraph with each dimension equal to its corresponding permuted input dimension.
/// @param num_dims - the number of permutation dimensions. This must be equal to the number of input dimensions.
/// @param perm - The permutation of the axis of the input tensor. The perm array must contain 0 to N-1 in the
///               permuted order.
/// @param flags - binary features of the Static Transpose Node. No supported flags are currently defined.
enum xnn_status xnn_define_static_transpose(
  xnn_subgraph_t subgraph,
  size_t num_dims,
  const size_t* perm,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
1369
1370/// Weights cache is a cache for packed weights. It can be reused between runtimes.
1371typedef struct xnn_weights_cache* xnn_weights_cache_t;
1372
/// Create a weights cache object with a default initial size. See @ref xnn_create_weights_cache_with_size to specify
/// the initial capacity explicitly.
/// @param weights_cache_out - pointer to the variable that will be initialized to a handle to the weights cache object
///                            upon successful return. Once created, the weights cache object can be shared between
///                            different Runtime objects.
enum xnn_status xnn_create_weights_cache(xnn_weights_cache_t* weights_cache_out);
1374
/// Create a weights cache object specifying the initial size of weights cache (in bytes).
/// @param size - initial capacity of the weights cache (in bytes), i.e. it can hold size bytes without growing.
/// @param weights_cache_out - pointer to the variable that will be initialized to a handle to the weights cache object
///                            upon successful return. Once created, the weights cache object can be shared between
///                            different Runtime objects.
enum xnn_status xnn_create_weights_cache_with_size(size_t size, xnn_weights_cache_t* weights_cache_out);
1381
1382
/// Weights cache can be finalized in these ways:
enum xnn_weights_cache_finalization_kind {
  /// Weights cache is finalized, no insert operations into the weights cache is allowed, even if the "inserted"
  /// weights already exist in the cache. Weights cache memory will also be trimmed to page boundary and set to
  /// read-only (to prevent writes).
  xnn_weights_cache_finalization_kind_hard,
  /// Weights cache will be finalized with some extra space at the end, this allows for "inserting" into the cache only
  /// if the weights are already in the cache, and errors on inserting uncached weights. There is memory overhead.
  xnn_weights_cache_finalization_kind_soft,
};
1393
1394/// Finalizes the weights cache. The kind of finalization is specified by `finalization_kind`.
1395/// @param weights_cache - the weights cache object to finalize.
1396/// @param finalization_kind - the kind of finalization.
1397enum xnn_status xnn_finalize_weights_cache(
1398 xnn_weights_cache_t weights_cache,
1399 enum xnn_weights_cache_finalization_kind finalization_kind);
1400
1401/// Destroy a weights cache object, as well as memory used for the cache.
1402/// @param weights_cache - the weights cache object to destroy.
1403enum xnn_status xnn_delete_weights_cache(xnn_weights_cache_t weights_cache);
1404
1405typedef struct xnn_workspace* xnn_workspace_t;
1406
1407/// Create a workspace object.
1408/// @param workspace_out - pointer to the variable that will be initialized to a handle to the workspace object upon
1409/// successful return. Once created, the workspace can be shared between different Runtime
1410/// objects.
1411enum xnn_status xnn_create_workspace(xnn_workspace_t* workspace_out);
1412/// Destroy a workspace object, as well as memory used by the workspace. Object destruction can be deferred until all
1413/// Runtime objects created with this workspace are destroyed.
1414/// @param workspace - the workspace object to destroy.
1415enum xnn_status xnn_release_workspace(xnn_workspace_t workspace);
1416
1417/// Runtime is a combination of an execution plan for subgraph Nodes and a memory manager for subgraph Values.
1418typedef struct xnn_runtime* xnn_runtime_t;
1419
1420enum xnn_profile_info {
1421 /// Returns a size_t containing the number of operators.
1422 xnn_profile_info_num_operators,
1423 /// Returns a char[] containing the null character separated names of all operators.
1424 xnn_profile_info_operator_name,
1425 /// Returns a uint64_t[] with the runtimes of all operators in the same order as xnn_profile_info_operator_name.
1426 xnn_profile_info_operator_timing,
1427};
1428
1429/// Return profile information for all operators.
1430///
1431/// @param runtime - a Runtime object created with @ref xnn_create_runtime, @ref xnn_create_runtime_v2 or
1432/// @ref xnn_create_runtime_v3.
1433/// @param param_name - type of profile information required.
1434/// @param param_value_size - the size in bytes of memory pointed to by param_value. If this is not sufficient then
1435/// param_value_size_ret will be set to the required size and xnn_status_out_of_memory will be
1436/// returned.
1437/// @param param_value - a pointer to memory location where appropriate values for a given param_value will be written.
1438/// @param param_value_size_ret - returns number of bytes required to write the result if param_value_size is not
1439/// sufficient.
1440enum xnn_status xnn_get_runtime_profiling_info(xnn_runtime_t runtime,
1441 enum xnn_profile_info param_name,
1442 size_t param_value_size,
1443 void* param_value,
1444 size_t* param_value_size_ret);
1445
1446/// Create a Runtime object from a subgraph.
1447///
1448/// @param subgraph - a Subgraph object with all Values and Nodes that would be handled by the runtime. No Values or
1449/// Nodes can be added to the runtime once it is constructed.
1450/// @param weights_cache - a cache for packed weights. The runtime will look up and reuse packed weights in this cache,
1451/// this will reduce memory allocated for packed weights.
1452/// @param workspace - a workspace to hold internal tensors. The runtime will allocate space used for internal tensors
1453/// and track them using workspace. Workspace can be shared and reused across different runtimes. If
1454/// workspace is NULL, there will be no sharing: each runtime has its own workspace.
1455/// @param threadpool - the thread pool to be used for parallelisation of computations in the runtime. If the thread
1456/// pool is NULL, the computation would run on the caller thread without parallelization.
1457/// @param flags - binary features of the runtime. The only currently supported values are
1458/// XNN_FLAG_HINT_SPARSE_INFERENCE, XNN_FLAG_HINT_FP16_INFERENCE, XNN_FLAG_FORCE_FP16_INFERENCE, and
1459/// XNN_FLAG_YIELD_WORKERS. If XNN_FLAG_YIELD_WORKERS is specified, worker threads would be yielded to
1460/// the system scheduler after processing the last operator in the Runtime.
1461/// @param runtime_out - pointer to the variable that will be initialized with a handle to the Runtime object upon
1462/// successful return. Once constructed, the Runtime object is independent of the Subgraph object
1463/// used to create it.
1464enum xnn_status xnn_create_runtime_v4(
1465 xnn_subgraph_t subgraph,
1466 xnn_weights_cache_t weights_cache,
1467 xnn_workspace_t workspace,
1468 pthreadpool_t threadpool,
1469 uint32_t flags,
1470 xnn_runtime_t* runtime_out);
1471
/// Create a Runtime object from a subgraph, with an optional weights cache but without a shared workspace.
/// See @ref xnn_create_runtime_v4 for the description of the common parameters.
enum xnn_status xnn_create_runtime_v3(
  xnn_subgraph_t subgraph,
  xnn_weights_cache_t weights_cache,
  pthreadpool_t threadpool,
  uint32_t flags,
  xnn_runtime_t* runtime_out);
1478
/// Create a Runtime object from a subgraph, without a weights cache or a shared workspace.
/// See @ref xnn_create_runtime_v4 for the description of the common parameters.
enum xnn_status xnn_create_runtime_v2(
  xnn_subgraph_t subgraph,
  pthreadpool_t threadpool,
  uint32_t flags,
  xnn_runtime_t* runtime_out);
1484
/// Create a Runtime object from a subgraph. This variant takes no thread pool, weights cache, workspace, or flags;
/// presumably the defaults of @ref xnn_create_runtime_v4 apply — confirm against the implementation.
enum xnn_status xnn_create_runtime(
  xnn_subgraph_t subgraph,
  xnn_runtime_t* runtime_out);
1488
/// Location information for one external input or output tensor of a Runtime (see @ref xnn_setup_runtime).
struct xnn_external_value {
  /// Value ID of the external tensor, as assigned when the Value was defined in the Subgraph.
  uint32_t id;
  /// Pointer to the external tensor data.
  void* data;
};
1493
/// Setup data pointers for external inputs and outputs in a Runtime object.
///
/// @param runtime - a Runtime object created with @ref xnn_create_runtime, @ref xnn_create_runtime_v2,
///                  @ref xnn_create_runtime_v3, or @ref xnn_create_runtime_v4.
/// @param num_external_values - the number of external inputs and outputs specified in this call. This number must
///                              match the number of external inputs and outputs in the runtime, i.e. all external
///                              inputs and outputs in the runtime must be specified in one call.
/// @param external_values - array with location information for all external inputs and outputs in the runtime.
enum xnn_status xnn_setup_runtime(
  xnn_runtime_t runtime,
  size_t num_external_values,
  const struct xnn_external_value* external_values);
1505
1506/// Execute forward pass for all operators in the runtime.
1507///
1508/// @param runtime - the Runtime object with the execution plan to invoke.
1509enum xnn_status xnn_invoke_runtime(
1510 xnn_runtime_t runtime);
1511
1512/// Destroy a Runtime object, as well as operators and memory associated with it.
1513///
1514/// @param runtime - the Runtime object to destroy.
1515enum xnn_status xnn_delete_runtime(
1516 xnn_runtime_t runtime);
1517
/// Operator is an opaque handle to a single XNNPACK compute primitive.
typedef struct xnn_operator* xnn_operator_t;

/// Run a previously created and set-up operator.
///
/// @param op - the operator object to run.
/// @param threadpool - the thread pool to parallelize the computation over. NOTE(review): presumably a NULL thread
///                     pool runs the computation on the caller thread, matching the Runtime documentation above —
///                     confirm against the implementation.
enum xnn_status xnn_run_operator(
  xnn_operator_t op,
  pthreadpool_t threadpool);

/// Destroy an operator object, as well as memory associated with it.
///
/// @param op - the operator object to destroy.
enum xnn_status xnn_delete_operator(
  xnn_operator_t op);
1526
1527#ifndef XNN_NO_F32_OPERATORS
1528
1529enum xnn_status xnn_create_abs_nc_f32(
1530 size_t channels,
1531 size_t input_stride,
1532 size_t output_stride,
1533 uint32_t flags,
1534 xnn_operator_t* abs_op_out);
1535
1536enum xnn_status xnn_setup_abs_nc_f32(
1537 xnn_operator_t abs_op,
1538 size_t batch_size,
1539 const float* input,
1540 float* output,
1541 pthreadpool_t threadpool);
1542
1543enum xnn_status xnn_create_add_nd_f32(
1544 float output_min,
1545 float output_max,
1546 uint32_t flags,
1547 xnn_operator_t* add_op_out);
1548
1549enum xnn_status xnn_setup_add_nd_f32(
1550 xnn_operator_t add_op,
1551 size_t num_input1_dims,
1552 const size_t* input1_shape,
1553 size_t num_input2_dims,
1554 const size_t* input2_shape,
1555 const float* input1,
1556 const float* input2,
1557 float* output,
1558 pthreadpool_t threadpool);
1559
1560enum xnn_status xnn_run_add_nd_f32(
1561 size_t num_input1_dims,
1562 const size_t* input1_shape,
1563 size_t num_input2_dims,
1564 const size_t* input2_shape,
1565 const float* input1,
1566 const float* input2,
1567 float* output,
1568 float output_min,
1569 float output_max,
1570 uint32_t flags,
1571 pthreadpool_t threadpool);
1572
1573enum xnn_status xnn_create_argmax_pooling2d_nhwc_f32(
1574 uint32_t input_padding_top,
1575 uint32_t input_padding_right,
1576 uint32_t input_padding_bottom,
1577 uint32_t input_padding_left,
1578 uint32_t pooling_height,
1579 uint32_t pooling_width,
1580 size_t channels,
1581 size_t input_pixel_stride,
1582 size_t output_pixel_stride,
1583 uint32_t flags,
1584 xnn_operator_t* argmax_pooling_op_out);
1585
1586enum xnn_status xnn_setup_argmax_pooling2d_nhwc_f32(
1587 xnn_operator_t argmax_pooling_op,
1588 size_t batch_size,
1589 size_t input_height,
1590 size_t input_width,
1591 const float* input,
1592 float* output,
1593 uint32_t* index,
1594 pthreadpool_t threadpool);
1595
1596enum xnn_status xnn_create_average_pooling2d_nhwc_f32(
1597 uint32_t input_padding_top,
1598 uint32_t input_padding_right,
1599 uint32_t input_padding_bottom,
1600 uint32_t input_padding_left,
1601 uint32_t pooling_height,
1602 uint32_t pooling_width,
1603 uint32_t stride_height,
1604 uint32_t stride_width,
1605 size_t channels,
1606 size_t input_pixel_stride,
1607 size_t output_pixel_stride,
1608 float output_min,
1609 float output_max,
1610 uint32_t flags,
1611 xnn_operator_t* average_pooling_op_out);
1612
1613enum xnn_status xnn_setup_average_pooling2d_nhwc_f32(
1614 xnn_operator_t average_pooling_op,
1615 size_t batch_size,
1616 size_t input_height,
1617 size_t input_width,
1618 const float* input,
1619 float* output,
1620 pthreadpool_t threadpool);
1621
1622enum xnn_status xnn_create_bankers_rounding_nc_f32(
1623 size_t channels,
1624 size_t input_stride,
1625 size_t output_stride,
1626 uint32_t flags,
1627 xnn_operator_t* rounding_op_out);
1628
1629enum xnn_status xnn_setup_bankers_rounding_nc_f32(
1630 xnn_operator_t rounding_op,
1631 size_t batch_size,
1632 const float* input,
1633 float* output,
1634 pthreadpool_t threadpool);
1635
1636enum xnn_status xnn_create_ceiling_nc_f32(
1637 size_t channels,
1638 size_t input_stride,
1639 size_t output_stride,
1640 uint32_t flags,
1641 xnn_operator_t* ceiling_op_out);
1642
1643enum xnn_status xnn_setup_ceiling_nc_f32(
1644 xnn_operator_t ceiling_op,
1645 size_t batch_size,
1646 const float* input,
1647 float* output,
1648 pthreadpool_t threadpool);
1649
1650enum xnn_status xnn_create_clamp_nc_f32(
1651 size_t channels,
1652 size_t input_stride,
1653 size_t output_stride,
1654 float output_min,
1655 float output_max,
1656 uint32_t flags,
1657 xnn_operator_t* clamp_op_out);
1658
1659enum xnn_status xnn_setup_clamp_nc_f32(
1660 xnn_operator_t clamp_op,
1661 size_t batch_size,
1662 const float* input,
1663 float* output,
1664 pthreadpool_t threadpool);
1665
/// Opaque handle to a set of caches passed to operator creation functions (see the `caches` parameters below);
/// the cache contents are defined elsewhere in the project.
typedef const struct xnn_caches* xnn_caches_t;
1667
1668enum xnn_status xnn_create_convolution2d_nhwc_f32(
1669 uint32_t input_padding_top,
1670 uint32_t input_padding_right,
1671 uint32_t input_padding_bottom,
1672 uint32_t input_padding_left,
1673 uint32_t kernel_height,
1674 uint32_t kernel_width,
1675 uint32_t subsampling_height,
1676 uint32_t subsampling_width,
1677 uint32_t dilation_height,
1678 uint32_t dilation_width,
1679 uint32_t groups,
1680 size_t group_input_channels,
1681 size_t group_output_channels,
1682 size_t input_channel_stride,
1683 size_t output_channel_stride,
1684 const float* kernel,
1685 const float* bias,
1686 float output_min,
1687 float output_max,
1688 uint32_t flags,
1689 xnn_caches_t caches,
1690 xnn_operator_t* convolution_op_out);
1691
1692// Forward declare.
1693struct xnn_post_operation;
1694
1695/// Create a convolution operator with a number of post operations. The
1696/// convolution operator created using this function does not have output_min
1697/// and output_max. The list of operators in post_operations will be applied in
1698/// order. Convolution with post operations is only supported on JIT platforms
1699/// and when JIT is enabled.
1700enum xnn_status xnn_create_fused_convolution2d_nhwc_f32(
1701 uint32_t input_padding_top,
1702 uint32_t input_padding_right,
1703 uint32_t input_padding_bottom,
1704 uint32_t input_padding_left,
1705 uint32_t kernel_height,
1706 uint32_t kernel_width,
1707 uint32_t subsampling_height,
1708 uint32_t subsampling_width,
1709 uint32_t dilation_height,
1710 uint32_t dilation_width,
1711 uint32_t groups,
1712 size_t group_input_channels,
1713 size_t group_output_channels,
1714 size_t input_channel_stride,
1715 size_t output_channel_stride,
1716 const float* kernel,
1717 const float* bias,
1718 size_t num_post_operations,
1719 struct xnn_post_operation* post_operations,
1720 uint32_t flags,
1721 xnn_caches_t caches,
1722 xnn_operator_t* convolution_op_out);
1723
1724enum xnn_status xnn_setup_convolution2d_nhwc_f32(
1725 xnn_operator_t convolution_op,
1726 size_t batch_size,
1727 size_t input_height,
1728 size_t input_width,
1729 const float* input,
1730 float* output,
1731 pthreadpool_t threadpool);
1732
1733enum xnn_status xnn_create_deconvolution2d_nhwc_f32(
1734 uint32_t output_padding_top,
1735 uint32_t output_padding_right,
1736 uint32_t output_padding_bottom,
1737 uint32_t output_padding_left,
1738 uint32_t kernel_height,
1739 uint32_t kernel_width,
1740 uint32_t stride_height,
1741 uint32_t stride_width,
1742 uint32_t dilation_height,
1743 uint32_t dilation_width,
1744 uint32_t groups,
1745 size_t group_input_channels,
1746 size_t group_output_channels,
1747 size_t input_pixel_stride,
1748 size_t output_pixel_stride,
1749 const float* kernel,
1750 const float* bias,
1751 float output_min,
1752 float output_max,
1753 uint32_t flags,
1754 xnn_caches_t caches,
1755 xnn_operator_t* deconvolution_op_out);
1756
1757enum xnn_status xnn_setup_deconvolution2d_nhwc_f32(
1758 xnn_operator_t deconvolution_op,
1759 size_t batch_size,
1760 size_t input_height,
1761 size_t input_width,
1762 uint32_t adjustment_height,
1763 uint32_t adjustment_width,
1764 const float* input,
1765 float* output,
1766 pthreadpool_t threadpool);
1767
1768enum xnn_status xnn_create_divide_nd_f32(
1769 float output_min,
1770 float output_max,
1771 uint32_t flags,
1772 xnn_operator_t* divide_op_out);
1773
1774enum xnn_status xnn_setup_divide_nd_f32(
1775 xnn_operator_t divide_op,
1776 size_t num_input1_dims,
1777 const size_t* input1_shape,
1778 size_t num_input2_dims,
1779 const size_t* input2_shape,
1780 const float* input1,
1781 const float* input2,
1782 float* output,
1783 pthreadpool_t threadpool);
1784
1785enum xnn_status xnn_create_elu_nc_f32(
1786 size_t channels,
1787 size_t input_stride,
1788 size_t output_stride,
1789 float alpha,
1790 uint32_t flags,
1791 xnn_operator_t* elu_op_out);
1792
1793enum xnn_status xnn_setup_elu_nc_f32(
1794 xnn_operator_t elu_op,
1795 size_t batch_size,
1796 const float* input,
1797 float* output,
1798 pthreadpool_t threadpool);
1799
1800enum xnn_status xnn_create_floor_nc_f32(
1801 size_t channels,
1802 size_t input_stride,
1803 size_t output_stride,
1804 uint32_t flags,
1805 xnn_operator_t* floor_op_out);
1806
1807enum xnn_status xnn_setup_floor_nc_f32(
1808 xnn_operator_t floor_op,
1809 size_t batch_size,
1810 const float* input,
1811 float* output,
1812 pthreadpool_t threadpool);
1813
1814enum xnn_status xnn_create_fully_connected_nc_f32(
1815 size_t input_channels,
1816 size_t output_channels,
1817 size_t input_stride,
1818 size_t output_stride,
1819 const float* kernel,
1820 const float* bias,
1821 float output_min,
1822 float output_max,
1823 uint32_t flags,
1824 const xnn_caches_t caches,
1825 xnn_operator_t* fully_connected_op_out);
1826
/// Set up a Fully Connected operator (NC layout, F32) with a batch size and input/output pointers.
enum xnn_status xnn_setup_fully_connected_nc_f32(
  xnn_operator_t fully_connected_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a Global Average Pooling operator with NWC layout and F32 data type.
enum xnn_status xnn_create_global_average_pooling_nwc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* global_average_pooling_op_out);

/// Set up a Global Average Pooling operator (NWC, F32) with a batch size, width, and data pointers.
enum xnn_status xnn_setup_global_average_pooling_nwc_f32(
  xnn_operator_t global_average_pooling_op,
  size_t batch_size,
  size_t width,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a HardSwish operator with NC layout and F32 data type.
enum xnn_status xnn_create_hardswish_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* hardswish_op_out);

/// Set up a HardSwish operator (NC, F32) with a batch size and data pointers.
enum xnn_status xnn_setup_hardswish_nc_f32(
  xnn_operator_t hardswish_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a Leaky ReLU operator with NC layout and F32 data type.
enum xnn_status xnn_create_leaky_relu_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float negative_slope,
  uint32_t flags,
  xnn_operator_t* leaky_relu_op_out);

/// Set up a Leaky ReLU operator (NC, F32) with a batch size and data pointers.
enum xnn_status xnn_setup_leaky_relu_nc_f32(
  xnn_operator_t leaky_relu_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a 2D Max Pooling operator with NHWC layout and F32 data type.
enum xnn_status xnn_create_max_pooling2d_nhwc_f32(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  uint32_t stride_height,
  uint32_t stride_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* max_pooling_op_out);

/// Set up a 2D Max Pooling operator (NHWC, F32) with a batch size, input spatial size, and data pointers.
enum xnn_status xnn_setup_max_pooling2d_nhwc_f32(
  xnn_operator_t max_pooling_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional elementwise Maximum operator for F32 data.
enum xnn_status xnn_create_maximum_nd_f32(
  uint32_t flags,
  xnn_operator_t* maximum_op_out);

/// Set up an N-dimensional Maximum operator (F32) with two input shapes and data pointers.
enum xnn_status xnn_setup_maximum_nd_f32(
  xnn_operator_t maximum_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const float* input1,
  const float* input2,
  float* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional elementwise Minimum operator for F32 data.
enum xnn_status xnn_create_minimum_nd_f32(
  uint32_t flags,
  xnn_operator_t* minimum_op_out);

/// Set up an N-dimensional Minimum operator (F32) with two input shapes and data pointers.
enum xnn_status xnn_setup_minimum_nd_f32(
  xnn_operator_t minimum_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const float* input1,
  const float* input2,
  float* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional elementwise Multiply operator for F32 data, with output clamping.
enum xnn_status xnn_create_multiply_nd_f32(
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* multiply_op_out);

/// Set up an N-dimensional Multiply operator (F32) with two input shapes and data pointers.
enum xnn_status xnn_setup_multiply_nd_f32(
  xnn_operator_t multiply_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const float* input1,
  const float* input2,
  float* output,
  pthreadpool_t threadpool);

/// Create a Negate operator with NC layout and F32 data type.
enum xnn_status xnn_create_negate_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* negate_op_out);

/// Set up a Negate operator (NC, F32) with a batch size and data pointers.
enum xnn_status xnn_setup_negate_nc_f32(
  xnn_operator_t negate_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a PReLU operator with NC layout and F32 data type; negative_slope is a per-channel array.
enum xnn_status xnn_create_prelu_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  const float* negative_slope,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* prelu_op_out);

/// Set up a PReLU operator (NC, F32) with a batch size and data pointers.
enum xnn_status xnn_setup_prelu_nc_f32(
  xnn_operator_t prelu_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a 2D Bilinear Resize operator with NCHW layout and F32 data type.
enum xnn_status xnn_create_resize_bilinear2d_nchw_f32(
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  uint32_t flags,
  xnn_operator_t* resize_op_out);

/// Set up a 2D Bilinear Resize operator (NCHW, F32) with input/output spatial sizes and data pointers.
enum xnn_status xnn_setup_resize_bilinear2d_nchw_f32(
  xnn_operator_t resize_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a 2D Bilinear Resize operator with NHWC layout and F32 data type.
enum xnn_status xnn_create_resize_bilinear2d_nhwc_f32(
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  uint32_t flags,
  xnn_operator_t* resize_op_out);

/// Set up a 2D Bilinear Resize operator (NHWC, F32) with input/output spatial sizes and data pointers.
enum xnn_status xnn_setup_resize_bilinear2d_nhwc_f32(
  xnn_operator_t resize_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a Sigmoid operator with NC layout and F32 data type.
enum xnn_status xnn_create_sigmoid_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* sigmoid_op_out);

/// Set up a Sigmoid operator (NC, F32) with a batch size and data pointers.
enum xnn_status xnn_setup_sigmoid_nc_f32(
  xnn_operator_t sigmoid_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a Softmax operator with NC layout and F32 data type.
enum xnn_status xnn_create_softmax_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* softmax_op_out);

/// Set up a Softmax operator (NC, F32) with a batch size and data pointers.
enum xnn_status xnn_setup_softmax_nc_f32(
  xnn_operator_t softmax_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a Square operator with NC layout and F32 data type.
enum xnn_status xnn_create_square_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* square_op_out);

/// Set up a Square operator (NC, F32) with a batch size and data pointers.
enum xnn_status xnn_setup_square_nc_f32(
  xnn_operator_t square_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a Square Root operator with NC layout and F32 data type.
enum xnn_status xnn_create_square_root_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* sqrt_op_out);

/// Set up a Square Root operator (NC, F32) with a batch size and data pointers.
enum xnn_status xnn_setup_square_root_nc_f32(
  xnn_operator_t sqrt_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional Squared Difference operator for F32 data.
enum xnn_status xnn_create_squared_difference_nd_f32(
  uint32_t flags,
  xnn_operator_t* squared_difference_op_out);

/// Set up an N-dimensional Squared Difference operator (F32) with two input shapes and data pointers.
enum xnn_status xnn_setup_squared_difference_nd_f32(
  xnn_operator_t squared_difference_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const float* input1,
  const float* input2,
  float* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional elementwise Subtract operator for F32 data, with output clamping.
enum xnn_status xnn_create_subtract_nd_f32(
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* subtract_op_out);

/// Set up an N-dimensional Subtract operator (F32) with two input shapes and data pointers.
enum xnn_status xnn_setup_subtract_nd_f32(
  xnn_operator_t subtract_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const float* input1,
  const float* input2,
  float* output,
  pthreadpool_t threadpool);

/// Create a Truncation (round-toward-zero) operator with NC layout and F32 data type.
enum xnn_status xnn_create_truncation_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* truncation_op_out);

/// Set up a Truncation operator (NC, F32) with a batch size and data pointers.
enum xnn_status xnn_setup_truncation_nc_f32(
  xnn_operator_t truncation_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);
2122
2123#ifndef XNN_NO_NCHW_OPERATORS
2124
/// Create a 2D Convolution operator with NCHW layout and F32 data type.
enum xnn_status xnn_create_convolution2d_nchw_f32(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t subsampling_height,
  uint32_t subsampling_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  const float* kernel,
  const float* bias,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* convolution_op_out);

/// Set up a 2D Convolution operator (NCHW, F32) with a batch size, input spatial size, and data pointers.
enum xnn_status xnn_setup_convolution2d_nchw_f32(
  xnn_operator_t convolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a Global Average Pooling operator with NCW layout and F32 data type.
enum xnn_status xnn_create_global_average_pooling_ncw_f32(
  size_t channels,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* global_average_pooling_op_out);

/// Set up a Global Average Pooling operator (NCW, F32) with a batch size, width, and data pointers.
enum xnn_status xnn_setup_global_average_pooling_ncw_f32(
  xnn_operator_t global_average_pooling_op,
  size_t batch_size,
  size_t width,
  const float* input,
  float* output,
  pthreadpool_t threadpool);
2172
2173#endif // XNN_NO_NCHW_OPERATORS
2174
2175#endif // XNN_NO_F32_OPERATORS
2176
2177#ifndef XNN_NO_X32_OPERATORS
2178
/// Create a Channel Shuffle operator with NC layout, operating on 32-bit elements of any type (X32).
enum xnn_status xnn_create_channel_shuffle_nc_x32(
  size_t groups,
  size_t group_channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* channel_shuffle_op_out);

/// Set up a Channel Shuffle operator (NC, X32) with a batch size and data pointers.
enum xnn_status xnn_setup_channel_shuffle_nc_x32(
  xnn_operator_t channel_shuffle_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional Constant Pad operator for 32-bit data; padding_value points to the fill element.
enum xnn_status xnn_create_constant_pad_nd_x32(
  const void* padding_value,
  uint32_t flags,
  xnn_operator_t* constant_pad_op_out);

/// Set up a Constant Pad operator (ND, X32) with an input shape, per-dimension pre/post padding, and data pointers.
enum xnn_status xnn_setup_constant_pad_nd_x32(
  xnn_operator_t constant_pad_op,
  size_t num_dims,
  const size_t* input_shape,
  const size_t* pre_padding,
  const size_t* post_padding,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Copy operator with NC layout, operating on 32-bit elements of any type (X32).
enum xnn_status xnn_create_copy_nc_x32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* copy_op_out);

/// Set up a Copy operator (NC, X32) with a batch size and data pointers.
enum xnn_status xnn_setup_copy_nc_x32(
  xnn_operator_t copy_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Depth-to-Space operator with NHWC layout for 32-bit data.
enum xnn_status xnn_create_depth_to_space_nhwc_x32(
  size_t output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  uint32_t block_size,
  uint32_t flags,
  xnn_operator_t* depth_to_space_op_out);

/// Set up a Depth-to-Space operator (NHWC, X32) with a batch size, input spatial size, and data pointers.
enum xnn_status xnn_setup_depth_to_space_nhwc_x32(
  xnn_operator_t depth_to_space_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Depth-to-Space operator converting NCHW input to NHWC output, for 32-bit data.
enum xnn_status xnn_create_depth_to_space_nchw2nhwc_x32(
  size_t output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  uint32_t block_size,
  uint32_t flags,
  xnn_operator_t* depth_to_space_op_out);

/// Set up a Depth-to-Space (NCHW-to-NHWC, X32) operator with a batch size, input spatial size, and data pointers.
enum xnn_status xnn_setup_depth_to_space_nchw2nhwc_x32(
  xnn_operator_t depth_to_space_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Space-to-Depth operator with NHWC layout for 32-bit data.
enum xnn_status xnn_create_space_to_depth_nhwc_x32(
  size_t input_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  uint32_t block_size,
  uint32_t flags,
  xnn_operator_t* space_to_depth_op_out);

/// Set up a Space-to-Depth operator (NHWC, X32) with a batch size, input spatial size, and data pointers.
enum xnn_status xnn_setup_space_to_depth_nhwc_x32(
  xnn_operator_t space_to_depth_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional Transpose operator for 32-bit data.
enum xnn_status xnn_create_transpose_nd_x32(
  uint32_t flags,
  xnn_operator_t* transpose_op_out);

/// Set up a Transpose operator (ND, X32) with an input shape, output permutation, and data pointers.
enum xnn_status xnn_setup_transpose_nd_x32(
  xnn_operator_t transpose_op,
  const void* input,
  void* output,
  const size_t num_dims,
  const size_t* input_shape,
  const size_t* output_perm,
  pthreadpool_t threadpool);

/// Transpose N-dimensional 32-bit data in a single call (no separate create/setup of an operator object).
enum xnn_status xnn_run_transpose_nd_x32(
  const void* input,
  void* output,
  const size_t num_dims,
  const size_t* input_shape,
  const size_t* output_perm,
  uint32_t flags,
  pthreadpool_t threadpool);

/// Create a 2D Unpooling operator with NHWC layout for 32-bit data.
enum xnn_status xnn_create_unpooling2d_nhwc_x32(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  uint32_t flags,
  xnn_operator_t* unpooling_op_out);

/// Set up a 2D Unpooling operator (NHWC, X32); index holds the per-element pooling indices.
enum xnn_status xnn_setup_unpooling2d_nhwc_x32(
  xnn_operator_t unpooling_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  const uint32_t* index,
  void* output,
  pthreadpool_t threadpool);
2318
2319#endif // XNN_NO_X32_OPERATORS
2320
2321#ifndef XNN_NO_F16_OPERATORS
2322
/// Create an Abs operator with NC layout and F16 data type (half-precision values passed via void pointers).
enum xnn_status xnn_create_abs_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* abs_op_out);

/// Set up an Abs operator (NC, F16) with a batch size and data pointers.
enum xnn_status xnn_setup_abs_nc_f16(
  xnn_operator_t abs_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional elementwise Add operator for F16 data, with output clamping.
enum xnn_status xnn_create_add_nd_f16(
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* add_op_out);

/// Set up an N-dimensional Add operator (F16) with two input shapes and data pointers.
enum xnn_status xnn_setup_add_nd_f16(
  xnn_operator_t add_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const void* input1,
  const void* input2,
  void* output,
  pthreadpool_t threadpool);

/// Create a 2D Average Pooling operator with NHWC layout and F16 data type.
enum xnn_status xnn_create_average_pooling2d_nhwc_f16(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  uint32_t stride_height,
  uint32_t stride_width,
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* average_pooling_op_out);

/// Set up a 2D Average Pooling operator (NHWC, F16) with a batch size, input spatial size, and data pointers.
enum xnn_status xnn_setup_average_pooling2d_nhwc_f16(
  xnn_operator_t average_pooling_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Bankers' Rounding (round-half-to-even) operator with NC layout and F16 data type.
enum xnn_status xnn_create_bankers_rounding_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* rounding_op_out);

/// Set up a Bankers' Rounding operator (NC, F16) with a batch size and data pointers.
enum xnn_status xnn_setup_bankers_rounding_nc_f16(
  xnn_operator_t rounding_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Ceiling operator with NC layout and F16 data type.
enum xnn_status xnn_create_ceiling_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* ceiling_op_out);

/// Set up a Ceiling operator (NC, F16) with a batch size and data pointers.
enum xnn_status xnn_setup_ceiling_nc_f16(
  xnn_operator_t ceiling_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Clamp operator with NC layout and F16 data type; clamps each element to [output_min, output_max].
enum xnn_status xnn_create_clamp_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* clamp_op_out);

/// Set up a Clamp operator (NC, F16) with a batch size and data pointers.
enum xnn_status xnn_setup_clamp_nc_f16(
  xnn_operator_t clamp_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a 2D Convolution operator with NHWC layout and F16 data type.
enum xnn_status xnn_create_convolution2d_nhwc_f16(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t subsampling_height,
  uint32_t subsampling_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  const void* kernel,
  const void* bias,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* convolution_op_out);

/// Set up a 2D Convolution operator (NHWC, F16) with a batch size, input spatial size, and data pointers.
enum xnn_status xnn_setup_convolution2d_nhwc_f16(
  xnn_operator_t convolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a 2D Deconvolution (transposed convolution) operator with NHWC layout and F16 data type.
enum xnn_status xnn_create_deconvolution2d_nhwc_f16(
  uint32_t output_padding_top,
  uint32_t output_padding_right,
  uint32_t output_padding_bottom,
  uint32_t output_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t stride_height,
  uint32_t stride_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  const void* kernel,
  const void* bias,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* deconvolution_op_out);

/// Set up a 2D Deconvolution operator (NHWC, F16) with a batch size, input size, output adjustments, and data pointers.
enum xnn_status xnn_setup_deconvolution2d_nhwc_f16(
  xnn_operator_t deconvolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  uint32_t adjustment_height,
  uint32_t adjustment_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional elementwise Divide operator for F16 data, with output clamping.
enum xnn_status xnn_create_divide_nd_f16(
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* divide_op_out);

/// Set up an N-dimensional Divide operator (F16) with two input shapes and data pointers.
enum xnn_status xnn_setup_divide_nd_f16(
  xnn_operator_t divide_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const void* input1,
  const void* input2,
  void* output,
  pthreadpool_t threadpool);

/// Create an ELU operator with NC layout and F16 data type; alpha scales the negative branch.
enum xnn_status xnn_create_elu_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float alpha,
  uint32_t flags,
  xnn_operator_t* elu_op_out);

/// Set up an ELU operator (NC, F16) with a batch size and data pointers.
enum xnn_status xnn_setup_elu_nc_f16(
  xnn_operator_t elu_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Floor operator with NC layout and F16 data type.
enum xnn_status xnn_create_floor_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* floor_op_out);

/// Set up a Floor operator (NC, F16) with a batch size and data pointers.
enum xnn_status xnn_setup_floor_nc_f16(
  xnn_operator_t floor_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Fully Connected operator with NC layout and F16 data type.
enum xnn_status xnn_create_fully_connected_nc_f16(
  size_t input_channels,
  size_t output_channels,
  size_t input_stride,
  size_t output_stride,
  const void* kernel,
  const void* bias,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* fully_connected_op_out);

/// Set up a Fully Connected operator (NC, F16) with a batch size and data pointers.
enum xnn_status xnn_setup_fully_connected_nc_f16(
  xnn_operator_t fully_connected_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Global Average Pooling operator with NWC layout and F16 data type.
enum xnn_status xnn_create_global_average_pooling_nwc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* global_average_pooling_op_out);

/// Set up a Global Average Pooling operator (NWC, F16) with a batch size, width, and data pointers.
enum xnn_status xnn_setup_global_average_pooling_nwc_f16(
  xnn_operator_t global_average_pooling_op,
  size_t batch_size,
  size_t width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a HardSwish operator with NC layout and F16 data type.
enum xnn_status xnn_create_hardswish_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* hardswish_op_out);

/// Set up a HardSwish operator (NC, F16) with a batch size and data pointers.
enum xnn_status xnn_setup_hardswish_nc_f16(
  xnn_operator_t hardswish_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Leaky ReLU operator with NC layout and F16 data type.
enum xnn_status xnn_create_leaky_relu_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float negative_slope,
  uint32_t flags,
  xnn_operator_t* leaky_relu_op_out);

/// Set up a Leaky ReLU operator (NC, F16) with a batch size and data pointers.
enum xnn_status xnn_setup_leaky_relu_nc_f16(
  xnn_operator_t leaky_relu_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a 2D Max Pooling operator with NHWC layout and F16 data type.
enum xnn_status xnn_create_max_pooling2d_nhwc_f16(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  uint32_t stride_height,
  uint32_t stride_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* max_pooling_op_out);

/// Set up a 2D Max Pooling operator (NHWC, F16) with a batch size, input spatial size, and data pointers.
enum xnn_status xnn_setup_max_pooling2d_nhwc_f16(
  xnn_operator_t max_pooling_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional elementwise Maximum operator for F16 data.
enum xnn_status xnn_create_maximum_nd_f16(
  uint32_t flags,
  xnn_operator_t* maximum_op_out);

/// Set up an N-dimensional Maximum operator (F16) with two input shapes and data pointers.
enum xnn_status xnn_setup_maximum_nd_f16(
  xnn_operator_t maximum_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const void* input1,
  const void* input2,
  void* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional elementwise Minimum operator for F16 data.
enum xnn_status xnn_create_minimum_nd_f16(
  uint32_t flags,
  xnn_operator_t* minimum_op_out);

/// Set up an N-dimensional Minimum operator (F16) with two input shapes and data pointers.
enum xnn_status xnn_setup_minimum_nd_f16(
  xnn_operator_t minimum_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const void* input1,
  const void* input2,
  void* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional elementwise Multiply operator for F16 data, with output clamping.
enum xnn_status xnn_create_multiply_nd_f16(
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* multiply_op_out);

/// Set up an N-dimensional Multiply operator (F16) with two input shapes and data pointers.
enum xnn_status xnn_setup_multiply_nd_f16(
  xnn_operator_t multiply_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const void* input1,
  const void* input2,
  void* output,
  pthreadpool_t threadpool);

/// Create a Negate operator with NC layout and F16 data type.
enum xnn_status xnn_create_negate_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* negate_op_out);

/// Set up a Negate operator (NC, F16) with a batch size and data pointers.
enum xnn_status xnn_setup_negate_nc_f16(
  xnn_operator_t negate_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a PReLU operator with NC layout and F16 data type; negative_slope is a per-channel array.
enum xnn_status xnn_create_prelu_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  const void* negative_slope,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* prelu_op_out);

/// Set up a PReLU operator (NC, F16) with a batch size and data pointers.
enum xnn_status xnn_setup_prelu_nc_f16(
  xnn_operator_t prelu_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a 2D Bilinear Resize operator with NHWC layout and F16 data type.
enum xnn_status xnn_create_resize_bilinear2d_nhwc_f16(
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  uint32_t flags,
  xnn_operator_t* resize_op_out);

/// Set up a 2D Bilinear Resize operator (NHWC, F16) with input/output spatial sizes and data pointers.
enum xnn_status xnn_setup_resize_bilinear2d_nhwc_f16(
  xnn_operator_t resize_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Sigmoid operator with NC layout and F16 data type.
enum xnn_status xnn_create_sigmoid_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* sigmoid_op_out);

/// Set up a Sigmoid operator (NC, F16) with a batch size and data pointers.
enum xnn_status xnn_setup_sigmoid_nc_f16(
  xnn_operator_t sigmoid_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Softmax operator with NC layout and F16 data type.
enum xnn_status xnn_create_softmax_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* softmax_op_out);

/// Set up a Softmax operator (NC, F16) with a batch size and data pointers.
enum xnn_status xnn_setup_softmax_nc_f16(
  xnn_operator_t softmax_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Square operator with NC layout and F16 data type.
enum xnn_status xnn_create_square_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* square_op_out);

/// Set up a Square operator (NC, F16) with a batch size and data pointers.
enum xnn_status xnn_setup_square_nc_f16(
  xnn_operator_t square_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Square Root operator with NC layout and F16 data type.
enum xnn_status xnn_create_square_root_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* sqrt_op_out);

/// Set up a Square Root operator (NC, F16) with a batch size and data pointers.
enum xnn_status xnn_setup_square_root_nc_f16(
  xnn_operator_t sqrt_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional Squared Difference operator for F16 data.
enum xnn_status xnn_create_squared_difference_nd_f16(
  uint32_t flags,
  xnn_operator_t* squared_difference_op_out);

/// Set up an N-dimensional Squared Difference operator (F16) with two input shapes and data pointers.
enum xnn_status xnn_setup_squared_difference_nd_f16(
  xnn_operator_t squared_difference_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const void* input1,
  const void* input2,
  void* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional elementwise Subtract operator for F16 data, with output clamping.
enum xnn_status xnn_create_subtract_nd_f16(
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* subtract_op_out);

/// Set up an N-dimensional Subtract operator (F16) with two input shapes and data pointers.
enum xnn_status xnn_setup_subtract_nd_f16(
  xnn_operator_t subtract_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const void* input1,
  const void* input2,
  void* output,
  pthreadpool_t threadpool);

/// Create a Truncation (round-toward-zero) operator with NC layout and F16 data type.
enum xnn_status xnn_create_truncation_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* truncation_op_out);

/// Set up a Truncation operator (NC, F16) with a batch size and data pointers.
enum xnn_status xnn_setup_truncation_nc_f16(
  xnn_operator_t truncation_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);
2828
2829#endif // XNN_NO_F16_OPERATORS
2830
2831#ifndef XNN_NO_X16_OPERATORS
2832
/// Create an N-dimensional Constant Pad operator for 16-bit data (X16); padding_value points to the fill element.
enum xnn_status xnn_create_constant_pad_nd_x16(
  const void* padding_value,
  uint32_t flags,
  xnn_operator_t* constant_pad_op_out);

/// Set up a Constant Pad operator (ND, X16) with an input shape, per-dimension pre/post padding, and data pointers.
enum xnn_status xnn_setup_constant_pad_nd_x16(
  xnn_operator_t constant_pad_op,
  size_t num_dims,
  const size_t* input_shape,
  const size_t* pre_padding,
  const size_t* post_padding,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Copy operator with NC layout, operating on 16-bit elements of any type (X16).
enum xnn_status xnn_create_copy_nc_x16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* copy_op_out);

/// Set up a Copy operator (NC, X16) with a batch size and data pointers.
enum xnn_status xnn_setup_copy_nc_x16(
  xnn_operator_t copy_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Depth-to-Space operator with NHWC layout for 16-bit data.
enum xnn_status xnn_create_depth_to_space_nhwc_x16(
  size_t output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  uint32_t block_size,
  uint32_t flags,
  xnn_operator_t* depth_to_space_op_out);

/// Set up a Depth-to-Space operator (NHWC, X16) with a batch size, input spatial size, and data pointers.
enum xnn_status xnn_setup_depth_to_space_nhwc_x16(
  xnn_operator_t depth_to_space_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Space-to-Depth operator with NHWC layout for 16-bit data.
enum xnn_status xnn_create_space_to_depth_nhwc_x16(
  size_t input_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  uint32_t block_size,
  uint32_t flags,
  xnn_operator_t* space_to_depth_op_out);

/// Set up a Space-to-Depth operator (NHWC, X16) with a batch size, input spatial size, and data pointers.
enum xnn_status xnn_setup_space_to_depth_nhwc_x16(
  xnn_operator_t space_to_depth_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional Transpose operator for 16-bit data.
enum xnn_status xnn_create_transpose_nd_x16(
  uint32_t flags,
  xnn_operator_t* transpose_op_out);

/// Set up a Transpose operator (ND, X16) with an input shape, output permutation, and data pointers.
enum xnn_status xnn_setup_transpose_nd_x16(
  xnn_operator_t transpose_op,
  const void* input,
  void* output,
  const size_t num_dims,
  const size_t* input_shape,
  const size_t* output_perm,
  pthreadpool_t threadpool);

/// Transpose N-dimensional 16-bit data in a single call (no separate create/setup of an operator object).
enum xnn_status xnn_run_transpose_nd_x16(
  const void* input,
  void* output,
  const size_t num_dims,
  const size_t* input_shape,
  const size_t* output_perm,
  uint32_t flags,
  pthreadpool_t threadpool);
2917
2918#endif // XNN_NO_X16_OPERATORS
2919
2920#ifndef XNN_NO_QC8_OPERATORS
2921
/// Create a 2D Convolution operator with NHWC layout and per-channel-quantized int8 (QC8) data;
/// kernel_scale points to a per-output-channel array of scales.
enum xnn_status xnn_create_convolution2d_nhwc_qc8(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t subsampling_height,
  uint32_t subsampling_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  int8_t input_zero_point,
  float input_scale,
  const float* kernel_scale,
  const int8_t* kernel,
  const int32_t* bias,
  int8_t output_zero_point,
  float output_scale,
  int8_t output_min,
  int8_t output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* convolution_op_out);

/// Set up a 2D Convolution operator (NHWC, QC8) with a batch size, input spatial size, and data pointers.
enum xnn_status xnn_setup_convolution2d_nhwc_qc8(
  xnn_operator_t convolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const int8_t* input,
  int8_t* output,
  pthreadpool_t threadpool);
2959
2960#endif // XNN_NO_QC8_OPERATORS
2961
2962#ifndef XNN_NO_QS8_OPERATORS
2963
/// Create an N-dimensional elementwise Add operator for signed 8-bit quantized (QS8) data,
/// with per-tensor zero points/scales and int8 output clamping.
enum xnn_status xnn_create_add_nd_qs8(
  int8_t input1_zero_point,
  float input1_scale,
  int8_t input2_zero_point,
  float input2_scale,
  int8_t output_zero_point,
  float output_scale,
  int8_t output_min,
  int8_t output_max,
  uint32_t flags,
  xnn_operator_t* add_op_out);

/// Set up an N-dimensional Add operator (QS8) with two input shapes and data pointers.
enum xnn_status xnn_setup_add_nd_qs8(
  xnn_operator_t add_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const int8_t* input1,
  const int8_t* input2,
  int8_t* output,
  pthreadpool_t threadpool);

/// Create a 2D Convolution operator with NHWC layout and QS8 data; kernel_scale is a single
/// per-tensor scale (contrast with the per-channel array in the QC8 variant).
enum xnn_status xnn_create_convolution2d_nhwc_qs8(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t subsampling_height,
  uint32_t subsampling_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  int8_t input_zero_point,
  float input_scale,
  float kernel_scale,
  const int8_t* kernel,
  const int32_t* bias,
  int8_t output_zero_point,
  float output_scale,
  int8_t output_min,
  int8_t output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* convolution_op_out);

/// Set up a 2D Convolution operator (NHWC, QS8) with a batch size, input spatial size, and data pointers.
enum xnn_status xnn_setup_convolution2d_nhwc_qs8(
  xnn_operator_t convolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const int8_t* input,
  int8_t* output,
  pthreadpool_t threadpool);

/// Create a 2D Deconvolution (transposed convolution) operator with NHWC layout and QS8 data.
enum xnn_status xnn_create_deconvolution2d_nhwc_qs8(
  uint32_t output_padding_top,
  uint32_t output_padding_right,
  uint32_t output_padding_bottom,
  uint32_t output_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t stride_height,
  uint32_t stride_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  int8_t input_zero_point,
  float input_scale,
  float kernel_scale,
  const int8_t* kernel,
  const int32_t* bias,
  int8_t output_zero_point,
  float output_scale,
  int8_t output_min,
  int8_t output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* deconvolution_op_out);

/// Set up a 2D Deconvolution operator (NHWC, QS8) with a batch size, input size, output adjustments, and data pointers.
enum xnn_status xnn_setup_deconvolution2d_nhwc_qs8(
  xnn_operator_t deconvolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  uint32_t adjustment_height,
  uint32_t adjustment_width,
  const int8_t* input,
  int8_t* output,
  pthreadpool_t threadpool);

/// Create an ELU operator with NC layout and QS8 data; alpha scales the negative branch.
enum xnn_status xnn_create_elu_nc_qs8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float alpha,
  int8_t input_zero_point,
  float input_scale,
  int8_t output_zero_point,
  float output_scale,
  int8_t output_min,
  int8_t output_max,
  uint32_t flags,
  xnn_operator_t* elu_op_out);

/// Set up an ELU operator (NC, QS8) with a batch size and data pointers.
enum xnn_status xnn_setup_elu_nc_qs8(
  xnn_operator_t elu_op,
  size_t batch_size,
  const int8_t* input,
  int8_t* output,
  pthreadpool_t threadpool);

/// Create a Fully Connected operator with NC layout and QS8 data.
enum xnn_status xnn_create_fully_connected_nc_qs8(
  size_t input_channels,
  size_t output_channels,
  size_t input_stride,
  size_t output_stride,
  int8_t input_zero_point,
  float input_scale,
  float kernel_scale,
  const int8_t* kernel,
  const int32_t* bias,
  int8_t output_zero_point,
  float output_scale,
  int8_t output_min,
  int8_t output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* fully_connected_op_out);

/// Set up a Fully Connected operator (NC, QS8) with a batch size and data pointers.
enum xnn_status xnn_setup_fully_connected_nc_qs8(
  xnn_operator_t fully_connected_op,
  size_t batch_size,
  const int8_t* input,
  int8_t* output,
  pthreadpool_t threadpool);
3110
3111enum xnn_status xnn_create_global_average_pooling_nwc_qs8(
3112 size_t channels,
3113 size_t input_stride,
3114 size_t output_stride,
3115 int8_t input_zero_point,
3116 float input_scale,
3117 int8_t output_zero_point,
3118 float output_scale,
3119 int8_t output_min,
3120 int8_t output_max,
3121 uint32_t flags,
3122 xnn_operator_t* global_average_pooling_op_out);
3123
3124enum xnn_status xnn_setup_global_average_pooling_nwc_qs8(
3125 xnn_operator_t global_average_pooling_op,
3126 size_t batch_size,
3127 size_t width,
3128 const int8_t* input,
3129 int8_t* output,
3130 pthreadpool_t threadpool);
3131
3132enum xnn_status xnn_create_multiply_nd_qs8(
3133 int8_t input1_zero_point,
3134 float input1_scale,
3135 int8_t input2_zero_point,
3136 float input2_scale,
3137 int8_t output_zero_point,
3138 float output_scale,
3139 int8_t output_min,
3140 int8_t output_max,
3141 uint32_t flags,
3142 xnn_operator_t* multiply_op_out);
3143
3144enum xnn_status xnn_setup_multiply_nd_qs8(
3145 xnn_operator_t multiply_op,
3146 size_t num_input1_dims,
3147 const size_t* input1_shape,
3148 size_t num_input2_dims,
3149 const size_t* input2_shape,
3150 const int8_t* input1,
3151 const int8_t* input2,
3152 int8_t* output,
3153 pthreadpool_t threadpool);
3154
3155enum xnn_status xnn_create_leaky_relu_nc_qs8(
3156 size_t channels,
3157 size_t input_stride,
3158 size_t output_stride,
3159 float negative_slope,
3160 int8_t input_zero_point,
3161 float input_scale,
3162 int8_t output_zero_point,
3163 float output_scale,
3164 uint32_t flags,
3165 xnn_operator_t* leaky_relu_op_out);
3166
3167enum xnn_status xnn_setup_leaky_relu_nc_qs8(
3168 xnn_operator_t leaky_relu_op,
3169 size_t batch_size,
3170 const int8_t* input,
3171 int8_t* output,
3172 pthreadpool_t threadpool);
3173
3174enum xnn_status xnn_create_sigmoid_nc_qs8(
3175 size_t channels,
3176 size_t input_stride,
3177 size_t output_stride,
3178 int8_t input_zero_point,
3179 float input_scale,
3180 int8_t output_zero_point,
3181 float output_scale,
3182 int8_t output_min,
3183 int8_t output_max,
3184 uint32_t flags,
3185 xnn_operator_t* sigmoid_op_out);
3186
3187enum xnn_status xnn_setup_sigmoid_nc_qs8(
3188 xnn_operator_t sigmoid_op,
3189 size_t batch_size,
3190 const int8_t* input,
3191 int8_t* output,
3192 pthreadpool_t threadpool);
3193
3194enum xnn_status xnn_create_subtract_nd_qs8(
3195 int8_t input1_zero_point,
3196 float input1_scale,
3197 int8_t input2_zero_point,
3198 float input2_scale,
3199 int8_t output_zero_point,
3200 float output_scale,
3201 int8_t output_min,
3202 int8_t output_max,
3203 uint32_t flags,
3204 xnn_operator_t* subtract_op_out);
3205
3206enum xnn_status xnn_setup_subtract_nd_qs8(
3207 xnn_operator_t subtract_op,
3208 size_t num_input1_dims,
3209 const size_t* input1_shape,
3210 size_t num_input2_dims,
3211 const size_t* input2_shape,
3212 const int8_t* input1,
3213 const int8_t* input2,
3214 int8_t* output,
3215 pthreadpool_t threadpool);
3216
3217enum xnn_status xnn_create_tanh_nc_qs8(
3218 size_t channels,
3219 size_t input_stride,
3220 size_t output_stride,
3221 int8_t input_zero_point,
3222 float input_scale,
3223 int8_t output_zero_point,
3224 float output_scale,
3225 int8_t output_min,
3226 int8_t output_max,
3227 uint32_t flags,
3228 xnn_operator_t* tanh_op_out);
3229
3230enum xnn_status xnn_setup_tanh_nc_qs8(
3231 xnn_operator_t tanh_op,
3232 size_t batch_size,
3233 const int8_t* input,
3234 int8_t* output,
3235 pthreadpool_t threadpool);
3236
3237#endif // XNN_NO_QS8_OPERATORS
3238
3239#ifndef XNN_NO_QU8_OPERATORS
3240
/// Create an N-dimensional Add operator for QU8 (quantized unsigned 8-bit)
/// data with per-tensor quantization for both inputs and the output.
enum xnn_status xnn_create_add_nd_qu8(
  uint8_t input1_zero_point,
  float input1_scale,
  uint8_t input2_zero_point,
  float input2_scale,
  uint8_t output_zero_point,
  float output_scale,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_operator_t* add_op_out);

/// Setup an N-dimensional QU8 Add operator; input shapes are given per call.
/// NOTE(review): differing input shapes presumably broadcast -- confirm.
enum xnn_status xnn_setup_add_nd_qu8(
  xnn_operator_t add_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const uint8_t* input1,
  const uint8_t* input2,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a 2D Average Pooling operator with NHWC layout and QU8 data.
/// Requantizes from input to output parameters; output is clamped to
/// [output_min, output_max].
enum xnn_status xnn_create_average_pooling2d_nhwc_qu8(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  uint32_t stride_height,
  uint32_t stride_width,
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  uint8_t input_zero_point,
  float input_scale,
  uint8_t output_zero_point,
  float output_scale,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_operator_t* average_pooling_op_out);

/// Setup a QU8 2D Average Pooling operator with batch and spatial dimensions.
enum xnn_status xnn_setup_average_pooling2d_nhwc_qu8(
  xnn_operator_t average_pooling_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a 2D Convolution operator with NHWC layout and QU8 data.
/// Unlike the QS8 variant, the kernel carries an explicit zero point
/// (asymmetric quantization). Bias is in 32-bit accumulator precision.
enum xnn_status xnn_create_convolution2d_nhwc_qu8(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t subsampling_height,
  uint32_t subsampling_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  uint8_t input_zero_point,
  float input_scale,
  uint8_t kernel_zero_point,
  float kernel_scale,
  const uint8_t* kernel,
  const int32_t* bias,
  uint8_t output_zero_point,
  float output_scale,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* convolution_op_out);

/// Setup a QU8 2D Convolution operator with batch and input spatial sizes.
enum xnn_status xnn_setup_convolution2d_nhwc_qu8(
  xnn_operator_t convolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a 2D Deconvolution (transposed convolution) operator with NHWC
/// layout and QU8 data. Padding applies to the output; the kernel has an
/// explicit zero point.
enum xnn_status xnn_create_deconvolution2d_nhwc_qu8(
  uint32_t output_padding_top,
  uint32_t output_padding_right,
  uint32_t output_padding_bottom,
  uint32_t output_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t stride_height,
  uint32_t stride_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  uint8_t input_zero_point,
  float input_scale,
  uint8_t kernel_zero_point,
  float kernel_scale,
  const uint8_t* kernel,
  const int32_t* bias,
  uint8_t output_zero_point,
  float output_scale,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* deconvolution_op_out);

/// Setup a QU8 2D Deconvolution operator; adjustment_height/width provide the
/// output-size adjustment used by transposed convolutions.
enum xnn_status xnn_setup_deconvolution2d_nhwc_qu8(
  xnn_operator_t deconvolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  uint32_t adjustment_height,
  uint32_t adjustment_width,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a Fully Connected operator in NC layout for QU8 data with an
/// explicit kernel zero point and 32-bit integer bias.
enum xnn_status xnn_create_fully_connected_nc_qu8(
  size_t input_channels,
  size_t output_channels,
  size_t input_stride,
  size_t output_stride,
  uint8_t input_zero_point,
  float input_scale,
  uint8_t kernel_zero_point,
  float kernel_scale,
  const uint8_t* kernel,
  const int32_t* bias,
  uint8_t output_zero_point,
  float output_scale,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* fully_connected_op_out);

/// Setup a QU8 Fully Connected operator for a given batch size.
enum xnn_status xnn_setup_fully_connected_nc_qu8(
  xnn_operator_t fully_connected_op,
  size_t batch_size,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a Global Average Pooling operator in NWC layout for QU8 data.
enum xnn_status xnn_create_global_average_pooling_nwc_qu8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint8_t input_zero_point,
  float input_scale,
  uint8_t output_zero_point,
  float output_scale,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_operator_t* global_average_pooling_op_out);

/// Setup a QU8 Global Average Pooling operator with batch size and width.
enum xnn_status xnn_setup_global_average_pooling_nwc_qu8(
  xnn_operator_t global_average_pooling_op,
  size_t batch_size,
  size_t width,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a Leaky ReLU operator in NC layout for QU8 data. As with the QS8
/// variant, there are no output clamping parameters.
enum xnn_status xnn_create_leaky_relu_nc_qu8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float negative_slope,
  uint8_t input_zero_point,
  float input_scale,
  uint8_t output_zero_point,
  float output_scale,
  uint32_t flags,
  xnn_operator_t* leaky_relu_op_out);

/// Setup a QU8 Leaky ReLU operator for a given batch size.
enum xnn_status xnn_setup_leaky_relu_nc_qu8(
  xnn_operator_t leaky_relu_op,
  size_t batch_size,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional Multiply operator for QU8 data with per-tensor
/// quantization for both inputs and the output.
enum xnn_status xnn_create_multiply_nd_qu8(
  uint8_t input1_zero_point,
  float input1_scale,
  uint8_t input2_zero_point,
  float input2_scale,
  uint8_t output_zero_point,
  float output_scale,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_operator_t* multiply_op_out);

/// Setup an N-dimensional QU8 Multiply operator.
/// NOTE(review): differing input shapes presumably broadcast -- confirm.
enum xnn_status xnn_setup_multiply_nd_qu8(
  xnn_operator_t multiply_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const uint8_t* input1,
  const uint8_t* input2,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a Sigmoid operator in NC layout for QU8 data.
enum xnn_status xnn_create_sigmoid_nc_qu8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint8_t input_zero_point,
  float input_scale,
  uint8_t output_zero_point,
  float output_scale,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_operator_t* sigmoid_op_out);

/// Setup a QU8 Sigmoid operator for a given batch size.
enum xnn_status xnn_setup_sigmoid_nc_qu8(
  xnn_operator_t sigmoid_op,
  size_t batch_size,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a SoftMax operator in NC layout for QU8 data. No input zero point
/// parameter is taken (SoftMax is shift-invariant, so only input_scale
/// matters) and no output clamping parameters are present.
enum xnn_status xnn_create_softmax_nc_qu8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float input_scale,
  uint8_t output_zero_point,
  float output_scale,
  uint32_t flags,
  xnn_operator_t* softmax_op_out);

/// Setup a QU8 SoftMax operator for a given batch size.
enum xnn_status xnn_setup_softmax_nc_qu8(
  xnn_operator_t softmax_op,
  size_t batch_size,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional Subtract operator for QU8 data with per-tensor
/// quantization for both inputs and the output.
enum xnn_status xnn_create_subtract_nd_qu8(
  uint8_t input1_zero_point,
  float input1_scale,
  uint8_t input2_zero_point,
  float input2_scale,
  uint8_t output_zero_point,
  float output_scale,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_operator_t* subtract_op_out);

/// Setup an N-dimensional QU8 Subtract operator (output = input1 - input2).
/// NOTE(review): differing input shapes presumably broadcast -- confirm.
enum xnn_status xnn_setup_subtract_nd_qu8(
  xnn_operator_t subtract_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const uint8_t* input1,
  const uint8_t* input2,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a TanH operator in NC layout for QU8 data.
enum xnn_status xnn_create_tanh_nc_qu8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint8_t input_zero_point,
  float input_scale,
  uint8_t output_zero_point,
  float output_scale,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_operator_t* tanh_op_out);

/// Setup a QU8 TanH operator for a given batch size.
enum xnn_status xnn_setup_tanh_nc_qu8(
  xnn_operator_t tanh_op,
  size_t batch_size,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);
3542
3543#endif // XNN_NO_QU8_OPERATORS
3544
3545#ifndef XNN_NO_S8_OPERATORS
3546
/// Create a Clamp operator in NC layout for S8 (signed 8-bit, no quantization
/// parameters) data: clamps each element to [output_min, output_max].
enum xnn_status xnn_create_clamp_nc_s8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  int8_t output_min,
  int8_t output_max,
  uint32_t flags,
  xnn_operator_t* clamp_op_out);

/// Setup an S8 Clamp operator for a given batch size.
enum xnn_status xnn_setup_clamp_nc_s8(
  xnn_operator_t clamp_op,
  size_t batch_size,
  const int8_t* input,
  int8_t* output,
  pthreadpool_t threadpool);

/// Create a 2D Max Pooling operator with NHWC layout and S8 data.
/// Max pooling needs no quantization parameters (max is order-preserving);
/// output is clamped to [output_min, output_max].
enum xnn_status xnn_create_max_pooling2d_nhwc_s8(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  uint32_t stride_height,
  uint32_t stride_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  int8_t output_min,
  int8_t output_max,
  uint32_t flags,
  xnn_operator_t* max_pooling_op_out);

/// Setup an S8 2D Max Pooling operator with batch and input spatial sizes.
enum xnn_status xnn_setup_max_pooling2d_nhwc_s8(
  xnn_operator_t max_pooling_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const int8_t* input,
  int8_t* output,
  pthreadpool_t threadpool);

/// Create a 2D Bilinear Resize operator with NHWC layout and S8 data.
/// Output dimensions are supplied at setup time.
enum xnn_status xnn_create_resize_bilinear2d_nhwc_s8(
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  uint32_t flags,
  xnn_operator_t* resize_op_out);

/// Setup an S8 2D Bilinear Resize operator: bind batch size, input and output
/// spatial dimensions, and data pointers.
enum xnn_status xnn_setup_resize_bilinear2d_nhwc_s8(
  xnn_operator_t resize_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const int8_t* input,
  int8_t* output,
  pthreadpool_t threadpool);
3608
3609#endif // XNN_NO_S8_OPERATORS
3610
3611#ifndef XNN_NO_U8_OPERATORS
3612
/// Create a Clamp operator in NC layout for U8 (unsigned 8-bit, no
/// quantization parameters) data.
enum xnn_status xnn_create_clamp_nc_u8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_operator_t* clamp_op_out);

/// Setup a U8 Clamp operator for a given batch size.
enum xnn_status xnn_setup_clamp_nc_u8(
  xnn_operator_t clamp_op,
  size_t batch_size,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a 2D Max Pooling operator with NHWC layout and U8 data.
/// Output is clamped to [output_min, output_max].
enum xnn_status xnn_create_max_pooling2d_nhwc_u8(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  uint32_t stride_height,
  uint32_t stride_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_operator_t* max_pooling_op_out);

/// Setup a U8 2D Max Pooling operator with batch and input spatial sizes.
enum xnn_status xnn_setup_max_pooling2d_nhwc_u8(
  xnn_operator_t max_pooling_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a 2D Bilinear Resize operator with NHWC layout and U8 data.
enum xnn_status xnn_create_resize_bilinear2d_nhwc_u8(
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  uint32_t flags,
  xnn_operator_t* resize_op_out);

/// Setup a U8 2D Bilinear Resize operator: bind batch size, input and output
/// spatial dimensions, and data pointers.
enum xnn_status xnn_setup_resize_bilinear2d_nhwc_u8(
  xnn_operator_t resize_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);
3674
3675#endif // XNN_NO_U8_OPERATORS
3676
3677#ifndef XNN_NO_X8_OPERATORS
3678
/// Create a Copy operator in NC layout for X8 data (any 8-bit element type;
/// data is moved, not interpreted, hence the void* pointers in setup).
enum xnn_status xnn_create_copy_nc_x8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* copy_op_out);

/// Setup an X8 Copy operator for a given batch size.
enum xnn_status xnn_setup_copy_nc_x8(
  xnn_operator_t copy_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Channel Shuffle operator in NC layout for X8 data, operating on
/// groups x group_channels channels per element.
enum xnn_status xnn_create_channel_shuffle_nc_x8(
  size_t groups,
  size_t group_channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* channel_shuffle_op_out);

/// Setup an X8 Channel Shuffle operator for a given batch size.
enum xnn_status xnn_setup_channel_shuffle_nc_x8(
  xnn_operator_t channel_shuffle_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional Constant Pad operator for X8 data; padding_value
/// points to the 8-bit value used to fill padded regions.
enum xnn_status xnn_create_constant_pad_nd_x8(
  const void* padding_value,
  uint32_t flags,
  xnn_operator_t* constant_pad_op_out);

/// Setup an X8 Constant Pad operator: per-dimension pre/post padding amounts
/// are applied to input_shape (num_dims dimensions).
enum xnn_status xnn_setup_constant_pad_nd_x8(
  xnn_operator_t constant_pad_op,
  size_t num_dims,
  const size_t* input_shape,
  const size_t* pre_padding,
  const size_t* post_padding,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Depth-to-Space operator with NHWC layout and X8 data, rearranging
/// blocks of block_size x block_size from channels into spatial dimensions.
enum xnn_status xnn_create_depth_to_space_nhwc_x8(
  size_t output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  uint32_t block_size,
  uint32_t flags,
  xnn_operator_t* depth_to_space_op_out);

/// Setup an X8 Depth-to-Space operator with batch and input spatial sizes.
enum xnn_status xnn_setup_depth_to_space_nhwc_x8(
  xnn_operator_t depth_to_space_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Space-to-Depth operator with NHWC layout and X8 data (inverse
/// rearrangement of Depth-to-Space).
enum xnn_status xnn_create_space_to_depth_nhwc_x8(
  size_t input_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  uint32_t block_size,
  uint32_t flags,
  xnn_operator_t* space_to_depth_op_out);

/// Setup an X8 Space-to-Depth operator with batch and input spatial sizes.
enum xnn_status xnn_setup_space_to_depth_nhwc_x8(
  xnn_operator_t space_to_depth_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional Transpose operator for X8 data; shape and
/// permutation are supplied at setup time.
enum xnn_status xnn_create_transpose_nd_x8(
  uint32_t flags,
  xnn_operator_t* transpose_op_out);

/// Setup an X8 Transpose operator: output_perm gives the permutation of the
/// num_dims input dimensions. Note the argument order (pointers before
/// dimensions) differs from most other setup functions in this header.
enum xnn_status xnn_setup_transpose_nd_x8(
  xnn_operator_t transpose_op,
  const void* input,
  void* output,
  const size_t num_dims,
  const size_t* input_shape,
  const size_t* output_perm,
  pthreadpool_t threadpool);

/// Single-call (create + setup + run) variant of the X8 N-dimensional
/// Transpose, without constructing a persistent operator object.
enum xnn_status xnn_run_transpose_nd_x8(
  const void* input,
  void* output,
  const size_t num_dims,
  const size_t* input_shape,
  const size_t* output_perm,
  uint32_t flags,
  pthreadpool_t threadpool);
3778
3779#endif // XNN_NO_X8_OPERATORS
3780
3781#ifndef XNN_NO_CVT_OPERATORS
3782
/// Create a Convert operator in NC layout from F16 (half-precision, passed as
/// void*) to F32 data.
enum xnn_status xnn_create_convert_nc_f16_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* convert_op_out);

/// Setup an F16->F32 Convert operator for a given batch size.
enum xnn_status xnn_setup_convert_nc_f16_f32(
  xnn_operator_t convert_op,
  size_t batch_size,
  const void* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a Convert operator in NC layout from F32 to F16 (half-precision,
/// written through void*).
enum xnn_status xnn_create_convert_nc_f32_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* convert_op_out);

/// Setup an F32->F16 Convert operator for a given batch size.
enum xnn_status xnn_setup_convert_nc_f32_f16(
  xnn_operator_t convert_op,
  size_t batch_size,
  const float* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Quantize operator in NC layout from F32 to QS8, with per-tensor
/// output scale/zero point and clamping to [output_min, output_max].
enum xnn_status xnn_create_convert_nc_f32_qs8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float output_scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max,
  uint32_t flags,
  xnn_operator_t* convert_op_out);

/// Setup an F32->QS8 Convert (quantize) operator for a given batch size.
enum xnn_status xnn_setup_convert_nc_f32_qs8(
  xnn_operator_t convert_op,
  size_t batch_size,
  const float* input,
  int8_t* output,
  pthreadpool_t threadpool);

/// Create a Quantize operator in NC layout from F32 to QU8, with per-tensor
/// output scale/zero point and clamping to [output_min, output_max].
enum xnn_status xnn_create_convert_nc_f32_qu8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float output_scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_operator_t* convert_op_out);

/// Setup an F32->QU8 Convert (quantize) operator for a given batch size.
enum xnn_status xnn_setup_convert_nc_f32_qu8(
  xnn_operator_t convert_op,
  size_t batch_size,
  const float* input,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a Requantize operator in NC layout from QS8 to QS8: changes the
/// per-tensor quantization parameters (scale/zero point) without changing
/// the element type.
enum xnn_status xnn_create_convert_nc_qs8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float input_scale,
  int8_t input_zero_point,
  float output_scale,
  int8_t output_zero_point,
  uint32_t flags,
  xnn_operator_t* convert_op_out);

/// Setup a QS8->QS8 requantizing Convert operator for a given batch size.
enum xnn_status xnn_setup_convert_nc_qs8(
  xnn_operator_t convert_op,
  size_t batch_size,
  const int8_t* input,
  int8_t* output,
  pthreadpool_t threadpool);

/// Create a Dequantize operator in NC layout from QS8 to F32 using per-tensor
/// input scale/zero point.
enum xnn_status xnn_create_convert_nc_qs8_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float input_scale,
  int8_t input_zero_point,
  uint32_t flags,
  xnn_operator_t* convert_op_out);

/// Setup a QS8->F32 Convert (dequantize) operator for a given batch size.
enum xnn_status xnn_setup_convert_nc_qs8_f32(
  xnn_operator_t convert_op,
  size_t batch_size,
  const int8_t* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a Requantize operator in NC layout from QU8 to QU8: changes the
/// per-tensor quantization parameters without changing the element type.
enum xnn_status xnn_create_convert_nc_qu8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float input_scale,
  uint8_t input_zero_point,
  float output_scale,
  uint8_t output_zero_point,
  uint32_t flags,
  xnn_operator_t* convert_op_out);

/// Setup a QU8->QU8 requantizing Convert operator for a given batch size.
enum xnn_status xnn_setup_convert_nc_qu8(
  xnn_operator_t convert_op,
  size_t batch_size,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a Dequantize operator in NC layout from QU8 to F32 using per-tensor
/// input scale/zero point.
enum xnn_status xnn_create_convert_nc_qu8_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float input_scale,
  uint8_t input_zero_point,
  uint32_t flags,
  xnn_operator_t* convert_op_out);

/// Setup a QU8->F32 Convert (dequantize) operator for a given batch size.
enum xnn_status xnn_setup_convert_nc_qu8_f32(
  xnn_operator_t convert_op,
  size_t batch_size,
  const uint8_t* input,
  float* output,
  pthreadpool_t threadpool);
3914
3915#endif // XNN_NO_CVT_OPERATORS
3916
3917#ifdef __cplusplus
3918} // extern "C"
3919#endif
3920