1// Copyright (c) Facebook, Inc. and its affiliates.
2// All rights reserved.
3//
4// Copyright 2019 Google LLC
5//
6// This source code is licensed under the BSD-style license found in the
7// LICENSE file in the root directory of this source tree.
8
9#pragma once
10
11#include <stdbool.h>
12#include <stddef.h>
13#include <stdint.h>
14
15#include <pthreadpool.h>
16
17#ifdef __cplusplus
18extern "C" {
19#endif
20
21/// The number of bytes XNNPACK may read beyond array bounds.
22/// The caller must allocate at least this many extra bytes after the tensor data passed to XNNPACK.
23///
24/// Note: XNNPACK reads, but never writes beyond array bounds.
25#define XNN_EXTRA_BYTES 16
26
27/// Maximum number of dimensions in tensor shape.
28#define XNN_MAX_TENSOR_DIMS 6
29
30/// Allow sparse inference in a Runtime.
31///
32/// Note: this flag hints XNNPACK to consider sparse inference, but does not guarantee it.
33#define XNN_FLAG_SPARSE_INFERENCE 0x00000001
34#define XNN_FLAG_HINT_SPARSE_INFERENCE XNN_FLAG_SPARSE_INFERENCE
35
36/// Allow IEEE FP16 inference in a Runtime.
37///
38/// Note: this flag hints XNNPACK to consider IEEE FP16 inference, but does not guarantee it.
39#define XNN_FLAG_FP16_INFERENCE 0x00000002
40#define XNN_FLAG_HINT_FP16_INFERENCE XNN_FLAG_FP16_INFERENCE
41
42/// Force IEEE FP16 inference in a Runtime, and fail if FP16 inference is not possible.
43///
44/// Note: this flag guarantees that XNNPACK will use IEEE FP16 inference, or fail to create the Runtime object.
45/// Warning: on x86 systems FP16 computations will be emulated at a substantial performance cost.
46#define XNN_FLAG_FORCE_FP16_INFERENCE 0x00000004
47
48/// Enable timing of each operator's runtime.
49#define XNN_FLAG_BASIC_PROFILING 0x00000008
50
/// The convolution operator represents a depthwise convolution, and uses the HWGo layout for filters.
52#define XNN_FLAG_DEPTHWISE_CONVOLUTION 0x00000001
53
54/// Assume transposed weights in a fully connected operator.
55#define XNN_FLAG_TRANSPOSE_WEIGHTS 0x00000001
56
57/// The operator assumes NHWC layout for the input, regardless of the output layout.
58#define XNN_FLAG_INPUT_NHWC 0x00000002
59
60/// Match "SAME" padding in TensorFlow. Exact padding values are computed dynamically depending on input size.
61#define XNN_FLAG_TENSORFLOW_SAME_PADDING 0x00000004
62
63/// Implicitly flatten and reshape input of a Fully Connected operator into a 2D tensor.
64#define XNN_FLAG_TENSORFLOW_RESHAPE_2D 0x00000004
65
66/// Match behaviour of TensorFlow 1.x.
67#define XNN_FLAG_TENSORFLOW_LEGACY_MODE 0x00000004
68
69/// Static weights of the FP16 operator are in FP32 format.
70#define XNN_FLAG_FP32_STATIC_WEIGHTS 0x00000008
71
72/// Align corners of input and output images in resize operations.
73#define XNN_FLAG_ALIGN_CORNERS 0x00000008
74
75/// Yield worker threads of the thread pool to the system scheduler after the inference.
76#define XNN_FLAG_YIELD_WORKERS 0x00000010
77
78/// Status code for any XNNPACK function call.
/// Status code for any XNNPACK function call.
enum xnn_status {
  /// The call succeeded, and all output arguments now contain valid data.
  xnn_status_success = 0,
  /// The call failed because XNNPACK was not initialized (see xnn_initialize).
  xnn_status_uninitialized = 1,
  /// The call failed because one of the input arguments has an invalid value.
  xnn_status_invalid_parameter = 2,
  /// The call failed because an object passed to the call is in a state that does not permit the operation.
  xnn_status_invalid_state = 3,
  /// The call failed because the requested parameter combination is not supported by the implementation.
  xnn_status_unsupported_parameter = 4,
  /// The call failed because the host hardware does not support the requested operation.
  xnn_status_unsupported_hardware = 5,
  /// The call failed due to an out-of-memory condition.
  xnn_status_out_of_memory = 6,
};
89
/// User-provided memory allocation callbacks.
///
/// A pointer to this structure may be passed to xnn_initialize; when it is, XNNPACK uses these callbacks instead of
/// the system-provided memory management functions (e.g. malloc/free).
struct xnn_allocator {
  /// User-specified pointer that will be passed as-is to all functions in this structure.
  void* context;
  /// Pointer to a function to be called for general memory allocation.
  ///
  /// @param context - The user-specified pointer from xnn_allocator structure.
  /// @param size - The size of the memory block to allocate, in bytes.
  ///
  /// @returns Pointer to the allocated memory block of at least @ref size bytes.
  ///          If allocation fails, the function must return NULL.
  void* (*allocate)(void* context, size_t size);
  /// Pointer to a function to be called for general memory re-allocation, i.e. to increase or shrink a previously
  /// allocated memory block. The content of the old memory block is copied to the new memory block.
  ///
  /// @param context - The user-specified pointer from xnn_allocator structure.
  /// @param pointer - Pointer to a memory block allocated by @ref allocate or @ref reallocate functions. Can be NULL.
  ///                  If the pointer is NULL, the @ref reallocate call is equivalent to an @ref allocate call.
  /// @param size - The new size of the memory block to allocate, in bytes.
  ///
  /// @returns Pointer to the newly allocated memory block of at least @ref size bytes with the content of the previous
  ///          memory block.
  ///          If allocation fails, the function must return NULL, but must not release the previous memory block.
  void* (*reallocate)(void* context, void* pointer, size_t size);
  /// Pointer to a function to be called for general memory de-allocation.
  ///
  /// @param context - The user-specified pointer from xnn_allocator structure.
  /// @param pointer - Pointer to a memory block allocated by @ref allocate or @ref reallocate functions. Can be NULL.
  ///                  If the pointer is NULL, the @ref deallocate call is a no-op.
  void (*deallocate)(void* context, void* pointer);
  /// Pointer to a function to be called for aligned memory allocation.
  ///
  /// @param context - The user-specified pointer from xnn_allocator structure.
  /// @param alignment - The alignment of the memory block to allocate, in bytes. Alignment is always a power-of-2.
  /// @param size - The size of the memory block to allocate, in bytes.
  ///
  /// @returns Pointer to the allocated memory block of at least @ref size bytes.
  ///          If allocation fails, the function must return NULL.
  void* (*aligned_allocate)(void* context, size_t alignment, size_t size);
  /// Pointer to a function to be called for aligned memory de-allocation.
  ///
  /// @param context - The user-specified pointer from xnn_allocator structure.
  /// @param pointer - Pointer to a memory block allocated by @ref aligned_allocate function. Can be NULL.
  ///                  If the pointer is NULL, the @ref aligned_deallocate call is a no-op.
  void (*aligned_deallocate)(void* context, void* pointer);
};
135
/// Initialize XNNPACK library.
///
/// XNNPACK must be successfully initialized before use. During initialization, XNNPACK populates internal structures
/// depending on the host processor. Initialization can be time-consuming.
///
/// @param[in] allocator - structure with function pointers to be used for memory allocation and de-allocation.
///                        If this argument is NULL, system-provided memory management functions (e.g. malloc/free)
///                        will be used.
///
/// @retval xnn_status_success - XNNPACK is successfully initialized and ready to use.
/// @retval xnn_status_out_of_memory - initialization failed due to out-of-memory condition.
/// @retval xnn_status_unsupported_hardware - initialization failed because the host processor does not satisfy the
///                                           minimum hardware requirements for XNNPACK. E.g. this may happen on x86
///                                           processors without SSE2 extension, or on 32-bit ARM processors without
///                                           the NEON SIMD extension.
enum xnn_status xnn_initialize(const struct xnn_allocator* allocator);

/// Deinitialize XNNPACK library.
///
/// To avoid memory and resource leaks, users must call xnn_deinitialize once for each successful xnn_initialize call.
///
/// @retval xnn_status_success - deinitialization call succeeded.
enum xnn_status xnn_deinitialize(void);
159
/// Subgraph is an abstract representation of a neural network model.
/// Subgraph objects are used to define Values (tensors) and Nodes (operators) comprising the model.
typedef struct xnn_subgraph* xnn_subgraph_t;

/// Create an empty Subgraph object.
///
/// @param external_value_ids - number of Value IDs to reserve for communication with external graph representation.
///                             The Subgraph object would avoid creating internal Value IDs in the
///                             [0, external_value_ids-1] range.
/// @param flags - binary features of the subgraph. No supported flags are currently defined.
/// @param subgraph_out - pointer to the variable that will be initialized with a handle to the Subgraph object upon
///                       successful return.
enum xnn_status xnn_create_subgraph(
  uint32_t external_value_ids,
  uint32_t flags,
  xnn_subgraph_t* subgraph_out);

/// Destroy a Subgraph object, as well as Values, and Nodes associated with the subgraph.
///
/// @param subgraph - the Subgraph object to destroy.
enum xnn_status xnn_delete_subgraph(
  xnn_subgraph_t subgraph);
182
/// Mark a Value as an external input of the Subgraph (supplied by the caller at inference time).
#define XNN_VALUE_FLAG_EXTERNAL_INPUT 0x00000001
/// Mark a Value as an external output of the Subgraph (read by the caller after inference).
#define XNN_VALUE_FLAG_EXTERNAL_OUTPUT 0x00000002
/// NOTE(review): presumably keeps the Value's storage alive across Runtime invocations -- confirm semantics.
#define XNN_VALUE_FLAG_PERSISTENT 0x00000004

/// Sentinel Value ID: passing it as an external ID requests creation of an internal ID instead.
#define XNN_INVALID_VALUE_ID UINT32_MAX

/// Type of elements in a Value object.
enum xnn_datatype {
  /// Invalid data type. Valid Values never have this datatype.
  xnn_datatype_invalid = 0,
  /// IEEE754 single-precision floating-point.
  xnn_datatype_fp32 = 1,
  /// IEEE754 half-precision floating-point.
  xnn_datatype_fp16 = 2,
  /// Quantized 8-bit signed integer with shared per-Value quantization parameters.
  xnn_datatype_qint8 = 3,
  /// Quantized 8-bit unsigned integer with shared per-Value quantization parameters.
  xnn_datatype_quint8 = 4,
  /// Quantized 32-bit signed integer with shared per-Value quantization parameters.
  xnn_datatype_qint32 = 5,
  /// Quantized 8-bit signed integer with shared per-channel quantization parameters.
  xnn_datatype_qcint8 = 6,
  /// Quantized 32-bit signed integer with shared per-channel quantization parameters.
  xnn_datatype_qcint32 = 7,
};
208
/// Define a tensor-type Value and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Value.
/// @param datatype - type of the tensor elements.
/// @param num_dims - number of dimensions in the shape.
/// @param dims - pointer to an array of @a num_dims shape dimensions. If num_dims is 0, this pointer can be NULL.
///               XNNPACK does not keep any pointers to this array after the function returns.
/// @param data - pointer to static data used for tensor initialization. If the tensor is not statically initialized,
///               this pointer must be NULL. If non-NULL, the life-time of the static data must exceed the life-time
///               of the Subgraph object, and of any Runtime objects created from the Subgraph.
/// @param external_id - external ID for the Value. The ID must be within the range of reserved Value IDs specified on
///                      the Subgraph creation. If the external ID is XNN_INVALID_VALUE_ID, an internal ID will be
///                      created for the Value.
/// @param flags - binary features of the Value. Supported values are any combination of XNN_VALUE_FLAG_EXTERNAL_INPUT
///                and XNN_VALUE_FLAG_EXTERNAL_OUTPUT.
/// @param id_out - pointer to the variable that will be initialized with the Value ID upon successful return. If a
///                 valid @a external_id was provided, the variable will be initialized with the @a external_id value.
enum xnn_status xnn_define_tensor_value(
  xnn_subgraph_t subgraph,
  enum xnn_datatype datatype,
  size_t num_dims,
  const size_t* dims,
  const void* data,
  uint32_t external_id,
  uint32_t flags,
  uint32_t* id_out);
235
/// Define a quantized tensor-type Value and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Value.
/// @param datatype - type of the tensor elements.
/// @param zero_point - offset from zero to subtract from the quantized elements in the Value.
/// @param scale - multiplication factor to convert quantized elements to real representation.
/// @param num_dims - number of dimensions in the shape.
/// @param dims - pointer to an array of @a num_dims shape dimensions. If num_dims is 0, this pointer can be NULL.
///               XNNPACK does not keep any pointers to this array after the function returns.
/// @param data - pointer to static data used for tensor initialization. If the tensor is not statically initialized,
///               this pointer must be NULL. If non-NULL, the life-time of the static data must exceed the life-time
///               of the Subgraph object, and of any Runtime objects created from the Subgraph.
/// @param external_id - external ID for the Value. The ID must be within the range of reserved Value IDs specified on
///                      the Subgraph creation. If the external ID is XNN_INVALID_VALUE_ID, an internal ID will be
///                      created for the Value.
/// @param flags - binary features of the Value. Supported values are any combination of XNN_VALUE_FLAG_EXTERNAL_INPUT
///                and XNN_VALUE_FLAG_EXTERNAL_OUTPUT.
/// @param id_out - pointer to the variable that will be initialized with the Value ID upon successful return. If a
///                 valid @a external_id was provided, the variable will be initialized with the @a external_id value.
enum xnn_status xnn_define_quantized_tensor_value(
  xnn_subgraph_t subgraph,
  enum xnn_datatype datatype,
  int32_t zero_point,
  float scale,
  size_t num_dims,
  const size_t* dims,
  const void* data,
  uint32_t external_id,
  uint32_t flags,
  uint32_t* id_out);
266
/// Define a channelwise quantized tensor-type Value and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Value.
/// @param datatype - type of the tensor elements.
/// @param scale - per-channel multiplication factors to convert quantized elements to real representation.
///                NOTE(review): presumably one factor per element of the channel dimension (i.e. dims[channel_dim]
///                entries) -- confirm against the implementation.
/// @param num_dims - number of dimensions in the shape.
/// @param channel_dim - index of the channel dimension in the tensor with per-channel quantization parameters.
///                      Typically this is the first dimension (dimension #0) of the filter tensors in the Convolution,
///                      Deconvolution, and Fully Connected operators and the last dimension of the filter tensors in
///                      the Depthwise Convolution operators.
/// @param dims - pointer to an array of @a num_dims shape dimensions. If num_dims is 0, this pointer can be NULL.
///               XNNPACK does not keep any pointers to this array after the function returns.
/// @param data - pointer to static data used for tensor initialization. If the tensor is not statically initialized,
///               this pointer must be NULL. If non-NULL, the life-time of the static data must exceed the life-time
///               of the Subgraph object, and of any Runtime objects created from the Subgraph.
/// @param external_id - external ID for the Value. The ID must be within the range of reserved Value IDs specified on
///                      the Subgraph creation. If the external ID is XNN_INVALID_VALUE_ID, an internal ID will be
///                      created for the Value.
/// @param flags - binary features of the Value. Supported values are any combination of XNN_VALUE_FLAG_EXTERNAL_INPUT
///                and XNN_VALUE_FLAG_EXTERNAL_OUTPUT.
/// @param id_out - pointer to the variable that will be initialized with the Value ID upon successful return. If a
///                 valid @a external_id was provided, the variable will be initialized with the @a external_id value.
enum xnn_status xnn_define_channelwise_quantized_tensor_value(
  xnn_subgraph_t subgraph,
  enum xnn_datatype datatype,
  const float* scale,
  size_t num_dims,
  size_t channel_dim,
  const size_t* dims,
  const void* data,
  uint32_t external_id,
  uint32_t flags,
  uint32_t* id_out);
300
/// Define a Convert Node and add it to a Subgraph.
///
/// NOTE(review): a Convert Node presumably performs an element-wise datatype conversion between the input and output
/// tensors (inferred from the Node name; the shapes are required to match below) -- confirm against the implementation.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the shape of the input tensor.
/// @param flags - binary features of the Convert Node. No supported flags are currently defined.
enum xnn_status xnn_define_convert(
  xnn_subgraph_t subgraph,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
313
/// Define a 2D Convolution Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_padding_top - implicit zero-padding above 2D input data. Must be 0 if XNN_FLAG_TENSORFLOW_SAME_PADDING
///                            flag is specified.
/// @param input_padding_right - implicit zero-padding to the right of 2D input data. Must be 0 if
///                              XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
/// @param input_padding_bottom - implicit zero-padding below 2D input data. Must be 0 if
///                               XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
/// @param input_padding_left - implicit zero-padding to the left of 2D input data. Must be 0 if
///                             XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
/// @param kernel_height - kernel (filter) height.
/// @param kernel_width - kernel (filter) width.
/// @param subsampling_height - height of subsampling region for convolution output (convolution height stride).
/// @param subsampling_width - width of subsampling region for convolution output (convolution width stride).
/// @param dilation_height - dilation of kernel elements along the height dimension.
/// @param dilation_width - dilation of kernel elements along the width dimension.
/// @param groups - number of convolution groups.
/// @param group_input_channels - number of input channels per group.
/// @param group_output_channels - number of output channels per group.
/// @param output_min - lower bound for clipping output values.
/// @param output_max - upper bound for clipping output values.
/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
///                   with [N, IH, IW, groups * group_input_channels] dimensions.
/// @param filter_id - Value ID for the filter tensor. The filter tensor must be a 4D tensor defined in the @a subgraph
///                    with [groups * group_output_channels, kernel_height, kernel_width, group_input_channels]
///                    dimensions.
/// @param bias_id - Value ID for the bias tensor, or XNN_INVALID_VALUE_ID for a 2D Convolution Node without a bias. If
///                  present, the bias tensor must be a 1D tensor defined in the @a subgraph with [groups *
///                  group_output_channels] dimensions.
/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
///                    with [N, OH, OW, groups * group_output_channels] dimensions.
/// @param flags - binary features of the 2D Convolution Node. The only currently supported value is
///                XNN_FLAG_TENSORFLOW_SAME_PADDING.
enum xnn_status xnn_define_convolution_2d(
  xnn_subgraph_t subgraph,
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t subsampling_height,
  uint32_t subsampling_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  float output_min,
  float output_max,
  uint32_t input_id,
  uint32_t filter_id,
  uint32_t bias_id,
  uint32_t output_id,
  uint32_t flags);
370
/// Define a 2D Deconvolution (Transposed Convolution) Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param padding_top - implicit padding above 2D output data.
/// @param padding_right - implicit padding to the right of 2D output data.
/// @param padding_bottom - implicit padding below 2D output data.
/// @param padding_left - implicit padding to the left of 2D output data.
/// @param adjustment_height - additional elements in the bottom of the 2D output data.
/// @param adjustment_width - additional elements to the right of the 2D output data.
/// @param kernel_height - kernel (filter) height.
/// @param kernel_width - kernel (filter) width.
/// @param upsampling_height - height of upsampling region for deconvolution input (deconvolution height stride).
/// @param upsampling_width - width of upsampling region for deconvolution input (deconvolution width stride).
/// @param dilation_height - dilation of kernel elements along the height dimension.
/// @param dilation_width - dilation of kernel elements along the width dimension.
/// @param groups - number of convolution groups.
/// @param group_input_channels - number of input channels per group.
/// @param group_output_channels - number of output channels per group.
/// @param output_min - lower bound for clipping output values.
/// @param output_max - upper bound for clipping output values.
/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
///                   with [N, IH, IW, groups * group_input_channels] dimensions.
/// @param filter_id - Value ID for the filter tensor. The filter tensor must be a 4D tensor defined in the @a subgraph
///                    with [groups * group_output_channels, kernel_height, kernel_width, group_input_channels]
///                    dimensions.
/// @param bias_id - Value ID for the bias tensor, or XNN_INVALID_VALUE_ID for a 2D Deconvolution Node without a bias.
///                  If present, the bias tensor must be a 1D tensor defined in the @a subgraph with
///                  [groups * group_output_channels] dimensions.
/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
///                    with [N, OH, OW, groups * group_output_channels] dimensions.
/// @param flags - binary features of the 2D Deconvolution Node. No supported flags are currently defined.
enum xnn_status xnn_define_deconvolution_2d(
  xnn_subgraph_t subgraph,
  uint32_t padding_top,
  uint32_t padding_right,
  uint32_t padding_bottom,
  uint32_t padding_left,
  uint32_t adjustment_height,
  uint32_t adjustment_width,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t upsampling_height,
  uint32_t upsampling_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  float output_min,
  float output_max,
  uint32_t input_id,
  uint32_t filter_id,
  uint32_t bias_id,
  uint32_t output_id,
  uint32_t flags);
426
/// Define a 2D Depthwise Convolution Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_padding_top - implicit zero-padding above 2D input data. Must be 0 if XNN_FLAG_TENSORFLOW_SAME_PADDING
///                            flag is specified.
/// @param input_padding_right - implicit zero-padding to the right of 2D input data. Must be 0 if
///                              XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
/// @param input_padding_bottom - implicit zero-padding below 2D input data. Must be 0 if
///                               XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
/// @param input_padding_left - implicit zero-padding to the left of 2D input data. Must be 0 if
///                             XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
/// @param kernel_height - kernel (filter) height.
/// @param kernel_width - kernel (filter) width.
/// @param subsampling_height - height of subsampling region for convolution output (convolution height stride).
/// @param subsampling_width - width of subsampling region for convolution output (convolution width stride).
/// @param dilation_height - dilation of kernel elements along the height dimension.
/// @param dilation_width - dilation of kernel elements along the width dimension.
/// @param depth_multiplier - ratio of output channels to input channels.
/// @param input_channels - number of input channels.
/// @param output_min - lower bound for clipping output values.
/// @param output_max - upper bound for clipping output values.
/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
///                   with [N, IH, IW, input_channels] dimensions.
/// @param filter_id - Value ID for the filter tensor. The filter tensor must be a 4D tensor defined in the @a subgraph
///                    with [1, kernel_height, kernel_width, input_channels * depth_multiplier] dimensions.
/// @param bias_id - Value ID for the bias tensor, or XNN_INVALID_VALUE_ID for a 2D Depthwise Convolution Node without
///                  a bias. If present, the bias tensor must be a 1D tensor defined in the @a subgraph with
///                  [input_channels * depth_multiplier] dimensions.
/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
///                    with [N, OH, OW, input_channels * depth_multiplier] dimensions.
/// @param flags - binary features of the 2D Depthwise Convolution Node. The only currently supported value is
///                XNN_FLAG_TENSORFLOW_SAME_PADDING.
enum xnn_status xnn_define_depthwise_convolution_2d(
  xnn_subgraph_t subgraph,
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t subsampling_height,
  uint32_t subsampling_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t depth_multiplier,
  size_t input_channels,
  float output_min,
  float output_max,
  uint32_t input_id,
  uint32_t filter_id,
  uint32_t bias_id,
  uint32_t output_id,
  uint32_t flags);
480
/// Define a Depth To Space Node and add it to a Subgraph.
///
/// The Depth To Space Node rearranges data from depth into blocks of spatial data (a reverse transform to
/// Space To Depth). For a given input pixel, an output square of pixels with side @a block_size is formed from values
/// in the corresponding number of its channels. The output depth is therefore @a block_size x @a block_size times
/// smaller than that of the input.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
///                   with [N, IH, IW, OC * block_size * block_size] dimensions.
/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
///                    with [N, IH * block_size, IW * block_size, OC] dimensions.
/// @param block_size - the size of the spatial block.
/// @param flags - binary features of the Depth To Space Node. No supported flags are currently defined.
enum xnn_status xnn_define_depth_to_space(
  xnn_subgraph_t subgraph,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t block_size,
  uint32_t flags);
501
/// Define a 1D Global Average Pooling Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param output_min - lower bound for clipping output values.
/// @param output_max - upper bound for clipping output values.
/// @param input_id - Value ID for the input tensor. The input tensor must be a dense tensor with 2 or more dimensions
///                   defined in the @a subgraph. Averaging is performed across the second-innermost dimension.
/// @param output_id - Value ID for the output tensor. The output tensor must be a dense tensor with 2 or more
///                    dimensions defined in the @a subgraph.
///                    NOTE(review): presumably the pooled (second-innermost) dimension has size 1 in the output --
///                    confirm against the implementation.
/// @param flags - binary features of the 1D Global Average Pooling Node. No supported flags are currently defined.
enum xnn_status xnn_define_global_average_pooling_1d(
  xnn_subgraph_t subgraph,
  float output_min,
  float output_max,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
519
/// Define a 2D Global Average Pooling Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param output_min - lower bound for clipping output values.
/// @param output_max - upper bound for clipping output values.
/// @param input_id - Value ID for the input tensor. The input tensor must be a dense tensor with 3 or more dimensions
///                   defined in the @a subgraph. Averaging is performed across the second- and third-innermost
///                   dimensions.
/// @param output_id - Value ID for the output tensor. The output tensor must be a dense tensor with 3 or more
///                    dimensions defined in the @a subgraph.
///                    NOTE(review): presumably the pooled (second- and third-innermost) dimensions have size 1 in the
///                    output -- confirm against the implementation.
/// @param flags - binary features of the 2D Global Average Pooling Node. No supported flags are currently defined.
enum xnn_status xnn_define_global_average_pooling_2d(
  xnn_subgraph_t subgraph,
  float output_min,
  float output_max,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
538
/// Define a 2D Average Pooling Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_padding_top - implicit zero-padding above 2D input data. Must be 0 if XNN_FLAG_TENSORFLOW_SAME_PADDING
///                            flag is specified.
/// @param input_padding_right - implicit zero-padding to the right of 2D input data. Must be 0 if
///                              XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
/// @param input_padding_bottom - implicit zero-padding below 2D input data. Must be 0 if
///                               XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
/// @param input_padding_left - implicit zero-padding to the left of 2D input data. Must be 0 if
///                             XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
/// @param pooling_height - pooling (kernel) height.
/// @param pooling_width - pooling (kernel) width.
/// @param stride_height - displacing of the pooling window in the vertical dimension of the input pixels corresponding
///                        to vertically adjacent output pixels.
/// @param stride_width - displacing of the pooling window in the horizontal dimension of the input pixels corresponding
///                       to horizontally adjacent output pixels.
/// @param output_min - lower bound for clipping output values.
/// @param output_max - upper bound for clipping output values.
/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
///                   with [N, IH, IW, channels] dimensions.
/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
///                    with [N, OH, OW, channels] dimensions.
/// @param flags - binary features of the 2D Average Pooling Node. The only currently supported value is
///                XNN_FLAG_TENSORFLOW_SAME_PADDING.
enum xnn_status xnn_define_average_pooling_2d(
  xnn_subgraph_t subgraph,
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  uint32_t stride_height,
  uint32_t stride_width,
  float output_min,
  float output_max,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
579
/// Define a Fully Connected Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param output_min - lower bound for clipping output values.
/// @param output_max - upper bound for clipping output values.
/// @param input_id - Value ID for the input tensor. The input tensor must be an N-dimensional tensor defined in the
///                   @a subgraph. If XNN_FLAG_TENSORFLOW_RESHAPE_2D is not specified, the input tensor must be at least
///                   1D and its last dimension must match the last dimension of the filter tensor. In particular, if
///                   input is a 2D tensor, it must have [batch_size, input_channels] dimensions.
///                   If XNN_FLAG_TENSORFLOW_RESHAPE_2D is specified, the number of elements in the input tensor must be
///                   divisible by the input_channels. The tensor will be first flattened into a 1D tensor of
///                   [num_input_elements] dimensions, then reshaped into a 2D tensor of
///                   [num_input_elements / input_channels, input_channels] dimensions where num_input_elements is the
///                   total number of elements in the input tensor.
/// @param filter_id - Value ID for the filter tensor. The filter tensor must be a 2D tensor defined in the @a subgraph.
///                    If the XNN_FLAG_TRANSPOSE_WEIGHTS flag is not specified, the filter tensor must have
///                    [output_channels, input_channels] dimensions. If the XNN_FLAG_TRANSPOSE_WEIGHTS flag is
///                    specified, the filter tensor must have [input_channels, output_channels] dimensions.
/// @param bias_id - Value ID for the bias tensor, or XNN_INVALID_VALUE_ID for a Fully Connected Node without a bias.
///                  If present, the bias tensor must be a 1D tensor defined in the @a subgraph with [output_channels]
///                  dimensions.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph.
///                    If XNN_FLAG_TENSORFLOW_RESHAPE_2D is not specified, the output tensor must have the same
///                    dimensionality as the input tensor, all its dimensions but the last one must match the
///                    corresponding dimensions of the input tensor, and the last dimension of the output tensor must
///                    match the first dimension of the filter tensor. In particular, if input is a 2D tensor, output
///                    must be a 2D tensor of [batch_size, output_channels] dimensions.
///                    If XNN_FLAG_TENSORFLOW_RESHAPE_2D is specified, output must be a 2D tensor of
///                    [num_input_elements / input_channels, output_channels] dimensions where num_input_elements is the
///                    total number of elements in the input tensor.
/// @param flags - binary features of the Fully Connected Node. The only currently supported values are
///                XNN_FLAG_TENSORFLOW_RESHAPE_2D and XNN_FLAG_TRANSPOSE_WEIGHTS.
enum xnn_status xnn_define_fully_connected(
  xnn_subgraph_t subgraph,
  float output_min,
  float output_max,
  uint32_t input_id,
  uint32_t filter_id,
  uint32_t bias_id,
  uint32_t output_id,
  uint32_t flags);
621
/// Define a 2D Max Pooling Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_padding_top - implicit zero-padding above 2D input data. Must be 0 if XNN_FLAG_TENSORFLOW_SAME_PADDING
///                            flag is specified.
/// @param input_padding_right - implicit zero-padding to the right of 2D input data. Must be 0 if
///                              XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
/// @param input_padding_bottom - implicit zero-padding below 2D input data. Must be 0 if
///                               XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
/// @param input_padding_left - implicit zero-padding to the left of 2D input data. Must be 0 if
///                             XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
/// @param pooling_height - pooling (kernel) height.
/// @param pooling_width - pooling (kernel) width.
/// @param stride_height - displacing of the pooling window in the vertical dimension of the input pixels corresponding
///                        to vertically adjacent output pixels.
/// @param stride_width - displacing of the pooling window in the horizontal dimension of the input pixels corresponding
///                       to horizontally adjacent output pixels.
/// @param dilation_height - dilation of pooling elements along the height dimension.
/// @param dilation_width - dilation of pooling elements along the width dimension.
/// @param output_min - lower bound for clipping output values.
/// @param output_max - upper bound for clipping output values.
/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
///                   with [N, IH, IW, channels] dimensions.
/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
///                    with [N, OH, OW, channels] dimensions.
/// @param flags - binary features of the 2D Max Pooling Node. The only currently supported value is
///                XNN_FLAG_TENSORFLOW_SAME_PADDING.
enum xnn_status xnn_define_max_pooling_2d(
  xnn_subgraph_t subgraph,
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  uint32_t stride_height,
  uint32_t stride_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  float output_min,
  float output_max,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
666
/// Define a 2D ArgMax Pooling Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_padding_top - implicit zero-padding above 2D input data.
/// @param input_padding_right - implicit zero-padding to the right of 2D input data.
/// @param input_padding_bottom - implicit zero-padding below 2D input data.
/// @param input_padding_left - implicit zero-padding to the left of 2D input data.
/// @param pooling_height - pooling (kernel) height. Vertical stride between pooling regions match this value.
/// @param pooling_width - pooling (kernel) width. Horizontal stride between pooling regions match this value.
/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
///                   with [N, IH, IW, channels] dimensions.
/// @param output_value_id - Value ID for the output tensor with the maximum values in the pools. The output tensor must
///                          be a 4D tensor defined in the @a subgraph with [N, OH, OW, channels] dimensions.
/// @param output_index_id - Value ID for the output tensor with the indexes of the maximum values in the pools. The
///                          output tensor must be a 4D tensor defined in the @a subgraph with [N, OH, OW, channels]
///                          dimensions.
/// @param flags - binary features of the 2D ArgMax Pooling Node. No supported flags are currently defined.
enum xnn_status xnn_define_argmax_pooling_2d(
  xnn_subgraph_t subgraph,
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  uint32_t input_id,
  uint32_t output_value_id,
  uint32_t output_index_id,
  uint32_t flags);
696
/// Define a 2D UnPooling Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param padding_top - implicit padding above 2D output data.
/// @param padding_right - implicit padding to the right of 2D output data.
/// @param padding_bottom - implicit padding below 2D output data.
/// @param padding_left - implicit padding to the left of 2D output data.
/// @param pooling_height - height of the pooling window.
/// @param pooling_width - width of the pooling window.
/// @param input_value_id - Value ID for the input tensor with the max-pooling values to invert. The input value tensor
///                         must be a 4D tensor defined in the @a subgraph with [N, IH, IW, channels] dimensions.
/// @param input_index_id - Value ID for the input tensor with the indices of the per-pool maximum values produced by
///                         a 2D ArgMax Pooling Node. The input tensor must be a 4D tensor defined in the @a subgraph
///                         with [N, IH, IW, channels] dimensions.
/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
///                    with [N, OH, OW, channels] dimensions.
/// @param flags - binary features of the 2D UnPooling Node. No supported flags are currently defined.
enum xnn_status xnn_define_unpooling_2d(
  xnn_subgraph_t subgraph,
  uint32_t padding_top,
  uint32_t padding_right,
  uint32_t padding_bottom,
  uint32_t padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  uint32_t input_value_id,
  uint32_t input_index_id,
  uint32_t output_id,
  uint32_t flags);
726
/// Define a 2-Input Add Node and add it to a Subgraph.
///
/// The 2-Input Add Node computes elementwise addition of two tensor inputs with numpy broadcasting rules.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param output_min - lower bound for clipping output values.
/// @param output_max - upper bound for clipping output values.
/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the second
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
///                    that dimension.
/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the first
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
///                    that dimension.
/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined
///                    in the @a subgraph with each dimension equal to the maximum between the corresponding dimension
///                    of the two inputs.
/// @param flags - binary features of the Add Node. No supported flags are currently defined.
enum xnn_status xnn_define_add2(
  xnn_subgraph_t subgraph,
  float output_min,
  float output_max,
  uint32_t input1_id,
  uint32_t input2_id,
  uint32_t output_id,
  uint32_t flags);
754
/// Define a 2-Input Multiply Node and add it to a Subgraph.
///
/// The 2-Input Multiply Node computes elementwise multiplication of two tensor inputs with numpy broadcasting rules.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param output_min - lower bound for clipping output values.
/// @param output_max - upper bound for clipping output values.
/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the second
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
///                    that dimension.
/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the first
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
///                    that dimension.
/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined
///                    in the @a subgraph with each dimension equal to the maximum between the corresponding dimension
///                    of the two inputs.
/// @param flags - binary features of the Multiply Node. No supported flags are currently defined.
enum xnn_status xnn_define_multiply2(
  xnn_subgraph_t subgraph,
  float output_min,
  float output_max,
  uint32_t input1_id,
  uint32_t input2_id,
  uint32_t output_id,
  uint32_t flags);
782
/// Define a Subtract Node and add it to a Subgraph.
///
/// The Subtract Node computes elementwise subtraction of two tensor inputs with numpy broadcasting rules.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param output_min - lower bound for clipping output values.
/// @param output_max - upper bound for clipping output values.
/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the second
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
///                    that dimension.
/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the first
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
///                    that dimension.
/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined
///                    in the @a subgraph with each dimension equal to the maximum between the corresponding dimension
///                    of the two inputs.
/// @param flags - binary features of the Subtract Node. No supported flags are currently defined.
enum xnn_status xnn_define_subtract(
  xnn_subgraph_t subgraph,
  float output_min,
  float output_max,
  uint32_t input1_id,
  uint32_t input2_id,
  uint32_t output_id,
  uint32_t flags);
810
/// Define a Divide Node and add it to a Subgraph.
///
/// The Divide Node computes elementwise division of two tensor inputs with numpy broadcasting rules.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param output_min - lower bound for clipping output values.
/// @param output_max - upper bound for clipping output values.
/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the second
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
///                    that dimension.
/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the first
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
///                    that dimension.
/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined
///                    in the @a subgraph with each dimension equal to the maximum between the corresponding dimension
///                    of the two inputs.
/// @param flags - binary features of the Divide Node. No supported flags are currently defined.
enum xnn_status xnn_define_divide(
  xnn_subgraph_t subgraph,
  float output_min,
  float output_max,
  uint32_t input1_id,
  uint32_t input2_id,
  uint32_t output_id,
  uint32_t flags);
838
/// Define a 2-Input Maximum Node and add it to a Subgraph.
///
/// The 2-Input Maximum Node computes elementwise maximum of two tensor inputs with numpy broadcasting rules.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the second
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
///                    that dimension.
/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the first
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
///                    that dimension.
/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined
///                    in the @a subgraph with each dimension equal to the maximum between the corresponding dimension
///                    of the two inputs.
/// @param flags - binary features of the Maximum Node. No supported flags are currently defined.
enum xnn_status xnn_define_maximum2(
  xnn_subgraph_t subgraph,
  uint32_t input1_id,
  uint32_t input2_id,
  uint32_t output_id,
  uint32_t flags);
862
/// Define a 2-Input Minimum Node and add it to a Subgraph.
///
/// The 2-Input Minimum Node computes elementwise minimum of two tensor inputs with numpy broadcasting rules.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the second
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
///                    that dimension.
/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the first
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
///                    that dimension.
/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined
///                    in the @a subgraph with each dimension equal to the maximum between the corresponding dimension
///                    of the two inputs.
/// @param flags - binary features of the Minimum Node. No supported flags are currently defined.
enum xnn_status xnn_define_minimum2(
  xnn_subgraph_t subgraph,
  uint32_t input1_id,
  uint32_t input2_id,
  uint32_t output_id,
  uint32_t flags);
886
/// Define a Squared Difference Node and add it to a Subgraph.
///
/// The Squared Difference Node computes elementwise squared difference of two tensor inputs with numpy broadcasting
/// rules.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the second
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
///                    that dimension.
/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the first
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcast along
///                    that dimension.
/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined
///                    in the @a subgraph with each dimension equal to the maximum between the corresponding dimension
///                    of the two inputs.
/// @param flags - binary features of the Squared Difference Node. No supported flags are currently defined.
enum xnn_status xnn_define_squared_difference(
  xnn_subgraph_t subgraph,
  uint32_t input1_id,
  uint32_t input2_id,
  uint32_t output_id,
  uint32_t flags);
911
/// Define a Constant Pad Node with static padding specification and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param pre_paddings - number of padding elements to insert before input elements for every dimension. This array
///                       must have as many elements as the number of dimensions in the input tensor.
/// @param post_paddings - number of padding elements to insert after input elements for every dimension. This array
///                        must have as many elements as the number of dimensions in the input tensor.
/// @param padding_value - constant value used to initialize padding elements.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the shape of the input tensor with padding.
/// @param flags - binary features of the Constant Pad Node. No supported flags are currently defined.
enum xnn_status xnn_define_static_constant_pad(
  xnn_subgraph_t subgraph,
  const size_t* pre_paddings,
  const size_t* post_paddings,
  float padding_value,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
932
/// Define a 2-Input Concatenate Node and add it to a Subgraph.
///
/// The 2-Input Concatenate Node concatenates two tensors along a specified axis.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param axis - the axis to concatenate the two input tensors along.
/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the
///                    second input.
/// @param input2_id - Value ID for the second input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the
///                    first input.
/// @param output_id - Value ID for the output tensor. The output tensor must be an N-dimensional tensor defined
///                    in the @a subgraph with each dimension equal to the dimension of both inputs, except the axis
///                    dimension, where it is the sum of the corresponding dimensions of both inputs.
/// @param flags - binary features of the Concatenate Node. No supported flags are currently defined.
enum xnn_status xnn_define_concatenate2(
  xnn_subgraph_t subgraph,
  size_t axis,
  uint32_t input1_id,
  uint32_t input2_id,
  uint32_t output_id,
  uint32_t flags);
956
/// Define a 3-Input Concatenate Node and add it to a Subgraph.
///
/// The 3-Input Concatenate Node concatenates three tensors along a specified axis.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param axis - the axis to concatenate the three input tensors along.
/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the
///                    other inputs.
/// @param input2_id - Value ID for the second input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the
///                    other inputs.
/// @param input3_id - Value ID for the third input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the
///                    other inputs.
/// @param output_id - Value ID for the output tensor. The output tensor must be an N-dimensional tensor defined
///                    in the @a subgraph with each dimension equal to the dimension of all inputs, except the axis
///                    dimension, where it is the sum of the corresponding dimensions of all inputs.
/// @param flags - binary features of the Concatenate Node. No supported flags are currently defined.
enum xnn_status xnn_define_concatenate3(
  xnn_subgraph_t subgraph,
  size_t axis,
  uint32_t input1_id,
  uint32_t input2_id,
  uint32_t input3_id,
  uint32_t output_id,
  uint32_t flags);
984
/// Define a 4-Input Concatenate Node and add it to a Subgraph.
///
/// The 4-Input Concatenate Node concatenates four tensors along a specified axis.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param axis - the axis to concatenate the four input tensors along.
/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the
///                    other inputs.
/// @param input2_id - Value ID for the second input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the
///                    other inputs.
/// @param input3_id - Value ID for the third input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the
///                    other inputs.
/// @param input4_id - Value ID for the fourth input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the
///                    other inputs.
/// @param output_id - Value ID for the output tensor. The output tensor must be an N-dimensional tensor defined
///                    in the @a subgraph with each dimension equal to the dimension of all inputs, except the axis
///                    dimension, where it is the sum of the corresponding dimensions of all inputs.
/// @param flags - binary features of the Concatenate Node. No supported flags are currently defined.
enum xnn_status xnn_define_concatenate4(
  xnn_subgraph_t subgraph,
  size_t axis,
  uint32_t input1_id,
  uint32_t input2_id,
  uint32_t input3_id,
  uint32_t input4_id,
  uint32_t output_id,
  uint32_t flags);
1016
/// Define a Copy Node and add it to a Subgraph.
///
/// The Copy Node copies an input tensor to an output tensor.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the shape of the input tensor.
/// @param flags - binary features of the Copy Node. No supported flags are currently defined.
enum xnn_status xnn_define_copy(
  xnn_subgraph_t subgraph,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
1031
/// Define a 2-Output Split Node and add it to a Subgraph.
///
/// The 2-Output Split Node splits an input tensor into two output tensors along a specified axis evenly.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param split_dim - the dimension to split the input tensor along.
/// @param input_id - Value ID for the input tensor. The input tensor must be an N-dimensional tensor defined in the @a
///                   subgraph.
/// @param output1_id - Value ID for the first output tensor. The output tensor must be an N-dimensional tensor defined
///                     in the @a subgraph with each dimension, except the axis, equal to the corresponding dimension
///                     of the second output. The split_dim dimension is half of the input's split_dim.
/// @param output2_id - Value ID for the second output tensor. The output tensor must be an N-dimensional tensor
///                     defined in the @a subgraph with each dimension, except the axis, equal to the corresponding
///                     dimension of the first output. The split_dim dimension is half of the input's split_dim.
/// @param flags - binary features of the Split Node. No supported flags are currently defined.
enum xnn_status xnn_define_even_split2(
  xnn_subgraph_t subgraph,
  size_t split_dim,
  uint32_t input_id,
  uint32_t output1_id,
  uint32_t output2_id,
  uint32_t flags);
1054
/// Define a 3-Output Split Node and add it to a Subgraph.
///
/// The 3-Output Split Node splits an input tensor into three output tensors along a specified axis evenly.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param split_dim - the dimension to split the input tensor along.
/// @param input_id - Value ID for the input tensor. The input tensor must be an N-dimensional tensor defined in the @a
///                   subgraph.
/// @param output1_id - Value ID for the first output tensor. The output tensor must be an N-dimensional tensor defined
///                     in the @a subgraph with each dimension, except the axis, equal to the corresponding dimension
///                     of the second and third output. The split_dim dimension is one third of the input's split_dim.
/// @param output2_id - Value ID for the second output tensor. The output tensor must be an N-dimensional tensor
///                     defined in the @a subgraph with each dimension, except the axis, equal to the corresponding
///                     dimension of the first and third output. The split_dim dimension is one third of the input's
///                     split_dim.
/// @param output3_id - Value ID for the third output tensor. The output tensor must be an N-dimensional tensor
///                     defined in the @a subgraph with each dimension, except the axis, equal to the corresponding
///                     dimension of the first and second output. The split_dim dimension is one third of the input's
///                     split_dim.
/// @param flags - binary features of the Split Node. No supported flags are currently defined.
enum xnn_status xnn_define_even_split3(
  xnn_subgraph_t subgraph,
  size_t split_dim,
  uint32_t input_id,
  uint32_t output1_id,
  uint32_t output2_id,
  uint32_t output3_id,
  uint32_t flags);
1083
/// Define a 4-Output Split Node and add it to a Subgraph.
///
/// The 4-Output Split Node splits an input tensor into four output tensors along a specified axis evenly.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param split_dim - the dimension to split the input tensor along.
/// @param input_id - Value ID for the input tensor. The input tensor must be an N-dimensional tensor defined in the @a
///                   subgraph.
/// @param output1_id - Value ID for the first output tensor. The output tensor must be an N-dimensional tensor defined
///                     in the @a subgraph with each dimension, except the axis, equal to the corresponding dimension
///                     of the other output tensors. The split_dim dimension is one fourth of the input's split_dim.
/// @param output2_id - Value ID for the second output tensor. The output tensor must be an N-dimensional tensor
///                     defined in the @a subgraph with each dimension, except the axis, equal to the corresponding
///                     dimension of the other output tensors. The split_dim dimension is one fourth of the input's
///                     split_dim.
/// @param output3_id - Value ID for the third output tensor. The output tensor must be an N-dimensional tensor
///                     defined in the @a subgraph with each dimension, except the axis, equal to the corresponding
///                     dimension of the other output tensors. The split_dim dimension is one fourth of the input's
///                     split_dim.
/// @param output4_id - Value ID for the fourth output tensor. The output tensor must be an N-dimensional tensor
///                     defined in the @a subgraph with each dimension, except the axis, equal to the corresponding
///                     dimension of the other output tensors. The split_dim dimension is one fourth of the input's
///                     split_dim.
/// @param flags - binary features of the Split Node. No supported flags are currently defined.
enum xnn_status xnn_define_even_split4(
  xnn_subgraph_t subgraph,
  size_t split_dim,
  uint32_t input_id,
  uint32_t output1_id,
  uint32_t output2_id,
  uint32_t output3_id,
  uint32_t output4_id,
  uint32_t flags);
1117
/// Define a Reshape Node with static shape specification and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param num_dims - number of shape dimensions in the output tensor.
/// @param new_shape - shape dimensions of the output tensor.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the @a new_shape dimensions. The output tensor must have the same total number
///                    of elements as the input tensor.
/// @param flags - binary features of the Reshape Node. No supported flags are currently defined.
enum xnn_status xnn_define_static_reshape(
  xnn_subgraph_t subgraph,
  size_t num_dims,
  const size_t* new_shape,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
1134
1135/// Define a 2D Resize Bilinear Node with static output height & width specification and add it to a Subgraph.
1136///
1137/// @param subgraph - a Subgraph object that will own the created Node.
1138/// @param new_height - height dimension of the output tensor.
1139/// @param new_width - width dimension of the output tensor.
1140/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
1141/// with [N, H, W, C] dimensions.
1142/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
1143/// with [N, new_height, new_width, C] dimensions.
1144/// @param flags - binary features of the 2D Resize Bilinear Node. The only currently supported values are
1145/// XNN_FLAG_TENSORFLOW_LEGACY_MODE and XNN_FLAG_ALIGN_CORNERS, which are mutually exclusive.
1146enum xnn_status xnn_define_static_resize_bilinear_2d(
1147 xnn_subgraph_t subgraph,
1148 size_t new_height,
1149 size_t new_width,
1150 uint32_t input_id,
1151 uint32_t output_id,
1152 uint32_t flags);
1153
/// Define a PReLU (Parametric ReLU) Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
///                   with [N, H, W, channels] dimensions.
/// @param slope_id - Value ID for the slope tensor. The slope tensor must be a 1D tensor defined in the @a subgraph
///                   with [channels] dimensions.
/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
///                    with [N, H, W, channels] dimensions.
/// @param flags - binary features of the PReLU Node. No supported flags are currently defined.
enum xnn_status xnn_define_prelu(
  xnn_subgraph_t subgraph,
  uint32_t input_id,
  uint32_t slope_id,
  uint32_t output_id,
  uint32_t flags);
1170
/// Define an Abs Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the shape of the input tensor.
/// @param flags - binary features of the Abs Node. No supported flags are currently defined.
enum xnn_status xnn_define_abs(
  xnn_subgraph_t subgraph,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
1183
1184/// Define a Bankers' Rounding Node and add it to a Subgraph.
1185///
1186/// @param subgraph - a Subgraph object that will own the created Node.
1187/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
1188/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1189/// shape must match the shape of the input tensor.
1190/// @param flags - binary features of the Bankers' Rounding Node. No supported flags are currently defined.
1191enum xnn_status xnn_define_bankers_rounding(
1192 xnn_subgraph_t subgraph,
1193 uint32_t input_id,
1194 uint32_t output_id,
1195 uint32_t flags);
1196
1197/// Define a Ceiling Node and add it to a Subgraph.
1198///
1199/// @param subgraph - a Subgraph object that will own the created Node.
1200/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
1201/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1202/// shape must match the shape of the input tensor.
1203/// @param flags - binary features of the Ceiling Node. No supported flags are currently defined.
1204enum xnn_status xnn_define_ceiling(
1205 xnn_subgraph_t subgraph,
1206 uint32_t input_id,
1207 uint32_t output_id,
1208 uint32_t flags);
1209
1210/// Define a Clamp Node and add it to a Subgraph.
1211///
1212/// @param subgraph - a Subgraph object that will own the created Node.
1213/// @param output_min - lower bound for clipping output values.
1214/// @param output_max - upper bound for clipping output values.
1215/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
1216/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1217/// shape must match the shape of the input tensor.
1218/// @param flags - binary features of the Clamp Node. No supported flags are currently defined.
1219enum xnn_status xnn_define_clamp(
1220 xnn_subgraph_t subgraph,
1221 float output_min,
1222 float output_max,
1223 uint32_t input_id,
1224 uint32_t output_id,
1225 uint32_t flags);
1226
1227/// Define an ELU (Exponential Linear Unit) Node and add it to a Subgraph.
1228///
1229/// @param subgraph - a Subgraph object that will own the created Node.
1230/// @param alpha - scale factor for negative output elements.
1231/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
1232/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1233/// shape must match the shape of the input tensor.
1234/// @param flags - binary features of the ELU Node. No supported flags are currently defined.
1235enum xnn_status xnn_define_elu(
1236 xnn_subgraph_t subgraph,
1237 float alpha,
1238 uint32_t input_id,
1239 uint32_t output_id,
1240 uint32_t flags);
1241
1242/// Define a Floor Node and add it to a Subgraph.
1243///
1244/// @param subgraph - a Subgraph object that will own the created Node.
1245/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
1246/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1247/// shape must match the shape of the input tensor.
1248/// @param flags - binary features of the Floor Node. No supported flags are currently defined.
1249enum xnn_status xnn_define_floor(
1250 xnn_subgraph_t subgraph,
1251 uint32_t input_id,
1252 uint32_t output_id,
1253 uint32_t flags);
1254
1255/// Define a HardSwish Node and add it to a Subgraph.
1256///
1257/// @param subgraph - a Subgraph object that will own the created Node.
1258/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
1259/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1260/// shape must match the shape of the input tensor.
1261/// @param flags - binary features of the HardSwish Node. No supported flags are currently defined.
1262enum xnn_status xnn_define_hardswish(
1263 xnn_subgraph_t subgraph,
1264 uint32_t input_id,
1265 uint32_t output_id,
1266 uint32_t flags);
1267
1268/// Define a Leaky ReLU Node and add it to a Subgraph.
1269///
1270/// @param subgraph - a Subgraph object that will own the created Node.
1271/// @param negative_slope - scale factor for negative input elements.
1272/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
1273/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1274/// shape must match the shape of the input tensor.
1275/// @param flags - binary features of the Leaky ReLU Node. No supported flags are currently defined.
1276enum xnn_status xnn_define_leaky_relu(
1277 xnn_subgraph_t subgraph,
1278 float negative_slope,
1279 uint32_t input_id,
1280 uint32_t output_id,
1281 uint32_t flags);
1282
1283/// Define a Negate Node and add it to a Subgraph.
1284///
1285/// @param subgraph - a Subgraph object that will own the created Node.
1286/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
1287/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1288/// shape must match the shape of the input tensor.
1289/// @param flags - binary features of the Negate Node. No supported flags are currently defined.
1290enum xnn_status xnn_define_negate(
1291 xnn_subgraph_t subgraph,
1292 uint32_t input_id,
1293 uint32_t output_id,
1294 uint32_t flags);
1295
1296/// Define a Sigmoid Node and add it to a Subgraph.
1297///
1298/// @param subgraph - a Subgraph object that will own the created Node.
1299/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
1300/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1301/// shape must match the shape of the input tensor.
1302/// @param flags - binary features of the Sigmoid Node. No supported flags are currently defined.
1303enum xnn_status xnn_define_sigmoid(
1304 xnn_subgraph_t subgraph,
1305 uint32_t input_id,
1306 uint32_t output_id,
1307 uint32_t flags);
1308
1309/// Define a SoftMax Node and add it to a Subgraph.
1310///
1311/// @param subgraph - a Subgraph object that will own the created Node.
1312/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph, and have at
1313/// least one dimension.
1314/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1315/// shape must match the shape of the input tensor.
1316/// @param flags - binary features of the SoftMax Node. No supported flags are currently defined.
1317enum xnn_status xnn_define_softmax(
1318 xnn_subgraph_t subgraph,
1319 uint32_t input_id,
1320 uint32_t output_id,
1321 uint32_t flags);
1322
1323/// Define a Square Node and add it to a Subgraph.
1324///
1325/// @param subgraph - a Subgraph object that will own the created Node.
1326/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
1327/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1328/// shape must match the shape of the input tensor.
1329/// @param flags - binary features of the Square Node. No supported flags are currently defined.
1330enum xnn_status xnn_define_square(
1331 xnn_subgraph_t subgraph,
1332 uint32_t input_id,
1333 uint32_t output_id,
1334 uint32_t flags);
1335
1336/// Define a Square Root Node and add it to a Subgraph.
1337///
1338/// @param subgraph - a Subgraph object that will own the created Node.
1339/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
1340/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
1341/// shape must match the shape of the input tensor.
1342/// @param flags - binary features of the Square Root Node. No supported flags are currently defined.
1343enum xnn_status xnn_define_square_root(
1344 xnn_subgraph_t subgraph,
1345 uint32_t input_id,
1346 uint32_t output_id,
1347 uint32_t flags);
1348
/// Define a Static Transpose Node and add it to a Subgraph.
///
/// The Static Transpose Node applies a generalized transpose to the input tensor using the permutation in perm.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_id - Value ID for the input tensor. The input tensor must be an N-dimensional tensor defined in
///                   the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be an N-dimensional tensor defined
///                    in the @a subgraph with each dimension equal to its corresponding permuted input dimension.
/// @param num_dims - the number of permutation dimensions. This must be equal to the number of input dimensions.
/// @param perm - The permutation of the axis of the input tensor. The perm array must contain 0 to N-1 in the
///               permuted order.
/// @param flags - binary features of the Static Transpose Node. No supported flags are currently defined.
enum xnn_status xnn_define_static_transpose(
  xnn_subgraph_t subgraph,
  size_t num_dims,
  const size_t* perm,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
1369
1370/// Weights cache is a cache for packed weights. It can be reused between runtimes.
1371typedef struct xnn_weights_cache* xnn_weights_cache_t;
1372
/// Create a weights cache object with a default initial size. See @ref xnn_create_weights_cache_with_size to specify
/// the initial capacity explicitly.
/// @param weights_cache_out - pointer to the variable that will be initialized to a handle to the weights cache object
///                            upon successful return. Once created, the weights cache object can be shared between
///                            different Runtime objects.
enum xnn_status xnn_create_weights_cache(xnn_weights_cache_t* weights_cache_out);
1374
/// Create a weights cache object specifying the initial size of weights cache (in bytes).
/// @param size - initial capacity of the weights cache (in bytes), i.e. it can hold size bytes without growing.
/// @param weights_cache_out - pointer to the variable that will be initialized to a handle to the weights cache object
///                            upon successful return. Once created, the weights cache object can be shared between
///                            different Runtime objects.
enum xnn_status xnn_create_weights_cache_with_size(size_t size, xnn_weights_cache_t* weights_cache_out);
1381
1382
/// Weights cache can be finalized in these ways:
enum xnn_weights_cache_finalization_kind {
  /// Weights cache is finalized, no insert operations into the weights cache is allowed, even if the "inserted"
  /// weights already exist in the cache. Weights cache memory will also be trimmed to page boundary and set to
  /// read-only (to prevent writes).
  xnn_weights_cache_finalization_kind_hard,
  /// Weights cache will be finalized with some extra space at the end, this allows for "inserting" into the cache only
  /// if the weights are already in the cache, and errors on inserting uncached weights. There is memory overhead.
  xnn_weights_cache_finalization_kind_soft,
};
1393
1394/// Finalizes the weights cache. The kind of finalization is specified by `finalization_kind`.
1395/// @param weights_cache - the weights cache object to finalize.
1396/// @param finalization_kind - the kind of finalization.
1397enum xnn_status xnn_finalize_weights_cache(
1398 xnn_weights_cache_t weights_cache,
1399 enum xnn_weights_cache_finalization_kind finalization_kind);
1400
1401/// Destroy a weights cache object, as well as memory used for the cache.
1402/// @param weights_cache - the weights cache object to destroy.
1403enum xnn_status xnn_delete_weights_cache(xnn_weights_cache_t weights_cache);
1404
1405typedef struct xnn_workspace* xnn_workspace_t;
1406
1407/// Create a workspace object.
1408/// @param workspace_out - pointer to the variable that will be initialized to a handle to the workspace object upon
1409/// successful return. Once created, the workspace can be shared between different Runtime
1410/// objects.
1411enum xnn_status xnn_create_workspace(xnn_workspace_t* workspace_out);
1412/// Destroy a workspace object, as well as memory used by the workspace. Object destruction can be deferred until all
1413/// Runtime objects created with this workspace are destroyed.
1414/// @param workspace - the workspace object to destroy.
1415enum xnn_status xnn_release_workspace(xnn_workspace_t workspace);
1416
1417/// Runtime is a combination of an execution plan for subgraph Nodes and a memory manager for subgraph Values.
1418typedef struct xnn_runtime* xnn_runtime_t;
1419
1420enum xnn_profile_info {
1421 /// Returns a size_t containing the number of operators.
1422 xnn_profile_info_num_operators,
1423 /// Returns a char[] containing the null character separated names of all operators.
1424 xnn_profile_info_operator_name,
1425 /// Returns a uint64_t[] with the runtimes of all operators in the same order as xnn_profile_info_operator_name.
1426 xnn_profile_info_operator_timing,
1427};
1428
1429/// Return profile information for all operators.
1430///
1431/// @param runtime - a Runtime object created with @ref xnn_create_runtime, @ref xnn_create_runtime_v2 or
1432/// @ref xnn_create_runtime_v3.
1433/// @param param_name - type of profile information required.
1434/// @param param_value_size - the size in bytes of memory pointed to by param_value. If this is not sufficient then
1435/// param_value_size_ret will be set to the required size and xnn_status_out_of_memory will be
1436/// returned.
1437/// @param param_value - a pointer to memory location where appropriate values for a given param_value will be written.
1438/// @param param_value_size_ret - returns number of bytes required to write the result if param_value_size is not
1439/// sufficient.
1440enum xnn_status xnn_get_runtime_profiling_info(xnn_runtime_t runtime,
1441 enum xnn_profile_info param_name,
1442 size_t param_value_size,
1443 void* param_value,
1444 size_t* param_value_size_ret);
1445
1446/// Create a Runtime object from a subgraph.
1447///
1448/// @param subgraph - a Subgraph object with all Values and Nodes that would be handled by the runtime. No Values or
1449/// Nodes can be added to the runtime once it is constructed.
1450/// @param weights_cache - a cache for packed weights. The runtime will look up and reuse packed weights in this cache,
1451/// this will reduce memory allocated for packed weights.
1452/// @param workspace - a workspace to hold internal tensors. The runtime will allocate space used for internal tensors
1453/// and track them using workspace. Workspace can be shared and reused across different runtimes. If
1454/// workspace is NULL, there will be no sharing: each runtime has its own workspace.
1455/// @param threadpool - the thread pool to be used for parallelisation of computations in the runtime. If the thread
1456/// pool is NULL, the computation would run on the caller thread without parallelization.
1457/// @param flags - binary features of the runtime. The only currently supported values are
1458/// XNN_FLAG_HINT_SPARSE_INFERENCE, XNN_FLAG_HINT_FP16_INFERENCE, XNN_FLAG_FORCE_FP16_INFERENCE, and
1459/// XNN_FLAG_YIELD_WORKERS. If XNN_FLAG_YIELD_WORKERS is specified, worker threads would be yielded to
1460/// the system scheduler after processing the last operator in the Runtime.
1461/// @param runtime_out - pointer to the variable that will be initialized with a handle to the Runtime object upon
1462/// successful return. Once constructed, the Runtime object is independent of the Subgraph object
1463/// used to create it.
1464enum xnn_status xnn_create_runtime_v4(
1465 xnn_subgraph_t subgraph,
1466 xnn_weights_cache_t weights_cache,
1467 xnn_workspace_t workspace,
1468 pthreadpool_t threadpool,
1469 uint32_t flags,
1470 xnn_runtime_t* runtime_out);
1471
/// Create a Runtime object from a subgraph, with an optional weights cache but without a shared workspace.
/// See @ref xnn_create_runtime_v4 for the description of the common parameters.
enum xnn_status xnn_create_runtime_v3(
  xnn_subgraph_t subgraph,
  xnn_weights_cache_t weights_cache,
  pthreadpool_t threadpool,
  uint32_t flags,
  xnn_runtime_t* runtime_out);
1478
/// Create a Runtime object from a subgraph, without a weights cache or a shared workspace.
/// See @ref xnn_create_runtime_v4 for the description of the common parameters.
enum xnn_status xnn_create_runtime_v2(
  xnn_subgraph_t subgraph,
  pthreadpool_t threadpool,
  uint32_t flags,
  xnn_runtime_t* runtime_out);
1484
/// Create a Runtime object from a subgraph. This variant takes no thread pool, weights cache, workspace, or flags;
/// presumably the defaults of @ref xnn_create_runtime_v4 apply — confirm against the implementation.
enum xnn_status xnn_create_runtime(
  xnn_subgraph_t subgraph,
  xnn_runtime_t* runtime_out);
1488
/// Location information for one external input or output tensor of a Runtime (see @ref xnn_setup_runtime).
struct xnn_external_value {
  /// Value ID of the external tensor, as assigned when the Value was defined in the Subgraph.
  uint32_t id;
  /// Pointer to the external tensor data.
  void* data;
};
1493
/// Setup data pointers for external inputs and outputs in a Runtime object.
///
/// @param runtime - a Runtime object created with @ref xnn_create_runtime, @ref xnn_create_runtime_v2,
///                  @ref xnn_create_runtime_v3, or @ref xnn_create_runtime_v4.
/// @param num_external_values - the number of external inputs and outputs specified in this call. This number must
///                              match the number of external inputs and outputs in the runtime, i.e. all external
///                              inputs and outputs in the runtime must be specified in one call.
/// @param external_values - array with location information for all external inputs and outputs in the runtime.
enum xnn_status xnn_setup_runtime(
  xnn_runtime_t runtime,
  size_t num_external_values,
  const struct xnn_external_value* external_values);
1505
1506/// Execute forward pass for all operators in the runtime.
1507///
1508/// @param runtime - the Runtime object with the execution plan to invoke.
1509enum xnn_status xnn_invoke_runtime(
1510 xnn_runtime_t runtime);
1511
1512/// Destroy a Runtime object, as well as operators and memory associated with it.
1513///
1514/// @param runtime - the Runtime object to destroy.
1515enum xnn_status xnn_delete_runtime(
1516 xnn_runtime_t runtime);
1517
/// Operator is an opaque handle to a single XNNPACK compute primitive.
typedef struct xnn_operator* xnn_operator_t;

/// Run a previously created and set-up operator.
///
/// @param op - the operator object to run.
/// @param threadpool - the thread pool to parallelize the computation over. NOTE(review): presumably a NULL thread
///                     pool runs the computation on the caller thread, matching the Runtime documentation above —
///                     confirm against the implementation.
enum xnn_status xnn_run_operator(
  xnn_operator_t op,
  pthreadpool_t threadpool);

/// Destroy an operator object, as well as memory associated with it.
///
/// @param op - the operator object to destroy.
enum xnn_status xnn_delete_operator(
  xnn_operator_t op);
1526
1527#ifndef XNN_NO_F32_OPERATORS
1528
1529enum xnn_status xnn_create_abs_nc_f32(
1530 size_t channels,
1531 size_t input_stride,
1532 size_t output_stride,
1533 uint32_t flags,
1534 xnn_operator_t* abs_op_out);
1535
1536enum xnn_status xnn_setup_abs_nc_f32(
1537 xnn_operator_t abs_op,
1538 size_t batch_size,
1539 const float* input,
1540 float* output,
1541 pthreadpool_t threadpool);
1542
1543enum xnn_status xnn_create_add_nd_f32(
1544 float output_min,
1545 float output_max,
1546 uint32_t flags,
1547 xnn_operator_t* add_op_out);
1548
1549enum xnn_status xnn_setup_add_nd_f32(
1550 xnn_operator_t add_op,
1551 size_t num_input1_dims,
1552 const size_t* input1_shape,
1553 size_t num_input2_dims,
1554 const size_t* input2_shape,
1555 const float* input1,
1556 const float* input2,
1557 float* output,
1558 pthreadpool_t threadpool);
1559
1560enum xnn_status xnn_run_add_nd_f32(
1561 size_t num_input1_dims,
1562 const size_t* input1_shape,
1563 size_t num_input2_dims,
1564 const size_t* input2_shape,
1565 const float* input1,
1566 const float* input2,
1567 float* output,
1568 float output_min,
1569 float output_max,
1570 uint32_t flags,
1571 pthreadpool_t threadpool);
1572
1573enum xnn_status xnn_create_argmax_pooling2d_nhwc_f32(
1574 uint32_t input_padding_top,
1575 uint32_t input_padding_right,
1576 uint32_t input_padding_bottom,
1577 uint32_t input_padding_left,
1578 uint32_t pooling_height,
1579 uint32_t pooling_width,
1580 size_t channels,
1581 size_t input_pixel_stride,
1582 size_t output_pixel_stride,
1583 uint32_t flags,
1584 xnn_operator_t* argmax_pooling_op_out);
1585
1586enum xnn_status xnn_setup_argmax_pooling2d_nhwc_f32(
1587 xnn_operator_t argmax_pooling_op,
1588 size_t batch_size,
1589 size_t input_height,
1590 size_t input_width,
1591 const float* input,
1592 float* output,
1593 uint32_t* index,
1594 pthreadpool_t threadpool);
1595
1596enum xnn_status xnn_create_average_pooling2d_nhwc_f32(
1597 uint32_t input_padding_top,
1598 uint32_t input_padding_right,
1599 uint32_t input_padding_bottom,
1600 uint32_t input_padding_left,
1601 uint32_t pooling_height,
1602 uint32_t pooling_width,
1603 uint32_t stride_height,
1604 uint32_t stride_width,
1605 size_t channels,
1606 size_t input_pixel_stride,
1607 size_t output_pixel_stride,
1608 float output_min,
1609 float output_max,
1610 uint32_t flags,
1611 xnn_operator_t* average_pooling_op_out);
1612
1613enum xnn_status xnn_setup_average_pooling2d_nhwc_f32(
1614 xnn_operator_t average_pooling_op,
1615 size_t batch_size,
1616 size_t input_height,
1617 size_t input_width,
1618 const float* input,
1619 float* output,
1620 pthreadpool_t threadpool);
1621
1622enum xnn_status xnn_create_bankers_rounding_nc_f32(
1623 size_t channels,
1624 size_t input_stride,
1625 size_t output_stride,
1626 uint32_t flags,
1627 xnn_operator_t* rounding_op_out);
1628
1629enum xnn_status xnn_setup_bankers_rounding_nc_f32(
1630 xnn_operator_t rounding_op,
1631 size_t batch_size,
1632 const float* input,
1633 float* output,
1634 pthreadpool_t threadpool);
1635
1636enum xnn_status xnn_create_ceiling_nc_f32(
1637 size_t channels,
1638 size_t input_stride,
1639 size_t output_stride,
1640 uint32_t flags,
1641 xnn_operator_t* ceiling_op_out);
1642
1643enum xnn_status xnn_setup_ceiling_nc_f32(
1644 xnn_operator_t ceiling_op,
1645 size_t batch_size,
1646 const float* input,
1647 float* output,
1648 pthreadpool_t threadpool);
1649
1650enum xnn_status xnn_create_clamp_nc_f32(
1651 size_t channels,
1652 size_t input_stride,
1653 size_t output_stride,
1654 float output_min,
1655 float output_max,
1656 uint32_t flags,
1657 xnn_operator_t* clamp_op_out);
1658
1659enum xnn_status xnn_setup_clamp_nc_f32(
1660 xnn_operator_t clamp_op,
1661 size_t batch_size,
1662 const float* input,
1663 float* output,
1664 pthreadpool_t threadpool);
1665
/// Opaque handle to a set of caches passed to operator creation functions (see the `caches` parameters below);
/// the cache contents are defined elsewhere in the project.
typedef const struct xnn_caches* xnn_caches_t;
1667
1668enum xnn_status xnn_create_convolution2d_nhwc_f32(
1669 uint32_t input_padding_top,
1670 uint32_t input_padding_right,
1671 uint32_t input_padding_bottom,
1672 uint32_t input_padding_left,
1673 uint32_t kernel_height,
1674 uint32_t kernel_width,
1675 uint32_t subsampling_height,
1676 uint32_t subsampling_width,
1677 uint32_t dilation_height,
1678 uint32_t dilation_width,
1679 uint32_t groups,
1680 size_t group_input_channels,
1681 size_t group_output_channels,
1682 size_t input_channel_stride,
1683 size_t output_channel_stride,
1684 const float* kernel,
1685 const float* bias,
1686 float output_min,
1687 float output_max,
1688 uint32_t flags,
1689 xnn_caches_t caches,
1690 xnn_operator_t* convolution_op_out);
1691
1692// Forward declare.
1693struct xnn_post_operation;
1694
1695/// Create a convolution operator with a number of post operations. The
1696/// convolution operator created using this function does not have output_min
1697/// and output_max. The list of operators in post_operations will be applied in
1698/// order. Convolution with post operations is only supported on JIT platforms
1699/// and when JIT is enabled.
1700enum xnn_status xnn_create_fused_convolution2d_nhwc_f32(
1701 uint32_t input_padding_top,
1702 uint32_t input_padding_right,
1703 uint32_t input_padding_bottom,
1704 uint32_t input_padding_left,
1705 uint32_t kernel_height,
1706 uint32_t kernel_width,
1707 uint32_t subsampling_height,
1708 uint32_t subsampling_width,
1709 uint32_t dilation_height,
1710 uint32_t dilation_width,
1711 uint32_t groups,
1712 size_t group_input_channels,
1713 size_t group_output_channels,
1714 size_t input_channel_stride,
1715 size_t output_channel_stride,
1716 const float* kernel,
1717 const float* bias,
1718 size_t num_post_operations,
1719 struct xnn_post_operation* post_operations,
1720 uint32_t flags,
1721 xnn_caches_t caches,
1722 xnn_operator_t* convolution_op_out);
1723
1724enum xnn_status xnn_setup_convolution2d_nhwc_f32(
1725 xnn_operator_t convolution_op,
1726 size_t batch_size,
1727 size_t input_height,
1728 size_t input_width,
1729 const float* input,
1730 float* output,
1731 pthreadpool_t threadpool);
1732
1733enum xnn_status xnn_create_deconvolution2d_nhwc_f32(
1734 uint32_t output_padding_top,
1735 uint32_t output_padding_right,
1736 uint32_t output_padding_bottom,
1737 uint32_t output_padding_left,
1738 uint32_t kernel_height,
1739 uint32_t kernel_width,
1740 uint32_t stride_height,
1741 uint32_t stride_width,
1742 uint32_t dilation_height,
1743 uint32_t dilation_width,
1744 uint32_t groups,
1745 size_t group_input_channels,
1746 size_t group_output_channels,
1747 size_t input_pixel_stride,
1748 size_t output_pixel_stride,
1749 const float* kernel,
1750 const float* bias,
1751 float output_min,
1752 float output_max,
1753 uint32_t flags,
1754 xnn_caches_t caches,
1755 xnn_operator_t* deconvolution_op_out);
1756
1757enum xnn_status xnn_setup_deconvolution2d_nhwc_f32(
1758 xnn_operator_t deconvolution_op,
1759 size_t batch_size,
1760 size_t input_height,
1761 size_t input_width,
1762 uint32_t adjustment_height,
1763 uint32_t adjustment_width,
1764 const float* input,
1765 float* output,
1766 pthreadpool_t threadpool);
1767
1768enum xnn_status xnn_create_divide_nd_f32(
1769 float output_min,
1770 float output_max,
1771 uint32_t flags,
1772 xnn_operator_t* divide_op_out);
1773
1774enum xnn_status xnn_setup_divide_nd_f32(
1775 xnn_operator_t divide_op,
1776 size_t num_input1_dims,
1777 const size_t* input1_shape,
1778 size_t num_input2_dims,
1779 const size_t* input2_shape,
1780 const float* input1,
1781 const float* input2,
1782 float* output,
1783 pthreadpool_t threadpool);
1784
1785enum xnn_status xnn_create_elu_nc_f32(
1786 size_t channels,
1787 size_t input_stride,
1788 size_t output_stride,
1789 float alpha,
1790 uint32_t flags,
1791 xnn_operator_t* elu_op_out);
1792
1793enum xnn_status xnn_setup_elu_nc_f32(
1794 xnn_operator_t elu_op,
1795 size_t batch_size,
1796 const float* input,
1797 float* output,
1798 pthreadpool_t threadpool);
1799
1800enum xnn_status xnn_create_floor_nc_f32(
1801 size_t channels,
1802 size_t input_stride,
1803 size_t output_stride,
1804 uint32_t flags,
1805 xnn_operator_t* floor_op_out);
1806
1807enum xnn_status xnn_setup_floor_nc_f32(
1808 xnn_operator_t floor_op,
1809 size_t batch_size,
1810 const float* input,
1811 float* output,
1812 pthreadpool_t threadpool);
1813
1814enum xnn_status xnn_create_fully_connected_nc_f32(
1815 size_t input_channels,
1816 size_t output_channels,
1817 size_t input_stride,
1818 size_t output_stride,
1819 const float* kernel,
1820 const float* bias,
1821 float output_min,
1822 float output_max,
1823 uint32_t flags,
1824 const xnn_caches_t caches,
1825 xnn_operator_t* fully_connected_op_out);
1826
/// Set up a Fully Connected operator (NC layout, F32) with a batch size and input/output pointers.
enum xnn_status xnn_setup_fully_connected_nc_f32(
  xnn_operator_t fully_connected_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a Global Average Pooling operator with NWC layout and F32 data type.
enum xnn_status xnn_create_global_average_pooling_nwc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* global_average_pooling_op_out);

/// Set up a Global Average Pooling operator (NWC, F32) with a batch size, width, and data pointers.
enum xnn_status xnn_setup_global_average_pooling_nwc_f32(
  xnn_operator_t global_average_pooling_op,
  size_t batch_size,
  size_t width,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a HardSwish operator with NC layout and F32 data type.
enum xnn_status xnn_create_hardswish_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* hardswish_op_out);

/// Set up a HardSwish operator (NC, F32) with a batch size and data pointers.
enum xnn_status xnn_setup_hardswish_nc_f32(
  xnn_operator_t hardswish_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a Leaky ReLU operator with NC layout and F32 data type.
enum xnn_status xnn_create_leaky_relu_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float negative_slope,
  uint32_t flags,
  xnn_operator_t* leaky_relu_op_out);

/// Set up a Leaky ReLU operator (NC, F32) with a batch size and data pointers.
enum xnn_status xnn_setup_leaky_relu_nc_f32(
  xnn_operator_t leaky_relu_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a 2D Max Pooling operator with NHWC layout and F32 data type.
enum xnn_status xnn_create_max_pooling2d_nhwc_f32(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  uint32_t stride_height,
  uint32_t stride_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* max_pooling_op_out);

/// Set up a 2D Max Pooling operator (NHWC, F32) with a batch size, input spatial size, and data pointers.
enum xnn_status xnn_setup_max_pooling2d_nhwc_f32(
  xnn_operator_t max_pooling_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional elementwise Maximum operator for F32 data.
enum xnn_status xnn_create_maximum_nd_f32(
  uint32_t flags,
  xnn_operator_t* maximum_op_out);

/// Set up an N-dimensional Maximum operator (F32) with two input shapes and data pointers.
enum xnn_status xnn_setup_maximum_nd_f32(
  xnn_operator_t maximum_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const float* input1,
  const float* input2,
  float* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional elementwise Minimum operator for F32 data.
enum xnn_status xnn_create_minimum_nd_f32(
  uint32_t flags,
  xnn_operator_t* minimum_op_out);

/// Set up an N-dimensional Minimum operator (F32) with two input shapes and data pointers.
enum xnn_status xnn_setup_minimum_nd_f32(
  xnn_operator_t minimum_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const float* input1,
  const float* input2,
  float* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional elementwise Multiply operator for F32 data, with output clamping.
enum xnn_status xnn_create_multiply_nd_f32(
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* multiply_op_out);

/// Set up an N-dimensional Multiply operator (F32) with two input shapes and data pointers.
enum xnn_status xnn_setup_multiply_nd_f32(
  xnn_operator_t multiply_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const float* input1,
  const float* input2,
  float* output,
  pthreadpool_t threadpool);

/// Create a Negate operator with NC layout and F32 data type.
enum xnn_status xnn_create_negate_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* negate_op_out);

/// Set up a Negate operator (NC, F32) with a batch size and data pointers.
enum xnn_status xnn_setup_negate_nc_f32(
  xnn_operator_t negate_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a PReLU operator with NC layout and F32 data type; negative_slope is a per-channel array.
enum xnn_status xnn_create_prelu_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  const float* negative_slope,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* prelu_op_out);

/// Set up a PReLU operator (NC, F32) with a batch size and data pointers.
enum xnn_status xnn_setup_prelu_nc_f32(
  xnn_operator_t prelu_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a 2D Bilinear Resize operator with NCHW layout and F32 data type.
enum xnn_status xnn_create_resize_bilinear2d_nchw_f32(
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  uint32_t flags,
  xnn_operator_t* resize_op_out);

/// Set up a 2D Bilinear Resize operator (NCHW, F32) with input/output spatial sizes and data pointers.
enum xnn_status xnn_setup_resize_bilinear2d_nchw_f32(
  xnn_operator_t resize_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a 2D Bilinear Resize operator with NHWC layout and F32 data type.
enum xnn_status xnn_create_resize_bilinear2d_nhwc_f32(
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  uint32_t flags,
  xnn_operator_t* resize_op_out);

/// Set up a 2D Bilinear Resize operator (NHWC, F32) with input/output spatial sizes and data pointers.
enum xnn_status xnn_setup_resize_bilinear2d_nhwc_f32(
  xnn_operator_t resize_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a Sigmoid operator with NC layout and F32 data type.
enum xnn_status xnn_create_sigmoid_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* sigmoid_op_out);

/// Set up a Sigmoid operator (NC, F32) with a batch size and data pointers.
enum xnn_status xnn_setup_sigmoid_nc_f32(
  xnn_operator_t sigmoid_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a Softmax operator with NC layout and F32 data type.
enum xnn_status xnn_create_softmax_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* softmax_op_out);

/// Set up a Softmax operator (NC, F32) with a batch size and data pointers.
enum xnn_status xnn_setup_softmax_nc_f32(
  xnn_operator_t softmax_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a Square operator with NC layout and F32 data type.
enum xnn_status xnn_create_square_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* square_op_out);

/// Set up a Square operator (NC, F32) with a batch size and data pointers.
enum xnn_status xnn_setup_square_nc_f32(
  xnn_operator_t square_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a Square Root operator with NC layout and F32 data type.
enum xnn_status xnn_create_square_root_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* sqrt_op_out);

/// Set up a Square Root operator (NC, F32) with a batch size and data pointers.
enum xnn_status xnn_setup_square_root_nc_f32(
  xnn_operator_t sqrt_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional Squared Difference operator for F32 data.
enum xnn_status xnn_create_squared_difference_nd_f32(
  uint32_t flags,
  xnn_operator_t* squared_difference_op_out);

/// Set up an N-dimensional Squared Difference operator (F32) with two input shapes and data pointers.
enum xnn_status xnn_setup_squared_difference_nd_f32(
  xnn_operator_t squared_difference_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const float* input1,
  const float* input2,
  float* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional elementwise Subtract operator for F32 data, with output clamping.
enum xnn_status xnn_create_subtract_nd_f32(
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* subtract_op_out);

/// Set up an N-dimensional Subtract operator (F32) with two input shapes and data pointers.
enum xnn_status xnn_setup_subtract_nd_f32(
  xnn_operator_t subtract_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const float* input1,
  const float* input2,
  float* output,
  pthreadpool_t threadpool);

/// Create a Truncation (round-toward-zero) operator with NC layout and F32 data type.
enum xnn_status xnn_create_truncation_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* truncation_op_out);

/// Set up a Truncation operator (NC, F32) with a batch size and data pointers.
enum xnn_status xnn_setup_truncation_nc_f32(
  xnn_operator_t truncation_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);
2122
2123#ifndef XNN_NO_NCHW_OPERATORS
2124
/// Create a 2D Convolution operator with NCHW layout and F32 data type.
enum xnn_status xnn_create_convolution2d_nchw_f32(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t subsampling_height,
  uint32_t subsampling_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  const float* kernel,
  const float* bias,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* convolution_op_out);

/// Set up a 2D Convolution operator (NCHW, F32) with a batch size, input spatial size, and data pointers.
enum xnn_status xnn_setup_convolution2d_nchw_f32(
  xnn_operator_t convolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a Global Average Pooling operator with NCW layout and F32 data type.
enum xnn_status xnn_create_global_average_pooling_ncw_f32(
  size_t channels,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* global_average_pooling_op_out);

/// Set up a Global Average Pooling operator (NCW, F32) with a batch size, width, and data pointers.
enum xnn_status xnn_setup_global_average_pooling_ncw_f32(
  xnn_operator_t global_average_pooling_op,
  size_t batch_size,
  size_t width,
  const float* input,
  float* output,
  pthreadpool_t threadpool);
2172
2173#endif // XNN_NO_NCHW_OPERATORS
2174
2175#endif // XNN_NO_F32_OPERATORS
2176
2177#ifndef XNN_NO_X32_OPERATORS
2178
/// Create a Channel Shuffle operator with NC layout, operating on 32-bit elements of any type (X32).
enum xnn_status xnn_create_channel_shuffle_nc_x32(
  size_t groups,
  size_t group_channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* channel_shuffle_op_out);

/// Set up a Channel Shuffle operator (NC, X32) with a batch size and data pointers.
enum xnn_status xnn_setup_channel_shuffle_nc_x32(
  xnn_operator_t channel_shuffle_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional Constant Pad operator for 32-bit data; padding_value points to the fill element.
enum xnn_status xnn_create_constant_pad_nd_x32(
  const void* padding_value,
  uint32_t flags,
  xnn_operator_t* constant_pad_op_out);

/// Set up a Constant Pad operator (ND, X32) with an input shape, per-dimension pre/post padding, and data pointers.
enum xnn_status xnn_setup_constant_pad_nd_x32(
  xnn_operator_t constant_pad_op,
  size_t num_dims,
  const size_t* input_shape,
  const size_t* pre_padding,
  const size_t* post_padding,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Copy operator with NC layout, operating on 32-bit elements of any type (X32).
enum xnn_status xnn_create_copy_nc_x32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* copy_op_out);

/// Set up a Copy operator (NC, X32) with a batch size and data pointers.
enum xnn_status xnn_setup_copy_nc_x32(
  xnn_operator_t copy_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Depth-to-Space operator with NHWC layout for 32-bit data.
enum xnn_status xnn_create_depth_to_space_nhwc_x32(
  size_t output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  uint32_t block_size,
  uint32_t flags,
  xnn_operator_t* depth_to_space_op_out);

/// Set up a Depth-to-Space operator (NHWC, X32) with a batch size, input spatial size, and data pointers.
enum xnn_status xnn_setup_depth_to_space_nhwc_x32(
  xnn_operator_t depth_to_space_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Depth-to-Space operator converting NCHW input to NHWC output, for 32-bit data.
enum xnn_status xnn_create_depth_to_space_nchw2nhwc_x32(
  size_t output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  uint32_t block_size,
  uint32_t flags,
  xnn_operator_t* depth_to_space_op_out);

/// Set up a Depth-to-Space (NCHW-to-NHWC, X32) operator with a batch size, input spatial size, and data pointers.
enum xnn_status xnn_setup_depth_to_space_nchw2nhwc_x32(
  xnn_operator_t depth_to_space_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Space-to-Depth operator with NHWC layout for 32-bit data.
enum xnn_status xnn_create_space_to_depth_nhwc_x32(
  size_t input_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  uint32_t block_size,
  uint32_t flags,
  xnn_operator_t* space_to_depth_op_out);

/// Set up a Space-to-Depth operator (NHWC, X32) with a batch size, input spatial size, and data pointers.
enum xnn_status xnn_setup_space_to_depth_nhwc_x32(
  xnn_operator_t space_to_depth_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional Transpose operator for 32-bit data.
enum xnn_status xnn_create_transpose_nd_x32(
  uint32_t flags,
  xnn_operator_t* transpose_op_out);

/// Set up a Transpose operator (ND, X32) with an input shape, output permutation, and data pointers.
enum xnn_status xnn_setup_transpose_nd_x32(
  xnn_operator_t transpose_op,
  const void* input,
  void* output,
  const size_t num_dims,
  const size_t* input_shape,
  const size_t* output_perm,
  pthreadpool_t threadpool);

/// Transpose N-dimensional 32-bit data in a single call (no separate create/setup of an operator object).
enum xnn_status xnn_run_transpose_nd_x32(
  const void* input,
  void* output,
  const size_t num_dims,
  const size_t* input_shape,
  const size_t* output_perm,
  uint32_t flags,
  pthreadpool_t threadpool);

/// Create a 2D Unpooling operator with NHWC layout for 32-bit data.
enum xnn_status xnn_create_unpooling2d_nhwc_x32(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  uint32_t flags,
  xnn_operator_t* unpooling_op_out);

/// Set up a 2D Unpooling operator (NHWC, X32); index holds the per-element pooling indices.
enum xnn_status xnn_setup_unpooling2d_nhwc_x32(
  xnn_operator_t unpooling_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  const uint32_t* index,
  void* output,
  pthreadpool_t threadpool);
2318
2319#endif // XNN_NO_X32_OPERATORS
2320
2321#ifndef XNN_NO_F16_OPERATORS
2322
/// Create an Abs operator with NC layout and F16 data type (half-precision values passed via void pointers).
enum xnn_status xnn_create_abs_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* abs_op_out);

/// Set up an Abs operator (NC, F16) with a batch size and data pointers.
enum xnn_status xnn_setup_abs_nc_f16(
  xnn_operator_t abs_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional elementwise Add operator for F16 data, with output clamping.
enum xnn_status xnn_create_add_nd_f16(
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* add_op_out);

/// Set up an N-dimensional Add operator (F16) with two input shapes and data pointers.
enum xnn_status xnn_setup_add_nd_f16(
  xnn_operator_t add_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const void* input1,
  const void* input2,
  void* output,
  pthreadpool_t threadpool);

/// Create a 2D Average Pooling operator with NHWC layout and F16 data type.
enum xnn_status xnn_create_average_pooling2d_nhwc_f16(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  uint32_t stride_height,
  uint32_t stride_width,
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* average_pooling_op_out);

/// Set up a 2D Average Pooling operator (NHWC, F16) with a batch size, input spatial size, and data pointers.
enum xnn_status xnn_setup_average_pooling2d_nhwc_f16(
  xnn_operator_t average_pooling_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Bankers' Rounding (round-half-to-even) operator with NC layout and F16 data type.
enum xnn_status xnn_create_bankers_rounding_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* rounding_op_out);

/// Set up a Bankers' Rounding operator (NC, F16) with a batch size and data pointers.
enum xnn_status xnn_setup_bankers_rounding_nc_f16(
  xnn_operator_t rounding_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Ceiling operator with NC layout and F16 data type.
enum xnn_status xnn_create_ceiling_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* ceiling_op_out);

/// Set up a Ceiling operator (NC, F16) with a batch size and data pointers.
enum xnn_status xnn_setup_ceiling_nc_f16(
  xnn_operator_t ceiling_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Clamp operator with NC layout and F16 data type; clamps each element to [output_min, output_max].
enum xnn_status xnn_create_clamp_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* clamp_op_out);

/// Set up a Clamp operator (NC, F16) with a batch size and data pointers.
enum xnn_status xnn_setup_clamp_nc_f16(
  xnn_operator_t clamp_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a 2D Convolution operator with NHWC layout and F16 data type.
enum xnn_status xnn_create_convolution2d_nhwc_f16(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t subsampling_height,
  uint32_t subsampling_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  const void* kernel,
  const void* bias,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* convolution_op_out);

/// Set up a 2D Convolution operator (NHWC, F16) with a batch size, input spatial size, and data pointers.
enum xnn_status xnn_setup_convolution2d_nhwc_f16(
  xnn_operator_t convolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a 2D Deconvolution (transposed convolution) operator with NHWC layout and F16 data type.
enum xnn_status xnn_create_deconvolution2d_nhwc_f16(
  uint32_t output_padding_top,
  uint32_t output_padding_right,
  uint32_t output_padding_bottom,
  uint32_t output_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t stride_height,
  uint32_t stride_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  const void* kernel,
  const void* bias,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* deconvolution_op_out);

/// Set up a 2D Deconvolution operator (NHWC, F16) with a batch size, input size, output adjustments, and data pointers.
enum xnn_status xnn_setup_deconvolution2d_nhwc_f16(
  xnn_operator_t deconvolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  uint32_t adjustment_height,
  uint32_t adjustment_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional elementwise Divide operator for F16 data, with output clamping.
enum xnn_status xnn_create_divide_nd_f16(
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* divide_op_out);

/// Set up an N-dimensional Divide operator (F16) with two input shapes and data pointers.
enum xnn_status xnn_setup_divide_nd_f16(
  xnn_operator_t divide_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const void* input1,
  const void* input2,
  void* output,
  pthreadpool_t threadpool);

/// Create an ELU operator with NC layout and F16 data type; alpha scales the negative branch.
enum xnn_status xnn_create_elu_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float alpha,
  uint32_t flags,
  xnn_operator_t* elu_op_out);

/// Set up an ELU operator (NC, F16) with a batch size and data pointers.
enum xnn_status xnn_setup_elu_nc_f16(
  xnn_operator_t elu_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Floor operator with NC layout and F16 data type.
enum xnn_status xnn_create_floor_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* floor_op_out);

/// Set up a Floor operator (NC, F16) with a batch size and data pointers.
enum xnn_status xnn_setup_floor_nc_f16(
  xnn_operator_t floor_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Fully Connected operator with NC layout and F16 data type.
enum xnn_status xnn_create_fully_connected_nc_f16(
  size_t input_channels,
  size_t output_channels,
  size_t input_stride,
  size_t output_stride,
  const void* kernel,
  const void* bias,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* fully_connected_op_out);

/// Set up a Fully Connected operator (NC, F16) with a batch size and data pointers.
enum xnn_status xnn_setup_fully_connected_nc_f16(
  xnn_operator_t fully_connected_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Global Average Pooling operator with NWC layout and F16 data type.
enum xnn_status xnn_create_global_average_pooling_nwc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* global_average_pooling_op_out);

/// Set up a Global Average Pooling operator (NWC, F16) with a batch size, width, and data pointers.
enum xnn_status xnn_setup_global_average_pooling_nwc_f16(
  xnn_operator_t global_average_pooling_op,
  size_t batch_size,
  size_t width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a HardSwish operator with NC layout and F16 data type.
enum xnn_status xnn_create_hardswish_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* hardswish_op_out);

/// Set up a HardSwish operator (NC, F16) with a batch size and data pointers.
enum xnn_status xnn_setup_hardswish_nc_f16(
  xnn_operator_t hardswish_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Leaky ReLU operator with NC layout and F16 data type.
enum xnn_status xnn_create_leaky_relu_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float negative_slope,
  uint32_t flags,
  xnn_operator_t* leaky_relu_op_out);

/// Set up a Leaky ReLU operator (NC, F16) with a batch size and data pointers.
enum xnn_status xnn_setup_leaky_relu_nc_f16(
  xnn_operator_t leaky_relu_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a 2D Max Pooling operator with NHWC layout and F16 data type.
enum xnn_status xnn_create_max_pooling2d_nhwc_f16(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  uint32_t stride_height,
  uint32_t stride_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* max_pooling_op_out);

/// Set up a 2D Max Pooling operator (NHWC, F16) with a batch size, input spatial size, and data pointers.
enum xnn_status xnn_setup_max_pooling2d_nhwc_f16(
  xnn_operator_t max_pooling_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional elementwise Maximum operator for F16 data.
enum xnn_status xnn_create_maximum_nd_f16(
  uint32_t flags,
  xnn_operator_t* maximum_op_out);

/// Set up an N-dimensional Maximum operator (F16) with two input shapes and data pointers.
enum xnn_status xnn_setup_maximum_nd_f16(
  xnn_operator_t maximum_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const void* input1,
  const void* input2,
  void* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional elementwise Minimum operator for F16 data.
enum xnn_status xnn_create_minimum_nd_f16(
  uint32_t flags,
  xnn_operator_t* minimum_op_out);

/// Set up an N-dimensional Minimum operator (F16) with two input shapes and data pointers.
enum xnn_status xnn_setup_minimum_nd_f16(
  xnn_operator_t minimum_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const void* input1,
  const void* input2,
  void* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional elementwise Multiply operator for F16 data, with output clamping.
enum xnn_status xnn_create_multiply_nd_f16(
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* multiply_op_out);

/// Set up an N-dimensional Multiply operator (F16) with two input shapes and data pointers.
enum xnn_status xnn_setup_multiply_nd_f16(
  xnn_operator_t multiply_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const void* input1,
  const void* input2,
  void* output,
  pthreadpool_t threadpool);

/// Create a Negate operator with NC layout and F16 data type.
enum xnn_status xnn_create_negate_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* negate_op_out);

/// Set up a Negate operator (NC, F16) with a batch size and data pointers.
enum xnn_status xnn_setup_negate_nc_f16(
  xnn_operator_t negate_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a PReLU operator with NC layout and F16 data type; negative_slope is a per-channel array.
enum xnn_status xnn_create_prelu_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  const void* negative_slope,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* prelu_op_out);

/// Set up a PReLU operator (NC, F16) with a batch size and data pointers.
enum xnn_status xnn_setup_prelu_nc_f16(
  xnn_operator_t prelu_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a 2D Bilinear Resize operator with NHWC layout and F16 data type.
enum xnn_status xnn_create_resize_bilinear2d_nhwc_f16(
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  uint32_t flags,
  xnn_operator_t* resize_op_out);

/// Set up a 2D Bilinear Resize operator (NHWC, F16) with input/output spatial sizes and data pointers.
enum xnn_status xnn_setup_resize_bilinear2d_nhwc_f16(
  xnn_operator_t resize_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Sigmoid operator with NC layout and F16 data type.
enum xnn_status xnn_create_sigmoid_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* sigmoid_op_out);

/// Set up a Sigmoid operator (NC, F16) with a batch size and data pointers.
enum xnn_status xnn_setup_sigmoid_nc_f16(
  xnn_operator_t sigmoid_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Softmax operator with NC layout and F16 data type.
enum xnn_status xnn_create_softmax_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* softmax_op_out);

/// Set up a Softmax operator (NC, F16) with a batch size and data pointers.
enum xnn_status xnn_setup_softmax_nc_f16(
  xnn_operator_t softmax_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Square operator with NC layout and F16 data type.
enum xnn_status xnn_create_square_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* square_op_out);

/// Set up a Square operator (NC, F16) with a batch size and data pointers.
enum xnn_status xnn_setup_square_nc_f16(
  xnn_operator_t square_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Square Root operator with NC layout and F16 data type.
enum xnn_status xnn_create_square_root_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* sqrt_op_out);

/// Set up a Square Root operator (NC, F16) with a batch size and data pointers.
enum xnn_status xnn_setup_square_root_nc_f16(
  xnn_operator_t sqrt_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional Squared Difference operator for F16 data.
enum xnn_status xnn_create_squared_difference_nd_f16(
  uint32_t flags,
  xnn_operator_t* squared_difference_op_out);

/// Set up an N-dimensional Squared Difference operator (F16) with two input shapes and data pointers.
enum xnn_status xnn_setup_squared_difference_nd_f16(
  xnn_operator_t squared_difference_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const void* input1,
  const void* input2,
  void* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional elementwise Subtract operator for F16 data, with output clamping.
enum xnn_status xnn_create_subtract_nd_f16(
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* subtract_op_out);

/// Set up an N-dimensional Subtract operator (F16) with two input shapes and data pointers.
enum xnn_status xnn_setup_subtract_nd_f16(
  xnn_operator_t subtract_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const void* input1,
  const void* input2,
  void* output,
  pthreadpool_t threadpool);

/// Create a Truncation (round-toward-zero) operator with NC layout and F16 data type.
enum xnn_status xnn_create_truncation_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* truncation_op_out);

/// Set up a Truncation operator (NC, F16) with a batch size and data pointers.
enum xnn_status xnn_setup_truncation_nc_f16(
  xnn_operator_t truncation_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);
2828
2829#endif // XNN_NO_F16_OPERATORS
2830
2831#ifndef XNN_NO_X16_OPERATORS
2832
/// Create an N-dimensional Constant Pad operator for 16-bit data (X16); padding_value points to the fill element.
enum xnn_status xnn_create_constant_pad_nd_x16(
  const void* padding_value,
  uint32_t flags,
  xnn_operator_t* constant_pad_op_out);

/// Set up a Constant Pad operator (ND, X16) with an input shape, per-dimension pre/post padding, and data pointers.
enum xnn_status xnn_setup_constant_pad_nd_x16(
  xnn_operator_t constant_pad_op,
  size_t num_dims,
  const size_t* input_shape,
  const size_t* pre_padding,
  const size_t* post_padding,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Copy operator with NC layout, operating on 16-bit elements of any type (X16).
enum xnn_status xnn_create_copy_nc_x16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* copy_op_out);

/// Set up a Copy operator (NC, X16) with a batch size and data pointers.
enum xnn_status xnn_setup_copy_nc_x16(
  xnn_operator_t copy_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Depth-to-Space operator with NHWC layout for 16-bit data.
enum xnn_status xnn_create_depth_to_space_nhwc_x16(
  size_t output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  uint32_t block_size,
  uint32_t flags,
  xnn_operator_t* depth_to_space_op_out);

/// Set up a Depth-to-Space operator (NHWC, X16) with a batch size, input spatial size, and data pointers.
enum xnn_status xnn_setup_depth_to_space_nhwc_x16(
  xnn_operator_t depth_to_space_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Space-to-Depth operator with NHWC layout for 16-bit data.
enum xnn_status xnn_create_space_to_depth_nhwc_x16(
  size_t input_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  uint32_t block_size,
  uint32_t flags,
  xnn_operator_t* space_to_depth_op_out);

/// Set up a Space-to-Depth operator (NHWC, X16) with a batch size, input spatial size, and data pointers.
enum xnn_status xnn_setup_space_to_depth_nhwc_x16(
  xnn_operator_t space_to_depth_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional Transpose operator for 16-bit data.
enum xnn_status xnn_create_transpose_nd_x16(
  uint32_t flags,
  xnn_operator_t* transpose_op_out);

/// Set up a Transpose operator (ND, X16) with an input shape, output permutation, and data pointers.
enum xnn_status xnn_setup_transpose_nd_x16(
  xnn_operator_t transpose_op,
  const void* input,
  void* output,
  const size_t num_dims,
  const size_t* input_shape,
  const size_t* output_perm,
  pthreadpool_t threadpool);

/// Transpose N-dimensional 16-bit data in a single call (no separate create/setup of an operator object).
enum xnn_status xnn_run_transpose_nd_x16(
  const void* input,
  void* output,
  const size_t num_dims,
  const size_t* input_shape,
  const size_t* output_perm,
  uint32_t flags,
  pthreadpool_t threadpool);
2917
2918#endif // XNN_NO_X16_OPERATORS
2919
2920#ifndef XNN_NO_QC8_OPERATORS
2921
/// Create a 2D Convolution operator with NHWC layout and per-channel-quantized int8 (QC8) data;
/// kernel_scale points to a per-output-channel array of scales.
enum xnn_status xnn_create_convolution2d_nhwc_qc8(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t subsampling_height,
  uint32_t subsampling_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  int8_t input_zero_point,
  float input_scale,
  const float* kernel_scale,
  const int8_t* kernel,
  const int32_t* bias,
  int8_t output_zero_point,
  float output_scale,
  int8_t output_min,
  int8_t output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* convolution_op_out);

/// Set up a 2D Convolution operator (NHWC, QC8) with a batch size, input spatial size, and data pointers.
enum xnn_status xnn_setup_convolution2d_nhwc_qc8(
  xnn_operator_t convolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const int8_t* input,
  int8_t* output,
  pthreadpool_t threadpool);
2959
2960#endif // XNN_NO_QC8_OPERATORS
2961
2962#ifndef XNN_NO_QS8_OPERATORS
2963
/// Create an N-dimensional elementwise Add operator for signed 8-bit quantized (QS8) data,
/// with per-tensor zero points/scales and int8 output clamping.
enum xnn_status xnn_create_add_nd_qs8(
  int8_t input1_zero_point,
  float input1_scale,
  int8_t input2_zero_point,
  float input2_scale,
  int8_t output_zero_point,
  float output_scale,
  int8_t output_min,
  int8_t output_max,
  uint32_t flags,
  xnn_operator_t* add_op_out);

/// Set up an N-dimensional Add operator (QS8) with two input shapes and data pointers.
enum xnn_status xnn_setup_add_nd_qs8(
  xnn_operator_t add_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const int8_t* input1,
  const int8_t* input2,
  int8_t* output,
  pthreadpool_t threadpool);

/// Create a 2D Convolution operator with NHWC layout and QS8 data; kernel_scale is a single
/// per-tensor scale (contrast with the per-channel array in the QC8 variant).
enum xnn_status xnn_create_convolution2d_nhwc_qs8(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t subsampling_height,
  uint32_t subsampling_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  int8_t input_zero_point,
  float input_scale,
  float kernel_scale,
  const int8_t* kernel,
  const int32_t* bias,
  int8_t output_zero_point,
  float output_scale,
  int8_t output_min,
  int8_t output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* convolution_op_out);

/// Set up a 2D Convolution operator (NHWC, QS8) with a batch size, input spatial size, and data pointers.
enum xnn_status xnn_setup_convolution2d_nhwc_qs8(
  xnn_operator_t convolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const int8_t* input,
  int8_t* output,
  pthreadpool_t threadpool);

/// Create a 2D Deconvolution (transposed convolution) operator with NHWC layout and QS8 data.
enum xnn_status xnn_create_deconvolution2d_nhwc_qs8(
  uint32_t output_padding_top,
  uint32_t output_padding_right,
  uint32_t output_padding_bottom,
  uint32_t output_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t stride_height,
  uint32_t stride_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  int8_t input_zero_point,
  float input_scale,
  float kernel_scale,
  const int8_t* kernel,
  const int32_t* bias,
  int8_t output_zero_point,
  float output_scale,
  int8_t output_min,
  int8_t output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* deconvolution_op_out);

/// Set up a 2D Deconvolution operator (NHWC, QS8) with a batch size, input size, output adjustments, and data pointers.
enum xnn_status xnn_setup_deconvolution2d_nhwc_qs8(
  xnn_operator_t deconvolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  uint32_t adjustment_height,
  uint32_t adjustment_width,
  const int8_t* input,
  int8_t* output,
  pthreadpool_t threadpool);

/// Create an ELU operator with NC layout and QS8 data; alpha scales the negative branch.
enum xnn_status xnn_create_elu_nc_qs8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float alpha,
  int8_t input_zero_point,
  float input_scale,
  int8_t output_zero_point,
  float output_scale,
  int8_t output_min,
  int8_t output_max,
  uint32_t flags,
  xnn_operator_t* elu_op_out);

/// Set up an ELU operator (NC, QS8) with a batch size and data pointers.
enum xnn_status xnn_setup_elu_nc_qs8(
  xnn_operator_t elu_op,
  size_t batch_size,
  const int8_t* input,
  int8_t* output,
  pthreadpool_t threadpool);

/// Create a Fully Connected operator with NC layout and QS8 data.
enum xnn_status xnn_create_fully_connected_nc_qs8(
  size_t input_channels,
  size_t output_channels,
  size_t input_stride,
  size_t output_stride,
  int8_t input_zero_point,
  float input_scale,
  float kernel_scale,
  const int8_t* kernel,
  const int32_t* bias,
  int8_t output_zero_point,
  float output_scale,
  int8_t output_min,
  int8_t output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* fully_connected_op_out);

/// Set up a Fully Connected operator (NC, QS8) with a batch size and data pointers.
enum xnn_status xnn_setup_fully_connected_nc_qs8(
  xnn_operator_t fully_connected_op,
  size_t batch_size,
  const int8_t* input,
  int8_t* output,
  pthreadpool_t threadpool);
3110
3111enum xnn_status xnn_create_global_average_pooling_nwc_qs8(
3112 size_t channels,
3113 size_t input_stride,
3114 size_t output_stride,
3115 int8_t input_zero_point,
3116 float input_scale,
3117 int8_t output_zero_point,
3118 float output_scale,
3119 int8_t output_min,
3120 int8_t output_max,
3121 uint32_t flags,
3122 xnn_operator_t* global_average_pooling_op_out);
3123
3124enum xnn_status xnn_setup_global_average_pooling_nwc_qs8(
3125 xnn_operator_t global_average_pooling_op,
3126 size_t batch_size,
3127 size_t width,
3128 const int8_t* input,
3129 int8_t* output,
3130 pthreadpool_t threadpool);
3131
3132enum xnn_status xnn_create_multiply_nd_qs8(
3133 int8_t input1_zero_point,
3134 float input1_scale,
3135 int8_t input2_zero_point,
3136 float input2_scale,
3137 int8_t output_zero_point,
3138 float output_scale,
3139 int8_t output_min,
3140 int8_t output_max,
3141 uint32_t flags,
3142 xnn_operator_t* multiply_op_out);
3143
3144enum xnn_status xnn_setup_multiply_nd_qs8(
3145 xnn_operator_t multiply_op,
3146 size_t num_input1_dims,
3147 const size_t* input1_shape,
3148 size_t num_input2_dims,
3149 const size_t* input2_shape,
3150 const int8_t* input1,
3151 const int8_t* input2,
3152 int8_t* output,
3153 pthreadpool_t threadpool);
3154
3155enum xnn_status xnn_create_leaky_relu_nc_qs8(
3156 size_t channels,
3157 size_t input_stride,
3158 size_t output_stride,
3159 float negative_slope,
3160 int8_t input_zero_point,
3161 float input_scale,
3162 int8_t output_zero_point,
3163 float output_scale,
3164 uint32_t flags,
3165 xnn_operator_t* leaky_relu_op_out);
3166
3167enum xnn_status xnn_setup_leaky_relu_nc_qs8(
3168 xnn_operator_t leaky_relu_op,
3169 size_t batch_size,
3170 const int8_t* input,
3171 int8_t* output,
3172 pthreadpool_t threadpool);
3173
3174enum xnn_status xnn_create_sigmoid_nc_qs8(
3175 size_t channels,
3176 size_t input_stride,
3177 size_t output_stride,
3178 int8_t input_zero_point,
3179 float input_scale,
3180 int8_t output_zero_point,
3181 float output_scale,
3182 int8_t output_min,
3183 int8_t output_max,
3184 uint32_t flags,
3185 xnn_operator_t* sigmoid_op_out);
3186
3187enum xnn_status xnn_setup_sigmoid_nc_qs8(
3188 xnn_operator_t sigmoid_op,
3189 size_t batch_size,
3190 const int8_t* input,
3191 int8_t* output,
3192 pthreadpool_t threadpool);
3193
3194enum xnn_status xnn_create_subtract_nd_qs8(
3195 int8_t input1_zero_point,
3196 float input1_scale,
3197 int8_t input2_zero_point,
3198 float input2_scale,
3199 int8_t output_zero_point,
3200 float output_scale,
3201 int8_t output_min,
3202 int8_t output_max,
3203 uint32_t flags,
3204 xnn_operator_t* subtract_op_out);
3205
3206enum xnn_status xnn_setup_subtract_nd_qs8(
3207 xnn_operator_t subtract_op,
3208 size_t num_input1_dims,
3209 const size_t* input1_shape,
3210 size_t num_input2_dims,
3211 const size_t* input2_shape,
3212 const int8_t* input1,
3213 const int8_t* input2,
3214 int8_t* output,
3215 pthreadpool_t threadpool);
3216
3217enum xnn_status xnn_create_tanh_nc_qs8(
3218 size_t channels,
3219 size_t input_stride,
3220 size_t output_stride,
3221 int8_t input_zero_point,
3222 float input_scale,
3223 int8_t output_zero_point,
3224 float output_scale,
3225 int8_t output_min,
3226 int8_t output_max,
3227 uint32_t flags,
3228 xnn_operator_t* tanh_op_out);
3229
3230enum xnn_status xnn_setup_tanh_nc_qs8(
3231 xnn_operator_t tanh_op,
3232 size_t batch_size,
3233 const int8_t* input,
3234 int8_t* output,
3235 pthreadpool_t threadpool);
3236
3237#endif // XNN_NO_QS8_OPERATORS
3238
3239#ifndef XNN_NO_QU8_OPERATORS
3240
/// Create an N-dimensional Add operator for QU8 (quantized unsigned 8-bit)
/// data with per-tensor quantization for both inputs and the output.
enum xnn_status xnn_create_add_nd_qu8(
  uint8_t input1_zero_point,
  float input1_scale,
  uint8_t input2_zero_point,
  float input2_scale,
  uint8_t output_zero_point,
  float output_scale,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_operator_t* add_op_out);

/// Setup an N-dimensional QU8 Add operator; input shapes are given per call.
/// NOTE(review): differing input shapes presumably broadcast -- confirm.
enum xnn_status xnn_setup_add_nd_qu8(
  xnn_operator_t add_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const uint8_t* input1,
  const uint8_t* input2,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a 2D Average Pooling operator with NHWC layout and QU8 data.
/// Requantizes from input to output parameters; output is clamped to
/// [output_min, output_max].
enum xnn_status xnn_create_average_pooling2d_nhwc_qu8(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  uint32_t stride_height,
  uint32_t stride_width,
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  uint8_t input_zero_point,
  float input_scale,
  uint8_t output_zero_point,
  float output_scale,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_operator_t* average_pooling_op_out);

/// Setup a QU8 2D Average Pooling operator with batch and spatial dimensions.
enum xnn_status xnn_setup_average_pooling2d_nhwc_qu8(
  xnn_operator_t average_pooling_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a 2D Convolution operator with NHWC layout and QU8 data.
/// Unlike the QS8 variant, the kernel carries an explicit zero point
/// (asymmetric quantization). Bias is in 32-bit accumulator precision.
enum xnn_status xnn_create_convolution2d_nhwc_qu8(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t subsampling_height,
  uint32_t subsampling_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  uint8_t input_zero_point,
  float input_scale,
  uint8_t kernel_zero_point,
  float kernel_scale,
  const uint8_t* kernel,
  const int32_t* bias,
  uint8_t output_zero_point,
  float output_scale,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* convolution_op_out);

/// Setup a QU8 2D Convolution operator with batch and input spatial sizes.
enum xnn_status xnn_setup_convolution2d_nhwc_qu8(
  xnn_operator_t convolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a 2D Deconvolution (transposed convolution) operator with NHWC
/// layout and QU8 data. Padding applies to the output; the kernel has an
/// explicit zero point.
enum xnn_status xnn_create_deconvolution2d_nhwc_qu8(
  uint32_t output_padding_top,
  uint32_t output_padding_right,
  uint32_t output_padding_bottom,
  uint32_t output_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t stride_height,
  uint32_t stride_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  uint8_t input_zero_point,
  float input_scale,
  uint8_t kernel_zero_point,
  float kernel_scale,
  const uint8_t* kernel,
  const int32_t* bias,
  uint8_t output_zero_point,
  float output_scale,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* deconvolution_op_out);

/// Setup a QU8 2D Deconvolution operator; adjustment_height/width provide the
/// output-size adjustment used by transposed convolutions.
enum xnn_status xnn_setup_deconvolution2d_nhwc_qu8(
  xnn_operator_t deconvolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  uint32_t adjustment_height,
  uint32_t adjustment_width,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a Fully Connected operator in NC layout for QU8 data with an
/// explicit kernel zero point and 32-bit integer bias.
enum xnn_status xnn_create_fully_connected_nc_qu8(
  size_t input_channels,
  size_t output_channels,
  size_t input_stride,
  size_t output_stride,
  uint8_t input_zero_point,
  float input_scale,
  uint8_t kernel_zero_point,
  float kernel_scale,
  const uint8_t* kernel,
  const int32_t* bias,
  uint8_t output_zero_point,
  float output_scale,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* fully_connected_op_out);

/// Setup a QU8 Fully Connected operator for a given batch size.
enum xnn_status xnn_setup_fully_connected_nc_qu8(
  xnn_operator_t fully_connected_op,
  size_t batch_size,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a Global Average Pooling operator in NWC layout for QU8 data.
enum xnn_status xnn_create_global_average_pooling_nwc_qu8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint8_t input_zero_point,
  float input_scale,
  uint8_t output_zero_point,
  float output_scale,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_operator_t* global_average_pooling_op_out);

/// Setup a QU8 Global Average Pooling operator with batch size and width.
enum xnn_status xnn_setup_global_average_pooling_nwc_qu8(
  xnn_operator_t global_average_pooling_op,
  size_t batch_size,
  size_t width,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a Leaky ReLU operator in NC layout for QU8 data. As with the QS8
/// variant, there are no output clamping parameters.
enum xnn_status xnn_create_leaky_relu_nc_qu8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float negative_slope,
  uint8_t input_zero_point,
  float input_scale,
  uint8_t output_zero_point,
  float output_scale,
  uint32_t flags,
  xnn_operator_t* leaky_relu_op_out);

/// Setup a QU8 Leaky ReLU operator for a given batch size.
enum xnn_status xnn_setup_leaky_relu_nc_qu8(
  xnn_operator_t leaky_relu_op,
  size_t batch_size,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional Multiply operator for QU8 data with per-tensor
/// quantization for both inputs and the output.
enum xnn_status xnn_create_multiply_nd_qu8(
  uint8_t input1_zero_point,
  float input1_scale,
  uint8_t input2_zero_point,
  float input2_scale,
  uint8_t output_zero_point,
  float output_scale,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_operator_t* multiply_op_out);

/// Setup an N-dimensional QU8 Multiply operator.
/// NOTE(review): differing input shapes presumably broadcast -- confirm.
enum xnn_status xnn_setup_multiply_nd_qu8(
  xnn_operator_t multiply_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const uint8_t* input1,
  const uint8_t* input2,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a Sigmoid operator in NC layout for QU8 data.
enum xnn_status xnn_create_sigmoid_nc_qu8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint8_t input_zero_point,
  float input_scale,
  uint8_t output_zero_point,
  float output_scale,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_operator_t* sigmoid_op_out);

/// Setup a QU8 Sigmoid operator for a given batch size.
enum xnn_status xnn_setup_sigmoid_nc_qu8(
  xnn_operator_t sigmoid_op,
  size_t batch_size,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a SoftMax operator in NC layout for QU8 data. No input zero point
/// parameter is taken (SoftMax is shift-invariant, so only input_scale
/// matters) and no output clamping parameters are present.
enum xnn_status xnn_create_softmax_nc_qu8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float input_scale,
  uint8_t output_zero_point,
  float output_scale,
  uint32_t flags,
  xnn_operator_t* softmax_op_out);

/// Setup a QU8 SoftMax operator for a given batch size.
enum xnn_status xnn_setup_softmax_nc_qu8(
  xnn_operator_t softmax_op,
  size_t batch_size,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional Subtract operator for QU8 data with per-tensor
/// quantization for both inputs and the output.
enum xnn_status xnn_create_subtract_nd_qu8(
  uint8_t input1_zero_point,
  float input1_scale,
  uint8_t input2_zero_point,
  float input2_scale,
  uint8_t output_zero_point,
  float output_scale,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_operator_t* subtract_op_out);

/// Setup an N-dimensional QU8 Subtract operator (output = input1 - input2).
/// NOTE(review): differing input shapes presumably broadcast -- confirm.
enum xnn_status xnn_setup_subtract_nd_qu8(
  xnn_operator_t subtract_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const uint8_t* input1,
  const uint8_t* input2,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a TanH operator in NC layout for QU8 data.
enum xnn_status xnn_create_tanh_nc_qu8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint8_t input_zero_point,
  float input_scale,
  uint8_t output_zero_point,
  float output_scale,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_operator_t* tanh_op_out);

/// Setup a QU8 TanH operator for a given batch size.
enum xnn_status xnn_setup_tanh_nc_qu8(
  xnn_operator_t tanh_op,
  size_t batch_size,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);
3542
3543#endif // XNN_NO_QU8_OPERATORS
3544
3545#ifndef XNN_NO_S8_OPERATORS
3546
/// Create a Clamp operator in NC layout for S8 (signed 8-bit, no quantization
/// parameters) data: clamps each element to [output_min, output_max].
enum xnn_status xnn_create_clamp_nc_s8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  int8_t output_min,
  int8_t output_max,
  uint32_t flags,
  xnn_operator_t* clamp_op_out);

/// Setup an S8 Clamp operator for a given batch size.
enum xnn_status xnn_setup_clamp_nc_s8(
  xnn_operator_t clamp_op,
  size_t batch_size,
  const int8_t* input,
  int8_t* output,
  pthreadpool_t threadpool);

/// Create a 2D Max Pooling operator with NHWC layout and S8 data.
/// Max pooling needs no quantization parameters (max is order-preserving);
/// output is clamped to [output_min, output_max].
enum xnn_status xnn_create_max_pooling2d_nhwc_s8(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  uint32_t stride_height,
  uint32_t stride_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  int8_t output_min,
  int8_t output_max,
  uint32_t flags,
  xnn_operator_t* max_pooling_op_out);

/// Setup an S8 2D Max Pooling operator with batch and input spatial sizes.
enum xnn_status xnn_setup_max_pooling2d_nhwc_s8(
  xnn_operator_t max_pooling_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const int8_t* input,
  int8_t* output,
  pthreadpool_t threadpool);

/// Create a 2D Bilinear Resize operator with NHWC layout and S8 data.
/// Output dimensions are supplied at setup time.
enum xnn_status xnn_create_resize_bilinear2d_nhwc_s8(
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  uint32_t flags,
  xnn_operator_t* resize_op_out);

/// Setup an S8 2D Bilinear Resize operator: bind batch size, input and output
/// spatial dimensions, and data pointers.
enum xnn_status xnn_setup_resize_bilinear2d_nhwc_s8(
  xnn_operator_t resize_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const int8_t* input,
  int8_t* output,
  pthreadpool_t threadpool);
3608
3609#endif // XNN_NO_S8_OPERATORS
3610
3611#ifndef XNN_NO_U8_OPERATORS
3612
/// Create a Clamp operator in NC layout for U8 (unsigned 8-bit, no
/// quantization parameters) data.
enum xnn_status xnn_create_clamp_nc_u8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_operator_t* clamp_op_out);

/// Setup a U8 Clamp operator for a given batch size.
enum xnn_status xnn_setup_clamp_nc_u8(
  xnn_operator_t clamp_op,
  size_t batch_size,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a 2D Max Pooling operator with NHWC layout and U8 data.
/// Output is clamped to [output_min, output_max].
enum xnn_status xnn_create_max_pooling2d_nhwc_u8(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  uint32_t stride_height,
  uint32_t stride_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_operator_t* max_pooling_op_out);

/// Setup a U8 2D Max Pooling operator with batch and input spatial sizes.
enum xnn_status xnn_setup_max_pooling2d_nhwc_u8(
  xnn_operator_t max_pooling_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a 2D Bilinear Resize operator with NHWC layout and U8 data.
enum xnn_status xnn_create_resize_bilinear2d_nhwc_u8(
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  uint32_t flags,
  xnn_operator_t* resize_op_out);

/// Setup a U8 2D Bilinear Resize operator: bind batch size, input and output
/// spatial dimensions, and data pointers.
enum xnn_status xnn_setup_resize_bilinear2d_nhwc_u8(
  xnn_operator_t resize_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);
3674
3675#endif // XNN_NO_U8_OPERATORS
3676
3677#ifndef XNN_NO_X8_OPERATORS
3678
/// Create a Copy operator in NC layout for X8 data (any 8-bit element type;
/// data is moved, not interpreted, hence the void* pointers in setup).
enum xnn_status xnn_create_copy_nc_x8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* copy_op_out);

/// Setup an X8 Copy operator for a given batch size.
enum xnn_status xnn_setup_copy_nc_x8(
  xnn_operator_t copy_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Channel Shuffle operator in NC layout for X8 data, operating on
/// groups x group_channels channels per element.
enum xnn_status xnn_create_channel_shuffle_nc_x8(
  size_t groups,
  size_t group_channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* channel_shuffle_op_out);

/// Setup an X8 Channel Shuffle operator for a given batch size.
enum xnn_status xnn_setup_channel_shuffle_nc_x8(
  xnn_operator_t channel_shuffle_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional Constant Pad operator for X8 data; padding_value
/// points to the 8-bit value used to fill padded regions.
enum xnn_status xnn_create_constant_pad_nd_x8(
  const void* padding_value,
  uint32_t flags,
  xnn_operator_t* constant_pad_op_out);

/// Setup an X8 Constant Pad operator: per-dimension pre/post padding amounts
/// are applied to input_shape (num_dims dimensions).
enum xnn_status xnn_setup_constant_pad_nd_x8(
  xnn_operator_t constant_pad_op,
  size_t num_dims,
  const size_t* input_shape,
  const size_t* pre_padding,
  const size_t* post_padding,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Depth-to-Space operator with NHWC layout and X8 data, rearranging
/// blocks of block_size x block_size from channels into spatial dimensions.
enum xnn_status xnn_create_depth_to_space_nhwc_x8(
  size_t output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  uint32_t block_size,
  uint32_t flags,
  xnn_operator_t* depth_to_space_op_out);

/// Setup an X8 Depth-to-Space operator with batch and input spatial sizes.
enum xnn_status xnn_setup_depth_to_space_nhwc_x8(
  xnn_operator_t depth_to_space_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Space-to-Depth operator with NHWC layout and X8 data (inverse
/// rearrangement of Depth-to-Space).
enum xnn_status xnn_create_space_to_depth_nhwc_x8(
  size_t input_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  uint32_t block_size,
  uint32_t flags,
  xnn_operator_t* space_to_depth_op_out);

/// Setup an X8 Space-to-Depth operator with batch and input spatial sizes.
enum xnn_status xnn_setup_space_to_depth_nhwc_x8(
  xnn_operator_t space_to_depth_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional Transpose operator for X8 data; shape and
/// permutation are supplied at setup time.
enum xnn_status xnn_create_transpose_nd_x8(
  uint32_t flags,
  xnn_operator_t* transpose_op_out);

/// Setup an X8 Transpose operator: output_perm gives the permutation of the
/// num_dims input dimensions. Note the argument order (pointers before
/// dimensions) differs from most other setup functions in this header.
enum xnn_status xnn_setup_transpose_nd_x8(
  xnn_operator_t transpose_op,
  const void* input,
  void* output,
  const size_t num_dims,
  const size_t* input_shape,
  const size_t* output_perm,
  pthreadpool_t threadpool);

/// Single-call (create + setup + run) variant of the X8 N-dimensional
/// Transpose, without constructing a persistent operator object.
enum xnn_status xnn_run_transpose_nd_x8(
  const void* input,
  void* output,
  const size_t num_dims,
  const size_t* input_shape,
  const size_t* output_perm,
  uint32_t flags,
  pthreadpool_t threadpool);
3778
3779#endif // XNN_NO_X8_OPERATORS
3780
3781#ifndef XNN_NO_CVT_OPERATORS
3782
/// Create a Convert operator in NC layout from F16 (half-precision, passed as
/// void*) to F32 data.
enum xnn_status xnn_create_convert_nc_f16_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* convert_op_out);

/// Setup an F16->F32 Convert operator for a given batch size.
enum xnn_status xnn_setup_convert_nc_f16_f32(
  xnn_operator_t convert_op,
  size_t batch_size,
  const void* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a Convert operator in NC layout from F32 to F16 (half-precision,
/// written through void*).
enum xnn_status xnn_create_convert_nc_f32_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* convert_op_out);

/// Setup an F32->F16 Convert operator for a given batch size.
enum xnn_status xnn_setup_convert_nc_f32_f16(
  xnn_operator_t convert_op,
  size_t batch_size,
  const float* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Quantize operator in NC layout from F32 to QS8, with per-tensor
/// output scale/zero point and clamping to [output_min, output_max].
enum xnn_status xnn_create_convert_nc_f32_qs8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float output_scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max,
  uint32_t flags,
  xnn_operator_t* convert_op_out);

/// Setup an F32->QS8 Convert (quantize) operator for a given batch size.
enum xnn_status xnn_setup_convert_nc_f32_qs8(
  xnn_operator_t convert_op,
  size_t batch_size,
  const float* input,
  int8_t* output,
  pthreadpool_t threadpool);

/// Create a Quantize operator in NC layout from F32 to QU8, with per-tensor
/// output scale/zero point and clamping to [output_min, output_max].
enum xnn_status xnn_create_convert_nc_f32_qu8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float output_scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_operator_t* convert_op_out);

/// Setup an F32->QU8 Convert (quantize) operator for a given batch size.
enum xnn_status xnn_setup_convert_nc_f32_qu8(
  xnn_operator_t convert_op,
  size_t batch_size,
  const float* input,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a Requantize operator in NC layout from QS8 to QS8: changes the
/// per-tensor quantization parameters (scale/zero point) without changing
/// the element type.
enum xnn_status xnn_create_convert_nc_qs8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float input_scale,
  int8_t input_zero_point,
  float output_scale,
  int8_t output_zero_point,
  uint32_t flags,
  xnn_operator_t* convert_op_out);

/// Setup a QS8->QS8 requantizing Convert operator for a given batch size.
enum xnn_status xnn_setup_convert_nc_qs8(
  xnn_operator_t convert_op,
  size_t batch_size,
  const int8_t* input,
  int8_t* output,
  pthreadpool_t threadpool);

/// Create a Dequantize operator in NC layout from QS8 to F32 using per-tensor
/// input scale/zero point.
enum xnn_status xnn_create_convert_nc_qs8_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float input_scale,
  int8_t input_zero_point,
  uint32_t flags,
  xnn_operator_t* convert_op_out);

/// Setup a QS8->F32 Convert (dequantize) operator for a given batch size.
enum xnn_status xnn_setup_convert_nc_qs8_f32(
  xnn_operator_t convert_op,
  size_t batch_size,
  const int8_t* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a Requantize operator in NC layout from QU8 to QU8: changes the
/// per-tensor quantization parameters without changing the element type.
enum xnn_status xnn_create_convert_nc_qu8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float input_scale,
  uint8_t input_zero_point,
  float output_scale,
  uint8_t output_zero_point,
  uint32_t flags,
  xnn_operator_t* convert_op_out);

/// Setup a QU8->QU8 requantizing Convert operator for a given batch size.
enum xnn_status xnn_setup_convert_nc_qu8(
  xnn_operator_t convert_op,
  size_t batch_size,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a Dequantize operator in NC layout from QU8 to F32 using per-tensor
/// input scale/zero point.
enum xnn_status xnn_create_convert_nc_qu8_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float input_scale,
  uint8_t input_zero_point,
  uint32_t flags,
  xnn_operator_t* convert_op_out);

/// Setup a QU8->F32 Convert (dequantize) operator for a given batch size.
enum xnn_status xnn_setup_convert_nc_qu8_f32(
  xnn_operator_t convert_op,
  size_t batch_size,
  const uint8_t* input,
  float* output,
  pthreadpool_t threadpool);
3914
3915#endif // XNN_NO_CVT_OPERATORS
3916
3917#ifdef __cplusplus
3918} // extern "C"
3919#endif
3920