1// Copyright (c) Facebook, Inc. and its affiliates.
2// All rights reserved.
3//
4// Copyright 2019 Google LLC
5//
6// This source code is licensed under the BSD-style license found in the
7// LICENSE file in the root directory of this source tree.
8
9#include <math.h>
10#include <stdbool.h>
11#include <stddef.h>
12#include <stdint.h>
13#include <string.h>
14
15#ifdef _WIN32
16 #include <windows.h>
17#else
18 #include <errno.h>
19 #include <pthread.h>
20 #include <sys/mman.h>
21 #include <unistd.h>
22#endif
23
24#ifdef _MSC_VER
25 #include <intrin.h>
26#endif
27
28#ifndef __EMSCRIPTEN__
29 #include <cpuinfo.h>
30#endif
31
32#include <xnnpack.h>
33#include <xnnpack/allocator.h>
34#include <xnnpack/argmaxpool.h>
35#include <xnnpack/avgpool.h>
36#include <xnnpack/common.h>
37#include <xnnpack/conv.h>
38#include <xnnpack/dwconv.h>
39#include <xnnpack/gavgpool.h>
40#include <xnnpack/gemm.h>
41#include <xnnpack/fill.h>
42#include <xnnpack/ibilinear.h>
43#include <xnnpack/igemm.h>
44#include <xnnpack/log.h>
45#include <xnnpack/lut.h>
46#include <xnnpack/maxpool.h>
47#include <xnnpack/pad.h>
48#include <xnnpack/params.h>
49#include <xnnpack/microparams-init.h>
50#include <xnnpack/pavgpool.h>
51#include <xnnpack/prelu.h>
52#include <xnnpack/raddstoreexpminusmax.h>
53#include <xnnpack/rmax.h>
54#include <xnnpack/spmm.h>
55#include <xnnpack/unpool.h>
56#include <xnnpack/transpose.h>
57#include <xnnpack/vadd.h>
58#include <xnnpack/vbinary.h>
59#include <xnnpack/vcvt.h>
60#include <xnnpack/vlrelu.h>
61#include <xnnpack/vmul.h>
62#include <xnnpack/vmulcaddc.h>
63#include <xnnpack/vunary.h>
64#include <xnnpack/zip.h>
65
66#ifndef XNN_ENABLE_ASSEMBLY
67 #define XNN_ENABLE_ASSEMBLY 1
68#endif
69
70#if XNN_PLATFORM_WINDOWS
71 static INIT_ONCE init_guard = INIT_ONCE_STATIC_INIT;
72#else
73 static pthread_once_t init_guard = PTHREAD_ONCE_INIT;
74#endif
75
// Converts a GEMM/IGEMM row tile (MR) to its zero-based index in the per-MR
// microkernel tables (xnn_params.*.gemm.minmax.gemm / .igemm).
// The parameter is fully parenthesized so that an expression argument
// (e.g. `a ? 4 : 2`) expands correctly (CERT C PRE01-C); the unparenthesized
// form `(MR-1)` would bind `-1` to only the last operand of the expression.
#define XNN_MR_TO_INDEX(MR) ((MR) - 1)
77
78#ifndef XNN_ENABLE_GEMM_M_SPECIALIZATION
79#error "XNN_ENABLE_GEMM_M_SPECIALIZATION is not defined"
80#endif
81
82static const struct xnn_allocator* volatile init_allocator = NULL;
83
84static void init(void) {
85#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
86 // Unlike most other architectures, on x86/x86-64 when floating-point instructions
87 // have no NaN arguments, but produce NaN output, the output NaN has sign bit set.
88 // We use it to distinguish x86/x86-64 from other architectures, by doing subtraction
89 // of two infinities (must produce NaN per IEEE 754 standard).
90 static const volatile float inf = INFINITY;
91 const bool is_wasm_x86 = signbit(inf - inf);
92#endif
93 uint32_t init_flags = XNN_INIT_FLAG_XNNPACK;
94
95#if XNN_ARCH_ARM
96 #if XNN_PLATFORM_MOBILE
97 if (!cpuinfo_has_arm_neon()) {
98 xnn_log_error("XNNPACK initialization failed: NEON is not supported");
99 return;
100 }
101 #else
102 if (!cpuinfo_has_arm_v6()) {
103 xnn_log_error("XNNPACK initialization failed: ARMv6 instructions not supported");
104 return;
105 }
106
107 if (!cpuinfo_has_arm_vfpv2() && !cpuinfo_has_arm_vfpv3()) {
108 xnn_log_error("XNNPACK initialization failed: VFP is not supported");
109 return;
110 }
111 #endif
112
113 if (cpuinfo_has_arm_neon()) {
114 /**************************** QC8 AArch32 micro-kernels ****************************/
115 #ifndef XNN_NO_QC8_OPERATORS
116 init_flags |= XNN_INIT_FLAG_QC8;
117
118 #if XNN_ENABLE_ASSEMBLY
119 if (XNN_ENABLE_ARM_DOTPROD && cpuinfo_has_arm_neon_dot()) {
120 #if XNN_ENABLE_ARM_DOTPROD
121 switch (cpuinfo_get_uarch(0)->uarch) {
122 case cpuinfo_uarch_cortex_a55:
123 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55);
124 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55);
125 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neondot);
126 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neondot);
127 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
128 xnn_params.qc8.gemm.mr = 4;
129 xnn_params.qc8.gemm.nr = 8;
130 xnn_params.qc8.gemm.log2_kr = 2;
131 break;
132 default:
133 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64);
134 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64);
135 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neondot);
136 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neondot);
137 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
138 xnn_params.qc8.gemm.mr = 4;
139 xnn_params.qc8.gemm.nr = 8;
140 xnn_params.qc8.gemm.log2_kr = 2;
141 break;
142 }
143 #endif // XNN_ENABLE_ARM_DOTPROD
144 } else {
145 switch (cpuinfo_get_uarch(0)->uarch) {
146 case cpuinfo_uarch_cortex_a5:
147 case cpuinfo_uarch_cortex_a7:
148 case cpuinfo_uarch_krait:
149 case cpuinfo_uarch_kryo:
150 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
151 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
152 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
153 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
154 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neon_params;
155 xnn_params.qc8.gemm.mr = 4;
156 xnn_params.qc8.gemm.nr = 8;
157 break;
158 case cpuinfo_uarch_cortex_a32:
159 case cpuinfo_uarch_cortex_a35:
160 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a35);
161 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a35);
162 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_cortex_a35);
163 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_cortex_a35);
164 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
165 xnn_params.qc8.gemm.mr = 4;
166 xnn_params.qc8.gemm.nr = 8;
167 break;
168 case cpuinfo_uarch_cortex_a53:
169 case cpuinfo_uarch_cortex_a57:
170 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_cortex_a53);
171 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_cortex_a53);
172 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_prfm_cortex_a35);
173 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_prfm_cortex_a35);
174 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
175 xnn_params.qc8.gemm.mr = 4;
176 xnn_params.qc8.gemm.nr = 8;
177 break;
178 case cpuinfo_uarch_cortex_a55r0:
179 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53);
180 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53);
181 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_cortex_a35);
182 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_cortex_a35);
183 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
184 xnn_params.qc8.gemm.mr = 4;
185 xnn_params.qc8.gemm.nr = 8;
186 break;
187 case cpuinfo_uarch_cortex_a72:
188 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
189 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
190 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
191 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
192 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
193 xnn_params.qc8.gemm.mr = 2;
194 xnn_params.qc8.gemm.nr = 8;
195 xnn_params.qc8.gemm.log2_kr = 1;
196 xnn_params.qc8.gemm.log2_sr = 2;
197 break;
198 case cpuinfo_uarch_exynos_m1:
199 case cpuinfo_uarch_exynos_m2:
200 case cpuinfo_uarch_exynos_m3:
201 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64);
202 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64);
203 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_prfm_cortex_a35);
204 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_prfm_cortex_a35);
205 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
206 xnn_params.qc8.gemm.mr = 4;
207 xnn_params.qc8.gemm.nr = 8;
208 break;
209
210 default:
211 if (cpuinfo_has_arm_neon_v8()) {
212 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64);
213 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64);
214 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_cortex_a35);
215 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_cortex_a35);
216 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
217 xnn_params.qc8.gemm.mr = 4;
218 xnn_params.qc8.gemm.nr = 8;
219 } else {
220 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
221 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
222 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7);
223 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7);
224 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neon_params;
225 xnn_params.qc8.gemm.mr = 4;
226 xnn_params.qc8.gemm.nr = 8;
227 }
228 break;
229 }
230 }
231 #if XNN_MAX_UARCH_TYPES > 1
232 {
233 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
234 const uint32_t mr = xnn_params.qc8.gemm.mr;
235 const uint32_t nr = xnn_params.qc8.gemm.nr;
236 const uint32_t log2_kr = xnn_params.qc8.gemm.log2_kr;
237 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
238 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
239 if (uarch_info == NULL) {
240 /* No more microarchitectures in the system */
241 break;
242 }
243
244 switch (uarch_info->uarch) {
245 case cpuinfo_uarch_cortex_a55:
246 #if XNN_ENABLE_ARM_DOTPROD
247 if (mr == 4 && nr == 8 && log2_kr == 2 && cpuinfo_has_arm_neon_dot()) {
248 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55;
249 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55;
250 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neondot;
251 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neondot;
252 }
253 #endif // XNN_ENABLE_ARM_DOTPROD
254 break;
255 case cpuinfo_uarch_cortex_a53:
256 if (mr == 4 && nr == 8 && log2_kr == 0) {
257 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_cortex_a53;
258 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_cortex_a53;
259 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_prfm_cortex_a35;
260 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_prfm_cortex_a35;
261 }
262 break;
263 case cpuinfo_uarch_cortex_a55r0:
264 if (mr == 4 && nr == 8 && log2_kr == 0) {
265 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53;
266 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53;
267 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_cortex_a35;
268 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_cortex_a35;
269 }
270 break;
271
272 default:
273 break;
274 }
275 }
276 }
277 #endif // XNN_MAX_UARCH_TYPES > 1
278 #else // XNN_ENABLE_ASSEMBLY
279 if (XNN_ENABLE_ARM_DOTPROD && cpuinfo_has_arm_neon_dot()) {
280 #if XNN_ENABLE_ARM_DOTPROD
281 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__neondot);
282 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8c4__neondot);
283 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neondot);
284 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neondot);
285 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
286 xnn_params.qc8.gemm.mr = 4;
287 xnn_params.qc8.gemm.nr = 8;
288 xnn_params.qc8.gemm.log2_kr = 2;
289 #endif // XNN_ENABLE_ARM_DOTPROD
290 } else if (cpuinfo_has_arm_v8()) {
291 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
292 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
293 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
294 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
295 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
296 xnn_params.qc8.gemm.mr = 2;
297 xnn_params.qc8.gemm.nr = 8;
298 xnn_params.qc8.gemm.log2_kr = 1;
299 xnn_params.qc8.gemm.log2_sr = 2;
300 } else {
301 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal);
302 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal);
303 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal);
304 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal);
305 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neon_params;
306 xnn_params.qc8.gemm.mr = 2;
307 xnn_params.qc8.gemm.nr = 8;
308 xnn_params.qc8.gemm.log2_kr = 1;
309 xnn_params.qc8.gemm.log2_sr = 2;
310 }
311 #endif // XNN_ENABLE_ASSEMBLY
312
313 if (cpuinfo_has_arm_neon_v8()) {
314 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x3__aarch32_neonv8_mla8_cortex_a35;
315 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
316 xnn_params.qc8.dwconv[0].channel_tile = 16;
317 xnn_params.qc8.dwconv[0].primary_tile = 3;
318 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__neonv8_mla8_ld64;
319 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
320 xnn_params.qc8.dwconv[1].channel_tile = 16;
321 xnn_params.qc8.dwconv[1].primary_tile = 9;
322 xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__neonv8_mla8_ld64;
323 xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
324 xnn_params.qc8.dwconv[2].channel_tile = 8;
325 xnn_params.qc8.dwconv[2].primary_tile = 25;
326 } else {
327 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x3__neon_mla8_ld128;
328 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_neon_params;
329 xnn_params.qc8.dwconv[0].channel_tile = 16;
330 xnn_params.qc8.dwconv[0].primary_tile = 3;
331 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__neon_mla8_ld64;
332 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_neon_params;
333 xnn_params.qc8.dwconv[1].channel_tile = 16;
334 xnn_params.qc8.dwconv[1].primary_tile = 9;
335 xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__neon_mla8_ld64;
336 xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_neon_params;
337 xnn_params.qc8.dwconv[2].channel_tile = 8;
338 xnn_params.qc8.dwconv[2].primary_tile = 25;
339 }
340 #endif // XNN_NO_QC8_OPERATORS
341
342 /**************************** QS8 AArch32 micro-kernels ****************************/
343 #ifndef XNN_NO_QS8_OPERATORS
344 init_flags |= XNN_INIT_FLAG_QS8;
345
346 #if XNN_ENABLE_ASSEMBLY
347 if (XNN_ENABLE_ARM_DOTPROD && cpuinfo_has_arm_neon_dot()) {
348 #if XNN_ENABLE_ARM_DOTPROD
349 switch (cpuinfo_get_uarch(0)->uarch) {
350 case cpuinfo_uarch_cortex_a55:
351 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_cortex_a55);
352 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_cortex_a55);
353 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
354 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
355 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
356 xnn_params.qs8.gemm.mr = 4;
357 xnn_params.qs8.gemm.nr = 8;
358 xnn_params.qs8.gemm.log2_kr = 2;
359 break;
360 default:
361 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_ld64);
362 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_ld64);
363 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
364 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
365 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
366 xnn_params.qs8.gemm.mr = 4;
367 xnn_params.qs8.gemm.nr = 8;
368 xnn_params.qs8.gemm.log2_kr = 2;
369 break;
370 }
371 #endif // XNN_ENABLE_ARM_DOTPROD
372 } else {
373 switch (cpuinfo_get_uarch(0)->uarch) {
374 case cpuinfo_uarch_cortex_a5:
375 case cpuinfo_uarch_cortex_a7:
376 case cpuinfo_uarch_krait:
377 case cpuinfo_uarch_kryo:
378 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
379 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
380 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
381 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
382 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
383 xnn_params.qs8.gemm.mr = 4;
384 xnn_params.qs8.gemm.nr = 8;
385 break;
386 case cpuinfo_uarch_cortex_a32:
387 case cpuinfo_uarch_cortex_a35:
388 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7);
389 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7);
390 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7);
391 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7);
392 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
393 xnn_params.qs8.gemm.mr = 4;
394 xnn_params.qs8.gemm.nr = 8;
395 break;
396 case cpuinfo_uarch_cortex_a53:
397 case cpuinfo_uarch_cortex_a57:
398 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53);
399 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53);
400 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
401 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
402 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
403 xnn_params.qs8.gemm.mr = 4;
404 xnn_params.qs8.gemm.nr = 8;
405 break;
406 case cpuinfo_uarch_cortex_a55r0:
407 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53);
408 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53);
409 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7);
410 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7);
411 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
412 xnn_params.qs8.gemm.mr = 4;
413 xnn_params.qs8.gemm.nr = 8;
414 break;
415 case cpuinfo_uarch_cortex_a72:
416 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
417 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
418 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
419 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
420 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
421 xnn_params.qs8.gemm.mr = 2;
422 xnn_params.qs8.gemm.nr = 8;
423 xnn_params.qs8.gemm.log2_kr = 1;
424 xnn_params.qs8.gemm.log2_sr = 2;
425 break;
426 case cpuinfo_uarch_exynos_m1:
427 case cpuinfo_uarch_exynos_m2:
428 case cpuinfo_uarch_exynos_m3:
429 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
430 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
431 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7);
432 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7);
433 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
434 xnn_params.qs8.gemm.mr = 4;
435 xnn_params.qs8.gemm.nr = 8;
436 break;
437 default:
438 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
439 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
440 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7);
441 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7);
442 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
443 xnn_params.qs8.gemm.mr = 4;
444 xnn_params.qs8.gemm.nr = 8;
445 break;
446 }
447 }
      // Heterogeneous-multiprocessing (HMP) specialization for QS8 GEMM/IGEMM:
      // the big-core kernels were installed above in slot function[0]; here the
      // per-uarch function[i] slots are overridden for additional ("little")
      // micro-architecture clusters reported by cpuinfo.
      #if XNN_MAX_UARCH_TYPES > 1
      {
        /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
        const uint32_t mr = xnn_params.qs8.gemm.mr;
        const uint32_t nr = xnn_params.qs8.gemm.nr;
        const uint32_t log2_kr = xnn_params.qs8.gemm.log2_kr;
        for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
          const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
          if (uarch_info == NULL) {
            /* No more microarchitectures in the system */
            break;
          }

          switch (uarch_info->uarch) {
            case cpuinfo_uarch_cortex_a55:
              // Override only when the big-core selection uses the same tile
              // layout (mr/nr/log2_kr), so packed weights stay interchangeable
              // across clusters.
              #if XNN_ENABLE_ARM_DOTPROD
                if (mr == 4 && nr == 8 && log2_kr == 2 && cpuinfo_has_arm_neon_dot()) {
                  xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_cortex_a55;
                  xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_cortex_a55;
                  xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot;
                  xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot;
                }
              #endif // XNN_ENABLE_ARM_DOTPROD
              break;
            case cpuinfo_uarch_cortex_a53:
              if (mr == 4 && nr == 8 && log2_kr == 0) {
                xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53;
                xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53;
                xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7;
                xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7;
              }
              break;
            case cpuinfo_uarch_cortex_a55r0:
              // Cortex-A55r0 lacks the fast NEON-int8 paths of later A55
              // revisions; the non-prefetch A53-tuned kernels are used instead.
              if (mr == 4 && nr == 8 && log2_kr == 0) {
                xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53;
                xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53;
                xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7;
                xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7;
              }
              break;
            default:
              break;
          }
        }
      }
      #endif // XNN_MAX_UARCH_TYPES > 1
    #else // XNN_ENABLE_ASSEMBLY
      // Intrinsics-only QS8 GEMM selection (assembly disabled).
      // XNN_ENABLE_ARM_DOTPROD is a compile-time macro: when it is 0 the
      // condition folds to false and the else-branch NEON MLAL kernels are used
      // unconditionally.
      if (XNN_ENABLE_ARM_DOTPROD && cpuinfo_has_arm_neon_dot()) {
        #if XNN_ENABLE_ARM_DOTPROD
          xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neondot);
          xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neondot);
          xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
          xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
          xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
          xnn_params.qs8.gemm.mr = 4;
          xnn_params.qs8.gemm.nr = 8;
          xnn_params.qs8.gemm.log2_kr = 2;
        #endif // XNN_ENABLE_ARM_DOTPROD
      } else {
        // 2x8c2s4 NEON MLAL kernels: mr=2, nr=8, kr=2 (log2_kr=1), sr=4 (log2_sr=2).
        xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
        xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
        xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
        xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
        xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
        xnn_params.qs8.gemm.mr = 2;
        xnn_params.qs8.gemm.nr = 8;
        xnn_params.qs8.gemm.log2_kr = 1;
        xnn_params.qs8.gemm.log2_sr = 2;
      }
    #endif // XNN_ENABLE_ASSEMBLY
518
    // QS8 depthwise convolution kernels: slot [0] covers 3x3 (9-tap) filters
    // with a 16-channel tile, slot [1] covers 5x5 (25-tap) filters with an
    // 8-channel tile.
    xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mla8_ld64;
    xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
    xnn_params.qs8.dwconv[0].channel_tile = 16;
    xnn_params.qs8.dwconv[0].primary_tile = 9;
    xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mla8_ld64;
    xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
    xnn_params.qs8.dwconv[1].channel_tile = 8;
    xnn_params.qs8.dwconv[1].primary_tile = 25;
527
    // QS8 global average pooling: unipass handles up to 7 rows; multipass
    // (7p7x) accumulates 7 rows at a time for taller inputs.
    xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
      .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8,
      .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8,
      .init.qs8 = xnn_init_qs8_avgpool_minmax_rndnu_neon_params,
      .update.qs8 = xnn_update_qs8_avgpool_minmax_rndnu_neon_params,
      .row_tile = 7,
      .channel_tile = 8,
    };
536
    // QS8 element-wise binary ops. opc_ukernel handles (vector op constant);
    // ropc_ukernel handles (constant op vector). Addition is commutative, so
    // the same vaddc kernel serves both orientations.
    xnn_params.qs8.vadd = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__neon_ld64_x16,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x16,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x16,
      .init.qs8_add = xnn_init_qs8_add_minmax_neon_params,
      .element_tile = 16,
    };
    xnn_params.qs8.vmul = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_rndnu_ukernel__neon_ld64_x16,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
      .init.qs8_mul = xnn_init_qs8_mul_minmax_rndnu_neon_params,
      .element_tile = 16,
    };
551
    // QS8 leaky ReLU, 32 elements per iteration.
    xnn_params.qs8.lrelu = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vlrelu_ukernel__neon_x32,
      .init.qs8_lrelu = xnn_init_qs8_lrelu_neon_params,
      .element_tile = 32,
    };
  #endif  // XNN_NO_QS8_OPERATORS
558
  /*************************** QU8 AArch32 micro-kernels ***************************/
  #ifndef XNN_NO_QU8_OPERATORS
    init_flags |= XNN_INIT_FLAG_QU8;

    #if XNN_ENABLE_ASSEMBLY
      // QU8 GEMM/IGEMM selection: prefer NEON dot-product kernels when the CPU
      // supports them; otherwise pick an assembly kernel tuned for the primary
      // (big) core's micro-architecture.
      if (XNN_ENABLE_ARM_DOTPROD && cpuinfo_has_arm_neon_dot()) {
        #if XNN_ENABLE_ARM_DOTPROD
          xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot);
          xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__neondot);
          xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
          xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
          xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
          xnn_params.qu8.gemm.mr = 4;
          xnn_params.qu8.gemm.nr = 8;
          xnn_params.qu8.gemm.log2_kr = 2;
        #endif // XNN_ENABLE_ARM_DOTPROD
      } else {
        // NOTE(review): cpuinfo_get_uarch(0) is assumed non-NULL here (core 0
        // always exists once cpuinfo is initialized) — confirm against the
        // cpuinfo initialization check earlier in this function.
        switch (cpuinfo_get_uarch(0)->uarch) {
          case cpuinfo_uarch_cortex_a5:
          case cpuinfo_uarch_cortex_a7:
          case cpuinfo_uarch_krait:
          case cpuinfo_uarch_kryo:
            xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
            xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
            xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
            xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
            xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
            xnn_params.qu8.gemm.mr = 4;
            xnn_params.qu8.gemm.nr = 8;
            break;
          case cpuinfo_uarch_cortex_a32:
          case cpuinfo_uarch_cortex_a35:
            xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7);
            xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7);
            xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7);
            xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7);
            xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
            xnn_params.qu8.gemm.mr = 4;
            xnn_params.qu8.gemm.nr = 8;
            break;
          case cpuinfo_uarch_cortex_a53:
          case cpuinfo_uarch_cortex_a57:
          case cpuinfo_uarch_cortex_a72:
            xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53);
            xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53);
            xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
            xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
            xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
            xnn_params.qu8.gemm.mr = 4;
            xnn_params.qu8.gemm.nr = 8;
            break;
          case cpuinfo_uarch_cortex_a55r0:
            xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53);
            xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53);
            xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7);
            xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7);
            xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
            xnn_params.qu8.gemm.mr = 4;
            xnn_params.qu8.gemm.nr = 8;
            break;
          case cpuinfo_uarch_exynos_m1:
          case cpuinfo_uarch_exynos_m2:
          case cpuinfo_uarch_exynos_m3:
            xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
            xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
            xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
            xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
            xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
            xnn_params.qu8.gemm.mr = 4;
            xnn_params.qu8.gemm.nr = 8;
            break;
          default:
            // Unknown/other big-core uarch: generic ld64 kernels.
            xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
            xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
            xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7);
            xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7);
            xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
            xnn_params.qu8.gemm.mr = 4;
            xnn_params.qu8.gemm.nr = 8;
            break;
        }
      }
      // HMP specialization for QU8 GEMM/IGEMM: override function[i] slots for
      // little-core clusters, but only when the big-core tile layout
      // (mr/nr/log2_kr) matches, so packed weights remain shared.
      #if XNN_MAX_UARCH_TYPES > 1
      {
        /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
        const uint32_t mr = xnn_params.qu8.gemm.mr;
        const uint32_t nr = xnn_params.qu8.gemm.nr;
        const uint32_t log2_kr = xnn_params.qu8.gemm.log2_kr;
        for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
          const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
          if (uarch_info == NULL) {
            /* No more microarchitectures in the system */
            break;
          }

          switch (uarch_info->uarch) {
            case cpuinfo_uarch_cortex_a53:
              if (mr == 4 && nr == 8 && log2_kr == 0) {
                xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53;
                xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53;
                xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7;
                xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7;
              }
              break;
            case cpuinfo_uarch_cortex_a55r0:
              if (mr == 4 && nr == 8 && log2_kr == 0) {
                xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53;
                xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53;
                xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7;
                xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7;
              }
              break;
            default:
              break;
          }
        }
      }
      #endif // XNN_MAX_UARCH_TYPES > 1
    #else // XNN_ENABLE_ASSEMBLY
      // Intrinsics-only QU8 GEMM selection (assembly disabled). When the
      // XNN_ENABLE_ARM_DOTPROD macro is 0 the condition folds to false at
      // compile time and the NEON MLAL-lane fallback is used.
      if (XNN_ENABLE_ARM_DOTPROD && cpuinfo_has_arm_neon_dot()) {
        #if XNN_ENABLE_ARM_DOTPROD
          xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot);
          xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__neondot);
          xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
          xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
          xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
          xnn_params.qu8.gemm.mr = 4;
          xnn_params.qu8.gemm.nr = 8;
          xnn_params.qu8.gemm.log2_kr = 2;
        #endif // XNN_ENABLE_ARM_DOTPROD
      } else {
        xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane);
        xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane);
        xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
        xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
        xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
        xnn_params.qu8.gemm.mr = 3;
        xnn_params.qu8.gemm.nr = 8;
      }
    #endif // XNN_ENABLE_ASSEMBLY
699
    // QU8 depthwise convolution: slot [0] = 9-tap (3x3), 16-channel tile;
    // slot [1] = 25-tap (5x5), 8-channel tile.
    xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mul8;
    xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
    xnn_params.qu8.dwconv[0].channel_tile = 16;
    xnn_params.qu8.dwconv[0].primary_tile = 9;
    xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul8;
    xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
    xnn_params.qu8.dwconv[1].channel_tile = 8;
    xnn_params.qu8.dwconv[1].primary_tile = 25;

    // QU8 (non-global) average pooling: 9-element primary tile with an
    // 8-element incremental tile for larger pooling windows.
    xnn_params.qu8.avgpool = (struct avgpool_parameters) {
      .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__neon_c8,
      .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__neon_c8,
      .init.qu8 = xnn_init_qu8_avgpool_minmax_neon_params,
      .primary_tile = 9,
      .incremental_tile = 8,
      .channel_tile = 8,
    };
    // QU8 global average pooling, 7 rows per pass.
    xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
      .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8,
      .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8,
      .init.qu8 = xnn_init_qu8_avgpool_minmax_rndnu_neon_params,
      .update.qu8 = xnn_update_qu8_avgpool_minmax_rndnu_neon_params,
      .row_tile = 7,
      .channel_tile = 8,
    };
    // QU8 element-wise add/multiply; commutative ops reuse the same
    // vector-constant kernel for both operand orders.
    xnn_params.qu8.vadd = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__neon_ld64_x16,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x16,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x16,
      .init.qu8_add = xnn_init_qu8_add_minmax_neon_params,
      .element_tile = 8,
    };
    xnn_params.qu8.vmul = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_rndnu_ukernel__neon_ld64_x16,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
      .init.qu8_mul = xnn_init_qu8_mul_minmax_rndnu_neon_params,
      .element_tile = 16,
    };

    // QU8 leaky ReLU, 32 elements per iteration.
    xnn_params.qu8.lrelu = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vlrelu_ukernel__neon_x32,
      .init.qu8_lrelu = xnn_init_qu8_lrelu_neon_params,
      .element_tile = 32,
    };
  #endif  // XNN_NO_QU8_OPERATORS
746
  /**************************** S8 AArch32 micro-kernels ****************************/
  #ifndef XNN_NO_S8_OPERATORS
    init_flags |= XNN_INIT_FLAG_S8;

    // Signed-8-bit clamp (min/max), 64 elements per iteration.
    xnn_params.s8.clamp = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_s8_vclamp_ukernel__neon_x64,
      .init.s8_minmax = xnn_init_s8_minmax_neon_params,
      .element_tile = 64,
    };
    // Indirect bilinear interpolation (resize), 8 channels per iteration.
    xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
      .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__neon_c8,
      .pixel_tile = 1,
      .channel_tile = 8,
    };
    // Max pooling: 9-element primary pass plus 8-element incremental passes.
    xnn_params.s8.maxpool = (struct maxpool_parameters) {
      .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__neon_c16,
      .init.s8 = xnn_init_s8_minmax_neon_params,
      .mr = 9,
      .qr = 8,
    };
  #endif  // XNN_NO_S8_OPERATORS
768
  /**************************** U8 AArch32 micro-kernels ****************************/
  #ifndef XNN_NO_U8_OPERATORS
    init_flags |= XNN_INIT_FLAG_U8;

    // Unsigned-8-bit clamp (min/max), 64 elements per iteration.
    xnn_params.u8.clamp = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_u8_vclamp_ukernel__neon_x64,
      .init.u8_minmax = xnn_init_u8_minmax_neon_params,
      .element_tile = 64,
    };
    // Indirect bilinear interpolation (resize), 8 channels per iteration.
    xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
      .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__neon_c8,
      .pixel_tile = 1,
      .channel_tile = 8,
    };
    // Max pooling: 9-element primary pass plus 8-element incremental passes.
    xnn_params.u8.maxpool = (struct maxpool_parameters) {
      .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__neon_c16,
      .init.u8 = xnn_init_u8_minmax_neon_params,
      .mr = 9,
      .qr = 8,
    };
    // Running-max reduction (NEON) and LUT normalization (scalar — no NEON
    // variant is selected here).
    xnn_params.u8.rmax = xnn_u8_rmax_ukernel__neon;
    xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
  #endif  // XNN_NO_U8_OPERATORS
792
  /**************************** X8 AArch32 micro-kernels ****************************/
  #ifndef XNN_NO_X8_OPERATORS
    init_flags |= XNN_INIT_FLAG_X8;

    // Type-agnostic 8-bit table lookup and channel-interleaving (zip) kernels.
    xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
    xnn_params.x8.zip = (struct zip_parameters) {
      .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__neon,
      .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__neon,
      .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__neon,
      .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__neon,
    };

    // 8-bit transpose: kernel operates on 16x16 tiles; tile_size here is the
    // blocking parameter used by the operator, not the kernel's tile edge.
    xnn_params.x8.transpose = (struct transpose_parameters) {
      .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x8_transposec_ukernel__16x16_reuse_dec_zip_neon,
      .tile_size = 32,
    };
  #endif  // XNN_NO_X8_OPERATORS
810
  /**************************** X16 AArch32 micro-kernels ****************************/
  #ifndef XNN_NO_X16_OPERATORS
    init_flags |= XNN_INIT_FLAG_X16;

    // 16-bit transpose: 8x8-tile NEON kernel.
    xnn_params.x16.transpose = (struct transpose_parameters) {
      .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x16_transposec_ukernel__8x8_reuse_dec_zip_neon,
      .tile_size = 32,
    };
  #endif  // XNN_NO_X16_OPERATORS
820
  /**************************** F32 AArch32 micro-kernels ****************************/
  #ifndef XNN_NO_F32_OPERATORS
    init_flags |= XNN_INIT_FLAG_F32;

    #if XNN_ENABLE_ASSEMBLY
      // F32 GEMM/IGEMM selection keyed on the primary (big) core's
      // micro-architecture. All choices share the 4x8 / 1x8 tile layout, so
      // only the kernel tuning differs per uarch.
      switch (cpuinfo_get_uarch(0)->uarch) {
        case cpuinfo_uarch_cortex_a5:
        case cpuinfo_uarch_cortex_a7:
        case cpuinfo_uarch_krait:
        case cpuinfo_uarch_kryo:
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7);
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.gemm.mr = 4;
          xnn_params.f32.gemm.nr = 8;
          break;
        case cpuinfo_uarch_cortex_a53:
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53);
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.gemm.mr = 4;
          xnn_params.f32.gemm.nr = 8;
          break;
        case cpuinfo_uarch_cortex_a55r0:
          // A55r0 uses the A53-tuned kernel without prefetch.
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53);
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.gemm.mr = 4;
          xnn_params.f32.gemm.nr = 8;
          break;
        case cpuinfo_uarch_cortex_a32:
        case cpuinfo_uarch_cortex_a35:
        case cpuinfo_uarch_cortex_a55:
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55);
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.gemm.mr = 4;
          xnn_params.f32.gemm.nr = 8;
          break;

        case cpuinfo_uarch_cortex_a57:
        case cpuinfo_uarch_cortex_a72:
        case cpuinfo_uarch_cortex_a73:
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a75);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a75);
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.gemm.mr = 4;
          xnn_params.f32.gemm.nr = 8;
          break;

        default:
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75);
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.gemm.mr = 4;
          xnn_params.f32.gemm.nr = 8;
          // JIT code generators are registered only for the default path;
          // uarch-specific cases above rely on the prebuilt assembly kernels.
          #if XNN_ENABLE_JIT
            xnn_params.f32.gemm.generator.gemm = xnn_init_hmp_gemm_codegen(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
            xnn_params.f32.gemm.generator.igemm = xnn_init_hmp_igemm_codegen(xnn_generate_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
          #endif
          break;
      }
      // HMP specialization for F32 GEMM/IGEMM: override the function[i] slots
      // for little-core clusters when the big-core tile layout (mr/nr) matches.
      // No log2_kr check is needed here — all F32 candidates use kr == 1.
      #if XNN_MAX_UARCH_TYPES > 1
      {
        /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
        const uint32_t mr = xnn_params.f32.gemm.mr;
        const uint32_t nr = xnn_params.f32.gemm.nr;
        for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
          const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
          if (uarch_info == NULL) {
            /* No more microarchitectures in the system */
            break;
          }

          switch (uarch_info->uarch) {
            case cpuinfo_uarch_cortex_a53:
              if (mr == 4 && nr == 8) {
                xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53;
                xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53;
                xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64;
                xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64;
              }
              break;
            case cpuinfo_uarch_cortex_a55r0:
              if (mr == 4 && nr == 8) {
                xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53;
                xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53;
                xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64;
                xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64;
              }
              break;
            case cpuinfo_uarch_cortex_a55:
              if (mr == 4 && nr == 8) {
                xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55;
                xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55;
                xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64;
                xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64;
              }
              break;
            default:
              break;
          }
        }
      }
      #endif // XNN_MAX_UARCH_TYPES > 1
    #else // XNN_ENABLE_ASSEMBLY
      // Intrinsics-only F32 GEMM fallback (assembly disabled).
      xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128);
      xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld128);
      xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
      xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
      xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
      xnn_params.f32.gemm.mr = 4;
      xnn_params.f32.gemm.nr = 8;
    #endif // XNN_ENABLE_ASSEMBLY
    // Secondary 4x2 GEMM (gemm2) used for narrow-output cases.
    xnn_params.f32.gemm2.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__neon_lane_ld64);
    xnn_params.f32.gemm2.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__neon_lane_ld64);
    xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
    xnn_params.f32.gemm2.mr = 4;
    xnn_params.f32.gemm2.nr = 2;
952
953 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x3__neon;
954 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
955 xnn_params.f32.dwconv[0].channel_tile = 8,
956 xnn_params.f32.dwconv[0].primary_tile = 3,
957
958 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__neon;
959 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
960 xnn_params.f32.dwconv[1].channel_tile = 8,
961 xnn_params.f32.dwconv[1].primary_tile = 4,
962
963 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__neon;
964 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
965 xnn_params.f32.dwconv[2].channel_tile = 8;
966 xnn_params.f32.dwconv[2].primary_tile = 9;
967
968 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__neon_acc2;
969 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
970 xnn_params.f32.dwconv[3].channel_tile = 8;
971 xnn_params.f32.dwconv[3].primary_tile = 25;
972
973 xnn_params.f32.avgpool = (struct avgpool_parameters) {
974 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__neon_c4,
975 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__neon_c4,
976 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
977 .primary_tile = 9,
978 .incremental_tile = 8,
979 .channel_tile = 4,
980 };
981 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
982 .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__neon_c4,
983 .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__neon_c4,
984 .init.f32 = xnn_init_f32_minmax_scalar_params,
985 .primary_tile = 9,
986 .incremental_tile = 8,
987 .channel_tile = 4,
988 };
989 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
990 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4,
991 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4,
992 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
993 .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
994 .row_tile = 7,
995 .channel_tile = 4,
996 };
997 xnn_params.f32.maxpool = (struct maxpool_parameters) {
998 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__neon_c4,
999 .init.f32 = xnn_init_f32_minmax_scalar_params,
1000 .mr = 9,
1001 .qr = 8,
1002 };
1003 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
1004 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__neon_c4,
1005 .mr = 4,
1006 };
1007 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
1008 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__neon_c4,
1009 .mr = 9,
1010 };
1011 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
1012 .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__neon_c4,
1013 .mr = 9,
1014 .qr = 8,
1015 };
1016 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
1017 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__neon_c8,
1018 .pixel_tile = 1,
1019 .channel_tile = 8,
1020 };
1021 xnn_params.f32.abs = (struct vunary_parameters) {
1022 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vabs_ukernel__neon_x8,
1023 .element_tile = 8,
1024 };
1025 xnn_params.f32.clamp = (struct vunary_parameters) {
1026 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vclamp_ukernel__neon_x8,
1027 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
1028 .element_tile = 8,
1029 };
1030 if (cpuinfo_has_arm_neon_fma()) {
1031 xnn_params.f32.elu = (struct vunary_parameters) {
1032 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_velu_ukernel__neonfma_rr1_p6_x8,
1033 .init.f32_elu = xnn_init_f32_elu_neonfma_rr1_p6_params,
1034 .element_tile = 8,
1035 };
1036 } else {
1037 xnn_params.f32.elu = (struct vunary_parameters) {
1038 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_velu_ukernel__neon_rr2_lut16_p3_x8,
1039 .init.f32_elu = xnn_init_f32_elu_neon_rr2_lut16_p3_params,
1040 .element_tile = 8,
1041 };
1042 }
1043 xnn_params.f32.hswish = (struct vunary_parameters) {
1044 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vhswish_ukernel__neon_x16,
1045 .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
1046 .element_tile = 16,
1047 };
1048 xnn_params.f32.lrelu = (struct vunary_parameters) {
1049 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vlrelu_ukernel__neon_x8,
1050 .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
1051 .element_tile = 8,
1052 };
1053 xnn_params.f32.neg = (struct vunary_parameters) {
1054 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vneg_ukernel__neon_x8,
1055 .element_tile = 8,
1056 };
1057 if (cpuinfo_has_arm_neon_v8()) {
1058 xnn_params.f32.rndne = (struct vunary_parameters) {
1059 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndne_ukernel__neonv8_x8,
1060 .element_tile = 8,
1061 };
1062 xnn_params.f32.rndz = (struct vunary_parameters) {
1063 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndz_ukernel__neonv8_x8,
1064 .element_tile = 8,
1065 };
1066 xnn_params.f32.rndu = (struct vunary_parameters) {
1067 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndu_ukernel__neonv8_x8,
1068 .element_tile = 8,
1069 };
1070 xnn_params.f32.rndd = (struct vunary_parameters) {
1071 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndd_ukernel__neonv8_x8,
1072 .element_tile = 8,
1073 };
1074 } else {
1075 xnn_params.f32.rndne = (struct vunary_parameters) {
1076 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndne_ukernel__neon_x8,
1077 .element_tile = 8,
1078 };
1079 xnn_params.f32.rndz = (struct vunary_parameters) {
1080 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndz_ukernel__neon_x8,
1081 .element_tile = 8,
1082 };
1083 xnn_params.f32.rndu = (struct vunary_parameters) {
1084 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndu_ukernel__neon_x8,
1085 .element_tile = 8,
1086 };
1087 xnn_params.f32.rndd = (struct vunary_parameters) {
1088 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndd_ukernel__neon_x8,
1089 .element_tile = 8,
1090 };
1091 }
1092 xnn_params.f32.sigmoid = (struct vunary_parameters) {
1093 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_x8,
1094 .init.f32_sigmoid = xnn_init_f32_sigmoid_neon_rr2_lut64_p2_params,
1095 .element_tile = 8,
1096 };
1097 xnn_params.f32.sqr = (struct vunary_parameters) {
1098 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsqr_ukernel__neon_x8,
1099 .element_tile = 8,
1100 };
      // Scalar sqrt micro-kernel, one element per iteration (element_tile = 1).
      // NOTE(review): this is a scalar kernel even though we are on the NEON
      // code path — presumably no vectorized f32 vsqrt micro-kernel is wired
      // up for AArch32 here; confirm against the upstream kernel inventory.
      xnn_params.f32.sqrt = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1,
        .element_tile = 1,
      };
1105 xnn_params.f32.prelu = (struct prelu_parameters) {
1106 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__neon_2x8,
1107 .row_tile = 2,
1108 .channel_tile = 8,
1109 };
1110 xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
1111 .ukernel = (xnn_raddstoreexpminusmax_ukernel_function) xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_x8,
1112 .init.f32 = xnn_init_f32_expminus_neon_rr2_lut64_p2_params,
1113 .element_tile = 8,
1114 };
1115 xnn_params.f32.rmax = (xnn_rmax_ukernel_function) xnn_f32_rmax_ukernel__neon;
1116 xnn_params.f32.vadd = (struct vbinary_parameters) {
1117 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__neon_x8,
1118 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
1119 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
1120 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
1121 .element_tile = 8,
1122 };
1123 xnn_params.f32.vdiv = (struct vbinary_parameters) {
1124 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__scalar_x2,
1125 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__scalar_x2,
1126 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__scalar_x2,
1127 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
1128 .element_tile = 2,
1129 };
1130 xnn_params.f32.vmax = (struct vbinary_parameters) {
1131 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__neon_x8,
1132 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
1133 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
1134 .element_tile = 8,
1135 };
1136 xnn_params.f32.vmin = (struct vbinary_parameters) {
1137 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__neon_x8,
1138 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
1139 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
1140 .element_tile = 8,
1141 };
1142 xnn_params.f32.vmul = (struct vbinary_parameters) {
1143 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__neon_x8,
1144 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
1145 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
1146 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
1147 .element_tile = 8,
1148 };
1149 xnn_params.f32.vsub = (struct vbinary_parameters) {
1150 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__neon_x8,
1151 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__neon_x8,
1152 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__neon_x8,
1153 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
1154 .element_tile = 8,
1155 };
1156 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
1157 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__neon_x8,
1158 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
1159 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
1160 .element_tile = 8,
1161 };
1162 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
1163 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__neon_2x,
1164 .init.f32 = xnn_init_f32_minmax_scalar_params,
1165 .channel_tile = 4,
1166 .row_tile = 2,
1167 };
1168 #ifndef XNN_NO_NCHW_OPERATORS
1169 init_flags |= XNN_INIT_FLAG_CHW_OPT;
1170
1171 xnn_params.f32.spmm = (struct spmm_parameters) {
1172 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__neon,
1173 .mr = 32,
1174 .nr = 1,
1175 };
1176 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
1177 .ukernel_with_symm_padding =
1178 (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2,
1179 .output_channel_tile = 4,
1180 .output_height_tile = 2,
1181 .output_width_tile = 2,
1182 };
1183 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
1184 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_2x4,
1185 .output_width_tile = 4,
1186 .output_height_tile = 2,
1187 };
1188 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
1189 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_1x4,
1190 .output_width_tile = 4,
1191 .output_height_tile = 1,
1192 };
1193 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
1194 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4,
1195 .output_width_tile = 4,
1196 .output_height_tile = 1,
1197 };
1198 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
1199 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4,
1200 .output_width_tile = 4,
1201 .output_height_tile = 1,
1202 };
1203 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
1204 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__neon_x4,
1205 .channel_tile = 4,
1206 };
1207 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
1208 .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__neon_p8,
1209 .channel_tile = 1,
1210 .pixel_tile = 8,
1211 };
1212 #endif // XNN_NO_NCHW_OPERATORS
1213 #endif // XNN_NO_F32_OPERATORS
1214
1215 /*************************** VCVT AArch32 micro-kernels ***************************/
1216 #ifndef XNN_NO_VCVT_OPERATORS
1217 init_flags |= XNN_INIT_FLAG_VCVT;
1218
1219 if (cpuinfo_has_arm_neon_fp16()) {
1220 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
1221 .ukernel = (xnn_vunary_ukernel_function) xnn_f16_f32_vcvt_ukernel__neonfp16_x16,
1222 .element_tile = 16,
1223 };
1224 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
1225 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_f16_vcvt_ukernel__neonfp16_x16,
1226 .element_tile = 16,
1227 };
1228 } else {
1229 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
1230 .ukernel = (xnn_vunary_ukernel_function) xnn_f16_f32_vcvt_ukernel__neon_int16_x16,
1231 .init.f16_f32_cvt = xnn_init_f16_f32_cvt_neon_params,
1232 .element_tile = 16,
1233 };
1234 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
1235 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_f16_vcvt_ukernel__neon_x8,
1236 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_neon_params,
1237 .element_tile = 8,
1238 };
1239 }
1240 if (cpuinfo_has_arm_neon_v8()) {
1241 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
1242 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qs8_vcvt_ukernel__neonv8_x32,
1243 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_neonv8_params,
1244 .element_tile = 32,
1245 };
1246 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
1247 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qu8_vcvt_ukernel__neonv8_x32,
1248 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_neonv8_params,
1249 .element_tile = 32,
1250 };
1251 } else {
1252 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
1253 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qs8_vcvt_ukernel__neon_x32,
1254 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_neon_params,
1255 .element_tile = 32,
1256 };
1257 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
1258 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qu8_vcvt_ukernel__neon_x32,
1259 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_neon_params,
1260 .element_tile = 32,
1261 };
1262 }
1263 xnn_params.vcvt.qs8 = (struct vunary_parameters) {
1264 .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vcvt_ukernel__neon_x32,
1265 .init.qs8_cvt = xnn_init_qs8_cvt_neon_params,
1266 .element_tile = 32,
1267 };
1268 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
1269 .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_f32_vcvt_ukernel__neon_x32,
1270 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_neon_params,
1271 .element_tile = 32,
1272 };
1273 xnn_params.vcvt.qu8 = (struct vunary_parameters) {
1274 .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vcvt_ukernel__neon_x32,
1275 .init.qu8_cvt = xnn_init_qu8_cvt_neon_params,
1276 .element_tile = 32,
1277 };
1278 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
1279 .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_f32_vcvt_ukernel__neon_x32,
1280 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_neon_params,
1281 .element_tile = 32,
1282 };
1283 #endif // XNN_NO_VCVT_OPERATORS
1284
1285 /**************************** X32 AArch32 micro-kernels ****************************/
1286 #ifndef XNN_NO_X32_OPERATORS
1287 init_flags |= XNN_INIT_FLAG_X32;
1288
1289 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__neon;
1290 xnn_params.x32.zip = (struct zip_parameters) {
1291 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__neon,
1292 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__neon,
1293 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__neon,
1294 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__neon,
1295 };
1296
1297 xnn_params.x32.transpose = (struct transpose_parameters) {
1298 .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x32_transposec_ukernel__4x4_reuse_dec_zip_neon,
1299 .tile_size = 32,
1300 };
1301 #endif // XNN_NO_X32_OPERATORS
1302
1303 /**************************** XX AArch32 micro-kernels ****************************/
1304 #ifndef XNN_NO_XX_OPERATORS
1305 init_flags |= XNN_INIT_FLAG_XX;
1306
1307 xnn_params.xx.copy = (xnn_vunary_ukernel_function) xnn_xx_copy_ukernel__memcpy;
1308 xnn_params.xx.fill = (struct fill_parameters) {
1309 .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__neon_x64,
1310 .row_tile = 1,
1311 };
1312 xnn_params.xx.pad = (struct pad_parameters) {
1313 .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__neon,
1314 .row_tile = 1,
1315 };
1316 xnn_params.xx.transpose = (struct transpose_parameters) {
1317 .variable_size_ukernel = xnn_xx_transposev_ukernel__1x1_memcpy,
1318 .tile_size = 32,
1319 };
1320 #endif // XNN_NO_XX_OPERATORS
1321
1322 } else if (!XNN_PLATFORM_MOBILE) {
1323
1324 /*************************** QC8 AArch32 Pre-NEON micro-kernels ***************************/
1325 #ifndef XNN_NO_QC8_OPERATORS
1326 init_flags |= XNN_INIT_FLAG_QC8;
1327
1328 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32);
1329 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x2c4__armsimd32);
1330 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32);
1331 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x2c4__armsimd32);
1332 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_armsimd32_params;
1333 xnn_params.qc8.gemm.mr = 2;
1334 xnn_params.qc8.gemm.nr = 2;
1335 xnn_params.qc8.gemm.log2_kr = 2;
1336
      // QC8 depthwise-convolution table (pre-NEON fallback): scalar "fmagic"
      // unipass micro-kernels for primary tiles of 3, 9, and 25 taps, one
      // channel per pass.
      xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up1x3__scalar_fmagic;
      xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_fmagic_params;
      xnn_params.qc8.dwconv[0].channel_tile = 1;
      xnn_params.qc8.dwconv[0].primary_tile = 3;
      xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up1x9__scalar_fmagic;
      xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_fmagic_params;
      xnn_params.qc8.dwconv[1].channel_tile = 1;
      xnn_params.qc8.dwconv[1].primary_tile = 9;
      xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up1x25__scalar_fmagic;
      xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_fmagic_params;
      xnn_params.qc8.dwconv[2].channel_tile = 1;
      xnn_params.qc8.dwconv[2].primary_tile = 25;
    #endif  // XNN_NO_QC8_OPERATORS
1350
1351 /*************************** QS8 AArch32 Pre-NEON micro-kernels ***************************/
1352 #ifndef XNN_NO_QS8_OPERATORS
1353 init_flags |= XNN_INIT_FLAG_QS8;
1354
1355 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32);
1356 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_2x2c4__armsimd32);
1357 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32);
1358 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x2c4__armsimd32);
1359 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_armsimd32_params;
1360 xnn_params.qs8.gemm.mr = 2;
1361 xnn_params.qs8.gemm.nr = 2;
1362 xnn_params.qs8.gemm.log2_kr = 2;
1363
1364 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up1x9__scalar_fmagic;
1365 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
1366 xnn_params.qs8.dwconv[0].channel_tile = 1;
1367 xnn_params.qs8.dwconv[0].primary_tile = 9;
1368 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up1x25__scalar_fmagic;
1369 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
1370 xnn_params.qs8.dwconv[1].channel_tile = 1;
1371 xnn_params.qs8.dwconv[1].primary_tile = 25;
1372
1373 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
1374 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1,
1375 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1,
1376 .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params,
1377 .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_scalar_imagic_params,
1378 .row_tile = 7,
1379 .channel_tile = 1,
1380 };
1381 xnn_params.qs8.vadd = (struct vbinary_parameters) {
1382 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__scalar_x1,
1383 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x1,
1384 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x1,
1385 .init.qs8_add = xnn_init_qs8_add_minmax_scalar_params,
1386 .element_tile = 1,
1387 };
1388 xnn_params.qs8.vmul = (struct vbinary_parameters) {
1389 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__scalar_x4,
1390 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
1391 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
1392 .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_scalar_params,
1393 .element_tile = 4,
1394 };
1395
1396 xnn_params.qs8.lrelu = (struct vunary_parameters) {
1397 .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vlrelu_ukernel__armsimd32_x4,
1398 .init.qs8_lrelu = xnn_init_qs8_lrelu_armsimd32_params,
1399 .element_tile = 4,
1400 };
1401 #endif // XNN_NO_QS8_OPERATORS
1402
1403 /*************************** QU8 AArch32 Pre-NEON micro-kernels ***************************/
1404 #ifndef XNN_NO_QU8_OPERATORS
1405 init_flags |= XNN_INIT_FLAG_QU8;
1406
1407 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32);
1408 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_2x2c4__armsimd32);
1409 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32);
1410 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x2c4__armsimd32);
1411 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_armsimd32_params;
1412 xnn_params.qu8.gemm.mr = 2;
1413 xnn_params.qu8.gemm.nr = 2;
1414 xnn_params.qu8.gemm.log2_kr = 2;
1415
1416 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up1x9__scalar_fmagic;
1417 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
1418 xnn_params.qu8.dwconv[0].channel_tile = 1;
1419 xnn_params.qu8.dwconv[0].primary_tile = 9;
1420 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up1x25__scalar_fmagic;
1421 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
1422 xnn_params.qu8.dwconv[1].channel_tile = 1;
1423 xnn_params.qu8.dwconv[1].primary_tile = 25;
1424
1425 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
1426 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1,
1427 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1,
1428 .init.qu8 = xnn_init_qu8_avgpool_minmax_scalar_params,
1429 .primary_tile = 9,
1430 .incremental_tile = 8,
1431 .channel_tile = 1,
1432 };
1433 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
1434 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1,
1435 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1,
1436 .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params,
1437 .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_scalar_imagic_params,
1438 .row_tile = 7,
1439 .channel_tile = 1,
1440 };
1441 xnn_params.qu8.vadd = (struct vbinary_parameters) {
1442 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__scalar_x1,
1443 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x1,
1444 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x1,
1445 .init.qu8_add = xnn_init_qu8_add_minmax_scalar_params,
1446 .element_tile = 1,
1447 };
1448 xnn_params.qu8.vmul = (struct vbinary_parameters) {
1449 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__scalar_x4,
1450 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
1451 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
1452 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_scalar_params,
1453 .element_tile = 4,
1454 };
1455
1456 xnn_params.qu8.lrelu = (struct vunary_parameters) {
1457 .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vlrelu_ukernel__armsimd32_x4,
1458 .init.qu8_lrelu = xnn_init_qu8_lrelu_armsimd32_params,
1459 .element_tile = 4,
1460 };
1461 #endif // XNN_NO_QU8_OPERATORS
1462
1463 /**************************** S8 AArch32 Pre-NEON micro-kernels ****************************/
1464 #ifndef XNN_NO_S8_OPERATORS
1465 init_flags |= XNN_INIT_FLAG_S8;
1466
1467 xnn_params.s8.clamp = (struct vunary_parameters) {
1468 .ukernel = (xnn_vunary_ukernel_function) xnn_s8_vclamp_ukernel__scalar_x4,
1469 .init.s8_minmax = xnn_init_s8_minmax_scalar_params,
1470 .element_tile = 4,
1471 };
1472 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
1473 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__scalar_c1,
1474 .pixel_tile = 1,
1475 .channel_tile = 1,
1476 };
1477 xnn_params.s8.maxpool = (struct maxpool_parameters) {
1478 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__scalar_c1,
1479 .init.s8 = xnn_init_s8_minmax_scalar_params,
1480 .mr = 9,
1481 .qr = 8,
1482 };
1483 #endif // XNN_NO_S8_OPERATORS
1484
1485 /**************************** U8 AArch32 Pre-NEON micro-kernels ****************************/
1486 #ifndef XNN_NO_U8_OPERATORS
1487 init_flags |= XNN_INIT_FLAG_U8;
1488
1489 xnn_params.u8.clamp = (struct vunary_parameters) {
1490 .ukernel = (xnn_vunary_ukernel_function) xnn_u8_vclamp_ukernel__scalar_x4,
1491 .init.u8_minmax = xnn_init_u8_minmax_scalar_params,
1492 .element_tile = 4,
1493 };
1494 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
1495 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__scalar_c1,
1496 .pixel_tile = 1,
1497 .channel_tile = 1,
1498 };
1499 xnn_params.u8.maxpool = (struct maxpool_parameters) {
1500 .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__scalar_c1,
1501 .init.u8 = xnn_init_u8_minmax_scalar_params,
1502 .mr = 9,
1503 .qr = 8,
1504 };
1505 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
1506 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
1507 #endif // XNN_NO_U8_OPERATORS
1508
1509 /**************************** X8 AArch32 Pre-NEON micro-kernels ****************************/
1510 #ifndef XNN_NO_X8_OPERATORS
1511 init_flags |= XNN_INIT_FLAG_X8;
1512
1513 xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
1514 xnn_params.x8.zip = (struct zip_parameters) {
1515 .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
1516 .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
1517 .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
1518 .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
1519 };
1520
1521 xnn_params.x8.transpose = (struct transpose_parameters) {
1522 .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x8_transposec_ukernel__2x4_scalar_int,
1523 .tile_size = 32,
1524 };
1525 #endif // XNN_NO_X8_OPERATORS
1526
1527 /**************************** X16 AArch32 Pre-NEON micro-kernels ****************************/
1528 #ifndef XNN_NO_X16_OPERATORS
1529 init_flags |= XNN_INIT_FLAG_X16;
1530
1531 xnn_params.x16.transpose = (struct transpose_parameters) {
1532 .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x16_transposec_ukernel__2x4_scalar_int,
1533 .tile_size = 32,
1534 };
1535 #endif // XNN_NO_X16_OPERATORS
1536
1537 /**************************** F32 AArch32 Pre-NEON micro-kernels ****************************/
  #ifndef XNN_NO_F32_OPERATORS
    init_flags |= XNN_INIT_FLAG_F32;  // record that F32 operators are initialized

    // F32 GEMM/IGEMM: portable scalar kernels (4x4 main tile plus a 1-row
    // specialization), registered for minmax, relu, and linear variants.
    // xnn_init_hmp_gemm_ukernel replicates one function pointer across all
    // uarch slots (no per-core specialization on this path).
    xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x4__scalar);
    xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x4__scalar);
    xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x4__scalar);
    xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x4__scalar);
    xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x4__scalar);
    xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x4__scalar);
    xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x4__scalar);
    xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x4__scalar);
    xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x4__scalar);
    xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x4__scalar);
    xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__scalar);
    xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__scalar);
    xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
    xnn_params.f32.gemm.mr = 4;  // max rows per GEMM tile, matches the 4x4 kernel
    xnn_params.f32.gemm.nr = 4;  // columns per GEMM tile

    // Secondary GEMM (nr=2) used where the primary 4x4 tile does not fit.
    xnn_params.f32.gemm2.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__scalar);
    xnn_params.f32.gemm2.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__scalar);
    xnn_params.f32.gemm2.linear.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2__scalar);
    xnn_params.f32.gemm2.linear.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__scalar);
    xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
    xnn_params.f32.gemm2.mr = 4;
    xnn_params.f32.gemm2.nr = 2;
1564
    // F32 depthwise convolution: one unipass scalar kernel per supported
    // kernel size (primary_tile = 3, 4, 9, 25 taps), each processing one
    // channel per iteration (channel_tile = 1) with 2 accumulators.
    xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x3__scalar_acc2;
    xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x3__scalar_acc2;
    xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
    xnn_params.f32.dwconv[0].channel_tile = 1;
    xnn_params.f32.dwconv[0].primary_tile = 3;

    xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x4__scalar_acc2;
    xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x4__scalar_acc2;
    xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
    xnn_params.f32.dwconv[1].channel_tile = 1;
    xnn_params.f32.dwconv[1].primary_tile = 4;

    xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x9__scalar_acc2;
    xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x9__scalar_acc2;
    xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
    xnn_params.f32.dwconv[2].channel_tile = 1;
    xnn_params.f32.dwconv[2].primary_tile = 9;

    xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x25__scalar_acc2;
    xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x25__scalar_acc2;
    xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
    xnn_params.f32.dwconv[3].channel_tile = 1;
    xnn_params.f32.dwconv[3].primary_tile = 25;
1588
    // F32 pooling: unipass kernels cover windows up to primary_tile elements;
    // multipass kernels consume primary_tile first, then incremental_tile more
    // per pass (the 9p8x / 7p7x suffixes mirror those tile sizes).
    xnn_params.f32.avgpool = (struct avgpool_parameters) {
      .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__scalar_c1,
      .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__scalar_c1,
      .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
      .primary_tile = 9,
      .incremental_tile = 8,
      .channel_tile = 1,
    };
    xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
      .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__scalar_c1,
      .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__scalar_c1,
      .init.f32 = xnn_init_f32_minmax_scalar_params,
      .primary_tile = 9,
      .incremental_tile = 8,
      .channel_tile = 1,
    };
    xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
      .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__scalar_c1,
      .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1,
      .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
      .update.f32 = xnn_update_f32_scaleminmax_scalar_params,  // global avgpool also exposes a params-update hook
      .row_tile = 7,
      .channel_tile = 1,
    };
    xnn_params.f32.maxpool = (struct maxpool_parameters) {
      .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__scalar_c1,
      .init.f32 = xnn_init_f32_minmax_scalar_params,
      .mr = 9,  // elements consumed in the first pass
      .qr = 8,  // elements consumed per subsequent pass
    };
    // Argmax pooling: two unipass variants (4- and 9-element windows) plus a
    // multipass variant for larger windows.
    xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
      .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__scalar_c1,
      .mr = 4,
    };
    xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
      .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__scalar_c1,
      .mr = 9,
    };
    xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
      .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1,
      .mr = 9,
      .qr = 8,
    };
    // Indirect bilinear interpolation (NHWC), 2 channels per iteration.
    xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
      .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__scalar_c2,
      .pixel_tile = 1,
      .channel_tile = 2,
    };
    // F32 element-wise unary operators. element_tile matches the kernel-name
    // suffix (_x4/_x2/_x1 = elements per loop iteration); libm-backed rounding
    // and sqrt kernels process one element at a time.
    xnn_params.f32.abs = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vabs_ukernel__scalar_x4,
      .element_tile = 4,
    };
    xnn_params.f32.clamp = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vclamp_ukernel__scalar_x4,
      .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
      .element_tile = 4,
    };
    xnn_params.f32.elu = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x4,
      .init.f32_elu = xnn_init_f32_elu_scalar_rr2_lut16_p3_params,
      .element_tile = 4,
    };
    xnn_params.f32.hswish = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vhswish_ukernel__scalar_x4,
      .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
      .element_tile = 4,
    };
    xnn_params.f32.lrelu = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vlrelu_ukernel__scalar_x4,
      .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
      .element_tile = 4,
    };
    xnn_params.f32.neg = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vneg_ukernel__scalar_x4,
      .element_tile = 4,
    };
    xnn_params.f32.rndne = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndne_ukernel__scalar_libm_x1,
      .element_tile = 1,
    };
    xnn_params.f32.rndz = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndz_ukernel__scalar_libm_x1,
      .element_tile = 1,
    };
    xnn_params.f32.rndu = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndu_ukernel__scalar_libm_x1,
      .element_tile = 1,
    };
    xnn_params.f32.rndd = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndd_ukernel__scalar_libm_x1,
      .element_tile = 1,
    };
    xnn_params.f32.sigmoid = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsigmoid_ukernel__scalar_rr2_lut64_p2_div_x2,
      .init.f32_sigmoid = xnn_init_f32_sigmoid_scalar_rr2_lut64_p2_params,
      .element_tile = 2,
    };
    xnn_params.f32.sqr = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsqr_ukernel__scalar_x4,
      .element_tile = 4,
    };
    xnn_params.f32.sqrt = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1,
      .element_tile = 1,
    };
    // PReLU: scalar kernel covering 2 rows x 4 channels per step
    // (row_tile/channel_tile below describe the operator-level blocking).
    xnn_params.f32.prelu = (struct prelu_parameters) {
      .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__scalar_2x4,
      .row_tile = 4,
      .channel_tile = 4,
    };
    // Softmax building block: sum of exp(x - max) with the result stored back.
    xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
      .ukernel = (xnn_raddstoreexpminusmax_ukernel_function) xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_x4_acc2,
      .init.f32 = xnn_init_f32_expminus_scalar_rr2_p5_params,
      .element_tile = 4,
    };
    // Running-max reduction used by softmax.
    xnn_params.f32.rmax = (xnn_rmax_ukernel_function) xnn_f32_rmax_ukernel__scalar;
    // F32 element-wise binary operators. op_ukernel handles tensor-tensor,
    // opc_ukernel tensor-constant, and ropc_ukernel constant-tensor; for
    // commutative ops (add/mul/max/min/sqrdiff) opc and ropc share one kernel,
    // while div/sub use dedicated reversed (vrdivc/vrsubc) kernels.
    xnn_params.f32.vadd = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__scalar_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x8,
      .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
      .element_tile = 8,
    };
    xnn_params.f32.vdiv = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__scalar_x2,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__scalar_x2,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__scalar_x2,
      .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
      .element_tile = 2,
    };
    xnn_params.f32.vmax = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__scalar_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x8,
      .element_tile = 8,
    };
    xnn_params.f32.vmin = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__scalar_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x8,
      .element_tile = 8,
    };
    xnn_params.f32.vmul = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__scalar_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x8,
      .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
      .element_tile = 8,
    };
    xnn_params.f32.vsub = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__scalar_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__scalar_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__scalar_x8,
      .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
      .element_tile = 8,
    };
    xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__scalar_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
      .element_tile = 8,
    };
    // Fused multiply-add with per-channel constants (used by e.g. batch-norm
    // style transforms): 1 channel x 2 rows per step.
    xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
      .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c1__scalar_2x,
      .init.f32 = xnn_init_f32_minmax_scalar_params,
      .channel_tile = 1,
      .row_tile = 2,
    };
    #ifndef XNN_NO_NCHW_OPERATORS
      // NCHW (channels-first) fast path: sparse GEMM and CHW-layout kernels.
      init_flags |= XNN_INIT_FLAG_CHW_OPT;

      // Sparse matrix-dense matrix multiplication, with nr = 1/2/4 variants
      // selected by the operator according to the output width.
      xnn_params.f32.spmm = (struct spmm_parameters) {
        .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x1__scalar,
        .mr = 8,
        .nr = 1,
      };
      xnn_params.f32.spmm2 = (struct spmm_parameters) {
        .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x2__scalar,
        .mr = 8,
        .nr = 2,
      };
      xnn_params.f32.spmm4 = (struct spmm_parameters) {
        .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x4__scalar,
        .mr = 8,
        .nr = 4,
      };
      // 3x3 stride-2 convolution converting HWC input to CHW output
      // (typically the first layer of a CHW model); requires symmetric padding.
      xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
        .ukernel_with_symm_padding =
          (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1,
        .output_channel_tile = 4,
        .output_height_tile = 1,
        .output_width_tile = 1,
      };
      // CHW depthwise convolutions for common kernel/stride combinations.
      xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1,
        .output_width_tile = 1,
        .output_height_tile = 4,
      };
      xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1_acc2,
        .output_width_tile = 1,
        .output_height_tile = 2,
      };
      xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2,
        .output_width_tile = 1,
        .output_height_tile = 2,
      };
      xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2,
        .output_width_tile = 1,
        .output_height_tile = 2,
      };
      xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
        .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__scalar_x1,
        .channel_tile = 1,
      };
      xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
        .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__scalar_p4,
        .channel_tile = 1,
        .pixel_tile = 4,
      };
    #endif  // XNN_NO_NCHW_OPERATORS
  #endif  // XNN_NO_F32_OPERATORS
1813
  /*************************** VCVT AArch32 Pre-NEON micro-kernels ***************************/
  #ifndef XNN_NO_VCVT_OPERATORS
    init_flags |= XNN_INIT_FLAG_VCVT;  // record that conversion operators are initialized

    // Element-wise type-conversion kernels; element_tile matches the kernel
    // name suffix (elements per loop iteration).
    xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f16_f32_vcvt_ukernel__scalar_x4,
      .init.f16_f32_cvt = xnn_init_f16_f32_cvt_scalar_params,
      .element_tile = 4,
    };
    xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_f16_vcvt_ukernel__scalar_fabsf_x2,
      .init.f32_f16_cvt = xnn_init_f32_f16_cvt_scalar_fabsf_params,
      .element_tile = 2,
    };
    xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qs8_vcvt_ukernel__scalar_imagic_x4,
      .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_imagic_params,
      .element_tile = 4,
    };
    xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qu8_vcvt_ukernel__scalar_imagic_x4,
      .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_imagic_params,
      .element_tile = 4,
    };
    // NOTE(review): the qs8/qu8 requantization kernels below are armsimd32
    // variants inside a section labeled "Pre-NEON"; armsimd32 kernels rely on
    // ARMv6 media instructions, so presumably this path is only reached on
    // cores with ARMv6 SIMD support — confirm the guard in surrounding code.
    xnn_params.vcvt.qs8 = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vcvt_ukernel__armsimd32_x8,
      .init.qs8_cvt = xnn_init_qs8_cvt_scalar_params,
      .element_tile = 8,
    };
    xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_f32_vcvt_ukernel__scalar_x4,
      .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_scalar_params,
      .element_tile = 4,
    };
    xnn_params.vcvt.qu8 = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vcvt_ukernel__armsimd32_x8,
      .init.qu8_cvt = xnn_init_qu8_cvt_scalar_params,
      .element_tile = 8,
    };
    xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_f32_vcvt_ukernel__scalar_x4,
      .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params,
      .element_tile = 4,
    };
  #endif  // XNN_NO_VCVT_OPERATORS
1859
  /**************************** X32 AArch32 Pre-NEON micro-kernels ****************************/
  #ifndef XNN_NO_X32_OPERATORS
    init_flags |= XNN_INIT_FLAG_X32;  // record that X32 (32-bit memory) operators are initialized

    xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__scalar;
    // Channel interleaving (zip): fixed-group kernels for 2/3/4 inputs and a
    // variable-group (xm) kernel for any other count.
    xnn_params.x32.zip = (struct zip_parameters) {
      .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__scalar,
      .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__scalar,
      .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__scalar,
      .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__scalar,
    };

    // X32 transpose: scalar constant-element-size kernel with a 2x4 tile.
    xnn_params.x32.transpose = (struct transpose_parameters) {
      .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x32_transposec_ukernel__2x4_scalar_int,
      .tile_size = 32,
    };
  #endif  // XNN_NO_X32_OPERATORS
1877
  /**************************** XX AArch32 Pre-NEON micro-kernels ****************************/
  #ifndef XNN_NO_XX_OPERATORS
    init_flags |= XNN_INIT_FLAG_XX;  // record that type-agnostic (byte-oriented) operators are initialized

    // Type-agnostic kernels operating on raw bytes.
    xnn_params.xx.copy = (xnn_vunary_ukernel_function) xnn_xx_copy_ukernel__memcpy;
    xnn_params.xx.fill = (struct fill_parameters) {
      .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__scalar_x16,
      .row_tile = 1,
    };
    xnn_params.xx.pad = (struct pad_parameters) {
      .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__scalar,
      .row_tile = 1,
    };
    // Variable-element-size transpose implemented via memcpy of 1x1 tiles.
    xnn_params.xx.transpose = (struct transpose_parameters) {
      .variable_size_ukernel = xnn_xx_transposev_ukernel__1x1_memcpy,
      .tile_size = 32,
    };
  #endif  // XNN_NO_XX_OPERATORS
1896 }
1897
1898#elif XNN_ARCH_ARM64
1899
  /**************************** QC8 AArch64 micro-kernels ****************************/
  #ifndef XNN_NO_QC8_OPERATORS
    init_flags |= XNN_INIT_FLAG_QC8;  // record that per-channel-quantized int8 operators are initialized

    #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
      // Apple platforms: no per-core uarch dispatch; pick one kernel set based
      // only on the dot-product (SDOT) feature bit.
      #if XNN_ENABLE_ASSEMBLY
        // The build-time XNN_ENABLE_ARM_DOTPROD flag short-circuits the
        // runtime check, and the inner #if removes references to dot-product
        // kernels entirely when they are not compiled in.
        if (XNN_ENABLE_ARM_DOTPROD && cpuinfo_has_arm_neon_dot()) {
          #if XNN_ENABLE_ARM_DOTPROD
            // SDOT path: 4x16 assembly kernel with c4 (kr = 2^2 = 4) packing.
            xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128);
            xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot);
            xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128);
            xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot);
            xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
            xnn_params.qc8.gemm.mr = 4;
            xnn_params.qc8.gemm.nr = 16;
            xnn_params.qc8.gemm.log2_kr = 2;
          #endif  // XNN_ENABLE_ARM_DOTPROD
        } else {
          // No SDOT: 2x8 c8 (kr = 8) NEON multiply-long assembly kernels.
          xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal);
          xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal);
          xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal);
          xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal);
          xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
          xnn_params.qc8.gemm.mr = 2;
          xnn_params.qc8.gemm.nr = 8;
          xnn_params.qc8.gemm.log2_kr = 3;
        }
      #else  // !XNN_ENABLE_ASSEMBLY
        // Assembly disabled: same selection logic with intrinsics kernels.
        if (XNN_ENABLE_ARM_DOTPROD && cpuinfo_has_arm_neon_dot()) {
          #if XNN_ENABLE_ARM_DOTPROD
            xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__neondot);
            xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot);
            xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__neondot);
            xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot);
            xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
            xnn_params.qc8.gemm.mr = 4;
            xnn_params.qc8.gemm.nr = 16;
            xnn_params.qc8.gemm.log2_kr = 2;
          #endif  // XNN_ENABLE_ARM_DOTPROD
        } else {
          // c2s4 packing: kr = 2 (log2_kr = 1) with sr = 4 (log2_sr = 2).
          xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
          xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
          xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
          xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
          xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
          xnn_params.qc8.gemm.mr = 2;
          xnn_params.qc8.gemm.nr = 8;
          xnn_params.qc8.gemm.log2_kr = 1;
          xnn_params.qc8.gemm.log2_sr = 2;
        }
      #endif  // XNN_ENABLE_ASSEMBLY
    #else  // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
      // Other AArch64 platforms (e.g. Android/Linux): dispatch on the
      // micro-architecture of core 0 reported by cpuinfo.
      #if XNN_ENABLE_ASSEMBLY
        if (XNN_ENABLE_ARM_DOTPROD && cpuinfo_has_arm_neon_dot()) {
          #if XNN_ENABLE_ARM_DOTPROD
            // SDOT-capable core: choose the 4x16c4 assembly variant tuned for
            // the detected uarch; the 1-row kernels are uarch-independent.
            switch (cpuinfo_get_core(0)->uarch) {
              case cpuinfo_uarch_cortex_a55:
                xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55);
                xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55);
                break;
              case cpuinfo_uarch_cortex_x1:
              case cpuinfo_uarch_cortex_a78:
                xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128);
                xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128);
                break;
              default:
                xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64);
                xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64);
                break;
            }
            xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot);
            xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot);
            xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
            xnn_params.qc8.gemm.mr = 4;
            xnn_params.qc8.gemm.nr = 16;
            xnn_params.qc8.gemm.log2_kr = 2;
          #endif  // XNN_ENABLE_ARM_DOTPROD
        } else {
          // No SDOT: choose between mlal_lane 4x16 and mlal 2x8c8 kernel
          // families depending on the detected uarch.
          switch (cpuinfo_get_core(0)->uarch) {
            case cpuinfo_uarch_cortex_a35:
            case cpuinfo_uarch_kryo:
              xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64);
              xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64);
              xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
              xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
              xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
              xnn_params.qc8.gemm.mr = 4;
              xnn_params.qc8.gemm.nr = 16;
              break;

            case cpuinfo_uarch_cortex_a53:
            case cpuinfo_uarch_cortex_a55r0:
              xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53);
              xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53);
              xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
              xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
              xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
              xnn_params.qc8.gemm.mr = 4;
              xnn_params.qc8.gemm.nr = 16;
              break;

            case cpuinfo_uarch_cortex_a72:
            case cpuinfo_uarch_cortex_a73:
              // prfm variants add software prefetch.
              xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm);
              xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm);
              xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm);
              xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm);
              xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
              xnn_params.qc8.gemm.mr = 2;
              xnn_params.qc8.gemm.nr = 8;
              xnn_params.qc8.gemm.log2_kr = 3;
              break;

            default:
              xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal);
              xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal);
              xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal);
              xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal);
              xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
              xnn_params.qc8.gemm.mr = 2;
              xnn_params.qc8.gemm.nr = 8;
              xnn_params.qc8.gemm.log2_kr = 3;
              break;
          }
        }
        #if XNN_MAX_UARCH_TYPES > 1
          {
            /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
            // The HMP (heterogeneous multi-processing) fixup: slot 0 of each
            // function table was filled above for the big core; here the
            // remaining per-uarch slots are overridden with variants tuned
            // for the little cores, but only when the blocking parameters
            // (mr/nr/log2_kr) chosen above match — otherwise packed weights
            // would be incompatible and the big-core kernel is kept.
            const uint32_t mr = xnn_params.qc8.gemm.mr;
            const uint32_t nr = xnn_params.qc8.gemm.nr;
            const uint32_t log2_kr = xnn_params.qc8.gemm.log2_kr;
            for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
              const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
              if (uarch_info == NULL) {
                /* No more microarchitectures in the system */
                break;
              }

              switch (uarch_info->uarch) {
                case cpuinfo_uarch_cortex_a53:
                case cpuinfo_uarch_cortex_a55r0:
                  if (mr == 2 && nr == 8 && log2_kr == 3) {
                    xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)].function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53;
                    xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)].function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53;
                    xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53;
                    xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53;
                  }
                  break;

                case cpuinfo_uarch_cortex_a55:
                  #if XNN_ENABLE_ARM_DOTPROD
                    if (mr == 4 && nr == 16 && log2_kr == 2 && cpuinfo_has_arm_neon_dot()) {
                      xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55;
                      xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55;
                      xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot;
                      xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot;
                    }
                  #endif  // XNN_ENABLE_ARM_DOTPROD
                  break;
                default:
                  break;
              }
            }
          }
        #endif  // XNN_MAX_UARCH_TYPES > 1
      #else  // !XNN_ENABLE_ASSEMBLY
        // Assembly disabled on non-Apple AArch64: no uarch dispatch, pick
        // intrinsics kernels based only on the SDOT feature bit.
        if (XNN_ENABLE_ARM_DOTPROD && cpuinfo_has_arm_neon_dot()) {
          #if XNN_ENABLE_ARM_DOTPROD
            xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__neondot);
            xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__neondot);
            xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot);
            xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot);
            xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
            xnn_params.qc8.gemm.mr = 4;
            xnn_params.qc8.gemm.nr = 16;
            xnn_params.qc8.gemm.log2_kr = 2;
          #endif  // XNN_ENABLE_ARM_DOTPROD
        } else {
          // c2s4 packing: kr = 2 (log2_kr = 1) with sr = 4 (log2_sr = 2).
          xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
          xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
          xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
          xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
          xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
          xnn_params.qc8.gemm.mr = 2;
          xnn_params.qc8.gemm.nr = 8;
          xnn_params.qc8.gemm.log2_kr = 1;
          xnn_params.qc8.gemm.log2_sr = 2;
        }
      #endif  // XNN_ENABLE_ASSEMBLY
    #endif  // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
2090
2091 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x3__neonv8_mla8_ld128;
2092 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
2093 xnn_params.qc8.dwconv[0].channel_tile = 16;
2094 xnn_params.qc8.dwconv[0].primary_tile = 3;
2095 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__neonv8_mla8_ld64;
2096 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
2097 xnn_params.qc8.dwconv[1].channel_tile = 16;
2098 xnn_params.qc8.dwconv[1].primary_tile = 9;
2099 xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__neonv8_mla8_ld64;
2100 xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
2101 xnn_params.qc8.dwconv[2].channel_tile = 16;
2102 xnn_params.qc8.dwconv[2].primary_tile = 25;
2103 #endif // XNN_NO_QC8_OPERATORS
2104
  /**************************** QS8 AArch64 micro-kernels ****************************/
  #ifndef XNN_NO_QS8_OPERATORS
    init_flags |= XNN_INIT_FLAG_QS8;

    // QS8 GEMM/IGEMM microkernel selection.  On iOS/Mac a single kernel is chosen
    // without per-core dispatch; on other platforms the kernel is picked from the
    // micro-architecture of core 0 and then re-patched per cluster below.
    #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
      #if XNN_ENABLE_ASSEMBLY
        if (XNN_ENABLE_ARM_DOTPROD && cpuinfo_has_arm_neon_dot()) {
          #if XNN_ENABLE_ARM_DOTPROD
            xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
            xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
            xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
            xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
            xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
            // 4x16 tile, c4 K-packing (log2_kr = 2).
            xnn_params.qs8.gemm.mr = 4;
            xnn_params.qs8.gemm.nr = 16;
            xnn_params.qs8.gemm.log2_kr = 2;
          #endif  // XNN_ENABLE_ARM_DOTPROD
        } else {
          // No dotprod: 8-bit MLAL kernels, 2x8 tile, c8 packing (log2_kr = 3).
          xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal);
          xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal);
          xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal);
          xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal);
          xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
          xnn_params.qs8.gemm.mr = 2;
          xnn_params.qs8.gemm.nr = 8;
          xnn_params.qs8.gemm.log2_kr = 3;
        }
      #else  // !XNN_ENABLE_ASSEMBLY
        // Intrinsics fallback on iOS/Mac: same dotprod-vs-MLAL choice, portable kernels.
        if (XNN_ENABLE_ARM_DOTPROD && cpuinfo_has_arm_neon_dot()) {
          #if XNN_ENABLE_ARM_DOTPROD
            xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neondot);
            xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
            xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot);
            xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
            xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
            xnn_params.qs8.gemm.mr = 4;
            xnn_params.qs8.gemm.nr = 16;
            xnn_params.qs8.gemm.log2_kr = 2;
          #endif  // XNN_ENABLE_ARM_DOTPROD
        } else {
          // c2s4 packing (log2_kr = 1, log2_sr = 2).
          xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
          xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
          xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
          xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
          xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
          xnn_params.qs8.gemm.mr = 2;
          xnn_params.qs8.gemm.nr = 8;
          xnn_params.qs8.gemm.log2_kr = 1;
          xnn_params.qs8.gemm.log2_sr = 2;
        }
      #endif  // XNN_ENABLE_ASSEMBLY
    #else  // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
      #if XNN_ENABLE_ASSEMBLY
        if (XNN_ENABLE_ARM_DOTPROD && cpuinfo_has_arm_neon_dot()) {
          #if XNN_ENABLE_ARM_DOTPROD
            // Dotprod path: pick the 4x16c4 variant tuned for core 0's uarch.
            switch (cpuinfo_get_core(0)->uarch) {
              case cpuinfo_uarch_cortex_a55:
                xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55);
                xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55);
                break;
              case cpuinfo_uarch_cortex_x1:
              case cpuinfo_uarch_cortex_a78:
                xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
                xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
                break;
              default:
                xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld64);
                xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld64);
                break;
            }
            xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
            xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
            xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
            xnn_params.qs8.gemm.mr = 4;
            xnn_params.qs8.gemm.nr = 16;
            xnn_params.qs8.gemm.log2_kr = 2;
          #endif  // XNN_ENABLE_ARM_DOTPROD
        } else {
          // No dotprod: MLAL kernels, geometry varies by uarch (4x16 lane vs 2x8c8).
          switch (cpuinfo_get_core(0)->uarch) {
            case cpuinfo_uarch_cortex_a35:
            case cpuinfo_uarch_kryo:
              xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64);
              xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64);
              xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
              xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
              xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
              xnn_params.qs8.gemm.mr = 4;
              xnn_params.qs8.gemm.nr = 16;
              break;

            case cpuinfo_uarch_cortex_a53:
            case cpuinfo_uarch_cortex_a55r0:
              xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53);
              xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53);
              xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
              xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
              xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
              xnn_params.qs8.gemm.mr = 4;
              xnn_params.qs8.gemm.nr = 16;
              break;

            case cpuinfo_uarch_cortex_a72:
            case cpuinfo_uarch_cortex_a73:
              xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm);
              xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm);
              xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm);
              xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm);
              xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
              xnn_params.qs8.gemm.mr = 2;
              xnn_params.qs8.gemm.nr = 8;
              xnn_params.qs8.gemm.log2_kr = 3;
              break;

            default:
              xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal);
              xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal);
              xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal);
              xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal);
              xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
              xnn_params.qs8.gemm.mr = 2;
              xnn_params.qs8.gemm.nr = 8;
              xnn_params.qs8.gemm.log2_kr = 3;
              break;
          }
        }
        // Big/little fix-up: for each additional cluster, override the per-MR entry
        // with a kernel tuned for that cluster's uarch — but only when it has the
        // same mr/nr/log2_kr geometry as the kernel chosen for the big core.
        #if XNN_MAX_UARCH_TYPES > 1
          {
            /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
            const uint32_t mr = xnn_params.qs8.gemm.mr;
            const uint32_t nr = xnn_params.qs8.gemm.nr;
            const uint32_t log2_kr = xnn_params.qs8.gemm.log2_kr;
            for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
              const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
              if (uarch_info == NULL) {
                /* No more microarchitectures in the system */
                break;
              }

              switch (uarch_info->uarch) {
                case cpuinfo_uarch_cortex_a53:
                case cpuinfo_uarch_cortex_a55r0:
                  if (mr == 2 && nr == 8 && log2_kr == 3) {
                    xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)].function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53;
                    xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)].function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53;
                    xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53;
                    xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53;
                  }
                  break;

                case cpuinfo_uarch_cortex_a55:
                  #if XNN_ENABLE_ARM_DOTPROD
                    if (mr == 4 && nr == 16 && log2_kr == 2 && cpuinfo_has_arm_neon_dot()) {
                      xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55;
                      xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55;
                      xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot;
                      xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot;
                    }
                  #endif  // XNN_ENABLE_ARM_DOTPROD
                  break;
                default:
                  break;
              }
            }
          }
        #endif  // XNN_MAX_UARCH_TYPES > 1
      #else  // !XNN_ENABLE_ASSEMBLY
        if (XNN_ENABLE_ARM_DOTPROD && cpuinfo_has_arm_neon_dot()) {
          #if XNN_ENABLE_ARM_DOTPROD
            xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neondot);
            xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
            xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot);
            xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
            xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
            xnn_params.qs8.gemm.mr = 4;
            xnn_params.qs8.gemm.nr = 16;
            xnn_params.qs8.gemm.log2_kr = 2;
          #endif  // XNN_ENABLE_ARM_DOTPROD
        } else {
          xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
          xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
          xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
          xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
          xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
          xnn_params.qs8.gemm.mr = 2;
          xnn_params.qs8.gemm.nr = 8;
          xnn_params.qs8.gemm.log2_kr = 1;
          xnn_params.qs8.gemm.log2_sr = 2;
        }
      #endif  // XNN_ENABLE_ASSEMBLY
    #endif  // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC

    // QS8 depthwise convolutions: unipass rndnu kernels, 16-channel tile,
    // for 9-tap (3x3) and 25-tap (5x5) filters.
    xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mla8_ld64;
    xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
    xnn_params.qs8.dwconv[0].channel_tile = 16;
    xnn_params.qs8.dwconv[0].primary_tile = 9;
    xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x25__neon_mla8_ld64;
    xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
    xnn_params.qs8.dwconv[1].channel_tile = 16;
    xnn_params.qs8.dwconv[1].primary_tile = 25;

    // QS8 global average pooling: 7-row unipass/multipass, 8-channel tile.
    xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
      .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8,
      .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8,
      .init.qs8 = xnn_init_qs8_avgpool_minmax_rndnu_neon_params,
      .update.qs8 = xnn_update_qs8_avgpool_minmax_rndnu_neon_params,
      .row_tile = 7,
      .channel_tile = 8,
    };

    // QS8 element-wise add/multiply; element_tile matches the kernel batch tile
    // (x32 and x16 suffixes respectively).
    xnn_params.qs8.vadd = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__neon_ld64_x32,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x32,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x32,
      .init.qs8_add = xnn_init_qs8_add_minmax_neon_params,
      .element_tile = 32,
    };
    xnn_params.qs8.vmul = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_rndnu_ukernel__neon_ld64_x16,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
      .init.qs8_mul = xnn_init_qs8_mul_minmax_rndnu_neon_params,
      .element_tile = 16,
    };

    // QS8 leaky ReLU, 32 elements per iteration.
    xnn_params.qs8.lrelu = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vlrelu_ukernel__neon_x32,
      .init.qs8_lrelu = xnn_init_qs8_lrelu_neon_params,
      .element_tile = 32,
    };
  #endif  // XNN_NO_QS8_OPERATORS
2335
  /**************************** QU8 AArch64 micro-kernels ****************************/
  #ifndef XNN_NO_QU8_OPERATORS
    init_flags |= XNN_INIT_FLAG_QU8;

    // QU8 GEMM/IGEMM selection: dotprod kernels when available, otherwise MLAL-lane
    // kernels tuned by the uarch of core 0, with a per-cluster fix-up below.
    #if XNN_ENABLE_ASSEMBLY
      if (XNN_ENABLE_ARM_DOTPROD && cpuinfo_has_arm_neon_dot()) {
        #if XNN_ENABLE_ARM_DOTPROD
          switch (cpuinfo_get_core(0)->uarch) {
            case cpuinfo_uarch_cortex_a55:
              xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55);
              xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55);
              xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
              xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
              xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
              xnn_params.qu8.gemm.mr = 4;
              xnn_params.qu8.gemm.nr = 16;
              xnn_params.qu8.gemm.log2_kr = 2;
              break;
            default:
              xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
              xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
              xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
              xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
              xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
              xnn_params.qu8.gemm.mr = 4;
              xnn_params.qu8.gemm.nr = 16;
              xnn_params.qu8.gemm.log2_kr = 2;
              break;
          }
        #endif  // XNN_ENABLE_ARM_DOTPROD
      } else {
        // All MLAL-lane variants below use the same 4x16 geometry (log2_kr = 0);
        // only the scheduling of the assembly differs per uarch.
        switch (cpuinfo_get_core(0)->uarch) {
          case cpuinfo_uarch_cortex_a53:
          case cpuinfo_uarch_cortex_a55r0:
          case cpuinfo_uarch_kryo:
            xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53);
            xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53);
            xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
            xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
            xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
            xnn_params.qu8.gemm.mr = 4;
            xnn_params.qu8.gemm.nr = 16;
            break;

          case cpuinfo_uarch_cortex_a57:
          case cpuinfo_uarch_cortex_a72:
          case cpuinfo_uarch_cortex_a73:
          case cpuinfo_uarch_cortex_a75:
          case cpuinfo_uarch_cortex_a76:
          case cpuinfo_uarch_exynos_m1:
          case cpuinfo_uarch_exynos_m2:
          case cpuinfo_uarch_exynos_m3:
          case cpuinfo_uarch_exynos_m4:
            xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75);
            xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75);
            xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
            xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
            xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
            xnn_params.qu8.gemm.mr = 4;
            xnn_params.qu8.gemm.nr = 16;
            break;

          default:
            xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75);
            xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75);
            xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
            xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
            xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
            xnn_params.qu8.gemm.mr = 4;
            xnn_params.qu8.gemm.nr = 16;
            break;
        }
      }
      // Big/little fix-up: override per-cluster entries only when the cluster's
      // preferred kernel shares the geometry selected for the big core.
      #if XNN_MAX_UARCH_TYPES > 1
        {
          /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
          const uint32_t mr = xnn_params.qu8.gemm.mr;
          const uint32_t nr = xnn_params.qu8.gemm.nr;
          const uint32_t log2_kr = xnn_params.qu8.gemm.log2_kr;
          for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
            const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
            if (uarch_info == NULL) {
              /* No more microarchitectures in the system */
              break;
            }

            switch (uarch_info->uarch) {
              case cpuinfo_uarch_cortex_a53:
              case cpuinfo_uarch_cortex_a55r0:
                if (mr == 4 && nr == 16 && log2_kr == 0) {
                  xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53;
                  xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53;
                }
                break;

              case cpuinfo_uarch_cortex_a55:
                #if XNN_ENABLE_ARM_DOTPROD
                  if (mr == 4 && nr == 16 && log2_kr == 2 && cpuinfo_has_arm_neon_dot()) {
                    xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55;
                    xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55;
                  }
                #endif  // XNN_ENABLE_ARM_DOTPROD
                break;
              default:
                break;
            }
          }
        }
      #endif  // XNN_MAX_UARCH_TYPES > 1
    #else  // !XNN_ENABLE_ASSEMBLY
      // Intrinsics fallback: dotprod c4 kernels or generic MLAL-lane kernels.
      if (XNN_ENABLE_ARM_DOTPROD && cpuinfo_has_arm_neon_dot()) {
        #if XNN_ENABLE_ARM_DOTPROD
          xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot);
          xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__neondot);
          xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
          xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
          xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
          xnn_params.qu8.gemm.mr = 4;
          xnn_params.qu8.gemm.nr = 16;
          xnn_params.qu8.gemm.log2_kr = 2;
        #endif  // XNN_ENABLE_ARM_DOTPROD
      } else {
        xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane);
        xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane);
        xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
        xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
        xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
        xnn_params.qu8.gemm.mr = 4;
        xnn_params.qu8.gemm.nr = 16;
      }
    #endif  // XNN_ENABLE_ASSEMBLY

    // QU8 depthwise convolutions: 16-channel tile for the 9-tap kernel,
    // 8-channel tile for the 25-tap kernel (matches the up16x9/up8x25 names).
    xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mul8;
    xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
    xnn_params.qu8.dwconv[0].channel_tile = 16;
    xnn_params.qu8.dwconv[0].primary_tile = 9;
    xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul8;
    xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
    xnn_params.qu8.dwconv[1].channel_tile = 8;
    xnn_params.qu8.dwconv[1].primary_tile = 25;

    // QU8 average pooling (9-element primary tile, 8-element incremental) and
    // global average pooling (7-row tiles), both with 8-channel tiles.
    xnn_params.qu8.avgpool = (struct avgpool_parameters) {
      .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__neon_c8,
      .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__neon_c8,
      .init.qu8 = xnn_init_qu8_avgpool_minmax_neon_params,
      .primary_tile = 9,
      .incremental_tile = 8,
      .channel_tile = 8,
    };
    xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
      .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8,
      .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8,
      .init.qu8 = xnn_init_qu8_avgpool_minmax_rndnu_neon_params,
      .update.qu8 = xnn_update_qu8_avgpool_minmax_rndnu_neon_params,
      .row_tile = 7,
      .channel_tile = 8,
    };
2493 xnn_params.qu8.vadd = (struct vbinary_parameters) {
2494 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__neon_ld64_x32,
2495 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x32,
2496 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x32,
2497 .init.qu8_add = xnn_init_qu8_add_minmax_neon_params,
2498 .element_tile = 8,
2499 };
    // QU8 element-wise multiplication: rndnu requantization, 16 elements per
    // iteration (element_tile matches the x16 kernel suffix).
    xnn_params.qu8.vmul = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_rndnu_ukernel__neon_ld64_x16,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
      .init.qu8_mul = xnn_init_qu8_mul_minmax_rndnu_neon_params,
      .element_tile = 16,
    };

    // QU8 leaky ReLU, 32 elements per iteration.
    xnn_params.qu8.lrelu = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vlrelu_ukernel__neon_x32,
      .init.qu8_lrelu = xnn_init_qu8_lrelu_neon_params,
      .element_tile = 32,
    };
  #endif  // XNN_NO_QU8_OPERATORS
2514
  /**************************** S8 AArch64 micro-kernels ****************************/
  #ifndef XNN_NO_S8_OPERATORS
    init_flags |= XNN_INIT_FLAG_S8;

    // Signed 8-bit clamp: 64 elements per iteration.
    xnn_params.s8.clamp = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_s8_vclamp_ukernel__neon_x64,
      .init.s8_minmax = xnn_init_s8_minmax_neon_params,
      .element_tile = 64,
    };
    // Indirect bilinear interpolation: 1 pixel / 16 channels per iteration.
    xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
      .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__neon_c16,
      .pixel_tile = 1,
      .channel_tile = 16,
    };
    // Max pooling: 9-element primary tile plus 8-element refill (9p8x), c16.
    xnn_params.s8.maxpool = (struct maxpool_parameters) {
      .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__neon_c16,
      .init.s8 = xnn_init_s8_minmax_neon_params,
      .mr = 9,
      .qr = 8,
    };
  #endif  // XNN_NO_S8_OPERATORS
2536
  /**************************** U8 AArch64 micro-kernels ****************************/
  #ifndef XNN_NO_U8_OPERATORS
    init_flags |= XNN_INIT_FLAG_U8;

    // Unsigned 8-bit clamp: 64 elements per iteration.
    xnn_params.u8.clamp = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_u8_vclamp_ukernel__neon_x64,
      .init.u8_minmax = xnn_init_u8_minmax_neon_params,
      .element_tile = 64,
    };
    // Indirect bilinear interpolation: 1 pixel / 16 channels per iteration.
    xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
      .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__neon_c16,
      .pixel_tile = 1,
      .channel_tile = 16,
    };
    // Max pooling: 9-element primary tile plus 8-element refill (9p8x), c16.
    xnn_params.u8.maxpool = (struct maxpool_parameters) {
      .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__neon_c16,
      .init.u8 = xnn_init_u8_minmax_neon_params,
      .mr = 9,
      .qr = 8,
    };
    // LUT-based normalization uses the scalar kernel; reduce-max uses NEON.
    xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
    xnn_params.u8.rmax = xnn_u8_rmax_ukernel__neon;
  #endif  // XNN_NO_U8_OPERATORS
2560
  /**************************** X8 AArch64 micro-kernels ****************************/
  #ifndef XNN_NO_X8_OPERATORS
    init_flags |= XNN_INIT_FLAG_X8;

    // Type-agnostic 8-bit lookup table (TBX-based) and channel-interleave (zip)
    // microkernels.
    xnn_params.x8.lut = xnn_x8_lut_ukernel__neon_tbx128x4_x64;
    xnn_params.x8.zip = (struct zip_parameters) {
      .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__neon,
      .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__neon,
      .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__neon,
      .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__neon,
    };

    // 8-bit transposition: 16x16 zip-based kernel.
    xnn_params.x8.transpose = (struct transpose_parameters) {
      .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x8_transposec_ukernel__16x16_reuse_dec_zip_neon,
      .tile_size = 32,
    };
  #endif  // XNN_NO_X8_OPERATORS
2578
2579 /**************************** X16 AArch64 micro-kernels ****************************/
  #ifndef XNN_NO_X16_OPERATORS
    init_flags |= XNN_INIT_FLAG_X16;

    // Constant-size 16-bit transpose micro-kernel operating on 8x8 element tiles.
    xnn_params.x16.transpose = (struct transpose_parameters) {
      .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x16_transposec_ukernel__8x8_reuse_dec_zip_neon,
      .tile_size = 32,
    };
  #endif // XNN_NO_X16_OPERATORS
2588
2589 /**************************** F16 AArch64 micro-kernels ****************************/
  #ifndef XNN_NO_F16_OPERATORS
    #if XNN_ENABLE_ARM_FP16
      // F16 operators require native half-precision arithmetic (ARMv8.2-A FP16);
      // without it the F16 init flags stay clear and F16 operators are unavailable.
      if (cpuinfo_has_arm_neon_fp16_arith()) {
        init_flags |= XNN_INIT_FLAG_F16 | XNN_INIT_FLAG_F16_NATIVE;

        #if XNN_ENABLE_ASSEMBLY
          // Select F16 GEMM/IGEMM micro-kernels tuned for the big core (core 0).
          // Each case sets matched gemm/igemm kernels for the full tile (mr rows)
          // plus a 1-row fallback, and records the tile geometry (mr x nr).
          switch (cpuinfo_get_core(0)->uarch) {
            case cpuinfo_uarch_cortex_a55:
              xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55);
              xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55);
              xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64);
              xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64);
              xnn_params.f16.gemm.init.f16 = xnn_init_f16_minmax_fp16arith_params;
              xnn_params.f16.gemm.mr = 6;
              xnn_params.f16.gemm.nr = 16;
              break;
            case cpuinfo_uarch_cortex_a55r0:
            case cpuinfo_uarch_cortex_a75:
              xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0);
              xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0);
              xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64);
              xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64);
              xnn_params.f16.gemm.init.f16 = xnn_init_f16_minmax_fp16arith_params;
              xnn_params.f16.gemm.mr = 6;
              xnn_params.f16.gemm.nr = 16;
              break;
            case cpuinfo_uarch_exynos_m5:
              // Exynos M5 prefers the narrower 4x16 tile.
              xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64);
              xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64);
              xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64);
              xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64);
              xnn_params.f16.gemm.init.f16 = xnn_init_f16_minmax_fp16arith_params;
              xnn_params.f16.gemm.mr = 4;
              xnn_params.f16.gemm.nr = 16;
              break;
            case cpuinfo_uarch_exynos_m4:
              xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64);
              xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64);
              xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64);
              xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64);
              xnn_params.f16.gemm.init.f16 = xnn_init_f16_minmax_fp16arith_params;
              xnn_params.f16.gemm.mr = 6;
              xnn_params.f16.gemm.nr = 16;
              break;
            // Unrecognized microarchitectures fall back to the Cortex-A75-tuned kernels.
            default:
            case cpuinfo_uarch_cortex_a76:
            case cpuinfo_uarch_cortex_a77:
            case cpuinfo_uarch_cortex_a78:
            case cpuinfo_uarch_cortex_x1:
              xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75);
              xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75);
              xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64);
              xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64);
              xnn_params.f16.gemm.init.f16 = xnn_init_f16_minmax_fp16arith_params;
              xnn_params.f16.gemm.mr = 6;
              xnn_params.f16.gemm.nr = 16;
              break;
          }

          #if XNN_MAX_UARCH_TYPES > 1
          {
            /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
            const uint32_t mr = xnn_params.f16.gemm.mr;
            const uint32_t nr = xnn_params.f16.gemm.nr;
            for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
              const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
              if (uarch_info == NULL) {
                /* No more microarchitectures in the system */
                break;
              }

              // Little cores may only substitute kernels with the same tile geometry
              // as the big-core choice; otherwise they keep the big-core kernel.
              switch (uarch_info->uarch) {
                case cpuinfo_uarch_cortex_a55:
                  if (mr == 6 && nr == 16) {
                    xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)].function[i] = (xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55;
                    xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)].function[i] = (xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55;
                  }
                  break;
                case cpuinfo_uarch_cortex_a55r0:
                case cpuinfo_uarch_cortex_a75:
                  if (mr == 6 && nr == 16) {
                    xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)].function[i] = (xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0;
                    xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)].function[i] = (xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0;
                  }
                  break;
                default:
                  break;
              }
            }
          }
          #endif // XNN_MAX_UARCH_TYPES > 1
        #else // XNN_ENABLE_ASSEMBLY
          // No assembly: fall back to intrinsics-based NEON FP16 kernels.
          xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
          xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
          xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
          xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
          xnn_params.f16.gemm.init.f16 = xnn_init_f16_minmax_fp16arith_params;
          xnn_params.f16.gemm.mr = 6;
          xnn_params.f16.gemm.nr = 16;
        #endif // XNN_ENABLE_ASSEMBLY
2690
        // Depthwise convolution kernels indexed by kernel size: slots for
        // 3, 4, 9 (3x3), and 25 (5x5) taps. primary_tile is the tap count;
        // channel_tile is how many channels each NEON iteration handles.
        xnn_params.f16.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x3__neonfp16arith;
        xnn_params.f16.dwconv[0].init.f16 = xnn_init_f16_minmax_fp16arith_params;
        xnn_params.f16.dwconv[0].channel_tile = 16;
        xnn_params.f16.dwconv[0].primary_tile = 3;

        xnn_params.f16.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith;
        xnn_params.f16.dwconv[1].init.f16 = xnn_init_f16_minmax_fp16arith_params;
        xnn_params.f16.dwconv[1].channel_tile = 16;
        xnn_params.f16.dwconv[1].primary_tile = 4;

        xnn_params.f16.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith;
        xnn_params.f16.dwconv[2].init.f16 = xnn_init_f16_minmax_fp16arith_params;
        xnn_params.f16.dwconv[2].channel_tile = 16;
        xnn_params.f16.dwconv[2].primary_tile = 9;

        // 25-tap (5x5) kernel uses a narrower 8-channel tile with 2 accumulators.
        xnn_params.f16.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2;
        xnn_params.f16.dwconv[3].init.f16 = xnn_init_f16_minmax_fp16arith_params;
        xnn_params.f16.dwconv[3].channel_tile = 8;
        xnn_params.f16.dwconv[3].primary_tile = 25;

        // Average pooling: unipass handles windows up to 9 elements; multipass
        // adds 8 per incremental pass; 8 channels per NEON iteration.
        xnn_params.f16.avgpool = (struct avgpool_parameters) {
          .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f16_avgpool_minmax_ukernel_9x__neonfp16arith_c8,
          .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f16_avgpool_minmax_ukernel_9p8x__neonfp16arith_c8,
          .init.f16 = xnn_init_f16_scaleminmax_fp16arith_params,
          .primary_tile = 9,
          .incremental_tile = 8,
          .channel_tile = 8,
        };
        // Pixelwise average pooling (per-pixel scale, used for padded pooling).
        xnn_params.f16.pavgpool = (struct pavgpool_parameters) {
          .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f16_pavgpool_minmax_ukernel_9x__neonfp16arith_c8,
          .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f16_pavgpool_minmax_ukernel_9p8x__neonfp16arith_c8,
          .init.f16 = xnn_init_f16_minmax_fp16arith_params,
          .primary_tile = 9,
          .incremental_tile = 8,
          .channel_tile = 8,
        };
        // Global average pooling: 7 rows per pass, 8 channels per iteration.
        xnn_params.f16.gavgpool = (struct gavgpool_parameters) {
          .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8,
          .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8,
          .init.f16 = xnn_init_f16_scaleminmax_fp16arith_params,
          .update.f16 = xnn_update_f16_scaleminmax_fp16arith_params,
          .row_tile = 7,
          .channel_tile = 8,
        };
2735
        // Max pooling: 9-element first pass (mr), 8 per incremental pass (qr).
        xnn_params.f16.maxpool = (struct maxpool_parameters) {
          .ukernel = (xnn_maxpool_ukernel_function) xnn_f16_maxpool_minmax_ukernel_9p8x__neonfp16arith_c8,
          .init.f16 = xnn_init_f16_minmax_fp16arith_params,
          .mr = 9,
          .qr = 8,
        };
        // Indirect bilinear interpolation (resize), 8 channels per iteration.
        xnn_params.f16.ibilinear = (struct ibilinear_parameters) {
          .ukernel = (xnn_ibilinear_ukernel_function) xnn_f16_ibilinear_ukernel__neonfp16arith_c8,
          .pixel_tile = 1,
          .channel_tile = 8,
        };

        // PReLU: processes 2 rows x 16 channels per iteration.
        xnn_params.f16.prelu = (struct prelu_parameters) {
          .ukernel = (xnn_prelu_ukernel_function) xnn_f16_prelu_ukernel__neonfp16arith_2x16,
          .row_tile = 2,
          .channel_tile = 16,
        };

        // Softmax building block: reduce-add-store of exp(x - max), 40 elements per pass.
        xnn_params.f16.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
          .ukernel = (xnn_raddstoreexpminusmax_ukernel_function) xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x40,
          .init.f16 = xnn_init_f16_expminus_fp16arith_rr2_p2_params,
          .element_tile = 40,
        };
        xnn_params.f16.rmax = (xnn_rmax_ukernel_function) xnn_f16_rmax_ukernel__neonfp16arith;

        // Elementwise binary ops. Each entry provides the vector-vector kernel,
        // a vector-scalar (opc) kernel, and a reversed-operand scalar (ropc)
        // kernel; ropc differs from opc only for non-commutative ops (div, sub).
        xnn_params.f16.vadd = (struct vbinary_parameters) {
          .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vadd_minmax_ukernel__neonfp16arith_x16,
          .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vaddc_minmax_ukernel__neonfp16arith_x16,
          .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vaddc_minmax_ukernel__neonfp16arith_x16,
          .init.f16_minmax = xnn_init_f16_minmax_fp16arith_params,
          .element_tile = 16,
        };
        xnn_params.f16.vdiv = (struct vbinary_parameters) {
          .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vdiv_minmax_ukernel__neonfp16arith_x8,
          .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vdivc_minmax_ukernel__neonfp16arith_x8,
          .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vrdivc_minmax_ukernel__neonfp16arith_x8,
          .init.f16_minmax = xnn_init_f16_minmax_fp16arith_params,
          .element_tile = 8,
        };
        xnn_params.f16.vmax = (struct vbinary_parameters) {
          .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmax_ukernel__neonfp16arith_x16,
          .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmaxc_ukernel__neonfp16arith_x16,
          .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmaxc_ukernel__neonfp16arith_x16,
          .element_tile = 16,
        };
        xnn_params.f16.vmin = (struct vbinary_parameters) {
          .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmin_ukernel__neonfp16arith_x16,
          .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vminc_ukernel__neonfp16arith_x16,
          .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vminc_ukernel__neonfp16arith_x16,
          .element_tile = 16,
        };
        xnn_params.f16.vmul = (struct vbinary_parameters) {
          .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmul_minmax_ukernel__neonfp16arith_x16,
          .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmulc_minmax_ukernel__neonfp16arith_x16,
          .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmulc_minmax_ukernel__neonfp16arith_x16,
          .init.f16_minmax = xnn_init_f16_minmax_fp16arith_params,
          .element_tile = 16,
        };
        xnn_params.f16.vsub = (struct vbinary_parameters) {
          .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vsub_minmax_ukernel__neonfp16arith_x16,
          .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vsubc_minmax_ukernel__neonfp16arith_x16,
          .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vrsubc_minmax_ukernel__neonfp16arith_x16,
          .init.f16_minmax = xnn_init_f16_minmax_fp16arith_params,
          .element_tile = 16,
        };
        xnn_params.f16.vsqrdiff = (struct vbinary_parameters) {
          .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vsqrdiff_ukernel__neonfp16arith_x16,
          .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vsqrdiffc_ukernel__neonfp16arith_x16,
          .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vsqrdiffc_ukernel__neonfp16arith_x16,
          .init.f16_minmax = xnn_init_f16_minmax_fp16arith_params,
          .element_tile = 16,
        };

        // Fused multiply-add with per-channel constants: 8 channels x 2 rows per pass.
        xnn_params.f16.vmulcaddc = (struct vmulcaddc_parameters) {
          .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f16_vmulcaddc_minmax_ukernel_c8__neonfp16arith_2x,
          .init.f16 = xnn_init_f16_minmax_fp16arith_params,
          .channel_tile = 8,
          .row_tile = 2,
        };
2815
        // Elementwise unary ops; element_tile is elements processed per kernel loop.
        xnn_params.f16.abs = (struct vunary_parameters) {
          .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vabs_ukernel__neonfp16arith_x16,
          .element_tile = 16,
        };
        xnn_params.f16.clamp = (struct vunary_parameters) {
          .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vclamp_ukernel__neonfp16arith_x16,
          .init.f16_minmax = xnn_init_f16_minmax_fp16arith_params,
          .element_tile = 16,
        };
        xnn_params.f16.elu = (struct vunary_parameters) {
          .ukernel = (xnn_vunary_ukernel_function) xnn_f16_velu_ukernel__neonfp16arith_rr1_p3_x16,
          .init.f16_elu = xnn_init_f16_elu_fp16arith_rr1_p3_params,
          .element_tile = 16,
        };
        xnn_params.f16.hswish = (struct vunary_parameters) {
          .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vhswish_ukernel__neonfp16arith_x16,
          .init.f16_hswish = xnn_init_f16_hswish_fp16arith_params,
          .element_tile = 16,
        };
        xnn_params.f16.lrelu = (struct vunary_parameters) {
          .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vlrelu_ukernel__neonfp16arith_x16,
          .init.f16_lrelu = xnn_init_f16_lrelu_fp16arith_params,
          .element_tile = 16,
        };
        xnn_params.f16.neg = (struct vunary_parameters) {
          .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vneg_ukernel__neonfp16arith_x16,
          .element_tile = 16,
        };
        // Rounding variants: to-nearest-even, toward zero, up (ceil), down (floor).
        xnn_params.f16.rndne = (struct vunary_parameters) {
          .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vrndne_ukernel__neonfp16arith_x16,
          .element_tile = 16,
        };
        xnn_params.f16.rndz = (struct vunary_parameters) {
          .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vrndz_ukernel__neonfp16arith_x16,
          .element_tile = 16,
        };
        xnn_params.f16.rndu = (struct vunary_parameters) {
          .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vrndu_ukernel__neonfp16arith_x16,
          .element_tile = 16,
        };
        xnn_params.f16.rndd = (struct vunary_parameters) {
          .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vrndd_ukernel__neonfp16arith_x16,
          .element_tile = 16,
        };
        xnn_params.f16.sigmoid = (struct vunary_parameters) {
          .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_x40,
          .init.f16_sigmoid = xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
          .element_tile = 40,
        };
        xnn_params.f16.sqr = (struct vunary_parameters) {
          .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vsqr_ukernel__neonfp16arith_x16,
          .element_tile = 16,
        };
        xnn_params.f16.sqrt = (struct vunary_parameters) {
          .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vsqrt_ukernel__neonfp16arith_sqrt_x8,
          .element_tile = 8,
        };
2873
        #ifndef XNN_NO_NCHW_OPERATORS
          // Kernels for the NCHW (channels-first) layout path.
          init_flags |= XNN_INIT_FLAG_CHW_OPT;

          // Sparse matrix-dense matrix multiplication: 32x1 output tile.
          xnn_params.f16.spmm = (struct spmm_parameters) {
            .ukernel = (xnn_spmm_ukernel_function) xnn_f16_spmm_minmax_ukernel_32x1__neonfp16arith,
            .mr = 32,
            .nr = 1,
          };
          // 3x3 stride-2 HWC->CHW convolution (typical first layer, 3 input channels).
          xnn_params.f16.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
            .ukernel_with_symm_padding =
                (xnn_conv_hwc2chw_ukernel_function) xnn_f16_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfp16arith_2x2,
            .output_channel_tile = 4,
            .output_height_tile = 2,
            .output_width_tile = 2,
          };
          // Depthwise 2D convolutions on CHW layout, by kernel size and stride.
          xnn_params.f16.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
            .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f16_dwconv2d_chw_ukernel_3x3p1__neonfp16arith_2x8,
            .output_width_tile = 8,
            .output_height_tile = 2,
          };
          xnn_params.f16.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
            .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4,
            .output_width_tile = 4,
            .output_height_tile = 1,
          };
          xnn_params.f16.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
            .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4,
            .output_width_tile = 4,
            .output_height_tile = 1,
          };
          xnn_params.f16.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
            .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_1x4,
            .output_width_tile = 4,
            .output_height_tile = 1,
          };
          xnn_params.f16.gavgpool_cw = (struct gavgpool_cw_parameters) {
            .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f16_gavgpool_cw_ukernel__neonfp16arith_x4,
            .channel_tile = 4,
          };
          // CHW bilinear resize: one channel at a time, 8 pixels per iteration.
          xnn_params.f16.ibilinear_chw = (struct ibilinear_chw_parameters) {
            .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f16_ibilinear_chw_ukernel__neonfp16arith_p8,
            .channel_tile = 1,
            .pixel_tile = 8,
          };
        #endif // XNN_NO_NCHW_OPERATORS
      }
    #endif // XNN_ENABLE_ARM_FP16
  #endif // XNN_NO_F16_OPERATORS
2922
2923 /**************************** F32 AArch64 micro-kernels ****************************/
  #ifndef XNN_NO_F32_OPERATORS
    init_flags |= XNN_INIT_FLAG_F32;

    #if XNN_ENABLE_ASSEMBLY && !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
      // Select F32 GEMM/IGEMM micro-kernels tuned for the big core (core 0).
      // Each case sets matched gemm/igemm kernels for the full tile plus a
      // 1-row fallback, and records the tile geometry (mr x nr, optional log2_sr).
      switch (cpuinfo_get_core(0)->uarch) {
        case cpuinfo_uarch_cortex_a72:
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75);
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.gemm.mr = 4;
          xnn_params.f32.gemm.nr = 8;
          break;
        case cpuinfo_uarch_cortex_a57:
        case cpuinfo_uarch_cortex_a75:
        case cpuinfo_uarch_cortex_a76:
        case cpuinfo_uarch_exynos_m3:
        case cpuinfo_uarch_exynos_m4:
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
          #if XNN_ENABLE_GEMM_M_SPECIALIZATION
          // Optional mid-size (4-row) specialization for small batch counts.
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75);
          #endif
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.gemm.mr = 6;
          xnn_params.f32.gemm.nr = 8;
          #if XNN_ENABLE_JIT
          xnn_params.f32.gemm.generator.gemm = xnn_init_hmp_gemm_codegen(xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75);
          xnn_params.f32.gemm.generator.igemm = xnn_init_hmp_igemm_codegen(xnn_generate_f32_igemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75);
          xnn_params.f32.gemm.generator.gemm1 = xnn_init_hmp_gemm_codegen(xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
          xnn_params.f32.gemm.generator.igemm1 = xnn_init_hmp_igemm_codegen(xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
          #endif
          break;
        case cpuinfo_uarch_exynos_m1:
        case cpuinfo_uarch_exynos_m2:
          // s4 shuffle kernels: log2_sr = 2 means weights are packed in groups of 4.
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma);
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.gemm.mr = 6;
          xnn_params.f32.gemm.nr = 8;
          xnn_params.f32.gemm.log2_sr = 2;
          break;
        case cpuinfo_uarch_cortex_a53:
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a53);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a53);
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.gemm.mr = 6;
          xnn_params.f32.gemm.nr = 8;
          break;
        case cpuinfo_uarch_cortex_a55r0:
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53);
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.gemm.mr = 6;
          xnn_params.f32.gemm.nr = 8;
          break;
        case cpuinfo_uarch_cortex_a35:
        case cpuinfo_uarch_cortex_a55:
        case cpuinfo_uarch_kryo:
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55);
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.gemm.mr = 6;
          xnn_params.f32.gemm.nr = 8;
          break;
        case cpuinfo_uarch_cortex_a73:
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73);
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.gemm.mr = 6;
          xnn_params.f32.gemm.nr = 8;
          break;
        case cpuinfo_uarch_cortex_a77:
        case cpuinfo_uarch_exynos_m5:
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75);
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.gemm.mr = 4;
          xnn_params.f32.gemm.nr = 8;
          break;
        // Recent big cores and unrecognized microarchitectures use the ld128 kernels.
        case cpuinfo_uarch_cortex_a78:
        case cpuinfo_uarch_cortex_x1:
        default:
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld128);
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.gemm.mr = 6;
          xnn_params.f32.gemm.nr = 8;
          #if XNN_ENABLE_JIT
          xnn_params.f32.gemm.generator.gemm = xnn_init_hmp_gemm_codegen(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
          xnn_params.f32.gemm.generator.igemm = xnn_init_hmp_igemm_codegen(xnn_generate_f32_igemm_ukernel_6x8__aarch64_neonfma_ld128);
          #endif
          break;
      }
      #if XNN_MAX_UARCH_TYPES > 1
      {
        /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
        // Little-core substitutes must match the big-core tile geometry exactly,
        // since the packed weights are laid out for that geometry.
        const uint32_t mr = xnn_params.f32.gemm.mr;
        const uint32_t nr = xnn_params.f32.gemm.nr;
        const uint32_t log2_sr = xnn_params.f32.gemm.log2_sr;
        for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
          const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
          if (uarch_info == NULL) {
            /* No more microarchitectures in the system */
            break;
          }

          switch (uarch_info->uarch) {
3050 case cpuinfo_uarch_cortex_a53:
3051 if (mr == 6 && nr == 8 && log2_sr == 0) {
3052 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)].function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a53;
3053 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)].function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a53;
3054 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
3055 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53;
3056 } else if (mr == 4 && nr == 8 && log2_sr == 0) {
3057 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53;
3058 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53;
3059 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
3060 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53;
3061 }
3062 break;
3063 case cpuinfo_uarch_cortex_a55r0:
3064 if (mr == 6 && nr == 8 && log2_sr == 0) {
3065 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)].function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53;
3066 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)].function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53;
3067 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
3068 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
3069 } else if (mr == 4 && nr == 8 && log2_sr == 0) {
3070 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53;
3071 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53;
3072 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
3073 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
3074 }
3075 break;
3076 case cpuinfo_uarch_cortex_a55:
3077 if (mr == 6 && nr == 8 && log2_sr == 0) {
3078 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)].function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55;
3079 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)].function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55;
3080 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
3081 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
3082 } else if (mr == 4 && nr == 8 && log2_sr == 0) {
3083 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55;
3084 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55;
3085 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
3086 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
3087 }
3088 break;
3089 default:
3090 break;
3091 }
3092 }
3093 }
3094 #endif // XNN_MAX_UARCH_TYPES > 1
      // Secondary (nr == 2) GEMM used for 3x3 Winograd-style / narrow-N cases.
      xnn_params.f32.gemm2.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75);
      xnn_params.f32.gemm2.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75);
      xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
      xnn_params.f32.gemm2.mr = 4;
      xnn_params.f32.gemm2.nr = 2;

#else // XNN_ENABLE_ASSEMBLY && !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
  // Fallback path (no per-uarch runtime selection): pick a single kernel set.
  #if XNN_ENABLE_ASSEMBLY
      // Assembly kernels tuned for Cortex-A75 serve as a generic AArch64 default.
      xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
      xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
      xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
      xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
      xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
      xnn_params.f32.gemm.mr = 6;
      xnn_params.f32.gemm.nr = 8;

      xnn_params.f32.gemm2.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75);
      xnn_params.f32.gemm2.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75);
      xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
      xnn_params.f32.gemm2.mr = 4;
      xnn_params.f32.gemm2.nr = 2;
  #else // !XNN_ENABLE_ASSEMBLY
      // No assembly available: use NEON-intrinsics kernels with the same tile shape.
      xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld64);
      xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld64);
      xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64);
      xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64);
      xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
      xnn_params.f32.gemm.mr = 6;
      xnn_params.f32.gemm.nr = 8;

      xnn_params.f32.gemm2.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__neonfma_lane_ld64);
      xnn_params.f32.gemm2.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__neonfma_lane_ld64);
      xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
      xnn_params.f32.gemm2.mr = 4;
      xnn_params.f32.gemm2.nr = 2;
  #endif // XNN_ENABLE_ASSEMBLY
#endif // XNN_ENABLE_ASSEMBLY && !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
3132
      // F32 depthwise convolution kernels, indexed by kernel size:
      // [0] 3-tap (1x3/3x1), [1] 4-tap (2x2), [2] 9-tap (3x3), [3] 25-tap (5x5).
      xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x3__neonfma;
      xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
      xnn_params.f32.dwconv[0].channel_tile = 8;
      xnn_params.f32.dwconv[0].primary_tile = 3;

      xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__neonfma;
      xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
      xnn_params.f32.dwconv[1].channel_tile = 8;
      xnn_params.f32.dwconv[1].primary_tile = 4;

    #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
      // Apple platforms: no runtime uarch probing; use the generic NEON kernel.
      xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__neonfma;
      xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
      xnn_params.f32.dwconv[2].channel_tile = 8;
      xnn_params.f32.dwconv[2].primary_tile = 9;
    #else // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
      // Pick the 3x3 dwconv kernel based on core 0's micro-architecture.
      switch (cpuinfo_get_core(0)->uarch) {
        case cpuinfo_uarch_kryo:
          xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__neonfma;
          xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.dwconv[2].channel_tile = 8;
          xnn_params.f32.dwconv[2].primary_tile = 9;
          break;
    #if XNN_ENABLE_ASSEMBLY
        case cpuinfo_uarch_cortex_a53:
        case cpuinfo_uarch_cortex_a55r0:
        case cpuinfo_uarch_cortex_a55:
          // In-order little cores: narrower channel tile, A55-scheduled assembly.
          xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x9__aarch64_neonfma_cortex_a55;
          xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.dwconv[2].channel_tile = 4;
          xnn_params.f32.dwconv[2].primary_tile = 9;
          break;
    #endif // XNN_ENABLE_ASSEMBLY
        default:
          xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__neonfma;
          xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.dwconv[2].channel_tile = 8;
          xnn_params.f32.dwconv[2].primary_tile = 9;
          break;
      }
    #endif // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
3179
      // F32 pooling parameters. The *_tile fields mirror the kernel names:
      // e.g. 9p8x = primary tile of 9 elements plus incremental passes of 8.
      xnn_params.f32.avgpool = (struct avgpool_parameters) {
        .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__neon_c4,
        .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__neon_c4,
        .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
        .primary_tile = 9,
        .incremental_tile = 8,
        .channel_tile = 4,
      };
      // Pixelwise average pooling (per-pixel scale, used for padded pooling).
      xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
        .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__neon_c4,
        .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__neon_c4,
        .init.f32 = xnn_init_f32_minmax_scalar_params,
        .primary_tile = 9,
        .incremental_tile = 8,
        .channel_tile = 4,
      };
      // Global average pooling (reduces full rows; 7p7x = 7 rows per pass).
      xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
        .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4,
        .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4,
        .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
        .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
        .row_tile = 7,
        .channel_tile = 4,
      };
      xnn_params.f32.maxpool = (struct maxpool_parameters) {
        .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__neon_c4,
        .init.f32 = xnn_init_f32_minmax_scalar_params,
        .mr = 9,
        .qr = 8,
      };
      // Argmax pooling: [0]/[1] are unipass variants for <=4 and <=9 elements,
      // [2] is the multipass variant for larger pooling windows.
      xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
        .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__neon_c4,
        .mr = 4,
      };
      xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
        .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__neon_c4,
        .mr = 9,
      };
      xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
        .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__neon_c4,
        .mr = 9,
        .qr = 8,
      };
      // Indirect bilinear interpolation (resize) kernel.
      xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
        .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__neonfma_c8,
        .pixel_tile = 1,
        .channel_tile = 8,
      };
      // F32 elementwise unary kernels; element_tile matches the _xN kernel suffix.
      xnn_params.f32.abs = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vabs_ukernel__neon_x8,
        .element_tile = 8,
      };
      xnn_params.f32.clamp = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vclamp_ukernel__neon_x8,
        .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
        .element_tile = 8,
      };
      xnn_params.f32.elu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_velu_ukernel__neonfma_rr1_lut16_p3_x16,
        .init.f32_elu = xnn_init_f32_elu_neonfma_rr1_lut16_p3_params,
        .element_tile = 16,
      };
      xnn_params.f32.hswish = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vhswish_ukernel__neon_x16,
        .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
        .element_tile = 16,
      };
      xnn_params.f32.lrelu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vlrelu_ukernel__neon_x8,
        .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
        .element_tile = 8,
      };
      xnn_params.f32.neg = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vneg_ukernel__neon_x8,
        .element_tile = 8,
      };
      // Rounding kernels use ARMv8 rounding instructions (neonv8 variants).
      xnn_params.f32.rndne = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndne_ukernel__neonv8_x8,
        .element_tile = 8,
      };
      xnn_params.f32.rndz = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndz_ukernel__neonv8_x8,
        .element_tile = 8,
      };
      xnn_params.f32.rndu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndu_ukernel__neonv8_x8,
        .element_tile = 8,
      };
      xnn_params.f32.rndd = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndd_ukernel__neonv8_x8,
        .element_tile = 8,
      };
      xnn_params.f32.sigmoid = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_x16,
        .init.f32_sigmoid = xnn_init_f32_sigmoid_neonfma_rr1_lut64_p2_params,
        .element_tile = 16,
      };
      xnn_params.f32.sqr = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsqr_ukernel__neon_x8,
        .element_tile = 8,
      };
      xnn_params.f32.sqrt = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsqrt_ukernel__neon_sqrt_x4,
        .element_tile = 4,
      };
      xnn_params.f32.prelu = (struct prelu_parameters) {
        .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__neon_2x8,
        .row_tile = 2,
        .channel_tile = 8,
      };
      // Softmax building block: computes sum of exp(x - max) while storing exp values.
      xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
        .ukernel = (xnn_raddstoreexpminusmax_ukernel_function) xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_x16,
        .init.f32 = xnn_init_f32_expminus_neonfma_rr1_lut64_p2_params,
        .element_tile = 16,
      };
      xnn_params.f32.rmax = (xnn_rmax_ukernel_function) xnn_f32_rmax_ukernel__neon;
      // F32 elementwise binary kernels. op_ukernel handles tensor-tensor,
      // opc_ukernel tensor-constant, ropc_ukernel constant-tensor (reversed
      // operands); commutative ops reuse opc for ropc.
      xnn_params.f32.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__neon_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
        .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
        .element_tile = 8,
      };
      xnn_params.f32.vdiv = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__neon_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__neon_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__neon_x8,
        .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
        .element_tile = 8,
      };
      xnn_params.f32.vmax = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__neon_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
        .element_tile = 8,
      };
      xnn_params.f32.vmin = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__neon_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
        .element_tile = 8,
      };
      xnn_params.f32.vmul = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__neon_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
        .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
        .element_tile = 8,
      };
      xnn_params.f32.vsub = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__neon_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__neon_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__neon_x8,
        .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
        .element_tile = 8,
      };
      xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__neon_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
        .element_tile = 8,
      };
      // Fused multiply-add with per-channel constants (scale * x + bias).
      xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
        .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__neonfma_2x,
        .init.f32 = xnn_init_f32_minmax_scalar_params,
        .channel_tile = 4,
        .row_tile = 2,
      };
    #ifndef XNN_NO_NCHW_OPERATORS
      // NCHW ("CHW layout") operators: sparse GEMM and direct CHW convolutions.
      init_flags |= XNN_INIT_FLAG_CHW_OPT;

      // Sparse matrix-dense matrix multiplication; spmm/spmm2/spmm4 differ in
      // the output-channel block size (nr = 1/2/4).
      xnn_params.f32.spmm = (struct spmm_parameters) {
        .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__neonfma_pipelined,
        .mr = 32,
        .nr = 1,
      };
      xnn_params.f32.spmm2 = (struct spmm_parameters) {
        .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x2__neonfma,
        .mr = 32,
        .nr = 2,
      };
      xnn_params.f32.spmm4 = (struct spmm_parameters) {
        .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x4__neonfma,
        .mr = 32,
        .nr = 4,
      };
      // HWC->CHW 3x3 stride-2 input convolution (typical first layer, 3 channels).
      xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
        .ukernel_with_symm_padding =
          (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2,
        .output_channel_tile = 4,
        .output_height_tile = 2,
        .output_width_tile = 2,
      };
      // Direct CHW depthwise convolutions for 3x3/5x5, stride 1 and 2.
      xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_3x4,
        .output_width_tile = 4,
        .output_height_tile = 3,
      };
      xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_2x4_acc2,
        .output_width_tile = 4,
        .output_height_tile = 2,
      };
      xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4,
        .output_width_tile = 4,
        .output_height_tile = 4,
      };
      xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc2,
        .output_width_tile = 4,
        .output_height_tile = 1,
      };
      xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
        .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__neon_x4,
        .channel_tile = 4,
      };
      xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
        .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__neonfma_p8,
        .channel_tile = 1,
        .pixel_tile = 8,
      };
    #endif // XNN_NO_NCHW_OPERATORS
  #endif // XNN_NO_F32_OPERATORS
3404
3405 /*************************** VCVT AArch64 micro-kernels ***************************/
3406 #ifndef XNN_NO_VCVT_OPERATORS
3407 init_flags |= XNN_INIT_FLAG_VCVT;
3408
3409 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
3410 .ukernel = (xnn_vunary_ukernel_function) xnn_f16_f32_vcvt_ukernel__neonfp16_x16,
3411 .element_tile = 16,
3412 };
3413 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
3414 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_f16_vcvt_ukernel__neonfp16_x16,
3415 .element_tile = 16,
3416 };
3417 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
3418 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qs8_vcvt_ukernel__neonv8_x32,
3419 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_neonv8_params,
3420 .element_tile = 32,
3421 };
3422 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
3423 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qu8_vcvt_ukernel__neonv8_x32,
3424 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_neonv8_params,
3425 .element_tile = 32,
3426 };
3427 xnn_params.vcvt.qs8 = (struct vunary_parameters) {
3428 .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vcvt_ukernel__neon_x32,
3429 .init.qs8_cvt = xnn_init_qs8_cvt_neon_params,
3430 .element_tile = 32,
3431 };
3432 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
3433 .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_f32_vcvt_ukernel__neon_x32,
3434 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_neon_params,
3435 .element_tile = 32,
3436 };
3437 xnn_params.vcvt.qu8 = (struct vunary_parameters) {
3438 .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vcvt_ukernel__neon_x32,
3439 .init.qu8_cvt = xnn_init_qu8_cvt_neon_params,
3440 .element_tile = 32,
3441 };
3442 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
3443 .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_f32_vcvt_ukernel__neon_x32,
3444 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_neon_params,
3445 .element_tile = 32,
3446 };
3447 #endif // XNN_NO_VCVT_OPERATORS
3448
3449 /**************************** X32 AArch64 micro-kernels ****************************/
3450 #ifndef XNN_NO_X32_OPERATORS
3451 init_flags |= XNN_INIT_FLAG_X32;
3452
3453 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__neon;
3454 xnn_params.x32.zip = (struct zip_parameters) {
3455 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__neon,
3456 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__neon,
3457 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__neon,
3458 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__neon,
3459 };
3460
3461 xnn_params.x32.transpose = (struct transpose_parameters) {
3462 .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x32_transposec_ukernel__4x4_aarch64_neon_tbl,
3463 .tile_size = 32,
3464 };
3465 #endif // XNN_NO_X32_OPERATORS
3466
3467 /**************************** XX AArch64 micro-kernels ****************************/
3468 #ifndef XNN_NO_XX_OPERATORS
3469 init_flags |= XNN_INIT_FLAG_XX;
3470
3471 xnn_params.xx.copy = (xnn_vunary_ukernel_function) xnn_xx_copy_ukernel__memcpy;
3472 xnn_params.xx.fill = (struct fill_parameters) {
3473 .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__neon_x64,
3474 .row_tile = 1,
3475 };
3476 xnn_params.xx.pad = (struct pad_parameters) {
3477 .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__neon,
3478 .row_tile = 1,
3479 };
3480 xnn_params.xx.transpose = (struct transpose_parameters) {
3481 .variable_size_ukernel = xnn_xx_transposev_ukernel__1x1_memcpy,
3482 .tile_size = 32,
3483 };
3484 #endif
3485
#elif XNN_ARCH_X86 || XNN_ARCH_X86_64
  // SSE2 is the baseline ISA for all x86 micro-kernels; without it no kernels
  // can be installed, so initialization aborts (init_flags stays unset).
  if (!cpuinfo_has_x86_sse2()) {
    xnn_log_error("XNNPACK initialization failed: SSE2 is not supported");
    return;
  }
3491
3492 /**************************** QC8 x86 micro-kernels ****************************/
3493 #ifndef XNN_NO_QC8_OPERATORS
3494 init_flags |= XNN_INIT_FLAG_QC8;
3495
3496 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
3497 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx);
3498 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx);
3499 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx);
3500 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx);
3501 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_avx512_params;
3502 xnn_params.qc8.gemm.mr = 4;
3503 xnn_params.qc8.gemm.nr = 16;
3504 xnn_params.qc8.gemm.log2_kr = 3;
3505 } else if (cpuinfo_has_x86_xop()) {
3506 // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
3507 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
3508 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
3509 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3510 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3511 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse4_params;
3512 xnn_params.qc8.gemm.mr = 2;
3513 xnn_params.qc8.gemm.nr = 4;
3514 xnn_params.qc8.gemm.log2_kr = 3;
3515 } else if (cpuinfo_has_x86_avx2()) {
3516 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2);
3517 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_3x8c8__avx2);
3518 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2);
3519 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__avx2);
3520 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_avx2_params;
3521 xnn_params.qc8.gemm.mr = 3;
3522 xnn_params.qc8.gemm.nr = 8;
3523 xnn_params.qc8.gemm.log2_kr = 3;
3524 } else if (cpuinfo_has_x86_avx()) {
3525 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
3526 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
3527 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
3528 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
3529 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse4_params;
3530 xnn_params.qc8.gemm.mr = 2;
3531 xnn_params.qc8.gemm.nr = 4;
3532 xnn_params.qc8.gemm.log2_kr = 3;
3533 } else if (cpuinfo_has_x86_sse4_1()) {
3534 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
3535 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
3536 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
3537 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
3538 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse4_params;
3539 xnn_params.qc8.gemm.mr = 3;
3540 xnn_params.qc8.gemm.nr = 4;
3541 xnn_params.qc8.gemm.log2_kr = 3;
3542 } else {
3543 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
3544 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
3545 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
3546 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
3547 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse2_params;
3548 xnn_params.qc8.gemm.mr = 3;
3549 xnn_params.qc8.gemm.nr = 4;
3550 xnn_params.qc8.gemm.log2_kr = 3;
3551 }
3552
    // QC8 depthwise convolution kernels, same ISA priority as the GEMM selection
    // above: [0] 3-tap, [1] 9-tap, [2] 25-tap.
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
      xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up32x3__avx512skx_mul32;
      xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_avx512_params;
      xnn_params.qc8.dwconv[0].channel_tile = 32;
      xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up32x9__avx512skx_mul32;
      xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_avx512_params;
      xnn_params.qc8.dwconv[1].channel_tile = 32;
      xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up32x25__avx512skx_mul32;
      xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_avx512_params;
      xnn_params.qc8.dwconv[2].channel_tile = 32;
    } else if (cpuinfo_has_x86_xop()) {
      // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
      xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x3__xop_mul16_add16;
      xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse4_params;
      xnn_params.qc8.dwconv[0].channel_tile = 16;
      xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul16_add16;
      xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse4_params;
      xnn_params.qc8.dwconv[1].channel_tile = 16;
      xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul16_add16;
      xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse4_params;
      xnn_params.qc8.dwconv[2].channel_tile = 16;
    } else if (cpuinfo_has_x86_avx2()) {
      xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x3__avx2_mul32;
      xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_avx2_params;
      xnn_params.qc8.dwconv[0].channel_tile = 16;
      xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32;
      xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_avx2_params;
      xnn_params.qc8.dwconv[1].channel_tile = 16;
      xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__avx2_mul32;
      xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_avx2_params;
      xnn_params.qc8.dwconv[2].channel_tile = 16;
    } else if (cpuinfo_has_x86_avx()) {
      xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x3__avx_mul16_add16;
      xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse4_params;
      xnn_params.qc8.dwconv[0].channel_tile = 16;
      xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16_add16;
      xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse4_params;
      xnn_params.qc8.dwconv[1].channel_tile = 16;
      xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul16_add16;
      xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse4_params;
      xnn_params.qc8.dwconv[2].channel_tile = 16;
    } else if (cpuinfo_has_x86_sse4_1()) {
      xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__sse41_mul16;
      xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse4_params;
      xnn_params.qc8.dwconv[0].channel_tile = 8;
      xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16;
      xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse4_params;
      xnn_params.qc8.dwconv[1].channel_tile = 8;
      xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul16;
      xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse4_params;
      xnn_params.qc8.dwconv[2].channel_tile = 8;
3604 } else if (cpuinfo_has_x86_sse2()) {
3605 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__sse2_mul16;
3606 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse2_params;
3607 xnn_params.qc8.dwconv[0].channel_tile = 8;
3608 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16;
3609 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse2_params;
3610 xnn_params.qc8.dwconv[1].channel_tile = 8;
3611 xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__sse2_mul16;
3612 xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse2_params;
3613 xnn_params.qc8.dwconv[2].channel_tile = 8;
3614 }
3615 xnn_params.qc8.dwconv[0].primary_tile = 3;
3616 xnn_params.qc8.dwconv[1].primary_tile = 9;
3617 xnn_params.qc8.dwconv[2].primary_tile = 25;
3618 #endif // XNN_NO_QC8_OPERATORS
3619
3620 /**************************** QS8 x86 micro-kernels ****************************/
3621 #ifndef XNN_NO_QS8_OPERATORS
    init_flags |= XNN_INIT_FLAG_QS8;

    // Select qs8 GEMM/IGEMM microkernels by descending ISA level; each branch
    // installs the max-MR and MR=1 kernel pair plus matching packing geometry (mr/nr/log2_kr).
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
      xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx);
      xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx);
      xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx);
      xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx);
      xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx512_params;
      xnn_params.qs8.gemm.mr = 4;
      xnn_params.qs8.gemm.nr = 16;
      xnn_params.qs8.gemm.log2_kr = 3;
    } else if (cpuinfo_has_x86_xop()) {
      // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
      xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
      xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
      xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
      xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
      xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
      xnn_params.qs8.gemm.mr = 2;
      xnn_params.qs8.gemm.nr = 4;
      xnn_params.qs8.gemm.log2_kr = 3;
    } else if (cpuinfo_has_x86_avx2()) {
      xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_3x8c8__avx2);
      xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_3x8c8__avx2);
      xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__avx2);
      xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__avx2);
      xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx2_params;
      xnn_params.qs8.gemm.mr = 3;
      xnn_params.qs8.gemm.nr = 8;
      xnn_params.qs8.gemm.log2_kr = 3;
    } else if (cpuinfo_has_x86_avx()) {
      xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
      xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
      xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
      xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
      xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
      xnn_params.qs8.gemm.mr = 2;
      xnn_params.qs8.gemm.nr = 4;
      xnn_params.qs8.gemm.log2_kr = 3;
    } else if (cpuinfo_has_x86_sse4_1()) {
      xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
      xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
      xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
      xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
      xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
      xnn_params.qs8.gemm.mr = 3;
      xnn_params.qs8.gemm.nr = 4;
      xnn_params.qs8.gemm.log2_kr = 3;
    } else {
      // SSE2 baseline: always available on x86-64.
      xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
      xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
      xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
      xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
      xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse2_params;
      xnn_params.qs8.gemm.mr = 3;
      xnn_params.qs8.gemm.nr = 4;
      xnn_params.qs8.gemm.log2_kr = 3;
    }
3680
    // Select qs8 depthwise-conv microkernels: slot 0 = 9-tap (3x3), slot 1 = 25-tap (5x5).
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx512skx_mul32;
      xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx512_params;
      xnn_params.qs8.dwconv[0].channel_tile = 32;
      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up32x25__avx512skx_mul32;
      xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx512_params;
      xnn_params.qs8.dwconv[1].channel_tile = 32;
    } else if (cpuinfo_has_x86_xop()) {
      // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul16_add16;
      xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
      xnn_params.qs8.dwconv[0].channel_tile = 16;
      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul16_add16;
      xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
      xnn_params.qs8.dwconv[1].channel_tile = 16;
    } else if (cpuinfo_has_x86_avx2()) {
      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32;
      xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx2_params;
      xnn_params.qs8.dwconv[0].channel_tile = 16;
      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x25__avx2_mul32;
      xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx2_params;
      xnn_params.qs8.dwconv[1].channel_tile = 16;
    } else if (cpuinfo_has_x86_avx()) {
      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16_add16;
      xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
      xnn_params.qs8.dwconv[0].channel_tile = 16;
      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul16_add16;
      xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
      xnn_params.qs8.dwconv[1].channel_tile = 16;
    } else if (cpuinfo_has_x86_sse4_1()) {
      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16_add16;
      xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
      xnn_params.qs8.dwconv[0].channel_tile = 8;
      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul16_add16;
      xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
      xnn_params.qs8.dwconv[1].channel_tile = 8;
    } else if (cpuinfo_has_x86_sse2()) {
      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16_add16;
      xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse2_params;
      xnn_params.qs8.dwconv[0].channel_tile = 8;
      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up8x25__sse2_mul16_add16;
      xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse2_params;
      xnn_params.qs8.dwconv[1].channel_tile = 8;
    }
    // primary_tile = number of filter taps, matching the upNx9/upNx25 kernel variants above.
    xnn_params.qs8.dwconv[0].primary_tile = 9;
    xnn_params.qs8.dwconv[1].primary_tile = 25;
3727
    // Global average pooling: SSE4.1 variant when available, otherwise SSE2 baseline.
    // Both use a 7-row pass with an 8-channel tile.
    if (cpuinfo_has_x86_sse4_1()) {
      xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
        .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8,
        .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8,
        .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_sse4_params,
        .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_sse4_params,
        .row_tile = 7,
        .channel_tile = 8,
      };
    } else {
      xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
        .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8,
        .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8,
        .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_sse2_params,
        .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_sse2_params,
        .row_tile = 7,
        .channel_tile = 8,
      };
    }
3747
    // Select qs8 elementwise-add microkernels (vector+vector, vector+scalar, and the
    // reversed-scalar form, which reuses the opc kernel since addition is commutative).
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
      xnn_params.qs8.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__avx512skx_mul32_ld128_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16,
        .init.qs8_add = xnn_init_qs8_add_minmax_avx512_params,
        .element_tile = 16,
      };
    } else if (cpuinfo_has_x86_xop()) {
      xnn_params.qs8.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__xop_mul32_ld32_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
        .init.qs8_add = xnn_init_qs8_add_minmax_sse4_mul32_params,
        .element_tile = 8,
      };
    } else if (cpuinfo_has_x86_avx2()) {
      xnn_params.qs8.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__avx2_mul32_ld64_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16,
        .init.qs8_add = xnn_init_qs8_add_minmax_avx2_params,
        .element_tile = 16,
      };
    } else if (cpuinfo_has_x86_avx()) {
      xnn_params.qs8.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__avx_mul32_ld32_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x8,
        .init.qs8_add = xnn_init_qs8_add_minmax_sse4_mul32_params,
        .element_tile = 8,
      };
    } else if (cpuinfo_has_x86_sse4_1()) {
      xnn_params.qs8.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
        .init.qs8_add = xnn_init_qs8_add_minmax_sse4_mul16_params,
        .element_tile = 8,
      };
    } else {
      xnn_params.qs8.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
        .init.qs8_add = xnn_init_qs8_add_minmax_sse2_params,
        .element_tile = 8,
      };
    }
    // Select qs8 elementwise-multiply microkernels; ropc reuses opc (multiplication is commutative).
    if (cpuinfo_has_x86_avx()) {
      xnn_params.qs8.vmul = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__avx_mul16_ld64_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16,
        .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_sse4_params,
        .element_tile = 16,
      };
    } else if (cpuinfo_has_x86_sse4_1()) {
      xnn_params.qs8.vmul = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
        .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_sse4_params,
        .element_tile = 16,
      };
    } else {
      xnn_params.qs8.vmul = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
        .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_sse2_params,
        .element_tile = 8,
      };
    }
3822
3823 if (cpuinfo_has_x86_avx2()) {
3824 xnn_params.qs8.lrelu = (struct vunary_parameters) {
3825 .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vlrelu_ukernel__avx2_x32,
3826 .init.qs8_lrelu = xnn_init_qs8_lrelu_avx2_params,
3827 .element_tile = 32,
3828 };
3829 } else if (cpuinfo_has_x86_avx()) {
3830 xnn_params.qs8.lrelu = (struct vunary_parameters) {
3831 .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vlrelu_ukernel__avx_x32,
3832 .init.qs8_lrelu = xnn_init_qs8_lrelu_avx_params,
3833 .element_tile = 32,
3834 };
3835 } else if (cpuinfo_has_x86_sse4_1()) {
3836 xnn_params.qs8.lrelu = (struct vunary_parameters) {
3837 .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vlrelu_ukernel__sse41_x32,
3838 .init.qs8_lrelu = xnn_init_qs8_lrelu_sse2_params,
3839 .element_tile = 32,
3840 };
3841 } else if (cpuinfo_has_x86_sse4_1()) {
3842 xnn_params.qs8.lrelu = (struct vunary_parameters) {
3843 .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vlrelu_ukernel__ssse3_x32,
3844 .init.qs8_lrelu = xnn_init_qs8_lrelu_sse2_params,
3845 .element_tile = 32,
3846 };
3847 } else {
3848 xnn_params.qs8.lrelu = (struct vunary_parameters) {
3849 .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vlrelu_ukernel__sse2_x32,
3850 .init.qs8_lrelu = xnn_init_qs8_lrelu_sse2_params,
3851 .element_tile = 32,
3852 };
3853 }
3854 #endif // XNN_NO_QS8_OPERATORS
3855
3856 /**************************** QU8 x86 micro-kernels ****************************/
3857 #ifndef XNN_NO_QU8_OPERATORS
    init_flags |= XNN_INIT_FLAG_QU8;

    // Select qu8 GEMM/IGEMM microkernels by descending ISA level; each branch
    // installs the max-MR and MR=1 kernel pair plus matching packing geometry (mr/nr/log2_kr).
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
      xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx);
      xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx);
      xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx);
      xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx);
      xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx512_params;
      xnn_params.qu8.gemm.mr = 4;
      xnn_params.qu8.gemm.nr = 16;
      xnn_params.qu8.gemm.log2_kr = 3;
    } else if (cpuinfo_has_x86_xop()) {
      // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
      xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
      xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
      xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
      xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
      xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
      xnn_params.qu8.gemm.mr = 2;
      xnn_params.qu8.gemm.nr = 4;
      xnn_params.qu8.gemm.log2_kr = 3;
    } else if (cpuinfo_has_x86_avx2()) {
      xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x8c8__avx2);
      xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x8c8__avx2);
      xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2);
      xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x8c8__avx2);
      xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx2_params;
      xnn_params.qu8.gemm.mr = 3;
      xnn_params.qu8.gemm.nr = 8;
      xnn_params.qu8.gemm.log2_kr = 3;
    } else if (cpuinfo_has_x86_avx()) {
      xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
      xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
      xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
      xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
      xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
      xnn_params.qu8.gemm.mr = 2;
      xnn_params.qu8.gemm.nr = 4;
      xnn_params.qu8.gemm.log2_kr = 3;
    } else if (cpuinfo_has_x86_sse4_1()) {
      xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
      xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
      xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
      xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
      xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
      xnn_params.qu8.gemm.mr = 3;
      xnn_params.qu8.gemm.nr = 4;
      xnn_params.qu8.gemm.log2_kr = 3;
    } else {
      // SSE2 baseline: always available on x86-64.
      xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
      xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
      xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
      xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
      xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
      xnn_params.qu8.gemm.mr = 3;
      xnn_params.qu8.gemm.nr = 4;
      xnn_params.qu8.gemm.log2_kr = 3;
    }
3916
    // Select qu8 depthwise-conv microkernels: slot 0 = 9-tap (3x3), slot 1 = 25-tap (5x5).
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
      xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up32x9__avx512skx_mul32;
      xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx512_params;
      xnn_params.qu8.dwconv[0].channel_tile = 32;
      xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up32x25__avx512skx_mul32;
      xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx512_params;
      xnn_params.qu8.dwconv[1].channel_tile = 32;
    } else if (cpuinfo_has_x86_xop()) {
      // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
      xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul32;
      xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
      xnn_params.qu8.dwconv[0].channel_tile = 16;
      xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul32;
      xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
      xnn_params.qu8.dwconv[1].channel_tile = 16;
    } else if (cpuinfo_has_x86_avx2()) {
      xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32;
      xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx2_params;
      xnn_params.qu8.dwconv[0].channel_tile = 16;
      xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx2_mul32;
      xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx2_params;
      xnn_params.qu8.dwconv[1].channel_tile = 16;
    } else if (cpuinfo_has_x86_avx()) {
      xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16;
      xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
      xnn_params.qu8.dwconv[0].channel_tile = 16;
      xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul16;
      xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
      xnn_params.qu8.dwconv[1].channel_tile = 16;
    } else if (cpuinfo_has_x86_sse4_1()) {
      xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16;
      xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
      xnn_params.qu8.dwconv[0].channel_tile = 8;
      xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul16;
      xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
      xnn_params.qu8.dwconv[1].channel_tile = 8;
    } else if (cpuinfo_has_x86_sse2()) {
      xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16;
      xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
      xnn_params.qu8.dwconv[0].channel_tile = 8;
      xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse2_mul16;
      xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
      xnn_params.qu8.dwconv[1].channel_tile = 8;
    }
    // primary_tile = number of filter taps, matching the upNx9/upNx25 kernel variants above.
    xnn_params.qu8.dwconv[0].primary_tile = 9;
    xnn_params.qu8.dwconv[1].primary_tile = 25;
3963
    // Local average pooling: single SSE2 variant (no higher-ISA alternative here).
    xnn_params.qu8.avgpool = (struct avgpool_parameters) {
      .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__sse2_c8,
      .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__sse2_c8,
      .init.qu8 = xnn_init_qu8_avgpool_minmax_sse2_params,
      .primary_tile = 9,
      .incremental_tile = 8,
      .channel_tile = 8,
    };
    // Global average pooling: SSE4.1 when available, otherwise SSE2 baseline.
    if (cpuinfo_has_x86_sse4_1()) {
      xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
        .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8,
        .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8,
        .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_sse4_params,
        .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_sse4_params,
        .row_tile = 7,
        .channel_tile = 8,
      };
    } else {
      xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
        .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8,
        .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8,
        .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_sse2_params,
        .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_sse2_params,
        .row_tile = 7,
        .channel_tile = 8,
      };
    }
3991
    // Select qu8 elementwise-add microkernels; ropc reuses opc (addition is commutative).
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
      xnn_params.qu8.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__avx512skx_mul32_ld128_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16,
        .init.qu8_add = xnn_init_qu8_add_minmax_avx512_params,
        .element_tile = 16,
      };
    } else if (cpuinfo_has_x86_xop()) {
      xnn_params.qu8.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__xop_mul32_ld32_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
        .init.qu8_add = xnn_init_qu8_add_minmax_sse4_params,
        .element_tile = 8,
      };
    } else if (cpuinfo_has_x86_avx2()) {
      xnn_params.qu8.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__avx2_mul32_ld64_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16,
        .init.qu8_add = xnn_init_qu8_add_minmax_avx2_params,
        .element_tile = 16,
      };
    } else if (cpuinfo_has_x86_avx()) {
      xnn_params.qu8.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__avx_mul32_ld32_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx_mul32_ld32_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx_mul32_ld32_x8,
        .init.qu8_add = xnn_init_qu8_add_minmax_sse4_params,
        .element_tile = 8,
      };
    } else if (cpuinfo_has_x86_sse4_1()) {
      xnn_params.qu8.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__sse41_mul16_ld64_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
        .init.qu8_add = xnn_init_qu8_add_minmax_sse2_params,
        .element_tile = 8,
      };
    } else {
      xnn_params.qu8.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__sse2_mul16_ld64_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
        .init.qu8_add = xnn_init_qu8_add_minmax_sse2_params,
        .element_tile = 8,
      };
    }
4041 if (cpuinfo_has_x86_avx()) {
4042 xnn_params.qu8.vmul = (struct vbinary_parameters) {
4043 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__avx_mul16_ld64_x16,
4044 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16,
4045 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16,
4046 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_sse2_params,
4047 .element_tile = 16,
4048 };
4049 } else if (cpuinfo_has_x86_sse4_1()) {
4050 xnn_params.qu8.vmul = (struct vbinary_parameters) {
4051 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
4052 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
4053 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
4054 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_sse2_params,
4055 .element_tile = 16,
4056 };
4057 } else {
4058 xnn_params.qu8.vmul = (struct vbinary_parameters) {
4059 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
4060 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
4061 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
4062 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_sse2_params,
4063 .element_tile = 8,
4064 };
4065 }
4066
4067 if (cpuinfo_has_x86_avx2()) {
4068 xnn_params.qu8.lrelu = (struct vunary_parameters) {
4069 .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vlrelu_ukernel__avx2_x32,
4070 .init.qu8_lrelu = xnn_init_qu8_lrelu_avx2_params,
4071 .element_tile = 32,
4072 };
4073 } else if (cpuinfo_has_x86_avx()) {
4074 xnn_params.qu8.lrelu = (struct vunary_parameters) {
4075 .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vlrelu_ukernel__avx_x32,
4076 .init.qu8_lrelu = xnn_init_qu8_lrelu_avx_params,
4077 .element_tile = 32,
4078 };
4079 } else if (cpuinfo_has_x86_sse4_1()) {
4080 xnn_params.qu8.lrelu = (struct vunary_parameters) {
4081 .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vlrelu_ukernel__sse41_x32,
4082 .init.qu8_lrelu = xnn_init_qu8_lrelu_sse2_params,
4083 .element_tile = 32,
4084 };
4085 } else if (cpuinfo_has_x86_sse4_1()) {
4086 xnn_params.qu8.lrelu = (struct vunary_parameters) {
4087 .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vlrelu_ukernel__ssse3_x32,
4088 .init.qu8_lrelu = xnn_init_qu8_lrelu_sse2_params,
4089 .element_tile = 32,
4090 };
4091 } else {
4092 xnn_params.qu8.lrelu = (struct vunary_parameters) {
4093 .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vlrelu_ukernel__sse2_x32,
4094 .init.qu8_lrelu = xnn_init_qu8_lrelu_sse2_params,
4095 .element_tile = 32,
4096 };
4097 }
4098 #endif // XNN_NO_QU8_OPERATORS
4099
4100 /**************************** U8 x86 micro-kernels ****************************/
4101 #ifndef XNN_NO_S8_OPERATORS
4102 init_flags |= XNN_INIT_FLAG_S8;
4103
4104 if (cpuinfo_has_x86_sse4_1()) {
4105 xnn_params.s8.clamp = (struct vunary_parameters) {
4106 .ukernel = (xnn_vunary_ukernel_function) xnn_s8_vclamp_ukernel__sse41_x64,
4107 .init.s8_minmax = xnn_init_s8_minmax_sse4_params,
4108 .element_tile = 64,
4109 };
4110 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
4111 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__sse41_c16,
4112 .pixel_tile = 1,
4113 .channel_tile = 16,
4114 };
4115 xnn_params.s8.maxpool = (struct maxpool_parameters) {
4116 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__sse41_c16,
4117 .init.s8 = xnn_init_s8_minmax_sse4_params,
4118 .mr = 9,
4119 .qr = 8,
4120 };
4121 } else {
4122 xnn_params.s8.clamp = (struct vunary_parameters) {
4123 .ukernel = (xnn_vunary_ukernel_function) xnn_s8_vclamp_ukernel__sse2_x64,
4124 .init.s8_minmax = xnn_init_s8_minmax_sse2_params,
4125 .element_tile = 64,
4126 };
4127 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
4128 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__sse2_c8,
4129 .pixel_tile = 1,
4130 .channel_tile = 8,
4131 };
4132 xnn_params.s8.maxpool = (struct maxpool_parameters) {
4133 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__sse2_c16,
4134 .init.s8 = xnn_init_s8_minmax_sse2_params,
4135 .mr = 9,
4136 .qr = 8,
4137 };
4138 }
4139 #endif // XNN_NO_S8_OPERATORS
4140
4141 /**************************** U8 x86 micro-kernels ****************************/
4142 #ifndef XNN_NO_U8_OPERATORS
4143 init_flags |= XNN_INIT_FLAG_U8;
4144
4145 xnn_params.u8.clamp = (struct vunary_parameters) {
4146 .ukernel = (xnn_vunary_ukernel_function) xnn_u8_vclamp_ukernel__sse2_x64,
4147 .init.u8_minmax = xnn_init_u8_minmax_sse2_params,
4148 .element_tile = 64,
4149 };
4150 if (cpuinfo_has_x86_sse4_1()) {
4151 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
4152 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__sse41_c16,
4153 .pixel_tile = 1,
4154 .channel_tile = 16,
4155 };
4156 } else {
4157 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
4158 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__sse2_c8,
4159 .pixel_tile = 1,
4160 .channel_tile = 8,
4161 };
4162 }
4163 xnn_params.u8.maxpool = (struct maxpool_parameters) {
4164 .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__sse2_c16,
4165 .init.u8 = xnn_init_u8_minmax_sse2_params,
4166 .mr = 9,
4167 .qr = 8,
4168 };
4169 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
4170 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__sse2;
4171 #endif // XNN_NO_U8_OPERATORS
4172
4173 /**************************** X8 x86 micro-kernels ****************************/
4174 #ifndef XNN_NO_X8_OPERATORS
4175 init_flags |= XNN_INIT_FLAG_X8;
4176
4177 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
4178 xnn_params.x8.lut = xnn_x8_lut_ukernel__avx512skx_vpshufb_x64;
4179 } else if (cpuinfo_has_x86_avx2()) {
4180 xnn_params.x8.lut = xnn_x8_lut_ukernel__avx2_x128;
4181 } else if (cpuinfo_has_x86_avx()) {
4182 xnn_params.x8.lut = xnn_x8_lut_ukernel__avx_x64;
4183 } else {
4184 // Note: SSSE3 version is usually slower than scalar
4185 xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
4186 }
4187 xnn_params.x8.zip = (struct zip_parameters) {
4188 .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__sse2,
4189 .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__sse2,
4190 .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__sse2,
4191 .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__sse2,
4192 };
4193
4194 xnn_params.x8.transpose = (struct transpose_parameters) {
4195 .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x8_transposec_ukernel__16x16_reuse_mov_sse2,
4196 .tile_size = 32,
4197 };
4198 #endif // XNN_NO_X8_OPERATORS
4199
4200
4201 /**************************** X16 x86 micro-kernels ****************************/
4202 #ifndef XNN_NO_X16_OPERATORS
4203 init_flags |= XNN_INIT_FLAG_X16;
4204
4205 xnn_params.x16.transpose = (struct transpose_parameters) {
4206 .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x16_transposec_ukernel__8x8_reuse_multi_sse2,
4207 .tile_size = 32,
4208 };
4209 #endif // XNN_NO_X16_OPERATORS
4210
4211 /**************************** F16 x86 micro-kernels ****************************/
4212 #ifndef XNN_NO_F16_OPERATORS
4213 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx2()) {
4214 init_flags |= XNN_INIT_FLAG_F16;
4215
4216 xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast);
4217 xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast);
4218 xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast);
4219 xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast);
4220 xnn_params.f16.gemm.init.f16 = xnn_init_f16_minmax_avx_params;
4221 xnn_params.f16.gemm.mr = 4;
4222 xnn_params.f16.gemm.nr = 16;
4223
4224 xnn_params.f16.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x3__fma3;
4225 xnn_params.f16.dwconv[0].init.f16 = xnn_init_f16_minmax_avx_params;
4226 xnn_params.f16.dwconv[0].channel_tile = 16;
4227 xnn_params.f16.dwconv[0].primary_tile = 3;
4228
4229 xnn_params.f16.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x4__fma3;
4230 xnn_params.f16.dwconv[1].init.f16 = xnn_init_f16_minmax_avx_params;
4231 xnn_params.f16.dwconv[1].channel_tile = 16;
4232 xnn_params.f16.dwconv[1].primary_tile = 4;
4233
4234 xnn_params.f16.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x9__fma3;
4235 xnn_params.f16.dwconv[2].init.f16 = xnn_init_f16_minmax_avx_params;
4236 xnn_params.f16.dwconv[2].channel_tile = 16;
4237 xnn_params.f16.dwconv[2].primary_tile = 9;
4238
4239 xnn_params.f16.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up8x25__fma3_acc2;
4240 xnn_params.f16.dwconv[3].init.f16 = xnn_init_f16_minmax_avx_params;
4241 xnn_params.f16.dwconv[3].channel_tile = 8;
4242 xnn_params.f16.dwconv[3].primary_tile = 25;
4243
4244 xnn_params.f16.avgpool = (struct avgpool_parameters) {
4245 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f16_avgpool_minmax_ukernel_9x__f16c_c8,
4246 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f16_avgpool_minmax_ukernel_9p8x__f16c_c8,
4247 .init.f16 = xnn_init_f16_scaleminmax_avx_params,
4248 .primary_tile = 9,
4249 .incremental_tile = 8,
4250 .channel_tile = 8,
4251 };
4252 xnn_params.f16.pavgpool = (struct pavgpool_parameters) {
4253 .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f16_pavgpool_minmax_ukernel_9x__avx2_c8,
4254 .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f16_pavgpool_minmax_ukernel_9p8x__avx2_c8,
4255 .init.f16 = xnn_init_f16_minmax_avx_params,
4256 .primary_tile = 9,
4257 .incremental_tile = 8,
4258 .channel_tile = 8,
4259 };
4260 xnn_params.f16.gavgpool = (struct gavgpool_parameters) {
4261 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8,
4262 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8,
4263 .init.f16 = xnn_init_f16_scaleminmax_avx_params,
4264 .update.f16 = xnn_update_f16_scaleminmax_avx_params,
4265 .row_tile = 7,
4266 .channel_tile = 8,
4267 };
4268
4269 xnn_params.f16.maxpool = (struct maxpool_parameters) {
4270 .ukernel = (xnn_maxpool_ukernel_function) xnn_f16_maxpool_minmax_ukernel_9p8x__f16c_c8,
4271 .init.f16 = xnn_init_f16_minmax_avx_params,
4272 .mr = 9,
4273 .qr = 8,
4274 };
4275 xnn_params.f16.ibilinear = (struct ibilinear_parameters) {
4276 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f16_ibilinear_ukernel__fma3_c8,
4277 .pixel_tile = 1,
4278 .channel_tile = 8,
4279 };
4280
4281 xnn_params.f16.prelu = (struct prelu_parameters) {
4282 .ukernel = (xnn_prelu_ukernel_function) xnn_f16_prelu_ukernel__f16c_2x16,
4283 .row_tile = 2,
4284 .channel_tile = 16,
4285 };
4286
4287 xnn_params.f16.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
4288 .ukernel = (xnn_raddstoreexpminusmax_ukernel_function) xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x40,
4289 .init.f16 = xnn_init_f16_expminus_avx2_rr1_p2_params,
4290 .element_tile = 40,
4291 };
4292 xnn_params.f16.rmax = (xnn_rmax_ukernel_function) xnn_f16_rmax_ukernel__f16c;
4293
4294 xnn_params.f16.vadd = (struct vbinary_parameters) {
4295 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vadd_minmax_ukernel__f16c_x16,
4296 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vaddc_minmax_ukernel__f16c_x16,
4297 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vaddc_minmax_ukernel__f16c_x16,
4298 .init.f16_minmax = xnn_init_f16_minmax_avx_params,
4299 .element_tile = 16,
4300 };
4301 xnn_params.f16.vdiv = (struct vbinary_parameters) {
4302 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vdiv_minmax_ukernel__f16c_x8,
4303 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vdivc_minmax_ukernel__f16c_x8,
4304 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vrdivc_minmax_ukernel__f16c_x8,
4305 .init.f16_minmax = xnn_init_f16_minmax_avx_params,
4306 .element_tile = 8,
4307 };
4308 xnn_params.f16.vmax = (struct vbinary_parameters) {
4309 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmax_ukernel__f16c_x16,
4310 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmaxc_ukernel__f16c_x16,
4311 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmaxc_ukernel__f16c_x16,
4312 .element_tile = 16,
4313 };
4314 xnn_params.f16.vmin = (struct vbinary_parameters) {
4315 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmin_ukernel__f16c_x16,
4316 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vminc_ukernel__f16c_x16,
4317 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vminc_ukernel__f16c_x16,
4318 .element_tile = 16,
4319 };
4320 xnn_params.f16.vmul = (struct vbinary_parameters) {
4321 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmul_minmax_ukernel__f16c_x16,
4322 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmulc_minmax_ukernel__f16c_x16,
4323 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmulc_minmax_ukernel__f16c_x16,
4324 .init.f16_minmax = xnn_init_f16_minmax_avx_params,
4325 .element_tile = 16,
4326 };
4327 xnn_params.f16.vsub = (struct vbinary_parameters) {
4328 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vsub_minmax_ukernel__f16c_x16,
4329 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vsubc_minmax_ukernel__f16c_x16,
4330 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vrsubc_minmax_ukernel__f16c_x16,
4331 .init.f16_minmax = xnn_init_f16_minmax_avx_params,
4332 .element_tile = 16,
4333 };
4334 xnn_params.f16.vsqrdiff = (struct vbinary_parameters) {
4335 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vsqrdiff_ukernel__f16c_x16,
4336 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vsqrdiffc_ukernel__f16c_x16,
4337 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vsqrdiffc_ukernel__f16c_x16,
4338 .init.f16_minmax = xnn_init_f16_minmax_avx_params,
4339 .element_tile = 16,
4340 };
4341
4342 xnn_params.f16.vmulcaddc = (struct vmulcaddc_parameters) {
4343 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f16_vmulcaddc_minmax_ukernel_c8__fma3_2x,
4344 .init.f16 = xnn_init_f16_minmax_avx_params,
4345 .channel_tile = 8,
4346 .row_tile = 2,
4347 };
4348
4349 xnn_params.f16.abs = (struct vunary_parameters) {
4350 .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vabs_ukernel__sse2_x16,
4351 .init.f16_abs = xnn_init_f16_abs_sse_params,
4352 .element_tile = 16,
4353 };
4354 xnn_params.f16.clamp = (struct vunary_parameters) {
4355 .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vclamp_ukernel__f16c_x16,
4356 .init.f16_minmax = xnn_init_f16_minmax_avx_params,
4357 .element_tile = 16,
4358 };
4359 xnn_params.f16.elu = (struct vunary_parameters) {
4360 .ukernel = (xnn_vunary_ukernel_function) xnn_f16_velu_ukernel__avx2_rr1_p3_x16,
4361 .init.f16_elu = xnn_init_f16_elu_avx2_rr1_p3_params,
4362 .element_tile = 16,
4363 };
4364 xnn_params.f16.hswish = (struct vunary_parameters) {
4365 .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vhswish_ukernel__f16c_x16,
4366 .init.f16_hswish = xnn_init_f16_hswish_avx_params,
4367 .element_tile = 16,
4368 };
4369 xnn_params.f16.lrelu = (struct vunary_parameters) {
4370 .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vlrelu_ukernel__f16c_x16,
4371 .init.f16_lrelu = xnn_init_f16_lrelu_avx_params,
4372 .element_tile = 16,
4373 };
4374 xnn_params.f16.neg = (struct vunary_parameters) {
4375 .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vneg_ukernel__sse2_x16,
4376 .init.f16_neg = xnn_init_f16_neg_sse_params,
4377 .element_tile = 16,
4378 };
4379 xnn_params.f16.rndne = (struct vunary_parameters) {
4380 .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vrndne_ukernel__f16c_x16,
4381 .element_tile = 16,
4382 };
4383 xnn_params.f16.rndz = (struct vunary_parameters) {
4384 .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vrndz_ukernel__f16c_x16,
4385 .element_tile = 16,
4386 };
4387 xnn_params.f16.rndu = (struct vunary_parameters) {
4388 .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vrndu_ukernel__f16c_x16,
4389 .element_tile = 16,
4390 };
4391 xnn_params.f16.rndd = (struct vunary_parameters) {
4392 .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vrndd_ukernel__f16c_x16,
4393 .element_tile = 16,
4394 };
4395 xnn_params.f16.sigmoid = (struct vunary_parameters) {
4396 .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_x32,
4397 .init.f16_sigmoid = xnn_init_f16_sigmoid_avx2_rr1_p2_params,
4398 .element_tile = 32,
4399 };
4400 xnn_params.f16.sqr = (struct vunary_parameters) {
4401 .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vsqr_ukernel__f16c_x16,
4402 .element_tile = 16,
4403 };
4404 xnn_params.f16.sqrt = (struct vunary_parameters) {
4405 .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vsqrt_ukernel__f16c_sqrt_x8,
4406 .element_tile = 8,
4407 };
4408 }
4409 #endif // XNN_NO_F16_OPERATORS
4410
4411 /**************************** F32 x86 micro-kernels ****************************/
4412 #ifndef XNN_NO_F32_OPERATORS
4413 init_flags |= XNN_INIT_FLAG_F32;
4414
4415 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
4416 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast);
4417 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_7x16__avx512f_broadcast);
4418 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast);
4419 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast);
4420 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
4421 xnn_params.f32.gemm.mr = 7;
4422 xnn_params.f32.gemm.nr = 16;
4423 } else if (cpuinfo_has_x86_fma3()) {
4424 switch (cpuinfo_get_core(0)->uarch) {
4425 case cpuinfo_uarch_zen:
4426 case cpuinfo_uarch_dhyana:
4427 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast);
4428 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast);
4429 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast);
4430 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast);
4431 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_avx_params;
4432 xnn_params.f32.gemm.mr = 4;
4433 xnn_params.f32.gemm.nr = 16;
4434 xnn_params.f32.gemm.log2_sr = 2;
4435 break;
4436 default:
4437 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_5x16__fma3_broadcast);
4438 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_5x16__fma3_broadcast);
4439 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast);
4440 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast);
4441 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_avx_params;
4442 xnn_params.f32.gemm.mr = 5;
4443 xnn_params.f32.gemm.nr = 16;
4444 break;
4445 }
4446 } else if (cpuinfo_has_x86_avx()) {
4447 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast);
4448 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_5x16__avx_broadcast);
4449 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast);
4450 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast);
4451 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_avx_params;
4452 xnn_params.f32.gemm.mr = 5;
4453 xnn_params.f32.gemm.nr = 16;
4454 } else {
4455 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__sse_load1);
4456 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__sse_load1);
4457 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__sse_load1);
4458 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__sse_load1);
4459 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_sse_params;
4460 xnn_params.f32.gemm.mr = 4;
4461 xnn_params.f32.gemm.nr = 8;
4462 }
4463 xnn_params.f32.gemm2.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2c4__sse);
4464 xnn_params.f32.gemm2.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2c4__sse);
4465 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_sse_params;
4466 xnn_params.f32.gemm2.mr = 4;
4467 xnn_params.f32.gemm2.nr = 2;
4468 xnn_params.f32.gemm2.log2_kr = 2;
4469
4470 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
4471 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x3__avx512f;
4472 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
4473 xnn_params.f32.dwconv[0].channel_tile = 16;
4474 xnn_params.f32.dwconv[0].primary_tile = 3;
4475
4476 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x4__avx512f;
4477 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
4478 xnn_params.f32.dwconv[1].channel_tile = 16;
4479 xnn_params.f32.dwconv[1].primary_tile = 4;
4480
4481 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x9__avx512f;
4482 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
4483 xnn_params.f32.dwconv[2].channel_tile = 16;
4484 xnn_params.f32.dwconv[2].primary_tile = 9;
4485
4486 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x25__avx512f;
4487 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
4488 xnn_params.f32.dwconv[3].channel_tile = 16;
4489 xnn_params.f32.dwconv[3].primary_tile = 25;
4490 } else if (cpuinfo_has_x86_fma3()) {
4491 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x3__fma3;
4492 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_avx_params;
4493 xnn_params.f32.dwconv[0].channel_tile = 16;
4494 xnn_params.f32.dwconv[0].primary_tile = 3;
4495
4496 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x4__fma3;
4497 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_avx_params;
4498 xnn_params.f32.dwconv[1].channel_tile = 16;
4499 xnn_params.f32.dwconv[1].primary_tile = 4;
4500
4501 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x9__fma3;
4502 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_avx_params;
4503 xnn_params.f32.dwconv[2].channel_tile = 16;
4504 xnn_params.f32.dwconv[2].primary_tile = 9;
4505
4506 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__fma3;
4507 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_avx_params;
4508 xnn_params.f32.dwconv[3].channel_tile = 8;
4509 xnn_params.f32.dwconv[3].primary_tile = 25;
4510 } else if (cpuinfo_has_x86_avx()) {
4511 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x3__avx;
4512 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_avx_params;
4513 xnn_params.f32.dwconv[0].channel_tile = 16;
4514 xnn_params.f32.dwconv[0].primary_tile = 3;
4515
4516 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x4__avx;
4517 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_avx_params;
4518 xnn_params.f32.dwconv[1].channel_tile = 16;
4519 xnn_params.f32.dwconv[1].primary_tile = 4;
4520
4521 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x9__avx;
4522 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_avx_params;
4523 xnn_params.f32.dwconv[2].channel_tile = 16;
4524 xnn_params.f32.dwconv[2].primary_tile = 9;
4525
4526 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__avx;
4527 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_avx_params;
4528 xnn_params.f32.dwconv[3].channel_tile = 8;
4529 xnn_params.f32.dwconv[3].primary_tile = 25;
4530 } else {
4531 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x3__sse;
4532 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_sse_params;
4533 xnn_params.f32.dwconv[0].channel_tile = 8;
4534 xnn_params.f32.dwconv[0].primary_tile = 3;
4535
4536 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__sse;
4537 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_sse_params;
4538 xnn_params.f32.dwconv[1].channel_tile = 8;
4539 xnn_params.f32.dwconv[1].primary_tile = 4;
4540
4541 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__sse;
4542 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_sse_params;
4543 xnn_params.f32.dwconv[2].channel_tile = 8;
4544 xnn_params.f32.dwconv[2].primary_tile = 9;
4545
4546 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__sse;
4547 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_sse_params;
4548 xnn_params.f32.dwconv[3].channel_tile = 8;
4549 xnn_params.f32.dwconv[3].primary_tile = 25;
4550 }
4551 xnn_params.f32.avgpool = (struct avgpool_parameters) {
4552 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__sse_c4,
4553 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__sse_c4,
4554 .init.f32 = xnn_init_f32_scaleminmax_sse_params,
4555 .primary_tile = 9,
4556 .incremental_tile = 8,
4557 .channel_tile = 4,
4558 };
4559 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
4560 .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__sse_c4,
4561 .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__sse_c4,
4562 .init.f32 = xnn_init_f32_minmax_sse_params,
4563 .primary_tile = 9,
4564 .incremental_tile = 8,
4565 .channel_tile = 4,
4566 };
4567 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
4568 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4,
4569 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4,
4570 .init.f32 = xnn_init_f32_scaleminmax_sse_params,
4571 .update.f32 = xnn_update_f32_scaleminmax_sse_params,
4572 .row_tile = 7,
4573 .channel_tile = 4,
4574 };
4575 xnn_params.f32.maxpool = (struct maxpool_parameters) {
4576 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__sse_c4,
4577 .init.f32 = xnn_init_f32_minmax_sse_params,
4578 .mr = 9,
4579 .qr = 8,
4580 };
4581 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
4582 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__sse2_c4,
4583 .mr = 4,
4584 };
4585 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
4586 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__sse2_c4,
4587 .mr = 9,
4588 };
4589 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
4590 .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4,
4591 .mr = 9,
4592 .qr = 8,
4593 };
4594 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
4595 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__sse_c8,
4596 .pixel_tile = 1,
4597 .channel_tile = 8,
4598 };
    // f32 Abs: pick the widest available ISA. AVX512F kernels are only used on
    // non-mobile builds (guarded by !XNN_PLATFORM_MOBILE).
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
      xnn_params.f32.abs = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vabs_ukernel__avx512f_x16,
        .init.f32_abs = xnn_init_f32_abs_avx512_params,
        .element_tile = 16,
      };
    } else if (cpuinfo_has_x86_avx()) {
      xnn_params.f32.abs = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vabs_ukernel__avx_x16,
        .init.f32_abs = xnn_init_f32_abs_avx_params,
        .element_tile = 16,
      };
    } else {
      // SSE baseline: always available on x86-64.
      xnn_params.f32.abs = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vabs_ukernel__sse_x8,
        .init.f32_abs = xnn_init_f32_abs_sse_params,
        .element_tile = 8,
      };
    }
    // f32 Clamp: AVX512F (non-mobile) > AVX > SSE. Note the AVX512F kernel
    // consumes the scalar-layout minmax params; the others use ISA-specific inits.
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
      xnn_params.f32.clamp = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vclamp_ukernel__avx512f_x16,
        .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
        .element_tile = 16,
      };
    } else if (cpuinfo_has_x86_avx()) {
      xnn_params.f32.clamp = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vclamp_ukernel__avx_x16,
        .init.f32_minmax = xnn_init_f32_minmax_avx_params,
        .element_tile = 16,
      };
    } else {
      xnn_params.f32.clamp = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vclamp_ukernel__sse_x8,
        .init.f32_minmax = xnn_init_f32_minmax_sse_params,
        .element_tile = 8,
      };
    }
    // f32 ELU: AVX512F (non-mobile) > AVX2 > AVX > SSE2. Kernel-name suffixes
    // encode the polynomial approximation variant (rr1/rr2, lut size, degree)
    // and must stay paired with the matching init routine.
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
      xnn_params.f32.elu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x64,
        .init.f32_elu = xnn_init_f32_elu_avx512_rr1_lut16_p3_params,
        .element_tile = 64,
      };
    } else if (cpuinfo_has_x86_avx2()) {
      xnn_params.f32.elu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x56,
        .init.f32_elu = xnn_init_f32_elu_avx2_rr1_lut4_p4_params,
        .element_tile = 56,
      };
    } else if (cpuinfo_has_x86_avx()) {
      xnn_params.f32.elu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x32,
        .init.f32_elu = xnn_init_f32_elu_avx_rr2_lut4_p4_params,
        .element_tile = 32,
      };
    } else {
      xnn_params.f32.elu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_velu_ukernel__sse2_rr2_lut16_p3_x12,
        .init.f32_elu = xnn_init_f32_elu_sse2_rr2_lut16_p3_params,
        .element_tile = 12,
      };
    }
    // f32 HardSwish: AVX512F (non-mobile) > FMA3 > AVX > SSE. The FMA3 kernel
    // reuses the AVX param layout (same register width).
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
      xnn_params.f32.hswish = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vhswish_ukernel__avx512f_x16,
        .init.f32_hswish = xnn_init_f32_hswish_avx512_params,
        .element_tile = 16,
      };
    } else if (cpuinfo_has_x86_fma3()) {
      xnn_params.f32.hswish = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vhswish_ukernel__fma3_x16,
        .init.f32_hswish = xnn_init_f32_hswish_avx_params,
        .element_tile = 16,
      };
    } else if (cpuinfo_has_x86_avx()) {
      xnn_params.f32.hswish = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vhswish_ukernel__avx_x16,
        .init.f32_hswish = xnn_init_f32_hswish_avx_params,
        .element_tile = 16,
      };
    } else {
      xnn_params.f32.hswish = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vhswish_ukernel__sse_x8,
        .init.f32_hswish = xnn_init_f32_hswish_sse_params,
        .element_tile = 8,
      };
    }
    // f32 LeakyReLU: AVX512F (non-mobile) > AVX > SSE4.1 > SSE. The AVX512F
    // kernel consumes scalar-layout params; both SSE variants share sse params.
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
      xnn_params.f32.lrelu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vlrelu_ukernel__avx512f_x16,
        .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
        .element_tile = 16,
      };
    } else if (cpuinfo_has_x86_avx()) {
      xnn_params.f32.lrelu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vlrelu_ukernel__avx_x16,
        .init.f32_lrelu = xnn_init_f32_lrelu_avx_params,
        .element_tile = 16,
      };
    } else if (cpuinfo_has_x86_sse4_1()) {
      xnn_params.f32.lrelu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vlrelu_ukernel__sse41_x8,
        .init.f32_lrelu = xnn_init_f32_lrelu_sse_params,
        .element_tile = 8,
      };
    } else {
      xnn_params.f32.lrelu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vlrelu_ukernel__sse_x8,
        .init.f32_lrelu = xnn_init_f32_lrelu_sse_params,
        .element_tile = 8,
      };
    }
    // f32 Negate: AVX512F (non-mobile) > AVX > SSE.
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
      xnn_params.f32.neg = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vneg_ukernel__avx512f_x16,
        .init.f32_neg = xnn_init_f32_neg_avx512_params,
        .element_tile = 16,
      };
    } else if (cpuinfo_has_x86_avx()) {
      xnn_params.f32.neg = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vneg_ukernel__avx_x16,
        .init.f32_neg = xnn_init_f32_neg_avx_params,
        .element_tile = 16,
      };
    } else {
      xnn_params.f32.neg = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vneg_ukernel__sse_x8,
        .init.f32_neg = xnn_init_f32_neg_sse_params,
        .element_tile = 8,
      };
    }
    // f32 rounding ops: round-to-nearest-even (rndne), toward zero (rndz),
    // up (rndu), down (rndd). AVX512F (non-mobile) > AVX > SSE4.1 > SSE2.
    // The AVX512F and SSE4.1 entries take no init (no .init assigned); the
    // AVX and SSE2 variants require f32_rnd params.
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
      xnn_params.f32.rndne = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndne_ukernel__avx512f_x16,
        .element_tile = 16,
      };
      xnn_params.f32.rndz = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndz_ukernel__avx512f_x16,
        .element_tile = 16,
      };
      xnn_params.f32.rndu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndu_ukernel__avx512f_x16,
        .element_tile = 16,
      };
      xnn_params.f32.rndd = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndd_ukernel__avx512f_x16,
        .element_tile = 16,
      };
    } else if (cpuinfo_has_x86_avx()) {
      xnn_params.f32.rndne = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndne_ukernel__avx_x16,
        .init.f32_rnd = xnn_init_f32_rnd_avx_params,
        .element_tile = 16,
      };
      xnn_params.f32.rndz = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndz_ukernel__avx_x16,
        .init.f32_rnd = xnn_init_f32_rnd_avx_params,
        .element_tile = 16,
      };
      xnn_params.f32.rndu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndu_ukernel__avx_x16,
        .init.f32_rnd = xnn_init_f32_rnd_avx_params,
        .element_tile = 16,
      };
      xnn_params.f32.rndd = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndd_ukernel__avx_x16,
        .init.f32_rnd = xnn_init_f32_rnd_avx_params,
        .element_tile = 16,
      };
    } else if (cpuinfo_has_x86_sse4_1()) {
      xnn_params.f32.rndne = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndne_ukernel__sse41_x8,
        .element_tile = 8,
      };
      xnn_params.f32.rndz = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndz_ukernel__sse41_x8,
        .element_tile = 8,
      };
      xnn_params.f32.rndu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndu_ukernel__sse41_x8,
        .element_tile = 8,
      };
      xnn_params.f32.rndd = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndd_ukernel__sse41_x8,
        .element_tile = 8,
      };
    } else {
      xnn_params.f32.rndne = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndne_ukernel__sse2_x8,
        .init.f32_rnd = xnn_init_f32_rnd_sse2_params,
        .element_tile = 8,
      };
      xnn_params.f32.rndz = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndz_ukernel__sse2_x8,
        .init.f32_rnd = xnn_init_f32_rnd_sse2_params,
        .element_tile = 8,
      };
      xnn_params.f32.rndu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndu_ukernel__sse2_x8,
        .init.f32_rnd = xnn_init_f32_rnd_sse2_params,
        .element_tile = 8,
      };
      xnn_params.f32.rndd = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndd_ukernel__sse2_x8,
        .init.f32_rnd = xnn_init_f32_rnd_sse2_params,
        .element_tile = 8,
      };
    }
    // f32 Sigmoid: AVX512F (non-mobile) > AVX2 > AVX > SSE4.1 > SSE2. Suffixes
    // encode the exp approximation (rr*, lut size, polynomial degree) and the
    // reciprocal method (div vs. nr2 Newton-Raphson). The SSE4.1 kernel shares
    // the SSE2 param layout.
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
      xnn_params.f32.sigmoid = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_div_x64,
        .init.f32_sigmoid = xnn_init_f32_sigmoid_avx512_rr2_lut32_p2_params,
        .element_tile = 64,
      };
    } else if (cpuinfo_has_x86_avx2()) {
      xnn_params.f32.sigmoid = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_div_x40,
        .init.f32_sigmoid = xnn_init_f32_sigmoid_avx2_rr1_p5_params,
        .element_tile = 40,
      };
    } else if (cpuinfo_has_x86_avx()) {
      xnn_params.f32.sigmoid = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsigmoid_ukernel__avx_rr2_p5_nr2_x40,
        .init.f32_sigmoid = xnn_init_f32_sigmoid_avx_rr2_p5_params,
        .element_tile = 40,
      };
    } else if (cpuinfo_has_x86_sse4_1()) {
      xnn_params.f32.sigmoid = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsigmoid_ukernel__sse41_rr2_lut64_p2_div_x8,
        .init.f32_sigmoid = xnn_init_f32_sigmoid_sse2_rr2_lut64_p2_params,
        .element_tile = 8,
      };
    } else {
      xnn_params.f32.sigmoid = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsigmoid_ukernel__sse2_rr2_lut64_p2_div_x8,
        .init.f32_sigmoid = xnn_init_f32_sigmoid_sse2_rr2_lut64_p2_params,
        .element_tile = 8,
      };
    }
    // f32 Square: AVX512F (non-mobile) > AVX > SSE. Only the AVX kernel needs
    // params (.init assigned for AVX only).
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
      xnn_params.f32.sqr = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsqr_ukernel__avx512f_x16,
        .element_tile = 16,
      };
    } else if (cpuinfo_has_x86_avx()) {
      xnn_params.f32.sqr = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsqr_ukernel__avx_x16,
        .init.f32_default = xnn_init_f32_default_avx_params,
        .element_tile = 16,
      };
    } else {
      xnn_params.f32.sqr = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsqr_ukernel__sse_x8,
        .element_tile = 8,
      };
    }
    // f32 Sqrt: AVX (with params) or SSE baseline; both use the hardware
    // square-root path (kernel "sqrt" suffix).
    if (cpuinfo_has_x86_avx()) {
      xnn_params.f32.sqrt = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsqrt_ukernel__avx_sqrt_x8,
        .init.f32_sqrt = xnn_init_f32_sqrt_avx_params,
        .element_tile = 8,
      };
    } else {
      xnn_params.f32.sqrt = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsqrt_ukernel__sse_sqrt_x4,
        .element_tile = 4,
      };
    }
    // f32 PReLU: AVX512F (non-mobile) and AVX process 2 rows x 16 channels per
    // iteration; SSE4.1 and SSE2 fall back to 2 rows x 8 channels.
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
      xnn_params.f32.prelu = (struct prelu_parameters) {
        .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__avx512f_2x16,
        .row_tile = 2,
        .channel_tile = 16,
      };
    } else if (cpuinfo_has_x86_avx()) {
      xnn_params.f32.prelu = (struct prelu_parameters) {
        .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__avx_2x16,
        .row_tile = 2,
        .channel_tile = 16,
      };
    } else if (cpuinfo_has_x86_sse4_1()) {
      xnn_params.f32.prelu = (struct prelu_parameters) {
        .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__sse41_2x8,
        .row_tile = 2,
        .channel_tile = 8,
      };
    } else {
      xnn_params.f32.prelu = (struct prelu_parameters) {
        .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__sse2_2x8,
        .row_tile = 2,
        .channel_tile = 8,
      };
    }
    // f32 softmax building block: fused reduce-add + store of exp(x - max),
    // 20 elements per iteration with 2 accumulators (SSE2-only, no dispatch).
    xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
      .ukernel = (xnn_raddstoreexpminusmax_ukernel_function) xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_x20_acc2,
      .init.f32 = xnn_init_f32_expminus_sse2_rr2_p5_params,
      .element_tile = 20,
    };
    // f32 running-max reduction used by softmax (SSE baseline).
    xnn_params.f32.rmax = (xnn_rmax_ukernel_function) xnn_f32_rmax_ukernel__sse;
    // f32 elementwise binary ops (add/div/max/min/mul/sub/squared-difference):
    // AVX512F (non-mobile) > AVX > SSE. Each entry wires three variants:
    // op_ukernel (tensor OP tensor), opc_ukernel (tensor OP broadcast scalar),
    // and ropc_ukernel (broadcast scalar OP tensor). Commutative ops (add, max,
    // min, mul, sqrdiff) reuse the opc kernel for ropc; non-commutative div and
    // sub use dedicated reversed kernels (vrdivc/vrsubc). Min/max/sqrdiff need
    // no minmax clamping params. NOTE(review): the AVX512F minmax entries use
    // scalar-layout params while the default-params AVX entries use
    // f32_default_avx — layouts must match what each kernel loads.
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
      xnn_params.f32.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__avx512f_x32,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx512f_x32,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx512f_x32,
        .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
        .element_tile = 32,
      };
      xnn_params.f32.vdiv = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__avx512f_x32,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__avx512f_x32,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__avx512f_x32,
        .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
        .element_tile = 32,
      };
      xnn_params.f32.vmax = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__avx512f_x32,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx512f_x32,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx512f_x32,
        .element_tile = 32,
      };
      xnn_params.f32.vmin = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__avx512f_x32,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx512f_x32,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx512f_x32,
        .element_tile = 32,
      };
      xnn_params.f32.vmul = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__avx512f_x32,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx512f_x32,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx512f_x32,
        .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
        .element_tile = 32,
      };
      xnn_params.f32.vsub = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__avx512f_x32,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__avx512f_x32,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__avx512f_x32,
        .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
        .element_tile = 32,
      };
      xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__avx512f_x32,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx512f_x32,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx512f_x32,
        .element_tile = 32,
      };
    } else if (cpuinfo_has_x86_avx()) {
      xnn_params.f32.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__avx_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx_x16,
        .init.f32_minmax = xnn_init_f32_minmax_avx_params,
        .element_tile = 16,
      };
      xnn_params.f32.vdiv = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__avx_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__avx_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__avx_x16,
        .init.f32_minmax = xnn_init_f32_minmax_avx_params,
        .element_tile = 16,
      };
      xnn_params.f32.vmax = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__avx_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx_x16,
        .init.f32_default = xnn_init_f32_default_avx_params,
        .element_tile = 16,
      };
      xnn_params.f32.vmin = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__avx_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx_x16,
        .init.f32_default = xnn_init_f32_default_avx_params,
        .element_tile = 16,
      };
      xnn_params.f32.vmul = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__avx_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx_x16,
        .init.f32_minmax = xnn_init_f32_minmax_avx_params,
        .element_tile = 16,
      };
      xnn_params.f32.vsub = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__avx_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__avx_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__avx_x16,
        .init.f32_minmax = xnn_init_f32_minmax_avx_params,
        .element_tile = 16,
      };
      xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__avx_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx_x16,
        .init.f32_default = xnn_init_f32_default_avx_params,
        .element_tile = 16,
      };
    } else {
      xnn_params.f32.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__sse_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__sse_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__sse_x8,
        .init.f32_minmax = xnn_init_f32_minmax_sse_params,
        .element_tile = 8,
      };
      xnn_params.f32.vdiv = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__sse_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__sse_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__sse_x8,
        .init.f32_minmax = xnn_init_f32_minmax_sse_params,
        .element_tile = 8,
      };
      xnn_params.f32.vmax = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__sse_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__sse_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__sse_x8,
        .element_tile = 8,
      };
      xnn_params.f32.vmin = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__sse_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__sse_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__sse_x8,
        .element_tile = 8,
      };
      xnn_params.f32.vmul = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__sse_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__sse_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__sse_x8,
        .init.f32_minmax = xnn_init_f32_minmax_sse_params,
        .element_tile = 8,
      };
      xnn_params.f32.vsub = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__sse_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__sse_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__sse_x8,
        .init.f32_minmax = xnn_init_f32_minmax_sse_params,
        .element_tile = 8,
      };
      xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__sse_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__sse_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__sse_x8,
        .element_tile = 8,
      };
    }
    // f32 fused multiply-add with per-channel constants (vmulcaddc): 4 channels
    // x 2 rows per iteration, SSE baseline (no ISA dispatch).
    xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
      .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__sse_2x,
      .init.f32 = xnn_init_f32_minmax_sse_params,
      .channel_tile = 4,
      .row_tile = 2,
    };
    // NCHW (channels-first) operator micro-kernels; compiled out when the
    // build defines XNN_NO_NCHW_OPERATORS.
    #ifndef XNN_NO_NCHW_OPERATORS
      // Sparse microkernels on x86 currently target only SSE, and on processors
      // with AVX ISA dense inference is expected to be faster than sparse.
      if (!cpuinfo_has_x86_avx()) {
        init_flags |= XNN_INIT_FLAG_CHW_OPT;
      }

      // f32 sparse matrix-dense matrix multiplication: 32x1 tiling, SSE.
      xnn_params.f32.spmm = (struct spmm_parameters) {
        .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__sse,
        .mr = 32,
        .nr = 1,
      };
      // f32 HWC->CHW 3x3 stride-2 convolution (typically the first layer):
      // 4 output channels x 2x2 output pixels per iteration.
      xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
        .ukernel_with_symm_padding =
          (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2,
        .output_channel_tile = 4,
        .output_height_tile = 2,
        .output_width_tile = 2,
      };
      // f32 CHW depthwise 3x3 convolution: SSSE3 variant when available,
      // otherwise plain SSE; both tile 2 rows x 4 columns with 2 accumulators.
      if (cpuinfo_has_x86_ssse3()) {
        xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_2x4_acc2,
          .output_width_tile = 4,
          .output_height_tile = 2,
        };
      } else {
        xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_2x4_acc2,
          .output_width_tile = 4,
          .output_height_tile = 2,
        };
      }
      // Remaining CHW depthwise variants (3x3 stride-2, 5x5, 5x5 stride-2): SSE only.
      xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_1x4_acc3,
        .output_width_tile = 4,
        .output_height_tile = 1,
      };
      xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4,
        .output_width_tile = 4,
        .output_height_tile = 4,
      };
      xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4,
        .output_width_tile = 4,
        .output_height_tile = 2,
      };
      // f32 channels-wise global average pooling (CHW layout): 4 channels/iter.
      xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
        .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__sse_x4,
        .channel_tile = 4,
      };
      // f32 indirect bilinear interpolation (CHW layout): 8 pixels per iteration.
      xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
        .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__sse_p8,
        .channel_tile = 1,
        .pixel_tile = 8,
      };
    #endif  // XNN_NO_NCHW_OPERATORS
  #endif  // XNN_NO_F32_OPERATORS
5108
  /*************************** VCVT x86 micro-kernels ***************************/
  #ifndef XNN_NO_VCVT_OPERATORS
    // Record that the conversion (VCVT) micro-kernels were initialized.
    init_flags |= XNN_INIT_FLAG_VCVT;

    // f16 <-> f32 conversion: AVX512-SKX (requires F+BW+DQ+VL) > F16C >
    // AVX (integer emulation of half floats) > SSE4.1 > SSE2. The SKX and
    // F16C f16->f32 kernels use hardware conversion and need no params.
    if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
      xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f16_f32_vcvt_ukernel__avx512skx_x16,
        .element_tile = 16,
      };
      xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_f16_vcvt_ukernel__avx512skx_x16,
        .element_tile = 16,
      };
    } else if (cpuinfo_has_x86_f16c()) {
      xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f16_f32_vcvt_ukernel__f16c_x16,
        .element_tile = 16,
      };
      xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_f16_vcvt_ukernel__f16c_x16,
        .init.f32_f16_cvt = xnn_init_f32_f16_cvt_f16c_params,
        .element_tile = 16,
      };
    } else if (cpuinfo_has_x86_avx()) {
      // Pre-F16C AVX: emulate half-precision via int16 manipulation; the
      // param layouts are shared with the SSE int16/SSE2 variants.
      xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f16_f32_vcvt_ukernel__avx_int16_x16,
        .init.f16_f32_cvt = xnn_init_f16_f32_cvt_sse_int16_params,
        .element_tile = 16,
      };
      xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_f16_vcvt_ukernel__avx_x24,
        .init.f32_f16_cvt = xnn_init_f32_f16_cvt_sse2_params,
        .element_tile = 24,
      };
    } else if (cpuinfo_has_x86_sse4_1()) {
      xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f16_f32_vcvt_ukernel__sse41_int16_x16,
        .init.f16_f32_cvt = xnn_init_f16_f32_cvt_sse_int16_params,
        .element_tile = 16,
      };
      xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_f16_vcvt_ukernel__sse41_x8,
        .init.f32_f16_cvt = xnn_init_f32_f16_cvt_sse2_params,
        .element_tile = 8,
      };
    } else {
      xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f16_f32_vcvt_ukernel__sse2_int16_x32,
        .init.f16_f32_cvt = xnn_init_f16_f32_cvt_sse_int16_params,
        .element_tile = 32,
      };
      xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_f16_vcvt_ukernel__sse2_x16,
        .init.f32_f16_cvt = xnn_init_f32_f16_cvt_sse2_params,
        .element_tile = 16,
      };
    }
    // f32 -> qs8 (signed int8) quantization: AVX512-SKX > AVX2 > AVX > SSE4.1 > SSE2.
    if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
      xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qs8_vcvt_ukernel__avx512skx_x128,
        .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_avx512_params,
        .element_tile = 128,
      };
    } else if (cpuinfo_has_x86_avx2()) {
      xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qs8_vcvt_ukernel__avx2_x64,
        .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_avx2_params,
        .element_tile = 64,
      };
    } else if (cpuinfo_has_x86_avx()) {
      xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qs8_vcvt_ukernel__avx_x32,
        .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_avx_params,
        .element_tile = 32,
      };
    } else if (cpuinfo_has_x86_sse4_1()) {
      xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qs8_vcvt_ukernel__sse41_x32,
        .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_sse4_params,
        .element_tile = 32,
      };
    } else {
      xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qs8_vcvt_ukernel__sse2_x32,
        .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_sse2_params,
        .element_tile = 32,
      };
    }
    // f32 -> qu8 (unsigned int8) quantization: same priority order, but no
    // SSE4.1 tier (SSE2 covers the unsigned path directly).
    if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
      xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qu8_vcvt_ukernel__avx512skx_x128,
        .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_avx512_params,
        .element_tile = 128,
      };
    } else if (cpuinfo_has_x86_avx2()) {
      xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qu8_vcvt_ukernel__avx2_x64,
        .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_avx2_params,
        .element_tile = 64,
      };
    } else if (cpuinfo_has_x86_avx()) {
      xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qu8_vcvt_ukernel__avx_x32,
        .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_avx_params,
        .element_tile = 32,
      };
    } else {
      xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qu8_vcvt_ukernel__sse2_x32,
        .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_sse2_params,
        .element_tile = 32,
      };
    }
    // qs8 -> qs8 and qu8 -> qu8 requantization (scale/zero-point change):
    // AVX2 > AVX > SSE4.1 > SSSE3 > SSE2. The AVX, SSE4.1, and SSSE3 kernels
    // all consume the SSSE3 param layout.
    if (cpuinfo_has_x86_avx2()) {
      xnn_params.vcvt.qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vcvt_ukernel__avx2_x32,
        .init.qs8_cvt = xnn_init_qs8_cvt_avx2_params,
        .element_tile = 32,
      };
      xnn_params.vcvt.qu8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vcvt_ukernel__avx2_x32,
        .init.qu8_cvt = xnn_init_qu8_cvt_avx2_params,
        .element_tile = 32,
      };
    } else if (cpuinfo_has_x86_avx()) {
      xnn_params.vcvt.qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vcvt_ukernel__avx_x32,
        .init.qs8_cvt = xnn_init_qs8_cvt_ssse3_params,
        .element_tile = 32,
      };
      xnn_params.vcvt.qu8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vcvt_ukernel__avx_x32,
        .init.qu8_cvt = xnn_init_qu8_cvt_ssse3_params,
        .element_tile = 32,
      };
    } else if (cpuinfo_has_x86_sse4_1()) {
      xnn_params.vcvt.qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vcvt_ukernel__sse41_x32,
        .init.qs8_cvt = xnn_init_qs8_cvt_ssse3_params,
        .element_tile = 32,
      };
      xnn_params.vcvt.qu8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vcvt_ukernel__sse41_x32,
        .init.qu8_cvt = xnn_init_qu8_cvt_ssse3_params,
        .element_tile = 32,
      };
    } else if (cpuinfo_has_x86_ssse3()) {
      xnn_params.vcvt.qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vcvt_ukernel__ssse3_x32,
        .init.qs8_cvt = xnn_init_qs8_cvt_ssse3_params,
        .element_tile = 32,
      };
      xnn_params.vcvt.qu8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vcvt_ukernel__ssse3_x32,
        .init.qu8_cvt = xnn_init_qu8_cvt_ssse3_params,
        .element_tile = 32,
      };
    } else {
      xnn_params.vcvt.qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vcvt_ukernel__sse2_x32,
        .init.qs8_cvt = xnn_init_qs8_cvt_sse2_params,
        .element_tile = 32,
      };
      xnn_params.vcvt.qu8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vcvt_ukernel__sse2_x32,
        .init.qu8_cvt = xnn_init_qu8_cvt_sse2_params,
        .element_tile = 32,
      };
    }
    // x86 ISA dispatch for quantized -> float conversion (dequantize) kernels.
    // Each branch picks the widest available instruction set; element_tile
    // matches the kernel's per-iteration element count (the _xNN suffix).
    if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
      // Skylake-X class AVX-512 (F+BW+DQ+VL all required by the kernel).
      xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_f32_vcvt_ukernel__avx512skx_x32,
        .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_avx512_params,
        .element_tile = 32,
      };
      xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_f32_vcvt_ukernel__avx512skx_x32,
        .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_avx512_params,
        .element_tile = 32,
      };
    } else if (cpuinfo_has_x86_avx2()) {
      // AVX2 kernels reuse the AVX parameter layout.
      xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_f32_vcvt_ukernel__avx2_x16,
        .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_avx_params,
        .element_tile = 16,
      };
      xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_f32_vcvt_ukernel__avx2_x16,
        .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_avx_params,
        .element_tile = 16,
      };
    } else if (cpuinfo_has_x86_avx()) {
      xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_f32_vcvt_ukernel__avx_x32,
        .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_avx_params,
        .element_tile = 32,
      };
      xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_f32_vcvt_ukernel__avx_x32,
        .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_avx_params,
        .element_tile = 32,
      };
    } else if (cpuinfo_has_x86_sse4_1()) {
      xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_f32_vcvt_ukernel__sse41_x16,
        .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_sse4_params,
        .element_tile = 16,
      };
      xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_f32_vcvt_ukernel__sse41_x16,
        .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_sse4_params,
        .element_tile = 16,
      };
    } else {
      // SSE2 baseline fallback.
      xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_f32_vcvt_ukernel__sse2_x32,
        .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_sse2_params,
        .element_tile = 32,
      };
      xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_f32_vcvt_ukernel__sse2_x32,
        .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_sse2_params,
        .element_tile = 32,
      };
    }
  #endif  // XNN_NO_VCVT_OPERATORS
5335
5336 /**************************** X32 x86 micro-kernels ****************************/
5337 #ifndef XNN_NO_X32_OPERATORS
5338 init_flags |= XNN_INIT_FLAG_X32;
5339
5340 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__sse2;
5341 xnn_params.x32.zip = (struct zip_parameters) {
5342 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__sse2,
5343 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__sse2,
5344 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__sse2,
5345 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__sse2,
5346 };
5347
5348 xnn_params.x32.transpose = (struct transpose_parameters) {
5349 .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x32_transposec_ukernel__4x4_sse,
5350 .tile_size = 32,
5351 };
5352 #endif // XNN_NO_X32_OPERATORS
5353
5354 /**************************** XX x86 micro-kernels ****************************/
5355 #ifndef XNN_NO_XX_OPERATORS
5356 init_flags |= XNN_INIT_FLAG_XX;
5357
5358 xnn_params.xx.copy = (xnn_vunary_ukernel_function) xnn_xx_copy_ukernel__memcpy;
5359 xnn_params.xx.fill = (struct fill_parameters) {
5360 .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__sse2_x64,
5361 .row_tile = 1,
5362 };
5363 xnn_params.xx.pad = (struct pad_parameters) {
5364 .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__sse2,
5365 .row_tile = 1,
5366 };
5367 xnn_params.xx.transpose = (struct transpose_parameters) {
5368 .variable_size_ukernel = xnn_xx_transposev_ukernel__1x1_memcpy,
5369 .tile_size = 32,
5370 };
5371 #endif
5372
5373#elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
5374
5375 /**************************** QC8 WAsm SIMD micro-kernels****************************/
5376 #ifndef XNN_NO_QS8_OPERATORS
5377 init_flags |= XNN_INIT_FLAG_QC8;
5378
5379 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
5380 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
5381 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
5382 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
5383 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_wasmsimd_params;
5384 xnn_params.qc8.gemm.mr = 4;
5385 xnn_params.qc8.gemm.nr = 4;
5386 xnn_params.qc8.gemm.log2_kr = 1;
5387 xnn_params.qc8.gemm.log2_sr = 2;
5388
5389 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x3__wasmsimd_mul16_add16;
5390 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_wasmsimd_params;
5391 xnn_params.qc8.dwconv[0].channel_tile = 16;
5392 xnn_params.qc8.dwconv[0].primary_tile = 3;
5393 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__wasmsimd_mul16_add16;
5394 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_wasmsimd_params;
5395 xnn_params.qc8.dwconv[1].channel_tile = 16;
5396 xnn_params.qc8.dwconv[1].primary_tile = 9;
5397 xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__wasmsimd_mul16_add16;
5398 xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_wasmsimd_params;
5399 xnn_params.qc8.dwconv[2].channel_tile = 16;
5400 xnn_params.qc8.dwconv[2].primary_tile = 25;
5401 #endif // XNN_NO_QC8_OPERATORS
5402
5403 /**************************** QS8 WAsm SIMD micro-kernels****************************/
5404 #ifndef XNN_NO_QS8_OPERATORS
5405 init_flags |= XNN_INIT_FLAG_QS8;
5406
5407 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
5408 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
5409 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
5410 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
5411 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_wasmsimd_params;
5412 xnn_params.qs8.gemm.mr = 4;
5413 xnn_params.qs8.gemm.nr = 4;
5414 xnn_params.qs8.gemm.log2_kr = 1;
5415 xnn_params.qs8.gemm.log2_sr = 2;
5416
5417 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__wasmsimd_mul16_add16;
5418 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_wasmsimd_params;
5419 xnn_params.qs8.dwconv[0].channel_tile = 16;
5420 xnn_params.qs8.dwconv[0].primary_tile = 9;
5421 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x25__wasmsimd_mul16_add16;
5422 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_wasmsimd_params;
5423 xnn_params.qs8.dwconv[1].channel_tile = 16;
5424 xnn_params.qs8.dwconv[1].primary_tile = 25;
5425
5426 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
5427 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16,
5428 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16,
5429 .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params,
5430 .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_wasmsimd_params,
5431 .row_tile = 7,
5432 .channel_tile = 16,
5433 };
5434
5435 xnn_params.qs8.vadd = (struct vbinary_parameters) {
5436 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__wasmsimd_x32,
5437 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x32,
5438 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x32,
5439 .init.qs8_add = xnn_init_qs8_add_minmax_wasmsimd_params,
5440 .element_tile = 32,
5441 };
5442 xnn_params.qs8.vmul = (struct vbinary_parameters) {
5443 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
5444 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
5445 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
5446 .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_wasmsimd_params,
5447 .element_tile = 8,
5448 };
5449
5450 #if XNN_ARCH_WASMRELAXEDSIMD
5451 if (is_wasm_x86) {
5452 xnn_params.qs8.lrelu = (struct vunary_parameters) {
5453 .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x32,
5454 .init.qs8_lrelu = xnn_init_qs8_lrelu_wasmsimd_x86_params,
5455 .element_tile = 32,
5456 };
5457 } else {
5458 xnn_params.qs8.lrelu = (struct vunary_parameters) {
5459 .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_arm_x32,
5460 .init.qs8_lrelu = xnn_init_qs8_lrelu_wasmsimd_arm_params,
5461 .element_tile = 32,
5462 };
5463 }
5464 #else
5465 if (is_wasm_x86) {
5466 xnn_params.qs8.lrelu = (struct vunary_parameters) {
5467 .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vlrelu_ukernel__wasmsimd_x86_x16,
5468 .init.qs8_lrelu = xnn_init_qs8_lrelu_wasmsimd_x86_params,
5469 .element_tile = 16,
5470 };
5471 } else {
5472 xnn_params.qs8.lrelu = (struct vunary_parameters) {
5473 .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vlrelu_ukernel__wasmsimd_arm_x32,
5474 .init.qs8_lrelu = xnn_init_qs8_lrelu_wasmsimd_arm_params,
5475 .element_tile = 32,
5476 };
5477 }
5478 #endif
5479 #endif // XNN_NO_QS8_OPERATORS
5480
5481 /**************************** QU8 WAsm SIMD micro-kernels****************************/
5482 #ifndef XNN_NO_QU8_OPERATORS
5483 init_flags |= XNN_INIT_FLAG_QU8;
5484
5485 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
5486 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
5487 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
5488 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
5489 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_wasmsimd_params;
5490 xnn_params.qu8.gemm.mr = 4;
5491 xnn_params.qu8.gemm.nr = 4;
5492 xnn_params.qu8.gemm.log2_kr = 1;
5493 xnn_params.qu8.gemm.log2_sr = 2;
5494
5495 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__wasmsimd_mul16;
5496 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_wasmsimd_params;
5497 xnn_params.qu8.dwconv[0].channel_tile = 8;
5498 xnn_params.qu8.dwconv[0].primary_tile = 9;
5499 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__wasmsimd_mul16;
5500 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_wasmsimd_params;
5501 xnn_params.qu8.dwconv[1].channel_tile = 8;
5502 xnn_params.qu8.dwconv[1].primary_tile = 25;
5503
5504 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
5505 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1,
5506 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1,
5507 .init.qu8 = xnn_init_qu8_avgpool_minmax_scalar_params,
5508 .primary_tile = 9,
5509 .incremental_tile = 8,
5510 .channel_tile = 1,
5511 };
5512 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
5513 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16,
5514 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16,
5515 .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params,
5516 .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_wasmsimd_params,
5517 .row_tile = 7,
5518 .channel_tile = 16,
5519 };
5520
5521 xnn_params.qu8.vadd = (struct vbinary_parameters) {
5522 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__wasmsimd_x32,
5523 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__wasmsimd_x32,
5524 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__wasmsimd_x32,
5525 .init.qu8_add = xnn_init_qu8_add_minmax_wasmsimd_params,
5526 .element_tile = 32,
5527 };
5528 xnn_params.qu8.vmul = (struct vbinary_parameters) {
5529 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
5530 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
5531 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
5532 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_wasmsimd_params,
5533 .element_tile = 8,
5534 };
5535
5536 #if XNN_ARCH_WASMRELAXEDSIMD
5537 if (is_wasm_x86) {
5538 xnn_params.qu8.lrelu = (struct vunary_parameters) {
5539 .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x32,
5540 .init.qu8_lrelu = xnn_init_qu8_lrelu_wasmsimd_x86_params,
5541 .element_tile = 32,
5542 };
5543 } else {
5544 xnn_params.qu8.lrelu = (struct vunary_parameters) {
5545 .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_arm_x32,
5546 .init.qu8_lrelu = xnn_init_qu8_lrelu_wasmsimd_arm_params,
5547 .element_tile = 32,
5548 };
5549 }
5550 #else
5551 if (is_wasm_x86) {
5552 xnn_params.qu8.lrelu = (struct vunary_parameters) {
5553 .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vlrelu_ukernel__wasmsimd_x86_x16,
5554 .init.qu8_lrelu = xnn_init_qu8_lrelu_wasmsimd_x86_params,
5555 .element_tile = 16,
5556 };
5557 } else {
5558 xnn_params.qu8.lrelu = (struct vunary_parameters) {
5559 .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vlrelu_ukernel__wasmsimd_arm_x32,
5560 .init.qu8_lrelu = xnn_init_qu8_lrelu_wasmsimd_arm_params,
5561 .element_tile = 32,
5562 };
5563 }
5564 #endif
5565 #endif // XNN_NO_QU8_OPERATORS
5566
5567 /**************************** S8 WAsm SIMD micro-kernels****************************/
5568 #ifndef XNN_NO_S8_OPERATORS
5569 init_flags |= XNN_INIT_FLAG_S8;
5570
5571 xnn_params.s8.clamp = (struct vunary_parameters) {
5572 .ukernel = (xnn_vunary_ukernel_function) xnn_s8_vclamp_ukernel__wasmsimd_x64,
5573 .init.s8_minmax = xnn_init_s8_minmax_wasmsimd_params,
5574 .element_tile = 64,
5575 };
5576 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
5577 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__wasmsimd_dot16x2_c8,
5578 .pixel_tile = 1,
5579 .channel_tile = 8,
5580 };
5581 xnn_params.s8.maxpool = (struct maxpool_parameters) {
5582 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__wasmsimd_c16,
5583 .init.s8 = xnn_init_s8_minmax_wasmsimd_params,
5584 .mr = 9,
5585 .qr = 8,
5586 };
5587 #endif // XNN_NO_S8_OPERATORS
5588
5589 /**************************** U8 WAsm SIMD micro-kernels****************************/
5590 #ifndef XNN_NO_U8_OPERATORS
5591 init_flags |= XNN_INIT_FLAG_U8;
5592
5593 xnn_params.u8.clamp = (struct vunary_parameters) {
5594 .ukernel = (xnn_vunary_ukernel_function) xnn_u8_vclamp_ukernel__wasmsimd_x64,
5595 .init.u8_minmax = xnn_init_u8_minmax_wasmsimd_params,
5596 .element_tile = 64,
5597 };
5598 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
5599 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__wasmsimd_dot16x2_c8,
5600 .pixel_tile = 1,
5601 .channel_tile = 8,
5602 };
5603 xnn_params.u8.maxpool = (struct maxpool_parameters) {
5604 .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__wasmsimd_c16,
5605 .init.u8 = xnn_init_u8_minmax_wasmsimd_params,
5606 .mr = 9,
5607 .qr = 8,
5608 };
5609 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
5610 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
5611 #endif // XNN_NO_U8_OPERATORS
5612
5613 /**************************** X8 WAsm SIMD micro-kernels****************************/
5614 #ifndef XNN_NO_X8_OPERATORS
5615 init_flags |= XNN_INIT_FLAG_X8;
5616
5617 xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
5618 xnn_params.x8.zip = (struct zip_parameters) {
5619 .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
5620 .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
5621 .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
5622 .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
5623 };
5624
5625 xnn_params.x8.transpose = (struct transpose_parameters) {
5626 .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x8_transposec_ukernel__2x4_scalar_int,
5627 .tile_size = 32,
5628 };
5629 #endif // XNN_NO_X8_OPERATORS
5630
5631 /**************************** X16 WAsm SIMD micro-kernels****************************/
5632 #ifndef XNN_NO_X16_OPERATORS
5633 init_flags |= XNN_INIT_FLAG_X16;
5634
5635 xnn_params.x16.transpose = (struct transpose_parameters) {
5636 .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x16_transposec_ukernel__2x4_scalar_int,
5637 .tile_size = 32,
5638 };
5639 #endif // XNN_NO_X16_OPERATORS
5640
5641 /**************************** F32 WAsm SIMD micro-kernels****************************/
5642 #ifndef XNN_NO_F32_OPERATORS
5643 init_flags |= XNN_INIT_FLAG_F32;
5644
5645 if (is_wasm_x86) {
5646 #if XNN_ARCH_WASMRELAXEDSIMD
5647 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat);
5648 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat);
5649 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat);
5650 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat);
5651 xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat);
5652 xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat);
5653 xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat);
5654 xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat);
5655 xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat);
5656 xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat);
5657 xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat);
5658 xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat);
5659 #else
5660 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat);
5661 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_splat);
5662 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat);
5663 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_splat);
5664 xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x8__wasmsimd_splat);
5665 xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x8__wasmsimd_splat);
5666 xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat);
5667 xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat);
5668 xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x8__wasmsimd_splat);
5669 xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x8__wasmsimd_splat);
5670 xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__wasmsimd_splat);
5671 xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__wasmsimd_splat);
5672 #endif
5673 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_wasmsimd_params;
5674 xnn_params.f32.gemm.mr = 4;
5675 xnn_params.f32.gemm.nr = 8;
5676
5677 #if XNN_ARCH_WASMRELAXEDSIMD
5678 xnn_params.f32.gemm2.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2c4__wasmrelaxedsimd_fma);
5679 xnn_params.f32.gemm2.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2c4__wasmrelaxedsimd_fma);
5680 xnn_params.f32.gemm2.linear.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2c4__wasmrelaxedsimd_fma);
5681 xnn_params.f32.gemm2.linear.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2c4__wasmrelaxedsimd_fma);
5682 #else
5683 xnn_params.f32.gemm2.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
5684 xnn_params.f32.gemm2.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
5685 xnn_params.f32.gemm2.linear.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
5686 xnn_params.f32.gemm2.linear.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
5687 #endif
5688 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_wasmsimd_params;
5689 xnn_params.f32.gemm2.mr = 4;
5690 xnn_params.f32.gemm2.nr = 2;
5691 xnn_params.f32.gemm2.log2_kr = 2;
5692 } else {
5693 #if XNN_ARCH_WASMRELAXEDSIMD
5694 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_fma_splat);
5695 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_5x8__wasmrelaxedsimd_fma_splat);
5696 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat);
5697 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat);
5698 xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_5x8__wasmrelaxedsimd_fma_splat);
5699 xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_5x8__wasmrelaxedsimd_fma_splat);
5700 xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat);
5701 xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat);
5702 xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_5x8__wasmrelaxedsimd_fma_splat);
5703 xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_5x8__wasmrelaxedsimd_fma_splat);
5704 xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__wasmrelaxedsimd_fma_splat);
5705 xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__wasmrelaxedsimd_fma_splat);
5706 #else
5707 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_splat);
5708 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_arm_splat);
5709 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat);
5710 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_splat);
5711 xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_5x8__wasmsimd_splat);
5712 xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_5x8__wasmsimd_splat);
5713 xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat);
5714 xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat);
5715 xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_5x8__wasmsimd_splat);
5716 xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_5x8__wasmsimd_splat);
5717 xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__wasmsimd_splat);
5718 xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__wasmsimd_splat);
5719 #endif
5720 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_wasmsimd_params;
5721 xnn_params.f32.gemm.mr = 5;
5722 xnn_params.f32.gemm.nr = 8;
5723
5724 #if XNN_ARCH_WASMRELAXEDSIMD
5725 xnn_params.f32.gemm2.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2c4__wasmrelaxedsimd_fma);
5726 xnn_params.f32.gemm2.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2c4__wasmrelaxedsimd_fma);
5727 xnn_params.f32.gemm2.linear.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2c4__wasmrelaxedsimd_fma);
5728 xnn_params.f32.gemm2.linear.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2c4__wasmrelaxedsimd_fma);
5729 #else
5730 xnn_params.f32.gemm2.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
5731 xnn_params.f32.gemm2.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
5732 xnn_params.f32.gemm2.linear.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
5733 xnn_params.f32.gemm2.linear.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
5734 #endif
5735 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_wasmsimd_params;
5736 xnn_params.f32.gemm2.mr = 4;
5737 xnn_params.f32.gemm2.nr = 2;
5738 xnn_params.f32.gemm2.log2_kr = 2;
5739 }
5740
    // f32 depthwise-convolution micro-kernels, indexed by primary_tile (number of
    // taps): dwconv[0]=3, dwconv[1]=4, dwconv[2]=9; dwconv[3]=25 is set further below.
    #if XNN_ARCH_WASMRELAXEDSIMD
      // Relaxed-SIMD build: FMA kernels with channel_tile 8; no runtime x86/arm split.
      xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x3__wasmrelaxedsimd_fma;
      xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up8x3__wasmrelaxedsimd_fma;
      xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
      xnn_params.f32.dwconv[0].channel_tile = 8;
      xnn_params.f32.dwconv[0].primary_tile = 3;

      xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__wasmrelaxedsimd_fma;
      xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up8x4__wasmrelaxedsimd_fma;
      xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
      xnn_params.f32.dwconv[1].channel_tile = 8;
      xnn_params.f32.dwconv[1].primary_tile = 4;

      xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__wasmrelaxedsimd_fma;
      xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up8x9__wasmrelaxedsimd_fma;
      xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
      xnn_params.f32.dwconv[2].channel_tile = 8;
      xnn_params.f32.dwconv[2].primary_tile = 9;
    #else
    // Plain WAsm SIMD build: select x86- or arm-tuned minmax kernels at runtime.
    // The x86 variants use channel_tile 8, the arm variants channel_tile 4.
    if (is_wasm_x86) {
      xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x3__wasmsimd_x86;
      xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up8x3__wasmsimd;
      xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
      xnn_params.f32.dwconv[0].channel_tile = 8;
      xnn_params.f32.dwconv[0].primary_tile = 3;

      xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__wasmsimd_x86;
      xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up8x4__wasmsimd;
      xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
      xnn_params.f32.dwconv[1].channel_tile = 8;
      xnn_params.f32.dwconv[1].primary_tile = 4;

      xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__wasmsimd_x86;
      xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up8x9__wasmsimd;
      xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
      xnn_params.f32.dwconv[2].channel_tile = 8;
      xnn_params.f32.dwconv[2].primary_tile = 9;
    } else {
      xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x3__wasmsimd_arm;
      xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up4x3__wasmsimd;
      xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
      xnn_params.f32.dwconv[0].channel_tile = 4;
      xnn_params.f32.dwconv[0].primary_tile = 3;

      xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x4__wasmsimd_arm;
      xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up4x4__wasmsimd;
      xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
      xnn_params.f32.dwconv[1].channel_tile = 4;
      xnn_params.f32.dwconv[1].primary_tile = 4;

      xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x9__wasmsimd_arm;
      xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up4x9__wasmsimd;
      xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
      xnn_params.f32.dwconv[2].channel_tile = 4;
      xnn_params.f32.dwconv[2].primary_tile = 9;
    }
    #endif

    // dwconv[3]: 25-tap (5x5) depthwise kernel, same relaxed-SIMD/SIMD split as above.
    #if XNN_ARCH_WASMRELAXEDSIMD
      xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__wasmrelaxedsimd_fma;
      xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up8x25__wasmrelaxedsimd_fma;
      xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
      xnn_params.f32.dwconv[3].channel_tile = 8;
      xnn_params.f32.dwconv[3].primary_tile = 25;
    #else
      // NOTE(review): unlike dwconv[0..2], the 25-tap SIMD path registers the arm-tuned
      // minmax kernel unconditionally (no is_wasm_x86 split) — confirm intentional.
      xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x25__wasmsimd_arm;
      xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up4x25__wasmsimd;
      xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
      xnn_params.f32.dwconv[3].channel_tile = 4;
      xnn_params.f32.dwconv[3].primary_tile = 25;
    #endif
5812
    // f32 pooling micro-kernels. avgpool/pavgpool: 9-element unipass with 8-element
    // incremental multipass (9p8x); gavgpool: 7-row unipass with 7-row multipass (7p7x).
    if (is_wasm_x86) {
      xnn_params.f32.avgpool = (struct avgpool_parameters) {
        .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__wasmsimd_x86_c4,
        .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__wasmsimd_x86_c4,
        .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
        .primary_tile = 9,
        .incremental_tile = 8,
        .channel_tile = 4,
      };
      xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
        .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__wasmsimd_x86_c4,
        .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__wasmsimd_x86_c4,
        .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
        .primary_tile = 9,
        .incremental_tile = 8,
        .channel_tile = 4,
      };
      xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
        .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4,
        .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4,
        .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
        .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
        .row_tile = 7,
        .channel_tile = 4,
      };
    } else {
      // arm-tuned variants; tile geometry is identical to the x86 branch.
      xnn_params.f32.avgpool = (struct avgpool_parameters) {
        .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__wasmsimd_arm_c4,
        .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__wasmsimd_arm_c4,
        .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
        .primary_tile = 9,
        .incremental_tile = 8,
        .channel_tile = 4,
      };
      xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
        .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__wasmsimd_arm_c4,
        .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__wasmsimd_arm_c4,
        .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
        .primary_tile = 9,
        .incremental_tile = 8,
        .channel_tile = 4,
      };
      xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
        .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4,
        .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4,
        .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
        .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
        .row_tile = 7,
        .channel_tile = 4,
      };
    }
    // Max pooling: 9-element primary tile with 8-element overflow tile (9p8x).
    if (is_wasm_x86) {
      xnn_params.f32.maxpool = (struct maxpool_parameters) {
        .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__wasmsimd_x86_c4,
        .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
        .mr = 9,
        .qr = 8,
      };
    } else {
      xnn_params.f32.maxpool = (struct maxpool_parameters) {
        .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__wasmsimd_arm_c4,
        .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
        .mr = 9,
        .qr = 8,
      };
    }
    // Argmax pooling: two unipass variants (4- and 9-element windows) plus a
    // 9p8x multipass fallback for larger windows.
    xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
      .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__wasmsimd_c4,
      .mr = 4,
    };
    xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
      .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__wasmsimd_c4,
      .mr = 9,
    };
    xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
      .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__wasmsimd_c4,
      .mr = 9,
      .qr = 8,
    };
    // Indirect bilinear interpolation (NHWC), 8 channels per iteration.
    xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
      .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__wasmsimd_c8,
      .pixel_tile = 1,
      .channel_tile = 8,
    };
    // f32 element-wise unary micro-kernels.
    xnn_params.f32.abs = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vabs_ukernel__wasmsimd_x8,
      .init.f32_abs = xnn_init_f32_abs_wasmsimd_params,
      // NOTE(review): ukernel is an _x8 variant but element_tile is 16; siblings
      // (clamp, rndne, rndz, sqrt) pair _x8 kernels with element_tile 8 — confirm.
      .element_tile = 16,
    };
    if (is_wasm_x86) {
      xnn_params.f32.clamp = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vclamp_ukernel__wasmsimd_x86_x8,
        .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
        .element_tile = 8,
      };
    } else {
      xnn_params.f32.clamp = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vclamp_ukernel__wasmsimd_arm_x8,
        .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
        .element_tile = 8,
      };
    }
    // ELU: rr2_p6 = range-reduction with 2 steps + degree-6 polynomial approximation
    // (name taken from the micro-kernel identifier).
    if (is_wasm_x86) {
      xnn_params.f32.elu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_velu_ukernel__wasmsimd_x86_rr2_p6_x20,
        .init.f32_elu = xnn_init_f32_elu_wasmsimd_rr2_p6_params,
        .element_tile = 20,
      };
    } else {
      xnn_params.f32.elu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_x20,
        .init.f32_elu = xnn_init_f32_elu_wasmsimd_rr2_p6_params,
        .element_tile = 20,
      };
    }
    xnn_params.f32.hswish = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vhswish_ukernel__wasmsimd_x16,
      .init.f32_hswish = xnn_init_f32_hswish_wasmsimd_params,
      .element_tile = 16,
    };
    // Leaky ReLU: minmax-based formulation on x86, bitselect-based on arm
    // (per the micro-kernel names).
    if (is_wasm_x86) {
      xnn_params.f32.lrelu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vlrelu_ukernel__wasmsimd_minmax_x8,
        .init.f32_lrelu = xnn_init_f32_lrelu_wasmsimd_params,
        .element_tile = 8,
      };
    } else {
      xnn_params.f32.lrelu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vlrelu_ukernel__wasmsimd_bitselect_x8,
        .init.f32_lrelu = xnn_init_f32_lrelu_wasmsimd_params,
        .element_tile = 8,
      };
    }
    xnn_params.f32.neg = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vneg_ukernel__wasmsimd_x8,
      .init.f32_neg = xnn_init_f32_neg_wasmsimd_params,
      // NOTE(review): _x8 kernel with element_tile 16 — same mismatch as abs; confirm.
      .element_tile = 16,
    };
    xnn_params.f32.relu = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrelu_ukernel__wasmsimd_x16,
      .element_tile = 16,
    };
    // Rounding variants: to-nearest-even, toward-zero, up (ceil), down (floor).
    xnn_params.f32.rndne = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndne_ukernel__wasmsimd_x8,
      .element_tile = 8,
    };
    xnn_params.f32.rndz = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndz_ukernel__wasmsimd_x8,
      .element_tile = 8,
    };
    xnn_params.f32.rndu = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndu_ukernel__wasmsimd_x8,
      .element_tile = 8,
    };
    xnn_params.f32.rndd = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndd_ukernel__wasmsimd_x8,
      .element_tile = 8,
    };
    xnn_params.f32.sigmoid = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsigmoid_ukernel__wasmsimd_rr2_p5_div_x16,
      .init.f32_sigmoid = xnn_init_f32_sigmoid_wasmsimd_rr2_p5_params,
      .element_tile = 16,
    };
    xnn_params.f32.sqr = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsqr_ukernel__wasmsimd_x8,
      // NOTE(review): _x8 kernel with element_tile 16 — same mismatch as abs/neg; confirm.
      .element_tile = 16,
    };
    xnn_params.f32.sqrt = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsqrt_ukernel__wasmsimd_sqrt_x8,
      .element_tile = 8,
    };
    // PReLU: 2 rows x 8 channels per iteration; minmax vs bitselect formulation
    // mirrors the lrelu selection above.
    if (is_wasm_x86) {
      xnn_params.f32.prelu = (struct prelu_parameters) {
        .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__wasmsimd_minmax_2x8,
        .row_tile = 2,
        .channel_tile = 8,
      };
    } else {
      xnn_params.f32.prelu = (struct prelu_parameters) {
        .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__wasmsimd_bitselect_2x8,
        .row_tile = 2,
        .channel_tile = 8,
      };
    }
    // Softmax building block: reduce-add-store of exp(x - max), 16 elements with
    // 2 accumulators per iteration.
    xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
      .ukernel = (xnn_raddstoreexpminusmax_ukernel_function) xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_x16_acc2,
      .init.f32 = xnn_init_f32_expminus_wasmsimd_rr2_p5_params,
      .element_tile = 16,
    };
    // f32 element-wise binary micro-kernels. For each op: op_ukernel (a OP b),
    // opc_ukernel (a OP scalar-c), ropc_ukernel (scalar-c OP a). Commutative ops
    // (add, mul, max, min, sqrdiff) reuse opc for ropc; non-commutative ops
    // (div, sub) use dedicated reversed kernels (vrdivc, vrsubc).
    if (is_wasm_x86) {
      xnn_params.f32.rmax = (xnn_rmax_ukernel_function) xnn_f32_rmax_ukernel__wasmsimd_x86;
      xnn_params.f32.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__wasmsimd_x86_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x16,
        .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_ukernel__wasmsimd_x16,
        .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
        .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
        .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
        .element_tile = 16,
      };
      xnn_params.f32.vdiv = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__wasmsimd_x86_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__wasmsimd_x86_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__wasmsimd_x86_x16,
        .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_ukernel__wasmsimd_x16,
        .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_ukernel__wasmsimd_x16,
        .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_ukernel__wasmsimd_x16,
        .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
        .element_tile = 16,
      };
      // max/min have no separate linear variants and no init params registered.
      xnn_params.f32.vmax = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__wasmsimd_x86_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_x86_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_x86_x16,
        .element_tile = 16,
      };
      xnn_params.f32.vmin = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__wasmsimd_x86_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_x86_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_x86_x16,
        .element_tile = 16,
      };
      xnn_params.f32.vmul = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__wasmsimd_x86_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x16,
        .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_ukernel__wasmsimd_x16,
        .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
        .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
        .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
        .element_tile = 16,
      };
      xnn_params.f32.vsub = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__wasmsimd_x86_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__wasmsimd_x86_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__wasmsimd_x86_x16,
        .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_ukernel__wasmsimd_x16,
        .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_ukernel__wasmsimd_x16,
        .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_ukernel__wasmsimd_x16,
        .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
        .element_tile = 16,
      };
    } else {
      // arm-tuned minmax kernels; linear (unclamped) kernels are shared with x86.
      xnn_params.f32.rmax = (xnn_rmax_ukernel_function) xnn_f32_rmax_ukernel__wasmsimd_arm;
      xnn_params.f32.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__wasmsimd_arm_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x16,
        .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_ukernel__wasmsimd_x16,
        .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
        .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
        .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
        .element_tile = 16,
      };
      xnn_params.f32.vdiv = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__wasmsimd_arm_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__wasmsimd_arm_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__wasmsimd_arm_x16,
        .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_ukernel__wasmsimd_x16,
        .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_ukernel__wasmsimd_x16,
        .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_ukernel__wasmsimd_x16,
        .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
        .element_tile = 16,
      };
      xnn_params.f32.vmax = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__wasmsimd_arm_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_arm_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_arm_x16,
        .element_tile = 16,
      };
      xnn_params.f32.vmin = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__wasmsimd_arm_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_arm_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_arm_x16,
        .element_tile = 16,
      };
      xnn_params.f32.vmul = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__wasmsimd_arm_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x16,
        .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_ukernel__wasmsimd_x16,
        .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
        .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
        .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
        .element_tile = 16,
      };
      xnn_params.f32.vsub = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__wasmsimd_arm_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__wasmsimd_arm_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__wasmsimd_arm_x16,
        .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_ukernel__wasmsimd_x16,
        .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_ukernel__wasmsimd_x16,
        .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_ukernel__wasmsimd_x16,
        .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
        .element_tile = 16,
      };
    }
    // Squared difference is arch-agnostic (single wasmsimd kernel, no minmax params).
    xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__wasmsimd_x16,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__wasmsimd_x16,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__wasmsimd_x16,
      .element_tile = 16,
    };
    // Fused multiply-add-with-per-channel-constants (vmulcaddc): 4 channels x 2 rows.
    if (is_wasm_x86) {
      xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
        .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__wasmsimd_x86_2x,
        .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
        .channel_tile = 4,
        .row_tile = 2,
      };
    } else {
      xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
        .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__wasmsimd_arm_2x,
        .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
        .channel_tile = 4,
        .row_tile = 2,
      };
    }
    // NCHW (channels-first) operator micro-kernels, compiled out when
    // XNN_NO_NCHW_OPERATORS is defined.
    #ifndef XNN_NO_NCHW_OPERATORS
      init_flags |= XNN_INIT_FLAG_CHW_OPT;

      // Sparse matrix-dense matrix multiplication: 32-row x 1-column output tile.
      if (is_wasm_x86) {
        xnn_params.f32.spmm = (struct spmm_parameters) {
          .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_x86,
          .mr = 32,
          .nr = 1,
        };
      } else {
        xnn_params.f32.spmm = (struct spmm_parameters) {
          .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_arm,
          .mr = 32,
          .nr = 1,
        };
      }
      // HWC->CHW layout-converting 3x3 stride-2 convolution (3 input channels,
      // symmetric padding 1), producing a 2x2 output tile of 4 channels.
      xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
        .ukernel_with_symm_padding =
          (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2,
        .output_channel_tile = 4,
        .output_height_tile = 2,
        .output_width_tile = 2,
      };
      // CHW depthwise convolutions: 3x3 (pad 1), 3x3 stride 2, 5x5 (pad 2),
      // 5x5 stride 2; x86 and arm branches differ only in kernel tuning
      // (loadsplat/splat variants and accumulator counts, per the kernel names).
      if (is_wasm_x86) {
        xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_2x4,
          .output_width_tile = 4,
          .output_height_tile = 2,
        };
        xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_1x4_acc2,
          .output_width_tile = 4,
          .output_height_tile = 1,
        };
        xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4,
          .output_width_tile = 4,
          .output_height_tile = 3,
        };
        xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc2,
          .output_width_tile = 4,
          .output_height_tile = 1,
        };
      } else {
        xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_2x4,
          .output_width_tile = 4,
          .output_height_tile = 2,
        };
        xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_1x4_acc4,
          .output_width_tile = 4,
          .output_height_tile = 1,
        };
        xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4,
          .output_width_tile = 4,
          .output_height_tile = 3,
        };
        xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc2,
          .output_width_tile = 4,
          .output_height_tile = 1,
        };
      }
      // Channels-first global average pooling, 4 channels per iteration.
      if (is_wasm_x86) {
        xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
          .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__wasmsimd_x86_x4,
          .channel_tile = 4,
        };
      } else {
        xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
          .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__wasmsimd_arm_x4,
          .channel_tile = 4,
        };
      }
      // Channels-first bilinear interpolation: 1 channel, 8 pixels per iteration
      // (note the tile roles are swapped relative to the NHWC ibilinear entry).
      xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
        .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8,
        .channel_tile = 1,
        .pixel_tile = 8,
      };
    #endif  // XNN_NO_NCHW_OPERATORS
  #endif  // XNN_NO_F32_OPERATORS
6216
  /*************************** VCVT WAsm SIMD micro-kernels***************************/
  #ifndef XNN_NO_VCVT_OPERATORS
    init_flags |= XNN_INIT_FLAG_VCVT;

    // Element-wise type-conversion micro-kernels (f16<->f32, f32<->qs8/qu8,
    // qs8/qu8 requantization and dequantization).
    xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_x16,
      .init.f16_f32_cvt = xnn_init_f16_f32_cvt_wasmsimd_int16_params,
      .element_tile = 16,
    };
    xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_f16_vcvt_ukernel__wasmsimd_x24,
      .init.f32_f16_cvt = xnn_init_f32_f16_cvt_wasmsimd_params,
      .element_tile = 24,
    };
    // f32->quantized conversions use the "magic" (magic-constant rounding) variant.
    xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qs8_vcvt_ukernel__wasmsimd_magic_x32,
      .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_wasmsimd_magic_params,
      .element_tile = 32,
    };
    xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qu8_vcvt_ukernel__wasmsimd_magic_x32,
      .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_wasmsimd_magic_params,
      .element_tile = 32,
    };
    // qs8/qu8 requantization: relaxed-SIMD builds use a wider x32 kernel; the
    // init-params helpers are shared with the plain-SIMD build.
    #if XNN_ARCH_WASMRELAXEDSIMD
      xnn_params.vcvt.qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vcvt_ukernel__wasmrelaxedsimd_x32,
        .init.qs8_cvt = xnn_init_qs8_cvt_wasmsimd_params,
        .element_tile = 32,
      };
    #else
      xnn_params.vcvt.qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vcvt_ukernel__wasmsimd_x16,
        .init.qs8_cvt = xnn_init_qs8_cvt_wasmsimd_params,
        .element_tile = 16,
      };
    #endif
    xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_f32_vcvt_ukernel__wasmsimd_x32,
      .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_wasmsimd_params,
      .element_tile = 32,
    };
    #if XNN_ARCH_WASMRELAXEDSIMD
      xnn_params.vcvt.qu8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vcvt_ukernel__wasmrelaxedsimd_x32,
        .init.qu8_cvt = xnn_init_qu8_cvt_wasmsimd_params,
        .element_tile = 32,
      };
    #else
      xnn_params.vcvt.qu8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vcvt_ukernel__wasmsimd_x16,
        .init.qu8_cvt = xnn_init_qu8_cvt_wasmsimd_params,
        .element_tile = 16,
      };
    #endif
    xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_f32_vcvt_ukernel__wasmsimd_x32,
      .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_wasmsimd_params,
      .element_tile = 32,
    };
  #endif  // XNN_NO_VCVT_OPERATORS
6278
  /**************************** X32 WAsm SIMD micro-kernels****************************/
  #ifndef XNN_NO_X32_OPERATORS
    init_flags |= XNN_INIT_FLAG_X32;

    // Type-agnostic 32-bit-element micro-kernels (unpooling, channel zip/unzip,
    // transposition).
    xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__wasmsimd;
    xnn_params.x32.zip = (struct zip_parameters) {
      .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__wasmsimd,
      .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__wasmsimd,
      .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__wasmsimd,
      .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__wasmsimd,
    };

    xnn_params.x32.transpose = (struct transpose_parameters) {
      // NOTE(review): this registers a scalar transpose micro-kernel
      // (__2x4_scalar_int) even though this is the WAsm SIMD section — presumably
      // no wasmsimd transpose kernel exists yet; confirm against upstream.
      .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x32_transposec_ukernel__2x4_scalar_int,
      .tile_size = 32,
    };
  #endif  // XNN_NO_X32_OPERATORS

  /**************************** XX WAsm SIMD micro-kernels****************************/
  #ifndef XNN_NO_XX_OPERATORS
    init_flags |= XNN_INIT_FLAG_XX;

    // Datatype-agnostic byte-level micro-kernels: copy, fill, pad, and a
    // memcpy-based variable-size transpose.
    xnn_params.xx.copy = (xnn_vunary_ukernel_function) xnn_xx_copy_ukernel__memcpy;
    xnn_params.xx.fill = (struct fill_parameters) {
      .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__wasmsimd_x64,
      .row_tile = 1,
    };
    xnn_params.xx.pad = (struct pad_parameters) {
      .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__wasmsimd,
      .row_tile = 1,
    };
    xnn_params.xx.transpose = (struct transpose_parameters) {
      .variable_size_ukernel = xnn_xx_transposev_ukernel__1x1_memcpy,
      .tile_size = 32,
    };
  #endif
6315
6316#elif XNN_ARCH_WASM
6317
6318 /**************************** QC8 WAsm micro-kernels****************************/
6319 #ifndef XNN_NO_QC8_OPERATORS
6320 init_flags |= XNN_INIT_FLAG_QC8;
6321
6322 if (is_wasm_x86) {
6323 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic);
6324 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x2__scalar_imagic);
6325 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic);
6326 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic);
6327 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_imagic_params;
6328 xnn_params.qc8.gemm.mr = 2;
6329 xnn_params.qc8.gemm.nr = 2;
6330 } else {
6331 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
6332 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
6333 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
6334 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
6335 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_fmagic_params;
6336 xnn_params.qc8.gemm.mr = 4;
6337 xnn_params.qc8.gemm.nr = 4;
6338 }
6339
    // QC8 depthwise-convolution kernel selection. Two engine flavors:
    // "x86-style" WAsm runtimes get scalar kernels with "imagic" requantization
    // params; all other WAsm runtimes get WAsm kernels with "fmagic" params
    // (suffixes presumably integer-magic vs. float-magic rounding — confirm
    // against the kernel sources). The channel_tile/primary_tile fields must
    // agree with the upCxP suffix of the selected kernel (up2x9 -> 2, 9).
    if (is_wasm_x86) {
      xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x3__scalar_imagic;
      xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_imagic_params;
      xnn_params.qc8.dwconv[0].channel_tile = 2;
      xnn_params.qc8.dwconv[0].primary_tile = 3;
      xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x9__scalar_imagic;
      xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_imagic_params;
      xnn_params.qc8.dwconv[1].channel_tile = 2;
      xnn_params.qc8.dwconv[1].primary_tile = 9;
      // Note: the 25-tap imagic kernel used here is up1x25 (channel_tile 1),
      // unlike the up2x25 fmagic kernel in the else branch.
      xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up1x25__scalar_imagic;
      xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_imagic_params;
      xnn_params.qc8.dwconv[2].channel_tile = 1;
      xnn_params.qc8.dwconv[2].primary_tile = 25;
    } else {
      xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x3__wasm_fmagic;
      xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_fmagic_params;
      xnn_params.qc8.dwconv[0].channel_tile = 2;
      xnn_params.qc8.dwconv[0].primary_tile = 3;
      xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x9__wasm_fmagic;
      xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_fmagic_params;
      xnn_params.qc8.dwconv[1].channel_tile = 2;
      xnn_params.qc8.dwconv[1].primary_tile = 9;
      xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x25__wasm_fmagic;
      xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_fmagic_params;
      xnn_params.qc8.dwconv[2].channel_tile = 2;
      xnn_params.qc8.dwconv[2].primary_tile = 25;
    }
  #endif  // XNN_NO_QC8_OPERATORS
6368
  /**************************** QS8 WAsm micro-kernels ****************************/
  #ifndef XNN_NO_QS8_OPERATORS
    init_flags |= XNN_INIT_FLAG_QS8;

    // QS8 GEMM/IGEMM: x86-style WAsm engines use small 2x2 scalar imagic tiles;
    // other engines use larger 4x4 WAsm fmagic tiles. The MR-1 entries provide
    // single-row fallbacks; mr/nr must match the MxN suffix of the MR-tile kernel.
    if (is_wasm_x86) {
      xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic);
      xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_2x2__scalar_imagic);
      xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic);
      xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic);
      xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params;
      xnn_params.qs8.gemm.mr = 2;
      xnn_params.qs8.gemm.nr = 2;
    } else {
      xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
      xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
      xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
      xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
      xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
      xnn_params.qs8.gemm.mr = 4;
      xnn_params.qs8.gemm.nr = 4;
    }

    // QS8 depthwise convolution: slot 0 is the 9-tap (3x3) kernel, slot 1 the
    // 25-tap (5x5) kernel; tiles must match the upCxP kernel-name suffix.
    if (is_wasm_x86) {
      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_imagic;
      xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params;
      xnn_params.qs8.dwconv[0].channel_tile = 2;
      xnn_params.qs8.dwconv[0].primary_tile = 9;
      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up1x25__scalar_imagic;
      xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params;
      xnn_params.qs8.dwconv[1].channel_tile = 1;
      xnn_params.qs8.dwconv[1].primary_tile = 25;
    } else {
      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__wasm_fmagic;
      xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
      xnn_params.qs8.dwconv[0].channel_tile = 2;
      xnn_params.qs8.dwconv[0].primary_tile = 9;
      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up2x25__wasm_fmagic;
      xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
      xnn_params.qs8.dwconv[1].channel_tile = 2;
      xnn_params.qs8.dwconv[1].primary_tile = 25;
    }

    // Global average pooling: 7x__..._c4 kernel -> row_tile 7, channel_tile 4.
    xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
      .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4,
      .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4,
      .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params,
      .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_scalar_imagic_params,
      .row_tile = 7,
      .channel_tile = 4,
    };

    // Elementwise add/mul. opc handles op(vector, constant); ropc is the
    // reversed-operand form — identical to opc here since add/mul commute.
    xnn_params.qs8.vadd = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__scalar_x4,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x4,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x4,
      .init.qs8_add = xnn_init_qs8_add_minmax_scalar_params,
      .element_tile = 4,
    };
    xnn_params.qs8.vmul = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__scalar_x4,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
      .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_scalar_params,
      .element_tile = 4,
    };

    // Leaky ReLU: "select"-based variant on x86-style engines, "andxor"
    // bit-trick variant elsewhere; init params must match the kernel variant.
    if (is_wasm_x86) {
      xnn_params.qs8.lrelu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vlrelu_ukernel__scalar_select_x4,
        .init.qs8_lrelu = xnn_init_qs8_lrelu_scalar_select_params,
        .element_tile = 4,
      };
    } else {
      xnn_params.qs8.lrelu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vlrelu_ukernel__scalar_andxor_x4,
        .init.qs8_lrelu = xnn_init_qs8_lrelu_scalar_andxor_params,
        .element_tile = 4,
      };
    }
  #endif  // XNN_NO_QS8_OPERATORS
6449
  /**************************** QU8 WAsm micro-kernels ****************************/
  #ifndef XNN_NO_QU8_OPERATORS
    init_flags |= XNN_INIT_FLAG_QU8;

    // QU8 mirrors the QS8 setup above with unsigned-quantized kernels:
    // 2x2 scalar imagic tiles on x86-style WAsm engines, 4x4 WAsm fmagic
    // tiles elsewhere; mr/nr must match the MxN suffix of the MR-tile kernel.
    if (is_wasm_x86) {
      xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic);
      xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_2x2__scalar_imagic);
      xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic);
      xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic);
      xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params;
      xnn_params.qu8.gemm.mr = 2;
      xnn_params.qu8.gemm.nr = 2;
    } else {
      xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
      xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
      xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
      xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
      xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
      xnn_params.qu8.gemm.mr = 4;
      xnn_params.qu8.gemm.nr = 4;
    }

    // QU8 depthwise convolution: slot 0 = 9-tap (3x3), slot 1 = 25-tap (5x5);
    // channel_tile/primary_tile must match the upCxP kernel-name suffix.
    if (is_wasm_x86) {
      xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up2x9__scalar_imagic;
      xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params;
      xnn_params.qu8.dwconv[0].channel_tile = 2;
      xnn_params.qu8.dwconv[0].primary_tile = 9;
      xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up1x25__scalar_imagic;
      xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params;
      xnn_params.qu8.dwconv[1].channel_tile = 1;
      xnn_params.qu8.dwconv[1].primary_tile = 25;
    } else {
      xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up2x9__wasm_fmagic;
      xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
      xnn_params.qu8.dwconv[0].channel_tile = 2;
      xnn_params.qu8.dwconv[0].primary_tile = 9;
      xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up2x25__wasm_fmagic;
      xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
      xnn_params.qu8.dwconv[1].channel_tile = 2;
      xnn_params.qu8.dwconv[1].primary_tile = 25;
    }

    // Local average pooling: 9x/9p8x kernels -> primary_tile 9, incremental_tile 8.
    xnn_params.qu8.avgpool = (struct avgpool_parameters) {
      .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1,
      .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1,
      .init.qu8 = xnn_init_qu8_avgpool_minmax_scalar_params,
      .primary_tile = 9,
      .incremental_tile = 8,
      .channel_tile = 1,
    };
    // Global average pooling: 7x__..._c4 kernel -> row_tile 7, channel_tile 4.
    xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
      .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4,
      .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4,
      .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params,
      .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_scalar_imagic_params,
      .row_tile = 7,
      .channel_tile = 4,
    };

    // Elementwise add/mul; ropc (reversed operand-constant) reuses the opc
    // kernel because add and mul are commutative.
    xnn_params.qu8.vadd = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__scalar_x4,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x4,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x4,
      .init.qu8_add = xnn_init_qu8_add_minmax_scalar_params,
      .element_tile = 4,
    };
    xnn_params.qu8.vmul = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__scalar_x4,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
      .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_scalar_params,
      .element_tile = 4,
    };

    // Leaky ReLU: "select" variant for x86-style engines, "andxor" otherwise;
    // the init function must match the chosen kernel variant.
    if (is_wasm_x86) {
      xnn_params.qu8.lrelu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vlrelu_ukernel__scalar_select_x4,
        .init.qu8_lrelu = xnn_init_qu8_lrelu_scalar_select_params,
        .element_tile = 4,
      };
    } else {
      xnn_params.qu8.lrelu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vlrelu_ukernel__scalar_andxor_x4,
        .init.qu8_lrelu = xnn_init_qu8_lrelu_scalar_andxor_params,
        .element_tile = 4,
      };
    }
  #endif  // XNN_NO_QU8_OPERATORS
6538
  /**************************** S8 WAsm micro-kernels ****************************/
  #ifndef XNN_NO_S8_OPERATORS
    init_flags |= XNN_INIT_FLAG_S8;

    // Signed 8-bit ops used by quantized operators: clamp, bilinear resize,
    // and max pooling — all scalar kernels, no per-engine dispatch here.
    xnn_params.s8.clamp = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_s8_vclamp_ukernel__scalar_x4,
      .init.s8_minmax = xnn_init_s8_minmax_scalar_params,
      .element_tile = 4,
    };
    xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
      .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__scalar_c1,
      .pixel_tile = 1,
      .channel_tile = 1,
    };
    // 9p8x kernel -> mr 9 (primary pass), qr 8 (each incremental pass).
    xnn_params.s8.maxpool = (struct maxpool_parameters) {
      .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__scalar_c1,
      .init.s8 = xnn_init_s8_minmax_scalar_params,
      .mr = 9,
      .qr = 8,
    };
  #endif  // XNN_NO_S8_OPERATORS
6560
  /**************************** U8 WAsm micro-kernels ****************************/
  #ifndef XNN_NO_U8_OPERATORS
    init_flags |= XNN_INIT_FLAG_U8;

    // Unsigned 8-bit counterparts of the S8 block above, plus the lut32norm
    // and rmax reduction kernels used by the softmax implementation.
    xnn_params.u8.clamp = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_u8_vclamp_ukernel__scalar_x4,
      .init.u8_minmax = xnn_init_u8_minmax_scalar_params,
      .element_tile = 4,
    };
    xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
      .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__scalar_c1,
      .pixel_tile = 1,
      .channel_tile = 1,
    };
    // 9p8x kernel -> mr 9 (primary pass), qr 8 (each incremental pass).
    xnn_params.u8.maxpool = (struct maxpool_parameters) {
      .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__scalar_c1,
      .init.u8 = xnn_init_u8_minmax_scalar_params,
      .mr = 9,
      .qr = 8,
    };
    xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
    xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
  #endif  // XNN_NO_U8_OPERATORS
6584
  /**************************** X8 WAsm micro-kernels ****************************/
  #ifndef XNN_NO_X8_OPERATORS
    init_flags |= XNN_INIT_FLAG_X8;

    // Type-agnostic 8-bit kernels: table lookup, channel-interleaving (zip)
    // for 2/3/4/variable groups, and constant-size transposition.
    xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
    xnn_params.x8.zip = (struct zip_parameters) {
      .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
      .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
      .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
      .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
    };

    xnn_params.x8.transpose = (struct transpose_parameters) {
      .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x8_transposec_ukernel__2x4_scalar_int,
      .tile_size = 32,
    };
  #endif  // XNN_NO_X8_OPERATORS
6602
  /**************************** X16 WAsm micro-kernels ****************************/
  #ifndef XNN_NO_X16_OPERATORS
    init_flags |= XNN_INIT_FLAG_X16;

    // Type-agnostic 16-bit kernels: only transposition is registered here.
    xnn_params.x16.transpose = (struct transpose_parameters) {
      .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x16_transposec_ukernel__2x4_scalar_int,
      .tile_size = 32,
    };
  #endif  // XNN_NO_X16_OPERATORS
6612
  /**************************** F32 WAsm micro-kernels ****************************/
  #ifndef XNN_NO_F32_OPERATORS
    init_flags |= XNN_INIT_FLAG_F32;

    // F32 GEMM/IGEMM in three activation flavors (minmax, relu, linear),
    // each with an MR-tile kernel and a 1-row fallback. x86-style WAsm engines
    // use mr=2 tiles; other engines use mr=4. The "linear" (no-clamp) flavor
    // uses __scalar kernels in both branches, while clamped flavors prefer
    // __wasm variants where available. mr/nr must match the MxN kernel suffix.
    if (is_wasm_x86) {
      xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_2x4__scalar);
      xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_2x4__scalar);
      xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x4__wasm);
      xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x4__wasm);
      xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_2x4__scalar);
      xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_2x4__scalar);
      xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x4__wasm);
      xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x4__wasm);
      xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_2x4__scalar);
      xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_2x4__scalar);
      xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__scalar);
      xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__scalar);
      xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
      xnn_params.f32.gemm.mr = 2;
      xnn_params.f32.gemm.nr = 4;
    } else {
      xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x4__wasm);
      xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x4__wasm);
      xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x4__wasm);
      xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x4__wasm);
      xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x4__wasm);
      xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x4__wasm);
      xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x4__wasm);
      xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x4__wasm);
      xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x4__scalar);
      xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x4__scalar);
      xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__scalar);
      xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__scalar);
      xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
      xnn_params.f32.gemm.mr = 4;
      xnn_params.f32.gemm.nr = 4;
    }
    // Secondary GEMM (nr=2), used where a narrower N tile fits better;
    // same 4x2 configuration on all WAsm engines.
    xnn_params.f32.gemm2.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__wasm);
    xnn_params.f32.gemm2.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__wasm);
    xnn_params.f32.gemm2.linear.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2__scalar);
    xnn_params.f32.gemm2.linear.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__scalar);
    xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
    xnn_params.f32.gemm2.mr = 4;
    xnn_params.f32.gemm2.nr = 2;
6657
    // F32 depthwise convolution, one slot per supported tap count
    // (3, 4, 9, 25); clamped ("minmax") path uses __wasm kernels, the
    // unclamped ("linear") path uses __scalar kernels. primary_tile must
    // match the xP part of the up1xP kernel-name suffix.
    xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x3__wasm_acc2;
    xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x3__scalar_acc2;
    xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
    xnn_params.f32.dwconv[0].channel_tile = 1;
    xnn_params.f32.dwconv[0].primary_tile = 3;

    xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x4__wasm_acc2;
    xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x4__scalar_acc2;
    xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
    xnn_params.f32.dwconv[1].channel_tile = 1;
    xnn_params.f32.dwconv[1].primary_tile = 4;

    xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x9__wasm_acc2;
    xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x9__scalar_acc2;
    xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
    xnn_params.f32.dwconv[2].channel_tile = 1;
    xnn_params.f32.dwconv[2].primary_tile = 9;

    xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x25__wasm_acc2;
    xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x25__scalar_acc2;
    xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
    xnn_params.f32.dwconv[3].channel_tile = 1;
    xnn_params.f32.dwconv[3].primary_tile = 25;
6681
    // F32 pooling and resize kernels. Tile fields mirror the kernel-name
    // suffixes (9x/9p8x -> primary 9 + incremental 8; 7x/7p7x -> row_tile 7).
    xnn_params.f32.avgpool = (struct avgpool_parameters) {
      .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__wasm_c1,
      .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__wasm_c1,
      .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
      .primary_tile = 9,
      .incremental_tile = 8,
      .channel_tile = 1,
    };
    // Pixelwise (per-element scale) average pooling.
    xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
      .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__wasm_c1,
      .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__wasm_c1,
      .init.f32 = xnn_init_f32_minmax_scalar_params,
      .primary_tile = 9,
      .incremental_tile = 8,
      .channel_tile = 1,
    };
    xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
      .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__wasm_c1,
      .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1,
      .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
      .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
      .row_tile = 7,
      .channel_tile = 1,
    };
    xnn_params.f32.maxpool = (struct maxpool_parameters) {
      .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__wasm_c1,
      .init.f32 = xnn_init_f32_minmax_scalar_params,
      .mr = 9,
      .qr = 8,
    };
    // Argmax pooling: slots 0/1 are unipass kernels for up to 4 and 9
    // elements; slot 2 is the multipass kernel (9 primary + 8 per extra pass).
    xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
      .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__scalar_c1,
      .mr = 4,
    };
    xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
      .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__scalar_c1,
      .mr = 9,
    };
    xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
      .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1,
      .mr = 9,
      .qr = 8,
    };
    xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
      .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__scalar_c2,
      .pixel_tile = 1,
      .channel_tile = 2,
    };
    // F32 elementwise unary kernels. element_tile must match the _xN suffix.
    // Where both scalar and WAsm variants exist, x86-style engines take the
    // scalar one and everything else the __wasm one.
    xnn_params.f32.abs = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vabs_ukernel__scalar_x4,
      .element_tile = 4,
    };
    xnn_params.f32.clamp = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vclamp_ukernel__wasm_x4,
      .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
      .element_tile = 4,
    };
    if (is_wasm_x86) {
      xnn_params.f32.hswish = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vhswish_ukernel__scalar_x4,
        .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
        .element_tile = 4,
      };
    } else {
      xnn_params.f32.hswish = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vhswish_ukernel__wasm_x4,
        .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
        .element_tile = 4,
      };
    }
    // ELU: the two branches use different polynomial approximations
    // (lut16_p3 vs. p6), so the init params must match the kernel variant.
    if (is_wasm_x86) {
      xnn_params.f32.elu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x2,
        .init.f32_elu = xnn_init_f32_elu_scalar_rr2_lut16_p3_params,
        .element_tile = 2,
      };
    } else {
      xnn_params.f32.elu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_velu_ukernel__wasm_rr2_p6_x6,
        .init.f32_elu = xnn_init_f32_elu_scalar_rr2_p6_params,
        .element_tile = 6,
      };
    }
    xnn_params.f32.lrelu = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vlrelu_ukernel__scalar_x4,
      .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
      .element_tile = 4,
    };
    xnn_params.f32.neg = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vneg_ukernel__scalar_x4,
      .element_tile = 4,
    };
    if (is_wasm_x86) {
      xnn_params.f32.relu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrelu_ukernel__scalar_x8,
        .element_tile = 8,
      };
    } else {
      xnn_params.f32.relu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrelu_ukernel__wasm_x8,
        .element_tile = 8,
      };
    }
    // Rounding kernels (to-nearest-even, toward-zero, up, down) — all libm-
    // backed scalar variants; no init params needed.
    xnn_params.f32.rndne = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndne_ukernel__scalar_libm_x4,
      .element_tile = 4,
    };
    xnn_params.f32.rndz = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndz_ukernel__scalar_libm_x4,
      .element_tile = 4,
    };
    xnn_params.f32.rndu = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndu_ukernel__scalar_libm_x4,
      .element_tile = 4,
    };
    xnn_params.f32.rndd = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndd_ukernel__scalar_libm_x4,
      .element_tile = 4,
    };
    // Sigmoid uses the rr2 lut64 p2 approximation with division; init params
    // must match that variant.
    xnn_params.f32.sigmoid = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsigmoid_ukernel__scalar_rr2_lut64_p2_div_x2,
      .init.f32_sigmoid = xnn_init_f32_sigmoid_scalar_rr2_lut64_p2_params,
      .element_tile = 2,
    };
    xnn_params.f32.sqr = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsqr_ukernel__scalar_x4,
      .element_tile = 4,
    };
    xnn_params.f32.sqrt = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1,
      .element_tile = 1,
    };
    // PReLU: 2-row x 4-channel tiles; scalar vs. WAsm kernel per engine flavor.
    if (is_wasm_x86) {
      xnn_params.f32.prelu = (struct prelu_parameters) {
        .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__scalar_2x4,
        .row_tile = 2,
        .channel_tile = 4,
      };
    } else {
      xnn_params.f32.prelu = (struct prelu_parameters) {
        .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__wasm_2x4,
        .row_tile = 2,
        .channel_tile = 4,
      };
    }
    // Softmax building blocks: fused exp(x - max) + running-sum kernel, and
    // the row-max reduction.
    xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
      .ukernel = (xnn_raddstoreexpminusmax_ukernel_function) xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_x4_acc2,
      .init.f32 = xnn_init_f32_expminus_scalar_rr2_p5_params,
      .element_tile = 4,
    };
    xnn_params.f32.rmax = (xnn_rmax_ukernel_function) xnn_f32_rmax_ukernel__scalar;
    // F32 elementwise binary kernels. ropc is the reversed operand-constant
    // form: it equals opc for commutative ops (add, mul, max, min, sqrdiff)
    // but is a distinct kernel for div/sub (vrdivc/vrsubc).
    xnn_params.f32.vadd = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__wasm_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasm_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasm_x8,
      .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
      .element_tile = 8,
    };
    xnn_params.f32.vdiv = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__wasm_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__wasm_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__wasm_x8,
      .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
      .element_tile = 8,
    };
    xnn_params.f32.vmax = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__wasm_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasm_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasm_x8,
      .element_tile = 8,
    };
    xnn_params.f32.vmin = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__wasm_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasm_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasm_x8,
      .element_tile = 8,
    };
    xnn_params.f32.vmul = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__wasm_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasm_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasm_x8,
      .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
      .element_tile = 8,
    };
    xnn_params.f32.vsub = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__wasm_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__wasm_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__wasm_x8,
      .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
      .element_tile = 8,
    };
    xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__scalar_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
      .element_tile = 8,
    };
    // Fused multiply-by-constant + add-constant (used by e.g. batch norm).
    xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
      .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c1__wasm_2x,
      .init.f32 = xnn_init_f32_minmax_scalar_params,
      .channel_tile = 1,
      .row_tile = 2,
    };
    #ifndef XNN_NO_NCHW_OPERATORS
      // NCHW (channels-first) fast path: sparse GEMM, direct CHW convolution
      // and depthwise-convolution kernels, plus CHW pooling/resize.
      init_flags |= XNN_INIT_FLAG_CHW_OPT;

      // Sparse matrix-dense matrix multiplication; spmm2/spmm4 are the
      // variants for output channels divisible by 2 and 4 (nr suffix).
      xnn_params.f32.spmm = (struct spmm_parameters) {
        .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x1__scalar,
        .mr = 8,
        .nr = 1,
      };
      xnn_params.f32.spmm2 = (struct spmm_parameters) {
        .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x2__scalar,
        .mr = 8,
        .nr = 2,
      };
      xnn_params.f32.spmm4 = (struct spmm_parameters) {
        .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x4__scalar,
        .mr = 8,
        .nr = 4,
      };
      // HWC->CHW 3x3 stride-2 input convolution (3 input channels, 4-channel
      // output tile), per the 3x3s2p1c3x4 kernel-name encoding.
      xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
        .ukernel_with_symm_padding =
          (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1,
        .output_channel_tile = 4,
        .output_height_tile = 1,
        .output_width_tile = 1,
      };
      // Direct CHW depthwise kernels for the common 3x3/5x5, stride 1/2 cases;
      // output tiles match the _HxW part of the kernel-name suffix.
      xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2,
        .output_width_tile = 1,
        .output_height_tile = 2,
      };
      xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_1x1_acc2,
        .output_width_tile = 1,
        .output_height_tile = 1,
      };
      xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc5,
        .output_width_tile = 1,
        .output_height_tile = 1,
      };
      xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc5,
        .output_width_tile = 1,
        .output_height_tile = 1,
      };
      xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
        .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__scalar_x1,
        .channel_tile = 1,
      };
      xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
        .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__scalar_p4,
        .channel_tile = 1,
        .pixel_tile = 4,
      };
    #endif  // XNN_NO_NCHW_OPERATORS
  #endif  // XNN_NO_F32_OPERATORS
6941
6942 /*************************** VCVT WAsm micro-kernels***************************/
6943 #ifndef XNN_NO_VCVT_OPERATORS
6944 init_flags |= XNN_INIT_FLAG_VCVT;
6945
6946 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
6947 .ukernel = (xnn_vunary_ukernel_function) xnn_f16_f32_vcvt_ukernel__scalar_x1,
6948 .init.f16_f32_cvt = xnn_init_f16_f32_cvt_scalar_params,
6949 .element_tile = 1,
6950 };
6951 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
6952 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_f16_vcvt_ukernel__scalar_bitcast_x4,
6953 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_scalar_bitcast_params,
6954 .element_tile = 4,
6955 };
6956 if (is_wasm_x86) {
6957 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
6958 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qu8_vcvt_ukernel__scalar_imagic_x1,
6959 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_imagic_params,
6960 .element_tile = 1,
6961 };
6962 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
6963 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qu8_vcvt_ukernel__scalar_imagic_x1,
6964 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_imagic_params,
6965 .element_tile = 1,
6966 };
6967 } else {
6968 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
6969 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qu8_vcvt_ukernel__wasm_fmagic_x4,
6970 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_fmagic_params,
6971 .element_tile = 4,
6972 };
6973 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
6974 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qu8_vcvt_ukernel__wasm_fmagic_x4,
6975 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_fmagic_params,
6976 .element_tile = 4,
6977 };
6978 }
6979 if (is_wasm_x86) {
6980 xnn_params.vcvt.qs8 = (struct vunary_parameters) {
6981 .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vcvt_ukernel__scalar_x1,
6982 .init.qs8_cvt = xnn_init_qs8_cvt_scalar_params,
6983 .element_tile = 1,
6984 };
6985 } else {
6986 xnn_params.vcvt.qs8 = (struct vunary_parameters) {
6987 .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vcvt_ukernel__scalar_x4,
6988 .init.qs8_cvt = xnn_init_qs8_cvt_scalar_params,
6989 .element_tile = 4,
6990 };
6991 }
6992 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
6993 .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_f32_vcvt_ukernel__scalar_x1,
6994 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_scalar_params,
6995 .element_tile = 1,
6996 };
6997 if (is_wasm_x86) {
6998 xnn_params.vcvt.qu8 = (struct vunary_parameters) {
6999 .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vcvt_ukernel__scalar_x1,
7000 .init.qu8_cvt = xnn_init_qu8_cvt_scalar_params,
7001 .element_tile = 1,
7002 };
7003 } else {
7004 xnn_params.vcvt.qu8 = (struct vunary_parameters) {
7005 .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vcvt_ukernel__scalar_x4,
7006 .init.qu8_cvt = xnn_init_qu8_cvt_scalar_params,
7007 .element_tile = 4,
7008 };
7009 }
7010 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
7011 .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_f32_vcvt_ukernel__scalar_x1,
7012 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params,
7013 .element_tile = 1,
7014 };
7015 #endif // XNN_NO_VCVT_OPERATORS
7016
7017 /**************************** X32 WAsm micro-kernels****************************/
7018 #ifndef XNN_NO_X32_OPERATORS
7019 init_flags |= XNN_INIT_FLAG_X32;
7020
7021 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__scalar;
7022 xnn_params.x32.zip = (struct zip_parameters) {
7023 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__scalar,
7024 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__scalar,
7025 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__scalar,
7026 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__scalar,
7027 };
7028
7029 xnn_params.x32.transpose = (struct transpose_parameters) {
7030 .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x32_transposec_ukernel__2x4_scalar_int,
7031 .tile_size = 32,
7032 };
7033 #endif // XNN_NO_X32_OPERATORS
7034
7035 /**************************** XX WAsm micro-kernels****************************/
7036 #ifndef XNN_NO_XX_OPERATORS
7037 init_flags |= XNN_INIT_FLAG_XX;
7038
7039 xnn_params.xx.copy = (xnn_vunary_ukernel_function) xnn_xx_copy_ukernel__memcpy;
7040 xnn_params.xx.fill = (struct fill_parameters) {
7041 .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__scalar_x16,
7042 .row_tile = 1,
7043 };
7044 xnn_params.xx.pad = (struct pad_parameters) {
7045 .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__scalar,
7046 .row_tile = 1,
7047 };
7048 xnn_params.xx.transpose = (struct transpose_parameters) {
7049 .variable_size_ukernel = xnn_xx_transposev_ukernel__1x1_memcpy,
7050 .tile_size = 32,
7051 };
7052 #endif
7053
7054#elif XNN_ARCH_RISCV
7055
7056 /************************** QC8 RISC-V micro-kernels **************************/
7057 #ifndef XNN_NO_QC8_OPERATORS
7058 init_flags |= XNN_INIT_FLAG_QC8;
7059
7060 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
7061 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
7062 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
7063 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
7064 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_lrintf_params;
7065 xnn_params.qc8.gemm.mr = 3;
7066 xnn_params.qc8.gemm.nr = 4;
7067
7068 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x3__scalar_lrintf;
7069 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_lrintf_params;
7070 xnn_params.qc8.dwconv[0].channel_tile = 2;
7071 xnn_params.qc8.dwconv[0].primary_tile = 3;
7072 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x9__scalar_lrintf;
7073 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_lrintf_params;
7074 xnn_params.qc8.dwconv[1].channel_tile = 2;
7075 xnn_params.qc8.dwconv[1].primary_tile = 9;
7076 xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x25__scalar_lrintf;
7077 xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_lrintf_params;
7078 xnn_params.qc8.dwconv[2].channel_tile = 2;
7079 xnn_params.qc8.dwconv[2].primary_tile = 25;
  #endif  // XNN_NO_QC8_OPERATORS
7081
7082 /************************** QS8 RISC-V micro-kernels **************************/
7083 #ifndef XNN_NO_QS8_OPERATORS
7084 init_flags |= XNN_INIT_FLAG_QS8;
7085
7086 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
7087 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
7088 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
7089 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
7090 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params;
7091 xnn_params.qs8.gemm.mr = 3;
7092 xnn_params.qs8.gemm.nr = 4;
7093
7094 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_lrintf;
7095 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params;
7096 xnn_params.qs8.dwconv[0].channel_tile = 2;
7097 xnn_params.qs8.dwconv[0].primary_tile = 9;
7098 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up2x25__scalar_lrintf;
7099 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params;
7100 xnn_params.qs8.dwconv[1].channel_tile = 2;
7101 xnn_params.qs8.dwconv[1].primary_tile = 25;
7102
7103 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
7104 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1,
7105 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1,
7106 .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params,
7107 .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_scalar_imagic_params,
7108 .row_tile = 7,
7109 .channel_tile = 1,
7110 };
7111
7112 xnn_params.qs8.vadd = (struct vbinary_parameters) {
7113 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__scalar_x4,
7114 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x4,
7115 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x4,
7116 .init.qs8_add = xnn_init_qs8_add_minmax_scalar_params,
7117 .element_tile = 4,
7118 };
7119 xnn_params.qs8.vmul = (struct vbinary_parameters) {
7120 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__scalar_x4,
7121 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
7122 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
7123 .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_scalar_params,
7124 .element_tile = 4,
7125 };
7126
7127 xnn_params.qs8.lrelu = (struct vunary_parameters) {
7128 .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vlrelu_ukernel__scalar_andxor_x4,
7129 .init.qs8_lrelu = xnn_init_qs8_lrelu_scalar_andxor_params,
7130 .element_tile = 4,
7131 };
7132 #endif // XNN_NO_QS8_OPERATORS
7133
7134 /************************** QU8 RISC-V micro-kernels **************************/
7135 #ifndef XNN_NO_QU8_OPERATORS
7136 init_flags |= XNN_INIT_FLAG_QU8;
7137
7138 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
7139 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
7140 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
7141 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
7142 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params;
7143 xnn_params.qu8.gemm.mr = 3;
7144 xnn_params.qu8.gemm.nr = 4;
7145
7146 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up2x9__scalar_lrintf;
7147 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params;
7148 xnn_params.qu8.dwconv[0].channel_tile = 2;
7149 xnn_params.qu8.dwconv[0].primary_tile = 9;
7150 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up2x25__scalar_lrintf;
7151 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params;
7152 xnn_params.qu8.dwconv[1].channel_tile = 2;
7153 xnn_params.qu8.dwconv[1].primary_tile = 25;
7154
7155 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
7156 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1,
7157 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1,
7158 .init.qu8 = xnn_init_qu8_avgpool_minmax_scalar_params,
7159 .primary_tile = 9,
7160 .incremental_tile = 8,
7161 .channel_tile = 1,
7162 };
7163 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
7164 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1,
7165 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1,
7166 .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params,
7167 .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_scalar_imagic_params,
7168 .row_tile = 7,
7169 .channel_tile = 1,
7170 };
7171
7172 xnn_params.qu8.vadd = (struct vbinary_parameters) {
7173 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__scalar_x4,
7174 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x4,
7175 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x4,
7176 .init.qu8_add = xnn_init_qu8_add_minmax_scalar_params,
7177 .element_tile = 4,
7178 };
7179 xnn_params.qu8.vmul = (struct vbinary_parameters) {
7180 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__scalar_x4,
7181 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
7182 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
7183 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_scalar_params,
7184 .element_tile = 4,
7185 };
7186
7187 xnn_params.qu8.lrelu = (struct vunary_parameters) {
7188 .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vlrelu_ukernel__scalar_andxor_x4,
7189 .init.qu8_lrelu = xnn_init_qu8_lrelu_scalar_andxor_params,
7190 .element_tile = 4,
7191 };
7192 #endif // XNN_NO_QU8_OPERATORS
7193
7194 /************************** S8 RISC-V micro-kernels ***************************/
7195 #ifndef XNN_NO_S8_OPERATORS
7196 init_flags |= XNN_INIT_FLAG_S8;
7197
7198 xnn_params.s8.clamp = (struct vunary_parameters) {
7199 .ukernel = (xnn_vunary_ukernel_function) xnn_s8_vclamp_ukernel__scalar_x4,
7200 .init.s8_minmax = xnn_init_s8_minmax_scalar_params,
7201 .element_tile = 4,
7202 };
7203 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
7204 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__scalar_c1,
7205 .pixel_tile = 1,
7206 .channel_tile = 1,
7207 };
7208 xnn_params.s8.maxpool = (struct maxpool_parameters) {
7209 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__scalar_c1,
7210 .init.s8 = xnn_init_s8_minmax_scalar_params,
7211 .mr = 9,
7212 .qr = 8,
7213 };
7214 #endif // XNN_NO_S8_OPERATORS
7215
7216 /************************** U8 RISC-V micro-kernels ***************************/
7217 #ifndef XNN_NO_U8_OPERATORS
7218 init_flags |= XNN_INIT_FLAG_U8;
7219
7220 xnn_params.u8.clamp = (struct vunary_parameters) {
7221 .ukernel = (xnn_vunary_ukernel_function) xnn_u8_vclamp_ukernel__scalar_x4,
7222 .init.u8_minmax = xnn_init_u8_minmax_scalar_params,
7223 .element_tile = 4,
7224 };
7225 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
7226 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__scalar_c1,
7227 .pixel_tile = 1,
7228 .channel_tile = 1,
7229 };
7230 xnn_params.u8.maxpool = (struct maxpool_parameters) {
7231 .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__scalar_c1,
7232 .init.u8 = xnn_init_u8_minmax_scalar_params,
7233 .mr = 9,
7234 .qr = 8,
7235 };
7236 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
7237 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
7238 #endif // XNN_NO_U8_OPERATORS
7239
7240 /************************** X8 RISC-V micro-kernels ***************************/
7241 #ifndef XNN_NO_X8_OPERATORS
7242 init_flags |= XNN_INIT_FLAG_X8;
7243
7244 xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
7245 xnn_params.x8.zip = (struct zip_parameters) {
7246 .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
7247 .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
7248 .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
7249 .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
7250 };
7251
7252 xnn_params.x8.transpose = (struct transpose_parameters) {
7253 .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x8_transposec_ukernel__2x4_scalar_int,
7254 .tile_size = 32,
7255 };
7256 #endif // XNN_NO_X8_OPERATORS
7257
7258 /************************** X16 RISC-V micro-kernels ***************************/
7259 #ifndef XNN_NO_X16_OPERATORS
7260 init_flags |= XNN_INIT_FLAG_X16;
7261
7262 xnn_params.x16.transpose = (struct transpose_parameters) {
7263 .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x16_transposec_ukernel__2x4_scalar_int,
7264 .tile_size = 32,
7265 };
7266 #endif // XNN_NO_X16_OPERATORS
7267
7268 /************************** F32 RISC-V micro-kernels **************************/
7269 #ifndef XNN_NO_F32_OPERATORS
7270 init_flags |= XNN_INIT_FLAG_F32;
7271
7272 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x4__scalar);
7273 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x4__scalar);
7274 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x4__scalar);
7275 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x4__scalar);
7276 xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x4__scalar);
7277 xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x4__scalar);
7278 xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x4__scalar);
7279 xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x4__scalar);
7280 xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x4__scalar);
7281 xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x4__scalar);
7282 xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__scalar);
7283 xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__scalar);
7284 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
7285 xnn_params.f32.gemm.mr = 4;
7286 xnn_params.f32.gemm.nr = 4;
7287
7288 xnn_params.f32.gemm2.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__scalar);
7289 xnn_params.f32.gemm2.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__scalar);
7290 xnn_params.f32.gemm2.linear.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2__scalar);
7291 xnn_params.f32.gemm2.linear.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__scalar);
7292 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
7293 xnn_params.f32.gemm2.mr = 4;
7294 xnn_params.f32.gemm2.nr = 2;
7295
7296 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x3__scalar_acc2;
7297 xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x3__scalar_acc2;
7298 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
7299 xnn_params.f32.dwconv[0].channel_tile = 1;
7300 xnn_params.f32.dwconv[0].primary_tile = 3;
7301
7302 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x4__scalar_acc2;
7303 xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x4__scalar_acc2;
7304 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
7305 xnn_params.f32.dwconv[1].channel_tile = 1;
7306 xnn_params.f32.dwconv[1].primary_tile = 4;
7307
7308 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x9__scalar_acc2;
7309 xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x9__scalar_acc2;
7310 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
7311 xnn_params.f32.dwconv[2].channel_tile = 1;
7312 xnn_params.f32.dwconv[2].primary_tile = 9;
7313
7314 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x25__scalar_acc2;
7315 xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x25__scalar_acc2;
7316 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
7317 xnn_params.f32.dwconv[3].channel_tile = 1;
7318 xnn_params.f32.dwconv[3].primary_tile = 25;
7319
7320 xnn_params.f32.avgpool = (struct avgpool_parameters) {
7321 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__scalar_c1,
7322 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__scalar_c1,
7323 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
7324 .primary_tile = 9,
7325 .incremental_tile = 8,
7326 .channel_tile = 1,
7327 };
7328 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
7329 .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__scalar_c1,
7330 .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__scalar_c1,
7331 .init.f32 = xnn_init_f32_minmax_scalar_params,
7332 .primary_tile = 9,
7333 .incremental_tile = 8,
7334 .channel_tile = 1,
7335 };
7336 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
7337 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__scalar_c1,
7338 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1,
7339 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
7340 .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
7341 .row_tile = 7,
7342 .channel_tile = 1,
7343 };
7344 xnn_params.f32.maxpool = (struct maxpool_parameters) {
7345 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__scalar_c1,
7346 .init.f32 = xnn_init_f32_minmax_scalar_params,
7347 .mr = 9,
7348 .qr = 8,
7349 };
7350 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
7351 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__scalar_c1,
7352 .mr = 4,
7353 };
7354 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
7355 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__scalar_c1,
7356 .mr = 9,
7357 };
7358 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
7359 .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1,
7360 .mr = 9,
7361 .qr = 8,
7362 };
7363 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
7364 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__scalar_c2,
7365 .pixel_tile = 1,
7366 .channel_tile = 2,
7367 };
7368 xnn_params.f32.abs = (struct vunary_parameters) {
7369 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vabs_ukernel__scalar_x4,
7370 .element_tile = 4,
7371 };
7372 xnn_params.f32.clamp = (struct vunary_parameters) {
7373 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vclamp_ukernel__scalar_x4,
7374 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
7375 .element_tile = 4,
7376 };
7377 xnn_params.f32.elu = (struct vunary_parameters) {
7378 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x4,
7379 .init.f32_elu = xnn_init_f32_elu_scalar_rr2_lut16_p3_params,
7380 .element_tile = 4,
7381 };
7382 xnn_params.f32.hswish = (struct vunary_parameters) {
7383 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vhswish_ukernel__scalar_x4,
7384 .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
7385 .element_tile = 4,
7386 };
7387 xnn_params.f32.lrelu = (struct vunary_parameters) {
7388 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vlrelu_ukernel__scalar_x4,
7389 .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
7390 .element_tile = 4,
7391 };
7392 xnn_params.f32.neg = (struct vunary_parameters) {
7393 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vneg_ukernel__scalar_x4,
7394 .element_tile = 4,
7395 };
7396 xnn_params.f32.rndne = (struct vunary_parameters) {
7397 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndne_ukernel__scalar_libm_x1,
7398 .element_tile = 1,
7399 };
7400 xnn_params.f32.rndz = (struct vunary_parameters) {
7401 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndz_ukernel__scalar_libm_x1,
7402 .element_tile = 1,
7403 };
7404 xnn_params.f32.rndu = (struct vunary_parameters) {
7405 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndu_ukernel__scalar_libm_x1,
7406 .element_tile = 1,
7407 };
7408 xnn_params.f32.rndd = (struct vunary_parameters) {
7409 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndd_ukernel__scalar_libm_x1,
7410 .element_tile = 1,
7411 };
7412 xnn_params.f32.sigmoid = (struct vunary_parameters) {
7413 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsigmoid_ukernel__scalar_rr2_lut64_p2_div_x2,
7414 .init.f32_sigmoid = xnn_init_f32_sigmoid_scalar_rr2_lut64_p2_params,
7415 .element_tile = 2,
7416 };
7417 xnn_params.f32.sqr = (struct vunary_parameters) {
7418 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsqr_ukernel__scalar_x4,
7419 .element_tile = 4,
7420 };
7421 xnn_params.f32.sqrt = (struct vunary_parameters) {
7422 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1,
7423 .element_tile = 1,
7424 };
7425 xnn_params.f32.prelu = (struct prelu_parameters) {
7426 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__scalar_2x4,
7427 .row_tile = 4,
7428 .channel_tile = 4,
7429 };
7430 xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
7431 .ukernel = (xnn_raddstoreexpminusmax_ukernel_function) xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_x4_acc2,
7432 .init.f32 = xnn_init_f32_expminus_scalar_rr2_p5_params,
7433 .element_tile = 4,
7434 };
7435 xnn_params.f32.rmax = (xnn_rmax_ukernel_function) xnn_f32_rmax_ukernel__scalar;
7436 xnn_params.f32.vadd = (struct vbinary_parameters) {
7437 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__scalar_x8,
7438 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x8,
7439 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x8,
7440 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
7441 .element_tile = 8,
7442 };
7443 xnn_params.f32.vdiv = (struct vbinary_parameters) {
7444 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__scalar_x2,
7445 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__scalar_x2,
7446 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__scalar_x2,
7447 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
7448 .element_tile = 2,
7449 };
7450 xnn_params.f32.vmax = (struct vbinary_parameters) {
7451 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__scalar_x8,
7452 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x8,
7453 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x8,
7454 .element_tile = 8,
7455 };
7456 xnn_params.f32.vmin = (struct vbinary_parameters) {
7457 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__scalar_x8,
7458 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x8,
7459 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x8,
7460 .element_tile = 8,
7461 };
7462 xnn_params.f32.vmul = (struct vbinary_parameters) {
7463 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__scalar_x8,
7464 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x8,
7465 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x8,
7466 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
7467 .element_tile = 8,
7468 };
7469 xnn_params.f32.vsub = (struct vbinary_parameters) {
7470 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__scalar_x8,
7471 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__scalar_x8,
7472 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__scalar_x8,
7473 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
7474 .element_tile = 8,
7475 };
7476 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
7477 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__scalar_x8,
7478 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
7479 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
7480 .element_tile = 8,
7481 };
7482 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
7483 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c1__scalar_2x,
7484 .init.f32 = xnn_init_f32_minmax_scalar_params,
7485 .channel_tile = 1,
7486 .row_tile = 2,
7487 };
7488 #ifndef XNN_NO_NCHW_OPERATORS
7489 init_flags |= XNN_INIT_FLAG_CHW_OPT;
7490
7491 xnn_params.f32.spmm = (struct spmm_parameters) {
7492 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x1__scalar,
7493 .mr = 8,
7494 .nr = 1,
7495 };
7496 xnn_params.f32.spmm2 = (struct spmm_parameters) {
7497 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x2__scalar,
7498 .mr = 8,
7499 .nr = 2,
7500 };
7501 xnn_params.f32.spmm4 = (struct spmm_parameters) {
7502 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x4__scalar,
7503 .mr = 8,
7504 .nr = 4,
7505 };
7506 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
7507 .ukernel_with_symm_padding =
7508 (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1,
7509 .output_channel_tile = 4,
7510 .output_height_tile = 1,
7511 .output_width_tile = 1,
7512 };
7513 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
7514 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2,
7515 .output_width_tile = 1,
7516 .output_height_tile = 2,
7517 };
7518 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
7519 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_1x1_acc2,
7520 .output_width_tile = 1,
7521 .output_height_tile = 1,
7522 };
7523 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
7524 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc5,
7525 .output_width_tile = 1,
7526 .output_height_tile = 1,
7527 };
7528 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
7529 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc5,
7530 .output_width_tile = 1,
7531 .output_height_tile = 1,
7532 };
7533 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
7534 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__scalar_x1,
7535 .channel_tile = 1,
7536 };
7537 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
7538 .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__scalar_p4,
7539 .channel_tile = 1,
7540 .pixel_tile = 4,
7541 };
7542 #endif // XNN_NO_NCHW_OPERATORS
7543 #endif // XNN_NO_F32_OPERATORS
7544
7545 /************************** VCVT RISC-V micro-kernels *************************/
7546 #ifndef XNN_NO_VCVT_OPERATORS
7547 init_flags |= XNN_INIT_FLAG_VCVT;
7548
7549 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
7550 .ukernel = (xnn_vunary_ukernel_function) xnn_f16_f32_vcvt_ukernel__scalar_x4,
7551 .init.f16_f32_cvt = xnn_init_f16_f32_cvt_scalar_params,
7552 .element_tile = 4,
7553 };
7554 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
7555 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_f16_vcvt_ukernel__scalar_fabsf_x2,
7556 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_scalar_fabsf_params,
7557 .element_tile = 2,
7558 };
7559 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
7560 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qs8_vcvt_ukernel__scalar_lrintf_x4,
7561 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_lrintf_params,
7562 .element_tile = 4,
7563 };
7564 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
7565 .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qu8_vcvt_ukernel__scalar_lrintf_x4,
7566 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_lrintf_params,
7567 .element_tile = 4,
7568 };
7569 xnn_params.vcvt.qs8 = (struct vunary_parameters) {
7570 .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vcvt_ukernel__scalar_x4,
7571 .init.qs8_cvt = xnn_init_qs8_cvt_scalar_params,
7572 .element_tile = 4,
7573 };
7574 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
7575 .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_f32_vcvt_ukernel__scalar_x4,
7576 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_scalar_params,
7577 .element_tile = 4,
7578 };
7579 xnn_params.vcvt.qu8 = (struct vunary_parameters) {
7580 .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vcvt_ukernel__scalar_x4,
7581 .init.qu8_cvt = xnn_init_qu8_cvt_scalar_params,
7582 .element_tile = 4,
7583 };
7584 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
7585 .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_f32_vcvt_ukernel__scalar_x4,
7586 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params,
7587 .element_tile = 4,
7588 };
7589 #endif // XNN_NO_VCVT_OPERATORS
7590
7591 /************************** X32 RISC-V micro-kernels **************************/
7592 #ifndef XNN_NO_X32_OPERATORS
7593 init_flags |= XNN_INIT_FLAG_X32;
7594
7595 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__scalar;
7596 xnn_params.x32.zip = (struct zip_parameters) {
7597 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__scalar,
7598 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__scalar,
7599 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__scalar,
7600 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__scalar,
7601 };
7602
7603 xnn_params.x32.transpose = (struct transpose_parameters) {
7604 .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x32_transposec_ukernel__2x4_scalar_int,
7605 .tile_size = 32,
7606 };
7607 #endif // XNN_NO_X32_OPERATORS
7608
7609 /************************** XX RISC-V micro-kernels ***************************/
7610 #ifndef XNN_NO_XX_OPERATORS
7611 init_flags |= XNN_INIT_FLAG_XX;
7612
7613 xnn_params.xx.copy = (xnn_vunary_ukernel_function) xnn_xx_copy_ukernel__memcpy;
7614 xnn_params.xx.fill = (struct fill_parameters) {
7615 .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__scalar_x16,
7616 .row_tile = 1,
7617 };
7618 xnn_params.xx.pad = (struct pad_parameters) {
7619 .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__scalar,
7620 .row_tile = 1,
7621 };
7622 xnn_params.xx.transpose = (struct transpose_parameters) {
7623 .variable_size_ukernel = xnn_xx_transposev_ukernel__1x1_memcpy,
7624 .tile_size = 32,
7625 };
7626 #endif // XNN_NO_XX_OPERATORS
7627
7628#else
7629 #error "Unsupported architecture"
7630#endif
7631
7632 // Get page size.
7633 #if XNN_PLATFORM_WINDOWS
7634 SYSTEM_INFO sysinfo;
7635 GetSystemInfo(&sysinfo);
7636 xnn_params.page_size = sysinfo.dwPageSize;
7637 #else
7638 const long res = sysconf(_SC_PAGESIZE);
7639 if (res == -1) {
7640 xnn_log_error("failed to get page size, error code: %d", errno);
7641 return;
7642 }
7643 xnn_params.page_size = res;
7644 #endif
7645
7646 memcpy(&xnn_params.allocator, init_allocator, sizeof(struct xnn_allocator));
7647 xnn_params.init_flags = init_flags;
7648}
7649
7650#if XNN_PLATFORM_WINDOWS
7651 static BOOL CALLBACK init_windows(PINIT_ONCE init_once, PVOID parameter, PVOID* context) {
7652 init();
7653 return TRUE;
7654 }
7655#endif
7656
// Initializes XNNPACK: detects CPU features (where applicable), records the
// caller-supplied allocator, and runs the one-time microkernel-table setup.
// Safe to call from multiple threads; initialization happens exactly once.
// Returns xnn_status_success on success, xnn_status_out_of_memory if CPU
// detection fails, or xnn_status_unsupported_hardware if init() did not
// complete for this architecture.
enum xnn_status xnn_initialize(const struct xnn_allocator* allocator) {
  // Bring up the cpuinfo CPU-detection library on platforms that use it;
  // its failure is reported as out-of-memory.
  #if !XNN_PLATFORM_WEB && !XNN_ARCH_RISCV
    if (!cpuinfo_initialize()) {
      return xnn_status_out_of_memory;
    }
  #endif  // !XNN_PLATFORM_WEB && !XNN_ARCH_RISCV
  if (allocator == NULL) {
    allocator = &xnn_default_allocator;
  }
  // Publish the allocator with a compare-and-swap so only the first caller's
  // allocator is recorded. This must happen before the once-guarded init()
  // below, which copies *init_allocator into xnn_params.
  #ifdef _MSC_VER
    _InterlockedCompareExchangePointer((PVOID volatile*) &init_allocator, (PVOID) allocator, NULL);
  #else
    __sync_bool_compare_and_swap(&init_allocator, NULL, allocator);
  #endif
  // Run init() at most once process-wide; concurrent callers wait until the
  // first invocation completes.
  #if XNN_PLATFORM_WINDOWS
    InitOnceExecuteOnce(&init_guard, &init_windows, NULL, NULL);
  #else
    pthread_once(&init_guard, &init);
  #endif
  // init() sets XNN_INIT_FLAG_XNNPACK only when initialization succeeded for
  // a supported architecture; otherwise report unsupported hardware.
  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) != 0) {
    return xnn_status_success;
  } else {
    return xnn_status_unsupported_hardware;
  }
}
7682
// Releases the cpuinfo resources acquired by xnn_initialize() on platforms
// that use CPU detection. Note that the once-guarded init() work is not
// undone here — xnn_params remains populated. Always returns
// xnn_status_success.
enum xnn_status xnn_deinitialize(void) {
  #if !XNN_PLATFORM_WEB && !XNN_ARCH_RISCV
    cpuinfo_deinitialize();
  #endif  // !XNN_PLATFORM_WEB && !XNN_ARCH_RISCV
  return xnn_status_success;
}
7689