1// Copyright (c) Facebook, Inc. and its affiliates.
2// All rights reserved.
3//
4// Copyright 2019 Google LLC
5//
6// This source code is licensed under the BSD-style license found in the
7// LICENSE file in the root directory of this source tree.
8
9#include <assert.h>
10#include <stdbool.h>
11#include <stddef.h>
12#include <stdint.h>
13#include <string.h>
14
15#ifdef _WIN32
16 #include <windows.h>
17#else
18 #include <errno.h>
19 #include <pthread.h>
20 #include <sys/mman.h>
21 #include <unistd.h>
22#endif
23
24#ifdef _MSC_VER
25 #include <intrin.h>
26#endif
27
28#ifndef __EMSCRIPTEN__
29 #include <cpuinfo.h>
30#endif
31
32#include <xnnpack.h>
33#include <xnnpack/allocator.h>
34#include <xnnpack/argmaxpool.h>
35#include <xnnpack/avgpool.h>
36#include <xnnpack/common.h>
37#include <xnnpack/config.h>
38#include <xnnpack/conv.h>
39#include <xnnpack/dwconv.h>
40#include <xnnpack/gavgpool.h>
41#include <xnnpack/gemm.h>
42#include <xnnpack/fill.h>
43#include <xnnpack/ibilinear.h>
44#include <xnnpack/igemm.h>
45#include <xnnpack/log.h>
46#include <xnnpack/lut.h>
47#include <xnnpack/maxpool.h>
48#include <xnnpack/pad.h>
49#include <xnnpack/params.h>
50#include <xnnpack/microparams-init.h>
51#include <xnnpack/pavgpool.h>
52#include <xnnpack/prelu.h>
53#include <xnnpack/raddstoreexpminusmax.h>
54#include <xnnpack/rmax.h>
55#include <xnnpack/spmm.h>
56#include <xnnpack/unpool.h>
57#include <xnnpack/vadd.h>
58#include <xnnpack/vbinary.h>
59#include <xnnpack/vcvt.h>
60#include <xnnpack/vlrelu.h>
61#include <xnnpack/vmul.h>
62#include <xnnpack/vmulcaddc.h>
63#include <xnnpack/vunary.h>
64#include <xnnpack/zip.h>
65
66
// One-shot initialization guard protecting init(): the micro-kernel tables
// must be populated exactly once, so each platform's native call-once
// primitive is used (InitOnceExecuteOnce on Windows, pthread_once elsewhere).
#if XNN_PLATFORM_WINDOWS
  static INIT_ONCE init_guard = INIT_ONCE_STATIC_INIT;
#else
  static pthread_once_t init_guard = PTHREAD_ONCE_INIT;
#endif
72
// Maps a micro-kernel's MR (rows computed per iteration) to its 0-based slot
// in the per-MR kernel tables. The argument is parenthesized so that an
// expression argument (e.g. a ternary) expands correctly; all current call
// sites pass literal constants, so behavior there is unchanged.
#define XNN_MR_TO_INDEX(MR) ((MR)-1)
74
// Build-configuration sanity checks: these feature macros are expected to be
// defined (to 0 or 1) by the build system. A missing definition means the
// build is misconfigured, not that the feature is disabled, so fail loudly.
#ifndef XNN_ENABLE_ASSEMBLY
  #error "XNN_ENABLE_ASSEMBLY is not defined"
#endif

#ifndef XNN_ENABLE_GEMM_M_SPECIALIZATION
  #error "XNN_ENABLE_GEMM_M_SPECIALIZATION is not defined"
#endif
82
// Allocator handed to the library before one-time initialization runs.
// NOTE(review): presumably written by the public init entry point before the
// call-once and consumed inside init() — confirm against the rest of the
// file. Volatile-qualified pointer; cross-thread visibility would rely on the
// call-once barrier rather than volatile itself.
static const struct xnn_allocator* volatile init_allocator = NULL;
84
85static void init(void) {
86 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
87 assert(hardware_config != NULL);
88
89 uint32_t init_flags = XNN_INIT_FLAG_XNNPACK;
90
91#if XNN_ARCH_ARM
92 if (hardware_config->use_arm_neon) {
93 /**************************** QC8 AArch32 micro-kernels ****************************/
94 #ifndef XNN_NO_QC8_OPERATORS
95 init_flags |= XNN_INIT_FLAG_QC8;
96
97 #if XNN_ENABLE_ASSEMBLY
98 if (XNN_ENABLE_ARM_DOTPROD && hardware_config->use_arm_neon_dot) {
99 #if XNN_ENABLE_ARM_DOTPROD
100 switch (cpuinfo_get_uarch(0)->uarch) {
101 case cpuinfo_uarch_cortex_a55:
102 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__asm_aarch32_neondot_cortex_a55);
103 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_4x8c4__asm_aarch32_neondot_cortex_a55);
104 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neondot);
105 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neondot);
106 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
107 xnn_params.qc8.gemm.mr = 4;
108 xnn_params.qc8.gemm.nr = 8;
109 xnn_params.qc8.gemm.log2_kr = 2;
110 break;
111 default:
112 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__asm_aarch32_neondot_ld64);
113 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_4x8c4__asm_aarch32_neondot_ld64);
114 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neondot);
115 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neondot);
116 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
117 xnn_params.qc8.gemm.mr = 4;
118 xnn_params.qc8.gemm.nr = 8;
119 xnn_params.qc8.gemm.log2_kr = 2;
120 break;
121 }
122 #endif // XNN_ENABLE_ARM_DOTPROD
123 } else {
124 switch (cpuinfo_get_uarch(0)->uarch) {
125 case cpuinfo_uarch_cortex_a5:
126 case cpuinfo_uarch_cortex_a7:
127 case cpuinfo_uarch_krait:
128 case cpuinfo_uarch_kryo:
129 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__asm_aarch32_neon_mlal_lane_prfm_cortex_a7);
130 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__asm_aarch32_neon_mlal_lane_prfm_cortex_a7);
131 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__asm_aarch32_neon_mlal_lane_prfm_cortex_a7);
132 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__asm_aarch32_neon_mlal_lane_prfm_cortex_a7);
133 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neon_params;
134 xnn_params.qc8.gemm.mr = 4;
135 xnn_params.qc8.gemm.nr = 8;
136 break;
137 case cpuinfo_uarch_cortex_a32:
138 case cpuinfo_uarch_cortex_a35:
139 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__asm_aarch32_neonv8_mlal_lane_cortex_a35);
140 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__asm_aarch32_neonv8_mlal_lane_cortex_a35);
141 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__asm_aarch32_neonv8_mlal_lane_cortex_a35);
142 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__asm_aarch32_neonv8_mlal_lane_cortex_a35);
143 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
144 xnn_params.qc8.gemm.mr = 4;
145 xnn_params.qc8.gemm.nr = 8;
146 break;
147 case cpuinfo_uarch_cortex_a53:
148 case cpuinfo_uarch_cortex_a57:
149 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__asm_aarch32_neonv8_mlal_lane_prfm_cortex_a53);
150 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__asm_aarch32_neonv8_mlal_lane_prfm_cortex_a53);
151 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__asm_aarch32_neonv8_mlal_lane_prfm_cortex_a35);
152 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__asm_aarch32_neonv8_mlal_lane_prfm_cortex_a35);
153 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
154 xnn_params.qc8.gemm.mr = 4;
155 xnn_params.qc8.gemm.nr = 8;
156 break;
157 case cpuinfo_uarch_cortex_a55r0:
158 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__asm_aarch32_neonv8_mlal_lane_cortex_a53);
159 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__asm_aarch32_neonv8_mlal_lane_cortex_a53);
160 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__asm_aarch32_neonv8_mlal_lane_cortex_a35);
161 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__asm_aarch32_neonv8_mlal_lane_cortex_a35);
162 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
163 xnn_params.qc8.gemm.mr = 4;
164 xnn_params.qc8.gemm.nr = 8;
165 break;
166 case cpuinfo_uarch_cortex_a72:
167 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
168 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
169 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
170 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
171 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
172 xnn_params.qc8.gemm.mr = 2;
173 xnn_params.qc8.gemm.nr = 8;
174 xnn_params.qc8.gemm.log2_kr = 1;
175 xnn_params.qc8.gemm.log2_sr = 2;
176 break;
177 case cpuinfo_uarch_exynos_m1:
178 case cpuinfo_uarch_exynos_m2:
179 case cpuinfo_uarch_exynos_m3:
180 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__asm_aarch32_neonv8_mlal_lane_prfm_ld64);
181 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__asm_aarch32_neonv8_mlal_lane_prfm_ld64);
182 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__asm_aarch32_neonv8_mlal_lane_prfm_cortex_a35);
183 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__asm_aarch32_neonv8_mlal_lane_prfm_cortex_a35);
184 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
185 xnn_params.qc8.gemm.mr = 4;
186 xnn_params.qc8.gemm.nr = 8;
187 break;
188
189 default:
190 if (hardware_config->use_arm_neon_v8) {
191 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__asm_aarch32_neonv8_mlal_lane_ld64);
192 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__asm_aarch32_neonv8_mlal_lane_ld64);
193 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__asm_aarch32_neonv8_mlal_lane_cortex_a35);
194 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__asm_aarch32_neonv8_mlal_lane_cortex_a35);
195 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
196 xnn_params.qc8.gemm.mr = 4;
197 xnn_params.qc8.gemm.nr = 8;
198 } else {
199 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__asm_aarch32_neon_mlal_lane_ld64);
200 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__asm_aarch32_neon_mlal_lane_ld64);
201 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7);
202 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7);
203 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neon_params;
204 xnn_params.qc8.gemm.mr = 4;
205 xnn_params.qc8.gemm.nr = 8;
206 }
207 break;
208 }
209 }
210 #if XNN_MAX_UARCH_TYPES > 1
211 {
212 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
213 const uint32_t mr = xnn_params.qc8.gemm.mr;
214 const uint32_t nr = xnn_params.qc8.gemm.nr;
215 const uint32_t log2_kr = xnn_params.qc8.gemm.log2_kr;
216 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
217 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
218 if (uarch_info == NULL) {
219 /* No more microarchitectures in the system */
220 break;
221 }
222
223 switch (uarch_info->uarch) {
224 case cpuinfo_uarch_cortex_a55:
225 #if XNN_ENABLE_ARM_DOTPROD
226 if (mr == 4 && nr == 8 && log2_kr == 2 && hardware_config->use_arm_neon_dot) {
227 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__asm_aarch32_neondot_cortex_a55;
228 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_4x8c4__asm_aarch32_neondot_cortex_a55;
229 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neondot;
230 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neondot;
231 }
232 #endif // XNN_ENABLE_ARM_DOTPROD
233 break;
234 case cpuinfo_uarch_cortex_a53:
235 if (mr == 4 && nr == 8 && log2_kr == 0) {
236 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__asm_aarch32_neonv8_mlal_lane_prfm_cortex_a53;
237 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__asm_aarch32_neonv8_mlal_lane_prfm_cortex_a53;
238 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__asm_aarch32_neonv8_mlal_lane_prfm_cortex_a35;
239 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__asm_aarch32_neonv8_mlal_lane_prfm_cortex_a35;
240 }
241 break;
242 case cpuinfo_uarch_cortex_a55r0:
243 if (mr == 4 && nr == 8 && log2_kr == 0) {
244 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__asm_aarch32_neonv8_mlal_lane_cortex_a53;
245 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__asm_aarch32_neonv8_mlal_lane_cortex_a53;
246 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__asm_aarch32_neonv8_mlal_lane_cortex_a35;
247 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__asm_aarch32_neonv8_mlal_lane_cortex_a35;
248 }
249 break;
250
251 default:
252 break;
253 }
254 }
255 }
256 #endif // XNN_MAX_UARCH_TYPES > 1
257 #else // XNN_ENABLE_ASSEMBLY
258 if (XNN_ENABLE_ARM_DOTPROD && hardware_config->use_arm_neon_dot) {
259 #if XNN_ENABLE_ARM_DOTPROD
260 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__neondot);
261 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_4x8c4__neondot);
262 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neondot);
263 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neondot);
264 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
265 xnn_params.qc8.gemm.mr = 4;
266 xnn_params.qc8.gemm.nr = 8;
267 xnn_params.qc8.gemm.log2_kr = 2;
268 #endif // XNN_ENABLE_ARM_DOTPROD
269 } else if (hardware_config->use_arm_neon_v8) {
270 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
271 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
272 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
273 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
274 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
275 xnn_params.qc8.gemm.mr = 2;
276 xnn_params.qc8.gemm.nr = 8;
277 xnn_params.qc8.gemm.log2_kr = 1;
278 xnn_params.qc8.gemm.log2_sr = 2;
279 } else {
280 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal);
281 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal);
282 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal);
283 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal);
284 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neon_params;
285 xnn_params.qc8.gemm.mr = 2;
286 xnn_params.qc8.gemm.nr = 8;
287 xnn_params.qc8.gemm.log2_kr = 1;
288 xnn_params.qc8.gemm.log2_sr = 2;
289 }
290 #endif // XNN_ENABLE_ASSEMBLY
291
292 if (hardware_config->use_arm_neon_v8) {
293 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_3p16c__asm_aarch32_neonv8_mla8_cortex_a35;
294 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
295 xnn_params.qc8.dwconv[0].channel_tile = 16;
296 xnn_params.qc8.dwconv[0].primary_tile = 3;
297 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_9p16c__neonv8_mla8_ld64;
298 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
299 xnn_params.qc8.dwconv[1].channel_tile = 16;
300 xnn_params.qc8.dwconv[1].primary_tile = 9;
301 xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_25p8c__neonv8_mla8_ld64;
302 xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
303 xnn_params.qc8.dwconv[2].channel_tile = 8;
304 xnn_params.qc8.dwconv[2].primary_tile = 25;
305 } else {
306 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_3p16c__neon_mla8_ld128;
307 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_neon_params;
308 xnn_params.qc8.dwconv[0].channel_tile = 16;
309 xnn_params.qc8.dwconv[0].primary_tile = 3;
310 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_9p16c__neon_mla8_ld64;
311 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_neon_params;
312 xnn_params.qc8.dwconv[1].channel_tile = 16;
313 xnn_params.qc8.dwconv[1].primary_tile = 9;
314 xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_25p8c__neon_mla8_ld64;
315 xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_neon_params;
316 xnn_params.qc8.dwconv[2].channel_tile = 8;
317 xnn_params.qc8.dwconv[2].primary_tile = 25;
318 }
319 #endif // XNN_NO_QC8_OPERATORS
320
321 /**************************** QS8 AArch32 micro-kernels ****************************/
322 #ifndef XNN_NO_QS8_OPERATORS
323 init_flags |= XNN_INIT_FLAG_QS8;
324
325 #if XNN_ENABLE_ASSEMBLY
326 if (XNN_ENABLE_ARM_DOTPROD && hardware_config->use_arm_neon_dot) {
327 #if XNN_ENABLE_ARM_DOTPROD
328 switch (cpuinfo_get_uarch(0)->uarch) {
329 case cpuinfo_uarch_cortex_a55:
330 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__asm_aarch32_neondot_cortex_a55);
331 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__asm_aarch32_neondot_cortex_a55);
332 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
333 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
334 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
335 xnn_params.qs8.gemm.mr = 4;
336 xnn_params.qs8.gemm.nr = 8;
337 xnn_params.qs8.gemm.log2_kr = 2;
338 break;
339 default:
340 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__asm_aarch32_neondot_ld64);
341 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__asm_aarch32_neondot_ld64);
342 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
343 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
344 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
345 xnn_params.qs8.gemm.mr = 4;
346 xnn_params.qs8.gemm.nr = 8;
347 xnn_params.qs8.gemm.log2_kr = 2;
348 break;
349 }
350 #endif // XNN_ENABLE_ARM_DOTPROD
351 } else {
352 switch (cpuinfo_get_uarch(0)->uarch) {
353 case cpuinfo_uarch_cortex_a5:
354 case cpuinfo_uarch_cortex_a7:
355 case cpuinfo_uarch_krait:
356 case cpuinfo_uarch_kryo:
357 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_prfm_cortex_a7);
358 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_prfm_cortex_a7);
359 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_prfm_cortex_a7);
360 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_prfm_cortex_a7);
361 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
362 xnn_params.qs8.gemm.mr = 4;
363 xnn_params.qs8.gemm.nr = 8;
364 break;
365 case cpuinfo_uarch_cortex_a32:
366 case cpuinfo_uarch_cortex_a35:
367 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a7);
368 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a7);
369 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7);
370 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7);
371 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
372 xnn_params.qs8.gemm.mr = 4;
373 xnn_params.qs8.gemm.nr = 8;
374 break;
375 case cpuinfo_uarch_cortex_a53:
376 case cpuinfo_uarch_cortex_a57:
377 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_prfm_cortex_a53);
378 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_prfm_cortex_a53);
379 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_prfm_cortex_a7);
380 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_prfm_cortex_a7);
381 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
382 xnn_params.qs8.gemm.mr = 4;
383 xnn_params.qs8.gemm.nr = 8;
384 break;
385 case cpuinfo_uarch_cortex_a55r0:
386 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a53);
387 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a53);
388 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7);
389 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7);
390 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
391 xnn_params.qs8.gemm.mr = 4;
392 xnn_params.qs8.gemm.nr = 8;
393 break;
394 case cpuinfo_uarch_cortex_a72:
395 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
396 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
397 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
398 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
399 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
400 xnn_params.qs8.gemm.mr = 2;
401 xnn_params.qs8.gemm.nr = 8;
402 xnn_params.qs8.gemm.log2_kr = 1;
403 xnn_params.qs8.gemm.log2_sr = 2;
404 break;
405 case cpuinfo_uarch_exynos_m1:
406 case cpuinfo_uarch_exynos_m2:
407 case cpuinfo_uarch_exynos_m3:
408 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_prfm_ld64);
409 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_prfm_ld64);
410 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7);
411 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7);
412 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
413 xnn_params.qs8.gemm.mr = 4;
414 xnn_params.qs8.gemm.nr = 8;
415 break;
416 default:
417 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_ld64);
418 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_ld64);
419 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7);
420 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7);
421 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
422 xnn_params.qs8.gemm.mr = 4;
423 xnn_params.qs8.gemm.nr = 8;
424 break;
425 }
426 }
427 #if XNN_MAX_UARCH_TYPES > 1
428 {
429 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
430 const uint32_t mr = xnn_params.qs8.gemm.mr;
431 const uint32_t nr = xnn_params.qs8.gemm.nr;
432 const uint32_t log2_kr = xnn_params.qs8.gemm.log2_kr;
433 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
434 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
435 if (uarch_info == NULL) {
436 /* No more microarchitectures in the system */
437 break;
438 }
439
440 switch (uarch_info->uarch) {
441 case cpuinfo_uarch_cortex_a55:
442 #if XNN_ENABLE_ARM_DOTPROD
443 if (mr == 4 && nr == 8 && log2_kr == 2 && hardware_config->use_arm_neon_dot) {
444 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__asm_aarch32_neondot_cortex_a55;
445 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__asm_aarch32_neondot_cortex_a55;
446 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot;
447 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot;
448 }
449 #endif // XNN_ENABLE_ARM_DOTPROD
450 break;
451 case cpuinfo_uarch_cortex_a53:
452 if (mr == 4 && nr == 8 && log2_kr == 0) {
453 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_prfm_cortex_a53;
454 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_prfm_cortex_a53;
455 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_prfm_cortex_a7;
456 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_prfm_cortex_a7;
457 }
458 break;
459 case cpuinfo_uarch_cortex_a55r0:
460 if (mr == 4 && nr == 8 && log2_kr == 0) {
461 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a53;
462 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a53;
463 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7;
464 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7;
465 }
466 break;
467 default:
468 break;
469 }
470 }
471 }
472 #endif // XNN_MAX_UARCH_TYPES > 1
473 #else // XNN_ENABLE_ASSEMBLY
474 if (XNN_ENABLE_ARM_DOTPROD && hardware_config->use_arm_neon_dot) {
475 #if XNN_ENABLE_ARM_DOTPROD
476 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neondot);
477 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neondot);
478 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
479 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
480 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
481 xnn_params.qs8.gemm.mr = 4;
482 xnn_params.qs8.gemm.nr = 8;
483 xnn_params.qs8.gemm.log2_kr = 2;
484 #endif // XNN_ENABLE_ARM_DOTPROD
485 } else {
486 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
487 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
488 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
489 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
490 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
491 xnn_params.qs8.gemm.mr = 2;
492 xnn_params.qs8.gemm.nr = 8;
493 xnn_params.qs8.gemm.log2_kr = 1;
494 xnn_params.qs8.gemm.log2_sr = 2;
495 }
496 #endif // XNN_ENABLE_ASSEMBLY
497
498 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mla8_ld64;
499 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
500 xnn_params.qs8.dwconv[0].channel_tile = 16;
501 xnn_params.qs8.dwconv[0].primary_tile = 9;
502 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_dwconv_minmax_rndnu_ukernel_25p8c__neon_mla8_ld64;
503 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
504 xnn_params.qs8.dwconv[1].channel_tile = 8;
505 xnn_params.qs8.dwconv[1].primary_tile = 25;
506
507 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
508 .unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8,
509 .multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8,
510 .init.qs8 = xnn_init_qs8_avgpool_minmax_rndnu_neon_params,
511 .update.qs8 = xnn_update_qs8_avgpool_minmax_rndnu_neon_params,
512 .row_tile = 7,
513 .channel_tile = 8,
514 };
515
516
517 xnn_params.qs8.lrelu = (struct vunary_parameters) {
518 .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vlrelu_ukernel__neon_x32,
519 .init.qs8_lrelu = xnn_init_qs8_lrelu_neon_params,
520 .element_tile = 32,
521 };
522 #endif // XNN_NO_QS8_OPERATORS
523
524 /*************************** QU8 AArch32 micro-kernels ***************************/
525 #ifndef XNN_NO_QU8_OPERATORS
526 init_flags |= XNN_INIT_FLAG_QU8;
527
528 #if XNN_ENABLE_ASSEMBLY
529 if (XNN_ENABLE_ARM_DOTPROD && hardware_config->use_arm_neon_dot) {
530 #if XNN_ENABLE_ARM_DOTPROD
531 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot);
532 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__neondot);
533 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
534 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
535 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
536 xnn_params.qu8.gemm.mr = 4;
537 xnn_params.qu8.gemm.nr = 8;
538 xnn_params.qu8.gemm.log2_kr = 2;
539 #endif // XNN_ENABLE_ARM_DOTPROD
540 } else {
541 switch (cpuinfo_get_uarch(0)->uarch) {
542 case cpuinfo_uarch_cortex_a5:
543 case cpuinfo_uarch_cortex_a7:
544 case cpuinfo_uarch_krait:
545 case cpuinfo_uarch_kryo:
546 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_prfm_cortex_a7);
547 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_prfm_cortex_a7);
548 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_prfm_cortex_a7);
549 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_prfm_cortex_a7);
550 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
551 xnn_params.qu8.gemm.mr = 4;
552 xnn_params.qu8.gemm.nr = 8;
553 break;
554 case cpuinfo_uarch_cortex_a32:
555 case cpuinfo_uarch_cortex_a35:
556 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a7);
557 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a7);
558 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7);
559 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7);
560 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
561 xnn_params.qu8.gemm.mr = 4;
562 xnn_params.qu8.gemm.nr = 8;
563 break;
564 case cpuinfo_uarch_cortex_a53:
565 case cpuinfo_uarch_cortex_a57:
566 case cpuinfo_uarch_cortex_a72:
567 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_prfm_cortex_a53);
568 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_prfm_cortex_a53);
569 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_prfm_cortex_a7);
570 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_prfm_cortex_a7);
571 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
572 xnn_params.qu8.gemm.mr = 4;
573 xnn_params.qu8.gemm.nr = 8;
574 break;
575 case cpuinfo_uarch_cortex_a55r0:
576 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a53);
577 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a53);
578 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7);
579 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7);
580 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
581 xnn_params.qu8.gemm.mr = 4;
582 xnn_params.qu8.gemm.nr = 8;
583 break;
584 case cpuinfo_uarch_exynos_m1:
585 case cpuinfo_uarch_exynos_m2:
586 case cpuinfo_uarch_exynos_m3:
587 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_prfm_ld64);
588 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_prfm_ld64);
589 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_prfm_cortex_a7);
590 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_prfm_cortex_a7);
591 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
592 xnn_params.qu8.gemm.mr = 4;
593 xnn_params.qu8.gemm.nr = 8;
594 break;
595 default:
596 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_ld64);
597 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_ld64);
598 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7);
599 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7);
600 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
601 xnn_params.qu8.gemm.mr = 4;
602 xnn_params.qu8.gemm.nr = 8;
603 break;
604 }
605 }
606 #if XNN_MAX_UARCH_TYPES > 1
607 {
608 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
609 const uint32_t mr = xnn_params.qu8.gemm.mr;
610 const uint32_t nr = xnn_params.qu8.gemm.nr;
611 const uint32_t log2_kr = xnn_params.qu8.gemm.log2_kr;
612 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
613 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
614 if (uarch_info == NULL) {
615 /* No more microarchitectures in the system */
616 break;
617 }
618
619 switch (uarch_info->uarch) {
620 case cpuinfo_uarch_cortex_a53:
621 if (mr == 4 && nr == 8 && log2_kr == 0) {
622 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_prfm_cortex_a53;
623 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_prfm_cortex_a53;
624 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_prfm_cortex_a7;
625 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_prfm_cortex_a7;
626 }
627 break;
628 case cpuinfo_uarch_cortex_a55r0:
629 if (mr == 4 && nr == 8 && log2_kr == 0) {
630 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a53;
631 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a53;
632 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7;
633 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7;
634 }
635 break;
636 default:
637 break;
638 }
639 }
640 }
641 #endif // XNN_MAX_UARCH_TYPES > 1
642 #else // XNN_ENABLE_ASSEMBLY
643 if (XNN_ENABLE_ARM_DOTPROD && hardware_config->use_arm_neon_dot) {
644 #if XNN_ENABLE_ARM_DOTPROD
645 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot);
646 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__neondot);
647 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
648 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
649 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
650 xnn_params.qu8.gemm.mr = 4;
651 xnn_params.qu8.gemm.nr = 8;
652 xnn_params.qu8.gemm.log2_kr = 2;
653 #endif // XNN_ENABLE_ARM_DOTPROD
654 } else {
655 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane);
656 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane);
657 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
658 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
659 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
660 xnn_params.qu8.gemm.mr = 3;
661 xnn_params.qu8.gemm.nr = 8;
662 }
663 #endif // XNN_ENABLE_ASSEMBLY
664
665 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qu8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mul8;
666 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
667 xnn_params.qu8.dwconv[0].channel_tile = 16;
668 xnn_params.qu8.dwconv[0].primary_tile = 9;
669 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qu8_dwconv_minmax_rndnu_ukernel_25p8c__neon_mul8;
670 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
671 xnn_params.qu8.dwconv[1].channel_tile = 8;
672 xnn_params.qu8.dwconv[1].primary_tile = 25;
673
674 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
675 .unipass = (xnn_avgpool_unipass_ukernel_fn) xnn_qu8_avgpool_minmax_fp32_ukernel_9x__neon_c8,
676 .multipass = (xnn_avgpool_multipass_ukernel_fn) xnn_qu8_avgpool_minmax_fp32_ukernel_9p8x__neon_c8,
677 .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_neon_params,
678 .primary_tile = 9,
679 .incremental_tile = 8,
680 .channel_tile = 8,
681 };
682 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
683 .unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8,
684 .multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8,
685 .init.qu8 = xnn_init_qu8_avgpool_minmax_rndnu_neon_params,
686 .update.qu8 = xnn_update_qu8_avgpool_minmax_rndnu_neon_params,
687 .row_tile = 7,
688 .channel_tile = 8,
689 };
690
691 xnn_params.qu8.lrelu = (struct vunary_parameters) {
692 .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vlrelu_ukernel__neon_x32,
693 .init.qu8_lrelu = xnn_init_qu8_lrelu_neon_params,
694 .element_tile = 32,
695 };
696 #endif // XNN_NO_QU8_OPERATORS
697
698 /**************************** S8 AArch32 micro-kernels ****************************/
699 #ifndef XNN_NO_S8_OPERATORS
700 init_flags |= XNN_INIT_FLAG_S8;
701
702 xnn_params.s8.clamp = (struct vunary_parameters) {
703 .ukernel = (xnn_vunary_ukernel_fn) xnn_s8_vclamp_ukernel__neon_x64,
704 .init.s8_minmax = xnn_init_s8_minmax_neon_params,
705 .element_tile = 64,
706 };
707 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
708 .ukernel = (xnn_ibilinear_ukernel_fn) xnn_s8_ibilinear_ukernel__neon_c8,
709 .pixel_tile = 1,
710 .channel_tile = 8,
711 };
712 xnn_params.s8.maxpool = (struct maxpool_parameters) {
713 .ukernel = (xnn_maxpool_ukernel_fn) xnn_s8_maxpool_minmax_ukernel_9p8x__neon_c16,
714 .init.s8 = xnn_init_s8_minmax_neon_params,
715 .mr = 9,
716 .qr = 8,
717 };
718 #endif // XNN_NO_S8_OPERATORS
719
720 /**************************** U8 AArch32 micro-kernels ****************************/
721 #ifndef XNN_NO_U8_OPERATORS
722 init_flags |= XNN_INIT_FLAG_U8;
723
724 xnn_params.u8.clamp = (struct vunary_parameters) {
725 .ukernel = (xnn_vunary_ukernel_fn) xnn_u8_vclamp_ukernel__neon_x64,
726 .init.u8_minmax = xnn_init_u8_minmax_neon_params,
727 .element_tile = 64,
728 };
729 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
730 .ukernel = (xnn_ibilinear_ukernel_fn) xnn_u8_ibilinear_ukernel__neon_c8,
731 .pixel_tile = 1,
732 .channel_tile = 8,
733 };
734 xnn_params.u8.maxpool = (struct maxpool_parameters) {
735 .ukernel = (xnn_maxpool_ukernel_fn) xnn_u8_maxpool_minmax_ukernel_9p8x__neon_c16,
736 .init.u8 = xnn_init_u8_minmax_neon_params,
737 .mr = 9,
738 .qr = 8,
739 };
740 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__neon;
741 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
742 #endif // XNN_NO_U8_OPERATORS
743
744 /**************************** X8 AArch32 micro-kernels ****************************/
745 #ifndef XNN_NO_X8_OPERATORS
746 init_flags |= XNN_INIT_FLAG_X8;
747
748 xnn_params.x8.zip = (struct zip_parameters) {
749 .x2 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x2_ukernel__neon,
750 .x3 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x3_ukernel__neon,
751 .x4 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x4_ukernel__neon,
752 .xm = (xnn_zipv_ukernel_fn) xnn_x8_zip_xm_ukernel__neon,
753 };
754 #endif // XNN_NO_X8_OPERATORS
755
756 /**************************** F16 AArch32 micro-kernels ****************************/
757 #ifndef XNN_NO_F16_OPERATORS
758 #if XNN_ENABLE_ARM_FP16_VECTOR && XNN_ENABLE_ARM_FP16_SCALAR
759 if (hardware_config->use_arm_neon_fp16_arith) {
760 init_flags |= XNN_INIT_FLAG_F16 | XNN_INIT_FLAG_F16_NATIVE;
761
762 xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64);
763 xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64);
764 xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64);
765 xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64);
766 xnn_params.f16.gemm.init.f16 = xnn_init_f16_minmax_fp16arith_params;
767 xnn_params.f16.gemm.mr = 6;
768 xnn_params.f16.gemm.nr = 8;
769
770 xnn_params.f16.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f16_dwconv_minmax_ukernel_3p16c__neonfp16arith;
771 xnn_params.f16.dwconv[0].init.f16 = xnn_init_f16_minmax_fp16arith_params;
772 xnn_params.f16.dwconv[0].channel_tile = 16;
773 xnn_params.f16.dwconv[0].primary_tile = 3;
774
775 xnn_params.f16.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f16_dwconv_minmax_ukernel_4p16c__neonfp16arith;
776 xnn_params.f16.dwconv[1].init.f16 = xnn_init_f16_minmax_fp16arith_params;
777 xnn_params.f16.dwconv[1].channel_tile = 16;
778 xnn_params.f16.dwconv[1].primary_tile = 4;
779
780 xnn_params.f16.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f16_dwconv_minmax_ukernel_9p8c__neonfp16arith;
781 xnn_params.f16.dwconv[2].init.f16 = xnn_init_f16_minmax_fp16arith_params;
782 xnn_params.f16.dwconv[2].channel_tile = 8;
783 xnn_params.f16.dwconv[2].primary_tile = 9;
784
785 xnn_params.f16.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f16_dwconv_minmax_ukernel_25p8c__neonfp16arith_acc2;
786 xnn_params.f16.dwconv[3].init.f16 = xnn_init_f16_minmax_fp16arith_params;
787 xnn_params.f16.dwconv[3].channel_tile = 8;
788 xnn_params.f16.dwconv[3].primary_tile = 25;
789
790 xnn_params.f16.avgpool = (struct avgpool_parameters) {
791 .unipass = (xnn_avgpool_unipass_ukernel_fn) xnn_f16_avgpool_minmax_ukernel_9x__neonfp16arith_c8,
792 .multipass = (xnn_avgpool_multipass_ukernel_fn) xnn_f16_avgpool_minmax_ukernel_9p8x__neonfp16arith_c8,
793 .init.f16 = xnn_init_f16_scaleminmax_fp16arith_params,
794 .primary_tile = 9,
795 .incremental_tile = 8,
796 .channel_tile = 8,
797 };
798 xnn_params.f16.pavgpool = (struct pavgpool_parameters) {
799 .unipass = (xnn_pavgpool_unipass_ukernel_fn) xnn_f16_pavgpool_minmax_ukernel_9x__neonfp16arith_c8,
800 .multipass = (xnn_pavgpool_multipass_ukernel_fn) xnn_f16_pavgpool_minmax_ukernel_9p8x__neonfp16arith_c8,
801 .init.f16 = xnn_init_f16_minmax_fp16arith_params,
802 .primary_tile = 9,
803 .incremental_tile = 8,
804 .channel_tile = 8,
805 };
806 xnn_params.f16.gavgpool = (struct gavgpool_parameters) {
807 .unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8,
808 .multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8,
809 .init.f16 = xnn_init_f16_scaleminmax_fp16arith_params,
810 .update.f16 = xnn_update_f16_scaleminmax_fp16arith_params,
811 .row_tile = 7,
812 .channel_tile = 8,
813 };
814
815 xnn_params.f16.maxpool = (struct maxpool_parameters) {
816 .ukernel = (xnn_maxpool_ukernel_fn) xnn_f16_maxpool_minmax_ukernel_9p8x__neonfp16arith_c8,
817 .init.f16 = xnn_init_f16_minmax_fp16arith_params,
818 .mr = 9,
819 .qr = 8,
820 };
821 xnn_params.f16.ibilinear = (struct ibilinear_parameters) {
822 .ukernel = (xnn_ibilinear_ukernel_fn) xnn_f16_ibilinear_ukernel__neonfp16arith_c8,
823 .pixel_tile = 1,
824 .channel_tile = 8,
825 };
826
827 xnn_params.f16.prelu = (struct prelu_parameters) {
828 .ukernel = (xnn_prelu_ukernel_fn) xnn_f16_prelu_ukernel__neonfp16arith_2x16,
829 .row_tile = 2,
830 .channel_tile = 16,
831 };
832
833 xnn_params.f16.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
834 .ukernel = (xnn_raddstoreexpminusmax_ukernel_fn) xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x32,
835 .init.f16 = xnn_init_f16_expminus_fp16arith_rr2_p2_params,
836 .element_tile = 32,
837 };
838 xnn_params.f16.rmax = (xnn_rmax_ukernel_fn) xnn_f16_rmax_ukernel__neonfp16arith;
839
840 xnn_params.f16.vmulcaddc = (struct vmulcaddc_parameters) {
841 .ukernel = (xnn_vmulcaddc_ukernel_fn) xnn_f16_vmulcaddc_minmax_ukernel_c8__neonfp16arith_2x,
842 .init.f16 = xnn_init_f16_minmax_fp16arith_params,
843 .channel_tile = 8,
844 .row_tile = 2,
845 };
846
847 xnn_params.f16.abs = (struct vunary_parameters) {
848 .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vabs_ukernel__neonfp16arith_x16,
849 .element_tile = 16,
850 };
851 xnn_params.f16.clamp = (struct vunary_parameters) {
852 .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vclamp_ukernel__neonfp16arith_x16,
853 .init.f16_minmax = xnn_init_f16_minmax_fp16arith_params,
854 .element_tile = 16,
855 };
856 xnn_params.f16.elu = (struct vunary_parameters) {
857 .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_velu_ukernel__neonfp16arith_rr1_p3_x16,
858 .init.f16_elu = xnn_init_f16_elu_fp16arith_rr1_p3_params,
859 .element_tile = 16,
860 };
861 xnn_params.f16.hswish = (struct vunary_parameters) {
862 .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vhswish_ukernel__neonfp16arith_x16,
863 .init.f16_hswish = xnn_init_f16_hswish_fp16arith_params,
864 .element_tile = 16,
865 };
866 xnn_params.f16.lrelu = (struct vunary_parameters) {
867 .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vlrelu_ukernel__neonfp16arith_x16,
868 .init.f16_lrelu = xnn_init_f16_lrelu_fp16arith_params,
869 .element_tile = 16,
870 };
871 xnn_params.f16.neg = (struct vunary_parameters) {
872 .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vneg_ukernel__neonfp16arith_x16,
873 .element_tile = 16,
874 };
875 xnn_params.f16.rndne = (struct vunary_parameters) {
876 .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vrndne_ukernel__neonfp16arith_x16,
877 .element_tile = 16,
878 };
879 xnn_params.f16.rndz = (struct vunary_parameters) {
880 .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vrndz_ukernel__neonfp16arith_x16,
881 .element_tile = 16,
882 };
883 xnn_params.f16.rndu = (struct vunary_parameters) {
884 .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vrndu_ukernel__neonfp16arith_x16,
885 .element_tile = 16,
886 };
887 xnn_params.f16.rndd = (struct vunary_parameters) {
888 .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vrndd_ukernel__neonfp16arith_x16,
889 .element_tile = 16,
890 };
891 xnn_params.f16.sigmoid = (struct vunary_parameters) {
892 .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_x16,
893 .init.f16_sigmoid = xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
894 .element_tile = 16,
895 };
896 xnn_params.f16.sqr = (struct vunary_parameters) {
897 .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vsqr_ukernel__neonfp16arith_x16,
898 .element_tile = 16,
899 };
900 xnn_params.f16.sqrt = (struct vunary_parameters) {
901 .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vsqrt_ukernel__neonfp16arith_nr1fma1adj_x8,
902 .element_tile = 8,
903 };
904
905 #ifndef XNN_NO_NCHW_OPERATORS
906 init_flags |= XNN_INIT_FLAG_CHW_OPT;
907
908 xnn_params.f16.spmm = (struct spmm_parameters) {
909 .ukernel = (xnn_spmm_ukernel_fn) xnn_f16_spmm_minmax_ukernel_32x1__neonfp16arith_pipelined,
910 .init.f16 = xnn_init_f16_minmax_fp16arith_params,
911 .mr = 32,
912 .nr = 1,
913 };
914 xnn_params.f16.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
915 .ukernel_with_symm_padding =
916 (xnn_conv_hwc2chw_ukernel_fn) xnn_f16_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfp16arith_2x2,
917 .init.f16 = xnn_init_f16_minmax_fp16arith_params,
918 .output_channel_tile = 4,
919 .output_height_tile = 2,
920 .output_width_tile = 2,
921 };
922 xnn_params.f16.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
923 .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f16_dwconv2d_chw_ukernel_3x3p1__neonfp16arith_2x8,
924 .init.f16 = xnn_init_f16_chw_neonfp16arith_stride1_params,
925 .update.f16 = xnn_update_f16_chw_neonfp16arith_stride1_params,
926 .output_height_tile = 2,
927 .output_width_tile = 8,
928 };
929 xnn_params.f16.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
930 .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x8,
931 .init.f16 = xnn_init_f16_chw_neonfp16arith_stride2_params,
932 .update.f16 = xnn_update_f16_chw_neonfp16arith_stride2_params,
933 .output_height_tile = 1,
934 .output_width_tile = 8,
935 };
936 xnn_params.f16.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
937 .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x8,
938 .init.f16 = xnn_init_f16_chw_neonfp16arith_stride1_params,
939 .update.f16 = xnn_update_f16_chw_neonfp16arith_stride1_params,
940 .output_height_tile = 1,
941 .output_width_tile = 8,
942 };
943 xnn_params.f16.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
944 .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_1x8,
945 .init.f16 = xnn_init_f16_chw_neonfp16arith_stride2_params,
946 .update.f16 = xnn_update_f16_chw_neonfp16arith_stride2_params,
947 .output_height_tile = 1,
948 .output_width_tile = 8,
949 };
950 xnn_params.f16.gavgpool_cw = (struct gavgpool_cw_parameters) {
951 .ukernel = (xnn_gavgpool_cw_ukernel_fn) xnn_f16_gavgpool_cw_ukernel__neonfp16arith_x8,
952 .init.f16 = xnn_init_f16_gavgpool_neonfp16arith_params,
953 .update.f16 = xnn_update_f16_gavgpool_neonfp16arith_params,
954 .channel_tile = 8,
955 };
956 xnn_params.f16.ibilinear_chw = (struct ibilinear_chw_parameters) {
957 .ukernel = (xnn_ibilinear_chw_ukernel_fn) xnn_f16_ibilinear_chw_ukernel__neonfp16arith_p8,
958 .channel_tile = 1,
959 .pixel_tile = 8,
960 };
961 #endif // XNN_NO_NCHW_OPERATORS
962 }
963 #endif // XNN_ENABLE_ARM_FP16_VECTOR && XNN_ENABLE_ARM_FP16_SCALAR
964 #endif // XNN_NO_F16_OPERATORS
965
966 /**************************** F32 AArch32 micro-kernels ****************************/
967 #ifndef XNN_NO_F32_OPERATORS
968 init_flags |= XNN_INIT_FLAG_F32;
969
970 #if XNN_ENABLE_ASSEMBLY
971 switch (cpuinfo_get_uarch(0)->uarch) {
972 case cpuinfo_uarch_cortex_a5:
973 case cpuinfo_uarch_cortex_a7:
974 case cpuinfo_uarch_krait:
975 case cpuinfo_uarch_kryo:
976 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a7);
977 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a7);
978 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
979 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
980 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
981 xnn_params.f32.gemm.mr = 4;
982 xnn_params.f32.gemm.nr = 8;
983 break;
984 case cpuinfo_uarch_cortex_a53:
985 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_prfm_cortex_a53);
986 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch32_neon_prfm_cortex_a53);
987 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
988 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
989 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
990 xnn_params.f32.gemm.mr = 4;
991 xnn_params.f32.gemm.nr = 8;
992 break;
993 case cpuinfo_uarch_cortex_a55r0:
994 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a53);
995 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a53);
996 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
997 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
998 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
999 xnn_params.f32.gemm.mr = 4;
1000 xnn_params.f32.gemm.nr = 8;
1001 break;
1002 case cpuinfo_uarch_cortex_a32:
1003 case cpuinfo_uarch_cortex_a35:
1004 case cpuinfo_uarch_cortex_a55:
1005 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a55);
1006 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a55);
1007 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
1008 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
1009 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
1010 xnn_params.f32.gemm.mr = 4;
1011 xnn_params.f32.gemm.nr = 8;
1012 break;
1013
1014 case cpuinfo_uarch_cortex_a57:
1015 case cpuinfo_uarch_cortex_a72:
1016 case cpuinfo_uarch_cortex_a73:
1017 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_prfm_cortex_a75);
1018 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch32_neon_prfm_cortex_a75);
1019 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
1020 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
1021 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
1022 xnn_params.f32.gemm.mr = 4;
1023 xnn_params.f32.gemm.nr = 8;
1024 break;
1025
1026 default:
1027 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a75);
1028 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a75);
1029 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
1030 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
1031 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
1032 xnn_params.f32.gemm.mr = 4;
1033 xnn_params.f32.gemm.nr = 8;
1034 #if XNN_ENABLE_JIT
1035 xnn_params.f32.gemm.generator.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_codegen(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
1036 xnn_params.f32.gemm.generator.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_codegen(xnn_generate_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
1037 #endif
1038 break;
1039 }
1040 #if XNN_MAX_UARCH_TYPES > 1
1041 {
1042 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
1043 const uint32_t mr = xnn_params.f32.gemm.mr;
1044 const uint32_t nr = xnn_params.f32.gemm.nr;
1045 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
1046 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
1047 if (uarch_info == NULL) {
1048 /* No more microarchitectures in the system */
1049 break;
1050 }
1051
1052 switch (uarch_info->uarch) {
1053 case cpuinfo_uarch_cortex_a53:
1054 if (mr == 4 && nr == 8) {
1055 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_prfm_cortex_a53;
1056 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch32_neon_prfm_cortex_a53;
1057 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64;
1058 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64;
1059 }
1060 break;
1061 case cpuinfo_uarch_cortex_a55r0:
1062 if (mr == 4 && nr == 8) {
1063 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a53;
1064 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a53;
1065 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64;
1066 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64;
1067 }
1068 break;
1069 case cpuinfo_uarch_cortex_a55:
1070 if (mr == 4 && nr == 8) {
1071 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a55;
1072 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a55;
1073 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64;
1074 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64;
1075 }
1076 break;
1077 default:
1078 break;
1079 }
1080 }
1081 }
1082 #endif // XNN_MAX_UARCH_TYPES > 1
1083 #else // XNN_ENABLE_ASSEMBLY
1084 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128);
1085 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld128);
1086 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
1087 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
1088 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
1089 xnn_params.f32.gemm.mr = 4;
1090 xnn_params.f32.gemm.nr = 8;
1091 #endif // XNN_ENABLE_ASSEMBLY
1092 xnn_params.f32.gemm2.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_4x2__neon_lane_ld64);
1093 xnn_params.f32.gemm2.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_4x2__neon_lane_ld64);
1094 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
1095 xnn_params.f32.gemm2.mr = 4;
1096 xnn_params.f32.gemm2.nr = 2;
1097
1098 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_3p8c__neon;
1099 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
1100 xnn_params.f32.dwconv[0].channel_tile = 8,
1101 xnn_params.f32.dwconv[0].primary_tile = 3,
1102
1103 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_4p8c__neon;
1104 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
1105 xnn_params.f32.dwconv[1].channel_tile = 8,
1106 xnn_params.f32.dwconv[1].primary_tile = 4,
1107
1108 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_9p8c__neon;
1109 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
1110 xnn_params.f32.dwconv[2].channel_tile = 8;
1111 xnn_params.f32.dwconv[2].primary_tile = 9;
1112
1113 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_25p8c__neon_acc2;
1114 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
1115 xnn_params.f32.dwconv[3].channel_tile = 8;
1116 xnn_params.f32.dwconv[3].primary_tile = 25;
1117
1118 xnn_params.f32.avgpool = (struct avgpool_parameters) {
1119 .unipass = (xnn_avgpool_unipass_ukernel_fn) xnn_f32_avgpool_minmax_ukernel_9x__neon_c4,
1120 .multipass = (xnn_avgpool_multipass_ukernel_fn) xnn_f32_avgpool_minmax_ukernel_9p8x__neon_c4,
1121 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
1122 .primary_tile = 9,
1123 .incremental_tile = 8,
1124 .channel_tile = 4,
1125 };
1126 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
1127 .unipass = (xnn_pavgpool_unipass_ukernel_fn) xnn_f32_pavgpool_minmax_ukernel_9x__neon_c4,
1128 .multipass = (xnn_pavgpool_multipass_ukernel_fn) xnn_f32_pavgpool_minmax_ukernel_9p8x__neon_c4,
1129 .init.f32 = xnn_init_f32_minmax_scalar_params,
1130 .primary_tile = 9,
1131 .incremental_tile = 8,
1132 .channel_tile = 4,
1133 };
1134 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
1135 .unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4,
1136 .multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4,
1137 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
1138 .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
1139 .row_tile = 7,
1140 .channel_tile = 4,
1141 };
1142 xnn_params.f32.maxpool = (struct maxpool_parameters) {
1143 .ukernel = (xnn_maxpool_ukernel_fn) xnn_f32_maxpool_minmax_ukernel_9p8x__neon_c4,
1144 .init.f32 = xnn_init_f32_minmax_scalar_params,
1145 .mr = 9,
1146 .qr = 8,
1147 };
1148 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
1149 .up = (xnn_argmaxpool_unipass_ukernel_fn) xnn_f32_argmaxpool_ukernel_4x__neon_c4,
1150 .mr = 4,
1151 };
1152 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
1153 .up = (xnn_argmaxpool_unipass_ukernel_fn) xnn_f32_argmaxpool_ukernel_9x__neon_c4,
1154 .mr = 9,
1155 };
1156 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
1157 .mp = (xnn_argmaxpool_multipass_ukernel_fn) xnn_f32_argmaxpool_ukernel_9p8x__neon_c4,
1158 .mr = 9,
1159 .qr = 8,
1160 };
1161 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
1162 .ukernel = (xnn_ibilinear_ukernel_fn) xnn_f32_ibilinear_ukernel__neon_c8,
1163 .pixel_tile = 1,
1164 .channel_tile = 8,
1165 };
1166 xnn_params.f32.abs = (struct vunary_parameters) {
1167 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vabs_ukernel__neon_x8,
1168 .element_tile = 8,
1169 };
1170 xnn_params.f32.clamp = (struct vunary_parameters) {
1171 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vclamp_ukernel__neon_x8,
1172 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
1173 .element_tile = 8,
1174 };
1175 if (hardware_config->use_arm_neon_fma) {
1176 xnn_params.f32.elu = (struct vunary_parameters) {
1177 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_velu_ukernel__neonfma_rr1_p6_x8,
1178 .init.f32_elu = xnn_init_f32_elu_neonfma_rr1_p6_params,
1179 .element_tile = 8,
1180 };
1181 } else {
1182 xnn_params.f32.elu = (struct vunary_parameters) {
1183 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_velu_ukernel__neon_rr2_lut16_p3_x8,
1184 .init.f32_elu = xnn_init_f32_elu_neon_rr2_lut16_p3_params,
1185 .element_tile = 8,
1186 };
1187 }
1188 xnn_params.f32.hswish = (struct vunary_parameters) {
1189 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vhswish_ukernel__neon_x16,
1190 .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
1191 .element_tile = 16,
1192 };
1193 xnn_params.f32.lrelu = (struct vunary_parameters) {
1194 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vlrelu_ukernel__neon_x8,
1195 .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
1196 .element_tile = 8,
1197 };
1198 xnn_params.f32.neg = (struct vunary_parameters) {
1199 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vneg_ukernel__neon_x8,
1200 .element_tile = 8,
1201 };
1202 if (hardware_config->use_arm_neon_v8) {
1203 xnn_params.f32.rndne = (struct vunary_parameters) {
1204 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndne_ukernel__neonv8_x8,
1205 .element_tile = 8,
1206 };
1207 xnn_params.f32.rndz = (struct vunary_parameters) {
1208 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndz_ukernel__neonv8_x8,
1209 .element_tile = 8,
1210 };
1211 xnn_params.f32.rndu = (struct vunary_parameters) {
1212 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndu_ukernel__neonv8_x8,
1213 .element_tile = 8,
1214 };
1215 xnn_params.f32.rndd = (struct vunary_parameters) {
1216 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndd_ukernel__neonv8_x8,
1217 .element_tile = 8,
1218 };
1219 } else {
1220 xnn_params.f32.rndne = (struct vunary_parameters) {
1221 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndne_ukernel__neon_x8,
1222 .element_tile = 8,
1223 };
1224 xnn_params.f32.rndz = (struct vunary_parameters) {
1225 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndz_ukernel__neon_x8,
1226 .element_tile = 8,
1227 };
1228 xnn_params.f32.rndu = (struct vunary_parameters) {
1229 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndu_ukernel__neon_x8,
1230 .element_tile = 8,
1231 };
1232 xnn_params.f32.rndd = (struct vunary_parameters) {
1233 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndd_ukernel__neon_x8,
1234 .element_tile = 8,
1235 };
1236 }
1237 xnn_params.f32.sigmoid = (struct vunary_parameters) {
1238 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_x8,
1239 .init.f32_sigmoid = xnn_init_f32_sigmoid_neon_rr2_lut64_p2_params,
1240 .element_tile = 8,
1241 };
1242 xnn_params.f32.sqr = (struct vunary_parameters) {
1243 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsqr_ukernel__neon_x8,
1244 .element_tile = 8,
1245 };
1246 xnn_params.f32.sqrt = (struct vunary_parameters) {
1247 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1,
1248 .element_tile = 1,
1249 };
1250 xnn_params.f32.prelu = (struct prelu_parameters) {
1251 .ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__neon_2x8,
1252 .row_tile = 2,
1253 .channel_tile = 8,
1254 };
1255 xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
1256 .ukernel = (xnn_raddstoreexpminusmax_ukernel_fn) xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_x8,
1257 .init.f32 = xnn_init_f32_expminus_neon_rr2_lut64_p2_params,
1258 .element_tile = 8,
1259 };
1260 xnn_params.f32.rmax = (xnn_rmax_ukernel_fn) xnn_f32_rmax_ukernel__neon;
1261 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
1262 .ukernel = (xnn_vmulcaddc_ukernel_fn) xnn_f32_vmulcaddc_minmax_ukernel_c4__neon_2x,
1263 .init.f32 = xnn_init_f32_minmax_scalar_params,
1264 .channel_tile = 4,
1265 .row_tile = 2,
1266 };
1267 #ifndef XNN_NO_NCHW_OPERATORS
1268 init_flags |= XNN_INIT_FLAG_CHW_OPT;
1269
1270 xnn_params.f32.spmm = (struct spmm_parameters) {
1271 .ukernel = (xnn_spmm_ukernel_fn) xnn_f32_spmm_minmax_ukernel_32x1__neon,
1272 .init.f32 = xnn_init_f32_minmax_scalar_params,
1273 .mr = 32,
1274 .nr = 1,
1275 };
1276 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
1277 .ukernel_with_symm_padding =
1278 (xnn_conv_hwc2chw_ukernel_fn) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2,
1279 .init.f32 = xnn_init_f32_minmax_scalar_params,
1280 .output_channel_tile = 4,
1281 .output_height_tile = 2,
1282 .output_width_tile = 2,
1283 };
1284 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
1285 .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_2x4,
1286 .init.f32 = xnn_init_f32_chw_neon_stride1_params,
1287 .update.f32 = xnn_update_f32_chw_neon_stride1_params,
1288 .output_height_tile = 2,
1289 .output_width_tile = 4,
1290 };
1291 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
1292 .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_1x4,
1293 .init.f32 = xnn_init_f32_chw_neon_stride2_params,
1294 .update.f32 = xnn_update_f32_chw_neon_stride2_params,
1295 .output_height_tile = 1,
1296 .output_width_tile = 4,
1297 };
1298 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
1299 .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4,
1300 .init.f32 = xnn_init_f32_chw_neon_stride1_params,
1301 .update.f32 = xnn_update_f32_chw_neon_stride1_params,
1302 .output_height_tile = 1,
1303 .output_width_tile = 4,
1304 };
1305 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
1306 .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4,
1307 .init.f32 = xnn_init_f32_chw_neon_stride2_params,
1308 .update.f32 = xnn_update_f32_chw_neon_stride2_params,
1309 .output_height_tile = 1,
1310 .output_width_tile = 4,
1311 };
1312 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
1313 .ukernel = (xnn_gavgpool_cw_ukernel_fn) xnn_f32_gavgpool_cw_ukernel__neon_x4,
1314 .channel_tile = 4,
1315 };
1316 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
1317 .ukernel = (xnn_ibilinear_chw_ukernel_fn) xnn_f32_ibilinear_chw_ukernel__neon_p8,
1318 .channel_tile = 1,
1319 .pixel_tile = 8,
1320 };
1321 #endif // XNN_NO_NCHW_OPERATORS
1322 #endif // XNN_NO_F32_OPERATORS
1323
1324 /*************************** VCVT AArch32 micro-kernels ***************************/
1325 #ifndef XNN_NO_VCVT_OPERATORS
1326 init_flags |= XNN_INIT_FLAG_VCVT;
1327
1328 if (hardware_config->use_arm_neon_fp16) {
1329 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
1330 .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__neonfp16_x16,
1331 .element_tile = 16,
1332 };
1333 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
1334 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_f16_vcvt_ukernel__neonfp16_x16,
1335 .element_tile = 16,
1336 };
1337 } else {
1338 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
1339 .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__neon_int16_x16,
1340 .init.f16_f32_cvt = xnn_init_f16_f32_cvt_neon_params,
1341 .element_tile = 16,
1342 };
1343 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
1344 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_f16_vcvt_ukernel__neon_x8,
1345 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_neon_params,
1346 .element_tile = 8,
1347 };
1348 }
1349 if (hardware_config->use_arm_neon_v8) {
1350 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
1351 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qs8_vcvt_ukernel__neonv8_x32,
1352 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_neonv8_params,
1353 .element_tile = 32,
1354 };
1355 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
1356 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qu8_vcvt_ukernel__neonv8_x32,
1357 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_neonv8_params,
1358 .element_tile = 32,
1359 };
1360 } else {
1361 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
1362 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qs8_vcvt_ukernel__neon_x32,
1363 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_neon_params,
1364 .element_tile = 32,
1365 };
1366 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
1367 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qu8_vcvt_ukernel__neon_x32,
1368 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_neon_params,
1369 .element_tile = 32,
1370 };
1371 }
1372 xnn_params.vcvt.qs8 = (struct vunary_parameters) {
1373 .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vcvt_ukernel__neon_x32,
1374 .init.qs8_cvt = xnn_init_qs8_cvt_neon_params,
1375 .element_tile = 32,
1376 };
1377 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
1378 .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_f32_vcvt_ukernel__neon_x32,
1379 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_neon_params,
1380 .element_tile = 32,
1381 };
1382 xnn_params.vcvt.qu8 = (struct vunary_parameters) {
1383 .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vcvt_ukernel__neon_x32,
1384 .init.qu8_cvt = xnn_init_qu8_cvt_neon_params,
1385 .element_tile = 32,
1386 };
1387 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
1388 .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_f32_vcvt_ukernel__neon_x32,
1389 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_neon_params,
1390 .element_tile = 32,
1391 };
1392 #endif // XNN_NO_VCVT_OPERATORS
1393
1394 /**************************** X32 AArch32 micro-kernels ****************************/
1395 #ifndef XNN_NO_X32_OPERATORS
1396 init_flags |= XNN_INIT_FLAG_X32;
1397
1398 xnn_params.x32.unpool = (xnn_unpool_ukernel_fn) xnn_x32_unpool_ukernel__neon;
1399 xnn_params.x32.zip = (struct zip_parameters) {
1400 .x2 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x2_ukernel__neon,
1401 .x3 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x3_ukernel__neon,
1402 .x4 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x4_ukernel__neon,
1403 .xm = (xnn_zipv_ukernel_fn) xnn_x32_zip_xm_ukernel__neon,
1404 };
1405 #endif // XNN_NO_X32_OPERATORS
1406
1407 /**************************** XX AArch32 micro-kernels ****************************/
1408 #ifndef XNN_NO_XX_OPERATORS
1409 init_flags |= XNN_INIT_FLAG_XX;
1410
1411 xnn_params.xx.copy = (xnn_vunary_ukernel_fn) xnn_xx_copy_ukernel__scalar_memcpy;
1412 xnn_params.xx.fill = (struct fill_parameters) {
1413 .ukernel = (xnn_fill_ukernel_fn) xnn_xx_fill_ukernel__neon_x64,
1414 .row_tile = 1,
1415 };
1416 xnn_params.xx.pad = (struct pad_parameters) {
1417 .ukernel = (xnn_pad_ukernel_fn) xnn_xx_pad_ukernel__neon,
1418 .row_tile = 1,
1419 };
1420 #endif // XNN_NO_XX_OPERATORS
1421
1422 } else if (!XNN_PLATFORM_MOBILE) {
1423
1424 /*************************** QC8 AArch32 Pre-NEON micro-kernels ***************************/
1425 #ifndef XNN_NO_QC8_OPERATORS
1426 init_flags |= XNN_INIT_FLAG_QC8;
1427
1428 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32);
1429 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_2x2c4__armsimd32);
1430 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32);
1431 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x2c4__armsimd32);
1432 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_armsimd32_params;
1433 xnn_params.qc8.gemm.mr = 2;
1434 xnn_params.qc8.gemm.nr = 2;
1435 xnn_params.qc8.gemm.log2_kr = 2;
1436
1437 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_3p1c__scalar_fmagic;
1438 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_fmagic_params;
1439 xnn_params.qc8.dwconv[0].channel_tile = 1;
1440 xnn_params.qc8.dwconv[0].primary_tile = 3;
1441 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_9p1c__scalar_fmagic;
1442 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_fmagic_params;
1443 xnn_params.qc8.dwconv[1].channel_tile = 1;
1444 xnn_params.qc8.dwconv[1].primary_tile = 9;
1445 xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_25p1c__scalar_fmagic;
1446 xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_fmagic_params;
1447 xnn_params.qc8.dwconv[2].channel_tile = 1;
1448 xnn_params.qc8.dwconv[2].primary_tile = 25;
1449      #endif  // XNN_NO_QC8_OPERATORS
1450
1451 /*************************** QS8 AArch32 Pre-NEON micro-kernels ***************************/
1452 #ifndef XNN_NO_QS8_OPERATORS
1453 init_flags |= XNN_INIT_FLAG_QS8;
1454
1455 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32);
1456 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_fp32_ukernel_2x2c4__armsimd32);
1457 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32);
1458 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_fp32_ukernel_1x2c4__armsimd32);
1459 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_armsimd32_params;
1460 xnn_params.qs8.gemm.mr = 2;
1461 xnn_params.qs8.gemm.nr = 2;
1462 xnn_params.qs8.gemm.log2_kr = 2;
1463
1464 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_dwconv_minmax_fp32_ukernel_9p1c__scalar_fmagic;
1465 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
1466 xnn_params.qs8.dwconv[0].channel_tile = 1;
1467 xnn_params.qs8.dwconv[0].primary_tile = 9;
1468 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_dwconv_minmax_fp32_ukernel_25p1c__scalar_fmagic;
1469 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
1470 xnn_params.qs8.dwconv[1].channel_tile = 1;
1471 xnn_params.qs8.dwconv[1].primary_tile = 25;
1472
1473 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
1474 .unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1,
1475 .multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1,
1476 .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params,
1477 .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_scalar_imagic_params,
1478 .row_tile = 7,
1479 .channel_tile = 1,
1480 };
1481
1482 xnn_params.qs8.lrelu = (struct vunary_parameters) {
1483 .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vlrelu_ukernel__armsimd32_x4,
1484 .init.qs8_lrelu = xnn_init_qs8_lrelu_armsimd32_params,
1485 .element_tile = 4,
1486 };
1487 #endif // XNN_NO_QS8_OPERATORS
1488
1489 /*************************** QU8 AArch32 Pre-NEON micro-kernels ***************************/
1490 #ifndef XNN_NO_QU8_OPERATORS
1491 init_flags |= XNN_INIT_FLAG_QU8;
1492
1493 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32);
1494 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_fp32_ukernel_2x2c4__armsimd32);
1495 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32);
1496 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_fp32_ukernel_1x2c4__armsimd32);
1497 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_armsimd32_params;
1498 xnn_params.qu8.gemm.mr = 2;
1499 xnn_params.qu8.gemm.nr = 2;
1500 xnn_params.qu8.gemm.log2_kr = 2;
1501
1502 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qu8_dwconv_minmax_fp32_ukernel_9p1c__scalar_fmagic;
1503 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
1504 xnn_params.qu8.dwconv[0].channel_tile = 1;
1505 xnn_params.qu8.dwconv[0].primary_tile = 9;
1506 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qu8_dwconv_minmax_fp32_ukernel_25p1c__scalar_fmagic;
1507 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
1508 xnn_params.qu8.dwconv[1].channel_tile = 1;
1509 xnn_params.qu8.dwconv[1].primary_tile = 25;
1510
1511 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
1512 .unipass = (xnn_avgpool_unipass_ukernel_fn) xnn_qu8_avgpool_minmax_fp32_ukernel_9x__scalar_imagic_c1,
1513 .multipass = (xnn_avgpool_multipass_ukernel_fn) xnn_qu8_avgpool_minmax_fp32_ukernel_9p8x__scalar_imagic_c1,
1514 .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params,
1515 .primary_tile = 9,
1516 .incremental_tile = 8,
1517 .channel_tile = 1,
1518 };
1519 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
1520 .unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1,
1521 .multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1,
1522 .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params,
1523 .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_scalar_imagic_params,
1524 .row_tile = 7,
1525 .channel_tile = 1,
1526 };
1527
1528 xnn_params.qu8.lrelu = (struct vunary_parameters) {
1529 .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vlrelu_ukernel__armsimd32_x4,
1530 .init.qu8_lrelu = xnn_init_qu8_lrelu_armsimd32_params,
1531 .element_tile = 4,
1532 };
1533 #endif // XNN_NO_QU8_OPERATORS
1534
1535 /**************************** S8 AArch32 Pre-NEON micro-kernels ****************************/
1536 #ifndef XNN_NO_S8_OPERATORS
1537 init_flags |= XNN_INIT_FLAG_S8;
1538
1539 xnn_params.s8.clamp = (struct vunary_parameters) {
1540 .ukernel = (xnn_vunary_ukernel_fn) xnn_s8_vclamp_ukernel__scalar_x4,
1541 .init.s8_minmax = xnn_init_s8_minmax_scalar_params,
1542 .element_tile = 4,
1543 };
1544 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
1545 .ukernel = (xnn_ibilinear_ukernel_fn) xnn_s8_ibilinear_ukernel__scalar_c1,
1546 .pixel_tile = 1,
1547 .channel_tile = 1,
1548 };
1549 xnn_params.s8.maxpool = (struct maxpool_parameters) {
1550 .ukernel = (xnn_maxpool_ukernel_fn) xnn_s8_maxpool_minmax_ukernel_9p8x__scalar_c1,
1551 .init.s8 = xnn_init_s8_minmax_scalar_params,
1552 .mr = 9,
1553 .qr = 8,
1554 };
1555 #endif // XNN_NO_S8_OPERATORS
1556
1557 /**************************** U8 AArch32 Pre-NEON micro-kernels ****************************/
1558 #ifndef XNN_NO_U8_OPERATORS
1559 init_flags |= XNN_INIT_FLAG_U8;
1560
1561 xnn_params.u8.clamp = (struct vunary_parameters) {
1562 .ukernel = (xnn_vunary_ukernel_fn) xnn_u8_vclamp_ukernel__scalar_x4,
1563 .init.u8_minmax = xnn_init_u8_minmax_scalar_params,
1564 .element_tile = 4,
1565 };
1566 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
1567 .ukernel = (xnn_ibilinear_ukernel_fn) xnn_u8_ibilinear_ukernel__scalar_c1,
1568 .pixel_tile = 1,
1569 .channel_tile = 1,
1570 };
1571 xnn_params.u8.maxpool = (struct maxpool_parameters) {
1572 .ukernel = (xnn_maxpool_ukernel_fn) xnn_u8_maxpool_minmax_ukernel_9p8x__scalar_c1,
1573 .init.u8 = xnn_init_u8_minmax_scalar_params,
1574 .mr = 9,
1575 .qr = 8,
1576 };
1577 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
1578 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
1579 #endif // XNN_NO_U8_OPERATORS
1580
1581 /**************************** X8 AArch32 Pre-NEON micro-kernels ****************************/
1582 #ifndef XNN_NO_X8_OPERATORS
1583 init_flags |= XNN_INIT_FLAG_X8;
1584
1585 xnn_params.x8.zip = (struct zip_parameters) {
1586 .x2 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x2_ukernel__scalar,
1587 .x3 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x3_ukernel__scalar,
1588 .x4 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x4_ukernel__scalar,
1589 .xm = (xnn_zipv_ukernel_fn) xnn_x8_zip_xm_ukernel__scalar,
1590 };
1591 #endif // XNN_NO_X8_OPERATORS
1592
1593 /**************************** F32 AArch32 Pre-NEON micro-kernels ****************************/
1594 #ifndef XNN_NO_F32_OPERATORS
1595 init_flags |= XNN_INIT_FLAG_F32;
1596
1597 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_4x4__scalar);
1598 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_4x4__scalar);
1599 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x4__scalar);
1600 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x4__scalar);
1601 xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_relu_ukernel_4x4__scalar);
1602 xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_relu_ukernel_4x4__scalar);
1603 xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_relu_ukernel_1x4__scalar);
1604 xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_relu_ukernel_1x4__scalar);
1605 xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_ukernel_4x4__scalar);
1606 xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_ukernel_4x4__scalar);
1607 xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_ukernel_1x4__scalar);
1608 xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_ukernel_1x4__scalar);
1609 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
1610 xnn_params.f32.gemm.mr = 4;
1611 xnn_params.f32.gemm.nr = 4;
1612
1613 xnn_params.f32.gemm2.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_4x2__scalar);
1614 xnn_params.f32.gemm2.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_4x2__scalar);
1615 xnn_params.f32.gemm2.linear.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_ukernel_4x2__scalar);
1616 xnn_params.f32.gemm2.linear.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_ukernel_4x2__scalar);
1617 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
1618 xnn_params.f32.gemm2.mr = 4;
1619 xnn_params.f32.gemm2.nr = 2;
1620
      // Depthwise convolution micro-kernels, one slot per supported primary
      // tile (kernel-size) of 3, 4, 9, and 25 taps.  All scalar with
      // channel_tile 1; the _acc2 variants accumulate in two partial sums.
      xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_3p1c__scalar_acc2;
      xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_3p1c__scalar_acc2;
      xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
      xnn_params.f32.dwconv[0].channel_tile = 1;
      xnn_params.f32.dwconv[0].primary_tile = 3;

      xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_4p1c__scalar_acc2;
      xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_4p1c__scalar_acc2;
      xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
      xnn_params.f32.dwconv[1].channel_tile = 1;
      xnn_params.f32.dwconv[1].primary_tile = 4;

      xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_9p1c__scalar_acc2;
      xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_9p1c__scalar_acc2;
      xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
      xnn_params.f32.dwconv[2].channel_tile = 1;
      xnn_params.f32.dwconv[2].primary_tile = 9;

      xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_25p1c__scalar_acc2;
      xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_25p1c__scalar_acc2;
      xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
      xnn_params.f32.dwconv[3].channel_tile = 1;
      xnn_params.f32.dwconv[3].primary_tile = 25;
1644
      // Average pooling: unipass handles up to primary_tile (9) elements;
      // larger windows use the multipass kernel, which processes 9 then
      // increments of 8 (incremental_tile).
      xnn_params.f32.avgpool = (struct avgpool_parameters) {
        .unipass = (xnn_avgpool_unipass_ukernel_fn) xnn_f32_avgpool_minmax_ukernel_9x__scalar_c1,
        .multipass = (xnn_avgpool_multipass_ukernel_fn) xnn_f32_avgpool_minmax_ukernel_9p8x__scalar_c1,
        .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
        .primary_tile = 9,
        .incremental_tile = 8,
        .channel_tile = 1,
      };
      // Pixelwise average pooling (per-pixel scale), same 9+8x tiling.
      xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
        .unipass = (xnn_pavgpool_unipass_ukernel_fn) xnn_f32_pavgpool_minmax_ukernel_9x__scalar_c1,
        .multipass = (xnn_pavgpool_multipass_ukernel_fn) xnn_f32_pavgpool_minmax_ukernel_9p8x__scalar_c1,
        .init.f32 = xnn_init_f32_minmax_scalar_params,
        .primary_tile = 9,
        .incremental_tile = 8,
        .channel_tile = 1,
      };
      // Global average pooling: 7-row tile with an update callback for
      // re-initializing params when the row count changes.
      xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
        .unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_f32_gavgpool_minmax_ukernel_7x__scalar_c1,
        .multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1,
        .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
        .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
        .row_tile = 7,
        .channel_tile = 1,
      };
      // Max pooling: single kernel, 9 primary + 8 incremental elements (mr/qr).
      xnn_params.f32.maxpool = (struct maxpool_parameters) {
        .ukernel = (xnn_maxpool_ukernel_fn) xnn_f32_maxpool_minmax_ukernel_9p8x__scalar_c1,
        .init.f32 = xnn_init_f32_minmax_scalar_params,
        .mr = 9,
        .qr = 8,
      };
      // Argmax pooling: unipass kernels for windows up to 4 and up to 9
      // elements; multipass (9p8x) for anything larger.
      xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
        .up = (xnn_argmaxpool_unipass_ukernel_fn) xnn_f32_argmaxpool_ukernel_4x__scalar_c1,
        .mr = 4,
      };
      xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
        .up = (xnn_argmaxpool_unipass_ukernel_fn) xnn_f32_argmaxpool_ukernel_9x__scalar_c1,
        .mr = 9,
      };
      xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
        .mp = (xnn_argmaxpool_multipass_ukernel_fn) xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1,
        .mr = 9,
        .qr = 8,
      };
      // Indirect bilinear interpolation (NHWC), 2 channels per iteration.
      xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
        .ukernel = (xnn_ibilinear_ukernel_fn) xnn_f32_ibilinear_ukernel__scalar_c2,
        .pixel_tile = 1,
        .channel_tile = 2,
      };
      // Elementwise unary micro-kernels.  element_tile is the kernel's inner
      // unroll factor (from the _xN suffix); kernels without an .init entry
      // take no runtime parameters.  The rounding and sqrt kernels defer to
      // libm / sqrt and are unrolled x1.
      xnn_params.f32.abs = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vabs_ukernel__scalar_x4,
        .element_tile = 4,
      };
      xnn_params.f32.clamp = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vclamp_ukernel__scalar_x4,
        .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
        .element_tile = 4,
      };
      xnn_params.f32.elu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x4,
        .init.f32_elu = xnn_init_f32_elu_scalar_rr2_lut16_p3_params,
        .element_tile = 4,
      };
      xnn_params.f32.hswish = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vhswish_ukernel__scalar_x4,
        .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
        .element_tile = 4,
      };
      xnn_params.f32.lrelu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vlrelu_ukernel__scalar_x4,
        .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
        .element_tile = 4,
      };
      xnn_params.f32.neg = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vneg_ukernel__scalar_x4,
        .element_tile = 4,
      };
      xnn_params.f32.rndne = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndne_ukernel__scalar_libm_x1,
        .element_tile = 1,
      };
      xnn_params.f32.rndz = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndz_ukernel__scalar_libm_x1,
        .element_tile = 1,
      };
      xnn_params.f32.rndu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndu_ukernel__scalar_libm_x1,
        .element_tile = 1,
      };
      xnn_params.f32.rndd = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndd_ukernel__scalar_libm_x1,
        .element_tile = 1,
      };
      xnn_params.f32.sigmoid = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsigmoid_ukernel__scalar_rr2_lut64_p2_div_x2,
        .init.f32_sigmoid = xnn_init_f32_sigmoid_scalar_rr2_lut64_p2_params,
        .element_tile = 2,
      };
      xnn_params.f32.sqr = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsqr_ukernel__scalar_x4,
        .element_tile = 4,
      };
      xnn_params.f32.sqrt = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1,
        .element_tile = 1,
      };
      // PReLU: 4 rows x 4 channels per iteration (from the _2x4 kernel's
      // registered tiles below).
      xnn_params.f32.prelu = (struct prelu_parameters) {
        .ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__scalar_2x4,
        .row_tile = 4,
        .channel_tile = 4,
      };
      // Softmax building block: row-sum of exp(x - max) with stored outputs.
      xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
        .ukernel = (xnn_raddstoreexpminusmax_ukernel_fn) xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_x4_acc2,
        .init.f32 = xnn_init_f32_expminus_scalar_rr2_p5_params,
        .element_tile = 4,
      };
      // Running-max reduction (used for the softmax max pass).
      xnn_params.f32.rmax = (xnn_rmax_ukernel_fn) xnn_f32_rmax_ukernel__scalar;
      // Fused multiply-add with per-channel scale and bias.
      xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
        .ukernel = (xnn_vmulcaddc_ukernel_fn) xnn_f32_vmulcaddc_minmax_ukernel_c1__scalar_2x,
        .init.f32 = xnn_init_f32_minmax_scalar_params,
        .channel_tile = 1,
        .row_tile = 2,
      };
      // NCHW (channels-first) fast path: sparse GEMM, direct CHW convolutions,
      // and CHW-layout pooling/resize kernels.
      #ifndef XNN_NO_NCHW_OPERATORS
        init_flags |= XNN_INIT_FLAG_CHW_OPT;

        // Sparse matrix-dense matrix multiplication, with variants for
        // output-channel blocks of 1, 2, and 4 (nr).
        xnn_params.f32.spmm = (struct spmm_parameters) {
          .ukernel = (xnn_spmm_ukernel_fn) xnn_f32_spmm_minmax_ukernel_8x1__scalar,
          .init.f32 = xnn_init_f32_minmax_scalar_params,
          .mr = 8,
          .nr = 1,
        };
        xnn_params.f32.spmm2 = (struct spmm_parameters) {
          .ukernel = (xnn_spmm_ukernel_fn) xnn_f32_spmm_minmax_ukernel_8x2__scalar,
          .init.f32 = xnn_init_f32_minmax_scalar_params,
          .mr = 8,
          .nr = 2,
        };
        xnn_params.f32.spmm4 = (struct spmm_parameters) {
          .ukernel = (xnn_spmm_ukernel_fn) xnn_f32_spmm_minmax_ukernel_8x4__scalar,
          .init.f32 = xnn_init_f32_minmax_scalar_params,
          .mr = 8,
          .nr = 4,
        };
        // 3x3 stride-2 HWC->CHW input convolution (typical first layer);
        // only the symmetric-padding variant is provided.
        xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
          .ukernel_with_symm_padding =
            (xnn_conv_hwc2chw_ukernel_fn) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1,
          .init.f32 = xnn_init_f32_minmax_scalar_params,
          .output_channel_tile = 4,
          .output_height_tile = 1,
          .output_width_tile = 1,
        };
        // Direct CHW depthwise convolutions for the common 3x3, 3x3s2,
        // 5x5, and 5x5s2 kernel/stride combinations.
        xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1,
          .init.f32 = xnn_init_f32_chw_scalar_params,
          .output_height_tile = 4,
          .output_width_tile = 1,
        };
        xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1_acc2,
          .init.f32 = xnn_init_f32_chw_scalar_params,
          .output_height_tile = 2,
          .output_width_tile = 1,
        };
        xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2,
          .init.f32 = xnn_init_f32_chw_scalar_params,
          .output_height_tile = 2,
          .output_width_tile = 1,
        };
        xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2,
          .init.f32 = xnn_init_f32_chw_scalar_params,
          .output_height_tile = 2,
          .output_width_tile = 1,
        };
        // CHW-layout global average pooling and bilinear resize.
        xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
          .ukernel = (xnn_gavgpool_cw_ukernel_fn) xnn_f32_gavgpool_cw_ukernel__scalar_x1,
          .channel_tile = 1,
        };
        xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
          .ukernel = (xnn_ibilinear_chw_ukernel_fn) xnn_f32_ibilinear_chw_ukernel__scalar_p4,
          .channel_tile = 1,
          .pixel_tile = 4,
        };
      #endif  // XNN_NO_NCHW_OPERATORS
    #endif  // XNN_NO_F32_OPERATORS
1831
1832 /*************************** VCVT AArch32 Pre-NEON micro-kernels ***************************/
1833 #ifndef XNN_NO_VCVT_OPERATORS
1834 init_flags |= XNN_INIT_FLAG_VCVT;
1835
1836 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
1837 .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__scalar_x4,
1838 .init.f16_f32_cvt = xnn_init_f16_f32_cvt_scalar_params,
1839 .element_tile = 4,
1840 };
1841 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
1842 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_f16_vcvt_ukernel__scalar_fabsf_x2,
1843 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_scalar_fabsf_params,
1844 .element_tile = 2,
1845 };
1846 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
1847 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qs8_vcvt_ukernel__scalar_imagic_x4,
1848 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_imagic_params,
1849 .element_tile = 4,
1850 };
1851 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
1852 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qu8_vcvt_ukernel__scalar_imagic_x4,
1853 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_imagic_params,
1854 .element_tile = 4,
1855 };
1856 xnn_params.vcvt.qs8 = (struct vunary_parameters) {
1857 .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vcvt_ukernel__armsimd32_x8,
1858 .init.qs8_cvt = xnn_init_qs8_cvt_scalar_params,
1859 .element_tile = 8,
1860 };
1861 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
1862 .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_f32_vcvt_ukernel__scalar_x4,
1863 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_scalar_params,
1864 .element_tile = 4,
1865 };
1866 xnn_params.vcvt.qu8 = (struct vunary_parameters) {
1867 .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vcvt_ukernel__armsimd32_x8,
1868 .init.qu8_cvt = xnn_init_qu8_cvt_scalar_params,
1869 .element_tile = 8,
1870 };
1871 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
1872 .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_f32_vcvt_ukernel__scalar_x4,
1873 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params,
1874 .element_tile = 4,
1875 };
1876 #endif // XNN_NO_VCVT_OPERATORS
1877
1878 /**************************** X32 AArch32 Pre-NEON micro-kernels ****************************/
1879 #ifndef XNN_NO_X32_OPERATORS
1880 init_flags |= XNN_INIT_FLAG_X32;
1881
1882 xnn_params.x32.unpool = (xnn_unpool_ukernel_fn) xnn_x32_unpool_ukernel__scalar;
1883 xnn_params.x32.zip = (struct zip_parameters) {
1884 .x2 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x2_ukernel__scalar,
1885 .x3 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x3_ukernel__scalar,
1886 .x4 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x4_ukernel__scalar,
1887 .xm = (xnn_zipv_ukernel_fn) xnn_x32_zip_xm_ukernel__scalar,
1888 };
1889 #endif // XNN_NO_X32_OPERATORS
1890
1891 /**************************** XX AArch32 Pre-NEON micro-kernels ****************************/
1892 #ifndef XNN_NO_XX_OPERATORS
1893 init_flags |= XNN_INIT_FLAG_XX;
1894
1895 xnn_params.xx.copy = (xnn_vunary_ukernel_fn) xnn_xx_copy_ukernel__scalar_memcpy;
1896 xnn_params.xx.fill = (struct fill_parameters) {
1897 .ukernel = (xnn_fill_ukernel_fn) xnn_xx_fill_ukernel__scalar_x16,
1898 .row_tile = 1,
1899 };
1900 xnn_params.xx.pad = (struct pad_parameters) {
1901 .ukernel = (xnn_pad_ukernel_fn) xnn_xx_pad_ukernel__scalar,
1902 .row_tile = 1,
1903 };
1904 #endif // XNN_NO_XX_OPERATORS
1905 }
1906
1907#elif XNN_ARCH_ARM64
1908
1909 /**************************** QC8 AArch64 micro-kernels ****************************/
1910 #ifndef XNN_NO_QC8_OPERATORS
1911 init_flags |= XNN_INIT_FLAG_QC8;
1912
1913 #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC || XNN_PLATFORM_WINDOWS
1914 #if XNN_ENABLE_ASSEMBLY
1915 if (XNN_ENABLE_ARM_DOTPROD && hardware_config->use_arm_neon_dot) {
1916 #if XNN_ENABLE_ARM_DOTPROD
1917 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__asm_aarch64_neondot_ld128);
1918 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot);
1919 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__asm_aarch64_neondot_ld128);
1920 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot);
1921 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
1922 xnn_params.qc8.gemm.mr = 4;
1923 xnn_params.qc8.gemm.nr = 16;
1924 xnn_params.qc8.gemm.log2_kr = 2;
1925 #endif // XNN_ENABLE_ARM_DOTPROD
1926 } else {
1927 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__asm_aarch64_neon_mlal);
1928 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__asm_aarch64_neon_mlal);
1929 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__asm_aarch64_neon_mlal);
1930 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__asm_aarch64_neon_mlal);
1931 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
1932 xnn_params.qc8.gemm.mr = 2;
1933 xnn_params.qc8.gemm.nr = 8;
1934 xnn_params.qc8.gemm.log2_kr = 3;
1935 }
1936 #else // !XNN_ENABLE_ASSEMBLY
1937 if (XNN_ENABLE_ARM_DOTPROD && hardware_config->use_arm_neon_dot) {
1938 #if XNN_ENABLE_ARM_DOTPROD
1939 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__neondot);
1940 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot);
1941 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__neondot);
1942 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot);
1943 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
1944 xnn_params.qc8.gemm.mr = 4;
1945 xnn_params.qc8.gemm.nr = 16;
1946 xnn_params.qc8.gemm.log2_kr = 2;
1947 #endif // XNN_ENABLE_ARM_DOTPROD
1948 } else {
1949 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
1950 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
1951 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
1952 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
1953 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
1954 xnn_params.qc8.gemm.mr = 2;
1955 xnn_params.qc8.gemm.nr = 8;
1956 xnn_params.qc8.gemm.log2_kr = 1;
1957 xnn_params.qc8.gemm.log2_sr = 2;
1958 }
1959 #endif // XNN_ENABLE_ASSEMBLY
    // Fixed trailing comment: the matching #if also tests XNN_PLATFORM_WINDOWS.
    #else  // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC && !XNN_PLATFORM_WINDOWS
      // On platforms where cpuinfo can identify the core, pick kernels tuned
      // for the big core's microarchitecture.
      #if XNN_ENABLE_ASSEMBLY
        if (XNN_ENABLE_ARM_DOTPROD && hardware_config->use_arm_neon_dot) {
          #if XNN_ENABLE_ARM_DOTPROD
            // Dot-product path: MR=4 assembly kernel variant chosen per uarch
            // (in-order Cortex-A55 gets its dedicated schedule; wide cores
            // (X1/A78) prefer 128-bit loads; default uses 64-bit loads).
            switch (cpuinfo_get_core(0)->uarch) {
              case cpuinfo_uarch_cortex_a55:
                xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__asm_aarch64_neondot_cortex_a55);
                xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__asm_aarch64_neondot_cortex_a55);
                break;
              case cpuinfo_uarch_cortex_x1:
              case cpuinfo_uarch_cortex_a78:
                xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__asm_aarch64_neondot_ld128);
                xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__asm_aarch64_neondot_ld128);
                break;
              default:
                xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__asm_aarch64_neondot_ld64);
                xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__asm_aarch64_neondot_ld64);
                break;
            }
            xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot);
            xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot);
            xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
            xnn_params.qc8.gemm.mr = 4;
            xnn_params.qc8.gemm.nr = 16;
            xnn_params.qc8.gemm.log2_kr = 2;
          #endif  // XNN_ENABLE_ARM_DOTPROD
        } else {
          // No dot-product: choose MLAL kernel shape per uarch.
          switch (cpuinfo_get_core(0)->uarch) {
            case cpuinfo_uarch_cortex_a35:
            case cpuinfo_uarch_kryo:
              xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_4x16__asm_aarch64_neon_mlal_lane_ld64);
              xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_4x16__asm_aarch64_neon_mlal_lane_ld64);
              xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
              xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
              xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
              xnn_params.qc8.gemm.mr = 4;
              xnn_params.qc8.gemm.nr = 16;
              break;

            case cpuinfo_uarch_cortex_a53:
            case cpuinfo_uarch_cortex_a55r0:
              xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_4x16__asm_aarch64_neon_mlal_lane_cortex_a53);
              xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_4x16__asm_aarch64_neon_mlal_lane_cortex_a53);
              xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
              xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
              xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
              xnn_params.qc8.gemm.mr = 4;
              xnn_params.qc8.gemm.nr = 16;
              break;

            case cpuinfo_uarch_cortex_a72:
            case cpuinfo_uarch_cortex_a73:
              xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__asm_aarch64_neon_mlal_prfm);
              xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__asm_aarch64_neon_mlal_prfm);
              xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__asm_aarch64_neon_mlal_prfm);
              xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__asm_aarch64_neon_mlal_prfm);
              xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
              xnn_params.qc8.gemm.mr = 2;
              xnn_params.qc8.gemm.nr = 8;
              xnn_params.qc8.gemm.log2_kr = 3;
              break;

            default:
              xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__asm_aarch64_neon_mlal);
              xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__asm_aarch64_neon_mlal);
              xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__asm_aarch64_neon_mlal);
              xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__asm_aarch64_neon_mlal);
              xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
              xnn_params.qc8.gemm.mr = 2;
              xnn_params.qc8.gemm.nr = 8;
              xnn_params.qc8.gemm.log2_kr = 3;
              break;
          }
        }
        #if XNN_MAX_UARCH_TYPES > 1
          {
            /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
            // Only override slot i when the little core's preferred kernel has
            // the same tiling (mr/nr/log2_kr) as the big core's choice, so
            // packed weights stay compatible across core migrations.
            const uint32_t mr = xnn_params.qc8.gemm.mr;
            const uint32_t nr = xnn_params.qc8.gemm.nr;
            const uint32_t log2_kr = xnn_params.qc8.gemm.log2_kr;
            for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
              const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
              if (uarch_info == NULL) {
                /* No more microarchitectures in the system */
                break;
              }

              switch (uarch_info->uarch) {
                case cpuinfo_uarch_cortex_a53:
                case cpuinfo_uarch_cortex_a55r0:
                  if (mr == 2 && nr == 8 && log2_kr == 3) {
                    xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)].function[i] = (xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__asm_aarch64_neon_mlal_prfm_cortex_a53;
                    xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)].function[i] = (xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__asm_aarch64_neon_mlal_prfm_cortex_a53;
                    xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__asm_aarch64_neon_mlal_prfm_cortex_a53;
                    xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__asm_aarch64_neon_mlal_prfm_cortex_a53;
                  }
                  break;

                case cpuinfo_uarch_cortex_a55:
                  #if XNN_ENABLE_ARM_DOTPROD
                    if (mr == 4 && nr == 16 && log2_kr == 2 && hardware_config->use_arm_neon_dot) {
                      xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__asm_aarch64_neondot_cortex_a55;
                      xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__asm_aarch64_neondot_cortex_a55;
                      xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot;
                      xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot;
                    }
                  #endif  // XNN_ENABLE_ARM_DOTPROD
                  break;
                default:
                  break;
              }
            }
          }
        #endif  // XNN_MAX_UARCH_TYPES > 1
      #else  // !XNN_ENABLE_ASSEMBLY
        // Intrinsics-only build: capability-based choice, no uarch dispatch.
        if (XNN_ENABLE_ARM_DOTPROD && hardware_config->use_arm_neon_dot) {
          #if XNN_ENABLE_ARM_DOTPROD
            xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__neondot);
            xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__neondot);
            xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot);
            xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot);
            xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
            xnn_params.qc8.gemm.mr = 4;
            xnn_params.qc8.gemm.nr = 16;
            xnn_params.qc8.gemm.log2_kr = 2;
          #endif  // XNN_ENABLE_ARM_DOTPROD
        } else {
          xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
          xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
          xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
          xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
          xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
          xnn_params.qc8.gemm.mr = 2;
          xnn_params.qc8.gemm.nr = 8;
          xnn_params.qc8.gemm.log2_kr = 1;
          xnn_params.qc8.gemm.log2_sr = 2;
        }
      #endif  // XNN_ENABLE_ASSEMBLY
    #endif  // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC || XNN_PLATFORM_WINDOWS
2099
    // QC8 depthwise convolution: NEON v8 kernels with 16-channel tile for
    // primary tiles (kernel sizes) 3, 9, and 25.
    xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_3p16c__neonv8_mla8_ld128;
    xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
    xnn_params.qc8.dwconv[0].channel_tile = 16;
    xnn_params.qc8.dwconv[0].primary_tile = 3;
    xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_9p16c__neonv8_mla8_ld64;
    xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
    xnn_params.qc8.dwconv[1].channel_tile = 16;
    xnn_params.qc8.dwconv[1].primary_tile = 9;
    xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_25p16c__neonv8_mla8_ld64;
    xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
    xnn_params.qc8.dwconv[2].channel_tile = 16;
    xnn_params.qc8.dwconv[2].primary_tile = 25;
  #endif  // XNN_NO_QC8_OPERATORS
2113
2114 /**************************** QS8 AArch64 micro-kernels ****************************/
2115 #ifndef XNN_NO_QS8_OPERATORS
2116 init_flags |= XNN_INIT_FLAG_QS8;
2117
2118 #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC || XNN_PLATFORM_WINDOWS
2119 #if XNN_ENABLE_ASSEMBLY
2120 if (XNN_ENABLE_ARM_DOTPROD && hardware_config->use_arm_neon_dot) {
2121 #if XNN_ENABLE_ARM_DOTPROD
2122 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__asm_aarch64_neondot_ld128);
2123 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
2124 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__asm_aarch64_neondot_ld128);
2125 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
2126 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
2127 xnn_params.qs8.gemm.mr = 4;
2128 xnn_params.qs8.gemm.nr = 16;
2129 xnn_params.qs8.gemm.log2_kr = 2;
2130 #endif // XNN_ENABLE_ARM_DOTPROD
2131 } else {
2132 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__asm_aarch64_neon_mlal);
2133 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__asm_aarch64_neon_mlal);
2134 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__asm_aarch64_neon_mlal);
2135 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__asm_aarch64_neon_mlal);
2136 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
2137 xnn_params.qs8.gemm.mr = 2;
2138 xnn_params.qs8.gemm.nr = 8;
2139 xnn_params.qs8.gemm.log2_kr = 3;
2140 }
2141 #else // !XNN_ENABLE_ASSEMBLY
2142 if (XNN_ENABLE_ARM_DOTPROD && hardware_config->use_arm_neon_dot) {
2143 #if XNN_ENABLE_ARM_DOTPROD
2144 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neondot);
2145 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
2146 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot);
2147 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
2148 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
2149 xnn_params.qs8.gemm.mr = 4;
2150 xnn_params.qs8.gemm.nr = 16;
2151 xnn_params.qs8.gemm.log2_kr = 2;
2152 #endif // XNN_ENABLE_ARM_DOTPROD
2153 } else {
2154 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
2155 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
2156 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
2157 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
2158 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
2159 xnn_params.qs8.gemm.mr = 2;
2160 xnn_params.qs8.gemm.nr = 8;
2161 xnn_params.qs8.gemm.log2_kr = 1;
2162 xnn_params.qs8.gemm.log2_sr = 2;
2163 }
2164 #endif // XNN_ENABLE_ASSEMBLY
2165 #else // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
2166 #if XNN_ENABLE_ASSEMBLY
2167 if (XNN_ENABLE_ARM_DOTPROD && hardware_config->use_arm_neon_dot) {
2168 #if XNN_ENABLE_ARM_DOTPROD
2169 switch (cpuinfo_get_core(0)->uarch) {
2170 case cpuinfo_uarch_cortex_a55:
2171 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__asm_aarch64_neondot_cortex_a55);
2172 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__asm_aarch64_neondot_cortex_a55);
2173 break;
2174 case cpuinfo_uarch_cortex_x1:
2175 case cpuinfo_uarch_cortex_a78:
2176 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__asm_aarch64_neondot_ld128);
2177 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__asm_aarch64_neondot_ld128);
2178 break;
2179 default:
2180 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__asm_aarch64_neondot_ld64);
2181 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__asm_aarch64_neondot_ld64);
2182 break;
2183 }
2184 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
2185 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
2186 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
2187 xnn_params.qs8.gemm.mr = 4;
2188 xnn_params.qs8.gemm.nr = 16;
2189 xnn_params.qs8.gemm.log2_kr = 2;
2190 #endif // XNN_ENABLE_ARM_DOTPROD
2191 } else {
2192 switch (cpuinfo_get_core(0)->uarch) {
2193 case cpuinfo_uarch_cortex_a35:
2194 case cpuinfo_uarch_kryo:
2195 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__asm_aarch64_neon_mlal_lane_ld64);
2196 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__asm_aarch64_neon_mlal_lane_ld64);
2197 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2198 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2199 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
2200 xnn_params.qs8.gemm.mr = 4;
2201 xnn_params.qs8.gemm.nr = 16;
2202 break;
2203
2204 case cpuinfo_uarch_cortex_a53:
2205 case cpuinfo_uarch_cortex_a55r0:
2206 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__asm_aarch64_neon_mlal_lane_cortex_a53);
2207 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__asm_aarch64_neon_mlal_lane_cortex_a53);
2208 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2209 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2210 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
2211 xnn_params.qs8.gemm.mr = 4;
2212 xnn_params.qs8.gemm.nr = 16;
2213 break;
2214
2215 case cpuinfo_uarch_cortex_a72:
2216 case cpuinfo_uarch_cortex_a73:
2217 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__asm_aarch64_neon_mlal_prfm);
2218 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__asm_aarch64_neon_mlal_prfm);
2219 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__asm_aarch64_neon_mlal_prfm);
2220 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__asm_aarch64_neon_mlal_prfm);
2221 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
2222 xnn_params.qs8.gemm.mr = 2;
2223 xnn_params.qs8.gemm.nr = 8;
2224 xnn_params.qs8.gemm.log2_kr = 3;
2225 break;
2226
2227 default:
2228 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__asm_aarch64_neon_mlal);
2229 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__asm_aarch64_neon_mlal);
2230 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__asm_aarch64_neon_mlal);
2231 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__asm_aarch64_neon_mlal);
2232 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
2233 xnn_params.qs8.gemm.mr = 2;
2234 xnn_params.qs8.gemm.nr = 8;
2235 xnn_params.qs8.gemm.log2_kr = 3;
2236 break;
2237 }
2238 }
2239 #if XNN_MAX_UARCH_TYPES > 1
2240 {
2241 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
2242 const uint32_t mr = xnn_params.qs8.gemm.mr;
2243 const uint32_t nr = xnn_params.qs8.gemm.nr;
2244 const uint32_t log2_kr = xnn_params.qs8.gemm.log2_kr;
2245 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
2246 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
2247 if (uarch_info == NULL) {
2248 /* No more microarchitectures in the system */
2249 break;
2250 }
2251
2252 switch (uarch_info->uarch) {
2253 case cpuinfo_uarch_cortex_a53:
2254 case cpuinfo_uarch_cortex_a55r0:
2255 if (mr == 2 && nr == 8 && log2_kr == 3) {
2256 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)].function[i] = (xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__asm_aarch64_neon_mlal_prfm_cortex_a53;
2257 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)].function[i] = (xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__asm_aarch64_neon_mlal_prfm_cortex_a53;
2258 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__asm_aarch64_neon_mlal_prfm_cortex_a53;
2259 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__asm_aarch64_neon_mlal_prfm_cortex_a53;
2260 }
2261 break;
2262
2263 case cpuinfo_uarch_cortex_a55:
2264 #if XNN_ENABLE_ARM_DOTPROD
2265 if (mr == 4 && nr == 16 && log2_kr == 2 && hardware_config->use_arm_neon_dot) {
2266 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__asm_aarch64_neondot_cortex_a55;
2267 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__asm_aarch64_neondot_cortex_a55;
2268 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot;
2269 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot;
2270 }
2271 #endif // XNN_ENABLE_ARM_DOTPROD
2272 break;
2273 default:
2274 break;
2275 }
2276 }
2277 }
2278 #endif // XNN_MAX_UARCH_TYPES > 1
2279 #else // !XNN_ENABLE_ASSEMBLY
2280 if (XNN_ENABLE_ARM_DOTPROD && hardware_config->use_arm_neon_dot) {
2281 #if XNN_ENABLE_ARM_DOTPROD
2282 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neondot);
2283 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
2284 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot);
2285 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
2286 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
2287 xnn_params.qs8.gemm.mr = 4;
2288 xnn_params.qs8.gemm.nr = 16;
2289 xnn_params.qs8.gemm.log2_kr = 2;
2290 #endif // XNN_ENABLE_ARM_DOTPROD
2291 } else {
2292 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
2293 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
2294 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
2295 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
2296 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
2297 xnn_params.qs8.gemm.mr = 2;
2298 xnn_params.qs8.gemm.nr = 8;
2299 xnn_params.qs8.gemm.log2_kr = 1;
2300 xnn_params.qs8.gemm.log2_sr = 2;
2301 }
2302 #endif // XNN_ENABLE_ASSEMBLY
2303 #endif // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC || XNN_PLATFORM_WINDOWS
2304
2305 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mla8_ld64;
2306 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
2307 xnn_params.qs8.dwconv[0].channel_tile = 16;
2308 xnn_params.qs8.dwconv[0].primary_tile = 9;
2309 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mla8_ld64;
2310 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
2311 xnn_params.qs8.dwconv[1].channel_tile = 16;
2312 xnn_params.qs8.dwconv[1].primary_tile = 25;
2313
2314 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
2315 .unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8,
2316 .multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8,
2317 .init.qs8 = xnn_init_qs8_avgpool_minmax_rndnu_neon_params,
2318 .update.qs8 = xnn_update_qs8_avgpool_minmax_rndnu_neon_params,
2319 .row_tile = 7,
2320 .channel_tile = 8,
2321 };
2322
2323
2324 xnn_params.qs8.lrelu = (struct vunary_parameters) {
2325 .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vlrelu_ukernel__neon_x32,
2326 .init.qs8_lrelu = xnn_init_qs8_lrelu_neon_params,
2327 .element_tile = 32,
2328 };
2329 #endif // XNN_NO_QS8_OPERATORS
2330
2331 /**************************** QU8 AArch64 micro-kernels ****************************/
2332 #ifndef XNN_NO_QU8_OPERATORS
2333 init_flags |= XNN_INIT_FLAG_QU8;
2334
2335 #if XNN_ENABLE_ASSEMBLY
2336 if (XNN_ENABLE_ARM_DOTPROD && hardware_config->use_arm_neon_dot) {
2337 #if XNN_ENABLE_ARM_DOTPROD
2338 switch (cpuinfo_get_core(0)->uarch) {
2339 case cpuinfo_uarch_cortex_a55:
2340 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__asm_aarch64_neondot_cortex_a55);
2341 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__asm_aarch64_neondot_cortex_a55);
2342 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
2343 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
2344 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2345 xnn_params.qu8.gemm.mr = 4;
2346 xnn_params.qu8.gemm.nr = 16;
2347 xnn_params.qu8.gemm.log2_kr = 2;
2348 break;
2349 default:
2350 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__asm_aarch64_neondot_ld128);
2351 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__asm_aarch64_neondot_ld128);
2352 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
2353 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
2354 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2355 xnn_params.qu8.gemm.mr = 4;
2356 xnn_params.qu8.gemm.nr = 16;
2357 xnn_params.qu8.gemm.log2_kr = 2;
2358 break;
2359 }
2360 #endif // XNN_ENABLE_ARM_DOTPROD
2361 } else {
2362 switch (cpuinfo_get_core(0)->uarch) {
2363 case cpuinfo_uarch_cortex_a53:
2364 case cpuinfo_uarch_cortex_a55r0:
2365 case cpuinfo_uarch_kryo:
2366 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__asm_aarch64_neon_mlal_lane_prfm_cortex_a53);
2367 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__asm_aarch64_neon_mlal_lane_prfm_cortex_a53);
2368 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2369 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2370 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2371 xnn_params.qu8.gemm.mr = 4;
2372 xnn_params.qu8.gemm.nr = 16;
2373 break;
2374
2375 case cpuinfo_uarch_cortex_a57:
2376 case cpuinfo_uarch_cortex_a72:
2377 case cpuinfo_uarch_cortex_a73:
2378 case cpuinfo_uarch_cortex_a75:
2379 case cpuinfo_uarch_cortex_a76:
2380 case cpuinfo_uarch_exynos_m1:
2381 case cpuinfo_uarch_exynos_m2:
2382 case cpuinfo_uarch_exynos_m3:
2383 case cpuinfo_uarch_exynos_m4:
2384 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__asm_aarch64_neon_mlal_lane_prfm_cortex_a75);
2385 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__asm_aarch64_neon_mlal_lane_prfm_cortex_a75);
2386 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2387 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2388 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2389 xnn_params.qu8.gemm.mr = 4;
2390 xnn_params.qu8.gemm.nr = 16;
2391 break;
2392
2393 default:
2394 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__asm_aarch64_neon_mlal_lane_cortex_a75);
2395 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__asm_aarch64_neon_mlal_lane_cortex_a75);
2396 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2397 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2398 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2399 xnn_params.qu8.gemm.mr = 4;
2400 xnn_params.qu8.gemm.nr = 16;
2401 break;
2402 }
2403 }
2404 #if XNN_MAX_UARCH_TYPES > 1
2405 {
2406 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
2407 const uint32_t mr = xnn_params.qu8.gemm.mr;
2408 const uint32_t nr = xnn_params.qu8.gemm.nr;
2409 const uint32_t log2_kr = xnn_params.qu8.gemm.log2_kr;
2410 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
2411 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
2412 if (uarch_info == NULL) {
2413 /* No more microarchitectures in the system */
2414 break;
2415 }
2416
2417 switch (uarch_info->uarch) {
2418 case cpuinfo_uarch_cortex_a53:
2419 case cpuinfo_uarch_cortex_a55r0:
2420 if (mr == 4 && nr == 16 && log2_kr == 0) {
2421 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__asm_aarch64_neon_mlal_lane_prfm_cortex_a53;
2422 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__asm_aarch64_neon_mlal_lane_prfm_cortex_a53;
2423 }
2424 break;
2425
2426 case cpuinfo_uarch_cortex_a55:
2427 #if XNN_ENABLE_ARM_DOTPROD
2428 if (mr == 4 && nr == 16 && log2_kr == 2 && hardware_config->use_arm_neon_dot) {
2429 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__asm_aarch64_neondot_cortex_a55;
2430 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__asm_aarch64_neondot_cortex_a55;
2431 }
2432 #endif // XNN_ENABLE_ARM_DOTPROD
2433 break;
2434 default:
2435 break;
2436 }
2437 }
2438 }
2439 #endif // XNN_MAX_UARCH_TYPES > 1
2440 #else // !XNN_ENABLE_ASSEMBLY
2441 if (XNN_ENABLE_ARM_DOTPROD && hardware_config->use_arm_neon_dot) {
2442 #if XNN_ENABLE_ARM_DOTPROD
2443 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot);
2444 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__neondot);
2445 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
2446 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
2447 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2448 xnn_params.qu8.gemm.mr = 4;
2449 xnn_params.qu8.gemm.nr = 16;
2450 xnn_params.qu8.gemm.log2_kr = 2;
2451 #endif // XNN_ENABLE_ARM_DOTPROD
2452 } else {
2453 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane);
2454 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane);
2455 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2456 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2457 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2458 xnn_params.qu8.gemm.mr = 4;
2459 xnn_params.qu8.gemm.nr = 16;
2460 }
2461 #endif // XNN_ENABLE_ASSEMBLY
2462
2463 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qu8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mul8;
2464 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2465 xnn_params.qu8.dwconv[0].channel_tile = 16;
2466 xnn_params.qu8.dwconv[0].primary_tile = 9;
2467 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qu8_dwconv_minmax_rndnu_ukernel_25p8c__neon_mul8;
2468 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2469 xnn_params.qu8.dwconv[1].channel_tile = 8;
2470 xnn_params.qu8.dwconv[1].primary_tile = 25;
2471
2472 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
2473 .unipass = (xnn_avgpool_unipass_ukernel_fn) xnn_qu8_avgpool_minmax_fp32_ukernel_9x__neon_c8,
2474 .multipass = (xnn_avgpool_multipass_ukernel_fn) xnn_qu8_avgpool_minmax_fp32_ukernel_9p8x__neon_c8,
2475 .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_neon_params,
2476 .primary_tile = 9,
2477 .incremental_tile = 8,
2478 .channel_tile = 8,
2479 };
2480 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
2481 .unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8,
2482 .multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8,
2483 .init.qu8 = xnn_init_qu8_avgpool_minmax_rndnu_neon_params,
2484 .update.qu8 = xnn_update_qu8_avgpool_minmax_rndnu_neon_params,
2485 .row_tile = 7,
2486 .channel_tile = 8,
2487 };
2488
2489 xnn_params.qu8.lrelu = (struct vunary_parameters) {
2490 .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vlrelu_ukernel__neon_x32,
2491 .init.qu8_lrelu = xnn_init_qu8_lrelu_neon_params,
2492 .element_tile = 32,
2493 };
2494 #endif // XNN_NO_QU8_OPERATORS
2495
2496 /**************************** S8 AArch64 micro-kernels ****************************/
2497 #ifndef XNN_NO_S8_OPERATORS
2498 init_flags |= XNN_INIT_FLAG_S8;
2499
2500 xnn_params.s8.clamp = (struct vunary_parameters) {
2501 .ukernel = (xnn_vunary_ukernel_fn) xnn_s8_vclamp_ukernel__neon_x64,
2502 .init.s8_minmax = xnn_init_s8_minmax_neon_params,
2503 .element_tile = 64,
2504 };
2505 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
2506 .ukernel = (xnn_ibilinear_ukernel_fn) xnn_s8_ibilinear_ukernel__neon_c16,
2507 .pixel_tile = 1,
2508 .channel_tile = 16,
2509 };
2510 xnn_params.s8.maxpool = (struct maxpool_parameters) {
2511 .ukernel = (xnn_maxpool_ukernel_fn) xnn_s8_maxpool_minmax_ukernel_9p8x__neon_c16,
2512 .init.s8 = xnn_init_s8_minmax_neon_params,
2513 .mr = 9,
2514 .qr = 8,
2515 };
2516 #endif // XNN_NO_S8_OPERATORS
2517
2518 /**************************** U8 AArch64 micro-kernels ****************************/
2519 #ifndef XNN_NO_U8_OPERATORS
2520 init_flags |= XNN_INIT_FLAG_U8;
2521
2522 xnn_params.u8.clamp = (struct vunary_parameters) {
2523 .ukernel = (xnn_vunary_ukernel_fn) xnn_u8_vclamp_ukernel__neon_x64,
2524 .init.u8_minmax = xnn_init_u8_minmax_neon_params,
2525 .element_tile = 64,
2526 };
2527 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
2528 .ukernel = (xnn_ibilinear_ukernel_fn) xnn_u8_ibilinear_ukernel__neon_c16,
2529 .pixel_tile = 1,
2530 .channel_tile = 16,
2531 };
2532 xnn_params.u8.maxpool = (struct maxpool_parameters) {
2533 .ukernel = (xnn_maxpool_ukernel_fn) xnn_u8_maxpool_minmax_ukernel_9p8x__neon_c16,
2534 .init.u8 = xnn_init_u8_minmax_neon_params,
2535 .mr = 9,
2536 .qr = 8,
2537 };
2538 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
2539 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__neon;
2540 #endif // XNN_NO_U8_OPERATORS
2541
2542 /**************************** X8 AArch64 micro-kernels ****************************/
2543 #ifndef XNN_NO_X8_OPERATORS
2544 init_flags |= XNN_INIT_FLAG_X8;
2545
2546 xnn_params.x8.zip = (struct zip_parameters) {
2547 .x2 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x2_ukernel__neon,
2548 .x3 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x3_ukernel__neon,
2549 .x4 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x4_ukernel__neon,
2550 .xm = (xnn_zipv_ukernel_fn) xnn_x8_zip_xm_ukernel__neon,
2551 };
2552 #endif // XNN_NO_X8_OPERATORS
2553
2554 /**************************** F16 AArch64 micro-kernels ****************************/
2555 #ifndef XNN_NO_F16_OPERATORS
2556 #if XNN_ENABLE_ARM_FP16_VECTOR
2557 if (hardware_config->use_arm_neon_fp16_arith) {
2558 init_flags |= XNN_INIT_FLAG_F16 | XNN_INIT_FLAG_F16_NATIVE;
2559
2560 #if XNN_ENABLE_ASSEMBLY
2561 switch (cpuinfo_get_core(0)->uarch) {
2562 case cpuinfo_uarch_cortex_a55:
2563 xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55);
2564 xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55);
2565 xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64);
2566 xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64);
2567 xnn_params.f16.gemm.init.f16 = xnn_init_f16_minmax_fp16arith_params;
2568 xnn_params.f16.gemm.mr = 6;
2569 xnn_params.f16.gemm.nr = 16;
2570 break;
2571 case cpuinfo_uarch_cortex_a55r0:
2572 case cpuinfo_uarch_cortex_a75:
2573 xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55r0);
2574 xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55r0);
2575 xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64);
2576 xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64);
2577 xnn_params.f16.gemm.init.f16 = xnn_init_f16_minmax_fp16arith_params;
2578 xnn_params.f16.gemm.mr = 6;
2579 xnn_params.f16.gemm.nr = 16;
2580 break;
2581 case cpuinfo_uarch_exynos_m5:
2582 xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f16_gemm_minmax_ukernel_4x16__asm_aarch64_neonfp16arith_ld64);
2583 xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f16_igemm_minmax_ukernel_4x16__asm_aarch64_neonfp16arith_ld64);
2584 xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64);
2585 xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64);
2586 xnn_params.f16.gemm.init.f16 = xnn_init_f16_minmax_fp16arith_params;
2587 xnn_params.f16.gemm.mr = 4;
2588 xnn_params.f16.gemm.nr = 16;
2589 break;
2590 case cpuinfo_uarch_exynos_m4:
2591 xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_ld64);
2592 xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_ld64);
2593 xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64);
2594 xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64);
2595 xnn_params.f16.gemm.init.f16 = xnn_init_f16_minmax_fp16arith_params;
2596 xnn_params.f16.gemm.mr = 6;
2597 xnn_params.f16.gemm.nr = 16;
2598 break;
2599 default:
2600 case cpuinfo_uarch_cortex_a76:
2601 case cpuinfo_uarch_cortex_a77:
2602 case cpuinfo_uarch_cortex_a78:
2603 case cpuinfo_uarch_cortex_x1:
2604 xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a75);
2605 xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a75);
2606 xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64);
2607 xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64);
2608 xnn_params.f16.gemm.init.f16 = xnn_init_f16_minmax_fp16arith_params;
2609 xnn_params.f16.gemm.mr = 6;
2610 xnn_params.f16.gemm.nr = 16;
2611 break;
2612 }
2613
        #if XNN_MAX_UARCH_TYPES > 1
        {
          /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
          const uint32_t mr = xnn_params.f16.gemm.mr;
          const uint32_t nr = xnn_params.f16.gemm.nr;
          for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
            const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
            if (uarch_info == NULL) {
              /* No more microarchitectures in the system */
              break;
            }

            switch (uarch_info->uarch) {
              case cpuinfo_uarch_cortex_a55:
                // Install per-uarch function[i] overrides only when the big-core
                // choice has matching tile sizes, so all cores agree on packing.
                if (mr == 6 && nr == 16) {
                  xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)].function[i] = (xnn_gemm_ukernel_fn) xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55;
                  xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)].function[i] = (xnn_igemm_ukernel_fn) xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55;
                }
                break;
              case cpuinfo_uarch_cortex_a55r0:
              case cpuinfo_uarch_cortex_a75:
                if (mr == 6 && nr == 16) {
                  xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)].function[i] = (xnn_gemm_ukernel_fn) xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55r0;
                  xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)].function[i] = (xnn_igemm_ukernel_fn) xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55r0;
                }
                break;
              default:
                break;
            }
          }
        }
        #endif  // XNN_MAX_UARCH_TYPES > 1
      #else  // XNN_ENABLE_ASSEMBLY
        // No assembly available: fall back to portable NEON FP16 intrinsics kernels.
        xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
        xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
        xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
        xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
        xnn_params.f16.gemm.init.f16 = xnn_init_f16_minmax_fp16arith_params;
        xnn_params.f16.gemm.mr = 6;
        xnn_params.f16.gemm.nr = 16;
      #endif  // XNN_ENABLE_ASSEMBLY
2655
      // F16 depthwise-convolution table, one slot per supported primary tile
      // (kernel size): 3, 4, 9, 25. The 25-tap kernel uses a narrower channel
      // tile (8) with dual accumulators (acc2).
      xnn_params.f16.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f16_dwconv_minmax_ukernel_3p16c__neonfp16arith;
      xnn_params.f16.dwconv[0].init.f16 = xnn_init_f16_minmax_fp16arith_params;
      xnn_params.f16.dwconv[0].channel_tile = 16;
      xnn_params.f16.dwconv[0].primary_tile = 3;

      xnn_params.f16.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f16_dwconv_minmax_ukernel_4p16c__neonfp16arith;
      xnn_params.f16.dwconv[1].init.f16 = xnn_init_f16_minmax_fp16arith_params;
      xnn_params.f16.dwconv[1].channel_tile = 16;
      xnn_params.f16.dwconv[1].primary_tile = 4;

      xnn_params.f16.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f16_dwconv_minmax_ukernel_9p16c__neonfp16arith;
      xnn_params.f16.dwconv[2].init.f16 = xnn_init_f16_minmax_fp16arith_params;
      xnn_params.f16.dwconv[2].channel_tile = 16;
      xnn_params.f16.dwconv[2].primary_tile = 9;

      xnn_params.f16.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f16_dwconv_minmax_ukernel_25p8c__neonfp16arith_acc2;
      xnn_params.f16.dwconv[3].init.f16 = xnn_init_f16_minmax_fp16arith_params;
      xnn_params.f16.dwconv[3].channel_tile = 8;
      xnn_params.f16.dwconv[3].primary_tile = 25;
2675
      // F16 pooling parameters. Unipass kernels handle pooling windows up to
      // primary_tile elements; multipass kernels process primary_tile first,
      // then incremental_tile per additional pass.
      xnn_params.f16.avgpool = (struct avgpool_parameters) {
        .unipass = (xnn_avgpool_unipass_ukernel_fn) xnn_f16_avgpool_minmax_ukernel_9x__neonfp16arith_c8,
        .multipass = (xnn_avgpool_multipass_ukernel_fn) xnn_f16_avgpool_minmax_ukernel_9p8x__neonfp16arith_c8,
        .init.f16 = xnn_init_f16_scaleminmax_fp16arith_params,
        .primary_tile = 9,
        .incremental_tile = 8,
        .channel_tile = 8,
      };
      // Pixelwise average pooling (per-pixel scale, e.g. for SAME padding).
      xnn_params.f16.pavgpool = (struct pavgpool_parameters) {
        .unipass = (xnn_pavgpool_unipass_ukernel_fn) xnn_f16_pavgpool_minmax_ukernel_9x__neonfp16arith_c8,
        .multipass = (xnn_pavgpool_multipass_ukernel_fn) xnn_f16_pavgpool_minmax_ukernel_9p8x__neonfp16arith_c8,
        .init.f16 = xnn_init_f16_minmax_fp16arith_params,
        .primary_tile = 9,
        .incremental_tile = 8,
        .channel_tile = 8,
      };
      // Global average pooling; has an update callback to refresh the scale
      // parameter when the reduction length changes.
      xnn_params.f16.gavgpool = (struct gavgpool_parameters) {
        .unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8,
        .multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8,
        .init.f16 = xnn_init_f16_scaleminmax_fp16arith_params,
        .update.f16 = xnn_update_f16_scaleminmax_fp16arith_params,
        .row_tile = 7,
        .channel_tile = 8,
      };

      xnn_params.f16.maxpool = (struct maxpool_parameters) {
        .ukernel = (xnn_maxpool_ukernel_fn) xnn_f16_maxpool_minmax_ukernel_9p8x__neonfp16arith_c8,
        .init.f16 = xnn_init_f16_minmax_fp16arith_params,
        .mr = 9,
        .qr = 8,
      };
      // Indirect bilinear interpolation (resize) kernel.
      xnn_params.f16.ibilinear = (struct ibilinear_parameters) {
        .ukernel = (xnn_ibilinear_ukernel_fn) xnn_f16_ibilinear_ukernel__neonfp16arith_c8,
        .pixel_tile = 1,
        .channel_tile = 8,
      };

      xnn_params.f16.prelu = (struct prelu_parameters) {
        .ukernel = (xnn_prelu_ukernel_fn) xnn_f16_prelu_ukernel__neonfp16arith_2x16,
        .row_tile = 2,
        .channel_tile = 16,
      };

      // Softmax building blocks: fused reduce-add-store of exp(x - max), plus a
      // running-max reduction kernel.
      xnn_params.f16.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
        .ukernel = (xnn_raddstoreexpminusmax_ukernel_fn) xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x40,
        .init.f16 = xnn_init_f16_expminus_fp16arith_rr2_p2_params,
        .element_tile = 40,
      };
      xnn_params.f16.rmax = (xnn_rmax_ukernel_fn) xnn_f16_rmax_ukernel__neonfp16arith;

      xnn_params.f16.vmulcaddc = (struct vmulcaddc_parameters) {
        .ukernel = (xnn_vmulcaddc_ukernel_fn) xnn_f16_vmulcaddc_minmax_ukernel_c8__neonfp16arith_2x,
        .init.f16 = xnn_init_f16_minmax_fp16arith_params,
        .channel_tile = 8,
        .row_tile = 2,
      };
2732
      // F16 elementwise unary operators. Most kernels process 16 elements per
      // iteration; kernels needing range-reduction state take an init callback.
      xnn_params.f16.abs = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vabs_ukernel__neonfp16arith_x16,
        .element_tile = 16,
      };
      xnn_params.f16.clamp = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vclamp_ukernel__neonfp16arith_x16,
        .init.f16_minmax = xnn_init_f16_minmax_fp16arith_params,
        .element_tile = 16,
      };
      xnn_params.f16.elu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_velu_ukernel__neonfp16arith_rr1_p3_x16,
        .init.f16_elu = xnn_init_f16_elu_fp16arith_rr1_p3_params,
        .element_tile = 16,
      };
      xnn_params.f16.hswish = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vhswish_ukernel__neonfp16arith_x16,
        .init.f16_hswish = xnn_init_f16_hswish_fp16arith_params,
        .element_tile = 16,
      };
      xnn_params.f16.lrelu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vlrelu_ukernel__neonfp16arith_x16,
        .init.f16_lrelu = xnn_init_f16_lrelu_fp16arith_params,
        .element_tile = 16,
      };
      xnn_params.f16.neg = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vneg_ukernel__neonfp16arith_x16,
        .element_tile = 16,
      };
      // Rounding variants: to-nearest-even, toward-zero, up (ceil), down (floor).
      xnn_params.f16.rndne = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vrndne_ukernel__neonfp16arith_x16,
        .element_tile = 16,
      };
      xnn_params.f16.rndz = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vrndz_ukernel__neonfp16arith_x16,
        .element_tile = 16,
      };
      xnn_params.f16.rndu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vrndu_ukernel__neonfp16arith_x16,
        .element_tile = 16,
      };
      xnn_params.f16.rndd = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vrndd_ukernel__neonfp16arith_x16,
        .element_tile = 16,
      };
      xnn_params.f16.sigmoid = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_x40,
        .init.f16_sigmoid = xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
        .element_tile = 40,
      };
      xnn_params.f16.sqr = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vsqr_ukernel__neonfp16arith_x16,
        .element_tile = 16,
      };
      xnn_params.f16.sqrt = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vsqrt_ukernel__aarch64_neonfp16arith_sqrt_x8,
        .element_tile = 8,
      };
2790
      #ifndef XNN_NO_NCHW_OPERATORS
        // NCHW (channels-first) fast path: sparse GEMM and direct CHW kernels.
        init_flags |= XNN_INIT_FLAG_CHW_OPT;

        xnn_params.f16.spmm = (struct spmm_parameters) {
          .ukernel = (xnn_spmm_ukernel_fn) xnn_f16_spmm_minmax_ukernel_32x1__neonfp16arith_pipelined,
          .init.f16 = xnn_init_f16_minmax_fp16arith_params,
          .mr = 32,
          .nr = 1,
        };
        // Input layer: HWC input -> CHW output, 3x3 stride-2 conv with 3 channels.
        xnn_params.f16.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
          .ukernel_with_symm_padding =
            (xnn_conv_hwc2chw_ukernel_fn) xnn_f16_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfp16arith_2x2,
          .init.f16 = xnn_init_f16_minmax_fp16arith_params,
          .output_channel_tile = 4,
          .output_height_tile = 2,
          .output_width_tile = 2,
        };
        // Direct CHW depthwise kernels for the common 3x3/5x5, stride-1/2 shapes.
        xnn_params.f16.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f16_dwconv2d_chw_ukernel_3x3p1__neonfp16arith_2x8,
          .init.f16 = xnn_init_f16_chw_neonfp16arith_stride1_params,
          .update.f16 = xnn_update_f16_chw_neonfp16arith_stride1_params,
          .output_height_tile = 2,
          .output_width_tile = 8,
        };
        xnn_params.f16.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x8,
          .init.f16 = xnn_init_f16_chw_neonfp16arith_stride2_params,
          .update.f16 = xnn_update_f16_chw_neonfp16arith_stride2_params,
          .output_height_tile = 1,
          .output_width_tile = 8,
        };
        xnn_params.f16.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x8,
          .init.f16 = xnn_init_f16_chw_neonfp16arith_stride1_params,
          .update.f16 = xnn_update_f16_chw_neonfp16arith_stride1_params,
          .output_height_tile = 1,
          .output_width_tile = 8,
        };
        xnn_params.f16.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_1x8,
          .init.f16 = xnn_init_f16_chw_neonfp16arith_stride2_params,
          .update.f16 = xnn_update_f16_chw_neonfp16arith_stride2_params,
          .output_height_tile = 1,
          .output_width_tile = 8,
        };
        xnn_params.f16.gavgpool_cw = (struct gavgpool_cw_parameters) {
          .ukernel = (xnn_gavgpool_cw_ukernel_fn) xnn_f16_gavgpool_cw_ukernel__neonfp16arith_x8,
          .init.f16 = xnn_init_f16_gavgpool_neonfp16arith_params,
          .update.f16 = xnn_update_f16_gavgpool_neonfp16arith_params,
          .channel_tile = 8,
        };
        xnn_params.f16.ibilinear_chw = (struct ibilinear_chw_parameters) {
          .ukernel = (xnn_ibilinear_chw_ukernel_fn) xnn_f16_ibilinear_chw_ukernel__neonfp16arith_p8,
          .channel_tile = 1,
          .pixel_tile = 8,
        };
      #endif  // XNN_NO_NCHW_OPERATORS
    }
    #endif  // XNN_ENABLE_ARM_FP16_VECTOR
  #endif  // XNN_NO_F16_OPERATORS
2851
  /**************************** F32 AArch64 micro-kernels ****************************/
  #ifndef XNN_NO_F32_OPERATORS
    init_flags |= XNN_INIT_FLAG_F32;

    #if XNN_ENABLE_ASSEMBLY && !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
      // Pick F32 GEMM/IGEMM micro-kernels tuned for the detected big-core
      // microarchitecture; also records the tile sizes (mr/nr, optional log2_sr)
      // that operator setup uses for packing.
      switch (cpuinfo_get_core(0)->uarch) {
        case cpuinfo_uarch_cortex_a72:
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_prfm_cortex_a75);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch64_neonfma_prfm_cortex_a75);
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_prfm_cortex_a75);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_prfm_cortex_a75);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.gemm.mr = 4;
          xnn_params.f32.gemm.nr = 8;
          break;
        case cpuinfo_uarch_cortex_a57:
        case cpuinfo_uarch_cortex_a75:
        case cpuinfo_uarch_cortex_a76:
        case cpuinfo_uarch_exynos_m3:
        case cpuinfo_uarch_exynos_m4:
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_prfm_cortex_a75);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_6x8__asm_aarch64_neonfma_prfm_cortex_a75);
          // Optional MR=4 specialization, used when M is small.
          #if XNN_ENABLE_GEMM_M_SPECIALIZATION
            xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_prfm_cortex_a75);
            xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch64_neonfma_prfm_cortex_a75);
          #endif
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_prfm_cortex_a75);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_prfm_cortex_a75);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.gemm.mr = 6;
          xnn_params.f32.gemm.nr = 8;
          // JIT code generators mirroring the prebuilt kernels above.
          #if XNN_ENABLE_JIT
            xnn_params.f32.gemm.generator.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_codegen(xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
            xnn_params.f32.gemm.generator.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_codegen(xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
            xnn_params.f32.gemm.generator.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_codegen(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
            xnn_params.f32.gemm.generator.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_codegen(xnn_generate_f32_igemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
          #endif
          break;
        case cpuinfo_uarch_exynos_m1:
        case cpuinfo_uarch_exynos_m2:
          // Intrinsics kernels with shuffle packing (s4): log2_sr = 2 below.
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma);
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.gemm.mr = 6;
          xnn_params.f32.gemm.nr = 8;
          xnn_params.f32.gemm.log2_sr = 2;
          break;
        case cpuinfo_uarch_cortex_a53:
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_prfm_cortex_a53);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_6x8__asm_aarch64_neonfma_prfm_cortex_a53);
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_prfm_cortex_a53);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_prfm_cortex_a53);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.gemm.mr = 6;
          xnn_params.f32.gemm.nr = 8;
          break;
        case cpuinfo_uarch_cortex_a55r0:
          // A55r0 uses the A53 schedule without prefetch (no prfm).
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a53);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a53);
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.gemm.mr = 6;
          xnn_params.f32.gemm.nr = 8;
          break;
        case cpuinfo_uarch_cortex_a35:
        case cpuinfo_uarch_cortex_a55:
        case cpuinfo_uarch_kryo:
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a55);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a55);
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.gemm.mr = 6;
          xnn_params.f32.gemm.nr = 8;
          break;
        case cpuinfo_uarch_cortex_a73:
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a73);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a73);
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_prfm_cortex_a75);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_prfm_cortex_a75);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.gemm.mr = 6;
          xnn_params.f32.gemm.nr = 8;
          break;
        case cpuinfo_uarch_cortex_a77:
        case cpuinfo_uarch_exynos_m5:
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a75);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a75);
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a75);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a75);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.gemm.mr = 4;
          xnn_params.f32.gemm.nr = 8;
          break;
        case cpuinfo_uarch_cortex_a78:
        case cpuinfo_uarch_cortex_x1:
        default:
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_ld128);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_6x8__asm_aarch64_neonfma_ld128);
          // NOTE(review): MR=1 here is the intrinsics kernel (no `asm_` in the
          // name), unlike the asm MR=6 pair — presumably intentional; confirm.
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_lane_ld64);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_lane_ld64);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.gemm.mr = 6;
          xnn_params.f32.gemm.nr = 8;
          #if XNN_ENABLE_JIT
            xnn_params.f32.gemm.generator.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_codegen(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
            xnn_params.f32.gemm.generator.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_codegen(xnn_generate_f32_igemm_ukernel_6x8__aarch64_neonfma_ld128);
          #endif
          break;
      }
      #if XNN_MAX_UARCH_TYPES > 1
      {
        /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
        const uint32_t mr = xnn_params.f32.gemm.mr;
        const uint32_t nr = xnn_params.f32.gemm.nr;
        const uint32_t log2_sr = xnn_params.f32.gemm.log2_sr;
        for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
          const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
          if (uarch_info == NULL) {
            /* No more microarchitectures in the system */
            break;
          }

          // Overrides are installed only when tile sizes (and packing: log2_sr)
          // match the big-core selection, so all cores share one packed layout.
          switch (uarch_info->uarch) {
            case cpuinfo_uarch_cortex_a53:
              if (mr == 6 && nr == 8 && log2_sr == 0) {
                xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)].function[i] = (xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_prfm_cortex_a53;
                xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)].function[i] = (xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_6x8__asm_aarch64_neonfma_prfm_cortex_a53;
                // NOTE(review): MR=1 gemm uses the non-prfm variant while igemm
                // uses prfm — looks deliberate but worth confirming upstream.
                xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53;
                xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_prfm_cortex_a53;
                #if XNN_ENABLE_JIT
                  xnn_params.f32.gemm.generator.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_jit_gemm_code_generator_fn) xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53;
                  xnn_params.f32.gemm.generator.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_jit_igemm_code_generator_fn) xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53;
                  xnn_params.f32.gemm.generator.gemm[XNN_MR_TO_INDEX(6)].function[i] = (xnn_jit_gemm_code_generator_fn) xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a53;
                  xnn_params.f32.gemm.generator.igemm[XNN_MR_TO_INDEX(6)].function[i] = (xnn_jit_igemm_code_generator_fn) xnn_generate_f32_igemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a53;
                #endif
              } else if (mr == 4 && nr == 8 && log2_sr == 0) {
                xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_prfm_cortex_a53;
                xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch64_neonfma_prfm_cortex_a53;
                xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53;
                xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_prfm_cortex_a53;
                #if XNN_ENABLE_JIT
                  xnn_params.f32.gemm.generator.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_jit_gemm_code_generator_fn) xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53;
                  xnn_params.f32.gemm.generator.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_jit_igemm_code_generator_fn) xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53;
                  xnn_params.f32.gemm.generator.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_jit_gemm_code_generator_fn) xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53;
                  xnn_params.f32.gemm.generator.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_jit_igemm_code_generator_fn) xnn_generate_f32_igemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53;
                #endif
              }
              break;
            case cpuinfo_uarch_cortex_a55r0:
              if (mr == 6 && nr == 8 && log2_sr == 0) {
                xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)].function[i] = (xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a53;
                xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)].function[i] = (xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a53;
                xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53;
                xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53;
                #if XNN_ENABLE_JIT
                  xnn_params.f32.gemm.generator.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_jit_gemm_code_generator_fn) xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53;
                  xnn_params.f32.gemm.generator.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_jit_igemm_code_generator_fn) xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53;
                  xnn_params.f32.gemm.generator.gemm[XNN_MR_TO_INDEX(6)].function[i] = (xnn_jit_gemm_code_generator_fn) xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53;
                  xnn_params.f32.gemm.generator.igemm[XNN_MR_TO_INDEX(6)].function[i] = (xnn_jit_igemm_code_generator_fn) xnn_generate_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53;
                #endif
              } else if (mr == 4 && nr == 8 && log2_sr == 0) {
                xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a53;
                xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a53;
                xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53;
                xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53;
                #if XNN_ENABLE_JIT
                  xnn_params.f32.gemm.generator.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_jit_gemm_code_generator_fn) xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53;
                  xnn_params.f32.gemm.generator.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_jit_igemm_code_generator_fn) xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53;
                  xnn_params.f32.gemm.generator.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_jit_gemm_code_generator_fn) xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53;
                  xnn_params.f32.gemm.generator.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_jit_igemm_code_generator_fn) xnn_generate_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53;
                #endif
              }
              break;
            case cpuinfo_uarch_cortex_a55:
              if (mr == 6 && nr == 8 && log2_sr == 0) {
                xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)].function[i] = (xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a55;
                xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)].function[i] = (xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a55;
                xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53;
                xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53;
                #if XNN_ENABLE_JIT
                  xnn_params.f32.gemm.generator.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_jit_gemm_code_generator_fn) xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53;
                  xnn_params.f32.gemm.generator.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_jit_igemm_code_generator_fn) xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53;
                  xnn_params.f32.gemm.generator.gemm[XNN_MR_TO_INDEX(6)].function[i] = (xnn_jit_gemm_code_generator_fn) xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a55;
                  xnn_params.f32.gemm.generator.igemm[XNN_MR_TO_INDEX(6)].function[i] = (xnn_jit_igemm_code_generator_fn) xnn_generate_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a55;
                #endif
              } else if (mr == 4 && nr == 8 && log2_sr == 0) {
                xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a55;
                xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a55;
                xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53;
                xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53;
                #if XNN_ENABLE_JIT
                  xnn_params.f32.gemm.generator.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_jit_gemm_code_generator_fn) xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53;
                  xnn_params.f32.gemm.generator.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_jit_igemm_code_generator_fn) xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53;
                  xnn_params.f32.gemm.generator.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_jit_gemm_code_generator_fn) xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a55;
                  xnn_params.f32.gemm.generator.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_jit_igemm_code_generator_fn) xnn_generate_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a55;
                #endif
              }
              break;
            default:
              break;
          }
        }
      }
      #endif  // XNN_MAX_UARCH_TYPES > 1
      // Secondary 4x2 GEMM/IGEMM (gemm2) for narrow output tiles — Cortex-A75-tuned
      // assembly with software prefetch (prfm).
      xnn_params.f32.gemm2.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_4x2__asm_aarch64_neonfma_prfm_cortex_a75);
      xnn_params.f32.gemm2.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_4x2__asm_aarch64_neonfma_prfm_cortex_a75);
      xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
      xnn_params.f32.gemm2.mr = 4;
      xnn_params.f32.gemm2.nr = 2;

    #else  // XNN_ENABLE_ASSEMBLY && !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
      #if XNN_ENABLE_ASSEMBLY
        // Assembly available but no per-uarch dispatch (iOS/Mac): use the
        // Cortex-A75-tuned kernels unconditionally.
        xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_prfm_cortex_a75);
        xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_6x8__asm_aarch64_neonfma_prfm_cortex_a75);
        xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_prfm_cortex_a75);
        xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_prfm_cortex_a75);
        xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
        xnn_params.f32.gemm.mr = 6;
        xnn_params.f32.gemm.nr = 8;

        xnn_params.f32.gemm2.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_4x2__asm_aarch64_neonfma_prfm_cortex_a75);
        xnn_params.f32.gemm2.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_4x2__asm_aarch64_neonfma_prfm_cortex_a75);
        xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
        xnn_params.f32.gemm2.mr = 4;
        xnn_params.f32.gemm2.nr = 2;
      #else  // !XNN_ENABLE_ASSEMBLY
        // Portable NEONFMA intrinsics fallback when assembly kernels are disabled.
        xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_lane_ld64);
        xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_lane_ld64);
        xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_lane_ld64);
        xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_lane_ld64);
        xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
        xnn_params.f32.gemm.mr = 6;
        xnn_params.f32.gemm.nr = 8;

        xnn_params.f32.gemm2.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_lane_ld64);
        xnn_params.f32.gemm2.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_4x2__aarch64_neonfma_lane_ld64);
        xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
        xnn_params.f32.gemm2.mr = 4;
        xnn_params.f32.gemm2.nr = 2;
      #endif  // XNN_ENABLE_ASSEMBLY
    #endif  // XNN_ENABLE_ASSEMBLY && !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
3097
    // F32 depthwise convolution microkernels, bucketed by primary tile (taps):
    // dwconv[0] = 3, dwconv[1] = 4, dwconv[2] = 9, dwconv[3] = 25.
    xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_3p8c__neonfma;
    xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
    xnn_params.f32.dwconv[0].channel_tile = 8;
    xnn_params.f32.dwconv[0].primary_tile = 3;

    xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_4p8c__neonfma;
    xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
    xnn_params.f32.dwconv[1].channel_tile = 8;
    xnn_params.f32.dwconv[1].primary_tile = 4;

    #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC || XNN_PLATFORM_WINDOWS
      // No cpuinfo-based uarch dispatch on these platforms: always use the
      // generic NEONFMA 9-tap kernel.
      xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_9p8c__neonfma;
      xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
      xnn_params.f32.dwconv[2].channel_tile = 8;
      xnn_params.f32.dwconv[2].primary_tile = 9;
    #else  // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC && !XNN_PLATFORM_WINDOWS
      // Pick the 9-tap kernel based on the microarchitecture of core 0.
      switch (cpuinfo_get_core(0)->uarch) {
        case cpuinfo_uarch_kryo:
          xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_9p8c__neonfma;
          xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.dwconv[2].channel_tile = 8;
          xnn_params.f32.dwconv[2].primary_tile = 9;
          break;
        #if XNN_ENABLE_ASSEMBLY
        case cpuinfo_uarch_cortex_a53:
        case cpuinfo_uarch_cortex_a55r0:
        case cpuinfo_uarch_cortex_a55:
          // In-order little cores: narrower (4-channel) assembly kernel tuned for A55.
          xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_9p4c__asm_aarch64_neonfma_cortex_a55;
          xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.dwconv[2].channel_tile = 4;
          xnn_params.f32.dwconv[2].primary_tile = 9;
          break;
        #endif  // XNN_ENABLE_ASSEMBLY
        default:
          xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_9p8c__neonfma;
          xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.dwconv[2].channel_tile = 8;
          xnn_params.f32.dwconv[2].primary_tile = 9;
          break;
      }
    #endif  // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC || XNN_PLATFORM_WINDOWS

    xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_25p8c__neonfma_acc2;
    xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
    xnn_params.f32.dwconv[3].channel_tile = 8;
    xnn_params.f32.dwconv[3].primary_tile = 25;
3144
    // F32 pooling parameter tables. "unipass" handles up to primary_tile inputs in
    // one pass; "multipass" accumulates incremental_tile more per extra pass.
    xnn_params.f32.avgpool = (struct avgpool_parameters) {
      .unipass = (xnn_avgpool_unipass_ukernel_fn) xnn_f32_avgpool_minmax_ukernel_9x__neon_c4,
      .multipass = (xnn_avgpool_multipass_ukernel_fn) xnn_f32_avgpool_minmax_ukernel_9p8x__neon_c4,
      .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
      .primary_tile = 9,
      .incremental_tile = 8,
      .channel_tile = 4,
    };
    xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
      .unipass = (xnn_pavgpool_unipass_ukernel_fn) xnn_f32_pavgpool_minmax_ukernel_9x__neon_c4,
      .multipass = (xnn_pavgpool_multipass_ukernel_fn) xnn_f32_pavgpool_minmax_ukernel_9p8x__neon_c4,
      .init.f32 = xnn_init_f32_minmax_scalar_params,
      .primary_tile = 9,
      .incremental_tile = 8,
      .channel_tile = 4,
    };
    xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
      .unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4,
      .multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4,
      .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
      .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
      .row_tile = 7,
      .channel_tile = 4,
    };
    xnn_params.f32.maxpool = (struct maxpool_parameters) {
      .ukernel = (xnn_maxpool_ukernel_fn) xnn_f32_maxpool_minmax_ukernel_9p8x__neon_c4,
      .init.f32 = xnn_init_f32_minmax_scalar_params,
      .mr = 9,
      .qr = 8,
    };
    // Argmax pooling: slots 0/1 are unipass (up) variants for small windows,
    // slot 2 is the multipass (mp) variant for windows larger than 9.
    xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
      .up = (xnn_argmaxpool_unipass_ukernel_fn) xnn_f32_argmaxpool_ukernel_4x__neon_c4,
      .mr = 4,
    };
    xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
      .up = (xnn_argmaxpool_unipass_ukernel_fn) xnn_f32_argmaxpool_ukernel_9x__neon_c4,
      .mr = 9,
    };
    xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
      .mp = (xnn_argmaxpool_multipass_ukernel_fn) xnn_f32_argmaxpool_ukernel_9p8x__neon_c4,
      .mr = 9,
      .qr = 8,
    };
    xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
      .ukernel = (xnn_ibilinear_ukernel_fn) xnn_f32_ibilinear_ukernel__neonfma_c8,
      .pixel_tile = 1,
      .channel_tile = 8,
    };
    // Elementwise unary microkernels; element_tile is the per-iteration vector width.
    xnn_params.f32.abs = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vabs_ukernel__neon_x8,
      .element_tile = 8,
    };
    xnn_params.f32.clamp = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vclamp_ukernel__neon_x8,
      .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
      .element_tile = 8,
    };
    xnn_params.f32.elu = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_velu_ukernel__neonfma_rr1_lut16_p3_x16,
      .init.f32_elu = xnn_init_f32_elu_neonfma_rr1_lut16_p3_params,
      .element_tile = 16,
    };
    xnn_params.f32.hswish = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vhswish_ukernel__neon_x16,
      .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
      .element_tile = 16,
    };
    xnn_params.f32.lrelu = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vlrelu_ukernel__neon_x8,
      .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
      .element_tile = 8,
    };
    xnn_params.f32.neg = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vneg_ukernel__neon_x8,
      .element_tile = 8,
    };
    // Rounding variants use ARMv8 NEON rounding instructions (neonv8).
    xnn_params.f32.rndne = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndne_ukernel__neonv8_x8,
      .element_tile = 8,
    };
    xnn_params.f32.rndz = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndz_ukernel__neonv8_x8,
      .element_tile = 8,
    };
    xnn_params.f32.rndu = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndu_ukernel__neonv8_x8,
      .element_tile = 8,
    };
    xnn_params.f32.rndd = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndd_ukernel__neonv8_x8,
      .element_tile = 8,
    };
    xnn_params.f32.sigmoid = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_x16,
      .init.f32_sigmoid = xnn_init_f32_sigmoid_neonfma_rr1_lut64_p2_params,
      .element_tile = 16,
    };
    xnn_params.f32.sqr = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsqr_ukernel__neon_x8,
      .element_tile = 8,
    };
    xnn_params.f32.sqrt = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsqrt_ukernel__aarch64_neon_sqrt_x4,
      .element_tile = 4,
    };
    xnn_params.f32.prelu = (struct prelu_parameters) {
      .ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__neon_2x8,
      .row_tile = 2,
      .channel_tile = 8,
    };
    // Fused exp(x - max) + reduce-add, used by softmax.
    xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
      .ukernel = (xnn_raddstoreexpminusmax_ukernel_fn) xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_x16,
      .init.f32 = xnn_init_f32_expminus_neonfma_rr1_lut64_p2_params,
      .element_tile = 16,
    };
    xnn_params.f32.rmax = (xnn_rmax_ukernel_fn) xnn_f32_rmax_ukernel__neon;
    xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
      .ukernel = (xnn_vmulcaddc_ukernel_fn) xnn_f32_vmulcaddc_minmax_ukernel_c4__neonfma_2x,
      .init.f32 = xnn_init_f32_minmax_scalar_params,
      .channel_tile = 4,
      .row_tile = 2,
    };
    #ifndef XNN_NO_NCHW_OPERATORS
      // Sparse/CHW-layout (NCHW) operator support.
      init_flags |= XNN_INIT_FLAG_CHW_OPT;

      // Sparse matrix-dense matrix multiplication, at output-channel block sizes 1/2/4.
      xnn_params.f32.spmm = (struct spmm_parameters) {
        .ukernel = (xnn_spmm_ukernel_fn) xnn_f32_spmm_minmax_ukernel_32x1__neonfma_pipelined,
        .init.f32 = xnn_init_f32_minmax_scalar_params,
        .mr = 32,
        .nr = 1,
      };
      xnn_params.f32.spmm2 = (struct spmm_parameters) {
        .ukernel = (xnn_spmm_ukernel_fn) xnn_f32_spmm_minmax_ukernel_32x2__aarch64_neonfma,
        .init.f32 = xnn_init_f32_minmax_scalar_params,
        .mr = 32,
        .nr = 2,
      };
      xnn_params.f32.spmm4 = (struct spmm_parameters) {
        .ukernel = (xnn_spmm_ukernel_fn) xnn_f32_spmm_minmax_ukernel_32x4__aarch64_neonfma,
        .init.f32 = xnn_init_f32_minmax_scalar_params,
        .mr = 32,
        .nr = 4,
      };
      // Direct 3x3 stride-2 convolution converting HWC input to CHW output.
      xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
        .ukernel_with_symm_padding =
          (xnn_conv_hwc2chw_ukernel_fn) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__aarch64_neonfma_2x2,
        .init.f32 = xnn_init_f32_minmax_scalar_params,
        .output_channel_tile = 4,
        .output_height_tile = 2,
        .output_width_tile = 2,
      };
      // CHW-layout depthwise convolutions; stride-1 and stride-2 use distinct params.
      xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f32_dwconv2d_chw_ukernel_3x3p1__aarch64_neonfma_3x4,
        .init.f32 = xnn_init_f32_chw_neon_stride1_params,
        .update.f32 = xnn_update_f32_chw_neon_stride1_params,
        .output_height_tile = 3,
        .output_width_tile = 4,
      };
      xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__aarch64_neonfma_2x4_acc2,
        .init.f32 = xnn_init_f32_chw_neon_stride2_params,
        .update.f32 = xnn_update_f32_chw_neon_stride2_params,
        .output_height_tile = 2,
        .output_width_tile = 4,
      };
      xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f32_dwconv2d_chw_ukernel_5x5p2__aarch64_neonfma_4x4,
        .init.f32 = xnn_init_f32_chw_neon_stride1_params,
        .update.f32 = xnn_update_f32_chw_neon_stride1_params,
        .output_height_tile = 4,
        .output_width_tile = 4,
      };
      xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__aarch64_neonfma_1x4_acc2,
        .init.f32 = xnn_init_f32_chw_neon_stride2_params,
        .update.f32 = xnn_update_f32_chw_neon_stride2_params,
        .output_height_tile = 1,
        .output_width_tile = 4,
      };
      xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
        .ukernel = (xnn_gavgpool_cw_ukernel_fn) xnn_f32_gavgpool_cw_ukernel__neon_x4,
        .channel_tile = 4,
      };
      xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
        .ukernel = (xnn_ibilinear_chw_ukernel_fn) xnn_f32_ibilinear_chw_ukernel__neonfma_p8,
        .channel_tile = 1,
        .pixel_tile = 8,
      };
    #endif  // XNN_NO_NCHW_OPERATORS
  #endif  // XNN_NO_F32_OPERATORS
3335
  /*************************** VCVT AArch64 micro-kernels ***************************/
  #ifndef XNN_NO_VCVT_OPERATORS
    init_flags |= XNN_INIT_FLAG_VCVT;

    // Datatype-conversion microkernels (fp16<->fp32, fp32<->int8, int8 requantization).
    xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__neonfp16_x16,
      .element_tile = 16,
    };
    xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_f16_vcvt_ukernel__neonfp16_x16,
      .element_tile = 16,
    };
    xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qs8_vcvt_ukernel__neonv8_x32,
      .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_neonv8_params,
      .element_tile = 32,
    };
    xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qu8_vcvt_ukernel__neonv8_x32,
      .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_neonv8_params,
      .element_tile = 32,
    };
    xnn_params.vcvt.qs8 = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vcvt_ukernel__neon_x32,
      .init.qs8_cvt = xnn_init_qs8_cvt_neon_params,
      .element_tile = 32,
    };
    xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_f32_vcvt_ukernel__neon_x32,
      .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_neon_params,
      .element_tile = 32,
    };
    xnn_params.vcvt.qu8 = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vcvt_ukernel__neon_x32,
      .init.qu8_cvt = xnn_init_qu8_cvt_neon_params,
      .element_tile = 32,
    };
    xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_f32_vcvt_ukernel__neon_x32,
      .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_neon_params,
      .element_tile = 32,
    };
  #endif  // XNN_NO_VCVT_OPERATORS
3379
  /**************************** X32 AArch64 micro-kernels ****************************/
  #ifndef XNN_NO_X32_OPERATORS
    init_flags |= XNN_INIT_FLAG_X32;

    // Type-agnostic 32-bit-element microkernels (unpooling and channel interleave).
    xnn_params.x32.unpool = (xnn_unpool_ukernel_fn) xnn_x32_unpool_ukernel__neon;
    xnn_params.x32.zip = (struct zip_parameters) {
      .x2 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x2_ukernel__neon,
      .x3 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x3_ukernel__neon,
      .x4 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x4_ukernel__neon,
      .xm = (xnn_zipv_ukernel_fn) xnn_x32_zip_xm_ukernel__neon,
    };
  #endif  // XNN_NO_X32_OPERATORS
3392
  /**************************** XX AArch64 micro-kernels ****************************/
  #ifndef XNN_NO_XX_OPERATORS
    init_flags |= XNN_INIT_FLAG_XX;

    // Datatype-agnostic byte-level microkernels: copy, fill, and pad.
    xnn_params.xx.copy = (xnn_vunary_ukernel_fn) xnn_xx_copy_ukernel__scalar_memcpy;
    xnn_params.xx.fill = (struct fill_parameters) {
      .ukernel = (xnn_fill_ukernel_fn) xnn_xx_fill_ukernel__neon_x64,
      .row_tile = 1,
    };
    xnn_params.xx.pad = (struct pad_parameters) {
      .ukernel = (xnn_pad_ukernel_fn) xnn_xx_pad_ukernel__neon,
      .row_tile = 1,
    };
  #endif  // XNN_NO_XX_OPERATORS
3407
#elif XNN_ARCH_X86 || XNN_ARCH_X86_64
  /**************************** QC8 x86 micro-kernels ****************************/
  #ifndef XNN_NO_QC8_OPERATORS
    init_flags |= XNN_INIT_FLAG_QC8;

    // GEMM/IGEMM selection by descending ISA capability. The hardware_config flags
    // are assumed to be cumulative (e.g. use_x86_avx2 implies SSE4.1) — the chain
    // order encodes the preference, with SSE2 (x86-64 baseline) as the fallback.
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
      xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx);
      xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx);
      xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx);
      xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx);
      xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_avx512_params;
      xnn_params.qc8.gemm.mr = 4;
      xnn_params.qc8.gemm.nr = 16;
      xnn_params.qc8.gemm.log2_kr = 3;
    } else if (hardware_config->use_x86_xop) {
      // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
      xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
      xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
      xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
      xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
      xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse4_params;
      xnn_params.qc8.gemm.mr = 2;
      xnn_params.qc8.gemm.nr = 4;
      xnn_params.qc8.gemm.log2_kr = 3;
    } else if (hardware_config->use_x86_avx2) {
      xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2);
      xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_3x8c8__avx2);
      xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2);
      xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__avx2);
      xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_avx2_params;
      xnn_params.qc8.gemm.mr = 3;
      xnn_params.qc8.gemm.nr = 8;
      xnn_params.qc8.gemm.log2_kr = 3;
    } else if (hardware_config->use_x86_avx) {
      xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
      xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
      xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
      xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
      xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse4_params;
      xnn_params.qc8.gemm.mr = 2;
      xnn_params.qc8.gemm.nr = 4;
      xnn_params.qc8.gemm.log2_kr = 3;
    } else if (hardware_config->use_x86_sse4_1) {
      xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
      xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
      xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
      xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
      xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse4_params;
      xnn_params.qc8.gemm.mr = 3;
      xnn_params.qc8.gemm.nr = 4;
      xnn_params.qc8.gemm.log2_kr = 3;
    } else {
      xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
      xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
      xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
      xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
      xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse2_params;
      xnn_params.qc8.gemm.mr = 3;
      xnn_params.qc8.gemm.nr = 4;
      xnn_params.qc8.gemm.log2_kr = 3;
    }

    // Depthwise convolution kernels, same ISA preference order as the GEMMs above.
    // dwconv[0]/[1]/[2] correspond to primary tiles 3/9/25 (set after the chain).
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
      xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_3p32c__avx512skx_mul32;
      xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_avx512_params;
      xnn_params.qc8.dwconv[0].channel_tile = 32;
      xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_9p32c__avx512skx_mul32;
      xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_avx512_params;
      xnn_params.qc8.dwconv[1].channel_tile = 32;
      xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_25p32c__avx512skx_mul32;
      xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_avx512_params;
      xnn_params.qc8.dwconv[2].channel_tile = 32;
    } else if (hardware_config->use_x86_xop) {
      // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
      xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_3p16c__xop_mul16_add16;
      xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse4_params;
      xnn_params.qc8.dwconv[0].channel_tile = 16;
      xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_9p16c__xop_mul16_add16;
      xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse4_params;
      xnn_params.qc8.dwconv[1].channel_tile = 16;
      xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_25p16c__xop_mul16_add16;
      xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse4_params;
      xnn_params.qc8.dwconv[2].channel_tile = 16;
    } else if (hardware_config->use_x86_avx2) {
      xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_3p16c__avx2_mul32;
      xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_avx2_params;
      xnn_params.qc8.dwconv[0].channel_tile = 16;
      xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul32;
      xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_avx2_params;
      xnn_params.qc8.dwconv[1].channel_tile = 16;
      xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul32;
      xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_avx2_params;
      xnn_params.qc8.dwconv[2].channel_tile = 16;
    } else if (hardware_config->use_x86_avx) {
      xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_3p16c__avx_mul16_add16;
      xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse4_params;
      xnn_params.qc8.dwconv[0].channel_tile = 16;
      xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_9p16c__avx_mul16_add16;
      xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse4_params;
      xnn_params.qc8.dwconv[1].channel_tile = 16;
      xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_25p16c__avx_mul16_add16;
      xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse4_params;
      xnn_params.qc8.dwconv[2].channel_tile = 16;
    } else if (hardware_config->use_x86_sse4_1) {
      xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_3p8c__sse41_mul16;
      xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse4_params;
      xnn_params.qc8.dwconv[0].channel_tile = 8;
      xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul16;
      xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse4_params;
      xnn_params.qc8.dwconv[1].channel_tile = 8;
      xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul16;
      xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse4_params;
      xnn_params.qc8.dwconv[2].channel_tile = 8;
    } else {
      xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_3p8c__sse2_mul16;
      xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse2_params;
      xnn_params.qc8.dwconv[0].channel_tile = 8;
      xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_9p8c__sse2_mul16;
      xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse2_params;
      xnn_params.qc8.dwconv[1].channel_tile = 8;
      xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_25p8c__sse2_mul16;
      xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse2_params;
      xnn_params.qc8.dwconv[2].channel_tile = 8;
    }
    xnn_params.qc8.dwconv[0].primary_tile = 3;
    xnn_params.qc8.dwconv[1].primary_tile = 9;
    xnn_params.qc8.dwconv[2].primary_tile = 25;
  #endif  // XNN_NO_QC8_OPERATORS
3536
3537 /**************************** QS8 x86 micro-kernels ****************************/
3538 #ifndef XNN_NO_QS8_OPERATORS
3539 init_flags |= XNN_INIT_FLAG_QS8;
3540
3541 if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
3542 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx);
3543 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx);
3544 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx);
3545 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx);
3546 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx512_params;
3547 xnn_params.qs8.gemm.mr = 4;
3548 xnn_params.qs8.gemm.nr = 16;
3549 xnn_params.qs8.gemm.log2_kr = 3;
3550 } else if (hardware_config->use_x86_xop) {
3551 // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
3552 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
3553 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
3554 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3555 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3556 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
3557 xnn_params.qs8.gemm.mr = 2;
3558 xnn_params.qs8.gemm.nr = 4;
3559 xnn_params.qs8.gemm.log2_kr = 3;
3560 } else if (hardware_config->use_x86_avx2) {
3561 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_fp32_ukernel_3x8c8__avx2);
3562 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_fp32_ukernel_3x8c8__avx2);
      // MR=1 fallback kernels for the branch selected above (AVX2 8c8 kernels,
      // judging by the names — the branch's opening condition is earlier in the file).
      xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__avx2);
      xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__avx2);
      xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx2_params;
      xnn_params.qs8.gemm.mr = 3;
      xnn_params.qs8.gemm.nr = 8;
      xnn_params.qs8.gemm.log2_kr = 3;
    } else if (hardware_config->use_x86_avx) {
      // AVX: 2x4c8 kernels with 128-bit loads.
      xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
      xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
      xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
      xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
      xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
      xnn_params.qs8.gemm.mr = 2;
      xnn_params.qs8.gemm.nr = 4;
      xnn_params.qs8.gemm.log2_kr = 3;
    } else if (hardware_config->use_x86_sse4_1) {
      // SSE4.1: 3x4c8 kernels with 64-bit loads.
      xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
      xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
      xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
      xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
      xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
      xnn_params.qs8.gemm.mr = 3;
      xnn_params.qs8.gemm.nr = 4;
      xnn_params.qs8.gemm.log2_kr = 3;
    } else {
      // Baseline x86: SSE2 kernels (always available on x86-64).
      xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
      xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
      xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
      xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
      xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse2_params;
      xnn_params.qs8.gemm.mr = 3;
      xnn_params.qs8.gemm.nr = 4;
      xnn_params.qs8.gemm.log2_kr = 3;
    }
3597
    // QS8 depthwise convolution: dwconv[0] is the 9-element primary-tile variant,
    // dwconv[1] the 25-element one (primary_tile set after the chain). Each branch
    // picks the widest-ISA unipass kernel with a matching channel tile.
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__avx512skx_mul32;
      xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx512_params;
      xnn_params.qs8.dwconv[0].channel_tile = 32;
      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__avx512skx_mul32;
      xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx512_params;
      xnn_params.qs8.dwconv[1].channel_tile = 32;
    } else if (hardware_config->use_x86_xop) {
      // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__xop_mul16_add16;
      xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
      xnn_params.qs8.dwconv[0].channel_tile = 16;
      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__xop_mul16_add16;
      xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
      xnn_params.qs8.dwconv[1].channel_tile = 16;
    } else if (hardware_config->use_x86_avx2) {
      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul32;
      xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx2_params;
      xnn_params.qs8.dwconv[0].channel_tile = 16;
      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul32;
      xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx2_params;
      xnn_params.qs8.dwconv[1].channel_tile = 16;
    } else if (hardware_config->use_x86_avx) {
      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx_mul16_add16;
      xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
      xnn_params.qs8.dwconv[0].channel_tile = 16;
      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx_mul16_add16;
      xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
      xnn_params.qs8.dwconv[1].channel_tile = 16;
    } else if (hardware_config->use_x86_sse4_1) {
      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul16_add16;
      xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
      xnn_params.qs8.dwconv[0].channel_tile = 8;
      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul16_add16;
      xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
      xnn_params.qs8.dwconv[1].channel_tile = 8;
    } else {
      // SSE2 baseline.
      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__sse2_mul16_add16;
      xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse2_params;
      xnn_params.qs8.dwconv[0].channel_tile = 8;
      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__sse2_mul16_add16;
      xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse2_params;
      xnn_params.qs8.dwconv[1].channel_tile = 8;
    }
    // Primary tiles are ISA-independent and set once for both variants.
    xnn_params.qs8.dwconv[0].primary_tile = 9;
    xnn_params.qs8.dwconv[1].primary_tile = 25;
3644
    // QS8 global average pooling: SSE4.1 kernels when available, otherwise SSE2.
    // Both use a 7-row tile and 8-channel tile.
    if (hardware_config->use_x86_sse4_1) {
      xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
        .unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8,
        .multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8,
        .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_sse4_params,
        .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_sse4_params,
        .row_tile = 7,
        .channel_tile = 8,
      };
    } else {
      xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
        .unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8,
        .multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8,
        .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_sse2_params,
        .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_sse2_params,
        .row_tile = 7,
        .channel_tile = 8,
      };
    }
3664
3665 if (hardware_config->use_x86_avx2) {
3666 xnn_params.qs8.lrelu = (struct vunary_parameters) {
3667 .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vlrelu_ukernel__avx2_x32,
3668 .init.qs8_lrelu = xnn_init_qs8_lrelu_avx2_params,
3669 .element_tile = 32,
3670 };
3671 } else if (hardware_config->use_x86_avx) {
3672 xnn_params.qs8.lrelu = (struct vunary_parameters) {
3673 .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vlrelu_ukernel__avx_x32,
3674 .init.qs8_lrelu = xnn_init_qs8_lrelu_avx_params,
3675 .element_tile = 32,
3676 };
3677 } else if (hardware_config->use_x86_sse4_1) {
3678 xnn_params.qs8.lrelu = (struct vunary_parameters) {
3679 .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vlrelu_ukernel__sse41_x32,
3680 .init.qs8_lrelu = xnn_init_qs8_lrelu_sse2_params,
3681 .element_tile = 32,
3682 };
3683 } else if (hardware_config->use_x86_sse4_1) {
3684 xnn_params.qs8.lrelu = (struct vunary_parameters) {
3685 .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vlrelu_ukernel__ssse3_x32,
3686 .init.qs8_lrelu = xnn_init_qs8_lrelu_sse2_params,
3687 .element_tile = 32,
3688 };
3689 } else {
3690 xnn_params.qs8.lrelu = (struct vunary_parameters) {
3691 .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vlrelu_ukernel__sse2_x32,
3692 .init.qs8_lrelu = xnn_init_qs8_lrelu_sse2_params,
3693 .element_tile = 32,
3694 };
3695 }
3696 #endif // XNN_NO_QS8_OPERATORS
3697
3698 /**************************** QU8 x86 micro-kernels ****************************/
3699 #ifndef XNN_NO_QU8_OPERATORS
3700 init_flags |= XNN_INIT_FLAG_QU8;
3701
    // QU8 GEMM/IGEMM microkernel selection, widest ISA first; each branch sets the
    // general (MR>1) kernel, the MR=1 fallback, the matching init routine, and the
    // mr/nr/log2_kr tile geometry that the operator setup code reads back.
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
      xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx);
      xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx);
      xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx);
      xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx);
      xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx512_params;
      xnn_params.qu8.gemm.mr = 4;
      xnn_params.qu8.gemm.nr = 16;
      xnn_params.qu8.gemm.log2_kr = 3;
    } else if (hardware_config->use_x86_xop) {
      // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
      xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
      xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
      xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
      xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
      xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
      xnn_params.qu8.gemm.mr = 2;
      xnn_params.qu8.gemm.nr = 4;
      xnn_params.qu8.gemm.log2_kr = 3;
    } else if (hardware_config->use_x86_avx2) {
      xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_3x8c8__avx2);
      xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_fp32_ukernel_3x8c8__avx2);
      xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2);
      xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_fp32_ukernel_1x8c8__avx2);
      xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx2_params;
      xnn_params.qu8.gemm.mr = 3;
      xnn_params.qu8.gemm.nr = 8;
      xnn_params.qu8.gemm.log2_kr = 3;
    } else if (hardware_config->use_x86_avx) {
      xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
      xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
      xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
      xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
      xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
      xnn_params.qu8.gemm.mr = 2;
      xnn_params.qu8.gemm.nr = 4;
      xnn_params.qu8.gemm.log2_kr = 3;
    } else if (hardware_config->use_x86_sse4_1) {
      xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
      xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
      xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
      xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
      xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
      xnn_params.qu8.gemm.mr = 3;
      xnn_params.qu8.gemm.nr = 4;
      xnn_params.qu8.gemm.log2_kr = 3;
    } else {
      // SSE2 baseline.
      xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
      xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
      xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
      xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
      xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
      xnn_params.qu8.gemm.mr = 3;
      xnn_params.qu8.gemm.nr = 4;
      xnn_params.qu8.gemm.log2_kr = 3;
    }
3758
    // QU8 depthwise convolution: same structure as the QS8 selection above —
    // dwconv[0] is the 9-element primary tile, dwconv[1] the 25-element one.
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
      xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qu8_dwconv_minmax_fp32_ukernel_9p32c__avx512skx_mul32;
      xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx512_params;
      xnn_params.qu8.dwconv[0].channel_tile = 32;
      xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qu8_dwconv_minmax_fp32_ukernel_25p32c__avx512skx_mul32;
      xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx512_params;
      xnn_params.qu8.dwconv[1].channel_tile = 32;
    } else if (hardware_config->use_x86_xop) {
      // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
      xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__xop_mul32;
      xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
      xnn_params.qu8.dwconv[0].channel_tile = 16;
      xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__xop_mul32;
      xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
      xnn_params.qu8.dwconv[1].channel_tile = 16;
    } else if (hardware_config->use_x86_avx2) {
      xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul32;
      xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx2_params;
      xnn_params.qu8.dwconv[0].channel_tile = 16;
      xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul32;
      xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx2_params;
      xnn_params.qu8.dwconv[1].channel_tile = 16;
    } else if (hardware_config->use_x86_avx) {
      xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__avx_mul16;
      xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
      xnn_params.qu8.dwconv[0].channel_tile = 16;
      xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__avx_mul16;
      xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
      xnn_params.qu8.dwconv[1].channel_tile = 16;
    } else if (hardware_config->use_x86_sse4_1) {
      xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul16;
      xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
      xnn_params.qu8.dwconv[0].channel_tile = 8;
      xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul16;
      xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
      xnn_params.qu8.dwconv[1].channel_tile = 8;
    } else {
      xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__sse2_mul16;
      xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
      xnn_params.qu8.dwconv[0].channel_tile = 8;
      xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__sse2_mul16;
      xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
      xnn_params.qu8.dwconv[1].channel_tile = 8;
    }
    xnn_params.qu8.dwconv[0].primary_tile = 9;
    xnn_params.qu8.dwconv[1].primary_tile = 25;
3805
    // QU8 average pooling: only an SSE2 variant exists, so no ISA dispatch here.
    xnn_params.qu8.avgpool = (struct avgpool_parameters) {
      .unipass = (xnn_avgpool_unipass_ukernel_fn) xnn_qu8_avgpool_minmax_fp32_ukernel_9x__sse2_c8,
      .multipass = (xnn_avgpool_multipass_ukernel_fn) xnn_qu8_avgpool_minmax_fp32_ukernel_9p8x__sse2_c8,
      .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_sse2_params,
      .primary_tile = 9,
      .incremental_tile = 8,
      .channel_tile = 8,
    };
    // QU8 global average pooling: SSE4.1 when available, SSE2 otherwise.
    if (hardware_config->use_x86_sse4_1) {
      xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
        .unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8,
        .multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8,
        .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_sse4_params,
        .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_sse4_params,
        .row_tile = 7,
        .channel_tile = 8,
      };
    } else {
      xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
        .unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8,
        .multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8,
        .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_sse2_params,
        .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_sse2_params,
        .row_tile = 7,
        .channel_tile = 8,
      };
    }
3833
3834 if (hardware_config->use_x86_avx2) {
3835 xnn_params.qu8.lrelu = (struct vunary_parameters) {
3836 .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vlrelu_ukernel__avx2_x32,
3837 .init.qu8_lrelu = xnn_init_qu8_lrelu_avx2_params,
3838 .element_tile = 32,
3839 };
3840 } else if (hardware_config->use_x86_avx) {
3841 xnn_params.qu8.lrelu = (struct vunary_parameters) {
3842 .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vlrelu_ukernel__avx_x32,
3843 .init.qu8_lrelu = xnn_init_qu8_lrelu_avx_params,
3844 .element_tile = 32,
3845 };
3846 } else if (hardware_config->use_x86_sse4_1) {
3847 xnn_params.qu8.lrelu = (struct vunary_parameters) {
3848 .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vlrelu_ukernel__sse41_x32,
3849 .init.qu8_lrelu = xnn_init_qu8_lrelu_sse2_params,
3850 .element_tile = 32,
3851 };
3852 } else if (hardware_config->use_x86_sse4_1) {
3853 xnn_params.qu8.lrelu = (struct vunary_parameters) {
3854 .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vlrelu_ukernel__ssse3_x32,
3855 .init.qu8_lrelu = xnn_init_qu8_lrelu_sse2_params,
3856 .element_tile = 32,
3857 };
3858 } else {
3859 xnn_params.qu8.lrelu = (struct vunary_parameters) {
3860 .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vlrelu_ukernel__sse2_x32,
3861 .init.qu8_lrelu = xnn_init_qu8_lrelu_sse2_params,
3862 .element_tile = 32,
3863 };
3864 }
3865 #endif // XNN_NO_QU8_OPERATORS
3866
3867 /**************************** U8 x86 micro-kernels ****************************/
3868 #ifndef XNN_NO_S8_OPERATORS
3869 init_flags |= XNN_INIT_FLAG_S8;
3870
    // S8 clamp / bilinear-resize / maxpool: SSE4.1 variants when available,
    // otherwise the SSE2 baseline (with a narrower 8-channel ibilinear tile).
    if (hardware_config->use_x86_sse4_1) {
      xnn_params.s8.clamp = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_s8_vclamp_ukernel__sse41_x64,
        .init.s8_minmax = xnn_init_s8_minmax_sse4_params,
        .element_tile = 64,
      };
      xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
        .ukernel = (xnn_ibilinear_ukernel_fn) xnn_s8_ibilinear_ukernel__sse41_c16,
        .pixel_tile = 1,
        .channel_tile = 16,
      };
      xnn_params.s8.maxpool = (struct maxpool_parameters) {
        .ukernel = (xnn_maxpool_ukernel_fn) xnn_s8_maxpool_minmax_ukernel_9p8x__sse41_c16,
        .init.s8 = xnn_init_s8_minmax_sse4_params,
        .mr = 9,
        .qr = 8,
      };
    } else {
      xnn_params.s8.clamp = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_s8_vclamp_ukernel__sse2_x64,
        .init.s8_minmax = xnn_init_s8_minmax_sse2_params,
        .element_tile = 64,
      };
      xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
        .ukernel = (xnn_ibilinear_ukernel_fn) xnn_s8_ibilinear_ukernel__sse2_c8,
        .pixel_tile = 1,
        .channel_tile = 8,
      };
      xnn_params.s8.maxpool = (struct maxpool_parameters) {
        .ukernel = (xnn_maxpool_ukernel_fn) xnn_s8_maxpool_minmax_ukernel_9p8x__sse2_c16,
        .init.s8 = xnn_init_s8_minmax_sse2_params,
        .mr = 9,
        .qr = 8,
      };
    }
3906 #endif // XNN_NO_S8_OPERATORS
3907
3908 /**************************** U8 x86 micro-kernels ****************************/
3909 #ifndef XNN_NO_U8_OPERATORS
3910 init_flags |= XNN_INIT_FLAG_U8;
3911
    // U8 operators: clamp and maxpool only have SSE2 variants; ibilinear
    // dispatches on SSE4.1; lut32norm is scalar-only on x86.
    xnn_params.u8.clamp = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_fn) xnn_u8_vclamp_ukernel__sse2_x64,
      .init.u8_minmax = xnn_init_u8_minmax_sse2_params,
      .element_tile = 64,
    };
    if (hardware_config->use_x86_sse4_1) {
      xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
        .ukernel = (xnn_ibilinear_ukernel_fn) xnn_u8_ibilinear_ukernel__sse41_c16,
        .pixel_tile = 1,
        .channel_tile = 16,
      };
    } else {
      xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
        .ukernel = (xnn_ibilinear_ukernel_fn) xnn_u8_ibilinear_ukernel__sse2_c8,
        .pixel_tile = 1,
        .channel_tile = 8,
      };
    }
    xnn_params.u8.maxpool = (struct maxpool_parameters) {
      .ukernel = (xnn_maxpool_ukernel_fn) xnn_u8_maxpool_minmax_ukernel_9p8x__sse2_c16,
      .init.u8 = xnn_init_u8_minmax_sse2_params,
      .mr = 9,
      .qr = 8,
    };
    xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
    xnn_params.u8.rmax = xnn_u8_rmax_ukernel__sse2;
3938 #endif // XNN_NO_U8_OPERATORS
3939
3940 /**************************** X8 x86 micro-kernels ****************************/
3941 #ifndef XNN_NO_X8_OPERATORS
3942 init_flags |= XNN_INIT_FLAG_X8;
3943
    // X8 channel-interleave (zip) kernels: fixed SSE2 variants for 2/3/4-way
    // and the variable-width (xm) case.
    xnn_params.x8.zip = (struct zip_parameters) {
      .x2 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x2_ukernel__sse2,
      .x3 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x3_ukernel__sse2,
      .x4 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x4_ukernel__sse2,
      .xm = (xnn_zipv_ukernel_fn) xnn_x8_zip_xm_ukernel__sse2,
    };
3950 #endif // XNN_NO_X8_OPERATORS
3951
3952 /**************************** F16 x86 micro-kernels ****************************/
3953 #ifndef XNN_NO_F16_OPERATORS
    // F16 operators are only enabled on non-mobile x86 with AVX2 (the kernels
    // below use F16C/FMA3/AVX2); without it the F16 init flag stays clear.
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx2) {
      init_flags |= XNN_INIT_FLAG_F16;

      // GEMM/IGEMM: 4x16 AVX2 broadcast kernels plus MR=1 fallbacks.
      xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast);
      xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast);
      xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast);
      xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast);
      xnn_params.f16.gemm.init.f16 = xnn_init_f16_minmax_avx_params;
      xnn_params.f16.gemm.mr = 4;
      xnn_params.f16.gemm.nr = 16;

      // Depthwise convolution: FMA3 unipass kernels for primary tiles 3/4/9/25.
      xnn_params.f16.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f16_dwconv_minmax_ukernel_3p16c__fma3;
      xnn_params.f16.dwconv[0].init.f16 = xnn_init_f16_minmax_avx_params;
      xnn_params.f16.dwconv[0].channel_tile = 16;
      xnn_params.f16.dwconv[0].primary_tile = 3;

      xnn_params.f16.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f16_dwconv_minmax_ukernel_4p16c__fma3;
      xnn_params.f16.dwconv[1].init.f16 = xnn_init_f16_minmax_avx_params;
      xnn_params.f16.dwconv[1].channel_tile = 16;
      xnn_params.f16.dwconv[1].primary_tile = 4;

      xnn_params.f16.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f16_dwconv_minmax_ukernel_9p16c__fma3;
      xnn_params.f16.dwconv[2].init.f16 = xnn_init_f16_minmax_avx_params;
      xnn_params.f16.dwconv[2].channel_tile = 16;
      xnn_params.f16.dwconv[2].primary_tile = 9;

      xnn_params.f16.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f16_dwconv_minmax_ukernel_25p8c__fma3_acc2;
      xnn_params.f16.dwconv[3].init.f16 = xnn_init_f16_minmax_avx_params;
      xnn_params.f16.dwconv[3].channel_tile = 8;
      xnn_params.f16.dwconv[3].primary_tile = 25;

      // Pooling family: average, pixelwise-average, and global-average.
      xnn_params.f16.avgpool = (struct avgpool_parameters) {
        .unipass = (xnn_avgpool_unipass_ukernel_fn) xnn_f16_avgpool_minmax_ukernel_9x__f16c_c8,
        .multipass = (xnn_avgpool_multipass_ukernel_fn) xnn_f16_avgpool_minmax_ukernel_9p8x__f16c_c8,
        .init.f16 = xnn_init_f16_scaleminmax_avx_params,
        .primary_tile = 9,
        .incremental_tile = 8,
        .channel_tile = 8,
      };
      xnn_params.f16.pavgpool = (struct pavgpool_parameters) {
        .unipass = (xnn_pavgpool_unipass_ukernel_fn) xnn_f16_pavgpool_minmax_ukernel_9x__avx2_c8,
        .multipass = (xnn_pavgpool_multipass_ukernel_fn) xnn_f16_pavgpool_minmax_ukernel_9p8x__avx2_c8,
        .init.f16 = xnn_init_f16_minmax_avx_params,
        .primary_tile = 9,
        .incremental_tile = 8,
        .channel_tile = 8,
      };
      xnn_params.f16.gavgpool = (struct gavgpool_parameters) {
        .unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8,
        .multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8,
        .init.f16 = xnn_init_f16_scaleminmax_avx_params,
        .update.f16 = xnn_update_f16_scaleminmax_avx_params,
        .row_tile = 7,
        .channel_tile = 8,
      };

      xnn_params.f16.maxpool = (struct maxpool_parameters) {
        .ukernel = (xnn_maxpool_ukernel_fn) xnn_f16_maxpool_minmax_ukernel_9p8x__f16c_c8,
        .init.f16 = xnn_init_f16_minmax_avx_params,
        .mr = 9,
        .qr = 8,
      };
      xnn_params.f16.ibilinear = (struct ibilinear_parameters) {
        .ukernel = (xnn_ibilinear_ukernel_fn) xnn_f16_ibilinear_ukernel__fma3_c8,
        .pixel_tile = 1,
        .channel_tile = 8,
      };

      xnn_params.f16.prelu = (struct prelu_parameters) {
        .ukernel = (xnn_prelu_ukernel_fn) xnn_f16_prelu_ukernel__f16c_2x16,
        .row_tile = 2,
        .channel_tile = 16,
      };

      // Softmax building blocks.
      xnn_params.f16.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
        .ukernel = (xnn_raddstoreexpminusmax_ukernel_fn) xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x40,
        .init.f16 = xnn_init_f16_expminus_avx2_rr1_p2_params,
        .element_tile = 40,
      };
      xnn_params.f16.rmax = (xnn_rmax_ukernel_fn) xnn_f16_rmax_ukernel__f16c;

      xnn_params.f16.vmulcaddc = (struct vmulcaddc_parameters) {
        .ukernel = (xnn_vmulcaddc_ukernel_fn) xnn_f16_vmulcaddc_minmax_ukernel_c8__fma3_2x,
        .init.f16 = xnn_init_f16_minmax_avx_params,
        .channel_tile = 8,
        .row_tile = 2,
      };

      // Elementwise unary operators. abs/neg only flip the sign bit, so plain
      // SSE2 integer kernels suffice even in this AVX2-gated section.
      xnn_params.f16.abs = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vabs_ukernel__sse2_x16,
        .init.f16_abs = xnn_init_f16_abs_sse_params,
        .element_tile = 16,
      };
      xnn_params.f16.clamp = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vclamp_ukernel__f16c_x16,
        .init.f16_minmax = xnn_init_f16_minmax_avx_params,
        .element_tile = 16,
      };
      xnn_params.f16.elu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_velu_ukernel__avx2_rr1_p3_x16,
        .init.f16_elu = xnn_init_f16_elu_avx2_rr1_p3_params,
        .element_tile = 16,
      };
      xnn_params.f16.hswish = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vhswish_ukernel__f16c_x16,
        .init.f16_hswish = xnn_init_f16_hswish_avx_params,
        .element_tile = 16,
      };
      xnn_params.f16.lrelu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vlrelu_ukernel__f16c_x16,
        .init.f16_lrelu = xnn_init_f16_lrelu_avx_params,
        .element_tile = 16,
      };
      xnn_params.f16.neg = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vneg_ukernel__sse2_x16,
        .init.f16_neg = xnn_init_f16_neg_sse_params,
        .element_tile = 16,
      };
      // Rounding variants (nearest-even, toward-zero, up, down) need no init params.
      xnn_params.f16.rndne = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vrndne_ukernel__f16c_x16,
        .element_tile = 16,
      };
      xnn_params.f16.rndz = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vrndz_ukernel__f16c_x16,
        .element_tile = 16,
      };
      xnn_params.f16.rndu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vrndu_ukernel__f16c_x16,
        .element_tile = 16,
      };
      xnn_params.f16.rndd = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vrndd_ukernel__f16c_x16,
        .element_tile = 16,
      };
      xnn_params.f16.sigmoid = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_x32,
        .init.f16_sigmoid = xnn_init_f16_sigmoid_avx2_rr1_p2_params,
        .element_tile = 32,
      };
      xnn_params.f16.sqr = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vsqr_ukernel__f16c_x16,
        .element_tile = 16,
      };
      xnn_params.f16.sqrt = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vsqrt_ukernel__f16c_sqrt_x8,
        .element_tile = 8,
      };
    }
4102 #endif // XNN_NO_F16_OPERATORS
4103
4104 /**************************** F32 x86 micro-kernels ****************************/
4105 #ifndef XNN_NO_F32_OPERATORS
4106 init_flags |= XNN_INIT_FLAG_F32;
4107
4108 if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
4109 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast);
4110 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_7x16__avx512f_broadcast);
4111 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast);
4112 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast);
4113 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
4114 xnn_params.f32.gemm.mr = 7;
4115 xnn_params.f32.gemm.nr = 16;
4116 } else if (hardware_config->use_x86_fma3) {
4117 switch (cpuinfo_get_core(0)->uarch) {
4118 case cpuinfo_uarch_zen:
4119 case cpuinfo_uarch_dhyana:
4120 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast);
4121 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast);
4122 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast);
4123 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast);
4124 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_avx_params;
4125 xnn_params.f32.gemm.mr = 4;
4126 xnn_params.f32.gemm.nr = 16;
4127 xnn_params.f32.gemm.log2_sr = 2;
4128 break;
4129 default:
4130 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_5x16__fma3_broadcast);
4131 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_5x16__fma3_broadcast);
4132 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast);
4133 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast);
4134 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_avx_params;
4135 xnn_params.f32.gemm.mr = 5;
4136 xnn_params.f32.gemm.nr = 16;
4137 break;
4138 }
4139 } else if (hardware_config->use_x86_avx) {
4140 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast);
4141 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_5x16__avx_broadcast);
4142 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast);
4143 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast);
4144 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_avx_params;
4145 xnn_params.f32.gemm.mr = 5;
4146 xnn_params.f32.gemm.nr = 16;
4147 } else {
4148 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_4x8__sse_load1);
4149 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_4x8__sse_load1);
4150 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x8__sse_load1);
4151 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x8__sse_load1);
4152 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_sse_params;
4153 xnn_params.f32.gemm.mr = 4;
4154 xnn_params.f32.gemm.nr = 8;
4155 }
    // Secondary f32 GEMM with NR=2: same SSE 4x2c4 micro-kernels for every x86
    // ISA level (no wider-ISA variants are registered here). KR=4, expressed as
    // log2_kr=2; MR=4 slot plus no separate MR=1 fallback.
    xnn_params.f32.gemm2.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_4x2c4__sse);
    xnn_params.f32.gemm2.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_4x2c4__sse);
    xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_sse_params;
    xnn_params.f32.gemm2.mr = 4;
    xnn_params.f32.gemm2.nr = 2;
    xnn_params.f32.gemm2.log2_kr = 2;
4162
    // f32 depthwise convolution: unipass micro-kernels for 3-, 4-, 9- and
    // 25-tap filters (slots 0..3), dispatched on the best available ISA
    // (AVX512F > FMA3 > AVX > SSE).
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
      xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_3p16c__avx512f;
      xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
      xnn_params.f32.dwconv[0].channel_tile = 16;
      xnn_params.f32.dwconv[0].primary_tile = 3;

      xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_4p16c__avx512f;
      xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
      xnn_params.f32.dwconv[1].channel_tile = 16;
      xnn_params.f32.dwconv[1].primary_tile = 4;

      xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_9p16c__avx512f;
      xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
      xnn_params.f32.dwconv[2].channel_tile = 16;
      xnn_params.f32.dwconv[2].primary_tile = 9;

      xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_25p16c__avx512f;
      xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
      xnn_params.f32.dwconv[3].channel_tile = 16;
      xnn_params.f32.dwconv[3].primary_tile = 25;
    } else if (hardware_config->use_x86_fma3) {
      xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_3p16c__fma3;
      xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_avx_params;
      xnn_params.f32.dwconv[0].channel_tile = 16;
      xnn_params.f32.dwconv[0].primary_tile = 3;

      xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_4p16c__fma3;
      xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_avx_params;
      xnn_params.f32.dwconv[1].channel_tile = 16;
      xnn_params.f32.dwconv[1].primary_tile = 4;

      xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_9p16c__fma3;
      xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_avx_params;
      xnn_params.f32.dwconv[2].channel_tile = 16;
      xnn_params.f32.dwconv[2].primary_tile = 9;

      // 25-tap FMA3 kernel uses channel_tile 8, narrower than the 16-wide
      // 3/4/9-tap kernels above. NOTE(review): presumably register pressure
      // with 25 taps in flight — confirm against kernel benchmarks.
      xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_25p8c__fma3;
      xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_avx_params;
      xnn_params.f32.dwconv[3].channel_tile = 8;
      xnn_params.f32.dwconv[3].primary_tile = 25;
    } else if (hardware_config->use_x86_avx) {
      xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_3p16c__avx;
      xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_avx_params;
      xnn_params.f32.dwconv[0].channel_tile = 16;
      xnn_params.f32.dwconv[0].primary_tile = 3;

      xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_4p16c__avx;
      xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_avx_params;
      xnn_params.f32.dwconv[1].channel_tile = 16;
      xnn_params.f32.dwconv[1].primary_tile = 4;

      xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_9p16c__avx;
      xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_avx_params;
      xnn_params.f32.dwconv[2].channel_tile = 16;
      xnn_params.f32.dwconv[2].primary_tile = 9;

      xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_25p8c__avx;
      xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_avx_params;
      xnn_params.f32.dwconv[3].channel_tile = 8;
      xnn_params.f32.dwconv[3].primary_tile = 25;
    } else {
      // Baseline SSE: all tiles 8 channels wide.
      xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_3p8c__sse;
      xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_sse_params;
      xnn_params.f32.dwconv[0].channel_tile = 8;
      xnn_params.f32.dwconv[0].primary_tile = 3;

      xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_4p8c__sse;
      xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_sse_params;
      xnn_params.f32.dwconv[1].channel_tile = 8;
      xnn_params.f32.dwconv[1].primary_tile = 4;

      xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_9p8c__sse;
      xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_sse_params;
      xnn_params.f32.dwconv[2].channel_tile = 8;
      xnn_params.f32.dwconv[2].primary_tile = 9;

      xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_25p8c__sse;
      xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_sse_params;
      xnn_params.f32.dwconv[3].channel_tile = 8;
      xnn_params.f32.dwconv[3].primary_tile = 25;
    }
    // f32 pooling and bilinear-interpolation tables: SSE/SSE2 kernels only —
    // no wider-ISA dispatch in this section.
    // Average pooling: unipass handles up to 9 elements, multipass adds 8 per
    // extra pass, 4 channels per iteration.
    xnn_params.f32.avgpool = (struct avgpool_parameters) {
      .unipass = (xnn_avgpool_unipass_ukernel_fn) xnn_f32_avgpool_minmax_ukernel_9x__sse_c4,
      .multipass = (xnn_avgpool_multipass_ukernel_fn) xnn_f32_avgpool_minmax_ukernel_9p8x__sse_c4,
      .init.f32 = xnn_init_f32_scaleminmax_sse_params,
      .primary_tile = 9,
      .incremental_tile = 8,
      .channel_tile = 4,
    };
    // Pixelwise average pooling (per-pixel scale), same tiling as avgpool.
    xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
      .unipass = (xnn_pavgpool_unipass_ukernel_fn) xnn_f32_pavgpool_minmax_ukernel_9x__sse_c4,
      .multipass = (xnn_pavgpool_multipass_ukernel_fn) xnn_f32_pavgpool_minmax_ukernel_9p8x__sse_c4,
      .init.f32 = xnn_init_f32_minmax_sse_params,
      .primary_tile = 9,
      .incremental_tile = 8,
      .channel_tile = 4,
    };
    // Global average pooling: 7 rows per pass; .update re-parameterizes the
    // scale between passes.
    xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
      .unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4,
      .multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4,
      .init.f32 = xnn_init_f32_scaleminmax_sse_params,
      .update.f32 = xnn_update_f32_scaleminmax_sse_params,
      .row_tile = 7,
      .channel_tile = 4,
    };
    // Max pooling: mr=9 primary elements, qr=8 per incremental pass.
    xnn_params.f32.maxpool = (struct maxpool_parameters) {
      .ukernel = (xnn_maxpool_ukernel_fn) xnn_f32_maxpool_minmax_ukernel_9p8x__sse_c4,
      .init.f32 = xnn_init_f32_minmax_sse_params,
      .mr = 9,
      .qr = 8,
    };
    // Argmax pooling: slots 0/1 are unipass (.up) for pools of <=4 and <=9
    // elements; slot 2 is the multipass (.mp) fallback for larger pools.
    xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
      .up = (xnn_argmaxpool_unipass_ukernel_fn) xnn_f32_argmaxpool_ukernel_4x__sse2_c4,
      .mr = 4,
    };
    xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
      .up = (xnn_argmaxpool_unipass_ukernel_fn) xnn_f32_argmaxpool_ukernel_9x__sse2_c4,
      .mr = 9,
    };
    xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
      .mp = (xnn_argmaxpool_multipass_ukernel_fn) xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4,
      .mr = 9,
      .qr = 8,
    };
    // Indirect bilinear interpolation (NHWC): one pixel, 8 channels at a time.
    xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
      .ukernel = (xnn_ibilinear_ukernel_fn) xnn_f32_ibilinear_ukernel__sse_c8,
      .pixel_tile = 1,
      .channel_tile = 8,
    };
    // f32 element-wise |x|: AVX512F > AVX > SSE dispatch.
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
      xnn_params.f32.abs = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vabs_ukernel__avx512f_x16,
        .init.f32_abs = xnn_init_f32_abs_avx512_params,
        .element_tile = 16,
      };
    } else if (hardware_config->use_x86_avx) {
      xnn_params.f32.abs = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vabs_ukernel__avx_x16,
        .init.f32_abs = xnn_init_f32_abs_avx_params,
        .element_tile = 16,
      };
    } else {
      xnn_params.f32.abs = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vabs_ukernel__sse_x8,
        .init.f32_abs = xnn_init_f32_abs_sse_params,
        .element_tile = 8,
      };
    }
    // f32 clamp (min/max saturation): AVX512F > AVX > SSE dispatch. Note the
    // AVX512F branch uses the scalar min/max parameter initializer.
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
      xnn_params.f32.clamp = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vclamp_ukernel__avx512f_x16,
        .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
        .element_tile = 16,
      };
    } else if (hardware_config->use_x86_avx) {
      xnn_params.f32.clamp = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vclamp_ukernel__avx_x16,
        .init.f32_minmax = xnn_init_f32_minmax_avx_params,
        .element_tile = 16,
      };
    } else {
      xnn_params.f32.clamp = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vclamp_ukernel__sse_x8,
        .init.f32_minmax = xnn_init_f32_minmax_sse_params,
        .element_tile = 8,
      };
    }
    // f32 ELU: AVX512F > AVX2 > AVX > SSE2 dispatch; kernel suffixes encode
    // the polynomial/LUT approximation scheme matching each .init function.
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
      xnn_params.f32.elu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x64,
        .init.f32_elu = xnn_init_f32_elu_avx512_rr1_lut16_p3_params,
        .element_tile = 64,
      };
    } else if (hardware_config->use_x86_avx2) {
      xnn_params.f32.elu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x56,
        .init.f32_elu = xnn_init_f32_elu_avx2_rr1_lut4_p4_params,
        .element_tile = 56,
      };
    } else if (hardware_config->use_x86_avx) {
      xnn_params.f32.elu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x32,
        .init.f32_elu = xnn_init_f32_elu_avx_rr2_lut4_p4_params,
        .element_tile = 32,
      };
    } else {
      xnn_params.f32.elu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_velu_ukernel__sse2_rr2_lut16_p3_x12,
        .init.f32_elu = xnn_init_f32_elu_sse2_rr2_lut16_p3_params,
        .element_tile = 12,
      };
    }
    // f32 hard-swish: AVX512F > FMA3 > AVX > SSE dispatch; FMA3 and AVX
    // branches share the AVX parameter initializer.
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
      xnn_params.f32.hswish = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vhswish_ukernel__avx512f_x16,
        .init.f32_hswish = xnn_init_f32_hswish_avx512_params,
        .element_tile = 16,
      };
    } else if (hardware_config->use_x86_fma3) {
      xnn_params.f32.hswish = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vhswish_ukernel__fma3_x16,
        .init.f32_hswish = xnn_init_f32_hswish_avx_params,
        .element_tile = 16,
      };
    } else if (hardware_config->use_x86_avx) {
      xnn_params.f32.hswish = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vhswish_ukernel__avx_x16,
        .init.f32_hswish = xnn_init_f32_hswish_avx_params,
        .element_tile = 16,
      };
    } else {
      xnn_params.f32.hswish = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vhswish_ukernel__sse_x8,
        .init.f32_hswish = xnn_init_f32_hswish_sse_params,
        .element_tile = 8,
      };
    }
    // f32 leaky ReLU: AVX512F > AVX > SSE4.1 > SSE dispatch. The AVX512F
    // branch uses the scalar parameter initializer; both SSE-era branches
    // share the SSE initializer.
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
      xnn_params.f32.lrelu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vlrelu_ukernel__avx512f_x16,
        .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
        .element_tile = 16,
      };
    } else if (hardware_config->use_x86_avx) {
      xnn_params.f32.lrelu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vlrelu_ukernel__avx_x16,
        .init.f32_lrelu = xnn_init_f32_lrelu_avx_params,
        .element_tile = 16,
      };
    } else if (hardware_config->use_x86_sse4_1) {
      xnn_params.f32.lrelu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vlrelu_ukernel__sse41_x8,
        .init.f32_lrelu = xnn_init_f32_lrelu_sse_params,
        .element_tile = 8,
      };
    } else {
      xnn_params.f32.lrelu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vlrelu_ukernel__sse_x8,
        .init.f32_lrelu = xnn_init_f32_lrelu_sse_params,
        .element_tile = 8,
      };
    }
    // f32 negation: AVX512F > AVX > SSE dispatch.
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
      xnn_params.f32.neg = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vneg_ukernel__avx512f_x16,
        .init.f32_neg = xnn_init_f32_neg_avx512_params,
        .element_tile = 16,
      };
    } else if (hardware_config->use_x86_avx) {
      xnn_params.f32.neg = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vneg_ukernel__avx_x16,
        .init.f32_neg = xnn_init_f32_neg_avx_params,
        .element_tile = 16,
      };
    } else {
      xnn_params.f32.neg = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vneg_ukernel__sse_x8,
        .init.f32_neg = xnn_init_f32_neg_sse_params,
        .element_tile = 8,
      };
    }
    // f32 rounding family (rndne/rndz/rndu/rndd = to-nearest-even, toward
    // zero, up, down): AVX512F > AVX > SSE4.1 > SSE2 dispatch. The AVX512F
    // and SSE4.1 branches set no .init (those kernels are registered without
    // a parameter initializer here); AVX and SSE2 branches do.
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
      xnn_params.f32.rndne = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndne_ukernel__avx512f_x16,
        .element_tile = 16,
      };
      xnn_params.f32.rndz = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndz_ukernel__avx512f_x16,
        .element_tile = 16,
      };
      xnn_params.f32.rndu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndu_ukernel__avx512f_x16,
        .element_tile = 16,
      };
      xnn_params.f32.rndd = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndd_ukernel__avx512f_x16,
        .element_tile = 16,
      };
    } else if (hardware_config->use_x86_avx) {
      xnn_params.f32.rndne = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndne_ukernel__avx_x16,
        .init.f32_rnd = xnn_init_f32_rnd_avx_params,
        .element_tile = 16,
      };
      xnn_params.f32.rndz = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndz_ukernel__avx_x16,
        .init.f32_rnd = xnn_init_f32_rnd_avx_params,
        .element_tile = 16,
      };
      xnn_params.f32.rndu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndu_ukernel__avx_x16,
        .init.f32_rnd = xnn_init_f32_rnd_avx_params,
        .element_tile = 16,
      };
      xnn_params.f32.rndd = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndd_ukernel__avx_x16,
        .init.f32_rnd = xnn_init_f32_rnd_avx_params,
        .element_tile = 16,
      };
    } else if (hardware_config->use_x86_sse4_1) {
      xnn_params.f32.rndne = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndne_ukernel__sse41_x8,
        .element_tile = 8,
      };
      xnn_params.f32.rndz = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndz_ukernel__sse41_x8,
        .element_tile = 8,
      };
      xnn_params.f32.rndu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndu_ukernel__sse41_x8,
        .element_tile = 8,
      };
      xnn_params.f32.rndd = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndd_ukernel__sse41_x8,
        .element_tile = 8,
      };
    } else {
      xnn_params.f32.rndne = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndne_ukernel__sse2_x8,
        .init.f32_rnd = xnn_init_f32_rnd_sse2_params,
        .element_tile = 8,
      };
      xnn_params.f32.rndz = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndz_ukernel__sse2_x8,
        .init.f32_rnd = xnn_init_f32_rnd_sse2_params,
        .element_tile = 8,
      };
      xnn_params.f32.rndu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndu_ukernel__sse2_x8,
        .init.f32_rnd = xnn_init_f32_rnd_sse2_params,
        .element_tile = 8,
      };
      xnn_params.f32.rndd = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndd_ukernel__sse2_x8,
        .init.f32_rnd = xnn_init_f32_rnd_sse2_params,
        .element_tile = 8,
      };
    }
    // f32 sigmoid: AVX512F > AVX2 > AVX > SSE4.1 > SSE2 dispatch. The SSE4.1
    // kernel reuses the SSE2 lut64_p2 parameter initializer (same
    // approximation tables).
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
      xnn_params.f32.sigmoid = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_div_x64,
        .init.f32_sigmoid = xnn_init_f32_sigmoid_avx512_rr2_lut32_p2_params,
        .element_tile = 64,
      };
    } else if (hardware_config->use_x86_avx2) {
      xnn_params.f32.sigmoid = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_div_x40,
        .init.f32_sigmoid = xnn_init_f32_sigmoid_avx2_rr1_p5_params,
        .element_tile = 40,
      };
    } else if (hardware_config->use_x86_avx) {
      xnn_params.f32.sigmoid = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsigmoid_ukernel__avx_rr2_p5_nr2_x40,
        .init.f32_sigmoid = xnn_init_f32_sigmoid_avx_rr2_p5_params,
        .element_tile = 40,
      };
    } else if (hardware_config->use_x86_sse4_1) {
      xnn_params.f32.sigmoid = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsigmoid_ukernel__sse41_rr2_lut64_p2_div_x8,
        .init.f32_sigmoid = xnn_init_f32_sigmoid_sse2_rr2_lut64_p2_params,
        .element_tile = 8,
      };
    } else {
      xnn_params.f32.sigmoid = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsigmoid_ukernel__sse2_rr2_lut64_p2_div_x8,
        .init.f32_sigmoid = xnn_init_f32_sigmoid_sse2_rr2_lut64_p2_params,
        .element_tile = 8,
      };
    }
    // f32 square (x*x): AVX512F > AVX > SSE dispatch. Only the AVX kernel
    // registers a parameter initializer here.
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
      xnn_params.f32.sqr = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsqr_ukernel__avx512f_x16,
        .element_tile = 16,
      };
    } else if (hardware_config->use_x86_avx) {
      xnn_params.f32.sqr = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsqr_ukernel__avx_x16,
        .init.f32_default = xnn_init_f32_default_avx_params,
        .element_tile = 16,
      };
    } else {
      xnn_params.f32.sqr = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsqr_ukernel__sse_x8,
        .element_tile = 8,
      };
    }
    // f32 square root: AVX (with params) or baseline SSE (no params); both
    // use the hardware sqrt instruction variants per the kernel suffixes.
    if (hardware_config->use_x86_avx) {
      xnn_params.f32.sqrt = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsqrt_ukernel__avx_sqrt_x8,
        .init.f32_sqrt = xnn_init_f32_sqrt_avx_params,
        .element_tile = 8,
      };
    } else {
      xnn_params.f32.sqrt = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsqrt_ukernel__sse_sqrt_x4,
        .element_tile = 4,
      };
    }
    // f32 PReLU: AVX512F > AVX > SSE4.1 > SSE2 dispatch; always 2 rows per
    // iteration, 16 channels on AVX-class ISAs and 8 on SSE-class.
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
      xnn_params.f32.prelu = (struct prelu_parameters) {
        .ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__avx512f_2x16,
        .row_tile = 2,
        .channel_tile = 16,
      };
    } else if (hardware_config->use_x86_avx) {
      xnn_params.f32.prelu = (struct prelu_parameters) {
        .ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__avx_2x16,
        .row_tile = 2,
        .channel_tile = 16,
      };
    } else if (hardware_config->use_x86_sse4_1) {
      xnn_params.f32.prelu = (struct prelu_parameters) {
        .ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__sse41_2x8,
        .row_tile = 2,
        .channel_tile = 8,
      };
    } else {
      xnn_params.f32.prelu = (struct prelu_parameters) {
        .ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__sse2_2x8,
        .row_tile = 2,
        .channel_tile = 8,
      };
    }
    // Softmax building blocks and fused multiply-add-clamp, SSE/SSE2 only:
    // raddstoreexpminusmax computes sum(exp(x - max)) while storing the
    // exponentials; rmax is the row-max reduction feeding it.
    xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
      .ukernel = (xnn_raddstoreexpminusmax_ukernel_fn) xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_x20_acc2,
      .init.f32 = xnn_init_f32_expminus_sse2_rr2_p5_params,
      .element_tile = 20,
    };
    xnn_params.f32.rmax = (xnn_rmax_ukernel_fn) xnn_f32_rmax_ukernel__sse;
    // Channel-wise multiply-add with clamping (c4 = 4 channels, 2 rows/pass).
    xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
      .ukernel = (xnn_vmulcaddc_ukernel_fn) xnn_f32_vmulcaddc_minmax_ukernel_c4__sse_2x,
      .init.f32 = xnn_init_f32_minmax_sse_params,
      .channel_tile = 4,
      .row_tile = 2,
    };
    // NCHW (channels-first) operator table: SSE-only sparse and CHW kernels.
    #ifndef XNN_NO_NCHW_OPERATORS
      // Sparse microkernels on x86 currently target only SSE, and on processors
      // with AVX ISA dense inference is expected to be faster than sparse.
      if (!hardware_config->use_x86_avx) {
        init_flags |= XNN_INIT_FLAG_CHW_OPT;
      }

      // Sparse matrix-dense matrix multiplication, 32 rows x 1 column per step.
      xnn_params.f32.spmm = (struct spmm_parameters) {
        .ukernel = (xnn_spmm_ukernel_fn) xnn_f32_spmm_minmax_ukernel_32x1__sse,
        .init.f32 = xnn_init_f32_minmax_sse_params,
        .mr = 32,
        .nr = 1,
      };
      // HWC->CHW input convolution: 3x3 stride-2, 3 input channels,
      // 4 output channels x 2x2 output pixels per step.
      xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
        .ukernel_with_symm_padding =
          (xnn_conv_hwc2chw_ukernel_fn) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2,
        .init.f32 = xnn_init_f32_minmax_sse_params,
        .output_channel_tile = 4,
        .output_height_tile = 2,
        .output_width_tile = 2,
      };
      // CHW depthwise 3x3 stride-1: SSSE3 variant when available, else SSE.
      if (hardware_config->use_x86_ssse3) {
        xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_2x4_acc2,
          .init.f32 = xnn_init_f32_chw_sse_stride1_params,
          .update.f32 = xnn_update_f32_chw_sse_stride1_params,
          .output_height_tile = 2,
          .output_width_tile = 4,
        };
      } else {
        xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_2x4_acc2,
          .init.f32 = xnn_init_f32_chw_sse_stride1_params,
          .update.f32 = xnn_update_f32_chw_sse_stride1_params,
          .output_height_tile = 2,
          .output_width_tile = 4,
        };
      }
      xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_1x4_acc3,
        .init.f32 = xnn_init_f32_chw_sse_stride2_params,
        .update.f32 = xnn_update_f32_chw_sse_stride2_params,
        .output_height_tile = 1,
        .output_width_tile = 4,
      };
      xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4,
        .init.f32 = xnn_init_f32_chw_sse_stride1_params,
        .update.f32 = xnn_update_f32_chw_sse_stride1_params,
        .output_height_tile = 4,
        .output_width_tile = 4,
      };
      xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4,
        .init.f32 = xnn_init_f32_chw_sse_stride2_params,
        .update.f32 = xnn_update_f32_chw_sse_stride2_params,
        .output_height_tile = 2,
        .output_width_tile = 4,
      };
      // Channel-wise global average pooling over CHW layout.
      xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
        .ukernel = (xnn_gavgpool_cw_ukernel_fn) xnn_f32_gavgpool_cw_ukernel__sse_x4,
        .channel_tile = 4,
      };
      // CHW bilinear: one channel at a time, 8 pixels per iteration.
      xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
        .ukernel = (xnn_ibilinear_chw_ukernel_fn) xnn_f32_ibilinear_chw_ukernel__sse_p8,
        .channel_tile = 1,
        .pixel_tile = 8,
      };
    #endif  // XNN_NO_NCHW_OPERATORS
4667 #endif // XNN_NO_F32_OPERATORS
4668
4669 /*************************** VCVT x86 micro-kernels ***************************/
4670 #ifndef XNN_NO_VCVT_OPERATORS
    init_flags |= XNN_INIT_FLAG_VCVT;

    // f16 <-> f32 conversions: AVX512-SKX > F16C > AVX (integer emulation) >
    // SSE4.1 > SSE2 dispatch. Hardware-conversion branches (SKX, F16C
    // f16->f32) need no parameter initializer; software-emulated branches do.
    if (hardware_config->use_x86_avx512skx) {
      xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__avx512skx_x16,
        .element_tile = 16,
      };
      xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_f16_vcvt_ukernel__avx512skx_x16,
        .element_tile = 16,
      };
    } else if (hardware_config->use_x86_f16c) {
      xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__f16c_x16,
        .element_tile = 16,
      };
      xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_f16_vcvt_ukernel__f16c_x16,
        .init.f32_f16_cvt = xnn_init_f32_f16_cvt_f16c_params,
        .element_tile = 16,
      };
    } else if (hardware_config->use_x86_avx) {
      xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__avx_int16_x16,
        .init.f16_f32_cvt = xnn_init_f16_f32_cvt_sse_int16_params,
        .element_tile = 16,
      };
      xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_f16_vcvt_ukernel__avx_x24,
        .init.f32_f16_cvt = xnn_init_f32_f16_cvt_sse2_params,
        .element_tile = 24,
      };
    } else if (hardware_config->use_x86_sse4_1) {
      xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__sse41_int16_x16,
        .init.f16_f32_cvt = xnn_init_f16_f32_cvt_sse_int16_params,
        .element_tile = 16,
      };
      xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_f16_vcvt_ukernel__sse41_x8,
        .init.f32_f16_cvt = xnn_init_f32_f16_cvt_sse2_params,
        .element_tile = 8,
      };
    } else {
      xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__sse2_int16_x32,
        .init.f16_f32_cvt = xnn_init_f16_f32_cvt_sse_int16_params,
        .element_tile = 32,
      };
      xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_f16_vcvt_ukernel__sse2_x16,
        .init.f32_f16_cvt = xnn_init_f32_f16_cvt_sse2_params,
        .element_tile = 16,
      };
    }
    // f32 -> qs8 (signed 8-bit quantize): AVX512-SKX > AVX2 > AVX > SSE4.1 >
    // SSE2 dispatch, with element tiles shrinking alongside register width.
    if (hardware_config->use_x86_avx512skx) {
      xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qs8_vcvt_ukernel__avx512skx_x128,
        .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_avx512_params,
        .element_tile = 128,
      };
    } else if (hardware_config->use_x86_avx2) {
      xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qs8_vcvt_ukernel__avx2_x64,
        .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_avx2_params,
        .element_tile = 64,
      };
    } else if (hardware_config->use_x86_avx) {
      xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qs8_vcvt_ukernel__avx_x32,
        .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_avx_params,
        .element_tile = 32,
      };
    } else if (hardware_config->use_x86_sse4_1) {
      xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qs8_vcvt_ukernel__sse41_x32,
        .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_sse4_params,
        .element_tile = 32,
      };
    } else {
      xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qs8_vcvt_ukernel__sse2_x32,
        .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_sse2_params,
        .element_tile = 32,
      };
    }
    // f32 -> qu8 (unsigned 8-bit quantize): same dispatch shape as f32->qs8,
    // minus a dedicated SSE4.1 tier.
    if (hardware_config->use_x86_avx512skx) {
      xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qu8_vcvt_ukernel__avx512skx_x128,
        .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_avx512_params,
        .element_tile = 128,
      };
    } else if (hardware_config->use_x86_avx2) {
      xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qu8_vcvt_ukernel__avx2_x64,
        .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_avx2_params,
        .element_tile = 64,
      };
    } else if (hardware_config->use_x86_avx) {
      xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qu8_vcvt_ukernel__avx_x32,
        .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_avx_params,
        .element_tile = 32,
      };
    } else {
      xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qu8_vcvt_ukernel__sse2_x32,
        .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_sse2_params,
        .element_tile = 32,
      };
    }
    // qs8/qu8 requantization (same-type convert with new scale/zero-point):
    // AVX2 > AVX > SSE4.1 > SSSE3 > SSE2 dispatch; all tiers use a 32-element
    // tile. The AVX and SSE4.1 branches reuse the SSSE3 parameter
    // initializer.
    if (hardware_config->use_x86_avx2) {
      xnn_params.vcvt.qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vcvt_ukernel__avx2_x32,
        .init.qs8_cvt = xnn_init_qs8_cvt_avx2_params,
        .element_tile = 32,
      };
      xnn_params.vcvt.qu8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vcvt_ukernel__avx2_x32,
        .init.qu8_cvt = xnn_init_qu8_cvt_avx2_params,
        .element_tile = 32,
      };
    } else if (hardware_config->use_x86_avx) {
      xnn_params.vcvt.qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vcvt_ukernel__avx_x32,
        .init.qs8_cvt = xnn_init_qs8_cvt_ssse3_params,
        .element_tile = 32,
      };
      xnn_params.vcvt.qu8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vcvt_ukernel__avx_x32,
        .init.qu8_cvt = xnn_init_qu8_cvt_ssse3_params,
        .element_tile = 32,
      };
    } else if (hardware_config->use_x86_sse4_1) {
      xnn_params.vcvt.qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vcvt_ukernel__sse41_x32,
        .init.qs8_cvt = xnn_init_qs8_cvt_ssse3_params,
        .element_tile = 32,
      };
      xnn_params.vcvt.qu8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vcvt_ukernel__sse41_x32,
        .init.qu8_cvt = xnn_init_qu8_cvt_ssse3_params,
        .element_tile = 32,
      };
    } else if (hardware_config->use_x86_ssse3) {
      xnn_params.vcvt.qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vcvt_ukernel__ssse3_x32,
        .init.qs8_cvt = xnn_init_qs8_cvt_ssse3_params,
        .element_tile = 32,
      };
      xnn_params.vcvt.qu8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vcvt_ukernel__ssse3_x32,
        .init.qu8_cvt = xnn_init_qu8_cvt_ssse3_params,
        .element_tile = 32,
      };
    } else {
      xnn_params.vcvt.qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vcvt_ukernel__sse2_x32,
        .init.qs8_cvt = xnn_init_qs8_cvt_sse2_params,
        .element_tile = 32,
      };
      xnn_params.vcvt.qu8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vcvt_ukernel__sse2_x32,
        .init.qu8_cvt = xnn_init_qu8_cvt_sse2_params,
        .element_tile = 32,
      };
    }
    // Dequantization converters (QS8->F32 and QU8->F32): widest ISA first,
    // AVX512-SKX -> AVX2 -> AVX -> SSE4.1 -> SSE2.
    if (hardware_config->use_x86_avx512skx) {
      xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_f32_vcvt_ukernel__avx512skx_x32,
        .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_avx512_params,
        .element_tile = 32,
      };
      xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_f32_vcvt_ukernel__avx512skx_x32,
        .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_avx512_params,
        .element_tile = 32,
      };
    } else if (hardware_config->use_x86_avx2) {
      // AVX2 kernels reuse the AVX parameter initializer.
      xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_f32_vcvt_ukernel__avx2_x16,
        .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_avx_params,
        .element_tile = 16,
      };
      xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_f32_vcvt_ukernel__avx2_x16,
        .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_avx_params,
        .element_tile = 16,
      };
    } else if (hardware_config->use_x86_avx) {
      xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_f32_vcvt_ukernel__avx_x32,
        .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_avx_params,
        .element_tile = 32,
      };
      xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_f32_vcvt_ukernel__avx_x32,
        .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_avx_params,
        .element_tile = 32,
      };
    } else if (hardware_config->use_x86_sse4_1) {
      xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_f32_vcvt_ukernel__sse41_x16,
        .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_sse4_params,
        .element_tile = 16,
      };
      xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_f32_vcvt_ukernel__sse41_x16,
        .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_sse4_params,
        .element_tile = 16,
      };
    } else {
      // Baseline SSE2 fallback.
      xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_f32_vcvt_ukernel__sse2_x32,
        .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_sse2_params,
        .element_tile = 32,
      };
      xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_f32_vcvt_ukernel__sse2_x32,
        .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_sse2_params,
        .element_tile = 32,
      };
    }
4894 #endif // XNN_NO_VCVT_OPERATORS
4895
  /**************************** X32 x86 micro-kernels ****************************/
  #ifndef XNN_NO_X32_OPERATORS
    init_flags |= XNN_INIT_FLAG_X32;

    // 32-bit-element memory-manipulation kernels: unpooling and channel
    // zip (interleave) for 2/3/4/variable streams. SSE2 is the x86 baseline,
    // so no runtime ISA dispatch is needed here.
    xnn_params.x32.unpool = (xnn_unpool_ukernel_fn) xnn_x32_unpool_ukernel__sse2;
    xnn_params.x32.zip = (struct zip_parameters) {
      .x2 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x2_ukernel__sse2,
      .x3 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x3_ukernel__sse2,
      .x4 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x4_ukernel__sse2,
      .xm = (xnn_zipv_ukernel_fn) xnn_x32_zip_xm_ukernel__sse2,
    };
  #endif  // XNN_NO_X32_OPERATORS
4908
  /**************************** XX x86 micro-kernels ****************************/
  #ifndef XNN_NO_XX_OPERATORS
    init_flags |= XNN_INIT_FLAG_XX;

    // Datatype-agnostic byte-oriented kernels: raw copy, fill, and pad.
    xnn_params.xx.copy = (xnn_vunary_ukernel_fn) xnn_xx_copy_ukernel__scalar_memcpy;
    xnn_params.xx.fill = (struct fill_parameters) {
      .ukernel = (xnn_fill_ukernel_fn) xnn_xx_fill_ukernel__sse2_x64,
      .row_tile = 1,
    };
    xnn_params.xx.pad = (struct pad_parameters) {
      .ukernel = (xnn_pad_ukernel_fn) xnn_xx_pad_ukernel__sse2,
      .row_tile = 1,
    };
  #endif  // XNN_NO_XX_OPERATORS
4923
4924#elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
4925
4926 /**************************** QC8 WAsm SIMD micro-kernels****************************/
4927 #ifndef XNN_NO_QS8_OPERATORS
4928 init_flags |= XNN_INIT_FLAG_QC8;
4929
4930 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
4931 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
4932 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
4933 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
4934 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_wasmsimd_params;
4935 xnn_params.qc8.gemm.mr = 4;
4936 xnn_params.qc8.gemm.nr = 4;
4937 xnn_params.qc8.gemm.log2_kr = 1;
4938 xnn_params.qc8.gemm.log2_sr = 2;
4939
4940 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_3p16c__wasmsimd_mul16_add16;
4941 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_wasmsimd_params;
4942 xnn_params.qc8.dwconv[0].channel_tile = 16;
4943 xnn_params.qc8.dwconv[0].primary_tile = 3;
4944 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_9p16c__wasmsimd_mul16_add16;
4945 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_wasmsimd_params;
4946 xnn_params.qc8.dwconv[1].channel_tile = 16;
4947 xnn_params.qc8.dwconv[1].primary_tile = 9;
4948 xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_25p16c__wasmsimd_mul16_add16;
4949 xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_wasmsimd_params;
4950 xnn_params.qc8.dwconv[2].channel_tile = 16;
4951 xnn_params.qc8.dwconv[2].primary_tile = 25;
4952 #endif // XNN_NO_QC8_OPERATORS
4953
  /**************************** QS8 WAsm SIMD micro-kernels****************************/
  #ifndef XNN_NO_QS8_OPERATORS
    init_flags |= XNN_INIT_FLAG_QS8;

    // Signed 8-bit GEMM/IGEMM: 4x4 tile, c2s4 packing (kr=2, sr=4).
    xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
    xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
    xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
    xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
    xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_wasmsimd_params;
    xnn_params.qs8.gemm.mr = 4;
    xnn_params.qs8.gemm.nr = 4;
    xnn_params.qs8.gemm.log2_kr = 1;
    xnn_params.qs8.gemm.log2_sr = 2;

    // Depthwise conv: only 9-tap and 25-tap kernels here (no 3-tap, unlike QC8).
    xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__wasmsimd_mul16_add16;
    xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_wasmsimd_params;
    xnn_params.qs8.dwconv[0].channel_tile = 16;
    xnn_params.qs8.dwconv[0].primary_tile = 9;
    xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__wasmsimd_mul16_add16;
    xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_wasmsimd_params;
    xnn_params.qs8.dwconv[1].channel_tile = 16;
    xnn_params.qs8.dwconv[1].primary_tile = 25;

    // Global average pooling: 7-row unipass with a 7+7-row multipass fallback.
    xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
      .unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16,
      .multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16,
      .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params,
      .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_wasmsimd_params,
      .row_tile = 7,
      .channel_tile = 16,
    };


    // Leaky ReLU: variant chosen by SIMD level (relaxed vs. baseline) and by
    // engine flavour — is_x86 here distinguishes WAsm engines whose hosts favor
    // the x86-tuned kernels; the x86 baseline variant uses a smaller tile (16).
    #if XNN_ARCH_WASMRELAXEDSIMD
    if (hardware_config->is_x86) {
      xnn_params.qs8.lrelu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x32,
        .init.qs8_lrelu = xnn_init_qs8_lrelu_wasmsimd_x86_params,
        .element_tile = 32,
      };
    } else {
      xnn_params.qs8.lrelu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_arm_x32,
        .init.qs8_lrelu = xnn_init_qs8_lrelu_wasmsimd_arm_params,
        .element_tile = 32,
      };
    }
    #else
    if (hardware_config->is_x86) {
      xnn_params.qs8.lrelu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vlrelu_ukernel__wasmsimd_x86_x16,
        .init.qs8_lrelu = xnn_init_qs8_lrelu_wasmsimd_x86_params,
        .element_tile = 16,
      };
    } else {
      xnn_params.qs8.lrelu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vlrelu_ukernel__wasmsimd_arm_x32,
        .init.qs8_lrelu = xnn_init_qs8_lrelu_wasmsimd_arm_params,
        .element_tile = 32,
      };
    }
    #endif
  #endif  // XNN_NO_QS8_OPERATORS
5017
  /**************************** QU8 WAsm SIMD micro-kernels****************************/
  #ifndef XNN_NO_QU8_OPERATORS
    init_flags |= XNN_INIT_FLAG_QU8;

    // Unsigned 8-bit GEMM/IGEMM: same 4x4 c2s4 tiling as the QS8 section.
    xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
    xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
    xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
    xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
    xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_wasmsimd_params;
    xnn_params.qu8.gemm.mr = 4;
    xnn_params.qu8.gemm.nr = 4;
    xnn_params.qu8.gemm.log2_kr = 1;
    xnn_params.qu8.gemm.log2_sr = 2;

    // Depthwise conv: 8 channels per pass (narrower than the 16c QS8 kernels).
    xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__wasmsimd_mul16;
    xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_wasmsimd_params;
    xnn_params.qu8.dwconv[0].channel_tile = 8;
    xnn_params.qu8.dwconv[0].primary_tile = 9;
    xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__wasmsimd_mul16;
    xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_wasmsimd_params;
    xnn_params.qu8.dwconv[1].channel_tile = 8;
    xnn_params.qu8.dwconv[1].primary_tile = 25;

    // Local average pooling falls back to scalar kernels — no WAsm SIMD
    // QU8 avgpool kernel is wired up here.
    xnn_params.qu8.avgpool = (struct avgpool_parameters) {
      .unipass = (xnn_avgpool_unipass_ukernel_fn) xnn_qu8_avgpool_minmax_fp32_ukernel_9x__scalar_imagic_c1,
      .multipass = (xnn_avgpool_multipass_ukernel_fn) xnn_qu8_avgpool_minmax_fp32_ukernel_9p8x__scalar_imagic_c1,
      .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params,
      .primary_tile = 9,
      .incremental_tile = 8,
      .channel_tile = 1,
    };
    // Global average pooling: 7-row unipass / 7+7-row multipass, 16 channels.
    xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
      .unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16,
      .multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16,
      .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params,
      .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_wasmsimd_params,
      .row_tile = 7,
      .channel_tile = 16,
    };

    // Leaky ReLU dispatch mirrors the QS8 section above: relaxed-SIMD kernels
    // when available, otherwise engine-flavoured baseline-SIMD kernels.
    #if XNN_ARCH_WASMRELAXEDSIMD
    if (hardware_config->is_x86) {
      xnn_params.qu8.lrelu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x32,
        .init.qu8_lrelu = xnn_init_qu8_lrelu_wasmsimd_x86_params,
        .element_tile = 32,
      };
    } else {
      xnn_params.qu8.lrelu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_arm_x32,
        .init.qu8_lrelu = xnn_init_qu8_lrelu_wasmsimd_arm_params,
        .element_tile = 32,
      };
    }
    #else
    if (hardware_config->is_x86) {
      xnn_params.qu8.lrelu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vlrelu_ukernel__wasmsimd_x86_x16,
        .init.qu8_lrelu = xnn_init_qu8_lrelu_wasmsimd_x86_params,
        .element_tile = 16,
      };
    } else {
      xnn_params.qu8.lrelu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vlrelu_ukernel__wasmsimd_arm_x32,
        .init.qu8_lrelu = xnn_init_qu8_lrelu_wasmsimd_arm_params,
        .element_tile = 32,
      };
    }
    #endif
  #endif  // XNN_NO_QU8_OPERATORS
5088
  /**************************** S8 WAsm SIMD micro-kernels****************************/
  #ifndef XNN_NO_S8_OPERATORS
    init_flags |= XNN_INIT_FLAG_S8;

    // Signed 8-bit elementwise/pooling kernels: clamp, bilinear resize,
    // and 9+8-tap max pooling.
    xnn_params.s8.clamp = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_fn) xnn_s8_vclamp_ukernel__wasmsimd_x64,
      .init.s8_minmax = xnn_init_s8_minmax_wasmsimd_params,
      .element_tile = 64,
    };
    xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
      .ukernel = (xnn_ibilinear_ukernel_fn) xnn_s8_ibilinear_ukernel__wasmsimd_dot16x2_c8,
      .pixel_tile = 1,
      .channel_tile = 8,
    };
    xnn_params.s8.maxpool = (struct maxpool_parameters) {
      .ukernel = (xnn_maxpool_ukernel_fn) xnn_s8_maxpool_minmax_ukernel_9p8x__wasmsimd_c16,
      .init.s8 = xnn_init_s8_minmax_wasmsimd_params,
      .mr = 9,
      .qr = 8,
    };
  #endif  // XNN_NO_S8_OPERATORS
5110
  /**************************** U8 WAsm SIMD micro-kernels****************************/
  #ifndef XNN_NO_U8_OPERATORS
    init_flags |= XNN_INIT_FLAG_U8;

    // Unsigned 8-bit counterparts of the S8 section, plus scalar-only
    // LUT-normalization and row-max reduction kernels.
    xnn_params.u8.clamp = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_fn) xnn_u8_vclamp_ukernel__wasmsimd_x64,
      .init.u8_minmax = xnn_init_u8_minmax_wasmsimd_params,
      .element_tile = 64,
    };
    xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
      .ukernel = (xnn_ibilinear_ukernel_fn) xnn_u8_ibilinear_ukernel__wasmsimd_dot16x2_c8,
      .pixel_tile = 1,
      .channel_tile = 8,
    };
    xnn_params.u8.maxpool = (struct maxpool_parameters) {
      .ukernel = (xnn_maxpool_ukernel_fn) xnn_u8_maxpool_minmax_ukernel_9p8x__wasmsimd_c16,
      .init.u8 = xnn_init_u8_minmax_wasmsimd_params,
      .mr = 9,
      .qr = 8,
    };
    xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
    xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
  #endif  // XNN_NO_U8_OPERATORS
5134
  /**************************** X8 WAsm SIMD micro-kernels****************************/
  #ifndef XNN_NO_X8_OPERATORS
    init_flags |= XNN_INIT_FLAG_X8;

    // 8-bit channel zip: scalar implementations only on this target.
    xnn_params.x8.zip = (struct zip_parameters) {
      .x2 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x2_ukernel__scalar,
      .x3 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x3_ukernel__scalar,
      .x4 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x4_ukernel__scalar,
      .xm = (xnn_zipv_ukernel_fn) xnn_x8_zip_xm_ukernel__scalar,
    };
  #endif  // XNN_NO_X8_OPERATORS
5146
5147 /**************************** F32 WAsm SIMD micro-kernels****************************/
5148 #ifndef XNN_NO_F32_OPERATORS
5149 init_flags |= XNN_INIT_FLAG_F32;
5150
    // F32 GEMM/IGEMM selection. Two axes of dispatch:
    //  - engine flavour (is_x86): x86-hosted engines get a 4x8 tile; others get
    //    6x8 (relaxed SIMD) or 5x8 (baseline SIMD) — presumably register-pressure
    //    tuning per engine, TODO confirm against upstream benchmarks;
    //  - SIMD level (compile-time): relaxed-SIMD FMA kernels when available.
    // gemm2 (4x2c4 tile, log2_kr=2) is configured identically in both branches.
    if (hardware_config->is_x86) {
      #if XNN_ARCH_WASMRELAXEDSIMD
        xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_fma_splat);
        xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_4x8__wasmrelaxedsimd_fma_splat);
        xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat);
        xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat);
        xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_relu_ukernel_4x8__wasmrelaxedsimd_fma_splat);
        xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_relu_ukernel_4x8__wasmrelaxedsimd_fma_splat);
        xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat);
        xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat);
        xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_ukernel_4x8__wasmrelaxedsimd_fma_splat);
        xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_ukernel_4x8__wasmrelaxedsimd_fma_splat);
        xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_ukernel_1x8__wasmrelaxedsimd_fma_splat);
        xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_ukernel_1x8__wasmrelaxedsimd_fma_splat);
      #else
        xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat);
        xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_splat);
        xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat);
        xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_splat);
        xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_relu_ukernel_4x8__wasmsimd_splat);
        xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_relu_ukernel_4x8__wasmsimd_splat);
        xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat);
        xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat);
        xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_ukernel_4x8__wasmsimd_splat);
        xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_ukernel_4x8__wasmsimd_splat);
        xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_ukernel_1x8__wasmsimd_splat);
        xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_ukernel_1x8__wasmsimd_splat);
      #endif
      xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_wasmsimd_params;
      xnn_params.f32.gemm.mr = 4;
      xnn_params.f32.gemm.nr = 8;

      #if XNN_ARCH_WASMRELAXEDSIMD
        xnn_params.f32.gemm2.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_4x2c4__wasmrelaxedsimd_fma);
        xnn_params.f32.gemm2.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_4x2c4__wasmrelaxedsimd_fma);
        xnn_params.f32.gemm2.linear.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_ukernel_4x2c4__wasmrelaxedsimd_fma);
        xnn_params.f32.gemm2.linear.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_ukernel_4x2c4__wasmrelaxedsimd_fma);
      #else
        xnn_params.f32.gemm2.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
        xnn_params.f32.gemm2.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
        xnn_params.f32.gemm2.linear.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
        xnn_params.f32.gemm2.linear.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
      #endif
      xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_wasmsimd_params;
      xnn_params.f32.gemm2.mr = 4;
      xnn_params.f32.gemm2.nr = 2;
      xnn_params.f32.gemm2.log2_kr = 2;
    } else {
      #if XNN_ARCH_WASMRELAXEDSIMD
        xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_fma_splat);
        xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_6x8__wasmrelaxedsimd_fma_splat);
        xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat);
        xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat);
        xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_relu_ukernel_6x8__wasmrelaxedsimd_fma_splat);
        xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_relu_ukernel_6x8__wasmrelaxedsimd_fma_splat);
        xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat);
        xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat);
        xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_ukernel_6x8__wasmrelaxedsimd_fma_splat);
        xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_ukernel_6x8__wasmrelaxedsimd_fma_splat);
        xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_ukernel_1x8__wasmrelaxedsimd_fma_splat);
        xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_ukernel_1x8__wasmrelaxedsimd_fma_splat);
        xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_wasmsimd_params;
        xnn_params.f32.gemm.mr = 6;
        xnn_params.f32.gemm.nr = 8;
      #else
        xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_splat);
        xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_arm_splat);
        xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat);
        xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_splat);
        xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_relu_ukernel_5x8__wasmsimd_splat);
        xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_relu_ukernel_5x8__wasmsimd_splat);
        xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat);
        xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat);
        xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_ukernel_5x8__wasmsimd_splat);
        xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_ukernel_5x8__wasmsimd_splat);
        xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_ukernel_1x8__wasmsimd_splat);
        xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_ukernel_1x8__wasmsimd_splat);
        xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_wasmsimd_params;
        xnn_params.f32.gemm.mr = 5;
        xnn_params.f32.gemm.nr = 8;
      #endif

      #if XNN_ARCH_WASMRELAXEDSIMD
        xnn_params.f32.gemm2.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_4x2c4__wasmrelaxedsimd_fma);
        xnn_params.f32.gemm2.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_4x2c4__wasmrelaxedsimd_fma);
        xnn_params.f32.gemm2.linear.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_ukernel_4x2c4__wasmrelaxedsimd_fma);
        xnn_params.f32.gemm2.linear.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_ukernel_4x2c4__wasmrelaxedsimd_fma);
      #else
        xnn_params.f32.gemm2.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
        xnn_params.f32.gemm2.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
        xnn_params.f32.gemm2.linear.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
        xnn_params.f32.gemm2.linear.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
      #endif
      xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_wasmsimd_params;
      xnn_params.f32.gemm2.mr = 4;
      xnn_params.f32.gemm2.nr = 2;
      xnn_params.f32.gemm2.log2_kr = 2;
    }
5249
5250 #if XNN_ARCH_WASMRELAXEDSIMD
5251 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_3p8c__wasmrelaxedsimd_fma;
5252 xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_3p8c__wasmrelaxedsimd_fma;
5253 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
5254 xnn_params.f32.dwconv[0].channel_tile = 8;
5255 xnn_params.f32.dwconv[0].primary_tile = 3;
5256
5257 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_4p8c__wasmrelaxedsimd_fma;
5258 xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_4p8c__wasmrelaxedsimd_fma;
5259 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
5260 xnn_params.f32.dwconv[1].channel_tile = 8;
5261 xnn_params.f32.dwconv[1].primary_tile = 4;
5262
5263 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_9p8c__wasmrelaxedsimd_fma;
5264 xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_9p8c__wasmrelaxedsimd_fma;
5265 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
5266 xnn_params.f32.dwconv[2].channel_tile = 8;
5267 xnn_params.f32.dwconv[2].primary_tile = 9;
5268 #else
5269 if (hardware_config->is_x86) {
5270 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_3p8c__wasmsimd_x86;
5271 xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_3p8c__wasmsimd;
5272 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
5273 xnn_params.f32.dwconv[0].channel_tile = 8;
5274 xnn_params.f32.dwconv[0].primary_tile = 3;
5275
5276 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_4p8c__wasmsimd_x86;
5277 xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_4p8c__wasmsimd;
5278 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
5279 xnn_params.f32.dwconv[1].channel_tile = 8;
5280 xnn_params.f32.dwconv[1].primary_tile = 4;
5281
5282 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_9p8c__wasmsimd_x86;
5283 xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_9p8c__wasmsimd;
5284 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
5285 xnn_params.f32.dwconv[2].channel_tile = 8;
5286 xnn_params.f32.dwconv[2].primary_tile = 9;
5287 } else {
5288 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_3p4c__wasmsimd_arm;
5289 xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_3p4c__wasmsimd;
5290 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
5291 xnn_params.f32.dwconv[0].channel_tile = 4;
5292 xnn_params.f32.dwconv[0].primary_tile = 3;
5293
5294 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_4p4c__wasmsimd_arm;
5295 xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_4p4c__wasmsimd;
5296 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
5297 xnn_params.f32.dwconv[1].channel_tile = 4;
5298 xnn_params.f32.dwconv[1].primary_tile = 4;
5299
5300 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_9p4c__wasmsimd_arm;
5301 xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_9p4c__wasmsimd;
5302 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
5303 xnn_params.f32.dwconv[2].channel_tile = 4;
5304 xnn_params.f32.dwconv[2].primary_tile = 9;
5305 }
5306 #endif
5307
    // Depthwise-convolution slot 3 (primary_tile 25, i.e. 5x5 kernels):
    // prefer the Relaxed SIMD FMA microkernel when targeting wasmrelaxedsimd.
    #if XNN_ARCH_WASMRELAXEDSIMD
      xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_25p8c__wasmrelaxedsimd_fma;
      xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_25p8c__wasmrelaxedsimd_fma;
      xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
      xnn_params.f32.dwconv[3].channel_tile = 8;
      xnn_params.f32.dwconv[3].primary_tile = 25;
    #else
      // NOTE(review): unlike dwconv[0..2] above, this fallback installs the
      // _arm minmax variant unconditionally -- there is no
      // hardware_config->is_x86 split here. Confirm this is intentional.
      xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_25p4c__wasmsimd_arm;
      xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_25p4c__wasmsimd;
      xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
      xnn_params.f32.dwconv[3].channel_tile = 4;
      xnn_params.f32.dwconv[3].primary_tile = 25;
    #endif
5321
5322 if (hardware_config->is_x86) {
5323 xnn_params.f32.avgpool = (struct avgpool_parameters) {
5324 .unipass = (xnn_avgpool_unipass_ukernel_fn) xnn_f32_avgpool_minmax_ukernel_9x__wasmsimd_x86_c4,
5325 .multipass = (xnn_avgpool_multipass_ukernel_fn) xnn_f32_avgpool_minmax_ukernel_9p8x__wasmsimd_x86_c4,
5326 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
5327 .primary_tile = 9,
5328 .incremental_tile = 8,
5329 .channel_tile = 4,
5330 };
5331 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
5332 .unipass = (xnn_pavgpool_unipass_ukernel_fn) xnn_f32_pavgpool_minmax_ukernel_9x__wasmsimd_x86_c4,
5333 .multipass = (xnn_pavgpool_multipass_ukernel_fn) xnn_f32_pavgpool_minmax_ukernel_9p8x__wasmsimd_x86_c4,
5334 .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
5335 .primary_tile = 9,
5336 .incremental_tile = 8,
5337 .channel_tile = 4,
5338 };
5339 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
5340 .unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4,
5341 .multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4,
5342 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
5343 .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
5344 .row_tile = 7,
5345 .channel_tile = 4,
5346 };
5347 } else {
5348 xnn_params.f32.avgpool = (struct avgpool_parameters) {
5349 .unipass = (xnn_avgpool_unipass_ukernel_fn) xnn_f32_avgpool_minmax_ukernel_9x__wasmsimd_arm_c4,
5350 .multipass = (xnn_avgpool_multipass_ukernel_fn) xnn_f32_avgpool_minmax_ukernel_9p8x__wasmsimd_arm_c4,
5351 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
5352 .primary_tile = 9,
5353 .incremental_tile = 8,
5354 .channel_tile = 4,
5355 };
5356 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
5357 .unipass = (xnn_pavgpool_unipass_ukernel_fn) xnn_f32_pavgpool_minmax_ukernel_9x__wasmsimd_arm_c4,
5358 .multipass = (xnn_pavgpool_multipass_ukernel_fn) xnn_f32_pavgpool_minmax_ukernel_9p8x__wasmsimd_arm_c4,
5359 .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
5360 .primary_tile = 9,
5361 .incremental_tile = 8,
5362 .channel_tile = 4,
5363 };
5364 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
5365 .unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4,
5366 .multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4,
5367 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
5368 .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
5369 .row_tile = 7,
5370 .channel_tile = 4,
5371 };
5372 }
5373 if (hardware_config->is_x86) {
5374 xnn_params.f32.maxpool = (struct maxpool_parameters) {
5375 .ukernel = (xnn_maxpool_ukernel_fn) xnn_f32_maxpool_minmax_ukernel_9p8x__wasmsimd_x86_c4,
5376 .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
5377 .mr = 9,
5378 .qr = 8,
5379 };
5380 } else {
5381 xnn_params.f32.maxpool = (struct maxpool_parameters) {
5382 .ukernel = (xnn_maxpool_ukernel_fn) xnn_f32_maxpool_minmax_ukernel_9p8x__wasmsimd_arm_c4,
5383 .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
5384 .mr = 9,
5385 .qr = 8,
5386 };
5387 }
5388 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
5389 .up = (xnn_argmaxpool_unipass_ukernel_fn) xnn_f32_argmaxpool_ukernel_4x__wasmsimd_c4,
5390 .mr = 4,
5391 };
5392 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
5393 .up = (xnn_argmaxpool_unipass_ukernel_fn) xnn_f32_argmaxpool_ukernel_9x__wasmsimd_c4,
5394 .mr = 9,
5395 };
5396 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
5397 .mp = (xnn_argmaxpool_multipass_ukernel_fn) xnn_f32_argmaxpool_ukernel_9p8x__wasmsimd_c4,
5398 .mr = 9,
5399 .qr = 8,
5400 };
5401 #if XNN_ARCH_WASMRELAXEDSIMD
5402 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
5403 .ukernel = (xnn_ibilinear_ukernel_fn) xnn_f32_ibilinear_ukernel__wasmrelaxedsimd_c8,
5404 .pixel_tile = 1,
5405 .channel_tile = 8,
5406 };
5407 #else
5408 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
5409 .ukernel = (xnn_ibilinear_ukernel_fn) xnn_f32_ibilinear_ukernel__wasmsimd_c8,
5410 .pixel_tile = 1,
5411 .channel_tile = 8,
5412 };
5413 #endif
5414 xnn_params.f32.abs = (struct vunary_parameters) {
5415 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vabs_ukernel__wasmsimd_x8,
5416 .init.f32_abs = xnn_init_f32_abs_wasmsimd_params,
5417 .element_tile = 8,
5418 };
5419 if (hardware_config->is_x86) {
5420 xnn_params.f32.clamp = (struct vunary_parameters) {
5421 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vclamp_ukernel__wasmsimd_x86_x8,
5422 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
5423 .element_tile = 8,
5424 };
5425 } else {
5426 xnn_params.f32.clamp = (struct vunary_parameters) {
5427 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vclamp_ukernel__wasmsimd_arm_x8,
5428 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
5429 .element_tile = 8,
5430 };
5431 }
5432 #if XNN_ARCH_WASMRELAXEDSIMD
5433 xnn_params.f32.elu = (struct vunary_parameters) {
5434 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_velu_ukernel__wasmrelaxedsimd_fma_rr2_p6_x24,
5435 .init.f32_elu = xnn_init_f32_elu_wasmsimd_rr2_p6_params,
5436 .element_tile = 24,
5437 };
5438 #else
5439 if (hardware_config->is_x86) {
5440 xnn_params.f32.elu = (struct vunary_parameters) {
5441 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_velu_ukernel__wasmsimd_x86_rr2_p6_x20,
5442 .init.f32_elu = xnn_init_f32_elu_wasmsimd_rr2_p6_params,
5443 .element_tile = 20,
5444 };
5445 } else {
5446 xnn_params.f32.elu = (struct vunary_parameters) {
5447 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_x20,
5448 .init.f32_elu = xnn_init_f32_elu_wasmsimd_rr2_p6_params,
5449 .element_tile = 20,
5450 };
5451 }
5452 #endif
5453 xnn_params.f32.hswish = (struct vunary_parameters) {
5454 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vhswish_ukernel__wasmsimd_x16,
5455 .init.f32_hswish = xnn_init_f32_hswish_wasmsimd_params,
5456 .element_tile = 16,
5457 };
5458 #if XNN_ARCH_WASMRELAXEDSIMD
5459 if (hardware_config->is_x86) {
5460 xnn_params.f32.lrelu = (struct vunary_parameters) {
5461 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vlrelu_ukernel__wasmrelaxedsimd_iminmax_x4,
5462 .init.f32_lrelu = xnn_init_f32_lrelu_wasmsimd_params,
5463 .element_tile = 4,
5464 };
5465 } else {
5466 xnn_params.f32.lrelu = (struct vunary_parameters) {
5467 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vlrelu_ukernel__wasmrelaxedsimd_laneselect_x4,
5468 .init.f32_lrelu = xnn_init_f32_lrelu_wasmsimd_params,
5469 .element_tile = 4,
5470 };
5471 }
5472 #else
5473 if (hardware_config->is_x86) {
5474 xnn_params.f32.lrelu = (struct vunary_parameters) {
5475 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vlrelu_ukernel__wasmsimd_iminmax_x8,
5476 .init.f32_lrelu = xnn_init_f32_lrelu_wasmsimd_params,
5477 .element_tile = 8,
5478 };
5479 } else {
5480 xnn_params.f32.lrelu = (struct vunary_parameters) {
5481 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vlrelu_ukernel__wasmsimd_laneselect_x8,
5482 .init.f32_lrelu = xnn_init_f32_lrelu_wasmsimd_params,
5483 .element_tile = 8,
5484 };
5485 }
5486 #endif
5487 xnn_params.f32.neg = (struct vunary_parameters) {
5488 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vneg_ukernel__wasmsimd_x8,
5489 .init.f32_neg = xnn_init_f32_neg_wasmsimd_params,
5490 .element_tile = 16,
5491 };
5492 xnn_params.f32.relu = (struct vunary_parameters) {
5493 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrelu_ukernel__wasmsimd_x16,
5494 .element_tile = 16,
5495 };
5496 xnn_params.f32.rndne = (struct vunary_parameters) {
5497 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndne_ukernel__wasmsimd_x8,
5498 .element_tile = 8,
5499 };
5500 xnn_params.f32.rndz = (struct vunary_parameters) {
5501 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndz_ukernel__wasmsimd_x8,
5502 .element_tile = 8,
5503 };
5504 xnn_params.f32.rndu = (struct vunary_parameters) {
5505 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndu_ukernel__wasmsimd_x8,
5506 .element_tile = 8,
5507 };
5508 xnn_params.f32.rndd = (struct vunary_parameters) {
5509 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndd_ukernel__wasmsimd_x8,
5510 .element_tile = 8,
5511 };
5512 #if XNN_ARCH_WASMRELAXEDSIMD
5513 xnn_params.f32.sigmoid = (struct vunary_parameters) {
5514 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_fma_rr2_p5_div_x24,
5515 .init.f32_sigmoid = xnn_init_f32_sigmoid_wasmsimd_rr2_p5_params,
5516 .element_tile = 24,
5517 };
5518 #else
5519 xnn_params.f32.sigmoid = (struct vunary_parameters) {
5520 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsigmoid_ukernel__wasmsimd_rr2_p5_div_x16,
5521 .init.f32_sigmoid = xnn_init_f32_sigmoid_wasmsimd_rr2_p5_params,
5522 .element_tile = 16,
5523 };
5524 #endif
5525 xnn_params.f32.sqr = (struct vunary_parameters) {
5526 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsqr_ukernel__wasmsimd_x8,
5527 .element_tile = 16,
5528 };
5529 xnn_params.f32.sqrt = (struct vunary_parameters) {
5530 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsqrt_ukernel__wasmsimd_sqrt_x8,
5531 .element_tile = 8,
5532 };
5533 #if XNN_ARCH_WASMRELAXEDSIMD
5534 if (hardware_config->is_x86) {
5535 xnn_params.f32.prelu = (struct prelu_parameters) {
5536 .ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x4,
5537 .row_tile = 2,
5538 .channel_tile = 4,
5539 };
5540 } else {
5541 xnn_params.f32.prelu = (struct prelu_parameters) {
5542 .ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x4,
5543 .row_tile = 2,
5544 .channel_tile = 4,
5545 };
5546 }
5547 xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
5548 .ukernel = (xnn_raddstoreexpminusmax_ukernel_fn) xnn_f32_raddstoreexpminusmax_ukernel__wasmrelaxedsimd_rr2_p5_x16_acc2,
5549 .init.f32 = xnn_init_f32_expminus_wasmsimd_rr2_p5_params,
5550 .element_tile = 16,
5551 };
5552 #else
5553 if (hardware_config->is_x86) {
5554 xnn_params.f32.prelu = (struct prelu_parameters) {
5555 .ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x8,
5556 .row_tile = 2,
5557 .channel_tile = 8,
5558 };
5559 } else {
5560 xnn_params.f32.prelu = (struct prelu_parameters) {
5561 .ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x8,
5562 .row_tile = 2,
5563 .channel_tile = 8,
5564 };
5565 }
5566 xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
5567 .ukernel = (xnn_raddstoreexpminusmax_ukernel_fn) xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_x16_acc2,
5568 .init.f32 = xnn_init_f32_expminus_wasmsimd_rr2_p5_params,
5569 .element_tile = 16,
5570 };
5571 #endif
    // Reduce-max: pick the wasmsimd variant tuned for the engine family
    // detected at runtime (x86-hosted vs. ARM-hosted WAsm engines).
    if (hardware_config->is_x86) {
      xnn_params.f32.rmax = (xnn_rmax_ukernel_fn) xnn_f32_rmax_ukernel__wasmsimd_x86;
    } else {
      xnn_params.f32.rmax = (xnn_rmax_ukernel_fn) xnn_f32_rmax_ukernel__wasmsimd_arm;
    }
5577 #if XNN_ARCH_WASMRELAXEDSIMD
5578 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
5579 .ukernel = (xnn_vmulcaddc_ukernel_fn) xnn_f32_vmulcaddc_minmax_ukernel_c4__wasmrelaxedsimd_fma_2x,
5580 .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
5581 .channel_tile = 4,
5582 .row_tile = 2,
5583 };
5584 #else
5585 if (hardware_config->is_x86) {
5586 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
5587 .ukernel = (xnn_vmulcaddc_ukernel_fn) xnn_f32_vmulcaddc_minmax_ukernel_c4__wasmsimd_x86_2x,
5588 .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
5589 .channel_tile = 4,
5590 .row_tile = 2,
5591 };
5592 } else {
5593 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
5594 .ukernel = (xnn_vmulcaddc_ukernel_fn) xnn_f32_vmulcaddc_minmax_ukernel_c4__wasmsimd_arm_2x,
5595 .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
5596 .channel_tile = 4,
5597 .row_tile = 2,
5598 };
5599 }
5600 #endif
5601 #ifndef XNN_NO_NCHW_OPERATORS
5602 init_flags |= XNN_INIT_FLAG_CHW_OPT;
5603
5604 if (hardware_config->is_x86) {
5605 xnn_params.f32.spmm = (struct spmm_parameters) {
5606 .ukernel = (xnn_spmm_ukernel_fn) xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_x86,
5607 .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
5608 .mr = 32,
5609 .nr = 1,
5610 };
5611 } else {
5612 xnn_params.f32.spmm = (struct spmm_parameters) {
5613 .ukernel = (xnn_spmm_ukernel_fn) xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_arm,
5614 .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
5615 .mr = 32,
5616 .nr = 1,
5617 };
5618 }
5619 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
5620 .ukernel_with_symm_padding =
5621 (xnn_conv_hwc2chw_ukernel_fn) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2,
5622 .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
5623 .output_channel_tile = 4,
5624 .output_height_tile = 2,
5625 .output_width_tile = 2,
5626 };
5627 if (hardware_config->is_x86) {
5628 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
5629 .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_2x4,
5630 .init.f32 = xnn_init_f32_chw_wasmsimd_stride1_params,
5631 .update.f32 = xnn_update_f32_chw_wasmsimd_stride1_params,
5632 .output_height_tile = 2,
5633 .output_width_tile = 4,
5634 };
5635 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
5636 .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_1x4_acc2,
5637 .init.f32 = xnn_init_f32_chw_wasmsimd_stride2_params,
5638 .update.f32 = xnn_update_f32_chw_wasmsimd_stride2_params,
5639 .output_height_tile = 1,
5640 .output_width_tile = 4,
5641 };
5642 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
5643 .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4,
5644 .init.f32 = xnn_init_f32_chw_wasmsimd_stride1_params,
5645 .update.f32 = xnn_update_f32_chw_wasmsimd_stride1_params,
5646 .output_height_tile = 3,
5647 .output_width_tile = 4,
5648 };
5649 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
5650 .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc2,
5651 .init.f32 = xnn_init_f32_chw_wasmsimd_stride2_params,
5652 .update.f32 = xnn_update_f32_chw_wasmsimd_stride2_params,
5653 .output_height_tile = 1,
5654 .output_width_tile = 4,
5655 };
5656 } else {
5657 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
5658 .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_2x4,
5659 .init.f32 = xnn_init_f32_chw_wasmsimd_stride1_params,
5660 .update.f32 = xnn_update_f32_chw_wasmsimd_stride1_params,
5661 .output_height_tile = 2,
5662 .output_width_tile = 4,
5663 };
5664 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
5665 .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_1x4_acc4,
5666 .init.f32 = xnn_init_f32_chw_wasmsimd_stride2_params,
5667 .update.f32 = xnn_update_f32_chw_wasmsimd_stride2_params,
5668 .output_height_tile = 1,
5669 .output_width_tile = 4,
5670 };
5671 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
5672 .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4,
5673 .init.f32 = xnn_init_f32_chw_wasmsimd_stride1_params,
5674 .update.f32 = xnn_update_f32_chw_wasmsimd_stride1_params,
5675 .output_height_tile = 3,
5676 .output_width_tile = 4,
5677 };
5678 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
5679 .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc2,
5680 .init.f32 = xnn_init_f32_chw_wasmsimd_stride2_params,
5681 .update.f32 = xnn_update_f32_chw_wasmsimd_stride2_params,
5682 .output_height_tile = 1,
5683 .output_width_tile = 4,
5684 };
5685 }
5686 if (hardware_config->is_x86) {
5687 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
5688 .ukernel = (xnn_gavgpool_cw_ukernel_fn) xnn_f32_gavgpool_cw_ukernel__wasmsimd_x86_x4,
5689 .channel_tile = 4,
5690 };
5691 } else {
5692 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
5693 .ukernel = (xnn_gavgpool_cw_ukernel_fn) xnn_f32_gavgpool_cw_ukernel__wasmsimd_arm_x4,
5694 .channel_tile = 4,
5695 };
5696 }
5697 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
5698 .ukernel = (xnn_ibilinear_chw_ukernel_fn) xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8,
5699 .channel_tile = 1,
5700 .pixel_tile = 8,
5701 };
5702 #endif // XNN_NO_NCHW_OPERATORS
5703 #endif // XNN_NO_F32_OPERATORS
5704
5705 /*************************** VCVT WAsm SIMD micro-kernels***************************/
5706 #ifndef XNN_NO_VCVT_OPERATORS
5707 init_flags |= XNN_INIT_FLAG_VCVT;
5708
5709 #if XNN_ARCH_WASMRELAXEDSIMD
5710 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
5711 .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_x16,
5712 .init.f16_f32_cvt = xnn_init_f16_f32_cvt_wasmsimd_int16_params,
5713 .element_tile = 16,
5714 };
5715 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
5716 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_f16_vcvt_ukernel__wasmrelaxedsimd_x24,
5717 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_wasmsimd_params,
5718 .element_tile = 24,
5719 };
5720 #else
5721 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
5722 .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_x16,
5723 .init.f16_f32_cvt = xnn_init_f16_f32_cvt_wasmsimd_int16_params,
5724 .element_tile = 16,
5725 };
5726 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
5727 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_f16_vcvt_ukernel__wasmsimd_x24,
5728 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_wasmsimd_params,
5729 .element_tile = 24,
5730 };
5731 #endif
5732 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
5733 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qs8_vcvt_ukernel__wasmsimd_magic_x32,
5734 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_wasmsimd_magic_params,
5735 .element_tile = 32,
5736 };
5737 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
5738 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qu8_vcvt_ukernel__wasmsimd_magic_x32,
5739 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_wasmsimd_magic_params,
5740 .element_tile = 32,
5741 };
5742 #if XNN_ARCH_WASMRELAXEDSIMD
5743 xnn_params.vcvt.qs8 = (struct vunary_parameters) {
5744 .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vcvt_ukernel__wasmrelaxedsimd_x32,
5745 .init.qs8_cvt = xnn_init_qs8_cvt_wasmsimd_params,
5746 .element_tile = 32,
5747 };
5748 #else
5749 xnn_params.vcvt.qs8 = (struct vunary_parameters) {
5750 .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vcvt_ukernel__wasmsimd_x16,
5751 .init.qs8_cvt = xnn_init_qs8_cvt_wasmsimd_params,
5752 .element_tile = 16,
5753 };
5754 #endif
5755 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
5756 .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_f32_vcvt_ukernel__wasmsimd_x32,
5757 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_wasmsimd_params,
5758 .element_tile = 32,
5759 };
5760 #if XNN_ARCH_WASMRELAXEDSIMD
5761 xnn_params.vcvt.qu8 = (struct vunary_parameters) {
5762 .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vcvt_ukernel__wasmrelaxedsimd_x32,
5763 .init.qu8_cvt = xnn_init_qu8_cvt_wasmsimd_params,
5764 .element_tile = 32,
5765 };
5766 #else
5767 xnn_params.vcvt.qu8 = (struct vunary_parameters) {
5768 .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vcvt_ukernel__wasmsimd_x16,
5769 .init.qu8_cvt = xnn_init_qu8_cvt_wasmsimd_params,
5770 .element_tile = 16,
5771 };
5772 #endif
5773 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
5774 .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_f32_vcvt_ukernel__wasmsimd_x32,
5775 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_wasmsimd_params,
5776 .element_tile = 32,
5777 };
5778 #endif // XNN_NO_VCVT_OPERATORS
5779
5780 /**************************** X32 WAsm SIMD micro-kernels****************************/
5781 #ifndef XNN_NO_X32_OPERATORS
5782 init_flags |= XNN_INIT_FLAG_X32;
5783
5784 xnn_params.x32.unpool = (xnn_unpool_ukernel_fn) xnn_x32_unpool_ukernel__wasmsimd;
5785 xnn_params.x32.zip = (struct zip_parameters) {
5786 .x2 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x2_ukernel__wasmsimd,
5787 .x3 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x3_ukernel__wasmsimd,
5788 .x4 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x4_ukernel__wasmsimd,
5789 .xm = (xnn_zipv_ukernel_fn) xnn_x32_zip_xm_ukernel__wasmsimd,
5790 };
5791 #endif // XNN_NO_X32_OPERATORS
5792
5793 /**************************** XX WAsm SIMD micro-kernels****************************/
5794 #ifndef XNN_NO_XX_OPERATORS
5795 init_flags |= XNN_INIT_FLAG_XX;
5796
5797 xnn_params.xx.copy = (xnn_vunary_ukernel_fn) xnn_xx_copy_ukernel__scalar_memcpy;
5798 xnn_params.xx.fill = (struct fill_parameters) {
5799 .ukernel = (xnn_fill_ukernel_fn) xnn_xx_fill_ukernel__wasmsimd_x64,
5800 .row_tile = 1,
5801 };
5802 xnn_params.xx.pad = (struct pad_parameters) {
5803 .ukernel = (xnn_pad_ukernel_fn) xnn_xx_pad_ukernel__wasmsimd,
5804 .row_tile = 1,
5805 };
5806 #endif
5807
5808#elif XNN_ARCH_WASM
5809
5810 /**************************** QC8 WAsm micro-kernels****************************/
5811 #ifndef XNN_NO_QC8_OPERATORS
5812 init_flags |= XNN_INIT_FLAG_QC8;
5813
5814 if (hardware_config->is_x86) {
5815 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic);
5816 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_2x2__scalar_imagic);
5817 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic);
5818 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic);
5819 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_imagic_params;
5820 xnn_params.qc8.gemm.mr = 2;
5821 xnn_params.qc8.gemm.nr = 2;
5822 } else {
5823 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
5824 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
5825 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
5826 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
5827 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_fmagic_params;
5828 xnn_params.qc8.gemm.mr = 4;
5829 xnn_params.qc8.gemm.nr = 4;
5830 }
5831
5832 if (hardware_config->is_x86) {
5833 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_3p2c__scalar_imagic;
5834 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_imagic_params;
5835 xnn_params.qc8.dwconv[0].channel_tile = 2;
5836 xnn_params.qc8.dwconv[0].primary_tile = 3;
5837 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_9p2c__scalar_imagic;
5838 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_imagic_params;
5839 xnn_params.qc8.dwconv[1].channel_tile = 2;
5840 xnn_params.qc8.dwconv[1].primary_tile = 9;
5841 xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_25p1c__scalar_imagic;
5842 xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_imagic_params;
5843 xnn_params.qc8.dwconv[2].channel_tile = 1;
5844 xnn_params.qc8.dwconv[2].primary_tile = 25;
5845 } else {
5846 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_3p2c__wasm_fmagic;
5847 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_fmagic_params;
5848 xnn_params.qc8.dwconv[0].channel_tile = 2;
5849 xnn_params.qc8.dwconv[0].primary_tile = 3;
5850 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_9p2c__wasm_fmagic;
5851 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_fmagic_params;
5852 xnn_params.qc8.dwconv[1].channel_tile = 2;
5853 xnn_params.qc8.dwconv[1].primary_tile = 9;
5854 xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_25p2c__wasm_fmagic;
5855 xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_fmagic_params;
5856 xnn_params.qc8.dwconv[2].channel_tile = 2;
5857 xnn_params.qc8.dwconv[2].primary_tile = 25;
5858 }
5859 #endif // XNN_NO_QC8_OPERATORS
5860
5861 /**************************** QS8 WAsm micro-kernels****************************/
5862 #ifndef XNN_NO_QS8_OPERATORS
5863 init_flags |= XNN_INIT_FLAG_QS8;
5864
5865 if (hardware_config->is_x86) {
5866 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic);
5867 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_fp32_ukernel_2x2__scalar_imagic);
5868 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic);
5869 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic);
5870 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params;
5871 xnn_params.qs8.gemm.mr = 2;
5872 xnn_params.qs8.gemm.nr = 2;
5873 } else {
5874 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
5875 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
5876 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
5877 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
5878 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
5879 xnn_params.qs8.gemm.mr = 4;
5880 xnn_params.qs8.gemm.nr = 4;
5881 }
5882
5883 if (hardware_config->is_x86) {
5884 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_dwconv_minmax_fp32_ukernel_9p2c__scalar_imagic;
5885 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params;
5886 xnn_params.qs8.dwconv[0].channel_tile = 2;
5887 xnn_params.qs8.dwconv[0].primary_tile = 9;
5888 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_dwconv_minmax_fp32_ukernel_25p1c__scalar_imagic;
5889 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params;
5890 xnn_params.qs8.dwconv[1].channel_tile = 1;
5891 xnn_params.qs8.dwconv[1].primary_tile = 25;
5892 } else {
5893 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_dwconv_minmax_fp32_ukernel_9p2c__wasm_fmagic;
5894 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
5895 xnn_params.qs8.dwconv[0].channel_tile = 2;
5896 xnn_params.qs8.dwconv[0].primary_tile = 9;
5897 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_dwconv_minmax_fp32_ukernel_25p2c__wasm_fmagic;
5898 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
5899 xnn_params.qs8.dwconv[1].channel_tile = 2;
5900 xnn_params.qs8.dwconv[1].primary_tile = 25;
5901 }
5902
5903 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
5904 .unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4,
5905 .multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4,
5906 .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params,
5907 .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_scalar_imagic_params,
5908 .row_tile = 7,
5909 .channel_tile = 4,
5910 };
5911
5912
5913 if (hardware_config->is_x86) {
5914 xnn_params.qs8.lrelu = (struct vunary_parameters) {
5915 .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vlrelu_ukernel__scalar_select_x4,
5916 .init.qs8_lrelu = xnn_init_qs8_lrelu_scalar_select_params,
5917 .element_tile = 4,
5918 };
5919 } else {
5920 xnn_params.qs8.lrelu = (struct vunary_parameters) {
5921 .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vlrelu_ukernel__scalar_andxor_x4,
5922 .init.qs8_lrelu = xnn_init_qs8_lrelu_scalar_andxor_params,
5923 .element_tile = 4,
5924 };
5925 }
5926 #endif // XNN_NO_QS8_OPERATORS
5927
5928 /**************************** QU8 WAsm micro-kernels****************************/
5929 #ifndef XNN_NO_QU8_OPERATORS
5930 init_flags |= XNN_INIT_FLAG_QU8;
5931
5932 if (hardware_config->is_x86) {
5933 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic);
5934 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_fp32_ukernel_2x2__scalar_imagic);
5935 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic);
5936 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic);
5937 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params;
5938 xnn_params.qu8.gemm.mr = 2;
5939 xnn_params.qu8.gemm.nr = 2;
5940 } else {
5941 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
5942 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
5943 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
5944 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
5945 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
5946 xnn_params.qu8.gemm.mr = 4;
5947 xnn_params.qu8.gemm.nr = 4;
5948 }
5949
5950 if (hardware_config->is_x86) {
5951 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qu8_dwconv_minmax_fp32_ukernel_9p2c__scalar_imagic;
5952 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params;
5953 xnn_params.qu8.dwconv[0].channel_tile = 2;
5954 xnn_params.qu8.dwconv[0].primary_tile = 9;
5955 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qu8_dwconv_minmax_fp32_ukernel_25p1c__scalar_imagic;
5956 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params;
5957 xnn_params.qu8.dwconv[1].channel_tile = 1;
5958 xnn_params.qu8.dwconv[1].primary_tile = 25;
5959 } else {
5960 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qu8_dwconv_minmax_fp32_ukernel_9p2c__wasm_fmagic;
5961 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
5962 xnn_params.qu8.dwconv[0].channel_tile = 2;
5963 xnn_params.qu8.dwconv[0].primary_tile = 9;
5964 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qu8_dwconv_minmax_fp32_ukernel_25p2c__wasm_fmagic;
5965 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
5966 xnn_params.qu8.dwconv[1].channel_tile = 2;
5967 xnn_params.qu8.dwconv[1].primary_tile = 25;
5968 }
5969
5970 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
5971 .unipass = (xnn_avgpool_unipass_ukernel_fn) xnn_qu8_avgpool_minmax_fp32_ukernel_9x__scalar_imagic_c1,
5972 .multipass = (xnn_avgpool_multipass_ukernel_fn) xnn_qu8_avgpool_minmax_fp32_ukernel_9p8x__scalar_imagic_c1,
5973 .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params,
5974 .primary_tile = 9,
5975 .incremental_tile = 8,
5976 .channel_tile = 1,
5977 };
5978 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
5979 .unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4,
5980 .multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4,
5981 .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params,
5982 .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_scalar_imagic_params,
5983 .row_tile = 7,
5984 .channel_tile = 4,
5985 };
5986
5987 if (hardware_config->is_x86) {
5988 xnn_params.qu8.lrelu = (struct vunary_parameters) {
5989 .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vlrelu_ukernel__scalar_select_x4,
5990 .init.qu8_lrelu = xnn_init_qu8_lrelu_scalar_select_params,
5991 .element_tile = 4,
5992 };
5993 } else {
5994 xnn_params.qu8.lrelu = (struct vunary_parameters) {
5995 .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vlrelu_ukernel__scalar_andxor_x4,
5996 .init.qu8_lrelu = xnn_init_qu8_lrelu_scalar_andxor_params,
5997 .element_tile = 4,
5998 };
5999 }
6000 #endif // XNN_NO_QU8_OPERATORS
6001
6002 /**************************** S8 WAsm micro-kernels****************************/
6003 #ifndef XNN_NO_S8_OPERATORS
6004 init_flags |= XNN_INIT_FLAG_S8;
6005
6006 xnn_params.s8.clamp = (struct vunary_parameters) {
6007 .ukernel = (xnn_vunary_ukernel_fn) xnn_s8_vclamp_ukernel__scalar_x4,
6008 .init.s8_minmax = xnn_init_s8_minmax_scalar_params,
6009 .element_tile = 4,
6010 };
6011 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
6012 .ukernel = (xnn_ibilinear_ukernel_fn) xnn_s8_ibilinear_ukernel__scalar_c1,
6013 .pixel_tile = 1,
6014 .channel_tile = 1,
6015 };
6016 xnn_params.s8.maxpool = (struct maxpool_parameters) {
6017 .ukernel = (xnn_maxpool_ukernel_fn) xnn_s8_maxpool_minmax_ukernel_9p8x__scalar_c1,
6018 .init.s8 = xnn_init_s8_minmax_scalar_params,
6019 .mr = 9,
6020 .qr = 8,
6021 };
6022 #endif // XNN_NO_S8_OPERATORS
6023
6024 /**************************** U8 WAsm micro-kernels****************************/
6025 #ifndef XNN_NO_U8_OPERATORS
6026 init_flags |= XNN_INIT_FLAG_U8;
6027
6028 xnn_params.u8.clamp = (struct vunary_parameters) {
6029 .ukernel = (xnn_vunary_ukernel_fn) xnn_u8_vclamp_ukernel__scalar_x4,
6030 .init.u8_minmax = xnn_init_u8_minmax_scalar_params,
6031 .element_tile = 4,
6032 };
6033 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
6034 .ukernel = (xnn_ibilinear_ukernel_fn) xnn_u8_ibilinear_ukernel__scalar_c1,
6035 .pixel_tile = 1,
6036 .channel_tile = 1,
6037 };
6038 xnn_params.u8.maxpool = (struct maxpool_parameters) {
6039 .ukernel = (xnn_maxpool_ukernel_fn) xnn_u8_maxpool_minmax_ukernel_9p8x__scalar_c1,
6040 .init.u8 = xnn_init_u8_minmax_scalar_params,
6041 .mr = 9,
6042 .qr = 8,
6043 };
6044 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
6045 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
6046 #endif // XNN_NO_U8_OPERATORS
6047
6048 /**************************** X8 WAsm micro-kernels****************************/
6049 #ifndef XNN_NO_X8_OPERATORS
6050 init_flags |= XNN_INIT_FLAG_X8;
6051
6052 xnn_params.x8.zip = (struct zip_parameters) {
6053 .x2 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x2_ukernel__scalar,
6054 .x3 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x3_ukernel__scalar,
6055 .x4 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x4_ukernel__scalar,
6056 .xm = (xnn_zipv_ukernel_fn) xnn_x8_zip_xm_ukernel__scalar,
6057 };
6058 #endif // XNN_NO_X8_OPERATORS
6059
6060 /**************************** F32 WAsm micro-kernels****************************/
6061 #ifndef XNN_NO_F32_OPERATORS
6062 init_flags |= XNN_INIT_FLAG_F32;
6063
6064 if (hardware_config->is_x86) {
6065 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_2x4__scalar);
6066 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_2x4__scalar);
6067 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x4__wasm);
6068 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x4__wasm);
6069 xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_relu_ukernel_2x4__scalar);
6070 xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_relu_ukernel_2x4__scalar);
6071 xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_relu_ukernel_1x4__wasm);
6072 xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_relu_ukernel_1x4__wasm);
6073 xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_ukernel_2x4__scalar);
6074 xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_ukernel_2x4__scalar);
6075 xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_ukernel_1x4__scalar);
6076 xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_ukernel_1x4__scalar);
6077 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
6078 xnn_params.f32.gemm.mr = 2;
6079 xnn_params.f32.gemm.nr = 4;
6080 } else {
6081 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_4x4__wasm);
6082 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_4x4__wasm);
6083 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x4__wasm);
6084 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x4__wasm);
6085 xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_relu_ukernel_4x4__wasm);
6086 xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_relu_ukernel_4x4__wasm);
6087 xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_relu_ukernel_1x4__wasm);
6088 xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_relu_ukernel_1x4__wasm);
6089 xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_ukernel_4x4__scalar);
6090 xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_ukernel_4x4__scalar);
6091 xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_ukernel_1x4__scalar);
6092 xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_ukernel_1x4__scalar);
6093 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
6094 xnn_params.f32.gemm.mr = 4;
6095 xnn_params.f32.gemm.nr = 4;
6096 }
6097 xnn_params.f32.gemm2.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_4x2__wasm);
6098 xnn_params.f32.gemm2.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_4x2__wasm);
6099 xnn_params.f32.gemm2.linear.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_ukernel_4x2__scalar);
6100 xnn_params.f32.gemm2.linear.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_ukernel_4x2__scalar);
6101 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
6102 xnn_params.f32.gemm2.mr = 4;
6103 xnn_params.f32.gemm2.nr = 2;
6104
    // F32 depthwise convolutions: slots 0-3 cover 3-, 4-, 9- (3x3) and
    // 25-tap (5x5) filters.  minmax variants use WAsm kernels; the linear
    // (no-activation) variants use scalar kernels; "acc2" = 2 accumulators.
    xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_3p1c__wasm_acc2;
    xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_3p1c__scalar_acc2;
    xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
    xnn_params.f32.dwconv[0].channel_tile = 1;
    xnn_params.f32.dwconv[0].primary_tile = 3;

    xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_4p1c__wasm_acc2;
    xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_4p1c__scalar_acc2;
    xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
    xnn_params.f32.dwconv[1].channel_tile = 1;
    xnn_params.f32.dwconv[1].primary_tile = 4;

    xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_9p1c__wasm_acc2;
    xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_9p1c__scalar_acc2;
    xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
    xnn_params.f32.dwconv[2].channel_tile = 1;
    xnn_params.f32.dwconv[2].primary_tile = 9;

    xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_25p1c__wasm_acc2;
    xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_25p1c__scalar_acc2;
    xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
    xnn_params.f32.dwconv[3].channel_tile = 1;
    xnn_params.f32.dwconv[3].primary_tile = 25;
6128
    // F32 pooling family.  avgpool/pavgpool: 9-element unipass, +8 per
    // multipass increment; gavgpool: 7 rows per pass.
    xnn_params.f32.avgpool = (struct avgpool_parameters) {
      .unipass = (xnn_avgpool_unipass_ukernel_fn) xnn_f32_avgpool_minmax_ukernel_9x__wasm_c1,
      .multipass = (xnn_avgpool_multipass_ukernel_fn) xnn_f32_avgpool_minmax_ukernel_9p8x__wasm_c1,
      .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
      .primary_tile = 9,
      .incremental_tile = 8,
      .channel_tile = 1,
    };
    // Pixelwise average pooling (per-pixel scale, used for "count include
    // pad" style pooling).
    xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
      .unipass = (xnn_pavgpool_unipass_ukernel_fn) xnn_f32_pavgpool_minmax_ukernel_9x__wasm_c1,
      .multipass = (xnn_pavgpool_multipass_ukernel_fn) xnn_f32_pavgpool_minmax_ukernel_9p8x__wasm_c1,
      .init.f32 = xnn_init_f32_minmax_scalar_params,
      .primary_tile = 9,
      .incremental_tile = 8,
      .channel_tile = 1,
    };
    xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
      .unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_f32_gavgpool_minmax_ukernel_7x__wasm_c1,
      .multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1,
      .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
      .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
      .row_tile = 7,
      .channel_tile = 1,
    };
    xnn_params.f32.maxpool = (struct maxpool_parameters) {
      .ukernel = (xnn_maxpool_ukernel_fn) xnn_f32_maxpool_minmax_ukernel_9p8x__wasm_c1,
      .init.f32 = xnn_init_f32_minmax_scalar_params,
      .mr = 9,
      .qr = 8,
    };
    // Argmax pooling: slot 0 covers windows <= 4, slot 1 windows <= 9,
    // slot 2 is the multipass kernel for larger windows.
    xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
      .up = (xnn_argmaxpool_unipass_ukernel_fn) xnn_f32_argmaxpool_ukernel_4x__scalar_c1,
      .mr = 4,
    };
    xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
      .up = (xnn_argmaxpool_unipass_ukernel_fn) xnn_f32_argmaxpool_ukernel_9x__scalar_c1,
      .mr = 9,
    };
    xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
      .mp = (xnn_argmaxpool_multipass_ukernel_fn) xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1,
      .mr = 9,
      .qr = 8,
    };
    xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
      .ukernel = (xnn_ibilinear_ukernel_fn) xnn_f32_ibilinear_ukernel__scalar_c2,
      .pixel_tile = 1,
      .channel_tile = 2,
    };
    // F32 element-wise (vunary) kernels.  Where is_x86 branches exist, the
    // x86-hosted WAsm engine gets scalar variants and other hosts get __wasm
    // variants of the same operation; element_tile is the kernel's unroll.
    xnn_params.f32.abs = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vabs_ukernel__scalar_x4,
      .element_tile = 4,
    };
    xnn_params.f32.clamp = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vclamp_ukernel__wasm_x4,
      .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
      .element_tile = 4,
    };
    if (hardware_config->is_x86) {
      xnn_params.f32.hswish = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vhswish_ukernel__scalar_x4,
        .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
        .element_tile = 4,
      };
    } else {
      xnn_params.f32.hswish = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vhswish_ukernel__wasm_x4,
        .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
        .element_tile = 4,
      };
    }
    // ELU: the two branches use different polynomial approximations
    // (lut16_p3 vs p6), so their init-params functions differ too.
    if (hardware_config->is_x86) {
      xnn_params.f32.elu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x2,
        .init.f32_elu = xnn_init_f32_elu_scalar_rr2_lut16_p3_params,
        .element_tile = 2,
      };
    } else {
      xnn_params.f32.elu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_velu_ukernel__wasm_rr2_p6_x6,
        .init.f32_elu = xnn_init_f32_elu_scalar_rr2_p6_params,
        .element_tile = 6,
      };
    }
    xnn_params.f32.lrelu = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vlrelu_ukernel__scalar_x4,
      .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
      .element_tile = 4,
    };
    xnn_params.f32.neg = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vneg_ukernel__scalar_x4,
      .element_tile = 4,
    };
    if (hardware_config->is_x86) {
      xnn_params.f32.relu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrelu_ukernel__scalar_x8,
        .element_tile = 8,
      };
    } else {
      xnn_params.f32.relu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrelu_ukernel__wasm_x8,
        .element_tile = 8,
      };
    }
    // Rounding kernels delegate to libm (nearbyint/trunc/ceil/floor).
    xnn_params.f32.rndne = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndne_ukernel__scalar_libm_x4,
      .element_tile = 4,
    };
    xnn_params.f32.rndz = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndz_ukernel__scalar_libm_x4,
      .element_tile = 4,
    };
    xnn_params.f32.rndu = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndu_ukernel__scalar_libm_x4,
      .element_tile = 4,
    };
    xnn_params.f32.rndd = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndd_ukernel__scalar_libm_x4,
      .element_tile = 4,
    };
    xnn_params.f32.sigmoid = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsigmoid_ukernel__scalar_rr2_lut64_p2_div_x2,
      .init.f32_sigmoid = xnn_init_f32_sigmoid_scalar_rr2_lut64_p2_params,
      .element_tile = 2,
    };
    xnn_params.f32.sqr = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsqr_ukernel__scalar_x4,
      .element_tile = 4,
    };
    xnn_params.f32.sqrt = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1,
      .element_tile = 1,
    };
    if (hardware_config->is_x86) {
      xnn_params.f32.prelu = (struct prelu_parameters) {
        .ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__scalar_2x4,
        .row_tile = 2,
        .channel_tile = 4,
      };
    } else {
      xnn_params.f32.prelu = (struct prelu_parameters) {
        .ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__wasm_2x4,
        .row_tile = 2,
        .channel_tile = 4,
      };
    }
    // Softmax building block: sum of exp(x - max) with stored intermediates.
    xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
      .ukernel = (xnn_raddstoreexpminusmax_ukernel_fn) xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_x4_acc2,
      .init.f32 = xnn_init_f32_expminus_scalar_rr2_p5_params,
      .element_tile = 4,
    };
    xnn_params.f32.rmax = (xnn_rmax_ukernel_fn) xnn_f32_rmax_ukernel__scalar;
    // Fused multiply-add with per-channel scale and bias.
    xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
      .ukernel = (xnn_vmulcaddc_ukernel_fn) xnn_f32_vmulcaddc_minmax_ukernel_c1__wasm_2x,
      .init.f32 = xnn_init_f32_minmax_scalar_params,
      .channel_tile = 1,
      .row_tile = 2,
    };
    // NCHW (channels-first) operator support: sparse GEMM, HWC->CHW
    // convolution, CHW depthwise convolutions, and CHW pooling/resize.
    #ifndef XNN_NO_NCHW_OPERATORS
      init_flags |= XNN_INIT_FLAG_CHW_OPT;

      // Sparse matrix-dense matrix multiplication in nr = 1/2/4 variants.
      xnn_params.f32.spmm = (struct spmm_parameters) {
        .ukernel = (xnn_spmm_ukernel_fn) xnn_f32_spmm_minmax_ukernel_8x1__scalar,
        .init.f32 = xnn_init_f32_minmax_scalar_params,
        .mr = 8,
        .nr = 1,
      };
      xnn_params.f32.spmm2 = (struct spmm_parameters) {
        .ukernel = (xnn_spmm_ukernel_fn) xnn_f32_spmm_minmax_ukernel_8x2__scalar,
        .init.f32 = xnn_init_f32_minmax_scalar_params,
        .mr = 8,
        .nr = 2,
      };
      xnn_params.f32.spmm4 = (struct spmm_parameters) {
        .ukernel = (xnn_spmm_ukernel_fn) xnn_f32_spmm_minmax_ukernel_8x4__scalar,
        .init.f32 = xnn_init_f32_minmax_scalar_params,
        .mr = 8,
        .nr = 4,
      };
      // First-layer 3x3 stride-2 convolution converting HWC input to CHW.
      xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
        .ukernel_with_symm_padding =
          (xnn_conv_hwc2chw_ukernel_fn) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1,
        .init.f32 = xnn_init_f32_minmax_scalar_params,
        .output_channel_tile = 4,
        .output_height_tile = 1,
        .output_width_tile = 1,
      };
      xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2,
        .init.f32 = xnn_init_f32_chw_scalar_params,
        .output_height_tile = 2,
        .output_width_tile = 1,
      };
      xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_1x1_acc2,
        .init.f32 = xnn_init_f32_chw_scalar_params,
        .output_height_tile = 1,
        .output_width_tile = 1,
      };
      xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc5,
        .init.f32 = xnn_init_f32_chw_scalar_params,
        .output_height_tile = 1,
        .output_width_tile = 1,
      };
      xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc5,
        .init.f32 = xnn_init_f32_chw_scalar_params,
        .output_height_tile = 1,
        .output_width_tile = 1,
      };
      xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
        .ukernel = (xnn_gavgpool_cw_ukernel_fn) xnn_f32_gavgpool_cw_ukernel__scalar_x1,
        .channel_tile = 1,
      };
      xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
        .ukernel = (xnn_ibilinear_chw_ukernel_fn) xnn_f32_ibilinear_chw_ukernel__scalar_p4,
        .channel_tile = 1,
        .pixel_tile = 4,
      };
    #endif  // XNN_NO_NCHW_OPERATORS
  #endif  // XNN_NO_F32_OPERATORS
6350
6351 /*************************** VCVT WAsm micro-kernels***************************/
6352 #ifndef XNN_NO_VCVT_OPERATORS
6353 init_flags |= XNN_INIT_FLAG_VCVT;
6354
6355 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
6356 .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__scalar_x1,
6357 .init.f16_f32_cvt = xnn_init_f16_f32_cvt_scalar_params,
6358 .element_tile = 1,
6359 };
6360 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
6361 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_f16_vcvt_ukernel__scalar_bitcast_x4,
6362 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_scalar_bitcast_params,
6363 .element_tile = 4,
6364 };
6365 if (hardware_config->is_x86) {
6366 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
6367 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qu8_vcvt_ukernel__scalar_imagic_x1,
6368 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_imagic_params,
6369 .element_tile = 1,
6370 };
6371 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
6372 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qu8_vcvt_ukernel__scalar_imagic_x1,
6373 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_imagic_params,
6374 .element_tile = 1,
6375 };
6376 } else {
6377 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
6378 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qu8_vcvt_ukernel__wasm_fmagic_x4,
6379 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_fmagic_params,
6380 .element_tile = 4,
6381 };
6382 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
6383 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qu8_vcvt_ukernel__wasm_fmagic_x4,
6384 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_fmagic_params,
6385 .element_tile = 4,
6386 };
6387 }
6388 if (hardware_config->is_x86) {
6389 xnn_params.vcvt.qs8 = (struct vunary_parameters) {
6390 .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vcvt_ukernel__scalar_x1,
6391 .init.qs8_cvt = xnn_init_qs8_cvt_scalar_params,
6392 .element_tile = 1,
6393 };
6394 } else {
6395 xnn_params.vcvt.qs8 = (struct vunary_parameters) {
6396 .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vcvt_ukernel__scalar_x4,
6397 .init.qs8_cvt = xnn_init_qs8_cvt_scalar_params,
6398 .element_tile = 4,
6399 };
6400 }
6401 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
6402 .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_f32_vcvt_ukernel__scalar_x1,
6403 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_scalar_params,
6404 .element_tile = 1,
6405 };
6406 if (hardware_config->is_x86) {
6407 xnn_params.vcvt.qu8 = (struct vunary_parameters) {
6408 .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vcvt_ukernel__scalar_x1,
6409 .init.qu8_cvt = xnn_init_qu8_cvt_scalar_params,
6410 .element_tile = 1,
6411 };
6412 } else {
6413 xnn_params.vcvt.qu8 = (struct vunary_parameters) {
6414 .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vcvt_ukernel__scalar_x4,
6415 .init.qu8_cvt = xnn_init_qu8_cvt_scalar_params,
6416 .element_tile = 4,
6417 };
6418 }
6419 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
6420 .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_f32_vcvt_ukernel__scalar_x1,
6421 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params,
6422 .element_tile = 1,
6423 };
6424 #endif // XNN_NO_VCVT_OPERATORS
6425
6426 /**************************** X32 WAsm micro-kernels****************************/
6427 #ifndef XNN_NO_X32_OPERATORS
6428 init_flags |= XNN_INIT_FLAG_X32;
6429
6430 xnn_params.x32.unpool = (xnn_unpool_ukernel_fn) xnn_x32_unpool_ukernel__scalar;
6431 xnn_params.x32.zip = (struct zip_parameters) {
6432 .x2 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x2_ukernel__scalar,
6433 .x3 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x3_ukernel__scalar,
6434 .x4 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x4_ukernel__scalar,
6435 .xm = (xnn_zipv_ukernel_fn) xnn_x32_zip_xm_ukernel__scalar,
6436 };
6437 #endif // XNN_NO_X32_OPERATORS
6438
6439 /**************************** XX WAsm micro-kernels****************************/
6440 #ifndef XNN_NO_XX_OPERATORS
6441 init_flags |= XNN_INIT_FLAG_XX;
6442
6443 xnn_params.xx.copy = (xnn_vunary_ukernel_fn) xnn_xx_copy_ukernel__scalar_memcpy;
6444 xnn_params.xx.fill = (struct fill_parameters) {
6445 .ukernel = (xnn_fill_ukernel_fn) xnn_xx_fill_ukernel__scalar_x16,
6446 .row_tile = 1,
6447 };
6448 xnn_params.xx.pad = (struct pad_parameters) {
6449 .ukernel = (xnn_pad_ukernel_fn) xnn_xx_pad_ukernel__scalar,
6450 .row_tile = 1,
6451 };
6452 #endif
6453
6454#elif XNN_ARCH_RISCV
6455
6456 /************************** QC8 RISC-V micro-kernels **************************/
6457 #ifndef XNN_NO_QC8_OPERATORS
6458 init_flags |= XNN_INIT_FLAG_QC8;
6459
6460 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
6461 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
6462 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
6463 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qc8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
6464 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_lrintf_params;
6465 xnn_params.qc8.gemm.mr = 3;
6466 xnn_params.qc8.gemm.nr = 4;
6467
6468 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_3p2c__scalar_lrintf;
6469 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_lrintf_params;
6470 xnn_params.qc8.dwconv[0].channel_tile = 2;
6471 xnn_params.qc8.dwconv[0].primary_tile = 3;
6472 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_9p2c__scalar_lrintf;
6473 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_lrintf_params;
6474 xnn_params.qc8.dwconv[1].channel_tile = 2;
6475 xnn_params.qc8.dwconv[1].primary_tile = 9;
6476 xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qc8_dwconv_minmax_fp32_ukernel_25p2c__scalar_lrintf;
6477 xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_lrintf_params;
6478 xnn_params.qc8.dwconv[2].channel_tile = 2;
6479 xnn_params.qc8.dwconv[2].primary_tile = 25;
6480 #endif // XNN_NO_QS8_OPERATORS
6481
6482 /************************** QS8 RISC-V micro-kernels **************************/
6483 #ifndef XNN_NO_QS8_OPERATORS
6484 init_flags |= XNN_INIT_FLAG_QS8;
6485
6486 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
6487 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
6488 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
6489 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
6490 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params;
6491 xnn_params.qs8.gemm.mr = 3;
6492 xnn_params.qs8.gemm.nr = 4;
6493
6494 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_dwconv_minmax_fp32_ukernel_9p2c__scalar_lrintf;
6495 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params;
6496 xnn_params.qs8.dwconv[0].channel_tile = 2;
6497 xnn_params.qs8.dwconv[0].primary_tile = 9;
6498 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_dwconv_minmax_fp32_ukernel_25p2c__scalar_lrintf;
6499 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params;
6500 xnn_params.qs8.dwconv[1].channel_tile = 2;
6501 xnn_params.qs8.dwconv[1].primary_tile = 25;
6502
6503 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
6504 .unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1,
6505 .multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1,
6506 .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params,
6507 .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_scalar_imagic_params,
6508 .row_tile = 7,
6509 .channel_tile = 1,
6510 };
6511
6512 xnn_params.qs8.lrelu = (struct vunary_parameters) {
6513 .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vlrelu_ukernel__scalar_andxor_x4,
6514 .init.qs8_lrelu = xnn_init_qs8_lrelu_scalar_andxor_params,
6515 .element_tile = 4,
6516 };
6517 #endif // XNN_NO_QS8_OPERATORS
6518
6519 /************************** QU8 RISC-V micro-kernels **************************/
6520 #ifndef XNN_NO_QU8_OPERATORS
6521 init_flags |= XNN_INIT_FLAG_QU8;
6522
6523 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
6524 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
6525 xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
6526 xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
6527 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params;
6528 xnn_params.qu8.gemm.mr = 3;
6529 xnn_params.qu8.gemm.nr = 4;
6530
6531 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qu8_dwconv_minmax_fp32_ukernel_9p2c__scalar_lrintf;
6532 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params;
6533 xnn_params.qu8.dwconv[0].channel_tile = 2;
6534 xnn_params.qu8.dwconv[0].primary_tile = 9;
6535 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qu8_dwconv_minmax_fp32_ukernel_25p2c__scalar_lrintf;
6536 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params;
6537 xnn_params.qu8.dwconv[1].channel_tile = 2;
6538 xnn_params.qu8.dwconv[1].primary_tile = 25;
6539
6540 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
6541 .unipass = (xnn_avgpool_unipass_ukernel_fn) xnn_qu8_avgpool_minmax_fp32_ukernel_9x__scalar_imagic_c1,
6542 .multipass = (xnn_avgpool_multipass_ukernel_fn) xnn_qu8_avgpool_minmax_fp32_ukernel_9p8x__scalar_imagic_c1,
6543 .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params,
6544 .primary_tile = 9,
6545 .incremental_tile = 8,
6546 .channel_tile = 1,
6547 };
6548 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
6549 .unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1,
6550 .multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1,
6551 .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params,
6552 .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_scalar_imagic_params,
6553 .row_tile = 7,
6554 .channel_tile = 1,
6555 };
6556
6557 xnn_params.qu8.lrelu = (struct vunary_parameters) {
6558 .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vlrelu_ukernel__scalar_andxor_x4,
6559 .init.qu8_lrelu = xnn_init_qu8_lrelu_scalar_andxor_params,
6560 .element_tile = 4,
6561 };
6562 #endif // XNN_NO_QU8_OPERATORS
6563
6564 /************************** S8 RISC-V micro-kernels ***************************/
6565 #ifndef XNN_NO_S8_OPERATORS
6566 init_flags |= XNN_INIT_FLAG_S8;
6567
6568 xnn_params.s8.clamp = (struct vunary_parameters) {
6569 .ukernel = (xnn_vunary_ukernel_fn) xnn_s8_vclamp_ukernel__scalar_x4,
6570 .init.s8_minmax = xnn_init_s8_minmax_scalar_params,
6571 .element_tile = 4,
6572 };
6573 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
6574 .ukernel = (xnn_ibilinear_ukernel_fn) xnn_s8_ibilinear_ukernel__scalar_c1,
6575 .pixel_tile = 1,
6576 .channel_tile = 1,
6577 };
6578 xnn_params.s8.maxpool = (struct maxpool_parameters) {
6579 .ukernel = (xnn_maxpool_ukernel_fn) xnn_s8_maxpool_minmax_ukernel_9p8x__scalar_c1,
6580 .init.s8 = xnn_init_s8_minmax_scalar_params,
6581 .mr = 9,
6582 .qr = 8,
6583 };
6584 #endif // XNN_NO_S8_OPERATORS
6585
6586 /************************** U8 RISC-V micro-kernels ***************************/
6587 #ifndef XNN_NO_U8_OPERATORS
6588 init_flags |= XNN_INIT_FLAG_U8;
6589
6590 xnn_params.u8.clamp = (struct vunary_parameters) {
6591 .ukernel = (xnn_vunary_ukernel_fn) xnn_u8_vclamp_ukernel__scalar_x4,
6592 .init.u8_minmax = xnn_init_u8_minmax_scalar_params,
6593 .element_tile = 4,
6594 };
6595 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
6596 .ukernel = (xnn_ibilinear_ukernel_fn) xnn_u8_ibilinear_ukernel__scalar_c1,
6597 .pixel_tile = 1,
6598 .channel_tile = 1,
6599 };
6600 xnn_params.u8.maxpool = (struct maxpool_parameters) {
6601 .ukernel = (xnn_maxpool_ukernel_fn) xnn_u8_maxpool_minmax_ukernel_9p8x__scalar_c1,
6602 .init.u8 = xnn_init_u8_minmax_scalar_params,
6603 .mr = 9,
6604 .qr = 8,
6605 };
6606 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
6607 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
6608 #endif // XNN_NO_U8_OPERATORS
6609
6610 /************************** X8 RISC-V micro-kernels ***************************/
6611 #ifndef XNN_NO_X8_OPERATORS
6612 init_flags |= XNN_INIT_FLAG_X8;
6613
6614 xnn_params.x8.zip = (struct zip_parameters) {
6615 .x2 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x2_ukernel__scalar,
6616 .x3 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x3_ukernel__scalar,
6617 .x4 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x4_ukernel__scalar,
6618 .xm = (xnn_zipv_ukernel_fn) xnn_x8_zip_xm_ukernel__scalar,
6619 };
6620 #endif // XNN_NO_X8_OPERATORS
6621
6622 /************************** F32 RISC-V micro-kernels **************************/
6623 #ifndef XNN_NO_F32_OPERATORS
6624 init_flags |= XNN_INIT_FLAG_F32;
6625
6626 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_4x4__scalar);
6627 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_4x4__scalar);
6628 xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x4__scalar);
6629 xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x4__scalar);
6630 xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_relu_ukernel_4x4__scalar);
6631 xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_relu_ukernel_4x4__scalar);
6632 xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_relu_ukernel_1x4__scalar);
6633 xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_relu_ukernel_1x4__scalar);
6634 xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_ukernel_4x4__scalar);
6635 xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_ukernel_4x4__scalar);
6636 xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_ukernel_1x4__scalar);
6637 xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_ukernel_1x4__scalar);
6638 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
6639 xnn_params.f32.gemm.mr = 4;
6640 xnn_params.f32.gemm.nr = 4;
6641
6642 xnn_params.f32.gemm2.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_4x2__scalar);
6643 xnn_params.f32.gemm2.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_4x2__scalar);
6644 xnn_params.f32.gemm2.linear.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_ukernel_4x2__scalar);
6645 xnn_params.f32.gemm2.linear.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_ukernel_4x2__scalar);
6646 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
6647 xnn_params.f32.gemm2.mr = 4;
6648 xnn_params.f32.gemm2.nr = 2;
6649
6650 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_3p1c__scalar_acc2;
6651 xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_3p1c__scalar_acc2;
6652 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
6653 xnn_params.f32.dwconv[0].channel_tile = 1;
6654 xnn_params.f32.dwconv[0].primary_tile = 3;
6655
6656 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_4p1c__scalar_acc2;
6657 xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_4p1c__scalar_acc2;
6658 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
6659 xnn_params.f32.dwconv[1].channel_tile = 1;
6660 xnn_params.f32.dwconv[1].primary_tile = 4;
6661
6662 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_9p1c__scalar_acc2;
6663 xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_9p1c__scalar_acc2;
6664 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
6665 xnn_params.f32.dwconv[2].channel_tile = 1;
6666 xnn_params.f32.dwconv[2].primary_tile = 9;
6667
6668 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_25p1c__scalar_acc2;
6669 xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_25p1c__scalar_acc2;
6670 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
6671 xnn_params.f32.dwconv[3].channel_tile = 1;
6672 xnn_params.f32.dwconv[3].primary_tile = 25;
6673
6674 xnn_params.f32.avgpool = (struct avgpool_parameters) {
6675 .unipass = (xnn_avgpool_unipass_ukernel_fn) xnn_f32_avgpool_minmax_ukernel_9x__scalar_c1,
6676 .multipass = (xnn_avgpool_multipass_ukernel_fn) xnn_f32_avgpool_minmax_ukernel_9p8x__scalar_c1,
6677 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
6678 .primary_tile = 9,
6679 .incremental_tile = 8,
6680 .channel_tile = 1,
6681 };
6682 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
6683 .unipass = (xnn_pavgpool_unipass_ukernel_fn) xnn_f32_pavgpool_minmax_ukernel_9x__scalar_c1,
6684 .multipass = (xnn_pavgpool_multipass_ukernel_fn) xnn_f32_pavgpool_minmax_ukernel_9p8x__scalar_c1,
6685 .init.f32 = xnn_init_f32_minmax_scalar_params,
6686 .primary_tile = 9,
6687 .incremental_tile = 8,
6688 .channel_tile = 1,
6689 };
6690 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
6691 .unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_f32_gavgpool_minmax_ukernel_7x__scalar_c1,
6692 .multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1,
6693 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
6694 .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
6695 .row_tile = 7,
6696 .channel_tile = 1,
6697 };
6698 xnn_params.f32.maxpool = (struct maxpool_parameters) {
6699 .ukernel = (xnn_maxpool_ukernel_fn) xnn_f32_maxpool_minmax_ukernel_9p8x__scalar_c1,
6700 .init.f32 = xnn_init_f32_minmax_scalar_params,
6701 .mr = 9,
6702 .qr = 8,
6703 };
6704 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
6705 .up = (xnn_argmaxpool_unipass_ukernel_fn) xnn_f32_argmaxpool_ukernel_4x__scalar_c1,
6706 .mr = 4,
6707 };
6708 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
6709 .up = (xnn_argmaxpool_unipass_ukernel_fn) xnn_f32_argmaxpool_ukernel_9x__scalar_c1,
6710 .mr = 9,
6711 };
6712 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
6713 .mp = (xnn_argmaxpool_multipass_ukernel_fn) xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1,
6714 .mr = 9,
6715 .qr = 8,
6716 };
6717 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
6718 .ukernel = (xnn_ibilinear_ukernel_fn) xnn_f32_ibilinear_ukernel__scalar_c2,
6719 .pixel_tile = 1,
6720 .channel_tile = 2,
6721 };
6722 xnn_params.f32.abs = (struct vunary_parameters) {
6723 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vabs_ukernel__scalar_x4,
6724 .element_tile = 4,
6725 };
6726 xnn_params.f32.clamp = (struct vunary_parameters) {
6727 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vclamp_ukernel__scalar_x4,
6728 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
6729 .element_tile = 4,
6730 };
6731 xnn_params.f32.elu = (struct vunary_parameters) {
6732 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x4,
6733 .init.f32_elu = xnn_init_f32_elu_scalar_rr2_lut16_p3_params,
6734 .element_tile = 4,
6735 };
6736 xnn_params.f32.hswish = (struct vunary_parameters) {
6737 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vhswish_ukernel__scalar_x4,
6738 .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
6739 .element_tile = 4,
6740 };
6741 xnn_params.f32.lrelu = (struct vunary_parameters) {
6742 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vlrelu_ukernel__scalar_x4,
6743 .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
6744 .element_tile = 4,
6745 };
6746 xnn_params.f32.neg = (struct vunary_parameters) {
6747 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vneg_ukernel__scalar_x4,
6748 .element_tile = 4,
6749 };
6750 xnn_params.f32.rndne = (struct vunary_parameters) {
6751 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndne_ukernel__scalar_libm_x1,
6752 .element_tile = 1,
6753 };
6754 xnn_params.f32.rndz = (struct vunary_parameters) {
6755 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndz_ukernel__scalar_libm_x1,
6756 .element_tile = 1,
6757 };
6758 xnn_params.f32.rndu = (struct vunary_parameters) {
6759 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndu_ukernel__scalar_libm_x1,
6760 .element_tile = 1,
6761 };
6762 xnn_params.f32.rndd = (struct vunary_parameters) {
6763 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndd_ukernel__scalar_libm_x1,
6764 .element_tile = 1,
6765 };
6766 xnn_params.f32.sigmoid = (struct vunary_parameters) {
6767 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsigmoid_ukernel__scalar_rr2_lut64_p2_div_x2,
6768 .init.f32_sigmoid = xnn_init_f32_sigmoid_scalar_rr2_lut64_p2_params,
6769 .element_tile = 2,
6770 };
6771 xnn_params.f32.sqr = (struct vunary_parameters) {
6772 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsqr_ukernel__scalar_x4,
6773 .element_tile = 4,
6774 };
6775 xnn_params.f32.sqrt = (struct vunary_parameters) {
6776 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1,
6777 .element_tile = 1,
6778 };
6779 xnn_params.f32.prelu = (struct prelu_parameters) {
6780 .ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__scalar_2x4,
6781 .row_tile = 4,
6782 .channel_tile = 4,
6783 };
6784 xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
6785 .ukernel = (xnn_raddstoreexpminusmax_ukernel_fn) xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_x4_acc2,
6786 .init.f32 = xnn_init_f32_expminus_scalar_rr2_p5_params,
6787 .element_tile = 4,
6788 };
6789 xnn_params.f32.rmax = (xnn_rmax_ukernel_fn) xnn_f32_rmax_ukernel__scalar;
6790 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
6791 .ukernel = (xnn_vmulcaddc_ukernel_fn) xnn_f32_vmulcaddc_minmax_ukernel_c1__scalar_2x,
6792 .init.f32 = xnn_init_f32_minmax_scalar_params,
6793 .channel_tile = 1,
6794 .row_tile = 2,
6795 };
6796 #ifndef XNN_NO_NCHW_OPERATORS
6797 init_flags |= XNN_INIT_FLAG_CHW_OPT;
6798
6799 xnn_params.f32.spmm = (struct spmm_parameters) {
6800 .ukernel = (xnn_spmm_ukernel_fn) xnn_f32_spmm_minmax_ukernel_8x1__scalar,
6801 .init.f32 = xnn_init_f32_minmax_scalar_params,
6802 .mr = 8,
6803 .nr = 1,
6804 };
6805 xnn_params.f32.spmm2 = (struct spmm_parameters) {
6806 .ukernel = (xnn_spmm_ukernel_fn) xnn_f32_spmm_minmax_ukernel_8x2__scalar,
6807 .init.f32 = xnn_init_f32_minmax_scalar_params,
6808 .mr = 8,
6809 .nr = 2,
6810 };
6811 xnn_params.f32.spmm4 = (struct spmm_parameters) {
6812 .ukernel = (xnn_spmm_ukernel_fn) xnn_f32_spmm_minmax_ukernel_8x4__scalar,
6813 .init.f32 = xnn_init_f32_minmax_scalar_params,
6814 .mr = 8,
6815 .nr = 4,
6816 };
6817 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
6818 .ukernel_with_symm_padding =
6819 (xnn_conv_hwc2chw_ukernel_fn) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1,
6820 .init.f32 = xnn_init_f32_minmax_scalar_params,
6821 .output_channel_tile = 4,
6822 .output_height_tile = 1,
6823 .output_width_tile = 1,
6824 };
6825 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
6826 .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2,
6827 .init.f32 = xnn_init_f32_chw_scalar_params,
6828 .output_height_tile = 2,
6829 .output_width_tile = 1,
6830 };
6831 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
6832 .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_1x1_acc2,
6833 .init.f32 = xnn_init_f32_chw_scalar_params,
6834 .output_height_tile = 1,
6835 .output_width_tile = 1,
6836 };
6837 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
6838 .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc5,
6839 .init.f32 = xnn_init_f32_chw_scalar_params,
6840 .output_height_tile = 1,
6841 .output_width_tile = 1,
6842 };
6843 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
6844 .ukernel = (xnn_dwconv2d_chw_ukernel_fn) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc5,
6845 .init.f32 = xnn_init_f32_chw_scalar_params,
6846 .output_height_tile = 1,
6847 .output_width_tile = 1,
6848 };
6849 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
6850 .ukernel = (xnn_gavgpool_cw_ukernel_fn) xnn_f32_gavgpool_cw_ukernel__scalar_x1,
6851 .channel_tile = 1,
6852 };
6853 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
6854 .ukernel = (xnn_ibilinear_chw_ukernel_fn) xnn_f32_ibilinear_chw_ukernel__scalar_p4,
6855 .channel_tile = 1,
6856 .pixel_tile = 4,
6857 };
6858 #endif // XNN_NO_NCHW_OPERATORS
6859 #endif // XNN_NO_F32_OPERATORS
6860
6861 /************************** VCVT RISC-V micro-kernels *************************/
6862 #ifndef XNN_NO_VCVT_OPERATORS
6863 init_flags |= XNN_INIT_FLAG_VCVT;
6864
6865 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
6866 .ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__scalar_x4,
6867 .init.f16_f32_cvt = xnn_init_f16_f32_cvt_scalar_params,
6868 .element_tile = 4,
6869 };
6870 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
6871 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_f16_vcvt_ukernel__scalar_fabsf_x2,
6872 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_scalar_fabsf_params,
6873 .element_tile = 2,
6874 };
6875 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
6876 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qs8_vcvt_ukernel__scalar_lrintf_x4,
6877 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_lrintf_params,
6878 .element_tile = 4,
6879 };
6880 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
6881 .ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qu8_vcvt_ukernel__scalar_lrintf_x4,
6882 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_lrintf_params,
6883 .element_tile = 4,
6884 };
6885 xnn_params.vcvt.qs8 = (struct vunary_parameters) {
6886 .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vcvt_ukernel__scalar_x4,
6887 .init.qs8_cvt = xnn_init_qs8_cvt_scalar_params,
6888 .element_tile = 4,
6889 };
6890 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
6891 .ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_f32_vcvt_ukernel__scalar_x4,
6892 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_scalar_params,
6893 .element_tile = 4,
6894 };
6895 xnn_params.vcvt.qu8 = (struct vunary_parameters) {
6896 .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vcvt_ukernel__scalar_x4,
6897 .init.qu8_cvt = xnn_init_qu8_cvt_scalar_params,
6898 .element_tile = 4,
6899 };
6900 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
6901 .ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_f32_vcvt_ukernel__scalar_x4,
6902 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params,
6903 .element_tile = 4,
6904 };
6905 #endif // XNN_NO_VCVT_OPERATORS
6906
6907 /************************** X32 RISC-V micro-kernels **************************/
6908 #ifndef XNN_NO_X32_OPERATORS
6909 init_flags |= XNN_INIT_FLAG_X32;
6910
6911 xnn_params.x32.unpool = (xnn_unpool_ukernel_fn) xnn_x32_unpool_ukernel__scalar;
6912 xnn_params.x32.zip = (struct zip_parameters) {
6913 .x2 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x2_ukernel__scalar,
6914 .x3 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x3_ukernel__scalar,
6915 .x4 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x4_ukernel__scalar,
6916 .xm = (xnn_zipv_ukernel_fn) xnn_x32_zip_xm_ukernel__scalar,
6917 };
6918 #endif // XNN_NO_X32_OPERATORS
6919
6920 /************************** XX RISC-V micro-kernels ***************************/
6921 #ifndef XNN_NO_XX_OPERATORS
6922 init_flags |= XNN_INIT_FLAG_XX;
6923
6924 xnn_params.xx.copy = (xnn_vunary_ukernel_fn) xnn_xx_copy_ukernel__scalar_memcpy;
6925 xnn_params.xx.fill = (struct fill_parameters) {
6926 .ukernel = (xnn_fill_ukernel_fn) xnn_xx_fill_ukernel__scalar_x16,
6927 .row_tile = 1,
6928 };
6929 xnn_params.xx.pad = (struct pad_parameters) {
6930 .ukernel = (xnn_pad_ukernel_fn) xnn_xx_pad_ukernel__scalar,
6931 .row_tile = 1,
6932 };
6933 #endif // XNN_NO_XX_OPERATORS
6934
6935#else
6936 #error "Unsupported architecture"
6937#endif
6938
6939 memcpy(&xnn_params.allocator, init_allocator, sizeof(struct xnn_allocator));
6940 xnn_params.init_flags = init_flags;
6941}
6942
#if XNN_PLATFORM_WINDOWS
  // InitOnceExecuteOnce callback: adapts the parameterless file-scope init()
  // routine to the Windows one-time-initialization callback signature.
  // The three parameters are required by the InitOnceExecuteOnce API but are
  // unused here; returning TRUE tells the API that initialization succeeded
  // and must not be retried.
  static BOOL CALLBACK init_windows(PINIT_ONCE init_once, PVOID parameter, PVOID* context) {
    init();
    return TRUE;
  }
#endif
6949
// Initializes the XNNPACK library: detects the hardware configuration, latches
// the memory allocator, and runs the one-time micro-kernel table setup.
//
// Safe to call repeatedly and from multiple threads: the allocator is latched
// with an atomic compare-and-swap (only the first caller's allocator takes
// effect), and init() runs exactly once via a platform once-guard.
//
// allocator - custom allocator callbacks to use for internal allocations, or
//             NULL to use xnn_default_allocator.
// Returns xnn_status_success on success, or xnn_status_unsupported_hardware if
// hardware detection failed or init() did not set XNN_INIT_FLAG_XNNPACK.
enum xnn_status xnn_initialize(const struct xnn_allocator* allocator) {
  const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
  if (hardware_config == NULL) {
    xnn_log_error("XNNPACK initialization failed: hardware not supported");
    return xnn_status_unsupported_hardware;
  }

  if (allocator == NULL) {
    allocator = &xnn_default_allocator;
  }
  // Atomically publish the allocator only if none has been set yet, so a
  // concurrent second caller cannot replace the allocator used by init().
  #ifdef _MSC_VER
    _InterlockedCompareExchangePointer((PVOID volatile*) &init_allocator, (PVOID) allocator, NULL);
  #else
    __sync_bool_compare_and_swap(&init_allocator, NULL, allocator);
  #endif
  // Run init() exactly once, regardless of how many threads race here.
  #if XNN_PLATFORM_WINDOWS
    InitOnceExecuteOnce(&init_guard, &init_windows, NULL, NULL);
  #else
    pthread_once(&init_guard, &init);
  #endif
  // init() records its outcome in xnn_params.init_flags; XNN_INIT_FLAG_XNNPACK
  // indicates the library was fully initialized for this architecture.
  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) != 0) {
    return xnn_status_success;
  } else {
    return xnn_status_unsupported_hardware;
  }
}
6976
// Deinitializes the XNNPACK library. Currently a no-op that always reports
// success; global state initialized by xnn_initialize() is simply retained.
enum xnn_status xnn_deinitialize(void) {
  return xnn_status_success;
}
6980