1 | // Copyright 2022 Google LLC |
2 | // |
3 | // This source code is licensed under the BSD-style license found in the |
4 | // LICENSE file in the root directory of this source tree. |
5 | |
6 | #include <assert.h> |
7 | #include <stddef.h> |
8 | |
9 | #ifdef _WIN32 |
10 | #include <windows.h> |
11 | #else |
12 | #include <pthread.h> |
13 | #endif |
14 | |
15 | #include <xnnpack/common.h> |
16 | #include <xnnpack/config.h> |
17 | #include <xnnpack/microparams-init.h> |
18 | #include <xnnpack/vbinary.h> |
19 | #include <xnnpack/vadd.h> |
20 | #include <xnnpack/vmul.h> |
21 | |
22 | static struct xnn_binary_elementwise_config f16_vadd_config = {0}; |
23 | static struct xnn_binary_elementwise_config f16_vdiv_config = {0}; |
24 | static struct xnn_binary_elementwise_config f16_vmax_config = {0}; |
25 | static struct xnn_binary_elementwise_config f16_vmin_config = {0}; |
26 | static struct xnn_binary_elementwise_config f16_vmul_config = {0}; |
27 | static struct xnn_binary_elementwise_config f16_vsub_config = {0}; |
28 | static struct xnn_binary_elementwise_config f16_vsqrdiff_config = {0}; |
29 | |
30 | static struct xnn_binary_elementwise_config f32_vadd_config = {0}; |
31 | static struct xnn_binary_elementwise_config f32_vdiv_config = {0}; |
32 | static struct xnn_binary_elementwise_config f32_vmax_config = {0}; |
33 | static struct xnn_binary_elementwise_config f32_vmin_config = {0}; |
34 | static struct xnn_binary_elementwise_config f32_vmul_config = {0}; |
35 | static struct xnn_binary_elementwise_config f32_vsub_config = {0}; |
36 | static struct xnn_binary_elementwise_config f32_vsqrdiff_config = {0}; |
37 | |
38 | static struct xnn_binary_elementwise_config qs8_vadd_config = {0}; |
39 | static struct xnn_binary_elementwise_config qs8_vmul_config = {0}; |
40 | |
41 | static struct xnn_binary_elementwise_config qu8_vadd_config = {0}; |
42 | static struct xnn_binary_elementwise_config qu8_vmul_config = {0}; |
43 | |
// One-time-initialization guards, one per configuration table.
// The platform only decides the guard type and its static initializer, so
// that choice is made once here and the guards themselves are declared a
// single time below.
#if XNN_PLATFORM_WINDOWS
typedef INIT_ONCE binary_config_init_guard_t;
#define BINARY_CONFIG_INIT_GUARD_INIT INIT_ONCE_STATIC_INIT
#else
typedef pthread_once_t binary_config_init_guard_t;
#define BINARY_CONFIG_INIT_GUARD_INIT PTHREAD_ONCE_INIT
#endif

static binary_config_init_guard_t init_guard_f16_vadd = BINARY_CONFIG_INIT_GUARD_INIT;
static binary_config_init_guard_t init_guard_f16_vdiv = BINARY_CONFIG_INIT_GUARD_INIT;
static binary_config_init_guard_t init_guard_f16_vmax = BINARY_CONFIG_INIT_GUARD_INIT;
static binary_config_init_guard_t init_guard_f16_vmin = BINARY_CONFIG_INIT_GUARD_INIT;
static binary_config_init_guard_t init_guard_f16_vmul = BINARY_CONFIG_INIT_GUARD_INIT;
static binary_config_init_guard_t init_guard_f16_vsub = BINARY_CONFIG_INIT_GUARD_INIT;
static binary_config_init_guard_t init_guard_f16_vsqrdiff = BINARY_CONFIG_INIT_GUARD_INIT;
static binary_config_init_guard_t init_guard_f32_vadd = BINARY_CONFIG_INIT_GUARD_INIT;
static binary_config_init_guard_t init_guard_f32_vdiv = BINARY_CONFIG_INIT_GUARD_INIT;
static binary_config_init_guard_t init_guard_f32_vmax = BINARY_CONFIG_INIT_GUARD_INIT;
static binary_config_init_guard_t init_guard_f32_vmin = BINARY_CONFIG_INIT_GUARD_INIT;
static binary_config_init_guard_t init_guard_f32_vmul = BINARY_CONFIG_INIT_GUARD_INIT;
static binary_config_init_guard_t init_guard_f32_vsub = BINARY_CONFIG_INIT_GUARD_INIT;
static binary_config_init_guard_t init_guard_f32_vsqrdiff = BINARY_CONFIG_INIT_GUARD_INIT;
static binary_config_init_guard_t init_guard_qs8_vadd = BINARY_CONFIG_INIT_GUARD_INIT;
static binary_config_init_guard_t init_guard_qs8_vmul = BINARY_CONFIG_INIT_GUARD_INIT;
static binary_config_init_guard_t init_guard_qu8_vadd = BINARY_CONFIG_INIT_GUARD_INIT;
static binary_config_init_guard_t init_guard_qu8_vmul = BINARY_CONFIG_INIT_GUARD_INIT;
83 | |
84 | |
// Fills f16_vadd_config with the f16 addition microkernels for the build
// target, gated on the CPU features reported by xnn_init_hardware_config().
// When no branch is compiled in, or the required feature flag is not set,
// the config is left unmodified.
static void init_f16_vadd_config(void) {
#if (XNN_ARCH_ARM && XNN_ENABLE_ARM_FP16_VECTOR && XNN_ENABLE_ARM_FP16_SCALAR) || \
    (XNN_ARCH_ARM64 && XNN_ENABLE_ARM_FP16_VECTOR)
  // 32-bit and 64-bit ARM builds select the same NEON FP16 kernels.
  const struct xnn_hardware_config* hw = xnn_init_hardware_config();
  assert(hw != NULL);
  if (!hw->use_arm_neon_fp16_arith) {
    return;
  }
  struct xnn_binary_elementwise_config* cfg = &f16_vadd_config;
  cfg->element_tile = 16;
  cfg->init.f16_minmax = xnn_init_f16_minmax_fp16arith_params;
  cfg->minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vadd_minmax_ukernel__neonfp16arith_x16;
  // The opc and ropc slots both use the vaddc kernel.
  cfg->minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vaddc_minmax_ukernel__neonfp16arith_x16;
  cfg->minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vaddc_minmax_ukernel__neonfp16arith_x16;
#elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE
  const struct xnn_hardware_config* hw = xnn_init_hardware_config();
  assert(hw != NULL);
  if (!hw->use_x86_avx2) {
    return;
  }
  struct xnn_binary_elementwise_config* cfg = &f16_vadd_config;
  cfg->element_tile = 16;
  cfg->init.f16_minmax = xnn_init_f16_minmax_avx_params;
  cfg->minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vadd_minmax_ukernel__f16c_x16;
  // The opc and ropc slots both use the vaddc kernel.
  cfg->minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vaddc_minmax_ukernel__f16c_x16;
  cfg->minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vaddc_minmax_ukernel__f16c_x16;
#endif
}
118 | |
// Fills f16_vdiv_config with the f16 division microkernels for the build
// target, gated on the CPU features reported by xnn_init_hardware_config().
// When no branch is compiled in, or the required feature flag is not set,
// the config is left unmodified.
static void init_f16_vdiv_config(void) {
#if XNN_ARCH_ARM && XNN_ENABLE_ARM_FP16_VECTOR && XNN_ENABLE_ARM_FP16_SCALAR
  const struct xnn_hardware_config* hw = xnn_init_hardware_config();
  assert(hw != NULL);
  if (!hw->use_arm_neon_fp16_arith) {
    return;
  }
  // 32-bit ARM selects the 2-element fp16arith kernels rather than NEON ones.
  struct xnn_binary_elementwise_config* cfg = &f16_vdiv_config;
  cfg->element_tile = 2;
  cfg->init.f16_minmax = xnn_init_f16_minmax_fp16arith_params;
  cfg->minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vdiv_minmax_ukernel__fp16arith_x2;
  cfg->minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vdivc_minmax_ukernel__fp16arith_x2;
  cfg->minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vrdivc_minmax_ukernel__fp16arith_x2;
#elif XNN_ARCH_ARM64 && XNN_ENABLE_ARM_FP16_VECTOR
  const struct xnn_hardware_config* hw = xnn_init_hardware_config();
  assert(hw != NULL);
  if (!hw->use_arm_neon_fp16_arith) {
    return;
  }
  struct xnn_binary_elementwise_config* cfg = &f16_vdiv_config;
  cfg->element_tile = 8;
  cfg->init.f16_minmax = xnn_init_f16_minmax_fp16arith_params;
  cfg->minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vdiv_minmax_ukernel__aarch64_neonfp16arith_x8;
  cfg->minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vdivc_minmax_ukernel__aarch64_neonfp16arith_x8;
  cfg->minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vrdivc_minmax_ukernel__aarch64_neonfp16arith_x8;
#elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE
  const struct xnn_hardware_config* hw = xnn_init_hardware_config();
  assert(hw != NULL);
  if (!hw->use_x86_avx2) {
    return;
  }
  struct xnn_binary_elementwise_config* cfg = &f16_vdiv_config;
  cfg->element_tile = 8;
  cfg->init.f16_minmax = xnn_init_f16_minmax_avx_params;
  cfg->minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vdiv_minmax_ukernel__f16c_x8;
  cfg->minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vdivc_minmax_ukernel__f16c_x8;
  cfg->minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vrdivc_minmax_ukernel__f16c_x8;
#endif
}
152 | |
// Fills f16_vmax_config with the f16 element-wise maximum microkernels for
// the build target, gated on the CPU features reported by
// xnn_init_hardware_config(). No params-init function is registered for this
// operator. When no branch is compiled in, or the required feature flag is
// not set, the config is left unmodified.
static void init_f16_vmax_config(void) {
#if (XNN_ARCH_ARM && XNN_ENABLE_ARM_FP16_VECTOR && XNN_ENABLE_ARM_FP16_SCALAR) || \
    (XNN_ARCH_ARM64 && XNN_ENABLE_ARM_FP16_VECTOR)
  // 32-bit and 64-bit ARM builds select the same NEON FP16 kernels.
  const struct xnn_hardware_config* hw = xnn_init_hardware_config();
  assert(hw != NULL);
  if (!hw->use_arm_neon_fp16_arith) {
    return;
  }
  struct xnn_binary_elementwise_config* cfg = &f16_vmax_config;
  cfg->element_tile = 16;
  cfg->minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmax_ukernel__neonfp16arith_x16;
  // The opc and ropc slots both use the vmaxc kernel.
  cfg->minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmaxc_ukernel__neonfp16arith_x16;
  cfg->minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmaxc_ukernel__neonfp16arith_x16;
#elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE
  const struct xnn_hardware_config* hw = xnn_init_hardware_config();
  assert(hw != NULL);
  if (!hw->use_x86_avx2) {
    return;
  }
  struct xnn_binary_elementwise_config* cfg = &f16_vmax_config;
  cfg->element_tile = 16;
  cfg->minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmax_ukernel__f16c_x16;
  // The opc and ropc slots both use the vmaxc kernel.
  cfg->minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmaxc_ukernel__f16c_x16;
  cfg->minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmaxc_ukernel__f16c_x16;
#endif
}
183 | |
// Fills f16_vmin_config with the f16 element-wise minimum microkernels for
// the build target, gated on the CPU features reported by
// xnn_init_hardware_config(). No params-init function is registered for this
// operator. When no branch is compiled in, or the required feature flag is
// not set, the config is left unmodified.
static void init_f16_vmin_config(void) {
#if (XNN_ARCH_ARM && XNN_ENABLE_ARM_FP16_VECTOR && XNN_ENABLE_ARM_FP16_SCALAR) || \
    (XNN_ARCH_ARM64 && XNN_ENABLE_ARM_FP16_VECTOR)
  // 32-bit and 64-bit ARM builds select the same NEON FP16 kernels.
  const struct xnn_hardware_config* hw = xnn_init_hardware_config();
  assert(hw != NULL);
  if (!hw->use_arm_neon_fp16_arith) {
    return;
  }
  struct xnn_binary_elementwise_config* cfg = &f16_vmin_config;
  cfg->element_tile = 16;
  cfg->minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmin_ukernel__neonfp16arith_x16;
  // The opc and ropc slots both use the vminc kernel.
  cfg->minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vminc_ukernel__neonfp16arith_x16;
  cfg->minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vminc_ukernel__neonfp16arith_x16;
#elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE
  const struct xnn_hardware_config* hw = xnn_init_hardware_config();
  assert(hw != NULL);
  if (!hw->use_x86_avx2) {
    return;
  }
  struct xnn_binary_elementwise_config* cfg = &f16_vmin_config;
  cfg->element_tile = 16;
  cfg->minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmin_ukernel__f16c_x16;
  // The opc and ropc slots both use the vminc kernel.
  cfg->minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vminc_ukernel__f16c_x16;
  cfg->minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vminc_ukernel__f16c_x16;
#endif
}
214 | |
// Fills f16_vmul_config with the f16 multiplication microkernels for the
// build target, gated on the CPU features reported by
// xnn_init_hardware_config(). When no branch is compiled in, or the required
// feature flag is not set, the config is left unmodified.
static void init_f16_vmul_config(void) {
#if (XNN_ARCH_ARM && XNN_ENABLE_ARM_FP16_VECTOR && XNN_ENABLE_ARM_FP16_SCALAR) || \
    (XNN_ARCH_ARM64 && XNN_ENABLE_ARM_FP16_VECTOR)
  // 32-bit and 64-bit ARM builds select the same NEON FP16 kernels.
  const struct xnn_hardware_config* hw = xnn_init_hardware_config();
  assert(hw != NULL);
  if (!hw->use_arm_neon_fp16_arith) {
    return;
  }
  struct xnn_binary_elementwise_config* cfg = &f16_vmul_config;
  cfg->element_tile = 16;
  cfg->init.f16_minmax = xnn_init_f16_minmax_fp16arith_params;
  cfg->minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmul_minmax_ukernel__neonfp16arith_x16;
  // The opc and ropc slots both use the vmulc kernel.
  cfg->minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmulc_minmax_ukernel__neonfp16arith_x16;
  cfg->minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmulc_minmax_ukernel__neonfp16arith_x16;
#elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE
  const struct xnn_hardware_config* hw = xnn_init_hardware_config();
  assert(hw != NULL);
  if (!hw->use_x86_avx2) {
    return;
  }
  struct xnn_binary_elementwise_config* cfg = &f16_vmul_config;
  cfg->element_tile = 16;
  cfg->init.f16_minmax = xnn_init_f16_minmax_avx_params;
  cfg->minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmul_minmax_ukernel__f16c_x16;
  // The opc and ropc slots both use the vmulc kernel.
  cfg->minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmulc_minmax_ukernel__f16c_x16;
  cfg->minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vmulc_minmax_ukernel__f16c_x16;
#endif
}
248 | |
// Fills f16_vsub_config with the f16 subtraction microkernels for the build
// target, gated on the CPU features reported by xnn_init_hardware_config().
// When no branch is compiled in, or the required feature flag is not set,
// the config is left unmodified.
static void init_f16_vsub_config(void) {
#if (XNN_ARCH_ARM && XNN_ENABLE_ARM_FP16_VECTOR && XNN_ENABLE_ARM_FP16_SCALAR) || \
    (XNN_ARCH_ARM64 && XNN_ENABLE_ARM_FP16_VECTOR)
  // 32-bit and 64-bit ARM builds select the same NEON FP16 kernels.
  const struct xnn_hardware_config* hw = xnn_init_hardware_config();
  assert(hw != NULL);
  if (!hw->use_arm_neon_fp16_arith) {
    return;
  }
  struct xnn_binary_elementwise_config* cfg = &f16_vsub_config;
  cfg->element_tile = 16;
  cfg->init.f16_minmax = xnn_init_f16_minmax_fp16arith_params;
  cfg->minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsub_minmax_ukernel__neonfp16arith_x16;
  cfg->minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsubc_minmax_ukernel__neonfp16arith_x16;
  // Unlike opc, the ropc slot gets the dedicated vrsubc kernel.
  cfg->minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vrsubc_minmax_ukernel__neonfp16arith_x16;
#elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE
  const struct xnn_hardware_config* hw = xnn_init_hardware_config();
  assert(hw != NULL);
  if (!hw->use_x86_avx2) {
    return;
  }
  struct xnn_binary_elementwise_config* cfg = &f16_vsub_config;
  cfg->element_tile = 16;
  cfg->init.f16_minmax = xnn_init_f16_minmax_avx_params;
  cfg->minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsub_minmax_ukernel__f16c_x16;
  cfg->minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsubc_minmax_ukernel__f16c_x16;
  // Unlike opc, the ropc slot gets the dedicated vrsubc kernel.
  cfg->minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vrsubc_minmax_ukernel__f16c_x16;
#endif
}
282 | |
// Fills f16_vsqrdiff_config with the f16 squared-difference microkernels for
// the build target, gated on the CPU features reported by
// xnn_init_hardware_config(). When no branch is compiled in, or the required
// feature flag is not set, the config is left unmodified.
static void init_f16_vsqrdiff_config(void) {
#if (XNN_ARCH_ARM && XNN_ENABLE_ARM_FP16_VECTOR && XNN_ENABLE_ARM_FP16_SCALAR) || \
    (XNN_ARCH_ARM64 && XNN_ENABLE_ARM_FP16_VECTOR)
  // 32-bit and 64-bit ARM builds select the same NEON FP16 kernels.
  const struct xnn_hardware_config* hw = xnn_init_hardware_config();
  assert(hw != NULL);
  if (!hw->use_arm_neon_fp16_arith) {
    return;
  }
  struct xnn_binary_elementwise_config* cfg = &f16_vsqrdiff_config;
  cfg->element_tile = 16;
  cfg->init.f16_minmax = xnn_init_f16_minmax_fp16arith_params;
  cfg->minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiff_ukernel__neonfp16arith_x16;
  // The opc and ropc slots both use the vsqrdiffc kernel.
  cfg->minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiffc_ukernel__neonfp16arith_x16;
  cfg->minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiffc_ukernel__neonfp16arith_x16;
#elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE
  const struct xnn_hardware_config* hw = xnn_init_hardware_config();
  assert(hw != NULL);
  if (!hw->use_x86_avx2) {
    return;
  }
  struct xnn_binary_elementwise_config* cfg = &f16_vsqrdiff_config;
  cfg->element_tile = 16;
  cfg->init.f16_minmax = xnn_init_f16_minmax_avx_params;
  cfg->minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiff_ukernel__f16c_x16;
  // The opc and ropc slots both use the vsqrdiffc kernel.
  cfg->minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiffc_ukernel__f16c_x16;
  cfg->minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vsqrdiffc_ukernel__f16c_x16;
#endif
}
316 | |
317 | static void init_f32_vadd_config(void) { |
318 | #if XNN_ARCH_ARM |
319 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
320 | assert(hardware_config != NULL); |
321 | if (hardware_config->use_arm_neon){ |
322 | f32_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_minmax_ukernel__neon_x8; |
323 | f32_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__neon_x8; |
324 | f32_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__neon_x8; |
325 | f32_vadd_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; |
326 | f32_vadd_config.element_tile = 8; |
327 | } else if (!XNN_PLATFORM_MOBILE) { |
328 | f32_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_minmax_ukernel__scalar_x8; |
329 | f32_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__scalar_x8; |
330 | f32_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__scalar_x8; |
331 | f32_vadd_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; |
332 | f32_vadd_config.element_tile = 8; |
333 | } |
334 | #elif XNN_ARCH_ARM64 |
335 | f32_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_minmax_ukernel__neon_x8; |
336 | f32_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__neon_x8; |
337 | f32_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__neon_x8; |
338 | f32_vadd_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; |
339 | f32_vadd_config.element_tile = 8; |
340 | #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 |
341 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
342 | assert(hardware_config != NULL); |
343 | if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) { |
344 | f32_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_minmax_ukernel__avx512f_x32; |
345 | f32_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__avx512f_x32; |
346 | f32_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__avx512f_x32; |
347 | f32_vadd_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; |
348 | f32_vadd_config.element_tile = 32; |
349 | } else if (hardware_config->use_x86_avx) { |
350 | f32_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_minmax_ukernel__avx_x16; |
351 | f32_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__avx_x16; |
352 | f32_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__avx_x16; |
353 | f32_vadd_config.init.f32_minmax = xnn_init_f32_minmax_avx_params; |
354 | f32_vadd_config.element_tile = 16; |
355 | } else { |
356 | f32_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_minmax_ukernel__sse_x8; |
357 | f32_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__sse_x8; |
358 | f32_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__sse_x8; |
359 | f32_vadd_config.init.f32_minmax = xnn_init_f32_minmax_sse_params; |
360 | f32_vadd_config.element_tile = 8; |
361 | } |
362 | #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
363 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
364 | assert(hardware_config != NULL); |
365 | if (hardware_config->is_x86) { |
366 | f32_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_minmax_ukernel__wasmsimd_x86_x16; |
367 | f32_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x16; |
368 | f32_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x16; |
369 | f32_vadd_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_ukernel__wasmsimd_x16; |
370 | f32_vadd_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_ukernel__wasmsimd_x16; |
371 | f32_vadd_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_ukernel__wasmsimd_x16; |
372 | f32_vadd_config.init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params; |
373 | f32_vadd_config.element_tile = 16; |
374 | } else { |
375 | f32_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_minmax_ukernel__wasmsimd_arm_x16; |
376 | f32_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x16; |
377 | f32_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x16; |
378 | f32_vadd_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_ukernel__wasmsimd_x16; |
379 | f32_vadd_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_ukernel__wasmsimd_x16; |
380 | f32_vadd_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_ukernel__wasmsimd_x16; |
381 | f32_vadd_config.init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params; |
382 | f32_vadd_config.element_tile = 16; |
383 | } |
384 | #elif XNN_ARCH_WASM |
385 | f32_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_minmax_ukernel__wasm_x8; |
386 | f32_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__wasm_x8; |
387 | f32_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__wasm_x8; |
388 | f32_vadd_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; |
389 | f32_vadd_config.element_tile = 8; |
390 | #elif XNN_ARCH_RISCV |
391 | f32_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_minmax_ukernel__scalar_x8; |
392 | f32_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__scalar_x8; |
393 | f32_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__scalar_x8; |
394 | f32_vadd_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; |
395 | f32_vadd_config.element_tile = 8; |
396 | #else |
397 | #error "Unsupported architecture" |
398 | #endif |
399 | } |
400 | |
401 | static void init_f32_vdiv_config(void) { |
402 | #if XNN_ARCH_ARM |
403 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
404 | assert(hardware_config != NULL); |
405 | if (hardware_config->use_arm_neon){ |
406 | f32_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_minmax_ukernel__scalar_x2; |
407 | f32_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_minmax_ukernel__scalar_x2; |
408 | f32_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_minmax_ukernel__scalar_x2; |
409 | f32_vdiv_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; |
410 | f32_vdiv_config.element_tile = 2; |
411 | } else if (!XNN_PLATFORM_MOBILE) { |
412 | f32_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_minmax_ukernel__scalar_x2; |
413 | f32_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_minmax_ukernel__scalar_x2; |
414 | f32_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_minmax_ukernel__scalar_x2; |
415 | f32_vdiv_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; |
416 | f32_vdiv_config.element_tile = 2; |
417 | } |
418 | #elif XNN_ARCH_ARM64 |
419 | f32_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_minmax_ukernel__aarch64_neon_x8; |
420 | f32_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_minmax_ukernel__aarch64_neon_x8; |
421 | f32_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_minmax_ukernel__aarch64_neon_x8; |
422 | f32_vdiv_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; |
423 | f32_vdiv_config.element_tile = 8; |
424 | #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 |
425 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
426 | assert(hardware_config != NULL); |
427 | if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) { |
428 | f32_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_minmax_ukernel__avx512f_x32; |
429 | f32_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_minmax_ukernel__avx512f_x32; |
430 | f32_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_minmax_ukernel__avx512f_x32; |
431 | f32_vdiv_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; |
432 | f32_vdiv_config.element_tile = 32; |
433 | } else if (hardware_config->use_x86_avx) { |
434 | f32_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_minmax_ukernel__avx_x16; |
435 | f32_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_minmax_ukernel__avx_x16; |
436 | f32_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_minmax_ukernel__avx_x16; |
437 | f32_vdiv_config.init.f32_minmax = xnn_init_f32_minmax_avx_params; |
438 | f32_vdiv_config.element_tile = 16; |
439 | } else { |
440 | f32_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_minmax_ukernel__sse_x8; |
441 | f32_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_minmax_ukernel__sse_x8; |
442 | f32_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_minmax_ukernel__sse_x8; |
443 | f32_vdiv_config.init.f32_minmax = xnn_init_f32_minmax_sse_params; |
444 | f32_vdiv_config.element_tile = 8; |
445 | } |
446 | #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
447 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
448 | assert(hardware_config != NULL); |
449 | if (hardware_config->is_x86) { |
450 | f32_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_minmax_ukernel__wasmsimd_x86_x16; |
451 | f32_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_minmax_ukernel__wasmsimd_x86_x16; |
452 | f32_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_minmax_ukernel__wasmsimd_x86_x16; |
453 | f32_vdiv_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_ukernel__wasmsimd_x16; |
454 | f32_vdiv_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_ukernel__wasmsimd_x16; |
455 | f32_vdiv_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_ukernel__wasmsimd_x16; |
456 | f32_vdiv_config.init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params; |
457 | f32_vdiv_config.element_tile = 16; |
458 | } else { |
459 | f32_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_minmax_ukernel__wasmsimd_arm_x16; |
460 | f32_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_minmax_ukernel__wasmsimd_arm_x16; |
461 | f32_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_minmax_ukernel__wasmsimd_arm_x16; |
462 | f32_vdiv_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_ukernel__wasmsimd_x16; |
463 | f32_vdiv_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_ukernel__wasmsimd_x16; |
464 | f32_vdiv_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_ukernel__wasmsimd_x16; |
465 | f32_vdiv_config.init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params; |
466 | f32_vdiv_config.element_tile = 16; |
467 | } |
468 | #elif XNN_ARCH_WASM |
469 | f32_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_minmax_ukernel__wasm_x8; |
470 | f32_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_minmax_ukernel__wasm_x8; |
471 | f32_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_minmax_ukernel__wasm_x8; |
472 | f32_vdiv_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; |
473 | f32_vdiv_config.element_tile = 8; |
474 | #elif XNN_ARCH_RISCV |
475 | f32_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_minmax_ukernel__scalar_x2; |
476 | f32_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_minmax_ukernel__scalar_x2; |
477 | f32_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_minmax_ukernel__scalar_x2; |
478 | f32_vdiv_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; |
479 | f32_vdiv_config.element_tile = 2; |
480 | #else |
481 | #error "Unsupported architecture" |
482 | #endif |
483 | } |
484 | |
485 | static void init_f32_vmax_config(void) { |
486 | #if XNN_ARCH_ARM |
487 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
488 | assert(hardware_config != NULL); |
489 | if (hardware_config->use_arm_neon){ |
490 | f32_vmax_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__neon_x8; |
491 | f32_vmax_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__neon_x8; |
492 | f32_vmax_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__neon_x8; |
493 | f32_vmax_config.element_tile = 8; |
494 | } else if (!XNN_PLATFORM_MOBILE) { |
495 | f32_vmax_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__scalar_x8; |
496 | f32_vmax_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__scalar_x8; |
497 | f32_vmax_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__scalar_x8; |
498 | f32_vmax_config.element_tile = 8; |
499 | } |
500 | #elif XNN_ARCH_ARM64 |
501 | f32_vmax_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__neon_x8; |
502 | f32_vmax_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__neon_x8; |
503 | f32_vmax_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__neon_x8; |
504 | f32_vmax_config.element_tile = 8; |
505 | #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 |
506 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
507 | assert(hardware_config != NULL); |
508 | if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) { |
509 | f32_vmax_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__avx512f_x32; |
510 | f32_vmax_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__avx512f_x32; |
511 | f32_vmax_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__avx512f_x32; |
512 | f32_vmax_config.element_tile = 32; |
513 | } else if (hardware_config->use_x86_avx) { |
514 | f32_vmax_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__avx_x16; |
515 | f32_vmax_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__avx_x16; |
516 | f32_vmax_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__avx_x16; |
517 | f32_vmax_config.init.f32_default = xnn_init_f32_default_avx_params; |
518 | f32_vmax_config.element_tile = 16; |
519 | } else { |
520 | f32_vmax_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__sse_x8; |
521 | f32_vmax_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__sse_x8; |
522 | f32_vmax_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__sse_x8; |
523 | f32_vmax_config.element_tile = 8; |
524 | } |
525 | #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
526 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
527 | assert(hardware_config != NULL); |
528 | if (hardware_config->is_x86) { |
529 | f32_vmax_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__wasmsimd_x86_x16; |
530 | f32_vmax_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__wasmsimd_x86_x16; |
531 | f32_vmax_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__wasmsimd_x86_x16; |
532 | f32_vmax_config.element_tile = 16; |
533 | } else { |
534 | f32_vmax_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__wasmsimd_arm_x16; |
535 | f32_vmax_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__wasmsimd_arm_x16; |
536 | f32_vmax_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__wasmsimd_arm_x16; |
537 | f32_vmax_config.element_tile = 16; |
538 | } |
539 | #elif XNN_ARCH_WASM |
540 | f32_vmax_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__wasm_x8; |
541 | f32_vmax_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__wasm_x8; |
542 | f32_vmax_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__wasm_x8; |
543 | f32_vmax_config.element_tile = 8; |
544 | #elif XNN_ARCH_RISCV |
545 | f32_vmax_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__scalar_x8; |
546 | f32_vmax_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__scalar_x8; |
547 | f32_vmax_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__scalar_x8; |
548 | f32_vmax_config.element_tile = 8; |
549 | #else |
550 | #error "Unsupported architecture" |
551 | #endif |
552 | } |
553 | |
554 | static void init_f32_vmin_config(void) { |
555 | #if XNN_ARCH_ARM |
556 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
557 | assert(hardware_config != NULL); |
558 | if (hardware_config->use_arm_neon){ |
559 | f32_vmin_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__neon_x8; |
560 | f32_vmin_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__neon_x8; |
561 | f32_vmin_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__neon_x8; |
562 | f32_vmin_config.element_tile = 8; |
563 | } else if (!XNN_PLATFORM_MOBILE) { |
564 | f32_vmin_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__scalar_x8; |
565 | f32_vmin_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__scalar_x8; |
566 | f32_vmin_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__scalar_x8; |
567 | f32_vmin_config.element_tile = 8; |
568 | } |
569 | #elif XNN_ARCH_ARM64 |
570 | f32_vmin_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__neon_x8; |
571 | f32_vmin_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__neon_x8; |
572 | f32_vmin_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__neon_x8; |
573 | f32_vmin_config.element_tile = 8; |
574 | #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 |
575 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
576 | assert(hardware_config != NULL); |
577 | if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) { |
578 | f32_vmin_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__avx512f_x32; |
579 | f32_vmin_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__avx512f_x32; |
580 | f32_vmin_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__avx512f_x32; |
581 | f32_vmin_config.element_tile = 32; |
582 | } else if (hardware_config->use_x86_avx) { |
583 | f32_vmin_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__avx_x16; |
584 | f32_vmin_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__avx_x16; |
585 | f32_vmin_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__avx_x16; |
586 | f32_vmin_config.init.f32_default = xnn_init_f32_default_avx_params; |
587 | f32_vmin_config.element_tile = 16; |
588 | } else { |
589 | f32_vmin_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__sse_x8; |
590 | f32_vmin_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__sse_x8; |
591 | f32_vmin_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__sse_x8; |
592 | f32_vmin_config.element_tile = 8; |
593 | } |
594 | #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
595 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
596 | assert(hardware_config != NULL); |
597 | if (hardware_config->is_x86) { |
598 | f32_vmin_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__wasmsimd_x86_x16; |
599 | f32_vmin_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__wasmsimd_x86_x16; |
600 | f32_vmin_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__wasmsimd_x86_x16; |
601 | f32_vmin_config.element_tile = 16; |
602 | } else { |
603 | f32_vmin_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__wasmsimd_arm_x16; |
604 | f32_vmin_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__wasmsimd_arm_x16; |
605 | f32_vmin_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__wasmsimd_arm_x16; |
606 | f32_vmin_config.element_tile = 16; |
607 | } |
608 | #elif XNN_ARCH_WASM |
609 | f32_vmin_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__wasm_x8; |
610 | f32_vmin_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__wasm_x8; |
611 | f32_vmin_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__wasm_x8; |
612 | f32_vmin_config.element_tile = 8; |
613 | #elif XNN_ARCH_RISCV |
614 | f32_vmin_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__scalar_x8; |
615 | f32_vmin_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__scalar_x8; |
616 | f32_vmin_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__scalar_x8; |
617 | f32_vmin_config.element_tile = 8; |
618 | #else |
619 | #error "Unsupported architecture" |
620 | #endif |
621 | } |
622 | |
623 | static void init_f32_vmul_config(void) { |
624 | #if XNN_ARCH_ARM |
625 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
626 | assert(hardware_config != NULL); |
627 | if (hardware_config->use_arm_neon){ |
628 | f32_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_minmax_ukernel__neon_x8; |
629 | f32_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__neon_x8; |
630 | f32_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__neon_x8; |
631 | f32_vmul_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; |
632 | f32_vmul_config.element_tile = 8; |
633 | } else if (!XNN_PLATFORM_MOBILE) { |
634 | f32_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_minmax_ukernel__scalar_x8; |
635 | f32_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__scalar_x8; |
636 | f32_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__scalar_x8; |
637 | f32_vmul_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; |
638 | f32_vmul_config.element_tile = 8; |
639 | } |
640 | #elif XNN_ARCH_ARM64 |
641 | f32_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_minmax_ukernel__neon_x8; |
642 | f32_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__neon_x8; |
643 | f32_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__neon_x8; |
644 | f32_vmul_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; |
645 | f32_vmul_config.element_tile = 8; |
646 | #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 |
647 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
648 | assert(hardware_config != NULL); |
649 | if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) { |
650 | f32_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_minmax_ukernel__avx512f_x32; |
651 | f32_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__avx512f_x32; |
652 | f32_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__avx512f_x32; |
653 | f32_vmul_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; |
654 | f32_vmul_config.element_tile = 32; |
655 | } else if (hardware_config->use_x86_avx) { |
656 | f32_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_minmax_ukernel__avx_x16; |
657 | f32_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__avx_x16; |
658 | f32_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__avx_x16; |
659 | f32_vmul_config.init.f32_minmax = xnn_init_f32_minmax_avx_params; |
660 | f32_vmul_config.element_tile = 16; |
661 | } else { |
662 | f32_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_minmax_ukernel__sse_x8; |
663 | f32_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__sse_x8; |
664 | f32_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__sse_x8; |
665 | f32_vmul_config.init.f32_minmax = xnn_init_f32_minmax_sse_params; |
666 | f32_vmul_config.element_tile = 8; |
667 | } |
668 | #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
669 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
670 | assert(hardware_config != NULL); |
671 | if (hardware_config->is_x86) { |
672 | f32_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_minmax_ukernel__wasmsimd_x86_x16; |
673 | f32_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x16; |
674 | f32_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x16; |
675 | f32_vmul_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_ukernel__wasmsimd_x16; |
676 | f32_vmul_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_ukernel__wasmsimd_x16; |
677 | f32_vmul_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_ukernel__wasmsimd_x16; |
678 | f32_vmul_config.init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params; |
679 | f32_vmul_config.element_tile = 16; |
680 | } else { |
681 | f32_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_minmax_ukernel__wasmsimd_arm_x16; |
682 | f32_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x16; |
683 | f32_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x16; |
684 | f32_vmul_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_ukernel__wasmsimd_x16; |
685 | f32_vmul_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_ukernel__wasmsimd_x16; |
686 | f32_vmul_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_ukernel__wasmsimd_x16; |
687 | f32_vmul_config.init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params; |
688 | f32_vmul_config.element_tile = 16; |
689 | } |
690 | #elif XNN_ARCH_WASM |
691 | f32_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_minmax_ukernel__wasm_x8; |
692 | f32_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__wasm_x8; |
693 | f32_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__wasm_x8; |
694 | f32_vmul_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; |
695 | f32_vmul_config.element_tile = 8; |
696 | #elif XNN_ARCH_RISCV |
697 | f32_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_minmax_ukernel__scalar_x8; |
698 | f32_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__scalar_x8; |
699 | f32_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__scalar_x8; |
700 | f32_vmul_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; |
701 | f32_vmul_config.element_tile = 8; |
702 | #else |
703 | #error "Unsupported architecture" |
704 | #endif |
705 | } |
706 | |
707 | static void init_f32_vsub_config(void) { |
708 | #if XNN_ARCH_ARM |
709 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
710 | assert(hardware_config != NULL); |
711 | if (hardware_config->use_arm_neon){ |
712 | f32_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_minmax_ukernel__neon_x8; |
713 | f32_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_minmax_ukernel__neon_x8; |
714 | f32_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_minmax_ukernel__neon_x8; |
715 | f32_vsub_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; |
716 | f32_vsub_config.element_tile = 8; |
717 | } else if (!XNN_PLATFORM_MOBILE) { |
718 | f32_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_minmax_ukernel__scalar_x8; |
719 | f32_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_minmax_ukernel__scalar_x8; |
720 | f32_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_minmax_ukernel__scalar_x8; |
721 | f32_vsub_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; |
722 | f32_vsub_config.element_tile = 8; |
723 | } |
724 | #elif XNN_ARCH_ARM64 |
725 | f32_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_minmax_ukernel__neon_x8; |
726 | f32_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_minmax_ukernel__neon_x8; |
727 | f32_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_minmax_ukernel__neon_x8; |
728 | f32_vsub_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; |
729 | f32_vsub_config.element_tile = 8; |
730 | #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 |
731 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
732 | assert(hardware_config != NULL); |
733 | if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) { |
734 | f32_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_minmax_ukernel__avx512f_x32; |
735 | f32_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_minmax_ukernel__avx512f_x32; |
736 | f32_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_minmax_ukernel__avx512f_x32; |
737 | f32_vsub_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; |
738 | f32_vsub_config.element_tile = 32; |
739 | } else if (hardware_config->use_x86_avx) { |
740 | f32_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_minmax_ukernel__avx_x16; |
741 | f32_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_minmax_ukernel__avx_x16; |
742 | f32_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_minmax_ukernel__avx_x16; |
743 | f32_vsub_config.init.f32_minmax = xnn_init_f32_minmax_avx_params; |
744 | f32_vsub_config.element_tile = 16; |
745 | } else { |
746 | f32_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_minmax_ukernel__sse_x8; |
747 | f32_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_minmax_ukernel__sse_x8; |
748 | f32_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_minmax_ukernel__sse_x8; |
749 | f32_vsub_config.init.f32_minmax = xnn_init_f32_minmax_sse_params; |
750 | f32_vsub_config.element_tile = 8; |
751 | } |
752 | #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
753 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
754 | assert(hardware_config != NULL); |
755 | if (hardware_config->is_x86) { |
756 | f32_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_minmax_ukernel__wasmsimd_x86_x16; |
757 | f32_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_minmax_ukernel__wasmsimd_x86_x16; |
758 | f32_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_minmax_ukernel__wasmsimd_x86_x16; |
759 | f32_vsub_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_ukernel__wasmsimd_x16; |
760 | f32_vsub_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_ukernel__wasmsimd_x16; |
761 | f32_vsub_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_ukernel__wasmsimd_x16; |
762 | f32_vsub_config.init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params; |
763 | f32_vsub_config.element_tile = 16; |
764 | } else { |
765 | f32_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_minmax_ukernel__wasmsimd_arm_x16; |
766 | f32_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_minmax_ukernel__wasmsimd_arm_x16; |
767 | f32_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_minmax_ukernel__wasmsimd_arm_x16; |
768 | f32_vsub_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_ukernel__wasmsimd_x16; |
769 | f32_vsub_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_ukernel__wasmsimd_x16; |
770 | f32_vsub_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_ukernel__wasmsimd_x16; |
771 | f32_vsub_config.init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params; |
772 | f32_vsub_config.element_tile = 16; |
773 | } |
774 | #elif XNN_ARCH_WASM |
775 | f32_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_minmax_ukernel__wasm_x8; |
776 | f32_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_minmax_ukernel__wasm_x8; |
777 | f32_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_minmax_ukernel__wasm_x8; |
778 | f32_vsub_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; |
779 | f32_vsub_config.element_tile = 8; |
780 | #elif XNN_ARCH_RISCV |
781 | f32_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_minmax_ukernel__scalar_x8; |
782 | f32_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_minmax_ukernel__scalar_x8; |
783 | f32_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_minmax_ukernel__scalar_x8; |
784 | f32_vsub_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; |
785 | f32_vsub_config.element_tile = 8; |
786 | #else |
787 | #error "Unsupported architecture" |
788 | #endif |
789 | } |
790 | |
791 | static void init_f32_vsqrdiff_config(void) { |
792 | #if XNN_ARCH_ARM |
793 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
794 | assert(hardware_config != NULL); |
795 | if (hardware_config->use_arm_neon){ |
796 | f32_vsqrdiff_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__neon_x8; |
797 | f32_vsqrdiff_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__neon_x8; |
798 | f32_vsqrdiff_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__neon_x8; |
799 | f32_vsqrdiff_config.element_tile = 8; |
800 | } else if (!XNN_PLATFORM_MOBILE) { |
801 | f32_vsqrdiff_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__scalar_x8; |
802 | f32_vsqrdiff_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__scalar_x8; |
803 | f32_vsqrdiff_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__scalar_x8; |
804 | f32_vsqrdiff_config.element_tile = 8; |
805 | } |
806 | #elif XNN_ARCH_ARM64 |
807 | f32_vsqrdiff_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__neon_x8; |
808 | f32_vsqrdiff_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__neon_x8; |
809 | f32_vsqrdiff_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__neon_x8; |
810 | f32_vsqrdiff_config.element_tile = 8; |
811 | #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 |
812 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
813 | assert(hardware_config != NULL); |
814 | if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) { |
815 | f32_vsqrdiff_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__avx512f_x32; |
816 | f32_vsqrdiff_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__avx512f_x32; |
817 | f32_vsqrdiff_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__avx512f_x32; |
818 | f32_vsqrdiff_config.element_tile = 32; |
819 | } else if (hardware_config->use_x86_avx) { |
820 | f32_vsqrdiff_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__avx_x16; |
821 | f32_vsqrdiff_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__avx_x16; |
822 | f32_vsqrdiff_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__avx_x16; |
823 | f32_vsqrdiff_config.init.f32_default = xnn_init_f32_default_avx_params; |
824 | f32_vsqrdiff_config.element_tile = 16; |
825 | } else { |
826 | f32_vsqrdiff_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__sse_x8; |
827 | f32_vsqrdiff_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__sse_x8; |
828 | f32_vsqrdiff_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__sse_x8; |
829 | f32_vsqrdiff_config.element_tile = 8; |
830 | } |
831 | #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
832 | f32_vsqrdiff_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__wasmsimd_x16; |
833 | f32_vsqrdiff_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__wasmsimd_x16; |
834 | f32_vsqrdiff_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__wasmsimd_x16; |
835 | f32_vsqrdiff_config.element_tile = 16; |
836 | #elif XNN_ARCH_WASM |
837 | f32_vsqrdiff_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__scalar_x8; |
838 | f32_vsqrdiff_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__scalar_x8; |
839 | f32_vsqrdiff_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__scalar_x8; |
840 | f32_vsqrdiff_config.element_tile = 8; |
841 | #elif XNN_ARCH_RISCV |
842 | f32_vsqrdiff_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__scalar_x8; |
843 | f32_vsqrdiff_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__scalar_x8; |
844 | f32_vsqrdiff_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__scalar_x8; |
845 | f32_vsqrdiff_config.element_tile = 8; |
846 | #else |
847 | #error "Unsupported architecture" |
848 | #endif |
849 | } |
850 | |
851 | static void init_qs8_vadd_config(void) { |
852 | #if XNN_ARCH_ARM |
853 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
854 | assert(hardware_config != NULL); |
855 | if (hardware_config->use_arm_neon){ |
856 | qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__neon_ld64_x16; |
857 | qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x16; |
858 | qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x16; |
859 | qs8_vadd_config.init.qs8_add = xnn_init_qs8_add_minmax_neon_params; |
860 | qs8_vadd_config.element_tile = 16; |
861 | } else if (!XNN_PLATFORM_MOBILE) { |
862 | qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__scalar_x1; |
863 | qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__scalar_x1; |
864 | qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__scalar_x1; |
865 | qs8_vadd_config.init.qs8_add = xnn_init_qs8_add_minmax_scalar_params; |
866 | qs8_vadd_config.element_tile = 1; |
867 | } |
868 | #elif XNN_ARCH_ARM64 |
869 | qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__neon_ld64_x32; |
870 | qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x32; |
871 | qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x32; |
872 | qs8_vadd_config.init.qs8_add = xnn_init_qs8_add_minmax_neon_params; |
873 | qs8_vadd_config.element_tile = 32; |
874 | #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 |
875 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
876 | assert(hardware_config != NULL); |
877 | if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) { |
878 | qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__avx512skx_mul32_ld128_x16; |
879 | qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16; |
880 | qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16; |
881 | qs8_vadd_config.init.qs8_add = xnn_init_qs8_add_minmax_avx512_params; |
882 | qs8_vadd_config.element_tile = 16; |
883 | } else if (hardware_config->use_x86_xop) { |
884 | qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__xop_mul32_ld32_x8; |
885 | qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_x8; |
886 | qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_x8; |
887 | qs8_vadd_config.init.qs8_add = xnn_init_qs8_add_minmax_sse4_mul32_params; |
888 | qs8_vadd_config.element_tile = 8; |
889 | } else if (hardware_config->use_x86_avx2) { |
890 | qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__avx2_mul32_ld64_x16; |
891 | qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16; |
892 | qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16; |
893 | qs8_vadd_config.init.qs8_add = xnn_init_qs8_add_minmax_avx2_params; |
894 | qs8_vadd_config.element_tile = 16; |
895 | } else if (hardware_config->use_x86_avx) { |
896 | qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__avx_mul32_ld32_x8; |
897 | qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x8; |
898 | qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x8; |
899 | qs8_vadd_config.init.qs8_add = xnn_init_qs8_add_minmax_sse4_mul32_params; |
900 | qs8_vadd_config.element_tile = 8; |
901 | } else if (hardware_config->use_x86_sse4_1) { |
902 | qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8; |
903 | qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8; |
904 | qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8; |
905 | qs8_vadd_config.init.qs8_add = xnn_init_qs8_add_minmax_sse4_mul16_params; |
906 | qs8_vadd_config.element_tile = 8; |
907 | } else { |
908 | qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8; |
909 | qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8; |
910 | qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8; |
911 | qs8_vadd_config.init.qs8_add = xnn_init_qs8_add_minmax_sse2_params; |
912 | qs8_vadd_config.element_tile = 8; |
913 | } |
914 | #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
915 | qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__wasmsimd_x32; |
916 | qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x32; |
917 | qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x32; |
918 | qs8_vadd_config.init.qs8_add = xnn_init_qs8_add_minmax_wasmsimd_params; |
919 | qs8_vadd_config.element_tile = 32; |
920 | #elif XNN_ARCH_WASM |
921 | qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__scalar_x4; |
922 | qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__scalar_x4; |
923 | qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__scalar_x4; |
924 | qs8_vadd_config.init.qs8_add = xnn_init_qs8_add_minmax_scalar_params; |
925 | qs8_vadd_config.element_tile = 4; |
926 | #elif XNN_ARCH_RISCV |
927 | qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__scalar_x4; |
928 | qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__scalar_x4; |
929 | qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__scalar_x4; |
930 | qs8_vadd_config.init.qs8_add = xnn_init_qs8_add_minmax_scalar_params; |
931 | qs8_vadd_config.element_tile = 4; |
932 | #else |
933 | #error "Unsupported architecture" |
934 | #endif |
935 | } |
936 | |
937 | static void init_qs8_vmul_config(void) { |
938 | #if XNN_ARCH_ARM |
939 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
940 | assert(hardware_config != NULL); |
941 | if (hardware_config->use_arm_neon){ |
942 | qs8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmul_minmax_rndnu_ukernel__neon_ld64_x16; |
943 | qs8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16; |
944 | qs8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16; |
945 | qs8_vmul_config.init.qs8_mul = xnn_init_qs8_mul_minmax_rndnu_neon_params; |
946 | qs8_vmul_config.element_tile = 16; |
947 | } else if (!XNN_PLATFORM_MOBILE) { |
948 | qs8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmul_minmax_fp32_ukernel__scalar_x4; |
949 | qs8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4; |
950 | qs8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4; |
951 | qs8_vmul_config.init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_scalar_params; |
952 | qs8_vmul_config.element_tile = 4; |
953 | } |
954 | #elif XNN_ARCH_ARM64 |
955 | qs8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmul_minmax_rndnu_ukernel__neon_ld64_x16; |
956 | qs8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16; |
957 | qs8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16; |
958 | qs8_vmul_config.init.qs8_mul = xnn_init_qs8_mul_minmax_rndnu_neon_params; |
959 | qs8_vmul_config.element_tile = 16; |
960 | #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 |
961 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
962 | assert(hardware_config != NULL); |
963 | if (hardware_config->use_x86_avx) { |
964 | qs8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmul_minmax_fp32_ukernel__avx_mul16_ld64_x16; |
965 | qs8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16; |
966 | qs8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16; |
967 | qs8_vmul_config.init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_sse4_params; |
968 | qs8_vmul_config.element_tile = 16; |
969 | } else if (hardware_config->use_x86_sse4_1) { |
970 | qs8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmul_minmax_fp32_ukernel__sse41_mul16_ld64_x16; |
971 | qs8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16; |
972 | qs8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16; |
973 | qs8_vmul_config.init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_sse4_params; |
974 | qs8_vmul_config.element_tile = 16; |
975 | } else { |
976 | qs8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmul_minmax_fp32_ukernel__sse2_mul16_ld64_x8; |
977 | qs8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_x8; |
978 | qs8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_x8; |
979 | qs8_vmul_config.init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_sse2_params; |
980 | qs8_vmul_config.element_tile = 8; |
981 | } |
982 | #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
983 | qs8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmul_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8; |
984 | qs8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8; |
985 | qs8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8; |
986 | qs8_vmul_config.init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_wasmsimd_params; |
987 | qs8_vmul_config.element_tile = 8; |
988 | #elif XNN_ARCH_WASM |
989 | qs8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmul_minmax_fp32_ukernel__scalar_x4; |
990 | qs8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4; |
991 | qs8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4; |
992 | qs8_vmul_config.init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_scalar_params; |
993 | qs8_vmul_config.element_tile = 4; |
994 | #elif XNN_ARCH_RISCV |
995 | qs8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmul_minmax_fp32_ukernel__scalar_x4; |
996 | qs8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4; |
997 | qs8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4; |
998 | qs8_vmul_config.init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_scalar_params; |
999 | qs8_vmul_config.element_tile = 4; |
1000 | #else |
1001 | #error "Unsupported architecture" |
1002 | #endif |
1003 | } |
1004 | |
1005 | static void init_qu8_vadd_config(void) { |
1006 | #if XNN_ARCH_ARM |
1007 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
1008 | assert(hardware_config != NULL); |
1009 | if (hardware_config->use_arm_neon){ |
1010 | qu8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__neon_ld64_x16; |
1011 | qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x16; |
1012 | qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x16; |
1013 | qu8_vadd_config.init.qu8_add = xnn_init_qu8_add_minmax_neon_params; |
1014 | qu8_vadd_config.element_tile = 8; |
1015 | } else if (!XNN_PLATFORM_MOBILE) { |
1016 | qu8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__scalar_x1; |
1017 | qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__scalar_x1; |
1018 | qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__scalar_x1; |
1019 | qu8_vadd_config.init.qu8_add = xnn_init_qu8_add_minmax_scalar_params; |
1020 | qu8_vadd_config.element_tile = 1; |
1021 | } |
1022 | #elif XNN_ARCH_ARM64 |
1023 | qu8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__neon_ld64_x32; |
1024 | qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x32; |
1025 | qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x32; |
1026 | qu8_vadd_config.init.qu8_add = xnn_init_qu8_add_minmax_neon_params; |
1027 | qu8_vadd_config.element_tile = 8; |
1028 | #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 |
1029 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
1030 | assert(hardware_config != NULL); |
1031 | if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) { |
1032 | qu8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__avx512skx_mul32_ld128_x16; |
1033 | qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16; |
1034 | qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16; |
1035 | qu8_vadd_config.init.qu8_add = xnn_init_qu8_add_minmax_avx512_params; |
1036 | qu8_vadd_config.element_tile = 16; |
1037 | } else if (hardware_config->use_x86_xop) { |
1038 | qu8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__xop_mul32_ld32_x8; |
1039 | qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__xop_mul32_ld32_x8; |
1040 | qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__xop_mul32_ld32_x8; |
1041 | qu8_vadd_config.init.qu8_add = xnn_init_qu8_add_minmax_sse4_params; |
1042 | qu8_vadd_config.element_tile = 8; |
1043 | } else if (hardware_config->use_x86_avx2) { |
1044 | qu8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__avx2_mul32_ld64_x16; |
1045 | qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16; |
1046 | qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16; |
1047 | qu8_vadd_config.init.qu8_add = xnn_init_qu8_add_minmax_avx2_params; |
1048 | qu8_vadd_config.element_tile = 16; |
1049 | } else if (hardware_config->use_x86_avx) { |
1050 | qu8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__avx_mul32_ld32_x8; |
1051 | qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__avx_mul32_ld32_x8; |
1052 | qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__avx_mul32_ld32_x8; |
1053 | qu8_vadd_config.init.qu8_add = xnn_init_qu8_add_minmax_sse4_params; |
1054 | qu8_vadd_config.element_tile = 8; |
1055 | } else if (hardware_config->use_x86_sse4_1) { |
1056 | qu8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__sse41_mul16_ld64_x8; |
1057 | qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8; |
1058 | qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8; |
1059 | qu8_vadd_config.init.qu8_add = xnn_init_qu8_add_minmax_sse2_params; |
1060 | qu8_vadd_config.element_tile = 8; |
1061 | } else { |
1062 | qu8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__sse2_mul16_ld64_x8; |
1063 | qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8; |
1064 | qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8; |
1065 | qu8_vadd_config.init.qu8_add = xnn_init_qu8_add_minmax_sse2_params; |
1066 | qu8_vadd_config.element_tile = 8; |
1067 | } |
1068 | #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
1069 | qu8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__wasmsimd_x32; |
1070 | qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__wasmsimd_x32; |
1071 | qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__wasmsimd_x32; |
1072 | qu8_vadd_config.init.qu8_add = xnn_init_qu8_add_minmax_wasmsimd_params; |
1073 | qu8_vadd_config.element_tile = 32; |
1074 | #elif XNN_ARCH_WASM |
1075 | qu8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__scalar_x4; |
1076 | qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__scalar_x4; |
1077 | qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__scalar_x4; |
1078 | qu8_vadd_config.init.qu8_add = xnn_init_qu8_add_minmax_scalar_params; |
1079 | qu8_vadd_config.element_tile = 4; |
1080 | #elif XNN_ARCH_RISCV |
1081 | qu8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__scalar_x4; |
1082 | qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__scalar_x4; |
1083 | qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__scalar_x4; |
1084 | qu8_vadd_config.init.qu8_add = xnn_init_qu8_add_minmax_scalar_params; |
1085 | qu8_vadd_config.element_tile = 4; |
1086 | #else |
1087 | #error "Unsupported architecture" |
1088 | #endif |
1089 | } |
1090 | |
1091 | static void init_qu8_vmul_config(void) { |
1092 | #if XNN_ARCH_ARM |
1093 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
1094 | assert(hardware_config != NULL); |
1095 | if (hardware_config->use_arm_neon){ |
1096 | qu8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmul_minmax_rndnu_ukernel__neon_ld64_x16; |
1097 | qu8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16; |
1098 | qu8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16; |
1099 | qu8_vmul_config.init.qu8_mul = xnn_init_qu8_mul_minmax_rndnu_neon_params; |
1100 | qu8_vmul_config.element_tile = 16; |
1101 | } else if (!XNN_PLATFORM_MOBILE) { |
1102 | qu8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmul_minmax_fp32_ukernel__scalar_x4; |
1103 | qu8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4; |
1104 | qu8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4; |
1105 | qu8_vmul_config.init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_scalar_params; |
1106 | qu8_vmul_config.element_tile = 4; |
1107 | } |
1108 | #elif XNN_ARCH_ARM64 |
1109 | qu8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmul_minmax_rndnu_ukernel__neon_ld64_x16; |
1110 | qu8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16; |
1111 | qu8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16; |
1112 | qu8_vmul_config.init.qu8_mul = xnn_init_qu8_mul_minmax_rndnu_neon_params; |
1113 | qu8_vmul_config.element_tile = 16; |
1114 | #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 |
1115 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
1116 | assert(hardware_config != NULL); |
1117 | if (hardware_config->use_x86_avx) { |
1118 | qu8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmul_minmax_fp32_ukernel__avx_mul16_ld64_x16; |
1119 | qu8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16; |
1120 | qu8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16; |
1121 | qu8_vmul_config.init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_sse2_params; |
1122 | qu8_vmul_config.element_tile = 16; |
1123 | } else if (hardware_config->use_x86_sse4_1) { |
1124 | qu8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmul_minmax_fp32_ukernel__sse41_mul16_ld64_x16; |
1125 | qu8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16; |
1126 | qu8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16; |
1127 | qu8_vmul_config.init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_sse2_params; |
1128 | qu8_vmul_config.element_tile = 16; |
1129 | } else { |
1130 | qu8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmul_minmax_fp32_ukernel__sse2_mul16_ld64_x8; |
1131 | qu8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_x8; |
1132 | qu8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_x8; |
1133 | qu8_vmul_config.init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_sse2_params; |
1134 | qu8_vmul_config.element_tile = 8; |
1135 | } |
1136 | #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
1137 | qu8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmul_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8; |
1138 | qu8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8; |
1139 | qu8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8; |
1140 | qu8_vmul_config.init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_wasmsimd_params; |
1141 | qu8_vmul_config.element_tile = 8; |
1142 | #elif XNN_ARCH_WASM |
1143 | qu8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmul_minmax_fp32_ukernel__scalar_x4; |
1144 | qu8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4; |
1145 | qu8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4; |
1146 | qu8_vmul_config.init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_scalar_params; |
1147 | qu8_vmul_config.element_tile = 4; |
1148 | #elif XNN_ARCH_RISCV |
1149 | qu8_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmul_minmax_fp32_ukernel__scalar_x4; |
1150 | qu8_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4; |
1151 | qu8_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4; |
1152 | qu8_vmul_config.init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_scalar_params; |
1153 | qu8_vmul_config.element_tile = 4; |
1154 | #else |
1155 | #error "Unsupported architecture" |
1156 | #endif |
1157 | } |
1158 | |
1159 | #if XNN_PLATFORM_WINDOWS |
1160 | static BOOL CALLBACK init_f16_vadd_config_windows(PINIT_ONCE init_once, PVOID parameter, PVOID* context) { |
1161 | init_f16_vadd_config(); |
1162 | return TRUE; |
1163 | } |
1164 | |
1165 | static BOOL CALLBACK init_f16_vdiv_config_windows(PINIT_ONCE init_once, PVOID parameter, PVOID* context) { |
1166 | init_f16_vdiv_config(); |
1167 | return TRUE; |
1168 | } |
1169 | |
1170 | static BOOL CALLBACK init_f16_vmax_config_windows(PINIT_ONCE init_once, PVOID parameter, PVOID* context) { |
1171 | init_f16_vmax_config(); |
1172 | return TRUE; |
1173 | } |
1174 | |
1175 | static BOOL CALLBACK init_f16_vmin_config_windows(PINIT_ONCE init_once, PVOID parameter, PVOID* context) { |
1176 | init_f16_vmin_config(); |
1177 | return TRUE; |
1178 | } |
1179 | |
1180 | static BOOL CALLBACK init_f16_vmul_config_windows(PINIT_ONCE init_once, PVOID parameter, PVOID* context) { |
1181 | init_f16_vmul_config(); |
1182 | return TRUE; |
1183 | } |
1184 | |
1185 | static BOOL CALLBACK init_f16_vsub_config_windows(PINIT_ONCE init_once, PVOID parameter, PVOID* context) { |
1186 | init_f16_vsub_config(); |
1187 | return TRUE; |
1188 | } |
1189 | |
1190 | static BOOL CALLBACK init_f16_vsqrdiff_config_windows(PINIT_ONCE init_once, PVOID parameter, PVOID* context) { |
1191 | init_f16_vsqrdiff_config(); |
1192 | return TRUE; |
1193 | } |
1194 | |
1195 | static BOOL CALLBACK init_f32_vadd_config_windows(PINIT_ONCE init_once, PVOID parameter, PVOID* context) { |
1196 | init_f32_vadd_config(); |
1197 | return TRUE; |
1198 | } |
1199 | |
1200 | static BOOL CALLBACK init_f32_vdiv_config_windows(PINIT_ONCE init_once, PVOID parameter, PVOID* context) { |
1201 | init_f32_vdiv_config(); |
1202 | return TRUE; |
1203 | } |
1204 | |
1205 | static BOOL CALLBACK init_f32_vmax_config_windows(PINIT_ONCE init_once, PVOID parameter, PVOID* context) { |
1206 | init_f32_vmax_config(); |
1207 | return TRUE; |
1208 | } |
1209 | |
1210 | static BOOL CALLBACK init_f32_vmin_config_windows(PINIT_ONCE init_once, PVOID parameter, PVOID* context) { |
1211 | init_f32_vmin_config(); |
1212 | return TRUE; |
1213 | } |
1214 | |
1215 | static BOOL CALLBACK init_f32_vmul_config_windows(PINIT_ONCE init_once, PVOID parameter, PVOID* context) { |
1216 | init_f32_vmul_config(); |
1217 | return TRUE; |
1218 | } |
1219 | |
1220 | static BOOL CALLBACK init_f32_vsub_config_windows(PINIT_ONCE init_once, PVOID parameter, PVOID* context) { |
1221 | init_f32_vsub_config(); |
1222 | return TRUE; |
1223 | } |
1224 | |
1225 | static BOOL CALLBACK init_f32_vsqrdiff_config_windows(PINIT_ONCE init_once, PVOID parameter, PVOID* context) { |
1226 | init_f32_vsqrdiff_config(); |
1227 | return TRUE; |
1228 | } |
1229 | |
1230 | static BOOL CALLBACK init_qs8_vadd_config_windows(PINIT_ONCE init_once, PVOID parameter, PVOID* context) { |
1231 | init_qs8_vadd_config(); |
1232 | return TRUE; |
1233 | } |
1234 | |
1235 | static BOOL CALLBACK init_qs8_vmul_config_windows(PINIT_ONCE init_once, PVOID parameter, PVOID* context) { |
1236 | init_qs8_vmul_config(); |
1237 | return TRUE; |
1238 | } |
1239 | |
1240 | static BOOL CALLBACK init_qu8_vadd_config_windows(PINIT_ONCE init_once, PVOID parameter, PVOID* context) { |
1241 | init_qu8_vadd_config(); |
1242 | return TRUE; |
1243 | } |
1244 | |
1245 | static BOOL CALLBACK init_qu8_vmul_config_windows(PINIT_ONCE init_once, PVOID parameter, PVOID* context) { |
1246 | init_qu8_vmul_config(); |
1247 | return TRUE; |
1248 | } |
1249 | #endif |
1250 | |
1251 | static bool is_f16_compatible_config(const struct xnn_hardware_config hardware_config[restrict XNN_MIN_ELEMENTS(1)]) { |
1252 | #if (XNN_ARCH_ARM && XNN_ENABLE_ARM_FP16_VECTOR && XNN_ENABLE_ARM_FP16_SCALAR) || (XNN_ARCH_ARM64 && XNN_ENABLE_ARM_FP16_VECTOR) |
1253 | return hardware_config->use_arm_neon_fp16_arith; |
1254 | #elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE |
1255 | return hardware_config->use_x86_avx2; |
1256 | #else |
1257 | return false; |
1258 | #endif |
1259 | } |
1260 | |
1261 | const struct xnn_binary_elementwise_config* xnn_init_f16_vadd_config() { |
1262 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
1263 | if (hardware_config == NULL || !is_f16_compatible_config(hardware_config)) { |
1264 | return NULL; |
1265 | } |
1266 | #if XNN_PLATFORM_WINDOWS |
1267 | InitOnceExecuteOnce(&init_guard_f16_vadd, &init_f16_vadd_config_windows, NULL, NULL); |
1268 | #else |
1269 | pthread_once(&init_guard_f16_vadd, &init_f16_vadd_config); |
1270 | #endif |
1271 | return &f16_vadd_config; |
1272 | } |
1273 | |
1274 | const struct xnn_binary_elementwise_config* xnn_init_f16_vdiv_config() { |
1275 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
1276 | if (hardware_config == NULL || !is_f16_compatible_config(hardware_config)) { |
1277 | return NULL; |
1278 | } |
1279 | #if XNN_PLATFORM_WINDOWS |
1280 | InitOnceExecuteOnce(&init_guard_f16_vdiv, &init_f16_vdiv_config_windows, NULL, NULL); |
1281 | #else |
1282 | pthread_once(&init_guard_f16_vdiv, &init_f16_vdiv_config); |
1283 | #endif |
1284 | return &f16_vdiv_config; |
1285 | } |
1286 | |
1287 | const struct xnn_binary_elementwise_config* xnn_init_f16_vmax_config() { |
1288 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
1289 | if (hardware_config == NULL || !is_f16_compatible_config(hardware_config)) { |
1290 | return NULL; |
1291 | } |
1292 | #if XNN_PLATFORM_WINDOWS |
1293 | InitOnceExecuteOnce(&init_guard_f16_vmax, &init_f16_vmax_config_windows, NULL, NULL); |
1294 | #else |
1295 | pthread_once(&init_guard_f16_vmax, &init_f16_vmax_config); |
1296 | #endif |
1297 | return &f16_vmax_config; |
1298 | } |
1299 | |
1300 | const struct xnn_binary_elementwise_config* xnn_init_f16_vmin_config() { |
1301 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
1302 | if (hardware_config == NULL || !is_f16_compatible_config(hardware_config)) { |
1303 | return NULL; |
1304 | } |
1305 | #if XNN_PLATFORM_WINDOWS |
1306 | InitOnceExecuteOnce(&init_guard_f16_vmin, &init_f16_vmin_config_windows, NULL, NULL); |
1307 | #else |
1308 | pthread_once(&init_guard_f16_vmin, &init_f16_vmin_config); |
1309 | #endif |
1310 | return &f16_vmin_config; |
1311 | } |
1312 | |
1313 | const struct xnn_binary_elementwise_config* xnn_init_f16_vmul_config() { |
1314 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
1315 | if (hardware_config == NULL || !is_f16_compatible_config(hardware_config)) { |
1316 | return NULL; |
1317 | } |
1318 | #if XNN_PLATFORM_WINDOWS |
1319 | InitOnceExecuteOnce(&init_guard_f16_vmul, &init_f16_vmul_config_windows, NULL, NULL); |
1320 | #else |
1321 | pthread_once(&init_guard_f16_vmul, &init_f16_vmul_config); |
1322 | #endif |
1323 | return &f16_vmul_config; |
1324 | } |
1325 | |
1326 | const struct xnn_binary_elementwise_config* xnn_init_f16_vsub_config() { |
1327 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
1328 | if (hardware_config == NULL || !is_f16_compatible_config(hardware_config)) { |
1329 | return NULL; |
1330 | } |
1331 | #if XNN_PLATFORM_WINDOWS |
1332 | InitOnceExecuteOnce(&init_guard_f16_vsub, &init_f16_vsub_config_windows, NULL, NULL); |
1333 | #else |
1334 | pthread_once(&init_guard_f16_vsub, &init_f16_vsub_config); |
1335 | #endif |
1336 | return &f16_vsub_config; |
1337 | } |
1338 | |
1339 | const struct xnn_binary_elementwise_config* xnn_init_f16_vsqrdiff_config() { |
1340 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
1341 | if (hardware_config == NULL || !is_f16_compatible_config(hardware_config)) { |
1342 | return NULL; |
1343 | } |
1344 | #if XNN_PLATFORM_WINDOWS |
1345 | InitOnceExecuteOnce(&init_guard_f16_vsqrdiff, &init_f16_vsqrdiff_config_windows, NULL, NULL); |
1346 | #else |
1347 | pthread_once(&init_guard_f16_vsqrdiff, &init_f16_vsqrdiff_config); |
1348 | #endif |
1349 | return &f16_vsqrdiff_config; |
1350 | } |
1351 | |
1352 | const struct xnn_binary_elementwise_config* xnn_init_f32_vadd_config() { |
1353 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
1354 | if (hardware_config == NULL) { |
1355 | return NULL; |
1356 | } |
1357 | #if XNN_PLATFORM_WINDOWS |
1358 | InitOnceExecuteOnce(&init_guard_f32_vadd, &init_f32_vadd_config_windows, NULL, NULL); |
1359 | #else |
1360 | pthread_once(&init_guard_f32_vadd, &init_f32_vadd_config); |
1361 | #endif |
1362 | return &f32_vadd_config; |
1363 | } |
1364 | |
1365 | const struct xnn_binary_elementwise_config* xnn_init_f32_vdiv_config() { |
1366 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
1367 | if (hardware_config == NULL) { |
1368 | return NULL; |
1369 | } |
1370 | #if XNN_PLATFORM_WINDOWS |
1371 | InitOnceExecuteOnce(&init_guard_f32_vdiv, &init_f32_vdiv_config_windows, NULL, NULL); |
1372 | #else |
1373 | pthread_once(&init_guard_f32_vdiv, &init_f32_vdiv_config); |
1374 | #endif |
1375 | return &f32_vdiv_config; |
1376 | } |
1377 | |
1378 | const struct xnn_binary_elementwise_config* xnn_init_f32_vmax_config() { |
1379 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
1380 | if (hardware_config == NULL) { |
1381 | return NULL; |
1382 | } |
1383 | #if XNN_PLATFORM_WINDOWS |
1384 | InitOnceExecuteOnce(&init_guard_f32_vmax, &init_f32_vmax_config_windows, NULL, NULL); |
1385 | #else |
1386 | pthread_once(&init_guard_f32_vmax, &init_f32_vmax_config); |
1387 | #endif |
1388 | return &f32_vmax_config; |
1389 | } |
1390 | |
1391 | const struct xnn_binary_elementwise_config* xnn_init_f32_vmin_config() { |
1392 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
1393 | if (hardware_config == NULL) { |
1394 | return NULL; |
1395 | } |
1396 | #if XNN_PLATFORM_WINDOWS |
1397 | InitOnceExecuteOnce(&init_guard_f32_vmin, &init_f32_vmin_config_windows, NULL, NULL); |
1398 | #else |
1399 | pthread_once(&init_guard_f32_vmin, &init_f32_vmin_config); |
1400 | #endif |
1401 | return &f32_vmin_config; |
1402 | } |
1403 | |
1404 | const struct xnn_binary_elementwise_config* xnn_init_f32_vmul_config() { |
1405 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
1406 | if (hardware_config == NULL) { |
1407 | return NULL; |
1408 | } |
1409 | #if XNN_PLATFORM_WINDOWS |
1410 | InitOnceExecuteOnce(&init_guard_f32_vmul, &init_f32_vmul_config_windows, NULL, NULL); |
1411 | #else |
1412 | pthread_once(&init_guard_f32_vmul, &init_f32_vmul_config); |
1413 | #endif |
1414 | return &f32_vmul_config; |
1415 | } |
1416 | |
1417 | const struct xnn_binary_elementwise_config* xnn_init_f32_vsub_config() { |
1418 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
1419 | if (hardware_config == NULL) { |
1420 | return NULL; |
1421 | } |
1422 | #if XNN_PLATFORM_WINDOWS |
1423 | InitOnceExecuteOnce(&init_guard_f32_vsub, &init_f32_vsub_config_windows, NULL, NULL); |
1424 | #else |
1425 | pthread_once(&init_guard_f32_vsub, &init_f32_vsub_config); |
1426 | #endif |
1427 | return &f32_vsub_config; |
1428 | } |
1429 | |
1430 | const struct xnn_binary_elementwise_config* xnn_init_f32_vsqrdiff_config() { |
1431 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
1432 | if (hardware_config == NULL) { |
1433 | return NULL; |
1434 | } |
1435 | #if XNN_PLATFORM_WINDOWS |
1436 | InitOnceExecuteOnce(&init_guard_f32_vsqrdiff, &init_f32_vsqrdiff_config_windows, NULL, NULL); |
1437 | #else |
1438 | pthread_once(&init_guard_f32_vsqrdiff, &init_f32_vsqrdiff_config); |
1439 | #endif |
1440 | return &f32_vsqrdiff_config; |
1441 | } |
1442 | |
1443 | const struct xnn_binary_elementwise_config* xnn_init_qs8_vadd_config() { |
1444 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
1445 | if (hardware_config == NULL) { |
1446 | return NULL; |
1447 | } |
1448 | #if XNN_PLATFORM_WINDOWS |
1449 | InitOnceExecuteOnce(&init_guard_qs8_vadd, &init_qs8_vadd_config_windows, NULL, NULL); |
1450 | #else |
1451 | pthread_once(&init_guard_qs8_vadd, &init_qs8_vadd_config); |
1452 | #endif |
1453 | return &qs8_vadd_config; |
1454 | } |
1455 | |
1456 | const struct xnn_binary_elementwise_config* xnn_init_qs8_vmul_config() { |
1457 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
1458 | if (hardware_config == NULL) { |
1459 | return NULL; |
1460 | } |
1461 | #if XNN_PLATFORM_WINDOWS |
1462 | InitOnceExecuteOnce(&init_guard_qs8_vmul, &init_qs8_vmul_config_windows, NULL, NULL); |
1463 | #else |
1464 | pthread_once(&init_guard_qs8_vmul, &init_qs8_vmul_config); |
1465 | #endif |
1466 | return &qs8_vmul_config; |
1467 | } |
1468 | |
1469 | const struct xnn_binary_elementwise_config* xnn_init_qu8_vadd_config() { |
1470 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
1471 | if (hardware_config == NULL) { |
1472 | return NULL; |
1473 | } |
1474 | #if XNN_PLATFORM_WINDOWS |
1475 | InitOnceExecuteOnce(&init_guard_qu8_vadd, &init_qu8_vadd_config_windows, NULL, NULL); |
1476 | #else |
1477 | pthread_once(&init_guard_qu8_vadd, &init_qu8_vadd_config); |
1478 | #endif |
1479 | return &qu8_vadd_config; |
1480 | } |
1481 | |
1482 | const struct xnn_binary_elementwise_config* xnn_init_qu8_vmul_config() { |
1483 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
1484 | if (hardware_config == NULL) { |
1485 | return NULL; |
1486 | } |
1487 | #if XNN_PLATFORM_WINDOWS |
1488 | InitOnceExecuteOnce(&init_guard_qu8_vmul, &init_qu8_vmul_config_windows, NULL, NULL); |
1489 | #else |
1490 | pthread_once(&init_guard_qu8_vmul, &init_qu8_vmul_config); |
1491 | #endif |
1492 | return &qu8_vmul_config; |
1493 | } |
1494 | |