1// Copyright 2021 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6#include <assert.h>
7#include <math.h>
8#include <stddef.h>
9#include <stdint.h>
10#include <string.h>
11
12#include <fp16.h>
13
14#include <xnnpack/math.h>
15#include <xnnpack/microparams-init.h>
16#include <xnnpack/unaligned.h>
17
18
19size_t xnn_init_qc8_conv_minmax_fp32_scalar_fmagic_params(
20 union xnn_qc8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
21 int8_t output_zero_point,
22 int8_t output_min,
23 int8_t output_max)
24{
25 params->fp32_scalar_fmagic.output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
26 params->fp32_scalar_fmagic.output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
27 params->fp32_scalar_fmagic.magic_bias = 12582912.0f;
28 params->fp32_scalar_fmagic.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
29 return sizeof(params->fp32_scalar_fmagic);
30}
31
32size_t xnn_init_qc8_conv_minmax_fp32_scalar_imagic_params(
33 union xnn_qc8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
34 int8_t output_zero_point,
35 int8_t output_min,
36 int8_t output_max)
37{
38 const float output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
39 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
40 params->fp32_scalar_imagic.magic_bias = 12582912.0f;
41 params->fp32_scalar_imagic.magic_min = (int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point);
42 params->fp32_scalar_imagic.magic_max = (int32_t) float_as_uint32(12582912.0f + output_max_less_zero_point);
43 params->fp32_scalar_imagic.magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
44 return sizeof(params->fp32_scalar_imagic);
45}
46
47size_t xnn_init_qc8_conv_minmax_fp32_scalar_lrintf_params(
48 union xnn_qc8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
49 int8_t output_zero_point,
50 int8_t output_min,
51 int8_t output_max)
52{
53 params->fp32_scalar_lrintf.output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
54 params->fp32_scalar_lrintf.output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
55 params->fp32_scalar_lrintf.output_zero_point = (int32_t) output_zero_point;
56 return sizeof(params->fp32_scalar_lrintf);
57}
58
59#if XNN_ARCH_X86 || XNN_ARCH_X86_64
60size_t xnn_init_qc8_conv_minmax_fp32_sse2_params(
61 union xnn_qc8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
62 int8_t output_zero_point,
63 int8_t output_min,
64 int8_t output_max)
65{
66 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
67 for (uint32_t i = 0; i < 4; i++) {
68 params->fp32_sse2.output_max_less_zero_point[i] = output_max_less_zero_point;
69 }
70 for (uint32_t i = 0; i < 8; i++) {
71 params->fp32_sse2.output_zero_point[i] = (int16_t) output_zero_point;
72 params->fp32_sse2.output_min[i] = (int16_t) output_min;
73 }
74 return sizeof(params->fp32_sse2);
75}
76
77size_t xnn_init_qc8_conv_minmax_fp32_sse4_params(
78 union xnn_qc8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
79 int8_t output_zero_point,
80 int8_t output_min,
81 int8_t output_max)
82{
83 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
84 for (uint32_t i = 0; i < 4; i++) {
85 params->fp32_sse4.output_max_less_zero_point[i] = output_max_less_zero_point;
86 }
87 for (uint32_t i = 0; i < 8; i++) {
88 params->fp32_sse4.output_zero_point[i] = (int16_t) output_zero_point;
89 }
90 for (uint32_t i = 0; i < 16; i++) {
91 params->fp32_sse4.output_min[i] = output_min;
92 }
93 return sizeof(params->fp32_sse4);
94}
95
96size_t xnn_init_qc8_conv_minmax_fp32_avx2_params(
97 union xnn_qc8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
98 int8_t output_zero_point,
99 int8_t output_min,
100 int8_t output_max)
101{
102 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
103 for (uint32_t i = 0; i < 8; i++) {
104 params->fp32_avx2.output_max_less_zero_point[i] = output_max_less_zero_point;
105 }
106 for (uint32_t i = 0; i < 16; i++) {
107 params->fp32_avx2.output_zero_point[i] = (int16_t) output_zero_point;
108 }
109 for (uint32_t i = 0; i < 32; i++) {
110 params->fp32_avx2.output_min[i] = output_min;
111 }
112 return sizeof(params->fp32_avx2);
113}
114
115size_t xnn_init_qc8_conv_minmax_fp32_avx512_params(
116 union xnn_qc8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
117 int8_t output_zero_point,
118 int8_t output_min,
119 int8_t output_max)
120{
121 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
122 for (uint32_t i = 0; i < 16; i++) {
123 params->fp32_avx512.output_max_less_zero_point[i] = output_max_less_zero_point;
124 }
125 for (uint32_t i = 0; i < 32; i++) {
126 params->fp32_avx512.output_zero_point[i] = (int16_t) output_zero_point;
127 }
128 for (uint32_t i = 0; i < 64; i++) {
129 params->fp32_avx512.output_min[i] = output_min;
130 }
131 return sizeof(params->fp32_avx512);
132}
133#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
134
135#if XNN_ARCH_ARM
136size_t xnn_init_qc8_conv_minmax_fp32_armsimd32_params(
137 union xnn_qc8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
138 int8_t output_zero_point,
139 int8_t output_min,
140 int8_t output_max)
141{
142 params->fp32_armsimd32.magic_bias = 12582912.0f;
143 params->fp32_armsimd32.magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
144 params->fp32_armsimd32.output_min = (uint32_t) (uint8_t) output_min * UINT32_C(0x01010101);
145 params->fp32_armsimd32.output_max = (uint32_t) (uint8_t) output_max * UINT32_C(0x01010101);
146 return sizeof(params->fp32_armsimd32);
147}
148#endif // XNN_ARCH_ARM
149
150#if XNN_ARCH_ARM || XNN_ARCH_ARM64
151size_t xnn_init_qc8_conv_minmax_fp32_neon_params(
152 union xnn_qc8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
153 int8_t output_zero_point,
154 int8_t output_min,
155 int8_t output_max)
156{
157 params->fp32_neon.magic_bias = 12582912.0f;
158 params->fp32_neon.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
159 params->fp32_neon.output_min = output_min;
160 params->fp32_neon.output_max = output_max;
161 return sizeof(params->fp32_neon);
162}
163
164size_t xnn_init_qc8_conv_minmax_fp32_neonv8_params(
165 union xnn_qc8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
166 int8_t output_zero_point,
167 int8_t output_min,
168 int8_t output_max)
169{
170 params->fp32_neonv8.output_zero_point = (int16_t) output_zero_point;
171 params->fp32_neonv8.output_min = output_min;
172 params->fp32_neonv8.output_max = output_max;
173 return sizeof(params->fp32_neonv8);
174}
175#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
176
177#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
178size_t xnn_init_qc8_conv_minmax_fp32_wasmsimd_params(
179 union xnn_qc8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
180 int8_t output_zero_point,
181 int8_t output_min,
182 int8_t output_max)
183{
184 const float output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
185 const int32_t magic_min = (int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point);
186 const int32_t magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
187 for (uint32_t i = 0; i < 2; i++) {
188 params->fp32_wasmsimd.magic_bias[i] = 12582912.0f;
189 params->fp32_wasmsimd.magic_min[i] = magic_min;
190 params->fp32_wasmsimd.magic_bias_less_output_zero_point[i] = magic_bias_less_zero_point;
191 }
192 for (uint32_t i = 0; i < 8; i++) {
193 params->fp32_wasmsimd.output_max[i] = output_max;
194 }
195 return sizeof(params->fp32_wasmsimd);
196}
197#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
198
199size_t xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params(
200 union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
201 float scale,
202 int8_t output_zero_point,
203 int8_t output_min,
204 int8_t output_max)
205{
206 assert(scale >= 0x1.0p-32f);
207 assert(scale < 256.0f);
208
209 params->fp32_scalar_fmagic.scale = scale;
210 params->fp32_scalar_fmagic.output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
211 params->fp32_scalar_fmagic.output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
212 params->fp32_scalar_fmagic.magic_bias = 12582912.0f;
213 params->fp32_scalar_fmagic.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
214 return sizeof(params->fp32_scalar_fmagic);
215}
216
217size_t xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params(
218 union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
219 float scale,
220 int8_t output_zero_point,
221 int8_t output_min,
222 int8_t output_max)
223{
224 assert(scale >= 0x1.0p-32f);
225 assert(scale < 256.0f);
226
227 const float output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
228 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
229 params->fp32_scalar_imagic.scale = scale;
230 params->fp32_scalar_imagic.magic_bias = 12582912.0f;
231 params->fp32_scalar_imagic.magic_min = (int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point);
232 params->fp32_scalar_imagic.magic_max = (int32_t) float_as_uint32(12582912.0f + output_max_less_zero_point);
233 params->fp32_scalar_imagic.magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
234 return sizeof(params->fp32_scalar_imagic);
235}
236
237size_t xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params(
238 union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
239 float scale,
240 int8_t output_zero_point,
241 int8_t output_min,
242 int8_t output_max)
243{
244 assert(scale >= 0x1.0p-32f);
245 assert(scale < 256.0f);
246
247 params->fp32_scalar_lrintf.scale = scale;
248 params->fp32_scalar_lrintf.output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
249 params->fp32_scalar_lrintf.output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
250 params->fp32_scalar_lrintf.output_zero_point = (int32_t) output_zero_point;
251 return sizeof(params->fp32_scalar_lrintf);
252}
253
254size_t xnn_init_qs8_conv_minmax_rndnu_scalar_params(
255 union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
256 float scale,
257 int8_t output_zero_point,
258 int8_t output_min,
259 int8_t output_max)
260{
261 assert(scale >= 0x1.0p-32f);
262 assert(scale < 256.0f);
263
264 // Compute requantization parameters.
265 const uint32_t scale_bits = float_as_uint32(scale);
266
267 const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000);
268 assert(multiplier >= INT32_C(0x00800000));
269 assert(multiplier <= INT32_C(0x00FFFFFF));
270
271 // Shift is in [16, 55] range.
272 const uint32_t shift = 127 + 23 - (scale_bits >> 23);
273 assert(shift >= 16);
274 assert(shift < 56);
275
276 const int64_t rounding = INT64_C(1) << (shift - 1);
277 const int32_t output_min_less_zero_point = (int32_t) output_min - (int32_t) output_zero_point;
278 const int32_t output_max_less_zero_point = (int32_t) output_max - (int32_t) output_zero_point;
279
280 params->rndnu_scalar.multiplier = multiplier;
281 params->rndnu_scalar.shift = shift;
282 params->rndnu_scalar.rounding = rounding;
283 params->rndnu_scalar.output_min_less_zero_point = output_min_less_zero_point;
284 params->rndnu_scalar.output_max_less_zero_point = output_max_less_zero_point;
285 params->rndnu_scalar.output_zero_point = (int32_t) output_zero_point;
286 return sizeof(params->rndnu_scalar);
287}
288
289#if XNN_ARCH_X86 || XNN_ARCH_X86_64
290size_t xnn_init_qs8_conv_minmax_fp32_sse2_params(
291 union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
292 float scale,
293 int8_t output_zero_point,
294 int8_t output_min,
295 int8_t output_max)
296{
297 assert(scale >= 0x1.0p-32f);
298 assert(scale < 256.0f);
299
300 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
301 for (uint32_t i = 0; i < 4; i++) {
302 params->fp32_sse2.scale[i] = scale;
303 params->fp32_sse2.output_max_less_zero_point[i] = output_max_less_zero_point;
304 }
305 for (uint32_t i = 0; i < 8; i++) {
306 params->fp32_sse2.output_zero_point[i] = (int16_t) output_zero_point;
307 params->fp32_sse2.output_min[i] = (int16_t) output_min;
308 }
309 return sizeof(params->fp32_sse2);
310}
311
312size_t xnn_init_qs8_conv_minmax_fp32_sse4_params(
313 union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
314 float scale,
315 int8_t output_zero_point,
316 int8_t output_min,
317 int8_t output_max)
318{
319 assert(scale >= 0x1.0p-32f);
320 assert(scale < 256.0f);
321
322 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
323 for (uint32_t i = 0; i < 4; i++) {
324 params->fp32_sse4.scale[i] = scale;
325 params->fp32_sse4.output_max_less_zero_point[i] = output_max_less_zero_point;
326 }
327 for (uint32_t i = 0; i < 8; i++) {
328 params->fp32_sse4.output_zero_point[i] = (int16_t) output_zero_point;
329 }
330 for (uint32_t i = 0; i < 16; i++) {
331 params->fp32_sse4.output_min[i] = output_min;
332 }
333 return sizeof(params->fp32_sse4);
334}
335
336size_t xnn_init_qs8_conv_minmax_fp32_avx2_params(
337 union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
338 float scale,
339 int8_t output_zero_point,
340 int8_t output_min,
341 int8_t output_max)
342{
343 assert(scale >= 0x1.0p-32f);
344 assert(scale < 256.0f);
345
346 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
347 for (uint32_t i = 0; i < 8; i++) {
348 params->fp32_avx2.scale[i] = scale;
349 params->fp32_avx2.output_max_less_zero_point[i] = output_max_less_zero_point;
350 }
351 for (uint32_t i = 0; i < 16; i++) {
352 params->fp32_avx2.output_zero_point[i] = (int16_t) output_zero_point;
353 }
354 for (uint32_t i = 0; i < 32; i++) {
355 params->fp32_avx2.output_min[i] = output_min;
356 }
357 return sizeof(params->fp32_avx2);
358}
359
360size_t xnn_init_qs8_conv_minmax_fp32_avx512_params(
361 union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
362 float scale,
363 int8_t output_zero_point,
364 int8_t output_min,
365 int8_t output_max)
366{
367 assert(scale >= 0x1.0p-32f);
368 assert(scale < 256.0f);
369
370 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
371 for (uint32_t i = 0; i < 16; i++) {
372 params->fp32_avx512.scale[i] = scale;
373 params->fp32_avx512.output_max_less_zero_point[i] = output_max_less_zero_point;
374 }
375 for (uint32_t i = 0; i < 32; i++) {
376 params->fp32_avx512.output_zero_point[i] = (int16_t) output_zero_point;
377 }
378 for (uint32_t i = 0; i < 64; i++) {
379 params->fp32_avx512.output_min[i] = output_min;
380 }
381 return sizeof(params->fp32_avx512);
382}
383#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
384
385#if XNN_ARCH_ARM
386size_t xnn_init_qs8_conv_minmax_fp32_armsimd32_params(
387 union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
388 float scale,
389 int8_t output_zero_point,
390 int8_t output_min,
391 int8_t output_max)
392{
393 assert(scale >= 0x1.0p-32f);
394 assert(scale < 256.0f);
395
396 params->fp32_armsimd32.scale = scale;
397 params->fp32_armsimd32.magic_bias = 12582912.0f;
398 params->fp32_armsimd32.magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
399 params->fp32_armsimd32.output_min = (uint32_t) (uint8_t) output_min * UINT32_C(0x01010101);
400 params->fp32_armsimd32.output_max = (uint32_t) (uint8_t) output_max * UINT32_C(0x01010101);
401 return sizeof(params->fp32_armsimd32);
402}
403#endif // XNN_ARCH_ARM
404
405#if XNN_ARCH_ARM || XNN_ARCH_ARM64
406size_t xnn_init_qs8_conv_minmax_fp32_neon_params(
407 union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
408 float scale,
409 int8_t output_zero_point,
410 int8_t output_min,
411 int8_t output_max)
412{
413 assert(scale >= 0x1.0p-32f);
414 assert(scale < 256.0f);
415
416 params->fp32_neon.scale = scale;
417 params->fp32_neon.magic_bias = 12582912.0f;
418 params->fp32_neon.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
419 params->fp32_neon.output_min = output_min;
420 params->fp32_neon.output_max = output_max;
421 return sizeof(params->fp32_neon);
422}
423
424size_t xnn_init_qs8_conv_minmax_fp32_neonv8_params(
425 union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
426 float scale,
427 int8_t output_zero_point,
428 int8_t output_min,
429 int8_t output_max)
430{
431 assert(scale >= 0x1.0p-32f);
432 assert(scale < 256.0f);
433
434 params->fp32_neonv8.scale = scale;
435 params->fp32_neonv8.output_zero_point = (int16_t) output_zero_point;
436 params->fp32_neonv8.output_min = output_min;
437 params->fp32_neonv8.output_max = output_max;
438 return sizeof(params->fp32_neonv8);
439}
440
441size_t xnn_init_qs8_conv_minmax_rndnu_neon_params(
442 union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
443 float scale,
444 int8_t output_zero_point,
445 int8_t output_min,
446 int8_t output_max)
447{
448 assert(scale >= 0x1.0p-32f);
449 assert(scale < 256.0f);
450
451 // Compute requantization parameters.
452 const uint32_t scale_bits = float_as_uint32(scale);
453
454 // Multiplier is in [0x40000000, 0x7FFFFF80] range.
455 const int32_t multiplier = (int32_t) (((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
456 assert(multiplier >= INT32_C(0x40000000));
457 assert(multiplier <= INT32_C(0x7FFFFF80));
458
459 // Shift is in [-8, 31] range.
460 const int32_t shift = 127 + 31 - 32 - (scale_bits >> 23);
461 assert(shift >= -8);
462 assert(shift < 32);
463
464 // Split shift into pre_shift + post_shift, post_shift in [1, 31] range.
465 const int32_t post_shift = math_max_s32(shift, 1);
466 const int32_t pre_shift = shift - post_shift;
467
468 params->rndnu_neon.right_pre_shift = -pre_shift;
469 params->rndnu_neon.multiplier = multiplier;
470 params->rndnu_neon.right_post_shift = -post_shift;
471 params->rndnu_neon.output_zero_point = (int16_t) output_zero_point;
472 params->rndnu_neon.output_min = output_min;
473 params->rndnu_neon.output_max = output_max;
474 return sizeof(params->rndnu_neon);
475}
476#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
477
478#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
479size_t xnn_init_qs8_conv_minmax_fp32_wasmsimd_params(
480 union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
481 float scale,
482 int8_t output_zero_point,
483 int8_t output_min,
484 int8_t output_max)
485{
486 assert(scale >= 0x1.0p-32f);
487 assert(scale < 256.0f);
488
489 const float output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
490 const int32_t magic_min = (int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point);
491 const int32_t magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
492 for (uint32_t i = 0; i < 2; i++) {
493 params->fp32_wasmsimd.scale[i] = scale;
494 params->fp32_wasmsimd.magic_bias[i] = 12582912.0f;
495 params->fp32_wasmsimd.magic_min[i] = magic_min;
496 params->fp32_wasmsimd.magic_bias_less_output_zero_point[i] = magic_bias_less_zero_point;
497 }
498 for (uint32_t i = 0; i < 8; i++) {
499 params->fp32_wasmsimd.output_max[i] = output_max;
500 }
501 return sizeof(params->fp32_wasmsimd);
502}
503#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
504
505size_t xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params(
506 union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
507 uint8_t kernel_zero_point,
508 float scale,
509 uint8_t output_zero_point,
510 uint8_t output_min,
511 uint8_t output_max)
512{
513 assert(scale >= 0x1.0p-32f);
514 assert(scale < 256.0f);
515
516 params->fp32_scalar_fmagic.kernel_zero_point = (int32_t) kernel_zero_point;
517 params->fp32_scalar_fmagic.scale = scale;
518 params->fp32_scalar_fmagic.output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
519 params->fp32_scalar_fmagic.output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
520 params->fp32_scalar_fmagic.magic_bias = 12582912.0f;
521 params->fp32_scalar_fmagic.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
522 return sizeof(params->fp32_scalar_fmagic);
523}
524
525size_t xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params(
526 union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
527 uint8_t kernel_zero_point,
528 float scale,
529 uint8_t output_zero_point,
530 uint8_t output_min,
531 uint8_t output_max)
532{
533 assert(scale >= 0x1.0p-32f);
534 assert(scale < 256.0f);
535
536 const float output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
537 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
538 params->fp32_scalar_imagic.kernel_zero_point = (int32_t) kernel_zero_point;
539 params->fp32_scalar_imagic.scale = scale;
540 params->fp32_scalar_imagic.magic_bias = 12582912.0f;
541 params->fp32_scalar_imagic.magic_min = (int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point);
542 params->fp32_scalar_imagic.magic_max = (int32_t) float_as_uint32(12582912.0f + output_max_less_zero_point);
543 params->fp32_scalar_imagic.magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
544 return sizeof(params->fp32_scalar_imagic);
545}
546
547size_t xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params(
548 union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
549 uint8_t kernel_zero_point,
550 float scale,
551 uint8_t output_zero_point,
552 uint8_t output_min,
553 uint8_t output_max)
554{
555 assert(scale >= 0x1.0p-32f);
556 assert(scale < 256.0f);
557
558 params->fp32_scalar_lrintf.kernel_zero_point = (int32_t) kernel_zero_point;
559 params->fp32_scalar_lrintf.scale = scale;
560 params->fp32_scalar_lrintf.output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
561 params->fp32_scalar_lrintf.output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
562 params->fp32_scalar_lrintf.output_zero_point = (int32_t) output_zero_point;
563 return sizeof(params->fp32_scalar_lrintf);
564}
565
566size_t xnn_init_qu8_conv_minmax_rndnu_scalar_params(
567 union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
568 uint8_t kernel_zero_point,
569 float scale,
570 uint8_t output_zero_point,
571 uint8_t output_min,
572 uint8_t output_max)
573{
574 assert(scale >= 0x1.0p-32f);
575 assert(scale < 256.0f);
576
577 // Compute requantization parameters.
578 const uint32_t scale_bits = float_as_uint32(scale);
579
580 // Multiplier is in [0x00800000, 0x007FFFFF] range.
581 const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000);
582 assert(multiplier >= INT32_C(0x00800000));
583 assert(multiplier <= INT32_C(0x00FFFFFF));
584
585 // Shift is in [16, 55] range.
586 const uint32_t shift = 127 + 23 - (scale_bits >> 23);
587 assert(shift >= 16);
588 assert(shift < 56);
589
590 const int64_t rounding = INT64_C(1) << (shift - 1);
591 const int32_t output_min_less_zero_point = (int32_t) output_min - (int32_t) output_zero_point;
592 const int32_t output_max_less_zero_point = (int32_t) output_max - (int32_t) output_zero_point;
593
594 params->rndnu_scalar.kernel_zero_point = (int32_t) kernel_zero_point;
595 params->rndnu_scalar.multiplier = multiplier;
596 params->rndnu_scalar.rounding = rounding;
597 params->rndnu_scalar.shift = shift;
598 params->rndnu_scalar.output_min_less_zero_point = output_min_less_zero_point;
599 params->rndnu_scalar.output_max_less_zero_point = output_max_less_zero_point;
600 params->rndnu_scalar.output_zero_point = (int32_t) output_zero_point;
601 return sizeof(params->rndnu_scalar);
602}
603
604#if XNN_ARCH_X86 || XNN_ARCH_X86_64
605size_t xnn_init_qu8_conv_minmax_fp32_sse2_params(
606 union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
607 uint8_t kernel_zero_point,
608 float scale,
609 uint8_t output_zero_point,
610 uint8_t output_min,
611 uint8_t output_max)
612{
613 assert(scale >= 0x1.0p-32f);
614 assert(scale < 256.0f);
615
616 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
617 for (uint32_t i = 0; i < 4; i++) {
618 params->fp32_sse2.scale[i] = scale;
619 params->fp32_sse2.output_max_less_zero_point[i] = output_max_less_zero_point;
620 }
621 for (uint32_t i = 0; i < 8; i++) {
622 params->fp32_sse2.kernel_zero_point[i] = (int16_t) kernel_zero_point;
623 params->fp32_sse2.output_zero_point[i] = (int16_t) output_zero_point;
624 }
625 for (uint32_t i = 0; i < 16; i++) {
626 params->fp32_sse2.output_min[i] = output_min;
627 }
628 return sizeof(params->fp32_sse2);
629}
630
631size_t xnn_init_qu8_conv_minmax_fp32_avx2_params(
632 union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
633 uint8_t kernel_zero_point,
634 float scale,
635 uint8_t output_zero_point,
636 uint8_t output_min,
637 uint8_t output_max)
638{
639 assert(scale >= 0x1.0p-32f);
640 assert(scale < 256.0f);
641
642 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
643 for (uint32_t i = 0; i < 8; i++) {
644 params->fp32_avx2.scale[i] = scale;
645 params->fp32_avx2.output_max_less_zero_point[i] = output_max_less_zero_point;
646 }
647 for (uint32_t i = 0; i < 16; i++) {
648 params->fp32_avx2.kernel_zero_point[i] = (int16_t) kernel_zero_point;
649 params->fp32_avx2.output_zero_point[i] = (int16_t) output_zero_point;
650 }
651 for (uint32_t i = 0; i < 32; i++) {
652 params->fp32_avx2.output_min[i] = output_min;
653 }
654 return sizeof(params->fp32_avx2);
655}
656
657size_t xnn_init_qu8_conv_minmax_fp32_avx512_params(
658 union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
659 uint8_t kernel_zero_point,
660 float scale,
661 uint8_t output_zero_point,
662 uint8_t output_min,
663 uint8_t output_max)
664{
665 assert(scale >= 0x1.0p-32f);
666 assert(scale < 256.0f);
667
668 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
669 for (uint32_t i = 0; i < 16; i++) {
670 params->fp32_avx512.scale[i] = scale;
671 params->fp32_avx512.output_max_less_zero_point[i] = output_max_less_zero_point;
672 }
673 for (uint32_t i = 0; i < 32; i++) {
674 params->fp32_avx512.kernel_zero_point[i] = (int16_t) (uint16_t) kernel_zero_point;
675 params->fp32_avx512.output_zero_point[i] = (int16_t) (uint16_t) output_zero_point;
676 }
677 for (uint32_t i = 0; i < 64; i++) {
678 params->fp32_avx512.output_min[i] = output_min;
679 }
680 return sizeof(params->fp32_avx512);
681}
682#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
683
684#if XNN_ARCH_ARM
685size_t xnn_init_qu8_conv_minmax_fp32_armsimd32_params(
686 union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
687 uint8_t kernel_zero_point,
688 float scale,
689 uint8_t output_zero_point,
690 uint8_t output_min,
691 uint8_t output_max)
692{
693 assert(scale >= 0x1.0p-32f);
694 assert(scale < 256.0f);
695
696 const int32_t minus_kernel_zero_point = -(int32_t) kernel_zero_point;
697 params->fp32_armsimd32.scale = scale;
698 params->fp32_armsimd32.magic_bias = 12582912.0f;
699 params->fp32_armsimd32.minus_kernel_zero_point = (uint32_t) (uint16_t) minus_kernel_zero_point * UINT32_C(0x00010001);
700 params->fp32_armsimd32.magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
701 params->fp32_armsimd32.output_min = (uint32_t) output_min * UINT32_C(0x01010101);
702 params->fp32_armsimd32.output_max = (uint32_t) output_max * UINT32_C(0x01010101);
703 return sizeof(params->fp32_armsimd32);
704}
705#endif // XNN_ARCH_ARM
706
707#if XNN_ARCH_ARM || XNN_ARCH_ARM64
708size_t xnn_init_qu8_conv_minmax_fp32_neon_params(
709 union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
710 uint8_t kernel_zero_point,
711 float scale,
712 uint8_t output_zero_point,
713 uint8_t output_min,
714 uint8_t output_max)
715{
716 assert(scale >= 0x1.0p-32f);
717 assert(scale < 256.0f);
718
719 params->fp32_neon.kernel_zero_point[0] = kernel_zero_point;
720 params->fp32_neon.kernel_zero_point[1] = kernel_zero_point;
721 params->fp32_neon.kernel_zero_point[2] = kernel_zero_point;
722 params->fp32_neon.kernel_zero_point[3] = kernel_zero_point;
723 params->fp32_neon.scale = scale;
724 params->fp32_neon.magic_bias = 12582912.0f;
725 params->fp32_neon.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
726 params->fp32_neon.output_min = output_min;
727 params->fp32_neon.output_max = output_max;
728 return sizeof(params->fp32_neon);
729}
730
731size_t xnn_init_qu8_conv_minmax_fp32_neonv8_params(
732 union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
733 uint8_t kernel_zero_point,
734 float scale,
735 uint8_t output_zero_point,
736 uint8_t output_min,
737 uint8_t output_max)
738{
739 assert(scale >= 0x1.0p-32f);
740 assert(scale < 256.0f);
741
742 params->fp32_neonv8.kernel_zero_point[0] = kernel_zero_point;
743 params->fp32_neonv8.kernel_zero_point[1] = kernel_zero_point;
744 params->fp32_neonv8.kernel_zero_point[2] = kernel_zero_point;
745 params->fp32_neonv8.kernel_zero_point[3] = kernel_zero_point;
746 params->fp32_neonv8.scale = scale;
747 params->fp32_neonv8.output_zero_point = (int16_t) (uint16_t) output_zero_point;
748 params->fp32_neonv8.output_min = output_min;
749 params->fp32_neonv8.output_max = output_max;
750 return sizeof(params->fp32_neonv8);
751}
752
753size_t xnn_init_qu8_conv_minmax_rndnu_neon_params(
754 union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
755 uint8_t kernel_zero_point,
756 float scale,
757 uint8_t output_zero_point,
758 uint8_t output_min,
759 uint8_t output_max)
760{
761 assert(scale >= 0x1.0p-32f);
762 assert(scale < 256.0f);
763
764 // Compute requantization parameters.
765 const uint32_t scale_bits = float_as_uint32(scale);
766
767 // Multiplier is in [0x40000000, 0x7FFFFF80] range.
768 const int32_t multiplier = (int32_t) (((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
769 assert(multiplier >= INT32_C(0x40000000));
770 assert(multiplier <= INT32_C(0x7FFFFF80));
771
772 // Shift is in [-8, 31] range.
773 const int32_t shift = 127 + 31 - 32 - (scale_bits >> 23);
774 assert(shift >= -8);
775 assert(shift < 32);
776
777 // Split shift into pre_shift + post_shift, post_shift in [1, 31] range.
778 const int32_t post_shift = math_max_s32(shift, 1);
779 const int32_t pre_shift = shift - post_shift;
780
781 params->rndnu_neon.kernel_zero_point[0] = kernel_zero_point;
782 params->rndnu_neon.kernel_zero_point[1] = kernel_zero_point;
783 params->rndnu_neon.kernel_zero_point[2] = kernel_zero_point;
784 params->rndnu_neon.kernel_zero_point[3] = kernel_zero_point;
785 params->rndnu_neon.right_pre_shift = -pre_shift;
786 params->rndnu_neon.multiplier = multiplier;
787 params->rndnu_neon.right_post_shift = -post_shift;
788 params->rndnu_neon.output_zero_point = (int16_t) (uint16_t) output_zero_point;
789 params->rndnu_neon.output_min = output_min;
790 params->rndnu_neon.output_max = output_max;
791 return sizeof(params->rndnu_neon);
792}
793#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
794
795#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
796size_t xnn_init_qu8_conv_minmax_fp32_wasmsimd_params(
797 union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
798 uint8_t kernel_zero_point,
799 float scale,
800 uint8_t output_zero_point,
801 uint8_t output_min,
802 uint8_t output_max)
803{
804 assert(scale >= 0x1.0p-32f);
805 assert(scale < 256.0f);
806
807 const float output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
808 const int32_t magic_min = (int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point);
809 const int32_t magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
810 for (uint32_t i = 0; i < 4; i++) {
811 params->fp32_wasmsimd.kernel_zero_point[i] = (int16_t) (uint16_t) kernel_zero_point;
812 }
813 for (uint32_t i = 0; i < 2; i++) {
814 params->fp32_wasmsimd.scale[i] = scale;
815 params->fp32_wasmsimd.magic_bias[i] = 12582912.0f;
816 params->fp32_wasmsimd.magic_min[i] = magic_min;
817 params->fp32_wasmsimd.magic_bias_less_output_zero_point[i] = magic_bias_less_zero_point;
818 }
819 for (uint32_t i = 0; i < 8; i++) {
820 params->fp32_wasmsimd.output_max[i] = output_max;
821 }
822 return sizeof(params->fp32_wasmsimd);
823}
824#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
825
// Copies per-channel scales into a packed buffer, one tile at a time.
//
// For every tile of up to channels_tile channels, the tile's scales are stored
// (via unaligned stores) at consecutive float indices starting at the current
// destination; the destination then advances by stride bytes.
void xnn_init_qc8_scale_fp32_params(
  size_t channels,
  size_t channels_tile,
  size_t stride,
  const float scale[XNN_MIN_ELEMENTS(1)],
  void* packed_w)
{
  uintptr_t dst = (uintptr_t) packed_w;
  for (size_t base = 0; base < channels; base += channels_tile) {
    // The last tile may be shorter than channels_tile.
    const size_t count = min(channels - base, channels_tile);
    const float* src = &scale[base];
    for (size_t k = 0; k < count; k++) {
      unaligned_indexed_store_f32((void*) dst, k, src[k]);
    }
    dst += stride;
  }
}
841
842size_t xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params(
843 union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
844 int32_t init_bias,
845 float scale,
846 int8_t output_zero_point,
847 int8_t output_min,
848 int8_t output_max)
849{
850 assert(scale >= 0x1.0p-32f);
851 assert(scale < 256.0f);
852
853 params->fp32_scalar_fmagic.init_bias = init_bias;
854 params->fp32_scalar_fmagic.scale = scale;
855 params->fp32_scalar_fmagic.output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
856 params->fp32_scalar_fmagic.output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
857 params->fp32_scalar_fmagic.magic_bias = 12582912.0f;
858 params->fp32_scalar_fmagic.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
859 return sizeof(params->fp32_scalar_fmagic);
860}
861
862void xnn_update_qs8_avgpool_minmax_fp32_scalar_fmagic_params(
863 union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
864 int32_t init_bias,
865 float scale)
866{
867 assert(scale >= 0x1.0p-32f);
868 assert(scale < 256.0f);
869
870 params->fp32_scalar_fmagic.init_bias = init_bias;
871 params->fp32_scalar_fmagic.scale = scale;
872}
873
874size_t xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params(
875 union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
876 int32_t init_bias,
877 float scale,
878 int8_t output_zero_point,
879 int8_t output_min,
880 int8_t output_max)
881{
882 assert(scale >= 0x1.0p-32f);
883 assert(scale < 256.0f);
884
885 const float output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
886 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
887 params->fp32_scalar_imagic.init_bias = init_bias;
888 params->fp32_scalar_imagic.scale = scale;
889 params->fp32_scalar_imagic.magic_bias = 12582912.0f;
890 params->fp32_scalar_imagic.magic_min = (int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point);
891 params->fp32_scalar_imagic.magic_max = (int32_t) float_as_uint32(12582912.0f + output_max_less_zero_point);
892 params->fp32_scalar_imagic.magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
893 return sizeof(params->fp32_scalar_imagic);
894}
895
896void xnn_update_qs8_avgpool_minmax_fp32_scalar_imagic_params(
897 union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
898 int32_t init_bias,
899 float scale)
900{
901 assert(scale >= 0x1.0p-32f);
902 assert(scale < 256.0f);
903
904 params->fp32_scalar_imagic.init_bias = init_bias;
905 params->fp32_scalar_imagic.scale = scale;
906}
907
908size_t xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params(
909 union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
910 int32_t init_bias,
911 float scale,
912 int8_t output_zero_point,
913 int8_t output_min,
914 int8_t output_max)
915{
916 assert(scale >= 0x1.0p-32f);
917 assert(scale < 256.0f);
918
919 params->fp32_scalar_lrintf.init_bias = init_bias;
920 params->fp32_scalar_lrintf.scale = scale;
921 params->fp32_scalar_lrintf.output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
922 params->fp32_scalar_lrintf.output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
923 params->fp32_scalar_lrintf.output_zero_point = (int32_t) output_zero_point;
924 return sizeof(params->fp32_scalar_lrintf);
925}
926
927void xnn_update_qs8_avgpool_minmax_fp32_scalar_lrintf_params(
928 union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
929 int32_t init_bias,
930 float scale)
931{
932 assert(scale >= 0x1.0p-32f);
933 assert(scale < 256.0f);
934
935 params->fp32_scalar_lrintf.init_bias = init_bias;
936 params->fp32_scalar_lrintf.scale = scale;
937}
938
939#if XNN_ARCH_X86 || XNN_ARCH_X86_64
940size_t xnn_init_qs8_avgpool_minmax_fp32_sse2_params(
941 union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
942 int32_t init_bias,
943 float scale,
944 int8_t output_zero_point,
945 int8_t output_min,
946 int8_t output_max)
947{
948 assert(scale >= 0x1.0p-32f);
949 assert(scale < 256.0f);
950
951 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
952 for (uint32_t i = 0; i < 4; i++) {
953 params->fp32_sse2.init_bias[i] = init_bias;
954 params->fp32_sse2.scale[i] = scale;
955 params->fp32_sse2.output_max_less_zero_point[i] = output_max_less_zero_point;
956 }
957 for (uint32_t i = 0; i < 8; i++) {
958 params->fp32_sse2.output_zero_point[i] = (int16_t) output_zero_point;
959 params->fp32_sse2.output_min[i] = (int16_t) output_min;
960 }
961 return sizeof(params->fp32_sse2);
962}
963
964void xnn_update_qs8_avgpool_minmax_fp32_sse2_params(
965 union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
966 int32_t init_bias,
967 float scale)
968{
969 assert(scale >= 0x1.0p-32f);
970 assert(scale < 256.0f);
971
972 for (uint32_t i = 0; i < 4; i++) {
973 params->fp32_sse2.init_bias[i] = init_bias;
974 params->fp32_sse2.scale[i] = scale;
975 }
976}
977
978size_t xnn_init_qs8_avgpool_minmax_fp32_sse4_params(
979 union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
980 int32_t init_bias,
981 float scale,
982 int8_t output_zero_point,
983 int8_t output_min,
984 int8_t output_max)
985{
986 assert(scale >= 0x1.0p-32f);
987 assert(scale < 256.0f);
988
989 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
990 for (uint32_t i = 0; i < 4; i++) {
991 params->fp32_sse4.init_bias[i] = init_bias;
992 params->fp32_sse4.scale[i] = scale;
993 params->fp32_sse4.output_max_less_zero_point[i] = output_max_less_zero_point;
994 }
995 for (uint32_t i = 0; i < 8; i++) {
996 params->fp32_sse4.output_zero_point[i] = (int16_t) output_zero_point;
997 }
998 for (uint32_t i = 0; i < 16; i++) {
999 params->fp32_sse4.output_min[i] = output_min;
1000 }
1001 return sizeof(params->fp32_sse4);
1002}
1003
1004void xnn_update_qs8_avgpool_minmax_fp32_sse4_params(
1005 union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
1006 int32_t init_bias,
1007 float scale)
1008{
1009 assert(scale >= 0x1.0p-32f);
1010 assert(scale < 256.0f);
1011
1012 for (uint32_t i = 0; i < 4; i++) {
1013 params->fp32_sse4.init_bias[i] = init_bias;
1014 params->fp32_sse4.scale[i] = scale;
1015 }
1016}
1017#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
1018
1019#if XNN_ARCH_ARM || XNN_ARCH_ARM64
1020size_t xnn_init_qs8_avgpool_minmax_fp32_neon_params(
1021 union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
1022 int32_t init_bias,
1023 float scale,
1024 int8_t output_zero_point,
1025 int8_t output_min,
1026 int8_t output_max)
1027{
1028 assert(scale >= 0x1.0p-32f);
1029 assert(scale < 256.0f);
1030
1031 params->fp32_neon.init_bias = init_bias;
1032 params->fp32_neon.scale = scale;
1033 params->fp32_neon.magic_bias = 12582912.0f;
1034 params->fp32_neon.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
1035 params->fp32_neon.output_min = output_min;
1036 params->fp32_neon.output_max = output_max;
1037 return sizeof(params->fp32_neon);
1038}
1039
1040void xnn_update_qs8_avgpool_minmax_fp32_neon_params(
1041 union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
1042 int32_t init_bias,
1043 float scale)
1044{
1045 assert(scale >= 0x1.0p-32f);
1046 assert(scale < 256.0f);
1047
1048 params->fp32_neon.init_bias = init_bias;
1049 params->fp32_neon.scale = scale;
1050}
1051
1052size_t xnn_init_qs8_avgpool_minmax_fp32_neonv8_params(
1053 union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
1054 int32_t init_bias,
1055 float scale,
1056 int8_t output_zero_point,
1057 int8_t output_min,
1058 int8_t output_max)
1059{
1060 assert(scale >= 0x1.0p-32f);
1061 assert(scale < 256.0f);
1062
1063 params->fp32_neonv8.init_bias = init_bias;
1064 params->fp32_neonv8.scale = scale;
1065 params->fp32_neonv8.output_zero_point = (int16_t) output_zero_point;
1066 params->fp32_neonv8.output_min = output_min;
1067 params->fp32_neonv8.output_max = output_max;
1068 return sizeof(params->fp32_neonv8);
1069}
1070
1071void xnn_update_qs8_avgpool_minmax_fp32_neonv8_params(
1072 union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
1073 int32_t init_bias,
1074 float scale)
1075{
1076 assert(scale >= 0x1.0p-32f);
1077 assert(scale < 256.0f);
1078
1079 params->fp32_neonv8.init_bias = init_bias;
1080 params->fp32_neonv8.scale = scale;
1081}
1082
1083size_t xnn_init_qs8_avgpool_minmax_rndnu_neon_params(
1084 union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
1085 int32_t init_bias,
1086 float scale,
1087 int8_t output_zero_point,
1088 int8_t output_min,
1089 int8_t output_max)
1090{
1091 assert(scale >= 0x1.0p-32f);
1092 assert(scale < 256.0f);
1093
1094 // Compute requantization parameters.
1095 const uint32_t scale_bits = float_as_uint32(scale);
1096
1097 // Multiplier is in [0x40000000, 0x7FFFFF80] range.
1098 const int32_t multiplier = (int32_t) (((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
1099 assert(multiplier >= INT32_C(0x40000000));
1100 assert(multiplier <= INT32_C(0x7FFFFF80));
1101
1102 // Shift is in [-8, 31] range.
1103 const int32_t shift = 127 + 31 - 32 - (scale_bits >> 23);
1104 assert(shift >= -8);
1105 assert(shift < 32);
1106
1107 // Split shift into pre_shift + post_shift, post_shift in [1, 31] range.
1108 const int32_t post_shift = math_max_s32(shift, 1);
1109 const int32_t pre_shift = shift - post_shift;
1110
1111 params->rndnu_neon.init_bias = init_bias;
1112 params->rndnu_neon.left_pre_shift = -pre_shift;
1113 params->rndnu_neon.multiplier = multiplier;
1114 params->rndnu_neon.left_post_shift = -post_shift;
1115 params->rndnu_neon.output_zero_point = (int16_t) output_zero_point;
1116 params->rndnu_neon.output_min = output_min;
1117 params->rndnu_neon.output_max = output_max;
1118 return sizeof(params->rndnu_neon);
1119}
1120
1121void xnn_update_qs8_avgpool_minmax_rndnu_neon_params(
1122 union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
1123 int32_t init_bias,
1124 float scale)
1125{
1126 assert(scale >= 0x1.0p-32f);
1127 assert(scale < 256.0f);
1128
1129 // Compute requantization parameters.
1130 const uint32_t scale_bits = float_as_uint32(scale);
1131
1132 // Multiplier is in [0x40000000, 0x7FFFFF80] range.
1133 const int32_t multiplier = (int32_t) (((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
1134 assert(multiplier >= INT32_C(0x40000000));
1135 assert(multiplier <= INT32_C(0x7FFFFF80));
1136
1137 // Shift is in [-8, 31] range.
1138 const int32_t shift = 127 + 31 - 32 - (scale_bits >> 23);
1139 assert(shift >= -8);
1140 assert(shift < 32);
1141
1142 // Split shift into pre_shift + post_shift, post_shift in [1, 31] range.
1143 const int32_t post_shift = math_max_s32(shift, 1);
1144 const int32_t pre_shift = shift - post_shift;
1145
1146 params->rndnu_neon.init_bias = init_bias;
1147 params->rndnu_neon.left_pre_shift = -pre_shift;
1148 params->rndnu_neon.multiplier = multiplier;
1149 params->rndnu_neon.left_post_shift = -post_shift;
1150}
1151#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
1152
1153#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1154size_t xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params(
1155 union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
1156 int32_t init_bias,
1157 float scale,
1158 int8_t output_zero_point,
1159 int8_t output_min,
1160 int8_t output_max)
1161{
1162 assert(scale >= 0x1.0p-32f);
1163 assert(scale < 256.0f);
1164
1165 const float output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
1166 const int32_t magic_min = (int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point);
1167 const int32_t magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
1168 for (uint32_t i = 0; i < 2; i++) {
1169 params->fp32_wasmsimd.init_bias[i] = init_bias;
1170 params->fp32_wasmsimd.scale[i] = scale;
1171 params->fp32_wasmsimd.magic_bias[i] = 12582912.0f;
1172 params->fp32_wasmsimd.magic_min[i] = magic_min;
1173 params->fp32_wasmsimd.magic_bias_less_output_zero_point[i] = magic_bias_less_zero_point;
1174 }
1175 for (uint32_t i = 0; i < 8; i++) {
1176 params->fp32_wasmsimd.output_max[i] = output_max;
1177 }
1178 return sizeof(params->fp32_wasmsimd);
1179}
1180
1181void xnn_update_qs8_avgpool_minmax_fp32_wasmsimd_params(
1182 union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
1183 int32_t init_bias,
1184 float scale)
1185{
1186 assert(scale >= 0x1.0p-32f);
1187 assert(scale < 256.0f);
1188
1189 for (uint32_t i = 0; i < 2; i++) {
1190 params->fp32_wasmsimd.init_bias[i] = init_bias;
1191 params->fp32_wasmsimd.scale[i] = scale;
1192 }
1193}
1194#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1195
1196size_t xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params(
1197 union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
1198 int32_t init_bias,
1199 float scale,
1200 uint8_t output_zero_point,
1201 uint8_t output_min,
1202 uint8_t output_max)
1203{
1204 assert(scale >= 0x1.0p-32f);
1205 assert(scale < 256.0f);
1206
1207 params->fp32_scalar_fmagic.init_bias = init_bias;
1208 params->fp32_scalar_fmagic.scale = scale;
1209 params->fp32_scalar_fmagic.output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
1210 params->fp32_scalar_fmagic.output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
1211 params->fp32_scalar_fmagic.magic_bias = 12582912.0f;
1212 params->fp32_scalar_fmagic.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
1213 return sizeof(params->fp32_scalar_fmagic);
1214}
1215
1216void xnn_update_qu8_avgpool_minmax_fp32_scalar_fmagic_params(
1217 union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
1218 int32_t init_bias,
1219 float scale)
1220{
1221 assert(scale >= 0x1.0p-32f);
1222 assert(scale < 256.0f);
1223
1224 params->fp32_scalar_fmagic.init_bias = init_bias;
1225 params->fp32_scalar_fmagic.scale = scale;
1226}
1227
1228size_t xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params(
1229 union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
1230 int32_t init_bias,
1231 float scale,
1232 uint8_t output_zero_point,
1233 uint8_t output_min,
1234 uint8_t output_max)
1235{
1236 assert(scale >= 0x1.0p-32f);
1237 assert(scale < 256.0f);
1238
1239 const float output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
1240 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
1241 params->fp32_scalar_imagic.init_bias = init_bias;
1242 params->fp32_scalar_imagic.scale = scale;
1243 params->fp32_scalar_imagic.magic_bias = 12582912.0f;
1244 params->fp32_scalar_imagic.magic_min = (int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point);
1245 params->fp32_scalar_imagic.magic_max = (int32_t) float_as_uint32(12582912.0f + output_max_less_zero_point);
1246 params->fp32_scalar_imagic.magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
1247 return sizeof(params->fp32_scalar_imagic);
1248}
1249
1250void xnn_update_qu8_avgpool_minmax_fp32_scalar_imagic_params(
1251 union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
1252 int32_t init_bias,
1253 float scale)
1254{
1255 assert(scale >= 0x1.0p-32f);
1256 assert(scale < 256.0f);
1257
1258 params->fp32_scalar_imagic.init_bias = init_bias;
1259 params->fp32_scalar_imagic.scale = scale;
1260}
1261
1262size_t xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params(
1263 union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
1264 int32_t init_bias,
1265 float scale,
1266 uint8_t output_zero_point,
1267 uint8_t output_min,
1268 uint8_t output_max)
1269{
1270 assert(scale >= 0x1.0p-32f);
1271 assert(scale < 256.0f);
1272
1273 params->fp32_scalar_lrintf.init_bias = init_bias;
1274 params->fp32_scalar_lrintf.scale = scale;
1275 params->fp32_scalar_lrintf.output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
1276 params->fp32_scalar_lrintf.output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
1277 params->fp32_scalar_lrintf.output_zero_point = (int32_t) output_zero_point;
1278 return sizeof(params->fp32_scalar_lrintf);
1279}
1280
1281void xnn_update_qu8_avgpool_minmax_fp32_scalar_lrintf_params(
1282 union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
1283 int32_t init_bias,
1284 float scale)
1285{
1286 assert(scale >= 0x1.0p-32f);
1287 assert(scale < 256.0f);
1288
1289 params->fp32_scalar_lrintf.init_bias = init_bias;
1290 params->fp32_scalar_lrintf.scale = scale;
1291}
1292
1293#if XNN_ARCH_X86 || XNN_ARCH_X86_64
1294size_t xnn_init_qu8_avgpool_minmax_fp32_sse2_params(
1295 union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
1296 int32_t init_bias,
1297 float scale,
1298 uint8_t output_zero_point,
1299 uint8_t output_min,
1300 uint8_t output_max)
1301{
1302 assert(scale >= 0x1.0p-32f);
1303 assert(scale < 256.0f);
1304
1305 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
1306 for (uint32_t i = 0; i < 4; i++) {
1307 params->fp32_sse2.init_bias[i] = init_bias;
1308 params->fp32_sse2.scale[i] = scale;
1309 params->fp32_sse2.output_max_less_zero_point[i] = output_max_less_zero_point;
1310 }
1311 for (uint32_t i = 0; i < 8; i++) {
1312 params->fp32_sse2.output_zero_point[i] = (int16_t) output_zero_point;
1313 }
1314 for (uint32_t i = 0; i < 16; i++) {
1315 params->fp32_sse2.output_min[i] = output_min;
1316 }
1317 return sizeof(params->fp32_sse2);
1318}
1319
1320void xnn_update_qu8_avgpool_minmax_fp32_sse2_params(
1321 union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
1322 int32_t init_bias,
1323 float scale)
1324{
1325 assert(scale >= 0x1.0p-32f);
1326 assert(scale < 256.0f);
1327
1328 for (uint32_t i = 0; i < 4; i++) {
1329 params->fp32_sse2.init_bias[i] = init_bias;
1330 params->fp32_sse2.scale[i] = scale;
1331 }
1332}
1333
1334size_t xnn_init_qu8_avgpool_minmax_fp32_sse4_params(
1335 union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
1336 int32_t init_bias,
1337 float scale,
1338 uint8_t output_zero_point,
1339 uint8_t output_min,
1340 uint8_t output_max)
1341{
1342 assert(scale >= 0x1.0p-32f);
1343 assert(scale < 256.0f);
1344
1345 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
1346 for (uint32_t i = 0; i < 4; i++) {
1347 params->fp32_sse4.init_bias[i] = init_bias;
1348 params->fp32_sse4.scale[i] = scale;
1349 params->fp32_sse4.output_max_less_zero_point[i] = output_max_less_zero_point;
1350 }
1351 for (uint32_t i = 0; i < 8; i++) {
1352 params->fp32_sse4.output_zero_point[i] = (int16_t) output_zero_point;
1353 }
1354 for (uint32_t i = 0; i < 16; i++) {
1355 params->fp32_sse4.output_min[i] = output_min;
1356 }
1357 return sizeof(params->fp32_sse4);
1358}
1359
1360void xnn_update_qu8_avgpool_minmax_fp32_sse4_params(
1361 union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
1362 int32_t init_bias,
1363 float scale)
1364{
1365 assert(scale >= 0x1.0p-32f);
1366 assert(scale < 256.0f);
1367
1368 for (uint32_t i = 0; i < 4; i++) {
1369 params->fp32_sse4.init_bias[i] = init_bias;
1370 params->fp32_sse4.scale[i] = scale;
1371 }
1372}
1373#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
1374
1375#if XNN_ARCH_ARM || XNN_ARCH_ARM64
1376size_t xnn_init_qu8_avgpool_minmax_fp32_neon_params(
1377 union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
1378 int32_t init_bias,
1379 float scale,
1380 uint8_t output_zero_point,
1381 uint8_t output_min,
1382 uint8_t output_max)
1383{
1384 assert(scale >= 0x1.0p-32f);
1385 assert(scale < 256.0f);
1386
1387 params->fp32_neon.init_bias = init_bias;
1388 params->fp32_neon.scale = scale;
1389 params->fp32_neon.magic_bias = 12582912.0f;
1390 params->fp32_neon.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
1391 params->fp32_neon.output_min = output_min;
1392 params->fp32_neon.output_max = output_max;
1393 return sizeof(params->fp32_neon);
1394}
1395
1396void xnn_update_qu8_avgpool_minmax_fp32_neon_params(
1397 union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
1398 int32_t init_bias,
1399 float scale)
1400{
1401 assert(scale >= 0x1.0p-32f);
1402 assert(scale < 256.0f);
1403
1404 params->fp32_neon.init_bias = init_bias;
1405 params->fp32_neon.scale = scale;
1406}
1407
1408size_t xnn_init_qu8_avgpool_minmax_fp32_neonv8_params(
1409 union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
1410 int32_t init_bias,
1411 float scale,
1412 uint8_t output_zero_point,
1413 uint8_t output_min,
1414 uint8_t output_max)
1415{
1416 assert(scale >= 0x1.0p-32f);
1417 assert(scale < 256.0f);
1418
1419 params->fp32_neonv8.init_bias = init_bias;
1420 params->fp32_neonv8.scale = scale;
1421 params->fp32_neonv8.output_zero_point = (int16_t) output_zero_point;
1422 params->fp32_neonv8.output_min = output_min;
1423 params->fp32_neonv8.output_max = output_max;
1424 return sizeof(params->fp32_neonv8);
1425}
1426
1427void xnn_update_qu8_avgpool_minmax_fp32_neonv8_params(
1428 union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
1429 int32_t init_bias,
1430 float scale)
1431{
1432 assert(scale >= 0x1.0p-32f);
1433 assert(scale < 256.0f);
1434
1435 params->fp32_neonv8.init_bias = init_bias;
1436 params->fp32_neonv8.scale = scale;
1437}
1438
1439size_t xnn_init_qu8_avgpool_minmax_rndnu_neon_params(
1440 union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
1441 int32_t init_bias,
1442 float scale,
1443 uint8_t output_zero_point,
1444 uint8_t output_min,
1445 uint8_t output_max)
1446{
1447 assert(scale >= 0x1.0p-32f);
1448 assert(scale < 256.0f);
1449
1450 // Compute requantization parameters.
1451 const uint32_t scale_bits = float_as_uint32(scale);
1452
1453 // Multiplier is in [0x40000000, 0x7FFFFF80] range.
1454 const int32_t multiplier = (int32_t) (((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
1455 assert(multiplier >= INT32_C(0x40000000));
1456 assert(multiplier <= INT32_C(0x7FFFFF80));
1457
1458 // Shift is in [-8, 31] range.
1459 const int32_t shift = 127 + 31 - 32 - (scale_bits >> 23);
1460 assert(shift >= -8);
1461 assert(shift < 32);
1462
1463 // Split shift into pre_shift + post_shift, post_shift in [1, 31] range.
1464 const int32_t post_shift = math_max_s32(shift, 1);
1465 const int32_t pre_shift = shift - post_shift;
1466
1467 params->rndnu_neon.init_bias = init_bias;
1468 params->rndnu_neon.left_pre_shift = -pre_shift;
1469 params->rndnu_neon.multiplier = multiplier;
1470 params->rndnu_neon.left_post_shift = -post_shift;
1471 params->rndnu_neon.output_zero_point = (int16_t) output_zero_point;
1472 params->rndnu_neon.output_min = output_min;
1473 params->rndnu_neon.output_max = output_max;
1474 return sizeof(params->rndnu_neon);
1475}
1476
1477void xnn_update_qu8_avgpool_minmax_rndnu_neon_params(
1478 union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
1479 int32_t init_bias,
1480 float scale)
1481{
1482 assert(scale >= 0x1.0p-32f);
1483 assert(scale < 256.0f);
1484
1485 // Compute requantization parameters.
1486 const uint32_t scale_bits = float_as_uint32(scale);
1487
1488 // Multiplier is in [0x40000000, 0x7FFFFF80] range.
1489 const int32_t multiplier = (int32_t) (((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
1490 assert(multiplier >= INT32_C(0x40000000));
1491 assert(multiplier <= INT32_C(0x7FFFFF80));
1492
1493 // Shift is in [-8, 31] range.
1494 const int32_t shift = 127 + 31 - 32 - (scale_bits >> 23);
1495 assert(shift >= -8);
1496 assert(shift < 32);
1497
1498 // Split shift into pre_shift + post_shift, post_shift in [1, 31] range.
1499 const int32_t post_shift = math_max_s32(shift, 1);
1500 const int32_t pre_shift = shift - post_shift;
1501
1502 params->rndnu_neon.init_bias = init_bias;
1503 params->rndnu_neon.left_pre_shift = -pre_shift;
1504 params->rndnu_neon.multiplier = multiplier;
1505 params->rndnu_neon.left_post_shift = -post_shift;
1506}
1507#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
1508
1509#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1510size_t xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params(
1511 union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
1512 int32_t init_bias,
1513 float scale,
1514 uint8_t output_zero_point,
1515 uint8_t output_min,
1516 uint8_t output_max)
1517{
1518 assert(scale >= 0x1.0p-32f);
1519 assert(scale < 256.0f);
1520
1521 const float output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
1522 const int32_t magic_min = (int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point);
1523 const int32_t magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
1524 for (uint32_t i = 0; i < 2; i++) {
1525 params->fp32_wasmsimd.init_bias[i] = init_bias;
1526 params->fp32_wasmsimd.scale[i] = scale;
1527 params->fp32_wasmsimd.magic_bias[i] = 12582912.0f;
1528 params->fp32_wasmsimd.magic_min[i] = magic_min;
1529 params->fp32_wasmsimd.magic_bias_less_output_zero_point[i] = magic_bias_less_zero_point;
1530 }
1531 for (uint32_t i = 0; i < 8; i++) {
1532 params->fp32_wasmsimd.output_max[i] = output_max;
1533 }
1534 return sizeof(params->fp32_wasmsimd);
1535}
1536
1537void xnn_update_qu8_avgpool_minmax_fp32_wasmsimd_params(
1538 union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
1539 int32_t init_bias,
1540 float scale)
1541{
1542 assert(scale >= 0x1.0p-32f);
1543 assert(scale < 256.0f);
1544
1545 for (uint32_t i = 0; i < 2; i++) {
1546 params->fp32_wasmsimd.init_bias[i] = init_bias;
1547 params->fp32_wasmsimd.scale[i] = scale;
1548 }
1549}
1550#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1551
1552void xnn_update_f32_scaleminmax_scalar_params(
1553 union xnn_f32_scaleminmax_params* params,
1554 float scale)
1555{
1556 params->scalar.scale = scale;
1557}
1558
1559#if XNN_ARCH_X86 || XNN_ARCH_X86_64
1560void xnn_update_f32_scaleminmax_sse_params(
1561 union xnn_f32_scaleminmax_params* params,
1562 float scale)
1563{
1564 for (uint32_t i = 0; i < 4; i++) {
1565 params->sse.scale[i] = scale;
1566 }
1567}
1568#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
1569
1570#if XNN_ARCH_ARM || XNN_ARCH_ARM64
1571size_t xnn_init_f16_scaleminmax_fp16arith_params(
1572 union xnn_f16_scaleminmax_params params[XNN_MIN_ELEMENTS(1)],
1573 uint16_t scale,
1574 uint16_t min,
1575 uint16_t max)
1576{
1577 params->fp16arith.scale = scale;
1578 params->fp16arith.min = min;
1579 params->fp16arith.max = max;
1580 return sizeof(params->fp16arith);
1581}
1582#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
1583
1584#if XNN_ARCH_X86 || XNN_ARCH_X86_64
1585size_t xnn_init_f16_scaleminmax_avx_params(
1586 union xnn_f16_scaleminmax_params params[XNN_MIN_ELEMENTS(1)],
1587 uint16_t scale,
1588 uint16_t min,
1589 uint16_t max)
1590{
1591 const float scale_f32 = fp16_ieee_to_fp32_value(scale);
1592 const float min_f32 = fp16_ieee_to_fp32_value(min);
1593 const float max_f32 = fp16_ieee_to_fp32_value(max);
1594 for (uint32_t i = 0; i < 8; i++) {
1595 params->avx.scale[i] = scale_f32;
1596 params->avx.min[i] = min_f32;
1597 params->avx.max[i] = max_f32;
1598 }
1599 return sizeof(params->avx);
1600}
1601#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
1602
1603#if XNN_ARCH_ARM || XNN_ARCH_ARM64
1604void xnn_update_f16_scaleminmax_fp16arith_params(
1605 union xnn_f16_scaleminmax_params params[XNN_MIN_ELEMENTS(1)],
1606 uint16_t scale)
1607{
1608 params->fp16arith.scale = scale;
1609}
1610#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
1611
1612#if XNN_ARCH_X86 || XNN_ARCH_X86_64
1613void xnn_update_f16_scaleminmax_avx_params(
1614 union xnn_f16_scaleminmax_params params[XNN_MIN_ELEMENTS(1)],
1615 uint16_t scale)
1616{
1617 const float scale_f32 = fp16_ieee_to_fp32_value(scale);
1618 for (uint32_t i = 0; i < 8; i++) {
1619 params->avx.scale[i] = scale_f32;
1620 }
1621}
1622#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
1623
1624size_t xnn_init_f32_scaleminmax_scalar_params(
1625 union xnn_f32_scaleminmax_params params[XNN_MIN_ELEMENTS(1)],
1626 float scale,
1627 float min,
1628 float max)
1629{
1630 params->scalar.scale = scale;
1631 params->scalar.min = min;
1632 params->scalar.max = max;
1633 return sizeof(params->scalar);
1634}
1635
1636#if XNN_ARCH_X86 || XNN_ARCH_X86_64
1637size_t xnn_init_f32_scaleminmax_sse_params(
1638 union xnn_f32_scaleminmax_params params[XNN_MIN_ELEMENTS(1)],
1639 float scale,
1640 float min,
1641 float max)
1642{
1643 for (uint32_t i = 0; i < 4; i++) {
1644 params->sse.scale[i] = scale;
1645 params->sse.min[i] = min;
1646 params->sse.max[i] = max;
1647 }
1648 return sizeof(params->sse);
1649}
1650#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
1651
1652size_t xnn_init_f32_gavgpool_params(
1653 union xnn_f32_gavgpool_params params[XNN_MIN_ELEMENTS(1)],
1654 float multiplier,
1655 float output_min,
1656 float output_max,
1657 uint32_t width)
1658{
1659 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
1660 for (uint32_t i = 0; i < 4; i++) {
1661 params->sse.multiplier[i] = multiplier;
1662 params->sse.output_min[i] = output_min;
1663 params->sse.output_max[i] = output_max;
1664 }
1665
1666 const uint32_t w = (width - 1) & 3;
1667 params->sse.mask[0] = UINT32_C(0xFFFFFFFF);
1668 params->sse.mask[1] = -(uint32_t) (w >= 1);
1669 params->sse.mask[2] = -(uint32_t) (w >= 2);
1670 params->sse.mask[3] = -(uint32_t) (w >= 3);
1671 return sizeof(params->sse);
1672 #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
1673 params->neon.multiplier = multiplier;
1674 params->neon.output_min = output_min;
1675 params->neon.output_max = output_max;
1676
1677 const uint32_t w = (width - 1) & 3;
1678 params->neon.mask[0] = UINT32_C(0xFFFFFFFF);
1679 params->neon.mask[1] = -(uint32_t) (w >= 1);
1680 params->neon.mask[2] = -(uint32_t) (w >= 2);
1681 params->neon.mask[3] = -(uint32_t) (w >= 3);
1682 return sizeof(params->neon);
1683 #else
1684 params->scalar.multiplier = multiplier;
1685 params->scalar.output_min = output_min;
1686 params->scalar.output_max = output_max;
1687
1688 const uint32_t w = (width - 1) & 3;
1689 params->scalar.mask[0] = UINT32_C(0xFFFFFFFF);
1690 params->scalar.mask[1] = -(int32_t) (w >= 1);
1691 params->scalar.mask[2] = -(int32_t) (w >= 2);
1692 params->scalar.mask[3] = -(int32_t) (w >= 3);
1693 return sizeof(params->scalar);
1694 #endif
1695}
1696
1697#if XNN_ARCH_ARM || XNN_ARCH_ARM64
1698size_t xnn_init_f16_gavgpool_neonfp16arith_params(
1699 union xnn_f16_gavgpool_params params[XNN_MIN_ELEMENTS(1)],
1700 uint16_t multiplier,
1701 uint16_t output_min,
1702 uint16_t output_max,
1703 uint32_t width)
1704{
1705 params->neonfp16arith.multiplier = multiplier;
1706 params->neonfp16arith.output_min = output_min;
1707 params->neonfp16arith.output_max = output_max;
1708
1709 const uint32_t w = (width - 1) & 7;
1710 params->neonfp16arith.mask[0] = UINT16_C(0xFFFF);
1711 params->neonfp16arith.mask[1] = -(uint16_t) (w >= 1);
1712 params->neonfp16arith.mask[2] = -(uint16_t) (w >= 2);
1713 params->neonfp16arith.mask[3] = -(uint16_t) (w >= 3);
1714 params->neonfp16arith.mask[4] = -(uint16_t) (w >= 4);
1715 params->neonfp16arith.mask[5] = -(uint16_t) (w >= 5);
1716 params->neonfp16arith.mask[6] = -(uint16_t) (w >= 6);
1717 params->neonfp16arith.mask[7] = -(uint16_t) (w >= 7);
1718 return sizeof(params->neonfp16arith);
1719}
1720#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
1721
1722void xnn_update_f32_gavgpool_params(
1723 union xnn_f32_gavgpool_params* params,
1724 float multiplier,
1725 uint32_t width)
1726{
1727 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
1728 for (uint32_t i = 0; i < 4; i++) {
1729 params->sse.multiplier[i] = multiplier;
1730 }
1731
1732 const uint32_t w = (width - 1) & 3;
1733 params->sse.mask[0] = UINT32_C(0xFFFFFFFF);
1734 params->sse.mask[1] = -(uint32_t) (w >= 1);
1735 params->sse.mask[2] = -(uint32_t) (w >= 2);
1736 params->sse.mask[3] = -(uint32_t) (w >= 3);
1737 #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
1738 params->neon.multiplier = multiplier;
1739
1740 const uint32_t w = (width - 1) & 3;
1741 params->neon.mask[0] = UINT32_C(0xFFFFFFFF);
1742 params->neon.mask[1] = -(uint32_t) (w >= 1);
1743 params->neon.mask[2] = -(uint32_t) (w >= 2);
1744 params->neon.mask[3] = -(uint32_t) (w >= 3);
1745 #else
1746 params->scalar.multiplier = multiplier;
1747
1748 const uint32_t w = (width - 1) & 3;
1749 params->scalar.mask[0] = UINT32_C(0xFFFFFFFF);
1750 params->scalar.mask[1] = -(int32_t) (w >= 1);
1751 params->scalar.mask[2] = -(int32_t) (w >= 2);
1752 params->scalar.mask[3] = -(int32_t) (w >= 3);
1753 #endif
1754}
1755
1756#if XNN_ARCH_ARM || XNN_ARCH_ARM64
1757void xnn_update_f16_gavgpool_neonfp16arith_params(
1758 union xnn_f16_gavgpool_params* params,
1759 uint16_t multiplier,
1760 uint32_t width)
1761{
1762 params->neonfp16arith.multiplier = multiplier;
1763
1764 const uint32_t w = (width - 1) & 7;
1765 params->neonfp16arith.mask[0] = UINT16_C(0xFFFF);
1766 params->neonfp16arith.mask[1] = -(uint16_t) (w >= 1);
1767 params->neonfp16arith.mask[2] = -(uint16_t) (w >= 2);
1768 params->neonfp16arith.mask[3] = -(uint16_t) (w >= 3);
1769 params->neonfp16arith.mask[4] = -(uint16_t) (w >= 4);
1770 params->neonfp16arith.mask[5] = -(uint16_t) (w >= 5);
1771 params->neonfp16arith.mask[6] = -(uint16_t) (w >= 6);
1772 params->neonfp16arith.mask[7] = -(uint16_t) (w >= 7);
1773}
1774#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
1775
1776size_t xnn_init_scalar_f32_gavgpool_params(
1777 union xnn_f32_gavgpool_params params[XNN_MIN_ELEMENTS(1)],
1778 float multiplier,
1779 float output_min,
1780 float output_max,
1781 uint32_t width)
1782{
1783 params->scalar.multiplier = multiplier;
1784 params->scalar.output_min = output_min;
1785 params->scalar.output_max = output_max;
1786
1787 const uint32_t w = (width - 1) & 3;
1788 params->scalar.mask[0] = UINT32_C(0xFFFFFFFF);
1789 params->scalar.mask[1] = -(int32_t) (w >= 1);
1790 params->scalar.mask[2] = -(int32_t) (w >= 2);
1791 params->scalar.mask[3] = -(int32_t) (w >= 3);
1792 return sizeof(params->scalar);
1793}
1794
1795size_t xnn_init_bf16_minmax_scalar_params(
1796 union xnn_bf16_minmax_params params[XNN_MIN_ELEMENTS(1)],
1797 uint16_t output_min,
1798 uint16_t output_max)
1799{
1800 params->scalar.min = uint32_as_float((uint32_t) output_min << 16);
1801 params->scalar.max = uint32_as_float((uint32_t) output_max << 16);
1802 return sizeof(params->scalar);
1803}
1804
1805#if XNN_ARCH_ARM || XNN_ARCH_ARM64
1806size_t xnn_init_f16_minmax_fp16arith_params(
1807 union xnn_f16_minmax_params params[XNN_MIN_ELEMENTS(1)],
1808 uint16_t min,
1809 uint16_t max)
1810{
1811 params->fp16arith.min = min;
1812 params->fp16arith.max = max;
1813 return sizeof(params->fp16arith);
1814}
1815#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
1816
1817#if XNN_ARCH_X86 || XNN_ARCH_X86_64
1818size_t xnn_init_f16_minmax_avx_params(
1819 union xnn_f16_minmax_params params[XNN_MIN_ELEMENTS(1)],
1820 uint16_t min,
1821 uint16_t max)
1822{
1823 const float min_f32 = fp16_ieee_to_fp32_value(min);
1824 const float max_f32 = fp16_ieee_to_fp32_value(max);
1825 for (uint32_t i = 0; i < 8; i++) {
1826 params->avx.min[i] = min_f32;
1827 params->avx.max[i] = max_f32;
1828 }
1829 return sizeof(params->avx);
1830}
1831#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
1832
1833#if XNN_ARCH_X86 || XNN_ARCH_X86_64
1834size_t xnn_init_f32_default_avx_params(
1835 union xnn_f32_default_params params[XNN_MIN_ELEMENTS(1)])
1836{
1837 for (uint32_t i = 0; i < 7; i++) {
1838 params->avx.mask_table[i] = -1;
1839 }
1840 for (uint32_t i = 7; i < 14; i++) {
1841 params->avx.mask_table[i] = 0;
1842 }
1843 return sizeof(params->avx);
1844}
1845#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
1846
1847#if XNN_ARCH_X86 || XNN_ARCH_X86_64
1848size_t xnn_init_f32_minmax_sse_params(
1849 union xnn_f32_minmax_params params[XNN_MIN_ELEMENTS(1)],
1850 float output_min,
1851 float output_max)
1852{
1853 for (uint32_t i = 0; i < 4; i++) {
1854 params->sse.min[i] = output_min;
1855 params->sse.max[i] = output_max;
1856 }
1857 return sizeof(params->sse);
1858}
1859
1860size_t xnn_init_f32_minmax_avx_params(
1861 union xnn_f32_minmax_params params[XNN_MIN_ELEMENTS(1)],
1862 float output_min,
1863 float output_max)
1864{
1865 for (uint32_t i = 0; i < 8; i++) {
1866 params->avx.min[i] = output_min;
1867 params->avx.max[i] = output_max;
1868 }
1869 for (uint32_t i = 0; i < 7; i++) {
1870 params->avx.mask_table[i] = -1;
1871 }
1872 for (uint32_t i = 7; i < 14; i++) {
1873 params->avx.mask_table[i] = 0;
1874 }
1875 return sizeof(params->avx);
1876}
1877#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
1878
1879#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1880size_t xnn_init_f32_minmax_wasmsimd_params(
1881 union xnn_f32_minmax_params params[XNN_MIN_ELEMENTS(1)],
1882 float output_min,
1883 float output_max)
1884{
1885 params->wasmsimd.min[0] = output_min;
1886 params->wasmsimd.min[1] = output_min;
1887 params->wasmsimd.max[0] = output_max;
1888 params->wasmsimd.max[1] = output_max;
1889 return sizeof(params->wasmsimd);
1890}
1891#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1892
1893size_t xnn_init_f32_minmax_scalar_params(
1894 union xnn_f32_minmax_params params[XNN_MIN_ELEMENTS(1)],
1895 float output_min,
1896 float output_max)
1897{
1898 params->scalar.min = output_min;
1899 params->scalar.max = output_max;
1900 return sizeof(params->scalar);
1901}
1902
1903#if XNN_ARCH_ARM || XNN_ARCH_ARM64
1904size_t xnn_init_f16_hswish_fp16arith_params(
1905 union xnn_f16_hswish_params params[XNN_MIN_ELEMENTS(1)])
1906{
1907 params->fp16arith.sixth = UINT16_C(0x3155);
1908 params->fp16arith.three = UINT16_C(0x4200);
1909 params->fp16arith.six = UINT16_C(0x4600);
1910 return sizeof(params->fp16arith);
1911}
1912#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
1913
1914#if XNN_ARCH_X86 || XNN_ARCH_X86_64
1915size_t xnn_init_f16_hswish_avx_params(
1916 union xnn_f16_hswish_params params[XNN_MIN_ELEMENTS(1)])
1917{
1918 for (uint32_t i = 0; i < 8; i++) {
1919 params->avx.sixth[i] = 0x1.554000p-3f;
1920 params->avx.three[i] = 3.0f;
1921 params->avx.six[i] = UINT16_C(0x4600);
1922 }
1923 return sizeof(params->avx);
1924}
1925#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
1926
1927size_t xnn_init_f32_hswish_scalar_params(
1928 union xnn_f32_hswish_params params[XNN_MIN_ELEMENTS(1)])
1929{
1930 params->scalar.sixth = 0x1.555556p-3f;
1931 params->scalar.three = 3.0f;
1932 params->scalar.six = 6.0f;
1933 return sizeof(params->scalar);
1934}
1935
1936#if XNN_ARCH_X86 || XNN_ARCH_X86_64
1937size_t xnn_init_f32_hswish_sse_params(
1938 union xnn_f32_hswish_params params[XNN_MIN_ELEMENTS(1)])
1939{
1940 for (uint32_t i = 0; i < 4; i++) {
1941 params->sse.sixth[i] = 0x1.555556p-3f;
1942 params->sse.half[i] = 0.5f;
1943 params->sse.one[i] = 1.0f;
1944 }
1945 return sizeof(params->sse);
1946}
1947
1948size_t xnn_init_f32_hswish_avx_params(
1949 union xnn_f32_hswish_params params[XNN_MIN_ELEMENTS(1)])
1950{
1951 for (uint32_t i = 0; i < 8; i++) {
1952 params->avx.sixth[i] = 0x1.555556p-3f;
1953 params->avx.half[i] = 0.5f;
1954 params->avx.one[i] = 1.0f;
1955 }
1956 for (uint32_t i = 0; i < 7; i++) {
1957 params->avx.mask_table[i] = -1;
1958 }
1959 for (uint32_t i = 7; i < 14; i++) {
1960 params->avx.mask_table[i] = 0;
1961 }
1962 return sizeof(params->avx);
1963}
1964
1965size_t xnn_init_f32_hswish_avx512_params(
1966 union xnn_f32_hswish_params params[XNN_MIN_ELEMENTS(1)])
1967{
1968 params->avx512.sixth = 0x1.555556p-3f;
1969 params->avx512.half = 0.5f;
1970 params->avx512.one = 1.0f;
1971 return sizeof(params->avx512);
1972}
1973#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
1974
1975#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1976size_t xnn_init_f32_hswish_wasmsimd_params(
1977 union xnn_f32_hswish_params params[XNN_MIN_ELEMENTS(1)])
1978{
1979 for (uint32_t i = 0; i < 2; i++) {
1980 params->wasmsimd.sixth[i] = 0x1.555556p-3f;
1981 params->wasmsimd.three[i] = 3.0f;
1982 params->wasmsimd.six[i] = 6.0f;
1983 }
1984 return sizeof(params->wasmsimd);
1985}
1986#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1987
1988#if XNN_ARCH_ARM || XNN_ARCH_ARM64
1989size_t xnn_init_f16_sigmoid_fp16arith_rr2_p2_params(
1990 union xnn_f16_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
1991{
1992 params->fp16arith_rr2_p2.magic_bias = UINT16_C(0x660F); // 0x1.83Cp+10h
1993 params->fp16arith_rr2_p2.minus_log2e = UINT16_C(0xBDC5); // -0x1.714p+0h
1994 params->fp16arith_rr2_p2.ln2_hi = UINT16_C(0x398C); // 0x1.630p-1h
1995 params->fp16arith_rr2_p2.ln2_lo = UINT16_C(0x8AF4); // -0x1.BD0p-13h
1996 params->fp16arith_rr2_p2.c2 = UINT16_C(0x37F9); // 0x1.FE4p-2h
1997 params->fp16arith_rr2_p2.c1 = UINT16_C(0xBC0E); // -0x1.038p+0h
1998 params->fp16arith_rr2_p2.denorm_cutoff = UINT16_C(0xC8DA); // -0x1.368p+3h
1999 return sizeof(params->fp16arith_rr2_p2);
2000}
2001#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
2002
2003#if XNN_ARCH_X86 || XNN_ARCH_X86_64
2004size_t xnn_init_f16_sigmoid_avx2_rr1_p2_params(
2005 union xnn_f16_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
2006{
2007 for (uint32_t i = 0; i < 8; i++) {
2008 params->avx2_rr1_p2.sign_mask[i] = -0.0f;
2009 params->avx2_rr1_p2.magic_bias[i] = 0x1.8000FEp23f;
2010 params->avx2_rr1_p2.log2e[i] = 0x1.715476p0f;
2011 params->avx2_rr1_p2.minus_ln2[i] = -0x1.62E43p-1f;
2012 params->avx2_rr1_p2.c2[i] = 0x1.FF3A32p-2f;
2013 params->avx2_rr1_p2.c1[i] = 0x1.039E10p+0f;
2014 params->avx2_rr1_p2.one[i] = 1.0f;
2015 params->avx2_rr1_p2.denorm_cutoff[i] = -0x1.368000p+3f;
2016 }
2017 return sizeof(params->avx2_rr1_p2);
2018}
2019#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
2020
2021size_t xnn_init_f32_sigmoid_scalar_rr2_lut64_p2_params(
2022 union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
2023{
2024 params->scalar_rr2_lut64_p2.magic_bias = 0x1.800000p17f;
2025 params->scalar_rr2_lut64_p2.minus_log2e = -0x1.715476p0f;
2026 params->scalar_rr2_lut64_p2.ln2_hi = 0x1.630000p-1f;
2027 params->scalar_rr2_lut64_p2.ln2_lo = -0x1.BD0106p-13f;
2028 params->scalar_rr2_lut64_p2.c2 = 0x1.FFFF0Ap-2f;
2029 params->scalar_rr2_lut64_p2.one = 1.0f;
2030 params->scalar_rr2_lut64_p2.denorm_cutoff = 0x1.5D589Ep+6f;
2031 return sizeof(params->scalar_rr2_lut64_p2);
2032}
2033
2034size_t xnn_init_f32_sigmoid_scalar_rr2_lut2048_p1_params(
2035 union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
2036{
2037 params->scalar_rr2_lut2048_p1.magic_bias = 0x1.800000p12f;
2038 params->scalar_rr2_lut2048_p1.minus_log2e = -0x1.715476p0f;
2039 params->scalar_rr2_lut2048_p1.ln2_hi = 0x1.600000p-1f;
2040 params->scalar_rr2_lut2048_p1.ln2_lo = 0x1.7217F8p-8f;
2041 params->scalar_rr2_lut2048_p1.c1 = -0x1.FFFFFEp-1f;
2042 params->scalar_rr2_lut2048_p1.one = 1.0f;
2043 params->scalar_rr2_lut2048_p1.denorm_cutoff = 0x1.5D589Ep+6f;
2044 return sizeof(params->scalar_rr2_lut2048_p1);
2045}
2046
2047size_t xnn_init_f32_sigmoid_scalar_rr2_p5_params(
2048 union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
2049{
2050 params->scalar_rr2_p5.magic_bias = 0x1.8000FEp23f;
2051 params->scalar_rr2_p5.minus_log2e = -0x1.715476p0f;
2052 params->scalar_rr2_p5.ln2_hi = 0x1.62E400p-1f;
2053 params->scalar_rr2_p5.ln2_lo = 0x1.7F7D1Cp-20f;
2054 params->scalar_rr2_p5.c5 = -0x1.0F9F9Cp-7f;
2055 params->scalar_rr2_p5.c4 = 0x1.573A1Ap-5f;
2056 params->scalar_rr2_p5.c3 = -0x1.555A80p-3f;
2057 params->scalar_rr2_p5.c2 = 0x1.FFFDC6p-2f;
2058 params->scalar_rr2_p5.c1 = -0x1.FFFFF6p-1f;
2059 params->scalar_rr2_p5.one = 1.0f;
2060 params->scalar_rr2_p5.denorm_cutoff = 0x1.5D589Ep+6f;
2061 return sizeof(params->scalar_rr2_p5);
2062}
2063
2064#if XNN_ARCH_ARM || XNN_ARCH_ARM64
2065size_t xnn_init_f32_sigmoid_neon_rr2_lut64_p2_params(
2066 union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
2067{
2068 params->neon_rr2_lut64_p2.magic_bias = 0x1.800000p17f;
2069 params->neon_rr2_lut64_p2.minus_log2e = -0x1.715476p0f;
2070 params->neon_rr2_lut64_p2.ln2_hi = 0x1.630000p-1f;
2071 params->neon_rr2_lut64_p2.ln2_lo = -0x1.BD0106p-13f;
2072 params->neon_rr2_lut64_p2.c2 = 0x1.FFFF0Ap-2f;
2073 params->neon_rr2_lut64_p2.denorm_cutoff = 0x1.5D589Ep+6f;
2074 return sizeof(params->neon_rr2_lut64_p2);
2075}
2076
2077size_t xnn_init_f32_sigmoid_neon_rr2_lut2048_p1_params(
2078 union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
2079{
2080 params->neon_rr2_lut2048_p1.magic_bias = 0x1.800000p12f;
2081 params->neon_rr2_lut2048_p1.minus_log2e = -0x1.715476p0f;
2082 params->neon_rr2_lut2048_p1.ln2_hi = 0x1.600000p-1f;
2083 params->neon_rr2_lut2048_p1.ln2_lo = 0x1.7217F8p-8f;
2084 params->neon_rr2_lut2048_p1.c1 = -0x1.FFFFFEp-1f;
2085 params->neon_rr2_lut2048_p1.denorm_cutoff = 0x1.5D589Ep+6f;
2086 return sizeof(params->neon_rr2_lut2048_p1);
2087}
2088
2089size_t xnn_init_f32_sigmoid_neon_rr2_p5_params(
2090 union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
2091{
2092 params->neon_rr2_p5.magic_bias = 0x1.8000FEp23f;
2093 params->neon_rr2_p5.minus_log2e = -0x1.715476p0f;
2094 params->neon_rr2_p5.ln2_hi = 0x1.62E400p-1f;
2095 params->neon_rr2_p5.ln2_lo = 0x1.7F7D1Cp-20f;
2096 params->neon_rr2_p5.c5 = -0x1.0F9F9Cp-7f;
2097 params->neon_rr2_p5.c4 = 0x1.573A1Ap-5f;
2098 params->neon_rr2_p5.c3 = -0x1.555A80p-3f;
2099 params->neon_rr2_p5.c2 = 0x1.FFFDC6p-2f;
2100 params->neon_rr2_p5.c1 = -0x1.FFFFF6p-1f;
2101 params->neon_rr2_p5.denorm_cutoff = 0x1.5D589Ep+6f;
2102 return sizeof(params->neon_rr2_p5);
2103}
2104
2105size_t xnn_init_f32_sigmoid_neonfma_rr1_lut2048_p1_params(
2106 union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
2107{
2108 params->neonfma_rr1_lut2048_p1.magic_bias = 0x1.800000p12f;
2109 params->neonfma_rr1_lut2048_p1.minus_log2e = -0x1.715476p0f;
2110 params->neonfma_rr1_lut2048_p1.ln2 = 0x1.62E430p-1f;
2111 params->neonfma_rr1_lut2048_p1.c1 = -0x1.FFFFFEp-1f;
2112 params->neonfma_rr1_lut2048_p1.denorm_cutoff = 0x1.5D589Ep+6f;
2113 return sizeof(params->neonfma_rr1_lut2048_p1);
2114}
2115
2116size_t xnn_init_f32_sigmoid_neonfma_rr1_lut64_p2_params(
2117 union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
2118{
2119 params->neonfma_rr1_lut64_p2.magic_bias = 0x1.800000p17f;
2120 params->neonfma_rr1_lut64_p2.minus_log2e = -0x1.715476p0f;
2121 params->neonfma_rr1_lut64_p2.ln2 = 0x1.62E430p-1f;
2122 params->neonfma_rr1_lut64_p2.c2 = 0x1.FFFF0Ap-2f;
2123 params->neonfma_rr1_lut64_p2.denorm_cutoff = 0x1.5D589Ep+6f;
2124 return sizeof(params->neonfma_rr1_lut64_p2);
2125}
2126
2127size_t xnn_init_f32_sigmoid_neonfma_rr1_p5_params(
2128 union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
2129{
2130 params->neonfma_rr1_p5.magic_bias = 0x1.8000FEp23f;
2131 params->neonfma_rr1_p5.minus_log2e = -0x1.715476p0f;
2132 params->neonfma_rr1_p5.ln2 = 0x1.62E430p-1f;
2133 params->neonfma_rr1_p5.c5 = -0x1.0F9F9Cp-7f;
2134 params->neonfma_rr1_p5.c4 = 0x1.573A1Ap-5f;
2135 params->neonfma_rr1_p5.c3 = -0x1.555A80p-3f;
2136 params->neonfma_rr1_p5.c2 = 0x1.FFFDC6p-2f;
2137 params->neonfma_rr1_p5.c1 = -0x1.FFFFF6p-1f;
2138 params->neonfma_rr1_p5.denorm_cutoff = 0x1.5D589Ep+6f;
2139 return sizeof(params->neonfma_rr1_p5);
2140}
2141#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
2142
2143#if XNN_ARCH_X86 || XNN_ARCH_X86_64
2144size_t xnn_init_f32_sigmoid_sse2_rr2_lut64_p2_params(
2145 union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
2146{
2147 for (uint32_t i = 0; i < 4; i++) {
2148 params->sse2_rr2_lut64_p2.sign_mask[i] = -0.0f;
2149 params->sse2_rr2_lut64_p2.magic_bias[i] = 0x1.800000p17f;
2150 params->sse2_rr2_lut64_p2.log2e[i] = 0x1.715476p0f;
2151 params->sse2_rr2_lut64_p2.index_mask[i] = UINT32_C(0x3F);
2152 params->sse2_rr2_lut64_p2.minus_ln2_hi[i] = -0x1.630000p-1f;
2153 params->sse2_rr2_lut64_p2.minus_ln2_lo[i] = 0x1.BD0106p-13f;
2154 params->sse2_rr2_lut64_p2.c2[i] = 0x1.FFFF0Ap-2f;
2155 params->sse2_rr2_lut64_p2.one[i] = 1.0f;
2156 params->sse2_rr2_lut64_p2.denorm_cutoff[i] = -0x1.5D589Ep+6f;
2157 }
2158 return sizeof(params->sse2_rr2_lut64_p2);
2159}
2160
2161size_t xnn_init_f32_sigmoid_sse2_rr2_p5_params(
2162 union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
2163{
2164 for (uint32_t i = 0; i < 4; i++) {
2165 params->sse2_rr2_p5.sign_mask[i] = -0.0f;
2166 params->sse2_rr2_p5.magic_bias[i] = 0x1.8000FEp23f;
2167 params->sse2_rr2_p5.log2e[i] = 0x1.715476p0f;
2168 params->sse2_rr2_p5.minus_ln2_hi[i] = -0x1.62E400p-1f;
2169 params->sse2_rr2_p5.minus_ln2_lo[i] = -0x1.7F7D1Cp-20f;
2170 params->sse2_rr2_p5.c5[i] = 0x1.0F9F9Cp-7f;
2171 params->sse2_rr2_p5.c4[i] = 0x1.573A1Ap-5f;
2172 params->sse2_rr2_p5.c3[i] = 0x1.555A80p-3f;
2173 params->sse2_rr2_p5.c2[i] = 0x1.FFFDC6p-2f;
2174 params->sse2_rr2_p5.c1[i] = 0x1.FFFFF6p-1f;
2175 params->sse2_rr2_p5.one[i] = 1.0f;
2176 params->sse2_rr2_p5.denorm_cutoff[i] = -0x1.5D589Ep+6f;
2177 }
2178 return sizeof(params->sse2_rr2_p5);
2179}
2180
2181size_t xnn_init_f32_sigmoid_avx_rr2_p5_params(
2182 union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
2183{
2184 for (uint32_t i = 0; i < 8; i++) {
2185 params->avx_rr2_p5.sign_mask[i] = -0.0f;
2186 params->avx_rr2_p5.magic_bias[i] = 0x1.8000FEp23f;
2187 params->avx_rr2_p5.log2e[i] = 0x1.715476p0f;
2188 params->avx_rr2_p5.minus_ln2_hi[i] = -0x1.62E400p-1f;
2189 params->avx_rr2_p5.minus_ln2_lo[i] = -0x1.7F7D1Cp-20f;
2190 params->avx_rr2_p5.c5[i] = 0x1.0F9F9Cp-7f;
2191 params->avx_rr2_p5.c4[i] = 0x1.573A1Ap-5f;
2192 params->avx_rr2_p5.c3[i] = 0x1.555A80p-3f;
2193 params->avx_rr2_p5.c2[i] = 0x1.FFFDC6p-2f;
2194 params->avx_rr2_p5.c1[i] = 0x1.FFFFF6p-1f;
2195 params->avx_rr2_p5.one[i] = 1.0f;
2196 params->avx_rr2_p5.two[i] = 2.0f;
2197 params->avx_rr2_p5.denorm_cutoff[i] = -0x1.5D589Ep+6f;
2198 }
2199 for (uint32_t i = 0; i < 7; i++) {
2200 params->avx_rr2_p5.mask_table[i] = -1;
2201 }
2202 for (uint32_t i = 7; i < 14; i++) {
2203 params->avx_rr2_p5.mask_table[i] = 0;
2204 }
2205 return sizeof(params->avx_rr2_p5);
2206}
2207
2208size_t xnn_init_f32_sigmoid_avx2_rr1_p5_params(
2209 union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
2210{
2211 for (uint32_t i = 0; i < 8; i++) {
2212 params->avx2_rr1_p5.sign_mask[i] = -0.0f;
2213 params->avx2_rr1_p5.magic_bias[i] = 0x1.8000FEp23f;
2214 params->avx2_rr1_p5.log2e[i] = 0x1.715476p0f;
2215 params->avx2_rr1_p5.minus_ln2[i] = -0x1.62E430p-1f;
2216 params->avx2_rr1_p5.c5[i] = 0x1.0F9F9Cp-7f;
2217 params->avx2_rr1_p5.c4[i] = 0x1.573A1Ap-5f;
2218 params->avx2_rr1_p5.c3[i] = 0x1.555A80p-3f;
2219 params->avx2_rr1_p5.c2[i] = 0x1.FFFDC6p-2f;
2220 params->avx2_rr1_p5.c1[i] = 0x1.FFFFF6p-1f;
2221 params->avx2_rr1_p5.one[i] = 1.0f;
2222 params->avx2_rr1_p5.denorm_cutoff[i] = -0x1.5D589Ep+6f;
2223 }
2224 for (uint32_t i = 0; i < 7; i++) {
2225 params->avx2_rr1_p5.mask_table[i] = -1;
2226 }
2227 for (uint32_t i = 7; i < 14; i++) {
2228 params->avx2_rr1_p5.mask_table[i] = 0;
2229 }
2230 return sizeof(params->avx2_rr1_p5);
2231}
2232
2233size_t xnn_init_f32_sigmoid_avx512_rr1_lut16_p3_params(
2234 union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
2235{
2236 params->avx512_rr1_lut16_p3.sign_mask = UINT32_C(0x80000000);
2237 params->avx512_rr1_lut16_p3.magic_bias = 0x1.800000p19f;
2238 params->avx512_rr1_lut16_p3.log2e = 0x1.715476p0f;
2239 params->avx512_rr1_lut16_p3.minus_ln2 = -0x1.62E430p-1f;
2240 params->avx512_rr1_lut16_p3.c3 = 0x1.55559Ap-3f;
2241 params->avx512_rr1_lut16_p3.c2 = 0x1.00021Ep-1f;
2242 params->avx512_rr1_lut16_p3.one = 1.0f;
2243 params->avx512_rr1_lut16_p3.table[ 0] = 0x1.000000p+0f;
2244 params->avx512_rr1_lut16_p3.table[ 1] = 0x1.0B5586p+0f;
2245 params->avx512_rr1_lut16_p3.table[ 2] = 0x1.172B84p+0f;
2246 params->avx512_rr1_lut16_p3.table[ 3] = 0x1.2387A6p+0f;
2247 params->avx512_rr1_lut16_p3.table[ 4] = 0x1.306FE0p+0f;
2248 params->avx512_rr1_lut16_p3.table[ 5] = 0x1.3DEA64p+0f;
2249 params->avx512_rr1_lut16_p3.table[ 6] = 0x1.4BFDAEp+0f;
2250 params->avx512_rr1_lut16_p3.table[ 7] = 0x1.5AB07Ep+0f;
2251 params->avx512_rr1_lut16_p3.table[ 8] = 0x1.6A09E6p+0f;
2252 params->avx512_rr1_lut16_p3.table[ 9] = 0x1.7A1148p+0f;
2253 params->avx512_rr1_lut16_p3.table[10] = 0x1.8ACE54p+0f;
2254 params->avx512_rr1_lut16_p3.table[11] = 0x1.9C4918p+0f;
2255 params->avx512_rr1_lut16_p3.table[12] = 0x1.AE89FAp+0f;
2256 params->avx512_rr1_lut16_p3.table[13] = 0x1.C199BEp+0f;
2257 params->avx512_rr1_lut16_p3.table[14] = 0x1.D5818Ep+0f;
2258 params->avx512_rr1_lut16_p3.table[15] = 0x1.EA4AFAp+0f;
2259 return sizeof(params->avx512_rr1_lut16_p3);
2260}
2261
2262size_t xnn_init_f32_sigmoid_avx512_rr2_lut32_p2_params(
2263 union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
2264{
2265 params->avx512_rr2_lut32_p2.sign_mask = UINT32_C(0x80000000);
2266 params->avx512_rr2_lut32_p2.magic_bias = 0x1.800000p18f;
2267 params->avx512_rr2_lut32_p2.log2e = 0x1.715476p0f;
2268 params->avx512_rr2_lut32_p2.minus_ln2_hi = -0x1.62E430p-1f;
2269 params->avx512_rr2_lut32_p2.minus_ln2_lo = 0x1.05C61p-29f;
2270 params->avx512_rr2_lut32_p2.c2 = 0x1.000000p-1f;
2271 params->avx512_rr2_lut32_p2.c1 = 0x1.0000F6p-0f;
2272 params->avx512_rr2_lut32_p2.one = 1.0f;
2273
2274 params->avx512_rr2_lut32_p2.table_lo[ 0] = 0x1.000000p+0f;
2275 params->avx512_rr2_lut32_p2.table_lo[ 1] = 0x1.059B0Ep+0f;
2276 params->avx512_rr2_lut32_p2.table_lo[ 2] = 0x1.0B5586p+0f;
2277 params->avx512_rr2_lut32_p2.table_lo[ 3] = 0x1.11301Ep+0f;
2278 params->avx512_rr2_lut32_p2.table_lo[ 4] = 0x1.172B84p+0f;
2279 params->avx512_rr2_lut32_p2.table_lo[ 5] = 0x1.1D4874p+0f;
2280 params->avx512_rr2_lut32_p2.table_lo[ 6] = 0x1.2387A6p+0f;
2281 params->avx512_rr2_lut32_p2.table_lo[ 7] = 0x1.29E9E0p+0f;
2282 params->avx512_rr2_lut32_p2.table_lo[ 8] = 0x1.306FE0p+0f;
2283 params->avx512_rr2_lut32_p2.table_lo[ 9] = 0x1.371A74p+0f;
2284 params->avx512_rr2_lut32_p2.table_lo[10] = 0x1.3DEA64p+0f;
2285 params->avx512_rr2_lut32_p2.table_lo[11] = 0x1.44E086p+0f;
2286 params->avx512_rr2_lut32_p2.table_lo[12] = 0x1.4BFDAEp+0f;
2287 params->avx512_rr2_lut32_p2.table_lo[13] = 0x1.5342B6p+0f;
2288 params->avx512_rr2_lut32_p2.table_lo[14] = 0x1.5AB07Ep+0f;
2289 params->avx512_rr2_lut32_p2.table_lo[15] = 0x1.6247ECp+0f;
2290
2291 params->avx512_rr2_lut32_p2.table_hi[ 0] = 0x1.6A09E6p+0f;
2292 params->avx512_rr2_lut32_p2.table_hi[ 1] = 0x1.71F75Ep+0f;
2293 params->avx512_rr2_lut32_p2.table_hi[ 2] = 0x1.7A1148p+0f;
2294 params->avx512_rr2_lut32_p2.table_hi[ 3] = 0x1.82589Ap+0f;
2295 params->avx512_rr2_lut32_p2.table_hi[ 4] = 0x1.8ACE54p+0f;
2296 params->avx512_rr2_lut32_p2.table_hi[ 5] = 0x1.93737Cp+0f;
2297 params->avx512_rr2_lut32_p2.table_hi[ 6] = 0x1.9C4918p+0f;
2298 params->avx512_rr2_lut32_p2.table_hi[ 7] = 0x1.A5503Cp+0f;
2299 params->avx512_rr2_lut32_p2.table_hi[ 8] = 0x1.AE89FAp+0f;
2300 params->avx512_rr2_lut32_p2.table_hi[ 9] = 0x1.B7F770p+0f;
2301 params->avx512_rr2_lut32_p2.table_hi[10] = 0x1.C199BEp+0f;
2302 params->avx512_rr2_lut32_p2.table_hi[11] = 0x1.CB720Ep+0f;
2303 params->avx512_rr2_lut32_p2.table_hi[12] = 0x1.D5818Ep+0f;
2304 params->avx512_rr2_lut32_p2.table_hi[13] = 0x1.DFC974p+0f;
2305 params->avx512_rr2_lut32_p2.table_hi[14] = 0x1.EA4AFAp+0f;
2306 params->avx512_rr2_lut32_p2.table_hi[15] = 0x1.F50766p+0f;
2307 return sizeof(params->avx512_rr2_lut32_p2);
2308}
2309
2310size_t xnn_init_f32_sigmoid_avx512_rr1_p5_params(
2311 union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
2312{
2313 params->avx512_rr1_p5.sign_mask = UINT32_C(0x80000000);
2314 params->avx512_rr1_p5.log2e = 0x1.715476p0f;
2315 params->avx512_rr1_p5.minus_ln2 = -0x1.62E430p-1f;
2316 params->avx512_rr1_p5.c5 = 0x1.0F9F9Cp-7f;
2317 params->avx512_rr1_p5.c4 = 0x1.573A1Ap-5f;
2318 params->avx512_rr1_p5.c3 = 0x1.555A80p-3f;
2319 params->avx512_rr1_p5.c2 = 0x1.FFFDC6p-2f;
2320 params->avx512_rr1_p5.c1 = 0x1.FFFFF6p-1f;
2321 params->avx512_rr1_p5.one = 1.0f;
2322 return sizeof(params->avx512_rr1_p5);
2323}
2324#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
2325
2326#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
2327size_t xnn_init_f32_sigmoid_wasmsimd_rr2_lut64_p2_params(
2328 union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
2329{
2330 for (uint32_t i = 0; i < 2; i++) {
2331 params->wasmsimd_rr2_lut64_p2.magic_bias[i] = 0x1.800000p17f;
2332 params->wasmsimd_rr2_lut64_p2.minus_log2e[i] = -0x1.715476p0f;
2333 params->wasmsimd_rr2_lut64_p2.index_mask[i] = UINT32_C(0x3F);
2334 params->wasmsimd_rr2_lut64_p2.ln2_hi[i] = 0x1.630000p-1f;
2335 params->wasmsimd_rr2_lut64_p2.ln2_lo[i] = -0x1.BD0106p-13f;
2336 params->wasmsimd_rr2_lut64_p2.c2[i] = 0x1.FFFF0Ap-2f;
2337 params->wasmsimd_rr2_lut64_p2.one[i] = 1.0f;
2338 params->wasmsimd_rr2_lut64_p2.denorm_cutoff[i] = 0x1.5D589Ep+6f;
2339 }
2340 return sizeof(params->wasmsimd_rr2_lut64_p2);
2341}
2342
2343size_t xnn_init_f32_sigmoid_wasmsimd_rr2_p5_params(
2344 union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
2345{
2346 for (uint32_t i = 0; i < 2; i++) {
2347 params->wasmsimd_rr2_p5.magic_bias[i] = 0x1.8000FEp23f;
2348 params->wasmsimd_rr2_p5.minus_log2e[i] = -0x1.715476p+0f;
2349 params->wasmsimd_rr2_p5.ln2_hi[i] = 0x1.62E400p-1f;
2350 params->wasmsimd_rr2_p5.ln2_lo[i] = 0x1.7F7D1Cp-20f;
2351 params->wasmsimd_rr2_p5.c5[i] = -0x1.0F9F9Cp-7f;
2352 params->wasmsimd_rr2_p5.c4[i] = 0x1.573A1Ap-5f;
2353 params->wasmsimd_rr2_p5.c3[i] = -0x1.555A80p-3f;
2354 params->wasmsimd_rr2_p5.c2[i] = 0x1.FFFDC6p-2f;
2355 params->wasmsimd_rr2_p5.c1[i] = -0x1.FFFFF6p-1f;
2356 params->wasmsimd_rr2_p5.one[i] = 1.0f;
2357 params->wasmsimd_rr2_p5.denorm_cutoff[i] = 0x1.5D589Ep+6f;
2358 }
2359 return sizeof(params->wasmsimd_rr2_p5);
2360}
2361#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
2362
2363#if XNN_ARCH_X86 || XNN_ARCH_X86_64
2364size_t xnn_init_f16_abs_sse_params(
2365 union xnn_f16_abs_params params[XNN_MIN_ELEMENTS(1)])
2366{
2367 for (uint32_t i = 0; i < 8; i++) {
2368 params->sse.nonsign_mask[i] = UINT16_C(0x7FFF);
2369 }
2370 return sizeof(params->sse);
2371}
2372#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
2373
2374#if XNN_ARCH_X86 || XNN_ARCH_X86_64
2375size_t xnn_init_f32_abs_sse_params(
2376 union xnn_f32_abs_params params[XNN_MIN_ELEMENTS(1)])
2377{
2378 for (uint32_t i = 0; i < 4; i++) {
2379 params->sse.nonsign_mask[i] = math_nonsign_mask_f32();
2380 }
2381 return sizeof(params->sse);
2382}
2383
2384size_t xnn_init_f32_abs_avx_params(
2385 union xnn_f32_abs_params params[XNN_MIN_ELEMENTS(1)])
2386{
2387 for (uint32_t i = 0; i < 8; i++) {
2388 params->avx.nonsign_mask[i] = math_nonsign_mask_f32();
2389 }
2390 for (uint32_t i = 0; i < 7; i++) {
2391 params->avx.mask_table[i] = -1;
2392 }
2393 for (uint32_t i = 7; i < 14; i++) {
2394 params->avx.mask_table[i] = 0;
2395 }
2396 return sizeof(params->avx);
2397}
2398
2399size_t xnn_init_f32_abs_avx512_params(
2400 union xnn_f32_abs_params params[XNN_MIN_ELEMENTS(1)])
2401{
2402 params->avx512.nonsign_mask = UINT32_C(0x7FFFFFFF);
2403 return sizeof(params->avx512);
2404}
2405#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
2406
2407#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
2408size_t xnn_init_f32_abs_wasmsimd_params(
2409 union xnn_f32_abs_params params[XNN_MIN_ELEMENTS(1)])
2410{
2411 params->wasmsimd.nonsign_mask[0] = math_nonsign_mask_f32();
2412 params->wasmsimd.nonsign_mask[1] = math_nonsign_mask_f32();
2413 return sizeof(params->wasmsimd);
2414}
2415#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
2416
2417#if XNN_ARCH_X86 || XNN_ARCH_X86_64
2418size_t xnn_init_f16_neg_sse_params(
2419 union xnn_f16_neg_params params[XNN_MIN_ELEMENTS(1)])
2420{
2421 for (uint32_t i = 0; i < 8; i++) {
2422 params->sse.sign_mask[i] = UINT16_C(0x8000);
2423 }
2424 return sizeof(params->sse);
2425}
2426#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
2427
2428#if XNN_ARCH_X86 || XNN_ARCH_X86_64
2429size_t xnn_init_f32_neg_sse_params(
2430 union xnn_f32_neg_params params[XNN_MIN_ELEMENTS(1)])
2431{
2432 for (uint32_t i = 0; i < 4; i++) {
2433 params->sse.sign_mask[i] = -0.0f;
2434 }
2435 return sizeof(params->sse);
2436}
2437
2438size_t xnn_init_f32_neg_avx_params(
2439 union xnn_f32_neg_params params[XNN_MIN_ELEMENTS(1)])
2440{
2441 for (uint32_t i = 0; i < 8; i++) {
2442 params->avx.sign_mask[i] = -0.0f;
2443 }
2444 for (uint32_t i = 0; i < 7; i++) {
2445 params->avx.mask_table[i] = -1;
2446 }
2447 for (uint32_t i = 7; i < 14; i++) {
2448 params->avx.mask_table[i] = 0;
2449 }
2450 return sizeof(params->avx);
2451}
2452
2453size_t xnn_init_f32_neg_avx512_params(
2454 union xnn_f32_neg_params params[XNN_MIN_ELEMENTS(1)])
2455{
2456 params->avx512.sign_mask = UINT32_C(0x80000000);
2457 return sizeof(params->avx512);
2458}
2459#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
2460
2461#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
2462size_t xnn_init_f32_neg_wasmsimd_params(
2463 union xnn_f32_neg_params params[XNN_MIN_ELEMENTS(1)])
2464{
2465 params->wasmsimd.sign_mask[0] = -0.0f;
2466 params->wasmsimd.sign_mask[1] = -0.0f;
2467 return sizeof(params->wasmsimd);
2468}
2469#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
2470
2471#if XNN_ARCH_X86 || XNN_ARCH_X86_64
2472size_t xnn_init_f32_rnd_sse2_params(
2473 union xnn_f32_rnd_params params[XNN_MIN_ELEMENTS(1)])
2474{
2475 for (uint32_t i = 0; i < 4; i++) {
2476 params->sse2.sign_mask[i] = -0.0f;
2477 params->sse2.one[i] = 1.0f;
2478 }
2479 return sizeof(params->sse2);
2480}
2481
2482size_t xnn_init_f32_rnd_avx_params(
2483 union xnn_f32_rnd_params params[XNN_MIN_ELEMENTS(1)])
2484{
2485 for (uint32_t i = 0; i < 7; i++) {
2486 params->avx.mask_table[i] = -1;
2487 }
2488 for (uint32_t i = 7; i < 14; i++) {
2489 params->avx.mask_table[i] = 0;
2490 }
2491 return sizeof(params->avx);
2492}
2493#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
2494
2495#if XNN_ARCH_ARM || XNN_ARCH_ARM64
2496size_t xnn_init_f16_elu_fp16arith_rr1_p3_params(
2497 union xnn_f16_elu_params params[XNN_MIN_ELEMENTS(1)],
2498 uint16_t prescale,
2499 uint16_t alpha,
2500 uint16_t beta)
2501{
2502 params->fp16arith_rr1_p3.prescale = prescale;
2503 params->fp16arith_rr1_p3.sat_cutoff = UINT16_C(0xC829); // -0x1.0A4p+3h;
2504 params->fp16arith_rr1_p3.magic_bias = UINT16_C(0x660F); // 0x1.83Cp+10h
2505 params->fp16arith_rr1_p3.log2e = UINT16_C(0x3DC5); // 0x1.714p+0h
2506 params->fp16arith_rr1_p3.minus_ln2 = UINT16_C(0xB98C); // -0x1.62E430p-1h
2507 params->fp16arith_rr1_p3.c3 = UINT16_C(0x315B); // 0x1.56Cp-3h
2508 params->fp16arith_rr1_p3.c2 = UINT16_C(0x3808); // 0x1.020p-1h
2509 params->fp16arith_rr1_p3.minus_alpha = alpha ^ UINT16_C(0x8000);
2510 params->fp16arith_rr1_p3.beta = beta;
2511 return sizeof(params->fp16arith_rr1_p3);
2512}
2513#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
2514
2515#if XNN_ARCH_X86 || XNN_ARCH_X86_64
2516size_t xnn_init_f16_elu_avx2_rr1_p3_params(
2517 union xnn_f16_elu_params params[XNN_MIN_ELEMENTS(1)],
2518 uint16_t prescale,
2519 uint16_t alpha,
2520 uint16_t beta)
2521{
2522 for (uint32_t i = 0; i < 8; i++) {
2523 params->avx2_rr1_p3.prescale[i] = fp16_ieee_to_fp32_value(prescale);
2524 params->avx2_rr1_p3.sat_cutoff[i] = -0x1.0A4000p+3f;
2525 params->avx2_rr1_p3.magic_bias[i] = 0x1.8000FEp23f;
2526 params->avx2_rr1_p3.log2e[i] = 0x1.715476p+0f;
2527 params->avx2_rr1_p3.minus_ln2[i] = -0x1.62E430p-1f;
2528 params->avx2_rr1_p3.c3[i] = 0x1.5554DCp-3f;
2529 params->avx2_rr1_p3.c2[i] = 0x1.01EBB2p-1f;
2530 params->avx2_rr1_p3.c1[i] = 0x1.0002F2p+0f;
2531 params->avx2_rr1_p3.alpha[i] = fp16_ieee_to_fp32_value(alpha);
2532 params->avx2_rr1_p3.beta[i] = fp16_ieee_to_fp32_value(beta);
2533 }
2534 return sizeof(params->avx2_rr1_p3);
2535}
2536#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
2537
2538size_t xnn_init_f32_elu_scalar_rr2_lut16_p3_params(
2539 union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
2540 float prescale,
2541 float alpha,
2542 float beta)
2543{
2544 params->scalar_rr2_lut16_p3.prescale = prescale;
2545 params->scalar_rr2_lut16_p3.alpha = alpha;
2546 params->scalar_rr2_lut16_p3.beta = beta;
2547 params->scalar_rr2_lut16_p3.sat_cutoff = -0x1.154246p+4f;
2548 params->scalar_rr2_lut16_p3.magic_bias = 0x1.800000p19f;
2549 params->scalar_rr2_lut16_p3.log2e = 0x1.715476p+0f;
2550 params->scalar_rr2_lut16_p3.minus_ln2_hi = -0x1.62E400p-1f;
2551 params->scalar_rr2_lut16_p3.minus_ln2_lo = -0x1.7F7D1Cp-20f;
2552 params->scalar_rr2_lut16_p3.c3 = 0x1.55561Cp-3f;
2553 params->scalar_rr2_lut16_p3.c2 = 0x1.0001ECp-1f;
2554 params->scalar_rr2_lut16_p3.one = 1.0f;
2555 return sizeof(params->scalar_rr2_lut16_p3);
2556}
2557
2558size_t xnn_init_f32_elu_scalar_rr2_p6_params(
2559 union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
2560 float prescale,
2561 float alpha,
2562 float beta)
2563{
2564 params->scalar_rr2_p6.prescale = prescale;
2565 params->scalar_rr2_p6.alpha = alpha;
2566 params->scalar_rr2_p6.beta = beta;
2567 params->scalar_rr2_p6.sat_cutoff = -0x1.154246p+4f;
2568 params->scalar_rr2_p6.magic_bias = 0x1.8000FEp23f;
2569 params->scalar_rr2_p6.log2e = 0x1.715476p+0f;
2570 params->scalar_rr2_p6.minus_ln2_hi = -0x1.62E440p-1f;
2571 params->scalar_rr2_p6.minus_ln2_lo = 0x1.0105C6p-21f;
2572 params->scalar_rr2_p6.c6 = 0x1.6b7338p-10f;
2573 params->scalar_rr2_p6.c5 = 0x1.12278Ep-7f;
2574 params->scalar_rr2_p6.c4 = 0x1.555716p-5f;
2575 params->scalar_rr2_p6.c3 = 0x1.5554B0p-3f;
2576 params->scalar_rr2_p6.c2 = 0x1.FFFFFEp-2f;
2577 params->scalar_rr2_p6.one = 1.0f;
2578 return sizeof(params->scalar_rr2_p6);
2579}
2580
2581#if XNN_ARCH_ARM || XNN_ARCH_ARM64
2582size_t xnn_init_f32_elu_neon_rr2_lut16_p3_params(
2583 union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
2584 float prescale,
2585 float alpha,
2586 float beta)
2587{
2588 params->neon_rr2_lut16_p3.prescale = prescale;
2589 params->neon_rr2_lut16_p3.alpha = alpha;
2590 params->neon_rr2_lut16_p3.beta = beta;
2591 params->neon_rr2_lut16_p3.sat_cutoff = -0x1.154246p+4f;
2592 params->neon_rr2_lut16_p3.magic_bias = 0x1.800000p19f;
2593 params->neon_rr2_lut16_p3.log2e = 0x1.715476p+0f;
2594 params->neon_rr2_lut16_p3.minus_ln2_hi = -0x1.62E400p-1f;
2595 params->neon_rr2_lut16_p3.minus_ln2_lo = -0x1.7F7D1Cp-20f;
2596 params->neon_rr2_lut16_p3.c3 = 0x1.55561Cp-3f;
2597 params->neon_rr2_lut16_p3.c2 = 0x1.0001ECp-1f;
2598 return sizeof(params->neon_rr2_lut16_p3);
2599}
2600
2601size_t xnn_init_f32_elu_neon_rr2_p6_params(
2602 union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
2603 float prescale,
2604 float alpha,
2605 float beta)
2606{
2607 params->neon_rr2_p6.prescale = prescale;
2608 params->neon_rr2_p6.alpha = alpha;
2609 params->neon_rr2_p6.beta = beta;
2610 params->neon_rr2_p6.sat_cutoff = -0x1.154246p+4f;
2611 params->neon_rr2_p6.magic_bias = 0x1.8000FEp23f;
2612 params->neon_rr2_p6.log2e = 0x1.715476p+0f;
2613 params->neon_rr2_p6.minus_ln2_hi = -0x1.62E440p-1f;
2614 params->neon_rr2_p6.minus_ln2_lo = 0x1.0105C6p-21f;
2615 params->neon_rr2_p6.c6 = 0x1.6b7338p-10f;
2616 params->neon_rr2_p6.c5 = 0x1.12278Ep-7f;
2617 params->neon_rr2_p6.c4 = 0x1.555716p-5f;
2618 params->neon_rr2_p6.c3 = 0x1.5554B0p-3f;
2619 params->neon_rr2_p6.c2 = 0x1.FFFFFEp-2f;
2620 return sizeof(params->neon_rr2_p6);
2621}
2622
2623size_t xnn_init_f32_elu_neonfma_rr1_lut16_p3_params(
2624 union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
2625 float prescale,
2626 float alpha,
2627 float beta)
2628{
2629 params->neonfma_rr1_lut16_p3.prescale = prescale;
2630 params->neonfma_rr1_lut16_p3.alpha = alpha;
2631 params->neonfma_rr1_lut16_p3.beta = beta;
2632 params->neonfma_rr1_lut16_p3.sat_cutoff = -0x1.154246p+4f;
2633 params->neonfma_rr1_lut16_p3.magic_bias = 0x1.800000p19f;
2634 params->neonfma_rr1_lut16_p3.log2e = 0x1.715476p+0f;
2635 params->neonfma_rr1_lut16_p3.minus_ln2 = -0x1.62E430p-1f;
2636 params->neonfma_rr1_lut16_p3.c3 = 0x1.55561Cp-3f;
2637 params->neonfma_rr1_lut16_p3.c2 = 0x1.0001ECp-1f;
2638 return sizeof(params->neonfma_rr1_lut16_p3);
2639}
2640
2641size_t xnn_init_f32_elu_neonfma_rr1_p6_params(
2642 union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
2643 float prescale,
2644 float alpha,
2645 float beta)
2646{
2647 params->neonfma_rr1_p6.prescale = prescale;
2648 params->neonfma_rr1_p6.alpha = alpha;
2649 params->neonfma_rr1_p6.beta = beta;
2650 params->neonfma_rr1_p6.sat_cutoff = -0x1.154246p+4f;
2651 params->neonfma_rr1_p6.magic_bias = 0x1.8000FEp23f;
2652 params->neonfma_rr1_p6.log2e = 0x1.715476p+0f;
2653 params->neonfma_rr1_p6.minus_ln2 = -0x1.62E430p-1f;
2654 params->neonfma_rr1_p6.c6 = 0x1.6b7338p-10f;
2655 params->neonfma_rr1_p6.c5 = 0x1.12278Ep-7f;
2656 params->neonfma_rr1_p6.c4 = 0x1.555716p-5f;
2657 params->neonfma_rr1_p6.c3 = 0x1.5554B0p-3f;
2658 params->neonfma_rr1_p6.c2 = 0x1.FFFFFEp-2f;
2659 return sizeof(params->neonfma_rr1_p6);
2660}
2661#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
2662
2663#if XNN_ARCH_X86 || XNN_ARCH_X86_64
2664size_t xnn_init_f32_elu_sse2_rr2_lut16_p3_params(
2665 union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
2666 float prescale,
2667 float alpha,
2668 float beta)
2669{
2670 for (uint32_t i = 0; i < 4; i++) {
2671 params->sse2_rr2_lut16_p3.prescale[i] = prescale;
2672 params->sse2_rr2_lut16_p3.alpha[i] = alpha;
2673 params->sse2_rr2_lut16_p3.beta[i] = beta;
2674 params->sse2_rr2_lut16_p3.sat_cutoff[i] = -0x1.154246p+4f;
2675 params->sse2_rr2_lut16_p3.magic_bias[i] = 0x1.800000p19f;
2676 params->sse2_rr2_lut16_p3.log2e[i] = 0x1.715476p+0f;
2677 params->sse2_rr2_lut16_p3.index_mask[i] = UINT32_C(0xF);
2678 params->sse2_rr2_lut16_p3.minus_ln2_hi[i] = -0x1.62E400p-1f;
2679 params->sse2_rr2_lut16_p3.minus_ln2_lo[i] = -0x1.7F7D1Cp-20f;
2680 params->sse2_rr2_lut16_p3.c3[i] = 0x1.55561Cp-3f;
2681 params->sse2_rr2_lut16_p3.c2[i] = 0x1.0001ECp-1f;
2682 params->sse2_rr2_lut16_p3.one[i] = 1.0f;
2683 }
2684 return sizeof(params->sse2_rr2_lut16_p3);
2685}
2686
2687size_t xnn_init_f32_elu_sse2_rr2_p6_params(
2688 union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
2689 float prescale,
2690 float alpha,
2691 float beta)
2692{
2693 for (uint32_t i = 0; i < 4; i++) {
2694 params->sse2_rr2_p6.prescale[i] = prescale;
2695 params->sse2_rr2_p6.alpha[i] = alpha;
2696 params->sse2_rr2_p6.beta[i] = beta;
2697 params->sse2_rr2_p6.sat_cutoff[i] = -0x1.154246p+4f;
2698 params->sse2_rr2_p6.magic_bias[i] = 0x1.8000FEp23f;
2699 params->sse2_rr2_p6.log2e[i] = 0x1.715476p+0f;
2700 params->sse2_rr2_p6.minus_ln2_hi[i] = -0x1.62E440p-1f;
2701 params->sse2_rr2_p6.minus_ln2_lo[i] = 0x1.0105C6p-21f;
2702 params->sse2_rr2_p6.c6[i] = 0x1.6b7338p-10f;
2703 params->sse2_rr2_p6.c5[i] = 0x1.12278Ep-7f;
2704 params->sse2_rr2_p6.c4[i] = 0x1.555716p-5f;
2705 params->sse2_rr2_p6.c3[i] = 0x1.5554B0p-3f;
2706 params->sse2_rr2_p6.c2[i] = 0x1.FFFFFEp-2f;
2707 params->sse2_rr2_p6.one[i] = 1.0f;
2708 }
2709 return sizeof(params->sse2_rr2_p6);
2710}
2711
2712size_t xnn_init_f32_elu_avx_rr2_lut16_p3_params(
2713 union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
2714 float prescale,
2715 float alpha,
2716 float beta)
2717{
2718 for (uint32_t i = 0; i < 8; i++) {
2719 params->avx_rr2_lut16_p3.prescale[i] = prescale;
2720 params->avx_rr2_lut16_p3.alpha[i] = alpha;
2721 params->avx_rr2_lut16_p3.beta[i] = beta;
2722 params->avx_rr2_lut16_p3.sat_cutoff[i] = -0x1.154246p+4f;
2723 params->avx_rr2_lut16_p3.magic_bias[i] = 0x1.800000p19f;
2724 params->avx_rr2_lut16_p3.log2e[i] = 0x1.715476p+0f;
2725 params->avx_rr2_lut16_p3.index_mask[i] = UINT32_C(0xF);
2726 params->avx_rr2_lut16_p3.minus_ln2_hi[i] = -0x1.62E400p-1f;
2727 params->avx_rr2_lut16_p3.minus_ln2_lo[i] = -0x1.7F7D1Cp-20f;
2728 params->avx_rr2_lut16_p3.c3[i] = 0x1.55561Cp-3f;
2729 params->avx_rr2_lut16_p3.c2[i] = 0x1.0001ECp-1f;
2730 params->avx_rr2_lut16_p3.one[i] = 1.0f;
2731 }
2732 for (uint32_t i = 0; i < 7; i++) {
2733 params->avx_rr2_lut16_p3.mask_table[i] = -1;
2734 }
2735 for (uint32_t i = 7; i < 14; i++) {
2736 params->avx_rr2_lut16_p3.mask_table[i] = 0;
2737 }
2738 return sizeof(params->avx_rr2_lut16_p3);
2739}
2740
2741size_t xnn_init_f32_elu_avx_rr2_lut4_p4_params(
2742 union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
2743 float prescale,
2744 float alpha,
2745 float beta)
2746{
2747 for (uint32_t i = 0; i < 8; i++) {
2748 params->avx_rr2_lut4_p4.prescale[i] = prescale;
2749 params->avx_rr2_lut4_p4.alpha[i] = alpha;
2750 params->avx_rr2_lut4_p4.beta[i] = beta;
2751 params->avx_rr2_lut4_p4.sat_cutoff[i] = -0x1.154246p+4f;
2752 params->avx_rr2_lut4_p4.magic_bias[i] = 0x1.8003F8p21f;
2753 params->avx_rr2_lut4_p4.log2e[i] = 0x1.715476p+0f;
2754 params->avx_rr2_lut4_p4.index_mask[i] = UINT32_C(0x3);
2755 }
2756 params->avx_rr2_lut4_p4.table[0] = 0x1.000000p+0f;
2757 params->avx_rr2_lut4_p4.table[1] = 0x1.306FE0p+0f;
2758 params->avx_rr2_lut4_p4.table[2] = 0x1.6A09E6p+0f;
2759 params->avx_rr2_lut4_p4.table[3] = 0x1.AE89FAp+0f;
2760 params->avx_rr2_lut4_p4.table[4] = 0x1.000000p+0f;
2761 params->avx_rr2_lut4_p4.table[5] = 0x1.306FE0p+0f;
2762 params->avx_rr2_lut4_p4.table[6] = 0x1.6A09E6p+0f;
2763 params->avx_rr2_lut4_p4.table[7] = 0x1.AE89FAp+0f;
2764 for (uint32_t i = 0; i < 8; i++) {
2765 params->avx_rr2_lut4_p4.minus_ln2_hi[i] = -0x1.62E400p-1f;
2766 params->avx_rr2_lut4_p4.minus_ln2_lo[i] = -0x1.7F7D1Cp-20f;
2767 params->avx_rr2_lut4_p4.c4[i] = 0x1.554F9Ap-5f;
2768 params->avx_rr2_lut4_p4.c3[i] = 0x1.557082p-3f;
2769 params->avx_rr2_lut4_p4.c2[i] = 0x1.000002p-1f;
2770 params->avx_rr2_lut4_p4.one[i] = 1.0f;
2771 }
2772 for (uint32_t i = 0; i < 7; i++) {
2773 params->avx_rr2_lut4_p4.mask_table[i] = -1;
2774 }
2775 for (uint32_t i = 7; i < 14; i++) {
2776 params->avx_rr2_lut4_p4.mask_table[i] = 0;
2777 }
2778 return sizeof(params->avx_rr2_lut4_p4);
2779}
2780
2781size_t xnn_init_f32_elu_avx_rr2_p6_params(
2782 union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
2783 float prescale,
2784 float alpha,
2785 float beta)
2786{
2787 for (uint32_t i = 0; i < 8; i++) {
2788 params->avx_rr2_p6.prescale[i] = prescale;
2789 params->avx_rr2_p6.alpha[i] = alpha;
2790 params->avx_rr2_p6.beta[i] = beta;
2791 params->avx_rr2_p6.sat_cutoff[i] = -0x1.154246p+4f;
2792 params->avx_rr2_p6.magic_bias[i] = 0x1.8000FEp23f;
2793 params->avx_rr2_p6.log2e[i] = 0x1.715476p+0f;
2794 params->avx_rr2_p6.minus_ln2_hi[i] = -0x1.62E440p-1f;
2795 params->avx_rr2_p6.minus_ln2_lo[i] = 0x1.0105C6p-21f;
2796 params->avx_rr2_p6.c6[i] = 0x1.6b7338p-10f;
2797 params->avx_rr2_p6.c5[i] = 0x1.12278Ep-7f;
2798 params->avx_rr2_p6.c4[i] = 0x1.555716p-5f;
2799 params->avx_rr2_p6.c3[i] = 0x1.5554B0p-3f;
2800 params->avx_rr2_p6.c2[i] = 0x1.FFFFFEp-2f;
2801 params->avx_rr2_p6.one[i] = 1.0f;
2802 }
2803 for (uint32_t i = 0; i < 7; i++) {
2804 params->avx_rr2_p6.mask_table[i] = -1;
2805 }
2806 for (uint32_t i = 7; i < 14; i++) {
2807 params->avx_rr2_p6.mask_table[i] = 0;
2808 }
2809 return sizeof(params->avx_rr2_p6);
2810}
2811
2812size_t xnn_init_f32_elu_avx2_rr1_lut16_p3_params(
2813 union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
2814 float prescale,
2815 float alpha,
2816 float beta)
2817{
2818 for (uint32_t i = 0; i < 8; i++) {
2819 params->avx2_rr1_lut16_p3.prescale[i] = prescale;
2820 params->avx2_rr1_lut16_p3.alpha[i] = alpha;
2821 params->avx2_rr1_lut16_p3.beta[i] = beta;
2822 params->avx2_rr1_lut16_p3.sat_cutoff[i] = -0x1.154246p+4f;
2823 params->avx2_rr1_lut16_p3.magic_bias[i] = 0x1.800000p19f;
2824 params->avx2_rr1_lut16_p3.log2e[i] = 0x1.715476p+0f;
2825 params->avx2_rr1_lut16_p3.index_mask[i] = UINT32_C(0xF);
2826 params->avx2_rr1_lut16_p3.minus_ln2[i] = -0x1.62E430p-1f;
2827 params->avx2_rr1_lut16_p3.c3[i] = 0x1.55561Cp-3f;
2828 params->avx2_rr1_lut16_p3.c2[i] = 0x1.0001ECp-1f;
2829 }
2830 for (uint32_t i = 0; i < 7; i++) {
2831 params->avx2_rr1_lut16_p3.mask_table[i] = -1;
2832 }
2833 for (uint32_t i = 7; i < 14; i++) {
2834 params->avx2_rr1_lut16_p3.mask_table[i] = 0;
2835 }
2836 return sizeof(params->avx2_rr1_lut16_p3);
2837}
2838
2839size_t xnn_init_f32_elu_avx2_rr1_lut8_p4_params(
2840 union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
2841 float prescale,
2842 float alpha,
2843 float beta)
2844{
2845 for (uint32_t i = 0; i < 8; i++) {
2846 params->avx2_rr1_lut8_p4.prescale[i] = prescale;
2847 params->avx2_rr1_lut8_p4.alpha[i] = alpha;
2848 params->avx2_rr1_lut8_p4.beta[i] = beta;
2849 params->avx2_rr1_lut8_p4.sat_cutoff[i] = -0x1.154246p+4f;
2850 params->avx2_rr1_lut8_p4.magic_bias[i] = 0x1.800000p20f;
2851 params->avx2_rr1_lut8_p4.log2e[i] = 0x1.715476p+0f;
2852 }
2853 params->avx2_rr1_lut8_p4.table[0] = UINT32_C(0x3F800000);
2854 params->avx2_rr1_lut8_p4.table[1] = UINT32_C(0x3F7B95C2);
2855 params->avx2_rr1_lut8_p4.table[2] = UINT32_C(0x3F7837F0);
2856 params->avx2_rr1_lut8_p4.table[3] = UINT32_C(0x3F75FED7);
2857 params->avx2_rr1_lut8_p4.table[4] = UINT32_C(0x3F7504F3);
2858 params->avx2_rr1_lut8_p4.table[5] = UINT32_C(0x3F75672A);
2859 params->avx2_rr1_lut8_p4.table[6] = UINT32_C(0x3F7744FD);
2860 params->avx2_rr1_lut8_p4.table[7] = UINT32_C(0x3F7AC0C7);
2861 for (uint32_t i = 0; i < 8; i++) {
2862 params->avx2_rr1_lut8_p4.minus_ln2[i] = -0x1.62E430p-1f;
2863 params->avx2_rr1_lut8_p4.c4[i] = 0x1.5558ECp-5f;
2864 params->avx2_rr1_lut8_p4.c3[i] = 0x1.555C20p-3f;
2865 params->avx2_rr1_lut8_p4.c2[i] = 0x1.000000p-1f;
2866 }
2867 for (uint32_t i = 0; i < 7; i++) {
2868 params->avx2_rr1_lut8_p4.mask_table[i] = -1;
2869 }
2870 for (uint32_t i = 7; i < 14; i++) {
2871 params->avx2_rr1_lut8_p4.mask_table[i] = 0;
2872 }
2873 return sizeof(params->avx2_rr1_lut8_p4);
2874}
2875
2876size_t xnn_init_f32_elu_avx2_rr1_lut4_p4_params(
2877 union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
2878 float prescale,
2879 float alpha,
2880 float beta)
2881{
2882 for (uint32_t i = 0; i < 8; i++) {
2883 params->avx2_rr1_lut4_p4.prescale[i] = prescale;
2884 params->avx2_rr1_lut4_p4.alpha[i] = alpha;
2885 params->avx2_rr1_lut4_p4.beta[i] = beta;
2886 params->avx2_rr1_lut4_p4.sat_cutoff[i] = -0x1.154246p+4f;
2887 params->avx2_rr1_lut4_p4.magic_bias[i] = 0x1.800000p21f;
2888 params->avx2_rr1_lut4_p4.log2e[i] = 0x1.715476p+0f;
2889 }
2890 params->avx2_rr1_lut4_p4.table[0] = 0x1.000000p+0f;
2891 params->avx2_rr1_lut4_p4.table[1] = 0x1.F06FE0p-1f;
2892 params->avx2_rr1_lut4_p4.table[2] = 0x1.EA09E6p-1f;
2893 params->avx2_rr1_lut4_p4.table[3] = 0x1.EE89FAp-1f;
2894 params->avx2_rr1_lut4_p4.table[4] = 0x1.000000p+0f;
2895 params->avx2_rr1_lut4_p4.table[5] = 0x1.F06FE0p-1f;
2896 params->avx2_rr1_lut4_p4.table[6] = 0x1.EA09E6p-1f;
2897 params->avx2_rr1_lut4_p4.table[7] = 0x1.EE89FAp-1f;
2898 for (uint32_t i = 0; i < 8; i++) {
2899 params->avx2_rr1_lut4_p4.minus_ln2[i] = -0x1.62E430p-1f;
2900 params->avx2_rr1_lut4_p4.c4[i] = 0x1.554F9Ap-5f;
2901 params->avx2_rr1_lut4_p4.c3[i] = 0x1.557082p-3f;
2902 params->avx2_rr1_lut4_p4.c2[i] = 0x1.000002p-1f;
2903 }
2904 for (uint32_t i = 0; i < 7; i++) {
2905 params->avx2_rr1_lut4_p4.mask_table[i] = -1;
2906 }
2907 for (uint32_t i = 7; i < 14; i++) {
2908 params->avx2_rr1_lut4_p4.mask_table[i] = 0;
2909 }
2910 return sizeof(params->avx2_rr1_lut4_p4);
2911}
2912
2913size_t xnn_init_f32_elu_avx2_rr1_p6_params(
2914 union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
2915 float prescale,
2916 float alpha,
2917 float beta)
2918{
2919 for (uint32_t i = 0; i < 8; i++) {
2920 params->avx2_rr1_p6.prescale[i] = prescale;
2921 params->avx2_rr1_p6.alpha[i] = alpha;
2922 params->avx2_rr1_p6.beta[i] = beta;
2923 params->avx2_rr1_p6.sat_cutoff[i] = -0x1.154246p+4f;
2924 params->avx2_rr1_p6.magic_bias[i] = 0x1.8000FEp23f;
2925 params->avx2_rr1_p6.log2e[i] = 0x1.715476p+0f;
2926 params->avx2_rr1_p6.minus_ln2[i] = -0x1.62E430p-1f;
2927 params->avx2_rr1_p6.c6[i] = 0x1.6B7338p-10f;
2928 params->avx2_rr1_p6.c5[i] = 0x1.12278Ep-7f;
2929 params->avx2_rr1_p6.c4[i] = 0x1.555716p-5f;
2930 params->avx2_rr1_p6.c3[i] = 0x1.5554B0p-3f;
2931 params->avx2_rr1_p6.c2[i] = 0x1.FFFFFEp-2f;
2932 }
2933 for (uint32_t i = 0; i < 7; i++) {
2934 params->avx2_rr1_p6.mask_table[i] = -1;
2935 }
2936 for (uint32_t i = 7; i < 14; i++) {
2937 params->avx2_rr1_p6.mask_table[i] = 0;
2938 }
2939 return sizeof(params->avx2_rr1_p6);
2940}
2941
2942size_t xnn_init_f32_elu_avx512_rr1_lut16_p3_params(
2943 union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
2944 float prescale,
2945 float alpha,
2946 float beta)
2947{
2948 params->avx512_rr1_lut16_p3.prescale = prescale;
2949 params->avx512_rr1_lut16_p3.alpha = alpha;
2950 params->avx512_rr1_lut16_p3.beta = beta;
2951 params->avx512_rr1_lut16_p3.sat_cutoff = -0x1.154246p+4f;
2952 params->avx512_rr1_lut16_p3.magic_bias = 0x1.800000p19f;
2953 params->avx512_rr1_lut16_p3.log2e = 0x1.715476p+0f;
2954 params->avx512_rr1_lut16_p3.minus_ln2 = -0x1.62E430p-1f;
2955 params->avx512_rr1_lut16_p3.c3 = 0x1.55561Cp-3f;
2956 params->avx512_rr1_lut16_p3.c2 = 0x1.0001ECp-1f;
2957 params->avx512_rr1_lut16_p3.table[ 0] = UINT32_C(0x3F800000);
2958 params->avx512_rr1_lut16_p3.table[ 1] = UINT32_C(0x3F7DAAC3);
2959 params->avx512_rr1_lut16_p3.table[ 2] = UINT32_C(0x3F7B95C2);
2960 params->avx512_rr1_lut16_p3.table[ 3] = UINT32_C(0x3F79C3D3);
2961 params->avx512_rr1_lut16_p3.table[ 4] = UINT32_C(0x3F7837F0);
2962 params->avx512_rr1_lut16_p3.table[ 5] = UINT32_C(0x3F76F532);
2963 params->avx512_rr1_lut16_p3.table[ 6] = UINT32_C(0x3F75FED7);
2964 params->avx512_rr1_lut16_p3.table[ 7] = UINT32_C(0x3F75583F);
2965 params->avx512_rr1_lut16_p3.table[ 8] = UINT32_C(0x3F7504F3);
2966 params->avx512_rr1_lut16_p3.table[ 9] = UINT32_C(0x3F7508A4);
2967 params->avx512_rr1_lut16_p3.table[10] = UINT32_C(0x3F75672A);
2968 params->avx512_rr1_lut16_p3.table[11] = UINT32_C(0x3F76248C);
2969 params->avx512_rr1_lut16_p3.table[12] = UINT32_C(0x3F7744FD);
2970 params->avx512_rr1_lut16_p3.table[13] = UINT32_C(0x3F78CCDF);
2971 params->avx512_rr1_lut16_p3.table[14] = UINT32_C(0x3F7AC0C7);
2972 params->avx512_rr1_lut16_p3.table[15] = UINT32_C(0x3F7D257D);
2973 return sizeof(params->avx512_rr1_lut16_p3);
2974}
2975
2976size_t xnn_init_f32_elu_avx512_rr1_p6_params(
2977 union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
2978 float prescale,
2979 float alpha,
2980 float beta)
2981{
2982 params->avx512_rr1_p6.prescale = prescale;
2983 params->avx512_rr1_p6.alpha = alpha;
2984 params->avx512_rr1_p6.beta = beta;
2985 params->avx512_rr1_p6.sat_cutoff = -0x1.154246p+4f;
2986 params->avx512_rr1_p6.magic_bias = 0x1.8000FEp23f;
2987 params->avx512_rr1_p6.log2e = 0x1.715476p+0f;
2988 params->avx512_rr1_p6.minus_ln2 = -0x1.62E430p-1f;
2989 params->avx512_rr1_p6.c6 = 0x1.6B7338p-10f;
2990 params->avx512_rr1_p6.c5 = 0x1.12278Ep-7f;
2991 params->avx512_rr1_p6.c4 = 0x1.555716p-5f;
2992 params->avx512_rr1_p6.c3 = 0x1.5554B0p-3f;
2993 params->avx512_rr1_p6.c2 = 0x1.FFFFFEp-2f;
2994 return sizeof(params->avx512_rr1_p6);
2995}
2996#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
2997
2998#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
2999size_t xnn_init_f32_elu_wasmsimd_rr2_lut16_p3_params(
3000 union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
3001 float prescale,
3002 float alpha,
3003 float beta)
3004{
3005 for (uint32_t i = 0; i < 2; i++) {
3006 params->wasmsimd_rr2_lut16_p3.prescale[i] = prescale;
3007 params->wasmsimd_rr2_lut16_p3.alpha[i] = alpha;
3008 params->wasmsimd_rr2_lut16_p3.beta[i] = beta;
3009 params->wasmsimd_rr2_lut16_p3.sat_cutoff[i] = -0x1.154246p+4f;
3010 params->wasmsimd_rr2_lut16_p3.magic_bias[i] = 0x1.800000p19f;
3011 params->wasmsimd_rr2_lut16_p3.log2e[i] = 0x1.715476p+0f;
3012 params->wasmsimd_rr2_lut16_p3.index_mask[i] = UINT32_C(0xF);
3013 params->wasmsimd_rr2_lut16_p3.minus_ln2_hi[i] = -0x1.62E400p-1f;
3014 params->wasmsimd_rr2_lut16_p3.minus_ln2_lo[i] = -0x1.7F7D1Cp-20f;
3015 params->wasmsimd_rr2_lut16_p3.c3[i] = 0x1.55561Cp-3f;
3016 params->wasmsimd_rr2_lut16_p3.c2[i] = 0x1.0001ECp-1f;
3017 params->wasmsimd_rr2_lut16_p3.one[i] = 1.0f;
3018 }
3019 return sizeof(params->wasmsimd_rr2_lut16_p3);
3020}
3021
3022size_t xnn_init_f32_elu_wasmsimd_rr2_p6_params(
3023 union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
3024 float prescale,
3025 float alpha,
3026 float beta)
3027{
3028 for (uint32_t i = 0; i < 2; i++) {
3029 params->wasmsimd_rr2_p6.prescale[i] = prescale;
3030 params->wasmsimd_rr2_p6.alpha[i] = alpha;
3031 params->wasmsimd_rr2_p6.beta[i] = beta;
3032 params->wasmsimd_rr2_p6.sat_cutoff[i] = -0x1.154246p+4f;
3033 params->wasmsimd_rr2_p6.magic_bias[i] = 0x1.8000FEp23f;
3034 params->wasmsimd_rr2_p6.log2e[i] = 0x1.715476p+0f;
3035 params->wasmsimd_rr2_p6.minus_ln2_hi[i] = -0x1.62E440p-1f;
3036 params->wasmsimd_rr2_p6.minus_ln2_lo[i] = 0x1.0105C6p-21f;
3037 params->wasmsimd_rr2_p6.c6[i] = 0x1.6b7338p-10f;
3038 params->wasmsimd_rr2_p6.c5[i] = 0x1.12278Ep-7f;
3039 params->wasmsimd_rr2_p6.c4[i] = 0x1.555716p-5f;
3040 params->wasmsimd_rr2_p6.c3[i] = 0x1.5554B0p-3f;
3041 params->wasmsimd_rr2_p6.c2[i] = 0x1.FFFFFEp-2f;
3042 params->wasmsimd_rr2_p6.one[i] = 1.0f;
3043 }
3044 return sizeof(params->wasmsimd_rr2_p6);
3045}
3046#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
3047
3048#if XNN_ARCH_ARM || XNN_ARCH_ARM64
3049size_t xnn_init_f16_expminus_fp16arith_rr2_p2_params(
3050 union xnn_f16_expminus_params params[XNN_MIN_ELEMENTS(1)])
3051{
3052 params->fp16arith_rr2_p2.magic_bias = UINT16_C(0x660F); // 0x1.83Cp+10h
3053 params->fp16arith_rr2_p2.log2e = UINT16_C(0x3DC5); // 0x1.714p+0h
3054 params->fp16arith_rr2_p2.minus_ln2_hi = UINT16_C(0xB98C); // -0x1.630p-1h
3055 params->fp16arith_rr2_p2.minus_ln2_lo = UINT16_C(0x0AF4); // 0x1.BD0p-13h
3056 params->fp16arith_rr2_p2.c2 = UINT16_C(0x37F9); // 0x1.FE4p-2h
3057 params->fp16arith_rr2_p2.c1 = UINT16_C(0x3C0E); // 0x1.038p+0h
3058 params->fp16arith_rr2_p2.denorm_cutoff = UINT16_C(0xC8DA); // -0x1.368p+3h
3059 return sizeof(params->fp16arith_rr2_p2);
3060}
3061#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
3062
3063#if XNN_ARCH_X86 || XNN_ARCH_X86_64
3064size_t xnn_init_f16_expminus_avx2_rr1_p2_params(
3065 union xnn_f16_expminus_params params[XNN_MIN_ELEMENTS(1)])
3066{
3067 for (uint32_t i = 0; i < 8; i++) {
3068 params->avx2_rr1_p2.magic_bias[i] = 0x1.8000FEp23f;
3069 params->avx2_rr1_p2.log2e[i] = 0x1.715476p0f;
3070 params->avx2_rr1_p2.minus_ln2[i] = -0x1.62E43p-1f;
3071 params->avx2_rr1_p2.c2[i] = 0x1.FF3A32p-2f;
3072 params->avx2_rr1_p2.c1[i] = 0x1.039E10p+0f;
3073 params->avx2_rr1_p2.denorm_cutoff[i] = -0x1.368000p+3f;
3074 }
3075 return sizeof(params->avx2_rr1_p2);
3076}
3077#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
3078
3079size_t xnn_init_f32_expminus_scalar_rr2_p5_params(
3080 union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
3081{
3082 params->scalar_rr2_p5.log2e = 0x1.715476p+0f;
3083 params->scalar_rr2_p5.magic_bias = 0x1.8000FEp23f;
3084 params->scalar_rr2_p5.minus_ln2_hi = -0x1.62E400p-1f;
3085 params->scalar_rr2_p5.minus_ln2_lo = -0x1.7F7D1Cp-20f;
3086 params->scalar_rr2_p5.c5 = 0x1.0F9F9Cp-7f;
3087 params->scalar_rr2_p5.c4 = 0x1.573A1Ap-5f;
3088 params->scalar_rr2_p5.c3 = 0x1.555A80p-3f;
3089 params->scalar_rr2_p5.c2 = 0x1.FFFDC6p-2f;
3090 params->scalar_rr2_p5.c1 = 0x1.FFFFF6p-1f;
3091 params->scalar_rr2_p5.denorm_cutoff = -0x1.5D589Ep6f;
3092 return sizeof(params->scalar_rr2_p5);
3093}
3094
3095size_t xnn_init_f32_expminus_scalar_rr2_lut64_p2_params(
3096 union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
3097{
3098 params->scalar_rr2_lut64_p2.log2e = 0x1.715476p0f;
3099 params->scalar_rr2_lut64_p2.magic_bias = 0x1.800000p17f;
3100 params->scalar_rr2_lut64_p2.minus_ln2_hi = -0x1.630000p-1f;
3101 params->scalar_rr2_lut64_p2.minus_ln2_lo = 0x1.BD0106p-13f;
3102 params->scalar_rr2_lut64_p2.c2 = 0x1.FFFF0Ap-2f;
3103 params->scalar_rr2_lut64_p2.denorm_cutoff = -0x1.5D589Ep6f;
3104 return sizeof(params->scalar_rr2_lut64_p2);
3105}
3106
3107#if XNN_ARCH_ARM || XNN_ARCH_ARM64
3108size_t xnn_init_f32_expminus_neon_rr2_p5_params(
3109 union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
3110{
3111 params->neon_rr2_p5.log2e = 0x1.715476p+0f;
3112 params->neon_rr2_p5.magic_bias = 0x1.8000FEp23f;
3113 params->neon_rr2_p5.minus_ln2_hi = -0x1.62E400p-1f;
3114 params->neon_rr2_p5.minus_ln2_lo = -0x1.7F7D1Cp-20f;
3115 params->neon_rr2_p5.c5 = 0x1.0F9F9Cp-7f;
3116 params->neon_rr2_p5.c4 = 0x1.573A1Ap-5f;
3117 params->neon_rr2_p5.c3 = 0x1.555A80p-3f;
3118 params->neon_rr2_p5.c2 = 0x1.FFFDC6p-2f;
3119 params->neon_rr2_p5.c1 = 0x1.FFFFF6p-1f;
3120 params->neon_rr2_p5.denorm_cutoff = -0x1.5D589Ep6f;
3121 return sizeof(params->neon_rr2_p5);
3122}
3123
3124size_t xnn_init_f32_expminus_neon_rr2_lut64_p2_params(
3125 union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
3126{
3127 params->neon_rr2_lut64_p2.log2e = 0x1.715476p+0f;
3128 params->neon_rr2_lut64_p2.magic_bias = 0x1.800000p17f;
3129 params->neon_rr2_lut64_p2.minus_ln2_hi = -0x1.62E400p-1f;
3130 params->neon_rr2_lut64_p2.minus_ln2_lo = -0x1.7F7D1Cp-20f;
3131 params->neon_rr2_lut64_p2.c2 = 0x1.FFFF0Ap-2f;
3132 params->neon_rr2_lut64_p2.denorm_cutoff = -0x1.5D589Ep6f;
3133 return sizeof(params->neon_rr2_lut64_p2);
3134}
3135
3136size_t xnn_init_f32_expminus_neonfma_rr1_p5_params(
3137 union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
3138{
3139 params->neonfma_rr1_p5.log2e = 0x1.715476p+0f;
3140 params->neonfma_rr1_p5.magic_bias = 0x1.8000FEp23f;
3141 params->neonfma_rr1_p5.minus_ln2 = -0x1.62E430p-1f;
3142 params->neonfma_rr1_p5.c5 = 0x1.0F9F9Cp-7f;
3143 params->neonfma_rr1_p5.c4 = 0x1.573A1Ap-5f;
3144 params->neonfma_rr1_p5.c3 = 0x1.555A80p-3f;
3145 params->neonfma_rr1_p5.c2 = 0x1.FFFDC6p-2f;
3146 params->neonfma_rr1_p5.c1 = 0x1.FFFFF6p-1f;
3147 params->neonfma_rr1_p5.denorm_cutoff = -0x1.5D589Ep6f;
3148 return sizeof(params->neonfma_rr1_p5);
3149}
3150
3151size_t xnn_init_f32_expminus_neonfma_rr1_lut64_p2_params(
3152 union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
3153{
3154 params->neonfma_rr1_lut64_p2.log2e = 0x1.715476p+0f;
3155 params->neonfma_rr1_lut64_p2.magic_bias = 0x1.800000p17f;
3156 params->neonfma_rr1_lut64_p2.minus_ln2 = -0x1.62E430p-1f;
3157 params->neonfma_rr1_lut64_p2.c2 = 0x1.FFFF0Ap-2f;
3158 params->neonfma_rr1_lut64_p2.denorm_cutoff = -0x1.5D589Ep6f;
3159 return sizeof(params->neonfma_rr1_lut64_p2);
3160}
3161#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
3162
3163#if XNN_ARCH_X86 || XNN_ARCH_X86_64
3164size_t xnn_init_f32_expminus_sse2_rr2_p5_params(
3165 union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
3166{
3167 for (uint32_t i = 0; i < 4; i++) {
3168 params->sse2_rr2_p5.log2e[i] = 0x1.715476p+0f;
3169 params->sse2_rr2_p5.magic_bias[i] = 0x1.8000FEp23f;
3170 params->sse2_rr2_p5.minus_ln2_hi[i] = -0x1.62E400p-1f;
3171 params->sse2_rr2_p5.minus_ln2_lo[i] = -0x1.7F7D1Cp-20f;
3172 params->sse2_rr2_p5.c5[i] = 0x1.0F9F9Cp-7f;
3173 params->sse2_rr2_p5.c4[i] = 0x1.573A1Ap-5f;
3174 params->sse2_rr2_p5.c3[i] = 0x1.555A80p-3f;
3175 params->sse2_rr2_p5.c2[i] = 0x1.FFFDC6p-2f;
3176 params->sse2_rr2_p5.c1[i] = 0x1.FFFFF6p-1f;
3177 params->sse2_rr2_p5.denorm_cutoff[i] = -0x1.5D589Ep6f;
3178 }
3179 return sizeof(params->sse2_rr2_p5);
3180}
3181
3182size_t xnn_init_f32_expminus_avx2_rr1_p5_params(
3183 union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
3184{
3185 for (uint32_t i = 0; i < 8; i++) {
3186 params->avx2_rr1_p5.log2e[i] = 0x1.715476p+0f;
3187 params->avx2_rr1_p5.magic_bias[i] = 0x1.8000FEp23f;
3188 params->avx2_rr1_p5.minus_ln2[i] = -0x1.62E430p-1f;
3189 params->avx2_rr1_p5.c5[i] = 0x1.0F9F9Cp-7f;
3190 params->avx2_rr1_p5.c4[i] = 0x1.573A1Ap-5f;
3191 params->avx2_rr1_p5.c3[i] = 0x1.555A80p-3f;
3192 params->avx2_rr1_p5.c2[i] = 0x1.FFFDC6p-2f;
3193 params->avx2_rr1_p5.c1[i] = 0x1.FFFFF6p-1f;
3194 params->avx2_rr1_p5.denorm_cutoff[i] = -0x1.5D589Ep6f;
3195 }
3196 for (uint32_t i = 0; i < 7; i++) {
3197 params->avx2_rr1_p5.mask_table[i] = -1;
3198 }
3199 for (uint32_t i = 7; i < 14; i++) {
3200 params->avx2_rr1_p5.mask_table[i] = 0;
3201 }
3202 return sizeof(params->avx2_rr1_p5);
3203}
3204
3205size_t xnn_init_f32_expminus_avx512_rr1_p5_params(
3206 union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
3207{
3208 params->avx512_rr1_p5.log2e = 0x1.715476p+0f;
3209 params->avx512_rr1_p5.minus_ln2 = -0x1.62E430p-1f;
3210 params->avx512_rr1_p5.c5 = 0x1.0F9F9Cp-7f;
3211 params->avx512_rr1_p5.c4 = 0x1.573A1Ap-5f;
3212 params->avx512_rr1_p5.c3 = 0x1.555A80p-3f;
3213 params->avx512_rr1_p5.c2 = 0x1.FFFDC6p-2f;
3214 params->avx512_rr1_p5.c1 = 0x1.FFFFF6p-1f;
3215 params->avx512_rr1_p5.c0 = 1.0f;
3216 return sizeof(params->avx512_rr1_p5);
3217}
3218#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
3219
3220#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
3221size_t xnn_init_f32_expminus_wasmsimd_rr2_p5_params(
3222 union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
3223{
3224 for (uint32_t i = 0; i < 2; i++) {
3225 params->wasmsimd_rr2_p5.log2e[i] = 0x1.715476p+0f;
3226 params->wasmsimd_rr2_p5.magic_bias[i] = 0x1.8000FEp23f;
3227 params->wasmsimd_rr2_p5.minus_ln2_hi[i] = -0x1.62E400p-1f;
3228 params->wasmsimd_rr2_p5.minus_ln2_lo[i] = -0x1.7F7D1Cp-20f;
3229 params->wasmsimd_rr2_p5.c5[i] = 0x1.0F9F9Cp-7f;
3230 params->wasmsimd_rr2_p5.c4[i] = 0x1.573A1Ap-5f;
3231 params->wasmsimd_rr2_p5.c3[i] = 0x1.555A80p-3f;
3232 params->wasmsimd_rr2_p5.c2[i] = 0x1.FFFDC6p-2f;
3233 params->wasmsimd_rr2_p5.c1[i] = 0x1.FFFFF6p-1f;
3234 params->wasmsimd_rr2_p5.denorm_cutoff[i] = -0x1.5D589Ep6f;
3235 }
3236 return sizeof(params->wasmsimd_rr2_p5);
3237}
3238#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
3239
3240#if XNN_ARCH_ARM || XNN_ARCH_ARM64
3241size_t xnn_init_f16_lrelu_fp16arith_params(
3242 union xnn_f16_lrelu_params params[XNN_MIN_ELEMENTS(1)],
3243 uint16_t slope)
3244{
3245 params->fp16arith.slope = slope;
3246 return sizeof(params->fp16arith);
3247}
3248#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
3249
3250#if XNN_ARCH_X86 || XNN_ARCH_X86_64
3251size_t xnn_init_f16_lrelu_avx_params(
3252 union xnn_f16_lrelu_params params[XNN_MIN_ELEMENTS(1)],
3253 uint16_t slope)
3254{
3255 for (uint32_t i = 0; i < 8; i++) {
3256 params->avx.slope[i] = fp16_ieee_to_fp32_value(slope);
3257 }
3258 return sizeof(params->avx);
3259}
3260#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
3261
3262size_t xnn_init_f32_lrelu_scalar_params(
3263 union xnn_f32_lrelu_params params[XNN_MIN_ELEMENTS(1)],
3264 float slope)
3265{
3266 params->scalar.slope = slope;
3267 return sizeof(params->scalar);
3268}
3269
3270#if XNN_ARCH_X86 || XNN_ARCH_X86_64
3271size_t xnn_init_f32_lrelu_sse_params(
3272 union xnn_f32_lrelu_params params[XNN_MIN_ELEMENTS(1)],
3273 float slope)
3274{
3275 for (uint32_t i = 0; i < 4; i++) {
3276 params->sse.slope[i] = slope;
3277 }
3278 return sizeof(params->sse);
3279}
3280
3281size_t xnn_init_f32_lrelu_avx_params(
3282 union xnn_f32_lrelu_params params[XNN_MIN_ELEMENTS(1)],
3283 float slope)
3284{
3285 for (uint32_t i = 0; i < 8; i++) {
3286 params->avx.slope[i] = slope;
3287 }
3288 for (uint32_t i = 0; i < 7; i++) {
3289 params->avx.mask_table[i] = -1;
3290 }
3291 for (uint32_t i = 7; i < 14; i++) {
3292 params->avx.mask_table[i] = 0;
3293 }
3294 return sizeof(params->avx);
3295}
3296#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
3297
3298#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
3299size_t xnn_init_f32_lrelu_wasmsimd_params(
3300 union xnn_f32_lrelu_params params[XNN_MIN_ELEMENTS(1)],
3301 float slope)
3302{
3303 params->wasmsimd.slope[0] = slope;
3304 params->wasmsimd.slope[1] = slope;
3305 return sizeof(params->wasmsimd);
3306}
3307#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
3308
3309size_t xnn_init_qs8_lrelu_scalar_select_params(
3310 union xnn_qs8_lrelu_params params[XNN_MIN_ELEMENTS(1)],
3311 float positive_scale,
3312 float negative_scale,
3313 int8_t input_zero_point,
3314 int8_t output_zero_point)
3315{
3316 assert(positive_scale >= 0x1.0p-8f);
3317 assert(positive_scale <= 0x1.0p+7f);
3318 assert(negative_scale <= 0x1.0p+7f);
3319 assert(negative_scale >= -0x1.FFFC00p+6f);
3320 assert(fabsf(negative_scale) >= 0x1.0p-8f);
3321
3322 const long positive_multiplier = lrintf(256.0f * positive_scale);
3323 assert(positive_multiplier >= 1L);
3324 assert(positive_multiplier <= 32768L);
3325 const long negative_multiplier = lrintf(256.0f * negative_scale);
3326 assert(negative_multiplier <= 32768L);
3327 assert(negative_multiplier >= -32767L);
3328 assert(negative_multiplier != 0L);
3329 params->scalar_select.input_zero_point = (int32_t) input_zero_point;
3330 params->scalar_select.positive_multiplier = (int32_t) positive_multiplier;
3331 params->scalar_select.negative_multiplier = (int32_t) negative_multiplier;
3332 params->scalar_select.bias = ((int32_t) output_zero_point << 8) + INT32_C(0x80);
3333 return sizeof(params->scalar_select);
3334}
3335
3336size_t xnn_init_qs8_lrelu_scalar_andxor_params(
3337 union xnn_qs8_lrelu_params params[XNN_MIN_ELEMENTS(1)],
3338 float positive_scale,
3339 float negative_scale,
3340 int8_t input_zero_point,
3341 int8_t output_zero_point)
3342{
3343 assert(positive_scale >= 0x1.0p-8f);
3344 assert(positive_scale <= 0x1.0p+7f);
3345 assert(negative_scale <= 0x1.0p+7f);
3346 assert(negative_scale >= -0x1.FFFC00p+6f);
3347 assert(fabsf(negative_scale) >= 0x1.0p-8f);
3348
3349 const long positive_multiplier = lrintf(256.0f * positive_scale);
3350 assert(positive_multiplier >= 1L);
3351 assert(positive_multiplier <= 32768L);
3352 const long negative_multiplier = lrintf(256.0f * negative_scale);
3353 assert(negative_multiplier <= 32768L);
3354 assert(negative_multiplier >= -32767L);
3355 assert(negative_multiplier != 0L);
3356 params->scalar_andxor.input_zero_point = (int32_t) input_zero_point;
3357 params->scalar_andxor.multiplier_base = (int32_t) positive_multiplier;
3358 params->scalar_andxor.multiplier_diff = (int32_t) negative_multiplier ^ (int32_t) positive_multiplier;
3359 params->scalar_andxor.bias = ((int32_t) output_zero_point << 8) + INT32_C(0x80);
3360 return sizeof(params->scalar_andxor);
3361}
3362
3363#if XNN_ARCH_ARM
3364size_t xnn_init_qs8_lrelu_armsimd32_params(
3365 union xnn_qs8_lrelu_params params[XNN_MIN_ELEMENTS(1)],
3366 float positive_scale,
3367 float negative_scale,
3368 int8_t input_zero_point,
3369 int8_t output_zero_point)
3370{
3371 assert(positive_scale >= 0x1.0p-8f);
3372 assert(positive_scale <= 0x1.0p+7f);
3373 assert(negative_scale <= 0x1.0p+7f);
3374 assert(negative_scale >= -0x1.FFFC00p+6f);
3375 assert(fabsf(negative_scale) >= 0x1.0p-8f);
3376
3377 const long positive_multiplier = lrintf(-256.0f * positive_scale);
3378 assert(positive_multiplier <= -1L);
3379 assert(positive_multiplier >= -32768L);
3380 const long negative_multiplier = lrintf(-256.0f * negative_scale);
3381 assert(negative_multiplier >= -32768L);
3382 assert(negative_multiplier <= 32767L);
3383 assert(negative_multiplier != 0L);
3384 params->armsimd32.input_zero_point = (uint32_t) (uint16_t) (int16_t) input_zero_point * UINT32_C(0x00010001);
3385 params->armsimd32.positive_multiplier = (uint32_t) (uint16_t) (int16_t) positive_multiplier * UINT32_C(0x00010001);
3386 params->armsimd32.negative_multiplier = (uint32_t) (uint16_t) (int16_t) negative_multiplier * UINT32_C(0x00010001);
3387 params->armsimd32.bias = ((int32_t) output_zero_point << 8) + INT32_C(0x80);
3388 return sizeof(params->armsimd32);
3389}
3390#endif // XNN_ARCH_ARM
3391
3392#if XNN_ARCH_ARM || XNN_ARCH_ARM64
3393size_t xnn_init_qs8_lrelu_neon_params(
3394 union xnn_qs8_lrelu_params params[XNN_MIN_ELEMENTS(1)],
3395 float positive_scale,
3396 float negative_scale,
3397 int8_t input_zero_point,
3398 int8_t output_zero_point)
3399{
3400 assert(positive_scale >= 0x1.0p-8f);
3401 assert(positive_scale <= 0x1.0p+7f);
3402 assert(negative_scale <= 0x1.0p+7f);
3403 assert(negative_scale >= -0x1.FFFC00p+6f);
3404 assert(fabsf(negative_scale) >= 0x1.0p-8f);
3405
3406 const long positive_multiplier = lrintf(-256.0f * positive_scale);
3407 assert(positive_multiplier <= -1L);
3408 assert(positive_multiplier >= -32768L);
3409 const long negative_multiplier = lrintf(-256.0f * negative_scale);
3410 assert(negative_multiplier >= -32768L);
3411 assert(negative_multiplier <= 32767L);
3412 assert(negative_multiplier != 0L);
3413 params->neon.input_zero_point = (int16_t) input_zero_point;
3414 params->neon.positive_multiplier = (int16_t) positive_multiplier;
3415 params->neon.negative_multiplier = (int16_t) negative_multiplier;
3416 params->neon.output_zero_point = (int16_t) output_zero_point;
3417 return sizeof(params->neon);
3418}
3419#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
3420
3421#if XNN_ARCH_X86 || XNN_ARCH_X86_64
3422size_t xnn_init_qs8_lrelu_sse2_params(
3423 union xnn_qs8_lrelu_params params[XNN_MIN_ELEMENTS(1)],
3424 float positive_scale,
3425 float negative_scale,
3426 int8_t input_zero_point,
3427 int8_t output_zero_point)
3428{
3429 assert(positive_scale >= 0x1.0p-8f);
3430 assert(positive_scale <= 0x1.0p+7f);
3431 assert(negative_scale <= 0x1.0p+7f);
3432 assert(negative_scale >= -0x1.FFFC00p+6f);
3433 assert(fabsf(negative_scale) >= 0x1.0p-8f);
3434
3435 const long positive_multiplier = lrintf(-256.0f * positive_scale);
3436 assert(positive_multiplier <= -1L);
3437 assert(positive_multiplier >= -32768L);
3438 const long negative_multiplier = lrintf(-256.0f * negative_scale);
3439 assert(negative_multiplier >= -32768L);
3440 assert(negative_multiplier <= 32767L);
3441 assert(negative_multiplier != 0L);
3442 const int16_t multiplier_base = (int16_t) negative_multiplier;
3443 const int16_t multiplier_diff = (int16_t) positive_multiplier ^ (int16_t) negative_multiplier;
3444 for (uint32_t i = 0; i < 8; i++) {
3445 params->sse2.input_zero_point[i] = (int16_t) input_zero_point;
3446 params->sse2.multiplier_diff[i] = multiplier_diff;
3447 params->sse2.multiplier_base[i] = multiplier_base;
3448 params->sse2.output_zero_point[i] = (int16_t) output_zero_point;
3449 }
3450 return sizeof(params->sse2);
3451}
3452
3453size_t xnn_init_qs8_lrelu_avx_params(
3454 union xnn_qs8_lrelu_params params[XNN_MIN_ELEMENTS(1)],
3455 float positive_scale,
3456 float negative_scale,
3457 int8_t input_zero_point,
3458 int8_t output_zero_point)
3459{
3460 assert(positive_scale >= 0x1.0p-8f);
3461 assert(positive_scale <= 0x1.0p+7f);
3462 assert(negative_scale <= 0x1.0p+7f);
3463 assert(negative_scale >= -0x1.FFFC00p+6f);
3464 assert(fabsf(negative_scale) >= 0x1.0p-8f);
3465
3466 const long positive_multiplier = lrintf(-256.0f * positive_scale);
3467 assert(positive_multiplier <= -1L);
3468 assert(positive_multiplier >= -32768L);
3469 const long negative_multiplier = lrintf(-256.0f * negative_scale);
3470 assert(negative_multiplier >= -32768L);
3471 assert(negative_multiplier <= 32767L);
3472 assert(negative_multiplier != 0L);
3473 for (uint32_t i = 0; i < 8; i++) {
3474 params->avx.input_zero_point[i] = (int16_t) input_zero_point;
3475 params->avx.positive_multiplier[i] = (int16_t) positive_multiplier;
3476 params->avx.negative_multiplier[i] = (int16_t) negative_multiplier;
3477 params->avx.output_zero_point[i] = (int16_t) output_zero_point;
3478 }
3479 return sizeof(params->avx);
3480}
3481
3482size_t xnn_init_qs8_lrelu_avx2_params(
3483 union xnn_qs8_lrelu_params params[XNN_MIN_ELEMENTS(1)],
3484 float positive_scale,
3485 float negative_scale,
3486 int8_t input_zero_point,
3487 int8_t output_zero_point)
3488{
3489 assert(positive_scale >= 0x1.0p-8f);
3490 assert(positive_scale <= 0x1.0p+7f);
3491 assert(negative_scale <= 0x1.0p+7f);
3492 assert(negative_scale >= -0x1.FFFC00p+6f);
3493 assert(fabsf(negative_scale) >= 0x1.0p-8f);
3494
3495 const long positive_multiplier = lrintf(-256.0f * positive_scale);
3496 assert(positive_multiplier <= -1L);
3497 assert(positive_multiplier >= -32768L);
3498 const long negative_multiplier = lrintf(-256.0f * negative_scale);
3499 assert(negative_multiplier >= -32768L);
3500 assert(negative_multiplier <= 32767L);
3501 assert(negative_multiplier != 0L);
3502 for (uint32_t i = 0; i < 16; i++) {
3503 params->avx2.input_zero_point[i] = (int16_t) input_zero_point;
3504 params->avx2.positive_multiplier[i] = (int16_t) positive_multiplier;
3505 params->avx2.negative_multiplier[i] = (int16_t) negative_multiplier;
3506 params->avx2.output_zero_point[i] = (int16_t) output_zero_point;
3507 }
3508 return sizeof(params->avx2);
3509}
3510#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
3511
3512#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
3513size_t xnn_init_qs8_lrelu_wasmsimd_arm_params(
3514 union xnn_qs8_lrelu_params params[XNN_MIN_ELEMENTS(1)],
3515 float positive_scale,
3516 float negative_scale,
3517 int8_t input_zero_point,
3518 int8_t output_zero_point)
3519{
3520 assert(positive_scale >= 0x1.0p-8f);
3521 assert(positive_scale <= 0x1.0p+7f);
3522 assert(negative_scale <= 0x1.0p+7f);
3523 assert(negative_scale >= -0x1.FFFC00p+6f);
3524 assert(fabsf(negative_scale) >= 0x1.0p-8f);
3525
3526 const long positive_multiplier = lrintf(-256.0f * positive_scale);
3527 assert(positive_multiplier <= -1L);
3528 assert(positive_multiplier >= -32768L);
3529 const long negative_multiplier = lrintf(-256.0f * negative_scale);
3530 assert(negative_multiplier >= -32768L);
3531 assert(negative_multiplier <= 32767L);
3532 assert(negative_multiplier != 0L);
3533 for (uint32_t i = 0; i < 4; i++) {
3534 params->wasmsimd_arm.input_zero_point[i] = (int16_t) input_zero_point;
3535 params->wasmsimd_arm.positive_multiplier[i] = (int16_t) positive_multiplier;
3536 params->wasmsimd_arm.negative_multiplier[i] = (int16_t) negative_multiplier;
3537 params->wasmsimd_arm.output_zero_point[i] = (int16_t) output_zero_point;
3538 }
3539 return sizeof(params->wasmsimd_arm);
3540}
3541
3542size_t xnn_init_qs8_lrelu_wasmsimd_x86_params(
3543 union xnn_qs8_lrelu_params params[XNN_MIN_ELEMENTS(1)],
3544 float positive_scale,
3545 float negative_scale,
3546 int8_t input_zero_point,
3547 int8_t output_zero_point)
3548{
3549 assert(positive_scale >= 0x1.0p-8f);
3550 assert(positive_scale <= 0x1.0p+7f);
3551 assert(negative_scale <= 0x1.0p+7f);
3552 assert(negative_scale >= -0x1.FFFC00p+6f);
3553 assert(fabsf(negative_scale) >= 0x1.0p-8f);
3554
3555 const long positive_multiplier = lrintf(-256.0f * positive_scale);
3556 assert(positive_multiplier <= -1L);
3557 assert(positive_multiplier >= -32768L);
3558 const long negative_multiplier = lrintf(-256.0f * negative_scale);
3559 assert(negative_multiplier >= -32768L);
3560 assert(negative_multiplier <= 32767L);
3561 assert(negative_multiplier != 0L);
3562 const int16_t multiplier_base = (int16_t) negative_multiplier;
3563 const int16_t multiplier_diff = (int16_t) positive_multiplier ^ (int16_t) negative_multiplier;
3564 for (uint32_t i = 0; i < 4; i++) {
3565 params->wasmsimd_x86.input_zero_point[i] = (int16_t) input_zero_point;
3566 params->wasmsimd_x86.multiplier_diff[i] = multiplier_diff;
3567 params->wasmsimd_x86.multiplier_base[i] = multiplier_base;
3568 params->wasmsimd_x86.output_zero_point[i] = (int16_t) output_zero_point;
3569 }
3570 return sizeof(params->wasmsimd_x86);
3571}
3572#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
3573
3574size_t xnn_init_qu8_lrelu_scalar_select_params(
3575 union xnn_qu8_lrelu_params params[XNN_MIN_ELEMENTS(1)],
3576 float positive_scale,
3577 float negative_scale,
3578 uint8_t input_zero_point,
3579 uint8_t output_zero_point)
3580{
3581 assert(positive_scale >= 0x1.0p-8f);
3582 assert(positive_scale <= 0x1.0p+7f);
3583 assert(negative_scale <= 0x1.0p+7f);
3584 assert(negative_scale >= -0x1.FFFC00p+6f);
3585 assert(fabsf(negative_scale) >= 0x1.0p-8f);
3586
3587 const long positive_multiplier = lrintf(256.0f * positive_scale);
3588 assert(positive_multiplier >= 1L);
3589 assert(positive_multiplier <= 32768L);
3590 const long negative_multiplier = lrintf(256.0f * negative_scale);
3591 assert(negative_multiplier <= 32768L);
3592 assert(negative_multiplier >= -32767L);
3593 assert(negative_multiplier != 0L);
3594 params->scalar_select.input_zero_point = (int32_t) input_zero_point;
3595 params->scalar_select.positive_multiplier = (int32_t) positive_multiplier;
3596 params->scalar_select.negative_multiplier = (int32_t) negative_multiplier;
3597 params->scalar_select.bias = ((int32_t) output_zero_point << 8) + INT32_C(0x80);
3598 return sizeof(params->scalar_select);
3599}
3600
3601size_t xnn_init_qu8_lrelu_scalar_andxor_params(
3602 union xnn_qu8_lrelu_params params[XNN_MIN_ELEMENTS(1)],
3603 float positive_scale,
3604 float negative_scale,
3605 uint8_t input_zero_point,
3606 uint8_t output_zero_point)
3607{
3608 assert(positive_scale >= 0x1.0p-8f);
3609 assert(positive_scale <= 0x1.0p+7f);
3610 assert(negative_scale <= 0x1.0p+7f);
3611 assert(negative_scale >= -0x1.FFFC00p+6f);
3612 assert(fabsf(negative_scale) >= 0x1.0p-8f);
3613
3614 const long positive_multiplier = lrintf(256.0f * positive_scale);
3615 assert(positive_multiplier >= 1L);
3616 assert(positive_multiplier <= 32768L);
3617 const long negative_multiplier = lrintf(256.0f * negative_scale);
3618 assert(negative_multiplier <= 32768L);
3619 assert(negative_multiplier >= -32767L);
3620 assert(negative_multiplier != 0L);
3621 params->scalar_andxor.input_zero_point = (int32_t) input_zero_point;
3622 params->scalar_andxor.multiplier_base = (int32_t) positive_multiplier;
3623 params->scalar_andxor.multiplier_diff = (int32_t) negative_multiplier ^ (int32_t) positive_multiplier;
3624 params->scalar_andxor.bias = ((int32_t) output_zero_point << 8) + INT32_C(0x80);
3625 return sizeof(params->scalar_andxor);
3626}
3627
3628#if XNN_ARCH_ARM
3629size_t xnn_init_qu8_lrelu_armsimd32_params(
3630 union xnn_qu8_lrelu_params params[XNN_MIN_ELEMENTS(1)],
3631 float positive_scale,
3632 float negative_scale,
3633 uint8_t input_zero_point,
3634 uint8_t output_zero_point)
3635{
3636 assert(positive_scale >= 0x1.0p-8f);
3637 assert(positive_scale <= 0x1.0p+7f);
3638 assert(negative_scale <= 0x1.0p+7f);
3639 assert(negative_scale >= -0x1.FFFC00p+6f);
3640 assert(fabsf(negative_scale) >= 0x1.0p-8f);
3641
3642 const long positive_multiplier = lrintf(-256.0f * positive_scale);
3643 assert(positive_multiplier <= -1L);
3644 assert(positive_multiplier >= -32768L);
3645 const long negative_multiplier = lrintf(-256.0f * negative_scale);
3646 assert(negative_multiplier >= -32768L);
3647 assert(negative_multiplier <= 32767L);
3648 assert(negative_multiplier != 0L);
3649 params->armsimd32.input_zero_point = (uint32_t) input_zero_point * UINT32_C(0x00010001);
3650 params->armsimd32.positive_multiplier = (uint32_t) (uint16_t) (int16_t) positive_multiplier * UINT32_C(0x00010001);
3651 params->armsimd32.negative_multiplier = (uint32_t) (uint16_t) (int16_t) negative_multiplier * UINT32_C(0x00010001);
3652 params->armsimd32.bias = ((int32_t) output_zero_point << 8) + INT32_C(0x80);
3653 return sizeof(params->armsimd32);
3654}
3655#endif // XNN_ARCH_ARM
3656
3657#if XNN_ARCH_ARM || XNN_ARCH_ARM64
3658size_t xnn_init_qu8_lrelu_neon_params(
3659 union xnn_qu8_lrelu_params params[XNN_MIN_ELEMENTS(1)],
3660 float positive_scale,
3661 float negative_scale,
3662 uint8_t input_zero_point,
3663 uint8_t output_zero_point)
3664{
3665 assert(positive_scale >= 0x1.0p-8f);
3666 assert(positive_scale <= 0x1.0p+7f);
3667 assert(negative_scale <= 0x1.0p+7f);
3668 assert(negative_scale >= -0x1.FFFC00p+6f);
3669 assert(fabsf(negative_scale) >= 0x1.0p-8f);
3670
3671 const long positive_multiplier = lrintf(-256.0f * positive_scale);
3672 assert(positive_multiplier <= -1L);
3673 assert(positive_multiplier >= -32768L);
3674 const long negative_multiplier = lrintf(-256.0f * negative_scale);
3675 assert(negative_multiplier >= -32768L);
3676 assert(negative_multiplier <= 32767L);
3677 assert(negative_multiplier != 0L);
3678 params->neon.input_zero_point = (uint16_t) input_zero_point;
3679 params->neon.positive_multiplier = (int16_t) positive_multiplier;
3680 params->neon.negative_multiplier = (int16_t) negative_multiplier;
3681 params->neon.output_zero_point = (int16_t) output_zero_point;
3682 return sizeof(params->neon);
3683}
3684#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
3685
3686#if XNN_ARCH_X86 || XNN_ARCH_X86_64
3687size_t xnn_init_qu8_lrelu_sse2_params(
3688 union xnn_qu8_lrelu_params params[XNN_MIN_ELEMENTS(1)],
3689 float positive_scale,
3690 float negative_scale,
3691 uint8_t input_zero_point,
3692 uint8_t output_zero_point)
3693{
3694 assert(positive_scale >= 0x1.0p-8f);
3695 assert(positive_scale <= 0x1.0p+7f);
3696 assert(negative_scale <= 0x1.0p+7f);
3697 assert(negative_scale >= -0x1.FFFC00p+6f);
3698 assert(fabsf(negative_scale) >= 0x1.0p-8f);
3699
3700 const long positive_multiplier = lrintf(-256.0f * positive_scale);
3701 assert(positive_multiplier <= -1L);
3702 assert(positive_multiplier >= -32768L);
3703 const long negative_multiplier = lrintf(-256.0f * negative_scale);
3704 assert(negative_multiplier >= -32768L);
3705 assert(negative_multiplier <= 32767L);
3706 assert(negative_multiplier != 0L);
3707 const int16_t multiplier_base = (int16_t) negative_multiplier;
3708 const int16_t multiplier_diff = (int16_t) positive_multiplier ^ (int16_t) negative_multiplier;
3709 for (uint32_t i = 0; i < 8; i++) {
3710 params->sse2.input_zero_point[i] = (int16_t) (uint16_t) input_zero_point;
3711 params->sse2.multiplier_diff[i] = multiplier_diff;
3712 params->sse2.multiplier_base[i] = multiplier_base;
3713 params->sse2.output_zero_point[i] = (int16_t) (uint16_t) output_zero_point;
3714 }
3715 return sizeof(params->sse2);
3716}
3717
3718size_t xnn_init_qu8_lrelu_avx_params(
3719 union xnn_qu8_lrelu_params params[XNN_MIN_ELEMENTS(1)],
3720 float positive_scale,
3721 float negative_scale,
3722 uint8_t input_zero_point,
3723 uint8_t output_zero_point)
3724{
3725 assert(positive_scale >= 0x1.0p-8f);
3726 assert(positive_scale <= 0x1.0p+7f);
3727 assert(negative_scale <= 0x1.0p+7f);
3728 assert(negative_scale >= -0x1.FFFC00p+6f);
3729 assert(fabsf(negative_scale) >= 0x1.0p-8f);
3730
3731 const long positive_multiplier = lrintf(-256.0f * positive_scale);
3732 assert(positive_multiplier <= -1L);
3733 assert(positive_multiplier >= -32768L);
3734 const long negative_multiplier = lrintf(-256.0f * negative_scale);
3735 assert(negative_multiplier >= -32768L);
3736 assert(negative_multiplier <= 32767L);
3737 assert(negative_multiplier != 0L);
3738 for (uint32_t i = 0; i < 8; i++) {
3739 params->avx.input_zero_point[i] = (int16_t) (uint16_t) input_zero_point;
3740 params->avx.positive_multiplier[i] = (int16_t) positive_multiplier;
3741 params->avx.negative_multiplier[i] = (int16_t) negative_multiplier;
3742 params->avx.output_zero_point[i] = (int16_t) (uint16_t) output_zero_point;
3743 }
3744 return sizeof(params->avx);
3745}
3746
3747size_t xnn_init_qu8_lrelu_avx2_params(
3748 union xnn_qu8_lrelu_params params[XNN_MIN_ELEMENTS(1)],
3749 float positive_scale,
3750 float negative_scale,
3751 uint8_t input_zero_point,
3752 uint8_t output_zero_point)
3753{
3754 assert(positive_scale >= 0x1.0p-8f);
3755 assert(positive_scale <= 0x1.0p+7f);
3756 assert(negative_scale <= 0x1.0p+7f);
3757 assert(negative_scale >= -0x1.FFFC00p+6f);
3758 assert(fabsf(negative_scale) >= 0x1.0p-8f);
3759
3760 const long positive_multiplier = lrintf(-256.0f * positive_scale);
3761 assert(positive_multiplier <= -1L);
3762 assert(positive_multiplier >= -32768L);
3763 const long negative_multiplier = lrintf(-256.0f * negative_scale);
3764 assert(negative_multiplier >= -32768L);
3765 assert(negative_multiplier <= 32767L);
3766 assert(negative_multiplier != 0L);
3767 for (uint32_t i = 0; i < 16; i++) {
3768 params->avx2.input_zero_point[i] = (int16_t) (uint16_t) input_zero_point;
3769 params->avx2.positive_multiplier[i] = (int16_t) positive_multiplier;
3770 params->avx2.negative_multiplier[i] = (int16_t) negative_multiplier;
3771 params->avx2.output_zero_point[i] = (int16_t) (uint16_t) output_zero_point;
3772 }
3773 return sizeof(params->avx2);
3774}
3775#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
3776
3777#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
3778size_t xnn_init_qu8_lrelu_wasmsimd_arm_params(
3779 union xnn_qu8_lrelu_params params[XNN_MIN_ELEMENTS(1)],
3780 float positive_scale,
3781 float negative_scale,
3782 uint8_t input_zero_point,
3783 uint8_t output_zero_point)
3784{
3785 assert(positive_scale >= 0x1.0p-8f);
3786 assert(positive_scale <= 0x1.0p+7f);
3787 assert(negative_scale <= 0x1.0p+7f);
3788 assert(negative_scale >= -0x1.FFFC00p+6f);
3789 assert(fabsf(negative_scale) >= 0x1.0p-8f);
3790
3791 const long positive_multiplier = lrintf(-256.0f * positive_scale);
3792 assert(positive_multiplier <= -1L);
3793 assert(positive_multiplier >= -32768L);
3794 const long negative_multiplier = lrintf(-256.0f * negative_scale);
3795 assert(negative_multiplier >= -32768L);
3796 assert(negative_multiplier <= 32767L);
3797 assert(negative_multiplier != 0L);
3798 for (uint32_t i = 0; i < 4; i++) {
3799 params->wasmsimd_arm.input_zero_point[i] = (int16_t) (uint16_t) input_zero_point;
3800 params->wasmsimd_arm.positive_multiplier[i] = (int16_t) positive_multiplier;
3801 params->wasmsimd_arm.negative_multiplier[i] = (int16_t) negative_multiplier;
3802 params->wasmsimd_arm.output_zero_point[i] = (int16_t) (uint16_t) output_zero_point;
3803 }
3804 return sizeof(params->wasmsimd_arm);
3805}
3806
3807size_t xnn_init_qu8_lrelu_wasmsimd_x86_params(
3808 union xnn_qu8_lrelu_params params[XNN_MIN_ELEMENTS(1)],
3809 float positive_scale,
3810 float negative_scale,
3811 uint8_t input_zero_point,
3812 uint8_t output_zero_point)
3813{
3814 assert(positive_scale >= 0x1.0p-8f);
3815 assert(positive_scale <= 0x1.0p+7f);
3816 assert(negative_scale <= 0x1.0p+7f);
3817 assert(negative_scale >= -0x1.FFFC00p+6f);
3818 assert(fabsf(negative_scale) >= 0x1.0p-8f);
3819
3820 const long positive_multiplier = lrintf(-256.0f * positive_scale);
3821 assert(positive_multiplier <= -1L);
3822 assert(positive_multiplier >= -32768L);
3823 const long negative_multiplier = lrintf(-256.0f * negative_scale);
3824 assert(negative_multiplier >= -32768L);
3825 assert(negative_multiplier <= 32767L);
3826 assert(negative_multiplier != 0L);
3827 const int16_t multiplier_base = (int16_t) negative_multiplier;
3828 const int16_t multiplier_diff = (int16_t) positive_multiplier ^ (int16_t) negative_multiplier;
3829 for (uint32_t i = 0; i < 4; i++) {
3830 params->wasmsimd_x86.input_zero_point[i] = (int16_t) (uint16_t) input_zero_point;
3831 params->wasmsimd_x86.multiplier_diff[i] = multiplier_diff;
3832 params->wasmsimd_x86.multiplier_base[i] = multiplier_base;
3833 params->wasmsimd_x86.output_zero_point[i] = (int16_t) (uint16_t) output_zero_point;
3834 }
3835 return sizeof(params->wasmsimd_x86);
3836}
3837#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
3838
3839#if XNN_ARCH_X86 || XNN_ARCH_X86_64
3840size_t xnn_init_f32_sqrt_avx_params(
3841 union xnn_f32_sqrt_params params[XNN_MIN_ELEMENTS(1)])
3842{
3843 for (uint32_t i = 0; i < 7; i++) {
3844 params->avx.mask_table[i] = -1;
3845 }
3846 for (uint32_t i = 7; i < 14; i++) {
3847 params->avx.mask_table[i] = 0;
3848 }
3849 return sizeof(params->avx);
3850}
3851
3852size_t xnn_init_f32_sqrt_fma_params(
3853 union xnn_f32_sqrt_params params[XNN_MIN_ELEMENTS(1)])
3854{
3855 for (uint32_t i = 0; i < 8; i++) {
3856 params->fma.half[i] = 0.5f;
3857 }
3858 for (uint32_t i = 0; i < 7; i++) {
3859 params->fma.mask_table[i] = -1;
3860 }
3861 for (uint32_t i = 7; i < 14; i++) {
3862 params->fma.mask_table[i] = 0;
3863 }
3864 return sizeof(params->fma);
3865}
3866
3867size_t xnn_init_f32_sqrt_avx512_params(
3868 union xnn_f32_sqrt_params params[XNN_MIN_ELEMENTS(1)])
3869{
3870 params->avx512.half = 0.5f;
3871 return sizeof(params->avx512);
3872}
3873#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
3874
3875#if XNN_ARCH_ARM || XNN_ARCH_ARM64
3876size_t xnn_init_f16_chw_neonfp16arith_stride1_params(
3877 union xnn_f16_chw_params params[XNN_MIN_ELEMENTS(1)],
3878 uint32_t width,
3879 uint16_t output_min,
3880 uint16_t output_max)
3881{
3882 params->neonfp16arith_stride1.min = output_min;
3883 params->neonfp16arith_stride1.max = output_max;
3884
3885 const uint32_t w8 = (width - 1) & 7;
3886 params->neonfp16arith_stride1.mask[0] = UINT16_C(0xFFFF);
3887 params->neonfp16arith_stride1.mask[1] = -(uint16_t) (w8 >= 1);
3888 params->neonfp16arith_stride1.mask[2] = -(uint16_t) (w8 >= 2);
3889 params->neonfp16arith_stride1.mask[3] = -(uint16_t) (w8 >= 3);
3890 params->neonfp16arith_stride1.mask[4] = -(uint16_t) (w8 >= 4);
3891 params->neonfp16arith_stride1.mask[5] = -(uint16_t) (w8 >= 5);
3892 params->neonfp16arith_stride1.mask[6] = -(uint16_t) (w8 >= 6);
3893 params->neonfp16arith_stride1.mask[7] = -(uint16_t) (w8 >= 7);
3894
3895 return sizeof(params->neonfp16arith_stride1);
3896}
3897
3898size_t xnn_init_f16_chw_neonfp16arith_stride2_params(
3899 union xnn_f16_chw_params params[XNN_MIN_ELEMENTS(1)],
3900 uint32_t width,
3901 uint16_t output_min,
3902 uint16_t output_max)
3903{
3904 params->neonfp16arith_stride1.min = output_min;
3905 params->neonfp16arith_stride1.max = output_max;
3906
3907 const uint32_t w16 = (width - 1) & 15;
3908 params->neonfp16arith_stride2.mask_even[0] = UINT16_C(0xFFFF);
3909 params->neonfp16arith_stride2.mask_even[1] = -(uint16_t) (w16 >= 2);
3910 params->neonfp16arith_stride2.mask_even[2] = -(uint16_t) (w16 >= 4);
3911 params->neonfp16arith_stride2.mask_even[3] = -(uint16_t) (w16 >= 6);
3912 params->neonfp16arith_stride2.mask_even[4] = -(uint16_t) (w16 >= 8);
3913 params->neonfp16arith_stride2.mask_even[5] = -(uint16_t) (w16 >= 10);
3914 params->neonfp16arith_stride2.mask_even[6] = -(uint16_t) (w16 >= 12);
3915 params->neonfp16arith_stride2.mask_even[7] = -(uint16_t) (w16 >= 14);
3916 params->neonfp16arith_stride2.mask_odd[0] = -(uint16_t) (w16 >= 1);
3917 params->neonfp16arith_stride2.mask_odd[1] = -(uint16_t) (w16 >= 3);
3918 params->neonfp16arith_stride2.mask_odd[2] = -(uint16_t) (w16 >= 5);
3919 params->neonfp16arith_stride2.mask_odd[3] = -(uint16_t) (w16 >= 7);
3920 params->neonfp16arith_stride2.mask_odd[4] = -(uint16_t) (w16 >= 9);
3921 params->neonfp16arith_stride2.mask_odd[5] = -(uint16_t) (w16 >= 11);
3922 params->neonfp16arith_stride2.mask_odd[6] = -(uint16_t) (w16 >= 13);
3923 params->neonfp16arith_stride2.mask_odd[7] = -(uint16_t) (w16 >= 15);
3924
3925 return sizeof(params->neonfp16arith_stride2);
3926}
3927#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
3928
3929size_t xnn_init_f32_chw_scalar_params(
3930 union xnn_f32_chw_params params[XNN_MIN_ELEMENTS(1)],
3931 uint32_t width,
3932 float output_min,
3933 float output_max)
3934{
3935 params->scalar.min = output_min;
3936 params->scalar.max = output_max;
3937 return sizeof(params->scalar);
3938}
3939
3940#if XNN_ARCH_ARM || XNN_ARCH_ARM64
3941size_t xnn_init_f32_chw_neon_stride1_params(
3942 union xnn_f32_chw_params params[XNN_MIN_ELEMENTS(1)],
3943 uint32_t width,
3944 float output_min,
3945 float output_max)
3946{
3947 params->neon_stride1.min = output_min;
3948 params->neon_stride1.max = output_max;
3949
3950 const uint32_t w4 = (width - 1) & 3;
3951 params->neon_stride1.mask[0] = UINT32_C(0xFFFFFFFF);
3952 params->neon_stride1.mask[1] = -(uint32_t) (w4 >= 1);
3953 params->neon_stride1.mask[2] = -(uint32_t) (w4 >= 2);
3954 params->neon_stride1.mask[3] = -(uint32_t) (w4 >= 3);
3955
3956 return sizeof(params->neon_stride1);
3957}
3958
3959size_t xnn_init_f32_chw_neon_stride2_params(
3960 union xnn_f32_chw_params params[XNN_MIN_ELEMENTS(1)],
3961 uint32_t width,
3962 float output_min,
3963 float output_max)
3964{
3965 params->neon_stride2.min = output_min;
3966 params->neon_stride2.max = output_max;
3967
3968 const uint32_t w8 = (width - 1) & 7;
3969 params->neon_stride2.mask_even[0] = UINT32_C(0xFFFFFFFF);
3970 params->neon_stride2.mask_even[1] = -(uint32_t) (w8 >= 2);
3971 params->neon_stride2.mask_even[2] = -(uint32_t) (w8 >= 4);
3972 params->neon_stride2.mask_even[3] = -(uint32_t) (w8 >= 6);
3973 params->neon_stride2.mask_odd[0] = -(uint32_t) (w8 >= 1);
3974 params->neon_stride2.mask_odd[1] = -(uint32_t) (w8 >= 3);
3975 params->neon_stride2.mask_odd[2] = -(uint32_t) (w8 >= 5);
3976 params->neon_stride2.mask_odd[3] = -(uint32_t) (w8 >= 7);
3977
3978 return sizeof(params->neon_stride2);
3979}
3980#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
3981
3982#if XNN_ARCH_X86 || XNN_ARCH_X86_64
3983size_t xnn_init_f32_chw_sse_stride1_params(
3984 union xnn_f32_chw_params params[XNN_MIN_ELEMENTS(1)],
3985 uint32_t width,
3986 float output_min,
3987 float output_max)
3988{
3989 for (uint32_t i = 0; i < 4; i++) {
3990 params->sse_stride1.min[i] = output_min;
3991 params->sse_stride1.max[i] = output_max;
3992 }
3993
3994 const uint32_t w4 = (width - 1) & 3;
3995 params->sse_stride1.mask[0] = UINT32_C(0xFFFFFFFF);
3996 params->sse_stride1.mask[1] = -(uint32_t) (w4 >= 1);
3997 params->sse_stride1.mask[2] = -(uint32_t) (w4 >= 2);
3998 params->sse_stride1.mask[3] = -(uint32_t) (w4 >= 3);
3999
4000 return sizeof(params->sse_stride1);
4001}
4002
4003size_t xnn_init_f32_chw_sse_stride2_params(
4004 union xnn_f32_chw_params params[XNN_MIN_ELEMENTS(1)],
4005 uint32_t width,
4006 float output_min,
4007 float output_max)
4008{
4009 for (uint32_t i = 0; i < 4; i++) {
4010 params->sse_stride2.min[i] = output_min;
4011 params->sse_stride2.max[i] = output_max;
4012 }
4013
4014 const uint32_t w8 = (width - 1) & 7;
4015 params->sse_stride2.mask_even[0] = UINT32_C(0xFFFFFFFF);
4016 params->sse_stride2.mask_even[1] = -(uint32_t) (w8 >= 2);
4017 params->sse_stride2.mask_even[2] = -(uint32_t) (w8 >= 4);
4018 params->sse_stride2.mask_even[3] = -(uint32_t) (w8 >= 6);
4019 params->sse_stride2.mask_odd[0] = -(uint32_t) (w8 >= 1);
4020 params->sse_stride2.mask_odd[1] = -(uint32_t) (w8 >= 3);
4021 params->sse_stride2.mask_odd[2] = -(uint32_t) (w8 >= 5);
4022 params->sse_stride2.mask_odd[3] = -(uint32_t) (w8 >= 7);
4023
4024 return sizeof(params->sse_stride2);
4025}
4026#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
4027
4028#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
4029size_t xnn_init_f32_chw_wasmsimd_stride1_params(
4030 union xnn_f32_chw_params params[XNN_MIN_ELEMENTS(1)],
4031 uint32_t width,
4032 float output_min,
4033 float output_max)
4034{
4035 params->wasmsimd_stride1.min[0] = output_min;
4036 params->wasmsimd_stride1.min[1] = output_min;
4037 params->wasmsimd_stride1.max[0] = output_max;
4038 params->wasmsimd_stride1.max[1] = output_max;
4039
4040 const uint32_t w4 = (width - 1) & 3;
4041 params->wasmsimd_stride1.mask[0] = UINT32_C(0xFFFFFFFF);
4042 params->wasmsimd_stride1.mask[1] = -(uint32_t) (w4 >= 1);
4043 params->wasmsimd_stride1.mask[2] = -(uint32_t) (w4 >= 2);
4044 params->wasmsimd_stride1.mask[3] = -(uint32_t) (w4 >= 3);
4045
4046 return sizeof(params->wasmsimd_stride1);
4047}
4048
4049size_t xnn_init_f32_chw_wasmsimd_stride2_params(
4050 union xnn_f32_chw_params params[XNN_MIN_ELEMENTS(1)],
4051 uint32_t width,
4052 float output_min,
4053 float output_max)
4054{
4055 params->wasmsimd_stride2.min[0] = output_min;
4056 params->wasmsimd_stride2.min[1] = output_min;
4057 params->wasmsimd_stride2.max[0] = output_max;
4058 params->wasmsimd_stride2.max[1] = output_max;
4059
4060 const uint32_t w8 = (width - 1) & 7;
4061 params->wasmsimd_stride2.mask_even[0] = UINT32_C(0xFFFFFFFF);
4062 params->wasmsimd_stride2.mask_even[1] = -(uint32_t) (w8 >= 2);
4063 params->wasmsimd_stride2.mask_even[2] = -(uint32_t) (w8 >= 4);
4064 params->wasmsimd_stride2.mask_even[3] = -(uint32_t) (w8 >= 6);
4065 params->wasmsimd_stride2.mask_odd[0] = -(uint32_t) (w8 >= 1);
4066 params->wasmsimd_stride2.mask_odd[1] = -(uint32_t) (w8 >= 3);
4067 params->wasmsimd_stride2.mask_odd[2] = -(uint32_t) (w8 >= 5);
4068 params->wasmsimd_stride2.mask_odd[3] = -(uint32_t) (w8 >= 7);
4069
4070 return sizeof(params->wasmsimd_stride2);
4071}
4072#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
4073
4074#if XNN_ARCH_ARM || XNN_ARCH_ARM64
4075void xnn_update_f16_chw_neonfp16arith_stride1_params(
4076 union xnn_f16_chw_params* params,
4077 uint32_t width)
4078{
4079 const uint32_t w8 = (width - 1) & 7;
4080 params->neonfp16arith_stride1.mask[0] = UINT16_C(0xFFFF);
4081 params->neonfp16arith_stride1.mask[1] = -(uint16_t) (w8 >= 1);
4082 params->neonfp16arith_stride1.mask[2] = -(uint16_t) (w8 >= 2);
4083 params->neonfp16arith_stride1.mask[3] = -(uint16_t) (w8 >= 3);
4084 params->neonfp16arith_stride1.mask[4] = -(uint16_t) (w8 >= 4);
4085 params->neonfp16arith_stride1.mask[5] = -(uint16_t) (w8 >= 5);
4086 params->neonfp16arith_stride1.mask[6] = -(uint16_t) (w8 >= 6);
4087 params->neonfp16arith_stride1.mask[7] = -(uint16_t) (w8 >= 7);
4088}
4089
4090void xnn_update_f16_chw_neonfp16arith_stride2_params(
4091 union xnn_f16_chw_params* params,
4092 uint32_t width)
4093{
4094 const uint32_t w16 = (width - 1) & 15;
4095 params->neonfp16arith_stride2.mask_even[0] = UINT16_C(0xFFFF);
4096 params->neonfp16arith_stride2.mask_even[1] = -(uint16_t) (w16 >= 2);
4097 params->neonfp16arith_stride2.mask_even[2] = -(uint16_t) (w16 >= 4);
4098 params->neonfp16arith_stride2.mask_even[3] = -(uint16_t) (w16 >= 6);
4099 params->neonfp16arith_stride2.mask_even[4] = -(uint16_t) (w16 >= 8);
4100 params->neonfp16arith_stride2.mask_even[5] = -(uint16_t) (w16 >= 10);
4101 params->neonfp16arith_stride2.mask_even[6] = -(uint16_t) (w16 >= 12);
4102 params->neonfp16arith_stride2.mask_even[7] = -(uint16_t) (w16 >= 14);
4103 params->neonfp16arith_stride2.mask_odd[0] = -(uint16_t) (w16 >= 1);
4104 params->neonfp16arith_stride2.mask_odd[1] = -(uint16_t) (w16 >= 3);
4105 params->neonfp16arith_stride2.mask_odd[2] = -(uint16_t) (w16 >= 5);
4106 params->neonfp16arith_stride2.mask_odd[3] = -(uint16_t) (w16 >= 7);
4107 params->neonfp16arith_stride2.mask_odd[4] = -(uint16_t) (w16 >= 9);
4108 params->neonfp16arith_stride2.mask_odd[5] = -(uint16_t) (w16 >= 11);
4109 params->neonfp16arith_stride2.mask_odd[6] = -(uint16_t) (w16 >= 13);
4110 params->neonfp16arith_stride2.mask_odd[7] = -(uint16_t) (w16 >= 15);
4111}
4112#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
4113
4114#if XNN_ARCH_ARM || XNN_ARCH_ARM64
4115void xnn_update_f32_chw_neon_stride1_params(
4116 union xnn_f32_chw_params* params,
4117 uint32_t width)
4118{
4119 const uint32_t w4 = (width - 1) & 3;
4120 params->neon_stride1.mask[0] = UINT32_C(0xFFFFFFFF);
4121 params->neon_stride1.mask[1] = -(uint32_t) (w4 >= 1);
4122 params->neon_stride1.mask[2] = -(uint32_t) (w4 >= 2);
4123 params->neon_stride1.mask[3] = -(uint32_t) (w4 >= 3);
4124}
4125
4126void xnn_update_f32_chw_neon_stride2_params(
4127 union xnn_f32_chw_params* params,
4128 uint32_t width)
4129{
4130 const uint32_t w8 = (width - 1) & 7;
4131 params->neon_stride2.mask_even[0] = UINT32_C(0xFFFFFFFF);
4132 params->neon_stride2.mask_even[1] = -(uint32_t) (w8 >= 2);
4133 params->neon_stride2.mask_even[2] = -(uint32_t) (w8 >= 4);
4134 params->neon_stride2.mask_even[3] = -(uint32_t) (w8 >= 6);
4135 params->neon_stride2.mask_odd[0] = -(uint32_t) (w8 >= 1);
4136 params->neon_stride2.mask_odd[1] = -(uint32_t) (w8 >= 3);
4137 params->neon_stride2.mask_odd[2] = -(uint32_t) (w8 >= 5);
4138 params->neon_stride2.mask_odd[3] = -(uint32_t) (w8 >= 7);
4139}
4140#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
4141
4142#if XNN_ARCH_X86 || XNN_ARCH_X86_64
4143void xnn_update_f32_chw_sse_stride1_params(
4144 union xnn_f32_chw_params* params,
4145 uint32_t width)
4146{
4147 const uint32_t w4 = (width - 1) & 3;
4148 params->sse_stride1.mask[0] = UINT32_C(0xFFFFFFFF);
4149 params->sse_stride1.mask[1] = -(uint32_t) (w4 >= 1);
4150 params->sse_stride1.mask[2] = -(uint32_t) (w4 >= 2);
4151 params->sse_stride1.mask[3] = -(uint32_t) (w4 >= 3);
4152}
4153
4154void xnn_update_f32_chw_sse_stride2_params(
4155 union xnn_f32_chw_params* params,
4156 uint32_t width)
4157{
4158 const uint32_t w8 = (width - 1) & 7;
4159 params->sse_stride2.mask_even[0] = UINT32_C(0xFFFFFFFF);
4160 params->sse_stride2.mask_even[1] = -(uint32_t) (w8 >= 2);
4161 params->sse_stride2.mask_even[2] = -(uint32_t) (w8 >= 4);
4162 params->sse_stride2.mask_even[3] = -(uint32_t) (w8 >= 6);
4163 params->sse_stride2.mask_odd[0] = -(uint32_t) (w8 >= 1);
4164 params->sse_stride2.mask_odd[1] = -(uint32_t) (w8 >= 3);
4165 params->sse_stride2.mask_odd[2] = -(uint32_t) (w8 >= 5);
4166 params->sse_stride2.mask_odd[3] = -(uint32_t) (w8 >= 7);
4167}
4168#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
4169
4170#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
4171void xnn_update_f32_chw_wasmsimd_stride1_params(
4172 union xnn_f32_chw_params* params,
4173 uint32_t width)
4174{
4175 const uint32_t w4 = (width - 1) & 3;
4176 params->wasmsimd_stride1.mask[0] = UINT32_C(0xFFFFFFFF);
4177 params->wasmsimd_stride1.mask[1] = -(uint32_t) (w4 >= 1);
4178 params->wasmsimd_stride1.mask[2] = -(uint32_t) (w4 >= 2);
4179 params->wasmsimd_stride1.mask[3] = -(uint32_t) (w4 >= 3);
4180}
4181
4182void xnn_update_f32_chw_wasmsimd_stride2_params(
4183 union xnn_f32_chw_params* params,
4184 uint32_t width)
4185{
4186 const uint32_t w8 = (width - 1) & 7;
4187 params->wasmsimd_stride2.mask_even[0] = UINT32_C(0xFFFFFFFF);
4188 params->wasmsimd_stride2.mask_even[1] = -(uint32_t) (w8 >= 2);
4189 params->wasmsimd_stride2.mask_even[2] = -(uint32_t) (w8 >= 4);
4190 params->wasmsimd_stride2.mask_even[3] = -(uint32_t) (w8 >= 6);
4191 params->wasmsimd_stride2.mask_odd[0] = -(uint32_t) (w8 >= 1);
4192 params->wasmsimd_stride2.mask_odd[1] = -(uint32_t) (w8 >= 3);
4193 params->wasmsimd_stride2.mask_odd[2] = -(uint32_t) (w8 >= 5);
4194 params->wasmsimd_stride2.mask_odd[3] = -(uint32_t) (w8 >= 7);
4195}
4196#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
4197
4198#if XNN_ARCH_X86 || XNN_ARCH_X86_64
4199size_t xnn_init_s8_minmax_sse2_params(
4200 union xnn_s8_minmax_params params[XNN_MIN_ELEMENTS(1)],
4201 int8_t output_min,
4202 int8_t output_max)
4203{
4204 assert(output_min < output_max);
4205
4206 const uint8_t output_min_with_bias = UINT8_C(0x80) ^ (uint8_t) output_min;
4207 const uint8_t output_max_with_bias = UINT8_C(0x80) ^ (uint8_t) output_max;
4208 for (uint32_t i = 0; i < 16; i++) {
4209 params->sse2.bias[i] = UINT8_C(0x80);
4210 params->sse2.min_with_bias[i] = output_min_with_bias;
4211 params->sse2.max_with_bias[i] = output_max_with_bias;
4212 }
4213 return sizeof(params->sse2);
4214}
4215
4216size_t xnn_init_s8_minmax_sse4_params(
4217 union xnn_s8_minmax_params params[XNN_MIN_ELEMENTS(1)],
4218 int8_t output_min,
4219 int8_t output_max)
4220{
4221 assert(output_min < output_max);
4222
4223 for (uint32_t i = 0; i < 16; i++) {
4224 params->sse4.min[i] = output_min;
4225 params->sse4.max[i] = output_max;
4226 }
4227 return sizeof(params->sse4);
4228}
4229#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
4230
4231#if XNN_ARCH_ARM || XNN_ARCH_ARM64
4232size_t xnn_init_s8_minmax_neon_params(
4233 union xnn_s8_minmax_params params[XNN_MIN_ELEMENTS(1)],
4234 int8_t output_min,
4235 int8_t output_max)
4236{
4237 assert(output_min < output_max);
4238
4239 params->neon.min = output_min;
4240 params->neon.max = output_max;
4241 return sizeof(params->neon);
4242}
4243#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
4244
4245#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
4246size_t xnn_init_s8_minmax_wasmsimd_params(
4247 union xnn_s8_minmax_params params[XNN_MIN_ELEMENTS(1)],
4248 int8_t output_min,
4249 int8_t output_max)
4250{
4251 assert(output_min < output_max);
4252
4253 for (uint32_t i = 0; i < 8; i++) {
4254 params->wasmsimd.min[i] = output_min;
4255 params->wasmsimd.max[i] = output_max;
4256 }
4257 return sizeof(params->wasmsimd);
4258}
4259#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
4260
4261size_t xnn_init_s8_minmax_scalar_params(
4262 union xnn_s8_minmax_params params[XNN_MIN_ELEMENTS(1)],
4263 int8_t output_min,
4264 int8_t output_max)
4265{
4266 assert(output_min < output_max);
4267
4268 params->scalar.min = (int32_t) output_min;
4269 params->scalar.max = (int32_t) output_max;
4270 return sizeof(params->scalar);
4271}
4272
4273size_t xnn_init_u8_minmax_params(
4274 union xnn_u8_minmax_params params[XNN_MIN_ELEMENTS(1)],
4275 uint8_t output_min,
4276 uint8_t output_max)
4277{
4278 assert(output_min < output_max);
4279
4280 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
4281 for (uint32_t i = 0; i < 16; i++) {
4282 params->sse2.min[i] = output_min;
4283 params->sse2.max[i] = output_max;
4284 }
4285 return sizeof(params->sse2);
4286 #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
4287 params->neon.min = output_min;
4288 params->neon.max = output_max;
4289 return sizeof(params->neon);
4290 #else
4291 params->scalar.min = (uint32_t) output_min;
4292 params->scalar.max = (uint32_t) output_max;
4293 return sizeof(params->scalar);
4294 #endif
4295}
4296
4297#if XNN_ARCH_X86 || XNN_ARCH_X86_64
4298size_t xnn_init_u8_minmax_sse2_params(
4299 union xnn_u8_minmax_params params[XNN_MIN_ELEMENTS(1)],
4300 uint8_t output_min,
4301 uint8_t output_max)
4302{
4303 assert(output_min < output_max);
4304
4305 for (uint32_t i = 0; i < 16; i++) {
4306 params->sse2.min[i] = output_min;
4307 params->sse2.max[i] = output_max;
4308 }
4309 return sizeof(params->sse2);
4310}
4311#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
4312
4313#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
4314size_t xnn_init_u8_minmax_wasmsimd_params(
4315 union xnn_u8_minmax_params params[XNN_MIN_ELEMENTS(1)],
4316 uint8_t output_min,
4317 uint8_t output_max)
4318{
4319 assert(output_min < output_max);
4320
4321 for (uint32_t i = 0; i < 8; i++) {
4322 params->wasmsimd.min[i] = output_min;
4323 params->wasmsimd.max[i] = output_max;
4324 }
4325 return sizeof(params->wasmsimd);
4326}
4327#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
4328
4329#if XNN_ARCH_ARM || XNN_ARCH_ARM64
4330size_t xnn_init_u8_minmax_neon_params(
4331 union xnn_u8_minmax_params params[XNN_MIN_ELEMENTS(1)],
4332 uint8_t output_min,
4333 uint8_t output_max)
4334{
4335 assert(output_min < output_max);
4336
4337 params->neon.min = output_min;
4338 params->neon.max = output_max;
4339 return sizeof(params->neon);
4340}
4341#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
4342
4343size_t xnn_init_u8_minmax_scalar_params(
4344 union xnn_u8_minmax_params params[XNN_MIN_ELEMENTS(1)],
4345 uint8_t output_min,
4346 uint8_t output_max)
4347{
4348 assert(output_min < output_max);
4349
4350 params->scalar.min = (uint32_t) output_min;
4351 params->scalar.max = (uint32_t) output_max;
4352 return sizeof(params->scalar);
4353}
4354
4355#if XNN_ARCH_X86 || XNN_ARCH_X86_64
4356size_t xnn_init_qu8_add_minmax_sse2_params(
4357 union xnn_qu8_add_minmax_params params[XNN_MIN_ELEMENTS(1)],
4358 uint8_t a_zero_point,
4359 uint8_t b_zero_point,
4360 uint8_t output_zero_point,
4361 float a_output_scale,
4362 float b_output_scale,
4363 uint8_t output_min,
4364 uint8_t output_max)
4365{
4366 const float abs_a_output_scale = fabsf(a_output_scale);
4367 const float abs_b_output_scale = fabsf(b_output_scale);
4368 assert(abs_a_output_scale >= 0x1.0p-10f);
4369 assert(abs_b_output_scale >= 0x1.0p-10f);
4370 assert(abs_a_output_scale < 0x1.0p+8f);
4371 assert(abs_b_output_scale < 0x1.0p+8f);
4372
4373 // Compute requantization parameters.
4374 const float max_abs_output_scale = math_max_f32(abs_a_output_scale, abs_b_output_scale);
4375 assert(max_abs_output_scale >= 0x1.0p-10f);
4376 assert(max_abs_output_scale < 0x1.0p+8f);
4377 const uint32_t max_scale_bits = float_as_uint32(max_abs_output_scale);
4378 const int32_t max_scale_exponent = (int32_t) (max_scale_bits >> 23) - 127;
4379
4380 // Shift is in [12, 30] range.
4381 const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - max_scale_exponent);
4382 assert(shift <= 30);
4383 assert(shift >= 12);
4384
4385 // Multipliers are in [0, 2**21) range, largest multiplier is in [2**20, 2**21) range.
4386 const int32_t abs_a_multiplier = (int32_t) lrintf(uint32_as_float(float_as_uint32(abs_a_output_scale) + (shift << 23)));
4387 const int32_t abs_b_multiplier = (int32_t) lrintf(uint32_as_float(float_as_uint32(abs_b_output_scale) + (shift << 23)));
4388 assert(math_max_s32(abs_a_multiplier, abs_b_multiplier) >= INT32_C(0x00100000));
4389 assert(abs_a_multiplier <= INT32_C(0x00200000));
4390 assert(abs_b_multiplier <= INT32_C(0x00200000));
4391
4392 const int32_t a_multiplier = signbit(a_output_scale) ? -abs_a_multiplier : abs_a_multiplier;
4393 const int32_t b_multiplier = signbit(b_output_scale) ? -abs_b_multiplier : abs_b_multiplier;
4394
4395 const int32_t rounding = INT32_C(1) << (shift - 1);
4396 const int32_t bias = rounding - a_multiplier * (int32_t) a_zero_point - b_multiplier * (int32_t) b_zero_point;
4397 for (uint32_t i = 0; i < 4; i++) {
4398 params->sse2.bias[i] = bias;
4399 }
4400 const uint16_t a_multiplier_lo = (uint16_t) a_multiplier;
4401 const uint16_t a_multiplier_hi = (uint16_t) ((uint32_t) a_multiplier >> 16);
4402 const uint16_t b_multiplier_lo = (uint16_t) b_multiplier;
4403 const uint16_t b_multiplier_hi = (uint16_t) ((uint32_t) b_multiplier >> 16);
4404 for (uint32_t i = 0; i < 8; i++) {
4405 params->sse2.a_multiplier_lo[i] = a_multiplier_lo;
4406 params->sse2.a_multiplier_hi[i] = a_multiplier_hi;
4407 params->sse2.b_multiplier_lo[i] = b_multiplier_lo;
4408 params->sse2.b_multiplier_hi[i] = b_multiplier_hi;
4409 }
4410 params->sse2.shift = shift;
4411 params->sse2.b_multiplier = (uint32_t) b_multiplier;
4412 for (uint32_t i = 0; i < 8; i++) {
4413 params->sse2.output_zero_point[i] = (int16_t) (uint16_t) output_zero_point;
4414 }
4415 for (uint32_t i = 0; i < 16; i++) {
4416 params->sse2.output_min[i] = output_min;
4417 params->sse2.output_max[i] = output_max;
4418 }
4419 return sizeof(params->sse2);
4420}
4421
4422size_t xnn_init_qu8_add_minmax_sse4_params(
4423 union xnn_qu8_add_minmax_params params[XNN_MIN_ELEMENTS(1)],
4424 uint8_t a_zero_point,
4425 uint8_t b_zero_point,
4426 uint8_t output_zero_point,
4427 float a_output_scale,
4428 float b_output_scale,
4429 uint8_t output_min,
4430 uint8_t output_max)
4431{
4432 const float abs_a_output_scale = fabsf(a_output_scale);
4433 const float abs_b_output_scale = fabsf(b_output_scale);
4434 assert(abs_a_output_scale >= 0x1.0p-10f);
4435 assert(abs_b_output_scale >= 0x1.0p-10f);
4436 assert(abs_a_output_scale < 0x1.0p+8f);
4437 assert(abs_b_output_scale < 0x1.0p+8f);
4438
4439 // Compute requantization parameters.
4440 const float max_abs_output_scale = math_max_f32(abs_a_output_scale, abs_b_output_scale);
4441 assert(max_abs_output_scale >= 0x1.0p-10f);
4442 assert(max_abs_output_scale < 0x1.0p+8f);
4443 const uint32_t max_scale_bits = float_as_uint32(max_abs_output_scale);
4444 const int32_t max_scale_exponent = (int32_t) (max_scale_bits >> 23) - 127;
4445
4446 // Shift is in [12, 30] range.
4447 const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - max_scale_exponent);
4448 assert(shift <= 30);
4449 assert(shift >= 12);
4450
4451 // Multipliers are in [0, 2**21) range, largest multiplier is in [2**20, 2**21) range.
4452 const int32_t abs_a_multiplier = (int32_t) lrintf(uint32_as_float(float_as_uint32(abs_a_output_scale) + (shift << 23)));
4453 const int32_t abs_b_multiplier = (int32_t) lrintf(uint32_as_float(float_as_uint32(abs_b_output_scale) + (shift << 23)));
4454 assert(math_max_s32(abs_a_multiplier, abs_b_multiplier) >= INT32_C(0x00100000));
4455 assert(abs_a_multiplier <= INT32_C(0x00200000));
4456 assert(abs_b_multiplier <= INT32_C(0x00200000));
4457
4458 const int32_t a_multiplier = signbit(a_output_scale) ? -abs_a_multiplier : abs_a_multiplier;
4459 const int32_t b_multiplier = signbit(b_output_scale) ? -abs_b_multiplier : abs_b_multiplier;
4460
4461 const int32_t rounding = INT32_C(1) << (shift - 1);
4462 const int32_t bias = rounding - a_multiplier * (int32_t) (uint32_t) a_zero_point - b_multiplier * (int32_t) (uint32_t) b_zero_point;
4463 for (uint32_t i = 0; i < 4; i++) {
4464 params->sse4.bias[i] = bias;
4465 params->sse4.a_multiplier[i] = a_multiplier;
4466 params->sse4.b_multiplier[i] = b_multiplier;
4467 }
4468 for (uint32_t i = 0; i < 2; i++) {
4469 params->sse4.shift[i] = (uint64_t) shift;
4470 }
4471 for (uint32_t i = 0; i < 8; i++) {
4472 params->sse4.output_zero_point[i] = (int16_t) (uint16_t) output_zero_point;
4473 }
4474 for (uint32_t i = 0; i < 16; i++) {
4475 params->sse4.output_min[i] = output_min;
4476 params->sse4.output_max[i] = output_max;
4477 }
4478 return sizeof(params->sse4);
4479}
4480
4481size_t xnn_init_qu8_add_minmax_avx2_params(
4482 union xnn_qu8_add_minmax_params params[XNN_MIN_ELEMENTS(1)],
4483 uint8_t a_zero_point,
4484 uint8_t b_zero_point,
4485 uint8_t output_zero_point,
4486 float a_output_scale,
4487 float b_output_scale,
4488 uint8_t output_min,
4489 uint8_t output_max)
4490{
4491 const float abs_a_output_scale = fabsf(a_output_scale);
4492 const float abs_b_output_scale = fabsf(b_output_scale);
4493 assert(abs_a_output_scale >= 0x1.0p-10f);
4494 assert(abs_b_output_scale >= 0x1.0p-10f);
4495 assert(abs_a_output_scale < 0x1.0p+8f);
4496 assert(abs_b_output_scale < 0x1.0p+8f);
4497
4498 // Compute requantization parameters.
4499 const float max_abs_output_scale = math_max_f32(abs_a_output_scale, abs_b_output_scale);
4500 assert(max_abs_output_scale >= 0x1.0p-10f);
4501 assert(max_abs_output_scale < 0x1.0p+8f);
4502 const uint32_t max_scale_bits = float_as_uint32(max_abs_output_scale);
4503 const int32_t max_scale_exponent = (int32_t) (max_scale_bits >> 23) - 127;
4504
4505 // Shift is in [12, 30] range.
4506 const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - max_scale_exponent);
4507 assert(shift <= 30);
4508 assert(shift >= 12);
4509
4510 // Multipliers are in [0, 2**21) range, largest multiplier is in [2**20, 2**21) range.
4511 const int32_t abs_a_multiplier = (int32_t) lrintf(uint32_as_float(float_as_uint32(abs_a_output_scale) + (shift << 23)));
4512 const int32_t abs_b_multiplier = (int32_t) lrintf(uint32_as_float(float_as_uint32(abs_b_output_scale) + (shift << 23)));
4513 assert(math_max_s32(abs_a_multiplier, abs_b_multiplier) >= INT32_C(0x00100000));
4514 assert(abs_a_multiplier <= INT32_C(0x00200000));
4515 assert(abs_b_multiplier <= INT32_C(0x00200000));
4516
4517 const int32_t a_multiplier = signbit(a_output_scale) ? -abs_a_multiplier : abs_a_multiplier;
4518 const int32_t b_multiplier = signbit(b_output_scale) ? -abs_b_multiplier : abs_b_multiplier;
4519
4520 const int32_t rounding = INT32_C(1) << (shift - 1);
4521 const int32_t bias = rounding - a_multiplier * (int32_t) (uint32_t) a_zero_point - b_multiplier * (int32_t) (uint32_t) b_zero_point;
4522 for (uint32_t i = 0; i < 8; i++) {
4523 params->avx2.bias[i] = bias;
4524 params->avx2.a_multiplier[i] = a_multiplier;
4525 params->avx2.b_multiplier[i] = b_multiplier;
4526 }
4527 for (uint32_t i = 0; i < 4; i++) {
4528 params->avx2.shift[i] = (uint64_t) shift;
4529 }
4530 for (uint32_t i = 0; i < 16; i++) {
4531 params->avx2.output_zero_point[i] = (int16_t) (uint16_t) output_zero_point;
4532 params->avx2.output_min[i] = output_min;
4533 params->avx2.output_max[i] = output_max;
4534 }
4535 return sizeof(params->avx2);
4536}
4537
4538size_t xnn_init_qu8_add_minmax_avx512_params(
4539 union xnn_qu8_add_minmax_params params[XNN_MIN_ELEMENTS(1)],
4540 uint8_t a_zero_point,
4541 uint8_t b_zero_point,
4542 uint8_t output_zero_point,
4543 float a_output_scale,
4544 float b_output_scale,
4545 uint8_t output_min,
4546 uint8_t output_max)
4547{
4548 const float abs_a_output_scale = fabsf(a_output_scale);
4549 const float abs_b_output_scale = fabsf(b_output_scale);
4550 assert(abs_a_output_scale >= 0x1.0p-10f);
4551 assert(abs_b_output_scale >= 0x1.0p-10f);
4552 assert(abs_a_output_scale < 0x1.0p+8f);
4553 assert(abs_b_output_scale < 0x1.0p+8f);
4554
4555 // Compute requantization parameters.
4556 const float max_abs_output_scale = math_max_f32(abs_a_output_scale, abs_b_output_scale);
4557 assert(max_abs_output_scale >= 0x1.0p-10f);
4558 assert(max_abs_output_scale < 0x1.0p+8f);
4559 const uint32_t max_scale_bits = float_as_uint32(max_abs_output_scale);
4560 const int32_t max_scale_exponent = (int32_t) (max_scale_bits >> 23) - 127;
4561
4562 // Shift is in [12, 30] range.
4563 const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - max_scale_exponent);
4564 assert(shift <= 30);
4565 assert(shift >= 12);
4566
4567 // Multipliers are in [0, 2**21) range, largest multiplier is in [2**20, 2**21) range.
4568 const int32_t abs_a_multiplier = (int32_t) lrintf(uint32_as_float(float_as_uint32(abs_a_output_scale) + (shift << 23)));
4569 const int32_t abs_b_multiplier = (int32_t) lrintf(uint32_as_float(float_as_uint32(abs_b_output_scale) + (shift << 23)));
4570 assert(math_max_s32(abs_a_multiplier, abs_b_multiplier) >= INT32_C(0x00100000));
4571 assert(abs_a_multiplier <= INT32_C(0x00200000));
4572 assert(abs_b_multiplier <= INT32_C(0x00200000));
4573
4574 const int32_t a_multiplier = signbit(a_output_scale) ? -abs_a_multiplier : abs_a_multiplier;
4575 const int32_t b_multiplier = signbit(b_output_scale) ? -abs_b_multiplier : abs_b_multiplier;
4576
4577 const int32_t rounding = INT32_C(1) << (shift - 1);
4578 const int32_t bias = rounding - a_multiplier * (int32_t) (uint32_t) a_zero_point - b_multiplier * (int32_t) (uint32_t) b_zero_point;
4579 for (uint32_t i = 0; i < 16; i++) {
4580 params->avx512.bias[i] = bias;
4581 params->avx512.a_multiplier[i] = a_multiplier;
4582 params->avx512.b_multiplier[i] = b_multiplier;
4583 }
4584 for (uint32_t i = 0; i < 8; i++) {
4585 params->avx512.shift[i] = (uint64_t) shift;
4586 }
4587 for (uint32_t i = 0; i < 32; i++) {
4588 params->avx512.output_zero_point[i] = (int16_t) (uint16_t) output_zero_point;
4589 params->avx512.output_min[i] = output_min;
4590 params->avx512.output_max[i] = output_max;
4591 }
4592 return sizeof(params->avx512);
4593}
4594#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
4595
4596#if XNN_ARCH_ARM || XNN_ARCH_ARM64
4597size_t xnn_init_qu8_add_minmax_neon_params(
4598 union xnn_qu8_add_minmax_params params[XNN_MIN_ELEMENTS(1)],
4599 uint8_t a_zero_point,
4600 uint8_t b_zero_point,
4601 uint8_t output_zero_point,
4602 float a_output_scale,
4603 float b_output_scale,
4604 uint8_t output_min,
4605 uint8_t output_max)
4606{
4607 const float abs_a_output_scale = fabsf(a_output_scale);
4608 const float abs_b_output_scale = fabsf(b_output_scale);
4609 assert(abs_a_output_scale >= 0x1.0p-10f);
4610 assert(abs_b_output_scale >= 0x1.0p-10f);
4611 assert(abs_a_output_scale < 0x1.0p+8f);
4612 assert(abs_b_output_scale < 0x1.0p+8f);
4613
4614 // Compute requantization parameters.
4615 const float max_abs_output_scale = math_max_f32(abs_a_output_scale, abs_b_output_scale);
4616 assert(max_abs_output_scale >= 0x1.0p-10f);
4617 assert(max_abs_output_scale < 0x1.0p+8f);
4618 const uint32_t max_scale_bits = float_as_uint32(max_abs_output_scale);
4619 const int32_t max_scale_exponent = (int32_t) (max_scale_bits >> 23) - 127;
4620
4621 // Shift is in [12, 30] range.
4622 const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - max_scale_exponent);
4623 assert(shift <= 30);
4624 assert(shift >= 12);
4625
4626 // Multipliers are in [0, 2**21) range, largest multiplier is in [2**20, 2**21) range.
4627 const int32_t abs_a_multiplier = (int32_t) lrintf(uint32_as_float(float_as_uint32(abs_a_output_scale) + (shift << 23)));
4628 const int32_t abs_b_multiplier = (int32_t) lrintf(uint32_as_float(float_as_uint32(abs_b_output_scale) + (shift << 23)));
4629 assert(math_max_s32(abs_a_multiplier, abs_b_multiplier) >= INT32_C(0x00100000));
4630 assert(abs_a_multiplier <= INT32_C(0x00200000));
4631 assert(abs_b_multiplier <= INT32_C(0x00200000));
4632
4633 const int32_t a_multiplier = signbit(a_output_scale) ? -abs_a_multiplier : abs_a_multiplier;
4634 const int32_t b_multiplier = signbit(b_output_scale) ? -abs_b_multiplier : abs_b_multiplier;
4635
4636 params->neon.a_zero_point = a_zero_point;
4637 params->neon.b_zero_point = b_zero_point;
4638 params->neon.a_multiplier = (int32_t) a_multiplier;
4639 params->neon.b_multiplier = (int32_t) b_multiplier;
4640 params->neon.right_shift = (int32_t) -shift;
4641 params->neon.output_zero_point = (int16_t) (uint16_t) output_zero_point;
4642 params->neon.output_min = output_min;
4643 params->neon.output_max = output_max;
4644 return sizeof(params->neon);
4645}
4646#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
4647
4648#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
4649size_t xnn_init_qu8_add_minmax_wasmsimd_params(
4650 union xnn_qu8_add_minmax_params params[XNN_MIN_ELEMENTS(1)],
4651 uint8_t a_zero_point,
4652 uint8_t b_zero_point,
4653 uint8_t output_zero_point,
4654 float a_output_scale,
4655 float b_output_scale,
4656 uint8_t output_min,
4657 uint8_t output_max)
4658{
4659 const float abs_a_output_scale = fabsf(a_output_scale);
4660 const float abs_b_output_scale = fabsf(b_output_scale);
4661 assert(abs_a_output_scale >= 0x1.0p-10f);
4662 assert(abs_b_output_scale >= 0x1.0p-10f);
4663 assert(abs_a_output_scale < 0x1.0p+8f);
4664 assert(abs_b_output_scale < 0x1.0p+8f);
4665
4666 // Compute requantization parameters.
4667 const float max_abs_output_scale = math_max_f32(abs_a_output_scale, abs_b_output_scale);
4668 assert(max_abs_output_scale >= 0x1.0p-10f);
4669 assert(max_abs_output_scale < 0x1.0p+8f);
4670 const uint32_t max_scale_bits = float_as_uint32(max_abs_output_scale);
4671 const int32_t max_scale_exponent = (int32_t) (max_scale_bits >> 23) - 127;
4672
4673 // Shift is in [12, 30] range.
4674 const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - max_scale_exponent);
4675 assert(shift <= 30);
4676 assert(shift >= 12);
4677
4678 // Multipliers are in [0, 2**21) range, largest multiplier is in [2**20, 2**21) range.
4679 const int32_t abs_a_multiplier = (int32_t) lrintf(uint32_as_float(float_as_uint32(abs_a_output_scale) + (shift << 23)));
4680 const int32_t abs_b_multiplier = (int32_t) lrintf(uint32_as_float(float_as_uint32(abs_b_output_scale) + (shift << 23)));
4681 assert(math_max_s32(abs_a_multiplier, abs_b_multiplier) >= INT32_C(0x00100000));
4682 assert(abs_a_multiplier <= INT32_C(0x00200000));
4683 assert(abs_b_multiplier <= INT32_C(0x00200000));
4684
4685 const int32_t a_multiplier = signbit(a_output_scale) ? -abs_a_multiplier : abs_a_multiplier;
4686 const int32_t b_multiplier = signbit(b_output_scale) ? -abs_b_multiplier : abs_b_multiplier;
4687
4688 const int32_t rounding = INT32_C(1) << (shift - 1);
4689 const int32_t bias = rounding - a_multiplier * (int32_t) (uint32_t) a_zero_point - b_multiplier * (int32_t) (uint32_t) b_zero_point;
4690 for (uint32_t i = 0; i < 2; i++) {
4691 params->wasmsimd.bias[i] = bias;
4692 params->wasmsimd.a_multiplier[i] = a_multiplier;
4693 params->wasmsimd.b_multiplier[i] = b_multiplier;
4694 }
4695 params->wasmsimd.shift = shift;
4696 for (uint32_t i = 0; i < 4; i++) {
4697 params->wasmsimd.output_zero_point[i] = (int16_t) (uint16_t) output_zero_point;
4698 }
4699 for (uint32_t i = 0; i < 8; i++) {
4700 params->wasmsimd.output_min[i] = output_min;
4701 params->wasmsimd.output_max[i] = output_max;
4702 }
4703 return sizeof(params->wasmsimd);
4704}
4705#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
4706
4707size_t xnn_init_qu8_add_minmax_scalar_params(
4708 union xnn_qu8_add_minmax_params params[XNN_MIN_ELEMENTS(1)],
4709 uint8_t a_zero_point,
4710 uint8_t b_zero_point,
4711 uint8_t output_zero_point,
4712 float a_output_scale,
4713 float b_output_scale,
4714 uint8_t output_min,
4715 uint8_t output_max)
4716{
4717 const float abs_a_output_scale = fabsf(a_output_scale);
4718 const float abs_b_output_scale = fabsf(b_output_scale);
4719 assert(abs_a_output_scale >= 0x1.0p-10f);
4720 assert(abs_b_output_scale >= 0x1.0p-10f);
4721 assert(abs_a_output_scale < 0x1.0p+8f);
4722 assert(abs_b_output_scale < 0x1.0p+8f);
4723
4724 // Compute requantization parameters.
4725 const float max_abs_output_scale = math_max_f32(abs_a_output_scale, abs_b_output_scale);
4726 assert(max_abs_output_scale >= 0x1.0p-10f);
4727 assert(max_abs_output_scale < 0x1.0p+8f);
4728 const uint32_t max_scale_bits = float_as_uint32(max_abs_output_scale);
4729 const int32_t max_scale_exponent = (int32_t) (max_scale_bits >> 23) - 127;
4730
4731 // Shift is in [12, 30] range.
4732 const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - max_scale_exponent);
4733 assert(shift <= 30);
4734 assert(shift >= 12);
4735
4736 // Multipliers are in [0, 2**21) range, largest multiplier is in [2**20, 2**21) range.
4737 const int32_t abs_a_multiplier = (int32_t) lrintf(uint32_as_float(float_as_uint32(abs_a_output_scale) + (shift << 23)));
4738 const int32_t abs_b_multiplier = (int32_t) lrintf(uint32_as_float(float_as_uint32(abs_b_output_scale) + (shift << 23)));
4739 assert(math_max_s32(abs_a_multiplier, abs_b_multiplier) >= INT32_C(0x00100000));
4740 assert(abs_a_multiplier <= INT32_C(0x00200000));
4741 assert(abs_b_multiplier <= INT32_C(0x00200000));
4742
4743 const int32_t a_multiplier = signbit(a_output_scale) ? -abs_a_multiplier : abs_a_multiplier;
4744 const int32_t b_multiplier = signbit(b_output_scale) ? -abs_b_multiplier : abs_b_multiplier;
4745
4746 const int32_t rounding = INT32_C(1) << (shift - 1);
4747 params->scalar.bias = rounding - a_multiplier * (int32_t) (uint32_t) a_zero_point - b_multiplier * (int32_t) (uint32_t) b_zero_point;
4748 params->scalar.a_multiplier = a_multiplier;
4749 params->scalar.b_multiplier = b_multiplier;
4750 params->scalar.shift = shift;
4751 params->scalar.output_min_less_zero_point = (int32_t) (uint32_t) output_min - (int32_t) (uint32_t) output_zero_point;
4752 params->scalar.output_max_less_zero_point = (int32_t) (uint32_t) output_max - (int32_t) (uint32_t) output_zero_point;
4753 params->scalar.output_zero_point = (int32_t) (uint32_t) output_zero_point;
4754 return sizeof(params->scalar);
4755}
4756
4757#if XNN_ARCH_X86 || XNN_ARCH_X86_64
4758size_t xnn_init_qs8_add_minmax_sse2_params(
4759 union xnn_qs8_add_minmax_params params[XNN_MIN_ELEMENTS(1)],
4760 int8_t a_zero_point,
4761 int8_t b_zero_point,
4762 int8_t output_zero_point,
4763 float a_output_scale,
4764 float b_output_scale,
4765 int8_t output_min,
4766 int8_t output_max)
4767{
4768 const float abs_a_output_scale = fabsf(a_output_scale);
4769 const float abs_b_output_scale = fabsf(b_output_scale);
4770 assert(abs_a_output_scale >= 0x1.0p-10f);
4771 assert(abs_b_output_scale >= 0x1.0p-10f);
4772 assert(abs_a_output_scale < 0x1.0p+8f);
4773 assert(abs_b_output_scale < 0x1.0p+8f);
4774
4775 // Compute requantization parameters.
4776 const float max_abs_output_scale = math_max_f32(abs_a_output_scale, abs_b_output_scale);
4777 assert(max_abs_output_scale >= 0x1.0p-10f);
4778 assert(max_abs_output_scale < 0x1.0p+8f);
4779 const uint32_t max_scale_bits = float_as_uint32(max_abs_output_scale);
4780 const int32_t max_scale_exponent = (int32_t) (max_scale_bits >> 23) - 127;
4781
4782 // Shift is in [12, 30] range.
4783 const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - max_scale_exponent);
4784 assert(shift <= 30);
4785 assert(shift >= 12);
4786
4787 // Multipliers are in [0, 2**21) range, largest multiplier is in [2**20, 2**21) range.
4788 const int32_t abs_a_multiplier = (int32_t) lrintf(uint32_as_float(float_as_uint32(abs_a_output_scale) + (shift << 23)));
4789 const int32_t abs_b_multiplier = (int32_t) lrintf(uint32_as_float(float_as_uint32(abs_b_output_scale) + (shift << 23)));
4790 assert(math_max_s32(abs_a_multiplier, abs_b_multiplier) >= INT32_C(0x00100000));
4791 assert(abs_a_multiplier <= INT32_C(0x00200000));
4792 assert(abs_b_multiplier <= INT32_C(0x00200000));
4793
4794 const int32_t a_multiplier = signbit(a_output_scale) ? -abs_a_multiplier : abs_a_multiplier;
4795 const int32_t b_multiplier = signbit(b_output_scale) ? -abs_b_multiplier : abs_b_multiplier;
4796
4797 const int32_t rounding = INT32_C(1) << (shift - 1);
4798 const int32_t bias = rounding - a_multiplier * (int32_t) a_zero_point - b_multiplier * (int32_t) b_zero_point;
4799 for (uint32_t i = 0; i < 4; i++) {
4800 params->sse2.bias[i] = bias;
4801 }
4802 const uint16_t a_multiplier_lo = (uint16_t) a_multiplier;
4803 const uint16_t a_multiplier_hi = (uint16_t) ((uint32_t) a_multiplier >> 16);
4804 const uint16_t b_multiplier_lo = (uint16_t) b_multiplier;
4805 const uint16_t b_multiplier_hi = (uint16_t) ((uint32_t) b_multiplier >> 16);
4806 for (uint32_t i = 0; i < 8; i++) {
4807 params->sse2.a_multiplier_lo[i] = a_multiplier_lo;
4808 params->sse2.a_multiplier_hi[i] = a_multiplier_hi;
4809 params->sse2.b_multiplier_lo[i] = b_multiplier_lo;
4810 params->sse2.b_multiplier_hi[i] = b_multiplier_hi;
4811 }
4812 params->sse2.shift = shift;
4813 params->sse2.b_multiplier = (uint32_t) b_multiplier;
4814 for (uint32_t i = 0; i < 8; i++) {
4815 params->sse2.output_zero_point[i] = (int16_t) output_zero_point;
4816 params->sse2.output_min[i] = (int16_t) output_min;
4817 params->sse2.output_max[i] = (int16_t) output_max;
4818 }
4819 return sizeof(params->sse2);
4820}
4821
4822size_t xnn_init_qs8_add_minmax_sse4_mul16_params(
4823 union xnn_qs8_add_minmax_params params[XNN_MIN_ELEMENTS(1)],
4824 int8_t a_zero_point,
4825 int8_t b_zero_point,
4826 int8_t output_zero_point,
4827 float a_output_scale,
4828 float b_output_scale,
4829 int8_t output_min,
4830 int8_t output_max)
4831{
4832 const float abs_a_output_scale = fabsf(a_output_scale);
4833 const float abs_b_output_scale = fabsf(b_output_scale);
4834 assert(abs_a_output_scale >= 0x1.0p-10f);
4835 assert(abs_b_output_scale >= 0x1.0p-10f);
4836 assert(abs_a_output_scale < 0x1.0p+8f);
4837 assert(abs_b_output_scale < 0x1.0p+8f);
4838
4839 // Compute requantization parameters.
4840 const float max_abs_output_scale = math_max_f32(abs_a_output_scale, abs_b_output_scale);
4841 assert(max_abs_output_scale >= 0x1.0p-10f);
4842 assert(max_abs_output_scale < 0x1.0p+8f);
4843 const uint32_t max_scale_bits = float_as_uint32(max_abs_output_scale);
4844 const int32_t max_scale_exponent = (int32_t) (max_scale_bits >> 23) - 127;
4845
4846 // Shift is in [12, 30] range.
4847 const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - max_scale_exponent);
4848 assert(shift <= 30);
4849 assert(shift >= 12);
4850
4851 // Multipliers are in [0, 2**21) range, largest multiplier is in [2**20, 2**21) range.
4852 const int32_t abs_a_multiplier = (int32_t) lrintf(uint32_as_float(float_as_uint32(abs_a_output_scale) + (shift << 23)));
4853 const int32_t abs_b_multiplier = (int32_t) lrintf(uint32_as_float(float_as_uint32(abs_b_output_scale) + (shift << 23)));
4854 assert(math_max_s32(abs_a_multiplier, abs_b_multiplier) >= INT32_C(0x00100000));
4855 assert(abs_a_multiplier <= INT32_C(0x00200000));
4856 assert(abs_b_multiplier <= INT32_C(0x00200000));
4857
4858 const int32_t a_multiplier = signbit(a_output_scale) ? -abs_a_multiplier : abs_a_multiplier;
4859 const int32_t b_multiplier = signbit(b_output_scale) ? -abs_b_multiplier : abs_b_multiplier;
4860
4861 const int32_t rounding = INT32_C(1) << (shift - 1);
4862 const int32_t bias = rounding - a_multiplier * (int32_t) a_zero_point - b_multiplier * (int32_t) b_zero_point;
4863 for (uint32_t i = 0; i < 4; i++) {
4864 params->sse4_mul16.bias[i] = bias;
4865 }
4866 const uint16_t a_multiplier_lo = (uint16_t) a_multiplier;
4867 const uint16_t a_multiplier_hi = (uint16_t) ((uint32_t) a_multiplier >> 16);
4868 const uint16_t b_multiplier_lo = (uint16_t) b_multiplier;
4869 const uint16_t b_multiplier_hi = (uint16_t) ((uint32_t) b_multiplier >> 16);
4870 for (uint32_t i = 0; i < 8; i++) {
4871 params->sse4_mul16.a_multiplier_lo[i] = a_multiplier_lo;
4872 params->sse4_mul16.a_multiplier_hi[i] = a_multiplier_hi;
4873 params->sse4_mul16.b_multiplier_lo[i] = b_multiplier_lo;
4874 params->sse4_mul16.b_multiplier_hi[i] = b_multiplier_hi;
4875 }
4876 params->sse4_mul16.shift = shift;
4877 params->sse4_mul16.b_multiplier = (uint32_t) b_multiplier;
4878 for (uint32_t i = 0; i < 8; i++) {
4879 params->sse4_mul16.output_zero_point[i] = (int16_t) output_zero_point;
4880 }
4881 for (uint32_t i = 0; i < 16; i++) {
4882 params->sse4_mul16.output_min[i] = output_min;
4883 params->sse4_mul16.output_max[i] = output_max;
4884 }
4885 return sizeof(params->sse4_mul16);
4886}
4887
4888size_t xnn_init_qs8_add_minmax_sse4_mul32_params(
4889 union xnn_qs8_add_minmax_params params[XNN_MIN_ELEMENTS(1)],
4890 int8_t a_zero_point,
4891 int8_t b_zero_point,
4892 int8_t output_zero_point,
4893 float a_output_scale,
4894 float b_output_scale,
4895 int8_t output_min,
4896 int8_t output_max)
4897{
4898 const float abs_a_output_scale = fabsf(a_output_scale);
4899 const float abs_b_output_scale = fabsf(b_output_scale);
4900 assert(abs_a_output_scale >= 0x1.0p-10f);
4901 assert(abs_b_output_scale >= 0x1.0p-10f);
4902 assert(abs_a_output_scale < 0x1.0p+8f);
4903 assert(abs_b_output_scale < 0x1.0p+8f);
4904
4905 // Compute requantization parameters.
4906 const float max_abs_output_scale = math_max_f32(abs_a_output_scale, abs_b_output_scale);
4907 assert(max_abs_output_scale >= 0x1.0p-10f);
4908 assert(max_abs_output_scale < 0x1.0p+8f);
4909 const uint32_t max_scale_bits = float_as_uint32(max_abs_output_scale);
4910 const int32_t max_scale_exponent = (int32_t) (max_scale_bits >> 23) - 127;
4911
4912 // Shift is in [12, 30] range.
4913 const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - max_scale_exponent);
4914 assert(shift <= 30);
4915 assert(shift >= 12);
4916
4917 // Multipliers are in [0, 2**21) range, largest multiplier is in [2**20, 2**21) range.
4918 const int32_t abs_a_multiplier = (int32_t) lrintf(uint32_as_float(float_as_uint32(abs_a_output_scale) + (shift << 23)));
4919 const int32_t abs_b_multiplier = (int32_t) lrintf(uint32_as_float(float_as_uint32(abs_b_output_scale) + (shift << 23)));
4920 assert(math_max_s32(abs_a_multiplier, abs_b_multiplier) >= INT32_C(0x00100000));
4921 assert(abs_a_multiplier <= INT32_C(0x00200000));
4922 assert(abs_b_multiplier <= INT32_C(0x00200000));
4923
4924 const int32_t a_multiplier = signbit(a_output_scale) ? -abs_a_multiplier : abs_a_multiplier;
4925 const int32_t b_multiplier = signbit(b_output_scale) ? -abs_b_multiplier : abs_b_multiplier;
4926
4927 const int32_t rounding = INT32_C(1) << (shift - 1);
4928 const int32_t bias = rounding - a_multiplier * (int32_t) a_zero_point - b_multiplier * (int32_t) b_zero_point;
4929 for (uint32_t i = 0; i < 4; i++) {
4930 params->sse4_mul32.bias[i] = bias;
4931 params->sse4_mul32.a_multiplier[i] = a_multiplier;
4932 params->sse4_mul32.b_multiplier[i] = b_multiplier;
4933 }
4934 for (uint32_t i = 0; i < 2; i++) {
4935 params->sse4_mul32.shift[i] = (uint64_t) shift;
4936 }
4937 for (uint32_t i = 0; i < 8; i++) {
4938 params->sse4_mul32.output_zero_point[i] = (int16_t) output_zero_point;
4939 }
4940 for (uint32_t i = 0; i < 16; i++) {
4941 params->sse4_mul32.output_min[i] = output_min;
4942 params->sse4_mul32.output_max[i] = output_max;
4943 }
4944 return sizeof(params->sse4_mul32);
4945}
4946
4947size_t xnn_init_qs8_add_minmax_avx2_params(
4948 union xnn_qs8_add_minmax_params params[XNN_MIN_ELEMENTS(1)],
4949 int8_t a_zero_point,
4950 int8_t b_zero_point,
4951 int8_t output_zero_point,
4952 float a_output_scale,
4953 float b_output_scale,
4954 int8_t output_min,
4955 int8_t output_max)
4956{
4957 const float abs_a_output_scale = fabsf(a_output_scale);
4958 const float abs_b_output_scale = fabsf(b_output_scale);
4959 assert(abs_a_output_scale >= 0x1.0p-10f);
4960 assert(abs_b_output_scale >= 0x1.0p-10f);
4961 assert(abs_a_output_scale < 0x1.0p+8f);
4962 assert(abs_b_output_scale < 0x1.0p+8f);
4963
4964 // Compute requantization parameters.
4965 const float max_abs_output_scale = math_max_f32(abs_a_output_scale, abs_b_output_scale);
4966 assert(max_abs_output_scale >= 0x1.0p-10f);
4967 assert(max_abs_output_scale < 0x1.0p+8f);
4968 const uint32_t max_scale_bits = float_as_uint32(max_abs_output_scale);
4969 const int32_t max_scale_exponent = (int32_t) (max_scale_bits >> 23) - 127;
4970
4971 // Shift is in [12, 30] range.
4972 const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - max_scale_exponent);
4973 assert(shift <= 30);
4974 assert(shift >= 12);
4975
4976 // Multipliers are in [0, 2**21) range, largest multiplier is in [2**20, 2**21) range.
4977 const int32_t abs_a_multiplier = (int32_t) lrintf(uint32_as_float(float_as_uint32(abs_a_output_scale) + (shift << 23)));
4978 const int32_t abs_b_multiplier = (int32_t) lrintf(uint32_as_float(float_as_uint32(abs_b_output_scale) + (shift << 23)));
4979 assert(math_max_s32(abs_a_multiplier, abs_b_multiplier) >= INT32_C(0x00100000));
4980 assert(abs_a_multiplier <= INT32_C(0x00200000));
4981 assert(abs_b_multiplier <= INT32_C(0x00200000));
4982
4983 const int32_t a_multiplier = signbit(a_output_scale) ? -abs_a_multiplier : abs_a_multiplier;
4984 const int32_t b_multiplier = signbit(b_output_scale) ? -abs_b_multiplier : abs_b_multiplier;
4985
4986 const int32_t rounding = INT32_C(1) << (shift - 1);
4987 const int32_t bias = rounding - a_multiplier * (int32_t) a_zero_point - b_multiplier * (int32_t) b_zero_point;
4988 for (uint32_t i = 0; i < 8; i++) {
4989 params->avx2.bias[i] = bias;
4990 params->avx2.a_multiplier[i] = a_multiplier;
4991 params->avx2.b_multiplier[i] = b_multiplier;
4992 }
4993 for (uint32_t i = 0; i < 4; i++) {
4994 params->avx2.shift[i] = (uint64_t) shift;
4995 }
4996 for (uint32_t i = 0; i < 16; i++) {
4997 params->avx2.output_zero_point[i] = (int16_t) output_zero_point;
4998 params->avx2.output_min[i] = output_min;
4999 params->avx2.output_max[i] = output_max;
5000 }
5001 return sizeof(params->avx2);
5002}
5003
5004size_t xnn_init_qs8_add_minmax_avx512_params(
5005 union xnn_qs8_add_minmax_params params[XNN_MIN_ELEMENTS(1)],
5006 int8_t a_zero_point,
5007 int8_t b_zero_point,
5008 int8_t output_zero_point,
5009 float a_output_scale,
5010 float b_output_scale,
5011 int8_t output_min,
5012 int8_t output_max)
5013{
5014 const float abs_a_output_scale = fabsf(a_output_scale);
5015 const float abs_b_output_scale = fabsf(b_output_scale);
5016 assert(abs_a_output_scale >= 0x1.0p-10f);
5017 assert(abs_b_output_scale >= 0x1.0p-10f);
5018 assert(abs_a_output_scale < 0x1.0p+8f);
5019 assert(abs_b_output_scale < 0x1.0p+8f);
5020
5021 // Compute requantization parameters.
5022 const float max_abs_output_scale = math_max_f32(abs_a_output_scale, abs_b_output_scale);
5023 assert(max_abs_output_scale >= 0x1.0p-10f);
5024 assert(max_abs_output_scale < 0x1.0p+8f);
5025 const uint32_t max_scale_bits = float_as_uint32(max_abs_output_scale);
5026 const int32_t max_scale_exponent = (int32_t) (max_scale_bits >> 23) - 127;
5027
5028 // Shift is in [12, 30] range.
5029 const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - max_scale_exponent);
5030 assert(shift <= 30);
5031 assert(shift >= 12);
5032
5033 // Multipliers are in [0, 2**21) range, largest multiplier is in [2**20, 2**21) range.
5034 const int32_t abs_a_multiplier = (int32_t) lrintf(uint32_as_float(float_as_uint32(abs_a_output_scale) + (shift << 23)));
5035 const int32_t abs_b_multiplier = (int32_t) lrintf(uint32_as_float(float_as_uint32(abs_b_output_scale) + (shift << 23)));
5036 assert(math_max_s32(abs_a_multiplier, abs_b_multiplier) >= INT32_C(0x00100000));
5037 assert(abs_a_multiplier <= INT32_C(0x00200000));
5038 assert(abs_b_multiplier <= INT32_C(0x00200000));
5039
5040 const int32_t a_multiplier = signbit(a_output_scale) ? -abs_a_multiplier : abs_a_multiplier;
5041 const int32_t b_multiplier = signbit(b_output_scale) ? -abs_b_multiplier : abs_b_multiplier;
5042
5043 const int32_t rounding = INT32_C(1) << (shift - 1);
5044 const int32_t bias = rounding - a_multiplier * (int32_t) a_zero_point - b_multiplier * (int32_t) b_zero_point;
5045 for (uint32_t i = 0; i < 16; i++) {
5046 params->avx512.bias[i] = bias;
5047 params->avx512.a_multiplier[i] = a_multiplier;
5048 params->avx512.b_multiplier[i] = b_multiplier;
5049 }
5050 for (uint32_t i = 0; i < 8; i++) {
5051 params->avx512.shift[i] = (uint64_t) shift;
5052 }
5053 for (uint32_t i = 0; i < 32; i++) {
5054 params->avx512.output_zero_point[i] = (int16_t) output_zero_point;
5055 params->avx512.output_min[i] = output_min;
5056 params->avx512.output_max[i] = output_max;
5057 }
5058 return sizeof(params->avx512);
5059}
5060#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
5061
5062#if XNN_ARCH_ARM || XNN_ARCH_ARM64
5063size_t xnn_init_qs8_add_minmax_neon_params(
5064 union xnn_qs8_add_minmax_params params[XNN_MIN_ELEMENTS(1)],
5065 int8_t a_zero_point,
5066 int8_t b_zero_point,
5067 int8_t output_zero_point,
5068 float a_output_scale,
5069 float b_output_scale,
5070 int8_t output_min,
5071 int8_t output_max)
5072{
5073 const float abs_a_output_scale = fabsf(a_output_scale);
5074 const float abs_b_output_scale = fabsf(b_output_scale);
5075 assert(abs_a_output_scale >= 0x1.0p-10f);
5076 assert(abs_b_output_scale >= 0x1.0p-10f);
5077 assert(abs_a_output_scale < 0x1.0p+8f);
5078 assert(abs_b_output_scale < 0x1.0p+8f);
5079
5080 // Compute requantization parameters.
5081 const float max_abs_output_scale = math_max_f32(abs_a_output_scale, abs_b_output_scale);
5082 assert(max_abs_output_scale >= 0x1.0p-10f);
5083 assert(max_abs_output_scale < 0x1.0p+8f);
5084 const uint32_t max_scale_bits = float_as_uint32(max_abs_output_scale);
5085 const int32_t max_scale_exponent = (int32_t) (max_scale_bits >> 23) - 127;
5086
5087 // Shift is in [12, 30] range.
5088 const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - max_scale_exponent);
5089 assert(shift <= 30);
5090 assert(shift >= 12);
5091
5092 // Multipliers are in [0, 2**21) range, largest multiplier is in [2**20, 2**21) range.
5093 const int32_t abs_a_multiplier = (int32_t) lrintf(uint32_as_float(float_as_uint32(abs_a_output_scale) + (shift << 23)));
5094 const int32_t abs_b_multiplier = (int32_t) lrintf(uint32_as_float(float_as_uint32(abs_b_output_scale) + (shift << 23)));
5095 assert(math_max_s32(abs_a_multiplier, abs_b_multiplier) >= INT32_C(0x00100000));
5096 assert(abs_a_multiplier <= INT32_C(0x00200000));
5097 assert(abs_b_multiplier <= INT32_C(0x00200000));
5098
5099 const int32_t a_multiplier = signbit(a_output_scale) ? -abs_a_multiplier : abs_a_multiplier;
5100 const int32_t b_multiplier = signbit(b_output_scale) ? -abs_b_multiplier : abs_b_multiplier;
5101
5102 params->neon.a_zero_point = a_zero_point;
5103 params->neon.b_zero_point = b_zero_point;
5104 params->neon.a_multiplier = (int32_t) a_multiplier;
5105 params->neon.b_multiplier = (int32_t) b_multiplier;
5106 params->neon.right_shift = (int32_t) -shift;
5107 params->neon.output_zero_point = (int16_t) output_zero_point;
5108 params->neon.output_min = output_min;
5109 params->neon.output_max = output_max;
5110 return sizeof(params->neon);
5111}
5112#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
5113
5114#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
5115size_t xnn_init_qs8_add_minmax_wasmsimd_params(
5116 union xnn_qs8_add_minmax_params params[XNN_MIN_ELEMENTS(1)],
5117 int8_t a_zero_point,
5118 int8_t b_zero_point,
5119 int8_t output_zero_point,
5120 float a_output_scale,
5121 float b_output_scale,
5122 int8_t output_min,
5123 int8_t output_max)
5124{
5125 const float abs_a_output_scale = fabsf(a_output_scale);
5126 const float abs_b_output_scale = fabsf(b_output_scale);
5127 assert(abs_a_output_scale >= 0x1.0p-10f);
5128 assert(abs_b_output_scale >= 0x1.0p-10f);
5129 assert(abs_a_output_scale < 0x1.0p+8f);
5130 assert(abs_b_output_scale < 0x1.0p+8f);
5131
5132 // Compute requantization parameters.
5133 const float max_abs_output_scale = math_max_f32(abs_a_output_scale, abs_b_output_scale);
5134 assert(max_abs_output_scale >= 0x1.0p-10f);
5135 assert(max_abs_output_scale < 0x1.0p+8f);
5136 const uint32_t max_scale_bits = float_as_uint32(max_abs_output_scale);
5137 const int32_t max_scale_exponent = (int32_t) (max_scale_bits >> 23) - 127;
5138
5139 // Shift is in [12, 30] range.
5140 const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - max_scale_exponent);
5141 assert(shift <= 30);
5142 assert(shift >= 12);
5143
5144 // Multipliers are in [0, 2**21) range, largest multiplier is in [2**20, 2**21) range.
5145 const int32_t abs_a_multiplier = (int32_t) lrintf(uint32_as_float(float_as_uint32(abs_a_output_scale) + (shift << 23)));
5146 const int32_t abs_b_multiplier = (int32_t) lrintf(uint32_as_float(float_as_uint32(abs_b_output_scale) + (shift << 23)));
5147 assert(math_max_s32(abs_a_multiplier, abs_b_multiplier) >= INT32_C(0x00100000));
5148 assert(abs_a_multiplier <= INT32_C(0x00200000));
5149 assert(abs_b_multiplier <= INT32_C(0x00200000));
5150
5151 const int32_t a_multiplier = signbit(a_output_scale) ? -abs_a_multiplier : abs_a_multiplier;
5152 const int32_t b_multiplier = signbit(b_output_scale) ? -abs_b_multiplier : abs_b_multiplier;
5153
5154 const int32_t rounding = INT32_C(1) << (shift - 1);
5155 const int32_t bias = rounding - a_multiplier * (int32_t) a_zero_point - b_multiplier * (int32_t) b_zero_point;
5156 for (uint32_t i = 0; i < 2; i++) {
5157 params->wasmsimd.bias[i] = bias;
5158 params->wasmsimd.a_multiplier[i] = a_multiplier;
5159 params->wasmsimd.b_multiplier[i] = b_multiplier;
5160 }
5161 params->wasmsimd.shift = shift;
5162 for (uint32_t i = 0; i < 4; i++) {
5163 params->wasmsimd.output_zero_point[i] = (int16_t) output_zero_point;
5164 }
5165 for (uint32_t i = 0; i < 8; i++) {
5166 params->wasmsimd.output_min[i] = output_min;
5167 params->wasmsimd.output_max[i] = output_max;
5168 }
5169 return sizeof(params->wasmsimd);
5170}
5171#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
5172
5173size_t xnn_init_qs8_add_minmax_scalar_params(
5174 union xnn_qs8_add_minmax_params params[XNN_MIN_ELEMENTS(1)],
5175 int8_t a_zero_point,
5176 int8_t b_zero_point,
5177 int8_t output_zero_point,
5178 float a_output_scale,
5179 float b_output_scale,
5180 int8_t output_min,
5181 int8_t output_max)
5182{
5183 const float abs_a_output_scale = fabsf(a_output_scale);
5184 const float abs_b_output_scale = fabsf(b_output_scale);
5185 assert(abs_a_output_scale >= 0x1.0p-10f);
5186 assert(abs_b_output_scale >= 0x1.0p-10f);
5187 assert(abs_a_output_scale < 0x1.0p+8f);
5188 assert(abs_b_output_scale < 0x1.0p+8f);
5189
5190 // Compute requantization parameters.
5191 const float max_abs_output_scale = math_max_f32(abs_a_output_scale, abs_b_output_scale);
5192 assert(max_abs_output_scale >= 0x1.0p-10f);
5193 assert(max_abs_output_scale < 0x1.0p+8f);
5194 const uint32_t max_scale_bits = float_as_uint32(max_abs_output_scale);
5195 const int32_t max_scale_exponent = (int32_t) (max_scale_bits >> 23) - 127;
5196
5197 // Shift is in [12, 30] range.
5198 const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - max_scale_exponent);
5199 assert(shift <= 30);
5200 assert(shift >= 12);
5201
5202 // Multipliers are in [0, 2**21) range, largest multiplier is in [2**20, 2**21) range.
5203 const int32_t abs_a_multiplier = (int32_t) lrintf(uint32_as_float(float_as_uint32(abs_a_output_scale) + (shift << 23)));
5204 const int32_t abs_b_multiplier = (int32_t) lrintf(uint32_as_float(float_as_uint32(abs_b_output_scale) + (shift << 23)));
5205 assert(math_max_s32(abs_a_multiplier, abs_b_multiplier) >= INT32_C(0x00100000));
5206 assert(abs_a_multiplier <= INT32_C(0x00200000));
5207 assert(abs_b_multiplier <= INT32_C(0x00200000));
5208
5209 const int32_t a_multiplier = signbit(a_output_scale) ? -abs_a_multiplier : abs_a_multiplier;
5210 const int32_t b_multiplier = signbit(b_output_scale) ? -abs_b_multiplier : abs_b_multiplier;
5211
5212 const int32_t rounding = INT32_C(1) << (shift - 1);
5213 params->scalar.bias = rounding - a_multiplier * (int32_t) a_zero_point - b_multiplier * (int32_t) b_zero_point;
5214 params->scalar.a_multiplier = a_multiplier;
5215 params->scalar.b_multiplier = b_multiplier;
5216 params->scalar.shift = shift;
5217 params->scalar.output_min_less_zero_point = (int32_t) output_min - (int32_t) output_zero_point;
5218 params->scalar.output_max_less_zero_point = (int32_t) output_max - (int32_t) output_zero_point;
5219 params->scalar.output_zero_point = (int32_t) output_zero_point;
5220 return sizeof(params->scalar);
5221}
5222
5223size_t xnn_init_qu8_mul_minmax_fp32_scalar_params(
5224 union xnn_qu8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
5225 uint8_t a_zero_point,
5226 uint8_t b_zero_point,
5227 uint8_t output_zero_point,
5228 float product_output_scale,
5229 uint8_t output_min,
5230 uint8_t output_max)
5231{
5232 assert(product_output_scale >= 0x1.0p-16f);
5233 assert(product_output_scale < 0x1.0p+8f);
5234
5235 params->fp32_scalar.a_zero_point = (int16_t) (uint16_t) a_zero_point;
5236 params->fp32_scalar.b_zero_point = (int16_t) (uint16_t) b_zero_point;
5237 params->fp32_scalar.scale = product_output_scale;
5238 params->fp32_scalar.output_min_less_zero_point = (float) (int32_t) ((uint32_t) output_min - (uint32_t) output_zero_point);
5239 params->fp32_scalar.output_max_less_zero_point = (float) (int32_t) ((uint32_t) output_max - (uint32_t) output_zero_point);
5240 params->fp32_scalar.magic_bias = 12582912.0f;
5241 params->fp32_scalar.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) (uint32_t) output_zero_point;
5242 return sizeof(params->fp32_scalar);
5243}
5244
5245#if XNN_ARCH_ARM || XNN_ARCH_ARM64
5246size_t xnn_init_qu8_mul_minmax_fp32_neon_params(
5247 union xnn_qu8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
5248 uint8_t a_zero_point,
5249 uint8_t b_zero_point,
5250 uint8_t output_zero_point,
5251 float product_output_scale,
5252 uint8_t output_min,
5253 uint8_t output_max)
5254{
5255 assert(product_output_scale >= 0x1.0p-16f);
5256 assert(product_output_scale < 0x1.0p+8f);
5257
5258 params->fp32_neon.a_zero_point[0] = a_zero_point;
5259 params->fp32_neon.a_zero_point[1] = a_zero_point;
5260 params->fp32_neon.b_zero_point[0] = b_zero_point;
5261 params->fp32_neon.b_zero_point[1] = b_zero_point;
5262 params->fp32_neon.scale = product_output_scale;
5263 params->fp32_neon.magic_bias = 12582912.0f;
5264 params->fp32_neon.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
5265 params->fp32_neon.output_min = output_min;
5266 params->fp32_neon.output_max = output_max;
5267 return sizeof(params->fp32_neon);
5268}
5269
5270size_t xnn_init_qu8_mul_minmax_fp32_neonv8_params(
5271 union xnn_qu8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
5272 uint8_t a_zero_point,
5273 uint8_t b_zero_point,
5274 uint8_t output_zero_point,
5275 float product_output_scale,
5276 uint8_t output_min,
5277 uint8_t output_max)
5278{
5279 assert(product_output_scale >= 0x1.0p-16f);
5280 assert(product_output_scale < 0x1.0p+8f);
5281
5282 params->fp32_neonv8.a_zero_point[0] = a_zero_point;
5283 params->fp32_neonv8.a_zero_point[1] = a_zero_point;
5284 params->fp32_neonv8.b_zero_point[0] = b_zero_point;
5285 params->fp32_neonv8.b_zero_point[1] = b_zero_point;
5286 params->fp32_neonv8.scale = product_output_scale;
5287 params->fp32_neonv8.output_zero_point = (int16_t) output_zero_point;
5288 params->fp32_neonv8.output_min = output_min;
5289 params->fp32_neonv8.output_max = output_max;
5290 return sizeof(params->fp32_neonv8);
5291}
5292
5293size_t xnn_init_qu8_mul_minmax_rndnu_neon_params(
5294 union xnn_qu8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
5295 uint8_t a_zero_point,
5296 uint8_t b_zero_point,
5297 uint8_t output_zero_point,
5298 float product_output_scale,
5299 uint8_t output_min,
5300 uint8_t output_max)
5301{
5302 assert(product_output_scale >= 0x1.0p-16f);
5303 assert(product_output_scale < 0x1.0p+8f);
5304
5305 // Compute requantization parameters.
5306 const uint32_t scale_bits = float_as_uint32(product_output_scale);
5307
5308 // Multiplier is in [0x40000000, 0x7FFFFF80] range.
5309 const int32_t multiplier = (int32_t) (((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
5310 assert(multiplier >= INT32_C(0x40000000));
5311 assert(multiplier <= INT32_C(0x7FFFFF80));
5312
5313 // Shift is in [-8, 15] range.
5314 const int32_t shift = 127 + 31 - 32 - (scale_bits >> 23);
5315 assert(shift >= -8);
5316 assert(shift < 16);
5317
5318 // Split shift into pre_shift + post_shift, post_shift in [1, 15] range.
5319 const int32_t post_shift = math_max_s32(shift, 1);
5320 const int32_t pre_shift = shift - post_shift;
5321
5322 params->rndnu_neon.a_zero_point[0] = a_zero_point;
5323 params->rndnu_neon.a_zero_point[1] = a_zero_point;
5324 params->rndnu_neon.b_zero_point[0] = b_zero_point;
5325 params->rndnu_neon.b_zero_point[1] = b_zero_point;
5326 params->rndnu_neon.left_pre_shift = -pre_shift;
5327 params->rndnu_neon.multiplier = multiplier;
5328 params->rndnu_neon.left_post_shift = -post_shift;
5329 params->rndnu_neon.output_zero_point = (int16_t) output_zero_point;
5330 params->rndnu_neon.output_min = output_min;
5331 params->rndnu_neon.output_max = output_max;
5332 return sizeof(params->rndnu_neon);
5333}
5334#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
5335
5336#if XNN_ARCH_X86 || XNN_ARCH_X86_64
5337size_t xnn_init_qu8_mul_minmax_fp32_sse2_params(
5338 union xnn_qu8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
5339 uint8_t a_zero_point,
5340 uint8_t b_zero_point,
5341 uint8_t output_zero_point,
5342 float product_output_scale,
5343 uint8_t output_min,
5344 uint8_t output_max)
5345{
5346 assert(product_output_scale >= 0x1.0p-16f);
5347 assert(product_output_scale < 0x1.0p+8f);
5348
5349 for (uint32_t i = 0; i < 8; i++) {
5350 params->fp32_sse2.a_zero_point[i] = (int16_t) (uint16_t) a_zero_point;
5351 params->fp32_sse2.b_zero_point[i] = (int16_t) (uint16_t) b_zero_point;
5352 }
5353 for (uint32_t i = 0; i < 4; i++) {
5354 params->fp32_sse2.scale[i] = product_output_scale;
5355 }
5356 for (uint32_t i = 0; i < 8; i++) {
5357 params->fp32_sse2.output_zero_point[i] = (int16_t) (uint16_t) output_zero_point;
5358 }
5359 for (uint32_t i = 0; i < 16; i++) {
5360 params->fp32_sse2.output_min[i] = output_min;
5361 params->fp32_sse2.output_max[i] = output_max;
5362 }
5363 return sizeof(params->fp32_sse2);
5364}
5365#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
5366
5367#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
5368size_t xnn_init_qu8_mul_minmax_fp32_wasmsimd_params(
5369 union xnn_qu8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
5370 uint8_t a_zero_point,
5371 uint8_t b_zero_point,
5372 uint8_t output_zero_point,
5373 float product_output_scale,
5374 uint8_t output_min,
5375 uint8_t output_max)
5376{
5377 assert(product_output_scale >= 0x1.0p-16f);
5378 assert(product_output_scale < 0x1.0p+8f);
5379
5380 const float output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
5381 const int32_t magic_min = (int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point);
5382 const int32_t magic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
5383 for (uint32_t i = 0; i < 4; i++) {
5384 params->fp32_wasmsimd.a_zero_point[i] = (int16_t) a_zero_point;
5385 params->fp32_wasmsimd.b_zero_point[i] = (int16_t) b_zero_point;
5386 }
5387 for (uint32_t i = 0; i < 2; i++) {
5388 params->fp32_wasmsimd.scale[i] = product_output_scale;
5389 params->fp32_wasmsimd.magic_bias[i] = 12582912.0f;
5390 params->fp32_wasmsimd.magic_min[i] = magic_min;
5391 params->fp32_wasmsimd.magic_bias_less_output_zero_point[i] = magic_bias_less_output_zero_point;
5392 }
5393 for (uint32_t i = 0; i < 8; i++) {
5394 params->fp32_wasmsimd.output_max[i] = output_max;
5395 }
5396 return sizeof(params->fp32_wasmsimd);
5397}
5398#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
5399
5400size_t xnn_init_qs8_mul_minmax_fp32_scalar_params(
5401 union xnn_qs8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
5402 int8_t a_zero_point,
5403 int8_t b_zero_point,
5404 int8_t output_zero_point,
5405 float product_output_scale,
5406 int8_t output_min,
5407 int8_t output_max)
5408{
5409 assert(product_output_scale >= 0x1.0p-16f);
5410 assert(product_output_scale < 0x1.0p+8f);
5411
5412 params->fp32_scalar.a_zero_point = (int16_t) a_zero_point;
5413 params->fp32_scalar.b_zero_point = (int16_t) b_zero_point;
5414 params->fp32_scalar.scale = product_output_scale;
5415 params->fp32_scalar.output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
5416 params->fp32_scalar.output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
5417 params->fp32_scalar.magic_bias = 12582912.0f;
5418 params->fp32_scalar.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
5419 return sizeof(params->fp32_scalar);
5420}
5421
5422#if XNN_ARCH_ARM || XNN_ARCH_ARM64
5423size_t xnn_init_qs8_mul_minmax_fp32_neon_params(
5424 union xnn_qs8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
5425 int8_t a_zero_point,
5426 int8_t b_zero_point,
5427 int8_t output_zero_point,
5428 float product_output_scale,
5429 int8_t output_min,
5430 int8_t output_max)
5431{
5432 assert(product_output_scale >= 0x1.0p-16f);
5433 assert(product_output_scale < 0x1.0p+8f);
5434
5435 params->fp32_neon.a_zero_point[0] = a_zero_point;
5436 params->fp32_neon.a_zero_point[1] = a_zero_point;
5437 params->fp32_neon.b_zero_point[0] = b_zero_point;
5438 params->fp32_neon.b_zero_point[1] = b_zero_point;
5439 params->fp32_neon.scale = product_output_scale;
5440 params->fp32_neon.magic_bias = 12582912.0f;
5441 params->fp32_neon.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
5442 params->fp32_neon.output_min = output_min;
5443 params->fp32_neon.output_max = output_max;
5444 return sizeof(params->fp32_neon);
5445}
5446
5447size_t xnn_init_qs8_mul_minmax_fp32_neonv8_params(
5448 union xnn_qs8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
5449 int8_t a_zero_point,
5450 int8_t b_zero_point,
5451 int8_t output_zero_point,
5452 float product_output_scale,
5453 int8_t output_min,
5454 int8_t output_max)
5455{
5456 assert(product_output_scale >= 0x1.0p-16f);
5457 assert(product_output_scale < 0x1.0p+8f);
5458
5459 params->fp32_neonv8.a_zero_point[0] = a_zero_point;
5460 params->fp32_neonv8.a_zero_point[1] = a_zero_point;
5461 params->fp32_neonv8.b_zero_point[0] = b_zero_point;
5462 params->fp32_neonv8.b_zero_point[1] = b_zero_point;
5463 params->fp32_neonv8.scale = product_output_scale;
5464 params->fp32_neonv8.output_zero_point = (int16_t) output_zero_point;
5465 params->fp32_neonv8.output_min = output_min;
5466 params->fp32_neonv8.output_max = output_max;
5467 return sizeof(params->fp32_neonv8);
5468}
5469
5470size_t xnn_init_qs8_mul_minmax_rndnu_neon_params(
5471 union xnn_qs8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
5472 int8_t a_zero_point,
5473 int8_t b_zero_point,
5474 int8_t output_zero_point,
5475 float product_output_scale,
5476 int8_t output_min,
5477 int8_t output_max)
5478{
5479 assert(product_output_scale >= 0x1.0p-16f);
5480 assert(product_output_scale < 0x1.0p+8f);
5481
5482 // Compute requantization parameters.
5483 const uint32_t scale_bits = float_as_uint32(product_output_scale);
5484
5485 // Multiplier is in [0x40000000, 0x7FFFFF80] range.
5486 const int32_t multiplier = (int32_t) (((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
5487 assert(multiplier >= INT32_C(0x40000000));
5488 assert(multiplier <= INT32_C(0x7FFFFF80));
5489
5490 // Shift is in [-8, 15] range.
5491 const int32_t shift = 127 + 31 - 32 - (scale_bits >> 23);
5492 assert(shift >= -8);
5493 assert(shift < 16);
5494
5495 // Split shift into pre_shift + post_shift, post_shift in [1, 15] range.
5496 const int32_t post_shift = math_max_s32(shift, 1);
5497 const int32_t pre_shift = shift - post_shift;
5498
5499 params->rndnu_neon.a_zero_point[0] = a_zero_point;
5500 params->rndnu_neon.a_zero_point[1] = a_zero_point;
5501 params->rndnu_neon.b_zero_point[0] = b_zero_point;
5502 params->rndnu_neon.b_zero_point[1] = b_zero_point;
5503 params->rndnu_neon.left_pre_shift = -pre_shift;
5504 params->rndnu_neon.multiplier = multiplier;
5505 params->rndnu_neon.left_post_shift = -post_shift;
5506 params->rndnu_neon.output_zero_point = (int16_t) output_zero_point;
5507 params->rndnu_neon.output_min = output_min;
5508 params->rndnu_neon.output_max = output_max;
5509 return sizeof(params->rndnu_neon);
5510}
5511#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
5512
5513#if XNN_ARCH_X86 || XNN_ARCH_X86_64
5514size_t xnn_init_qs8_mul_minmax_fp32_sse2_params(
5515 union xnn_qs8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
5516 int8_t a_zero_point,
5517 int8_t b_zero_point,
5518 int8_t output_zero_point,
5519 float product_output_scale,
5520 int8_t output_min,
5521 int8_t output_max)
5522{
5523 assert(product_output_scale >= 0x1.0p-16f);
5524 assert(product_output_scale < 0x1.0p+8f);
5525
5526 for (uint32_t i = 0; i < 8; i++) {
5527 params->fp32_sse2.a_zero_point[i] = (int16_t) a_zero_point;
5528 params->fp32_sse2.b_zero_point[i] = (int16_t) b_zero_point;
5529 }
5530 for (uint32_t i = 0; i < 4; i++) {
5531 params->fp32_sse2.scale[i] = product_output_scale;
5532 }
5533 for (uint32_t i = 0; i < 8; i++) {
5534 params->fp32_sse2.output_zero_point[i] = (int16_t) output_zero_point;
5535 }
5536 for (uint32_t i = 0; i < 8; i++) {
5537 params->fp32_sse2.output_min[i] = (int16_t) output_min;
5538 params->fp32_sse2.output_max[i] = (int16_t) output_max;
5539 }
5540 return sizeof(params->fp32_sse2);
5541}
5542
5543size_t xnn_init_qs8_mul_minmax_fp32_sse4_params(
5544 union xnn_qs8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
5545 int8_t a_zero_point,
5546 int8_t b_zero_point,
5547 int8_t output_zero_point,
5548 float product_output_scale,
5549 int8_t output_min,
5550 int8_t output_max)
5551{
5552 assert(product_output_scale >= 0x1.0p-16f);
5553 assert(product_output_scale < 0x1.0p+8f);
5554
5555 for (uint32_t i = 0; i < 8; i++) {
5556 params->fp32_sse4.a_zero_point[i] = (int16_t) a_zero_point;
5557 params->fp32_sse4.b_zero_point[i] = (int16_t) b_zero_point;
5558 }
5559 for (uint32_t i = 0; i < 4; i++) {
5560 params->fp32_sse4.scale[i] = product_output_scale;
5561 }
5562 for (uint32_t i = 0; i < 8; i++) {
5563 params->fp32_sse4.output_zero_point[i] = (int16_t) output_zero_point;
5564 }
5565 for (uint32_t i = 0; i < 16; i++) {
5566 params->fp32_sse4.output_min[i] = output_min;
5567 params->fp32_sse4.output_max[i] = output_max;
5568 }
5569 return sizeof(params->fp32_sse4);
5570}
5571#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
5572
5573#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
5574size_t xnn_init_qs8_mul_minmax_fp32_wasmsimd_params(
5575 union xnn_qs8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
5576 int8_t a_zero_point,
5577 int8_t b_zero_point,
5578 int8_t output_zero_point,
5579 float product_output_scale,
5580 int8_t output_min,
5581 int8_t output_max)
5582{
5583 assert(product_output_scale >= 0x1.0p-16f);
5584 assert(product_output_scale < 0x1.0p+8f);
5585
5586 const float output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
5587 const int32_t magic_min = (int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point);
5588 const int32_t magic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
5589 for (uint32_t i = 0; i < 4; i++) {
5590 params->fp32_wasmsimd.a_zero_point[i] = (int16_t) a_zero_point;
5591 params->fp32_wasmsimd.b_zero_point[i] = (int16_t) b_zero_point;
5592 }
5593 for (uint32_t i = 0; i < 2; i++) {
5594 params->fp32_wasmsimd.scale[i] = product_output_scale;
5595 params->fp32_wasmsimd.magic_bias[i] = 12582912.0f;
5596 params->fp32_wasmsimd.magic_min[i] = magic_min;
5597 params->fp32_wasmsimd.magic_bias_less_output_zero_point[i] = magic_bias_less_output_zero_point;
5598 }
5599 for (uint32_t i = 0; i < 8; i++) {
5600 params->fp32_wasmsimd.output_max[i] = output_max;
5601 }
5602 return sizeof(params->fp32_wasmsimd);
5603}
5604#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
5605
5606size_t xnn_init_f16_f32_cvt_scalar_params(
5607 union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)])
5608{
5609 params->scalar.sign_mask = UINT32_C(0x80000000);
5610 params->scalar.exp_offset = UINT32_C(0x70000000);
5611 params->scalar.exp_scale = 0x1.0p-112f;
5612 params->scalar.magic_mask = UINT32_C(0x3F000000);
5613 params->scalar.magic_bias = 0.5f;
5614 params->scalar.denorm_cutoff = UINT32_C(0x08000000);
5615 return sizeof(params->scalar);
5616}
5617
5618#if XNN_ARCH_ARM || XNN_ARCH_ARM64
5619size_t xnn_init_f16_f32_cvt_neon_params(
5620 union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)])
5621{
5622 params->neon.exp_scale = 0x1.0p-112f;
5623 return sizeof(params->neon);
5624}
5625#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
5626
5627#if XNN_ARCH_X86 || XNN_ARCH_X86_64
5628size_t xnn_init_f16_f32_cvt_sse_int16_params(
5629 union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)])
5630{
5631 for (uint32_t i = 0; i < 8; i++) {
5632 params->sse_int16.sign_mask[i] = UINT16_C(0x8000);
5633 params->sse_int16.exp_offset[i] = UINT16_C(0x7000);
5634 }
5635 for (uint32_t i = 0; i < 4; i++) {
5636 params->sse_int16.exp_scale[i] = 0x1.0p-112f;
5637 }
5638 for (uint32_t i = 0; i < 8; i++) {
5639 params->sse_int16.magic_mask[i] = UINT16_C(0x3F00);
5640 }
5641 for (uint32_t i = 0; i < 4; i++) {
5642 params->sse_int16.magic_bias[i] = 0.5f;
5643 }
5644 for (uint32_t i = 0; i < 8; i++) {
5645 params->sse_int16.denorm_cutoff[i] = INT16_C(0x0400);
5646 }
5647 return sizeof(params->sse_int16);
5648}
5649
5650size_t xnn_init_f16_f32_cvt_sse_int32_params(
5651 union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)])
5652{
5653 for (uint32_t i = 0; i < 4; i++) {
5654 params->sse_int32.sign_mask[i] = UINT32_C(0x80000000);
5655 params->sse_int32.exp_offset[i] = UINT32_C(0x70000000);
5656 params->sse_int32.exp_scale[i] = 0x1.0p-112f;
5657 params->sse_int32.magic_bias[i] = UINT32_C(0x3F000000);
5658 params->sse_int32.denorm_cutoff[i] = INT32_C(0x04000000);
5659 }
5660 return sizeof(params->sse_int32);
5661}
5662#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
5663
5664#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
5665size_t xnn_init_f16_f32_cvt_wasmsimd_int16_params(
5666 union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)])
5667{
5668 for (uint32_t i = 0; i < 4; i++) {
5669 params->wasmsimd_int16.sign_mask[i] = UINT16_C(0x8000);
5670 params->wasmsimd_int16.exp_offset[i] = UINT16_C(0x7000);
5671 }
5672 for (uint32_t i = 0; i < 2; i++) {
5673 params->wasmsimd_int16.exp_scale[i] = 0x1.0p-112f;
5674 }
5675 for (uint32_t i = 0; i < 4; i++) {
5676 params->wasmsimd_int16.magic_mask[i] = UINT16_C(0x3F00);
5677 }
5678 for (uint32_t i = 0; i < 2; i++) {
5679 params->wasmsimd_int16.magic_bias[i] = 0.5f;
5680 }
5681 for (uint32_t i = 0; i < 4; i++) {
5682 params->wasmsimd_int16.denorm_cutoff[i] = INT16_C(0x0400);
5683 }
5684 return sizeof(params->wasmsimd_int16);
5685}
5686
5687size_t xnn_init_f16_f32_cvt_wasmsimd_int32_params(
5688 union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)])
5689{
5690 for (uint32_t i = 0; i < 2; i++) {
5691 params->wasmsimd_int32.sign_mask[i] = UINT32_C(0x80000000);
5692 params->wasmsimd_int32.exp_offset[i] = UINT32_C(0x70000000);
5693 params->wasmsimd_int32.exp_scale[i] = 0x1.0p-112f;
5694 params->wasmsimd_int32.magic_bias[i] = UINT32_C(0x3F000000);
5695 params->wasmsimd_int32.denorm_cutoff[i] = INT32_C(0x04000000);
5696 }
5697 return sizeof(params->wasmsimd_int32);
5698}
5699#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
5700
5701size_t xnn_init_f32_f16_cvt_scalar_bitcast_params(
5702 union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS(1)])
5703{
5704 params->scalar_bitcast.nonsign_mask = UINT32_C(0x7FFFFFFF);
5705 params->scalar_bitcast.exp_bias = UINT32_C(0x07800000);
5706 params->scalar_bitcast.scale_to_inf = 0x1.0p+112f;
5707 params->scalar_bitcast.expw_max = UINT32_C(0x7F800000);
5708 params->scalar_bitcast.scale_to_zero = 0x1.0p-110f;
5709 params->scalar_bitcast.bias_min = UINT32_C(0x40000000);
5710 params->scalar_bitcast.exph_mask = UINT16_C(0x7C00);
5711 params->scalar_bitcast.manth_mask = UINT16_C(0x0FFF);
5712 params->scalar_bitcast.nanh = UINT16_C(0x7E00);
5713 return sizeof(params->scalar_bitcast);
5714}
5715
5716size_t xnn_init_f32_f16_cvt_scalar_fabsf_params(
5717 union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS(1)])
5718{
5719 params->scalar_fabsf.scale_to_inf = 0x1.0p+112f;
5720 params->scalar_fabsf.exp_bias = UINT32_C(0x07800000);
5721 params->scalar_fabsf.scale_to_zero = 0x1.0p-110f;
5722 params->scalar_fabsf.expw_max = UINT32_C(0x7F800000);
5723 params->scalar_fabsf.bias_min = UINT32_C(0x40000000);
5724 params->scalar_fabsf.exph_mask = UINT16_C(0x7C00);
5725 params->scalar_fabsf.manth_mask = UINT16_C(0x0FFF);
5726 params->scalar_fabsf.nanh = UINT16_C(0x7E00);
5727 return sizeof(params->scalar_fabsf);
5728}
5729
5730#if XNN_ARCH_ARM || XNN_ARCH_ARM64
5731size_t xnn_init_f32_f16_cvt_neon_params(
5732 union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS(1)])
5733{
5734 params->neon.exp_bias = UINT32_C(0x07800000);
5735 params->neon.scale_to_inf = 0x1.0p+112f;
5736 params->neon.expw_max = UINT32_C(0x7F800000);
5737 params->neon.scale_to_zero = 0x1.0p-110f;
5738 return sizeof(params->neon);
5739}
5740#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
5741
5742#if XNN_ARCH_X86 || XNN_ARCH_X86_64
5743size_t xnn_init_f32_f16_cvt_sse2_params(
5744 union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS(1)])
5745{
5746 for (uint32_t i = 0; i < 4; i++) {
5747 params->sse2.nonsign_mask[i] = UINT32_C(0x7FFFFFFF);
5748 params->sse2.exp_bias[i] = UINT32_C(0x07800000);
5749 params->sse2.scale_to_inf[i] = 0x1.0p+112f;
5750 params->sse2.expw_max[i] = UINT32_C(0x7F800000);
5751 params->sse2.scale_to_zero[i] = 0x1.0p-110f;
5752 }
5753 params->sse2.bias_min[0] = INT16_C(0x8000);
5754 params->sse2.bias_min[1] = INT16_C(0x4000);
5755 params->sse2.bias_min[2] = INT16_C(0x8000);
5756 params->sse2.bias_min[3] = INT16_C(0x4000);
5757 params->sse2.bias_min[4] = INT16_C(0x8000);
5758 params->sse2.bias_min[5] = INT16_C(0x4000);
5759 params->sse2.bias_min[6] = INT16_C(0x8000);
5760 params->sse2.bias_min[7] = INT16_C(0x4000);
5761 for (uint32_t i = 0; i < 4; i++) {
5762 params->sse2.manth_mask[i] = UINT32_C(0x00000FFF);
5763 params->sse2.exph_mask[i] = UINT32_C(0x00007C00);
5764 }
5765 for (uint32_t i = 0; i < 8; i++) {
5766 params->sse2.nanh[i] = UINT16_C(0x7E00);
5767 }
5768 return sizeof(params->sse2);
5769}
5770
5771size_t xnn_init_f32_f16_cvt_f16c_params(
5772 union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS(1)])
5773{
5774 for (uint32_t i = 0; i < 7; i++) {
5775 params->f16c.mask_table[i] = -1;
5776 }
5777 for (uint32_t i = 7; i < 14; i++) {
5778 params->f16c.mask_table[i] = 0;
5779 }
5780 return sizeof(params->f16c);
5781}
5782#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
5783
5784#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
5785size_t xnn_init_f32_f16_cvt_wasmsimd_params(
5786 union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS(1)])
5787{
5788 for (uint32_t i = 0; i < 2; i++) {
5789 params->wasmsimd.exp_bias[i] = UINT32_C(0x07800000);
5790 params->wasmsimd.scale_to_inf[i] = 0x1.0p+112f;
5791 params->wasmsimd.expw_max[i] = UINT32_C(0x7F800000);
5792 params->wasmsimd.scale_to_zero[i] = 0x1.0p-110f;
5793 }
5794 params->wasmsimd.bias_min[0] = INT16_C(0x8000);
5795 params->wasmsimd.bias_min[1] = INT16_C(0x4000);
5796 params->wasmsimd.bias_min[2] = INT16_C(0x8000);
5797 params->wasmsimd.bias_min[3] = INT16_C(0x4000);
5798 for (uint32_t i = 0; i < 2; i++) {
5799 params->wasmsimd.manth_mask[i] = UINT32_C(0x00000FFF);
5800 params->wasmsimd.exph_mask[i] = UINT32_C(0x00007C00);
5801 }
5802 for (uint32_t i = 0; i < 4; i++) {
5803 params->wasmsimd.nanh[i] = UINT16_C(0x7E00);
5804 }
5805 return sizeof(params->wasmsimd);
5806}
5807#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
5808
5809size_t xnn_init_f32_qs8_cvt_scalar_fmagic_params(
5810 union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
5811 float scale,
5812 int8_t output_zero_point,
5813 int8_t output_min,
5814 int8_t output_max)
5815{
5816 params->scalar_fmagic.scale = scale;
5817 params->scalar_fmagic.output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
5818 params->scalar_fmagic.output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
5819 params->scalar_fmagic.magic_bias = 12582912.0f;
5820 params->scalar_fmagic.magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
5821 return sizeof(params->scalar_fmagic);
5822}
5823
5824size_t xnn_init_f32_qs8_cvt_scalar_imagic_params(
5825 union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
5826 float scale,
5827 int8_t output_zero_point,
5828 int8_t output_min,
5829 int8_t output_max)
5830{
5831 const float output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
5832 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
5833 params->scalar_imagic.scale = scale;
5834 params->scalar_imagic.magic_bias = 12582912.0f;
5835 params->scalar_imagic.magic_min = (int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point);
5836 params->scalar_imagic.magic_max = (int32_t) float_as_uint32(12582912.0f + output_max_less_zero_point);
5837 params->scalar_imagic.magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
5838 return sizeof(params->scalar_imagic);
5839}
5840
5841size_t xnn_init_f32_qs8_cvt_scalar_lrintf_params(
5842 union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
5843 float scale,
5844 int8_t output_zero_point,
5845 int8_t output_min,
5846 int8_t output_max)
5847{
5848 params->scalar_lrintf.scale = scale;
5849 params->scalar_lrintf.output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
5850 params->scalar_lrintf.output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
5851 params->scalar_lrintf.output_zero_point = (int32_t) output_zero_point;
5852 return sizeof(params->scalar_lrintf);
5853}
5854
5855#if XNN_ARCH_ARM || XNN_ARCH_ARM64
5856size_t xnn_init_f32_qs8_cvt_neon_params(
5857 union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
5858 float scale,
5859 int8_t output_zero_point,
5860 int8_t output_min,
5861 int8_t output_max)
5862{
5863 params->neon.scale = scale;
5864 params->neon.magic_bias = 12582912.0f;
5865 params->neon.magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
5866 params->neon.output_min = output_min;
5867 params->neon.output_max = output_max;
5868 return sizeof(params->neon);
5869}
5870
5871size_t xnn_init_f32_qs8_cvt_neonv8_params(
5872 union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
5873 float scale,
5874 int8_t output_zero_point,
5875 int8_t output_min,
5876 int8_t output_max)
5877{
5878 params->neonv8.scale = scale;
5879 params->neonv8.output_zero_point = (int16_t) output_zero_point;
5880 params->neonv8.output_min = output_min;
5881 params->neonv8.output_max = output_max;
5882 return sizeof(params->neonv8);
5883}
5884#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
5885
5886#if XNN_ARCH_X86 || XNN_ARCH_X86_64
5887size_t xnn_init_f32_qs8_cvt_sse2_params(
5888 union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
5889 float scale,
5890 int8_t output_zero_point,
5891 int8_t output_min,
5892 int8_t output_max)
5893{
5894 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
5895 for (uint32_t i = 0; i < 4; i++) {
5896 params->sse2.scale[i] = scale;
5897 params->sse2.output_max_less_zero_point[i] = output_max_less_zero_point;
5898 }
5899 for (uint32_t i = 0; i < 8; i++) {
5900 params->sse2.output_zero_point[i] = (int16_t) output_zero_point;
5901 params->sse2.output_min[i] = (int16_t) output_min;
5902 }
5903 return sizeof(params->sse2);
5904}
5905
5906size_t xnn_init_f32_qs8_cvt_sse4_params(
5907 union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
5908 float scale,
5909 int8_t output_zero_point,
5910 int8_t output_min,
5911 int8_t output_max)
5912{
5913 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
5914 for (uint32_t i = 0; i < 4; i++) {
5915 params->sse4.scale[i] = scale;
5916 params->sse4.output_max_less_zero_point[i] = output_max_less_zero_point;
5917 }
5918 for (uint32_t i = 0; i < 8; i++) {
5919 params->sse4.output_zero_point[i] = (int16_t) output_zero_point;
5920 }
5921 for (uint32_t i = 0; i < 16; i++) {
5922 params->sse4.output_min[i] = output_min;
5923 }
5924 return sizeof(params->sse4);
5925}
5926
5927size_t xnn_init_f32_qs8_cvt_avx_params(
5928 union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
5929 float scale,
5930 int8_t output_zero_point,
5931 int8_t output_min,
5932 int8_t output_max)
5933{
5934 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
5935 for (uint32_t i = 0; i < 8; i++) {
5936 params->avx.scale[i] = scale;
5937 params->avx.output_max_less_zero_point[i] = output_max_less_zero_point;
5938 }
5939 for (uint32_t i = 0; i < 8; i++) {
5940 params->avx.output_zero_point[i] = (int16_t) output_zero_point;
5941 }
5942 for (uint32_t i = 0; i < 16; i++) {
5943 params->avx.output_min[i] = output_min;
5944 }
5945 for (uint32_t i = 0; i < 7; i++) {
5946 params->avx.mask_table[i] = -1;
5947 }
5948 for (uint32_t i = 7; i < 14; i++) {
5949 params->avx.mask_table[i] = 0;
5950 }
5951 return sizeof(params->avx);
5952}
5953
5954size_t xnn_init_f32_qs8_cvt_avx2_params(
5955 union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
5956 float scale,
5957 int8_t output_zero_point,
5958 int8_t output_min,
5959 int8_t output_max)
5960{
5961 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
5962 for (uint32_t i = 0; i < 8; i++) {
5963 params->avx2.scale[i] = scale;
5964 params->avx2.output_max_less_zero_point[i] = output_max_less_zero_point;
5965 }
5966 for (uint32_t i = 0; i < 16; i++) {
5967 params->avx2.output_zero_point[i] = (int16_t) output_zero_point;
5968 }
5969 params->avx2.shuffle_mask[0] = 0;
5970 params->avx2.shuffle_mask[1] = 4;
5971 params->avx2.shuffle_mask[2] = 1;
5972 params->avx2.shuffle_mask[3] = 5;
5973 params->avx2.shuffle_mask[4] = 2;
5974 params->avx2.shuffle_mask[5] = 6;
5975 params->avx2.shuffle_mask[6] = 3;
5976 params->avx2.shuffle_mask[7] = 7;
5977 for (uint32_t i = 0; i < 32; i++) {
5978 params->avx2.output_min[i] = output_min;
5979 }
5980 for (uint32_t i = 0; i < 7; i++) {
5981 params->avx2.mask_table[i] = -1;
5982 }
5983 for (uint32_t i = 7; i < 14; i++) {
5984 params->avx2.mask_table[i] = 0;
5985 }
5986 return sizeof(params->avx2);
5987}
5988
5989size_t xnn_init_f32_qs8_cvt_avx512_params(
5990 union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
5991 float scale,
5992 int8_t output_zero_point,
5993 int8_t output_min,
5994 int8_t output_max)
5995{
5996 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
5997 for (uint32_t i = 0; i < 16; i++) {
5998 params->avx512.scale[i] = scale;
5999 params->avx512.output_max_less_zero_point[i] = output_max_less_zero_point;
6000 }
6001 for (uint32_t i = 0; i < 32; i++) {
6002 params->avx512.output_zero_point[i] = (int16_t) output_zero_point;
6003 }
6004 for (uint32_t i = 0; i < 64; i++) {
6005 params->avx512.output_min[i] = output_min;
6006 }
6007 params->avx512.shuffle512_mask[0] = 0;
6008 params->avx512.shuffle512_mask[1] = 4;
6009 params->avx512.shuffle512_mask[2] = 8;
6010 params->avx512.shuffle512_mask[3] = 12;
6011 params->avx512.shuffle512_mask[4] = 1;
6012 params->avx512.shuffle512_mask[5] = 5;
6013 params->avx512.shuffle512_mask[6] = 9;
6014 params->avx512.shuffle512_mask[7] = 13;
6015 params->avx512.shuffle512_mask[8] = 2;
6016 params->avx512.shuffle512_mask[9] = 6;
6017 params->avx512.shuffle512_mask[10] = 10;
6018 params->avx512.shuffle512_mask[11] = 14;
6019 params->avx512.shuffle512_mask[12] = 3;
6020 params->avx512.shuffle512_mask[13] = 7;
6021 params->avx512.shuffle512_mask[14] = 11;
6022 params->avx512.shuffle512_mask[15] = 15;
6023 params->avx512.shuffle256_mask[0] = 0;
6024 params->avx512.shuffle256_mask[1] = 4;
6025 params->avx512.shuffle256_mask[2] = 2;
6026 params->avx512.shuffle256_mask[3] = 6;
6027 params->avx512.shuffle256_mask[4] = 1;
6028 params->avx512.shuffle256_mask[5] = 5;
6029 params->avx512.shuffle256_mask[6] = 3;
6030 params->avx512.shuffle256_mask[7] = 7;
6031 return sizeof(params->avx512);
6032}
6033#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
6034
6035#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
6036size_t xnn_init_f32_qs8_cvt_wasmsimd_cvt_params(
6037 union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
6038 float scale,
6039 int8_t output_zero_point,
6040 int8_t output_min,
6041 int8_t output_max)
6042{
6043 for (uint32_t i = 0; i < 2; i++) {
6044 params->wasmsimd_cvt.scale[i] = scale;
6045 }
6046 for (uint32_t i = 0; i < 4; i++) {
6047 params->wasmsimd_cvt.output_zero_point[i] = (int16_t) output_zero_point;
6048 }
6049 for (uint32_t i = 0; i < 8; i++) {
6050 params->wasmsimd_cvt.output_min[i] = output_min;
6051 params->wasmsimd_cvt.output_max[i] = output_max;
6052 }
6053 return sizeof(params->wasmsimd_cvt);
6054}
6055
6056size_t xnn_init_f32_qs8_cvt_wasmsimd_magic_params(
6057 union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
6058 float scale,
6059 int8_t output_zero_point,
6060 int8_t output_min,
6061 int8_t output_max)
6062{
6063 const float output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
6064 const int32_t magic_min = (int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point);
6065 const int32_t magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
6066 for (uint32_t i = 0; i < 2; i++) {
6067 params->wasmsimd_magic.scale[i] = scale;
6068 params->wasmsimd_magic.magic_bias[i] = 12582912.0f;
6069 params->wasmsimd_magic.magic_min[i] = magic_min;
6070 params->wasmsimd_magic.magic_bias_less_zero_point[i] = magic_bias_less_zero_point;
6071 }
6072 for (uint32_t i = 0; i < 8; i++) {
6073 params->wasmsimd_magic.output_max[i] = output_max;
6074 }
6075 return sizeof(params->wasmsimd_magic);
6076}
6077#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
6078
6079size_t xnn_init_f32_qu8_cvt_scalar_fmagic_params(
6080 union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
6081 float scale,
6082 uint8_t output_zero_point,
6083 uint8_t output_min,
6084 uint8_t output_max)
6085{
6086 params->scalar_fmagic.scale = scale;
6087 params->scalar_fmagic.output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
6088 params->scalar_fmagic.output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
6089 params->scalar_fmagic.magic_bias = 12582912.0f;
6090 params->scalar_fmagic.magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
6091 return sizeof(params->scalar_fmagic);
6092}
6093
6094size_t xnn_init_f32_qu8_cvt_scalar_imagic_params(
6095 union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
6096 float scale,
6097 uint8_t output_zero_point,
6098 uint8_t output_min,
6099 uint8_t output_max)
6100{
6101 const float output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
6102 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
6103 params->scalar_imagic.scale = scale;
6104 params->scalar_imagic.magic_bias = 12582912.0f;
6105 params->scalar_imagic.magic_min = (int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point);
6106 params->scalar_imagic.magic_max = (int32_t) float_as_uint32(12582912.0f + output_max_less_zero_point);
6107 params->scalar_imagic.magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
6108 return sizeof(params->scalar_imagic);
6109}
6110
6111size_t xnn_init_f32_qu8_cvt_scalar_lrintf_params(
6112 union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
6113 float scale,
6114 uint8_t output_zero_point,
6115 uint8_t output_min,
6116 uint8_t output_max)
6117{
6118 params->scalar_lrintf.scale = scale;
6119 params->scalar_lrintf.output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
6120 params->scalar_lrintf.output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
6121 params->scalar_lrintf.output_zero_point = (int32_t) output_zero_point;
6122 return sizeof(params->scalar_lrintf);
6123}
6124
6125#if XNN_ARCH_ARM || XNN_ARCH_ARM64
6126size_t xnn_init_f32_qu8_cvt_neon_params(
6127 union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
6128 float scale,
6129 uint8_t output_zero_point,
6130 uint8_t output_min,
6131 uint8_t output_max)
6132{
6133 params->neon.scale = scale;
6134 params->neon.magic_bias = 12582912.0f;
6135 params->neon.magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
6136 params->neon.output_min = output_min;
6137 params->neon.output_max = output_max;
6138 return sizeof(params->neon);
6139}
6140
6141size_t xnn_init_f32_qu8_cvt_neonv8_params(
6142 union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
6143 float scale,
6144 uint8_t output_zero_point,
6145 uint8_t output_min,
6146 uint8_t output_max)
6147{
6148 params->neonv8.scale = scale;
6149 params->neonv8.output_zero_point = (int16_t) output_zero_point;
6150 params->neonv8.output_min = output_min;
6151 params->neonv8.output_max = output_max;
6152 return sizeof(params->neonv8);
6153}
6154#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
6155
6156#if XNN_ARCH_X86 || XNN_ARCH_X86_64
6157size_t xnn_init_f32_qu8_cvt_sse2_params(
6158 union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
6159 float scale,
6160 uint8_t output_zero_point,
6161 uint8_t output_min,
6162 uint8_t output_max)
6163{
6164 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
6165 for (uint32_t i = 0; i < 4; i++) {
6166 params->sse2.scale[i] = scale;
6167 params->sse2.output_max_less_zero_point[i] = output_max_less_zero_point;
6168 }
6169 for (uint32_t i = 0; i < 8; i++) {
6170 params->sse2.output_zero_point[i] = (int16_t) output_zero_point;
6171 }
6172 for (uint32_t i = 0; i < 16; i++) {
6173 params->sse2.output_min[i] = output_min;
6174 }
6175 return sizeof(params->sse2);
6176}
6177
6178size_t xnn_init_f32_qu8_cvt_avx_params(
6179 union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
6180 float scale,
6181 uint8_t output_zero_point,
6182 uint8_t output_min,
6183 uint8_t output_max)
6184{
6185 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
6186 for (uint32_t i = 0; i < 8; i++) {
6187 params->avx.scale[i] = scale;
6188 params->avx.output_max_less_zero_point[i] = output_max_less_zero_point;
6189 }
6190 for (uint32_t i = 0; i < 8; i++) {
6191 params->avx.output_zero_point[i] = (int16_t) output_zero_point;
6192 }
6193 for (uint32_t i = 0; i < 16; i++) {
6194 params->avx.output_min[i] = output_min;
6195 }
6196 for (uint32_t i = 0; i < 7; i++) {
6197 params->avx.mask_table[i] = -1;
6198 }
6199 for (uint32_t i = 7; i < 14; i++) {
6200 params->avx.mask_table[i] = 0;
6201 }
6202 return sizeof(params->avx);
6203}
6204
6205size_t xnn_init_f32_qu8_cvt_avx2_params(
6206 union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
6207 float scale,
6208 uint8_t output_zero_point,
6209 uint8_t output_min,
6210 uint8_t output_max)
6211{
6212 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
6213 for (uint32_t i = 0; i < 8; i++) {
6214 params->avx2.scale[i] = scale;
6215 params->avx2.output_max_less_zero_point[i] = output_max_less_zero_point;
6216 }
6217 for (uint32_t i = 0; i < 16; i++) {
6218 params->avx2.output_zero_point[i] = (int16_t) output_zero_point;
6219 }
6220 params->avx2.shuffle_mask[0] = 0;
6221 params->avx2.shuffle_mask[1] = 4;
6222 params->avx2.shuffle_mask[2] = 1;
6223 params->avx2.shuffle_mask[3] = 5;
6224 params->avx2.shuffle_mask[4] = 2;
6225 params->avx2.shuffle_mask[5] = 6;
6226 params->avx2.shuffle_mask[6] = 3;
6227 params->avx2.shuffle_mask[7] = 7;
6228 for (uint32_t i = 0; i < 32; i++) {
6229 params->avx2.output_min[i] = output_min;
6230 }
6231 for (uint32_t i = 0; i < 7; i++) {
6232 params->avx2.mask_table[i] = -1;
6233 }
6234 for (uint32_t i = 7; i < 14; i++) {
6235 params->avx2.mask_table[i] = 0;
6236 }
6237 return sizeof(params->avx2);
6238}
6239
6240size_t xnn_init_f32_qu8_cvt_avx512_params(
6241 union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
6242 float scale,
6243 uint8_t output_zero_point,
6244 uint8_t output_min,
6245 uint8_t output_max)
6246{
6247 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
6248 for (uint32_t i = 0; i < 16; i++) {
6249 params->avx512.scale[i] = scale;
6250 params->avx512.output_max_less_zero_point[i] = output_max_less_zero_point;
6251 }
6252 for (uint32_t i = 0; i < 32; i++) {
6253 params->avx512.output_zero_point[i] = (int16_t) output_zero_point;
6254 }
6255 for (uint32_t i = 0; i < 64; i++) {
6256 params->avx512.output_min[i] = output_min;
6257 }
6258 params->avx512.shuffle512_mask[0] = 0;
6259 params->avx512.shuffle512_mask[1] = 4;
6260 params->avx512.shuffle512_mask[2] = 8;
6261 params->avx512.shuffle512_mask[3] = 12;
6262 params->avx512.shuffle512_mask[4] = 1;
6263 params->avx512.shuffle512_mask[5] = 5;
6264 params->avx512.shuffle512_mask[6] = 9;
6265 params->avx512.shuffle512_mask[7] = 13;
6266 params->avx512.shuffle512_mask[8] = 2;
6267 params->avx512.shuffle512_mask[9] = 6;
6268 params->avx512.shuffle512_mask[10] = 10;
6269 params->avx512.shuffle512_mask[11] = 14;
6270 params->avx512.shuffle512_mask[12] = 3;
6271 params->avx512.shuffle512_mask[13] = 7;
6272 params->avx512.shuffle512_mask[14] = 11;
6273 params->avx512.shuffle512_mask[15] = 15;
6274 params->avx512.shuffle256_mask[0] = 0;
6275 params->avx512.shuffle256_mask[1] = 4;
6276 params->avx512.shuffle256_mask[2] = 2;
6277 params->avx512.shuffle256_mask[3] = 6;
6278 params->avx512.shuffle256_mask[4] = 1;
6279 params->avx512.shuffle256_mask[5] = 5;
6280 params->avx512.shuffle256_mask[6] = 3;
6281 params->avx512.shuffle256_mask[7] = 7;
6282 return sizeof(params->avx512);
6283}
6284#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
6285
6286#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
6287size_t xnn_init_f32_qu8_cvt_wasmsimd_cvt_params(
6288 union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
6289 float scale,
6290 uint8_t output_zero_point,
6291 uint8_t output_min,
6292 uint8_t output_max)
6293{
6294 for (uint32_t i = 0; i < 2; i++) {
6295 params->wasmsimd_cvt.scale[i] = scale;
6296 }
6297 for (uint32_t i = 0; i < 4; i++) {
6298 params->wasmsimd_cvt.output_zero_point[i] = (int16_t) output_zero_point;
6299 }
6300 for (uint32_t i = 0; i < 8; i++) {
6301 params->wasmsimd_cvt.output_min[i] = output_min;
6302 params->wasmsimd_cvt.output_max[i] = output_max;
6303 }
6304 return sizeof(params->wasmsimd_cvt);
6305}
6306
6307size_t xnn_init_f32_qu8_cvt_wasmsimd_magic_params(
6308 union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
6309 float scale,
6310 uint8_t output_zero_point,
6311 uint8_t output_min,
6312 uint8_t output_max)
6313{
6314 const float output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
6315 const int32_t magic_min = (int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point);
6316 const int32_t magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
6317 for (uint32_t i = 0; i < 2; i++) {
6318 params->wasmsimd_magic.scale[i] = scale;
6319 params->wasmsimd_magic.magic_bias[i] = 12582912.0f;
6320 params->wasmsimd_magic.magic_min[i] = magic_min;
6321 params->wasmsimd_magic.magic_bias_less_zero_point[i] = magic_bias_less_zero_point;
6322 }
6323 for (uint32_t i = 0; i < 8; i++) {
6324 params->wasmsimd_magic.output_max[i] = output_max;
6325 }
6326 return sizeof(params->wasmsimd_magic);
6327}
6328#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
6329
6330size_t xnn_init_qs8_cvt_scalar_params(
6331 union xnn_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
6332 float input_output_scale,
6333 int8_t input_zero_point,
6334 int8_t output_zero_point)
6335{
6336 assert(input_output_scale >= 0x1.0p-8);
6337 assert(input_output_scale <= 0x1.0p+7);
6338
6339 const long multiplier = lrintf(256.0f * input_output_scale);
6340 assert(multiplier >= 1L);
6341 assert(multiplier <= 32768L);
6342 params->scalar.bias = ((int32_t) output_zero_point << 8) - (int32_t) multiplier * (int32_t) input_zero_point + INT32_C(0x80);
6343 params->scalar.multiplier = (int32_t) multiplier;
6344 return sizeof(params->scalar);
6345}
6346
6347#if XNN_ARCH_ARM
6348size_t xnn_init_qs8_cvt_armsimd32_params(
6349 union xnn_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
6350 float input_output_scale,
6351 int8_t input_zero_point,
6352 int8_t output_zero_point)
6353{
6354 assert(input_output_scale >= 0x1.0p-8);
6355 assert(input_output_scale <= 0x1.0p+7);
6356
6357 const long multiplier = lrintf(131072.0f * input_output_scale);
6358 assert(multiplier >= 512L);
6359 assert(multiplier <= 16777216L);
6360 const uint16_t minus_input_zero_point = -(int16_t) input_zero_point;
6361 params->armsimd32.minus_input_zero_point = (uint32_t) minus_input_zero_point * UINT32_C(0x00010001);
6362 params->armsimd32.multiplier = (int32_t) multiplier;
6363 params->armsimd32.bias = ((int32_t) output_zero_point << 1) + INT32_C(1);
6364 return sizeof(params->armsimd32);
6365}
6366#endif // XNN_ARCH_ARM
6367
6368#if XNN_ARCH_ARM || XNN_ARCH_ARM64
6369size_t xnn_init_qs8_cvt_neon_params(
6370 union xnn_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
6371 float input_output_scale,
6372 int8_t input_zero_point,
6373 int8_t output_zero_point)
6374{
6375 assert(input_output_scale >= 0x1.0p-8);
6376 assert(input_output_scale <= 0x1.0p+7);
6377
6378 const long multiplier = lrintf(-256.0f * input_output_scale);
6379 assert(multiplier <= -1L);
6380 assert(multiplier >= -32768L);
6381 params->neon.input_zero_point = (int16_t) input_zero_point;
6382 params->neon.multiplier = (int16_t) multiplier;
6383 params->neon.output_zero_point = (int16_t) output_zero_point;
6384 return sizeof(params->neon);
6385}
6386#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
6387
6388#if XNN_ARCH_X86 || XNN_ARCH_X86_64
6389size_t xnn_init_qs8_cvt_sse2_params(
6390 union xnn_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
6391 float input_output_scale,
6392 int8_t input_zero_point,
6393 int8_t output_zero_point)
6394{
6395 assert(input_output_scale >= 0x1.0p-8);
6396 assert(input_output_scale <= 0x1.0p+7);
6397
6398 const long multiplier = lrintf(-256.0f * input_output_scale);
6399 assert(multiplier <= -1L);
6400 assert(multiplier >= -32768L);
6401 const int32_t bias = ((int32_t) output_zero_point << 8) + (int32_t) multiplier * (int32_t) input_zero_point + INT32_C(0x80);
6402 for (uint32_t i = 0; i < 8; i++) {
6403 params->sse2.multiplier[i] = (int16_t) multiplier;
6404 }
6405 for (uint32_t i = 0; i < 4; i++) {
6406 params->sse2.bias[i] = bias;
6407 }
6408 return sizeof(params->sse2);
6409}
6410
6411size_t xnn_init_qs8_cvt_ssse3_params(
6412 union xnn_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
6413 float input_output_scale,
6414 int8_t input_zero_point,
6415 int8_t output_zero_point)
6416{
6417 assert(input_output_scale >= 0x1.0p-8);
6418 assert(input_output_scale <= 0x1.0p+7);
6419
6420 const long multiplier = lrintf(-256.0f * input_output_scale);
6421 assert(multiplier <= -1L);
6422 assert(multiplier >= -32768L);
6423 for (uint32_t i = 0; i < 8; i++) {
6424 params->ssse3.input_zero_point[i] = (int16_t) input_zero_point;
6425 params->ssse3.multiplier[i] = (int16_t) multiplier;
6426 params->ssse3.output_zero_point[i] = (int16_t) output_zero_point;
6427 }
6428 return sizeof(params->ssse3);
6429}
6430
6431size_t xnn_init_qs8_cvt_avx2_params(
6432 union xnn_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
6433 float input_output_scale,
6434 int8_t input_zero_point,
6435 int8_t output_zero_point)
6436{
6437 assert(input_output_scale >= 0x1.0p-8);
6438 assert(input_output_scale <= 0x1.0p+7);
6439
6440 const long multiplier = lrintf(-256.0f * input_output_scale);
6441 assert(multiplier <= -1L);
6442 assert(multiplier >= -32768L);
6443 for (uint32_t i = 0; i < 16; i++) {
6444 params->avx2.input_zero_point[i] = (int16_t) input_zero_point;
6445 params->avx2.multiplier[i] = (int16_t) multiplier;
6446 params->avx2.output_zero_point[i] = (int16_t) output_zero_point;
6447 }
6448 return sizeof(params->avx2);
6449}
6450#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
6451
6452#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
6453size_t xnn_init_qs8_cvt_wasmsimd_params(
6454 union xnn_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
6455 float input_output_scale,
6456 int8_t input_zero_point,
6457 int8_t output_zero_point)
6458{
6459 assert(input_output_scale >= 0x1.0p-8);
6460 assert(input_output_scale <= 0x1.0p+7);
6461
6462 const long multiplier = lrintf(-256.0f * input_output_scale);
6463 assert(multiplier <= -1L);
6464 assert(multiplier >= -32768L);
6465 for (uint32_t i = 0; i < 4; i++) {
6466 params->wasmsimd.input_zero_point[i] = (int16_t) input_zero_point;
6467 params->wasmsimd.multiplier[i] = (int16_t) multiplier;
6468 params->wasmsimd.output_zero_point[i] = (int16_t) output_zero_point;
6469 }
6470 return sizeof(params->wasmsimd);
6471}
6472#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
6473
6474size_t xnn_init_qs8_f32_cvt_scalar_params(
6475 union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
6476 float scale,
6477 int8_t zero_point)
6478{
6479 params->scalar.zero_point = (int32_t) zero_point;
6480 params->scalar.scale = scale;
6481 return sizeof(params->scalar);
6482}
6483
6484#if XNN_ARCH_ARM || XNN_ARCH_ARM64
6485size_t xnn_init_qs8_f32_cvt_neon_params(
6486 union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
6487 float scale,
6488 int8_t zero_point)
6489{
6490 params->neon.minus_zero_point[0] = -(int16_t) zero_point;
6491 params->neon.minus_zero_point[1] = -(int16_t) zero_point;
6492 params->neon.scale = scale;
6493 return sizeof(params->neon);
6494}
6495#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
6496
6497#if XNN_ARCH_X86 || XNN_ARCH_X86_64
6498size_t xnn_init_qs8_f32_cvt_sse2_params(
6499 union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
6500 float scale,
6501 int8_t zero_point)
6502{
6503 for (uint32_t i = 0; i < 16; i++) {
6504 params->sse2.sign_mask[i] = UINT8_C(0x80);
6505 }
6506 for (uint32_t i = 0; i < 8; i++) {
6507 params->sse2.magic_exp[i] = UINT16_C(0x4B00);
6508 }
6509 const float magic_bias = (float) (INT32_C(0x00800080) + (int32_t) zero_point);
6510 for (uint32_t i = 0; i < 4; i++) {
6511 params->sse2.magic_bias[i] = magic_bias;
6512 params->sse2.scale[i] = scale;
6513 }
6514 return sizeof(params->sse2);
6515}
6516
6517size_t xnn_init_qs8_f32_cvt_sse4_params(
6518 union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
6519 float scale,
6520 int8_t zero_point)
6521{
6522 for (uint32_t i = 0; i < 4; i++) {
6523 params->sse4.minus_zero_point[i] = -(int32_t) zero_point;
6524 params->sse4.scale[i] = scale;
6525 }
6526 return sizeof(params->sse4);
6527}
6528
6529size_t xnn_init_qs8_f32_cvt_avx_params(
6530 union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
6531 float scale,
6532 int8_t zero_point)
6533{
6534 for (uint32_t i = 0; i < 8; i++) {
6535 params->avx.minus_zero_point[i] = -(int32_t) zero_point;
6536 params->avx.scale[i] = scale;
6537 }
6538 return sizeof(params->avx);
6539}
6540
6541size_t xnn_init_qs8_f32_cvt_avx512_params(
6542 union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
6543 float scale,
6544 int8_t zero_point)
6545{
6546 for (uint32_t i = 0; i < 16; i++) {
6547 params->avx512.minus_zero_point[i] = -(int32_t) zero_point;
6548 params->avx512.scale[i] = scale;
6549 }
6550 return sizeof(params->avx512);
6551}
6552#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
6553
6554#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
6555size_t xnn_init_qs8_f32_cvt_wasmsimd_params(
6556 union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
6557 float scale,
6558 int8_t zero_point)
6559{
6560 for (uint32_t i = 0; i < 4; i++) {
6561 params->wasmsimd.minus_zero_point[i] = -(int16_t) zero_point;
6562 }
6563 for (uint32_t i = 0; i < 2; i++) {
6564 params->wasmsimd.scale[i] = scale;
6565 }
6566 return sizeof(params->wasmsimd);
6567}
6568#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
6569
6570size_t xnn_init_qu8_cvt_scalar_params(
6571 union xnn_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
6572 float input_output_scale,
6573 uint8_t input_zero_point,
6574 uint8_t output_zero_point)
6575{
6576 assert(input_output_scale >= 0x1.0p-8);
6577 assert(input_output_scale <= 0x1.0p+7);
6578
6579 const long multiplier = lrintf(256.0f * input_output_scale);
6580 assert(multiplier >= 1L);
6581 assert(multiplier <= 32768L);
6582 params->scalar.bias = ((int32_t) output_zero_point << 8) - (int32_t) multiplier * (int32_t) input_zero_point + INT32_C(0x80);
6583 params->scalar.multiplier = (int32_t) multiplier;
6584 return sizeof(params->scalar);
6585}
6586
6587#if XNN_ARCH_ARM
6588size_t xnn_init_qu8_cvt_armsimd32_params(
6589 union xnn_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
6590 float input_output_scale,
6591 uint8_t input_zero_point,
6592 uint8_t output_zero_point)
6593{
6594 assert(input_output_scale >= 0x1.0p-8);
6595 assert(input_output_scale <= 0x1.0p+7);
6596
6597 const long multiplier = lrintf(131072.0f * input_output_scale);
6598 assert(multiplier >= 512L);
6599 assert(multiplier <= 16777216L);
6600 const uint16_t minus_input_zero_point = -(int16_t) input_zero_point;
6601 params->armsimd32.minus_input_zero_point = (uint32_t) minus_input_zero_point * UINT32_C(0x00010001);
6602 params->armsimd32.multiplier = (int32_t) multiplier;
6603 params->armsimd32.bias = ((int32_t) output_zero_point << 1) + INT32_C(1);
6604 return sizeof(params->armsimd32);
6605}
6606#endif // XNN_ARCH_ARM
6607
6608#if XNN_ARCH_ARM || XNN_ARCH_ARM64
6609size_t xnn_init_qu8_cvt_neon_params(
6610 union xnn_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
6611 float input_output_scale,
6612 uint8_t input_zero_point,
6613 uint8_t output_zero_point)
6614{
6615 assert(input_output_scale >= 0x1.0p-8);
6616 assert(input_output_scale <= 0x1.0p+7);
6617
6618 const long multiplier = lrintf(-256.0f * input_output_scale);
6619 assert(multiplier <= -1L);
6620 assert(multiplier >= -32768L);
6621 params->neon.input_zero_point = (uint16_t) input_zero_point;
6622 params->neon.multiplier = (int16_t) multiplier;
6623 params->neon.output_zero_point = (int16_t) output_zero_point;
6624 return sizeof(params->neon);
6625}
6626#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
6627
6628#if XNN_ARCH_X86 || XNN_ARCH_X86_64
6629size_t xnn_init_qu8_cvt_sse2_params(
6630 union xnn_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
6631 float input_output_scale,
6632 uint8_t input_zero_point,
6633 uint8_t output_zero_point)
6634{
6635 assert(input_output_scale >= 0x1.0p-8);
6636 assert(input_output_scale <= 0x1.0p+7);
6637
6638 const long multiplier = lrintf(256.0f * input_output_scale);
6639 assert(multiplier >= 1L);
6640 assert(multiplier <= 32768L);
6641 const int32_t bias = ((int32_t) output_zero_point << 8) - (int32_t) multiplier * (int32_t) input_zero_point + INT32_C(0x80);
6642 for (uint32_t i = 0; i < 8; i++) {
6643 params->sse2.multiplier[i] = (uint16_t) multiplier;
6644 }
6645 for (uint32_t i = 0; i < 4; i++) {
6646 params->sse2.bias[i] = bias;
6647 }
6648 return sizeof(params->sse2);
6649}
6650
6651size_t xnn_init_qu8_cvt_ssse3_params(
6652 union xnn_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
6653 float input_output_scale,
6654 uint8_t input_zero_point,
6655 uint8_t output_zero_point)
6656{
6657 assert(input_output_scale >= 0x1.0p-8);
6658 assert(input_output_scale <= 0x1.0p+7);
6659
6660 const long multiplier = lrintf(-256.0f * input_output_scale);
6661 assert(multiplier <= -1L);
6662 assert(multiplier >= -32768L);
6663 for (uint32_t i = 0; i < 8; i++) {
6664 params->ssse3.input_zero_point[i] = (uint16_t) input_zero_point;
6665 params->ssse3.multiplier[i] = (int16_t) multiplier;
6666 params->ssse3.output_zero_point[i] = (int16_t) output_zero_point;
6667 }
6668 return sizeof(params->ssse3);
6669}
6670
6671size_t xnn_init_qu8_cvt_avx2_params(
6672 union xnn_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
6673 float input_output_scale,
6674 uint8_t input_zero_point,
6675 uint8_t output_zero_point)
6676{
6677 assert(input_output_scale >= 0x1.0p-8);
6678 assert(input_output_scale <= 0x1.0p+7);
6679
6680 const long multiplier = lrintf(-256.0f * input_output_scale);
6681 assert(multiplier <= -1L);
6682 assert(multiplier >= -32768L);
6683 for (uint32_t i = 0; i < 16; i++) {
6684 params->avx2.input_zero_point[i] = (uint16_t) input_zero_point;
6685 params->avx2.multiplier[i] = (int16_t) multiplier;
6686 params->avx2.output_zero_point[i] = (int16_t) output_zero_point;
6687 }
6688 return sizeof(params->avx2);
6689}
6690#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
6691
6692#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
6693size_t xnn_init_qu8_cvt_wasmsimd_params(
6694 union xnn_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
6695 float input_output_scale,
6696 uint8_t input_zero_point,
6697 uint8_t output_zero_point)
6698{
6699 assert(input_output_scale >= 0x1.0p-8);
6700 assert(input_output_scale <= 0x1.0p+7);
6701
6702 const long multiplier = lrintf(-256.0f * input_output_scale);
6703 assert(multiplier <= -1L);
6704 assert(multiplier >= -32768L);
6705 for (uint32_t i = 0; i < 4; i++) {
6706 params->wasmsimd.input_zero_point[i] = (uint16_t) input_zero_point;
6707 params->wasmsimd.multiplier[i] = (int16_t) multiplier;
6708 params->wasmsimd.output_zero_point[i] = (int16_t) output_zero_point;
6709 }
6710 return sizeof(params->wasmsimd);
6711}
6712#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
6713
6714size_t xnn_init_qu8_f32_cvt_scalar_params(
6715 union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
6716 float scale,
6717 uint8_t zero_point)
6718{
6719 params->scalar.zero_point = (int32_t) zero_point;
6720 params->scalar.scale = scale;
6721 return sizeof(params->scalar);
6722}
6723
6724#if XNN_ARCH_ARM || XNN_ARCH_ARM64
6725size_t xnn_init_qu8_f32_cvt_neon_params(
6726 union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
6727 float scale,
6728 uint8_t zero_point)
6729{
6730 params->neon.minus_zero_point[0] = -(int16_t) zero_point;
6731 params->neon.minus_zero_point[1] = -(int16_t) zero_point;
6732 params->neon.scale = scale;
6733 return sizeof(params->neon);
6734}
6735
6736#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
6737
6738#if XNN_ARCH_X86 || XNN_ARCH_X86_64
6739size_t xnn_init_x24_transpose_ssse3_params(union xnn_x24_transpose_params params[XNN_MIN_ELEMENTS(1)]) {
6740 memset(&params->ssse3.pos0, -1, sizeof(params->ssse3.pos0));
6741 memset(&params->ssse3.pos1, -1, sizeof(params->ssse3.pos1));
6742 memset(&params->ssse3.pos2, -1, sizeof(params->ssse3.pos2));
6743 memset(&params->ssse3.pos3, -1, sizeof(params->ssse3.pos3));
6744 memset(&params->ssse3.pos4, -1, sizeof(params->ssse3.pos4));
6745 memset(&params->ssse3.pos5, -1, sizeof(params->ssse3.pos5));
6746 params->ssse3.pos0[0] = 0;
6747 params->ssse3.pos0[1] = 4;
6748 params->ssse3.pos0[2] = 8;
6749 params->ssse3.pos0[3] = 2;
6750 params->ssse3.pos0[4] = 6;
6751 params->ssse3.pos0[5] = 10;
6752 params->ssse3.pos0[6] = 1;
6753 params->ssse3.pos0[7] = 5;
6754 params->ssse3.pos0[8] = 9;
6755 params->ssse3.pos0[9] = 3;
6756 params->ssse3.pos0[10] = 7;
6757 params->ssse3.pos0[11] = 11;
6758
6759 params->ssse3.pos1[0] = 4;
6760 params->ssse3.pos1[1] = 8;
6761 params->ssse3.pos1[2] = 12;
6762 params->ssse3.pos1[3] = 6;
6763 params->ssse3.pos1[4] = 10;
6764 params->ssse3.pos1[5] = 14;
6765 params->ssse3.pos1[6] = 5;
6766 params->ssse3.pos1[7] = 9;
6767 params->ssse3.pos1[8] = 13;
6768 params->ssse3.pos1[9] = 7;
6769 params->ssse3.pos1[10] = 11;
6770 params->ssse3.pos1[11] = 15;
6771
6772 params->ssse3.pos2[0] = 12;
6773 params->ssse3.pos2[3] = 14;
6774 params->ssse3.pos2[6] = 13;
6775 params->ssse3.pos2[9] = 15;
6776
6777 params->ssse3.pos3[1] = 0;
6778 params->ssse3.pos3[2] = 4;
6779 params->ssse3.pos3[4] = 2;
6780 params->ssse3.pos3[5] = 6;
6781 params->ssse3.pos3[7] = 1;
6782 params->ssse3.pos3[8] = 5;
6783 params->ssse3.pos3[10] = 3;
6784 params->ssse3.pos3[11] = 7;
6785
6786 params->ssse3.pos4[0] = 8;
6787 params->ssse3.pos4[1] = 12;
6788 params->ssse3.pos4[3] = 10;
6789 params->ssse3.pos4[4] = 14;
6790 params->ssse3.pos4[6] = 9;
6791 params->ssse3.pos4[7] = 13;
6792 params->ssse3.pos4[9] = 11;
6793 params->ssse3.pos4[10] = 15;
6794
6795 params->ssse3.pos5[2] = 0;
6796 params->ssse3.pos5[5] = 2;
6797 params->ssse3.pos5[8] = 1;
6798 params->ssse3.pos5[11] = 3;
6799 return sizeof(params->ssse3);
6800}
6801
6802size_t xnn_init_x8_transpose_avx2_params(union xnn_x8_transpose_params params[XNN_MIN_ELEMENTS(1)]) {
6803 memset(&params->avx2.mask_table[0], -1, sizeof(uint32_t) * 8);
6804 memset(&params->avx2.mask_table[8], 0, sizeof(uint32_t) * 7);
6805 return sizeof(params->avx2);
6806}
6807
6808size_t xnn_init_x16_transpose_avx2_params(union xnn_x16_transpose_params params[XNN_MIN_ELEMENTS(1)]) {
6809 memset(&params->avx2.mask_table[0], -1, sizeof(uint32_t) * 8);
6810 memset(&params->avx2.mask_table[8], 0, sizeof(uint32_t) * 7);
6811 return sizeof(params->avx2);
6812}
6813
6814size_t xnn_init_x32_transpose_avx_params(union xnn_x32_transpose_params params[XNN_MIN_ELEMENTS(1)]) {
6815 memset(&params->avx.mask_table[0], -1, sizeof(uint32_t) * 8);
6816 memset(&params->avx.mask_table[8], 0, sizeof(uint32_t) * 7);
6817 return sizeof(params->avx);
6818}
6819
6820size_t xnn_init_x64_transpose_avx_params(union xnn_x64_transpose_params params[XNN_MIN_ELEMENTS(1)]) {
6821 memset(&params->avx.mask_table[0], -1, sizeof(uint64_t) * 4);
6822 memset(&params->avx.mask_table[4], 0, sizeof(uint64_t) * 3);
6823 return sizeof(params->avx);
6824}
6825#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
6826
6827#if XNN_ARCH_ARM || XNN_ARCH_ARM64
6828size_t xnn_init_x24_transpose_neon_tbl64_params(union xnn_x24_transpose_params params[XNN_MIN_ELEMENTS(1)]) {
6829 memset(&params->neon_tbl64.pos0, 0, sizeof(params->neon_tbl64.pos0));
6830 memset(&params->neon_tbl64.pos1, 0, sizeof(params->neon_tbl64.pos1));
6831 params->neon_tbl64.pos0[0] = 0;
6832 params->neon_tbl64.pos0[1] = 1;
6833 params->neon_tbl64.pos0[2] = 2;
6834 params->neon_tbl64.pos0[3] = 8;
6835 params->neon_tbl64.pos0[4] = 9;
6836 params->neon_tbl64.pos0[5] = 10;
6837 params->neon_tbl64.pos1[0] = 3;
6838 params->neon_tbl64.pos1[1] = 4;
6839 params->neon_tbl64.pos1[2] = 5;
6840 params->neon_tbl64.pos1[3] = 11;
6841 params->neon_tbl64.pos1[4] = 12;
6842 params->neon_tbl64.pos1[5] = 13;
6843 return sizeof(params->neon_tbl64);
6844}
6845#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
6846
6847#if XNN_ARCH_ARM64
6848size_t xnn_init_x24_transpose_neon_tbl128_params(union xnn_x24_transpose_params params[XNN_MIN_ELEMENTS(1)]) {
6849 memset(&params->neon_tbl128.pos0, 0, sizeof(params->neon_tbl128.pos0));
6850 memset(&params->neon_tbl128.pos1, 0, sizeof(params->neon_tbl128.pos1));
6851 memset(&params->neon_tbl128.pos2, 0, sizeof(params->neon_tbl128.pos2));
6852 memset(&params->neon_tbl128.pos3, 0, sizeof(params->neon_tbl128.pos3));
6853 params->neon_tbl128.pos0[0] = 0;
6854 params->neon_tbl128.pos0[1] = 1;
6855 params->neon_tbl128.pos0[2] = 2;
6856 params->neon_tbl128.pos0[3] = 16;
6857 params->neon_tbl128.pos0[4] = 17;
6858 params->neon_tbl128.pos0[5] = 18;
6859 params->neon_tbl128.pos0[6] = 32;
6860 params->neon_tbl128.pos0[7] = 33;
6861 params->neon_tbl128.pos0[8] = 34;
6862 params->neon_tbl128.pos0[9] = 48;
6863 params->neon_tbl128.pos0[10] = 49;
6864 params->neon_tbl128.pos0[11] = 50;
6865 params->neon_tbl128.pos1[0] = 3;
6866 params->neon_tbl128.pos1[1] = 4;
6867 params->neon_tbl128.pos1[2] = 5;
6868 params->neon_tbl128.pos1[3] = 19;
6869 params->neon_tbl128.pos1[4] = 20;
6870 params->neon_tbl128.pos1[5] = 21;
6871 params->neon_tbl128.pos1[6] = 35;
6872 params->neon_tbl128.pos1[7] = 36;
6873 params->neon_tbl128.pos1[8] = 37;
6874 params->neon_tbl128.pos1[9] = 51;
6875 params->neon_tbl128.pos1[10] = 52;
6876 params->neon_tbl128.pos1[11] = 53;
6877 params->neon_tbl128.pos2[0] = 6;
6878 params->neon_tbl128.pos2[1] = 7;
6879 params->neon_tbl128.pos2[2] = 8;
6880 params->neon_tbl128.pos2[3] = 22;
6881 params->neon_tbl128.pos2[4] = 23;
6882 params->neon_tbl128.pos2[5] = 24;
6883 params->neon_tbl128.pos2[6] = 38;
6884 params->neon_tbl128.pos2[7] = 39;
6885 params->neon_tbl128.pos2[8] = 40;
6886 params->neon_tbl128.pos2[9] = 54;
6887 params->neon_tbl128.pos2[10] = 55;
6888 params->neon_tbl128.pos2[11] = 56;
6889 params->neon_tbl128.pos3[0] = 9;
6890 params->neon_tbl128.pos3[1] = 10;
6891 params->neon_tbl128.pos3[2] = 11;
6892 params->neon_tbl128.pos3[3] = 25;
6893 params->neon_tbl128.pos3[4] = 26;
6894 params->neon_tbl128.pos3[5] = 27;
6895 params->neon_tbl128.pos3[6] = 41;
6896 params->neon_tbl128.pos3[7] = 42;
6897 params->neon_tbl128.pos3[8] = 43;
6898 params->neon_tbl128.pos3[9] = 57;
6899 params->neon_tbl128.pos3[10] = 58;
6900 params->neon_tbl128.pos3[11] = 59;
6901 return sizeof(params->neon_tbl128);
6902}
6903
6904size_t xnn_init_x32_transpose_neon_tbl128_params(union xnn_x32_transpose_params params[XNN_MIN_ELEMENTS(1)]) {
6905 params->neon_tbl128.pos0[0] = 0;
6906 params->neon_tbl128.pos0[1] = 1;
6907 params->neon_tbl128.pos0[2] = 2;
6908 params->neon_tbl128.pos0[3] = 3;
6909 params->neon_tbl128.pos0[4] = 16;
6910 params->neon_tbl128.pos0[5] = 17;
6911 params->neon_tbl128.pos0[6] = 18;
6912 params->neon_tbl128.pos0[7] = 19;
6913 params->neon_tbl128.pos0[8] = 32;
6914 params->neon_tbl128.pos0[9] = 33;
6915 params->neon_tbl128.pos0[10] = 34;
6916 params->neon_tbl128.pos0[11] = 35;
6917 params->neon_tbl128.pos0[12] = 48;
6918 params->neon_tbl128.pos0[13] = 49;
6919 params->neon_tbl128.pos0[14] = 50;
6920 params->neon_tbl128.pos0[15] = 51;
6921 params->neon_tbl128.pos1[0] = 4;
6922 params->neon_tbl128.pos1[1] = 5;
6923 params->neon_tbl128.pos1[2] = 6;
6924 params->neon_tbl128.pos1[3] = 7;
6925 params->neon_tbl128.pos1[4] = 20;
6926 params->neon_tbl128.pos1[5] = 21;
6927 params->neon_tbl128.pos1[6] = 22;
6928 params->neon_tbl128.pos1[7] = 23;
6929 params->neon_tbl128.pos1[8] = 36;
6930 params->neon_tbl128.pos1[9] = 37;
6931 params->neon_tbl128.pos1[10] = 38;
6932 params->neon_tbl128.pos1[11] = 39;
6933 params->neon_tbl128.pos1[12] = 52;
6934 params->neon_tbl128.pos1[13] = 53;
6935 params->neon_tbl128.pos1[14] = 54;
6936 params->neon_tbl128.pos1[15] = 55;
6937 params->neon_tbl128.pos2[0] = 8;
6938 params->neon_tbl128.pos2[1] = 9;
6939 params->neon_tbl128.pos2[2] = 10;
6940 params->neon_tbl128.pos2[3] = 11;
6941 params->neon_tbl128.pos2[4] = 24;
6942 params->neon_tbl128.pos2[5] = 25;
6943 params->neon_tbl128.pos2[6] = 26;
6944 params->neon_tbl128.pos2[7] = 27;
6945 params->neon_tbl128.pos2[8] = 40;
6946 params->neon_tbl128.pos2[9] = 41;
6947 params->neon_tbl128.pos2[10] = 42;
6948 params->neon_tbl128.pos2[11] = 43;
6949 params->neon_tbl128.pos2[12] = 56;
6950 params->neon_tbl128.pos2[13] = 57;
6951 params->neon_tbl128.pos2[14] = 58;
6952 params->neon_tbl128.pos2[15] = 59;
6953 params->neon_tbl128.pos3[0] = 12;
6954 params->neon_tbl128.pos3[1] = 13;
6955 params->neon_tbl128.pos3[2] = 14;
6956 params->neon_tbl128.pos3[3] = 15;
6957 params->neon_tbl128.pos3[4] = 28;
6958 params->neon_tbl128.pos3[5] = 29;
6959 params->neon_tbl128.pos3[6] = 30;
6960 params->neon_tbl128.pos3[7] = 31;
6961 params->neon_tbl128.pos3[8] = 44;
6962 params->neon_tbl128.pos3[9] = 45;
6963 params->neon_tbl128.pos3[10] = 46;
6964 params->neon_tbl128.pos3[11] = 47;
6965 params->neon_tbl128.pos3[12] = 60;
6966 params->neon_tbl128.pos3[13] = 61;
6967 params->neon_tbl128.pos3[14] = 62;
6968 params->neon_tbl128.pos3[15] = 63;
6969 return sizeof(params->neon_tbl128);
6970}
6971#endif // XNN_ARCH_ARM64
6972
6973#if XNN_ARCH_X86 || XNN_ARCH_X86_64
6974size_t xnn_init_qu8_f32_cvt_sse2_params(
6975 union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
6976 float scale,
6977 uint8_t zero_point)
6978{
6979 for (uint32_t i = 0; i < 8; i++) {
6980 params->sse2.magic_exp[i] = UINT16_C(0x4B00);
6981 }
6982 const float magic_bias = (float) (INT32_C(0x00800000) + (int32_t) zero_point);
6983 for (uint32_t i = 0; i < 4; i++) {
6984 params->sse2.magic_bias[i] = magic_bias;
6985 params->sse2.scale[i] = scale;
6986 }
6987 return sizeof(params->sse2);
6988}
6989
6990size_t xnn_init_qu8_f32_cvt_sse4_params(
6991 union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
6992 float scale,
6993 uint8_t zero_point)
6994{
6995 for (uint32_t i = 0; i < 4; i++) {
6996 params->sse4.minus_zero_point[i] = -(int32_t) zero_point;
6997 params->sse4.scale[i] = scale;
6998 }
6999 return sizeof(params->sse4);
7000}
7001
7002size_t xnn_init_qu8_f32_cvt_avx_params(
7003 union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
7004 float scale,
7005 uint8_t zero_point)
7006{
7007 for (uint32_t i = 0; i < 8; i++) {
7008 params->avx.minus_zero_point[i] = -(int32_t) zero_point;
7009 params->avx.scale[i] = scale;
7010 }
7011 return sizeof(params->avx);
7012}
7013
7014size_t xnn_init_qu8_f32_cvt_avx512_params(
7015 union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
7016 float scale,
7017 uint8_t zero_point)
7018{
7019 for (uint32_t i = 0; i < 16; i++) {
7020 params->avx512.minus_zero_point[i] = -(int32_t) zero_point;
7021 params->avx512.scale[i] = scale;
7022 }
7023 return sizeof(params->avx512);
7024}
7025#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
7026
7027#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
7028size_t xnn_init_qu8_f32_cvt_wasmsimd_params(
7029 union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
7030 float scale,
7031 uint8_t zero_point)
7032{
7033 for (uint32_t i = 0; i < 4; i++) {
7034 params->wasmsimd.minus_zero_point[i] = -(int16_t) zero_point;
7035 }
7036 for (uint32_t i = 0; i < 2; i++) {
7037 params->wasmsimd.scale[i] = scale;
7038 }
7039 return sizeof(params->wasmsimd);
7040}
7041#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
7042