1#pragma once
2#ifndef PSIMD_H
3#define PSIMD_H
4
5#if defined(__CUDA_ARCH__)
6 /* CUDA compiler */
7 #define PSIMD_INTRINSIC __forceinline__ __device__
8#elif defined(__OPENCL_VERSION__)
9 /* OpenCL compiler */
10 #define PSIMD_INTRINSIC inline static
11#elif defined(__INTEL_COMPILER)
12 /* Intel compiler, even on Windows */
13 #define PSIMD_INTRINSIC inline static __attribute__((__always_inline__))
14#elif defined(__GNUC__)
15 /* GCC-compatible compiler (gcc/clang/icc) */
16 #define PSIMD_INTRINSIC inline static __attribute__((__always_inline__))
17#elif defined(_MSC_VER)
18 /* MSVC-compatible compiler (cl/icl/clang-cl) */
19 #define PSIMD_INTRINSIC __forceinline static
20#elif defined(__cplusplus)
21 /* Generic C++ compiler */
22 #define PSIMD_INTRINSIC inline static
23#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
24 /* Generic C99 compiler */
25 #define PSIMD_INTRINSIC inline static
26#else
27 /* Generic C compiler */
28 #define PSIMD_INTRINSIC static
29#endif
30
31#if defined(__GNUC__) || defined(__clang__)
32 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
33 #include <arm_neon.h>
34 #endif
35
36 #if defined(__SSE2__)
37 #include <emmintrin.h>
38 #endif
39
40 #if defined(__SSE3__)
41 #include <pmmintrin.h>
42 #endif
43
44 #if defined(__SSSE3__)
45 #include <tmmintrin.h>
46 #endif
47
48 #if defined(__SSE4_1__)
49 #include <smmintrin.h>
50 #endif
51
52 #if defined(__SSE4_2__)
53 #include <nmmintrin.h>
54 #endif
55
56 #if defined(__AVX__)
57 #include <immintrin.h>
58 #endif
59#elif defined(_MSC_VER)
60 #include <intrin.h>
61#endif
62
63#if defined(__cplusplus)
64 #define PSIMD_CXX_SYNTAX
65#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
66 #define PSIMD_C11_SYNTAX
67#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
68 #define PSIMD_C99_SYNTAX
69#else
70 #define PSIMD_C89_SYNTAX
71#endif
72
73#if defined(__cplusplus) && (__cplusplus >= 201103L)
74 #include <cstddef>
75 #include <cstdint>
76#elif !defined(__OPENCL_VERSION__)
77 #include <stddef.h>
78 #include <stdint.h>
79#endif
80
81#if defined(__GNUC__) || defined(__clang__)
	/*
	 * Lane-type availability flags: this header provides 128-bit vectors of
	 * 8/16/32-bit integers and 32-bit floats; 64-bit lane types are absent.
	 */
	#define PSIMD_HAVE_F64 0
	#define PSIMD_HAVE_F32 1
	#define PSIMD_HAVE_U8 1
	#define PSIMD_HAVE_S8 1
	#define PSIMD_HAVE_U16 1
	#define PSIMD_HAVE_S16 1
	#define PSIMD_HAVE_U32 1
	#define PSIMD_HAVE_S32 1
	#define PSIMD_HAVE_U64 0
	#define PSIMD_HAVE_S64 0

	/*
	 * 128-bit vector types built on the GCC/Clang vector_size extension.
	 * The aligned(N) attribute lowers each type's alignment to its element
	 * size (not 16 bytes), so the load/store helpers below only require
	 * element-aligned addresses.
	 */
	typedef int8_t psimd_s8 __attribute__((vector_size(16), aligned(1)));
	typedef uint8_t psimd_u8 __attribute__((vector_size(16), aligned(1)));
	typedef int16_t psimd_s16 __attribute__((vector_size(16), aligned(2)));
	typedef uint16_t psimd_u16 __attribute__((vector_size(16), aligned(2)));
	typedef int32_t psimd_s32 __attribute__((vector_size(16), aligned(4)));
	typedef uint32_t psimd_u32 __attribute__((vector_size(16), aligned(4)));
	typedef float psimd_f32 __attribute__((vector_size(16), aligned(4)));

	/* Double-width (256-bit) types, represented as lo/hi 128-bit halves. */
	typedef struct {
		psimd_s8 lo;
		psimd_s8 hi;
	} psimd_s8x2;

	typedef struct {
		psimd_u8 lo;
		psimd_u8 hi;
	} psimd_u8x2;

	typedef struct {
		psimd_s16 lo;
		psimd_s16 hi;
	} psimd_s16x2;

	typedef struct {
		psimd_u16 lo;
		psimd_u16 hi;
	} psimd_u16x2;

	typedef struct {
		psimd_s32 lo;
		psimd_s32 hi;
	} psimd_s32x2;

	typedef struct {
		psimd_u32 lo;
		psimd_u32 hi;
	} psimd_u32x2;

	typedef struct {
		psimd_f32 lo;
		psimd_f32 hi;
	} psimd_f32x2;
135
136 /* Bit casts */
137 PSIMD_INTRINSIC psimd_u32x2 psimd_cast_s32x2_u32x2(psimd_s32x2 v) {
138 return (psimd_u32x2) { .lo = (psimd_u32) v.lo, .hi = (psimd_u32) v.hi };
139 }
140
141 PSIMD_INTRINSIC psimd_f32x2 psimd_cast_s32x2_f32x2(psimd_s32x2 v) {
142 return (psimd_f32x2) { .lo = (psimd_f32) v.lo, .hi = (psimd_f32) v.hi };
143 }
144
145 PSIMD_INTRINSIC psimd_s32x2 psimd_cast_u32x2_s32x2(psimd_u32x2 v) {
146 return (psimd_s32x2) { .lo = (psimd_s32) v.lo, .hi = (psimd_s32) v.hi };
147 }
148
149 PSIMD_INTRINSIC psimd_f32x2 psimd_cast_u32x2_f32x2(psimd_u32x2 v) {
150 return (psimd_f32x2) { .lo = (psimd_f32) v.lo, .hi = (psimd_f32) v.hi };
151 }
152
153 PSIMD_INTRINSIC psimd_s32x2 psimd_cast_f32x2_s32x2(psimd_f32x2 v) {
154 return (psimd_s32x2) { .lo = (psimd_s32) v.lo, .hi = (psimd_s32) v.hi };
155 }
156
157 PSIMD_INTRINSIC psimd_u32x2 psimd_cast_f32x2_u32x2(psimd_f32x2 v) {
158 return (psimd_u32x2) { .lo = (psimd_u32) v.lo, .hi = (psimd_u32) v.hi };
159 }
160
161 /* Swap */
162 PSIMD_INTRINSIC void psimd_swap_s8(psimd_s8 a[1], psimd_s8 b[1]) {
163 const psimd_s8 new_a = *b;
164 const psimd_s8 new_b = *a;
165 *a = new_a;
166 *b = new_b;
167 }
168
169 PSIMD_INTRINSIC void psimd_swap_u8(psimd_u8 a[1], psimd_u8 b[1]) {
170 const psimd_u8 new_a = *b;
171 const psimd_u8 new_b = *a;
172 *a = new_a;
173 *b = new_b;
174 }
175
176 PSIMD_INTRINSIC void psimd_swap_s16(psimd_s16 a[1], psimd_s16 b[1]) {
177 const psimd_s16 new_a = *b;
178 const psimd_s16 new_b = *a;
179 *a = new_a;
180 *b = new_b;
181 }
182
183 PSIMD_INTRINSIC void psimd_swap_u16(psimd_u16 a[1], psimd_u16 b[1]) {
184 const psimd_u16 new_a = *b;
185 const psimd_u16 new_b = *a;
186 *a = new_a;
187 *b = new_b;
188 }
189
190 PSIMD_INTRINSIC void psimd_swap_s32(psimd_s32 a[1], psimd_s32 b[1]) {
191 const psimd_s32 new_a = *b;
192 const psimd_s32 new_b = *a;
193 *a = new_a;
194 *b = new_b;
195 }
196
197 PSIMD_INTRINSIC void psimd_swap_u32(psimd_u32 a[1], psimd_u32 b[1]) {
198 const psimd_u32 new_a = *b;
199 const psimd_u32 new_b = *a;
200 *a = new_a;
201 *b = new_b;
202 }
203
204 PSIMD_INTRINSIC void psimd_swap_f32(psimd_f32 a[1], psimd_f32 b[1]) {
205 const psimd_f32 new_a = *b;
206 const psimd_f32 new_b = *a;
207 *a = new_a;
208 *b = new_b;
209 }
210
	/* Zero-initialization: returns a vector with every lane set to zero. */
	PSIMD_INTRINSIC psimd_s8 psimd_zero_s8(void) {
		return (psimd_s8) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
	}

	PSIMD_INTRINSIC psimd_u8 psimd_zero_u8(void) {
		return (psimd_u8) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
	}

	PSIMD_INTRINSIC psimd_s16 psimd_zero_s16(void) {
		return (psimd_s16) { 0, 0, 0, 0, 0, 0, 0, 0 };
	}

	PSIMD_INTRINSIC psimd_u16 psimd_zero_u16(void) {
		return (psimd_u16) { 0, 0, 0, 0, 0, 0, 0, 0 };
	}

	PSIMD_INTRINSIC psimd_s32 psimd_zero_s32(void) {
		return (psimd_s32) { 0, 0, 0, 0 };
	}

	PSIMD_INTRINSIC psimd_u32 psimd_zero_u32(void) {
		return (psimd_u32) { 0, 0, 0, 0 };
	}

	PSIMD_INTRINSIC psimd_f32 psimd_zero_f32(void) {
		return (psimd_f32) { 0.0f, 0.0f, 0.0f, 0.0f };
	}

	/* Initialization to the same constant: broadcasts scalar c into every lane. */
	PSIMD_INTRINSIC psimd_s8 psimd_splat_s8(int8_t c) {
		return (psimd_s8) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c };
	}

	PSIMD_INTRINSIC psimd_u8 psimd_splat_u8(uint8_t c) {
		return (psimd_u8) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c };
	}

	PSIMD_INTRINSIC psimd_s16 psimd_splat_s16(int16_t c) {
		return (psimd_s16) { c, c, c, c, c, c, c, c };
	}

	PSIMD_INTRINSIC psimd_u16 psimd_splat_u16(uint16_t c) {
		return (psimd_u16) { c, c, c, c, c, c, c, c };
	}

	PSIMD_INTRINSIC psimd_s32 psimd_splat_s32(int32_t c) {
		return (psimd_s32) { c, c, c, c };
	}

	PSIMD_INTRINSIC psimd_u32 psimd_splat_u32(uint32_t c) {
		return (psimd_u32) { c, c, c, c };
	}

	PSIMD_INTRINSIC psimd_f32 psimd_splat_f32(float c) {
		return (psimd_f32) { c, c, c, c };
	}
268
	/* Load vector */

	/* Full 16-byte loads. The vector typedefs carry element alignment, so
	   address only needs to be element-aligned, not 16-byte-aligned. */
	PSIMD_INTRINSIC psimd_s8 psimd_load_s8(const void* address) {
		return *((const psimd_s8*) address);
	}

	PSIMD_INTRINSIC psimd_u8 psimd_load_u8(const void* address) {
		return *((const psimd_u8*) address);
	}

	PSIMD_INTRINSIC psimd_s16 psimd_load_s16(const void* address) {
		return *((const psimd_s16*) address);
	}

	PSIMD_INTRINSIC psimd_u16 psimd_load_u16(const void* address) {
		return *((const psimd_u16*) address);
	}

	PSIMD_INTRINSIC psimd_s32 psimd_load_s32(const void* address) {
		return *((const psimd_s32*) address);
	}

	PSIMD_INTRINSIC psimd_u32 psimd_load_u32(const void* address) {
		return *((const psimd_u32*) address);
	}

	PSIMD_INTRINSIC psimd_f32 psimd_load_f32(const void* address) {
		return *((const psimd_f32*) address);
	}

	/* Loads a single scalar and broadcasts it to every lane. */
	PSIMD_INTRINSIC psimd_s8 psimd_load_splat_s8(const void* address) {
		return psimd_splat_s8(*((const int8_t*) address));
	}

	PSIMD_INTRINSIC psimd_u8 psimd_load_splat_u8(const void* address) {
		return psimd_splat_u8(*((const uint8_t*) address));
	}

	PSIMD_INTRINSIC psimd_s16 psimd_load_splat_s16(const void* address) {
		return psimd_splat_s16(*((const int16_t*) address));
	}

	PSIMD_INTRINSIC psimd_u16 psimd_load_splat_u16(const void* address) {
		return psimd_splat_u16(*((const uint16_t*) address));
	}

	PSIMD_INTRINSIC psimd_s32 psimd_load_splat_s32(const void* address) {
		return psimd_splat_s32(*((const int32_t*) address));
	}

	PSIMD_INTRINSIC psimd_u32 psimd_load_splat_u32(const void* address) {
		return psimd_splat_u32(*((const uint32_t*) address));
	}

	PSIMD_INTRINSIC psimd_f32 psimd_load_splat_f32(const void* address) {
		return psimd_splat_f32(*((const float*) address));
	}

	/* Partial loads: loadN reads exactly N contiguous 32-bit elements into
	   the low lanes and zero-fills the remaining lanes. */
	PSIMD_INTRINSIC psimd_s32 psimd_load1_s32(const void* address) {
		return (psimd_s32) { *((const int32_t*) address), 0, 0, 0 };
	}

	PSIMD_INTRINSIC psimd_u32 psimd_load1_u32(const void* address) {
		return (psimd_u32) { *((const uint32_t*) address), 0, 0, 0 };
	}

	PSIMD_INTRINSIC psimd_f32 psimd_load1_f32(const void* address) {
		return (psimd_f32) { *((const float*) address), 0.0f, 0.0f, 0.0f };
	}

	PSIMD_INTRINSIC psimd_s32 psimd_load2_s32(const void* address) {
		const int32_t* address_s32 = (const int32_t*) address;
		return (psimd_s32) { address_s32[0], address_s32[1], 0, 0 };
	}

	PSIMD_INTRINSIC psimd_u32 psimd_load2_u32(const void* address) {
		const uint32_t* address_u32 = (const uint32_t*) address;
		return (psimd_u32) { address_u32[0], address_u32[1], 0, 0 };
	}

	PSIMD_INTRINSIC psimd_f32 psimd_load2_f32(const void* address) {
		const float* address_f32 = (const float*) address;
		return (psimd_f32) { address_f32[0], address_f32[1], 0.0f, 0.0f };
	}

	PSIMD_INTRINSIC psimd_s32 psimd_load3_s32(const void* address) {
		const int32_t* address_s32 = (const int32_t*) address;
		return (psimd_s32) { address_s32[0], address_s32[1], address_s32[2], 0 };
	}

	PSIMD_INTRINSIC psimd_u32 psimd_load3_u32(const void* address) {
		const uint32_t* address_u32 = (const uint32_t*) address;
		return (psimd_u32) { address_u32[0], address_u32[1], address_u32[2], 0 };
	}

	PSIMD_INTRINSIC psimd_f32 psimd_load3_f32(const void* address) {
		const float* address_f32 = (const float*) address;
		return (psimd_f32) { address_f32[0], address_f32[1], address_f32[2], 0.0f };
	}

	/* load4 is the full-vector load, provided for API symmetry with load1-3. */
	PSIMD_INTRINSIC psimd_s32 psimd_load4_s32(const void* address) {
		return psimd_load_s32(address);
	}

	PSIMD_INTRINSIC psimd_u32 psimd_load4_u32(const void* address) {
		return psimd_load_u32(address);
	}

	PSIMD_INTRINSIC psimd_f32 psimd_load4_f32(const void* address) {
		return psimd_load_f32(address);
	}
379
	/*
	 * Gathers 4 floats at stride 2 (element offsets 0, 2, 4, 6) using two
	 * overlapping full-vector loads plus one shuffle.
	 *
	 * v0x1x holds elements 0..3 and vx2x3 holds elements 3..6, so shuffle
	 * indices {0, 2, 5, 7} into the 8-lane concatenation select offsets
	 * 0, 2, 4 and 6. Requires elements 0..6 (7 floats) to be readable.
	 */
	PSIMD_INTRINSIC psimd_f32 psimd_load_stride2_f32(const void* address) {
		const psimd_f32 v0x1x = psimd_load_f32(address);
		const psimd_f32 vx2x3 = psimd_load_f32((const float*) address + 3);
		#if defined(__clang__)
			return __builtin_shufflevector(v0x1x, vx2x3, 0, 2, 5, 7);
		#else
			return __builtin_shuffle(v0x1x, vx2x3, (psimd_s32) { 0, 2, 5, 7 });
		#endif
	}
389
390 PSIMD_INTRINSIC psimd_f32 psimd_load1_stride2_f32(const void* address) {
391 return psimd_load_f32(address);
392 }
393
394 PSIMD_INTRINSIC psimd_f32 psimd_load2_stride2_f32(const void* address) {
395 const float* address_f32 = (const float*) address;
396 return (psimd_f32) { address_f32[0], address_f32[2], 0.0f, 0.0f };
397 }
398
399 PSIMD_INTRINSIC psimd_f32 psimd_load3_stride2_f32(const void* address) {
400 const psimd_f32 v0x1x = psimd_load_f32(address);
401 const psimd_f32 v2zzz = psimd_load1_f32((const float*) address + 2);
402 #if defined(__clang__)
403 return __builtin_shufflevector(v0x1x, v2zzz, 0, 2, 4, 6);
404 #else
405 return __builtin_shuffle(v0x1x, v2zzz, (psimd_s32) { 0, 2, 4, 6 });
406 #endif
407 }
408
	/* load4 at stride 2 is the same as the full stride-2 gather. */
	PSIMD_INTRINSIC psimd_f32 psimd_load4_stride2_f32(const void* address) {
		return psimd_load_stride2_f32(address);
	}

	/* Gathers 4 floats at an arbitrary stride; stride is measured in float
	   elements, not bytes. */
	PSIMD_INTRINSIC psimd_f32 psimd_load_stride_f32(const void* address, size_t stride) {
		const float* address0_f32 = (const float*) address;
		const float* address1_f32 = address0_f32 + stride;
		const float* address2_f32 = address1_f32 + stride;
		const float* address3_f32 = address2_f32 + stride;
		return (psimd_f32) { *address0_f32, *address1_f32, *address2_f32, *address3_f32 };
	}

	/* Only one element is read, so stride is irrelevant (parameter unused). */
	PSIMD_INTRINSIC psimd_f32 psimd_load1_stride_f32(const void* address, size_t stride) {
		return psimd_load1_f32(address);
	}

	PSIMD_INTRINSIC psimd_f32 psimd_load2_stride_f32(const void* address, size_t stride) {
		const float* address_f32 = (const float*) address;
		return (psimd_f32) { address_f32[0], address_f32[stride], 0.0f, 0.0f };
	}

	PSIMD_INTRINSIC psimd_f32 psimd_load3_stride_f32(const void* address, size_t stride) {
		const float* address0_f32 = (const float*) address;
		const float* address1_f32 = address0_f32 + stride;
		const float* address2_f32 = address1_f32 + stride;
		return (psimd_f32) { *address0_f32, *address1_f32, *address2_f32, 0.0f };
	}

	PSIMD_INTRINSIC psimd_f32 psimd_load4_stride_f32(const void* address, size_t stride) {
		return psimd_load_stride_f32(address, stride);
	}
440
	/* Store vector */

	/* Full 16-byte stores; address only needs element alignment (the vector
	   typedefs carry aligned(element-size)). */
	PSIMD_INTRINSIC void psimd_store_s8(void* address, psimd_s8 value) {
		*((psimd_s8*) address) = value;
	}

	PSIMD_INTRINSIC void psimd_store_u8(void* address, psimd_u8 value) {
		*((psimd_u8*) address) = value;
	}

	PSIMD_INTRINSIC void psimd_store_s16(void* address, psimd_s16 value) {
		*((psimd_s16*) address) = value;
	}

	PSIMD_INTRINSIC void psimd_store_u16(void* address, psimd_u16 value) {
		*((psimd_u16*) address) = value;
	}

	PSIMD_INTRINSIC void psimd_store_s32(void* address, psimd_s32 value) {
		*((psimd_s32*) address) = value;
	}

	PSIMD_INTRINSIC void psimd_store_u32(void* address, psimd_u32 value) {
		*((psimd_u32*) address) = value;
	}

	PSIMD_INTRINSIC void psimd_store_f32(void* address, psimd_f32 value) {
		*((psimd_f32*) address) = value;
	}

	/* Partial stores: storeN writes exactly the low N 32-bit lanes; no bytes
	   beyond the N elements are touched. */
	PSIMD_INTRINSIC void psimd_store1_s32(void* address, psimd_s32 value) {
		*((int32_t*) address) = value[0];
	}

	PSIMD_INTRINSIC void psimd_store1_u32(void* address, psimd_u32 value) {
		*((uint32_t*) address) = value[0];
	}

	PSIMD_INTRINSIC void psimd_store1_f32(void* address, psimd_f32 value) {
		*((float*) address) = value[0];
	}

	PSIMD_INTRINSIC void psimd_store2_s32(void* address, psimd_s32 value) {
		int32_t* address_s32 = (int32_t*) address;
		address_s32[0] = value[0];
		address_s32[1] = value[1];
	}

	PSIMD_INTRINSIC void psimd_store2_u32(void* address, psimd_u32 value) {
		uint32_t* address_u32 = (uint32_t*) address;
		address_u32[0] = value[0];
		address_u32[1] = value[1];
	}

	PSIMD_INTRINSIC void psimd_store2_f32(void* address, psimd_f32 value) {
		float* address_f32 = (float*) address;
		address_f32[0] = value[0];
		address_f32[1] = value[1];
	}

	PSIMD_INTRINSIC void psimd_store3_s32(void* address, psimd_s32 value) {
		int32_t* address_s32 = (int32_t*) address;
		address_s32[0] = value[0];
		address_s32[1] = value[1];
		address_s32[2] = value[2];
	}

	PSIMD_INTRINSIC void psimd_store3_u32(void* address, psimd_u32 value) {
		uint32_t* address_u32 = (uint32_t*) address;
		address_u32[0] = value[0];
		address_u32[1] = value[1];
		address_u32[2] = value[2];
	}

	PSIMD_INTRINSIC void psimd_store3_f32(void* address, psimd_f32 value) {
		float* address_f32 = (float*) address;
		address_f32[0] = value[0];
		address_f32[1] = value[1];
		address_f32[2] = value[2];
	}

	/* store4 is the full-vector store, provided for API symmetry with store1-3. */
	PSIMD_INTRINSIC void psimd_store4_s32(void* address, psimd_s32 value) {
		psimd_store_s32(address, value);
	}

	PSIMD_INTRINSIC void psimd_store4_u32(void* address, psimd_u32 value) {
		psimd_store_u32(address, value);
	}

	PSIMD_INTRINSIC void psimd_store4_f32(void* address, psimd_f32 value) {
		psimd_store_f32(address, value);
	}

	/* Scatter stores; stride is measured in float elements, not bytes. */
	PSIMD_INTRINSIC void psimd_store_stride_f32(void* address, size_t stride, psimd_f32 value) {
		float* address0_f32 = (float*) address;
		float* address1_f32 = address0_f32 + stride;
		float* address2_f32 = address1_f32 + stride;
		float* address3_f32 = address2_f32 + stride;
		*address0_f32 = value[0];
		*address1_f32 = value[1];
		*address2_f32 = value[2];
		*address3_f32 = value[3];
	}

	/* Only one element is written, so stride is irrelevant (parameter unused). */
	PSIMD_INTRINSIC void psimd_store1_stride_f32(void* address, size_t stride, psimd_f32 value) {
		psimd_store1_f32(address, value);
	}

	PSIMD_INTRINSIC void psimd_store2_stride_f32(void* address, size_t stride, psimd_f32 value) {
		float* address_f32 = (float*) address;
		address_f32[0] = value[0];
		address_f32[stride] = value[1];
	}

	PSIMD_INTRINSIC void psimd_store3_stride_f32(void* address, size_t stride, psimd_f32 value) {
		float* address0_f32 = (float*) address;
		float* address1_f32 = address0_f32 + stride;
		float* address2_f32 = address1_f32 + stride;
		*address0_f32 = value[0];
		*address1_f32 = value[1];
		*address2_f32 = value[2];
	}
562
	/* Vector addition (element-wise; integer lanes wrap on overflow per
	   GCC/Clang vector-extension semantics). */
	PSIMD_INTRINSIC psimd_s8 psimd_add_s8(psimd_s8 a, psimd_s8 b) {
		return a + b;
	}

	PSIMD_INTRINSIC psimd_u8 psimd_add_u8(psimd_u8 a, psimd_u8 b) {
		return a + b;
	}

	PSIMD_INTRINSIC psimd_s16 psimd_add_s16(psimd_s16 a, psimd_s16 b) {
		return a + b;
	}

	PSIMD_INTRINSIC psimd_u16 psimd_add_u16(psimd_u16 a, psimd_u16 b) {
		return a + b;
	}

	PSIMD_INTRINSIC psimd_s32 psimd_add_s32(psimd_s32 a, psimd_s32 b) {
		return a + b;
	}

	PSIMD_INTRINSIC psimd_u32 psimd_add_u32(psimd_u32 a, psimd_u32 b) {
		return a + b;
	}

	PSIMD_INTRINSIC psimd_f32 psimd_add_f32(psimd_f32 a, psimd_f32 b) {
		/* NOTE(review): on ARMv7-A without fast-math the NEON intrinsic is
		   called explicitly — presumably to force a NEON instruction rather
		   than scalar VFP codegen; behavior rationale not verifiable here. */
		#if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__)
			return (psimd_f32) vaddq_f32((float32x4_t) a, (float32x4_t) b);
		#else
			return a + b;
		#endif
	}

	/* Vector subtraction (element-wise) */
	PSIMD_INTRINSIC psimd_s8 psimd_sub_s8(psimd_s8 a, psimd_s8 b) {
		return a - b;
	}

	PSIMD_INTRINSIC psimd_u8 psimd_sub_u8(psimd_u8 a, psimd_u8 b) {
		return a - b;
	}

	PSIMD_INTRINSIC psimd_s16 psimd_sub_s16(psimd_s16 a, psimd_s16 b) {
		return a - b;
	}

	PSIMD_INTRINSIC psimd_u16 psimd_sub_u16(psimd_u16 a, psimd_u16 b) {
		return a - b;
	}

	PSIMD_INTRINSIC psimd_s32 psimd_sub_s32(psimd_s32 a, psimd_s32 b) {
		return a - b;
	}

	PSIMD_INTRINSIC psimd_u32 psimd_sub_u32(psimd_u32 a, psimd_u32 b) {
		return a - b;
	}

	PSIMD_INTRINSIC psimd_f32 psimd_sub_f32(psimd_f32 a, psimd_f32 b) {
		/* Same ARMv7-A NEON special case as psimd_add_f32. */
		#if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__)
			return (psimd_f32) vsubq_f32((float32x4_t) a, (float32x4_t) b);
		#else
			return a - b;
		#endif
	}

	/* Vector multiplication (element-wise) */
	PSIMD_INTRINSIC psimd_s8 psimd_mul_s8(psimd_s8 a, psimd_s8 b) {
		return a * b;
	}

	PSIMD_INTRINSIC psimd_u8 psimd_mul_u8(psimd_u8 a, psimd_u8 b) {
		return a * b;
	}

	PSIMD_INTRINSIC psimd_s16 psimd_mul_s16(psimd_s16 a, psimd_s16 b) {
		return a * b;
	}

	PSIMD_INTRINSIC psimd_u16 psimd_mul_u16(psimd_u16 a, psimd_u16 b) {
		return a * b;
	}

	PSIMD_INTRINSIC psimd_s32 psimd_mul_s32(psimd_s32 a, psimd_s32 b) {
		return a * b;
	}

	PSIMD_INTRINSIC psimd_u32 psimd_mul_u32(psimd_u32 a, psimd_u32 b) {
		return a * b;
	}

	PSIMD_INTRINSIC psimd_f32 psimd_mul_f32(psimd_f32 a, psimd_f32 b) {
		/* Same ARMv7-A NEON special case as psimd_add_f32. */
		#if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__)
			return (psimd_f32) vmulq_f32((float32x4_t) a, (float32x4_t) b);
		#else
			return a * b;
		#endif
	}
661
662 /* Quasi-Fused Multiply-Add */
663 PSIMD_INTRINSIC psimd_f32 psimd_qfma_f32(psimd_f32 a, psimd_f32 b, psimd_f32 c) {
664 #if defined(__aarch64__) || defined(__ARM_NEON__) && defined(__ARM_FEATURE_FMA)
665 return (psimd_f32) vfmaq_f32((float32x4_t) a, (float32x4_t) b, (float32x4_t) c);
666 #elif (defined(__x86_64__) || defined(__i386__) || defined(__i686__)) && defined(__FMA__)
667 return (psimd_f32) _mm_fmadd_ps((__m128) b, (__m128) c, (__m128) a);
668 #elif (defined(__x86_64__) || defined(__i386__) || defined(__i686__)) && defined(__FMA4__)
669 return (psimd_f32) _mm_macc_ps((__m128) b, (__m128) c, (__m128) a);
670 #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__) && PSIMD_ENABLE_WASM_QFMA
671 return (psimd_f32) __builtin_wasm_qfma_f32x4(a, b, c);
672 #else
673 return a + b * c;
674 #endif
675 }
676
677 PSIMD_INTRINSIC psimd_f32 psimd_div_f32(psimd_f32 a, psimd_f32 b) {
678 return a / b;
679 }
680
681 /* Vector and */
682 PSIMD_INTRINSIC psimd_f32 psimd_andmask_f32(psimd_s32 mask, psimd_f32 v) {
683 return (psimd_f32) (mask & (psimd_s32) v);
684 }
685
686 /* Vector and-not */
687 PSIMD_INTRINSIC psimd_f32 psimd_andnotmask_f32(psimd_s32 mask, psimd_f32 v) {
688 return (psimd_f32) (~mask & (psimd_s32) v);
689 }
690
	/* Vector blend: bitwise select — result bits come from a where mask bits
	   are 1 and from b where they are 0. For lane-wise selection the caller
	   must pass a mask whose lanes are all-ones or all-zeros (e.g. a vector
	   comparison result). */
	PSIMD_INTRINSIC psimd_s8 psimd_blend_s8(psimd_s8 mask, psimd_s8 a, psimd_s8 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_s8) vbslq_s8((uint8x16_t) mask, (int8x16_t) a, (int8x16_t) b);
		#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
			return (psimd_s8) __builtin_wasm_bitselect(a, b, mask);
		#else
			return (mask & a) | (~mask & b);
		#endif
	}

	PSIMD_INTRINSIC psimd_u8 psimd_blend_u8(psimd_s8 mask, psimd_u8 a, psimd_u8 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_u8) vbslq_u8((uint8x16_t) mask, (uint8x16_t) a, (uint8x16_t) b);
		#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
			return (psimd_u8) __builtin_wasm_bitselect(a, b, mask);
		#else
			return (psimd_u8) ((mask & (psimd_s8) a) | (~mask & (psimd_s8) b));
		#endif
	}

	PSIMD_INTRINSIC psimd_s16 psimd_blend_s16(psimd_s16 mask, psimd_s16 a, psimd_s16 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_s16) vbslq_s16((uint16x8_t) mask, (int16x8_t) a, (int16x8_t) b);
		#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
			return (psimd_s16) __builtin_wasm_bitselect(a, b, mask);
		#else
			return (mask & a) | (~mask & b);
		#endif
	}

	PSIMD_INTRINSIC psimd_u16 psimd_blend_u16(psimd_s16 mask, psimd_u16 a, psimd_u16 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_u16) vbslq_u16((uint16x8_t) mask, (uint16x8_t) a, (uint16x8_t) b);
		#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
			return (psimd_u16) __builtin_wasm_bitselect(a, b, mask);
		#else
			return (psimd_u16) ((mask & (psimd_s16) a) | (~mask & (psimd_s16) b));
		#endif
	}

	PSIMD_INTRINSIC psimd_s32 psimd_blend_s32(psimd_s32 mask, psimd_s32 a, psimd_s32 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_s32) vbslq_s32((uint32x4_t) mask, (int32x4_t) a, (int32x4_t) b);
		#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
			return (psimd_s32) __builtin_wasm_bitselect(a, b, mask);
		#else
			return (mask & a) | (~mask & b);
		#endif
	}

	PSIMD_INTRINSIC psimd_u32 psimd_blend_u32(psimd_s32 mask, psimd_u32 a, psimd_u32 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_u32) vbslq_u32((uint32x4_t) mask, (uint32x4_t) a, (uint32x4_t) b);
		#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
			return (psimd_u32) __builtin_wasm_bitselect(a, b, mask);
		#else
			return (psimd_u32) ((mask & (psimd_s32) a) | (~mask & (psimd_s32) b));
		#endif
	}

	PSIMD_INTRINSIC psimd_f32 psimd_blend_f32(psimd_s32 mask, psimd_f32 a, psimd_f32 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_f32) vbslq_f32((uint32x4_t) mask, (float32x4_t) a, (float32x4_t) b);
		#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
			return (psimd_f32) __builtin_wasm_bitselect(a, b, mask);
		#else
			/* Blend via integer bit operations on the f32 bit patterns. */
			return (psimd_f32) ((mask & (psimd_s32) a) | (~mask & (psimd_s32) b));
		#endif
	}

	/* Vector blend on sign: selects a where x is negative, b otherwise.
	   The arithmetic right shift by (lane width - 1) replicates each lane's
	   sign bit across the lane, producing an all-ones/all-zeros mask. */
	PSIMD_INTRINSIC psimd_s8 psimd_signblend_s8(psimd_s8 x, psimd_s8 a, psimd_s8 b) {
		return psimd_blend_s8(x >> psimd_splat_s8(7), a, b);
	}

	PSIMD_INTRINSIC psimd_u8 psimd_signblend_u8(psimd_s8 x, psimd_u8 a, psimd_u8 b) {
		return psimd_blend_u8((x >> psimd_splat_s8(7)), a, b);
	}

	PSIMD_INTRINSIC psimd_s16 psimd_signblend_s16(psimd_s16 x, psimd_s16 a, psimd_s16 b) {
		return psimd_blend_s16(x >> psimd_splat_s16(15), a, b);
	}

	PSIMD_INTRINSIC psimd_u16 psimd_signblend_u16(psimd_s16 x, psimd_u16 a, psimd_u16 b) {
		return psimd_blend_u16((x >> psimd_splat_s16(15)), a, b);
	}

	PSIMD_INTRINSIC psimd_s32 psimd_signblend_s32(psimd_s32 x, psimd_s32 a, psimd_s32 b) {
		return psimd_blend_s32(x >> psimd_splat_s32(31), a, b);
	}

	PSIMD_INTRINSIC psimd_u32 psimd_signblend_u32(psimd_s32 x, psimd_u32 a, psimd_u32 b) {
		return psimd_blend_u32((x >> psimd_splat_s32(31)), a, b);
	}

	PSIMD_INTRINSIC psimd_f32 psimd_signblend_f32(psimd_f32 x, psimd_f32 a, psimd_f32 b) {
		/* Uses the IEEE sign bit directly, so -0.0f counts as negative. */
		const psimd_s32 mask = (psimd_s32) x >> psimd_splat_s32(31);
		return psimd_blend_f32(mask, a, b);
	}
791
792 /* Vector absolute value */
793 PSIMD_INTRINSIC psimd_f32 psimd_abs_f32(psimd_f32 v) {
794 const psimd_s32 mask = (psimd_s32) psimd_splat_f32(-0.0f);
795 return (psimd_f32) ((psimd_s32) v & ~mask);
796 }
797
798 /* Vector negation */
799 PSIMD_INTRINSIC psimd_f32 psimd_neg_f32(psimd_f32 v) {
800 const psimd_s32 mask = (psimd_s32) psimd_splat_f32(-0.0f);
801 return (psimd_f32) ((psimd_s32) v ^ mask);
802 }
803
	/* Vector maximum (element-wise). The generic path relies on vector
	   comparisons producing all-ones/all-zeros lane masks for the blend. */
	PSIMD_INTRINSIC psimd_s8 psimd_max_s8(psimd_s8 a, psimd_s8 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_s8) vmaxq_s8((int8x16_t) a, (int8x16_t) b);
		#else
			return psimd_blend_s8(a > b, a, b);
		#endif
	}

	PSIMD_INTRINSIC psimd_u8 psimd_max_u8(psimd_u8 a, psimd_u8 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_u8) vmaxq_u8((uint8x16_t) a, (uint8x16_t) b);
		#else
			return psimd_blend_u8(a > b, a, b);
		#endif
	}

	PSIMD_INTRINSIC psimd_s16 psimd_max_s16(psimd_s16 a, psimd_s16 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_s16) vmaxq_s16((int16x8_t) a, (int16x8_t) b);
		#else
			return psimd_blend_s16(a > b, a, b);
		#endif
	}

	PSIMD_INTRINSIC psimd_u16 psimd_max_u16(psimd_u16 a, psimd_u16 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_u16) vmaxq_u16((uint16x8_t) a, (uint16x8_t) b);
		#else
			return psimd_blend_u16(a > b, a, b);
		#endif
	}

	PSIMD_INTRINSIC psimd_s32 psimd_max_s32(psimd_s32 a, psimd_s32 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_s32) vmaxq_s32((int32x4_t) a, (int32x4_t) b);
		#else
			return psimd_blend_s32(a > b, a, b);
		#endif
	}

	PSIMD_INTRINSIC psimd_u32 psimd_max_u32(psimd_u32 a, psimd_u32 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_u32) vmaxq_u32((uint32x4_t) a, (uint32x4_t) b);
		#else
			return psimd_blend_u32(a > b, a, b);
		#endif
	}

	/* NOTE(review): NaN handling of the f32 min/max differs by path — the
	   generic blend returns b when the comparison is false (i.e. on NaN),
	   while NEON/wasm follow their own NaN rules. Callers should not rely
	   on a specific NaN result. */
	PSIMD_INTRINSIC psimd_f32 psimd_max_f32(psimd_f32 a, psimd_f32 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_f32) vmaxq_f32((float32x4_t) a, (float32x4_t) b);
		#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
			return __builtin_wasm_max_f32x4(a, b);
		#else
			return psimd_blend_f32(a > b, a, b);
		#endif
	}

	/* Vector minimum (element-wise). */
	PSIMD_INTRINSIC psimd_s8 psimd_min_s8(psimd_s8 a, psimd_s8 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_s8) vminq_s8((int8x16_t) a, (int8x16_t) b);
		#else
			return psimd_blend_s8(a < b, a, b);
		#endif
	}

	PSIMD_INTRINSIC psimd_u8 psimd_min_u8(psimd_u8 a, psimd_u8 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_u8) vminq_u8((uint8x16_t) a, (uint8x16_t) b);
		#else
			return psimd_blend_u8(a < b, a, b);
		#endif
	}

	PSIMD_INTRINSIC psimd_s16 psimd_min_s16(psimd_s16 a, psimd_s16 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_s16) vminq_s16((int16x8_t) a, (int16x8_t) b);
		#else
			return psimd_blend_s16(a < b, a, b);
		#endif
	}

	PSIMD_INTRINSIC psimd_u16 psimd_min_u16(psimd_u16 a, psimd_u16 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_u16) vminq_u16((uint16x8_t) a, (uint16x8_t) b);
		#else
			return psimd_blend_u16(a < b, a, b);
		#endif
	}

	PSIMD_INTRINSIC psimd_s32 psimd_min_s32(psimd_s32 a, psimd_s32 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_s32) vminq_s32((int32x4_t) a, (int32x4_t) b);
		#else
			return psimd_blend_s32(a < b, a, b);
		#endif
	}

	PSIMD_INTRINSIC psimd_u32 psimd_min_u32(psimd_u32 a, psimd_u32 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_u32) vminq_u32((uint32x4_t) a, (uint32x4_t) b);
		#else
			return psimd_blend_u32(a < b, a, b);
		#endif
	}

	PSIMD_INTRINSIC psimd_f32 psimd_min_f32(psimd_f32 a, psimd_f32 b) {
		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_f32) vminq_f32((float32x4_t) a, (float32x4_t) b);
		#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
			return __builtin_wasm_min_f32x4(a, b);
		#else
			return psimd_blend_f32(a < b, a, b);
		#endif
	}
921
	/* Converts each signed 32-bit lane to float (value conversion, unlike the
	   bit-reinterpreting vector casts used elsewhere in this header). */
	PSIMD_INTRINSIC psimd_f32 psimd_cvt_s32_f32(psimd_s32 v) {
		#if defined(__clang__)
			return __builtin_convertvector(v, psimd_f32);
		#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
			return (psimd_f32) vcvtq_f32_s32((int32x4_t) v);
		#elif defined(__SSE2__)
			return (psimd_f32) _mm_cvtepi32_ps((__m128i) v);
		#else
			/* Portable fallback: convert lane by lane. */
			return (psimd_f32) { (float) v[0], (float) v[1], (float) v[2], (float) v[3] };
		#endif
	}
933
	/* Broadcast vector element: psimd_splatN_f32 copies lane N of v into all
	   four lanes. Clang uses __builtin_shufflevector (constant indices);
	   GCC uses __builtin_shuffle (index vector). */
	#if defined(__clang__)
		PSIMD_INTRINSIC psimd_f32 psimd_splat0_f32(psimd_f32 v) {
			return __builtin_shufflevector(v, v, 0, 0, 0, 0);
		}

		PSIMD_INTRINSIC psimd_f32 psimd_splat1_f32(psimd_f32 v) {
			return __builtin_shufflevector(v, v, 1, 1, 1, 1);
		}

		PSIMD_INTRINSIC psimd_f32 psimd_splat2_f32(psimd_f32 v) {
			return __builtin_shufflevector(v, v, 2, 2, 2, 2);
		}

		PSIMD_INTRINSIC psimd_f32 psimd_splat3_f32(psimd_f32 v) {
			return __builtin_shufflevector(v, v, 3, 3, 3, 3);
		}
	#else
		PSIMD_INTRINSIC psimd_f32 psimd_splat0_f32(psimd_f32 v) {
			return __builtin_shuffle(v, (psimd_s32) { 0, 0, 0, 0 });
		}

		PSIMD_INTRINSIC psimd_f32 psimd_splat1_f32(psimd_f32 v) {
			return __builtin_shuffle(v, (psimd_s32) { 1, 1, 1, 1 });
		}

		PSIMD_INTRINSIC psimd_f32 psimd_splat2_f32(psimd_f32 v) {
			return __builtin_shuffle(v, (psimd_s32) { 2, 2, 2, 2 });
		}

		PSIMD_INTRINSIC psimd_f32 psimd_splat3_f32(psimd_f32 v) {
			return __builtin_shuffle(v, (psimd_s32) { 3, 3, 3, 3 });
		}
	#endif
968
/* Reversal of vector elements */
#if defined(__clang__)
/* Clang path: indices run from the last lane down to lane 0. */
PSIMD_INTRINSIC psimd_s8 psimd_reverse_s8(psimd_s8 v) {
	return __builtin_shufflevector(v, v, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
}

PSIMD_INTRINSIC psimd_u8 psimd_reverse_u8(psimd_u8 v) {
	return __builtin_shufflevector(v, v, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
}

PSIMD_INTRINSIC psimd_s16 psimd_reverse_s16(psimd_s16 v) {
	return __builtin_shufflevector(v, v, 7, 6, 5, 4, 3, 2, 1, 0);
}

PSIMD_INTRINSIC psimd_u16 psimd_reverse_u16(psimd_u16 v) {
	return __builtin_shufflevector(v, v, 7, 6, 5, 4, 3, 2, 1, 0);
}

PSIMD_INTRINSIC psimd_s32 psimd_reverse_s32(psimd_s32 v) {
	return __builtin_shufflevector(v, v, 3, 2, 1, 0);
}

PSIMD_INTRINSIC psimd_u32 psimd_reverse_u32(psimd_u32 v) {
	return __builtin_shufflevector(v, v, 3, 2, 1, 0);
}

PSIMD_INTRINSIC psimd_f32 psimd_reverse_f32(psimd_f32 v) {
	return __builtin_shufflevector(v, v, 3, 2, 1, 0);
}
#else
/* GCC path: the index vector only needs matching element size/count, so the
 * signed mask types (psimd_s8/s16/s32) are reused for the unsigned and
 * float variants as well. */
PSIMD_INTRINSIC psimd_s8 psimd_reverse_s8(psimd_s8 v) {
	return __builtin_shuffle(v, (psimd_s8) { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 });
}

PSIMD_INTRINSIC psimd_u8 psimd_reverse_u8(psimd_u8 v) {
	return __builtin_shuffle(v, (psimd_s8) { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 });
}

PSIMD_INTRINSIC psimd_s16 psimd_reverse_s16(psimd_s16 v) {
	return __builtin_shuffle(v, (psimd_s16) { 7, 6, 5, 4, 3, 2, 1, 0 });
}

PSIMD_INTRINSIC psimd_u16 psimd_reverse_u16(psimd_u16 v) {
	return __builtin_shuffle(v, (psimd_s16) { 7, 6, 5, 4, 3, 2, 1, 0 });
}

PSIMD_INTRINSIC psimd_s32 psimd_reverse_s32(psimd_s32 v) {
	return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 });
}

PSIMD_INTRINSIC psimd_u32 psimd_reverse_u32(psimd_u32 v) {
	return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 });
}

PSIMD_INTRINSIC psimd_f32 psimd_reverse_f32(psimd_f32 v) {
	return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 });
}
#endif
1027
/* Interleaving of vector elements */
#if defined(__clang__)
/* "lo" interleaves the low halves of a and b, "hi" the high halves; indices
 * >= lane-count select from b (8+k for 8-lane, 4+k for 4-lane vectors). */
PSIMD_INTRINSIC psimd_s16 psimd_interleave_lo_s16(psimd_s16 a, psimd_s16 b) {
	/* { a0, b0, a1, b1, a2, b2, a3, b3 } */
	return __builtin_shufflevector(a, b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
}

PSIMD_INTRINSIC psimd_s16 psimd_interleave_hi_s16(psimd_s16 a, psimd_s16 b) {
	/* { a4, b4, a5, b5, a6, b6, a7, b7 } */
	return __builtin_shufflevector(a, b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
}

PSIMD_INTRINSIC psimd_u16 psimd_interleave_lo_u16(psimd_u16 a, psimd_u16 b) {
	return __builtin_shufflevector(a, b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
}

PSIMD_INTRINSIC psimd_u16 psimd_interleave_hi_u16(psimd_u16 a, psimd_u16 b) {
	return __builtin_shufflevector(a, b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
}

PSIMD_INTRINSIC psimd_s32 psimd_interleave_lo_s32(psimd_s32 a, psimd_s32 b) {
	/* { a0, b0, a1, b1 } */
	return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1);
}

PSIMD_INTRINSIC psimd_s32 psimd_interleave_hi_s32(psimd_s32 a, psimd_s32 b) {
	/* { a2, b2, a3, b3 } */
	return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3);
}

PSIMD_INTRINSIC psimd_u32 psimd_interleave_lo_u32(psimd_u32 a, psimd_u32 b) {
	return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1);
}

PSIMD_INTRINSIC psimd_u32 psimd_interleave_hi_u32(psimd_u32 a, psimd_u32 b) {
	return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3);
}

PSIMD_INTRINSIC psimd_f32 psimd_interleave_lo_f32(psimd_f32 a, psimd_f32 b) {
	return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1);
}

PSIMD_INTRINSIC psimd_f32 psimd_interleave_hi_f32(psimd_f32 a, psimd_f32 b) {
	return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3);
}
#else
/* GCC path: same lane patterns as the clang path above, expressed via an
 * integer index vector for the two-operand form of __builtin_shuffle. */
PSIMD_INTRINSIC psimd_s16 psimd_interleave_lo_s16(psimd_s16 a, psimd_s16 b) {
	return __builtin_shuffle(a, b, (psimd_s16) { 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3 });
}

PSIMD_INTRINSIC psimd_s16 psimd_interleave_hi_s16(psimd_s16 a, psimd_s16 b) {
	return __builtin_shuffle(a, b, (psimd_s16) { 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7 });
}

PSIMD_INTRINSIC psimd_u16 psimd_interleave_lo_u16(psimd_u16 a, psimd_u16 b) {
	return __builtin_shuffle(a, b, (psimd_s16) { 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3 });
}

PSIMD_INTRINSIC psimd_u16 psimd_interleave_hi_u16(psimd_u16 a, psimd_u16 b) {
	return __builtin_shuffle(a, b, (psimd_s16) { 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7 });
}

PSIMD_INTRINSIC psimd_s32 psimd_interleave_lo_s32(psimd_s32 a, psimd_s32 b) {
	return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 });
}

PSIMD_INTRINSIC psimd_s32 psimd_interleave_hi_s32(psimd_s32 a, psimd_s32 b) {
	return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 });
}

PSIMD_INTRINSIC psimd_u32 psimd_interleave_lo_u32(psimd_u32 a, psimd_u32 b) {
	return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 });
}

PSIMD_INTRINSIC psimd_u32 psimd_interleave_hi_u32(psimd_u32 a, psimd_u32 b) {
	return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 });
}

PSIMD_INTRINSIC psimd_f32 psimd_interleave_lo_f32(psimd_f32 a, psimd_f32 b) {
	return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 });
}

PSIMD_INTRINSIC psimd_f32 psimd_interleave_hi_f32(psimd_f32 a, psimd_f32 b) {
	return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 });
}
#endif
1110
/* Concatenation of low/high vector elements */
#if defined(__clang__)
/* "lo" concatenates the low halves of a and b, "hi" the high halves. */
PSIMD_INTRINSIC psimd_s16 psimd_concat_lo_s16(psimd_s16 a, psimd_s16 b) {
	/* { a0, a1, a2, a3, b0, b1, b2, b3 } */
	return __builtin_shufflevector(a, b, 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3);
}

PSIMD_INTRINSIC psimd_s16 psimd_concat_hi_s16(psimd_s16 a, psimd_s16 b) {
	/* { a4, a5, a6, a7, b4, b5, b6, b7 } */
	return __builtin_shufflevector(a, b, 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7);
}

PSIMD_INTRINSIC psimd_u16 psimd_concat_lo_u16(psimd_u16 a, psimd_u16 b) {
	return __builtin_shufflevector(a, b, 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3);
}

PSIMD_INTRINSIC psimd_u16 psimd_concat_hi_u16(psimd_u16 a, psimd_u16 b) {
	return __builtin_shufflevector(a, b, 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7);
}

PSIMD_INTRINSIC psimd_s32 psimd_concat_lo_s32(psimd_s32 a, psimd_s32 b) {
	/* { a0, a1, b0, b1 } */
	return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1);
}

PSIMD_INTRINSIC psimd_s32 psimd_concat_hi_s32(psimd_s32 a, psimd_s32 b) {
	/* { a2, a3, b2, b3 } */
	return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3);
}

PSIMD_INTRINSIC psimd_u32 psimd_concat_lo_u32(psimd_u32 a, psimd_u32 b) {
	return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1);
}

PSIMD_INTRINSIC psimd_u32 psimd_concat_hi_u32(psimd_u32 a, psimd_u32 b) {
	return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3);
}

PSIMD_INTRINSIC psimd_f32 psimd_concat_lo_f32(psimd_f32 a, psimd_f32 b) {
	return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1);
}

PSIMD_INTRINSIC psimd_f32 psimd_concat_hi_f32(psimd_f32 a, psimd_f32 b) {
	return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3);
}
#else
/* GCC path: same lane patterns via __builtin_shuffle with an index vector. */
PSIMD_INTRINSIC psimd_s16 psimd_concat_lo_s16(psimd_s16 a, psimd_s16 b) {
	return __builtin_shuffle(a, b, (psimd_s16) { 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3 });
}

PSIMD_INTRINSIC psimd_s16 psimd_concat_hi_s16(psimd_s16 a, psimd_s16 b) {
	return __builtin_shuffle(a, b, (psimd_s16) { 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7 });
}

PSIMD_INTRINSIC psimd_u16 psimd_concat_lo_u16(psimd_u16 a, psimd_u16 b) {
	return __builtin_shuffle(a, b, (psimd_s16) { 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3 });
}

PSIMD_INTRINSIC psimd_u16 psimd_concat_hi_u16(psimd_u16 a, psimd_u16 b) {
	return __builtin_shuffle(a, b, (psimd_s16) { 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7 });
}

PSIMD_INTRINSIC psimd_s32 psimd_concat_lo_s32(psimd_s32 a, psimd_s32 b) {
	return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 });
}

PSIMD_INTRINSIC psimd_s32 psimd_concat_hi_s32(psimd_s32 a, psimd_s32 b) {
	return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 });
}

PSIMD_INTRINSIC psimd_u32 psimd_concat_lo_u32(psimd_u32 a, psimd_u32 b) {
	return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 });
}

PSIMD_INTRINSIC psimd_u32 psimd_concat_hi_u32(psimd_u32 a, psimd_u32 b) {
	return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 });
}

PSIMD_INTRINSIC psimd_f32 psimd_concat_lo_f32(psimd_f32 a, psimd_f32 b) {
	return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 });
}

PSIMD_INTRINSIC psimd_f32 psimd_concat_hi_f32(psimd_f32 a, psimd_f32 b) {
	return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 });
}
#endif
1193
/* Concatenation of even/odd vector elements */
#if defined(__clang__)
/* "even" gathers the even-indexed lanes of a followed by those of b;
 * "odd" gathers the odd-indexed lanes (a de-interleave / unzip). */
PSIMD_INTRINSIC psimd_s8 psimd_concat_even_s8(psimd_s8 a, psimd_s8 b) {
	/* { a0, a2, ..., a14, b0, b2, ..., b14 } */
	return __builtin_shufflevector(a, b,
		0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14);
}

PSIMD_INTRINSIC psimd_s8 psimd_concat_odd_s8(psimd_s8 a, psimd_s8 b) {
	/* { a1, a3, ..., a15, b1, b3, ..., b15 } */
	return __builtin_shufflevector(a, b,
		1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15);
}

PSIMD_INTRINSIC psimd_u8 psimd_concat_even_u8(psimd_u8 a, psimd_u8 b) {
	return __builtin_shufflevector(a, b,
		0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14);
}

PSIMD_INTRINSIC psimd_u8 psimd_concat_odd_u8(psimd_u8 a, psimd_u8 b) {
	return __builtin_shufflevector(a, b,
		1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15);
}

PSIMD_INTRINSIC psimd_s16 psimd_concat_even_s16(psimd_s16 a, psimd_s16 b) {
	/* { a0, a2, a4, a6, b0, b2, b4, b6 } */
	return __builtin_shufflevector(a, b, 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6);
}

PSIMD_INTRINSIC psimd_s16 psimd_concat_odd_s16(psimd_s16 a, psimd_s16 b) {
	/* { a1, a3, a5, a7, b1, b3, b5, b7 } */
	return __builtin_shufflevector(a, b, 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7);
}

PSIMD_INTRINSIC psimd_u16 psimd_concat_even_u16(psimd_u16 a, psimd_u16 b) {
	return __builtin_shufflevector(a, b, 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6);
}

PSIMD_INTRINSIC psimd_u16 psimd_concat_odd_u16(psimd_u16 a, psimd_u16 b) {
	return __builtin_shufflevector(a, b, 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7);
}

PSIMD_INTRINSIC psimd_s32 psimd_concat_even_s32(psimd_s32 a, psimd_s32 b) {
	/* { a0, a2, b0, b2 } */
	return __builtin_shufflevector(a, b, 0, 2, 4+0, 4+2);
}

PSIMD_INTRINSIC psimd_s32 psimd_concat_odd_s32(psimd_s32 a, psimd_s32 b) {
	/* { a1, a3, b1, b3 } */
	return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3);
}

PSIMD_INTRINSIC psimd_u32 psimd_concat_even_u32(psimd_u32 a, psimd_u32 b) {
	return __builtin_shufflevector(a, b, 0, 2, 4+0, 4+2);
}

PSIMD_INTRINSIC psimd_u32 psimd_concat_odd_u32(psimd_u32 a, psimd_u32 b) {
	return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3);
}

PSIMD_INTRINSIC psimd_f32 psimd_concat_even_f32(psimd_f32 a, psimd_f32 b) {
	return __builtin_shufflevector(a, b, 0, 2, 4+0, 4+2);
}

PSIMD_INTRINSIC psimd_f32 psimd_concat_odd_f32(psimd_f32 a, psimd_f32 b) {
	return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3);
}
#else
/* GCC path: same lane patterns via __builtin_shuffle with an index vector. */
PSIMD_INTRINSIC psimd_s8 psimd_concat_even_s8(psimd_s8 a, psimd_s8 b) {
	return __builtin_shuffle(a, b,
		(psimd_s8) { 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14 });
}

PSIMD_INTRINSIC psimd_s8 psimd_concat_odd_s8(psimd_s8 a, psimd_s8 b) {
	return __builtin_shuffle(a, b,
		(psimd_s8) { 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15 });
}

PSIMD_INTRINSIC psimd_u8 psimd_concat_even_u8(psimd_u8 a, psimd_u8 b) {
	return __builtin_shuffle(a, b,
		(psimd_s8) { 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14 });
}

PSIMD_INTRINSIC psimd_u8 psimd_concat_odd_u8(psimd_u8 a, psimd_u8 b) {
	return __builtin_shuffle(a, b,
		(psimd_s8) { 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15 });
}

PSIMD_INTRINSIC psimd_s16 psimd_concat_even_s16(psimd_s16 a, psimd_s16 b) {
	return __builtin_shuffle(a, b, (psimd_s16) { 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6 });
}

PSIMD_INTRINSIC psimd_s16 psimd_concat_odd_s16(psimd_s16 a, psimd_s16 b) {
	return __builtin_shuffle(a, b, (psimd_s16) { 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7 });
}

PSIMD_INTRINSIC psimd_u16 psimd_concat_even_u16(psimd_u16 a, psimd_u16 b) {
	return __builtin_shuffle(a, b, (psimd_s16) { 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6 });
}

PSIMD_INTRINSIC psimd_u16 psimd_concat_odd_u16(psimd_u16 a, psimd_u16 b) {
	return __builtin_shuffle(a, b, (psimd_s16) { 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7 });
}

PSIMD_INTRINSIC psimd_s32 psimd_concat_even_s32(psimd_s32 a, psimd_s32 b) {
	return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 });
}

PSIMD_INTRINSIC psimd_s32 psimd_concat_odd_s32(psimd_s32 a, psimd_s32 b) {
	return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 });
}

PSIMD_INTRINSIC psimd_u32 psimd_concat_even_u32(psimd_u32 a, psimd_u32 b) {
	return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 });
}

PSIMD_INTRINSIC psimd_u32 psimd_concat_odd_u32(psimd_u32 a, psimd_u32 b) {
	return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 });
}

PSIMD_INTRINSIC psimd_f32 psimd_concat_even_f32(psimd_f32 a, psimd_f32 b) {
	return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 });
}

PSIMD_INTRINSIC psimd_f32 psimd_concat_odd_f32(psimd_f32 a, psimd_f32 b) {
	return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 });
}
#endif
1316
1317 /* Vector reduce */
1318 #if defined(__clang__)
1319 PSIMD_INTRINSIC psimd_f32 psimd_allreduce_sum_f32(psimd_f32 v) {
1320 const psimd_f32 temp = v + __builtin_shufflevector(v, v, 2, 3, 0, 1);
1321 return temp + __builtin_shufflevector(temp, temp, 1, 0, 3, 2);
1322 }
1323
1324 PSIMD_INTRINSIC psimd_f32 psimd_allreduce_max_f32(psimd_f32 v) {
1325 const psimd_f32 temp = psimd_max_f32(v, __builtin_shufflevector(v, v, 2, 3, 0, 1));
1326 return psimd_max_f32(temp, __builtin_shufflevector(temp, temp, 1, 0, 3, 2));
1327 }
1328
1329 PSIMD_INTRINSIC psimd_f32 psimd_allreduce_min_f32(psimd_f32 v) {
1330 const psimd_f32 temp = psimd_min_f32(v, __builtin_shufflevector(v, v, 2, 3, 0, 1));
1331 return psimd_min_f32(temp, __builtin_shufflevector(temp, temp, 1, 0, 3, 2));
1332 }
1333
1334 PSIMD_INTRINSIC float psimd_reduce_sum_f32(psimd_f32 v) {
1335 const psimd_f32 temp = v + __builtin_shufflevector(v, v, 2, 3, -1, -1);
1336 const psimd_f32 result = temp + __builtin_shufflevector(temp, temp, 1, -1, -1, -1);
1337 return result[0];
1338 }
1339
1340 PSIMD_INTRINSIC float psimd_reduce_max_f32(psimd_f32 v) {
1341 const psimd_f32 temp = psimd_max_f32(v, __builtin_shufflevector(v, v, 2, 3, -1, -1));
1342 const psimd_f32 result = psimd_max_f32(temp, __builtin_shufflevector(temp, temp, 1, -1, -1, -1));
1343 return result[0];
1344 }
1345
1346 PSIMD_INTRINSIC float psimd_reduce_min_f32(psimd_f32 v) {
1347 const psimd_f32 temp = psimd_min_f32(v, __builtin_shufflevector(v, v, 2, 3, -1, -1));
1348 const psimd_f32 result = psimd_min_f32(temp, __builtin_shufflevector(temp, temp, 1, -1, -1, -1));
1349 return result[0];
1350 }
1351 #else
1352 PSIMD_INTRINSIC psimd_f32 psimd_allreduce_sum_f32(psimd_f32 v) {
1353 const psimd_f32 temp = v + __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 });
1354 return temp + __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 });
1355 }
1356
1357 PSIMD_INTRINSIC psimd_f32 psimd_allreduce_max_f32(psimd_f32 v) {
1358 const psimd_f32 temp = psimd_max_f32(v, __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 }));
1359 return psimd_max_f32(temp, __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 }));
1360 }
1361
1362 PSIMD_INTRINSIC psimd_f32 psimd_allreduce_min_f32(psimd_f32 v) {
1363 const psimd_f32 temp = psimd_min_f32(v, __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 }));
1364 return psimd_min_f32(temp, __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 }));
1365 }
1366
1367 PSIMD_INTRINSIC float psimd_reduce_sum_f32(psimd_f32 v) {
1368 const psimd_f32 result = psimd_allreduce_sum_f32(v);
1369 return result[0];
1370 }
1371
1372 PSIMD_INTRINSIC float psimd_reduce_max_f32(psimd_f32 v) {
1373 const psimd_f32 result = psimd_allreduce_max_f32(v);
1374 return result[0];
1375 }
1376
1377 PSIMD_INTRINSIC float psimd_reduce_min_f32(psimd_f32 v) {
1378 const psimd_f32 result = psimd_allreduce_min_f32(v);
1379 return result[0];
1380 }
1381 #endif
1382#endif
1383
1384#endif /* PSIMD_H */
1385