1 | #pragma once |
2 | #ifndef PSIMD_H |
3 | #define PSIMD_H |
4 | |
5 | #if defined(__CUDA_ARCH__) |
6 | /* CUDA compiler */ |
7 | #define PSIMD_INTRINSIC __forceinline__ __device__ |
8 | #elif defined(__OPENCL_VERSION__) |
9 | /* OpenCL compiler */ |
10 | #define PSIMD_INTRINSIC inline static |
11 | #elif defined(__INTEL_COMPILER) |
12 | /* Intel compiler, even on Windows */ |
13 | #define PSIMD_INTRINSIC inline static __attribute__((__always_inline__)) |
14 | #elif defined(__GNUC__) |
15 | /* GCC-compatible compiler (gcc/clang/icc) */ |
16 | #define PSIMD_INTRINSIC inline static __attribute__((__always_inline__)) |
17 | #elif defined(_MSC_VER) |
18 | /* MSVC-compatible compiler (cl/icl/clang-cl) */ |
19 | #define PSIMD_INTRINSIC __forceinline static |
20 | #elif defined(__cplusplus) |
21 | /* Generic C++ compiler */ |
22 | #define PSIMD_INTRINSIC inline static |
23 | #elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) |
24 | /* Generic C99 compiler */ |
25 | #define PSIMD_INTRINSIC inline static |
26 | #else |
27 | /* Generic C compiler */ |
28 | #define PSIMD_INTRINSIC static |
29 | #endif |
30 | |
31 | #if defined(__GNUC__) || defined(__clang__) |
32 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
33 | #include <arm_neon.h> |
34 | #endif |
35 | |
36 | #if defined(__SSE2__) |
37 | #include <emmintrin.h> |
38 | #endif |
39 | |
40 | #if defined(__SSE3__) |
41 | #include <pmmintrin.h> |
42 | #endif |
43 | |
44 | #if defined(__SSSE3__) |
45 | #include <tmmintrin.h> |
46 | #endif |
47 | |
48 | #if defined(__SSE4_1__) |
49 | #include <smmintrin.h> |
50 | #endif |
51 | |
52 | #if defined(__SSE4_2__) |
53 | #include <nmmintrin.h> |
54 | #endif |
55 | |
56 | #if defined(__AVX__) |
57 | #include <immintrin.h> |
58 | #endif |
59 | #elif defined(_MSC_VER) |
60 | #include <intrin.h> |
61 | #endif |
62 | |
63 | #if defined(__cplusplus) |
64 | #define PSIMD_CXX_SYNTAX |
65 | #elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) |
66 | #define PSIMD_C11_SYNTAX |
67 | #elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) |
68 | #define PSIMD_C99_SYNTAX |
69 | #else |
70 | #define PSIMD_C89_SYNTAX |
71 | #endif |
72 | |
73 | #if defined(__cplusplus) && (__cplusplus >= 201103L) |
74 | #include <cstddef> |
75 | #include <cstdint> |
76 | #elif !defined(__OPENCL_VERSION__) |
77 | #include <stddef.h> |
78 | #include <stdint.h> |
79 | #endif |
80 | |
81 | #if defined(__GNUC__) || defined(__clang__) |
82 | #define PSIMD_HAVE_F64 0 |
83 | #define PSIMD_HAVE_F32 1 |
84 | #define PSIMD_HAVE_U8 1 |
85 | #define PSIMD_HAVE_S8 1 |
86 | #define PSIMD_HAVE_U16 1 |
87 | #define PSIMD_HAVE_S16 1 |
88 | #define PSIMD_HAVE_U32 1 |
89 | #define PSIMD_HAVE_S32 1 |
90 | #define PSIMD_HAVE_U64 0 |
91 | #define PSIMD_HAVE_S64 0 |
92 | |
93 | typedef int8_t psimd_s8 __attribute__((vector_size(16), aligned(1))); |
94 | typedef uint8_t psimd_u8 __attribute__((vector_size(16), aligned(1))); |
95 | typedef int16_t psimd_s16 __attribute__((vector_size(16), aligned(2))); |
96 | typedef uint16_t psimd_u16 __attribute__((vector_size(16), aligned(2))); |
97 | typedef int32_t psimd_s32 __attribute__((vector_size(16), aligned(4))); |
98 | typedef uint32_t psimd_u32 __attribute__((vector_size(16), aligned(4))); |
99 | typedef float psimd_f32 __attribute__((vector_size(16), aligned(4))); |
100 | |
101 | typedef struct { |
102 | psimd_s8 lo; |
103 | psimd_s8 hi; |
104 | } psimd_s8x2; |
105 | |
106 | typedef struct { |
107 | psimd_u8 lo; |
108 | psimd_u8 hi; |
109 | } psimd_u8x2; |
110 | |
111 | typedef struct { |
112 | psimd_s16 lo; |
113 | psimd_s16 hi; |
114 | } psimd_s16x2; |
115 | |
116 | typedef struct { |
117 | psimd_u16 lo; |
118 | psimd_u16 hi; |
119 | } psimd_u16x2; |
120 | |
121 | typedef struct { |
122 | psimd_s32 lo; |
123 | psimd_s32 hi; |
124 | } psimd_s32x2; |
125 | |
126 | typedef struct { |
127 | psimd_u32 lo; |
128 | psimd_u32 hi; |
129 | } psimd_u32x2; |
130 | |
131 | typedef struct { |
132 | psimd_f32 lo; |
133 | psimd_f32 hi; |
134 | } psimd_f32x2; |
135 | |
136 | /* Bit casts */ |
137 | PSIMD_INTRINSIC psimd_u32x2 psimd_cast_s32x2_u32x2(psimd_s32x2 v) { |
138 | return (psimd_u32x2) { .lo = (psimd_u32) v.lo, .hi = (psimd_u32) v.hi }; |
139 | } |
140 | |
141 | PSIMD_INTRINSIC psimd_f32x2 psimd_cast_s32x2_f32x2(psimd_s32x2 v) { |
142 | return (psimd_f32x2) { .lo = (psimd_f32) v.lo, .hi = (psimd_f32) v.hi }; |
143 | } |
144 | |
145 | PSIMD_INTRINSIC psimd_s32x2 psimd_cast_u32x2_s32x2(psimd_u32x2 v) { |
146 | return (psimd_s32x2) { .lo = (psimd_s32) v.lo, .hi = (psimd_s32) v.hi }; |
147 | } |
148 | |
149 | PSIMD_INTRINSIC psimd_f32x2 psimd_cast_u32x2_f32x2(psimd_u32x2 v) { |
150 | return (psimd_f32x2) { .lo = (psimd_f32) v.lo, .hi = (psimd_f32) v.hi }; |
151 | } |
152 | |
153 | PSIMD_INTRINSIC psimd_s32x2 psimd_cast_f32x2_s32x2(psimd_f32x2 v) { |
154 | return (psimd_s32x2) { .lo = (psimd_s32) v.lo, .hi = (psimd_s32) v.hi }; |
155 | } |
156 | |
157 | PSIMD_INTRINSIC psimd_u32x2 psimd_cast_f32x2_u32x2(psimd_f32x2 v) { |
158 | return (psimd_u32x2) { .lo = (psimd_u32) v.lo, .hi = (psimd_u32) v.hi }; |
159 | } |
160 | |
161 | /* Swap */ |
162 | PSIMD_INTRINSIC void psimd_swap_s8(psimd_s8 a[1], psimd_s8 b[1]) { |
163 | const psimd_s8 new_a = *b; |
164 | const psimd_s8 new_b = *a; |
165 | *a = new_a; |
166 | *b = new_b; |
167 | } |
168 | |
169 | PSIMD_INTRINSIC void psimd_swap_u8(psimd_u8 a[1], psimd_u8 b[1]) { |
170 | const psimd_u8 new_a = *b; |
171 | const psimd_u8 new_b = *a; |
172 | *a = new_a; |
173 | *b = new_b; |
174 | } |
175 | |
176 | PSIMD_INTRINSIC void psimd_swap_s16(psimd_s16 a[1], psimd_s16 b[1]) { |
177 | const psimd_s16 new_a = *b; |
178 | const psimd_s16 new_b = *a; |
179 | *a = new_a; |
180 | *b = new_b; |
181 | } |
182 | |
183 | PSIMD_INTRINSIC void psimd_swap_u16(psimd_u16 a[1], psimd_u16 b[1]) { |
184 | const psimd_u16 new_a = *b; |
185 | const psimd_u16 new_b = *a; |
186 | *a = new_a; |
187 | *b = new_b; |
188 | } |
189 | |
190 | PSIMD_INTRINSIC void psimd_swap_s32(psimd_s32 a[1], psimd_s32 b[1]) { |
191 | const psimd_s32 new_a = *b; |
192 | const psimd_s32 new_b = *a; |
193 | *a = new_a; |
194 | *b = new_b; |
195 | } |
196 | |
197 | PSIMD_INTRINSIC void psimd_swap_u32(psimd_u32 a[1], psimd_u32 b[1]) { |
198 | const psimd_u32 new_a = *b; |
199 | const psimd_u32 new_b = *a; |
200 | *a = new_a; |
201 | *b = new_b; |
202 | } |
203 | |
204 | PSIMD_INTRINSIC void psimd_swap_f32(psimd_f32 a[1], psimd_f32 b[1]) { |
205 | const psimd_f32 new_a = *b; |
206 | const psimd_f32 new_b = *a; |
207 | *a = new_a; |
208 | *b = new_b; |
209 | } |
210 | |
211 | /* Zero-initialization */ |
212 | PSIMD_INTRINSIC psimd_s8 psimd_zero_s8(void) { |
213 | return (psimd_s8) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; |
214 | } |
215 | |
216 | PSIMD_INTRINSIC psimd_u8 psimd_zero_u8(void) { |
217 | return (psimd_u8) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; |
218 | } |
219 | |
220 | PSIMD_INTRINSIC psimd_s16 psimd_zero_s16(void) { |
221 | return (psimd_s16) { 0, 0, 0, 0, 0, 0, 0, 0 }; |
222 | } |
223 | |
224 | PSIMD_INTRINSIC psimd_u16 psimd_zero_u16(void) { |
225 | return (psimd_u16) { 0, 0, 0, 0, 0, 0, 0, 0 }; |
226 | } |
227 | |
228 | PSIMD_INTRINSIC psimd_s32 psimd_zero_s32(void) { |
229 | return (psimd_s32) { 0, 0, 0, 0 }; |
230 | } |
231 | |
232 | PSIMD_INTRINSIC psimd_u32 psimd_zero_u32(void) { |
233 | return (psimd_u32) { 0, 0, 0, 0 }; |
234 | } |
235 | |
236 | PSIMD_INTRINSIC psimd_f32 psimd_zero_f32(void) { |
237 | return (psimd_f32) { 0.0f, 0.0f, 0.0f, 0.0f }; |
238 | } |
239 | |
240 | /* Initialization to the same constant */ |
241 | PSIMD_INTRINSIC psimd_s8 psimd_splat_s8(int8_t c) { |
242 | return (psimd_s8) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c }; |
243 | } |
244 | |
245 | PSIMD_INTRINSIC psimd_u8 psimd_splat_u8(uint8_t c) { |
246 | return (psimd_u8) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c }; |
247 | } |
248 | |
249 | PSIMD_INTRINSIC psimd_s16 psimd_splat_s16(int16_t c) { |
250 | return (psimd_s16) { c, c, c, c, c, c, c, c }; |
251 | } |
252 | |
253 | PSIMD_INTRINSIC psimd_u16 psimd_splat_u16(uint16_t c) { |
254 | return (psimd_u16) { c, c, c, c, c, c, c, c }; |
255 | } |
256 | |
257 | PSIMD_INTRINSIC psimd_s32 psimd_splat_s32(int32_t c) { |
258 | return (psimd_s32) { c, c, c, c }; |
259 | } |
260 | |
261 | PSIMD_INTRINSIC psimd_u32 psimd_splat_u32(uint32_t c) { |
262 | return (psimd_u32) { c, c, c, c }; |
263 | } |
264 | |
265 | PSIMD_INTRINSIC psimd_f32 psimd_splat_f32(float c) { |
266 | return (psimd_f32) { c, c, c, c }; |
267 | } |
268 | |
269 | /* Load vector */ |
270 | PSIMD_INTRINSIC psimd_s8 psimd_load_s8(const void* address) { |
271 | return *((const psimd_s8*) address); |
272 | } |
273 | |
274 | PSIMD_INTRINSIC psimd_u8 psimd_load_u8(const void* address) { |
275 | return *((const psimd_u8*) address); |
276 | } |
277 | |
278 | PSIMD_INTRINSIC psimd_s16 psimd_load_s16(const void* address) { |
279 | return *((const psimd_s16*) address); |
280 | } |
281 | |
282 | PSIMD_INTRINSIC psimd_u16 psimd_load_u16(const void* address) { |
283 | return *((const psimd_u16*) address); |
284 | } |
285 | |
286 | PSIMD_INTRINSIC psimd_s32 psimd_load_s32(const void* address) { |
287 | return *((const psimd_s32*) address); |
288 | } |
289 | |
290 | PSIMD_INTRINSIC psimd_u32 psimd_load_u32(const void* address) { |
291 | return *((const psimd_u32*) address); |
292 | } |
293 | |
294 | PSIMD_INTRINSIC psimd_f32 psimd_load_f32(const void* address) { |
295 | return *((const psimd_f32*) address); |
296 | } |
297 | |
298 | PSIMD_INTRINSIC psimd_s8 psimd_load_splat_s8(const void* address) { |
299 | return psimd_splat_s8(*((const int8_t*) address)); |
300 | } |
301 | |
302 | PSIMD_INTRINSIC psimd_u8 psimd_load_splat_u8(const void* address) { |
303 | return psimd_splat_u8(*((const uint8_t*) address)); |
304 | } |
305 | |
306 | PSIMD_INTRINSIC psimd_s16 psimd_load_splat_s16(const void* address) { |
307 | return psimd_splat_s16(*((const int16_t*) address)); |
308 | } |
309 | |
310 | PSIMD_INTRINSIC psimd_u16 psimd_load_splat_u16(const void* address) { |
311 | return psimd_splat_u16(*((const uint16_t*) address)); |
312 | } |
313 | |
314 | PSIMD_INTRINSIC psimd_s32 psimd_load_splat_s32(const void* address) { |
315 | return psimd_splat_s32(*((const int32_t*) address)); |
316 | } |
317 | |
318 | PSIMD_INTRINSIC psimd_u32 psimd_load_splat_u32(const void* address) { |
319 | return psimd_splat_u32(*((const uint32_t*) address)); |
320 | } |
321 | |
322 | PSIMD_INTRINSIC psimd_f32 psimd_load_splat_f32(const void* address) { |
323 | return psimd_splat_f32(*((const float*) address)); |
324 | } |
325 | |
326 | PSIMD_INTRINSIC psimd_s32 psimd_load1_s32(const void* address) { |
327 | return (psimd_s32) { *((const int32_t*) address), 0, 0, 0 }; |
328 | } |
329 | |
330 | PSIMD_INTRINSIC psimd_u32 psimd_load1_u32(const void* address) { |
331 | return (psimd_u32) { *((const uint32_t*) address), 0, 0, 0 }; |
332 | } |
333 | |
334 | PSIMD_INTRINSIC psimd_f32 psimd_load1_f32(const void* address) { |
335 | return (psimd_f32) { *((const float*) address), 0.0f, 0.0f, 0.0f }; |
336 | } |
337 | |
338 | PSIMD_INTRINSIC psimd_s32 psimd_load2_s32(const void* address) { |
339 | const int32_t* address_s32 = (const int32_t*) address; |
340 | return (psimd_s32) { address_s32[0], address_s32[1], 0, 0 }; |
341 | } |
342 | |
343 | PSIMD_INTRINSIC psimd_u32 psimd_load2_u32(const void* address) { |
344 | const uint32_t* address_u32 = (const uint32_t*) address; |
345 | return (psimd_u32) { address_u32[0], address_u32[1], 0, 0 }; |
346 | } |
347 | |
348 | PSIMD_INTRINSIC psimd_f32 psimd_load2_f32(const void* address) { |
349 | const float* address_f32 = (const float*) address; |
350 | return (psimd_f32) { address_f32[0], address_f32[1], 0.0f, 0.0f }; |
351 | } |
352 | |
353 | PSIMD_INTRINSIC psimd_s32 psimd_load3_s32(const void* address) { |
354 | const int32_t* address_s32 = (const int32_t*) address; |
355 | return (psimd_s32) { address_s32[0], address_s32[1], address_s32[2], 0 }; |
356 | } |
357 | |
358 | PSIMD_INTRINSIC psimd_u32 psimd_load3_u32(const void* address) { |
359 | const uint32_t* address_u32 = (const uint32_t*) address; |
360 | return (psimd_u32) { address_u32[0], address_u32[1], address_u32[2], 0 }; |
361 | } |
362 | |
363 | PSIMD_INTRINSIC psimd_f32 psimd_load3_f32(const void* address) { |
364 | const float* address_f32 = (const float*) address; |
365 | return (psimd_f32) { address_f32[0], address_f32[1], address_f32[2], 0.0f }; |
366 | } |
367 | |
368 | PSIMD_INTRINSIC psimd_s32 psimd_load4_s32(const void* address) { |
369 | return psimd_load_s32(address); |
370 | } |
371 | |
372 | PSIMD_INTRINSIC psimd_u32 psimd_load4_u32(const void* address) { |
373 | return psimd_load_u32(address); |
374 | } |
375 | |
376 | PSIMD_INTRINSIC psimd_f32 psimd_load4_f32(const void* address) { |
377 | return psimd_load_f32(address); |
378 | } |
379 | |
380 | PSIMD_INTRINSIC psimd_f32 psimd_load_stride2_f32(const void* address) { |
381 | const psimd_f32 v0x1x = psimd_load_f32(address); |
382 | const psimd_f32 vx2x3 = psimd_load_f32((const float*) address + 3); |
383 | #if defined(__clang__) |
384 | return __builtin_shufflevector(v0x1x, vx2x3, 0, 2, 5, 7); |
385 | #else |
386 | return __builtin_shuffle(v0x1x, vx2x3, (psimd_s32) { 0, 2, 5, 7 }); |
387 | #endif |
388 | } |
389 | |
390 | PSIMD_INTRINSIC psimd_f32 psimd_load1_stride2_f32(const void* address) { |
391 | return psimd_load_f32(address); |
392 | } |
393 | |
394 | PSIMD_INTRINSIC psimd_f32 psimd_load2_stride2_f32(const void* address) { |
395 | const float* address_f32 = (const float*) address; |
396 | return (psimd_f32) { address_f32[0], address_f32[2], 0.0f, 0.0f }; |
397 | } |
398 | |
399 | PSIMD_INTRINSIC psimd_f32 psimd_load3_stride2_f32(const void* address) { |
400 | const psimd_f32 v0x1x = psimd_load_f32(address); |
401 | const psimd_f32 v2zzz = psimd_load1_f32((const float*) address + 2); |
402 | #if defined(__clang__) |
403 | return __builtin_shufflevector(v0x1x, v2zzz, 0, 2, 4, 6); |
404 | #else |
405 | return __builtin_shuffle(v0x1x, v2zzz, (psimd_s32) { 0, 2, 4, 6 }); |
406 | #endif |
407 | } |
408 | |
409 | PSIMD_INTRINSIC psimd_f32 psimd_load4_stride2_f32(const void* address) { |
410 | return psimd_load_stride2_f32(address); |
411 | } |
412 | |
413 | PSIMD_INTRINSIC psimd_f32 psimd_load_stride_f32(const void* address, size_t stride) { |
414 | const float* address0_f32 = (const float*) address; |
415 | const float* address1_f32 = address0_f32 + stride; |
416 | const float* address2_f32 = address1_f32 + stride; |
417 | const float* address3_f32 = address2_f32 + stride; |
418 | return (psimd_f32) { *address0_f32, *address1_f32, *address2_f32, *address3_f32 }; |
419 | } |
420 | |
421 | PSIMD_INTRINSIC psimd_f32 psimd_load1_stride_f32(const void* address, size_t stride) { |
422 | return psimd_load1_f32(address); |
423 | } |
424 | |
425 | PSIMD_INTRINSIC psimd_f32 psimd_load2_stride_f32(const void* address, size_t stride) { |
426 | const float* address_f32 = (const float*) address; |
427 | return (psimd_f32) { address_f32[0], address_f32[stride], 0.0f, 0.0f }; |
428 | } |
429 | |
430 | PSIMD_INTRINSIC psimd_f32 psimd_load3_stride_f32(const void* address, size_t stride) { |
431 | const float* address0_f32 = (const float*) address; |
432 | const float* address1_f32 = address0_f32 + stride; |
433 | const float* address2_f32 = address1_f32 + stride; |
434 | return (psimd_f32) { *address0_f32, *address1_f32, *address2_f32, 0.0f }; |
435 | } |
436 | |
437 | PSIMD_INTRINSIC psimd_f32 psimd_load4_stride_f32(const void* address, size_t stride) { |
438 | return psimd_load_stride_f32(address, stride); |
439 | } |
440 | |
441 | /* Store vector */ |
442 | PSIMD_INTRINSIC void psimd_store_s8(void* address, psimd_s8 value) { |
443 | *((psimd_s8*) address) = value; |
444 | } |
445 | |
446 | PSIMD_INTRINSIC void psimd_store_u8(void* address, psimd_u8 value) { |
447 | *((psimd_u8*) address) = value; |
448 | } |
449 | |
450 | PSIMD_INTRINSIC void psimd_store_s16(void* address, psimd_s16 value) { |
451 | *((psimd_s16*) address) = value; |
452 | } |
453 | |
454 | PSIMD_INTRINSIC void psimd_store_u16(void* address, psimd_u16 value) { |
455 | *((psimd_u16*) address) = value; |
456 | } |
457 | |
458 | PSIMD_INTRINSIC void psimd_store_s32(void* address, psimd_s32 value) { |
459 | *((psimd_s32*) address) = value; |
460 | } |
461 | |
462 | PSIMD_INTRINSIC void psimd_store_u32(void* address, psimd_u32 value) { |
463 | *((psimd_u32*) address) = value; |
464 | } |
465 | |
466 | PSIMD_INTRINSIC void psimd_store_f32(void* address, psimd_f32 value) { |
467 | *((psimd_f32*) address) = value; |
468 | } |
469 | |
470 | PSIMD_INTRINSIC void psimd_store1_s32(void* address, psimd_s32 value) { |
471 | *((int32_t*) address) = value[0]; |
472 | } |
473 | |
474 | PSIMD_INTRINSIC void psimd_store1_u32(void* address, psimd_u32 value) { |
475 | *((uint32_t*) address) = value[0]; |
476 | } |
477 | |
478 | PSIMD_INTRINSIC void psimd_store1_f32(void* address, psimd_f32 value) { |
479 | *((float*) address) = value[0]; |
480 | } |
481 | |
482 | PSIMD_INTRINSIC void psimd_store2_s32(void* address, psimd_s32 value) { |
483 | int32_t* address_s32 = (int32_t*) address; |
484 | address_s32[0] = value[0]; |
485 | address_s32[1] = value[1]; |
486 | } |
487 | |
488 | PSIMD_INTRINSIC void psimd_store2_u32(void* address, psimd_u32 value) { |
489 | uint32_t* address_u32 = (uint32_t*) address; |
490 | address_u32[0] = value[0]; |
491 | address_u32[1] = value[1]; |
492 | } |
493 | |
494 | PSIMD_INTRINSIC void psimd_store2_f32(void* address, psimd_f32 value) { |
495 | float* address_f32 = (float*) address; |
496 | address_f32[0] = value[0]; |
497 | address_f32[1] = value[1]; |
498 | } |
499 | |
500 | PSIMD_INTRINSIC void psimd_store3_s32(void* address, psimd_s32 value) { |
501 | int32_t* address_s32 = (int32_t*) address; |
502 | address_s32[0] = value[0]; |
503 | address_s32[1] = value[1]; |
504 | address_s32[2] = value[2]; |
505 | } |
506 | |
507 | PSIMD_INTRINSIC void psimd_store3_u32(void* address, psimd_u32 value) { |
508 | uint32_t* address_u32 = (uint32_t*) address; |
509 | address_u32[0] = value[0]; |
510 | address_u32[1] = value[1]; |
511 | address_u32[2] = value[2]; |
512 | } |
513 | |
514 | PSIMD_INTRINSIC void psimd_store3_f32(void* address, psimd_f32 value) { |
515 | float* address_f32 = (float*) address; |
516 | address_f32[0] = value[0]; |
517 | address_f32[1] = value[1]; |
518 | address_f32[2] = value[2]; |
519 | } |
520 | |
521 | PSIMD_INTRINSIC void psimd_store4_s32(void* address, psimd_s32 value) { |
522 | psimd_store_s32(address, value); |
523 | } |
524 | |
525 | PSIMD_INTRINSIC void psimd_store4_u32(void* address, psimd_u32 value) { |
526 | psimd_store_u32(address, value); |
527 | } |
528 | |
529 | PSIMD_INTRINSIC void psimd_store4_f32(void* address, psimd_f32 value) { |
530 | psimd_store_f32(address, value); |
531 | } |
532 | |
533 | PSIMD_INTRINSIC void psimd_store_stride_f32(void* address, size_t stride, psimd_f32 value) { |
534 | float* address0_f32 = (float*) address; |
535 | float* address1_f32 = address0_f32 + stride; |
536 | float* address2_f32 = address1_f32 + stride; |
537 | float* address3_f32 = address2_f32 + stride; |
538 | *address0_f32 = value[0]; |
539 | *address1_f32 = value[1]; |
540 | *address2_f32 = value[2]; |
541 | *address3_f32 = value[3]; |
542 | } |
543 | |
544 | PSIMD_INTRINSIC void psimd_store1_stride_f32(void* address, size_t stride, psimd_f32 value) { |
545 | psimd_store1_f32(address, value); |
546 | } |
547 | |
548 | PSIMD_INTRINSIC void psimd_store2_stride_f32(void* address, size_t stride, psimd_f32 value) { |
549 | float* address_f32 = (float*) address; |
550 | address_f32[0] = value[0]; |
551 | address_f32[stride] = value[1]; |
552 | } |
553 | |
554 | PSIMD_INTRINSIC void psimd_store3_stride_f32(void* address, size_t stride, psimd_f32 value) { |
555 | float* address0_f32 = (float*) address; |
556 | float* address1_f32 = address0_f32 + stride; |
557 | float* address2_f32 = address1_f32 + stride; |
558 | *address0_f32 = value[0]; |
559 | *address1_f32 = value[1]; |
560 | *address2_f32 = value[2]; |
561 | } |
562 | |
563 | /* Vector addition */ |
564 | PSIMD_INTRINSIC psimd_s8 psimd_add_s8(psimd_s8 a, psimd_s8 b) { |
565 | return a + b; |
566 | } |
567 | |
568 | PSIMD_INTRINSIC psimd_u8 psimd_add_u8(psimd_u8 a, psimd_u8 b) { |
569 | return a + b; |
570 | } |
571 | |
572 | PSIMD_INTRINSIC psimd_s16 psimd_add_s16(psimd_s16 a, psimd_s16 b) { |
573 | return a + b; |
574 | } |
575 | |
576 | PSIMD_INTRINSIC psimd_u16 psimd_add_u16(psimd_u16 a, psimd_u16 b) { |
577 | return a + b; |
578 | } |
579 | |
580 | PSIMD_INTRINSIC psimd_s32 psimd_add_s32(psimd_s32 a, psimd_s32 b) { |
581 | return a + b; |
582 | } |
583 | |
584 | PSIMD_INTRINSIC psimd_u32 psimd_add_u32(psimd_u32 a, psimd_u32 b) { |
585 | return a + b; |
586 | } |
587 | |
588 | PSIMD_INTRINSIC psimd_f32 psimd_add_f32(psimd_f32 a, psimd_f32 b) { |
589 | #if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__) |
590 | return (psimd_f32) vaddq_f32((float32x4_t) a, (float32x4_t) b); |
591 | #else |
592 | return a + b; |
593 | #endif |
594 | } |
595 | |
596 | /* Vector subtraction */ |
597 | PSIMD_INTRINSIC psimd_s8 psimd_sub_s8(psimd_s8 a, psimd_s8 b) { |
598 | return a - b; |
599 | } |
600 | |
601 | PSIMD_INTRINSIC psimd_u8 psimd_sub_u8(psimd_u8 a, psimd_u8 b) { |
602 | return a - b; |
603 | } |
604 | |
605 | PSIMD_INTRINSIC psimd_s16 psimd_sub_s16(psimd_s16 a, psimd_s16 b) { |
606 | return a - b; |
607 | } |
608 | |
609 | PSIMD_INTRINSIC psimd_u16 psimd_sub_u16(psimd_u16 a, psimd_u16 b) { |
610 | return a - b; |
611 | } |
612 | |
613 | PSIMD_INTRINSIC psimd_s32 psimd_sub_s32(psimd_s32 a, psimd_s32 b) { |
614 | return a - b; |
615 | } |
616 | |
617 | PSIMD_INTRINSIC psimd_u32 psimd_sub_u32(psimd_u32 a, psimd_u32 b) { |
618 | return a - b; |
619 | } |
620 | |
621 | PSIMD_INTRINSIC psimd_f32 psimd_sub_f32(psimd_f32 a, psimd_f32 b) { |
622 | #if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__) |
623 | return (psimd_f32) vsubq_f32((float32x4_t) a, (float32x4_t) b); |
624 | #else |
625 | return a - b; |
626 | #endif |
627 | } |
628 | |
629 | /* Vector multiplication */ |
630 | PSIMD_INTRINSIC psimd_s8 psimd_mul_s8(psimd_s8 a, psimd_s8 b) { |
631 | return a * b; |
632 | } |
633 | |
634 | PSIMD_INTRINSIC psimd_u8 psimd_mul_u8(psimd_u8 a, psimd_u8 b) { |
635 | return a * b; |
636 | } |
637 | |
638 | PSIMD_INTRINSIC psimd_s16 psimd_mul_s16(psimd_s16 a, psimd_s16 b) { |
639 | return a * b; |
640 | } |
641 | |
642 | PSIMD_INTRINSIC psimd_u16 psimd_mul_u16(psimd_u16 a, psimd_u16 b) { |
643 | return a * b; |
644 | } |
645 | |
646 | PSIMD_INTRINSIC psimd_s32 psimd_mul_s32(psimd_s32 a, psimd_s32 b) { |
647 | return a * b; |
648 | } |
649 | |
650 | PSIMD_INTRINSIC psimd_u32 psimd_mul_u32(psimd_u32 a, psimd_u32 b) { |
651 | return a * b; |
652 | } |
653 | |
654 | PSIMD_INTRINSIC psimd_f32 psimd_mul_f32(psimd_f32 a, psimd_f32 b) { |
655 | #if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__) |
656 | return (psimd_f32) vmulq_f32((float32x4_t) a, (float32x4_t) b); |
657 | #else |
658 | return a * b; |
659 | #endif |
660 | } |
661 | |
662 | /* Quasi-Fused Multiply-Add */ |
663 | PSIMD_INTRINSIC psimd_f32 psimd_qfma_f32(psimd_f32 a, psimd_f32 b, psimd_f32 c) { |
664 | #if defined(__aarch64__) || defined(__ARM_NEON__) && defined(__ARM_FEATURE_FMA) |
665 | return (psimd_f32) vfmaq_f32((float32x4_t) a, (float32x4_t) b, (float32x4_t) c); |
666 | #elif (defined(__x86_64__) || defined(__i386__) || defined(__i686__)) && defined(__FMA__) |
667 | return (psimd_f32) _mm_fmadd_ps((__m128) b, (__m128) c, (__m128) a); |
668 | #elif (defined(__x86_64__) || defined(__i386__) || defined(__i686__)) && defined(__FMA4__) |
669 | return (psimd_f32) _mm_macc_ps((__m128) b, (__m128) c, (__m128) a); |
670 | #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__) && PSIMD_ENABLE_WASM_QFMA |
671 | return (psimd_f32) __builtin_wasm_qfma_f32x4(a, b, c); |
672 | #else |
673 | return a + b * c; |
674 | #endif |
675 | } |
676 | |
677 | PSIMD_INTRINSIC psimd_f32 psimd_div_f32(psimd_f32 a, psimd_f32 b) { |
678 | return a / b; |
679 | } |
680 | |
681 | /* Vector and */ |
682 | PSIMD_INTRINSIC psimd_f32 psimd_andmask_f32(psimd_s32 mask, psimd_f32 v) { |
683 | return (psimd_f32) (mask & (psimd_s32) v); |
684 | } |
685 | |
686 | /* Vector and-not */ |
687 | PSIMD_INTRINSIC psimd_f32 psimd_andnotmask_f32(psimd_s32 mask, psimd_f32 v) { |
688 | return (psimd_f32) (~mask & (psimd_s32) v); |
689 | } |
690 | |
691 | /* Vector blend */ |
692 | PSIMD_INTRINSIC psimd_s8 psimd_blend_s8(psimd_s8 mask, psimd_s8 a, psimd_s8 b) { |
693 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
694 | return (psimd_s8) vbslq_s8((uint8x16_t) mask, (int8x16_t) a, (int8x16_t) b); |
695 | #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__) |
696 | return (psimd_s8) __builtin_wasm_bitselect(a, b, mask); |
697 | #else |
698 | return (mask & a) | (~mask & b); |
699 | #endif |
700 | } |
701 | |
702 | PSIMD_INTRINSIC psimd_u8 psimd_blend_u8(psimd_s8 mask, psimd_u8 a, psimd_u8 b) { |
703 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
704 | return (psimd_u8) vbslq_u8((uint8x16_t) mask, (uint8x16_t) a, (uint8x16_t) b); |
705 | #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__) |
706 | return (psimd_u8) __builtin_wasm_bitselect(a, b, mask); |
707 | #else |
708 | return (psimd_u8) ((mask & (psimd_s8) a) | (~mask & (psimd_s8) b)); |
709 | #endif |
710 | } |
711 | |
712 | PSIMD_INTRINSIC psimd_s16 psimd_blend_s16(psimd_s16 mask, psimd_s16 a, psimd_s16 b) { |
713 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
714 | return (psimd_s16) vbslq_s16((uint16x8_t) mask, (int16x8_t) a, (int16x8_t) b); |
715 | #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__) |
716 | return (psimd_s16) __builtin_wasm_bitselect(a, b, mask); |
717 | #else |
718 | return (mask & a) | (~mask & b); |
719 | #endif |
720 | } |
721 | |
722 | PSIMD_INTRINSIC psimd_u16 psimd_blend_u16(psimd_s16 mask, psimd_u16 a, psimd_u16 b) { |
723 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
724 | return (psimd_u16) vbslq_u16((uint16x8_t) mask, (uint16x8_t) a, (uint16x8_t) b); |
725 | #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__) |
726 | return (psimd_u16) __builtin_wasm_bitselect(a, b, mask); |
727 | #else |
728 | return (psimd_u16) ((mask & (psimd_s16) a) | (~mask & (psimd_s16) b)); |
729 | #endif |
730 | } |
731 | |
732 | PSIMD_INTRINSIC psimd_s32 psimd_blend_s32(psimd_s32 mask, psimd_s32 a, psimd_s32 b) { |
733 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
734 | return (psimd_s32) vbslq_s32((uint32x4_t) mask, (int32x4_t) a, (int32x4_t) b); |
735 | #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__) |
736 | return (psimd_s32) __builtin_wasm_bitselect(a, b, mask); |
737 | #else |
738 | return (mask & a) | (~mask & b); |
739 | #endif |
740 | } |
741 | |
742 | PSIMD_INTRINSIC psimd_u32 psimd_blend_u32(psimd_s32 mask, psimd_u32 a, psimd_u32 b) { |
743 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
744 | return (psimd_u32) vbslq_u32((uint32x4_t) mask, (uint32x4_t) a, (uint32x4_t) b); |
745 | #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__) |
746 | return (psimd_u32) __builtin_wasm_bitselect(a, b, mask); |
747 | #else |
748 | return (psimd_u32) ((mask & (psimd_s32) a) | (~mask & (psimd_s32) b)); |
749 | #endif |
750 | } |
751 | |
752 | PSIMD_INTRINSIC psimd_f32 psimd_blend_f32(psimd_s32 mask, psimd_f32 a, psimd_f32 b) { |
753 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
754 | return (psimd_f32) vbslq_f32((uint32x4_t) mask, (float32x4_t) a, (float32x4_t) b); |
755 | #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__) |
756 | return (psimd_f32) __builtin_wasm_bitselect(a, b, mask); |
757 | #else |
758 | return (psimd_f32) ((mask & (psimd_s32) a) | (~mask & (psimd_s32) b)); |
759 | #endif |
760 | } |
761 | |
762 | /* Vector blend on sign */ |
763 | PSIMD_INTRINSIC psimd_s8 psimd_signblend_s8(psimd_s8 x, psimd_s8 a, psimd_s8 b) { |
764 | return psimd_blend_s8(x >> psimd_splat_s8(7), a, b); |
765 | } |
766 | |
767 | PSIMD_INTRINSIC psimd_u8 psimd_signblend_u8(psimd_s8 x, psimd_u8 a, psimd_u8 b) { |
768 | return psimd_blend_u8((x >> psimd_splat_s8(7)), a, b); |
769 | } |
770 | |
771 | PSIMD_INTRINSIC psimd_s16 psimd_signblend_s16(psimd_s16 x, psimd_s16 a, psimd_s16 b) { |
772 | return psimd_blend_s16(x >> psimd_splat_s16(15), a, b); |
773 | } |
774 | |
775 | PSIMD_INTRINSIC psimd_u16 psimd_signblend_u16(psimd_s16 x, psimd_u16 a, psimd_u16 b) { |
776 | return psimd_blend_u16((x >> psimd_splat_s16(15)), a, b); |
777 | } |
778 | |
779 | PSIMD_INTRINSIC psimd_s32 psimd_signblend_s32(psimd_s32 x, psimd_s32 a, psimd_s32 b) { |
780 | return psimd_blend_s32(x >> psimd_splat_s32(31), a, b); |
781 | } |
782 | |
783 | PSIMD_INTRINSIC psimd_u32 psimd_signblend_u32(psimd_s32 x, psimd_u32 a, psimd_u32 b) { |
784 | return psimd_blend_u32((x >> psimd_splat_s32(31)), a, b); |
785 | } |
786 | |
787 | PSIMD_INTRINSIC psimd_f32 psimd_signblend_f32(psimd_f32 x, psimd_f32 a, psimd_f32 b) { |
788 | const psimd_s32 mask = (psimd_s32) x >> psimd_splat_s32(31); |
789 | return psimd_blend_f32(mask, a, b); |
790 | } |
791 | |
792 | /* Vector absolute value */ |
793 | PSIMD_INTRINSIC psimd_f32 psimd_abs_f32(psimd_f32 v) { |
794 | const psimd_s32 mask = (psimd_s32) psimd_splat_f32(-0.0f); |
795 | return (psimd_f32) ((psimd_s32) v & ~mask); |
796 | } |
797 | |
798 | /* Vector negation */ |
799 | PSIMD_INTRINSIC psimd_f32 psimd_neg_f32(psimd_f32 v) { |
800 | const psimd_s32 mask = (psimd_s32) psimd_splat_f32(-0.0f); |
801 | return (psimd_f32) ((psimd_s32) v ^ mask); |
802 | } |
803 | |
804 | /* Vector maximum */ |
805 | PSIMD_INTRINSIC psimd_s8 psimd_max_s8(psimd_s8 a, psimd_s8 b) { |
806 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
807 | return (psimd_s8) vmaxq_s8((int8x16_t) a, (int8x16_t) b); |
808 | #else |
809 | return psimd_blend_s8(a > b, a, b); |
810 | #endif |
811 | } |
812 | |
813 | PSIMD_INTRINSIC psimd_u8 psimd_max_u8(psimd_u8 a, psimd_u8 b) { |
814 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
815 | return (psimd_u8) vmaxq_u8((uint8x16_t) a, (uint8x16_t) b); |
816 | #else |
817 | return psimd_blend_u8(a > b, a, b); |
818 | #endif |
819 | } |
820 | |
821 | PSIMD_INTRINSIC psimd_s16 psimd_max_s16(psimd_s16 a, psimd_s16 b) { |
822 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
823 | return (psimd_s16) vmaxq_s16((int16x8_t) a, (int16x8_t) b); |
824 | #else |
825 | return psimd_blend_s16(a > b, a, b); |
826 | #endif |
827 | } |
828 | |
829 | PSIMD_INTRINSIC psimd_u16 psimd_max_u16(psimd_u16 a, psimd_u16 b) { |
830 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
831 | return (psimd_u16) vmaxq_u16((uint16x8_t) a, (uint16x8_t) b); |
832 | #else |
833 | return psimd_blend_u16(a > b, a, b); |
834 | #endif |
835 | } |
836 | |
837 | PSIMD_INTRINSIC psimd_s32 psimd_max_s32(psimd_s32 a, psimd_s32 b) { |
838 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
839 | return (psimd_s32) vmaxq_s32((int32x4_t) a, (int32x4_t) b); |
840 | #else |
841 | return psimd_blend_s32(a > b, a, b); |
842 | #endif |
843 | } |
844 | |
845 | PSIMD_INTRINSIC psimd_u32 psimd_max_u32(psimd_u32 a, psimd_u32 b) { |
846 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
847 | return (psimd_u32) vmaxq_u32((uint32x4_t) a, (uint32x4_t) b); |
848 | #else |
849 | return psimd_blend_u32(a > b, a, b); |
850 | #endif |
851 | } |
852 | |
853 | PSIMD_INTRINSIC psimd_f32 psimd_max_f32(psimd_f32 a, psimd_f32 b) { |
854 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
855 | return (psimd_f32) vmaxq_f32((float32x4_t) a, (float32x4_t) b); |
856 | #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__) |
857 | return __builtin_wasm_max_f32x4(a, b); |
858 | #else |
859 | return psimd_blend_f32(a > b, a, b); |
860 | #endif |
861 | } |
862 | |
863 | /* Vector minimum */ |
864 | PSIMD_INTRINSIC psimd_s8 psimd_min_s8(psimd_s8 a, psimd_s8 b) { |
865 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
866 | return (psimd_s8) vminq_s8((int8x16_t) a, (int8x16_t) b); |
867 | #else |
868 | return psimd_blend_s8(a < b, a, b); |
869 | #endif |
870 | } |
871 | |
872 | PSIMD_INTRINSIC psimd_u8 psimd_min_u8(psimd_u8 a, psimd_u8 b) { |
873 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
874 | return (psimd_u8) vminq_u8((uint8x16_t) a, (uint8x16_t) b); |
875 | #else |
876 | return psimd_blend_u8(a < b, a, b); |
877 | #endif |
878 | } |
879 | |
880 | PSIMD_INTRINSIC psimd_s16 psimd_min_s16(psimd_s16 a, psimd_s16 b) { |
881 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
882 | return (psimd_s16) vminq_s16((int16x8_t) a, (int16x8_t) b); |
883 | #else |
884 | return psimd_blend_s16(a < b, a, b); |
885 | #endif |
886 | } |
887 | |
888 | PSIMD_INTRINSIC psimd_u16 psimd_min_u16(psimd_u16 a, psimd_u16 b) { |
889 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
890 | return (psimd_u16) vminq_u16((uint16x8_t) a, (uint16x8_t) b); |
891 | #else |
892 | return psimd_blend_u16(a < b, a, b); |
893 | #endif |
894 | } |
895 | |
896 | PSIMD_INTRINSIC psimd_s32 psimd_min_s32(psimd_s32 a, psimd_s32 b) { |
897 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
898 | return (psimd_s32) vminq_s32((int32x4_t) a, (int32x4_t) b); |
899 | #else |
900 | return psimd_blend_s32(a < b, a, b); |
901 | #endif |
902 | } |
903 | |
904 | PSIMD_INTRINSIC psimd_u32 psimd_min_u32(psimd_u32 a, psimd_u32 b) { |
905 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
906 | return (psimd_u32) vminq_u32((uint32x4_t) a, (uint32x4_t) b); |
907 | #else |
908 | return psimd_blend_u32(a < b, a, b); |
909 | #endif |
910 | } |
911 | |
912 | PSIMD_INTRINSIC psimd_f32 psimd_min_f32(psimd_f32 a, psimd_f32 b) { |
913 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
914 | return (psimd_f32) vminq_f32((float32x4_t) a, (float32x4_t) b); |
915 | #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__) |
916 | return __builtin_wasm_min_f32x4(a, b); |
917 | #else |
918 | return psimd_blend_f32(a < b, a, b); |
919 | #endif |
920 | } |
921 | |
922 | PSIMD_INTRINSIC psimd_f32 psimd_cvt_s32_f32(psimd_s32 v) { |
923 | #if defined(__clang__) |
924 | return __builtin_convertvector(v, psimd_f32); |
925 | #elif defined(__ARM_NEON__) || defined(__ARM_NEON) |
926 | return (psimd_f32) vcvtq_f32_s32((int32x4_t) v); |
927 | #elif defined(__SSE2__) |
928 | return (psimd_f32) _mm_cvtepi32_ps((__m128i) v); |
929 | #else |
930 | return (psimd_f32) { (float) v[0], (float) v[1], (float) v[2], (float) v[3] }; |
931 | #endif |
932 | } |
933 | |
934 | /* Broadcast vector element */ |
935 | #if defined(__clang__) |
936 | PSIMD_INTRINSIC psimd_f32 psimd_splat0_f32(psimd_f32 v) { |
937 | return __builtin_shufflevector(v, v, 0, 0, 0, 0); |
938 | } |
939 | |
940 | PSIMD_INTRINSIC psimd_f32 psimd_splat1_f32(psimd_f32 v) { |
941 | return __builtin_shufflevector(v, v, 1, 1, 1, 1); |
942 | } |
943 | |
944 | PSIMD_INTRINSIC psimd_f32 psimd_splat2_f32(psimd_f32 v) { |
945 | return __builtin_shufflevector(v, v, 2, 2, 2, 2); |
946 | } |
947 | |
948 | PSIMD_INTRINSIC psimd_f32 psimd_splat3_f32(psimd_f32 v) { |
949 | return __builtin_shufflevector(v, v, 3, 3, 3, 3); |
950 | } |
951 | #else |
952 | PSIMD_INTRINSIC psimd_f32 psimd_splat0_f32(psimd_f32 v) { |
953 | return __builtin_shuffle(v, (psimd_s32) { 0, 0, 0, 0 }); |
954 | } |
955 | |
956 | PSIMD_INTRINSIC psimd_f32 psimd_splat1_f32(psimd_f32 v) { |
957 | return __builtin_shuffle(v, (psimd_s32) { 1, 1, 1, 1 }); |
958 | } |
959 | |
960 | PSIMD_INTRINSIC psimd_f32 psimd_splat2_f32(psimd_f32 v) { |
961 | return __builtin_shuffle(v, (psimd_s32) { 2, 2, 2, 2 }); |
962 | } |
963 | |
964 | PSIMD_INTRINSIC psimd_f32 psimd_splat3_f32(psimd_f32 v) { |
965 | return __builtin_shuffle(v, (psimd_s32) { 3, 3, 3, 3 }); |
966 | } |
967 | #endif |
968 | |
969 | /* Reversal of vector elements */ |
970 | #if defined(__clang__) |
971 | PSIMD_INTRINSIC psimd_s8 psimd_reverse_s8(psimd_s8 v) { |
972 | return __builtin_shufflevector(v, v, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); |
973 | } |
974 | |
975 | PSIMD_INTRINSIC psimd_u8 psimd_reverse_u8(psimd_u8 v) { |
976 | return __builtin_shufflevector(v, v, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); |
977 | } |
978 | |
979 | PSIMD_INTRINSIC psimd_s16 psimd_reverse_s16(psimd_s16 v) { |
980 | return __builtin_shufflevector(v, v, 7, 6, 5, 4, 3, 2, 1, 0); |
981 | } |
982 | |
983 | PSIMD_INTRINSIC psimd_u16 psimd_reverse_u16(psimd_u16 v) { |
984 | return __builtin_shufflevector(v, v, 7, 6, 5, 4, 3, 2, 1, 0); |
985 | } |
986 | |
987 | PSIMD_INTRINSIC psimd_s32 psimd_reverse_s32(psimd_s32 v) { |
988 | return __builtin_shufflevector(v, v, 3, 2, 1, 0); |
989 | } |
990 | |
991 | PSIMD_INTRINSIC psimd_u32 psimd_reverse_u32(psimd_u32 v) { |
992 | return __builtin_shufflevector(v, v, 3, 2, 1, 0); |
993 | } |
994 | |
995 | PSIMD_INTRINSIC psimd_f32 psimd_reverse_f32(psimd_f32 v) { |
996 | return __builtin_shufflevector(v, v, 3, 2, 1, 0); |
997 | } |
998 | #else |
999 | PSIMD_INTRINSIC psimd_s8 psimd_reverse_s8(psimd_s8 v) { |
1000 | return __builtin_shuffle(v, (psimd_s8) { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }); |
1001 | } |
1002 | |
1003 | PSIMD_INTRINSIC psimd_u8 psimd_reverse_u8(psimd_u8 v) { |
1004 | return __builtin_shuffle(v, (psimd_s8) { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }); |
1005 | } |
1006 | |
1007 | PSIMD_INTRINSIC psimd_s16 psimd_reverse_s16(psimd_s16 v) { |
1008 | return __builtin_shuffle(v, (psimd_s16) { 7, 6, 5, 4, 3, 2, 1, 0 }); |
1009 | } |
1010 | |
1011 | PSIMD_INTRINSIC psimd_u16 psimd_reverse_u16(psimd_u16 v) { |
1012 | return __builtin_shuffle(v, (psimd_s16) { 7, 6, 5, 4, 3, 2, 1, 0 }); |
1013 | } |
1014 | |
1015 | PSIMD_INTRINSIC psimd_s32 psimd_reverse_s32(psimd_s32 v) { |
1016 | return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 }); |
1017 | } |
1018 | |
1019 | PSIMD_INTRINSIC psimd_u32 psimd_reverse_u32(psimd_u32 v) { |
1020 | return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 }); |
1021 | } |
1022 | |
1023 | PSIMD_INTRINSIC psimd_f32 psimd_reverse_f32(psimd_f32 v) { |
1024 | return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 }); |
1025 | } |
1026 | #endif |
1027 | |
1028 | /* Interleaving of vector elements */ |
1029 | #if defined(__clang__) |
1030 | PSIMD_INTRINSIC psimd_s16 psimd_interleave_lo_s16(psimd_s16 a, psimd_s16 b) { |
1031 | return __builtin_shufflevector(a, b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3); |
1032 | } |
1033 | |
1034 | PSIMD_INTRINSIC psimd_s16 psimd_interleave_hi_s16(psimd_s16 a, psimd_s16 b) { |
1035 | return __builtin_shufflevector(a, b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7); |
1036 | } |
1037 | |
1038 | PSIMD_INTRINSIC psimd_u16 psimd_interleave_lo_u16(psimd_u16 a, psimd_u16 b) { |
1039 | return __builtin_shufflevector(a, b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3); |
1040 | } |
1041 | |
1042 | PSIMD_INTRINSIC psimd_u16 psimd_interleave_hi_u16(psimd_u16 a, psimd_u16 b) { |
1043 | return __builtin_shufflevector(a, b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7); |
1044 | } |
1045 | |
1046 | PSIMD_INTRINSIC psimd_s32 psimd_interleave_lo_s32(psimd_s32 a, psimd_s32 b) { |
1047 | return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1); |
1048 | } |
1049 | |
1050 | PSIMD_INTRINSIC psimd_s32 psimd_interleave_hi_s32(psimd_s32 a, psimd_s32 b) { |
1051 | return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3); |
1052 | } |
1053 | |
1054 | PSIMD_INTRINSIC psimd_u32 psimd_interleave_lo_u32(psimd_u32 a, psimd_u32 b) { |
1055 | return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1); |
1056 | } |
1057 | |
1058 | PSIMD_INTRINSIC psimd_u32 psimd_interleave_hi_u32(psimd_u32 a, psimd_u32 b) { |
1059 | return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3); |
1060 | } |
1061 | |
1062 | PSIMD_INTRINSIC psimd_f32 psimd_interleave_lo_f32(psimd_f32 a, psimd_f32 b) { |
1063 | return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1); |
1064 | } |
1065 | |
1066 | PSIMD_INTRINSIC psimd_f32 psimd_interleave_hi_f32(psimd_f32 a, psimd_f32 b) { |
1067 | return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3); |
1068 | } |
1069 | #else |
1070 | PSIMD_INTRINSIC psimd_s16 psimd_interleave_lo_s16(psimd_s16 a, psimd_s16 b) { |
1071 | return __builtin_shuffle(a, b, (psimd_s16) { 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3 }); |
1072 | } |
1073 | |
1074 | PSIMD_INTRINSIC psimd_s16 psimd_interleave_hi_s16(psimd_s16 a, psimd_s16 b) { |
1075 | return __builtin_shuffle(a, b, (psimd_s16) { 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7 }); |
1076 | } |
1077 | |
1078 | PSIMD_INTRINSIC psimd_u16 psimd_interleave_lo_u16(psimd_u16 a, psimd_u16 b) { |
1079 | return __builtin_shuffle(a, b, (psimd_s16) { 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3 }); |
1080 | } |
1081 | |
1082 | PSIMD_INTRINSIC psimd_u16 psimd_interleave_hi_u16(psimd_u16 a, psimd_u16 b) { |
1083 | return __builtin_shuffle(a, b, (psimd_s16) { 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7 }); |
1084 | } |
1085 | |
1086 | PSIMD_INTRINSIC psimd_s32 psimd_interleave_lo_s32(psimd_s32 a, psimd_s32 b) { |
1087 | return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 }); |
1088 | } |
1089 | |
1090 | PSIMD_INTRINSIC psimd_s32 psimd_interleave_hi_s32(psimd_s32 a, psimd_s32 b) { |
1091 | return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 }); |
1092 | } |
1093 | |
1094 | PSIMD_INTRINSIC psimd_u32 psimd_interleave_lo_u32(psimd_u32 a, psimd_u32 b) { |
1095 | return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 }); |
1096 | } |
1097 | |
1098 | PSIMD_INTRINSIC psimd_u32 psimd_interleave_hi_u32(psimd_u32 a, psimd_u32 b) { |
1099 | return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 }); |
1100 | } |
1101 | |
1102 | PSIMD_INTRINSIC psimd_f32 psimd_interleave_lo_f32(psimd_f32 a, psimd_f32 b) { |
1103 | return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 }); |
1104 | } |
1105 | |
1106 | PSIMD_INTRINSIC psimd_f32 psimd_interleave_hi_f32(psimd_f32 a, psimd_f32 b) { |
1107 | return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 }); |
1108 | } |
1109 | #endif |
1110 | |
1111 | /* Concatenation of low/high vector elements */ |
1112 | #if defined(__clang__) |
1113 | PSIMD_INTRINSIC psimd_s16 psimd_concat_lo_s16(psimd_s16 a, psimd_s16 b) { |
1114 | return __builtin_shufflevector(a, b, 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3); |
1115 | } |
1116 | |
1117 | PSIMD_INTRINSIC psimd_s16 psimd_concat_hi_s16(psimd_s16 a, psimd_s16 b) { |
1118 | return __builtin_shufflevector(a, b, 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7); |
1119 | } |
1120 | |
1121 | PSIMD_INTRINSIC psimd_u16 psimd_concat_lo_u16(psimd_u16 a, psimd_u16 b) { |
1122 | return __builtin_shufflevector(a, b, 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3); |
1123 | } |
1124 | |
1125 | PSIMD_INTRINSIC psimd_u16 psimd_concat_hi_u16(psimd_u16 a, psimd_u16 b) { |
1126 | return __builtin_shufflevector(a, b, 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7); |
1127 | } |
1128 | |
1129 | PSIMD_INTRINSIC psimd_s32 psimd_concat_lo_s32(psimd_s32 a, psimd_s32 b) { |
1130 | return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1); |
1131 | } |
1132 | |
1133 | PSIMD_INTRINSIC psimd_s32 psimd_concat_hi_s32(psimd_s32 a, psimd_s32 b) { |
1134 | return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3); |
1135 | } |
1136 | |
1137 | PSIMD_INTRINSIC psimd_u32 psimd_concat_lo_u32(psimd_u32 a, psimd_u32 b) { |
1138 | return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1); |
1139 | } |
1140 | |
1141 | PSIMD_INTRINSIC psimd_u32 psimd_concat_hi_u32(psimd_u32 a, psimd_u32 b) { |
1142 | return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3); |
1143 | } |
1144 | |
1145 | PSIMD_INTRINSIC psimd_f32 psimd_concat_lo_f32(psimd_f32 a, psimd_f32 b) { |
1146 | return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1); |
1147 | } |
1148 | |
1149 | PSIMD_INTRINSIC psimd_f32 psimd_concat_hi_f32(psimd_f32 a, psimd_f32 b) { |
1150 | return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3); |
1151 | } |
1152 | #else |
1153 | PSIMD_INTRINSIC psimd_s16 psimd_concat_lo_s16(psimd_s16 a, psimd_s16 b) { |
1154 | return __builtin_shuffle(a, b, (psimd_s16) { 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3 }); |
1155 | } |
1156 | |
1157 | PSIMD_INTRINSIC psimd_s16 psimd_concat_hi_s16(psimd_s16 a, psimd_s16 b) { |
1158 | return __builtin_shuffle(a, b, (psimd_s16) { 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7 }); |
1159 | } |
1160 | |
1161 | PSIMD_INTRINSIC psimd_u16 psimd_concat_lo_u16(psimd_u16 a, psimd_u16 b) { |
1162 | return __builtin_shuffle(a, b, (psimd_s16) { 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3 }); |
1163 | } |
1164 | |
1165 | PSIMD_INTRINSIC psimd_u16 psimd_concat_hi_u16(psimd_u16 a, psimd_u16 b) { |
1166 | return __builtin_shuffle(a, b, (psimd_s16) { 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7 }); |
1167 | } |
1168 | |
1169 | PSIMD_INTRINSIC psimd_s32 psimd_concat_lo_s32(psimd_s32 a, psimd_s32 b) { |
1170 | return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 }); |
1171 | } |
1172 | |
1173 | PSIMD_INTRINSIC psimd_s32 psimd_concat_hi_s32(psimd_s32 a, psimd_s32 b) { |
1174 | return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 }); |
1175 | } |
1176 | |
1177 | PSIMD_INTRINSIC psimd_u32 psimd_concat_lo_u32(psimd_u32 a, psimd_u32 b) { |
1178 | return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 }); |
1179 | } |
1180 | |
1181 | PSIMD_INTRINSIC psimd_u32 psimd_concat_hi_u32(psimd_u32 a, psimd_u32 b) { |
1182 | return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 }); |
1183 | } |
1184 | |
1185 | PSIMD_INTRINSIC psimd_f32 psimd_concat_lo_f32(psimd_f32 a, psimd_f32 b) { |
1186 | return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 }); |
1187 | } |
1188 | |
1189 | PSIMD_INTRINSIC psimd_f32 psimd_concat_hi_f32(psimd_f32 a, psimd_f32 b) { |
1190 | return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 }); |
1191 | } |
1192 | #endif |
1193 | |
1194 | /* Concatenation of even/odd vector elements */ |
1195 | #if defined(__clang__) |
1196 | PSIMD_INTRINSIC psimd_s8 psimd_concat_even_s8(psimd_s8 a, psimd_s8 b) { |
1197 | return __builtin_shufflevector(a, b, |
1198 | 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14); |
1199 | } |
1200 | |
1201 | PSIMD_INTRINSIC psimd_s8 psimd_concat_odd_s8(psimd_s8 a, psimd_s8 b) { |
1202 | return __builtin_shufflevector(a, b, |
1203 | 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15); |
1204 | } |
1205 | |
1206 | PSIMD_INTRINSIC psimd_u8 psimd_concat_even_u8(psimd_u8 a, psimd_u8 b) { |
1207 | return __builtin_shufflevector(a, b, |
1208 | 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14); |
1209 | } |
1210 | |
1211 | PSIMD_INTRINSIC psimd_u8 psimd_concat_odd_u8(psimd_u8 a, psimd_u8 b) { |
1212 | return __builtin_shufflevector(a, b, |
1213 | 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15); |
1214 | } |
1215 | |
1216 | PSIMD_INTRINSIC psimd_s16 psimd_concat_even_s16(psimd_s16 a, psimd_s16 b) { |
1217 | return __builtin_shufflevector(a, b, 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6); |
1218 | } |
1219 | |
1220 | PSIMD_INTRINSIC psimd_s16 psimd_concat_odd_s16(psimd_s16 a, psimd_s16 b) { |
1221 | return __builtin_shufflevector(a, b, 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7); |
1222 | } |
1223 | |
1224 | PSIMD_INTRINSIC psimd_u16 psimd_concat_even_u16(psimd_u16 a, psimd_u16 b) { |
1225 | return __builtin_shufflevector(a, b, 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6); |
1226 | } |
1227 | |
1228 | PSIMD_INTRINSIC psimd_u16 psimd_concat_odd_u16(psimd_u16 a, psimd_u16 b) { |
1229 | return __builtin_shufflevector(a, b, 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7); |
1230 | } |
1231 | |
1232 | PSIMD_INTRINSIC psimd_s32 psimd_concat_even_s32(psimd_s32 a, psimd_s32 b) { |
1233 | return __builtin_shufflevector(a, b, 0, 2, 4+0, 4+2); |
1234 | } |
1235 | |
1236 | PSIMD_INTRINSIC psimd_s32 psimd_concat_odd_s32(psimd_s32 a, psimd_s32 b) { |
1237 | return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3); |
1238 | } |
1239 | |
1240 | PSIMD_INTRINSIC psimd_u32 psimd_concat_even_u32(psimd_u32 a, psimd_u32 b) { |
1241 | return __builtin_shufflevector(a, b, 0, 2, 4+0, 4+2); |
1242 | } |
1243 | |
1244 | PSIMD_INTRINSIC psimd_u32 psimd_concat_odd_u32(psimd_u32 a, psimd_u32 b) { |
1245 | return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3); |
1246 | } |
1247 | |
1248 | PSIMD_INTRINSIC psimd_f32 psimd_concat_even_f32(psimd_f32 a, psimd_f32 b) { |
1249 | return __builtin_shufflevector(a, b, 0, 2, 4+0, 4+2); |
1250 | } |
1251 | |
1252 | PSIMD_INTRINSIC psimd_f32 psimd_concat_odd_f32(psimd_f32 a, psimd_f32 b) { |
1253 | return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3); |
1254 | } |
1255 | #else |
1256 | PSIMD_INTRINSIC psimd_s8 psimd_concat_even_s8(psimd_s8 a, psimd_s8 b) { |
1257 | return __builtin_shuffle(a, b, |
1258 | (psimd_s8) { 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14 }); |
1259 | } |
1260 | |
1261 | PSIMD_INTRINSIC psimd_s8 psimd_concat_odd_s8(psimd_s8 a, psimd_s8 b) { |
1262 | return __builtin_shuffle(a, b, |
1263 | (psimd_s8) { 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15 }); |
1264 | } |
1265 | |
1266 | PSIMD_INTRINSIC psimd_u8 psimd_concat_even_u8(psimd_u8 a, psimd_u8 b) { |
1267 | return __builtin_shuffle(a, b, |
1268 | (psimd_s8) { 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14 }); |
1269 | } |
1270 | |
1271 | PSIMD_INTRINSIC psimd_u8 psimd_concat_odd_u8(psimd_u8 a, psimd_u8 b) { |
1272 | return __builtin_shuffle(a, b, |
1273 | (psimd_s8) { 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15 }); |
1274 | } |
1275 | |
1276 | PSIMD_INTRINSIC psimd_s16 psimd_concat_even_s16(psimd_s16 a, psimd_s16 b) { |
1277 | return __builtin_shuffle(a, b, (psimd_s16) { 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6 }); |
1278 | } |
1279 | |
1280 | PSIMD_INTRINSIC psimd_s16 psimd_concat_odd_s16(psimd_s16 a, psimd_s16 b) { |
1281 | return __builtin_shuffle(a, b, (psimd_s16) { 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7 }); |
1282 | } |
1283 | |
1284 | PSIMD_INTRINSIC psimd_u16 psimd_concat_even_u16(psimd_u16 a, psimd_u16 b) { |
1285 | return __builtin_shuffle(a, b, (psimd_s16) { 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6 }); |
1286 | } |
1287 | |
1288 | PSIMD_INTRINSIC psimd_u16 psimd_concat_odd_u16(psimd_u16 a, psimd_u16 b) { |
1289 | return __builtin_shuffle(a, b, (psimd_s16) { 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7 }); |
1290 | } |
1291 | |
1292 | PSIMD_INTRINSIC psimd_s32 psimd_concat_even_s32(psimd_s32 a, psimd_s32 b) { |
1293 | return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 }); |
1294 | } |
1295 | |
1296 | PSIMD_INTRINSIC psimd_s32 psimd_concat_odd_s32(psimd_s32 a, psimd_s32 b) { |
1297 | return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 }); |
1298 | } |
1299 | |
1300 | PSIMD_INTRINSIC psimd_u32 psimd_concat_even_u32(psimd_u32 a, psimd_u32 b) { |
1301 | return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 }); |
1302 | } |
1303 | |
1304 | PSIMD_INTRINSIC psimd_u32 psimd_concat_odd_u32(psimd_u32 a, psimd_u32 b) { |
1305 | return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 }); |
1306 | } |
1307 | |
1308 | PSIMD_INTRINSIC psimd_f32 psimd_concat_even_f32(psimd_f32 a, psimd_f32 b) { |
1309 | return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 }); |
1310 | } |
1311 | |
1312 | PSIMD_INTRINSIC psimd_f32 psimd_concat_odd_f32(psimd_f32 a, psimd_f32 b) { |
1313 | return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 }); |
1314 | } |
1315 | #endif |
1316 | |
1317 | /* Vector reduce */ |
1318 | #if defined(__clang__) |
1319 | PSIMD_INTRINSIC psimd_f32 psimd_allreduce_sum_f32(psimd_f32 v) { |
1320 | const psimd_f32 temp = v + __builtin_shufflevector(v, v, 2, 3, 0, 1); |
1321 | return temp + __builtin_shufflevector(temp, temp, 1, 0, 3, 2); |
1322 | } |
1323 | |
1324 | PSIMD_INTRINSIC psimd_f32 psimd_allreduce_max_f32(psimd_f32 v) { |
1325 | const psimd_f32 temp = psimd_max_f32(v, __builtin_shufflevector(v, v, 2, 3, 0, 1)); |
1326 | return psimd_max_f32(temp, __builtin_shufflevector(temp, temp, 1, 0, 3, 2)); |
1327 | } |
1328 | |
1329 | PSIMD_INTRINSIC psimd_f32 psimd_allreduce_min_f32(psimd_f32 v) { |
1330 | const psimd_f32 temp = psimd_min_f32(v, __builtin_shufflevector(v, v, 2, 3, 0, 1)); |
1331 | return psimd_min_f32(temp, __builtin_shufflevector(temp, temp, 1, 0, 3, 2)); |
1332 | } |
1333 | |
1334 | PSIMD_INTRINSIC float psimd_reduce_sum_f32(psimd_f32 v) { |
1335 | const psimd_f32 temp = v + __builtin_shufflevector(v, v, 2, 3, -1, -1); |
1336 | const psimd_f32 result = temp + __builtin_shufflevector(temp, temp, 1, -1, -1, -1); |
1337 | return result[0]; |
1338 | } |
1339 | |
1340 | PSIMD_INTRINSIC float psimd_reduce_max_f32(psimd_f32 v) { |
1341 | const psimd_f32 temp = psimd_max_f32(v, __builtin_shufflevector(v, v, 2, 3, -1, -1)); |
1342 | const psimd_f32 result = psimd_max_f32(temp, __builtin_shufflevector(temp, temp, 1, -1, -1, -1)); |
1343 | return result[0]; |
1344 | } |
1345 | |
1346 | PSIMD_INTRINSIC float psimd_reduce_min_f32(psimd_f32 v) { |
1347 | const psimd_f32 temp = psimd_min_f32(v, __builtin_shufflevector(v, v, 2, 3, -1, -1)); |
1348 | const psimd_f32 result = psimd_min_f32(temp, __builtin_shufflevector(temp, temp, 1, -1, -1, -1)); |
1349 | return result[0]; |
1350 | } |
1351 | #else |
1352 | PSIMD_INTRINSIC psimd_f32 psimd_allreduce_sum_f32(psimd_f32 v) { |
1353 | const psimd_f32 temp = v + __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 }); |
1354 | return temp + __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 }); |
1355 | } |
1356 | |
1357 | PSIMD_INTRINSIC psimd_f32 psimd_allreduce_max_f32(psimd_f32 v) { |
1358 | const psimd_f32 temp = psimd_max_f32(v, __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 })); |
1359 | return psimd_max_f32(temp, __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 })); |
1360 | } |
1361 | |
1362 | PSIMD_INTRINSIC psimd_f32 psimd_allreduce_min_f32(psimd_f32 v) { |
1363 | const psimd_f32 temp = psimd_min_f32(v, __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 })); |
1364 | return psimd_min_f32(temp, __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 })); |
1365 | } |
1366 | |
1367 | PSIMD_INTRINSIC float psimd_reduce_sum_f32(psimd_f32 v) { |
1368 | const psimd_f32 result = psimd_allreduce_sum_f32(v); |
1369 | return result[0]; |
1370 | } |
1371 | |
1372 | PSIMD_INTRINSIC float psimd_reduce_max_f32(psimd_f32 v) { |
1373 | const psimd_f32 result = psimd_allreduce_max_f32(v); |
1374 | return result[0]; |
1375 | } |
1376 | |
1377 | PSIMD_INTRINSIC float psimd_reduce_min_f32(psimd_f32 v) { |
1378 | const psimd_f32 result = psimd_allreduce_min_f32(v); |
1379 | return result[0]; |
1380 | } |
1381 | #endif |
1382 | #endif |
1383 | |
1384 | #endif /* PSIMD_H */ |
1385 | |