//created by Victoria Zhislina, Senior Application Engineer, Intel Corporation, [email protected]

//*** Copyright (C) 2012-2022 Intel Corporation. All rights reserved.

//IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.

//By downloading, copying, installing or using the software you agree to this license.
//If you do not agree to this license, do not download, install, copy or use the software.

//                              License Agreement
//Redistribution and use in source and binary forms, with or without modification,
//are permitted provided that the following conditions are met:

//  * Redistributions of source code must retain the above copyright notice,
//    this list of conditions and the following disclaimer.

//  * The name of the copyright holders may not be used to endorse or promote products
//    derived from this software without specific prior written permission.

//This software is provided by the copyright holders and contributors "as is" and
//any express or implied warranties, including, but not limited to, the implied
//warranties of merchantability and fitness for a particular purpose are disclaimed.
//In no event shall the Intel Corporation or contributors be liable for any direct,
//indirect, incidental, special, exemplary, or consequential damages
//(including, but not limited to, procurement of substitute goods or services;
//loss of use, data, or profits; or business interruption) however caused
//and on any theory of liability, whether in contract, strict liability,
//or tort (including negligence or otherwise) arising in any way out of
//the use of this software, even if advised of the possibility of such damage.

//*****************************************************************************************
// This file is intended to simplify ARM->IA32 porting
// It makes the correspondence between ARM NEON intrinsics (as defined in "arm_neon.h")
// and x86 SSE (up to SSE4.2) intrinsic functions as defined in the header files below
//The MMX instruction set is not used due to its non-availability on x64 systems,
//its performance overhead and the necessity to use the EMMS instruction (_mm_empty()) for MMX/x87 floating point state switching
//*****************************************************************************************

//!!!!!!!!!!!!!! To use this file, just include it (instead of "arm_neon.h") in the project that uses ARM NEON intrinsics and compile it as usual,
//!!!!!!!!!!!!!! but please pay attention to the #define USE_SSE4 below - you might need to define it manually for the newest Intel Atom or any Intel Core platform to get greater performance.
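//!!!!!!!!!!!!!! A minimal usage sketch (illustrative only: the included file name and the function below are
//!!!!!!!!!!!!!! hypothetical examples, not part of this header):
#if 0   /* not compiled - example of a translation unit ported from ARM NEON */
#include "NEON_2_SSE.h"                   /* instead of #include <arm_neon.h> */

static int16x8_t add_rows(int16x8_t a, int16x8_t b)
{
    return vaddq_s16(a, b);               /* the NEON intrinsic call stays unchanged, SSE does the work */
}
#endif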

#ifndef NEON2SSE_H
#define NEON2SSE_H

/*********************************************************************************************************************/
//!!!!!!!!!!!!!!
//if USE_SSE4 is defined, some functions use SSE4 instructions instead of earlier SSE versions; when it is undefined, only SIMD up to SSSE3 is used
//For older devices without SSE4 support it should be left undefined; for newer devices it should be defined, possibly manually if your compiler doesn't set the __SSE4_2__ predefine (see the example below)
#ifndef USE_SSE4
# if defined(__SSE4_2__)
# define USE_SSE4
# endif
#endif
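//A hedged example of defining USE_SSE4 manually (the source file name is just an illustration):
//either pass it on the compiler command line, e.g. gcc -O2 -msse4.2 -DUSE_SSE4 ported_code.c,
//or put "#define USE_SSE4" in your source before including this header.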
/*********************************************************************************************************************/

#include <xmmintrin.h> //SSE
#include <emmintrin.h> //SSE2
#include <pmmintrin.h> //SSE3
#include <tmmintrin.h> //SSSE3
#ifdef USE_SSE4
# include <smmintrin.h> //SSE4.1
# include <nmmintrin.h> //SSE4.2
#endif

#include <math.h>

//*************** functions and data attributes, compiler dependent *********************************
//***********************************************************************************
#ifdef __GNUC__
# define _GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
# define _NEON2SSESTORAGE static
# define _NEON2SSE_ALIGN_16 __attribute__((aligned(16)))
# ifdef __clang__
# define _NEON2SSE_INLINE _NEON2SSESTORAGE inline __attribute__((__gnu_inline__, __always_inline__))
# else
# define _NEON2SSE_INLINE _NEON2SSESTORAGE inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
# endif
# ifndef NEON2SSE_DISABLE_PERFORMANCE_WARNING
# if _GCC_VERSION < 40500
# define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) __attribute__((deprecated)) function
# else
# define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) __attribute__((deprecated(explanation))) function
# endif
# else
# define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) function
# endif
# if defined(__x86_64__)
# define _NEON2SSE_64BIT __x86_64__
# endif
#else
# define _NEON2SSESTORAGE static
# define _NEON2SSE_ALIGN_16 __declspec(align(16))
# define _NEON2SSE_INLINE _NEON2SSESTORAGE __inline
# if (defined(_MSC_VER) || defined (__INTEL_COMPILER)) && !defined(NEON2SSE_DISABLE_PERFORMANCE_WARNING)
# define _NEON2SSE_PERFORMANCE_WARNING(function, EXPLANATION) __declspec(deprecated(EXPLANATION)) function
# if defined(_M_X64)
# define _NEON2SSE_64BIT _M_X64
# endif
# else
# define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) function
# endif
#endif

/* Used to mark the intrinsics that are declared as functions, but implemented as macros */
#define _NEON2SSE_GLOBAL

#if defined (_NEON2SSE_64BIT) && defined (USE_SSE4)
# define _NEON2SSE_64BIT_SSE4
#endif

#ifndef UNREFERENCED_PARAMETER
# define UNREFERENCED_PARAMETER(P) ((void)(P))
#endif

/*********************************************************************************************************************/
// data types conversion
/*********************************************************************************************************************/
#if defined(_MSC_VER) && (_MSC_VER < 1300)
    typedef signed char int8_t;
    typedef unsigned char uint8_t;
    typedef signed short int16_t;
    typedef unsigned short uint16_t;
    typedef signed int int32_t;
    typedef unsigned int uint32_t;
    typedef signed long long int64_t;
    typedef unsigned long long uint64_t;
#elif defined(_MSC_VER)
    typedef signed __int8 int8_t;
    typedef unsigned __int8 uint8_t;
    typedef signed __int16 int16_t;
    typedef unsigned __int16 uint16_t;
    typedef signed __int32 int32_t;
    typedef unsigned __int32 uint32_t;

    typedef signed long long int64_t;
    typedef unsigned long long uint64_t;
#else
# include <stdint.h>
# include <limits.h>
#endif


typedef float float32_t;
#if !defined(__clang__)
typedef float __fp16;
#endif

typedef double float64_t;

typedef union __m64_128 {
    uint64_t m64_u64[1];
    int64_t m64_i64[1];
    float64_t m64_d64[1];
    uint32_t m64_u32[2];
    int32_t m64_i32[2];
    float32_t m64_f32[2];
    int16_t m64_i16[4];
    uint16_t m64_u16[4];
    int8_t m64_i8[8];
    uint8_t m64_u8[8];
} __m64_128;

typedef __m64_128 int8x8_t;
typedef __m64_128 uint8x8_t;
typedef __m64_128 int16x4_t;
typedef __m64_128 uint16x4_t;
typedef __m64_128 int32x2_t;
typedef __m64_128 uint32x2_t;
typedef __m64_128 int64x1_t;
typedef __m64_128 uint64x1_t;
typedef __m64_128 poly8x8_t;
typedef __m64_128 poly16x4_t;

typedef __m64_128 float32x2_t;
typedef __m128 float32x4_t;

typedef __m128 float16x4_t; //not supported by IA, for compatibility
typedef __m128 float16x8_t; //not supported by IA, for compatibility

typedef __m64_128 float64x1_t;
typedef __m128d float64x2_t;

typedef __m128i int8x16_t;
typedef __m128i int16x8_t;
typedef __m128i int32x4_t;
typedef __m128i int64x2_t;
typedef __m128i uint8x16_t;
typedef __m128i uint16x8_t;
typedef __m128i uint32x4_t;
typedef __m128i uint64x2_t;
typedef __m128i poly8x16_t;
typedef __m128i poly16x8_t;
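//A minimal illustration of the scheme above (hypothetical variables, not compiled): 64-bit "d-register"
//values live in the __m64_128 union and their lanes are plain union members, while 128-bit "q-register"
//values are native SSE registers handled only through intrinsics.
#if 0
int32x2_t  d;
uint32x4_t q;
d.m64_i32[0] = 1;            /* lane 0 of a d-register value */
d.m64_i32[1] = 2;            /* lane 1 */
q = _mm_set1_epi32(7);       /* q-register values are ordinary __m128i */
#endif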

#if defined(_MSC_VER)
# define SINT_MIN (-2147483647 - 1) /* min signed int value */
# define SINT_MAX 2147483647 /* max signed int value */
#else
# define SINT_MIN INT_MIN /* min signed int value */
# define SINT_MAX INT_MAX /* max signed int value */
#endif

typedef uint8_t poly8_t;
typedef uint16_t poly16_t;


//MSVC compilers (tested up to the VS 2012 version) don't allow using structures or arrays of __m128x types as function arguments, resulting in
//error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned. To avoid it we need a special trick for the functions that use these types
struct int8x16x2_t {
    int8x16_t val[2];
};
struct int16x8x2_t {
    int16x8_t val[2];
};
struct int32x4x2_t {
    int32x4_t val[2];
};
struct int64x2x2_t {
    int64x2_t val[2];
};
//Unfortunately we are unable to merge the two 64-bit halves into one 128-bit register because the user should be able to access the val[n] members explicitly!!!
struct int8x8x2_t {
    int8x8_t val[2];
};
struct int16x4x2_t {
    int16x4_t val[2];
};
struct int32x2x2_t {
    int32x2_t val[2];
};
struct int64x1x2_t {
    int64x1_t val[2];
};

typedef struct int8x16x2_t int8x16x2_t; //for C compilers to make them happy
typedef struct int16x8x2_t int16x8x2_t; //for C compilers to make them happy
typedef struct int32x4x2_t int32x4x2_t; //for C compilers to make them happy
typedef struct int64x2x2_t int64x2x2_t; //for C compilers to make them happy

typedef struct int8x8x2_t int8x8x2_t; //for C compilers to make them happy
typedef struct int16x4x2_t int16x4x2_t; //for C compilers to make them happy
typedef struct int32x2x2_t int32x2x2_t; //for C compilers to make them happy
typedef struct int64x1x2_t int64x1x2_t; //for C compilers to make them happy

/* to avoid pointer conversions the following unsigned integer structures are defined via the corresponding signed integer structures above */
typedef struct int8x16x2_t uint8x16x2_t;
typedef struct int16x8x2_t uint16x8x2_t;
typedef struct int32x4x2_t uint32x4x2_t;
typedef struct int64x2x2_t uint64x2x2_t;
typedef struct int8x16x2_t poly8x16x2_t;
typedef struct int16x8x2_t poly16x8x2_t;

typedef struct int8x8x2_t uint8x8x2_t;
typedef struct int16x4x2_t uint16x4x2_t;
typedef struct int32x2x2_t uint32x2x2_t;
typedef struct int64x1x2_t uint64x1x2_t;
typedef struct int8x8x2_t poly8x8x2_t;
typedef struct int16x4x2_t poly16x4x2_t;
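//The x2/x3/x4 structures mirror the NEON multi-vector types: results of the de-interleaving loads declared
//further down in this header (e.g. vld2q_u8, as in arm_neon.h) are accessed through the val[n] members.
//A minimal sketch (illustrative only, the buffer is hypothetical):
#if 0
uint8_t interleaved[32];                  /* e.g. 16 {U,V} byte pairs */
uint8x16x2_t uv = vld2q_u8(interleaved);  /* de-interleave into two 16-byte vectors */
uint8x16_t u = uv.val[0];                 /* all U components */
uint8x16_t v = uv.val[1];                 /* all V components */
#endif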

//float
struct float32x4x2_t {
    float32x4_t val[2];
};
struct float16x8x2_t {
    float16x8_t val[2];
};
struct float32x2x2_t {
    float32x2_t val[2];
};

typedef struct float32x4x2_t float32x4x2_t; //for C compilers to make them happy
typedef struct float16x8x2_t float16x8x2_t; //for C compilers to make them happy
typedef struct float32x2x2_t float32x2x2_t; //for C compilers to make them happy
typedef float16x8x2_t float16x4x2_t;

//4
struct int8x16x4_t {
    int8x16_t val[4];
};
struct int16x8x4_t {
    int16x8_t val[4];
};
struct int32x4x4_t {
    int32x4_t val[4];
};
struct int64x2x4_t {
    int64x2_t val[4];
};

struct int8x8x4_t {
    int8x8_t val[4];
};
struct int16x4x4_t {
    int16x4_t val[4];
};
struct int32x2x4_t {
    int32x2_t val[4];
};
struct int64x1x4_t {
    int64x1_t val[4];
};

typedef struct int8x16x4_t int8x16x4_t; //for C compilers to make them happy
typedef struct int16x8x4_t int16x8x4_t; //for C compilers to make them happy
typedef struct int32x4x4_t int32x4x4_t; //for C compilers to make them happy
typedef struct int64x2x4_t int64x2x4_t; //for C compilers to make them happy

typedef struct int8x8x4_t int8x8x4_t; //for C compilers to make them happy
typedef struct int16x4x4_t int16x4x4_t; //for C compilers to make them happy
typedef struct int32x2x4_t int32x2x4_t; //for C compilers to make them happy
typedef struct int64x1x4_t int64x1x4_t; //for C compilers to make them happy

/* to avoid pointer conversions the following unsigned integer structures are defined via the corresponding signed integer structures above: */
typedef struct int8x8x4_t uint8x8x4_t;
typedef struct int16x4x4_t uint16x4x4_t;
typedef struct int32x2x4_t uint32x2x4_t;
typedef struct int64x1x4_t uint64x1x4_t;
typedef struct int8x8x4_t poly8x8x4_t;
typedef struct int16x4x4_t poly16x4x4_t;

typedef struct int8x16x4_t uint8x16x4_t;
typedef struct int16x8x4_t uint16x8x4_t;
typedef struct int32x4x4_t uint32x4x4_t;
typedef struct int64x2x4_t uint64x2x4_t;
typedef struct int8x16x4_t poly8x16x4_t;
typedef struct int16x8x4_t poly16x8x4_t;

struct float32x4x4_t {
    float32x4_t val[4];
};
struct float16x8x4_t {
    float16x8_t val[4];
};
struct float32x2x4_t {
    float32x2_t val[4];
};

typedef struct float32x4x4_t float32x4x4_t; //for C compilers to make them happy
typedef struct float16x8x4_t float16x8x4_t; //for C compilers to make them happy
typedef struct float32x2x4_t float32x2x4_t; //for C compilers to make them happy
typedef float16x8x4_t float16x4x4_t;

//3
struct int16x8x3_t {
    int16x8_t val[3];
};
struct int32x4x3_t {
    int32x4_t val[3];
};
struct int64x2x3_t {
    int64x2_t val[3];
};
struct int8x16x3_t {
    int8x16_t val[3];
};

struct int16x4x3_t {
    int16x4_t val[3];
};
struct int32x2x3_t {
    int32x2_t val[3];
};
struct int64x1x3_t {
    int64x1_t val[3];
};
struct int8x8x3_t {
    int8x8_t val[3];
};
typedef struct int16x8x3_t int16x8x3_t; //for C compilers to make them happy
typedef struct int32x4x3_t int32x4x3_t; //for C compilers to make them happy
typedef struct int64x2x3_t int64x2x3_t; //for C compilers to make them happy
typedef struct int8x16x3_t int8x16x3_t; //for C compilers to make them happy

typedef struct int8x8x3_t int8x8x3_t; //for C compilers to make them happy
typedef struct int16x4x3_t int16x4x3_t; //for C compilers to make them happy
typedef struct int32x2x3_t int32x2x3_t; //for C compilers to make them happy
typedef struct int64x1x3_t int64x1x3_t; //for C compilers to make them happy


/* to avoid pointer conversions the following unsigned integer structures are defined via the corresponding signed integer structures above: */
typedef struct int8x16x3_t uint8x16x3_t;
typedef struct int16x8x3_t uint16x8x3_t;
typedef struct int32x4x3_t uint32x4x3_t;
typedef struct int64x2x3_t uint64x2x3_t;
typedef struct int8x16x3_t poly8x16x3_t;
typedef struct int16x8x3_t poly16x8x3_t;
typedef struct int8x8x3_t uint8x8x3_t;
typedef struct int16x4x3_t uint16x4x3_t;
typedef struct int32x2x3_t uint32x2x3_t;
typedef struct int64x1x3_t uint64x1x3_t;
typedef struct int8x8x3_t poly8x8x3_t;
typedef struct int16x4x3_t poly16x4x3_t;

//float
struct float32x4x3_t {
    float32x4_t val[3];
};
struct float32x2x3_t {
    float32x2_t val[3];
};
struct float16x8x3_t {
    float16x8_t val[3];
};

typedef struct float32x4x3_t float32x4x3_t; //for C compilers to make them happy
typedef struct float16x8x3_t float16x8x3_t; //for C compilers to make them happy
typedef struct float32x2x3_t float32x2x3_t; //for C compilers to make them happy
typedef float16x8x3_t float16x4x3_t;


//****************************************************************************
//****** Porting auxiliary macros ********************************************

//** floating point related macros **
#define _M128i(a) _mm_castps_si128(a)
#define _M128(a) _mm_castsi128_ps(a)
//here the most performance-effective implementation depends on the compiler and on the 32/64-bit build
#if defined (_NEON2SSE_64BIT) || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500) )
# define _pM128i(a) _mm_cvtsi64_si128(*(int64_t*)(&(a)))
# define _M64(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (inp);
# define _M64f(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (_M128i(inp));
#else
   //for 32-bit gcc and Microsoft compiler builds
# define _pM128i(a) _mm_loadl_epi64((__m128i*)&(a))
# define _M64(out, inp) _mm_storel_epi64 ((__m128i*)&(out), inp)
# define _M64f(out, inp) _mm_storel_epi64 ((__m128i*)&(out), _M128i(inp))
#endif
#define _pM128(a) _mm_castsi128_ps(_pM128i(a))

#define return64(a) _M64(res64,a); return res64;
#define return64f(a) _M64f(res64,a); return res64;
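//The macros above are the backbone of the 64-bit ("d-register") emulation used throughout this file:
//an operand is widened into an XMM register with _pM128i, processed with ordinary SSE intrinsics, and the
//low 64 bits are written back with _M64 / return64. A minimal sketch of the pattern (the function name is
//hypothetical, not part of this header):
#if 0
_NEON2SSE_INLINE int8x8_t example_add_d_register(int8x8_t a, int8x8_t b)
{
    int8x8_t res64;
    __m128i res = _mm_add_epi8(_pM128i(a), _pM128i(b)); /* only the low 8 bytes are meaningful */
    return64(res);                                      /* copy the low 64 bits back into the union */
}
#endif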

#define _Ui64(a) (*(uint64_t*)&(a))
#define _UNSIGNED_T(a) u ## a

#define _SIGNBIT64 ((uint64_t)1 << 63)
#define _SWAP_HI_LOW32 (2 | (3 << 2) | (0 << 4) | (1 << 6))
#define _INSERTPS_NDX(srcField, dstField) (((srcField) << 6) | ((dstField) << 4) )

#define _NEON2SSE_REASON_SLOW_SERIAL "The function may be very slow due to the serial implementation, please try to avoid it"
#define _NEON2SSE_REASON_SLOW_UNEFFECTIVE "The function may be slow due to inefficient x86 SIMD implementation, please try to avoid it"

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#define __constrange(min,max) const
#define __transfersize(size)
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

//&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& mask constants used in porting &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
_NEON2SSE_ALIGN_16 static const int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
_NEON2SSE_ALIGN_16 static const int8_t mask8_32_even_odd[16] = { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 };
//&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
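//These masks are shuffle controls for _mm_shuffle_epi8 (SSSE3): mask8_16_even_odd gathers the even-numbered
//bytes into the low half and the odd-numbered bytes into the high half, mask8_32_even_odd does the same for
//16-bit lanes. A minimal sketch of their use (illustrative only, src is hypothetical):
#if 0
__m128i interleaved   = _mm_loadu_si128((const __m128i*)src);
__m128i deinterleaved = _mm_shuffle_epi8(interleaved, *(__m128i*)mask8_16_even_odd);
/* low 8 bytes now hold lanes 0,2,...,14; high 8 bytes hold lanes 1,3,...,15 */
#endif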

//*************************************************************************
//*************************************************************************
//********* Function declarations as declared in original arm_neon.h ******
//*************************************************************************
//Vector add: vadd -> Vr[i]:=Va[i]+Vb[i], Vr, Va, Vb have equal lane sizes.
_NEON2SSESTORAGE int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0
_NEON2SSESTORAGE int64x1_t vadd_s64(int64x1_t a, int64x1_t b); // VADD.I64 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0
_NEON2SSE_GLOBAL uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0
_NEON2SSE_GLOBAL uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0
_NEON2SSE_GLOBAL uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0
_NEON2SSESTORAGE uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b); // VADD.I64 d0,d0,d0
_NEON2SSE_GLOBAL int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0
_NEON2SSE_GLOBAL int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0
_NEON2SSE_GLOBAL int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0
_NEON2SSE_GLOBAL int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0
_NEON2SSE_GLOBAL float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0
_NEON2SSE_GLOBAL uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0
_NEON2SSE_GLOBAL uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0
_NEON2SSE_GLOBAL uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0
_NEON2SSE_GLOBAL uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0
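//For the 128-bit (q-register) forms above the emulation is a direct one-to-one mapping onto SSE2
//(e.g. vaddq_s8 corresponds to _mm_add_epi8 and vaddq_f32 to _mm_add_ps). Worked lane example: with int8
//lanes, 100 + 100 wraps to -56, exactly as VADD.I8 does on ARM.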
//Vector long add: vaddl -> Vr[i]:=Va[i]+Vb[i], Va, Vb have equal lane sizes, result is a 128-bit vector of lanes that are twice the width.
_NEON2SSESTORAGE int16x8_t vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0
_NEON2SSESTORAGE int32x4_t vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0
_NEON2SSESTORAGE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0
_NEON2SSESTORAGE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.U16 q0,d0,d0
_NEON2SSESTORAGE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0
//Vector wide add: vaddw -> Vr[i]:=Va[i]+Vb[i]
_NEON2SSESTORAGE int16x8_t vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0
_NEON2SSESTORAGE int32x4_t vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0
_NEON2SSESTORAGE int64x2_t vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0
_NEON2SSESTORAGE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0
_NEON2SSESTORAGE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.U16 q0,q0,d0
_NEON2SSESTORAGE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0
//Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1
_NEON2SSESTORAGE int8x8_t vhadd_s8(int8x8_t a, int8x8_t b); // VHADD.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vhadd_s16(int16x4_t a, int16x4_t b); // VHADD.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vhadd_s32(int32x2_t a, int32x2_t b); // VHADD.S32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b); // VHADD.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b); // VHADD.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b); // VHADD.U32 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0
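//Worked example of the halving-add semantics: the sum is formed at wider precision and then shifted, so it
//cannot overflow; e.g. for uint8 lanes vhadd(250, 8) = (250 + 8) >> 1 = 129, and for int8 lanes
//vhadd(-100, -50) = (-150) >> 1 = -75.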
//Vector rounding halving add: vrhadd -> Vr[i]:=(Va[i]+Vb[i]+1)>>1
_NEON2SSESTORAGE int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b); // VRHADD.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b); // VRHADD.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b); // VRHADD.S32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b); // VRHADD.U32 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0
_NEON2SSE_GLOBAL uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0
_NEON2SSE_GLOBAL uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0
//Vector saturating add: vqadd -> Vr[i]:=sat<size>(Va[i]+Vb[i])
_NEON2SSESTORAGE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vqadd_s32(int32x2_t a, int32x2_t b); // VQADD.S32 d0,d0,d0
_NEON2SSESTORAGE int64x1_t vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b); // VQADD.U32 d0,d0,d0
_NEON2SSESTORAGE uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0
_NEON2SSE_GLOBAL int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0
_NEON2SSE_GLOBAL int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0
_NEON2SSESTORAGE int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0
_NEON2SSE_GLOBAL uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0
_NEON2SSE_GLOBAL uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0
_NEON2SSESTORAGE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
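//Worked example of the saturating-add semantics: results are clamped to the lane range instead of wrapping,
//e.g. for int8 lanes vqadd(100, 100) = 127 and for uint8 lanes vqadd(200, 100) = 255. For the 8- and 16-bit
//cases this corresponds directly to the SSE saturating adds (_mm_adds_epi8/_mm_adds_epu8 and
//_mm_adds_epi16/_mm_adds_epu16); the 32- and 64-bit cases need extra work.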
//Vector add high half: vaddhn -> Vr[i]:= high half of (Va[i]+Vb[i]), the result lanes are half as wide
_NEON2SSESTORAGE int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0
_NEON2SSESTORAGE int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0
_NEON2SSESTORAGE int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0
_NEON2SSESTORAGE uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0
_NEON2SSESTORAGE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0
_NEON2SSE_GLOBAL uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0
//Vector rounding add high half: vraddhn
_NEON2SSESTORAGE int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0
_NEON2SSESTORAGE int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0
_NEON2SSESTORAGE int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0
_NEON2SSESTORAGE uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0
_NEON2SSESTORAGE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0
_NEON2SSE_GLOBAL uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0
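//Worked example of the narrowing high-half semantics: for vaddhn_s16 each 16-bit sum keeps only its high
//8 bits, result[i] = (a[i] + b[i]) >> 8, while vraddhn_s16 rounds first, result[i] = (a[i] + b[i] + (1 << 7)) >> 8.
//E.g. a[i] = 0x1200 and b[i] = 0x00C0 sum to 0x12C0, so vaddhn returns 0x12 and vraddhn returns 0x13.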
//Multiplication
//Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
_NEON2SSESTORAGE int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0
_NEON2SSE_GLOBAL int16x4_t vmul_s16(int16x4_t a, int16x4_t b); // VMUL.I16 d0,d0,d0
_NEON2SSE_GLOBAL int32x2_t vmul_s32(int32x2_t a, int32x2_t b); // VMUL.I32 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0
_NEON2SSESTORAGE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0
_NEON2SSE_GLOBAL int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0
_NEON2SSE_GLOBAL int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0
_NEON2SSE_GLOBAL float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0
_NEON2SSE_GLOBAL uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0
_NEON2SSE_GLOBAL uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0
_NEON2SSESTORAGE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0
//multiply lane
_NEON2SSESTORAGE int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c);
_NEON2SSESTORAGE int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c);
_NEON2SSESTORAGE float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c);
_NEON2SSE_GLOBAL uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c);
_NEON2SSE_GLOBAL uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c);
_NEON2SSESTORAGE int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c);
_NEON2SSESTORAGE int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c);
_NEON2SSESTORAGE float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c);
_NEON2SSE_GLOBAL uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c);
_NEON2SSE_GLOBAL uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c);
//Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]
_NEON2SSESTORAGE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0
_NEON2SSE_GLOBAL uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0
_NEON2SSE_GLOBAL uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0
_NEON2SSESTORAGE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0
_NEON2SSE_GLOBAL uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0
_NEON2SSE_GLOBAL uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0
//Vector multiply accumulate long: vmlal -> Vr[i] := Va[i] + Vb[i] * Vc[i]
_NEON2SSESTORAGE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0
_NEON2SSESTORAGE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0
_NEON2SSESTORAGE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0
_NEON2SSESTORAGE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.U16 q0,d0,d0
_NEON2SSESTORAGE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0
//Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i]
_NEON2SSESTORAGE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0
_NEON2SSE_GLOBAL uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0
_NEON2SSE_GLOBAL uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0
_NEON2SSESTORAGE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0
_NEON2SSE_GLOBAL uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0
_NEON2SSE_GLOBAL uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0
//Vector multiply subtract long
_NEON2SSESTORAGE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0
_NEON2SSESTORAGE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0
_NEON2SSESTORAGE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0
_NEON2SSESTORAGE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.U16 q0,d0,d0
_NEON2SSESTORAGE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0
//Vector saturating doubling multiply high
_NEON2SSESTORAGE int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b); // VQDMULH.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0
_NEON2SSESTORAGE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0
//Vector saturating rounding doubling multiply high
_NEON2SSESTORAGE int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b); // VQRDMULH.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0
_NEON2SSESTORAGE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0
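//Worked equation for the doubling-multiply-high family (int16 lanes): vqdmulh returns sat((2*a*b) >> 16) and
//vqrdmulh returns sat((2*a*b + (1 << 15)) >> 16); the only saturating case is a = b = -32768, where 2*a*b = 2^31
//is clamped, giving 0x7FFF. E.g. a = 12000, b = 20000: 2*a*b = 480000000 = 0x1C9C3800, so both forms return
//0x1C9C = 7324 (the rounding constant does not carry into the high half here).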
//Vector saturating doubling multiply accumulate long
_NEON2SSESTORAGE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0
//Vector saturating doubling multiply subtract long
_NEON2SSESTORAGE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0
//Vector long multiply
_NEON2SSESTORAGE int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0
_NEON2SSESTORAGE int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0
_NEON2SSESTORAGE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0
_NEON2SSESTORAGE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.U16 q0,d0,d0
_NEON2SSESTORAGE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0
_NEON2SSESTORAGE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0
//Vector saturating doubling long multiply
_NEON2SSESTORAGE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
//Subtraction
//Vector subtract
_NEON2SSESTORAGE int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0
_NEON2SSESTORAGE int64x1_t vsub_s64(int64x1_t a, int64x1_t b); // VSUB.I64 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0
_NEON2SSE_GLOBAL uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0
_NEON2SSE_GLOBAL uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0
_NEON2SSE_GLOBAL uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0
_NEON2SSESTORAGE uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b); // VSUB.I64 d0,d0,d0
_NEON2SSE_GLOBAL int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0
_NEON2SSE_GLOBAL int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0
_NEON2SSE_GLOBAL int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0
_NEON2SSE_GLOBAL int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0
_NEON2SSE_GLOBAL float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0
_NEON2SSE_GLOBAL uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0
_NEON2SSE_GLOBAL uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0
_NEON2SSE_GLOBAL uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0
_NEON2SSE_GLOBAL uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0
//Vector long subtract: vsubl -> Vr[i]:=Va[i]-Vb[i]
_NEON2SSESTORAGE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0
_NEON2SSESTORAGE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0
_NEON2SSESTORAGE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0
_NEON2SSESTORAGE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.U16 q0,d0,d0
_NEON2SSESTORAGE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0
//Vector wide subtract: vsubw -> Vr[i]:=Va[i]-Vb[i]
_NEON2SSESTORAGE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0
_NEON2SSESTORAGE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0
_NEON2SSESTORAGE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0
_NEON2SSESTORAGE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0
_NEON2SSESTORAGE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.U16 q0,q0,d0
_NEON2SSESTORAGE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0
//Vector saturating subtract
_NEON2SSESTORAGE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vqsub_s32(int32x2_t a, int32x2_t b); // VQSUB.S32 d0,d0,d0
_NEON2SSESTORAGE int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b); // VQSUB.U32 d0,d0,d0
_NEON2SSESTORAGE uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0
_NEON2SSE_GLOBAL int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
_NEON2SSE_GLOBAL int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
_NEON2SSESTORAGE int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
_NEON2SSE_GLOBAL uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
_NEON2SSE_GLOBAL uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
_NEON2SSESTORAGE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b); // VQSUB.U64 q0,q0,q0
//Vector halving subtract
_NEON2SSESTORAGE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vhsub_s16(int16x4_t a, int16x4_t b); // VHSUB.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vhsub_s32(int32x2_t a, int32x2_t b); // VHSUB.S32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b); // VHSUB.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b); // VHSUB.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b); // VHSUB.U32 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0
//Vector subtract high half
_NEON2SSESTORAGE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0
_NEON2SSESTORAGE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0
_NEON2SSESTORAGE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0
_NEON2SSESTORAGE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0
_NEON2SSESTORAGE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0
_NEON2SSE_GLOBAL uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0
//Vector rounding subtract high half
_NEON2SSESTORAGE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0
_NEON2SSESTORAGE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0
_NEON2SSESTORAGE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0
_NEON2SSESTORAGE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0
_NEON2SSESTORAGE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0
_NEON2SSE_GLOBAL uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0
//Comparison
//Vector compare equal
_NEON2SSESTORAGE uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0
_NEON2SSESTORAGE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0
_NEON2SSE_GLOBAL uint8x8_t vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0
_NEON2SSE_GLOBAL uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0
_NEON2SSE_GLOBAL uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0
_NEON2SSE_GLOBAL uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0
_NEON2SSE_GLOBAL uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0
_NEON2SSE_GLOBAL uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0
_NEON2SSE_GLOBAL uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0
_NEON2SSE_GLOBAL uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0
//Vector compare greater-than or equal
_NEON2SSESTORAGE uint8x8_t vcge_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vcge_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vcge_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
_NEON2SSESTORAGE uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
_NEON2SSESTORAGE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
_NEON2SSESTORAGE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
_NEON2SSESTORAGE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
_NEON2SSESTORAGE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
//Vector compare less-than or equal
_NEON2SSESTORAGE uint8x8_t vcle_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vcle_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vcle_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
_NEON2SSE_GLOBAL uint8x8_t vcle_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
_NEON2SSE_GLOBAL uint16x4_t vcle_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0
_NEON2SSE_GLOBAL uint32x2_t vcle_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
_NEON2SSESTORAGE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
_NEON2SSESTORAGE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
_NEON2SSESTORAGE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
_NEON2SSESTORAGE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
//Vector compare greater-than
_NEON2SSESTORAGE uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
_NEON2SSESTORAGE uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
_NEON2SSE_GLOBAL uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
_NEON2SSE_GLOBAL uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
_NEON2SSE_GLOBAL uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
_NEON2SSESTORAGE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
_NEON2SSESTORAGE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
//Vector compare less-than
_NEON2SSE_GLOBAL uint8x8_t vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
_NEON2SSE_GLOBAL uint16x4_t vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
_NEON2SSE_GLOBAL uint32x2_t vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
_NEON2SSE_GLOBAL uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
_NEON2SSE_GLOBAL uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
_NEON2SSE_GLOBAL uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0
_NEON2SSE_GLOBAL uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
_NEON2SSE_GLOBAL uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
_NEON2SSE_GLOBAL uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
_NEON2SSE_GLOBAL uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
_NEON2SSE_GLOBAL uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
_NEON2SSE_GLOBAL uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
_NEON2SSE_GLOBAL uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0
_NEON2SSE_GLOBAL uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
//Vector compare absolute greater-than or equal
_NEON2SSESTORAGE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
_NEON2SSESTORAGE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
//Vector compare absolute less-than or equal
_NEON2SSESTORAGE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
_NEON2SSESTORAGE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
//Vector compare absolute greater-than
_NEON2SSESTORAGE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
_NEON2SSESTORAGE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
//Vector compare absolute less-than
_NEON2SSESTORAGE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
_NEON2SSESTORAGE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
//Vector test bits
_NEON2SSESTORAGE uint8x8_t vtst_s8(int8x8_t a, int8x8_t b); // VTST.8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vtst_s16(int16x4_t a, int16x4_t b); // VTST.16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vtst_s32(int32x2_t a, int32x2_t b); // VTST.32 d0, d0, d0
_NEON2SSE_GLOBAL uint8x8_t vtst_u8(uint8x8_t a, uint8x8_t b); // VTST.8 d0, d0, d0
_NEON2SSE_GLOBAL uint16x4_t vtst_u16(uint16x4_t a, uint16x4_t b); // VTST.16 d0, d0, d0
_NEON2SSE_GLOBAL uint32x2_t vtst_u32(uint32x2_t a, uint32x2_t b); // VTST.32 d0, d0, d0
_NEON2SSE_GLOBAL uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0
_NEON2SSESTORAGE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0
_NEON2SSESTORAGE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0
_NEON2SSE_GLOBAL uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0
_NEON2SSE_GLOBAL uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0
_NEON2SSE_GLOBAL uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0
_NEON2SSE_GLOBAL uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0
//Absolute difference
//Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |
_NEON2SSESTORAGE int8x8_t vabd_s8(int8x8_t a, int8x8_t b); // VABD.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vabd_s16(int16x4_t a, int16x4_t b); // VABD.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vabd_s32(int32x2_t a, int32x2_t b); // VABD.S32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b); // VABD.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b); // VABD.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b); // VABD.U32 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0
_NEON2SSESTORAGE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0
//Absolute difference - long
_NEON2SSESTORAGE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0
_NEON2SSESTORAGE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0
_NEON2SSESTORAGE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0
_NEON2SSESTORAGE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.U16 q0,d0,d0
_NEON2SSESTORAGE uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0
//Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] |
_NEON2SSESTORAGE int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VABA.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0
//Absolute difference and accumulate - long
_NEON2SSESTORAGE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0
_NEON2SSESTORAGE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0
_NEON2SSESTORAGE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0
_NEON2SSESTORAGE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.U16 q0,d0,d0
_NEON2SSESTORAGE uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0
//Max/Min
//vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i]
_NEON2SSESTORAGE int8x8_t vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0
_NEON2SSE_GLOBAL int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0
_NEON2SSE_GLOBAL int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0
_NEON2SSE_GLOBAL int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0
_NEON2SSE_GLOBAL uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0
_NEON2SSE_GLOBAL uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.U16 q0,q0,q0
_NEON2SSE_GLOBAL uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0
_NEON2SSE_GLOBAL float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0

_NEON2SSE_GLOBAL float64x2_t vmaxq_f64(float64x2_t a, float64x2_t b); // VMAX.F64 q0,q0,q0

//vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i]
_NEON2SSESTORAGE int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0
_NEON2SSE_GLOBAL int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0
_NEON2SSE_GLOBAL int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0
_NEON2SSE_GLOBAL int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0
_NEON2SSE_GLOBAL uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0
_NEON2SSE_GLOBAL uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.U16 q0,q0,q0
_NEON2SSE_GLOBAL uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0
_NEON2SSE_GLOBAL float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0

_NEON2SSE_GLOBAL float64x2_t vminq_f64(float64x2_t a, float64x2_t b); // VMIN.F64 q0,q0,q0

//Pairwise addition
//Pairwise add
_NEON2SSESTORAGE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0
//Long pairwise add
_NEON2SSESTORAGE int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0
_NEON2SSESTORAGE int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0
_NEON2SSESTORAGE int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0
_NEON2SSESTORAGE uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0
_NEON2SSESTORAGE uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.U16 d0,d0
_NEON2SSESTORAGE uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0
_NEON2SSESTORAGE int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0
_NEON2SSESTORAGE int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0
_NEON2SSESTORAGE int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0
_NEON2SSESTORAGE uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0
_NEON2SSESTORAGE uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.U16 q0,q0
_NEON2SSESTORAGE uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
931//Long pairwise add and accumulate
932_NEON2SSESTORAGE int16x4_t vpadal_s8(int16x4_t a, int8x8_t b); // VPADAL.S8 d0,d0
933_NEON2SSESTORAGE int32x2_t vpadal_s16(int32x2_t a, int16x4_t b); // VPADAL.S16 d0,d0
934_NEON2SSESTORAGE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0
935_NEON2SSESTORAGE uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b); // VPADAL.U8 d0,d0
936_NEON2SSESTORAGE uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b); // VPADAL.U16 d0,d0
937_NEON2SSESTORAGE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0
938_NEON2SSESTORAGE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0
939_NEON2SSESTORAGE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0
940_NEON2SSESTORAGE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0
941_NEON2SSESTORAGE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0
942_NEON2SSESTORAGE uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.U16 q0,q0
943_NEON2SSESTORAGE uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0
944//Folding maximum vpmax -> takes maximum of adjacent pairs
945_NEON2SSESTORAGE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0
946_NEON2SSESTORAGE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0
947_NEON2SSESTORAGE int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0
948_NEON2SSESTORAGE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0
949_NEON2SSESTORAGE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.U16 d0,d0,d0
950_NEON2SSESTORAGE uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0
951_NEON2SSESTORAGE float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0
952//Folding minimum vpmin -> takes minimum of adjacent pairs
953_NEON2SSESTORAGE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0
954_NEON2SSESTORAGE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0
955_NEON2SSESTORAGE int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0
956_NEON2SSESTORAGE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0
957_NEON2SSESTORAGE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.U16 d0,d0,d0
958_NEON2SSESTORAGE uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0
959_NEON2SSESTORAGE float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0
960//Reciprocal/Sqrt
961_NEON2SSESTORAGE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0
962_NEON2SSESTORAGE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0
963_NEON2SSESTORAGE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0
964_NEON2SSESTORAGE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
965//Shifts by signed variable
966//Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right)
967_NEON2SSESTORAGE int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0
968_NEON2SSESTORAGE int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0
969_NEON2SSESTORAGE int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0
970_NEON2SSESTORAGE int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0
971_NEON2SSESTORAGE uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0
972_NEON2SSESTORAGE uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.U16 d0,d0,d0
973_NEON2SSESTORAGE uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0
974_NEON2SSESTORAGE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0
975_NEON2SSESTORAGE int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0
976_NEON2SSESTORAGE int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0
977_NEON2SSESTORAGE int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0
978_NEON2SSESTORAGE int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0
979_NEON2SSESTORAGE uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0
980_NEON2SSESTORAGE uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.U16 q0,q0,q0
981_NEON2SSESTORAGE uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0
982_NEON2SSESTORAGE uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0
983//Vector saturating shift left: (negative values shift right)
984_NEON2SSESTORAGE int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0
985_NEON2SSESTORAGE int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0
986_NEON2SSESTORAGE int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0
987_NEON2SSESTORAGE int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0
988_NEON2SSESTORAGE uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0
989_NEON2SSESTORAGE uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.U16 d0,d0,d0
990_NEON2SSESTORAGE uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0
991_NEON2SSESTORAGE uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0
992_NEON2SSESTORAGE int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0
993_NEON2SSESTORAGE int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0
994_NEON2SSESTORAGE int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0
995_NEON2SSESTORAGE int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0
996_NEON2SSESTORAGE uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0
997_NEON2SSESTORAGE uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.U16 q0,q0,q0
998_NEON2SSESTORAGE uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0
999_NEON2SSESTORAGE uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0
1000//Vector rounding shift left: (negative values shift right)
1001_NEON2SSESTORAGE int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0
1002_NEON2SSESTORAGE int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0
1003_NEON2SSESTORAGE int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0
1004_NEON2SSESTORAGE int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0
1005_NEON2SSESTORAGE uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0
1006_NEON2SSESTORAGE uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.U16 d0,d0,d0
1007_NEON2SSESTORAGE uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0
1008_NEON2SSESTORAGE uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0
1009_NEON2SSESTORAGE int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0
1010_NEON2SSESTORAGE int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0
1011_NEON2SSESTORAGE int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0
1012_NEON2SSESTORAGE int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0
1013_NEON2SSESTORAGE uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0
1014_NEON2SSESTORAGE uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.U16 q0,q0,q0
1015_NEON2SSESTORAGE uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0
1016_NEON2SSESTORAGE uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0
1017//Vector saturating rounding shift left: (negative values shift right)
1018_NEON2SSESTORAGE int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0
1019_NEON2SSESTORAGE int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0
1020_NEON2SSESTORAGE int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0
1021_NEON2SSESTORAGE int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0
1022_NEON2SSESTORAGE uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0
1023_NEON2SSESTORAGE uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.U16 d0,d0,d0
1024_NEON2SSESTORAGE uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0
1025_NEON2SSESTORAGE uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0
1026_NEON2SSESTORAGE int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0
1027_NEON2SSESTORAGE int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0
1028_NEON2SSESTORAGE int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0
1029_NEON2SSESTORAGE int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0
1030_NEON2SSESTORAGE uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0
1031_NEON2SSESTORAGE uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.U16 q0,q0,q0
1032_NEON2SSESTORAGE uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0
1033_NEON2SSESTORAGE uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0
1034//Shifts by a constant
1035//Vector shift right by constant
1036_NEON2SSESTORAGE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8
1037_NEON2SSESTORAGE int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VSHR.S16 d0,d0,#16
1038_NEON2SSESTORAGE int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VSHR.S32 d0,d0,#32
1039_NEON2SSESTORAGE int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64
1040_NEON2SSESTORAGE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8
1041_NEON2SSESTORAGE uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VSHR.U16 d0,d0,#16
1042_NEON2SSESTORAGE uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VSHR.U32 d0,d0,#32
1043_NEON2SSESTORAGE uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VSHR.U64 d0,d0,#64
1044_NEON2SSESTORAGE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8
1045_NEON2SSE_GLOBAL int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16
1046_NEON2SSE_GLOBAL int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32
1047_NEON2SSESTORAGE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64
1048_NEON2SSESTORAGE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8
1049_NEON2SSE_GLOBAL uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.U16 q0,q0,#16
1050_NEON2SSE_GLOBAL uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32
1051_NEON2SSE_GLOBAL uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64
1052//Vector shift left by constant
1053_NEON2SSESTORAGE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
1054_NEON2SSESTORAGE int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
1055_NEON2SSESTORAGE int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
1056_NEON2SSESTORAGE int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
1057_NEON2SSESTORAGE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
1058_NEON2SSE_GLOBAL uint16x4_t vshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
1059_NEON2SSE_GLOBAL uint32x2_t vshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
1060_NEON2SSE_GLOBAL uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
1061_NEON2SSE_GLOBAL int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
1062_NEON2SSE_GLOBAL int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
1063_NEON2SSE_GLOBAL int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
1064_NEON2SSE_GLOBAL int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
1065_NEON2SSESTORAGE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
1066_NEON2SSE_GLOBAL uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
1067_NEON2SSE_GLOBAL uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
1068_NEON2SSE_GLOBAL uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
1069//Vector rounding shift right by constant
1070_NEON2SSESTORAGE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8
1071_NEON2SSESTORAGE int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16
1072_NEON2SSESTORAGE int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32
1073_NEON2SSESTORAGE int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64
1074_NEON2SSESTORAGE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8
1075_NEON2SSESTORAGE uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VRSHR.U16 d0,d0,#16
1076_NEON2SSESTORAGE uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32
1077_NEON2SSESTORAGE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64
1078_NEON2SSESTORAGE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8
1079_NEON2SSESTORAGE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16
1080_NEON2SSESTORAGE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32
1081_NEON2SSESTORAGE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64
1082_NEON2SSESTORAGE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8
1083_NEON2SSESTORAGE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.U16 q0,q0,#16
1084_NEON2SSESTORAGE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32
1085_NEON2SSESTORAGE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64
1086//Vector shift right by constant and accumulate
1087_NEON2SSESTORAGE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8
1088_NEON2SSESTORAGE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16
1089_NEON2SSESTORAGE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32
1090_NEON2SSESTORAGE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64
1091_NEON2SSESTORAGE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8
1092_NEON2SSESTORAGE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.U16 d0,d0,#16
1093_NEON2SSESTORAGE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32
1094_NEON2SSESTORAGE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64
1095_NEON2SSESTORAGE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8
1096_NEON2SSESTORAGE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16
1097_NEON2SSESTORAGE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32
1098_NEON2SSESTORAGE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64
1099_NEON2SSESTORAGE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8
1100_NEON2SSESTORAGE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.U16 q0,q0,#16
1101_NEON2SSESTORAGE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32
1102_NEON2SSESTORAGE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64
1103//Vector rounding shift right by constant and accumulate
1104_NEON2SSESTORAGE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8
1105_NEON2SSESTORAGE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16
1106_NEON2SSESTORAGE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32
1107_NEON2SSESTORAGE int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64
1108_NEON2SSESTORAGE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8
1109_NEON2SSESTORAGE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.U16 d0,d0,#16
1110_NEON2SSESTORAGE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32
1111_NEON2SSESTORAGE uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64
1112_NEON2SSESTORAGE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8
1113_NEON2SSESTORAGE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16
1114_NEON2SSESTORAGE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32
1115_NEON2SSESTORAGE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64
1116_NEON2SSESTORAGE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8
1117_NEON2SSESTORAGE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.U16 q0,q0,#16
1118_NEON2SSESTORAGE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32
1119_NEON2SSESTORAGE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64
1120//Vector saturating shift left by constant
1121_NEON2SSESTORAGE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0
1122_NEON2SSESTORAGE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0
1123_NEON2SSESTORAGE int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0
1124_NEON2SSESTORAGE int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0
1125_NEON2SSESTORAGE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0
1126_NEON2SSESTORAGE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.U16 d0,d0,#0
1127_NEON2SSESTORAGE uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0
1128_NEON2SSESTORAGE uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0
1129_NEON2SSESTORAGE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0
1130_NEON2SSESTORAGE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0
1131_NEON2SSESTORAGE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0
1132_NEON2SSESTORAGE int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0
1133_NEON2SSESTORAGE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0
1134_NEON2SSESTORAGE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.U16 q0,q0,#0
1135_NEON2SSESTORAGE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0
1136_NEON2SSESTORAGE uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0
1137//Vector signed->unsigned saturating shift left by constant
1138_NEON2SSESTORAGE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0
1139_NEON2SSESTORAGE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0
1140_NEON2SSESTORAGE uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0
1141_NEON2SSESTORAGE uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0
1142_NEON2SSESTORAGE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0
1143_NEON2SSESTORAGE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0
1144_NEON2SSESTORAGE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0
1145_NEON2SSESTORAGE uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0
1146//Vector narrowing shift right by constant
1147_NEON2SSESTORAGE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
1148_NEON2SSESTORAGE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
1149_NEON2SSESTORAGE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
1150_NEON2SSESTORAGE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
1151_NEON2SSESTORAGE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
1152_NEON2SSESTORAGE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
1153//Vector signed->unsigned narrowing saturating shift right by constant
1154_NEON2SSESTORAGE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8
1155_NEON2SSESTORAGE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16
1156_NEON2SSESTORAGE uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32
1157//Vector signed->unsigned rounding narrowing saturating shift right by constant
1158_NEON2SSESTORAGE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8
1159_NEON2SSESTORAGE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16
1160_NEON2SSESTORAGE uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32
1161//Vector narrowing saturating shift right by constant
1162_NEON2SSESTORAGE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8
1163_NEON2SSESTORAGE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16
1164_NEON2SSESTORAGE int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32
1165_NEON2SSESTORAGE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.U16 d0,q0,#8
1166_NEON2SSESTORAGE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16
1167_NEON2SSESTORAGE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32
1168//Vector rounding narrowing shift right by constant
1169_NEON2SSESTORAGE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
1170_NEON2SSESTORAGE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
1171_NEON2SSESTORAGE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
1172_NEON2SSESTORAGE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
1173_NEON2SSESTORAGE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
1174_NEON2SSESTORAGE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
1175//Vector rounding narrowing saturating shift right by constant
1176_NEON2SSESTORAGE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8
1177_NEON2SSESTORAGE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16
1178_NEON2SSESTORAGE int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32
1179_NEON2SSESTORAGE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.U16 d0,q0,#8
1180_NEON2SSESTORAGE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16
1181_NEON2SSESTORAGE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32
1182//Vector widening shift left by constant
1183_NEON2SSESTORAGE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0
1184_NEON2SSESTORAGE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0
1185_NEON2SSESTORAGE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0
1186_NEON2SSESTORAGE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0
1187_NEON2SSESTORAGE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.U16 q0,d0,#0
1188_NEON2SSESTORAGE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0
1189//Shifts with insert
1190//Vector shift right and insert
1191_NEON2SSESTORAGE int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
1192_NEON2SSESTORAGE int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
1193_NEON2SSESTORAGE int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
1194_NEON2SSESTORAGE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
1195_NEON2SSE_GLOBAL uint8x8_t vsri_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
1196_NEON2SSE_GLOBAL uint16x4_t vsri_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
1197_NEON2SSE_GLOBAL uint32x2_t vsri_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
1198_NEON2SSE_GLOBAL uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
1199_NEON2SSE_GLOBAL poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
1200_NEON2SSE_GLOBAL poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
1201_NEON2SSESTORAGE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
1202_NEON2SSESTORAGE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
1203_NEON2SSESTORAGE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
1204_NEON2SSESTORAGE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
1205_NEON2SSE_GLOBAL uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
1206_NEON2SSE_GLOBAL uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
1207_NEON2SSE_GLOBAL uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
1208_NEON2SSE_GLOBAL uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
1209_NEON2SSE_GLOBAL poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
1210_NEON2SSE_GLOBAL poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
1211//Vector shift left and insert
1212_NEON2SSESTORAGE int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
1213_NEON2SSESTORAGE int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
1214_NEON2SSESTORAGE int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
1215_NEON2SSESTORAGE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
1216_NEON2SSE_GLOBAL uint8x8_t vsli_n_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
1217_NEON2SSE_GLOBAL uint16x4_t vsli_n_u16(uint16x4_t a, uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
1218_NEON2SSE_GLOBAL uint32x2_t vsli_n_u32(uint32x2_t a, uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
1219_NEON2SSE_GLOBAL uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
1220_NEON2SSE_GLOBAL poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
1221_NEON2SSE_GLOBAL poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
1222_NEON2SSESTORAGE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
1223_NEON2SSESTORAGE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
1224_NEON2SSESTORAGE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
1225_NEON2SSESTORAGE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
1226_NEON2SSE_GLOBAL uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
1227_NEON2SSE_GLOBAL uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
1228_NEON2SSE_GLOBAL uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
1229_NEON2SSE_GLOBAL uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
1230_NEON2SSE_GLOBAL poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
1231_NEON2SSE_GLOBAL poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
1232//Loads of a single vector or lane. Perform loads and stores of a single vector of some type.
1233//Load a single vector from memory
1234_NEON2SSE_GLOBAL uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
1235_NEON2SSE_GLOBAL uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
1236_NEON2SSE_GLOBAL uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
1237_NEON2SSE_GLOBAL uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
1238_NEON2SSE_GLOBAL int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
1239_NEON2SSE_GLOBAL int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
1240_NEON2SSE_GLOBAL int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
1241_NEON2SSE_GLOBAL int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
1242_NEON2SSE_GLOBAL float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0]
1243_NEON2SSESTORAGE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
1244_NEON2SSE_GLOBAL poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
1245_NEON2SSE_GLOBAL poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
1246_NEON2SSE_GLOBAL uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0]
1247_NEON2SSE_GLOBAL uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0]
1248_NEON2SSE_GLOBAL uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0]
1249_NEON2SSE_GLOBAL uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
1250_NEON2SSE_GLOBAL int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0]
1251_NEON2SSE_GLOBAL int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0]
1252_NEON2SSE_GLOBAL int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0]
1253_NEON2SSE_GLOBAL int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
1254_NEON2SSE_GLOBAL float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0]
1255_NEON2SSESTORAGE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0]
1256_NEON2SSE_GLOBAL poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0]
1257_NEON2SSE_GLOBAL poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0]
1258
1259_NEON2SSESTORAGE float64x2_t vld1q_f64(__transfersize(4) float64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
1260
1261//Load a single lane from memory
1262_NEON2SSE_GLOBAL uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
1263_NEON2SSE_GLOBAL uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
1264_NEON2SSE_GLOBAL uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
1265_NEON2SSE_GLOBAL uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
1266_NEON2SSE_GLOBAL int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
1267_NEON2SSE_GLOBAL int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0]
1268_NEON2SSE_GLOBAL int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); //VLD1.32 {d0[0]}, [r0]
1269_NEON2SSE_GLOBAL float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0]
1270_NEON2SSESTORAGE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
1271_NEON2SSE_GLOBAL int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); //VLD1.64 {d0}, [r0]
1272_NEON2SSE_GLOBAL poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
1273_NEON2SSE_GLOBAL poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
1274_NEON2SSESTORAGE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0]
1275_NEON2SSESTORAGE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
1276_NEON2SSESTORAGE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0]
1277_NEON2SSESTORAGE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0]
1278_NEON2SSE_GLOBAL int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8{d0[0]}, [r0]
1279_NEON2SSE_GLOBAL int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
1280_NEON2SSE_GLOBAL int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0]
1281_NEON2SSE_GLOBAL float16x4_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
1282_NEON2SSESTORAGE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
1283_NEON2SSE_GLOBAL int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0]
1284_NEON2SSE_GLOBAL poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0]
1285_NEON2SSE_GLOBAL poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
1286//Load all lanes of vector with same value from memory
1287_NEON2SSE_GLOBAL uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
1288_NEON2SSE_GLOBAL uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
1289_NEON2SSE_GLOBAL uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
1290_NEON2SSESTORAGE uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
1291_NEON2SSE_GLOBAL int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
1292_NEON2SSE_GLOBAL int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
1293_NEON2SSE_GLOBAL int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
1294_NEON2SSE_GLOBAL int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
1295_NEON2SSE_GLOBAL float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
1296_NEON2SSE_GLOBAL float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
1297_NEON2SSE_GLOBAL poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
1298_NEON2SSE_GLOBAL poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
1299_NEON2SSESTORAGE uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
1300_NEON2SSESTORAGE uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
1301_NEON2SSESTORAGE uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
1302_NEON2SSESTORAGE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
1303_NEON2SSE_GLOBAL int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
1304_NEON2SSE_GLOBAL int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
1305_NEON2SSE_GLOBAL int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
1306_NEON2SSE_GLOBAL int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
1307_NEON2SSE_GLOBAL float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
1308_NEON2SSESTORAGE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
1309_NEON2SSE_GLOBAL poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
1310_NEON2SSE_GLOBAL poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
1311//Store a single vector or lane. Stores all lanes or a single lane of a vector.
1312//Store a single vector into memory
1313_NEON2SSE_GLOBAL void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0]
1314_NEON2SSE_GLOBAL void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0]
1315_NEON2SSE_GLOBAL void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0]
1316_NEON2SSE_GLOBAL void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0]
1317_NEON2SSE_GLOBAL void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0]
1318_NEON2SSE_GLOBAL void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0]
1319_NEON2SSE_GLOBAL void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0]
1320_NEON2SSE_GLOBAL void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0]
1321_NEON2SSE_GLOBAL void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0]
1322_NEON2SSESTORAGE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0]
1323_NEON2SSE_GLOBAL void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0]
1324_NEON2SSE_GLOBAL void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0]
1325_NEON2SSESTORAGE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0]
1326_NEON2SSESTORAGE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0]
1327_NEON2SSESTORAGE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0]
1328_NEON2SSESTORAGE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0]
1329_NEON2SSE_GLOBAL void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0]
1330_NEON2SSE_GLOBAL void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0]
1331_NEON2SSE_GLOBAL void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0]
1332_NEON2SSE_GLOBAL void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0]
1333_NEON2SSE_GLOBAL void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0]
1334_NEON2SSESTORAGE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0]
1335_NEON2SSE_GLOBAL void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0]
1336_NEON2SSE_GLOBAL void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0]
1337//Store a lane of a vector into memory
1338//Loads of an N-element structure
1339//Load N-element structure from memory
1340_NEON2SSESTORAGE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
1341_NEON2SSESTORAGE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
1342_NEON2SSESTORAGE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
1343_NEON2SSE_GLOBAL int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
1344_NEON2SSE_GLOBAL int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
1345_NEON2SSE_GLOBAL int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
1346_NEON2SSE_GLOBAL float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0]
1347_NEON2SSESTORAGE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
1348_NEON2SSE_GLOBAL poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
1349_NEON2SSE_GLOBAL poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
1350_NEON2SSESTORAGE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
1351_NEON2SSESTORAGE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
1352_NEON2SSESTORAGE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
1353_NEON2SSESTORAGE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
1354_NEON2SSE_GLOBAL int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
1355_NEON2SSE_GLOBAL int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
1356_NEON2SSE_GLOBAL int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
1357_NEON2SSE_GLOBAL int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
1358//float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0]
1359_NEON2SSESTORAGE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
1360_NEON2SSE_GLOBAL poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
1361_NEON2SSE_GLOBAL poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
1362_NEON2SSESTORAGE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
1363_NEON2SSESTORAGE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
1364_NEON2SSESTORAGE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
1365_NEON2SSE_GLOBAL int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
1366_NEON2SSE_GLOBAL int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
1367_NEON2SSE_GLOBAL int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
1368_NEON2SSE_GLOBAL float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
1369_NEON2SSESTORAGE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
1370poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
1371_NEON2SSE_GLOBAL poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
1372_NEON2SSESTORAGE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
1373_NEON2SSESTORAGE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
1374_NEON2SSESTORAGE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
1375_NEON2SSESTORAGE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
1376_NEON2SSE_GLOBAL int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
1377_NEON2SSE_GLOBAL int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
1378_NEON2SSE_GLOBAL int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
1379_NEON2SSE_GLOBAL int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
1380_NEON2SSE_GLOBAL float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
1381_NEON2SSESTORAGE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
1382_NEON2SSE_GLOBAL poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
1383_NEON2SSE_GLOBAL poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
1384_NEON2SSESTORAGE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
1385_NEON2SSESTORAGE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
1386_NEON2SSESTORAGE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
1387_NEON2SSE_GLOBAL int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
1388_NEON2SSE_GLOBAL int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
1389_NEON2SSE_GLOBAL int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
1390_NEON2SSE_GLOBAL float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
1391_NEON2SSESTORAGE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
1392_NEON2SSE_GLOBAL poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
1393_NEON2SSE_GLOBAL poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
1394_NEON2SSESTORAGE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
1395_NEON2SSESTORAGE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
1396_NEON2SSESTORAGE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
1397_NEON2SSESTORAGE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
1398_NEON2SSE_GLOBAL int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
1399_NEON2SSE_GLOBAL int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
1400_NEON2SSE_GLOBAL int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
1401_NEON2SSE_GLOBAL int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
1402_NEON2SSE_GLOBAL float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
1403_NEON2SSESTORAGE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
1404_NEON2SSE_GLOBAL poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
1405_NEON2SSE_GLOBAL poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
1406//Load all lanes of N-element structure with same value from memory
1407_NEON2SSESTORAGE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
1408_NEON2SSESTORAGE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
1409_NEON2SSESTORAGE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
1410_NEON2SSE_GLOBAL uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
1411_NEON2SSE_GLOBAL int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
1412_NEON2SSE_GLOBAL int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
1413_NEON2SSE_GLOBAL int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
1414_NEON2SSE_GLOBAL int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
1415//float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
1416_NEON2SSESTORAGE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
1417_NEON2SSE_GLOBAL poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
1418_NEON2SSE_GLOBAL poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
1419_NEON2SSESTORAGE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
1420_NEON2SSESTORAGE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
1421_NEON2SSESTORAGE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
1422_NEON2SSESTORAGE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
1423_NEON2SSE_GLOBAL int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
_NEON2SSE_GLOBAL int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
_NEON2SSE_GLOBAL int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
_NEON2SSE_GLOBAL int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
_NEON2SSE_GLOBAL float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
_NEON2SSESTORAGE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
_NEON2SSE_GLOBAL poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
_NEON2SSE_GLOBAL poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
_NEON2SSESTORAGE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSESTORAGE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSESTORAGE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSESTORAGE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
_NEON2SSE_GLOBAL int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSE_GLOBAL int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSE_GLOBAL int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSE_GLOBAL int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
_NEON2SSE_GLOBAL float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSESTORAGE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSE_GLOBAL poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSE_GLOBAL poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
//Load a single lane of N-element structure from memory
//the functions below are modified to deal with the error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned
_NEON2SSESTORAGE uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
_NEON2SSESTORAGE uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
_NEON2SSESTORAGE int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
_NEON2SSESTORAGE int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
_NEON2SSE_GLOBAL float16x8x2_t vld2q_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
_NEON2SSESTORAGE float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
_NEON2SSE_GLOBAL poly16x8x2_t vld2q_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
_NEON2SSESTORAGE uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
_NEON2SSESTORAGE uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
_NEON2SSESTORAGE uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
_NEON2SSE_GLOBAL int8x8x2_t vld2_lane_s8(__transfersize(2) int8_t const * ptr, int8x8x2_t src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
_NEON2SSE_GLOBAL int16x4x2_t vld2_lane_s16(__transfersize(2) int16_t const * ptr, int16x4x2_t src, __constrange(0,3) int lane); //VLD2.16 {d0[0], d1[0]}, [r0]
_NEON2SSE_GLOBAL int32x2x2_t vld2_lane_s32(__transfersize(2) int32_t const * ptr, int32x2x2_t src, __constrange(0,1) int lane); //VLD2.32 {d0[0], d1[0]}, [r0]
//float16x4x2_t vld2_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
_NEON2SSESTORAGE float32x2x2_t vld2_lane_f32(__transfersize(2) float32_t const * ptr, float32x2x2_t src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
_NEON2SSE_GLOBAL poly8x8x2_t vld2_lane_p8(__transfersize(2) poly8_t const * ptr, poly8x8x2_t src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
_NEON2SSE_GLOBAL poly16x4x2_t vld2_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x4x2_t src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
_NEON2SSESTORAGE uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSESTORAGE uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSESTORAGE int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSESTORAGE int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSE_GLOBAL float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSESTORAGE float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSE_GLOBAL poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSESTORAGE uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSESTORAGE uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x4x3_t src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSESTORAGE uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSE_GLOBAL int8x8x3_t vld3_lane_s8(__transfersize(3) int8_t const * ptr, int8x8x3_t src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSE_GLOBAL int16x4x3_t vld3_lane_s16(__transfersize(3) int16_t const * ptr, int16x4x3_t src, __constrange(0,3) int lane); //VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSE_GLOBAL int32x2x3_t vld3_lane_s32(__transfersize(3) int32_t const * ptr, int32x2x3_t src, __constrange(0,1) int lane); //VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSE_GLOBAL float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSESTORAGE float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSE_GLOBAL poly8x8x3_t vld3_lane_p8(__transfersize(3) poly8_t const * ptr, poly8x8x3_t src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSE_GLOBAL poly16x4x3_t vld3_lane_p16(__transfersize(3) poly16_t const * ptr, poly16x4x3_t src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSESTORAGE uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSESTORAGE uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSE_GLOBAL int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSE_GLOBAL int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSE_GLOBAL float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSESTORAGE float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSE_GLOBAL poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSESTORAGE uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSESTORAGE uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSESTORAGE uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_GLOBAL int8x8x4_t vld4_lane_s8(__transfersize(4) int8_t const * ptr, int8x8x4_t src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_GLOBAL int16x4x4_t vld4_lane_s16(__transfersize(4) int16_t const * ptr, int16x4x4_t src, __constrange(0,3) int lane); //VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_GLOBAL int32x2x4_t vld4_lane_s32(__transfersize(4) int32_t const * ptr, int32x2x4_t src, __constrange(0,1) int lane); //VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_GLOBAL float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSESTORAGE float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_GLOBAL poly8x8x4_t vld4_lane_p8(__transfersize(4) poly8_t const * ptr, poly8x8x4_t src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_GLOBAL poly16x4x4_t vld4_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x4x4_t src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
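//A minimal usage sketch for the *_ptr lane-load forms above (illustrative only, not part of the API).
//Because of the C2719 issue noted above, the q-register source structure is passed by pointer, so the
//caller keeps it in a local variable and passes its address. Values and variable names are hypothetical.
//    uint16_t buf[2] = {10, 20};                        //the two interleaved elements to load
//    uint16x8x2_t pair;
//    pair.val[0] = vdupq_n_u16(0);
//    pair.val[1] = vdupq_n_u16(0);
//    pair = vld2q_lane_u16_ptr(buf, &pair, 3);          //lane 3 of val[0]/val[1] becomes buf[0]/buf[1]
//The d-register (64-bit) forms such as vld2_lane_u16 take the source structure by value, as declared above.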
//Store N-element structure to memory
_NEON2SSESTORAGE void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t const * val); // VST2.8 {d0, d2}, [r0]
_NEON2SSESTORAGE void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t const * val); // VST2.16 {d0, d2}, [r0]
_NEON2SSESTORAGE void vst2q_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x4x2_t const * val); // VST2.32 {d0, d2}, [r0]
_NEON2SSE_GLOBAL void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t const * val); // VST2.8 {d0, d2}, [r0]
_NEON2SSE_GLOBAL void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t const * val); // VST2.16 {d0, d2}, [r0]
_NEON2SSE_GLOBAL void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t const * val); // VST2.32 {d0, d2}, [r0]
_NEON2SSE_GLOBAL void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t const * val); // VST2.16 {d0, d2}, [r0]
_NEON2SSESTORAGE void vst2q_f32_ptr(__transfersize(8) float32_t * ptr, float32x4x2_t const * val); // VST2.32 {d0, d2}, [r0]
_NEON2SSE_GLOBAL void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t const * val); // VST2.8 {d0, d2}, [r0]
_NEON2SSE_GLOBAL void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t const * val); // VST2.16 {d0, d2}, [r0]
_NEON2SSESTORAGE void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val); // VST2.8 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val); // VST2.16 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val); // VST2.32 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val); // VST1.64 {d0, d1}, [r0]
_NEON2SSE_GLOBAL void vst2_s8(__transfersize(16) int8_t * ptr, int8x8x2_t val); // VST2.8 {d0, d1}, [r0]
_NEON2SSE_GLOBAL void vst2_s16(__transfersize(8) int16_t * ptr, int16x4x2_t val); // VST2.16 {d0, d1}, [r0]
_NEON2SSE_GLOBAL void vst2_s32(__transfersize(4) int32_t * ptr, int32x2x2_t val); // VST2.32 {d0, d1}, [r0]
_NEON2SSE_GLOBAL void vst2_s64(__transfersize(2) int64_t * ptr, int64x1x2_t val); // VST1.64 {d0, d1}, [r0]
//void vst2_f16_ptr(__transfersize(8) __fp16 * ptr, float16x4x2_t const * val); // VST2.16 {d0, d1}, [r0]
_NEON2SSE_GLOBAL void vst2_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x2_t const * val); // VST2.32 {d0, d1}, [r0]
_NEON2SSE_GLOBAL void vst2_p8(__transfersize(16) poly8_t * ptr, poly8x8x2_t val); // VST2.8 {d0, d1}, [r0]
_NEON2SSE_GLOBAL void vst2_p16(__transfersize(8) poly16_t * ptr, poly16x4x2_t val); // VST2.16 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t const * val); // VST3.8 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t const * val); // VST3.16 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t const * val); // VST3.32 {d0, d2, d4}, [r0]
_NEON2SSE_GLOBAL void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t const * val); // VST3.8 {d0, d2, d4}, [r0]
_NEON2SSE_GLOBAL void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t const * val); // VST3.16 {d0, d2, d4}, [r0]
_NEON2SSE_GLOBAL void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t const * val); // VST3.32 {d0, d2, d4}, [r0]
_NEON2SSE_GLOBAL void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t const * val); // VST3.16 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t const * val); // VST3.32 {d0, d2, d4}, [r0]
_NEON2SSE_GLOBAL void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t const * val); // VST3.8 {d0, d2, d4}, [r0]
_NEON2SSE_GLOBAL void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t const * val); // VST3.16 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val); // VST3.8 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val); // VST3.16 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val); // VST1.64 {d0, d1, d2}, [r0]
_NEON2SSE_GLOBAL void vst3_s8(__transfersize(24) int8_t * ptr, int8x8x3_t val); // VST3.8 {d0, d1, d2}, [r0]
_NEON2SSE_GLOBAL void vst3_s16(__transfersize(12) int16_t * ptr, int16x4x3_t val); // VST3.16 {d0, d1, d2}, [r0]
_NEON2SSE_GLOBAL void vst3_s32(__transfersize(6) int32_t * ptr, int32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
_NEON2SSE_GLOBAL void vst3_s64(__transfersize(3) int64_t * ptr, int64x1x3_t val); // VST1.64 {d0, d1, d2}, [r0]
_NEON2SSE_GLOBAL void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t const * val); // VST3.16 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
_NEON2SSE_GLOBAL void vst3_p8(__transfersize(24) poly8_t * ptr, poly8x8x3_t val); // VST3.8 {d0, d1, d2}, [r0]
_NEON2SSE_GLOBAL void vst3_p16(__transfersize(12) poly16_t * ptr, poly16x4x3_t val); // VST3.16 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t const * val); // VST4.8 {d0, d2, d4, d6}, [r0]
_NEON2SSESTORAGE void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t const * val); // VST4.16 {d0, d2, d4, d6}, [r0]
_NEON2SSESTORAGE void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t const * val); // VST4.32 {d0, d2, d4, d6}, [r0]
_NEON2SSE_GLOBAL void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t const * val); // VST4.8 {d0, d2, d4, d6}, [r0]
_NEON2SSE_GLOBAL void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t const * val); // VST4.16 {d0, d2, d4, d6}, [r0]
_NEON2SSE_GLOBAL void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t const * val); // VST4.32 {d0, d2, d4, d6}, [r0]
_NEON2SSE_GLOBAL void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t const * val); // VST4.16 {d0, d2, d4, d6}, [r0]
_NEON2SSESTORAGE void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t const * val); // VST4.32 {d0, d2, d4, d6}, [r0]
_NEON2SSE_GLOBAL void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t const * val); // VST4.8 {d0, d2, d4, d6}, [r0]
_NEON2SSE_GLOBAL void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t const * val); // VST4.16 {d0, d2, d4, d6}, [r0]
_NEON2SSESTORAGE void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val); // VST4.8 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val); // VST4.16 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val); // VST4.32 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0]
_NEON2SSE_GLOBAL void vst4_s8(__transfersize(32) int8_t * ptr, int8x8x4_t val); // VST4.8 {d0, d1, d2, d3}, [r0]
_NEON2SSE_GLOBAL void vst4_s16(__transfersize(16) int16_t * ptr, int16x4x4_t val); // VST4.16 {d0, d1, d2, d3}, [r0]
_NEON2SSE_GLOBAL void vst4_s32(__transfersize(8) int32_t * ptr, int32x2x4_t val); // VST4.32 {d0, d1, d2, d3}, [r0]
_NEON2SSE_GLOBAL void vst4_s64(__transfersize(4) int64_t * ptr, int64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0]
_NEON2SSE_GLOBAL void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t const * val); // VST4.16 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val); // VST4.32 {d0, d1, d2, d3}, [r0]
_NEON2SSE_GLOBAL void vst4_p8(__transfersize(32) poly8_t * ptr, poly8x8x4_t val); // VST4.8 {d0, d1, d2, d3}, [r0]
_NEON2SSE_GLOBAL void vst4_p16(__transfersize(16) poly16_t * ptr, poly16x4x4_t val); // VST4.16 {d0, d1, d2, d3}, [r0]
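//Illustrative sketch (not part of the API): the VSTn stores above interleave their source vectors,
//which is the usual way to write planar R,G,B data back as packed RGB. Names and values are hypothetical.
//    uint8_t rgb[24];                                   //room for 8 packed RGB triples
//    uint8x8x3_t planes;
//    planes.val[0] = vdup_n_u8(0xFF);                   //R plane
//    planes.val[1] = vdup_n_u8(0x00);                   //G plane
//    planes.val[2] = vdup_n_u8(0x80);                   //B plane
//    vst3_u8(rgb, planes);                              //rgb = {R0,G0,B0, R1,G1,B1, ...}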
//Store a single lane of N-element structure to memory
_NEON2SSESTORAGE void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t const * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
_NEON2SSESTORAGE void vst2q_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x4x2_t const * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0]
_NEON2SSE_GLOBAL void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t const * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
_NEON2SSE_GLOBAL void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t const * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0]
_NEON2SSE_GLOBAL void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t const * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
_NEON2SSESTORAGE void vst2q_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x4x2_t const * val, __constrange(0,3) int lane); //VST2.32 {d0[0], d2[0]}, [r0]
_NEON2SSE_GLOBAL void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t const * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
_NEON2SSESTORAGE void vst2_lane_u8(__transfersize(2) uint8_t * ptr, uint8x8x2_t val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0]
_NEON2SSESTORAGE void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
_NEON2SSESTORAGE void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
_NEON2SSE_GLOBAL void vst2_lane_s8(__transfersize(2) int8_t * ptr, int8x8x2_t val, __constrange(0,7) int lane); // VST2.8 {d0[0],d1[0]}, [r0]
_NEON2SSE_GLOBAL void vst2_lane_s16(__transfersize(2) int16_t * ptr, int16x4x2_t val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
_NEON2SSE_GLOBAL void vst2_lane_s32(__transfersize(2) int32_t * ptr, int32x2x2_t val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
_NEON2SSE_GLOBAL void vst2_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x4x2_t const * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
_NEON2SSESTORAGE void vst2_lane_f32(__transfersize(2) float32_t * ptr, float32x2x2_t val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
_NEON2SSE_GLOBAL void vst2_lane_p8(__transfersize(2) poly8_t * ptr, poly8x8x2_t val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0]
_NEON2SSE_GLOBAL void vst2_lane_p16(__transfersize(2) poly16_t * ptr, poly16x4x2_t val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
_NEON2SSESTORAGE void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t const * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
_NEON2SSESTORAGE void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t const * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0]
_NEON2SSE_GLOBAL void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t const * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
_NEON2SSE_GLOBAL void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t const * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0]
_NEON2SSE_GLOBAL void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t const * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
_NEON2SSESTORAGE void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t const * val, __constrange(0,3) int lane); //VST3.32 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSE_GLOBAL void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t const * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
_NEON2SSESTORAGE void vst3_lane_u8(__transfersize(3) uint8_t * ptr, uint8x8x3_t val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0]
_NEON2SSESTORAGE void vst3_lane_u16(__transfersize(3) uint16_t * ptr, uint16x4x3_t val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
_NEON2SSESTORAGE void vst3_lane_u32(__transfersize(3) uint32_t * ptr, uint32x2x3_t val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
_NEON2SSE_GLOBAL void vst3_lane_s8(__transfersize(3) int8_t * ptr, int8x8x3_t val, __constrange(0,7) int lane); // VST3.8 {d0[0],d1[0], d2[0]}, [r0]
_NEON2SSE_GLOBAL void vst3_lane_s16(__transfersize(3) int16_t * ptr, int16x4x3_t val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
_NEON2SSE_GLOBAL void vst3_lane_s32(__transfersize(3) int32_t * ptr, int32x2x3_t val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
_NEON2SSE_GLOBAL void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t const * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
_NEON2SSESTORAGE void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
_NEON2SSE_GLOBAL void vst3_lane_p8(__transfersize(3) poly8_t * ptr, poly8x8x3_t val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0]
_NEON2SSE_GLOBAL void vst3_lane_p16(__transfersize(3) poly16_t * ptr, poly16x4x3_t val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
_NEON2SSESTORAGE void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t const * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSESTORAGE void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t const * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSE_GLOBAL void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t const * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSE_GLOBAL void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t const * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSE_GLOBAL void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t const * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSESTORAGE void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t const * val, __constrange(0,3) int lane); //VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSE_GLOBAL void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t const * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSESTORAGE void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSESTORAGE void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSESTORAGE void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_GLOBAL void vst4_lane_s8(__transfersize(4) int8_t * ptr, int8x8x4_t val, __constrange(0,7) int lane); // VST4.8 {d0[0],d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_GLOBAL void vst4_lane_s16(__transfersize(4) int16_t * ptr, int16x4x4_t val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_GLOBAL void vst4_lane_s32(__transfersize(4) int32_t * ptr, int32x2x4_t val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_GLOBAL void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t const * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSESTORAGE void vst4_lane_f32(__transfersize(4) float32_t * ptr, float32x2x4_t val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_GLOBAL void vst4_lane_p8(__transfersize(4) poly8_t * ptr, poly8x8x4_t val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_GLOBAL void vst4_lane_p16(__transfersize(4) poly16_t * ptr, poly16x4x4_t val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
//Extract lanes from a vector and put into a register. These intrinsics extract a single lane (element) from a vector.
_NEON2SSE_GLOBAL uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
_NEON2SSE_GLOBAL uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0]
_NEON2SSE_GLOBAL uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
_NEON2SSE_GLOBAL int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0]
_NEON2SSE_GLOBAL int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0]
_NEON2SSE_GLOBAL int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
_NEON2SSE_GLOBAL poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
_NEON2SSE_GLOBAL poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0]
_NEON2SSE_GLOBAL float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
_NEON2SSE_GLOBAL uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
_NEON2SSE_GLOBAL uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0]
_NEON2SSE_GLOBAL uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
_NEON2SSE_GLOBAL int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0]
_NEON2SSE_GLOBAL int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0]
_NEON2SSE_GLOBAL int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
_NEON2SSE_GLOBAL poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
_NEON2SSE_GLOBAL poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0]
_NEON2SSESTORAGE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
_NEON2SSE_GLOBAL int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
_NEON2SSE_GLOBAL uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
_NEON2SSE_GLOBAL int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
_NEON2SSE_GLOBAL uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
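//Illustrative sketch (not part of the API): lane extraction reads one element back into a scalar; the lane
//index must be a compile-time constant within the declared range. Values below are hypothetical.
//    float32x4_t v = vdupq_n_f32(1.5f);
//    float32_t x = vgetq_lane_f32(v, 2);                //x == 1.5f
//    uint64x2_t w = vdupq_n_u64(7);
//    uint64_t lo = vgetq_lane_u64(w, 0);                //lo == 7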
//Load a single lane of a vector from a literal. These intrinsics set a single lane (element) within a vector.
_NEON2SSESTORAGE uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
_NEON2SSESTORAGE uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
_NEON2SSESTORAGE uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
_NEON2SSESTORAGE int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
_NEON2SSESTORAGE int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
_NEON2SSESTORAGE int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
_NEON2SSE_GLOBAL poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
_NEON2SSE_GLOBAL poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
_NEON2SSESTORAGE float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
_NEON2SSESTORAGE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
_NEON2SSESTORAGE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
_NEON2SSESTORAGE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
_NEON2SSESTORAGE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
_NEON2SSESTORAGE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
_NEON2SSESTORAGE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
_NEON2SSE_GLOBAL poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
_NEON2SSE_GLOBAL poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
_NEON2SSESTORAGE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
_NEON2SSESTORAGE int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
_NEON2SSESTORAGE uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
_NEON2SSESTORAGE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
_NEON2SSE_GLOBAL uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
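//Illustrative sketch (not part of the API): vset_lane/vsetq_lane return a copy of the input vector with one
//lane replaced; the vector argument itself is not modified. Values below are hypothetical.
//    int32x4_t v = vdupq_n_s32(0);
//    v = vsetq_lane_s32(-5, v, 1);                      //v = {0, -5, 0, 0}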
//Initialize a vector from a literal bit pattern.
_NEON2SSESTORAGE int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0
_NEON2SSE_GLOBAL int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0
_NEON2SSE_GLOBAL int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0
_NEON2SSE_GLOBAL float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0
_NEON2SSESTORAGE float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0
_NEON2SSE_GLOBAL uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0
_NEON2SSE_GLOBAL uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0
_NEON2SSE_GLOBAL uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0
_NEON2SSE_GLOBAL uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0
_NEON2SSE_GLOBAL poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0
_NEON2SSE_GLOBAL poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0
_NEON2SSE_GLOBAL int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0
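//Illustrative sketch (not part of the API): vcreate reinterprets a 64-bit literal as a vector, with the least
//significant byte of the literal becoming element 0. The constant below is hypothetical.
//    uint8x8_t idx = vcreate_u8(0x0706050403020100ULL); //idx = {0,1,2,3,4,5,6,7}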
//Set all lanes to same value
//Load all lanes of vector to the same literal value
_NEON2SSESTORAGE uint8x8_t vdup_n_u8(uint8_t value); // VDUP.8 d0,r0
_NEON2SSESTORAGE uint16x4_t vdup_n_u16(uint16_t value); // VDUP.16 d0,r0
_NEON2SSESTORAGE uint32x2_t vdup_n_u32(uint32_t value); // VDUP.32 d0,r0
_NEON2SSESTORAGE int8x8_t vdup_n_s8(int8_t value); // VDUP.8 d0,r0
_NEON2SSESTORAGE int16x4_t vdup_n_s16(int16_t value); // VDUP.16 d0,r0
_NEON2SSESTORAGE int32x2_t vdup_n_s32(int32_t value); // VDUP.32 d0,r0
_NEON2SSE_GLOBAL poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0
_NEON2SSE_GLOBAL poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0
_NEON2SSESTORAGE float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0
_NEON2SSE_GLOBAL uint8x16_t vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0
_NEON2SSE_GLOBAL uint16x8_t vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0
_NEON2SSE_GLOBAL uint32x4_t vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0
_NEON2SSE_GLOBAL int8x16_t vdupq_n_s8(int8_t value); // VDUP.8 q0,r0
_NEON2SSE_GLOBAL int16x8_t vdupq_n_s16(int16_t value); // VDUP.16 q0,r0
_NEON2SSE_GLOBAL int32x4_t vdupq_n_s32(int32_t value); // VDUP.32 q0,r0
_NEON2SSE_GLOBAL poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0
_NEON2SSE_GLOBAL poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0
_NEON2SSE_GLOBAL float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0
_NEON2SSESTORAGE int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0
_NEON2SSESTORAGE uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0
_NEON2SSESTORAGE int64x2_t vdupq_n_s64(int64_t value); // VMOV d0,r0,r0
_NEON2SSESTORAGE uint64x2_t vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0
_NEON2SSE_GLOBAL uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0
_NEON2SSE_GLOBAL uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0
_NEON2SSE_GLOBAL uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0
_NEON2SSE_GLOBAL int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0
_NEON2SSE_GLOBAL int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0
_NEON2SSE_GLOBAL int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0
_NEON2SSE_GLOBAL poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0
_NEON2SSE_GLOBAL poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0
_NEON2SSE_GLOBAL float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0
_NEON2SSE_GLOBAL uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0
_NEON2SSE_GLOBAL uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0
_NEON2SSE_GLOBAL uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0
_NEON2SSE_GLOBAL int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0
_NEON2SSE_GLOBAL int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0
_NEON2SSE_GLOBAL int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0
_NEON2SSE_GLOBAL poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0
_NEON2SSE_GLOBAL poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0
_NEON2SSE_GLOBAL float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0
_NEON2SSE_GLOBAL int64x1_t vmov_n_s64(int64_t value); // VMOV d0,r0,r0
_NEON2SSE_GLOBAL uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0
_NEON2SSE_GLOBAL int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0
_NEON2SSE_GLOBAL uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0
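//Illustrative sketch (not part of the API): vdup_n_xx and vmov_n_xx are equivalent ways to broadcast one
//scalar into every lane of the result. Values below are hypothetical.
//    float32x4_t ones = vdupq_n_f32(1.0f);              //{1.0f, 1.0f, 1.0f, 1.0f}
//    int16x8_t k = vmovq_n_s16(42);                     //all eight lanes equal 42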
//Load all lanes of the vector to the value of a lane of a vector
_NEON2SSESTORAGE uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
_NEON2SSESTORAGE uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
_NEON2SSESTORAGE uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
_NEON2SSE_GLOBAL int8x8_t vdup_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
_NEON2SSE_GLOBAL int16x4_t vdup_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
_NEON2SSE_GLOBAL int32x2_t vdup_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
_NEON2SSE_GLOBAL poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
_NEON2SSE_GLOBAL poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
_NEON2SSESTORAGE float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
_NEON2SSESTORAGE uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
_NEON2SSESTORAGE uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
_NEON2SSESTORAGE uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
_NEON2SSE_GLOBAL int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
_NEON2SSE_GLOBAL int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
_NEON2SSE_GLOBAL int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
_NEON2SSE_GLOBAL poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
_NEON2SSE_GLOBAL poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
_NEON2SSE_GLOBAL float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
_NEON2SSE_GLOBAL int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
_NEON2SSE_GLOBAL uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
_NEON2SSESTORAGE int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
_NEON2SSE_GLOBAL uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
//Combining vectors. These intrinsics join two 64 bit vectors into a single 128bit vector.
_NEON2SSESTORAGE int8x16_t vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0
_NEON2SSE_GLOBAL int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0
_NEON2SSE_GLOBAL int32x4_t vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0
_NEON2SSE_GLOBAL int64x2_t vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0
_NEON2SSE_GLOBAL float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0
_NEON2SSESTORAGE float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
_NEON2SSE_GLOBAL uint8x16_t vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0
_NEON2SSE_GLOBAL uint16x8_t vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0
_NEON2SSE_GLOBAL uint32x4_t vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0
_NEON2SSE_GLOBAL uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0
_NEON2SSE_GLOBAL poly8x16_t vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0
_NEON2SSE_GLOBAL poly16x8_t vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0
//Splitting vectors. These intrinsics split a 128 bit vector into 2 component 64 bit vectors
_NEON2SSESTORAGE int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0
_NEON2SSESTORAGE int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0
_NEON2SSESTORAGE int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0
_NEON2SSESTORAGE int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0
_NEON2SSE_GLOBAL float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0
_NEON2SSESTORAGE float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0
_NEON2SSE_GLOBAL uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0
_NEON2SSE_GLOBAL uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0
_NEON2SSE_GLOBAL uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0
_NEON2SSE_GLOBAL uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0
_NEON2SSE_GLOBAL poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0
_NEON2SSE_GLOBAL poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0
_NEON2SSESTORAGE int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0
_NEON2SSESTORAGE int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0
_NEON2SSESTORAGE int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0
_NEON2SSESTORAGE int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0
_NEON2SSE_GLOBAL float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0
_NEON2SSESTORAGE float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0
_NEON2SSE_GLOBAL uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0
_NEON2SSE_GLOBAL uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0
_NEON2SSE_GLOBAL uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0
_NEON2SSE_GLOBAL uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0
_NEON2SSE_GLOBAL poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0
_NEON2SSE_GLOBAL poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0
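//Illustrative sketch (not part of the API): vcombine_xx and vget_low_xx/vget_high_xx are inverses, so the
//round trip below leaves the data unchanged. Variable names are hypothetical.
//    uint8x16_t q = vdupq_n_u8(3);
//    uint8x8_t lo = vget_low_u8(q);                     //lanes 0..7
//    uint8x8_t hi = vget_high_u8(q);                    //lanes 8..15
//    uint8x16_t q2 = vcombine_u8(lo, hi);               //q2 holds the same 16 bytes as q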
//Converting vectors. These intrinsics are used to convert vectors.
//Convert from float
_NEON2SSESTORAGE int32x2_t vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0
_NEON2SSESTORAGE uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0
_NEON2SSESTORAGE int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0
_NEON2SSESTORAGE uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0
_NEON2SSESTORAGE int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32
_NEON2SSESTORAGE uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32
_NEON2SSESTORAGE int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32
_NEON2SSESTORAGE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32
_NEON2SSESTORAGE int32x4_t vcvtnq_s32_f32(float32x4_t a); // VCVTN.S32.F32 q0, q0
//Convert to float
_NEON2SSESTORAGE float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0
_NEON2SSESTORAGE float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0
_NEON2SSE_GLOBAL float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0
_NEON2SSESTORAGE float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0
_NEON2SSESTORAGE float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32
_NEON2SSESTORAGE float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32
_NEON2SSESTORAGE float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32
_NEON2SSESTORAGE float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32
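//Illustrative sketch (not part of the API): the plain float->integer conversions round toward zero, and the
//_n_ forms treat the values as fixed-point with the given number of fractional bits. Values are hypothetical.
//    float32x4_t f = vdupq_n_f32(2.75f);
//    int32x4_t i = vcvtq_s32_f32(f);                    //{2, 2, 2, 2} (truncation toward zero)
//    int32x4_t q8 = vcvtq_n_s32_f32(f, 8);              //{704, 704, 704, 704} (2.75 * 2^8)
//    float32x4_t g = vcvtq_n_f32_s32(q8, 8);            //back to {2.75f, 2.75f, 2.75f, 2.75f}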
//Convert between floats
_NEON2SSE_GLOBAL float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0
_NEON2SSE_GLOBAL float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0
//Vector narrow integer
_NEON2SSESTORAGE int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0
_NEON2SSESTORAGE int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0
_NEON2SSESTORAGE int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0
_NEON2SSE_GLOBAL uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0
_NEON2SSE_GLOBAL uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0
_NEON2SSE_GLOBAL uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0
//Vector long move
_NEON2SSESTORAGE int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0
_NEON2SSESTORAGE int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0
_NEON2SSESTORAGE int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0
_NEON2SSESTORAGE uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 q0,d0
_NEON2SSESTORAGE uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.U16 q0,d0
_NEON2SSESTORAGE uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0
//Vector saturating narrow integer
_NEON2SSESTORAGE int8x8_t vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0
_NEON2SSESTORAGE int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0
_NEON2SSESTORAGE int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0
_NEON2SSESTORAGE uint8x8_t vqmovn_u16(uint16x8_t a); // VQMOVN.U16 d0,q0
_NEON2SSESTORAGE uint16x4_t vqmovn_u32(uint32x4_t a); // VQMOVN.U32 d0,q0
_NEON2SSESTORAGE uint32x2_t vqmovn_u64(uint64x2_t a); // VQMOVN.U64 d0,q0
//Vector saturating narrow integer signed->unsigned
_NEON2SSESTORAGE uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0
_NEON2SSESTORAGE uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0
_NEON2SSESTORAGE uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0
//Table look up
_NEON2SSESTORAGE uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
_NEON2SSE_GLOBAL int8x8_t vtbl1_s8(int8x8_t a, int8x8_t b); // VTBL.8 d0, {d0}, d0
_NEON2SSE_GLOBAL poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
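//Illustrative sketch (not part of the API): vtbl1_u8 uses each byte of the second argument as an index into
//the first argument; indices outside 0..7 produce 0 in the corresponding lane. The constants are hypothetical.
//    uint8x8_t table = vcreate_u8(0x1716151413121110ULL); //{0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17}
//    uint8x8_t idx = vcreate_u8(0xFF00010203040506ULL);   //{6,5,4,3,2,1,0,255}
//    uint8x8_t r = vtbl1_u8(table, idx);                  //{0x16,0x15,0x14,0x13,0x12,0x11,0x10,0}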
//Extended table look up intrinsics
_NEON2SSESTORAGE uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
_NEON2SSE_GLOBAL int8x8_t vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0
_NEON2SSE_GLOBAL poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
_NEON2SSESTORAGE uint8x8_t vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
_NEON2SSE_GLOBAL int8x8_t vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c); // VTBX.8 d0, {d0, d1}, d0
_NEON2SSE_GLOBAL poly8x8_t vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
_NEON2SSESTORAGE uint8x8_t vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
_NEON2SSE_GLOBAL int8x8_t vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
_NEON2SSE_GLOBAL poly8x8_t vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
_NEON2SSESTORAGE uint8x8_t vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
_NEON2SSE_GLOBAL int8x8_t vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
_NEON2SSE_GLOBAL poly8x8_t vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
//Operations with a scalar value
//Vector multiply accumulate with scalar
_NEON2SSESTORAGE int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0]
_NEON2SSESTORAGE int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0]
_NEON2SSE_GLOBAL uint16x4_t vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0]
_NEON2SSE_GLOBAL uint32x2_t vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0]
_NEON2SSESTORAGE float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 d0,d0, d0[0]
_NEON2SSESTORAGE int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0,d0[0]
_NEON2SSESTORAGE int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0,d0[0]
_NEON2SSE_GLOBAL uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0,q0, d0[0]
_NEON2SSE_GLOBAL uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0,q0, d0[0]
_NEON2SSESTORAGE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0,q0, d0[0]
//Vector widening multiply accumulate with scalar
_NEON2SSESTORAGE int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); //VMLAL.S16 q0, d0,d0[0]
_NEON2SSESTORAGE int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); //VMLAL.S32 q0, d0,d0[0]
_NEON2SSESTORAGE uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.U16 q0,d0, d0[0]
_NEON2SSESTORAGE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0,d0, d0[0]
//Vector widening saturating doubling multiply accumulate with scalar
_NEON2SSESTORAGE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0,d0, d0[0]
_NEON2SSESTORAGE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0,d0, d0[0]
//Vector multiply subtract with scalar
_NEON2SSESTORAGE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0]
_NEON2SSESTORAGE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0]
_NEON2SSESTORAGE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0]
_NEON2SSESTORAGE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0]
_NEON2SSESTORAGE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0,d0, d0[0]
_NEON2SSESTORAGE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0,d0[0]
_NEON2SSESTORAGE int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0,d0[0]
_NEON2SSESTORAGE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0,q0, d0[0]
_NEON2SSESTORAGE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0,q0, d0[0]
_NEON2SSESTORAGE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 q0,q0, d0[0]
//Vector widening multiply subtract with scalar
_NEON2SSESTORAGE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLSL.S16 q0, d0,d0[0]
_NEON2SSESTORAGE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLSL.S32 q0, d0,d0[0]
_NEON2SSESTORAGE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLSL.U16 q0,d0, d0[0]
_NEON2SSESTORAGE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLSL.U32 q0,d0, d0[0]
//Vector widening saturating doubling multiply subtract with scalar
_NEON2SSESTORAGE int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0,d0, d0[0]
_NEON2SSESTORAGE int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0,d0, d0[0]
//Vector multiply by scalar
_NEON2SSESTORAGE int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0]
_NEON2SSESTORAGE int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0]
_NEON2SSESTORAGE float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0]
_NEON2SSESTORAGE uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0]
_NEON2SSESTORAGE uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0]
_NEON2SSESTORAGE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0]
_NEON2SSESTORAGE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0]
_NEON2SSESTORAGE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0]
_NEON2SSESTORAGE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0]
_NEON2SSESTORAGE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0]
//Vector long multiply with scalar
_NEON2SSESTORAGE int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0]
_NEON2SSESTORAGE int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0]
_NEON2SSESTORAGE uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.U16 q0,d0,d0[0]
_NEON2SSESTORAGE uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0]
//Vector long multiply by scalar
_NEON2SSESTORAGE int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0]
_NEON2SSESTORAGE int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0]
_NEON2SSESTORAGE uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.U16 q0,d0,d0[0]
_NEON2SSESTORAGE uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0]
//Vector saturating doubling long multiply with scalar
_NEON2SSESTORAGE int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0]
_NEON2SSESTORAGE int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0]
//Vector saturating doubling long multiply by scalar
_NEON2SSESTORAGE int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0]
_NEON2SSESTORAGE int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULL.S32 q0,d0,d0[0]
//Vector saturating doubling multiply high with scalar
_NEON2SSESTORAGE int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQDMULH.S16 d0,d0,d0[0]
_NEON2SSESTORAGE int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQDMULH.S32 d0,d0,d0[0]
_NEON2SSESTORAGE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQDMULH.S16 q0,q0,d0[0]
_NEON2SSESTORAGE int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQDMULH.S32 q0,q0,d0[0]
//Vector saturating doubling multiply high by scalar
_NEON2SSESTORAGE int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 d0,d0,d0[0]
_NEON2SSESTORAGE int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 d0,d0,d0[0]
_NEON2SSESTORAGE int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 q0,q0,d0[0]
_NEON2SSESTORAGE int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 q0,q0,d0[0]
//Vector saturating rounding doubling multiply high with scalar
_NEON2SSESTORAGE int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0]
_NEON2SSESTORAGE int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQRDMULH.S32 d0,d0,d0[0]
_NEON2SSESTORAGE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0]
_NEON2SSESTORAGE int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0]
//Vector rounding saturating doubling multiply high by scalar
_NEON2SSESTORAGE int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0]
_NEON2SSESTORAGE int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0]
_NEON2SSESTORAGE int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0]
_NEON2SSESTORAGE int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0]
//Vector multiply accumulate with scalar
_NEON2SSESTORAGE int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0]
_NEON2SSESTORAGE int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0]
_NEON2SSE_GLOBAL uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLA.I16 d0, d0, d0[0]
_NEON2SSE_GLOBAL uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0]
_NEON2SSESTORAGE float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0]
_NEON2SSESTORAGE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0]
_NEON2SSESTORAGE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0]
_NEON2SSE_GLOBAL uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0]
_NEON2SSE_GLOBAL uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0]
_NEON2SSESTORAGE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0]
//Vector widening multiply accumulate with scalar
_NEON2SSESTORAGE int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0]
_NEON2SSESTORAGE int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0]
_NEON2SSESTORAGE uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.U16 q0, d0, d0[0]
_NEON2SSESTORAGE uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0]
//Vector widening saturating doubling multiply accumulate with scalar
_NEON2SSESTORAGE int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0]
_NEON2SSESTORAGE int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0]
//Vector multiply subtract with scalar
_NEON2SSESTORAGE int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0]
_NEON2SSESTORAGE int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0]
_NEON2SSESTORAGE uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0]
_NEON2SSESTORAGE uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0]
_NEON2SSESTORAGE float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0]
_NEON2SSESTORAGE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0]
_NEON2SSESTORAGE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0]
_NEON2SSESTORAGE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0]
_NEON2SSESTORAGE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0]
_NEON2SSESTORAGE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0]
//Vector widening multiply subtract with scalar
_NEON2SSESTORAGE int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0]
_NEON2SSESTORAGE int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0]
_NEON2SSESTORAGE uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLSL.U16 q0, d0, d0[0]
_NEON2SSESTORAGE uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0]
//Vector widening saturating doubling multiply subtract with scalar
_NEON2SSESTORAGE int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0]
_NEON2SSESTORAGE int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0]
//Vector extract
_NEON2SSESTORAGE int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
_NEON2SSE_GLOBAL uint8x8_t vext_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
_NEON2SSE_GLOBAL poly8x8_t vext_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
_NEON2SSESTORAGE int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
_NEON2SSE_GLOBAL uint16x4_t vext_u16(uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
_NEON2SSE_GLOBAL poly16x4_t vext_p16(poly16x4_t a, poly16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
_NEON2SSESTORAGE int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
_NEON2SSE_GLOBAL uint32x2_t vext_u32(uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
_NEON2SSE_GLOBAL int64x1_t vext_s64(int64x1_t a, int64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
_NEON2SSE_GLOBAL uint64x1_t vext_u64(uint64x1_t a, uint64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
_NEON2SSESTORAGE float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
_NEON2SSE_GLOBAL int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
_NEON2SSE_GLOBAL uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
_NEON2SSE_GLOBAL poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
_NEON2SSE_GLOBAL int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
_NEON2SSE_GLOBAL uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
_NEON2SSE_GLOBAL poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
1988_NEON2SSE_GLOBAL int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
1989_NEON2SSE_GLOBAL uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
1990_NEON2SSE_GLOBAL int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
1991_NEON2SSE_GLOBAL uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
_NEON2SSE_GLOBAL float32x4_t vextq_f32(float32x4_t a, float32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
1993//Reverse vector elements (swap endianness). VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide.
1994_NEON2SSESTORAGE int8x8_t vrev64_s8(int8x8_t vec); // VREV64.8 d0,d0
1995_NEON2SSESTORAGE int16x4_t vrev64_s16(int16x4_t vec); // VREV64.16 d0,d0
1996_NEON2SSESTORAGE int32x2_t vrev64_s32(int32x2_t vec); // VREV64.32 d0,d0
1997_NEON2SSE_GLOBAL uint8x8_t vrev64_u8(uint8x8_t vec); // VREV64.8 d0,d0
1998_NEON2SSE_GLOBAL uint16x4_t vrev64_u16(uint16x4_t vec); // VREV64.16 d0,d0
1999_NEON2SSE_GLOBAL uint32x2_t vrev64_u32(uint32x2_t vec); // VREV64.32 d0,d0
2000_NEON2SSE_GLOBAL poly8x8_t vrev64_p8(poly8x8_t vec); // VREV64.8 d0,d0
2001_NEON2SSE_GLOBAL poly16x4_t vrev64_p16(poly16x4_t vec); // VREV64.16 d0,d0
2002_NEON2SSESTORAGE float32x2_t vrev64_f32(float32x2_t vec); // VREV64.32 d0,d0
2003_NEON2SSESTORAGE int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0
2004_NEON2SSESTORAGE int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0
2005_NEON2SSESTORAGE int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0
2006_NEON2SSE_GLOBAL uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0
2007_NEON2SSE_GLOBAL uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0
2008_NEON2SSE_GLOBAL uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0
2009_NEON2SSE_GLOBAL poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0
2010_NEON2SSE_GLOBAL poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0
2011_NEON2SSE_GLOBAL float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0
2012_NEON2SSESTORAGE int8x8_t vrev32_s8(int8x8_t vec); // VREV32.8 d0,d0
2013_NEON2SSESTORAGE int16x4_t vrev32_s16(int16x4_t vec); // VREV32.16 d0,d0
2014_NEON2SSE_GLOBAL uint8x8_t vrev32_u8(uint8x8_t vec); // VREV32.8 d0,d0
2015_NEON2SSE_GLOBAL uint16x4_t vrev32_u16(uint16x4_t vec); // VREV32.16 d0,d0
2016_NEON2SSE_GLOBAL poly8x8_t vrev32_p8(poly8x8_t vec); // VREV32.8 d0,d0
2017_NEON2SSE_GLOBAL poly16x4_t vrev32_p16(poly16x4_t vec); // VREV32.16 d0,d0
2018_NEON2SSESTORAGE int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0
2019_NEON2SSESTORAGE int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0
2020_NEON2SSE_GLOBAL uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0
2021_NEON2SSE_GLOBAL uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0
2022_NEON2SSE_GLOBAL poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0
2023_NEON2SSE_GLOBAL poly16x8_t vrev32q_p16(poly16x8_t vec); // VREV32.16 q0,q0
2024_NEON2SSESTORAGE int8x8_t vrev16_s8(int8x8_t vec); // VREV16.8 d0,d0
2025_NEON2SSE_GLOBAL uint8x8_t vrev16_u8(uint8x8_t vec); // VREV16.8 d0,d0
2026_NEON2SSE_GLOBAL poly8x8_t vrev16_p8(poly8x8_t vec); // VREV16.8 d0,d0
2027_NEON2SSESTORAGE int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0
2028_NEON2SSE_GLOBAL uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0
2029_NEON2SSE_GLOBAL poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0
2030//Other single operand arithmetic
2031//Absolute: Vd[i] = |Va[i]|
2032_NEON2SSESTORAGE int8x8_t vabs_s8(int8x8_t a); // VABS.S8 d0,d0
2033_NEON2SSESTORAGE int16x4_t vabs_s16(int16x4_t a); // VABS.S16 d0,d0
2034_NEON2SSESTORAGE int32x2_t vabs_s32(int32x2_t a); // VABS.S32 d0,d0
2035_NEON2SSESTORAGE float32x2_t vabs_f32(float32x2_t a); // VABS.F32 d0,d0
2036_NEON2SSE_GLOBAL int8x16_t vabsq_s8(int8x16_t a); // VABS.S8 q0,q0
2037_NEON2SSE_GLOBAL int16x8_t vabsq_s16(int16x8_t a); // VABS.S16 q0,q0
2038_NEON2SSE_GLOBAL int32x4_t vabsq_s32(int32x4_t a); // VABS.S32 q0,q0
2039_NEON2SSESTORAGE float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0
2040
2041#ifdef _NEON2SSE_64BIT
2042_NEON2SSESTORAGE int64x2_t vabsq_s64(int64x2_t a); // VABS.S64 q0,q0
2043_NEON2SSESTORAGE float64x2_t vabsq_f64(float64x2_t a); // VABS.F64 q0,q0
2044#endif
2045
2046//Saturating absolute: Vd[i] = sat(|Va[i]|)
2047_NEON2SSESTORAGE int8x8_t vqabs_s8(int8x8_t a); // VQABS.S8 d0,d0
2048_NEON2SSESTORAGE int16x4_t vqabs_s16(int16x4_t a); // VQABS.S16 d0,d0
2049_NEON2SSESTORAGE int32x2_t vqabs_s32(int32x2_t a); // VQABS.S32 d0,d0
2050_NEON2SSESTORAGE int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0
2051_NEON2SSESTORAGE int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0
2052_NEON2SSESTORAGE int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0
2053//Negate: Vd[i] = - Va[i]
_NEON2SSESTORAGE int8x8_t vneg_s8(int8x8_t a); // VNEG.S8 d0,d0
_NEON2SSESTORAGE int16x4_t vneg_s16(int16x4_t a); // VNEG.S16 d0,d0
_NEON2SSESTORAGE int32x2_t vneg_s32(int32x2_t a); // VNEG.S32 d0,d0
_NEON2SSESTORAGE float32x2_t vneg_f32(float32x2_t a); // VNEG.F32 d0,d0
_NEON2SSESTORAGE int8x16_t vnegq_s8(int8x16_t a); // VNEG.S8 q0,q0
_NEON2SSESTORAGE int16x8_t vnegq_s16(int16x8_t a); // VNEG.S16 q0,q0
_NEON2SSESTORAGE int32x4_t vnegq_s32(int32x4_t a); // VNEG.S32 q0,q0
_NEON2SSESTORAGE float32x4_t vnegq_f32(float32x4_t a); // VNEG.F32 q0,q0
2062//Saturating Negate: sat(Vd[i] = - Va[i])
_NEON2SSESTORAGE int8x8_t vqneg_s8(int8x8_t a); // VQNEG.S8 d0,d0
_NEON2SSESTORAGE int16x4_t vqneg_s16(int16x4_t a); // VQNEG.S16 d0,d0
_NEON2SSESTORAGE int32x2_t vqneg_s32(int32x2_t a); // VQNEG.S32 d0,d0
_NEON2SSESTORAGE int8x16_t vqnegq_s8(int8x16_t a); // VQNEG.S8 q0,q0
_NEON2SSESTORAGE int16x8_t vqnegq_s16(int16x8_t a); // VQNEG.S16 q0,q0
_NEON2SSESTORAGE int32x4_t vqnegq_s32(int32x4_t a); // VQNEG.S32 q0,q0
2069//Count leading sign bits
2070_NEON2SSESTORAGE int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0
2071_NEON2SSESTORAGE int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0
2072_NEON2SSESTORAGE int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0
2073_NEON2SSESTORAGE int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0
2074_NEON2SSESTORAGE int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0
2075_NEON2SSESTORAGE int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0
2076//Count leading zeros
2077_NEON2SSESTORAGE int8x8_t vclz_s8(int8x8_t a); // VCLZ.I8 d0,d0
2078_NEON2SSESTORAGE int16x4_t vclz_s16(int16x4_t a); // VCLZ.I16 d0,d0
2079_NEON2SSESTORAGE int32x2_t vclz_s32(int32x2_t a); // VCLZ.I32 d0,d0
2080_NEON2SSE_GLOBAL uint8x8_t vclz_u8(uint8x8_t a); // VCLZ.I8 d0,d0
2081_NEON2SSE_GLOBAL uint16x4_t vclz_u16(uint16x4_t a); // VCLZ.I16 d0,d0
2082_NEON2SSE_GLOBAL uint32x2_t vclz_u32(uint32x2_t a); // VCLZ.I32 d0,d0
2083_NEON2SSESTORAGE int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0
2084_NEON2SSESTORAGE int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0
2085_NEON2SSESTORAGE int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0
2086_NEON2SSE_GLOBAL uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0
2087_NEON2SSE_GLOBAL uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0
2088_NEON2SSE_GLOBAL uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0
2089//Count number of set bits
2090_NEON2SSESTORAGE uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0
2091_NEON2SSE_GLOBAL int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0
2092_NEON2SSE_GLOBAL poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0
2093_NEON2SSESTORAGE uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0
2094_NEON2SSE_GLOBAL int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0
2095_NEON2SSE_GLOBAL poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0
2096//Reciprocal estimate
2097_NEON2SSESTORAGE float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0
2098_NEON2SSESTORAGE uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0
2099_NEON2SSE_GLOBAL float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0
2100_NEON2SSESTORAGE uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0
2101//Reciprocal square root estimate
2102_NEON2SSESTORAGE float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0
2103_NEON2SSESTORAGE uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0
2104_NEON2SSE_GLOBAL float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0
2105_NEON2SSESTORAGE uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0
2106//Logical operations
2107//Bitwise not
2108_NEON2SSESTORAGE int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0
2109_NEON2SSESTORAGE int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0
2110_NEON2SSESTORAGE int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0
2111_NEON2SSE_GLOBAL uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0
2112_NEON2SSE_GLOBAL uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0
2113_NEON2SSE_GLOBAL uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0
2114_NEON2SSE_GLOBAL poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0
2115_NEON2SSESTORAGE int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0
2116_NEON2SSESTORAGE int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0
2117_NEON2SSESTORAGE int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0
2118_NEON2SSE_GLOBAL uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0
2119_NEON2SSE_GLOBAL uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0
2120_NEON2SSE_GLOBAL uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0
2121_NEON2SSE_GLOBAL poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0
2122//Bitwise and
2123_NEON2SSESTORAGE int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0
2124_NEON2SSESTORAGE int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0
2125_NEON2SSESTORAGE int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0
2126_NEON2SSESTORAGE int64x1_t vand_s64(int64x1_t a, int64x1_t b); // VAND d0,d0,d0
2127_NEON2SSE_GLOBAL uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0
2128_NEON2SSE_GLOBAL uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0
2129_NEON2SSE_GLOBAL uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0
2130_NEON2SSE_GLOBAL uint64x1_t vand_u64(uint64x1_t a, uint64x1_t b); // VAND d0,d0,d0
2131_NEON2SSE_GLOBAL int8x16_t vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0
2132_NEON2SSE_GLOBAL int16x8_t vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0
2133_NEON2SSE_GLOBAL int32x4_t vandq_s32(int32x4_t a, int32x4_t b); // VAND q0,q0,q0
2134_NEON2SSE_GLOBAL int64x2_t vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0
2135_NEON2SSE_GLOBAL uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0
2136_NEON2SSE_GLOBAL uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0
2137_NEON2SSE_GLOBAL uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0
2138_NEON2SSE_GLOBAL uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0
2139//Bitwise or
2140_NEON2SSESTORAGE int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0
2141_NEON2SSESTORAGE int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0
2142_NEON2SSESTORAGE int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0
2143_NEON2SSESTORAGE int64x1_t vorr_s64(int64x1_t a, int64x1_t b); // VORR d0,d0,d0
2144_NEON2SSE_GLOBAL uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0
2145_NEON2SSE_GLOBAL uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0
2146_NEON2SSE_GLOBAL uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0
2147_NEON2SSE_GLOBAL uint64x1_t vorr_u64(uint64x1_t a, uint64x1_t b); // VORR d0,d0,d0
2148_NEON2SSE_GLOBAL int8x16_t vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0
2149_NEON2SSE_GLOBAL int16x8_t vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0
2150_NEON2SSE_GLOBAL int32x4_t vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0
2151_NEON2SSE_GLOBAL int64x2_t vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0
2152_NEON2SSE_GLOBAL uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0
2153_NEON2SSE_GLOBAL uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0
2154_NEON2SSE_GLOBAL uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0
2155_NEON2SSE_GLOBAL uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0
2156//Bitwise exclusive or (EOR or XOR)
2157_NEON2SSESTORAGE int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0
2158_NEON2SSE_GLOBAL int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0
2159_NEON2SSE_GLOBAL int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0
2160_NEON2SSESTORAGE int64x1_t veor_s64(int64x1_t a, int64x1_t b); // VEOR d0,d0,d0
2161_NEON2SSE_GLOBAL uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0
2162_NEON2SSE_GLOBAL uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0
2163_NEON2SSE_GLOBAL uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0
2164_NEON2SSE_GLOBAL uint64x1_t veor_u64(uint64x1_t a, uint64x1_t b); // VEOR d0,d0,d0
2165_NEON2SSE_GLOBAL int8x16_t veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0
2166_NEON2SSE_GLOBAL int16x8_t veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0
2167_NEON2SSE_GLOBAL int32x4_t veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0
2168_NEON2SSE_GLOBAL int64x2_t veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0
2169_NEON2SSE_GLOBAL uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0
2170_NEON2SSE_GLOBAL uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0
2171_NEON2SSE_GLOBAL uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0
2172_NEON2SSE_GLOBAL uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0
2173//Bit Clear
2174_NEON2SSESTORAGE int8x8_t vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0
2175_NEON2SSE_GLOBAL int16x4_t vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0
2176_NEON2SSE_GLOBAL int32x2_t vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0
2177_NEON2SSESTORAGE int64x1_t vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0
2178_NEON2SSE_GLOBAL uint8x8_t vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0
2179_NEON2SSE_GLOBAL uint16x4_t vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0
2180_NEON2SSE_GLOBAL uint32x2_t vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0
2181_NEON2SSE_GLOBAL uint64x1_t vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0
2182_NEON2SSE_GLOBAL int8x16_t vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0
2183_NEON2SSE_GLOBAL int16x8_t vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0
2184_NEON2SSE_GLOBAL int32x4_t vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0
2185_NEON2SSE_GLOBAL int64x2_t vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0
2186_NEON2SSE_GLOBAL uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0
2187_NEON2SSE_GLOBAL uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0
2188_NEON2SSE_GLOBAL uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0
2189_NEON2SSE_GLOBAL uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0
2190//Bitwise OR complement
2191_NEON2SSESTORAGE int8x8_t vorn_s8(int8x8_t a, int8x8_t b); // VORN d0,d0,d0
2192_NEON2SSESTORAGE int16x4_t vorn_s16(int16x4_t a, int16x4_t b); // VORN d0,d0,d0
2193_NEON2SSESTORAGE int32x2_t vorn_s32(int32x2_t a, int32x2_t b); // VORN d0,d0,d0
2194_NEON2SSESTORAGE int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0
2195_NEON2SSE_GLOBAL uint8x8_t vorn_u8(uint8x8_t a, uint8x8_t b); // VORN d0,d0,d0
2196_NEON2SSE_GLOBAL uint16x4_t vorn_u16(uint16x4_t a, uint16x4_t b); // VORN d0,d0,d0
2197_NEON2SSE_GLOBAL uint32x2_t vorn_u32(uint32x2_t a, uint32x2_t b); // VORN d0,d0,d0
2198_NEON2SSE_GLOBAL uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0
2199_NEON2SSESTORAGE int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0
2200_NEON2SSESTORAGE int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0
2201_NEON2SSESTORAGE int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0
2202_NEON2SSESTORAGE int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0
2203_NEON2SSESTORAGE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0
2204_NEON2SSESTORAGE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0
2205_NEON2SSESTORAGE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0
2206_NEON2SSE_GLOBAL uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0
2207//Bitwise Select
2208_NEON2SSESTORAGE int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0
2209_NEON2SSE_GLOBAL int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0
2210_NEON2SSE_GLOBAL int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0
2211_NEON2SSESTORAGE int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0
2212_NEON2SSE_GLOBAL uint8x8_t vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0
2213_NEON2SSE_GLOBAL uint16x4_t vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0
2214_NEON2SSE_GLOBAL uint32x2_t vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0
2215_NEON2SSE_GLOBAL uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0
2216_NEON2SSESTORAGE float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0
2217_NEON2SSE_GLOBAL poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0
2218_NEON2SSE_GLOBAL poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0
2219_NEON2SSESTORAGE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0
2220_NEON2SSE_GLOBAL int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0
2221_NEON2SSE_GLOBAL int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0
2222_NEON2SSE_GLOBAL int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0
2223_NEON2SSE_GLOBAL uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0
2224_NEON2SSE_GLOBAL uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0
2225_NEON2SSE_GLOBAL uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0
2226_NEON2SSE_GLOBAL uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0
2227_NEON2SSESTORAGE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0
2228_NEON2SSE_GLOBAL poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0
2229_NEON2SSE_GLOBAL poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0
2230//Transposition operations
2231//Transpose elements
2232_NEON2SSESTORAGE int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0
2233_NEON2SSESTORAGE int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0
2234_NEON2SSESTORAGE int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0
2235_NEON2SSE_GLOBAL uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0
2236_NEON2SSE_GLOBAL uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0
2237_NEON2SSE_GLOBAL uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0
2238_NEON2SSESTORAGE float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0
2239_NEON2SSE_GLOBAL poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0
2240_NEON2SSE_GLOBAL poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0
2241_NEON2SSESTORAGE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0
2242_NEON2SSESTORAGE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0
2243_NEON2SSESTORAGE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0
2244_NEON2SSE_GLOBAL uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0
2245_NEON2SSE_GLOBAL uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0
2246_NEON2SSE_GLOBAL uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0
2247_NEON2SSESTORAGE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0
2248_NEON2SSE_GLOBAL poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0
2249_NEON2SSE_GLOBAL poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0
2250//Interleave elements
2251_NEON2SSESTORAGE int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0
2252_NEON2SSESTORAGE int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0
2253_NEON2SSE_GLOBAL int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0
2254_NEON2SSE_GLOBAL uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0
2255_NEON2SSE_GLOBAL uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0
2256_NEON2SSE_GLOBAL uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0
2257_NEON2SSE_GLOBAL float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0
2258_NEON2SSE_GLOBAL poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0
2259_NEON2SSE_GLOBAL poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0
2260_NEON2SSESTORAGE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0
2261_NEON2SSESTORAGE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0
2262_NEON2SSESTORAGE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0
2263_NEON2SSE_GLOBAL uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0
2264_NEON2SSE_GLOBAL uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0
2265_NEON2SSE_GLOBAL uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0
2266_NEON2SSESTORAGE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0
2267_NEON2SSE_GLOBAL poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0
2268_NEON2SSE_GLOBAL poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0
2269//De-Interleave elements
2270_NEON2SSESTORAGE int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0
2271_NEON2SSESTORAGE int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0
2272_NEON2SSESTORAGE int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0
2273_NEON2SSE_GLOBAL uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0
2274_NEON2SSE_GLOBAL uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0
2275_NEON2SSE_GLOBAL uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0
2276_NEON2SSE_GLOBAL float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0
2277_NEON2SSE_GLOBAL poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0
2278_NEON2SSE_GLOBAL poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0
2279_NEON2SSESTORAGE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0
2280_NEON2SSESTORAGE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0
2281_NEON2SSESTORAGE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0
2282_NEON2SSE_GLOBAL uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0
2283_NEON2SSE_GLOBAL uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0
2284_NEON2SSE_GLOBAL uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0
2285_NEON2SSESTORAGE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0
2286_NEON2SSE_GLOBAL poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0
2287_NEON2SSE_GLOBAL poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
2288
_NEON2SSESTORAGE float32x4_t vrndnq_f32(float32x4_t a); // VRINTN.F32 q0,q0
_NEON2SSESTORAGE float64x2_t vrndnq_f64(float64x2_t a); // VRINTN.F64 q0,q0
2291
2292//Sqrt
2293_NEON2SSE_GLOBAL float32x4_t vsqrtq_f32(float32x4_t a); // VSQRT.F32 q0,q0
2294_NEON2SSE_GLOBAL float64x2_t vsqrtq_f64(float64x2_t a); // VSQRT.F64 q0,q0
2295
2296
2297//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
// the following macros solve the problem of the "immediate parameters requirement" for some x86 intrinsics.
// they are needed to make the code compile - otherwise the "Intrinsic parameter must be an immediate value" error is reported
2300//
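// A minimal sketch of the problem (hypothetical helper, not part of this header), assuming a compiler that enforces the immediate requirement:
/*
static int16x8_t set_lane_runtime(int16x8_t v, int16_t x, int lane) //'lane' is a runtime variable, not a literal
{
    //return _mm_insert_epi16(v, x, lane);  //would fail: "Intrinsic parameter must be an immediate value"
    return _MM_INSERT_EPI16(v, x, lane);    //the switch-based wrapper defined below accepts a runtime lane
}
*/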
2301#if ( defined (__INTEL_COMPILER) && !defined(__llvm__) )
2302# define _MM_ALIGNR_EPI8 _mm_alignr_epi8
2303# define _MM_EXTRACT_EPI16 (int16_t) _mm_extract_epi16
2304# define _MM_INSERT_EPI16 _mm_insert_epi16
2305# ifdef USE_SSE4
2306# define _MM_EXTRACT_EPI8 _mm_extract_epi8
2307# define _MM_EXTRACT_EPI32 _mm_extract_epi32
2308# define _MM_EXTRACT_PS _mm_extract_ps
2309# define _MM_INSERT_EPI8 _mm_insert_epi8
2310# define _MM_INSERT_EPI32 _mm_insert_epi32
2311# define _MM_INSERT_PS _mm_insert_ps
2312# ifdef _NEON2SSE_64BIT
2313# define _MM_INSERT_EPI64 _mm_insert_epi64
2314# define _MM_EXTRACT_EPI64 _mm_extract_epi64
2315# endif
2316# endif //SSE4
2317#else
2318# define _NEON2SSE_COMMA ,
2319# define _NEON2SSE_SWITCH16(NAME, a, b, LANE) \
2320 switch(LANE) \
2321 { \
2322 case 0: return NAME(a b, 0); \
2323 case 1: return NAME(a b, 1); \
2324 case 2: return NAME(a b, 2); \
2325 case 3: return NAME(a b, 3); \
2326 case 4: return NAME(a b, 4); \
2327 case 5: return NAME(a b, 5); \
2328 case 6: return NAME(a b, 6); \
2329 case 7: return NAME(a b, 7); \
2330 case 8: return NAME(a b, 8); \
2331 case 9: return NAME(a b, 9); \
2332 case 10: return NAME(a b, 10); \
2333 case 11: return NAME(a b, 11); \
2334 case 12: return NAME(a b, 12); \
2335 case 13: return NAME(a b, 13); \
2336 case 14: return NAME(a b, 14); \
2337 case 15: return NAME(a b, 15); \
2338 default: return NAME(a b, 0); \
2339 }
2340
2341# define _NEON2SSE_SWITCH8(NAME, vec, LANE, p) \
2342 switch(LANE) \
2343 { \
2344 case 0: return NAME(vec p,0); \
2345 case 1: return NAME(vec p,1); \
2346 case 2: return NAME(vec p,2); \
2347 case 3: return NAME(vec p,3); \
2348 case 4: return NAME(vec p,4); \
2349 case 5: return NAME(vec p,5); \
2350 case 6: return NAME(vec p,6); \
2351 case 7: return NAME(vec p,7); \
2352 default: return NAME(vec p,0); \
2353 }
2354
2355# define _NEON2SSE_SWITCH4(NAME, case0, case1, case2, case3, vec, LANE, p) \
2356 switch(LANE) \
2357 { \
2358 case case0: return NAME(vec p,case0); \
2359 case case1: return NAME(vec p,case1); \
2360 case case2: return NAME(vec p,case2); \
2361 case case3: return NAME(vec p,case3); \
2362 default: return NAME(vec p,case0); \
2363 }
2364
2365 _NEON2SSE_INLINE __m128i _MM_ALIGNR_EPI8(__m128i a, __m128i b, int LANE)
2366 {
2367 _NEON2SSE_SWITCH16(_mm_alignr_epi8, a, _NEON2SSE_COMMA b, LANE)
2368 }
2369
2370 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI16(__m128i vec, int p, const int LANE)
2371 {
2372 _NEON2SSE_SWITCH8(_mm_insert_epi16, vec, LANE, _NEON2SSE_COMMA p)
2373 }
2374
2375 _NEON2SSE_INLINE int16_t _MM_EXTRACT_EPI16(__m128i vec, const int LANE)
2376 {
2377 _NEON2SSE_SWITCH8((int16_t)_mm_extract_epi16, vec, LANE,)
2378 }
2379
2380#ifdef USE_SSE4
2381 _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE)
2382 {
2383 _NEON2SSE_SWITCH4(_mm_extract_epi32, 0,1,2,3, vec, LANE,)
2384 }
2385
2386 _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE)
2387 {
2388 _NEON2SSE_SWITCH4(_mm_extract_ps, 0,1,2,3, vec, LANE,)
2389 }
2390
2391 _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE)
2392 {
2393 _NEON2SSE_SWITCH16(_mm_extract_epi8, vec, , LANE)
2394 }
2395
2396 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI32(__m128i vec, int p, const int LANE)
2397 {
2398 _NEON2SSE_SWITCH4(_mm_insert_epi32, 0, 1, 2, 3, vec, LANE, _NEON2SSE_COMMA p)
2399 }
2400
2401 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI8(__m128i vec, int p, const int LANE)
2402 {
2403 _NEON2SSE_SWITCH16(_mm_insert_epi8, vec, _NEON2SSE_COMMA p, LANE)
2404 }
2405
2406#ifdef _NEON2SSE_64BIT
2407 //the special case of functions available only for SSE4 and 64-bit build.
2408 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI64(__m128i vec, int64_t p, const int LANE)
2409 {
2410 switch(LANE) {
2411 case 0:
2412 return _mm_insert_epi64(vec, p, 0);
2413 case 1:
2414 return _mm_insert_epi64(vec, p, 1);
2415 default:
2416 return _mm_insert_epi64(vec, p, 0);
2417 }
2418 }
2419
2420 _NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64(__m128i val, const int LANE)
2421 {
2422 if (LANE ==0) return _mm_extract_epi64(val, 0);
2423 else return _mm_extract_epi64(val, 1);
2424 }
2425#endif
2426
2427 _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE)
2428 {
2429 _NEON2SSE_SWITCH4(_mm_insert_ps, 0, 16, 32, 48, vec, LANE, _NEON2SSE_COMMA p)
2430 }
2431
2432#endif //USE_SSE4
2433
#endif //defined (__INTEL_COMPILER) && !defined(__llvm__)
2435
2436//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// Below are some helper functions used either to "emulate" SSE4 intrinsics on SSSE3-limited devices
// or to implement some specific, commonly used operations missing in SSE
2439#ifdef USE_SSE4
2440# define _MM_CVTEPU8_EPI16 _mm_cvtepu8_epi16
2441# define _MM_CVTEPU16_EPI32 _mm_cvtepu16_epi32
2442# define _MM_CVTEPU32_EPI64 _mm_cvtepu32_epi64
2443
2444# define _MM_CVTEPI8_EPI16 _mm_cvtepi8_epi16
2445# define _MM_CVTEPI16_EPI32 _mm_cvtepi16_epi32
2446# define _MM_CVTEPI32_EPI64 _mm_cvtepi32_epi64
2447
2448# define _MM_MAX_EPI8 _mm_max_epi8
2449# define _MM_MAX_EPI32 _mm_max_epi32
2450# define _MM_MAX_EPU16 _mm_max_epu16
2451# define _MM_MAX_EPU32 _mm_max_epu32
2452
2453# define _MM_MIN_EPI8 _mm_min_epi8
2454# define _MM_MIN_EPI32 _mm_min_epi32
2455# define _MM_MIN_EPU16 _mm_min_epu16
2456# define _MM_MIN_EPU32 _mm_min_epu32
2457
2458# define _MM_BLENDV_EPI8 _mm_blendv_epi8
2459# define _MM_PACKUS_EPI32 _mm_packus_epi32
2460# define _MM_PACKUS1_EPI32(a) _mm_packus_epi32(a, a)
2461
2462# define _MM_MULLO_EPI32 _mm_mullo_epi32
2463# define _MM_MUL_EPI32 _mm_mul_epi32
2464
2465# define _MM_CMPEQ_EPI64 _mm_cmpeq_epi64
2466#else //no SSE4 !!!!!!
2467 _NEON2SSE_INLINE __m128i _MM_CVTEPU8_EPI16(__m128i a)
2468 {
2469 __m128i zero = _mm_setzero_si128();
2470 return _mm_unpacklo_epi8(a, zero);
2471 }
2472
2473 _NEON2SSE_INLINE __m128i _MM_CVTEPU16_EPI32(__m128i a)
2474 {
2475 __m128i zero = _mm_setzero_si128();
2476 return _mm_unpacklo_epi16(a, zero);
2477 }
2478
2479 _NEON2SSE_INLINE __m128i _MM_CVTEPU32_EPI64(__m128i a)
2480 {
2481 __m128i zero = _mm_setzero_si128();
2482 return _mm_unpacklo_epi32(a, zero);
2483 }
2484
2485 _NEON2SSE_INLINE __m128i _MM_CVTEPI8_EPI16(__m128i a)
2486 {
2487 __m128i zero = _mm_setzero_si128();
2488 __m128i sign = _mm_cmpgt_epi8(zero, a);
2489 return _mm_unpacklo_epi8(a, sign);
2490 }
2491
2492 _NEON2SSE_INLINE __m128i _MM_CVTEPI16_EPI32(__m128i a)
2493 {
2494 __m128i zero = _mm_setzero_si128();
2495 __m128i sign = _mm_cmpgt_epi16(zero, a);
2496 return _mm_unpacklo_epi16(a, sign);
2497 }
2498
2499 _NEON2SSE_INLINE __m128i _MM_CVTEPI32_EPI64(__m128i a)
2500 {
2501 __m128i zero = _mm_setzero_si128();
2502 __m128i sign = _mm_cmpgt_epi32(zero, a);
2503 return _mm_unpacklo_epi32(a, sign);
2504 }
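    // A small self-check sketch of the sign-extension emulation above (hypothetical test code, illustrative only):
    /*
        __m128i v   = _mm_set1_epi8((char)-7);            //every byte is 0xF9
        __m128i ext = _MM_CVTEPI8_EPI16(v);                //SSSE3 path: unpacklo of a with its 0xFF sign mask
        int16_t lo  = (int16_t)_mm_extract_epi16(ext, 0);  //expected to be -7, matching _mm_cvtepi8_epi16 on SSE4.1
    */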
2505
2506 _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE)
2507 {
2508 _NEON2SSE_ALIGN_16 int32_t tmp[4];
2509 _mm_store_si128((__m128i*)tmp, vec);
2510 return tmp[LANE];
2511 }
2512
2513 _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE)
2514 {
2515 _NEON2SSE_ALIGN_16 int8_t tmp[16];
2516 _mm_store_si128((__m128i*)tmp, vec);
2517 return (int)tmp[LANE];
2518 }
2519
2520 _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE)
2521 {
2522 _NEON2SSE_ALIGN_16 int32_t tmp[4];
2523 _mm_store_si128((__m128i*)tmp, _M128i(vec));
2524 return tmp[LANE];
2525 }
2526
2527 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI32(__m128i vec, int p, const int LANE)
2528 {
2529 _NEON2SSE_ALIGN_16 int32_t pvec[4] = {0,0,0,0};
2530 _NEON2SSE_ALIGN_16 uint32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff};
2531 __m128i vec_masked, p_masked;
2532 pvec[LANE] = p;
2533 mask[LANE] = 0x0;
2534 vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
2535 p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
2536 return _mm_or_si128(vec_masked, p_masked);
2537 }
2538
2539 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI8(__m128i vec, int p, const int LANE)
2540 {
2541 _NEON2SSE_ALIGN_16 int8_t pvec[16] = {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0};
2542 _NEON2SSE_ALIGN_16 uint8_t mask[16] = {0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};
2543 __m128i vec_masked, p_masked;
2544 pvec[LANE] = (int8_t)p;
2545 mask[LANE] = 0x0;
2546 vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
2547 p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
2548 return _mm_or_si128(vec_masked, p_masked);
2549 }
2550
2551 _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE)
2552 {
2553 _NEON2SSE_ALIGN_16 uint32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff};
2554 __m128 tmp, vec_masked, p_masked;
        mask[LANE >> 4] = 0x0; //here LANE is not the actual lane number but the _mm_insert_ps immediate (actual lane << 4), need to deal with it
2556 vec_masked = _mm_and_ps (*(__m128*)mask,vec); //ready for p
2557 p_masked = _mm_andnot_ps (*(__m128*)mask, p); //ready for vec
2558 tmp = _mm_or_ps(vec_masked, p_masked);
2559 return tmp;
2560 }
2561
2562 _NEON2SSE_INLINE __m128i _MM_MAX_EPI8(__m128i a, __m128i b)
2563 {
2564 __m128i cmp, resa, resb;
2565 cmp = _mm_cmpgt_epi8 (a, b);
2566 resa = _mm_and_si128 (cmp, a);
2567 resb = _mm_andnot_si128 (cmp,b);
2568 return _mm_or_si128(resa, resb);
2569 }
2570
2571 _NEON2SSE_INLINE __m128i _MM_MAX_EPI32(__m128i a, __m128i b)
2572 {
2573 __m128i cmp, resa, resb;
2574 cmp = _mm_cmpgt_epi32(a, b);
2575 resa = _mm_and_si128 (cmp, a);
2576 resb = _mm_andnot_si128 (cmp,b);
2577 return _mm_or_si128(resa, resb);
2578 }
2579
2580 _NEON2SSE_INLINE __m128i _MM_MAX_EPU16(__m128i a, __m128i b)
2581 {
2582 __m128i c8000, b_s, a_s, cmp;
2583 c8000 = _mm_cmpeq_epi16 (a,a); //0xffff
2584 c8000 = _mm_slli_epi16 (c8000, 15); //0x8000
2585 b_s = _mm_sub_epi16 (b, c8000);
2586 a_s = _mm_sub_epi16 (a, c8000);
2587 cmp = _mm_cmpgt_epi16 (a_s, b_s); //no unsigned comparison, need to go to signed
2588 a_s = _mm_and_si128 (cmp,a);
2589 b_s = _mm_andnot_si128 (cmp,b);
2590 return _mm_or_si128(a_s, b_s);
2591 }
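    // The 0x8000 bias above maps unsigned values onto the signed range so that _mm_cmpgt_epi16 orders them correctly.
    /* Worked example (illustrative): a = 0xFFFF, b = 0x0001:
        a - 0x8000 = 0x7FFF (+32767 signed), b - 0x8000 = 0x8001 (-32767 signed),
        so the signed compare still selects a as the larger value, exactly as an unsigned compare would. */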
2592
2593 _NEON2SSE_INLINE __m128i _MM_MAX_EPU32(__m128i a, __m128i b)
2594 {
2595 __m128i c80000000, b_s, a_s, cmp;
2596 c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff
2597 c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000
2598 b_s = _mm_sub_epi32 (b, c80000000);
2599 a_s = _mm_sub_epi32 (a, c80000000);
2600 cmp = _mm_cmpgt_epi32 (a_s, b_s); //no unsigned comparison, need to go to signed
2601 a_s = _mm_and_si128 (cmp,a);
2602 b_s = _mm_andnot_si128 (cmp,b);
2603 return _mm_or_si128(a_s, b_s);
2604 }
2605
2606 _NEON2SSE_INLINE __m128i _MM_MIN_EPI8(__m128i a, __m128i b)
2607 {
2608 __m128i cmp, resa, resb;
2609 cmp = _mm_cmpgt_epi8 (b, a);
2610 resa = _mm_and_si128 (cmp, a);
2611 resb = _mm_andnot_si128 (cmp,b);
2612 return _mm_or_si128(resa, resb);
2613 }
2614
2615 _NEON2SSE_INLINE __m128i _MM_MIN_EPI32(__m128i a, __m128i b)
2616 {
2617 __m128i cmp, resa, resb;
2618 cmp = _mm_cmpgt_epi32(b, a);
2619 resa = _mm_and_si128 (cmp, a);
2620 resb = _mm_andnot_si128 (cmp,b);
2621 return _mm_or_si128(resa, resb);
2622 }
2623
2624 _NEON2SSE_INLINE __m128i _MM_MIN_EPU16(__m128i a, __m128i b)
2625 {
2626 __m128i c8000, b_s, a_s, cmp;
2627 c8000 = _mm_cmpeq_epi16 (a,a); //0xffff
2628 c8000 = _mm_slli_epi16 (c8000, 15); //0x8000
2629 b_s = _mm_sub_epi16 (b, c8000);
2630 a_s = _mm_sub_epi16 (a, c8000);
2631 cmp = _mm_cmpgt_epi16 (b_s, a_s); //no unsigned comparison, need to go to signed
2632 a_s = _mm_and_si128 (cmp,a);
2633 b_s = _mm_andnot_si128 (cmp,b);
2634 return _mm_or_si128(a_s, b_s);
2635 }
2636
2637 _NEON2SSE_INLINE __m128i _MM_MIN_EPU32(__m128i a, __m128i b)
2638 {
2639 __m128i c80000000, b_s, a_s, cmp;
2640 c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff
2641 c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000
2642 b_s = _mm_sub_epi32 (b, c80000000);
2643 a_s = _mm_sub_epi32 (a, c80000000);
2644 cmp = _mm_cmpgt_epi32 (b_s, a_s); //no unsigned comparison, need to go to signed
2645 a_s = _mm_and_si128 (cmp,a);
2646 b_s = _mm_andnot_si128 (cmp,b);
2647 return _mm_or_si128(a_s, b_s);
2648 }
2649
    _NEON2SSE_INLINE __m128i _MM_BLENDV_EPI8(__m128i a, __m128i b, __m128i mask) //this is NOT an exact implementation of _mm_blendv_epi8 !!!!! - please see below
    {
        //it assumes each mask byte is always either 0xff or 0 (as in all use cases below), while for the original _mm_blendv_epi8 only the MSB of each mask byte matters.
2653 __m128i a_masked, b_masked;
2654 b_masked = _mm_and_si128 (mask,b); //use b if mask 0xff
2655 a_masked = _mm_andnot_si128 (mask,a);
2656 return _mm_or_si128(a_masked, b_masked);
2657 }
2658
2659 _NEON2SSE_INLINE __m128i _MM_PACKUS_EPI32(__m128i a, __m128i b)
2660 {
2661 __m128i a16, b16, res, reshi,cmp, zero;
2662 zero = _mm_setzero_si128();
2663 a16 = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd);
2664 b16 = _mm_shuffle_epi8 (b, *(__m128i*) mask8_32_even_odd);
2665 res = _mm_unpacklo_epi64(a16, b16); //result without saturation
2666 reshi = _mm_unpackhi_epi64(a16, b16); //hi part of result used for saturation
2667 cmp = _mm_cmpgt_epi16(zero, reshi); //if cmp<0 the result should be zero
2668 res = _mm_andnot_si128(cmp,res); //if cmp zero - do nothing, otherwise cmp <0 and the result is 0
2669 cmp = _mm_cmpgt_epi16(reshi,zero); //if cmp positive
        return _mm_or_si128(res, cmp); //if cmp is positive we are out of 16 bits and need to saturate to 0xffff
2671 }
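    // The shuffle above splits each 32-bit lane into its low and high 16-bit halves; the high halves drive the saturation.
    /* Worked example (illustrative):
        lane = 70000 (0x00011170): high half 0x0001 > 0 -> result forced to 0xFFFF (saturate up)
        lane =    -5 (0xFFFFFFFB): high half 0xFFFF < 0 -> result forced to 0x0000 (saturate down)
        lane =  1234 (0x000004D2): high half 0x0000     -> low half 0x04D2 kept unchanged,
       matching what _mm_packus_epi32 produces on SSE4.1. */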
2672
2673 _NEON2SSE_INLINE __m128i _MM_PACKUS1_EPI32(__m128i a)
2674 {
2675 __m128i a16, res, reshi,cmp, zero;
2676 zero = _mm_setzero_si128();
2677 a16 = _mm_shuffle_epi8 (a, *(__m128i*)mask8_32_even_odd);
2678 reshi = _mm_unpackhi_epi64(a16, a16); //hi part of result used for saturation
2679 cmp = _mm_cmpgt_epi16(zero, reshi); //if cmp<0 the result should be zero
2680 res = _mm_andnot_si128(cmp, a16); //if cmp zero - do nothing, otherwise cmp <0 and the result is 0
2681 cmp = _mm_cmpgt_epi16(reshi,zero); //if cmp positive
        return _mm_or_si128(res, cmp); //if cmp is positive we are out of 16 bits and need to saturate to 0xffff
2683 }
2684
2685 // method used by GCC with generic vector extensions
2686 _NEON2SSE_INLINE __m128i _MM_MULLO_EPI32(__m128i a, __m128i b)
2687 {
2688 __m128i a_high = _mm_srli_epi64(a, 32);
2689 __m128i low = _mm_mul_epu32(a, b);
2690 __m128i b_high = _mm_srli_epi64(b, 32);
2691 __m128i high = _mm_mul_epu32(a_high, b_high);
2692 low = _mm_shuffle_epi32(low, _MM_SHUFFLE(0, 0, 2, 0));
2693 high = _mm_shuffle_epi32(high, _MM_SHUFFLE(0, 0, 2, 0));
2694 return _mm_unpacklo_epi32(low, high);
2695 }
2696
2697 _NEON2SSE_INLINE __m128i _MM_MUL_EPI32(__m128i a, __m128i b)
2698 {
2699 __m128i sign, zero, mul_us, a_neg, b_neg, mul_us_neg;
2700 sign = _mm_xor_si128 (a, b);
        sign = _mm_srai_epi32 (sign, 31); //spread the sign bit to all bits: all ones if the product is negative, all zeros otherwise
        sign = _mm_shuffle_epi32(sign, _MM_SHUFFLE(2, 2, 0, 0)); //copy the sign to the 1st and 3rd data lanes
        zero = _mm_setzero_si128();
        a_neg = _mm_abs_epi32 (a); //|a|
        b_neg = _mm_abs_epi32 (b); //|b|
        mul_us = _mm_mul_epu32 (a_neg, b_neg); //uses the 0th and 2nd data lanes; the unsigned multiplication of the absolute values gives a 64 bit result
2707 mul_us_neg = _mm_sub_epi64(zero, mul_us);
2708 mul_us_neg = _mm_and_si128(sign, mul_us_neg);
2709 mul_us = _mm_andnot_si128(sign, mul_us);
2710 return _mm_or_si128 (mul_us, mul_us_neg);
2711 }
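    // The emulation above multiplies absolute values and restores the sign afterwards.
    /* Worked example (illustrative): a lane pair with a = -3, b = 5:
        sign(a) ^ sign(b) is negative, so 'sign' becomes all ones for that 64-bit result;
        |a| * |b| = 15 is computed unsigned, then negated (0 - 15) and selected through the sign mask,
        giving -15, the same 64-bit product _mm_mul_epi32 returns on SSE4.1. */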
2712
2713 _NEON2SSE_INLINE __m128i _MM_CMPEQ_EPI64(__m128i a, __m128i b)
2714 {
2715 __m128i res;
2716 res = _mm_cmpeq_epi32 (a, b);
2717 return _mm_shuffle_epi32 (res, 1 | (1 << 2) | (3 << 4) | (3 << 6)); //copy the information from hi to low part of the 64 bit data
2718 }
2719#endif //SSE4
2720
//fallback implementations used when the native 64-bit SSE4 insert/extract intrinsics are not available (32-bit builds or no SSE4)
2722_NEON2SSE_INLINE __m128i _MM_INSERT_EPI64_32(__m128i vec, int64_t p, const int LANE)
2723{
2724 _NEON2SSE_ALIGN_16 uint64_t pvec[2] = {0,0};
2725 _NEON2SSE_ALIGN_16 uint64_t mask[2] = {0xffffffffffffffff, 0xffffffffffffffff};
2726 __m128i vec_masked, p_masked;
2727 pvec[LANE] = p;
2728 mask[LANE] = 0x0;
2729 vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
2730 p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
2731 return _mm_or_si128(vec_masked, p_masked);
2732}
2733
2734_NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64_32(__m128i val, const int LANE)
2735{
2736 _NEON2SSE_ALIGN_16 int64_t tmp[2];
2737 _mm_store_si128((__m128i*)tmp, val);
2738 return tmp[LANE];
2739}
2740
2741#ifndef _NEON2SSE_64BIT_SSE4
2742# define _MM_INSERT_EPI64 _MM_INSERT_EPI64_32
2743# define _MM_EXTRACT_EPI64 _MM_EXTRACT_EPI64_32
2744#endif
2745
_NEON2SSESTORAGE int32x4_t vqd_s32(int32x4_t a); //Saturating doubling for signed ints (helper used below)
2747_NEON2SSE_INLINE int32x4_t vqd_s32(int32x4_t a)
2748{
    //Overflow happens only if a and the doubled result (2*a) have opposite signs
2750 __m128i c7fffffff, res, res_sat, res_xor_a;
2751 c7fffffff = _mm_set1_epi32(0x7fffffff);
2752 res = _mm_slli_epi32 (a, 1); // res = a*2
    res_sat = _mm_srli_epi32(a, 31); //1 for negative a, 0 otherwise
    res_sat = _mm_add_epi32(res_sat, c7fffffff); //saturation value: 0x80000000 for negative a, 0x7fffffff otherwise
    res_xor_a = _mm_xor_si128(res, a);
    res_xor_a = _mm_srai_epi32(res_xor_a,31); //spread the sign bit of res^a: all ones if the signs of a and 2*a differ (overflow), all zeros otherwise
2757 res_sat = _mm_and_si128(res_xor_a, res_sat);
2758 res = _mm_andnot_si128(res_xor_a, res);
2759 return _mm_or_si128(res, res_sat);
2760}
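// A worked example of the saturation above (illustrative):
/*  a = 0x60000000 (positive): 2*a = 0xC0000000 flips the sign -> overflow detected, result saturates to 0x7fffffff;
    a = 0xA0000000 (negative): 2*a = 0x40000000 flips the sign -> result saturates to 0x80000000;
    a = 0x00001234: 2*a = 0x00002468 keeps the sign -> returned unchanged as the doubled value. */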
2761
2762
2763//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
2764//*************************************************************************
2765//*************************************************************************
//***************** Functions redefinition/implementation starts here *****
2767//*************************************************************************
2768//*************************************************************************
2769//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
2770
/*If a unified intrinsics solution is necessary, please define your SSE intrinsics wrappers here as in the following sample:
2772#ifdef ARM
2773#define vector_addq_s32 _mm_add_epi32
2774#else //if we have IA
2775#define vector_addq_s32 vadd_s32
2776#endif
2777
2778********************************************************************************************
2779Functions below are organised in the following way:
2780
Each NEON intrinsic function is implemented in one of the following ways:
1. a fully equivalent x86 SSE intrinsic - in this case the x86 version simply follows the NEON one via the corresponding #define statement
2. an x86 implementation using more than one x86 intrinsic. In this case it is shaped as an inlined C function with a return statement
3. a reference to another NEON function returning the same result and implemented in x86 as above. In this case it is shaped as the matching NEON function definition
4. for about 5% of the functions, a serial implementation is provided (along with the corresponding compiler warning) because the matching x86 SIMD operation
is either unavailable or inefficient in terms of performance. If such functions are on your application's critical path,
please consider removing them from your code. A minimal usage sketch follows this comment block.
2788*/
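//A minimal usage sketch (hypothetical helper, illustrative only) showing how ported NEON code maps onto the definitions below:
/*
static void add4_f32(const float* a, const float* b, float* out)
{
    float32x4_t va = vld1q_f32(a);        //loads 4 floats (an unaligned SSE load on x86)
    float32x4_t vb = vld1q_f32(b);
    vst1q_f32(out, vaddq_f32(va, vb));    //vaddq_f32 is a direct #define to _mm_add_ps (option 1 above)
}
*/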
2789
2790//***********************************************************************
2791//************************ Vector add *****************************
2792//***********************************************************************
2793_NEON2SSESTORAGE int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0
2794_NEON2SSE_INLINE int8x8_t vadd_s8(int8x8_t a, int8x8_t b)
2795{
2796 int8x8_t res64;
2797 return64(_mm_add_epi8(_pM128i(a),_pM128i(b)));
2798}
2799
2800
2801_NEON2SSESTORAGE int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0
2802_NEON2SSE_INLINE int16x4_t vadd_s16(int16x4_t a, int16x4_t b)
2803{
2804 int16x4_t res64;
2805 return64(_mm_add_epi16(_pM128i(a),_pM128i(b)));
2806}
2807
2808
2809_NEON2SSESTORAGE int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0
2810_NEON2SSE_INLINE int32x2_t vadd_s32(int32x2_t a, int32x2_t b)
2811{
2812 int32x2_t res64;
2813 return64(_mm_add_epi32(_pM128i(a),_pM128i(b)));
2814}
2815
2816
2817_NEON2SSESTORAGE int64x1_t vadd_s64(int64x1_t a, int64x1_t b); // VADD.I64 d0,d0,d0
2818_NEON2SSE_INLINE int64x1_t vadd_s64(int64x1_t a, int64x1_t b)
2819{
2820 int64x1_t res64;
2821 res64.m64_i64[0] = a.m64_i64[0] + b.m64_i64[0];
2822 return res64;
2823}
2824
2825
2826_NEON2SSESTORAGE float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0
2827_NEON2SSE_INLINE float32x2_t vadd_f32(float32x2_t a, float32x2_t b)
2828{
2829 __m128 res;
2830 __m64_128 res64;
2831 res = _mm_add_ps(_pM128(a),_pM128(b)); //SSE, use only low 64 bits
2832 _M64f(res64, res);
2833 return res64;
2834}
2835
2836_NEON2SSE_GLOBAL uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0
2837#define vadd_u8 vadd_s8
2838
2839_NEON2SSE_GLOBAL uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0
2840#define vadd_u16 vadd_s16
2841
2842_NEON2SSE_GLOBAL uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0
2843#define vadd_u32 vadd_s32
2844
2845_NEON2SSESTORAGE uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b); // VADD.I64 d0,d0,d0
2846_NEON2SSE_INLINE uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b)
2847{
2848 uint64x1_t res64;
2849 res64.m64_u64[0] = a.m64_u64[0] + b.m64_u64[0];
2850 return res64;
2851}
2852
2853
2854_NEON2SSE_GLOBAL int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0
2855#define vaddq_s8 _mm_add_epi8
2856
2857_NEON2SSE_GLOBAL int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0
2858#define vaddq_s16 _mm_add_epi16
2859
2860_NEON2SSE_GLOBAL int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0
2861#define vaddq_s32 _mm_add_epi32
2862
2863_NEON2SSE_GLOBAL int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0
2864#define vaddq_s64 _mm_add_epi64
2865
2866_NEON2SSE_GLOBAL float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0
2867#define vaddq_f32 _mm_add_ps
2868
2869_NEON2SSE_GLOBAL uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0
2870#define vaddq_u8 _mm_add_epi8
2871
2872_NEON2SSE_GLOBAL uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0
2873#define vaddq_u16 _mm_add_epi16
2874
2875_NEON2SSE_GLOBAL uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0
2876#define vaddq_u32 _mm_add_epi32
2877
2878_NEON2SSE_GLOBAL uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0
2879#define vaddq_u64 _mm_add_epi64
2880
2881//**************************** Vector long add *****************************:
2882//***********************************************************************
2883//Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
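//Illustrative sketch (hypothetical values): adding two uint8x8_t vectors without overflow:
/*  uint8x8_t  a = vdup_n_u8(200), b = vdup_n_u8(100);
    uint16x8_t s = vaddl_u8(a, b);   //every 16-bit lane holds 300, while vadd_u8 would wrap to 44 */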
2884_NEON2SSESTORAGE int16x8_t vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0
2885_NEON2SSE_INLINE int16x8_t vaddl_s8(int8x8_t a, int8x8_t b) // VADDL.S8 q0,d0,d0
2886{
2887 __m128i a16, b16;
2888 a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
2889 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
2890 return _mm_add_epi16 (a16, b16);
2891}
2892
2893_NEON2SSESTORAGE int32x4_t vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0
2894_NEON2SSE_INLINE int32x4_t vaddl_s16(int16x4_t a, int16x4_t b) // VADDL.S16 q0,d0,d0
2895{
2896 __m128i a32, b32;
2897 a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
2898 b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1
2899 return _mm_add_epi32 (a32, b32);
2900}
2901
2902_NEON2SSESTORAGE int64x2_t vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0
2903_NEON2SSE_INLINE int64x2_t vaddl_s32(int32x2_t a, int32x2_t b) // VADDL.S32 q0,d0,d0
2904{
    //may not be optimal
2906 __m128i a64, b64;
2907 a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1
2908 b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
2909 return _mm_add_epi64 ( a64, b64);
2910}
2911
2912_NEON2SSESTORAGE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0
2913_NEON2SSE_INLINE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b) // VADDL.U8 q0,d0,d0
2914{
2915 __m128i a16, b16;
2916 a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1
2917 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1
2918 return _mm_add_epi16 (a16, b16);
2919}
2920
_NEON2SSESTORAGE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.U16 q0,d0,d0
_NEON2SSE_INLINE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b) // VADDL.U16 q0,d0,d0
2923{
2924 __m128i a32, b32;
2925 a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1
2926 b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1
2927 return _mm_add_epi32 (a32, b32);
2928}
2929
2930_NEON2SSESTORAGE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0
2931_NEON2SSE_INLINE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b) // VADDL.U32 q0,d0,d0
2932{
    //may not be optimal
2934 __m128i a64, b64;
2935 a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1
2936 b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
2937 return _mm_add_epi64 (a64, b64);
2938}
2939
2940//*************** Vector wide add: vaddw_<type>. Vr[i]:=Va[i]+Vb[i] ******************
2941//*************** *********************************************************************
2942_NEON2SSESTORAGE int16x8_t vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0
2943_NEON2SSE_INLINE int16x8_t vaddw_s8(int16x8_t a, int8x8_t b) // VADDW.S8 q0,q0,d0
2944{
2945 __m128i b16;
2946 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
2947 return _mm_add_epi16 (a, b16);
2948}
2949
2950_NEON2SSESTORAGE int32x4_t vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0
2951_NEON2SSE_INLINE int32x4_t vaddw_s16(int32x4_t a, int16x4_t b) // VADDW.S16 q0,q0,d0
2952{
2953 __m128i b32;
2954 b32 = _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1,
2955 return _mm_add_epi32 (a, b32);
2956}
2957
2958_NEON2SSESTORAGE int64x2_t vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0
2959_NEON2SSE_INLINE int64x2_t vaddw_s32(int64x2_t a, int32x2_t b) // VADDW.S32 q0,q0,d0
2960{
2961 __m128i b64;
2962 b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
2963 return _mm_add_epi64 (a, b64);
2964}
2965
2966_NEON2SSESTORAGE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0
2967_NEON2SSE_INLINE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b) // VADDW.U8 q0,q0,d0
2968{
2969 __m128i b16;
2970 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1
2971 return _mm_add_epi16 (a, b16);
2972}
2973
_NEON2SSESTORAGE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.U16 q0,q0,d0
_NEON2SSE_INLINE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b) // VADDW.U16 q0,q0,d0
2976{
2977 __m128i b32;
2978 b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1
2979 return _mm_add_epi32 (a, b32);
2980}
2981
2982_NEON2SSESTORAGE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0
2983_NEON2SSE_INLINE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b) // VADDW.U32 q0,q0,d0
2984{
2985 __m128i b64;
2986 b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
2987 return _mm_add_epi64 (a, b64);
2988}
2989
2990//******************************Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1 , result truncated *******************************
2991//*************************************************************************************************************************
2992_NEON2SSESTORAGE int8x8_t vhadd_s8(int8x8_t a, int8x8_t b); // VHADD.S8 d0,d0,d0
2993_NEON2SSE_INLINE int8x8_t vhadd_s8(int8x8_t a, int8x8_t b)
2994{
2995 int8x8_t res64;
2996 return64(vhaddq_s8(_pM128i(a), _pM128i(b)));
2997}
2998
2999
3000_NEON2SSESTORAGE int16x4_t vhadd_s16(int16x4_t a, int16x4_t b); // VHADD.S16 d0,d0,d0
3001_NEON2SSE_INLINE int16x4_t vhadd_s16(int16x4_t a, int16x4_t b)
3002{
3003 int16x4_t res64;
3004 return64( vhaddq_s16(_pM128i(a), _pM128i(b)));
3005}
3006
3007
3008_NEON2SSESTORAGE int32x2_t vhadd_s32(int32x2_t a, int32x2_t b); // VHADD.S32 d0,d0,d0
3009_NEON2SSE_INLINE int32x2_t vhadd_s32(int32x2_t a, int32x2_t b)
3010{
3011 int32x2_t res64;
3012 return64( vhaddq_s32(_pM128i(a), _pM128i(b)));
3013}
3014
3015
_NEON2SSESTORAGE uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b); // VHADD.U8 d0,d0,d0
3017_NEON2SSE_INLINE uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b)
3018{
3019 uint8x8_t res64;
3020 return64( vhaddq_u8(_pM128i(a), _pM128i(b)));
3021}
3022
3023
_NEON2SSESTORAGE uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b); // VHADD.U16 d0,d0,d0
3025_NEON2SSE_INLINE uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b)
3026{
3027 uint16x4_t res64;
3028 return64( vhaddq_u16(_pM128i(a), _pM128i(b)));
3029}
3030
3031
3032_NEON2SSESTORAGE uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b); // VHADD.U32 d0,d0,d0
3033_NEON2SSE_INLINE uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b)
3034{
3035 uint32x2_t res64;
3036 return64( vhaddq_u32(_pM128i(a), _pM128i(b)));
3037}
3038
3039
3040_NEON2SSESTORAGE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0
3041_NEON2SSE_INLINE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b)
3042{
3043 //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
3044 __m128i tmp1, tmp2;
3045 tmp1 = _mm_and_si128(a,b);
3046 tmp2 = _mm_xor_si128(a,b);
3047 tmp2 = vshrq_n_s8(tmp2,1);
3048 return _mm_add_epi8(tmp1,tmp2);
3049}
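//Rationale for the trick above: a + b == 2*(a & b) + (a ^ b) (the AND collects the carry bits,
//the XOR the carry-less sum), so the halved sum can be computed as (a & b) + ((a ^ b) >> 1)
//without ever forming the 9-bit intermediate; the arithmetic shift keeps the signed truncation
//required by VHADD.S8.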
3050
_NEON2SSESTORAGE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S16 q0,q0,q0
3052_NEON2SSE_INLINE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b)
3053{
3054 //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
3055 __m128i tmp1, tmp2;
3056 tmp1 = _mm_and_si128(a,b);
3057 tmp2 = _mm_xor_si128(a,b);
3058 tmp2 = _mm_srai_epi16(tmp2,1);
3059 return _mm_add_epi16(tmp1,tmp2);
3060}
3061
3062_NEON2SSESTORAGE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0
3063_NEON2SSE_INLINE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b) // VHADD.S32 q0,q0,q0
3064{
3065 //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
3066 __m128i tmp1, tmp2;
3067 tmp1 = _mm_and_si128(a,b);
3068 tmp2 = _mm_xor_si128(a,b);
3069 tmp2 = _mm_srai_epi32(tmp2,1);
3070 return _mm_add_epi32(tmp1,tmp2);
3071}
3072
3073_NEON2SSESTORAGE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0
3074_NEON2SSE_INLINE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b) // VHADD.U8 q0,q0,q0
3075{
3076 __m128i c1, sum, res;
3077 c1 = _mm_set1_epi8(1);
3078 sum = _mm_avg_epu8(a, b); //result is rounded, need to compensate it
3079 res = _mm_xor_si128(a, b); //for rounding compensation
3080 res = _mm_and_si128(res,c1); //for rounding compensation
3081 return _mm_sub_epi8 (sum, res); //actual rounding compensation
3082}
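//Rationale: _mm_avg_epu8 computes (a + b + 1) >> 1 (rounded), while VHADD.U8 needs (a + b) >> 1
//(truncated); the two differ by exactly 1 when a + b is odd, i.e. when (a ^ b) & 1 == 1, and that
//is the correction subtracted above.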
3083
_NEON2SSESTORAGE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.U16 q0,q0,q0
_NEON2SSE_INLINE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b) // VHADD.U16 q0,q0,q0
3086{
3087 __m128i sum, res;
3088 sum = _mm_avg_epu16(a, b); //result is rounded, need to compensate it
3089 res = _mm_xor_si128(a, b); //for rounding compensation
3090 res = _mm_slli_epi16 (res,15); //shift left then back right to
3091 res = _mm_srli_epi16 (res,15); //get 1 or zero
3092 return _mm_sub_epi16 (sum, res); //actual rounding compensation
3093}
3094
3095_NEON2SSESTORAGE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0
3096_NEON2SSE_INLINE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b) // VHADD.U32 q0,q0,q0
3097{
3098 //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
3099 __m128i tmp1, tmp2;
3100 tmp1 = _mm_and_si128(a,b);
3101 tmp2 = _mm_xor_si128(a,b);
3102 tmp2 = _mm_srli_epi32(tmp2,1);
3103 return _mm_add_epi32(tmp1,tmp2);
3104}
3105
3106//************************Vector rounding halving add: vrhadd{q}_<type>. Vr[i]:=(Va[i]+Vb[i]+1)>>1 ***************************
3107//*****************************************************************************************************************************
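//For instance, with uint8 lanes a[i] = 5, b[i] = 2 the rounding halving add gives (5 + 2 + 1) >> 1 = 4,
//while the truncating vhadd above gives (5 + 2) >> 1 = 3.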
3108_NEON2SSESTORAGE int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b); // VRHADD.S8 d0,d0,d0
3109_NEON2SSE_INLINE int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b)
3110{
3111 int8x8_t res64;
3112 return64(vrhaddq_s8(_pM128i(a), _pM128i(b)));
3113}
3114
3115
3116_NEON2SSESTORAGE int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b); // VRHADD.S16 d0,d0,d0
3117_NEON2SSE_INLINE int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b)
3118{
3119 int16x4_t res64;
3120 return64(vrhaddq_s16(_pM128i(a), _pM128i(b)));
3121}
3122
3123
3124_NEON2SSESTORAGE int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b); // VRHADD.S32 d0,d0,d0
3125_NEON2SSE_INLINE int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b)
3126{
3127 int32x2_t res64;
3128 return64(vrhaddq_s32(_pM128i(a), _pM128i(b)));
3129}
3130
3131
3132_NEON2SSESTORAGE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0
3133_NEON2SSE_INLINE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b)
3134{
3135 uint8x8_t res64;
    return64(_mm_avg_epu8(_pM128i(a),_pM128i(b))); //SSE2 _mm_avg_epu8 rounds, matching VRHADD semantics
3137}
3138
3139
_NEON2SSESTORAGE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.U16 d0,d0,d0
3141_NEON2SSE_INLINE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b)
3142{
3143 uint16x4_t res64;
    return64(_mm_avg_epu16(_pM128i(a),_pM128i(b))); //SSE2 _mm_avg_epu16 rounds, matching VRHADD semantics
3145}
3146
3147
3148_NEON2SSESTORAGE uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b); // VRHADD.U32 d0,d0,d0
3149_NEON2SSE_INLINE uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b)
3150{
3151 uint32x2_t res64;
3152 return64(vrhaddq_u32(_pM128i(a), _pM128i(b)));
3153}
3154
3155
3156_NEON2SSESTORAGE int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0
3157_NEON2SSE_INLINE int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b) // VRHADD.S8 q0,q0,q0
3158{
3159 //no signed average in x86 SIMD, go to unsigned
3160 __m128i c128, au, bu, sum;
3161 c128 = _mm_set1_epi8(-128); //(int8_t)0x80
3162 au = _mm_sub_epi8(a, c128); //add 128
3163 bu = _mm_sub_epi8(b, c128); //add 128
3164 sum = _mm_avg_epu8(au, bu);
3165 return _mm_add_epi8 (sum, c128); //sub 128
3166}
3167
3168_NEON2SSESTORAGE int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0
3169_NEON2SSE_INLINE int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b) // VRHADD.S16 q0,q0,q0
3170{
3171 //no signed average in x86 SIMD, go to unsigned
3172 __m128i cx8000, au, bu, sum;
3173 cx8000 = _mm_set1_epi16(-32768); //(int16_t)0x8000
3174 au = _mm_sub_epi16(a, cx8000); //add 32768
3175 bu = _mm_sub_epi16(b, cx8000); //add 32768
3176 sum = _mm_avg_epu16(au, bu);
3177 return _mm_add_epi16 (sum, cx8000); //sub 32768
3178}
3179
3180_NEON2SSESTORAGE int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0
3181_NEON2SSE_INLINE int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b)
3182{
3183 //need to avoid overflow
3184 __m128i a2, b2, res, sum;
3185 a2 = _mm_srai_epi32(a,1); //a2=a/2;
3186 b2 = _mm_srai_epi32(b,1); // b2=b/2;
3187 res = _mm_or_si128(a,b); //for rounding
3188 res = _mm_slli_epi32 (res,31); //shift left then back right to
3189 res = _mm_srli_epi32 (res,31); //get 1 or zero
3190 sum = _mm_add_epi32(a2,b2);
3191 return _mm_add_epi32(sum,res);
3192}
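//Rationale for the trick above: (a + b + 1) >> 1 == (a >> 1) + (b >> 1) + (((a & 1) + (b & 1) + 1) >> 1),
//and the last term is 1 exactly when at least one of the low bits is set, i.e. it equals (a | b) & 1;
//this avoids the 33-bit intermediate sum.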
3193
3194_NEON2SSE_GLOBAL uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0
3195#define vrhaddq_u8 _mm_avg_epu8 //SSE2, results rounded
3196
_NEON2SSE_GLOBAL uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.U16 q0,q0,q0
3198#define vrhaddq_u16 _mm_avg_epu16 //SSE2, results rounded
3199
3200
3201_NEON2SSESTORAGE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0
3202_NEON2SSE_INLINE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b) // VRHADD.U32 q0,q0,q0
3203{
3204 //need to avoid overflow
3205 __m128i a2, b2, res, sum;
3206 a2 = _mm_srli_epi32(a,1); //a2=a/2;
3207 b2 = _mm_srli_epi32(b,1); // b2=b/2;
3208 res = _mm_or_si128(a,b); //for rounding
3209 res = _mm_slli_epi32 (res,31); //shift left then back right to
3210 res = _mm_srli_epi32 (res,31); //get 1 or zero
3211 sum = _mm_add_epi32(a2,b2);
3212 return _mm_add_epi32(sum,res);
3213}
3214
3215//****************** VQADD: Vector saturating add ************************
3216//************************************************************************
3217_NEON2SSESTORAGE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0
3218_NEON2SSE_INLINE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b)
3219{
3220 int8x8_t res64;
3221 return64(_mm_adds_epi8(_pM128i(a),_pM128i(b)));
3222}
3223
3224
3225_NEON2SSESTORAGE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0
3226_NEON2SSE_INLINE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b)
3227{
3228 int16x4_t res64;
3229 return64(_mm_adds_epi16(_pM128i(a),_pM128i(b)));
3230}
3231
3232
3233_NEON2SSESTORAGE int32x2_t vqadd_s32(int32x2_t a, int32x2_t b); // VQADD.S32 d0,d0,d0
3234_NEON2SSE_INLINE int32x2_t vqadd_s32(int32x2_t a, int32x2_t b)
3235{
3236 int32x2_t res64;
3237 return64(vqaddq_s32(_pM128i(a), _pM128i(b)));
3238}
3239
3240
3241_NEON2SSESTORAGE int64x1_t vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0
3242_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqadd_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
3243{
3244 int64x1_t res;
3245 uint64_t a64, b64;
3246 a64 = a.m64_u64[0];
3247 b64 = b.m64_u64[0];
3248 res.m64_u64[0] = a64 + b64;
3249 a64 = (a64 >> 63) + (~_SIGNBIT64);
3250 if ((int64_t)((b64 ^ a64) | ~(res.m64_u64[0] ^ b64))>=0) {
3251 res.m64_u64[0] = a64;
3252 }
3253 return res;
3254}
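//How the scalar saturation above works: a64 is overwritten with the clamp value matching the sign of
//the original a (0x7FFF...FFFF for a >= 0, 0x8000...0000 for a < 0). Signed overflow occurred iff a and b
//had the same sign while the wrapped sum's sign differs from it, which is exactly when
//((b64 ^ a64) | ~(res ^ b64)) has its top bit clear, i.e. is non-negative as int64_t.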
3255
3256_NEON2SSESTORAGE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0
3257_NEON2SSE_INLINE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b)
3258{
3259 uint8x8_t res64;
3260 return64(_mm_adds_epu8(_pM128i(a),_pM128i(b)));
3261}
3262
3263
_NEON2SSESTORAGE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.U16 d0,d0,d0
3265_NEON2SSE_INLINE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b)
3266{
3267 uint16x4_t res64;
3268 return64(_mm_adds_epu16(_pM128i(a),_pM128i(b)));
3269}
3270
3271
3272_NEON2SSESTORAGE uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b); // VQADD.U32 d0,d0,d0
3273_NEON2SSE_INLINE uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b)
3274{
3275 uint32x2_t res64;
3276 return64(vqaddq_u32(_pM128i(a), _pM128i(b)));
3277}
3278
3279
3280_NEON2SSESTORAGE uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0
3281_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
3282{
3283 _NEON2SSE_ALIGN_16 uint64_t a64, b64;
3284 uint64x1_t res;
3285 a64 = a.m64_u64[0];
3286 b64 = b.m64_u64[0];
3287 res.m64_u64[0] = a64 + b64;
3288 if (res.m64_u64[0] < a64) {
3289 res.m64_u64[0] = ~(uint64_t)0;
3290 }
3291 return res;
3292}
3293
3294_NEON2SSE_GLOBAL int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0
3295#define vqaddq_s8 _mm_adds_epi8
3296
3297_NEON2SSE_GLOBAL int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0
3298#define vqaddq_s16 _mm_adds_epi16
3299
3300_NEON2SSESTORAGE int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0
3301_NEON2SSE_INLINE int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b)
3302{
    //no corresponding x86 SIMD solution, special tricks are necessary. Overflow happens only if a and b have the same sign and the sum has the opposite sign
3304 __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a_;
3305 c7fffffff = _mm_set1_epi32(0x7fffffff);
3306 res = _mm_add_epi32(a, b);
3307 res_sat = _mm_srli_epi32(a, 31);
3308 res_sat = _mm_add_epi32(res_sat, c7fffffff);
3309 res_xor_a = _mm_xor_si128(res, a);
3310 b_xor_a_ = _mm_xor_si128(b, a);
3311 res_xor_a = _mm_andnot_si128(b_xor_a_, res_xor_a);
    res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones in overflowed lanes, all zeros otherwise
3313 res_sat = _mm_and_si128(res_xor_a, res_sat);
3314 res = _mm_andnot_si128(res_xor_a, res);
3315 return _mm_or_si128(res, res_sat);
3316}
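//Rationale: signed overflow can only happen when a and b share a sign and the sum's sign flips,
//i.e. when (res ^ a) has the sign bit set while (b ^ a) does not - exactly what the andnot/srai mask
//detects above. res_sat pre-computes the per-lane clamp from a's sign bit (0x7FFFFFFF for a >= 0,
//0x80000000 for a < 0), and the final and/andnot/or picks the clamp in overflowed lanes and the
//wrapped sum elsewhere.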
3317
3318_NEON2SSESTORAGE int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0
3319_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
3320{
3321 _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
3322 _mm_store_si128((__m128i*)atmp, a);
3323 _mm_store_si128((__m128i*)btmp, b);
3324 res[0] = atmp[0] + btmp[0];
3325 res[1] = atmp[1] + btmp[1];
3326
3327 atmp[0] = (atmp[0] >> 63) + (~_SIGNBIT64);
3328 atmp[1] = (atmp[1] >> 63) + (~_SIGNBIT64);
3329
3330 if ((int64_t)((btmp[0] ^ atmp[0]) | ~(res[0] ^ btmp[0]))>=0) {
3331 res[0] = atmp[0];
3332 }
3333 if ((int64_t)((btmp[1] ^ atmp[1]) | ~(res[1] ^ btmp[1]))>=0) {
3334 res[1] = atmp[1];
3335 }
3336 return _mm_load_si128((__m128i*)res);
3337}
3338
3339_NEON2SSE_GLOBAL uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0
3340#define vqaddq_u8 _mm_adds_epu8
3341
_NEON2SSE_GLOBAL uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.U16 q0,q0,q0
3343#define vqaddq_u16 _mm_adds_epu16
3344
3345_NEON2SSESTORAGE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0
3346_NEON2SSE_INLINE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b)
3347{
3348 __m128i c80000000, cmp, subsum, suba, sum;
3349 c80000000 = _mm_set1_epi32 (0x80000000);
3350 sum = _mm_add_epi32 (a, b);
3351 subsum = _mm_sub_epi32 (sum, c80000000);
3352 suba = _mm_sub_epi32 (a, c80000000);
3353 cmp = _mm_cmpgt_epi32 ( suba, subsum); //no unsigned comparison, need to go to signed
3354 return _mm_or_si128 (sum, cmp); //saturation
3355}
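//Rationale: unsigned overflow means sum < a (unsigned). SSE2 only has a signed 32-bit compare, but
//x <u y is equivalent to (x - 0x80000000) <s (y - 0x80000000), hence both operands are biased by
//0x80000000 before _mm_cmpgt_epi32; overflowed lanes become all ones and the OR clamps them to 0xFFFFFFFF.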
3356
3357_NEON2SSESTORAGE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
3358#ifdef USE_SSE4
3359 _NEON2SSE_INLINE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b)
3360 {
3361 __m128i c80000000, sum, cmp, suba, subsum;
3362 c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0);
3363 sum = _mm_add_epi64 (a, b);
3364 subsum = _mm_sub_epi64 (sum, c80000000);
3365 suba = _mm_sub_epi64 (a, c80000000);
3366 cmp = _mm_cmpgt_epi64 ( suba, subsum); //no unsigned comparison, need to go to signed, SSE4.2!!!
3367 return _mm_or_si128 (sum, cmp); //saturation
3368 }
3369#else
3370 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
3371 {
3372 _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
3373 _mm_store_si128((__m128i*)atmp, a);
3374 _mm_store_si128((__m128i*)btmp, b);
3375 res[0] = atmp[0] + btmp[0];
3376 res[1] = atmp[1] + btmp[1];
3377 if (res[0] < atmp[0]) res[0] = ~(uint64_t)0;
3378 if (res[1] < atmp[1]) res[1] = ~(uint64_t)0;
3379 return _mm_load_si128((__m128i*)(res));
3380 }
3381#endif
3382
3383
3384//******************* Vector add high half (truncated) ******************
3385//************************************************************************
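//vaddhn returns the upper half of each (non-widened) sum, e.g. for uint16 lanes a[i] = 0x1234 and
//b[i] = 0x0101 the result lane is (0x1234 + 0x0101) >> 8 = 0x13.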
3386_NEON2SSESTORAGE int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0
3387_NEON2SSE_INLINE int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b) // VADDHN.I16 d0,q0,q0
3388{
3389 int8x8_t res64;
3390 __m128i sum;
3391 sum = _mm_add_epi16 (a, b);
3392 sum = _mm_srai_epi16 (sum, 8);
3393 sum = _mm_packs_epi16 (sum, sum); //use 64 low bits only
3394 return64(sum);
3395}
3396
3397_NEON2SSESTORAGE int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0
3398_NEON2SSE_INLINE int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b) // VADDHN.I32 d0,q0,q0
3399{
3400 int16x4_t res64;
3401 __m128i sum;
3402 sum = _mm_add_epi32 (a, b);
3403 sum = _mm_srai_epi32(sum, 16);
3404 sum = _mm_packs_epi32 (sum, sum); //use 64 low bits only
3405 return64(sum);
3406}
3407
3408_NEON2SSESTORAGE int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0
3409_NEON2SSE_INLINE int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b)
3410{
3411 int32x2_t res64;
3412 __m128i sum;
3413 sum = _mm_add_epi64 (a, b);
3414 sum = _mm_shuffle_epi32(sum, 1 | (3 << 2) | (0 << 4) | (2 << 6));
3415 return64(sum);
3416}
3417
3418_NEON2SSESTORAGE uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0
3419_NEON2SSE_INLINE uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b) // VADDHN.I16 d0,q0,q0
3420{
3421 uint8x8_t res64;
3422 __m128i sum;
3423 sum = _mm_add_epi16 (a, b);
3424 sum = _mm_srli_epi16 (sum, 8);
3425 sum = _mm_packus_epi16 (sum,sum); //use 64 low bits only
3426 return64(sum);
3427}
3428
3429_NEON2SSESTORAGE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0
3430_NEON2SSE_INLINE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b) // VADDHN.I32 d0,q0,q0
3431{
3432 uint16x4_t res64;
3433 __m128i sum;
3434 sum = _mm_add_epi32 (a, b);
3435 sum = _mm_srli_epi32 (sum, 16);
3436#ifdef USE_SSE4
3437 sum = _MM_PACKUS1_EPI32 (sum); //use 64 low bits only
3438#else
3439 sum = _mm_shuffle_epi8 (sum, *(__m128i*) mask8_32_even_odd); //go to 16 bits
3440#endif
3441 return64(sum);
3442}
3443
3444_NEON2SSE_GLOBAL uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0
3445#define vaddhn_u64 vaddhn_s64
3446
3447//*********** Vector rounding add high half: vraddhn_<type> ******************.
3448//***************************************************************************
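//The rounding variant adds half of the discarded range before taking the high half, e.g. for uint16
//lanes the result is (a[i] + b[i] + 0x80) >> 8, so a[i] = 0x12C0, b[i] = 0 rounds up to 0x13 where the
//truncating vaddhn above gives 0x12.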
3449_NEON2SSESTORAGE int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0
3450_NEON2SSE_INLINE int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b) // VRADDHN.I16 d0,q0,q0
3451{
3452 int8x8_t res64;
3453 __m128i sum, mask1;
3454 sum = _mm_add_epi16 (a, b);
3455 mask1 = _mm_slli_epi16(sum, 8); //shift left then back right to
3456 mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero
3457 sum = _mm_srai_epi16 (sum, 8); //get high half
3458 sum = _mm_add_epi16 (sum, mask1); //actual rounding
3459 sum = _mm_packs_epi16 (sum, sum);
3460 return64(sum);
3461}
3462
3463_NEON2SSESTORAGE int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0
3464_NEON2SSE_INLINE int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b) // VRADDHN.I32 d0,q0,q0
3465{
    //this SIMD solution may not be optimal, a serial one may be faster
3467 int16x4_t res64;
3468 __m128i sum, mask1;
3469 sum = _mm_add_epi32 (a, b);
3470 mask1 = _mm_slli_epi32(sum, 16); //shift left then back right to
3471 mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero
3472 sum = _mm_srai_epi32 (sum, 16); //get high half
3473 sum = _mm_add_epi32 (sum, mask1); //actual rounding
3474 sum = _mm_packs_epi32 (sum, sum);
3475 return64(sum);
3476}
3477
3478_NEON2SSESTORAGE int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0
3479_NEON2SSE_INLINE int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b)
3480{
    //this SIMD solution may not be optimal, a serial one may be faster
3482 int32x2_t res64;
3483 __m128i sum, mask1;
3484 sum = _mm_add_epi64 (a, b);
3485 mask1 = _mm_slli_epi64(sum, 32); //shift left then back right to
3486 mask1 = _mm_srli_epi64(mask1,31); //get 31-th bit 1 or zero
3487 sum = _mm_add_epi32 (sum, mask1); //actual high half rounding
3488 sum = _mm_shuffle_epi32(sum, 1 | (3 << 2) | (1 << 4) | (3 << 6));
3489 return64(sum);
3490}
3491
3492_NEON2SSESTORAGE uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0
3493_NEON2SSE_INLINE uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b) // VRADDHN.I16 d0,q0,q0
3494{
3495 uint8x8_t res64;
3496 __m128i sum, mask1;
3497 sum = _mm_add_epi16 (a, b);
3498 mask1 = _mm_slli_epi16(sum, 8); //shift left then back right to
3499 mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero
3500 sum = _mm_srai_epi16 (sum, 8); //get high half
3501 sum = _mm_add_epi16 (sum, mask1); //actual rounding
3502 sum = _mm_packus_epi16 (sum, sum);
3503 return64(sum);
3504}
3505
3506_NEON2SSESTORAGE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0
3507_NEON2SSE_INLINE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b)
3508{
    //this SIMD solution may not be optimal, a serial one may be faster
3510 uint16x4_t res64;
3511 __m128i sum, mask1;
3512 sum = _mm_add_epi32 (a, b);
3513 mask1 = _mm_slli_epi32(sum, 16); //shift left then back right to
3514 mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero
3515 sum = _mm_srai_epi32 (sum, 16); //get high half
3516 sum = _mm_add_epi32 (sum, mask1); //actual rounding
3517 sum = _MM_PACKUS1_EPI32 (sum);
3518 return64(sum);
3519}
3520
3521_NEON2SSE_GLOBAL uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0
3522#define vraddhn_u64 vraddhn_s64
3523
3524//**********************************************************************************
3525//********* Multiplication *************************************
3526//**************************************************************************************
3527
3528//Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
//Since the result is not widened, these functions are equivalent to "multiply low" in x86
3530_NEON2SSESTORAGE int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0
3531_NEON2SSE_INLINE int8x8_t vmul_s8(int8x8_t a, int8x8_t b) // VMUL.I8 d0,d0,d0
3532{
3533 // no 8 bit simd multiply, need to go to 16 bits in SSE
3534 int8x8_t res64;
3535 __m128i a128, b128, res;
3536 a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits
3537 b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
3538 res = _mm_mullo_epi16 (a128, b128);
3539 res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit from 16, use 64 low bits only
3540 return64(res);
3541}
3542
3543_NEON2SSE_GLOBAL int16x4_t vmul_s16(int16x4_t a, int16x4_t b); // VMUL.I16 d0,d0,d0
3544#define vmul_s16 vmul_u16
3545
3546_NEON2SSE_GLOBAL int32x2_t vmul_s32(int32x2_t a, int32x2_t b); // VMUL.I32 d0,d0,d0
3547#define vmul_s32 vmul_u32
3548
3549_NEON2SSESTORAGE float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0
3550_NEON2SSE_INLINE float32x2_t vmul_f32(float32x2_t a, float32x2_t b)
3551{
3552 float32x4_t tmp;
3553 __m64_128 res64;
3554 tmp = _mm_mul_ps(_pM128(a),_pM128(b));
3555 _M64f(res64, tmp); //use low 64 bits
3556 return res64;
3557}
3558
3559_NEON2SSESTORAGE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0
3560_NEON2SSE_INLINE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b) // VMUL.I8 d0,d0,d0
3561{
3562 // no 8 bit simd multiply, need to go to 16 bits in SSE
3563 uint8x8_t res64;
3564 __m128i mask, a128, b128, res;
3565 mask = _mm_set1_epi16(0xff);
3566 a128 = _MM_CVTEPU8_EPI16 (_pM128i(a));
3567 b128 = _MM_CVTEPU8_EPI16 (_pM128i(b));
3568 res = _mm_mullo_epi16 (a128, b128);
3569 res = _mm_and_si128(res, mask); //to avoid saturation
3570 res = _mm_packus_epi16 (res,res); //use only low 64 bits
3571 return64(res);
3572}
3573
3574_NEON2SSESTORAGE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0
3575_NEON2SSE_INLINE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b)
3576{
3577 uint16x4_t res64;
3578 return64(_mm_mullo_epi16(_pM128i(a),_pM128i(b)));
3579}
3580
3581_NEON2SSESTORAGE uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0
3582_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
3583{
3584 uint32x2_t res;
3585 res.m64_u32[0] = a.m64_u32[0] * b.m64_u32[0];
3586 res.m64_u32[1] = a.m64_u32[1] * b.m64_u32[1];
3587 return res;
3588}
3589
3590_NEON2SSESTORAGE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0
3591_NEON2SSE_INLINE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b)
3592{
3593 //may be optimized
3594 poly8x8_t res64;
3595 __m128i a64, b64, c1, res, tmp, bmasked;
3596 int i;
3597 a64 = _pM128i(a);
3598 b64 = _pM128i(b);
3599 c1 = _mm_cmpeq_epi8 (a64,a64); //all ones 0xff....
3600 c1 = vshrq_n_u8(c1,7); //0x1
3601 bmasked = _mm_and_si128(b64, c1); //0x1
3602 res = vmulq_u8(a64, bmasked);
3603 for(i = 1; i<8; i++) {
3604 c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
3605 bmasked = _mm_and_si128(b64, c1); //0x1
3606 tmp = vmulq_u8(a64, bmasked);
3607 res = _mm_xor_si128(res, tmp);
3608 }
3609 return64 (res);
3610}
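//Rationale: poly8 multiplication is carry-less (GF(2)): partial products are combined with XOR rather
//than addition, which is what the loop above does - for every bit i of b it multiplies a by that single
//bit (already shifted into place) and XORs the result in. E.g. 0x03 * 0x03 = 0x05 as polynomials,
//since (x + 1)*(x + 1) = x^2 + 1 over GF(2); only the low 8 bits of each product are kept, as VMUL.P8 requires.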
3611
3612_NEON2SSESTORAGE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0
3613_NEON2SSE_INLINE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b) // VMUL.I8 q0,q0,q0
3614{
3615 // no 8 bit simd multiply, need to go to 16 bits
    //solution may not be optimal
3617 __m128i a16, b16, r16_1, r16_2;
3618 a16 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1
3619 b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
3620 r16_1 = _mm_mullo_epi16 (a16, b16);
3621 //swap hi and low part of a and b to process the remaining data
3622 a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
3623 b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
3624 a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1
    b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
3626
3627 r16_2 = _mm_mullo_epi16 (a16, b16);
3628 r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*)mask8_16_even_odd); //return to 8 bit
3629 r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*)mask8_16_even_odd); //return to 8 bit
3630
3631 return _mm_unpacklo_epi64(r16_1, r16_2);
3632}
3633
3634_NEON2SSE_GLOBAL int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0
3635#define vmulq_s16 _mm_mullo_epi16
3636
3637_NEON2SSE_GLOBAL int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0
3638#define vmulq_s32 _MM_MULLO_EPI32 //SSE4.1
3639
3640_NEON2SSE_GLOBAL float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
3641#define vmulq_f32 _mm_mul_ps
3642
3643_NEON2SSESTORAGE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0
3644_NEON2SSE_INLINE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b) // VMUL.I8 q0,q0,q0
3645{
3646 // no 8 bit simd multiply, need to go to 16 bits
    //solution may not be optimal
3648 __m128i maskff, a16, b16, r16_1, r16_2;
3649 maskff = _mm_set1_epi16(0xff);
3650 a16 = _MM_CVTEPU8_EPI16 (a); // SSE 4.1
3651 b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
3652 r16_1 = _mm_mullo_epi16 (a16, b16);
3653 r16_1 = _mm_and_si128(r16_1, maskff); //to avoid saturation
3654 //swap hi and low part of a and b to process the remaining data
3655 a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
3656 b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
3657 a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1
3658 b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
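    //sign extension is used for the high half, which is fine: only the low 8 bits of each 16-bit product are kept below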
3659
3660 r16_2 = _mm_mullo_epi16 (a16, b16);
3661 r16_2 = _mm_and_si128(r16_2, maskff); //to avoid saturation
3662 return _mm_packus_epi16 (r16_1, r16_2);
3663}
3664
3665_NEON2SSE_GLOBAL uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0
3666#define vmulq_u16 _mm_mullo_epi16
3667
3668_NEON2SSE_GLOBAL uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0
3669#define vmulq_u32 _MM_MULLO_EPI32 //SSE4.1
3670
3671_NEON2SSESTORAGE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0
3672_NEON2SSE_INLINE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b)
3673{
3674 //may be optimized
3675 __m128i c1, res, tmp, bmasked;
3676 int i;
3677 c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff....
3678 c1 = vshrq_n_u8(c1,7); //0x1
3679 bmasked = _mm_and_si128(b, c1); //0x1
3680 res = vmulq_u8(a, bmasked);
3681 for(i = 1; i<8; i++) {
3682 c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
3683 bmasked = _mm_and_si128(b, c1); //0x1
3684 tmp = vmulq_u8(a, bmasked);
3685 res = _mm_xor_si128(res, tmp);
3686 }
3687 return res;
3688}
3689
3690//************************* Vector long multiply ***********************************
3691//****************************************************************************
3692_NEON2SSESTORAGE int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0
3693_NEON2SSE_INLINE int16x8_t vmull_s8(int8x8_t a, int8x8_t b) // VMULL.S8 q0,d0,d0
3694{
3695 //no 8 bit simd multiply, need to go to 16 bits
3696 __m128i a16, b16;
3697 a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
3698 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1
3699 return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit
3700}
3701
3702_NEON2SSESTORAGE int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0
3703_NEON2SSE_INLINE int32x4_t vmull_s16(int16x4_t a, int16x4_t b) // VMULL.S16 q0,d0,d0
3704{
3705#ifdef USE_SSE4
3706 __m128i a16, b16;
3707 a16 = _MM_CVTEPI16_EPI32 (_pM128i(a)); // SSE 4.1
3708 b16 = _MM_CVTEPI16_EPI32 (_pM128i(b)); // SSE 4.1
3709 return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1
3710#else
3711 __m128i low, hi, a128,b128;
3712 a128 = _pM128i(a);
3713 b128 = _pM128i(b);
3714 low = _mm_mullo_epi16(a128,b128);
3715 hi = _mm_mulhi_epi16(a128,b128);
3716 return _mm_unpacklo_epi16(low,hi);
3717#endif
3718}
3719
3720_NEON2SSESTORAGE int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0
3721_NEON2SSE_INLINE int64x2_t vmull_s32(int32x2_t a, int32x2_t b) // VMULL.S32 q0,d0,d0
3722{
3723 __m128i ab, ba, a128, b128;
3724 a128 = _pM128i(a);
3725 b128 = _pM128i(b);
3726 ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1
3727 ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1
    return _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64-bit result
3729}
3730
3731_NEON2SSESTORAGE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0
3732_NEON2SSE_INLINE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b) // VMULL.U8 q0,d0,d0
3733{
3734 //no 8 bit simd multiply, need to go to 16 bits
3735 __m128i a16, b16;
3736 a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1
3737 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1
3738 return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit
3739}
3740
_NEON2SSESTORAGE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.U16 q0,d0,d0
_NEON2SSE_INLINE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b) // VMULL.U16 q0,d0,d0
3743{
3744#ifdef USE_SSE4
3745 __m128i a16, b16;
3746 a16 = _MM_CVTEPU16_EPI32 (_pM128i(a)); // SSE 4.1
3747 b16 = _MM_CVTEPU16_EPI32 (_pM128i(b)); // SSE 4.1
3748 return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1
3749#else
3750 __m128i a128,b128,low, hi;
3751 a128 = _pM128i(a);
3752 b128 = _pM128i(b);
3753 low = _mm_mullo_epi16(a128,b128);
3754 hi = _mm_mulhi_epu16(a128,b128);
3755 return _mm_unpacklo_epi16(low,hi);
3756#endif
3757}
3758
3759_NEON2SSESTORAGE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0
3760_NEON2SSE_INLINE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b) // VMULL.U32 q0,d0,d0
3761{
    //may not be optimal compared with a serial implementation
3763 __m128i ab, ba, a128, b128;
3764 a128 = _pM128i(a);
3765 b128 = _pM128i(b);
3766 ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1
3767 ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1
    return _mm_mul_epu32 (ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64-bit result
3769}
3770
3771_NEON2SSESTORAGE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0
3772_NEON2SSE_INLINE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b)
3773{
3774 //may be optimized
3775 __m128i a128,b128, c1, a128_16, bmasked_16, res, tmp, bmasked;
3776 int i;
3777 a128 = _pM128i(a);
3778 b128 = _pM128i(b);
3779 c1 = _mm_cmpeq_epi8 (a128,a128); //all ones 0xff....
3780 c1 = vshrq_n_u8(c1,7); //0x1
3781 bmasked = _mm_and_si128(b128, c1); //0x1
3782
3783 a128_16 = _MM_CVTEPU8_EPI16 (a128); // SSE 4.1
3784 bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1
3785 res = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit
3786 for(i = 1; i<8; i++) {
3787 c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
3788 bmasked = _mm_and_si128(b128, c1); //0x1
3789 bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1
3790 tmp = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit, vmull_u8(a, bmasked);
3791 res = _mm_xor_si128(res, tmp);
3792 }
3793 return res;
3794}
3795
3796//****************Vector saturating doubling long multiply **************************
3797//*****************************************************************
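//vqdmull computes saturate(2 * a[i] * b[i]) into the double-width lane; the only input that can saturate
//is a[i] == b[i] == INT16_MIN (resp. INT32_MIN), where 2*(-32768)*(-32768) = 2^31 exceeds INT32_MAX and
//is clamped to 0x7FFFFFFF.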
3798_NEON2SSESTORAGE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
3799_NEON2SSE_INLINE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b)
3800{
    //the serial solution may be faster due to saturation
3802 __m128i res;
3803 res = vmull_s16(a, b);
3804 return vqd_s32(res);
3805}
3806
3807_NEON2SSESTORAGE int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
3808_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL)
3809{
    //the serial solution may be faster due to saturation
3811 __m128i res;
3812 res = vmull_s32(a,b);
3813 return vqaddq_s64(res,res); //slow serial function!!!!
3814}
3815
3816//********************* Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i] ************************
3817//******************************************************************************************
3818_NEON2SSESTORAGE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0
3819_NEON2SSE_INLINE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLA.I8 d0,d0,d0
3820{
3821 // no 8 bit x86 simd multiply, need to go to 16 bits, and use the low 64 bits
3822 int8x8_t res64;
3823 __m128i b128, c128, res;
3824 b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
3825 c128 = _MM_CVTEPI8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits
3826 res = _mm_mullo_epi16 (c128, b128);
3827 res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd);
3828 res = _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits
3829 return64(res);
3830}
3831
3832_NEON2SSESTORAGE int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0
3833_NEON2SSE_INLINE int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c)
3834{
3835 int16x4_t res64;
3836 return64(vmlaq_s16(_pM128i(a),_pM128i(b), _pM128i(c)));
3837}
3838
3839
3840_NEON2SSESTORAGE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0
3841_NEON2SSE_INLINE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLA.I32 d0,d0,d0
3842{
3843 int32x2_t res64;
3844 __m128i res;
3845 res = _MM_MULLO_EPI32 (_pM128i(b), _pM128i(c)); //SSE4.1
3846 res = _mm_add_epi32 (res, _pM128i(a)); //use the low 64 bits
3847 return64(res);
3848}
3849
3850_NEON2SSESTORAGE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0
3851_NEON2SSE_INLINE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c)
3852{
    //no FMA used here, just multiply and add:
3854 __m128 res;
3855 __m64_128 res64;
3856 res = _mm_mul_ps (_pM128(c), _pM128(b));
3857 res = _mm_add_ps (_pM128(a), res);
3858 _M64f(res64, res);
3859 return res64;
3860}
3861
3862_NEON2SSESTORAGE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0
3863_NEON2SSE_INLINE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) // VMLA.I8 d0,d0,d0
3864{
3865 // no 8 bit x86 simd multiply, need to go to 16 bits, and use the low 64 bits
3866 uint8x8_t res64;
3867 __m128i mask, b128, c128, res;
3868 mask = _mm_set1_epi16(0xff);
3869 b128 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
3870 c128 = _MM_CVTEPU8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits
3871 res = _mm_mullo_epi16 (c128, b128);
3872 res = _mm_and_si128(res, mask); //to avoid saturation
3873 res = _mm_packus_epi16 (res, res);
3874 res = _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits
3875 return64(res);
3876}
3877
3878_NEON2SSE_GLOBAL uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0
3879#define vmla_u16 vmla_s16
3880
3881_NEON2SSE_GLOBAL uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0
3882#define vmla_u32 vmla_s32
3883
3884_NEON2SSESTORAGE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0
3885_NEON2SSE_INLINE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLA.I8 q0,q0,q0
3886{
    //solution may not be optimal
3888 // no 8 bit simd multiply, need to go to 16 bits
3889 __m128i b16, c16, r16_1, a_2,r16_2;
3890 b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
3891 c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1
3892 r16_1 = _mm_mullo_epi16 (b16, c16);
3893 r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
3894 r16_1 = _mm_add_epi8 (r16_1, a);
3895 //swap hi and low part of a, b and c to process the remaining data
3896 a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
3897 b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
3898 c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
3899 b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
3900 c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1
3901
3902 r16_2 = _mm_mullo_epi16 (b16, c16);
3903 r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
3904 r16_2 = _mm_add_epi8(r16_2, a_2);
3905 return _mm_unpacklo_epi64(r16_1,r16_2);
3906}
3907
3908_NEON2SSESTORAGE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0
3909_NEON2SSE_INLINE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLA.I16 q0,q0,q0
3910{
3911 __m128i res;
3912 res = _mm_mullo_epi16 (c, b);
3913 return _mm_add_epi16 (res, a);
3914}
3915
3916_NEON2SSESTORAGE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0
3917_NEON2SSE_INLINE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLA.I32 q0,q0,q0
3918{
3919 __m128i res;
3920 res = _MM_MULLO_EPI32 (c, b); //SSE4.1
3921 return _mm_add_epi32 (res, a);
3922}
3923
3924_NEON2SSESTORAGE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0
3925_NEON2SSE_INLINE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLA.F32 q0,q0,q0
3926{
    //no FMA used here, just multiply and add:
3928 __m128 res;
3929 res = _mm_mul_ps (c, b);
3930 return _mm_add_ps (a, res);
3931}
3932
3933_NEON2SSESTORAGE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0
3934_NEON2SSE_INLINE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLA.I8 q0,q0,q0
3935{
    //solution may not be optimal
3937 // no 8 bit simd multiply, need to go to 16 bits
3938 __m128i b16, c16, r16_1, a_2, r16_2;
3939 b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
3940 c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1
3941 r16_1 = _mm_mullo_epi16 (b16, c16);
3942 r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
3943 r16_1 = _mm_add_epi8 (r16_1, a);
3944 //swap hi and low part of a, b and c to process the remaining data
3945 a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
3946 b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
3947 c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
3948 b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1
3949 c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1
3950
3951 r16_2 = _mm_mullo_epi16 (b16, c16);
3952 r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
3953 r16_2 = _mm_add_epi8(r16_2, a_2);
3954 return _mm_unpacklo_epi64(r16_1,r16_2);
3955}
3956
3957_NEON2SSE_GLOBAL uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0
3958#define vmlaq_u16 vmlaq_s16
3959
3960_NEON2SSE_GLOBAL uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0
3961#define vmlaq_u32 vmlaq_s32
3962
3963//********************** Vector widening multiply accumulate (long multiply accumulate):
// vmlal -> Vr[i] := Va[i] + Vb[i] * Vc[i] **************
3965//********************************************************************************************
3966_NEON2SSESTORAGE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0
3967_NEON2SSE_INLINE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLAL.S8 q0,d0,d0
3968{
3969 int16x8_t res;
3970 res = vmull_s8(b, c);
3971 return _mm_add_epi16 (res, a);
3972}
3973
3974_NEON2SSESTORAGE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0
3975_NEON2SSE_INLINE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLAL.S16 q0,d0,d0
3976{
    //may not be optimal compared with a serial implementation
3978 int32x4_t res;
3979 res = vmull_s16(b, c);
3980 return _mm_add_epi32 (res, a);
3981}
3982
3983_NEON2SSESTORAGE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0
3984_NEON2SSE_INLINE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLAL.S32 q0,d0,d0
3985{
    //may not be optimal compared with a serial implementation
3987 int64x2_t res;
3988 res = vmull_s32( b, c);
3989 return _mm_add_epi64 (res, a);
3990}
3991
3992_NEON2SSESTORAGE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0
3993_NEON2SSE_INLINE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLAL.U8 q0,d0,d0
3994{
3995 uint16x8_t res;
3996 res = vmull_u8(b, c);
3997 return _mm_add_epi16 (res, a);
3998}
3999
_NEON2SSESTORAGE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.U16 q0,d0,d0
_NEON2SSE_INLINE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLAL.U16 q0,d0,d0
4002{
    //may not be optimal compared with a serial implementation
4004 uint32x4_t res;
4005 res = vmull_u16(b, c);
4006 return _mm_add_epi32 (res, a);
4007}
4008
4009_NEON2SSESTORAGE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0
4010_NEON2SSE_INLINE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLAL.U32 q0,d0,d0
4011{
    //may not be optimal compared with a serial implementation
4013 int64x2_t res;
4014 res = vmull_u32( b,c);
4015 return _mm_add_epi64 (res, a);
4016}
4017
4018//******************** Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i] ***************************************
4019//********************************************************************************************
4020_NEON2SSESTORAGE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0
4021_NEON2SSE_INLINE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLS.I8 d0,d0,d0
4022{
4023 // no 8 bit simd multiply, need to go to 16 bits - and use the low 64 bits
4024 int8x8_t res64;
4025 __m128i res;
4026 res64 = vmul_s8(b,c);
4027 res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64));
4028 return64(res);
4029}
4030
4031_NEON2SSESTORAGE int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0
4032_NEON2SSE_INLINE int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c)
4033{
4034 int16x4_t res64;
4035 return64(vmlsq_s16(_pM128i(a),_pM128i(b), _pM128i(c)));
4036}
4037
4038
4039_NEON2SSESTORAGE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0
4040_NEON2SSE_INLINE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLS.I32 d0,d0,d0
4041{
4042 int32x2_t res64;
4043 __m128i res;
4044 res = _MM_MULLO_EPI32 (_pM128i(c),_pM128i( b)); //SSE4.1
4045 res = _mm_sub_epi32 (_pM128i(a),res); //use low 64 bits only
4046 return64(res);
4047}
4048
4049_NEON2SSESTORAGE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0
4050_NEON2SSE_INLINE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c)
4051{
4052 __m128 res;
4053 __m64_128 res64;
4054 res = _mm_mul_ps (_pM128(c), _pM128(b));
4055 res = _mm_sub_ps (_pM128(a), res);
4056 _M64f(res64, res);
4057 return res64;
4058}
4059
4060_NEON2SSESTORAGE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0
4061_NEON2SSE_INLINE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c)
4062{
4063 // no 8 bit simd multiply, need to go to 16 bits - and use the low 64 bits
4064 uint8x8_t res64;
4065 __m128i res;
4066 res64 = vmul_u8(b,c);
4067 res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64));
4068 return64(res);
4069}
4070
4071_NEON2SSE_GLOBAL uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0
4072#define vmls_u16 vmls_s16
4073
4074_NEON2SSE_GLOBAL uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0
4075#define vmls_u32 vmls_s32
4076
4077
4078_NEON2SSESTORAGE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0
4079_NEON2SSE_INLINE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLS.I8 q0,q0,q0
4080{
    //solution may not be optimal
4082 // no 8 bit simd multiply, need to go to 16 bits
4083 __m128i b16, c16, r16_1, a_2, r16_2;
4084 b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
4085 c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1
4086 r16_1 = _mm_mullo_epi16 (b16, c16);
4087 r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd);
4088 r16_1 = _mm_sub_epi8 (a, r16_1);
4089 //swap hi and low part of a, b, c to process the remaining data
4090 a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
4091 b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
4092 c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
4093 b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
4094 c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1
4095
4096 r16_2 = _mm_mullo_epi16 (b16, c16);
4097 r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
4098 r16_2 = _mm_sub_epi8 (a_2, r16_2);
4099 return _mm_unpacklo_epi64(r16_1,r16_2);
4100}
4101
4102_NEON2SSESTORAGE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0
4103_NEON2SSE_INLINE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLS.I16 q0,q0,q0
4104{
4105 __m128i res;
4106 res = _mm_mullo_epi16 (c, b);
4107 return _mm_sub_epi16 (a, res);
4108}
4109
4110_NEON2SSESTORAGE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0
4111_NEON2SSE_INLINE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLS.I32 q0,q0,q0
4112{
4113 __m128i res;
4114 res = _MM_MULLO_EPI32 (c, b); //SSE4.1
4115 return _mm_sub_epi32 (a, res);
4116}
4117
4118_NEON2SSESTORAGE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0
4119_NEON2SSE_INLINE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLS.F32 q0,q0,q0
4120{
4121 __m128 res;
4122 res = _mm_mul_ps (c, b);
4123 return _mm_sub_ps (a, res);
4124}
4125
4126_NEON2SSESTORAGE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0
4127_NEON2SSE_INLINE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLS.I8 q0,q0,q0
4128{
    //solution may not be optimal
4130 // no 8 bit simd multiply, need to go to 16 bits
4131 __m128i b16, c16, r16_1, a_2, r16_2;
4132 b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
4133 c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1
4134 r16_1 = _mm_mullo_epi16 (b16, c16);
4135 r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
4136 r16_1 = _mm_sub_epi8 (a, r16_1);
4137 //swap hi and low part of a, b and c to process the remaining data
4138 a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
4139 b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
4140 c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
4141 b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1
4142 c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1
4143
4144 r16_2 = _mm_mullo_epi16 (b16, c16);
4145 r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
4146 r16_2 = _mm_sub_epi8(a_2, r16_2);
4147 return _mm_unpacklo_epi64(r16_1,r16_2);
4148}
4149
4150_NEON2SSE_GLOBAL uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0
4151#define vmlsq_u16 vmlsq_s16
4152
4153_NEON2SSE_GLOBAL uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0
4154#define vmlsq_u32 vmlsq_s32
4155
4156//******************** Vector multiply subtract long (widening multiply subtract) ************************************
4157//*************************************************************************************************************
4158_NEON2SSESTORAGE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0
4159_NEON2SSE_INLINE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLSL.S8 q0,d0,d0
4160{
4161 int16x8_t res;
4162 res = vmull_s8(b, c);
4163 return _mm_sub_epi16 (a, res);
4164}
4165
4166_NEON2SSESTORAGE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0
4167_NEON2SSE_INLINE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLSL.S16 q0,d0,d0
4168{
    //may not be optimal compared with a serial implementation
4170 int32x4_t res;
4171 res = vmull_s16(b, c);
4172 return _mm_sub_epi32 (a, res);
4173}
4174
4175_NEON2SSESTORAGE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0
4176_NEON2SSE_INLINE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLSL.S32 q0,d0,d0
4177{
    //may not be optimal compared with a serial implementation
4179 int64x2_t res;
4180 res = vmull_s32( b,c);
4181 return _mm_sub_epi64 (a, res);
4182}
4183
4184_NEON2SSESTORAGE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0
4185_NEON2SSE_INLINE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLSL.U8 q0,d0,d0
4186{
4187 uint16x8_t res;
4188 res = vmull_u8(b, c);
4189 return _mm_sub_epi16 (a, res);
4190}
4191
_NEON2SSESTORAGE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.U16 q0,d0,d0
_NEON2SSE_INLINE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLSL.U16 q0,d0,d0
4194{
    //may not be optimal compared with a serial implementation
4196 uint32x4_t res;
4197 res = vmull_u16(b, c);
4198 return _mm_sub_epi32 (a, res);
4199}
4200
4201_NEON2SSESTORAGE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0
4202_NEON2SSE_INLINE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLSL.U32 q0,d0,d0
4203{
    //may not be optimal compared with a serial implementation
4205 int64x2_t res;
4206 res = vmull_u32( b,c);
4207 return _mm_sub_epi64 (a, res);
4208}
4209
4210//****** Vector saturating doubling multiply high **********************
4211//*************************************************************************
4212_NEON2SSESTORAGE int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b); // VQDMULH.S16 d0,d0,d0
4213_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
4214{
4215 int16x4_t res;
4216 int32_t a32, b32, i;
4217 for (i = 0; i<4; i++) {
4218 a32 = (int32_t) a.m64_i16[i];
4219 b32 = (int32_t) b.m64_i16[i];
4220 a32 = (a32 * b32) >> 15;
4221 res.m64_i16[i] = (a32 == 0x8000) ? 0x7fff : (int16_t) a32;
4222 }
4223 return res;
4224}
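//Note on the check above: (a32 * b32) >> 15 reaches 0x8000 only for a == b == -32768
//((-32768 * -32768) >> 15 == 32768), which would wrap to INT16_MIN when narrowed, so it is clamped to 0x7fff.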
4225
4226_NEON2SSESTORAGE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0
4227_NEON2SSE_INLINE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b) // no multiply high 32 bit SIMD in IA32, so need to do some tricks, serial solution may be faster
4228{
    //may not be optimal compared with a serial solution
4230 int32x2_t res64;
4231 __m128i mask;
4232 _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
4233 int64x2_t mul;
4234 mul = vmull_s32(a,b);
4235 mul = _mm_slli_epi64(mul,1); //double the result
4236 //at this point start treating 2 64-bit numbers as 4 32-bit
4237 mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
4238 mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
4239 mul = _mm_xor_si128 (mul, mask); //res saturated for 0x80000000
4240 return64(mul);
4241}
4242
4243_NEON2SSESTORAGE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
4244_NEON2SSE_INLINE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b) // VQDMULH.S16 q0,q0,q0
4245{
4246 __m128i res, res_lo, mask;
4247 _NEON2SSE_ALIGN_16 static const uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000};
4248 res = _mm_mulhi_epi16 (a, b);
4249 res = _mm_slli_epi16 (res, 1); //double the result, don't care about saturation
4250 res_lo = _mm_mullo_epi16 (a, b);
4251 res_lo = _mm_srli_epi16(res_lo,15); //take the highest bit
4252 res = _mm_add_epi16(res, res_lo); //combine results
4253 mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask);
4254 return _mm_xor_si128 (res, mask); //res saturated for 0x8000
4255}
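//Rationale: the high half of 2*(a*b) consists of bits 30..15 of the 32-bit product; _mm_mulhi_epi16
//supplies bits 31..16 (shifted left by one above), the missing bit 15 is recovered from the low product
//with the >> 15, and lanes equal to 0x8000 (possible only for a == b == -32768) are flipped to 0x7fff.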
4256
4257_NEON2SSESTORAGE int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0
4258_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
4259{
    // no 32-bit multiply-high SIMD in IA32; may not be optimal compared with a serial solution for the SSSE3 target
4261 __m128i ab, ba, mask, mul, mul1;
4262 _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
4263 ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1
4264 ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1
    mul = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64-bit result
4266 mul = _mm_slli_epi64(mul,1); //double the result
4267 ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3
4268 ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3
    mul1 = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64-bit result
4270 mul1 = _mm_slli_epi64(mul1,1); //double the result
4271 mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
4272 mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
4273 mul = _mm_unpacklo_epi64(mul, mul1);
4274 mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
4275 return _mm_xor_si128 (mul, mask); //res saturated for 0x80000000
4276}
4277
4278//********* Vector saturating rounding doubling multiply high ****************
4279//****************************************************************************
//If the _mm_mulhrs_xx functions are used, the result may differ slightly from the NEON one due to different rounding rules and operation order
4281_NEON2SSESTORAGE int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b); // VQRDMULH.S16 d0,d0,d0
4282_NEON2SSE_INLINE int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b)
4283{
4284 int16x4_t res64;
4285 return64(vqrdmulhq_s16(_pM128i(a), _pM128i(b)));
4286}
4287
4288_NEON2SSESTORAGE int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0
4289_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
4290{
    //may not be optimal compared with a serial solution
4292 int32x2_t res64;
4293 _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
4294 __m128i res_sat, mask, mask1;
4295 int64x2_t mul;
4296 mul = vmull_s32(a,b);
4297 res_sat = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered
4298 mask1 = _mm_slli_epi64(res_sat, 32); //shift left then back right to
4299 mask1 = _mm_srli_epi64(mask1,31); //get 31-th bit 1 or zero
4300 mul = _mm_add_epi32 (res_sat, mask1); //actual rounding
4301 //at this point start treating 2 64-bit numbers as 4 32-bit
4302 mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
4303 mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
4304 mul = _mm_xor_si128 (mul, mask); //res saturated for 0x80000000
4305 return64(mul);
4306}
4307
4308_NEON2SSESTORAGE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
4309_NEON2SSE_INLINE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b) // VQRDMULH.S16 q0,q0,q0
4310{
4311 __m128i mask, res;
4312 _NEON2SSE_ALIGN_16 static const uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000};
4313 res = _mm_mulhrs_epi16 (a, b);
4314 mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask);
4315 return _mm_xor_si128 (res, mask); //res saturated for 0x8000
4316}
4317
4318_NEON2SSESTORAGE int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0
4319_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
4320{
    // no 32-bit multiply-high SIMD in IA32; may not be optimal compared with a serial solution for the SSSE3 target
4322 __m128i ab, ba, mask, mul, mul1, mask1;
4323 _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
4324 ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1
4325 ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1
    mul = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64-bit result
4327 mul = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered
4328 mask1 = _mm_slli_epi64(mul, 32); //shift left then back right to
4329 mask1 = _mm_srli_epi64(mask1,31); //get 31-th bit 1 or zero
4330 mul = _mm_add_epi32 (mul, mask1); //actual rounding
4331
4332 ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3
4333 ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3
    mul1 = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64-bit result
4335 mul1 = _mm_slli_epi64 (mul1, 1); //double the result, saturation not considered
4336 mask1 = _mm_slli_epi64(mul1, 32); //shift left then back right to
4337 mask1 = _mm_srli_epi64(mask1,31); //get 31-th bit 1 or zero
4338 mul1 = _mm_add_epi32 (mul1, mask1); //actual rounding
4339 //at this point start treating 2 64-bit numbers as 4 32-bit
4340 mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
4341 mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
4342 mul = _mm_unpacklo_epi64(mul, mul1);
4343 mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
4344 return _mm_xor_si128 (mul, mask); //res saturated for 0x80000000
4345}
4346
4347//*************Vector widening saturating doubling multiply accumulate (long saturating doubling multiply accumulate) *****
4348//*************************************************************************************************************************
4349_NEON2SSESTORAGE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0
4350_NEON2SSE_INLINE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VQDMLAL.S16 q0,d0,d0
4351{
    //not an optimal SIMD solution, serial may be faster
4353 __m128i res32;
4354 res32 = vmull_s16(b, c);
4355 res32 = vqd_s32(res32); //doubling & saturation ,if no saturation we could use _mm_slli_epi32 (res, 1);
4356 return vqaddq_s32(res32, a); //saturation
4357}
4358
4359_NEON2SSESTORAGE int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0
4360_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c),_NEON2SSE_REASON_SLOW_SERIAL)
4361{
4362 __m128i res64;
4363 res64 = vmull_s32(b,c);
    res64 = vqaddq_s64(res64, res64); //doubling with saturation; without the saturation requirement _mm_slli_epi64 (res, 1) would suffice
4365 return vqaddq_s64(res64, a); //saturation
4366}
4367
4368//************************************************************************************
4369//****************** Vector subtract ***********************************************
4370//************************************************************************************
4371_NEON2SSESTORAGE int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0
4372_NEON2SSE_INLINE int8x8_t vsub_s8(int8x8_t a, int8x8_t b)
4373{
4374 int8x8_t res64;
4375 return64(_mm_sub_epi8(_pM128i(a),_pM128i(b)));
4376}
4377
4378
4379_NEON2SSESTORAGE int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0
4380_NEON2SSE_INLINE int16x4_t vsub_s16(int16x4_t a, int16x4_t b)
4381{
4382 int16x4_t res64;
4383 return64(_mm_sub_epi16(_pM128i(a),_pM128i(b)));
4384}
4385
4386
4387_NEON2SSESTORAGE int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0
4388_NEON2SSE_INLINE int32x2_t vsub_s32(int32x2_t a, int32x2_t b)
4389{
4390 int32x2_t res64;
4391 return64(_mm_sub_epi32(_pM128i(a),_pM128i(b)));
4392}
4393
4394
4395_NEON2SSESTORAGE int64x1_t vsub_s64(int64x1_t a, int64x1_t b); // VSUB.I64 d0,d0,d0
4396_NEON2SSE_INLINE int64x1_t vsub_s64(int64x1_t a, int64x1_t b)
4397{
4398 int64x1_t res64;
4399 res64.m64_i64[0] = a.m64_i64[0] - b.m64_i64[0];
4400 return res64;
4401}
4402
4403
4404_NEON2SSESTORAGE float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0
4405_NEON2SSE_INLINE float32x2_t vsub_f32(float32x2_t a, float32x2_t b)
4406{
4407 float32x2_t res;
4408 res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0];
4409 res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1];
4410 return res;
4411}
4412
4413_NEON2SSE_GLOBAL uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0
4414#define vsub_u8 vsub_s8
4415
4416_NEON2SSE_GLOBAL uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0
4417#define vsub_u16 vsub_s16
4418
4419_NEON2SSE_GLOBAL uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0
4420#define vsub_u32 vsub_s32
4421
4422
4423_NEON2SSESTORAGE uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b); // VSUB.I64 d0,d0,d0
4424_NEON2SSE_INLINE uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b)
4425{
4426 int64x1_t res64;
4427 res64.m64_u64[0] = a.m64_u64[0] - b.m64_u64[0];
4428 return res64;
4429}
4430
4431
4432_NEON2SSE_GLOBAL int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0
4433#define vsubq_s8 _mm_sub_epi8
4434
4435_NEON2SSE_GLOBAL int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0
4436#define vsubq_s16 _mm_sub_epi16
4437
4438_NEON2SSE_GLOBAL int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0
4439#define vsubq_s32 _mm_sub_epi32
4440
4441_NEON2SSE_GLOBAL int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0
4442#define vsubq_s64 _mm_sub_epi64
4443
4444_NEON2SSE_GLOBAL float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0
4445#define vsubq_f32 _mm_sub_ps
4446
4447_NEON2SSE_GLOBAL uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0
4448#define vsubq_u8 _mm_sub_epi8
4449
4450_NEON2SSE_GLOBAL uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0
4451#define vsubq_u16 _mm_sub_epi16
4452
4453_NEON2SSE_GLOBAL uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0
4454#define vsubq_u32 _mm_sub_epi32
4455
4456_NEON2SSE_GLOBAL uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0
4457#define vsubq_u64 _mm_sub_epi64
4458
4459//***************Vector long subtract: vsub -> Vr[i]:=Va[i]-Vb[i] ******************
4460//***********************************************************************************
4461//Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
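//The subtraction is performed at the doubled width, so no wrap-around occurs:
//e.g. vsubl_u8 with lanes 5 and 200 gives (uint16_t)0xFF3D (i.e. -195) rather than the 8-bit wrapped 0x3D.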
4462_NEON2SSESTORAGE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0
4463_NEON2SSE_INLINE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b) // VSUBL.S8 q0,d0,d0
4464{
4465 __m128i a16, b16;
4466 a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
4467 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
4468 return _mm_sub_epi16 (a16, b16);
4469}
4470
4471_NEON2SSESTORAGE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0
4472_NEON2SSE_INLINE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b) // VSUBL.S16 q0,d0,d0
4473{
4474 __m128i a32, b32;
4475 a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
4476 b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
4477 return _mm_sub_epi32 (a32, b32);
4478}
4479
4480_NEON2SSESTORAGE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0
4481_NEON2SSE_INLINE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b) // VSUBL.S32 q0,d0,d0
4482{
    //may not be optimal
4484 __m128i a64, b64;
4485 a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1
4486 b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1,
4487 return _mm_sub_epi64 (a64, b64);
4488}
4489
4490_NEON2SSESTORAGE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0
4491_NEON2SSE_INLINE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b) // VSUBL.U8 q0,d0,d0
4492{
4493 __m128i a16, b16;
4494 a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1,
4495 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
4496 return _mm_sub_epi16 (a16, b16);
4497}
4498
4499_NEON2SSESTORAGE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.s16 q0,d0,d0
4500_NEON2SSE_INLINE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b) // VSUBL.s16 q0,d0,d0
4501{
4502 __m128i a32, b32;
4503 a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1
4504 b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1,
4505 return _mm_sub_epi32 (a32, b32);
4506}
4507
4508_NEON2SSESTORAGE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0
4509_NEON2SSE_INLINE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b) // VSUBL.U32 q0,d0,d0
4510{
    //may not be optimal
4512 __m128i a64, b64;
4513 a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1
4514 b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1,
4515 return _mm_sub_epi64 (a64, b64);
4516}
4517
4518//***************** Vector wide subtract: vsub -> Vr[i]:=Va[i]-Vb[i] **********************************
4519//*****************************************************************************************************
4520_NEON2SSESTORAGE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0
4521_NEON2SSE_INLINE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b) // VSUBW.S8 q0,q0,d0
4522{
4523 __m128i b16;
4524 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
4525 return _mm_sub_epi16 (a, b16);
4526}
4527
4528_NEON2SSESTORAGE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0
4529_NEON2SSE_INLINE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b) // VSUBW.S16 q0,q0,d0
4530{
4531 __m128i b32;
4532 b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
4533 return _mm_sub_epi32 (a, b32);
4534}
4535
4536_NEON2SSESTORAGE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0
4537_NEON2SSE_INLINE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b) // VSUBW.S32 q0,q0,d0
4538{
4539 __m128i b64;
4540 b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
4541 return _mm_sub_epi64 (a, b64);
4542}
4543
4544_NEON2SSESTORAGE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0
4545_NEON2SSE_INLINE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b) // VSUBW.U8 q0,q0,d0
4546{
4547 __m128i b16;
4548 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
4549 return _mm_sub_epi16 (a, b16);
4550}
4551
4552_NEON2SSESTORAGE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.s16 q0,q0,d0
4553_NEON2SSE_INLINE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b) // VSUBW.s16 q0,q0,d0
4554{
4555 __m128i b32;
4556 b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1,
4557 return _mm_sub_epi32 (a, b32);
4558}
4559
4560_NEON2SSESTORAGE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0
4561_NEON2SSE_INLINE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b) // VSUBW.U32 q0,q0,d0
4562{
4563 __m128i b64;
4564 b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
4565 return _mm_sub_epi64 (a, b64);
4566}
4567
4568//************************Vector saturating subtract *********************************
4569//*************************************************************************************
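//Results are clamped to the lane range instead of wrapping,
//e.g. vqsub_u8: 5 - 10 -> 0;  vqsub_s8: -100 - 100 -> -128, 100 - (-100) -> 127.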
4570_NEON2SSESTORAGE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0
4571_NEON2SSE_INLINE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b)
4572{
4573 int8x8_t res64;
4574 return64(_mm_subs_epi8(_pM128i(a),_pM128i(b)));
4575}
4576
4577
4578_NEON2SSESTORAGE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0
4579_NEON2SSE_INLINE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b)
4580{
4581 int16x4_t res64;
4582 return64(_mm_subs_epi16(_pM128i(a),_pM128i(b)));
4583}
4584
4585
4586_NEON2SSESTORAGE int32x2_t vqsub_s32(int32x2_t a, int32x2_t b); // VQSUB.S32 d0,d0,d0
4587_NEON2SSE_INLINE int32x2_t vqsub_s32(int32x2_t a, int32x2_t b)
4588{
4589 int32x2_t res64;
4590 return64(vqsubq_s32(_pM128i(a), _pM128i(b)));
4591}
4592
4593
4594_NEON2SSESTORAGE int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqsub_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD solution
4596{
4597 uint64x1_t res;
4598 uint64_t a64,b64;
4599 a64 = a.m64_u64[0];
4600 b64 = b.m64_u64[0];
4601 res.m64_u64[0] = a64 - b64;
4602
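    //a64 is reused below as the saturation value: INT64_MIN if a was negative, INT64_MAX otherwise.
    //Its sign bit equals that of the original a, so the usual overflow test ((a ^ b) & (a ^ res)) < 0,
    //which inspects sign bits only, still works with the saturation value substituted for a.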
4603 a64 = (a64 >> 63) + (~_SIGNBIT64);
4604 if ((int64_t)((a64 ^ b64) & (a64 ^ res.m64_u64[0])) < 0) {
4605 res.m64_u64[0] = a64;
4606 }
4607 return res;
4608}
4609
4610_NEON2SSESTORAGE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0
4611_NEON2SSE_INLINE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b)
4612{
4613 uint8x8_t res64;
4614 return64(_mm_subs_epu8(_pM128i(a),_pM128i(b)));
4615}
4616
4617
4618_NEON2SSESTORAGE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.s16 d0,d0,d0
4619_NEON2SSE_INLINE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b)
4620{
4621 uint16x4_t res64;
4622 return64(_mm_subs_epu16(_pM128i(a),_pM128i(b)));
4623}
4624
4625
4626_NEON2SSESTORAGE uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b); // VQSUB.U32 d0,d0,d0
4627_NEON2SSE_INLINE uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b)
4628{
4629 uint32x2_t res64;
4630 return64(vqsubq_u32(_pM128i(a), _pM128i(b)));
4631}
4632
4633
4634_NEON2SSESTORAGE uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0
4635_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
4636{
4637 uint64x1_t res;
4638 uint64_t a64, b64;
4639 a64 = _Ui64(a);
4640 b64 = _Ui64(b);
4641 if (a64 > b64) {
4642 res.m64_u64[0] = a64 - b64;
4643 } else {
4644 res.m64_u64[0] = 0;
4645 }
4646 return res;
4647}
4648
4649_NEON2SSE_GLOBAL int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
4650#define vqsubq_s8 _mm_subs_epi8
4651
4652_NEON2SSE_GLOBAL int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
4653#define vqsubq_s16 _mm_subs_epi16
4654
4655_NEON2SSESTORAGE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
4656_NEON2SSE_INLINE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b)
4657{
    //no corresponding x86 SIMD solution, special tricks are necessary. Overflow is possible only if a and b have opposite signs and the result has the sign opposite to a
4659 __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a;
4660 c7fffffff = _mm_set1_epi32(0x7fffffff);
4661 res = _mm_sub_epi32(a, b);
4662 res_sat = _mm_srli_epi32(a, 31);
4663 res_sat = _mm_add_epi32(res_sat, c7fffffff);
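    //res_sat now holds the per-lane saturation value: 0x7fffffff where a >= 0, 0x80000000 where a < 0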
4664 res_xor_a = _mm_xor_si128(res, a);
4665 b_xor_a = _mm_xor_si128(b, a);
4666 res_xor_a = _mm_and_si128(b_xor_a, res_xor_a);
    res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if overflow occurred, all zeros otherwise
4668 res_sat = _mm_and_si128(res_xor_a, res_sat);
4669 res = _mm_andnot_si128(res_xor_a, res);
4670 return _mm_or_si128(res, res_sat);
4671}
4672
4673_NEON2SSESTORAGE int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
4674_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD soulution
4675{
4676 _NEON2SSE_ALIGN_16 int64_t atmp[2], btmp[2];
4677 _NEON2SSE_ALIGN_16 uint64_t res[2];
4678 _mm_store_si128((__m128i*)atmp, a);
4679 _mm_store_si128((__m128i*)btmp, b);
4680 res[0] = atmp[0] - btmp[0];
4681 res[1] = atmp[1] - btmp[1];
4682 if (((res[0] ^ atmp[0]) & _SIGNBIT64) && ((atmp[0] ^ btmp[0]) & _SIGNBIT64)) {
4683 res[0] = (atmp[0] >> 63) ^ ~_SIGNBIT64;
4684 }
4685 if (((res[1] ^ atmp[1]) & _SIGNBIT64) && ((atmp[1] ^ btmp[1]) & _SIGNBIT64)) {
4686 res[1] = (atmp[1] >> 63) ^ ~_SIGNBIT64;
4687 }
4688 return _mm_load_si128((__m128i*)res);
4689}
4690
4691_NEON2SSE_GLOBAL uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
4692#define vqsubq_u8 _mm_subs_epu8
4693
4694_NEON2SSE_GLOBAL uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.s16 q0,q0,q0
4695#define vqsubq_u16 _mm_subs_epu16
4696
4697_NEON2SSESTORAGE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
4698_NEON2SSE_INLINE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b) // VQSUB.U32 q0,q0,q0
4699{
4700 __m128i min, mask, sub;
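    //min(a,b) == b exactly when a >= b, so the mask keeps a - b where it is valid and zeroes (saturates) it otherwise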
4701 min = _MM_MIN_EPU32(a, b); //SSE4.1
4702 mask = _mm_cmpeq_epi32 (min, b);
4703 sub = _mm_sub_epi32 (a, b);
4704 return _mm_and_si128 ( sub, mask);
4705}
4706
_NEON2SSESTORAGE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b); // VQSUB.U64 q0,q0,q0
4708#ifdef USE_SSE4
4709 _NEON2SSE_INLINE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b)
4710 {
4711 __m128i c80000000, subb, suba, cmp, sub;
4712 c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0);
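        //c80000000 holds 0x8000000000000000 in each 64-bit lane; subtracting it maps unsigned ordering onto signed ordering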
4713 sub = _mm_sub_epi64 (a, b);
4714 suba = _mm_sub_epi64 (a, c80000000);
4715 subb = _mm_sub_epi64 (b, c80000000);
        cmp = _mm_cmpgt_epi64 ( suba, subb); //no unsigned 64-bit comparison available, so go via signed; _mm_cmpgt_epi64 requires SSE4.2
4717 return _mm_and_si128 (sub, cmp); //saturation
4718 }
4719#else
4720 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
4721 {
4722 _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
4723 _mm_store_si128((__m128i*)atmp, a);
4724 _mm_store_si128((__m128i*)btmp, b);
4725 res[0] = (atmp[0] > btmp[0]) ? atmp[0] - btmp[0] : 0;
4726 res[1] = (atmp[1] > btmp[1]) ? atmp[1] - btmp[1] : 0;
4727 return _mm_load_si128((__m128i*)(res));
4728 }
4729#endif
4730
4731//**********Vector halving subtract Vr[i]:=(Va[i]-Vb[i])>>1 ******************************************************
4732//****************************************************************
4733_NEON2SSESTORAGE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0
4734_NEON2SSE_INLINE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b) // VHSUB.S8 d0,d0,d0
4735{
    //no 8-bit shift available and internal overflow is possible, so widen to 16 bits
4737 int8x8_t res64;
4738 __m128i r16;
4739 int8x8_t r;
4740 r = vsub_s8 (a, b);
4741 r16 = _MM_CVTEPI8_EPI16 (_pM128i(r)); //SSE 4.1
4742 r16 = _mm_srai_epi16 (r16, 1); //SSE2
4743 r16 = _mm_packs_epi16 (r16,r16); //use low 64 bits
4744 return64(r16);
4745}
4746
4747_NEON2SSESTORAGE int16x4_t vhsub_s16(int16x4_t a, int16x4_t b); // VHSUB.S16 d0,d0,d0
4748_NEON2SSE_INLINE int16x4_t vhsub_s16(int16x4_t a, int16x4_t b)
4749{
4750 int16x4_t res64;
4751 return64(vhsubq_s16(_pM128i(a), _pM128i(b)));
4752}
4753
4754
4755
4756_NEON2SSESTORAGE int32x2_t vhsub_s32(int32x2_t a, int32x2_t b); // VHSUB.S32 d0,d0,d0
4757_NEON2SSE_INLINE int32x2_t vhsub_s32(int32x2_t a, int32x2_t b)
4758{
4759 int32x2_t res64;
4760 return64(vhsubq_s32(_pM128i(a), _pM128i(b)));
4761}
4762
4763
4764_NEON2SSESTORAGE uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b); // VHSUB.U8 d0,d0,d0
4765_NEON2SSE_INLINE uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b)
4766{
4767 uint8x8_t res64;
4768 return64(vhsubq_u8(_pM128i(a), _pM128i(b)));
4769}
4770
4771_NEON2SSESTORAGE uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b); // VHSUB.s16 d0,d0,d0
4772_NEON2SSE_INLINE uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b)
4773{
4774 uint16x4_t res64;
4775 return64(vhsubq_u16(_pM128i(a), _pM128i(b)));
4776}
4777
4778_NEON2SSESTORAGE uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b); // VHSUB.U32 d0,d0,d0
4779_NEON2SSE_INLINE uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b)
4780{
4781 uint32x2_t res64;
4782 return64(vhsubq_u32(_pM128i(a), _pM128i(b)));
4783}
4784
4785_NEON2SSESTORAGE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0
4786_NEON2SSE_INLINE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b) // VHSUB.S8 q0,q0,q0
4787{
4788 //need to deal with the possibility of internal overflow
4789 __m128i c128, au,bu;
4790 c128 = _mm_set1_epi8(-128); //(int8_t)0x80
4791 au = _mm_add_epi8( a, c128);
4792 bu = _mm_add_epi8( b, c128);
4793 return vhsubq_u8(au,bu);
4794}
4795
4796_NEON2SSESTORAGE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0
4797_NEON2SSE_INLINE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b) // VHSUB.S16 q0,q0,q0
4798{
4799 //need to deal with the possibility of internal overflow
4800 __m128i c8000, au,bu;
4801 c8000 = _mm_set1_epi16(-32768); //(int16_t)0x8000
4802 au = _mm_add_epi16( a, c8000);
4803 bu = _mm_add_epi16( b, c8000);
4804 return vhsubq_u16(au,bu);
4805}
4806
4807_NEON2SSESTORAGE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0
4808_NEON2SSE_INLINE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b) // VHSUB.S32 q0,q0,q0
4809{
4810 //need to deal with the possibility of internal overflow
4811 __m128i a2, b2,r, b_1;
4812 a2 = _mm_srai_epi32 (a,1);
4813 b2 = _mm_srai_epi32 (b,1);
4814 r = _mm_sub_epi32 (a2, b2);
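    //floor((a-b)/2) == (a>>1) - (b>>1) - ((~a & b) & 1): a correction of 1 is needed only when a is even and b is odd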
    b_1 = _mm_andnot_si128(a, b); //(~a) & b
    b_1 = _mm_slli_epi32 (b_1,31);
    b_1 = _mm_srli_epi32 (b_1,31); //0 or 1: set only if the low bit of b is 1 and the low bit of a is 0
4818 return _mm_sub_epi32(r,b_1);
4819}
4820
4821_NEON2SSESTORAGE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0
4822_NEON2SSE_INLINE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b) // VHSUB.U8 q0,q0,q0
4823{
4824 __m128i avg;
4825 avg = _mm_avg_epu8 (a, b);
4826 return _mm_sub_epi8(a, avg);
4827}
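//_mm_avg_epu8 computes (a+b+1)>>1 without overflow, and a - ((a+b+1)>>1) equals (a-b)>>1 modulo 256
//for all inputs, e.g. a=250, b=4: avg=127 and 250-127=123 == (250-4)>>1. The same identity is used for u16 below.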
4828
4829_NEON2SSESTORAGE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.s16 q0,q0,q0
4830_NEON2SSE_INLINE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b) // VHSUB.s16 q0,q0,q0
4831{
4832 __m128i avg;
4833 avg = _mm_avg_epu16 (a, b);
4834 return _mm_sub_epi16(a, avg);
4835}
4836
4837_NEON2SSESTORAGE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0
4838_NEON2SSE_INLINE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b) // VHSUB.U32 q0,q0,q0
4839{
4840 //need to deal with the possibility of internal overflow
4841 __m128i a2, b2,r, b_1;
4842 a2 = _mm_srli_epi32 (a,1);
4843 b2 = _mm_srli_epi32 (b,1);
4844 r = _mm_sub_epi32 (a2, b2);
    b_1 = _mm_andnot_si128(a, b); //(~a) & b
    b_1 = _mm_slli_epi32 (b_1,31);
    b_1 = _mm_srli_epi32 (b_1,31); //0 or 1: set only if the low bit of b is 1 and the low bit of a is 0
4848 return _mm_sub_epi32(r,b_1);
4849}
4850
4851//******* Vector subtract high half (truncated) ** ************
4852//************************************************************
4853_NEON2SSESTORAGE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0
4854_NEON2SSE_INLINE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b) // VSUBHN.I16 d0,q0,q0
4855{
4856 int8x8_t res64;
4857 __m128i sum, sum8;
4858 sum = _mm_sub_epi16 (a, b);
4859 sum8 = _mm_srai_epi16 (sum, 8);
4860 sum8 = _mm_packs_epi16(sum8,sum8);
4861 return64(sum8);
4862}
4863
4864_NEON2SSESTORAGE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0
4865_NEON2SSE_INLINE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b) // VSUBHN.I32 d0,q0,q0
4866{
4867 int16x4_t res64;
4868 __m128i sum, sum16;
4869 sum = _mm_sub_epi32 (a, b);
4870 sum16 = _mm_srai_epi32 (sum, 16);
4871 sum16 = _mm_packs_epi32(sum16,sum16);
4872 return64(sum16);
4873}
4874
4875_NEON2SSESTORAGE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0
4876_NEON2SSE_INLINE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b)
4877{
4878 int32x2_t res64;
4879 __m128i sub;
4880 sub = _mm_sub_epi64 (a, b);
    sub = _mm_shuffle_epi32(sub, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //move the high 32 bits of each 64-bit difference into the low 64 bits
4882 return64(sub);
4883}
4884
4885_NEON2SSESTORAGE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0
4886_NEON2SSE_INLINE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b) // VSUBHN.I16 d0,q0,q0
4887{
4888 uint8x8_t res64;
4889 __m128i sum, sum8;
4890 sum = _mm_sub_epi16 (a, b);
4891 sum8 = _mm_srli_epi16 (sum, 8);
4892 sum8 = _mm_packus_epi16(sum8,sum8);
4893 return64(sum8);
4894}
4895
4896_NEON2SSESTORAGE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0
4897_NEON2SSE_INLINE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b) // VSUBHN.I32 d0,q0,q0
4898{
4899 uint16x4_t res64;
4900 __m128i sum, sum16;
4901 sum = _mm_sub_epi32 (a, b);
4902 sum16 = _mm_srli_epi32 (sum, 16);
4903#ifdef USE_SSE4
4904 sum16 = _MM_PACKUS1_EPI32(sum16);
4905#else
4906 sum16 = _mm_shuffle_epi8 (sum16, *(__m128i*) mask8_32_even_odd); //go to 16 bits
4907#endif
4908 return64(sum16);
4909}
4910
4911_NEON2SSE_GLOBAL uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0
4912#define vsubhn_u64 vsubhn_s64
4913
4914//************ Vector rounding subtract high half *********************
4915//*********************************************************************
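//Vr[i] := (Va[i] - Vb[i] + (1 << (half_size - 1))) >> half_size, then narrowed to half width;
//e.g. for the 16->8 bit variants the rounding constant is 0x80 and the shift is 8.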
4916_NEON2SSESTORAGE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0
4917_NEON2SSE_INLINE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b) // VRSUBHN.I16 d0,q0,q0
4918{
4919 int8x8_t res64;
4920 __m128i sub, mask1;
4921 sub = _mm_sub_epi16 (a, b);
4922 mask1 = _mm_slli_epi16(sub, 8); //shift left then back right to
    mask1 = _mm_srli_epi16(mask1, 15); //get bit 7: 1 or zero (the rounding bit)
4924 sub = _mm_srai_epi16 (sub, 8); //get high half
4925 sub = _mm_add_epi16 (sub, mask1); //actual rounding
4926 sub = _mm_packs_epi16 (sub, sub);
4927 return64(sub);
4928}
4929
4930_NEON2SSESTORAGE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0
4931_NEON2SSE_INLINE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b) // VRSUBHN.I32 d0,q0,q0
4932{
    //SIMD may not be optimal; a serial version may be faster
4934 int16x4_t res64;
4935 __m128i sub, mask1;
4936 sub = _mm_sub_epi32 (a, b);
4937 mask1 = _mm_slli_epi32(sub, 16); //shift left then back right to
    mask1 = _mm_srli_epi32(mask1,31); //get bit 15: 1 or zero (the rounding bit)
4939 sub = _mm_srai_epi32 (sub, 16); //get high half
4940 sub = _mm_add_epi32 (sub, mask1); //actual rounding
4941 sub = _mm_packs_epi32 (sub, sub);
4942 return64(sub);
4943}
4944
4945_NEON2SSESTORAGE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0
4946_NEON2SSE_INLINE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b)
4947{
    //SIMD may not be optimal; a serial version may be faster
4949 int32x2_t res64;
4950 __m128i sub, mask1;
4951 sub = _mm_sub_epi64 (a, b);
4952 mask1 = _mm_slli_epi64(sub, 32); //shift left then back right to
    mask1 = _mm_srli_epi64(mask1,31); //get bit 31 as 0 or 1 in the upper 32-bit lane (the rounding addend)
4954 sub = _mm_add_epi32 (sub, mask1); //actual high half rounding
    sub = _mm_shuffle_epi32(sub, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //move the rounded high 32 bits of each 64-bit lane into the low 64 bits
4956 return64(sub);
4957}
4958
4959_NEON2SSESTORAGE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0
4960_NEON2SSE_INLINE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b) // VRSUBHN.I16 d0,q0,q0
4961{
4962 uint8x8_t res64;
4963 __m128i sub, mask1;
4964 sub = _mm_sub_epi16 (a, b);
4965 mask1 = _mm_slli_epi16(sub, 8); //shift left then back right to
    mask1 = _mm_srli_epi16(mask1, 15); //get bit 7: 1 or zero (the rounding bit)
4967 sub = _mm_srai_epi16 (sub, 8); //get high half
4968 sub = _mm_add_epi16 (sub, mask1); //actual rounding
4969 sub = _mm_packus_epi16 (sub, sub);
4970 return64(sub);
4971}
4972
4973_NEON2SSESTORAGE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0
4974_NEON2SSE_INLINE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b) // VRSUBHN.I32 d0,q0,q0
4975{
    //SIMD may not be optimal; a serial version may be faster
4977 uint16x4_t res64;
4978 __m128i sub, mask1;
4979 sub = _mm_sub_epi32 (a, b);
4980 mask1 = _mm_slli_epi32(sub, 16); //shift left then back right to
    mask1 = _mm_srli_epi32(mask1,31); //get bit 15: 1 or zero (the rounding bit)
4982 sub = _mm_srai_epi32 (sub, 16); //get high half
4983 sub = _mm_add_epi32 (sub, mask1); //actual rounding
4984#ifdef USE_SSE4
4985 sub = _MM_PACKUS1_EPI32 (sub);
4986#else
4987 sub = _mm_shuffle_epi8 (sub, *(__m128i*) mask8_32_even_odd); //go to 16 bits
4988#endif
4989 return64(sub);
4990}
4991
4992_NEON2SSE_GLOBAL uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0
4993#define vrsubhn_u64 vrsubhn_s64
4994
4995//*********** Vector saturating doubling multiply subtract long ********************
4996//************************************************************************************
4997_NEON2SSESTORAGE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0
4998_NEON2SSE_INLINE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c)
4999{
    //not an optimal SIMD solution; a serial version may be faster
5001 __m128i res32, mask;
5002 int32x4_t res;
5003 _NEON2SSE_ALIGN_16 static const uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
5004 res = vmull_s16(b, c);
5005 res32 = _mm_slli_epi32 (res, 1); //double the result, saturation not considered
5006 mask = _mm_cmpeq_epi32 (res32, *(__m128i*)cmask);
5007 res32 = _mm_xor_si128 (res32, mask); //res32 saturated for 0x80000000
5008 return vqsubq_s32(a, res32); //saturation
5009}
5010
5011_NEON2SSESTORAGE int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0
5012_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
5013{
5014 __m128i res64, mask;
5015 int64x2_t res;
5016 _NEON2SSE_ALIGN_16 static const uint64_t cmask[] = {0x8000000000000000, 0x8000000000000000};
5017 res = vmull_s32(b, c);
5018 res64 = _mm_slli_epi64 (res, 1); //double the result, saturation not considered
5019 mask = _MM_CMPEQ_EPI64 (res64, *(__m128i*)cmask);
    res64 = _mm_xor_si128 (res64, mask); //res64 saturated for 0x8000000000000000
5021 return vqsubq_s64(a, res64); //saturation
5022}
5023
5024//****************** COMPARISON ***************************************
5025//******************* Vector compare equal *************************************
5026//****************************************************************************
5027_NEON2SSESTORAGE uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0
_NEON2SSE_INLINE uint8x8_t vceq_s8(int8x8_t a, int8x8_t b)
5029{
5030 int8x8_t res64;
5031 return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b)));
5032}
5033
5034
5035_NEON2SSESTORAGE uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0
_NEON2SSE_INLINE uint16x4_t vceq_s16(int16x4_t a, int16x4_t b)
5037{
5038 int16x4_t res64;
5039 return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b)));
5040}
5041
5042
5043_NEON2SSESTORAGE uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vceq_s32(int32x2_t a, int32x2_t b)
5045{
5046 int32x2_t res64;
5047 return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b)));
5048}
5049
5050
5051_NEON2SSESTORAGE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0
5052_NEON2SSE_INLINE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b)
5053{
5054 uint32x2_t res64;
5055 __m128 res;
5056 res = _mm_cmpeq_ps(_pM128(a), _pM128(b) );
5057 return64f(res);
5058}
5059
5060_NEON2SSESTORAGE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0
5061_NEON2SSE_INLINE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b)
5062{
5063 uint8x8_t res64;
5064 return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b)));
5065}
5066
5067
5068_NEON2SSESTORAGE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0
5069_NEON2SSE_INLINE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b)
5070{
5071 uint16x4_t res64;
5072 return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b)));
5073}
5074
5075
5076_NEON2SSESTORAGE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0
5077_NEON2SSE_INLINE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b)
5078{
5079 uint32x2_t res64;
5080 return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b)));
5081}
5082
5083
5084_NEON2SSE_GLOBAL uint8x8_t vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0
5085#define vceq_p8 vceq_u8
5086
5087
5088_NEON2SSE_GLOBAL uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0
5089#define vceqq_s8 _mm_cmpeq_epi8
5090
5091_NEON2SSE_GLOBAL uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0
5092#define vceqq_s16 _mm_cmpeq_epi16
5093
5094_NEON2SSE_GLOBAL uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0
5095#define vceqq_s32 _mm_cmpeq_epi32
5096
5097_NEON2SSESTORAGE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0
5098_NEON2SSE_INLINE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b)
5099{
5100 __m128 res;
5101 res = _mm_cmpeq_ps(a,b);
5102 return _M128i(res);
5103}
5104
5105_NEON2SSE_GLOBAL uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0
5106#define vceqq_u8 _mm_cmpeq_epi8
5107
5108_NEON2SSE_GLOBAL uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0
5109#define vceqq_u16 _mm_cmpeq_epi16
5110
5111_NEON2SSE_GLOBAL uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0
5112#define vceqq_u32 _mm_cmpeq_epi32
5113
5114_NEON2SSE_GLOBAL uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0
5115#define vceqq_p8 _mm_cmpeq_epi8
5116
5117//******************Vector compare greater-than or equal*************************
5118//*******************************************************************************
//IA SIMD has no greater-than-or-equal comparison for integers;
//only greater-than is available, so the following tricks are needed
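//For the unsigned variants the operands are either handled with unsigned min/max (where SSE provides it)
//or biased by 0x80/0x8000/0x80000000 so that a signed compare reproduces the unsigned ordering,
//e.g. uint8 255 vs 1 becomes 127 vs -127, and 127 > -127 gives the correct result, while a plain
//signed compare of -1 vs 1 would not.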
5121
5122_NEON2SSESTORAGE uint8x8_t vcge_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
_NEON2SSE_INLINE uint8x8_t vcge_s8(int8x8_t a, int8x8_t b)
5124{
5125 int8x8_t res64;
5126 return64(vcgeq_s8(_pM128i(a), _pM128i(b)));
5127}
5128
5129
5130_NEON2SSESTORAGE uint16x4_t vcge_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
_NEON2SSE_INLINE uint16x4_t vcge_s16(int16x4_t a, int16x4_t b)
5132{
5133 int16x4_t res64;
5134 return64(vcgeq_s16(_pM128i(a), _pM128i(b)));
5135}
5136
5137
5138_NEON2SSESTORAGE uint32x2_t vcge_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vcge_s32(int32x2_t a, int32x2_t b)
5140{
5141 int32x2_t res64;
5142 return64(vcgeq_s32(_pM128i(a), _pM128i(b)));
5143}
5144
5145
5146_NEON2SSESTORAGE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
5147_NEON2SSE_INLINE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b)
5148{
5149 uint32x2_t res64;
5150 __m128 res;
    res = _mm_cmpge_ps(_pM128(a),_pM128(b)); //only the first 2 lanes are meaningful
5152 return64f(res);
5153}
5154
5155_NEON2SSESTORAGE uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
5156_NEON2SSE_INLINE uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b)
5157{
5158 uint8x8_t res64;
5159 return64(vcgeq_u8(_pM128i(a), _pM128i(b)));
5160}
5161
5162
5163_NEON2SSESTORAGE uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b); // VCGE.s16 d0, d0, d0
5164_NEON2SSE_INLINE uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b)
5165{
5166 uint16x4_t res64;
5167 return64(vcgeq_u16(_pM128i(a), _pM128i(b)));
5168}
5169
5170
5171_NEON2SSESTORAGE uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
5172_NEON2SSE_INLINE uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b)
5173{
5174 //serial solution looks faster
5175 uint32x2_t res64;
5176 return64(vcgeq_u32 (_pM128i(a), _pM128i(b)));
5177}
5178
5179
5180
5181_NEON2SSESTORAGE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
5182_NEON2SSE_INLINE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0
5183{
5184 __m128i m1, m2;
5185 m1 = _mm_cmpgt_epi8 ( a, b);
5186 m2 = _mm_cmpeq_epi8 ( a, b);
5187 return _mm_or_si128 ( m1, m2);
5188}
5189
5190_NEON2SSESTORAGE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
5191_NEON2SSE_INLINE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0
5192{
5193 __m128i m1, m2;
5194 m1 = _mm_cmpgt_epi16 ( a, b);
5195 m2 = _mm_cmpeq_epi16 ( a, b);
5196 return _mm_or_si128 ( m1,m2);
5197}
5198
5199_NEON2SSESTORAGE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
5200_NEON2SSE_INLINE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0
5201{
5202 __m128i m1, m2;
5203 m1 = _mm_cmpgt_epi32 (a, b);
5204 m2 = _mm_cmpeq_epi32 (a, b);
5205 return _mm_or_si128 (m1, m2);
5206}
5207
5208_NEON2SSESTORAGE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
5209_NEON2SSE_INLINE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b)
5210{
5211 __m128 res;
    res = _mm_cmpge_ps(a,b);
5213 return *(__m128i*)&res;
5214}
5215
5216_NEON2SSESTORAGE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
5217_NEON2SSE_INLINE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
5218{
    //no unsigned byte comparison, only signed is available, so a trick is needed
5220 __m128i cmp;
5221 cmp = _mm_max_epu8(a, b);
5222 return _mm_cmpeq_epi8(cmp, a); //a>=b
5223}
5224
5225_NEON2SSESTORAGE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0
5226_NEON2SSE_INLINE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
5227{
    //no unsigned short comparison, only signed is available, so a trick is needed
5229#ifdef USE_SSE4
5230 __m128i cmp;
5231 cmp = _mm_max_epu16(a, b);
5232 return _mm_cmpeq_epi16(cmp, a); //a>=b
5233#else
5234 __m128i zero = _mm_setzero_si128();
5235 __m128i as = _mm_subs_epu16(b, a);
5236 return _mm_cmpeq_epi16(as, zero);
5237#endif
5238}
5239
5240_NEON2SSESTORAGE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
5241_NEON2SSE_INLINE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
5242{
    //no unsigned int comparison, only signed is available, so a trick is needed
5244#ifdef USE_SSE4
5245 __m128i cmp;
5246 cmp = _mm_max_epu32(a, b);
5247 return _mm_cmpeq_epi32(cmp, a); //a>=b
5248#else
5249 //serial solution may be faster
5250 __m128i c80000000, as, bs, m1, m2;
5251 c80000000 = _mm_set1_epi32 (0x80000000);
5252 as = _mm_sub_epi32(a,c80000000);
5253 bs = _mm_sub_epi32(b,c80000000);
5254 m1 = _mm_cmpgt_epi32 (as, bs);
5255 m2 = _mm_cmpeq_epi32 (as, bs);
5256 return _mm_or_si128 ( m1, m2);
5257#endif
5258}
5259
5260//**********************Vector compare less-than or equal******************************
5261//***************************************************************************************
//IA SIMD has no less-than-or-equal comparison for integers, so the following tricks are needed
5263
5264_NEON2SSESTORAGE uint8x8_t vcle_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
_NEON2SSE_INLINE uint8x8_t vcle_s8(int8x8_t a, int8x8_t b)
5266{
5267 int8x8_t res64;
5268 return64(vcleq_s8(_pM128i(a), _pM128i(b)));
5269}
5270
5271
5272_NEON2SSESTORAGE uint16x4_t vcle_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
_NEON2SSE_INLINE uint16x4_t vcle_s16(int16x4_t a, int16x4_t b)
5274{
5275 int16x4_t res64;
5276 return64(vcleq_s16(_pM128i(a), _pM128i(b)));
5277}
5278
5279
5280_NEON2SSESTORAGE uint32x2_t vcle_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vcle_s32(int32x2_t a, int32x2_t b)
5282{
5283 int32x2_t res64;
5284 return64(vcleq_s32(_pM128i(a), _pM128i(b)));
5285}
5286
5287
5288_NEON2SSESTORAGE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0?
5289_NEON2SSE_INLINE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b)
5290{
5291 uint32x2_t res64;
5292 __m128 res;
5293 res = _mm_cmple_ps(_pM128(a),_pM128(b));
5294 return64f(res);
5295}
5296
5297_NEON2SSE_GLOBAL uint8x8_t vcle_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
5298#define vcle_u8(a,b) vcge_u8(b,a)
5299
5300
5301_NEON2SSE_GLOBAL uint16x4_t vcle_u16(uint16x4_t a, uint16x4_t b); // VCGE.s16 d0, d0, d0
5302#define vcle_u16(a,b) vcge_u16(b,a)
5303
5304
5305_NEON2SSE_GLOBAL uint32x2_t vcle_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
5306#define vcle_u32(a,b) vcge_u32(b,a)
5307
5308_NEON2SSESTORAGE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
5309_NEON2SSE_INLINE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0
5310{
5311 __m128i c1, res;
5312 c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff....
5313 res = _mm_cmpgt_epi8 ( a, b);
    return _mm_andnot_si128 (res, c1); //invert the cmpgt result to get less-than-or-equal
5315}
5316
5317_NEON2SSESTORAGE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
5318_NEON2SSE_INLINE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0
5319{
5320 __m128i c1, res;
5321 c1 = _mm_cmpeq_epi16 (a,a); //all ones 0xff....
5322 res = _mm_cmpgt_epi16 ( a, b);
5323 return _mm_andnot_si128 (res, c1);
5324}
5325
5326_NEON2SSESTORAGE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
5327_NEON2SSE_INLINE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0
5328{
5329 __m128i c1, res;
5330 c1 = _mm_cmpeq_epi32 (a,a); //all ones 0xff....
5331 res = _mm_cmpgt_epi32 ( a, b);
5332 return _mm_andnot_si128 (res, c1);
5333}
5334
5335_NEON2SSESTORAGE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
5336_NEON2SSE_INLINE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b)
5337{
5338 __m128 res;
5339 res = _mm_cmple_ps(a,b);
5340 return *(__m128i*)&res;
5341}
5342
5343_NEON2SSESTORAGE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
5344#ifdef USE_SSE4
5345 _NEON2SSE_INLINE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
5346 {
        //no unsigned byte comparison in SSE, only signed is available, so a trick is needed
5348 __m128i cmp;
5349 cmp = _mm_min_epu8(a, b);
5350 return _mm_cmpeq_epi8(cmp, a); //a<=b
5351 }
5352#else
5353 _NEON2SSE_INLINE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
5354 {
5355 return vcgeq_u8(b, a);
5356 }
5357#endif
5358
5359_NEON2SSESTORAGE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0
5360#ifdef USE_SSE4
5361 _NEON2SSE_INLINE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
5362 {
        //no unsigned short comparison in SSE, only signed is available, so a trick is needed
5364 __m128i cmp;
5365 cmp = _mm_min_epu16(a, b);
5366 return _mm_cmpeq_epi16(cmp, a); //a<=b
5367 }
5368#else
5369 _NEON2SSE_INLINE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
5370 {
5371 return vcgeq_u16(b, a);
5372 }
5373#endif
5374
5375_NEON2SSESTORAGE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
5376#ifdef USE_SSE4
5377 _NEON2SSE_INLINE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
5378 {
        //no unsigned int comparison in SSE, only signed is available, so a trick is needed
5380 __m128i cmp;
5381 cmp = _mm_min_epu32(a, b);
5382 return _mm_cmpeq_epi32(cmp, a); //a<=b
5383 }
5384#else
5385 _NEON2SSE_INLINE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
5386 {
5387 return vcgeq_u32(b, a);
5388 }
5389#endif
5390
5391
5392//****** Vector compare greater-than ******************************************
5393//**************************************************************************
5394_NEON2SSESTORAGE uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
_NEON2SSE_INLINE uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b)
5396{
5397 int8x8_t res64;
5398 return64(_mm_cmpgt_epi8(_pM128i(a),_pM128i(b)));
5399}
5400
5401
5402_NEON2SSESTORAGE uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
_NEON2SSE_INLINE uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b)
5404{
5405 int16x4_t res64;
5406 return64(_mm_cmpgt_epi16(_pM128i(a),_pM128i(b)));
5407}
5408
5409
5410_NEON2SSESTORAGE uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b)
5412{
5413 int32x2_t res64;
5414 return64(_mm_cmpgt_epi32(_pM128i(a),_pM128i(b)));
5415}
5416
5417
5418_NEON2SSESTORAGE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
5419_NEON2SSE_INLINE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b)
5420{
5421 uint32x2_t res64;
5422 __m128 res;
    res = _mm_cmpgt_ps(_pM128(a),_pM128(b)); //only the first 2 lanes are meaningful
5424 return64f(res);
5425}
5426
5427_NEON2SSESTORAGE uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
5428_NEON2SSE_INLINE uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b)
5429{
5430 uint8x8_t res64;
5431 return64(vcgtq_u8(_pM128i(a), _pM128i(b)));
5432}
5433
5434
5435_NEON2SSESTORAGE uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b); // VCGT.s16 d0, d0, d0
5436_NEON2SSE_INLINE uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b)
5437{
5438 uint16x4_t res64;
5439 return64(vcgtq_u16(_pM128i(a), _pM128i(b)));
5440}
5441
5442
5443_NEON2SSESTORAGE uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
5444_NEON2SSE_INLINE uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b)
5445{
5446 uint32x2_t res64;
5447 return64(vcgtq_u32(_pM128i(a), _pM128i(b)));
5448}
5449
5450
5451_NEON2SSE_GLOBAL uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
5452#define vcgtq_s8 _mm_cmpgt_epi8
5453
5454_NEON2SSE_GLOBAL uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
5455#define vcgtq_s16 _mm_cmpgt_epi16
5456
5457_NEON2SSE_GLOBAL uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
5458#define vcgtq_s32 _mm_cmpgt_epi32
5459
5460_NEON2SSESTORAGE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
5461_NEON2SSE_INLINE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b)
5462{
5463 __m128 res;
    res = _mm_cmpgt_ps(a,b);
5465 return *(__m128i*)&res;
5466}
5467
5468_NEON2SSESTORAGE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
5469_NEON2SSE_INLINE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b) // VCGT.U8 q0, q0, q0
5470{
    //no unsigned byte comparison, only signed is available, so a trick is needed
5472 __m128i c128, as, bs;
5473 c128 = _mm_set1_epi8(-128); //(int8_t)0x80
5474 as = _mm_sub_epi8(a, c128);
5475 bs = _mm_sub_epi8(b, c128);
5476 return _mm_cmpgt_epi8(as, bs);
5477}
5478
5479_NEON2SSESTORAGE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0
5480_NEON2SSE_INLINE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b) // VCGT.s16 q0, q0, q0
5481{
    //no unsigned short comparison, only signed is available, so a trick is needed
5483 __m128i c8000, as, bs;
5484 c8000 = _mm_set1_epi16(-32768); //(int16_t)0x8000
5485 as = _mm_sub_epi16(a, c8000);
5486 bs = _mm_sub_epi16(b, c8000);
5487 return _mm_cmpgt_epi16(as, bs);
5488}
5489
5490_NEON2SSESTORAGE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
5491_NEON2SSE_INLINE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b) // VCGT.U32 q0, q0, q0
5492{
    //no unsigned int comparison, only signed is available, so a trick is needed
5494 __m128i c80000000, as, bs;
5495 c80000000 = _mm_set1_epi32 (0x80000000);
5496 as = _mm_sub_epi32(a,c80000000);
5497 bs = _mm_sub_epi32(b,c80000000);
5498 return _mm_cmpgt_epi32 ( as, bs);
5499}
5500
5501//********************* Vector compare less-than **************************
5502//*************************************************************************
5503_NEON2SSE_GLOBAL uint8x8_t vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
5504#define vclt_s8(a,b) vcgt_s8(b,a) //swap the arguments!!
5505
5506
5507_NEON2SSE_GLOBAL uint16x4_t vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
5508#define vclt_s16(a,b) vcgt_s16(b,a) //swap the arguments!!
5509
5510
5511_NEON2SSE_GLOBAL uint32x2_t vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
5512#define vclt_s32(a,b) vcgt_s32(b,a) //swap the arguments!!
5513
5514
5515_NEON2SSE_GLOBAL uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
5516#define vclt_f32(a,b) vcgt_f32(b, a) //swap the arguments!!
5517
5518_NEON2SSE_GLOBAL uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
5519#define vclt_u8(a,b) vcgt_u8(b,a) //swap the arguments!!
5520
5521_NEON2SSE_GLOBAL uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.s16 d0, d0, d0
5522#define vclt_u16(a,b) vcgt_u16(b,a) //swap the arguments!!
5523
5524_NEON2SSE_GLOBAL uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
5525#define vclt_u32(a,b) vcgt_u32(b,a) //swap the arguments!!
5526
5527_NEON2SSE_GLOBAL uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
5528#define vcltq_s8(a,b) vcgtq_s8(b, a) //swap the arguments!!
5529
5530_NEON2SSE_GLOBAL uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
5531#define vcltq_s16(a,b) vcgtq_s16(b, a) //swap the arguments!!
5532
5533_NEON2SSE_GLOBAL uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
5534#define vcltq_s32(a,b) vcgtq_s32(b, a) //swap the arguments!!
5535
5536_NEON2SSE_GLOBAL uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
5537#define vcltq_f32(a,b) vcgtq_f32(b, a) //swap the arguments!!
5538
5539_NEON2SSE_GLOBAL uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
5540#define vcltq_u8(a,b) vcgtq_u8(b, a) //swap the arguments!!
5541
5542_NEON2SSE_GLOBAL uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0
5543#define vcltq_u16(a,b) vcgtq_u16(b, a) //swap the arguments!!
5544
5545_NEON2SSE_GLOBAL uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
5546#define vcltq_u32(a,b) vcgtq_u32(b, a) //swap the arguments!!
5547
5548//*****************Vector compare absolute greater-than or equal ************
5549//***************************************************************************
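//The absolute values are taken by clearing the IEEE-754 sign bit with a 0x7fffffff mask before the float compare.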
5550_NEON2SSESTORAGE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
5551_NEON2SSE_INLINE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b)
5552{
5553 uint32x2_t res64;
5554 __m128i c7fffffff;
5555 __m128 a0, b0;
5556 c7fffffff = _mm_set1_epi32 (0x7fffffff);
5557 a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
5558 b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
5559 a0 = _mm_cmpge_ps ( a0, b0);
5560 return64f(a0);
5561}
5562
5563_NEON2SSESTORAGE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
5564_NEON2SSE_INLINE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0
5565{
5566 __m128i c7fffffff;
5567 __m128 a0, b0;
5568 c7fffffff = _mm_set1_epi32 (0x7fffffff);
5569 a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
5570 b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
5571 a0 = _mm_cmpge_ps ( a0, b0);
5572 return (*(__m128i*)&a0);
5573}
5574
5575//********Vector compare absolute less-than or equal ******************
5576//********************************************************************
5577_NEON2SSESTORAGE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
5578_NEON2SSE_INLINE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b)
5579{
5580 uint32x2_t res64;
5581 __m128i c7fffffff;
5582 __m128 a0, b0;
5583 c7fffffff = _mm_set1_epi32 (0x7fffffff);
5584 a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
5585 b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
5586 a0 = _mm_cmple_ps (a0, b0);
5587 return64f(a0);
5588}
5589
5590_NEON2SSESTORAGE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
5591_NEON2SSE_INLINE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0
5592{
5593 __m128i c7fffffff;
5594 __m128 a0, b0;
5595 c7fffffff = _mm_set1_epi32 (0x7fffffff);
5596 a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
5597 b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
5598 a0 = _mm_cmple_ps (a0, b0);
5599 return (*(__m128i*)&a0);
5600}
5601
5602//******** Vector compare absolute greater-than ******************
5603//******************************************************************
5604_NEON2SSESTORAGE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
5605_NEON2SSE_INLINE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b)
5606{
5607 uint32x2_t res64;
5608 __m128i c7fffffff;
5609 __m128 a0, b0;
5610 c7fffffff = _mm_set1_epi32 (0x7fffffff);
5611 a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
5612 b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
5613 a0 = _mm_cmpgt_ps (a0, b0);
5614 return64f(a0);
5615}
5616
5617_NEON2SSESTORAGE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
5618_NEON2SSE_INLINE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0
5619{
5620 __m128i c7fffffff;
5621 __m128 a0, b0;
5622 c7fffffff = _mm_set1_epi32 (0x7fffffff);
5623 a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
5624 b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
5625 a0 = _mm_cmpgt_ps (a0, b0);
5626 return (*(__m128i*)&a0);
5627}
5628
5629//***************Vector compare absolute less-than ***********************
5630//*************************************************************************
5631_NEON2SSESTORAGE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
5632_NEON2SSE_INLINE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b)
5633{
5634 uint32x2_t res64;
5635 __m128i c7fffffff;
5636 __m128 a0, b0;
5637 c7fffffff = _mm_set1_epi32 (0x7fffffff);
5638 a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
5639 b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
5640 a0 = _mm_cmplt_ps (a0, b0);
5641 return64f(a0);
5642}
5643
5644_NEON2SSESTORAGE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
5645_NEON2SSE_INLINE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0
5646{
5647 __m128i c7fffffff;
5648 __m128 a0, b0;
5649 c7fffffff = _mm_set1_epi32 (0x7fffffff);
5650 a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
5651 b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
5652 a0 = _mm_cmplt_ps (a0, b0);
5653 return (*(__m128i*)&a0);
5654}
5655
5656//*************************Vector test bits************************************
5657//*****************************************************************************
/*VTST (Vector Test Bits) bitwise ANDs each element of the first vector with the
corresponding element of the second vector. If the result is not zero, the
corresponding element in the destination vector is set to all ones; otherwise it is
set to all zeros. */
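//e.g. with 8-bit lanes, a = 0x0F and b = 0xF1 give a & b = 0x01 != 0 -> 0xFF, while a = 0x0F and b = 0xF0 give 0x00.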
5662
5663_NEON2SSESTORAGE uint8x8_t vtst_s8(int8x8_t a, int8x8_t b); // VTST.8 d0, d0, d0
5664_NEON2SSE_INLINE uint8x8_t vtst_s8(int8x8_t a, int8x8_t b)
5665{
5666 int8x8_t res64;
5667 return64(vtstq_s8(_pM128i(a), _pM128i(b)));
5668}
5669
5670
5671_NEON2SSESTORAGE uint16x4_t vtst_s16(int16x4_t a, int16x4_t b); // VTST.16 d0, d0, d0
5672_NEON2SSE_INLINE uint16x4_t vtst_s16(int16x4_t a, int16x4_t b)
5673{
5674 int16x4_t res64;
5675 return64(vtstq_s16(_pM128i(a), _pM128i(b)));
5676}
5677
5678
5679_NEON2SSESTORAGE uint32x2_t vtst_s32(int32x2_t a, int32x2_t b); // VTST.32 d0, d0, d0
5680_NEON2SSE_INLINE uint32x2_t vtst_s32(int32x2_t a, int32x2_t b)
5681{
5682 int32x2_t res64;
5683 return64(vtstq_s32(_pM128i(a), _pM128i(b)));
5684}
5685
5686
5687_NEON2SSE_GLOBAL uint8x8_t vtst_u8(uint8x8_t a, uint8x8_t b); // VTST.8 d0, d0, d0
5688#define vtst_u8 vtst_s8
5689
5690_NEON2SSE_GLOBAL uint16x4_t vtst_u16(uint16x4_t a, uint16x4_t b); // VTST.16 d0, d0, d0
5691#define vtst_u16 vtst_s16
5692
5693_NEON2SSE_GLOBAL uint32x2_t vtst_u32(uint32x2_t a, uint32x2_t b); // VTST.32 d0, d0, d0
5694#define vtst_u32 vtst_s32
5695
5696
5697_NEON2SSE_GLOBAL uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0
5698#define vtst_p8 vtst_u8
5699
5700_NEON2SSESTORAGE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0
5701_NEON2SSE_INLINE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b) // VTST.8 q0, q0, q0
5702{
5703 __m128i zero, one, res;
5704 zero = _mm_setzero_si128 ();
5705 one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
5706 res = _mm_and_si128 (a, b);
5707 res = _mm_cmpeq_epi8 (res, zero);
5708 return _mm_xor_si128(res, one); //invert result
5709}
5710
5711_NEON2SSESTORAGE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0
5712_NEON2SSE_INLINE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b) // VTST.16 q0, q0, q0
5713{
5714 __m128i zero, one, res;
5715 zero = _mm_setzero_si128 ();
5716 one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
5717 res = _mm_and_si128 (a, b);
5718 res = _mm_cmpeq_epi16 (res, zero);
5719 return _mm_xor_si128(res, one); //invert result
5720}
5721
5722_NEON2SSESTORAGE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0
5723_NEON2SSE_INLINE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b) // VTST.32 q0, q0, q0
5724{
5725 __m128i zero, one, res;
5726 zero = _mm_setzero_si128 ();
5727 one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
5728 res = _mm_and_si128 (a, b);
5729 res = _mm_cmpeq_epi32 (res, zero);
5730 return _mm_xor_si128(res, one); //invert result
5731}
5732
5733_NEON2SSE_GLOBAL uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0
5734#define vtstq_u8 vtstq_s8
5735
5736_NEON2SSE_GLOBAL uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0
5737#define vtstq_u16 vtstq_s16
5738
5739_NEON2SSE_GLOBAL uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0
5740#define vtstq_u32 vtstq_s32
5741
5742_NEON2SSE_GLOBAL uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0
5743#define vtstq_p8 vtstq_u8
5744
5745//****************** Absolute difference ********************
5746//*** Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |*****
5747//************************************************************
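//The result keeps only the lane-sized low bits of |Va[i] - Vb[i]|:
//e.g. for int8 lanes |100 - (-100)| = 200 is returned as 0xC8, which is what the compare/select code below produces.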
5748_NEON2SSESTORAGE int8x8_t vabd_s8(int8x8_t a, int8x8_t b); // VABD.S8 d0,d0,d0
5749_NEON2SSE_INLINE int8x8_t vabd_s8(int8x8_t a, int8x8_t b)
5750{
5751 int8x8_t res64;
5752 return64(vabdq_s8(_pM128i(a), _pM128i(b)));
5753}
5754
5755_NEON2SSESTORAGE int16x4_t vabd_s16(int16x4_t a, int16x4_t b); // VABD.S16 d0,d0,d0
5756_NEON2SSE_INLINE int16x4_t vabd_s16(int16x4_t a, int16x4_t b)
5757{
5758 int16x4_t res64;
5759 return64(vabdq_s16(_pM128i(a), _pM128i(b)));
5760}
5761
5762_NEON2SSESTORAGE int32x2_t vabd_s32(int32x2_t a, int32x2_t b); // VABD.S32 d0,d0,d0
5763_NEON2SSE_INLINE int32x2_t vabd_s32(int32x2_t a, int32x2_t b)
5764{//need to deal with an intermediate overflow
5765 int32x2_t res;
5766 res.m64_i32[0] = (a.m64_i32[0] > b.m64_i32[0]) ? a.m64_i32[0] - b.m64_i32[0]: b.m64_i32[0] - a.m64_i32[0];
5767 res.m64_i32[1] = (a.m64_i32[1] > b.m64_i32[1]) ? a.m64_i32[1] - b.m64_i32[1]: b.m64_i32[1] - a.m64_i32[1];
5768 return res;
5769}
5770
5771_NEON2SSESTORAGE uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b); // VABD.U8 d0,d0,d0
5772_NEON2SSE_INLINE uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b)
5773{
5774 uint8x8_t res64;
5775 return64(vabdq_u8(_pM128i(a), _pM128i(b)));
5776}
5777
5778_NEON2SSESTORAGE uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b); // VABD.s16 d0,d0,d0
5779_NEON2SSE_INLINE uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b)
5780{
5781 uint16x4_t res64;
5782 return64(vabdq_u16(_pM128i(a), _pM128i(b)));
5783}
5784
5785_NEON2SSESTORAGE uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b); // VABD.U32 d0,d0,d0
5786_NEON2SSE_INLINE uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b)
5787{
5788 uint32x2_t res64;
5789 return64(vabdq_u32(_pM128i(a), _pM128i(b)));
5790}
5791
5792_NEON2SSESTORAGE float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0
5793_NEON2SSE_INLINE float32x2_t vabd_f32(float32x2_t a, float32x2_t b)
5794{
5795 float32x4_t res;
5796 __m64_128 res64;
5797 res = vabdq_f32(_pM128(a), _pM128(b));
5798 _M64f(res64, res);
5799 return res64;
5800}
5801
5802_NEON2SSESTORAGE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0
5803_NEON2SSE_INLINE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b) // VABD.S8 q0,q0,q0
5804{ //need to deal with an intermediate overflow
5805 __m128i cmp, difab, difba;
5806 cmp = vcgtq_s8(a,b);
5807 difab = _mm_sub_epi8(a,b);
5808 difba = _mm_sub_epi8(b,a);
5809 difab = _mm_and_si128(cmp, difab);
5810 difba = _mm_andnot_si128(cmp, difba);
5811 return _mm_or_si128(difab, difba);
5812}
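//Note on the compare/select sequence above: a plain _mm_abs_epi8(_mm_sub_epi8(a, b)) would be wrong
//whenever the intermediate difference overflows 8 bits. For instance, with a lane holding a = 127 and
//b = -128 the difference 255 wraps to -1 and its absolute value is 1 instead of the expected bit
//pattern 0xff; selecting between a-b and b-a with the comparison mask reproduces the NEON result.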
5813
5814_NEON2SSESTORAGE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0
5815_NEON2SSE_INLINE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b) // VABD.S16 q0,q0,q0
5816{//need to deal with an intermediate overflow
5817 __m128i cmp, difab, difba;
5818 cmp = vcgtq_s16(a,b);
5819 difab = _mm_sub_epi16(a,b);
5820 difba = _mm_sub_epi16 (b,a);
5821 difab = _mm_and_si128(cmp, difab);
5822 difba = _mm_andnot_si128(cmp, difba);
5823 return _mm_or_si128(difab, difba);
5824}
5825
5826_NEON2SSESTORAGE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0
5827_NEON2SSE_INLINE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b) // VABD.S32 q0,q0,q0
5828{//need to deal with an intermediate overflow
5829 __m128i cmp, difab, difba;
5830 cmp = vcgtq_s32(a,b);
5831 difab = _mm_sub_epi32(a,b);
5832 difba = _mm_sub_epi32(b,a);
5833 difab = _mm_and_si128(cmp, difab);
5834 difba = _mm_andnot_si128(cmp, difba);
5835 return _mm_or_si128(difab, difba);
5836}
5837
5838_NEON2SSESTORAGE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0
5839_NEON2SSE_INLINE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b) //no abs for unsigned
5840{
5841 __m128i difab, difba;
5842 difab = _mm_subs_epu8(a,b);
5843 difba = _mm_subs_epu8 (b,a);
5844 return _mm_or_si128(difab, difba);
5845}
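//Note on the saturating subtractions above: for unsigned lanes one of _mm_subs_epu8(a,b) and
//_mm_subs_epu8(b,a) always saturates to zero, so OR-ing them gives |a - b| directly.
//E.g. (hypothetical lane values) a = 10, b = 250: the two results are 0 and 240, and 0 | 240 = 240 = |10 - 250|.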
5846
5847_NEON2SSESTORAGE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.s16 q0,q0,q0
5848_NEON2SSE_INLINE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b)
5849{
5850 __m128i difab, difba;
5851 difab = _mm_subs_epu16(a,b);
5852 difba = _mm_subs_epu16 (b,a);
5853 return _mm_or_si128(difab, difba);
5854}
5855
5856_NEON2SSESTORAGE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0
5857_NEON2SSE_INLINE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b)
5858{
5859 __m128i cmp, difab, difba;
5860 cmp = vcgtq_u32(a,b);
5861 difab = _mm_sub_epi32(a,b);
5862 difba = _mm_sub_epi32 (b,a);
5863 difab = _mm_and_si128(cmp, difab);
5864 difba = _mm_andnot_si128(cmp, difba);
5865 return _mm_or_si128(difab, difba);
5866}
5867
5868_NEON2SSESTORAGE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0
5869_NEON2SSE_INLINE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b) // VABD.F32 q0,q0,q0
5870{
5871 __m128i c1;
5872 __m128 res;
    c1 = _mm_set1_epi32(0x7fffffff); //mask that clears the IEEE-754 sign bit, i.e. takes the absolute value
5874 res = _mm_sub_ps (a, b);
5875 return _mm_and_ps (res, *(__m128*)&c1);
5876}
5877
5878//************ Absolute difference - long **************************
5879//********************************************************************
5880_NEON2SSESTORAGE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0
5881_NEON2SSE_INLINE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b) // VABDL.S8 q0,d0,d0
5882{
5883 __m128i a16, b16;
5884 a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
5885 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
5886 return vabdq_s16(a16, b16);
5887
5888}
5889
5890_NEON2SSESTORAGE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0
5891_NEON2SSE_INLINE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b) // VABDL.S16 q0,d0,d0
5892{
5893 __m128i a32, b32;
5894 a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
5895 b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
5896 return vabdq_s32(a32, b32);
5897}
5898
5899_NEON2SSESTORAGE int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0
5900_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabdl_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL)
5901{
5902 //no optimal SIMD solution, serial looks faster
5903 _NEON2SSE_ALIGN_16 int64_t res[2];
5904 if(a.m64_i32[0] > b.m64_i32[0]) res[0] = ( int64_t) a.m64_i32[0] - ( int64_t) b.m64_i32[0];
5905 else res[0] = ( int64_t) b.m64_i32[0] - ( int64_t) a.m64_i32[0];
5906 if(a.m64_i32[1] > b.m64_i32[1]) res[1] = ( int64_t) a.m64_i32[1] - ( int64_t) b.m64_i32[1];
5907 else res[1] = ( int64_t) b.m64_i32[1] - ( int64_t) a.m64_i32[1];
5908 return _mm_load_si128((__m128i*)res);
5909}
5910
5911_NEON2SSESTORAGE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0
5912_NEON2SSE_INLINE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b)
5913{
5914 __m128i res;
5915 res = vsubl_u8(a,b);
5916 return _mm_abs_epi16(res);
5917}
5918
5919_NEON2SSESTORAGE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.s16 q0,d0,d0
5920_NEON2SSE_INLINE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b)
5921{
5922 __m128i res;
5923 res = vsubl_u16(a,b);
5924 return _mm_abs_epi32(res);
5925}
5926
5927_NEON2SSESTORAGE uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0
5928_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
5929{
5930 _NEON2SSE_ALIGN_16 uint64_t res[2];
5931 if(a.m64_u32[0] > b.m64_u32[0]) res[0] = ( uint64_t) a.m64_u32[0] - ( uint64_t) b.m64_u32[0];
5932 else res[0] = ( uint64_t) b.m64_u32[0] - ( uint64_t) a.m64_u32[0];
5933 if(a.m64_u32[1] > b.m64_u32[1]) res[1] = ( uint64_t) a.m64_u32[1] - ( uint64_t) b.m64_u32[1];
5934 else res[1] = ( uint64_t) b.m64_u32[1] - ( uint64_t) a.m64_u32[1];
5935 return _mm_load_si128((__m128i*)res);
5936}
5937
5938//**********Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] | *************
5939//*********************************************************************************************
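//Illustrative example (hypothetical lane values): with Va = {100, ...}, Vb = {20, ...} and Vc = {50, ...}
//the first lane of the result is 100 + |20 - 50| = 130.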
5940_NEON2SSESTORAGE int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0
5941_NEON2SSE_INLINE int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c)
5942{
5943 int8x8_t res64;
5944 return64(vabaq_s8(_pM128i(a),_pM128i(b), _pM128i(c)));
5945}
5946
5947_NEON2SSESTORAGE int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0
5948_NEON2SSE_INLINE int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c)
5949{
5950 int16x4_t res64;
5951 return64(vabaq_s16(_pM128i(a), _pM128i(b), _pM128i(c)));
5952}
5953
5954_NEON2SSESTORAGE int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0
5955_NEON2SSE_INLINE int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c)
5956{
5957 int32x2_t res64;
5958 return64(vabaq_s32(_pM128i(a), _pM128i(b), _pM128i(c)));
5959}
5960
5961_NEON2SSESTORAGE uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0
5962_NEON2SSE_INLINE uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c)
5963{
    uint8x8_t res64;
5965 return64(vabaq_u8(_pM128i(a),_pM128i(b), _pM128i(c)));
5966}
5967
5968
5969_NEON2SSESTORAGE uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VABA.s16 d0,d0,d0
5970_NEON2SSE_INLINE uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c)
5971{
    uint16x4_t res64;
5973 return64(vabaq_u16(_pM128i(a), _pM128i(b), _pM128i(c)));
5974}
5975
5976_NEON2SSESTORAGE uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0
5977_NEON2SSE_INLINE uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c)
5978{
5979 uint32x2_t res64;
5980 return64(vabaq_u32(_pM128i(a), _pM128i(b), _pM128i(c)));
5981}
5982
5983_NEON2SSESTORAGE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0
5984_NEON2SSE_INLINE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VABA.S8 q0,q0,q0
5985{
5986 int8x16_t sub;
5987 sub = vabdq_s8(b, c);
5988 return vaddq_s8( a, sub);
5989}
5990
5991_NEON2SSESTORAGE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0
5992_NEON2SSE_INLINE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VABA.S16 q0,q0,q0
5993{
5994 int16x8_t sub;
5995 sub = vabdq_s16(b, c);
5996 return vaddq_s16( a, sub);
5997}
5998
5999_NEON2SSESTORAGE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0
6000_NEON2SSE_INLINE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VABA.S32 q0,q0,q0
6001{
6002 int32x4_t sub;
6003 sub = vabdq_s32(b, c);
6004 return vaddq_s32( a, sub);
6005}
6006
6007_NEON2SSESTORAGE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0
6008_NEON2SSE_INLINE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c)
6009{
6010 uint8x16_t sub;
6011 sub = vabdq_u8(b, c);
6012 return vaddq_u8( a, sub);
6013}
6014
6015_NEON2SSESTORAGE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.s16 q0,q0,q0
6016_NEON2SSE_INLINE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c)
6017{
6018 uint16x8_t sub;
6019 sub = vabdq_u16(b, c);
6020 return vaddq_u16( a, sub);
6021}
6022
6023_NEON2SSESTORAGE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0
6024_NEON2SSE_INLINE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c)
6025{
6026 uint32x4_t sub;
6027 sub = vabdq_u32(b, c);
6028 return vaddq_u32( a, sub);
6029}
6030
6031//************** Absolute difference and accumulate - long ********************************
6032//*************************************************************************************
6033_NEON2SSESTORAGE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0
6034_NEON2SSE_INLINE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VABAL.S8 q0,d0,d0
6035{
6036 __m128i b16, c16, res;
6037 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
6038 c16 = _MM_CVTEPI8_EPI16 (_pM128i(c)); //SSE4.1,
6039 res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) );
6040 return _mm_add_epi16 (a, res);
6041}
6042
6043_NEON2SSESTORAGE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0
6044_NEON2SSE_INLINE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VABAL.S16 q0,d0,d0
6045{
6046 __m128i b32, c32, res;
6047 b32 = _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1
6048 c32 = _MM_CVTEPI16_EPI32(_pM128i(c)); //SSE4.1
6049 res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) );
6050 return _mm_add_epi32 (a, res);
6051}
6052
6053_NEON2SSESTORAGE int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0
6054_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
6055{
6056 __m128i res;
6057 res = vabdl_s32(b,c);
6058 return _mm_add_epi64(a, res);
6059}
6060
6061_NEON2SSESTORAGE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0
6062_NEON2SSE_INLINE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c)
6063{
6064 __m128i b16, c16, res;
6065 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
6066 c16 = _MM_CVTEPU8_EPI16 (_pM128i(c)); //SSE4.1,
6067 res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) );
6068 return _mm_add_epi16 (a, res);
6069}
6070
6071_NEON2SSESTORAGE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.s16 q0,d0,d0
6072_NEON2SSE_INLINE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c)
6073{
6074 __m128i b32, c32, res;
6075 b32 = _MM_CVTEPU16_EPI32(_pM128i(b)); //SSE4.1
6076 c32 = _MM_CVTEPU16_EPI32(_pM128i(c)); //SSE4.1
6077 res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) );
6078 return _mm_add_epi32 (a, res);
6079}
6080
6081_NEON2SSESTORAGE uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0
6082_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
6083{
6084 __m128i res;
6085 res = vabdl_u32(b,c);
6086 return _mm_add_epi64(a, res);
6087}
6088
6089//***********************************************************************************
6090//**************** Maximum and minimum operations **********************************
6091//***********************************************************************************
6092//************* Maximum: vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i] *******
6093//***********************************************************************************
6094_NEON2SSESTORAGE int8x8_t vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0
6095_NEON2SSE_INLINE int8x8_t vmax_s8(int8x8_t a, int8x8_t b)
6096{
6097 int8x8_t res64;
6098 __m128i res;
6099 res = _MM_MAX_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
6100 return64(res);
6101}
6102
6103_NEON2SSESTORAGE int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0
6104_NEON2SSE_INLINE int16x4_t vmax_s16(int16x4_t a, int16x4_t b)
6105{
6106 int16x4_t res64;
6107 return64(_mm_max_epi16(_pM128i(a),_pM128i(b)));
6108}
6109
6110_NEON2SSESTORAGE int32x2_t vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0
6111_NEON2SSE_INLINE int32x2_t vmax_s32(int32x2_t a, int32x2_t b)
6112{
6113 int32x2_t res64;
6114 __m128i res;
6115 res = _MM_MAX_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
6116 return64(res);
6117}
6118
6119_NEON2SSESTORAGE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0
6120_NEON2SSE_INLINE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b)
6121{
6122 uint8x8_t res64;
6123 return64(_mm_max_epu8(_pM128i(a),_pM128i(b)));
6124}
6125
6126
6127_NEON2SSESTORAGE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.s16 d0,d0,d0
6128_NEON2SSE_INLINE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b)
6129{
6130 uint16x4_t res64;
6131 return64(_MM_MAX_EPU16(_pM128i(a),_pM128i(b)));
6132}
6133
6134
6135_NEON2SSESTORAGE uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0
6136_NEON2SSE_INLINE uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b)
6137{
6138 uint32x2_t res64;
6139 __m128i res;
    res = _MM_MAX_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits; may not be as efficient as a serial solution
6141 return64(res);
6142}
6143
6144_NEON2SSESTORAGE float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0
6145_NEON2SSE_INLINE float32x2_t vmax_f32(float32x2_t a, float32x2_t b)
6146{
    //serial solution looks faster than the SIMD one
6148 float32x2_t res;
6149 res.m64_f32[0] = (a.m64_f32[0] > b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0];
6150 res.m64_f32[1] = (a.m64_f32[1] > b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1];
6151 return res;
6152}
6153
6154_NEON2SSE_GLOBAL int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0
6155#define vmaxq_s8 _MM_MAX_EPI8 //SSE4.1
6156
6157_NEON2SSE_GLOBAL int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0
6158#define vmaxq_s16 _mm_max_epi16
6159
6160_NEON2SSE_GLOBAL int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0
6161#define vmaxq_s32 _MM_MAX_EPI32 //SSE4.1
6162
6163_NEON2SSE_GLOBAL uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0
6164#define vmaxq_u8 _mm_max_epu8
6165
6166_NEON2SSE_GLOBAL uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.s16 q0,q0,q0
6167#define vmaxq_u16 _MM_MAX_EPU16 //SSE4.1
6168
6169_NEON2SSE_GLOBAL uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0
6170#define vmaxq_u32 _MM_MAX_EPU32 //SSE4.1
6171
6172
6173_NEON2SSE_GLOBAL float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0
6174#define vmaxq_f32 _mm_max_ps
6175
6176
6177_NEON2SSE_GLOBAL float64x2_t vmaxq_f64(float64x2_t a, float64x2_t b); // VMAX.F64 q0,q0,q0
6178#define vmaxq_f64 _mm_max_pd
6179
6180
6181//*************** Minimum: vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i] ********************************
6182//***********************************************************************************************************
6183_NEON2SSESTORAGE int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0
6184_NEON2SSE_INLINE int8x8_t vmin_s8(int8x8_t a, int8x8_t b)
6185{
6186 int8x8_t res64;
6187 __m128i res;
6188 res = _MM_MIN_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
6189 return64(res);
6190}
6191
6192_NEON2SSESTORAGE int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0
6193_NEON2SSE_INLINE int16x4_t vmin_s16(int16x4_t a, int16x4_t b)
6194{
6195 int16x4_t res64;
6196 return64(_mm_min_epi16(_pM128i(a),_pM128i(b)));
6197}
6198
6199
6200_NEON2SSESTORAGE int32x2_t vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0
6201_NEON2SSE_INLINE int32x2_t vmin_s32(int32x2_t a, int32x2_t b)
6202{
6203 int32x2_t res64;
6204 __m128i res;
6205 res = _MM_MIN_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
6206 return64(res);
6207}
6208
6209_NEON2SSESTORAGE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0
6210_NEON2SSE_INLINE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b)
6211{
6212 uint8x8_t res64;
6213 return64(_mm_min_epu8(_pM128i(a),_pM128i(b)));
6214}
6215
6216
6217_NEON2SSESTORAGE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.s16 d0,d0,d0
6218_NEON2SSE_INLINE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b)
6219{
6220 uint16x4_t res64;
6221 return64(_MM_MIN_EPU16(_pM128i(a),_pM128i(b)));
6222}
6223
6224
6225_NEON2SSESTORAGE uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0
6226_NEON2SSE_INLINE uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b)
6227{
6228 uint32x2_t res64;
6229 __m128i res;
    res = _MM_MIN_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits; may not be as efficient as a serial solution
6231 return64(res);
6232}
6233
6234_NEON2SSESTORAGE float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0
6235_NEON2SSE_INLINE float32x2_t vmin_f32(float32x2_t a, float32x2_t b)
6236{
    //serial solution looks faster than the SIMD one
6238 float32x2_t res;
6239 res.m64_f32[0] = (a.m64_f32[0] < b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0];
6240 res.m64_f32[1] = (a.m64_f32[1] < b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1];
6241 return res;
6242}
6243
6244_NEON2SSE_GLOBAL int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0
6245#define vminq_s8 _MM_MIN_EPI8 //SSE4.1
6246
6247_NEON2SSE_GLOBAL int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0
6248#define vminq_s16 _mm_min_epi16
6249
6250_NEON2SSE_GLOBAL int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0
6251#define vminq_s32 _MM_MIN_EPI32 //SSE4.1
6252
6253_NEON2SSE_GLOBAL uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0
6254#define vminq_u8 _mm_min_epu8
6255
6256_NEON2SSE_GLOBAL uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.s16 q0,q0,q0
6257#define vminq_u16 _MM_MIN_EPU16 //SSE4.1
6258
6259_NEON2SSE_GLOBAL uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0
6260#define vminq_u32 _MM_MIN_EPU32 //SSE4.1
6261
6262_NEON2SSE_GLOBAL float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0
6263#define vminq_f32 _mm_min_ps
6264
6265
6266_NEON2SSE_GLOBAL float64x2_t vminq_f64(float64x2_t a, float64x2_t b); // VMIN.F64 q0,q0,q0
6267#define vminq_f64 _mm_min_pd
6268
6269
6270//************* Pairwise addition operations. **************************************
6271//************************************************************************************
6272//Pairwise add - adds adjacent pairs of elements of two vectors, and places the results in the destination vector
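//The pair sums of the first operand land in the low half of the result and those of the second operand
//in the high half, e.g. vpadd_s16(a, b) = {a0+a1, a2+a3, b0+b1, b2+b3} (lane layout shown for illustration)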
6273_NEON2SSESTORAGE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0
6274_NEON2SSE_INLINE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b) // VPADD.I8 d0,d0,d0
6275{
6276 //no 8 bit hadd in IA32, need to go to 16 bit and then pack
6277 int8x8_t res64;
6278 __m128i a16, b16, res;
6279 a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
6280 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1
6281 res = _mm_hadd_epi16 (a16, b16);
6282 res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit, use low 64 bits
6283 return64(res);
6284}
6285
6286_NEON2SSESTORAGE int16x4_t vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0
6287_NEON2SSE_INLINE int16x4_t vpadd_s16(int16x4_t a, int16x4_t b)
6288{
6289 int16x4_t res64;
6290 __m128i hadd128;
6291 hadd128 = _mm_hadd_epi16 (_pM128i(a), _pM128i(b));
6292 hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
6293 return64(hadd128);
6294}
6295
6296
6297_NEON2SSESTORAGE int32x2_t vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0
6298_NEON2SSE_INLINE int32x2_t vpadd_s32(int32x2_t a, int32x2_t b)
6299{
6300 int32x2_t res64;
6301 __m128i hadd128;
6302 hadd128 = _mm_hadd_epi32 (_pM128i(a), _pM128i(b));
6303 hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
6304 return64(hadd128);
6305}
6306
6307
6308_NEON2SSESTORAGE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0
6309_NEON2SSE_INLINE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b) // VPADD.I8 d0,d0,d0
6310{
6311 // no 8 bit hadd in IA32, need to go to 16 bit and then pack
6312 uint8x8_t res64;
// no unsigned _mm_hadd_ functions in IA32, but 8-bit unsigned values fit within the 16-bit signed range, so it works
6314 __m128i mask8, a16, b16, res;
6315 mask8 = _mm_set1_epi16(0xff);
6316 a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1
6317 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1
6318 res = _mm_hadd_epi16 (a16, b16);
6319 res = _mm_and_si128(res, mask8); //to avoid saturation
6320 res = _mm_packus_epi16 (res,res); //use low 64 bits
6321 return64(res);
6322}
6323
6324_NEON2SSESTORAGE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0
6325_NEON2SSE_INLINE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b) // VPADD.I16 d0,d0,d0
6326{
    // solution may not be optimal; serial execution may be faster
6328 // no unsigned _mm_hadd_ functions in IA32, need to move from unsigned to signed
6329 uint16x4_t res64;
6330 __m128i c32767, cfffe, as, bs, res;
6331 c32767 = _mm_set1_epi16 (32767);
6332 cfffe = _mm_set1_epi16 (-2); //(int16_t)0xfffe
6333 as = _mm_sub_epi16 (_pM128i(a), c32767);
6334 bs = _mm_sub_epi16 (_pM128i(b), c32767);
6335 res = _mm_hadd_epi16 (as, bs);
6336 res = _mm_add_epi16 (res, cfffe);
6337 res = _mm_shuffle_epi32 (res, 0 | (2 << 2) | (1 << 4) | (3 << 6));
6338 return64(res);
6339}
6340
6341_NEON2SSESTORAGE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0
6342_NEON2SSE_INLINE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b) //serial may be faster
6343{
6344 //hadd doesn't work for unsigned values
6345 uint32x2_t res64;
6346 __m128i ab, ab_sh, res;
6347 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //a0 a1 b0 b1
6348 ab_sh = _mm_shuffle_epi32(ab, 1 | (0 << 2) | (3 << 4) | (2 << 6)); //a1, a0, b1, b0
6349 res = _mm_add_epi32(ab, ab_sh);
6350 res = _mm_shuffle_epi32(res, 0 | (2 << 2) | (1 << 4) | (3 << 6));
6351 return64(res);
6352}
6353
6354_NEON2SSESTORAGE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0
6355_NEON2SSE_INLINE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b)
6356{
6357 __m128 hadd128;
6358 __m64_128 res64;
6359 hadd128 = _mm_hadd_ps (_pM128(a), _pM128(b));
6360 hadd128 = _mm_shuffle_ps (hadd128, hadd128, _MM_SHUFFLE(3,1, 2, 0)); //use low 64 bits
6361 _M64f(res64, hadd128);
6362 return res64;
6363}
6364
6365
6366//************************** Long pairwise add **********************************
6367//*********************************************************************************
//Adds adjacent pairs of elements of a vector, sign- or zero-extends the results to twice their original width,
6369// and places the final results in the destination vector.
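//Illustrative example (hypothetical lane values): vpaddl_u8 on {250, 10, 3, 4, ...} yields the 16-bit lanes
//{260, 7, ...}; because the results are widened, the pair sums cannot overflow.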
6370
6371_NEON2SSESTORAGE int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0
6372_NEON2SSE_INLINE int16x4_t vpaddl_s8(int8x8_t a) // VPADDL.S8 d0,d0
6373{
6374 //no 8 bit hadd in IA32, need to go to 16 bit anyway
6375 __m128i a16;
6376 int16x4_t res64;
6377 a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
6378 a16 = _mm_hadd_epi16 (a16, a16); //use low 64 bits
6379 return64(a16);
6380}
6381
6382_NEON2SSESTORAGE int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0
6383_NEON2SSE_INLINE int32x2_t vpaddl_s16(int16x4_t a) // VPADDL.S16 d0,d0
6384{
    // solution may not be optimal; serial execution may be faster
6386 int32x2_t res64;
6387 __m128i r32_1;
6388 r32_1 = _MM_CVTEPI16_EPI32 (_pM128i(a));
6389 r32_1 = _mm_hadd_epi32(r32_1, r32_1); //use low 64 bits
6390 return64(r32_1);
6391}
6392
6393_NEON2SSESTORAGE int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0
6394_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vpaddl_s32(int32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster
6395{
6396 int64x1_t res;
6397 res.m64_i64[0] = (int64_t)a.m64_i32[0] + (int64_t)a.m64_i32[1];
6398 return res;
6399}
6400
6401_NEON2SSESTORAGE uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0
6402_NEON2SSE_INLINE uint16x4_t vpaddl_u8(uint8x8_t a) // VPADDL.U8 d0,d0
6403{
6404 // no 8 bit hadd in IA32, need to go to 16 bit
// no unsigned _mm_hadd_ functions in IA32, but 8-bit unsigned values fit within the 16-bit signed range, so it works
6406 uint16x4_t res64;
6407 __m128i a16;
6408 a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits
6409 a16 = _mm_hadd_epi16 (a16, a16); //use low 64 bits
6410 return64(a16);
6411}
6412
6413_NEON2SSESTORAGE uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.s16 d0,d0
6414_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpaddl_u16(uint16x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
6415{
6416 //serial solution looks faster than a SIMD one
6417 uint32x2_t res;
6418 res.m64_u32[0] = (uint32_t)a.m64_u16[0] + (uint32_t)a.m64_u16[1];
6419 res.m64_u32[1] = (uint32_t)a.m64_u16[2] + (uint32_t)a.m64_u16[3];
6420 return res;
6421}
6422
6423_NEON2SSESTORAGE uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0
6424_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vpaddl_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster
6425{
6426 uint64x1_t res;
6427 res.m64_u64[0] = (uint64_t)a.m64_u32[0] + (uint64_t)a.m64_u32[1];
6428 return res;
6429}
6430
6431_NEON2SSESTORAGE int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0
6432_NEON2SSE_INLINE int16x8_t vpaddlq_s8(int8x16_t a) // VPADDL.S8 q0,q0
6433{
6434 //no 8 bit hadd in IA32, need to go to 16 bit
6435 __m128i r16_1, r16_2;
6436 r16_1 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1
    //swap the high and low halves of a to process the remaining data
6438 r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
6439 r16_2 = _MM_CVTEPI8_EPI16 (r16_2);
6440 return _mm_hadd_epi16 (r16_1, r16_2);
6441}
6442
6443_NEON2SSESTORAGE int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0
6444_NEON2SSE_INLINE int32x4_t vpaddlq_s16(int16x8_t a) // VPADDL.S16 q0,q0
6445{
    //widen the 16-bit lanes to 32 bit and use the 32-bit horizontal add
6447 __m128i r32_1, r32_2;
6448 r32_1 = _MM_CVTEPI16_EPI32(a);
    //swap the high and low halves of a to process the remaining data
6450 r32_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
6451 r32_2 = _MM_CVTEPI16_EPI32 (r32_2);
6452 return _mm_hadd_epi32 (r32_1, r32_2);
6453}
6454
6455_NEON2SSESTORAGE int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0
6456_NEON2SSE_INLINE int64x2_t vpaddlq_s32(int32x4_t a)
6457{
6458 __m128i top, bot;
6459 bot = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
6460 bot = _MM_CVTEPI32_EPI64(bot);
6461 top = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 1));
6462 top = _MM_CVTEPI32_EPI64(top);
6463 return _mm_add_epi64(top, bot);
6464}
6465
6466_NEON2SSESTORAGE uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0
6467_NEON2SSE_INLINE uint16x8_t vpaddlq_u8(uint8x16_t a) // VPADDL.U8 q0,q0
6468{
6469 const __m128i ff = _mm_set1_epi16(0xFF);
6470 __m128i low = _mm_and_si128(a, ff);
6471 __m128i high = _mm_srli_epi16(a, 8);
6472 return _mm_add_epi16(low, high);
6473}
6474
6475#ifdef USE_SSE4
6476_NEON2SSESTORAGE uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.s16 q0,q0
6477_NEON2SSE_INLINE uint32x4_t vpaddlq_u16(uint16x8_t a)
6478{
6479 const __m128i zero = _mm_setzero_si128();
    __m128i low = _mm_blend_epi16(zero, a, 0x55); // 0b01010101
6481 __m128i high = _mm_srli_epi32(a, 16);
6482 return _mm_add_epi32(low, high);
6483}
6484
6485_NEON2SSESTORAGE uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
6486_NEON2SSE_INLINE uint64x2_t vpaddlq_u32(uint32x4_t a)
6487{
6488 const __m128i zero = _mm_setzero_si128();
6489 __m128i low = _mm_blend_epi16(zero, a, 0x33); // 0b00110011
6490 __m128i high = _mm_srli_epi64(a, 32);
6491 return _mm_add_epi64(low, high);
6492}
6493#else
6494_NEON2SSESTORAGE uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.s16 q0,q0
6495_NEON2SSE_INLINE uint32x4_t vpaddlq_u16(uint16x8_t a)
6496{
6497 const __m128i ff = _mm_set1_epi32(0xFFFF);
6498 __m128i low = _mm_and_si128(a, ff);
6499 __m128i high = _mm_srli_epi32(a, 16);
6500 return _mm_add_epi32(low, high);
6501}
6502
6503_NEON2SSESTORAGE uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
6504_NEON2SSE_INLINE uint64x2_t vpaddlq_u32(uint32x4_t a)
6505{
6506 const __m128i ff = _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF);
6507 __m128i low = _mm_and_si128(a, ff);
6508 __m128i high = _mm_srli_epi64(a, 32);
6509 return _mm_add_epi64(low, high);
6510}
6511#endif
6512
6513//************************ Long pairwise add and accumulate **************************
6514//****************************************************************************************
6515//VPADAL (Vector Pairwise Add and Accumulate Long) adds adjacent pairs of elements of a vector,
6516// and accumulates the values of the results into the elements of the destination (wide) vector
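//In other words Vr[i] = Va[i] + Vb[2*i] + Vb[2*i+1]; e.g. (hypothetical lane values) vpadal_s8 with
//a = {1000, ...} and b = {100, 27, ...} produces {1127, ...}.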
6517_NEON2SSESTORAGE int16x4_t vpadal_s8(int16x4_t a, int8x8_t b); // VPADAL.S8 d0,d0
6518_NEON2SSE_INLINE int16x4_t vpadal_s8(int16x4_t a, int8x8_t b)
6519{
6520 int16x4_t res64;
6521 return64(vpadalq_s8(_pM128i(a), _pM128i(b)));
6522}
6523
6524_NEON2SSESTORAGE int32x2_t vpadal_s16(int32x2_t a, int16x4_t b); // VPADAL.S16 d0,d0
6525_NEON2SSE_INLINE int32x2_t vpadal_s16(int32x2_t a, int16x4_t b)
6526{
6527 int32x2_t res64;
6528 return64(vpadalq_s16(_pM128i(a), _pM128i(b)));
6529}
6530
6531
6532_NEON2SSESTORAGE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0
6533_NEON2SSE_INLINE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b)
6534{
6535 int64x1_t res;
6536 res.m64_i64[0] = (int64_t)b.m64_i32[0] + (int64_t)b.m64_i32[1] + a.m64_i64[0];
6537 return res;
6538}
6539
6540_NEON2SSESTORAGE uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b); // VPADAL.U8 d0,d0
6541_NEON2SSE_INLINE uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b)
6542{
6543 uint16x4_t res64;
6544 return64(vpadalq_u8(_pM128i(a), _pM128i(b)));
6545}
6546
6547
6548_NEON2SSESTORAGE uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b); // VPADAL.s16 d0,d0
6549_NEON2SSE_INLINE uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b)
6550{
6551 uint32x2_t res64;
6552 return64(vpadalq_u16(_pM128i(a), _pM128i(b)));
6553}
6554
6555_NEON2SSESTORAGE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0
6556_NEON2SSE_INLINE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b)
6557{
6558 uint64x1_t res;
6559 res.m64_u64[0] = (uint64_t)b.m64_u32[0] + (uint64_t)b.m64_u32[1] + a.m64_u64[0];
6560 return res;
6561}
6562
6563_NEON2SSESTORAGE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0
6564_NEON2SSE_INLINE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b) // VPADAL.S8 q0,q0
6565{
6566 int16x8_t pad;
6567 pad = vpaddlq_s8(b);
6568 return _mm_add_epi16 (a, pad);
6569}
6570
6571_NEON2SSESTORAGE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0
6572_NEON2SSE_INLINE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b) // VPADAL.S16 q0,q0
6573{
6574 int32x4_t pad;
6575 pad = vpaddlq_s16(b);
6576 return _mm_add_epi32(a, pad);
6577}
6578
6579_NEON2SSESTORAGE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0
6580_NEON2SSE_INLINE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b)
6581{
6582 int64x2_t pad;
6583 pad = vpaddlq_s32(b);
6584 return _mm_add_epi64 (a, pad);
6585}
6586
6587_NEON2SSESTORAGE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0
6588_NEON2SSE_INLINE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b) // VPADAL.U8 q0,q0
6589{
6590 uint16x8_t pad;
6591 pad = vpaddlq_u8(b);
6592 return _mm_add_epi16 (a, pad);
6593}
6594
6595_NEON2SSESTORAGE uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.s16 q0,q0
6596_NEON2SSE_INLINE uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b)
6597{
6598 uint32x4_t pad;
6599 pad = vpaddlq_u16(b);
6600 return _mm_add_epi32(a, pad);
}
6602
6603_NEON2SSESTORAGE uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0
6604_NEON2SSE_INLINE uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b)
6605{
6606 uint64x2_t pad;
6607 pad = vpaddlq_u32(b);
6608 return _mm_add_epi64(a, pad);
6609}
6610
6611//********** Folding maximum *************************************
6612//*******************************************************************
6613//VPMAX (Vector Pairwise Maximum) compares adjacent pairs of elements in two vectors,
6614//and copies the larger of each pair into the corresponding element in the destination
6615// no corresponding functionality in IA32 SIMD, so we need to do the vertical comparison
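//The result layout matches vpadd, e.g. vpmax_s16(a, b) = {max(a0,a1), max(a2,a3), max(b0,b1), max(b2,b3)};
//the implementations below swap the members of each pair with a byte shuffle and take a vertical maximum of the two layouts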
6616_NEON2SSESTORAGE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0
6617_NEON2SSE_INLINE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b) // VPMAX.S8 d0,d0,d0
6618{
6619 int8x8_t res64;
6620 __m128i ab, ab1, max;
6621 _NEON2SSE_ALIGN_16 static const uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
6622 _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
6623 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical max finding
6625 max = _MM_MAX_EPI8 (ab, ab1); // SSE4.1
6626 max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data
6627 return64(max); //we need 64 bits only
6628}
6629
6630_NEON2SSESTORAGE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0
6631_NEON2SSE_INLINE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b) // VPMAX.S16 d0,d0,d0
6632{
    //solution may not be optimal compared with the serial one
6634 int16x4_t res64;
6635 __m128i ab, ab1, max;
    _NEON2SSE_ALIGN_16 static const int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of bytes is considered to be a 16-bit number
6637 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical max finding, use the 8-bit fn and the corresponding mask
6639 max = _mm_max_epi16 (ab, ab1);
6640 max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
6641 return64(max);
6642}
6643
6644_NEON2SSESTORAGE int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0
6645_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmax_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6646{
    //serial solution looks faster than the SIMD one
6648 int32x2_t res;
6649 res.m64_i32[0] = (a.m64_i32[0] < a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0];
6650 res.m64_i32[1] = (b.m64_i32[0] < b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0];
6651 return res;
6652}
6653
6654_NEON2SSESTORAGE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0
6655_NEON2SSE_INLINE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b) // VPMAX.U8 d0,d0,d0
6656{
6657 uint8x8_t res64;
6658 __m128i ab, ab1, max;
6659 _NEON2SSE_ALIGN_16 static const int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
6660 _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
6661 ab = _mm_unpacklo_epi64 (_pM128i(a), _pM128i(b)); //ab
    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical max finding
6663 max = _mm_max_epu8 (ab, ab1); // SSE4.1
6664 max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data
6665 return64(max);
6666}
6667
6668_NEON2SSESTORAGE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.s16 d0,d0,d0
6669_NEON2SSE_INLINE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b) // VPMAX.s16 d0,d0,d0
6670{
    //solution may not be optimal compared with the serial one
6672 uint16x4_t res64;
6673 __m128i ab, ab1, max;
    _NEON2SSE_ALIGN_16 static const uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of bytes is considered to be a 16-bit number
6675 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical max finding, use the 8-bit fn and the corresponding mask
6677 max = _MM_MAX_EPU16 (ab, ab1);
6678 max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
6679 return64(max);
6680}
6681
6682_NEON2SSESTORAGE uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0
6683_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6684{
    //serial solution looks faster than the SIMD one
6686 uint32x2_t res;
6687 res.m64_u32[0] = (a.m64_u32[0] < a.m64_u32[1]) ? a.m64_u32[1] : a.m64_u32[0];
6688 res.m64_u32[1] = (b.m64_u32[0] < b.m64_u32[1]) ? b.m64_u32[1] : b.m64_u32[0];
6689 return res;
6690}
6691
6692_NEON2SSESTORAGE float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0
6693_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmax_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6694{
    //serial solution looks faster than the SIMD one
6696 float32x2_t res;
6697 res.m64_f32[0] = (a.m64_f32[0] < a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0];
6698 res.m64_f32[1] = (b.m64_f32[0] < b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0];
6699 return res;
6700}
6701
6702// ***************** Folding minimum ****************************
6703// **************************************************************
6704//vpmin -> takes minimum of adjacent pairs
6705_NEON2SSESTORAGE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0
6706_NEON2SSE_INLINE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b) // VPMIN.S8 d0,d0,d0
6707{
6708 int8x8_t res64;
6709 __m128i ab, ab1, min;
6710 _NEON2SSE_ALIGN_16 static const uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
6711 _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
6712 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical min finding
6714 min = _MM_MIN_EPI8 (ab, ab1); // SSE4.1
6715 min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data
6716 return64(min);
6717}
6718
6719_NEON2SSESTORAGE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0
6720_NEON2SSE_INLINE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b) // VPMIN.S16 d0,d0,d0
6721{
    //solution may not be optimal compared with the serial one
6723 int16x4_t res64;
6724 __m128i ab, ab1, min;
    _NEON2SSE_ALIGN_16 static const int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of bytes is considered to be a 16-bit number
6726 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical min finding, use the 8-bit fn and the corresponding mask
6728 min = _mm_min_epi16 (ab, ab1);
6729 min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
6730 return64(min);
6731}
6732
6733_NEON2SSESTORAGE int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0
6734_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmin_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6735{
    //serial solution looks faster than the SIMD one
6737 int32x2_t res;
6738 res.m64_i32[0] = (a.m64_i32[0] > a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0];
6739 res.m64_i32[1] = (b.m64_i32[0] > b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0];
6740 return res;
6741}
6742
6743_NEON2SSESTORAGE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0
6744_NEON2SSE_INLINE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b) // VPMIN.U8 d0,d0,d0
6745{
6746 uint8x8_t res64;
6747 __m128i ab, ab1, min;
6748 _NEON2SSE_ALIGN_16 static const uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
6749 _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
6750 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical min finding
6752 min = _mm_min_epu8 (ab, ab1); // SSE4.1
6753 min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data
6754 return64(min);
6755}
6756
6757_NEON2SSESTORAGE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.s16 d0,d0,d0
6758_NEON2SSE_INLINE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b) // VPMIN.s16 d0,d0,d0
6759{
    //solution may not be optimal compared with the serial one
6761 uint16x4_t res64;
6762 __m128i ab, ab1, min;
    _NEON2SSE_ALIGN_16 static const uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of bytes is considered to be a 16-bit number
6764 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical min finding, use the 8-bit fn and the corresponding mask
6766 min = _MM_MIN_EPU16 (ab, ab1);
6767 min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
6768 return64(min);
6769}
6770
6771_NEON2SSESTORAGE uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0
6772_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6773{
    //serial solution looks faster than the SIMD one
6775 uint32x2_t res;
6776 res.m64_u32[0] = (a.m64_u32[0] > a.m64_u32[1]) ? a.m64_u32[1] : a.m64_u32[0];
6777 res.m64_u32[1] = (b.m64_u32[0] > b.m64_u32[1]) ? b.m64_u32[1] : b.m64_u32[0];
6778 return res;
6779}
6780
6781_NEON2SSESTORAGE float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0
6782_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmin_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6783{
    //serial solution looks faster than the SIMD one
6785 float32x2_t res;
6786 res.m64_f32[0] = (a.m64_f32[0] > a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0];
6787 res.m64_f32[1] = (b.m64_f32[0] > b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0];
6788 return res;
6789}
6790
6791//***************************************************************
6792//*********** Reciprocal/Sqrt ************************************
6793//***************************************************************
6794//****************** Reciprocal estimate *******************************
6795//the ARM NEON and x86 SIMD results may be slightly different
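//(_mm_rcp_ps guarantees a relative error of at most 1.5*2^-12, while the NEON estimate is quantized to
//steps of 1/256, as in the serial code below; hence the possible small differences.)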
6796_NEON2SSESTORAGE float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0
6797_NEON2SSE_INLINE float32x2_t vrecpe_f32(float32x2_t a) //use low 64 bits
6798{
6799 float32x4_t res;
6800 __m64_128 res64;
6801 res = _mm_rcp_ps(_pM128(a));
6802 _M64f(res64, res);
6803 return res64;
6804}
6805
6806_NEON2SSESTORAGE uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0
6807_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrecpe_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
6808{
6809 //Input is fixed point number!!! No reciprocal for ints in IA32 available
6810 uint32x2_t res;
6811 float resf, r;
6812 int i, q, s;
6813 for (i =0; i<2; i++){
6814 if((a.m64_u32[i] & 0x80000000) == 0) {
6815 res.m64_u32[i] = 0xffffffff;
6816 }else{
6817 resf = (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31)));
6818 q = (int)(resf * 512.0f); /* a in units of 1/512 rounded down */
6819 r = (float)(1.0f / (((float)q + 0.5f) / 512.0f)); /* reciprocal r */
6820 s = (int)(256.0f * r + 0.5f); /* r in units of 1/256 rounded to nearest */
6821 r = (float)s / 256.0f;
6822 res.m64_u32[i] = (uint32_t)(r * (uint32_t)(1 << 31));
6823 }
6824 }
6825 return res;
6826}
6827
6828_NEON2SSE_GLOBAL float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0
6829#define vrecpeq_f32 _mm_rcp_ps
6830
6831
6832_NEON2SSESTORAGE uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0
6833_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrecpeq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
6834{
6835 //Input is fixed point number!!!
6836 //We implement the recip_estimate function as described in ARMv7 reference manual (VRECPE instruction) but use float instead of double
6837 _NEON2SSE_ALIGN_16 uint32_t atmp[4];
6838 _NEON2SSE_ALIGN_16 uint32_t res[4];
6839 _NEON2SSE_ALIGN_16 static const uint32_t c80000000[4] = {0x80000000,0x80000000, 0x80000000,0x80000000};
6840 float resf, r;
6841 int i, q, s;
6842 __m128i res128, mask, zero;
6843 _mm_store_si128((__m128i*)atmp, a);
6844 zero = _mm_setzero_si128();
6845 for (i =0; i<4; i++){
6846 resf = (atmp[i] * (0.5f / (uint32_t) (1 << 31))); // 2.3283064365386963E-10 ~(0.5f / (uint32_t) (1 << 31))
6847 q = (int)(resf * 512.0f); /* a in units of 1/512 rounded down */
6848 r = 1.0f / (((float)q + 0.5f) / 512.0f); /* reciprocal r */
6849 s = (int)(256.0f * r + 0.5f); /* r in units of 1/256 rounded to nearest */
6850 r = (float)s / 256.0f;
6851 res[i] = (uint32_t) (r * (((uint32_t)1) << 31) );
6852 }
6853 res128 = _mm_load_si128((__m128i*)res);
6854 mask = _mm_and_si128(a, *(__m128i*)c80000000);
6855 mask = _mm_cmpeq_epi32(zero, mask); //0xffffffff if atmp[i] <= 0x7fffffff
6856 return _mm_or_si128(res128, mask);
6857}
6858
6859//**********Reciprocal square root estimate ****************
6860//**********************************************************
//no reciprocal square root for integers is available in IA32, nor an unsigned int to float lanes conversion, so a serial solution looks faster
//but the particular implementation of vrsqrte_u32 may vary across ARM compilers
//the ARM NEON and x86 SIMD results may be slightly different
6864_NEON2SSESTORAGE float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0
6865_NEON2SSE_INLINE float32x2_t vrsqrte_f32(float32x2_t a) //use low 64 bits
6866{
6867 float32x4_t res;
6868 __m64_128 res64;
6869 res = _mm_rsqrt_ps(_pM128(a));
6870 _M64f(res64, res);
6871 return res64;
6872}
6873
6874_NEON2SSESTORAGE uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0
6875_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrsqrte_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
6876{
6877 // Input is fixed point number!!!
6878 // We implement the recip_sqrt_estimate function as described in ARMv7
6879 // reference manual (VRSQRTE instruction) But results may be slightly different
6880 // from ARM implementation due to _mm_rsqrt_ps precision
6881 uint32x2_t res;
6882 __m64_128 res64[2];
6883 int i;
6884 _NEON2SSE_ALIGN_16 float coeff[2];
6885 for (i = 0; i < 2; i++) {
6886 // Generate double-precision value = operand * 2^(-32). This has zero sign
6887 // bit, with:
6888 // exponent = 1022 or 1021 = double-precision representation of 2^(-1)
6889 // or 2^(-2) fraction taken from operand, excluding its most significant
6890 // one or two bits.
6891 uint64_t dp_operand;
6892 if (a.m64_u32[i] & 0x80000000) {
6893 dp_operand =
6894 (0x3feLL << 52) | (((uint64_t)a.m64_u32[i] & 0x7FFFFFFF) << 21);
6895 } else {
6896 dp_operand =
6897 (0x3fdLL << 52) | (((uint64_t)a.m64_u32[i] & 0x3FFFFFFF) << 22);
6898 }
6899 res64[i].m64_u64[0] = dp_operand;
6900 coeff[i] = (res64[i].m64_d64[0] < 0.5) ? 512.0f : 256.0f; /* range 0.25 <= resf < 0.5 or range 0.5 <= resf < 1.0*/
6901 }
6902 __m128 coeff_f = _mm_load_ps(coeff);
6903 __m128d q0_d = _mm_mul_pd(_mm_loadu_pd(&res64[0].m64_d64[0]), _mm_cvtps_pd(coeff_f));
6904 __m128i q0_i = _mm_cvttpd_epi32(q0_d);
6905 __m128 c05_f = _mm_set1_ps(0.5);
6906 __m128 r_f = _mm_div_ps(_mm_add_ps(_mm_cvtepi32_ps(q0_i), c05_f), coeff_f);
6907 __m128 rsqrt_f = _mm_rsqrt_ps(r_f);
6908 __m128 c256_f = _mm_set1_ps(256.0);
6909 __m128 s_f = _mm_add_ps(_mm_mul_ps(rsqrt_f, c256_f), c05_f);
6910#ifdef USE_SSE4
6911 s_f = _mm_floor_ps(s_f);
6912#else
6913 s_f = _mm_cvtepi32_ps(_mm_cvttps_epi32(s_f));
6914#endif
6915 s_f = _mm_div_ps(s_f, c256_f);
6916 _M64f(res64[0], s_f);
6917
6918 for (i = 0; i < 2; i++) {
6919 if ((a.m64_u32[i] & 0xc0000000) == 0) { // a <=0x3fffffff
6920 res.m64_u32[i] = 0xffffffff;
6921 } else {
6922 res.m64_u32[i] = (uint32_t)(res64[0].m64_f32[i] * (((uint32_t)1) << 31));
6923 }
6924 }
6925 return res;
6926}
6927
6928_NEON2SSE_GLOBAL float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0
6929#define vrsqrteq_f32 _mm_rsqrt_ps
6930
6931_NEON2SSESTORAGE uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0
6932_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrsqrteq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
6933{
6934 // Input is fixed point number!!!
6935 // We implement the recip_sqrt_estimate function as described in ARMv7
6936 // reference manual (VRSQRTE instruction) But results may be slightly different
6937 // from ARM implementation due to _mm_rsqrt_ps precision
6938 int i;
6939 _NEON2SSE_ALIGN_16 uint32_t atmp[4], res[4];
6940 _NEON2SSE_ALIGN_16 float coeff[4], rr[4];
6941 char* coeff_f2_c = (char*)&coeff[2];
6942 __m64_128 res64[4];
6943 _mm_store_si128((__m128i *)atmp, a);
6944 for (i = 0; i < 4; i++) {
6945 // Generate double-precision value = operand * 2^(-32). This has zero sign
6946 // bit, with:
6947 // exponent = 1022 or 1021 = double-precision representation of 2^(-1)
6948 // or 2^(-2) fraction taken from operand, excluding its most significant
6949 // one or two bits.
6950 uint64_t dp_operand;
6951 if (atmp[i] & 0x80000000) {
6952 dp_operand = (0x3feLL << 52) | (((uint64_t)atmp[i] & 0x7FFFFFFF) << 21);
6953 } else {
6954 dp_operand = (0x3fdLL << 52) | (((uint64_t)atmp[i] & 0x3FFFFFFF) << 22);
6955 }
6956 res64[i].m64_u64[0] = dp_operand;
6957 coeff[i] = (res64[i].m64_d64[0] < 0.5) ? 512.0f : 256.0f; /* range 0.25 <= resf < 0.5 or range 0.5 <= resf < 1.0*/
6958 }
6959 __m128 c05_f = _mm_set1_ps(0.5);
6960 __m128 coeff_f = _mm_load_ps(coeff);
6961 __m128d q0_d = _mm_mul_pd(_mm_loadu_pd(&res64[0].m64_d64[0]), _mm_cvtps_pd(coeff_f));
6962 __m128i q0_i = _mm_cvttpd_epi32(q0_d);
6963
6964 __m128 coeff_f2 = _M128(_pM128i(*coeff_f2_c));
6965 q0_d = _mm_mul_pd(_mm_loadu_pd(&res64[2].m64_d64[0]), _mm_cvtps_pd(coeff_f2));
6966 __m128i q0_i2 = _mm_cvttpd_epi32(q0_d);
6967 coeff_f = _M128(_mm_unpacklo_epi64(_M128i(coeff_f), _M128i(coeff_f2)));
6968 q0_i = _mm_unpacklo_epi64(q0_i, q0_i2);
6969
6970 __m128 r_f = _mm_div_ps(_mm_add_ps(_mm_cvtepi32_ps(q0_i), c05_f), coeff_f);
6971 __m128 rsqrt_f = _mm_rsqrt_ps(r_f);
6972 __m128 c256_f = _mm_set1_ps(256.0);
6973 __m128 s_f = _mm_add_ps(_mm_mul_ps(rsqrt_f, c256_f), c05_f);
6974#ifdef USE_SSE4
6975 s_f = _mm_floor_ps(s_f);
6976#else
6977 s_f = _mm_cvtepi32_ps(_mm_cvttps_epi32(s_f));
6978#endif
6979 s_f = _mm_div_ps(s_f, c256_f);
6980 _mm_store_ps(rr, s_f);
6981
6982 for (i = 0; i < 4; i++) {
6983 if ((atmp[i] & 0xc0000000) == 0) { // a <=0x3fffffff
6984 res[i] = 0xffffffff;
6985 } else {
6986 res[i] = (uint32_t)(rr[i] * (((uint32_t)1) << 31));
6987 }
6988 }
6989 return _mm_load_si128((__m128i *)res);
6990}
6991
6992//************ Reciprocal estimate/step and 1/sqrt estimate/step ***************************
6993//******************************************************************************************
6994//******VRECPS (Vector Reciprocal Step) ***************************************************
6995//multiplies the elements of one vector by the corresponding elements of another vector,
6996//subtracts each of the results from 2, and places the final results into the elements of the destination vector.
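//A typical refinement idiom on the NEON side looks like the sketch below (shown for illustration only;
//x is a hypothetical vector assumed to hold non-zero values, and vmulq_f32 is the standard NEON multiply mapped elsewhere in this file):
//    float32x4_t e = vrecpeq_f32(x);       //initial estimate of 1/x
//    e = vmulq_f32(vrecpsq_f32(x, e), e);  //one Newton-Raphson step: e *= (2 - x*e)
//    e = vmulq_f32(vrecpsq_f32(x, e), e);  //optional second step for extra precision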
6997
6998_NEON2SSESTORAGE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0
6999_NEON2SSE_INLINE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b)
7000{
7001 float32x4_t res;
7002 __m64_128 res64;
7003 res = vrecpsq_f32(_pM128(a), _pM128(b));
7004 _M64f(res64, res);
7005 return res64;
7006}
7007
7008_NEON2SSESTORAGE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0
7009_NEON2SSE_INLINE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b) // VRECPS.F32 q0, q0, q0
7010{
7011 __m128 f2, mul;
7012 f2 = _mm_set1_ps(2.);
7013 mul = _mm_mul_ps(a,b);
7014 return _mm_sub_ps(f2,mul);
7015}
7016
7017//*****************VRSQRTS (Vector Reciprocal Square Root Step) *****************************
7018//multiplies the elements of one vector by the corresponding elements of another vector,
7019//subtracts each of the results from 3, divides these results by two, and places the final results into the elements of the destination vector.
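//The corresponding refinement idiom (a sketch for illustration only; x is a hypothetical vector assumed to hold positive values):
//    float32x4_t e = vrsqrteq_f32(x);                          //initial estimate of 1/sqrt(x)
//    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, e), e), e);       //one Newton-Raphson step: e *= (3 - x*e*e)/2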
7020
7021_NEON2SSESTORAGE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0
7022_NEON2SSE_INLINE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b)
7023{
7024 float32x2_t res;
7025 res.m64_f32[0] = (3 - a.m64_f32[0] * b.m64_f32[0]) / 2;
7026 res.m64_f32[1] = (3 - a.m64_f32[1] * b.m64_f32[1]) / 2;
7027 return res;
7028}
7029
7030_NEON2SSESTORAGE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
7031_NEON2SSE_INLINE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b) // VRSQRTS.F32 q0, q0, q0
7032{
7033 __m128 f3, f05, mul;
7034 f3 = _mm_set1_ps(3.f);
7035 f05 = _mm_set1_ps(0.5f);
7036 mul = _mm_mul_ps(a,b);
7037 f3 = _mm_sub_ps(f3,mul);
7038 return _mm_mul_ps (f3, f05);
7039}
7040//********************************************************************************************
7041//***************************** Shifts by signed variable ***********************************
7042//********************************************************************************************
7043//***** Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right) ***********************
7044//********************************************************************************************
//No such operations exist in IA32 SIMD, unfortunately; only shifts by a constant are available, so a serial solution is needed
//helper macro; it matches the ARM behavior for out-of-range (large) shift amounts
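//E.g. (hypothetical lane values) for 16-bit lanes a = {16, 16, 16, 16} and shift counts b = {1, -2, 16, -16}
//the helpers below yield {32, 4, 0, 0}: positive counts shift left, negative counts shift right,
//and counts whose magnitude reaches the lane size produce 0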
7047#define SERIAL_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
7048 _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; int i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
7049 _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
7050 for (i = 0; i<LEN; i++) { \
7051 if( (btmp[i] >= lanesize)||(btmp[i] <= -lanesize) ) res[i] = 0; \
7052 else res[i] = (btmp[i] >=0) ? atmp[i] << btmp[i] : atmp[i] >> (-btmp[i]); } \
7053 return _mm_load_si128((__m128i*)res);
7054
7055#define SERIAL_SHIFT_64(TYPE, SIGN, LEN) \
7056 int ## TYPE ## x ## LEN ## _t res; int i, lanesize = sizeof(int ## TYPE ## _t) << 3; \
7057 for (i = 0; i<LEN; i++) { \
7058 if( (b.m64_i ## TYPE[i] >= lanesize)||(b.m64_i ## TYPE[i] <= -lanesize) ) res.m64_ ## SIGN ## TYPE[i] = 0; \
7059 else res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] >=0) ? a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i] : a.m64_ ## SIGN ## TYPE[i] >> (-b.m64_i ## TYPE[i]); } \
7060 return res;
7061
7062_NEON2SSESTORAGE int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0
7063_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7064{
7065 SERIAL_SHIFT_64(8, i, 8)
7066}
7067
7068_NEON2SSESTORAGE int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0
7069_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7070{
7071 SERIAL_SHIFT_64(16, i, 4)
7072}
7073
7074_NEON2SSESTORAGE int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0
7075_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7076{
7077 SERIAL_SHIFT_64(32, i, 2)
7078}
7079
7080_NEON2SSESTORAGE int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0
7081_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7082{
7083 SERIAL_SHIFT_64(64, i, 1)
7084}
7085
7086_NEON2SSESTORAGE uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0
7087_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7088{
7089 SERIAL_SHIFT_64(8, u, 8)
7090}
7091
_NEON2SSESTORAGE uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.U16 d0,d0,d0
7093_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7094{
7095 SERIAL_SHIFT_64(16, u, 4)
7096}
7097
7098_NEON2SSESTORAGE uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0
7099_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7100{
7101 SERIAL_SHIFT_64(32, u, 2)
7102}
7103
7104_NEON2SSESTORAGE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0
_NEON2SSE_INLINE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b) //if the SERIAL_SHIFT macro were used here, special processing would be needed for large shift values
7106{
7107 SERIAL_SHIFT_64(64, u, 1)
7108}
7109
7110_NEON2SSESTORAGE int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0
7111_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7112{
7113 SERIAL_SHIFT(int8_t, int8_t, 16, 16)
7114}
7115
7116_NEON2SSESTORAGE int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0
7117_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7118{
7119 SERIAL_SHIFT(int16_t, int16_t, 8, 8)
7120}
7121
7122_NEON2SSESTORAGE int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0
7123_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7124{
7125 SERIAL_SHIFT(int32_t, int32_t, 4, 4)
7126}
7127
7128_NEON2SSESTORAGE int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0
7129_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7130{
7131 SERIAL_SHIFT(int64_t, int64_t, 2, 2)
7132}
7133
7134_NEON2SSESTORAGE uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0
7135_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7136{
7137 SERIAL_SHIFT(uint8_t, int8_t, 16, 16)
7138}
7139
_NEON2SSESTORAGE uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.U16 q0,q0,q0
7141_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7142{
7143 SERIAL_SHIFT(uint16_t, int16_t, 8, 8)
7144}
7145
7146_NEON2SSESTORAGE uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0
7147_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7148{
7149 SERIAL_SHIFT(uint32_t, int32_t, 4, 4)
7150}
7151
7152_NEON2SSESTORAGE uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0
7153_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7154{
7155 SERIAL_SHIFT(uint64_t, int64_t, 2, 2)
7156}
7157
7158
7159//*********** Vector saturating shift left: (negative values shift right) **********************
7160//********************************************************************************************
//IA32 SIMD has no such operations yet; only shifts by a constant are available, so a serial solution is needed
7162#define SERIAL_SATURATING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \
7163 _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \
7164 int lanesize_1 = (sizeof(TYPE) << 3) - 1; \
7165 _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
7166 for (i = 0; i<LEN; i++) { \
7167 if ((atmp[i] ==0)||(btmp[i] ==0)) res[i] = atmp[i]; \
7168 else{ \
7169 if(btmp[i] <0) res[i] = atmp[i] >> (-btmp[i]); \
7170 else{ \
7171 if (btmp[i]>lanesize_1) { \
7172 res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
7173 }else{ \
7174 limit = (TYPE)1 << (lanesize_1 - btmp[i]); \
7175 if((atmp[i] >= limit)||(atmp[i] <= -limit)) \
7176 res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
7177 else res[i] = atmp[i] << btmp[i]; }}}} \
7178 return _mm_load_si128((__m128i*)res);
7179
7180#define SERIAL_SATURATING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \
7181 _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \
7182 TYPE lanesize = (sizeof(TYPE) << 3); \
7183 _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
7184 for (i = 0; i<LEN; i++) { \
7185 if ((atmp[i] ==0)||(btmp[i] ==0)) { res[i] = atmp[i]; \
7186 }else{ \
7187 if(btmp[i] < 0) res[i] = atmp[i] >> (-btmp[i]); \
7188 else{ \
7189 if (btmp[i]>lanesize) res[i] = (_UNSIGNED_T(TYPE))(~0ll); \
7190 else{ \
7191 limit = (TYPE) 1 << (lanesize - btmp[i]); \
7192 res[i] = ( atmp[i] >= limit) ? (_UNSIGNED_T(TYPE))(~0ll) : atmp[i] << btmp[i]; }}}} \
7193 return _mm_load_si128((__m128i*)res);
7194
7195#define SERIAL_SATURATING_SHIFT_SIGNED_64(TYPE, LEN) \
7196 int ## TYPE ## x ## LEN ## _t res; int ## TYPE ## _t limit; int i; \
7197 int lanesize_1 = (sizeof( int ## TYPE ## _t) << 3) - 1; \
7198 for (i = 0; i<LEN; i++) { \
7199 if ((a.m64_i ## TYPE[i] == 0) ||(b.m64_i ## TYPE[i] == 0)) res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i]; \
7200 else{ \
7201 if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \
7202 else{ \
7203 if (b.m64_i ## TYPE[i]>lanesize_1) { \
7204 res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \
7205 }else{ \
7206 limit = (int ## TYPE ## _t) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \
7207 if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \
7208 res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \
7209 else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
7210 return res;
7211
7212#define SERIAL_SATURATING_SHIFT_UNSIGNED_64(TYPE, LEN) \
7213 int ## TYPE ## x ## LEN ## _t res; _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \
7214 int ## TYPE ## _t lanesize = (sizeof(int ## TYPE ## _t) << 3); \
7215 for (i = 0; i<LEN; i++) { \
7216 if ((a.m64_u ## TYPE[i] == 0) ||(b.m64_u ## TYPE[i] == 0)) {res.m64_u ## TYPE[i] = a.m64_u ## TYPE[i]; \
7217 }else{ \
7218 if(b.m64_i ## TYPE[i] < 0) res.m64_u ## TYPE[i] = a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \
7219 else{ \
7220 if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = (_UNSIGNED_T(int ## TYPE ## _t))(~0ll); \
7221 else{ \
7222 limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \
7223 res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? (_UNSIGNED_T(int ## TYPE ## _t))(~0ll) : a.m64_u ## TYPE[i] << b.m64_u ## TYPE[i]; }}}} \
7224 return res;
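//Worked example of the branch-free saturation value used above (int8_t lanes, hypothetical inputs, lanesize_1 = 7):
//    a =  100, b = 3 : limit = 1 << (7 - 3) = 16,  100 >=  16, so res = ((uint8_t)100 >> 7) + (1 << 7) - 1 = 0 + 127 = 127  (INT8_MAX)
//    a = -100, b = 3 : -100 <= -16,                            so res = ((uint8_t)-100 >> 7) + (1 << 7) - 1 = 1 + 127 = 128, which wraps to -128 (INT8_MIN)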
7225
7226_NEON2SSESTORAGE int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0
7227_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7228{
7229 SERIAL_SATURATING_SHIFT_SIGNED_64(8,8)
7230}
7231
7232_NEON2SSESTORAGE int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0
7233_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7234{
7235 SERIAL_SATURATING_SHIFT_SIGNED_64(16,4)
7236}
7237
7238_NEON2SSESTORAGE int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0
7239_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7240{
7241 SERIAL_SATURATING_SHIFT_SIGNED_64(32,2)
7242}
7243
7244_NEON2SSESTORAGE int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0
7245_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7246{
7247 SERIAL_SATURATING_SHIFT_SIGNED_64(64,1)
7248}
7249
7250_NEON2SSESTORAGE uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0
7251_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7252{
7253 SERIAL_SATURATING_SHIFT_UNSIGNED_64(8,8)
7254}
7255
_NEON2SSESTORAGE uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.U16 d0,d0,d0
7257_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7258{
7259 SERIAL_SATURATING_SHIFT_UNSIGNED_64(16,4)
7260}
7261
7262_NEON2SSESTORAGE uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0
7263_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7264{
7265 SERIAL_SATURATING_SHIFT_UNSIGNED_64(32,2)
7266}
7267
7268_NEON2SSESTORAGE uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0
7269_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7270{
7271 SERIAL_SATURATING_SHIFT_UNSIGNED_64(64,1)
7272}
7273
7274_NEON2SSESTORAGE int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0
7275_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7276{
7277 SERIAL_SATURATING_SHIFT_SIGNED(int8_t, 16, 16)
7278}
7279
7280_NEON2SSESTORAGE int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0
7281_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7282{
7283 SERIAL_SATURATING_SHIFT_SIGNED(int16_t, 8, 8)
7284}
7285
7286_NEON2SSESTORAGE int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0
7287_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7288{
7289 SERIAL_SATURATING_SHIFT_SIGNED(int32_t, 4, 4)
7290}
7291
7292_NEON2SSESTORAGE int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0
7293_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7294{
7295 SERIAL_SATURATING_SHIFT_SIGNED(int64_t, 2, 2)
7296}
7297
7298_NEON2SSESTORAGE uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0
7299_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7300{
7301 SERIAL_SATURATING_SHIFT_UNSIGNED(int8_t, 16, 16)
7302}
7303
_NEON2SSESTORAGE uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.U16 q0,q0,q0
7305_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7306{
7307 SERIAL_SATURATING_SHIFT_UNSIGNED(int16_t, 8, 8)
7308}
7309
7310_NEON2SSESTORAGE uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0
7311_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7312{
7313 SERIAL_SATURATING_SHIFT_UNSIGNED(int32_t, 4, 4)
7314}
7315
7316_NEON2SSESTORAGE uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0
7317_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7318{
7319 SERIAL_SATURATING_SHIFT_UNSIGNED(int64_t, 2, 2)
7320}
7321
7322
7323//******** Vector rounding shift left: (negative values shift right) **********
7324//****************************************************************************
//IA32 SIMD has no such operations yet; only shifts by a constant are available, so a serial solution is needed
7326//rounding makes sense for right shifts only.
7327#define SERIAL_ROUNDING_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
7328 _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; INTERNAL_TYPE i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
7329 _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
7330 for (i = 0; i<LEN; i++) { \
7331 if( btmp[i] >= 0) { \
7332 if(btmp[i] >= lanesize) res[i] = 0; \
7333 else res[i] = (atmp[i] << btmp[i]); \
7334 }else{ \
7335 res[i] = (btmp[i] < -lanesize) ? 0 : \
7336 (btmp[i] == -lanesize) ? (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) : \
7337 (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); }} \
7338 return _mm_load_si128((__m128i*)res);
7339
7340
7341#define SERIAL_ROUNDING_SHIFT_64(TYPE, SIGN, LEN) \
7342 int ## TYPE ## x ## LEN ## _t res; int i; int lanesize = sizeof(int ## TYPE ## _t) << 3; \
7343 for (i = 0; i<LEN; i++) { \
7344 if( b.m64_i ## TYPE[i] >= 0) { \
7345 if(b.m64_i ## TYPE[i] >= lanesize) res.m64_ ## SIGN ## TYPE[i] = 0; \
7346 else res.m64_ ## SIGN ## TYPE[i] = (a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i]); \
7347 }else{ \
7348 res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] < -lanesize) ? 0 : \
7349 (b.m64_i ## TYPE[i] == -lanesize) ? (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) : \
7350 (a.m64_ ## SIGN ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); }} \
7351 return res;
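//Worked rounding example (hypothetical lane values): for a right shift by n = -b > 0 the last bit shifted out
//is added back, i.e. res = (a >> n) + ((a >> (n - 1)) & 1):
//    a =  5, b = -1 :  ( 5 >> 1) + (( 5 >> 0) & 1) =  2 + 1 =  3   == round( 5 / 2)
//    a = -5, b = -1 :  (-5 >> 1) + ((-5 >> 0) & 1) = -3 + 1 = -2   == round(-5 / 2)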
7352
7353
7354_NEON2SSESTORAGE int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0
7355_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vrshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7356{
7357 SERIAL_ROUNDING_SHIFT_64(8,i,8)
7358}
7359
7360_NEON2SSESTORAGE int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0
7361_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vrshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7362{
7363 SERIAL_ROUNDING_SHIFT_64(16,i,4)
7364}
7365
7366_NEON2SSESTORAGE int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0
7367_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vrshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7368{
7369 SERIAL_ROUNDING_SHIFT_64(32,i,2)
7370}
7371
7372_NEON2SSESTORAGE int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0
7373_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7374{
7375 SERIAL_ROUNDING_SHIFT_64(64,i,1)
7376}
7377
7378_NEON2SSESTORAGE uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0
7379_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7380{
7381 SERIAL_ROUNDING_SHIFT_64(8,u,8)
7382}
7383
_NEON2SSESTORAGE uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.U16 d0,d0,d0
7385_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7386{
7387 SERIAL_ROUNDING_SHIFT_64(16,u,4)
7388}
7389
7390_NEON2SSESTORAGE uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0
7391_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7392{
7393 SERIAL_ROUNDING_SHIFT_64(32,u,2)
7394}
7395
7396_NEON2SSESTORAGE uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0
7397_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7398{
7399 SERIAL_ROUNDING_SHIFT_64(64,u,1)
7400}
7401
7402_NEON2SSESTORAGE int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0
7403_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7404{
7405 SERIAL_ROUNDING_SHIFT(int8_t, int8_t, 16, 16)
7406}
7407
7408_NEON2SSESTORAGE int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0
7409_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7410{
7411 SERIAL_ROUNDING_SHIFT(int16_t, int16_t, 8, 8)
7412}
7413
7414_NEON2SSESTORAGE int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0
7415_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7416{
7417 SERIAL_ROUNDING_SHIFT(int32_t, int32_t, 4, 4)
7418}
7419
7420_NEON2SSESTORAGE int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0
7421_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7422{
7423 SERIAL_ROUNDING_SHIFT(int64_t, int64_t, 2, 2)
7424}
7425
7426_NEON2SSESTORAGE uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0
7427_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7428{
7429 SERIAL_ROUNDING_SHIFT(uint8_t, int8_t, 16, 16)
7430}
7431
_NEON2SSESTORAGE uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.U16 q0,q0,q0
7433_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7434{
7435 SERIAL_ROUNDING_SHIFT(uint16_t, int16_t, 8, 8)
7436}
7437
7438_NEON2SSESTORAGE uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0
7439_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7440{
7441 SERIAL_ROUNDING_SHIFT(uint32_t, int32_t, 4, 4)
7442}
7443
7444_NEON2SSESTORAGE uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0
7445_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7446{
7447 SERIAL_ROUNDING_SHIFT(uint64_t, int64_t, 2, 2)
7448}
7449
7450
7451//********** Vector saturating rounding shift left: (negative values shift right) ****************
7452//*************************************************************************************************
//IA32 SIMD has no such operations, unfortunately; only shifts by a constant are available, so a serial solution is needed
//Saturation happens for left shifts only, while rounding makes sense for right shifts only.
7455#define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \
7456 _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \
7457 int lanesize_1 = (sizeof(TYPE) << 3) - 1; \
7458 _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
7459 for (i = 0; i<LEN; i++) { \
7460 if (atmp[i] ==0) res[i] = 0; \
7461 else{ \
7462 if(btmp[i] <0) res[i] = (btmp[i] < (-lanesize_1)) ? 0 : (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \
7463 else{ \
7464 if (btmp[i]>lanesize_1) { \
7465 res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
7466 }else{ \
7467 limit = (TYPE)1 << (lanesize_1 - btmp[i]); \
7468 if((atmp[i] >= limit)||(atmp[i] <= -limit)) \
7469 res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
7470 else res[i] = atmp[i] << btmp[i]; }}}} \
7471 return _mm_load_si128((__m128i*)res);
7472
7473#define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \
7474 _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \
7475 int lanesize = (sizeof(TYPE) << 3); \
7476 _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
7477 for (i = 0; i<LEN; i++) { \
7478 if (atmp[i] ==0) {res[i] = 0; \
7479 }else{ \
7480 if(btmp[i] < 0) res[i] = (btmp[i] < (-lanesize)) ? 0 : (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \
7481 else{ \
7482 if (btmp[i]>lanesize) res[i] = (_UNSIGNED_T(TYPE))(~0ll); \
7483 else{ \
7484 limit = (TYPE) 1 << (lanesize - btmp[i]); \
7485 res[i] = ( atmp[i] >= limit) ? (_UNSIGNED_T(TYPE))(~0ll) : atmp[i] << btmp[i]; }}}} \
7486 return _mm_load_si128((__m128i*)res);
7487
7488#define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(TYPE, LEN) \
7489 __m64_128 res; int ## TYPE ## _t limit; int i; \
7490 int lanesize_1 = (sizeof(int ## TYPE ## _t ) << 3) - 1; \
7491 for (i = 0; i<LEN; i++) { \
7492 if (a.m64_i ## TYPE[i] ==0) res.m64_i ## TYPE[i] = 0; \
7493 else{ \
7494 if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = (b.m64_i ## TYPE[i] < (-lanesize_1)) ? 0 : (a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_i ## TYPE[i] & ((int ## TYPE ## _t ) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \
7495 else{ \
7496 if (b.m64_i ## TYPE[i]>lanesize_1) { \
7497 res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \
7498 }else{ \
7499 limit = (int ## TYPE ## _t ) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \
7500 if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \
7501 res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \
7502 else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
7503 return res;
7504
7505#define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(TYPE, LEN) \
7506 __m64_128 res; _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \
7507 int lanesize = (sizeof(int ## TYPE ## _t) << 3); \
7508 for (i = 0; i<LEN; i++) { \
7509 if (a.m64_u ## TYPE[i] ==0) {res.m64_u ## TYPE[i] = 0; \
7510 }else{ \
7511 if(b.m64_i ## TYPE[i] < 0) res.m64_u ## TYPE[i] = (b.m64_i ## TYPE[i] < (-lanesize)) ? 0 : (a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_u ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \
7512 else{ \
7513 if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = (_UNSIGNED_T(int ## TYPE ## _t))(~0ll); \
7514 else{ \
7515 limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \
7516 res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? (_UNSIGNED_T(int ## TYPE ## _t))(~0ll) : a.m64_u ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
7517 return res;
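//Worked example (hypothetical uint8_t lane): the macros above combine both behaviours - saturation for left
//shifts and rounding for right shifts:
//    a = 200, b =  1 : limit = 1 << (8 - 1) = 128, 200 >= 128, so res = 0xFF            (saturated)
//    a = 204, b = -3 : res = (204 >> 3) + ((204 >> 2) & 1) = 25 + 1 = 26                (rounded, == round(204 / 8))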
7518
7519_NEON2SSESTORAGE int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0
7520_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7521{
7522 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(8,8)
7523}
7524
7525_NEON2SSESTORAGE int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0
7526_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7527{
7528 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(16,4)
7529}
7530
7531_NEON2SSESTORAGE int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0
7532_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7533{
7534 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(32,2)
7535}
7536
7537_NEON2SSESTORAGE int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0
7538_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7539{
7540 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(64,1)
7541}
7542
7543_NEON2SSESTORAGE uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0
7544_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7545{
7546 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(8,8)
7547}
7548
_NEON2SSESTORAGE uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.U16 d0,d0,d0
7550_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7551{
7552 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(16,4)
7553}
7554
7555_NEON2SSESTORAGE uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0
7556_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7557{
7558 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(32,2)
7559}
7560
7561_NEON2SSESTORAGE uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0
7562_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7563{
7564 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(64,1)
7565}
7566
7567_NEON2SSESTORAGE int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0
7568_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7569{
7570 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int8_t, 16, 16)
7571}
7572
7573_NEON2SSESTORAGE int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0
7574_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7575{
7576 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int16_t, 8, 8)
7577}
7578
7579_NEON2SSESTORAGE int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0
7580_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7581{
7582 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int32_t, 4, 4)
7583}
7584
7585_NEON2SSESTORAGE int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0
7586_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7587{
7588 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int64_t, 2, 2)
7589}
7590
7591_NEON2SSESTORAGE uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0
7592_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7593{
7594 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int8_t, 16, 16)
7595}
7596
_NEON2SSESTORAGE uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.U16 q0,q0,q0
7598_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7599{
7600 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int16_t, 8, 8)
7601}
7602
7603_NEON2SSESTORAGE uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0
7604_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7605{
7606 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int32_t, 4, 4)
7607}
7608
7609_NEON2SSESTORAGE uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0
7610_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7611{
7612 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int64_t, 2, 2)
7613}
7614
7615// *********************************************************************************
7616// ***************************** Shifts by a constant *****************************
7617// *********************************************************************************
7618//**************** Vector shift right by constant*************************************
7619//************************************************************************************
7620_NEON2SSESTORAGE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8
7621_NEON2SSE_INLINE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VSHR.S8 d0,d0,#8
7622{
7623 //no 8 bit shift available, go to 16 bit
7624 int8x8_t res64;
7625 __m128i r;
7626 r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
7627 r = _mm_srai_epi16 (r, b); //SSE2
7628 r = _mm_packs_epi16 (r,r); //we need 64 bits only
7629 return64(r);
7630}
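//Note on the pack above: after sign extension to 16 bit and an arithmetic shift right by b >= 1 every
//intermediate value already fits in [-128, 127], so _mm_packs_epi16 only narrows and never saturates.
//Usage sketch with assumed values: vshr_n_s8(vdup_n_s8(-7), 2) yields -2 in every lane (arithmetic shift).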
7631
7632_NEON2SSESTORAGE int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VSHR.S16 d0,d0,#16
7633_NEON2SSE_INLINE int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b)
7634{
7635 int16x4_t res64;
7636 return64(_mm_srai_epi16(_pM128i(a), b));
7637}
7638
7639
7640_NEON2SSESTORAGE int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VSHR.S32 d0,d0,#32
7641_NEON2SSE_INLINE int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b)
7642{
7643 int32x2_t res64;
7644 return64(_mm_srai_epi32(_pM128i(a), b));
7645}
7646
7647_NEON2SSESTORAGE int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64
7648_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL)
7649{
7650 //no arithmetic shift for 64bit values, serial solution used
7651 int64x1_t res;
7652 if(b>=64) res.m64_i64[0] = 0;
7653 else res.m64_i64[0] = (*(int64_t*)&a) >> b;
7654 return res;
7655}
7656
7657_NEON2SSESTORAGE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8
7658_NEON2SSE_INLINE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VSHR.U8 d0,d0,#8
7659{
7660 //no 8 bit shift available, go to 16 bit
7661 uint8x8_t res64;
7662 __m128i r;
7663 r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
    r = _mm_srli_epi16 (r, b); //for unsigned variables we use the logical shift, not the arithmetic one
7665 r = _mm_packus_epi16 (r,r); //we need 64 bits only
7666 return64(r);
7667}
7668
_NEON2SSESTORAGE uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VSHR.U16 d0,d0,#16
7670_NEON2SSE_INLINE uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b)
7671{
7672 uint16x4_t res64;
7673 return64(_mm_srli_epi16(_pM128i(a), b));
7674}
7675
7676
7677_NEON2SSESTORAGE uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VSHR.U32 d0,d0,#32
7678_NEON2SSE_INLINE uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b)
7679{
7680 uint32x2_t res64;
7681 return64(_mm_srli_epi32(_pM128i(a), b));
7682}
7683
7684
7685_NEON2SSESTORAGE uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VSHR.U64 d0,d0,#64
7686_NEON2SSE_INLINE uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b)
7687{
7688 uint64x1_t res64;
7689 return64(_mm_srli_epi64(_pM128i(a), b));
7690}
7691
7692
7693_NEON2SSESTORAGE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8
7694_NEON2SSE_INLINE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VSHR.S8 q0,q0,#8
7695{
7696 //no 8 bit shift available, go to 16 bit trick
7697 __m128i zero, mask0, a_sign, r, a_sign_mask;
7698 _NEON2SSE_ALIGN_16 static const int16_t mask0_16[9] = {0x0000, 0x0080, 0x00c0, 0x00e0, 0x00f0, 0x00f8, 0x00fc, 0x00fe, 0x00ff};
7699 zero = _mm_setzero_si128();
7700 mask0 = _mm_set1_epi16(mask0_16[b]); //to mask the bits to be "spoiled" by 16 bit shift
7701 a_sign = _mm_cmpgt_epi8 (zero, a); //ff if a<0 or zero if a>0
7702 r = _mm_srai_epi16 (a, b);
7703 a_sign_mask = _mm_and_si128 (mask0, a_sign);
7704 r = _mm_andnot_si128 (mask0, r);
7705 return _mm_or_si128 (r, a_sign_mask);
7706}
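//Why the mask is needed above (hypothetical 16-bit lane): _mm_srai_epi16 shifts whole 16-bit lanes, so the top
//b bits of each low byte are filled from its neighbouring high byte instead of from the low byte's own sign.
//E.g. lane 0x017F, b = 1: _mm_srai_epi16 gives 0x00BF, while VSHR.S8 expects the low byte 0x7F >> 1 = 0x3F;
//mask0_16[1] = 0x0080 marks the spoiled bit, which is then replaced with the low byte's sign from a_sign
//(0x00 here since 0x7F >= 0), restoring 0x003F.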
7707
7708_NEON2SSE_GLOBAL int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16
7709#define vshrq_n_s16 _mm_srai_epi16
7710
7711_NEON2SSE_GLOBAL int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32
7712#define vshrq_n_s32 _mm_srai_epi32
7713
7714_NEON2SSESTORAGE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64
7715_NEON2SSE_INLINE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b)
7716{
    //SIMD implementation may not be optimal due to the absence of a 64-bit arithmetic shift in x86 SIMD
7718 __m128i c1, signmask,a0, res64;
7719 _NEON2SSE_ALIGN_16 static const uint64_t mask[] = {0x8000000000000000, 0x8000000000000000};
7720 c1 = _mm_cmpeq_epi32(a,a); //0xffffffffffffffff
7721 signmask = _mm_slli_epi64 (c1, (64 - b));
    a0 = _mm_or_si128(a, *(__m128i*)mask); //set the sign bit; below a0 equals a only for negative lanes
7723 a0 = _MM_CMPEQ_EPI64 (a, a0);
7724 signmask = _mm_and_si128(a0, signmask);
7725 res64 = _mm_srli_epi64 (a, b);
7726 return _mm_or_si128(res64, signmask);
7727}
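//Sketch of the trick above (assumed lane value): the sign bits are patched in after a logical shift.
//For a lane equal to -8 and b = 2:
//    logical shift : (uint64_t)(-8) >> 2            = 0x3FFFFFFFFFFFFFFE
//    signmask      : 0xFFFFFFFFFFFFFFFF << (64 - 2) = 0xC000000000000000   (kept only for negative lanes)
//    OR of the two : 0xFFFFFFFFFFFFFFFE = -2, i.e. the arithmetic shift result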
7728
7729_NEON2SSESTORAGE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8
7730_NEON2SSE_INLINE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VSHR.U8 q0,q0,#8
7731{
7732 //no 8 bit shift available, need the special trick
7733 __m128i mask0, r;
7734 _NEON2SSE_ALIGN_16 static const uint16_t mask10_16[9] = {0xffff, 0xff7f, 0xff3f, 0xff1f, 0xff0f, 0xff07, 0xff03, 0xff01, 0xff00};
7735 mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled" by 16 bit shift
7736 r = _mm_srli_epi16 ( a, b);
7737 return _mm_and_si128 (r, mask0);
7738}
7739
_NEON2SSE_GLOBAL uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.U16 q0,q0,#16
7741#define vshrq_n_u16 _mm_srli_epi16
7742
7743_NEON2SSE_GLOBAL uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32
7744#define vshrq_n_u32 _mm_srli_epi32
7745
7746_NEON2SSE_GLOBAL uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64
7747#define vshrq_n_u64 _mm_srli_epi64
7748
7749//*************************** Vector shift left by constant *************************
7750//*********************************************************************************
7751_NEON2SSESTORAGE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
7752_NEON2SSE_INLINE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VSHL.I8 d0,d0,#0
7753{
7754 //no 8 bit shift available, go to 16 bit
7755 int8x8_t res64;
7756 __m128i r;
7757 r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
7758 r = _mm_slli_epi16 (r, b); //SSE2
7759 r = _mm_shuffle_epi8 (r, *(__m128i*) mask8_16_even_odd); //return to 8 bit, we need 64 bits only
7760 return64(r);
7761}
7762
7763_NEON2SSESTORAGE int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
7764_NEON2SSE_INLINE int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b)
7765{
7766 int16x4_t res64;
7767 return64(_mm_slli_epi16(_pM128i(a), b));
7768}
7769
7770
7771_NEON2SSESTORAGE int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
7772_NEON2SSE_INLINE int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b)
7773{
7774 int32x2_t res64;
7775 return64(_mm_slli_epi32(_pM128i(a), b));
7776}
7777
7778
7779_NEON2SSESTORAGE int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
7780_NEON2SSE_INLINE int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b)
7781{
7782 int64x1_t res64;
7783 return64(_mm_slli_epi64(_pM128i(a), b));
7784}
7785
7786
7787_NEON2SSESTORAGE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
7788_NEON2SSE_INLINE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b)
7789{
7790 //no 8 bit shift available, go to 16 bit
7791 uint8x8_t res64;
7792 __m128i mask8;
7793 __m128i r;
7794 mask8 = _mm_set1_epi16(0xff);
7795 r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
7796 r = _mm_slli_epi16 (r, b); //SSE2
7797 r = _mm_and_si128(r, mask8); //to avoid saturation
7798 r = _mm_packus_epi16 (r,r); //we need 64 bits only
7799 return64(r);
7800}
7801
7802_NEON2SSE_GLOBAL uint16x4_t vshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
7803#define vshl_n_u16 vshl_n_s16
7804
7805
7806_NEON2SSE_GLOBAL uint32x2_t vshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
7807#define vshl_n_u32 vshl_n_s32
7808
7809_NEON2SSE_GLOBAL uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
7810#define vshl_n_u64 vshl_n_s64
7811
7812_NEON2SSE_GLOBAL int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
7813#define vshlq_n_s8 vshlq_n_u8
7814
7815_NEON2SSE_GLOBAL int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
7816#define vshlq_n_s16 _mm_slli_epi16
7817
7818_NEON2SSE_GLOBAL int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
7819#define vshlq_n_s32 _mm_slli_epi32
7820
7821_NEON2SSE_GLOBAL int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
7822#define vshlq_n_s64 _mm_slli_epi64
7823
7824_NEON2SSESTORAGE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
7825_NEON2SSE_INLINE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b)
7826{
7827 //no 8 bit shift available, need the special trick
7828 __m128i mask0, r;
7829 _NEON2SSE_ALIGN_16 static const uint16_t mask10_16[9] = {0xffff, 0xfeff, 0xfcff, 0xf8ff, 0xf0ff, 0xe0ff, 0xc0ff, 0x80ff, 0xff};
7830 mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled" by 16 bit shift
7831 r = _mm_slli_epi16 ( a, b);
7832 return _mm_and_si128 (r, mask0);
7833}
7834
7835_NEON2SSE_GLOBAL uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
7836#define vshlq_n_u16 vshlq_n_s16
7837
7838_NEON2SSE_GLOBAL uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
7839#define vshlq_n_u32 vshlq_n_s32
7840
7841_NEON2SSE_GLOBAL uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
7842#define vshlq_n_u64 vshlq_n_s64
7843
7844//************* Vector rounding shift right by constant ******************
7845//*************************************************************************
7846//No corresponding x86 intrinsics exist, need to do some tricks
7847_NEON2SSESTORAGE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8
7848_NEON2SSE_INLINE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VRSHR.S8 d0,d0,#8
7849{
7850 //no 8 bit shift available, go to 16 bit
7851 int8x8_t res64;
7852 __m128i r, maskb;
7853 r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
7854 maskb = _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit
7855 maskb = _mm_srli_epi16 (maskb, 15); //1 or 0
7856 r = _mm_srai_epi16 (r, b);
7857 r = _mm_add_epi16 (r, maskb); //actual rounding
    r = _mm_packs_epi16 (r,r); //we need 64 bits only
7859 return64(r);
7860}
7861
7862_NEON2SSESTORAGE int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16
7863_NEON2SSE_INLINE int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b)
7864{
7865 int16x4_t res64;
7866 return64(vrshrq_n_s16(_pM128i(a), b));
7867}
7868
7869
7870_NEON2SSESTORAGE int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32
7871_NEON2SSE_INLINE int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b)
7872{
7873 int32x2_t res64;
7874 return64(vrshrq_n_s32(_pM128i(a), b));
7875}
7876
7877
7878_NEON2SSESTORAGE int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64
7879_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL)
7880{
7881 //serial solution is faster
7882 int64x1_t res;
7883 int64_t a_i64 = *( int64_t*)&a;
7884 if(b==64) {
        res.m64_i64[0] = 0; //some compilers apply rounding here, in which case (a_i64 & _SIGNBIT64) >> 63 should be used instead;
7886 } else {
7887 int64_t maskb = a_i64 & (( int64_t)1 << (b - 1));
7888 res.m64_i64[0] = (a_i64 >> b) + (maskb >> (b - 1));
7889 }
7890 return res;
7891}
7892
7893_NEON2SSESTORAGE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8
7894_NEON2SSE_INLINE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VRSHR.U8 d0,d0,#8
7895{
    //no 8 bit shift available, go to 16 bit; the solution may not be optimal compared with the serial one
7897 uint8x8_t res64;
7898 __m128i r, maskb;
7899 r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
7900 maskb = _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit
7901 maskb = _mm_srli_epi16 (maskb, 15); //1 or 0
7902 r = _mm_srli_epi16 (r, b);
7903 r = _mm_add_epi16 (r, maskb); //actual rounding
    r = _mm_packus_epi16 (r,r); //we need 64 bits only
7905 return64(r);
7906}
7907
_NEON2SSESTORAGE uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VRSHR.U16 d0,d0,#16
7909_NEON2SSE_INLINE uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b)
7910{
7911 uint16x4_t res64;
7912 return64(vrshrq_n_u16(_pM128i(a), b));
7913}
7914
7915
7916_NEON2SSESTORAGE uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32
7917_NEON2SSE_INLINE uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b)
7918{
7919 uint32x2_t res64;
7920 return64(vrshrq_n_u32(_pM128i(a), b));
7921}
7922
7923
7924_NEON2SSESTORAGE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64
7925_NEON2SSE_INLINE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b)
7926{
7927 uint64x1_t res64;
7928 return64(vrshrq_n_u64(_pM128i(a), b));
7929}
7930
7931_NEON2SSESTORAGE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8
7932_NEON2SSE_INLINE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VRSHR.S8 q0,q0,#8
7933{
7934 //no 8 bit shift available, go to 16 bit trick
7935 __m128i r, mask1, maskb;
    _NEON2SSE_ALIGN_16 static const uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // bit (b-1), i.e. 2^(b-1), set in every byte
    r = vshrq_n_s8 (a, b);
    mask1 = _mm_set1_epi16(mask2b[b]); // rounding bit mask replicated over the 16-bit lanes
    maskb = _mm_and_si128(a, mask1); //get the rounding bit of each byte or 0
7940 maskb = _mm_srli_epi16 (maskb, b - 1); // to add 1
7941 return _mm_add_epi8(r, maskb); //actual rounding
7942}
7943
7944_NEON2SSESTORAGE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16
7945_NEON2SSE_INLINE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16
7946{
7947 __m128i maskb, r;
7948 maskb = _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit
7949 maskb = _mm_srli_epi16(maskb, 15); //1 or 0
7950 r = _mm_srai_epi16 (a, b);
7951 return _mm_add_epi16 (r, maskb); //actual rounding
7952}
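//Short illustration (assumed lane value): the two shifts extract the rounding bit, i.e. bit (b - 1) of a.
//For a = 0x0006, b = 2: maskb = (uint16_t)(0x0006 << 14) >> 15 = 1, r = 0x0006 >> 2 = 1, result = 2 == round(6 / 4).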
7953
7954_NEON2SSESTORAGE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32
7955_NEON2SSE_INLINE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32
7956{
7957 __m128i maskb, r;
7958 maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit
7959 maskb = _mm_srli_epi32 (maskb,31); //1 or 0
7960 r = _mm_srai_epi32(a, b);
7961 return _mm_add_epi32 (r, maskb); //actual rounding
7962}
7963
7964_NEON2SSESTORAGE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64
7965_NEON2SSE_INLINE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b)
7966{
    //the solution may not be optimal compared with a serial one
7968 __m128i maskb;
7969 int64x2_t r;
7970 maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit
7971 maskb = _mm_srli_epi64 (maskb,63); //1 or 0
7972 r = vshrq_n_s64(a, b);
7973 return _mm_add_epi64 (r, maskb); //actual rounding
7974}
7975
7976_NEON2SSESTORAGE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8
7977_NEON2SSE_INLINE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VRSHR.U8 q0,q0,#8
7978{
7979 //no 8 bit shift available, go to 16 bit trick
7980 __m128i r, mask1, maskb;
    _NEON2SSE_ALIGN_16 static const uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // bit (b-1), i.e. 2^(b-1), set in every byte
    r = vshrq_n_u8 (a, b);
    mask1 = _mm_set1_epi16(mask2b[b]); // rounding bit mask replicated over the 16-bit lanes
    maskb = _mm_and_si128(a, mask1); //get the rounding bit of each byte or 0
7985 maskb = _mm_srli_epi16 (maskb, b - 1); // to add 1
7986 return _mm_add_epi8(r, maskb); //actual rounding
7987}
7988
_NEON2SSESTORAGE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.U16 q0,q0,#16
7990_NEON2SSE_INLINE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16
7991{
7992 __m128i maskb, r;
7993 maskb = _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit
7994 maskb = _mm_srli_epi16(maskb, 15); //1 or 0
7995 r = _mm_srli_epi16 (a, b);
7996 return _mm_add_epi16 (r, maskb); //actual rounding
7997}
7998
7999_NEON2SSESTORAGE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32
8000_NEON2SSE_INLINE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32
8001{
8002 __m128i maskb, r;
8003 maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit
8004 maskb = _mm_srli_epi32 (maskb,31); //1 or 0
8005 r = _mm_srli_epi32(a, b);
8006 return _mm_add_epi32 (r, maskb); //actual rounding
8007}
8008
8009_NEON2SSESTORAGE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64
8010_NEON2SSE_INLINE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b)
8011{
    //the solution may not be optimal compared with a serial one
8013 __m128i maskb, r;
8014 maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit
8015 maskb = _mm_srli_epi64 (maskb,63); //1 or 0
8016 r = _mm_srli_epi64(a, b);
8017 return _mm_add_epi64 (r, maskb); //actual rounding
8018}
8019
8020//************* Vector shift right by constant and accumulate *********
8021//*********************************************************************
8022_NEON2SSESTORAGE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8
8023_NEON2SSE_INLINE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VSRA.S8 d0,d0,#8
8024{
8025 int8x8_t shift;
8026 shift = vshr_n_s8(b, c);
8027 return vadd_s8( a, shift);
8028}
8029
8030_NEON2SSESTORAGE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16
8031_NEON2SSE_INLINE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VSRA.S16 d0,d0,#16
8032{
8033 int16x4_t shift;
8034 shift = vshr_n_s16( b, c);
8035 return vadd_s16(a, shift);
8036}
8037
8038_NEON2SSESTORAGE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32
8039_NEON2SSE_INLINE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VSRA.S32 d0,d0,#32
8040{
    //may not be optimal compared with the serial execution
8042 int32x2_t shift;
8043 shift = vshr_n_s32(b, c);
8044 return vadd_s32( a, shift);
8045}
8046
8047_NEON2SSESTORAGE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64
8048_NEON2SSE_INLINE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c)
8049{
    //may not be optimal compared with a serial solution
8051 int64x1_t shift;
8052 shift = vshr_n_s64(b, c);
8053 return vadd_s64( a, shift);
8054}
8055
8056_NEON2SSESTORAGE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8
8057_NEON2SSE_INLINE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VSRA.U8 d0,d0,#8
8058{
8059 uint8x8_t shift;
8060 shift = vshr_n_u8(b, c);
8061 return vadd_u8(a, shift);
8062}
8063
_NEON2SSESTORAGE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.U16 d0,d0,#16
_NEON2SSE_INLINE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VSRA.U16 d0,d0,#16
8066{
8067 uint16x4_t shift;
8068 shift = vshr_n_u16(b, c);
8069 return vadd_u16(a,shift);
8070}
8071
8072_NEON2SSESTORAGE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32
8073_NEON2SSE_INLINE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VSRA.U32 d0,d0,#32
8074{
    //may not be optimal compared with the serial execution
8076 uint32x2_t shift;
8077 shift = vshr_n_u32(b, c);
8078 return vadd_u32( a, shift);
8079}
8080
8081_NEON2SSESTORAGE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64
8082_NEON2SSE_INLINE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c) // VSRA.U64 d0,d0,#64
8083{
    //may not be optimal compared with the serial execution
8085 uint64x1_t shift;
8086 shift = vshr_n_u64(b, c);
8087 return vadd_u64(a, shift);
8088}
8089
8090_NEON2SSESTORAGE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8
8091_NEON2SSE_INLINE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRA.S8 q0,q0,#8
8092{
8093 int8x16_t shift;
8094 shift = vshrq_n_s8(b, c);
8095 return vaddq_s8(a, shift);
8096}
8097
8098_NEON2SSESTORAGE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16
8099_NEON2SSE_INLINE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRA.S16 q0,q0,#16
8100{
8101 int16x8_t shift;
8102 shift = vshrq_n_s16(b, c);
8103 return vaddq_s16(a, shift);
8104}
8105
8106_NEON2SSESTORAGE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32
8107_NEON2SSE_INLINE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRA.S32 q0,q0,#32
8108{
8109 int32x4_t shift;
8110 shift = vshrq_n_s32(b, c);
8111 return vaddq_s32(a, shift);
8112}
8113
8114_NEON2SSESTORAGE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64
8115_NEON2SSE_INLINE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c) // VSRA.S64 q0,q0,#64
8116{
8117 int64x2_t shift;
8118 shift = vshrq_n_s64(b, c);
8119 return vaddq_s64( a, shift);
8120}
8121
8122_NEON2SSESTORAGE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8
8123_NEON2SSE_INLINE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VSRA.U8 q0,q0,#8
8124{
8125 uint8x16_t shift;
8126 shift = vshrq_n_u8(b, c);
8127 return vaddq_u8(a, shift);
8128}
8129
_NEON2SSESTORAGE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.U16 q0,q0,#16
_NEON2SSE_INLINE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VSRA.U16 q0,q0,#16
8132{
8133 uint16x8_t shift;
8134 shift = vshrq_n_u16(b, c);
8135 return vaddq_u16(a, shift);
8136}
8137
8138_NEON2SSESTORAGE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32
8139_NEON2SSE_INLINE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VSRA.U32 q0,q0,#32
8140{
8141 uint32x4_t shift;
8142 shift = vshrq_n_u32(b, c);
8143 return vaddq_u32(a, shift);
8144}
8145
8146_NEON2SSESTORAGE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64
8147_NEON2SSE_INLINE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c) // VSRA.U64 q0,q0,#64
8148{
8149 uint64x2_t shift;
8150 shift = vshrq_n_u64(b, c);
8151 return vaddq_u64(a, shift);
8152}
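//Usage sketch (hypothetical values): the accumulate variants simply fuse the constant shift with an addition,
//e.g. vsraq_n_u16(acc, x, 1) computes acc[i] + (x[i] >> 1) lane-wise - the whole family is the matching
//vshr(q)_n_xx followed by the corresponding vadd(q)_xx.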
8153
8154//************* Vector rounding shift right by constant and accumulate ****************************
8155//************************************************************************************************
8156_NEON2SSESTORAGE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8
8157_NEON2SSE_INLINE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VRSRA.S8 d0,d0,#8
8158{
8159 int8x8_t shift;
8160 shift = vrshr_n_s8(b, c);
8161 return vadd_s8( a, shift);
8162}
8163
8164_NEON2SSESTORAGE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16
8165_NEON2SSE_INLINE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VRSRA.S16 d0,d0,#16
8166{
8167 int16x4_t shift;
8168 shift = vrshr_n_s16( b, c);
8169 return vadd_s16(a, shift);
8170}
8171
8172_NEON2SSESTORAGE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32
8173_NEON2SSE_INLINE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VRSRA.S32 d0,d0,#32
8174{
    //may not be optimal compared with the serial execution
8176 int32x2_t shift;
8177 shift = vrshr_n_s32(b, c);
8178 return vadd_s32( a, shift);
8179}
8180
8181_NEON2SSESTORAGE int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64
8182_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution
8183{
8184 int64x1_t shift;
8185 shift = vrshr_n_s64(b, c);
8186 return vadd_s64( a, shift);
8187}
8188
8189_NEON2SSESTORAGE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8
8190_NEON2SSE_INLINE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VRSRA.U8 d0,d0,#8
8191{
8192 uint8x8_t shift;
8193 shift = vrshr_n_u8(b, c);
8194 return vadd_u8(a, shift);
8195}
8196
_NEON2SSESTORAGE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.U16 d0,d0,#16
_NEON2SSE_INLINE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VRSRA.U16 d0,d0,#16
8199{
8200 uint16x4_t shift;
8201 shift = vrshr_n_u16(b, c);
8202 return vadd_u16(a,shift);
8203}
8204
8205_NEON2SSESTORAGE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32
8206_NEON2SSE_INLINE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VRSRA.U32 d0,d0,#32
8207{
    //may not be optimal compared with the serial execution
8209 uint32x2_t shift;
8210 shift = vrshr_n_u32(b, c);
8211 return vadd_u32( a, shift);
8212}
8213
8214_NEON2SSESTORAGE uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64
8215_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution
8216{
    //may not be optimal compared with the serial execution
8218 uint64x1_t shift;
8219 shift = vrshr_n_u64(b, c);
8220 return vadd_u64( a, shift);
8221}
8222
8223_NEON2SSESTORAGE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8
8224_NEON2SSE_INLINE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VRSRA.S8 q0,q0,#8
8225{
8226 int8x16_t shift;
8227 shift = vrshrq_n_s8(b, c);
8228 return vaddq_s8(a, shift);
8229}
8230
8231_NEON2SSESTORAGE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16
8232_NEON2SSE_INLINE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VRSRA.S16 q0,q0,#16
8233{
8234 int16x8_t shift;
8235 shift = vrshrq_n_s16(b, c);
8236 return vaddq_s16(a, shift);
8237}
8238
8239_NEON2SSESTORAGE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32
8240_NEON2SSE_INLINE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VRSRA.S32 q0,q0,#32
8241{
8242 int32x4_t shift;
8243 shift = vrshrq_n_s32(b, c);
8244 return vaddq_s32(a, shift);
8245}
8246
8247_NEON2SSESTORAGE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64
8248_NEON2SSE_INLINE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)
8249{
8250 int64x2_t shift;
8251 shift = vrshrq_n_s64(b, c);
8252 return vaddq_s64(a, shift);
8253}
8254
8255_NEON2SSESTORAGE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8
8256_NEON2SSE_INLINE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VRSRA.U8 q0,q0,#8
8257{
8258 uint8x16_t shift;
8259 shift = vrshrq_n_u8(b, c);
8260 return vaddq_u8(a, shift);
8261}
8262
_NEON2SSESTORAGE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.U16 q0,q0,#16
_NEON2SSE_INLINE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VRSRA.U16 q0,q0,#16
8265{
8266 uint16x8_t shift;
8267 shift = vrshrq_n_u16(b, c);
8268 return vaddq_u16(a, shift);
8269}
8270
8271_NEON2SSESTORAGE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32
8272_NEON2SSE_INLINE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VRSRA.U32 q0,q0,#32
8273{
8274 uint32x4_t shift;
8275 shift = vrshrq_n_u32(b, c);
8276 return vaddq_u32(a, shift);
8277}
8278
8279_NEON2SSESTORAGE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64
8280_NEON2SSE_INLINE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c)
8281{
8282 uint64x2_t shift;
8283 shift = vrshrq_n_u64(b, c);
8284 return vaddq_u64(a, shift);
8285}
8286
8287//**********************Vector saturating shift left by constant *****************************
8288//********************************************************************************************
8289//we don't check const ranges, assuming they are met
8290_NEON2SSESTORAGE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0
8291_NEON2SSE_INLINE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHL.S8 d0,d0,#0
8292{
8293 //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function)
8294 int8x8_t res64;
8295 __m128i a128, r128;
8296 a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
8297 r128 = _mm_slli_epi16 (a128, b);
8298 r128 = _mm_packs_epi16 (r128,r128); //saturated s8, use 64 low bits only
8299 return64(r128);
8300}
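//Usage sketch (illustration only, the values are hypothetical):
//  int8x8_t v = vdup_n_s8(100);
//  int8x8_t r = vqshl_n_s8(v, 1); //100 << 1 = 200 does not fit into int8, so every lane saturates to 127
//With the non-saturating vshl_n_s8 the same shift would wrap around to -56.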
8301
8302_NEON2SSESTORAGE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0
8303_NEON2SSE_INLINE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHL.S16 d0,d0,#0
8304{
8305 // go to 32 bit to get the auto saturation (in packs function)
8306 int16x4_t res64;
8307 __m128i a128, r128;
8308 a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1
8309 r128 = _mm_slli_epi32 (a128, b); //shift_res
8310 r128 = _mm_packs_epi32 (r128,r128); //saturated s16, use 64 low bits only
8311 return64(r128);
8312}
8313
8314_NEON2SSESTORAGE int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0
8315_NEON2SSE_INLINE int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b)
8316{
8317 //serial execution may be faster
8318 int32x2_t res64;
8319 return64(vqshlq_n_s32 (_pM128i(a), b));
8320}
8321
8322
8323_NEON2SSESTORAGE int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0
8324_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
8325{
8326 // no effective SIMD solution here
8327 int64x1_t res;
8328 int64_t bmask;
8329 int64_t a_i64 = *( int64_t*)&a;
8330 bmask = ( int64_t)1 << (63 - b); //positive
8331 if (a_i64 >= bmask) {
8332 res.m64_i64[0] = ~(_SIGNBIT64);
8333 } else {
8334 res.m64_i64[0] = (a_i64 <= -bmask) ? (int64_t)_SIGNBIT64 : a_i64 << b;
8335 }
8336 return res;
8337}
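//Note on the threshold above: bmask = 2^(63-b) is the smallest positive value whose left shift by b overflows int64,
//so a >= bmask saturates to 0x7fffffffffffffff and a <= -bmask saturates to 0x8000000000000000 (assuming _SIGNBIT64 is 0x8000000000000000),
//while everything in between is shifted as is. For example, for b = 2: bmask = 2^61 and (2^61 - 1) << 2 = 2^63 - 4 is still representable.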
8338
8339
8340_NEON2SSESTORAGE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0
8341_NEON2SSE_INLINE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b) // VQSHL.U8 d0,d0,#0
8342{
8343 //no 8 bit shift available in IA32 SIMD, go to 16 bit
8344 uint8x8_t res64;
8345 __m128i a128, r128;
8346 a128 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
8347 r128 = _mm_slli_epi16 (a128, b); //shift_res
8348 r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only
8349 return64(r128);
8350}
8351
8352_NEON2SSESTORAGE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.U16 d0,d0,#0
8353_NEON2SSE_INLINE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b) // VQSHL.U16 d0,d0,#0
8354{
8355 // go to 32 bit to get the auto saturation (in packus function)
8356 uint16x4_t res64;
8357 __m128i a128, r128;
8358 a128 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE 4.1
8359 r128 = _mm_slli_epi32 (a128, b); //shift_res
8360 r128 = _MM_PACKUS1_EPI32 (r128); //saturated u16, use 64 low bits only
8361 return64(r128);
8362}
8363
8364_NEON2SSESTORAGE uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0
8365_NEON2SSE_INLINE uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b)
8366{
8367 uint32x2_t res64;
8368 return64(vqshlq_n_u32(_pM128i(a), b));
8369}
8370
8371_NEON2SSESTORAGE uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0
8372_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
8373{
8374 // no effective SIMD solution here
8375 uint64x1_t res;
8376 uint64_t bmask;
8377 uint64_t a_i64 = *(uint64_t*)&a;
8378 bmask = ( uint64_t)1 << (64 - b);
8379 res.m64_u64[0] = (a_i64 >= bmask)&&(b>0) ? 0xffffffffffffffff : a_i64 << b; //if b=0 we are fine with any a
8380 return res;
8381}
8382
8383_NEON2SSESTORAGE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0
8384_NEON2SSE_INLINE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHL.S8 q0,q0,#0
8385{
8386 // go to 16 bit to get the auto saturation (in packs function)
8387 __m128i a128, r128_1, r128_2;
8388 a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1
8389 r128_1 = _mm_slli_epi16 (a128, b);
8390 //swap hi and low part of a128 to process the remaining data
8391 a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
8392 a128 = _MM_CVTEPI8_EPI16 (a128);
8393 r128_2 = _mm_slli_epi16 (a128, b);
8394 return _mm_packs_epi16 (r128_1, r128_2); //saturated s8
8395}
8396
8397_NEON2SSESTORAGE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0
8398_NEON2SSE_INLINE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHL.S16 q0,q0,#0
8399{
8400 // a manual saturation solution looks LESS optimal than the 32-bit conversion one
8401 // go to 32 bit to get the auto saturation (in packs function)
8402 __m128i a128, r128_1, r128_2;
8403 a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1
8404 r128_1 = _mm_slli_epi32 (a128, b); //shift_res
8405 //swap hi and low part of a128 to process the remaining data
8406 a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
8407 a128 = _MM_CVTEPI16_EPI32 (a128);
8408 r128_2 = _mm_slli_epi32 (a128, b);
8409 return _mm_packs_epi32 (r128_1, r128_2); //saturated s16
8410}
8411
8412_NEON2SSESTORAGE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0
8413_NEON2SSE_INLINE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHL.S32 q0,q0,#0
8414{
8415 // no 64-bit saturating pack available (to widen and pack back), so special tricks are necessary
8416 __m128i c1, maskA, saturation_mask, c7ffffff_mask, shift_res, shift_res_mask;
8417 c1 = _mm_cmpeq_epi32(a,a); //0xff..ff
8418 maskA = _mm_srli_epi32(c1, b + 1); //mask for positive numbers: (b+1) leading zeros and (31-b) ones, i.e. the largest value that can be shifted left by b without overflow
8419 saturation_mask = _mm_cmpgt_epi32 (a, maskA); //0xff...ff if we need saturation, 0 otherwise
8420 c7ffffff_mask = _mm_srli_epi32(saturation_mask, 1); //saturated to 0x7f..ff when needed and zeros if not
8421 shift_res = _mm_slli_epi32 (a, b);
8422 shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res);
8423 //result with positive numbers saturated
8424 shift_res = _mm_or_si128 (c7ffffff_mask, shift_res_mask);
8425 //treat negative numbers
8426 maskA = _mm_slli_epi32(c1, 31 - b); //mask for negative numbers: (b+1) leading ones and (31-b) zeros, i.e. the most negative value that can be shifted left by b without overflow
8427 saturation_mask = _mm_cmpgt_epi32 (maskA,a); //0xff...ff if we need saturation, 0 otherwise
8428 c7ffffff_mask = _mm_slli_epi32(saturation_mask, 31); //saturated to 0x80..00 when needed and zeros if not
8429 shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res);
8430 return _mm_or_si128 (c7ffffff_mask, shift_res_mask);
8431}
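//Worked example of the masks above (illustration only), for b = 1:
//  maskA = 0xffffffff >> 2 = 0x3fffffff is the largest int32 that survives "<< 1" without overflow;
//  a = 0x40000000 > maskA, so saturation_mask = 0xffffffff and the lane becomes 0x7fffffff;
//  a = 0x20000000 is below the threshold and is simply shifted to 0x40000000.
//The negative branch mirrors this with the threshold 0xffffffff << 30 = 0xc0000000 (i.e. -2^30) and the saturation value 0x80000000.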
8432
8433_NEON2SSESTORAGE int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0
8434_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
8435{
8436 // no effective SIMD solution here
8437 _NEON2SSE_ALIGN_16 int64_t atmp[2], res[2];
8438 int64_t bmask;
8439 int i;
8440 bmask = ( int64_t)1 << (63 - b); //positive
8441 _mm_store_si128((__m128i*)atmp, a);
8442 for (i = 0; i<2; i++) {
8443 if (atmp[i] >= bmask) {
8444 res[i] = ~(_SIGNBIT64);
8445 } else {
8446 res[i] = (atmp[i] <= -bmask) ? (int64_t)_SIGNBIT64 : atmp[i] << b;
8447 }
8448 }
8449 return _mm_load_si128((__m128i*)res);
8450}
8451
8452_NEON2SSESTORAGE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0
8453_NEON2SSE_INLINE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b) // VQSHL.U8 q0,q0,#0
8454{
8455 // go to 16 bit to get the auto saturation (in packs function)
8456 __m128i a128, r128_1, r128_2;
8457 a128 = _MM_CVTEPU8_EPI16 (a); //SSE 4.1
8458 r128_1 = _mm_slli_epi16 (a128, b);
8459 //swap hi and low part of a128 to process the remaining data
8460 a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
8461 a128 = _MM_CVTEPU8_EPI16 (a128);
8462 r128_2 = _mm_slli_epi16 (a128, b);
8463 return _mm_packus_epi16 (r128_1, r128_2); //saturated u8
8464}
8465
8466_NEON2SSESTORAGE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.U16 q0,q0,#0
8467_NEON2SSE_INLINE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b) // VQSHL.U16 q0,q0,#0
8468{
8469 // a manual saturation solution looks more optimal than the 32-bit conversion one
8470 __m128i cb, c8000, a_signed, saturation_mask, shift_res;
8471 cb = _mm_set1_epi16((1 << (16 - b)) - 1 - 0x8000 );
8472 c8000 = _mm_set1_epi16 (-32768); // (int16_t)0x8000
8473//no unsigned 16-bit comparison in SSE, only the signed one is available, hence the trick
8474 a_signed = _mm_sub_epi16(a, c8000); //go to signed
8475 saturation_mask = _mm_cmpgt_epi16 (a_signed, cb);
8476 shift_res = _mm_slli_epi16 (a, b);
8477 return _mm_or_si128 (shift_res, saturation_mask);
8478}
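//The trick above in scalar terms (illustration only): the unsigned comparison a > (0xffff >> b) is replaced by the signed
//comparison (a - 0x8000) > ((0xffff >> b) - 0x8000), which is what the _mm_sub_epi16 / _mm_cmpgt_epi16 pair computes.
//E.g. for b = 4 the threshold is 0x0fff: a = 0x1234 saturates (the OR with the mask makes the lane 0xffff),
//while a = 0x0123 is shifted to 0x1230 unchanged.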
8479
8480_NEON2SSESTORAGE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0
8481_NEON2SSE_INLINE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b) // VQSHL.U32 q0,q0,#0
8482{
8483 // manual saturation solution: there is no 64-bit saturating pack, and the serial version may be faster
8484 __m128i cb, c80000000, a_signed, saturation_mask, shift_res;
8485 cb = _mm_set1_epi32((1 << (32 - b)) - 1 - 0x80000000 );
8486 c80000000 = _mm_set1_epi32 (0x80000000);
8487//no unsigned 32-bit comparison in SSE, only the signed one is available, hence the trick
8488 a_signed = _mm_sub_epi32(a, c80000000); //go to signed
8489 saturation_mask = _mm_cmpgt_epi32 (a_signed, cb);
8490 shift_res = _mm_slli_epi32 (a, b);
8491 return _mm_or_si128 (shift_res, saturation_mask);
8492}
8493
8494_NEON2SSESTORAGE uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0
8495_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
8496{
8497 // no effective SIMD solution here
8498 _NEON2SSE_ALIGN_16 uint64_t atmp[2], res[2];
8499 uint64_t bmask;
8500 int i;
8501 bmask = ( uint64_t)1 << (64 - b);
8502 _mm_store_si128((__m128i*)atmp, a);
8503 for (i = 0; i<2; i++) {
8504 res[i] = (atmp[i] >= bmask)&&(b>0) ? 0xffffffffffffffff : atmp[i] << b; //if b=0 we are fine with any a
8505 }
8506 return _mm_load_si128((__m128i*)res);
8507}
8508
8509//**************Vector signed->unsigned saturating shift left by constant *************
8510//*************************************************************************************
8511_NEON2SSESTORAGE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0
8512_NEON2SSE_INLINE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHLU.S8 d0,d0,#0
8513{
8514 //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function)
8515 uint8x8_t res64;
8516 __m128i a128, r128;
8517 a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
8518 r128 = _mm_slli_epi16 (a128, b);
8519 r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only
8520 return64(r128);
8521}
8522
8523_NEON2SSESTORAGE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0
8524_NEON2SSE_INLINE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHLU.S16 d0,d0,#0
8525{
8526 uint16x4_t res64;
8527 __m128i a128, r128;
8528 a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1
8529 r128 = _mm_slli_epi32 (a128, b); //shift_res
8530 r128 = _MM_PACKUS1_EPI32 (r128); //saturated u16, use 64 low bits only
8531 return64(r128);
8532}
8533
8534_NEON2SSESTORAGE uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0
8535_NEON2SSE_INLINE uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b)
8536{
8537 uint32x2_t res64;
8538 return64( vqshluq_n_s32(_pM128i(a), b));
8539}
8540
8541_NEON2SSESTORAGE uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0
8542_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) // no effective SIMD solution here, serial execution looks faster
8543{
8544 uint64x1_t res;
8545 uint64_t limit;
8546 if (a.m64_i64[0]<=0) {
8547 res.m64_u64[0] = 0;
8548 } else {
8549 limit = (uint64_t) 1 << (64 - b);
8550 res.m64_u64[0] = ( ((uint64_t)a.m64_i64[0]) >= limit) ? ~((uint64_t)0) : (uint64_t)a.m64_i64[0] << b;
8551 }
8552 return res;
8553}
8554
8555_NEON2SSESTORAGE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0
8556_NEON2SSE_INLINE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHLU.S8 q0,q0,#0
8557{
8558 __m128i a128, r128_1, r128_2;
8559 a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1
8560 r128_1 = _mm_slli_epi16 (a128, b);
8561 //swap hi and low part of a128 to process the remaining data
8562 a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
8563 a128 = _MM_CVTEPI8_EPI16 (a128);
8564 r128_2 = _mm_slli_epi16 (a128, b);
8565 return _mm_packus_epi16 (r128_1, r128_2); //saturated u8
8566}
8567
8568_NEON2SSESTORAGE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0
8569_NEON2SSE_INLINE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHLU.S16 q0,q0,#0
8570{
8571 // a manual saturation solution looks LESS optimal than the 32-bit conversion one
8572 __m128i a128, r128_1, r128_2;
8573 a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1
8574 r128_1 = _mm_slli_epi32 (a128, b); //shift_res
8575 //swap hi and low part of a128 to process the remaining data
8576 a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
8577 a128 = _MM_CVTEPI16_EPI32 (a128);
8578 r128_2 = _mm_slli_epi32 (a128, b);
8579 return _MM_PACKUS_EPI32 (r128_1, r128_2); //saturated u16
8580}
8581
8582_NEON2SSESTORAGE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0
8583_NEON2SSE_INLINE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHLU.S32 q0,q0,#0
8584{
8585 //the solution may not be optimal compared with the serial one
8586 __m128i zero, maskA, maskGT0, a0, a_masked, a_shift;
8587 zero = _mm_setzero_si128();
8588 maskA = _mm_cmpeq_epi32(a, a);
8589 maskA = _mm_slli_epi32(maskA,(32 - b)); // b ones and (32-b)zeros
8590 //saturate negative numbers to zero
8591 maskGT0 = _mm_cmpgt_epi32 (a, zero); //0xffffffff for positive numbers, zero otherwise (zero and negative numbers)
8592 a0 = _mm_and_si128 (a, maskGT0); //negative are zeros now
8593 //saturate positive to 0xffffffff
8594 a_masked = _mm_and_si128 (a0, maskA);
8595 a_masked = _mm_cmpgt_epi32 (a_masked, zero); //0xffffffff if saturation necessary 0 otherwise
8596 a_shift = _mm_slli_epi32 (a0, b);
8597 return _mm_or_si128 (a_shift, a_masked); //actual saturation
8598}
8599
8600_NEON2SSESTORAGE uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0
8601_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
8602{
8603 // no effective SIMD solution here, serial execution looks faster
8604 _NEON2SSE_ALIGN_16 int64_t atmp[2];
8605 _NEON2SSE_ALIGN_16 uint64_t res[2];
8606 uint64_t limit;
8607 int i;
8608 _mm_store_si128((__m128i*)atmp, a);
8609 for (i = 0; i<2; i++) {
8610 if (atmp[i]<=0) {
8611 res[i] = 0;
8612 } else {
8613 limit = (uint64_t) 1 << (64 - b);
8614 res[i] = ( ((uint64_t)atmp[i]) >= limit) ? ~((uint64_t)0) : (uint64_t)atmp[i] << b;
8615 }
8616 }
8617 return _mm_load_si128((__m128i*)res);
8618}
8619
8620//************** Vector narrowing shift right by constant **************
8621//**********************************************************************
8622_NEON2SSESTORAGE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
8623_NEON2SSE_INLINE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8
8624{
8625 int8x8_t res64;
8626 __m128i r16;
8627 r16 = vshrq_n_s16(a,b);
8628 r16 = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
8629 return64(r16);
8630}
8631
8632_NEON2SSESTORAGE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
8633_NEON2SSE_INLINE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16
8634{
8635 int16x4_t res64;
8636 __m128i r32;
8637 r32 = vshrq_n_s32(a,b);
8638 r32 = _mm_shuffle_epi8 (r32, *(__m128i*) mask8_32_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
8639 return64(r32);
8640}
8641
8642_NEON2SSESTORAGE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
8643_NEON2SSE_INLINE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b)
8644{
8645 int32x2_t res64;
8646 __m128i r64;
8647 r64 = vshrq_n_s64(a,b);
8648 r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
8649 return64(r64);
8650}
8651
8652_NEON2SSESTORAGE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
8653_NEON2SSE_INLINE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8
8654{
8655 uint8x8_t res64;
8656 __m128i mask, r16;
8657 mask = _mm_set1_epi16(0xff);
8658 r16 = vshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
8659 r16 = _mm_and_si128(r16, mask); //to avoid saturation
8660 r16 = _mm_packus_epi16 (r16,r16); //narrow, use low 64 bits only
8661 return64(r16);
8662}
8663
8664_NEON2SSESTORAGE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
8665_NEON2SSE_INLINE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16
8666{
8667 uint16x4_t res64;
8668 __m128i mask, r32;
8669 mask = _mm_set1_epi32(0xffff);
8670 r32 = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
8671 r32 = _mm_and_si128(r32, mask); //to avoid saturation
8672 r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only
8673 return64(r32);
8674}
8675
8676_NEON2SSESTORAGE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
8677_NEON2SSE_INLINE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
8678{
8679 uint32x2_t res64;
8680 __m128i r64;
8681 r64 = vshrq_n_u64(a,b);
8682 r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
8683 return64(r64);
8684}
8685
8686//************** Vector signed->unsigned narrowing saturating shift right by constant ********
8687//*********************************************************************************************
8688_NEON2SSESTORAGE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8
8689_NEON2SSE_INLINE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRUN.S16 d0,q0,#8
8690{
8691 uint8x8_t res64;
8692 __m128i r16;
8693 r16 = vshrq_n_s16(a,b);
8694 r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow (signed to unsigned), use low 64 bits only
8695 return64(r16);
8696}
8697
8698_NEON2SSESTORAGE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16
8699_NEON2SSE_INLINE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRUN.S32 d0,q0,#16
8700{
8701 uint16x4_t res64;
8702 __m128i r32;
8703 r32 = vshrq_n_s32(a,b);
8704 r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow(signed to unsigned), use low 64 bits only
8705 return64(r32);
8706}
8707
8708_NEON2SSESTORAGE uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32
8709_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster
8710{
8711 _NEON2SSE_ALIGN_16 int64_t atmp[2];
8712 uint32x2_t res;
8713 int64_t res64;
8714 _mm_store_si128((__m128i*)atmp, a);
8715 if (atmp[0] < 0) {
8716 res.m64_u32[0] = 0;
8717 } else {
8718 res64 = (atmp[0] >> b);
8719 res.m64_u32[0] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t) res64;
8720 }
8721 if (atmp[1] < 0) {
8722 res.m64_u32[1] = 0;
8723 } else {
8724 res64 = (atmp[1] >> b);
8725 res.m64_u32[1] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t)res64;
8726 }
8727 return res;
8728}
8729
8730//**** Vector signed->unsigned rounding narrowing saturating shift right by constant *****
8731_NEON2SSESTORAGE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8
8732_NEON2SSE_INLINE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRUN.S16 d0,q0,#8
8733{
8734 //the solution may not be optimal compared with the serial one
8735 __m128i r16;
8736 uint8x8_t res64;
8737 r16 = vrshrq_n_s16(a,b);
8738 r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow (signed to unsigned), use low 64 bits only
8739 return64(r16);
8740}
8741
8742_NEON2SSESTORAGE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16
8743_NEON2SSE_INLINE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRUN.S32 d0,q0,#16
8744{
8745 //the solution may not be optimal compared with the serial one
8746 __m128i r32;
8747 uint16x4_t res64;
8748 r32 = vrshrq_n_s32(a,b);
8749 r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow (signed to unsigned), use low 64 bits only
8750 return64(r32);
8751}
8752
8753_NEON2SSESTORAGE uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32
8754_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster
8755{
8756 _NEON2SSE_ALIGN_16 int64_t atmp[2];
8757 uint32x2_t res;
8758 int64_t res64;
8759 _mm_store_si128((__m128i*)atmp, a);
8760 if (atmp[0] < 0) {
8761 res.m64_u32[0] = 0;
8762 } else {
8763 res64 = (atmp[0] >> b) + ( (atmp[0] & ((int64_t)1 << (b - 1))) >> (b - 1) );
8764 res.m64_u32[0] = (uint32_t) ((res64 > (int64_t)0xffffffff ) ? 0xffffffff : res64);
8765 }
8766 if (atmp[1] < 0) {
8767 res.m64_u32[1] = 0;
8768 } else {
8769 res64 = (atmp[1] >> b) + ( (atmp[1] & ((int64_t)1 << (b - 1))) >> (b - 1) );
8770 res.m64_u32[1] = (uint32_t)((res64 > (int64_t)0xffffffff ) ? 0xffffffff : res64);
8771 }
8772 return res;
8773}
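//The rounding term used above (and in the other serial vqrshr* versions) is simply the last bit shifted out:
//  rounded = (a >> b) + ((a >> (b - 1)) & 1)
//e.g. a = 7, b = 2: 7 >> 2 = 1, the last bit shifted out is 1, so the rounded result is 2 (7/4 = 1.75 -> 2).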
8774
8775//***** Vector narrowing saturating shift right by constant ******
8776//*****************************************************************
8777_NEON2SSESTORAGE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8
8778_NEON2SSE_INLINE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRN.S16 d0,q0,#8
8779{
8780 int8x8_t res64;
8781 __m128i r16;
8782 r16 = vshrq_n_s16(a,b);
8783 r16 = _mm_packs_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
8784 return64(r16);
8785}
8786
8787_NEON2SSESTORAGE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16
8788_NEON2SSE_INLINE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRN.S32 d0,q0,#16
8789{
8790 int16x4_t res64;
8791 __m128i r32;
8792 r32 = vshrq_n_s32(a,b);
8793 r32 = _mm_packs_epi32 (r32,r32); //saturate and narrow, use low 64 bits only
8794 return64(r32);
8795}
8796
8797_NEON2SSESTORAGE int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32
8798_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
8799{
8800 //no optimal SIMD solution found
8801 _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2];
8802 int32x2_t res;
8803 _mm_store_si128((__m128i*)atmp, a);
8804 res64[0] = (atmp[0] >> b);
8805 res64[1] = (atmp[1] >> b);
8806 if(res64[0]>SINT_MAX) res64[0] = SINT_MAX;
8807 if(res64[0]<SINT_MIN) res64[0] = SINT_MIN;
8808 if(res64[1]>SINT_MAX) res64[1] = SINT_MAX;
8809 if(res64[1]<SINT_MIN) res64[1] = SINT_MIN;
8810 res.m64_i32[0] = (int32_t)res64[0];
8811 res.m64_i32[1] = (int32_t)res64[1];
8812 return res;
8813}
8814
8815_NEON2SSESTORAGE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.U16 d0,q0,#8
8816_NEON2SSE_INLINE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQSHRN.U16 d0,q0,#8
8817{
8818 uint8x8_t res64;
8819 __m128i r16;
8820 r16 = vshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
8821 r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
8822 return64(r16);
8823}
8824
8825_NEON2SSESTORAGE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16
8826_NEON2SSE_INLINE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQSHRN.U32 d0,q0,#16
8827{
8828 uint16x4_t res64;
8829 __m128i r32;
8830 r32 = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
8831 r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only
8832 return64(r32);
8833}
8834
8835_NEON2SSESTORAGE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32
8836_NEON2SSE_INLINE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
8837{
8838 //serial solution may be faster
8839 uint32x2_t res64;
8840 __m128i r64, res_hi, zero;
8841 zero = _mm_setzero_si128();
8842 r64 = vshrq_n_u64(a,b);
8843 res_hi = _mm_srli_epi64(r64, 32);
8844 res_hi = _mm_cmpgt_epi32(res_hi, zero);
8845 r64 = _mm_or_si128(r64, res_hi);
8846 r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
8847 return64(r64);
8848}
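//How the saturation above works: after the logical shift each 64-bit lane must fit into 32 bits. res_hi holds the upper
//32 bits of every lane (moved to the lane's low half by _mm_srli_epi64); since b >= 1 that half is below 2^31, so the
//signed _mm_cmpgt_epi32 against zero turns any non-zero upper half into 0xffffffff, and OR-ing it back forces the low
//half to the saturated value 0xffffffff. The final shuffle then packs the two low halves together.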
8849
8850
8851//********* Vector rounding narrowing shift right by constant *************************
8852//****************************************************************************************
8853_NEON2SSESTORAGE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
8854_NEON2SSE_INLINE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8
8855{
8856 int8x8_t res64;
8857 __m128i r16;
8858 r16 = vrshrq_n_s16(a,b);
8859 r16 = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
8860 return64(r16);
8861}
8862
8863_NEON2SSESTORAGE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
8864_NEON2SSE_INLINE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16
8865{
8866 int16x4_t res64;
8867 __m128i r32;
8868 r32 = vrshrq_n_s32(a,b);
8869 r32 = _mm_shuffle_epi8 (r32, *(__m128i*) mask8_32_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
8870 return64(r32);
8871}
8872
8873_NEON2SSESTORAGE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
8874_NEON2SSE_INLINE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b)
8875{
8876 int32x2_t res64;
8877 __m128i r64;
8878 r64 = vrshrq_n_s64(a,b);
8879 r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
8880 return64(r64);
8881}
8882
8883_NEON2SSESTORAGE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
8884_NEON2SSE_INLINE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8
8885{
8886 uint8x8_t res64;
8887 __m128i mask, r16;
8888 mask = _mm_set1_epi16(0xff);
8889 r16 = vrshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
8890 r16 = _mm_and_si128(r16, mask); //to avoid saturation
8891 r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
8892 return64(r16);
8893}
8894
8895_NEON2SSESTORAGE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
8896_NEON2SSE_INLINE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16
8897{
8898 uint16x4_t res64;
8899 __m128i mask, r32;
8900 mask = _mm_set1_epi32(0xffff);
8901 r32 = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
8902 r32 = _mm_and_si128(r32, mask); //to avoid saturation
8903 r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only
8904 return64(r32);
8905}
8906
8907_NEON2SSESTORAGE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
8908_NEON2SSE_INLINE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b) //serial solution may be faster
8909{
8910 uint32x2_t res64;
8911 __m128i r64;
8912 r64 = vrshrq_n_u64(a,b);
8913 r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
8914 return64(r64);
8915}
8916
8917//************* Vector rounding narrowing saturating shift right by constant ************
8918//****************************************************************************************
8919_NEON2SSESTORAGE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8
8920_NEON2SSE_INLINE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRN.S16 d0,q0,#8
8921{
8922 int8x8_t res64;
8923 __m128i r16;
8924 r16 = vrshrq_n_s16(a,b);
8925 r16 = _mm_packs_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
8926 return64(r16);
8927}
8928
8929_NEON2SSESTORAGE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16
8930_NEON2SSE_INLINE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRN.S32 d0,q0,#16
8931{
8932 int16x4_t res64;
8933 __m128i r32;
8934 r32 = vrshrq_n_s32(a,b);
8935 r32 = _mm_packs_epi32 (r32,r32); //saturate and narrow, use low 64 bits only
8936 return64(r32);
8937}
8938
8939_NEON2SSESTORAGE int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32
8940_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
8941{
8942 //no optimal SIMD solution found
8943 _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2], maskb[2];
8944 int32x2_t res;
8945 _mm_store_si128((__m128i*)atmp, a);
8946 maskb[0] = atmp[0] & (( int64_t)1 << (b - 1));
8947 res64[0] = (atmp[0] >> b) + (maskb[0] >> (b - 1)); //rounded result
8948 maskb[1] = atmp[1] & (( int64_t)1 << (b - 1));
8949 res64[1] = (atmp[1] >> b) + (maskb[1] >> (b - 1)); //rounded result
8950 if(res64[0]>SINT_MAX) res64[0] = SINT_MAX;
8951 if(res64[0]<SINT_MIN) res64[0] = SINT_MIN;
8952 if(res64[1]>SINT_MAX) res64[1] = SINT_MAX;
8953 if(res64[1]<SINT_MIN) res64[1] = SINT_MIN;
8954 res.m64_i32[0] = (int32_t)res64[0];
8955 res.m64_i32[1] = (int32_t)res64[1];
8956 return res;
8957}
8958
8959_NEON2SSESTORAGE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.U16 d0,q0,#8
8960_NEON2SSE_INLINE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQRSHRN.U16 d0,q0,#8
8961{
8962 uint8x8_t res64;
8963 __m128i r16;
8964 r16 = vrshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
8965 r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
8966 return64(r16);
8967}
8968
8969_NEON2SSESTORAGE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16
8970_NEON2SSE_INLINE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQRSHRN.U32 d0,q0,#16
8971{
8972 uint16x4_t res64;
8973 __m128i r32;
8974 r32 = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
8975 r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only
8976 return64(r32);
8977}
8978
8979_NEON2SSESTORAGE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32
8980_NEON2SSE_INLINE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
8981{
8982 //serial solution may be faster
8983 uint32x2_t res64;
8984 __m128i r64, res_hi, zero;
8985 zero = _mm_setzero_si128();
8986 r64 = vrshrq_n_u64(a,b);
8987 res_hi = _mm_srli_epi64(r64, 32);
8988 res_hi = _mm_cmpgt_epi32(res_hi, zero);
8989 r64 = _mm_or_si128(r64, res_hi);
8990 r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
8991 return64(r64);
8992}
8993
8994//************** Vector widening shift left by constant ****************
8995//************************************************************************
8996_NEON2SSESTORAGE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0
8997_NEON2SSE_INLINE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b) // VSHLL.S8 q0,d0,#0
8998{
8999 __m128i r;
9000 r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
9001 return _mm_slli_epi16 (r, b);
9002}
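//Illustration: vshll_n_s8 first widens to 16 bits and then shifts, so no bits are lost;
//e.g. an input lane of -3 shifted by 8 yields -768, a value that an 8-bit shift could not hold.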
9003
9004_NEON2SSESTORAGE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0
9005_NEON2SSE_INLINE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b) // VSHLL.S16 q0,d0,#0
9006{
9007 __m128i r;
9008 r = _MM_CVTEPI16_EPI32(_pM128i(a)); //SSE4.1,
9009 return _mm_slli_epi32 (r, b);
9010}
9011
9012_NEON2SSESTORAGE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0
9013_NEON2SSE_INLINE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b) // VSHLL.S32 q0,d0,#0
9014{
9015 __m128i r;
9016 r = _MM_CVTEPI32_EPI64(_pM128i(a)); //SSE4.1,
9017 return _mm_slli_epi64 (r, b);
9018}
9019
9020_NEON2SSESTORAGE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0
9021_NEON2SSE_INLINE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b) // VSHLL.U8 q0,d0,#0
9022{
9023 //no uint8 to uint16 conversion available, manual conversion used
9024 __m128i zero, r;
9025 zero = _mm_setzero_si128 ();
9026 r = _mm_unpacklo_epi8(_pM128i(a), zero);
9027 return _mm_slli_epi16 (r, b);
9028}
9029
9030_NEON2SSESTORAGE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.U16 q0,d0,#0
9031_NEON2SSE_INLINE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b) // VSHLL.U16 q0,d0,#0
9032{
9033 //no uint16 to uint32 conversion available, manual conversion used
9034 __m128i zero, r;
9035 zero = _mm_setzero_si128 ();
9036 r = _mm_unpacklo_epi16(_pM128i(a), zero);
9037 return _mm_slli_epi32 (r, b);
9038}
9039
9040_NEON2SSESTORAGE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0
9041_NEON2SSE_INLINE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b) // VSHLL.U32 q0,d0,#0
9042{
9043 //no uint32 to uint64 conversion available, manual conversion used
9044 __m128i zero, r;
9045 zero = _mm_setzero_si128 ();
9046 r = _mm_unpacklo_epi32(_pM128i(a), zero);
9047 return _mm_slli_epi64 (r, b);
9048}
9049
9050//************************************************************************************
9051//**************************** Shifts with insert ************************************
9052//************************************************************************************
9053//takes each element in a vector, shifts it by an immediate value,
9054//and inserts the result in the destination vector. Bits shifted out of each element are lost.
9055
9056//**************** Vector shift right and insert ************************************
9057//Actually the "c" leftmost bits of "a" are the only bits remaining from "a" after the shift;
9058//all other bits are taken from the shifted "b".
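//A small 8-bit illustration: for a = 0xAB, b = 0xCD, c = 4 the result is
//  (a & 0xF0) | (b >> 4) = 0xA0 | 0x0C = 0xAC
//i.e. the top c bits come from "a" and the rest are the logically shifted "b".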
9059_NEON2SSESTORAGE int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
9060_NEON2SSE_INLINE int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c)
9061{
9062 int8x8_t res64;
9063 return64(vsriq_n_s8(_pM128i(a),_pM128i(b), c));
9064}
9065
9066
9067_NEON2SSESTORAGE int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
9068_NEON2SSE_INLINE int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c)
9069{
9070 int16x4_t res64;
9071 return64(vsriq_n_s16(_pM128i(a),_pM128i(b), c));
9072}
9073
9074
9075_NEON2SSESTORAGE int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
9076_NEON2SSE_INLINE int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c)
9077{
9078 int32x2_t res64;
9079 return64(vsriq_n_s32(_pM128i(a),_pM128i(b), c));
9080}
9081
9082
9083_NEON2SSESTORAGE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
9084_NEON2SSE_INLINE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c)
9085{
9086 int64x1_t res;
9087 if (c == 64)
9088 res = a;
9089 else {
9090 res.m64_i64[0] = (b.m64_u64[0] >> c) | ((a.m64_i64[0] >> (64 - c)) << (64 - c)); //treat b as unsigned for the shift to get leading zeros
9091 }
9092 return res;
9093}
9094
9095_NEON2SSE_GLOBAL uint8x8_t vsri_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
9096#define vsri_n_u8 vsri_n_s8
9097
9098_NEON2SSE_GLOBAL uint16x4_t vsri_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
9099#define vsri_n_u16 vsri_n_s16
9100
9101_NEON2SSE_GLOBAL uint32x2_t vsri_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
9102#define vsri_n_u32 vsri_n_s32
9103
9104
9105_NEON2SSE_GLOBAL uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
9106#define vsri_n_u64 vsri_n_s64
9107
9108_NEON2SSE_GLOBAL poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
9109#define vsri_n_p8 vsri_n_u8
9110
9111_NEON2SSE_GLOBAL poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
9112#define vsri_n_p16 vsri_n_u16
9113
9114_NEON2SSESTORAGE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
9115_NEON2SSE_INLINE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRI.8 q0,q0,#8
9116{
9117 __m128i maskA, a_masked;
9118 uint8x16_t b_shift;
9119 _NEON2SSE_ALIGN_16 static const uint8_t maskLeft[9] = {0x0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; //"a" bits mask, 0 bit not used
9120 maskA = _mm_set1_epi8(maskLeft[c]); // c ones and (8-c)zeros
9121 a_masked = _mm_and_si128 (a, maskA);
9122 b_shift = vshrq_n_u8( b, c); // c zeros on the left in b due to logical shift
9123 return _mm_or_si128 (a_masked, b_shift); //combine (insert b into a)
9124}
9125
9126_NEON2SSESTORAGE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
9127_NEON2SSE_INLINE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRI.16 q0,q0,#16
9128{
9129 //to keep only the "c" leftmost bits of a we shift right and then back left, leaving (16-c) zeros at the right of a
9130 uint16x8_t b_shift;
9131 uint16x8_t a_c;
9132 b_shift = vshrq_n_u16( b, c); // c zeros on the left in b due to logical shift
9133 a_c = vshrq_n_u16( a, (16 - c));
9134 a_c = _mm_slli_epi16(a_c, (16 - c)); //the logical shifts leave (16-c) zero bits at the right of a
9135 return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
9136}
9137
9138_NEON2SSESTORAGE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
9139_NEON2SSE_INLINE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRI.32 q0,q0,#32
9140{
9141 //to keep only the "c" leftmost bits of a we shift right and then back left, leaving (32-c) zeros at the right of a
9142 uint32x4_t b_shift;
9143 uint32x4_t a_c;
9144 b_shift = vshrq_n_u32( b, c); // c zeros on the left in b due to logical shift
9145 a_c = vshrq_n_u32( a, (32 - c));
9146 a_c = _mm_slli_epi32(a_c, (32 - c)); //the logical shifts leave (32-c) zero bits at the right of a
9147 return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
9148}
9149
9150_NEON2SSESTORAGE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
9151_NEON2SSE_INLINE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)
9152{
9153 //serial solution may be faster
9154 uint64x2_t b_shift;
9155 uint64x2_t a_c;
9156 b_shift = _mm_srli_epi64(b, c); // c zeros on the left in b due to logical shift
9157 a_c = _mm_srli_epi64(a, (64 - c));
9158 a_c = _mm_slli_epi64(a_c, (64 - c)); //the logical shifts leave (64-c) zero bits at the right of a
9159 return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
9160}
9161
9162_NEON2SSE_GLOBAL uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
9163#define vsriq_n_u8 vsriq_n_s8
9164
9165_NEON2SSE_GLOBAL uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
9166#define vsriq_n_u16 vsriq_n_s16
9167
9168_NEON2SSE_GLOBAL uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
9169#define vsriq_n_u32 vsriq_n_s32
9170
9171_NEON2SSE_GLOBAL uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
9172#define vsriq_n_u64 vsriq_n_s64
9173
9174_NEON2SSE_GLOBAL poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
9175#define vsriq_n_p8 vsriq_n_u8
9176
9177_NEON2SSE_GLOBAL poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
9178#define vsriq_n_p16 vsriq_n_u16
9179
9180//***** Vector shift left and insert *********************************************
9181//*********************************************************************************
9182//Actually the "c" rightmost bits of "a" are the only bits remaining from "a" after the shift;
9183//all other bits are taken from the shifted "b". Trailing zeros are inserted into b by the shift process, so we need to combine "a" and the shifted "b".
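//A small 8-bit illustration: for a = 0xAB, b = 0xCD, c = 4 the result is
//  (b << 4) | (a & 0x0F) = 0xD0 | 0x0B = 0xDB
//i.e. the bottom c bits come from "a" and the rest are the shifted "b".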
9184_NEON2SSESTORAGE int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
9185_NEON2SSE_INLINE int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c)
9186{
9187 int8x8_t res64;
9188 return64(vsliq_n_s8(_pM128i(a),_pM128i(b), c));
9189}
9190
9191
9192_NEON2SSESTORAGE int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
9193_NEON2SSE_INLINE int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c)
9194{
9195 int16x4_t res64;
9196 return64(vsliq_n_s16(_pM128i(a),_pM128i(b), c));
9197}
9198
9199
9200_NEON2SSESTORAGE int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
9201_NEON2SSE_INLINE int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c)
9202{
9203 int32x2_t res64;
9204 return64(vsliq_n_s32(_pM128i(a),_pM128i(b), c));
9205}
9206
9207_NEON2SSESTORAGE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
9208_NEON2SSE_INLINE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c)
9209{
9210 int64x1_t res;
9211 res.m64_i64[0] = (b.m64_i64[0] << c) | ((a.m64_u64[0] << (64 - c)) >> (64 - c)); //need to treat a as unsigned to get leading zeros
9212 return res;
9213}
9214
9215
9216_NEON2SSE_GLOBAL uint8x8_t vsli_n_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
9217#define vsli_n_u8 vsli_n_s8
9218
9219_NEON2SSE_GLOBAL uint16x4_t vsli_n_u16(uint16x4_t a, uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
9220#define vsli_n_u16 vsli_n_s16
9221
9222_NEON2SSE_GLOBAL uint32x2_t vsli_n_u32(uint32x2_t a, uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
9223#define vsli_n_u32 vsli_n_s32
9224
9225_NEON2SSE_GLOBAL uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
9226#define vsli_n_u64 vsli_n_s64
9227
9228_NEON2SSE_GLOBAL poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
9229#define vsli_n_p8 vsli_n_u8
9230
9231_NEON2SSE_GLOBAL poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
9232#define vsli_n_p16 vsli_n_u16
9233
9234_NEON2SSESTORAGE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
9235_NEON2SSE_INLINE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c) // VSLI.8 q0,q0,#0
9236{
9237 __m128i maskA, a_masked;
9238 int8x16_t b_shift;
9239 _NEON2SSE_ALIGN_16 static const uint8_t maskRight[8] = {0x0, 0x1, 0x3, 0x7, 0x0f, 0x1f, 0x3f, 0x7f}; //"a" bits mask
9240 maskA = _mm_set1_epi8(maskRight[c]); // (8-c)zeros and c ones
9241 b_shift = vshlq_n_s8( b, c);
9242 a_masked = _mm_and_si128 (a, maskA);
9243 return _mm_or_si128 (b_shift, a_masked); //combine (insert b into a)
9244}
9245
9246_NEON2SSESTORAGE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
9247_NEON2SSE_INLINE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c) // VSLI.16 q0,q0,#0
9248{
9249 //to keep only the "c" rightmost bits of a we shift left and then logically shift back right, leaving (16-c) zeros at the left of a
9250 int16x8_t b_shift;
9251 int16x8_t a_c;
9252 b_shift = vshlq_n_s16( b, c);
9253 a_c = vshlq_n_s16( a, (16 - c));
9254 a_c = _mm_srli_epi16(a_c, (16 - c));
9255 return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
9256}
9257
9258_NEON2SSESTORAGE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
9259_NEON2SSE_INLINE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c) // VSLI.32 q0,q0,#0
9260{
9261 //the solution may not be optimal compared with the serial one
9262 //to keep only the "c" rightmost bits of a we shift left and then logically shift back right, leaving (32-c) zeros at the left of a
9263 int32x4_t b_shift;
9264 int32x4_t a_c;
9265 b_shift = vshlq_n_s32( b, c);
9266 a_c = vshlq_n_s32( a, (32 - c));
9267 a_c = _mm_srli_epi32(a_c, (32 - c));
9268 return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
9269}
9270
9271_NEON2SSESTORAGE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
9272_NEON2SSE_INLINE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c) // VSLI.64 q0,q0,#0
9273{
9274 //the solution may not be optimal compared with the serial one
9275 //to keep only the "c" rightmost bits of a we shift left and then logically shift back right, leaving (64-c) zeros at the left of a
9276 int64x2_t b_shift;
9277 int64x2_t a_c;
9278 b_shift = vshlq_n_s64( b, c);
9279 a_c = vshlq_n_s64( a, (64 - c));
9280 a_c = _mm_srli_epi64(a_c, (64 - c));
9281 return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
9282}
9283
9284_NEON2SSE_GLOBAL uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
9285#define vsliq_n_u8 vsliq_n_s8
9286
9287_NEON2SSE_GLOBAL uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
9288#define vsliq_n_u16 vsliq_n_s16
9289
9290_NEON2SSE_GLOBAL uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
9291#define vsliq_n_u32 vsliq_n_s32
9292
9293_NEON2SSE_GLOBAL uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
9294#define vsliq_n_u64 vsliq_n_s64
9295
9296_NEON2SSE_GLOBAL poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
9297#define vsliq_n_p8 vsliq_n_u8
9298
9299_NEON2SSE_GLOBAL poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
9300#define vsliq_n_p16 vsliq_n_u16
9301
9302// ***********************************************************************************************
9303// ****************** Loads and stores of a single vector ***************************************
9304// ***********************************************************************************************
9305//Performs loads and stores of a single vector of some type.
9306//******************************* Loads ********************************************************
9307// ***********************************************************************************************
9308//We assume ptr is NOT aligned in the general case and use __m128i _mm_loadu_si128 ((__m128i*) ptr).
9309//On SSE3-capable systems __m128i _mm_lddqu_si128 (__m128i const* p) may be advantageous for unaligned access:
9310//it loads a 32-byte block aligned on a 16-byte boundary and extracts the 16 bytes corresponding to the unaligned access.
9311//If ptr is aligned, __m128i _mm_load_si128 ((__m128i*) ptr) could be used instead;
9312#define LOAD_SI128(ptr) \
9313 ( ((uintptr_t)(ptr) & 15) == 0 ) ? _mm_load_si128((__m128i*)(ptr)) : _mm_loadu_si128((__m128i*)(ptr))
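//Usage sketch (illustration only, the buffer name is hypothetical):
//  uint8_t buf[16];                //any alignment
//  __m128i v = LOAD_SI128(buf);    //aligned load when the pointer is 16-byte aligned, unaligned load otherwise
//The integer vld1q_* wrappers below simply expand to this macro.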
9314
9315_NEON2SSE_GLOBAL uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
9316#define vld1q_u8 LOAD_SI128
9317
9318_NEON2SSE_GLOBAL uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
9319#define vld1q_u16 LOAD_SI128
9320
9321_NEON2SSE_GLOBAL uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
9322#define vld1q_u32 LOAD_SI128
9323
9324_NEON2SSE_GLOBAL uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
9325#define vld1q_u64 LOAD_SI128
9326
9327_NEON2SSE_GLOBAL int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
9328#define vld1q_s8 LOAD_SI128
9329
9330_NEON2SSE_GLOBAL int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
9331#define vld1q_s16 LOAD_SI128
9332
9333_NEON2SSE_GLOBAL int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
9334#define vld1q_s32 LOAD_SI128
9335
9336_NEON2SSE_GLOBAL int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
9337#define vld1q_s64 LOAD_SI128
9338
9339_NEON2SSE_GLOBAL float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0]
9340// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers
9341/* _NEON2SSE_INLINE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr)// VLD1.16 {d0, d1}, [r0]
9342{__m128 f1 = _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]);
9343__m128 f2;
9344f2 = _mm_set_ps (ptr[7], ptr[6], ptr[5], ptr[4]);
9345}*/
9346
9347_NEON2SSESTORAGE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
9348_NEON2SSE_INLINE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr)
9349{
9350 if( (((uintptr_t)(ptr)) & 15 ) == 0 ) //16-byte aligned
9351 return _mm_load_ps(ptr);
9352 else
9353 return _mm_loadu_ps(ptr);
9354}
9355
9356_NEON2SSE_GLOBAL poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
9357#define vld1q_p8 LOAD_SI128
9358
9359_NEON2SSE_GLOBAL poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
9360#define vld1q_p16 LOAD_SI128
9361
9362_NEON2SSE_GLOBAL uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0]
9363#define vld1_u8(ptr) *((__m64_128*)(ptr)) //was _mm_loadl_epi64((__m128i*)(ptr))
9364
9365_NEON2SSE_GLOBAL uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0]
9366#define vld1_u16 vld1_u8
9367
9368_NEON2SSE_GLOBAL uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0]
9369#define vld1_u32 vld1_u8
9370
9371
9372_NEON2SSE_GLOBAL uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
9373#define vld1_u64 vld1_u8
9374
9375_NEON2SSE_GLOBAL int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0]
9376#define vld1_s8 vld1_u8
9377
9378_NEON2SSE_GLOBAL int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0]
9379#define vld1_s16 vld1_u16
9380
9381_NEON2SSE_GLOBAL int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0]
9382#define vld1_s32 vld1_u32
9383
9384_NEON2SSE_GLOBAL int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
9385#define vld1_s64 vld1_u64
9386
9387_NEON2SSE_GLOBAL float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0]
9388// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit like _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]);
9389
9390_NEON2SSESTORAGE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0]
9391_NEON2SSE_INLINE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr)
9392{
9393 float32x2_t res;
9394 res.m64_f32[0] = *(ptr);
9395 res.m64_f32[1] = *(ptr + 1);
9396 return res;
9397}
9398
9399_NEON2SSE_GLOBAL poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0]
9400#define vld1_p8 vld1_u8
9401
9402_NEON2SSE_GLOBAL poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0]
9403#define vld1_p16 vld1_u16
9404
9405
9406_NEON2SSESTORAGE float64x2_t vld1q_f64(__transfersize(2) float64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
9407_NEON2SSE_INLINE float64x2_t vld1q_f64(__transfersize(2) float64_t const * ptr)
9408{
9409 if ((((uintptr_t)(ptr)) & 15) == 0) //16-byte aligned
9410 return _mm_load_pd(ptr);
9411 else
9412 return _mm_loadu_pd(ptr);
9413}
9414
9415
9416//***********************************************************************************************************
9417//******* Lane load functions - insert the data at vector's given position (lane) *************************
9418//***********************************************************************************************************
9419_NEON2SSE_GLOBAL uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
9420#define vld1q_lane_u8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
9421
9422_NEON2SSE_GLOBAL uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
9423#define vld1q_lane_u16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
9424
9425_NEON2SSE_GLOBAL uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
9426#define vld1q_lane_u32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane)
9427
9428_NEON2SSE_GLOBAL uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
9429#define vld1q_lane_u64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane)
9430
9431
9432_NEON2SSE_GLOBAL int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
9433#define vld1q_lane_s8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
9434
9435_NEON2SSE_GLOBAL int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
9436#define vld1q_lane_s16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
9437
9438_NEON2SSE_GLOBAL int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
9439#define vld1q_lane_s32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane)
9440
9441_NEON2SSE_GLOBAL float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
9442//current IA SIMD doesn't support float16
9443
9444_NEON2SSESTORAGE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
9445_NEON2SSE_INLINE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane)
9446{
9447 //we need to handle the case when ptr is not 16-byte aligned, hence the scalar load
9448 __m128 p;
9449 p = _mm_set1_ps(*(ptr));
9450 return _MM_INSERT_PS(vec, p, _INSERTPS_NDX(0, lane));
9451}
9452
9453_NEON2SSE_GLOBAL int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
9454#define vld1q_lane_s64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane)
9455
9456_NEON2SSE_GLOBAL poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
9457#define vld1q_lane_p8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
9458
9459_NEON2SSE_GLOBAL poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
9460#define vld1q_lane_p16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
9461
9462_NEON2SSESTORAGE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
9463_NEON2SSE_INLINE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane)
9464{
9465 uint8x8_t res;
9466 res = vec;
9467 res.m64_u8[lane] = *(ptr);
9468 return res;
9469}
9470
9471_NEON2SSESTORAGE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
9472_NEON2SSE_INLINE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane)
9473{
9474 uint16x4_t res;
9475 res = vec;
9476 res.m64_u16[lane] = *(ptr);
9477 return res;
9478}
9479
9480_NEON2SSESTORAGE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
9481_NEON2SSE_INLINE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane)
9482{
9483 uint32x2_t res;
9484 res = vec;
9485 res.m64_u32[lane] = *(ptr);
9486 return res;
9487}
9488
9489_NEON2SSESTORAGE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0]
9490_NEON2SSE_INLINE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane)
9491{
9492 uint64x1_t res;
9493 UNREFERENCED_PARAMETER(vec);
9494 UNREFERENCED_PARAMETER(lane);
9495 res.m64_u64[0] = *(ptr);
9496 return res;
9497}
9498
9499
9500_NEON2SSE_GLOBAL int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
9501#define vld1_lane_s8(ptr, vec, lane) vld1_lane_u8((uint8_t*)ptr, vec, lane)
9502
9503_NEON2SSE_GLOBAL int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
9504#define vld1_lane_s16(ptr, vec, lane) vld1_lane_u16((uint16_t*)ptr, vec, lane)
9505
9506_NEON2SSE_GLOBAL int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
9507#define vld1_lane_s32(ptr, vec, lane) vld1_lane_u32((uint32_t*)ptr, vec, lane)
9508
9509_NEON2SSE_GLOBAL float16x4_t vld1_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
9510//current IA SIMD doesn't support float16
9511
9512_NEON2SSESTORAGE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
9513_NEON2SSE_INLINE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane)
9514{
9515 float32x2_t res;
9516 res = vec;
9517 res.m64_f32[lane] = *(ptr);
9518 return res;
9519}
9520
9521_NEON2SSE_GLOBAL int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0]
9522#define vld1_lane_s64(ptr, vec, lane) vld1_lane_u64((uint64_t*)ptr, vec, lane)
9523
9524_NEON2SSE_GLOBAL poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
9525#define vld1_lane_p8 vld1_lane_u8
9526
9527_NEON2SSE_GLOBAL poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
9528#define vld1_lane_p16 vld1_lane_s16
9529
// ****************** Load a single value (set all lanes of the vector to the same value from memory) **********************
9531// ******************************************************************************************************************
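//A minimal usage sketch (illustrative only; the variable name is hypothetical): broadcast one
//scalar from memory into every lane of the vector.
//    float32_t coeff = 2.5f;
//    float32x4_t vc = vld1q_dup_f32(&coeff); //{2.5, 2.5, 2.5, 2.5}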
9532_NEON2SSE_GLOBAL uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
9533#define vld1q_dup_u8(ptr) _mm_set1_epi8(*(ptr))
9534
9535_NEON2SSE_GLOBAL uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
9536#define vld1q_dup_u16(ptr) _mm_set1_epi16(*(ptr))
9537
9538_NEON2SSE_GLOBAL uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
9539#define vld1q_dup_u32(ptr) _mm_set1_epi32(*(ptr))
9540
9541_NEON2SSESTORAGE uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
9542_NEON2SSE_INLINE uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr)
9543{
9544 _NEON2SSE_ALIGN_16 uint64_t val[2];
9545
9546 val[0] = *(ptr);
9547 val[1] = *(ptr);
9548
9549 return LOAD_SI128(val);
9550}
9551
9552_NEON2SSE_GLOBAL int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
9553#define vld1q_dup_s8(ptr) _mm_set1_epi8(*(ptr))
9554
9555_NEON2SSE_GLOBAL int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
9556#define vld1q_dup_s16(ptr) _mm_set1_epi16 (*(ptr))
9557
9558_NEON2SSE_GLOBAL int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
9559#define vld1q_dup_s32(ptr) _mm_set1_epi32 (*(ptr))
9560
9561_NEON2SSE_GLOBAL int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
9562#define vld1q_dup_s64(ptr) vld1q_dup_u64((uint64_t*)ptr)
9563
9564_NEON2SSE_GLOBAL float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
9565//current IA SIMD doesn't support float16, need to go to 32 bits
9566
9567_NEON2SSE_GLOBAL float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
9568#define vld1q_dup_f32(ptr) _mm_set1_ps (*(ptr))
9569
9570_NEON2SSE_GLOBAL poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
9571#define vld1q_dup_p8(ptr) _mm_set1_epi8(*(ptr))
9572
9573_NEON2SSE_GLOBAL poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
9574#define vld1q_dup_p16(ptr) _mm_set1_epi16 (*(ptr))
9575
9576_NEON2SSESTORAGE uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
9577_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
9578{
9579 uint8x8_t res;
9580 int i;
9581 for(i = 0; i<8; i++) {
9582 res.m64_u8[i] = *(ptr);
9583 }
9584 return res;
9585}
9586
9587_NEON2SSESTORAGE uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
9588_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
9589{
9590 uint16x4_t res;
9591 int i;
9592 for(i = 0; i<4; i++) {
9593 res.m64_u16[i] = *(ptr);
9594 }
9595 return res;
9596}
9597
9598_NEON2SSESTORAGE uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
9599_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
9600{
9601 uint32x2_t res;
9602 res.m64_u32[0] = *(ptr);
9603 res.m64_u32[1] = *(ptr);
9604 return res;
9605}
9606
9607_NEON2SSESTORAGE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
9608_NEON2SSE_INLINE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr)
9609{
9610 uint64x1_t res;
9611 res.m64_u64[0] = *(ptr);
9612 return res;
9613}
9614
9615_NEON2SSE_GLOBAL int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
9616#define vld1_dup_s8(ptr) vld1_dup_u8((uint8_t*)ptr)
9617
9618
9619_NEON2SSE_GLOBAL int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
9620#define vld1_dup_s16(ptr) vld1_dup_u16((uint16_t*)ptr)
9621
9622
9623_NEON2SSE_GLOBAL int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
9624#define vld1_dup_s32(ptr) vld1_dup_u32((uint32_t*)ptr)
9625
9626
9627_NEON2SSE_GLOBAL int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
9628#define vld1_dup_s64(ptr) vld1_dup_u64((uint64_t*)ptr)
9629
9630_NEON2SSE_GLOBAL float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
9631//current IA SIMD doesn't support float16
9632
9633_NEON2SSESTORAGE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
9634_NEON2SSE_INLINE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr)
9635{
9636 float32x2_t res;
9637 res.m64_f32[0] = *(ptr);
9638 res.m64_f32[1] = res.m64_f32[0];
9639 return res; // use last 64bits only
9640}
9641
9642_NEON2SSE_GLOBAL poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
9643#define vld1_dup_p8 vld1_dup_u8
9644
9645
9646_NEON2SSE_GLOBAL poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
9647#define vld1_dup_p16 vld1_dup_u16
9648
9649
9650//*************************************************************************************
9651//********************************* Store **********************************************
9652//*************************************************************************************
// If ptr is 16-byte aligned and you need to store the data without polluting the cache, you may use _mm_stream_si128 ((__m128i*)ptr, val) instead
//here we assume that ptr may be NOT 16-byte aligned in the general case; the following macro checks the alignment at run time and uses _mm_store_si128 only when it is safe
9655#define STORE_SI128(ptr, val) \
9656 (((uintptr_t)(ptr) & 15) == 0 ) ? _mm_store_si128 ((__m128i*)(ptr), val) : _mm_storeu_si128 ((__m128i*)(ptr), val);
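//A minimal usage sketch (illustrative only; a and b stand for previously computed uint32x4_t values).
//The macro above selects the aligned or unaligned store at run time, so any valid destination
//pointer works; _mm_stream_si128 remains an option for 16-byte aligned non-temporal stores.
//    uint32_t out[4];                        //no particular alignment required
//    vst1q_u32(out, vaddq_u32(a, b));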
9657
9658_NEON2SSE_GLOBAL void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0]
9659#define vst1q_u8 STORE_SI128
9660
9661_NEON2SSE_GLOBAL void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0]
9662#define vst1q_u16 STORE_SI128
9663
9664_NEON2SSE_GLOBAL void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0]
9665#define vst1q_u32 STORE_SI128
9666
9667_NEON2SSE_GLOBAL void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0]
9668#define vst1q_u64 STORE_SI128
9669
9670_NEON2SSE_GLOBAL void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0]
9671#define vst1q_s8 STORE_SI128
9672
9673_NEON2SSE_GLOBAL void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0]
9674#define vst1q_s16 STORE_SI128
9675
9676_NEON2SSE_GLOBAL void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0]
9677#define vst1q_s32 STORE_SI128
9678
9679_NEON2SSE_GLOBAL void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0]
9680#define vst1q_s64 STORE_SI128
9681
9682_NEON2SSE_GLOBAL void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0]
9683// IA32 SIMD doesn't work with 16bit floats currently
9684
9685_NEON2SSESTORAGE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0]
9686_NEON2SSE_INLINE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val)
9687{
    if( ((uintptr_t)(ptr) & 15) == 0 ) //16-byte aligned
9689 _mm_store_ps (ptr, val);
9690 else
9691 _mm_storeu_ps (ptr, val);
9692}
9693
9694_NEON2SSE_GLOBAL void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0]
9695#define vst1q_p8 vst1q_u8
9696
9697_NEON2SSE_GLOBAL void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0]
9698#define vst1q_p16 vst1q_u16
9699
9700_NEON2SSESTORAGE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0]
9701_NEON2SSE_INLINE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val)
9702{
9703 int i;
9704 for (i = 0; i<8; i++) {
9705 *(ptr + i) = ((uint8_t*)&val)[i];
9706 }
9707 //_mm_storel_epi64((__m128i*)ptr, val);
9708 return;
9709}
9710
9711_NEON2SSESTORAGE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0]
9712_NEON2SSE_INLINE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val)
9713{
9714 int i;
9715 for (i = 0; i<4; i++) {
9716 *(ptr + i) = ((uint16_t*)&val)[i];
9717 }
9718 //_mm_storel_epi64((__m128i*)ptr, val);
9719 return;
9720}
9721
9722_NEON2SSESTORAGE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0]
9723_NEON2SSE_INLINE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val)
9724{
9725 int i;
9726 for (i = 0; i<2; i++) {
9727 *(ptr + i) = ((uint32_t*)&val)[i];
9728 }
9729 //_mm_storel_epi64((__m128i*)ptr, val);
9730 return;
9731}
9732
9733_NEON2SSESTORAGE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0]
9734_NEON2SSE_INLINE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val)
9735{
9736 *(ptr) = *((uint64_t*)&val);
9737 //_mm_storel_epi64((__m128i*)ptr, val);
9738 return;
9739}
9740
9741_NEON2SSE_GLOBAL void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0]
9742#define vst1_s8(ptr,val) vst1_u8((uint8_t*)ptr,val)
9743
9744_NEON2SSE_GLOBAL void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0]
9745#define vst1_s16(ptr,val) vst1_u16((uint16_t*)ptr,val)
9746
9747_NEON2SSE_GLOBAL void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0]
9748#define vst1_s32(ptr,val) vst1_u32((uint32_t*)ptr,val)
9749
9750_NEON2SSE_GLOBAL void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0]
9751#define vst1_s64(ptr,val) vst1_u64((uint64_t*)ptr,val)
9752
9753_NEON2SSE_GLOBAL void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0]
9754//current IA SIMD doesn't support float16
9755
9756_NEON2SSESTORAGE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0]
9757_NEON2SSE_INLINE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val)
9758{
9759 *(ptr) = val.m64_f32[0];
9760 *(ptr + 1) = val.m64_f32[1];
9761 return;
9762}
9763
9764_NEON2SSE_GLOBAL void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0]
9765#define vst1_p8 vst1_u8
9766
9767_NEON2SSE_GLOBAL void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0]
9768#define vst1_p16 vst1_u16
9769
9770//***********Store a lane of a vector into memory (extract given lane) *********************
9771//******************************************************************************************
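//A minimal usage sketch (illustrative only; v stands for a previously computed uint16x8_t value):
//write just one lane of the vector to memory, leaving the rest in the register.
//    uint16_t first;
//    vst1q_lane_u16(&first, v, 0);           //stores lane 0 only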
9772_NEON2SSE_GLOBAL void vst1q_lane_u8(__transfersize(1) uint8_t * ptr, uint8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
9773#define vst1q_lane_u8(ptr, val, lane) *(ptr) = (uint8_t) _MM_EXTRACT_EPI8 (val, lane)
9774
9775_NEON2SSE_GLOBAL void vst1q_lane_u16(__transfersize(1) uint16_t * ptr, uint16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
9776#define vst1q_lane_u16(ptr, val, lane) *(ptr) = (uint16_t) _MM_EXTRACT_EPI16 (val, lane)
9777
9778_NEON2SSE_GLOBAL void vst1q_lane_u32(__transfersize(1) uint32_t * ptr, uint32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
9779#define vst1q_lane_u32(ptr, val, lane) *(ptr) = (uint32_t) _MM_EXTRACT_EPI32 (val, lane)
9780
9781_NEON2SSE_GLOBAL void vst1q_lane_u64(__transfersize(1) uint64_t * ptr, uint64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0]
9782#define vst1q_lane_u64(ptr, val, lane) *(ptr) = (uint64_t) _MM_EXTRACT_EPI64 (val, lane)
9783
9784_NEON2SSE_GLOBAL void vst1q_lane_s8(__transfersize(1) int8_t * ptr, int8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
9785#define vst1q_lane_s8(ptr, val, lane) *(ptr) = (int8_t) _MM_EXTRACT_EPI8 (val, lane)
9786
9787_NEON2SSE_GLOBAL void vst1q_lane_s16(__transfersize(1) int16_t * ptr, int16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
9788#define vst1q_lane_s16(ptr, val, lane) *(ptr) = (int16_t) _MM_EXTRACT_EPI16 (val, lane)
9789
9790_NEON2SSE_GLOBAL void vst1q_lane_s32(__transfersize(1) int32_t * ptr, int32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
9791#define vst1q_lane_s32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane)
9792
9793_NEON2SSE_GLOBAL void vst1q_lane_s64(__transfersize(1) int64_t * ptr, int64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0]
9794#define vst1q_lane_s64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane)
9795
9796_NEON2SSE_GLOBAL void vst1q_lane_f16(__transfersize(1) __fp16 * ptr, float16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
9797//current IA SIMD doesn't support float16
9798
9799_NEON2SSESTORAGE void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
9800_NEON2SSE_INLINE void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane)
9801{
9802 *((int32_t*)ptr) = _MM_EXTRACT_PS(val,lane);
9803}
9804
9805_NEON2SSE_GLOBAL void vst1q_lane_p8(__transfersize(1) poly8_t * ptr, poly8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
9806#define vst1q_lane_p8 vst1q_lane_u8
9807
9808_NEON2SSE_GLOBAL void vst1q_lane_p16(__transfersize(1) poly16_t * ptr, poly16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
9809#define vst1q_lane_p16 vst1q_lane_s16
9810
9811_NEON2SSESTORAGE void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
9812_NEON2SSE_INLINE void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane)
9813{
9814 *(ptr) = val.m64_u8[lane];
9815}
9816
9817_NEON2SSESTORAGE void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
9818_NEON2SSE_INLINE void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane)
9819{
9820 *(ptr) = val.m64_u16[lane];
9821}
9822
9823_NEON2SSESTORAGE void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
9824_NEON2SSE_INLINE void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane)
9825{
9826 *(ptr) = val.m64_u32[lane];
9827}
9828
9829_NEON2SSESTORAGE void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0]
9830_NEON2SSE_INLINE void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane)
9831{
9832 UNREFERENCED_PARAMETER(lane);
9833 *(ptr) = val.m64_u64[0];
9834}
9835
9836_NEON2SSE_GLOBAL void vst1_lane_s8(__transfersize(1) int8_t * ptr, int8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
9837#define vst1_lane_s8(ptr, val, lane) vst1_lane_u8((uint8_t*)ptr, val, lane)
9838
9839_NEON2SSE_GLOBAL void vst1_lane_s16(__transfersize(1) int16_t * ptr, int16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
9840#define vst1_lane_s16(ptr, val, lane) vst1_lane_u16((uint16_t*)ptr, val, lane)
9841
9842_NEON2SSE_GLOBAL void vst1_lane_s32(__transfersize(1) int32_t * ptr, int32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
9843#define vst1_lane_s32(ptr, val, lane) vst1_lane_u32((uint32_t*)ptr, val, lane)
9844
9845
9846_NEON2SSE_GLOBAL void vst1_lane_s64(__transfersize(1) int64_t * ptr, int64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0]
9847#define vst1_lane_s64(ptr, val, lane) vst1_lane_u64((uint64_t*)ptr, val, lane)
9848
9849
9850_NEON2SSE_GLOBAL void vst1_lane_f16(__transfersize(1) __fp16 * ptr, float16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
9851//current IA SIMD doesn't support float16
9852
9853_NEON2SSESTORAGE void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
9854_NEON2SSE_INLINE void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane)
9855{
9856 *(ptr) = val.m64_f32[lane];
9857}
9858
9859_NEON2SSE_GLOBAL void vst1_lane_p8(__transfersize(1) poly8_t * ptr, poly8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
9860#define vst1_lane_p8 vst1_lane_u8
9861
9862_NEON2SSE_GLOBAL void vst1_lane_p16(__transfersize(1) poly16_t * ptr, poly16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
9863#define vst1_lane_p16 vst1_lane_s16
9864
9865//***********************************************************************************************
9866//**************** Loads and stores of an N-element structure **********************************
9867//***********************************************************************************************
//These intrinsics load or store an n-element structure. The corresponding array structure types are defined at the beginning of this file
//We assume ptr is NOT aligned in the general case; for more details see the "Loads and stores of a single vector" functions above
9870//****************** 2 elements load *********************************************
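//A minimal usage sketch (illustrative only; variable names and data are hypothetical):
//de-interleave pairs, e.g. interleaved real/imaginary data, in one call.
//    float32_t cplx[8] = {1,10, 2,20, 3,30, 4,40};  //re,im interleaved
//    float32x4x2_t ri = vld2q_f32(cplx);
//    //ri.val[0] = {1,2,3,4} (real parts), ri.val[1] = {10,20,30,40} (imaginary parts)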
9871_NEON2SSESTORAGE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
9872_NEON2SSE_INLINE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr) // VLD2.8 {d0, d2}, [r0]
9873{
9874 uint8x16x2_t v;
9875 v.val[0] = vld1q_u8(ptr);
9876 v.val[1] = vld1q_u8((ptr + 16));
9877 v = vuzpq_s8(v.val[0], v.val[1]);
9878 return v;
9879}
9880
9881_NEON2SSESTORAGE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
9882_NEON2SSE_INLINE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr) // VLD2.16 {d0, d2}, [r0]
9883{
9884 uint16x8x2_t v;
9885 v.val[0] = vld1q_u16( ptr);
9886 v.val[1] = vld1q_u16( (ptr + 8));
9887 v = vuzpq_s16(v.val[0], v.val[1]);
9888 return v;
9889}
9890
9891_NEON2SSESTORAGE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
9892_NEON2SSE_INLINE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr) // VLD2.32 {d0, d2}, [r0]
9893{
9894 uint32x4x2_t v;
9895 v.val[0] = vld1q_u32 ( ptr);
9896 v.val[1] = vld1q_u32 ( (ptr + 4));
9897 v = vuzpq_s32(v.val[0], v.val[1]);
9898 return v;
9899}
9900
9901_NEON2SSE_GLOBAL int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr);
9902#define vld2q_s8(ptr) vld2q_u8((uint8_t*) ptr)
9903
9904_NEON2SSE_GLOBAL int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
9905#define vld2q_s16(ptr) vld2q_u16((uint16_t*) ptr)
9906
9907_NEON2SSE_GLOBAL int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
9908#define vld2q_s32(ptr) vld2q_u32((uint32_t*) ptr)
9909
9910
9911_NEON2SSE_GLOBAL float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0]
9912// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
9913
9914_NEON2SSESTORAGE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
9915_NEON2SSE_INLINE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr) // VLD2.32 {d0, d2}, [r0]
9916{
9917 float32x4x2_t v;
9918 v.val[0] = vld1q_f32 (ptr);
9919 v.val[1] = vld1q_f32 ((ptr + 4));
9920 v = vuzpq_f32(v.val[0], v.val[1]);
9921 return v;
9922}
9923
9924_NEON2SSE_GLOBAL poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
9925#define vld2q_p8 vld2q_u8
9926
9927_NEON2SSE_GLOBAL poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
9928#define vld2q_p16 vld2q_u16
9929
9930_NEON2SSESTORAGE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
9931_NEON2SSE_INLINE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr)
9932{
9933 uint8x8x2_t v;
9934 __m128i ld128;
    ld128 = vld1q_u8(ptr); //one 128-bit load instead of two 64-bit loads
9936 ld128 = _mm_shuffle_epi8(ld128, *(__m128i*)mask8_16_even_odd);
9937 vst1q_u8((v.val), ld128); // v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
9938 return v;
9939}
9940
9941_NEON2SSESTORAGE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
9942_NEON2SSE_INLINE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr)
9943{
9944 _NEON2SSE_ALIGN_16 uint16x4x2_t v;
9945 __m128i ld128;
    ld128 = vld1q_u16(ptr); //one 128-bit load instead of two 64-bit loads
9947 ld128 = _mm_shuffle_epi8(ld128, *(__m128i*) mask8_32_even_odd);
9948 vst1q_u16((v.val), ld128);
9949 return v;
9950}
9951
9952_NEON2SSESTORAGE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
9953_NEON2SSE_INLINE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr)
9954{
9955 _NEON2SSE_ALIGN_16 uint32x2x2_t v;
9956 __m128i ld128;
    ld128 = vld1q_u32(ptr); //one 128-bit load instead of two 64-bit loads
9958 ld128 = _mm_shuffle_epi32(ld128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
9959 vst1q_u32((v.val), ld128);
9960 return v;
9961}
9962
9963_NEON2SSESTORAGE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
9964_NEON2SSE_INLINE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr)
9965{
9966 uint64x1x2_t v;
9967 v.val[0].m64_u64[0] = *(ptr);
9968 v.val[1].m64_u64[0] = *(ptr + 1);
9969 return v;
9970}
9971
9972_NEON2SSE_GLOBAL int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
9973#define vld2_s8(ptr) vld2_u8((uint8_t*)ptr)
9974
9975_NEON2SSE_GLOBAL int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
9976#define vld2_s16(ptr) vld2_u16((uint16_t*)ptr)
9977
9978_NEON2SSE_GLOBAL int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
9979#define vld2_s32(ptr) vld2_u32((uint32_t*)ptr)
9980
9981_NEON2SSE_GLOBAL int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
9982#define vld2_s64(ptr) vld2_u64((uint64_t*)ptr)
9983
9984_NEON2SSE_GLOBAL float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0]
9985// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1_f16 for example
9986
9987_NEON2SSESTORAGE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
9988_NEON2SSE_INLINE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr)
9989{
9990 float32x2x2_t v;
9991 v.val[0].m64_f32[0] = *(ptr);
9992 v.val[0].m64_f32[1] = *(ptr + 2);
9993 v.val[1].m64_f32[0] = *(ptr + 1);
9994 v.val[1].m64_f32[1] = *(ptr + 3);
9995 return v;
9996}
9997
9998_NEON2SSE_GLOBAL poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
9999#define vld2_p8 vld2_u8
10000
10001_NEON2SSE_GLOBAL poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
10002#define vld2_p16 vld2_u16
10003
10004//******************** Triplets ***************************************
10005//*********************************************************************
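//A minimal usage sketch (illustrative only; variable names are hypothetical): de-interleave
//packed RGB pixels into separate R, G and B planes.
//    uint8_t rgb[24];                        //8 interleaved R,G,B pixels, filled elsewhere
//    uint8x8x3_t px = vld3_u8(rgb);
//    //px.val[0] = 8 R values, px.val[1] = 8 G values, px.val[2] = 8 B values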
10006_NEON2SSESTORAGE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
10007_NEON2SSE_INLINE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr) // VLD3.8 {d0, d2, d4}, [r0]
10008{
10009 //a0,a1,a2,a3,...a7,a8,...a15, b0,b1,b2,...b7,b8,...b15, c0,c1,c2,...c7,c8,...c15 ->
10010 //a:0,3,6,9,12,15,b:2,5,8,11,14, c:1,4,7,10,13
10011 //a:1,4,7,10,13, b:0,3,6,9,12,15,c:2,5,8,11,14,
10012 //a:2,5,8,11,14, b:1,4,7,10,13, c:0,3,6,9,12,15
10013 uint8x16x3_t v;
10014 __m128i tmp0, tmp1,tmp2, tmp3;
10015 _NEON2SSE_ALIGN_16 static const int8_t mask8_0[16] = {0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14};
10016 _NEON2SSE_ALIGN_16 static const int8_t mask8_1[16] = {2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13};
10017 _NEON2SSE_ALIGN_16 static const int8_t mask8_2[16] = {1,4,7,10,13,2,5,8,11,14,0,3,6,9,12,15};
10018
10019 v.val[0] = vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, ...a15
10020 v.val[1] = vld1q_u8 ((ptr + 16)); //b0,b1,b2,b3...b7, ...b15
10021 v.val[2] = vld1q_u8 ((ptr + 32)); //c0,c1,c2,c3,...c7,...c15
10022
    tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask8_0); //a:0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14
10024 tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask8_1); //b:2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13
    tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask8_2); //c:1,4,7,10,13,2,5,8,11,14,0,3,6,9,12,15
10026
10027 tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,0,0,0,0,0,a0,a3,a6,a9,a12,a15
10028 tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a:0,3,6,9,12,15,b:2,5,8,11,14,x,x,x,x,x
10029 tmp3 = _mm_slli_si128(tmp3, 5); //0,0,0,0,0,a:0,3,6,9,12,15,b:2,5,8,11,14,
10030 tmp3 = _mm_srli_si128(tmp3, 5); //a:0,3,6,9,12,15,b:2,5,8,11,14,:0,0,0,0,0
10031 v.val[0] = _mm_slli_si128(tmp2, 11); //0,0,0,0,0,0,0,0,0,0,0,0, 1,4,7,10,13,
10032 v.val[0] = _mm_or_si128(v.val[0],tmp3); //a:0,3,6,9,12,15,b:2,5,8,11,14,c:1,4,7,10,13,
10033
10034 tmp3 = _mm_slli_si128(tmp0, 5); //0,0,0,0,0,a:0,3,6,9,12,15,1,4,7,10,13,
10035 tmp3 = _mm_srli_si128(tmp3, 11); //a:1,4,7,10,13, 0,0,0,0,0,0,0,0,0,0,0
    v.val[1] = _mm_srli_si128(tmp1,5); //b:0,3,6,9,12,15,1,4,7,10,13, 0,0,0,0,0
    v.val[1] = _mm_slli_si128(v.val[1], 5); //0,0,0,0,0,b:0,3,6,9,12,15,1,4,7,10,13,
    v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13,b:0,3,6,9,12,15,1,4,7,10,13,
10039 v.val[1] = _mm_slli_si128(v.val[1],5); //0,0,0,0,0,a:1,4,7,10,13,b:0,3,6,9,12,15,
10040 v.val[1] = _mm_srli_si128(v.val[1], 5); //a:1,4,7,10,13,b:0,3,6,9,12,15,0,0,0,0,0
10041 tmp3 = _mm_srli_si128(tmp2,5); //c:2,5,8,11,14,0,3,6,9,12,15,0,0,0,0,0
10042 tmp3 = _mm_slli_si128(tmp3,11); //0,0,0,0,0,0,0,0,0,0,0,c:2,5,8,11,14,
10043 v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13,b:0,3,6,9,12,15,c:2,5,8,11,14,
10044
10045 tmp3 = _mm_srli_si128(tmp2,10); //c:0,3,6,9,12,15, 0,0,0,0,0,0,0,0,0,0,
10046 tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,0,0,0,0,0, c:0,3,6,9,12,15,
10047 v.val[2] = _mm_srli_si128(tmp1,11); //b:1,4,7,10,13,0,0,0,0,0,0,0,0,0,0,0
10048 v.val[2] = _mm_slli_si128(v.val[2],5); //0,0,0,0,0,b:1,4,7,10,13, 0,0,0,0,0,0
10049 v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0,0,0,0,b:1,4,7,10,13,c:0,3,6,9,12,15,
10050 tmp0 = _mm_srli_si128(tmp0, 11); //a:2,5,8,11,14, 0,0,0,0,0,0,0,0,0,0,0,
10051 v.val[2] = _mm_or_si128(v.val[2],tmp0); //a:2,5,8,11,14,b:1,4,7,10,13,c:0,3,6,9,12,15,
10052 return v;
10053}
10054
10055_NEON2SSESTORAGE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
10056_NEON2SSE_INLINE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr) // VLD3.16 {d0, d2, d4}, [r0]
10057{
10058 //a0, a1,a2,a3,...a7, b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7
10059 uint16x8x3_t v;
10060 __m128i tmp0, tmp1,tmp2, tmp3;
10061 _NEON2SSE_ALIGN_16 static const int8_t mask16_0[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11};
10062 _NEON2SSE_ALIGN_16 static const int8_t mask16_1[16] = {2,3, 8,9, 14,15, 4,5, 10,11, 0,1, 6,7, 12,13};
10063 _NEON2SSE_ALIGN_16 static const int8_t mask16_2[16] = {4,5, 10,11, 0,1, 6,7, 12,13, 2,3, 8,9, 14,15};
10064
10065 v.val[0] = vld1q_u16 (ptr); //a0,a1,a2,a3,...a7,
10066 v.val[1] = vld1q_u16 ((ptr + 8)); //b0,b1,b2,b3...b7
10067 v.val[2] = vld1q_u16 ((ptr + 16)); //c0,c1,c2,c3,...c7
10068
10069 tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask16_0); //a0,a3,a6,a1,a4,a7,a2,a5,
10070 tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask16_1); //b1,b4,b7,b2,b5,b0,b3,b6
10071 tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask16_2); //c2,c5, c0,c3,c6, c1,c4,c7
10072
10073 tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,a0,a3,a6,
10074 tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a0,a3,a6,b1,b4,b7,x,x
10075 tmp3 = _mm_slli_si128(tmp3, 4); //0,0, a0,a3,a6,b1,b4,b7
10076 tmp3 = _mm_srli_si128(tmp3, 4); //a0,a3,a6,b1,b4,b7,0,0
10077 v.val[0] = _mm_slli_si128(tmp2, 12); //0,0,0,0,0,0, c2,c5,
10078 v.val[0] = _mm_or_si128(v.val[0],tmp3); //a0,a3,a6,b1,b4,b7,c2,c5
10079
10080 tmp3 = _mm_slli_si128(tmp0, 4); //0,0,a0,a3,a6,a1,a4,a7
10081 tmp3 = _mm_srli_si128(tmp3,10); //a1,a4,a7, 0,0,0,0,0
10082 v.val[1] = _mm_srli_si128(tmp1,6); //b2,b5,b0,b3,b6,0,0
10083 v.val[1] = _mm_slli_si128(v.val[1], 6); //0,0,0,b2,b5,b0,b3,b6,
10084 v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,b0,b3,b6,
10085 v.val[1] = _mm_slli_si128(v.val[1],6); //0,0,0,a1,a4,a7,b2,b5,
10086 v.val[1] = _mm_srli_si128(v.val[1], 6); //a1,a4,a7,b2,b5,0,0,0,
10087 tmp3 = _mm_srli_si128(tmp2,4); //c0,c3,c6, c1,c4,c7,0,0
10088 tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,c0,c3,c6,
10089 v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,c0,c3,c6,
10090
10091 tmp3 = _mm_srli_si128(tmp2,10); //c1,c4,c7, 0,0,0,0,0
10092 tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0, c1,c4,c7,
10093 v.val[2] = _mm_srli_si128(tmp1,10); //b0,b3,b6,0,0, 0,0,0
10094 v.val[2] = _mm_slli_si128(v.val[2],4); //0,0, b0,b3,b6,0,0,0
10095 v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0, b0,b3,b6,c1,c4,c7,
10096 tmp0 = _mm_srli_si128(tmp0, 12); //a2,a5,0,0,0,0,0,0
10097 v.val[2] = _mm_or_si128(v.val[2],tmp0); //a2,a5,b0,b3,b6,c1,c4,c7,
10098 return v;
10099}
10100
10101_NEON2SSESTORAGE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
10102_NEON2SSE_INLINE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0]
10103{
10104 //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3,
10105 uint32x4x3_t v;
10106 __m128i tmp0, tmp1,tmp2, tmp3;
10107 v.val[0] = vld1q_u32 (ptr); //a0,a1,a2,a3,
10108 v.val[1] = vld1q_u32 ((ptr + 4)); //b0,b1,b2,b3
10109 v.val[2] = vld1q_u32 ((ptr + 8)); //c0,c1,c2,c3,
10110
10111 tmp0 = _mm_shuffle_epi32(v.val[0], 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,a3,a1,a2
10112 tmp1 = _mm_shuffle_epi32(v.val[1], _SWAP_HI_LOW32); //b2,b3,b0,b1
10113 tmp2 = _mm_shuffle_epi32(v.val[2], 1 | (2 << 2) | (0 << 4) | (3 << 6)); //c1,c2, c0,c3
10114
10115 tmp3 = _mm_unpacklo_epi32(tmp1, tmp2); //b2,c1, b3,c2
10116 v.val[0] = _mm_unpacklo_epi64(tmp0,tmp3); //a0,a3,b2,c1
10117 tmp0 = _mm_unpackhi_epi32(tmp0, tmp1); //a1,b0, a2,b1
10118 v.val[1] = _mm_shuffle_epi32(tmp0, _SWAP_HI_LOW32 ); //a2,b1, a1,b0,
10119 v.val[1] = _mm_unpackhi_epi64(v.val[1], tmp3); //a1,b0, b3,c2
10120 v.val[2] = _mm_unpackhi_epi64(tmp0, tmp2); //a2,b1, c0,c3
10121 return v;
10122}
10123
10124_NEON2SSE_GLOBAL int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
10125#define vld3q_s8(ptr) vld3q_u8((uint8_t*) (ptr))
10126
10127_NEON2SSE_GLOBAL int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
10128#define vld3q_s16(ptr) vld3q_u16((uint16_t*) (ptr))
10129
10130_NEON2SSE_GLOBAL int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
10131#define vld3q_s32(ptr) vld3q_u32((uint32_t*) (ptr))
10132
10133_NEON2SSE_GLOBAL float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
10134// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10135
10136_NEON2SSESTORAGE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
10137_NEON2SSE_INLINE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0]
10138{
10139 //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3,
10140 float32x4x3_t v;
10141 __m128 tmp0, tmp1,tmp2, tmp3;
10142 v.val[0] = vld1q_f32 (ptr); //a0,a1,a2,a3,
10143 v.val[1] = vld1q_f32 ((ptr + 4)); //b0,b1,b2,b3
10144 v.val[2] = vld1q_f32 ((ptr + 8)); //c0,c1,c2,c3,
10145
10146 tmp0 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[0]), 0 | (3 << 2) | (1 << 4) | (2 << 6))); //a0,a3,a1,a2
10147 tmp1 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[1]), _SWAP_HI_LOW32)); //b2,b3,b0,b1
10148 tmp2 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[2]), 1 | (2 << 2) | (0 << 4) | (3 << 6))); //c1,c2, c0,c3
10149 tmp3 = _mm_unpacklo_ps(tmp1, tmp2); //b2,c1, b3,c2
10150
10151 v.val[0] = _mm_movelh_ps(tmp0,tmp3); //a0,a3,b2,c1
10152 tmp0 = _mm_unpackhi_ps(tmp0, tmp1); //a1,b0, a2,b1
10153 v.val[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(tmp0), _SWAP_HI_LOW32 )); //a2,b1, a1,b0,
10154 v.val[1] = _mm_movehl_ps(tmp3,v.val[1]); //a1,b0, b3,c2
10155 v.val[2] = _mm_movehl_ps(tmp2,tmp0); //a2,b1, c0,c3
10156 return v;
10157}
10158
_NEON2SSE_GLOBAL poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
10160#define vld3q_p8 vld3q_u8
10161
10162_NEON2SSE_GLOBAL poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
10163#define vld3q_p16 vld3q_u16
10164
10165_NEON2SSESTORAGE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
10166_NEON2SSE_INLINE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr) // VLD3.8 {d0, d1, d2}, [r0]
10167{
10168 //a0, a1,a2,a3,...a7, b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7
10169 uint8x8x3_t v;
10170 __m128i val0, val1, val2, tmp0, tmp1;
10171 _NEON2SSE_ALIGN_16 static const int8_t mask8_0[16] = {0,3,6,9,12,15, 1,4,7,10,13, 2,5,8,11,14};
10172 _NEON2SSE_ALIGN_16 static const int8_t mask8_1[16] = {2,5, 0,3,6, 1,4,7, 0,0,0,0,0,0,0,0};
10173 val0 = vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, b0,b1,b2,b3...b7
10174 val2 = _mm_loadl_epi64((__m128i*)(ptr + 16)); //c0,c1,c2,c3,...c7
10175
10176 tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask8_0); //a0,a3,a6,b1,b4,b7, a1,a4,a7,b2,b5, a2,a5,b0,b3,b6,
10177 tmp1 = _mm_shuffle_epi8(val2, *(__m128i*)mask8_1); //c2,c5, c0,c3,c6, c1,c4,c7,x,x,x,x,x,x,x,x
10178 val0 = _mm_slli_si128(tmp0,10);
10179 val0 = _mm_srli_si128(val0,10); //a0,a3,a6,b1,b4,b7, 0,0,0,0,0,0,0,0,0,0
10180 val2 = _mm_slli_si128(tmp1,6); //0,0,0,0,0,0,c2,c5,x,x,x,x,x,x,x,x
10181 val0 = _mm_or_si128(val0,val2); //a0,a3,a6,b1,b4,b7,c2,c5 x,x,x,x,x,x,x,x
10182 _M64(v.val[0], val0);
10183 val1 = _mm_slli_si128(tmp0,5); //0,0,0,0,0,0,0,0,0,0,0, a1,a4,a7,b2,b5,
10184 val1 = _mm_srli_si128(val1,11); //a1,a4,a7,b2,b5,0,0,0,0,0,0,0,0,0,0,0,
10185 val2 = _mm_srli_si128(tmp1,2); //c0,c3,c6,c1,c4,c7,x,x,x,x,x,x,x,x,0,0
10186 val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c0,c3,c6,0,0,0,0,0,0,0,0
10187 val1 = _mm_or_si128(val1,val2); //a1,a4,a7,b2,b5,c0,c3,c6,x,x,x,x,x,x,x,x
10188 _M64(v.val[1], val1);
10189
10190 tmp0 = _mm_srli_si128(tmp0,11); //a2,a5,b0,b3,b6,0,0,0,0,0,0,0,0,0,0,0,
10191 val2 = _mm_srli_si128(tmp1,5); //c1,c4,c7,0,0,0,0,0,0,0,0,0,0,0,0,0
10192 val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c1,c4,c7,
10193 val2 = _mm_or_si128(tmp0, val2); //a2,a5,b0,b3,b6,c1,c4,c7,x,x,x,x,x,x,x,x
10194 _M64(v.val[2], val2);
10195 return v;
10196}
10197
10198_NEON2SSESTORAGE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
10199_NEON2SSE_INLINE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr) // VLD3.16 {d0, d1, d2}, [r0]
10200{
10201 //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3,
10202 uint16x4x3_t v;
10203 __m128i val0, val1, val2, tmp0, tmp1;
10204 _NEON2SSE_ALIGN_16 static const int8_t mask16[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11};
10205 val0 = vld1q_u16 (ptr); //a0,a1,a2,a3, b0,b1,b2,b3
10206 val2 = _mm_loadl_epi64((__m128i*)(ptr + 8)); //c0,c1,c2,c3, x,x,x,x
10207
10208 tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask16); //a0, a3, b2,a1, b0, b3, a2, b1
10209 tmp1 = _mm_shufflelo_epi16(val2, 201); //11 00 10 01 : c1, c2, c0, c3,
10210 val0 = _mm_slli_si128(tmp0,10);
10211 val0 = _mm_srli_si128(val0,10); //a0, a3, b2, 0,0, 0,0,
10212 val2 = _mm_slli_si128(tmp1,14); //0,0,0,0,0,0,0,c1
10213 val2 = _mm_srli_si128(val2,8); //0,0,0,c1,0,0,0,0
10214 val0 = _mm_or_si128(val0,val2); //a0, a3, b2, c1, x,x,x,x
10215 _M64(v.val[0], val0);
10216
10217 val1 = _mm_slli_si128(tmp0,4); //0,0,0,0,0,a1, b0, b3
10218 val1 = _mm_srli_si128(val1,10); //a1, b0, b3, 0,0, 0,0,
10219 val2 = _mm_srli_si128(tmp1,2); //c2, 0,0,0,0,0,0,0,
10220 val2 = _mm_slli_si128(val2,6); //0,0,0,c2,0,0,0,0
10221 val1 = _mm_or_si128(val1,val2); //a1, b0, b3, c2, x,x,x,x
10222 _M64(v.val[1], val1);
10223
10224 tmp0 = _mm_srli_si128(tmp0,12); //a2, b1,0,0,0,0,0,0
10225 tmp1 = _mm_srli_si128(tmp1,4);
10226 tmp1 = _mm_slli_si128(tmp1,4); //0,0,c0, c3,
10227 val2 = _mm_or_si128(tmp0, tmp1); //a2, b1, c0, c3,
10228 _M64(v.val[2], val2);
10229 return v;
10230}
10231
10232_NEON2SSESTORAGE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
10233_NEON2SSE_INLINE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr) // VLD3.32 {d0, d1, d2}, [r0]
10234{
10235 //a0,a1, b0,b1, c0,c1, -> a0,b1, a1,c0, b0,c1
10236 uint32x2x3_t v;
10237 __m128i val0, val1, val2;
10238 val0 = vld1q_u32 (ptr); //a0,a1, b0,b1,
10239 val2 = _mm_loadl_epi64((__m128i*) (ptr + 4)); //c0,c1, x,x
10240
10241 val0 = _mm_shuffle_epi32(val0, 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,b1, a1, b0
10242 _M64(v.val[0], val0);
10243 val2 = _mm_slli_si128(val2, 8); //x, x,c0,c1,
10244 val1 = _mm_unpackhi_epi32(val0,val2); //a1,c0, b0, c1
10245 _M64(v.val[1], val1);
10246 val2 = _mm_srli_si128(val1, 8); //b0, c1, x, x,
10247 _M64(v.val[2], val2);
10248 return v;
10249}
10250_NEON2SSESTORAGE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
10251_NEON2SSE_INLINE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0]
10252{
10253 uint64x1x3_t v;
10254 v.val[0].m64_u64[0] = *(ptr);
10255 v.val[1].m64_u64[0] = *(ptr + 1);
10256 v.val[2].m64_u64[0] = *(ptr + 2);
10257 return v;
10258}
10259
10260_NEON2SSE_GLOBAL int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
10261#define vld3_s8(ptr) vld3_u8((uint8_t*)ptr)
10262
10263_NEON2SSE_GLOBAL int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
10264#define vld3_s16(ptr) vld3_u16((uint16_t*)ptr)
10265
10266_NEON2SSE_GLOBAL int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
10267#define vld3_s32(ptr) vld3_u32((uint32_t*)ptr)
10268
_NEON2SSE_GLOBAL int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
10270#define vld3_s64(ptr) vld3_u64((uint64_t*)ptr)
10271
10272_NEON2SSE_GLOBAL float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
10273// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10274
10275_NEON2SSESTORAGE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
10276_NEON2SSE_INLINE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr)
10277{
10278 //a0,a1, b0,b1, c0,c1, -> a0,b1, a1,c0, b0,c1
10279 float32x2x3_t v;
10280 v.val[0].m64_f32[0] = *(ptr);
10281 v.val[0].m64_f32[1] = *(ptr + 3);
10282
10283 v.val[1].m64_f32[0] = *(ptr + 1);
10284 v.val[1].m64_f32[1] = *(ptr + 4);
10285
10286 v.val[2].m64_f32[0] = *(ptr + 2);
10287 v.val[2].m64_f32[1] = *(ptr + 5);
10288 return v;
10289}
10290
10291_NEON2SSE_GLOBAL poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
10292#define vld3_p8 vld3_u8
10293
10294_NEON2SSE_GLOBAL poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
10295#define vld3_p16 vld3_u16
10296
10297//*************** Quadruples load ********************************
10298//*****************************************************************
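//A minimal usage sketch (illustrative only; variable names are hypothetical): de-interleave
//packed RGBA pixels into separate channel vectors.
//    uint8_t rgba[64];                       //16 interleaved R,G,B,A pixels, filled elsewhere
//    uint8x16x4_t px = vld4q_u8(rgba);
//    //px.val[3] now holds all 16 alpha values contiguously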
10299_NEON2SSESTORAGE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
10300_NEON2SSE_INLINE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr) // VLD4.8 {d0, d2, d4, d6}, [r0]
10301{
10302 uint8x16x4_t v;
10303 __m128i tmp3, tmp2, tmp1, tmp0;
10304
10305 v.val[0] = vld1q_u8 ( ptr); //a0,a1,a2,...a7, ...a15
10306 v.val[1] = vld1q_u8 ( (ptr + 16)); //b0, b1,b2,...b7.... b15
10307 v.val[2] = vld1q_u8 ( (ptr + 32)); //c0, c1,c2,...c7....c15
10308 v.val[3] = vld1q_u8 ( (ptr + 48)); //d0,d1,d2,...d7....d15
10309
10310 tmp0 = _mm_unpacklo_epi8(v.val[0],v.val[1]); //a0,b0, a1,b1, a2,b2, a3,b3,....a7,b7
10311 tmp1 = _mm_unpacklo_epi8(v.val[2],v.val[3]); //c0,d0, c1,d1, c2,d2, c3,d3,... c7,d7
10312 tmp2 = _mm_unpackhi_epi8(v.val[0],v.val[1]); //a8,b8, a9,b9, a10,b10, a11,b11,...a15,b15
10313 tmp3 = _mm_unpackhi_epi8(v.val[2],v.val[3]); //c8,d8, c9,d9, c10,d10, c11,d11,...c15,d15
10314
10315 v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a8, b0,b8, a1,a9, b1,b9, ....a3,a11, b3,b11
10316 v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a4,a12, b4,b12, a5,a13, b5,b13,....a7,a15,b7,b15
10317 v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //c0,c8, d0,d8, c1,c9, d1,d9.....d3,d11
10318 v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //c4,c12,d4,d12, c5,c13, d5,d13,....d7,d15
10319
10320 tmp0 = _mm_unpacklo_epi32(v.val[0], v.val[2] ); ///a0,a8, b0,b8, c0,c8, d0,d8, a1,a9, b1,b9, c1,c9, d1,d9
10321 tmp1 = _mm_unpackhi_epi32(v.val[0], v.val[2] ); //a2,a10, b2,b10, c2,c10, d2,d10, a3,a11, b3,b11, c3,c11, d3,d11
10322 tmp2 = _mm_unpacklo_epi32(v.val[1], v.val[3] ); //a4,a12, b4,b12, c4,c12, d4,d12, a5,a13, b5,b13, c5,c13, d5,d13,
10323 tmp3 = _mm_unpackhi_epi32(v.val[1], v.val[3] ); //a6,a14, b6,b14, c6,c14, d6,d14, a7,a15,b7,b15,c7,c15,d7,d15
10324
10325 v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a4,a8,a12,b0,b4,b8,b12,c0,c4,c8,c12,d0,d4,d8,d12
10326 v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a1,a5, a9, a13, b1,b5, b9,b13, c1,c5, c9, c13, d1,d5, d9,d13
10327 v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //a2,a6, a10,a14, b2,b6, b10,b14,c2,c6, c10,c14, d2,d6, d10,d14
10328 v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //a3,a7, a11,a15, b3,b7, b11,b15,c3,c7, c11, c15,d3,d7, d11,d15
10329 return v;
10330}
10331
10332_NEON2SSESTORAGE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
10333_NEON2SSE_INLINE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr) // VLD4.16 {d0, d2, d4, d6}, [r0]
10334{
10335 uint16x8x4_t v;
10336 __m128i tmp3, tmp2, tmp1, tmp0;
10337 tmp0 = vld1q_u16 (ptr); //a0,a1,a2,...a7
10338 tmp1 = vld1q_u16 ((ptr + 8)); //b0, b1,b2,...b7
10339 tmp2 = vld1q_u16 ((ptr + 16)); //c0, c1,c2,...c7
10340 tmp3 = vld1q_u16 ((ptr + 24)); //d0,d1,d2,...d7
10341 v.val[0] = _mm_unpacklo_epi16(tmp0,tmp1); //a0,b0, a1,b1, a2,b2, a3,b3,
10342 v.val[1] = _mm_unpacklo_epi16(tmp2,tmp3); //c0,d0, c1,d1, c2,d2, c3,d3,
10343 v.val[2] = _mm_unpackhi_epi16(tmp0,tmp1); //a4,b4, a5,b5, a6,b6, a7,b7
10344 v.val[3] = _mm_unpackhi_epi16(tmp2,tmp3); //c4,d4, c5,d5, c6,d6, c7,d7
10345 tmp0 = _mm_unpacklo_epi16(v.val[0], v.val[2]); //a0,a4, b0,b4, a1,a5, b1,b5
10346 tmp1 = _mm_unpackhi_epi16(v.val[0], v.val[2]); //a2,a6, b2,b6, a3,a7, b3,b7
10347 tmp2 = _mm_unpacklo_epi16(v.val[1], v.val[3]); //c0,c4, d0,d4, c1,c5, d1,d5
10348 tmp3 = _mm_unpackhi_epi16(v.val[1], v.val[3]); //c2,c6, d2,d6, c3,c7, d3,d7
10349 v.val[0] = _mm_unpacklo_epi64(tmp0, tmp2); //a0,a4, b0,b4, c0,c4, d0,d4,
10350 v.val[1] = _mm_unpackhi_epi64(tmp0, tmp2); //a1,a5, b1,b5, c1,c5, d1,d5
10351 v.val[2] = _mm_unpacklo_epi64(tmp1, tmp3); //a2,a6, b2,b6, c2,c6, d2,d6,
10352 v.val[3] = _mm_unpackhi_epi64(tmp1, tmp3); //a3,a7, b3,b7, c3,c7, d3,d7
10353 return v;
10354}
10355
10356_NEON2SSESTORAGE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
10357_NEON2SSE_INLINE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0]
10358{
10359 uint32x4x4_t v;
10360 __m128i tmp3, tmp2, tmp1, tmp0;
10361 v.val[0] = vld1q_u32 (ptr);
10362 v.val[1] = vld1q_u32 ((ptr + 4));
10363 v.val[2] = vld1q_u32 ((ptr + 8));
10364 v.val[3] = vld1q_u32 ((ptr + 12));
10365 tmp0 = _mm_unpacklo_epi32(v.val[0],v.val[1]);
10366 tmp1 = _mm_unpacklo_epi32(v.val[2],v.val[3]);
10367 tmp2 = _mm_unpackhi_epi32(v.val[0],v.val[1]);
10368 tmp3 = _mm_unpackhi_epi32(v.val[2],v.val[3]);
10369 v.val[0] = _mm_unpacklo_epi64(tmp0, tmp1);
10370 v.val[1] = _mm_unpackhi_epi64(tmp0, tmp1);
10371 v.val[2] = _mm_unpacklo_epi64(tmp2, tmp3);
10372 v.val[3] = _mm_unpackhi_epi64(tmp2, tmp3);
10373 return v;
10374}
10375
10376_NEON2SSE_GLOBAL int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
10377#define vld4q_s8(ptr) vld4q_u8((uint8_t*)ptr)
10378
10379_NEON2SSE_GLOBAL int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
10380#define vld4q_s16(ptr) vld4q_u16((uint16_t*)ptr)
10381
10382_NEON2SSE_GLOBAL int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
10383#define vld4q_s32(ptr) vld4q_u32((uint32_t*)ptr)
10384
10385_NEON2SSE_GLOBAL float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
10386// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10387
10388_NEON2SSESTORAGE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
10389_NEON2SSE_INLINE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0]
10390{
10391 float32x4x4_t v;
10392 __m128 tmp3, tmp2, tmp1, tmp0;
10393
10394 v.val[0] = vld1q_f32 ((float*) ptr);
10395 v.val[1] = vld1q_f32 ((float*) (ptr + 4));
10396 v.val[2] = vld1q_f32 ((float*) (ptr + 8));
10397 v.val[3] = vld1q_f32 ((float*) (ptr + 12));
10398 tmp0 = _mm_unpacklo_ps(v.val[0], v.val[1]);
10399 tmp2 = _mm_unpacklo_ps(v.val[2], v.val[3]);
10400 tmp1 = _mm_unpackhi_ps(v.val[0], v.val[1]);
10401 tmp3 = _mm_unpackhi_ps(v.val[2], v.val[3]);
10402 v.val[0] = _mm_movelh_ps(tmp0, tmp2);
10403 v.val[1] = _mm_movehl_ps(tmp2, tmp0);
10404 v.val[2] = _mm_movelh_ps(tmp1, tmp3);
10405 v.val[3] = _mm_movehl_ps(tmp3, tmp1);
10406 return v;
10407}
10408
10409_NEON2SSE_GLOBAL poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
10410#define vld4q_p8 vld4q_u8
10411
10412_NEON2SSE_GLOBAL poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
10413#define vld4q_p16 vld4q_s16
10414
10415_NEON2SSESTORAGE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
10416_NEON2SSE_INLINE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr) // VLD4.8 {d0, d1, d2, d3}, [r0]
10417{
10418 uint8x8x4_t v;
10419 __m128i sh0, sh1;
10420 __m128i val0, val2;
10421 _NEON2SSE_ALIGN_16 static const int8_t mask4_8[16] = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15};
10422
    val0 = vld1q_u8(( ptr)); //load the first 16 interleaved bytes (structures 0..3)
    val2 = vld1q_u8(( ptr + 16)); //load the remaining 16 interleaved bytes (structures 4..7)
10425
10426 sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_8);
10427 sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_8);
10428 val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12,16,20,24,28, 1,5,9,13,17,21,25,29
10429 vst1q_u8(&v.val[0], val0 );
10430 val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14,18,22,26,30, 3,7,11,15,19,23,27,31
10431 vst1q_u8(&v.val[2], val2 );
10432 return v;
10433}
10434
10435_NEON2SSESTORAGE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
10436_NEON2SSE_INLINE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr) // VLD4.16 {d0, d1, d2, d3}, [r0]
10437{
10438 uint16x4x4_t v;
10439 __m128i sh0, sh1;
10440 __m128i val0, val2;
10441 _NEON2SSE_ALIGN_16 static const int8_t mask4_16[16] = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15}; //0, 4, 1, 5, 2, 6, 3, 7
    val0 = vld1q_u16 ( (ptr)); //load the first 8 interleaved elements (structures 0 and 1)
    val2 = vld1q_u16 ( (ptr + 8)); //load the last 8 interleaved elements (structures 2 and 3)
10444 sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_16);
10445 sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_16);
10446 val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12, 1,5,9,13
10447 vst1q_u16(&v.val[0], val0 );
10448 val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14, 3,7,11,15
10449 vst1q_u16(&v.val[2], val2 );
10450 return v;
10451}
10452
10453_NEON2SSESTORAGE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
10454_NEON2SSE_INLINE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr)
10455{
10456 //a0,a1, b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
10457 uint32x2x4_t v;
10458 __m128i val0, val01, val2;
10459 val0 = vld1q_u32 (ptr); //a0,a1, b0,b1,
10460 val2 = vld1q_u32 ((ptr + 4)); //c0,c1, d0,d1
10461 val01 = _mm_unpacklo_epi32(val0,val2); //a0, c0, a1,c1,
10462 val2 = _mm_unpackhi_epi32(val0,val2); //b0,d0, b1, d1
10463 vst1q_u32(&v.val[0], val01);
10464 vst1q_u32(&v.val[2], val2 );
10465 return v;
10466}
10467
10468_NEON2SSESTORAGE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
10469_NEON2SSE_INLINE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0]
10470{
10471 uint64x1x4_t v;
    v.val[0].m64_u64[0] = *(ptr); //load the first 64-bit value into val[0]
    v.val[1].m64_u64[0] = *(ptr + 1); //load the second 64-bit value into val[1]
    v.val[2].m64_u64[0] = *(ptr + 2); //load the third 64-bit value into val[2]
    v.val[3].m64_u64[0] = *(ptr + 3); //load the fourth 64-bit value into val[3]
10476 return v;
10477}
10478
10479_NEON2SSE_GLOBAL int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
10480#define vld4_s8(ptr) vld4_u8((uint8_t*)ptr)
10481
10482_NEON2SSE_GLOBAL int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
10483#define vld4_s16(ptr) vld4_u16((uint16_t*)ptr)
10484
10485_NEON2SSE_GLOBAL int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
10486#define vld4_s32(ptr) vld4_u32((uint32_t*)ptr)
10487
_NEON2SSE_GLOBAL int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
10489#define vld4_s64(ptr) vld4_u64((uint64_t*)ptr)
10490
10491_NEON2SSE_GLOBAL float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
10492// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10493
10494_NEON2SSESTORAGE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
10495_NEON2SSE_INLINE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr) // VLD4.32 {d0, d1, d2, d3}, [r0]
10496{
10497 //a0,a1, b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
10498 float32x2x4_t res;
10499 res.val[0].m64_f32[0] = *(ptr);
10500 res.val[0].m64_f32[1] = *(ptr + 4);
10501 res.val[1].m64_f32[0] = *(ptr + 1);
10502 res.val[1].m64_f32[1] = *(ptr + 5);
10503 res.val[2].m64_f32[0] = *(ptr + 2);
10504 res.val[2].m64_f32[1] = *(ptr + 6);
10505 res.val[3].m64_f32[0] = *(ptr + 3);
10506 res.val[3].m64_f32[1] = *(ptr + 7);
10507 return res;
10508}
10509
10510_NEON2SSE_GLOBAL poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
10511#define vld4_p8 vld4_u8
10512
10513_NEON2SSE_GLOBAL poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
10514#define vld4_p16 vld4_u16
10515
10516//************* Duplicate (or propagate) ptr[0] to all val[0] lanes and ptr[1] to all val[1] lanes *******************
10517//*******************************************************************************************************************
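//A minimal usage sketch (illustrative only; variable names are hypothetical):
//    uint16_t pair[2] = {7, 9};
//    uint16x4x2_t d = vld2_dup_u16(pair);
//    //d.val[0] = {7,7,7,7}, d.val[1] = {9,9,9,9}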
10518_NEON2SSESTORAGE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
10519_NEON2SSE_INLINE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr) // VLD2.8 {d0[], d1[]}, [r0]
10520{
10521 uint8x8x2_t v;
10522 __m128i val0, val1;
10523 val0 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x,x,x,x,x, x,x,x,x
10524 val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,x,x,x,x, x,x,x,x,x,x,x,x,
10525 val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,x,x,x,x, x,x,x,x
10526 val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
10527 vst1q_u8(v.val, val0);
10528 return v;
10529}
10530
10531_NEON2SSESTORAGE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
10532_NEON2SSE_INLINE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr) // VLD2.16 {d0[], d1[]}, [r0]
10533{
10534 uint16x4x2_t v;
10535 __m128i val0, val1;
10536 val1 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x
10537 val0 = _mm_shufflelo_epi16(val1, 0); //00 00 00 00 (all 0)
10538 _M64(v.val[0], val0);
10539 val1 = _mm_shufflelo_epi16(val1, 85); //01 01 01 01 (all 1)
10540 _M64(v.val[1], val1);
10541 return v;
10542}
10543
10544_NEON2SSESTORAGE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
10545_NEON2SSE_INLINE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0]
10546{
10547 uint32x2x2_t v;
10548 __m128i val0;
10549 val0 = LOAD_SI128(ptr); //0,1,x,x
10550 val0 = _mm_shuffle_epi32(val0, 0 | (0 << 2) | (1 << 4) | (1 << 6)); //0,0,1,1
10551 vst1q_u32(v.val, val0);
10552 return v;
10553}
10554
10555_NEON2SSE_GLOBAL uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
10556#define vld2_dup_u64 vld2_u64
10557
10558_NEON2SSE_GLOBAL int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
10559#define vld2_dup_s8(ptr) vld2_dup_u8((uint8_t*)ptr)
10560
10561_NEON2SSE_GLOBAL int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
10562#define vld2_dup_s16(ptr) vld2_dup_u16((uint16_t*)ptr)
10563
10564_NEON2SSE_GLOBAL int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
10565#define vld2_dup_s32(ptr) vld2_dup_u32((uint32_t*)ptr)
10566
10567_NEON2SSE_GLOBAL int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
10568#define vld2_dup_s64(ptr) vld2_dup_u64((uint64_t*)ptr)
10569
10570_NEON2SSE_GLOBAL float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
10571// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10572
10573_NEON2SSESTORAGE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
10574_NEON2SSE_INLINE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0]
10575{
10576 float32x2x2_t v;
10577 v.val[0].m64_f32[0] = *(ptr); //0,0
10578 v.val[0].m64_f32[1] = *(ptr); //0,0
10579 v.val[1].m64_f32[0] = *(ptr + 1); //1,1
10580 v.val[1].m64_f32[1] = *(ptr + 1); //1,1
10581 return v;
10582}
10583
10584_NEON2SSE_GLOBAL poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
10585#define vld2_dup_p8 vld2_dup_u8
10586
10587_NEON2SSE_GLOBAL poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
10588#define vld2_dup_p16 vld2_dup_s16
10589
//************* Duplicate (or propagate) triplets: *******************
10591//********************************************************************
10592//ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes and ptr[2] to all val[2] lanes
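//A minimal usage sketch for the triplet case (comment only; names and values are hypothetical).
//A typical use is splatting a packed triple, e.g. one RGB pixel, into three per-channel constant vectors.
//As above, the SSE implementation reads a full 16-byte vector from ptr, so keep the source memory safely readable:
//    uint16_t rgb[8] = {30, 100, 200}; //only the first three elements are meaningful
//    uint16x4x3_t c = vld3_dup_u16(rgb);
//    //c.val[0] = {30,30,30,30}, c.val[1] = {100,100,100,100}, c.val[2] = {200,200,200,200}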
10593_NEON2SSESTORAGE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
10594_NEON2SSE_INLINE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr) // VLD3.8 {d0[], d1[], d2[]}, [r0]
10595{
10596 uint8x8x3_t v;
10597 __m128i val0, val1, val2;
10598 val0 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x,x,x,x,x, x,x,x,x
10599 val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,x,x, x,x,x,x,x,x,x,x,
10600 val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,x,x,x,x,
10601 val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
10602 val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, x,x,x,x,x,x,x,x,
10603 vst1q_u8(v.val, val0);
10604 _M64(v.val[2], val2);
10605 return v;
10606}
10607
10608_NEON2SSESTORAGE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
10609_NEON2SSE_INLINE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr) // VLD3.16 {d0[], d1[], d2[]}, [r0]
10610{
10611 uint16x4x3_t v;
10612 __m128i val0, val1, val2;
10613 val2 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x
10614 val0 = _mm_shufflelo_epi16(val2, 0); //00 00 00 00 (all 0)
10615 val1 = _mm_shufflelo_epi16(val2, 85); //01 01 01 01 (all 1)
10616 val2 = _mm_shufflelo_epi16(val2, 170); //10 10 10 10 (all 2)
10617 _M64(v.val[0], val0);
10618 _M64(v.val[1], val1);
10619 _M64(v.val[2], val2);
10620 return v;
10621}
10622
10623_NEON2SSESTORAGE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
10624_NEON2SSE_INLINE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0]
10625{
10626 uint32x2x3_t v;
10627 __m128i val0, val1, val2;
10628 val2 = LOAD_SI128(ptr); //0,1,2,x
10629 val0 = _mm_shuffle_epi32(val2, 0 | (0 << 2) | (2 << 4) | (2 << 6)); //0,0,2,2
10630 val1 = _mm_shuffle_epi32(val2, 1 | (1 << 2) | (2 << 4) | (2 << 6)); //1,1,2,2
10631 val2 = _mm_srli_si128(val0, 8); //2,2,0x0,0x0
10632 _M64(v.val[0], val0);
10633 _M64(v.val[1], val1);
10634 _M64(v.val[2], val2);
10635 return v;
10636}
10637
10638_NEON2SSESTORAGE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
10639_NEON2SSE_INLINE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0]
10640{
10641 uint64x1x3_t v;
10642 v.val[0].m64_u64[0] = *(ptr);
10643 v.val[1].m64_u64[0] = *(ptr + 1);
10644 v.val[2].m64_u64[0] = *(ptr + 2);
10645 return v;
10646}
10647
10648_NEON2SSE_GLOBAL int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
10649#define vld3_dup_s8(ptr) vld3_dup_u8((uint8_t*)ptr)
10650
10651_NEON2SSE_GLOBAL int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
10652#define vld3_dup_s16(ptr) vld3_dup_u16((uint16_t*)ptr)
10653
10654_NEON2SSE_GLOBAL int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
10655#define vld3_dup_s32(ptr) vld3_dup_u32((uint32_t*)ptr)
10656
10657//int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
10658#define vld3_dup_s64(ptr) vld3_dup_u64((uint64_t*)ptr)
10659
10660
10661_NEON2SSE_GLOBAL float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
10662// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10663
10664_NEON2SSESTORAGE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
10665_NEON2SSE_INLINE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0]
10666{
10667 float32x2x3_t v;
10668 int i;
10669 for (i = 0; i<3; i++) {
10670 v.val[i].m64_f32[0] = *(ptr + i);
10671 v.val[i].m64_f32[1] = *(ptr + i);
10672 }
10673 return v;
10674}
10675
10676_NEON2SSE_GLOBAL poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
10677#define vld3_dup_p8 vld3_dup_u8
10678
10679_NEON2SSE_GLOBAL poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
10680#define vld3_dup_p16 vld3_dup_s16
10681
10682
10683//************* Duplicate (or propagate) quadruples: *******************
10684//***********************************************************************
10685//ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes, ptr[2] to all val[2] lanes and ptr[3] to all val[3] lanes
10686_NEON2SSESTORAGE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
10687_NEON2SSE_INLINE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr) // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
10688{
10689 uint8x8x4_t v;
10690 __m128i val0, val1, val2;
10691 val0 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x,x,x,x,x, x,x,x,x
10692 val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,3,3, x,x,x,x,x,x,x,x,
10693 val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,3,3,3,3
10694 val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
10695 val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, 3,3,3,3, 3,3,3,3
10696 vst1q_u8(&v.val[0], val0);
10697 vst1q_u8(&v.val[2], val2);
10698 return v;
10699}
10700
10701_NEON2SSESTORAGE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
10702_NEON2SSE_INLINE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr) // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
10703{
10704 uint16x4x4_t v;
10705 __m128i val0, val1, val2, val3;
10706 val3 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x
10707 val0 = _mm_shufflelo_epi16(val3, 0); //00 00 00 00 (all 0)
10708 val1 = _mm_shufflelo_epi16(val3, 85); //01 01 01 01 (all 1)
10709 val2 = _mm_shufflelo_epi16(val3, 170); //10 10 10 10 (all 2)
10710 val3 = _mm_shufflelo_epi16(val3, 255); //11 11 11 11 (all 3)
10711 _M64(v.val[0], val0);
10712 _M64(v.val[1], val1);
10713 _M64(v.val[2], val2);
10714 _M64(v.val[3], val3);
10715 return v;
10716}
10717
10718_NEON2SSESTORAGE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
10719_NEON2SSE_INLINE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
10720{
10721 uint32x2x4_t v;
10722 __m128i val0, val1, val2, val3;
10723 val3 = LOAD_SI128(ptr); //0,1,2,3
10724 val0 = _mm_shuffle_epi32(val3, 0 | (0 << 2) | (2 << 4) | (3 << 6)); //0,0,2,3
10725 val1 = _mm_shuffle_epi32(val3, 1 | (1 << 2) | (2 << 4) | (3 << 6)); //1,1,2,3
10726 val2 = _mm_shuffle_epi32(val3, 2 | (2 << 2) | (3 << 4) | (3 << 6)); //2,2,3,3
    val3 = _mm_shuffle_epi32(val3, 3 | (3 << 2) | (3 << 4) | (3 << 6)); //3,3,3,3
10728 _M64(v.val[0], val0);
10729 _M64(v.val[1], val1);
10730 _M64(v.val[2], val2);
10731 _M64(v.val[3], val3);
10732 return v;
10733}
10734
10735_NEON2SSESTORAGE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
10736_NEON2SSE_INLINE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0]
10737{
10738 uint64x1x4_t v;
10739 v.val[0].m64_u64[0] = *(ptr);
10740 v.val[1].m64_u64[0] = *(ptr + 1);
10741 v.val[2].m64_u64[0] = *(ptr + 2);
10742 v.val[3].m64_u64[0] = *(ptr + 3);
10743 return v;
10744}
10745
10746_NEON2SSE_GLOBAL int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
10747#define vld4_dup_s8(ptr) vld4_dup_u8((uint8_t*)ptr)
10748
10749_NEON2SSE_GLOBAL int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
10750#define vld4_dup_s16(ptr) vld4_dup_u16((uint16_t*)ptr)
10751
10752_NEON2SSE_GLOBAL int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
10753#define vld4_dup_s32(ptr) vld4_dup_u32((uint32_t*)ptr)
10754
10755//int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
10756#define vld4_dup_s64(ptr) vld4_dup_u64((uint64_t*)ptr)
10757
10758_NEON2SSE_GLOBAL float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
10759// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10760
10761_NEON2SSESTORAGE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
10762_NEON2SSE_INLINE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
10763{
10764 float32x2x4_t v;
10765 int i;
10766 for (i = 0; i<4; i++) {
10767 v.val[i].m64_f32[0] = *(ptr + i);
10768 v.val[i].m64_f32[1] = *(ptr + i);
10769 }
10770 return v;
10771}
10772
10773_NEON2SSE_GLOBAL poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
10774#define vld4_dup_p8 vld4_dup_u8
10775
10776_NEON2SSE_GLOBAL poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
10777#define vld4_dup_p16 vld4_dup_u16
10778
10779
10780//**********************************************************************************
10781//*******************Lane loads for an N-element structures ***********************
10782//**********************************************************************************
10783//********************** Lane pairs ************************************************
//does vld1_lane_xx: loads ptr[0] into src->val[0] and ptr[1] into src->val[1] at the given lane position
//we assume src is 16-byte aligned
10786
//!!!!!! The Microsoft compiler does not allow xxxxxx_2t function arguments, producing a "formal parameter with __declspec(align('16')) won't be aligned" error
//to fix it, all the functions below work with xxxxxx_2t pointers and the corresponding original functions are redefined as macros
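//A minimal usage sketch (comment only; variable names are hypothetical).
//It shows how the lane-load wrappers are meant to be called: ptr[0] and ptr[1] replace the chosen lane of
//src.val[0] and src.val[1] respectively, all other lanes are preserved. Because of the pointer/macro workaround
//described above, src must be an lvalue, and lane must be a compile-time constant as required by __constrange:
//    uint32_t pair[2] = {7, 9};
//    uint32x4x2_t v = vld2q_lane_u32(pair, v0, 3); //v0 is a previously initialized uint32x4x2_t
//    //v.val[0] equals v0.val[0] with lane 3 replaced by 7; v.val[1] equals v0.val[1] with lane 3 replaced by 9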
10789
10790//uint16x8x2_t vld2q_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
10791_NEON2SSE_INLINE uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t* src,__constrange(0,7) int lane) // VLD2.16 {d0[0], d2[0]}, [r0]
10792{
10793 uint16x8x2_t v;
10794 v.val[0] = vld1q_lane_s16 (ptr, src->val[0], lane);
10795 v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1], lane);
10796 return v;
10797}
10798#define vld2q_lane_u16(ptr, src, lane) vld2q_lane_u16_ptr(ptr, &src, lane)
10799
10800//uint32x4x2_t vld2q_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
10801_NEON2SSE_INLINE uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0]
10802{
10803 uint32x4x2_t v;
10804 v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane);
10805 v.val[1] = _MM_INSERT_EPI32 (src->val[1], ptr[1], lane);
10806 return v;
10807}
10808#define vld2q_lane_u32(ptr, src, lane) vld2q_lane_u32_ptr(ptr, &src, lane)
10809
10810//int16x8x2_t vld2q_lane_s16(__transfersize(2) int16_t const * ptr, int16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
10811_NEON2SSE_INLINE int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t* src, __constrange(0,7) int lane)
10812{
10813 int16x8x2_t v;
10814 v.val[0] = vld1q_lane_s16 (ptr, src->val[0], lane);
10815 v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1], lane);
10816 return v;
10817}
10818#define vld2q_lane_s16(ptr, src, lane) vld2q_lane_s16_ptr(ptr, &src, lane)
10819
10820//int32x4x2_t vld2q_lane_s32(__transfersize(2) int32_t const * ptr, int32x4x2_t src, __constrange(0,3)int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
10821_NEON2SSE_INLINE int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t* src, __constrange(0,3) int lane)
10822{
10823 int32x4x2_t v;
10824 v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane);
10825 v.val[1] = _MM_INSERT_EPI32 (src->val[1], ptr[1], lane);
10826 return v;
10827}
10828#define vld2q_lane_s32(ptr, src, lane) vld2q_lane_s32_ptr(ptr, &src, lane)
10829
10830//float16x8x2_t vld2q_lane_f16(__transfersize(2) __fp16 const * ptr, float16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
10831//current IA SIMD doesn't support float16
10832
10833//float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
10834_NEON2SSE_INLINE float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0]
10835{
10836 float32x4x2_t v;
10837 v.val[0] = vld1q_lane_f32(ptr, src->val[0], lane);
10838 v.val[1] = vld1q_lane_f32((ptr + 1), src->val[1], lane);
10839 return v;
10840}
10841#define vld2q_lane_f32(ptr,src,lane) vld2q_lane_f32_ptr(ptr,&src,lane)
10842
10843//poly16x8x2_t vld2q_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
10844#define vld2q_lane_p16 vld2q_lane_u16
10845
10846_NEON2SSESTORAGE uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
10847_NEON2SSE_INLINE uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane) // VLD2.8 {d0[0], d1[0]}, [r0]
10848{
10849 uint8x8x2_t v;
10850 v.val[0] = vld1_lane_u8(ptr, src.val[0], lane);
10851 v.val[1] = vld1_lane_u8((ptr + 1), src.val[1], lane);
10852 return v;
10853}
10854
10855_NEON2SSESTORAGE uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
10856_NEON2SSE_INLINE uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3) int lane)
10857{
10858 uint16x4x2_t v;
10859 v.val[0] = vld1_lane_u16(ptr, src.val[0], lane);
10860 v.val[1] = vld1_lane_u16((ptr + 1), src.val[1], lane);
10861 return v;
10862}
10863
10864_NEON2SSESTORAGE uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1)int lane);// VLD2.32 {d0[0], d1[0]}, [r0]
10865_NEON2SSE_INLINE uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1) int lane)
10866{
10867 uint32x2x2_t v;
10868 v.val[0] = vld1_lane_u32(ptr, src.val[0], lane);
10869 v.val[1] = vld1_lane_u32((ptr + 1), src.val[1], lane);
10870 return v;
10871}
10872
10873_NEON2SSE_GLOBAL int8x8x2_t vld2_lane_s8(__transfersize(2) int8_t const * ptr, int8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
10874#define vld2_lane_s8(ptr, src, lane) vld2_lane_u8(( uint8_t*) ptr, src, lane)
10875
10876_NEON2SSE_GLOBAL int16x4x2_t vld2_lane_s16(__transfersize(2) int16_t const * ptr, int16x4x2_t src, __constrange(0,3) int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
10877#define vld2_lane_s16(ptr, src, lane) vld2_lane_u16(( uint16_t*) ptr, src, lane)
10878
10879_NEON2SSE_GLOBAL int32x2x2_t vld2_lane_s32(__transfersize(2) int32_t const * ptr, int32x2x2_t src, __constrange(0,1) int lane);// VLD2.32 {d0[0], d1[0]}, [r0]
10880#define vld2_lane_s32(ptr, src, lane) vld2_lane_u32(( uint32_t*) ptr, src, lane)
10881
10882//float16x4x2_t vld2_lane_f16(__transfersize(2) __fp16 const * ptr, float16x4x2_t src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
10883//current IA SIMD doesn't support float16
10884
10885_NEON2SSESTORAGE float32x2x2_t vld2_lane_f32(__transfersize(2) float32_t const * ptr, float32x2x2_t src,__constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
10886_NEON2SSE_INLINE float32x2x2_t vld2_lane_f32(__transfersize(2) float32_t const * ptr, float32x2x2_t src,__constrange(0,1) int lane)
10887{
10888 float32x2x2_t v;
10889 v.val[0] = vld1_lane_f32(ptr, src.val[0], lane);
10890 v.val[1] = vld1_lane_f32((ptr + 1), src.val[1], lane);
10891 return v;
10892}
10893
10894//poly8x8x2_t vld2_lane_p8(__transfersize(2) poly8_t const * ptr, poly8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
10895_NEON2SSE_GLOBAL poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane); // VLD2.8 {d0[0], d1[0]}, [r0]
10896#define vld2_lane_p8 vld2_lane_u8
10897
10898//poly16x4x2_t vld2_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
10899_NEON2SSE_GLOBAL poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
10900#define vld2_lane_p16 vld2_lane_u16
10901
10902//*********** Lane triplets **********************
10903//*************************************************
//does vld1_lane_xx: loads ptr[0] into src->val[0], ptr[1] into src->val[1] and ptr[2] into src->val[2] at the given lane position
//we assume src is 16-byte aligned
10906
10907//uint16x8x3_t vld3q_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x8x3_t src,__constrange(0,7) int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
10908_NEON2SSE_INLINE uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t* src,__constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
10909{
10910 uint16x8x3_t v;
10911 v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane);
10912 v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane);
10913 v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane);
10914 return v;
10915}
10916#define vld3q_lane_u16(ptr, src, lane) vld3q_lane_u16_ptr(ptr, &src, lane)
10917
10918//uint32x4x3_t vld3q_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
10919_NEON2SSE_INLINE uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
10920{
10921 uint32x4x3_t v;
10922 v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane);
10923 v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane);
10924 v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane);
10925 return v;
10926}
10927#define vld3q_lane_u32(ptr, src, lane) vld3q_lane_u32_ptr(ptr, &src, lane)
10928
10929//int16x8x3_t vld3q_lane_s16(__transfersize(3) int16_t const * ptr, int16x8x3_t src, __constrange(0,7)int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
10930_NEON2SSE_INLINE int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t* src, __constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
10931{
10932 int16x8x3_t v;
10933 v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane);
10934 v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane);
10935 v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane);
10936 return v;
10937}
10938#define vld3q_lane_s16(ptr, src, lane) vld3q_lane_s16_ptr(ptr, &src, lane)
10939
10940//int32x4x3_t vld3q_lane_s32(__transfersize(3) int32_t const * ptr, int32x4x3_t src, __constrange(0,3)int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
10941_NEON2SSE_INLINE int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t* src, __constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
10942{
10943 int32x4x3_t v;
10944 v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane);
10945 v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane);
10946 v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane);
10947 return v;
10948}
10949#define vld3q_lane_s32(ptr, src, lane) vld3q_lane_s32_ptr(ptr, &src, lane)
10950
10951_NEON2SSE_GLOBAL float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
10952//current IA SIMD doesn't support float16
10953#define vld3q_lane_f16(ptr, src, lane) vld3q_lane_f16_ptr(ptr, &src, lane)
10954
10955
10956//float32x4x3_t vld3q_lane_f32(__transfersize(3) float32_t const * ptr, float32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
10957_NEON2SSE_INLINE float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
10958{
10959 float32x4x3_t v;
10960 v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane);
10961 v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane);
10962 v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane);
10963 return v;
10964}
10965#define vld3q_lane_f32(ptr,src,lane) vld3q_lane_f32_ptr(ptr,&src,lane)
10966
10967_NEON2SSE_GLOBAL poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src,__constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
10968#define vld3q_lane_p16 vld3q_lane_u16
10969
10970_NEON2SSESTORAGE uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane);// VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
10971_NEON2SSE_INLINE uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane) // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
10972{
10973 uint8x8x3_t v;
10974 v.val[0] = vld1_lane_u8(ptr, src.val[0], lane);
10975 v.val[1] = vld1_lane_u8((ptr + 1), src.val[1], lane);
10976 v.val[2] = vld1_lane_u8((ptr + 2), src.val[2], lane);
10977 return v;
10978}
10979
10980_NEON2SSESTORAGE uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x4x3_t src, __constrange(0,3)int lane);// VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
10981_NEON2SSE_INLINE uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x4x3_t src, __constrange(0,3) int lane) // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
10982{
10983 uint16x4x3_t v;
10984 v.val[0] = vld1_lane_u16(ptr, src.val[0], lane);
10985 v.val[1] = vld1_lane_u16((ptr + 1), src.val[1], lane);
10986 v.val[2] = vld1_lane_u16((ptr + 2), src.val[2], lane);
10987 return v;
10988}
10989
10990_NEON2SSESTORAGE uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1)int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
10991_NEON2SSE_INLINE uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
10992{
10993 //need to merge into 128 bit anyway
10994 uint32x2x3_t v;
    v.val[0] = vld1_lane_u32(ptr, src.val[0], lane);
    v.val[1] = vld1_lane_u32((ptr + 1), src.val[1], lane);
    v.val[2] = vld1_lane_u32((ptr + 2), src.val[2], lane);
10998 return v;
10999}
11000
11001_NEON2SSE_GLOBAL int8x8x3_t vld3_lane_s8(__transfersize(3) int8_t const * ptr, int8x8x3_t src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
11002#define vld3_lane_s8(ptr, src, lane) vld3_lane_u8(( uint8_t*) ptr, src, lane)
11003
11004_NEON2SSE_GLOBAL int16x4x3_t vld3_lane_s16(__transfersize(3) int16_t const * ptr, int16x4x3_t src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
11005#define vld3_lane_s16(ptr, src, lane) vld3_lane_u16(( uint16_t*) ptr, src, lane)
11006
11007_NEON2SSE_GLOBAL int32x2x3_t vld3_lane_s32(__transfersize(3) int32_t const * ptr, int32x2x3_t src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
11008#define vld3_lane_s32(ptr, src, lane) vld3_lane_u32(( uint32_t*) ptr, src, lane)
11009
11010_NEON2SSE_GLOBAL float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
11011//current IA SIMD doesn't support float16
11012
11013_NEON2SSESTORAGE float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src,__constrange(0,1) int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
11014_NEON2SSE_INLINE float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src,__constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
11015{
11016 float32x2x3_t v;
11017 v.val[0] = vld1_lane_f32(ptr, src.val[0], lane);
11018 v.val[1] = vld1_lane_f32((ptr + 1), src.val[1], lane);
11019 v.val[2] = vld1_lane_f32((ptr + 2), src.val[2], lane);
11020 return v;
11021}
11022
11023_NEON2SSE_GLOBAL poly8x8x3_t vld3_lane_p8(__transfersize(3) poly8_t const * ptr, poly8x8x3_t src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
11024#define vld3_lane_p8 vld3_lane_u8
11025
11026_NEON2SSE_GLOBAL poly16x4x3_t vld3_lane_p16(__transfersize(3) poly16_t const * ptr, poly16x4x3_t src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
11027#define vld3_lane_p16 vld3_lane_u16
11028
11029//******************* Lane Quadruples load ***************************
11030//*********************************************************************
//does vld1_lane_xx: loads ptr[0] into src->val[0], ptr[1] into src->val[1], ptr[2] into src->val[2] and ptr[3] into src->val[3] at the given lane position
//we assume src is 16-byte aligned
11033
11034//uint16x8x4_t vld4q_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x8x4_t src,__constrange(0,7) int lane)// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11035_NEON2SSE_INLINE uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t* src,__constrange(0,7) int lane)
11036{
11037 uint16x8x4_t v;
11038 v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane);
11039 v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane);
11040 v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane);
11041 v.val[3] = _MM_INSERT_EPI16 ( src->val[3], ptr[3], lane);
11042 return v;
11043}
11044#define vld4q_lane_u16(ptr, src, lane) vld4q_lane_u16_ptr(ptr, &src, lane)
11045
11046//uint32x4x4_t vld4q_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11047_NEON2SSE_INLINE uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t* src,__constrange(0,3) int lane)
11048{
11049 uint32x4x4_t v;
11050 v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane);
11051 v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane);
11052 v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane);
11053 v.val[3] = _MM_INSERT_EPI32 ( src->val[3], ptr[3], lane);
11054 return v;
11055}
11056#define vld4q_lane_u32(ptr, src, lane) vld4q_lane_u32_ptr(ptr, &src, lane)
11057
11058//int16x8x4_t vld4q_lane_s16(__transfersize(4) int16_t const * ptr, int16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11059_NEON2SSE_GLOBAL int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11060#define vld4q_lane_s16(ptr, src, lane) vld4q_lane_u16(( uint16_t*) ptr, src, lane)
11061
11062//int32x4x4_t vld4q_lane_s32(__transfersize(4) int32_t const * ptr, int32x4x4_t src, __constrange(0,3)int lane);// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11063_NEON2SSE_GLOBAL int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11064#define vld4q_lane_s32(ptr, src, lane) vld4q_lane_u32(( uint32_t*) ptr, src, lane)
11065
11066//float16x8x4_t vld4q_lane_f16(__transfersize(4) __fp16 const * ptr, float16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11067_NEON2SSE_GLOBAL float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11068//current IA SIMD doesn't support float16
11069
11070//float32x4x4_t vld4q_lane_f32(__transfersize(4) float32_t const * ptr, float32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11071_NEON2SSE_INLINE float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t* src,__constrange(0,3) int lane)
11072{
11073 float32x4x4_t v;
11074 v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane);
11075 v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane);
11076 v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane);
11077 v.val[3] = vld1q_lane_f32(&ptr[3], src->val[3], lane);
11078 return v;
11079}
11080#define vld4q_lane_f32(ptr,val,lane) vld4q_lane_f32_ptr(ptr,&val,lane)
11081
11082//poly16x8x4_t vld4q_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x8x4_t src,__constrange(0,7) int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11083_NEON2SSE_GLOBAL poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src,__constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11084#define vld4q_lane_p16 vld4q_lane_u16
11085
11086_NEON2SSESTORAGE uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11087_NEON2SSE_INLINE uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane)
11088{
11089 uint8x8x4_t v;
11090 v.val[0] = vld1_lane_u8(ptr, src.val[0], lane);
11091 v.val[1] = vld1_lane_u8((ptr + 1), src.val[1], lane);
11092 v.val[2] = vld1_lane_u8((ptr + 2), src.val[2], lane);
11093 v.val[3] = vld1_lane_u8((ptr + 3), src.val[3], lane);
11094 return v;
11095}
11096
11097_NEON2SSESTORAGE uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11098_NEON2SSE_INLINE uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3) int lane)
11099{
11100 uint16x4x4_t v;
11101 v.val[0] = vld1_lane_u16(ptr, src.val[0], lane);
11102 v.val[1] = vld1_lane_u16((ptr + 1), src.val[1], lane);
11103 v.val[2] = vld1_lane_u16((ptr + 2), src.val[2], lane);
11104 v.val[3] = vld1_lane_u16((ptr + 3), src.val[3], lane);
11105 return v;
11106}
11107
11108_NEON2SSESTORAGE uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1)int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11109_NEON2SSE_INLINE uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1) int lane)
11110{
11111 uint32x2x4_t v;
11112 v.val[0] = vld1_lane_u32(ptr, src.val[0], lane);
11113 v.val[1] = vld1_lane_u32((ptr + 1), src.val[1], lane);
11114 v.val[2] = vld1_lane_u32((ptr + 2), src.val[2], lane);
11115 v.val[3] = vld1_lane_u32((ptr + 3), src.val[3], lane);
11116 return v;
11117}
11118
11119_NEON2SSE_GLOBAL int8x8x4_t vld4_lane_s8(__transfersize(4) int8_t const * ptr, int8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11120#define vld4_lane_s8(ptr,src,lane) vld4_lane_u8((uint8_t*)ptr,src,lane)
11121
11122_NEON2SSE_GLOBAL int16x4x4_t vld4_lane_s16(__transfersize(4) int16_t const * ptr, int16x4x4_t src, __constrange(0,3) int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11123#define vld4_lane_s16(ptr,src,lane) vld4_lane_u16((uint16_t*)ptr,src,lane)
11124
11125_NEON2SSE_GLOBAL int32x2x4_t vld4_lane_s32(__transfersize(4) int32_t const * ptr, int32x2x4_t src, __constrange(0,1) int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11126#define vld4_lane_s32(ptr,src,lane) vld4_lane_u32((uint32_t*)ptr,src,lane)
11127
11128//float16x4x4_t vld4_lane_f16(__transfersize(4) __fp16 const * ptr, float16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11129_NEON2SSE_GLOBAL float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane);
11130//current IA SIMD doesn't support float16
11131
11132_NEON2SSESTORAGE float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src,__constrange(0,1) int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11133_NEON2SSE_INLINE float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src,__constrange(0,1) int lane)
11134{
11135 //serial solution may be faster
11136 float32x2x4_t v;
11137 v.val[0] = vld1_lane_f32(ptr, src.val[0], lane);
11138 v.val[1] = vld1_lane_f32((ptr + 1), src.val[1], lane);
11139 v.val[2] = vld1_lane_f32((ptr + 2), src.val[2], lane);
11140 v.val[3] = vld1_lane_f32((ptr + 3), src.val[3], lane);
11141 return v;
11142}
11143
11144_NEON2SSE_GLOBAL poly8x8x4_t vld4_lane_p8(__transfersize(4) poly8_t const * ptr, poly8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11145#define vld4_lane_p8 vld4_lane_u8
11146
11147_NEON2SSE_GLOBAL poly16x4x4_t vld4_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11148#define vld4_lane_p16 vld4_lane_u16
11149
11150//******************* Store duplets *********************************************
11151//********************************************************************************
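//A minimal usage sketch (comment only; buffer names are hypothetical).
//vst2 is the inverse of vld2: the two vectors are interleaved element-wise on store, so separate
//x/y streams end up as x0,y0,x1,y1,... in memory:
//    float32_t out[8];
//    float32x4x2_t xy;
//    xy.val[0] = vld1q_f32(xs); //x0,x1,x2,x3 (xs is a caller-provided array of 4 floats)
//    xy.val[1] = vld1q_f32(ys); //y0,y1,y2,y3
//    vst2q_f32(out, xy);        //out = {x0,y0, x1,y1, x2,y2, x3,y3}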
11152//void vst2q_u8(__transfersize(32) uint8_t * ptr, uint8x16x2_t val)// VST2.8 {d0, d2}, [r0]
11153_NEON2SSE_INLINE void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t const * val)
11154{
11155 uint8x16x2_t v;
11156 v.val[0] = _mm_unpacklo_epi8(val->val[0], val->val[1]);
11157 v.val[1] = _mm_unpackhi_epi8(val->val[0], val->val[1]);
11158 vst1q_u8 (ptr, v.val[0]);
11159 vst1q_u8 ((ptr + 16), v.val[1]);
11160}
11161#define vst2q_u8(ptr, val) vst2q_u8_ptr(ptr, &val)
11162
11163//void vst2q_u16(__transfersize(16) uint16_t * ptr, uint16x8x2_t val)// VST2.16 {d0, d2}, [r0]
11164_NEON2SSE_INLINE void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t const * val)
11165{
11166 uint16x8x2_t v;
11167 v.val[0] = _mm_unpacklo_epi16(val->val[0], val->val[1]);
11168 v.val[1] = _mm_unpackhi_epi16(val->val[0], val->val[1]);
11169 vst1q_u16 (ptr, v.val[0]);
11170 vst1q_u16 ((ptr + 8), v.val[1]);
11171}
11172#define vst2q_u16(ptr, val) vst2q_u16_ptr(ptr, &val)
11173
11174//void vst2q_u32(__transfersize(8) uint32_t * ptr, uint32x4x2_t val)// VST2.32 {d0, d2}, [r0]
11175_NEON2SSE_INLINE void vst2q_u32_ptr(__transfersize(8) uint32_t* ptr, uint32x4x2_t const * val)
11176{
11177 uint32x4x2_t v;
11178 v.val[0] = _mm_unpacklo_epi32(val->val[0], val->val[1]);
11179 v.val[1] = _mm_unpackhi_epi32(val->val[0], val->val[1]);
11180 vst1q_u32 (ptr, v.val[0]);
11181 vst1q_u32 ((ptr + 4), v.val[1]);
11182}
11183#define vst2q_u32(ptr, val) vst2q_u32_ptr(ptr, &val)
11184
11185//void vst2q_s8(__transfersize(32) int8_t * ptr, int8x16x2_t val); // VST2.8 {d0, d2}, [r0]
11186_NEON2SSE_GLOBAL void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t const * val);
11187#define vst2q_s8(ptr, val) vst2q_u8((uint8_t*)(ptr), val)
11188
11189//void vst2q_s16(__transfersize(16) int16_t * ptr, int16x8x2_t val);// VST2.16 {d0, d2}, [r0]
11190_NEON2SSE_GLOBAL void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t const * val);
11191#define vst2q_s16(ptr, val) vst2q_u16((uint16_t*)(ptr), val)
11192
11193//void vst2q_s32(__transfersize(8) int32_t * ptr, int32x4x2_t val);// VST2.32 {d0, d2}, [r0]
11194_NEON2SSE_GLOBAL void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t const * val);
11195#define vst2q_s32(ptr, val) vst2q_u32((uint32_t*)(ptr), val)
11196
11197//void vst2q_f16(__transfersize(16) __fp16 * ptr, float16x8x2_t val);// VST2.16 {d0, d2}, [r0]
11198_NEON2SSE_GLOBAL void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t const * val);
11199// IA32 SIMD doesn't work with 16bit floats currently
11200
11201//void vst2q_f32(__transfersize(8) float32_t * ptr, float32x4x2_t val)// VST2.32 {d0, d2}, [r0]
11202_NEON2SSE_INLINE void vst2q_f32_ptr(__transfersize(8) float32_t* ptr, float32x4x2_t const * val)
11203{
11204 float32x4x2_t v;
11205 v.val[0] = _mm_unpacklo_ps(val->val[0], val->val[1]);
11206 v.val[1] = _mm_unpackhi_ps(val->val[0], val->val[1]);
11207 vst1q_f32 (ptr, v.val[0]);
11208 vst1q_f32 ((ptr + 4), v.val[1]);
11209}
11210#define vst2q_f32(ptr, val) vst2q_f32_ptr(ptr, &val)
11211
11212//void vst2q_p8(__transfersize(32) poly8_t * ptr, poly8x16x2_t val);// VST2.8 {d0, d2}, [r0]
11213_NEON2SSE_GLOBAL void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t const * val);
11214#define vst2q_p8 vst2q_u8
11215
11216//void vst2q_p16(__transfersize(16) poly16_t * ptr, poly16x8x2_t val);// VST2.16 {d0, d2}, [r0]
11217_NEON2SSE_GLOBAL void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t const * val);
11218#define vst2q_p16 vst2q_u16
11219
11220_NEON2SSESTORAGE void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val);// VST2.8 {d0, d1}, [r0]
11221_NEON2SSE_INLINE void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val)
11222{
11223 __m128i v0;
11224 v0 = _mm_unpacklo_epi8(_pM128i(val.val[0]), _pM128i(val.val[1]));
11225 vst1q_u8 (ptr, v0);
11226}
11227
11228_NEON2SSESTORAGE void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val);// VST2.16 {d0, d1}, [r0]
11229_NEON2SSE_INLINE void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val)
11230{
11231 __m128i v0;
11232 v0 = _mm_unpacklo_epi16(_pM128i(val.val[0]), _pM128i(val.val[1]));
11233 vst1q_u16 (ptr, v0);
11234}
11235
11236_NEON2SSESTORAGE void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val);// VST2.32 {d0, d1}, [r0]
11237_NEON2SSE_INLINE void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val)
11238{
11239 __m128i v0;
11240 v0 = _mm_unpacklo_epi32(_pM128i(val.val[0]), _pM128i(val.val[1]));
11241 vst1q_u32 (ptr, v0);
11242}
11243
11244_NEON2SSESTORAGE void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val);// VST1.64 {d0, d1}, [r0]
11245_NEON2SSE_INLINE void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val)
11246{
11247 *(ptr) = val.val[0].m64_u64[0];
11248 *(ptr + 1) = val.val[1].m64_u64[0];
11249}
11250
11251_NEON2SSE_GLOBAL void vst2_s8(__transfersize(16) int8_t * ptr, int8x8x2_t val);// VST2.8 {d0, d1}, [r0]
11252#define vst2_s8(ptr, val) vst2_u8((uint8_t*) ptr, val)
11253
11254_NEON2SSE_GLOBAL void vst2_s16(__transfersize(8) int16_t * ptr, int16x4x2_t val); // VST2.16 {d0, d1}, [r0]
11255#define vst2_s16(ptr,val) vst2_u16((uint16_t*) ptr, val)
11256
11257_NEON2SSE_GLOBAL void vst2_s32(__transfersize(4) int32_t * ptr, int32x2x2_t val); // VST2.32 {d0, d1}, [r0]
11258#define vst2_s32(ptr,val) vst2_u32((uint32_t*) ptr, val)
11259
11260_NEON2SSE_GLOBAL void vst2_s64(__transfersize(2) int64_t * ptr, int64x1x2_t val);
11261#define vst2_s64(ptr,val) vst2_u64((uint64_t*) ptr,val)
11262
11263//void vst2_f16(__transfersize(8) __fp16 * ptr, float16x4x2_t val); // VST2.16 {d0, d1}, [r0]
11264//current IA SIMD doesn't support float16
11265
11266_NEON2SSESTORAGE void vst2_f32(__transfersize(4) float32_t * ptr, float32x2x2_t val); // VST2.32 {d0, d1}, [r0]
11267_NEON2SSE_INLINE void vst2_f32(__transfersize(4) float32_t* ptr, float32x2x2_t val)
11268{
11269 *(ptr) = val.val[0].m64_f32[0];
11270 *(ptr + 1) = val.val[1].m64_f32[0];
11271 *(ptr + 2) = val.val[0].m64_f32[1];
11272 *(ptr + 3) = val.val[1].m64_f32[1];
11273}
11274
11275_NEON2SSE_GLOBAL void vst2_p8(__transfersize(16) poly8_t * ptr, poly8x8x2_t val); // VST2.8 {d0, d1}, [r0]
11276#define vst2_p8 vst2_u8
11277
11278_NEON2SSE_GLOBAL void vst2_p16(__transfersize(8) poly16_t * ptr, poly16x4x2_t val); // VST2.16 {d0, d1}, [r0]
11279#define vst2_p16 vst2_u16
11280
11281//******************** Triplets store *****************************************
11282//******************************************************************************
11283//void vst3q_u8(__transfersize(48) uint8_t * ptr, uint8x16x3_t val)// VST3.8 {d0, d2, d4}, [r0]
11284_NEON2SSE_INLINE void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t const * val)
11285{
11286 uint8x16x3_t v;
11287 __m128i v0,v1,v2, cff, bldmask;
11288 _NEON2SSE_ALIGN_16 static const uint8_t mask0[16] = {0, 1, 0xff, 2, 3,0xff, 4, 5,0xff, 6,7,0xff, 8,9,0xff, 10};
11289 _NEON2SSE_ALIGN_16 static const uint8_t mask1[16] = {0, 0xff, 1, 2, 0xff, 3, 4, 0xff, 5, 6, 0xff, 7,8,0xff, 9,10};
11290 _NEON2SSE_ALIGN_16 static const uint8_t mask2[16] = {0xff, 6, 7, 0xff, 8, 9,0xff, 10, 11,0xff, 12,13,0xff, 14,15,0xff};
11291 _NEON2SSE_ALIGN_16 static const uint8_t mask2lo[16] = {0xff,0xff, 0, 0xff,0xff, 1, 0xff,0xff, 2, 0xff,0xff, 3, 0xff,0xff, 4, 0xff};
11292 _NEON2SSE_ALIGN_16 static const uint8_t mask2med[16] = {0xff, 5, 0xff, 0xff, 6, 0xff,0xff, 7, 0xff,0xff, 8, 0xff,0xff, 9, 0xff, 0xff};
11293 _NEON2SSE_ALIGN_16 static const uint8_t mask2hi[16] = {10, 0xff,0xff, 11, 0xff,0xff, 12, 0xff,0xff, 13, 0xff,0xff, 14, 0xff, 0xff, 15};
11294
11295 v0 = _mm_unpacklo_epi8(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10, 12,13, 15,16, 18,19, 21,22
11296 v2 = _mm_unpackhi_epi8(val->val[0], val->val[1]); //24,25, 27,28, 30,31, 33,34, 36,37, 39,40, 42,43, 45,46
11297 v1 = _mm_alignr_epi8(v2, v0, 11); //12,13, 15,16, 18,19, 21,22, 24,25, 27,28, 30,31, 33,34
11298 v.val[0] = _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding
11299 v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding
11300 cff = _mm_cmpeq_epi8(v0, v0); //all ff
11301 bldmask = _mm_cmpeq_epi8(*(__m128i*)mask0, cff);
11302 v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask);
11303 vst1q_u8(ptr, v.val[0]);
11304 v.val[0] = _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding
11305 v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding
11306 bldmask = _mm_cmpeq_epi8(*(__m128i*)mask1, cff);
11307 v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask);
11308 vst1q_u8((ptr + 16), v.val[1]);
11309 v.val[0] = _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding
11310 v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding
11311 bldmask = _mm_cmpeq_epi8(*(__m128i*)mask2, cff);
11312 v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask );
11313 vst1q_u8((ptr + 32), v.val[2]);
11314}
11315#define vst3q_u8(ptr, val) vst3q_u8_ptr(ptr, &val)
11316
11317//void vst3q_u16(__transfersize(24) uint16_t * ptr, uint16x8x3_t val)// VST3.16 {d0, d2, d4}, [r0]
11318_NEON2SSE_INLINE void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t const * val)
11319{
11320 uint16x8x3_t v;
11321 __m128i v0,v1,v2, cff, bldmask;
11322 _NEON2SSE_ALIGN_16 static const uint8_t mask0[16] = {0,1, 2,3, 0xff,0xff, 4,5, 6,7,0xff,0xff, 8,9,10,11};
11323 _NEON2SSE_ALIGN_16 static const uint8_t mask1[16] = {0xff, 0xff, 0,1, 2,3, 0xff,0xff, 4,5, 6,7, 0xff,0xff, 8,9};
11324 _NEON2SSE_ALIGN_16 static const uint8_t mask2[16] = {6,7,0xff,0xff, 8,9,10,11, 0xff, 0xff, 12,13,14,15, 0xff, 0xff};
11325 _NEON2SSE_ALIGN_16 static const uint8_t mask2lo[16] = {0xff,0xff, 0xff,0xff, 0,1, 0xff,0xff, 0xff,0xff, 2,3, 0xff,0xff, 0xff,0xff};
11326 _NEON2SSE_ALIGN_16 static const uint8_t mask2med[16] = {4,5, 0xff,0xff,0xff,0xff, 6,7, 0xff, 0xff,0xff,0xff, 8,9, 0xff, 0xff};
11327 _NEON2SSE_ALIGN_16 static const uint8_t mask2hi[16] = {0xff, 0xff, 10,11, 0xff, 0xff, 0xff, 0xff, 12,13, 0xff, 0xff, 0xff, 0xff,14,15};
11328
11329 v0 = _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10
11330 v2 = _mm_unpackhi_epi16(val->val[0], val->val[1]); //12,13, 15,16, 18,19, 21,22,
11331 v1 = _mm_alignr_epi8(v2, v0, 12); //9,10, 12,13, 15,16, 18,19
11332 v.val[0] = _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding
11333 v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding
11334 cff = _mm_cmpeq_epi16(v0, v0); //all ff
11335 bldmask = _mm_cmpeq_epi16(*(__m128i*)mask0, cff);
11336 v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask);
11337 vst1q_u16(ptr, v.val[0]);
11338 v.val[0] = _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding
11339 v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding
11340 bldmask = _mm_cmpeq_epi16(*(__m128i*)mask1, cff);
11341 v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask);
11342 vst1q_u16((ptr + 8), v.val[1]);
11343 v.val[0] = _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding
11344 v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding
11345 bldmask = _mm_cmpeq_epi16(*(__m128i*)mask2, cff);
11346 v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask );
11347 vst1q_u16((ptr + 16), v.val[2]);
11348}
11349#define vst3q_u16(ptr, val) vst3q_u16_ptr(ptr, &val)
11350
11351//void vst3q_u32(__transfersize(12) uint32_t * ptr, uint32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0]
11352_NEON2SSE_INLINE void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t const * val)
11353{
11354 //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,b0,c0,a1, b1,c1,a2,b2, c2,a3,b3,c3
11355 uint32x4x3_t v;
11356 __m128i tmp0, tmp1,tmp2;
11357 tmp0 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //a0,b0,a1,b1
11358 tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //a2,b2,a3,b3
11359 tmp2 = _mm_unpacklo_epi32(val->val[1], val->val[2]); //b0,c0,b1,c1
11360 v.val[1] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp2),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(1,0,3,2))); //b1,c1,a2,b2,
11361 v.val[2] = _mm_unpackhi_epi64(tmp1, val->val[2]); //a3,b3, c2,c3
11362 v.val[2] = _mm_shuffle_epi32(v.val[2], 2 | (0 << 2) | (1 << 4) | (3 << 6)); //c2,a3,b3,c3
11363 tmp1 = _mm_unpacklo_epi32(tmp2,val->val[0]); //b0,a0,c0,a1
11364 v.val[0] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp0),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(3,2,1,0))); //a0,b0,c0,a1,
11365
11366 vst1q_u32(ptr, v.val[0]);
11367 vst1q_u32((ptr + 4), v.val[1]);
11368 vst1q_u32((ptr + 8), v.val[2]);
11369}
11370#define vst3q_u32(ptr, val) vst3q_u32_ptr(ptr, &val)
11371
11372//void vst3q_s8(__transfersize(48) int8_t * ptr, int8x16x3_t val);
11373_NEON2SSE_GLOBAL void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t const * val);
11374#define vst3q_s8(ptr, val) vst3q_u8((uint8_t*)(ptr), val)
11375
11376//void vst3q_s16(__transfersize(24) int16_t * ptr, int16x8x3_t val);
11377_NEON2SSE_GLOBAL void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t const * val);
11378#define vst3q_s16(ptr, val) vst3q_u16((uint16_t*)(ptr), val)
11379
11380//void vst3q_s32(__transfersize(12) int32_t * ptr, int32x4x3_t val);
11381_NEON2SSE_GLOBAL void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t const * val);
11382#define vst3q_s32(ptr, val) vst3q_u32((uint32_t*)(ptr), val)
11383
11384//void vst3q_f16(__transfersize(24) __fp16 * ptr, float16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0]
11385_NEON2SSE_GLOBAL void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t const * val);
11386// IA32 SIMD doesn't work with 16bit floats currently
11387
11388//void vst3q_f32(__transfersize(12) float32_t * ptr, float32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0]
11389_NEON2SSE_INLINE void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t const * val)
11390{
11391 float32x4x3_t v;
11392 __m128 tmp0, tmp1,tmp2;
11393 tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]); //a0,b0,a1,b1
11394 tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]); //a2,b2,a3,b3
11395 tmp2 = _mm_unpacklo_ps(val->val[1], val->val[2]); //b0,c0,b1,c1
11396 v.val[1] = _mm_shuffle_ps(tmp2,tmp1, _MM_SHUFFLE(1,0,3,2)); //b1,c1,a2,b2,
11397 v.val[2] = _mm_movehl_ps(val->val[2],tmp1); //a3,b3, c2,c3
11398 v.val[2] = _mm_shuffle_ps(v.val[2],v.val[2], _MM_SHUFFLE(3,1,0,2)); //c2,a3,b3,c3
11399 tmp1 = _mm_unpacklo_ps(tmp2,val->val[0]); //b0,a0,c0,a1
11400 v.val[0] = _mm_shuffle_ps(tmp0,tmp1, _MM_SHUFFLE(3,2,1,0)); //a0,b0,c0,a1,
11401
11402 vst1q_f32( ptr, v.val[0]);
11403 vst1q_f32( (ptr + 4), v.val[1]);
11404 vst1q_f32( (ptr + 8), v.val[2]);
11405}
11406#define vst3q_f32(ptr, val) vst3q_f32_ptr(ptr, &val)
11407
11408//void vst3q_p8(__transfersize(48) poly8_t * ptr, poly8x16x3_t val);// VST3.8 {d0, d2, d4}, [r0]
11409_NEON2SSE_GLOBAL void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t const * val);
11410#define vst3q_p8 vst3q_u8
11411
11412//void vst3q_p16(__transfersize(24) poly16_t * ptr, poly16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0]
11413_NEON2SSE_GLOBAL void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t const * val);
11414#define vst3q_p16 vst3q_u16
11415
11416_NEON2SSESTORAGE void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val);// VST3.8 {d0, d1, d2}, [r0]
11417_NEON2SSE_INLINE void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val)
11418{
11419 __m128i tmp, sh0, sh1, val0, val2;
11420 _NEON2SSE_ALIGN_16 static const int8_t mask0[16] = { 0, 8, 16, 1, 9, 17, 2, 10, 18, 3, 11, 19, 4, 12, 20, 5};
11421 _NEON2SSE_ALIGN_16 static const int8_t mask1[16] = {13, 21, 6, 14, 22, 7, 15, 23, 0,0,0,0,0,0,0,0};
11422 _NEON2SSE_ALIGN_16 static const uint8_t mask0_sel[16] = {0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0};
11423 _NEON2SSE_ALIGN_16 static const uint8_t mask1_sel[16] = {0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0,0,0,0,0,0,0,0};
11424 tmp = _mm_unpacklo_epi64(_pM128i(val.val[0]), _pM128i(val.val[1]) );
    sh0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask0); //for bi>15 bi is wrapped modulo 16 (bi -= 16)
11426 val2 = _pM128i(val.val[2]);
11427 sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask0);
11428 val0 = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask0_sel);
11429 vst1q_u8(ptr, val0); //store as 128 bit structure
    sh0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask1); //for bi>15 bi is wrapped modulo 16 (bi -= 16)
11431 sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask1);
11432 val2 = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask1_sel);
11433 _M64((*(__m64_128*)(ptr + 16)), val2); //need it to fit into *ptr memory
11434}
11435
11436_NEON2SSESTORAGE void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
11437_NEON2SSE_INLINE void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val)
11438{
11439 __m128i tmp, val0, val1, val2;
11440 _NEON2SSE_ALIGN_16 static const int8_t mask0[16] = {0,1, 8,9, 16,17, 2,3, 10,11, 18,19, 4,5, 12,13};
11441 _NEON2SSE_ALIGN_16 static const int8_t mask1[16] = {20,21, 6,7, 14,15, 22,23, 0,0,0,0,0,0,0,0};
11442 _NEON2SSE_ALIGN_16 static const uint16_t mask0f[8] = {0xffff, 0xffff, 0, 0xffff, 0xffff, 0, 0xffff, 0xffff}; //if all ones we take the result from v.val[0] otherwise from v.val[1]
11443 _NEON2SSE_ALIGN_16 static const uint16_t mask1f[8] = {0xffff, 0, 0, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}; //if all ones we take the result from v.val[1] otherwise from v.val[0]
11444 tmp = _mm_unpacklo_epi64(_pM128i(val.val[0]), _pM128i(val.val[1]));
11445 val0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask0);
11446 val2 = _pM128i(val.val[2]);
11447 val1 = _mm_shuffle_epi8(val2, *(__m128i*)mask0);
11448 val0 = _MM_BLENDV_EPI8(val1, val0, *(__m128i*)mask0f);
11449 vst1q_u16(ptr, val0); //store as 128 bit structure
11450 val0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask1);
11451 val1 = _mm_shuffle_epi8(val2, *(__m128i*)mask1);
11452 val1 = _MM_BLENDV_EPI8(val0, val1, *(__m128i*)mask1f); //change the operands order
11453 _M64((*(__m64_128*)(ptr + 8)), val1); //need it to fit into *ptr memory
11454}
11455
11456_NEON2SSESTORAGE void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val);// VST3.32 {d0, d1, d2}, [r0]
11457_NEON2SSE_INLINE void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val)
11458{
11459 //val.val[0]:0,3,val.val[1]:1,4; val.val[2]:2,5,x,x;
11460 __m128i val0, val1;
11461 val0 = _mm_unpacklo_epi64(_pM128i(val.val[1]), _pM128i(val.val[2])); //val[0]: 1,4,2,5
11462 val0 = _mm_shuffle_epi32(val0, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //1,2,4,5
11463 val1 = _mm_srli_si128(val0, 8); //4,5, x,x
11464 _M64((*(__m64_128*)(ptr + 4)), val1);
11465 val0 = _mm_unpacklo_epi32(_pM128i(val.val[0]), val0); //0,1,3,2
11466 val0 = _mm_shuffle_epi32(val0, 0 | (1 << 2) | (3 << 4) | (2 << 6)); //0,1,2, 3
11467 vst1q_u32(ptr, val0); //store as 128 bit structure
11468}
11469
11470_NEON2SSESTORAGE void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val);// VST1.64 {d0, d1, d2}, [r0]
11471_NEON2SSE_INLINE void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val)
11472{
11473 *(ptr) = val.val[0].m64_u64[0];
11474 *(ptr + 1) = val.val[1].m64_u64[0];
11475 *(ptr + 2) = val.val[2].m64_u64[0];
11476}
11477
11478_NEON2SSE_GLOBAL void vst3_s8(__transfersize(24) int8_t * ptr, int8x8x3_t val); // VST3.8 {d0, d1, d2}, [r0]
11479#define vst3_s8(ptr, val) vst3_u8((uint8_t*)ptr, val)
11480
11481_NEON2SSE_GLOBAL void vst3_s16(__transfersize(12) int16_t * ptr, int16x4x3_t val); // VST3.16 {d0, d1, d2}, [r0]
11482#define vst3_s16(ptr, val) vst3_u16((uint16_t*)ptr, val)
11483
11484_NEON2SSE_GLOBAL void vst3_s32(__transfersize(6) int32_t * ptr, int32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
11485#define vst3_s32(ptr, val) vst3_u32((uint32_t*)ptr, val)
11486
11487_NEON2SSE_GLOBAL void vst3_s64(__transfersize(3) int64_t * ptr, int64x1x3_t val); // VST1.64 {d0, d1, d2}, [r0]
11488#define vst3_s64(ptr, val) vst3_u64((uint64_t*)ptr, val)
11489
11490//void vst3_f16(__transfersize(12) __fp16 * ptr, float16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
11491_NEON2SSE_GLOBAL void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t const * val); // VST3.16 {d0, d1, d2}, [r0]
// IA32 SIMD doesn't work with 16-bit floats currently, so we need to convert to 32-bit floats and then work with two 128-bit registers; see vld1q_f16 for an example
11493
11494_NEON2SSESTORAGE void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val);// VST3.32 {d0, d1, d2}, [r0]
11495_NEON2SSE_INLINE void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val)
11496{
    //val.val[0]:{0,3}, val.val[1]:{1,4}, val.val[2]:{2,5} -> stored sequence 0,1,2,3,4,5
11498 *(ptr) = val.val[0].m64_f32[0];
11499 *(ptr + 1) = val.val[1].m64_f32[0];
11500 *(ptr + 2) = val.val[2].m64_f32[0];
11501 *(ptr + 3) = val.val[0].m64_f32[1];
11502 *(ptr + 4) = val.val[1].m64_f32[1];
11503 *(ptr + 5) = val.val[2].m64_f32[1];
11504}
11505
11506_NEON2SSE_GLOBAL void vst3_p8(__transfersize(24) poly8_t * ptr, poly8x8x3_t val);// VST3.8 {d0, d1, d2}, [r0]
11507#define vst3_p8 vst3_u8
11508
11509_NEON2SSE_GLOBAL void vst3_p16(__transfersize(12) poly16_t * ptr, poly16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
11510#define vst3_p16 vst3_u16
11511
11512//*************** Quadruples store ********************************
11513//*********************************************************************
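//Usage sketch (illustrative only, kept as a comment): interleave four separately loaded 16-byte planes
//(e.g. R,G,B,A) into packed four-channel memory with vst4q_u8. The pointers r_plane, g_plane, b_plane,
//a_plane and dst below are hypothetical caller data, not part of this header.
//    uint8x16x4_t rgba;
//    rgba.val[0] = vld1q_u8(r_plane); //16 R bytes
//    rgba.val[1] = vld1q_u8(g_plane); //16 G bytes
//    rgba.val[2] = vld1q_u8(b_plane); //16 B bytes
//    rgba.val[3] = vld1q_u8(a_plane); //16 A bytes
//    vst4q_u8(dst, rgba); //writes 64 bytes: R0,G0,B0,A0, R1,G1,B1,A1, ...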
11514//void vst4q_u8(__transfersize(64) uint8_t * ptr, uint8x16x4_t val)// VST4.8 {d0, d2, d4, d6}, [r0]
11515_NEON2SSE_INLINE void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t const * val)
11516{
11517 __m128i tmp1, tmp2, res;
11518 tmp1 = _mm_unpacklo_epi8(val->val[0], val->val[1]); // 0,1, 4,5, 8,9, 12,13, 16,17, 20,21, 24,25, 28,29
11519 tmp2 = _mm_unpacklo_epi8(val->val[2], val->val[3]); // 2,3, 6,7, 10,11, 14,15, 18,19, 22,23, 26,27, 30,31
11520 res = _mm_unpacklo_epi16(tmp1, tmp2); //0,1, 2,3, 4,5, 6,7, 8,9, 10,11, 12,13, 14,15
11521 vst1q_u8(ptr, res);
11522 res = _mm_unpackhi_epi16(tmp1, tmp2); //16,17, 18,19, 20,21, 22,23, 24,25, 26,27, 28,29, 30,31
11523 vst1q_u8((ptr + 16), res);
    tmp1 = _mm_unpackhi_epi8(val->val[0], val->val[1]); // 32,33, 36,37, 40,41, 44,45, 48,49, 52,53, 56,57, 60,61
    tmp2 = _mm_unpackhi_epi8(val->val[2], val->val[3]); // 34,35, 38,39, 42,43, 46,47, 50,51, 54,55, 58,59, 62,63
    res = _mm_unpacklo_epi16(tmp1, tmp2); //32,33, 34,35, 36,37, 38,39, 40,41, 42,43, 44,45, 46,47
    vst1q_u8((ptr + 32), res);
    res = _mm_unpackhi_epi16(tmp1, tmp2); //48,49, 50,51, 52,53, 54,55, 56,57, 58,59, 60,61, 62,63
    vst1q_u8((ptr + 48), res);
11530}
11531#define vst4q_u8(ptr, val) vst4q_u8_ptr(ptr, &val)
11532
11533//void vst4q_u16(__transfersize(32) uint16_t * ptr, uint16x8x4_t val)// VST4.16 {d0, d2, d4, d6}, [r0]
11534_NEON2SSE_INLINE void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t const * val)
11535{
11536 uint16x8x4_t v;
11537 __m128i tmp1, tmp2;
11538 tmp1 = _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13
11539 tmp2 = _mm_unpacklo_epi16(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15
11540 v.val[0] = _mm_unpacklo_epi32(tmp1, tmp2);
11541 v.val[1] = _mm_unpackhi_epi32(tmp1, tmp2);
    tmp1 = _mm_unpackhi_epi16(val->val[0], val->val[1]); //16,17, 20,21, 24,25, 28,29
    tmp2 = _mm_unpackhi_epi16(val->val[2], val->val[3]); //18,19, 22,23, 26,27, 30,31
11544 v.val[2] = _mm_unpacklo_epi32(tmp1, tmp2);
11545 v.val[3] = _mm_unpackhi_epi32(tmp1, tmp2);
11546 vst1q_u16(ptr, v.val[0]);
11547 vst1q_u16((ptr + 8), v.val[1]);
11548 vst1q_u16((ptr + 16),v.val[2]);
11549 vst1q_u16((ptr + 24), v.val[3]);
11550}
11551#define vst4q_u16(ptr, val) vst4q_u16_ptr(ptr, &val)
11552
11553//void vst4q_u32(__transfersize(16) uint32_t * ptr, uint32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0]
11554_NEON2SSE_INLINE void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t const * val)
11555{
    uint32x4x4_t v;
11557 __m128i tmp1, tmp2;
    tmp1 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //0,1, 4,5
    tmp2 = _mm_unpacklo_epi32(val->val[2], val->val[3]); //2,3, 6,7
11560 v.val[0] = _mm_unpacklo_epi64(tmp1, tmp2);
11561 v.val[1] = _mm_unpackhi_epi64(tmp1, tmp2);
    tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //8,9, 12,13
    tmp2 = _mm_unpackhi_epi32(val->val[2], val->val[3]); //10,11, 14,15
11564 v.val[2] = _mm_unpacklo_epi64(tmp1, tmp2);
11565 v.val[3] = _mm_unpackhi_epi64(tmp1, tmp2);
11566 vst1q_u32(ptr, v.val[0]);
11567 vst1q_u32((ptr + 4), v.val[1]);
11568 vst1q_u32((ptr + 8), v.val[2]);
11569 vst1q_u32((ptr + 12), v.val[3]);
11570}
11571#define vst4q_u32(ptr, val) vst4q_u32_ptr(ptr, &val)
11572
11573//void vst4q_s8(__transfersize(64) int8_t * ptr, int8x16x4_t val);
11574_NEON2SSE_GLOBAL void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t const * val);
11575#define vst4q_s8(ptr, val) vst4q_u8((uint8_t*)(ptr), val)
11576
11577//void vst4q_s16(__transfersize(32) int16_t * ptr, int16x8x4_t val);
11578_NEON2SSE_GLOBAL void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t const * val);
11579#define vst4q_s16(ptr, val) vst4q_u16((uint16_t*)(ptr), val)
11580
11581//void vst4q_s32(__transfersize(16) int32_t * ptr, int32x4x4_t val);
11582_NEON2SSE_GLOBAL void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t const * val);
11583#define vst4q_s32(ptr, val) vst4q_u32((uint32_t*)(ptr), val)
11584
11585//void vst4q_f16(__transfersize(32) __fp16 * ptr, float16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0]
11586_NEON2SSE_GLOBAL void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t const * val);
// IA32 SIMD doesn't work with 16-bit floats currently
11588
11589//void vst4q_f32(__transfersize(16) float32_t * ptr, float32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0]
11590_NEON2SSE_INLINE void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t const * val)
11591{
11592 __m128 tmp3, tmp2, tmp1, tmp0;
11593 float32x4x4_t v;
11594 tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]);
11595 tmp2 = _mm_unpacklo_ps(val->val[2], val->val[3]);
11596 tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]);
11597 tmp3 = _mm_unpackhi_ps(val->val[2], val->val[3]);
11598 v.val[0] = _mm_movelh_ps(tmp0, tmp2);
11599 v.val[1] = _mm_movehl_ps(tmp2, tmp0);
11600 v.val[2] = _mm_movelh_ps(tmp1, tmp3);
11601 v.val[3] = _mm_movehl_ps(tmp3, tmp1);
11602 vst1q_f32(ptr, v.val[0]);
11603 vst1q_f32((ptr + 4), v.val[1]);
11604 vst1q_f32((ptr + 8), v.val[2]);
11605 vst1q_f32((ptr + 12), v.val[3]);
11606}
11607#define vst4q_f32(ptr, val) vst4q_f32_ptr(ptr, &val)
11608
11609//void vst4q_p8(__transfersize(64) poly8_t * ptr, poly8x16x4_t val);// VST4.8 {d0, d2, d4, d6}, [r0]
11610_NEON2SSE_GLOBAL void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t const * val);
11611#define vst4q_p8 vst4q_u8
11612
11613//void vst4q_p16(__transfersize(32) poly16_t * ptr, poly16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0]
11614_NEON2SSE_GLOBAL void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t const * val);
11615#define vst4q_p16 vst4q_s16
11616
11617_NEON2SSESTORAGE void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val);// VST4.8 {d0, d1, d2, d3}, [r0]
11618_NEON2SSE_INLINE void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val)
11619{
11620 __m128i sh0, sh1, val0, val2;
11621 sh0 = _mm_unpacklo_epi8(_pM128i(val.val[0]),_pM128i(val.val[1])); // a0,b0,a1,b1,a2,b2,a3,b3,a4,b4,a5,b5, a6,b6,a7,b7,
11622 sh1 = _mm_unpacklo_epi8(_pM128i(val.val[2]),_pM128i(val.val[3])); // c0,d0,c1,d1,c2,d2,c3,d3, c4,d4,c5,d5,c6,d6,c7,d7
11623 val0 = _mm_unpacklo_epi16(sh0,sh1); // a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,
11624 val2 = _mm_unpackhi_epi16(sh0,sh1); //a4,b4,c4,d4,a5,b5,c5,d5, a6,b6,c6,d6,a7,b7,c7,d7
11625 vst1q_u8(ptr, val0);
11626 vst1q_u8((ptr + 16), val2);
11627}
11628
11629_NEON2SSESTORAGE void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0]
11630_NEON2SSE_INLINE void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val)
11631{
11632 __m128i sh0, sh1, val0, val2;
11633 sh0 = _mm_unpacklo_epi16(_pM128i(val.val[0]),_pM128i(val.val[1])); //a0,a1,b0,b1,c0,c1,d0,d1,
11634 sh1 = _mm_unpacklo_epi16(_pM128i(val.val[2]),_pM128i(val.val[3])); //a2,a3,b2,b3,c2,c3,d2,d3
11635 val0 = _mm_unpacklo_epi32(sh0,sh1); // a0,a1,a2,a3,b0,b1,b2,b3
11636 val2 = _mm_unpackhi_epi32(sh0,sh1); // c0,c1,c2,c3,d0,d1,d2,d3
11637 vst1q_u16(ptr, val0); //store as 128 bit structure
11638 vst1q_u16((ptr + 8), val2);
11639}
11640
11641_NEON2SSESTORAGE void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val);// VST4.32 {d0, d1, d2, d3}, [r0]
11642_NEON2SSE_INLINE void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val)
11643{
11644 //0,4, 1,5, 2,6, 3,7
11645 __m128i sh0, sh1, val0, val1;
11646 sh0 = _mm_unpacklo_epi32(_pM128i(val.val[0]), _pM128i(val.val[1])); //0,1,4,5
11647 sh1 = _mm_unpacklo_epi32(_pM128i(val.val[2]), _pM128i(val.val[3])); //2,3,6,7
11648 val0 = _mm_unpacklo_epi64(sh0,sh1); //
11649 val1 = _mm_unpackhi_epi64(sh0,sh1); //
11650 vst1q_u32(ptr, val0); //store as 128 bit structure
11651 vst1q_u32((ptr + 4), val1);
11652}
11653
11654_NEON2SSESTORAGE void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val);// VST1.64 {d0, d1, d2, d3}, [r0]
11655_NEON2SSE_INLINE void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val)
11656{
11657 *(ptr) = val.val[0].m64_u64[0];
11658 *(ptr + 1) = val.val[1].m64_u64[0];
11659 *(ptr + 2) = val.val[2].m64_u64[0];
11660 *(ptr + 3) = val.val[3].m64_u64[0];
11661}
11662
11663//void vst4_s8(__transfersize(32) int8_t * ptr, int8x8x4_t val) //VST4.8 {d0, d1, d2, d3}, [r0]
11664#define vst4_s8(ptr, val) vst4_u8((uint8_t*)ptr, val)
11665
11666//void vst4_s16(__transfersize(16) int16_t * ptr, int16x4x4_t val) // VST4.16 {d0, d1, d2, d3}, [r0]
11667#define vst4_s16(ptr, val) vst4_u16((uint16_t*)ptr, val)
11668
11669//void vst4_s32(__transfersize(8) int32_t * ptr, int32x2x4_t val) // VST4.32 {d0, d1, d2, d3}, [r0]
11670#define vst4_s32(ptr, val) vst4_u32((uint32_t*)ptr, val)
11671
11672//void vst4_s64(__transfersize(4) int64_t * ptr, int64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0]
11673_NEON2SSE_GLOBAL void vst4_s64_ptr(__transfersize(4) int64_t * ptr, int64x1x4_t const * val);
11674#define vst4_s64(ptr, val) vst4_u64((uint64_t*)ptr, val)
11675
11676//void vst4_f16(__transfersize(16) __fp16 * ptr, float16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0]
11677_NEON2SSE_GLOBAL void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t const * val);
// IA32 SIMD doesn't work with 16-bit floats currently, so we need to convert to 32-bit floats and then work with two 128-bit registers; see vld1q_f16 for an example
11679
11680_NEON2SSESTORAGE void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val);// VST4.32 {d0, d1, d2, d3}, [r0]
11681_NEON2SSE_INLINE void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val)
11682{
11683 //0,4, 1,5, 2,6, 3,7 -> 0,1, 2,3, 4,5, 6,7
11684 *(ptr) = val.val[0].m64_f32[0];
11685 *(ptr + 1) = val.val[1].m64_f32[0];
11686 *(ptr + 2) = val.val[2].m64_f32[0];
11687 *(ptr + 3) = val.val[3].m64_f32[0];
11688 *(ptr + 4) = val.val[0].m64_f32[1];
11689 *(ptr + 5) = val.val[1].m64_f32[1];
11690 *(ptr + 6) = val.val[2].m64_f32[1];
11691 *(ptr + 7) = val.val[3].m64_f32[1];
11692}
11693
11694_NEON2SSE_GLOBAL void vst4_p8(__transfersize(32) poly8_t * ptr, poly8x8x4_t val);// VST4.8 {d0, d1, d2, d3}, [r0]
11695#define vst4_p8 vst4_u8
11696
11697_NEON2SSE_GLOBAL void vst4_p16(__transfersize(16) poly16_t * ptr, poly16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0]
11698#define vst4_p16 vst4_u16
11699
11700//*********** Store a lane of a vector into memory (extract given lane) for a couple of vectors *********************
11701//********************************************************************************************************************
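//Usage sketch (illustrative only, kept as a comment; src and dst are hypothetical caller pointers):
//store lane 3 of a de-interleaved pair as two adjacent bytes in memory.
//    uint8x8x2_t pair = vld2_u8(src); //de-interleaves 16 bytes into two 8-byte vectors
//    vst2_lane_u8(dst, pair, 3); //dst[0] = lane 3 of pair.val[0], dst[1] = lane 3 of pair.val[1]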
11702//void vst2q_lane_u16(__transfersize(2) uint16_t * ptr, uint16x8x2_t val, __constrange(0,7) int lane)// VST2.16 {d0[0], d2[0]}, [r0]
11703_NEON2SSE_INLINE void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t const * val, __constrange(0,7) int lane)
11704{
11705 vst1q_lane_s16(ptr, val->val[0], lane);
11706 vst1q_lane_s16((ptr + 1), val->val[1], lane);
11707}
11708#define vst2q_lane_u16(ptr, val, lane) vst2q_lane_u16_ptr(ptr, &val, lane)
11709
11710//void vst2q_lane_u32(__transfersize(2) uint32_t * ptr, uint32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0]
11711_NEON2SSE_INLINE void vst2q_lane_u32_ptr(__transfersize(2) uint32_t* ptr, uint32x4x2_t const * val, __constrange(0,3) int lane)
11712{
11713 vst1q_lane_u32(ptr, val->val[0], lane);
11714 vst1q_lane_u32((ptr + 1), val->val[1], lane);
11715}
11716#define vst2q_lane_u32(ptr, val, lane) vst2q_lane_u32_ptr(ptr, &val, lane)
11717
11718//void vst2q_lane_s16(__transfersize(2) int16_t * ptr, int16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
11719_NEON2SSE_GLOBAL void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t const * val, __constrange(0,7) int lane);
11720#define vst2q_lane_s16(ptr, val, lane) vst2q_lane_u16((uint16_t*)ptr, val, lane)
11721
11722//void vst2q_lane_s32(__transfersize(2) int32_t * ptr, int32x4x2_t val, __constrange(0,3) int lane);// VST2.32 {d0[0], d2[0]}, [r0]
11723_NEON2SSE_GLOBAL void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t const * val, __constrange(0,3) int lane);
11724#define vst2q_lane_s32(ptr, val, lane) vst2q_lane_u32((uint32_t*)ptr, val, lane)
11725
11726//void vst2q_lane_f16(__transfersize(2) __fp16 * ptr, float16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
11727_NEON2SSE_GLOBAL void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t const * val, __constrange(0,7) int lane);
11728//current IA SIMD doesn't support float16
11729
11730//void vst2q_lane_f32(__transfersize(2) float32_t * ptr, float32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0]
11731_NEON2SSE_INLINE void vst2q_lane_f32_ptr(__transfersize(2) float32_t* ptr, float32x4x2_t const * val, __constrange(0,3) int lane)
11732{
11733 vst1q_lane_f32(ptr, val->val[0], lane);
11734 vst1q_lane_f32((ptr + 1), val->val[1], lane);
11735}
11736#define vst2q_lane_f32(ptr,src,lane) vst2q_lane_f32_ptr(ptr,&src,lane)
11737
11738//void vst2q_lane_p16(__transfersize(2) poly16_t * ptr, poly16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
11739_NEON2SSE_GLOBAL void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t const * val, __constrange(0,7) int lane);
11740#define vst2q_lane_p16 vst2q_lane_s16
11741
11742_NEON2SSESTORAGE void vst2_lane_u8(__transfersize(2) uint8_t * ptr, uint8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
11743_NEON2SSE_INLINE void vst2_lane_u8(__transfersize(2) uint8_t * ptr, uint8x8x2_t val, __constrange(0,7) int lane) // VST2.8 {d0[0], d1[0]}, [r0]
11744{
11745 *(ptr) = val.val[0].m64_u8[lane];
11746 *(ptr + 1) = val.val[1].m64_u8[lane];
11747}
11748
11749_NEON2SSESTORAGE void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
11750_NEON2SSE_INLINE void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane)
11751{
11752 *(ptr) = val.val[0].m64_u16[lane];
11753 *(ptr + 1) = val.val[1].m64_u16[lane];
11754}
11755
11756_NEON2SSESTORAGE void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0]
11757_NEON2SSE_INLINE void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane)
11758{
11759 *(ptr) = val.val[0].m64_u32[lane];
11760 *(ptr + 1) = val.val[1].m64_u32[lane];
11761}
11762
11763_NEON2SSE_GLOBAL void vst2_lane_s8(__transfersize(2) int8_t * ptr, int8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
11764#define vst2_lane_s8(ptr, val, lane) vst2_lane_u8((uint8_t*)ptr, val, lane)
11765
11766_NEON2SSE_GLOBAL void vst2_lane_s16(__transfersize(2) int16_t * ptr, int16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
11767#define vst2_lane_s16(ptr, val, lane) vst2_lane_u16((uint16_t*)ptr, val, lane)
11768
11769_NEON2SSE_GLOBAL void vst2_lane_s32(__transfersize(2) int32_t * ptr, int32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0]
11770#define vst2_lane_s32(ptr, val, lane) vst2_lane_u32((uint32_t*)ptr, val, lane)
11771
11772//void vst2_lane_f16(__transfersize(2) __fp16 * ptr, float16x4x2_t val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0]
11773//current IA SIMD doesn't support float16
11774
11775_NEON2SSESTORAGE void vst2_lane_f32(__transfersize(2) float32_t * ptr, float32x2x2_t val, __constrange(0,1) int lane); // VST2.32 {d0[0], d1[0]}, [r0]
11776_NEON2SSE_INLINE void vst2_lane_f32(__transfersize(2) float32_t * ptr, float32x2x2_t val, __constrange(0,1) int lane)
11777{
11778 *(ptr) = val.val[0].m64_f32[lane];
11779 *(ptr + 1) = val.val[1].m64_f32[lane];
11780}
11781
11782_NEON2SSE_GLOBAL void vst2_lane_p8(__transfersize(2) poly8_t * ptr, poly8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
11783#define vst2_lane_p8 vst2_lane_u8
11784
11785_NEON2SSE_GLOBAL void vst2_lane_p16(__transfersize(2) poly16_t * ptr, poly16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
11786#define vst2_lane_p16 vst2_lane_u16
11787
11788//************************* Triple lanes stores *******************************************************
11789//*******************************************************************************************************
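//A short usage sketch (illustrative only, kept as a comment; src and dst are hypothetical caller pointers):
//    uint16x4x3_t rgb = vld3_u16(src); //de-interleaves 12 uint16_t values into three 4-element vectors
//    vst3_lane_u16(dst, rgb, 2); //writes rgb.val[0][2], rgb.val[1][2], rgb.val[2][2] to dst[0..2]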
11790//void vst3q_lane_u16(__transfersize(3) uint16_t * ptr, uint16x8x3_t val, __constrange(0,7) int lane)// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
11791_NEON2SSE_INLINE void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t const * val, __constrange(0,7) int lane)
11792{
11793 vst2q_lane_u16_ptr(ptr, (uint16x8x2_t*)val, lane);
11794 vst1q_lane_u16((ptr + 2), val->val[2], lane);
11795}
11796#define vst3q_lane_u16(ptr, val, lane) vst3q_lane_u16_ptr(ptr, &val, lane)
11797
11798//void vst3q_lane_u32(__transfersize(3) uint32_t * ptr, uint32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
11799_NEON2SSE_INLINE void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t const * val, __constrange(0,3) int lane)
11800{
11801 vst2q_lane_u32_ptr(ptr, (uint32x4x2_t*)val, lane);
11802 vst1q_lane_u32((ptr + 2), val->val[2], lane);
11803}
11804#define vst3q_lane_u32(ptr, val, lane) vst3q_lane_u32_ptr(ptr, &val, lane)
11805
11806//void vst3q_lane_s16(__transfersize(3) int16_t * ptr, int16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
11807_NEON2SSE_GLOBAL void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t const * val, __constrange(0,7) int lane);
11808#define vst3q_lane_s16(ptr, val, lane) vst3q_lane_u16((uint16_t *)ptr, val, lane)
11809
11810//void vst3q_lane_s32(__transfersize(3) int32_t * ptr, int32x4x3_t val, __constrange(0,3) int lane);// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
11811_NEON2SSE_GLOBAL void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t const * val, __constrange(0,3) int lane);
11812#define vst3q_lane_s32(ptr, val, lane) vst3q_lane_u32((uint32_t *)ptr, val, lane)
11813
11814//void vst3q_lane_f16(__transfersize(3) __fp16 * ptr, float16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
11815_NEON2SSE_GLOBAL void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t const * val, __constrange(0,7) int lane);
11816//current IA SIMD doesn't support float16
11817
11818//void vst3q_lane_f32(__transfersize(3) float32_t * ptr, float32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
11819_NEON2SSE_INLINE void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t const * val, __constrange(0,3) int lane)
11820{
11821 vst1q_lane_f32(ptr, val->val[0], lane);
11822 vst1q_lane_f32((ptr + 1), val->val[1], lane);
11823 vst1q_lane_f32((ptr + 2), val->val[2], lane);
11824}
11825#define vst3q_lane_f32(ptr,val,lane) vst3q_lane_f32_ptr(ptr,&val,lane)
11826
11827//void vst3q_lane_p16(__transfersize(3) poly16_t * ptr, poly16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
11828_NEON2SSE_GLOBAL void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t const * val, __constrange(0,7) int lane);
11829#define vst3q_lane_p16 vst3q_lane_s16
11830
11831_NEON2SSESTORAGE void vst3_lane_u8(__transfersize(3) uint8_t * ptr, uint8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
11832_NEON2SSE_INLINE void vst3_lane_u8(__transfersize(3) uint8_t * ptr, uint8x8x3_t val, __constrange(0,7) int lane)
11833{
11834 *(ptr) = val.val[0].m64_u8[lane];
11835 *(ptr + 1) = val.val[1].m64_u8[lane];
11836 *(ptr + 2) = val.val[2].m64_u8[lane];
11837}
11838
11839_NEON2SSESTORAGE void vst3_lane_u16(__transfersize(3) uint16_t * ptr, uint16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
11840_NEON2SSE_INLINE void vst3_lane_u16(__transfersize(3) uint16_t * ptr, uint16x4x3_t val, __constrange(0,3) int lane)
11841{
11842 *(ptr) = val.val[0].m64_u16[lane];
11843 *(ptr + 1) = val.val[1].m64_u16[lane];
11844 *(ptr + 2) = val.val[2].m64_u16[lane];
11845}
11846
11847_NEON2SSESTORAGE void vst3_lane_u32(__transfersize(3) uint32_t * ptr, uint32x2x3_t val, __constrange(0,1) int lane);// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
11848_NEON2SSE_INLINE void vst3_lane_u32(__transfersize(3) uint32_t * ptr, uint32x2x3_t val, __constrange(0,1) int lane)
11849{
11850 *(ptr) = val.val[0].m64_u32[lane];
11851 *(ptr + 1) = val.val[1].m64_u32[lane];
11852 *(ptr + 2) = val.val[2].m64_u32[lane];
11853}
11854
11855_NEON2SSE_GLOBAL void vst3_lane_s8(__transfersize(3) int8_t * ptr, int8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
11856#define vst3_lane_s8(ptr, val, lane) vst3_lane_u8((uint8_t *)ptr, val, lane)
11857
11858_NEON2SSE_GLOBAL void vst3_lane_s16(__transfersize(3) int16_t * ptr, int16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
11859#define vst3_lane_s16(ptr, val, lane) vst3_lane_u16((uint16_t *)ptr, val, lane)
11860
11861_NEON2SSE_GLOBAL void vst3_lane_s32(__transfersize(3) int32_t * ptr, int32x2x3_t val, __constrange(0,1) int lane);// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
11862#define vst3_lane_s32(ptr, val, lane) vst3_lane_u32((uint32_t *)ptr, val, lane)
11863
11864//void vst3_lane_f16(__transfersize(3) __fp16 * ptr, float16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
11865_NEON2SSE_GLOBAL void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t const * val, __constrange(0,3) int lane);
11866//current IA SIMD doesn't support float16
11867
11868_NEON2SSESTORAGE void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane);// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
11869_NEON2SSE_INLINE void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane)
11870{
11871 *(ptr) = val.val[0].m64_f32[lane];
11872 *(ptr + 1) = val.val[1].m64_f32[lane];
11873 *(ptr + 2) = val.val[2].m64_f32[lane];
11874}
11875
11876_NEON2SSE_GLOBAL void vst3_lane_p8(__transfersize(3) poly8_t * ptr, poly8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
11877#define vst3_lane_p8 vst3_lane_u8
11878
11879_NEON2SSE_GLOBAL void vst3_lane_p16(__transfersize(3) poly16_t * ptr, poly16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
11880#define vst3_lane_p16 vst3_lane_u16
11881
11882//******************************** Quadruple lanes stores ***********************************************
11883//*******************************************************************************************************
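//A short usage sketch (illustrative only, kept as a comment; src and dst are hypothetical caller pointers):
//    uint8x8x4_t quad = vld4_u8(src); //de-interleaves 32 bytes into four 8-byte vectors
//    vst4_lane_u8(dst, quad, 0); //writes quad.val[0][0], quad.val[1][0], quad.val[2][0], quad.val[3][0]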
11884//void vst4q_lane_u16(__transfersize(4) uint16_t * ptr, uint16x8x4_t val, __constrange(0,7) int lane)// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11885_NEON2SSE_INLINE void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t const * val4, __constrange(0,7) int lane)
11886{
11887 vst2q_lane_u16_ptr(ptr, (uint16x8x2_t*)val4->val, lane);
11888 vst2q_lane_u16_ptr((ptr + 2),((uint16x8x2_t*)val4->val + 1), lane);
11889}
11890#define vst4q_lane_u16(ptr, val, lane) vst4q_lane_u16_ptr(ptr, &val, lane)
11891
11892//void vst4q_lane_u32(__transfersize(4) uint32_t * ptr, uint32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11893_NEON2SSE_INLINE void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t const * val4, __constrange(0,3) int lane)
11894{
11895 vst2q_lane_u32_ptr(ptr, (uint32x4x2_t*)val4->val, lane);
11896 vst2q_lane_u32_ptr((ptr + 2), ((uint32x4x2_t*)val4->val + 1), lane);
11897}
11898#define vst4q_lane_u32(ptr, val, lane) vst4q_lane_u32_ptr(ptr, &val, lane)
11899
11900//void vst4q_lane_s16(__transfersize(4) int16_t * ptr, int16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11901_NEON2SSE_GLOBAL void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t const * val, __constrange(0,7) int lane);
11902#define vst4q_lane_s16(ptr,val,lane) vst4q_lane_u16((uint16_t *)ptr,val,lane)
11903
11904//void vst4q_lane_s32(__transfersize(4) int32_t * ptr, int32x4x4_t val, __constrange(0,3) int lane);// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11905_NEON2SSE_GLOBAL void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t const * val, __constrange(0,3) int lane);
11906#define vst4q_lane_s32(ptr,val,lane) vst4q_lane_u32((uint32_t *)ptr,val,lane)
11907
11908//void vst4q_lane_f16(__transfersize(4) __fp16 * ptr, float16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11909_NEON2SSE_GLOBAL void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t const * val, __constrange(0,7) int lane);
11910//current IA SIMD doesn't support float16
11911
11912//void vst4q_lane_f32(__transfersize(4) float32_t * ptr, float32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11913_NEON2SSE_INLINE void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t const * val, __constrange(0,3) int lane)
11914{
11915 vst1q_lane_f32(ptr, val->val[0], lane);
11916 vst1q_lane_f32((ptr + 1), val->val[1], lane);
11917 vst1q_lane_f32((ptr + 2), val->val[2], lane);
11918 vst1q_lane_f32((ptr + 3), val->val[3], lane);
11919}
11920#define vst4q_lane_f32(ptr,val,lane) vst4q_lane_f32_ptr(ptr,&val,lane)
11921
11922//void vst4q_lane_p16(__transfersize(4) poly16_t * ptr, poly16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11923_NEON2SSE_GLOBAL void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t const * val, __constrange(0,7) int lane);
11924#define vst4q_lane_p16 vst4q_lane_u16
11925
11926_NEON2SSESTORAGE void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane);// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11927_NEON2SSE_INLINE void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane)
11928{
11929 *(ptr) = val.val[0].m64_u8[lane];
11930 *(ptr + 1) = val.val[1].m64_u8[lane];
11931 *(ptr + 2) = val.val[2].m64_u8[lane];
11932 *(ptr + 3) = val.val[3].m64_u8[lane];
11933}
11934
11935_NEON2SSESTORAGE void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11936_NEON2SSE_INLINE void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane)
11937{
11938 *(ptr) = val.val[0].m64_u16[lane];
11939 *(ptr + 1) = val.val[1].m64_u16[lane];
11940 *(ptr + 2) = val.val[2].m64_u16[lane];
11941 *(ptr + 3) = val.val[3].m64_u16[lane];
11942}
11943
11944_NEON2SSESTORAGE void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane);// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11945_NEON2SSE_INLINE void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane)
11946{
11947 *(ptr) = val.val[0].m64_u32[lane];
11948 *(ptr + 1) = val.val[1].m64_u32[lane];
11949 *(ptr + 2) = val.val[2].m64_u32[lane];
11950 *(ptr + 3) = val.val[3].m64_u32[lane];
11951}
11952
11953_NEON2SSE_GLOBAL void vst4_lane_s8(__transfersize(4) int8_t * ptr, int8x8x4_t val, __constrange(0,7) int lane);// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11954#define vst4_lane_s8(ptr, val, lane) vst4_lane_u8((uint8_t*)ptr, val, lane)
11955
11956_NEON2SSE_GLOBAL void vst4_lane_s16(__transfersize(4) int16_t * ptr, int16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11957#define vst4_lane_s16(ptr, val, lane) vst4_lane_u16((uint16_t*)ptr, val, lane)
11958
11959_NEON2SSE_GLOBAL void vst4_lane_s32(__transfersize(4) int32_t * ptr, int32x2x4_t val, __constrange(0,1) int lane);// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11960#define vst4_lane_s32(ptr, val, lane) vst4_lane_u32((uint32_t*)ptr, val, lane)
11961
11962//void vst4_lane_f16(__transfersize(4) __fp16 * ptr, float16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11963_NEON2SSE_GLOBAL void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t const * val, __constrange(0,3) int lane);
11964//current IA SIMD doesn't support float16
11965
11966_NEON2SSESTORAGE void vst4_lane_f32(__transfersize(4) float32_t * ptr, float32x2x4_t val, __constrange(0,1) int lane); // VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11967_NEON2SSE_INLINE void vst4_lane_f32(__transfersize(4) float32_t * ptr, float32x2x4_t val, __constrange(0,1) int lane)
11968{
11969 *(ptr) = val.val[0].m64_f32[lane];
11970 *(ptr + 1) = val.val[1].m64_f32[lane];
11971 *(ptr + 2) = val.val[2].m64_f32[lane];
11972 *(ptr + 3) = val.val[3].m64_f32[lane];
11973}
11974
11975_NEON2SSE_GLOBAL void vst4_lane_p8(__transfersize(4) poly8_t * ptr, poly8x8x4_t val, __constrange(0,7) int lane);// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11976#define vst4_lane_p8 vst4_lane_u8
11977
11978_NEON2SSE_GLOBAL void vst4_lane_p16(__transfersize(4) poly16_t * ptr, poly16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11979#define vst4_lane_p16 vst4_lane_u16
11980
11981//**************************************************************************************************
11982//************************ Extract lanes from a vector ********************************************
11983//**************************************************************************************************
11984//These intrinsics extract a single lane (element) from a vector.
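//A short usage sketch (illustrative only, kept as a comment):
//    int16x8_t v = vdupq_n_s16(7); //all eight lanes hold 7
//    int16_t x = vgetq_lane_s16(v, 5); //x == 7; the lane index must be a compile-time constant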
11985_NEON2SSE_GLOBAL uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
11986#define vget_lane_u8(vec, lane) vec.m64_u8[lane]
11987
11988_NEON2SSE_GLOBAL uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.s16 r0, d0[0]
11989#define vget_lane_u16(vec, lane) vec.m64_u16[lane]
11990
11991
11992_NEON2SSE_GLOBAL uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
11993#define vget_lane_u32(vec, lane) vec.m64_u32[lane]
11994
11995_NEON2SSE_GLOBAL int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0]
11996#define vget_lane_s8(vec, lane) vec.m64_i8[lane]
11997
11998_NEON2SSE_GLOBAL int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0]
11999#define vget_lane_s16(vec, lane) vec.m64_i16[lane]
12000
12001_NEON2SSE_GLOBAL int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
12002#define vget_lane_s32(vec, lane) vec.m64_i32[lane]
12003
12004_NEON2SSE_GLOBAL poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
12005#define vget_lane_p8 vget_lane_u8
12006
12007_NEON2SSE_GLOBAL poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.s16 r0, d0[0]
12008#define vget_lane_p16 vget_lane_u16
12009
12010_NEON2SSE_GLOBAL float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
12011#define vget_lane_f32(vec, lane) vec.m64_f32[lane]
12012
12013_NEON2SSE_GLOBAL uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
12014#define vgetq_lane_u8 (uint8_t) _MM_EXTRACT_EPI8
12015
12016_NEON2SSE_GLOBAL uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0]
12017#define vgetq_lane_u16 (uint16_t) _MM_EXTRACT_EPI16
12018
12019_NEON2SSE_GLOBAL uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
12020#define vgetq_lane_u32 (uint32_t) _MM_EXTRACT_EPI32
12021
12022_NEON2SSE_GLOBAL int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0]
12023#define vgetq_lane_s8 _MM_EXTRACT_EPI8
12024
12025_NEON2SSE_GLOBAL int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0]
12026#define vgetq_lane_s16 _MM_EXTRACT_EPI16
12027
12028_NEON2SSE_GLOBAL int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
12029#define vgetq_lane_s32 _MM_EXTRACT_EPI32
12030
12031_NEON2SSE_GLOBAL poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
12032#define vgetq_lane_p8 vgetq_lane_u8
12033
12034_NEON2SSE_GLOBAL poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0]
12035#define vgetq_lane_p16 vgetq_lane_u16
12036
12037_NEON2SSESTORAGE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
12038_NEON2SSE_INLINE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane)
12039{
12040 float32_t floatVal;
12041 char * const floatVal_c = (char*)&floatVal;
12042 *((int32_t*)floatVal_c) = _MM_EXTRACT_PS(vec,lane);
12043 return floatVal;
12044}
12045
12046_NEON2SSE_GLOBAL int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
12047#define vget_lane_s64(vec, lane) vec.m64_i64[0]
12048
12049_NEON2SSE_GLOBAL uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
12050#define vget_lane_u64(vec, lane) vec.m64_u64[0]
12051
12052
12053_NEON2SSE_GLOBAL int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
12054#define vgetq_lane_s64 _MM_EXTRACT_EPI64
12055
12056_NEON2SSE_GLOBAL uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
12057#define vgetq_lane_u64 (uint64_t) _MM_EXTRACT_EPI64
12058
12059// ***************** Set lanes within a vector ********************************************
12060// **************************************************************************************
12061//These intrinsics set a single lane (element) within a vector.
//these do the same as the corresponding vld1_lane_xx functions, but take the value to be set directly.
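//A short usage sketch (illustrative only, kept as a comment):
//    float32x2_t v = vdup_n_f32(0.0f); //{0.0f, 0.0f}
//    v = vset_lane_f32(3.5f, v, 1); //now {0.0f, 3.5f}; the lane index must be a compile-time constant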
12063
12064_NEON2SSESTORAGE uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
12065_NEON2SSE_INLINE uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane)
12066{
12067 uint8_t val;
12068 val = value;
12069 return vld1_lane_u8(&val, vec, lane);
12070}
12071
12072_NEON2SSESTORAGE uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
12073_NEON2SSE_INLINE uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane)
12074{
12075 uint16_t val;
12076 val = value;
12077 return vld1_lane_u16(&val, vec, lane);
12078}
12079
12080_NEON2SSESTORAGE uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
12081_NEON2SSE_INLINE uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane)
12082{
12083 uint32_t val;
12084 val = value;
12085 return vld1_lane_u32(&val, vec, lane);
12086}
12087
12088_NEON2SSESTORAGE int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
12089_NEON2SSE_INLINE int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane)
12090{
12091 int8_t val;
12092 val = value;
12093 return vld1_lane_s8(&val, vec, lane);
12094}
12095
12096_NEON2SSESTORAGE int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
12097_NEON2SSE_INLINE int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane)
12098{
12099 int16_t val;
12100 val = value;
12101 return vld1_lane_s16(&val, vec, lane);
12102}
12103
12104_NEON2SSESTORAGE int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
12105_NEON2SSE_INLINE int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane)
12106{
12107 int32_t val;
12108 val = value;
12109 return vld1_lane_s32(&val, vec, lane);
12110}
12111
12112_NEON2SSE_GLOBAL poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
12113#define vset_lane_p8 vset_lane_u8
12114
12115_NEON2SSE_GLOBAL poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
12116#define vset_lane_p16 vset_lane_u16
12117
12118_NEON2SSESTORAGE float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
12119_NEON2SSE_INLINE float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane)
12120{
12121 float32_t val;
12122 val = value;
12123 return vld1_lane_f32(&val, vec, lane);
12124}
12125
12126_NEON2SSESTORAGE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
12127_NEON2SSE_INLINE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane)
12128{
12129 uint8_t val;
12130 val = value;
12131 return vld1q_lane_u8(&val, vec, lane);
12132}
12133
12134_NEON2SSESTORAGE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
12135_NEON2SSE_INLINE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane)
12136{
12137 uint16_t val;
12138 val = value;
12139 return vld1q_lane_u16(&val, vec, lane);
12140}
12141
12142_NEON2SSESTORAGE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
12143_NEON2SSE_INLINE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane)
12144{
12145 uint32_t val;
12146 val = value;
12147 return vld1q_lane_u32(&val, vec, lane);
12148}
12149
12150_NEON2SSESTORAGE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
12151_NEON2SSE_INLINE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane)
12152{
12153 int8_t val;
12154 val = value;
12155 return vld1q_lane_s8(&val, vec, lane);
12156}
12157
12158_NEON2SSESTORAGE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
12159_NEON2SSE_INLINE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane)
12160{
12161 int16_t val;
12162 val = value;
12163 return vld1q_lane_s16(&val, vec, lane);
12164}
12165
12166_NEON2SSESTORAGE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
12167_NEON2SSE_INLINE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane)
12168{
12169 int32_t val;
12170 val = value;
12171 return vld1q_lane_s32(&val, vec, lane);
12172}
12173
12174_NEON2SSE_GLOBAL poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
12175#define vsetq_lane_p8 vsetq_lane_u8
12176
12177_NEON2SSE_GLOBAL poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
12178#define vsetq_lane_p16 vsetq_lane_u16
12179
12180_NEON2SSESTORAGE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
12181_NEON2SSE_INLINE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane)
12182{
12183 float32_t val;
12184 val = value;
12185 return vld1q_lane_f32(&val, vec, lane);
12186}
12187
12188_NEON2SSESTORAGE int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
12189_NEON2SSE_INLINE int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane)
12190{
12191 int64_t val;
12192 val = value;
12193 return vld1_lane_s64(&val, vec, lane);
12194}
12195
12196_NEON2SSESTORAGE uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
12197_NEON2SSE_INLINE uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane)
12198{
12199 uint64_t val;
12200 val = value;
12201 return vld1_lane_u64(&val, vec, lane);
12202}
12203
12204_NEON2SSESTORAGE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
12205_NEON2SSE_INLINE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane)
12206{
    int64_t val;
12208 val = value;
12209 return vld1q_lane_s64(&val, vec, lane);
12210}
12211
12212_NEON2SSE_GLOBAL uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
12213#define vsetq_lane_u64 vsetq_lane_s64
12214
12215// *******************************************************************************
12216// **************** Initialize a vector from bit pattern ***************************
12217// *******************************************************************************
12218//These intrinsics create a vector from a literal bit pattern.
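//Usage sketch (illustrative only, kept as a comment): the least significant byte of the 64-bit pattern
//becomes lane 0, as on little-endian ARM.
//    uint8x8_t v = vcreate_u8(0x0807060504030201ULL); //lane 0 == 0x01, ..., lane 7 == 0x08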
12219_NEON2SSESTORAGE int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0
12220_NEON2SSE_INLINE int8x8_t vcreate_s8(uint64_t a)
12221{
    return (*(__m64_128*)&(a)); //a macro can't be used here because 'a' may be an immediate constant, whose address can't be taken
12223}
12224
12225_NEON2SSE_GLOBAL int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0
12226#define vcreate_s16 vcreate_s8
12227
12228_NEON2SSE_GLOBAL int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0
12229#define vcreate_s32 vcreate_s8
12230
12231_NEON2SSE_GLOBAL float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0
//no IA32 SIMD available
12233
12234_NEON2SSESTORAGE float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0
12235_NEON2SSE_INLINE float32x2_t vcreate_f32(uint64_t a)
12236{
    return (*(__m64_128*)&(a)); //a macro can't be used here because 'a' may be an immediate constant, whose address can't be taken
12238}
12239
12240_NEON2SSE_GLOBAL uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0
12241#define vcreate_u8 vcreate_s8
12242
12243_NEON2SSE_GLOBAL uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0
12244#define vcreate_u16 vcreate_s16
12245
12246_NEON2SSE_GLOBAL uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0
12247#define vcreate_u32 vcreate_s32
12248
12249_NEON2SSE_GLOBAL uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0
12250#define vcreate_u64 vcreate_s8
12251
12252
12253_NEON2SSE_GLOBAL poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0
12254#define vcreate_p8 vcreate_u8
12255
12256_NEON2SSE_GLOBAL poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0
12257#define vcreate_p16 vcreate_u16
12258
12259_NEON2SSE_GLOBAL int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0
12260#define vcreate_s64 vcreate_u64
12261
12262//********************* Set all lanes to same value ********************************
12263//*********************************************************************************
12264//These intrinsics set all lanes to the same value.
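//A short usage sketch (illustrative only, kept as a comment):
//    int32x4_t ones = vdupq_n_s32(1); //{1,1,1,1}, maps directly to _mm_set1_epi32
//    uint16x4_t sevens = vdup_n_u16(7); //{7,7,7,7} in an emulated 64-bit register (implemented serially)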
12265_NEON2SSESTORAGE uint8x8_t vdup_n_u8(uint8_t value); // VDUP.8 d0,r0
12266_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vdup_n_u8(uint8_t value), _NEON2SSE_REASON_SLOW_SERIAL)
12267{
12268 uint8x8_t res;
12269 int i;
12270 for (i = 0; i<8; i++) {
12271 res.m64_u8[i] = value;
12272 }
12273 return res;
12274}
12275
12276_NEON2SSESTORAGE uint16x4_t vdup_n_u16(uint16_t value); // VDUP.16 d0,r0
12277_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vdup_n_u16(uint16_t value), _NEON2SSE_REASON_SLOW_SERIAL)
12278{
12279 uint16x4_t res;
12280 int i;
12281 for (i = 0; i<4; i++) {
12282 res.m64_u16[i] = value;
12283 }
12284 return res;
12285}
12286
12287_NEON2SSESTORAGE uint32x2_t vdup_n_u32(uint32_t value); // VDUP.32 d0,r0
12288_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vdup_n_u32(uint32_t value), _NEON2SSE_REASON_SLOW_SERIAL)
12289{
12290 uint32x2_t res;
12291 res.m64_u32[0] = value;
12292 res.m64_u32[1] = value;
12293 return res;
12294}
12295
12296_NEON2SSESTORAGE int8x8_t vdup_n_s8(int8_t value); // VDUP.8 d0,r0
12297_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vdup_n_s8(int8_t value), _NEON2SSE_REASON_SLOW_SERIAL)
12298{
12299 int8x8_t res;
12300 int i;
12301 for (i = 0; i<8; i++) {
12302 res.m64_i8[i] = value;
12303 }
12304 return res;
12305}
12306
12307_NEON2SSESTORAGE int16x4_t vdup_n_s16(int16_t value); // VDUP.16 d0,r0
12308_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vdup_n_s16(int16_t value), _NEON2SSE_REASON_SLOW_SERIAL)
12309{
12310 int16x4_t res;
12311 int i;
12312 for (i = 0; i<4; i++) {
12313 res.m64_i16[i] = value;
12314 }
12315 return res;
12316}
12317
12318_NEON2SSESTORAGE int32x2_t vdup_n_s32(int32_t value); // VDUP.32 d0,r0
12319_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vdup_n_s32(int32_t value), _NEON2SSE_REASON_SLOW_SERIAL)
12320{
12321 int32x2_t res;
12322 res.m64_i32[0] = value;
12323 res.m64_i32[1] = value;
12324 return res;
12325}
12326
12327_NEON2SSE_GLOBAL poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0
12328#define vdup_n_p8 vdup_n_u8
12329
12330_NEON2SSE_GLOBAL poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0
12331#define vdup_n_p16 vdup_n_s16
12332
12333_NEON2SSESTORAGE float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0
12334_NEON2SSE_INLINE float32x2_t vdup_n_f32(float32_t value)
12335{
12336 float32x2_t res;
12337 res.m64_f32[0] = value;
12338 res.m64_f32[1] = value;
12339 return res;
12340}
12341
12342_NEON2SSE_GLOBAL uint8x16_t vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0
12343#define vdupq_n_u8(value) _mm_set1_epi8((int8_t) (value))
12344
12345_NEON2SSE_GLOBAL uint16x8_t vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0
12346#define vdupq_n_u16(value) _mm_set1_epi16((int16_t) (value))
12347
12348_NEON2SSE_GLOBAL uint32x4_t vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0
12349#define vdupq_n_u32(value) _mm_set1_epi32((int32_t) (value))
12350
12351_NEON2SSE_GLOBAL int8x16_t vdupq_n_s8(int8_t value); // VDUP.8 q0,r0
12352#define vdupq_n_s8 _mm_set1_epi8
12353
12354_NEON2SSE_GLOBAL int16x8_t vdupq_n_s16(int16_t value); // VDUP.16 q0,r0
12355#define vdupq_n_s16 _mm_set1_epi16
12356
12357_NEON2SSE_GLOBAL int32x4_t vdupq_n_s32(int32_t value); // VDUP.32 q0,r0
12358#define vdupq_n_s32 _mm_set1_epi32
12359
12360_NEON2SSE_GLOBAL poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0
12361#define vdupq_n_p8 vdupq_n_u8
12362
12363_NEON2SSE_GLOBAL poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0
12364#define vdupq_n_p16 vdupq_n_u16
12365
12366_NEON2SSE_GLOBAL float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0
12367#define vdupq_n_f32 _mm_set1_ps
12368
12369_NEON2SSESTORAGE int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0
12370_NEON2SSE_INLINE int64x1_t vdup_n_s64(int64_t value)
12371{
12372 int64x1_t res;
12373 res.m64_i64[0] = value;
12374 return res;
12375}
12376
12377_NEON2SSESTORAGE uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0
12378_NEON2SSE_INLINE uint64x1_t vdup_n_u64(uint64_t value)
12379{
12380 uint64x1_t res;
12381 res.m64_u64[0] = value;
12382 return res;
12383}
12384
12385_NEON2SSESTORAGE int64x2_t vdupq_n_s64(int64_t value); // VMOV d0,r0,r0
12386_NEON2SSE_INLINE int64x2_t vdupq_n_s64(int64_t value)
12387{
12388 _NEON2SSE_ALIGN_16 int64_t value2[2];
12389
12390 value2[0] = value;
12391 value2[1] = value;
12392
12393 return LOAD_SI128(value2);
12394}
12395
12396_NEON2SSESTORAGE uint64x2_t vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0
12397_NEON2SSE_INLINE uint64x2_t vdupq_n_u64(uint64_t value)
12398{
12399 _NEON2SSE_ALIGN_16 uint64_t val[2];
12400
12401 val[0] = value;
12402 val[1] = value;
12403
12404 return LOAD_SI128(val);
12405}
12406
12407//**** Set all lanes to same value ************************
//Same functions as above - just aliases.********************
//They probably reflect the fact that the 128-bit versions use the VMOV instruction **********
12410_NEON2SSE_GLOBAL uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0
12411#define vmov_n_u8 vdup_n_s8
12412
12413_NEON2SSE_GLOBAL uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0
12414#define vmov_n_u16 vdup_n_s16
12415
12416_NEON2SSE_GLOBAL uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0
12417#define vmov_n_u32 vdup_n_u32
12418
12419_NEON2SSE_GLOBAL int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0
12420#define vmov_n_s8 vdup_n_s8
12421
12422_NEON2SSE_GLOBAL int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0
12423#define vmov_n_s16 vdup_n_s16
12424
12425_NEON2SSE_GLOBAL int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0
12426#define vmov_n_s32 vdup_n_s32
12427
12428_NEON2SSE_GLOBAL poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0
12429#define vmov_n_p8 vdup_n_u8
12430
12431_NEON2SSE_GLOBAL poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0
12432#define vmov_n_p16 vdup_n_s16
12433
12434_NEON2SSE_GLOBAL float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0
12435#define vmov_n_f32 vdup_n_f32
12436
12437_NEON2SSE_GLOBAL uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0
12438#define vmovq_n_u8 vdupq_n_u8
12439
12440_NEON2SSE_GLOBAL uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0
12441#define vmovq_n_u16 vdupq_n_s16
12442
12443_NEON2SSE_GLOBAL uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0
12444#define vmovq_n_u32 vdupq_n_u32
12445
12446_NEON2SSE_GLOBAL int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0
12447#define vmovq_n_s8 vdupq_n_s8
12448
12449_NEON2SSE_GLOBAL int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0
12450#define vmovq_n_s16 vdupq_n_s16
12451
12452_NEON2SSE_GLOBAL int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0
12453#define vmovq_n_s32 vdupq_n_s32
12454
12455_NEON2SSE_GLOBAL poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0
12456#define vmovq_n_p8 vdupq_n_u8
12457
12458_NEON2SSE_GLOBAL poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0
12459#define vmovq_n_p16 vdupq_n_s16
12460
12461_NEON2SSE_GLOBAL float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0
12462#define vmovq_n_f32 vdupq_n_f32
12463
12464_NEON2SSE_GLOBAL int64x1_t vmov_n_s64(int64_t value); // VMOV d0,r0,r0
12465#define vmov_n_s64 vdup_n_s64
12466
12467_NEON2SSE_GLOBAL uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0
12468#define vmov_n_u64 vdup_n_u64
12469
12470_NEON2SSE_GLOBAL int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0
12471#define vmovq_n_s64 vdupq_n_s64
12472
12473_NEON2SSE_GLOBAL uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0
12474#define vmovq_n_u64 vdupq_n_u64
12475
12476//**************Set all lanes to the value of one lane of a vector *************
12477//****************************************************************************
//here a shuffle is a better solution than lane extraction followed by a set1 call
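//A short usage sketch (illustrative only, kept as a comment):
//    float32x2_t ab = vset_lane_f32(2.0f, vdup_n_f32(1.0f), 1); //{1.0f, 2.0f}
//    float32x4_t bbbb = vdupq_lane_f32(ab, 1); //{2.0f, 2.0f, 2.0f, 2.0f}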
12479_NEON2SSESTORAGE uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
12480_NEON2SSE_INLINE uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane)
12481{
12482 uint8x8_t res;
12483 uint8_t valane;
12484 int i = 0;
12485 valane = vec.m64_u8[lane];
12486 for (i = 0; i<8; i++) {
12487 res.m64_u8[i] = valane;
12488 }
12489 return res;
12490}
12491
12492_NEON2SSESTORAGE uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
12493_NEON2SSE_INLINE uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane)
12494{
12495 uint16x4_t res;
12496 uint16_t valane;
12497 valane = vec.m64_u16[lane];
12498 res.m64_u16[0] = valane;
12499 res.m64_u16[1] = valane;
12500 res.m64_u16[2] = valane;
12501 res.m64_u16[3] = valane;
12502 return res;
12503}
12504
12505_NEON2SSESTORAGE uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
12506_NEON2SSE_INLINE uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane)
12507{
12508 uint32x2_t res;
12509 res.m64_u32[0] = vec.m64_u32[lane];
12510 res.m64_u32[1] = res.m64_u32[0];
12511 return res;
12512}
12513
12514_NEON2SSE_GLOBAL int8x8_t vdup_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
12515#define vdup_lane_s8 vdup_lane_u8
12516
12517_NEON2SSE_GLOBAL int16x4_t vdup_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
12518#define vdup_lane_s16 vdup_lane_u16
12519
12520_NEON2SSE_GLOBAL int32x2_t vdup_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
12521#define vdup_lane_s32 vdup_lane_u32
12522
12523_NEON2SSE_GLOBAL poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
12524#define vdup_lane_p8 vdup_lane_u8
12525
12526_NEON2SSE_GLOBAL poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
12527#define vdup_lane_p16 vdup_lane_s16
12528
12529_NEON2SSESTORAGE float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
12530_NEON2SSE_INLINE float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane)
12531{
12532 float32x2_t res;
12533 res.m64_f32[0] = vec.m64_f32[lane];
12534 res.m64_f32[1] = res.m64_f32[0];
12535 return res;
12536}
12537
12538_NEON2SSESTORAGE uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
12539_NEON2SSE_INLINE uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane) // VDUP.8 q0,d0[0]
12540{
12541 const int8_t lane8 = (int8_t) lane;
12542 _NEON2SSE_ALIGN_16 int8_t lanemask8[16];
12543
12544 lanemask8[0] = lane8;
12545 lanemask8[1] = lane8;
12546 lanemask8[2] = lane8;
12547 lanemask8[3] = lane8;
12548 lanemask8[4] = lane8;
12549 lanemask8[5] = lane8;
12550 lanemask8[6] = lane8;
12551 lanemask8[7] = lane8;
12552 lanemask8[8] = lane8;
12553 lanemask8[9] = lane8;
12554 lanemask8[10] = lane8;
12555 lanemask8[11] = lane8;
12556 lanemask8[12] = lane8;
12557 lanemask8[13] = lane8;
12558 lanemask8[14] = lane8;
12559 lanemask8[15] = lane8;
12560
12561 return _mm_shuffle_epi8 (_pM128i(vec), *(__m128i*) lanemask8);
12562}
12563
12564_NEON2SSESTORAGE uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
12565_NEON2SSE_INLINE uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane) // VDUP.16 q0,d0[0]
12566{
    //we can reuse the 8-bit shuffle for 16-bit data as well
12568 const int8_t lane16 = ((int8_t) lane) << 1;
12569 const int8_t lane16_1 = lane16 + 1;
12570 _NEON2SSE_ALIGN_16 int8_t lanemask_e16[16];
12571
12572 lanemask_e16[0] = lane16;
12573 lanemask_e16[1] = lane16_1;
12574 lanemask_e16[2] = lane16;
12575 lanemask_e16[3] = lane16_1;
12576 lanemask_e16[4] = lane16;
12577 lanemask_e16[5] = lane16_1;
12578 lanemask_e16[6] = lane16;
12579 lanemask_e16[7] = lane16_1;
12580 lanemask_e16[8] = lane16;
12581 lanemask_e16[9] = lane16_1;
12582 lanemask_e16[10] = lane16;
12583 lanemask_e16[11] = lane16_1;
12584 lanemask_e16[12] = lane16;
12585 lanemask_e16[13] = lane16_1;
12586 lanemask_e16[14] = lane16;
12587 lanemask_e16[15] = lane16_1;
12588
12589 return _mm_shuffle_epi8 (_pM128i(vec), *(__m128i*)lanemask_e16);
12590}
12591
12592_NEON2SSESTORAGE uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
12593_NEON2SSE_INLINE uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane)
12594{
    //a function (not a macro) is needed here to stay gcc-friendly and to meet the immediate-constant requirement of _mm_shuffle_epi32
12596 if (lane == 1)
12597 return _mm_shuffle_epi32 (_pM128i(vec), (1 | (1 << 2) | (1 << 4) | (1 << 6)) );
12598 else
12599 return _mm_shuffle_epi32 (_pM128i(vec), 0);
12600}
12601
12602_NEON2SSE_GLOBAL int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
12603#define vdupq_lane_s8 vdupq_lane_u8
12604
12605_NEON2SSE_GLOBAL int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
12606#define vdupq_lane_s16 vdupq_lane_u16
12607
12608_NEON2SSE_GLOBAL int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
12609#define vdupq_lane_s32 vdupq_lane_u32
12610
12611_NEON2SSE_GLOBAL poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
12612#define vdupq_lane_p8 vdupq_lane_u8
12613
12614_NEON2SSE_GLOBAL poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
12615#define vdupq_lane_p16 vdupq_lane_s16
12616
12617_NEON2SSE_GLOBAL float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
12618#define vdupq_lane_f32(vec, lane) _mm_load1_ps((vec.m64_f32 + lane))
12619
12620_NEON2SSE_GLOBAL int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
12621#define vdup_lane_s64(vec,lane) vec
12622
12623_NEON2SSE_GLOBAL uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
12624#define vdup_lane_u64(vec,lane) vec
12625
12626_NEON2SSESTORAGE int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
12627_NEON2SSE_INLINE int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane)
12628{
12629 __m128i vec128;
12630 UNREFERENCED_PARAMETER(lane);
12631 vec128 = _pM128i(vec);
12632 return _mm_unpacklo_epi64(vec128,vec128);
12633}
12634
12635_NEON2SSE_GLOBAL uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
12636#define vdupq_lane_u64 vdupq_lane_s64
12637
12638// ********************************************************************
12639// ******************** Combining vectors *****************************
12640// ********************************************************************
12641//These intrinsics join two 64 bit vectors into a single 128bit vector.
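//The following usage sketch is illustrative only and not part of the original header (variable names are assumed):
//    int8x8_t  low8, high8;                    //two 64-bit halves obtained elsewhere
//    int8x16_t q = vcombine_s8(low8, high8);   //q[0..7] = low8[0..7], q[8..15] = high8[0..7]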
12642_NEON2SSESTORAGE int8x16_t vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0
12643_NEON2SSE_INLINE int8x16_t vcombine_s8(int8x8_t low, int8x8_t high)
12644{
12645 return _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) );
12646}
12647
12648_NEON2SSE_GLOBAL int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0
12649#define vcombine_s16 vcombine_s8
12650
12651_NEON2SSE_GLOBAL int32x4_t vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0
12652#define vcombine_s32 vcombine_s8
12653
12654_NEON2SSE_GLOBAL int64x2_t vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0
12655#define vcombine_s64 vcombine_s8
12656
12657_NEON2SSE_GLOBAL float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0
12658//current IA SIMD doesn't support float16
12659
12660_NEON2SSESTORAGE float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
12661_NEON2SSE_INLINE float32x4_t vcombine_f32(float32x2_t low, float32x2_t high)
12662{
12663 __m128i res;
12664 res = _mm_unpacklo_epi64(_pM128i(low), _pM128i(high) );
12665 return _M128(res);
12666}
12667
12668_NEON2SSE_GLOBAL uint8x16_t vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0
12669#define vcombine_u8 vcombine_s8
12670
12671_NEON2SSE_GLOBAL uint16x8_t vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0
12672#define vcombine_u16 vcombine_s16
12673
12674_NEON2SSE_GLOBAL uint32x4_t vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0
12675#define vcombine_u32 vcombine_s32
12676
12677_NEON2SSE_GLOBAL uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0
12678#define vcombine_u64 vcombine_s64
12679
12680_NEON2SSE_GLOBAL poly8x16_t vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0
12681#define vcombine_p8 vcombine_u8
12682
12683_NEON2SSE_GLOBAL poly16x8_t vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0
12684#define vcombine_p16 vcombine_u16
12685
12686//**********************************************************************
12687//************************* Splitting vectors **************************
12688//**********************************************************************
12689//**************** Get high part ******************************************
12690//These intrinsics split a 128 bit vector into 2 component 64 bit vectors
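//Illustrative sketch (assumed variable names, kept as a comment, not part of the original header):
//    int16x8_t q  = vcombine_s16(d0, d1);
//    int16x4_t hi = vget_high_s16(q);          //hi == d1
//    int16x4_t lo = vget_low_s16(q);           //lo == d0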
12691_NEON2SSESTORAGE int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0
12692_NEON2SSE_INLINE int8x8_t vget_high_s8(int8x16_t a)
12693{
12694 int8x8_t res64;
12695 __m128i res;
12696 res = _mm_unpackhi_epi64(a,a); //SSE2
12697 return64(res);
12698}
12699
12700_NEON2SSESTORAGE int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0
12701_NEON2SSE_INLINE int16x4_t vget_high_s16(int16x8_t a)
12702{
12703 int16x4_t res64;
12704 __m128i res;
12705 res = _mm_unpackhi_epi64(a,a); //SSE2
12706 return64(res);
12707}
12708
12709_NEON2SSESTORAGE int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0
12710_NEON2SSE_INLINE int32x2_t vget_high_s32(int32x4_t a)
12711{
12712 int32x2_t res64;
12713 __m128i res;
12714 res = _mm_unpackhi_epi64(a,a); //SSE2
12715 return64(res);
12716}
12717
12718_NEON2SSESTORAGE int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0
12719_NEON2SSE_INLINE int64x1_t vget_high_s64(int64x2_t a)
12720{
12721 int64x1_t res64;
12722 __m128i res;
12723 res = _mm_unpackhi_epi64(a,a); //SSE2
12724 return64(res);
12725}
12726
12727_NEON2SSE_GLOBAL float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0
12728// IA32 SIMD doesn't work with 16bit floats currently
12729
12730_NEON2SSESTORAGE float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0
12731_NEON2SSE_INLINE float32x2_t vget_high_f32(float32x4_t a)
12732{
12733 __m128i res;
12734 __m64_128 res64;
12735 res = _mm_unpackhi_epi64(_M128i(a),_M128i(a));
12736 return64(res);
12737}
12738
12739_NEON2SSE_GLOBAL uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0
12740#define vget_high_u8 vget_high_s8
12741
12742_NEON2SSE_GLOBAL uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0
12743#define vget_high_u16 vget_high_s16
12744
12745_NEON2SSE_GLOBAL uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0
12746#define vget_high_u32 vget_high_s32
12747
12748_NEON2SSE_GLOBAL uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0
12749#define vget_high_u64 vget_high_s64
12750
12751_NEON2SSE_GLOBAL poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0
12752#define vget_high_p8 vget_high_u8
12753
12754_NEON2SSE_GLOBAL poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0
12755#define vget_high_p16 vget_high_u16
12756
12757//********************** Get low part **********************
12758//**********************************************************
12759_NEON2SSESTORAGE int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0
12760_NEON2SSE_INLINE int8x8_t vget_low_s8(int8x16_t a) // VMOV d0,d0
12761{
    int8x8_t res64;
12763 return64(a);
12764}
12765
12766_NEON2SSESTORAGE int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0
12767_NEON2SSE_INLINE int16x4_t vget_low_s16(int16x8_t a) // VMOV d0,d0
12768{
12769 int16x4_t res64;
12770 return64(a);
12771}
12772
12773_NEON2SSESTORAGE int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0
12774_NEON2SSE_INLINE int32x2_t vget_low_s32(int32x4_t a) // VMOV d0,d0
12775{
12776 int32x2_t res64;
12777 return64(a);
12778}
12779
12780_NEON2SSESTORAGE int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0
12781_NEON2SSE_INLINE int64x1_t vget_low_s64(int64x2_t a) // VMOV d0,d0
12782{
12783 int64x1_t res64;
12784 return64 (a);
12785}
12786
12787_NEON2SSE_GLOBAL float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0
12788// IA32 SIMD doesn't work with 16bit floats currently
12789
12790_NEON2SSESTORAGE float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0
12791_NEON2SSE_INLINE float32x2_t vget_low_f32(float32x4_t a)
12792{
12793 float32x2_t res64;
12794 _M64f(res64, a);
12795 return res64;
12796}
12797
12798_NEON2SSE_GLOBAL uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0
12799#define vget_low_u8 vget_low_s8
12800
12801_NEON2SSE_GLOBAL uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0
12802#define vget_low_u16 vget_low_s16
12803
12804_NEON2SSE_GLOBAL uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0
12805#define vget_low_u32 vget_low_s32
12806
12807_NEON2SSE_GLOBAL uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0
12808#define vget_low_u64 vget_low_s64
12809
12810_NEON2SSE_GLOBAL poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0
12811#define vget_low_p8 vget_low_u8
12812
12813_NEON2SSE_GLOBAL poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0
12814#define vget_low_p16 vget_low_s16
12815
12816//**************************************************************************
12817//************************ Converting vectors **********************************
12818//**************************************************************************
12819//************* Convert from float ***************************************
// the rounding mode needs to be set accordingly via _MM_SET_ROUNDING_MODE(x)
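//A hedged usage sketch (not part of the original header): ARM VCVT.S32.F32 truncates toward zero,
//so when the mapping relies on _mm_cvtps_epi32 (which follows the current MXCSR rounding mode)
//the caller may need to select truncation first, e.g.:
//    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);  //match the ARM truncation behaviour
//    int32x2_t i = vcvt_s32_f32(f);                 //f is some float32x2_t value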
12821_NEON2SSESTORAGE int32x2_t vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0
12822_NEON2SSE_INLINE int32x2_t vcvt_s32_f32(float32x2_t a)
12823{
12824 int32x2_t res64;
12825 __m128i res;
12826 res = _mm_cvtps_epi32(_pM128(a)); //use low 64 bits of result only
12827 return64(res);
12828}
12829
12830_NEON2SSESTORAGE uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0
12831_NEON2SSE_INLINE uint32x2_t vcvt_u32_f32(float32x2_t a)
12832{
12833 uint32x2_t res64;
12834 __m128i res;
12835 res = vcvtq_u32_f32(_pM128(a));
12836 return64(res);
12837}
12838
12839_NEON2SSESTORAGE int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0
12840_NEON2SSE_INLINE int32x4_t vcvtq_s32_f32(float32x4_t a)
12841{
12842 __m128 dif;
12843 __m128i res;
    //_mm_cvttps_epi32 mishandles the case a >= 2.14748364e+009, therefore special processing is necessary
12845 _NEON2SSE_ALIGN_16 static const float32_t fmax[] = { 2.14748364e+009f, 2.14748364e+009f, 2.14748364e+009f, 2.14748364e+009f };
12846 dif = _mm_cmpge_ps(a, *(__m128*)fmax);
12847 res = _mm_cvttps_epi32(a);
12848 return _mm_xor_si128(res, _M128i(dif));
12849}
12850
12851_NEON2SSESTORAGE uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0
12852_NEON2SSE_INLINE uint32x4_t vcvtq_u32_f32(float32x4_t a) // VCVT.U32.F32 q0, q0
12853{
    //No single-instruction SSE solution, but it can be implemented as follows:
12855 __m128i res1, res2, zero, mask;
12856 __m128 max, min, dif;
12857 _NEON2SSE_ALIGN_16 static const float32_t fmax[] = { 2.14748364e+009f, 2.14748364e+009f, 2.14748364e+009f, 2.14748364e+009f };
12858 _NEON2SSE_ALIGN_16 static const float32_t fmax_unsigned[] = { 4.29496729e+009f, 4.29496729e+009f, 4.29496729e+009f, 4.29496729e+009f };
12859 zero = _mm_setzero_si128();
12860 mask = _mm_cmpgt_epi32(_M128i(a), zero);
12861 min = _mm_and_ps(_M128(mask), a);
12862 max = _mm_min_ps(min, *(__m128*)fmax_unsigned); //clamped in 0 - 4.29496729+009
12863
12864 dif = _mm_sub_ps(max, *(__m128*)fmax);
12865 mask = _mm_cmpgt_epi32(_M128i(dif),zero);
12866 dif = _mm_and_ps(_M128(mask), dif);
12867
12868 res1 = _mm_cvttps_epi32(dif);
12869 res2 = vcvtq_s32_f32(max);
12870 return _mm_add_epi32(res1, res2);
12871}
12872
12873// ***** Convert to the fixed point with the number of fraction bits specified by b ***********
12874//*************************************************************************************************
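//Worked sketch (illustrative, not part of the original header): the input is scaled by 2^b and then
//converted, so the result is the value in fixed-point format with b fraction bits, e.g.:
//    float32x2_t f  = vdup_n_f32(1.5f);
//    int32x2_t   q8 = vcvt_n_s32_f32(f, 8);    //1.5 * 2^8 = 384 in each lane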
12875_NEON2SSESTORAGE uint32_t clamp_u32_f32(float v);
12876_NEON2SSE_INLINE uint32_t clamp_u32_f32(float v)
12877{
12878 return (v <= 0 ? 0 : (v >= (float)~0U ? ~0U : (uint32_t)(v)));
12879}
12880
12881_NEON2SSESTORAGE int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32
12882_NEON2SSE_INLINE int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b)
12883{
12884 int32x2_t res64;
12885 return64(vcvtq_n_s32_f32(_pM128(a),b));
12886}
12887
12888_NEON2SSESTORAGE uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32
12889_NEON2SSE_INLINE uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b)
12890{
12891 uint32x2_t res;
12892 float convconst;
12893 convconst = (float)((uint64_t)1 << b);
12894 res.m64_u32[0] = clamp_u32_f32(a.m64_f32[0] * convconst);
12895 res.m64_u32[1] = clamp_u32_f32(a.m64_f32[1] * convconst);
12896 return res;
12897}
12898
12899_NEON2SSESTORAGE int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32
12900_NEON2SSE_INLINE int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b)
12901{
12902 float convconst;
12903 _NEON2SSE_ALIGN_16 static const uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
12904 __m128 cconst128;
12905 __m128i mask, res;
12906 convconst = (float)((uint64_t)1 << b);
12907 cconst128 = vdupq_n_f32(convconst);
12908 res = _mm_cvttps_epi32(_mm_mul_ps(a,cconst128));
12909 mask = _mm_cmpeq_epi32 (res, *(__m128i*)cmask);
12910
12911 /* ...for negative values we do not want to negate the bits of saturated value */
12912 mask = _mm_and_si128(_mm_castps_si128(_mm_cmpgt_ps(a,_mm_setzero_ps())), mask);
12913
12914 return _mm_xor_si128 (res, mask); //res saturated for 0x80000000
12915}
12916
12917_NEON2SSESTORAGE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32
12918_NEON2SSE_INLINE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b)
12919{
12920 float convconst;
12921 __m128 cconst128;
12922 convconst = (float)((uint64_t)1 << b);
12923 cconst128 = vdupq_n_f32(convconst);
12924 return vcvtq_u32_f32(_mm_mul_ps(a,cconst128));
12925}
12926
12927
12928_NEON2SSESTORAGE int32x4_t vcvtnq_s32_f32(float32x4_t a); // VCVTN.S32.F32 q0, q0
12929_NEON2SSE_INLINE int32x4_t vcvtnq_s32_f32(float32x4_t a)
12930{
12931 return _mm_cvtps_epi32(a);
12932}
12933
12934//***************** Convert to float *************************
12935//*************************************************************
12936_NEON2SSESTORAGE float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0
12937_NEON2SSE_INLINE float32x2_t vcvt_f32_s32(int32x2_t a) //use low 64 bits
12938{
12939 float32x2_t res;
12940 res.m64_f32[0] = (float) a.m64_i32[0];
12941 res.m64_f32[1] = (float) a.m64_i32[1];
12942 return res;
12943}
12944
12945_NEON2SSESTORAGE float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0
12946_NEON2SSE_INLINE float32x2_t vcvt_f32_u32(uint32x2_t a)
12947{
12948 float32x2_t res;
12949 res.m64_f32[0] = (float) a.m64_u32[0];
12950 res.m64_f32[1] = (float) a.m64_u32[1];
12951 return res;
12952}
12953
12954_NEON2SSE_GLOBAL float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0
12955#define vcvtq_f32_s32(a) _mm_cvtepi32_ps(a)
12956
12957_NEON2SSESTORAGE float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0
12958_NEON2SSE_INLINE float32x4_t vcvtq_f32_u32(uint32x4_t a) // VCVT.F32.U32 q0, q0
12959{
    //the solution may not be optimal
12961 __m128 two16, fHi, fLo;
12962 __m128i hi, lo;
12963 two16 = _mm_set1_ps((float)0x10000); //2^16
12964 // Avoid double rounding by doing two exact conversions
12965 // of high and low 16-bit segments
12966 hi = _mm_srli_epi32(a, 16);
12967 lo = _mm_srli_epi32(_mm_slli_epi32(a, 16), 16);
12968 fHi = _mm_mul_ps(_mm_cvtepi32_ps(hi), two16);
12969 fLo = _mm_cvtepi32_ps(lo);
12970 // do single rounding according to current rounding mode
12971 return _mm_add_ps(fHi, fLo);
12972}
12973
12974// ***** Convert to the float from fixed point with the number of fraction bits specified by b ***********
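//Illustrative counterpart of the fixed-point example above (assumed values, not part of the original header):
//    int32x2_t   q8 = vdup_n_s32(384);
//    float32x2_t f  = vcvt_n_f32_s32(q8, 8);   //384 / 2^8 = 1.5f in each lane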
12975_NEON2SSESTORAGE float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32
12976_NEON2SSE_INLINE float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b)
12977{
12978 float32x2_t res;
12979 float convconst;
12980 convconst = (float)(1. / ((uint64_t)1 << b));
12981 res.m64_f32[0] = a.m64_i32[0] * convconst;
12982 res.m64_f32[1] = a.m64_i32[1] * convconst;
12983 return res;
12984}
12985
12986_NEON2SSESTORAGE float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32
12987_NEON2SSE_INLINE float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b) // VCVT.F32.U32 d0, d0, #32
12988{
12989 float32x2_t res;
12990 float convconst;
12991 convconst = (float)(1. / ((uint64_t)1 << b));
12992 res.m64_f32[0] = a.m64_u32[0] * convconst;
12993 res.m64_f32[1] = a.m64_u32[1] * convconst;
12994 return res;
12995}
12996
12997_NEON2SSESTORAGE float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32
12998_NEON2SSE_INLINE float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b)
12999{
13000 float convconst;
13001 __m128 cconst128, af;
13002 convconst = (float)(1. / ((uint64_t)1 << b));
13003 af = _mm_cvtepi32_ps(a);
13004 cconst128 = vdupq_n_f32(convconst);
13005 return _mm_mul_ps(af,cconst128);
13006}
13007
13008_NEON2SSESTORAGE float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32
13009_NEON2SSE_INLINE float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b)
13010{
13011 float convconst;
13012 __m128 cconst128, af;
13013 convconst = (float)(1. / ((uint64_t)1 << b));
13014 af = vcvtq_f32_u32(a);
13015 cconst128 = vdupq_n_f32(convconst);
13016 return _mm_mul_ps(af,cconst128);
13017}
13018
13019//**************Convert between floats ***********************
13020//************************************************************
13021_NEON2SSE_GLOBAL float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0
//Intel SIMD doesn't support 16-bit floats currently
13023
13024_NEON2SSE_GLOBAL float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0
//Intel SIMD doesn't support 16-bit floats currently; the only solution is to store 16-bit floats and load them as 32 bits
13026
13027//************Vector narrow integer conversion (truncation) ******************
13028//****************************************************************************
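//Illustrative sketch (hypothetical values, not part of the original header): the narrowing is a plain truncation, only the low half of each element survives:
//    int16x8_t w = vdupq_n_s16(0x1234);
//    int8x8_t  n = vmovn_s16(w);               //every byte becomes 0x34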
13029_NEON2SSESTORAGE int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0
13030_NEON2SSE_INLINE int8x8_t vmovn_s16(int16x8_t a) // VMOVN.I16 d0,q0
13031{
13032 int8x8_t res64;
13033 __m128i res;
13034 res = _mm_shuffle_epi8 (a, *(__m128i*) mask8_16_even_odd); //use 64 low bits only
13035 return64(res);
13036}
13037
13038_NEON2SSESTORAGE int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0
13039_NEON2SSE_INLINE int16x4_t vmovn_s32(int32x4_t a) // VMOVN.I32 d0,q0
13040{
13041 int16x4_t res64;
13042 __m128i res;
13043 res = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd); //use 64 low bits only
13044 return64(res);
13045}
13046
13047_NEON2SSESTORAGE int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0
13048_NEON2SSE_INLINE int32x2_t vmovn_s64(int64x2_t a)
13049{
    //may be less efficient than a serial implementation
13051 int32x2_t res64;
13052 __m128i res;
13053 res = _mm_shuffle_epi32 (a, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //use 64 low bits only, _MM_SHUFFLE(3, 1, 2, 0)
13054 return64(res);
13055}
13056
13057_NEON2SSE_GLOBAL uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0
13058#define vmovn_u16 vmovn_s16
13059
13060_NEON2SSE_GLOBAL uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0
13061#define vmovn_u32 vmovn_s32
13062
13063_NEON2SSE_GLOBAL uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0
13064#define vmovn_u64 vmovn_s64
13065
13066//**************** Vector long move ***********************
13067//***********************************************************
13068_NEON2SSESTORAGE int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0
13069_NEON2SSE_INLINE int16x8_t vmovl_s8(int8x8_t a)
13070{
13071 return _MM_CVTEPI8_EPI16(_pM128i(a)); //SSE4.1
13072}
13073
13074_NEON2SSESTORAGE int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0
13075_NEON2SSE_INLINE int32x4_t vmovl_s16(int16x4_t a)
13076{
13077 return _MM_CVTEPI16_EPI32(_pM128i(a)); //SSE4.1
13078}
13079
13080_NEON2SSESTORAGE int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0
13081_NEON2SSE_INLINE int64x2_t vmovl_s32(int32x2_t a)
13082{
13083 return _MM_CVTEPI32_EPI64(_pM128i(a)); //SSE4.1
13084}
13085
13086_NEON2SSESTORAGE uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 q0,d0
13087_NEON2SSE_INLINE uint16x8_t vmovl_u8(uint8x8_t a)
13088{
13089 return _MM_CVTEPU8_EPI16(_pM128i(a)); //SSE4.1
13090}
13091
13092_NEON2SSESTORAGE uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.s16 q0,d0
13093_NEON2SSE_INLINE uint32x4_t vmovl_u16(uint16x4_t a)
13094{
13095 return _MM_CVTEPU16_EPI32(_pM128i(a)); //SSE4.1
13096}
13097
13098_NEON2SSESTORAGE uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0
13099_NEON2SSE_INLINE uint64x2_t vmovl_u32(uint32x2_t a)
13100{
13101 return _MM_CVTEPU32_EPI64(_pM128i(a)); //SSE4.1
13102}
13103
13104//*************Vector saturating narrow integer*****************
13105//**************************************************************
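//Illustrative sketch (hypothetical values, not part of the original header): unlike vmovn, out-of-range elements saturate instead of being truncated:
//    int16x8_t w = vdupq_n_s16(300);
//    int8x8_t  n = vqmovn_s16(w);              //every byte becomes 127 (the int8 maximum)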
13106_NEON2SSESTORAGE int8x8_t vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0
13107_NEON2SSE_INLINE int8x8_t vqmovn_s16(int16x8_t a)
13108{
13109 int8x8_t res64;
13110 __m128i res;
13111 res = _mm_packs_epi16(a, a);
13112 return64(res);
13113}
13114
13115_NEON2SSESTORAGE int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0
13116_NEON2SSE_INLINE int16x4_t vqmovn_s32(int32x4_t a)
13117{
13118 int16x4_t res64;
13119 __m128i res;
13120 res = _mm_packs_epi32(a, a);
13121 return64(res);
13122}
13123
13124_NEON2SSESTORAGE int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0
13125_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqmovn_s64(int64x2_t a),_NEON2SSE_REASON_SLOW_SERIAL) //no effective SIMD solution
13126{
13127 int32x2_t res;
13128 _NEON2SSE_ALIGN_16 int64_t atmp[2];
13129 _mm_store_si128((__m128i*)atmp, a);
13130 if(atmp[0]>SINT_MAX) atmp[0] = SINT_MAX;
13131 if(atmp[0]<SINT_MIN) atmp[0] = SINT_MIN;
13132 if(atmp[1]>SINT_MAX) atmp[1] = SINT_MAX;
13133 if(atmp[1]<SINT_MIN) atmp[1] = SINT_MIN;
13134 res.m64_i32[0] = (int32_t)atmp[0];
13135 res.m64_i32[1] = (int32_t)atmp[1];
13136 return res;
13137}
13138
13139_NEON2SSESTORAGE uint8x8_t vqmovn_u16(uint16x8_t a); // VQMOVN.s16 d0,q0
13140_NEON2SSE_INLINE uint8x8_t vqmovn_u16(uint16x8_t a) // VQMOVN.s16 d0,q0
13141{
    //no uint16 to uint8 saturating conversion in SSE, need to truncate to max signed first. Also avoiding _mm_shuffle_epi8 because of its high latency on old Atom CPUs
13143 uint8x8_t res64;
13144 __m128i c7fff, a_trunc, mask_trunc;
13145 c7fff = _mm_set1_epi16 (0x7fff); // 15-th bit set to zero
13146 a_trunc = _mm_and_si128(a, c7fff); // a truncated to max signed
    mask_trunc = _mm_cmpgt_epi16(a_trunc, a); //if the truncated value compares greater (as signed) than the original then the 15-th bit had been set initially
13148 mask_trunc = _mm_and_si128(mask_trunc, c7fff); //zero or c7fff if the 15-th bit had been set initially
13149 a_trunc = _mm_or_si128(a_trunc, mask_trunc);
13150 a_trunc = _mm_packus_epi16 (a_trunc, a_trunc); //use low 64bits only
13151 return64(a_trunc);
13152}
13153
13154_NEON2SSESTORAGE uint16x4_t vqmovn_u32(uint32x4_t a); // VQMOVN.U32 d0,q0
13155_NEON2SSE_INLINE uint16x4_t vqmovn_u32(uint32x4_t a) // VQMOVN.U32 d0,q0
13156{
13157 #ifdef USE_SSE4
13158 //no uint32 to uint16 conversion in SSE, need truncate to max signed first
13159 uint16x4_t res64;
13160 __m128i c7fffffff, a_trunc, mask_trunc;
13161 c7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff); // 31-th bit set to zero
13162 a_trunc = _mm_and_si128(a, c7fffffff); // a truncated to max signed
    mask_trunc = _mm_cmpgt_epi16(a_trunc, a); //if the truncated value compares greater (as signed) than the original then the 31-st bit had been set initially
    mask_trunc = _mm_and_si128(mask_trunc, c7fffffff); //zero, or a pattern forcing saturation in the pack below, if the 31-st bit had been set initially
13165 a_trunc = _mm_or_si128(a_trunc, mask_trunc);
13166 a_trunc = _MM_PACKUS1_EPI32 (a_trunc); //use low 64bits only
13167 return64(a_trunc);
13168 #else
13169 uint16x4_t res64;
13170 __m128i res_hi, mask;
13171 mask = _mm_setzero_si128();
13172 res_hi = _mm_srli_epi32(a, 16);
13173 res_hi = _mm_cmpeq_epi16(res_hi, mask);
13174 mask = _mm_cmpeq_epi16(mask,mask); //all fff
    mask = _mm_andnot_si128(res_hi,mask); //invert res_hi to get a mask for the numbers exceeding 16 bits
13176 res_hi = _mm_or_si128(a, mask); //saturated res
13177 res_hi = _mm_shuffle_epi8 (res_hi, *(__m128i*) mask8_32_even_odd); //go to 16 bits
13178 return64(res_hi);
13179 #endif
13180}
13181
13182_NEON2SSESTORAGE uint32x2_t vqmovn_u64(uint64x2_t a); // VQMOVN.U64 d0,q0
13183_NEON2SSE_INLINE uint32x2_t vqmovn_u64(uint64x2_t a)
13184{
13185 //serial solution may be faster
13186 uint32x2_t res64;
13187 __m128i res_hi, mask;
13188 mask = _mm_setzero_si128();
13189 res_hi = _mm_srli_epi64(a, 32);
13190 res_hi = _mm_cmpeq_epi32(res_hi, mask);
13191 mask = _mm_cmpeq_epi32(mask,mask); //all fff
    mask = _mm_andnot_si128(res_hi,mask); //invert res_hi to get a mask for the numbers exceeding 32 bits
13193 res_hi = _mm_or_si128(a, mask);
13194 res_hi = _mm_shuffle_epi32(res_hi, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
13195 return64(res_hi);
13196}
13197//************* Vector saturating narrow integer signed->unsigned **************
13198//*****************************************************************************
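//Illustrative sketch (hypothetical values, not part of the original header): negative inputs clamp to 0 and too-large inputs to the unsigned maximum:
//    int16x8_t w = vdupq_n_s16(-5);
//    uint8x8_t n = vqmovun_s16(w);             //every byte becomes 0; an input of 300 would give 255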
13199_NEON2SSESTORAGE uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0
13200_NEON2SSE_INLINE uint8x8_t vqmovun_s16(int16x8_t a)
13201{
13202 uint8x8_t res64;
13203 __m128i res;
13204 res = _mm_packus_epi16(a, a); //use low 64bits only
13205 return64(res);
13206}
13207
13208_NEON2SSESTORAGE uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0
13209_NEON2SSE_INLINE uint16x4_t vqmovun_s32(int32x4_t a)
13210{
13211 uint16x4_t res64;
13212 __m128i res;
13213 res = _MM_PACKUS1_EPI32(a); //use low 64bits only
13214 return64(res);
13215}
13216
13217_NEON2SSESTORAGE uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0
13218_NEON2SSE_INLINE uint32x2_t vqmovun_s64(int64x2_t a)
13219{
13220 uint32x2_t res64;
13221 __m128i res_hi,res_lo, zero, cmp;
13222 zero = _mm_setzero_si128();
13223 res_hi = _mm_srli_epi64(a, 32);
13224 cmp = _mm_cmpgt_epi32(zero, res_hi); //if cmp<0 the result should be zero
13225 res_lo = _mm_andnot_si128(cmp,a); //if cmp zero - do nothing, otherwise cmp <0 and the result is 0
13226 cmp = _mm_cmpgt_epi32(res_hi,zero); //if cmp positive
    res_lo = _mm_or_si128(res_lo, cmp); //if cmp is positive we are out of 32 bits and need to saturate to 0xffffffff
13228 res_lo = _mm_shuffle_epi32(res_lo, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
13229 return64(res_lo);
13230}
13231
13232// ********************************************************
13233// **************** Table look up **************************
13234// ********************************************************
13235//VTBL (Vector Table Lookup) uses byte indexes in a control vector to look up byte values
13236//in a table and generate a new vector. Indexes out of range return 0.
13237
//for Intel SIMD (_mm_shuffle_epi8) the MSB of an index byte must be set to 1 to return zero
//if an index byte in b is already above the max signed value (i.e. appears negative), its MSB is set and it needs no special processing
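//Minimal usage sketch (assumed data, kept as a comment, not part of the original header):
//    uint8x8_t table = vcreate_u8(0x0706050403020100ULL); //table[i] = i
//    uint8x8_t index = vdup_n_u8(3);
//    uint8x8_t out   = vtbl1_u8(table, index);             //every byte = table[3] = 3; an index >= 8 would give 0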
13240_NEON2SSESTORAGE uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
13241_NEON2SSE_INLINE uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b)
13242{
13243 uint8x8_t res64;
13244 __m128i c7, maskgt, bmask, b128;
13245 c7 = _mm_set1_epi8 (7);
13246 b128 = _pM128i(b);
13247 maskgt = _mm_cmpgt_epi8(b128,c7);
13248 bmask = _mm_or_si128(b128,maskgt);
13249 bmask = _mm_shuffle_epi8(_pM128i(a),bmask);
13250 return64(bmask);
13251}
13252
13253_NEON2SSE_GLOBAL int8x8_t vtbl1_s8(int8x8_t a, int8x8_t b); // VTBL.8 d0, {d0}, d0
13254#define vtbl1_s8 vtbl1_u8
13255
13256_NEON2SSE_GLOBAL poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
13257#define vtbl1_p8 vtbl1_u8
13258
13259_NEON2SSESTORAGE uint8x8_t vtbl2_u8(uint8x8x2_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
13260_NEON2SSE_INLINE uint8x8_t vtbl2_u8(uint8x8x2_t a, uint8x8_t b)
13261{
13262 uint8x8_t res64;
13263 __m128i c15, a01, maskgt15, bmask, b128;
13264 c15 = _mm_set1_epi8 (15);
13265 b128 = _pM128i(b);
13266 maskgt15 = _mm_cmpgt_epi8(b128,c15);
13267 bmask = _mm_or_si128(b128, maskgt15);
13268 a01 = _mm_unpacklo_epi64(_pM128i(a.val[0]), _pM128i(a.val[1]));
13269 a01 = _mm_shuffle_epi8(a01, bmask);
13270 return64(a01);
13271}
13272
13273//int8x8_t vtbl2_s8(int8x8x2_t a, int8x8_t b); // VTBL.8 d0, {d0, d1}, d0
13274#define vtbl2_s8 vtbl2_u8
13275
13276//poly8x8_t vtbl2_p8(poly8x8x2_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
13277#define vtbl2_p8 vtbl2_u8
13278
13279_NEON2SSESTORAGE uint8x8_t vtbl3_u8(uint8x8x3_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
13280_NEON2SSE_INLINE uint8x8_t vtbl3_u8(uint8x8x3_t a, uint8x8_t b)
13281{
    //the solution may not be optimal
13283 uint8x8_t res64;
13284 __m128i c15, c23, maskgt23, bmask, maskgt15, sh0, sh1, a01, b128;
13285 c15 = _mm_set1_epi8 (15);
13286 c23 = _mm_set1_epi8 (23);
13287 b128 = _pM128i(b);
13288 maskgt23 = _mm_cmpgt_epi8(b128,c23);
13289 bmask = _mm_or_si128(b128, maskgt23);
13290 maskgt15 = _mm_cmpgt_epi8(b128,c15);
13291 a01 = _mm_unpacklo_epi64(_pM128i(a.val[0]),_pM128i(a.val[1]));
13292 sh0 = _mm_shuffle_epi8(a01, bmask);
    sh1 = _mm_shuffle_epi8(_pM128i(a.val[2]), bmask); //for bi>15 the index is wrapped by the shuffle (bi &= 15)
13294 sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15); //SSE4.1
13295 return64(sh0);
13296}
13297
13298_NEON2SSE_GLOBAL int8x8_t vtbl3_s8(int8x8x3_t a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
13299#define vtbl3_s8 vtbl3_u8
13300
13301_NEON2SSE_GLOBAL poly8x8_t vtbl3_p8(poly8x8x3_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
13302#define vtbl3_p8 vtbl3_u8
13303
13304_NEON2SSESTORAGE uint8x8_t vtbl4_u8(uint8x8x4_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
13305_NEON2SSE_INLINE uint8x8_t vtbl4_u8(uint8x8x4_t a, uint8x8_t b)
13306{
    //the solution may not be optimal
13308 uint8x8_t res64;
13309 __m128i c15, c31, maskgt31, bmask, maskgt15, sh0, sh1, a01, a23, b128;
13310 c15 = _mm_set1_epi8 (15);
13311 c31 = _mm_set1_epi8 (31);
13312 b128 = _pM128i(b);
13313 maskgt31 = _mm_cmpgt_epi8(b128,c31);
13314 bmask = _mm_or_si128(b128, maskgt31);
13315 maskgt15 = _mm_cmpgt_epi8(b128,c15);
13316 a01 = _mm_unpacklo_epi64(_pM128i(a.val[0]),_pM128i(a.val[1]));
13317 a23 = _mm_unpacklo_epi64(_pM128i(a.val[2]),_pM128i(a.val[3]));
13318 sh0 = _mm_shuffle_epi8(a01, bmask);
    sh1 = _mm_shuffle_epi8(a23, bmask); //for bi>15 the index is wrapped by the shuffle (bi &= 15)
13320 sh0 = _MM_BLENDV_EPI8 (sh0, sh1, maskgt15); //SSE4.1
13321 return64(sh0);
13322}
13323
13324_NEON2SSE_GLOBAL int8x8_t vtbl4_s8(int8x8x4_t a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
13325#define vtbl4_s8 vtbl4_u8
13326
13327_NEON2SSE_GLOBAL poly8x8_t vtbl4_p8(poly8x8x4_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
13328#define vtbl4_p8 vtbl4_u8
13329
13330//****************** Extended table look up intrinsics ***************************
13331//**********************************************************************************
13332//VTBX (Vector Table Extension) works in the same way as VTBL do,
13333// except that indexes out of range leave the destination element unchanged.
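//Minimal usage sketch (assumed data, kept as a comment, not part of the original header): an out-of-range index keeps the corresponding byte of 'a':
//    uint8x8_t keep = vdup_n_u8(0xAA);
//    uint8x8_t tbl  = vcreate_u8(0x0706050403020100ULL);
//    uint8x8_t idx  = vdup_n_u8(9);                        //9 is out of range for a one-register table
//    uint8x8_t out  = vtbx1_u8(keep, tbl, idx);            //every byte stays 0xAA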
13334
13335_NEON2SSESTORAGE uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
13336_NEON2SSE_INLINE uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c)
13337{
13338 uint8x8_t res64;
13339 __m128i c8, maskgt, sh, c128;
13340 c8 = _mm_set1_epi8(8);
13341 c128 = _pM128i(c);
13342 //need to pre-clamp c values to avoid unsigned comparison
13343 c128 = _mm_min_epu8(c128, c8);
13344 maskgt = _mm_cmpgt_epi8(c8,c128);
13345 sh = _mm_shuffle_epi8(_pM128i(b),c128);
13346 sh = _mm_and_si128(maskgt,sh);
13347 c8 = _mm_andnot_si128(maskgt,_pM128i(a));
13348 sh = _mm_or_si128(sh,c8);
13349 return64(sh);
13350}
13351
13352_NEON2SSE_GLOBAL int8x8_t vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0
13353#define vtbx1_s8 vtbx1_u8
13354
13355_NEON2SSE_GLOBAL poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
13356#define vtbx1_p8 vtbx1_u8
13357
13358_NEON2SSESTORAGE uint8x8_t vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
13359_NEON2SSE_INLINE uint8x8_t vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c)
13360{
13361 uint8x8_t res64;
13362 __m128i c16, b01, maskgt15, sh, c128;
13363 c16 = _mm_set1_epi8(16);
13364 c128 = _pM128i(c);
13365 //need to pre-clamp c values to avoid unsigned comparison
13366 c128 = _mm_min_epu8(c128, c16);
13367 maskgt15 = _mm_cmpgt_epi8(c16,c128);
13368 b01 = _mm_unpacklo_epi64(_pM128i(b.val[0]), _pM128i(b.val[1]));
13369 sh = _mm_shuffle_epi8(b01, c128);
13370 sh = _mm_and_si128(maskgt15, sh);
13371 c16 = _mm_andnot_si128(maskgt15, _pM128i(a));
13372 sh = _mm_or_si128(sh,c16);
13373 return64(sh);
13374}
13375
13376//int8x8_t vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c); // VTBX.8 d0, {d0, d1}, d0
13377#define vtbx2_s8 vtbx2_u8
13378
13379//poly8x8_t vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
13380#define vtbx2_p8 vtbx2_u8
13381
13382_NEON2SSESTORAGE uint8x8_t vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
13383_NEON2SSE_INLINE uint8x8_t vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c)
13384{
    //the solution may not be optimal
13386 uint8x8_t res64;
13387 __m128i c15, c24, maskgt15, maskgt23, sh0, sh1, b01, c128;
13388 c15 = _mm_set1_epi8 (15);
13389 c24 = _mm_set1_epi8 (24);
13390 c128 = _pM128i(c);
13391 //need to pre-clamp c values to avoid unsigned comparison
13392 c128 = _mm_min_epu8(c128, c24);
13393 maskgt23 = _mm_cmpgt_epi8(c24,c128);
13394 maskgt15 = _mm_cmpgt_epi8(c128,c15);
13395 c24 = _mm_andnot_si128(maskgt23, _pM128i(a));
13396 b01 = _mm_unpacklo_epi64(_pM128i(b.val[0]),_pM128i(b.val[1]));
13397 sh0 = _mm_shuffle_epi8(b01, c128);
    sh1 = _mm_shuffle_epi8(_pM128i(b.val[2]), c128); //for bi>15 the index is wrapped by the shuffle (bi &= 15)
13399 sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15);
13400 sh0 = _mm_and_si128(maskgt23,sh0);
13401 sh0 = _mm_or_si128(sh0,c24);
13402 return64(sh0);
13403}
13404
13405_NEON2SSE_GLOBAL int8x8_t vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
13406#define vtbx3_s8 vtbx3_u8
13407
13408_NEON2SSE_GLOBAL poly8x8_t vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
13409#define vtbx3_p8 vtbx3_u8
13410
13411_NEON2SSESTORAGE uint8x8_t vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
13412_NEON2SSE_INLINE uint8x8_t vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c)
13413{
    //the solution may not be optimal
13415 uint8x8_t res64;
13416 __m128i c15, c32, maskgt15, maskgt31, sh0, sh1, b01, b23, c128;
13417 c15 = _mm_set1_epi8 (15);
13418 c32 = _mm_set1_epi8 (32);
13419 c128 = _pM128i(c);
13420 //need to pre-clamp c values to avoid unsigned comparison
13421 c128 = _mm_min_epu8(c128, c32);
13422 maskgt15 = _mm_cmpgt_epi8(c128,c15);
13423 maskgt31 = _mm_cmpgt_epi8(c32,c128);
13424 c32 = _mm_andnot_si128(maskgt31, _pM128i(a));
13425
13426 b01 = _mm_unpacklo_epi64(_pM128i(b.val[0]),_pM128i(b.val[1]));
13427 b23 = _mm_unpacklo_epi64(_pM128i(b.val[2]),_pM128i(b.val[3]));
13428 sh0 = _mm_shuffle_epi8(b01, c128);
    sh1 = _mm_shuffle_epi8(b23, c128); //for bi>15 the index is wrapped by the shuffle (bi &= 15)
13430 sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15);
13431 sh0 = _mm_and_si128(maskgt31,sh0);
13432 sh0 = _mm_or_si128(sh0,c32);
13433 return64(sh0);
13434}
13435
13436_NEON2SSE_GLOBAL int8x8_t vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
13437#define vtbx4_s8 vtbx4_u8
13438
13439_NEON2SSE_GLOBAL poly8x8_t vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
13440#define vtbx4_p8 vtbx4_u8
13441
13442//*************************************************************************************************
13443// *************************** Operations with a scalar value *********************************
13444//*************************************************************************************************
13445
13446//******* Vector multiply accumulate by scalar *************************************************
13447//**********************************************************************************************
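//Minimal usage sketch (assumed variable names, not part of the original header): each lane gets a[i] + b[i] * v[l], e.g.:
//    int16x4_t acc = vmla_lane_s16(a, b, coeffs, 2);       //acc[i] = a[i] + b[i]*coeffs[2]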
13448_NEON2SSESTORAGE int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0, d0[0]
13449_NEON2SSE_INLINE int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLA.I16 d0, d0, d0[0]
13450{
13451 int16_t c;
13452 int16x4_t scalar;
13453 c = vget_lane_s16(v, l);
13454 scalar = vdup_n_s16(c);
13455 return vmla_s16(a, b, scalar);
13456}
13457
13458_NEON2SSESTORAGE int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0, d0[0]
13459_NEON2SSE_INLINE int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLA.I32 d0, d0, d0[0]
13460{
13461 int32_t c;
13462 int32x2_t scalar;
13463 c = vget_lane_s32(v, l);
13464 scalar = vdup_n_s32(c);
13465 return vmla_s32(a, b, scalar);
13466}
13467
13468_NEON2SSE_GLOBAL uint16x4_t vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0, d0[0]
13469#define vmla_lane_u16 vmla_lane_s16
13470
13471
13472_NEON2SSE_GLOBAL uint32x2_t vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0, d0[0]
13473#define vmla_lane_u32 vmla_lane_s32
13474
13475_NEON2SSESTORAGE float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 d0, d0, d0[0]
13476_NEON2SSE_INLINE float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l)
13477{
13478 float32_t vlane;
13479 float32x2_t c;
13480 vlane = vget_lane_f32(v, l);
13481 c = vdup_n_f32(vlane);
13482 return vmla_f32(a,b,c);
13483}
13484
13485_NEON2SSESTORAGE int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0]
13486_NEON2SSE_INLINE int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l) // VMLA.I16 q0, q0, d0[0]
13487{
13488 int16_t vlane;
13489 int16x8_t c;
13490 vlane = vget_lane_s16(v, l);
13491 c = vdupq_n_s16(vlane);
13492 return vmlaq_s16(a,b,c);
13493}
13494
13495_NEON2SSESTORAGE int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0]
13496_NEON2SSE_INLINE int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l) // VMLA.I32 q0, q0, d0[0]
13497{
13498 int32_t vlane;
13499 int32x4_t c;
13500 vlane = vget_lane_s32(v, l);
13501 c = vdupq_n_s32(vlane);
13502 return vmlaq_s32(a,b,c);
13503}
13504
13505_NEON2SSE_GLOBAL uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0]
13506#define vmlaq_lane_u16 vmlaq_lane_s16
13507
13508_NEON2SSE_GLOBAL uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0]
13509#define vmlaq_lane_u32 vmlaq_lane_s32
13510
13511_NEON2SSESTORAGE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0, q0, d0[0]
13512_NEON2SSE_INLINE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l) // VMLA.F32 q0, q0, d0[0]
13513{
13514 float32_t vlane;
13515 float32x4_t c;
13516 vlane = vget_lane_f32(v, l);
13517 c = vdupq_n_f32(vlane);
13518 return vmlaq_f32(a,b,c);
13519}
13520
13521//***************** Vector widening multiply accumulate by scalar **********************
13522//***************************************************************************************
13523_NEON2SSESTORAGE int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLAL.S16 q0, d0, d0[0]
13524_NEON2SSE_INLINE int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLAL.S16 q0, d0, d0[0]
13525{
13526 int16_t vlane;
13527 int16x4_t c;
13528 vlane = vget_lane_s16(v, l);
13529 c = vdup_n_s16(vlane);
13530 return vmlal_s16(a, b, c);
13531}
13532
13533_NEON2SSESTORAGE int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLAL.S32 q0, d0, d0[0]
13534_NEON2SSE_INLINE int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLAL.S32 q0, d0, d0[0]
13535{
13536 int32_t vlane;
13537 int32x2_t c;
13538 vlane = vget_lane_s32(v, l);
13539 c = vdup_n_s32(vlane);
13540 return vmlal_s32(a, b, c);
13541}
13542
13543_NEON2SSESTORAGE uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.s16 q0, d0, d0[0]
13544_NEON2SSE_INLINE uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLAL.s16 q0, d0, d0[0]
13545{
13546 uint16_t vlane;
13547 uint16x4_t c;
13548 vlane = vget_lane_u16(v, l);
13549 c = vdup_n_u16(vlane);
13550 return vmlal_u16(a, b, c);
13551}
13552
13553_NEON2SSESTORAGE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0, d0, d0[0]
13554_NEON2SSE_INLINE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLAL.U32 q0, d0, d0[0]
13555{
13556 uint32_t vlane;
13557 uint32x2_t c;
13558 vlane = vget_lane_u32(v, l);
13559 c = vdup_n_u32(vlane);
13560 return vmlal_u32(a, b, c);
13561}
13562
13563// ******** Vector widening saturating doubling multiply accumulate by scalar *******************************
13564// ************************************************************************************************
13565_NEON2SSESTORAGE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0, d0, d0[0]
13566_NEON2SSE_INLINE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l)
13567{
13568 int16_t vlane;
13569 int16x4_t c;
13570 vlane = vget_lane_s16(v, l);
13571 c = vdup_n_s16(vlane);
13572 return vqdmlal_s16(a, b, c);
13573}
13574
13575_NEON2SSESTORAGE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0, d0, d0[0]
13576_NEON2SSE_INLINE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l)
13577{
13578 int32_t vlane;
    int32x2_t c;
13580 vlane = vget_lane_s32(v, l);
13581 c = vdup_n_s32(vlane);
13582 return vqdmlal_s32(a, b, c);
13583}
13584
13585// ****** Vector multiply subtract by scalar *****************
13586// *************************************************************
13587_NEON2SSESTORAGE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0, d0[0]
13588_NEON2SSE_INLINE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLS.I16 d0, d0, d0[0]
13589{
13590 int16_t vlane;
13591 int16x4_t c;
13592 vlane = vget_lane_s16(v, l);
13593 c = vdup_n_s16(vlane);
13594 return vmls_s16(a, b, c);
13595}
13596
13597_NEON2SSESTORAGE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0, d0[0]
13598_NEON2SSE_INLINE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLS.I32 d0, d0, d0[0]
13599{
13600 int32_t vlane;
13601 int32x2_t c;
13602 vlane = vget_lane_s32(v, l);
13603 c = vdup_n_s32(vlane);
13604 return vmls_s32(a, b, c);
13605}
13606
13607_NEON2SSESTORAGE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0, d0[0]
13608_NEON2SSE_INLINE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLS.I16 d0, d0, d0[0]
13609{
13610 uint16_t vlane;
13611 uint16x4_t c;
    vlane = vget_lane_u16(v, l);
    c = vdup_n_u16(vlane);
    return vmls_u16(a, b, c);
13615}
13616
13617_NEON2SSESTORAGE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0, d0[0]
13618_NEON2SSE_INLINE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLS.I32 d0, d0, d0[0]
13619{
13620 uint32_t vlane;
13621 uint32x2_t c;
13622 vlane = vget_lane_u32(v, l);
13623 c = vdup_n_u32(vlane);
13624 return vmls_u32(a, b, c);
13625}
13626
13627_NEON2SSESTORAGE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0, d0, d0[0]
13628_NEON2SSE_INLINE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l)
13629{
13630 float32_t vlane;
13631 float32x2_t c;
13632 vlane = (float) vget_lane_f32(v, l);
13633 c = vdup_n_f32(vlane);
13634 return vmls_f32(a,b,c);
13635}
13636
13637_NEON2SSESTORAGE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0, d0[0]
13638_NEON2SSE_INLINE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l) // VMLS.I16 q0, q0, d0[0]
13639{
13640 int16_t vlane;
13641 int16x8_t c;
13642 vlane = vget_lane_s16(v, l);
13643 c = vdupq_n_s16(vlane);
13644 return vmlsq_s16(a, b,c);
13645}
13646
13647_NEON2SSESTORAGE int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0, d0[0]
13648_NEON2SSE_INLINE int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l) // VMLS.I32 q0, q0, d0[0]
13649{
13650 int32_t vlane;
13651 int32x4_t c;
13652 vlane = vget_lane_s32(v, l);
13653 c = vdupq_n_s32(vlane);
13654 return vmlsq_s32(a,b,c);
13655}
13656
_NEON2SSESTORAGE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0, d0[0]
_NEON2SSE_INLINE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l) // VMLS.I16 q0, q0, d0[0]
13659{
13660 uint16_t vlane;
13661 uint16x8_t c;
13662 vlane = vget_lane_u16(v, l);
13663 c = vdupq_n_u16(vlane);
13664 return vmlsq_u16(a,b,c);
13665}
13666
_NEON2SSESTORAGE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0, d0[0]
_NEON2SSE_INLINE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l) // VMLS.I32 q0, q0, d0[0]
13669{
13670 uint32_t vlane;
13671 uint32x4_t c;
13672 vlane = vget_lane_u32(v, l);
13673 c = vdupq_n_u32(vlane);
13674 return vmlsq_u32(a,b,c);
13675}
13676
_NEON2SSESTORAGE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 q0, q0, d0[0]
_NEON2SSE_INLINE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l) // VMLS.F32 q0, q0, d0[0]
13679{
13680 float32_t vlane;
13681 float32x4_t c;
13682 vlane = (float) vget_lane_f32(v, l);
13683 c = vdupq_n_f32(vlane);
13684 return vmlsq_f32(a,b,c);
13685}
13686
13687// **** Vector widening multiply subtract by scalar ****
13688// ****************************************************
_NEON2SSESTORAGE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLSL.S16 q0, d0, d0[0]
_NEON2SSE_INLINE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLSL.S16 q0, d0, d0[0]
13691{
13692 int16_t vlane;
13693 int16x4_t c;
13694 vlane = vget_lane_s16(v, l);
13695 c = vdup_n_s16(vlane);
13696 return vmlsl_s16(a, b, c);
13697}
13698
_NEON2SSESTORAGE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLSL.S32 q0, d0, d0[0]
_NEON2SSE_INLINE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLSL.S32 q0, d0, d0[0]
13701{
13702 int32_t vlane;
13703 int32x2_t c;
13704 vlane = vget_lane_s32(v, l);
13705 c = vdup_n_s32(vlane);
13706 return vmlsl_s32(a, b, c);
13707}
13708
_NEON2SSESTORAGE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLSL.U16 q0, d0, d0[0]
_NEON2SSE_INLINE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLSL.U16 q0, d0, d0[0]
13711{
13712 uint16_t vlane;
13713 uint16x4_t c;
13714 vlane = vget_lane_u16(v, l);
13715 c = vdup_n_u16(vlane);
13716 return vmlsl_u16(a, b, c);
13717}
13718
_NEON2SSESTORAGE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLSL.U32 q0, d0, d0[0]
_NEON2SSE_INLINE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLSL.U32 q0, d0, d0[0]
13721{
13722 uint32_t vlane;
13723 uint32x2_t c;
13724 vlane = vget_lane_u32(v, l);
13725 c = vdup_n_u32(vlane);
13726 return vmlsl_u32(a, b, c);
13727}
13728
13729//********* Vector widening saturating doubling multiply subtract by scalar **************************
13730//******************************************************************************************************
13731_NEON2SSESTORAGE int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0, d0, d0[0]
13732_NEON2SSE_INLINE int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l)
13733{
13734 int16_t vlane;
13735 int16x4_t c;
13736 vlane = vget_lane_s16(v, l);
13737 c = vdup_n_s16(vlane);
13738 return vqdmlsl_s16(a, b, c);
13739}
13740
13741_NEON2SSESTORAGE int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0, d0, d0[0]
13742_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l), _NEON2SSE_REASON_SLOW_SERIAL)
13743{
13744 int32_t vlane;
13745 int32x2_t c;
13746 vlane = vget_lane_s32(v, l);
13747 c = vdup_n_s32(vlane);
13748 return vqdmlsl_s32(a, b, c);
13749}
13750//********** Vector multiply with scalar *****************************
13751_NEON2SSESTORAGE int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0]
13752_NEON2SSE_INLINE int16x4_t vmul_n_s16(int16x4_t a, int16_t b) // VMUL.I16 d0,d0,d0[0]
13753{
13754 int16x4_t b16x4;
13755 b16x4 = vdup_n_s16(b);
13756 return vmul_s16(a, b16x4);
13757}
13758
13759_NEON2SSESTORAGE int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0]
13760_NEON2SSE_INLINE int32x2_t vmul_n_s32(int32x2_t a, int32_t b) // VMUL.I32 d0,d0,d0[0]
13761{
13762 //serial solution looks faster
13763 int32x2_t b32x2;
13764 b32x2 = vdup_n_s32(b);
13765 return vmul_s32(a, b32x2);
13766}
13767
13768_NEON2SSESTORAGE float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0]
13769_NEON2SSE_INLINE float32x2_t vmul_n_f32(float32x2_t a, float32_t b) // VMUL.F32 d0,d0,d0[0]
13770{
13771 float32x2_t b32x2;
13772 b32x2 = vdup_n_f32(b);
13773 return vmul_f32(a, b32x2);
13774}
13775
13776_NEON2SSESTORAGE uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0]
13777_NEON2SSE_INLINE uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b) // VMUL.I16 d0,d0,d0[0]
13778{
13779 uint16x4_t b16x4;
13780 b16x4 = vdup_n_s16(b);
13781 return vmul_s16(a, b16x4);
13782}
13783
13784_NEON2SSESTORAGE uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0]
13785_NEON2SSE_INLINE uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b) // VMUL.I32 d0,d0,d0[0]
13786{
13787 //serial solution looks faster
13788 uint32x2_t b32x2;
13789 b32x2 = vdup_n_u32(b);
13790 return vmul_u32(a, b32x2);
13791}
13792
13793_NEON2SSESTORAGE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0]
13794_NEON2SSE_INLINE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b) // VMUL.I16 q0,q0,d0[0]
13795{
13796 int16x8_t b16x8;
13797 b16x8 = vdupq_n_s16(b);
13798 return vmulq_s16(a, b16x8);
13799}
13800
13801_NEON2SSESTORAGE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0]
13802_NEON2SSE_INLINE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b) // VMUL.I32 q0,q0,d0[0]
13803{
13804 int32x4_t b32x4;
13805 b32x4 = vdupq_n_s32(b);
13806 return vmulq_s32(a, b32x4);
13807}
13808
13809_NEON2SSESTORAGE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0]
13810_NEON2SSE_INLINE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b) // VMUL.F32 q0,q0,d0[0]
13811{
13812 float32x4_t b32x4;
13813 b32x4 = vdupq_n_f32(b);
13814 return vmulq_f32(a, b32x4);
13815}
13816
13817_NEON2SSESTORAGE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0]
13818_NEON2SSE_INLINE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b) // VMUL.I16 q0,q0,d0[0]
13819{
13820 uint16x8_t b16x8;
13821 b16x8 = vdupq_n_s16(b);
13822 return vmulq_s16(a, b16x8);
13823}
13824
13825_NEON2SSESTORAGE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0]
13826_NEON2SSE_INLINE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b) // VMUL.I32 q0,q0,d0[0]
13827{
13828 uint32x4_t b32x4;
13829 b32x4 = vdupq_n_u32(b);
13830 return vmulq_u32(a, b32x4);
13831}
13832
13833//********** Vector multiply lane *****************************
13834_NEON2SSESTORAGE int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c);
13835_NEON2SSE_INLINE int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c)
13836{
13837 int16x4_t b16x4;
13838 int16_t vlane;
13839 vlane = vget_lane_s16(b, c);
13840 b16x4 = vdup_n_s16(vlane);
13841 return vmul_s16(a, b16x4);
13842}
13843
13844_NEON2SSESTORAGE int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c);
13845_NEON2SSE_INLINE int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c)
13846{
13847 int32x2_t b32x2;
13848 int32_t vlane;
13849 vlane = vget_lane_s32(b, c);
13850 b32x2 = vdup_n_s32(vlane);
13851 return vmul_s32(a, b32x2);
13852}
13853
13854_NEON2SSESTORAGE float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c);
13855_NEON2SSE_INLINE float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c)
13856{
13857 float32x2_t b32x2;
13858 float32_t vlane;
13859 vlane = vget_lane_f32(b, c);
13860 b32x2 = vdup_n_f32(vlane);
13861 return vmul_f32(a, b32x2);
13862}
13863
13864_NEON2SSE_GLOBAL uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c);
13865#define vmul_lane_u16 vmul_lane_s16
13866
13867_NEON2SSE_GLOBAL uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c);
13868#define vmul_lane_u32 vmul_lane_s32
13869
13870_NEON2SSESTORAGE int16x8_t vmulq_lane_s16(int16x8_t a, int16x4_t b, __constrange(0,3) int c);
13871_NEON2SSE_INLINE int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c)
13872{
13873 int16x8_t b16x8;
13874 int16_t vlane;
13875 vlane = vget_lane_s16(b, c);
13876 b16x8 = vdupq_n_s16(vlane);
13877 return vmulq_s16(a, b16x8);
13878}
13879
13880_NEON2SSESTORAGE int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c);
13881_NEON2SSE_INLINE int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c)
13882{
13883 int32x4_t b32x4;
13884 int32_t vlane;
13885 vlane = vget_lane_s32(b, c);
13886 b32x4 = vdupq_n_s32(vlane);
13887 return vmulq_s32(a, b32x4);
13888}
13889
13890_NEON2SSESTORAGE float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c);
13891_NEON2SSE_INLINE float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c)
13892{
13893 float32x4_t b32x4;
13894 float32_t vlane;
13895 vlane = vget_lane_f32(b, c);
13896 b32x4 = vdupq_n_f32(vlane);
13897 return vmulq_f32(a, b32x4);
13898}
13899
13900_NEON2SSE_GLOBAL uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c);
13901#define vmulq_lane_u16 vmulq_lane_s16
13902
13903_NEON2SSE_GLOBAL uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c);
13904#define vmulq_lane_u32 vmulq_lane_s32
13905
13906//**** Vector long multiply with scalar ************
13907_NEON2SSESTORAGE int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0]
13908_NEON2SSE_INLINE int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2) // VMULL.S16 q0,d0,d0[0]
13909{
13910 int16x4_t b16x4;
13911 b16x4 = vdup_n_s16(val2);
13912 return vmull_s16(vec1, b16x4);
13913}
13914
13915_NEON2SSESTORAGE int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0]
13916_NEON2SSE_INLINE int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2) // VMULL.S32 q0,d0,d0[0]
13917{
13918 int32x2_t b32x2;
13919 b32x2 = vdup_n_s32(val2);
13920 return vmull_s32(vec1, b32x2);
13921}
13922
13923_NEON2SSESTORAGE uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.s16 q0,d0,d0[0]
13924_NEON2SSE_INLINE uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2) // VMULL.s16 q0,d0,d0[0]
13925{
13926 uint16x4_t b16x4;
13927 b16x4 = vdup_n_s16(val2);
13928 return vmull_u16(vec1, b16x4);
13929}
13930
13931_NEON2SSESTORAGE uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0]
13932_NEON2SSE_INLINE uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2) // VMULL.U32 q0,d0,d0[0]
13933{
13934 uint32x2_t b32x2;
13935 b32x2 = vdup_n_u32(val2);
13936 return vmull_u32(vec1, b32x2);
13937}
13938
13939//**** Vector long multiply by scalar ****
13940_NEON2SSESTORAGE int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0]
13941_NEON2SSE_INLINE int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VMULL.S16 q0,d0,d0[0]
13942{
13943 int16_t vlane;
13944 int16x4_t b;
13945 vlane = vget_lane_s16(val2, val3);
13946 b = vdup_n_s16(vlane);
13947 return vmull_s16(vec1, b);
13948}
13949
13950_NEON2SSESTORAGE int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0]
13951_NEON2SSE_INLINE int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3) // VMULL.S32 q0,d0,d0[0]
13952{
13953 int32_t vlane;
13954 int32x2_t b;
13955 vlane = vget_lane_s32(val2, val3);
13956 b = vdup_n_s32(vlane);
13957 return vmull_s32(vec1, b);
13958}
13959
_NEON2SSESTORAGE uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.U16 q0,d0,d0[0]
_NEON2SSE_INLINE uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3) // VMULL.U16 q0,d0,d0[0]
13962{
13963 uint16_t vlane;
13964 uint16x4_t b;
    vlane = vget_lane_u16(val2, val3);
    b = vdup_n_u16(vlane);
13967 return vmull_u16(vec1, b);
13968}
13969
13970_NEON2SSESTORAGE uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0]
13971_NEON2SSE_INLINE uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3) // VMULL.U32 q0,d0,d0[0]
13972{
13973 uint32_t vlane;
13974 uint32x2_t b;
13975 vlane = vget_lane_u32(val2, val3);
13976 b = vdup_n_u32(vlane);
13977 return vmull_u32(vec1, b);
13978}
13979
13980//********* Vector saturating doubling long multiply with scalar *******************
13981_NEON2SSESTORAGE int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0]
13982_NEON2SSE_INLINE int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2)
13983{
    //the serial solution may be faster due to saturation
13985 int16x4_t b;
13986 b = vdup_n_s16(val2);
13987 return vqdmull_s16(vec1, b);
13988}
13989
13990_NEON2SSESTORAGE int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0]
13991_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_SERIAL)
13992{
13993 int32x2_t b;
13994 b = vdup_n_s32(val2);
13995 return vqdmull_s32(vec1,b); //slow serial function!!!!
13996}
13997
13998//************* Vector saturating doubling long multiply by scalar ***********************************************
13999_NEON2SSESTORAGE int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0]
14000_NEON2SSE_INLINE int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3)
14001{
14002 int16_t c;
14003 int16x4_t scalar;
14004 c = vget_lane_s16(val2, val3);
14005 scalar = vdup_n_s16(c);
14006 return vqdmull_s16(vec1, scalar);
14007}
14008
14009
14010_NEON2SSESTORAGE int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULL.S32 q0,d0,d0[0]
14011_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_SERIAL)
14012{
14013 int32_t c;
14014 int32x2_t scalar;
14015 c = vget_lane_s32(val2, val3);
14016 scalar = vdup_n_s32(c);
14017 return vqdmull_s32(vec1,scalar); //slow serial function!!!!
14018}
14019
14020// *****Vector saturating doubling multiply high with scalar *****
14021_NEON2SSESTORAGE int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQDMULH.S16 d0,d0,d0[0]
14022_NEON2SSE_INLINE int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2)
14023{
14024 int16x4_t res64;
14025 return64(vqdmulhq_n_s16(_pM128i(vec1), val2));
14026}
14027
14028_NEON2SSESTORAGE int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQDMULH.S32 d0,d0,d0[0]
14029_NEON2SSE_INLINE int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2)
14030{
14031 int32x2_t res64;
14032 return64(vqdmulhq_n_s32(_pM128i(vec1), val2));
14033}
14034
14035_NEON2SSESTORAGE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQDMULH.S16 q0,q0,d0[0]
14036_NEON2SSE_INLINE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2) // VQDMULH.S16 q0,q0,d0[0]
14037{
    //solution may not be optimal
14039 int16x8_t scalar;
14040 scalar = vdupq_n_s16(val2);
14041 return vqdmulhq_s16(vec1, scalar);
14042}
14043
14044_NEON2SSESTORAGE int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQDMULH.S32 q0,q0,d0[0]
14045_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
14046{
14047 int32x4_t scalar;
14048 scalar = vdupq_n_s32(val2);
14049 return vqdmulhq_s32(vec1, scalar);
14050}
14051
14052//***** Vector saturating doubling multiply high by scalar ****************
14053_NEON2SSESTORAGE int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 d0,d0,d0[0]
14054_NEON2SSE_INLINE int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQDMULH.S16 d0,d0,d0[0]
14055{
    //solution may not be optimal
14057 int16_t vlane;
14058 int16x4_t scalar;
14059 vlane = vget_lane_s16(val2, val3);
14060 scalar = vdup_n_s16(vlane);
14061 return vqdmulh_s16(vec1, scalar);
14062}
14063
14064_NEON2SSESTORAGE int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 d0,d0,d0[0]
14065_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
14066{
14067 int32_t vlane;
14068 int32x2_t scalar;
14069 vlane = vget_lane_s32(val2, val3);
14070 scalar = vdup_n_s32(vlane);
14071 return vqdmulh_s32(vec1, scalar);
14072}
14073
14074_NEON2SSESTORAGE int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 q0,q0,d0[0]
14075_NEON2SSE_INLINE int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQDMULH.S16 q0,q0,d0[0]
14076{
    //solution may not be optimal
14078 int16_t vlane;
14079 int16x8_t scalar;
14080 vlane = vget_lane_s16(val2, val3);
14081 scalar = vdupq_n_s16(vlane );
14082 return vqdmulhq_s16(vec1, scalar);
14083}
14084
14085_NEON2SSESTORAGE int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 q0,q0,d0[0]
14086_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
14087{
    //solution may not be optimal
14089 int32_t vlane;
14090 int32x4_t scalar;
14091 vlane = vgetq_lane_s32(_pM128i(val2), val3);
14092 scalar = vdupq_n_s32(vlane );
14093 return vqdmulhq_s32(vec1, scalar);
14094}
14095
14096//******** Vector saturating rounding doubling multiply high with scalar ***
14097_NEON2SSESTORAGE int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0]
14098_NEON2SSE_INLINE int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2) // VQRDMULH.S16 d0,d0,d0[0]
14099{
    //solution may not be optimal
14101 int16x4_t scalar;
14102 scalar = vdup_n_s16(val2);
14103 return vqrdmulh_s16(vec1, scalar);
14104}
14105
14106_NEON2SSESTORAGE int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQRDMULH.S32 d0,d0,d0[0]
14107_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
14108{
14109 int32x2_t scalar;
14110 scalar = vdup_n_s32(val2);
14111 return vqrdmulh_s32(vec1, scalar);
14112}
14113
14114_NEON2SSESTORAGE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0]
14115_NEON2SSE_INLINE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2) // VQRDMULH.S16 q0,q0,d0[0]
14116{
    //solution may not be optimal
14118 int16x8_t scalar;
14119 scalar = vdupq_n_s16(val2);
14120 return vqrdmulhq_s16(vec1, scalar);
14121}
14122
14123_NEON2SSESTORAGE int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0]
14124_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
14125{
14126 int32x4_t scalar;
14127 scalar = vdupq_n_s32(val2);
14128 return vqrdmulhq_s32(vec1, scalar);
14129}
14130
14131//********* Vector rounding saturating doubling multiply high by scalar ****
14132_NEON2SSESTORAGE int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0]
14133_NEON2SSE_INLINE int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQRDMULH.S16 d0,d0,d0[0]
14134{
    //solution may not be optimal
14136 int16_t vlane;
14137 int16x4_t scalar;
14138 vlane = vget_lane_s16(val2, val3);
14139 scalar = vdup_n_s16(vlane);
14140 return vqrdmulh_s16(vec1, scalar);
14141}
14142
14143_NEON2SSESTORAGE int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0]
14144_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
14145{
14146 int32_t vlane;
14147 int32x2_t scalar;
14148 vlane = vget_lane_s32(val2, val3);
14149 scalar = vdup_n_s32(vlane);
14150 return vqrdmulh_s32(vec1, scalar);
14151}
14152
14153_NEON2SSESTORAGE int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0]
14154_NEON2SSE_INLINE int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQRDMULH.S16 q0,q0,d0[0]
14155{
    //solution may not be optimal
14157 int16_t vlane;
14158 int16x8_t scalar;
14159 vlane = vget_lane_s16(val2, val3);
14160 scalar = vdupq_n_s16(vlane);
14161 return vqrdmulhq_s16(vec1, scalar);
14162}
14163
14164_NEON2SSESTORAGE int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0]
14165_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
14166{
    //solution may not be optimal
14168 int32_t vlane;
14169 int32x4_t scalar;
14170 vlane = vgetq_lane_s32(_pM128i(val2), val3);
14171 scalar = vdupq_n_s32(vlane );
14172 return vqrdmulhq_s32(vec1, scalar);
14173}
14174
14175//**************Vector multiply accumulate with scalar *******************
14176_NEON2SSESTORAGE int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0]
14177_NEON2SSE_INLINE int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) // VMLA.I16 d0, d0, d0[0]
14178{
14179 int16x4_t scalar;
14180 scalar = vdup_n_s16(c);
14181 return vmla_s16(a, b, scalar);
14182}
14183
14184_NEON2SSESTORAGE int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0]
14185_NEON2SSE_INLINE int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) // VMLA.I32 d0, d0, d0[0]
14186{
14187 int32x2_t scalar;
14188 scalar = vdup_n_s32(c);
14189 return vmla_s32(a, b, scalar);
14190}
14191
14192_NEON2SSE_GLOBAL uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLA.I16 d0, d0, d0[0]
14193#define vmla_n_u16 vmla_n_s16
14194
14195
14196_NEON2SSE_GLOBAL uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0]
14197#define vmla_n_u32 vmla_n_s32
14198
14199
14200_NEON2SSESTORAGE float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0]
14201_NEON2SSE_INLINE float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) // VMLA.F32 d0, d0, d0[0]
14202{
14203 float32x2_t scalar;
14204 scalar = vdup_n_f32(c);
14205 return vmla_f32(a, b, scalar);
14206}
14207
14208_NEON2SSESTORAGE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0]
14209_NEON2SSE_INLINE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLA.I16 q0, q0, d0[0]
14210{
14211 int16x8_t scalar;
14212 scalar = vdupq_n_s16(c);
14213 return vmlaq_s16(a,b,scalar);
14214}
14215
14216_NEON2SSESTORAGE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0]
14217_NEON2SSE_INLINE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLA.I32 q0, q0, d0[0]
14218{
14219 int32x4_t scalar;
14220 scalar = vdupq_n_s32(c);
14221 return vmlaq_s32(a,b,scalar);
14222}
14223
14224_NEON2SSE_GLOBAL uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0]
14225#define vmlaq_n_u16 vmlaq_n_s16
14226
14227_NEON2SSE_GLOBAL uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0]
14228#define vmlaq_n_u32 vmlaq_n_s32
14229
14230_NEON2SSESTORAGE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0]
14231_NEON2SSE_INLINE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) // VMLA.F32 q0, q0, d0[0]
14232{
14233 float32x4_t scalar;
14234 scalar = vdupq_n_f32(c);
14235 return vmlaq_f32(a,b,scalar);
14236}
14237
14238//************Vector widening multiply accumulate with scalar****************************
14239_NEON2SSESTORAGE int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0]
14240_NEON2SSE_INLINE int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) // VMLAL.S16 q0, d0, d0[0]
14241{
14242 int16x4_t vc;
14243 vc = vdup_n_s16(c);
14244 return vmlal_s16(a, b, vc);
14245}
14246
14247_NEON2SSESTORAGE int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0]
14248_NEON2SSE_INLINE int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) // VMLAL.S32 q0, d0, d0[0]
14249{
14250 int32x2_t vc;
14251 vc = vdup_n_s32(c);
14252 return vmlal_s32(a, b, vc);
14253}
14254
_NEON2SSESTORAGE uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.U16 q0, d0, d0[0]
_NEON2SSE_INLINE uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) // VMLAL.U16 q0, d0, d0[0]
14257{
14258 uint16x4_t vc;
14259 vc = vdup_n_u16(c);
14260 return vmlal_u16(a, b, vc);
14261}
14262
14263_NEON2SSESTORAGE uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0]
14264_NEON2SSE_INLINE uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) // VMLAL.U32 q0, d0, d0[0]
14265{
14266 uint32x2_t vc;
14267 vc = vdup_n_u32(c);
14268 return vmlal_u32(a, b, vc);
14269}
14270
14271//************ Vector widening saturating doubling multiply accumulate with scalar **************
14272_NEON2SSESTORAGE int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0]
14273_NEON2SSE_INLINE int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c)
14274{
    //not an optimal SIMD solution, the serial one may be faster
14276 int16x4_t vc;
14277 vc = vdup_n_s16(c);
14278 return vqdmlal_s16(a, b, vc);
14279}
14280
14281_NEON2SSESTORAGE int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0]
14282_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c), _NEON2SSE_REASON_SLOW_SERIAL)
14283{
14284 int32x2_t vc;
14285 vc = vdup_n_s32(c);
14286 return vqdmlal_s32(a, b, vc);
14287}
14288
14289//******** Vector multiply subtract with scalar **************
14290_NEON2SSESTORAGE int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0]
14291_NEON2SSE_INLINE int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) // VMLS.I16 d0, d0, d0[0]
14292{
14293 int16x4_t vc;
14294 vc = vdup_n_s16(c);
14295 return vmls_s16(a, b, vc);
14296}
14297
14298_NEON2SSESTORAGE int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0]
14299_NEON2SSE_INLINE int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) // VMLS.I32 d0, d0, d0[0]
14300{
14301 int32x2_t vc;
14302 vc = vdup_n_s32(c);
14303 return vmls_s32(a, b, vc);
14304}
14305
14306_NEON2SSESTORAGE uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0]
14307_NEON2SSE_INLINE uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) // VMLS.I16 d0, d0, d0[0]
14308{
14309 uint16x4_t vc;
    vc = vdup_n_u16(c);
    return vmls_u16(a, b, vc);
14312}
14313
14314_NEON2SSESTORAGE uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0]
14315_NEON2SSE_INLINE uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) // VMLS.I32 d0, d0, d0[0]
14316{
14317 uint32x2_t vc;
14318 vc = vdup_n_u32(c);
14319 return vmls_u32(a, b, vc);
14320}
14321
14322_NEON2SSESTORAGE float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0]
14323_NEON2SSE_INLINE float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c)
14324{
14325 float32x2_t res;
14326 res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0] * c;
14327 res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1] * c;
14328 return res;
14329}
14330
14331_NEON2SSESTORAGE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0]
14332_NEON2SSE_INLINE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLS.I16 q0, q0, d0[0]
14333{
14334 int16x8_t vc;
14335 vc = vdupq_n_s16(c);
14336 return vmlsq_s16(a, b,vc);
14337}
14338
14339_NEON2SSESTORAGE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0]
14340_NEON2SSE_INLINE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLS.I32 q0, q0, d0[0]
14341{
14342 int32x4_t vc;
14343 vc = vdupq_n_s32(c);
14344 return vmlsq_s32(a,b,vc);
14345}
14346
14347_NEON2SSESTORAGE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0]
14348_NEON2SSE_INLINE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) // VMLS.I16 q0, q0, d0[0]
14349{
14350 uint16x8_t vc;
14351 vc = vdupq_n_u16(c);
14352 return vmlsq_u16(a,b,vc);
14353}
14354
14355_NEON2SSESTORAGE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0]
14356_NEON2SSE_INLINE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) // VMLS.I32 q0, q0, d0[0]
14357{
14358 uint32x4_t vc;
14359 vc = vdupq_n_u32(c);
14360 return vmlsq_u32(a,b,vc);
14361}
14362
14363_NEON2SSESTORAGE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0]
14364_NEON2SSE_INLINE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c)
14365{
14366 float32x4_t vc;
14367 vc = vdupq_n_f32(c);
14368 return vmlsq_f32(a,b,vc);
14369}
14370
14371//**** Vector widening multiply subtract with scalar ******
14372_NEON2SSESTORAGE int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0]
14373_NEON2SSE_INLINE int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) // VMLSL.S16 q0, d0, d0[0]
14374{
14375 int16x4_t vc;
14376 vc = vdup_n_s16(c);
14377 return vmlsl_s16(a, b, vc);
14378}
14379
14380_NEON2SSESTORAGE int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0]
14381_NEON2SSE_INLINE int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) // VMLSL.S32 q0, d0, d0[0]
14382{
14383 int32x2_t vc;
14384 vc = vdup_n_s32(c);
14385 return vmlsl_s32(a, b, vc);
14386}
14387
_NEON2SSESTORAGE uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLSL.U16 q0, d0, d0[0]
_NEON2SSE_INLINE uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) // VMLSL.U16 q0, d0, d0[0]
14390{
14391 uint16x4_t vc;
14392 vc = vdup_n_u16(c);
14393 return vmlsl_u16(a, b, vc);
14394}
14395
14396_NEON2SSESTORAGE uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0]
14397_NEON2SSE_INLINE uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) // VMLSL.U32 q0, d0, d0[0]
14398{
14399 uint32x2_t vc;
14400 vc = vdup_n_u32(c);
14401 return vmlsl_u32(a, b, vc);
14402}
14403
14404//***** Vector widening saturating doubling multiply subtract with scalar *********
14405//**********************************************************************************
14406_NEON2SSESTORAGE int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0]
14407_NEON2SSE_INLINE int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c)
14408{
14409 int16x4_t vc;
14410 vc = vdup_n_s16(c);
14411 return vqdmlsl_s16(a, b, vc);
14412}
14413
14414_NEON2SSESTORAGE int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0]
14415_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c), _NEON2SSE_REASON_SLOW_SERIAL)
14416{
14417 int32x2_t vc;
14418 vc = vdup_n_s32(c);
14419 return vqdmlsl_s32(a, b, vc);
14420}
14421
14422//******************* Vector extract ***********************************************
14423//*************************************************************************************
14424//VEXT (Vector Extract) extracts elements from the bottom end of the second operand
14425//vector and the top end of the first, concatenates them, and places the result in the destination vector
//the result takes c elements from the bottom end of the second operand and (n-c) elements from the top end of the first, where n is the number of elements
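//A minimal usage sketch of the mapping below (illustrative only, kept out of compilation;
//the helper name and the expected values are ours, assuming the vld1q/vst1q/vdupq_n
//intrinsics defined earlier in this header):
#if 0
static void neon2sse_example_vext(void)
{
    _NEON2SSE_ALIGN_16 uint8_t in[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
    _NEON2SSE_ALIGN_16 uint8_t out[16];
    uint8x16_t va = vld1q_u8(in);       //0..15
    uint8x16_t vb = vdupq_n_u8(0xff);   //all 0xff
    //the top 13 bytes of va followed by the bottom 3 bytes of vb,
    //i.e. the same byte-wise concatenation _mm_alignr_epi8(b, a, 3) performs
    uint8x16_t r = vextq_u8(va, vb, 3);
    vst1q_u8(out, r);                   //out = {3,4,...,15, 0xff,0xff,0xff}
}
#endif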
14427_NEON2SSESTORAGE int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
14428_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c),_NEON2SSE_REASON_SLOW_SERIAL)
14429{
14430 int8x8_t res;
14431 int i;
14432 for (i = 0; i<8 - c; i++) {
14433 res.m64_i8[i] = a.m64_i8[i + c];
14434 }
14435 for(i = 0; i<c; i++) {
14436 res.m64_i8[8 - c + i] = b.m64_i8[i];
14437 }
14438 return res;
14439}
14440
14441_NEON2SSE_GLOBAL uint8x8_t vext_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
14442#define vext_u8 vext_s8
14443//same result tested
14444
14445_NEON2SSE_GLOBAL poly8x8_t vext_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
14446#define vext_p8 vext_u8
14447
14448_NEON2SSESTORAGE int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c), _NEON2SSE_REASON_SLOW_SERIAL)
14450{
14451 int16x4_t res;
14452 int i;
14453 for (i = 0; i<4 - c; i++) {
14454 res.m64_i16[i] = a.m64_i16[i + c];
14455 }
14456 for(i = 0; i<c; i++) {
14457 res.m64_i16[4 - c + i] = b.m64_i16[i];
14458 }
14459 return res;
14460}
14461
14462_NEON2SSE_GLOBAL uint16x4_t vext_u16(uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
14463#define vext_u16 vext_s16
14464
14465_NEON2SSE_GLOBAL poly16x4_t vext_p16(poly16x4_t a, poly16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
14466#define vext_p16 vext_s16
14467
14468_NEON2SSESTORAGE int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
14469_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c), _NEON2SSE_REASON_SLOW_SERIAL)
14470{
14471 int32x2_t res;
14472 if (c==0) {
14473 res.m64_i32[0] = a.m64_i32[0];
14474 res.m64_i32[1] = a.m64_i32[1];
14475 } else {
14476 res.m64_i32[0] = a.m64_i32[1];
14477 res.m64_i32[1] = b.m64_i32[0];
14478 }
14479 return res;
14480}
14481
14482_NEON2SSESTORAGE float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
14483_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c), _NEON2SSE_REASON_SLOW_SERIAL)
14484{
14485 float32x2_t res;
14486 if (c==0) {
14487 res.m64_f32[0] = a.m64_f32[0];
14488 res.m64_f32[1] = a.m64_f32[1];
14489 } else {
14490 res.m64_f32[0] = a.m64_f32[1];
14491 res.m64_f32[1] = b.m64_f32[0];
14492 }
14493 return res;
14494}
14495
14496_NEON2SSE_GLOBAL uint32x2_t vext_u32(uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
14497#define vext_u32 vext_s32
14498
14499
14500_NEON2SSE_GLOBAL int64x1_t vext_s64(int64x1_t a, int64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
14501#define vext_s64(a,b,c) a
14502
14503_NEON2SSE_GLOBAL uint64x1_t vext_u64(uint64x1_t a, uint64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
14504#define vext_u64(a,b,c) a
14505
14506_NEON2SSE_GLOBAL int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
14507#define vextq_s8(a,b,c) _MM_ALIGNR_EPI8 (b,a,c)
14508
14509_NEON2SSE_GLOBAL uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
14510#define vextq_u8(a,b,c) _MM_ALIGNR_EPI8 (b,a,c)
14511
14512_NEON2SSE_GLOBAL poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
14513#define vextq_p8 vextq_s8
14514
14515_NEON2SSE_GLOBAL int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
14516#define vextq_s16(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 2)
14517
14518_NEON2SSE_GLOBAL uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
14519#define vextq_u16(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 2)
14520
14521_NEON2SSE_GLOBAL poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
14522#define vextq_p16 vextq_s16
14523
14524_NEON2SSE_GLOBAL int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
14525#define vextq_s32(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 4)
14526
14527_NEON2SSE_GLOBAL uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
14528#define vextq_u32(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 4)
14529
_NEON2SSE_GLOBAL float32x4_t vextq_f32(float32x4_t a, float32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
14531#define vextq_f32(a,b,c) _M128(vextq_s32(_M128i(a),_M128i(b),c) )
14532
14533_NEON2SSE_GLOBAL int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
14534#define vextq_s64(a,b,c) _MM_ALIGNR_EPI8(b,a,c * 8)
14535
14536_NEON2SSE_GLOBAL uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
14537#define vextq_u64(a,b,c) _MM_ALIGNR_EPI8(b,a,c * 8)
14538
14539//************ Reverse vector elements (swap endianness)*****************
14540//*************************************************************************
14541//VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide.
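//A small illustrative sketch (kept out of compilation; the helper name is ours): VREV64 on
//32-bit lanes simply swaps the two lanes inside each 64-bit half, which is what the
//_mm_shuffle_epi32/_mm_shuffle_ps mappings below do.
#if 0
static void neon2sse_example_vrev64(void)
{
    _NEON2SSE_ALIGN_16 int32_t in[4] = {0, 1, 2, 3};
    _NEON2SSE_ALIGN_16 int32_t out[4];
    int32x4_t v = vld1q_s32(in);
    vst1q_s32(out, vrev64q_s32(v));     //out = {1, 0, 3, 2}
}
#endif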
14542_NEON2SSESTORAGE int8x8_t vrev64_s8(int8x8_t vec); // VREV64.8 d0,d0
14543_NEON2SSE_INLINE int8x8_t vrev64_s8(int8x8_t vec)
14544{
14545 int8x8_t res64;
14546 __m128i res;
14547 res = vrev64q_s8(_pM128i(vec));
14548 return64(res);
14549}
14550
14551_NEON2SSESTORAGE int16x4_t vrev64_s16(int16x4_t vec); // VREV64.16 d0,d0
14552_NEON2SSE_INLINE int16x4_t vrev64_s16(int16x4_t vec)
14553{
14554 int16x4_t res64;
14555 __m128i res;
14556 res = vrev64q_s16(_pM128i(vec));
14557 return64(res);
14558}
14559
14560_NEON2SSESTORAGE int32x2_t vrev64_s32(int32x2_t vec); // VREV64.32 d0,d0
14561_NEON2SSE_INLINE int32x2_t vrev64_s32(int32x2_t vec)
14562{
14563 int32x2_t res;
14564 res.m64_i32[0] = vec.m64_i32[1];
14565 res.m64_i32[1] = vec.m64_i32[0];
14566 return res;
14567}
14568
14569_NEON2SSE_GLOBAL uint8x8_t vrev64_u8(uint8x8_t vec); // VREV64.8 d0,d0
14570#define vrev64_u8 vrev64_s8
14571
14572_NEON2SSE_GLOBAL uint16x4_t vrev64_u16(uint16x4_t vec); // VREV64.16 d0,d0
14573#define vrev64_u16 vrev64_s16
14574
14575_NEON2SSE_GLOBAL uint32x2_t vrev64_u32(uint32x2_t vec); // VREV64.32 d0,d0
14576#define vrev64_u32 vrev64_s32
14577
14578_NEON2SSE_GLOBAL poly8x8_t vrev64_p8(poly8x8_t vec); // VREV64.8 d0,d0
14579#define vrev64_p8 vrev64_u8
14580
14581_NEON2SSE_GLOBAL poly16x4_t vrev64_p16(poly16x4_t vec); // VREV64.16 d0,d0
14582#define vrev64_p16 vrev64_u16
14583
14584_NEON2SSESTORAGE float32x2_t vrev64_f32(float32x2_t vec); // VREV64.32 d0,d0
14585_NEON2SSE_INLINE float32x2_t vrev64_f32(float32x2_t vec)
14586{
14587 float32x2_t res;
14588 res.m64_f32[0] = vec.m64_f32[1];
14589 res.m64_f32[1] = vec.m64_f32[0];
14590 return res;
14591}
14592
14593_NEON2SSESTORAGE int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0
14594_NEON2SSE_INLINE int8x16_t vrev64q_s8(int8x16_t vec) // VREV64.8 q0,q0
14595{
14596 _NEON2SSE_ALIGN_16 static const int8_t mask_rev_e8[16] = {7,6,5,4,3,2,1,0, 15,14,13,12,11,10,9, 8};
14597 return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev_e8);
14598}
14599
14600_NEON2SSESTORAGE int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0
14601_NEON2SSE_INLINE int16x8_t vrev64q_s16(int16x8_t vec) // VREV64.16 q0,q0
14602{
    //there is no _mm_shuffle_epi16, so _mm_shuffle_epi8 is used with the corresponding mask
14604 _NEON2SSE_ALIGN_16 static const int8_t mask_rev_e16[16] = {6,7, 4,5,2,3,0,1,14,15,12,13,10,11,8,9};
14605 return _mm_shuffle_epi8 (vec, *(__m128i*)mask_rev_e16);
14606}
14607
14608_NEON2SSESTORAGE int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0
14609_NEON2SSE_INLINE int32x4_t vrev64q_s32(int32x4_t vec) // VREV64.32 q0,q0
14610{
14611 return _mm_shuffle_epi32 (vec, 1 | (0 << 2) | (3 << 4) | (2 << 6) );
14612}
14613
14614_NEON2SSE_GLOBAL uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0
14615#define vrev64q_u8 vrev64q_s8
14616
14617_NEON2SSE_GLOBAL uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0
14618#define vrev64q_u16 vrev64q_s16
14619
14620_NEON2SSE_GLOBAL uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0
14621#define vrev64q_u32 vrev64q_s32
14622
14623_NEON2SSE_GLOBAL poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0
14624#define vrev64q_p8 vrev64q_u8
14625
14626_NEON2SSE_GLOBAL poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0
14627#define vrev64q_p16 vrev64q_u16
14628
14629_NEON2SSE_GLOBAL float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0
14630#define vrev64q_f32(vec) _mm_shuffle_ps (vec, vec, _MM_SHUFFLE(2,3, 0,1))
14631
//**************** Reverse elements within 32-bit sets (VREV32) ****************
14633//************************************************************
14634_NEON2SSESTORAGE int8x8_t vrev32_s8(int8x8_t vec); // VREV32.8 d0,d0
14635_NEON2SSE_INLINE int8x8_t vrev32_s8(int8x8_t vec)
14636{
14637 int8x8_t res64;
14638 __m128i res;
14639 res = vrev32q_s8(_pM128i(vec));
14640 return64(res);
14641}
14642
14643_NEON2SSESTORAGE int16x4_t vrev32_s16(int16x4_t vec); // VREV32.16 d0,d0
14644_NEON2SSE_INLINE int16x4_t vrev32_s16(int16x4_t vec)
14645{
14646 int16x4_t res64;
14647 __m128i res;
14648 res = vrev32q_s16(_pM128i(vec));
14649 return64(res);
14650}
14651
14652_NEON2SSE_GLOBAL uint8x8_t vrev32_u8(uint8x8_t vec); // VREV32.8 d0,d0
14653#define vrev32_u8 vrev32_s8
14654
14655_NEON2SSE_GLOBAL uint16x4_t vrev32_u16(uint16x4_t vec); // VREV32.16 d0,d0
14656#define vrev32_u16 vrev32_s16
14657
14658_NEON2SSE_GLOBAL poly8x8_t vrev32_p8(poly8x8_t vec); // VREV32.8 d0,d0
14659#define vrev32_p8 vrev32_u8
14660
14661_NEON2SSE_GLOBAL poly16x4_t vrev32_p16(poly16x4_t vec); // VREV32.16 d0,d0
14662#define vrev32_p16 vrev32_u16
14663
14664_NEON2SSESTORAGE int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0
14665_NEON2SSE_INLINE int8x16_t vrev32q_s8(int8x16_t vec) // VREV32.8 q0,q0
14666{
14667 _NEON2SSE_ALIGN_16 static const int8_t mask_rev_e8[16] = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
14668 return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev_e8);
14669}
14670
14671_NEON2SSESTORAGE int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0
14672_NEON2SSE_INLINE int16x8_t vrev32q_s16(int16x8_t vec) // VREV32.16 q0,q0
14673{
14674 _NEON2SSE_ALIGN_16 static const int8_t mask_rev_e8[16] = {2,3,0,1, 6,7, 4,5, 10,11, 8,9, 14,15,12,13};
14675 return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev_e8);
14676}
14677
14678_NEON2SSE_GLOBAL uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0
14679#define vrev32q_u8 vrev32q_s8
14680
14681_NEON2SSE_GLOBAL uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0
14682#define vrev32q_u16 vrev32q_s16
14683
14684_NEON2SSE_GLOBAL poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0
14685#define vrev32q_p8 vrev32q_u8
14686
14687_NEON2SSE_GLOBAL poly16x8_t vrev32q_p16(poly16x8_t vec); // VREV32.16 q0,q0
14688#define vrev32q_p16 vrev32q_u16
14689
//**************** Reverse elements within 16-bit sets (VREV16) ****************
14691//******************************************************
14692_NEON2SSESTORAGE int8x8_t vrev16_s8(int8x8_t vec); // VREV16.8 d0,d0
14693_NEON2SSE_INLINE int8x8_t vrev16_s8(int8x8_t vec)
14694{
14695 int8x8_t res64;
14696 __m128i res;
14697 res = vrev16q_s8(_pM128i(vec));
14698 return64(res);
14699}
14700
14701_NEON2SSE_GLOBAL uint8x8_t vrev16_u8(uint8x8_t vec); // VREV16.8 d0,d0
14702#define vrev16_u8 vrev16_s8
14703
14704_NEON2SSE_GLOBAL poly8x8_t vrev16_p8(poly8x8_t vec); // VREV16.8 d0,d0
14705#define vrev16_p8 vrev16_u8
14706
14707_NEON2SSESTORAGE int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0
14708_NEON2SSE_INLINE int8x16_t vrev16q_s8(int8x16_t vec) // VREV16.8 q0,q0
14709{
14710 _NEON2SSE_ALIGN_16 static const int8_t mask_rev8[16] = {1,0, 3,2, 5,4, 7,6, 9,8, 11, 10, 13, 12, 15, 14};
14711 return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev8);
14712}
14713
14714_NEON2SSE_GLOBAL uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0
14715#define vrev16q_u8 vrev16q_s8
14716
14717_NEON2SSE_GLOBAL poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0
14718#define vrev16q_p8 vrev16q_u8
14719
14720//*********************************************************************
14721//**************** Other single operand arithmetic *******************
14722//*********************************************************************
14723
14724//*********** Absolute: Vd[i] = |Va[i]| **********************************
14725//************************************************************************
14726_NEON2SSESTORAGE int8x8_t vabs_s8(int8x8_t a); // VABS.S8 d0,d0
14727_NEON2SSE_INLINE int8x8_t vabs_s8(int8x8_t a)
14728{
14729 int8x8_t res64;
14730 __m128i res;
14731 res = _mm_abs_epi8(_pM128i(a));
14732 return64(res);
14733}
14734
14735
14736_NEON2SSESTORAGE int16x4_t vabs_s16(int16x4_t a); // VABS.S16 d0,d0
14737_NEON2SSE_INLINE int16x4_t vabs_s16(int16x4_t a)
14738{
14739 int16x4_t res64;
14740 __m128i res;
14741 res = _mm_abs_epi16(_pM128i(a));
14742 return64(res);
14743}
14744
14745_NEON2SSESTORAGE int32x2_t vabs_s32(int32x2_t a); // VABS.S32 d0,d0
14746_NEON2SSE_INLINE int32x2_t vabs_s32(int32x2_t a)
14747{
14748 int32x2_t res64;
14749 __m128i res;
14750 res = _mm_abs_epi32(_pM128i(a));
14751 return64(res);
14752}
14753
14754_NEON2SSESTORAGE float32x2_t vabs_f32(float32x2_t a); // VABS.F32 d0,d0
14755_NEON2SSE_INLINE float32x2_t vabs_f32(float32x2_t a) // VABS.F32 d0,d0
14756{
14757 float32x4_t res;
14758 __m64_128 res64;
14759 _NEON2SSE_ALIGN_16 static const int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
14760 res = _mm_and_ps (_pM128(a), *(__m128*)c7fffffff); //use 64 low bits only
14761 _M64f(res64, res);
14762 return res64;
14763}
14764
14765_NEON2SSE_GLOBAL int8x16_t vabsq_s8(int8x16_t a); // VABS.S8 q0,q0
14766#define vabsq_s8 _mm_abs_epi8
14767
14768_NEON2SSE_GLOBAL int16x8_t vabsq_s16(int16x8_t a); // VABS.S16 q0,q0
14769#define vabsq_s16 _mm_abs_epi16
14770
14771_NEON2SSE_GLOBAL int32x4_t vabsq_s32(int32x4_t a); // VABS.S32 q0,q0
14772#define vabsq_s32 _mm_abs_epi32
14773
14774_NEON2SSESTORAGE float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0
14775_NEON2SSE_INLINE float32x4_t vabsq_f32(float32x4_t a) // VABS.F32 q0,q0
14776{
14777 _NEON2SSE_ALIGN_16 static const int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
14778 return _mm_and_ps (a, *(__m128*)c7fffffff);
14779}
14780
14781#ifdef _NEON2SSE_64BIT
14782_NEON2SSESTORAGE int64x2_t vabsq_s64(int64x2_t a); // VABS.S64 q0,q0
14783_NEON2SSE_INLINE int64x2_t vabsq_s64(int64x2_t a) // VABS.S64 q0,q0
14784{
14785 __m128i sign = _mm_srai_epi32 (_mm_shuffle_epi32 (a, 0xf5), 31);
14786 return _mm_sub_epi64 (_mm_xor_si128 (a, sign), sign);
14787}
14788
14789_NEON2SSESTORAGE float64x2_t vabsq_f64(float64x2_t a); // VABS.F64 q0,q0
14790_NEON2SSE_INLINE float64x2_t vabsq_f64(float64x2_t a) // VABS.F64 q0,q0
14791{
14792 _NEON2SSE_ALIGN_16 static const int64_t mask[2] = {0x7fffffffffffffffLL, 0x7fffffffffffffffLL};
14793 return _mm_and_pd (a, *(__m128d*)mask);
14794}
14795#endif
14796
14797//****** Saturating absolute: Vd[i] = sat(|Va[i]|) *********************
14798//**********************************************************************
//For signed-integer data types, the absolute value of the most negative value is not representable by the data type, so saturation takes place
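//For example (an illustrative sketch, kept out of compilation; the helper name is ours):
//vqabsq_s8 saturates the single non-representable input, while plain vabsq_s8 wraps it,
//just as _mm_abs_epi8 does.
#if 0
static void neon2sse_example_vqabs(void)
{
    int8x16_t v   = vdupq_n_s8(-128);   //most negative int8_t value
    int8x16_t sat = vqabsq_s8(v);       //every lane becomes +127 (saturated)
    int8x16_t wrp = vabsq_s8(v);        //every lane stays -128 (wrapped)
    (void)sat; (void)wrp;
}
#endif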
14800_NEON2SSESTORAGE int8x8_t vqabs_s8(int8x8_t a); // VQABS.S8 d0,d0
14801_NEON2SSE_INLINE int8x8_t vqabs_s8(int8x8_t a)
14802{
14803 int8x8_t res64;
14804 __m128i res;
14805 res = vqabsq_s8(_pM128i(a));
14806 return64(res);
14807}
14808
14809_NEON2SSESTORAGE int16x4_t vqabs_s16(int16x4_t a); // VQABS.S16 d0,d0
14810_NEON2SSE_INLINE int16x4_t vqabs_s16(int16x4_t a)
14811{
14812 int16x4_t res64;
14813 __m128i res;
14814 res = vqabsq_s16(_pM128i(a));
14815 return64(res);
14816}
14817
14818_NEON2SSESTORAGE int32x2_t vqabs_s32(int32x2_t a); // VQABS.S32 d0,d0
14819_NEON2SSE_INLINE int32x2_t vqabs_s32(int32x2_t a)
14820{
14821 int32x2_t res64;
14822 __m128i res;
14823 res = vqabsq_s32(_pM128i(a));
14824 return64(res);
14825}
14826
14827_NEON2SSESTORAGE int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0
14828_NEON2SSE_INLINE int8x16_t vqabsq_s8(int8x16_t a) // VQABS.S8 q0,q0
14829{
14830 __m128i c_128, abs, abs_cmp;
14831 c_128 = _mm_set1_epi8 (-128); //(int8_t)0x80
14832 abs = _mm_abs_epi8 (a);
14833 abs_cmp = _mm_cmpeq_epi8 (abs, c_128);
14834 return _mm_xor_si128 (abs, abs_cmp);
14835}
14836
14837_NEON2SSESTORAGE int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0
14838_NEON2SSE_INLINE int16x8_t vqabsq_s16(int16x8_t a) // VQABS.S16 q0,q0
14839{
14840 __m128i c_32768, abs, abs_cmp;
14841 c_32768 = _mm_set1_epi16 (-32768); //(int16_t)0x8000
14842 abs = _mm_abs_epi16 (a);
14843 abs_cmp = _mm_cmpeq_epi16 (abs, c_32768);
14844 return _mm_xor_si128 (abs, abs_cmp);
14845}
14846
14847_NEON2SSESTORAGE int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0
14848_NEON2SSE_INLINE int32x4_t vqabsq_s32(int32x4_t a) // VQABS.S32 q0,q0
14849{
14850 __m128i c80000000, abs, abs_cmp;
14851 c80000000 = _mm_set1_epi32 (0x80000000); //most negative value
14852 abs = _mm_abs_epi32 (a);
14853 abs_cmp = _mm_cmpeq_epi32 (abs, c80000000);
14854 return _mm_xor_si128 (abs, abs_cmp);
14855}
14856
14857//*************** Negate: Vd[i] = - Va[i] *************************************
14858//*****************************************************************************
//several negate implementations are possible for SIMD,
//e.g. _mm_sign_epi* (a, vector of negative values), see the sketch after the q-register versions below; the following one gives good performance:
_NEON2SSESTORAGE int8x8_t vneg_s8(int8x8_t a); // VNEG.S8 d0,d0
14862_NEON2SSE_INLINE int8x8_t vneg_s8(int8x8_t a)
14863{
14864 int8x8_t res64;
14865 __m128i res;
14866 res = vnegq_s8(_pM128i(a));
14867 return64(res);
14868}
14869
_NEON2SSESTORAGE int16x4_t vneg_s16(int16x4_t a); // VNEG.S16 d0,d0
14871_NEON2SSE_INLINE int16x4_t vneg_s16(int16x4_t a)
14872{
14873 int16x4_t res64;
14874 __m128i res;
14875 res = vnegq_s16(_pM128i(a));
14876 return64(res);
14877}
14878
_NEON2SSESTORAGE int32x2_t vneg_s32(int32x2_t a); // VNEG.S32 d0,d0
14880_NEON2SSE_INLINE int32x2_t vneg_s32(int32x2_t a)
14881{
14882 int32x2_t res64;
14883 __m128i res;
14884 res = vnegq_s32(_pM128i(a));
14885 return64(res);
14886}
14887
_NEON2SSESTORAGE float32x2_t vneg_f32(float32x2_t a); // VNEG.F32 d0,d0
_NEON2SSE_INLINE float32x2_t vneg_f32(float32x2_t a) // VNEG.F32 d0,d0
14890{
14891 float32x4_t res;
14892 __m64_128 res64;
14893 _NEON2SSE_ALIGN_16 static const uint32_t c80000000[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
14894 res = _mm_xor_ps (_pM128(a), *(__m128*) c80000000); //use low 64 bits
14895 _M64f(res64, res);
14896 return res64;
14897}
14898
_NEON2SSESTORAGE int8x16_t vnegq_s8(int8x16_t a); // VNEG.S8 q0,q0
_NEON2SSE_INLINE int8x16_t vnegq_s8(int8x16_t a) // VNEG.S8 q0,q0
14901{
14902 __m128i zero;
14903 zero = _mm_setzero_si128 ();
14904 return _mm_sub_epi8 (zero, a);
14905} //or _mm_sign_epi8 (a, negative numbers vector)
14906
_NEON2SSESTORAGE int16x8_t vnegq_s16(int16x8_t a); // VNEG.S16 q0,q0
_NEON2SSE_INLINE int16x8_t vnegq_s16(int16x8_t a) // VNEG.S16 q0,q0
14909{
14910 __m128i zero;
14911 zero = _mm_setzero_si128 ();
14912 return _mm_sub_epi16 (zero, a);
14913} //or _mm_sign_epi16 (a, negative numbers vector)
14914
_NEON2SSESTORAGE int32x4_t vnegq_s32(int32x4_t a); // VNEG.S32 q0,q0
_NEON2SSE_INLINE int32x4_t vnegq_s32(int32x4_t a) // VNEG.S32 q0,q0
14917{
14918 __m128i zero;
14919 zero = _mm_setzero_si128 ();
14920 return _mm_sub_epi32 (zero, a);
14921} //or _mm_sign_epi32 (a, negative numbers vector)
14922
_NEON2SSESTORAGE float32x4_t vnegq_f32(float32x4_t a); // VNEG.F32 q0,q0
_NEON2SSE_INLINE float32x4_t vnegq_f32(float32x4_t a) // VNEG.F32 q0,q0
14925{
14926 _NEON2SSE_ALIGN_16 static const uint32_t c80000000[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
14927 return _mm_xor_ps (a, *(__m128*) c80000000);
14928}
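//A minimal sketch of the _mm_sign_epi* alternative mentioned in the integer versions above
//(illustrative only, kept out of compilation; the helper name is ours):
#if 0
static __m128i neon2sse_example_negate_alt(__m128i a)
{
    __m128i minus_one = _mm_set1_epi8(-1);
    //every byte of minus_one is negative, so _mm_sign_epi8 negates each byte of a;
    //like the subtraction from zero used above, -(-128) wraps back to -128
    return _mm_sign_epi8(a, minus_one);
}
#endif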
14929
14930//************** Saturating Negate: sat(Vd[i] = - Va[i]) **************************
14931//***************************************************************************************
//For signed-integer data types, the negation of the most negative value can't be produced without saturation; with saturation the result is the maximum positive value
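//For instance (an illustrative sketch, kept out of compilation; the helper name is ours):
//negating the most negative 16-bit value saturates to the most positive one, which is
//exactly what the saturating subtraction from zero used below provides.
#if 0
static void neon2sse_example_vqneg(void)
{
    int16x8_t v = vdupq_n_s16(-32768);  //INT16_MIN
    int16x8_t r = vqnegq_s16(v);        //every lane becomes +32767
    (void)r;
}
#endif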
_NEON2SSESTORAGE int8x8_t vqneg_s8(int8x8_t a); // VQNEG.S8 d0,d0
14934_NEON2SSE_INLINE int8x8_t vqneg_s8(int8x8_t a)
14935{
14936 int8x8_t res64;
14937 __m128i res;
14938 res = vqnegq_s8(_pM128i(a));
14939 return64(res);
14940}
14941
_NEON2SSESTORAGE int16x4_t vqneg_s16(int16x4_t a); // VQNEG.S16 d0,d0
14943_NEON2SSE_INLINE int16x4_t vqneg_s16(int16x4_t a)
14944{
14945 int16x4_t res64;
14946 __m128i res;
14947 res = vqnegq_s16(_pM128i(a));
14948 return64(res);
14949}
14950
_NEON2SSESTORAGE int32x2_t vqneg_s32(int32x2_t a); // VQNEG.S32 d0,d0
14952_NEON2SSE_INLINE int32x2_t vqneg_s32(int32x2_t a)
14953{
14954 int32x2_t res64;
14955 __m128i res;
14956 res = vqnegq_s32(_pM128i(a));
14957 return64(res);
14958}
14959
_NEON2SSESTORAGE int8x16_t vqnegq_s8(int8x16_t a); // VQNEG.S8 q0,q0
_NEON2SSE_INLINE int8x16_t vqnegq_s8(int8x16_t a) // VQNEG.S8 q0,q0
14962{
14963 __m128i zero;
14964 zero = _mm_setzero_si128 ();
    return _mm_subs_epi8 (zero, a); //saturating subtraction
14966}
14967
_NEON2SSESTORAGE int16x8_t vqnegq_s16(int16x8_t a); // VQNEG.S16 q0,q0
_NEON2SSE_INLINE int16x8_t vqnegq_s16(int16x8_t a) // VQNEG.S16 q0,q0
14970{
14971 __m128i zero;
14972 zero = _mm_setzero_si128 ();
    return _mm_subs_epi16 (zero, a); //saturating subtraction
14974}
14975
_NEON2SSESTORAGE int32x4_t vqnegq_s32(int32x4_t a); // VQNEG.S32 q0,q0
_NEON2SSE_INLINE int32x4_t vqnegq_s32(int32x4_t a) // VQNEG.S32 q0,q0
14978{
    //solution may not be optimal compared with a serial one
14980 __m128i c80000000, zero, sub, cmp;
14981 c80000000 = _mm_set1_epi32 (0x80000000); //most negative value
14982 zero = _mm_setzero_si128 ();
    sub = _mm_sub_epi32 (zero, a); //subtraction
14984 cmp = _mm_cmpeq_epi32 (a, c80000000);
14985 return _mm_xor_si128 (sub, cmp);
14986}
14987
14988//****************** Count leading zeros ********************************
14989//**************************************************************************
//no corresponding vector intrinsics in IA32, so it has to be implemented. While the implementation is effective for 8 bits, it may not be for 16 and 32 bits
14991_NEON2SSESTORAGE int8x8_t vclz_s8(int8x8_t a); // VCLZ.I8 d0,d0
14992_NEON2SSE_INLINE int8x8_t vclz_s8(int8x8_t a)
14993{
14994 int8x8_t res64;
14995 __m128i res;
14996 res = vclzq_s8(_pM128i(a));
14997 return64(res);
14998}
14999
15000_NEON2SSESTORAGE int16x4_t vclz_s16(int16x4_t a); // VCLZ.I16 d0,d0
15001_NEON2SSE_INLINE int16x4_t vclz_s16(int16x4_t a)
15002{
15003 int16x4_t res64;
15004 __m128i res;
15005 res = vclzq_s16(_pM128i(a));
15006 return64(res);
15007}
15008
15009_NEON2SSESTORAGE int32x2_t vclz_s32(int32x2_t a); // VCLZ.I32 d0,d0
15010_NEON2SSE_INLINE int32x2_t vclz_s32(int32x2_t a)
15011{
15012 int32x2_t res64;
15013 __m128i res;
15014 res = vclzq_s32(_pM128i(a));
15015 return64(res);
15016}
15017
15018
15019_NEON2SSE_GLOBAL uint8x8_t vclz_u8(uint8x8_t a); // VCLZ.I8 d0,d0
15020#define vclz_u8 vclz_s8
15021
15022_NEON2SSE_GLOBAL uint16x4_t vclz_u16(uint16x4_t a); // VCLZ.I16 d0,d0
15023#define vclz_u16 vclz_s16
15024
15025_NEON2SSE_GLOBAL uint32x2_t vclz_u32(uint32x2_t a); // VCLZ.I32 d0,d0
15026#define vclz_u32 vclz_s32
15027
15028_NEON2SSESTORAGE int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0
15029_NEON2SSE_INLINE int8x16_t vclzq_s8(int8x16_t a)
15030{
15031 _NEON2SSE_ALIGN_16 static const int8_t mask_CLZ[16] = { /* 0 */ 4,/* 1 */ 3,/* 2 */ 2,/* 3 */ 2,
15032 /* 4 */ 1,/* 5 */ 1,/* 6 */ 1,/* 7 */ 1,
15033 /* 8 */ 0,/* 9 */ 0,/* a */ 0,/* b */ 0,
15034 /* c */ 0,/* d */ 0,/* e */ 0,/* f */ 0 };
15035 __m128i maskLOW, c4, lowclz, mask, hiclz;
15036 maskLOW = _mm_set1_epi8(0x0f); //low 4 bits, don't need masking low to avoid zero if MSB is set - it happens automatically
15037 c4 = _mm_set1_epi8(4);
15038 lowclz = _mm_shuffle_epi8( *(__m128i*)mask_CLZ, a); //uses low 4 bits anyway
15039 mask = _mm_srli_epi16(a, 4); //get high 4 bits as low bits
15040 mask = _mm_and_si128(mask, maskLOW); //low 4 bits, need masking to avoid zero if MSB is set
15041 hiclz = _mm_shuffle_epi8( *(__m128i*) mask_CLZ, mask); //uses low 4 bits anyway
15042 mask = _mm_cmpeq_epi8(hiclz, c4); // shows the need to add lowclz zeros
15043 lowclz = _mm_and_si128(lowclz,mask);
15044 return _mm_add_epi8(lowclz, hiclz);
15045}
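//A scalar reference of the per-byte nibble lookup used above (illustrative only, kept out of
//compilation; the helper name is ours): the same 16-entry table serves both nibbles, and the
//low-nibble count is added only when the high nibble is zero (its lookup returned 4).
#if 0
static uint8_t neon2sse_ref_clz_u8(uint8_t x)
{
    static const uint8_t clz4[16] = {4,3,2,2, 1,1,1,1, 0,0,0,0, 0,0,0,0};
    uint8_t hi = clz4[x >> 4];
    uint8_t lo = clz4[x & 0x0f];
    return (hi == 4) ? (uint8_t)(hi + lo) : hi;  //mirrors the _mm_cmpeq/_mm_and/_mm_add sequence
}
#endif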
15046
15047_NEON2SSESTORAGE int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0
15048_NEON2SSE_INLINE int16x8_t vclzq_s16(int16x8_t a)
15049{
15050 __m128i c7, res8x16, res8x16_swap;
15051 _NEON2SSE_ALIGN_16 static const int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
15052 _NEON2SSE_ALIGN_16 static const uint16_t mask8bit[8] = {0x00ff, 0x00ff, 0x00ff, 0x00ff,0x00ff, 0x00ff, 0x00ff, 0x00ff};
15053 c7 = _mm_srli_epi16(*(__m128i*)mask8bit, 5); //7
15054 res8x16 = vclzq_s8(a);
    res8x16_swap = _mm_shuffle_epi8 (res8x16, *(__m128i*) mask8_sab); //horizontal pairs swap
15056 res8x16 = _mm_and_si128(res8x16, *(__m128i*)mask8bit); //lowclz
15057 res8x16_swap = _mm_and_si128(res8x16_swap, *(__m128i*)mask8bit); //hiclz
15058 c7 = _mm_cmpgt_epi16(res8x16_swap, c7); // shows the need to add lowclz zeros
15059 res8x16 = _mm_and_si128(res8x16, c7); //lowclz
15060 return _mm_add_epi16(res8x16_swap, res8x16);
15061}
15062
15063_NEON2SSESTORAGE int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0
15064_NEON2SSE_INLINE int32x4_t vclzq_s32(int32x4_t a)
15065{
15066 __m128i c55555555, c33333333, c0f0f0f0f, c3f, c32, tmp, tmp1, res;
15067 c55555555 = _mm_set1_epi32(0x55555555);
15068 c33333333 = _mm_set1_epi32(0x33333333);
15069 c0f0f0f0f = _mm_set1_epi32(0x0f0f0f0f);
15070 c3f = _mm_set1_epi32(0x3f);
15071 c32 = _mm_set1_epi32(32);
15072 tmp = _mm_srli_epi32(a, 1);
15073 res = _mm_or_si128(tmp, a); //atmp[i] |= (atmp[i] >> 1);
15074 tmp = _mm_srli_epi32(res, 2);
15075 res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 2);
15076 tmp = _mm_srli_epi32(res, 4);
15077 res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 4);
15078 tmp = _mm_srli_epi32(res, 8);
15079 res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 8);
15080 tmp = _mm_srli_epi32(res, 16);
15081 res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 16);
15082
15083 tmp = _mm_srli_epi32(res, 1);
15084 tmp = _mm_and_si128(tmp, c55555555);
15085 res = _mm_sub_epi32(res, tmp); //atmp[i] -= ((atmp[i] >> 1) & 0x55555555);
15086
15087 tmp = _mm_srli_epi32(res, 2);
15088 tmp = _mm_and_si128(tmp, c33333333);
15089 tmp1 = _mm_and_si128(res, c33333333);
15090 res = _mm_add_epi32(tmp, tmp1); //atmp[i] = (((atmp[i] >> 2) & 0x33333333) + (atmp[i] & 0x33333333));
15091
15092 tmp = _mm_srli_epi32(res, 4);
15093 tmp = _mm_add_epi32(tmp, res);
15094 res = _mm_and_si128(tmp, c0f0f0f0f); //atmp[i] = (((atmp[i] >> 4) + atmp[i]) & 0x0f0f0f0f);
15095
15096 tmp = _mm_srli_epi32(res, 8);
15097 res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 8);
15098
15099 tmp = _mm_srli_epi32(res, 16);
15100 res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 16);
15101
15102 res = _mm_and_si128(res, c3f); //atmp[i] = atmp[i] & 0x0000003f;
15103
15104 return _mm_sub_epi32(c32, res); //res[i] = 32 - atmp[i];
15105}
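//A scalar reference of the 32-bit algorithm above (illustrative only, kept out of compilation;
//the helper name is ours): smear the leading one to the right, count the set bits with the
//classic SWAR popcount, and subtract that count from 32.
#if 0
static int neon2sse_ref_clz_u32(uint32_t x)
{
    x |= x >> 1;  x |= x >> 2;  x |= x >> 4;          //after these shifts every bit below
    x |= x >> 8;  x |= x >> 16;                       //the leading one is set
    x -= (x >> 1) & 0x55555555;                       //SWAR population count
    x  = ((x >> 2) & 0x33333333) + (x & 0x33333333);
    x  = ((x >> 4) + x) & 0x0f0f0f0f;
    x += x >> 8;
    x += x >> 16;
    return 32 - (int)(x & 0x3f);
}
#endif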
15106
15107_NEON2SSE_GLOBAL uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0
15108#define vclzq_u8 vclzq_s8
15109
15110_NEON2SSE_GLOBAL uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0
15111#define vclzq_u16 vclzq_s16
15112
15113_NEON2SSE_GLOBAL uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0
15114#define vclzq_u32 vclzq_s32
15115
15116//************** Count leading sign bits **************************
15117//********************************************************************
15118//VCLS (Vector Count Leading Sign bits) counts the number of consecutive bits following
15119// the topmost bit, that are the same as the topmost bit, in each element in a vector
//No corresponding vector intrinsics in IA32, so it has to be implemented.
//While the implementation is effective for 8 bits, it may not be for 16 and 32 bits
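//A scalar reference of the reduction used below (illustrative only, kept out of compilation;
//the helper name is ours): complement negative inputs so the problem becomes a count of
//leading zeros, then subtract one for the sign bit itself.
#if 0
static int neon2sse_ref_cls_s32(int32_t x)
{
    uint32_t u = (x < 0) ? ~(uint32_t)x : (uint32_t)x;          //same value the a_comb vector holds
    int n = 0;
    while (n < 32 && (u & 0x80000000u) == 0) { u <<= 1; n++; }  //count leading zeros of u
    return n - 1;
}
#endif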
15122_NEON2SSESTORAGE int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0
15123_NEON2SSE_INLINE int8x8_t vcls_s8(int8x8_t a)
15124{
15125 int8x8_t res64;
15126 __m128i res;
15127 res = vclsq_s8(_pM128i(a));
15128 return64(res);
15129}
15130
15131_NEON2SSESTORAGE int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0
15132_NEON2SSE_INLINE int16x4_t vcls_s16(int16x4_t a)
15133{
15134 int16x4_t res64;
15135 __m128i res;
15136 res = vclsq_s16(_pM128i(a));
15137 return64(res);
15138}
15139
15140_NEON2SSESTORAGE int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0
15141_NEON2SSE_INLINE int32x2_t vcls_s32(int32x2_t a)
15142{
15143 int32x2_t res64;
15144 __m128i res;
15145 res = vclsq_s32(_pM128i(a));
15146 return64(res);
15147}
15148
15149_NEON2SSESTORAGE int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0
15150_NEON2SSE_INLINE int8x16_t vclsq_s8(int8x16_t a)
15151{
15152 __m128i cff, c80, c1, a_mask, a_neg, a_pos, a_comb;
15153 cff = _mm_cmpeq_epi8 (a,a); //0xff
15154 c80 = _mm_set1_epi8(-128); //(int8_t)0x80
15155 c1 = _mm_set1_epi8(1);
15156 a_mask = _mm_and_si128(a, c80);
15157 a_mask = _mm_cmpeq_epi8(a_mask, c80); //0xff if negative input and 0 if positive
15158 a_neg = _mm_xor_si128(a, cff);
15159 a_neg = _mm_and_si128(a_mask, a_neg);
15160 a_pos = _mm_andnot_si128(a_mask, a);
15161 a_comb = _mm_or_si128(a_pos, a_neg);
15162 a_comb = vclzq_s8(a_comb);
15163 return _mm_sub_epi8(a_comb, c1);
15164}
15165
15166_NEON2SSESTORAGE int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0
15167_NEON2SSE_INLINE int16x8_t vclsq_s16(int16x8_t a)
15168{
15169 __m128i cffff, c8000, c1, a_mask, a_neg, a_pos, a_comb;
15170 cffff = _mm_cmpeq_epi16(a,a);
15171 c8000 = _mm_slli_epi16(cffff, 15); //0x8000
15172 c1 = _mm_srli_epi16(cffff,15); //0x1
15173 a_mask = _mm_and_si128(a, c8000);
15174 a_mask = _mm_cmpeq_epi16(a_mask, c8000); //0xffff if negative input and 0 if positive
15175 a_neg = _mm_xor_si128(a, cffff);
15176 a_neg = _mm_and_si128(a_mask, a_neg);
15177 a_pos = _mm_andnot_si128(a_mask, a);
15178 a_comb = _mm_or_si128(a_pos, a_neg);
15179 a_comb = vclzq_s16(a_comb);
15180 return _mm_sub_epi16(a_comb, c1);
15181}
15182
15183_NEON2SSESTORAGE int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0
15184_NEON2SSE_INLINE int32x4_t vclsq_s32(int32x4_t a)
15185{
15186 __m128i cffffffff, c80000000, c1, a_mask, a_neg, a_pos, a_comb;
15187 cffffffff = _mm_cmpeq_epi32(a,a);
15188 c80000000 = _mm_slli_epi32(cffffffff, 31); //0x80000000
15189 c1 = _mm_srli_epi32(cffffffff,31); //0x1
15190 a_mask = _mm_and_si128(a, c80000000);
15191 a_mask = _mm_cmpeq_epi32(a_mask, c80000000); //0xffffffff if negative input and 0 if positive
15192 a_neg = _mm_xor_si128(a, cffffffff);
15193 a_neg = _mm_and_si128(a_mask, a_neg);
15194 a_pos = _mm_andnot_si128(a_mask, a);
15195 a_comb = _mm_or_si128(a_pos, a_neg);
15196 a_comb = vclzq_s32(a_comb);
15197 return _mm_sub_epi32(a_comb, c1);
15198}
15199
15200//************************* Count number of set bits ********************************
15201//*************************************************************************************
//No directly corresponding SIMD solution. One option is to extract the elements, widen each to 32 bits and then use the SSE4.2 _mm_popcnt_u32 (unsigned int v) intrinsic on each of them (a sketch of this option follows the definitions below);
15203//another option is to do the following algorithm:
15204
15205_NEON2SSESTORAGE uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0
15206_NEON2SSE_INLINE uint8x8_t vcnt_u8(uint8x8_t a)
15207{
15208 uint8x8_t res64;
15209 __m128i res;
15210 res = vcntq_u8(_pM128i(a));
15211 return64(res);
15212}
15213
15214_NEON2SSE_GLOBAL int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0
15215#define vcnt_s8 vcnt_u8
15216
15217_NEON2SSE_GLOBAL poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0
15218#define vcnt_p8 vcnt_u8
15219
15220_NEON2SSESTORAGE uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0
15221_NEON2SSE_INLINE uint8x16_t vcntq_u8(uint8x16_t a)
15222{
15223 _NEON2SSE_ALIGN_16 static const int8_t mask_POPCOUNT[16] = { /* 0 */ 0,/* 1 */ 1,/* 2 */ 1,/* 3 */ 2,
15224 /* 4 */ 1,/* 5 */ 2,/* 6 */ 2,/* 7 */ 3,
15225 /* 8 */ 1,/* 9 */ 2,/* a */ 2,/* b */ 3,
15226 /* c */ 2,/* d */ 3,/* e */ 3,/* f */ 4};
15227 __m128i maskLOW, mask, lowpopcnt, hipopcnt;
15228 maskLOW = _mm_set1_epi8(0x0f); //low 4 bits, need masking to avoid zero if MSB is set
15229 mask = _mm_and_si128(a, maskLOW);
15230 lowpopcnt = _mm_shuffle_epi8( *(__m128i*)mask_POPCOUNT, mask); //uses low 4 bits anyway
15231 mask = _mm_srli_epi16(a, 4); //get high 4 bits as low bits
15232 mask = _mm_and_si128(mask, maskLOW); //low 4 bits, need masking to avoid zero if MSB is set
15233 hipopcnt = _mm_shuffle_epi8( *(__m128i*) mask_POPCOUNT, mask); //uses low 4 bits anyway
15234 return _mm_add_epi8(lowpopcnt, hipopcnt);
15235}
15236
15237_NEON2SSE_GLOBAL int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0
15238#define vcntq_s8 vcntq_u8
15239
15240_NEON2SSE_GLOBAL poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0
15241#define vcntq_p8 vcntq_u8
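//A minimal sketch of the first option mentioned above, per-byte _mm_popcnt_u32 (illustrative
//only, kept out of compilation; it needs SSE4.2/USE_SSE4 and the helper name is ours). The
//table-based vcntq_u8 above avoids the per-element traffic and works without SSE4:
#if 0
static uint8x16_t neon2sse_example_popcnt_per_byte(uint8x16_t a)
{
    _NEON2SSE_ALIGN_16 uint8_t tmp[16];
    int i;
    vst1q_u8(tmp, a);
    for (i = 0; i < 16; i++) {
        tmp[i] = (uint8_t)_mm_popcnt_u32(tmp[i]); //scalar population count of each byte
    }
    return vld1q_u8(tmp);
}
#endif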
15242
15243//**************************************************************************************
15244//*********************** Logical operations ****************************************
15245//**************************************************************************************
15246//************************** Bitwise not ***********************************
//several bitwise NOT implementations are possible for SIMD, e.g. XOR with all ones (a sketch follows the q-register versions below), but the following one gives good performance
15248_NEON2SSESTORAGE int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0
15249_NEON2SSE_INLINE int8x8_t vmvn_s8(int8x8_t a)
15250{
15251 int8x8_t res64;
15252 __m128i res;
15253 res = vmvnq_s8(_pM128i(a));
15254 return64(res);
15255}
15256
15257_NEON2SSESTORAGE int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0
15258_NEON2SSE_INLINE int16x4_t vmvn_s16(int16x4_t a)
15259{
15260 int16x4_t res64;
15261 __m128i res;
15262 res = vmvnq_s16(_pM128i(a));
15263 return64(res);
15264}
15265
15266_NEON2SSESTORAGE int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0
15267_NEON2SSE_INLINE int32x2_t vmvn_s32(int32x2_t a)
15268{
15269 int32x2_t res64;
15270 __m128i res;
15271 res = vmvnq_s32(_pM128i(a));
15272 return64(res);
15273}
15274
15275_NEON2SSE_GLOBAL uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0
15276#define vmvn_u8 vmvn_s8
15277
15278_NEON2SSE_GLOBAL uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0
15279#define vmvn_u16 vmvn_s16
15280
15281_NEON2SSE_GLOBAL uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0
15282#define vmvn_u32 vmvn_s32
15283
15284_NEON2SSE_GLOBAL poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0
15285#define vmvn_p8 vmvn_u8
15286
15287_NEON2SSESTORAGE int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0
15288_NEON2SSE_INLINE int8x16_t vmvnq_s8(int8x16_t a) // VMVN q0,q0
15289{
15290 __m128i c1;
15291 c1 = _mm_cmpeq_epi8 (a,a); //0xff
15292 return _mm_andnot_si128 (a, c1);
15293}
15294
15295_NEON2SSESTORAGE int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0
15296_NEON2SSE_INLINE int16x8_t vmvnq_s16(int16x8_t a) // VMVN q0,q0
15297{
15298 __m128i c1;
15299 c1 = _mm_cmpeq_epi16 (a,a); //0xffff
15300 return _mm_andnot_si128 (a, c1);
15301}
15302
15303_NEON2SSESTORAGE int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0
15304_NEON2SSE_INLINE int32x4_t vmvnq_s32(int32x4_t a) // VMVN q0,q0
15305{
15306 __m128i c1;
15307 c1 = _mm_cmpeq_epi32 (a,a); //0xffffffff
15308 return _mm_andnot_si128 (a, c1);
15309}
15310
15311_NEON2SSE_GLOBAL uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0
15312#define vmvnq_u8 vmvnq_s8
15313
15314_NEON2SSE_GLOBAL uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0
15315#define vmvnq_u16 vmvnq_s16
15316
15317_NEON2SSE_GLOBAL uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0
15318#define vmvnq_u32 vmvnq_s32
15319
15320_NEON2SSE_GLOBAL poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0
15321#define vmvnq_p8 vmvnq_u8
15322
15323//****************** Bitwise and ***********************
15324//******************************************************
15325_NEON2SSESTORAGE int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0
15326_NEON2SSE_INLINE int8x8_t vand_s8(int8x8_t a, int8x8_t b)
15327{
15328 int8x8_t res64;
15329 return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
15330}
15331
15332_NEON2SSESTORAGE int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0
15333_NEON2SSE_INLINE int16x4_t vand_s16(int16x4_t a, int16x4_t b)
15334{
15335 int16x4_t res64;
15336 return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
15337}
15338
15339_NEON2SSESTORAGE int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0
15340_NEON2SSE_INLINE int32x2_t vand_s32(int32x2_t a, int32x2_t b)
15341{
15342 int32x2_t res64;
15343 return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
15344}
15345
15346
15347_NEON2SSESTORAGE int64x1_t vand_s64(int64x1_t a, int64x1_t b); // VAND d0,d0,d0
15348_NEON2SSE_INLINE int64x1_t vand_s64(int64x1_t a, int64x1_t b)
15349{
15350 int64x1_t res;
15351 res.m64_i64[0] = a.m64_i64[0] & b.m64_i64[0];
15352 return res;
15353}
15354
15355_NEON2SSE_GLOBAL uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0
15356#define vand_u8 vand_s8
15357
15358_NEON2SSE_GLOBAL uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0
15359#define vand_u16 vand_s16
15360
15361_NEON2SSE_GLOBAL uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0
15362#define vand_u32 vand_s32
15363
15364_NEON2SSE_GLOBAL uint64x1_t vand_u64(uint64x1_t a, uint64x1_t b); // VAND d0,d0,d0
15365#define vand_u64 vand_s64
15366
15367
15368_NEON2SSE_GLOBAL int8x16_t vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0
15369#define vandq_s8 _mm_and_si128
15370
15371_NEON2SSE_GLOBAL int16x8_t vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0
15372#define vandq_s16 _mm_and_si128
15373
15374_NEON2SSE_GLOBAL int32x4_t vandq_s32(int32x4_t a, int32x4_t b); // VAND q0,q0,q0
15375#define vandq_s32 _mm_and_si128
15376
15377_NEON2SSE_GLOBAL int64x2_t vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0
15378#define vandq_s64 _mm_and_si128
15379
15380_NEON2SSE_GLOBAL uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0
15381#define vandq_u8 _mm_and_si128
15382
15383_NEON2SSE_GLOBAL uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0
15384#define vandq_u16 _mm_and_si128
15385
15386_NEON2SSE_GLOBAL uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0
15387#define vandq_u32 _mm_and_si128
15388
15389_NEON2SSE_GLOBAL uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0
15390#define vandq_u64 _mm_and_si128
15391
15392//******************** Bitwise or *********************************
15393//******************************************************************
15394_NEON2SSESTORAGE int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0
15395_NEON2SSE_INLINE int8x8_t vorr_s8(int8x8_t a, int8x8_t b)
15396{
15397 int8x8_t res64;
15398 return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
15399}
15400
15401
15402_NEON2SSESTORAGE int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0
15403_NEON2SSE_INLINE int16x4_t vorr_s16(int16x4_t a, int16x4_t b)
15404{
15405 int16x4_t res64;
15406 return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
15407}
15408
15409
15410_NEON2SSESTORAGE int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0
15411_NEON2SSE_INLINE int32x2_t vorr_s32(int32x2_t a, int32x2_t b)
15412{
15413 int32x2_t res64;
15414 return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
15415}
15416
15417
15418_NEON2SSESTORAGE int64x1_t vorr_s64(int64x1_t a, int64x1_t b); // VORR d0,d0,d0
15419_NEON2SSE_INLINE int64x1_t vorr_s64(int64x1_t a, int64x1_t b)
15420{
15421 int64x1_t res;
15422 res.m64_i64[0] = a.m64_i64[0] | b.m64_i64[0];
15423 return res;
15424}
15425
15426_NEON2SSE_GLOBAL uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0
15427#define vorr_u8 vorr_s8
15428
15429_NEON2SSE_GLOBAL uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0
15430#define vorr_u16 vorr_s16
15431
15432_NEON2SSE_GLOBAL uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0
15433#define vorr_u32 vorr_s32
15434
15435_NEON2SSE_GLOBAL uint64x1_t vorr_u64(uint64x1_t a, uint64x1_t b); // VORR d0,d0,d0
15436#define vorr_u64 vorr_s64
15437
15438_NEON2SSE_GLOBAL int8x16_t vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0
15439#define vorrq_s8 _mm_or_si128
15440
15441_NEON2SSE_GLOBAL int16x8_t vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0
15442#define vorrq_s16 _mm_or_si128
15443
15444_NEON2SSE_GLOBAL int32x4_t vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0
15445#define vorrq_s32 _mm_or_si128
15446
15447_NEON2SSE_GLOBAL int64x2_t vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0
15448#define vorrq_s64 _mm_or_si128
15449
15450_NEON2SSE_GLOBAL uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0
15451#define vorrq_u8 _mm_or_si128
15452
15453_NEON2SSE_GLOBAL uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0
15454#define vorrq_u16 _mm_or_si128
15455
15456_NEON2SSE_GLOBAL uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0
15457#define vorrq_u32 _mm_or_si128
15458
15459_NEON2SSE_GLOBAL uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0
15460#define vorrq_u64 _mm_or_si128
15461
15462//************* Bitwise exclusive or (EOR or XOR) ******************
15463//*******************************************************************
15464_NEON2SSESTORAGE int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0
15465_NEON2SSE_INLINE int8x8_t veor_s8(int8x8_t a, int8x8_t b)
15466{
15467 int8x8_t res64;
15468 return64(_mm_xor_si128(_pM128i(a),_pM128i(b)));
15469}
15470
15471_NEON2SSE_GLOBAL int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0
15472#define veor_s16 veor_s8
15473
15474_NEON2SSE_GLOBAL int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0
15475#define veor_s32 veor_s8
15476
15477_NEON2SSESTORAGE int64x1_t veor_s64(int64x1_t a, int64x1_t b); // VEOR d0,d0,d0
15478_NEON2SSE_INLINE int64x1_t veor_s64(int64x1_t a, int64x1_t b)
15479{
15480 int64x1_t res;
15481 res.m64_i64[0] = a.m64_i64[0] ^ b.m64_i64[0];
15482 return res;
15483}
15484
15485_NEON2SSE_GLOBAL uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0
15486#define veor_u8 veor_s8
15487
15488_NEON2SSE_GLOBAL uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0
15489#define veor_u16 veor_s16
15490
15491_NEON2SSE_GLOBAL uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0
15492#define veor_u32 veor_s32
15493
15494_NEON2SSE_GLOBAL uint64x1_t veor_u64(uint64x1_t a, uint64x1_t b); // VEOR d0,d0,d0
15495#define veor_u64 veor_s64
15496
15497_NEON2SSE_GLOBAL int8x16_t veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0
15498#define veorq_s8 _mm_xor_si128
15499
15500_NEON2SSE_GLOBAL int16x8_t veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0
15501#define veorq_s16 _mm_xor_si128
15502
15503_NEON2SSE_GLOBAL int32x4_t veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0
15504#define veorq_s32 _mm_xor_si128
15505
15506_NEON2SSE_GLOBAL int64x2_t veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0
15507#define veorq_s64 _mm_xor_si128
15508
15509_NEON2SSE_GLOBAL uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0
15510#define veorq_u8 _mm_xor_si128
15511
15512_NEON2SSE_GLOBAL uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0
15513#define veorq_u16 _mm_xor_si128
15514
15515_NEON2SSE_GLOBAL uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0
15516#define veorq_u32 _mm_xor_si128
15517
15518_NEON2SSE_GLOBAL uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0
15519#define veorq_u64 _mm_xor_si128
15520
15521//********************** Bit Clear **********************************
15522//*******************************************************************
15523//Logical AND complement (AND negation or AND NOT)
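//A minimal sketch of the operand order, kept out of the build: NEON vbicq_s32(a,b) computes a & (~b),
//while _mm_andnot_si128(x,y) computes (~x) & y, hence the swapped arguments in the mappings below.
//The helper name and values are illustrative only:
#if 0
static void vbic_operand_order_example(void)
{
    int32x4_t a = vdupq_n_s32(0x0000ffff);
    int32x4_t b = vdupq_n_s32(0x000000ff);
    int32x4_t r = vbicq_s32(a, b); //each lane becomes 0x0000ff00
    (void)r;
}
#endif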
15524_NEON2SSESTORAGE int8x8_t vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0
15525_NEON2SSE_INLINE int8x8_t vbic_s8(int8x8_t a, int8x8_t b)
15526{
15527 int8x8_t res64;
15528 return64(_mm_andnot_si128(_pM128i(b),_pM128i(a))); //notice the arguments "swap"
15529}
15530
15531_NEON2SSE_GLOBAL int16x4_t vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0
15532#define vbic_s16 vbic_s8
15533
15534_NEON2SSE_GLOBAL int32x2_t vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0
15535#define vbic_s32 vbic_s8
15536
15537_NEON2SSESTORAGE int64x1_t vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0
15538_NEON2SSE_INLINE int64x1_t vbic_s64(int64x1_t a, int64x1_t b)
15539{
15540 int64x1_t res;
15541 res.m64_i64[0] = a.m64_i64[0] & (~b.m64_i64[0]);
15542 return res;
15543}
15544
15545_NEON2SSE_GLOBAL uint8x8_t vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0
15546#define vbic_u8 vbic_s8
15547
15548_NEON2SSE_GLOBAL uint16x4_t vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0
15549#define vbic_u16 vbic_s16
15550
15551_NEON2SSE_GLOBAL uint32x2_t vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0
15552#define vbic_u32 vbic_s32
15553
15554_NEON2SSE_GLOBAL uint64x1_t vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0
15555#define vbic_u64 vbic_s64
15556
15557_NEON2SSE_GLOBAL int8x16_t vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0
15558#define vbicq_s8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15559
15560_NEON2SSE_GLOBAL int16x8_t vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0
15561#define vbicq_s16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15562
15563_NEON2SSE_GLOBAL int32x4_t vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0
15564#define vbicq_s32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15565
15566_NEON2SSE_GLOBAL int64x2_t vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0
15567#define vbicq_s64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15568
15569_NEON2SSE_GLOBAL uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0
15570#define vbicq_u8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15571
15572_NEON2SSE_GLOBAL uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0
15573#define vbicq_u16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15574
15575_NEON2SSE_GLOBAL uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0
15576#define vbicq_u32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15577
15578_NEON2SSE_GLOBAL uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0
15579#define vbicq_u64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15580
15581//**************** Bitwise OR complement ********************************
//**************************************************************************
//no exact IA32 match, it needs to be implemented as follows
15584_NEON2SSESTORAGE int8x8_t vorn_s8(int8x8_t a, int8x8_t b); // VORN d0,d0,d0
15585_NEON2SSE_INLINE int8x8_t vorn_s8(int8x8_t a, int8x8_t b)
15586{
15587 int8x8_t res64;
15588 return64(vornq_s8(_pM128i(a), _pM128i(b)));
15589}
15590
15591
15592_NEON2SSESTORAGE int16x4_t vorn_s16(int16x4_t a, int16x4_t b); // VORN d0,d0,d0
15593_NEON2SSE_INLINE int16x4_t vorn_s16(int16x4_t a, int16x4_t b)
15594{
15595 int16x4_t res64;
15596 return64(vornq_s16(_pM128i(a), _pM128i(b)));
15597}
15598
15599
15600_NEON2SSESTORAGE int32x2_t vorn_s32(int32x2_t a, int32x2_t b); // VORN d0,d0,d0
15601_NEON2SSE_INLINE int32x2_t vorn_s32(int32x2_t a, int32x2_t b)
15602{
15603 int32x2_t res64;
15604 return64(vornq_s32(_pM128i(a), _pM128i(b)));
15605}
15606
15607
15608_NEON2SSESTORAGE int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0
15609_NEON2SSE_INLINE int64x1_t vorn_s64(int64x1_t a, int64x1_t b)
15610{
15611 int64x1_t res;
15612 res.m64_i64[0] = a.m64_i64[0] | (~b.m64_i64[0]);
15613 return res;
15614}
15615
15616_NEON2SSE_GLOBAL uint8x8_t vorn_u8(uint8x8_t a, uint8x8_t b); // VORN d0,d0,d0
15617#define vorn_u8 vorn_s8
15618
15619
15620_NEON2SSE_GLOBAL uint16x4_t vorn_u16(uint16x4_t a, uint16x4_t b); // VORN d0,d0,d0
15621#define vorn_u16 vorn_s16
15622
15623_NEON2SSE_GLOBAL uint32x2_t vorn_u32(uint32x2_t a, uint32x2_t b); // VORN d0,d0,d0
15624#define vorn_u32 vorn_s32
15625
15626_NEON2SSE_GLOBAL uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0
15627#define vorn_u64 vorn_s64
15628
15629
15630_NEON2SSESTORAGE int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0
15631_NEON2SSE_INLINE int8x16_t vornq_s8(int8x16_t a, int8x16_t b) // VORN q0,q0,q0
15632{
15633 __m128i b1;
15634 b1 = vmvnq_s8( b); //bitwise not for b
15635 return _mm_or_si128 (a, b1);
15636}
15637
15638_NEON2SSESTORAGE int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0
15639_NEON2SSE_INLINE int16x8_t vornq_s16(int16x8_t a, int16x8_t b) // VORN q0,q0,q0
15640{
15641 __m128i b1;
15642 b1 = vmvnq_s16( b); //bitwise not for b
15643 return _mm_or_si128 (a, b1);
15644}
15645
15646_NEON2SSESTORAGE int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0
15647_NEON2SSE_INLINE int32x4_t vornq_s32(int32x4_t a, int32x4_t b) // VORN q0,q0,q0
15648{
15649 __m128i b1;
15650 b1 = vmvnq_s32( b); //bitwise not for b
15651 return _mm_or_si128 (a, b1);
15652}
15653
15654_NEON2SSESTORAGE int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0
15655_NEON2SSE_INLINE int64x2_t vornq_s64(int64x2_t a, int64x2_t b)
15656{
15657 __m128i c1, b1;
15658 c1 = _mm_cmpeq_epi8 (a, a); //all ones 0xfffffff...fffff
15659 b1 = _mm_andnot_si128 (b, c1);
15660 return _mm_or_si128 (a, b1);
15661}
15662
15663_NEON2SSESTORAGE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0
15664_NEON2SSE_INLINE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b) // VORN q0,q0,q0
15665{
15666 __m128i b1;
15667 b1 = vmvnq_u8( b); //bitwise not for b
15668 return _mm_or_si128 (a, b1);
15669}
15670
15671_NEON2SSESTORAGE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0
15672_NEON2SSE_INLINE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b) // VORN q0,q0,q0
15673{
15674 __m128i b1;
    b1 = vmvnq_u16( b); //bitwise not for b
15676 return _mm_or_si128 (a, b1);
15677}
15678
15679_NEON2SSESTORAGE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0
15680_NEON2SSE_INLINE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b) // VORN q0,q0,q0
15681{
15682 __m128i b1;
15683 b1 = vmvnq_u32( b); //bitwise not for b
15684 return _mm_or_si128 (a, b1);
15685}
15686_NEON2SSE_GLOBAL uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0
15687#define vornq_u64 vornq_s64
15688
15689//********************* Bitwise Select *****************************
15690//******************************************************************
//Note: on ARM this intrinsic can compile to any of VBSL/VBIF/VBIT depending on register allocation.
15692
15693//VBSL (Bitwise Select) selects each bit for the destination from the first operand if the
15694//corresponding bit of the destination is 1, or from the second operand if the corresponding bit of the destination is 0.
15695
15696//VBIF (Bitwise Insert if False) inserts each bit from the first operand into the destination
15697//if the corresponding bit of the second operand is 0, otherwise leaves the destination bit unchanged
15698
15699//VBIT (Bitwise Insert if True) inserts each bit from the first operand into the destination
15700//if the corresponding bit of the second operand is 1, otherwise leaves the destination bit unchanged.
15701
//only the VBSL variant is implemented for SIMD
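//A usage sketch, kept out of the build: a branchless per-lane maximum built from a compare mask
//and vbslq_s32, i.e. res = (mask & b) | ((~mask) & c). The helper name is illustrative only:
#if 0
static int32x4_t select_larger(int32x4_t b, int32x4_t c)
{
    uint32x4_t mask = vcgtq_s32(b, c); //all-ones lanes where b > c, zero elsewhere
    return vbslq_s32(mask, b, c); //pick b where the mask is set, c otherwise
}
#endif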
15703_NEON2SSESTORAGE int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0
15704_NEON2SSE_INLINE int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c)
15705{
15706 int8x8_t res64;
15707 __m128i res;
15708 res = vbslq_s8(_pM128i(a), _pM128i(b), _pM128i(c));
15709 return64(res);
15710}
15711
15712_NEON2SSE_GLOBAL int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0
15713#define vbsl_s16 vbsl_s8
15714
15715_NEON2SSE_GLOBAL int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0
15716#define vbsl_s32 vbsl_s8
15717
15718_NEON2SSESTORAGE int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0
15719_NEON2SSE_INLINE int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c)
15720{
15721 int64x1_t res;
15722 res.m64_i64[0] = (a.m64_i64[0] & b.m64_i64[0]) | ( (~a.m64_i64[0]) & c.m64_i64[0]);
15723 return res;
15724}
15725
15726_NEON2SSE_GLOBAL uint8x8_t vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0
15727#define vbsl_u8 vbsl_s8
15728
15729_NEON2SSE_GLOBAL uint16x4_t vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0
15730#define vbsl_u16 vbsl_s8
15731
15732_NEON2SSE_GLOBAL uint32x2_t vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0
15733#define vbsl_u32 vbsl_s8
15734
15735_NEON2SSE_GLOBAL uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0
15736#define vbsl_u64 vbsl_s64
15737
15738_NEON2SSESTORAGE float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0
15739_NEON2SSE_INLINE float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c)
15740{
15741 __m128 sel1, sel2;
15742 __m64_128 res64;
15743 sel1 = _mm_and_ps (_pM128(a), _pM128(b));
15744 sel2 = _mm_andnot_ps (_pM128(a), _pM128(c));
15745 sel1 = _mm_or_ps (sel1, sel2);
15746 _M64f(res64, sel1);
15747 return res64;
15748}
15749
15750_NEON2SSE_GLOBAL poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0
15751#define vbsl_p8 vbsl_s8
15752
15753_NEON2SSE_GLOBAL poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0
15754#define vbsl_p16 vbsl_s8
15755
15756_NEON2SSESTORAGE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0
15757_NEON2SSE_INLINE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) // VBSL q0,q0,q0
15758{
15759 __m128i sel1, sel2;
15760 sel1 = _mm_and_si128 (a, b);
15761 sel2 = _mm_andnot_si128 (a, c);
15762 return _mm_or_si128 (sel1, sel2);
15763}
15764
15765_NEON2SSE_GLOBAL int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0
15766#define vbslq_s16 vbslq_s8
15767
15768_NEON2SSE_GLOBAL int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0
15769#define vbslq_s32 vbslq_s8
15770
15771_NEON2SSE_GLOBAL int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0
15772#define vbslq_s64 vbslq_s8
15773
15774_NEON2SSE_GLOBAL uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0
15775#define vbslq_u8 vbslq_s8
15776
15777_NEON2SSE_GLOBAL uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0
15778#define vbslq_u16 vbslq_s8
15779
15780_NEON2SSE_GLOBAL uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0
15781#define vbslq_u32 vbslq_s8
15782
15783_NEON2SSE_GLOBAL uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0
15784#define vbslq_u64 vbslq_s8
15785
15786_NEON2SSESTORAGE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0
15787_NEON2SSE_INLINE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) // VBSL q0,q0,q0
15788{
15789 __m128 sel1, sel2;
15790 sel1 = _mm_and_ps (*(__m128*)&a, b);
15791 sel2 = _mm_andnot_ps (*(__m128*)&a, c);
15792 return _mm_or_ps (sel1, sel2);
15793}
15794
15795_NEON2SSE_GLOBAL poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0
15796#define vbslq_p8 vbslq_u8
15797
15798_NEON2SSE_GLOBAL poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0
15799#define vbslq_p16 vbslq_s8
15800
15801//************************************************************************************
15802//**************** Transposition operations ****************************************
15803//************************************************************************************
15804//***************** Vector Transpose ************************************************
15805//************************************************************************************
15806//VTRN (Vector Transpose) treats the elements of its operand vectors as elements of 2 x 2 matrices, and transposes the matrices.
// making the result look like (a0, b0, a2, b2, a4, b4, ...) and (a1, b1, a3, b3, a5, b5, ...)
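//A usage sketch of the resulting layout, kept out of the build (the function and buffer names are illustrative only):
#if 0
static int32x4x2_t vtrn_layout_example(const int32_t* p) //p points at a0,a1,a2,a3,b0,b1,b2,b3
{
    int32x4_t a = vld1q_s32(p); //(a0, a1, a2, a3)
    int32x4_t b = vld1q_s32(p + 4); //(b0, b1, b2, b3)
    return vtrnq_s32(a, b); //val[0] = (a0, b0, a2, b2), val[1] = (a1, b1, a3, b3)
}
#endif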
15808_NEON2SSESTORAGE int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0
15809_NEON2SSE_INLINE int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b) // VTRN.8 d0,d0
15810{
15811 int8x8x2_t val;
15812 __m128i tmp, val0;
15813 tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7
15814 val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)mask8_32_even_odd); //(a0, b0, a2, b2, a4, b4, a6, b6), (a1,b1, a3,b3, a5,b5, a7,b7)
15815 vst1q_s8 (val.val, val0); // _mm_shuffle_epi32 (val.val[0], _SWAP_HI_LOW32); //(a1,b1, a3,b3, a5,b5, a7,b7),(a0, b0, a2, b2, a4, b4, a6, b6),
15816 return val;
15817}
15818
15819_NEON2SSESTORAGE int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0
15820_NEON2SSE_INLINE int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b) // VTRN.16 d0,d0
15821{
15822 int16x4x2_t val;
15823 __m128i tmp, val0;
15824 _NEON2SSE_ALIGN_16 static const int8_t maskdlv16[16] = {0,1, 2,3, 8,9, 10,11, 4,5, 6,7, 12,13, 14, 15};
15825 tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3
15826 val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv16); //a0, b0, a2, b2, a1,b1, a3, b3
15827 vst1q_s16(val.val, val0); // _mm_shuffle_epi32 (val.val[0], _SWAP_HI_LOW32); //(a1,b1, a3,b3),(a0, b0, a2, b2),
15828 return val;
15829}
15830
15831_NEON2SSESTORAGE int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0
15832_NEON2SSE_INLINE int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b)
15833{
15834 int32x2x2_t val;
15835 __m128i val0;
15836 val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1
15837 vst1q_s32(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); //a1,b1, a0,b0,
15838 return val;
15839}
15840
15841_NEON2SSE_GLOBAL uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0
15842#define vtrn_u8 vtrn_s8
15843
15844_NEON2SSE_GLOBAL uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0
15845#define vtrn_u16 vtrn_s16
15846
15847_NEON2SSE_GLOBAL uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0
15848#define vtrn_u32 vtrn_s32
15849
15850_NEON2SSESTORAGE float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0
15851_NEON2SSE_INLINE float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b)
15852{
15853 float32x2x2_t val;
15854 val.val[0].m64_f32[0] = a.m64_f32[0];
15855 val.val[0].m64_f32[1] = b.m64_f32[0];
15856 val.val[1].m64_f32[0] = a.m64_f32[1];
15857 val.val[1].m64_f32[1] = b.m64_f32[1];
15858 return val; //a0,b0,a1,b1
15859}
15860
15861_NEON2SSE_GLOBAL poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0
15862#define vtrn_p8 vtrn_u8
15863
15864_NEON2SSE_GLOBAL poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0
15865#define vtrn_p16 vtrn_s16
15866
_NEON2SSESTORAGE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0
15868_NEON2SSE_INLINE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b) // VTRN.8 q0,q0
15869{
15870 int8x16x2_t r8x16;
15871 __m128i a_sh, b_sh;
15872 a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_16_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15
15873 b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_16_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15
15874
15875 r8x16.val[0] = _mm_unpacklo_epi8(a_sh, b_sh); //(a0, b0, a2, b2, a4, b4, a6, b6, a8,b8, a10,b10, a12,b12, a14,b14)
15876 r8x16.val[1] = _mm_unpackhi_epi8(a_sh, b_sh); // (a1, b1, a3, b3, a5, b5, a7, b7, a9,b9, a11,b11, a13,b13, a15,b15)
15877 return r8x16;
15878}
15879
15880_NEON2SSESTORAGE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0
15881_NEON2SSE_INLINE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b) // VTRN.16 q0,q0
15882{
15883 int16x8x2_t v16x8;
15884 __m128i a_sh, b_sh;
15885 a_sh = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd); //a0, a2, a4, a6, a1, a3, a5, a7
15886 b_sh = _mm_shuffle_epi8 (b, *(__m128i*) mask8_32_even_odd); //b0, b2, b4, b6, b1, b3, b5, b7
15887 v16x8.val[0] = _mm_unpacklo_epi16(a_sh, b_sh); //a0, b0, a2, b2, a4, b4, a6, b6
15888 v16x8.val[1] = _mm_unpackhi_epi16(a_sh, b_sh); //a1, b1, a3, b3, a5, b5, a7, b7
15889 return v16x8;
15890}
15891
15892_NEON2SSESTORAGE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0
15893_NEON2SSE_INLINE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b) // VTRN.32 q0,q0
15894{
    //may not be an optimal solution compared with a serial implementation
15896 int32x4x2_t v32x4;
15897 __m128i a_sh, b_sh;
15898 a_sh = _mm_shuffle_epi32 (a, 216); //a0, a2, a1, a3
15899 b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3
15900
15901 v32x4.val[0] = _mm_unpacklo_epi32(a_sh, b_sh); //a0, b0, a2, b2
15902 v32x4.val[1] = _mm_unpackhi_epi32(a_sh, b_sh); //a1, b1, a3, b3
15903 return v32x4;
15904}
15905
15906_NEON2SSE_GLOBAL uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0
15907#define vtrnq_u8 vtrnq_s8
15908
15909_NEON2SSE_GLOBAL uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0
15910#define vtrnq_u16 vtrnq_s16
15911
15912_NEON2SSE_GLOBAL uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0
15913#define vtrnq_u32 vtrnq_s32
15914
15915_NEON2SSESTORAGE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0
15916_NEON2SSE_INLINE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b) // VTRN.32 q0,q0
15917{
    //may not be an optimal solution compared with a serial implementation
15919 float32x4x2_t f32x4;
15920 __m128 a_sh, b_sh;
    a_sh = _mm_shuffle_ps (a, a, _MM_SHUFFLE(3,1, 2, 0)); //a0, a2, a1, a3, need to check endianness
    b_sh = _mm_shuffle_ps (b, b, _MM_SHUFFLE(3,1, 2, 0)); //b0, b2, b1, b3, need to check endianness
15923
15924 f32x4.val[0] = _mm_unpacklo_ps(a_sh, b_sh); //a0, b0, a2, b2
15925 f32x4.val[1] = _mm_unpackhi_ps(a_sh, b_sh); //a1, b1, a3, b3
15926 return f32x4;
15927}
15928
15929_NEON2SSE_GLOBAL poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0
15930#define vtrnq_p8 vtrnq_s8
15931
15932_NEON2SSE_GLOBAL poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0
15933#define vtrnq_p16 vtrnq_s16
15934
15935//***************** Interleave elements ***************************
15936//*****************************************************************
15937//output has (a0,b0,a1,b1, a2,b2,.....)
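//A usage sketch, kept out of the build: interleaving two separate 16-byte channels into one
//l0,r0,l1,r1,... stream with vzipq_u8 (the function and buffer names are illustrative only):
#if 0
static void interleave_channels(const uint8_t* left, const uint8_t* right, uint8_t* out)
{
    uint8x16x2_t z = vzipq_u8(vld1q_u8(left), vld1q_u8(right));
    vst1q_u8(out, z.val[0]); //l0, r0, ..., l7, r7
    vst1q_u8(out + 16, z.val[1]); //l8, r8, ..., l15, r15
}
#endif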
15938_NEON2SSESTORAGE int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0
15939_NEON2SSE_INLINE int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b) // VZIP.8 d0,d0
15940{
15941 int8x8x2_t val;
15942 __m128i val0;
15943 val0 = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b));
15944 vst1q_s8(val.val, val0); //_mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
15945 return val;
15946}
15947
15948_NEON2SSESTORAGE int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0
15949_NEON2SSE_INLINE int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b) // VZIP.16 d0,d0
15950{
15951 int16x4x2_t val;
15952 __m128i val0;
15953 val0 = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b));
15954 vst1q_s16(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
15955 return val;
15956}
15957
15958_NEON2SSE_GLOBAL int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0
15959#define vzip_s32 vtrn_s32
15960
15961_NEON2SSE_GLOBAL uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0
15962#define vzip_u8 vzip_s8
15963
15964_NEON2SSE_GLOBAL uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0
15965#define vzip_u16 vzip_s16
15966
15967_NEON2SSE_GLOBAL uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0
15968#define vzip_u32 vzip_s32
15969
15970_NEON2SSE_GLOBAL float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0
15971#define vzip_f32 vtrn_f32
15972
15973_NEON2SSE_GLOBAL poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0
15974#define vzip_p8 vzip_u8
15975
15976_NEON2SSE_GLOBAL poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0
15977#define vzip_p16 vzip_u16
15978
15979_NEON2SSESTORAGE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0
15980_NEON2SSE_INLINE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b) // VZIP.8 q0,q0
15981{
15982 int8x16x2_t r8x16;
15983 r8x16.val[0] = _mm_unpacklo_epi8(a, b);
15984 r8x16.val[1] = _mm_unpackhi_epi8(a, b);
15985 return r8x16;
15986}
15987
15988_NEON2SSESTORAGE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0
15989_NEON2SSE_INLINE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b) // VZIP.16 q0,q0
15990{
15991 int16x8x2_t r16x8;
15992 r16x8.val[0] = _mm_unpacklo_epi16(a, b);
15993 r16x8.val[1] = _mm_unpackhi_epi16(a, b);
15994 return r16x8;
15995}
15996
15997_NEON2SSESTORAGE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0
15998_NEON2SSE_INLINE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b) // VZIP.32 q0,q0
15999{
16000 int32x4x2_t r32x4;
16001 r32x4.val[0] = _mm_unpacklo_epi32(a, b);
16002 r32x4.val[1] = _mm_unpackhi_epi32(a, b);
16003 return r32x4;
16004}
16005
16006_NEON2SSE_GLOBAL uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0
16007#define vzipq_u8 vzipq_s8
16008
16009_NEON2SSE_GLOBAL uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0
16010#define vzipq_u16 vzipq_s16
16011
16012_NEON2SSE_GLOBAL uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0
16013#define vzipq_u32 vzipq_s32
16014
16015_NEON2SSESTORAGE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0
16016_NEON2SSE_INLINE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b) // VZIP.32 q0,q0
16017{
16018 float32x4x2_t f32x4;
16019 f32x4.val[0] = _mm_unpacklo_ps ( a, b);
16020 f32x4.val[1] = _mm_unpackhi_ps ( a, b);
16021 return f32x4;
16022}
16023
16024_NEON2SSE_GLOBAL poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0
16025#define vzipq_p8 vzipq_u8
16026
16027_NEON2SSE_GLOBAL poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0
16028#define vzipq_p16 vzipq_u16
16029
16030//*********************** De-Interleave elements *************************
16031//*************************************************************************
//As a result of these functions the first val contains (a0, a2, a4, ..., b0, b2, b4, ...) and the second val contains (a1, a3, a5, ..., b1, b3, b5, ...)
//there are no such functions in IA32 SIMD, so a shuffle is required
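//A usage sketch, kept out of the build: splitting an interleaved l0,r0,l1,r1,... byte stream back into
//separate channels with vuzpq_u8, the reverse of the vzipq example above (names are illustrative only):
#if 0
static void deinterleave_channels(const uint8_t* in, uint8_t* left, uint8_t* right)
{
    uint8x16x2_t u = vuzpq_u8(vld1q_u8(in), vld1q_u8(in + 16));
    vst1q_u8(left, u.val[0]); //even positions: l0 ... l15
    vst1q_u8(right, u.val[1]); //odd positions: r0 ... r15
}
#endif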
16034_NEON2SSESTORAGE int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0
16035_NEON2SSE_INLINE int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b) // VUZP.8 d0,d0
16036{
16037 int8x8x2_t val;
16038 __m128i tmp, val0;
16039 _NEON2SSE_ALIGN_16 static const int8_t maskdlv8[16] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11,15};
16040 tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7
16041 val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv8); //(a0, a2, a4, a6, b0, b2, b4, b6), (a1, a3, a5, a7, b1,b3, b5, b7)
16042 vst1q_s8(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
16043 return val;
16044}
16045
16046_NEON2SSESTORAGE int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0
16047_NEON2SSE_INLINE int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b) // VUZP.16 d0,d0
16048{
16049 int16x4x2_t val;
16050 __m128i tmp, val0;
16051 _NEON2SSE_ALIGN_16 static const int8_t maskdlv16[16] = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15};
16052 tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3
16053 val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv16); //a0,a2, b0, b2, a1,a3, b1,b3
16054 vst1q_s16(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
16055 return val;
16056}
16057
16058_NEON2SSESTORAGE int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0
16059_NEON2SSE_INLINE int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b) // VUZP.32 d0,d0
16060{
16061 int32x2x2_t val;
16062 __m128i val0;
16063 val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0, a1,b1
16064 vst1q_s32(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
16065 return val;
16066}
16067
16068_NEON2SSE_GLOBAL uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0
16069#define vuzp_u8 vuzp_s8
16070
16071_NEON2SSE_GLOBAL uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0
16072#define vuzp_u16 vuzp_s16
16073
16074_NEON2SSE_GLOBAL uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0
16075#define vuzp_u32 vuzp_s32
16076
16077_NEON2SSE_GLOBAL float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0
16078#define vuzp_f32 vzip_f32
16079
16080_NEON2SSE_GLOBAL poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0
16081#define vuzp_p8 vuzp_u8
16082
16083_NEON2SSE_GLOBAL poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0
16084#define vuzp_p16 vuzp_u16
16085
16086_NEON2SSESTORAGE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0
16087_NEON2SSE_INLINE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b) // VUZP.8 q0,q0
16088{
16089 int8x16x2_t v8x16;
16090 __m128i a_sh, b_sh;
16091 a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_16_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15
16092 b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_16_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15
16093 //we need unpack64 to combine lower (upper) 64 bits from a with lower (upper) 64 bits from b
16094 v8x16.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); ///a0, a2, a4, a6, a8, a10, a12, a14, b0, b2, b4, b6, b8, b10, b12, b14,
16095 v8x16.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, a9, a11, a13, a15, b1, b3, b5, b7, b9, b11, b13, b15
16096 return v8x16;
16097}
16098
16099_NEON2SSESTORAGE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0
16100_NEON2SSE_INLINE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b) // VUZP.16 q0,q0
16101{
16102 int16x8x2_t v16x8;
16103 __m128i a_sh, b_sh;
16104 a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_32_even_odd); //a0, a2, a4, a6, a1, a3, a5, a7
16105 b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_32_even_odd); //b0, b2, b4, b6, b1, b3, b5, b7
16106 v16x8.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, a4, a6, b0, b2, b4, b6
16107 v16x8.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, b1, b3, b5, b7
16108 return v16x8;
16109}
16110
16111_NEON2SSESTORAGE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0
16112_NEON2SSE_INLINE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b) // VUZP.32 q0,q0
16113{
    //may not be an optimal solution compared with a serial implementation
16115 int32x4x2_t v32x4;
16116 __m128i a_sh, b_sh;
16117 a_sh = _mm_shuffle_epi32 (a, 216); //a0, a2, a1, a3
16118 b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3
16119
16120 v32x4.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, b0, b2
16121 v32x4.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, b1, b3
16122 return v32x4;
16123}
16124
16125_NEON2SSE_GLOBAL uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0
16126#define vuzpq_u8 vuzpq_s8
16127
16128_NEON2SSE_GLOBAL uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0
16129#define vuzpq_u16 vuzpq_s16
16130
16131_NEON2SSE_GLOBAL uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0
16132#define vuzpq_u32 vuzpq_s32
16133
16134_NEON2SSESTORAGE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0
16135_NEON2SSE_INLINE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b) // VUZP.32 q0,q0
16136{
16137 float32x4x2_t v32x4;
    v32x4.val[0] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2,0, 2, 0)); //a0, a2, b0, b2, need to check endianness however
    v32x4.val[1] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3,1, 3, 1)); //a1, a3, b1, b3, need to check endianness however
16140 return v32x4;
16141}
16142
16143_NEON2SSE_GLOBAL poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0
16144#define vuzpq_p8 vuzpq_u8
16145
16146_NEON2SSE_GLOBAL poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
16147#define vuzpq_p16 vuzpq_u16
16148
16149//##############################################################################################
16150//*********************** Reinterpret cast intrinsics.******************************************
16151//##############################################################################################
// Not a part of the official NEON instruction set but available in the gcc compiler *********************
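//A usage sketch, kept out of the build: clearing the float sign bit through integer reinterpretation,
//i.e. an absolute value without arithmetic (the helper name is illustrative only):
#if 0
static float32x4_t fabs_via_reinterpret(float32x4_t v)
{
    uint32x4_t bits = vreinterpretq_u32_f32(v); //the same 128 bits viewed as uint32 lanes
    bits = vandq_u32(bits, vdupq_n_u32(0x7fffffff)); //mask off each sign bit
    return vreinterpretq_f32_u32(bits); //view the result as floats again
}
#endif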
16153_NEON2SSE_GLOBAL poly8x8_t vreinterpret_p8_u32 (uint32x2_t t);
16154#define vreinterpret_p8_u32
16155
16156_NEON2SSE_GLOBAL poly8x8_t vreinterpret_p8_u16 (uint16x4_t t);
16157#define vreinterpret_p8_u16
16158
16159_NEON2SSE_GLOBAL poly8x8_t vreinterpret_p8_u8 (uint8x8_t t);
16160#define vreinterpret_p8_u8
16161
16162_NEON2SSE_GLOBAL poly8x8_t vreinterpret_p8_s32 (int32x2_t t);
16163#define vreinterpret_p8_s32
16164
16165_NEON2SSE_GLOBAL poly8x8_t vreinterpret_p8_s16 (int16x4_t t);
16166#define vreinterpret_p8_s16
16167
16168_NEON2SSE_GLOBAL poly8x8_t vreinterpret_p8_s8 (int8x8_t t);
16169#define vreinterpret_p8_s8
16170
16171_NEON2SSE_GLOBAL poly8x8_t vreinterpret_p8_u64 (uint64x1_t t);
16172#define vreinterpret_p8_u64
16173
16174_NEON2SSE_GLOBAL poly8x8_t vreinterpret_p8_s64 (int64x1_t t);
16175#define vreinterpret_p8_s64
16176
16177_NEON2SSE_GLOBAL poly8x8_t vreinterpret_p8_f32 (float32x2_t t);
16178#define vreinterpret_p8_f32
16179
16180_NEON2SSE_GLOBAL poly8x8_t vreinterpret_p8_p16 (poly16x4_t t);
16181#define vreinterpret_p8_p16
16182
16183_NEON2SSE_GLOBAL poly8x16_t vreinterpretq_p8_u32 (uint32x4_t t);
16184#define vreinterpretq_p8_u32
16185
16186_NEON2SSE_GLOBAL poly8x16_t vreinterpretq_p8_u16 (uint16x8_t t);
16187#define vreinterpretq_p8_u16
16188
16189_NEON2SSE_GLOBAL poly8x16_t vreinterpretq_p8_u8 (uint8x16_t t);
16190#define vreinterpretq_p8_u8
16191
16192_NEON2SSE_GLOBAL poly8x16_t vreinterpretq_p8_s32 (int32x4_t t);
16193#define vreinterpretq_p8_s32
16194
16195_NEON2SSE_GLOBAL poly8x16_t vreinterpretq_p8_s16 (int16x8_t t);
16196#define vreinterpretq_p8_s16
16197
16198_NEON2SSE_GLOBAL poly8x16_t vreinterpretq_p8_s8 (int8x16_t t);
16199#define vreinterpretq_p8_s8
16200
16201_NEON2SSE_GLOBAL poly8x16_t vreinterpretq_p8_u64 (uint64x2_t t);
16202#define vreinterpretq_p8_u64
16203
16204_NEON2SSE_GLOBAL poly8x16_t vreinterpretq_p8_s64 (int64x2_t t);
16205#define vreinterpretq_p8_s64
16206
16207_NEON2SSE_GLOBAL poly8x16_t vreinterpretq_p8_f32 (float32x4_t t);
16208#define vreinterpretq_p8_f32(t) _M128i(t)
16209
16210_NEON2SSE_GLOBAL poly8x16_t vreinterpretq_p8_p16 (poly16x8_t t);
16211#define vreinterpretq_p8_p16
16212
16213_NEON2SSE_GLOBAL poly16x4_t vreinterpret_p16_u32 (uint32x2_t t);
16214#define vreinterpret_p16_u32
16215
16216_NEON2SSE_GLOBAL poly16x4_t vreinterpret_p16_u16 (uint16x4_t t);
16217#define vreinterpret_p16_u16
16218
16219_NEON2SSE_GLOBAL poly16x4_t vreinterpret_p16_u8 (uint8x8_t t);
16220#define vreinterpret_p16_u8
16221
16222_NEON2SSE_GLOBAL poly16x4_t vreinterpret_p16_s32 (int32x2_t t);
16223#define vreinterpret_p16_s32
16224
16225_NEON2SSE_GLOBAL poly16x4_t vreinterpret_p16_s16 (int16x4_t t);
16226#define vreinterpret_p16_s16
16227
16228_NEON2SSE_GLOBAL poly16x4_t vreinterpret_p16_s8 (int8x8_t t);
16229#define vreinterpret_p16_s8
16230
16231_NEON2SSE_GLOBAL poly16x4_t vreinterpret_p16_u64 (uint64x1_t t);
16232#define vreinterpret_p16_u64
16233
16234_NEON2SSE_GLOBAL poly16x4_t vreinterpret_p16_s64 (int64x1_t t);
16235#define vreinterpret_p16_s64
16236
16237_NEON2SSE_GLOBAL poly16x4_t vreinterpret_p16_f32 (float32x2_t t);
16238#define vreinterpret_p16_f32
16239
16240_NEON2SSE_GLOBAL poly16x4_t vreinterpret_p16_p8 (poly8x8_t t);
16241#define vreinterpret_p16_p8
16242
16243_NEON2SSE_GLOBAL poly16x8_t vreinterpretq_p16_u32 (uint32x4_t t);
16244#define vreinterpretq_p16_u32
16245
16246_NEON2SSE_GLOBAL poly16x8_t vreinterpretq_p16_u16 (uint16x8_t t);
16247#define vreinterpretq_p16_u16
16248
16249_NEON2SSE_GLOBAL poly16x8_t vreinterpretq_p16_s32 (int32x4_t t);
16250#define vreinterpretq_p16_s32
16251
16252_NEON2SSE_GLOBAL poly16x8_t vreinterpretq_p16_s16 (int16x8_t t);
16253#define vreinterpretq_p16_s16
16254
16255_NEON2SSE_GLOBAL poly16x8_t vreinterpretq_p16_s8 (int8x16_t t);
16256#define vreinterpretq_p16_s8
16257
16258_NEON2SSE_GLOBAL poly16x8_t vreinterpretq_p16_u64 (uint64x2_t t);
16259#define vreinterpretq_p16_u64
16260
16261_NEON2SSE_GLOBAL poly16x8_t vreinterpretq_p16_s64 (int64x2_t t);
16262#define vreinterpretq_p16_s64
16263
16264_NEON2SSE_GLOBAL poly16x8_t vreinterpretq_p16_f32 (float32x4_t t);
16265#define vreinterpretq_p16_f32(t) _M128i(t)
16266
16267_NEON2SSE_GLOBAL poly16x8_t vreinterpretq_p16_p8 (poly8x16_t t);
16268#define vreinterpretq_p16_p8 vreinterpretq_s16_p8
16269
16270//**** Integer to float ******
16271_NEON2SSESTORAGE float32x2_t vreinterpret_f32_u32 (uint32x2_t t);
16272_NEON2SSE_INLINE float32x2_t vreinterpret_f32_u32 (uint32x2_t t)
16273{
16274 return (*(__m64_128*)&(t));
16275}
16276
16277_NEON2SSE_GLOBAL float32x2_t vreinterpret_f32_u16 (uint16x4_t t);
16278#define vreinterpret_f32_u16 vreinterpret_f32_u32
16279
16280
16281_NEON2SSE_GLOBAL float32x2_t vreinterpret_f32_u8 (uint8x8_t t);
16282#define vreinterpret_f32_u8 vreinterpret_f32_u32
16283
16284
16285_NEON2SSE_GLOBAL float32x2_t vreinterpret_f32_s32 (int32x2_t t);
16286#define vreinterpret_f32_s32 vreinterpret_f32_u32
16287
16288
16289_NEON2SSE_GLOBAL float32x2_t vreinterpret_f32_s16 (int16x4_t t);
16290#define vreinterpret_f32_s16 vreinterpret_f32_u32
16291
16292_NEON2SSE_GLOBAL float32x2_t vreinterpret_f32_s8 (int8x8_t t);
16293#define vreinterpret_f32_s8 vreinterpret_f32_u32
16294
16295
16296_NEON2SSE_GLOBAL float32x2_t vreinterpret_f32_u64(uint64x1_t t);
16297#define vreinterpret_f32_u64 vreinterpret_f32_u32
16298
16299
16300_NEON2SSE_GLOBAL float32x2_t vreinterpret_f32_s64 (int64x1_t t);
16301#define vreinterpret_f32_s64 vreinterpret_f32_u32
16302
16303
16304_NEON2SSE_GLOBAL float32x2_t vreinterpret_f32_p16 (poly16x4_t t);
16305#define vreinterpret_f32_p16 vreinterpret_f32_u32
16306
16307_NEON2SSE_GLOBAL float32x2_t vreinterpret_f32_p8 (poly8x8_t t);
16308#define vreinterpret_f32_p8 vreinterpret_f32_u32
16309
16310_NEON2SSE_GLOBAL float32x4_t vreinterpretq_f32_u32 (uint32x4_t t);
16311#define vreinterpretq_f32_u32(t) _M128(t)
16312
16313_NEON2SSE_GLOBAL float32x4_t vreinterpretq_f32_u16 (uint16x8_t t);
16314#define vreinterpretq_f32_u16 vreinterpretq_f32_u32
16315
16316_NEON2SSE_GLOBAL float32x4_t vreinterpretq_f32_u8 (uint8x16_t t);
16317#define vreinterpretq_f32_u8 vreinterpretq_f32_u32
16318
16319_NEON2SSE_GLOBAL float32x4_t vreinterpretq_f32_s32 (int32x4_t t);
16320#define vreinterpretq_f32_s32 vreinterpretq_f32_u32
16321
16322_NEON2SSE_GLOBAL float32x4_t vreinterpretq_f32_s16 (int16x8_t t);
16323#define vreinterpretq_f32_s16 vreinterpretq_f32_u32
16324
16325_NEON2SSE_GLOBAL float32x4_t vreinterpretq_f32_s8 (int8x16_t t);
16326#define vreinterpretq_f32_s8 vreinterpretq_f32_u32
16327
16328_NEON2SSE_GLOBAL float32x4_t vreinterpretq_f32_u64 (uint64x2_t t);
16329#define vreinterpretq_f32_u64 vreinterpretq_f32_u32
16330
16331_NEON2SSE_GLOBAL float32x4_t vreinterpretq_f32_s64 (int64x2_t t);
16332#define vreinterpretq_f32_s64 vreinterpretq_f32_u32
16333
16334_NEON2SSE_GLOBAL float32x4_t vreinterpretq_f32_p16 (poly16x8_t t);
16335#define vreinterpretq_f32_p16 vreinterpretq_f32_u32
16336
16337_NEON2SSE_GLOBAL float32x4_t vreinterpretq_f32_p8 (poly8x16_t t);
16338#define vreinterpretq_f32_p8 vreinterpretq_f32_u32
16339
16340//*** Integer type conversions ******************
//no conversion is necessary for the following functions because the underlying data type is the same
16342_NEON2SSE_GLOBAL int64x1_t vreinterpret_s64_u32 (uint32x2_t t);
16343#define vreinterpret_s64_u32
16344
16345_NEON2SSE_GLOBAL int64x1_t vreinterpret_s64_u16 (uint16x4_t t);
16346#define vreinterpret_s64_u16
16347
16348_NEON2SSE_GLOBAL int64x1_t vreinterpret_s64_u8 (uint8x8_t t);
16349#define vreinterpret_s64_u8
16350
16351_NEON2SSE_GLOBAL int64x1_t vreinterpret_s64_s32 (int32x2_t t);
16352#define vreinterpret_s64_s32
16353
16354_NEON2SSE_GLOBAL int64x1_t vreinterpret_s64_s16 (int16x4_t t);
16355#define vreinterpret_s64_s16
16356
16357_NEON2SSE_GLOBAL int64x1_t vreinterpret_s64_s8 (int8x8_t t);
16358#define vreinterpret_s64_s8
16359
16360_NEON2SSE_GLOBAL int64x1_t vreinterpret_s64_u64 (uint64x1_t t);
16361#define vreinterpret_s64_u64
16362
16363_NEON2SSE_GLOBAL int64x1_t vreinterpret_s64_f32 (float32x2_t t);
16364#define vreinterpret_s64_f32
16365
16366_NEON2SSE_GLOBAL int64x1_t vreinterpret_s64_p16 (poly16x4_t t);
16367#define vreinterpret_s64_p16
16368
16369_NEON2SSE_GLOBAL int64x1_t vreinterpret_s64_p8 (poly8x8_t t);
16370#define vreinterpret_s64_p8
16371
16372_NEON2SSE_GLOBAL int64x2_t vreinterpretq_s64_u32 (uint32x4_t t);
16373#define vreinterpretq_s64_u32
16374
_NEON2SSE_GLOBAL int64x2_t vreinterpretq_s64_s16 (int16x8_t t);
16376#define vreinterpretq_s64_s16
16377
16378_NEON2SSE_GLOBAL int64x2_t vreinterpretq_s64_u8 (uint8x16_t t);
16379#define vreinterpretq_s64_u8
16380
16381_NEON2SSE_GLOBAL int64x2_t vreinterpretq_s64_s32 (int32x4_t t);
16382#define vreinterpretq_s64_s32
16383
_NEON2SSE_GLOBAL int64x2_t vreinterpretq_s64_u16 (uint16x8_t t);
16385#define vreinterpretq_s64_u16
16386
16387_NEON2SSE_GLOBAL int64x2_t vreinterpretq_s64_s8 (int8x16_t t);
16388#define vreinterpretq_s64_s8
16389
16390_NEON2SSE_GLOBAL int64x2_t vreinterpretq_s64_u64 (uint64x2_t t);
16391#define vreinterpretq_s64_u64
16392
16393_NEON2SSE_GLOBAL int64x2_t vreinterpretq_s64_f32 (float32x4_t t);
16394#define vreinterpretq_s64_f32(t) _M128i(t)
16395
16396_NEON2SSE_GLOBAL int64x2_t vreinterpretq_s64_p16 (poly16x8_t t);
16397#define vreinterpretq_s64_p16
16398
16399_NEON2SSE_GLOBAL int64x2_t vreinterpretq_s64_p8 (poly8x16_t t);
16400#define vreinterpretq_s64_p8
16401
16402_NEON2SSE_GLOBAL uint64x1_t vreinterpret_u64_u32 (uint32x2_t t);
16403#define vreinterpret_u64_u32
16404
16405_NEON2SSE_GLOBAL uint64x1_t vreinterpret_u64_u16 (uint16x4_t t);
16406#define vreinterpret_u64_u16
16407
16408_NEON2SSE_GLOBAL uint64x1_t vreinterpret_u64_u8 (uint8x8_t t);
16409#define vreinterpret_u64_u8
16410
16411_NEON2SSE_GLOBAL uint64x1_t vreinterpret_u64_s32 (int32x2_t t);
16412#define vreinterpret_u64_s32
16413
16414_NEON2SSE_GLOBAL uint64x1_t vreinterpret_u64_s16 (int16x4_t t);
16415#define vreinterpret_u64_s16
16416
16417_NEON2SSE_GLOBAL uint64x1_t vreinterpret_u64_s8 (int8x8_t t);
16418#define vreinterpret_u64_s8
16419
16420_NEON2SSE_GLOBAL uint64x1_t vreinterpret_u64_s64 (int64x1_t t);
16421#define vreinterpret_u64_s64
16422
16423_NEON2SSE_GLOBAL uint64x1_t vreinterpret_u64_f32 (float32x2_t t);
16424#define vreinterpret_u64_f32
16425
16426_NEON2SSE_GLOBAL uint64x1_t vreinterpret_u64_p16 (poly16x4_t t);
16427#define vreinterpret_u64_p16
16428
16429_NEON2SSE_GLOBAL uint64x1_t vreinterpret_u64_p8 (poly8x8_t t);
16430#define vreinterpret_u64_p8
16431
16432_NEON2SSE_GLOBAL uint64x2_t vreinterpretq_u64_u32 (uint32x4_t t);
16433#define vreinterpretq_u64_u32
16434
16435_NEON2SSE_GLOBAL uint64x2_t vreinterpretq_u64_u16 (uint16x8_t t);
16436#define vreinterpretq_u64_u16
16437
16438_NEON2SSE_GLOBAL uint64x2_t vreinterpretq_u64_u8 (uint8x16_t t);
16439#define vreinterpretq_u64_u8
16440
16441_NEON2SSE_GLOBAL uint64x2_t vreinterpretq_u64_s32 (int32x4_t t);
16442#define vreinterpretq_u64_s32
16443
16444_NEON2SSE_GLOBAL uint64x2_t vreinterpretq_u64_s16 (int16x8_t t);
16445#define vreinterpretq_u64_s16
16446
16447_NEON2SSE_GLOBAL uint64x2_t vreinterpretq_u64_s8 (int8x16_t t);
16448#define vreinterpretq_u64_s8
16449
16450_NEON2SSE_GLOBAL uint64x2_t vreinterpretq_u64_s64 (int64x2_t t);
16451#define vreinterpretq_u64_s64
16452
16453_NEON2SSE_GLOBAL uint64x2_t vreinterpretq_u64_f32 (float32x4_t t);
16454#define vreinterpretq_u64_f32(t) _M128i(t)
16455
16456_NEON2SSE_GLOBAL uint64x2_t vreinterpretq_u64_p16 (poly16x8_t t);
16457#define vreinterpretq_u64_p16
16458
16459_NEON2SSE_GLOBAL uint64x2_t vreinterpretq_u64_p8 (poly8x16_t t);
16460#define vreinterpretq_u64_p8
16461
16462_NEON2SSE_GLOBAL int8x8_t vreinterpret_s8_u32 (uint32x2_t t);
16463#define vreinterpret_s8_u32
16464
16465_NEON2SSE_GLOBAL int8x8_t vreinterpret_s8_u16 (uint16x4_t t);
16466#define vreinterpret_s8_u16
16467
16468_NEON2SSE_GLOBAL int8x8_t vreinterpret_s8_u8 (uint8x8_t t);
16469#define vreinterpret_s8_u8
16470
16471_NEON2SSE_GLOBAL int8x8_t vreinterpret_s8_s32 (int32x2_t t);
16472#define vreinterpret_s8_s32
16473
16474_NEON2SSE_GLOBAL int8x8_t vreinterpret_s8_s16 (int16x4_t t);
16475#define vreinterpret_s8_s16
16476
16477_NEON2SSE_GLOBAL int8x8_t vreinterpret_s8_u64 (uint64x1_t t);
16478#define vreinterpret_s8_u64
16479
16480_NEON2SSE_GLOBAL int8x8_t vreinterpret_s8_s64 (int64x1_t t);
16481#define vreinterpret_s8_s64
16482
16483_NEON2SSE_GLOBAL int8x8_t vreinterpret_s8_f32 (float32x2_t t);
16484#define vreinterpret_s8_f32
16485
16486_NEON2SSE_GLOBAL int8x8_t vreinterpret_s8_p16 (poly16x4_t t);
16487#define vreinterpret_s8_p16
16488
16489_NEON2SSE_GLOBAL int8x8_t vreinterpret_s8_p8 (poly8x8_t t);
16490#define vreinterpret_s8_p8
16491
16492_NEON2SSE_GLOBAL int8x16_t vreinterpretq_s8_u32 (uint32x4_t t);
16493#define vreinterpretq_s8_u32
16494
16495_NEON2SSE_GLOBAL int8x16_t vreinterpretq_s8_u16 (uint16x8_t t);
16496#define vreinterpretq_s8_u16
16497
16498_NEON2SSE_GLOBAL int8x16_t vreinterpretq_s8_u8 (uint8x16_t t);
16499#define vreinterpretq_s8_u8
16500
16501_NEON2SSE_GLOBAL int8x16_t vreinterpretq_s8_s32 (int32x4_t t);
16502#define vreinterpretq_s8_s32
16503
16504_NEON2SSE_GLOBAL int8x16_t vreinterpretq_s8_s16 (int16x8_t t);
16505#define vreinterpretq_s8_s16
16506
16507_NEON2SSE_GLOBAL int8x16_t vreinterpretq_s8_u64 (uint64x2_t t);
16508#define vreinterpretq_s8_u64
16509
16510_NEON2SSE_GLOBAL int8x16_t vreinterpretq_s8_s64 (int64x2_t t);
16511#define vreinterpretq_s8_s64
16512
16513_NEON2SSE_GLOBAL int8x16_t vreinterpretq_s8_f32 (float32x4_t t);
16514#define vreinterpretq_s8_f32(t) _M128i(t)
16515
16516_NEON2SSE_GLOBAL int8x16_t vreinterpretq_s8_p16 (poly16x8_t t);
16517#define vreinterpretq_s8_p16
16518
16519_NEON2SSE_GLOBAL int8x16_t vreinterpretq_s8_p8 (poly8x16_t t);
16520#define vreinterpretq_s8_p8
16521
16522_NEON2SSE_GLOBAL int16x4_t vreinterpret_s16_u32 (uint32x2_t t);
16523#define vreinterpret_s16_u32
16524
16525_NEON2SSE_GLOBAL int16x4_t vreinterpret_s16_u16 (uint16x4_t t);
16526#define vreinterpret_s16_u16
16527
16528_NEON2SSE_GLOBAL int16x4_t vreinterpret_s16_u8 (uint8x8_t t);
16529#define vreinterpret_s16_u8
16530
16531_NEON2SSE_GLOBAL int16x4_t vreinterpret_s16_s32 (int32x2_t t);
16532#define vreinterpret_s16_s32
16533
16534_NEON2SSE_GLOBAL int16x4_t vreinterpret_s16_s8 (int8x8_t t);
16535#define vreinterpret_s16_s8
16536
16537_NEON2SSE_GLOBAL int16x4_t vreinterpret_s16_u64 (uint64x1_t t);
16538#define vreinterpret_s16_u64
16539
16540_NEON2SSE_GLOBAL int16x4_t vreinterpret_s16_s64 (int64x1_t t);
16541#define vreinterpret_s16_s64
16542
16543_NEON2SSE_GLOBAL int16x4_t vreinterpret_s16_f32 (float32x2_t t);
16544#define vreinterpret_s16_f32
16545
16546
16547_NEON2SSE_GLOBAL int16x4_t vreinterpret_s16_p16 (poly16x4_t t);
16548#define vreinterpret_s16_p16
16549
16550_NEON2SSE_GLOBAL int16x4_t vreinterpret_s16_p8 (poly8x8_t t);
16551#define vreinterpret_s16_p8
16552
16553_NEON2SSE_GLOBAL int16x8_t vreinterpretq_s16_u32 (uint32x4_t t);
16554#define vreinterpretq_s16_u32
16555
16556_NEON2SSE_GLOBAL int16x8_t vreinterpretq_s16_u16 (uint16x8_t t);
16557#define vreinterpretq_s16_u16
16558
16559_NEON2SSE_GLOBAL int16x8_t vreinterpretq_s16_u8 (uint8x16_t t);
16560#define vreinterpretq_s16_u8
16561
16562_NEON2SSE_GLOBAL int16x8_t vreinterpretq_s16_s32 (int32x4_t t);
16563#define vreinterpretq_s16_s32
16564
16565_NEON2SSE_GLOBAL int16x8_t vreinterpretq_s16_s8 (int8x16_t t);
16566#define vreinterpretq_s16_s8
16567
_NEON2SSE_GLOBAL int16x8_t vreinterpretq_s16_u64 (uint64x2_t t);
#define vreinterpretq_s16_u64

_NEON2SSE_GLOBAL int16x8_t vreinterpretq_s16_s64 (int64x2_t t);
#define vreinterpretq_s16_s64

_NEON2SSE_GLOBAL int16x8_t vreinterpretq_s16_f32 (float32x4_t t);
#define vreinterpretq_s16_f32(t) _M128i(t)

_NEON2SSE_GLOBAL int16x8_t vreinterpretq_s16_p16 (poly16x8_t t);
#define vreinterpretq_s16_p16

_NEON2SSE_GLOBAL int16x8_t vreinterpretq_s16_p8 (poly8x16_t t);
#define vreinterpretq_s16_p8

_NEON2SSE_GLOBAL int32x2_t vreinterpret_s32_u32 (uint32x2_t t);
#define vreinterpret_s32_u32

_NEON2SSE_GLOBAL int32x2_t vreinterpret_s32_u16 (uint16x4_t t);
#define vreinterpret_s32_u16

_NEON2SSE_GLOBAL int32x2_t vreinterpret_s32_u8 (uint8x8_t t);
#define vreinterpret_s32_u8

_NEON2SSE_GLOBAL int32x2_t vreinterpret_s32_s16 (int16x4_t t);
#define vreinterpret_s32_s16

_NEON2SSE_GLOBAL int32x2_t vreinterpret_s32_s8 (int8x8_t t);
#define vreinterpret_s32_s8

_NEON2SSE_GLOBAL int32x2_t vreinterpret_s32_u64 (uint64x1_t t);
#define vreinterpret_s32_u64

_NEON2SSE_GLOBAL int32x2_t vreinterpret_s32_s64 (int64x1_t t);
#define vreinterpret_s32_s64

_NEON2SSE_GLOBAL int32x2_t vreinterpret_s32_f32 (float32x2_t t);
#define vreinterpret_s32_f32

_NEON2SSE_GLOBAL int32x2_t vreinterpret_s32_p16 (poly16x4_t t);
#define vreinterpret_s32_p16

_NEON2SSE_GLOBAL int32x2_t vreinterpret_s32_p8 (poly8x8_t t);
#define vreinterpret_s32_p8

_NEON2SSE_GLOBAL int32x4_t vreinterpretq_s32_u32 (uint32x4_t t);
#define vreinterpretq_s32_u32

_NEON2SSE_GLOBAL int32x4_t vreinterpretq_s32_u16 (uint16x8_t t);
#define vreinterpretq_s32_u16

_NEON2SSE_GLOBAL int32x4_t vreinterpretq_s32_u8 (uint8x16_t t);
#define vreinterpretq_s32_u8

_NEON2SSE_GLOBAL int32x4_t vreinterpretq_s32_s16 (int16x8_t t);
#define vreinterpretq_s32_s16

_NEON2SSE_GLOBAL int32x4_t vreinterpretq_s32_s8 (int8x16_t t);
#define vreinterpretq_s32_s8

_NEON2SSE_GLOBAL int32x4_t vreinterpretq_s32_u64 (uint64x2_t t);
#define vreinterpretq_s32_u64

_NEON2SSE_GLOBAL int32x4_t vreinterpretq_s32_s64 (int64x2_t t);
#define vreinterpretq_s32_s64

_NEON2SSE_GLOBAL int32x4_t vreinterpretq_s32_f32 (float32x4_t t);
#define vreinterpretq_s32_f32(t) _M128i(t)

_NEON2SSE_GLOBAL int32x4_t vreinterpretq_s32_p16 (poly16x8_t t);
#define vreinterpretq_s32_p16

_NEON2SSE_GLOBAL int32x4_t vreinterpretq_s32_p8 (poly8x16_t t);
#define vreinterpretq_s32_p8

_NEON2SSE_GLOBAL uint8x8_t vreinterpret_u8_u32 (uint32x2_t t);
#define vreinterpret_u8_u32

_NEON2SSE_GLOBAL uint8x8_t vreinterpret_u8_u16 (uint16x4_t t);
#define vreinterpret_u8_u16

_NEON2SSE_GLOBAL uint8x8_t vreinterpret_u8_s32 (int32x2_t t);
#define vreinterpret_u8_s32

_NEON2SSE_GLOBAL uint8x8_t vreinterpret_u8_s16 (int16x4_t t);
#define vreinterpret_u8_s16

_NEON2SSE_GLOBAL uint8x8_t vreinterpret_u8_s8 (int8x8_t t);
#define vreinterpret_u8_s8

_NEON2SSE_GLOBAL uint8x8_t vreinterpret_u8_u64 (uint64x1_t t);
#define vreinterpret_u8_u64

_NEON2SSE_GLOBAL uint8x8_t vreinterpret_u8_s64 (int64x1_t t);
#define vreinterpret_u8_s64

_NEON2SSE_GLOBAL uint8x8_t vreinterpret_u8_f32 (float32x2_t t);
#define vreinterpret_u8_f32

_NEON2SSE_GLOBAL uint8x8_t vreinterpret_u8_p16 (poly16x4_t t);
#define vreinterpret_u8_p16

_NEON2SSE_GLOBAL uint8x8_t vreinterpret_u8_p8 (poly8x8_t t);
#define vreinterpret_u8_p8

_NEON2SSE_GLOBAL uint8x16_t vreinterpretq_u8_u32 (uint32x4_t t);
#define vreinterpretq_u8_u32

_NEON2SSE_GLOBAL uint8x16_t vreinterpretq_u8_u16 (uint16x8_t t);
#define vreinterpretq_u8_u16

_NEON2SSE_GLOBAL uint8x16_t vreinterpretq_u8_s32 (int32x4_t t);
#define vreinterpretq_u8_s32

_NEON2SSE_GLOBAL uint8x16_t vreinterpretq_u8_s16 (int16x8_t t);
#define vreinterpretq_u8_s16

_NEON2SSE_GLOBAL uint8x16_t vreinterpretq_u8_s8 (int8x16_t t);
#define vreinterpretq_u8_s8

_NEON2SSE_GLOBAL uint8x16_t vreinterpretq_u8_u64 (uint64x2_t t);
#define vreinterpretq_u8_u64

_NEON2SSE_GLOBAL uint8x16_t vreinterpretq_u8_s64 (int64x2_t t);
#define vreinterpretq_u8_s64

_NEON2SSE_GLOBAL uint8x16_t vreinterpretq_u8_f32 (float32x4_t t);
#define vreinterpretq_u8_f32(t) _M128i(t)

_NEON2SSE_GLOBAL uint8x16_t vreinterpretq_u8_p16 (poly16x8_t t);
#define vreinterpretq_u8_p16

_NEON2SSE_GLOBAL uint8x16_t vreinterpretq_u8_p8 (poly8x16_t t);
#define vreinterpretq_u8_p8

_NEON2SSE_GLOBAL uint16x4_t vreinterpret_u16_u32 (uint32x2_t t);
#define vreinterpret_u16_u32

_NEON2SSE_GLOBAL uint16x4_t vreinterpret_u16_u8 (uint8x8_t t);
#define vreinterpret_u16_u8

_NEON2SSE_GLOBAL uint16x4_t vreinterpret_u16_s32 (int32x2_t t);
#define vreinterpret_u16_s32

_NEON2SSE_GLOBAL uint16x4_t vreinterpret_u16_s16 (int16x4_t t);
#define vreinterpret_u16_s16

_NEON2SSE_GLOBAL uint16x4_t vreinterpret_u16_s8 (int8x8_t t);
#define vreinterpret_u16_s8

_NEON2SSE_GLOBAL uint16x4_t vreinterpret_u16_u64 (uint64x1_t t);
#define vreinterpret_u16_u64

_NEON2SSE_GLOBAL uint16x4_t vreinterpret_u16_s64 (int64x1_t t);
#define vreinterpret_u16_s64

_NEON2SSE_GLOBAL uint16x4_t vreinterpret_u16_f32 (float32x2_t t);
#define vreinterpret_u16_f32

_NEON2SSE_GLOBAL uint16x4_t vreinterpret_u16_p16 (poly16x4_t t);
#define vreinterpret_u16_p16

_NEON2SSE_GLOBAL uint16x4_t vreinterpret_u16_p8 (poly8x8_t t);
#define vreinterpret_u16_p8

_NEON2SSE_GLOBAL uint16x8_t vreinterpretq_u16_u32 (uint32x4_t t);
#define vreinterpretq_u16_u32

_NEON2SSE_GLOBAL uint16x8_t vreinterpretq_u16_u8 (uint8x16_t t);
#define vreinterpretq_u16_u8

_NEON2SSE_GLOBAL uint16x8_t vreinterpretq_u16_s32 (int32x4_t t);
#define vreinterpretq_u16_s32

_NEON2SSE_GLOBAL uint16x8_t vreinterpretq_u16_s16 (int16x8_t t);
#define vreinterpretq_u16_s16

_NEON2SSE_GLOBAL uint16x8_t vreinterpretq_u16_s8 (int8x16_t t);
#define vreinterpretq_u16_s8

_NEON2SSE_GLOBAL uint16x8_t vreinterpretq_u16_u64 (uint64x2_t t);
#define vreinterpretq_u16_u64

_NEON2SSE_GLOBAL uint16x8_t vreinterpretq_u16_s64 (int64x2_t t);
#define vreinterpretq_u16_s64

_NEON2SSE_GLOBAL uint16x8_t vreinterpretq_u16_f32 (float32x4_t t);
#define vreinterpretq_u16_f32(t) _M128i(t)

_NEON2SSE_GLOBAL uint16x8_t vreinterpretq_u16_p16 (poly16x8_t t);
#define vreinterpretq_u16_p16

_NEON2SSE_GLOBAL uint16x8_t vreinterpretq_u16_p8 (poly8x16_t t);
#define vreinterpretq_u16_p8

_NEON2SSE_GLOBAL uint32x2_t vreinterpret_u32_u16 (uint16x4_t t);
#define vreinterpret_u32_u16

_NEON2SSE_GLOBAL uint32x2_t vreinterpret_u32_u8 (uint8x8_t t);
#define vreinterpret_u32_u8

_NEON2SSE_GLOBAL uint32x2_t vreinterpret_u32_s32 (int32x2_t t);
#define vreinterpret_u32_s32

_NEON2SSE_GLOBAL uint32x2_t vreinterpret_u32_s16 (int16x4_t t);
#define vreinterpret_u32_s16

_NEON2SSE_GLOBAL uint32x2_t vreinterpret_u32_s8 (int8x8_t t);
#define vreinterpret_u32_s8

_NEON2SSE_GLOBAL uint32x2_t vreinterpret_u32_u64 (uint64x1_t t);
#define vreinterpret_u32_u64

_NEON2SSE_GLOBAL uint32x2_t vreinterpret_u32_s64 (int64x1_t t);
#define vreinterpret_u32_s64

_NEON2SSE_GLOBAL uint32x2_t vreinterpret_u32_f32 (float32x2_t t);
#define vreinterpret_u32_f32

_NEON2SSE_GLOBAL uint32x2_t vreinterpret_u32_p16 (poly16x4_t t);
#define vreinterpret_u32_p16

_NEON2SSE_GLOBAL uint32x2_t vreinterpret_u32_p8 (poly8x8_t t);
#define vreinterpret_u32_p8

_NEON2SSE_GLOBAL uint32x4_t vreinterpretq_u32_u16 (uint16x8_t t);
#define vreinterpretq_u32_u16

_NEON2SSE_GLOBAL uint32x4_t vreinterpretq_u32_u8 (uint8x16_t t);
#define vreinterpretq_u32_u8

_NEON2SSE_GLOBAL uint32x4_t vreinterpretq_u32_s32 (int32x4_t t);
#define vreinterpretq_u32_s32

_NEON2SSE_GLOBAL uint32x4_t vreinterpretq_u32_s16 (int16x8_t t);
#define vreinterpretq_u32_s16

_NEON2SSE_GLOBAL uint32x4_t vreinterpretq_u32_s8 (int8x16_t t);
#define vreinterpretq_u32_s8

_NEON2SSE_GLOBAL uint32x4_t vreinterpretq_u32_u64 (uint64x2_t t);
#define vreinterpretq_u32_u64

_NEON2SSE_GLOBAL uint32x4_t vreinterpretq_u32_s64 (int64x2_t t);
#define vreinterpretq_u32_s64

_NEON2SSE_GLOBAL uint32x4_t vreinterpretq_u32_f32 (float32x4_t t);
#define vreinterpretq_u32_f32(t) _M128i(t)

_NEON2SSE_GLOBAL uint32x4_t vreinterpretq_u32_p16 (poly16x8_t t);
#define vreinterpretq_u32_p16

_NEON2SSE_GLOBAL uint32x4_t vreinterpretq_u32_p8 (poly8x16_t t);
#define vreinterpretq_u32_p8
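
//A short usage sketch for the reinterpret casts above. Because all 128-bit integer and polynomial
//vector types in this file map to __m128i, the integer<->integer reinterprets are defined as empty
//object-like macros: vreinterpretq_X_Y(t) simply expands to (t) at no run-time cost. Only the
//float32x4_t sources need the explicit __m128 -> __m128i cast _M128i. The example below assumes
//the vdupq_n_f32 mapping defined earlier in this file:
//
//    float32x4_t vf   = vdupq_n_f32(1.0f);          //{1.0f, 1.0f, 1.0f, 1.0f}
//    int32x4_t   bits = vreinterpretq_s32_f32(vf);  //expands to _M128i(vf): 0x3F800000 in each lane
//    uint8x16_t  raw  = vreinterpretq_u8_s32(bits); //empty macro, expands to (bits) - both are __m128i
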
//************* Round ******************
_NEON2SSESTORAGE float32x4_t vrndnq_f32(float32x4_t a);
#ifdef USE_SSE4
_NEON2SSE_INLINE float32x4_t vrndnq_f32(float32x4_t a)
{
    return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
#else
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( float32x4_t vrndnq_f32(float32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
{
    int i;
    _NEON2SSE_ALIGN_16 float32_t res[4];
    _mm_store_ps(res, a);
    for(i = 0; i<4; i++) {
        res[i] = nearbyintf(res[i]);
    }
    return _mm_load_ps(res);
}
#endif
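
//A minimal usage sketch for vrndnq_f32, assuming the vdupq_n_f32 and vgetq_lane_f32 mappings
//defined earlier in this file. Both branches implement NEON VRINTN (round to nearest, ties to even):
//the SSE4.1 branch via _MM_FROUND_TO_NEAREST_INT, the serial branch via nearbyintf(), which follows
//the current rounding mode and therefore matches only in the default round-to-nearest FP environment:
//
//    float32x4_t v  = vdupq_n_f32(2.5f);
//    float32x4_t r  = vrndnq_f32(v);        //each lane becomes 2.0f (halfway case rounds to even)
//    float       r0 = vgetq_lane_f32(r, 0); //2.0f
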
_NEON2SSESTORAGE float64x2_t vrndnq_f64(float64x2_t a);
#ifdef USE_SSE4
_NEON2SSE_INLINE float64x2_t vrndnq_f64(float64x2_t a)
{
    return _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
#else
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float64x2_t vrndnq_f64(float64x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
{
    _NEON2SSE_ALIGN_16 float64_t res[2];
    _mm_store_pd(res, a);
    res[0] = nearbyint(res[0]);
    res[1] = nearbyint(res[1]);
    return _mm_load_pd(res);
}
#endif

//************* Sqrt ******************
_NEON2SSE_GLOBAL float32x4_t vsqrtq_f32(float32x4_t a);
#define vsqrtq_f32 _mm_sqrt_ps

_NEON2SSE_GLOBAL float64x2_t vsqrtq_f64(float64x2_t a);
#define vsqrtq_f64 _mm_sqrt_pd
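
//A minimal usage sketch, assuming the vdupq_n_f32 mapping defined earlier in this file. vsqrtq_f32
//and vsqrtq_f64 map one-to-one onto _mm_sqrt_ps / _mm_sqrt_pd, which return the correctly rounded
//square root per lane, matching the result of the AArch64 FSQRT instruction behind NEON vsqrtq:
//
//    float32x4_t x = vdupq_n_f32(9.0f);
//    float32x4_t s = vsqrtq_f32(x);         //{3.0f, 3.0f, 3.0f, 3.0f}
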

#endif /* NEON2SSE_H */