//created by Victoria Zhislina, Senior Application Engineer, Intel Corporation, [email protected]

//*** Copyright (C) 2012-2022 Intel Corporation. All rights reserved.

//IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.

//By downloading, copying, installing or using the software you agree to this license.
//If you do not agree to this license, do not download, install, copy or use the software.

//                              License Agreement
//Redistribution and use in source and binary forms, with or without modification,
//are permitted provided that the following conditions are met:

//  * Redistributions of source code must retain the above copyright notice,
//    this list of conditions and the following disclaimer.

//  * The name of the copyright holders may not be used to endorse or promote products
//    derived from this software without specific prior written permission.

//This software is provided by the copyright holders and contributors "as is" and
//any express or implied warranties, including, but not limited to, the implied
//warranties of merchantability and fitness for a particular purpose are disclaimed.
//In no event shall the Intel Corporation or contributors be liable for any direct,
//indirect, incidental, special, exemplary, or consequential damages
//(including, but not limited to, procurement of substitute goods or services;
//loss of use, data, or profits; or business interruption) however caused
//and on any theory of liability, whether in contract, strict liability,
//or tort (including negligence or otherwise) arising in any way out of
//the use of this software, even if advised of the possibility of such damage.

//*****************************************************************************************
// This file is intended to simplify ARM->IA32 porting
// It makes the correspondence between ARM NEON intrinsics (as defined in "arm_neon.h")
// and x86 SSE (up to SSE4.2) intrinsic functions as defined in the header files below
//The MMX instruction set is not used due to its non-availability on x64 systems,
//its performance overhead and the necessity to use the EMMS instruction (_mm_empty()) for MMX/x87 floating point state switching
//*****************************************************************************************

//!!!!!!!!!!!!!! To use this file, just include it (instead of "arm_neon.h") in the project that uses ARM NEON intrinsics and compile it as usual,
//!!!!!!!!!!!!!! but please pay attention to the #define USE_SSE4 below - you might need to define it manually for the newest Intel Atom or any Intel Core platform to get greater performance.
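//!!!!!!!!!!!!!! A minimal usage sketch (illustrative only: the included file name and the function below are
//!!!!!!!!!!!!!! hypothetical examples, not part of this header):
#if 0   /* not compiled - example of a translation unit ported from ARM NEON */
#include "NEON_2_SSE.h"                   /* instead of #include <arm_neon.h> */

static int16x8_t add_rows(int16x8_t a, int16x8_t b)
{
    return vaddq_s16(a, b);               /* the NEON intrinsic call stays unchanged, SSE does the work */
}
#endif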

#ifndef NEON2SSE_H
#define NEON2SSE_H

/*********************************************************************************************************************/
//!!!!!!!!!!!!!!
//if USE_SSE4 is defined, some functions use SSE4 instructions instead of earlier SSE versions; when it is undefined, only SIMD up to SSSE3 is used
//For older devices without SSE4 support it should be left undefined; for newer devices it should be defined, possibly manually if your compiler doesn't set the __SSE4_2__ predefine (see the example below)
#ifndef USE_SSE4
# if defined(__SSE4_2__)
# define USE_SSE4
# endif
#endif
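//A hedged example of defining USE_SSE4 manually (the source file name is just an illustration):
//either pass it on the compiler command line, e.g. gcc -O2 -msse4.2 -DUSE_SSE4 ported_code.c,
//or put "#define USE_SSE4" in your source before including this header.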
/*********************************************************************************************************************/

#include <xmmintrin.h> //SSE
#include <emmintrin.h> //SSE2
#include <pmmintrin.h> //SSE3
#include <tmmintrin.h> //SSSE3
#ifdef USE_SSE4
# include <smmintrin.h> //SSE4.1
# include <nmmintrin.h> //SSE4.2
#endif

#include <math.h>

//*************** functions and data attributes, compiler dependent *********************************
//***********************************************************************************
#ifdef __GNUC__
# define _GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
# define _NEON2SSESTORAGE static
# define _NEON2SSE_ALIGN_16 __attribute__((aligned(16)))
# ifdef __clang__
# define _NEON2SSE_INLINE _NEON2SSESTORAGE inline __attribute__((__gnu_inline__, __always_inline__))
# else
# define _NEON2SSE_INLINE _NEON2SSESTORAGE inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
# endif
# ifndef NEON2SSE_DISABLE_PERFORMANCE_WARNING
# if _GCC_VERSION < 40500
# define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) __attribute__((deprecated)) function
# else
# define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) __attribute__((deprecated(explanation))) function
# endif
# else
# define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) function
# endif
# if defined(__x86_64__)
# define _NEON2SSE_64BIT __x86_64__
# endif
#else
# define _NEON2SSESTORAGE static
# define _NEON2SSE_ALIGN_16 __declspec(align(16))
# define _NEON2SSE_INLINE _NEON2SSESTORAGE __inline
# if (defined(_MSC_VER) || defined (__INTEL_COMPILER)) && !defined(NEON2SSE_DISABLE_PERFORMANCE_WARNING)
# define _NEON2SSE_PERFORMANCE_WARNING(function, EXPLANATION) __declspec(deprecated(EXPLANATION)) function
# if defined(_M_X64)
# define _NEON2SSE_64BIT _M_X64
# endif
# else
# define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) function
# endif
#endif

/* Used to mark the intrinsics that are declared as functions, but implemented as macros */
#define _NEON2SSE_GLOBAL

#if defined (_NEON2SSE_64BIT) && defined (USE_SSE4)
# define _NEON2SSE_64BIT_SSE4
#endif

#ifndef UNREFERENCED_PARAMETER
# define UNREFERENCED_PARAMETER(P) ((void)(P))
#endif

/*********************************************************************************************************************/
// data types conversion
/*********************************************************************************************************************/
#if defined(_MSC_VER) && (_MSC_VER < 1300)
    typedef signed char int8_t;
    typedef unsigned char uint8_t;
    typedef signed short int16_t;
    typedef unsigned short uint16_t;
    typedef signed int int32_t;
    typedef unsigned int uint32_t;
    typedef signed long long int64_t;
    typedef unsigned long long uint64_t;
#elif defined(_MSC_VER)
    typedef signed __int8 int8_t;
    typedef unsigned __int8 uint8_t;
    typedef signed __int16 int16_t;
    typedef unsigned __int16 uint16_t;
    typedef signed __int32 int32_t;
    typedef unsigned __int32 uint32_t;

    typedef signed long long int64_t;
    typedef unsigned long long uint64_t;
#else
# include <stdint.h>
# include <limits.h>
#endif


typedef float float32_t;
#if !defined(__clang__)
typedef float __fp16;
#endif

typedef double float64_t;

typedef union __m64_128 {
    uint64_t m64_u64[1];
    int64_t m64_i64[1];
    float64_t m64_d64[1];
    uint32_t m64_u32[2];
    int32_t m64_i32[2];
    float32_t m64_f32[2];
    int16_t m64_i16[4];
    uint16_t m64_u16[4];
    int8_t m64_i8[8];
    uint8_t m64_u8[8];
} __m64_128;

typedef __m64_128 int8x8_t;
typedef __m64_128 uint8x8_t;
typedef __m64_128 int16x4_t;
typedef __m64_128 uint16x4_t;
typedef __m64_128 int32x2_t;
typedef __m64_128 uint32x2_t;
typedef __m64_128 int64x1_t;
typedef __m64_128 uint64x1_t;
typedef __m64_128 poly8x8_t;
typedef __m64_128 poly16x4_t;

typedef __m64_128 float32x2_t;
typedef __m128 float32x4_t;

typedef __m128 float16x4_t; //not supported by IA, for compatibility
typedef __m128 float16x8_t; //not supported by IA, for compatibility

typedef __m64_128 float64x1_t;
typedef __m128d float64x2_t;

typedef __m128i int8x16_t;
typedef __m128i int16x8_t;
typedef __m128i int32x4_t;
typedef __m128i int64x2_t;
typedef __m128i uint8x16_t;
typedef __m128i uint16x8_t;
typedef __m128i uint32x4_t;
typedef __m128i uint64x2_t;
typedef __m128i poly8x16_t;
typedef __m128i poly16x8_t;
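//A minimal illustration of the scheme above (hypothetical variables, not compiled): 64-bit "d-register"
//values live in the __m64_128 union and their lanes are plain union members, while 128-bit "q-register"
//values are native SSE registers handled only through intrinsics.
#if 0
int32x2_t  d;
uint32x4_t q;
d.m64_i32[0] = 1;            /* lane 0 of a d-register value */
d.m64_i32[1] = 2;            /* lane 1 */
q = _mm_set1_epi32(7);       /* q-register values are ordinary __m128i */
#endif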

#if defined(_MSC_VER)
# define SINT_MIN (-2147483647 - 1) /* min signed int value */
# define SINT_MAX 2147483647 /* max signed int value */
#else
# define SINT_MIN INT_MIN /* min signed int value */
# define SINT_MAX INT_MAX /* max signed int value */
#endif

typedef uint8_t poly8_t;
typedef uint16_t poly16_t;


//MSVC compilers (tested up to the VS 2012 version) don't allow using structures or arrays of __m128x types as function arguments, resulting in
//error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned. To avoid it we need a special trick for the functions that use these types
struct int8x16x2_t {
    int8x16_t val[2];
};
struct int16x8x2_t {
    int16x8_t val[2];
};
struct int32x4x2_t {
    int32x4_t val[2];
};
struct int64x2x2_t {
    int64x2_t val[2];
};
//Unfortunately we are unable to merge the two 64-bit halves into one 128-bit register because the user should be able to access the val[n] members explicitly!!!
struct int8x8x2_t {
    int8x8_t val[2];
};
struct int16x4x2_t {
    int16x4_t val[2];
};
struct int32x2x2_t {
    int32x2_t val[2];
};
struct int64x1x2_t {
    int64x1_t val[2];
};

typedef struct int8x16x2_t int8x16x2_t; //for C compilers to make them happy
typedef struct int16x8x2_t int16x8x2_t; //for C compilers to make them happy
typedef struct int32x4x2_t int32x4x2_t; //for C compilers to make them happy
typedef struct int64x2x2_t int64x2x2_t; //for C compilers to make them happy

typedef struct int8x8x2_t int8x8x2_t; //for C compilers to make them happy
typedef struct int16x4x2_t int16x4x2_t; //for C compilers to make them happy
typedef struct int32x2x2_t int32x2x2_t; //for C compilers to make them happy
typedef struct int64x1x2_t int64x1x2_t; //for C compilers to make them happy

/* to avoid pointer conversions the following unsigned integer structures are defined via the corresponding signed integer structures above */
typedef struct int8x16x2_t uint8x16x2_t;
typedef struct int16x8x2_t uint16x8x2_t;
typedef struct int32x4x2_t uint32x4x2_t;
typedef struct int64x2x2_t uint64x2x2_t;
typedef struct int8x16x2_t poly8x16x2_t;
typedef struct int16x8x2_t poly16x8x2_t;

typedef struct int8x8x2_t uint8x8x2_t;
typedef struct int16x4x2_t uint16x4x2_t;
typedef struct int32x2x2_t uint32x2x2_t;
typedef struct int64x1x2_t uint64x1x2_t;
typedef struct int8x8x2_t poly8x8x2_t;
typedef struct int16x4x2_t poly16x4x2_t;
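//The x2/x3/x4 structures mirror the NEON multi-vector types: results of the de-interleaving loads declared
//further down in this header (e.g. vld2q_u8, as in arm_neon.h) are accessed through the val[n] members.
//A minimal sketch (illustrative only, the buffer is hypothetical):
#if 0
uint8_t interleaved[32];                  /* e.g. 16 {U,V} byte pairs */
uint8x16x2_t uv = vld2q_u8(interleaved);  /* de-interleave into two 16-byte vectors */
uint8x16_t u = uv.val[0];                 /* all U components */
uint8x16_t v = uv.val[1];                 /* all V components */
#endif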

//float
struct float32x4x2_t {
    float32x4_t val[2];
};
struct float16x8x2_t {
    float16x8_t val[2];
};
struct float32x2x2_t {
    float32x2_t val[2];
};

typedef struct float32x4x2_t float32x4x2_t; //for C compilers to make them happy
typedef struct float16x8x2_t float16x8x2_t; //for C compilers to make them happy
typedef struct float32x2x2_t float32x2x2_t; //for C compilers to make them happy
typedef float16x8x2_t float16x4x2_t;

//4
struct int8x16x4_t {
    int8x16_t val[4];
};
struct int16x8x4_t {
    int16x8_t val[4];
};
struct int32x4x4_t {
    int32x4_t val[4];
};
struct int64x2x4_t {
    int64x2_t val[4];
};

struct int8x8x4_t {
    int8x8_t val[4];
};
struct int16x4x4_t {
    int16x4_t val[4];
};
struct int32x2x4_t {
    int32x2_t val[4];
};
struct int64x1x4_t {
    int64x1_t val[4];
};

typedef struct int8x16x4_t int8x16x4_t; //for C compilers to make them happy
typedef struct int16x8x4_t int16x8x4_t; //for C compilers to make them happy
typedef struct int32x4x4_t int32x4x4_t; //for C compilers to make them happy
typedef struct int64x2x4_t int64x2x4_t; //for C compilers to make them happy

typedef struct int8x8x4_t int8x8x4_t; //for C compilers to make them happy
typedef struct int16x4x4_t int16x4x4_t; //for C compilers to make them happy
typedef struct int32x2x4_t int32x2x4_t; //for C compilers to make them happy
typedef struct int64x1x4_t int64x1x4_t; //for C compilers to make them happy

/* to avoid pointer conversions the following unsigned integer structures are defined via the corresponding signed integer structures above: */
typedef struct int8x8x4_t uint8x8x4_t;
typedef struct int16x4x4_t uint16x4x4_t;
typedef struct int32x2x4_t uint32x2x4_t;
typedef struct int64x1x4_t uint64x1x4_t;
typedef struct int8x8x4_t poly8x8x4_t;
typedef struct int16x4x4_t poly16x4x4_t;

typedef struct int8x16x4_t uint8x16x4_t;
typedef struct int16x8x4_t uint16x8x4_t;
typedef struct int32x4x4_t uint32x4x4_t;
typedef struct int64x2x4_t uint64x2x4_t;
typedef struct int8x16x4_t poly8x16x4_t;
typedef struct int16x8x4_t poly16x8x4_t;

struct float32x4x4_t {
    float32x4_t val[4];
};
struct float16x8x4_t {
    float16x8_t val[4];
};
struct float32x2x4_t {
    float32x2_t val[4];
};

typedef struct float32x4x4_t float32x4x4_t; //for C compilers to make them happy
typedef struct float16x8x4_t float16x8x4_t; //for C compilers to make them happy
typedef struct float32x2x4_t float32x2x4_t; //for C compilers to make them happy
typedef float16x8x4_t float16x4x4_t;

//3
struct int16x8x3_t {
    int16x8_t val[3];
};
struct int32x4x3_t {
    int32x4_t val[3];
};
struct int64x2x3_t {
    int64x2_t val[3];
};
struct int8x16x3_t {
    int8x16_t val[3];
};

struct int16x4x3_t {
    int16x4_t val[3];
};
struct int32x2x3_t {
    int32x2_t val[3];
};
struct int64x1x3_t {
    int64x1_t val[3];
};
struct int8x8x3_t {
    int8x8_t val[3];
};
typedef struct int16x8x3_t int16x8x3_t; //for C compilers to make them happy
typedef struct int32x4x3_t int32x4x3_t; //for C compilers to make them happy
typedef struct int64x2x3_t int64x2x3_t; //for C compilers to make them happy
typedef struct int8x16x3_t int8x16x3_t; //for C compilers to make them happy

typedef struct int8x8x3_t int8x8x3_t; //for C compilers to make them happy
typedef struct int16x4x3_t int16x4x3_t; //for C compilers to make them happy
typedef struct int32x2x3_t int32x2x3_t; //for C compilers to make them happy
typedef struct int64x1x3_t int64x1x3_t; //for C compilers to make them happy


/* to avoid pointer conversions the following unsigned integer structures are defined via the corresponding signed integer structures above: */
typedef struct int8x16x3_t uint8x16x3_t;
typedef struct int16x8x3_t uint16x8x3_t;
typedef struct int32x4x3_t uint32x4x3_t;
typedef struct int64x2x3_t uint64x2x3_t;
typedef struct int8x16x3_t poly8x16x3_t;
typedef struct int16x8x3_t poly16x8x3_t;
typedef struct int8x8x3_t uint8x8x3_t;
typedef struct int16x4x3_t uint16x4x3_t;
typedef struct int32x2x3_t uint32x2x3_t;
typedef struct int64x1x3_t uint64x1x3_t;
typedef struct int8x8x3_t poly8x8x3_t;
typedef struct int16x4x3_t poly16x4x3_t;

//float
struct float32x4x3_t {
    float32x4_t val[3];
};
struct float32x2x3_t {
    float32x2_t val[3];
};
struct float16x8x3_t {
    float16x8_t val[3];
};

typedef struct float32x4x3_t float32x4x3_t; //for C compilers to make them happy
typedef struct float16x8x3_t float16x8x3_t; //for C compilers to make them happy
typedef struct float32x2x3_t float32x2x3_t; //for C compilers to make them happy
typedef float16x8x3_t float16x4x3_t;


//****************************************************************************
//****** Porting auxiliary macros ********************************************

//** floating point related macros **
#define _M128i(a) _mm_castps_si128(a)
#define _M128(a) _mm_castsi128_ps(a)
//here the most performance-effective implementation depends on the compiler and on the 32/64-bit build
#if defined (_NEON2SSE_64BIT) || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500) )
# define _pM128i(a) _mm_cvtsi64_si128(*(int64_t*)(&(a)))
# define _M64(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (inp);
# define _M64f(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (_M128i(inp));
#else
   //for 32-bit gcc and Microsoft compiler builds
# define _pM128i(a) _mm_loadl_epi64((__m128i*)&(a))
# define _M64(out, inp) _mm_storel_epi64 ((__m128i*)&(out), inp)
# define _M64f(out, inp) _mm_storel_epi64 ((__m128i*)&(out), _M128i(inp))
#endif
#define _pM128(a) _mm_castsi128_ps(_pM128i(a))

#define return64(a) _M64(res64,a); return res64;
#define return64f(a) _M64f(res64,a); return res64;
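//The macros above are the backbone of the 64-bit ("d-register") emulation used throughout this file:
//an operand is widened into an XMM register with _pM128i, processed with ordinary SSE intrinsics, and the
//low 64 bits are written back with _M64 / return64. A minimal sketch of the pattern (the function name is
//hypothetical, not part of this header):
#if 0
_NEON2SSE_INLINE int8x8_t example_add_d_register(int8x8_t a, int8x8_t b)
{
    int8x8_t res64;
    __m128i res = _mm_add_epi8(_pM128i(a), _pM128i(b)); /* only the low 8 bytes are meaningful */
    return64(res);                                      /* copy the low 64 bits back into the union */
}
#endif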

#define _Ui64(a) (*(uint64_t*)&(a))
#define _UNSIGNED_T(a) u ## a

#define _SIGNBIT64 ((uint64_t)1 << 63)
#define _SWAP_HI_LOW32 (2 | (3 << 2) | (0 << 4) | (1 << 6))
#define _INSERTPS_NDX(srcField, dstField) (((srcField) << 6) | ((dstField) << 4) )

#define _NEON2SSE_REASON_SLOW_SERIAL "The function may be very slow due to the serial implementation, please try to avoid it"
#define _NEON2SSE_REASON_SLOW_UNEFFECTIVE "The function may be slow due to inefficient x86 SIMD implementation, please try to avoid it"

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#define __constrange(min,max) const
#define __transfersize(size)
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

//&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& mask constants used in porting &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
_NEON2SSE_ALIGN_16 static const int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
_NEON2SSE_ALIGN_16 static const int8_t mask8_32_even_odd[16] = { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 };
//&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
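//These masks are shuffle controls for _mm_shuffle_epi8 (SSSE3): mask8_16_even_odd gathers the even-numbered
//bytes into the low half and the odd-numbered bytes into the high half, mask8_32_even_odd does the same for
//16-bit lanes. A minimal sketch of their use (illustrative only, src is hypothetical):
#if 0
__m128i interleaved   = _mm_loadu_si128((const __m128i*)src);
__m128i deinterleaved = _mm_shuffle_epi8(interleaved, *(__m128i*)mask8_16_even_odd);
/* low 8 bytes now hold lanes 0,2,...,14; high 8 bytes hold lanes 1,3,...,15 */
#endif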

//*************************************************************************
//*************************************************************************
//********* Function declarations as declared in original arm_neon.h ******
//*************************************************************************
//Vector add: vadd -> Vr[i]:=Va[i]+Vb[i], Vr, Va, Vb have equal lane sizes.
_NEON2SSESTORAGE int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0
_NEON2SSESTORAGE int64x1_t vadd_s64(int64x1_t a, int64x1_t b); // VADD.I64 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0
_NEON2SSE_GLOBAL uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0
_NEON2SSE_GLOBAL uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0
_NEON2SSE_GLOBAL uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0
_NEON2SSESTORAGE uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b); // VADD.I64 d0,d0,d0
_NEON2SSE_GLOBAL int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0
_NEON2SSE_GLOBAL int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0
_NEON2SSE_GLOBAL int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0
_NEON2SSE_GLOBAL int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0
_NEON2SSE_GLOBAL float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0
_NEON2SSE_GLOBAL uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0
_NEON2SSE_GLOBAL uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0
_NEON2SSE_GLOBAL uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0
_NEON2SSE_GLOBAL uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0
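//For the 128-bit (q-register) forms above the emulation is a direct one-to-one mapping onto SSE2
//(e.g. vaddq_s8 corresponds to _mm_add_epi8 and vaddq_f32 to _mm_add_ps). Worked lane example: with int8
//lanes, 100 + 100 wraps to -56, exactly as VADD.I8 does on ARM.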
//Vector long add: vaddl -> Vr[i]:=Va[i]+Vb[i], Va, Vb have equal lane sizes, result is a 128-bit vector of lanes that are twice the width.
_NEON2SSESTORAGE int16x8_t vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0
_NEON2SSESTORAGE int32x4_t vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0
_NEON2SSESTORAGE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0
_NEON2SSESTORAGE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.U16 q0,d0,d0
_NEON2SSESTORAGE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0
//Vector wide add: vaddw -> Vr[i]:=Va[i]+Vb[i]
_NEON2SSESTORAGE int16x8_t vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0
_NEON2SSESTORAGE int32x4_t vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0
_NEON2SSESTORAGE int64x2_t vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0
_NEON2SSESTORAGE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0
_NEON2SSESTORAGE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.U16 q0,q0,d0
_NEON2SSESTORAGE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0
//Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1
_NEON2SSESTORAGE int8x8_t vhadd_s8(int8x8_t a, int8x8_t b); // VHADD.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vhadd_s16(int16x4_t a, int16x4_t b); // VHADD.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vhadd_s32(int32x2_t a, int32x2_t b); // VHADD.S32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b); // VHADD.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b); // VHADD.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b); // VHADD.U32 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0
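//Worked example of the halving-add semantics: the sum is formed at wider precision and then shifted, so it
//cannot overflow; e.g. for uint8 lanes vhadd(250, 8) = (250 + 8) >> 1 = 129, and for int8 lanes
//vhadd(-100, -50) = (-150) >> 1 = -75.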
//Vector rounding halving add: vrhadd -> Vr[i]:=(Va[i]+Vb[i]+1)>>1
_NEON2SSESTORAGE int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b); // VRHADD.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b); // VRHADD.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b); // VRHADD.S32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b); // VRHADD.U32 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0
_NEON2SSE_GLOBAL uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0
_NEON2SSE_GLOBAL uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0
//Vector saturating add: vqadd -> Vr[i]:=sat<size>(Va[i]+Vb[i])
_NEON2SSESTORAGE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vqadd_s32(int32x2_t a, int32x2_t b); // VQADD.S32 d0,d0,d0
_NEON2SSESTORAGE int64x1_t vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b); // VQADD.U32 d0,d0,d0
_NEON2SSESTORAGE uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0
_NEON2SSE_GLOBAL int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0
_NEON2SSE_GLOBAL int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0
_NEON2SSESTORAGE int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0
_NEON2SSE_GLOBAL uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0
_NEON2SSE_GLOBAL uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0
_NEON2SSESTORAGE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
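//Worked example of the saturating-add semantics: results are clamped to the lane range instead of wrapping,
//e.g. for int8 lanes vqadd(100, 100) = 127 and for uint8 lanes vqadd(200, 100) = 255. For the 8- and 16-bit
//cases this corresponds directly to the SSE saturating adds (_mm_adds_epi8/_mm_adds_epu8 and
//_mm_adds_epi16/_mm_adds_epu16); the 32- and 64-bit cases need extra work.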
//Vector add high half: vaddhn -> Vr[i]:= high half of (Va[i]+Vb[i]), the result lanes are half as wide
_NEON2SSESTORAGE int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0
_NEON2SSESTORAGE int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0
_NEON2SSESTORAGE int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0
_NEON2SSESTORAGE uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0
_NEON2SSESTORAGE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0
_NEON2SSE_GLOBAL uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0
//Vector rounding add high half: vraddhn
_NEON2SSESTORAGE int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0
_NEON2SSESTORAGE int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0
_NEON2SSESTORAGE int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0
_NEON2SSESTORAGE uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0
_NEON2SSESTORAGE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0
_NEON2SSE_GLOBAL uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0
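//Worked example of the narrowing high-half semantics: for vaddhn_s16 each 16-bit sum keeps only its high
//8 bits, result[i] = (a[i] + b[i]) >> 8, while vraddhn_s16 rounds first, result[i] = (a[i] + b[i] + (1 << 7)) >> 8.
//E.g. a[i] = 0x1200 and b[i] = 0x00C0 sum to 0x12C0, so vaddhn returns 0x12 and vraddhn returns 0x13.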
//Multiplication
//Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
_NEON2SSESTORAGE int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0
_NEON2SSE_GLOBAL int16x4_t vmul_s16(int16x4_t a, int16x4_t b); // VMUL.I16 d0,d0,d0
_NEON2SSE_GLOBAL int32x2_t vmul_s32(int32x2_t a, int32x2_t b); // VMUL.I32 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0
_NEON2SSESTORAGE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0
_NEON2SSE_GLOBAL int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0
_NEON2SSE_GLOBAL int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0
_NEON2SSE_GLOBAL float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0
_NEON2SSE_GLOBAL uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0
_NEON2SSE_GLOBAL uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0
_NEON2SSESTORAGE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0
//multiply lane
_NEON2SSESTORAGE int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c);
_NEON2SSESTORAGE int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c);
_NEON2SSESTORAGE float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c);
_NEON2SSE_GLOBAL uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c);
_NEON2SSE_GLOBAL uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c);
_NEON2SSESTORAGE int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c);
_NEON2SSESTORAGE int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c);
_NEON2SSESTORAGE float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c);
_NEON2SSE_GLOBAL uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c);
_NEON2SSE_GLOBAL uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c);
//Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]
_NEON2SSESTORAGE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0
_NEON2SSE_GLOBAL uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0
_NEON2SSE_GLOBAL uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0
_NEON2SSESTORAGE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0
_NEON2SSE_GLOBAL uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0
_NEON2SSE_GLOBAL uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0
//Vector multiply accumulate long: vmlal -> Vr[i] := Va[i] + Vb[i] * Vc[i]
_NEON2SSESTORAGE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0
_NEON2SSESTORAGE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0
_NEON2SSESTORAGE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0
_NEON2SSESTORAGE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.U16 q0,d0,d0
_NEON2SSESTORAGE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0
//Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i]
_NEON2SSESTORAGE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0
_NEON2SSE_GLOBAL uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0
_NEON2SSE_GLOBAL uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0
_NEON2SSESTORAGE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0
_NEON2SSE_GLOBAL uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0
_NEON2SSE_GLOBAL uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0
//Vector multiply subtract long
_NEON2SSESTORAGE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0
_NEON2SSESTORAGE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0
_NEON2SSESTORAGE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0
_NEON2SSESTORAGE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.U16 q0,d0,d0
_NEON2SSESTORAGE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0
//Vector saturating doubling multiply high
_NEON2SSESTORAGE int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b); // VQDMULH.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0
_NEON2SSESTORAGE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0
//Vector saturating rounding doubling multiply high
_NEON2SSESTORAGE int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b); // VQRDMULH.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0
_NEON2SSESTORAGE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0
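//Worked equation for the doubling-multiply-high family (int16 lanes): vqdmulh returns sat((2*a*b) >> 16) and
//vqrdmulh returns sat((2*a*b + (1 << 15)) >> 16); the only saturating case is a = b = -32768, where 2*a*b = 2^31
//is clamped, giving 0x7FFF. E.g. a = 12000, b = 20000: 2*a*b = 480000000 = 0x1C9C3800, so both forms return
//0x1C9C = 7324 (the rounding constant does not carry into the high half here).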
//Vector saturating doubling multiply accumulate long
_NEON2SSESTORAGE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0
//Vector saturating doubling multiply subtract long
_NEON2SSESTORAGE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0
//Vector long multiply
_NEON2SSESTORAGE int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0
_NEON2SSESTORAGE int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0
_NEON2SSESTORAGE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0
_NEON2SSESTORAGE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.U16 q0,d0,d0
_NEON2SSESTORAGE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0
_NEON2SSESTORAGE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0
//Vector saturating doubling long multiply
_NEON2SSESTORAGE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
//Subtraction
//Vector subtract
_NEON2SSESTORAGE int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0
_NEON2SSESTORAGE int64x1_t vsub_s64(int64x1_t a, int64x1_t b); // VSUB.I64 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0
_NEON2SSE_GLOBAL uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0
_NEON2SSE_GLOBAL uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0
_NEON2SSE_GLOBAL uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0
_NEON2SSESTORAGE uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b); // VSUB.I64 d0,d0,d0
_NEON2SSE_GLOBAL int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0
_NEON2SSE_GLOBAL int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0
_NEON2SSE_GLOBAL int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0
_NEON2SSE_GLOBAL int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0
_NEON2SSE_GLOBAL float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0
_NEON2SSE_GLOBAL uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0
_NEON2SSE_GLOBAL uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0
_NEON2SSE_GLOBAL uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0
_NEON2SSE_GLOBAL uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0
//Vector long subtract: vsubl -> Vr[i]:=Va[i]-Vb[i]
_NEON2SSESTORAGE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0
_NEON2SSESTORAGE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0
_NEON2SSESTORAGE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0
_NEON2SSESTORAGE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.U16 q0,d0,d0
_NEON2SSESTORAGE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0
//Vector wide subtract: vsubw -> Vr[i]:=Va[i]-Vb[i]
_NEON2SSESTORAGE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0
_NEON2SSESTORAGE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0
_NEON2SSESTORAGE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0
_NEON2SSESTORAGE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0
_NEON2SSESTORAGE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.U16 q0,q0,d0
_NEON2SSESTORAGE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0
//Vector saturating subtract
_NEON2SSESTORAGE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vqsub_s32(int32x2_t a, int32x2_t b); // VQSUB.S32 d0,d0,d0
_NEON2SSESTORAGE int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b); // VQSUB.U32 d0,d0,d0
_NEON2SSESTORAGE uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0
_NEON2SSE_GLOBAL int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
_NEON2SSE_GLOBAL int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
_NEON2SSESTORAGE int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
_NEON2SSE_GLOBAL uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
_NEON2SSE_GLOBAL uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
_NEON2SSESTORAGE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b); // VQSUB.U64 q0,q0,q0
//Vector halving subtract
_NEON2SSESTORAGE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vhsub_s16(int16x4_t a, int16x4_t b); // VHSUB.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vhsub_s32(int32x2_t a, int32x2_t b); // VHSUB.S32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b); // VHSUB.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b); // VHSUB.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b); // VHSUB.U32 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0
//Vector subtract high half
_NEON2SSESTORAGE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0
_NEON2SSESTORAGE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0
_NEON2SSESTORAGE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0
_NEON2SSESTORAGE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0
_NEON2SSESTORAGE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0
_NEON2SSE_GLOBAL uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0
//Vector rounding subtract high half
_NEON2SSESTORAGE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0
_NEON2SSESTORAGE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0
_NEON2SSESTORAGE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0
_NEON2SSESTORAGE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0
_NEON2SSESTORAGE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0
_NEON2SSE_GLOBAL uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0
//Comparison
//Vector compare equal
_NEON2SSESTORAGE uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0
_NEON2SSESTORAGE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0
_NEON2SSE_GLOBAL uint8x8_t vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0
_NEON2SSE_GLOBAL uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0
_NEON2SSE_GLOBAL uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0
_NEON2SSE_GLOBAL uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0
_NEON2SSE_GLOBAL uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0
_NEON2SSE_GLOBAL uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0
_NEON2SSE_GLOBAL uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0
_NEON2SSE_GLOBAL uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0
//Vector compare greater-than or equal
_NEON2SSESTORAGE uint8x8_t vcge_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vcge_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vcge_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
_NEON2SSESTORAGE uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
_NEON2SSESTORAGE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
_NEON2SSESTORAGE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
_NEON2SSESTORAGE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
_NEON2SSESTORAGE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
//Vector compare less-than or equal
_NEON2SSESTORAGE uint8x8_t vcle_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vcle_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vcle_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
_NEON2SSE_GLOBAL uint8x8_t vcle_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
_NEON2SSE_GLOBAL uint16x4_t vcle_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0
_NEON2SSE_GLOBAL uint32x2_t vcle_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
_NEON2SSESTORAGE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
_NEON2SSESTORAGE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
_NEON2SSESTORAGE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
_NEON2SSESTORAGE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
//Vector compare greater-than
_NEON2SSESTORAGE uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
_NEON2SSESTORAGE uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
_NEON2SSE_GLOBAL uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
_NEON2SSE_GLOBAL uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
_NEON2SSE_GLOBAL uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
_NEON2SSESTORAGE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
_NEON2SSESTORAGE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
//Vector compare less-than
_NEON2SSE_GLOBAL uint8x8_t vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
_NEON2SSE_GLOBAL uint16x4_t vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
_NEON2SSE_GLOBAL uint32x2_t vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
_NEON2SSE_GLOBAL uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
_NEON2SSE_GLOBAL uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
_NEON2SSE_GLOBAL uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0
_NEON2SSE_GLOBAL uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
_NEON2SSE_GLOBAL uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
_NEON2SSE_GLOBAL uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
_NEON2SSE_GLOBAL uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
_NEON2SSE_GLOBAL uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
_NEON2SSE_GLOBAL uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
_NEON2SSE_GLOBAL uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0
_NEON2SSE_GLOBAL uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
//Vector compare absolute greater-than or equal
_NEON2SSESTORAGE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
_NEON2SSESTORAGE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
//Vector compare absolute less-than or equal
_NEON2SSESTORAGE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
_NEON2SSESTORAGE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
//Vector compare absolute greater-than
_NEON2SSESTORAGE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
_NEON2SSESTORAGE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
//Vector compare absolute less-than
_NEON2SSESTORAGE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
_NEON2SSESTORAGE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
//Vector test bits
_NEON2SSESTORAGE uint8x8_t vtst_s8(int8x8_t a, int8x8_t b); // VTST.8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vtst_s16(int16x4_t a, int16x4_t b); // VTST.16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vtst_s32(int32x2_t a, int32x2_t b); // VTST.32 d0, d0, d0
_NEON2SSE_GLOBAL uint8x8_t vtst_u8(uint8x8_t a, uint8x8_t b); // VTST.8 d0, d0, d0
_NEON2SSE_GLOBAL uint16x4_t vtst_u16(uint16x4_t a, uint16x4_t b); // VTST.16 d0, d0, d0
_NEON2SSE_GLOBAL uint32x2_t vtst_u32(uint32x2_t a, uint32x2_t b); // VTST.32 d0, d0, d0
_NEON2SSE_GLOBAL uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0
_NEON2SSESTORAGE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0
_NEON2SSESTORAGE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0
_NEON2SSE_GLOBAL uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0
_NEON2SSE_GLOBAL uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0
_NEON2SSE_GLOBAL uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0
_NEON2SSE_GLOBAL uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0
//Absolute difference
//Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |
_NEON2SSESTORAGE int8x8_t vabd_s8(int8x8_t a, int8x8_t b); // VABD.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vabd_s16(int16x4_t a, int16x4_t b); // VABD.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vabd_s32(int32x2_t a, int32x2_t b); // VABD.S32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b); // VABD.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b); // VABD.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b); // VABD.U32 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0
_NEON2SSESTORAGE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0
//Absolute difference - long
_NEON2SSESTORAGE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0
_NEON2SSESTORAGE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0
_NEON2SSESTORAGE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0
_NEON2SSESTORAGE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.U16 q0,d0,d0
_NEON2SSESTORAGE uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0
//Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] |
_NEON2SSESTORAGE int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VABA.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0
//Absolute difference and accumulate - long
_NEON2SSESTORAGE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0
_NEON2SSESTORAGE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0
_NEON2SSESTORAGE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0
_NEON2SSESTORAGE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.U16 q0,d0,d0
_NEON2SSESTORAGE uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0
//Max/Min
//vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i]
_NEON2SSESTORAGE int8x8_t vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0
_NEON2SSE_GLOBAL int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0
_NEON2SSE_GLOBAL int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0
_NEON2SSE_GLOBAL int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0
_NEON2SSE_GLOBAL uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0
_NEON2SSE_GLOBAL uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.U16 q0,q0,q0
_NEON2SSE_GLOBAL uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0
_NEON2SSE_GLOBAL float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0

_NEON2SSE_GLOBAL float64x2_t vmaxq_f64(float64x2_t a, float64x2_t b); // VMAX.F64 q0,q0,q0

//vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i]
_NEON2SSESTORAGE int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0
_NEON2SSE_GLOBAL int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0
_NEON2SSE_GLOBAL int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0
_NEON2SSE_GLOBAL int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0
_NEON2SSE_GLOBAL uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0
_NEON2SSE_GLOBAL uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.U16 q0,q0,q0
_NEON2SSE_GLOBAL uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0
_NEON2SSE_GLOBAL float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0

_NEON2SSE_GLOBAL float64x2_t vminq_f64(float64x2_t a, float64x2_t b); // VMIN.F64 q0,q0,q0

//Pairwise addition
//Pairwise add
_NEON2SSESTORAGE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0
//Long pairwise add
_NEON2SSESTORAGE int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0
_NEON2SSESTORAGE int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0
_NEON2SSESTORAGE int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0
_NEON2SSESTORAGE uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0
_NEON2SSESTORAGE uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.U16 d0,d0
_NEON2SSESTORAGE uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0
_NEON2SSESTORAGE int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0
_NEON2SSESTORAGE int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0
_NEON2SSESTORAGE int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0
_NEON2SSESTORAGE uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0
_NEON2SSESTORAGE uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.U16 q0,q0
_NEON2SSESTORAGE uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
931//Long pairwise add and accumulate
932_NEON2SSESTORAGE int16x4_t vpadal_s8(int16x4_t a, int8x8_t b); // VPADAL.S8 d0,d0
933_NEON2SSESTORAGE int32x2_t vpadal_s16(int32x2_t a, int16x4_t b); // VPADAL.S16 d0,d0
934_NEON2SSESTORAGE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0
935_NEON2SSESTORAGE uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b); // VPADAL.U8 d0,d0
936_NEON2SSESTORAGE uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b); // VPADAL.U16 d0,d0
937_NEON2SSESTORAGE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0
938_NEON2SSESTORAGE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0
939_NEON2SSESTORAGE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0
940_NEON2SSESTORAGE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0
941_NEON2SSESTORAGE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0
942_NEON2SSESTORAGE uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.U16 q0,q0
943_NEON2SSESTORAGE uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0
944//Folding maximum vpmax -> takes maximum of adjacent pairs
945_NEON2SSESTORAGE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0
946_NEON2SSESTORAGE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0
947_NEON2SSESTORAGE int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0
948_NEON2SSESTORAGE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0
949_NEON2SSESTORAGE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.U16 d0,d0,d0
950_NEON2SSESTORAGE uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0
951_NEON2SSESTORAGE float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0
952//Folding minimum vpmin -> takes minimum of adjacent pairs
953_NEON2SSESTORAGE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0
954_NEON2SSESTORAGE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0
955_NEON2SSESTORAGE int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0
956_NEON2SSESTORAGE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0
957_NEON2SSESTORAGE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.U16 d0,d0,d0
958_NEON2SSESTORAGE uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0
959_NEON2SSESTORAGE float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0
960//Reciprocal/Sqrt
961_NEON2SSESTORAGE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0
962_NEON2SSESTORAGE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0
963_NEON2SSESTORAGE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0
964_NEON2SSESTORAGE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
965//Shifts by signed variable
966//Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right)
967_NEON2SSESTORAGE int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0
968_NEON2SSESTORAGE int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0
969_NEON2SSESTORAGE int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0
970_NEON2SSESTORAGE int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0
971_NEON2SSESTORAGE uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0
972_NEON2SSESTORAGE uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.U16 d0,d0,d0
973_NEON2SSESTORAGE uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0
974_NEON2SSESTORAGE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0
975_NEON2SSESTORAGE int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0
976_NEON2SSESTORAGE int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0
977_NEON2SSESTORAGE int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0
978_NEON2SSESTORAGE int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0
979_NEON2SSESTORAGE uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0
980_NEON2SSESTORAGE uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.U16 q0,q0,q0
981_NEON2SSESTORAGE uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0
982_NEON2SSESTORAGE uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0
983//Vector saturating shift left: (negative values shift right)
984_NEON2SSESTORAGE int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0
985_NEON2SSESTORAGE int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0
986_NEON2SSESTORAGE int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0
987_NEON2SSESTORAGE int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0
988_NEON2SSESTORAGE uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0
989_NEON2SSESTORAGE uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.U16 d0,d0,d0
990_NEON2SSESTORAGE uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0
991_NEON2SSESTORAGE uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0
992_NEON2SSESTORAGE int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0
993_NEON2SSESTORAGE int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0
994_NEON2SSESTORAGE int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0
995_NEON2SSESTORAGE int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0
996_NEON2SSESTORAGE uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0
997_NEON2SSESTORAGE uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.U16 q0,q0,q0
998_NEON2SSESTORAGE uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0
999_NEON2SSESTORAGE uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0
1000//Vector rounding shift left: (negative values shift right)
1001_NEON2SSESTORAGE int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0
1002_NEON2SSESTORAGE int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0
1003_NEON2SSESTORAGE int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0
1004_NEON2SSESTORAGE int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0
1005_NEON2SSESTORAGE uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0
1006_NEON2SSESTORAGE uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.U16 d0,d0,d0
1007_NEON2SSESTORAGE uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0
1008_NEON2SSESTORAGE uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0
1009_NEON2SSESTORAGE int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0
1010_NEON2SSESTORAGE int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0
1011_NEON2SSESTORAGE int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0
1012_NEON2SSESTORAGE int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0
1013_NEON2SSESTORAGE uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0
1014_NEON2SSESTORAGE uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.U16 q0,q0,q0
1015_NEON2SSESTORAGE uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0
1016_NEON2SSESTORAGE uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0
1017//Vector saturating rounding shift left: (negative values shift right)
1018_NEON2SSESTORAGE int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0
1019_NEON2SSESTORAGE int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0
1020_NEON2SSESTORAGE int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0
1021_NEON2SSESTORAGE int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0
1022_NEON2SSESTORAGE uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0
1023_NEON2SSESTORAGE uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.U16 d0,d0,d0
1024_NEON2SSESTORAGE uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0
1025_NEON2SSESTORAGE uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0
1026_NEON2SSESTORAGE int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0
1027_NEON2SSESTORAGE int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0
1028_NEON2SSESTORAGE int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0
1029_NEON2SSESTORAGE int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0
1030_NEON2SSESTORAGE uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0
1031_NEON2SSESTORAGE uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.U16 q0,q0,q0
1032_NEON2SSESTORAGE uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0
1033_NEON2SSESTORAGE uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0
1034//Shifts by a constant
1035//Vector shift right by constant
1036_NEON2SSESTORAGE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8
1037_NEON2SSESTORAGE int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VSHR.S16 d0,d0,#16
1038_NEON2SSESTORAGE int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VSHR.S32 d0,d0,#32
1039_NEON2SSESTORAGE int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64
1040_NEON2SSESTORAGE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8
1041_NEON2SSESTORAGE uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VSHR.U16 d0,d0,#16
1042_NEON2SSESTORAGE uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VSHR.U32 d0,d0,#32
1043_NEON2SSESTORAGE uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VSHR.U64 d0,d0,#64
1044_NEON2SSESTORAGE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8
1045_NEON2SSE_GLOBAL int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16
1046_NEON2SSE_GLOBAL int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32
1047_NEON2SSESTORAGE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64
1048_NEON2SSESTORAGE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8
1049_NEON2SSE_GLOBAL uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.U16 q0,q0,#16
1050_NEON2SSE_GLOBAL uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32
1051_NEON2SSE_GLOBAL uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64
1052//Vector shift left by constant
1053_NEON2SSESTORAGE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
1054_NEON2SSESTORAGE int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
1055_NEON2SSESTORAGE int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
1056_NEON2SSESTORAGE int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
1057_NEON2SSESTORAGE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
1058_NEON2SSE_GLOBAL uint16x4_t vshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
1059_NEON2SSE_GLOBAL uint32x2_t vshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
1060_NEON2SSE_GLOBAL uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
1061_NEON2SSE_GLOBAL int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
1062_NEON2SSE_GLOBAL int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
1063_NEON2SSE_GLOBAL int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
1064_NEON2SSE_GLOBAL int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
1065_NEON2SSESTORAGE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
1066_NEON2SSE_GLOBAL uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
1067_NEON2SSE_GLOBAL uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
1068_NEON2SSE_GLOBAL uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
1069//Vector rounding shift right by constant
1070_NEON2SSESTORAGE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8
1071_NEON2SSESTORAGE int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16
1072_NEON2SSESTORAGE int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32
1073_NEON2SSESTORAGE int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64
1074_NEON2SSESTORAGE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8
1075_NEON2SSESTORAGE uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VRSHR.U16 d0,d0,#16
1076_NEON2SSESTORAGE uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32
1077_NEON2SSESTORAGE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64
1078_NEON2SSESTORAGE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8
1079_NEON2SSESTORAGE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16
1080_NEON2SSESTORAGE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32
1081_NEON2SSESTORAGE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64
1082_NEON2SSESTORAGE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8
1083_NEON2SSESTORAGE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.U16 q0,q0,#16
1084_NEON2SSESTORAGE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32
1085_NEON2SSESTORAGE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64
1086//Vector shift right by constant and accumulate
1087_NEON2SSESTORAGE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8
1088_NEON2SSESTORAGE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16
1089_NEON2SSESTORAGE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32
1090_NEON2SSESTORAGE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64
1091_NEON2SSESTORAGE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8
1092_NEON2SSESTORAGE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.U16 d0,d0,#16
1093_NEON2SSESTORAGE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32
1094_NEON2SSESTORAGE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64
1095_NEON2SSESTORAGE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8
1096_NEON2SSESTORAGE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16
1097_NEON2SSESTORAGE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32
1098_NEON2SSESTORAGE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64
1099_NEON2SSESTORAGE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8
1100_NEON2SSESTORAGE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.U16 q0,q0,#16
1101_NEON2SSESTORAGE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32
1102_NEON2SSESTORAGE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64
1103//Vector rounding shift right by constant and accumulate
1104_NEON2SSESTORAGE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8
1105_NEON2SSESTORAGE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16
1106_NEON2SSESTORAGE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32
1107_NEON2SSESTORAGE int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64
1108_NEON2SSESTORAGE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8
1109_NEON2SSESTORAGE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.U16 d0,d0,#16
1110_NEON2SSESTORAGE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32
1111_NEON2SSESTORAGE uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64
1112_NEON2SSESTORAGE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8
1113_NEON2SSESTORAGE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16
1114_NEON2SSESTORAGE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32
1115_NEON2SSESTORAGE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64
1116_NEON2SSESTORAGE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8
1117_NEON2SSESTORAGE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.U16 q0,q0,#16
1118_NEON2SSESTORAGE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32
1119_NEON2SSESTORAGE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64
1120//Vector saturating shift left by constant
1121_NEON2SSESTORAGE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0
1122_NEON2SSESTORAGE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0
1123_NEON2SSESTORAGE int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0
1124_NEON2SSESTORAGE int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0
1125_NEON2SSESTORAGE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0
1126_NEON2SSESTORAGE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.U16 d0,d0,#0
1127_NEON2SSESTORAGE uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0
1128_NEON2SSESTORAGE uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0
1129_NEON2SSESTORAGE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0
1130_NEON2SSESTORAGE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0
1131_NEON2SSESTORAGE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0
1132_NEON2SSESTORAGE int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0
1133_NEON2SSESTORAGE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0
1134_NEON2SSESTORAGE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.U16 q0,q0,#0
1135_NEON2SSESTORAGE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0
1136_NEON2SSESTORAGE uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0
1137//Vector signed->unsigned saturating shift left by constant
1138_NEON2SSESTORAGE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0
1139_NEON2SSESTORAGE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0
1140_NEON2SSESTORAGE uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0
1141_NEON2SSESTORAGE uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0
1142_NEON2SSESTORAGE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0
1143_NEON2SSESTORAGE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0
1144_NEON2SSESTORAGE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0
1145_NEON2SSESTORAGE uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0
1146//Vector narrowing shift right by constant
1147_NEON2SSESTORAGE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
1148_NEON2SSESTORAGE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
1149_NEON2SSESTORAGE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
1150_NEON2SSESTORAGE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
1151_NEON2SSESTORAGE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
1152_NEON2SSESTORAGE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
1153//Vector signed->unsigned narrowing saturating shift right by constant
1154_NEON2SSESTORAGE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8
1155_NEON2SSESTORAGE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16
1156_NEON2SSESTORAGE uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32
1157//Vector signed->unsigned rounding narrowing saturating shift right by constant
1158_NEON2SSESTORAGE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8
1159_NEON2SSESTORAGE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16
1160_NEON2SSESTORAGE uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32
1161//Vector narrowing saturating shift right by constant
1162_NEON2SSESTORAGE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8
1163_NEON2SSESTORAGE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16
1164_NEON2SSESTORAGE int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32
1165_NEON2SSESTORAGE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.U16 d0,q0,#8
1166_NEON2SSESTORAGE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16
1167_NEON2SSESTORAGE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32
1168//Vector rounding narrowing shift right by constant
1169_NEON2SSESTORAGE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
1170_NEON2SSESTORAGE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
1171_NEON2SSESTORAGE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
1172_NEON2SSESTORAGE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
1173_NEON2SSESTORAGE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
1174_NEON2SSESTORAGE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
1175//Vector rounding narrowing saturating shift right by constant
1176_NEON2SSESTORAGE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8
1177_NEON2SSESTORAGE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16
1178_NEON2SSESTORAGE int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32
1179_NEON2SSESTORAGE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.U16 d0,q0,#8
1180_NEON2SSESTORAGE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16
1181_NEON2SSESTORAGE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32
1182//Vector widening shift left by constant
1183_NEON2SSESTORAGE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0
1184_NEON2SSESTORAGE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0
1185_NEON2SSESTORAGE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0
1186_NEON2SSESTORAGE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0
1187_NEON2SSESTORAGE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.U16 q0,d0,#0
1188_NEON2SSESTORAGE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0
1189//Shifts with insert
1190//Vector shift right and insert
1191_NEON2SSESTORAGE int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
1192_NEON2SSESTORAGE int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
1193_NEON2SSESTORAGE int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
1194_NEON2SSESTORAGE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
1195_NEON2SSE_GLOBAL uint8x8_t vsri_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
1196_NEON2SSE_GLOBAL uint16x4_t vsri_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
1197_NEON2SSE_GLOBAL uint32x2_t vsri_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
1198_NEON2SSE_GLOBAL uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
1199_NEON2SSE_GLOBAL poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
1200_NEON2SSE_GLOBAL poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
1201_NEON2SSESTORAGE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
1202_NEON2SSESTORAGE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
1203_NEON2SSESTORAGE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
1204_NEON2SSESTORAGE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
1205_NEON2SSE_GLOBAL uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
1206_NEON2SSE_GLOBAL uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
1207_NEON2SSE_GLOBAL uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
1208_NEON2SSE_GLOBAL uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
1209_NEON2SSE_GLOBAL poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
1210_NEON2SSE_GLOBAL poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
1211//Vector shift left and insert
1212_NEON2SSESTORAGE int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
1213_NEON2SSESTORAGE int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
1214_NEON2SSESTORAGE int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
1215_NEON2SSESTORAGE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
1216_NEON2SSE_GLOBAL uint8x8_t vsli_n_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
1217_NEON2SSE_GLOBAL uint16x4_t vsli_n_u16(uint16x4_t a, uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
1218_NEON2SSE_GLOBAL uint32x2_t vsli_n_u32(uint32x2_t a, uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
1219_NEON2SSE_GLOBAL uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
1220_NEON2SSE_GLOBAL poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
1221_NEON2SSE_GLOBAL poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
1222_NEON2SSESTORAGE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
1223_NEON2SSESTORAGE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
1224_NEON2SSESTORAGE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
1225_NEON2SSESTORAGE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
1226_NEON2SSE_GLOBAL uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
1227_NEON2SSE_GLOBAL uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
1228_NEON2SSE_GLOBAL uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
1229_NEON2SSE_GLOBAL uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
1230_NEON2SSE_GLOBAL poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
1231_NEON2SSE_GLOBAL poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
1232//Loads of a single vector or lane. Perform loads and stores of a single vector of some type.
1233//Load a single vector from memory
1234_NEON2SSE_GLOBAL uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
1235_NEON2SSE_GLOBAL uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
1236_NEON2SSE_GLOBAL uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
1237_NEON2SSE_GLOBAL uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
1238_NEON2SSE_GLOBAL int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
1239_NEON2SSE_GLOBAL int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
1240_NEON2SSE_GLOBAL int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
1241_NEON2SSE_GLOBAL int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
1242_NEON2SSE_GLOBAL float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0]
1243_NEON2SSESTORAGE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
1244_NEON2SSE_GLOBAL poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
1245_NEON2SSE_GLOBAL poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
1246_NEON2SSE_GLOBAL uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0]
1247_NEON2SSE_GLOBAL uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0]
1248_NEON2SSE_GLOBAL uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0]
1249_NEON2SSE_GLOBAL uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
1250_NEON2SSE_GLOBAL int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0]
1251_NEON2SSE_GLOBAL int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0]
1252_NEON2SSE_GLOBAL int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0]
1253_NEON2SSE_GLOBAL int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
1254_NEON2SSE_GLOBAL float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0]
1255_NEON2SSESTORAGE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0]
1256_NEON2SSE_GLOBAL poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0]
1257_NEON2SSE_GLOBAL poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0]
1258
1259_NEON2SSESTORAGE float64x2_t vld1q_f64(__transfersize(4) float64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
1260
1261//Load a single lane from memory
1262_NEON2SSE_GLOBAL uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
1263_NEON2SSE_GLOBAL uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
1264_NEON2SSE_GLOBAL uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
1265_NEON2SSE_GLOBAL uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
1266_NEON2SSE_GLOBAL int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
1267_NEON2SSE_GLOBAL int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0]
1268_NEON2SSE_GLOBAL int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); //VLD1.32 {d0[0]}, [r0]
1269_NEON2SSE_GLOBAL float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0]
1270_NEON2SSESTORAGE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
1271_NEON2SSE_GLOBAL int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); //VLD1.64 {d0}, [r0]
1272_NEON2SSE_GLOBAL poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
1273_NEON2SSE_GLOBAL poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
1274_NEON2SSESTORAGE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0]
1275_NEON2SSESTORAGE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
1276_NEON2SSESTORAGE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0]
1277_NEON2SSESTORAGE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0]
1278_NEON2SSE_GLOBAL int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8{d0[0]}, [r0]
1279_NEON2SSE_GLOBAL int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
1280_NEON2SSE_GLOBAL int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0]
1281_NEON2SSE_GLOBAL float16x4_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
1282_NEON2SSESTORAGE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
1283_NEON2SSE_GLOBAL int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0]
1284_NEON2SSE_GLOBAL poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0]
1285_NEON2SSE_GLOBAL poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
1286//Load all lanes of vector with same value from memory
1287_NEON2SSE_GLOBAL uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
1288_NEON2SSE_GLOBAL uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
1289_NEON2SSE_GLOBAL uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
1290_NEON2SSESTORAGE uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
1291_NEON2SSE_GLOBAL int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
1292_NEON2SSE_GLOBAL int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
1293_NEON2SSE_GLOBAL int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
1294_NEON2SSE_GLOBAL int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
1295_NEON2SSE_GLOBAL float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
1296_NEON2SSE_GLOBAL float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
1297_NEON2SSE_GLOBAL poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
1298_NEON2SSE_GLOBAL poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
1299_NEON2SSESTORAGE uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
1300_NEON2SSESTORAGE uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
1301_NEON2SSESTORAGE uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
1302_NEON2SSESTORAGE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
1303_NEON2SSE_GLOBAL int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
1304_NEON2SSE_GLOBAL int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
1305_NEON2SSE_GLOBAL int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
1306_NEON2SSE_GLOBAL int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
1307_NEON2SSE_GLOBAL float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
1308_NEON2SSESTORAGE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
1309_NEON2SSE_GLOBAL poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
1310_NEON2SSE_GLOBAL poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
1311//Store a single vector or lane. Stores all lanes or a single lane of a vector.
1312//Store a single vector into memory
1313_NEON2SSE_GLOBAL void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0]
1314_NEON2SSE_GLOBAL void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0]
1315_NEON2SSE_GLOBAL void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0]
1316_NEON2SSE_GLOBAL void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0]
1317_NEON2SSE_GLOBAL void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0]
1318_NEON2SSE_GLOBAL void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0]
1319_NEON2SSE_GLOBAL void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0]
1320_NEON2SSE_GLOBAL void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0]
1321_NEON2SSE_GLOBAL void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0]
1322_NEON2SSESTORAGE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0]
1323_NEON2SSE_GLOBAL void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0]
1324_NEON2SSE_GLOBAL void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0]
1325_NEON2SSESTORAGE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0]
1326_NEON2SSESTORAGE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0]
1327_NEON2SSESTORAGE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0]
1328_NEON2SSESTORAGE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0]
1329_NEON2SSE_GLOBAL void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0]
1330_NEON2SSE_GLOBAL void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0]
1331_NEON2SSE_GLOBAL void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0]
1332_NEON2SSE_GLOBAL void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0]
1333_NEON2SSE_GLOBAL void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0]
1334_NEON2SSESTORAGE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0]
1335_NEON2SSE_GLOBAL void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0]
1336_NEON2SSE_GLOBAL void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0]
1337//Store a lane of a vector into memory
1338//Loads of an N-element structure
1339//Load N-element structure from memory
1340_NEON2SSESTORAGE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
1341_NEON2SSESTORAGE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
1342_NEON2SSESTORAGE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
1343_NEON2SSE_GLOBAL int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
1344_NEON2SSE_GLOBAL int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
1345_NEON2SSE_GLOBAL int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
1346_NEON2SSE_GLOBAL float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0]
1347_NEON2SSESTORAGE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
1348_NEON2SSE_GLOBAL poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
1349_NEON2SSE_GLOBAL poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
1350_NEON2SSESTORAGE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
1351_NEON2SSESTORAGE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
1352_NEON2SSESTORAGE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
1353_NEON2SSESTORAGE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
1354_NEON2SSE_GLOBAL int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
1355_NEON2SSE_GLOBAL int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
1356_NEON2SSE_GLOBAL int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
1357_NEON2SSE_GLOBAL int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
1358//float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0]
1359_NEON2SSESTORAGE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
1360_NEON2SSE_GLOBAL poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
1361_NEON2SSE_GLOBAL poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
1362_NEON2SSESTORAGE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
1363_NEON2SSESTORAGE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
1364_NEON2SSESTORAGE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
1365_NEON2SSE_GLOBAL int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
1366_NEON2SSE_GLOBAL int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
1367_NEON2SSE_GLOBAL int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
1368_NEON2SSE_GLOBAL float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
1369_NEON2SSESTORAGE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
1370poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
1371_NEON2SSE_GLOBAL poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
1372_NEON2SSESTORAGE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
1373_NEON2SSESTORAGE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
1374_NEON2SSESTORAGE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
1375_NEON2SSESTORAGE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
1376_NEON2SSE_GLOBAL int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
1377_NEON2SSE_GLOBAL int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
1378_NEON2SSE_GLOBAL int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
1379_NEON2SSE_GLOBAL int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
1380_NEON2SSE_GLOBAL float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
1381_NEON2SSESTORAGE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
1382_NEON2SSE_GLOBAL poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
1383_NEON2SSE_GLOBAL poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
1384_NEON2SSESTORAGE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
1385_NEON2SSESTORAGE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
1386_NEON2SSESTORAGE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
1387_NEON2SSE_GLOBAL int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
1388_NEON2SSE_GLOBAL int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
1389_NEON2SSE_GLOBAL int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
1390_NEON2SSE_GLOBAL float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
1391_NEON2SSESTORAGE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
1392_NEON2SSE_GLOBAL poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
1393_NEON2SSE_GLOBAL poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
1394_NEON2SSESTORAGE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
1395_NEON2SSESTORAGE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
1396_NEON2SSESTORAGE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
1397_NEON2SSESTORAGE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
1398_NEON2SSE_GLOBAL int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
1399_NEON2SSE_GLOBAL int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
1400_NEON2SSE_GLOBAL int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
1401_NEON2SSE_GLOBAL int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
1402_NEON2SSE_GLOBAL float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
1403_NEON2SSESTORAGE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
1404_NEON2SSE_GLOBAL poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
1405_NEON2SSE_GLOBAL poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
1406//Load all lanes of N-element structure with same value from memory
1407_NEON2SSESTORAGE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
1408_NEON2SSESTORAGE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
1409_NEON2SSESTORAGE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
1410_NEON2SSE_GLOBAL uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
1411_NEON2SSE_GLOBAL int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
1412_NEON2SSE_GLOBAL int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
1413_NEON2SSE_GLOBAL int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
1414_NEON2SSE_GLOBAL int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
1415//float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
1416_NEON2SSESTORAGE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
1417_NEON2SSE_GLOBAL poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
1418_NEON2SSE_GLOBAL poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
1419_NEON2SSESTORAGE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
1420_NEON2SSESTORAGE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
1421_NEON2SSESTORAGE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
1422_NEON2SSESTORAGE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
1423_NEON2SSE_GLOBAL int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
_NEON2SSE_GLOBAL int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
_NEON2SSE_GLOBAL int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
_NEON2SSE_GLOBAL int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
_NEON2SSE_GLOBAL float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
_NEON2SSESTORAGE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
_NEON2SSE_GLOBAL poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
_NEON2SSE_GLOBAL poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
_NEON2SSESTORAGE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSESTORAGE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSESTORAGE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSESTORAGE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
_NEON2SSE_GLOBAL int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSE_GLOBAL int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSE_GLOBAL int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSE_GLOBAL int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
_NEON2SSE_GLOBAL float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSESTORAGE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSE_GLOBAL poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSE_GLOBAL poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
//Load a single lane of N-element structure from memory
//the functions below are modified to deal with the error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned
_NEON2SSESTORAGE uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
_NEON2SSESTORAGE uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
_NEON2SSESTORAGE int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
_NEON2SSESTORAGE int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
_NEON2SSE_GLOBAL float16x8x2_t vld2q_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
_NEON2SSESTORAGE float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
_NEON2SSE_GLOBAL poly16x8x2_t vld2q_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
_NEON2SSESTORAGE uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
_NEON2SSESTORAGE uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
_NEON2SSESTORAGE uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
_NEON2SSE_GLOBAL int8x8x2_t vld2_lane_s8(__transfersize(2) int8_t const * ptr, int8x8x2_t src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
_NEON2SSE_GLOBAL int16x4x2_t vld2_lane_s16(__transfersize(2) int16_t const * ptr, int16x4x2_t src, __constrange(0,3) int lane); //VLD2.16 {d0[0], d1[0]}, [r0]
_NEON2SSE_GLOBAL int32x2x2_t vld2_lane_s32(__transfersize(2) int32_t const * ptr, int32x2x2_t src, __constrange(0,1) int lane); //VLD2.32 {d0[0], d1[0]}, [r0]
//float16x4x2_t vld2_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
_NEON2SSESTORAGE float32x2x2_t vld2_lane_f32(__transfersize(2) float32_t const * ptr, float32x2x2_t src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
_NEON2SSE_GLOBAL poly8x8x2_t vld2_lane_p8(__transfersize(2) poly8_t const * ptr, poly8x8x2_t src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
_NEON2SSE_GLOBAL poly16x4x2_t vld2_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x4x2_t src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
_NEON2SSESTORAGE uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSESTORAGE uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSESTORAGE int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSESTORAGE int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSE_GLOBAL float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSESTORAGE float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSE_GLOBAL poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSESTORAGE uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSESTORAGE uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x4x3_t src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSESTORAGE uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSE_GLOBAL int8x8x3_t vld3_lane_s8(__transfersize(3) int8_t const * ptr, int8x8x3_t src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSE_GLOBAL int16x4x3_t vld3_lane_s16(__transfersize(3) int16_t const * ptr, int16x4x3_t src, __constrange(0,3) int lane); //VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSE_GLOBAL int32x2x3_t vld3_lane_s32(__transfersize(3) int32_t const * ptr, int32x2x3_t src, __constrange(0,1) int lane); //VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSE_GLOBAL float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSESTORAGE float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSE_GLOBAL poly8x8x3_t vld3_lane_p8(__transfersize(3) poly8_t const * ptr, poly8x8x3_t src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSE_GLOBAL poly16x4x3_t vld3_lane_p16(__transfersize(3) poly16_t const * ptr, poly16x4x3_t src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSESTORAGE uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSESTORAGE uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSE_GLOBAL int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSE_GLOBAL int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSE_GLOBAL float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSESTORAGE float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSE_GLOBAL poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSESTORAGE uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSESTORAGE uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSESTORAGE uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_GLOBAL int8x8x4_t vld4_lane_s8(__transfersize(4) int8_t const * ptr, int8x8x4_t src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_GLOBAL int16x4x4_t vld4_lane_s16(__transfersize(4) int16_t const * ptr, int16x4x4_t src, __constrange(0,3) int lane); //VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_GLOBAL int32x2x4_t vld4_lane_s32(__transfersize(4) int32_t const * ptr, int32x2x4_t src, __constrange(0,1) int lane); //VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_GLOBAL float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSESTORAGE float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_GLOBAL poly8x8x4_t vld4_lane_p8(__transfersize(4) poly8_t const * ptr, poly8x8x4_t src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_GLOBAL poly16x4x4_t vld4_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x4x4_t src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
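//A minimal usage sketch for the *_ptr lane-load forms above (illustrative only, not part of the API).
//Because of the C2719 issue noted above, the q-register source structure is passed by pointer, so the
//caller keeps it in a local variable and passes its address. Values and variable names are hypothetical.
//    uint16_t buf[2] = {10, 20};                        //the two interleaved elements to load
//    uint16x8x2_t pair;
//    pair.val[0] = vdupq_n_u16(0);
//    pair.val[1] = vdupq_n_u16(0);
//    pair = vld2q_lane_u16_ptr(buf, &pair, 3);          //lane 3 of val[0]/val[1] becomes buf[0]/buf[1]
//The d-register (64-bit) forms such as vld2_lane_u16 take the source structure by value, as declared above.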
//Store N-element structure to memory
_NEON2SSESTORAGE void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t const * val); // VST2.8 {d0, d2}, [r0]
_NEON2SSESTORAGE void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t const * val); // VST2.16 {d0, d2}, [r0]
_NEON2SSESTORAGE void vst2q_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x4x2_t const * val); // VST2.32 {d0, d2}, [r0]
_NEON2SSE_GLOBAL void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t const * val); // VST2.8 {d0, d2}, [r0]
_NEON2SSE_GLOBAL void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t const * val); // VST2.16 {d0, d2}, [r0]
_NEON2SSE_GLOBAL void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t const * val); // VST2.32 {d0, d2}, [r0]
_NEON2SSE_GLOBAL void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t const * val); // VST2.16 {d0, d2}, [r0]
_NEON2SSESTORAGE void vst2q_f32_ptr(__transfersize(8) float32_t * ptr, float32x4x2_t const * val); // VST2.32 {d0, d2}, [r0]
_NEON2SSE_GLOBAL void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t const * val); // VST2.8 {d0, d2}, [r0]
_NEON2SSE_GLOBAL void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t const * val); // VST2.16 {d0, d2}, [r0]
_NEON2SSESTORAGE void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val); // VST2.8 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val); // VST2.16 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val); // VST2.32 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val); // VST1.64 {d0, d1}, [r0]
_NEON2SSE_GLOBAL void vst2_s8(__transfersize(16) int8_t * ptr, int8x8x2_t val); // VST2.8 {d0, d1}, [r0]
_NEON2SSE_GLOBAL void vst2_s16(__transfersize(8) int16_t * ptr, int16x4x2_t val); // VST2.16 {d0, d1}, [r0]
_NEON2SSE_GLOBAL void vst2_s32(__transfersize(4) int32_t * ptr, int32x2x2_t val); // VST2.32 {d0, d1}, [r0]
_NEON2SSE_GLOBAL void vst2_s64(__transfersize(2) int64_t * ptr, int64x1x2_t val); // VST1.64 {d0, d1}, [r0]
//void vst2_f16_ptr(__transfersize(8) __fp16 * ptr, float16x4x2_t const * val); // VST2.16 {d0, d1}, [r0]
_NEON2SSE_GLOBAL void vst2_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x2_t const * val); // VST2.32 {d0, d1}, [r0]
_NEON2SSE_GLOBAL void vst2_p8(__transfersize(16) poly8_t * ptr, poly8x8x2_t val); // VST2.8 {d0, d1}, [r0]
_NEON2SSE_GLOBAL void vst2_p16(__transfersize(8) poly16_t * ptr, poly16x4x2_t val); // VST2.16 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t const * val); // VST3.8 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t const * val); // VST3.16 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t const * val); // VST3.32 {d0, d2, d4}, [r0]
_NEON2SSE_GLOBAL void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t const * val); // VST3.8 {d0, d2, d4}, [r0]
_NEON2SSE_GLOBAL void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t const * val); // VST3.16 {d0, d2, d4}, [r0]
_NEON2SSE_GLOBAL void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t const * val); // VST3.32 {d0, d2, d4}, [r0]
_NEON2SSE_GLOBAL void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t const * val); // VST3.16 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t const * val); // VST3.32 {d0, d2, d4}, [r0]
_NEON2SSE_GLOBAL void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t const * val); // VST3.8 {d0, d2, d4}, [r0]
_NEON2SSE_GLOBAL void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t const * val); // VST3.16 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val); // VST3.8 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val); // VST3.16 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val); // VST1.64 {d0, d1, d2}, [r0]
_NEON2SSE_GLOBAL void vst3_s8(__transfersize(24) int8_t * ptr, int8x8x3_t val); // VST3.8 {d0, d1, d2}, [r0]
_NEON2SSE_GLOBAL void vst3_s16(__transfersize(12) int16_t * ptr, int16x4x3_t val); // VST3.16 {d0, d1, d2}, [r0]
_NEON2SSE_GLOBAL void vst3_s32(__transfersize(6) int32_t * ptr, int32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
_NEON2SSE_GLOBAL void vst3_s64(__transfersize(3) int64_t * ptr, int64x1x3_t val); // VST1.64 {d0, d1, d2}, [r0]
_NEON2SSE_GLOBAL void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t const * val); // VST3.16 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
_NEON2SSE_GLOBAL void vst3_p8(__transfersize(24) poly8_t * ptr, poly8x8x3_t val); // VST3.8 {d0, d1, d2}, [r0]
_NEON2SSE_GLOBAL void vst3_p16(__transfersize(12) poly16_t * ptr, poly16x4x3_t val); // VST3.16 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t const * val); // VST4.8 {d0, d2, d4, d6}, [r0]
_NEON2SSESTORAGE void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t const * val); // VST4.16 {d0, d2, d4, d6}, [r0]
_NEON2SSESTORAGE void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t const * val); // VST4.32 {d0, d2, d4, d6}, [r0]
_NEON2SSE_GLOBAL void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t const * val); // VST4.8 {d0, d2, d4, d6}, [r0]
_NEON2SSE_GLOBAL void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t const * val); // VST4.16 {d0, d2, d4, d6}, [r0]
_NEON2SSE_GLOBAL void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t const * val); // VST4.32 {d0, d2, d4, d6}, [r0]
_NEON2SSE_GLOBAL void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t const * val); // VST4.16 {d0, d2, d4, d6}, [r0]
_NEON2SSESTORAGE void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t const * val); // VST4.32 {d0, d2, d4, d6}, [r0]
_NEON2SSE_GLOBAL void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t const * val); // VST4.8 {d0, d2, d4, d6}, [r0]
_NEON2SSE_GLOBAL void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t const * val); // VST4.16 {d0, d2, d4, d6}, [r0]
_NEON2SSESTORAGE void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val); // VST4.8 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val); // VST4.16 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val); // VST4.32 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0]
_NEON2SSE_GLOBAL void vst4_s8(__transfersize(32) int8_t * ptr, int8x8x4_t val); // VST4.8 {d0, d1, d2, d3}, [r0]
_NEON2SSE_GLOBAL void vst4_s16(__transfersize(16) int16_t * ptr, int16x4x4_t val); // VST4.16 {d0, d1, d2, d3}, [r0]
_NEON2SSE_GLOBAL void vst4_s32(__transfersize(8) int32_t * ptr, int32x2x4_t val); // VST4.32 {d0, d1, d2, d3}, [r0]
_NEON2SSE_GLOBAL void vst4_s64(__transfersize(4) int64_t * ptr, int64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0]
_NEON2SSE_GLOBAL void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t const * val); // VST4.16 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val); // VST4.32 {d0, d1, d2, d3}, [r0]
_NEON2SSE_GLOBAL void vst4_p8(__transfersize(32) poly8_t * ptr, poly8x8x4_t val); // VST4.8 {d0, d1, d2, d3}, [r0]
_NEON2SSE_GLOBAL void vst4_p16(__transfersize(16) poly16_t * ptr, poly16x4x4_t val); // VST4.16 {d0, d1, d2, d3}, [r0]
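//Illustrative sketch (not part of the API): the VSTn stores above interleave their source vectors,
//which is the usual way to write planar R,G,B data back as packed RGB. Names and values are hypothetical.
//    uint8_t rgb[24];                                   //room for 8 packed RGB triples
//    uint8x8x3_t planes;
//    planes.val[0] = vdup_n_u8(0xFF);                   //R plane
//    planes.val[1] = vdup_n_u8(0x00);                   //G plane
//    planes.val[2] = vdup_n_u8(0x80);                   //B plane
//    vst3_u8(rgb, planes);                              //rgb = {R0,G0,B0, R1,G1,B1, ...}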
//Store a single lane of N-element structure to memory
_NEON2SSESTORAGE void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t const * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
_NEON2SSESTORAGE void vst2q_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x4x2_t const * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0]
_NEON2SSE_GLOBAL void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t const * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
_NEON2SSE_GLOBAL void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t const * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0]
_NEON2SSE_GLOBAL void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t const * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
_NEON2SSESTORAGE void vst2q_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x4x2_t const * val, __constrange(0,3) int lane); //VST2.32 {d0[0], d2[0]}, [r0]
_NEON2SSE_GLOBAL void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t const * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
_NEON2SSESTORAGE void vst2_lane_u8(__transfersize(2) uint8_t * ptr, uint8x8x2_t val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0]
_NEON2SSESTORAGE void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
_NEON2SSESTORAGE void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
_NEON2SSE_GLOBAL void vst2_lane_s8(__transfersize(2) int8_t * ptr, int8x8x2_t val, __constrange(0,7) int lane); // VST2.8 {d0[0],d1[0]}, [r0]
_NEON2SSE_GLOBAL void vst2_lane_s16(__transfersize(2) int16_t * ptr, int16x4x2_t val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
_NEON2SSE_GLOBAL void vst2_lane_s32(__transfersize(2) int32_t * ptr, int32x2x2_t val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
_NEON2SSE_GLOBAL void vst2_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x4x2_t const * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
_NEON2SSESTORAGE void vst2_lane_f32(__transfersize(2) float32_t * ptr, float32x2x2_t val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
_NEON2SSE_GLOBAL void vst2_lane_p8(__transfersize(2) poly8_t * ptr, poly8x8x2_t val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0]
_NEON2SSE_GLOBAL void vst2_lane_p16(__transfersize(2) poly16_t * ptr, poly16x4x2_t val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
_NEON2SSESTORAGE void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t const * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
_NEON2SSESTORAGE void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t const * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0]
_NEON2SSE_GLOBAL void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t const * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
_NEON2SSE_GLOBAL void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t const * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0]
_NEON2SSE_GLOBAL void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t const * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
_NEON2SSESTORAGE void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t const * val, __constrange(0,3) int lane); //VST3.32 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSE_GLOBAL void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t const * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
_NEON2SSESTORAGE void vst3_lane_u8(__transfersize(3) uint8_t * ptr, uint8x8x3_t val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0]
_NEON2SSESTORAGE void vst3_lane_u16(__transfersize(3) uint16_t * ptr, uint16x4x3_t val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
_NEON2SSESTORAGE void vst3_lane_u32(__transfersize(3) uint32_t * ptr, uint32x2x3_t val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
_NEON2SSE_GLOBAL void vst3_lane_s8(__transfersize(3) int8_t * ptr, int8x8x3_t val, __constrange(0,7) int lane); // VST3.8 {d0[0],d1[0], d2[0]}, [r0]
_NEON2SSE_GLOBAL void vst3_lane_s16(__transfersize(3) int16_t * ptr, int16x4x3_t val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
_NEON2SSE_GLOBAL void vst3_lane_s32(__transfersize(3) int32_t * ptr, int32x2x3_t val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
_NEON2SSE_GLOBAL void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t const * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
_NEON2SSESTORAGE void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
_NEON2SSE_GLOBAL void vst3_lane_p8(__transfersize(3) poly8_t * ptr, poly8x8x3_t val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0]
_NEON2SSE_GLOBAL void vst3_lane_p16(__transfersize(3) poly16_t * ptr, poly16x4x3_t val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
_NEON2SSESTORAGE void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t const * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSESTORAGE void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t const * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSE_GLOBAL void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t const * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSE_GLOBAL void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t const * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSE_GLOBAL void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t const * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSESTORAGE void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t const * val, __constrange(0,3) int lane); //VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSE_GLOBAL void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t const * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSESTORAGE void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSESTORAGE void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSESTORAGE void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_GLOBAL void vst4_lane_s8(__transfersize(4) int8_t * ptr, int8x8x4_t val, __constrange(0,7) int lane); // VST4.8 {d0[0],d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_GLOBAL void vst4_lane_s16(__transfersize(4) int16_t * ptr, int16x4x4_t val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_GLOBAL void vst4_lane_s32(__transfersize(4) int32_t * ptr, int32x2x4_t val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_GLOBAL void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t const * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSESTORAGE void vst4_lane_f32(__transfersize(4) float32_t * ptr, float32x2x4_t val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_GLOBAL void vst4_lane_p8(__transfersize(4) poly8_t * ptr, poly8x8x4_t val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_GLOBAL void vst4_lane_p16(__transfersize(4) poly16_t * ptr, poly16x4x4_t val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
//Extract lanes from a vector and put into a register. These intrinsics extract a single lane (element) from a vector.
_NEON2SSE_GLOBAL uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
_NEON2SSE_GLOBAL uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0]
_NEON2SSE_GLOBAL uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
_NEON2SSE_GLOBAL int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0]
_NEON2SSE_GLOBAL int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0]
_NEON2SSE_GLOBAL int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
_NEON2SSE_GLOBAL poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
_NEON2SSE_GLOBAL poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0]
_NEON2SSE_GLOBAL float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
_NEON2SSE_GLOBAL uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
_NEON2SSE_GLOBAL uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0]
_NEON2SSE_GLOBAL uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
_NEON2SSE_GLOBAL int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0]
_NEON2SSE_GLOBAL int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0]
_NEON2SSE_GLOBAL int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
_NEON2SSE_GLOBAL poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
_NEON2SSE_GLOBAL poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0]
_NEON2SSESTORAGE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
_NEON2SSE_GLOBAL int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
_NEON2SSE_GLOBAL uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
_NEON2SSE_GLOBAL int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
_NEON2SSE_GLOBAL uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
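//Illustrative sketch (not part of the API): lane extraction reads one element back into a scalar; the lane
//index must be a compile-time constant within the declared range. Values below are hypothetical.
//    float32x4_t v = vdupq_n_f32(1.5f);
//    float32_t x = vgetq_lane_f32(v, 2);                //x == 1.5f
//    uint64x2_t w = vdupq_n_u64(7);
//    uint64_t lo = vgetq_lane_u64(w, 0);                //lo == 7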
//Load a single lane of a vector from a literal. These intrinsics set a single lane (element) within a vector.
_NEON2SSESTORAGE uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
_NEON2SSESTORAGE uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
_NEON2SSESTORAGE uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
_NEON2SSESTORAGE int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
_NEON2SSESTORAGE int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
_NEON2SSESTORAGE int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
_NEON2SSE_GLOBAL poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
_NEON2SSE_GLOBAL poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
_NEON2SSESTORAGE float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
_NEON2SSESTORAGE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
_NEON2SSESTORAGE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
_NEON2SSESTORAGE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
_NEON2SSESTORAGE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
_NEON2SSESTORAGE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
_NEON2SSESTORAGE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
_NEON2SSE_GLOBAL poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
_NEON2SSE_GLOBAL poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
_NEON2SSESTORAGE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
_NEON2SSESTORAGE int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
_NEON2SSESTORAGE uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
_NEON2SSESTORAGE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
_NEON2SSE_GLOBAL uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
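//Illustrative sketch (not part of the API): vset_lane/vsetq_lane return a copy of the input vector with one
//lane replaced; the vector argument itself is not modified. Values below are hypothetical.
//    int32x4_t v = vdupq_n_s32(0);
//    v = vsetq_lane_s32(-5, v, 1);                      //v = {0, -5, 0, 0}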
//Initialize a vector from a literal bit pattern.
_NEON2SSESTORAGE int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0
_NEON2SSE_GLOBAL int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0
_NEON2SSE_GLOBAL int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0
_NEON2SSE_GLOBAL float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0
_NEON2SSESTORAGE float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0
_NEON2SSE_GLOBAL uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0
_NEON2SSE_GLOBAL uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0
_NEON2SSE_GLOBAL uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0
_NEON2SSE_GLOBAL uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0
_NEON2SSE_GLOBAL poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0
_NEON2SSE_GLOBAL poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0
_NEON2SSE_GLOBAL int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0
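//Illustrative sketch (not part of the API): vcreate reinterprets a 64-bit literal as a vector, with the least
//significant byte of the literal becoming element 0. The constant below is hypothetical.
//    uint8x8_t idx = vcreate_u8(0x0706050403020100ULL); //idx = {0,1,2,3,4,5,6,7}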
//Set all lanes to same value
//Load all lanes of vector to the same literal value
_NEON2SSESTORAGE uint8x8_t vdup_n_u8(uint8_t value); // VDUP.8 d0,r0
_NEON2SSESTORAGE uint16x4_t vdup_n_u16(uint16_t value); // VDUP.16 d0,r0
_NEON2SSESTORAGE uint32x2_t vdup_n_u32(uint32_t value); // VDUP.32 d0,r0
_NEON2SSESTORAGE int8x8_t vdup_n_s8(int8_t value); // VDUP.8 d0,r0
_NEON2SSESTORAGE int16x4_t vdup_n_s16(int16_t value); // VDUP.16 d0,r0
_NEON2SSESTORAGE int32x2_t vdup_n_s32(int32_t value); // VDUP.32 d0,r0
_NEON2SSE_GLOBAL poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0
_NEON2SSE_GLOBAL poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0
_NEON2SSESTORAGE float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0
_NEON2SSE_GLOBAL uint8x16_t vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0
_NEON2SSE_GLOBAL uint16x8_t vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0
_NEON2SSE_GLOBAL uint32x4_t vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0
_NEON2SSE_GLOBAL int8x16_t vdupq_n_s8(int8_t value); // VDUP.8 q0,r0
_NEON2SSE_GLOBAL int16x8_t vdupq_n_s16(int16_t value); // VDUP.16 q0,r0
_NEON2SSE_GLOBAL int32x4_t vdupq_n_s32(int32_t value); // VDUP.32 q0,r0
_NEON2SSE_GLOBAL poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0
_NEON2SSE_GLOBAL poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0
_NEON2SSE_GLOBAL float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0
_NEON2SSESTORAGE int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0
_NEON2SSESTORAGE uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0
_NEON2SSESTORAGE int64x2_t vdupq_n_s64(int64_t value); // VMOV d0,r0,r0
_NEON2SSESTORAGE uint64x2_t vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0
_NEON2SSE_GLOBAL uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0
_NEON2SSE_GLOBAL uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0
_NEON2SSE_GLOBAL uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0
_NEON2SSE_GLOBAL int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0
_NEON2SSE_GLOBAL int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0
_NEON2SSE_GLOBAL int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0
_NEON2SSE_GLOBAL poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0
_NEON2SSE_GLOBAL poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0
_NEON2SSE_GLOBAL float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0
_NEON2SSE_GLOBAL uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0
_NEON2SSE_GLOBAL uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0
_NEON2SSE_GLOBAL uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0
_NEON2SSE_GLOBAL int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0
_NEON2SSE_GLOBAL int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0
_NEON2SSE_GLOBAL int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0
_NEON2SSE_GLOBAL poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0
_NEON2SSE_GLOBAL poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0
_NEON2SSE_GLOBAL float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0
_NEON2SSE_GLOBAL int64x1_t vmov_n_s64(int64_t value); // VMOV d0,r0,r0
_NEON2SSE_GLOBAL uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0
_NEON2SSE_GLOBAL int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0
_NEON2SSE_GLOBAL uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0
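//Illustrative sketch (not part of the API): vdup_n_xx and vmov_n_xx are equivalent ways to broadcast one
//scalar into every lane of the result. Values below are hypothetical.
//    float32x4_t ones = vdupq_n_f32(1.0f);              //{1.0f, 1.0f, 1.0f, 1.0f}
//    int16x8_t k = vmovq_n_s16(42);                     //all eight lanes equal 42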
//Load all lanes of the vector to the value of a lane of a vector
_NEON2SSESTORAGE uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
_NEON2SSESTORAGE uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
_NEON2SSESTORAGE uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
_NEON2SSE_GLOBAL int8x8_t vdup_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
_NEON2SSE_GLOBAL int16x4_t vdup_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
_NEON2SSE_GLOBAL int32x2_t vdup_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
_NEON2SSE_GLOBAL poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
_NEON2SSE_GLOBAL poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
_NEON2SSESTORAGE float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
_NEON2SSESTORAGE uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
_NEON2SSESTORAGE uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
_NEON2SSESTORAGE uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
_NEON2SSE_GLOBAL int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
_NEON2SSE_GLOBAL int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
_NEON2SSE_GLOBAL int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
_NEON2SSE_GLOBAL poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
_NEON2SSE_GLOBAL poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
_NEON2SSE_GLOBAL float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
_NEON2SSE_GLOBAL int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
_NEON2SSE_GLOBAL uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
_NEON2SSESTORAGE int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
_NEON2SSE_GLOBAL uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
//Combining vectors. These intrinsics join two 64 bit vectors into a single 128bit vector.
_NEON2SSESTORAGE int8x16_t vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0
_NEON2SSE_GLOBAL int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0
_NEON2SSE_GLOBAL int32x4_t vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0
_NEON2SSE_GLOBAL int64x2_t vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0
_NEON2SSE_GLOBAL float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0
_NEON2SSESTORAGE float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
_NEON2SSE_GLOBAL uint8x16_t vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0
_NEON2SSE_GLOBAL uint16x8_t vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0
_NEON2SSE_GLOBAL uint32x4_t vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0
_NEON2SSE_GLOBAL uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0
_NEON2SSE_GLOBAL poly8x16_t vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0
_NEON2SSE_GLOBAL poly16x8_t vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0
//Splitting vectors. These intrinsics split a 128 bit vector into 2 component 64 bit vectors
_NEON2SSESTORAGE int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0
_NEON2SSESTORAGE int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0
_NEON2SSESTORAGE int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0
_NEON2SSESTORAGE int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0
_NEON2SSE_GLOBAL float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0
_NEON2SSESTORAGE float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0
_NEON2SSE_GLOBAL uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0
_NEON2SSE_GLOBAL uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0
_NEON2SSE_GLOBAL uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0
_NEON2SSE_GLOBAL uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0
_NEON2SSE_GLOBAL poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0
_NEON2SSE_GLOBAL poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0
_NEON2SSESTORAGE int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0
_NEON2SSESTORAGE int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0
_NEON2SSESTORAGE int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0
_NEON2SSESTORAGE int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0
_NEON2SSE_GLOBAL float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0
_NEON2SSESTORAGE float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0
_NEON2SSE_GLOBAL uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0
_NEON2SSE_GLOBAL uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0
_NEON2SSE_GLOBAL uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0
_NEON2SSE_GLOBAL uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0
_NEON2SSE_GLOBAL poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0
_NEON2SSE_GLOBAL poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0
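//Illustrative sketch (not part of the API): vcombine_xx and vget_low_xx/vget_high_xx are inverses, so the
//round trip below leaves the data unchanged. Variable names are hypothetical.
//    uint8x16_t q = vdupq_n_u8(3);
//    uint8x8_t lo = vget_low_u8(q);                     //lanes 0..7
//    uint8x8_t hi = vget_high_u8(q);                    //lanes 8..15
//    uint8x16_t q2 = vcombine_u8(lo, hi);               //q2 holds the same 16 bytes as q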
//Converting vectors. These intrinsics are used to convert vectors.
//Convert from float
_NEON2SSESTORAGE int32x2_t vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0
_NEON2SSESTORAGE uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0
_NEON2SSESTORAGE int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0
_NEON2SSESTORAGE uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0
_NEON2SSESTORAGE int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32
_NEON2SSESTORAGE uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32
_NEON2SSESTORAGE int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32
_NEON2SSESTORAGE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32
_NEON2SSESTORAGE int32x4_t vcvtnq_s32_f32(float32x4_t a); // VCVTN.S32.F32 q0, q0
//Convert to float
_NEON2SSESTORAGE float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0
_NEON2SSESTORAGE float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0
_NEON2SSE_GLOBAL float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0
_NEON2SSESTORAGE float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0
_NEON2SSESTORAGE float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32
_NEON2SSESTORAGE float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32
_NEON2SSESTORAGE float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32
_NEON2SSESTORAGE float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32
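//Illustrative sketch (not part of the API): the plain float->integer conversions round toward zero, and the
//_n_ forms treat the values as fixed-point with the given number of fractional bits. Values are hypothetical.
//    float32x4_t f = vdupq_n_f32(2.75f);
//    int32x4_t i = vcvtq_s32_f32(f);                    //{2, 2, 2, 2} (truncation toward zero)
//    int32x4_t q8 = vcvtq_n_s32_f32(f, 8);              //{704, 704, 704, 704} (2.75 * 2^8)
//    float32x4_t g = vcvtq_n_f32_s32(q8, 8);            //back to {2.75f, 2.75f, 2.75f, 2.75f}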
//Convert between floats
_NEON2SSE_GLOBAL float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0
_NEON2SSE_GLOBAL float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0
//Vector narrow integer
_NEON2SSESTORAGE int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0
_NEON2SSESTORAGE int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0
_NEON2SSESTORAGE int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0
_NEON2SSE_GLOBAL uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0
_NEON2SSE_GLOBAL uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0
_NEON2SSE_GLOBAL uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0
//Vector long move
_NEON2SSESTORAGE int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0
_NEON2SSESTORAGE int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0
_NEON2SSESTORAGE int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0
_NEON2SSESTORAGE uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 q0,d0
_NEON2SSESTORAGE uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.U16 q0,d0
_NEON2SSESTORAGE uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0
//Vector saturating narrow integer
_NEON2SSESTORAGE int8x8_t vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0
_NEON2SSESTORAGE int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0
_NEON2SSESTORAGE int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0
_NEON2SSESTORAGE uint8x8_t vqmovn_u16(uint16x8_t a); // VQMOVN.U16 d0,q0
_NEON2SSESTORAGE uint16x4_t vqmovn_u32(uint32x4_t a); // VQMOVN.U32 d0,q0
_NEON2SSESTORAGE uint32x2_t vqmovn_u64(uint64x2_t a); // VQMOVN.U64 d0,q0
//Vector saturating narrow integer signed->unsigned
_NEON2SSESTORAGE uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0
_NEON2SSESTORAGE uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0
_NEON2SSESTORAGE uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0
//Table look up
_NEON2SSESTORAGE uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
_NEON2SSE_GLOBAL int8x8_t vtbl1_s8(int8x8_t a, int8x8_t b); // VTBL.8 d0, {d0}, d0
_NEON2SSE_GLOBAL poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
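//Illustrative sketch (not part of the API): vtbl1_u8 uses each byte of the second argument as an index into
//the first argument; indices outside 0..7 produce 0 in the corresponding lane. The constants are hypothetical.
//    uint8x8_t table = vcreate_u8(0x1716151413121110ULL); //{0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17}
//    uint8x8_t idx = vcreate_u8(0xFF00010203040506ULL);   //{6,5,4,3,2,1,0,255}
//    uint8x8_t r = vtbl1_u8(table, idx);                  //{0x16,0x15,0x14,0x13,0x12,0x11,0x10,0}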
//Extended table look up intrinsics
_NEON2SSESTORAGE uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
_NEON2SSE_GLOBAL int8x8_t vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0
_NEON2SSE_GLOBAL poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
_NEON2SSESTORAGE uint8x8_t vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
_NEON2SSE_GLOBAL int8x8_t vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c); // VTBX.8 d0, {d0, d1}, d0
_NEON2SSE_GLOBAL poly8x8_t vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
_NEON2SSESTORAGE uint8x8_t vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
_NEON2SSE_GLOBAL int8x8_t vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
_NEON2SSE_GLOBAL poly8x8_t vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
_NEON2SSESTORAGE uint8x8_t vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
_NEON2SSE_GLOBAL int8x8_t vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
_NEON2SSE_GLOBAL poly8x8_t vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
//Operations with a scalar value
//Vector multiply accumulate with scalar
_NEON2SSESTORAGE int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0]
_NEON2SSESTORAGE int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0]
_NEON2SSE_GLOBAL uint16x4_t vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0]
_NEON2SSE_GLOBAL uint32x2_t vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0]
_NEON2SSESTORAGE float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 d0,d0, d0[0]
_NEON2SSESTORAGE int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0,d0[0]
_NEON2SSESTORAGE int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0,d0[0]
_NEON2SSE_GLOBAL uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0,q0, d0[0]
_NEON2SSE_GLOBAL uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0,q0, d0[0]
_NEON2SSESTORAGE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0,q0, d0[0]
//Vector widening multiply accumulate with scalar
_NEON2SSESTORAGE int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); //VMLAL.S16 q0, d0,d0[0]
_NEON2SSESTORAGE int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); //VMLAL.S32 q0, d0,d0[0]
_NEON2SSESTORAGE uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.U16 q0,d0, d0[0]
_NEON2SSESTORAGE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0,d0, d0[0]
//Vector widening saturating doubling multiply accumulate with scalar
_NEON2SSESTORAGE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0,d0, d0[0]
_NEON2SSESTORAGE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0,d0, d0[0]
//Vector multiply subtract with scalar
_NEON2SSESTORAGE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0]
_NEON2SSESTORAGE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0]
_NEON2SSESTORAGE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0]
_NEON2SSESTORAGE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0]
_NEON2SSESTORAGE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0,d0, d0[0]
_NEON2SSESTORAGE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0,d0[0]
_NEON2SSESTORAGE int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0,d0[0]
_NEON2SSESTORAGE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0,q0, d0[0]
_NEON2SSESTORAGE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0,q0, d0[0]
_NEON2SSESTORAGE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 q0,q0, d0[0]
//Vector widening multiply subtract with scalar
_NEON2SSESTORAGE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLSL.S16 q0, d0,d0[0]
_NEON2SSESTORAGE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLSL.S32 q0, d0,d0[0]
_NEON2SSESTORAGE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLSL.U16 q0,d0, d0[0]
_NEON2SSESTORAGE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLSL.U32 q0,d0, d0[0]
//Vector widening saturating doubling multiply subtract with scalar
_NEON2SSESTORAGE int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0,d0, d0[0]
_NEON2SSESTORAGE int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0,d0, d0[0]
//Vector multiply by scalar
_NEON2SSESTORAGE int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0]
_NEON2SSESTORAGE int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0]
_NEON2SSESTORAGE float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0]
_NEON2SSESTORAGE uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0]
_NEON2SSESTORAGE uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0]
_NEON2SSESTORAGE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0]
_NEON2SSESTORAGE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0]
_NEON2SSESTORAGE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0]
_NEON2SSESTORAGE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0]
_NEON2SSESTORAGE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0]
//Vector long multiply with scalar
_NEON2SSESTORAGE int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0]
_NEON2SSESTORAGE int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0]
_NEON2SSESTORAGE uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.U16 q0,d0,d0[0]
_NEON2SSESTORAGE uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0]
//Vector long multiply by scalar
_NEON2SSESTORAGE int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0]
_NEON2SSESTORAGE int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0]
_NEON2SSESTORAGE uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.U16 q0,d0,d0[0]
_NEON2SSESTORAGE uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0]
//Vector saturating doubling long multiply with scalar
_NEON2SSESTORAGE int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0]
_NEON2SSESTORAGE int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0]
//Vector saturating doubling long multiply by scalar
_NEON2SSESTORAGE int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0]
_NEON2SSESTORAGE int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULL.S32 q0,d0,d0[0]
//Vector saturating doubling multiply high with scalar
_NEON2SSESTORAGE int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQDMULH.S16 d0,d0,d0[0]
_NEON2SSESTORAGE int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQDMULH.S32 d0,d0,d0[0]
_NEON2SSESTORAGE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQDMULH.S16 q0,q0,d0[0]
_NEON2SSESTORAGE int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQDMULH.S32 q0,q0,d0[0]
//Vector saturating doubling multiply high by scalar
_NEON2SSESTORAGE int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 d0,d0,d0[0]
_NEON2SSESTORAGE int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 d0,d0,d0[0]
_NEON2SSESTORAGE int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 q0,q0,d0[0]
_NEON2SSESTORAGE int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 q0,q0,d0[0]
//Vector saturating rounding doubling multiply high with scalar
_NEON2SSESTORAGE int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0]
_NEON2SSESTORAGE int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQRDMULH.S32 d0,d0,d0[0]
_NEON2SSESTORAGE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0]
_NEON2SSESTORAGE int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0]
//Vector rounding saturating doubling multiply high by scalar
_NEON2SSESTORAGE int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0]
_NEON2SSESTORAGE int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0]
_NEON2SSESTORAGE int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0]
_NEON2SSESTORAGE int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0]
//Vector multiply accumulate with scalar
_NEON2SSESTORAGE int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0]
_NEON2SSESTORAGE int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0]
_NEON2SSE_GLOBAL uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLA.I16 d0, d0, d0[0]
_NEON2SSE_GLOBAL uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0]
_NEON2SSESTORAGE float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0]
_NEON2SSESTORAGE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0]
_NEON2SSESTORAGE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0]
_NEON2SSE_GLOBAL uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0]
_NEON2SSE_GLOBAL uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0]
_NEON2SSESTORAGE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0]
//Vector widening multiply accumulate with scalar
_NEON2SSESTORAGE int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0]
_NEON2SSESTORAGE int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0]
_NEON2SSESTORAGE uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.U16 q0, d0, d0[0]
_NEON2SSESTORAGE uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0]
//Vector widening saturating doubling multiply accumulate with scalar
_NEON2SSESTORAGE int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0]
_NEON2SSESTORAGE int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0]
//Vector multiply subtract with scalar
_NEON2SSESTORAGE int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0]
_NEON2SSESTORAGE int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0]
_NEON2SSESTORAGE uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0]
_NEON2SSESTORAGE uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0]
_NEON2SSESTORAGE float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0]
_NEON2SSESTORAGE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0]
_NEON2SSESTORAGE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0]
_NEON2SSESTORAGE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0]
_NEON2SSESTORAGE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0]
_NEON2SSESTORAGE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0]
//Vector widening multiply subtract with scalar
_NEON2SSESTORAGE int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0]
_NEON2SSESTORAGE int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0]
_NEON2SSESTORAGE uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLSL.U16 q0, d0, d0[0]
_NEON2SSESTORAGE uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0]
//Vector widening saturating doubling multiply subtract with scalar
_NEON2SSESTORAGE int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0]
_NEON2SSESTORAGE int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0]
//Vector extract
_NEON2SSESTORAGE int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
_NEON2SSE_GLOBAL uint8x8_t vext_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
_NEON2SSE_GLOBAL poly8x8_t vext_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
_NEON2SSESTORAGE int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
_NEON2SSE_GLOBAL uint16x4_t vext_u16(uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
_NEON2SSE_GLOBAL poly16x4_t vext_p16(poly16x4_t a, poly16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
_NEON2SSESTORAGE int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
_NEON2SSE_GLOBAL uint32x2_t vext_u32(uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
_NEON2SSE_GLOBAL int64x1_t vext_s64(int64x1_t a, int64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
_NEON2SSE_GLOBAL uint64x1_t vext_u64(uint64x1_t a, uint64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
_NEON2SSESTORAGE float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
_NEON2SSE_GLOBAL int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
_NEON2SSE_GLOBAL uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
_NEON2SSE_GLOBAL poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
_NEON2SSE_GLOBAL int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
_NEON2SSE_GLOBAL uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
_NEON2SSE_GLOBAL poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
1988_NEON2SSE_GLOBAL int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
1989_NEON2SSE_GLOBAL uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
1990_NEON2SSE_GLOBAL int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
1991_NEON2SSE_GLOBAL uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
_NEON2SSE_GLOBAL float32x4_t vextq_f32(float32x4_t a, float32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
1993//Reverse vector elements (swap endianness). VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide.
1994_NEON2SSESTORAGE int8x8_t vrev64_s8(int8x8_t vec); // VREV64.8 d0,d0
1995_NEON2SSESTORAGE int16x4_t vrev64_s16(int16x4_t vec); // VREV64.16 d0,d0
1996_NEON2SSESTORAGE int32x2_t vrev64_s32(int32x2_t vec); // VREV64.32 d0,d0
1997_NEON2SSE_GLOBAL uint8x8_t vrev64_u8(uint8x8_t vec); // VREV64.8 d0,d0
1998_NEON2SSE_GLOBAL uint16x4_t vrev64_u16(uint16x4_t vec); // VREV64.16 d0,d0
1999_NEON2SSE_GLOBAL uint32x2_t vrev64_u32(uint32x2_t vec); // VREV64.32 d0,d0
2000_NEON2SSE_GLOBAL poly8x8_t vrev64_p8(poly8x8_t vec); // VREV64.8 d0,d0
2001_NEON2SSE_GLOBAL poly16x4_t vrev64_p16(poly16x4_t vec); // VREV64.16 d0,d0
2002_NEON2SSESTORAGE float32x2_t vrev64_f32(float32x2_t vec); // VREV64.32 d0,d0
2003_NEON2SSESTORAGE int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0
2004_NEON2SSESTORAGE int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0
2005_NEON2SSESTORAGE int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0
2006_NEON2SSE_GLOBAL uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0
2007_NEON2SSE_GLOBAL uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0
2008_NEON2SSE_GLOBAL uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0
2009_NEON2SSE_GLOBAL poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0
2010_NEON2SSE_GLOBAL poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0
2011_NEON2SSE_GLOBAL float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0
2012_NEON2SSESTORAGE int8x8_t vrev32_s8(int8x8_t vec); // VREV32.8 d0,d0
2013_NEON2SSESTORAGE int16x4_t vrev32_s16(int16x4_t vec); // VREV32.16 d0,d0
2014_NEON2SSE_GLOBAL uint8x8_t vrev32_u8(uint8x8_t vec); // VREV32.8 d0,d0
2015_NEON2SSE_GLOBAL uint16x4_t vrev32_u16(uint16x4_t vec); // VREV32.16 d0,d0
2016_NEON2SSE_GLOBAL poly8x8_t vrev32_p8(poly8x8_t vec); // VREV32.8 d0,d0
2017_NEON2SSE_GLOBAL poly16x4_t vrev32_p16(poly16x4_t vec); // VREV32.16 d0,d0
2018_NEON2SSESTORAGE int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0
2019_NEON2SSESTORAGE int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0
2020_NEON2SSE_GLOBAL uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0
2021_NEON2SSE_GLOBAL uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0
2022_NEON2SSE_GLOBAL poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0
2023_NEON2SSE_GLOBAL poly16x8_t vrev32q_p16(poly16x8_t vec); // VREV32.16 q0,q0
2024_NEON2SSESTORAGE int8x8_t vrev16_s8(int8x8_t vec); // VREV16.8 d0,d0
2025_NEON2SSE_GLOBAL uint8x8_t vrev16_u8(uint8x8_t vec); // VREV16.8 d0,d0
2026_NEON2SSE_GLOBAL poly8x8_t vrev16_p8(poly8x8_t vec); // VREV16.8 d0,d0
2027_NEON2SSESTORAGE int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0
2028_NEON2SSE_GLOBAL uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0
2029_NEON2SSE_GLOBAL poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0
2030//Other single operand arithmetic
2031//Absolute: Vd[i] = |Va[i]|
2032_NEON2SSESTORAGE int8x8_t vabs_s8(int8x8_t a); // VABS.S8 d0,d0
2033_NEON2SSESTORAGE int16x4_t vabs_s16(int16x4_t a); // VABS.S16 d0,d0
2034_NEON2SSESTORAGE int32x2_t vabs_s32(int32x2_t a); // VABS.S32 d0,d0
2035_NEON2SSESTORAGE float32x2_t vabs_f32(float32x2_t a); // VABS.F32 d0,d0
2036_NEON2SSE_GLOBAL int8x16_t vabsq_s8(int8x16_t a); // VABS.S8 q0,q0
2037_NEON2SSE_GLOBAL int16x8_t vabsq_s16(int16x8_t a); // VABS.S16 q0,q0
2038_NEON2SSE_GLOBAL int32x4_t vabsq_s32(int32x4_t a); // VABS.S32 q0,q0
2039_NEON2SSESTORAGE float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0
2040
2041#ifdef _NEON2SSE_64BIT
2042_NEON2SSESTORAGE int64x2_t vabsq_s64(int64x2_t a); // VABS.S64 q0,q0
2043_NEON2SSESTORAGE float64x2_t vabsq_f64(float64x2_t a); // VABS.F64 q0,q0
2044#endif
2045
2046//Saturating absolute: Vd[i] = sat(|Va[i]|)
2047_NEON2SSESTORAGE int8x8_t vqabs_s8(int8x8_t a); // VQABS.S8 d0,d0
2048_NEON2SSESTORAGE int16x4_t vqabs_s16(int16x4_t a); // VQABS.S16 d0,d0
2049_NEON2SSESTORAGE int32x2_t vqabs_s32(int32x2_t a); // VQABS.S32 d0,d0
2050_NEON2SSESTORAGE int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0
2051_NEON2SSESTORAGE int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0
2052_NEON2SSESTORAGE int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0
2053//Negate: Vd[i] = - Va[i]
_NEON2SSESTORAGE int8x8_t vneg_s8(int8x8_t a); // VNEG.S8 d0,d0
_NEON2SSESTORAGE int16x4_t vneg_s16(int16x4_t a); // VNEG.S16 d0,d0
_NEON2SSESTORAGE int32x2_t vneg_s32(int32x2_t a); // VNEG.S32 d0,d0
_NEON2SSESTORAGE float32x2_t vneg_f32(float32x2_t a); // VNEG.F32 d0,d0
_NEON2SSESTORAGE int8x16_t vnegq_s8(int8x16_t a); // VNEG.S8 q0,q0
_NEON2SSESTORAGE int16x8_t vnegq_s16(int16x8_t a); // VNEG.S16 q0,q0
_NEON2SSESTORAGE int32x4_t vnegq_s32(int32x4_t a); // VNEG.S32 q0,q0
_NEON2SSESTORAGE float32x4_t vnegq_f32(float32x4_t a); // VNEG.F32 q0,q0
2062//Saturating Negate: sat(Vd[i] = - Va[i])
_NEON2SSESTORAGE int8x8_t vqneg_s8(int8x8_t a); // VQNEG.S8 d0,d0
_NEON2SSESTORAGE int16x4_t vqneg_s16(int16x4_t a); // VQNEG.S16 d0,d0
_NEON2SSESTORAGE int32x2_t vqneg_s32(int32x2_t a); // VQNEG.S32 d0,d0
_NEON2SSESTORAGE int8x16_t vqnegq_s8(int8x16_t a); // VQNEG.S8 q0,q0
_NEON2SSESTORAGE int16x8_t vqnegq_s16(int16x8_t a); // VQNEG.S16 q0,q0
_NEON2SSESTORAGE int32x4_t vqnegq_s32(int32x4_t a); // VQNEG.S32 q0,q0
2069//Count leading sign bits
2070_NEON2SSESTORAGE int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0
2071_NEON2SSESTORAGE int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0
2072_NEON2SSESTORAGE int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0
2073_NEON2SSESTORAGE int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0
2074_NEON2SSESTORAGE int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0
2075_NEON2SSESTORAGE int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0
2076//Count leading zeros
2077_NEON2SSESTORAGE int8x8_t vclz_s8(int8x8_t a); // VCLZ.I8 d0,d0
2078_NEON2SSESTORAGE int16x4_t vclz_s16(int16x4_t a); // VCLZ.I16 d0,d0
2079_NEON2SSESTORAGE int32x2_t vclz_s32(int32x2_t a); // VCLZ.I32 d0,d0
2080_NEON2SSE_GLOBAL uint8x8_t vclz_u8(uint8x8_t a); // VCLZ.I8 d0,d0
2081_NEON2SSE_GLOBAL uint16x4_t vclz_u16(uint16x4_t a); // VCLZ.I16 d0,d0
2082_NEON2SSE_GLOBAL uint32x2_t vclz_u32(uint32x2_t a); // VCLZ.I32 d0,d0
2083_NEON2SSESTORAGE int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0
2084_NEON2SSESTORAGE int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0
2085_NEON2SSESTORAGE int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0
2086_NEON2SSE_GLOBAL uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0
2087_NEON2SSE_GLOBAL uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0
2088_NEON2SSE_GLOBAL uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0
2089//Count number of set bits
2090_NEON2SSESTORAGE uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0
2091_NEON2SSE_GLOBAL int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0
2092_NEON2SSE_GLOBAL poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0
2093_NEON2SSESTORAGE uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0
2094_NEON2SSE_GLOBAL int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0
2095_NEON2SSE_GLOBAL poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0
2096//Reciprocal estimate
2097_NEON2SSESTORAGE float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0
2098_NEON2SSESTORAGE uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0
2099_NEON2SSE_GLOBAL float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0
2100_NEON2SSESTORAGE uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0
2101//Reciprocal square root estimate
2102_NEON2SSESTORAGE float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0
2103_NEON2SSESTORAGE uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0
2104_NEON2SSE_GLOBAL float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0
2105_NEON2SSESTORAGE uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0
2106//Logical operations
2107//Bitwise not
2108_NEON2SSESTORAGE int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0
2109_NEON2SSESTORAGE int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0
2110_NEON2SSESTORAGE int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0
2111_NEON2SSE_GLOBAL uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0
2112_NEON2SSE_GLOBAL uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0
2113_NEON2SSE_GLOBAL uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0
2114_NEON2SSE_GLOBAL poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0
2115_NEON2SSESTORAGE int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0
2116_NEON2SSESTORAGE int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0
2117_NEON2SSESTORAGE int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0
2118_NEON2SSE_GLOBAL uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0
2119_NEON2SSE_GLOBAL uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0
2120_NEON2SSE_GLOBAL uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0
2121_NEON2SSE_GLOBAL poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0
2122//Bitwise and
2123_NEON2SSESTORAGE int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0
2124_NEON2SSESTORAGE int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0
2125_NEON2SSESTORAGE int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0
2126_NEON2SSESTORAGE int64x1_t vand_s64(int64x1_t a, int64x1_t b); // VAND d0,d0,d0
2127_NEON2SSE_GLOBAL uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0
2128_NEON2SSE_GLOBAL uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0
2129_NEON2SSE_GLOBAL uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0
2130_NEON2SSE_GLOBAL uint64x1_t vand_u64(uint64x1_t a, uint64x1_t b); // VAND d0,d0,d0
2131_NEON2SSE_GLOBAL int8x16_t vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0
2132_NEON2SSE_GLOBAL int16x8_t vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0
2133_NEON2SSE_GLOBAL int32x4_t vandq_s32(int32x4_t a, int32x4_t b); // VAND q0,q0,q0
2134_NEON2SSE_GLOBAL int64x2_t vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0
2135_NEON2SSE_GLOBAL uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0
2136_NEON2SSE_GLOBAL uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0
2137_NEON2SSE_GLOBAL uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0
2138_NEON2SSE_GLOBAL uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0
2139//Bitwise or
2140_NEON2SSESTORAGE int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0
2141_NEON2SSESTORAGE int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0
2142_NEON2SSESTORAGE int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0
2143_NEON2SSESTORAGE int64x1_t vorr_s64(int64x1_t a, int64x1_t b); // VORR d0,d0,d0
2144_NEON2SSE_GLOBAL uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0
2145_NEON2SSE_GLOBAL uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0
2146_NEON2SSE_GLOBAL uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0
2147_NEON2SSE_GLOBAL uint64x1_t vorr_u64(uint64x1_t a, uint64x1_t b); // VORR d0,d0,d0
2148_NEON2SSE_GLOBAL int8x16_t vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0
2149_NEON2SSE_GLOBAL int16x8_t vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0
2150_NEON2SSE_GLOBAL int32x4_t vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0
2151_NEON2SSE_GLOBAL int64x2_t vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0
2152_NEON2SSE_GLOBAL uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0
2153_NEON2SSE_GLOBAL uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0
2154_NEON2SSE_GLOBAL uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0
2155_NEON2SSE_GLOBAL uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0
2156//Bitwise exclusive or (EOR or XOR)
2157_NEON2SSESTORAGE int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0
2158_NEON2SSE_GLOBAL int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0
2159_NEON2SSE_GLOBAL int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0
2160_NEON2SSESTORAGE int64x1_t veor_s64(int64x1_t a, int64x1_t b); // VEOR d0,d0,d0
2161_NEON2SSE_GLOBAL uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0
2162_NEON2SSE_GLOBAL uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0
2163_NEON2SSE_GLOBAL uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0
2164_NEON2SSE_GLOBAL uint64x1_t veor_u64(uint64x1_t a, uint64x1_t b); // VEOR d0,d0,d0
2165_NEON2SSE_GLOBAL int8x16_t veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0
2166_NEON2SSE_GLOBAL int16x8_t veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0
2167_NEON2SSE_GLOBAL int32x4_t veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0
2168_NEON2SSE_GLOBAL int64x2_t veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0
2169_NEON2SSE_GLOBAL uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0
2170_NEON2SSE_GLOBAL uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0
2171_NEON2SSE_GLOBAL uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0
2172_NEON2SSE_GLOBAL uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0
2173//Bit Clear
2174_NEON2SSESTORAGE int8x8_t vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0
2175_NEON2SSE_GLOBAL int16x4_t vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0
2176_NEON2SSE_GLOBAL int32x2_t vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0
2177_NEON2SSESTORAGE int64x1_t vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0
2178_NEON2SSE_GLOBAL uint8x8_t vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0
2179_NEON2SSE_GLOBAL uint16x4_t vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0
2180_NEON2SSE_GLOBAL uint32x2_t vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0
2181_NEON2SSE_GLOBAL uint64x1_t vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0
2182_NEON2SSE_GLOBAL int8x16_t vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0
2183_NEON2SSE_GLOBAL int16x8_t vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0
2184_NEON2SSE_GLOBAL int32x4_t vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0
2185_NEON2SSE_GLOBAL int64x2_t vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0
2186_NEON2SSE_GLOBAL uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0
2187_NEON2SSE_GLOBAL uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0
2188_NEON2SSE_GLOBAL uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0
2189_NEON2SSE_GLOBAL uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0
2190//Bitwise OR complement
2191_NEON2SSESTORAGE int8x8_t vorn_s8(int8x8_t a, int8x8_t b); // VORN d0,d0,d0
2192_NEON2SSESTORAGE int16x4_t vorn_s16(int16x4_t a, int16x4_t b); // VORN d0,d0,d0
2193_NEON2SSESTORAGE int32x2_t vorn_s32(int32x2_t a, int32x2_t b); // VORN d0,d0,d0
2194_NEON2SSESTORAGE int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0
2195_NEON2SSE_GLOBAL uint8x8_t vorn_u8(uint8x8_t a, uint8x8_t b); // VORN d0,d0,d0
2196_NEON2SSE_GLOBAL uint16x4_t vorn_u16(uint16x4_t a, uint16x4_t b); // VORN d0,d0,d0
2197_NEON2SSE_GLOBAL uint32x2_t vorn_u32(uint32x2_t a, uint32x2_t b); // VORN d0,d0,d0
2198_NEON2SSE_GLOBAL uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0
2199_NEON2SSESTORAGE int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0
2200_NEON2SSESTORAGE int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0
2201_NEON2SSESTORAGE int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0
2202_NEON2SSESTORAGE int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0
2203_NEON2SSESTORAGE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0
2204_NEON2SSESTORAGE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0
2205_NEON2SSESTORAGE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0
2206_NEON2SSE_GLOBAL uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0
2207//Bitwise Select
2208_NEON2SSESTORAGE int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0
2209_NEON2SSE_GLOBAL int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0
2210_NEON2SSE_GLOBAL int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0
2211_NEON2SSESTORAGE int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0
2212_NEON2SSE_GLOBAL uint8x8_t vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0
2213_NEON2SSE_GLOBAL uint16x4_t vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0
2214_NEON2SSE_GLOBAL uint32x2_t vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0
2215_NEON2SSE_GLOBAL uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0
2216_NEON2SSESTORAGE float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0
2217_NEON2SSE_GLOBAL poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0
2218_NEON2SSE_GLOBAL poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0
2219_NEON2SSESTORAGE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0
2220_NEON2SSE_GLOBAL int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0
2221_NEON2SSE_GLOBAL int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0
2222_NEON2SSE_GLOBAL int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0
2223_NEON2SSE_GLOBAL uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0
2224_NEON2SSE_GLOBAL uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0
2225_NEON2SSE_GLOBAL uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0
2226_NEON2SSE_GLOBAL uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0
2227_NEON2SSESTORAGE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0
2228_NEON2SSE_GLOBAL poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0
2229_NEON2SSE_GLOBAL poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0
2230//Transposition operations
2231//Transpose elements
2232_NEON2SSESTORAGE int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0
2233_NEON2SSESTORAGE int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0
2234_NEON2SSESTORAGE int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0
2235_NEON2SSE_GLOBAL uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0
2236_NEON2SSE_GLOBAL uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0
2237_NEON2SSE_GLOBAL uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0
2238_NEON2SSESTORAGE float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0
2239_NEON2SSE_GLOBAL poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0
2240_NEON2SSE_GLOBAL poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0
2241_NEON2SSESTORAGE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0
2242_NEON2SSESTORAGE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0
2243_NEON2SSESTORAGE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0
2244_NEON2SSE_GLOBAL uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0
2245_NEON2SSE_GLOBAL uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0
2246_NEON2SSE_GLOBAL uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0
2247_NEON2SSESTORAGE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0
2248_NEON2SSE_GLOBAL poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0
2249_NEON2SSE_GLOBAL poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0
2250//Interleave elements
2251_NEON2SSESTORAGE int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0
2252_NEON2SSESTORAGE int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0
2253_NEON2SSE_GLOBAL int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0
2254_NEON2SSE_GLOBAL uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0
2255_NEON2SSE_GLOBAL uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0
2256_NEON2SSE_GLOBAL uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0
2257_NEON2SSE_GLOBAL float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0
2258_NEON2SSE_GLOBAL poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0
2259_NEON2SSE_GLOBAL poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0
2260_NEON2SSESTORAGE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0
2261_NEON2SSESTORAGE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0
2262_NEON2SSESTORAGE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0
2263_NEON2SSE_GLOBAL uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0
2264_NEON2SSE_GLOBAL uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0
2265_NEON2SSE_GLOBAL uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0
2266_NEON2SSESTORAGE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0
2267_NEON2SSE_GLOBAL poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0
2268_NEON2SSE_GLOBAL poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0
2269//De-Interleave elements
2270_NEON2SSESTORAGE int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0
2271_NEON2SSESTORAGE int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0
2272_NEON2SSESTORAGE int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0
2273_NEON2SSE_GLOBAL uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0
2274_NEON2SSE_GLOBAL uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0
2275_NEON2SSE_GLOBAL uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0
2276_NEON2SSE_GLOBAL float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0
2277_NEON2SSE_GLOBAL poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0
2278_NEON2SSE_GLOBAL poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0
2279_NEON2SSESTORAGE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0
2280_NEON2SSESTORAGE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0
2281_NEON2SSESTORAGE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0
2282_NEON2SSE_GLOBAL uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0
2283_NEON2SSE_GLOBAL uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0
2284_NEON2SSE_GLOBAL uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0
2285_NEON2SSESTORAGE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0
2286_NEON2SSE_GLOBAL poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0
2287_NEON2SSE_GLOBAL poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
2288
_NEON2SSESTORAGE float32x4_t vrndnq_f32(float32x4_t a); // VRINTN.F32 q0,q0
_NEON2SSESTORAGE float64x2_t vrndnq_f64(float64x2_t a); // VRINTN.F64 q0,q0
2291
2292//Sqrt
2293_NEON2SSE_GLOBAL float32x4_t vsqrtq_f32(float32x4_t a); // VSQRT.F32 q0,q0
2294_NEON2SSE_GLOBAL float64x2_t vsqrtq_f64(float64x2_t a); // VSQRT.F64 q0,q0
2295
2296
2297//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
// the following macros solve the problem of the "immediate parameters requirement" for some x86 intrinsics.
// they are needed to make the code compile - otherwise the "Intrinsic parameter must be an immediate value" error is reported
2300//
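// A minimal sketch of the problem (hypothetical helper, not part of this header), assuming a compiler that enforces the immediate requirement:
/*
static int16x8_t set_lane_runtime(int16x8_t v, int16_t x, int lane) //'lane' is a runtime variable, not a literal
{
    //return _mm_insert_epi16(v, x, lane);  //would fail: "Intrinsic parameter must be an immediate value"
    return _MM_INSERT_EPI16(v, x, lane);    //the switch-based wrapper defined below accepts a runtime lane
}
*/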
2301#if ( defined (__INTEL_COMPILER) && !defined(__llvm__) )
2302# define _MM_ALIGNR_EPI8 _mm_alignr_epi8
2303# define _MM_EXTRACT_EPI16 (int16_t) _mm_extract_epi16
2304# define _MM_INSERT_EPI16 _mm_insert_epi16
2305# ifdef USE_SSE4
2306# define _MM_EXTRACT_EPI8 _mm_extract_epi8
2307# define _MM_EXTRACT_EPI32 _mm_extract_epi32
2308# define _MM_EXTRACT_PS _mm_extract_ps
2309# define _MM_INSERT_EPI8 _mm_insert_epi8
2310# define _MM_INSERT_EPI32 _mm_insert_epi32
2311# define _MM_INSERT_PS _mm_insert_ps
2312# ifdef _NEON2SSE_64BIT
2313# define _MM_INSERT_EPI64 _mm_insert_epi64
2314# define _MM_EXTRACT_EPI64 _mm_extract_epi64
2315# endif
2316# endif //SSE4
2317#else
2318# define _NEON2SSE_COMMA ,
2319# define _NEON2SSE_SWITCH16(NAME, a, b, LANE) \
2320 switch(LANE) \
2321 { \
2322 case 0: return NAME(a b, 0); \
2323 case 1: return NAME(a b, 1); \
2324 case 2: return NAME(a b, 2); \
2325 case 3: return NAME(a b, 3); \
2326 case 4: return NAME(a b, 4); \
2327 case 5: return NAME(a b, 5); \
2328 case 6: return NAME(a b, 6); \
2329 case 7: return NAME(a b, 7); \
2330 case 8: return NAME(a b, 8); \
2331 case 9: return NAME(a b, 9); \
2332 case 10: return NAME(a b, 10); \
2333 case 11: return NAME(a b, 11); \
2334 case 12: return NAME(a b, 12); \
2335 case 13: return NAME(a b, 13); \
2336 case 14: return NAME(a b, 14); \
2337 case 15: return NAME(a b, 15); \
2338 default: return NAME(a b, 0); \
2339 }
2340
2341# define _NEON2SSE_SWITCH8(NAME, vec, LANE, p) \
2342 switch(LANE) \
2343 { \
2344 case 0: return NAME(vec p,0); \
2345 case 1: return NAME(vec p,1); \
2346 case 2: return NAME(vec p,2); \
2347 case 3: return NAME(vec p,3); \
2348 case 4: return NAME(vec p,4); \
2349 case 5: return NAME(vec p,5); \
2350 case 6: return NAME(vec p,6); \
2351 case 7: return NAME(vec p,7); \
2352 default: return NAME(vec p,0); \
2353 }
2354
2355# define _NEON2SSE_SWITCH4(NAME, case0, case1, case2, case3, vec, LANE, p) \
2356 switch(LANE) \
2357 { \
2358 case case0: return NAME(vec p,case0); \
2359 case case1: return NAME(vec p,case1); \
2360 case case2: return NAME(vec p,case2); \
2361 case case3: return NAME(vec p,case3); \
2362 default: return NAME(vec p,case0); \
2363 }
2364
2365 _NEON2SSE_INLINE __m128i _MM_ALIGNR_EPI8(__m128i a, __m128i b, int LANE)
2366 {
2367 _NEON2SSE_SWITCH16(_mm_alignr_epi8, a, _NEON2SSE_COMMA b, LANE)
2368 }
2369
2370 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI16(__m128i vec, int p, const int LANE)
2371 {
2372 _NEON2SSE_SWITCH8(_mm_insert_epi16, vec, LANE, _NEON2SSE_COMMA p)
2373 }
2374
2375 _NEON2SSE_INLINE int16_t _MM_EXTRACT_EPI16(__m128i vec, const int LANE)
2376 {
2377 _NEON2SSE_SWITCH8((int16_t)_mm_extract_epi16, vec, LANE,)
2378 }
2379
2380#ifdef USE_SSE4
2381 _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE)
2382 {
2383 _NEON2SSE_SWITCH4(_mm_extract_epi32, 0,1,2,3, vec, LANE,)
2384 }
2385
2386 _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE)
2387 {
2388 _NEON2SSE_SWITCH4(_mm_extract_ps, 0,1,2,3, vec, LANE,)
2389 }
2390
2391 _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE)
2392 {
2393 _NEON2SSE_SWITCH16(_mm_extract_epi8, vec, , LANE)
2394 }
2395
2396 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI32(__m128i vec, int p, const int LANE)
2397 {
2398 _NEON2SSE_SWITCH4(_mm_insert_epi32, 0, 1, 2, 3, vec, LANE, _NEON2SSE_COMMA p)
2399 }
2400
2401 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI8(__m128i vec, int p, const int LANE)
2402 {
2403 _NEON2SSE_SWITCH16(_mm_insert_epi8, vec, _NEON2SSE_COMMA p, LANE)
2404 }
2405
2406#ifdef _NEON2SSE_64BIT
2407 //the special case of functions available only for SSE4 and 64-bit build.
2408 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI64(__m128i vec, int64_t p, const int LANE)
2409 {
2410 switch(LANE) {
2411 case 0:
2412 return _mm_insert_epi64(vec, p, 0);
2413 case 1:
2414 return _mm_insert_epi64(vec, p, 1);
2415 default:
2416 return _mm_insert_epi64(vec, p, 0);
2417 }
2418 }
2419
2420 _NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64(__m128i val, const int LANE)
2421 {
2422 if (LANE ==0) return _mm_extract_epi64(val, 0);
2423 else return _mm_extract_epi64(val, 1);
2424 }
2425#endif
2426
2427 _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE)
2428 {
2429 _NEON2SSE_SWITCH4(_mm_insert_ps, 0, 16, 32, 48, vec, LANE, _NEON2SSE_COMMA p)
2430 }
2431
2432#endif //USE_SSE4
2433
#endif //defined (__INTEL_COMPILER) && !defined(__llvm__)
2435
2436//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// Below are some helper functions used either to "emulate" SSE4 intrinsics on SSSE3-limited devices
// or to implement some specific, commonly used operations missing in SSE
2439#ifdef USE_SSE4
2440# define _MM_CVTEPU8_EPI16 _mm_cvtepu8_epi16
2441# define _MM_CVTEPU16_EPI32 _mm_cvtepu16_epi32
2442# define _MM_CVTEPU32_EPI64 _mm_cvtepu32_epi64
2443
2444# define _MM_CVTEPI8_EPI16 _mm_cvtepi8_epi16
2445# define _MM_CVTEPI16_EPI32 _mm_cvtepi16_epi32
2446# define _MM_CVTEPI32_EPI64 _mm_cvtepi32_epi64
2447
2448# define _MM_MAX_EPI8 _mm_max_epi8
2449# define _MM_MAX_EPI32 _mm_max_epi32
2450# define _MM_MAX_EPU16 _mm_max_epu16
2451# define _MM_MAX_EPU32 _mm_max_epu32
2452
2453# define _MM_MIN_EPI8 _mm_min_epi8
2454# define _MM_MIN_EPI32 _mm_min_epi32
2455# define _MM_MIN_EPU16 _mm_min_epu16
2456# define _MM_MIN_EPU32 _mm_min_epu32
2457
2458# define _MM_BLENDV_EPI8 _mm_blendv_epi8
2459# define _MM_PACKUS_EPI32 _mm_packus_epi32
2460# define _MM_PACKUS1_EPI32(a) _mm_packus_epi32(a, a)
2461
2462# define _MM_MULLO_EPI32 _mm_mullo_epi32
2463# define _MM_MUL_EPI32 _mm_mul_epi32
2464
2465# define _MM_CMPEQ_EPI64 _mm_cmpeq_epi64
2466#else //no SSE4 !!!!!!
2467 _NEON2SSE_INLINE __m128i _MM_CVTEPU8_EPI16(__m128i a)
2468 {
2469 __m128i zero = _mm_setzero_si128();
2470 return _mm_unpacklo_epi8(a, zero);
2471 }
2472
2473 _NEON2SSE_INLINE __m128i _MM_CVTEPU16_EPI32(__m128i a)
2474 {
2475 __m128i zero = _mm_setzero_si128();
2476 return _mm_unpacklo_epi16(a, zero);
2477 }
2478
2479 _NEON2SSE_INLINE __m128i _MM_CVTEPU32_EPI64(__m128i a)
2480 {
2481 __m128i zero = _mm_setzero_si128();
2482 return _mm_unpacklo_epi32(a, zero);
2483 }
2484
2485 _NEON2SSE_INLINE __m128i _MM_CVTEPI8_EPI16(__m128i a)
2486 {
2487 __m128i zero = _mm_setzero_si128();
2488 __m128i sign = _mm_cmpgt_epi8(zero, a);
2489 return _mm_unpacklo_epi8(a, sign);
2490 }
2491
2492 _NEON2SSE_INLINE __m128i _MM_CVTEPI16_EPI32(__m128i a)
2493 {
2494 __m128i zero = _mm_setzero_si128();
2495 __m128i sign = _mm_cmpgt_epi16(zero, a);
2496 return _mm_unpacklo_epi16(a, sign);
2497 }
2498
2499 _NEON2SSE_INLINE __m128i _MM_CVTEPI32_EPI64(__m128i a)
2500 {
2501 __m128i zero = _mm_setzero_si128();
2502 __m128i sign = _mm_cmpgt_epi32(zero, a);
2503 return _mm_unpacklo_epi32(a, sign);
2504 }
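    // A small self-check sketch of the sign-extension emulation above (hypothetical test code, illustrative only):
    /*
        __m128i v   = _mm_set1_epi8((char)-7);            //every byte is 0xF9
        __m128i ext = _MM_CVTEPI8_EPI16(v);                //SSSE3 path: unpacklo of a with its 0xFF sign mask
        int16_t lo  = (int16_t)_mm_extract_epi16(ext, 0);  //expected to be -7, matching _mm_cvtepi8_epi16 on SSE4.1
    */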
2505
2506 _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE)
2507 {
2508 _NEON2SSE_ALIGN_16 int32_t tmp[4];
2509 _mm_store_si128((__m128i*)tmp, vec);
2510 return tmp[LANE];
2511 }
2512
2513 _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE)
2514 {
2515 _NEON2SSE_ALIGN_16 int8_t tmp[16];
2516 _mm_store_si128((__m128i*)tmp, vec);
2517 return (int)tmp[LANE];
2518 }
2519
2520 _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE)
2521 {
2522 _NEON2SSE_ALIGN_16 int32_t tmp[4];
2523 _mm_store_si128((__m128i*)tmp, _M128i(vec));
2524 return tmp[LANE];
2525 }
2526
2527 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI32(__m128i vec, int p, const int LANE)
2528 {
2529 _NEON2SSE_ALIGN_16 int32_t pvec[4] = {0,0,0,0};
2530 _NEON2SSE_ALIGN_16 uint32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff};
2531 __m128i vec_masked, p_masked;
2532 pvec[LANE] = p;
2533 mask[LANE] = 0x0;
2534 vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
2535 p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
2536 return _mm_or_si128(vec_masked, p_masked);
2537 }
2538
2539 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI8(__m128i vec, int p, const int LANE)
2540 {
2541 _NEON2SSE_ALIGN_16 int8_t pvec[16] = {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0};
2542 _NEON2SSE_ALIGN_16 uint8_t mask[16] = {0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};
2543 __m128i vec_masked, p_masked;
2544 pvec[LANE] = (int8_t)p;
2545 mask[LANE] = 0x0;
2546 vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
2547 p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
2548 return _mm_or_si128(vec_masked, p_masked);
2549 }
2550
2551 _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE)
2552 {
2553 _NEON2SSE_ALIGN_16 uint32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff};
2554 __m128 tmp, vec_masked, p_masked;
        mask[LANE >> 4] = 0x0; //here LANE is not the actual lane number but the _mm_insert_ps immediate (actual lane << 4), need to deal with it
2556 vec_masked = _mm_and_ps (*(__m128*)mask,vec); //ready for p
2557 p_masked = _mm_andnot_ps (*(__m128*)mask, p); //ready for vec
2558 tmp = _mm_or_ps(vec_masked, p_masked);
2559 return tmp;
2560 }
2561
2562 _NEON2SSE_INLINE __m128i _MM_MAX_EPI8(__m128i a, __m128i b)
2563 {
2564 __m128i cmp, resa, resb;
2565 cmp = _mm_cmpgt_epi8 (a, b);
2566 resa = _mm_and_si128 (cmp, a);
2567 resb = _mm_andnot_si128 (cmp,b);
2568 return _mm_or_si128(resa, resb);
2569 }
2570
2571 _NEON2SSE_INLINE __m128i _MM_MAX_EPI32(__m128i a, __m128i b)
2572 {
2573 __m128i cmp, resa, resb;
2574 cmp = _mm_cmpgt_epi32(a, b);
2575 resa = _mm_and_si128 (cmp, a);
2576 resb = _mm_andnot_si128 (cmp,b);
2577 return _mm_or_si128(resa, resb);
2578 }
2579
2580 _NEON2SSE_INLINE __m128i _MM_MAX_EPU16(__m128i a, __m128i b)
2581 {
2582 __m128i c8000, b_s, a_s, cmp;
2583 c8000 = _mm_cmpeq_epi16 (a,a); //0xffff
2584 c8000 = _mm_slli_epi16 (c8000, 15); //0x8000
2585 b_s = _mm_sub_epi16 (b, c8000);
2586 a_s = _mm_sub_epi16 (a, c8000);
2587 cmp = _mm_cmpgt_epi16 (a_s, b_s); //no unsigned comparison, need to go to signed
2588 a_s = _mm_and_si128 (cmp,a);
2589 b_s = _mm_andnot_si128 (cmp,b);
2590 return _mm_or_si128(a_s, b_s);
2591 }
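    // The 0x8000 bias above maps unsigned values onto the signed range so that _mm_cmpgt_epi16 orders them correctly.
    /* Worked example (illustrative): a = 0xFFFF, b = 0x0001:
        a - 0x8000 = 0x7FFF (+32767 signed), b - 0x8000 = 0x8001 (-32767 signed),
        so the signed compare still selects a as the larger value, exactly as an unsigned compare would. */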
2592
2593 _NEON2SSE_INLINE __m128i _MM_MAX_EPU32(__m128i a, __m128i b)
2594 {
2595 __m128i c80000000, b_s, a_s, cmp;
2596 c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff
2597 c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000
2598 b_s = _mm_sub_epi32 (b, c80000000);
2599 a_s = _mm_sub_epi32 (a, c80000000);
2600 cmp = _mm_cmpgt_epi32 (a_s, b_s); //no unsigned comparison, need to go to signed
2601 a_s = _mm_and_si128 (cmp,a);
2602 b_s = _mm_andnot_si128 (cmp,b);
2603 return _mm_or_si128(a_s, b_s);
2604 }
2605
2606 _NEON2SSE_INLINE __m128i _MM_MIN_EPI8(__m128i a, __m128i b)
2607 {
2608 __m128i cmp, resa, resb;
2609 cmp = _mm_cmpgt_epi8 (b, a);
2610 resa = _mm_and_si128 (cmp, a);
2611 resb = _mm_andnot_si128 (cmp,b);
2612 return _mm_or_si128(resa, resb);
2613 }
2614
2615 _NEON2SSE_INLINE __m128i _MM_MIN_EPI32(__m128i a, __m128i b)
2616 {
2617 __m128i cmp, resa, resb;
2618 cmp = _mm_cmpgt_epi32(b, a);
2619 resa = _mm_and_si128 (cmp, a);
2620 resb = _mm_andnot_si128 (cmp,b);
2621 return _mm_or_si128(resa, resb);
2622 }
2623
2624 _NEON2SSE_INLINE __m128i _MM_MIN_EPU16(__m128i a, __m128i b)
2625 {
2626 __m128i c8000, b_s, a_s, cmp;
2627 c8000 = _mm_cmpeq_epi16 (a,a); //0xffff
2628 c8000 = _mm_slli_epi16 (c8000, 15); //0x8000
2629 b_s = _mm_sub_epi16 (b, c8000);
2630 a_s = _mm_sub_epi16 (a, c8000);
2631 cmp = _mm_cmpgt_epi16 (b_s, a_s); //no unsigned comparison, need to go to signed
2632 a_s = _mm_and_si128 (cmp,a);
2633 b_s = _mm_andnot_si128 (cmp,b);
2634 return _mm_or_si128(a_s, b_s);
2635 }
2636
2637 _NEON2SSE_INLINE __m128i _MM_MIN_EPU32(__m128i a, __m128i b)
2638 {
2639 __m128i c80000000, b_s, a_s, cmp;
2640 c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff
2641 c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000
2642 b_s = _mm_sub_epi32 (b, c80000000);
2643 a_s = _mm_sub_epi32 (a, c80000000);
2644 cmp = _mm_cmpgt_epi32 (b_s, a_s); //no unsigned comparison, need to go to signed
2645 a_s = _mm_and_si128 (cmp,a);
2646 b_s = _mm_andnot_si128 (cmp,b);
2647 return _mm_or_si128(a_s, b_s);
2648 }
2649
    _NEON2SSE_INLINE __m128i _MM_BLENDV_EPI8(__m128i a, __m128i b, __m128i mask) //this is NOT an exact implementation of _mm_blendv_epi8 !!!!! - please see below
    {
        //it assumes each mask byte is always either 0xff or 0 (as in all use cases below), while for the original _mm_blendv_epi8 only the MSB of each mask byte matters.
2653 __m128i a_masked, b_masked;
2654 b_masked = _mm_and_si128 (mask,b); //use b if mask 0xff
2655 a_masked = _mm_andnot_si128 (mask,a);
2656 return _mm_or_si128(a_masked, b_masked);
2657 }
2658
2659 _NEON2SSE_INLINE __m128i _MM_PACKUS_EPI32(__m128i a, __m128i b)
2660 {
2661 __m128i a16, b16, res, reshi,cmp, zero;
2662 zero = _mm_setzero_si128();
2663 a16 = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd);
2664 b16 = _mm_shuffle_epi8 (b, *(__m128i*) mask8_32_even_odd);
2665 res = _mm_unpacklo_epi64(a16, b16); //result without saturation
2666 reshi = _mm_unpackhi_epi64(a16, b16); //hi part of result used for saturation
2667 cmp = _mm_cmpgt_epi16(zero, reshi); //if cmp<0 the result should be zero
2668 res = _mm_andnot_si128(cmp,res); //if cmp zero - do nothing, otherwise cmp <0 and the result is 0
2669 cmp = _mm_cmpgt_epi16(reshi,zero); //if cmp positive
        return _mm_or_si128(res, cmp); //if cmp is positive we are out of 16 bits and need to saturate to 0xffff
2671 }
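    // The shuffle above splits each 32-bit lane into its low and high 16-bit halves; the high halves drive the saturation.
    /* Worked example (illustrative):
        lane = 70000 (0x00011170): high half 0x0001 > 0 -> result forced to 0xFFFF (saturate up)
        lane =    -5 (0xFFFFFFFB): high half 0xFFFF < 0 -> result forced to 0x0000 (saturate down)
        lane =  1234 (0x000004D2): high half 0x0000     -> low half 0x04D2 kept unchanged,
       matching what _mm_packus_epi32 produces on SSE4.1. */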
2672
2673 _NEON2SSE_INLINE __m128i _MM_PACKUS1_EPI32(__m128i a)
2674 {
2675 __m128i a16, res, reshi,cmp, zero;
2676 zero = _mm_setzero_si128();
2677 a16 = _mm_shuffle_epi8 (a, *(__m128i*)mask8_32_even_odd);
2678 reshi = _mm_unpackhi_epi64(a16, a16); //hi part of result used for saturation
2679 cmp = _mm_cmpgt_epi16(zero, reshi); //if cmp<0 the result should be zero
2680 res = _mm_andnot_si128(cmp, a16); //if cmp zero - do nothing, otherwise cmp <0 and the result is 0
2681 cmp = _mm_cmpgt_epi16(reshi,zero); //if cmp positive
        return _mm_or_si128(res, cmp); //if cmp is positive we are out of 16 bits and need to saturate to 0xffff
2683 }
2684
2685 // method used by GCC with generic vector extensions
2686 _NEON2SSE_INLINE __m128i _MM_MULLO_EPI32(__m128i a, __m128i b)
2687 {
2688 __m128i a_high = _mm_srli_epi64(a, 32);
2689 __m128i low = _mm_mul_epu32(a, b);
2690 __m128i b_high = _mm_srli_epi64(b, 32);
2691 __m128i high = _mm_mul_epu32(a_high, b_high);
2692 low = _mm_shuffle_epi32(low, _MM_SHUFFLE(0, 0, 2, 0));
2693 high = _mm_shuffle_epi32(high, _MM_SHUFFLE(0, 0, 2, 0));
2694 return _mm_unpacklo_epi32(low, high);
2695 }
2696
2697 _NEON2SSE_INLINE __m128i _MM_MUL_EPI32(__m128i a, __m128i b)
2698 {
2699 __m128i sign, zero, mul_us, a_neg, b_neg, mul_us_neg;
2700 sign = _mm_xor_si128 (a, b);
        sign = _mm_srai_epi32 (sign, 31); //spread the sign bit to all bits: all ones if the product is negative, all zeros otherwise
        sign = _mm_shuffle_epi32(sign, _MM_SHUFFLE(2, 2, 0, 0)); //copy the sign to the 1st and 3rd data lanes
        zero = _mm_setzero_si128();
        a_neg = _mm_abs_epi32 (a); //|a|
        b_neg = _mm_abs_epi32 (b); //|b|
        mul_us = _mm_mul_epu32 (a_neg, b_neg); //uses the 0th and 2nd data lanes; the unsigned multiplication of the absolute values gives a 64 bit result
2707 mul_us_neg = _mm_sub_epi64(zero, mul_us);
2708 mul_us_neg = _mm_and_si128(sign, mul_us_neg);
2709 mul_us = _mm_andnot_si128(sign, mul_us);
2710 return _mm_or_si128 (mul_us, mul_us_neg);
2711 }
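    // The emulation above multiplies absolute values and restores the sign afterwards.
    /* Worked example (illustrative): a lane pair with a = -3, b = 5:
        sign(a) ^ sign(b) is negative, so 'sign' becomes all ones for that 64-bit result;
        |a| * |b| = 15 is computed unsigned, then negated (0 - 15) and selected through the sign mask,
        giving -15, the same 64-bit product _mm_mul_epi32 returns on SSE4.1. */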
2712
2713 _NEON2SSE_INLINE __m128i _MM_CMPEQ_EPI64(__m128i a, __m128i b)
2714 {
2715 __m128i res;
2716 res = _mm_cmpeq_epi32 (a, b);
2717 return _mm_shuffle_epi32 (res, 1 | (1 << 2) | (3 << 4) | (3 << 6)); //copy the information from hi to low part of the 64 bit data
2718 }
2719#endif //SSE4
2720
//fallback implementations used when the native 64-bit SSE4 insert/extract intrinsics are not available (32-bit builds or no SSE4)
2722_NEON2SSE_INLINE __m128i _MM_INSERT_EPI64_32(__m128i vec, int64_t p, const int LANE)
2723{
2724 _NEON2SSE_ALIGN_16 uint64_t pvec[2] = {0,0};
2725 _NEON2SSE_ALIGN_16 uint64_t mask[2] = {0xffffffffffffffff, 0xffffffffffffffff};
2726 __m128i vec_masked, p_masked;
2727 pvec[LANE] = p;
2728 mask[LANE] = 0x0;
2729 vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
2730 p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
2731 return _mm_or_si128(vec_masked, p_masked);
2732}
2733
2734_NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64_32(__m128i val, const int LANE)
2735{
2736 _NEON2SSE_ALIGN_16 int64_t tmp[2];
2737 _mm_store_si128((__m128i*)tmp, val);
2738 return tmp[LANE];
2739}
2740
2741#ifndef _NEON2SSE_64BIT_SSE4
2742# define _MM_INSERT_EPI64 _MM_INSERT_EPI64_32
2743# define _MM_EXTRACT_EPI64 _MM_EXTRACT_EPI64_32
2744#endif
2745
_NEON2SSESTORAGE int32x4_t vqd_s32(int32x4_t a); //Saturating doubling for signed ints (helper used below)
2747_NEON2SSE_INLINE int32x4_t vqd_s32(int32x4_t a)
2748{
    //Overflow happens only if a and the doubled result (2*a) have opposite signs
2750 __m128i c7fffffff, res, res_sat, res_xor_a;
2751 c7fffffff = _mm_set1_epi32(0x7fffffff);
2752 res = _mm_slli_epi32 (a, 1); // res = a*2
    res_sat = _mm_srli_epi32(a, 31); //1 for negative a, 0 otherwise
    res_sat = _mm_add_epi32(res_sat, c7fffffff); //saturation value: 0x80000000 for negative a, 0x7fffffff otherwise
    res_xor_a = _mm_xor_si128(res, a);
    res_xor_a = _mm_srai_epi32(res_xor_a,31); //spread the sign bit of res^a: all ones if the signs of a and 2*a differ (overflow), all zeros otherwise
2757 res_sat = _mm_and_si128(res_xor_a, res_sat);
2758 res = _mm_andnot_si128(res_xor_a, res);
2759 return _mm_or_si128(res, res_sat);
2760}
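// A worked example of the saturation above (illustrative):
/*  a = 0x60000000 (positive): 2*a = 0xC0000000 flips the sign -> overflow detected, result saturates to 0x7fffffff;
    a = 0xA0000000 (negative): 2*a = 0x40000000 flips the sign -> result saturates to 0x80000000;
    a = 0x00001234: 2*a = 0x00002468 keeps the sign -> returned unchanged as the doubled value. */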
2761
2762
2763//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
2764//*************************************************************************
2765//*************************************************************************
//***************** Functions redefinition/implementation starts here *****
2767//*************************************************************************
2768//*************************************************************************
2769//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
2770
/*If a unified intrinsics solution is necessary, please define your SSE intrinsics wrappers here as in the following sample:
2772#ifdef ARM
2773#define vector_addq_s32 _mm_add_epi32
2774#else //if we have IA
2775#define vector_addq_s32 vadd_s32
2776#endif
2777
2778********************************************************************************************
2779Functions below are organised in the following way:
2780
Each NEON intrinsic function is implemented in one of the following ways:
1. a fully equivalent x86 SSE intrinsic - in this case the x86 version simply follows the NEON one via the corresponding #define statement
2. an x86 implementation using more than one x86 intrinsic. In this case it is shaped as an inlined C function with a return statement
3. a reference to another NEON function returning the same result and implemented in x86 as above. In this case it is shaped as the matching NEON function definition
4. for about 5% of the functions, a serial implementation is provided (along with the corresponding compiler warning) because the matching x86 SIMD operation
is either unavailable or inefficient in terms of performance. If such functions are on your application's critical path,
please consider removing them from your code. A minimal usage sketch follows this comment block.
2788*/
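//A minimal usage sketch (hypothetical helper, illustrative only) showing how ported NEON code maps onto the definitions below:
/*
static void add4_f32(const float* a, const float* b, float* out)
{
    float32x4_t va = vld1q_f32(a);        //loads 4 floats (an unaligned SSE load on x86)
    float32x4_t vb = vld1q_f32(b);
    vst1q_f32(out, vaddq_f32(va, vb));    //vaddq_f32 is a direct #define to _mm_add_ps (option 1 above)
}
*/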
2789
2790//***********************************************************************
2791//************************ Vector add *****************************
2792//***********************************************************************
2793_NEON2SSESTORAGE int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0
2794_NEON2SSE_INLINE int8x8_t vadd_s8(int8x8_t a, int8x8_t b)
2795{
2796 int8x8_t res64;
2797 return64(_mm_add_epi8(_pM128i(a),_pM128i(b)));
2798}
2799
2800
2801_NEON2SSESTORAGE int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0
2802_NEON2SSE_INLINE int16x4_t vadd_s16(int16x4_t a, int16x4_t b)
2803{
2804 int16x4_t res64;
2805 return64(_mm_add_epi16(_pM128i(a),_pM128i(b)));
2806}
2807
2808
2809_NEON2SSESTORAGE int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0
2810_NEON2SSE_INLINE int32x2_t vadd_s32(int32x2_t a, int32x2_t b)
2811{
2812 int32x2_t res64;
2813 return64(_mm_add_epi32(_pM128i(a),_pM128i(b)));
2814}
2815
2816
2817_NEON2SSESTORAGE int64x1_t vadd_s64(int64x1_t a, int64x1_t b); // VADD.I64 d0,d0,d0
2818_NEON2SSE_INLINE int64x1_t vadd_s64(int64x1_t a, int64x1_t b)
2819{
2820 int64x1_t res64;
2821 res64.m64_i64[0] = a.m64_i64[0] + b.m64_i64[0];
2822 return res64;
2823}
2824
2825
2826_NEON2SSESTORAGE float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0
2827_NEON2SSE_INLINE float32x2_t vadd_f32(float32x2_t a, float32x2_t b)
2828{
2829 __m128 res;
2830 __m64_128 res64;
2831 res = _mm_add_ps(_pM128(a),_pM128(b)); //SSE, use only low 64 bits
2832 _M64f(res64, res);
2833 return res64;
2834}
2835
2836_NEON2SSE_GLOBAL uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0
2837#define vadd_u8 vadd_s8
2838
2839_NEON2SSE_GLOBAL uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0
2840#define vadd_u16 vadd_s16
2841
2842_NEON2SSE_GLOBAL uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0
2843#define vadd_u32 vadd_s32
2844
2845_NEON2SSESTORAGE uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b); // VADD.I64 d0,d0,d0
2846_NEON2SSE_INLINE uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b)
2847{
2848 uint64x1_t res64;
2849 res64.m64_u64[0] = a.m64_u64[0] + b.m64_u64[0];
2850 return res64;
2851}
2852
2853
2854_NEON2SSE_GLOBAL int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0
2855#define vaddq_s8 _mm_add_epi8
2856
2857_NEON2SSE_GLOBAL int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0
2858#define vaddq_s16 _mm_add_epi16
2859
2860_NEON2SSE_GLOBAL int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0
2861#define vaddq_s32 _mm_add_epi32
2862
2863_NEON2SSE_GLOBAL int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0
2864#define vaddq_s64 _mm_add_epi64
2865
2866_NEON2SSE_GLOBAL float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0
2867#define vaddq_f32 _mm_add_ps
2868
2869_NEON2SSE_GLOBAL uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0
2870#define vaddq_u8 _mm_add_epi8
2871
2872_NEON2SSE_GLOBAL uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0
2873#define vaddq_u16 _mm_add_epi16
2874
2875_NEON2SSE_GLOBAL uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0
2876#define vaddq_u32 _mm_add_epi32
2877
2878_NEON2SSE_GLOBAL uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0
2879#define vaddq_u64 _mm_add_epi64
2880
2881//**************************** Vector long add *****************************:
2882//***********************************************************************
2883//Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
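//Illustrative sketch (hypothetical values): adding two uint8x8_t vectors without overflow:
/*  uint8x8_t  a = vdup_n_u8(200), b = vdup_n_u8(100);
    uint16x8_t s = vaddl_u8(a, b);   //every 16-bit lane holds 300, while vadd_u8 would wrap to 44 */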
2884_NEON2SSESTORAGE int16x8_t vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0
2885_NEON2SSE_INLINE int16x8_t vaddl_s8(int8x8_t a, int8x8_t b) // VADDL.S8 q0,d0,d0
2886{
2887 __m128i a16, b16;
2888 a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
2889 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
2890 return _mm_add_epi16 (a16, b16);
2891}
2892
2893_NEON2SSESTORAGE int32x4_t vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0
2894_NEON2SSE_INLINE int32x4_t vaddl_s16(int16x4_t a, int16x4_t b) // VADDL.S16 q0,d0,d0
2895{
2896 __m128i a32, b32;
2897 a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
2898 b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1
2899 return _mm_add_epi32 (a32, b32);
2900}
2901
2902_NEON2SSESTORAGE int64x2_t vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0
2903_NEON2SSE_INLINE int64x2_t vaddl_s32(int32x2_t a, int32x2_t b) // VADDL.S32 q0,d0,d0
2904{
    //may not be optimal
2906 __m128i a64, b64;
2907 a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1
2908 b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
2909 return _mm_add_epi64 ( a64, b64);
2910}
2911
2912_NEON2SSESTORAGE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0
2913_NEON2SSE_INLINE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b) // VADDL.U8 q0,d0,d0
2914{
2915 __m128i a16, b16;
2916 a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1
2917 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1
2918 return _mm_add_epi16 (a16, b16);
2919}
2920
_NEON2SSESTORAGE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.U16 q0,d0,d0
_NEON2SSE_INLINE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b) // VADDL.U16 q0,d0,d0
2923{
2924 __m128i a32, b32;
2925 a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1
2926 b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1
2927 return _mm_add_epi32 (a32, b32);
2928}
2929
2930_NEON2SSESTORAGE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0
2931_NEON2SSE_INLINE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b) // VADDL.U32 q0,d0,d0
2932{
    //may not be optimal
2934 __m128i a64, b64;
2935 a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1
2936 b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
2937 return _mm_add_epi64 (a64, b64);
2938}
2939
2940//*************** Vector wide add: vaddw_<type>. Vr[i]:=Va[i]+Vb[i] ******************
2941//*************** *********************************************************************
2942_NEON2SSESTORAGE int16x8_t vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0
2943_NEON2SSE_INLINE int16x8_t vaddw_s8(int16x8_t a, int8x8_t b) // VADDW.S8 q0,q0,d0
2944{
2945 __m128i b16;
2946 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
2947 return _mm_add_epi16 (a, b16);
2948}
2949
2950_NEON2SSESTORAGE int32x4_t vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0
2951_NEON2SSE_INLINE int32x4_t vaddw_s16(int32x4_t a, int16x4_t b) // VADDW.S16 q0,q0,d0
2952{
2953 __m128i b32;
2954 b32 = _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1,
2955 return _mm_add_epi32 (a, b32);
2956}
2957
2958_NEON2SSESTORAGE int64x2_t vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0
2959_NEON2SSE_INLINE int64x2_t vaddw_s32(int64x2_t a, int32x2_t b) // VADDW.S32 q0,q0,d0
2960{
2961 __m128i b64;
2962 b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
2963 return _mm_add_epi64 (a, b64);
2964}
2965
2966_NEON2SSESTORAGE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0
2967_NEON2SSE_INLINE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b) // VADDW.U8 q0,q0,d0
2968{
2969 __m128i b16;
2970 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1
2971 return _mm_add_epi16 (a, b16);
2972}
2973
_NEON2SSESTORAGE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.U16 q0,q0,d0
_NEON2SSE_INLINE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b) // VADDW.U16 q0,q0,d0
2976{
2977 __m128i b32;
2978 b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1
2979 return _mm_add_epi32 (a, b32);
2980}
2981
2982_NEON2SSESTORAGE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0
2983_NEON2SSE_INLINE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b) // VADDW.U32 q0,q0,d0
2984{
2985 __m128i b64;
2986 b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
2987 return _mm_add_epi64 (a, b64);
2988}
2989
2990//******************************Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1 , result truncated *******************************
2991//*************************************************************************************************************************
2992_NEON2SSESTORAGE int8x8_t vhadd_s8(int8x8_t a, int8x8_t b); // VHADD.S8 d0,d0,d0
2993_NEON2SSE_INLINE int8x8_t vhadd_s8(int8x8_t a, int8x8_t b)
2994{
2995 int8x8_t res64;
2996 return64(vhaddq_s8(_pM128i(a), _pM128i(b)));
2997}
2998
2999
3000_NEON2SSESTORAGE int16x4_t vhadd_s16(int16x4_t a, int16x4_t b); // VHADD.S16 d0,d0,d0
3001_NEON2SSE_INLINE int16x4_t vhadd_s16(int16x4_t a, int16x4_t b)
3002{
3003 int16x4_t res64;
3004 return64( vhaddq_s16(_pM128i(a), _pM128i(b)));
3005}
3006
3007
3008_NEON2SSESTORAGE int32x2_t vhadd_s32(int32x2_t a, int32x2_t b); // VHADD.S32 d0,d0,d0
3009_NEON2SSE_INLINE int32x2_t vhadd_s32(int32x2_t a, int32x2_t b)
3010{
3011 int32x2_t res64;
3012 return64( vhaddq_s32(_pM128i(a), _pM128i(b)));
3013}
3014
3015
_NEON2SSESTORAGE uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b); // VHADD.U8 d0,d0,d0
3017_NEON2SSE_INLINE uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b)
3018{
3019 uint8x8_t res64;
3020 return64( vhaddq_u8(_pM128i(a), _pM128i(b)));
3021}
3022
3023
_NEON2SSESTORAGE uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b); // VHADD.U16 d0,d0,d0
3025_NEON2SSE_INLINE uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b)
3026{
3027 uint16x4_t res64;
3028 return64( vhaddq_u16(_pM128i(a), _pM128i(b)));
3029}
3030
3031
3032_NEON2SSESTORAGE uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b); // VHADD.U32 d0,d0,d0
3033_NEON2SSE_INLINE uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b)
3034{
3035 uint32x2_t res64;
3036 return64( vhaddq_u32(_pM128i(a), _pM128i(b)));
3037}
3038
3039
3040_NEON2SSESTORAGE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0
3041_NEON2SSE_INLINE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b)
3042{
3043 //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
3044 __m128i tmp1, tmp2;
3045 tmp1 = _mm_and_si128(a,b);
3046 tmp2 = _mm_xor_si128(a,b);
3047 tmp2 = vshrq_n_s8(tmp2,1);
3048 return _mm_add_epi8(tmp1,tmp2);
3049}
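//Rationale for the trick above: a + b == 2*(a & b) + (a ^ b) (the AND collects the carry bits,
//the XOR the carry-less sum), so the halved sum can be computed as (a & b) + ((a ^ b) >> 1)
//without ever forming the 9-bit intermediate; the arithmetic shift keeps the signed truncation
//required by VHADD.S8.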
3050
_NEON2SSESTORAGE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S16 q0,q0,q0
3052_NEON2SSE_INLINE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b)
3053{
3054 //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
3055 __m128i tmp1, tmp2;
3056 tmp1 = _mm_and_si128(a,b);
3057 tmp2 = _mm_xor_si128(a,b);
3058 tmp2 = _mm_srai_epi16(tmp2,1);
3059 return _mm_add_epi16(tmp1,tmp2);
3060}
3061
3062_NEON2SSESTORAGE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0
3063_NEON2SSE_INLINE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b) // VHADD.S32 q0,q0,q0
3064{
3065 //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
3066 __m128i tmp1, tmp2;
3067 tmp1 = _mm_and_si128(a,b);
3068 tmp2 = _mm_xor_si128(a,b);
3069 tmp2 = _mm_srai_epi32(tmp2,1);
3070 return _mm_add_epi32(tmp1,tmp2);
3071}
3072
3073_NEON2SSESTORAGE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0
3074_NEON2SSE_INLINE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b) // VHADD.U8 q0,q0,q0
3075{
3076 __m128i c1, sum, res;
3077 c1 = _mm_set1_epi8(1);
3078 sum = _mm_avg_epu8(a, b); //result is rounded, need to compensate it
3079 res = _mm_xor_si128(a, b); //for rounding compensation
3080 res = _mm_and_si128(res,c1); //for rounding compensation
3081 return _mm_sub_epi8 (sum, res); //actual rounding compensation
3082}
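//Rationale: _mm_avg_epu8 computes (a + b + 1) >> 1 (rounded), while VHADD.U8 needs (a + b) >> 1
//(truncated); the two differ by exactly 1 when a + b is odd, i.e. when (a ^ b) & 1 == 1, and that
//is the correction subtracted above.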
3083
_NEON2SSESTORAGE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.U16 q0,q0,q0
_NEON2SSE_INLINE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b) // VHADD.U16 q0,q0,q0
3086{
3087 __m128i sum, res;
3088 sum = _mm_avg_epu16(a, b); //result is rounded, need to compensate it
3089 res = _mm_xor_si128(a, b); //for rounding compensation
3090 res = _mm_slli_epi16 (res,15); //shift left then back right to
3091 res = _mm_srli_epi16 (res,15); //get 1 or zero
3092 return _mm_sub_epi16 (sum, res); //actual rounding compensation
3093}
3094
3095_NEON2SSESTORAGE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0
3096_NEON2SSE_INLINE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b) // VHADD.U32 q0,q0,q0
3097{
3098 //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
3099 __m128i tmp1, tmp2;
3100 tmp1 = _mm_and_si128(a,b);
3101 tmp2 = _mm_xor_si128(a,b);
3102 tmp2 = _mm_srli_epi32(tmp2,1);
3103 return _mm_add_epi32(tmp1,tmp2);
3104}
3105
3106//************************Vector rounding halving add: vrhadd{q}_<type>. Vr[i]:=(Va[i]+Vb[i]+1)>>1 ***************************
3107//*****************************************************************************************************************************
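//For instance, with uint8 lanes a[i] = 5, b[i] = 2 the rounding halving add gives (5 + 2 + 1) >> 1 = 4,
//while the truncating vhadd above gives (5 + 2) >> 1 = 3.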
3108_NEON2SSESTORAGE int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b); // VRHADD.S8 d0,d0,d0
3109_NEON2SSE_INLINE int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b)
3110{
3111 int8x8_t res64;
3112 return64(vrhaddq_s8(_pM128i(a), _pM128i(b)));
3113}
3114
3115
3116_NEON2SSESTORAGE int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b); // VRHADD.S16 d0,d0,d0
3117_NEON2SSE_INLINE int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b)
3118{
3119 int16x4_t res64;
3120 return64(vrhaddq_s16(_pM128i(a), _pM128i(b)));
3121}
3122
3123
3124_NEON2SSESTORAGE int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b); // VRHADD.S32 d0,d0,d0
3125_NEON2SSE_INLINE int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b)
3126{
3127 int32x2_t res64;
3128 return64(vrhaddq_s32(_pM128i(a), _pM128i(b)));
3129}
3130
3131
3132_NEON2SSESTORAGE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0
3133_NEON2SSE_INLINE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b)
3134{
3135 uint8x8_t res64;
    return64(_mm_avg_epu8(_pM128i(a),_pM128i(b))); //SSE2 _mm_avg_epu8 rounds, matching VRHADD semantics
3137}
3138
3139
_NEON2SSESTORAGE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.U16 d0,d0,d0
3141_NEON2SSE_INLINE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b)
3142{
3143 uint16x4_t res64;
    return64(_mm_avg_epu16(_pM128i(a),_pM128i(b))); //SSE2 _mm_avg_epu16 rounds, matching VRHADD semantics
3145}
3146
3147
3148_NEON2SSESTORAGE uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b); // VRHADD.U32 d0,d0,d0
3149_NEON2SSE_INLINE uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b)
3150{
3151 uint32x2_t res64;
3152 return64(vrhaddq_u32(_pM128i(a), _pM128i(b)));
3153}
3154
3155
3156_NEON2SSESTORAGE int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0
3157_NEON2SSE_INLINE int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b) // VRHADD.S8 q0,q0,q0
3158{
3159 //no signed average in x86 SIMD, go to unsigned
3160 __m128i c128, au, bu, sum;
3161 c128 = _mm_set1_epi8(-128); //(int8_t)0x80
3162 au = _mm_sub_epi8(a, c128); //add 128
3163 bu = _mm_sub_epi8(b, c128); //add 128
3164 sum = _mm_avg_epu8(au, bu);
3165 return _mm_add_epi8 (sum, c128); //sub 128
3166}
3167
3168_NEON2SSESTORAGE int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0
3169_NEON2SSE_INLINE int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b) // VRHADD.S16 q0,q0,q0
3170{
3171 //no signed average in x86 SIMD, go to unsigned
3172 __m128i cx8000, au, bu, sum;
3173 cx8000 = _mm_set1_epi16(-32768); //(int16_t)0x8000
3174 au = _mm_sub_epi16(a, cx8000); //add 32768
3175 bu = _mm_sub_epi16(b, cx8000); //add 32768
3176 sum = _mm_avg_epu16(au, bu);
3177 return _mm_add_epi16 (sum, cx8000); //sub 32768
3178}
3179
3180_NEON2SSESTORAGE int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0
3181_NEON2SSE_INLINE int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b)
3182{
3183 //need to avoid overflow
3184 __m128i a2, b2, res, sum;
3185 a2 = _mm_srai_epi32(a,1); //a2=a/2;
3186 b2 = _mm_srai_epi32(b,1); // b2=b/2;
3187 res = _mm_or_si128(a,b); //for rounding
3188 res = _mm_slli_epi32 (res,31); //shift left then back right to
3189 res = _mm_srli_epi32 (res,31); //get 1 or zero
3190 sum = _mm_add_epi32(a2,b2);
3191 return _mm_add_epi32(sum,res);
3192}
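//Rationale for the trick above: (a + b + 1) >> 1 == (a >> 1) + (b >> 1) + (((a & 1) + (b & 1) + 1) >> 1),
//and the last term is 1 exactly when at least one of the low bits is set, i.e. it equals (a | b) & 1;
//this avoids the 33-bit intermediate sum.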
3193
3194_NEON2SSE_GLOBAL uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0
3195#define vrhaddq_u8 _mm_avg_epu8 //SSE2, results rounded
3196
_NEON2SSE_GLOBAL uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.U16 q0,q0,q0
3198#define vrhaddq_u16 _mm_avg_epu16 //SSE2, results rounded
3199
3200
3201_NEON2SSESTORAGE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0
3202_NEON2SSE_INLINE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b) // VRHADD.U32 q0,q0,q0
3203{
3204 //need to avoid overflow
3205 __m128i a2, b2, res, sum;
3206 a2 = _mm_srli_epi32(a,1); //a2=a/2;
3207 b2 = _mm_srli_epi32(b,1); // b2=b/2;
3208 res = _mm_or_si128(a,b); //for rounding
3209 res = _mm_slli_epi32 (res,31); //shift left then back right to
3210 res = _mm_srli_epi32 (res,31); //get 1 or zero
3211 sum = _mm_add_epi32(a2,b2);
3212 return _mm_add_epi32(sum,res);
3213}
3214
3215//****************** VQADD: Vector saturating add ************************
3216//************************************************************************
3217_NEON2SSESTORAGE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0
3218_NEON2SSE_INLINE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b)
3219{
3220 int8x8_t res64;
3221 return64(_mm_adds_epi8(_pM128i(a),_pM128i(b)));
3222}
3223
3224
3225_NEON2SSESTORAGE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0
3226_NEON2SSE_INLINE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b)
3227{
3228 int16x4_t res64;
3229 return64(_mm_adds_epi16(_pM128i(a),_pM128i(b)));
3230}
3231
3232
3233_NEON2SSESTORAGE int32x2_t vqadd_s32(int32x2_t a, int32x2_t b); // VQADD.S32 d0,d0,d0
3234_NEON2SSE_INLINE int32x2_t vqadd_s32(int32x2_t a, int32x2_t b)
3235{
3236 int32x2_t res64;
3237 return64(vqaddq_s32(_pM128i(a), _pM128i(b)));
3238}
3239
3240
3241_NEON2SSESTORAGE int64x1_t vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0
3242_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqadd_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
3243{
3244 int64x1_t res;
3245 uint64_t a64, b64;
3246 a64 = a.m64_u64[0];
3247 b64 = b.m64_u64[0];
3248 res.m64_u64[0] = a64 + b64;
3249 a64 = (a64 >> 63) + (~_SIGNBIT64);
3250 if ((int64_t)((b64 ^ a64) | ~(res.m64_u64[0] ^ b64))>=0) {
3251 res.m64_u64[0] = a64;
3252 }
3253 return res;
3254}
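//How the scalar saturation above works: a64 is overwritten with the clamp value matching the sign of
//the original a (0x7FFF...FFFF for a >= 0, 0x8000...0000 for a < 0). Signed overflow occurred iff a and b
//had the same sign while the wrapped sum's sign differs from it, which is exactly when
//((b64 ^ a64) | ~(res ^ b64)) has its top bit clear, i.e. is non-negative as int64_t.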
3255
3256_NEON2SSESTORAGE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0
3257_NEON2SSE_INLINE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b)
3258{
3259 uint8x8_t res64;
3260 return64(_mm_adds_epu8(_pM128i(a),_pM128i(b)));
3261}
3262
3263
_NEON2SSESTORAGE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.U16 d0,d0,d0
3265_NEON2SSE_INLINE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b)
3266{
3267 uint16x4_t res64;
3268 return64(_mm_adds_epu16(_pM128i(a),_pM128i(b)));
3269}
3270
3271
3272_NEON2SSESTORAGE uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b); // VQADD.U32 d0,d0,d0
3273_NEON2SSE_INLINE uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b)
3274{
3275 uint32x2_t res64;
3276 return64(vqaddq_u32(_pM128i(a), _pM128i(b)));
3277}
3278
3279
3280_NEON2SSESTORAGE uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0
3281_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
3282{
3283 _NEON2SSE_ALIGN_16 uint64_t a64, b64;
3284 uint64x1_t res;
3285 a64 = a.m64_u64[0];
3286 b64 = b.m64_u64[0];
3287 res.m64_u64[0] = a64 + b64;
3288 if (res.m64_u64[0] < a64) {
3289 res.m64_u64[0] = ~(uint64_t)0;
3290 }
3291 return res;
3292}
3293
3294_NEON2SSE_GLOBAL int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0
3295#define vqaddq_s8 _mm_adds_epi8
3296
3297_NEON2SSE_GLOBAL int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0
3298#define vqaddq_s16 _mm_adds_epi16
3299
3300_NEON2SSESTORAGE int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0
3301_NEON2SSE_INLINE int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b)
3302{
    //no corresponding x86 SIMD solution, special tricks are necessary. Overflow happens only if a and b have the same sign and the sum has the opposite sign
3304 __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a_;
3305 c7fffffff = _mm_set1_epi32(0x7fffffff);
3306 res = _mm_add_epi32(a, b);
3307 res_sat = _mm_srli_epi32(a, 31);
3308 res_sat = _mm_add_epi32(res_sat, c7fffffff);
3309 res_xor_a = _mm_xor_si128(res, a);
3310 b_xor_a_ = _mm_xor_si128(b, a);
3311 res_xor_a = _mm_andnot_si128(b_xor_a_, res_xor_a);
    res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones in overflowed lanes, all zeros otherwise
3313 res_sat = _mm_and_si128(res_xor_a, res_sat);
3314 res = _mm_andnot_si128(res_xor_a, res);
3315 return _mm_or_si128(res, res_sat);
3316}
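//Rationale: signed overflow can only happen when a and b share a sign and the sum's sign flips,
//i.e. when (res ^ a) has the sign bit set while (b ^ a) does not - exactly what the andnot/srai mask
//detects above. res_sat pre-computes the per-lane clamp from a's sign bit (0x7FFFFFFF for a >= 0,
//0x80000000 for a < 0), and the final and/andnot/or picks the clamp in overflowed lanes and the
//wrapped sum elsewhere.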
3317
3318_NEON2SSESTORAGE int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0
3319_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
3320{
3321 _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
3322 _mm_store_si128((__m128i*)atmp, a);
3323 _mm_store_si128((__m128i*)btmp, b);
3324 res[0] = atmp[0] + btmp[0];
3325 res[1] = atmp[1] + btmp[1];
3326
3327 atmp[0] = (atmp[0] >> 63) + (~_SIGNBIT64);
3328 atmp[1] = (atmp[1] >> 63) + (~_SIGNBIT64);
3329
3330 if ((int64_t)((btmp[0] ^ atmp[0]) | ~(res[0] ^ btmp[0]))>=0) {
3331 res[0] = atmp[0];
3332 }
3333 if ((int64_t)((btmp[1] ^ atmp[1]) | ~(res[1] ^ btmp[1]))>=0) {
3334 res[1] = atmp[1];
3335 }
3336 return _mm_load_si128((__m128i*)res);
3337}
3338
3339_NEON2SSE_GLOBAL uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0
3340#define vqaddq_u8 _mm_adds_epu8
3341
_NEON2SSE_GLOBAL uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.U16 q0,q0,q0
3343#define vqaddq_u16 _mm_adds_epu16
3344
3345_NEON2SSESTORAGE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0
3346_NEON2SSE_INLINE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b)
3347{
3348 __m128i c80000000, cmp, subsum, suba, sum;
3349 c80000000 = _mm_set1_epi32 (0x80000000);
3350 sum = _mm_add_epi32 (a, b);
3351 subsum = _mm_sub_epi32 (sum, c80000000);
3352 suba = _mm_sub_epi32 (a, c80000000);
3353 cmp = _mm_cmpgt_epi32 ( suba, subsum); //no unsigned comparison, need to go to signed
3354 return _mm_or_si128 (sum, cmp); //saturation
3355}
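//Rationale: unsigned overflow means sum < a (unsigned). SSE2 only has a signed 32-bit compare, but
//x <u y is equivalent to (x - 0x80000000) <s (y - 0x80000000), hence both operands are biased by
//0x80000000 before _mm_cmpgt_epi32; overflowed lanes become all ones and the OR clamps them to 0xFFFFFFFF.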
3356
3357_NEON2SSESTORAGE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
3358#ifdef USE_SSE4
3359 _NEON2SSE_INLINE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b)
3360 {
3361 __m128i c80000000, sum, cmp, suba, subsum;
3362 c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0);
3363 sum = _mm_add_epi64 (a, b);
3364 subsum = _mm_sub_epi64 (sum, c80000000);
3365 suba = _mm_sub_epi64 (a, c80000000);
3366 cmp = _mm_cmpgt_epi64 ( suba, subsum); //no unsigned comparison, need to go to signed, SSE4.2!!!
3367 return _mm_or_si128 (sum, cmp); //saturation
3368 }
3369#else
3370 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
3371 {
3372 _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
3373 _mm_store_si128((__m128i*)atmp, a);
3374 _mm_store_si128((__m128i*)btmp, b);
3375 res[0] = atmp[0] + btmp[0];
3376 res[1] = atmp[1] + btmp[1];
3377 if (res[0] < atmp[0]) res[0] = ~(uint64_t)0;
3378 if (res[1] < atmp[1]) res[1] = ~(uint64_t)0;
3379 return _mm_load_si128((__m128i*)(res));
3380 }
3381#endif
3382
3383
3384//******************* Vector add high half (truncated) ******************
3385//************************************************************************
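//vaddhn returns the upper half of each (non-widened) sum, e.g. for uint16 lanes a[i] = 0x1234 and
//b[i] = 0x0101 the result lane is (0x1234 + 0x0101) >> 8 = 0x13.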
3386_NEON2SSESTORAGE int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0
3387_NEON2SSE_INLINE int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b) // VADDHN.I16 d0,q0,q0
3388{
3389 int8x8_t res64;
3390 __m128i sum;
3391 sum = _mm_add_epi16 (a, b);
3392 sum = _mm_srai_epi16 (sum, 8);
3393 sum = _mm_packs_epi16 (sum, sum); //use 64 low bits only
3394 return64(sum);
3395}
3396
3397_NEON2SSESTORAGE int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0
3398_NEON2SSE_INLINE int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b) // VADDHN.I32 d0,q0,q0
3399{
3400 int16x4_t res64;
3401 __m128i sum;
3402 sum = _mm_add_epi32 (a, b);
3403 sum = _mm_srai_epi32(sum, 16);
3404 sum = _mm_packs_epi32 (sum, sum); //use 64 low bits only
3405 return64(sum);
3406}
3407
3408_NEON2SSESTORAGE int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0
3409_NEON2SSE_INLINE int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b)
3410{
3411 int32x2_t res64;
3412 __m128i sum;
3413 sum = _mm_add_epi64 (a, b);
3414 sum = _mm_shuffle_epi32(sum, 1 | (3 << 2) | (0 << 4) | (2 << 6));
3415 return64(sum);
3416}
3417
3418_NEON2SSESTORAGE uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0
3419_NEON2SSE_INLINE uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b) // VADDHN.I16 d0,q0,q0
3420{
3421 uint8x8_t res64;
3422 __m128i sum;
3423 sum = _mm_add_epi16 (a, b);
3424 sum = _mm_srli_epi16 (sum, 8);
3425 sum = _mm_packus_epi16 (sum,sum); //use 64 low bits only
3426 return64(sum);
3427}
3428
3429_NEON2SSESTORAGE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0
3430_NEON2SSE_INLINE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b) // VADDHN.I32 d0,q0,q0
3431{
3432 uint16x4_t res64;
3433 __m128i sum;
3434 sum = _mm_add_epi32 (a, b);
3435 sum = _mm_srli_epi32 (sum, 16);
3436#ifdef USE_SSE4
3437 sum = _MM_PACKUS1_EPI32 (sum); //use 64 low bits only
3438#else
3439 sum = _mm_shuffle_epi8 (sum, *(__m128i*) mask8_32_even_odd); //go to 16 bits
3440#endif
3441 return64(sum);
3442}
3443
3444_NEON2SSE_GLOBAL uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0
3445#define vaddhn_u64 vaddhn_s64
3446
3447//*********** Vector rounding add high half: vraddhn_<type> ******************.
3448//***************************************************************************
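//The rounding variant adds half of the discarded range before taking the high half, e.g. for uint16
//lanes the result is (a[i] + b[i] + 0x80) >> 8, so a[i] = 0x12C0, b[i] = 0 rounds up to 0x13 where the
//truncating vaddhn above gives 0x12.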
3449_NEON2SSESTORAGE int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0
3450_NEON2SSE_INLINE int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b) // VRADDHN.I16 d0,q0,q0
3451{
3452 int8x8_t res64;
3453 __m128i sum, mask1;
3454 sum = _mm_add_epi16 (a, b);
3455 mask1 = _mm_slli_epi16(sum, 8); //shift left then back right to
3456 mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero
3457 sum = _mm_srai_epi16 (sum, 8); //get high half
3458 sum = _mm_add_epi16 (sum, mask1); //actual rounding
3459 sum = _mm_packs_epi16 (sum, sum);
3460 return64(sum);
3461}
3462
3463_NEON2SSESTORAGE int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0
3464_NEON2SSE_INLINE int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b) // VRADDHN.I32 d0,q0,q0
3465{
    //this SIMD solution may not be optimal, a serial one may be faster
3467 int16x4_t res64;
3468 __m128i sum, mask1;
3469 sum = _mm_add_epi32 (a, b);
3470 mask1 = _mm_slli_epi32(sum, 16); //shift left then back right to
3471 mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero
3472 sum = _mm_srai_epi32 (sum, 16); //get high half
3473 sum = _mm_add_epi32 (sum, mask1); //actual rounding
3474 sum = _mm_packs_epi32 (sum, sum);
3475 return64(sum);
3476}
3477
3478_NEON2SSESTORAGE int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0
3479_NEON2SSE_INLINE int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b)
3480{
    //this SIMD solution may not be optimal, a serial one may be faster
3482 int32x2_t res64;
3483 __m128i sum, mask1;
3484 sum = _mm_add_epi64 (a, b);
3485 mask1 = _mm_slli_epi64(sum, 32); //shift left then back right to
3486 mask1 = _mm_srli_epi64(mask1,31); //get 31-th bit 1 or zero
3487 sum = _mm_add_epi32 (sum, mask1); //actual high half rounding
3488 sum = _mm_shuffle_epi32(sum, 1 | (3 << 2) | (1 << 4) | (3 << 6));
3489 return64(sum);
3490}
3491
3492_NEON2SSESTORAGE uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0
3493_NEON2SSE_INLINE uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b) // VRADDHN.I16 d0,q0,q0
3494{
3495 uint8x8_t res64;
3496 __m128i sum, mask1;
3497 sum = _mm_add_epi16 (a, b);
3498 mask1 = _mm_slli_epi16(sum, 8); //shift left then back right to
3499 mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero
3500 sum = _mm_srai_epi16 (sum, 8); //get high half
3501 sum = _mm_add_epi16 (sum, mask1); //actual rounding
3502 sum = _mm_packus_epi16 (sum, sum);
3503 return64(sum);
3504}
3505
3506_NEON2SSESTORAGE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0
3507_NEON2SSE_INLINE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b)
3508{
    //this SIMD solution may not be optimal, a serial one may be faster
3510 uint16x4_t res64;
3511 __m128i sum, mask1;
3512 sum = _mm_add_epi32 (a, b);
3513 mask1 = _mm_slli_epi32(sum, 16); //shift left then back right to
3514 mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero
3515 sum = _mm_srai_epi32 (sum, 16); //get high half
3516 sum = _mm_add_epi32 (sum, mask1); //actual rounding
3517 sum = _MM_PACKUS1_EPI32 (sum);
3518 return64(sum);
3519}
3520
3521_NEON2SSE_GLOBAL uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0
3522#define vraddhn_u64 vraddhn_s64
3523
3524//**********************************************************************************
3525//********* Multiplication *************************************
3526//**************************************************************************************
3527
3528//Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
//Since the result is not widened, these functions are equivalent to "multiply low" in x86
3530_NEON2SSESTORAGE int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0
3531_NEON2SSE_INLINE int8x8_t vmul_s8(int8x8_t a, int8x8_t b) // VMUL.I8 d0,d0,d0
3532{
3533 // no 8 bit simd multiply, need to go to 16 bits in SSE
3534 int8x8_t res64;
3535 __m128i a128, b128, res;
3536 a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits
3537 b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
3538 res = _mm_mullo_epi16 (a128, b128);
3539 res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit from 16, use 64 low bits only
3540 return64(res);
3541}
3542
3543_NEON2SSE_GLOBAL int16x4_t vmul_s16(int16x4_t a, int16x4_t b); // VMUL.I16 d0,d0,d0
3544#define vmul_s16 vmul_u16
3545
3546_NEON2SSE_GLOBAL int32x2_t vmul_s32(int32x2_t a, int32x2_t b); // VMUL.I32 d0,d0,d0
3547#define vmul_s32 vmul_u32
3548
3549_NEON2SSESTORAGE float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0
3550_NEON2SSE_INLINE float32x2_t vmul_f32(float32x2_t a, float32x2_t b)
3551{
3552 float32x4_t tmp;
3553 __m64_128 res64;
3554 tmp = _mm_mul_ps(_pM128(a),_pM128(b));
3555 _M64f(res64, tmp); //use low 64 bits
3556 return res64;
3557}
3558
3559_NEON2SSESTORAGE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0
3560_NEON2SSE_INLINE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b) // VMUL.I8 d0,d0,d0
3561{
3562 // no 8 bit simd multiply, need to go to 16 bits in SSE
3563 uint8x8_t res64;
3564 __m128i mask, a128, b128, res;
3565 mask = _mm_set1_epi16(0xff);
3566 a128 = _MM_CVTEPU8_EPI16 (_pM128i(a));
3567 b128 = _MM_CVTEPU8_EPI16 (_pM128i(b));
3568 res = _mm_mullo_epi16 (a128, b128);
3569 res = _mm_and_si128(res, mask); //to avoid saturation
3570 res = _mm_packus_epi16 (res,res); //use only low 64 bits
3571 return64(res);
3572}
3573
3574_NEON2SSESTORAGE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0
3575_NEON2SSE_INLINE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b)
3576{
3577 uint16x4_t res64;
3578 return64(_mm_mullo_epi16(_pM128i(a),_pM128i(b)));
3579}
3580
3581_NEON2SSESTORAGE uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0
3582_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
3583{
3584 uint32x2_t res;
3585 res.m64_u32[0] = a.m64_u32[0] * b.m64_u32[0];
3586 res.m64_u32[1] = a.m64_u32[1] * b.m64_u32[1];
3587 return res;
3588}
3589
3590_NEON2SSESTORAGE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0
3591_NEON2SSE_INLINE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b)
3592{
3593 //may be optimized
3594 poly8x8_t res64;
3595 __m128i a64, b64, c1, res, tmp, bmasked;
3596 int i;
3597 a64 = _pM128i(a);
3598 b64 = _pM128i(b);
3599 c1 = _mm_cmpeq_epi8 (a64,a64); //all ones 0xff....
3600 c1 = vshrq_n_u8(c1,7); //0x1
3601 bmasked = _mm_and_si128(b64, c1); //0x1
3602 res = vmulq_u8(a64, bmasked);
3603 for(i = 1; i<8; i++) {
3604 c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
3605 bmasked = _mm_and_si128(b64, c1); //0x1
3606 tmp = vmulq_u8(a64, bmasked);
3607 res = _mm_xor_si128(res, tmp);
3608 }
3609 return64 (res);
3610}
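//Rationale: poly8 multiplication is carry-less (GF(2)): partial products are combined with XOR rather
//than addition, which is what the loop above does - for every bit i of b it multiplies a by that single
//bit (already shifted into place) and XORs the result in. E.g. 0x03 * 0x03 = 0x05 as polynomials,
//since (x + 1)*(x + 1) = x^2 + 1 over GF(2); only the low 8 bits of each product are kept, as VMUL.P8 requires.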
3611
3612_NEON2SSESTORAGE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0
3613_NEON2SSE_INLINE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b) // VMUL.I8 q0,q0,q0
3614{
3615 // no 8 bit simd multiply, need to go to 16 bits
    //solution may not be optimal
3617 __m128i a16, b16, r16_1, r16_2;
3618 a16 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1
3619 b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
3620 r16_1 = _mm_mullo_epi16 (a16, b16);
3621 //swap hi and low part of a and b to process the remaining data
3622 a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
3623 b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
3624 a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1
    b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
3626
3627 r16_2 = _mm_mullo_epi16 (a16, b16);
3628 r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*)mask8_16_even_odd); //return to 8 bit
3629 r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*)mask8_16_even_odd); //return to 8 bit
3630
3631 return _mm_unpacklo_epi64(r16_1, r16_2);
3632}
3633
3634_NEON2SSE_GLOBAL int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0
3635#define vmulq_s16 _mm_mullo_epi16
3636
3637_NEON2SSE_GLOBAL int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0
3638#define vmulq_s32 _MM_MULLO_EPI32 //SSE4.1
3639
3640_NEON2SSE_GLOBAL float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
3641#define vmulq_f32 _mm_mul_ps
3642
3643_NEON2SSESTORAGE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0
3644_NEON2SSE_INLINE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b) // VMUL.I8 q0,q0,q0
3645{
3646 // no 8 bit simd multiply, need to go to 16 bits
    //solution may not be optimal
3648 __m128i maskff, a16, b16, r16_1, r16_2;
3649 maskff = _mm_set1_epi16(0xff);
3650 a16 = _MM_CVTEPU8_EPI16 (a); // SSE 4.1
3651 b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
3652 r16_1 = _mm_mullo_epi16 (a16, b16);
3653 r16_1 = _mm_and_si128(r16_1, maskff); //to avoid saturation
3654 //swap hi and low part of a and b to process the remaining data
3655 a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
3656 b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
3657 a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1
3658 b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
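    //sign extension is used for the high half, which is fine: only the low 8 bits of each 16-bit product are kept below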
3659
3660 r16_2 = _mm_mullo_epi16 (a16, b16);
3661 r16_2 = _mm_and_si128(r16_2, maskff); //to avoid saturation
3662 return _mm_packus_epi16 (r16_1, r16_2);
3663}
3664
3665_NEON2SSE_GLOBAL uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0
3666#define vmulq_u16 _mm_mullo_epi16
3667
3668_NEON2SSE_GLOBAL uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0
3669#define vmulq_u32 _MM_MULLO_EPI32 //SSE4.1
3670
3671_NEON2SSESTORAGE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0
3672_NEON2SSE_INLINE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b)
3673{
3674 //may be optimized
3675 __m128i c1, res, tmp, bmasked;
3676 int i;
3677 c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff....
3678 c1 = vshrq_n_u8(c1,7); //0x1
3679 bmasked = _mm_and_si128(b, c1); //0x1
3680 res = vmulq_u8(a, bmasked);
3681 for(i = 1; i<8; i++) {
3682 c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
3683 bmasked = _mm_and_si128(b, c1); //0x1
3684 tmp = vmulq_u8(a, bmasked);
3685 res = _mm_xor_si128(res, tmp);
3686 }
3687 return res;
3688}
3689
3690//************************* Vector long multiply ***********************************
3691//****************************************************************************
3692_NEON2SSESTORAGE int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0
3693_NEON2SSE_INLINE int16x8_t vmull_s8(int8x8_t a, int8x8_t b) // VMULL.S8 q0,d0,d0
3694{
3695 //no 8 bit simd multiply, need to go to 16 bits
3696 __m128i a16, b16;
3697 a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
3698 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1
3699 return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit
3700}
3701
3702_NEON2SSESTORAGE int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0
3703_NEON2SSE_INLINE int32x4_t vmull_s16(int16x4_t a, int16x4_t b) // VMULL.S16 q0,d0,d0
3704{
3705#ifdef USE_SSE4
3706 __m128i a16, b16;
3707 a16 = _MM_CVTEPI16_EPI32 (_pM128i(a)); // SSE 4.1
3708 b16 = _MM_CVTEPI16_EPI32 (_pM128i(b)); // SSE 4.1
3709 return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1
3710#else
3711 __m128i low, hi, a128,b128;
3712 a128 = _pM128i(a);
3713 b128 = _pM128i(b);
3714 low = _mm_mullo_epi16(a128,b128);
3715 hi = _mm_mulhi_epi16(a128,b128);
3716 return _mm_unpacklo_epi16(low,hi);
3717#endif
3718}
3719
3720_NEON2SSESTORAGE int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0
3721_NEON2SSE_INLINE int64x2_t vmull_s32(int32x2_t a, int32x2_t b) // VMULL.S32 q0,d0,d0
3722{
3723 __m128i ab, ba, a128, b128;
3724 a128 = _pM128i(a);
3725 b128 = _pM128i(b);
3726 ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1
3727 ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1
    return _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64-bit result
3729}
3730
3731_NEON2SSESTORAGE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0
3732_NEON2SSE_INLINE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b) // VMULL.U8 q0,d0,d0
3733{
3734 //no 8 bit simd multiply, need to go to 16 bits
3735 __m128i a16, b16;
3736 a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1
3737 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1
3738 return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit
3739}
3740
_NEON2SSESTORAGE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.U16 q0,d0,d0
_NEON2SSE_INLINE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b) // VMULL.U16 q0,d0,d0
3743{
3744#ifdef USE_SSE4
3745 __m128i a16, b16;
3746 a16 = _MM_CVTEPU16_EPI32 (_pM128i(a)); // SSE 4.1
3747 b16 = _MM_CVTEPU16_EPI32 (_pM128i(b)); // SSE 4.1
3748 return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1
3749#else
3750 __m128i a128,b128,low, hi;
3751 a128 = _pM128i(a);
3752 b128 = _pM128i(b);
3753 low = _mm_mullo_epi16(a128,b128);
3754 hi = _mm_mulhi_epu16(a128,b128);
3755 return _mm_unpacklo_epi16(low,hi);
3756#endif
3757}
3758
3759_NEON2SSESTORAGE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0
3760_NEON2SSE_INLINE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b) // VMULL.U32 q0,d0,d0
3761{
    //may not be optimal compared with a serial implementation
3763 __m128i ab, ba, a128, b128;
3764 a128 = _pM128i(a);
3765 b128 = _pM128i(b);
3766 ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1
3767 ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1
    return _mm_mul_epu32 (ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64-bit result
3769}
3770
3771_NEON2SSESTORAGE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0
3772_NEON2SSE_INLINE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b)
3773{
3774 //may be optimized
3775 __m128i a128,b128, c1, a128_16, bmasked_16, res, tmp, bmasked;
3776 int i;
3777 a128 = _pM128i(a);
3778 b128 = _pM128i(b);
3779 c1 = _mm_cmpeq_epi8 (a128,a128); //all ones 0xff....
3780 c1 = vshrq_n_u8(c1,7); //0x1
3781 bmasked = _mm_and_si128(b128, c1); //0x1
3782
3783 a128_16 = _MM_CVTEPU8_EPI16 (a128); // SSE 4.1
3784 bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1
3785 res = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit
3786 for(i = 1; i<8; i++) {
3787 c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
3788 bmasked = _mm_and_si128(b128, c1); //0x1
3789 bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1
3790 tmp = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit, vmull_u8(a, bmasked);
3791 res = _mm_xor_si128(res, tmp);
3792 }
3793 return res;
3794}
3795
3796//****************Vector saturating doubling long multiply **************************
3797//*****************************************************************
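//vqdmull computes saturate(2 * a[i] * b[i]) into the double-width lane; the only input that can saturate
//is a[i] == b[i] == INT16_MIN (resp. INT32_MIN), where 2*(-32768)*(-32768) = 2^31 exceeds INT32_MAX and
//is clamped to 0x7FFFFFFF.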
3798_NEON2SSESTORAGE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
3799_NEON2SSE_INLINE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b)
3800{
    //the serial solution may be faster due to saturation
3802 __m128i res;
3803 res = vmull_s16(a, b);
3804 return vqd_s32(res);
3805}
3806
3807_NEON2SSESTORAGE int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
3808_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL)
3809{
    //the serial solution may be faster due to saturation
3811 __m128i res;
3812 res = vmull_s32(a,b);
3813 return vqaddq_s64(res,res); //slow serial function!!!!
3814}
3815
3816//********************* Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i] ************************
3817//******************************************************************************************
3818_NEON2SSESTORAGE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0
3819_NEON2SSE_INLINE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLA.I8 d0,d0,d0
3820{
3821 // no 8 bit x86 simd multiply, need to go to 16 bits, and use the low 64 bits
3822 int8x8_t res64;
3823 __m128i b128, c128, res;
3824 b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
3825 c128 = _MM_CVTEPI8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits
3826 res = _mm_mullo_epi16 (c128, b128);
3827 res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd);
3828 res = _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits
3829 return64(res);
3830}
3831
3832_NEON2SSESTORAGE int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0
3833_NEON2SSE_INLINE int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c)
3834{
3835 int16x4_t res64;
3836 return64(vmlaq_s16(_pM128i(a),_pM128i(b), _pM128i(c)));
3837}
3838
3839
3840_NEON2SSESTORAGE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0
3841_NEON2SSE_INLINE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLA.I32 d0,d0,d0
3842{
3843 int32x2_t res64;
3844 __m128i res;
3845 res = _MM_MULLO_EPI32 (_pM128i(b), _pM128i(c)); //SSE4.1
3846 res = _mm_add_epi32 (res, _pM128i(a)); //use the low 64 bits
3847 return64(res);
3848}
3849
3850_NEON2SSESTORAGE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0
3851_NEON2SSE_INLINE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c)
3852{
    //no FMA used here, just multiply and add:
3854 __m128 res;
3855 __m64_128 res64;
3856 res = _mm_mul_ps (_pM128(c), _pM128(b));
3857 res = _mm_add_ps (_pM128(a), res);
3858 _M64f(res64, res);
3859 return res64;
3860}
3861
3862_NEON2SSESTORAGE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0
3863_NEON2SSE_INLINE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) // VMLA.I8 d0,d0,d0
3864{
3865 // no 8 bit x86 simd multiply, need to go to 16 bits, and use the low 64 bits
3866 uint8x8_t res64;
3867 __m128i mask, b128, c128, res;
3868 mask = _mm_set1_epi16(0xff);
3869 b128 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
3870 c128 = _MM_CVTEPU8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits
3871 res = _mm_mullo_epi16 (c128, b128);
3872 res = _mm_and_si128(res, mask); //to avoid saturation
3873 res = _mm_packus_epi16 (res, res);
3874 res = _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits
3875 return64(res);
3876}
3877
3878_NEON2SSE_GLOBAL uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0
3879#define vmla_u16 vmla_s16
3880
3881_NEON2SSE_GLOBAL uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0
3882#define vmla_u32 vmla_s32
3883
3884_NEON2SSESTORAGE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0
3885_NEON2SSE_INLINE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLA.I8 q0,q0,q0
3886{
    //solution may not be optimal
3888 // no 8 bit simd multiply, need to go to 16 bits
3889 __m128i b16, c16, r16_1, a_2,r16_2;
3890 b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
3891 c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1
3892 r16_1 = _mm_mullo_epi16 (b16, c16);
3893 r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
3894 r16_1 = _mm_add_epi8 (r16_1, a);
3895 //swap hi and low part of a, b and c to process the remaining data
3896 a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
3897 b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
3898 c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
3899 b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
3900 c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1
3901
3902 r16_2 = _mm_mullo_epi16 (b16, c16);
3903 r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
3904 r16_2 = _mm_add_epi8(r16_2, a_2);
3905 return _mm_unpacklo_epi64(r16_1,r16_2);
3906}
3907
3908_NEON2SSESTORAGE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0
3909_NEON2SSE_INLINE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLA.I16 q0,q0,q0
3910{
3911 __m128i res;
3912 res = _mm_mullo_epi16 (c, b);
3913 return _mm_add_epi16 (res, a);
3914}
3915
3916_NEON2SSESTORAGE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0
3917_NEON2SSE_INLINE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLA.I32 q0,q0,q0
3918{
3919 __m128i res;
3920 res = _MM_MULLO_EPI32 (c, b); //SSE4.1
3921 return _mm_add_epi32 (res, a);
3922}
3923
3924_NEON2SSESTORAGE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0
3925_NEON2SSE_INLINE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLA.F32 q0,q0,q0
3926{
    //no FMA used here, just multiply and add:
3928 __m128 res;
3929 res = _mm_mul_ps (c, b);
3930 return _mm_add_ps (a, res);
3931}
3932
3933_NEON2SSESTORAGE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0
3934_NEON2SSE_INLINE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLA.I8 q0,q0,q0
3935{
    //solution may not be optimal
3937 // no 8 bit simd multiply, need to go to 16 bits
3938 __m128i b16, c16, r16_1, a_2, r16_2;
3939 b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
3940 c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1
3941 r16_1 = _mm_mullo_epi16 (b16, c16);
3942 r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
3943 r16_1 = _mm_add_epi8 (r16_1, a);
3944 //swap hi and low part of a, b and c to process the remaining data
3945 a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
3946 b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
3947 c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
3948 b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1
3949 c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1
3950
3951 r16_2 = _mm_mullo_epi16 (b16, c16);
3952 r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
3953 r16_2 = _mm_add_epi8(r16_2, a_2);
3954 return _mm_unpacklo_epi64(r16_1,r16_2);
3955}
3956
3957_NEON2SSE_GLOBAL uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0
3958#define vmlaq_u16 vmlaq_s16
3959
3960_NEON2SSE_GLOBAL uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0
3961#define vmlaq_u32 vmlaq_s32
3962
3963//********************** Vector widening multiply accumulate (long multiply accumulate):
// vmlal -> Vr[i] := Va[i] + Vb[i] * Vc[i] **************
3965//********************************************************************************************
3966_NEON2SSESTORAGE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0
3967_NEON2SSE_INLINE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLAL.S8 q0,d0,d0
3968{
3969 int16x8_t res;
3970 res = vmull_s8(b, c);
3971 return _mm_add_epi16 (res, a);
3972}
3973
3974_NEON2SSESTORAGE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0
3975_NEON2SSE_INLINE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLAL.S16 q0,d0,d0
3976{
    //may not be optimal compared with a serial implementation
3978 int32x4_t res;
3979 res = vmull_s16(b, c);
3980 return _mm_add_epi32 (res, a);
3981}
3982
3983_NEON2SSESTORAGE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0
3984_NEON2SSE_INLINE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLAL.S32 q0,d0,d0
3985{
    //may not be optimal compared with a serial implementation
3987 int64x2_t res;
3988 res = vmull_s32( b, c);
3989 return _mm_add_epi64 (res, a);
3990}
3991
3992_NEON2SSESTORAGE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0
3993_NEON2SSE_INLINE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLAL.U8 q0,d0,d0
3994{
3995 uint16x8_t res;
3996 res = vmull_u8(b, c);
3997 return _mm_add_epi16 (res, a);
3998}
3999
_NEON2SSESTORAGE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.U16 q0,d0,d0
_NEON2SSE_INLINE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLAL.U16 q0,d0,d0
4002{
    //may not be optimal compared with a serial implementation
4004 uint32x4_t res;
4005 res = vmull_u16(b, c);
4006 return _mm_add_epi32 (res, a);
4007}
4008
4009_NEON2SSESTORAGE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0
4010_NEON2SSE_INLINE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLAL.U32 q0,d0,d0
4011{
    //may not be optimal compared with a serial implementation
4013 int64x2_t res;
4014 res = vmull_u32( b,c);
4015 return _mm_add_epi64 (res, a);
4016}
4017
4018//******************** Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i] ***************************************
4019//********************************************************************************************
4020_NEON2SSESTORAGE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0
4021_NEON2SSE_INLINE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLS.I8 d0,d0,d0
4022{
4023 // no 8 bit simd multiply, need to go to 16 bits - and use the low 64 bits
4024 int8x8_t res64;
4025 __m128i res;
4026 res64 = vmul_s8(b,c);
4027 res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64));
4028 return64(res);
4029}
4030
4031_NEON2SSESTORAGE int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0
4032_NEON2SSE_INLINE int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c)
4033{
4034 int16x4_t res64;
4035 return64(vmlsq_s16(_pM128i(a),_pM128i(b), _pM128i(c)));
4036}
4037
4038
4039_NEON2SSESTORAGE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0
4040_NEON2SSE_INLINE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLS.I32 d0,d0,d0
4041{
4042 int32x2_t res64;
4043 __m128i res;
4044 res = _MM_MULLO_EPI32 (_pM128i(c),_pM128i( b)); //SSE4.1
4045 res = _mm_sub_epi32 (_pM128i(a),res); //use low 64 bits only
4046 return64(res);
4047}
4048
4049_NEON2SSESTORAGE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0
4050_NEON2SSE_INLINE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c)
4051{
4052 __m128 res;
4053 __m64_128 res64;
4054 res = _mm_mul_ps (_pM128(c), _pM128(b));
4055 res = _mm_sub_ps (_pM128(a), res);
4056 _M64f(res64, res);
4057 return res64;
4058}
4059
4060_NEON2SSESTORAGE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0
4061_NEON2SSE_INLINE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c)
4062{
4063 // no 8 bit simd multiply, need to go to 16 bits - and use the low 64 bits
4064 uint8x8_t res64;
4065 __m128i res;
4066 res64 = vmul_u8(b,c);
4067 res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64));
4068 return64(res);
4069}
4070
4071_NEON2SSE_GLOBAL uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0
4072#define vmls_u16 vmls_s16
4073
4074_NEON2SSE_GLOBAL uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0
4075#define vmls_u32 vmls_s32
4076
4077
4078_NEON2SSESTORAGE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0
4079_NEON2SSE_INLINE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLS.I8 q0,q0,q0
4080{
    //solution may not be optimal
4082 // no 8 bit simd multiply, need to go to 16 bits
4083 __m128i b16, c16, r16_1, a_2, r16_2;
4084 b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
4085 c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1
4086 r16_1 = _mm_mullo_epi16 (b16, c16);
4087 r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd);
4088 r16_1 = _mm_sub_epi8 (a, r16_1);
4089 //swap hi and low part of a, b, c to process the remaining data
4090 a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
4091 b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
4092 c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
4093 b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
4094 c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1
4095
4096 r16_2 = _mm_mullo_epi16 (b16, c16);
4097 r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
4098 r16_2 = _mm_sub_epi8 (a_2, r16_2);
4099 return _mm_unpacklo_epi64(r16_1,r16_2);
4100}
4101
4102_NEON2SSESTORAGE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0
4103_NEON2SSE_INLINE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLS.I16 q0,q0,q0
4104{
4105 __m128i res;
4106 res = _mm_mullo_epi16 (c, b);
4107 return _mm_sub_epi16 (a, res);
4108}
4109
4110_NEON2SSESTORAGE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0
4111_NEON2SSE_INLINE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLS.I32 q0,q0,q0
4112{
4113 __m128i res;
4114 res = _MM_MULLO_EPI32 (c, b); //SSE4.1
4115 return _mm_sub_epi32 (a, res);
4116}
4117
4118_NEON2SSESTORAGE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0
4119_NEON2SSE_INLINE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLS.F32 q0,q0,q0
4120{
4121 __m128 res;
4122 res = _mm_mul_ps (c, b);
4123 return _mm_sub_ps (a, res);
4124}
4125
4126_NEON2SSESTORAGE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0
4127_NEON2SSE_INLINE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLS.I8 q0,q0,q0
4128{
    //solution may not be optimal
4130 // no 8 bit simd multiply, need to go to 16 bits
4131 __m128i b16, c16, r16_1, a_2, r16_2;
4132 b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
4133 c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1
4134 r16_1 = _mm_mullo_epi16 (b16, c16);
4135 r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
4136 r16_1 = _mm_sub_epi8 (a, r16_1);
4137 //swap hi and low part of a, b and c to process the remaining data
4138 a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
4139 b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
4140 c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
4141 b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1
4142 c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1
4143
4144 r16_2 = _mm_mullo_epi16 (b16, c16);
4145 r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
4146 r16_2 = _mm_sub_epi8(a_2, r16_2);
4147 return _mm_unpacklo_epi64(r16_1,r16_2);
4148}
4149
4150_NEON2SSE_GLOBAL uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0
4151#define vmlsq_u16 vmlsq_s16
4152
4153_NEON2SSE_GLOBAL uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0
4154#define vmlsq_u32 vmlsq_s32
4155
4156//******************** Vector multiply subtract long (widening multiply subtract) ************************************
4157//*************************************************************************************************************
4158_NEON2SSESTORAGE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0
4159_NEON2SSE_INLINE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLSL.S8 q0,d0,d0
4160{
4161 int16x8_t res;
4162 res = vmull_s8(b, c);
4163 return _mm_sub_epi16 (a, res);
4164}
4165
4166_NEON2SSESTORAGE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0
4167_NEON2SSE_INLINE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLSL.S16 q0,d0,d0
4168{
    //may not be optimal compared with a serial implementation
4170 int32x4_t res;
4171 res = vmull_s16(b, c);
4172 return _mm_sub_epi32 (a, res);
4173}
4174
4175_NEON2SSESTORAGE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0
4176_NEON2SSE_INLINE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLSL.S32 q0,d0,d0
4177{
    //may not be optimal compared with a serial implementation
4179 int64x2_t res;
4180 res = vmull_s32( b,c);
4181 return _mm_sub_epi64 (a, res);
4182}
4183
4184_NEON2SSESTORAGE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0
4185_NEON2SSE_INLINE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLSL.U8 q0,d0,d0
4186{
4187 uint16x8_t res;
4188 res = vmull_u8(b, c);
4189 return _mm_sub_epi16 (a, res);
4190}
4191
_NEON2SSESTORAGE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.U16 q0,d0,d0
_NEON2SSE_INLINE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLSL.U16 q0,d0,d0
4194{
    //may not be optimal compared with a serial implementation
4196 uint32x4_t res;
4197 res = vmull_u16(b, c);
4198 return _mm_sub_epi32 (a, res);
4199}
4200
4201_NEON2SSESTORAGE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0
4202_NEON2SSE_INLINE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLSL.U32 q0,d0,d0
4203{
    //may not be optimal compared with a serial implementation
4205 int64x2_t res;
4206 res = vmull_u32( b,c);
4207 return _mm_sub_epi64 (a, res);
4208}
4209
4210//****** Vector saturating doubling multiply high **********************
4211//*************************************************************************
4212_NEON2SSESTORAGE int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b); // VQDMULH.S16 d0,d0,d0
4213_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
4214{
4215 int16x4_t res;
4216 int32_t a32, b32, i;
4217 for (i = 0; i<4; i++) {
4218 a32 = (int32_t) a.m64_i16[i];
4219 b32 = (int32_t) b.m64_i16[i];
4220 a32 = (a32 * b32) >> 15;
4221 res.m64_i16[i] = (a32 == 0x8000) ? 0x7fff : (int16_t) a32;
4222 }
4223 return res;
4224}
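//Note on the check above: (a32 * b32) >> 15 reaches 0x8000 only for a == b == -32768
//((-32768 * -32768) >> 15 == 32768), which would wrap to INT16_MIN when narrowed, so it is clamped to 0x7fff.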
4225
4226_NEON2SSESTORAGE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0
4227_NEON2SSE_INLINE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b) // no multiply high 32 bit SIMD in IA32, so need to do some tricks, serial solution may be faster
4228{
    //may not be optimal compared with a serial solution
4230 int32x2_t res64;
4231 __m128i mask;
4232 _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
4233 int64x2_t mul;
4234 mul = vmull_s32(a,b);
4235 mul = _mm_slli_epi64(mul,1); //double the result
4236 //at this point start treating 2 64-bit numbers as 4 32-bit
4237 mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
4238 mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
4239 mul = _mm_xor_si128 (mul, mask); //res saturated for 0x80000000
4240 return64(mul);
4241}
4242
4243_NEON2SSESTORAGE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
4244_NEON2SSE_INLINE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b) // VQDMULH.S16 q0,q0,q0
4245{
4246 __m128i res, res_lo, mask;
4247 _NEON2SSE_ALIGN_16 static const uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000};
4248 res = _mm_mulhi_epi16 (a, b);
4249 res = _mm_slli_epi16 (res, 1); //double the result, don't care about saturation
4250 res_lo = _mm_mullo_epi16 (a, b);
4251 res_lo = _mm_srli_epi16(res_lo,15); //take the highest bit
4252 res = _mm_add_epi16(res, res_lo); //combine results
4253 mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask);
4254 return _mm_xor_si128 (res, mask); //res saturated for 0x8000
4255}
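//Rationale: the high half of 2*(a*b) consists of bits 30..15 of the 32-bit product; _mm_mulhi_epi16
//supplies bits 31..16 (shifted left by one above), the missing bit 15 is recovered from the low product
//with the >> 15, and lanes equal to 0x8000 (possible only for a == b == -32768) are flipped to 0x7fff.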
4256
4257_NEON2SSESTORAGE int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0
4258_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
4259{
    // no 32-bit multiply-high SIMD in IA32; may not be optimal compared with a serial solution for the SSSE3 target
4261 __m128i ab, ba, mask, mul, mul1;
4262 _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
4263 ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1
4264 ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1
    mul = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64-bit result
4266 mul = _mm_slli_epi64(mul,1); //double the result
4267 ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3
4268 ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3
    mul1 = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64-bit result
4270 mul1 = _mm_slli_epi64(mul1,1); //double the result
4271 mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
4272 mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
4273 mul = _mm_unpacklo_epi64(mul, mul1);
4274 mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
4275 return _mm_xor_si128 (mul, mask); //res saturated for 0x80000000
4276}
4277
4278//********* Vector saturating rounding doubling multiply high ****************
4279//****************************************************************************
//If the _mm_mulhrs_xx functions are used, the result may differ slightly from the NEON one due to different rounding rules and operation order
4281_NEON2SSESTORAGE int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b); // VQRDMULH.S16 d0,d0,d0
4282_NEON2SSE_INLINE int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b)
4283{
4284 int16x4_t res64;
4285 return64(vqrdmulhq_s16(_pM128i(a), _pM128i(b)));
4286}
4287
4288_NEON2SSESTORAGE int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0
4289_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
4290{
    //may not be optimal compared with a serial solution
4292 int32x2_t res64;
4293 _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
4294 __m128i res_sat, mask, mask1;
4295 int64x2_t mul;
4296 mul = vmull_s32(a,b);
4297 res_sat = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered
4298 mask1 = _mm_slli_epi64(res_sat, 32); //shift left then back right to
4299 mask1 = _mm_srli_epi64(mask1,31); //get 31-th bit 1 or zero
4300 mul = _mm_add_epi32 (res_sat, mask1); //actual rounding
4301 //at this point start treating 2 64-bit numbers as 4 32-bit
4302 mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
4303 mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
4304 mul = _mm_xor_si128 (mul, mask); //res saturated for 0x80000000
4305 return64(mul);
4306}
4307
4308_NEON2SSESTORAGE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
4309_NEON2SSE_INLINE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b) // VQRDMULH.S16 q0,q0,q0
4310{
4311 __m128i mask, res;
4312 _NEON2SSE_ALIGN_16 static const uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000};
4313 res = _mm_mulhrs_epi16 (a, b);
4314 mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask);
4315 return _mm_xor_si128 (res, mask); //res saturated for 0x8000
4316}
4317
4318_NEON2SSESTORAGE int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0
4319_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
4320{
    // no 32-bit multiply-high SIMD in IA32; may not be optimal compared with a serial solution for the SSSE3 target
4322 __m128i ab, ba, mask, mul, mul1, mask1;
4323 _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
4324 ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1
4325 ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1
    mul = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64-bit result
4327 mul = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered
4328 mask1 = _mm_slli_epi64(mul, 32); //shift left then back right to
4329 mask1 = _mm_srli_epi64(mask1,31); //get 31-th bit 1 or zero
4330 mul = _mm_add_epi32 (mul, mask1); //actual rounding
4331
4332 ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3
4333 ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3
    mul1 = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64-bit result
4335 mul1 = _mm_slli_epi64 (mul1, 1); //double the result, saturation not considered
4336 mask1 = _mm_slli_epi64(mul1, 32); //shift left then back right to
4337 mask1 = _mm_srli_epi64(mask1,31); //get 31-th bit 1 or zero
4338 mul1 = _mm_add_epi32 (mul1, mask1); //actual rounding
4339 //at this point start treating 2 64-bit numbers as 4 32-bit
4340 mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
4341 mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
4342 mul = _mm_unpacklo_epi64(mul, mul1);
4343 mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
4344 return _mm_xor_si128 (mul, mask); //res saturated for 0x80000000
4345}
4346
4347//*************Vector widening saturating doubling multiply accumulate (long saturating doubling multiply accumulate) *****
4348//*************************************************************************************************************************
4349_NEON2SSESTORAGE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0
4350_NEON2SSE_INLINE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VQDMLAL.S16 q0,d0,d0
4351{
    //not an optimal SIMD solution, serial may be faster
4353 __m128i res32;
4354 res32 = vmull_s16(b, c);
4355 res32 = vqd_s32(res32); //doubling & saturation ,if no saturation we could use _mm_slli_epi32 (res, 1);
4356 return vqaddq_s32(res32, a); //saturation
4357}
4358
4359_NEON2SSESTORAGE int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0
4360_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c),_NEON2SSE_REASON_SLOW_SERIAL)
4361{
4362 __m128i res64;
4363 res64 = vmull_s32(b,c);
    res64 = vqaddq_s64(res64, res64); //doubling with saturation; without the saturation requirement _mm_slli_epi64 (res, 1) would suffice
4365 return vqaddq_s64(res64, a); //saturation
4366}
4367
4368//************************************************************************************
4369//****************** Vector subtract ***********************************************
4370//************************************************************************************
4371_NEON2SSESTORAGE int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0
4372_NEON2SSE_INLINE int8x8_t vsub_s8(int8x8_t a, int8x8_t b)
4373{
4374 int8x8_t res64;
4375 return64(_mm_sub_epi8(_pM128i(a),_pM128i(b)));
4376}
4377
4378
4379_NEON2SSESTORAGE int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0
4380_NEON2SSE_INLINE int16x4_t vsub_s16(int16x4_t a, int16x4_t b)
4381{
4382 int16x4_t res64;
4383 return64(_mm_sub_epi16(_pM128i(a),_pM128i(b)));
4384}
4385
4386
4387_NEON2SSESTORAGE int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0
4388_NEON2SSE_INLINE int32x2_t vsub_s32(int32x2_t a, int32x2_t b)
4389{
4390 int32x2_t res64;
4391 return64(_mm_sub_epi32(_pM128i(a),_pM128i(b)));
4392}
4393
4394
4395_NEON2SSESTORAGE int64x1_t vsub_s64(int64x1_t a, int64x1_t b); // VSUB.I64 d0,d0,d0
4396_NEON2SSE_INLINE int64x1_t vsub_s64(int64x1_t a, int64x1_t b)
4397{
4398 int64x1_t res64;
4399 res64.m64_i64[0] = a.m64_i64[0] - b.m64_i64[0];
4400 return res64;
4401}
4402
4403
4404_NEON2SSESTORAGE float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0
4405_NEON2SSE_INLINE float32x2_t vsub_f32(float32x2_t a, float32x2_t b)
4406{
4407 float32x2_t res;
4408 res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0];
4409 res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1];
4410 return res;
4411}
4412
4413_NEON2SSE_GLOBAL uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0
4414#define vsub_u8 vsub_s8
4415
4416_NEON2SSE_GLOBAL uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0
4417#define vsub_u16 vsub_s16
4418
4419_NEON2SSE_GLOBAL uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0
4420#define vsub_u32 vsub_s32
4421
4422
4423_NEON2SSESTORAGE uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b); // VSUB.I64 d0,d0,d0
4424_NEON2SSE_INLINE uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b)
4425{
4426 int64x1_t res64;
4427 res64.m64_u64[0] = a.m64_u64[0] - b.m64_u64[0];
4428 return res64;
4429}
4430
4431
4432_NEON2SSE_GLOBAL int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0
4433#define vsubq_s8 _mm_sub_epi8
4434
4435_NEON2SSE_GLOBAL int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0
4436#define vsubq_s16 _mm_sub_epi16
4437
4438_NEON2SSE_GLOBAL int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0
4439#define vsubq_s32 _mm_sub_epi32
4440
4441_NEON2SSE_GLOBAL int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0
4442#define vsubq_s64 _mm_sub_epi64
4443
4444_NEON2SSE_GLOBAL float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0
4445#define vsubq_f32 _mm_sub_ps
4446
4447_NEON2SSE_GLOBAL uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0
4448#define vsubq_u8 _mm_sub_epi8
4449
4450_NEON2SSE_GLOBAL uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0
4451#define vsubq_u16 _mm_sub_epi16
4452
4453_NEON2SSE_GLOBAL uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0
4454#define vsubq_u32 _mm_sub_epi32
4455
4456_NEON2SSE_GLOBAL uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0
4457#define vsubq_u64 _mm_sub_epi64
4458
4459//***************Vector long subtract: vsub -> Vr[i]:=Va[i]-Vb[i] ******************
4460//***********************************************************************************
4461//Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
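//The subtraction is performed at the doubled width, so no wrap-around occurs:
//e.g. vsubl_u8 with lanes 5 and 200 gives (uint16_t)0xFF3D (i.e. -195) rather than the 8-bit wrapped 0x3D.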
4462_NEON2SSESTORAGE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0
4463_NEON2SSE_INLINE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b) // VSUBL.S8 q0,d0,d0
4464{
4465 __m128i a16, b16;
4466 a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
4467 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
4468 return _mm_sub_epi16 (a16, b16);
4469}
4470
4471_NEON2SSESTORAGE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0
4472_NEON2SSE_INLINE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b) // VSUBL.S16 q0,d0,d0
4473{
4474 __m128i a32, b32;
4475 a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
4476 b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
4477 return _mm_sub_epi32 (a32, b32);
4478}
4479
4480_NEON2SSESTORAGE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0
4481_NEON2SSE_INLINE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b) // VSUBL.S32 q0,d0,d0
4482{
    //may not be optimal
4484 __m128i a64, b64;
4485 a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1
4486 b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1,
4487 return _mm_sub_epi64 (a64, b64);
4488}
4489
4490_NEON2SSESTORAGE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0
4491_NEON2SSE_INLINE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b) // VSUBL.U8 q0,d0,d0
4492{
4493 __m128i a16, b16;
4494 a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1,
4495 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
4496 return _mm_sub_epi16 (a16, b16);
4497}
4498
4499_NEON2SSESTORAGE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.s16 q0,d0,d0
4500_NEON2SSE_INLINE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b) // VSUBL.s16 q0,d0,d0
4501{
4502 __m128i a32, b32;
4503 a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1
4504 b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1,
4505 return _mm_sub_epi32 (a32, b32);
4506}
4507
4508_NEON2SSESTORAGE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0
4509_NEON2SSE_INLINE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b) // VSUBL.U32 q0,d0,d0
4510{
    //may not be optimal
4512 __m128i a64, b64;
4513 a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1
4514 b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1,
4515 return _mm_sub_epi64 (a64, b64);
4516}
4517
4518//***************** Vector wide subtract: vsub -> Vr[i]:=Va[i]-Vb[i] **********************************
4519//*****************************************************************************************************
4520_NEON2SSESTORAGE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0
4521_NEON2SSE_INLINE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b) // VSUBW.S8 q0,q0,d0
4522{
4523 __m128i b16;
4524 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
4525 return _mm_sub_epi16 (a, b16);
4526}
4527
4528_NEON2SSESTORAGE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0
4529_NEON2SSE_INLINE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b) // VSUBW.S16 q0,q0,d0
4530{
4531 __m128i b32;
4532 b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
4533 return _mm_sub_epi32 (a, b32);
4534}
4535
4536_NEON2SSESTORAGE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0
4537_NEON2SSE_INLINE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b) // VSUBW.S32 q0,q0,d0
4538{
4539 __m128i b64;
4540 b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
4541 return _mm_sub_epi64 (a, b64);
4542}
4543
4544_NEON2SSESTORAGE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0
4545_NEON2SSE_INLINE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b) // VSUBW.U8 q0,q0,d0
4546{
4547 __m128i b16;
4548 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
4549 return _mm_sub_epi16 (a, b16);
4550}
4551
4552_NEON2SSESTORAGE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.s16 q0,q0,d0
4553_NEON2SSE_INLINE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b) // VSUBW.s16 q0,q0,d0
4554{
4555 __m128i b32;
4556 b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1,
4557 return _mm_sub_epi32 (a, b32);
4558}
4559
4560_NEON2SSESTORAGE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0
4561_NEON2SSE_INLINE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b) // VSUBW.U32 q0,q0,d0
4562{
4563 __m128i b64;
4564 b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
4565 return _mm_sub_epi64 (a, b64);
4566}
4567
4568//************************Vector saturating subtract *********************************
4569//*************************************************************************************
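//Results are clamped to the lane range instead of wrapping,
//e.g. vqsub_u8: 5 - 10 -> 0;  vqsub_s8: -100 - 100 -> -128, 100 - (-100) -> 127.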
4570_NEON2SSESTORAGE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0
4571_NEON2SSE_INLINE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b)
4572{
4573 int8x8_t res64;
4574 return64(_mm_subs_epi8(_pM128i(a),_pM128i(b)));
4575}
4576
4577
4578_NEON2SSESTORAGE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0
4579_NEON2SSE_INLINE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b)
4580{
4581 int16x4_t res64;
4582 return64(_mm_subs_epi16(_pM128i(a),_pM128i(b)));
4583}
4584
4585
4586_NEON2SSESTORAGE int32x2_t vqsub_s32(int32x2_t a, int32x2_t b); // VQSUB.S32 d0,d0,d0
4587_NEON2SSE_INLINE int32x2_t vqsub_s32(int32x2_t a, int32x2_t b)
4588{
4589 int32x2_t res64;
4590 return64(vqsubq_s32(_pM128i(a), _pM128i(b)));
4591}
4592
4593
4594_NEON2SSESTORAGE int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqsub_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD solution
4596{
4597 uint64x1_t res;
4598 uint64_t a64,b64;
4599 a64 = a.m64_u64[0];
4600 b64 = b.m64_u64[0];
4601 res.m64_u64[0] = a64 - b64;
4602
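    //a64 is reused below as the saturation value: INT64_MIN if a was negative, INT64_MAX otherwise.
    //Its sign bit equals that of the original a, so the usual overflow test ((a ^ b) & (a ^ res)) < 0,
    //which inspects sign bits only, still works with the saturation value substituted for a.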
4603 a64 = (a64 >> 63) + (~_SIGNBIT64);
4604 if ((int64_t)((a64 ^ b64) & (a64 ^ res.m64_u64[0])) < 0) {
4605 res.m64_u64[0] = a64;
4606 }
4607 return res;
4608}
4609
4610_NEON2SSESTORAGE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0
4611_NEON2SSE_INLINE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b)
4612{
4613 uint8x8_t res64;
4614 return64(_mm_subs_epu8(_pM128i(a),_pM128i(b)));
4615}
4616
4617
4618_NEON2SSESTORAGE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.s16 d0,d0,d0
4619_NEON2SSE_INLINE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b)
4620{
4621 uint16x4_t res64;
4622 return64(_mm_subs_epu16(_pM128i(a),_pM128i(b)));
4623}
4624
4625
4626_NEON2SSESTORAGE uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b); // VQSUB.U32 d0,d0,d0
4627_NEON2SSE_INLINE uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b)
4628{
4629 uint32x2_t res64;
4630 return64(vqsubq_u32(_pM128i(a), _pM128i(b)));
4631}
4632
4633
4634_NEON2SSESTORAGE uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0
4635_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
4636{
4637 uint64x1_t res;
4638 uint64_t a64, b64;
4639 a64 = _Ui64(a);
4640 b64 = _Ui64(b);
4641 if (a64 > b64) {
4642 res.m64_u64[0] = a64 - b64;
4643 } else {
4644 res.m64_u64[0] = 0;
4645 }
4646 return res;
4647}
4648
4649_NEON2SSE_GLOBAL int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
4650#define vqsubq_s8 _mm_subs_epi8
4651
4652_NEON2SSE_GLOBAL int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
4653#define vqsubq_s16 _mm_subs_epi16
4654
4655_NEON2SSESTORAGE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
4656_NEON2SSE_INLINE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b)
4657{
    //no corresponding x86 SIMD solution, special tricks are necessary. Overflow is possible only if a and b have opposite signs and the result has the sign opposite to a
4659 __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a;
4660 c7fffffff = _mm_set1_epi32(0x7fffffff);
4661 res = _mm_sub_epi32(a, b);
4662 res_sat = _mm_srli_epi32(a, 31);
4663 res_sat = _mm_add_epi32(res_sat, c7fffffff);
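    //res_sat now holds the per-lane saturation value: 0x7fffffff where a >= 0, 0x80000000 where a < 0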
4664 res_xor_a = _mm_xor_si128(res, a);
4665 b_xor_a = _mm_xor_si128(b, a);
4666 res_xor_a = _mm_and_si128(b_xor_a, res_xor_a);
    res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if overflow occurred, all zeros otherwise
4668 res_sat = _mm_and_si128(res_xor_a, res_sat);
4669 res = _mm_andnot_si128(res_xor_a, res);
4670 return _mm_or_si128(res, res_sat);
4671}
4672
4673_NEON2SSESTORAGE int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
4674_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD soulution
4675{
4676 _NEON2SSE_ALIGN_16 int64_t atmp[2], btmp[2];
4677 _NEON2SSE_ALIGN_16 uint64_t res[2];
4678 _mm_store_si128((__m128i*)atmp, a);
4679 _mm_store_si128((__m128i*)btmp, b);
4680 res[0] = atmp[0] - btmp[0];
4681 res[1] = atmp[1] - btmp[1];
4682 if (((res[0] ^ atmp[0]) & _SIGNBIT64) && ((atmp[0] ^ btmp[0]) & _SIGNBIT64)) {
4683 res[0] = (atmp[0] >> 63) ^ ~_SIGNBIT64;
4684 }
4685 if (((res[1] ^ atmp[1]) & _SIGNBIT64) && ((atmp[1] ^ btmp[1]) & _SIGNBIT64)) {
4686 res[1] = (atmp[1] >> 63) ^ ~_SIGNBIT64;
4687 }
4688 return _mm_load_si128((__m128i*)res);
4689}
4690
4691_NEON2SSE_GLOBAL uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
4692#define vqsubq_u8 _mm_subs_epu8
4693
4694_NEON2SSE_GLOBAL uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.s16 q0,q0,q0
4695#define vqsubq_u16 _mm_subs_epu16
4696
4697_NEON2SSESTORAGE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
4698_NEON2SSE_INLINE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b) // VQSUB.U32 q0,q0,q0
4699{
4700 __m128i min, mask, sub;
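    //min(a,b) == b exactly when a >= b, so the mask keeps a - b where it is valid and zeroes (saturates) it otherwise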
4701 min = _MM_MIN_EPU32(a, b); //SSE4.1
4702 mask = _mm_cmpeq_epi32 (min, b);
4703 sub = _mm_sub_epi32 (a, b);
4704 return _mm_and_si128 ( sub, mask);
4705}
4706
_NEON2SSESTORAGE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b); // VQSUB.U64 q0,q0,q0
4708#ifdef USE_SSE4
4709 _NEON2SSE_INLINE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b)
4710 {
4711 __m128i c80000000, subb, suba, cmp, sub;
4712 c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0);
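        //c80000000 holds 0x8000000000000000 in each 64-bit lane; subtracting it maps unsigned ordering onto signed ordering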
4713 sub = _mm_sub_epi64 (a, b);
4714 suba = _mm_sub_epi64 (a, c80000000);
4715 subb = _mm_sub_epi64 (b, c80000000);
        cmp = _mm_cmpgt_epi64 ( suba, subb); //no unsigned 64-bit comparison available, so go via signed; _mm_cmpgt_epi64 requires SSE4.2
4717 return _mm_and_si128 (sub, cmp); //saturation
4718 }
4719#else
4720 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
4721 {
4722 _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
4723 _mm_store_si128((__m128i*)atmp, a);
4724 _mm_store_si128((__m128i*)btmp, b);
4725 res[0] = (atmp[0] > btmp[0]) ? atmp[0] - btmp[0] : 0;
4726 res[1] = (atmp[1] > btmp[1]) ? atmp[1] - btmp[1] : 0;
4727 return _mm_load_si128((__m128i*)(res));
4728 }
4729#endif
4730
4731//**********Vector halving subtract Vr[i]:=(Va[i]-Vb[i])>>1 ******************************************************
4732//****************************************************************
4733_NEON2SSESTORAGE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0
4734_NEON2SSE_INLINE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b) // VHSUB.S8 d0,d0,d0
4735{
    //no 8-bit shift available and internal overflow is possible, so widen to 16 bits
4737 int8x8_t res64;
4738 __m128i r16;
4739 int8x8_t r;
4740 r = vsub_s8 (a, b);
4741 r16 = _MM_CVTEPI8_EPI16 (_pM128i(r)); //SSE 4.1
4742 r16 = _mm_srai_epi16 (r16, 1); //SSE2
4743 r16 = _mm_packs_epi16 (r16,r16); //use low 64 bits
4744 return64(r16);
4745}
4746
4747_NEON2SSESTORAGE int16x4_t vhsub_s16(int16x4_t a, int16x4_t b); // VHSUB.S16 d0,d0,d0
4748_NEON2SSE_INLINE int16x4_t vhsub_s16(int16x4_t a, int16x4_t b)
4749{
4750 int16x4_t res64;
4751 return64(vhsubq_s16(_pM128i(a), _pM128i(b)));
4752}
4753
4754
4755
4756_NEON2SSESTORAGE int32x2_t vhsub_s32(int32x2_t a, int32x2_t b); // VHSUB.S32 d0,d0,d0
4757_NEON2SSE_INLINE int32x2_t vhsub_s32(int32x2_t a, int32x2_t b)
4758{
4759 int32x2_t res64;
4760 return64(vhsubq_s32(_pM128i(a), _pM128i(b)));
4761}
4762
4763
4764_NEON2SSESTORAGE uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b); // VHSUB.U8 d0,d0,d0
4765_NEON2SSE_INLINE uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b)
4766{
4767 uint8x8_t res64;
4768 return64(vhsubq_u8(_pM128i(a), _pM128i(b)));
4769}
4770
4771_NEON2SSESTORAGE uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b); // VHSUB.s16 d0,d0,d0
4772_NEON2SSE_INLINE uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b)
4773{
4774 uint16x4_t res64;
4775 return64(vhsubq_u16(_pM128i(a), _pM128i(b)));
4776}
4777
4778_NEON2SSESTORAGE uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b); // VHSUB.U32 d0,d0,d0
4779_NEON2SSE_INLINE uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b)
4780{
4781 uint32x2_t res64;
4782 return64(vhsubq_u32(_pM128i(a), _pM128i(b)));
4783}
4784
4785_NEON2SSESTORAGE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0
4786_NEON2SSE_INLINE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b) // VHSUB.S8 q0,q0,q0
4787{
4788 //need to deal with the possibility of internal overflow
4789 __m128i c128, au,bu;
4790 c128 = _mm_set1_epi8(-128); //(int8_t)0x80
4791 au = _mm_add_epi8( a, c128);
4792 bu = _mm_add_epi8( b, c128);
4793 return vhsubq_u8(au,bu);
4794}
4795
4796_NEON2SSESTORAGE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0
4797_NEON2SSE_INLINE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b) // VHSUB.S16 q0,q0,q0
4798{
4799 //need to deal with the possibility of internal overflow
4800 __m128i c8000, au,bu;
4801 c8000 = _mm_set1_epi16(-32768); //(int16_t)0x8000
4802 au = _mm_add_epi16( a, c8000);
4803 bu = _mm_add_epi16( b, c8000);
4804 return vhsubq_u16(au,bu);
4805}
4806
4807_NEON2SSESTORAGE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0
4808_NEON2SSE_INLINE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b) // VHSUB.S32 q0,q0,q0
4809{
4810 //need to deal with the possibility of internal overflow
4811 __m128i a2, b2,r, b_1;
4812 a2 = _mm_srai_epi32 (a,1);
4813 b2 = _mm_srai_epi32 (b,1);
4814 r = _mm_sub_epi32 (a2, b2);
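    //floor((a-b)/2) == (a>>1) - (b>>1) - ((~a & b) & 1): a correction of 1 is needed only when a is even and b is odd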
    b_1 = _mm_andnot_si128(a, b); //(~a) & b
    b_1 = _mm_slli_epi32 (b_1,31);
    b_1 = _mm_srli_epi32 (b_1,31); //0 or 1: set only if the low bit of b is 1 and the low bit of a is 0
4818 return _mm_sub_epi32(r,b_1);
4819}
4820
4821_NEON2SSESTORAGE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0
4822_NEON2SSE_INLINE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b) // VHSUB.U8 q0,q0,q0
4823{
4824 __m128i avg;
4825 avg = _mm_avg_epu8 (a, b);
4826 return _mm_sub_epi8(a, avg);
4827}
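//_mm_avg_epu8 computes (a+b+1)>>1 without overflow, and a - ((a+b+1)>>1) equals (a-b)>>1 modulo 256
//for all inputs, e.g. a=250, b=4: avg=127 and 250-127=123 == (250-4)>>1. The same identity is used for u16 below.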
4828
4829_NEON2SSESTORAGE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.s16 q0,q0,q0
4830_NEON2SSE_INLINE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b) // VHSUB.s16 q0,q0,q0
4831{
4832 __m128i avg;
4833 avg = _mm_avg_epu16 (a, b);
4834 return _mm_sub_epi16(a, avg);
4835}
4836
4837_NEON2SSESTORAGE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0
4838_NEON2SSE_INLINE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b) // VHSUB.U32 q0,q0,q0
4839{
4840 //need to deal with the possibility of internal overflow
4841 __m128i a2, b2,r, b_1;
4842 a2 = _mm_srli_epi32 (a,1);
4843 b2 = _mm_srli_epi32 (b,1);
4844 r = _mm_sub_epi32 (a2, b2);
    b_1 = _mm_andnot_si128(a, b); //(~a) & b
    b_1 = _mm_slli_epi32 (b_1,31);
    b_1 = _mm_srli_epi32 (b_1,31); //0 or 1: set only if the low bit of b is 1 and the low bit of a is 0
4848 return _mm_sub_epi32(r,b_1);
4849}
4850
4851//******* Vector subtract high half (truncated) ** ************
4852//************************************************************
4853_NEON2SSESTORAGE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0
4854_NEON2SSE_INLINE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b) // VSUBHN.I16 d0,q0,q0
4855{
4856 int8x8_t res64;
4857 __m128i sum, sum8;
4858 sum = _mm_sub_epi16 (a, b);
4859 sum8 = _mm_srai_epi16 (sum, 8);
4860 sum8 = _mm_packs_epi16(sum8,sum8);
4861 return64(sum8);
4862}
4863
4864_NEON2SSESTORAGE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0
4865_NEON2SSE_INLINE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b) // VSUBHN.I32 d0,q0,q0
4866{
4867 int16x4_t res64;
4868 __m128i sum, sum16;
4869 sum = _mm_sub_epi32 (a, b);
4870 sum16 = _mm_srai_epi32 (sum, 16);
4871 sum16 = _mm_packs_epi32(sum16,sum16);
4872 return64(sum16);
4873}
4874
4875_NEON2SSESTORAGE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0
4876_NEON2SSE_INLINE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b)
4877{
4878 int32x2_t res64;
4879 __m128i sub;
4880 sub = _mm_sub_epi64 (a, b);
    sub = _mm_shuffle_epi32(sub, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //move the high 32 bits of each 64-bit difference into the low 64 bits
4882 return64(sub);
4883}
4884
4885_NEON2SSESTORAGE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0
4886_NEON2SSE_INLINE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b) // VSUBHN.I16 d0,q0,q0
4887{
4888 uint8x8_t res64;
4889 __m128i sum, sum8;
4890 sum = _mm_sub_epi16 (a, b);
4891 sum8 = _mm_srli_epi16 (sum, 8);
4892 sum8 = _mm_packus_epi16(sum8,sum8);
4893 return64(sum8);
4894}
4895
4896_NEON2SSESTORAGE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0
4897_NEON2SSE_INLINE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b) // VSUBHN.I32 d0,q0,q0
4898{
4899 uint16x4_t res64;
4900 __m128i sum, sum16;
4901 sum = _mm_sub_epi32 (a, b);
4902 sum16 = _mm_srli_epi32 (sum, 16);
4903#ifdef USE_SSE4
4904 sum16 = _MM_PACKUS1_EPI32(sum16);
4905#else
4906 sum16 = _mm_shuffle_epi8 (sum16, *(__m128i*) mask8_32_even_odd); //go to 16 bits
4907#endif
4908 return64(sum16);
4909}
4910
4911_NEON2SSE_GLOBAL uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0
4912#define vsubhn_u64 vsubhn_s64
4913
4914//************ Vector rounding subtract high half *********************
4915//*********************************************************************
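//Vr[i] := (Va[i] - Vb[i] + (1 << (half_size - 1))) >> half_size, then narrowed to half width;
//e.g. for the 16->8 bit variants the rounding constant is 0x80 and the shift is 8.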
4916_NEON2SSESTORAGE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0
4917_NEON2SSE_INLINE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b) // VRSUBHN.I16 d0,q0,q0
4918{
4919 int8x8_t res64;
4920 __m128i sub, mask1;
4921 sub = _mm_sub_epi16 (a, b);
4922 mask1 = _mm_slli_epi16(sub, 8); //shift left then back right to
    mask1 = _mm_srli_epi16(mask1, 15); //get bit 7: 1 or zero (the rounding bit)
4924 sub = _mm_srai_epi16 (sub, 8); //get high half
4925 sub = _mm_add_epi16 (sub, mask1); //actual rounding
4926 sub = _mm_packs_epi16 (sub, sub);
4927 return64(sub);
4928}
4929
4930_NEON2SSESTORAGE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0
4931_NEON2SSE_INLINE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b) // VRSUBHN.I32 d0,q0,q0
4932{
    //SIMD may not be optimal; a serial version may be faster
4934 int16x4_t res64;
4935 __m128i sub, mask1;
4936 sub = _mm_sub_epi32 (a, b);
4937 mask1 = _mm_slli_epi32(sub, 16); //shift left then back right to
    mask1 = _mm_srli_epi32(mask1,31); //get bit 15: 1 or zero (the rounding bit)
4939 sub = _mm_srai_epi32 (sub, 16); //get high half
4940 sub = _mm_add_epi32 (sub, mask1); //actual rounding
4941 sub = _mm_packs_epi32 (sub, sub);
4942 return64(sub);
4943}
4944
4945_NEON2SSESTORAGE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0
4946_NEON2SSE_INLINE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b)
4947{
    //SIMD may not be optimal; a serial version may be faster
4949 int32x2_t res64;
4950 __m128i sub, mask1;
4951 sub = _mm_sub_epi64 (a, b);
4952 mask1 = _mm_slli_epi64(sub, 32); //shift left then back right to
    mask1 = _mm_srli_epi64(mask1,31); //get bit 31 as 0 or 1 in the upper 32-bit lane (the rounding addend)
4954 sub = _mm_add_epi32 (sub, mask1); //actual high half rounding
    sub = _mm_shuffle_epi32(sub, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //move the rounded high 32 bits of each 64-bit lane into the low 64 bits
4956 return64(sub);
4957}
4958
4959_NEON2SSESTORAGE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0
4960_NEON2SSE_INLINE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b) // VRSUBHN.I16 d0,q0,q0
4961{
4962 uint8x8_t res64;
4963 __m128i sub, mask1;
4964 sub = _mm_sub_epi16 (a, b);
4965 mask1 = _mm_slli_epi16(sub, 8); //shift left then back right to
    mask1 = _mm_srli_epi16(mask1, 15); //get bit 7: 1 or zero (the rounding bit)
4967 sub = _mm_srai_epi16 (sub, 8); //get high half
4968 sub = _mm_add_epi16 (sub, mask1); //actual rounding
4969 sub = _mm_packus_epi16 (sub, sub);
4970 return64(sub);
4971}
4972
4973_NEON2SSESTORAGE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0
4974_NEON2SSE_INLINE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b) // VRSUBHN.I32 d0,q0,q0
4975{
    //SIMD may not be optimal; a serial version may be faster
4977 uint16x4_t res64;
4978 __m128i sub, mask1;
4979 sub = _mm_sub_epi32 (a, b);
4980 mask1 = _mm_slli_epi32(sub, 16); //shift left then back right to
    mask1 = _mm_srli_epi32(mask1,31); //get bit 15: 1 or zero (the rounding bit)
4982 sub = _mm_srai_epi32 (sub, 16); //get high half
4983 sub = _mm_add_epi32 (sub, mask1); //actual rounding
4984#ifdef USE_SSE4
4985 sub = _MM_PACKUS1_EPI32 (sub);
4986#else
4987 sub = _mm_shuffle_epi8 (sub, *(__m128i*) mask8_32_even_odd); //go to 16 bits
4988#endif
4989 return64(sub);
4990}
4991
4992_NEON2SSE_GLOBAL uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0
4993#define vrsubhn_u64 vrsubhn_s64
4994
4995//*********** Vector saturating doubling multiply subtract long ********************
4996//************************************************************************************
4997_NEON2SSESTORAGE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0
4998_NEON2SSE_INLINE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c)
4999{
    //not an optimal SIMD solution; a serial version may be faster
5001 __m128i res32, mask;
5002 int32x4_t res;
5003 _NEON2SSE_ALIGN_16 static const uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
5004 res = vmull_s16(b, c);
5005 res32 = _mm_slli_epi32 (res, 1); //double the result, saturation not considered
5006 mask = _mm_cmpeq_epi32 (res32, *(__m128i*)cmask);
5007 res32 = _mm_xor_si128 (res32, mask); //res32 saturated for 0x80000000
5008 return vqsubq_s32(a, res32); //saturation
5009}
5010
5011_NEON2SSESTORAGE int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0
5012_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
5013{
5014 __m128i res64, mask;
5015 int64x2_t res;
5016 _NEON2SSE_ALIGN_16 static const uint64_t cmask[] = {0x8000000000000000, 0x8000000000000000};
5017 res = vmull_s32(b, c);
5018 res64 = _mm_slli_epi64 (res, 1); //double the result, saturation not considered
5019 mask = _MM_CMPEQ_EPI64 (res64, *(__m128i*)cmask);
    res64 = _mm_xor_si128 (res64, mask); //res64 saturated for 0x8000000000000000
5021 return vqsubq_s64(a, res64); //saturation
5022}
5023
5024//****************** COMPARISON ***************************************
5025//******************* Vector compare equal *************************************
5026//****************************************************************************
5027_NEON2SSESTORAGE uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0
_NEON2SSE_INLINE uint8x8_t vceq_s8(int8x8_t a, int8x8_t b)
5029{
5030 int8x8_t res64;
5031 return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b)));
5032}
5033
5034
5035_NEON2SSESTORAGE uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0
_NEON2SSE_INLINE uint16x4_t vceq_s16(int16x4_t a, int16x4_t b)
5037{
5038 int16x4_t res64;
5039 return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b)));
5040}
5041
5042
5043_NEON2SSESTORAGE uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vceq_s32(int32x2_t a, int32x2_t b)
5045{
5046 int32x2_t res64;
5047 return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b)));
5048}
5049
5050
5051_NEON2SSESTORAGE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0
5052_NEON2SSE_INLINE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b)
5053{
5054 uint32x2_t res64;
5055 __m128 res;
5056 res = _mm_cmpeq_ps(_pM128(a), _pM128(b) );
5057 return64f(res);
5058}
5059
5060_NEON2SSESTORAGE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0
5061_NEON2SSE_INLINE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b)
5062{
5063 uint8x8_t res64;
5064 return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b)));
5065}
5066
5067
5068_NEON2SSESTORAGE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0
5069_NEON2SSE_INLINE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b)
5070{
5071 uint16x4_t res64;
5072 return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b)));
5073}
5074
5075
5076_NEON2SSESTORAGE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0
5077_NEON2SSE_INLINE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b)
5078{
5079 uint32x2_t res64;
5080 return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b)));
5081}
5082
5083
5084_NEON2SSE_GLOBAL uint8x8_t vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0
5085#define vceq_p8 vceq_u8
5086
5087
5088_NEON2SSE_GLOBAL uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0
5089#define vceqq_s8 _mm_cmpeq_epi8
5090
5091_NEON2SSE_GLOBAL uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0
5092#define vceqq_s16 _mm_cmpeq_epi16
5093
5094_NEON2SSE_GLOBAL uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0
5095#define vceqq_s32 _mm_cmpeq_epi32
5096
5097_NEON2SSESTORAGE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0
5098_NEON2SSE_INLINE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b)
5099{
5100 __m128 res;
5101 res = _mm_cmpeq_ps(a,b);
5102 return _M128i(res);
5103}
5104
5105_NEON2SSE_GLOBAL uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0
5106#define vceqq_u8 _mm_cmpeq_epi8
5107
5108_NEON2SSE_GLOBAL uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0
5109#define vceqq_u16 _mm_cmpeq_epi16
5110
5111_NEON2SSE_GLOBAL uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0
5112#define vceqq_u32 _mm_cmpeq_epi32
5113
5114_NEON2SSE_GLOBAL uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0
5115#define vceqq_p8 _mm_cmpeq_epi8
5116
5117//******************Vector compare greater-than or equal*************************
5118//*******************************************************************************
//IA SIMD has no greater-than-or-equal comparison for integers;
//only greater-than is available, so the following tricks are needed
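//For the unsigned variants the operands are either handled with unsigned min/max (where SSE provides it)
//or biased by 0x80/0x8000/0x80000000 so that a signed compare reproduces the unsigned ordering,
//e.g. uint8 255 vs 1 becomes 127 vs -127, and 127 > -127 gives the correct result, while a plain
//signed compare of -1 vs 1 would not.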
5121
5122_NEON2SSESTORAGE uint8x8_t vcge_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
_NEON2SSE_INLINE uint8x8_t vcge_s8(int8x8_t a, int8x8_t b)
5124{
5125 int8x8_t res64;
5126 return64(vcgeq_s8(_pM128i(a), _pM128i(b)));
5127}
5128
5129
5130_NEON2SSESTORAGE uint16x4_t vcge_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
_NEON2SSE_INLINE uint16x4_t vcge_s16(int16x4_t a, int16x4_t b)
5132{
5133 int16x4_t res64;
5134 return64(vcgeq_s16(_pM128i(a), _pM128i(b)));
5135}
5136
5137
5138_NEON2SSESTORAGE uint32x2_t vcge_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vcge_s32(int32x2_t a, int32x2_t b)
5140{
5141 int32x2_t res64;
5142 return64(vcgeq_s32(_pM128i(a), _pM128i(b)));
5143}
5144
5145
5146_NEON2SSESTORAGE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
5147_NEON2SSE_INLINE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b)
5148{
5149 uint32x2_t res64;
5150 __m128 res;
    res = _mm_cmpge_ps(_pM128(a),_pM128(b)); //only the first 2 lanes are meaningful
5152 return64f(res);
5153}
5154
5155_NEON2SSESTORAGE uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
5156_NEON2SSE_INLINE uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b)
5157{
5158 uint8x8_t res64;
5159 return64(vcgeq_u8(_pM128i(a), _pM128i(b)));
5160}
5161
5162
5163_NEON2SSESTORAGE uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b); // VCGE.s16 d0, d0, d0
5164_NEON2SSE_INLINE uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b)
5165{
5166 uint16x4_t res64;
5167 return64(vcgeq_u16(_pM128i(a), _pM128i(b)));
5168}
5169
5170
5171_NEON2SSESTORAGE uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
5172_NEON2SSE_INLINE uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b)
5173{
5174 //serial solution looks faster
5175 uint32x2_t res64;
5176 return64(vcgeq_u32 (_pM128i(a), _pM128i(b)));
5177}
5178
5179
5180
5181_NEON2SSESTORAGE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
5182_NEON2SSE_INLINE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0
5183{
5184 __m128i m1, m2;
5185 m1 = _mm_cmpgt_epi8 ( a, b);
5186 m2 = _mm_cmpeq_epi8 ( a, b);
5187 return _mm_or_si128 ( m1, m2);
5188}
5189
5190_NEON2SSESTORAGE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
5191_NEON2SSE_INLINE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0
5192{
5193 __m128i m1, m2;
5194 m1 = _mm_cmpgt_epi16 ( a, b);
5195 m2 = _mm_cmpeq_epi16 ( a, b);
5196 return _mm_or_si128 ( m1,m2);
5197}
5198
5199_NEON2SSESTORAGE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
5200_NEON2SSE_INLINE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0
5201{
5202 __m128i m1, m2;
5203 m1 = _mm_cmpgt_epi32 (a, b);
5204 m2 = _mm_cmpeq_epi32 (a, b);
5205 return _mm_or_si128 (m1, m2);
5206}
5207
5208_NEON2SSESTORAGE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
5209_NEON2SSE_INLINE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b)
5210{
5211 __m128 res;
    res = _mm_cmpge_ps(a,b);
5213 return *(__m128i*)&res;
5214}
5215
5216_NEON2SSESTORAGE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
5217_NEON2SSE_INLINE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
5218{
    //no unsigned byte comparison, only signed is available, so a trick is needed
5220 __m128i cmp;
5221 cmp = _mm_max_epu8(a, b);
5222 return _mm_cmpeq_epi8(cmp, a); //a>=b
5223}
5224
5225_NEON2SSESTORAGE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0
5226_NEON2SSE_INLINE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
5227{
    //no unsigned short comparison, only signed is available, so a trick is needed
5229#ifdef USE_SSE4
5230 __m128i cmp;
5231 cmp = _mm_max_epu16(a, b);
5232 return _mm_cmpeq_epi16(cmp, a); //a>=b
5233#else
5234 __m128i zero = _mm_setzero_si128();
5235 __m128i as = _mm_subs_epu16(b, a);
5236 return _mm_cmpeq_epi16(as, zero);
5237#endif
5238}
5239
5240_NEON2SSESTORAGE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
5241_NEON2SSE_INLINE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
5242{
    //no unsigned int comparison, only signed is available, so a trick is needed
5244#ifdef USE_SSE4
5245 __m128i cmp;
5246 cmp = _mm_max_epu32(a, b);
5247 return _mm_cmpeq_epi32(cmp, a); //a>=b
5248#else
5249 //serial solution may be faster
5250 __m128i c80000000, as, bs, m1, m2;
5251 c80000000 = _mm_set1_epi32 (0x80000000);
5252 as = _mm_sub_epi32(a,c80000000);
5253 bs = _mm_sub_epi32(b,c80000000);
5254 m1 = _mm_cmpgt_epi32 (as, bs);
5255 m2 = _mm_cmpeq_epi32 (as, bs);
5256 return _mm_or_si128 ( m1, m2);
5257#endif
5258}
5259
5260//**********************Vector compare less-than or equal******************************
5261//***************************************************************************************
//IA SIMD has no less-than-or-equal comparison for integers, so the following tricks are needed
5263
5264_NEON2SSESTORAGE uint8x8_t vcle_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
_NEON2SSE_INLINE uint8x8_t vcle_s8(int8x8_t a, int8x8_t b)
5266{
5267 int8x8_t res64;
5268 return64(vcleq_s8(_pM128i(a), _pM128i(b)));
5269}
5270
5271
5272_NEON2SSESTORAGE uint16x4_t vcle_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
_NEON2SSE_INLINE uint16x4_t vcle_s16(int16x4_t a, int16x4_t b)
5274{
5275 int16x4_t res64;
5276 return64(vcleq_s16(_pM128i(a), _pM128i(b)));
5277}
5278
5279
5280_NEON2SSESTORAGE uint32x2_t vcle_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vcle_s32(int32x2_t a, int32x2_t b)
5282{
5283 int32x2_t res64;
5284 return64(vcleq_s32(_pM128i(a), _pM128i(b)));
5285}
5286
5287
5288_NEON2SSESTORAGE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0?
5289_NEON2SSE_INLINE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b)
5290{
5291 uint32x2_t res64;
5292 __m128 res;
5293 res = _mm_cmple_ps(_pM128(a),_pM128(b));
5294 return64f(res);
5295}
5296
5297_NEON2SSE_GLOBAL uint8x8_t vcle_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
5298#define vcle_u8(a,b) vcge_u8(b,a)
5299
5300
5301_NEON2SSE_GLOBAL uint16x4_t vcle_u16(uint16x4_t a, uint16x4_t b); // VCGE.s16 d0, d0, d0
5302#define vcle_u16(a,b) vcge_u16(b,a)
5303
5304
5305_NEON2SSE_GLOBAL uint32x2_t vcle_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
5306#define vcle_u32(a,b) vcge_u32(b,a)
5307
5308_NEON2SSESTORAGE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
5309_NEON2SSE_INLINE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0
5310{
5311 __m128i c1, res;
5312 c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff....
5313 res = _mm_cmpgt_epi8 ( a, b);
    return _mm_andnot_si128 (res, c1); //invert the cmpgt result to get less-than-or-equal
5315}
5316
5317_NEON2SSESTORAGE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
5318_NEON2SSE_INLINE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0
5319{
5320 __m128i c1, res;
5321 c1 = _mm_cmpeq_epi16 (a,a); //all ones 0xff....
5322 res = _mm_cmpgt_epi16 ( a, b);
5323 return _mm_andnot_si128 (res, c1);
5324}
5325
5326_NEON2SSESTORAGE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
5327_NEON2SSE_INLINE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0
5328{
5329 __m128i c1, res;
5330 c1 = _mm_cmpeq_epi32 (a,a); //all ones 0xff....
5331 res = _mm_cmpgt_epi32 ( a, b);
5332 return _mm_andnot_si128 (res, c1);
5333}
5334
5335_NEON2SSESTORAGE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
5336_NEON2SSE_INLINE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b)
5337{
5338 __m128 res;
5339 res = _mm_cmple_ps(a,b);
5340 return *(__m128i*)&res;
5341}
5342
5343_NEON2SSESTORAGE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
5344#ifdef USE_SSE4
5345 _NEON2SSE_INLINE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
5346 {
        //no unsigned byte comparison in SSE, only signed is available, so a trick is needed
5348 __m128i cmp;
5349 cmp = _mm_min_epu8(a, b);
5350 return _mm_cmpeq_epi8(cmp, a); //a<=b
5351 }
5352#else
5353 _NEON2SSE_INLINE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
5354 {
5355 return vcgeq_u8(b, a);
5356 }
5357#endif
5358
5359_NEON2SSESTORAGE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0
5360#ifdef USE_SSE4
5361 _NEON2SSE_INLINE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
5362 {
        //no unsigned short comparison in SSE, only signed is available, so a trick is needed
5364 __m128i cmp;
5365 cmp = _mm_min_epu16(a, b);
5366 return _mm_cmpeq_epi16(cmp, a); //a<=b
5367 }
5368#else
5369 _NEON2SSE_INLINE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
5370 {
5371 return vcgeq_u16(b, a);
5372 }
5373#endif
5374
5375_NEON2SSESTORAGE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
5376#ifdef USE_SSE4
5377 _NEON2SSE_INLINE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
5378 {
        //no unsigned int comparison in SSE, only signed is available, so a trick is needed
5380 __m128i cmp;
5381 cmp = _mm_min_epu32(a, b);
5382 return _mm_cmpeq_epi32(cmp, a); //a<=b
5383 }
5384#else
5385 _NEON2SSE_INLINE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
5386 {
5387 return vcgeq_u32(b, a);
5388 }
5389#endif
5390
5391
5392//****** Vector compare greater-than ******************************************
5393//**************************************************************************
5394_NEON2SSESTORAGE uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
_NEON2SSE_INLINE uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b)
5396{
5397 int8x8_t res64;
5398 return64(_mm_cmpgt_epi8(_pM128i(a),_pM128i(b)));
5399}
5400
5401
5402_NEON2SSESTORAGE uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
_NEON2SSE_INLINE uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b)
5404{
5405 int16x4_t res64;
5406 return64(_mm_cmpgt_epi16(_pM128i(a),_pM128i(b)));
5407}
5408
5409
5410_NEON2SSESTORAGE uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b)
5412{
5413 int32x2_t res64;
5414 return64(_mm_cmpgt_epi32(_pM128i(a),_pM128i(b)));
5415}
5416
5417
5418_NEON2SSESTORAGE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
5419_NEON2SSE_INLINE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b)
5420{
5421 uint32x2_t res64;
5422 __m128 res;
    res = _mm_cmpgt_ps(_pM128(a),_pM128(b)); //only the first 2 lanes are meaningful
5424 return64f(res);
5425}
5426
5427_NEON2SSESTORAGE uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
5428_NEON2SSE_INLINE uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b)
5429{
5430 uint8x8_t res64;
5431 return64(vcgtq_u8(_pM128i(a), _pM128i(b)));
5432}
5433
5434
5435_NEON2SSESTORAGE uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b); // VCGT.s16 d0, d0, d0
5436_NEON2SSE_INLINE uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b)
5437{
5438 uint16x4_t res64;
5439 return64(vcgtq_u16(_pM128i(a), _pM128i(b)));
5440}
5441
5442
5443_NEON2SSESTORAGE uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
5444_NEON2SSE_INLINE uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b)
5445{
5446 uint32x2_t res64;
5447 return64(vcgtq_u32(_pM128i(a), _pM128i(b)));
5448}
5449
5450
5451_NEON2SSE_GLOBAL uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
5452#define vcgtq_s8 _mm_cmpgt_epi8
5453
5454_NEON2SSE_GLOBAL uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
5455#define vcgtq_s16 _mm_cmpgt_epi16
5456
5457_NEON2SSE_GLOBAL uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
5458#define vcgtq_s32 _mm_cmpgt_epi32
5459
5460_NEON2SSESTORAGE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
5461_NEON2SSE_INLINE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b)
5462{
5463 __m128 res;
    res = _mm_cmpgt_ps(a,b);
5465 return *(__m128i*)&res;
5466}
5467
5468_NEON2SSESTORAGE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
5469_NEON2SSE_INLINE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b) // VCGT.U8 q0, q0, q0
5470{
    //no unsigned byte comparison, only signed is available, so a trick is needed
5472 __m128i c128, as, bs;
5473 c128 = _mm_set1_epi8(-128); //(int8_t)0x80
5474 as = _mm_sub_epi8(a, c128);
5475 bs = _mm_sub_epi8(b, c128);
5476 return _mm_cmpgt_epi8(as, bs);
5477}
5478
5479_NEON2SSESTORAGE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0
5480_NEON2SSE_INLINE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b) // VCGT.s16 q0, q0, q0
5481{
    //no unsigned short comparison, only signed is available, so a trick is needed
5483 __m128i c8000, as, bs;
5484 c8000 = _mm_set1_epi16(-32768); //(int16_t)0x8000
5485 as = _mm_sub_epi16(a, c8000);
5486 bs = _mm_sub_epi16(b, c8000);
5487 return _mm_cmpgt_epi16(as, bs);
5488}
5489
5490_NEON2SSESTORAGE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
5491_NEON2SSE_INLINE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b) // VCGT.U32 q0, q0, q0
5492{
    //no unsigned int comparison, only signed is available, so a trick is needed
5494 __m128i c80000000, as, bs;
5495 c80000000 = _mm_set1_epi32 (0x80000000);
5496 as = _mm_sub_epi32(a,c80000000);
5497 bs = _mm_sub_epi32(b,c80000000);
5498 return _mm_cmpgt_epi32 ( as, bs);
5499}
5500
5501//********************* Vector compare less-than **************************
5502//*************************************************************************
5503_NEON2SSE_GLOBAL uint8x8_t vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
5504#define vclt_s8(a,b) vcgt_s8(b,a) //swap the arguments!!
5505
5506
5507_NEON2SSE_GLOBAL uint16x4_t vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
5508#define vclt_s16(a,b) vcgt_s16(b,a) //swap the arguments!!
5509
5510
5511_NEON2SSE_GLOBAL uint32x2_t vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
5512#define vclt_s32(a,b) vcgt_s32(b,a) //swap the arguments!!
5513
5514
5515_NEON2SSE_GLOBAL uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
5516#define vclt_f32(a,b) vcgt_f32(b, a) //swap the arguments!!
5517
5518_NEON2SSE_GLOBAL uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
5519#define vclt_u8(a,b) vcgt_u8(b,a) //swap the arguments!!
5520
5521_NEON2SSE_GLOBAL uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.s16 d0, d0, d0
5522#define vclt_u16(a,b) vcgt_u16(b,a) //swap the arguments!!
5523
5524_NEON2SSE_GLOBAL uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
5525#define vclt_u32(a,b) vcgt_u32(b,a) //swap the arguments!!
5526
5527_NEON2SSE_GLOBAL uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
5528#define vcltq_s8(a,b) vcgtq_s8(b, a) //swap the arguments!!
5529
5530_NEON2SSE_GLOBAL uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
5531#define vcltq_s16(a,b) vcgtq_s16(b, a) //swap the arguments!!
5532
5533_NEON2SSE_GLOBAL uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
5534#define vcltq_s32(a,b) vcgtq_s32(b, a) //swap the arguments!!
5535
5536_NEON2SSE_GLOBAL uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
5537#define vcltq_f32(a,b) vcgtq_f32(b, a) //swap the arguments!!
5538
5539_NEON2SSE_GLOBAL uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
5540#define vcltq_u8(a,b) vcgtq_u8(b, a) //swap the arguments!!
5541
5542_NEON2SSE_GLOBAL uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0
5543#define vcltq_u16(a,b) vcgtq_u16(b, a) //swap the arguments!!
5544
5545_NEON2SSE_GLOBAL uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
5546#define vcltq_u32(a,b) vcgtq_u32(b, a) //swap the arguments!!
5547
5548//*****************Vector compare absolute greater-than or equal ************
5549//***************************************************************************
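//The absolute values are taken by clearing the IEEE-754 sign bit with a 0x7fffffff mask before the float compare.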
5550_NEON2SSESTORAGE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
5551_NEON2SSE_INLINE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b)
5552{
5553 uint32x2_t res64;
5554 __m128i c7fffffff;
5555 __m128 a0, b0;
5556 c7fffffff = _mm_set1_epi32 (0x7fffffff);
5557 a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
5558 b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
5559 a0 = _mm_cmpge_ps ( a0, b0);
5560 return64f(a0);
5561}
5562
5563_NEON2SSESTORAGE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
5564_NEON2SSE_INLINE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0
5565{
5566 __m128i c7fffffff;
5567 __m128 a0, b0;
5568 c7fffffff = _mm_set1_epi32 (0x7fffffff);
5569 a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
5570 b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
5571 a0 = _mm_cmpge_ps ( a0, b0);
5572 return (*(__m128i*)&a0);
5573}
5574
5575//********Vector compare absolute less-than or equal ******************
5576//********************************************************************
5577_NEON2SSESTORAGE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
5578_NEON2SSE_INLINE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b)
5579{
5580 uint32x2_t res64;
5581 __m128i c7fffffff;
5582 __m128 a0, b0;
5583 c7fffffff = _mm_set1_epi32 (0x7fffffff);
5584 a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
5585 b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
5586 a0 = _mm_cmple_ps (a0, b0);
5587 return64f(a0);
5588}
5589
5590_NEON2SSESTORAGE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
5591_NEON2SSE_INLINE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0
5592{
5593 __m128i c7fffffff;
5594 __m128 a0, b0;
5595 c7fffffff = _mm_set1_epi32 (0x7fffffff);
5596 a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
5597 b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
5598 a0 = _mm_cmple_ps (a0, b0);
5599 return (*(__m128i*)&a0);
5600}
5601
5602//******** Vector compare absolute greater-than ******************
5603//******************************************************************
5604_NEON2SSESTORAGE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
5605_NEON2SSE_INLINE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b)
5606{
5607 uint32x2_t res64;
5608 __m128i c7fffffff;
5609 __m128 a0, b0;
5610 c7fffffff = _mm_set1_epi32 (0x7fffffff);
5611 a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
5612 b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
5613 a0 = _mm_cmpgt_ps (a0, b0);
5614 return64f(a0);
5615}
5616
5617_NEON2SSESTORAGE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
5618_NEON2SSE_INLINE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0
5619{
5620 __m128i c7fffffff;
5621 __m128 a0, b0;
5622 c7fffffff = _mm_set1_epi32 (0x7fffffff);
5623 a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
5624 b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
5625 a0 = _mm_cmpgt_ps (a0, b0);
5626 return (*(__m128i*)&a0);
5627}
5628
5629//***************Vector compare absolute less-than ***********************
5630//*************************************************************************
5631_NEON2SSESTORAGE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
5632_NEON2SSE_INLINE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b)
5633{
5634 uint32x2_t res64;
5635 __m128i c7fffffff;
5636 __m128 a0, b0;
5637 c7fffffff = _mm_set1_epi32 (0x7fffffff);
5638 a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
5639 b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
5640 a0 = _mm_cmplt_ps (a0, b0);
5641 return64f(a0);
5642}
5643
5644_NEON2SSESTORAGE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
5645_NEON2SSE_INLINE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0
5646{
5647 __m128i c7fffffff;
5648 __m128 a0, b0;
5649 c7fffffff = _mm_set1_epi32 (0x7fffffff);
5650 a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
5651 b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
5652 a0 = _mm_cmplt_ps (a0, b0);
5653 return (*(__m128i*)&a0);
5654}
5655
5656//*************************Vector test bits************************************
5657//*****************************************************************************
/*VTST (Vector Test Bits) bitwise ANDs each element of the first vector with the
corresponding element of the second vector. If the result is not zero, the
corresponding element in the destination vector is set to all ones; otherwise it is
set to all zeros. */
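//e.g. with 8-bit lanes, a = 0x0F and b = 0xF1 give a & b = 0x01 != 0 -> 0xFF, while a = 0x0F and b = 0xF0 give 0x00.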
5662
5663_NEON2SSESTORAGE uint8x8_t vtst_s8(int8x8_t a, int8x8_t b); // VTST.8 d0, d0, d0
5664_NEON2SSE_INLINE uint8x8_t vtst_s8(int8x8_t a, int8x8_t b)
5665{
5666 int8x8_t res64;
5667 return64(vtstq_s8(_pM128i(a), _pM128i(b)));
5668}
5669
5670
5671_NEON2SSESTORAGE uint16x4_t vtst_s16(int16x4_t a, int16x4_t b); // VTST.16 d0, d0, d0
5672_NEON2SSE_INLINE uint16x4_t vtst_s16(int16x4_t a, int16x4_t b)
5673{
5674 int16x4_t res64;
5675 return64(vtstq_s16(_pM128i(a), _pM128i(b)));
5676}
5677
5678
5679_NEON2SSESTORAGE uint32x2_t vtst_s32(int32x2_t a, int32x2_t b); // VTST.32 d0, d0, d0
5680_NEON2SSE_INLINE uint32x2_t vtst_s32(int32x2_t a, int32x2_t b)
5681{
5682 int32x2_t res64;
5683 return64(vtstq_s32(_pM128i(a), _pM128i(b)));
5684}
5685
5686
5687_NEON2SSE_GLOBAL uint8x8_t vtst_u8(uint8x8_t a, uint8x8_t b); // VTST.8 d0, d0, d0
5688#define vtst_u8 vtst_s8
5689
5690_NEON2SSE_GLOBAL uint16x4_t vtst_u16(uint16x4_t a, uint16x4_t b); // VTST.16 d0, d0, d0
5691#define vtst_u16 vtst_s16
5692
5693_NEON2SSE_GLOBAL uint32x2_t vtst_u32(uint32x2_t a, uint32x2_t b); // VTST.32 d0, d0, d0
5694#define vtst_u32 vtst_s32
5695
5696
5697_NEON2SSE_GLOBAL uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0
5698#define vtst_p8 vtst_u8
5699
5700_NEON2SSESTORAGE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0
5701_NEON2SSE_INLINE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b) // VTST.8 q0, q0, q0
5702{
5703 __m128i zero, one, res;
5704 zero = _mm_setzero_si128 ();
5705 one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
5706 res = _mm_and_si128 (a, b);
5707 res = _mm_cmpeq_epi8 (res, zero);
5708 return _mm_xor_si128(res, one); //invert result
5709}
5710
5711_NEON2SSESTORAGE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0
5712_NEON2SSE_INLINE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b) // VTST.16 q0, q0, q0
5713{
5714 __m128i zero, one, res;
5715 zero = _mm_setzero_si128 ();
5716 one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
5717 res = _mm_and_si128 (a, b);
5718 res = _mm_cmpeq_epi16 (res, zero);
5719 return _mm_xor_si128(res, one); //invert result
5720}
5721
5722_NEON2SSESTORAGE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0
5723_NEON2SSE_INLINE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b) // VTST.32 q0, q0, q0
5724{
5725 __m128i zero, one, res;
5726 zero = _mm_setzero_si128 ();
5727 one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
5728 res = _mm_and_si128 (a, b);
5729 res = _mm_cmpeq_epi32 (res, zero);
5730 return _mm_xor_si128(res, one); //invert result
5731}
5732
5733_NEON2SSE_GLOBAL uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0
5734#define vtstq_u8 vtstq_s8
5735
5736_NEON2SSE_GLOBAL uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0
5737#define vtstq_u16 vtstq_s16
5738
5739_NEON2SSE_GLOBAL uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0
5740#define vtstq_u32 vtstq_s32
5741
5742_NEON2SSE_GLOBAL uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0
5743#define vtstq_p8 vtstq_u8
5744
5745//****************** Absolute difference ********************
5746//*** Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |*****
5747//************************************************************
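//The result keeps only the lane-sized low bits of |Va[i] - Vb[i]|:
//e.g. for int8 lanes |100 - (-100)| = 200 is returned as 0xC8, which is what the compare/select code below produces.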
5748_NEON2SSESTORAGE int8x8_t vabd_s8(int8x8_t a, int8x8_t b); // VABD.S8 d0,d0,d0
5749_NEON2SSE_INLINE int8x8_t vabd_s8(int8x8_t a, int8x8_t b)
5750{
5751 int8x8_t res64;
5752 return64(vabdq_s8(_pM128i(a), _pM128i(b)));
5753}
5754
5755_NEON2SSESTORAGE int16x4_t vabd_s16(int16x4_t a, int16x4_t b); // VABD.S16 d0,d0,d0
5756_NEON2SSE_INLINE int16x4_t vabd_s16(int16x4_t a, int16x4_t b)
5757{
5758 int16x4_t res64;
5759 return64(vabdq_s16(_pM128i(a), _pM128i(b)));
5760}
5761
5762_NEON2SSESTORAGE int32x2_t vabd_s32(int32x2_t a, int32x2_t b); // VABD.S32 d0,d0,d0
5763_NEON2SSE_INLINE int32x2_t vabd_s32(int32x2_t a, int32x2_t b)
5764{//need to deal with an intermediate overflow
5765 int32x2_t res;
5766 res.m64_i32[0] = (a.m64_i32[0] > b.m64_i32[0]) ? a.m64_i32[0] - b.m64_i32[0]: b.m64_i32[0] - a.m64_i32[0];
5767 res.m64_i32[1] = (a.m64_i32[1] > b.m64_i32[1]) ? a.m64_i32[1] - b.m64_i32[1]: b.m64_i32[1] - a.m64_i32[1];
5768 return res;
5769}
5770
5771_NEON2SSESTORAGE uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b); // VABD.U8 d0,d0,d0
5772_NEON2SSE_INLINE uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b)
5773{
5774 uint8x8_t res64;
5775 return64(vabdq_u8(_pM128i(a), _pM128i(b)));
5776}
5777
5778_NEON2SSESTORAGE uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b); // VABD.s16 d0,d0,d0
5779_NEON2SSE_INLINE uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b)
5780{
5781 uint16x4_t res64;
5782 return64(vabdq_u16(_pM128i(a), _pM128i(b)));
5783}
5784
5785_NEON2SSESTORAGE uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b); // VABD.U32 d0,d0,d0
5786_NEON2SSE_INLINE uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b)
5787{
5788 uint32x2_t res64;
5789 return64(vabdq_u32(_pM128i(a), _pM128i(b)));
5790}
5791
5792_NEON2SSESTORAGE float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0
5793_NEON2SSE_INLINE float32x2_t vabd_f32(float32x2_t a, float32x2_t b)
5794{
5795 float32x4_t res;
5796 __m64_128 res64;
5797 res = vabdq_f32(_pM128(a), _pM128(b));
5798 _M64f(res64, res);
5799 return res64;
5800}
5801
5802_NEON2SSESTORAGE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0
5803_NEON2SSE_INLINE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b) // VABD.S8 q0,q0,q0
5804{ //need to deal with an intermediate overflow
5805 __m128i cmp, difab, difba;
5806 cmp = vcgtq_s8(a,b);
5807 difab = _mm_sub_epi8(a,b);
5808 difba = _mm_sub_epi8(b,a);
5809 difab = _mm_and_si128(cmp, difab);
5810 difba = _mm_andnot_si128(cmp, difba);
5811 return _mm_or_si128(difab, difba);
5812}
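//Note on the compare/select sequence above: a plain _mm_abs_epi8(_mm_sub_epi8(a, b)) would be wrong
//whenever the intermediate difference overflows 8 bits. For instance, with a lane holding a = 127 and
//b = -128 the difference 255 wraps to -1 and its absolute value is 1 instead of the expected bit
//pattern 0xff; selecting between a-b and b-a with the comparison mask reproduces the NEON result.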
5813
5814_NEON2SSESTORAGE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0
5815_NEON2SSE_INLINE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b) // VABD.S16 q0,q0,q0
5816{//need to deal with an intermediate overflow
5817 __m128i cmp, difab, difba;
5818 cmp = vcgtq_s16(a,b);
5819 difab = _mm_sub_epi16(a,b);
5820 difba = _mm_sub_epi16 (b,a);
5821 difab = _mm_and_si128(cmp, difab);
5822 difba = _mm_andnot_si128(cmp, difba);
5823 return _mm_or_si128(difab, difba);
5824}
5825
5826_NEON2SSESTORAGE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0
5827_NEON2SSE_INLINE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b) // VABD.S32 q0,q0,q0
5828{//need to deal with an intermediate overflow
5829 __m128i cmp, difab, difba;
5830 cmp = vcgtq_s32(a,b);
5831 difab = _mm_sub_epi32(a,b);
5832 difba = _mm_sub_epi32(b,a);
5833 difab = _mm_and_si128(cmp, difab);
5834 difba = _mm_andnot_si128(cmp, difba);
5835 return _mm_or_si128(difab, difba);
5836}
5837
5838_NEON2SSESTORAGE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0
5839_NEON2SSE_INLINE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b) //no abs for unsigned
5840{
5841 __m128i difab, difba;
5842 difab = _mm_subs_epu8(a,b);
5843 difba = _mm_subs_epu8 (b,a);
5844 return _mm_or_si128(difab, difba);
5845}
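//Note on the saturating subtractions above: for unsigned lanes one of _mm_subs_epu8(a,b) and
//_mm_subs_epu8(b,a) always saturates to zero, so OR-ing them gives |a - b| directly.
//E.g. (hypothetical lane values) a = 10, b = 250: the two results are 0 and 240, and 0 | 240 = 240 = |10 - 250|.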
5846
5847_NEON2SSESTORAGE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.s16 q0,q0,q0
5848_NEON2SSE_INLINE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b)
5849{
5850 __m128i difab, difba;
5851 difab = _mm_subs_epu16(a,b);
5852 difba = _mm_subs_epu16 (b,a);
5853 return _mm_or_si128(difab, difba);
5854}
5855
5856_NEON2SSESTORAGE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0
5857_NEON2SSE_INLINE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b)
5858{
5859 __m128i cmp, difab, difba;
5860 cmp = vcgtq_u32(a,b);
5861 difab = _mm_sub_epi32(a,b);
5862 difba = _mm_sub_epi32 (b,a);
5863 difab = _mm_and_si128(cmp, difab);
5864 difba = _mm_andnot_si128(cmp, difba);
5865 return _mm_or_si128(difab, difba);
5866}
5867
5868_NEON2SSESTORAGE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0
5869_NEON2SSE_INLINE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b) // VABD.F32 q0,q0,q0
5870{
5871 __m128i c1;
5872 __m128 res;
    c1 = _mm_set1_epi32(0x7fffffff); //mask that clears the IEEE-754 sign bit, i.e. takes the absolute value
5874 res = _mm_sub_ps (a, b);
5875 return _mm_and_ps (res, *(__m128*)&c1);
5876}
5877
5878//************ Absolute difference - long **************************
5879//********************************************************************
5880_NEON2SSESTORAGE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0
5881_NEON2SSE_INLINE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b) // VABDL.S8 q0,d0,d0
5882{
5883 __m128i a16, b16;
5884 a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
5885 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
5886 return vabdq_s16(a16, b16);
5887
5888}
5889
5890_NEON2SSESTORAGE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0
5891_NEON2SSE_INLINE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b) // VABDL.S16 q0,d0,d0
5892{
5893 __m128i a32, b32;
5894 a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
5895 b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
5896 return vabdq_s32(a32, b32);
5897}
5898
5899_NEON2SSESTORAGE int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0
5900_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabdl_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL)
5901{
5902 //no optimal SIMD solution, serial looks faster
5903 _NEON2SSE_ALIGN_16 int64_t res[2];
5904 if(a.m64_i32[0] > b.m64_i32[0]) res[0] = ( int64_t) a.m64_i32[0] - ( int64_t) b.m64_i32[0];
5905 else res[0] = ( int64_t) b.m64_i32[0] - ( int64_t) a.m64_i32[0];
5906 if(a.m64_i32[1] > b.m64_i32[1]) res[1] = ( int64_t) a.m64_i32[1] - ( int64_t) b.m64_i32[1];
5907 else res[1] = ( int64_t) b.m64_i32[1] - ( int64_t) a.m64_i32[1];
5908 return _mm_load_si128((__m128i*)res);
5909}
5910
5911_NEON2SSESTORAGE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0
5912_NEON2SSE_INLINE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b)
5913{
5914 __m128i res;
5915 res = vsubl_u8(a,b);
5916 return _mm_abs_epi16(res);
5917}
5918
5919_NEON2SSESTORAGE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.s16 q0,d0,d0
5920_NEON2SSE_INLINE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b)
5921{
5922 __m128i res;
5923 res = vsubl_u16(a,b);
5924 return _mm_abs_epi32(res);
5925}
5926
5927_NEON2SSESTORAGE uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0
5928_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
5929{
5930 _NEON2SSE_ALIGN_16 uint64_t res[2];
5931 if(a.m64_u32[0] > b.m64_u32[0]) res[0] = ( uint64_t) a.m64_u32[0] - ( uint64_t) b.m64_u32[0];
5932 else res[0] = ( uint64_t) b.m64_u32[0] - ( uint64_t) a.m64_u32[0];
5933 if(a.m64_u32[1] > b.m64_u32[1]) res[1] = ( uint64_t) a.m64_u32[1] - ( uint64_t) b.m64_u32[1];
5934 else res[1] = ( uint64_t) b.m64_u32[1] - ( uint64_t) a.m64_u32[1];
5935 return _mm_load_si128((__m128i*)res);
5936}
5937
5938//**********Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] | *************
5939//*********************************************************************************************
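//Illustrative example (hypothetical lane values): with Va = {100, ...}, Vb = {20, ...} and Vc = {50, ...}
//the first lane of the result is 100 + |20 - 50| = 130.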
5940_NEON2SSESTORAGE int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0
5941_NEON2SSE_INLINE int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c)
5942{
5943 int8x8_t res64;
5944 return64(vabaq_s8(_pM128i(a),_pM128i(b), _pM128i(c)));
5945}
5946
5947_NEON2SSESTORAGE int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0
5948_NEON2SSE_INLINE int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c)
5949{
5950 int16x4_t res64;
5951 return64(vabaq_s16(_pM128i(a), _pM128i(b), _pM128i(c)));
5952}
5953
5954_NEON2SSESTORAGE int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0
5955_NEON2SSE_INLINE int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c)
5956{
5957 int32x2_t res64;
5958 return64(vabaq_s32(_pM128i(a), _pM128i(b), _pM128i(c)));
5959}
5960
5961_NEON2SSESTORAGE uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0
5962_NEON2SSE_INLINE uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c)
5963{
    uint8x8_t res64;
5965 return64(vabaq_u8(_pM128i(a),_pM128i(b), _pM128i(c)));
5966}
5967
5968
5969_NEON2SSESTORAGE uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VABA.s16 d0,d0,d0
5970_NEON2SSE_INLINE uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c)
5971{
    uint16x4_t res64;
5973 return64(vabaq_u16(_pM128i(a), _pM128i(b), _pM128i(c)));
5974}
5975
5976_NEON2SSESTORAGE uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0
5977_NEON2SSE_INLINE uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c)
5978{
5979 uint32x2_t res64;
5980 return64(vabaq_u32(_pM128i(a), _pM128i(b), _pM128i(c)));
5981}
5982
5983_NEON2SSESTORAGE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0
5984_NEON2SSE_INLINE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VABA.S8 q0,q0,q0
5985{
5986 int8x16_t sub;
5987 sub = vabdq_s8(b, c);
5988 return vaddq_s8( a, sub);
5989}
5990
5991_NEON2SSESTORAGE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0
5992_NEON2SSE_INLINE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VABA.S16 q0,q0,q0
5993{
5994 int16x8_t sub;
5995 sub = vabdq_s16(b, c);
5996 return vaddq_s16( a, sub);
5997}
5998
5999_NEON2SSESTORAGE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0
6000_NEON2SSE_INLINE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VABA.S32 q0,q0,q0
6001{
6002 int32x4_t sub;
6003 sub = vabdq_s32(b, c);
6004 return vaddq_s32( a, sub);
6005}
6006
6007_NEON2SSESTORAGE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0
6008_NEON2SSE_INLINE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c)
6009{
6010 uint8x16_t sub;
6011 sub = vabdq_u8(b, c);
6012 return vaddq_u8( a, sub);
6013}
6014
6015_NEON2SSESTORAGE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.s16 q0,q0,q0
6016_NEON2SSE_INLINE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c)
6017{
6018 uint16x8_t sub;
6019 sub = vabdq_u16(b, c);
6020 return vaddq_u16( a, sub);
6021}
6022
6023_NEON2SSESTORAGE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0
6024_NEON2SSE_INLINE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c)
6025{
6026 uint32x4_t sub;
6027 sub = vabdq_u32(b, c);
6028 return vaddq_u32( a, sub);
6029}
6030
6031//************** Absolute difference and accumulate - long ********************************
6032//*************************************************************************************
6033_NEON2SSESTORAGE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0
6034_NEON2SSE_INLINE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VABAL.S8 q0,d0,d0
6035{
6036 __m128i b16, c16, res;
6037 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
6038 c16 = _MM_CVTEPI8_EPI16 (_pM128i(c)); //SSE4.1,
6039 res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) );
6040 return _mm_add_epi16 (a, res);
6041}
6042
6043_NEON2SSESTORAGE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0
6044_NEON2SSE_INLINE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VABAL.S16 q0,d0,d0
6045{
6046 __m128i b32, c32, res;
6047 b32 = _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1
6048 c32 = _MM_CVTEPI16_EPI32(_pM128i(c)); //SSE4.1
6049 res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) );
6050 return _mm_add_epi32 (a, res);
6051}
6052
6053_NEON2SSESTORAGE int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0
6054_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
6055{
6056 __m128i res;
6057 res = vabdl_s32(b,c);
6058 return _mm_add_epi64(a, res);
6059}
6060
6061_NEON2SSESTORAGE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0
6062_NEON2SSE_INLINE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c)
6063{
6064 __m128i b16, c16, res;
6065 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
6066 c16 = _MM_CVTEPU8_EPI16 (_pM128i(c)); //SSE4.1,
6067 res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) );
6068 return _mm_add_epi16 (a, res);
6069}
6070
6071_NEON2SSESTORAGE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.s16 q0,d0,d0
6072_NEON2SSE_INLINE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c)
6073{
6074 __m128i b32, c32, res;
6075 b32 = _MM_CVTEPU16_EPI32(_pM128i(b)); //SSE4.1
6076 c32 = _MM_CVTEPU16_EPI32(_pM128i(c)); //SSE4.1
6077 res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) );
6078 return _mm_add_epi32 (a, res);
6079}
6080
6081_NEON2SSESTORAGE uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0
6082_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
6083{
6084 __m128i res;
6085 res = vabdl_u32(b,c);
6086 return _mm_add_epi64(a, res);
6087}
6088
6089//***********************************************************************************
6090//**************** Maximum and minimum operations **********************************
6091//***********************************************************************************
6092//************* Maximum: vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i] *******
6093//***********************************************************************************
6094_NEON2SSESTORAGE int8x8_t vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0
6095_NEON2SSE_INLINE int8x8_t vmax_s8(int8x8_t a, int8x8_t b)
6096{
6097 int8x8_t res64;
6098 __m128i res;
6099 res = _MM_MAX_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
6100 return64(res);
6101}
6102
6103_NEON2SSESTORAGE int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0
6104_NEON2SSE_INLINE int16x4_t vmax_s16(int16x4_t a, int16x4_t b)
6105{
6106 int16x4_t res64;
6107 return64(_mm_max_epi16(_pM128i(a),_pM128i(b)));
6108}
6109
6110_NEON2SSESTORAGE int32x2_t vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0
6111_NEON2SSE_INLINE int32x2_t vmax_s32(int32x2_t a, int32x2_t b)
6112{
6113 int32x2_t res64;
6114 __m128i res;
6115 res = _MM_MAX_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
6116 return64(res);
6117}
6118
6119_NEON2SSESTORAGE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0
6120_NEON2SSE_INLINE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b)
6121{
6122 uint8x8_t res64;
6123 return64(_mm_max_epu8(_pM128i(a),_pM128i(b)));
6124}
6125
6126
6127_NEON2SSESTORAGE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.s16 d0,d0,d0
6128_NEON2SSE_INLINE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b)
6129{
6130 uint16x4_t res64;
6131 return64(_MM_MAX_EPU16(_pM128i(a),_pM128i(b)));
6132}
6133
6134
6135_NEON2SSESTORAGE uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0
6136_NEON2SSE_INLINE uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b)
6137{
6138 uint32x2_t res64;
6139 __m128i res;
    res = _MM_MAX_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits; may not be as efficient as a serial solution
6141 return64(res);
6142}
6143
6144_NEON2SSESTORAGE float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0
6145_NEON2SSE_INLINE float32x2_t vmax_f32(float32x2_t a, float32x2_t b)
6146{
    //serial solution looks faster than the SIMD one
6148 float32x2_t res;
6149 res.m64_f32[0] = (a.m64_f32[0] > b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0];
6150 res.m64_f32[1] = (a.m64_f32[1] > b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1];
6151 return res;
6152}
6153
6154_NEON2SSE_GLOBAL int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0
6155#define vmaxq_s8 _MM_MAX_EPI8 //SSE4.1
6156
6157_NEON2SSE_GLOBAL int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0
6158#define vmaxq_s16 _mm_max_epi16
6159
6160_NEON2SSE_GLOBAL int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0
6161#define vmaxq_s32 _MM_MAX_EPI32 //SSE4.1
6162
6163_NEON2SSE_GLOBAL uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0
6164#define vmaxq_u8 _mm_max_epu8
6165
6166_NEON2SSE_GLOBAL uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.s16 q0,q0,q0
6167#define vmaxq_u16 _MM_MAX_EPU16 //SSE4.1
6168
6169_NEON2SSE_GLOBAL uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0
6170#define vmaxq_u32 _MM_MAX_EPU32 //SSE4.1
6171
6172
6173_NEON2SSE_GLOBAL float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0
6174#define vmaxq_f32 _mm_max_ps
6175
6176
6177_NEON2SSE_GLOBAL float64x2_t vmaxq_f64(float64x2_t a, float64x2_t b); // VMAX.F64 q0,q0,q0
6178#define vmaxq_f64 _mm_max_pd
6179
6180
6181//*************** Minimum: vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i] ********************************
6182//***********************************************************************************************************
6183_NEON2SSESTORAGE int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0
6184_NEON2SSE_INLINE int8x8_t vmin_s8(int8x8_t a, int8x8_t b)
6185{
6186 int8x8_t res64;
6187 __m128i res;
6188 res = _MM_MIN_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
6189 return64(res);
6190}
6191
6192_NEON2SSESTORAGE int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0
6193_NEON2SSE_INLINE int16x4_t vmin_s16(int16x4_t a, int16x4_t b)
6194{
6195 int16x4_t res64;
6196 return64(_mm_min_epi16(_pM128i(a),_pM128i(b)));
6197}
6198
6199
6200_NEON2SSESTORAGE int32x2_t vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0
6201_NEON2SSE_INLINE int32x2_t vmin_s32(int32x2_t a, int32x2_t b)
6202{
6203 int32x2_t res64;
6204 __m128i res;
6205 res = _MM_MIN_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
6206 return64(res);
6207}
6208
6209_NEON2SSESTORAGE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0
6210_NEON2SSE_INLINE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b)
6211{
6212 uint8x8_t res64;
6213 return64(_mm_min_epu8(_pM128i(a),_pM128i(b)));
6214}
6215
6216
6217_NEON2SSESTORAGE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.s16 d0,d0,d0
6218_NEON2SSE_INLINE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b)
6219{
6220 uint16x4_t res64;
6221 return64(_MM_MIN_EPU16(_pM128i(a),_pM128i(b)));
6222}
6223
6224
6225_NEON2SSESTORAGE uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0
6226_NEON2SSE_INLINE uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b)
6227{
6228 uint32x2_t res64;
6229 __m128i res;
    res = _MM_MIN_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits; may not be as efficient as a serial solution
6231 return64(res);
6232}
6233
6234_NEON2SSESTORAGE float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0
6235_NEON2SSE_INLINE float32x2_t vmin_f32(float32x2_t a, float32x2_t b)
6236{
    //serial solution looks faster than the SIMD one
6238 float32x2_t res;
6239 res.m64_f32[0] = (a.m64_f32[0] < b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0];
6240 res.m64_f32[1] = (a.m64_f32[1] < b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1];
6241 return res;
6242}
6243
6244_NEON2SSE_GLOBAL int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0
6245#define vminq_s8 _MM_MIN_EPI8 //SSE4.1
6246
6247_NEON2SSE_GLOBAL int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0
6248#define vminq_s16 _mm_min_epi16
6249
6250_NEON2SSE_GLOBAL int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0
6251#define vminq_s32 _MM_MIN_EPI32 //SSE4.1
6252
6253_NEON2SSE_GLOBAL uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0
6254#define vminq_u8 _mm_min_epu8
6255
6256_NEON2SSE_GLOBAL uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.s16 q0,q0,q0
6257#define vminq_u16 _MM_MIN_EPU16 //SSE4.1
6258
6259_NEON2SSE_GLOBAL uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0
6260#define vminq_u32 _MM_MIN_EPU32 //SSE4.1
6261
6262_NEON2SSE_GLOBAL float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0
6263#define vminq_f32 _mm_min_ps
6264
6265
6266_NEON2SSE_GLOBAL float64x2_t vminq_f64(float64x2_t a, float64x2_t b); // VMIN.F64 q0,q0,q0
6267#define vminq_f64 _mm_min_pd
6268
6269
6270//************* Pairwise addition operations. **************************************
6271//************************************************************************************
6272//Pairwise add - adds adjacent pairs of elements of two vectors, and places the results in the destination vector
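//The pair sums of the first operand land in the low half of the result and those of the second operand
//in the high half, e.g. vpadd_s16(a, b) = {a0+a1, a2+a3, b0+b1, b2+b3} (lane layout shown for illustration)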
6273_NEON2SSESTORAGE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0
6274_NEON2SSE_INLINE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b) // VPADD.I8 d0,d0,d0
6275{
6276 //no 8 bit hadd in IA32, need to go to 16 bit and then pack
6277 int8x8_t res64;
6278 __m128i a16, b16, res;
6279 a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
6280 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1
6281 res = _mm_hadd_epi16 (a16, b16);
6282 res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit, use low 64 bits
6283 return64(res);
6284}
6285
6286_NEON2SSESTORAGE int16x4_t vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0
6287_NEON2SSE_INLINE int16x4_t vpadd_s16(int16x4_t a, int16x4_t b)
6288{
6289 int16x4_t res64;
6290 __m128i hadd128;
6291 hadd128 = _mm_hadd_epi16 (_pM128i(a), _pM128i(b));
6292 hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
6293 return64(hadd128);
6294}
6295
6296
6297_NEON2SSESTORAGE int32x2_t vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0
6298_NEON2SSE_INLINE int32x2_t vpadd_s32(int32x2_t a, int32x2_t b)
6299{
6300 int32x2_t res64;
6301 __m128i hadd128;
6302 hadd128 = _mm_hadd_epi32 (_pM128i(a), _pM128i(b));
6303 hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
6304 return64(hadd128);
6305}
6306
6307
6308_NEON2SSESTORAGE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0
6309_NEON2SSE_INLINE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b) // VPADD.I8 d0,d0,d0
6310{
6311 // no 8 bit hadd in IA32, need to go to 16 bit and then pack
6312 uint8x8_t res64;
// no unsigned _mm_hadd_ functions in IA32, but 8-bit unsigned values fit within the 16-bit signed range, so it works
6314 __m128i mask8, a16, b16, res;
6315 mask8 = _mm_set1_epi16(0xff);
6316 a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1
6317 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1
6318 res = _mm_hadd_epi16 (a16, b16);
6319 res = _mm_and_si128(res, mask8); //to avoid saturation
6320 res = _mm_packus_epi16 (res,res); //use low 64 bits
6321 return64(res);
6322}
6323
6324_NEON2SSESTORAGE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0
6325_NEON2SSE_INLINE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b) // VPADD.I16 d0,d0,d0
6326{
    // solution may not be optimal; serial execution may be faster
6328 // no unsigned _mm_hadd_ functions in IA32, need to move from unsigned to signed
6329 uint16x4_t res64;
6330 __m128i c32767, cfffe, as, bs, res;
6331 c32767 = _mm_set1_epi16 (32767);
6332 cfffe = _mm_set1_epi16 (-2); //(int16_t)0xfffe
6333 as = _mm_sub_epi16 (_pM128i(a), c32767);
6334 bs = _mm_sub_epi16 (_pM128i(b), c32767);
6335 res = _mm_hadd_epi16 (as, bs);
6336 res = _mm_add_epi16 (res, cfffe);
6337 res = _mm_shuffle_epi32 (res, 0 | (2 << 2) | (1 << 4) | (3 << 6));
6338 return64(res);
6339}
6340
6341_NEON2SSESTORAGE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0
6342_NEON2SSE_INLINE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b) //serial may be faster
6343{
6344 //hadd doesn't work for unsigned values
6345 uint32x2_t res64;
6346 __m128i ab, ab_sh, res;
6347 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //a0 a1 b0 b1
6348 ab_sh = _mm_shuffle_epi32(ab, 1 | (0 << 2) | (3 << 4) | (2 << 6)); //a1, a0, b1, b0
6349 res = _mm_add_epi32(ab, ab_sh);
6350 res = _mm_shuffle_epi32(res, 0 | (2 << 2) | (1 << 4) | (3 << 6));
6351 return64(res);
6352}
6353
6354_NEON2SSESTORAGE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0
6355_NEON2SSE_INLINE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b)
6356{
6357 __m128 hadd128;
6358 __m64_128 res64;
6359 hadd128 = _mm_hadd_ps (_pM128(a), _pM128(b));
6360 hadd128 = _mm_shuffle_ps (hadd128, hadd128, _MM_SHUFFLE(3,1, 2, 0)); //use low 64 bits
6361 _M64f(res64, hadd128);
6362 return res64;
6363}
6364
6365
6366//************************** Long pairwise add **********************************
6367//*********************************************************************************
//Adds adjacent pairs of elements of a vector, sign- or zero-extends the results to twice their original width,
6369// and places the final results in the destination vector.
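//Illustrative example (hypothetical lane values): vpaddl_u8 on {250, 10, 3, 4, ...} yields the 16-bit lanes
//{260, 7, ...}; because the results are widened, the pair sums cannot overflow.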
6370
6371_NEON2SSESTORAGE int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0
6372_NEON2SSE_INLINE int16x4_t vpaddl_s8(int8x8_t a) // VPADDL.S8 d0,d0
6373{
6374 //no 8 bit hadd in IA32, need to go to 16 bit anyway
6375 __m128i a16;
6376 int16x4_t res64;
6377 a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
6378 a16 = _mm_hadd_epi16 (a16, a16); //use low 64 bits
6379 return64(a16);
6380}
6381
6382_NEON2SSESTORAGE int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0
6383_NEON2SSE_INLINE int32x2_t vpaddl_s16(int16x4_t a) // VPADDL.S16 d0,d0
6384{
    // solution may not be optimal; serial execution may be faster
6386 int32x2_t res64;
6387 __m128i r32_1;
6388 r32_1 = _MM_CVTEPI16_EPI32 (_pM128i(a));
6389 r32_1 = _mm_hadd_epi32(r32_1, r32_1); //use low 64 bits
6390 return64(r32_1);
6391}
6392
6393_NEON2SSESTORAGE int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0
6394_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vpaddl_s32(int32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster
6395{
6396 int64x1_t res;
6397 res.m64_i64[0] = (int64_t)a.m64_i32[0] + (int64_t)a.m64_i32[1];
6398 return res;
6399}
6400
6401_NEON2SSESTORAGE uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0
6402_NEON2SSE_INLINE uint16x4_t vpaddl_u8(uint8x8_t a) // VPADDL.U8 d0,d0
6403{
6404 // no 8 bit hadd in IA32, need to go to 16 bit
// no unsigned _mm_hadd_ functions in IA32, but 8-bit unsigned values fit within the 16-bit signed range, so it works
6406 uint16x4_t res64;
6407 __m128i a16;
6408 a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits
6409 a16 = _mm_hadd_epi16 (a16, a16); //use low 64 bits
6410 return64(a16);
6411}
6412
6413_NEON2SSESTORAGE uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.s16 d0,d0
6414_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpaddl_u16(uint16x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
6415{
6416 //serial solution looks faster than a SIMD one
6417 uint32x2_t res;
6418 res.m64_u32[0] = (uint32_t)a.m64_u16[0] + (uint32_t)a.m64_u16[1];
6419 res.m64_u32[1] = (uint32_t)a.m64_u16[2] + (uint32_t)a.m64_u16[3];
6420 return res;
6421}
6422
6423_NEON2SSESTORAGE uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0
6424_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vpaddl_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster
6425{
6426 uint64x1_t res;
6427 res.m64_u64[0] = (uint64_t)a.m64_u32[0] + (uint64_t)a.m64_u32[1];
6428 return res;
6429}
6430
6431_NEON2SSESTORAGE int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0
6432_NEON2SSE_INLINE int16x8_t vpaddlq_s8(int8x16_t a) // VPADDL.S8 q0,q0
6433{
6434 //no 8 bit hadd in IA32, need to go to 16 bit
6435 __m128i r16_1, r16_2;
6436 r16_1 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1
    //swap the high and low halves of a to process the remaining data
6438 r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
6439 r16_2 = _MM_CVTEPI8_EPI16 (r16_2);
6440 return _mm_hadd_epi16 (r16_1, r16_2);
6441}
6442
6443_NEON2SSESTORAGE int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0
6444_NEON2SSE_INLINE int32x4_t vpaddlq_s16(int16x8_t a) // VPADDL.S16 q0,q0
6445{
    //widen the 16-bit lanes to 32 bit and use the 32-bit horizontal add
6447 __m128i r32_1, r32_2;
6448 r32_1 = _MM_CVTEPI16_EPI32(a);
    //swap the high and low halves of a to process the remaining data
6450 r32_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
6451 r32_2 = _MM_CVTEPI16_EPI32 (r32_2);
6452 return _mm_hadd_epi32 (r32_1, r32_2);
6453}
6454
6455_NEON2SSESTORAGE int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0
6456_NEON2SSE_INLINE int64x2_t vpaddlq_s32(int32x4_t a)
6457{
6458 __m128i top, bot;
6459 bot = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
6460 bot = _MM_CVTEPI32_EPI64(bot);
6461 top = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 1));
6462 top = _MM_CVTEPI32_EPI64(top);
6463 return _mm_add_epi64(top, bot);
6464}
6465
6466_NEON2SSESTORAGE uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0
6467_NEON2SSE_INLINE uint16x8_t vpaddlq_u8(uint8x16_t a) // VPADDL.U8 q0,q0
6468{
6469 const __m128i ff = _mm_set1_epi16(0xFF);
6470 __m128i low = _mm_and_si128(a, ff);
6471 __m128i high = _mm_srli_epi16(a, 8);
6472 return _mm_add_epi16(low, high);
6473}
6474
6475#ifdef USE_SSE4
6476_NEON2SSESTORAGE uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.s16 q0,q0
6477_NEON2SSE_INLINE uint32x4_t vpaddlq_u16(uint16x8_t a)
6478{
6479 const __m128i zero = _mm_setzero_si128();
    __m128i low = _mm_blend_epi16(zero, a, 0x55); // 0b01010101
6481 __m128i high = _mm_srli_epi32(a, 16);
6482 return _mm_add_epi32(low, high);
6483}
6484
6485_NEON2SSESTORAGE uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
6486_NEON2SSE_INLINE uint64x2_t vpaddlq_u32(uint32x4_t a)
6487{
6488 const __m128i zero = _mm_setzero_si128();
6489 __m128i low = _mm_blend_epi16(zero, a, 0x33); // 0b00110011
6490 __m128i high = _mm_srli_epi64(a, 32);
6491 return _mm_add_epi64(low, high);
6492}
6493#else
6494_NEON2SSESTORAGE uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.s16 q0,q0
6495_NEON2SSE_INLINE uint32x4_t vpaddlq_u16(uint16x8_t a)
6496{
6497 const __m128i ff = _mm_set1_epi32(0xFFFF);
6498 __m128i low = _mm_and_si128(a, ff);
6499 __m128i high = _mm_srli_epi32(a, 16);
6500 return _mm_add_epi32(low, high);
6501}
6502
6503_NEON2SSESTORAGE uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
6504_NEON2SSE_INLINE uint64x2_t vpaddlq_u32(uint32x4_t a)
6505{
6506 const __m128i ff = _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF);
6507 __m128i low = _mm_and_si128(a, ff);
6508 __m128i high = _mm_srli_epi64(a, 32);
6509 return _mm_add_epi64(low, high);
6510}
6511#endif
6512
6513//************************ Long pairwise add and accumulate **************************
6514//****************************************************************************************
6515//VPADAL (Vector Pairwise Add and Accumulate Long) adds adjacent pairs of elements of a vector,
6516// and accumulates the values of the results into the elements of the destination (wide) vector
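//In other words Vr[i] = Va[i] + Vb[2*i] + Vb[2*i+1]; e.g. (hypothetical lane values) vpadal_s8 with
//a = {1000, ...} and b = {100, 27, ...} produces {1127, ...}.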
6517_NEON2SSESTORAGE int16x4_t vpadal_s8(int16x4_t a, int8x8_t b); // VPADAL.S8 d0,d0
6518_NEON2SSE_INLINE int16x4_t vpadal_s8(int16x4_t a, int8x8_t b)
6519{
6520 int16x4_t res64;
6521 return64(vpadalq_s8(_pM128i(a), _pM128i(b)));
6522}
6523
6524_NEON2SSESTORAGE int32x2_t vpadal_s16(int32x2_t a, int16x4_t b); // VPADAL.S16 d0,d0
6525_NEON2SSE_INLINE int32x2_t vpadal_s16(int32x2_t a, int16x4_t b)
6526{
6527 int32x2_t res64;
6528 return64(vpadalq_s16(_pM128i(a), _pM128i(b)));
6529}
6530
6531
6532_NEON2SSESTORAGE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0
6533_NEON2SSE_INLINE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b)
6534{
6535 int64x1_t res;
6536 res.m64_i64[0] = (int64_t)b.m64_i32[0] + (int64_t)b.m64_i32[1] + a.m64_i64[0];
6537 return res;
6538}
6539
6540_NEON2SSESTORAGE uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b); // VPADAL.U8 d0,d0
6541_NEON2SSE_INLINE uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b)
6542{
6543 uint16x4_t res64;
6544 return64(vpadalq_u8(_pM128i(a), _pM128i(b)));
6545}
6546
6547
6548_NEON2SSESTORAGE uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b); // VPADAL.s16 d0,d0
6549_NEON2SSE_INLINE uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b)
6550{
6551 uint32x2_t res64;
6552 return64(vpadalq_u16(_pM128i(a), _pM128i(b)));
6553}
6554
6555_NEON2SSESTORAGE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0
6556_NEON2SSE_INLINE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b)
6557{
6558 uint64x1_t res;
6559 res.m64_u64[0] = (uint64_t)b.m64_u32[0] + (uint64_t)b.m64_u32[1] + a.m64_u64[0];
6560 return res;
6561}
6562
6563_NEON2SSESTORAGE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0
6564_NEON2SSE_INLINE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b) // VPADAL.S8 q0,q0
6565{
6566 int16x8_t pad;
6567 pad = vpaddlq_s8(b);
6568 return _mm_add_epi16 (a, pad);
6569}
6570
6571_NEON2SSESTORAGE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0
6572_NEON2SSE_INLINE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b) // VPADAL.S16 q0,q0
6573{
6574 int32x4_t pad;
6575 pad = vpaddlq_s16(b);
6576 return _mm_add_epi32(a, pad);
6577}
6578
6579_NEON2SSESTORAGE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0
6580_NEON2SSE_INLINE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b)
6581{
6582 int64x2_t pad;
6583 pad = vpaddlq_s32(b);
6584 return _mm_add_epi64 (a, pad);
6585}
6586
6587_NEON2SSESTORAGE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0
6588_NEON2SSE_INLINE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b) // VPADAL.U8 q0,q0
6589{
6590 uint16x8_t pad;
6591 pad = vpaddlq_u8(b);
6592 return _mm_add_epi16 (a, pad);
6593}
6594
6595_NEON2SSESTORAGE uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.s16 q0,q0
6596_NEON2SSE_INLINE uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b)
6597{
6598 uint32x4_t pad;
6599 pad = vpaddlq_u16(b);
6600 return _mm_add_epi32(a, pad);
}
6602
6603_NEON2SSESTORAGE uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0
6604_NEON2SSE_INLINE uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b)
6605{
6606 uint64x2_t pad;
6607 pad = vpaddlq_u32(b);
6608 return _mm_add_epi64(a, pad);
6609}
6610
6611//********** Folding maximum *************************************
6612//*******************************************************************
6613//VPMAX (Vector Pairwise Maximum) compares adjacent pairs of elements in two vectors,
6614//and copies the larger of each pair into the corresponding element in the destination
6615// no corresponding functionality in IA32 SIMD, so we need to do the vertical comparison
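//The result layout matches vpadd, e.g. vpmax_s16(a, b) = {max(a0,a1), max(a2,a3), max(b0,b1), max(b2,b3)};
//the implementations below swap the members of each pair with a byte shuffle and take a vertical maximum of the two layouts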
6616_NEON2SSESTORAGE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0
6617_NEON2SSE_INLINE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b) // VPMAX.S8 d0,d0,d0
6618{
6619 int8x8_t res64;
6620 __m128i ab, ab1, max;
6621 _NEON2SSE_ALIGN_16 static const uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
6622 _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
6623 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical max finding
6625 max = _MM_MAX_EPI8 (ab, ab1); // SSE4.1
6626 max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data
6627 return64(max); //we need 64 bits only
6628}
6629
6630_NEON2SSESTORAGE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0
6631_NEON2SSE_INLINE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b) // VPMAX.S16 d0,d0,d0
6632{
    //solution may not be optimal compared with the serial one
6634 int16x4_t res64;
6635 __m128i ab, ab1, max;
    _NEON2SSE_ALIGN_16 static const int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of bytes is considered to be a 16-bit number
6637 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical max finding, use the 8-bit fn and the corresponding mask
6639 max = _mm_max_epi16 (ab, ab1);
6640 max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
6641 return64(max);
6642}
6643
6644_NEON2SSESTORAGE int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0
6645_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmax_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6646{
    //serial solution looks faster than the SIMD one
6648 int32x2_t res;
6649 res.m64_i32[0] = (a.m64_i32[0] < a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0];
6650 res.m64_i32[1] = (b.m64_i32[0] < b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0];
6651 return res;
6652}
6653
6654_NEON2SSESTORAGE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0
6655_NEON2SSE_INLINE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b) // VPMAX.U8 d0,d0,d0
6656{
6657 uint8x8_t res64;
6658 __m128i ab, ab1, max;
6659 _NEON2SSE_ALIGN_16 static const int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
6660 _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
6661 ab = _mm_unpacklo_epi64 (_pM128i(a), _pM128i(b)); //ab
    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical max finding
6663 max = _mm_max_epu8 (ab, ab1); // SSE4.1
6664 max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data
6665 return64(max);
6666}
6667
6668_NEON2SSESTORAGE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.s16 d0,d0,d0
6669_NEON2SSE_INLINE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b) // VPMAX.s16 d0,d0,d0
6670{
    //solution may not be optimal compared with the serial one
6672 uint16x4_t res64;
6673 __m128i ab, ab1, max;
    _NEON2SSE_ALIGN_16 static const uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of bytes is considered to be a 16-bit number
6675 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical max finding, use the 8-bit fn and the corresponding mask
6677 max = _MM_MAX_EPU16 (ab, ab1);
6678 max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
6679 return64(max);
6680}
6681
6682_NEON2SSESTORAGE uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0
6683_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6684{
    //serial solution looks faster than the SIMD one
6686 uint32x2_t res;
6687 res.m64_u32[0] = (a.m64_u32[0] < a.m64_u32[1]) ? a.m64_u32[1] : a.m64_u32[0];
6688 res.m64_u32[1] = (b.m64_u32[0] < b.m64_u32[1]) ? b.m64_u32[1] : b.m64_u32[0];
6689 return res;
6690}
6691
6692_NEON2SSESTORAGE float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0
6693_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmax_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6694{
    //serial solution looks faster than the SIMD one
6696 float32x2_t res;
6697 res.m64_f32[0] = (a.m64_f32[0] < a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0];
6698 res.m64_f32[1] = (b.m64_f32[0] < b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0];
6699 return res;
6700}
6701
6702// ***************** Folding minimum ****************************
6703// **************************************************************
6704//vpmin -> takes minimum of adjacent pairs
6705_NEON2SSESTORAGE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0
6706_NEON2SSE_INLINE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b) // VPMIN.S8 d0,d0,d0
6707{
6708 int8x8_t res64;
6709 __m128i ab, ab1, min;
6710 _NEON2SSE_ALIGN_16 static const uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
6711 _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
6712 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical min finding
6714 min = _MM_MIN_EPI8 (ab, ab1); // SSE4.1
6715 min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data
6716 return64(min);
6717}
6718
6719_NEON2SSESTORAGE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0
6720_NEON2SSE_INLINE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b) // VPMIN.S16 d0,d0,d0
6721{
    //solution may not be optimal compared with the serial one
6723 int16x4_t res64;
6724 __m128i ab, ab1, min;
    _NEON2SSE_ALIGN_16 static const int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of bytes is considered to be a 16-bit number
6726 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical min finding, use the 8-bit fn and the corresponding mask
6728 min = _mm_min_epi16 (ab, ab1);
6729 min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
6730 return64(min);
6731}
6732
6733_NEON2SSESTORAGE int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0
6734_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmin_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6735{
    //serial solution looks faster than the SIMD one
6737 int32x2_t res;
6738 res.m64_i32[0] = (a.m64_i32[0] > a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0];
6739 res.m64_i32[1] = (b.m64_i32[0] > b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0];
6740 return res;
6741}
6742
6743_NEON2SSESTORAGE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0
6744_NEON2SSE_INLINE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b) // VPMIN.U8 d0,d0,d0
6745{
6746 uint8x8_t res64;
6747 __m128i ab, ab1, min;
6748 _NEON2SSE_ALIGN_16 static const uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
6749 _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
6750 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical min finding
6752 min = _mm_min_epu8 (ab, ab1); // SSE4.1
6753 min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data
6754 return64(min);
6755}
6756
6757_NEON2SSESTORAGE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.s16 d0,d0,d0
6758_NEON2SSE_INLINE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b) // VPMIN.s16 d0,d0,d0
6759{
    //solution may not be optimal compared with the serial one
6761 uint16x4_t res64;
6762 __m128i ab, ab1, min;
    _NEON2SSE_ALIGN_16 static const uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of bytes is considered to be a 16-bit number
6764 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical min finding, use the 8-bit fn and the corresponding mask
6766 min = _MM_MIN_EPU16 (ab, ab1);
6767 min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
6768 return64(min);
6769}
6770
6771_NEON2SSESTORAGE uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0
6772_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6773{
    //serial solution looks faster than the SIMD one
6775 uint32x2_t res;
6776 res.m64_u32[0] = (a.m64_u32[0] > a.m64_u32[1]) ? a.m64_u32[1] : a.m64_u32[0];
6777 res.m64_u32[1] = (b.m64_u32[0] > b.m64_u32[1]) ? b.m64_u32[1] : b.m64_u32[0];
6778 return res;
6779}
6780
6781_NEON2SSESTORAGE float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0
6782_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmin_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6783{
    //serial solution looks faster than the SIMD one
6785 float32x2_t res;
6786 res.m64_f32[0] = (a.m64_f32[0] > a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0];
6787 res.m64_f32[1] = (b.m64_f32[0] > b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0];
6788 return res;
6789}
6790
6791//***************************************************************
6792//*********** Reciprocal/Sqrt ************************************
6793//***************************************************************
6794//****************** Reciprocal estimate *******************************
6795//the ARM NEON and x86 SIMD results may be slightly different
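//(_mm_rcp_ps guarantees a relative error of at most 1.5*2^-12, while the NEON estimate is quantized to
//steps of 1/256, as in the serial code below; hence the possible small differences.)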
6796_NEON2SSESTORAGE float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0
6797_NEON2SSE_INLINE float32x2_t vrecpe_f32(float32x2_t a) //use low 64 bits
6798{
6799 float32x4_t res;
6800 __m64_128 res64;
6801 res = _mm_rcp_ps(_pM128(a));
6802 _M64f(res64, res);
6803 return res64;
6804}
6805
6806_NEON2SSESTORAGE uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0
6807_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrecpe_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
6808{
6809 //Input is fixed point number!!! No reciprocal for ints in IA32 available
6810 uint32x2_t res;
6811 float resf, r;
6812 int i, q, s;
6813 for (i =0; i<2; i++){
6814 if((a.m64_u32[i] & 0x80000000) == 0) {
6815 res.m64_u32[i] = 0xffffffff;
6816 }else{
6817 resf = (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31)));
6818 q = (int)(resf * 512.0f); /* a in units of 1/512 rounded down */
6819 r = (float)(1.0f / (((float)q + 0.5f) / 512.0f)); /* reciprocal r */
6820 s = (int)(256.0f * r + 0.5f); /* r in units of 1/256 rounded to nearest */
6821 r = (float)s / 256.0f;
6822 res.m64_u32[i] = (uint32_t)(r * (uint32_t)(1 << 31));
6823 }
6824 }
6825 return res;
6826}
6827
6828_NEON2SSE_GLOBAL float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0
6829#define vrecpeq_f32 _mm_rcp_ps
6830
6831
6832_NEON2SSESTORAGE uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0
6833_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrecpeq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
6834{
6835 //Input is fixed point number!!!
6836 //We implement the recip_estimate function as described in ARMv7 reference manual (VRECPE instruction) but use float instead of double
6837 _NEON2SSE_ALIGN_16 uint32_t atmp[4];
6838 _NEON2SSE_ALIGN_16 uint32_t res[4];
6839 _NEON2SSE_ALIGN_16 static const uint32_t c80000000[4] = {0x80000000,0x80000000, 0x80000000,0x80000000};
6840 float resf, r;
6841 int i, q, s;
6842 __m128i res128, mask, zero;
6843 _mm_store_si128((__m128i*)atmp, a);
6844 zero = _mm_setzero_si128();
6845 for (i =0; i<4; i++){
6846 resf = (atmp[i] * (0.5f / (uint32_t) (1 << 31))); // 2.3283064365386963E-10 ~(0.5f / (uint32_t) (1 << 31))
6847 q = (int)(resf * 512.0f); /* a in units of 1/512 rounded down */
6848 r = 1.0f / (((float)q + 0.5f) / 512.0f); /* reciprocal r */
6849 s = (int)(256.0f * r + 0.5f); /* r in units of 1/256 rounded to nearest */
6850 r = (float)s / 256.0f;
6851 res[i] = (uint32_t) (r * (((uint32_t)1) << 31) );
6852 }
6853 res128 = _mm_load_si128((__m128i*)res);
6854 mask = _mm_and_si128(a, *(__m128i*)c80000000);
6855 mask = _mm_cmpeq_epi32(zero, mask); //0xffffffff if atmp[i] <= 0x7fffffff
6856 return _mm_or_si128(res128, mask);
6857}
6858
6859//**********Reciprocal square root estimate ****************
6860//**********************************************************
//no reciprocal square root for integers is available in IA32, nor an unsigned int to float lanes conversion, so a serial solution looks faster
//but the particular implementation of vrsqrte_u32 may vary across ARM compilers
//the ARM NEON and x86 SIMD results may be slightly different
6864_NEON2SSESTORAGE float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0
6865_NEON2SSE_INLINE float32x2_t vrsqrte_f32(float32x2_t a) //use low 64 bits
6866{
6867 float32x4_t res;
6868 __m64_128 res64;
6869 res = _mm_rsqrt_ps(_pM128(a));
6870 _M64f(res64, res);
6871 return res64;
6872}
6873
6874_NEON2SSESTORAGE uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0
6875_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrsqrte_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
6876{
6877 // Input is fixed point number!!!
6878 // We implement the recip_sqrt_estimate function as described in ARMv7
6879 // reference manual (VRSQRTE instruction) But results may be slightly different
6880 // from ARM implementation due to _mm_rsqrt_ps precision
6881 uint32x2_t res;
6882 __m64_128 res64[2];
6883 int i;
6884 _NEON2SSE_ALIGN_16 float coeff[2];
6885 for (i = 0; i < 2; i++) {
6886 // Generate double-precision value = operand * 2^(-32). This has zero sign
6887 // bit, with:
6888 // exponent = 1022 or 1021 = double-precision representation of 2^(-1)
6889 // or 2^(-2) fraction taken from operand, excluding its most significant
6890 // one or two bits.
6891 uint64_t dp_operand;
6892 if (a.m64_u32[i] & 0x80000000) {
6893 dp_operand =
6894 (0x3feLL << 52) | (((uint64_t)a.m64_u32[i] & 0x7FFFFFFF) << 21);
6895 } else {
6896 dp_operand =
6897 (0x3fdLL << 52) | (((uint64_t)a.m64_u32[i] & 0x3FFFFFFF) << 22);
6898 }
6899 res64[i].m64_u64[0] = dp_operand;
6900 coeff[i] = (res64[i].m64_d64[0] < 0.5) ? 512.0f : 256.0f; /* range 0.25 <= resf < 0.5 or range 0.5 <= resf < 1.0*/
6901 }
6902 __m128 coeff_f = _mm_load_ps(coeff);
6903 __m128d q0_d = _mm_mul_pd(_mm_loadu_pd(&res64[0].m64_d64[0]), _mm_cvtps_pd(coeff_f));
6904 __m128i q0_i = _mm_cvttpd_epi32(q0_d);
6905 __m128 c05_f = _mm_set1_ps(0.5);
6906 __m128 r_f = _mm_div_ps(_mm_add_ps(_mm_cvtepi32_ps(q0_i), c05_f), coeff_f);
6907 __m128 rsqrt_f = _mm_rsqrt_ps(r_f);
6908 __m128 c256_f = _mm_set1_ps(256.0);
6909 __m128 s_f = _mm_add_ps(_mm_mul_ps(rsqrt_f, c256_f), c05_f);
6910#ifdef USE_SSE4
6911 s_f = _mm_floor_ps(s_f);
6912#else
6913 s_f = _mm_cvtepi32_ps(_mm_cvttps_epi32(s_f));
6914#endif
6915 s_f = _mm_div_ps(s_f, c256_f);
6916 _M64f(res64[0], s_f);
6917
6918 for (i = 0; i < 2; i++) {
6919 if ((a.m64_u32[i] & 0xc0000000) == 0) { // a <=0x3fffffff
6920 res.m64_u32[i] = 0xffffffff;
6921 } else {
6922 res.m64_u32[i] = (uint32_t)(res64[0].m64_f32[i] * (((uint32_t)1) << 31));
6923 }
6924 }
6925 return res;
6926}
6927
6928_NEON2SSE_GLOBAL float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0
6929#define vrsqrteq_f32 _mm_rsqrt_ps
6930
6931_NEON2SSESTORAGE uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0
6932_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrsqrteq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
6933{
6934 // Input is fixed point number!!!
6935 // We implement the recip_sqrt_estimate function as described in ARMv7
6936 // reference manual (VRSQRTE instruction) But results may be slightly different
6937 // from ARM implementation due to _mm_rsqrt_ps precision
6938 int i;
6939 _NEON2SSE_ALIGN_16 uint32_t atmp[4], res[4];
6940 _NEON2SSE_ALIGN_16 float coeff[4], rr[4];
6941 char* coeff_f2_c = (char*)&coeff[2];
6942 __m64_128 res64[4];
6943 _mm_store_si128((__m128i *)atmp, a);
6944 for (i = 0; i < 4; i++) {
6945 // Generate double-precision value = operand * 2^(-32). This has zero sign
6946 // bit, with:
6947 // exponent = 1022 or 1021 = double-precision representation of 2^(-1)
6948 // or 2^(-2) fraction taken from operand, excluding its most significant
6949 // one or two bits.
6950 uint64_t dp_operand;
6951 if (atmp[i] & 0x80000000) {
6952 dp_operand = (0x3feLL << 52) | (((uint64_t)atmp[i] & 0x7FFFFFFF) << 21);
6953 } else {
6954 dp_operand = (0x3fdLL << 52) | (((uint64_t)atmp[i] & 0x3FFFFFFF) << 22);
6955 }
6956 res64[i].m64_u64[0] = dp_operand;
6957 coeff[i] = (res64[i].m64_d64[0] < 0.5) ? 512.0f : 256.0f; /* range 0.25 <= resf < 0.5 or range 0.5 <= resf < 1.0*/
6958 }
6959 __m128 c05_f = _mm_set1_ps(0.5);
6960 __m128 coeff_f = _mm_load_ps(coeff);
6961 __m128d q0_d = _mm_mul_pd(_mm_loadu_pd(&res64[0].m64_d64[0]), _mm_cvtps_pd(coeff_f));
6962 __m128i q0_i = _mm_cvttpd_epi32(q0_d);
6963
6964 __m128 coeff_f2 = _M128(_pM128i(*coeff_f2_c));
6965 q0_d = _mm_mul_pd(_mm_loadu_pd(&res64[2].m64_d64[0]), _mm_cvtps_pd(coeff_f2));
6966 __m128i q0_i2 = _mm_cvttpd_epi32(q0_d);
6967 coeff_f = _M128(_mm_unpacklo_epi64(_M128i(coeff_f), _M128i(coeff_f2)));
6968 q0_i = _mm_unpacklo_epi64(q0_i, q0_i2);
6969
6970 __m128 r_f = _mm_div_ps(_mm_add_ps(_mm_cvtepi32_ps(q0_i), c05_f), coeff_f);
6971 __m128 rsqrt_f = _mm_rsqrt_ps(r_f);
6972 __m128 c256_f = _mm_set1_ps(256.0);
6973 __m128 s_f = _mm_add_ps(_mm_mul_ps(rsqrt_f, c256_f), c05_f);
6974#ifdef USE_SSE4
6975 s_f = _mm_floor_ps(s_f);
6976#else
6977 s_f = _mm_cvtepi32_ps(_mm_cvttps_epi32(s_f));
6978#endif
6979 s_f = _mm_div_ps(s_f, c256_f);
6980 _mm_store_ps(rr, s_f);
6981
6982 for (i = 0; i < 4; i++) {
6983 if ((atmp[i] & 0xc0000000) == 0) { // a <=0x3fffffff
6984 res[i] = 0xffffffff;
6985 } else {
6986 res[i] = (uint32_t)(rr[i] * (((uint32_t)1) << 31));
6987 }
6988 }
6989 return _mm_load_si128((__m128i *)res);
6990}
6991
6992//************ Reciprocal estimate/step and 1/sqrt estimate/step ***************************
6993//******************************************************************************************
6994//******VRECPS (Vector Reciprocal Step) ***************************************************
6995//multiplies the elements of one vector by the corresponding elements of another vector,
6996//subtracts each of the results from 2, and places the final results into the elements of the destination vector.
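//A typical refinement idiom on the NEON side looks like the sketch below (shown for illustration only;
//x is a hypothetical vector assumed to hold non-zero values, and vmulq_f32 is the standard NEON multiply mapped elsewhere in this file):
//    float32x4_t e = vrecpeq_f32(x);       //initial estimate of 1/x
//    e = vmulq_f32(vrecpsq_f32(x, e), e);  //one Newton-Raphson step: e *= (2 - x*e)
//    e = vmulq_f32(vrecpsq_f32(x, e), e);  //optional second step for extra precision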
6997
6998_NEON2SSESTORAGE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0
6999_NEON2SSE_INLINE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b)
7000{
7001 float32x4_t res;
7002 __m64_128 res64;
7003 res = vrecpsq_f32(_pM128(a), _pM128(b));
7004 _M64f(res64, res);
7005 return res64;
7006}
7007
7008_NEON2SSESTORAGE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0
7009_NEON2SSE_INLINE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b) // VRECPS.F32 q0, q0, q0
7010{
7011 __m128 f2, mul;
7012 f2 = _mm_set1_ps(2.);
7013 mul = _mm_mul_ps(a,b);
7014 return _mm_sub_ps(f2,mul);
7015}
7016
7017//*****************VRSQRTS (Vector Reciprocal Square Root Step) *****************************
7018//multiplies the elements of one vector by the corresponding elements of another vector,
7019//subtracts each of the results from 3, divides these results by two, and places the final results into the elements of the destination vector.
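//The corresponding refinement idiom (a sketch for illustration only; x is a hypothetical vector assumed to hold positive values):
//    float32x4_t e = vrsqrteq_f32(x);                          //initial estimate of 1/sqrt(x)
//    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, e), e), e);       //one Newton-Raphson step: e *= (3 - x*e*e)/2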
7020
7021_NEON2SSESTORAGE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0
7022_NEON2SSE_INLINE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b)
7023{
7024 float32x2_t res;
7025 res.m64_f32[0] = (3 - a.m64_f32[0] * b.m64_f32[0]) / 2;
7026 res.m64_f32[1] = (3 - a.m64_f32[1] * b.m64_f32[1]) / 2;
7027 return res;
7028}
7029
7030_NEON2SSESTORAGE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
7031_NEON2SSE_INLINE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b) // VRSQRTS.F32 q0, q0, q0
7032{
7033 __m128 f3, f05, mul;
7034 f3 = _mm_set1_ps(3.f);
7035 f05 = _mm_set1_ps(0.5f);
7036 mul = _mm_mul_ps(a,b);
7037 f3 = _mm_sub_ps(f3,mul);
7038 return _mm_mul_ps (f3, f05);
7039}
7040//********************************************************************************************
7041//***************************** Shifts by signed variable ***********************************
7042//********************************************************************************************
7043//***** Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right) ***********************
7044//********************************************************************************************
//No such operations exist in IA32 SIMD, unfortunately; only shifts by a constant are available, so a serial solution is needed
//helper macro; it matches the ARM behavior for out-of-range (large) shift amounts
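//E.g. (hypothetical lane values) for 16-bit lanes a = {16, 16, 16, 16} and shift counts b = {1, -2, 16, -16}
//the helpers below yield {32, 4, 0, 0}: positive counts shift left, negative counts shift right,
//and counts whose magnitude reaches the lane size produce 0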
7047#define SERIAL_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
7048 _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; int i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
7049 _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
7050 for (i = 0; i<LEN; i++) { \
7051 if( (btmp[i] >= lanesize)||(btmp[i] <= -lanesize) ) res[i] = 0; \
7052 else res[i] = (btmp[i] >=0) ? atmp[i] << btmp[i] : atmp[i] >> (-btmp[i]); } \
7053 return _mm_load_si128((__m128i*)res);
7054
7055#define SERIAL_SHIFT_64(TYPE, SIGN, LEN) \
7056 int ## TYPE ## x ## LEN ## _t res; int i, lanesize = sizeof(int ## TYPE ## _t) << 3; \
7057 for (i = 0; i<LEN; i++) { \
7058 if( (b.m64_i ## TYPE[i] >= lanesize)||(b.m64_i ## TYPE[i] <= -lanesize) ) res.m64_ ## SIGN ## TYPE[i] = 0; \
7059 else res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] >=0) ? a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i] : a.m64_ ## SIGN ## TYPE[i] >> (-b.m64_i ## TYPE[i]); } \
7060 return res;
7061
7062_NEON2SSESTORAGE int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0
7063_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7064{
7065 SERIAL_SHIFT_64(8, i, 8)
7066}
7067
7068_NEON2SSESTORAGE int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0
7069_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7070{
7071 SERIAL_SHIFT_64(16, i, 4)
7072}
7073
7074_NEON2SSESTORAGE int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0
7075_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7076{
7077 SERIAL_SHIFT_64(32, i, 2)
7078}
7079
7080_NEON2SSESTORAGE int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0
7081_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7082{
7083 SERIAL_SHIFT_64(64, i, 1)
7084}
7085
7086_NEON2SSESTORAGE uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0
7087_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7088{
7089 SERIAL_SHIFT_64(8, u, 8)
7090}
7091
_NEON2SSESTORAGE uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.U16 d0,d0,d0
7093_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7094{
7095 SERIAL_SHIFT_64(16, u, 4)
7096}
7097
7098_NEON2SSESTORAGE uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0
7099_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7100{
7101 SERIAL_SHIFT_64(32, u, 2)
7102}
7103
7104_NEON2SSESTORAGE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0
_NEON2SSE_INLINE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b) //if the SERIAL_SHIFT macro were used here, special processing would be needed for large shift values
7106{
7107 SERIAL_SHIFT_64(64, u, 1)
7108}
7109
7110_NEON2SSESTORAGE int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0
7111_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7112{
7113 SERIAL_SHIFT(int8_t, int8_t, 16, 16)
7114}
7115
7116_NEON2SSESTORAGE int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0
7117_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7118{
7119 SERIAL_SHIFT(int16_t, int16_t, 8, 8)
7120}
7121
7122_NEON2SSESTORAGE int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0
7123_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7124{
7125 SERIAL_SHIFT(int32_t, int32_t, 4, 4)
7126}
7127
7128_NEON2SSESTORAGE int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0
7129_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7130{
7131 SERIAL_SHIFT(int64_t, int64_t, 2, 2)
7132}
7133
7134_NEON2SSESTORAGE uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0
7135_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7136{
7137 SERIAL_SHIFT(uint8_t, int8_t, 16, 16)
7138}
7139
_NEON2SSESTORAGE uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.U16 q0,q0,q0
7141_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7142{
7143 SERIAL_SHIFT(uint16_t, int16_t, 8, 8)
7144}
7145
7146_NEON2SSESTORAGE uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0
7147_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7148{
7149 SERIAL_SHIFT(uint32_t, int32_t, 4, 4)
7150}
7151
7152_NEON2SSESTORAGE uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0
7153_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7154{
7155 SERIAL_SHIFT(uint64_t, int64_t, 2, 2)
7156}
7157
7158
7159//*********** Vector saturating shift left: (negative values shift right) **********************
7160//********************************************************************************************
//IA32 SIMD has no such operations yet; only shifts by a constant are available, so a serial solution is needed
7162#define SERIAL_SATURATING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \
7163 _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \
7164 int lanesize_1 = (sizeof(TYPE) << 3) - 1; \
7165 _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
7166 for (i = 0; i<LEN; i++) { \
7167 if ((atmp[i] ==0)||(btmp[i] ==0)) res[i] = atmp[i]; \
7168 else{ \
7169 if(btmp[i] <0) res[i] = atmp[i] >> (-btmp[i]); \
7170 else{ \
7171 if (btmp[i]>lanesize_1) { \
7172 res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
7173 }else{ \
7174 limit = (TYPE)1 << (lanesize_1 - btmp[i]); \
7175 if((atmp[i] >= limit)||(atmp[i] <= -limit)) \
7176 res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
7177 else res[i] = atmp[i] << btmp[i]; }}}} \
7178 return _mm_load_si128((__m128i*)res);
7179
7180#define SERIAL_SATURATING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \
7181 _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \
7182 TYPE lanesize = (sizeof(TYPE) << 3); \
7183 _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
7184 for (i = 0; i<LEN; i++) { \
7185 if ((atmp[i] ==0)||(btmp[i] ==0)) { res[i] = atmp[i]; \
7186 }else{ \
7187 if(btmp[i] < 0) res[i] = atmp[i] >> (-btmp[i]); \
7188 else{ \
7189 if (btmp[i]>lanesize) res[i] = (_UNSIGNED_T(TYPE))(~0ll); \
7190 else{ \
7191 limit = (TYPE) 1 << (lanesize - btmp[i]); \
7192 res[i] = ( atmp[i] >= limit) ? (_UNSIGNED_T(TYPE))(~0ll) : atmp[i] << btmp[i]; }}}} \
7193 return _mm_load_si128((__m128i*)res);
7194
7195#define SERIAL_SATURATING_SHIFT_SIGNED_64(TYPE, LEN) \
7196 int ## TYPE ## x ## LEN ## _t res; int ## TYPE ## _t limit; int i; \
7197 int lanesize_1 = (sizeof( int ## TYPE ## _t) << 3) - 1; \
7198 for (i = 0; i<LEN; i++) { \
7199 if ((a.m64_i ## TYPE[i] == 0) ||(b.m64_i ## TYPE[i] == 0)) res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i]; \
7200 else{ \
7201 if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \
7202 else{ \
7203 if (b.m64_i ## TYPE[i]>lanesize_1) { \
7204 res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \
7205 }else{ \
7206 limit = (int ## TYPE ## _t) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \
7207 if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \
7208 res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \
7209 else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
7210 return res;
7211
7212#define SERIAL_SATURATING_SHIFT_UNSIGNED_64(TYPE, LEN) \
7213 int ## TYPE ## x ## LEN ## _t res; _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \
7214 int ## TYPE ## _t lanesize = (sizeof(int ## TYPE ## _t) << 3); \
7215 for (i = 0; i<LEN; i++) { \
7216 if ((a.m64_u ## TYPE[i] == 0) ||(b.m64_u ## TYPE[i] == 0)) {res.m64_u ## TYPE[i] = a.m64_u ## TYPE[i]; \
7217 }else{ \
7218 if(b.m64_i ## TYPE[i] < 0) res.m64_u ## TYPE[i] = a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \
7219 else{ \
7220 if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = (_UNSIGNED_T(int ## TYPE ## _t))(~0ll); \
7221 else{ \
7222 limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \
7223 res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? (_UNSIGNED_T(int ## TYPE ## _t))(~0ll) : a.m64_u ## TYPE[i] << b.m64_u ## TYPE[i]; }}}} \
7224 return res;
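//Worked example of the branch-free saturation value used above (int8_t lanes, hypothetical inputs, lanesize_1 = 7):
//    a =  100, b = 3 : limit = 1 << (7 - 3) = 16,  100 >=  16, so res = ((uint8_t)100 >> 7) + (1 << 7) - 1 = 0 + 127 = 127  (INT8_MAX)
//    a = -100, b = 3 : -100 <= -16,                            so res = ((uint8_t)-100 >> 7) + (1 << 7) - 1 = 1 + 127 = 128, which wraps to -128 (INT8_MIN)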
7225
7226_NEON2SSESTORAGE int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0
7227_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7228{
7229 SERIAL_SATURATING_SHIFT_SIGNED_64(8,8)
7230}
7231
7232_NEON2SSESTORAGE int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0
7233_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7234{
7235 SERIAL_SATURATING_SHIFT_SIGNED_64(16,4)
7236}
7237
7238_NEON2SSESTORAGE int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0
7239_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7240{
7241 SERIAL_SATURATING_SHIFT_SIGNED_64(32,2)
7242}
7243
7244_NEON2SSESTORAGE int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0
7245_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7246{
7247 SERIAL_SATURATING_SHIFT_SIGNED_64(64,1)
7248}
7249
7250_NEON2SSESTORAGE uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0
7251_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7252{
7253 SERIAL_SATURATING_SHIFT_UNSIGNED_64(8,8)
7254}
7255
_NEON2SSESTORAGE uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.U16 d0,d0,d0
7257_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7258{
7259 SERIAL_SATURATING_SHIFT_UNSIGNED_64(16,4)
7260}
7261
7262_NEON2SSESTORAGE uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0
7263_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7264{
7265 SERIAL_SATURATING_SHIFT_UNSIGNED_64(32,2)
7266}
7267
7268_NEON2SSESTORAGE uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0
7269_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7270{
7271 SERIAL_SATURATING_SHIFT_UNSIGNED_64(64,1)
7272}
7273
7274_NEON2SSESTORAGE int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0
7275_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7276{
7277 SERIAL_SATURATING_SHIFT_SIGNED(int8_t, 16, 16)
7278}
7279
7280_NEON2SSESTORAGE int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0
7281_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7282{
7283 SERIAL_SATURATING_SHIFT_SIGNED(int16_t, 8, 8)
7284}
7285
7286_NEON2SSESTORAGE int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0
7287_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7288{
7289 SERIAL_SATURATING_SHIFT_SIGNED(int32_t, 4, 4)
7290}
7291
7292_NEON2SSESTORAGE int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0
7293_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7294{
7295 SERIAL_SATURATING_SHIFT_SIGNED(int64_t, 2, 2)
7296}
7297
7298_NEON2SSESTORAGE uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0
7299_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7300{
7301 SERIAL_SATURATING_SHIFT_UNSIGNED(int8_t, 16, 16)
7302}
7303
_NEON2SSESTORAGE uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.U16 q0,q0,q0
7305_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7306{
7307 SERIAL_SATURATING_SHIFT_UNSIGNED(int16_t, 8, 8)
7308}
7309
7310_NEON2SSESTORAGE uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0
7311_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7312{
7313 SERIAL_SATURATING_SHIFT_UNSIGNED(int32_t, 4, 4)
7314}
7315
7316_NEON2SSESTORAGE uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0
7317_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7318{
7319 SERIAL_SATURATING_SHIFT_UNSIGNED(int64_t, 2, 2)
7320}
7321
7322
7323//******** Vector rounding shift left: (negative values shift right) **********
7324//****************************************************************************
//IA32 SIMD has no such operations yet; only shifts by a constant are available, so a serial solution is needed
7326//rounding makes sense for right shifts only.
7327#define SERIAL_ROUNDING_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
7328 _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; INTERNAL_TYPE i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
7329 _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
7330 for (i = 0; i<LEN; i++) { \
7331 if( btmp[i] >= 0) { \
7332 if(btmp[i] >= lanesize) res[i] = 0; \
7333 else res[i] = (atmp[i] << btmp[i]); \
7334 }else{ \
7335 res[i] = (btmp[i] < -lanesize) ? 0 : \
7336 (btmp[i] == -lanesize) ? (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) : \
7337 (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); }} \
7338 return _mm_load_si128((__m128i*)res);
7339
7340
7341#define SERIAL_ROUNDING_SHIFT_64(TYPE, SIGN, LEN) \
7342 int ## TYPE ## x ## LEN ## _t res; int i; int lanesize = sizeof(int ## TYPE ## _t) << 3; \
7343 for (i = 0; i<LEN; i++) { \
7344 if( b.m64_i ## TYPE[i] >= 0) { \
7345 if(b.m64_i ## TYPE[i] >= lanesize) res.m64_ ## SIGN ## TYPE[i] = 0; \
7346 else res.m64_ ## SIGN ## TYPE[i] = (a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i]); \
7347 }else{ \
7348 res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] < -lanesize) ? 0 : \
7349 (b.m64_i ## TYPE[i] == -lanesize) ? (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) : \
7350 (a.m64_ ## SIGN ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); }} \
7351 return res;
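//Worked rounding example (hypothetical lane values): for a right shift by n = -b > 0 the last bit shifted out
//is added back, i.e. res = (a >> n) + ((a >> (n - 1)) & 1):
//    a =  5, b = -1 :  ( 5 >> 1) + (( 5 >> 0) & 1) =  2 + 1 =  3   == round( 5 / 2)
//    a = -5, b = -1 :  (-5 >> 1) + ((-5 >> 0) & 1) = -3 + 1 = -2   == round(-5 / 2)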
7352
7353
7354_NEON2SSESTORAGE int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0
7355_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vrshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7356{
7357 SERIAL_ROUNDING_SHIFT_64(8,i,8)
7358}
7359
7360_NEON2SSESTORAGE int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0
7361_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vrshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7362{
7363 SERIAL_ROUNDING_SHIFT_64(16,i,4)
7364}
7365
7366_NEON2SSESTORAGE int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0
7367_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vrshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7368{
7369 SERIAL_ROUNDING_SHIFT_64(32,i,2)
7370}
7371
7372_NEON2SSESTORAGE int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0
7373_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7374{
7375 SERIAL_ROUNDING_SHIFT_64(64,i,1)
7376}
7377
7378_NEON2SSESTORAGE uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0
7379_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7380{
7381 SERIAL_ROUNDING_SHIFT_64(8,u,8)
7382}
7383
_NEON2SSESTORAGE uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.U16 d0,d0,d0
7385_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7386{
7387 SERIAL_ROUNDING_SHIFT_64(16,u,4)
7388}
7389
7390_NEON2SSESTORAGE uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0
7391_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7392{
7393 SERIAL_ROUNDING_SHIFT_64(32,u,2)
7394}
7395
7396_NEON2SSESTORAGE uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0
7397_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7398{
7399 SERIAL_ROUNDING_SHIFT_64(64,u,1)
7400}
7401
7402_NEON2SSESTORAGE int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0
7403_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7404{
7405 SERIAL_ROUNDING_SHIFT(int8_t, int8_t, 16, 16)
7406}
7407
7408_NEON2SSESTORAGE int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0
7409_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7410{
7411 SERIAL_ROUNDING_SHIFT(int16_t, int16_t, 8, 8)
7412}
7413
7414_NEON2SSESTORAGE int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0
7415_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7416{
7417 SERIAL_ROUNDING_SHIFT(int32_t, int32_t, 4, 4)
7418}
7419
7420_NEON2SSESTORAGE int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0
7421_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7422{
7423 SERIAL_ROUNDING_SHIFT(int64_t, int64_t, 2, 2)
7424}
7425
7426_NEON2SSESTORAGE uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0
7427_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7428{
7429 SERIAL_ROUNDING_SHIFT(uint8_t, int8_t, 16, 16)
7430}
7431
_NEON2SSESTORAGE uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.U16 q0,q0,q0
7433_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7434{
7435 SERIAL_ROUNDING_SHIFT(uint16_t, int16_t, 8, 8)
7436}
7437
7438_NEON2SSESTORAGE uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0
7439_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7440{
7441 SERIAL_ROUNDING_SHIFT(uint32_t, int32_t, 4, 4)
7442}
7443
7444_NEON2SSESTORAGE uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0
7445_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7446{
7447 SERIAL_ROUNDING_SHIFT(uint64_t, int64_t, 2, 2)
7448}
7449
7450
7451//********** Vector saturating rounding shift left: (negative values shift right) ****************
7452//*************************************************************************************************
//IA32 SIMD has no such operations, unfortunately; only shifts by a constant are available, so a serial solution is needed
//Saturation happens for left shifts only, while rounding makes sense for right shifts only.
7455#define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \
7456 _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \
7457 int lanesize_1 = (sizeof(TYPE) << 3) - 1; \
7458 _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
7459 for (i = 0; i<LEN; i++) { \
7460 if (atmp[i] ==0) res[i] = 0; \
7461 else{ \
7462 if(btmp[i] <0) res[i] = (btmp[i] < (-lanesize_1)) ? 0 : (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \
7463 else{ \
7464 if (btmp[i]>lanesize_1) { \
7465 res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
7466 }else{ \
7467 limit = (TYPE)1 << (lanesize_1 - btmp[i]); \
7468 if((atmp[i] >= limit)||(atmp[i] <= -limit)) \
7469 res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
7470 else res[i] = atmp[i] << btmp[i]; }}}} \
7471 return _mm_load_si128((__m128i*)res);
7472
7473#define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \
7474 _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \
7475 int lanesize = (sizeof(TYPE) << 3); \
7476 _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
7477 for (i = 0; i<LEN; i++) { \
7478 if (atmp[i] ==0) {res[i] = 0; \
7479 }else{ \
7480 if(btmp[i] < 0) res[i] = (btmp[i] < (-lanesize)) ? 0 : (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \
7481 else{ \
7482 if (btmp[i]>lanesize) res[i] = (_UNSIGNED_T(TYPE))(~0ll); \
7483 else{ \
7484 limit = (TYPE) 1 << (lanesize - btmp[i]); \
7485 res[i] = ( atmp[i] >= limit) ? (_UNSIGNED_T(TYPE))(~0ll) : atmp[i] << btmp[i]; }}}} \
7486 return _mm_load_si128((__m128i*)res);
7487
7488#define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(TYPE, LEN) \
7489 __m64_128 res; int ## TYPE ## _t limit; int i; \
7490 int lanesize_1 = (sizeof(int ## TYPE ## _t ) << 3) - 1; \
7491 for (i = 0; i<LEN; i++) { \
7492 if (a.m64_i ## TYPE[i] ==0) res.m64_i ## TYPE[i] = 0; \
7493 else{ \
7494 if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = (b.m64_i ## TYPE[i] < (-lanesize_1)) ? 0 : (a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_i ## TYPE[i] & ((int ## TYPE ## _t ) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \
7495 else{ \
7496 if (b.m64_i ## TYPE[i]>lanesize_1) { \
7497 res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \
7498 }else{ \
7499 limit = (int ## TYPE ## _t ) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \
7500 if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \
7501 res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \
7502 else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
7503 return res;
7504
7505#define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(TYPE, LEN) \
7506 __m64_128 res; _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \
7507 int lanesize = (sizeof(int ## TYPE ## _t) << 3); \
7508 for (i = 0; i<LEN; i++) { \
7509 if (a.m64_u ## TYPE[i] ==0) {res.m64_u ## TYPE[i] = 0; \
7510 }else{ \
7511 if(b.m64_i ## TYPE[i] < 0) res.m64_u ## TYPE[i] = (b.m64_i ## TYPE[i] < (-lanesize)) ? 0 : (a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_u ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \
7512 else{ \
7513 if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = (_UNSIGNED_T(int ## TYPE ## _t))(~0ll); \
7514 else{ \
7515 limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \
7516 res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? (_UNSIGNED_T(int ## TYPE ## _t))(~0ll) : a.m64_u ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
7517 return res;
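//Worked example (hypothetical uint8_t lane): the macros above combine both behaviours - saturation for left
//shifts and rounding for right shifts:
//    a = 200, b =  1 : limit = 1 << (8 - 1) = 128, 200 >= 128, so res = 0xFF            (saturated)
//    a = 204, b = -3 : res = (204 >> 3) + ((204 >> 2) & 1) = 25 + 1 = 26                (rounded, == round(204 / 8))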
7518
7519_NEON2SSESTORAGE int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0
7520_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7521{
7522 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(8,8)
7523}
7524
7525_NEON2SSESTORAGE int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0
7526_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7527{
7528 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(16,4)
7529}
7530
7531_NEON2SSESTORAGE int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0
7532_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7533{
7534 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(32,2)
7535}
7536
7537_NEON2SSESTORAGE int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0
7538_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7539{
7540 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(64,1)
7541}
7542
7543_NEON2SSESTORAGE uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0
7544_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7545{
7546 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(8,8)
7547}
7548
_NEON2SSESTORAGE uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.U16 d0,d0,d0
7550_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7551{
7552 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(16,4)
7553}
7554
7555_NEON2SSESTORAGE uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0
7556_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7557{
7558 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(32,2)
7559}
7560
7561_NEON2SSESTORAGE uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0
7562_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7563{
7564 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(64,1)
7565}
7566
7567_NEON2SSESTORAGE int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0
7568_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7569{
7570 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int8_t, 16, 16)
7571}
7572
7573_NEON2SSESTORAGE int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0
7574_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7575{
7576 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int16_t, 8, 8)
7577}
7578
7579_NEON2SSESTORAGE int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0
7580_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7581{
7582 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int32_t, 4, 4)
7583}
7584
7585_NEON2SSESTORAGE int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0
7586_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7587{
7588 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int64_t, 2, 2)
7589}
7590
7591_NEON2SSESTORAGE uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0
7592_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7593{
7594 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int8_t, 16, 16)
7595}
7596
_NEON2SSESTORAGE uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.U16 q0,q0,q0
7598_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7599{
7600 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int16_t, 8, 8)
7601}
7602
7603_NEON2SSESTORAGE uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0
7604_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7605{
7606 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int32_t, 4, 4)
7607}
7608
7609_NEON2SSESTORAGE uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0
7610_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7611{
7612 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int64_t, 2, 2)
7613}
7614
7615// *********************************************************************************
7616// ***************************** Shifts by a constant *****************************
7617// *********************************************************************************
7618//**************** Vector shift right by constant*************************************
7619//************************************************************************************
7620_NEON2SSESTORAGE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8
7621_NEON2SSE_INLINE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VSHR.S8 d0,d0,#8
7622{
7623 //no 8 bit shift available, go to 16 bit
7624 int8x8_t res64;
7625 __m128i r;
7626 r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
7627 r = _mm_srai_epi16 (r, b); //SSE2
7628 r = _mm_packs_epi16 (r,r); //we need 64 bits only
7629 return64(r);
7630}
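//Note on the pack above: after sign extension to 16 bit and an arithmetic shift right by b >= 1 every
//intermediate value already fits in [-128, 127], so _mm_packs_epi16 only narrows and never saturates.
//Usage sketch with assumed values: vshr_n_s8(vdup_n_s8(-7), 2) yields -2 in every lane (arithmetic shift).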
7631
7632_NEON2SSESTORAGE int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VSHR.S16 d0,d0,#16
7633_NEON2SSE_INLINE int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b)
7634{
7635 int16x4_t res64;
7636 return64(_mm_srai_epi16(_pM128i(a), b));
7637}
7638
7639
7640_NEON2SSESTORAGE int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VSHR.S32 d0,d0,#32
7641_NEON2SSE_INLINE int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b)
7642{
7643 int32x2_t res64;
7644 return64(_mm_srai_epi32(_pM128i(a), b));
7645}
7646
7647_NEON2SSESTORAGE int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64
7648_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL)
7649{
7650 //no arithmetic shift for 64bit values, serial solution used
7651 int64x1_t res;
7652 if(b>=64) res.m64_i64[0] = 0;
7653 else res.m64_i64[0] = (*(int64_t*)&a) >> b;
7654 return res;
7655}
7656
7657_NEON2SSESTORAGE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8
7658_NEON2SSE_INLINE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VSHR.U8 d0,d0,#8
7659{
7660 //no 8 bit shift available, go to 16 bit
7661 uint8x8_t res64;
7662 __m128i r;
7663 r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
    r = _mm_srli_epi16 (r, b); //for unsigned variables we use the logical shift, not the arithmetic one
7665 r = _mm_packus_epi16 (r,r); //we need 64 bits only
7666 return64(r);
7667}
7668
_NEON2SSESTORAGE uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VSHR.U16 d0,d0,#16
7670_NEON2SSE_INLINE uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b)
7671{
7672 uint16x4_t res64;
7673 return64(_mm_srli_epi16(_pM128i(a), b));
7674}
7675
7676
7677_NEON2SSESTORAGE uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VSHR.U32 d0,d0,#32
7678_NEON2SSE_INLINE uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b)
7679{
7680 uint32x2_t res64;
7681 return64(_mm_srli_epi32(_pM128i(a), b));
7682}
7683
7684
7685_NEON2SSESTORAGE uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VSHR.U64 d0,d0,#64
7686_NEON2SSE_INLINE uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b)
7687{
7688 uint64x1_t res64;
7689 return64(_mm_srli_epi64(_pM128i(a), b));
7690}
7691
7692
7693_NEON2SSESTORAGE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8
7694_NEON2SSE_INLINE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VSHR.S8 q0,q0,#8
7695{
7696 //no 8 bit shift available, go to 16 bit trick
7697 __m128i zero, mask0, a_sign, r, a_sign_mask;
7698 _NEON2SSE_ALIGN_16 static const int16_t mask0_16[9] = {0x0000, 0x0080, 0x00c0, 0x00e0, 0x00f0, 0x00f8, 0x00fc, 0x00fe, 0x00ff};
7699 zero = _mm_setzero_si128();
7700 mask0 = _mm_set1_epi16(mask0_16[b]); //to mask the bits to be "spoiled" by 16 bit shift
7701 a_sign = _mm_cmpgt_epi8 (zero, a); //ff if a<0 or zero if a>0
7702 r = _mm_srai_epi16 (a, b);
7703 a_sign_mask = _mm_and_si128 (mask0, a_sign);
7704 r = _mm_andnot_si128 (mask0, r);
7705 return _mm_or_si128 (r, a_sign_mask);
7706}
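//Why the mask is needed above (hypothetical 16-bit lane): _mm_srai_epi16 shifts whole 16-bit lanes, so the top
//b bits of each low byte are filled from its neighbouring high byte instead of from the low byte's own sign.
//E.g. lane 0x017F, b = 1: _mm_srai_epi16 gives 0x00BF, while VSHR.S8 expects the low byte 0x7F >> 1 = 0x3F;
//mask0_16[1] = 0x0080 marks the spoiled bit, which is then replaced with the low byte's sign from a_sign
//(0x00 here since 0x7F >= 0), restoring 0x003F.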
7707
7708_NEON2SSE_GLOBAL int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16
7709#define vshrq_n_s16 _mm_srai_epi16
7710
7711_NEON2SSE_GLOBAL int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32
7712#define vshrq_n_s32 _mm_srai_epi32
7713
7714_NEON2SSESTORAGE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64
7715_NEON2SSE_INLINE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b)
7716{
    //SIMD implementation may not be optimal due to the absence of a 64-bit arithmetic shift in x86 SIMD
7718 __m128i c1, signmask,a0, res64;
7719 _NEON2SSE_ALIGN_16 static const uint64_t mask[] = {0x8000000000000000, 0x8000000000000000};
7720 c1 = _mm_cmpeq_epi32(a,a); //0xffffffffffffffff
7721 signmask = _mm_slli_epi64 (c1, (64 - b));
    a0 = _mm_or_si128(a, *(__m128i*)mask); //set the sign bit; below a0 equals a only for negative lanes
7723 a0 = _MM_CMPEQ_EPI64 (a, a0);
7724 signmask = _mm_and_si128(a0, signmask);
7725 res64 = _mm_srli_epi64 (a, b);
7726 return _mm_or_si128(res64, signmask);
7727}
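//Sketch of the trick above (assumed lane value): the sign bits are patched in after a logical shift.
//For a lane equal to -8 and b = 2:
//    logical shift : (uint64_t)(-8) >> 2            = 0x3FFFFFFFFFFFFFFE
//    signmask      : 0xFFFFFFFFFFFFFFFF << (64 - 2) = 0xC000000000000000   (kept only for negative lanes)
//    OR of the two : 0xFFFFFFFFFFFFFFFE = -2, i.e. the arithmetic shift result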
7728
7729_NEON2SSESTORAGE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8
7730_NEON2SSE_INLINE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VSHR.U8 q0,q0,#8
7731{
7732 //no 8 bit shift available, need the special trick
7733 __m128i mask0, r;
7734 _NEON2SSE_ALIGN_16 static const uint16_t mask10_16[9] = {0xffff, 0xff7f, 0xff3f, 0xff1f, 0xff0f, 0xff07, 0xff03, 0xff01, 0xff00};
7735 mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled" by 16 bit shift
7736 r = _mm_srli_epi16 ( a, b);
7737 return _mm_and_si128 (r, mask0);
7738}
7739
_NEON2SSE_GLOBAL uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.U16 q0,q0,#16
7741#define vshrq_n_u16 _mm_srli_epi16
7742
7743_NEON2SSE_GLOBAL uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32
7744#define vshrq_n_u32 _mm_srli_epi32
7745
7746_NEON2SSE_GLOBAL uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64
7747#define vshrq_n_u64 _mm_srli_epi64
7748
7749//*************************** Vector shift left by constant *************************
7750//*********************************************************************************
7751_NEON2SSESTORAGE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
7752_NEON2SSE_INLINE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VSHL.I8 d0,d0,#0
7753{
7754 //no 8 bit shift available, go to 16 bit
7755 int8x8_t res64;
7756 __m128i r;
7757 r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
7758 r = _mm_slli_epi16 (r, b); //SSE2
7759 r = _mm_shuffle_epi8 (r, *(__m128i*) mask8_16_even_odd); //return to 8 bit, we need 64 bits only
7760 return64(r);
7761}
7762
7763_NEON2SSESTORAGE int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
7764_NEON2SSE_INLINE int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b)
7765{
7766 int16x4_t res64;
7767 return64(_mm_slli_epi16(_pM128i(a), b));
7768}
7769
7770
7771_NEON2SSESTORAGE int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
7772_NEON2SSE_INLINE int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b)
7773{
7774 int32x2_t res64;
7775 return64(_mm_slli_epi32(_pM128i(a), b));
7776}
7777
7778
7779_NEON2SSESTORAGE int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
7780_NEON2SSE_INLINE int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b)
7781{
7782 int64x1_t res64;
7783 return64(_mm_slli_epi64(_pM128i(a), b));
7784}
7785
7786
7787_NEON2SSESTORAGE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
7788_NEON2SSE_INLINE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b)
7789{
7790 //no 8 bit shift available, go to 16 bit
7791 uint8x8_t res64;
7792 __m128i mask8;
7793 __m128i r;
7794 mask8 = _mm_set1_epi16(0xff);
7795 r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
7796 r = _mm_slli_epi16 (r, b); //SSE2
7797 r = _mm_and_si128(r, mask8); //to avoid saturation
7798 r = _mm_packus_epi16 (r,r); //we need 64 bits only
7799 return64(r);
7800}
7801
7802_NEON2SSE_GLOBAL uint16x4_t vshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
7803#define vshl_n_u16 vshl_n_s16
7804
7805
7806_NEON2SSE_GLOBAL uint32x2_t vshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
7807#define vshl_n_u32 vshl_n_s32
7808
7809_NEON2SSE_GLOBAL uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
7810#define vshl_n_u64 vshl_n_s64
7811
7812_NEON2SSE_GLOBAL int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
7813#define vshlq_n_s8 vshlq_n_u8
7814
7815_NEON2SSE_GLOBAL int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
7816#define vshlq_n_s16 _mm_slli_epi16
7817
7818_NEON2SSE_GLOBAL int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
7819#define vshlq_n_s32 _mm_slli_epi32
7820
7821_NEON2SSE_GLOBAL int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
7822#define vshlq_n_s64 _mm_slli_epi64
7823
7824_NEON2SSESTORAGE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
7825_NEON2SSE_INLINE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b)
7826{
7827 //no 8 bit shift available, need the special trick
7828 __m128i mask0, r;
7829 _NEON2SSE_ALIGN_16 static const uint16_t mask10_16[9] = {0xffff, 0xfeff, 0xfcff, 0xf8ff, 0xf0ff, 0xe0ff, 0xc0ff, 0x80ff, 0xff};
7830 mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled" by 16 bit shift
7831 r = _mm_slli_epi16 ( a, b);
7832 return _mm_and_si128 (r, mask0);
7833}
7834
7835_NEON2SSE_GLOBAL uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
7836#define vshlq_n_u16 vshlq_n_s16
7837
7838_NEON2SSE_GLOBAL uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
7839#define vshlq_n_u32 vshlq_n_s32
7840
7841_NEON2SSE_GLOBAL uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
7842#define vshlq_n_u64 vshlq_n_s64
7843
7844//************* Vector rounding shift right by constant ******************
7845//*************************************************************************
7846//No corresponding x86 intrinsics exist, need to do some tricks
7847_NEON2SSESTORAGE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8
7848_NEON2SSE_INLINE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VRSHR.S8 d0,d0,#8
7849{
7850 //no 8 bit shift available, go to 16 bit
7851 int8x8_t res64;
7852 __m128i r, maskb;
7853 r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
7854 maskb = _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit
7855 maskb = _mm_srli_epi16 (maskb, 15); //1 or 0
7856 r = _mm_srai_epi16 (r, b);
7857 r = _mm_add_epi16 (r, maskb); //actual rounding
    r = _mm_packs_epi16 (r,r); //we need 64 bits only
7859 return64(r);
7860}
7861
7862_NEON2SSESTORAGE int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16
7863_NEON2SSE_INLINE int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b)
7864{
7865 int16x4_t res64;
7866 return64(vrshrq_n_s16(_pM128i(a), b));
7867}
7868
7869
7870_NEON2SSESTORAGE int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32
7871_NEON2SSE_INLINE int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b)
7872{
7873 int32x2_t res64;
7874 return64(vrshrq_n_s32(_pM128i(a), b));
7875}
7876
7877
7878_NEON2SSESTORAGE int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64
7879_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL)
7880{
7881 //serial solution is faster
7882 int64x1_t res;
7883 int64_t a_i64 = *( int64_t*)&a;
7884 if(b==64) {
        res.m64_i64[0] = 0; //some compilers apply rounding here, in which case (a_i64 & _SIGNBIT64) >> 63 should be used instead;
7886 } else {
7887 int64_t maskb = a_i64 & (( int64_t)1 << (b - 1));
7888 res.m64_i64[0] = (a_i64 >> b) + (maskb >> (b - 1));
7889 }
7890 return res;
7891}
7892
7893_NEON2SSESTORAGE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8
7894_NEON2SSE_INLINE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VRSHR.U8 d0,d0,#8
7895{
    //no 8 bit shift available, go to 16 bit; the solution may not be optimal compared with the serial one
7897 uint8x8_t res64;
7898 __m128i r, maskb;
7899 r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
7900 maskb = _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit
7901 maskb = _mm_srli_epi16 (maskb, 15); //1 or 0
7902 r = _mm_srli_epi16 (r, b);
7903 r = _mm_add_epi16 (r, maskb); //actual rounding
    r = _mm_packus_epi16 (r,r); //we need 64 bits only
7905 return64(r);
7906}
7907
_NEON2SSESTORAGE uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VRSHR.U16 d0,d0,#16
7909_NEON2SSE_INLINE uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b)
7910{
7911 uint16x4_t res64;
7912 return64(vrshrq_n_u16(_pM128i(a), b));
7913}
7914
7915
7916_NEON2SSESTORAGE uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32
7917_NEON2SSE_INLINE uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b)
7918{
7919 uint32x2_t res64;
7920 return64(vrshrq_n_u32(_pM128i(a), b));
7921}
7922
7923
7924_NEON2SSESTORAGE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64
7925_NEON2SSE_INLINE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b)
7926{
7927 uint64x1_t res64;
7928 return64(vrshrq_n_u64(_pM128i(a), b));
7929}
7930
7931_NEON2SSESTORAGE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8
7932_NEON2SSE_INLINE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VRSHR.S8 q0,q0,#8
7933{
7934 //no 8 bit shift available, go to 16 bit trick
7935 __m128i r, mask1, maskb;
    _NEON2SSE_ALIGN_16 static const uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // bit (b-1), i.e. 2^(b-1), set in every byte
    r = vshrq_n_s8 (a, b);
    mask1 = _mm_set1_epi16(mask2b[b]); // rounding bit mask replicated over the 16-bit lanes
    maskb = _mm_and_si128(a, mask1); //get the rounding bit of each byte or 0
7940 maskb = _mm_srli_epi16 (maskb, b - 1); // to add 1
7941 return _mm_add_epi8(r, maskb); //actual rounding
7942}
7943
7944_NEON2SSESTORAGE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16
7945_NEON2SSE_INLINE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16
7946{
7947 __m128i maskb, r;
7948 maskb = _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit
7949 maskb = _mm_srli_epi16(maskb, 15); //1 or 0
7950 r = _mm_srai_epi16 (a, b);
7951 return _mm_add_epi16 (r, maskb); //actual rounding
7952}
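//Short illustration (assumed lane value): the two shifts extract the rounding bit, i.e. bit (b - 1) of a.
//For a = 0x0006, b = 2: maskb = (uint16_t)(0x0006 << 14) >> 15 = 1, r = 0x0006 >> 2 = 1, result = 2 == round(6 / 4).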
7953
7954_NEON2SSESTORAGE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32
7955_NEON2SSE_INLINE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32
7956{
7957 __m128i maskb, r;
7958 maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit
7959 maskb = _mm_srli_epi32 (maskb,31); //1 or 0
7960 r = _mm_srai_epi32(a, b);
7961 return _mm_add_epi32 (r, maskb); //actual rounding
7962}
7963
7964_NEON2SSESTORAGE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64
7965_NEON2SSE_INLINE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b)
7966{
    //the solution may not be optimal compared with a serial one
7968 __m128i maskb;
7969 int64x2_t r;
7970 maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit
7971 maskb = _mm_srli_epi64 (maskb,63); //1 or 0
7972 r = vshrq_n_s64(a, b);
7973 return _mm_add_epi64 (r, maskb); //actual rounding
7974}
7975
7976_NEON2SSESTORAGE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8
7977_NEON2SSE_INLINE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VRSHR.U8 q0,q0,#8
7978{
7979 //no 8 bit shift available, go to 16 bit trick
7980 __m128i r, mask1, maskb;
    _NEON2SSE_ALIGN_16 static const uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // bit (b-1), i.e. 2^(b-1), set in every byte
    r = vshrq_n_u8 (a, b);
    mask1 = _mm_set1_epi16(mask2b[b]); // rounding bit mask replicated over the 16-bit lanes
    maskb = _mm_and_si128(a, mask1); //get the rounding bit of each byte or 0
7985 maskb = _mm_srli_epi16 (maskb, b - 1); // to add 1
7986 return _mm_add_epi8(r, maskb); //actual rounding
7987}
7988
_NEON2SSESTORAGE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.U16 q0,q0,#16
7990_NEON2SSE_INLINE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16
7991{
7992 __m128i maskb, r;
7993 maskb = _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit
7994 maskb = _mm_srli_epi16(maskb, 15); //1 or 0
7995 r = _mm_srli_epi16 (a, b);
7996 return _mm_add_epi16 (r, maskb); //actual rounding
7997}
7998
7999_NEON2SSESTORAGE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32
8000_NEON2SSE_INLINE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32
8001{
8002 __m128i maskb, r;
8003 maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit
8004 maskb = _mm_srli_epi32 (maskb,31); //1 or 0
8005 r = _mm_srli_epi32(a, b);
8006 return _mm_add_epi32 (r, maskb); //actual rounding
8007}
8008
8009_NEON2SSESTORAGE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64
8010_NEON2SSE_INLINE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b)
8011{
    //the solution may not be optimal compared with a serial one
8013 __m128i maskb, r;
8014 maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit
8015 maskb = _mm_srli_epi64 (maskb,63); //1 or 0
8016 r = _mm_srli_epi64(a, b);
8017 return _mm_add_epi64 (r, maskb); //actual rounding
8018}
8019
8020//************* Vector shift right by constant and accumulate *********
8021//*********************************************************************
8022_NEON2SSESTORAGE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8
8023_NEON2SSE_INLINE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VSRA.S8 d0,d0,#8
8024{
8025 int8x8_t shift;
8026 shift = vshr_n_s8(b, c);
8027 return vadd_s8( a, shift);
8028}
8029
8030_NEON2SSESTORAGE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16
8031_NEON2SSE_INLINE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VSRA.S16 d0,d0,#16
8032{
8033 int16x4_t shift;
8034 shift = vshr_n_s16( b, c);
8035 return vadd_s16(a, shift);
8036}
8037
8038_NEON2SSESTORAGE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32
8039_NEON2SSE_INLINE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VSRA.S32 d0,d0,#32
8040{
    //may not be optimal compared with the serial execution
8042 int32x2_t shift;
8043 shift = vshr_n_s32(b, c);
8044 return vadd_s32( a, shift);
8045}
8046
8047_NEON2SSESTORAGE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64
8048_NEON2SSE_INLINE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c)
8049{
    //may not be optimal compared with a serial solution
8051 int64x1_t shift;
8052 shift = vshr_n_s64(b, c);
8053 return vadd_s64( a, shift);
8054}
8055
8056_NEON2SSESTORAGE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8
8057_NEON2SSE_INLINE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VSRA.U8 d0,d0,#8
8058{
8059 uint8x8_t shift;
8060 shift = vshr_n_u8(b, c);
8061 return vadd_u8(a, shift);
8062}
8063
_NEON2SSESTORAGE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.U16 d0,d0,#16
_NEON2SSE_INLINE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VSRA.U16 d0,d0,#16
8066{
8067 uint16x4_t shift;
8068 shift = vshr_n_u16(b, c);
8069 return vadd_u16(a,shift);
8070}
8071
8072_NEON2SSESTORAGE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32
8073_NEON2SSE_INLINE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VSRA.U32 d0,d0,#32
8074{
    //may not be optimal compared with the serial execution
8076 uint32x2_t shift;
8077 shift = vshr_n_u32(b, c);
8078 return vadd_u32( a, shift);
8079}
8080
8081_NEON2SSESTORAGE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64
8082_NEON2SSE_INLINE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c) // VSRA.U64 d0,d0,#64
8083{
    //may not be optimal compared with the serial execution
8085 uint64x1_t shift;
8086 shift = vshr_n_u64(b, c);
8087 return vadd_u64(a, shift);
8088}
8089
8090_NEON2SSESTORAGE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8
8091_NEON2SSE_INLINE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRA.S8 q0,q0,#8
8092{
8093 int8x16_t shift;
8094 shift = vshrq_n_s8(b, c);
8095 return vaddq_s8(a, shift);
8096}
8097
8098_NEON2SSESTORAGE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16
8099_NEON2SSE_INLINE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRA.S16 q0,q0,#16
8100{
8101 int16x8_t shift;
8102 shift = vshrq_n_s16(b, c);
8103 return vaddq_s16(a, shift);
8104}
8105
8106_NEON2SSESTORAGE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32
8107_NEON2SSE_INLINE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRA.S32 q0,q0,#32
8108{
8109 int32x4_t shift;
8110 shift = vshrq_n_s32(b, c);
8111 return vaddq_s32(a, shift);
8112}
8113
8114_NEON2SSESTORAGE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64
8115_NEON2SSE_INLINE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c) // VSRA.S64 q0,q0,#64
8116{
8117 int64x2_t shift;
8118 shift = vshrq_n_s64(b, c);
8119 return vaddq_s64( a, shift);
8120}
8121
8122_NEON2SSESTORAGE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8
8123_NEON2SSE_INLINE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VSRA.U8 q0,q0,#8
8124{
8125 uint8x16_t shift;
8126 shift = vshrq_n_u8(b, c);
8127 return vaddq_u8(a, shift);
8128}
8129
_NEON2SSESTORAGE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.U16 q0,q0,#16
_NEON2SSE_INLINE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VSRA.U16 q0,q0,#16
8132{
8133 uint16x8_t shift;
8134 shift = vshrq_n_u16(b, c);
8135 return vaddq_u16(a, shift);
8136}
8137
8138_NEON2SSESTORAGE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32
8139_NEON2SSE_INLINE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VSRA.U32 q0,q0,#32
8140{
8141 uint32x4_t shift;
8142 shift = vshrq_n_u32(b, c);
8143 return vaddq_u32(a, shift);
8144}
8145
8146_NEON2SSESTORAGE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64
8147_NEON2SSE_INLINE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c) // VSRA.U64 q0,q0,#64
8148{
8149 uint64x2_t shift;
8150 shift = vshrq_n_u64(b, c);
8151 return vaddq_u64(a, shift);
8152}
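//Usage sketch (hypothetical values): the accumulate variants simply fuse the constant shift with an addition,
//e.g. vsraq_n_u16(acc, x, 1) computes acc[i] + (x[i] >> 1) lane-wise - the whole family is the matching
//vshr(q)_n_xx followed by the corresponding vadd(q)_xx.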
8153
8154//************* Vector rounding shift right by constant and accumulate ****************************
8155//************************************************************************************************
8156_NEON2SSESTORAGE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8
8157_NEON2SSE_INLINE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VRSRA.S8 d0,d0,#8
8158{
8159 int8x8_t shift;
8160 shift = vrshr_n_s8(b, c);
8161 return vadd_s8( a, shift);
8162}
8163
8164_NEON2SSESTORAGE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16
8165_NEON2SSE_INLINE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VRSRA.S16 d0,d0,#16
8166{
8167 int16x4_t shift;
8168 shift = vrshr_n_s16( b, c);
8169 return vadd_s16(a, shift);
8170}
8171
8172_NEON2SSESTORAGE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32
8173_NEON2SSE_INLINE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VRSRA.S32 d0,d0,#32
8174{
    //may not be optimal compared with the serial execution
8176 int32x2_t shift;
8177 shift = vrshr_n_s32(b, c);
8178 return vadd_s32( a, shift);
8179}
8180
8181_NEON2SSESTORAGE int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64
8182_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution
8183{
8184 int64x1_t shift;
8185 shift = vrshr_n_s64(b, c);
8186 return vadd_s64( a, shift);
8187}
8188
8189_NEON2SSESTORAGE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8
8190_NEON2SSE_INLINE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VRSRA.U8 d0,d0,#8
8191{
8192 uint8x8_t shift;
8193 shift = vrshr_n_u8(b, c);
8194 return vadd_u8(a, shift);
8195}
8196
_NEON2SSESTORAGE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.U16 d0,d0,#16
_NEON2SSE_INLINE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VRSRA.U16 d0,d0,#16
8199{
8200 uint16x4_t shift;
8201 shift = vrshr_n_u16(b, c);
8202 return vadd_u16(a,shift);
8203}
8204
8205_NEON2SSESTORAGE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32
8206_NEON2SSE_INLINE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VRSRA.U32 d0,d0,#32
8207{
    //may not be optimal compared with the serial execution
8209 uint32x2_t shift;
8210 shift = vrshr_n_u32(b, c);
8211 return vadd_u32( a, shift);
8212}
8213
8214_NEON2SSESTORAGE uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64
8215_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution
8216{
    //may not be optimal compared with the serial execution
8218 uint64x1_t shift;
8219 shift = vrshr_n_u64(b, c);
8220 return vadd_u64( a, shift);
8221}
8222
8223_NEON2SSESTORAGE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8
8224_NEON2SSE_INLINE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VRSRA.S8 q0,q0,#8
8225{
8226 int8x16_t shift;
8227 shift = vrshrq_n_s8(b, c);
8228 return vaddq_s8(a, shift);
8229}
8230
8231_NEON2SSESTORAGE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16
8232_NEON2SSE_INLINE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VRSRA.S16 q0,q0,#16
8233{
8234 int16x8_t shift;
8235 shift = vrshrq_n_s16(b, c);
8236 return vaddq_s16(a, shift);
8237}
8238
8239_NEON2SSESTORAGE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32
8240_NEON2SSE_INLINE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VRSRA.S32 q0,q0,#32
8241{
8242 int32x4_t shift;
8243 shift = vrshrq_n_s32(b, c);
8244 return vaddq_s32(a, shift);
8245}
8246
8247_NEON2SSESTORAGE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64
8248_NEON2SSE_INLINE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)
8249{
8250 int64x2_t shift;
8251 shift = vrshrq_n_s64(b, c);
8252 return vaddq_s64(a, shift);
8253}
8254
8255_NEON2SSESTORAGE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8
8256_NEON2SSE_INLINE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VRSRA.U8 q0,q0,#8
8257{
8258 uint8x16_t shift;
8259 shift = vrshrq_n_u8(b, c);
8260 return vaddq_u8(a, shift);
8261}
8262
_NEON2SSESTORAGE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.U16 q0,q0,#16
_NEON2SSE_INLINE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VRSRA.U16 q0,q0,#16
8265{
8266 uint16x8_t shift;
8267 shift = vrshrq_n_u16(b, c);
8268 return vaddq_u16(a, shift);
8269}
8270
8271_NEON2SSESTORAGE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32
8272_NEON2SSE_INLINE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VRSRA.U32 q0,q0,#32
8273{
8274 uint32x4_t shift;
8275 shift = vrshrq_n_u32(b, c);
8276 return vaddq_u32(a, shift);
8277}
8278
8279_NEON2SSESTORAGE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64
8280_NEON2SSE_INLINE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c)
8281{
8282 uint64x2_t shift;
8283 shift = vrshrq_n_u64(b, c);
8284 return vaddq_u64(a, shift);
8285}
8286
8287//**********************Vector saturating shift left by constant *****************************
8288//********************************************************************************************
8289//we don't check const ranges, assuming they are met
8290_NEON2SSESTORAGE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0
8291_NEON2SSE_INLINE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHL.S8 d0,d0,#0
8292{
8293 //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function)
8294 int8x8_t res64;
8295 __m128i a128, r128;
8296 a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
8297 r128 = _mm_slli_epi16 (a128, b);
8298 r128 = _mm_packs_epi16 (r128,r128); //saturated s8, use 64 low bits only
8299 return64(r128);
8300}
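//Usage sketch (illustration only, the values are hypothetical):
//  int8x8_t v = vdup_n_s8(100);
//  int8x8_t r = vqshl_n_s8(v, 1); //100 << 1 = 200 does not fit into int8, so every lane saturates to 127
//With the non-saturating vshl_n_s8 the same shift would wrap around to -56.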
8301
8302_NEON2SSESTORAGE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0
8303_NEON2SSE_INLINE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHL.S16 d0,d0,#0
8304{
8305 // go to 32 bit to get the auto saturation (in packs function)
8306 int16x4_t res64;
8307 __m128i a128, r128;
8308 a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1
8309 r128 = _mm_slli_epi32 (a128, b); //shift_res
8310 r128 = _mm_packs_epi32 (r128,r128); //saturated s16, use 64 low bits only
8311 return64(r128);
8312}
8313
8314_NEON2SSESTORAGE int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0
8315_NEON2SSE_INLINE int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b)
8316{
8317 //serial execution may be faster
8318 int32x2_t res64;
8319 return64(vqshlq_n_s32 (_pM128i(a), b));
8320}
8321
8322
8323_NEON2SSESTORAGE int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0
8324_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
8325{
8326 // no effective SIMD solution here
8327 int64x1_t res;
8328 int64_t bmask;
8329 int64_t a_i64 = *( int64_t*)&a;
8330 bmask = ( int64_t)1 << (63 - b); //positive
8331 if (a_i64 >= bmask) {
8332 res.m64_i64[0] = ~(_SIGNBIT64);
8333 } else {
8334 res.m64_i64[0] = (a_i64 <= -bmask) ? (int64_t)_SIGNBIT64 : a_i64 << b;
8335 }
8336 return res;
8337}
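//Note on the threshold above: bmask = 2^(63-b) is the smallest positive value whose left shift by b overflows int64,
//so a >= bmask saturates to 0x7fffffffffffffff and a <= -bmask saturates to 0x8000000000000000 (assuming _SIGNBIT64 is 0x8000000000000000),
//while everything in between is shifted as is. For example, for b = 2: bmask = 2^61 and (2^61 - 1) << 2 = 2^63 - 4 is still representable.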
8338
8339
8340_NEON2SSESTORAGE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0
8341_NEON2SSE_INLINE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b) // VQSHL.U8 d0,d0,#0
8342{
8343 //no 8 bit shift available in IA32 SIMD, go to 16 bit
8344 uint8x8_t res64;
8345 __m128i a128, r128;
8346 a128 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
8347 r128 = _mm_slli_epi16 (a128, b); //shift_res
8348 r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only
8349 return64(r128);
8350}
8351
8352_NEON2SSESTORAGE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.U16 d0,d0,#0
8353_NEON2SSE_INLINE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b) // VQSHL.U16 d0,d0,#0
8354{
8355 // go to 32 bit to get the auto saturation (in packus function)
8356 uint16x4_t res64;
8357 __m128i a128, r128;
8358 a128 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE 4.1
8359 r128 = _mm_slli_epi32 (a128, b); //shift_res
8360 r128 = _MM_PACKUS1_EPI32 (r128); //saturated u16, use 64 low bits only
8361 return64(r128);
8362}
8363
8364_NEON2SSESTORAGE uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0
8365_NEON2SSE_INLINE uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b)
8366{
8367 uint32x2_t res64;
8368 return64(vqshlq_n_u32(_pM128i(a), b));
8369}
8370
8371_NEON2SSESTORAGE uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0
8372_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
8373{
8374 // no effective SIMD solution here
8375 uint64x1_t res;
8376 uint64_t bmask;
8377 uint64_t a_i64 = *(uint64_t*)&a;
8378 bmask = ( uint64_t)1 << (64 - b);
8379 res.m64_u64[0] = (a_i64 >= bmask)&&(b>0) ? 0xffffffffffffffff : a_i64 << b; //if b=0 we are fine with any a
8380 return res;
8381}
8382
8383_NEON2SSESTORAGE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0
8384_NEON2SSE_INLINE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHL.S8 q0,q0,#0
8385{
8386 // go to 16 bit to get the auto saturation (in packs function)
8387 __m128i a128, r128_1, r128_2;
8388 a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1
8389 r128_1 = _mm_slli_epi16 (a128, b);
8390 //swap hi and low part of a128 to process the remaining data
8391 a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
8392 a128 = _MM_CVTEPI8_EPI16 (a128);
8393 r128_2 = _mm_slli_epi16 (a128, b);
8394 return _mm_packs_epi16 (r128_1, r128_2); //saturated s8
8395}
8396
8397_NEON2SSESTORAGE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0
8398_NEON2SSE_INLINE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHL.S16 q0,q0,#0
8399{
8400 // a manual saturation solution looks LESS optimal than the 32-bit conversion one
8401 // go to 32 bit to get the auto saturation (in packs function)
8402 __m128i a128, r128_1, r128_2;
8403 a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1
8404 r128_1 = _mm_slli_epi32 (a128, b); //shift_res
8405 //swap hi and low part of a128 to process the remaining data
8406 a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
8407 a128 = _MM_CVTEPI16_EPI32 (a128);
8408 r128_2 = _mm_slli_epi32 (a128, b);
8409 return _mm_packs_epi32 (r128_1, r128_2); //saturated s16
8410}
8411
8412_NEON2SSESTORAGE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0
8413_NEON2SSE_INLINE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHL.S32 q0,q0,#0
8414{
8415 // no 64-bit saturating pack available (to widen and pack back), so special tricks are necessary
8416 __m128i c1, maskA, saturation_mask, c7ffffff_mask, shift_res, shift_res_mask;
8417 c1 = _mm_cmpeq_epi32(a,a); //0xff..ff
8418 maskA = _mm_srli_epi32(c1, b + 1); //mask for positive numbers: (b+1) leading zeros and (31-b) ones, i.e. the largest value that can be shifted left by b without overflow
8419 saturation_mask = _mm_cmpgt_epi32 (a, maskA); //0xff...ff if we need saturation, 0 otherwise
8420 c7ffffff_mask = _mm_srli_epi32(saturation_mask, 1); //saturated to 0x7f..ff when needed and zeros if not
8421 shift_res = _mm_slli_epi32 (a, b);
8422 shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res);
8423 //result with positive numbers saturated
8424 shift_res = _mm_or_si128 (c7ffffff_mask, shift_res_mask);
8425 //treat negative numbers
8426 maskA = _mm_slli_epi32(c1, 31 - b); //mask for negative numbers: (b+1) leading ones and (31-b) zeros, i.e. the most negative value that can be shifted left by b without overflow
8427 saturation_mask = _mm_cmpgt_epi32 (maskA,a); //0xff...ff if we need saturation, 0 otherwise
8428 c7ffffff_mask = _mm_slli_epi32(saturation_mask, 31); //saturated to 0x80..00 when needed and zeros if not
8429 shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res);
8430 return _mm_or_si128 (c7ffffff_mask, shift_res_mask);
8431}
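//Worked example of the masks above (illustration only), for b = 1:
//  maskA = 0xffffffff >> 2 = 0x3fffffff is the largest int32 that survives "<< 1" without overflow;
//  a = 0x40000000 > maskA, so saturation_mask = 0xffffffff and the lane becomes 0x7fffffff;
//  a = 0x20000000 is below the threshold and is simply shifted to 0x40000000.
//The negative branch mirrors this with the threshold 0xffffffff << 30 = 0xc0000000 (i.e. -2^30) and the saturation value 0x80000000.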
8432
8433_NEON2SSESTORAGE int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0
8434_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
8435{
8436 // no effective SIMD solution here
8437 _NEON2SSE_ALIGN_16 int64_t atmp[2], res[2];
8438 int64_t bmask;
8439 int i;
8440 bmask = ( int64_t)1 << (63 - b); //positive
8441 _mm_store_si128((__m128i*)atmp, a);
8442 for (i = 0; i<2; i++) {
8443 if (atmp[i] >= bmask) {
8444 res[i] = ~(_SIGNBIT64);
8445 } else {
8446 res[i] = (atmp[i] <= -bmask) ? (int64_t)_SIGNBIT64 : atmp[i] << b;
8447 }
8448 }
8449 return _mm_load_si128((__m128i*)res);
8450}
8451
8452_NEON2SSESTORAGE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0
8453_NEON2SSE_INLINE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b) // VQSHL.U8 q0,q0,#0
8454{
8455 // go to 16 bit to get the auto saturation (in packs function)
8456 __m128i a128, r128_1, r128_2;
8457 a128 = _MM_CVTEPU8_EPI16 (a); //SSE 4.1
8458 r128_1 = _mm_slli_epi16 (a128, b);
8459 //swap hi and low part of a128 to process the remaining data
8460 a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
8461 a128 = _MM_CVTEPU8_EPI16 (a128);
8462 r128_2 = _mm_slli_epi16 (a128, b);
8463 return _mm_packus_epi16 (r128_1, r128_2); //saturated u8
8464}
8465
8466_NEON2SSESTORAGE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.U16 q0,q0,#0
8467_NEON2SSE_INLINE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b) // VQSHL.U16 q0,q0,#0
8468{
8469 // a manual saturation solution looks more optimal than the 32-bit conversion one
8470 __m128i cb, c8000, a_signed, saturation_mask, shift_res;
8471 cb = _mm_set1_epi16((1 << (16 - b)) - 1 - 0x8000 );
8472 c8000 = _mm_set1_epi16 (-32768); // (int16_t)0x8000
8473//no unsigned 16-bit comparison in SSE, only the signed one is available, hence the trick
8474 a_signed = _mm_sub_epi16(a, c8000); //go to signed
8475 saturation_mask = _mm_cmpgt_epi16 (a_signed, cb);
8476 shift_res = _mm_slli_epi16 (a, b);
8477 return _mm_or_si128 (shift_res, saturation_mask);
8478}
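//The trick above in scalar terms (illustration only): the unsigned comparison a > (0xffff >> b) is replaced by the signed
//comparison (a - 0x8000) > ((0xffff >> b) - 0x8000), which is what the _mm_sub_epi16 / _mm_cmpgt_epi16 pair computes.
//E.g. for b = 4 the threshold is 0x0fff: a = 0x1234 saturates (the OR with the mask makes the lane 0xffff),
//while a = 0x0123 is shifted to 0x1230 unchanged.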
8479
8480_NEON2SSESTORAGE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0
8481_NEON2SSE_INLINE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b) // VQSHL.U32 q0,q0,#0
8482{
8483 // manual saturation solution: there is no 64-bit saturating pack, and the serial version may be faster
8484 __m128i cb, c80000000, a_signed, saturation_mask, shift_res;
8485 cb = _mm_set1_epi32((1 << (32 - b)) - 1 - 0x80000000 );
8486 c80000000 = _mm_set1_epi32 (0x80000000);
8487//no unsigned 32-bit comparison in SSE, only the signed one is available, hence the trick
8488 a_signed = _mm_sub_epi32(a, c80000000); //go to signed
8489 saturation_mask = _mm_cmpgt_epi32 (a_signed, cb);
8490 shift_res = _mm_slli_epi32 (a, b);
8491 return _mm_or_si128 (shift_res, saturation_mask);
8492}
8493
8494_NEON2SSESTORAGE uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0
8495_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
8496{
8497 // no effective SIMD solution here
8498 _NEON2SSE_ALIGN_16 uint64_t atmp[2], res[2];
8499 uint64_t bmask;
8500 int i;
8501 bmask = ( uint64_t)1 << (64 - b);
8502 _mm_store_si128((__m128i*)atmp, a);
8503 for (i = 0; i<2; i++) {
8504 res[i] = (atmp[i] >= bmask)&&(b>0) ? 0xffffffffffffffff : atmp[i] << b; //if b=0 we are fine with any a
8505 }
8506 return _mm_load_si128((__m128i*)res);
8507}
8508
8509//**************Vector signed->unsigned saturating shift left by constant *************
8510//*************************************************************************************
8511_NEON2SSESTORAGE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0
8512_NEON2SSE_INLINE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHLU.S8 d0,d0,#0
8513{
8514 //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function)
8515 uint8x8_t res64;
8516 __m128i a128, r128;
8517 a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
8518 r128 = _mm_slli_epi16 (a128, b);
8519 r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only
8520 return64(r128);
8521}
8522
8523_NEON2SSESTORAGE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0
8524_NEON2SSE_INLINE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHLU.S16 d0,d0,#0
8525{
8526 uint16x4_t res64;
8527 __m128i a128, r128;
8528 a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1
8529 r128 = _mm_slli_epi32 (a128, b); //shift_res
8530 r128 = _MM_PACKUS1_EPI32 (r128); //saturated u16, use 64 low bits only
8531 return64(r128);
8532}
8533
8534_NEON2SSESTORAGE uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0
8535_NEON2SSE_INLINE uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b)
8536{
8537 uint32x2_t res64;
8538 return64( vqshluq_n_s32(_pM128i(a), b));
8539}
8540
8541_NEON2SSESTORAGE uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0
8542_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) // no effective SIMD solution here, serial execution looks faster
8543{
8544 uint64x1_t res;
8545 uint64_t limit;
8546 if (a.m64_i64[0]<=0) {
8547 res.m64_u64[0] = 0;
8548 } else {
8549 limit = (uint64_t) 1 << (64 - b);
8550 res.m64_u64[0] = ( ((uint64_t)a.m64_i64[0]) >= limit) ? ~((uint64_t)0) : (uint64_t)a.m64_i64[0] << b;
8551 }
8552 return res;
8553}
8554
8555_NEON2SSESTORAGE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0
8556_NEON2SSE_INLINE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHLU.S8 q0,q0,#0
8557{
8558 __m128i a128, r128_1, r128_2;
8559 a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1
8560 r128_1 = _mm_slli_epi16 (a128, b);
8561 //swap hi and low part of a128 to process the remaining data
8562 a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
8563 a128 = _MM_CVTEPI8_EPI16 (a128);
8564 r128_2 = _mm_slli_epi16 (a128, b);
8565 return _mm_packus_epi16 (r128_1, r128_2); //saturated u8
8566}
8567
8568_NEON2SSESTORAGE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0
8569_NEON2SSE_INLINE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHLU.S16 q0,q0,#0
8570{
8571 // a manual saturation solution looks LESS optimal than the 32-bit conversion one
8572 __m128i a128, r128_1, r128_2;
8573 a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1
8574 r128_1 = _mm_slli_epi32 (a128, b); //shift_res
8575 //swap hi and low part of a128 to process the remaining data
8576 a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
8577 a128 = _MM_CVTEPI16_EPI32 (a128);
8578 r128_2 = _mm_slli_epi32 (a128, b);
8579 return _MM_PACKUS_EPI32 (r128_1, r128_2); //saturated u16
8580}
8581
8582_NEON2SSESTORAGE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0
8583_NEON2SSE_INLINE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHLU.S32 q0,q0,#0
8584{
8585 //the solution may not be optimal compared with the serial one
8586 __m128i zero, maskA, maskGT0, a0, a_masked, a_shift;
8587 zero = _mm_setzero_si128();
8588 maskA = _mm_cmpeq_epi32(a, a);
8589 maskA = _mm_slli_epi32(maskA,(32 - b)); // b ones and (32-b)zeros
8590 //saturate negative numbers to zero
8591 maskGT0 = _mm_cmpgt_epi32 (a, zero); //0xffffffff for positive numbers, zero otherwise (zero and negative numbers)
8592 a0 = _mm_and_si128 (a, maskGT0); //negative are zeros now
8593 //saturate positive to 0xffffffff
8594 a_masked = _mm_and_si128 (a0, maskA);
8595 a_masked = _mm_cmpgt_epi32 (a_masked, zero); //0xffffffff if saturation necessary 0 otherwise
8596 a_shift = _mm_slli_epi32 (a0, b);
8597 return _mm_or_si128 (a_shift, a_masked); //actual saturation
8598}
8599
8600_NEON2SSESTORAGE uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0
8601_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
8602{
8603 // no effective SIMD solution here, serial execution looks faster
8604 _NEON2SSE_ALIGN_16 int64_t atmp[2];
8605 _NEON2SSE_ALIGN_16 uint64_t res[2];
8606 uint64_t limit;
8607 int i;
8608 _mm_store_si128((__m128i*)atmp, a);
8609 for (i = 0; i<2; i++) {
8610 if (atmp[i]<=0) {
8611 res[i] = 0;
8612 } else {
8613 limit = (uint64_t) 1 << (64 - b);
8614 res[i] = ( ((uint64_t)atmp[i]) >= limit) ? ~((uint64_t)0) : (uint64_t)atmp[i] << b;
8615 }
8616 }
8617 return _mm_load_si128((__m128i*)res);
8618}
8619
8620//************** Vector narrowing shift right by constant **************
8621//**********************************************************************
8622_NEON2SSESTORAGE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
8623_NEON2SSE_INLINE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8
8624{
8625 int8x8_t res64;
8626 __m128i r16;
8627 r16 = vshrq_n_s16(a,b);
8628 r16 = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
8629 return64(r16);
8630}
8631
8632_NEON2SSESTORAGE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
8633_NEON2SSE_INLINE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16
8634{
8635 int16x4_t res64;
8636 __m128i r32;
8637 r32 = vshrq_n_s32(a,b);
8638 r32 = _mm_shuffle_epi8 (r32, *(__m128i*) mask8_32_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
8639 return64(r32);
8640}
8641
8642_NEON2SSESTORAGE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
8643_NEON2SSE_INLINE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b)
8644{
8645 int32x2_t res64;
8646 __m128i r64;
8647 r64 = vshrq_n_s64(a,b);
8648 r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
8649 return64(r64);
8650}
8651
8652_NEON2SSESTORAGE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
8653_NEON2SSE_INLINE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8
8654{
8655 uint8x8_t res64;
8656 __m128i mask, r16;
8657 mask = _mm_set1_epi16(0xff);
8658 r16 = vshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
8659 r16 = _mm_and_si128(r16, mask); //to avoid saturation
8660 r16 = _mm_packus_epi16 (r16,r16); //narrow, use low 64 bits only
8661 return64(r16);
8662}
8663
8664_NEON2SSESTORAGE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
8665_NEON2SSE_INLINE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16
8666{
8667 uint16x4_t res64;
8668 __m128i mask, r32;
8669 mask = _mm_set1_epi32(0xffff);
8670 r32 = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
8671 r32 = _mm_and_si128(r32, mask); //to avoid saturation
8672 r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only
8673 return64(r32);
8674}
8675
8676_NEON2SSESTORAGE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
8677_NEON2SSE_INLINE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
8678{
8679 uint32x2_t res64;
8680 __m128i r64;
8681 r64 = vshrq_n_u64(a,b);
8682 r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
8683 return64(r64);
8684}
8685
8686//************** Vector signed->unsigned narrowing saturating shift right by constant ********
8687//*********************************************************************************************
8688_NEON2SSESTORAGE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8
8689_NEON2SSE_INLINE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRUN.S16 d0,q0,#8
8690{
8691 uint8x8_t res64;
8692 __m128i r16;
8693 r16 = vshrq_n_s16(a,b);
8694 r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow (signed to unsigned), use low 64 bits only
8695 return64(r16);
8696}
8697
8698_NEON2SSESTORAGE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16
8699_NEON2SSE_INLINE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRUN.S32 d0,q0,#16
8700{
8701 uint16x4_t res64;
8702 __m128i r32;
8703 r32 = vshrq_n_s32(a,b);
8704 r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow(signed to unsigned), use low 64 bits only
8705 return64(r32);
8706}
8707
8708_NEON2SSESTORAGE uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32
8709_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster
8710{
8711 _NEON2SSE_ALIGN_16 int64_t atmp[2];
8712 uint32x2_t res;
8713 int64_t res64;
8714 _mm_store_si128((__m128i*)atmp, a);
8715 if (atmp[0] < 0) {
8716 res.m64_u32[0] = 0;
8717 } else {
8718 res64 = (atmp[0] >> b);
8719 res.m64_u32[0] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t) res64;
8720 }
8721 if (atmp[1] < 0) {
8722 res.m64_u32[1] = 0;
8723 } else {
8724 res64 = (atmp[1] >> b);
8725 res.m64_u32[1] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t)res64;
8726 }
8727 return res;
8728}
8729
8730//**** Vector signed->unsigned rounding narrowing saturating shift right by constant *****
8731_NEON2SSESTORAGE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8
8732_NEON2SSE_INLINE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRUN.S16 d0,q0,#8
8733{
8734 //the solution may not be optimal compared with the serial one
8735 __m128i r16;
8736 uint8x8_t res64;
8737 r16 = vrshrq_n_s16(a,b);
8738 r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow (signed to unsigned), use low 64 bits only
8739 return64(r16);
8740}
8741
8742_NEON2SSESTORAGE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16
8743_NEON2SSE_INLINE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRUN.S32 d0,q0,#16
8744{
8745 //the solution may not be optimal compared with the serial one
8746 __m128i r32;
8747 uint16x4_t res64;
8748 r32 = vrshrq_n_s32(a,b);
8749 r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow (signed to unsigned), use low 64 bits only
8750 return64(r32);
8751}
8752
8753_NEON2SSESTORAGE uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32
8754_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster
8755{
8756 _NEON2SSE_ALIGN_16 int64_t atmp[2];
8757 uint32x2_t res;
8758 int64_t res64;
8759 _mm_store_si128((__m128i*)atmp, a);
8760 if (atmp[0] < 0) {
8761 res.m64_u32[0] = 0;
8762 } else {
8763 res64 = (atmp[0] >> b) + ( (atmp[0] & ((int64_t)1 << (b - 1))) >> (b - 1) );
8764 res.m64_u32[0] = (uint32_t) ((res64 > (int64_t)0xffffffff ) ? 0xffffffff : res64);
8765 }
8766 if (atmp[1] < 0) {
8767 res.m64_u32[1] = 0;
8768 } else {
8769 res64 = (atmp[1] >> b) + ( (atmp[1] & ((int64_t)1 << (b - 1))) >> (b - 1) );
8770 res.m64_u32[1] = (uint32_t)((res64 > (int64_t)0xffffffff ) ? 0xffffffff : res64);
8771 }
8772 return res;
8773}
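//The rounding term used above (and in the other serial vqrshr* versions) is simply the last bit shifted out:
//  rounded = (a >> b) + ((a >> (b - 1)) & 1)
//e.g. a = 7, b = 2: 7 >> 2 = 1, the last bit shifted out is 1, so the rounded result is 2 (7/4 = 1.75 -> 2).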
8774
8775//***** Vector narrowing saturating shift right by constant ******
8776//*****************************************************************
8777_NEON2SSESTORAGE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8
8778_NEON2SSE_INLINE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRN.S16 d0,q0,#8
8779{
8780 int8x8_t res64;
8781 __m128i r16;
8782 r16 = vshrq_n_s16(a,b);
8783 r16 = _mm_packs_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
8784 return64(r16);
8785}
8786
8787_NEON2SSESTORAGE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16
8788_NEON2SSE_INLINE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRN.S32 d0,q0,#16
8789{
8790 int16x4_t res64;
8791 __m128i r32;
8792 r32 = vshrq_n_s32(a,b);
8793 r32 = _mm_packs_epi32 (r32,r32); //saturate and narrow, use low 64 bits only
8794 return64(r32);
8795}
8796
8797_NEON2SSESTORAGE int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32
8798_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
8799{
8800 //no optimal SIMD solution found
8801 _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2];
8802 int32x2_t res;
8803 _mm_store_si128((__m128i*)atmp, a);
8804 res64[0] = (atmp[0] >> b);
8805 res64[1] = (atmp[1] >> b);
8806 if(res64[0]>SINT_MAX) res64[0] = SINT_MAX;
8807 if(res64[0]<SINT_MIN) res64[0] = SINT_MIN;
8808 if(res64[1]>SINT_MAX) res64[1] = SINT_MAX;
8809 if(res64[1]<SINT_MIN) res64[1] = SINT_MIN;
8810 res.m64_i32[0] = (int32_t)res64[0];
8811 res.m64_i32[1] = (int32_t)res64[1];
8812 return res;
8813}
8814
8815_NEON2SSESTORAGE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.U16 d0,q0,#8
8816_NEON2SSE_INLINE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQSHRN.U16 d0,q0,#8
8817{
8818 uint8x8_t res64;
8819 __m128i r16;
8820 r16 = vshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
8821 r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
8822 return64(r16);
8823}
8824
8825_NEON2SSESTORAGE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16
8826_NEON2SSE_INLINE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQSHRN.U32 d0,q0,#16
8827{
8828 uint16x4_t res64;
8829 __m128i r32;
8830 r32 = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
8831 r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only
8832 return64(r32);
8833}
8834
8835_NEON2SSESTORAGE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32
8836_NEON2SSE_INLINE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
8837{
8838 //serial solution may be faster
8839 uint32x2_t res64;
8840 __m128i r64, res_hi, zero;
8841 zero = _mm_setzero_si128();
8842 r64 = vshrq_n_u64(a,b);
8843 res_hi = _mm_srli_epi64(r64, 32);
8844 res_hi = _mm_cmpgt_epi32(res_hi, zero);
8845 r64 = _mm_or_si128(r64, res_hi);
8846 r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
8847 return64(r64);
8848}
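//How the saturation above works: after the logical shift each 64-bit lane must fit into 32 bits. res_hi holds the upper
//32 bits of every lane (moved to the lane's low half by _mm_srli_epi64); since b >= 1 that half is below 2^31, so the
//signed _mm_cmpgt_epi32 against zero turns any non-zero upper half into 0xffffffff, and OR-ing it back forces the low
//half to the saturated value 0xffffffff. The final shuffle then packs the two low halves together.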
8849
8850
8851//********* Vector rounding narrowing shift right by constant *************************
8852//****************************************************************************************
8853_NEON2SSESTORAGE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
8854_NEON2SSE_INLINE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8
8855{
8856 int8x8_t res64;
8857 __m128i r16;
8858 r16 = vrshrq_n_s16(a,b);
8859 r16 = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
8860 return64(r16);
8861}
8862
8863_NEON2SSESTORAGE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
8864_NEON2SSE_INLINE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16
8865{
8866 int16x4_t res64;
8867 __m128i r32;
8868 r32 = vrshrq_n_s32(a,b);
8869 r32 = _mm_shuffle_epi8 (r32, *(__m128i*) mask8_32_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
8870 return64(r32);
8871}
8872
8873_NEON2SSESTORAGE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
8874_NEON2SSE_INLINE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b)
8875{
8876 int32x2_t res64;
8877 __m128i r64;
8878 r64 = vrshrq_n_s64(a,b);
8879 r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
8880 return64(r64);
8881}
8882
8883_NEON2SSESTORAGE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
8884_NEON2SSE_INLINE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8
8885{
8886 uint8x8_t res64;
8887 __m128i mask, r16;
8888 mask = _mm_set1_epi16(0xff);
8889 r16 = vrshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
8890 r16 = _mm_and_si128(r16, mask); //to avoid saturation
8891 r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
8892 return64(r16);
8893}
8894
8895_NEON2SSESTORAGE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
8896_NEON2SSE_INLINE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16
8897{
8898 uint16x4_t res64;
8899 __m128i mask, r32;
8900 mask = _mm_set1_epi32(0xffff);
8901 r32 = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
8902 r32 = _mm_and_si128(r32, mask); //to avoid saturation
8903 r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only
8904 return64(r32);
8905}
8906
8907_NEON2SSESTORAGE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
8908_NEON2SSE_INLINE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b) //serial solution may be faster
8909{
8910 uint32x2_t res64;
8911 __m128i r64;
8912 r64 = vrshrq_n_u64(a,b);
8913 r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
8914 return64(r64);
8915}
8916
8917//************* Vector rounding narrowing saturating shift right by constant ************
8918//****************************************************************************************
8919_NEON2SSESTORAGE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8
8920_NEON2SSE_INLINE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRN.S16 d0,q0,#8
8921{
8922 int8x8_t res64;
8923 __m128i r16;
8924 r16 = vrshrq_n_s16(a,b);
8925 r16 = _mm_packs_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
8926 return64(r16);
8927}
8928
8929_NEON2SSESTORAGE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16
8930_NEON2SSE_INLINE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRN.S32 d0,q0,#16
8931{
8932 int16x4_t res64;
8933 __m128i r32;
8934 r32 = vrshrq_n_s32(a,b);
8935 r32 = _mm_packs_epi32 (r32,r32); //saturate and narrow, use low 64 bits only
8936 return64(r32);
8937}
8938
8939_NEON2SSESTORAGE int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32
8940_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
8941{
8942 //no optimal SIMD solution found
8943 _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2], maskb[2];
8944 int32x2_t res;
8945 _mm_store_si128((__m128i*)atmp, a);
8946 maskb[0] = atmp[0] & (( int64_t)1 << (b - 1));
8947 res64[0] = (atmp[0] >> b) + (maskb[0] >> (b - 1)); //rounded result
8948 maskb[1] = atmp[1] & (( int64_t)1 << (b - 1));
8949 res64[1] = (atmp[1] >> b) + (maskb[1] >> (b - 1)); //rounded result
8950 if(res64[0]>SINT_MAX) res64[0] = SINT_MAX;
8951 if(res64[0]<SINT_MIN) res64[0] = SINT_MIN;
8952 if(res64[1]>SINT_MAX) res64[1] = SINT_MAX;
8953 if(res64[1]<SINT_MIN) res64[1] = SINT_MIN;
8954 res.m64_i32[0] = (int32_t)res64[0];
8955 res.m64_i32[1] = (int32_t)res64[1];
8956 return res;
8957}
8958
8959_NEON2SSESTORAGE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.U16 d0,q0,#8
8960_NEON2SSE_INLINE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQRSHRN.U16 d0,q0,#8
8961{
8962 uint8x8_t res64;
8963 __m128i r16;
8964 r16 = vrshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
8965 r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
8966 return64(r16);
8967}
8968
8969_NEON2SSESTORAGE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16
8970_NEON2SSE_INLINE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQRSHRN.U32 d0,q0,#16
8971{
8972 uint16x4_t res64;
8973 __m128i r32;
8974 r32 = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
8975 r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only
8976 return64(r32);
8977}
8978
8979_NEON2SSESTORAGE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32
8980_NEON2SSE_INLINE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
8981{
8982 //serial solution may be faster
8983 uint32x2_t res64;
8984 __m128i r64, res_hi, zero;
8985 zero = _mm_setzero_si128();
8986 r64 = vrshrq_n_u64(a,b);
8987 res_hi = _mm_srli_epi64(r64, 32);
8988 res_hi = _mm_cmpgt_epi32(res_hi, zero);
8989 r64 = _mm_or_si128(r64, res_hi);
8990 r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
8991 return64(r64);
8992}
8993
8994//************** Vector widening shift left by constant ****************
8995//************************************************************************
8996_NEON2SSESTORAGE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0
8997_NEON2SSE_INLINE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b) // VSHLL.S8 q0,d0,#0
8998{
8999 __m128i r;
9000 r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
9001 return _mm_slli_epi16 (r, b);
9002}
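//Illustration: vshll_n_s8 first widens to 16 bits and then shifts, so no bits are lost;
//e.g. an input lane of -3 shifted by 8 yields -768, a value that an 8-bit shift could not hold.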
9003
9004_NEON2SSESTORAGE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0
9005_NEON2SSE_INLINE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b) // VSHLL.S16 q0,d0,#0
9006{
9007 __m128i r;
9008 r = _MM_CVTEPI16_EPI32(_pM128i(a)); //SSE4.1,
9009 return _mm_slli_epi32 (r, b);
9010}
9011
9012_NEON2SSESTORAGE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0
9013_NEON2SSE_INLINE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b) // VSHLL.S32 q0,d0,#0
9014{
9015 __m128i r;
9016 r = _MM_CVTEPI32_EPI64(_pM128i(a)); //SSE4.1,
9017 return _mm_slli_epi64 (r, b);
9018}
9019
9020_NEON2SSESTORAGE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0
9021_NEON2SSE_INLINE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b) // VSHLL.U8 q0,d0,#0
9022{
9023 //no uint8 to uint16 conversion available, manual conversion used
9024 __m128i zero, r;
9025 zero = _mm_setzero_si128 ();
9026 r = _mm_unpacklo_epi8(_pM128i(a), zero);
9027 return _mm_slli_epi16 (r, b);
9028}
9029
9030_NEON2SSESTORAGE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.U16 q0,d0,#0
9031_NEON2SSE_INLINE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b) // VSHLL.U16 q0,d0,#0
9032{
9033 //no uint16 to uint32 conversion available, manual conversion used
9034 __m128i zero, r;
9035 zero = _mm_setzero_si128 ();
9036 r = _mm_unpacklo_epi16(_pM128i(a), zero);
9037 return _mm_slli_epi32 (r, b);
9038}
9039
9040_NEON2SSESTORAGE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0
9041_NEON2SSE_INLINE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b) // VSHLL.U32 q0,d0,#0
9042{
9043 //no uint32 to uint64 conversion available, manual conversion used
9044 __m128i zero, r;
9045 zero = _mm_setzero_si128 ();
9046 r = _mm_unpacklo_epi32(_pM128i(a), zero);
9047 return _mm_slli_epi64 (r, b);
9048}
9049
9050//************************************************************************************
9051//**************************** Shifts with insert ************************************
9052//************************************************************************************
9053//takes each element in a vector, shifts it by an immediate value,
9054//and inserts the result in the destination vector. Bits shifted out of each element are lost.
9055
9056//**************** Vector shift right and insert ************************************
9057//Actually the "c" leftmost bits of "a" are the only bits remaining from "a" after the shift;
9058//all other bits are taken from the shifted "b".
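//A small 8-bit illustration: for a = 0xAB, b = 0xCD, c = 4 the result is
//  (a & 0xF0) | (b >> 4) = 0xA0 | 0x0C = 0xAC
//i.e. the top c bits come from "a" and the rest are the logically shifted "b".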
9059_NEON2SSESTORAGE int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
9060_NEON2SSE_INLINE int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c)
9061{
9062 int8x8_t res64;
9063 return64(vsriq_n_s8(_pM128i(a),_pM128i(b), c));
9064}
9065
9066
9067_NEON2SSESTORAGE int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
9068_NEON2SSE_INLINE int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c)
9069{
9070 int16x4_t res64;
9071 return64(vsriq_n_s16(_pM128i(a),_pM128i(b), c));
9072}
9073
9074
9075_NEON2SSESTORAGE int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
9076_NEON2SSE_INLINE int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c)
9077{
9078 int32x2_t res64;
9079 return64(vsriq_n_s32(_pM128i(a),_pM128i(b), c));
9080}
9081
9082
9083_NEON2SSESTORAGE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
9084_NEON2SSE_INLINE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c)
9085{
9086 int64x1_t res;
9087 if (c == 64)
9088 res = a;
9089 else {
9090 res.m64_i64[0] = (b.m64_u64[0] >> c) | ((a.m64_i64[0] >> (64 - c)) << (64 - c)); //treat b as unsigned for the shift to get leading zeros
9091 }
9092 return res;
9093}
9094
9095_NEON2SSE_GLOBAL uint8x8_t vsri_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
9096#define vsri_n_u8 vsri_n_s8
9097
9098_NEON2SSE_GLOBAL uint16x4_t vsri_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
9099#define vsri_n_u16 vsri_n_s16
9100
9101_NEON2SSE_GLOBAL uint32x2_t vsri_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
9102#define vsri_n_u32 vsri_n_s32
9103
9104
9105_NEON2SSE_GLOBAL uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
9106#define vsri_n_u64 vsri_n_s64
9107
9108_NEON2SSE_GLOBAL poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
9109#define vsri_n_p8 vsri_n_u8
9110
9111_NEON2SSE_GLOBAL poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
9112#define vsri_n_p16 vsri_n_u16
9113
9114_NEON2SSESTORAGE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
9115_NEON2SSE_INLINE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRI.8 q0,q0,#8
9116{
9117 __m128i maskA, a_masked;
9118 uint8x16_t b_shift;
9119 _NEON2SSE_ALIGN_16 static const uint8_t maskLeft[9] = {0x0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; //"a" bits mask, 0 bit not used
9120 maskA = _mm_set1_epi8(maskLeft[c]); // c ones and (8-c)zeros
9121 a_masked = _mm_and_si128 (a, maskA);
9122 b_shift = vshrq_n_u8( b, c); // c zeros on the left in b due to logical shift
9123 return _mm_or_si128 (a_masked, b_shift); //combine (insert b into a)
9124}
9125
9126_NEON2SSESTORAGE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
9127_NEON2SSE_INLINE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRI.16 q0,q0,#16
9128{
9129 //to keep only the "c" leftmost bits of a we shift right and then back left, leaving (16-c) zeros at the right of a
9130 uint16x8_t b_shift;
9131 uint16x8_t a_c;
9132 b_shift = vshrq_n_u16( b, c); // c zeros on the left in b due to logical shift
9133 a_c = vshrq_n_u16( a, (16 - c));
9134 a_c = _mm_slli_epi16(a_c, (16 - c)); //the logical shifts leave (16-c) zero bits at the right of a
9135 return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
9136}
9137
9138_NEON2SSESTORAGE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
9139_NEON2SSE_INLINE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRI.32 q0,q0,#32
9140{
9141 //to keep only the "c" leftmost bits of a we shift right and then back left, leaving (32-c) zeros at the right of a
9142 uint32x4_t b_shift;
9143 uint32x4_t a_c;
9144 b_shift = vshrq_n_u32( b, c); // c zeros on the left in b due to logical shift
9145 a_c = vshrq_n_u32( a, (32 - c));
9146 a_c = _mm_slli_epi32(a_c, (32 - c)); //the logical shifts leave (32-c) zero bits at the right of a
9147 return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
9148}
9149
9150_NEON2SSESTORAGE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
9151_NEON2SSE_INLINE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)
9152{
9153 //serial solution may be faster
9154 uint64x2_t b_shift;
9155 uint64x2_t a_c;
9156 b_shift = _mm_srli_epi64(b, c); // c zeros on the left in b due to logical shift
9157 a_c = _mm_srli_epi64(a, (64 - c));
9158 a_c = _mm_slli_epi64(a_c, (64 - c)); //the logical shifts leave (64-c) zero bits at the right of a
9159 return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
9160}
9161
9162_NEON2SSE_GLOBAL uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
9163#define vsriq_n_u8 vsriq_n_s8
9164
9165_NEON2SSE_GLOBAL uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
9166#define vsriq_n_u16 vsriq_n_s16
9167
9168_NEON2SSE_GLOBAL uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
9169#define vsriq_n_u32 vsriq_n_s32
9170
9171_NEON2SSE_GLOBAL uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
9172#define vsriq_n_u64 vsriq_n_s64
9173
9174_NEON2SSE_GLOBAL poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
9175#define vsriq_n_p8 vsriq_n_u8
9176
9177_NEON2SSE_GLOBAL poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
9178#define vsriq_n_p16 vsriq_n_u16
9179
9180//***** Vector shift left and insert *********************************************
9181//*********************************************************************************
9182//Actually the "c" rightmost bits of "a" are the only bits remaining from "a" after the shift;
9183//all other bits are taken from the shifted "b". Trailing zeros are inserted into b by the shift process, so we need to combine "a" and the shifted "b".
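//A small 8-bit illustration: for a = 0xAB, b = 0xCD, c = 4 the result is
//  (b << 4) | (a & 0x0F) = 0xD0 | 0x0B = 0xDB
//i.e. the bottom c bits come from "a" and the rest are the shifted "b".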
9184_NEON2SSESTORAGE int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
9185_NEON2SSE_INLINE int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c)
9186{
9187 int8x8_t res64;
9188 return64(vsliq_n_s8(_pM128i(a),_pM128i(b), c));
9189}
9190
9191
9192_NEON2SSESTORAGE int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
9193_NEON2SSE_INLINE int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c)
9194{
9195 int16x4_t res64;
9196 return64(vsliq_n_s16(_pM128i(a),_pM128i(b), c));
9197}
9198
9199
9200_NEON2SSESTORAGE int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
9201_NEON2SSE_INLINE int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c)
9202{
9203 int32x2_t res64;
9204 return64(vsliq_n_s32(_pM128i(a),_pM128i(b), c));
9205}
9206
9207_NEON2SSESTORAGE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
9208_NEON2SSE_INLINE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c)
9209{
9210 int64x1_t res;
9211 res.m64_i64[0] = (b.m64_i64[0] << c) | ((a.m64_u64[0] << (64 - c)) >> (64 - c)); //need to treat a as unsigned to get leading zeros
9212 return res;
9213}
9214
9215
9216_NEON2SSE_GLOBAL uint8x8_t vsli_n_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
9217#define vsli_n_u8 vsli_n_s8
9218
9219_NEON2SSE_GLOBAL uint16x4_t vsli_n_u16(uint16x4_t a, uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
9220#define vsli_n_u16 vsli_n_s16
9221
9222_NEON2SSE_GLOBAL uint32x2_t vsli_n_u32(uint32x2_t a, uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
9223#define vsli_n_u32 vsli_n_s32
9224
9225_NEON2SSE_GLOBAL uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
9226#define vsli_n_u64 vsli_n_s64
9227
9228_NEON2SSE_GLOBAL poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
9229#define vsli_n_p8 vsli_n_u8
9230
9231_NEON2SSE_GLOBAL poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
9232#define vsli_n_p16 vsli_n_u16
9233
9234_NEON2SSESTORAGE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
9235_NEON2SSE_INLINE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c) // VSLI.8 q0,q0,#0
9236{
9237 __m128i maskA, a_masked;
9238 int8x16_t b_shift;
9239 _NEON2SSE_ALIGN_16 static const uint8_t maskRight[8] = {0x0, 0x1, 0x3, 0x7, 0x0f, 0x1f, 0x3f, 0x7f}; //"a" bits mask
9240 maskA = _mm_set1_epi8(maskRight[c]); // (8-c)zeros and c ones
9241 b_shift = vshlq_n_s8( b, c);
9242 a_masked = _mm_and_si128 (a, maskA);
9243 return _mm_or_si128 (b_shift, a_masked); //combine (insert b into a)
9244}
9245
9246_NEON2SSESTORAGE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
9247_NEON2SSE_INLINE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c) // VSLI.16 q0,q0,#0
9248{
9249 //to keep only the "c" rightmost bits of a we shift left and then logically shift back right, leaving (16-c) zeros at the left of a
9250 int16x8_t b_shift;
9251 int16x8_t a_c;
9252 b_shift = vshlq_n_s16( b, c);
9253 a_c = vshlq_n_s16( a, (16 - c));
9254 a_c = _mm_srli_epi16(a_c, (16 - c));
9255 return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
9256}
9257
9258_NEON2SSESTORAGE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
9259_NEON2SSE_INLINE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c) // VSLI.32 q0,q0,#0
9260{
9261 //the solution may not be optimal compared with the serial one
9262 //to keep only the "c" rightmost bits of a we shift left and then logically shift back right, leaving (32-c) zeros at the left of a
9263 int32x4_t b_shift;
9264 int32x4_t a_c;
9265 b_shift = vshlq_n_s32( b, c);
9266 a_c = vshlq_n_s32( a, (32 - c));
9267 a_c = _mm_srli_epi32(a_c, (32 - c));
9268 return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
9269}
9270
9271_NEON2SSESTORAGE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
9272_NEON2SSE_INLINE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c) // VSLI.64 q0,q0,#0
9273{
9274 //the solution may not be optimal compared with the serial one
9275 //to keep only the "c" rightmost bits of a we shift left and then logically shift back right, leaving (64-c) zeros at the left of a
9276 int64x2_t b_shift;
9277 int64x2_t a_c;
9278 b_shift = vshlq_n_s64( b, c);
9279 a_c = vshlq_n_s64( a, (64 - c));
9280 a_c = _mm_srli_epi64(a_c, (64 - c));
9281 return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
9282}
9283
9284_NEON2SSE_GLOBAL uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
9285#define vsliq_n_u8 vsliq_n_s8
9286
9287_NEON2SSE_GLOBAL uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
9288#define vsliq_n_u16 vsliq_n_s16
9289
9290_NEON2SSE_GLOBAL uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
9291#define vsliq_n_u32 vsliq_n_s32
9292
9293_NEON2SSE_GLOBAL uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
9294#define vsliq_n_u64 vsliq_n_s64
9295
9296_NEON2SSE_GLOBAL poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
9297#define vsliq_n_p8 vsliq_n_u8
9298
9299_NEON2SSE_GLOBAL poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
9300#define vsliq_n_p16 vsliq_n_u16
9301
9302// ***********************************************************************************************
9303// ****************** Loads and stores of a single vector ***************************************
9304// ***********************************************************************************************
9305//Performs loads and stores of a single vector of some type.
9306//******************************* Loads ********************************************************
9307// ***********************************************************************************************
9308//We assume ptr is NOT aligned in the general case and use __m128i _mm_loadu_si128 ((__m128i*) ptr).
9309//On SSE3-capable systems __m128i _mm_lddqu_si128 (__m128i const* p) may be advantageous for unaligned access:
9310//it loads a 32-byte block aligned on a 16-byte boundary and extracts the 16 bytes corresponding to the unaligned access.
9311//If ptr is aligned, __m128i _mm_load_si128 ((__m128i*) ptr) could be used instead;
9312#define LOAD_SI128(ptr) \
9313 ( ((uintptr_t)(ptr) & 15) == 0 ) ? _mm_load_si128((__m128i*)(ptr)) : _mm_loadu_si128((__m128i*)(ptr))
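//Usage sketch (illustration only, the buffer name is hypothetical):
//  uint8_t buf[16];                //any alignment
//  __m128i v = LOAD_SI128(buf);    //aligned load when the pointer is 16-byte aligned, unaligned load otherwise
//The integer vld1q_* wrappers below simply expand to this macro.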
9314
9315_NEON2SSE_GLOBAL uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
9316#define vld1q_u8 LOAD_SI128
9317
9318_NEON2SSE_GLOBAL uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
9319#define vld1q_u16 LOAD_SI128
9320
9321_NEON2SSE_GLOBAL uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
9322#define vld1q_u32 LOAD_SI128
9323
9324_NEON2SSE_GLOBAL uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
9325#define vld1q_u64 LOAD_SI128
9326
9327_NEON2SSE_GLOBAL int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
9328#define vld1q_s8 LOAD_SI128
9329
9330_NEON2SSE_GLOBAL int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
9331#define vld1q_s16 LOAD_SI128
9332
9333_NEON2SSE_GLOBAL int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
9334#define vld1q_s32 LOAD_SI128
9335
9336_NEON2SSE_GLOBAL int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
9337#define vld1q_s64 LOAD_SI128
9338
9339_NEON2SSE_GLOBAL float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0]
9340// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers
9341/* _NEON2SSE_INLINE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr)// VLD1.16 {d0, d1}, [r0]
9342{__m128 f1 = _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]);
9343__m128 f2;
9344f2 = _mm_set_ps (ptr[7], ptr[6], ptr[5], ptr[4]);
9345}*/
9346
9347_NEON2SSESTORAGE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
9348_NEON2SSE_INLINE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr)
9349{
9350 if( (((uintptr_t)(ptr)) & 15 ) == 0 ) //16-byte aligned
9351 return _mm_load_ps(ptr);
9352 else
9353 return _mm_loadu_ps(ptr);
9354}
9355
9356_NEON2SSE_GLOBAL poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
9357#define vld1q_p8 LOAD_SI128
9358
9359_NEON2SSE_GLOBAL poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
9360#define vld1q_p16 LOAD_SI128
9361
9362_NEON2SSE_GLOBAL uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0]
9363#define vld1_u8(ptr) *((__m64_128*)(ptr)) //was _mm_loadl_epi64((__m128i*)(ptr))
9364
9365_NEON2SSE_GLOBAL uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0]
9366#define vld1_u16 vld1_u8
9367
9368_NEON2SSE_GLOBAL uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0]
9369#define vld1_u32 vld1_u8
9370
9371
9372_NEON2SSE_GLOBAL uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
9373#define vld1_u64 vld1_u8
9374
9375_NEON2SSE_GLOBAL int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0]
9376#define vld1_s8 vld1_u8
9377
9378_NEON2SSE_GLOBAL int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0]
9379#define vld1_s16 vld1_u16
9380
9381_NEON2SSE_GLOBAL int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0]
9382#define vld1_s32 vld1_u32
9383
9384_NEON2SSE_GLOBAL int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
9385#define vld1_s64 vld1_u64
9386
9387_NEON2SSE_GLOBAL float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0]
9388// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit like _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]);
9389
9390_NEON2SSESTORAGE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0]
9391_NEON2SSE_INLINE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr)
9392{
9393 float32x2_t res;
9394 res.m64_f32[0] = *(ptr);
9395 res.m64_f32[1] = *(ptr + 1);
9396 return res;
9397}
9398
9399_NEON2SSE_GLOBAL poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0]
9400#define vld1_p8 vld1_u8
9401
9402_NEON2SSE_GLOBAL poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0]
9403#define vld1_p16 vld1_u16
9404
9405
9406_NEON2SSESTORAGE float64x2_t vld1q_f64(__transfersize(2) float64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
9407_NEON2SSE_INLINE float64x2_t vld1q_f64(__transfersize(2) float64_t const * ptr)
9408{
9409 if ((((uintptr_t)(ptr)) & 15) == 0) //16-byte aligned
9410 return _mm_load_pd(ptr);
9411 else
9412 return _mm_loadu_pd(ptr);
9413}
9414
9415
9416//***********************************************************************************************************
9417//******* Lane load functions - insert the data at vector's given position (lane) *************************
9418//***********************************************************************************************************
9419_NEON2SSE_GLOBAL uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
9420#define vld1q_lane_u8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
9421
9422_NEON2SSE_GLOBAL uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
9423#define vld1q_lane_u16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
9424
9425_NEON2SSE_GLOBAL uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
9426#define vld1q_lane_u32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane)
9427
9428_NEON2SSE_GLOBAL uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
9429#define vld1q_lane_u64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane)
9430
9431
9432_NEON2SSE_GLOBAL int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
9433#define vld1q_lane_s8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
9434
9435_NEON2SSE_GLOBAL int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
9436#define vld1q_lane_s16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
9437
9438_NEON2SSE_GLOBAL int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
9439#define vld1q_lane_s32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane)
9440
9441_NEON2SSE_GLOBAL float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
9442//current IA SIMD doesn't support float16
9443
9444_NEON2SSESTORAGE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
9445_NEON2SSE_INLINE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane)
9446{
9447 //we need to handle the case when ptr is not 16-byte aligned, hence the scalar load
9448 __m128 p;
9449 p = _mm_set1_ps(*(ptr));
9450 return _MM_INSERT_PS(vec, p, _INSERTPS_NDX(0, lane));
9451}
9452
9453_NEON2SSE_GLOBAL int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
9454#define vld1q_lane_s64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane)
9455
9456_NEON2SSE_GLOBAL poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
9457#define vld1q_lane_p8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
9458
9459_NEON2SSE_GLOBAL poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
9460#define vld1q_lane_p16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
9461
9462_NEON2SSESTORAGE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
9463_NEON2SSE_INLINE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane)
9464{
9465 uint8x8_t res;
9466 res = vec;
9467 res.m64_u8[lane] = *(ptr);
9468 return res;
9469}
9470
9471_NEON2SSESTORAGE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
9472_NEON2SSE_INLINE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane)
9473{
9474 uint16x4_t res;
9475 res = vec;
9476 res.m64_u16[lane] = *(ptr);
9477 return res;
9478}
9479
9480_NEON2SSESTORAGE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
9481_NEON2SSE_INLINE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane)
9482{
9483 uint32x2_t res;
9484 res = vec;
9485 res.m64_u32[lane] = *(ptr);
9486 return res;
9487}
9488
9489_NEON2SSESTORAGE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0]
9490_NEON2SSE_INLINE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane)
9491{
9492 uint64x1_t res;
9493 UNREFERENCED_PARAMETER(vec);
9494 UNREFERENCED_PARAMETER(lane);
9495 res.m64_u64[0] = *(ptr);
9496 return res;
9497}
9498
9499
9500_NEON2SSE_GLOBAL int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
9501#define vld1_lane_s8(ptr, vec, lane) vld1_lane_u8((uint8_t*)ptr, vec, lane)
9502
9503_NEON2SSE_GLOBAL int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
9504#define vld1_lane_s16(ptr, vec, lane) vld1_lane_u16((uint16_t*)ptr, vec, lane)
9505
9506_NEON2SSE_GLOBAL int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
9507#define vld1_lane_s32(ptr, vec, lane) vld1_lane_u32((uint32_t*)ptr, vec, lane)
9508
9509_NEON2SSE_GLOBAL float16x4_t vld1_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
9510//current IA SIMD doesn't support float16
9511
9512_NEON2SSESTORAGE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
9513_NEON2SSE_INLINE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane)
9514{
9515 float32x2_t res;
9516 res = vec;
9517 res.m64_f32[lane] = *(ptr);
9518 return res;
9519}
9520
9521_NEON2SSE_GLOBAL int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0]
9522#define vld1_lane_s64(ptr, vec, lane) vld1_lane_u64((uint64_t*)ptr, vec, lane)
9523
9524_NEON2SSE_GLOBAL poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
9525#define vld1_lane_p8 vld1_lane_u8
9526
9527_NEON2SSE_GLOBAL poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
9528#define vld1_lane_p16 vld1_lane_s16
9529
// ****************** Load a single value (set all lanes of the vector to the same value from memory) **********************
9531// ******************************************************************************************************************
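//A minimal usage sketch (illustrative only; the variable name is hypothetical): broadcast one
//scalar from memory into every lane of the vector.
//    float32_t coeff = 2.5f;
//    float32x4_t vc = vld1q_dup_f32(&coeff); //{2.5, 2.5, 2.5, 2.5}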
9532_NEON2SSE_GLOBAL uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
9533#define vld1q_dup_u8(ptr) _mm_set1_epi8(*(ptr))
9534
9535_NEON2SSE_GLOBAL uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
9536#define vld1q_dup_u16(ptr) _mm_set1_epi16(*(ptr))
9537
9538_NEON2SSE_GLOBAL uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
9539#define vld1q_dup_u32(ptr) _mm_set1_epi32(*(ptr))
9540
9541_NEON2SSESTORAGE uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
9542_NEON2SSE_INLINE uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr)
9543{
9544 _NEON2SSE_ALIGN_16 uint64_t val[2];
9545
9546 val[0] = *(ptr);
9547 val[1] = *(ptr);
9548
9549 return LOAD_SI128(val);
9550}
9551
9552_NEON2SSE_GLOBAL int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
9553#define vld1q_dup_s8(ptr) _mm_set1_epi8(*(ptr))
9554
9555_NEON2SSE_GLOBAL int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
9556#define vld1q_dup_s16(ptr) _mm_set1_epi16 (*(ptr))
9557
9558_NEON2SSE_GLOBAL int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
9559#define vld1q_dup_s32(ptr) _mm_set1_epi32 (*(ptr))
9560
9561_NEON2SSE_GLOBAL int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
9562#define vld1q_dup_s64(ptr) vld1q_dup_u64((uint64_t*)ptr)
9563
9564_NEON2SSE_GLOBAL float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
9565//current IA SIMD doesn't support float16, need to go to 32 bits
9566
9567_NEON2SSE_GLOBAL float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
9568#define vld1q_dup_f32(ptr) _mm_set1_ps (*(ptr))
9569
9570_NEON2SSE_GLOBAL poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
9571#define vld1q_dup_p8(ptr) _mm_set1_epi8(*(ptr))
9572
9573_NEON2SSE_GLOBAL poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
9574#define vld1q_dup_p16(ptr) _mm_set1_epi16 (*(ptr))
9575
9576_NEON2SSESTORAGE uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
9577_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
9578{
9579 uint8x8_t res;
9580 int i;
9581 for(i = 0; i<8; i++) {
9582 res.m64_u8[i] = *(ptr);
9583 }
9584 return res;
9585}
9586
9587_NEON2SSESTORAGE uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
9588_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
9589{
9590 uint16x4_t res;
9591 int i;
9592 for(i = 0; i<4; i++) {
9593 res.m64_u16[i] = *(ptr);
9594 }
9595 return res;
9596}
9597
9598_NEON2SSESTORAGE uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
9599_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
9600{
9601 uint32x2_t res;
9602 res.m64_u32[0] = *(ptr);
9603 res.m64_u32[1] = *(ptr);
9604 return res;
9605}
9606
9607_NEON2SSESTORAGE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
9608_NEON2SSE_INLINE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr)
9609{
9610 uint64x1_t res;
9611 res.m64_u64[0] = *(ptr);
9612 return res;
9613}
9614
9615_NEON2SSE_GLOBAL int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
9616#define vld1_dup_s8(ptr) vld1_dup_u8((uint8_t*)ptr)
9617
9618
9619_NEON2SSE_GLOBAL int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
9620#define vld1_dup_s16(ptr) vld1_dup_u16((uint16_t*)ptr)
9621
9622
9623_NEON2SSE_GLOBAL int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
9624#define vld1_dup_s32(ptr) vld1_dup_u32((uint32_t*)ptr)
9625
9626
9627_NEON2SSE_GLOBAL int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
9628#define vld1_dup_s64(ptr) vld1_dup_u64((uint64_t*)ptr)
9629
9630_NEON2SSE_GLOBAL float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
9631//current IA SIMD doesn't support float16
9632
9633_NEON2SSESTORAGE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
9634_NEON2SSE_INLINE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr)
9635{
9636 float32x2_t res;
9637 res.m64_f32[0] = *(ptr);
9638 res.m64_f32[1] = res.m64_f32[0];
9639 return res; // use last 64bits only
9640}
9641
9642_NEON2SSE_GLOBAL poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
9643#define vld1_dup_p8 vld1_dup_u8
9644
9645
9646_NEON2SSE_GLOBAL poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
9647#define vld1_dup_p16 vld1_dup_u16
9648
9649
9650//*************************************************************************************
9651//********************************* Store **********************************************
9652//*************************************************************************************
// If ptr is 16-byte aligned and you need to store the data without polluting the cache, you may use _mm_stream_si128 ((__m128i*)ptr, val) instead
//here we assume that ptr may be NOT 16-byte aligned in the general case; the following macro checks the alignment at run time and uses _mm_store_si128 only when it is safe
9655#define STORE_SI128(ptr, val) \
9656 (((uintptr_t)(ptr) & 15) == 0 ) ? _mm_store_si128 ((__m128i*)(ptr), val) : _mm_storeu_si128 ((__m128i*)(ptr), val);
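//A minimal usage sketch (illustrative only; a and b stand for previously computed uint32x4_t values).
//The macro above selects the aligned or unaligned store at run time, so any valid destination
//pointer works; _mm_stream_si128 remains an option for 16-byte aligned non-temporal stores.
//    uint32_t out[4];                        //no particular alignment required
//    vst1q_u32(out, vaddq_u32(a, b));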
9657
9658_NEON2SSE_GLOBAL void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0]
9659#define vst1q_u8 STORE_SI128
9660
9661_NEON2SSE_GLOBAL void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0]
9662#define vst1q_u16 STORE_SI128
9663
9664_NEON2SSE_GLOBAL void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0]
9665#define vst1q_u32 STORE_SI128
9666
9667_NEON2SSE_GLOBAL void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0]
9668#define vst1q_u64 STORE_SI128
9669
9670_NEON2SSE_GLOBAL void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0]
9671#define vst1q_s8 STORE_SI128
9672
9673_NEON2SSE_GLOBAL void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0]
9674#define vst1q_s16 STORE_SI128
9675
9676_NEON2SSE_GLOBAL void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0]
9677#define vst1q_s32 STORE_SI128
9678
9679_NEON2SSE_GLOBAL void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0]
9680#define vst1q_s64 STORE_SI128
9681
9682_NEON2SSE_GLOBAL void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0]
9683// IA32 SIMD doesn't work with 16bit floats currently
9684
9685_NEON2SSESTORAGE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0]
9686_NEON2SSE_INLINE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val)
9687{
    if( ((uintptr_t)(ptr) & 15) == 0 ) //16-byte aligned
9689 _mm_store_ps (ptr, val);
9690 else
9691 _mm_storeu_ps (ptr, val);
9692}
9693
9694_NEON2SSE_GLOBAL void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0]
9695#define vst1q_p8 vst1q_u8
9696
9697_NEON2SSE_GLOBAL void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0]
9698#define vst1q_p16 vst1q_u16
9699
9700_NEON2SSESTORAGE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0]
9701_NEON2SSE_INLINE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val)
9702{
9703 int i;
9704 for (i = 0; i<8; i++) {
9705 *(ptr + i) = ((uint8_t*)&val)[i];
9706 }
9707 //_mm_storel_epi64((__m128i*)ptr, val);
9708 return;
9709}
9710
9711_NEON2SSESTORAGE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0]
9712_NEON2SSE_INLINE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val)
9713{
9714 int i;
9715 for (i = 0; i<4; i++) {
9716 *(ptr + i) = ((uint16_t*)&val)[i];
9717 }
9718 //_mm_storel_epi64((__m128i*)ptr, val);
9719 return;
9720}
9721
9722_NEON2SSESTORAGE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0]
9723_NEON2SSE_INLINE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val)
9724{
9725 int i;
9726 for (i = 0; i<2; i++) {
9727 *(ptr + i) = ((uint32_t*)&val)[i];
9728 }
9729 //_mm_storel_epi64((__m128i*)ptr, val);
9730 return;
9731}
9732
9733_NEON2SSESTORAGE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0]
9734_NEON2SSE_INLINE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val)
9735{
9736 *(ptr) = *((uint64_t*)&val);
9737 //_mm_storel_epi64((__m128i*)ptr, val);
9738 return;
9739}
9740
9741_NEON2SSE_GLOBAL void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0]
9742#define vst1_s8(ptr,val) vst1_u8((uint8_t*)ptr,val)
9743
9744_NEON2SSE_GLOBAL void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0]
9745#define vst1_s16(ptr,val) vst1_u16((uint16_t*)ptr,val)
9746
9747_NEON2SSE_GLOBAL void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0]
9748#define vst1_s32(ptr,val) vst1_u32((uint32_t*)ptr,val)
9749
9750_NEON2SSE_GLOBAL void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0]
9751#define vst1_s64(ptr,val) vst1_u64((uint64_t*)ptr,val)
9752
9753_NEON2SSE_GLOBAL void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0]
9754//current IA SIMD doesn't support float16
9755
9756_NEON2SSESTORAGE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0]
9757_NEON2SSE_INLINE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val)
9758{
9759 *(ptr) = val.m64_f32[0];
9760 *(ptr + 1) = val.m64_f32[1];
9761 return;
9762}
9763
9764_NEON2SSE_GLOBAL void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0]
9765#define vst1_p8 vst1_u8
9766
9767_NEON2SSE_GLOBAL void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0]
9768#define vst1_p16 vst1_u16
9769
9770//***********Store a lane of a vector into memory (extract given lane) *********************
9771//******************************************************************************************
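//A minimal usage sketch (illustrative only; v stands for a previously computed uint16x8_t value):
//write just one lane of the vector to memory, leaving the rest in the register.
//    uint16_t first;
//    vst1q_lane_u16(&first, v, 0);           //stores lane 0 only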
9772_NEON2SSE_GLOBAL void vst1q_lane_u8(__transfersize(1) uint8_t * ptr, uint8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
9773#define vst1q_lane_u8(ptr, val, lane) *(ptr) = (uint8_t) _MM_EXTRACT_EPI8 (val, lane)
9774
9775_NEON2SSE_GLOBAL void vst1q_lane_u16(__transfersize(1) uint16_t * ptr, uint16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
9776#define vst1q_lane_u16(ptr, val, lane) *(ptr) = (uint16_t) _MM_EXTRACT_EPI16 (val, lane)
9777
9778_NEON2SSE_GLOBAL void vst1q_lane_u32(__transfersize(1) uint32_t * ptr, uint32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
9779#define vst1q_lane_u32(ptr, val, lane) *(ptr) = (uint32_t) _MM_EXTRACT_EPI32 (val, lane)
9780
9781_NEON2SSE_GLOBAL void vst1q_lane_u64(__transfersize(1) uint64_t * ptr, uint64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0]
9782#define vst1q_lane_u64(ptr, val, lane) *(ptr) = (uint64_t) _MM_EXTRACT_EPI64 (val, lane)
9783
9784_NEON2SSE_GLOBAL void vst1q_lane_s8(__transfersize(1) int8_t * ptr, int8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
9785#define vst1q_lane_s8(ptr, val, lane) *(ptr) = (int8_t) _MM_EXTRACT_EPI8 (val, lane)
9786
9787_NEON2SSE_GLOBAL void vst1q_lane_s16(__transfersize(1) int16_t * ptr, int16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
9788#define vst1q_lane_s16(ptr, val, lane) *(ptr) = (int16_t) _MM_EXTRACT_EPI16 (val, lane)
9789
9790_NEON2SSE_GLOBAL void vst1q_lane_s32(__transfersize(1) int32_t * ptr, int32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
9791#define vst1q_lane_s32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane)
9792
9793_NEON2SSE_GLOBAL void vst1q_lane_s64(__transfersize(1) int64_t * ptr, int64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0]
9794#define vst1q_lane_s64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane)
9795
9796_NEON2SSE_GLOBAL void vst1q_lane_f16(__transfersize(1) __fp16 * ptr, float16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
9797//current IA SIMD doesn't support float16
9798
9799_NEON2SSESTORAGE void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
9800_NEON2SSE_INLINE void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane)
9801{
9802 *((int32_t*)ptr) = _MM_EXTRACT_PS(val,lane);
9803}
9804
9805_NEON2SSE_GLOBAL void vst1q_lane_p8(__transfersize(1) poly8_t * ptr, poly8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
9806#define vst1q_lane_p8 vst1q_lane_u8
9807
9808_NEON2SSE_GLOBAL void vst1q_lane_p16(__transfersize(1) poly16_t * ptr, poly16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
9809#define vst1q_lane_p16 vst1q_lane_s16
9810
9811_NEON2SSESTORAGE void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
9812_NEON2SSE_INLINE void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane)
9813{
9814 *(ptr) = val.m64_u8[lane];
9815}
9816
9817_NEON2SSESTORAGE void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
9818_NEON2SSE_INLINE void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane)
9819{
9820 *(ptr) = val.m64_u16[lane];
9821}
9822
9823_NEON2SSESTORAGE void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
9824_NEON2SSE_INLINE void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane)
9825{
9826 *(ptr) = val.m64_u32[lane];
9827}
9828
9829_NEON2SSESTORAGE void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0]
9830_NEON2SSE_INLINE void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane)
9831{
9832 UNREFERENCED_PARAMETER(lane);
9833 *(ptr) = val.m64_u64[0];
9834}
9835
9836_NEON2SSE_GLOBAL void vst1_lane_s8(__transfersize(1) int8_t * ptr, int8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
9837#define vst1_lane_s8(ptr, val, lane) vst1_lane_u8((uint8_t*)ptr, val, lane)
9838
9839_NEON2SSE_GLOBAL void vst1_lane_s16(__transfersize(1) int16_t * ptr, int16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
9840#define vst1_lane_s16(ptr, val, lane) vst1_lane_u16((uint16_t*)ptr, val, lane)
9841
9842_NEON2SSE_GLOBAL void vst1_lane_s32(__transfersize(1) int32_t * ptr, int32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
9843#define vst1_lane_s32(ptr, val, lane) vst1_lane_u32((uint32_t*)ptr, val, lane)
9844
9845
9846_NEON2SSE_GLOBAL void vst1_lane_s64(__transfersize(1) int64_t * ptr, int64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0]
9847#define vst1_lane_s64(ptr, val, lane) vst1_lane_u64((uint64_t*)ptr, val, lane)
9848
9849
9850_NEON2SSE_GLOBAL void vst1_lane_f16(__transfersize(1) __fp16 * ptr, float16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
9851//current IA SIMD doesn't support float16
9852
9853_NEON2SSESTORAGE void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
9854_NEON2SSE_INLINE void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane)
9855{
9856 *(ptr) = val.m64_f32[lane];
9857}
9858
9859_NEON2SSE_GLOBAL void vst1_lane_p8(__transfersize(1) poly8_t * ptr, poly8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
9860#define vst1_lane_p8 vst1_lane_u8
9861
9862_NEON2SSE_GLOBAL void vst1_lane_p16(__transfersize(1) poly16_t * ptr, poly16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
9863#define vst1_lane_p16 vst1_lane_s16
9864
9865//***********************************************************************************************
9866//**************** Loads and stores of an N-element structure **********************************
9867//***********************************************************************************************
//These intrinsics load or store an n-element structure. The corresponding array structure types are defined at the beginning of this file
//We assume ptr is NOT aligned in the general case; for more details see the "Loads and stores of a single vector" functions above
9870//****************** 2 elements load *********************************************
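//A minimal usage sketch (illustrative only; variable names and data are hypothetical):
//de-interleave pairs, e.g. interleaved real/imaginary data, in one call.
//    float32_t cplx[8] = {1,10, 2,20, 3,30, 4,40};  //re,im interleaved
//    float32x4x2_t ri = vld2q_f32(cplx);
//    //ri.val[0] = {1,2,3,4} (real parts), ri.val[1] = {10,20,30,40} (imaginary parts)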
9871_NEON2SSESTORAGE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
9872_NEON2SSE_INLINE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr) // VLD2.8 {d0, d2}, [r0]
9873{
9874 uint8x16x2_t v;
9875 v.val[0] = vld1q_u8(ptr);
9876 v.val[1] = vld1q_u8((ptr + 16));
9877 v = vuzpq_s8(v.val[0], v.val[1]);
9878 return v;
9879}
9880
9881_NEON2SSESTORAGE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
9882_NEON2SSE_INLINE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr) // VLD2.16 {d0, d2}, [r0]
9883{
9884 uint16x8x2_t v;
9885 v.val[0] = vld1q_u16( ptr);
9886 v.val[1] = vld1q_u16( (ptr + 8));
9887 v = vuzpq_s16(v.val[0], v.val[1]);
9888 return v;
9889}
9890
9891_NEON2SSESTORAGE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
9892_NEON2SSE_INLINE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr) // VLD2.32 {d0, d2}, [r0]
9893{
9894 uint32x4x2_t v;
9895 v.val[0] = vld1q_u32 ( ptr);
9896 v.val[1] = vld1q_u32 ( (ptr + 4));
9897 v = vuzpq_s32(v.val[0], v.val[1]);
9898 return v;
9899}
9900
9901_NEON2SSE_GLOBAL int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr);
9902#define vld2q_s8(ptr) vld2q_u8((uint8_t*) ptr)
9903
9904_NEON2SSE_GLOBAL int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
9905#define vld2q_s16(ptr) vld2q_u16((uint16_t*) ptr)
9906
9907_NEON2SSE_GLOBAL int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
9908#define vld2q_s32(ptr) vld2q_u32((uint32_t*) ptr)
9909
9910
9911_NEON2SSE_GLOBAL float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0]
9912// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
9913
9914_NEON2SSESTORAGE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
9915_NEON2SSE_INLINE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr) // VLD2.32 {d0, d2}, [r0]
9916{
9917 float32x4x2_t v;
9918 v.val[0] = vld1q_f32 (ptr);
9919 v.val[1] = vld1q_f32 ((ptr + 4));
9920 v = vuzpq_f32(v.val[0], v.val[1]);
9921 return v;
9922}
9923
9924_NEON2SSE_GLOBAL poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
9925#define vld2q_p8 vld2q_u8
9926
9927_NEON2SSE_GLOBAL poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
9928#define vld2q_p16 vld2q_u16
9929
9930_NEON2SSESTORAGE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
9931_NEON2SSE_INLINE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr)
9932{
9933 uint8x8x2_t v;
9934 __m128i ld128;
    ld128 = vld1q_u8(ptr); //one 128-bit load instead of two 64-bit loads
9936 ld128 = _mm_shuffle_epi8(ld128, *(__m128i*)mask8_16_even_odd);
9937 vst1q_u8((v.val), ld128); // v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
9938 return v;
9939}
9940
9941_NEON2SSESTORAGE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
9942_NEON2SSE_INLINE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr)
9943{
9944 _NEON2SSE_ALIGN_16 uint16x4x2_t v;
9945 __m128i ld128;
    ld128 = vld1q_u16(ptr); //one 128-bit load instead of two 64-bit loads
9947 ld128 = _mm_shuffle_epi8(ld128, *(__m128i*) mask8_32_even_odd);
9948 vst1q_u16((v.val), ld128);
9949 return v;
9950}
9951
9952_NEON2SSESTORAGE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
9953_NEON2SSE_INLINE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr)
9954{
9955 _NEON2SSE_ALIGN_16 uint32x2x2_t v;
9956 __m128i ld128;
    ld128 = vld1q_u32(ptr); //one 128-bit load instead of two 64-bit loads
9958 ld128 = _mm_shuffle_epi32(ld128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
9959 vst1q_u32((v.val), ld128);
9960 return v;
9961}
9962
9963_NEON2SSESTORAGE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
9964_NEON2SSE_INLINE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr)
9965{
9966 uint64x1x2_t v;
9967 v.val[0].m64_u64[0] = *(ptr);
9968 v.val[1].m64_u64[0] = *(ptr + 1);
9969 return v;
9970}
9971
9972_NEON2SSE_GLOBAL int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
9973#define vld2_s8(ptr) vld2_u8((uint8_t*)ptr)
9974
9975_NEON2SSE_GLOBAL int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
9976#define vld2_s16(ptr) vld2_u16((uint16_t*)ptr)
9977
9978_NEON2SSE_GLOBAL int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
9979#define vld2_s32(ptr) vld2_u32((uint32_t*)ptr)
9980
9981_NEON2SSE_GLOBAL int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
9982#define vld2_s64(ptr) vld2_u64((uint64_t*)ptr)
9983
9984_NEON2SSE_GLOBAL float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0]
9985// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1_f16 for example
9986
9987_NEON2SSESTORAGE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
9988_NEON2SSE_INLINE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr)
9989{
9990 float32x2x2_t v;
9991 v.val[0].m64_f32[0] = *(ptr);
9992 v.val[0].m64_f32[1] = *(ptr + 2);
9993 v.val[1].m64_f32[0] = *(ptr + 1);
9994 v.val[1].m64_f32[1] = *(ptr + 3);
9995 return v;
9996}
9997
9998_NEON2SSE_GLOBAL poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
9999#define vld2_p8 vld2_u8
10000
10001_NEON2SSE_GLOBAL poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
10002#define vld2_p16 vld2_u16
10003
10004//******************** Triplets ***************************************
10005//*********************************************************************
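//A minimal usage sketch (illustrative only; variable names are hypothetical): de-interleave
//packed RGB pixels into separate R, G and B planes.
//    uint8_t rgb[24];                        //8 interleaved R,G,B pixels, filled elsewhere
//    uint8x8x3_t px = vld3_u8(rgb);
//    //px.val[0] = 8 R values, px.val[1] = 8 G values, px.val[2] = 8 B values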
10006_NEON2SSESTORAGE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
10007_NEON2SSE_INLINE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr) // VLD3.8 {d0, d2, d4}, [r0]
10008{
10009 //a0,a1,a2,a3,...a7,a8,...a15, b0,b1,b2,...b7,b8,...b15, c0,c1,c2,...c7,c8,...c15 ->
10010 //a:0,3,6,9,12,15,b:2,5,8,11,14, c:1,4,7,10,13
10011 //a:1,4,7,10,13, b:0,3,6,9,12,15,c:2,5,8,11,14,
10012 //a:2,5,8,11,14, b:1,4,7,10,13, c:0,3,6,9,12,15
10013 uint8x16x3_t v;
10014 __m128i tmp0, tmp1,tmp2, tmp3;
10015 _NEON2SSE_ALIGN_16 static const int8_t mask8_0[16] = {0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14};
10016 _NEON2SSE_ALIGN_16 static const int8_t mask8_1[16] = {2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13};
10017 _NEON2SSE_ALIGN_16 static const int8_t mask8_2[16] = {1,4,7,10,13,2,5,8,11,14,0,3,6,9,12,15};
10018
10019 v.val[0] = vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, ...a15
10020 v.val[1] = vld1q_u8 ((ptr + 16)); //b0,b1,b2,b3...b7, ...b15
10021 v.val[2] = vld1q_u8 ((ptr + 32)); //c0,c1,c2,c3,...c7,...c15
10022
    tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask8_0); //a:0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14
10024 tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask8_1); //b:2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13
    tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask8_2); //c:1,4,7,10,13,2,5,8,11,14,0,3,6,9,12,15
10026
10027 tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,0,0,0,0,0,a0,a3,a6,a9,a12,a15
10028 tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a:0,3,6,9,12,15,b:2,5,8,11,14,x,x,x,x,x
10029 tmp3 = _mm_slli_si128(tmp3, 5); //0,0,0,0,0,a:0,3,6,9,12,15,b:2,5,8,11,14,
10030 tmp3 = _mm_srli_si128(tmp3, 5); //a:0,3,6,9,12,15,b:2,5,8,11,14,:0,0,0,0,0
10031 v.val[0] = _mm_slli_si128(tmp2, 11); //0,0,0,0,0,0,0,0,0,0,0,0, 1,4,7,10,13,
10032 v.val[0] = _mm_or_si128(v.val[0],tmp3); //a:0,3,6,9,12,15,b:2,5,8,11,14,c:1,4,7,10,13,
10033
10034 tmp3 = _mm_slli_si128(tmp0, 5); //0,0,0,0,0,a:0,3,6,9,12,15,1,4,7,10,13,
10035 tmp3 = _mm_srli_si128(tmp3, 11); //a:1,4,7,10,13, 0,0,0,0,0,0,0,0,0,0,0
    v.val[1] = _mm_srli_si128(tmp1,5); //b:0,3,6,9,12,15,1,4,7,10,13, 0,0,0,0,0
    v.val[1] = _mm_slli_si128(v.val[1], 5); //0,0,0,0,0,b:0,3,6,9,12,15,1,4,7,10,13,
    v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13,b:0,3,6,9,12,15,1,4,7,10,13,
10039 v.val[1] = _mm_slli_si128(v.val[1],5); //0,0,0,0,0,a:1,4,7,10,13,b:0,3,6,9,12,15,
10040 v.val[1] = _mm_srli_si128(v.val[1], 5); //a:1,4,7,10,13,b:0,3,6,9,12,15,0,0,0,0,0
10041 tmp3 = _mm_srli_si128(tmp2,5); //c:2,5,8,11,14,0,3,6,9,12,15,0,0,0,0,0
10042 tmp3 = _mm_slli_si128(tmp3,11); //0,0,0,0,0,0,0,0,0,0,0,c:2,5,8,11,14,
10043 v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13,b:0,3,6,9,12,15,c:2,5,8,11,14,
10044
10045 tmp3 = _mm_srli_si128(tmp2,10); //c:0,3,6,9,12,15, 0,0,0,0,0,0,0,0,0,0,
10046 tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,0,0,0,0,0, c:0,3,6,9,12,15,
10047 v.val[2] = _mm_srli_si128(tmp1,11); //b:1,4,7,10,13,0,0,0,0,0,0,0,0,0,0,0
10048 v.val[2] = _mm_slli_si128(v.val[2],5); //0,0,0,0,0,b:1,4,7,10,13, 0,0,0,0,0,0
10049 v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0,0,0,0,b:1,4,7,10,13,c:0,3,6,9,12,15,
10050 tmp0 = _mm_srli_si128(tmp0, 11); //a:2,5,8,11,14, 0,0,0,0,0,0,0,0,0,0,0,
10051 v.val[2] = _mm_or_si128(v.val[2],tmp0); //a:2,5,8,11,14,b:1,4,7,10,13,c:0,3,6,9,12,15,
10052 return v;
10053}
10054
10055_NEON2SSESTORAGE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
10056_NEON2SSE_INLINE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr) // VLD3.16 {d0, d2, d4}, [r0]
10057{
10058 //a0, a1,a2,a3,...a7, b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7
10059 uint16x8x3_t v;
10060 __m128i tmp0, tmp1,tmp2, tmp3;
10061 _NEON2SSE_ALIGN_16 static const int8_t mask16_0[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11};
10062 _NEON2SSE_ALIGN_16 static const int8_t mask16_1[16] = {2,3, 8,9, 14,15, 4,5, 10,11, 0,1, 6,7, 12,13};
10063 _NEON2SSE_ALIGN_16 static const int8_t mask16_2[16] = {4,5, 10,11, 0,1, 6,7, 12,13, 2,3, 8,9, 14,15};
10064
10065 v.val[0] = vld1q_u16 (ptr); //a0,a1,a2,a3,...a7,
10066 v.val[1] = vld1q_u16 ((ptr + 8)); //b0,b1,b2,b3...b7
10067 v.val[2] = vld1q_u16 ((ptr + 16)); //c0,c1,c2,c3,...c7
10068
10069 tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask16_0); //a0,a3,a6,a1,a4,a7,a2,a5,
10070 tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask16_1); //b1,b4,b7,b2,b5,b0,b3,b6
10071 tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask16_2); //c2,c5, c0,c3,c6, c1,c4,c7
10072
10073 tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,a0,a3,a6,
10074 tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a0,a3,a6,b1,b4,b7,x,x
10075 tmp3 = _mm_slli_si128(tmp3, 4); //0,0, a0,a3,a6,b1,b4,b7
10076 tmp3 = _mm_srli_si128(tmp3, 4); //a0,a3,a6,b1,b4,b7,0,0
10077 v.val[0] = _mm_slli_si128(tmp2, 12); //0,0,0,0,0,0, c2,c5,
10078 v.val[0] = _mm_or_si128(v.val[0],tmp3); //a0,a3,a6,b1,b4,b7,c2,c5
10079
10080 tmp3 = _mm_slli_si128(tmp0, 4); //0,0,a0,a3,a6,a1,a4,a7
10081 tmp3 = _mm_srli_si128(tmp3,10); //a1,a4,a7, 0,0,0,0,0
10082 v.val[1] = _mm_srli_si128(tmp1,6); //b2,b5,b0,b3,b6,0,0
10083 v.val[1] = _mm_slli_si128(v.val[1], 6); //0,0,0,b2,b5,b0,b3,b6,
10084 v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,b0,b3,b6,
10085 v.val[1] = _mm_slli_si128(v.val[1],6); //0,0,0,a1,a4,a7,b2,b5,
10086 v.val[1] = _mm_srli_si128(v.val[1], 6); //a1,a4,a7,b2,b5,0,0,0,
10087 tmp3 = _mm_srli_si128(tmp2,4); //c0,c3,c6, c1,c4,c7,0,0
10088 tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,c0,c3,c6,
10089 v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,c0,c3,c6,
10090
10091 tmp3 = _mm_srli_si128(tmp2,10); //c1,c4,c7, 0,0,0,0,0
10092 tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0, c1,c4,c7,
10093 v.val[2] = _mm_srli_si128(tmp1,10); //b0,b3,b6,0,0, 0,0,0
10094 v.val[2] = _mm_slli_si128(v.val[2],4); //0,0, b0,b3,b6,0,0,0
10095 v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0, b0,b3,b6,c1,c4,c7,
10096 tmp0 = _mm_srli_si128(tmp0, 12); //a2,a5,0,0,0,0,0,0
10097 v.val[2] = _mm_or_si128(v.val[2],tmp0); //a2,a5,b0,b3,b6,c1,c4,c7,
10098 return v;
10099}
10100
10101_NEON2SSESTORAGE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
10102_NEON2SSE_INLINE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0]
10103{
10104 //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3,
10105 uint32x4x3_t v;
10106 __m128i tmp0, tmp1,tmp2, tmp3;
10107 v.val[0] = vld1q_u32 (ptr); //a0,a1,a2,a3,
10108 v.val[1] = vld1q_u32 ((ptr + 4)); //b0,b1,b2,b3
10109 v.val[2] = vld1q_u32 ((ptr + 8)); //c0,c1,c2,c3,
10110
10111 tmp0 = _mm_shuffle_epi32(v.val[0], 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,a3,a1,a2
10112 tmp1 = _mm_shuffle_epi32(v.val[1], _SWAP_HI_LOW32); //b2,b3,b0,b1
10113 tmp2 = _mm_shuffle_epi32(v.val[2], 1 | (2 << 2) | (0 << 4) | (3 << 6)); //c1,c2, c0,c3
10114
10115 tmp3 = _mm_unpacklo_epi32(tmp1, tmp2); //b2,c1, b3,c2
10116 v.val[0] = _mm_unpacklo_epi64(tmp0,tmp3); //a0,a3,b2,c1
10117 tmp0 = _mm_unpackhi_epi32(tmp0, tmp1); //a1,b0, a2,b1
10118 v.val[1] = _mm_shuffle_epi32(tmp0, _SWAP_HI_LOW32 ); //a2,b1, a1,b0,
10119 v.val[1] = _mm_unpackhi_epi64(v.val[1], tmp3); //a1,b0, b3,c2
10120 v.val[2] = _mm_unpackhi_epi64(tmp0, tmp2); //a2,b1, c0,c3
10121 return v;
10122}
10123
10124_NEON2SSE_GLOBAL int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
10125#define vld3q_s8(ptr) vld3q_u8((uint8_t*) (ptr))
10126
10127_NEON2SSE_GLOBAL int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
10128#define vld3q_s16(ptr) vld3q_u16((uint16_t*) (ptr))
10129
10130_NEON2SSE_GLOBAL int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
10131#define vld3q_s32(ptr) vld3q_u32((uint32_t*) (ptr))
10132
10133_NEON2SSE_GLOBAL float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
10134// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10135
10136_NEON2SSESTORAGE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
10137_NEON2SSE_INLINE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0]
10138{
10139 //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3,
10140 float32x4x3_t v;
10141 __m128 tmp0, tmp1,tmp2, tmp3;
10142 v.val[0] = vld1q_f32 (ptr); //a0,a1,a2,a3,
10143 v.val[1] = vld1q_f32 ((ptr + 4)); //b0,b1,b2,b3
10144 v.val[2] = vld1q_f32 ((ptr + 8)); //c0,c1,c2,c3,
10145
10146 tmp0 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[0]), 0 | (3 << 2) | (1 << 4) | (2 << 6))); //a0,a3,a1,a2
10147 tmp1 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[1]), _SWAP_HI_LOW32)); //b2,b3,b0,b1
10148 tmp2 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[2]), 1 | (2 << 2) | (0 << 4) | (3 << 6))); //c1,c2, c0,c3
10149 tmp3 = _mm_unpacklo_ps(tmp1, tmp2); //b2,c1, b3,c2
10150
10151 v.val[0] = _mm_movelh_ps(tmp0,tmp3); //a0,a3,b2,c1
10152 tmp0 = _mm_unpackhi_ps(tmp0, tmp1); //a1,b0, a2,b1
10153 v.val[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(tmp0), _SWAP_HI_LOW32 )); //a2,b1, a1,b0,
10154 v.val[1] = _mm_movehl_ps(tmp3,v.val[1]); //a1,b0, b3,c2
10155 v.val[2] = _mm_movehl_ps(tmp2,tmp0); //a2,b1, c0,c3
10156 return v;
10157}
10158
_NEON2SSE_GLOBAL poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
10160#define vld3q_p8 vld3q_u8
10161
10162_NEON2SSE_GLOBAL poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
10163#define vld3q_p16 vld3q_u16
10164
10165_NEON2SSESTORAGE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
10166_NEON2SSE_INLINE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr) // VLD3.8 {d0, d1, d2}, [r0]
10167{
10168 //a0, a1,a2,a3,...a7, b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7
10169 uint8x8x3_t v;
10170 __m128i val0, val1, val2, tmp0, tmp1;
10171 _NEON2SSE_ALIGN_16 static const int8_t mask8_0[16] = {0,3,6,9,12,15, 1,4,7,10,13, 2,5,8,11,14};
10172 _NEON2SSE_ALIGN_16 static const int8_t mask8_1[16] = {2,5, 0,3,6, 1,4,7, 0,0,0,0,0,0,0,0};
10173 val0 = vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, b0,b1,b2,b3...b7
10174 val2 = _mm_loadl_epi64((__m128i*)(ptr + 16)); //c0,c1,c2,c3,...c7
10175
10176 tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask8_0); //a0,a3,a6,b1,b4,b7, a1,a4,a7,b2,b5, a2,a5,b0,b3,b6,
10177 tmp1 = _mm_shuffle_epi8(val2, *(__m128i*)mask8_1); //c2,c5, c0,c3,c6, c1,c4,c7,x,x,x,x,x,x,x,x
10178 val0 = _mm_slli_si128(tmp0,10);
10179 val0 = _mm_srli_si128(val0,10); //a0,a3,a6,b1,b4,b7, 0,0,0,0,0,0,0,0,0,0
10180 val2 = _mm_slli_si128(tmp1,6); //0,0,0,0,0,0,c2,c5,x,x,x,x,x,x,x,x
10181 val0 = _mm_or_si128(val0,val2); //a0,a3,a6,b1,b4,b7,c2,c5 x,x,x,x,x,x,x,x
10182 _M64(v.val[0], val0);
10183 val1 = _mm_slli_si128(tmp0,5); //0,0,0,0,0,0,0,0,0,0,0, a1,a4,a7,b2,b5,
10184 val1 = _mm_srli_si128(val1,11); //a1,a4,a7,b2,b5,0,0,0,0,0,0,0,0,0,0,0,
10185 val2 = _mm_srli_si128(tmp1,2); //c0,c3,c6,c1,c4,c7,x,x,x,x,x,x,x,x,0,0
10186 val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c0,c3,c6,0,0,0,0,0,0,0,0
10187 val1 = _mm_or_si128(val1,val2); //a1,a4,a7,b2,b5,c0,c3,c6,x,x,x,x,x,x,x,x
10188 _M64(v.val[1], val1);
10189
10190 tmp0 = _mm_srli_si128(tmp0,11); //a2,a5,b0,b3,b6,0,0,0,0,0,0,0,0,0,0,0,
10191 val2 = _mm_srli_si128(tmp1,5); //c1,c4,c7,0,0,0,0,0,0,0,0,0,0,0,0,0
10192 val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c1,c4,c7,
10193 val2 = _mm_or_si128(tmp0, val2); //a2,a5,b0,b3,b6,c1,c4,c7,x,x,x,x,x,x,x,x
10194 _M64(v.val[2], val2);
10195 return v;
10196}
10197
10198_NEON2SSESTORAGE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
10199_NEON2SSE_INLINE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr) // VLD3.16 {d0, d1, d2}, [r0]
10200{
10201 //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3,
10202 uint16x4x3_t v;
10203 __m128i val0, val1, val2, tmp0, tmp1;
10204 _NEON2SSE_ALIGN_16 static const int8_t mask16[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11};
10205 val0 = vld1q_u16 (ptr); //a0,a1,a2,a3, b0,b1,b2,b3
10206 val2 = _mm_loadl_epi64((__m128i*)(ptr + 8)); //c0,c1,c2,c3, x,x,x,x
10207
10208 tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask16); //a0, a3, b2,a1, b0, b3, a2, b1
10209 tmp1 = _mm_shufflelo_epi16(val2, 201); //11 00 10 01 : c1, c2, c0, c3,
10210 val0 = _mm_slli_si128(tmp0,10);
10211 val0 = _mm_srli_si128(val0,10); //a0, a3, b2, 0,0, 0,0,
10212 val2 = _mm_slli_si128(tmp1,14); //0,0,0,0,0,0,0,c1
10213 val2 = _mm_srli_si128(val2,8); //0,0,0,c1,0,0,0,0
10214 val0 = _mm_or_si128(val0,val2); //a0, a3, b2, c1, x,x,x,x
10215 _M64(v.val[0], val0);
10216
10217 val1 = _mm_slli_si128(tmp0,4); //0,0,0,0,0,a1, b0, b3
10218 val1 = _mm_srli_si128(val1,10); //a1, b0, b3, 0,0, 0,0,
10219 val2 = _mm_srli_si128(tmp1,2); //c2, 0,0,0,0,0,0,0,
10220 val2 = _mm_slli_si128(val2,6); //0,0,0,c2,0,0,0,0
10221 val1 = _mm_or_si128(val1,val2); //a1, b0, b3, c2, x,x,x,x
10222 _M64(v.val[1], val1);
10223
10224 tmp0 = _mm_srli_si128(tmp0,12); //a2, b1,0,0,0,0,0,0
10225 tmp1 = _mm_srli_si128(tmp1,4);
10226 tmp1 = _mm_slli_si128(tmp1,4); //0,0,c0, c3,
10227 val2 = _mm_or_si128(tmp0, tmp1); //a2, b1, c0, c3,
10228 _M64(v.val[2], val2);
10229 return v;
10230}
10231
10232_NEON2SSESTORAGE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
10233_NEON2SSE_INLINE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr) // VLD3.32 {d0, d1, d2}, [r0]
10234{
10235 //a0,a1, b0,b1, c0,c1, -> a0,b1, a1,c0, b0,c1
10236 uint32x2x3_t v;
10237 __m128i val0, val1, val2;
10238 val0 = vld1q_u32 (ptr); //a0,a1, b0,b1,
10239 val2 = _mm_loadl_epi64((__m128i*) (ptr + 4)); //c0,c1, x,x
10240
10241 val0 = _mm_shuffle_epi32(val0, 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,b1, a1, b0
10242 _M64(v.val[0], val0);
10243 val2 = _mm_slli_si128(val2, 8); //x, x,c0,c1,
10244 val1 = _mm_unpackhi_epi32(val0,val2); //a1,c0, b0, c1
10245 _M64(v.val[1], val1);
10246 val2 = _mm_srli_si128(val1, 8); //b0, c1, x, x,
10247 _M64(v.val[2], val2);
10248 return v;
10249}
10250_NEON2SSESTORAGE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
10251_NEON2SSE_INLINE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0]
10252{
10253 uint64x1x3_t v;
10254 v.val[0].m64_u64[0] = *(ptr);
10255 v.val[1].m64_u64[0] = *(ptr + 1);
10256 v.val[2].m64_u64[0] = *(ptr + 2);
10257 return v;
10258}
10259
10260_NEON2SSE_GLOBAL int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
10261#define vld3_s8(ptr) vld3_u8((uint8_t*)ptr)
10262
10263_NEON2SSE_GLOBAL int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
10264#define vld3_s16(ptr) vld3_u16((uint16_t*)ptr)
10265
10266_NEON2SSE_GLOBAL int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
10267#define vld3_s32(ptr) vld3_u32((uint32_t*)ptr)
10268
_NEON2SSE_GLOBAL int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
10270#define vld3_s64(ptr) vld3_u64((uint64_t*)ptr)
10271
10272_NEON2SSE_GLOBAL float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
10273// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10274
10275_NEON2SSESTORAGE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
10276_NEON2SSE_INLINE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr)
10277{
10278 //a0,a1, b0,b1, c0,c1, -> a0,b1, a1,c0, b0,c1
10279 float32x2x3_t v;
10280 v.val[0].m64_f32[0] = *(ptr);
10281 v.val[0].m64_f32[1] = *(ptr + 3);
10282
10283 v.val[1].m64_f32[0] = *(ptr + 1);
10284 v.val[1].m64_f32[1] = *(ptr + 4);
10285
10286 v.val[2].m64_f32[0] = *(ptr + 2);
10287 v.val[2].m64_f32[1] = *(ptr + 5);
10288 return v;
10289}
10290
10291_NEON2SSE_GLOBAL poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
10292#define vld3_p8 vld3_u8
10293
10294_NEON2SSE_GLOBAL poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
10295#define vld3_p16 vld3_u16
10296
10297//*************** Quadruples load ********************************
10298//*****************************************************************
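//A minimal usage sketch (illustrative only; variable names are hypothetical): de-interleave
//packed RGBA pixels into separate channel vectors.
//    uint8_t rgba[64];                       //16 interleaved R,G,B,A pixels, filled elsewhere
//    uint8x16x4_t px = vld4q_u8(rgba);
//    //px.val[3] now holds all 16 alpha values contiguously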
10299_NEON2SSESTORAGE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
10300_NEON2SSE_INLINE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr) // VLD4.8 {d0, d2, d4, d6}, [r0]
10301{
10302 uint8x16x4_t v;
10303 __m128i tmp3, tmp2, tmp1, tmp0;
10304
10305 v.val[0] = vld1q_u8 ( ptr); //a0,a1,a2,...a7, ...a15
10306 v.val[1] = vld1q_u8 ( (ptr + 16)); //b0, b1,b2,...b7.... b15
10307 v.val[2] = vld1q_u8 ( (ptr + 32)); //c0, c1,c2,...c7....c15
10308 v.val[3] = vld1q_u8 ( (ptr + 48)); //d0,d1,d2,...d7....d15
10309
10310 tmp0 = _mm_unpacklo_epi8(v.val[0],v.val[1]); //a0,b0, a1,b1, a2,b2, a3,b3,....a7,b7
10311 tmp1 = _mm_unpacklo_epi8(v.val[2],v.val[3]); //c0,d0, c1,d1, c2,d2, c3,d3,... c7,d7
10312 tmp2 = _mm_unpackhi_epi8(v.val[0],v.val[1]); //a8,b8, a9,b9, a10,b10, a11,b11,...a15,b15
10313 tmp3 = _mm_unpackhi_epi8(v.val[2],v.val[3]); //c8,d8, c9,d9, c10,d10, c11,d11,...c15,d15
10314
10315 v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a8, b0,b8, a1,a9, b1,b9, ....a3,a11, b3,b11
10316 v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a4,a12, b4,b12, a5,a13, b5,b13,....a7,a15,b7,b15
10317 v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //c0,c8, d0,d8, c1,c9, d1,d9.....d3,d11
10318 v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //c4,c12,d4,d12, c5,c13, d5,d13,....d7,d15
10319
10320 tmp0 = _mm_unpacklo_epi32(v.val[0], v.val[2] ); ///a0,a8, b0,b8, c0,c8, d0,d8, a1,a9, b1,b9, c1,c9, d1,d9
10321 tmp1 = _mm_unpackhi_epi32(v.val[0], v.val[2] ); //a2,a10, b2,b10, c2,c10, d2,d10, a3,a11, b3,b11, c3,c11, d3,d11
10322 tmp2 = _mm_unpacklo_epi32(v.val[1], v.val[3] ); //a4,a12, b4,b12, c4,c12, d4,d12, a5,a13, b5,b13, c5,c13, d5,d13,
10323 tmp3 = _mm_unpackhi_epi32(v.val[1], v.val[3] ); //a6,a14, b6,b14, c6,c14, d6,d14, a7,a15,b7,b15,c7,c15,d7,d15
10324
10325 v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a4,a8,a12,b0,b4,b8,b12,c0,c4,c8,c12,d0,d4,d8,d12
10326 v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a1,a5, a9, a13, b1,b5, b9,b13, c1,c5, c9, c13, d1,d5, d9,d13
10327 v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //a2,a6, a10,a14, b2,b6, b10,b14,c2,c6, c10,c14, d2,d6, d10,d14
10328 v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //a3,a7, a11,a15, b3,b7, b11,b15,c3,c7, c11, c15,d3,d7, d11,d15
10329 return v;
10330}
10331
10332_NEON2SSESTORAGE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
10333_NEON2SSE_INLINE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr) // VLD4.16 {d0, d2, d4, d6}, [r0]
10334{
10335 uint16x8x4_t v;
10336 __m128i tmp3, tmp2, tmp1, tmp0;
10337 tmp0 = vld1q_u16 (ptr); //a0,a1,a2,...a7
10338 tmp1 = vld1q_u16 ((ptr + 8)); //b0, b1,b2,...b7
10339 tmp2 = vld1q_u16 ((ptr + 16)); //c0, c1,c2,...c7
10340 tmp3 = vld1q_u16 ((ptr + 24)); //d0,d1,d2,...d7
10341 v.val[0] = _mm_unpacklo_epi16(tmp0,tmp1); //a0,b0, a1,b1, a2,b2, a3,b3,
10342 v.val[1] = _mm_unpacklo_epi16(tmp2,tmp3); //c0,d0, c1,d1, c2,d2, c3,d3,
10343 v.val[2] = _mm_unpackhi_epi16(tmp0,tmp1); //a4,b4, a5,b5, a6,b6, a7,b7
10344 v.val[3] = _mm_unpackhi_epi16(tmp2,tmp3); //c4,d4, c5,d5, c6,d6, c7,d7
10345 tmp0 = _mm_unpacklo_epi16(v.val[0], v.val[2]); //a0,a4, b0,b4, a1,a5, b1,b5
10346 tmp1 = _mm_unpackhi_epi16(v.val[0], v.val[2]); //a2,a6, b2,b6, a3,a7, b3,b7
10347 tmp2 = _mm_unpacklo_epi16(v.val[1], v.val[3]); //c0,c4, d0,d4, c1,c5, d1,d5
10348 tmp3 = _mm_unpackhi_epi16(v.val[1], v.val[3]); //c2,c6, d2,d6, c3,c7, d3,d7
10349 v.val[0] = _mm_unpacklo_epi64(tmp0, tmp2); //a0,a4, b0,b4, c0,c4, d0,d4,
10350 v.val[1] = _mm_unpackhi_epi64(tmp0, tmp2); //a1,a5, b1,b5, c1,c5, d1,d5
10351 v.val[2] = _mm_unpacklo_epi64(tmp1, tmp3); //a2,a6, b2,b6, c2,c6, d2,d6,
10352 v.val[3] = _mm_unpackhi_epi64(tmp1, tmp3); //a3,a7, b3,b7, c3,c7, d3,d7
10353 return v;
10354}
10355
10356_NEON2SSESTORAGE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
10357_NEON2SSE_INLINE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0]
10358{
10359 uint32x4x4_t v;
10360 __m128i tmp3, tmp2, tmp1, tmp0;
10361 v.val[0] = vld1q_u32 (ptr);
10362 v.val[1] = vld1q_u32 ((ptr + 4));
10363 v.val[2] = vld1q_u32 ((ptr + 8));
10364 v.val[3] = vld1q_u32 ((ptr + 12));
10365 tmp0 = _mm_unpacklo_epi32(v.val[0],v.val[1]);
10366 tmp1 = _mm_unpacklo_epi32(v.val[2],v.val[3]);
10367 tmp2 = _mm_unpackhi_epi32(v.val[0],v.val[1]);
10368 tmp3 = _mm_unpackhi_epi32(v.val[2],v.val[3]);
10369 v.val[0] = _mm_unpacklo_epi64(tmp0, tmp1);
10370 v.val[1] = _mm_unpackhi_epi64(tmp0, tmp1);
10371 v.val[2] = _mm_unpacklo_epi64(tmp2, tmp3);
10372 v.val[3] = _mm_unpackhi_epi64(tmp2, tmp3);
10373 return v;
10374}
10375
10376_NEON2SSE_GLOBAL int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
10377#define vld4q_s8(ptr) vld4q_u8((uint8_t*)ptr)
10378
10379_NEON2SSE_GLOBAL int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
10380#define vld4q_s16(ptr) vld4q_u16((uint16_t*)ptr)
10381
10382_NEON2SSE_GLOBAL int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
10383#define vld4q_s32(ptr) vld4q_u32((uint32_t*)ptr)
10384
10385_NEON2SSE_GLOBAL float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
10386// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10387
10388_NEON2SSESTORAGE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
10389_NEON2SSE_INLINE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0]
10390{
10391 float32x4x4_t v;
10392 __m128 tmp3, tmp2, tmp1, tmp0;
10393
10394 v.val[0] = vld1q_f32 ((float*) ptr);
10395 v.val[1] = vld1q_f32 ((float*) (ptr + 4));
10396 v.val[2] = vld1q_f32 ((float*) (ptr + 8));
10397 v.val[3] = vld1q_f32 ((float*) (ptr + 12));
10398 tmp0 = _mm_unpacklo_ps(v.val[0], v.val[1]);
10399 tmp2 = _mm_unpacklo_ps(v.val[2], v.val[3]);
10400 tmp1 = _mm_unpackhi_ps(v.val[0], v.val[1]);
10401 tmp3 = _mm_unpackhi_ps(v.val[2], v.val[3]);
10402 v.val[0] = _mm_movelh_ps(tmp0, tmp2);
10403 v.val[1] = _mm_movehl_ps(tmp2, tmp0);
10404 v.val[2] = _mm_movelh_ps(tmp1, tmp3);
10405 v.val[3] = _mm_movehl_ps(tmp3, tmp1);
10406 return v;
10407}
10408
10409_NEON2SSE_GLOBAL poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
10410#define vld4q_p8 vld4q_u8
10411
10412_NEON2SSE_GLOBAL poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
10413#define vld4q_p16 vld4q_s16
10414
10415_NEON2SSESTORAGE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
10416_NEON2SSE_INLINE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr) // VLD4.8 {d0, d1, d2, d3}, [r0]
10417{
10418 uint8x8x4_t v;
10419 __m128i sh0, sh1;
10420 __m128i val0, val2;
10421 _NEON2SSE_ALIGN_16 static const int8_t mask4_8[16] = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15};
10422
    val0 = vld1q_u8(( ptr)); //load the first 16 interleaved bytes (structures 0..3)
    val2 = vld1q_u8(( ptr + 16)); //load the remaining 16 interleaved bytes (structures 4..7)
10425
10426 sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_8);
10427 sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_8);
10428 val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12,16,20,24,28, 1,5,9,13,17,21,25,29
10429 vst1q_u8(&v.val[0], val0 );
10430 val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14,18,22,26,30, 3,7,11,15,19,23,27,31
10431 vst1q_u8(&v.val[2], val2 );
10432 return v;
10433}
10434
10435_NEON2SSESTORAGE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
10436_NEON2SSE_INLINE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr) // VLD4.16 {d0, d1, d2, d3}, [r0]
10437{
10438 uint16x4x4_t v;
10439 __m128i sh0, sh1;
10440 __m128i val0, val2;
10441 _NEON2SSE_ALIGN_16 static const int8_t mask4_16[16] = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15}; //0, 4, 1, 5, 2, 6, 3, 7
    val0 = vld1q_u16 ( (ptr)); //load the first 8 interleaved elements (structures 0 and 1)
    val2 = vld1q_u16 ( (ptr + 8)); //load the last 8 interleaved elements (structures 2 and 3)
10444 sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_16);
10445 sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_16);
10446 val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12, 1,5,9,13
10447 vst1q_u16(&v.val[0], val0 );
10448 val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14, 3,7,11,15
10449 vst1q_u16(&v.val[2], val2 );
10450 return v;
10451}
10452
10453_NEON2SSESTORAGE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
10454_NEON2SSE_INLINE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr)
10455{
10456 //a0,a1, b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
10457 uint32x2x4_t v;
10458 __m128i val0, val01, val2;
10459 val0 = vld1q_u32 (ptr); //a0,a1, b0,b1,
10460 val2 = vld1q_u32 ((ptr + 4)); //c0,c1, d0,d1
10461 val01 = _mm_unpacklo_epi32(val0,val2); //a0, c0, a1,c1,
10462 val2 = _mm_unpackhi_epi32(val0,val2); //b0,d0, b1, d1
10463 vst1q_u32(&v.val[0], val01);
10464 vst1q_u32(&v.val[2], val2 );
10465 return v;
10466}
10467
10468_NEON2SSESTORAGE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
10469_NEON2SSE_INLINE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0]
10470{
10471 uint64x1x4_t v;
    v.val[0].m64_u64[0] = *(ptr); //load the first 64-bit value into val[0]
    v.val[1].m64_u64[0] = *(ptr + 1); //load the second 64-bit value into val[1]
    v.val[2].m64_u64[0] = *(ptr + 2); //load the third 64-bit value into val[2]
    v.val[3].m64_u64[0] = *(ptr + 3); //load the fourth 64-bit value into val[3]
10476 return v;
10477}
10478
10479_NEON2SSE_GLOBAL int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
10480#define vld4_s8(ptr) vld4_u8((uint8_t*)ptr)
10481
10482_NEON2SSE_GLOBAL int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
10483#define vld4_s16(ptr) vld4_u16((uint16_t*)ptr)
10484
10485_NEON2SSE_GLOBAL int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
10486#define vld4_s32(ptr) vld4_u32((uint32_t*)ptr)
10487
_NEON2SSE_GLOBAL int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
10489#define vld4_s64(ptr) vld4_u64((uint64_t*)ptr)
10490
10491_NEON2SSE_GLOBAL float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
10492// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10493
10494_NEON2SSESTORAGE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
10495_NEON2SSE_INLINE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr) // VLD4.32 {d0, d1, d2, d3}, [r0]
10496{
10497 //a0,a1, b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
10498 float32x2x4_t res;
10499 res.val[0].m64_f32[0] = *(ptr);
10500 res.val[0].m64_f32[1] = *(ptr + 4);
10501 res.val[1].m64_f32[0] = *(ptr + 1);
10502 res.val[1].m64_f32[1] = *(ptr + 5);
10503 res.val[2].m64_f32[0] = *(ptr + 2);
10504 res.val[2].m64_f32[1] = *(ptr + 6);
10505 res.val[3].m64_f32[0] = *(ptr + 3);
10506 res.val[3].m64_f32[1] = *(ptr + 7);
10507 return res;
10508}
10509
10510_NEON2SSE_GLOBAL poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
10511#define vld4_p8 vld4_u8
10512
10513_NEON2SSE_GLOBAL poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
10514#define vld4_p16 vld4_u16
10515
10516//************* Duplicate (or propagate) ptr[0] to all val[0] lanes and ptr[1] to all val[1] lanes *******************
10517//*******************************************************************************************************************
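//A minimal usage sketch (illustrative only; variable names are hypothetical):
//    uint16_t pair[2] = {7, 9};
//    uint16x4x2_t d = vld2_dup_u16(pair);
//    //d.val[0] = {7,7,7,7}, d.val[1] = {9,9,9,9}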
10518_NEON2SSESTORAGE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
10519_NEON2SSE_INLINE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr) // VLD2.8 {d0[], d1[]}, [r0]
10520{
10521 uint8x8x2_t v;
10522 __m128i val0, val1;
10523 val0 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x,x,x,x,x, x,x,x,x
10524 val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,x,x,x,x, x,x,x,x,x,x,x,x,
10525 val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,x,x,x,x, x,x,x,x
10526 val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
10527 vst1q_u8(v.val, val0);
10528 return v;
10529}
10530
10531_NEON2SSESTORAGE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
10532_NEON2SSE_INLINE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr) // VLD2.16 {d0[], d1[]}, [r0]
10533{
10534 uint16x4x2_t v;
10535 __m128i val0, val1;
10536 val1 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x
10537 val0 = _mm_shufflelo_epi16(val1, 0); //00 00 00 00 (all 0)
10538 _M64(v.val[0], val0);
10539 val1 = _mm_shufflelo_epi16(val1, 85); //01 01 01 01 (all 1)
10540 _M64(v.val[1], val1);
10541 return v;
10542}
10543
10544_NEON2SSESTORAGE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
10545_NEON2SSE_INLINE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0]
10546{
10547 uint32x2x2_t v;
10548 __m128i val0;
10549 val0 = LOAD_SI128(ptr); //0,1,x,x
10550 val0 = _mm_shuffle_epi32(val0, 0 | (0 << 2) | (1 << 4) | (1 << 6)); //0,0,1,1
10551 vst1q_u32(v.val, val0);
10552 return v;
10553}
10554
10555_NEON2SSE_GLOBAL uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
10556#define vld2_dup_u64 vld2_u64
10557
10558_NEON2SSE_GLOBAL int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
10559#define vld2_dup_s8(ptr) vld2_dup_u8((uint8_t*)ptr)
10560
10561_NEON2SSE_GLOBAL int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
10562#define vld2_dup_s16(ptr) vld2_dup_u16((uint16_t*)ptr)
10563
10564_NEON2SSE_GLOBAL int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
10565#define vld2_dup_s32(ptr) vld2_dup_u32((uint32_t*)ptr)
10566
10567_NEON2SSE_GLOBAL int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
10568#define vld2_dup_s64(ptr) vld2_dup_u64((uint64_t*)ptr)
10569
10570_NEON2SSE_GLOBAL float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
10571// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10572
10573_NEON2SSESTORAGE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
10574_NEON2SSE_INLINE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0]
10575{
10576 float32x2x2_t v;
10577 v.val[0].m64_f32[0] = *(ptr); //0,0
10578 v.val[0].m64_f32[1] = *(ptr); //0,0
10579 v.val[1].m64_f32[0] = *(ptr + 1); //1,1
10580 v.val[1].m64_f32[1] = *(ptr + 1); //1,1
10581 return v;
10582}
10583
10584_NEON2SSE_GLOBAL poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
10585#define vld2_dup_p8 vld2_dup_u8
10586
10587_NEON2SSE_GLOBAL poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
10588#define vld2_dup_p16 vld2_dup_s16
10589
//************* Duplicate (or propagate) triplets: *******************
10591//********************************************************************
10592//ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes and ptr[2] to all val[2] lanes
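//A minimal usage sketch for the triplet case (comment only; names and values are hypothetical).
//A typical use is splatting a packed triple, e.g. one RGB pixel, into three per-channel constant vectors.
//As above, the SSE implementation reads a full 16-byte vector from ptr, so keep the source memory safely readable:
//    uint16_t rgb[8] = {30, 100, 200}; //only the first three elements are meaningful
//    uint16x4x3_t c = vld3_dup_u16(rgb);
//    //c.val[0] = {30,30,30,30}, c.val[1] = {100,100,100,100}, c.val[2] = {200,200,200,200}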
10593_NEON2SSESTORAGE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
10594_NEON2SSE_INLINE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr) // VLD3.8 {d0[], d1[], d2[]}, [r0]
10595{
10596 uint8x8x3_t v;
10597 __m128i val0, val1, val2;
10598 val0 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x,x,x,x,x, x,x,x,x
10599 val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,x,x, x,x,x,x,x,x,x,x,
10600 val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,x,x,x,x,
10601 val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
10602 val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, x,x,x,x,x,x,x,x,
10603 vst1q_u8(v.val, val0);
10604 _M64(v.val[2], val2);
10605 return v;
10606}
10607
10608_NEON2SSESTORAGE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
10609_NEON2SSE_INLINE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr) // VLD3.16 {d0[], d1[], d2[]}, [r0]
10610{
10611 uint16x4x3_t v;
10612 __m128i val0, val1, val2;
10613 val2 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x
10614 val0 = _mm_shufflelo_epi16(val2, 0); //00 00 00 00 (all 0)
10615 val1 = _mm_shufflelo_epi16(val2, 85); //01 01 01 01 (all 1)
10616 val2 = _mm_shufflelo_epi16(val2, 170); //10 10 10 10 (all 2)
10617 _M64(v.val[0], val0);
10618 _M64(v.val[1], val1);
10619 _M64(v.val[2], val2);
10620 return v;
10621}
10622
10623_NEON2SSESTORAGE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
10624_NEON2SSE_INLINE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0]
10625{
10626 uint32x2x3_t v;
10627 __m128i val0, val1, val2;
10628 val2 = LOAD_SI128(ptr); //0,1,2,x
10629 val0 = _mm_shuffle_epi32(val2, 0 | (0 << 2) | (2 << 4) | (2 << 6)); //0,0,2,2
10630 val1 = _mm_shuffle_epi32(val2, 1 | (1 << 2) | (2 << 4) | (2 << 6)); //1,1,2,2
10631 val2 = _mm_srli_si128(val0, 8); //2,2,0x0,0x0
10632 _M64(v.val[0], val0);
10633 _M64(v.val[1], val1);
10634 _M64(v.val[2], val2);
10635 return v;
10636}
10637
10638_NEON2SSESTORAGE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
10639_NEON2SSE_INLINE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0]
10640{
10641 uint64x1x3_t v;
10642 v.val[0].m64_u64[0] = *(ptr);
10643 v.val[1].m64_u64[0] = *(ptr + 1);
10644 v.val[2].m64_u64[0] = *(ptr + 2);
10645 return v;
10646}
10647
10648_NEON2SSE_GLOBAL int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
10649#define vld3_dup_s8(ptr) vld3_dup_u8((uint8_t*)ptr)
10650
10651_NEON2SSE_GLOBAL int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
10652#define vld3_dup_s16(ptr) vld3_dup_u16((uint16_t*)ptr)
10653
10654_NEON2SSE_GLOBAL int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
10655#define vld3_dup_s32(ptr) vld3_dup_u32((uint32_t*)ptr)
10656
10657//int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
10658#define vld3_dup_s64(ptr) vld3_dup_u64((uint64_t*)ptr)
10659
10660
10661_NEON2SSE_GLOBAL float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
10662// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10663
10664_NEON2SSESTORAGE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
10665_NEON2SSE_INLINE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0]
10666{
10667 float32x2x3_t v;
10668 int i;
10669 for (i = 0; i<3; i++) {
10670 v.val[i].m64_f32[0] = *(ptr + i);
10671 v.val[i].m64_f32[1] = *(ptr + i);
10672 }
10673 return v;
10674}
10675
10676_NEON2SSE_GLOBAL poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
10677#define vld3_dup_p8 vld3_dup_u8
10678
10679_NEON2SSE_GLOBAL poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
10680#define vld3_dup_p16 vld3_dup_s16
10681
10682
10683//************* Duplicate (or propagate) quadruples: *******************
10684//***********************************************************************
10685//ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes, ptr[2] to all val[2] lanes and ptr[3] to all val[3] lanes
10686_NEON2SSESTORAGE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
10687_NEON2SSE_INLINE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr) // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
10688{
10689 uint8x8x4_t v;
10690 __m128i val0, val1, val2;
10691 val0 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x,x,x,x,x, x,x,x,x
10692 val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,3,3, x,x,x,x,x,x,x,x,
10693 val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,3,3,3,3
10694 val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
10695 val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, 3,3,3,3, 3,3,3,3
10696 vst1q_u8(&v.val[0], val0);
10697 vst1q_u8(&v.val[2], val2);
10698 return v;
10699}
10700
10701_NEON2SSESTORAGE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
10702_NEON2SSE_INLINE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr) // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
10703{
10704 uint16x4x4_t v;
10705 __m128i val0, val1, val2, val3;
10706 val3 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x
10707 val0 = _mm_shufflelo_epi16(val3, 0); //00 00 00 00 (all 0)
10708 val1 = _mm_shufflelo_epi16(val3, 85); //01 01 01 01 (all 1)
10709 val2 = _mm_shufflelo_epi16(val3, 170); //10 10 10 10 (all 2)
10710 val3 = _mm_shufflelo_epi16(val3, 255); //11 11 11 11 (all 3)
10711 _M64(v.val[0], val0);
10712 _M64(v.val[1], val1);
10713 _M64(v.val[2], val2);
10714 _M64(v.val[3], val3);
10715 return v;
10716}
10717
10718_NEON2SSESTORAGE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
10719_NEON2SSE_INLINE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
10720{
10721 uint32x2x4_t v;
10722 __m128i val0, val1, val2, val3;
10723 val3 = LOAD_SI128(ptr); //0,1,2,3
10724 val0 = _mm_shuffle_epi32(val3, 0 | (0 << 2) | (2 << 4) | (3 << 6)); //0,0,2,3
10725 val1 = _mm_shuffle_epi32(val3, 1 | (1 << 2) | (2 << 4) | (3 << 6)); //1,1,2,3
10726 val2 = _mm_shuffle_epi32(val3, 2 | (2 << 2) | (3 << 4) | (3 << 6)); //2,2,3,3
    val3 = _mm_shuffle_epi32(val3, 3 | (3 << 2) | (3 << 4) | (3 << 6)); //3,3,3,3
10728 _M64(v.val[0], val0);
10729 _M64(v.val[1], val1);
10730 _M64(v.val[2], val2);
10731 _M64(v.val[3], val3);
10732 return v;
10733}
10734
10735_NEON2SSESTORAGE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
10736_NEON2SSE_INLINE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0]
10737{
10738 uint64x1x4_t v;
10739 v.val[0].m64_u64[0] = *(ptr);
10740 v.val[1].m64_u64[0] = *(ptr + 1);
10741 v.val[2].m64_u64[0] = *(ptr + 2);
10742 v.val[3].m64_u64[0] = *(ptr + 3);
10743 return v;
10744}
10745
10746_NEON2SSE_GLOBAL int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
10747#define vld4_dup_s8(ptr) vld4_dup_u8((uint8_t*)ptr)
10748
10749_NEON2SSE_GLOBAL int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
10750#define vld4_dup_s16(ptr) vld4_dup_u16((uint16_t*)ptr)
10751
10752_NEON2SSE_GLOBAL int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
10753#define vld4_dup_s32(ptr) vld4_dup_u32((uint32_t*)ptr)
10754
10755//int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
10756#define vld4_dup_s64(ptr) vld4_dup_u64((uint64_t*)ptr)
10757
10758_NEON2SSE_GLOBAL float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
10759// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10760
10761_NEON2SSESTORAGE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
10762_NEON2SSE_INLINE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
10763{
10764 float32x2x4_t v;
10765 int i;
10766 for (i = 0; i<4; i++) {
10767 v.val[i].m64_f32[0] = *(ptr + i);
10768 v.val[i].m64_f32[1] = *(ptr + i);
10769 }
10770 return v;
10771}
10772
10773_NEON2SSE_GLOBAL poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
10774#define vld4_dup_p8 vld4_dup_u8
10775
10776_NEON2SSE_GLOBAL poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
10777#define vld4_dup_p16 vld4_dup_u16
10778
10779
10780//**********************************************************************************
10781//*******************Lane loads for an N-element structures ***********************
10782//**********************************************************************************
10783//********************** Lane pairs ************************************************
//does vld1_lane_xx: loads ptr[0] into src->val[0] and ptr[1] into src->val[1] at the given lane position
//we assume src is 16-byte aligned
10786
//!!!!!! The Microsoft compiler does not allow xxxxxx_2t function arguments, producing a "formal parameter with __declspec(align('16')) won't be aligned" error
//to fix it, all the functions below work with xxxxxx_2t pointers and the corresponding original functions are redefined as macros
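//A minimal usage sketch (comment only; variable names are hypothetical).
//It shows how the lane-load wrappers are meant to be called: ptr[0] and ptr[1] replace the chosen lane of
//src.val[0] and src.val[1] respectively, all other lanes are preserved. Because of the pointer/macro workaround
//described above, src must be an lvalue, and lane must be a compile-time constant as required by __constrange:
//    uint32_t pair[2] = {7, 9};
//    uint32x4x2_t v = vld2q_lane_u32(pair, v0, 3); //v0 is a previously initialized uint32x4x2_t
//    //v.val[0] equals v0.val[0] with lane 3 replaced by 7; v.val[1] equals v0.val[1] with lane 3 replaced by 9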
10789
10790//uint16x8x2_t vld2q_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
10791_NEON2SSE_INLINE uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t* src,__constrange(0,7) int lane) // VLD2.16 {d0[0], d2[0]}, [r0]
10792{
10793 uint16x8x2_t v;
10794 v.val[0] = vld1q_lane_s16 (ptr, src->val[0], lane);
10795 v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1], lane);
10796 return v;
10797}
10798#define vld2q_lane_u16(ptr, src, lane) vld2q_lane_u16_ptr(ptr, &src, lane)
10799
10800//uint32x4x2_t vld2q_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
10801_NEON2SSE_INLINE uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0]
10802{
10803 uint32x4x2_t v;
10804 v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane);
10805 v.val[1] = _MM_INSERT_EPI32 (src->val[1], ptr[1], lane);
10806 return v;
10807}
10808#define vld2q_lane_u32(ptr, src, lane) vld2q_lane_u32_ptr(ptr, &src, lane)
10809
10810//int16x8x2_t vld2q_lane_s16(__transfersize(2) int16_t const * ptr, int16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
10811_NEON2SSE_INLINE int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t* src, __constrange(0,7) int lane)
10812{
10813 int16x8x2_t v;
10814 v.val[0] = vld1q_lane_s16 (ptr, src->val[0], lane);
10815 v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1], lane);
10816 return v;
10817}
10818#define vld2q_lane_s16(ptr, src, lane) vld2q_lane_s16_ptr(ptr, &src, lane)
10819
10820//int32x4x2_t vld2q_lane_s32(__transfersize(2) int32_t const * ptr, int32x4x2_t src, __constrange(0,3)int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
10821_NEON2SSE_INLINE int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t* src, __constrange(0,3) int lane)
10822{
10823 int32x4x2_t v;
10824 v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane);
10825 v.val[1] = _MM_INSERT_EPI32 (src->val[1], ptr[1], lane);
10826 return v;
10827}
10828#define vld2q_lane_s32(ptr, src, lane) vld2q_lane_s32_ptr(ptr, &src, lane)
10829
10830//float16x8x2_t vld2q_lane_f16(__transfersize(2) __fp16 const * ptr, float16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
10831//current IA SIMD doesn't support float16
10832
10833//float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
10834_NEON2SSE_INLINE float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0]
10835{
10836 float32x4x2_t v;
10837 v.val[0] = vld1q_lane_f32(ptr, src->val[0], lane);
10838 v.val[1] = vld1q_lane_f32((ptr + 1), src->val[1], lane);
10839 return v;
10840}
10841#define vld2q_lane_f32(ptr,src,lane) vld2q_lane_f32_ptr(ptr,&src,lane)
10842
10843//poly16x8x2_t vld2q_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
10844#define vld2q_lane_p16 vld2q_lane_u16
10845
10846_NEON2SSESTORAGE uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
10847_NEON2SSE_INLINE uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane) // VLD2.8 {d0[0], d1[0]}, [r0]
10848{
10849 uint8x8x2_t v;
10850 v.val[0] = vld1_lane_u8(ptr, src.val[0], lane);
10851 v.val[1] = vld1_lane_u8((ptr + 1), src.val[1], lane);
10852 return v;
10853}
10854
10855_NEON2SSESTORAGE uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
10856_NEON2SSE_INLINE uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3) int lane)
10857{
10858 uint16x4x2_t v;
10859 v.val[0] = vld1_lane_u16(ptr, src.val[0], lane);
10860 v.val[1] = vld1_lane_u16((ptr + 1), src.val[1], lane);
10861 return v;
10862}
10863
10864_NEON2SSESTORAGE uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1)int lane);// VLD2.32 {d0[0], d1[0]}, [r0]
10865_NEON2SSE_INLINE uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1) int lane)
10866{
10867 uint32x2x2_t v;
10868 v.val[0] = vld1_lane_u32(ptr, src.val[0], lane);
10869 v.val[1] = vld1_lane_u32((ptr + 1), src.val[1], lane);
10870 return v;
10871}
10872
10873_NEON2SSE_GLOBAL int8x8x2_t vld2_lane_s8(__transfersize(2) int8_t const * ptr, int8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
10874#define vld2_lane_s8(ptr, src, lane) vld2_lane_u8(( uint8_t*) ptr, src, lane)
10875
10876_NEON2SSE_GLOBAL int16x4x2_t vld2_lane_s16(__transfersize(2) int16_t const * ptr, int16x4x2_t src, __constrange(0,3) int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
10877#define vld2_lane_s16(ptr, src, lane) vld2_lane_u16(( uint16_t*) ptr, src, lane)
10878
10879_NEON2SSE_GLOBAL int32x2x2_t vld2_lane_s32(__transfersize(2) int32_t const * ptr, int32x2x2_t src, __constrange(0,1) int lane);// VLD2.32 {d0[0], d1[0]}, [r0]
10880#define vld2_lane_s32(ptr, src, lane) vld2_lane_u32(( uint32_t*) ptr, src, lane)
10881
10882//float16x4x2_t vld2_lane_f16(__transfersize(2) __fp16 const * ptr, float16x4x2_t src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
10883//current IA SIMD doesn't support float16
10884
10885_NEON2SSESTORAGE float32x2x2_t vld2_lane_f32(__transfersize(2) float32_t const * ptr, float32x2x2_t src,__constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
10886_NEON2SSE_INLINE float32x2x2_t vld2_lane_f32(__transfersize(2) float32_t const * ptr, float32x2x2_t src,__constrange(0,1) int lane)
10887{
10888 float32x2x2_t v;
10889 v.val[0] = vld1_lane_f32(ptr, src.val[0], lane);
10890 v.val[1] = vld1_lane_f32((ptr + 1), src.val[1], lane);
10891 return v;
10892}
10893
10894//poly8x8x2_t vld2_lane_p8(__transfersize(2) poly8_t const * ptr, poly8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
10895_NEON2SSE_GLOBAL poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane); // VLD2.8 {d0[0], d1[0]}, [r0]
10896#define vld2_lane_p8 vld2_lane_u8
10897
10898//poly16x4x2_t vld2_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
10899_NEON2SSE_GLOBAL poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
10900#define vld2_lane_p16 vld2_lane_u16
10901
10902//*********** Lane triplets **********************
10903//*************************************************
//does vld1_lane_xx: loads ptr[0] into src->val[0], ptr[1] into src->val[1] and ptr[2] into src->val[2] at the given lane position
//we assume src is 16-byte aligned
10906
10907//uint16x8x3_t vld3q_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x8x3_t src,__constrange(0,7) int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
10908_NEON2SSE_INLINE uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t* src,__constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
10909{
10910 uint16x8x3_t v;
10911 v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane);
10912 v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane);
10913 v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane);
10914 return v;
10915}
10916#define vld3q_lane_u16(ptr, src, lane) vld3q_lane_u16_ptr(ptr, &src, lane)
10917
10918//uint32x4x3_t vld3q_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
10919_NEON2SSE_INLINE uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
10920{
10921 uint32x4x3_t v;
10922 v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane);
10923 v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane);
10924 v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane);
10925 return v;
10926}
10927#define vld3q_lane_u32(ptr, src, lane) vld3q_lane_u32_ptr(ptr, &src, lane)
10928
10929//int16x8x3_t vld3q_lane_s16(__transfersize(3) int16_t const * ptr, int16x8x3_t src, __constrange(0,7)int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
10930_NEON2SSE_INLINE int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t* src, __constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
10931{
10932 int16x8x3_t v;
10933 v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane);
10934 v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane);
10935 v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane);
10936 return v;
10937}
10938#define vld3q_lane_s16(ptr, src, lane) vld3q_lane_s16_ptr(ptr, &src, lane)
10939
10940//int32x4x3_t vld3q_lane_s32(__transfersize(3) int32_t const * ptr, int32x4x3_t src, __constrange(0,3)int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
10941_NEON2SSE_INLINE int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t* src, __constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
10942{
10943 int32x4x3_t v;
10944 v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane);
10945 v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane);
10946 v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane);
10947 return v;
10948}
10949#define vld3q_lane_s32(ptr, src, lane) vld3q_lane_s32_ptr(ptr, &src, lane)
10950
10951_NEON2SSE_GLOBAL float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
10952//current IA SIMD doesn't support float16
10953#define vld3q_lane_f16(ptr, src, lane) vld3q_lane_f16_ptr(ptr, &src, lane)
10954
10955
10956//float32x4x3_t vld3q_lane_f32(__transfersize(3) float32_t const * ptr, float32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
10957_NEON2SSE_INLINE float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
10958{
10959 float32x4x3_t v;
10960 v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane);
10961 v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane);
10962 v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane);
10963 return v;
10964}
10965#define vld3q_lane_f32(ptr,src,lane) vld3q_lane_f32_ptr(ptr,&src,lane)
10966
10967_NEON2SSE_GLOBAL poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src,__constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
10968#define vld3q_lane_p16 vld3q_lane_u16
10969
10970_NEON2SSESTORAGE uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane);// VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
10971_NEON2SSE_INLINE uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane) // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
10972{
10973 uint8x8x3_t v;
10974 v.val[0] = vld1_lane_u8(ptr, src.val[0], lane);
10975 v.val[1] = vld1_lane_u8((ptr + 1), src.val[1], lane);
10976 v.val[2] = vld1_lane_u8((ptr + 2), src.val[2], lane);
10977 return v;
10978}
10979
10980_NEON2SSESTORAGE uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x4x3_t src, __constrange(0,3)int lane);// VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
10981_NEON2SSE_INLINE uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x4x3_t src, __constrange(0,3) int lane) // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
10982{
10983 uint16x4x3_t v;
10984 v.val[0] = vld1_lane_u16(ptr, src.val[0], lane);
10985 v.val[1] = vld1_lane_u16((ptr + 1), src.val[1], lane);
10986 v.val[2] = vld1_lane_u16((ptr + 2), src.val[2], lane);
10987 return v;
10988}
10989
10990_NEON2SSESTORAGE uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1)int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
10991_NEON2SSE_INLINE uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
10992{
10993 //need to merge into 128 bit anyway
10994 uint32x2x3_t v;
    v.val[0] = vld1_lane_u32(ptr, src.val[0], lane);
    v.val[1] = vld1_lane_u32((ptr + 1), src.val[1], lane);
    v.val[2] = vld1_lane_u32((ptr + 2), src.val[2], lane);
10998 return v;
10999}
11000
11001_NEON2SSE_GLOBAL int8x8x3_t vld3_lane_s8(__transfersize(3) int8_t const * ptr, int8x8x3_t src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
11002#define vld3_lane_s8(ptr, src, lane) vld3_lane_u8(( uint8_t*) ptr, src, lane)
11003
11004_NEON2SSE_GLOBAL int16x4x3_t vld3_lane_s16(__transfersize(3) int16_t const * ptr, int16x4x3_t src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
11005#define vld3_lane_s16(ptr, src, lane) vld3_lane_u16(( uint16_t*) ptr, src, lane)
11006
11007_NEON2SSE_GLOBAL int32x2x3_t vld3_lane_s32(__transfersize(3) int32_t const * ptr, int32x2x3_t src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
11008#define vld3_lane_s32(ptr, src, lane) vld3_lane_u32(( uint32_t*) ptr, src, lane)
11009
11010_NEON2SSE_GLOBAL float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
11011//current IA SIMD doesn't support float16
11012
11013_NEON2SSESTORAGE float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src,__constrange(0,1) int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
11014_NEON2SSE_INLINE float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src,__constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
11015{
11016 float32x2x3_t v;
11017 v.val[0] = vld1_lane_f32(ptr, src.val[0], lane);
11018 v.val[1] = vld1_lane_f32((ptr + 1), src.val[1], lane);
11019 v.val[2] = vld1_lane_f32((ptr + 2), src.val[2], lane);
11020 return v;
11021}
11022
11023_NEON2SSE_GLOBAL poly8x8x3_t vld3_lane_p8(__transfersize(3) poly8_t const * ptr, poly8x8x3_t src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
11024#define vld3_lane_p8 vld3_lane_u8
11025
11026_NEON2SSE_GLOBAL poly16x4x3_t vld3_lane_p16(__transfersize(3) poly16_t const * ptr, poly16x4x3_t src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
11027#define vld3_lane_p16 vld3_lane_u16
11028
11029//******************* Lane Quadruples load ***************************
11030//*********************************************************************
//does vld1_lane_xx: loads ptr[0] into src->val[0], ptr[1] into src->val[1], ptr[2] into src->val[2] and ptr[3] into src->val[3] at the given lane position
//we assume src is 16-byte aligned
11033
11034//uint16x8x4_t vld4q_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x8x4_t src,__constrange(0,7) int lane)// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11035_NEON2SSE_INLINE uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t* src,__constrange(0,7) int lane)
11036{
11037 uint16x8x4_t v;
11038 v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane);
11039 v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane);
11040 v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane);
11041 v.val[3] = _MM_INSERT_EPI16 ( src->val[3], ptr[3], lane);
11042 return v;
11043}
11044#define vld4q_lane_u16(ptr, src, lane) vld4q_lane_u16_ptr(ptr, &src, lane)
11045
11046//uint32x4x4_t vld4q_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11047_NEON2SSE_INLINE uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t* src,__constrange(0,3) int lane)
11048{
11049 uint32x4x4_t v;
11050 v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane);
11051 v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane);
11052 v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane);
11053 v.val[3] = _MM_INSERT_EPI32 ( src->val[3], ptr[3], lane);
11054 return v;
11055}
11056#define vld4q_lane_u32(ptr, src, lane) vld4q_lane_u32_ptr(ptr, &src, lane)
11057
11058//int16x8x4_t vld4q_lane_s16(__transfersize(4) int16_t const * ptr, int16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11059_NEON2SSE_GLOBAL int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11060#define vld4q_lane_s16(ptr, src, lane) vld4q_lane_u16(( uint16_t*) ptr, src, lane)
11061
11062//int32x4x4_t vld4q_lane_s32(__transfersize(4) int32_t const * ptr, int32x4x4_t src, __constrange(0,3)int lane);// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11063_NEON2SSE_GLOBAL int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11064#define vld4q_lane_s32(ptr, src, lane) vld4q_lane_u32(( uint32_t*) ptr, src, lane)
11065
11066//float16x8x4_t vld4q_lane_f16(__transfersize(4) __fp16 const * ptr, float16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11067_NEON2SSE_GLOBAL float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11068//current IA SIMD doesn't support float16
11069
11070//float32x4x4_t vld4q_lane_f32(__transfersize(4) float32_t const * ptr, float32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11071_NEON2SSE_INLINE float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t* src,__constrange(0,3) int lane)
11072{
11073 float32x4x4_t v;
11074 v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane);
11075 v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane);
11076 v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane);
11077 v.val[3] = vld1q_lane_f32(&ptr[3], src->val[3], lane);
11078 return v;
11079}
11080#define vld4q_lane_f32(ptr,val,lane) vld4q_lane_f32_ptr(ptr,&val,lane)
11081
11082//poly16x8x4_t vld4q_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x8x4_t src,__constrange(0,7) int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11083_NEON2SSE_GLOBAL poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src,__constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11084#define vld4q_lane_p16 vld4q_lane_u16
11085
11086_NEON2SSESTORAGE uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11087_NEON2SSE_INLINE uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane)
11088{
11089 uint8x8x4_t v;
11090 v.val[0] = vld1_lane_u8(ptr, src.val[0], lane);
11091 v.val[1] = vld1_lane_u8((ptr + 1), src.val[1], lane);
11092 v.val[2] = vld1_lane_u8((ptr + 2), src.val[2], lane);
11093 v.val[3] = vld1_lane_u8((ptr + 3), src.val[3], lane);
11094 return v;
11095}
11096
11097_NEON2SSESTORAGE uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11098_NEON2SSE_INLINE uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3) int lane)
11099{
11100 uint16x4x4_t v;
11101 v.val[0] = vld1_lane_u16(ptr, src.val[0], lane);
11102 v.val[1] = vld1_lane_u16((ptr + 1), src.val[1], lane);
11103 v.val[2] = vld1_lane_u16((ptr + 2), src.val[2], lane);
11104 v.val[3] = vld1_lane_u16((ptr + 3), src.val[3], lane);
11105 return v;
11106}
11107
11108_NEON2SSESTORAGE uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1)int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11109_NEON2SSE_INLINE uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1) int lane)
11110{
11111 uint32x2x4_t v;
11112 v.val[0] = vld1_lane_u32(ptr, src.val[0], lane);
11113 v.val[1] = vld1_lane_u32((ptr + 1), src.val[1], lane);
11114 v.val[2] = vld1_lane_u32((ptr + 2), src.val[2], lane);
11115 v.val[3] = vld1_lane_u32((ptr + 3), src.val[3], lane);
11116 return v;
11117}
11118
11119_NEON2SSE_GLOBAL int8x8x4_t vld4_lane_s8(__transfersize(4) int8_t const * ptr, int8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11120#define vld4_lane_s8(ptr,src,lane) vld4_lane_u8((uint8_t*)ptr,src,lane)
11121
11122_NEON2SSE_GLOBAL int16x4x4_t vld4_lane_s16(__transfersize(4) int16_t const * ptr, int16x4x4_t src, __constrange(0,3) int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11123#define vld4_lane_s16(ptr,src,lane) vld4_lane_u16((uint16_t*)ptr,src,lane)
11124
11125_NEON2SSE_GLOBAL int32x2x4_t vld4_lane_s32(__transfersize(4) int32_t const * ptr, int32x2x4_t src, __constrange(0,1) int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11126#define vld4_lane_s32(ptr,src,lane) vld4_lane_u32((uint32_t*)ptr,src,lane)
11127
11128//float16x4x4_t vld4_lane_f16(__transfersize(4) __fp16 const * ptr, float16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11129_NEON2SSE_GLOBAL float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane);
11130//current IA SIMD doesn't support float16
11131
11132_NEON2SSESTORAGE float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src,__constrange(0,1) int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11133_NEON2SSE_INLINE float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src,__constrange(0,1) int lane)
11134{
11135 //serial solution may be faster
11136 float32x2x4_t v;
11137 v.val[0] = vld1_lane_f32(ptr, src.val[0], lane);
11138 v.val[1] = vld1_lane_f32((ptr + 1), src.val[1], lane);
11139 v.val[2] = vld1_lane_f32((ptr + 2), src.val[2], lane);
11140 v.val[3] = vld1_lane_f32((ptr + 3), src.val[3], lane);
11141 return v;
11142}
11143
11144_NEON2SSE_GLOBAL poly8x8x4_t vld4_lane_p8(__transfersize(4) poly8_t const * ptr, poly8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11145#define vld4_lane_p8 vld4_lane_u8
11146
11147_NEON2SSE_GLOBAL poly16x4x4_t vld4_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11148#define vld4_lane_p16 vld4_lane_u16
11149
11150//******************* Store duplets *********************************************
11151//********************************************************************************
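//A minimal usage sketch (comment only; buffer names are hypothetical).
//vst2 is the inverse of vld2: the two vectors are interleaved element-wise on store, so separate
//x/y streams end up as x0,y0,x1,y1,... in memory:
//    float32_t out[8];
//    float32x4x2_t xy;
//    xy.val[0] = vld1q_f32(xs); //x0,x1,x2,x3 (xs is a caller-provided array of 4 floats)
//    xy.val[1] = vld1q_f32(ys); //y0,y1,y2,y3
//    vst2q_f32(out, xy);        //out = {x0,y0, x1,y1, x2,y2, x3,y3}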
11152//void vst2q_u8(__transfersize(32) uint8_t * ptr, uint8x16x2_t val)// VST2.8 {d0, d2}, [r0]
11153_NEON2SSE_INLINE void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t const * val)
11154{
11155 uint8x16x2_t v;
11156 v.val[0] = _mm_unpacklo_epi8(val->val[0], val->val[1]);
11157 v.val[1] = _mm_unpackhi_epi8(val->val[0], val->val[1]);
11158 vst1q_u8 (ptr, v.val[0]);
11159 vst1q_u8 ((ptr + 16), v.val[1]);
11160}
11161#define vst2q_u8(ptr, val) vst2q_u8_ptr(ptr, &val)
11162
11163//void vst2q_u16(__transfersize(16) uint16_t * ptr, uint16x8x2_t val)// VST2.16 {d0, d2}, [r0]
11164_NEON2SSE_INLINE void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t const * val)
11165{
11166 uint16x8x2_t v;
11167 v.val[0] = _mm_unpacklo_epi16(val->val[0], val->val[1]);
11168 v.val[1] = _mm_unpackhi_epi16(val->val[0], val->val[1]);
11169 vst1q_u16 (ptr, v.val[0]);
11170 vst1q_u16 ((ptr + 8), v.val[1]);
11171}
11172#define vst2q_u16(ptr, val) vst2q_u16_ptr(ptr, &val)
11173
11174//void vst2q_u32(__transfersize(8) uint32_t * ptr, uint32x4x2_t val)// VST2.32 {d0, d2}, [r0]
11175_NEON2SSE_INLINE void vst2q_u32_ptr(__transfersize(8) uint32_t* ptr, uint32x4x2_t const * val)
11176{
11177 uint32x4x2_t v;
11178 v.val[0] = _mm_unpacklo_epi32(val->val[0], val->val[1]);
11179 v.val[1] = _mm_unpackhi_epi32(val->val[0], val->val[1]);
11180 vst1q_u32 (ptr, v.val[0]);
11181 vst1q_u32 ((ptr + 4), v.val[1]);
11182}
11183#define vst2q_u32(ptr, val) vst2q_u32_ptr(ptr, &val)
11184
11185//void vst2q_s8(__transfersize(32) int8_t * ptr, int8x16x2_t val); // VST2.8 {d0, d2}, [r0]
11186_NEON2SSE_GLOBAL void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t const * val);
11187#define vst2q_s8(ptr, val) vst2q_u8((uint8_t*)(ptr), val)
11188
11189//void vst2q_s16(__transfersize(16) int16_t * ptr, int16x8x2_t val);// VST2.16 {d0, d2}, [r0]
11190_NEON2SSE_GLOBAL void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t const * val);
11191#define vst2q_s16(ptr, val) vst2q_u16((uint16_t*)(ptr), val)
11192
11193//void vst2q_s32(__transfersize(8) int32_t * ptr, int32x4x2_t val);// VST2.32 {d0, d2}, [r0]
11194_NEON2SSE_GLOBAL void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t const * val);
11195#define vst2q_s32(ptr, val) vst2q_u32((uint32_t*)(ptr), val)
11196
11197//void vst2q_f16(__transfersize(16) __fp16 * ptr, float16x8x2_t val);// VST2.16 {d0, d2}, [r0]
11198_NEON2SSE_GLOBAL void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t const * val);
11199// IA32 SIMD doesn't work with 16bit floats currently
11200
11201//void vst2q_f32(__transfersize(8) float32_t * ptr, float32x4x2_t val)// VST2.32 {d0, d2}, [r0]
11202_NEON2SSE_INLINE void vst2q_f32_ptr(__transfersize(8) float32_t* ptr, float32x4x2_t const * val)
11203{
11204 float32x4x2_t v;
11205 v.val[0] = _mm_unpacklo_ps(val->val[0], val->val[1]);
11206 v.val[1] = _mm_unpackhi_ps(val->val[0], val->val[1]);
11207 vst1q_f32 (ptr, v.val[0]);
11208 vst1q_f32 ((ptr + 4), v.val[1]);
11209}
11210#define vst2q_f32(ptr, val) vst2q_f32_ptr(ptr, &val)
11211
11212//void vst2q_p8(__transfersize(32) poly8_t * ptr, poly8x16x2_t val);// VST2.8 {d0, d2}, [r0]
11213_NEON2SSE_GLOBAL void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t const * val);
11214#define vst2q_p8 vst2q_u8
11215
11216//void vst2q_p16(__transfersize(16) poly16_t * ptr, poly16x8x2_t val);// VST2.16 {d0, d2}, [r0]
11217_NEON2SSE_GLOBAL void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t const * val);
11218#define vst2q_p16 vst2q_u16
11219
11220_NEON2SSESTORAGE void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val);// VST2.8 {d0, d1}, [r0]
11221_NEON2SSE_INLINE void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val)
11222{
11223 __m128i v0;
11224 v0 = _mm_unpacklo_epi8(_pM128i(val.val[0]), _pM128i(val.val[1]));
11225 vst1q_u8 (ptr, v0);
11226}
11227
11228_NEON2SSESTORAGE void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val);// VST2.16 {d0, d1}, [r0]
11229_NEON2SSE_INLINE void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val)
11230{
11231 __m128i v0;
11232 v0 = _mm_unpacklo_epi16(_pM128i(val.val[0]), _pM128i(val.val[1]));
11233 vst1q_u16 (ptr, v0);
11234}
11235
11236_NEON2SSESTORAGE void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val);// VST2.32 {d0, d1}, [r0]
11237_NEON2SSE_INLINE void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val)
11238{
11239 __m128i v0;
11240 v0 = _mm_unpacklo_epi32(_pM128i(val.val[0]), _pM128i(val.val[1]));
11241 vst1q_u32 (ptr, v0);
11242}
11243
11244_NEON2SSESTORAGE void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val);// VST1.64 {d0, d1}, [r0]
11245_NEON2SSE_INLINE void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val)
11246{
11247 *(ptr) = val.val[0].m64_u64[0];
11248 *(ptr + 1) = val.val[1].m64_u64[0];
11249}
11250
11251_NEON2SSE_GLOBAL void vst2_s8(__transfersize(16) int8_t * ptr, int8x8x2_t val);// VST2.8 {d0, d1}, [r0]
11252#define vst2_s8(ptr, val) vst2_u8((uint8_t*) ptr, val)
11253
11254_NEON2SSE_GLOBAL void vst2_s16(__transfersize(8) int16_t * ptr, int16x4x2_t val); // VST2.16 {d0, d1}, [r0]
11255#define vst2_s16(ptr,val) vst2_u16((uint16_t*) ptr, val)
11256
11257_NEON2SSE_GLOBAL void vst2_s32(__transfersize(4) int32_t * ptr, int32x2x2_t val); // VST2.32 {d0, d1}, [r0]
11258#define vst2_s32(ptr,val) vst2_u32((uint32_t*) ptr, val)
11259
11260_NEON2SSE_GLOBAL void vst2_s64(__transfersize(2) int64_t * ptr, int64x1x2_t val);
11261#define vst2_s64(ptr,val) vst2_u64((uint64_t*) ptr,val)
11262
11263//void vst2_f16(__transfersize(8) __fp16 * ptr, float16x4x2_t val); // VST2.16 {d0, d1}, [r0]
11264//current IA SIMD doesn't support float16
11265
11266_NEON2SSESTORAGE void vst2_f32(__transfersize(4) float32_t * ptr, float32x2x2_t val); // VST2.32 {d0, d1}, [r0]
11267_NEON2SSE_INLINE void vst2_f32(__transfersize(4) float32_t* ptr, float32x2x2_t val)
11268{
11269 *(ptr) = val.val[0].m64_f32[0];
11270 *(ptr + 1) = val.val[1].m64_f32[0];
11271 *(ptr + 2) = val.val[0].m64_f32[1];
11272 *(ptr + 3) = val.val[1].m64_f32[1];
11273}
11274
11275_NEON2SSE_GLOBAL void vst2_p8(__transfersize(16) poly8_t * ptr, poly8x8x2_t val); // VST2.8 {d0, d1}, [r0]
11276#define vst2_p8 vst2_u8
11277
11278_NEON2SSE_GLOBAL void vst2_p16(__transfersize(8) poly16_t * ptr, poly16x4x2_t val); // VST2.16 {d0, d1}, [r0]
11279#define vst2_p16 vst2_u16
11280
11281//******************** Triplets store *****************************************
11282//******************************************************************************
11283//void vst3q_u8(__transfersize(48) uint8_t * ptr, uint8x16x3_t val)// VST3.8 {d0, d2, d4}, [r0]
11284_NEON2SSE_INLINE void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t const * val)
11285{
11286 uint8x16x3_t v;
11287 __m128i v0,v1,v2, cff, bldmask;
11288 _NEON2SSE_ALIGN_16 static const uint8_t mask0[16] = {0, 1, 0xff, 2, 3,0xff, 4, 5,0xff, 6,7,0xff, 8,9,0xff, 10};
11289 _NEON2SSE_ALIGN_16 static const uint8_t mask1[16] = {0, 0xff, 1, 2, 0xff, 3, 4, 0xff, 5, 6, 0xff, 7,8,0xff, 9,10};
11290 _NEON2SSE_ALIGN_16 static const uint8_t mask2[16] = {0xff, 6, 7, 0xff, 8, 9,0xff, 10, 11,0xff, 12,13,0xff, 14,15,0xff};
11291 _NEON2SSE_ALIGN_16 static const uint8_t mask2lo[16] = {0xff,0xff, 0, 0xff,0xff, 1, 0xff,0xff, 2, 0xff,0xff, 3, 0xff,0xff, 4, 0xff};
11292 _NEON2SSE_ALIGN_16 static const uint8_t mask2med[16] = {0xff, 5, 0xff, 0xff, 6, 0xff,0xff, 7, 0xff,0xff, 8, 0xff,0xff, 9, 0xff, 0xff};
11293 _NEON2SSE_ALIGN_16 static const uint8_t mask2hi[16] = {10, 0xff,0xff, 11, 0xff,0xff, 12, 0xff,0xff, 13, 0xff,0xff, 14, 0xff, 0xff, 15};
11294
11295 v0 = _mm_unpacklo_epi8(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10, 12,13, 15,16, 18,19, 21,22
11296 v2 = _mm_unpackhi_epi8(val->val[0], val->val[1]); //24,25, 27,28, 30,31, 33,34, 36,37, 39,40, 42,43, 45,46
11297 v1 = _mm_alignr_epi8(v2, v0, 11); //12,13, 15,16, 18,19, 21,22, 24,25, 27,28, 30,31, 33,34
11298 v.val[0] = _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding
11299 v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding
11300 cff = _mm_cmpeq_epi8(v0, v0); //all ff
11301 bldmask = _mm_cmpeq_epi8(*(__m128i*)mask0, cff);
11302 v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask);
11303 vst1q_u8(ptr, v.val[0]);
11304 v.val[0] = _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding
11305 v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding
11306 bldmask = _mm_cmpeq_epi8(*(__m128i*)mask1, cff);
11307 v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask);
11308 vst1q_u8((ptr + 16), v.val[1]);
11309 v.val[0] = _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding
11310 v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding
11311 bldmask = _mm_cmpeq_epi8(*(__m128i*)mask2, cff);
11312 v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask );
11313 vst1q_u8((ptr + 32), v.val[2]);
11314}
11315#define vst3q_u8(ptr, val) vst3q_u8_ptr(ptr, &val)
11316
11317//void vst3q_u16(__transfersize(24) uint16_t * ptr, uint16x8x3_t val)// VST3.16 {d0, d2, d4}, [r0]
11318_NEON2SSE_INLINE void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t const * val)
11319{
11320 uint16x8x3_t v;
11321 __m128i v0,v1,v2, cff, bldmask;
11322 _NEON2SSE_ALIGN_16 static const uint8_t mask0[16] = {0,1, 2,3, 0xff,0xff, 4,5, 6,7,0xff,0xff, 8,9,10,11};
11323 _NEON2SSE_ALIGN_16 static const uint8_t mask1[16] = {0xff, 0xff, 0,1, 2,3, 0xff,0xff, 4,5, 6,7, 0xff,0xff, 8,9};
11324 _NEON2SSE_ALIGN_16 static const uint8_t mask2[16] = {6,7,0xff,0xff, 8,9,10,11, 0xff, 0xff, 12,13,14,15, 0xff, 0xff};
11325 _NEON2SSE_ALIGN_16 static const uint8_t mask2lo[16] = {0xff,0xff, 0xff,0xff, 0,1, 0xff,0xff, 0xff,0xff, 2,3, 0xff,0xff, 0xff,0xff};
11326 _NEON2SSE_ALIGN_16 static const uint8_t mask2med[16] = {4,5, 0xff,0xff,0xff,0xff, 6,7, 0xff, 0xff,0xff,0xff, 8,9, 0xff, 0xff};
11327 _NEON2SSE_ALIGN_16 static const uint8_t mask2hi[16] = {0xff, 0xff, 10,11, 0xff, 0xff, 0xff, 0xff, 12,13, 0xff, 0xff, 0xff, 0xff,14,15};
11328
11329 v0 = _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10
11330 v2 = _mm_unpackhi_epi16(val->val[0], val->val[1]); //12,13, 15,16, 18,19, 21,22,
11331 v1 = _mm_alignr_epi8(v2, v0, 12); //9,10, 12,13, 15,16, 18,19
11332 v.val[0] = _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding
11333 v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding
11334 cff = _mm_cmpeq_epi16(v0, v0); //all ff
11335 bldmask = _mm_cmpeq_epi16(*(__m128i*)mask0, cff);
11336 v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask);
11337 vst1q_u16(ptr, v.val[0]);
11338 v.val[0] = _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding
11339 v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding
11340 bldmask = _mm_cmpeq_epi16(*(__m128i*)mask1, cff);
11341 v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask);
11342 vst1q_u16((ptr + 8), v.val[1]);
11343 v.val[0] = _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding
11344 v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding
11345 bldmask = _mm_cmpeq_epi16(*(__m128i*)mask2, cff);
11346 v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask );
11347 vst1q_u16((ptr + 16), v.val[2]);
11348}
11349#define vst3q_u16(ptr, val) vst3q_u16_ptr(ptr, &val)
11350
11351//void vst3q_u32(__transfersize(12) uint32_t * ptr, uint32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0]
11352_NEON2SSE_INLINE void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t const * val)
11353{
11354 //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,b0,c0,a1, b1,c1,a2,b2, c2,a3,b3,c3
11355 uint32x4x3_t v;
11356 __m128i tmp0, tmp1,tmp2;
11357 tmp0 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //a0,b0,a1,b1
11358 tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //a2,b2,a3,b3
11359 tmp2 = _mm_unpacklo_epi32(val->val[1], val->val[2]); //b0,c0,b1,c1
11360 v.val[1] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp2),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(1,0,3,2))); //b1,c1,a2,b2,
11361 v.val[2] = _mm_unpackhi_epi64(tmp1, val->val[2]); //a3,b3, c2,c3
11362 v.val[2] = _mm_shuffle_epi32(v.val[2], 2 | (0 << 2) | (1 << 4) | (3 << 6)); //c2,a3,b3,c3
11363 tmp1 = _mm_unpacklo_epi32(tmp2,val->val[0]); //b0,a0,c0,a1
11364 v.val[0] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp0),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(3,2,1,0))); //a0,b0,c0,a1,
11365
11366 vst1q_u32(ptr, v.val[0]);
11367 vst1q_u32((ptr + 4), v.val[1]);
11368 vst1q_u32((ptr + 8), v.val[2]);
11369}
11370#define vst3q_u32(ptr, val) vst3q_u32_ptr(ptr, &val)
11371
11372//void vst3q_s8(__transfersize(48) int8_t * ptr, int8x16x3_t val);
11373_NEON2SSE_GLOBAL void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t const * val);
11374#define vst3q_s8(ptr, val) vst3q_u8((uint8_t*)(ptr), val)
11375
11376//void vst3q_s16(__transfersize(24) int16_t * ptr, int16x8x3_t val);
11377_NEON2SSE_GLOBAL void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t const * val);
11378#define vst3q_s16(ptr, val) vst3q_u16((uint16_t*)(ptr), val)
11379
11380//void vst3q_s32(__transfersize(12) int32_t * ptr, int32x4x3_t val);
11381_NEON2SSE_GLOBAL void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t const * val);
11382#define vst3q_s32(ptr, val) vst3q_u32((uint32_t*)(ptr), val)
11383
11384//void vst3q_f16(__transfersize(24) __fp16 * ptr, float16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0]
11385_NEON2SSE_GLOBAL void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t const * val);
11386// IA32 SIMD doesn't work with 16bit floats currently
11387
11388//void vst3q_f32(__transfersize(12) float32_t * ptr, float32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0]
11389_NEON2SSE_INLINE void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t const * val)
11390{
11391 float32x4x3_t v;
11392 __m128 tmp0, tmp1,tmp2;
11393 tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]); //a0,b0,a1,b1
11394 tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]); //a2,b2,a3,b3
11395 tmp2 = _mm_unpacklo_ps(val->val[1], val->val[2]); //b0,c0,b1,c1
11396 v.val[1] = _mm_shuffle_ps(tmp2,tmp1, _MM_SHUFFLE(1,0,3,2)); //b1,c1,a2,b2,
11397 v.val[2] = _mm_movehl_ps(val->val[2],tmp1); //a3,b3, c2,c3
11398 v.val[2] = _mm_shuffle_ps(v.val[2],v.val[2], _MM_SHUFFLE(3,1,0,2)); //c2,a3,b3,c3
11399 tmp1 = _mm_unpacklo_ps(tmp2,val->val[0]); //b0,a0,c0,a1
11400 v.val[0] = _mm_shuffle_ps(tmp0,tmp1, _MM_SHUFFLE(3,2,1,0)); //a0,b0,c0,a1,
11401
11402 vst1q_f32( ptr, v.val[0]);
11403 vst1q_f32( (ptr + 4), v.val[1]);
11404 vst1q_f32( (ptr + 8), v.val[2]);
11405}
11406#define vst3q_f32(ptr, val) vst3q_f32_ptr(ptr, &val)
11407
11408//void vst3q_p8(__transfersize(48) poly8_t * ptr, poly8x16x3_t val);// VST3.8 {d0, d2, d4}, [r0]
11409_NEON2SSE_GLOBAL void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t const * val);
11410#define vst3q_p8 vst3q_u8
11411
11412//void vst3q_p16(__transfersize(24) poly16_t * ptr, poly16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0]
11413_NEON2SSE_GLOBAL void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t const * val);
11414#define vst3q_p16 vst3q_u16
11415
11416_NEON2SSESTORAGE void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val);// VST3.8 {d0, d1, d2}, [r0]
11417_NEON2SSE_INLINE void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val)
11418{
11419 __m128i tmp, sh0, sh1, val0, val2;
11420 _NEON2SSE_ALIGN_16 static const int8_t mask0[16] = { 0, 8, 16, 1, 9, 17, 2, 10, 18, 3, 11, 19, 4, 12, 20, 5};
11421 _NEON2SSE_ALIGN_16 static const int8_t mask1[16] = {13, 21, 6, 14, 22, 7, 15, 23, 0,0,0,0,0,0,0,0};
11422 _NEON2SSE_ALIGN_16 static const uint8_t mask0_sel[16] = {0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0};
11423 _NEON2SSE_ALIGN_16 static const uint8_t mask1_sel[16] = {0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0,0,0,0,0,0,0,0};
11424 tmp = _mm_unpacklo_epi64(_pM128i(val.val[0]), _pM128i(val.val[1]) );
    sh0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask0); //for bi>15 bi is wrapped modulo 16 (bi -= 16)
11426 val2 = _pM128i(val.val[2]);
11427 sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask0);
11428 val0 = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask0_sel);
11429 vst1q_u8(ptr, val0); //store as 128 bit structure
    sh0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask1); //for bi>15 bi is wrapped modulo 16 (bi -= 16)
11431 sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask1);
11432 val2 = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask1_sel);
11433 _M64((*(__m64_128*)(ptr + 16)), val2); //need it to fit into *ptr memory
11434}
11435
11436_NEON2SSESTORAGE void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
11437_NEON2SSE_INLINE void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val)
11438{
11439 __m128i tmp, val0, val1, val2;
11440 _NEON2SSE_ALIGN_16 static const int8_t mask0[16] = {0,1, 8,9, 16,17, 2,3, 10,11, 18,19, 4,5, 12,13};
11441 _NEON2SSE_ALIGN_16 static const int8_t mask1[16] = {20,21, 6,7, 14,15, 22,23, 0,0,0,0,0,0,0,0};
11442 _NEON2SSE_ALIGN_16 static const uint16_t mask0f[8] = {0xffff, 0xffff, 0, 0xffff, 0xffff, 0, 0xffff, 0xffff}; //if all ones we take the result from v.val[0] otherwise from v.val[1]
11443 _NEON2SSE_ALIGN_16 static const uint16_t mask1f[8] = {0xffff, 0, 0, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}; //if all ones we take the result from v.val[1] otherwise from v.val[0]
11444 tmp = _mm_unpacklo_epi64(_pM128i(val.val[0]), _pM128i(val.val[1]));
11445 val0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask0);
11446 val2 = _pM128i(val.val[2]);
11447 val1 = _mm_shuffle_epi8(val2, *(__m128i*)mask0);
11448 val0 = _MM_BLENDV_EPI8(val1, val0, *(__m128i*)mask0f);
11449 vst1q_u16(ptr, val0); //store as 128 bit structure
11450 val0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask1);
11451 val1 = _mm_shuffle_epi8(val2, *(__m128i*)mask1);
11452 val1 = _MM_BLENDV_EPI8(val0, val1, *(__m128i*)mask1f); //change the operands order
11453 _M64((*(__m64_128*)(ptr + 8)), val1); //need it to fit into *ptr memory
11454}
11455
11456_NEON2SSESTORAGE void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val);// VST3.32 {d0, d1, d2}, [r0]
11457_NEON2SSE_INLINE void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val)
11458{
11459 //val.val[0]:0,3,val.val[1]:1,4; val.val[2]:2,5,x,x;
11460 __m128i val0, val1;
11461 val0 = _mm_unpacklo_epi64(_pM128i(val.val[1]), _pM128i(val.val[2])); //val[0]: 1,4,2,5
11462 val0 = _mm_shuffle_epi32(val0, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //1,2,4,5
11463 val1 = _mm_srli_si128(val0, 8); //4,5, x,x
11464 _M64((*(__m64_128*)(ptr + 4)), val1);
11465 val0 = _mm_unpacklo_epi32(_pM128i(val.val[0]), val0); //0,1,3,2
11466 val0 = _mm_shuffle_epi32(val0, 0 | (1 << 2) | (3 << 4) | (2 << 6)); //0,1,2, 3
11467 vst1q_u32(ptr, val0); //store as 128 bit structure
11468}
11469
11470_NEON2SSESTORAGE void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val);// VST1.64 {d0, d1, d2}, [r0]
11471_NEON2SSE_INLINE void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val)
11472{
11473 *(ptr) = val.val[0].m64_u64[0];
11474 *(ptr + 1) = val.val[1].m64_u64[0];
11475 *(ptr + 2) = val.val[2].m64_u64[0];
11476}
11477
11478_NEON2SSE_GLOBAL void vst3_s8(__transfersize(24) int8_t * ptr, int8x8x3_t val); // VST3.8 {d0, d1, d2}, [r0]
11479#define vst3_s8(ptr, val) vst3_u8((uint8_t*)ptr, val)
11480
11481_NEON2SSE_GLOBAL void vst3_s16(__transfersize(12) int16_t * ptr, int16x4x3_t val); // VST3.16 {d0, d1, d2}, [r0]
11482#define vst3_s16(ptr, val) vst3_u16((uint16_t*)ptr, val)
11483
11484_NEON2SSE_GLOBAL void vst3_s32(__transfersize(6) int32_t * ptr, int32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
11485#define vst3_s32(ptr, val) vst3_u32((uint32_t*)ptr, val)
11486
11487_NEON2SSE_GLOBAL void vst3_s64(__transfersize(3) int64_t * ptr, int64x1x3_t val); // VST1.64 {d0, d1, d2}, [r0]
11488#define vst3_s64(ptr, val) vst3_u64((uint64_t*)ptr, val)
11489
11490//void vst3_f16(__transfersize(12) __fp16 * ptr, float16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
11491_NEON2SSE_GLOBAL void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t const * val); // VST3.16 {d0, d1, d2}, [r0]
// IA32 SIMD doesn't work with 16-bit floats currently, so we need to convert to 32-bit floats and then work with two 128-bit registers; see vld1q_f16 for an example
11493
11494_NEON2SSESTORAGE void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val);// VST3.32 {d0, d1, d2}, [r0]
11495_NEON2SSE_INLINE void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val)
11496{
    //val.val[0]:{0,3}, val.val[1]:{1,4}, val.val[2]:{2,5} -> stored sequence 0,1,2,3,4,5
11498 *(ptr) = val.val[0].m64_f32[0];
11499 *(ptr + 1) = val.val[1].m64_f32[0];
11500 *(ptr + 2) = val.val[2].m64_f32[0];
11501 *(ptr + 3) = val.val[0].m64_f32[1];
11502 *(ptr + 4) = val.val[1].m64_f32[1];
11503 *(ptr + 5) = val.val[2].m64_f32[1];
11504}
11505
11506_NEON2SSE_GLOBAL void vst3_p8(__transfersize(24) poly8_t * ptr, poly8x8x3_t val);// VST3.8 {d0, d1, d2}, [r0]
11507#define vst3_p8 vst3_u8
11508
11509_NEON2SSE_GLOBAL void vst3_p16(__transfersize(12) poly16_t * ptr, poly16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
11510#define vst3_p16 vst3_u16
11511
11512//*************** Quadruples store ********************************
11513//*********************************************************************
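//Usage sketch (illustrative only, kept as a comment): interleave four separately loaded 16-byte planes
//(e.g. R,G,B,A) into packed four-channel memory with vst4q_u8. The pointers r_plane, g_plane, b_plane,
//a_plane and dst below are hypothetical caller data, not part of this header.
//    uint8x16x4_t rgba;
//    rgba.val[0] = vld1q_u8(r_plane); //16 R bytes
//    rgba.val[1] = vld1q_u8(g_plane); //16 G bytes
//    rgba.val[2] = vld1q_u8(b_plane); //16 B bytes
//    rgba.val[3] = vld1q_u8(a_plane); //16 A bytes
//    vst4q_u8(dst, rgba); //writes 64 bytes: R0,G0,B0,A0, R1,G1,B1,A1, ...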
11514//void vst4q_u8(__transfersize(64) uint8_t * ptr, uint8x16x4_t val)// VST4.8 {d0, d2, d4, d6}, [r0]
11515_NEON2SSE_INLINE void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t const * val)
11516{
11517 __m128i tmp1, tmp2, res;
11518 tmp1 = _mm_unpacklo_epi8(val->val[0], val->val[1]); // 0,1, 4,5, 8,9, 12,13, 16,17, 20,21, 24,25, 28,29
11519 tmp2 = _mm_unpacklo_epi8(val->val[2], val->val[3]); // 2,3, 6,7, 10,11, 14,15, 18,19, 22,23, 26,27, 30,31
11520 res = _mm_unpacklo_epi16(tmp1, tmp2); //0,1, 2,3, 4,5, 6,7, 8,9, 10,11, 12,13, 14,15
11521 vst1q_u8(ptr, res);
11522 res = _mm_unpackhi_epi16(tmp1, tmp2); //16,17, 18,19, 20,21, 22,23, 24,25, 26,27, 28,29, 30,31
11523 vst1q_u8((ptr + 16), res);
    tmp1 = _mm_unpackhi_epi8(val->val[0], val->val[1]); // 32,33, 36,37, 40,41, 44,45, 48,49, 52,53, 56,57, 60,61
    tmp2 = _mm_unpackhi_epi8(val->val[2], val->val[3]); // 34,35, 38,39, 42,43, 46,47, 50,51, 54,55, 58,59, 62,63
    res = _mm_unpacklo_epi16(tmp1, tmp2); //32,33, 34,35, 36,37, 38,39, 40,41, 42,43, 44,45, 46,47
    vst1q_u8((ptr + 32), res);
    res = _mm_unpackhi_epi16(tmp1, tmp2); //48,49, 50,51, 52,53, 54,55, 56,57, 58,59, 60,61, 62,63
    vst1q_u8((ptr + 48), res);
11530}
11531#define vst4q_u8(ptr, val) vst4q_u8_ptr(ptr, &val)
11532
11533//void vst4q_u16(__transfersize(32) uint16_t * ptr, uint16x8x4_t val)// VST4.16 {d0, d2, d4, d6}, [r0]
11534_NEON2SSE_INLINE void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t const * val)
11535{
11536 uint16x8x4_t v;
11537 __m128i tmp1, tmp2;
11538 tmp1 = _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13
11539 tmp2 = _mm_unpacklo_epi16(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15
11540 v.val[0] = _mm_unpacklo_epi32(tmp1, tmp2);
11541 v.val[1] = _mm_unpackhi_epi32(tmp1, tmp2);
    tmp1 = _mm_unpackhi_epi16(val->val[0], val->val[1]); //16,17, 20,21, 24,25, 28,29
    tmp2 = _mm_unpackhi_epi16(val->val[2], val->val[3]); //18,19, 22,23, 26,27, 30,31
11544 v.val[2] = _mm_unpacklo_epi32(tmp1, tmp2);
11545 v.val[3] = _mm_unpackhi_epi32(tmp1, tmp2);
11546 vst1q_u16(ptr, v.val[0]);
11547 vst1q_u16((ptr + 8), v.val[1]);
11548 vst1q_u16((ptr + 16),v.val[2]);
11549 vst1q_u16((ptr + 24), v.val[3]);
11550}
11551#define vst4q_u16(ptr, val) vst4q_u16_ptr(ptr, &val)
11552
11553//void vst4q_u32(__transfersize(16) uint32_t * ptr, uint32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0]
11554_NEON2SSE_INLINE void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t const * val)
11555{
    uint32x4x4_t v;
11557 __m128i tmp1, tmp2;
    tmp1 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //0,1, 4,5
    tmp2 = _mm_unpacklo_epi32(val->val[2], val->val[3]); //2,3, 6,7
11560 v.val[0] = _mm_unpacklo_epi64(tmp1, tmp2);
11561 v.val[1] = _mm_unpackhi_epi64(tmp1, tmp2);
    tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //8,9, 12,13
    tmp2 = _mm_unpackhi_epi32(val->val[2], val->val[3]); //10,11, 14,15
11564 v.val[2] = _mm_unpacklo_epi64(tmp1, tmp2);
11565 v.val[3] = _mm_unpackhi_epi64(tmp1, tmp2);
11566 vst1q_u32(ptr, v.val[0]);
11567 vst1q_u32((ptr + 4), v.val[1]);
11568 vst1q_u32((ptr + 8), v.val[2]);
11569 vst1q_u32((ptr + 12), v.val[3]);
11570}
11571#define vst4q_u32(ptr, val) vst4q_u32_ptr(ptr, &val)
11572
11573//void vst4q_s8(__transfersize(64) int8_t * ptr, int8x16x4_t val);
11574_NEON2SSE_GLOBAL void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t const * val);
11575#define vst4q_s8(ptr, val) vst4q_u8((uint8_t*)(ptr), val)
11576
11577//void vst4q_s16(__transfersize(32) int16_t * ptr, int16x8x4_t val);
11578_NEON2SSE_GLOBAL void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t const * val);
11579#define vst4q_s16(ptr, val) vst4q_u16((uint16_t*)(ptr), val)
11580
11581//void vst4q_s32(__transfersize(16) int32_t * ptr, int32x4x4_t val);
11582_NEON2SSE_GLOBAL void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t const * val);
11583#define vst4q_s32(ptr, val) vst4q_u32((uint32_t*)(ptr), val)
11584
11585//void vst4q_f16(__transfersize(32) __fp16 * ptr, float16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0]
11586_NEON2SSE_GLOBAL void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t const * val);
// IA32 SIMD doesn't work with 16-bit floats currently
11588
11589//void vst4q_f32(__transfersize(16) float32_t * ptr, float32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0]
11590_NEON2SSE_INLINE void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t const * val)
11591{
11592 __m128 tmp3, tmp2, tmp1, tmp0;
11593 float32x4x4_t v;
11594 tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]);
11595 tmp2 = _mm_unpacklo_ps(val->val[2], val->val[3]);
11596 tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]);
11597 tmp3 = _mm_unpackhi_ps(val->val[2], val->val[3]);
11598 v.val[0] = _mm_movelh_ps(tmp0, tmp2);
11599 v.val[1] = _mm_movehl_ps(tmp2, tmp0);
11600 v.val[2] = _mm_movelh_ps(tmp1, tmp3);
11601 v.val[3] = _mm_movehl_ps(tmp3, tmp1);
11602 vst1q_f32(ptr, v.val[0]);
11603 vst1q_f32((ptr + 4), v.val[1]);
11604 vst1q_f32((ptr + 8), v.val[2]);
11605 vst1q_f32((ptr + 12), v.val[3]);
11606}
11607#define vst4q_f32(ptr, val) vst4q_f32_ptr(ptr, &val)
11608
11609//void vst4q_p8(__transfersize(64) poly8_t * ptr, poly8x16x4_t val);// VST4.8 {d0, d2, d4, d6}, [r0]
11610_NEON2SSE_GLOBAL void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t const * val);
11611#define vst4q_p8 vst4q_u8
11612
11613//void vst4q_p16(__transfersize(32) poly16_t * ptr, poly16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0]
11614_NEON2SSE_GLOBAL void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t const * val);
11615#define vst4q_p16 vst4q_s16
11616
11617_NEON2SSESTORAGE void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val);// VST4.8 {d0, d1, d2, d3}, [r0]
11618_NEON2SSE_INLINE void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val)
11619{
11620 __m128i sh0, sh1, val0, val2;
11621 sh0 = _mm_unpacklo_epi8(_pM128i(val.val[0]),_pM128i(val.val[1])); // a0,b0,a1,b1,a2,b2,a3,b3,a4,b4,a5,b5, a6,b6,a7,b7,
11622 sh1 = _mm_unpacklo_epi8(_pM128i(val.val[2]),_pM128i(val.val[3])); // c0,d0,c1,d1,c2,d2,c3,d3, c4,d4,c5,d5,c6,d6,c7,d7
11623 val0 = _mm_unpacklo_epi16(sh0,sh1); // a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,
11624 val2 = _mm_unpackhi_epi16(sh0,sh1); //a4,b4,c4,d4,a5,b5,c5,d5, a6,b6,c6,d6,a7,b7,c7,d7
11625 vst1q_u8(ptr, val0);
11626 vst1q_u8((ptr + 16), val2);
11627}
11628
11629_NEON2SSESTORAGE void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0]
11630_NEON2SSE_INLINE void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val)
11631{
11632 __m128i sh0, sh1, val0, val2;
11633 sh0 = _mm_unpacklo_epi16(_pM128i(val.val[0]),_pM128i(val.val[1])); //a0,a1,b0,b1,c0,c1,d0,d1,
11634 sh1 = _mm_unpacklo_epi16(_pM128i(val.val[2]),_pM128i(val.val[3])); //a2,a3,b2,b3,c2,c3,d2,d3
11635 val0 = _mm_unpacklo_epi32(sh0,sh1); // a0,a1,a2,a3,b0,b1,b2,b3
11636 val2 = _mm_unpackhi_epi32(sh0,sh1); // c0,c1,c2,c3,d0,d1,d2,d3
11637 vst1q_u16(ptr, val0); //store as 128 bit structure
11638 vst1q_u16((ptr + 8), val2);
11639}
11640
11641_NEON2SSESTORAGE void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val);// VST4.32 {d0, d1, d2, d3}, [r0]
11642_NEON2SSE_INLINE void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val)
11643{
11644 //0,4, 1,5, 2,6, 3,7
11645 __m128i sh0, sh1, val0, val1;
11646 sh0 = _mm_unpacklo_epi32(_pM128i(val.val[0]), _pM128i(val.val[1])); //0,1,4,5
11647 sh1 = _mm_unpacklo_epi32(_pM128i(val.val[2]), _pM128i(val.val[3])); //2,3,6,7
11648 val0 = _mm_unpacklo_epi64(sh0,sh1); //
11649 val1 = _mm_unpackhi_epi64(sh0,sh1); //
11650 vst1q_u32(ptr, val0); //store as 128 bit structure
11651 vst1q_u32((ptr + 4), val1);
11652}
11653
11654_NEON2SSESTORAGE void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val);// VST1.64 {d0, d1, d2, d3}, [r0]
11655_NEON2SSE_INLINE void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val)
11656{
11657 *(ptr) = val.val[0].m64_u64[0];
11658 *(ptr + 1) = val.val[1].m64_u64[0];
11659 *(ptr + 2) = val.val[2].m64_u64[0];
11660 *(ptr + 3) = val.val[3].m64_u64[0];
11661}
11662
11663//void vst4_s8(__transfersize(32) int8_t * ptr, int8x8x4_t val) //VST4.8 {d0, d1, d2, d3}, [r0]
11664#define vst4_s8(ptr, val) vst4_u8((uint8_t*)ptr, val)
11665
11666//void vst4_s16(__transfersize(16) int16_t * ptr, int16x4x4_t val) // VST4.16 {d0, d1, d2, d3}, [r0]
11667#define vst4_s16(ptr, val) vst4_u16((uint16_t*)ptr, val)
11668
11669//void vst4_s32(__transfersize(8) int32_t * ptr, int32x2x4_t val) // VST4.32 {d0, d1, d2, d3}, [r0]
11670#define vst4_s32(ptr, val) vst4_u32((uint32_t*)ptr, val)
11671
11672//void vst4_s64(__transfersize(4) int64_t * ptr, int64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0]
11673_NEON2SSE_GLOBAL void vst4_s64_ptr(__transfersize(4) int64_t * ptr, int64x1x4_t const * val);
11674#define vst4_s64(ptr, val) vst4_u64((uint64_t*)ptr, val)
11675
11676//void vst4_f16(__transfersize(16) __fp16 * ptr, float16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0]
11677_NEON2SSE_GLOBAL void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t const * val);
// IA32 SIMD doesn't work with 16-bit floats currently, so we need to convert to 32-bit floats and then work with two 128-bit registers; see vld1q_f16 for an example
11679
11680_NEON2SSESTORAGE void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val);// VST4.32 {d0, d1, d2, d3}, [r0]
11681_NEON2SSE_INLINE void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val)
11682{
11683 //0,4, 1,5, 2,6, 3,7 -> 0,1, 2,3, 4,5, 6,7
11684 *(ptr) = val.val[0].m64_f32[0];
11685 *(ptr + 1) = val.val[1].m64_f32[0];
11686 *(ptr + 2) = val.val[2].m64_f32[0];
11687 *(ptr + 3) = val.val[3].m64_f32[0];
11688 *(ptr + 4) = val.val[0].m64_f32[1];
11689 *(ptr + 5) = val.val[1].m64_f32[1];
11690 *(ptr + 6) = val.val[2].m64_f32[1];
11691 *(ptr + 7) = val.val[3].m64_f32[1];
11692}
11693
11694_NEON2SSE_GLOBAL void vst4_p8(__transfersize(32) poly8_t * ptr, poly8x8x4_t val);// VST4.8 {d0, d1, d2, d3}, [r0]
11695#define vst4_p8 vst4_u8
11696
11697_NEON2SSE_GLOBAL void vst4_p16(__transfersize(16) poly16_t * ptr, poly16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0]
11698#define vst4_p16 vst4_u16
11699
11700//*********** Store a lane of a vector into memory (extract given lane) for a couple of vectors *********************
11701//********************************************************************************************************************
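//Usage sketch (illustrative only, kept as a comment; src and dst are hypothetical caller pointers):
//store lane 3 of a de-interleaved pair as two adjacent bytes in memory.
//    uint8x8x2_t pair = vld2_u8(src); //de-interleaves 16 bytes into two 8-byte vectors
//    vst2_lane_u8(dst, pair, 3); //dst[0] = lane 3 of pair.val[0], dst[1] = lane 3 of pair.val[1]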
11702//void vst2q_lane_u16(__transfersize(2) uint16_t * ptr, uint16x8x2_t val, __constrange(0,7) int lane)// VST2.16 {d0[0], d2[0]}, [r0]
11703_NEON2SSE_INLINE void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t const * val, __constrange(0,7) int lane)
11704{
11705 vst1q_lane_s16(ptr, val->val[0], lane);
11706 vst1q_lane_s16((ptr + 1), val->val[1], lane);
11707}
11708#define vst2q_lane_u16(ptr, val, lane) vst2q_lane_u16_ptr(ptr, &val, lane)
11709
11710//void vst2q_lane_u32(__transfersize(2) uint32_t * ptr, uint32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0]
11711_NEON2SSE_INLINE void vst2q_lane_u32_ptr(__transfersize(2) uint32_t* ptr, uint32x4x2_t const * val, __constrange(0,3) int lane)
11712{
11713 vst1q_lane_u32(ptr, val->val[0], lane);
11714 vst1q_lane_u32((ptr + 1), val->val[1], lane);
11715}
11716#define vst2q_lane_u32(ptr, val, lane) vst2q_lane_u32_ptr(ptr, &val, lane)
11717
11718//void vst2q_lane_s16(__transfersize(2) int16_t * ptr, int16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
11719_NEON2SSE_GLOBAL void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t const * val, __constrange(0,7) int lane);
11720#define vst2q_lane_s16(ptr, val, lane) vst2q_lane_u16((uint16_t*)ptr, val, lane)
11721
11722//void vst2q_lane_s32(__transfersize(2) int32_t * ptr, int32x4x2_t val, __constrange(0,3) int lane);// VST2.32 {d0[0], d2[0]}, [r0]
11723_NEON2SSE_GLOBAL void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t const * val, __constrange(0,3) int lane);
11724#define vst2q_lane_s32(ptr, val, lane) vst2q_lane_u32((uint32_t*)ptr, val, lane)
11725
11726//void vst2q_lane_f16(__transfersize(2) __fp16 * ptr, float16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
11727_NEON2SSE_GLOBAL void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t const * val, __constrange(0,7) int lane);
11728//current IA SIMD doesn't support float16
11729
11730//void vst2q_lane_f32(__transfersize(2) float32_t * ptr, float32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0]
11731_NEON2SSE_INLINE void vst2q_lane_f32_ptr(__transfersize(2) float32_t* ptr, float32x4x2_t const * val, __constrange(0,3) int lane)
11732{
11733 vst1q_lane_f32(ptr, val->val[0], lane);
11734 vst1q_lane_f32((ptr + 1), val->val[1], lane);
11735}
11736#define vst2q_lane_f32(ptr,src,lane) vst2q_lane_f32_ptr(ptr,&src,lane)
11737
11738//void vst2q_lane_p16(__transfersize(2) poly16_t * ptr, poly16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
11739_NEON2SSE_GLOBAL void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t const * val, __constrange(0,7) int lane);
11740#define vst2q_lane_p16 vst2q_lane_s16
11741
11742_NEON2SSESTORAGE void vst2_lane_u8(__transfersize(2) uint8_t * ptr, uint8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
11743_NEON2SSE_INLINE void vst2_lane_u8(__transfersize(2) uint8_t * ptr, uint8x8x2_t val, __constrange(0,7) int lane) // VST2.8 {d0[0], d1[0]}, [r0]
11744{
11745 *(ptr) = val.val[0].m64_u8[lane];
11746 *(ptr + 1) = val.val[1].m64_u8[lane];
11747}
11748
11749_NEON2SSESTORAGE void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
11750_NEON2SSE_INLINE void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane)
11751{
11752 *(ptr) = val.val[0].m64_u16[lane];
11753 *(ptr + 1) = val.val[1].m64_u16[lane];
11754}
11755
11756_NEON2SSESTORAGE void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0]
11757_NEON2SSE_INLINE void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane)
11758{
11759 *(ptr) = val.val[0].m64_u32[lane];
11760 *(ptr + 1) = val.val[1].m64_u32[lane];
11761}
11762
11763_NEON2SSE_GLOBAL void vst2_lane_s8(__transfersize(2) int8_t * ptr, int8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
11764#define vst2_lane_s8(ptr, val, lane) vst2_lane_u8((uint8_t*)ptr, val, lane)
11765
11766_NEON2SSE_GLOBAL void vst2_lane_s16(__transfersize(2) int16_t * ptr, int16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
11767#define vst2_lane_s16(ptr, val, lane) vst2_lane_u16((uint16_t*)ptr, val, lane)
11768
11769_NEON2SSE_GLOBAL void vst2_lane_s32(__transfersize(2) int32_t * ptr, int32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0]
11770#define vst2_lane_s32(ptr, val, lane) vst2_lane_u32((uint32_t*)ptr, val, lane)
11771
11772//void vst2_lane_f16(__transfersize(2) __fp16 * ptr, float16x4x2_t val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0]
11773//current IA SIMD doesn't support float16
11774
11775_NEON2SSESTORAGE void vst2_lane_f32(__transfersize(2) float32_t * ptr, float32x2x2_t val, __constrange(0,1) int lane); // VST2.32 {d0[0], d1[0]}, [r0]
11776_NEON2SSE_INLINE void vst2_lane_f32(__transfersize(2) float32_t * ptr, float32x2x2_t val, __constrange(0,1) int lane)
11777{
11778 *(ptr) = val.val[0].m64_f32[lane];
11779 *(ptr + 1) = val.val[1].m64_f32[lane];
11780}
11781
11782_NEON2SSE_GLOBAL void vst2_lane_p8(__transfersize(2) poly8_t * ptr, poly8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
11783#define vst2_lane_p8 vst2_lane_u8
11784
11785_NEON2SSE_GLOBAL void vst2_lane_p16(__transfersize(2) poly16_t * ptr, poly16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
11786#define vst2_lane_p16 vst2_lane_u16
11787
11788//************************* Triple lanes stores *******************************************************
11789//*******************************************************************************************************
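//A short usage sketch (illustrative only, kept as a comment; src and dst are hypothetical caller pointers):
//    uint16x4x3_t rgb = vld3_u16(src); //de-interleaves 12 uint16_t values into three 4-element vectors
//    vst3_lane_u16(dst, rgb, 2); //writes rgb.val[0][2], rgb.val[1][2], rgb.val[2][2] to dst[0..2]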
11790//void vst3q_lane_u16(__transfersize(3) uint16_t * ptr, uint16x8x3_t val, __constrange(0,7) int lane)// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
11791_NEON2SSE_INLINE void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t const * val, __constrange(0,7) int lane)
11792{
11793 vst2q_lane_u16_ptr(ptr, (uint16x8x2_t*)val, lane);
11794 vst1q_lane_u16((ptr + 2), val->val[2], lane);
11795}
11796#define vst3q_lane_u16(ptr, val, lane) vst3q_lane_u16_ptr(ptr, &val, lane)
11797
11798//void vst3q_lane_u32(__transfersize(3) uint32_t * ptr, uint32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
11799_NEON2SSE_INLINE void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t const * val, __constrange(0,3) int lane)
11800{
11801 vst2q_lane_u32_ptr(ptr, (uint32x4x2_t*)val, lane);
11802 vst1q_lane_u32((ptr + 2), val->val[2], lane);
11803}
11804#define vst3q_lane_u32(ptr, val, lane) vst3q_lane_u32_ptr(ptr, &val, lane)
11805
11806//void vst3q_lane_s16(__transfersize(3) int16_t * ptr, int16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
11807_NEON2SSE_GLOBAL void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t const * val, __constrange(0,7) int lane);
11808#define vst3q_lane_s16(ptr, val, lane) vst3q_lane_u16((uint16_t *)ptr, val, lane)
11809
11810//void vst3q_lane_s32(__transfersize(3) int32_t * ptr, int32x4x3_t val, __constrange(0,3) int lane);// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
11811_NEON2SSE_GLOBAL void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t const * val, __constrange(0,3) int lane);
11812#define vst3q_lane_s32(ptr, val, lane) vst3q_lane_u32((uint32_t *)ptr, val, lane)
11813
11814//void vst3q_lane_f16(__transfersize(3) __fp16 * ptr, float16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
11815_NEON2SSE_GLOBAL void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t const * val, __constrange(0,7) int lane);
11816//current IA SIMD doesn't support float16
11817
11818//void vst3q_lane_f32(__transfersize(3) float32_t * ptr, float32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
11819_NEON2SSE_INLINE void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t const * val, __constrange(0,3) int lane)
11820{
11821 vst1q_lane_f32(ptr, val->val[0], lane);
11822 vst1q_lane_f32((ptr + 1), val->val[1], lane);
11823 vst1q_lane_f32((ptr + 2), val->val[2], lane);
11824}
11825#define vst3q_lane_f32(ptr,val,lane) vst3q_lane_f32_ptr(ptr,&val,lane)
11826
11827//void vst3q_lane_p16(__transfersize(3) poly16_t * ptr, poly16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
11828_NEON2SSE_GLOBAL void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t const * val, __constrange(0,7) int lane);
11829#define vst3q_lane_p16 vst3q_lane_s16
11830
11831_NEON2SSESTORAGE void vst3_lane_u8(__transfersize(3) uint8_t * ptr, uint8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
11832_NEON2SSE_INLINE void vst3_lane_u8(__transfersize(3) uint8_t * ptr, uint8x8x3_t val, __constrange(0,7) int lane)
11833{
11834 *(ptr) = val.val[0].m64_u8[lane];
11835 *(ptr + 1) = val.val[1].m64_u8[lane];
11836 *(ptr + 2) = val.val[2].m64_u8[lane];
11837}
11838
11839_NEON2SSESTORAGE void vst3_lane_u16(__transfersize(3) uint16_t * ptr, uint16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
11840_NEON2SSE_INLINE void vst3_lane_u16(__transfersize(3) uint16_t * ptr, uint16x4x3_t val, __constrange(0,3) int lane)
11841{
11842 *(ptr) = val.val[0].m64_u16[lane];
11843 *(ptr + 1) = val.val[1].m64_u16[lane];
11844 *(ptr + 2) = val.val[2].m64_u16[lane];
11845}
11846
11847_NEON2SSESTORAGE void vst3_lane_u32(__transfersize(3) uint32_t * ptr, uint32x2x3_t val, __constrange(0,1) int lane);// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
11848_NEON2SSE_INLINE void vst3_lane_u32(__transfersize(3) uint32_t * ptr, uint32x2x3_t val, __constrange(0,1) int lane)
11849{
11850 *(ptr) = val.val[0].m64_u32[lane];
11851 *(ptr + 1) = val.val[1].m64_u32[lane];
11852 *(ptr + 2) = val.val[2].m64_u32[lane];
11853}
11854
11855_NEON2SSE_GLOBAL void vst3_lane_s8(__transfersize(3) int8_t * ptr, int8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
11856#define vst3_lane_s8(ptr, val, lane) vst3_lane_u8((uint8_t *)ptr, val, lane)
11857
11858_NEON2SSE_GLOBAL void vst3_lane_s16(__transfersize(3) int16_t * ptr, int16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
11859#define vst3_lane_s16(ptr, val, lane) vst3_lane_u16((uint16_t *)ptr, val, lane)
11860
11861_NEON2SSE_GLOBAL void vst3_lane_s32(__transfersize(3) int32_t * ptr, int32x2x3_t val, __constrange(0,1) int lane);// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
11862#define vst3_lane_s32(ptr, val, lane) vst3_lane_u32((uint32_t *)ptr, val, lane)
11863
11864//void vst3_lane_f16(__transfersize(3) __fp16 * ptr, float16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
11865_NEON2SSE_GLOBAL void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t const * val, __constrange(0,3) int lane);
11866//current IA SIMD doesn't support float16
11867
11868_NEON2SSESTORAGE void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane);// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
11869_NEON2SSE_INLINE void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane)
11870{
11871 *(ptr) = val.val[0].m64_f32[lane];
11872 *(ptr + 1) = val.val[1].m64_f32[lane];
11873 *(ptr + 2) = val.val[2].m64_f32[lane];
11874}
11875
11876_NEON2SSE_GLOBAL void vst3_lane_p8(__transfersize(3) poly8_t * ptr, poly8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
11877#define vst3_lane_p8 vst3_lane_u8
11878
11879_NEON2SSE_GLOBAL void vst3_lane_p16(__transfersize(3) poly16_t * ptr, poly16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
11880#define vst3_lane_p16 vst3_lane_u16
11881
11882//******************************** Quadruple lanes stores ***********************************************
11883//*******************************************************************************************************
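//A short usage sketch (illustrative only, kept as a comment; src and dst are hypothetical caller pointers):
//    uint8x8x4_t quad = vld4_u8(src); //de-interleaves 32 bytes into four 8-byte vectors
//    vst4_lane_u8(dst, quad, 0); //writes quad.val[0][0], quad.val[1][0], quad.val[2][0], quad.val[3][0]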
11884//void vst4q_lane_u16(__transfersize(4) uint16_t * ptr, uint16x8x4_t val, __constrange(0,7) int lane)// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11885_NEON2SSE_INLINE void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t const * val4, __constrange(0,7) int lane)
11886{
11887 vst2q_lane_u16_ptr(ptr, (uint16x8x2_t*)val4->val, lane);
11888 vst2q_lane_u16_ptr((ptr + 2),((uint16x8x2_t*)val4->val + 1), lane);
11889}
11890#define vst4q_lane_u16(ptr, val, lane) vst4q_lane_u16_ptr(ptr, &val, lane)
11891
11892//void vst4q_lane_u32(__transfersize(4) uint32_t * ptr, uint32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11893_NEON2SSE_INLINE void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t const * val4, __constrange(0,3) int lane)
11894{
11895 vst2q_lane_u32_ptr(ptr, (uint32x4x2_t*)val4->val, lane);
11896 vst2q_lane_u32_ptr((ptr + 2), ((uint32x4x2_t*)val4->val + 1), lane);
11897}
11898#define vst4q_lane_u32(ptr, val, lane) vst4q_lane_u32_ptr(ptr, &val, lane)
11899
11900//void vst4q_lane_s16(__transfersize(4) int16_t * ptr, int16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11901_NEON2SSE_GLOBAL void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t const * val, __constrange(0,7) int lane);
11902#define vst4q_lane_s16(ptr,val,lane) vst4q_lane_u16((uint16_t *)ptr,val,lane)
11903
11904//void vst4q_lane_s32(__transfersize(4) int32_t * ptr, int32x4x4_t val, __constrange(0,3) int lane);// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11905_NEON2SSE_GLOBAL void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t const * val, __constrange(0,3) int lane);
11906#define vst4q_lane_s32(ptr,val,lane) vst4q_lane_u32((uint32_t *)ptr,val,lane)
11907
11908//void vst4q_lane_f16(__transfersize(4) __fp16 * ptr, float16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11909_NEON2SSE_GLOBAL void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t const * val, __constrange(0,7) int lane);
11910//current IA SIMD doesn't support float16
11911
11912//void vst4q_lane_f32(__transfersize(4) float32_t * ptr, float32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11913_NEON2SSE_INLINE void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t const * val, __constrange(0,3) int lane)
11914{
11915 vst1q_lane_f32(ptr, val->val[0], lane);
11916 vst1q_lane_f32((ptr + 1), val->val[1], lane);
11917 vst1q_lane_f32((ptr + 2), val->val[2], lane);
11918 vst1q_lane_f32((ptr + 3), val->val[3], lane);
11919}
11920#define vst4q_lane_f32(ptr,val,lane) vst4q_lane_f32_ptr(ptr,&val,lane)
11921
11922//void vst4q_lane_p16(__transfersize(4) poly16_t * ptr, poly16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11923_NEON2SSE_GLOBAL void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t const * val, __constrange(0,7) int lane);
11924#define vst4q_lane_p16 vst4q_lane_u16
11925
11926_NEON2SSESTORAGE void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane);// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11927_NEON2SSE_INLINE void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane)
11928{
11929 *(ptr) = val.val[0].m64_u8[lane];
11930 *(ptr + 1) = val.val[1].m64_u8[lane];
11931 *(ptr + 2) = val.val[2].m64_u8[lane];
11932 *(ptr + 3) = val.val[3].m64_u8[lane];
11933}
11934
11935_NEON2SSESTORAGE void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11936_NEON2SSE_INLINE void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane)
11937{
11938 *(ptr) = val.val[0].m64_u16[lane];
11939 *(ptr + 1) = val.val[1].m64_u16[lane];
11940 *(ptr + 2) = val.val[2].m64_u16[lane];
11941 *(ptr + 3) = val.val[3].m64_u16[lane];
11942}
11943
11944_NEON2SSESTORAGE void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane);// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11945_NEON2SSE_INLINE void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane)
11946{
11947 *(ptr) = val.val[0].m64_u32[lane];
11948 *(ptr + 1) = val.val[1].m64_u32[lane];
11949 *(ptr + 2) = val.val[2].m64_u32[lane];
11950 *(ptr + 3) = val.val[3].m64_u32[lane];
11951}
11952
11953_NEON2SSE_GLOBAL void vst4_lane_s8(__transfersize(4) int8_t * ptr, int8x8x4_t val, __constrange(0,7) int lane);// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11954#define vst4_lane_s8(ptr, val, lane) vst4_lane_u8((uint8_t*)ptr, val, lane)
11955
11956_NEON2SSE_GLOBAL void vst4_lane_s16(__transfersize(4) int16_t * ptr, int16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11957#define vst4_lane_s16(ptr, val, lane) vst4_lane_u16((uint16_t*)ptr, val, lane)
11958
11959_NEON2SSE_GLOBAL void vst4_lane_s32(__transfersize(4) int32_t * ptr, int32x2x4_t val, __constrange(0,1) int lane);// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11960#define vst4_lane_s32(ptr, val, lane) vst4_lane_u32((uint32_t*)ptr, val, lane)
11961
11962//void vst4_lane_f16(__transfersize(4) __fp16 * ptr, float16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11963_NEON2SSE_GLOBAL void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t const * val, __constrange(0,3) int lane);
11964//current IA SIMD doesn't support float16
11965
11966_NEON2SSESTORAGE void vst4_lane_f32(__transfersize(4) float32_t * ptr, float32x2x4_t val, __constrange(0,1) int lane); // VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11967_NEON2SSE_INLINE void vst4_lane_f32(__transfersize(4) float32_t * ptr, float32x2x4_t val, __constrange(0,1) int lane)
11968{
11969 *(ptr) = val.val[0].m64_f32[lane];
11970 *(ptr + 1) = val.val[1].m64_f32[lane];
11971 *(ptr + 2) = val.val[2].m64_f32[lane];
11972 *(ptr + 3) = val.val[3].m64_f32[lane];
11973}
11974
11975_NEON2SSE_GLOBAL void vst4_lane_p8(__transfersize(4) poly8_t * ptr, poly8x8x4_t val, __constrange(0,7) int lane);// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11976#define vst4_lane_p8 vst4_lane_u8
11977
11978_NEON2SSE_GLOBAL void vst4_lane_p16(__transfersize(4) poly16_t * ptr, poly16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11979#define vst4_lane_p16 vst4_lane_u16
11980
11981//**************************************************************************************************
11982//************************ Extract lanes from a vector ********************************************
11983//**************************************************************************************************
11984//These intrinsics extract a single lane (element) from a vector.
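//A short usage sketch (illustrative only, kept as a comment):
//    int16x8_t v = vdupq_n_s16(7); //all eight lanes hold 7
//    int16_t x = vgetq_lane_s16(v, 5); //x == 7; the lane index must be a compile-time constant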
11985_NEON2SSE_GLOBAL uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
11986#define vget_lane_u8(vec, lane) vec.m64_u8[lane]
11987
11988_NEON2SSE_GLOBAL uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.s16 r0, d0[0]
11989#define vget_lane_u16(vec, lane) vec.m64_u16[lane]
11990
11991
11992_NEON2SSE_GLOBAL uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
11993#define vget_lane_u32(vec, lane) vec.m64_u32[lane]
11994
11995_NEON2SSE_GLOBAL int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0]
11996#define vget_lane_s8(vec, lane) vec.m64_i8[lane]
11997
11998_NEON2SSE_GLOBAL int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0]
11999#define vget_lane_s16(vec, lane) vec.m64_i16[lane]
12000
12001_NEON2SSE_GLOBAL int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
12002#define vget_lane_s32(vec, lane) vec.m64_i32[lane]
12003
12004_NEON2SSE_GLOBAL poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
12005#define vget_lane_p8 vget_lane_u8
12006
12007_NEON2SSE_GLOBAL poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.s16 r0, d0[0]
12008#define vget_lane_p16 vget_lane_u16
12009
12010_NEON2SSE_GLOBAL float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
12011#define vget_lane_f32(vec, lane) vec.m64_f32[lane]
12012
12013_NEON2SSE_GLOBAL uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
12014#define vgetq_lane_u8 (uint8_t) _MM_EXTRACT_EPI8
12015
12016_NEON2SSE_GLOBAL uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0]
12017#define vgetq_lane_u16 (uint16_t) _MM_EXTRACT_EPI16
12018
12019_NEON2SSE_GLOBAL uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
12020#define vgetq_lane_u32 (uint32_t) _MM_EXTRACT_EPI32
12021
12022_NEON2SSE_GLOBAL int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0]
12023#define vgetq_lane_s8 _MM_EXTRACT_EPI8
12024
12025_NEON2SSE_GLOBAL int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0]
12026#define vgetq_lane_s16 _MM_EXTRACT_EPI16
12027
12028_NEON2SSE_GLOBAL int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
12029#define vgetq_lane_s32 _MM_EXTRACT_EPI32
12030
12031_NEON2SSE_GLOBAL poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
12032#define vgetq_lane_p8 vgetq_lane_u8
12033
12034_NEON2SSE_GLOBAL poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0]
12035#define vgetq_lane_p16 vgetq_lane_u16
12036
12037_NEON2SSESTORAGE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
12038_NEON2SSE_INLINE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane)
12039{
12040 float32_t floatVal;
12041 char * const floatVal_c = (char*)&floatVal;
12042 *((int32_t*)floatVal_c) = _MM_EXTRACT_PS(vec,lane);
12043 return floatVal;
12044}
12045
12046_NEON2SSE_GLOBAL int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
12047#define vget_lane_s64(vec, lane) vec.m64_i64[0]
12048
12049_NEON2SSE_GLOBAL uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
12050#define vget_lane_u64(vec, lane) vec.m64_u64[0]
12051
12052
12053_NEON2SSE_GLOBAL int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
12054#define vgetq_lane_s64 _MM_EXTRACT_EPI64
12055
12056_NEON2SSE_GLOBAL uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
12057#define vgetq_lane_u64 (uint64_t) _MM_EXTRACT_EPI64
12058
12059// ***************** Set lanes within a vector ********************************************
12060// **************************************************************************************
12061//These intrinsics set a single lane (element) within a vector.
//these do the same as the corresponding vld1_lane_xx functions, but take the value to be set directly.
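//A short usage sketch (illustrative only, kept as a comment):
//    float32x2_t v = vdup_n_f32(0.0f); //{0.0f, 0.0f}
//    v = vset_lane_f32(3.5f, v, 1); //now {0.0f, 3.5f}; the lane index must be a compile-time constant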
12063
12064_NEON2SSESTORAGE uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
12065_NEON2SSE_INLINE uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane)
12066{
12067 uint8_t val;
12068 val = value;
12069 return vld1_lane_u8(&val, vec, lane);
12070}
12071
12072_NEON2SSESTORAGE uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
12073_NEON2SSE_INLINE uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane)
12074{
12075 uint16_t val;
12076 val = value;
12077 return vld1_lane_u16(&val, vec, lane);
12078}
12079
12080_NEON2SSESTORAGE uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
12081_NEON2SSE_INLINE uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane)
12082{
12083 uint32_t val;
12084 val = value;
12085 return vld1_lane_u32(&val, vec, lane);
12086}
12087
12088_NEON2SSESTORAGE int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
12089_NEON2SSE_INLINE int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane)
12090{
12091 int8_t val;
12092 val = value;
12093 return vld1_lane_s8(&val, vec, lane);
12094}
12095
12096_NEON2SSESTORAGE int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
12097_NEON2SSE_INLINE int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane)
12098{
12099 int16_t val;
12100 val = value;
12101 return vld1_lane_s16(&val, vec, lane);
12102}
12103
12104_NEON2SSESTORAGE int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
12105_NEON2SSE_INLINE int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane)
12106{
12107 int32_t val;
12108 val = value;
12109 return vld1_lane_s32(&val, vec, lane);
12110}
12111
12112_NEON2SSE_GLOBAL poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
12113#define vset_lane_p8 vset_lane_u8
12114
12115_NEON2SSE_GLOBAL poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
12116#define vset_lane_p16 vset_lane_u16
12117
12118_NEON2SSESTORAGE float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
12119_NEON2SSE_INLINE float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane)
12120{
12121 float32_t val;
12122 val = value;
12123 return vld1_lane_f32(&val, vec, lane);
12124}
12125
12126_NEON2SSESTORAGE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
12127_NEON2SSE_INLINE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane)
12128{
12129 uint8_t val;
12130 val = value;
12131 return vld1q_lane_u8(&val, vec, lane);
12132}
12133
12134_NEON2SSESTORAGE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
12135_NEON2SSE_INLINE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane)
12136{
12137 uint16_t val;
12138 val = value;
12139 return vld1q_lane_u16(&val, vec, lane);
12140}
12141
12142_NEON2SSESTORAGE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
12143_NEON2SSE_INLINE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane)
12144{
12145 uint32_t val;
12146 val = value;
12147 return vld1q_lane_u32(&val, vec, lane);
12148}
12149
12150_NEON2SSESTORAGE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
12151_NEON2SSE_INLINE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane)
12152{
12153 int8_t val;
12154 val = value;
12155 return vld1q_lane_s8(&val, vec, lane);
12156}
12157
12158_NEON2SSESTORAGE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
12159_NEON2SSE_INLINE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane)
12160{
12161 int16_t val;
12162 val = value;
12163 return vld1q_lane_s16(&val, vec, lane);
12164}
12165
12166_NEON2SSESTORAGE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
12167_NEON2SSE_INLINE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane)
12168{
12169 int32_t val;
12170 val = value;
12171 return vld1q_lane_s32(&val, vec, lane);
12172}
12173
12174_NEON2SSE_GLOBAL poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
12175#define vsetq_lane_p8 vsetq_lane_u8
12176
12177_NEON2SSE_GLOBAL poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
12178#define vsetq_lane_p16 vsetq_lane_u16
12179
12180_NEON2SSESTORAGE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
12181_NEON2SSE_INLINE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane)
12182{
12183 float32_t val;
12184 val = value;
12185 return vld1q_lane_f32(&val, vec, lane);
12186}
12187
12188_NEON2SSESTORAGE int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
12189_NEON2SSE_INLINE int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane)
12190{
12191 int64_t val;
12192 val = value;
12193 return vld1_lane_s64(&val, vec, lane);
12194}
12195
12196_NEON2SSESTORAGE uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
12197_NEON2SSE_INLINE uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane)
12198{
12199 uint64_t val;
12200 val = value;
12201 return vld1_lane_u64(&val, vec, lane);
12202}
12203
12204_NEON2SSESTORAGE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
12205_NEON2SSE_INLINE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane)
12206{
    int64_t val;
12208 val = value;
12209 return vld1q_lane_s64(&val, vec, lane);
12210}
12211
12212_NEON2SSE_GLOBAL uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
12213#define vsetq_lane_u64 vsetq_lane_s64
12214
12215// *******************************************************************************
12216// **************** Initialize a vector from bit pattern ***************************
12217// *******************************************************************************
12218//These intrinsics create a vector from a literal bit pattern.
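//Usage sketch (illustrative only, kept as a comment): the least significant byte of the 64-bit pattern
//becomes lane 0, as on little-endian ARM.
//    uint8x8_t v = vcreate_u8(0x0807060504030201ULL); //lane 0 == 0x01, ..., lane 7 == 0x08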
12219_NEON2SSESTORAGE int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0
12220_NEON2SSE_INLINE int8x8_t vcreate_s8(uint64_t a)
12221{
    return (*(__m64_128*)&(a)); //a macro can't be used here because 'a' may be an immediate constant, whose address can't be taken
12223}
12224
12225_NEON2SSE_GLOBAL int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0
12226#define vcreate_s16 vcreate_s8
12227
12228_NEON2SSE_GLOBAL int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0
12229#define vcreate_s32 vcreate_s8
12230
12231_NEON2SSE_GLOBAL float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0
//no IA32 SIMD available
12233
12234_NEON2SSESTORAGE float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0
12235_NEON2SSE_INLINE float32x2_t vcreate_f32(uint64_t a)
12236{
    return (*(__m64_128*)&(a)); //a macro can't be used here because 'a' may be an immediate constant, whose address can't be taken
12238}
12239
12240_NEON2SSE_GLOBAL uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0
12241#define vcreate_u8 vcreate_s8
12242
12243_NEON2SSE_GLOBAL uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0
12244#define vcreate_u16 vcreate_s16
12245
12246_NEON2SSE_GLOBAL uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0
12247#define vcreate_u32 vcreate_s32
12248
12249_NEON2SSE_GLOBAL uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0
12250#define vcreate_u64 vcreate_s8
12251
12252
12253_NEON2SSE_GLOBAL poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0
12254#define vcreate_p8 vcreate_u8
12255
12256_NEON2SSE_GLOBAL poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0
12257#define vcreate_p16 vcreate_u16
12258
12259_NEON2SSE_GLOBAL int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0
12260#define vcreate_s64 vcreate_u64
12261
12262//********************* Set all lanes to same value ********************************
12263//*********************************************************************************
12264//These intrinsics set all lanes to the same value.
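//A short usage sketch (illustrative only, kept as a comment):
//    int32x4_t ones = vdupq_n_s32(1); //{1,1,1,1}, maps directly to _mm_set1_epi32
//    uint16x4_t sevens = vdup_n_u16(7); //{7,7,7,7} in an emulated 64-bit register (implemented serially)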
12265_NEON2SSESTORAGE uint8x8_t vdup_n_u8(uint8_t value); // VDUP.8 d0,r0
12266_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vdup_n_u8(uint8_t value), _NEON2SSE_REASON_SLOW_SERIAL)
12267{
12268 uint8x8_t res;
12269 int i;
12270 for (i = 0; i<8; i++) {
12271 res.m64_u8[i] = value;
12272 }
12273 return res;
12274}
12275
12276_NEON2SSESTORAGE uint16x4_t vdup_n_u16(uint16_t value); // VDUP.16 d0,r0
12277_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vdup_n_u16(uint16_t value), _NEON2SSE_REASON_SLOW_SERIAL)
12278{
12279 uint16x4_t res;
12280 int i;
12281 for (i = 0; i<4; i++) {
12282 res.m64_u16[i] = value;
12283 }
12284 return res;
12285}
12286
12287_NEON2SSESTORAGE uint32x2_t vdup_n_u32(uint32_t value); // VDUP.32 d0,r0
12288_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vdup_n_u32(uint32_t value), _NEON2SSE_REASON_SLOW_SERIAL)
12289{
12290 uint32x2_t res;
12291 res.m64_u32[0] = value;
12292 res.m64_u32[1] = value;
12293 return res;
12294}
12295
12296_NEON2SSESTORAGE int8x8_t vdup_n_s8(int8_t value); // VDUP.8 d0,r0
12297_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vdup_n_s8(int8_t value), _NEON2SSE_REASON_SLOW_SERIAL)
12298{
12299 int8x8_t res;
12300 int i;
12301 for (i = 0; i<8; i++) {
12302 res.m64_i8[i] = value;
12303 }
12304 return res;
12305}
12306
12307_NEON2SSESTORAGE int16x4_t vdup_n_s16(int16_t value); // VDUP.16 d0,r0
12308_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vdup_n_s16(int16_t value), _NEON2SSE_REASON_SLOW_SERIAL)
12309{
12310 int16x4_t res;
12311 int i;
12312 for (i = 0; i<4; i++) {
12313 res.m64_i16[i] = value;
12314 }
12315 return res;
12316}
12317
12318_NEON2SSESTORAGE int32x2_t vdup_n_s32(int32_t value); // VDUP.32 d0,r0
12319_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vdup_n_s32(int32_t value), _NEON2SSE_REASON_SLOW_SERIAL)
12320{
12321 int32x2_t res;
12322 res.m64_i32[0] = value;
12323 res.m64_i32[1] = value;
12324 return res;
12325}
12326
12327_NEON2SSE_GLOBAL poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0
12328#define vdup_n_p8 vdup_n_u8
12329
12330_NEON2SSE_GLOBAL poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0
12331#define vdup_n_p16 vdup_n_s16
12332
12333_NEON2SSESTORAGE float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0
12334_NEON2SSE_INLINE float32x2_t vdup_n_f32(float32_t value)
12335{
12336 float32x2_t res;
12337 res.m64_f32[0] = value;
12338 res.m64_f32[1] = value;
12339 return res;
12340}
12341
12342_NEON2SSE_GLOBAL uint8x16_t vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0
12343#define vdupq_n_u8(value) _mm_set1_epi8((int8_t) (value))
12344
12345_NEON2SSE_GLOBAL uint16x8_t vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0
12346#define vdupq_n_u16(value) _mm_set1_epi16((int16_t) (value))
12347
12348_NEON2SSE_GLOBAL uint32x4_t vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0
12349#define vdupq_n_u32(value) _mm_set1_epi32((int32_t) (value))
12350
12351_NEON2SSE_GLOBAL int8x16_t vdupq_n_s8(int8_t value); // VDUP.8 q0,r0
12352#define vdupq_n_s8 _mm_set1_epi8
12353
12354_NEON2SSE_GLOBAL int16x8_t vdupq_n_s16(int16_t value); // VDUP.16 q0,r0
12355#define vdupq_n_s16 _mm_set1_epi16
12356
12357_NEON2SSE_GLOBAL int32x4_t vdupq_n_s32(int32_t value); // VDUP.32 q0,r0
12358#define vdupq_n_s32 _mm_set1_epi32
12359
12360_NEON2SSE_GLOBAL poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0
12361#define vdupq_n_p8 vdupq_n_u8
12362
12363_NEON2SSE_GLOBAL poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0
12364#define vdupq_n_p16 vdupq_n_u16
12365
12366_NEON2SSE_GLOBAL float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0
12367#define vdupq_n_f32 _mm_set1_ps
12368
12369_NEON2SSESTORAGE int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0
12370_NEON2SSE_INLINE int64x1_t vdup_n_s64(int64_t value)
12371{
12372 int64x1_t res;
12373 res.m64_i64[0] = value;
12374 return res;
12375}
12376
12377_NEON2SSESTORAGE uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0
12378_NEON2SSE_INLINE uint64x1_t vdup_n_u64(uint64_t value)
12379{
12380 uint64x1_t res;
12381 res.m64_u64[0] = value;
12382 return res;
12383}
12384
12385_NEON2SSESTORAGE int64x2_t vdupq_n_s64(int64_t value); // VMOV d0,r0,r0
12386_NEON2SSE_INLINE int64x2_t vdupq_n_s64(int64_t value)
12387{
12388 _NEON2SSE_ALIGN_16 int64_t value2[2];
12389
12390 value2[0] = value;
12391 value2[1] = value;
12392
12393 return LOAD_SI128(value2);
12394}
12395
12396_NEON2SSESTORAGE uint64x2_t vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0
12397_NEON2SSE_INLINE uint64x2_t vdupq_n_u64(uint64_t value)
12398{
12399 _NEON2SSE_ALIGN_16 uint64_t val[2];
12400
12401 val[0] = value;
12402 val[1] = value;
12403
12404 return LOAD_SI128(val);
12405}
12406
12407//**** Set all lanes to same value ************************
//Same functions as above - just aliases.********************
//They probably reflect the fact that the 128-bit versions use the VMOV instruction **********
12410_NEON2SSE_GLOBAL uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0
12411#define vmov_n_u8 vdup_n_s8
12412
12413_NEON2SSE_GLOBAL uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0
12414#define vmov_n_u16 vdup_n_s16
12415
12416_NEON2SSE_GLOBAL uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0
12417#define vmov_n_u32 vdup_n_u32
12418
12419_NEON2SSE_GLOBAL int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0
12420#define vmov_n_s8 vdup_n_s8
12421
12422_NEON2SSE_GLOBAL int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0
12423#define vmov_n_s16 vdup_n_s16
12424
12425_NEON2SSE_GLOBAL int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0
12426#define vmov_n_s32 vdup_n_s32
12427
12428_NEON2SSE_GLOBAL poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0
12429#define vmov_n_p8 vdup_n_u8
12430
12431_NEON2SSE_GLOBAL poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0
12432#define vmov_n_p16 vdup_n_s16
12433
12434_NEON2SSE_GLOBAL float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0
12435#define vmov_n_f32 vdup_n_f32
12436
12437_NEON2SSE_GLOBAL uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0
12438#define vmovq_n_u8 vdupq_n_u8
12439
12440_NEON2SSE_GLOBAL uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0
12441#define vmovq_n_u16 vdupq_n_s16
12442
12443_NEON2SSE_GLOBAL uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0
12444#define vmovq_n_u32 vdupq_n_u32
12445
12446_NEON2SSE_GLOBAL int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0
12447#define vmovq_n_s8 vdupq_n_s8
12448
12449_NEON2SSE_GLOBAL int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0
12450#define vmovq_n_s16 vdupq_n_s16
12451
12452_NEON2SSE_GLOBAL int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0
12453#define vmovq_n_s32 vdupq_n_s32
12454
12455_NEON2SSE_GLOBAL poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0
12456#define vmovq_n_p8 vdupq_n_u8
12457
12458_NEON2SSE_GLOBAL poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0
12459#define vmovq_n_p16 vdupq_n_s16
12460
12461_NEON2SSE_GLOBAL float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0
12462#define vmovq_n_f32 vdupq_n_f32
12463
12464_NEON2SSE_GLOBAL int64x1_t vmov_n_s64(int64_t value); // VMOV d0,r0,r0
12465#define vmov_n_s64 vdup_n_s64
12466
12467_NEON2SSE_GLOBAL uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0
12468#define vmov_n_u64 vdup_n_u64
12469
12470_NEON2SSE_GLOBAL int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0
12471#define vmovq_n_s64 vdupq_n_s64
12472
12473_NEON2SSE_GLOBAL uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0
12474#define vmovq_n_u64 vdupq_n_u64
12475
12476//**************Set all lanes to the value of one lane of a vector *************
12477//****************************************************************************
//here a shuffle is a better solution than lane extraction followed by a set1 call
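//A short usage sketch (illustrative only, kept as a comment):
//    float32x2_t ab = vset_lane_f32(2.0f, vdup_n_f32(1.0f), 1); //{1.0f, 2.0f}
//    float32x4_t bbbb = vdupq_lane_f32(ab, 1); //{2.0f, 2.0f, 2.0f, 2.0f}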
12479_NEON2SSESTORAGE uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
12480_NEON2SSE_INLINE uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane)
12481{
12482 uint8x8_t res;
12483 uint8_t valane;
12484 int i = 0;
12485 valane = vec.m64_u8[lane];
12486 for (i = 0; i<8; i++) {
12487 res.m64_u8[i] = valane;
12488 }
12489 return res;
12490}
12491
12492_NEON2SSESTORAGE uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
12493_NEON2SSE_INLINE uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane)
12494{
12495 uint16x4_t res;
12496 uint16_t valane;
12497 valane = vec.m64_u16[lane];
12498 res.m64_u16[0] = valane;
12499 res.m64_u16[1] = valane;
12500 res.m64_u16[2] = valane;
12501 res.m64_u16[3] = valane;
12502 return res;
12503}
12504
12505_NEON2SSESTORAGE uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
12506_NEON2SSE_INLINE uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane)
12507{
12508 uint32x2_t res;
12509 res.m64_u32[0] = vec.m64_u32[lane];
12510 res.m64_u32[1] = res.m64_u32[0];
12511 return res;
12512}
12513
12514_NEON2SSE_GLOBAL int8x8_t vdup_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
12515#define vdup_lane_s8 vdup_lane_u8
12516
12517_NEON2SSE_GLOBAL int16x4_t vdup_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
12518#define vdup_lane_s16 vdup_lane_u16
12519
12520_NEON2SSE_GLOBAL int32x2_t vdup_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
12521#define vdup_lane_s32 vdup_lane_u32
12522
12523_NEON2SSE_GLOBAL poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
12524#define vdup_lane_p8 vdup_lane_u8
12525
12526_NEON2SSE_GLOBAL poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
12527#define vdup_lane_p16 vdup_lane_s16
12528
12529_NEON2SSESTORAGE float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
12530_NEON2SSE_INLINE float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane)
12531{
12532 float32x2_t res;
12533 res.m64_f32[0] = vec.m64_f32[lane];
12534 res.m64_f32[1] = res.m64_f32[0];
12535 return res;
12536}
12537
12538_NEON2SSESTORAGE uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
12539_NEON2SSE_INLINE uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane) // VDUP.8 q0,d0[0]
12540{
12541 const int8_t lane8 = (int8_t) lane;
12542 _NEON2SSE_ALIGN_16 int8_t lanemask8[16];
12543
12544 lanemask8[0] = lane8;
12545 lanemask8[1] = lane8;
12546 lanemask8[2] = lane8;
12547 lanemask8[3] = lane8;
12548 lanemask8[4] = lane8;
12549 lanemask8[5] = lane8;
12550 lanemask8[6] = lane8;
12551 lanemask8[7] = lane8;
12552 lanemask8[8] = lane8;
12553 lanemask8[9] = lane8;
12554 lanemask8[10] = lane8;
12555 lanemask8[11] = lane8;
12556 lanemask8[12] = lane8;
12557 lanemask8[13] = lane8;
12558 lanemask8[14] = lane8;
12559 lanemask8[15] = lane8;
12560
12561 return _mm_shuffle_epi8 (_pM128i(vec), *(__m128i*) lanemask8);
12562}
12563
12564_NEON2SSESTORAGE uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
12565_NEON2SSE_INLINE uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane) // VDUP.16 q0,d0[0]
12566{
    //we can reuse the 8-bit shuffle for 16-bit data as well
12568 const int8_t lane16 = ((int8_t) lane) << 1;
12569 const int8_t lane16_1 = lane16 + 1;
12570 _NEON2SSE_ALIGN_16 int8_t lanemask_e16[16];
12571
12572 lanemask_e16[0] = lane16;
12573 lanemask_e16[1] = lane16_1;
12574 lanemask_e16[2] = lane16;
12575 lanemask_e16[3] = lane16_1;
12576 lanemask_e16[4] = lane16;
12577 lanemask_e16[5] = lane16_1;
12578 lanemask_e16[6] = lane16;
12579 lanemask_e16[7] = lane16_1;
12580 lanemask_e16[8] = lane16;
12581 lanemask_e16[9] = lane16_1;
12582 lanemask_e16[10] = lane16;
12583 lanemask_e16[11] = lane16_1;
12584 lanemask_e16[12] = lane16;
12585 lanemask_e16[13] = lane16_1;
12586 lanemask_e16[14] = lane16;
12587 lanemask_e16[15] = lane16_1;
12588
12589 return _mm_shuffle_epi8 (_pM128i(vec), *(__m128i*)lanemask_e16);
12590}
12591
12592_NEON2SSESTORAGE uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
12593_NEON2SSE_INLINE uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane)
12594{
    //a function (not a macro) is needed here to stay gcc-friendly and to meet the immediate-constant requirement of _mm_shuffle_epi32
12596 if (lane == 1)
12597 return _mm_shuffle_epi32 (_pM128i(vec), (1 | (1 << 2) | (1 << 4) | (1 << 6)) );
12598 else
12599 return _mm_shuffle_epi32 (_pM128i(vec), 0);
12600}
12601
12602_NEON2SSE_GLOBAL int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
12603#define vdupq_lane_s8 vdupq_lane_u8
12604
12605_NEON2SSE_GLOBAL int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
12606#define vdupq_lane_s16 vdupq_lane_u16
12607
12608_NEON2SSE_GLOBAL int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
12609#define vdupq_lane_s32 vdupq_lane_u32
12610
12611_NEON2SSE_GLOBAL poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
12612#define vdupq_lane_p8 vdupq_lane_u8
12613
12614_NEON2SSE_GLOBAL poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
12615#define vdupq_lane_p16 vdupq_lane_s16
12616
12617_NEON2SSE_GLOBAL float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
12618#define vdupq_lane_f32(vec, lane) _mm_load1_ps((vec.m64_f32 + lane))
12619
12620_NEON2SSE_GLOBAL int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
12621#define vdup_lane_s64(vec,lane) vec
12622
12623_NEON2SSE_GLOBAL uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
12624#define vdup_lane_u64(vec,lane) vec
12625
12626_NEON2SSESTORAGE int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
12627_NEON2SSE_INLINE int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane)
12628{
12629 __m128i vec128;
12630 UNREFERENCED_PARAMETER(lane);
12631 vec128 = _pM128i(vec);
12632 return _mm_unpacklo_epi64(vec128,vec128);
12633}
12634
12635_NEON2SSE_GLOBAL uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
12636#define vdupq_lane_u64 vdupq_lane_s64
12637
12638// ********************************************************************
12639// ******************** Combining vectors *****************************
12640// ********************************************************************
12641//These intrinsics join two 64 bit vectors into a single 128bit vector.
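//The following usage sketch is illustrative only and not part of the original header (variable names are assumed):
//    int8x8_t  low8, high8;                    //two 64-bit halves obtained elsewhere
//    int8x16_t q = vcombine_s8(low8, high8);   //q[0..7] = low8[0..7], q[8..15] = high8[0..7]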
12642_NEON2SSESTORAGE int8x16_t vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0
12643_NEON2SSE_INLINE int8x16_t vcombine_s8(int8x8_t low, int8x8_t high)
12644{
12645 return _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) );
12646}
12647
12648_NEON2SSE_GLOBAL int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0
12649#define vcombine_s16 vcombine_s8
12650
12651_NEON2SSE_GLOBAL int32x4_t vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0
12652#define vcombine_s32 vcombine_s8
12653
12654_NEON2SSE_GLOBAL int64x2_t vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0
12655#define vcombine_s64 vcombine_s8
12656
12657_NEON2SSE_GLOBAL float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0
12658//current IA SIMD doesn't support float16
12659
12660_NEON2SSESTORAGE float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
12661_NEON2SSE_INLINE float32x4_t vcombine_f32(float32x2_t low, float32x2_t high)
12662{
12663 __m128i res;
12664 res = _mm_unpacklo_epi64(_pM128i(low), _pM128i(high) );
12665 return _M128(res);
12666}
12667
12668_NEON2SSE_GLOBAL uint8x16_t vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0
12669#define vcombine_u8 vcombine_s8
12670
12671_NEON2SSE_GLOBAL uint16x8_t vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0
12672#define vcombine_u16 vcombine_s16
12673
12674_NEON2SSE_GLOBAL uint32x4_t vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0
12675#define vcombine_u32 vcombine_s32
12676
12677_NEON2SSE_GLOBAL uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0
12678#define vcombine_u64 vcombine_s64
12679
12680_NEON2SSE_GLOBAL poly8x16_t vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0
12681#define vcombine_p8 vcombine_u8
12682
12683_NEON2SSE_GLOBAL poly16x8_t vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0
12684#define vcombine_p16 vcombine_u16
12685
12686//**********************************************************************
12687//************************* Splitting vectors **************************
12688//**********************************************************************
12689//**************** Get high part ******************************************
12690//These intrinsics split a 128 bit vector into 2 component 64 bit vectors
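//Illustrative sketch (assumed variable names, kept as a comment, not part of the original header):
//    int16x8_t q  = vcombine_s16(d0, d1);
//    int16x4_t hi = vget_high_s16(q);          //hi == d1
//    int16x4_t lo = vget_low_s16(q);           //lo == d0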
12691_NEON2SSESTORAGE int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0
12692_NEON2SSE_INLINE int8x8_t vget_high_s8(int8x16_t a)
12693{
12694 int8x8_t res64;
12695 __m128i res;
12696 res = _mm_unpackhi_epi64(a,a); //SSE2
12697 return64(res);
12698}
12699
12700_NEON2SSESTORAGE int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0
12701_NEON2SSE_INLINE int16x4_t vget_high_s16(int16x8_t a)
12702{
12703 int16x4_t res64;
12704 __m128i res;
12705 res = _mm_unpackhi_epi64(a,a); //SSE2
12706 return64(res);
12707}
12708
12709_NEON2SSESTORAGE int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0
12710_NEON2SSE_INLINE int32x2_t vget_high_s32(int32x4_t a)
12711{
12712 int32x2_t res64;
12713 __m128i res;
12714 res = _mm_unpackhi_epi64(a,a); //SSE2
12715 return64(res);
12716}
12717
12718_NEON2SSESTORAGE int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0
12719_NEON2SSE_INLINE int64x1_t vget_high_s64(int64x2_t a)
12720{
12721 int64x1_t res64;
12722 __m128i res;
12723 res = _mm_unpackhi_epi64(a,a); //SSE2
12724 return64(res);
12725}
12726
12727_NEON2SSE_GLOBAL float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0
12728// IA32 SIMD doesn't work with 16bit floats currently
12729
12730_NEON2SSESTORAGE float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0
12731_NEON2SSE_INLINE float32x2_t vget_high_f32(float32x4_t a)
12732{
12733 __m128i res;
12734 __m64_128 res64;
12735 res = _mm_unpackhi_epi64(_M128i(a),_M128i(a));
12736 return64(res);
12737}
12738
12739_NEON2SSE_GLOBAL uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0
12740#define vget_high_u8 vget_high_s8
12741
12742_NEON2SSE_GLOBAL uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0
12743#define vget_high_u16 vget_high_s16
12744
12745_NEON2SSE_GLOBAL uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0
12746#define vget_high_u32 vget_high_s32
12747
12748_NEON2SSE_GLOBAL uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0
12749#define vget_high_u64 vget_high_s64
12750
12751_NEON2SSE_GLOBAL poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0
12752#define vget_high_p8 vget_high_u8
12753
12754_NEON2SSE_GLOBAL poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0
12755#define vget_high_p16 vget_high_u16
12756
12757//********************** Get low part **********************
12758//**********************************************************
12759_NEON2SSESTORAGE int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0
12760_NEON2SSE_INLINE int8x8_t vget_low_s8(int8x16_t a) // VMOV d0,d0
12761{
    int8x8_t res64;
12763 return64(a);
12764}
12765
12766_NEON2SSESTORAGE int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0
12767_NEON2SSE_INLINE int16x4_t vget_low_s16(int16x8_t a) // VMOV d0,d0
12768{
12769 int16x4_t res64;
12770 return64(a);
12771}
12772
12773_NEON2SSESTORAGE int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0
12774_NEON2SSE_INLINE int32x2_t vget_low_s32(int32x4_t a) // VMOV d0,d0
12775{
12776 int32x2_t res64;
12777 return64(a);
12778}
12779
12780_NEON2SSESTORAGE int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0
12781_NEON2SSE_INLINE int64x1_t vget_low_s64(int64x2_t a) // VMOV d0,d0
12782{
12783 int64x1_t res64;
12784 return64 (a);
12785}
12786
12787_NEON2SSE_GLOBAL float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0
12788// IA32 SIMD doesn't work with 16bit floats currently
12789
12790_NEON2SSESTORAGE float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0
12791_NEON2SSE_INLINE float32x2_t vget_low_f32(float32x4_t a)
12792{
12793 float32x2_t res64;
12794 _M64f(res64, a);
12795 return res64;
12796}
12797
12798_NEON2SSE_GLOBAL uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0
12799#define vget_low_u8 vget_low_s8
12800
12801_NEON2SSE_GLOBAL uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0
12802#define vget_low_u16 vget_low_s16
12803
12804_NEON2SSE_GLOBAL uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0
12805#define vget_low_u32 vget_low_s32
12806
12807_NEON2SSE_GLOBAL uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0
12808#define vget_low_u64 vget_low_s64
12809
12810_NEON2SSE_GLOBAL poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0
12811#define vget_low_p8 vget_low_u8
12812
12813_NEON2SSE_GLOBAL poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0
12814#define vget_low_p16 vget_low_s16
12815
12816//**************************************************************************
12817//************************ Converting vectors **********************************
12818//**************************************************************************
12819//************* Convert from float ***************************************
// the rounding mode needs to be set accordingly via _MM_SET_ROUNDING_MODE(x)
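//A hedged usage sketch (not part of the original header): ARM VCVT.S32.F32 truncates toward zero,
//so when the mapping relies on _mm_cvtps_epi32 (which follows the current MXCSR rounding mode)
//the caller may need to select truncation first, e.g.:
//    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);  //match the ARM truncation behaviour
//    int32x2_t i = vcvt_s32_f32(f);                 //f is some float32x2_t value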
12821_NEON2SSESTORAGE int32x2_t vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0
12822_NEON2SSE_INLINE int32x2_t vcvt_s32_f32(float32x2_t a)
12823{
12824 int32x2_t res64;
12825 __m128i res;
12826 res = _mm_cvtps_epi32(_pM128(a)); //use low 64 bits of result only
12827 return64(res);
12828}
12829
12830_NEON2SSESTORAGE uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0
12831_NEON2SSE_INLINE uint32x2_t vcvt_u32_f32(float32x2_t a)
12832{
12833 uint32x2_t res64;
12834 __m128i res;
12835 res = vcvtq_u32_f32(_pM128(a));
12836 return64(res);
12837}
12838
12839_NEON2SSESTORAGE int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0
12840_NEON2SSE_INLINE int32x4_t vcvtq_s32_f32(float32x4_t a)
12841{
12842 __m128 dif;
12843 __m128i res;
    //_mm_cvttps_epi32 mishandles the case a >= 2.14748364e+009, therefore special processing is necessary
12845 _NEON2SSE_ALIGN_16 static const float32_t fmax[] = { 2.14748364e+009f, 2.14748364e+009f, 2.14748364e+009f, 2.14748364e+009f };
12846 dif = _mm_cmpge_ps(a, *(__m128*)fmax);
12847 res = _mm_cvttps_epi32(a);
12848 return _mm_xor_si128(res, _M128i(dif));
12849}
12850
12851_NEON2SSESTORAGE uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0
12852_NEON2SSE_INLINE uint32x4_t vcvtq_u32_f32(float32x4_t a) // VCVT.U32.F32 q0, q0
12853{
    //No single-instruction SSE solution, but it can be implemented as follows:
12855 __m128i res1, res2, zero, mask;
12856 __m128 max, min, dif;
12857 _NEON2SSE_ALIGN_16 static const float32_t fmax[] = { 2.14748364e+009f, 2.14748364e+009f, 2.14748364e+009f, 2.14748364e+009f };
12858 _NEON2SSE_ALIGN_16 static const float32_t fmax_unsigned[] = { 4.29496729e+009f, 4.29496729e+009f, 4.29496729e+009f, 4.29496729e+009f };
12859 zero = _mm_setzero_si128();
12860 mask = _mm_cmpgt_epi32(_M128i(a), zero);
12861 min = _mm_and_ps(_M128(mask), a);
12862 max = _mm_min_ps(min, *(__m128*)fmax_unsigned); //clamped in 0 - 4.29496729+009
12863
12864 dif = _mm_sub_ps(max, *(__m128*)fmax);
12865 mask = _mm_cmpgt_epi32(_M128i(dif),zero);
12866 dif = _mm_and_ps(_M128(mask), dif);
12867
12868 res1 = _mm_cvttps_epi32(dif);
12869 res2 = vcvtq_s32_f32(max);
12870 return _mm_add_epi32(res1, res2);
12871}
12872
12873// ***** Convert to the fixed point with the number of fraction bits specified by b ***********
12874//*************************************************************************************************
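//Worked sketch (illustrative, not part of the original header): the input is scaled by 2^b and then
//converted, so the result is the value in fixed-point format with b fraction bits, e.g.:
//    float32x2_t f  = vdup_n_f32(1.5f);
//    int32x2_t   q8 = vcvt_n_s32_f32(f, 8);    //1.5 * 2^8 = 384 in each lane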
12875_NEON2SSESTORAGE uint32_t clamp_u32_f32(float v);
12876_NEON2SSE_INLINE uint32_t clamp_u32_f32(float v)
12877{
12878 return (v <= 0 ? 0 : (v >= (float)~0U ? ~0U : (uint32_t)(v)));
12879}
12880
12881_NEON2SSESTORAGE int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32
12882_NEON2SSE_INLINE int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b)
12883{
12884 int32x2_t res64;
12885 return64(vcvtq_n_s32_f32(_pM128(a),b));
12886}
12887
12888_NEON2SSESTORAGE uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32
12889_NEON2SSE_INLINE uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b)
12890{
12891 uint32x2_t res;
12892 float convconst;
12893 convconst = (float)((uint64_t)1 << b);
12894 res.m64_u32[0] = clamp_u32_f32(a.m64_f32[0] * convconst);
12895 res.m64_u32[1] = clamp_u32_f32(a.m64_f32[1] * convconst);
12896 return res;
12897}
12898
12899_NEON2SSESTORAGE int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32
12900_NEON2SSE_INLINE int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b)
12901{
12902 float convconst;
12903 _NEON2SSE_ALIGN_16 static const uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
12904 __m128 cconst128;
12905 __m128i mask, res;
12906 convconst = (float)((uint64_t)1 << b);
12907 cconst128 = vdupq_n_f32(convconst);
12908 res = _mm_cvttps_epi32(_mm_mul_ps(a,cconst128));
12909 mask = _mm_cmpeq_epi32 (res, *(__m128i*)cmask);
12910
12911 /* ...for negative values we do not want to negate the bits of saturated value */
12912 mask = _mm_and_si128(_mm_castps_si128(_mm_cmpgt_ps(a,_mm_setzero_ps())), mask);
12913
12914 return _mm_xor_si128 (res, mask); //res saturated for 0x80000000
12915}
12916
12917_NEON2SSESTORAGE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32
12918_NEON2SSE_INLINE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b)
12919{
12920 float convconst;
12921 __m128 cconst128;
12922 convconst = (float)((uint64_t)1 << b);
12923 cconst128 = vdupq_n_f32(convconst);
12924 return vcvtq_u32_f32(_mm_mul_ps(a,cconst128));
12925}
12926
12927
12928_NEON2SSESTORAGE int32x4_t vcvtnq_s32_f32(float32x4_t a); // VCVTN.S32.F32 q0, q0
12929_NEON2SSE_INLINE int32x4_t vcvtnq_s32_f32(float32x4_t a)
12930{
12931 return _mm_cvtps_epi32(a);
12932}
12933
12934//***************** Convert to float *************************
12935//*************************************************************
12936_NEON2SSESTORAGE float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0
12937_NEON2SSE_INLINE float32x2_t vcvt_f32_s32(int32x2_t a) //use low 64 bits
12938{
12939 float32x2_t res;
12940 res.m64_f32[0] = (float) a.m64_i32[0];
12941 res.m64_f32[1] = (float) a.m64_i32[1];
12942 return res;
12943}
12944
12945_NEON2SSESTORAGE float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0
12946_NEON2SSE_INLINE float32x2_t vcvt_f32_u32(uint32x2_t a)
12947{
12948 float32x2_t res;
12949 res.m64_f32[0] = (float) a.m64_u32[0];
12950 res.m64_f32[1] = (float) a.m64_u32[1];
12951 return res;
12952}
12953
12954_NEON2SSE_GLOBAL float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0
12955#define vcvtq_f32_s32(a) _mm_cvtepi32_ps(a)
12956
12957_NEON2SSESTORAGE float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0
12958_NEON2SSE_INLINE float32x4_t vcvtq_f32_u32(uint32x4_t a) // VCVT.F32.U32 q0, q0
12959{
    //the solution may not be optimal
12961 __m128 two16, fHi, fLo;
12962 __m128i hi, lo;
12963 two16 = _mm_set1_ps((float)0x10000); //2^16
12964 // Avoid double rounding by doing two exact conversions
12965 // of high and low 16-bit segments
12966 hi = _mm_srli_epi32(a, 16);
12967 lo = _mm_srli_epi32(_mm_slli_epi32(a, 16), 16);
12968 fHi = _mm_mul_ps(_mm_cvtepi32_ps(hi), two16);
12969 fLo = _mm_cvtepi32_ps(lo);
12970 // do single rounding according to current rounding mode
12971 return _mm_add_ps(fHi, fLo);
12972}
12973
12974// ***** Convert to the float from fixed point with the number of fraction bits specified by b ***********
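//Illustrative counterpart of the fixed-point example above (assumed values, not part of the original header):
//    int32x2_t   q8 = vdup_n_s32(384);
//    float32x2_t f  = vcvt_n_f32_s32(q8, 8);   //384 / 2^8 = 1.5f in each lane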
12975_NEON2SSESTORAGE float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32
12976_NEON2SSE_INLINE float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b)
12977{
12978 float32x2_t res;
12979 float convconst;
12980 convconst = (float)(1. / ((uint64_t)1 << b));
12981 res.m64_f32[0] = a.m64_i32[0] * convconst;
12982 res.m64_f32[1] = a.m64_i32[1] * convconst;
12983 return res;
12984}
12985
12986_NEON2SSESTORAGE float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32
12987_NEON2SSE_INLINE float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b) // VCVT.F32.U32 d0, d0, #32
12988{
12989 float32x2_t res;
12990 float convconst;
12991 convconst = (float)(1. / ((uint64_t)1 << b));
12992 res.m64_f32[0] = a.m64_u32[0] * convconst;
12993 res.m64_f32[1] = a.m64_u32[1] * convconst;
12994 return res;
12995}
12996
12997_NEON2SSESTORAGE float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32
12998_NEON2SSE_INLINE float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b)
12999{
13000 float convconst;
13001 __m128 cconst128, af;
13002 convconst = (float)(1. / ((uint64_t)1 << b));
13003 af = _mm_cvtepi32_ps(a);
13004 cconst128 = vdupq_n_f32(convconst);
13005 return _mm_mul_ps(af,cconst128);
13006}
13007
13008_NEON2SSESTORAGE float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32
13009_NEON2SSE_INLINE float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b)
13010{
13011 float convconst;
13012 __m128 cconst128, af;
13013 convconst = (float)(1. / ((uint64_t)1 << b));
13014 af = vcvtq_f32_u32(a);
13015 cconst128 = vdupq_n_f32(convconst);
13016 return _mm_mul_ps(af,cconst128);
13017}
13018
13019//**************Convert between floats ***********************
13020//************************************************************
13021_NEON2SSE_GLOBAL float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0
//Intel SIMD doesn't support 16-bit floats currently
13023
13024_NEON2SSE_GLOBAL float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0
//Intel SIMD doesn't support 16-bit floats currently; the only solution is to store 16-bit floats and load them as 32 bits
13026
13027//************Vector narrow integer conversion (truncation) ******************
13028//****************************************************************************
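//Illustrative sketch (hypothetical values, not part of the original header): the narrowing is a plain truncation, only the low half of each element survives:
//    int16x8_t w = vdupq_n_s16(0x1234);
//    int8x8_t  n = vmovn_s16(w);               //every byte becomes 0x34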
13029_NEON2SSESTORAGE int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0
13030_NEON2SSE_INLINE int8x8_t vmovn_s16(int16x8_t a) // VMOVN.I16 d0,q0
13031{
13032 int8x8_t res64;
13033 __m128i res;
13034 res = _mm_shuffle_epi8 (a, *(__m128i*) mask8_16_even_odd); //use 64 low bits only
13035 return64(res);
13036}
13037
13038_NEON2SSESTORAGE int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0
13039_NEON2SSE_INLINE int16x4_t vmovn_s32(int32x4_t a) // VMOVN.I32 d0,q0
13040{
13041 int16x4_t res64;
13042 __m128i res;
13043 res = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd); //use 64 low bits only
13044 return64(res);
13045}
13046
13047_NEON2SSESTORAGE int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0
13048_NEON2SSE_INLINE int32x2_t vmovn_s64(int64x2_t a)
13049{
    //may be less efficient than a serial implementation
13051 int32x2_t res64;
13052 __m128i res;
13053 res = _mm_shuffle_epi32 (a, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //use 64 low bits only, _MM_SHUFFLE(3, 1, 2, 0)
13054 return64(res);
13055}
13056
13057_NEON2SSE_GLOBAL uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0
13058#define vmovn_u16 vmovn_s16
13059
13060_NEON2SSE_GLOBAL uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0
13061#define vmovn_u32 vmovn_s32
13062
13063_NEON2SSE_GLOBAL uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0
13064#define vmovn_u64 vmovn_s64
13065
13066//**************** Vector long move ***********************
13067//***********************************************************
13068_NEON2SSESTORAGE int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0
13069_NEON2SSE_INLINE int16x8_t vmovl_s8(int8x8_t a)
13070{
13071 return _MM_CVTEPI8_EPI16(_pM128i(a)); //SSE4.1
13072}
13073
13074_NEON2SSESTORAGE int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0
13075_NEON2SSE_INLINE int32x4_t vmovl_s16(int16x4_t a)
13076{
13077 return _MM_CVTEPI16_EPI32(_pM128i(a)); //SSE4.1
13078}
13079
13080_NEON2SSESTORAGE int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0
13081_NEON2SSE_INLINE int64x2_t vmovl_s32(int32x2_t a)
13082{
13083 return _MM_CVTEPI32_EPI64(_pM128i(a)); //SSE4.1
13084}
13085
13086_NEON2SSESTORAGE uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 q0,d0
13087_NEON2SSE_INLINE uint16x8_t vmovl_u8(uint8x8_t a)
13088{
13089 return _MM_CVTEPU8_EPI16(_pM128i(a)); //SSE4.1
13090}
13091
13092_NEON2SSESTORAGE uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.s16 q0,d0
13093_NEON2SSE_INLINE uint32x4_t vmovl_u16(uint16x4_t a)
13094{
13095 return _MM_CVTEPU16_EPI32(_pM128i(a)); //SSE4.1
13096}
13097
13098_NEON2SSESTORAGE uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0
13099_NEON2SSE_INLINE uint64x2_t vmovl_u32(uint32x2_t a)
13100{
13101 return _MM_CVTEPU32_EPI64(_pM128i(a)); //SSE4.1
13102}
13103
13104//*************Vector saturating narrow integer*****************
13105//**************************************************************
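//Illustrative sketch (hypothetical values, not part of the original header): unlike vmovn, out-of-range elements saturate instead of being truncated:
//    int16x8_t w = vdupq_n_s16(300);
//    int8x8_t  n = vqmovn_s16(w);              //every byte becomes 127 (the int8 maximum)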
13106_NEON2SSESTORAGE int8x8_t vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0
13107_NEON2SSE_INLINE int8x8_t vqmovn_s16(int16x8_t a)
13108{
13109 int8x8_t res64;
13110 __m128i res;
13111 res = _mm_packs_epi16(a, a);
13112 return64(res);
13113}
13114
13115_NEON2SSESTORAGE int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0
13116_NEON2SSE_INLINE int16x4_t vqmovn_s32(int32x4_t a)
13117{
13118 int16x4_t res64;
13119 __m128i res;
13120 res = _mm_packs_epi32(a, a);
13121 return64(res);
13122}
13123
13124_NEON2SSESTORAGE int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0
13125_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqmovn_s64(int64x2_t a),_NEON2SSE_REASON_SLOW_SERIAL) //no effective SIMD solution
13126{
13127 int32x2_t res;
13128 _NEON2SSE_ALIGN_16 int64_t atmp[2];
13129 _mm_store_si128((__m128i*)atmp, a);
13130 if(atmp[0]>SINT_MAX) atmp[0] = SINT_MAX;
13131 if(atmp[0]<SINT_MIN) atmp[0] = SINT_MIN;
13132 if(atmp[1]>SINT_MAX) atmp[1] = SINT_MAX;
13133 if(atmp[1]<SINT_MIN) atmp[1] = SINT_MIN;
13134 res.m64_i32[0] = (int32_t)atmp[0];
13135 res.m64_i32[1] = (int32_t)atmp[1];
13136 return res;
13137}
13138
13139_NEON2SSESTORAGE uint8x8_t vqmovn_u16(uint16x8_t a); // VQMOVN.s16 d0,q0
13140_NEON2SSE_INLINE uint8x8_t vqmovn_u16(uint16x8_t a) // VQMOVN.s16 d0,q0
13141{
    //no uint16 to uint8 saturating conversion in SSE, need to truncate to max signed first. Also avoiding _mm_shuffle_epi8 because of its high latency on old Atom CPUs
13143 uint8x8_t res64;
13144 __m128i c7fff, a_trunc, mask_trunc;
13145 c7fff = _mm_set1_epi16 (0x7fff); // 15-th bit set to zero
13146 a_trunc = _mm_and_si128(a, c7fff); // a truncated to max signed
    mask_trunc = _mm_cmpgt_epi16(a_trunc, a); //if the truncated value compares greater (as signed) than the original then the 15-th bit had been set initially
13148 mask_trunc = _mm_and_si128(mask_trunc, c7fff); //zero or c7fff if the 15-th bit had been set initially
13149 a_trunc = _mm_or_si128(a_trunc, mask_trunc);
13150 a_trunc = _mm_packus_epi16 (a_trunc, a_trunc); //use low 64bits only
13151 return64(a_trunc);
13152}
13153
13154_NEON2SSESTORAGE uint16x4_t vqmovn_u32(uint32x4_t a); // VQMOVN.U32 d0,q0
13155_NEON2SSE_INLINE uint16x4_t vqmovn_u32(uint32x4_t a) // VQMOVN.U32 d0,q0
13156{
13157 #ifdef USE_SSE4
13158 //no uint32 to uint16 conversion in SSE, need truncate to max signed first
13159 uint16x4_t res64;
13160 __m128i c7fffffff, a_trunc, mask_trunc;
13161 c7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff); // 31-th bit set to zero
13162 a_trunc = _mm_and_si128(a, c7fffffff); // a truncated to max signed
    mask_trunc = _mm_cmpgt_epi16(a_trunc, a); //if the truncated value compares greater (as signed) than the original then the 31-st bit had been set initially
    mask_trunc = _mm_and_si128(mask_trunc, c7fffffff); //zero, or a pattern forcing saturation in the pack below, if the 31-st bit had been set initially
13165 a_trunc = _mm_or_si128(a_trunc, mask_trunc);
13166 a_trunc = _MM_PACKUS1_EPI32 (a_trunc); //use low 64bits only
13167 return64(a_trunc);
13168 #else
13169 uint16x4_t res64;
13170 __m128i res_hi, mask;
13171 mask = _mm_setzero_si128();
13172 res_hi = _mm_srli_epi32(a, 16);
13173 res_hi = _mm_cmpeq_epi16(res_hi, mask);
13174 mask = _mm_cmpeq_epi16(mask,mask); //all fff
    mask = _mm_andnot_si128(res_hi,mask); //invert res_hi to get a mask for the numbers exceeding 16 bits
13176 res_hi = _mm_or_si128(a, mask); //saturated res
13177 res_hi = _mm_shuffle_epi8 (res_hi, *(__m128i*) mask8_32_even_odd); //go to 16 bits
13178 return64(res_hi);
13179 #endif
13180}
13181
13182_NEON2SSESTORAGE uint32x2_t vqmovn_u64(uint64x2_t a); // VQMOVN.U64 d0,q0
13183_NEON2SSE_INLINE uint32x2_t vqmovn_u64(uint64x2_t a)
13184{
13185 //serial solution may be faster
13186 uint32x2_t res64;
13187 __m128i res_hi, mask;
13188 mask = _mm_setzero_si128();
13189 res_hi = _mm_srli_epi64(a, 32);
13190 res_hi = _mm_cmpeq_epi32(res_hi, mask);
13191 mask = _mm_cmpeq_epi32(mask,mask); //all fff
    mask = _mm_andnot_si128(res_hi,mask); //invert res_hi to get a mask for the numbers exceeding 32 bits
13193 res_hi = _mm_or_si128(a, mask);
13194 res_hi = _mm_shuffle_epi32(res_hi, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
13195 return64(res_hi);
13196}
13197//************* Vector saturating narrow integer signed->unsigned **************
13198//*****************************************************************************
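//Illustrative sketch (hypothetical values, not part of the original header): negative inputs clamp to 0 and too-large inputs to the unsigned maximum:
//    int16x8_t w = vdupq_n_s16(-5);
//    uint8x8_t n = vqmovun_s16(w);             //every byte becomes 0; an input of 300 would give 255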
13199_NEON2SSESTORAGE uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0
13200_NEON2SSE_INLINE uint8x8_t vqmovun_s16(int16x8_t a)
13201{
13202 uint8x8_t res64;
13203 __m128i res;
13204 res = _mm_packus_epi16(a, a); //use low 64bits only
13205 return64(res);
13206}
13207
13208_NEON2SSESTORAGE uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0
13209_NEON2SSE_INLINE uint16x4_t vqmovun_s32(int32x4_t a)
13210{
13211 uint16x4_t res64;
13212 __m128i res;
13213 res = _MM_PACKUS1_EPI32(a); //use low 64bits only
13214 return64(res);
13215}
13216
13217_NEON2SSESTORAGE uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0
13218_NEON2SSE_INLINE uint32x2_t vqmovun_s64(int64x2_t a)
13219{
13220 uint32x2_t res64;
13221 __m128i res_hi,res_lo, zero, cmp;
13222 zero = _mm_setzero_si128();
13223 res_hi = _mm_srli_epi64(a, 32);
13224 cmp = _mm_cmpgt_epi32(zero, res_hi); //if cmp<0 the result should be zero
13225 res_lo = _mm_andnot_si128(cmp,a); //if cmp zero - do nothing, otherwise cmp <0 and the result is 0
13226 cmp = _mm_cmpgt_epi32(res_hi,zero); //if cmp positive
    res_lo = _mm_or_si128(res_lo, cmp); //if cmp is positive we are out of 32 bits and need to saturate to 0xffffffff
13228 res_lo = _mm_shuffle_epi32(res_lo, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
13229 return64(res_lo);
13230}
13231
13232// ********************************************************
13233// **************** Table look up **************************
13234// ********************************************************
13235//VTBL (Vector Table Lookup) uses byte indexes in a control vector to look up byte values
13236//in a table and generate a new vector. Indexes out of range return 0.
13237
//for Intel SIMD (_mm_shuffle_epi8) the MSB of an index byte must be set to 1 to return zero
//if an index byte in b is already above the max signed value (i.e. appears negative), its MSB is set and it needs no special processing
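//Minimal usage sketch (assumed data, kept as a comment, not part of the original header):
//    uint8x8_t table = vcreate_u8(0x0706050403020100ULL); //table[i] = i
//    uint8x8_t index = vdup_n_u8(3);
//    uint8x8_t out   = vtbl1_u8(table, index);             //every byte = table[3] = 3; an index >= 8 would give 0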
13240_NEON2SSESTORAGE uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
13241_NEON2SSE_INLINE uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b)
13242{
13243 uint8x8_t res64;
13244 __m128i c7, maskgt, bmask, b128;
13245 c7 = _mm_set1_epi8 (7);
13246 b128 = _pM128i(b);
13247 maskgt = _mm_cmpgt_epi8(b128,c7);
13248 bmask = _mm_or_si128(b128,maskgt);
13249 bmask = _mm_shuffle_epi8(_pM128i(a),bmask);
13250 return64(bmask);
13251}
13252
13253_NEON2SSE_GLOBAL int8x8_t vtbl1_s8(int8x8_t a, int8x8_t b); // VTBL.8 d0, {d0}, d0
13254#define vtbl1_s8 vtbl1_u8
13255
13256_NEON2SSE_GLOBAL poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
13257#define vtbl1_p8 vtbl1_u8
13258
13259_NEON2SSESTORAGE uint8x8_t vtbl2_u8(uint8x8x2_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
13260_NEON2SSE_INLINE uint8x8_t vtbl2_u8(uint8x8x2_t a, uint8x8_t b)
13261{
13262 uint8x8_t res64;
13263 __m128i c15, a01, maskgt15, bmask, b128;
13264 c15 = _mm_set1_epi8 (15);
13265 b128 = _pM128i(b);
13266 maskgt15 = _mm_cmpgt_epi8(b128,c15);
13267 bmask = _mm_or_si128(b128, maskgt15);
13268 a01 = _mm_unpacklo_epi64(_pM128i(a.val[0]), _pM128i(a.val[1]));
13269 a01 = _mm_shuffle_epi8(a01, bmask);
13270 return64(a01);
13271}
13272
13273//int8x8_t vtbl2_s8(int8x8x2_t a, int8x8_t b); // VTBL.8 d0, {d0, d1}, d0
13274#define vtbl2_s8 vtbl2_u8
13275
13276//poly8x8_t vtbl2_p8(poly8x8x2_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
13277#define vtbl2_p8 vtbl2_u8
13278
13279_NEON2SSESTORAGE uint8x8_t vtbl3_u8(uint8x8x3_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
13280_NEON2SSE_INLINE uint8x8_t vtbl3_u8(uint8x8x3_t a, uint8x8_t b)
13281{
    //the solution may not be optimal
13283 uint8x8_t res64;
13284 __m128i c15, c23, maskgt23, bmask, maskgt15, sh0, sh1, a01, b128;
13285 c15 = _mm_set1_epi8 (15);
13286 c23 = _mm_set1_epi8 (23);
13287 b128 = _pM128i(b);
13288 maskgt23 = _mm_cmpgt_epi8(b128,c23);
13289 bmask = _mm_or_si128(b128, maskgt23);
13290 maskgt15 = _mm_cmpgt_epi8(b128,c15);
13291 a01 = _mm_unpacklo_epi64(_pM128i(a.val[0]),_pM128i(a.val[1]));
13292 sh0 = _mm_shuffle_epi8(a01, bmask);
    sh1 = _mm_shuffle_epi8(_pM128i(a.val[2]), bmask); //for bi>15 the index is wrapped by the shuffle (bi &= 15)
13294 sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15); //SSE4.1
13295 return64(sh0);
13296}
13297
13298_NEON2SSE_GLOBAL int8x8_t vtbl3_s8(int8x8x3_t a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
13299#define vtbl3_s8 vtbl3_u8
13300
13301_NEON2SSE_GLOBAL poly8x8_t vtbl3_p8(poly8x8x3_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
13302#define vtbl3_p8 vtbl3_u8
13303
13304_NEON2SSESTORAGE uint8x8_t vtbl4_u8(uint8x8x4_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
13305_NEON2SSE_INLINE uint8x8_t vtbl4_u8(uint8x8x4_t a, uint8x8_t b)
13306{
    //the solution may not be optimal
13308 uint8x8_t res64;
13309 __m128i c15, c31, maskgt31, bmask, maskgt15, sh0, sh1, a01, a23, b128;
13310 c15 = _mm_set1_epi8 (15);
13311 c31 = _mm_set1_epi8 (31);
13312 b128 = _pM128i(b);
13313 maskgt31 = _mm_cmpgt_epi8(b128,c31);
13314 bmask = _mm_or_si128(b128, maskgt31);
13315 maskgt15 = _mm_cmpgt_epi8(b128,c15);
13316 a01 = _mm_unpacklo_epi64(_pM128i(a.val[0]),_pM128i(a.val[1]));
13317 a23 = _mm_unpacklo_epi64(_pM128i(a.val[2]),_pM128i(a.val[3]));
13318 sh0 = _mm_shuffle_epi8(a01, bmask);
    sh1 = _mm_shuffle_epi8(a23, bmask); //for bi>15 the index is wrapped by the shuffle (bi &= 15)
13320 sh0 = _MM_BLENDV_EPI8 (sh0, sh1, maskgt15); //SSE4.1
13321 return64(sh0);
13322}
13323
13324_NEON2SSE_GLOBAL int8x8_t vtbl4_s8(int8x8x4_t a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
13325#define vtbl4_s8 vtbl4_u8
13326
13327_NEON2SSE_GLOBAL poly8x8_t vtbl4_p8(poly8x8x4_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
13328#define vtbl4_p8 vtbl4_u8
13329
13330//****************** Extended table look up intrinsics ***************************
13331//**********************************************************************************
13332//VTBX (Vector Table Extension) works in the same way as VTBL do,
13333// except that indexes out of range leave the destination element unchanged.
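//Minimal usage sketch (assumed data, kept as a comment, not part of the original header): an out-of-range index keeps the corresponding byte of 'a':
//    uint8x8_t keep = vdup_n_u8(0xAA);
//    uint8x8_t tbl  = vcreate_u8(0x0706050403020100ULL);
//    uint8x8_t idx  = vdup_n_u8(9);                        //9 is out of range for a one-register table
//    uint8x8_t out  = vtbx1_u8(keep, tbl, idx);            //every byte stays 0xAA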
13334
13335_NEON2SSESTORAGE uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
13336_NEON2SSE_INLINE uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c)
13337{
13338 uint8x8_t res64;
13339 __m128i c8, maskgt, sh, c128;
13340 c8 = _mm_set1_epi8(8);
13341 c128 = _pM128i(c);
13342 //need to pre-clamp c values to avoid unsigned comparison
13343 c128 = _mm_min_epu8(c128, c8);
13344 maskgt = _mm_cmpgt_epi8(c8,c128);
13345 sh = _mm_shuffle_epi8(_pM128i(b),c128);
13346 sh = _mm_and_si128(maskgt,sh);
13347 c8 = _mm_andnot_si128(maskgt,_pM128i(a));
13348 sh = _mm_or_si128(sh,c8);
13349 return64(sh);
13350}
13351
13352_NEON2SSE_GLOBAL int8x8_t vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0
13353#define vtbx1_s8 vtbx1_u8
13354
13355_NEON2SSE_GLOBAL poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
13356#define vtbx1_p8 vtbx1_u8
13357
13358_NEON2SSESTORAGE uint8x8_t vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
13359_NEON2SSE_INLINE uint8x8_t vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c)
13360{
13361 uint8x8_t res64;
13362 __m128i c16, b01, maskgt15, sh, c128;
13363 c16 = _mm_set1_epi8(16);
13364 c128 = _pM128i(c);
13365 //need to pre-clamp c values to avoid unsigned comparison
13366 c128 = _mm_min_epu8(c128, c16);
13367 maskgt15 = _mm_cmpgt_epi8(c16,c128);
13368 b01 = _mm_unpacklo_epi64(_pM128i(b.val[0]), _pM128i(b.val[1]));
13369 sh = _mm_shuffle_epi8(b01, c128);
13370 sh = _mm_and_si128(maskgt15, sh);
13371 c16 = _mm_andnot_si128(maskgt15, _pM128i(a));
13372 sh = _mm_or_si128(sh,c16);
13373 return64(sh);
13374}
13375
13376//int8x8_t vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c); // VTBX.8 d0, {d0, d1}, d0
13377#define vtbx2_s8 vtbx2_u8
13378
13379//poly8x8_t vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
13380#define vtbx2_p8 vtbx2_u8
13381
13382_NEON2SSESTORAGE uint8x8_t vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
13383_NEON2SSE_INLINE uint8x8_t vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c)
13384{
    //the solution may not be optimal
13386 uint8x8_t res64;
13387 __m128i c15, c24, maskgt15, maskgt23, sh0, sh1, b01, c128;
13388 c15 = _mm_set1_epi8 (15);
13389 c24 = _mm_set1_epi8 (24);
13390 c128 = _pM128i(c);
13391 //need to pre-clamp c values to avoid unsigned comparison
13392 c128 = _mm_min_epu8(c128, c24);
13393 maskgt23 = _mm_cmpgt_epi8(c24,c128);
13394 maskgt15 = _mm_cmpgt_epi8(c128,c15);
13395 c24 = _mm_andnot_si128(maskgt23, _pM128i(a));
13396 b01 = _mm_unpacklo_epi64(_pM128i(b.val[0]),_pM128i(b.val[1]));
13397 sh0 = _mm_shuffle_epi8(b01, c128);
    sh1 = _mm_shuffle_epi8(_pM128i(b.val[2]), c128); //for bi>15 the index is wrapped by the shuffle (bi &= 15)
13399 sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15);
13400 sh0 = _mm_and_si128(maskgt23,sh0);
13401 sh0 = _mm_or_si128(sh0,c24);
13402 return64(sh0);
13403}
13404
13405_NEON2SSE_GLOBAL int8x8_t vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
13406#define vtbx3_s8 vtbx3_u8
13407
13408_NEON2SSE_GLOBAL poly8x8_t vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
13409#define vtbx3_p8 vtbx3_u8
13410
13411_NEON2SSESTORAGE uint8x8_t vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
13412_NEON2SSE_INLINE uint8x8_t vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c)
13413{
    //the solution may not be optimal
13415 uint8x8_t res64;
13416 __m128i c15, c32, maskgt15, maskgt31, sh0, sh1, b01, b23, c128;
13417 c15 = _mm_set1_epi8 (15);
13418 c32 = _mm_set1_epi8 (32);
13419 c128 = _pM128i(c);
13420 //need to pre-clamp c values to avoid unsigned comparison
13421 c128 = _mm_min_epu8(c128, c32);
13422 maskgt15 = _mm_cmpgt_epi8(c128,c15);
13423 maskgt31 = _mm_cmpgt_epi8(c32,c128);
13424 c32 = _mm_andnot_si128(maskgt31, _pM128i(a));
13425
13426 b01 = _mm_unpacklo_epi64(_pM128i(b.val[0]),_pM128i(b.val[1]));
13427 b23 = _mm_unpacklo_epi64(_pM128i(b.val[2]),_pM128i(b.val[3]));
13428 sh0 = _mm_shuffle_epi8(b01, c128);
    sh1 = _mm_shuffle_epi8(b23, c128); //for bi>15 the index is wrapped by the shuffle (bi &= 15)
13430 sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15);
13431 sh0 = _mm_and_si128(maskgt31,sh0);
13432 sh0 = _mm_or_si128(sh0,c32);
13433 return64(sh0);
13434}
13435
13436_NEON2SSE_GLOBAL int8x8_t vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
13437#define vtbx4_s8 vtbx4_u8
13438
13439_NEON2SSE_GLOBAL poly8x8_t vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
13440#define vtbx4_p8 vtbx4_u8
13441
13442//*************************************************************************************************
13443// *************************** Operations with a scalar value *********************************
13444//*************************************************************************************************
13445
13446//******* Vector multiply accumulate by scalar *************************************************
13447//**********************************************************************************************
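//Minimal usage sketch (assumed variable names, not part of the original header): each lane gets a[i] + b[i] * v[l], e.g.:
//    int16x4_t acc = vmla_lane_s16(a, b, coeffs, 2);       //acc[i] = a[i] + b[i]*coeffs[2]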
13448_NEON2SSESTORAGE int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0, d0[0]
13449_NEON2SSE_INLINE int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLA.I16 d0, d0, d0[0]
13450{
13451 int16_t c;
13452 int16x4_t scalar;
13453 c = vget_lane_s16(v, l);
13454 scalar = vdup_n_s16(c);
13455 return vmla_s16(a, b, scalar);
13456}
13457
13458_NEON2SSESTORAGE int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0, d0[0]
13459_NEON2SSE_INLINE int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLA.I32 d0, d0, d0[0]
13460{
13461 int32_t c;
13462 int32x2_t scalar;
13463 c = vget_lane_s32(v, l);
13464 scalar = vdup_n_s32(c);
13465 return vmla_s32(a, b, scalar);
13466}
13467
13468_NEON2SSE_GLOBAL uint16x4_t vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0, d0[0]
13469#define vmla_lane_u16 vmla_lane_s16
13470
13471
13472_NEON2SSE_GLOBAL uint32x2_t vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0, d0[0]
13473#define vmla_lane_u32 vmla_lane_s32
13474
13475_NEON2SSESTORAGE float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 d0, d0, d0[0]
13476_NEON2SSE_INLINE float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l)
13477{
13478 float32_t vlane;
13479 float32x2_t c;
13480 vlane = vget_lane_f32(v, l);
13481 c = vdup_n_f32(vlane);
13482 return vmla_f32(a,b,c);
13483}
13484
13485_NEON2SSESTORAGE int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0]
13486_NEON2SSE_INLINE int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l) // VMLA.I16 q0, q0, d0[0]
13487{
13488 int16_t vlane;
13489 int16x8_t c;
13490 vlane = vget_lane_s16(v, l);
13491 c = vdupq_n_s16(vlane);
13492 return vmlaq_s16(a,b,c);
13493}
13494
13495_NEON2SSESTORAGE int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0]
13496_NEON2SSE_INLINE int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l) // VMLA.I32 q0, q0, d0[0]
13497{
13498 int32_t vlane;
13499 int32x4_t c;
13500 vlane = vget_lane_s32(v, l);
13501 c = vdupq_n_s32(vlane);
13502 return vmlaq_s32(a,b,c);
13503}
13504
13505_NEON2SSE_GLOBAL uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0]
13506#define vmlaq_lane_u16 vmlaq_lane_s16
13507
13508_NEON2SSE_GLOBAL uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0]
13509#define vmlaq_lane_u32 vmlaq_lane_s32
13510
13511_NEON2SSESTORAGE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0, q0, d0[0]
13512_NEON2SSE_INLINE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l) // VMLA.F32 q0, q0, d0[0]
13513{
13514 float32_t vlane;
13515 float32x4_t c;
13516 vlane = vget_lane_f32(v, l);
13517 c = vdupq_n_f32(vlane);
13518 return vmlaq_f32(a,b,c);
13519}
13520
13521//***************** Vector widening multiply accumulate by scalar **********************
13522//***************************************************************************************
13523_NEON2SSESTORAGE int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLAL.S16 q0, d0, d0[0]
13524_NEON2SSE_INLINE int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLAL.S16 q0, d0, d0[0]
13525{
13526 int16_t vlane;
13527 int16x4_t c;
13528 vlane = vget_lane_s16(v, l);
13529 c = vdup_n_s16(vlane);
13530 return vmlal_s16(a, b, c);
13531}
13532
13533_NEON2SSESTORAGE int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLAL.S32 q0, d0, d0[0]
13534_NEON2SSE_INLINE int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLAL.S32 q0, d0, d0[0]
13535{
13536 int32_t vlane;
13537 int32x2_t c;
13538 vlane = vget_lane_s32(v, l);
13539 c = vdup_n_s32(vlane);
13540 return vmlal_s32(a, b, c);
13541}
13542
13543_NEON2SSESTORAGE uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.s16 q0, d0, d0[0]
13544_NEON2SSE_INLINE uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLAL.s16 q0, d0, d0[0]
13545{
13546 uint16_t vlane;
13547 uint16x4_t c;
13548 vlane = vget_lane_u16(v, l);
13549 c = vdup_n_u16(vlane);
13550 return vmlal_u16(a, b, c);
13551}
13552
13553_NEON2SSESTORAGE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0, d0, d0[0]
13554_NEON2SSE_INLINE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLAL.U32 q0, d0, d0[0]
13555{
13556 uint32_t vlane;
13557 uint32x2_t c;
13558 vlane = vget_lane_u32(v, l);
13559 c = vdup_n_u32(vlane);
13560 return vmlal_u32(a, b, c);
13561}
13562
13563// ******** Vector widening saturating doubling multiply accumulate by scalar *******************************
13564// ************************************************************************************************
13565_NEON2SSESTORAGE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0, d0, d0[0]
13566_NEON2SSE_INLINE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l)
13567{
13568 int16_t vlane;
13569 int16x4_t c;
13570 vlane = vget_lane_s16(v, l);
13571 c = vdup_n_s16(vlane);
13572 return vqdmlal_s16(a, b, c);
13573}
13574
13575_NEON2SSESTORAGE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0, d0, d0[0]
13576_NEON2SSE_INLINE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l)
13577{
13578 int32_t vlane;
    int32x2_t c;
13580 vlane = vget_lane_s32(v, l);
13581 c = vdup_n_s32(vlane);
13582 return vqdmlal_s32(a, b, c);
13583}
13584
13585// ****** Vector multiply subtract by scalar *****************
13586// *************************************************************
13587_NEON2SSESTORAGE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0, d0[0]
13588_NEON2SSE_INLINE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLS.I16 d0, d0, d0[0]
13589{
13590 int16_t vlane;
13591 int16x4_t c;
13592 vlane = vget_lane_s16(v, l);
13593 c = vdup_n_s16(vlane);
13594 return vmls_s16(a, b, c);
13595}
13596
13597_NEON2SSESTORAGE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0, d0[0]
13598_NEON2SSE_INLINE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLS.I32 d0, d0, d0[0]
13599{
13600 int32_t vlane;
13601 int32x2_t c;
13602 vlane = vget_lane_s32(v, l);
13603 c = vdup_n_s32(vlane);
13604 return vmls_s32(a, b, c);
13605}
13606
13607_NEON2SSESTORAGE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0, d0[0]
13608_NEON2SSE_INLINE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLS.I16 d0, d0, d0[0]
13609{
13610 uint16_t vlane;
13611 uint16x4_t c;
    vlane = vget_lane_u16(v, l);
    c = vdup_n_u16(vlane);
    return vmls_u16(a, b, c);
13615}
13616
13617_NEON2SSESTORAGE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0, d0[0]
13618_NEON2SSE_INLINE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLS.I32 d0, d0, d0[0]
13619{
13620 uint32_t vlane;
13621 uint32x2_t c;
13622 vlane = vget_lane_u32(v, l);
13623 c = vdup_n_u32(vlane);
13624 return vmls_u32(a, b, c);
13625}
13626
13627_NEON2SSESTORAGE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0, d0, d0[0]
13628_NEON2SSE_INLINE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l)
13629{
13630 float32_t vlane;
13631 float32x2_t c;
13632 vlane = (float) vget_lane_f32(v, l);
13633 c = vdup_n_f32(vlane);
13634 return vmls_f32(a,b,c);
13635}
13636
13637_NEON2SSESTORAGE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0, d0[0]
13638_NEON2SSE_INLINE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l) // VMLS.I16 q0, q0, d0[0]
13639{
13640 int16_t vlane;
13641 int16x8_t c;
13642 vlane = vget_lane_s16(v, l);
13643 c = vdupq_n_s16(vlane);
13644 return vmlsq_s16(a, b,c);
13645}
13646
13647_NEON2SSESTORAGE int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0, d0[0]
13648_NEON2SSE_INLINE int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l) // VMLS.I32 q0, q0, d0[0]
13649{
13650 int32_t vlane;
13651 int32x4_t c;
13652 vlane = vget_lane_s32(v, l);
13653 c = vdupq_n_s32(vlane);
13654 return vmlsq_s32(a,b,c);
13655}
13656
_NEON2SSESTORAGE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0, d0[0]
_NEON2SSE_INLINE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l) // VMLS.I16 q0, q0, d0[0]
13659{
13660 uint16_t vlane;
13661 uint16x8_t c;
13662 vlane = vget_lane_u16(v, l);
13663 c = vdupq_n_u16(vlane);
13664 return vmlsq_u16(a,b,c);
13665}
13666
_NEON2SSESTORAGE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0, d0[0]
_NEON2SSE_INLINE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l) // VMLS.I32 q0, q0, d0[0]
13669{
13670 uint32_t vlane;
13671 uint32x4_t c;
13672 vlane = vget_lane_u32(v, l);
13673 c = vdupq_n_u32(vlane);
13674 return vmlsq_u32(a,b,c);
13675}
13676
_NEON2SSESTORAGE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 q0, q0, d0[0]
_NEON2SSE_INLINE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l) // VMLS.F32 q0, q0, d0[0]
13679{
13680 float32_t vlane;
13681 float32x4_t c;
13682 vlane = (float) vget_lane_f32(v, l);
13683 c = vdupq_n_f32(vlane);
13684 return vmlsq_f32(a,b,c);
13685}
13686
13687// **** Vector widening multiply subtract by scalar ****
13688// ****************************************************
_NEON2SSESTORAGE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLSL.S16 q0, d0, d0[0]
_NEON2SSE_INLINE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLSL.S16 q0, d0, d0[0]
13691{
13692 int16_t vlane;
13693 int16x4_t c;
13694 vlane = vget_lane_s16(v, l);
13695 c = vdup_n_s16(vlane);
13696 return vmlsl_s16(a, b, c);
13697}
13698
_NEON2SSESTORAGE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLSL.S32 q0, d0, d0[0]
_NEON2SSE_INLINE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLSL.S32 q0, d0, d0[0]
13701{
13702 int32_t vlane;
13703 int32x2_t c;
13704 vlane = vget_lane_s32(v, l);
13705 c = vdup_n_s32(vlane);
13706 return vmlsl_s32(a, b, c);
13707}
13708
_NEON2SSESTORAGE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLSL.U16 q0, d0, d0[0]
_NEON2SSE_INLINE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLSL.U16 q0, d0, d0[0]
13711{
13712 uint16_t vlane;
13713 uint16x4_t c;
13714 vlane = vget_lane_u16(v, l);
13715 c = vdup_n_u16(vlane);
13716 return vmlsl_u16(a, b, c);
13717}
13718
_NEON2SSESTORAGE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLSL.U32 q0, d0, d0[0]
_NEON2SSE_INLINE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLSL.U32 q0, d0, d0[0]
13721{
13722 uint32_t vlane;
13723 uint32x2_t c;
13724 vlane = vget_lane_u32(v, l);
13725 c = vdup_n_u32(vlane);
13726 return vmlsl_u32(a, b, c);
13727}
13728
13729//********* Vector widening saturating doubling multiply subtract by scalar **************************
13730//******************************************************************************************************
13731_NEON2SSESTORAGE int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0, d0, d0[0]
13732_NEON2SSE_INLINE int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l)
13733{
13734 int16_t vlane;
13735 int16x4_t c;
13736 vlane = vget_lane_s16(v, l);
13737 c = vdup_n_s16(vlane);
13738 return vqdmlsl_s16(a, b, c);
13739}
13740
13741_NEON2SSESTORAGE int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0, d0, d0[0]
13742_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l), _NEON2SSE_REASON_SLOW_SERIAL)
13743{
13744 int32_t vlane;
13745 int32x2_t c;
13746 vlane = vget_lane_s32(v, l);
13747 c = vdup_n_s32(vlane);
13748 return vqdmlsl_s32(a, b, c);
13749}
13750//********** Vector multiply with scalar *****************************
13751_NEON2SSESTORAGE int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0]
13752_NEON2SSE_INLINE int16x4_t vmul_n_s16(int16x4_t a, int16_t b) // VMUL.I16 d0,d0,d0[0]
13753{
13754 int16x4_t b16x4;
13755 b16x4 = vdup_n_s16(b);
13756 return vmul_s16(a, b16x4);
13757}
13758
13759_NEON2SSESTORAGE int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0]
13760_NEON2SSE_INLINE int32x2_t vmul_n_s32(int32x2_t a, int32_t b) // VMUL.I32 d0,d0,d0[0]
13761{
13762 //serial solution looks faster
13763 int32x2_t b32x2;
13764 b32x2 = vdup_n_s32(b);
13765 return vmul_s32(a, b32x2);
13766}
13767
13768_NEON2SSESTORAGE float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0]
13769_NEON2SSE_INLINE float32x2_t vmul_n_f32(float32x2_t a, float32_t b) // VMUL.F32 d0,d0,d0[0]
13770{
13771 float32x2_t b32x2;
13772 b32x2 = vdup_n_f32(b);
13773 return vmul_f32(a, b32x2);
13774}
13775
13776_NEON2SSESTORAGE uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0]
13777_NEON2SSE_INLINE uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b) // VMUL.I16 d0,d0,d0[0]
13778{
13779 uint16x4_t b16x4;
13780 b16x4 = vdup_n_s16(b);
13781 return vmul_s16(a, b16x4);
13782}
13783
13784_NEON2SSESTORAGE uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0]
13785_NEON2SSE_INLINE uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b) // VMUL.I32 d0,d0,d0[0]
13786{
13787 //serial solution looks faster
13788 uint32x2_t b32x2;
13789 b32x2 = vdup_n_u32(b);
13790 return vmul_u32(a, b32x2);
13791}
13792
13793_NEON2SSESTORAGE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0]
13794_NEON2SSE_INLINE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b) // VMUL.I16 q0,q0,d0[0]
13795{
13796 int16x8_t b16x8;
13797 b16x8 = vdupq_n_s16(b);
13798 return vmulq_s16(a, b16x8);
13799}
13800
13801_NEON2SSESTORAGE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0]
13802_NEON2SSE_INLINE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b) // VMUL.I32 q0,q0,d0[0]
13803{
13804 int32x4_t b32x4;
13805 b32x4 = vdupq_n_s32(b);
13806 return vmulq_s32(a, b32x4);
13807}
13808
13809_NEON2SSESTORAGE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0]
13810_NEON2SSE_INLINE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b) // VMUL.F32 q0,q0,d0[0]
13811{
13812 float32x4_t b32x4;
13813 b32x4 = vdupq_n_f32(b);
13814 return vmulq_f32(a, b32x4);
13815}
13816
13817_NEON2SSESTORAGE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0]
13818_NEON2SSE_INLINE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b) // VMUL.I16 q0,q0,d0[0]
13819{
13820 uint16x8_t b16x8;
13821 b16x8 = vdupq_n_s16(b);
13822 return vmulq_s16(a, b16x8);
13823}
13824
13825_NEON2SSESTORAGE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0]
13826_NEON2SSE_INLINE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b) // VMUL.I32 q0,q0,d0[0]
13827{
13828 uint32x4_t b32x4;
13829 b32x4 = vdupq_n_u32(b);
13830 return vmulq_u32(a, b32x4);
13831}
13832
13833//********** Vector multiply lane *****************************
13834_NEON2SSESTORAGE int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c);
13835_NEON2SSE_INLINE int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c)
13836{
13837 int16x4_t b16x4;
13838 int16_t vlane;
13839 vlane = vget_lane_s16(b, c);
13840 b16x4 = vdup_n_s16(vlane);
13841 return vmul_s16(a, b16x4);
13842}
13843
13844_NEON2SSESTORAGE int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c);
13845_NEON2SSE_INLINE int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c)
13846{
13847 int32x2_t b32x2;
13848 int32_t vlane;
13849 vlane = vget_lane_s32(b, c);
13850 b32x2 = vdup_n_s32(vlane);
13851 return vmul_s32(a, b32x2);
13852}
13853
13854_NEON2SSESTORAGE float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c);
13855_NEON2SSE_INLINE float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c)
13856{
13857 float32x2_t b32x2;
13858 float32_t vlane;
13859 vlane = vget_lane_f32(b, c);
13860 b32x2 = vdup_n_f32(vlane);
13861 return vmul_f32(a, b32x2);
13862}
13863
13864_NEON2SSE_GLOBAL uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c);
13865#define vmul_lane_u16 vmul_lane_s16
13866
13867_NEON2SSE_GLOBAL uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c);
13868#define vmul_lane_u32 vmul_lane_s32
13869
13870_NEON2SSESTORAGE int16x8_t vmulq_lane_s16(int16x8_t a, int16x4_t b, __constrange(0,3) int c);
13871_NEON2SSE_INLINE int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c)
13872{
13873 int16x8_t b16x8;
13874 int16_t vlane;
13875 vlane = vget_lane_s16(b, c);
13876 b16x8 = vdupq_n_s16(vlane);
13877 return vmulq_s16(a, b16x8);
13878}
13879
13880_NEON2SSESTORAGE int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c);
13881_NEON2SSE_INLINE int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c)
13882{
13883 int32x4_t b32x4;
13884 int32_t vlane;
13885 vlane = vget_lane_s32(b, c);
13886 b32x4 = vdupq_n_s32(vlane);
13887 return vmulq_s32(a, b32x4);
13888}
13889
13890_NEON2SSESTORAGE float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c);
13891_NEON2SSE_INLINE float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c)
13892{
13893 float32x4_t b32x4;
13894 float32_t vlane;
13895 vlane = vget_lane_f32(b, c);
13896 b32x4 = vdupq_n_f32(vlane);
13897 return vmulq_f32(a, b32x4);
13898}
13899
13900_NEON2SSE_GLOBAL uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c);
13901#define vmulq_lane_u16 vmulq_lane_s16
13902
13903_NEON2SSE_GLOBAL uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c);
13904#define vmulq_lane_u32 vmulq_lane_s32
13905
13906//**** Vector long multiply with scalar ************
13907_NEON2SSESTORAGE int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0]
13908_NEON2SSE_INLINE int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2) // VMULL.S16 q0,d0,d0[0]
13909{
13910 int16x4_t b16x4;
13911 b16x4 = vdup_n_s16(val2);
13912 return vmull_s16(vec1, b16x4);
13913}
13914
13915_NEON2SSESTORAGE int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0]
13916_NEON2SSE_INLINE int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2) // VMULL.S32 q0,d0,d0[0]
13917{
13918 int32x2_t b32x2;
13919 b32x2 = vdup_n_s32(val2);
13920 return vmull_s32(vec1, b32x2);
13921}
13922
13923_NEON2SSESTORAGE uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.s16 q0,d0,d0[0]
13924_NEON2SSE_INLINE uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2) // VMULL.s16 q0,d0,d0[0]
13925{
13926 uint16x4_t b16x4;
13927 b16x4 = vdup_n_s16(val2);
13928 return vmull_u16(vec1, b16x4);
13929}
13930
13931_NEON2SSESTORAGE uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0]
13932_NEON2SSE_INLINE uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2) // VMULL.U32 q0,d0,d0[0]
13933{
13934 uint32x2_t b32x2;
13935 b32x2 = vdup_n_u32(val2);
13936 return vmull_u32(vec1, b32x2);
13937}
13938
13939//**** Vector long multiply by scalar ****
13940_NEON2SSESTORAGE int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0]
13941_NEON2SSE_INLINE int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VMULL.S16 q0,d0,d0[0]
13942{
13943 int16_t vlane;
13944 int16x4_t b;
13945 vlane = vget_lane_s16(val2, val3);
13946 b = vdup_n_s16(vlane);
13947 return vmull_s16(vec1, b);
13948}
13949
13950_NEON2SSESTORAGE int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0]
13951_NEON2SSE_INLINE int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3) // VMULL.S32 q0,d0,d0[0]
13952{
13953 int32_t vlane;
13954 int32x2_t b;
13955 vlane = vget_lane_s32(val2, val3);
13956 b = vdup_n_s32(vlane);
13957 return vmull_s32(vec1, b);
13958}
13959
_NEON2SSESTORAGE uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.U16 q0,d0,d0[0]
_NEON2SSE_INLINE uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3) // VMULL.U16 q0,d0,d0[0]
13962{
13963 uint16_t vlane;
13964 uint16x4_t b;
    vlane = vget_lane_u16(val2, val3);
    b = vdup_n_u16(vlane);
13967 return vmull_u16(vec1, b);
13968}
13969
13970_NEON2SSESTORAGE uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0]
13971_NEON2SSE_INLINE uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3) // VMULL.U32 q0,d0,d0[0]
13972{
13973 uint32_t vlane;
13974 uint32x2_t b;
13975 vlane = vget_lane_u32(val2, val3);
13976 b = vdup_n_u32(vlane);
13977 return vmull_u32(vec1, b);
13978}
13979
13980//********* Vector saturating doubling long multiply with scalar *******************
13981_NEON2SSESTORAGE int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0]
13982_NEON2SSE_INLINE int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2)
13983{
    //the serial solution may be faster due to saturation
13985 int16x4_t b;
13986 b = vdup_n_s16(val2);
13987 return vqdmull_s16(vec1, b);
13988}
13989
13990_NEON2SSESTORAGE int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0]
13991_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_SERIAL)
13992{
13993 int32x2_t b;
13994 b = vdup_n_s32(val2);
13995 return vqdmull_s32(vec1,b); //slow serial function!!!!
13996}
13997
13998//************* Vector saturating doubling long multiply by scalar ***********************************************
13999_NEON2SSESTORAGE int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0]
14000_NEON2SSE_INLINE int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3)
14001{
14002 int16_t c;
14003 int16x4_t scalar;
14004 c = vget_lane_s16(val2, val3);
14005 scalar = vdup_n_s16(c);
14006 return vqdmull_s16(vec1, scalar);
14007}
14008
14009
14010_NEON2SSESTORAGE int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULL.S32 q0,d0,d0[0]
14011_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_SERIAL)
14012{
14013 int32_t c;
14014 int32x2_t scalar;
14015 c = vget_lane_s32(val2, val3);
14016 scalar = vdup_n_s32(c);
14017 return vqdmull_s32(vec1,scalar); //slow serial function!!!!
14018}
14019
14020// *****Vector saturating doubling multiply high with scalar *****
14021_NEON2SSESTORAGE int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQDMULH.S16 d0,d0,d0[0]
14022_NEON2SSE_INLINE int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2)
14023{
14024 int16x4_t res64;
14025 return64(vqdmulhq_n_s16(_pM128i(vec1), val2));
14026}
14027
14028_NEON2SSESTORAGE int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQDMULH.S32 d0,d0,d0[0]
14029_NEON2SSE_INLINE int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2)
14030{
14031 int32x2_t res64;
14032 return64(vqdmulhq_n_s32(_pM128i(vec1), val2));
14033}
14034
14035_NEON2SSESTORAGE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQDMULH.S16 q0,q0,d0[0]
14036_NEON2SSE_INLINE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2) // VQDMULH.S16 q0,q0,d0[0]
14037{
    //solution may not be optimal
14039 int16x8_t scalar;
14040 scalar = vdupq_n_s16(val2);
14041 return vqdmulhq_s16(vec1, scalar);
14042}
14043
14044_NEON2SSESTORAGE int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQDMULH.S32 q0,q0,d0[0]
14045_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
14046{
14047 int32x4_t scalar;
14048 scalar = vdupq_n_s32(val2);
14049 return vqdmulhq_s32(vec1, scalar);
14050}
14051
14052//***** Vector saturating doubling multiply high by scalar ****************
14053_NEON2SSESTORAGE int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 d0,d0,d0[0]
14054_NEON2SSE_INLINE int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQDMULH.S16 d0,d0,d0[0]
14055{
    //solution may not be optimal
14057 int16_t vlane;
14058 int16x4_t scalar;
14059 vlane = vget_lane_s16(val2, val3);
14060 scalar = vdup_n_s16(vlane);
14061 return vqdmulh_s16(vec1, scalar);
14062}
14063
14064_NEON2SSESTORAGE int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 d0,d0,d0[0]
14065_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
14066{
14067 int32_t vlane;
14068 int32x2_t scalar;
14069 vlane = vget_lane_s32(val2, val3);
14070 scalar = vdup_n_s32(vlane);
14071 return vqdmulh_s32(vec1, scalar);
14072}
14073
14074_NEON2SSESTORAGE int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 q0,q0,d0[0]
14075_NEON2SSE_INLINE int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQDMULH.S16 q0,q0,d0[0]
14076{
    //solution may not be optimal
14078 int16_t vlane;
14079 int16x8_t scalar;
14080 vlane = vget_lane_s16(val2, val3);
14081 scalar = vdupq_n_s16(vlane );
14082 return vqdmulhq_s16(vec1, scalar);
14083}
14084
14085_NEON2SSESTORAGE int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 q0,q0,d0[0]
14086_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
14087{
    //solution may not be optimal
14089 int32_t vlane;
14090 int32x4_t scalar;
14091 vlane = vgetq_lane_s32(_pM128i(val2), val3);
14092 scalar = vdupq_n_s32(vlane );
14093 return vqdmulhq_s32(vec1, scalar);
14094}
14095
14096//******** Vector saturating rounding doubling multiply high with scalar ***
14097_NEON2SSESTORAGE int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0]
14098_NEON2SSE_INLINE int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2) // VQRDMULH.S16 d0,d0,d0[0]
14099{
    //solution may not be optimal
14101 int16x4_t scalar;
14102 scalar = vdup_n_s16(val2);
14103 return vqrdmulh_s16(vec1, scalar);
14104}
14105
14106_NEON2SSESTORAGE int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQRDMULH.S32 d0,d0,d0[0]
14107_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
14108{
14109 int32x2_t scalar;
14110 scalar = vdup_n_s32(val2);
14111 return vqrdmulh_s32(vec1, scalar);
14112}
14113
14114_NEON2SSESTORAGE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0]
14115_NEON2SSE_INLINE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2) // VQRDMULH.S16 q0,q0,d0[0]
14116{
    //solution may not be optimal
14118 int16x8_t scalar;
14119 scalar = vdupq_n_s16(val2);
14120 return vqrdmulhq_s16(vec1, scalar);
14121}
14122
14123_NEON2SSESTORAGE int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0]
14124_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
14125{
14126 int32x4_t scalar;
14127 scalar = vdupq_n_s32(val2);
14128 return vqrdmulhq_s32(vec1, scalar);
14129}
14130
14131//********* Vector rounding saturating doubling multiply high by scalar ****
14132_NEON2SSESTORAGE int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0]
14133_NEON2SSE_INLINE int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQRDMULH.S16 d0,d0,d0[0]
14134{
    //solution may not be optimal
14136 int16_t vlane;
14137 int16x4_t scalar;
14138 vlane = vget_lane_s16(val2, val3);
14139 scalar = vdup_n_s16(vlane);
14140 return vqrdmulh_s16(vec1, scalar);
14141}
14142
14143_NEON2SSESTORAGE int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0]
14144_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
14145{
14146 int32_t vlane;
14147 int32x2_t scalar;
14148 vlane = vget_lane_s32(val2, val3);
14149 scalar = vdup_n_s32(vlane);
14150 return vqrdmulh_s32(vec1, scalar);
14151}
14152
14153_NEON2SSESTORAGE int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0]
14154_NEON2SSE_INLINE int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQRDMULH.S16 q0,q0,d0[0]
14155{
    //solution may not be optimal
14157 int16_t vlane;
14158 int16x8_t scalar;
14159 vlane = vget_lane_s16(val2, val3);
14160 scalar = vdupq_n_s16(vlane);
14161 return vqrdmulhq_s16(vec1, scalar);
14162}
14163
14164_NEON2SSESTORAGE int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0]
14165_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
14166{
    //solution may not be optimal
14168 int32_t vlane;
14169 int32x4_t scalar;
14170 vlane = vgetq_lane_s32(_pM128i(val2), val3);
14171 scalar = vdupq_n_s32(vlane );
14172 return vqrdmulhq_s32(vec1, scalar);
14173}
14174
14175//**************Vector multiply accumulate with scalar *******************
14176_NEON2SSESTORAGE int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0]
14177_NEON2SSE_INLINE int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) // VMLA.I16 d0, d0, d0[0]
14178{
14179 int16x4_t scalar;
14180 scalar = vdup_n_s16(c);
14181 return vmla_s16(a, b, scalar);
14182}
14183
14184_NEON2SSESTORAGE int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0]
14185_NEON2SSE_INLINE int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) // VMLA.I32 d0, d0, d0[0]
14186{
14187 int32x2_t scalar;
14188 scalar = vdup_n_s32(c);
14189 return vmla_s32(a, b, scalar);
14190}
14191
14192_NEON2SSE_GLOBAL uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLA.I16 d0, d0, d0[0]
14193#define vmla_n_u16 vmla_n_s16
14194
14195
14196_NEON2SSE_GLOBAL uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0]
14197#define vmla_n_u32 vmla_n_s32
14198
14199
14200_NEON2SSESTORAGE float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0]
14201_NEON2SSE_INLINE float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) // VMLA.F32 d0, d0, d0[0]
14202{
14203 float32x2_t scalar;
14204 scalar = vdup_n_f32(c);
14205 return vmla_f32(a, b, scalar);
14206}
14207
14208_NEON2SSESTORAGE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0]
14209_NEON2SSE_INLINE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLA.I16 q0, q0, d0[0]
14210{
14211 int16x8_t scalar;
14212 scalar = vdupq_n_s16(c);
14213 return vmlaq_s16(a,b,scalar);
14214}
14215
14216_NEON2SSESTORAGE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0]
14217_NEON2SSE_INLINE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLA.I32 q0, q0, d0[0]
14218{
14219 int32x4_t scalar;
14220 scalar = vdupq_n_s32(c);
14221 return vmlaq_s32(a,b,scalar);
14222}
14223
14224_NEON2SSE_GLOBAL uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0]
14225#define vmlaq_n_u16 vmlaq_n_s16
14226
14227_NEON2SSE_GLOBAL uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0]
14228#define vmlaq_n_u32 vmlaq_n_s32
14229
14230_NEON2SSESTORAGE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0]
14231_NEON2SSE_INLINE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) // VMLA.F32 q0, q0, d0[0]
14232{
14233 float32x4_t scalar;
14234 scalar = vdupq_n_f32(c);
14235 return vmlaq_f32(a,b,scalar);
14236}
14237
14238//************Vector widening multiply accumulate with scalar****************************
14239_NEON2SSESTORAGE int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0]
14240_NEON2SSE_INLINE int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) // VMLAL.S16 q0, d0, d0[0]
14241{
14242 int16x4_t vc;
14243 vc = vdup_n_s16(c);
14244 return vmlal_s16(a, b, vc);
14245}
14246
14247_NEON2SSESTORAGE int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0]
14248_NEON2SSE_INLINE int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) // VMLAL.S32 q0, d0, d0[0]
14249{
14250 int32x2_t vc;
14251 vc = vdup_n_s32(c);
14252 return vmlal_s32(a, b, vc);
14253}
14254
_NEON2SSESTORAGE uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.U16 q0, d0, d0[0]
_NEON2SSE_INLINE uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) // VMLAL.U16 q0, d0, d0[0]
14257{
14258 uint16x4_t vc;
14259 vc = vdup_n_u16(c);
14260 return vmlal_u16(a, b, vc);
14261}
14262
14263_NEON2SSESTORAGE uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0]
14264_NEON2SSE_INLINE uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) // VMLAL.U32 q0, d0, d0[0]
14265{
14266 uint32x2_t vc;
14267 vc = vdup_n_u32(c);
14268 return vmlal_u32(a, b, vc);
14269}
14270
14271//************ Vector widening saturating doubling multiply accumulate with scalar **************
14272_NEON2SSESTORAGE int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0]
14273_NEON2SSE_INLINE int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c)
14274{
    //not an optimal SIMD solution, the serial one may be faster
14276 int16x4_t vc;
14277 vc = vdup_n_s16(c);
14278 return vqdmlal_s16(a, b, vc);
14279}
14280
14281_NEON2SSESTORAGE int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0]
14282_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c), _NEON2SSE_REASON_SLOW_SERIAL)
14283{
14284 int32x2_t vc;
14285 vc = vdup_n_s32(c);
14286 return vqdmlal_s32(a, b, vc);
14287}
14288
14289//******** Vector multiply subtract with scalar **************
14290_NEON2SSESTORAGE int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0]
14291_NEON2SSE_INLINE int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) // VMLS.I16 d0, d0, d0[0]
14292{
14293 int16x4_t vc;
14294 vc = vdup_n_s16(c);
14295 return vmls_s16(a, b, vc);
14296}
14297
14298_NEON2SSESTORAGE int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0]
14299_NEON2SSE_INLINE int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) // VMLS.I32 d0, d0, d0[0]
14300{
14301 int32x2_t vc;
14302 vc = vdup_n_s32(c);
14303 return vmls_s32(a, b, vc);
14304}
14305
14306_NEON2SSESTORAGE uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0]
14307_NEON2SSE_INLINE uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) // VMLS.I16 d0, d0, d0[0]
14308{
14309 uint16x4_t vc;
    vc = vdup_n_u16(c);
    return vmls_u16(a, b, vc);
14312}
14313
14314_NEON2SSESTORAGE uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0]
14315_NEON2SSE_INLINE uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) // VMLS.I32 d0, d0, d0[0]
14316{
14317 uint32x2_t vc;
14318 vc = vdup_n_u32(c);
14319 return vmls_u32(a, b, vc);
14320}
14321
14322_NEON2SSESTORAGE float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0]
14323_NEON2SSE_INLINE float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c)
14324{
14325 float32x2_t res;
14326 res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0] * c;
14327 res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1] * c;
14328 return res;
14329}
14330
14331_NEON2SSESTORAGE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0]
14332_NEON2SSE_INLINE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLS.I16 q0, q0, d0[0]
14333{
14334 int16x8_t vc;
14335 vc = vdupq_n_s16(c);
14336 return vmlsq_s16(a, b,vc);
14337}
14338
14339_NEON2SSESTORAGE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0]
14340_NEON2SSE_INLINE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLS.I32 q0, q0, d0[0]
14341{
14342 int32x4_t vc;
14343 vc = vdupq_n_s32(c);
14344 return vmlsq_s32(a,b,vc);
14345}
14346
14347_NEON2SSESTORAGE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0]
14348_NEON2SSE_INLINE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) // VMLS.I16 q0, q0, d0[0]
14349{
14350 uint16x8_t vc;
14351 vc = vdupq_n_u16(c);
14352 return vmlsq_u16(a,b,vc);
14353}
14354
14355_NEON2SSESTORAGE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0]
14356_NEON2SSE_INLINE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) // VMLS.I32 q0, q0, d0[0]
14357{
14358 uint32x4_t vc;
14359 vc = vdupq_n_u32(c);
14360 return vmlsq_u32(a,b,vc);
14361}
14362
14363_NEON2SSESTORAGE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0]
14364_NEON2SSE_INLINE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c)
14365{
14366 float32x4_t vc;
14367 vc = vdupq_n_f32(c);
14368 return vmlsq_f32(a,b,vc);
14369}
14370
14371//**** Vector widening multiply subtract with scalar ******
14372_NEON2SSESTORAGE int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0]
14373_NEON2SSE_INLINE int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) // VMLSL.S16 q0, d0, d0[0]
14374{
14375 int16x4_t vc;
14376 vc = vdup_n_s16(c);
14377 return vmlsl_s16(a, b, vc);
14378}
14379
14380_NEON2SSESTORAGE int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0]
14381_NEON2SSE_INLINE int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) // VMLSL.S32 q0, d0, d0[0]
14382{
14383 int32x2_t vc;
14384 vc = vdup_n_s32(c);
14385 return vmlsl_s32(a, b, vc);
14386}
14387
_NEON2SSESTORAGE uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLSL.U16 q0, d0, d0[0]
_NEON2SSE_INLINE uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) // VMLSL.U16 q0, d0, d0[0]
14390{
14391 uint16x4_t vc;
14392 vc = vdup_n_u16(c);
14393 return vmlsl_u16(a, b, vc);
14394}
14395
14396_NEON2SSESTORAGE uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0]
14397_NEON2SSE_INLINE uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) // VMLSL.U32 q0, d0, d0[0]
14398{
14399 uint32x2_t vc;
14400 vc = vdup_n_u32(c);
14401 return vmlsl_u32(a, b, vc);
14402}
14403
14404//***** Vector widening saturating doubling multiply subtract with scalar *********
14405//**********************************************************************************
14406_NEON2SSESTORAGE int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0]
14407_NEON2SSE_INLINE int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c)
14408{
14409 int16x4_t vc;
14410 vc = vdup_n_s16(c);
14411 return vqdmlsl_s16(a, b, vc);
14412}
14413
14414_NEON2SSESTORAGE int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0]
14415_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c), _NEON2SSE_REASON_SLOW_SERIAL)
14416{
14417 int32x2_t vc;
14418 vc = vdup_n_s32(c);
14419 return vqdmlsl_s32(a, b, vc);
14420}
14421
14422//******************* Vector extract ***********************************************
14423//*************************************************************************************
14424//VEXT (Vector Extract) extracts elements from the bottom end of the second operand
14425//vector and the top end of the first, concatenates them, and places the result in the destination vector
//the result takes c elements from the bottom end of the second operand and (n-c) elements from the top end of the first, where n is the number of elements
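//A minimal usage sketch of the mapping below (illustrative only, kept out of compilation;
//the helper name and the expected values are ours, assuming the vld1q/vst1q/vdupq_n
//intrinsics defined earlier in this header):
#if 0
static void neon2sse_example_vext(void)
{
    _NEON2SSE_ALIGN_16 uint8_t in[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
    _NEON2SSE_ALIGN_16 uint8_t out[16];
    uint8x16_t va = vld1q_u8(in);       //0..15
    uint8x16_t vb = vdupq_n_u8(0xff);   //all 0xff
    //the top 13 bytes of va followed by the bottom 3 bytes of vb,
    //i.e. the same byte-wise concatenation _mm_alignr_epi8(b, a, 3) performs
    uint8x16_t r = vextq_u8(va, vb, 3);
    vst1q_u8(out, r);                   //out = {3,4,...,15, 0xff,0xff,0xff}
}
#endif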
14427_NEON2SSESTORAGE int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
14428_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c),_NEON2SSE_REASON_SLOW_SERIAL)
14429{
14430 int8x8_t res;
14431 int i;
14432 for (i = 0; i<8 - c; i++) {
14433 res.m64_i8[i] = a.m64_i8[i + c];
14434 }
14435 for(i = 0; i<c; i++) {
14436 res.m64_i8[8 - c + i] = b.m64_i8[i];
14437 }
14438 return res;
14439}
14440
14441_NEON2SSE_GLOBAL uint8x8_t vext_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
14442#define vext_u8 vext_s8
14443//same result tested
14444
14445_NEON2SSE_GLOBAL poly8x8_t vext_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
14446#define vext_p8 vext_u8
14447
14448_NEON2SSESTORAGE int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c), _NEON2SSE_REASON_SLOW_SERIAL)
14450{
14451 int16x4_t res;
14452 int i;
14453 for (i = 0; i<4 - c; i++) {
14454 res.m64_i16[i] = a.m64_i16[i + c];
14455 }
14456 for(i = 0; i<c; i++) {
14457 res.m64_i16[4 - c + i] = b.m64_i16[i];
14458 }
14459 return res;
14460}
14461
14462_NEON2SSE_GLOBAL uint16x4_t vext_u16(uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
14463#define vext_u16 vext_s16
14464
14465_NEON2SSE_GLOBAL poly16x4_t vext_p16(poly16x4_t a, poly16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
14466#define vext_p16 vext_s16
14467
14468_NEON2SSESTORAGE int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
14469_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c), _NEON2SSE_REASON_SLOW_SERIAL)
14470{
14471 int32x2_t res;
14472 if (c==0) {
14473 res.m64_i32[0] = a.m64_i32[0];
14474 res.m64_i32[1] = a.m64_i32[1];
14475 } else {
14476 res.m64_i32[0] = a.m64_i32[1];
14477 res.m64_i32[1] = b.m64_i32[0];
14478 }
14479 return res;
14480}
14481
14482_NEON2SSESTORAGE float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
14483_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c), _NEON2SSE_REASON_SLOW_SERIAL)
14484{
14485 float32x2_t res;
14486 if (c==0) {
14487 res.m64_f32[0] = a.m64_f32[0];
14488 res.m64_f32[1] = a.m64_f32[1];
14489 } else {
14490 res.m64_f32[0] = a.m64_f32[1];
14491 res.m64_f32[1] = b.m64_f32[0];
14492 }
14493 return res;
14494}
14495
14496_NEON2SSE_GLOBAL uint32x2_t vext_u32(uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
14497#define vext_u32 vext_s32
14498
14499
14500_NEON2SSE_GLOBAL int64x1_t vext_s64(int64x1_t a, int64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
14501#define vext_s64(a,b,c) a
14502
14503_NEON2SSE_GLOBAL uint64x1_t vext_u64(uint64x1_t a, uint64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
14504#define vext_u64(a,b,c) a
14505
14506_NEON2SSE_GLOBAL int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
14507#define vextq_s8(a,b,c) _MM_ALIGNR_EPI8 (b,a,c)
14508
14509_NEON2SSE_GLOBAL uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
14510#define vextq_u8(a,b,c) _MM_ALIGNR_EPI8 (b,a,c)
14511
14512_NEON2SSE_GLOBAL poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
14513#define vextq_p8 vextq_s8
14514
14515_NEON2SSE_GLOBAL int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
14516#define vextq_s16(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 2)
14517
14518_NEON2SSE_GLOBAL uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
14519#define vextq_u16(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 2)
14520
14521_NEON2SSE_GLOBAL poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
14522#define vextq_p16 vextq_s16
14523
14524_NEON2SSE_GLOBAL int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
14525#define vextq_s32(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 4)
14526
14527_NEON2SSE_GLOBAL uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
14528#define vextq_u32(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 4)
14529
_NEON2SSE_GLOBAL float32x4_t vextq_f32(float32x4_t a, float32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
14531#define vextq_f32(a,b,c) _M128(vextq_s32(_M128i(a),_M128i(b),c) )
14532
14533_NEON2SSE_GLOBAL int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
14534#define vextq_s64(a,b,c) _MM_ALIGNR_EPI8(b,a,c * 8)
14535
14536_NEON2SSE_GLOBAL uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
14537#define vextq_u64(a,b,c) _MM_ALIGNR_EPI8(b,a,c * 8)
14538
14539//************ Reverse vector elements (swap endianness)*****************
14540//*************************************************************************
14541//VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide.
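//A small illustrative sketch (kept out of compilation; the helper name is ours): VREV64 on
//32-bit lanes simply swaps the two lanes inside each 64-bit half, which is what the
//_mm_shuffle_epi32/_mm_shuffle_ps mappings below do.
#if 0
static void neon2sse_example_vrev64(void)
{
    _NEON2SSE_ALIGN_16 int32_t in[4] = {0, 1, 2, 3};
    _NEON2SSE_ALIGN_16 int32_t out[4];
    int32x4_t v = vld1q_s32(in);
    vst1q_s32(out, vrev64q_s32(v));     //out = {1, 0, 3, 2}
}
#endif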
14542_NEON2SSESTORAGE int8x8_t vrev64_s8(int8x8_t vec); // VREV64.8 d0,d0
14543_NEON2SSE_INLINE int8x8_t vrev64_s8(int8x8_t vec)
14544{
14545 int8x8_t res64;
14546 __m128i res;
14547 res = vrev64q_s8(_pM128i(vec));
14548 return64(res);
14549}
14550
14551_NEON2SSESTORAGE int16x4_t vrev64_s16(int16x4_t vec); // VREV64.16 d0,d0
14552_NEON2SSE_INLINE int16x4_t vrev64_s16(int16x4_t vec)
14553{
14554 int16x4_t res64;
14555 __m128i res;
14556 res = vrev64q_s16(_pM128i(vec));
14557 return64(res);
14558}
14559
14560_NEON2SSESTORAGE int32x2_t vrev64_s32(int32x2_t vec); // VREV64.32 d0,d0
14561_NEON2SSE_INLINE int32x2_t vrev64_s32(int32x2_t vec)
14562{
14563 int32x2_t res;
14564 res.m64_i32[0] = vec.m64_i32[1];
14565 res.m64_i32[1] = vec.m64_i32[0];
14566 return res;
14567}
14568
14569_NEON2SSE_GLOBAL uint8x8_t vrev64_u8(uint8x8_t vec); // VREV64.8 d0,d0
14570#define vrev64_u8 vrev64_s8
14571
14572_NEON2SSE_GLOBAL uint16x4_t vrev64_u16(uint16x4_t vec); // VREV64.16 d0,d0
14573#define vrev64_u16 vrev64_s16
14574
14575_NEON2SSE_GLOBAL uint32x2_t vrev64_u32(uint32x2_t vec); // VREV64.32 d0,d0
14576#define vrev64_u32 vrev64_s32
14577
14578_NEON2SSE_GLOBAL poly8x8_t vrev64_p8(poly8x8_t vec); // VREV64.8 d0,d0
14579#define vrev64_p8 vrev64_u8
14580
14581_NEON2SSE_GLOBAL poly16x4_t vrev64_p16(poly16x4_t vec); // VREV64.16 d0,d0
14582#define vrev64_p16 vrev64_u16
14583
14584_NEON2SSESTORAGE float32x2_t vrev64_f32(float32x2_t vec); // VREV64.32 d0,d0
14585_NEON2SSE_INLINE float32x2_t vrev64_f32(float32x2_t vec)
14586{
14587 float32x2_t res;
14588 res.m64_f32[0] = vec.m64_f32[1];
14589 res.m64_f32[1] = vec.m64_f32[0];
14590 return res;
14591}
14592
14593_NEON2SSESTORAGE int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0
14594_NEON2SSE_INLINE int8x16_t vrev64q_s8(int8x16_t vec) // VREV64.8 q0,q0
14595{
14596 _NEON2SSE_ALIGN_16 static const int8_t mask_rev_e8[16] = {7,6,5,4,3,2,1,0, 15,14,13,12,11,10,9, 8};
14597 return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev_e8);
14598}
14599
14600_NEON2SSESTORAGE int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0
14601_NEON2SSE_INLINE int16x8_t vrev64q_s16(int16x8_t vec) // VREV64.16 q0,q0
14602{
    //there is no _mm_shuffle_epi16, so _mm_shuffle_epi8 is used with the corresponding mask
14604 _NEON2SSE_ALIGN_16 static const int8_t mask_rev_e16[16] = {6,7, 4,5,2,3,0,1,14,15,12,13,10,11,8,9};
14605 return _mm_shuffle_epi8 (vec, *(__m128i*)mask_rev_e16);
14606}
14607
14608_NEON2SSESTORAGE int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0
14609_NEON2SSE_INLINE int32x4_t vrev64q_s32(int32x4_t vec) // VREV64.32 q0,q0
14610{
14611 return _mm_shuffle_epi32 (vec, 1 | (0 << 2) | (3 << 4) | (2 << 6) );
14612}
14613
14614_NEON2SSE_GLOBAL uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0
14615#define vrev64q_u8 vrev64q_s8
14616
14617_NEON2SSE_GLOBAL uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0
14618#define vrev64q_u16 vrev64q_s16
14619
14620_NEON2SSE_GLOBAL uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0
14621#define vrev64q_u32 vrev64q_s32
14622
14623_NEON2SSE_GLOBAL poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0
14624#define vrev64q_p8 vrev64q_u8
14625
14626_NEON2SSE_GLOBAL poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0
14627#define vrev64q_p16 vrev64q_u16
14628
14629_NEON2SSE_GLOBAL float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0
14630#define vrev64q_f32(vec) _mm_shuffle_ps (vec, vec, _MM_SHUFFLE(2,3, 0,1))
14631
//**************** Reverse elements within 32-bit sets (VREV32) ****************
14633//************************************************************
14634_NEON2SSESTORAGE int8x8_t vrev32_s8(int8x8_t vec); // VREV32.8 d0,d0
14635_NEON2SSE_INLINE int8x8_t vrev32_s8(int8x8_t vec)
14636{
14637 int8x8_t res64;
14638 __m128i res;
14639 res = vrev32q_s8(_pM128i(vec));
14640 return64(res);
14641}
14642
14643_NEON2SSESTORAGE int16x4_t vrev32_s16(int16x4_t vec); // VREV32.16 d0,d0
14644_NEON2SSE_INLINE int16x4_t vrev32_s16(int16x4_t vec)
14645{
14646 int16x4_t res64;
14647 __m128i res;
14648 res = vrev32q_s16(_pM128i(vec));
14649 return64(res);
14650}
14651
14652_NEON2SSE_GLOBAL uint8x8_t vrev32_u8(uint8x8_t vec); // VREV32.8 d0,d0
14653#define vrev32_u8 vrev32_s8
14654
14655_NEON2SSE_GLOBAL uint16x4_t vrev32_u16(uint16x4_t vec); // VREV32.16 d0,d0
14656#define vrev32_u16 vrev32_s16
14657
14658_NEON2SSE_GLOBAL poly8x8_t vrev32_p8(poly8x8_t vec); // VREV32.8 d0,d0
14659#define vrev32_p8 vrev32_u8
14660
14661_NEON2SSE_GLOBAL poly16x4_t vrev32_p16(poly16x4_t vec); // VREV32.16 d0,d0
14662#define vrev32_p16 vrev32_u16
14663
14664_NEON2SSESTORAGE int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0
14665_NEON2SSE_INLINE int8x16_t vrev32q_s8(int8x16_t vec) // VREV32.8 q0,q0
14666{
14667 _NEON2SSE_ALIGN_16 static const int8_t mask_rev_e8[16] = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
14668 return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev_e8);
14669}
14670
14671_NEON2SSESTORAGE int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0
14672_NEON2SSE_INLINE int16x8_t vrev32q_s16(int16x8_t vec) // VREV32.16 q0,q0
14673{
14674 _NEON2SSE_ALIGN_16 static const int8_t mask_rev_e8[16] = {2,3,0,1, 6,7, 4,5, 10,11, 8,9, 14,15,12,13};
14675 return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev_e8);
14676}
14677
14678_NEON2SSE_GLOBAL uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0
14679#define vrev32q_u8 vrev32q_s8
14680
14681_NEON2SSE_GLOBAL uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0
14682#define vrev32q_u16 vrev32q_s16
14683
14684_NEON2SSE_GLOBAL poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0
14685#define vrev32q_p8 vrev32q_u8
14686
14687_NEON2SSE_GLOBAL poly16x8_t vrev32q_p16(poly16x8_t vec); // VREV32.16 q0,q0
14688#define vrev32q_p16 vrev32q_u16
14689
//**************** Reverse elements within 16-bit sets (VREV16) ****************
14691//******************************************************
14692_NEON2SSESTORAGE int8x8_t vrev16_s8(int8x8_t vec); // VREV16.8 d0,d0
14693_NEON2SSE_INLINE int8x8_t vrev16_s8(int8x8_t vec)
14694{
14695 int8x8_t res64;
14696 __m128i res;
14697 res = vrev16q_s8(_pM128i(vec));
14698 return64(res);
14699}
14700
14701_NEON2SSE_GLOBAL uint8x8_t vrev16_u8(uint8x8_t vec); // VREV16.8 d0,d0
14702#define vrev16_u8 vrev16_s8
14703
14704_NEON2SSE_GLOBAL poly8x8_t vrev16_p8(poly8x8_t vec); // VREV16.8 d0,d0
14705#define vrev16_p8 vrev16_u8
14706
14707_NEON2SSESTORAGE int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0
14708_NEON2SSE_INLINE int8x16_t vrev16q_s8(int8x16_t vec) // VREV16.8 q0,q0
14709{
14710 _NEON2SSE_ALIGN_16 static const int8_t mask_rev8[16] = {1,0, 3,2, 5,4, 7,6, 9,8, 11, 10, 13, 12, 15, 14};
14711 return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev8);
14712}
14713
14714_NEON2SSE_GLOBAL uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0
14715#define vrev16q_u8 vrev16q_s8
14716
14717_NEON2SSE_GLOBAL poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0
14718#define vrev16q_p8 vrev16q_u8
14719
14720//*********************************************************************
14721//**************** Other single operand arithmetic *******************
14722//*********************************************************************
14723
14724//*********** Absolute: Vd[i] = |Va[i]| **********************************
14725//************************************************************************
14726_NEON2SSESTORAGE int8x8_t vabs_s8(int8x8_t a); // VABS.S8 d0,d0
14727_NEON2SSE_INLINE int8x8_t vabs_s8(int8x8_t a)
14728{
14729 int8x8_t res64;
14730 __m128i res;
14731 res = _mm_abs_epi8(_pM128i(a));
14732 return64(res);
14733}
14734
14735
14736_NEON2SSESTORAGE int16x4_t vabs_s16(int16x4_t a); // VABS.S16 d0,d0
14737_NEON2SSE_INLINE int16x4_t vabs_s16(int16x4_t a)
14738{
14739 int16x4_t res64;
14740 __m128i res;
14741 res = _mm_abs_epi16(_pM128i(a));
14742 return64(res);
14743}
14744
14745_NEON2SSESTORAGE int32x2_t vabs_s32(int32x2_t a); // VABS.S32 d0,d0
14746_NEON2SSE_INLINE int32x2_t vabs_s32(int32x2_t a)
14747{
14748 int32x2_t res64;
14749 __m128i res;
14750 res = _mm_abs_epi32(_pM128i(a));
14751 return64(res);
14752}
14753
14754_NEON2SSESTORAGE float32x2_t vabs_f32(float32x2_t a); // VABS.F32 d0,d0
14755_NEON2SSE_INLINE float32x2_t vabs_f32(float32x2_t a) // VABS.F32 d0,d0
14756{
14757 float32x4_t res;
14758 __m64_128 res64;
14759 _NEON2SSE_ALIGN_16 static const int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
14760 res = _mm_and_ps (_pM128(a), *(__m128*)c7fffffff); //use 64 low bits only
14761 _M64f(res64, res);
14762 return res64;
14763}
14764
14765_NEON2SSE_GLOBAL int8x16_t vabsq_s8(int8x16_t a); // VABS.S8 q0,q0
14766#define vabsq_s8 _mm_abs_epi8
14767
14768_NEON2SSE_GLOBAL int16x8_t vabsq_s16(int16x8_t a); // VABS.S16 q0,q0
14769#define vabsq_s16 _mm_abs_epi16
14770
14771_NEON2SSE_GLOBAL int32x4_t vabsq_s32(int32x4_t a); // VABS.S32 q0,q0
14772#define vabsq_s32 _mm_abs_epi32
14773
14774_NEON2SSESTORAGE float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0
14775_NEON2SSE_INLINE float32x4_t vabsq_f32(float32x4_t a) // VABS.F32 q0,q0
14776{
14777 _NEON2SSE_ALIGN_16 static const int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
14778 return _mm_and_ps (a, *(__m128*)c7fffffff);
14779}
14780
14781#ifdef _NEON2SSE_64BIT
14782_NEON2SSESTORAGE int64x2_t vabsq_s64(int64x2_t a); // VABS.S64 q0,q0
14783_NEON2SSE_INLINE int64x2_t vabsq_s64(int64x2_t a) // VABS.S64 q0,q0
14784{
14785 __m128i sign = _mm_srai_epi32 (_mm_shuffle_epi32 (a, 0xf5), 31);
14786 return _mm_sub_epi64 (_mm_xor_si128 (a, sign), sign);
14787}
14788
14789_NEON2SSESTORAGE float64x2_t vabsq_f64(float64x2_t a); // VABS.F64 q0,q0
14790_NEON2SSE_INLINE float64x2_t vabsq_f64(float64x2_t a) // VABS.F64 q0,q0
14791{
14792 _NEON2SSE_ALIGN_16 static const int64_t mask[2] = {0x7fffffffffffffffLL, 0x7fffffffffffffffLL};
14793 return _mm_and_pd (a, *(__m128d*)mask);
14794}
14795#endif
14796
14797//****** Saturating absolute: Vd[i] = sat(|Va[i]|) *********************
14798//**********************************************************************
//For signed-integer data types, the absolute value of the most negative value is not representable by the data type, so saturation takes place
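//For example (an illustrative sketch, kept out of compilation; the helper name is ours):
//vqabsq_s8 saturates the single non-representable input, while plain vabsq_s8 wraps it,
//just as _mm_abs_epi8 does.
#if 0
static void neon2sse_example_vqabs(void)
{
    int8x16_t v   = vdupq_n_s8(-128);   //most negative int8_t value
    int8x16_t sat = vqabsq_s8(v);       //every lane becomes +127 (saturated)
    int8x16_t wrp = vabsq_s8(v);        //every lane stays -128 (wrapped)
    (void)sat; (void)wrp;
}
#endif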
14800_NEON2SSESTORAGE int8x8_t vqabs_s8(int8x8_t a); // VQABS.S8 d0,d0
14801_NEON2SSE_INLINE int8x8_t vqabs_s8(int8x8_t a)
14802{
14803 int8x8_t res64;
14804 __m128i res;
14805 res = vqabsq_s8(_pM128i(a));
14806 return64(res);
14807}
14808
14809_NEON2SSESTORAGE int16x4_t vqabs_s16(int16x4_t a); // VQABS.S16 d0,d0
14810_NEON2SSE_INLINE int16x4_t vqabs_s16(int16x4_t a)
14811{
14812 int16x4_t res64;
14813 __m128i res;
14814 res = vqabsq_s16(_pM128i(a));
14815 return64(res);
14816}
14817
14818_NEON2SSESTORAGE int32x2_t vqabs_s32(int32x2_t a); // VQABS.S32 d0,d0
14819_NEON2SSE_INLINE int32x2_t vqabs_s32(int32x2_t a)
14820{
14821 int32x2_t res64;
14822 __m128i res;
14823 res = vqabsq_s32(_pM128i(a));
14824 return64(res);
14825}
14826
14827_NEON2SSESTORAGE int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0
14828_NEON2SSE_INLINE int8x16_t vqabsq_s8(int8x16_t a) // VQABS.S8 q0,q0
14829{
14830 __m128i c_128, abs, abs_cmp;
14831 c_128 = _mm_set1_epi8 (-128); //(int8_t)0x80
14832 abs = _mm_abs_epi8 (a);
14833 abs_cmp = _mm_cmpeq_epi8 (abs, c_128);
14834 return _mm_xor_si128 (abs, abs_cmp);
14835}
14836
14837_NEON2SSESTORAGE int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0
14838_NEON2SSE_INLINE int16x8_t vqabsq_s16(int16x8_t a) // VQABS.S16 q0,q0
14839{
14840 __m128i c_32768, abs, abs_cmp;
14841 c_32768 = _mm_set1_epi16 (-32768); //(int16_t)0x8000
14842 abs = _mm_abs_epi16 (a);
14843 abs_cmp = _mm_cmpeq_epi16 (abs, c_32768);
14844 return _mm_xor_si128 (abs, abs_cmp);
14845}
14846
14847_NEON2SSESTORAGE int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0
14848_NEON2SSE_INLINE int32x4_t vqabsq_s32(int32x4_t a) // VQABS.S32 q0,q0
14849{
14850 __m128i c80000000, abs, abs_cmp;
14851 c80000000 = _mm_set1_epi32 (0x80000000); //most negative value
14852 abs = _mm_abs_epi32 (a);
14853 abs_cmp = _mm_cmpeq_epi32 (abs, c80000000);
14854 return _mm_xor_si128 (abs, abs_cmp);
14855}
14856
14857//*************** Negate: Vd[i] = - Va[i] *************************************
14858//*****************************************************************************
//several negate implementations are possible for SIMD,
//e.g. _mm_sign_epi* (a, vector of negative values), see the sketch after the q-register versions below; the following one gives good performance:
_NEON2SSESTORAGE int8x8_t vneg_s8(int8x8_t a); // VNEG.S8 d0,d0
14862_NEON2SSE_INLINE int8x8_t vneg_s8(int8x8_t a)
14863{
14864 int8x8_t res64;
14865 __m128i res;
14866 res = vnegq_s8(_pM128i(a));
14867 return64(res);
14868}
14869
_NEON2SSESTORAGE int16x4_t vneg_s16(int16x4_t a); // VNEG.S16 d0,d0
14871_NEON2SSE_INLINE int16x4_t vneg_s16(int16x4_t a)
14872{
14873 int16x4_t res64;
14874 __m128i res;
14875 res = vnegq_s16(_pM128i(a));
14876 return64(res);
14877}
14878
_NEON2SSESTORAGE int32x2_t vneg_s32(int32x2_t a); // VNEG.S32 d0,d0
14880_NEON2SSE_INLINE int32x2_t vneg_s32(int32x2_t a)
14881{
14882 int32x2_t res64;
14883 __m128i res;
14884 res = vnegq_s32(_pM128i(a));
14885 return64(res);
14886}
14887
_NEON2SSESTORAGE float32x2_t vneg_f32(float32x2_t a); // VNEG.F32 d0,d0
_NEON2SSE_INLINE float32x2_t vneg_f32(float32x2_t a) // VNEG.F32 d0,d0
14890{
14891 float32x4_t res;
14892 __m64_128 res64;
14893 _NEON2SSE_ALIGN_16 static const uint32_t c80000000[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
14894 res = _mm_xor_ps (_pM128(a), *(__m128*) c80000000); //use low 64 bits
14895 _M64f(res64, res);
14896 return res64;
14897}
14898
_NEON2SSESTORAGE int8x16_t vnegq_s8(int8x16_t a); // VNEG.S8 q0,q0
_NEON2SSE_INLINE int8x16_t vnegq_s8(int8x16_t a) // VNEG.S8 q0,q0
14901{
14902 __m128i zero;
14903 zero = _mm_setzero_si128 ();
14904 return _mm_sub_epi8 (zero, a);
14905} //or _mm_sign_epi8 (a, negative numbers vector)
14906
_NEON2SSESTORAGE int16x8_t vnegq_s16(int16x8_t a); // VNEG.S16 q0,q0
_NEON2SSE_INLINE int16x8_t vnegq_s16(int16x8_t a) // VNEG.S16 q0,q0
14909{
14910 __m128i zero;
14911 zero = _mm_setzero_si128 ();
14912 return _mm_sub_epi16 (zero, a);
14913} //or _mm_sign_epi16 (a, negative numbers vector)
14914
_NEON2SSESTORAGE int32x4_t vnegq_s32(int32x4_t a); // VNEG.S32 q0,q0
_NEON2SSE_INLINE int32x4_t vnegq_s32(int32x4_t a) // VNEG.S32 q0,q0
14917{
14918 __m128i zero;
14919 zero = _mm_setzero_si128 ();
14920 return _mm_sub_epi32 (zero, a);
14921} //or _mm_sign_epi32 (a, negative numbers vector)
14922
_NEON2SSESTORAGE float32x4_t vnegq_f32(float32x4_t a); // VNEG.F32 q0,q0
_NEON2SSE_INLINE float32x4_t vnegq_f32(float32x4_t a) // VNEG.F32 q0,q0
14925{
14926 _NEON2SSE_ALIGN_16 static const uint32_t c80000000[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
14927 return _mm_xor_ps (a, *(__m128*) c80000000);
14928}
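//A minimal sketch of the _mm_sign_epi* alternative mentioned in the integer versions above
//(illustrative only, kept out of compilation; the helper name is ours):
#if 0
static __m128i neon2sse_example_negate_alt(__m128i a)
{
    __m128i minus_one = _mm_set1_epi8(-1);
    //every byte of minus_one is negative, so _mm_sign_epi8 negates each byte of a;
    //like the subtraction from zero used above, -(-128) wraps back to -128
    return _mm_sign_epi8(a, minus_one);
}
#endif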
14929
14930//************** Saturating Negate: sat(Vd[i] = - Va[i]) **************************
14931//***************************************************************************************
//For signed-integer data types, the negation of the most negative value can't be produced without saturation; with saturation the result is the maximum positive value
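//For instance (an illustrative sketch, kept out of compilation; the helper name is ours):
//negating the most negative 16-bit value saturates to the most positive one, which is
//exactly what the saturating subtraction from zero used below provides.
#if 0
static void neon2sse_example_vqneg(void)
{
    int16x8_t v = vdupq_n_s16(-32768);  //INT16_MIN
    int16x8_t r = vqnegq_s16(v);        //every lane becomes +32767
    (void)r;
}
#endif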
_NEON2SSESTORAGE int8x8_t vqneg_s8(int8x8_t a); // VQNEG.S8 d0,d0
14934_NEON2SSE_INLINE int8x8_t vqneg_s8(int8x8_t a)
14935{
14936 int8x8_t res64;
14937 __m128i res;
14938 res = vqnegq_s8(_pM128i(a));
14939 return64(res);
14940}
14941
_NEON2SSESTORAGE int16x4_t vqneg_s16(int16x4_t a); // VQNEG.S16 d0,d0
14943_NEON2SSE_INLINE int16x4_t vqneg_s16(int16x4_t a)
14944{
14945 int16x4_t res64;
14946 __m128i res;
14947 res = vqnegq_s16(_pM128i(a));
14948 return64(res);
14949}
14950
_NEON2SSESTORAGE int32x2_t vqneg_s32(int32x2_t a); // VQNEG.S32 d0,d0
14952_NEON2SSE_INLINE int32x2_t vqneg_s32(int32x2_t a)
14953{
14954 int32x2_t res64;
14955 __m128i res;
14956 res = vqnegq_s32(_pM128i(a));
14957 return64(res);
14958}
14959
_NEON2SSESTORAGE int8x16_t vqnegq_s8(int8x16_t a); // VQNEG.S8 q0,q0
_NEON2SSE_INLINE int8x16_t vqnegq_s8(int8x16_t a) // VQNEG.S8 q0,q0
14962{
14963 __m128i zero;
14964 zero = _mm_setzero_si128 ();
    return _mm_subs_epi8 (zero, a); //saturating subtraction
14966}
14967
_NEON2SSESTORAGE int16x8_t vqnegq_s16(int16x8_t a); // VQNEG.S16 q0,q0
_NEON2SSE_INLINE int16x8_t vqnegq_s16(int16x8_t a) // VQNEG.S16 q0,q0
14970{
14971 __m128i zero;
14972 zero = _mm_setzero_si128 ();
    return _mm_subs_epi16 (zero, a); //saturating subtraction
14974}
14975
_NEON2SSESTORAGE int32x4_t vqnegq_s32(int32x4_t a); // VQNEG.S32 q0,q0
_NEON2SSE_INLINE int32x4_t vqnegq_s32(int32x4_t a) // VQNEG.S32 q0,q0
14978{
    //solution may not be optimal compared with a serial one
14980 __m128i c80000000, zero, sub, cmp;
14981 c80000000 = _mm_set1_epi32 (0x80000000); //most negative value
14982 zero = _mm_setzero_si128 ();
    sub = _mm_sub_epi32 (zero, a); //subtraction
14984 cmp = _mm_cmpeq_epi32 (a, c80000000);
14985 return _mm_xor_si128 (sub, cmp);
14986}
14987
14988//****************** Count leading zeros ********************************
14989//**************************************************************************
//no corresponding vector intrinsics in IA32, so it has to be implemented. While the implementation is effective for 8 bits, it may not be for 16 and 32 bits
14991_NEON2SSESTORAGE int8x8_t vclz_s8(int8x8_t a); // VCLZ.I8 d0,d0
14992_NEON2SSE_INLINE int8x8_t vclz_s8(int8x8_t a)
14993{
14994 int8x8_t res64;
14995 __m128i res;
14996 res = vclzq_s8(_pM128i(a));
14997 return64(res);
14998}
14999
15000_NEON2SSESTORAGE int16x4_t vclz_s16(int16x4_t a); // VCLZ.I16 d0,d0
15001_NEON2SSE_INLINE int16x4_t vclz_s16(int16x4_t a)
15002{
15003 int16x4_t res64;
15004 __m128i res;
15005 res = vclzq_s16(_pM128i(a));
15006 return64(res);
15007}
15008
15009_NEON2SSESTORAGE int32x2_t vclz_s32(int32x2_t a); // VCLZ.I32 d0,d0
15010_NEON2SSE_INLINE int32x2_t vclz_s32(int32x2_t a)
15011{
15012 int32x2_t res64;
15013 __m128i res;
15014 res = vclzq_s32(_pM128i(a));
15015 return64(res);
15016}
15017
15018
15019_NEON2SSE_GLOBAL uint8x8_t vclz_u8(uint8x8_t a); // VCLZ.I8 d0,d0
15020#define vclz_u8 vclz_s8
15021
15022_NEON2SSE_GLOBAL uint16x4_t vclz_u16(uint16x4_t a); // VCLZ.I16 d0,d0
15023#define vclz_u16 vclz_s16
15024
15025_NEON2SSE_GLOBAL uint32x2_t vclz_u32(uint32x2_t a); // VCLZ.I32 d0,d0
15026#define vclz_u32 vclz_s32
15027
15028_NEON2SSESTORAGE int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0
15029_NEON2SSE_INLINE int8x16_t vclzq_s8(int8x16_t a)
15030{
15031 _NEON2SSE_ALIGN_16 static const int8_t mask_CLZ[16] = { /* 0 */ 4,/* 1 */ 3,/* 2 */ 2,/* 3 */ 2,
15032 /* 4 */ 1,/* 5 */ 1,/* 6 */ 1,/* 7 */ 1,
15033 /* 8 */ 0,/* 9 */ 0,/* a */ 0,/* b */ 0,
15034 /* c */ 0,/* d */ 0,/* e */ 0,/* f */ 0 };
15035 __m128i maskLOW, c4, lowclz, mask, hiclz;
15036 maskLOW = _mm_set1_epi8(0x0f); //low 4 bits, don't need masking low to avoid zero if MSB is set - it happens automatically
15037 c4 = _mm_set1_epi8(4);
15038 lowclz = _mm_shuffle_epi8( *(__m128i*)mask_CLZ, a); //uses low 4 bits anyway
15039 mask = _mm_srli_epi16(a, 4); //get high 4 bits as low bits
15040 mask = _mm_and_si128(mask, maskLOW); //low 4 bits, need masking to avoid zero if MSB is set
15041 hiclz = _mm_shuffle_epi8( *(__m128i*) mask_CLZ, mask); //uses low 4 bits anyway
15042 mask = _mm_cmpeq_epi8(hiclz, c4); // shows the need to add lowclz zeros
15043 lowclz = _mm_and_si128(lowclz,mask);
15044 return _mm_add_epi8(lowclz, hiclz);
15045}
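//A scalar reference of the per-byte nibble lookup used above (illustrative only, kept out of
//compilation; the helper name is ours): the same 16-entry table serves both nibbles, and the
//low-nibble count is added only when the high nibble is zero (its lookup returned 4).
#if 0
static uint8_t neon2sse_ref_clz_u8(uint8_t x)
{
    static const uint8_t clz4[16] = {4,3,2,2, 1,1,1,1, 0,0,0,0, 0,0,0,0};
    uint8_t hi = clz4[x >> 4];
    uint8_t lo = clz4[x & 0x0f];
    return (hi == 4) ? (uint8_t)(hi + lo) : hi;  //mirrors the _mm_cmpeq/_mm_and/_mm_add sequence
}
#endif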
15046
15047_NEON2SSESTORAGE int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0
15048_NEON2SSE_INLINE int16x8_t vclzq_s16(int16x8_t a)
15049{
15050 __m128i c7, res8x16, res8x16_swap;
15051 _NEON2SSE_ALIGN_16 static const int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
15052 _NEON2SSE_ALIGN_16 static const uint16_t mask8bit[8] = {0x00ff, 0x00ff, 0x00ff, 0x00ff,0x00ff, 0x00ff, 0x00ff, 0x00ff};
15053 c7 = _mm_srli_epi16(*(__m128i*)mask8bit, 5); //7
15054 res8x16 = vclzq_s8(a);
    res8x16_swap = _mm_shuffle_epi8 (res8x16, *(__m128i*) mask8_sab); //horizontal pairs swap
15056 res8x16 = _mm_and_si128(res8x16, *(__m128i*)mask8bit); //lowclz
15057 res8x16_swap = _mm_and_si128(res8x16_swap, *(__m128i*)mask8bit); //hiclz
15058 c7 = _mm_cmpgt_epi16(res8x16_swap, c7); // shows the need to add lowclz zeros
15059 res8x16 = _mm_and_si128(res8x16, c7); //lowclz
15060 return _mm_add_epi16(res8x16_swap, res8x16);
15061}
15062
15063_NEON2SSESTORAGE int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0
15064_NEON2SSE_INLINE int32x4_t vclzq_s32(int32x4_t a)
15065{
15066 __m128i c55555555, c33333333, c0f0f0f0f, c3f, c32, tmp, tmp1, res;
15067 c55555555 = _mm_set1_epi32(0x55555555);
15068 c33333333 = _mm_set1_epi32(0x33333333);
15069 c0f0f0f0f = _mm_set1_epi32(0x0f0f0f0f);
15070 c3f = _mm_set1_epi32(0x3f);
15071 c32 = _mm_set1_epi32(32);
15072 tmp = _mm_srli_epi32(a, 1);
15073 res = _mm_or_si128(tmp, a); //atmp[i] |= (atmp[i] >> 1);
15074 tmp = _mm_srli_epi32(res, 2);
15075 res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 2);
15076 tmp = _mm_srli_epi32(res, 4);
15077 res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 4);
15078 tmp = _mm_srli_epi32(res, 8);
15079 res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 8);
15080 tmp = _mm_srli_epi32(res, 16);
15081 res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 16);
15082
15083 tmp = _mm_srli_epi32(res, 1);
15084 tmp = _mm_and_si128(tmp, c55555555);
15085 res = _mm_sub_epi32(res, tmp); //atmp[i] -= ((atmp[i] >> 1) & 0x55555555);
15086
15087 tmp = _mm_srli_epi32(res, 2);
15088 tmp = _mm_and_si128(tmp, c33333333);
15089 tmp1 = _mm_and_si128(res, c33333333);
15090 res = _mm_add_epi32(tmp, tmp1); //atmp[i] = (((atmp[i] >> 2) & 0x33333333) + (atmp[i] & 0x33333333));
15091
15092 tmp = _mm_srli_epi32(res, 4);
15093 tmp = _mm_add_epi32(tmp, res);
15094 res = _mm_and_si128(tmp, c0f0f0f0f); //atmp[i] = (((atmp[i] >> 4) + atmp[i]) & 0x0f0f0f0f);
15095
15096 tmp = _mm_srli_epi32(res, 8);
15097 res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 8);
15098
15099 tmp = _mm_srli_epi32(res, 16);
15100 res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 16);
15101
15102 res = _mm_and_si128(res, c3f); //atmp[i] = atmp[i] & 0x0000003f;
15103
15104 return _mm_sub_epi32(c32, res); //res[i] = 32 - atmp[i];
15105}
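//A scalar reference of the 32-bit algorithm above (illustrative only, kept out of compilation;
//the helper name is ours): smear the leading one to the right, count the set bits with the
//classic SWAR popcount, and subtract that count from 32.
#if 0
static int neon2sse_ref_clz_u32(uint32_t x)
{
    x |= x >> 1;  x |= x >> 2;  x |= x >> 4;          //after these shifts every bit below
    x |= x >> 8;  x |= x >> 16;                       //the leading one is set
    x -= (x >> 1) & 0x55555555;                       //SWAR population count
    x  = ((x >> 2) & 0x33333333) + (x & 0x33333333);
    x  = ((x >> 4) + x) & 0x0f0f0f0f;
    x += x >> 8;
    x += x >> 16;
    return 32 - (int)(x & 0x3f);
}
#endif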
15106
15107_NEON2SSE_GLOBAL uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0
15108#define vclzq_u8 vclzq_s8
15109
15110_NEON2SSE_GLOBAL uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0
15111#define vclzq_u16 vclzq_s16
15112
15113_NEON2SSE_GLOBAL uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0
15114#define vclzq_u32 vclzq_s32
15115
15116//************** Count leading sign bits **************************
15117//********************************************************************
15118//VCLS (Vector Count Leading Sign bits) counts the number of consecutive bits following
15119// the topmost bit, that are the same as the topmost bit, in each element in a vector
//No corresponding vector intrinsics in IA32, so it has to be implemented.
//While the implementation is effective for 8 bits, it may not be for 16 and 32 bits
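//A scalar reference of the reduction used below (illustrative only, kept out of compilation;
//the helper name is ours): complement negative inputs so the problem becomes a count of
//leading zeros, then subtract one for the sign bit itself.
#if 0
static int neon2sse_ref_cls_s32(int32_t x)
{
    uint32_t u = (x < 0) ? ~(uint32_t)x : (uint32_t)x;          //same value the a_comb vector holds
    int n = 0;
    while (n < 32 && (u & 0x80000000u) == 0) { u <<= 1; n++; }  //count leading zeros of u
    return n - 1;
}
#endif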
15122_NEON2SSESTORAGE int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0
15123_NEON2SSE_INLINE int8x8_t vcls_s8(int8x8_t a)
15124{
15125 int8x8_t res64;
15126 __m128i res;
15127 res = vclsq_s8(_pM128i(a));
15128 return64(res);
15129}
15130
15131_NEON2SSESTORAGE int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0
15132_NEON2SSE_INLINE int16x4_t vcls_s16(int16x4_t a)
15133{
15134 int16x4_t res64;
15135 __m128i res;
15136 res = vclsq_s16(_pM128i(a));
15137 return64(res);
15138}
15139
15140_NEON2SSESTORAGE int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0
15141_NEON2SSE_INLINE int32x2_t vcls_s32(int32x2_t a)
15142{
15143 int32x2_t res64;
15144 __m128i res;
15145 res = vclsq_s32(_pM128i(a));
15146 return64(res);
15147}
15148
15149_NEON2SSESTORAGE int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0
15150_NEON2SSE_INLINE int8x16_t vclsq_s8(int8x16_t a)
15151{
15152 __m128i cff, c80, c1, a_mask, a_neg, a_pos, a_comb;
15153 cff = _mm_cmpeq_epi8 (a,a); //0xff
15154 c80 = _mm_set1_epi8(-128); //(int8_t)0x80
15155 c1 = _mm_set1_epi8(1);
15156 a_mask = _mm_and_si128(a, c80);
15157 a_mask = _mm_cmpeq_epi8(a_mask, c80); //0xff if negative input and 0 if positive
15158 a_neg = _mm_xor_si128(a, cff);
15159 a_neg = _mm_and_si128(a_mask, a_neg);
15160 a_pos = _mm_andnot_si128(a_mask, a);
15161 a_comb = _mm_or_si128(a_pos, a_neg);
15162 a_comb = vclzq_s8(a_comb);
15163 return _mm_sub_epi8(a_comb, c1);
15164}
15165
15166_NEON2SSESTORAGE int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0
15167_NEON2SSE_INLINE int16x8_t vclsq_s16(int16x8_t a)
15168{
15169 __m128i cffff, c8000, c1, a_mask, a_neg, a_pos, a_comb;
15170 cffff = _mm_cmpeq_epi16(a,a);
15171 c8000 = _mm_slli_epi16(cffff, 15); //0x8000
15172 c1 = _mm_srli_epi16(cffff,15); //0x1
15173 a_mask = _mm_and_si128(a, c8000);
15174 a_mask = _mm_cmpeq_epi16(a_mask, c8000); //0xffff if negative input and 0 if positive
15175 a_neg = _mm_xor_si128(a, cffff);
15176 a_neg = _mm_and_si128(a_mask, a_neg);
15177 a_pos = _mm_andnot_si128(a_mask, a);
15178 a_comb = _mm_or_si128(a_pos, a_neg);
15179 a_comb = vclzq_s16(a_comb);
15180 return _mm_sub_epi16(a_comb, c1);
15181}
15182
15183_NEON2SSESTORAGE int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0
15184_NEON2SSE_INLINE int32x4_t vclsq_s32(int32x4_t a)
15185{
15186 __m128i cffffffff, c80000000, c1, a_mask, a_neg, a_pos, a_comb;
15187 cffffffff = _mm_cmpeq_epi32(a,a);
15188 c80000000 = _mm_slli_epi32(cffffffff, 31); //0x80000000
15189 c1 = _mm_srli_epi32(cffffffff,31); //0x1
15190 a_mask = _mm_and_si128(a, c80000000);
15191 a_mask = _mm_cmpeq_epi32(a_mask, c80000000); //0xffffffff if negative input and 0 if positive
15192 a_neg = _mm_xor_si128(a, cffffffff);
15193 a_neg = _mm_and_si128(a_mask, a_neg);
15194 a_pos = _mm_andnot_si128(a_mask, a);
15195 a_comb = _mm_or_si128(a_pos, a_neg);
15196 a_comb = vclzq_s32(a_comb);
15197 return _mm_sub_epi32(a_comb, c1);
15198}
15199
15200//************************* Count number of set bits ********************************
15201//*************************************************************************************
//No directly corresponding SIMD solution. One option is to extract the elements, widen each to 32 bits and then use the SSE4.2 _mm_popcnt_u32 (unsigned int v) intrinsic on each of them (a sketch of this option follows the definitions below);
15203//another option is to do the following algorithm:
15204
15205_NEON2SSESTORAGE uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0
15206_NEON2SSE_INLINE uint8x8_t vcnt_u8(uint8x8_t a)
15207{
15208 uint8x8_t res64;
15209 __m128i res;
15210 res = vcntq_u8(_pM128i(a));
15211 return64(res);
15212}
15213
15214_NEON2SSE_GLOBAL int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0
15215#define vcnt_s8 vcnt_u8
15216
15217_NEON2SSE_GLOBAL poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0
15218#define vcnt_p8 vcnt_u8
15219
15220_NEON2SSESTORAGE uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0
15221_NEON2SSE_INLINE uint8x16_t vcntq_u8(uint8x16_t a)
15222{
15223 _NEON2SSE_ALIGN_16 static const int8_t mask_POPCOUNT[16] = { /* 0 */ 0,/* 1 */ 1,/* 2 */ 1,/* 3 */ 2,
15224 /* 4 */ 1,/* 5 */ 2,/* 6 */ 2,/* 7 */ 3,
15225 /* 8 */ 1,/* 9 */ 2,/* a */ 2,/* b */ 3,
15226 /* c */ 2,/* d */ 3,/* e */ 3,/* f */ 4};
15227 __m128i maskLOW, mask, lowpopcnt, hipopcnt;
15228 maskLOW = _mm_set1_epi8(0x0f); //low 4 bits, need masking to avoid zero if MSB is set
15229 mask = _mm_and_si128(a, maskLOW);
15230 lowpopcnt = _mm_shuffle_epi8( *(__m128i*)mask_POPCOUNT, mask); //uses low 4 bits anyway
15231 mask = _mm_srli_epi16(a, 4); //get high 4 bits as low bits
15232 mask = _mm_and_si128(mask, maskLOW); //low 4 bits, need masking to avoid zero if MSB is set
15233 hipopcnt = _mm_shuffle_epi8( *(__m128i*) mask_POPCOUNT, mask); //uses low 4 bits anyway
15234 return _mm_add_epi8(lowpopcnt, hipopcnt);
15235}
15236
15237_NEON2SSE_GLOBAL int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0
15238#define vcntq_s8 vcntq_u8
15239
15240_NEON2SSE_GLOBAL poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0
15241#define vcntq_p8 vcntq_u8
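//A minimal sketch of the first option mentioned above, per-byte _mm_popcnt_u32 (illustrative
//only, kept out of compilation; it needs SSE4.2/USE_SSE4 and the helper name is ours). The
//table-based vcntq_u8 above avoids the per-element traffic and works without SSE4:
#if 0
static uint8x16_t neon2sse_example_popcnt_per_byte(uint8x16_t a)
{
    _NEON2SSE_ALIGN_16 uint8_t tmp[16];
    int i;
    vst1q_u8(tmp, a);
    for (i = 0; i < 16; i++) {
        tmp[i] = (uint8_t)_mm_popcnt_u32(tmp[i]); //scalar population count of each byte
    }
    return vld1q_u8(tmp);
}
#endif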
15242
15243//**************************************************************************************
15244//*********************** Logical operations ****************************************
15245//**************************************************************************************
15246//************************** Bitwise not ***********************************
//several bitwise NOT implementations are possible for SIMD, e.g. XOR with all ones (a sketch follows the q-register versions below), but the following one gives good performance
15248_NEON2SSESTORAGE int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0
15249_NEON2SSE_INLINE int8x8_t vmvn_s8(int8x8_t a)
15250{
15251 int8x8_t res64;
15252 __m128i res;
15253 res = vmvnq_s8(_pM128i(a));
15254 return64(res);
15255}
15256
15257_NEON2SSESTORAGE int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0
15258_NEON2SSE_INLINE int16x4_t vmvn_s16(int16x4_t a)
15259{
15260 int16x4_t res64;
15261 __m128i res;
15262 res = vmvnq_s16(_pM128i(a));
15263 return64(res);
15264}
15265
15266_NEON2SSESTORAGE int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0
15267_NEON2SSE_INLINE int32x2_t vmvn_s32(int32x2_t a)
15268{
15269 int32x2_t res64;
15270 __m128i res;
15271 res = vmvnq_s32(_pM128i(a));
15272 return64(res);
15273}
15274
15275_NEON2SSE_GLOBAL uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0
15276#define vmvn_u8 vmvn_s8
15277
15278_NEON2SSE_GLOBAL uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0
15279#define vmvn_u16 vmvn_s16
15280
15281_NEON2SSE_GLOBAL uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0
15282#define vmvn_u32 vmvn_s32
15283
15284_NEON2SSE_GLOBAL poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0
15285#define vmvn_p8 vmvn_u8
15286
15287_NEON2SSESTORAGE int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0
15288_NEON2SSE_INLINE int8x16_t vmvnq_s8(int8x16_t a) // VMVN q0,q0
15289{
15290 __m128i c1;
15291 c1 = _mm_cmpeq_epi8 (a,a); //0xff
15292 return _mm_andnot_si128 (a, c1);
15293}
15294
15295_NEON2SSESTORAGE int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0
15296_NEON2SSE_INLINE int16x8_t vmvnq_s16(int16x8_t a) // VMVN q0,q0
15297{
15298 __m128i c1;
15299 c1 = _mm_cmpeq_epi16 (a,a); //0xffff
15300 return _mm_andnot_si128 (a, c1);
15301}
15302
15303_NEON2SSESTORAGE int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0
15304_NEON2SSE_INLINE int32x4_t vmvnq_s32(int32x4_t a) // VMVN q0,q0
15305{
15306 __m128i c1;
15307 c1 = _mm_cmpeq_epi32 (a,a); //0xffffffff
15308 return _mm_andnot_si128 (a, c1);
15309}
15310
15311_NEON2SSE_GLOBAL uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0
15312#define vmvnq_u8 vmvnq_s8
15313
15314_NEON2SSE_GLOBAL uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0
15315#define vmvnq_u16 vmvnq_s16
15316
15317_NEON2SSE_GLOBAL uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0
15318#define vmvnq_u32 vmvnq_s32
15319
15320_NEON2SSE_GLOBAL poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0
15321#define vmvnq_p8 vmvnq_u8
15322
15323//****************** Bitwise and ***********************
15324//******************************************************
15325_NEON2SSESTORAGE int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0
15326_NEON2SSE_INLINE int8x8_t vand_s8(int8x8_t a, int8x8_t b)
15327{
15328 int8x8_t res64;
15329 return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
15330}
15331
15332_NEON2SSESTORAGE int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0
15333_NEON2SSE_INLINE int16x4_t vand_s16(int16x4_t a, int16x4_t b)
15334{
15335 int16x4_t res64;
15336 return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
15337}
15338
15339_NEON2SSESTORAGE int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0
15340_NEON2SSE_INLINE int32x2_t vand_s32(int32x2_t a, int32x2_t b)
15341{
15342 int32x2_t res64;
15343 return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
15344}
15345
15346
15347_NEON2SSESTORAGE int64x1_t vand_s64(int64x1_t a, int64x1_t b); // VAND d0,d0,d0
15348_NEON2SSE_INLINE int64x1_t vand_s64(int64x1_t a, int64x1_t b)
15349{
15350 int64x1_t res;
15351 res.m64_i64[0] = a.m64_i64[0] & b.m64_i64[0];
15352 return res;
15353}
15354
15355_NEON2SSE_GLOBAL uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0
15356#define vand_u8 vand_s8
15357
15358_NEON2SSE_GLOBAL uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0
15359#define vand_u16 vand_s16
15360
15361_NEON2SSE_GLOBAL uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0
15362#define vand_u32 vand_s32
15363
15364_NEON2SSE_GLOBAL uint64x1_t vand_u64(uint64x1_t a, uint64x1_t b); // VAND d0,d0,d0
15365#define vand_u64 vand_s64
15366
15367
15368_NEON2SSE_GLOBAL int8x16_t vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0
15369#define vandq_s8 _mm_and_si128
15370
15371_NEON2SSE_GLOBAL int16x8_t vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0
15372#define vandq_s16 _mm_and_si128
15373
15374_NEON2SSE_GLOBAL int32x4_t vandq_s32(int32x4_t a, int32x4_t b); // VAND q0,q0,q0
15375#define vandq_s32 _mm_and_si128
15376
15377_NEON2SSE_GLOBAL int64x2_t vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0
15378#define vandq_s64 _mm_and_si128
15379
15380_NEON2SSE_GLOBAL uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0
15381#define vandq_u8 _mm_and_si128
15382
15383_NEON2SSE_GLOBAL uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0
15384#define vandq_u16 _mm_and_si128
15385
15386_NEON2SSE_GLOBAL uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0
15387#define vandq_u32 _mm_and_si128
15388
15389_NEON2SSE_GLOBAL uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0
15390#define vandq_u64 _mm_and_si128
15391
15392//******************** Bitwise or *********************************
15393//******************************************************************
15394_NEON2SSESTORAGE int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0
15395_NEON2SSE_INLINE int8x8_t vorr_s8(int8x8_t a, int8x8_t b)
15396{
15397 int8x8_t res64;
15398 return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
15399}
15400
15401
15402_NEON2SSESTORAGE int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0
15403_NEON2SSE_INLINE int16x4_t vorr_s16(int16x4_t a, int16x4_t b)
15404{
15405 int16x4_t res64;
15406 return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
15407}
15408
15409
15410_NEON2SSESTORAGE int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0
15411_NEON2SSE_INLINE int32x2_t vorr_s32(int32x2_t a, int32x2_t b)
15412{
15413 int32x2_t res64;
15414 return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
15415}
15416
15417
15418_NEON2SSESTORAGE int64x1_t vorr_s64(int64x1_t a, int64x1_t b); // VORR d0,d0,d0
15419_NEON2SSE_INLINE int64x1_t vorr_s64(int64x1_t a, int64x1_t b)
15420{
15421 int64x1_t res;
15422 res.m64_i64[0] = a.m64_i64[0] | b.m64_i64[0];
15423 return res;
15424}
15425
15426_NEON2SSE_GLOBAL uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0
15427#define vorr_u8 vorr_s8
15428
15429_NEON2SSE_GLOBAL uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0
15430#define vorr_u16 vorr_s16
15431
15432_NEON2SSE_GLOBAL uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0
15433#define vorr_u32 vorr_s32
15434
15435_NEON2SSE_GLOBAL uint64x1_t vorr_u64(uint64x1_t a, uint64x1_t b); // VORR d0,d0,d0
15436#define vorr_u64 vorr_s64
15437
15438_NEON2SSE_GLOBAL int8x16_t vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0
15439#define vorrq_s8 _mm_or_si128
15440
15441_NEON2SSE_GLOBAL int16x8_t vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0
15442#define vorrq_s16 _mm_or_si128
15443
15444_NEON2SSE_GLOBAL int32x4_t vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0
15445#define vorrq_s32 _mm_or_si128
15446
15447_NEON2SSE_GLOBAL int64x2_t vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0
15448#define vorrq_s64 _mm_or_si128
15449
15450_NEON2SSE_GLOBAL uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0
15451#define vorrq_u8 _mm_or_si128
15452
15453_NEON2SSE_GLOBAL uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0
15454#define vorrq_u16 _mm_or_si128
15455
15456_NEON2SSE_GLOBAL uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0
15457#define vorrq_u32 _mm_or_si128
15458
15459_NEON2SSE_GLOBAL uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0
15460#define vorrq_u64 _mm_or_si128
15461
15462//************* Bitwise exclusive or (EOR or XOR) ******************
15463//*******************************************************************
15464_NEON2SSESTORAGE int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0
15465_NEON2SSE_INLINE int8x8_t veor_s8(int8x8_t a, int8x8_t b)
15466{
15467 int8x8_t res64;
15468 return64(_mm_xor_si128(_pM128i(a),_pM128i(b)));
15469}
15470
15471_NEON2SSE_GLOBAL int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0
15472#define veor_s16 veor_s8
15473
15474_NEON2SSE_GLOBAL int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0
15475#define veor_s32 veor_s8
15476
15477_NEON2SSESTORAGE int64x1_t veor_s64(int64x1_t a, int64x1_t b); // VEOR d0,d0,d0
15478_NEON2SSE_INLINE int64x1_t veor_s64(int64x1_t a, int64x1_t b)
15479{
15480 int64x1_t res;
15481 res.m64_i64[0] = a.m64_i64[0] ^ b.m64_i64[0];
15482 return res;
15483}
15484
15485_NEON2SSE_GLOBAL uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0
15486#define veor_u8 veor_s8
15487
15488_NEON2SSE_GLOBAL uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0
15489#define veor_u16 veor_s16
15490
15491_NEON2SSE_GLOBAL uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0
15492#define veor_u32 veor_s32
15493
15494_NEON2SSE_GLOBAL uint64x1_t veor_u64(uint64x1_t a, uint64x1_t b); // VEOR d0,d0,d0
15495#define veor_u64 veor_s64
15496
15497_NEON2SSE_GLOBAL int8x16_t veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0
15498#define veorq_s8 _mm_xor_si128
15499
15500_NEON2SSE_GLOBAL int16x8_t veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0
15501#define veorq_s16 _mm_xor_si128
15502
15503_NEON2SSE_GLOBAL int32x4_t veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0
15504#define veorq_s32 _mm_xor_si128
15505
15506_NEON2SSE_GLOBAL int64x2_t veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0
15507#define veorq_s64 _mm_xor_si128
15508
15509_NEON2SSE_GLOBAL uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0
15510#define veorq_u8 _mm_xor_si128
15511
15512_NEON2SSE_GLOBAL uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0
15513#define veorq_u16 _mm_xor_si128
15514
15515_NEON2SSE_GLOBAL uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0
15516#define veorq_u32 _mm_xor_si128
15517
15518_NEON2SSE_GLOBAL uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0
15519#define veorq_u64 _mm_xor_si128
15520
15521//********************** Bit Clear **********************************
15522//*******************************************************************
15523//Logical AND complement (AND negation or AND NOT)
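//A minimal sketch of the operand order, kept out of the build: NEON vbicq_s32(a,b) computes a & (~b),
//while _mm_andnot_si128(x,y) computes (~x) & y, hence the swapped arguments in the mappings below.
//The helper name and values are illustrative only:
#if 0
static void vbic_operand_order_example(void)
{
    int32x4_t a = vdupq_n_s32(0x0000ffff);
    int32x4_t b = vdupq_n_s32(0x000000ff);
    int32x4_t r = vbicq_s32(a, b); //each lane becomes 0x0000ff00
    (void)r;
}
#endif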
15524_NEON2SSESTORAGE int8x8_t vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0
15525_NEON2SSE_INLINE int8x8_t vbic_s8(int8x8_t a, int8x8_t b)
15526{
15527 int8x8_t res64;
15528 return64(_mm_andnot_si128(_pM128i(b),_pM128i(a))); //notice the arguments "swap"
15529}
15530
15531_NEON2SSE_GLOBAL int16x4_t vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0
15532#define vbic_s16 vbic_s8
15533
15534_NEON2SSE_GLOBAL int32x2_t vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0
15535#define vbic_s32 vbic_s8
15536
15537_NEON2SSESTORAGE int64x1_t vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0
15538_NEON2SSE_INLINE int64x1_t vbic_s64(int64x1_t a, int64x1_t b)
15539{
15540 int64x1_t res;
15541 res.m64_i64[0] = a.m64_i64[0] & (~b.m64_i64[0]);
15542 return res;
15543}
15544
15545_NEON2SSE_GLOBAL uint8x8_t vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0
15546#define vbic_u8 vbic_s8
15547
15548_NEON2SSE_GLOBAL uint16x4_t vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0
15549#define vbic_u16 vbic_s16
15550
15551_NEON2SSE_GLOBAL uint32x2_t vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0
15552#define vbic_u32 vbic_s32
15553
15554_NEON2SSE_GLOBAL uint64x1_t vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0
15555#define vbic_u64 vbic_s64
15556
15557_NEON2SSE_GLOBAL int8x16_t vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0
15558#define vbicq_s8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15559
15560_NEON2SSE_GLOBAL int16x8_t vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0
15561#define vbicq_s16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15562
15563_NEON2SSE_GLOBAL int32x4_t vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0
15564#define vbicq_s32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15565
15566_NEON2SSE_GLOBAL int64x2_t vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0
15567#define vbicq_s64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15568
15569_NEON2SSE_GLOBAL uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0
15570#define vbicq_u8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15571
15572_NEON2SSE_GLOBAL uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0
15573#define vbicq_u16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15574
15575_NEON2SSE_GLOBAL uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0
15576#define vbicq_u32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15577
15578_NEON2SSE_GLOBAL uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0
15579#define vbicq_u64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15580
15581//**************** Bitwise OR complement ********************************
//**************************************************************************
//no exact IA32 match, it needs to be implemented as follows
15584_NEON2SSESTORAGE int8x8_t vorn_s8(int8x8_t a, int8x8_t b); // VORN d0,d0,d0
15585_NEON2SSE_INLINE int8x8_t vorn_s8(int8x8_t a, int8x8_t b)
15586{
15587 int8x8_t res64;
15588 return64(vornq_s8(_pM128i(a), _pM128i(b)));
15589}
15590
15591
15592_NEON2SSESTORAGE int16x4_t vorn_s16(int16x4_t a, int16x4_t b); // VORN d0,d0,d0
15593_NEON2SSE_INLINE int16x4_t vorn_s16(int16x4_t a, int16x4_t b)
15594{
15595 int16x4_t res64;
15596 return64(vornq_s16(_pM128i(a), _pM128i(b)));
15597}
15598
15599
15600_NEON2SSESTORAGE int32x2_t vorn_s32(int32x2_t a, int32x2_t b); // VORN d0,d0,d0
15601_NEON2SSE_INLINE int32x2_t vorn_s32(int32x2_t a, int32x2_t b)
15602{
15603 int32x2_t res64;
15604 return64(vornq_s32(_pM128i(a), _pM128i(b)));
15605}
15606
15607
15608_NEON2SSESTORAGE int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0
15609_NEON2SSE_INLINE int64x1_t vorn_s64(int64x1_t a, int64x1_t b)
15610{
15611 int64x1_t res;
15612 res.m64_i64[0] = a.m64_i64[0] | (~b.m64_i64[0]);
15613 return res;
15614}
15615
15616_NEON2SSE_GLOBAL uint8x8_t vorn_u8(uint8x8_t a, uint8x8_t b); // VORN d0,d0,d0
15617#define vorn_u8 vorn_s8
15618
15619
15620_NEON2SSE_GLOBAL uint16x4_t vorn_u16(uint16x4_t a, uint16x4_t b); // VORN d0,d0,d0
15621#define vorn_u16 vorn_s16
15622
15623_NEON2SSE_GLOBAL uint32x2_t vorn_u32(uint32x2_t a, uint32x2_t b); // VORN d0,d0,d0
15624#define vorn_u32 vorn_s32
15625
15626_NEON2SSE_GLOBAL uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0
15627#define vorn_u64 vorn_s64
15628
15629
15630_NEON2SSESTORAGE int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0
15631_NEON2SSE_INLINE int8x16_t vornq_s8(int8x16_t a, int8x16_t b) // VORN q0,q0,q0
15632{
15633 __m128i b1;
15634 b1 = vmvnq_s8( b); //bitwise not for b
15635 return _mm_or_si128 (a, b1);
15636}
15637
15638_NEON2SSESTORAGE int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0
15639_NEON2SSE_INLINE int16x8_t vornq_s16(int16x8_t a, int16x8_t b) // VORN q0,q0,q0
15640{
15641 __m128i b1;
15642 b1 = vmvnq_s16( b); //bitwise not for b
15643 return _mm_or_si128 (a, b1);
15644}
15645
15646_NEON2SSESTORAGE int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0
15647_NEON2SSE_INLINE int32x4_t vornq_s32(int32x4_t a, int32x4_t b) // VORN q0,q0,q0
15648{
15649 __m128i b1;
15650 b1 = vmvnq_s32( b); //bitwise not for b
15651 return _mm_or_si128 (a, b1);
15652}
15653
15654_NEON2SSESTORAGE int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0
15655_NEON2SSE_INLINE int64x2_t vornq_s64(int64x2_t a, int64x2_t b)
15656{
15657 __m128i c1, b1;
15658 c1 = _mm_cmpeq_epi8 (a, a); //all ones 0xfffffff...fffff
15659 b1 = _mm_andnot_si128 (b, c1);
15660 return _mm_or_si128 (a, b1);
15661}
15662
15663_NEON2SSESTORAGE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0
15664_NEON2SSE_INLINE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b) // VORN q0,q0,q0
15665{
15666 __m128i b1;
15667 b1 = vmvnq_u8( b); //bitwise not for b
15668 return _mm_or_si128 (a, b1);
15669}
15670
15671_NEON2SSESTORAGE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0
15672_NEON2SSE_INLINE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b) // VORN q0,q0,q0
15673{
15674 __m128i b1;
    b1 = vmvnq_u16( b); //bitwise not for b
15676 return _mm_or_si128 (a, b1);
15677}
15678
15679_NEON2SSESTORAGE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0
15680_NEON2SSE_INLINE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b) // VORN q0,q0,q0
15681{
15682 __m128i b1;
15683 b1 = vmvnq_u32( b); //bitwise not for b
15684 return _mm_or_si128 (a, b1);
15685}
15686_NEON2SSE_GLOBAL uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0
15687#define vornq_u64 vornq_s64
15688
15689//********************* Bitwise Select *****************************
15690//******************************************************************
//Note: on ARM this intrinsic can compile to any of VBSL/VBIF/VBIT depending on register allocation.
15692
15693//VBSL (Bitwise Select) selects each bit for the destination from the first operand if the
15694//corresponding bit of the destination is 1, or from the second operand if the corresponding bit of the destination is 0.
15695
15696//VBIF (Bitwise Insert if False) inserts each bit from the first operand into the destination
15697//if the corresponding bit of the second operand is 0, otherwise leaves the destination bit unchanged
15698
15699//VBIT (Bitwise Insert if True) inserts each bit from the first operand into the destination
15700//if the corresponding bit of the second operand is 1, otherwise leaves the destination bit unchanged.
15701
//only the VBSL variant is implemented for SIMD
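//A usage sketch, kept out of the build: a branchless per-lane maximum built from a compare mask
//and vbslq_s32, i.e. res = (mask & b) | ((~mask) & c). The helper name is illustrative only:
#if 0
static int32x4_t select_larger(int32x4_t b, int32x4_t c)
{
    uint32x4_t mask = vcgtq_s32(b, c); //all-ones lanes where b > c, zero elsewhere
    return vbslq_s32(mask, b, c); //pick b where the mask is set, c otherwise
}
#endif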
15703_NEON2SSESTORAGE int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0
15704_NEON2SSE_INLINE int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c)
15705{
15706 int8x8_t res64;
15707 __m128i res;
15708 res = vbslq_s8(_pM128i(a), _pM128i(b), _pM128i(c));
15709 return64(res);
15710}
15711
15712_NEON2SSE_GLOBAL int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0
15713#define vbsl_s16 vbsl_s8
15714
15715_NEON2SSE_GLOBAL int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0
15716#define vbsl_s32 vbsl_s8
15717
15718_NEON2SSESTORAGE int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0
15719_NEON2SSE_INLINE int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c)
15720{
15721 int64x1_t res;
15722 res.m64_i64[0] = (a.m64_i64[0] & b.m64_i64[0]) | ( (~a.m64_i64[0]) & c.m64_i64[0]);
15723 return res;
15724}
15725
15726_NEON2SSE_GLOBAL uint8x8_t vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0
15727#define vbsl_u8 vbsl_s8
15728
15729_NEON2SSE_GLOBAL uint16x4_t vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0
15730#define vbsl_u16 vbsl_s8
15731
15732_NEON2SSE_GLOBAL uint32x2_t vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0
15733#define vbsl_u32 vbsl_s8
15734
15735_NEON2SSE_GLOBAL uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0
15736#define vbsl_u64 vbsl_s64
15737
15738_NEON2SSESTORAGE float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0
15739_NEON2SSE_INLINE float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c)
15740{
15741 __m128 sel1, sel2;
15742 __m64_128 res64;
15743 sel1 = _mm_and_ps (_pM128(a), _pM128(b));
15744 sel2 = _mm_andnot_ps (_pM128(a), _pM128(c));
15745 sel1 = _mm_or_ps (sel1, sel2);
15746 _M64f(res64, sel1);
15747 return res64;
15748}
15749
15750_NEON2SSE_GLOBAL poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0
15751#define vbsl_p8 vbsl_s8
15752
15753_NEON2SSE_GLOBAL poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0
15754#define vbsl_p16 vbsl_s8
15755
15756_NEON2SSESTORAGE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0
15757_NEON2SSE_INLINE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) // VBSL q0,q0,q0
15758{
15759 __m128i sel1, sel2;
15760 sel1 = _mm_and_si128 (a, b);
15761 sel2 = _mm_andnot_si128 (a, c);
15762 return _mm_or_si128 (sel1, sel2);
15763}
15764
15765_NEON2SSE_GLOBAL int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0
15766#define vbslq_s16 vbslq_s8
15767
15768_NEON2SSE_GLOBAL int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0
15769#define vbslq_s32 vbslq_s8
15770
15771_NEON2SSE_GLOBAL int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0
15772#define vbslq_s64 vbslq_s8
15773
15774_NEON2SSE_GLOBAL uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0
15775#define vbslq_u8 vbslq_s8
15776
15777_NEON2SSE_GLOBAL uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0
15778#define vbslq_u16 vbslq_s8
15779
15780_NEON2SSE_GLOBAL uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0
15781#define vbslq_u32 vbslq_s8
15782
15783_NEON2SSE_GLOBAL uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0
15784#define vbslq_u64 vbslq_s8
15785
15786_NEON2SSESTORAGE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0
15787_NEON2SSE_INLINE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) // VBSL q0,q0,q0
15788{
15789 __m128 sel1, sel2;
15790 sel1 = _mm_and_ps (*(__m128*)&a, b);
15791 sel2 = _mm_andnot_ps (*(__m128*)&a, c);
15792 return _mm_or_ps (sel1, sel2);
15793}
15794
15795_NEON2SSE_GLOBAL poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0
15796#define vbslq_p8 vbslq_u8
15797
15798_NEON2SSE_GLOBAL poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0
15799#define vbslq_p16 vbslq_s8
15800
15801//************************************************************************************
15802//**************** Transposition operations ****************************************
15803//************************************************************************************
15804//***************** Vector Transpose ************************************************
15805//************************************************************************************
15806//VTRN (Vector Transpose) treats the elements of its operand vectors as elements of 2 x 2 matrices, and transposes the matrices.
// making the result look like (a0, b0, a2, b2, a4, b4, ...) and (a1, b1, a3, b3, a5, b5, ...)
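//A usage sketch of the resulting layout, kept out of the build (the function and buffer names are illustrative only):
#if 0
static int32x4x2_t vtrn_layout_example(const int32_t* p) //p points at a0,a1,a2,a3,b0,b1,b2,b3
{
    int32x4_t a = vld1q_s32(p); //(a0, a1, a2, a3)
    int32x4_t b = vld1q_s32(p + 4); //(b0, b1, b2, b3)
    return vtrnq_s32(a, b); //val[0] = (a0, b0, a2, b2), val[1] = (a1, b1, a3, b3)
}
#endif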
15808_NEON2SSESTORAGE int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0
15809_NEON2SSE_INLINE int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b) // VTRN.8 d0,d0
15810{
15811 int8x8x2_t val;
15812 __m128i tmp, val0;
15813 tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7
15814 val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)mask8_32_even_odd); //(a0, b0, a2, b2, a4, b4, a6, b6), (a1,b1, a3,b3, a5,b5, a7,b7)
15815 vst1q_s8 (val.val, val0); // _mm_shuffle_epi32 (val.val[0], _SWAP_HI_LOW32); //(a1,b1, a3,b3, a5,b5, a7,b7),(a0, b0, a2, b2, a4, b4, a6, b6),
15816 return val;
15817}
15818
15819_NEON2SSESTORAGE int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0
15820_NEON2SSE_INLINE int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b) // VTRN.16 d0,d0
15821{
15822 int16x4x2_t val;
15823 __m128i tmp, val0;
15824 _NEON2SSE_ALIGN_16 static const int8_t maskdlv16[16] = {0,1, 2,3, 8,9, 10,11, 4,5, 6,7, 12,13, 14, 15};
15825 tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3
15826 val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv16); //a0, b0, a2, b2, a1,b1, a3, b3
15827 vst1q_s16(val.val, val0); // _mm_shuffle_epi32 (val.val[0], _SWAP_HI_LOW32); //(a1,b1, a3,b3),(a0, b0, a2, b2),
15828 return val;
15829}
15830
15831_NEON2SSESTORAGE int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0
15832_NEON2SSE_INLINE int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b)
15833{
15834 int32x2x2_t val;
15835 __m128i val0;
15836 val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1
15837 vst1q_s32(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); //a1,b1, a0,b0,
15838 return val;
15839}
15840
15841_NEON2SSE_GLOBAL uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0
15842#define vtrn_u8 vtrn_s8
15843
15844_NEON2SSE_GLOBAL uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0
15845#define vtrn_u16 vtrn_s16
15846
15847_NEON2SSE_GLOBAL uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0
15848#define vtrn_u32 vtrn_s32
15849
15850_NEON2SSESTORAGE float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0
15851_NEON2SSE_INLINE float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b)
15852{
15853 float32x2x2_t val;
15854 val.val[0].m64_f32[0] = a.m64_f32[0];
15855 val.val[0].m64_f32[1] = b.m64_f32[0];
15856 val.val[1].m64_f32[0] = a.m64_f32[1];
15857 val.val[1].m64_f32[1] = b.m64_f32[1];
15858 return val; //a0,b0,a1,b1
15859}
15860
15861_NEON2SSE_GLOBAL poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0
15862#define vtrn_p8 vtrn_u8
15863
15864_NEON2SSE_GLOBAL poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0
15865#define vtrn_p16 vtrn_s16
15866
_NEON2SSESTORAGE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0
15868_NEON2SSE_INLINE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b) // VTRN.8 q0,q0
15869{
15870 int8x16x2_t r8x16;
15871 __m128i a_sh, b_sh;
15872 a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_16_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15
15873 b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_16_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15
15874
15875 r8x16.val[0] = _mm_unpacklo_epi8(a_sh, b_sh); //(a0, b0, a2, b2, a4, b4, a6, b6, a8,b8, a10,b10, a12,b12, a14,b14)
15876 r8x16.val[1] = _mm_unpackhi_epi8(a_sh, b_sh); // (a1, b1, a3, b3, a5, b5, a7, b7, a9,b9, a11,b11, a13,b13, a15,b15)
15877 return r8x16;
15878}
15879
15880_NEON2SSESTORAGE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0
15881_NEON2SSE_INLINE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b) // VTRN.16 q0,q0
15882{
15883 int16x8x2_t v16x8;
15884 __m128i a_sh, b_sh;
15885 a_sh = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd); //a0, a2, a4, a6, a1, a3, a5, a7
15886 b_sh = _mm_shuffle_epi8 (b, *(__m128i*) mask8_32_even_odd); //b0, b2, b4, b6, b1, b3, b5, b7
15887 v16x8.val[0] = _mm_unpacklo_epi16(a_sh, b_sh); //a0, b0, a2, b2, a4, b4, a6, b6
15888 v16x8.val[1] = _mm_unpackhi_epi16(a_sh, b_sh); //a1, b1, a3, b3, a5, b5, a7, b7
15889 return v16x8;
15890}
15891
15892_NEON2SSESTORAGE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0
15893_NEON2SSE_INLINE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b) // VTRN.32 q0,q0
15894{
    //may not be an optimal solution compared with a serial implementation
15896 int32x4x2_t v32x4;
15897 __m128i a_sh, b_sh;
15898 a_sh = _mm_shuffle_epi32 (a, 216); //a0, a2, a1, a3
15899 b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3
15900
15901 v32x4.val[0] = _mm_unpacklo_epi32(a_sh, b_sh); //a0, b0, a2, b2
15902 v32x4.val[1] = _mm_unpackhi_epi32(a_sh, b_sh); //a1, b1, a3, b3
15903 return v32x4;
15904}
15905
15906_NEON2SSE_GLOBAL uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0
15907#define vtrnq_u8 vtrnq_s8
15908
15909_NEON2SSE_GLOBAL uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0
15910#define vtrnq_u16 vtrnq_s16
15911
15912_NEON2SSE_GLOBAL uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0
15913#define vtrnq_u32 vtrnq_s32
15914
15915_NEON2SSESTORAGE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0
15916_NEON2SSE_INLINE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b) // VTRN.32 q0,q0
15917{
    //may not be an optimal solution compared with a serial implementation
15919 float32x4x2_t f32x4;
15920 __m128 a_sh, b_sh;
    a_sh = _mm_shuffle_ps (a, a, _MM_SHUFFLE(3,1, 2, 0)); //a0, a2, a1, a3, need to check endianness
    b_sh = _mm_shuffle_ps (b, b, _MM_SHUFFLE(3,1, 2, 0)); //b0, b2, b1, b3, need to check endianness
15923
15924 f32x4.val[0] = _mm_unpacklo_ps(a_sh, b_sh); //a0, b0, a2, b2
15925 f32x4.val[1] = _mm_unpackhi_ps(a_sh, b_sh); //a1, b1, a3, b3
15926 return f32x4;
15927}
15928
15929_NEON2SSE_GLOBAL poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0
15930#define vtrnq_p8 vtrnq_s8
15931
15932_NEON2SSE_GLOBAL poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0
15933#define vtrnq_p16 vtrnq_s16
15934
15935//***************** Interleave elements ***************************
15936//*****************************************************************
15937//output has (a0,b0,a1,b1, a2,b2,.....)
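//A usage sketch, kept out of the build: interleaving two separate 16-byte channels into one
//l0,r0,l1,r1,... stream with vzipq_u8 (the function and buffer names are illustrative only):
#if 0
static void interleave_channels(const uint8_t* left, const uint8_t* right, uint8_t* out)
{
    uint8x16x2_t z = vzipq_u8(vld1q_u8(left), vld1q_u8(right));
    vst1q_u8(out, z.val[0]); //l0, r0, ..., l7, r7
    vst1q_u8(out + 16, z.val[1]); //l8, r8, ..., l15, r15
}
#endif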
15938_NEON2SSESTORAGE int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0
15939_NEON2SSE_INLINE int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b) // VZIP.8 d0,d0
15940{
15941 int8x8x2_t val;
15942 __m128i val0;
15943 val0 = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b));
15944 vst1q_s8(val.val, val0); //_mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
15945 return val;
15946}
15947
15948_NEON2SSESTORAGE int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0
15949_NEON2SSE_INLINE int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b) // VZIP.16 d0,d0
15950{
15951 int16x4x2_t val;
15952 __m128i val0;
15953 val0 = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b));
15954 vst1q_s16(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
15955 return val;
15956}
15957
15958_NEON2SSE_GLOBAL int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0
15959#define vzip_s32 vtrn_s32
15960
15961_NEON2SSE_GLOBAL uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0
15962#define vzip_u8 vzip_s8
15963
15964_NEON2SSE_GLOBAL uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0
15965#define vzip_u16 vzip_s16
15966
15967_NEON2SSE_GLOBAL uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0
15968#define vzip_u32 vzip_s32
15969
15970_NEON2SSE_GLOBAL float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0
15971#define vzip_f32 vtrn_f32
15972
15973_NEON2SSE_GLOBAL poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0
15974#define vzip_p8 vzip_u8
15975
15976_NEON2SSE_GLOBAL poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0
15977#define vzip_p16 vzip_u16
15978
15979_NEON2SSESTORAGE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0
15980_NEON2SSE_INLINE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b) // VZIP.8 q0,q0
15981{
15982 int8x16x2_t r8x16;
15983 r8x16.val[0] = _mm_unpacklo_epi8(a, b);
15984 r8x16.val[1] = _mm_unpackhi_epi8(a, b);
15985 return r8x16;
15986}
15987
15988_NEON2SSESTORAGE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0
15989_NEON2SSE_INLINE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b) // VZIP.16 q0,q0
15990{
15991 int16x8x2_t r16x8;
15992 r16x8.val[0] = _mm_unpacklo_epi16(a, b);
15993 r16x8.val[1] = _mm_unpackhi_epi16(a, b);
15994 return r16x8;
15995}
15996
15997_NEON2SSESTORAGE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0
15998_NEON2SSE_INLINE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b) // VZIP.32 q0,q0
15999{
16000 int32x4x2_t r32x4;
16001 r32x4.val[0] = _mm_unpacklo_epi32(a, b);
16002 r32x4.val[1] = _mm_unpackhi_epi32(a, b);
16003 return r32x4;
16004}
16005
16006_NEON2SSE_GLOBAL uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0
16007#define vzipq_u8 vzipq_s8
16008
16009_NEON2SSE_GLOBAL uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0
16010#define vzipq_u16 vzipq_s16
16011
16012_NEON2SSE_GLOBAL uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0
16013#define vzipq_u32 vzipq_s32
16014
16015_NEON2SSESTORAGE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0
16016_NEON2SSE_INLINE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b) // VZIP.32 q0,q0
16017{
16018 float32x4x2_t f32x4;
16019 f32x4.val[0] = _mm_unpacklo_ps ( a, b);
16020 f32x4.val[1] = _mm_unpackhi_ps ( a, b);
16021 return f32x4;
16022}
16023
16024_NEON2SSE_GLOBAL poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0
16025#define vzipq_p8 vzipq_u8
16026
16027_NEON2SSE_GLOBAL poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0
16028#define vzipq_p16 vzipq_u16
16029
16030//*********************** De-Interleave elements *************************
16031//*************************************************************************
//As a result of these functions the first val contains (a0, a2, a4, ..., b0, b2, b4, ...) and the second val contains (a1, a3, a5, ..., b1, b3, b5, ...)
//there are no such functions in IA32 SIMD, so a shuffle is required
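//A usage sketch, kept out of the build: splitting an interleaved l0,r0,l1,r1,... byte stream back into
//separate channels with vuzpq_u8, the reverse of the vzipq example above (names are illustrative only):
#if 0
static void deinterleave_channels(const uint8_t* in, uint8_t* left, uint8_t* right)
{
    uint8x16x2_t u = vuzpq_u8(vld1q_u8(in), vld1q_u8(in + 16));
    vst1q_u8(left, u.val[0]); //even positions: l0 ... l15
    vst1q_u8(right, u.val[1]); //odd positions: r0 ... r15
}
#endif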
16034_NEON2SSESTORAGE int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0
16035_NEON2SSE_INLINE int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b) // VUZP.8 d0,d0
16036{
16037 int8x8x2_t val;
16038 __m128i tmp, val0;
16039 _NEON2SSE_ALIGN_16 static const int8_t maskdlv8[16] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11,15};
16040 tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7
16041 val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv8); //(a0, a2, a4, a6, b0, b2, b4, b6), (a1, a3, a5, a7, b1,b3, b5, b7)
16042 vst1q_s8(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
16043 return val;
16044}
16045
16046_NEON2SSESTORAGE int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0
16047_NEON2SSE_INLINE int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b) // VUZP.16 d0,d0
16048{
16049 int16x4x2_t val;
16050 __m128i tmp, val0;
16051 _NEON2SSE_ALIGN_16 static const int8_t maskdlv16[16] = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15};
16052 tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3
16053 val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv16); //a0,a2, b0, b2, a1,a3, b1,b3
16054 vst1q_s16(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
16055 return val;
16056}
16057
16058_NEON2SSESTORAGE int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0
16059_NEON2SSE_INLINE int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b) // VUZP.32 d0,d0
16060{
16061 int32x2x2_t val;
16062 __m128i val0;
16063 val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0, a1,b1
16064 vst1q_s32(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
16065 return val;
16066}
16067
16068_NEON2SSE_GLOBAL uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0
16069#define vuzp_u8 vuzp_s8
16070
16071_NEON2SSE_GLOBAL uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0
16072#define vuzp_u16 vuzp_s16
16073
16074_NEON2SSE_GLOBAL uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0
16075#define vuzp_u32 vuzp_s32
16076
16077_NEON2SSE_GLOBAL float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0
16078#define vuzp_f32 vzip_f32
16079
16080_NEON2SSE_GLOBAL poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0
16081#define vuzp_p8 vuzp_u8
16082
16083_NEON2SSE_GLOBAL poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0
16084#define vuzp_p16 vuzp_u16
16085
16086_NEON2SSESTORAGE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0
16087_NEON2SSE_INLINE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b) // VUZP.8 q0,q0
16088{
16089 int8x16x2_t v8x16;
16090 __m128i a_sh, b_sh;
16091 a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_16_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15
16092 b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_16_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15
16093 //we need unpack64 to combine lower (upper) 64 bits from a with lower (upper) 64 bits from b
16094 v8x16.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); ///a0, a2, a4, a6, a8, a10, a12, a14, b0, b2, b4, b6, b8, b10, b12, b14,
16095 v8x16.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, a9, a11, a13, a15, b1, b3, b5, b7, b9, b11, b13, b15
16096 return v8x16;
16097}
16098
16099_NEON2SSESTORAGE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0
16100_NEON2SSE_INLINE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b) // VUZP.16 q0,q0
16101{
16102 int16x8x2_t v16x8;
16103 __m128i a_sh, b_sh;
16104 a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_32_even_odd); //a0, a2, a4, a6, a1, a3, a5, a7
16105 b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_32_even_odd); //b0, b2, b4, b6, b1, b3, b5, b7
16106 v16x8.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, a4, a6, b0, b2, b4, b6
16107 v16x8.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, b1, b3, b5, b7
16108 return v16x8;
16109}
16110
16111_NEON2SSESTORAGE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0
16112_NEON2SSE_INLINE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b) // VUZP.32 q0,q0
16113{
    //may not be an optimal solution compared with a serial implementation
16115 int32x4x2_t v32x4;
16116 __m128i a_sh, b_sh;
16117 a_sh = _mm_shuffle_epi32 (a, 216); //a0, a2, a1, a3
16118 b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3
16119
16120 v32x4.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, b0, b2
16121 v32x4.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, b1, b3
16122 return v32x4;
16123}
16124
16125_NEON2SSE_GLOBAL uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0
16126#define vuzpq_u8 vuzpq_s8
16127
16128_NEON2SSE_GLOBAL uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0
16129#define vuzpq_u16 vuzpq_s16
16130
16131_NEON2SSE_GLOBAL uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0
16132#define vuzpq_u32 vuzpq_s32
16133
16134_NEON2SSESTORAGE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0
16135_NEON2SSE_INLINE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b) // VUZP.32 q0,q0
16136{
16137 float32x4x2_t v32x4;
    v32x4.val[0] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2,0, 2, 0)); //a0, a2, b0, b2, need to check endianness however
    v32x4.val[1] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3,1, 3, 1)); //a1, a3, b1, b3, need to check endianness however
16140 return v32x4;
16141}
16142
16143_NEON2SSE_GLOBAL poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0
16144#define vuzpq_p8 vuzpq_u8
16145
16146_NEON2SSE_GLOBAL poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
16147#define vuzpq_p16 vuzpq_u16
16148
16149//##############################################################################################
16150//*********************** Reinterpret cast intrinsics.******************************************
16151//##############################################################################################
// Not a part of the official NEON instruction set but available in the gcc compiler *********************
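//A usage sketch, kept out of the build: clearing the float sign bit through integer reinterpretation,
//i.e. an absolute value without arithmetic (the helper name is illustrative only):
#if 0
static float32x4_t fabs_via_reinterpret(float32x4_t v)
{
    uint32x4_t bits = vreinterpretq_u32_f32(v); //the same 128 bits viewed as uint32 lanes
    bits = vandq_u32(bits, vdupq_n_u32(0x7fffffff)); //mask off each sign bit
    return vreinterpretq_f32_u32(bits); //view the result as floats again
}
#endif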
16153_NEON2SSE_GLOBAL poly8x8_t vreinterpret_p8_u32 (uint32x2_t t);
16154#define vreinterpret_p8_u32
16155
16156_NEON2SSE_GLOBAL poly8x8_t vreinterpret_p8_u16 (uint16x4_t t);
16157#define vreinterpret_p8_u16
16158
16159_NEON2SSE_GLOBAL poly8x8_t vreinterpret_p8_u8 (uint8x8_t t);
16160#define vreinterpret_p8_u8
16161
16162_NEON2SSE_GLOBAL poly8x8_t vreinterpret_p8_s32 (int32x2_t t);
16163#define vreinterpret_p8_s32
16164
16165_NEON2SSE_GLOBAL poly8x8_t vreinterpret_p8_s16 (int16x4_t t);
16166#define vreinterpret_p8_s16
16167
16168_NEON2SSE_GLOBAL poly8x8_t vreinterpret_p8_s8 (int8x8_t t);
16169#define vreinterpret_p8_s8
16170
16171_NEON2SSE_GLOBAL poly8x8_t vreinterpret_p8_u64 (uint64x1_t t);
16172#define vreinterpret_p8_u64
16173
16174_NEON2SSE_GLOBAL poly8x8_t vreinterpret_p8_s64 (int64x1_t t);
16175#define vreinterpret_p8_s64
16176
16177_NEON2SSE_GLOBAL poly8x8_t vreinterpret_p8_f32 (float32x2_t t);
16178#define vreinterpret_p8_f32
16179
16180_NEON2SSE_GLOBAL poly8x8_t vreinterpret_p8_p16 (poly16x4_t t);
16181#define vreinterpret_p8_p16
16182
16183_NEON2SSE_GLOBAL poly8x16_t vreinterpretq_p8_u32 (uint32x4_t t);
16184#define vreinterpretq_p8_u32
16185
16186_NEON2SSE_GLOBAL poly8x16_t vreinterpretq_p8_u16 (uint16x8_t t);
16187#define vreinterpretq_p8_u16
16188
16189_NEON2SSE_GLOBAL poly8x16_t vreinterpretq_p8_u8 (uint8x16_t t);
16190#define vreinterpretq_p8_u8
16191
16192_NEON2SSE_GLOBAL poly8x16_t vreinterpretq_p8_s32 (int32x4_t t);
16193#define vreinterpretq_p8_s32
16194
16195_NEON2SSE_GLOBAL poly8x16_t vreinterpretq_p8_s16 (int16x8_t t);
16196#define vreinterpretq_p8_s16
16197
16198_NEON2SSE_GLOBAL poly8x16_t vreinterpretq_p8_s8 (int8x16_t t);
16199#define vreinterpretq_p8_s8
16200
16201_NEON2SSE_GLOBAL poly8x16_t vreinterpretq_p8_u64 (uint64x2_t t);
16202#define vreinterpretq_p8_u64
16203
16204_NEON2SSE_GLOBAL poly8x16_t vreinterpretq_p8_s64 (int64x2_t t);
16205#define vreinterpretq_p8_s64
16206
16207_NEON2SSE_GLOBAL poly8x16_t vreinterpretq_p8_f32 (float32x4_t t);
16208#define vreinterpretq_p8_f32(t) _M128i(t)
16209
16210_NEON2SSE_GLOBAL poly8x16_t vreinterpretq_p8_p16 (poly16x8_t t);
16211#define vreinterpretq_p8_p16
16212
16213_NEON2SSE_GLOBAL poly16x4_t vreinterpret_p16_u32 (uint32x2_t t);
16214#define vreinterpret_p16_u32
16215
16216_NEON2SSE_GLOBAL poly16x4_t vreinterpret_p16_u16 (uint16x4_t t);
16217#define vreinterpret_p16_u16
16218
16219_NEON2SSE_GLOBAL poly16x4_t vreinterpret_p16_u8 (uint8x8_t t);
16220#define vreinterpret_p16_u8
16221
16222_NEON2SSE_GLOBAL poly16x4_t vreinterpret_p16_s32 (int32x2_t t);
16223#define vreinterpret_p16_s32
16224
16225_NEON2SSE_GLOBAL poly16x4_t vreinterpret_p16_s16 (int16x4_t t);
16226#define vreinterpret_p16_s16
16227
16228_NEON2SSE_GLOBAL poly16x4_t vreinterpret_p16_s8 (int8x8_t t);
16229#define vreinterpret_p16_s8
16230
16231_NEON2SSE_GLOBAL poly16x4_t vreinterpret_p16_u64 (uint64x1_t t);
16232#define vreinterpret_p16_u64
16233
16234_NEON2SSE_GLOBAL poly16x4_t vreinterpret_p16_s64 (int64x1_t t);
16235#define vreinterpret_p16_s64
16236
16237_NEON2SSE_GLOBAL poly16x4_t vreinterpret_p16_f32 (float32x2_t t);
16238#define vreinterpret_p16_f32
16239
16240_NEON2SSE_GLOBAL poly16x4_t vreinterpret_p16_p8 (poly8x8_t t);
16241#define vreinterpret_p16_p8
16242
16243_NEON2SSE_GLOBAL poly16x8_t vreinterpretq_p16_u32 (uint32x4_t t);
16244#define vreinterpretq_p16_u32
16245
16246_NEON2SSE_GLOBAL poly16x8_t vreinterpretq_p16_u16 (uint16x8_t t);
16247#define vreinterpretq_p16_u16
16248
16249_NEON2SSE_GLOBAL poly16x8_t vreinterpretq_p16_s32 (int32x4_t t);
16250#define vreinterpretq_p16_s32
16251
16252_NEON2SSE_GLOBAL poly16x8_t vreinterpretq_p16_s16 (int16x8_t t);
16253#define vreinterpretq_p16_s16
16254
16255_NEON2SSE_GLOBAL poly16x8_t vreinterpretq_p16_s8 (int8x16_t t);
16256#define vreinterpretq_p16_s8
16257
16258_NEON2SSE_GLOBAL poly16x8_t vreinterpretq_p16_u64 (uint64x2_t t);
16259#define vreinterpretq_p16_u64
16260
16261_NEON2SSE_GLOBAL poly16x8_t vreinterpretq_p16_s64 (int64x2_t t);
16262#define vreinterpretq_p16_s64
16263
16264_NEON2SSE_GLOBAL poly16x8_t vreinterpretq_p16_f32 (float32x4_t t);
16265#define vreinterpretq_p16_f32(t) _M128i(t)
16266
16267_NEON2SSE_GLOBAL poly16x8_t vreinterpretq_p16_p8 (poly8x16_t t);
16268#define vreinterpretq_p16_p8 vreinterpretq_s16_p8
16269
16270//**** Integer to float ******
16271_NEON2SSESTORAGE float32x2_t vreinterpret_f32_u32 (uint32x2_t t);
16272_NEON2SSE_INLINE float32x2_t vreinterpret_f32_u32 (uint32x2_t t)
16273{
16274 return (*(__m64_128*)&(t));
16275}
16276
16277_NEON2SSE_GLOBAL float32x2_t vreinterpret_f32_u16 (uint16x4_t t);
16278#define vreinterpret_f32_u16 vreinterpret_f32_u32
16279
16280
16281_NEON2SSE_GLOBAL float32x2_t vreinterpret_f32_u8 (uint8x8_t t);
16282#define vreinterpret_f32_u8 vreinterpret_f32_u32
16283
16284
16285_NEON2SSE_GLOBAL float32x2_t vreinterpret_f32_s32 (int32x2_t t);
16286#define vreinterpret_f32_s32 vreinterpret_f32_u32
16287
16288
16289_NEON2SSE_GLOBAL float32x2_t vreinterpret_f32_s16 (int16x4_t t);
16290#define vreinterpret_f32_s16 vreinterpret_f32_u32
16291
16292_NEON2SSE_GLOBAL float32x2_t vreinterpret_f32_s8 (int8x8_t t);
16293#define vreinterpret_f32_s8 vreinterpret_f32_u32
16294
16295
16296_NEON2SSE_GLOBAL float32x2_t vreinterpret_f32_u64(uint64x1_t t);
16297#define vreinterpret_f32_u64 vreinterpret_f32_u32
16298
16299
16300_NEON2SSE_GLOBAL float32x2_t vreinterpret_f32_s64 (int64x1_t t);
16301#define vreinterpret_f32_s64 vreinterpret_f32_u32
16302
16303
16304_NEON2SSE_GLOBAL float32x2_t vreinterpret_f32_p16 (poly16x4_t t);
16305#define vreinterpret_f32_p16 vreinterpret_f32_u32
16306
16307_NEON2SSE_GLOBAL float32x2_t vreinterpret_f32_p8 (poly8x8_t t);
16308#define vreinterpret_f32_p8 vreinterpret_f32_u32
16309
16310_NEON2SSE_GLOBAL float32x4_t vreinterpretq_f32_u32 (uint32x4_t t);
16311#define vreinterpretq_f32_u32(t) _M128(t)
16312
16313_NEON2SSE_GLOBAL float32x4_t vreinterpretq_f32_u16 (uint16x8_t t);
16314#define vreinterpretq_f32_u16 vreinterpretq_f32_u32
16315
16316_NEON2SSE_GLOBAL float32x4_t vreinterpretq_f32_u8 (uint8x16_t t);
16317#define vreinterpretq_f32_u8 vreinterpretq_f32_u32
16318
16319_NEON2SSE_GLOBAL float32x4_t vreinterpretq_f32_s32 (int32x4_t t);
16320#define vreinterpretq_f32_s32 vreinterpretq_f32_u32
16321
16322_NEON2SSE_GLOBAL float32x4_t vreinterpretq_f32_s16 (int16x8_t t);
16323#define vreinterpretq_f32_s16 vreinterpretq_f32_u32
16324
16325_NEON2SSE_GLOBAL float32x4_t vreinterpretq_f32_s8 (int8x16_t t);
16326#define vreinterpretq_f32_s8 vreinterpretq_f32_u32
16327
16328_NEON2SSE_GLOBAL float32x4_t vreinterpretq_f32_u64 (uint64x2_t t);
16329#define vreinterpretq_f32_u64 vreinterpretq_f32_u32
16330
16331_NEON2SSE_GLOBAL float32x4_t vreinterpretq_f32_s64 (int64x2_t t);
16332#define vreinterpretq_f32_s64 vreinterpretq_f32_u32
16333
16334_NEON2SSE_GLOBAL float32x4_t vreinterpretq_f32_p16 (poly16x8_t t);
16335#define vreinterpretq_f32_p16 vreinterpretq_f32_u32
16336
16337_NEON2SSE_GLOBAL float32x4_t vreinterpretq_f32_p8 (poly8x16_t t);
16338#define vreinterpretq_f32_p8 vreinterpretq_f32_u32
16339
16340//*** Integer type conversions ******************
//no conversion is necessary for the following functions because the underlying data type is the same
16342_NEON2SSE_GLOBAL int64x1_t vreinterpret_s64_u32 (uint32x2_t t);
16343#define vreinterpret_s64_u32
16344
16345_NEON2SSE_GLOBAL int64x1_t vreinterpret_s64_u16 (uint16x4_t t);
16346#define vreinterpret_s64_u16
16347
16348_NEON2SSE_GLOBAL int64x1_t vreinterpret_s64_u8 (uint8x8_t t);
16349#define vreinterpret_s64_u8
16350
16351_NEON2SSE_GLOBAL int64x1_t vreinterpret_s64_s32 (int32x2_t t);
16352#define vreinterpret_s64_s32
16353
16354_NEON2SSE_GLOBAL int64x1_t vreinterpret_s64_s16 (int16x4_t t);
16355#define vreinterpret_s64_s16
16356
16357_NEON2SSE_GLOBAL int64x1_t vreinterpret_s64_s8 (int8x8_t t);
16358#define vreinterpret_s64_s8
16359
16360_NEON2SSE_GLOBAL int64x1_t vreinterpret_s64_u64 (uint64x1_t t);
16361#define vreinterpret_s64_u64
16362
16363_NEON2SSE_GLOBAL int64x1_t vreinterpret_s64_f32 (float32x2_t t);
16364#define vreinterpret_s64_f32
16365
16366_NEON2SSE_GLOBAL int64x1_t vreinterpret_s64_p16 (poly16x4_t t);
16367#define vreinterpret_s64_p16
16368
16369_NEON2SSE_GLOBAL int64x1_t vreinterpret_s64_p8 (poly8x8_t t);
16370#define vreinterpret_s64_p8
16371
16372_NEON2SSE_GLOBAL int64x2_t vreinterpretq_s64_u32 (uint32x4_t t);
16373#define vreinterpretq_s64_u32
16374
_NEON2SSE_GLOBAL int64x2_t vreinterpretq_s64_s16 (int16x8_t t);
16376#define vreinterpretq_s64_s16
16377
16378_NEON2SSE_GLOBAL int64x2_t vreinterpretq_s64_u8 (uint8x16_t t);
16379#define vreinterpretq_s64_u8
16380
16381_NEON2SSE_GLOBAL int64x2_t vreinterpretq_s64_s32 (int32x4_t t);
16382#define vreinterpretq_s64_s32
16383
_NEON2SSE_GLOBAL int64x2_t vreinterpretq_s64_u16 (uint16x8_t t);
16385#define vreinterpretq_s64_u16
16386
16387_NEON2SSE_GLOBAL int64x2_t vreinterpretq_s64_s8 (int8x16_t t);
16388#define vreinterpretq_s64_s8
16389
16390_NEON2SSE_GLOBAL int64x2_t vreinterpretq_s64_u64 (uint64x2_t t);
16391#define vreinterpretq_s64_u64
16392
16393_NEON2SSE_GLOBAL int64x2_t vreinterpretq_s64_f32 (float32x4_t t);
16394#define vreinterpretq_s64_f32(t) _M128i(t)
16395
16396_NEON2SSE_GLOBAL int64x2_t vreinterpretq_s64_p16 (poly16x8_t t);
16397#define vreinterpretq_s64_p16
16398
16399_NEON2SSE_GLOBAL int64x2_t vreinterpretq_s64_p8 (poly8x16_t t);
16400#define vreinterpretq_s64_p8
16401
16402_NEON2SSE_GLOBAL uint64x1_t vreinterpret_u64_u32 (uint32x2_t t);
16403#define vreinterpret_u64_u32
16404
16405_NEON2SSE_GLOBAL uint64x1_t vreinterpret_u64_u16 (uint16x4_t t);
16406#define vreinterpret_u64_u16
16407
16408_NEON2SSE_GLOBAL uint64x1_t vreinterpret_u64_u8 (uint8x8_t t);
16409#define vreinterpret_u64_u8
16410
16411_NEON2SSE_GLOBAL uint64x1_t vreinterpret_u64_s32 (int32x2_t t);
16412#define vreinterpret_u64_s32
16413
16414_NEON2SSE_GLOBAL uint64x1_t vreinterpret_u64_s16 (int16x4_t t);
16415#define vreinterpret_u64_s16
16416
16417_NEON2SSE_GLOBAL uint64x1_t vreinterpret_u64_s8 (int8x8_t t);
16418#define vreinterpret_u64_s8
16419
16420_NEON2SSE_GLOBAL uint64x1_t vreinterpret_u64_s64 (int64x1_t t);
16421#define vreinterpret_u64_s64
16422
16423_NEON2SSE_GLOBAL uint64x1_t vreinterpret_u64_f32 (float32x2_t t);
16424#define vreinterpret_u64_f32
16425
16426_NEON2SSE_GLOBAL uint64x1_t vreinterpret_u64_p16 (poly16x4_t t);
16427#define vreinterpret_u64_p16
16428
16429_NEON2SSE_GLOBAL uint64x1_t vreinterpret_u64_p8 (poly8x8_t t);
16430#define vreinterpret_u64_p8
16431
16432_NEON2SSE_GLOBAL uint64x2_t vreinterpretq_u64_u32 (uint32x4_t t);
16433#define vreinterpretq_u64_u32
16434
16435_NEON2SSE_GLOBAL uint64x2_t vreinterpretq_u64_u16 (uint16x8_t t);
16436#define vreinterpretq_u64_u16
16437
16438_NEON2SSE_GLOBAL uint64x2_t vreinterpretq_u64_u8 (uint8x16_t t);
16439#define vreinterpretq_u64_u8
16440
16441_NEON2SSE_GLOBAL uint64x2_t vreinterpretq_u64_s32 (int32x4_t t);
16442#define vreinterpretq_u64_s32
16443
16444_NEON2SSE_GLOBAL uint64x2_t vreinterpretq_u64_s16 (int16x8_t t);
16445#define vreinterpretq_u64_s16
16446
16447_NEON2SSE_GLOBAL uint64x2_t vreinterpretq_u64_s8 (int8x16_t t);
16448#define vreinterpretq_u64_s8
16449
16450_NEON2SSE_GLOBAL uint64x2_t vreinterpretq_u64_s64 (int64x2_t t);
16451#define vreinterpretq_u64_s64
16452
16453_NEON2SSE_GLOBAL uint64x2_t vreinterpretq_u64_f32 (float32x4_t t);
16454#define vreinterpretq_u64_f32(t) _M128i(t)
16455
16456_NEON2SSE_GLOBAL uint64x2_t vreinterpretq_u64_p16 (poly16x8_t t);
16457#define vreinterpretq_u64_p16
16458
16459_NEON2SSE_GLOBAL uint64x2_t vreinterpretq_u64_p8 (poly8x16_t t);
16460#define vreinterpretq_u64_p8
16461
16462_NEON2SSE_GLOBAL int8x8_t vreinterpret_s8_u32 (uint32x2_t t);
16463#define vreinterpret_s8_u32
16464
16465_NEON2SSE_GLOBAL int8x8_t vreinterpret_s8_u16 (uint16x4_t t);
16466#define vreinterpret_s8_u16
16467
16468_NEON2SSE_GLOBAL int8x8_t vreinterpret_s8_u8 (uint8x8_t t);
16469#define vreinterpret_s8_u8
16470
16471_NEON2SSE_GLOBAL int8x8_t vreinterpret_s8_s32 (int32x2_t t);
16472#define vreinterpret_s8_s32
16473
16474_NEON2SSE_GLOBAL int8x8_t vreinterpret_s8_s16 (int16x4_t t);
16475#define vreinterpret_s8_s16
16476
16477_NEON2SSE_GLOBAL int8x8_t vreinterpret_s8_u64 (uint64x1_t t);
16478#define vreinterpret_s8_u64
16479
16480_NEON2SSE_GLOBAL int8x8_t vreinterpret_s8_s64 (int64x1_t t);
16481#define vreinterpret_s8_s64
16482
16483_NEON2SSE_GLOBAL int8x8_t vreinterpret_s8_f32 (float32x2_t t);
16484#define vreinterpret_s8_f32
16485
16486_NEON2SSE_GLOBAL int8x8_t vreinterpret_s8_p16 (poly16x4_t t);
16487#define vreinterpret_s8_p16
16488
16489_NEON2SSE_GLOBAL int8x8_t vreinterpret_s8_p8 (poly8x8_t t);
16490#define vreinterpret_s8_p8
16491
16492_NEON2SSE_GLOBAL int8x16_t vreinterpretq_s8_u32 (uint32x4_t t);
16493#define vreinterpretq_s8_u32
16494
16495_NEON2SSE_GLOBAL int8x16_t vreinterpretq_s8_u16 (uint16x8_t t);
16496#define vreinterpretq_s8_u16
16497
16498_NEON2SSE_GLOBAL int8x16_t vreinterpretq_s8_u8 (uint8x16_t t);
16499#define vreinterpretq_s8_u8
16500
16501_NEON2SSE_GLOBAL int8x16_t vreinterpretq_s8_s32 (int32x4_t t);
16502#define vreinterpretq_s8_s32
16503
16504_NEON2SSE_GLOBAL int8x16_t vreinterpretq_s8_s16 (int16x8_t t);
16505#define vreinterpretq_s8_s16
16506
16507_NEON2SSE_GLOBAL int8x16_t vreinterpretq_s8_u64 (uint64x2_t t);
16508#define vreinterpretq_s8_u64
16509
16510_NEON2SSE_GLOBAL int8x16_t vreinterpretq_s8_s64 (int64x2_t t);
16511#define vreinterpretq_s8_s64
16512
16513_NEON2SSE_GLOBAL int8x16_t vreinterpretq_s8_f32 (float32x4_t t);
16514#define vreinterpretq_s8_f32(t) _M128i(t)
16515
16516_NEON2SSE_GLOBAL int8x16_t vreinterpretq_s8_p16 (poly16x8_t t);
16517#define vreinterpretq_s8_p16
16518
16519_NEON2SSE_GLOBAL int8x16_t vreinterpretq_s8_p8 (poly8x16_t t);
16520#define vreinterpretq_s8_p8
16521
16522_NEON2SSE_GLOBAL int16x4_t vreinterpret_s16_u32 (uint32x2_t t);
16523#define vreinterpret_s16_u32
16524
16525_NEON2SSE_GLOBAL int16x4_t vreinterpret_s16_u16 (uint16x4_t t);
16526#define vreinterpret_s16_u16
16527
16528_NEON2SSE_GLOBAL int16x4_t vreinterpret_s16_u8 (uint8x8_t t);
16529#define vreinterpret_s16_u8
16530
16531_NEON2SSE_GLOBAL int16x4_t vreinterpret_s16_s32 (int32x2_t t);
16532#define vreinterpret_s16_s32
16533
16534_NEON2SSE_GLOBAL int16x4_t vreinterpret_s16_s8 (int8x8_t t);
16535#define vreinterpret_s16_s8
16536
16537_NEON2SSE_GLOBAL int16x4_t vreinterpret_s16_u64 (uint64x1_t t);
16538#define vreinterpret_s16_u64
16539
16540_NEON2SSE_GLOBAL int16x4_t vreinterpret_s16_s64 (int64x1_t t);
16541#define vreinterpret_s16_s64
16542
16543_NEON2SSE_GLOBAL int16x4_t vreinterpret_s16_f32 (float32x2_t t);
16544#define vreinterpret_s16_f32
16545
16546
16547_NEON2SSE_GLOBAL int16x4_t vreinterpret_s16_p16 (poly16x4_t t);
16548#define vreinterpret_s16_p16
16549
16550_NEON2SSE_GLOBAL int16x4_t vreinterpret_s16_p8 (poly8x8_t t);
16551#define vreinterpret_s16_p8
16552
16553_NEON2SSE_GLOBAL int16x8_t vreinterpretq_s16_u32 (uint32x4_t t);
16554#define vreinterpretq_s16_u32
16555
16556_NEON2SSE_GLOBAL int16x8_t vreinterpretq_s16_u16 (uint16x8_t t);
16557#define vreinterpretq_s16_u16
16558
16559_NEON2SSE_GLOBAL int16x8_t vreinterpretq_s16_u8 (uint8x16_t t);
16560#define vreinterpretq_s16_u8
16561
16562_NEON2SSE_GLOBAL int16x8_t vreinterpretq_s16_s32 (int32x4_t t);
16563#define vreinterpretq_s16_s32
16564
16565_NEON2SSE_GLOBAL int16x8_t vreinterpretq_s16_s8 (int8x16_t t);
16566#define vreinterpretq_s16_s8
16567
_NEON2SSE_GLOBAL int16x8_t vreinterpretq_s16_u64 (uint64x2_t t);
#define vreinterpretq_s16_u64

_NEON2SSE_GLOBAL int16x8_t vreinterpretq_s16_s64 (int64x2_t t);
#define vreinterpretq_s16_s64

_NEON2SSE_GLOBAL int16x8_t vreinterpretq_s16_f32 (float32x4_t t);
#define vreinterpretq_s16_f32(t) _M128i(t)

_NEON2SSE_GLOBAL int16x8_t vreinterpretq_s16_p16 (poly16x8_t t);
#define vreinterpretq_s16_p16

_NEON2SSE_GLOBAL int16x8_t vreinterpretq_s16_p8 (poly8x16_t t);
#define vreinterpretq_s16_p8

_NEON2SSE_GLOBAL int32x2_t vreinterpret_s32_u32 (uint32x2_t t);
#define vreinterpret_s32_u32

_NEON2SSE_GLOBAL int32x2_t vreinterpret_s32_u16 (uint16x4_t t);
#define vreinterpret_s32_u16

_NEON2SSE_GLOBAL int32x2_t vreinterpret_s32_u8 (uint8x8_t t);
#define vreinterpret_s32_u8

_NEON2SSE_GLOBAL int32x2_t vreinterpret_s32_s16 (int16x4_t t);
#define vreinterpret_s32_s16

_NEON2SSE_GLOBAL int32x2_t vreinterpret_s32_s8 (int8x8_t t);
#define vreinterpret_s32_s8

_NEON2SSE_GLOBAL int32x2_t vreinterpret_s32_u64 (uint64x1_t t);
#define vreinterpret_s32_u64

_NEON2SSE_GLOBAL int32x2_t vreinterpret_s32_s64 (int64x1_t t);
#define vreinterpret_s32_s64

_NEON2SSE_GLOBAL int32x2_t vreinterpret_s32_f32 (float32x2_t t);
#define vreinterpret_s32_f32

_NEON2SSE_GLOBAL int32x2_t vreinterpret_s32_p16 (poly16x4_t t);
#define vreinterpret_s32_p16

_NEON2SSE_GLOBAL int32x2_t vreinterpret_s32_p8 (poly8x8_t t);
#define vreinterpret_s32_p8

_NEON2SSE_GLOBAL int32x4_t vreinterpretq_s32_u32 (uint32x4_t t);
#define vreinterpretq_s32_u32

_NEON2SSE_GLOBAL int32x4_t vreinterpretq_s32_u16 (uint16x8_t t);
#define vreinterpretq_s32_u16

_NEON2SSE_GLOBAL int32x4_t vreinterpretq_s32_u8 (uint8x16_t t);
#define vreinterpretq_s32_u8

_NEON2SSE_GLOBAL int32x4_t vreinterpretq_s32_s16 (int16x8_t t);
#define vreinterpretq_s32_s16

_NEON2SSE_GLOBAL int32x4_t vreinterpretq_s32_s8 (int8x16_t t);
#define vreinterpretq_s32_s8

_NEON2SSE_GLOBAL int32x4_t vreinterpretq_s32_u64 (uint64x2_t t);
#define vreinterpretq_s32_u64

_NEON2SSE_GLOBAL int32x4_t vreinterpretq_s32_s64 (int64x2_t t);
#define vreinterpretq_s32_s64

_NEON2SSE_GLOBAL int32x4_t vreinterpretq_s32_f32 (float32x4_t t);
#define vreinterpretq_s32_f32(t) _M128i(t)

_NEON2SSE_GLOBAL int32x4_t vreinterpretq_s32_p16 (poly16x8_t t);
#define vreinterpretq_s32_p16

_NEON2SSE_GLOBAL int32x4_t vreinterpretq_s32_p8 (poly8x16_t t);
#define vreinterpretq_s32_p8

_NEON2SSE_GLOBAL uint8x8_t vreinterpret_u8_u32 (uint32x2_t t);
#define vreinterpret_u8_u32

_NEON2SSE_GLOBAL uint8x8_t vreinterpret_u8_u16 (uint16x4_t t);
#define vreinterpret_u8_u16

_NEON2SSE_GLOBAL uint8x8_t vreinterpret_u8_s32 (int32x2_t t);
#define vreinterpret_u8_s32

_NEON2SSE_GLOBAL uint8x8_t vreinterpret_u8_s16 (int16x4_t t);
#define vreinterpret_u8_s16

_NEON2SSE_GLOBAL uint8x8_t vreinterpret_u8_s8 (int8x8_t t);
#define vreinterpret_u8_s8

_NEON2SSE_GLOBAL uint8x8_t vreinterpret_u8_u64 (uint64x1_t t);
#define vreinterpret_u8_u64

_NEON2SSE_GLOBAL uint8x8_t vreinterpret_u8_s64 (int64x1_t t);
#define vreinterpret_u8_s64

_NEON2SSE_GLOBAL uint8x8_t vreinterpret_u8_f32 (float32x2_t t);
#define vreinterpret_u8_f32

_NEON2SSE_GLOBAL uint8x8_t vreinterpret_u8_p16 (poly16x4_t t);
#define vreinterpret_u8_p16

_NEON2SSE_GLOBAL uint8x8_t vreinterpret_u8_p8 (poly8x8_t t);
#define vreinterpret_u8_p8

_NEON2SSE_GLOBAL uint8x16_t vreinterpretq_u8_u32 (uint32x4_t t);
#define vreinterpretq_u8_u32

_NEON2SSE_GLOBAL uint8x16_t vreinterpretq_u8_u16 (uint16x8_t t);
#define vreinterpretq_u8_u16

_NEON2SSE_GLOBAL uint8x16_t vreinterpretq_u8_s32 (int32x4_t t);
#define vreinterpretq_u8_s32

_NEON2SSE_GLOBAL uint8x16_t vreinterpretq_u8_s16 (int16x8_t t);
#define vreinterpretq_u8_s16

_NEON2SSE_GLOBAL uint8x16_t vreinterpretq_u8_s8 (int8x16_t t);
#define vreinterpretq_u8_s8

_NEON2SSE_GLOBAL uint8x16_t vreinterpretq_u8_u64 (uint64x2_t t);
#define vreinterpretq_u8_u64

_NEON2SSE_GLOBAL uint8x16_t vreinterpretq_u8_s64 (int64x2_t t);
#define vreinterpretq_u8_s64

_NEON2SSE_GLOBAL uint8x16_t vreinterpretq_u8_f32 (float32x4_t t);
#define vreinterpretq_u8_f32(t) _M128i(t)

_NEON2SSE_GLOBAL uint8x16_t vreinterpretq_u8_p16 (poly16x8_t t);
#define vreinterpretq_u8_p16

_NEON2SSE_GLOBAL uint8x16_t vreinterpretq_u8_p8 (poly8x16_t t);
#define vreinterpretq_u8_p8

_NEON2SSE_GLOBAL uint16x4_t vreinterpret_u16_u32 (uint32x2_t t);
#define vreinterpret_u16_u32

_NEON2SSE_GLOBAL uint16x4_t vreinterpret_u16_u8 (uint8x8_t t);
#define vreinterpret_u16_u8

_NEON2SSE_GLOBAL uint16x4_t vreinterpret_u16_s32 (int32x2_t t);
#define vreinterpret_u16_s32

_NEON2SSE_GLOBAL uint16x4_t vreinterpret_u16_s16 (int16x4_t t);
#define vreinterpret_u16_s16

_NEON2SSE_GLOBAL uint16x4_t vreinterpret_u16_s8 (int8x8_t t);
#define vreinterpret_u16_s8

_NEON2SSE_GLOBAL uint16x4_t vreinterpret_u16_u64 (uint64x1_t t);
#define vreinterpret_u16_u64

_NEON2SSE_GLOBAL uint16x4_t vreinterpret_u16_s64 (int64x1_t t);
#define vreinterpret_u16_s64

_NEON2SSE_GLOBAL uint16x4_t vreinterpret_u16_f32 (float32x2_t t);
#define vreinterpret_u16_f32

_NEON2SSE_GLOBAL uint16x4_t vreinterpret_u16_p16 (poly16x4_t t);
#define vreinterpret_u16_p16

_NEON2SSE_GLOBAL uint16x4_t vreinterpret_u16_p8 (poly8x8_t t);
#define vreinterpret_u16_p8

_NEON2SSE_GLOBAL uint16x8_t vreinterpretq_u16_u32 (uint32x4_t t);
#define vreinterpretq_u16_u32

_NEON2SSE_GLOBAL uint16x8_t vreinterpretq_u16_u8 (uint8x16_t t);
#define vreinterpretq_u16_u8

_NEON2SSE_GLOBAL uint16x8_t vreinterpretq_u16_s32 (int32x4_t t);
#define vreinterpretq_u16_s32

_NEON2SSE_GLOBAL uint16x8_t vreinterpretq_u16_s16 (int16x8_t t);
#define vreinterpretq_u16_s16

_NEON2SSE_GLOBAL uint16x8_t vreinterpretq_u16_s8 (int8x16_t t);
#define vreinterpretq_u16_s8

_NEON2SSE_GLOBAL uint16x8_t vreinterpretq_u16_u64 (uint64x2_t t);
#define vreinterpretq_u16_u64

_NEON2SSE_GLOBAL uint16x8_t vreinterpretq_u16_s64 (int64x2_t t);
#define vreinterpretq_u16_s64

_NEON2SSE_GLOBAL uint16x8_t vreinterpretq_u16_f32 (float32x4_t t);
#define vreinterpretq_u16_f32(t) _M128i(t)

_NEON2SSE_GLOBAL uint16x8_t vreinterpretq_u16_p16 (poly16x8_t t);
#define vreinterpretq_u16_p16

_NEON2SSE_GLOBAL uint16x8_t vreinterpretq_u16_p8 (poly8x16_t t);
#define vreinterpretq_u16_p8

_NEON2SSE_GLOBAL uint32x2_t vreinterpret_u32_u16 (uint16x4_t t);
#define vreinterpret_u32_u16

_NEON2SSE_GLOBAL uint32x2_t vreinterpret_u32_u8 (uint8x8_t t);
#define vreinterpret_u32_u8

_NEON2SSE_GLOBAL uint32x2_t vreinterpret_u32_s32 (int32x2_t t);
#define vreinterpret_u32_s32

_NEON2SSE_GLOBAL uint32x2_t vreinterpret_u32_s16 (int16x4_t t);
#define vreinterpret_u32_s16

_NEON2SSE_GLOBAL uint32x2_t vreinterpret_u32_s8 (int8x8_t t);
#define vreinterpret_u32_s8

_NEON2SSE_GLOBAL uint32x2_t vreinterpret_u32_u64 (uint64x1_t t);
#define vreinterpret_u32_u64

_NEON2SSE_GLOBAL uint32x2_t vreinterpret_u32_s64 (int64x1_t t);
#define vreinterpret_u32_s64

_NEON2SSE_GLOBAL uint32x2_t vreinterpret_u32_f32 (float32x2_t t);
#define vreinterpret_u32_f32

_NEON2SSE_GLOBAL uint32x2_t vreinterpret_u32_p16 (poly16x4_t t);
#define vreinterpret_u32_p16

_NEON2SSE_GLOBAL uint32x2_t vreinterpret_u32_p8 (poly8x8_t t);
#define vreinterpret_u32_p8

_NEON2SSE_GLOBAL uint32x4_t vreinterpretq_u32_u16 (uint16x8_t t);
#define vreinterpretq_u32_u16

_NEON2SSE_GLOBAL uint32x4_t vreinterpretq_u32_u8 (uint8x16_t t);
#define vreinterpretq_u32_u8

_NEON2SSE_GLOBAL uint32x4_t vreinterpretq_u32_s32 (int32x4_t t);
#define vreinterpretq_u32_s32

_NEON2SSE_GLOBAL uint32x4_t vreinterpretq_u32_s16 (int16x8_t t);
#define vreinterpretq_u32_s16

_NEON2SSE_GLOBAL uint32x4_t vreinterpretq_u32_s8 (int8x16_t t);
#define vreinterpretq_u32_s8

_NEON2SSE_GLOBAL uint32x4_t vreinterpretq_u32_u64 (uint64x2_t t);
#define vreinterpretq_u32_u64

_NEON2SSE_GLOBAL uint32x4_t vreinterpretq_u32_s64 (int64x2_t t);
#define vreinterpretq_u32_s64

_NEON2SSE_GLOBAL uint32x4_t vreinterpretq_u32_f32 (float32x4_t t);
#define vreinterpretq_u32_f32(t) _M128i(t)

_NEON2SSE_GLOBAL uint32x4_t vreinterpretq_u32_p16 (poly16x8_t t);
#define vreinterpretq_u32_p16

_NEON2SSE_GLOBAL uint32x4_t vreinterpretq_u32_p8 (poly8x16_t t);
#define vreinterpretq_u32_p8
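
//A short usage sketch for the reinterpret casts above. Because all 128-bit integer and polynomial
//vector types in this file map to __m128i, the integer<->integer reinterprets are defined as empty
//object-like macros: vreinterpretq_X_Y(t) simply expands to (t) at no run-time cost. Only the
//float32x4_t sources need the explicit __m128 -> __m128i cast _M128i. The example below assumes
//the vdupq_n_f32 mapping defined earlier in this file:
//
//    float32x4_t vf   = vdupq_n_f32(1.0f);          //{1.0f, 1.0f, 1.0f, 1.0f}
//    int32x4_t   bits = vreinterpretq_s32_f32(vf);  //expands to _M128i(vf): 0x3F800000 in each lane
//    uint8x16_t  raw  = vreinterpretq_u8_s32(bits); //empty macro, expands to (bits) - both are __m128i
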
//************* Round ******************
_NEON2SSESTORAGE float32x4_t vrndnq_f32(float32x4_t a);
#ifdef USE_SSE4
_NEON2SSE_INLINE float32x4_t vrndnq_f32(float32x4_t a)
{
    return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
#else
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( float32x4_t vrndnq_f32(float32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
{
    int i;
    _NEON2SSE_ALIGN_16 float32_t res[4];
    _mm_store_ps(res, a);
    for(i = 0; i<4; i++) {
        res[i] = nearbyintf(res[i]);
    }
    return _mm_load_ps(res);
}
#endif
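
//A minimal usage sketch for vrndnq_f32, assuming the vdupq_n_f32 and vgetq_lane_f32 mappings
//defined earlier in this file. Both branches implement NEON VRINTN (round to nearest, ties to even):
//the SSE4.1 branch via _MM_FROUND_TO_NEAREST_INT, the serial branch via nearbyintf(), which follows
//the current rounding mode and therefore matches only in the default round-to-nearest FP environment:
//
//    float32x4_t v  = vdupq_n_f32(2.5f);
//    float32x4_t r  = vrndnq_f32(v);        //each lane becomes 2.0f (halfway case rounds to even)
//    float       r0 = vgetq_lane_f32(r, 0); //2.0f
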
_NEON2SSESTORAGE float64x2_t vrndnq_f64(float64x2_t a);
#ifdef USE_SSE4
_NEON2SSE_INLINE float64x2_t vrndnq_f64(float64x2_t a)
{
    return _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
#else
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float64x2_t vrndnq_f64(float64x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
{
    _NEON2SSE_ALIGN_16 float64_t res[2];
    _mm_store_pd(res, a);
    res[0] = nearbyint(res[0]);
    res[1] = nearbyint(res[1]);
    return _mm_load_pd(res);
}
#endif

//************* Sqrt ******************
_NEON2SSE_GLOBAL float32x4_t vsqrtq_f32(float32x4_t a);
#define vsqrtq_f32 _mm_sqrt_ps

_NEON2SSE_GLOBAL float64x2_t vsqrtq_f64(float64x2_t a);
#define vsqrtq_f64 _mm_sqrt_pd
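
//A minimal usage sketch, assuming the vdupq_n_f32 mapping defined earlier in this file. vsqrtq_f32
//and vsqrtq_f64 map one-to-one onto _mm_sqrt_ps / _mm_sqrt_pd, which return the correctly rounded
//square root per lane, matching the result of the AArch64 FSQRT instruction behind NEON vsqrtq:
//
//    float32x4_t x = vdupq_n_f32(9.0f);
//    float32x4_t s = vsqrtq_f32(x);         //{3.0f, 3.0f, 3.0f, 3.0f}
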

#endif /* NEON2SSE_H */