/*	$NetBSD: arm_neon.h,v 1.2 2023/08/07 01:14:19 rin Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef	_SYS_CRYPTO_ARCH_ARM_ARM_NEON_H
#define	_SYS_CRYPTO_ARCH_ARM_ARM_NEON_H

#include <sys/types.h>		/* uintN_t, __arraycount via <sys/cdefs.h> */

#if defined(__GNUC__) && !defined(__clang__)

#define	_INTRINSATTR							      \
	__extension__							      \
	__attribute__((__always_inline__, __gnu_inline__, __artificial__))

#ifdef __aarch64__
typedef __Int32x4_t int32x4_t;
typedef __Int64x2_t int64x2_t;
typedef __Int8x16_t int8x16_t;
typedef __Uint16x8_t uint16x8_t;
typedef __Uint32x4_t uint32x4_t;
typedef __Uint64x2_t uint64x2_t;
typedef __Uint8x16_t uint8x16_t;
typedef struct { uint8x16_t val[2]; } uint8x16x2_t;
#else
typedef __simd128_int32_t int32x4_t;
typedef __simd128_int64_t int64x2_t;
typedef __simd128_int8_t int8x16_t;
typedef __simd128_uint16_t uint16x8_t;
typedef __simd128_uint32_t uint32x4_t;
typedef __simd128_uint64_t uint64x2_t;
typedef __simd128_uint8_t uint8x16_t;

typedef __simd64_int8_t int8x8_t;
typedef __simd64_uint8_t uint8x8_t;
typedef __builtin_neon_udi uint64x1_t;
typedef struct { uint8x8_t val[2]; } uint8x8x2_t;
typedef struct { uint8x16_t val[2]; } uint8x16x2_t;
#endif

#if defined(__AARCH64EB__)
#define	__neon_lane_index(__v, __i)	(__arraycount(__v) - 1 - (__i))
#define	__neon_laneq_index(__v, __i)	(__arraycount(__v) - 1 - (__i))
#elif defined(__ARM_BIG_ENDIAN)
#define	__neon_lane_index(__v, __i)	((__i) ^ (__arraycount(__v) - 1))
#define	__neon_laneq_index(__v, __i)	((__i) ^ (__arraycount(__v)/2 - 1))
#else
#define	__neon_lane_index(__v, __i)	(__i)
#define	__neon_laneq_index(__v, __i)	(__i)
#endif

#elif defined(__clang__)

#define	_INTRINSATTR							      \
	__attribute__((__always_inline__, __nodebug__))

typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t;
typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t;
typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t;

typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t;
typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t;
typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t;
typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t;

typedef __attribute__((neon_vector_type(8))) int8_t int8x8_t;

typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t;

typedef struct { uint8x8_t val[2]; } uint8x8x2_t;
typedef struct { uint8x16_t val[2]; } uint8x16x2_t;

#ifdef __LITTLE_ENDIAN__
#define	__neon_lane_index(__v, __i)	(__i)
#define	__neon_laneq_index(__v, __i)	(__i)
#else
#define	__neon_lane_index(__v, __i)	(__arraycount(__v) - 1 - (__i))
#define	__neon_laneq_index(__v, __i)	(__arraycount(__v) - 1 - (__i))
#endif

#else

#error Teach me how to NEON with your compiler!

#endif

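/*
 * A note on lane numbering: on big-endian targets the compiler's
 * notion of element order within a vector is the reverse of the
 * architectural NEON lane number, so the __neon_lane_index and
 * __neon_laneq_index macros above remap indices.  An illustrative
 * sketch (assuming a big-endian aarch64 target; not part of this
 * header's API):
 *
 *	uint32x4_t __v;
 *	// __neon_laneq_index(__v, 0) == 3 ... __neon_laneq_index(__v, 3) == 0
 *
 * so e.g. vgetq_lane_u32(__v, __i) below returns the architecturally
 * numbered lane __i regardless of byte order.
 */
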
_INTRINSATTR
static __inline uint32x4_t
vaddq_u32(uint32x4_t __v0, uint32x4_t __v1)
{
	return __v0 + __v1;
}

_INTRINSATTR
static __inline uint32x4_t
vcltq_s32(int32x4_t __v0, int32x4_t __v1)
{
	return (uint32x4_t)(__v0 < __v1);
}

_INTRINSATTR
static __inline int32x4_t
vdupq_n_s32(int32_t __x)
{
	return (int32x4_t) { __x, __x, __x, __x };
}

_INTRINSATTR
static __inline uint32x4_t
vdupq_n_u32(uint32_t __x)
{
	return (uint32x4_t) { __x, __x, __x, __x };
}

_INTRINSATTR
static __inline uint8x16_t
vdupq_n_u8(uint8_t __x)
{
	return (uint8x16_t) {
		__x, __x, __x, __x, __x, __x, __x, __x,
		__x, __x, __x, __x, __x, __x, __x, __x,
	};
}

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint32x4_t
vextq_u32(uint32x4_t __lo, uint32x4_t __hi, uint8_t __i)
{
#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN)
	return __builtin_shuffle(__hi, __lo,
	    (uint32x4_t) { 4 - __i, 5 - __i, 6 - __i, 7 - __i });
#else
	return __builtin_shuffle(__lo, __hi,
	    (uint32x4_t) { __i + 0, __i + 1, __i + 2, __i + 3 });
#endif
}
#elif defined(__clang__)
#ifdef __LITTLE_ENDIAN__
#define	vextq_u32(__lo, __hi, __i)					      \
	(uint32x4_t)__builtin_neon_vextq_v((int8x16_t)(__lo),		      \
	    (int8x16_t)(__hi), (__i), 50)
#else
#define	vextq_u32(__lo, __hi, __i) (					      \
{									      \
	uint32x4_t __tlo = (__lo);					      \
	uint32x4_t __thi = (__hi);					      \
	uint32x4_t __lo_r = __builtin_shufflevector(__tlo, __tlo, 3,2,1,0);   \
	uint32x4_t __hi_r = __builtin_shufflevector(__thi, __thi, 3,2,1,0);   \
	uint32x4_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r,	      \
	    (int8x16_t)__hi_r, __i, 50);				      \
	__builtin_shufflevector(__r, __r, 3,2,1,0);			      \
})
#endif	/* __LITTLE_ENDIAN__ */
#endif
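
/*
 * Example (illustrative only, not part of this header): vextq_u32
 * concatenates the upper 4 - __i lanes of __lo with the lower __i
 * lanes of __hi; with both operands equal it is a lane rotation:
 *
 *	uint32x4_t __r = vextq_u32(__v, __v, 1);
 *	// __r = { __v[1], __v[2], __v[3], __v[0] }
 */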

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint8x16_t
vextq_u8(uint8x16_t __lo, uint8x16_t __hi, uint8_t __i)
{
#ifdef __aarch64__
#if defined(__AARCH64EB__)
	return __builtin_shuffle(__hi, __lo,
	    (uint8x16_t) {
		16 - __i, 17 - __i, 18 - __i, 19 - __i,
		20 - __i, 21 - __i, 22 - __i, 23 - __i,
		24 - __i, 25 - __i, 26 - __i, 27 - __i,
		28 - __i, 29 - __i, 30 - __i, 31 - __i,
	});
#else
	return __builtin_shuffle(__lo, __hi,
	    (uint8x16_t) {
		__i +  0, __i +  1, __i +  2, __i +  3,
		__i +  4, __i +  5, __i +  6, __i +  7,
		__i +  8, __i +  9, __i + 10, __i + 11,
		__i + 12, __i + 13, __i + 14, __i + 15,
	});
#endif
#else
	return (uint8x16_t)__builtin_neon_vextv16qi((int8x16_t)__lo,
	    (int8x16_t)__hi, __i);
#endif
}
#elif defined(__clang__)
#ifdef __LITTLE_ENDIAN__
#define	vextq_u8(__lo, __hi, __i)					      \
	(uint8x16_t)__builtin_neon_vextq_v((int8x16_t)(__lo),		      \
	    (int8x16_t)(__hi), (__i), 48)
#else
#define	vextq_u8(__lo, __hi, __i) (					      \
{									      \
	uint8x16_t __tlo = (__lo);					      \
	uint8x16_t __thi = (__hi);					      \
	uint8x16_t __lo_r = __builtin_shufflevector(__tlo, __tlo,	      \
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);			      \
	uint8x16_t __hi_r = __builtin_shufflevector(__thi, __thi,	      \
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);			      \
	uint8x16_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r,	      \
	    (int8x16_t)__hi_r, (__i), 48);				      \
	__builtin_shufflevector(__r, __r,				      \
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);			      \
})
#endif	/* __LITTLE_ENDIAN__ */
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint32_t
vgetq_lane_u32(uint32x4_t __v, uint8_t __i)
{
#ifdef __aarch64__
	return __v[__neon_laneq_index(__v, __i)];
#else
	return (uint32_t)__builtin_neon_vget_laneuv4si((int32x4_t)__v, __i);
#endif
}
#elif defined(__clang__)
#define	vgetq_lane_u32(__v, __i)					      \
	(uint32_t)__builtin_neon_vgetq_lane_i32((int32x4_t)(__v),	      \
	    __neon_laneq_index(__v, __i))
#endif

_INTRINSATTR
static __inline uint32x4_t
vld1q_u32(const uint32_t *__p32)
{
#if defined(__GNUC__) && !defined(__clang__)
#ifdef __aarch64__
	const __builtin_aarch64_simd_si *__p =
	    (const __builtin_aarch64_simd_si *)__p32;

	return (uint32x4_t)__builtin_aarch64_ld1v4si(__p);
#else
	const __builtin_neon_si *__p = (const __builtin_neon_si *)__p32;

	return (uint32x4_t)__builtin_neon_vld1v4si(__p);
#endif
#elif defined(__clang__)
	uint32x4_t __v = (uint32x4_t)__builtin_neon_vld1q_v(__p32, 50);
#ifndef __LITTLE_ENDIAN__
	__v = __builtin_shufflevector(__v, __v, 3,2,1,0);
#endif
	return __v;
#endif
}

_INTRINSATTR
static __inline uint8x16_t
vld1q_u8(const uint8_t *__p8)
{
#if defined(__GNUC__) && !defined(__clang__)
#ifdef __aarch64__
	const __builtin_aarch64_simd_qi *__p =
	    (const __builtin_aarch64_simd_qi *)__p8;

	return (uint8x16_t)__builtin_aarch64_ld1v16qi(__p);
#else
	const __builtin_neon_qi *__p = (const __builtin_neon_qi *)__p8;

	return (uint8x16_t)__builtin_neon_vld1v16qi(__p);
#endif
#elif defined(__clang__)
	uint8x16_t __v = (uint8x16_t)__builtin_neon_vld1q_v(__p8, 48);
#ifndef __LITTLE_ENDIAN__
	__v = __builtin_shufflevector(__v, __v,
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
#endif
	return __v;
#endif
}

_INTRINSATTR
static __inline uint8x16_t
vqtbl1q_u8(uint8x16_t __tab, uint8x16_t __idx)
{
#if defined(__GNUC__) && !defined(__clang__)
#ifdef __aarch64__
	uint8x16_t __res;
	__asm__("tbl %0.16b, {%1.16b}, %2.16b"
	    : "=w"(__res) : "w"(__tab), "w"(__idx));
	return __res;
#else
	/*
	 * No native ARMv7 NEON instruction for this, so do it via two
	 * half-width TBLs instead (vtbl2_u8 equivalent).
	 */
	uint64x2_t __tab64 = (uint64x2_t)__tab;
	uint8x8_t __tablo = (uint8x8_t)__tab64[0];
	uint8x8_t __tabhi = (uint8x8_t)__tab64[1];
	uint8x8x2_t __tab8x8x2 = { { __tablo, __tabhi } };
	union {
		uint8x8x2_t __u8x8x2;
		__builtin_neon_ti __ti;
	} __u = { __tab8x8x2 };
	uint64x2_t __idx64, __out64;
	int8x8_t __idxlo, __idxhi, __outlo, __outhi;

	__idx64 = (uint64x2_t)__idx;
	__idxlo = (int8x8_t)__idx64[0];
	__idxhi = (int8x8_t)__idx64[1];
	__outlo = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxlo);
	__outhi = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxhi);
	__out64 = (uint64x2_t) { (uint64x1_t)__outlo, (uint64x1_t)__outhi };

	return (uint8x16_t)__out64;
#endif
#elif defined(__clang__)
#ifndef __LITTLE_ENDIAN__
	__tab = __builtin_shufflevector(__tab, __tab,
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
	__idx = __builtin_shufflevector(__idx, __idx,
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
#endif
	uint8x16_t __r;
#ifdef __aarch64__
	__r = __builtin_neon_vqtbl1q_v((int8x16_t)__tab, (int8x16_t)__idx, 48);
#else
	uint64x2_t __tab64 = (uint64x2_t)__tab;
	uint8x8_t __tablo = (uint8x8_t)__tab64[0];
	uint8x8_t __tabhi = (uint8x8_t)__tab64[1];
	uint64x2_t __idx64, __out64;
	int8x8_t __idxlo, __idxhi, __outlo, __outhi;

	__idx64 = (uint64x2_t)__idx;
	__idxlo = (int8x8_t)__idx64[0];
	__idxhi = (int8x8_t)__idx64[1];
	__outlo = (int8x8_t)__builtin_neon_vtbl2_v((int8x8_t)__tablo,
	    (int8x8_t)__tabhi, (int8x8_t)__idxlo, 16);
	__outhi = (int8x8_t)__builtin_neon_vtbl2_v((int8x8_t)__tablo,
	    (int8x8_t)__tabhi, (int8x8_t)__idxhi, 16);
	__out64 = (uint64x2_t) { (uint64_t)__outlo, (uint64_t)__outhi };
	__r = (uint8x16_t)__out64;
#endif
#ifndef __LITTLE_ENDIAN__
	__r = __builtin_shufflevector(__r, __r,
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
#endif
	return __r;
#endif
}
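
/*
 * Example (illustrative only, not part of this header): vqtbl1q_u8
 * selects bytes of __tab by the byte indices in __idx; indices outside
 * [0,15] yield zero.  Reversing the bytes within each 32-bit word,
 * equivalent to vrev32q_u8 below:
 *
 *	uint8x16_t __sel = (uint8x16_t) {
 *		3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12,
 *	};
 *	uint8x16_t __r = vqtbl1q_u8(__v, __sel);
 */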

_INTRINSATTR
static __inline int32x4_t
vreinterpretq_s32_u8(uint8x16_t __v)
{
	return (int32x4_t)__v;
}

_INTRINSATTR
static __inline uint16x8_t
vreinterpretq_u16_u32(uint32x4_t __v)
{
	return (uint16x8_t)__v;
}

_INTRINSATTR
static __inline uint32x4_t
vreinterpretq_u32_u16(uint16x8_t __v)
{
	return (uint32x4_t)__v;
}

_INTRINSATTR
static __inline uint32x4_t
vreinterpretq_u32_u64(uint64x2_t __v)
{
	return (uint32x4_t)__v;
}

_INTRINSATTR
static __inline uint32x4_t
vreinterpretq_u32_u8(uint8x16_t __v)
{
	return (uint32x4_t)__v;
}

_INTRINSATTR
static __inline uint64x2_t
vreinterpretq_u64_u32(uint32x4_t __v)
{
	return (uint64x2_t)__v;
}

_INTRINSATTR
static __inline uint64x2_t
vreinterpretq_u64_u8(uint8x16_t __v)
{
	return (uint64x2_t)__v;
}

_INTRINSATTR
static __inline uint8x16_t
vreinterpretq_u8_s32(int32x4_t __v)
{
	return (uint8x16_t)__v;
}

_INTRINSATTR
static __inline uint8x16_t
vreinterpretq_u8_u32(uint32x4_t __v)
{
	return (uint8x16_t)__v;
}

_INTRINSATTR
static __inline uint8x16_t
vreinterpretq_u8_u64(uint64x2_t __v)
{
	return (uint8x16_t)__v;
}

_INTRINSATTR
static __inline uint16x8_t
vrev32q_u16(uint16x8_t __v)
{
#if defined(__GNUC__) && !defined(__clang__)
	return __builtin_shuffle(__v, (uint16x8_t) { 1,0, 3,2, 5,4, 7,6 });
#elif defined(__clang__)
	return __builtin_shufflevector(__v, __v, 1,0, 3,2, 5,4, 7,6);
#endif
}

_INTRINSATTR
static __inline uint8x16_t
vrev32q_u8(uint8x16_t __v)
{
#if defined(__GNUC__) && !defined(__clang__)
	return __builtin_shuffle(__v,
	    (uint8x16_t) { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 });
#elif defined(__clang__)
	return __builtin_shufflevector(__v, __v,
	    3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12);
#endif
}
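
/*
 * Example (illustrative only): on a little-endian CPU, four big-endian
 * 32-bit words at __p can be loaded and byte-swapped in one step:
 *
 *	uint32x4_t __w = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(__p)));
 */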

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint32x4_t
vsetq_lane_u32(uint32_t __x, uint32x4_t __v, uint8_t __i)
{
	__v[__neon_laneq_index(__v, __i)] = __x;
	return __v;
}
#elif defined(__clang__)
#define	vsetq_lane_u32(__x, __v, __i)					      \
	(uint32x4_t)__builtin_neon_vsetq_lane_i32((__x), (int32x4_t)(__v),    \
	    __neon_laneq_index(__v, __i))
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint64x2_t
vsetq_lane_u64(uint64_t __x, uint64x2_t __v, uint8_t __i)
{
	__v[__neon_laneq_index(__v, __i)] = __x;
	return __v;
}
#elif defined(__clang__)
#define	vsetq_lane_u64(__x, __v, __i)					      \
	(uint64x2_t)__builtin_neon_vsetq_lane_i64((__x), (int64x2_t)(__v),    \
	    __neon_laneq_index(__v, __i))
#endif
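
/*
 * Example (illustrative only): assembling a vector from scalar lanes
 * with the vsetq_lane_* intrinsics above; lane numbers are the
 * architectural ones, so this is byte-order independent:
 *
 *	uint64x2_t __v = vreinterpretq_u64_u32(vdupq_n_u32(0));
 *	__v = vsetq_lane_u64(__x0, __v, 0);
 *	__v = vsetq_lane_u64(__x1, __v, 1);
 */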

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline int32x4_t
vshlq_n_s32(int32x4_t __v, uint8_t __bits)
{
#ifdef __aarch64__
	return (int32x4_t)__builtin_aarch64_ashlv4si(__v, __bits);
#else
	return (int32x4_t)__builtin_neon_vshl_nv4si(__v, __bits);
#endif
}
#elif defined(__clang__)
#define	vshlq_n_s32(__v, __bits)					      \
	(int32x4_t)__builtin_neon_vshlq_n_v((int32x4_t)(__v), (__bits), 34)
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint32x4_t
vshlq_n_u32(uint32x4_t __v, uint8_t __bits)
{
#ifdef __aarch64__
	return (uint32x4_t)__builtin_aarch64_ashlv4si((int32x4_t)__v, __bits);
#else
	return (uint32x4_t)__builtin_neon_vshl_nv4si((int32x4_t)__v, __bits);
#endif
}
#elif defined(__clang__)
#define	vshlq_n_u32(__v, __bits)					      \
	(uint32x4_t)__builtin_neon_vshlq_n_v((int32x4_t)(__v), (__bits), 50)
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint32x4_t
vshrq_n_u32(uint32x4_t __v, uint8_t __bits)
{
#ifdef __aarch64__
#  if __GNUC_PREREQ__(12, 0)
	return __builtin_aarch64_lshrv4si_uus(__v, __bits);
#  else
	return (uint32x4_t)__builtin_aarch64_lshrv4si((int32x4_t)__v, __bits);
#  endif
#else
	return (uint32x4_t)__builtin_neon_vshru_nv4si((int32x4_t)__v, __bits);
#endif
}
#elif defined(__clang__)
#define	vshrq_n_u32(__v, __bits)					      \
	(uint32x4_t)__builtin_neon_vshrq_n_v((int32x4_t)(__v), (__bits), 50)
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint8x16_t
vshrq_n_u8(uint8x16_t __v, uint8_t __bits)
{
#ifdef __aarch64__
#  if __GNUC_PREREQ__(12, 0)
	return __builtin_aarch64_lshrv16qi_uus(__v, __bits);
#  else
	return (uint8x16_t)__builtin_aarch64_lshrv16qi((int8x16_t)__v, __bits);
#  endif
#else
	return (uint8x16_t)__builtin_neon_vshru_nv16qi((int8x16_t)__v, __bits);
#endif
}
#elif defined(__clang__)
#define	vshrq_n_u8(__v, __bits)						      \
	(uint8x16_t)__builtin_neon_vshrq_n_v((int8x16_t)(__v), (__bits), 48)
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline int32x4_t
vsliq_n_s32(int32x4_t __vins, int32x4_t __vsh, uint8_t __bits)
{
#ifdef __aarch64__
	return (int32x4_t)__builtin_aarch64_ssli_nv4si(__vins, __vsh, __bits);
#else
	return (int32x4_t)__builtin_neon_vsli_nv4si(__vins, __vsh, __bits);
#endif
}
#elif defined(__clang__)
#ifdef __LITTLE_ENDIAN__
#define	vsliq_n_s32(__vins, __vsh, __bits)				      \
	(int32x4_t)__builtin_neon_vsliq_n_v((int32x4_t)(__vins),	      \
	    (int32x4_t)(__vsh), (__bits), 34)
#else
#define	vsliq_n_s32(__vins, __vsh, __bits) (				      \
{									      \
	int32x4_t __tvins = (__vins);					      \
	int32x4_t __tvsh = (__vsh);					      \
	uint8_t __tbits = (__bits);					      \
	int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins,	      \
	    3,2,1,0);							      \
	int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh,	      \
	    3,2,1,0);							      \
	int32x4_t __r = __builtin_neon_vsliq_n_v(__vins_r, __vsh_r, __tbits,  \
	    34);							      \
	__builtin_shufflevector(__r, __r, 3,2,1,0);			      \
})
#endif	/* __LITTLE_ENDIAN__ */
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint32x4_t
vsriq_n_u32(uint32x4_t __vins, uint32x4_t __vsh, uint8_t __bits)
{
#ifdef __aarch64__
	return __builtin_aarch64_usri_nv4si_uuus(__vins, __vsh, __bits);
#else
	return (uint32x4_t)__builtin_neon_vsri_nv4si((int32x4_t)__vins,
	    (int32x4_t)__vsh, __bits);
#endif
}
#elif defined(__clang__)
#ifdef __LITTLE_ENDIAN__
#define	vsriq_n_u32(__vins, __vsh, __bits)				      \
	(uint32x4_t)__builtin_neon_vsriq_n_v((int32x4_t)(__vins),	      \
	    (int32x4_t)(__vsh), (__bits), 34)
#else
#define	vsriq_n_u32(__vins, __vsh, __bits) (				      \
{									      \
	uint32x4_t __tvins = (__vins);					      \
	uint32x4_t __tvsh = (__vsh);					      \
	uint8_t __tbits = (__bits);					      \
	uint32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins,	      \
	    3,2,1,0);							      \
	uint32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh,	      \
	    3,2,1,0);							      \
	uint32x4_t __r = (uint32x4_t)__builtin_neon_vsriq_n_v(		      \
	    (int32x4_t)__vins_r, (int32x4_t)__vsh_r, __tbits, 34);	      \
	__builtin_shufflevector(__r, __r, 3,2,1,0);			      \
})
#endif	/* __LITTLE_ENDIAN__ */
#endif
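
/*
 * Example (illustrative only): NEON has no vector rotate instruction,
 * so a lanewise 32-bit rotate is the usual shift-and-insert idiom
 * built from vshlq_n_u32 and vsriq_n_u32 above, as in ChaCha:
 *
 *	// __x = (__x << 7) | (__x >> 25), lanewise
 *	__x = vsriq_n_u32(vshlq_n_u32(__x, 7), __x, 25);
 */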

_INTRINSATTR
static __inline void
vst1q_u32(uint32_t *__p32, uint32x4_t __v)
{
#if defined(__GNUC__) && !defined(__clang__)
#ifdef __aarch64__
	__builtin_aarch64_simd_si *__p = (__builtin_aarch64_simd_si *)__p32;

	__builtin_aarch64_st1v4si(__p, (int32x4_t)__v);
#else
	__builtin_neon_si *__p = (__builtin_neon_si *)__p32;

	__builtin_neon_vst1v4si(__p, (int32x4_t)__v);
#endif
#elif defined(__clang__)
#ifndef __LITTLE_ENDIAN__
	__v = __builtin_shufflevector(__v, __v, 3,2,1,0);
#endif
	__builtin_neon_vst1q_v(__p32, __v, 50);
#endif
}

_INTRINSATTR
static __inline void
vst1q_u8(uint8_t *__p8, uint8x16_t __v)
{
#if defined(__GNUC__) && !defined(__clang__)
#ifdef __aarch64__
	__builtin_aarch64_simd_qi *__p = (__builtin_aarch64_simd_qi *)__p8;

	__builtin_aarch64_st1v16qi(__p, (int8x16_t)__v);
#else
	__builtin_neon_qi *__p = (__builtin_neon_qi *)__p8;

	__builtin_neon_vst1v16qi(__p, (int8x16_t)__v);
#endif
#elif defined(__clang__)
#ifndef __LITTLE_ENDIAN__
	__v = __builtin_shufflevector(__v, __v,
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
#endif
	__builtin_neon_vst1q_v(__p8, __v, 48);
#endif
}
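
/*
 * Example (illustrative only): a load/modify/store round trip through
 * the 128-bit load/store intrinsics above:
 *
 *	uint8x16_t __b = vld1q_u8(__buf);
 *	__b = vrev32q_u8(__b);	// swap bytes within each 32-bit word
 *	vst1q_u8(__buf, __b);
 */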

#ifndef __aarch64__		/* XXX */

_INTRINSATTR
static __inline uint8x8_t
vtbl1_u8(uint8x8_t __tab, uint8x8_t __idx)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (uint8x8_t)__builtin_neon_vtbl1v8qi((int8x8_t)__tab,
	    (int8x8_t)__idx);
#elif defined(__clang__)
	uint8x8_t __ret;
#ifndef __LITTLE_ENDIAN__
	__tab = __builtin_shufflevector(__tab, __tab, 7,6,5,4,3,2,1,0);
	__idx = __builtin_shufflevector(__idx, __idx, 7,6,5,4,3,2,1,0);
#endif
	__ret = (uint8x8_t)__builtin_neon_vtbl1_v((int8x8_t)__tab,
	    (int8x8_t)__idx, 16);
#ifndef __LITTLE_ENDIAN__
	__ret = __builtin_shufflevector(__ret, __ret, 7,6,5,4,3,2,1,0);
#endif
	return __ret;
#endif
}

_INTRINSATTR
static __inline uint8x8_t
vtbl2_u8(uint8x8x2_t __tab, uint8x8_t __idx)
{
#if defined(__GNUC__) && !defined(__clang__)
	union {
		uint8x8x2_t __u8x8x2;
		__builtin_neon_ti __ti;
	} __u = { __tab };
	return (uint8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, (int8x8_t)__idx);
#elif defined(__clang__)
	uint8x8_t __ret;
#ifndef __LITTLE_ENDIAN__
	__tab.val[0] = __builtin_shufflevector(__tab.val[0], __tab.val[0],
	    7,6,5,4,3,2,1,0);
	__tab.val[1] = __builtin_shufflevector(__tab.val[1], __tab.val[1],
	    7,6,5,4,3,2,1,0);
	__idx = __builtin_shufflevector(__idx, __idx, 7,6,5,4,3,2,1,0);
#endif
	__ret = (uint8x8_t)__builtin_neon_vtbl2_v((int8x8_t)__tab.val[0],
	    (int8x8_t)__tab.val[1], (int8x8_t)__idx, 16);
#ifndef __LITTLE_ENDIAN__
	__ret = __builtin_shufflevector(__ret, __ret, 7,6,5,4,3,2,1,0);
#endif
	return __ret;
#endif
}

#endif	/* !defined(__aarch64__) */

#endif	/* _SYS_CRYPTO_ARCH_ARM_ARM_NEON_H */