/*	$NetBSD: arm_neon.h,v 1.2 2023/08/07 01:14:19 rin Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef	_SYS_CRYPTO_ARCH_ARM_ARM_NEON_H
#define	_SYS_CRYPTO_ARCH_ARM_ARM_NEON_H

#if defined(__GNUC__) && !defined(__clang__)

#define	_INTRINSATTR							\
	__extension__							\
	__attribute__((__always_inline__, __gnu_inline__, __artificial__))

#ifdef __aarch64__
typedef __Int32x4_t int32x4_t;
typedef __Int64x2_t int64x2_t;
typedef __Int8x16_t int8x16_t;
typedef __Uint16x8_t uint16x8_t;
typedef __Uint32x4_t uint32x4_t;
typedef __Uint64x2_t uint64x2_t;
typedef __Uint8x16_t uint8x16_t;
typedef struct { uint8x16_t val[2]; } uint8x16x2_t;
#else
typedef __simd128_int32_t int32x4_t;
typedef __simd128_int64_t int64x2_t;
typedef __simd128_int8_t int8x16_t;
typedef __simd128_uint16_t uint16x8_t;
typedef __simd128_uint32_t uint32x4_t;
typedef __simd128_uint64_t uint64x2_t;
typedef __simd128_uint8_t uint8x16_t;

typedef __simd64_int8_t int8x8_t;
typedef __simd64_uint8_t uint8x8_t;
typedef __builtin_neon_udi uint64x1_t;
typedef struct { uint8x8_t val[2]; } uint8x8x2_t;
typedef struct { uint8x16_t val[2]; } uint8x16x2_t;
#endif

#if defined(__AARCH64EB__)
#define	__neon_lane_index(__v, __i)	(__arraycount(__v) - 1 - (__i))
#define	__neon_laneq_index(__v, __i)	(__arraycount(__v) - 1 - (__i))
#elif defined(__ARM_BIG_ENDIAN)
#define	__neon_lane_index(__v, __i)	((__i) ^ (__arraycount(__v) - 1))
#define	__neon_laneq_index(__v, __i)	((__i) ^ (__arraycount(__v)/2 - 1))
#else
#define	__neon_lane_index(__v, __i)	(__i)
#define	__neon_laneq_index(__v, __i)	(__i)
#endif

#elif defined(__clang__)

#define	_INTRINSATTR							\
	__attribute__((__always_inline__, __nodebug__))

typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t;
typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t;
typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t;

typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t;
typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t;
typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t;
typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t;

typedef __attribute__((neon_vector_type(8))) int8_t int8x8_t;

typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t;

typedef struct { uint8x8_t val[2]; } uint8x8x2_t;
typedef struct { uint8x16_t val[2]; } uint8x16x2_t;

#ifdef __LITTLE_ENDIAN__
#define	__neon_lane_index(__v, __i)	__i
#define	__neon_laneq_index(__v, __i)	__i
#else
#define	__neon_lane_index(__v, __i)	(__arraycount(__v) - 1 - __i)
#define	__neon_laneq_index(__v, __i)	(__arraycount(__v) - 1 - __i)
#endif

#else

#error Teach me how to neon in your compile target!

#endif
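
/*
 * Usage sketch (illustrative): __neon_lane_index/__neon_laneq_index
 * map an architectural lane number to the element index used by the
 * compiler's vector extension, which runs in the opposite direction on
 * big-endian targets.  Loads establish architectural lane order from
 * memory, so a caller sees the same lane on either endianness:
 *
 *	static const uint32_t p[4] = { 10, 11, 12, 13 };
 *	uint32_t x = vgetq_lane_u32(vld1q_u32(p), 1);	(x == 11, LE or BE)
 */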

_INTRINSATTR
static __inline uint32x4_t
vaddq_u32(uint32x4_t __v0, uint32x4_t __v1)
{
	return __v0 + __v1;
}

_INTRINSATTR
static __inline uint32x4_t
vcltq_s32(int32x4_t __v0, int32x4_t __v1)
{
	return (uint32x4_t)(__v0 < __v1);
}

_INTRINSATTR
static __inline int32x4_t
vdupq_n_s32(int32_t __x)
{
	return (int32x4_t) { __x, __x, __x, __x };
}

_INTRINSATTR
static __inline uint32x4_t
vdupq_n_u32(uint32_t __x)
{
	return (uint32x4_t) { __x, __x, __x, __x };
}

_INTRINSATTR
static __inline uint8x16_t
vdupq_n_u8(uint8_t __x)
{
	return (uint8x16_t) {
		__x, __x, __x, __x, __x, __x, __x, __x,
		__x, __x, __x, __x, __x, __x, __x, __x,
	};
}
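
/*
 * Usage sketch (illustrative; `words' is a hypothetical array of four
 * uint32_t): splat a scalar into every lane and combine elementwise.
 *
 *	uint32x4_t x = vaddq_u32(vld1q_u32(words), vdupq_n_u32(0x9e3779b9));
 */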

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint32x4_t
vextq_u32(uint32x4_t __lo, uint32x4_t __hi, uint8_t __i)
{
#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN)
	return __builtin_shuffle(__hi, __lo,
	    (uint32x4_t) { 4 - __i, 5 - __i, 6 - __i, 7 - __i });
#else
	return __builtin_shuffle(__lo, __hi,
	    (uint32x4_t) { __i + 0, __i + 1, __i + 2, __i + 3 });
#endif
}
#elif defined(__clang__)
#ifdef __LITTLE_ENDIAN__
#define	vextq_u32(__lo, __hi, __i)					\
	(uint32x4_t)__builtin_neon_vextq_v((int8x16_t)(__lo),		\
	    (int8x16_t)(__hi), (__i), 50)
#else
#define	vextq_u32(__lo, __hi, __i) (					\
{									\
	uint32x4_t __tlo = (__lo);					\
	uint32x4_t __thi = (__hi);					\
	uint32x4_t __lo_r = __builtin_shufflevector(__tlo, __tlo,	\
	    3,2,1,0);							\
	uint32x4_t __hi_r = __builtin_shufflevector(__thi, __thi,	\
	    3,2,1,0);							\
	uint32x4_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r,	\
	    (int8x16_t)__hi_r, (__i), 50);				\
	__builtin_shufflevector(__r, __r, 3,2,1,0);			\
})
#endif	/* __LITTLE_ENDIAN__ */
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint8x16_t
vextq_u8(uint8x16_t __lo, uint8x16_t __hi, uint8_t __i)
{
#ifdef __aarch64__
#if defined(__AARCH64EB__)
	return __builtin_shuffle(__hi, __lo,
	    (uint8x16_t) {
		16 - __i, 17 - __i, 18 - __i, 19 - __i,
		20 - __i, 21 - __i, 22 - __i, 23 - __i,
		24 - __i, 25 - __i, 26 - __i, 27 - __i,
		28 - __i, 29 - __i, 30 - __i, 31 - __i,
	});
#else
	return __builtin_shuffle(__lo, __hi,
	    (uint8x16_t) {
		__i + 0, __i + 1, __i + 2, __i + 3,
		__i + 4, __i + 5, __i + 6, __i + 7,
		__i + 8, __i + 9, __i + 10, __i + 11,
		__i + 12, __i + 13, __i + 14, __i + 15,
	});
#endif
#else
	return (uint8x16_t)__builtin_neon_vextv16qi((int8x16_t)__lo,
	    (int8x16_t)__hi, __i);
#endif
}
#elif defined(__clang__)
#ifdef __LITTLE_ENDIAN__
#define	vextq_u8(__lo, __hi, __i)					\
	(uint8x16_t)__builtin_neon_vextq_v((int8x16_t)(__lo),		\
	    (int8x16_t)(__hi), (__i), 48)
#else
#define	vextq_u8(__lo, __hi, __i) (					\
{									\
	uint8x16_t __tlo = (__lo);					\
	uint8x16_t __thi = (__hi);					\
	uint8x16_t __lo_r = __builtin_shufflevector(__tlo, __tlo,	\
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);			\
	uint8x16_t __hi_r = __builtin_shufflevector(__thi, __thi,	\
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);			\
	uint8x16_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r,	\
	    (int8x16_t)__hi_r, (__i), 48);				\
	__builtin_shufflevector(__r, __r,				\
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);			\
})
#endif	/* __LITTLE_ENDIAN__ */
#endif
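
/*
 * Usage sketch (illustrative): vextq_u8 takes 16 consecutive bytes out
 * of the concatenation lo:hi starting at byte i, so passing the same
 * vector twice rotates it by whole bytes:
 *
 *	uint8x16_t rot = vextq_u8(v, v, 1);	(lane 0 takes old lane 1)
 */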

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint32_t
vgetq_lane_u32(uint32x4_t __v, uint8_t __i)
{
#ifdef __aarch64__
	return __v[__neon_laneq_index(__v, __i)];
#else
	return (uint32_t)__builtin_neon_vget_laneuv4si((int32x4_t)__v, __i);
#endif
}
#elif defined(__clang__)
#define	vgetq_lane_u32(__v, __i)					\
	(uint32_t)__builtin_neon_vgetq_lane_i32((int32x4_t)(__v),	\
	    __neon_laneq_index(__v, __i))
#endif
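
/*
 * Usage sketch (illustrative; `ctr' is a hypothetical uint32x4_t
 * counter): pull a single lane back out for scalar code.
 *
 *	uint32_t c0 = vgetq_lane_u32(ctr, 0);
 */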

_INTRINSATTR
static __inline uint32x4_t
vld1q_u32(const uint32_t *__p32)
{
#if defined(__GNUC__) && !defined(__clang__)
#ifdef __aarch64__
	const __builtin_aarch64_simd_si *__p =
	    (const __builtin_aarch64_simd_si *)__p32;

	return (uint32x4_t)__builtin_aarch64_ld1v4si(__p);
#else
	const __builtin_neon_si *__p = (const __builtin_neon_si *)__p32;

	return (uint32x4_t)__builtin_neon_vld1v4si(__p);
#endif
#elif defined(__clang__)
	uint32x4_t __v = (uint32x4_t)__builtin_neon_vld1q_v(__p32, 50);
#ifndef __LITTLE_ENDIAN__
	__v = __builtin_shufflevector(__v, __v, 3,2,1,0);
#endif
	return __v;
#endif
}

_INTRINSATTR
static __inline uint8x16_t
vld1q_u8(const uint8_t *__p8)
{
#if defined(__GNUC__) && !defined(__clang__)
#ifdef __aarch64__
	const __builtin_aarch64_simd_qi *__p =
	    (const __builtin_aarch64_simd_qi *)__p8;

	return (uint8x16_t)__builtin_aarch64_ld1v16qi(__p);
#else
	const __builtin_neon_qi *__p = (const __builtin_neon_qi *)__p8;

	return (uint8x16_t)__builtin_neon_vld1v16qi(__p);
#endif
#elif defined(__clang__)
	uint8x16_t __v = (uint8x16_t)__builtin_neon_vld1q_v(__p8, 48);
#ifndef __LITTLE_ENDIAN__
	__v = __builtin_shufflevector(__v, __v,
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
#endif
	return __v;
#endif
}

_INTRINSATTR
static __inline uint8x16_t
vqtbl1q_u8(uint8x16_t __tab, uint8x16_t __idx)
{
#if defined(__GNUC__) && !defined(__clang__)
#ifdef __aarch64__
	uint8x16_t __res;
	__asm__("tbl %0.16b, {%1.16b}, %2.16b"
	    : "=w"(__res) : "w"(__tab), "w"(__idx));
	return __res;
#else
	/*
	 * No native ARMv7 NEON instruction for this, so do it via two
	 * half-width TBLs instead (vtbl2_u8 equivalent).
	 */
	uint64x2_t __tab64 = (uint64x2_t)__tab;
	uint8x8_t __tablo = (uint8x8_t)__tab64[0];
	uint8x8_t __tabhi = (uint8x8_t)__tab64[1];
	uint8x8x2_t __tab8x8x2 = { { __tablo, __tabhi } };
	union {
		uint8x8x2_t __u8x8x2;
		__builtin_neon_ti __ti;
	} __u = { __tab8x8x2 };
	uint64x2_t __idx64, __out64;
	int8x8_t __idxlo, __idxhi, __outlo, __outhi;

	__idx64 = (uint64x2_t)__idx;
	__idxlo = (int8x8_t)__idx64[0];
	__idxhi = (int8x8_t)__idx64[1];
	__outlo = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxlo);
	__outhi = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxhi);
	__out64 = (uint64x2_t) { (uint64x1_t)__outlo, (uint64x1_t)__outhi };

	return (uint8x16_t)__out64;
#endif
#elif defined(__clang__)
#ifndef __LITTLE_ENDIAN__
	__tab = __builtin_shufflevector(__tab, __tab,
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
	__idx = __builtin_shufflevector(__idx, __idx,
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
#endif
	uint8x16_t __r;
#ifdef __aarch64__
	__r = __builtin_neon_vqtbl1q_v((int8x16_t)__tab, (int8x16_t)__idx, 48);
#else
	uint64x2_t __tab64 = (uint64x2_t)__tab;
	uint8x8_t __tablo = (uint8x8_t)__tab64[0];
	uint8x8_t __tabhi = (uint8x8_t)__tab64[1];
	uint64x2_t __idx64, __out64;
	int8x8_t __idxlo, __idxhi, __outlo, __outhi;

	__idx64 = (uint64x2_t)__idx;
	__idxlo = (int8x8_t)__idx64[0];
	__idxhi = (int8x8_t)__idx64[1];
	__outlo = (int8x8_t)__builtin_neon_vtbl2_v((int8x8_t)__tablo,
	    (int8x8_t)__tabhi, (int8x8_t)__idxlo, 16);
	__outhi = (int8x8_t)__builtin_neon_vtbl2_v((int8x8_t)__tablo,
	    (int8x8_t)__tabhi, (int8x8_t)__idxhi, 16);
	__out64 = (uint64x2_t) { (uint64_t)__outlo, (uint64_t)__outhi };
	__r = (uint8x16_t)__out64;
#endif
#ifndef __LITTLE_ENDIAN__
	__r = __builtin_shufflevector(__r, __r,
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
#endif
	return __r;
#endif
}
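
/*
 * Usage sketch (illustrative): vqtbl1q_u8 is a 16-entry byte table
 * lookup, and indices >= 16 select 0.  Reversing the bytes of a
 * vector:
 *
 *	static const uint8_t revidx[16] = {
 *		15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
 *	};
 *	uint8x16_t rev = vqtbl1q_u8(v, vld1q_u8(revidx));
 */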

_INTRINSATTR
static __inline int32x4_t
vreinterpretq_s32_u8(uint8x16_t __v)
{
	return (int32x4_t)__v;
}

_INTRINSATTR
static __inline uint16x8_t
vreinterpretq_u16_u32(uint32x4_t __v)
{
	return (uint16x8_t)__v;
}

_INTRINSATTR
static __inline uint32x4_t
vreinterpretq_u32_u16(uint16x8_t __v)
{
	return (uint32x4_t)__v;
}

_INTRINSATTR
static __inline uint32x4_t
vreinterpretq_u32_u64(uint64x2_t __v)
{
	return (uint32x4_t)__v;
}

_INTRINSATTR
static __inline uint32x4_t
vreinterpretq_u32_u8(uint8x16_t __v)
{
	return (uint32x4_t)__v;
}

_INTRINSATTR
static __inline uint64x2_t
vreinterpretq_u64_u32(uint32x4_t __v)
{
	return (uint64x2_t)__v;
}

_INTRINSATTR
static __inline uint64x2_t
vreinterpretq_u64_u8(uint8x16_t __v)
{
	return (uint64x2_t)__v;
}

_INTRINSATTR
static __inline uint8x16_t
vreinterpretq_u8_s32(int32x4_t __v)
{
	return (uint8x16_t)__v;
}

_INTRINSATTR
static __inline uint8x16_t
vreinterpretq_u8_u32(uint32x4_t __v)
{
	return (uint8x16_t)__v;
}

_INTRINSATTR
static __inline uint8x16_t
vreinterpretq_u8_u64(uint64x2_t __v)
{
	return (uint8x16_t)__v;
}

_INTRINSATTR
static __inline uint16x8_t
vrev32q_u16(uint16x8_t __v)
{
#if defined(__GNUC__) && !defined(__clang__)
	return __builtin_shuffle(__v, (uint16x8_t) { 1,0, 3,2, 5,4, 7,6 });
#elif defined(__clang__)
	return __builtin_shufflevector(__v, __v, 1,0, 3,2, 5,4, 7,6);
#endif
}

_INTRINSATTR
static __inline uint8x16_t
vrev32q_u8(uint8x16_t __v)
{
#if defined(__GNUC__) && !defined(__clang__)
	return __builtin_shuffle(__v,
	    (uint8x16_t) { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 });
#elif defined(__clang__)
	return __builtin_shufflevector(__v, __v,
	    3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12);
#endif
}
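
/*
 * Usage sketch (illustrative): vrev32q_u8 swaps the bytes within each
 * 32-bit word, e.g. to bring big-endian words loaded from memory into
 * host order on a little-endian machine:
 *
 *	uint32x4_t w = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(p)));
 */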

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint32x4_t
vsetq_lane_u32(uint32_t __x, uint32x4_t __v, uint8_t __i)
{
	__v[__neon_laneq_index(__v, __i)] = __x;
	return __v;
}
#elif defined(__clang__)
#define	vsetq_lane_u32(__x, __v, __i)					\
	(uint32x4_t)__builtin_neon_vsetq_lane_i32((__x), (int32x4_t)(__v), \
	    __neon_laneq_index(__v, __i))
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint64x2_t
vsetq_lane_u64(uint64_t __x, uint64x2_t __v, uint8_t __i)
{
	__v[__neon_laneq_index(__v, __i)] = __x;
	return __v;
}
#elif defined(__clang__)
#define	vsetq_lane_u64(__x, __v, __i)					\
	(uint64x2_t)__builtin_neon_vsetq_lane_i64((__x), (int64x2_t)(__v), \
	    __neon_laneq_index(__v, __i))
#endif
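
/*
 * Usage sketch (illustrative; `nonce' is hypothetical): replace one
 * lane and keep the rest.
 *
 *	v = vsetq_lane_u64(nonce, v, 0);	(lane 0 becomes nonce)
 */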

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline int32x4_t
vshlq_n_s32(int32x4_t __v, uint8_t __bits)
{
#ifdef __aarch64__
	return (int32x4_t)__builtin_aarch64_ashlv4si(__v, __bits);
#else
	return (int32x4_t)__builtin_neon_vshl_nv4si(__v, __bits);
#endif
}
#elif defined(__clang__)
#define	vshlq_n_s32(__v, __bits)					\
	(int32x4_t)__builtin_neon_vshlq_n_v((int32x4_t)(__v), (__bits), 34)
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint32x4_t
vshlq_n_u32(uint32x4_t __v, uint8_t __bits)
{
#ifdef __aarch64__
	return (uint32x4_t)__builtin_aarch64_ashlv4si((int32x4_t)__v, __bits);
#else
	return (uint32x4_t)__builtin_neon_vshl_nv4si((int32x4_t)__v, __bits);
#endif
}
#elif defined(__clang__)
#define	vshlq_n_u32(__v, __bits)					\
	(uint32x4_t)__builtin_neon_vshlq_n_v((int32x4_t)(__v), (__bits), 50)
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint32x4_t
vshrq_n_u32(uint32x4_t __v, uint8_t __bits)
{
#ifdef __aarch64__
# if __GNUC_PREREQ__(12, 0)
	return __builtin_aarch64_lshrv4si_uus(__v, __bits);
# else
	return (uint32x4_t)__builtin_aarch64_lshrv4si((int32x4_t)__v, __bits);
# endif
#else
	return (uint32x4_t)__builtin_neon_vshru_nv4si((int32x4_t)__v, __bits);
#endif
}
#elif defined(__clang__)
#define	vshrq_n_u32(__v, __bits)					\
	(uint32x4_t)__builtin_neon_vshrq_n_v((int32x4_t)(__v), (__bits), 50)
#endif
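
/*
 * Usage sketch (illustrative): a constant-distance rotate, as in
 * ChaCha-style quarter rounds, falls out of the two shifts; both GCC
 * and Clang accept | directly on these vector types.
 *
 *	v = vshlq_n_u32(v, 7) | vshrq_n_u32(v, 25);	(rotate left by 7)
 */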

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint8x16_t
vshrq_n_u8(uint8x16_t __v, uint8_t __bits)
{
#ifdef __aarch64__
# if __GNUC_PREREQ__(12, 0)
	return __builtin_aarch64_lshrv16qi_uus(__v, __bits);
# else
	return (uint8x16_t)__builtin_aarch64_lshrv16qi((int8x16_t)__v, __bits);
# endif
#else
	return (uint8x16_t)__builtin_neon_vshru_nv16qi((int8x16_t)__v, __bits);
#endif
}
#elif defined(__clang__)
#define	vshrq_n_u8(__v, __bits)						\
	(uint8x16_t)__builtin_neon_vshrq_n_v((int8x16_t)(__v), (__bits), 48)
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline int32x4_t
vsliq_n_s32(int32x4_t __vins, int32x4_t __vsh, uint8_t __bits)
{
#ifdef __aarch64__
	return (int32x4_t)__builtin_aarch64_ssli_nv4si(__vins, __vsh, __bits);
#else
	return (int32x4_t)__builtin_neon_vsli_nv4si(__vins, __vsh, __bits);
#endif
}
#elif defined(__clang__)
#ifdef __LITTLE_ENDIAN__
#define	vsliq_n_s32(__vins, __vsh, __bits)				\
	(int32x4_t)__builtin_neon_vsliq_n_v((int32x4_t)(__vins),	\
	    (int32x4_t)(__vsh), (__bits), 34)
#else
#define	vsliq_n_s32(__vins, __vsh, __bits) (				\
{									\
	int32x4_t __tvins = (__vins);					\
	int32x4_t __tvsh = (__vsh);					\
	uint8_t __tbits = (__bits);					\
	int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins,	\
	    3,2,1,0);							\
	int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh,	\
	    3,2,1,0);							\
	int32x4_t __r = __builtin_neon_vsliq_n_v(__vins_r, __vsh_r,	\
	    __tbits, 34);						\
	__builtin_shufflevector(__r, __r, 3,2,1,0);			\
})
#endif	/* __LITTLE_ENDIAN__ */
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint32x4_t
vsriq_n_u32(uint32x4_t __vins, uint32x4_t __vsh, uint8_t __bits)
{
#ifdef __aarch64__
	return __builtin_aarch64_usri_nv4si_uuus(__vins, __vsh, __bits);
#else
	return (uint32x4_t)__builtin_neon_vsri_nv4si((int32x4_t)__vins,
	    (int32x4_t)__vsh, __bits);
#endif
}
#elif defined(__clang__)
#ifdef __LITTLE_ENDIAN__
#define	vsriq_n_u32(__vins, __vsh, __bits)				\
	(uint32x4_t)__builtin_neon_vsriq_n_v((int32x4_t)(__vins),	\
	    (int32x4_t)(__vsh), (__bits), 50)
#else
#define	vsriq_n_u32(__vins, __vsh, __bits) (				\
{									\
	uint32x4_t __tvins = (__vins);					\
	uint32x4_t __tvsh = (__vsh);					\
	uint8_t __tbits = (__bits);					\
	uint32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins,	\
	    3,2,1,0);							\
	uint32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh,	\
	    3,2,1,0);							\
	uint32x4_t __r = (uint32x4_t)__builtin_neon_vsriq_n_v(		\
	    (int32x4_t)__vins_r, (int32x4_t)__vsh_r, __tbits, 50);	\
	__builtin_shufflevector(__r, __r, 3,2,1,0);			\
})
#endif	/* __LITTLE_ENDIAN__ */
#endif
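
/*
 * Usage sketch (illustrative): VSRI shifts the second operand right
 * and inserts it beneath the preserved high bits of the first, so the
 * rotate above can also be spelled:
 *
 *	v = vsriq_n_u32(vshlq_n_u32(v, 7), v, 25);	(rotate left by 7)
 */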

_INTRINSATTR
static __inline void
vst1q_u32(uint32_t *__p32, uint32x4_t __v)
{
#if defined(__GNUC__) && !defined(__clang__)
#ifdef __aarch64__
	__builtin_aarch64_simd_si *__p = (__builtin_aarch64_simd_si *)__p32;

	__builtin_aarch64_st1v4si(__p, (int32x4_t)__v);
#else
	__builtin_neon_si *__p = (__builtin_neon_si *)__p32;

	__builtin_neon_vst1v4si(__p, (int32x4_t)__v);
#endif
#elif defined(__clang__)
#ifndef __LITTLE_ENDIAN__
	__v = __builtin_shufflevector(__v, __v, 3,2,1,0);
#endif
	__builtin_neon_vst1q_v(__p32, __v, 50);
#endif
}

_INTRINSATTR
static __inline void
vst1q_u8(uint8_t *__p8, uint8x16_t __v)
{
#if defined(__GNUC__) && !defined(__clang__)
#ifdef __aarch64__
	__builtin_aarch64_simd_qi *__p = (__builtin_aarch64_simd_qi *)__p8;

	__builtin_aarch64_st1v16qi(__p, (int8x16_t)__v);
#else
	__builtin_neon_qi *__p = (__builtin_neon_qi *)__p8;

	__builtin_neon_vst1v16qi(__p, (int8x16_t)__v);
#endif
#elif defined(__clang__)
#ifndef __LITTLE_ENDIAN__
	__v = __builtin_shufflevector(__v, __v,
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
#endif
	__builtin_neon_vst1q_v(__p8, __v, 48);
#endif
}
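
/*
 * Usage sketch (illustrative; `buf' and `keystream' are hypothetical):
 * vld1q_u8/vst1q_u8 only require byte alignment, so XORing a 16-byte
 * keystream block into a buffer is a load, a vector ^, and a store:
 *
 *	vst1q_u8(buf, vld1q_u8(buf) ^ keystream);
 */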

#ifndef __aarch64__		/* XXX */

_INTRINSATTR
static __inline uint8x8_t
vtbl1_u8(uint8x8_t __tab, uint8x8_t __idx)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (uint8x8_t)__builtin_neon_vtbl1v8qi((int8x8_t)__tab,
	    (int8x8_t)__idx);
#elif defined(__clang__)
	uint8x8_t __ret;
#ifndef __LITTLE_ENDIAN__
	__tab = __builtin_shufflevector(__tab, __tab, 7,6,5,4,3,2,1,0);
	__idx = __builtin_shufflevector(__idx, __idx, 7,6,5,4,3,2,1,0);
#endif
	__ret = (uint8x8_t)__builtin_neon_vtbl1_v((int8x8_t)__tab,
	    (int8x8_t)__idx, 16);
#ifndef __LITTLE_ENDIAN__
	__ret = __builtin_shufflevector(__ret, __ret, 7,6,5,4,3,2,1,0);
#endif
	return __ret;
#endif
}

_INTRINSATTR
static __inline uint8x8_t
vtbl2_u8(uint8x8x2_t __tab, uint8x8_t __idx)
{
#if defined(__GNUC__) && !defined(__clang__)
	union {
		uint8x8x2_t __u8x8x2;
		__builtin_neon_ti __ti;
	} __u = { __tab };
	return (uint8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, (int8x8_t)__idx);
#elif defined(__clang__)
	uint8x8_t __ret;
#ifndef __LITTLE_ENDIAN__
	__tab.val[0] = __builtin_shufflevector(__tab.val[0], __tab.val[0],
	    7,6,5,4,3,2,1,0);
	__tab.val[1] = __builtin_shufflevector(__tab.val[1], __tab.val[1],
	    7,6,5,4,3,2,1,0);
	__idx = __builtin_shufflevector(__idx, __idx, 7,6,5,4,3,2,1,0);
#endif
	__ret = (uint8x8_t)__builtin_neon_vtbl2_v((int8x8_t)__tab.val[0],
	    (int8x8_t)__tab.val[1], (int8x8_t)__idx, 16);
#ifndef __LITTLE_ENDIAN__
	__ret = __builtin_shufflevector(__ret, __ret, 7,6,5,4,3,2,1,0);
#endif
	return __ret;
#endif
}

#endif	/* !defined(__aarch64__) */

#endif	/* _SYS_CRYPTO_ARCH_ARM_ARM_NEON_H */