/*	$NetBSD: arm_neon.h,v 1.2 2023/08/07 01:14:19 rin Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef	_SYS_CRYPTO_ARCH_ARM_ARM_NEON_H
#define	_SYS_CRYPTO_ARCH_ARM_ARM_NEON_H

#if defined(__GNUC__) && !defined(__clang__)

#define	_INTRINSATTR							\
	__extension__							\
	__attribute__((__always_inline__, __gnu_inline__, __artificial__))

#ifdef __aarch64__
typedef __Int32x4_t int32x4_t;
typedef __Int64x2_t int64x2_t;
typedef __Int8x16_t int8x16_t;
typedef __Uint16x8_t uint16x8_t;
typedef __Uint32x4_t uint32x4_t;
typedef __Uint64x2_t uint64x2_t;
typedef __Uint8x16_t uint8x16_t;
typedef struct { uint8x16_t val[2]; } uint8x16x2_t;
#else
typedef __simd128_int32_t int32x4_t;
typedef __simd128_int64_t int64x2_t;
typedef __simd128_int8_t int8x16_t;
typedef __simd128_uint16_t uint16x8_t;
typedef __simd128_uint32_t uint32x4_t;
typedef __simd128_uint64_t uint64x2_t;
typedef __simd128_uint8_t uint8x16_t;

typedef __simd64_int8_t int8x8_t;
typedef __simd64_uint8_t uint8x8_t;
typedef __builtin_neon_udi uint64x1_t;
typedef struct { uint8x8_t val[2]; } uint8x8x2_t;
typedef struct { uint8x16_t val[2]; } uint8x16x2_t;
#endif

#if defined(__AARCH64EB__)
#define	__neon_lane_index(__v, __i)	(__arraycount(__v) - 1 - (__i))
#define	__neon_laneq_index(__v, __i)	(__arraycount(__v) - 1 - (__i))
#elif defined(__ARM_BIG_ENDIAN)
#define	__neon_lane_index(__v, __i)	((__i) ^ (__arraycount(__v) - 1))
#define	__neon_laneq_index(__v, __i)	((__i) ^ (__arraycount(__v)/2 - 1))
#else
#define	__neon_lane_index(__v, __i)	(__i)
#define	__neon_laneq_index(__v, __i)	(__i)
#endif
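
/*
 * Illustrative note: on big-endian aarch64 (__AARCH64EB__), architectural
 * lane 0 sits at the opposite end of the C vector, so for a 4-lane vector
 * __neon_laneq_index(__v, 0) == 3.  This lets the lane intrinsics below
 * take little-endian lane numbers on either byte order.
 */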

#elif defined(__clang__)

#define	_INTRINSATTR							\
	__attribute__((__always_inline__, __nodebug__))

typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t;
typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t;
typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t;

typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t;
typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t;
typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t;
typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t;

typedef __attribute__((neon_vector_type(8))) int8_t int8x8_t;

typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t;

typedef struct { uint8x8_t val[2]; } uint8x8x2_t;
typedef struct { uint8x16_t val[2]; } uint8x16x2_t;

#ifdef __LITTLE_ENDIAN__
#define	__neon_lane_index(__v, __i)	__i
#define	__neon_laneq_index(__v, __i)	__i
#else
#define	__neon_lane_index(__v, __i)	(__arraycount(__v) - 1 - __i)
#define	__neon_laneq_index(__v, __i)	(__arraycount(__v) - 1 - __i)
#endif

#else

#error Teach me how to neon in your compile!

#endif

_INTRINSATTR
static __inline uint32x4_t
vaddq_u32(uint32x4_t __v0, uint32x4_t __v1)
{
	return __v0 + __v1;
}

_INTRINSATTR
static __inline uint32x4_t
vcltq_s32(int32x4_t __v0, int32x4_t __v1)
{
	return (uint32x4_t)(__v0 < __v1);
}

_INTRINSATTR
static __inline int32x4_t
vdupq_n_s32(int32_t __x)
{
	return (int32x4_t) { __x, __x, __x, __x };
}

_INTRINSATTR
static __inline uint32x4_t
vdupq_n_u32(uint32_t __x)
{
	return (uint32x4_t) { __x, __x, __x, __x };
}

_INTRINSATTR
static __inline uint8x16_t
vdupq_n_u8(uint8_t __x)
{
	return (uint8x16_t) {
		__x, __x, __x, __x, __x, __x, __x, __x,
		__x, __x, __x, __x, __x, __x, __x, __x,
	};
}
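
/*
 * vextq_{u32,u8}(__lo, __hi, __i) below extract a vector from the
 * concatenation of two vectors.  For example, in little-endian lane
 * order:
 *
 *	vextq_u32({a0,a1,a2,a3}, {b0,b1,b2,b3}, 1) = {a1,a2,a3,b0}
 */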

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint32x4_t
vextq_u32(uint32x4_t __lo, uint32x4_t __hi, uint8_t __i)
{
#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN)
	return __builtin_shuffle(__hi, __lo,
	    (uint32x4_t) { 4 - __i, 5 - __i, 6 - __i, 7 - __i });
#else
	return __builtin_shuffle(__lo, __hi,
	    (uint32x4_t) { __i + 0, __i + 1, __i + 2, __i + 3 });
#endif
}
#elif defined(__clang__)
#ifdef __LITTLE_ENDIAN__
#define	vextq_u32(__lo, __hi, __i)					\
	(uint32x4_t)__builtin_neon_vextq_v((int8x16_t)(__lo),		\
	    (int8x16_t)(__hi), (__i), 50)
#else
#define	vextq_u32(__lo, __hi, __i) (					\
{									\
	uint32x4_t __tlo = (__lo);					\
	uint32x4_t __thi = (__hi);					\
	uint32x4_t __lo_r = __builtin_shufflevector(__tlo, __tlo, 3,2,1,0); \
	uint32x4_t __hi_r = __builtin_shufflevector(__thi, __thi, 3,2,1,0); \
	uint32x4_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r,	\
	    (int8x16_t)__hi_r, __i, 50);				\
	__builtin_shufflevector(__r, __r, 3,2,1,0);			\
})
#endif	/* __LITTLE_ENDIAN__ */
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint8x16_t
vextq_u8(uint8x16_t __lo, uint8x16_t __hi, uint8_t __i)
{
#ifdef __aarch64__
#if defined(__AARCH64EB__)
	return __builtin_shuffle(__hi, __lo,
	    (uint8x16_t) {
		16 - __i, 17 - __i, 18 - __i, 19 - __i,
		20 - __i, 21 - __i, 22 - __i, 23 - __i,
		24 - __i, 25 - __i, 26 - __i, 27 - __i,
		28 - __i, 29 - __i, 30 - __i, 31 - __i,
	});
#else
	return __builtin_shuffle(__lo, __hi,
	    (uint8x16_t) {
		__i + 0, __i + 1, __i + 2, __i + 3,
		__i + 4, __i + 5, __i + 6, __i + 7,
		__i + 8, __i + 9, __i + 10, __i + 11,
		__i + 12, __i + 13, __i + 14, __i + 15,
	});
#endif
#else
	return (uint8x16_t)__builtin_neon_vextv16qi((int8x16_t)__lo,
	    (int8x16_t)__hi, __i);
#endif
}
#elif defined(__clang__)
#ifdef __LITTLE_ENDIAN__
#define	vextq_u8(__lo, __hi, __i)					\
	(uint8x16_t)__builtin_neon_vextq_v((int8x16_t)(__lo),		\
	    (int8x16_t)(__hi), (__i), 48)
#else
#define	vextq_u8(__lo, __hi, __i) (					\
{									\
	uint8x16_t __tlo = (__lo);					\
	uint8x16_t __thi = (__hi);					\
	uint8x16_t __lo_r = __builtin_shufflevector(__tlo, __tlo,	\
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);			\
	uint8x16_t __hi_r = __builtin_shufflevector(__thi, __thi,	\
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);			\
	uint8x16_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r,	\
	    (int8x16_t)__hi_r, (__i), 48);				\
	__builtin_shufflevector(__r, __r,				\
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);			\
})
#endif	/* __LITTLE_ENDIAN__ */
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint32_t
vgetq_lane_u32(uint32x4_t __v, uint8_t __i)
{
#ifdef __aarch64__
	return __v[__neon_laneq_index(__v, __i)];
#else
	return (uint32_t)__builtin_neon_vget_laneuv4si((int32x4_t)__v, __i);
#endif
}
#elif defined(__clang__)
#define	vgetq_lane_u32(__v, __i)					\
	(uint32_t)__builtin_neon_vgetq_lane_i32((int32x4_t)(__v),	\
	    __neon_laneq_index(__v, __i))
#endif
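
/*
 * vld1q_* (and vst1q_* further below) transfer 128 bits in memory order.
 * On big-endian targets the clang versions reverse the lanes after a load
 * (and before a store) so that lane numbering stays consistent with the
 * little-endian convention used throughout this header.
 */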

_INTRINSATTR
static __inline uint32x4_t
vld1q_u32(const uint32_t *__p32)
{
#if defined(__GNUC__) && !defined(__clang__)
#ifdef __aarch64__
	const __builtin_aarch64_simd_si *__p =
	    (const __builtin_aarch64_simd_si *)__p32;

	return (uint32x4_t)__builtin_aarch64_ld1v4si(__p);
#else
	const __builtin_neon_si *__p = (const __builtin_neon_si *)__p32;

	return (uint32x4_t)__builtin_neon_vld1v4si(__p);
#endif
#elif defined(__clang__)
	uint32x4_t __v = (uint32x4_t)__builtin_neon_vld1q_v(__p32, 50);
#ifndef __LITTLE_ENDIAN__
	__v = __builtin_shufflevector(__v, __v, 3,2,1,0);
#endif
	return __v;
#endif
}

_INTRINSATTR
static __inline uint8x16_t
vld1q_u8(const uint8_t *__p8)
{
#if defined(__GNUC__) && !defined(__clang__)
#ifdef __aarch64__
	const __builtin_aarch64_simd_qi *__p =
	    (const __builtin_aarch64_simd_qi *)__p8;

	return (uint8x16_t)__builtin_aarch64_ld1v16qi(__p);
#else
	const __builtin_neon_qi *__p = (const __builtin_neon_qi *)__p8;

	return (uint8x16_t)__builtin_neon_vld1v16qi(__p);
#endif
#elif defined(__clang__)
	uint8x16_t __v = (uint8x16_t)__builtin_neon_vld1q_v(__p8, 48);
#ifndef __LITTLE_ENDIAN__
	__v = __builtin_shufflevector(__v, __v,
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
#endif
	return __v;
#endif
}
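
/*
 * vqtbl1q_u8(__tab, __idx) is a byte-wise table lookup (AArch64 TBL):
 * output byte i is __tab[__idx[i]] when __idx[i] < 16 and 0 otherwise,
 * so e.g. an index vector of {15,14,...,1,0} reverses the bytes of __tab.
 */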

_INTRINSATTR
static __inline uint8x16_t
vqtbl1q_u8(uint8x16_t __tab, uint8x16_t __idx)
{
#if defined(__GNUC__) && !defined(__clang__)
#ifdef __aarch64__
	uint8x16_t __res;
	__asm__("tbl %0.16b, {%1.16b}, %2.16b"
	    : "=w"(__res) : "w"(__tab), "w"(__idx));
	return __res;
#else
	/*
	 * No native ARMv7 NEON instruction for this, so do it via two
	 * half-width TBLs instead (vtbl2_u8 equivalent).
	 */
	uint64x2_t __tab64 = (uint64x2_t)__tab;
	uint8x8_t __tablo = (uint8x8_t)__tab64[0];
	uint8x8_t __tabhi = (uint8x8_t)__tab64[1];
	uint8x8x2_t __tab8x8x2 = { { __tablo, __tabhi } };
	union {
		uint8x8x2_t __u8x8x2;
		__builtin_neon_ti __ti;
	} __u = { __tab8x8x2 };
	uint64x2_t __idx64, __out64;
	int8x8_t __idxlo, __idxhi, __outlo, __outhi;

	__idx64 = (uint64x2_t)__idx;
	__idxlo = (int8x8_t)__idx64[0];
	__idxhi = (int8x8_t)__idx64[1];
	__outlo = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxlo);
	__outhi = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxhi);
	__out64 = (uint64x2_t) { (uint64x1_t)__outlo, (uint64x1_t)__outhi };

	return (uint8x16_t)__out64;
#endif
#elif defined(__clang__)
#ifndef __LITTLE_ENDIAN__
	__tab = __builtin_shufflevector(__tab, __tab,
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
	__idx = __builtin_shufflevector(__idx, __idx,
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
#endif
	uint8x16_t __r;
#ifdef __aarch64__
	__r = __builtin_neon_vqtbl1q_v((int8x16_t)__tab, (int8x16_t)__idx, 48);
#else
	uint64x2_t __tab64 = (uint64x2_t)__tab;
	uint8x8_t __tablo = (uint8x8_t)__tab64[0];
	uint8x8_t __tabhi = (uint8x8_t)__tab64[1];
	uint64x2_t __idx64, __out64;
	int8x8_t __idxlo, __idxhi, __outlo, __outhi;

	__idx64 = (uint64x2_t)__idx;
	__idxlo = (int8x8_t)__idx64[0];
	__idxhi = (int8x8_t)__idx64[1];
	__outlo = (int8x8_t)__builtin_neon_vtbl2_v((int8x8_t)__tablo,
	    (int8x8_t)__tabhi, (int8x8_t)__idxlo, 16);
	__outhi = (int8x8_t)__builtin_neon_vtbl2_v((int8x8_t)__tablo,
	    (int8x8_t)__tabhi, (int8x8_t)__idxhi, 16);
	__out64 = (uint64x2_t) { (uint64_t)__outlo, (uint64_t)__outhi };
	__r = (uint8x16_t)__out64;
#endif
#ifndef __LITTLE_ENDIAN__
	__r = __builtin_shufflevector(__r, __r,
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
#endif
	return __r;
#endif
}
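
/*
 * The vreinterpretq_* conversions below are pure type puns: they relabel
 * the 128 bits of a vector without moving any data, and compile to no
 * instructions.
 */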

_INTRINSATTR
static __inline int32x4_t
vreinterpretq_s32_u8(uint8x16_t __v)
{
	return (int32x4_t)__v;
}

_INTRINSATTR
static __inline uint16x8_t
vreinterpretq_u16_u32(uint32x4_t __v)
{
	return (uint16x8_t)__v;
}

_INTRINSATTR
static __inline uint32x4_t
vreinterpretq_u32_u16(uint16x8_t __v)
{
	return (uint32x4_t)__v;
}

_INTRINSATTR
static __inline uint32x4_t
vreinterpretq_u32_u64(uint64x2_t __v)
{
	return (uint32x4_t)__v;
}

_INTRINSATTR
static __inline uint32x4_t
vreinterpretq_u32_u8(uint8x16_t __v)
{
	return (uint32x4_t)__v;
}

_INTRINSATTR
static __inline uint64x2_t
vreinterpretq_u64_u32(uint32x4_t __v)
{
	return (uint64x2_t)__v;
}

_INTRINSATTR
static __inline uint64x2_t
vreinterpretq_u64_u8(uint8x16_t __v)
{
	return (uint64x2_t)__v;
}

_INTRINSATTR
static __inline uint8x16_t
vreinterpretq_u8_s32(int32x4_t __v)
{
	return (uint8x16_t)__v;
}

_INTRINSATTR
static __inline uint8x16_t
vreinterpretq_u8_u32(uint32x4_t __v)
{
	return (uint8x16_t)__v;
}

_INTRINSATTR
static __inline uint8x16_t
vreinterpretq_u8_u64(uint64x2_t __v)
{
	return (uint8x16_t)__v;
}

_INTRINSATTR
static __inline uint16x8_t
vrev32q_u16(uint16x8_t __v)
{
#if defined(__GNUC__) && !defined(__clang__)
	return __builtin_shuffle(__v, (uint16x8_t) { 1,0, 3,2, 5,4, 7,6 });
#elif defined(__clang__)
	return __builtin_shufflevector(__v, __v, 1,0, 3,2, 5,4, 7,6);
#endif
}

_INTRINSATTR
static __inline uint8x16_t
vrev32q_u8(uint8x16_t __v)
{
#if defined(__GNUC__) && !defined(__clang__)
	return __builtin_shuffle(__v,
	    (uint8x16_t) { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 });
#elif defined(__clang__)
	return __builtin_shufflevector(__v, __v,
	    3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12);
#endif
}

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint32x4_t
vsetq_lane_u32(uint32_t __x, uint32x4_t __v, uint8_t __i)
{
	__v[__neon_laneq_index(__v, __i)] = __x;
	return __v;
}
#elif defined(__clang__)
#define	vsetq_lane_u32(__x, __v, __i)					\
	(uint32x4_t)__builtin_neon_vsetq_lane_i32((__x), (int32x4_t)(__v), \
	    __neon_laneq_index(__v, __i))
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint64x2_t
vsetq_lane_u64(uint64_t __x, uint64x2_t __v, uint8_t __i)
{
	__v[__neon_laneq_index(__v, __i)] = __x;
	return __v;
}
#elif defined(__clang__)
#define	vsetq_lane_u64(__x, __v, __i)					\
	(uint64x2_t)__builtin_neon_vsetq_lane_i64((__x), (int64x2_t)(__v), \
	    __neon_laneq_index(__v, __i))
#endif
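
/*
 * Example: vgetq_lane_u32(vsetq_lane_u32(__x, __v, 2), 2) == __x on
 * either byte order, since both go through __neon_laneq_index above.
 */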

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline int32x4_t
vshlq_n_s32(int32x4_t __v, uint8_t __bits)
{
#ifdef __aarch64__
	return (int32x4_t)__builtin_aarch64_ashlv4si(__v, __bits);
#else
	return (int32x4_t)__builtin_neon_vshl_nv4si(__v, __bits);
#endif
}
#elif defined(__clang__)
#define	vshlq_n_s32(__v, __bits)					\
	(int32x4_t)__builtin_neon_vshlq_n_v((int32x4_t)(__v), (__bits), 34)
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint32x4_t
vshlq_n_u32(uint32x4_t __v, uint8_t __bits)
{
#ifdef __aarch64__
	return (uint32x4_t)__builtin_aarch64_ashlv4si((int32x4_t)__v, __bits);
#else
	return (uint32x4_t)__builtin_neon_vshl_nv4si((int32x4_t)__v, __bits);
#endif
}
#elif defined(__clang__)
#define	vshlq_n_u32(__v, __bits)					\
	(uint32x4_t)__builtin_neon_vshlq_n_v((int32x4_t)(__v), (__bits), 50)
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint32x4_t
vshrq_n_u32(uint32x4_t __v, uint8_t __bits)
{
#ifdef __aarch64__
# if __GNUC_PREREQ__(12, 0)
	return __builtin_aarch64_lshrv4si_uus(__v, __bits);
# else
	return (uint32x4_t)__builtin_aarch64_lshrv4si((int32x4_t)__v, __bits);
# endif
#else
	return (uint32x4_t)__builtin_neon_vshru_nv4si((int32x4_t)__v, __bits);
#endif
}
#elif defined(__clang__)
#define	vshrq_n_u32(__v, __bits)					\
	(uint32x4_t)__builtin_neon_vshrq_n_v((int32x4_t)(__v), (__bits), 50)
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint8x16_t
vshrq_n_u8(uint8x16_t __v, uint8_t __bits)
{
#ifdef __aarch64__
# if __GNUC_PREREQ__(12, 0)
	return __builtin_aarch64_lshrv16qi_uus(__v, __bits);
# else
	return (uint8x16_t)__builtin_aarch64_lshrv16qi((int8x16_t)__v, __bits);
# endif
#else
	return (uint8x16_t)__builtin_neon_vshru_nv16qi((int8x16_t)__v, __bits);
#endif
}
#elif defined(__clang__)
#define	vshrq_n_u8(__v, __bits)						\
	(uint8x16_t)__builtin_neon_vshrq_n_v((int8x16_t)(__v), (__bits), 48)
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline int32x4_t
vsliq_n_s32(int32x4_t __vins, int32x4_t __vsh, uint8_t __bits)
{
#ifdef __aarch64__
	return (int32x4_t)__builtin_aarch64_ssli_nv4si(__vins, __vsh, __bits);
#else
	return (int32x4_t)__builtin_neon_vsli_nv4si(__vins, __vsh, __bits);
#endif
}
#elif defined(__clang__)
#ifdef __LITTLE_ENDIAN__
#define	vsliq_n_s32(__vins, __vsh, __bits)				\
	(int32x4_t)__builtin_neon_vsliq_n_v((int32x4_t)(__vins),	\
	    (int32x4_t)(__vsh), (__bits), 34)
#else
#define	vsliq_n_s32(__vins, __vsh, __bits) (				\
{									\
	int32x4_t __tvins = (__vins);					\
	int32x4_t __tvsh = (__vsh);					\
	uint8_t __tbits = (__bits);					\
	int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins,	\
	    3,2,1,0);							\
	int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh,	\
	    3,2,1,0);							\
	int32x4_t __r = __builtin_neon_vsliq_n_v(__vins_r, __vsh_r,	\
	    __tbits, 34);						\
	__builtin_shufflevector(__r, __r, 3,2,1,0);			\
})
#endif	/* __LITTLE_ENDIAN__ */
#endif
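
/*
 * SLI (shift left and insert) semantics, per lane:
 *
 *	vsliq_n_s32(__vins, __vsh, __bits)[i] =
 *	    (__vsh[i] << __bits) | (__vins[i] & ((1 << __bits) - 1))
 *
 * i.e. the shifted value overwrites all but the low __bits bits of __vins.
 */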

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint32x4_t
vsriq_n_u32(uint32x4_t __vins, uint32x4_t __vsh, uint8_t __bits)
{
#ifdef __aarch64__
	return __builtin_aarch64_usri_nv4si_uuus(__vins, __vsh, __bits);
#else
	return (uint32x4_t)__builtin_neon_vsri_nv4si((int32x4_t)__vins,
	    (int32x4_t)__vsh, __bits);
#endif
}
#elif defined(__clang__)
#ifdef __LITTLE_ENDIAN__
#define	vsriq_n_u32(__vins, __vsh, __bits)				\
	(uint32x4_t)__builtin_neon_vsriq_n_v((int32x4_t)(__vins),	\
	    (int32x4_t)(__vsh), (__bits), 34)
#else
#define	vsriq_n_u32(__vins, __vsh, __bits) (				\
{									\
	uint32x4_t __tvins = (__vins);					\
	uint32x4_t __tvsh = (__vsh);					\
	uint8_t __tbits = (__bits);					\
	uint32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins,	\
	    3,2,1,0);							\
	uint32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh,	\
	    3,2,1,0);							\
	uint32x4_t __r = (uint32x4_t)__builtin_neon_vsriq_n_v(		\
	    (int32x4_t)__vins_r, (int32x4_t)__vsh_r, __tbits, 34);	\
	__builtin_shufflevector(__r, __r, 3,2,1,0);			\
})
#endif	/* __LITTLE_ENDIAN__ */
#endif
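
/*
 * SRI (shift right and insert) is the mirror image: the shifted value
 * overwrites all but the high __bits bits of __vins.  Paired with a left
 * shift it yields the usual NEON word-rotate idiom (sketch only;
 * vrolq_n_u32 is not defined by this header):
 *
 *	#define vrolq_n_u32(__v, __n)					\
 *		vsriq_n_u32(vshlq_n_u32(__v, __n), __v, 32 - (__n))
 */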

_INTRINSATTR
static __inline void
vst1q_u32(uint32_t *__p32, uint32x4_t __v)
{
#if defined(__GNUC__) && !defined(__clang__)
#ifdef __aarch64__
	__builtin_aarch64_simd_si *__p = (__builtin_aarch64_simd_si *)__p32;

	__builtin_aarch64_st1v4si(__p, (int32x4_t)__v);
#else
	__builtin_neon_si *__p = (__builtin_neon_si *)__p32;

	__builtin_neon_vst1v4si(__p, (int32x4_t)__v);
#endif
#elif defined(__clang__)
#ifndef __LITTLE_ENDIAN__
	__v = __builtin_shufflevector(__v, __v, 3,2,1,0);
#endif
	__builtin_neon_vst1q_v(__p32, __v, 50);
#endif
}

_INTRINSATTR
static __inline void
vst1q_u8(uint8_t *__p8, uint8x16_t __v)
{
#if defined(__GNUC__) && !defined(__clang__)
#ifdef __aarch64__
	__builtin_aarch64_simd_qi *__p = (__builtin_aarch64_simd_qi *)__p8;

	__builtin_aarch64_st1v16qi(__p, (int8x16_t)__v);
#else
	__builtin_neon_qi *__p = (__builtin_neon_qi *)__p8;

	__builtin_neon_vst1v16qi(__p, (int8x16_t)__v);
#endif
#elif defined(__clang__)
#ifndef __LITTLE_ENDIAN__
	__v = __builtin_shufflevector(__v, __v,
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
#endif
	__builtin_neon_vst1q_v(__p8, __v, 48);
#endif
}

#ifndef __aarch64__		/* XXX */

_INTRINSATTR
static __inline uint8x8_t
vtbl1_u8(uint8x8_t __tab, uint8x8_t __idx)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (uint8x8_t)__builtin_neon_vtbl1v8qi((int8x8_t)__tab,
	    (int8x8_t)__idx);
#elif defined(__clang__)
	uint8x8_t __ret;
#ifndef __LITTLE_ENDIAN__
	__tab = __builtin_shufflevector(__tab, __tab, 7,6,5,4,3,2,1,0);
	__idx = __builtin_shufflevector(__idx, __idx, 7,6,5,4,3,2,1,0);
#endif
	__ret = (uint8x8_t)__builtin_neon_vtbl1_v((int8x8_t)__tab,
	    (int8x8_t)__idx, 16);
#ifndef __LITTLE_ENDIAN__
	__ret = __builtin_shufflevector(__ret, __ret, 7,6,5,4,3,2,1,0);
#endif
	return __ret;
#endif
}

_INTRINSATTR
static __inline uint8x8_t
vtbl2_u8(uint8x8x2_t __tab, uint8x8_t __idx)
{
#if defined(__GNUC__) && !defined(__clang__)
	union {
		uint8x8x2_t __u8x8x82;
		__builtin_neon_ti __ti;
	} __u = { __tab };
	return (uint8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, (int8x8_t)__idx);
#elif defined(__clang__)
	uint8x8_t __ret;
#ifndef __LITTLE_ENDIAN__
	__tab.val[0] = __builtin_shufflevector(__tab.val[0], __tab.val[0],
	    7,6,5,4,3,2,1,0);
	__tab.val[1] = __builtin_shufflevector(__tab.val[1], __tab.val[1],
	    7,6,5,4,3,2,1,0);
	__idx = __builtin_shufflevector(__idx, __idx, 7,6,5,4,3,2,1,0);
#endif
	__ret = (uint8x8_t)__builtin_neon_vtbl2_v((int8x8_t)__tab.val[0],
	    (int8x8_t)__tab.val[1], (int8x8_t)__idx, 16);
#ifndef __LITTLE_ENDIAN__
	__ret = __builtin_shufflevector(__ret, __ret, 7,6,5,4,3,2,1,0);
#endif
	return __ret;
#endif
}

#endif	/* !defined(__aarch64__) */

#endif	/* _SYS_CRYPTO_ARCH_ARM_ARM_NEON_H */