/*	$NetBSD: arm_neon.h,v 1.2 2023/08/07 01:14:19 rin Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef _SYS_CRYPTO_ARCH_ARM_ARM_NEON_H
#define _SYS_CRYPTO_ARCH_ARM_ARM_NEON_H

#if defined(__GNUC__) && !defined(__clang__)

#define _INTRINSATTR \
	__extension__ \
	__attribute__((__always_inline__, __gnu_inline__, __artificial__))

#ifdef __aarch64__
typedef __Int32x4_t int32x4_t;
typedef __Int64x2_t int64x2_t;
typedef __Int8x16_t int8x16_t;
typedef __Uint16x8_t uint16x8_t;
typedef __Uint32x4_t uint32x4_t;
typedef __Uint64x2_t uint64x2_t;
typedef __Uint8x16_t uint8x16_t;
typedef struct { uint8x16_t val[2]; } uint8x16x2_t;
#else
typedef __simd128_int32_t int32x4_t;
typedef __simd128_int64_t int64x2_t;
typedef __simd128_int8_t int8x16_t;
typedef __simd128_uint16_t uint16x8_t;
typedef __simd128_uint32_t uint32x4_t;
typedef __simd128_uint64_t uint64x2_t;
typedef __simd128_uint8_t uint8x16_t;

typedef __simd64_int8_t int8x8_t;
typedef __simd64_uint8_t uint8x8_t;
typedef __builtin_neon_udi uint64x1_t;
typedef struct { uint8x8_t val[2]; } uint8x8x2_t;
typedef struct { uint8x16_t val[2]; } uint8x16x2_t;
#endif

#if defined(__AARCH64EB__)
#define __neon_lane_index(__v, __i)	(__arraycount(__v) - 1 - (__i))
#define __neon_laneq_index(__v, __i)	(__arraycount(__v) - 1 - (__i))
#elif defined(__ARM_BIG_ENDIAN)
#define __neon_lane_index(__v, __i)	((__i) ^ (__arraycount(__v) - 1))
#define __neon_laneq_index(__v, __i)	((__i) ^ (__arraycount(__v)/2 - 1))
#else
#define __neon_lane_index(__v, __i)	(__i)
#define __neon_laneq_index(__v, __i)	(__i)
#endif
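
/*
 * The __neon_lane_index/__neon_laneq_index macros translate an
 * architectural NEON lane number into a subscript for the compiler's
 * generic vector types, whose element order differs on big-endian
 * targets.  For example, with a uint32x4_t __v on a big-endian AArch64
 * target,
 *
 *	__v[__neon_laneq_index(__v, 0)]
 *
 * names architectural lane 0, i.e. __v[3]; on big-endian ARMv7 the
 * lanes are instead reversed within each 64-bit half, and on
 * little-endian targets the macros are the identity.  __arraycount is
 * from <sys/cdefs.h>.
 */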

#elif defined(__clang__)

#define _INTRINSATTR \
	__attribute__((__always_inline__, __nodebug__))

typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t;
typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t;
typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t;

typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t;
typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t;
typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t;
typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t;

typedef __attribute__((neon_vector_type(8))) int8_t int8x8_t;

typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t;

typedef struct { uint8x8_t val[2]; } uint8x8x2_t;
typedef struct { uint8x16_t val[2]; } uint8x16x2_t;

#ifdef __LITTLE_ENDIAN__
#define __neon_lane_index(__v, __i)	__i
#define __neon_laneq_index(__v, __i)	__i
#else
#define __neon_lane_index(__v, __i)	(__arraycount(__v) - 1 - __i)
#define __neon_laneq_index(__v, __i)	(__arraycount(__v) - 1 - __i)
#endif

#else

#error Teach me how to neon in your compile!

#endif

_INTRINSATTR
static __inline uint32x4_t
vaddq_u32(uint32x4_t __v0, uint32x4_t __v1)
{
	return __v0 + __v1;
}

_INTRINSATTR
static __inline uint32x4_t
vcltq_s32(int32x4_t __v0, int32x4_t __v1)
{
	return (uint32x4_t)(__v0 < __v1);
}

_INTRINSATTR
static __inline int32x4_t
vdupq_n_s32(int32_t __x)
{
	return (int32x4_t) { __x, __x, __x, __x };
}

_INTRINSATTR
static __inline uint32x4_t
vdupq_n_u32(uint32_t __x)
{
	return (uint32x4_t) { __x, __x, __x, __x };
}

_INTRINSATTR
static __inline uint8x16_t
vdupq_n_u8(uint8_t __x)
{
	return (uint8x16_t) {
		__x, __x, __x, __x, __x, __x, __x, __x,
		__x, __x, __x, __x, __x, __x, __x, __x,
	};
}

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint32x4_t
vextq_u32(uint32x4_t __lo, uint32x4_t __hi, uint8_t __i)
{
#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN)
	return __builtin_shuffle(__hi, __lo,
	    (uint32x4_t) { 4 - __i, 5 - __i, 6 - __i, 7 - __i });
#else
	return __builtin_shuffle(__lo, __hi,
	    (uint32x4_t) { __i + 0, __i + 1, __i + 2, __i + 3 });
#endif
}
#elif defined(__clang__)
#ifdef __LITTLE_ENDIAN__
#define vextq_u32(__lo, __hi, __i) \
	(uint32x4_t)__builtin_neon_vextq_v((int8x16_t)(__lo), \
	    (int8x16_t)(__hi), (__i), 50)
#else
#define vextq_u32(__lo, __hi, __i) ( \
{ \
	uint32x4_t __tlo = (__lo); \
	uint32x4_t __thi = (__hi); \
	uint32x4_t __lo_r = __builtin_shufflevector(__tlo, __tlo, 3,2,1,0); \
	uint32x4_t __hi_r = __builtin_shufflevector(__thi, __thi, 3,2,1,0); \
	uint32x4_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r, \
	    (int8x16_t)__hi_r, __i, 50); \
	__builtin_shufflevector(__r, __r, 3,2,1,0); \
})
#endif	/* __LITTLE_ENDIAN__ */
#endif
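
/*
 * vextq_u32 above and vextq_u8 below extract a full-width vector that
 * starts __i elements into the concatenation of __lo and __hi.  For
 * example,
 *
 *	vextq_u32(__lo, __hi, 1) = { __lo[1], __lo[2], __lo[3], __hi[0] }
 *
 * so vextq_u32(__v, __v, __i) rotates the 32-bit lanes of __v by __i
 * positions.
 */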

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint8x16_t
vextq_u8(uint8x16_t __lo, uint8x16_t __hi, uint8_t __i)
{
#ifdef __aarch64__
#if defined(__AARCH64EB__)
	return __builtin_shuffle(__hi, __lo,
	    (uint8x16_t) {
		16 - __i, 17 - __i, 18 - __i, 19 - __i,
		20 - __i, 21 - __i, 22 - __i, 23 - __i,
		24 - __i, 25 - __i, 26 - __i, 27 - __i,
		28 - __i, 29 - __i, 30 - __i, 31 - __i,
	});
#else
	return __builtin_shuffle(__lo, __hi,
	    (uint8x16_t) {
		__i + 0, __i + 1, __i + 2, __i + 3,
		__i + 4, __i + 5, __i + 6, __i + 7,
		__i + 8, __i + 9, __i + 10, __i + 11,
		__i + 12, __i + 13, __i + 14, __i + 15,
	});
#endif
#else
	return (uint8x16_t)__builtin_neon_vextv16qi((int8x16_t)__lo,
	    (int8x16_t)__hi, __i);
#endif
}
#elif defined(__clang__)
#ifdef __LITTLE_ENDIAN__
#define vextq_u8(__lo, __hi, __i) \
	(uint8x16_t)__builtin_neon_vextq_v((int8x16_t)(__lo), \
	    (int8x16_t)(__hi), (__i), 48)
#else
#define vextq_u8(__lo, __hi, __i) ( \
{ \
	uint8x16_t __tlo = (__lo); \
	uint8x16_t __thi = (__hi); \
	uint8x16_t __lo_r = __builtin_shufflevector(__tlo, __tlo, \
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \
	uint8x16_t __hi_r = __builtin_shufflevector(__thi, __thi, \
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \
	uint8x16_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r, \
	    (int8x16_t)__hi_r, (__i), 48); \
	__builtin_shufflevector(__r, __r, \
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \
})
#endif	/* __LITTLE_ENDIAN__ */
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint32_t
vgetq_lane_u32(uint32x4_t __v, uint8_t __i)
{
#ifdef __aarch64__
	return __v[__neon_laneq_index(__v, __i)];
#else
	return (uint32_t)__builtin_neon_vget_laneuv4si((int32x4_t)__v, __i);
#endif
}
#elif defined(__clang__)
#define vgetq_lane_u32(__v, __i) \
	(uint32_t)__builtin_neon_vgetq_lane_i32((int32x4_t)(__v), \
	    __neon_laneq_index(__v, __i))
#endif

_INTRINSATTR
static __inline uint32x4_t
vld1q_u32(const uint32_t *__p32)
{
#if defined(__GNUC__) && !defined(__clang__)
#ifdef __aarch64__
	const __builtin_aarch64_simd_si *__p =
	    (const __builtin_aarch64_simd_si *)__p32;

	return (uint32x4_t)__builtin_aarch64_ld1v4si(__p);
#else
	const __builtin_neon_si *__p = (const __builtin_neon_si *)__p32;

	return (uint32x4_t)__builtin_neon_vld1v4si(__p);
#endif
#elif defined(__clang__)
	uint32x4_t __v = (uint32x4_t)__builtin_neon_vld1q_v(__p32, 50);
#ifndef __LITTLE_ENDIAN__
	__v = __builtin_shufflevector(__v, __v, 3,2,1,0);
#endif
	return __v;
#endif
}

_INTRINSATTR
static __inline uint8x16_t
vld1q_u8(const uint8_t *__p8)
{
#if defined(__GNUC__) && !defined(__clang__)
#ifdef __aarch64__
	const __builtin_aarch64_simd_qi *__p =
	    (const __builtin_aarch64_simd_qi *)__p8;

	return (uint8x16_t)__builtin_aarch64_ld1v16qi(__p);
#else
	const __builtin_neon_qi *__p = (const __builtin_neon_qi *)__p8;

	return (uint8x16_t)__builtin_neon_vld1v16qi(__p);
#endif
#elif defined(__clang__)
	uint8x16_t __v = (uint8x16_t)__builtin_neon_vld1q_v(__p8, 48);
#ifndef __LITTLE_ENDIAN__
	__v = __builtin_shufflevector(__v, __v,
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
#endif
	return __v;
#endif
}
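
/*
 * vqtbl1q_u8 is a byte-wise table lookup: element __i of the result is
 * __tab[__idx[__i]] when __idx[__i] < 16 and 0 otherwise, matching the
 * AArch64 TBL instruction.  For example, an index vector of
 * { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 } produces the same result
 * as vrev32q_u8(__tab).
 */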

_INTRINSATTR
static __inline uint8x16_t
vqtbl1q_u8(uint8x16_t __tab, uint8x16_t __idx)
{
#if defined(__GNUC__) && !defined(__clang__)
#ifdef __aarch64__
	uint8x16_t __res;
	__asm__("tbl %0.16b, {%1.16b}, %2.16b"
	    : "=w"(__res) : "w"(__tab), "w"(__idx));
	return __res;
#else
	/*
	 * No native ARMv7 NEON instruction for this, so do it via two
	 * half-width TBLs instead (vtbl2_u8 equivalent).
	 */
	uint64x2_t __tab64 = (uint64x2_t)__tab;
	uint8x8_t __tablo = (uint8x8_t)__tab64[0];
	uint8x8_t __tabhi = (uint8x8_t)__tab64[1];
	uint8x8x2_t __tab8x8x2 = { { __tablo, __tabhi } };
	union {
		uint8x8x2_t __u8x8x2;
		__builtin_neon_ti __ti;
	} __u = { __tab8x8x2 };
	uint64x2_t __idx64, __out64;
	int8x8_t __idxlo, __idxhi, __outlo, __outhi;

	__idx64 = (uint64x2_t)__idx;
	__idxlo = (int8x8_t)__idx64[0];
	__idxhi = (int8x8_t)__idx64[1];
	__outlo = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxlo);
	__outhi = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxhi);
	__out64 = (uint64x2_t) { (uint64x1_t)__outlo, (uint64x1_t)__outhi };

	return (uint8x16_t)__out64;
#endif
#elif defined(__clang__)
#ifndef __LITTLE_ENDIAN__
	__tab = __builtin_shufflevector(__tab, __tab,
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
	__idx = __builtin_shufflevector(__idx, __idx,
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
#endif
	uint8x16_t __r;
#ifdef __aarch64__
	__r = __builtin_neon_vqtbl1q_v((int8x16_t)__tab, (int8x16_t)__idx, 48);
#else
	uint64x2_t __tab64 = (uint64x2_t)__tab;
	uint8x8_t __tablo = (uint8x8_t)__tab64[0];
	uint8x8_t __tabhi = (uint8x8_t)__tab64[1];
	uint64x2_t __idx64, __out64;
	int8x8_t __idxlo, __idxhi, __outlo, __outhi;

	__idx64 = (uint64x2_t)__idx;
	__idxlo = (int8x8_t)__idx64[0];
	__idxhi = (int8x8_t)__idx64[1];
	__outlo = (uint8x8_t)__builtin_neon_vtbl2_v((int8x8_t)__tablo,
	    (int8x8_t)__tabhi, (int8x8_t)__idxlo, 16);
	__outhi = (uint8x8_t)__builtin_neon_vtbl2_v((int8x8_t)__tablo,
	    (int8x8_t)__tabhi, (int8x8_t)__idxhi, 16);
	__out64 = (uint64x2_t) { (uint64_t)__outlo, (uint64_t)__outhi };
	__r = (uint8x16_t)__out64;
#endif
#ifndef __LITTLE_ENDIAN__
	__r = __builtin_shufflevector(__r, __r,
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
#endif
	return __r;
#endif
}

_INTRINSATTR
static __inline int32x4_t
vreinterpretq_s32_u8(uint8x16_t __v)
{
	return (int32x4_t)__v;
}

_INTRINSATTR
static __inline uint16x8_t
vreinterpretq_u16_u32(uint32x4_t __v)
{
	return (uint16x8_t)__v;
}

_INTRINSATTR
static __inline uint32x4_t
vreinterpretq_u32_u16(uint16x8_t __v)
{
	return (uint32x4_t)__v;
}

_INTRINSATTR
static __inline uint32x4_t
vreinterpretq_u32_u64(uint64x2_t __v)
{
	return (uint32x4_t)__v;
}

_INTRINSATTR
static __inline uint32x4_t
vreinterpretq_u32_u8(uint8x16_t __v)
{
	return (uint32x4_t)__v;
}

_INTRINSATTR
static __inline uint64x2_t
vreinterpretq_u64_u32(uint32x4_t __v)
{
	return (uint64x2_t)__v;
}

_INTRINSATTR
static __inline uint64x2_t
vreinterpretq_u64_u8(uint8x16_t __v)
{
	return (uint64x2_t)__v;
}

_INTRINSATTR
static __inline uint8x16_t
vreinterpretq_u8_s32(int32x4_t __v)
{
	return (uint8x16_t)__v;
}

_INTRINSATTR
static __inline uint8x16_t
vreinterpretq_u8_u32(uint32x4_t __v)
{
	return (uint8x16_t)__v;
}

_INTRINSATTR
static __inline uint8x16_t
vreinterpretq_u8_u64(uint64x2_t __v)
{
	return (uint8x16_t)__v;
}
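
/*
 * The vreinterpretq_* conversions above are pure type puns and generate
 * no instructions.  The vrev32q_* intrinsics below reverse the 16-bit
 * (vrev32q_u16) or 8-bit (vrev32q_u8) elements within each 32-bit word,
 * e.g.
 *
 *	vrev32q_u8({ 0,1,2,3, 4,5,6,7, ... }) = { 3,2,1,0, 7,6,5,4, ... }
 *
 * which byte-swaps every 32-bit lane.
 */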

_INTRINSATTR
static __inline uint16x8_t
vrev32q_u16(uint16x8_t __v)
{
#if defined(__GNUC__) && !defined(__clang__)
	return __builtin_shuffle(__v, (uint16x8_t) { 1,0, 3,2, 5,4, 7,6 });
#elif defined(__clang__)
	return __builtin_shufflevector(__v, __v, 1,0, 3,2, 5,4, 7,6);
#endif
}

_INTRINSATTR
static __inline uint8x16_t
vrev32q_u8(uint8x16_t __v)
{
#if defined(__GNUC__) && !defined(__clang__)
	return __builtin_shuffle(__v,
	    (uint8x16_t) { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 });
#elif defined(__clang__)
	return __builtin_shufflevector(__v, __v,
	    3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12);
#endif
}

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint32x4_t
vsetq_lane_u32(uint32_t __x, uint32x4_t __v, uint8_t __i)
{
	__v[__neon_laneq_index(__v, __i)] = __x;
	return __v;
}
#elif defined(__clang__)
#define vsetq_lane_u32(__x, __v, __i) \
	(uint32x4_t)__builtin_neon_vsetq_lane_i32((__x), (int32x4_t)(__v), \
	    __neon_laneq_index(__v, __i))
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint64x2_t
vsetq_lane_u64(uint64_t __x, uint64x2_t __v, uint8_t __i)
{
	__v[__neon_laneq_index(__v, __i)] = __x;
	return __v;
}
#elif defined(__clang__)
#define vsetq_lane_u64(__x, __v, __i) \
	(uint64x2_t)__builtin_neon_vsetq_lane_i64((__x), (int64x2_t)(__v), \
	    __neon_laneq_index(__v, __i))
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline int32x4_t
vshlq_n_s32(int32x4_t __v, uint8_t __bits)
{
#ifdef __aarch64__
	return (int32x4_t)__builtin_aarch64_ashlv4si(__v, __bits);
#else
	return (int32x4_t)__builtin_neon_vshl_nv4si(__v, __bits);
#endif
}
#elif defined(__clang__)
#define vshlq_n_s32(__v, __bits) \
	(int32x4_t)__builtin_neon_vshlq_n_v((int32x4_t)(__v), (__bits), 34)
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint32x4_t
vshlq_n_u32(uint32x4_t __v, uint8_t __bits)
{
#ifdef __aarch64__
	return (uint32x4_t)__builtin_aarch64_ashlv4si((int32x4_t)__v, __bits);
#else
	return (uint32x4_t)__builtin_neon_vshl_nv4si((int32x4_t)__v, __bits);
#endif
}
#elif defined(__clang__)
#define vshlq_n_u32(__v, __bits) \
	(uint32x4_t)__builtin_neon_vshlq_n_v((int32x4_t)(__v), (__bits), 50)
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint32x4_t
vshrq_n_u32(uint32x4_t __v, uint8_t __bits)
{
#ifdef __aarch64__
# if __GNUC_PREREQ__(12, 0)
	return __builtin_aarch64_lshrv4si_uus(__v, __bits);
# else
	return (uint32x4_t)__builtin_aarch64_lshrv4si((int32x4_t)__v, __bits);
# endif
#else
	return (uint32x4_t)__builtin_neon_vshru_nv4si((int32x4_t)__v, __bits);
#endif
}
#elif defined(__clang__)
#define vshrq_n_u32(__v, __bits) \
	(uint32x4_t)__builtin_neon_vshrq_n_v((int32x4_t)(__v), (__bits), 50)
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint8x16_t
vshrq_n_u8(uint8x16_t __v, uint8_t __bits)
{
#ifdef __aarch64__
# if __GNUC_PREREQ__(12, 0)
	return __builtin_aarch64_lshrv16qi_uus(__v, __bits);
# else
	return (uint8x16_t)__builtin_aarch64_lshrv16qi((int8x16_t)__v, __bits);
# endif
#else
	return (uint8x16_t)__builtin_neon_vshru_nv16qi((int8x16_t)__v, __bits);
#endif
}
#elif defined(__clang__)
#define vshrq_n_u8(__v, __bits) \
	(uint8x16_t)__builtin_neon_vshrq_n_v((int8x16_t)(__v), (__bits), 48)
#endif
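
/*
 * vsliq_n_s32 (shift left and insert) and vsriq_n_u32 (shift right and
 * insert) below shift each element of the second operand by __bits and
 * insert the result into the first operand, leaving the destination
 * bits not covered by the shifted value unchanged.  A common use is a
 * per-element rotate: for a constant 0 < __n < 32,
 *
 *	vsriq_n_u32(vshlq_n_u32(__x, __n), __x, 32 - __n)
 *
 * rotates each 32-bit lane of __x left by __n bits.
 */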

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline int32x4_t
vsliq_n_s32(int32x4_t __vins, int32x4_t __vsh, uint8_t __bits)
{
#ifdef __aarch64__
	return (int32x4_t)__builtin_aarch64_ssli_nv4si(__vins, __vsh, __bits);
#else
	return (int32x4_t)__builtin_neon_vsli_nv4si(__vins, __vsh, __bits);
#endif
}
#elif defined(__clang__)
#ifdef __LITTLE_ENDIAN__
#define vsliq_n_s32(__vins, __vsh, __bits) \
	(int32x4_t)__builtin_neon_vsliq_n_v((int32x4_t)(__vins), \
	    (int32x4_t)(__vsh), (__bits), 34)
#else
#define vsliq_n_s32(__vins, __vsh, __bits) ( \
{ \
	int32x4_t __tvins = (__vins); \
	int32x4_t __tvsh = (__vsh); \
	uint8_t __tbits = (__bits); \
	int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins, \
	    3,2,1,0); \
	int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh, \
	    3,2,1,0); \
	int32x4_t __r = __builtin_neon_vsliq_n_v(__vins_r, __vsh_r, __tbits, \
	    34); \
	__builtin_shufflevector(__r, __r, 3,2,1,0); \
})
#endif	/* __LITTLE_ENDIAN__ */
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint32x4_t
vsriq_n_u32(uint32x4_t __vins, uint32x4_t __vsh, uint8_t __bits)
{
#ifdef __aarch64__
	return __builtin_aarch64_usri_nv4si_uuus(__vins, __vsh, __bits);
#else
	return (uint32x4_t)__builtin_neon_vsri_nv4si((int32x4_t)__vins,
	    (int32x4_t)__vsh, __bits);
#endif
}
#elif defined(__clang__)
#ifdef __LITTLE_ENDIAN__
#define vsriq_n_u32(__vins, __vsh, __bits) \
	(uint32x4_t)__builtin_neon_vsriq_n_v((int32x4_t)(__vins), \
	    (int32x4_t)(__vsh), (__bits), 34)
#else
#define vsriq_n_u32(__vins, __vsh, __bits) ( \
{ \
	int32x4_t __tvins = (__vins); \
	int32x4_t __tvsh = (__vsh); \
	uint8_t __tbits = (__bits); \
	int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins, \
	    3,2,1,0); \
	int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh, \
	    3,2,1,0); \
	int32x4_t __r = __builtin_neon_vsriq_n_v(__vins_r, __vsh_r, __tbits, \
	    34); \
	__builtin_shufflevector(__r, __r, 3,2,1,0); \
})
#endif	/* __LITTLE_ENDIAN__ */
#endif

_INTRINSATTR
static __inline void
vst1q_u32(uint32_t *__p32, uint32x4_t __v)
{
#if defined(__GNUC__) && !defined(__clang__)
#ifdef __aarch64__
	__builtin_aarch64_simd_si *__p = (__builtin_aarch64_simd_si *)__p32;

	__builtin_aarch64_st1v4si(__p, (int32x4_t)__v);
#else
	__builtin_neon_si *__p = (__builtin_neon_si *)__p32;

	__builtin_neon_vst1v4si(__p, (int32x4_t)__v);
#endif
#elif defined(__clang__)
#ifndef __LITTLE_ENDIAN__
	__v = __builtin_shufflevector(__v, __v, 3,2,1,0);
#endif
	__builtin_neon_vst1q_v(__p32, __v, 50);
#endif
}

_INTRINSATTR
static __inline void
vst1q_u8(uint8_t *__p8, uint8x16_t __v)
{
#if defined(__GNUC__) && !defined(__clang__)
#ifdef __aarch64__
	__builtin_aarch64_simd_qi *__p = (__builtin_aarch64_simd_qi *)__p8;

	__builtin_aarch64_st1v16qi(__p, (int8x16_t)__v);
#else
	__builtin_neon_qi *__p = (__builtin_neon_qi *)__p8;

	__builtin_neon_vst1v16qi(__p, (int8x16_t)__v);
#endif
#elif defined(__clang__)
#ifndef __LITTLE_ENDIAN__
	__v = __builtin_shufflevector(__v, __v,
	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
#endif
	__builtin_neon_vst1q_v(__p8, __v, 48);
#endif
}
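
/*
 * vtbl1_u8 and vtbl2_u8 operate on 64-bit vectors and are provided only
 * for 32-bit ARM; they correspond to the half-width table lookups used
 * by the ARMv7 path of vqtbl1q_u8 above.  As with TBL, an out-of-range
 * index selects zero: vtbl1_u8(__tab, __idx) yields __tab[__idx[__i]]
 * for __idx[__i] < 8 and 0 otherwise.
 */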

#ifndef __aarch64__	/* XXX */

_INTRINSATTR
static __inline uint8x8_t
vtbl1_u8(uint8x8_t __tab, uint8x8_t __idx)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (uint8x8_t)__builtin_neon_vtbl1v8qi((int8x8_t)__tab,
	    (int8x8_t)__idx);
#elif defined(__clang__)
	uint8x8_t __ret;
#ifndef __LITTLE_ENDIAN__
	__tab = __builtin_shufflevector(__tab, __tab, 7,6,5,4,3,2,1,0);
	__idx = __builtin_shufflevector(__idx, __idx, 7,6,5,4,3,2,1,0);
#endif
	__ret = (uint8x8_t)__builtin_neon_vtbl1_v((int8x8_t)__tab,
	    (int8x8_t)__idx, 16);
#ifndef __LITTLE_ENDIAN__
	__ret = __builtin_shufflevector(__ret, __ret, 7,6,5,4,3,2,1,0);
#endif
	return __ret;
#endif
}

_INTRINSATTR
static __inline uint8x8_t
vtbl2_u8(uint8x8x2_t __tab, uint8x8_t __idx)
{
#if defined(__GNUC__) && !defined(__clang__)
	union {
		uint8x8x2_t __u8x8x2;
		__builtin_neon_ti __ti;
	} __u = { __tab };
	return (uint8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, (int8x8_t)__idx);
#elif defined(__clang__)
	uint8x8_t __ret;
#ifndef __LITTLE_ENDIAN__
	__tab.val[0] = __builtin_shufflevector(__tab.val[0], __tab.val[0],
	    7,6,5,4,3,2,1,0);
	__tab.val[1] = __builtin_shufflevector(__tab.val[1], __tab.val[1],
	    7,6,5,4,3,2,1,0);
	__idx = __builtin_shufflevector(__idx, __idx, 7,6,5,4,3,2,1,0);
#endif
	__ret = (uint8x8_t)__builtin_neon_vtbl2_v((int8x8_t)__tab.val[0],
	    (int8x8_t)__tab.val[1], (int8x8_t)__idx, 16);
#ifndef __LITTLE_ENDIAN__
	__ret = __builtin_shufflevector(__ret, __ret, 7,6,5,4,3,2,1,0);
#endif
	return __ret;
#endif
}

#endif	/* !defined(__aarch64__) */

#endif	/* _SYS_CRYPTO_ARCH_ARM_ARM_NEON_H */