/*	$NetBSD: immintrin.h,v 1.4 2024/07/16 15:27:40 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef	_SYS_CRYPTO_ARCH_X86_IMMINTRIN_H
#define	_SYS_CRYPTO_ARCH_X86_IMMINTRIN_H

#include <sys/types.h>

/*
 * This kludgerous header file provides definitions for the Intel
 * intrinsics that work with GCC and Clang, because <immintrin.h> is
 * not available during the kernel build and arranging to make it
 * available is complicated.  Please fix this properly!
 */

#if defined(__GNUC__) && !defined(__clang__)

#define	_INTRINSATTR \
	__attribute__((__gnu_inline__, __always_inline__, __artificial__))

/*
 * The *_u types carry __aligned__(1) and __may_alias__, so dereferencing
 * a pointer to one is an unaligned, alias-safe access.  The GCC branches
 * of the loadu/storeu intrinsics below rely on this.
 */
typedef short __m16 __attribute__((__vector_size__(2), __may_alias__));
typedef short __m16_u
    __attribute__((__vector_size__(2), __may_alias__, __aligned__(1)));
typedef int __m32 __attribute__((__vector_size__(4), __may_alias__));
typedef int __m32_u
    __attribute__((__vector_size__(4), __may_alias__, __aligned__(1)));
typedef int __m64 __attribute__((__vector_size__(8), __may_alias__));
typedef int __m64_u
    __attribute__((__vector_size__(8), __may_alias__, __aligned__(1)));
typedef float __m128 __attribute__((__vector_size__(16), __may_alias__));
typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef int __v4si __attribute__((__vector_size__(16)));
typedef unsigned __v4su __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));
typedef char __v16qi_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));

#elif defined(__clang__)

typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i
    __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef int __v4si __attribute__((__vector_size__(16)));
typedef unsigned __v4su __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

#define	_INTRINSATTR \
	__attribute__((__always_inline__, __nodebug__, __target__("sse2"), \
	    __min_vector_width__(128)))
/*
 * Attribute for the anonymous one-member structs used by the Clang
 * branches of the loadu/storeu intrinsics: packed (no alignment
 * requirement) and may_alias (exempt from type-based aliasing).
 */
#define	_PACKALIAS \
	__attribute__((__packed__, __may_alias__))

#else

#error Please teach me how to do Intel intrinsics for your compiler!

#endif

/*
 * Extra function attribute for intrinsics that need SSSE3.  Spelled
 * with the reserved __target__ form so that a macro named `target'
 * in the including file cannot interfere.
 */
#define	_SSSE3_ATTR	__attribute__((__target__("ssse3")))

/*
 * Add the four 32-bit lanes of __a and __b.  The addition is done on
 * unsigned lanes so that overflow wraps instead of being signed
 * overflow.
 */
_INTRINSATTR
static __inline __m128i
_mm_add_epi32(__m128i __a, __m128i __b)
{
	return (__m128i)((__v4su)__a + (__v4su)__b);
}

/*
 * PALIGNR wrapper.  `bytes' must be a compile-time constant.  The GCC
 * builtin is given the shift as a bit count (hence the 8*), the Clang
 * builtin as a byte count.  Uses an SSSE3 builtin, so the caller must
 * be compiled with SSSE3 enabled.
 */
#if defined(__GNUC__) && !defined(__clang__)
#define	_mm_alignr_epi8(hi,lo,bytes) \
	((__m128i)__builtin_ia32_palignr128((__v2di)(__m128i)(hi), \
	    (__v2di)(__m128i)(lo), 8*(int)(bytes)))
#elif defined(__clang__)
#define	_mm_alignr_epi8(hi,lo,bytes) \
	((__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(hi), \
	    (__v16qi)(__m128i)(lo), (int)(bytes)))
#endif

/*
 * Load one float from __p and replicate it into all four lanes.
 */
_INTRINSATTR
static __inline __m128
_mm_load1_ps(const float *__p)
{
	return __extension__ (__m128)(__v4sf) { *__p, *__p, *__p, *__p };
}

/*
 * Load 128 bits from a possibly unaligned address.
 */
_INTRINSATTR
static __inline __m128i
_mm_loadu_si128(const __m128i_u *__p)
{
#if defined(__GNUC__) && !defined(__clang__)
	return *__p;
#else
	return ((const struct { __m128i_u __v; } _PACKALIAS *)__p)->__v;
#endif
}

/*
 * Load 32 bits from a possibly unaligned address into lane 0; the
 * remaining three 32-bit lanes are zero.
 */
_INTRINSATTR
static __inline __m128i
_mm_loadu_si32(const void *__p)
{
#if defined(__GNUC__) && !defined(__clang__)
	int32_t __v = (*(const __m32_u *)__p)[0];
#else
	int32_t __v = ((const struct { int32_t __v; } _PACKALIAS *)__p)->__v;
#endif
	return __extension__ (__m128i)(__v4si){ __v, 0, 0, 0 };
}

/*
 * Load 64 bits from a possibly unaligned address into the low 64-bit
 * lane; the high lane is zero.
 */
_INTRINSATTR
static __inline __m128i
_mm_loadu_si64(const void *__p)
{
#if defined(__GNUC__) && !defined(__clang__)
	/* Reinterpret the 8-byte vector as a single 64-bit integer. */
	int64_t __v = (int64_t)*(const __m64_u *)__p;
#else
	int64_t __v = ((const struct { int64_t __v; } _PACKALIAS *)__p)->__v;
#endif
	return __extension__ (__m128i)(__v2di){ __v, 0 };
}

/*
 * Load 128 bits by dereferencing __p as an __m128i directly, so __p
 * must satisfy the alignment of __m128i.
 */
_INTRINSATTR
static __inline __m128i
_mm_load_si128(const __m128i *__p)
{
	return *__p;
}

/*
 * Result lanes: { __v1[2], __v1[3], __v0[2], __v0[3] }.
 */
_INTRINSATTR
static __inline __m128
_mm_movehl_ps(__m128 __v0, __m128 __v1)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128)__builtin_ia32_movhlps((__v4sf)__v0, (__v4sf)__v1);
#elif defined(__clang__)
	return __builtin_shufflevector((__v4sf)__v0, (__v4sf)__v1, 6,7,2,3);
#endif
}

/*
 * Result lanes: { __v0[0], __v0[1], __v1[0], __v1[1] }.
 */
_INTRINSATTR
static __inline __m128
_mm_movelh_ps(__m128 __v0, __m128 __v1)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128)__builtin_ia32_movlhps((__v4sf)__v0, (__v4sf)__v1);
#elif defined(__clang__)
	return __builtin_shufflevector((__v4sf)__v0, (__v4sf)__v1, 0,1,4,5);
#endif
}

/*
 * Replicate a 16-bit value into all eight lanes.
 */
_INTRINSATTR
static __inline __m128i
_mm_set1_epi16(int16_t __v)
{
	return __extension__ (__m128i)(__v8hi){
	    __v, __v, __v, __v, __v, __v, __v, __v
	};
}

/*
 * Replicate a 32-bit value into all four lanes.
 */
_INTRINSATTR
static __inline __m128i
_mm_set1_epi32(int32_t __v)
{
	return __extension__ (__m128i)(__v4si){ __v, __v, __v, __v };
}

/*
 * Replicate a 64-bit value into both lanes.
 */
_INTRINSATTR
static __inline __m128i
_mm_set1_epi64x(int64_t __v)
{
	return __extension__ (__m128i)(__v2di){ __v, __v };
}

/*
 * Build a vector from four 32-bit values.  Arguments are given from
 * the highest lane to the lowest: __v0 lands in lane 0.
 */
_INTRINSATTR
static __inline __m128i
_mm_set_epi32(int32_t __v3, int32_t __v2, int32_t __v1, int32_t __v0)
{
	return __extension__ (__m128i)(__v4si){ __v0, __v1, __v2, __v3 };
}

/*
 * Build a vector from two 64-bit values.  Arguments are given high
 * lane first: __v0 lands in lane 0.
 */
_INTRINSATTR
static __inline __m128i
_mm_set_epi64x(int64_t __v1, int64_t __v0)
{
	return __extension__ (__m128i)(__v2di){ __v0, __v1 };
}

/*
 * All-zero float vector.
 */
_INTRINSATTR
static __inline __m128
_mm_setzero_ps(void)
{
	return __extension__ (__m128){ 0, 0, 0, 0 };
}

/*
 * All-zero integer vector.
 */
_INTRINSATTR
static __inline __m128i
_mm_setzero_si128(void)
{
	return _mm_set1_epi64x(0);
}

/*
 * PSHUFB wrapper: byte shuffle of __vtbl controlled by __vidx.
 * Requires SSSE3, which _SSSE3_ATTR enables for this function.
 */
_INTRINSATTR _SSSE3_ATTR
static __inline __m128i
_mm_shuffle_epi8(__m128i __vtbl, __m128i __vidx)
{
	return (__m128i)__builtin_ia32_pshufb128((__v16qi)__vtbl,
	    (__v16qi)__vidx);
}

/*
 * PSHUFD wrapper.  `m' is passed to the builtin as the shuffle
 * immediate, so it must be a compile-time constant.
 */
#define	_mm_shuffle_epi32(v,m) \
	((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(v), (int)(m)))

/*
 * SHUFPS wrapper.  `m' is passed to the builtin as the shuffle
 * immediate, so it must be a compile-time constant.
 */
#define	_mm_shuffle_ps(x,y,m) \
	((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(x), \
	    (__v4sf)(__m128)(y), (int)(m)))

/*
 * Shift each 32-bit lane left by __bits.
 */
_INTRINSATTR
static __inline __m128i
_mm_slli_epi32(__m128i __v, uint8_t __bits)
{
	return (__m128i)__builtin_ia32_pslldi128((__v4si)__v, (int)__bits);
}

/*
 * Shift each 64-bit lane left by __bits.
 */
_INTRINSATTR
static __inline __m128i
_mm_slli_epi64(__m128i __v, uint8_t __bits)
{
	return (__m128i)__builtin_ia32_psllqi128((__v2di)__v, (int)__bits);
}

/*
 * Shift the whole 128-bit value left by `bytes' bytes.  `bytes' must
 * be a compile-time constant.  The GCC builtin is given a bit count
 * (hence the 8*), the Clang builtin a byte count.
 */
#if defined(__GNUC__) && !defined(__clang__)
#define	_mm_slli_si128(v,bytes) \
	((__m128i)__builtin_ia32_pslldqi128((__v2di)(__m128i)(v), \
	    8*(int)(bytes)))
#elif defined(__clang__)
#define	_mm_slli_si128(v,bytes) \
	((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(v), \
	    (int)(bytes)))
#endif

/*
 * Logical right shift of each 32-bit lane by __bits.
 */
_INTRINSATTR
static __inline __m128i
_mm_srli_epi32(__m128i __v, uint8_t __bits)
{
	return (__m128i)__builtin_ia32_psrldi128((__v4si)__v, (int)__bits);
}

/*
 * Logical right shift of each 64-bit lane by __bits.
 */
_INTRINSATTR
static __inline __m128i
_mm_srli_epi64(__m128i __v, uint8_t __bits)
{
	return (__m128i)__builtin_ia32_psrlqi128((__v2di)__v, (int)__bits);
}

/*
 * Shift the whole 128-bit value right by `bytes' bytes.  `bytes' must
 * be a compile-time constant.  The GCC builtin is given a bit count
 * (hence the 8*), the Clang builtin a byte count.
 *
 * The expansion must stay a plain expression with no trailing
 * semicolon, so the macro can be used as a function argument or
 * inside a larger expression.
 */
#if defined(__GNUC__) && !defined(__clang__)
#define	_mm_srli_si128(v,bytes) \
	((__m128i)__builtin_ia32_psrldqi128((__v2di)(__m128i)(v), \
	    8*(int)(bytes)))
#elif defined(__clang__)
#define	_mm_srli_si128(v,bytes) \
	((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(v), \
	    (int)(bytes)))
#endif

/*
 * Store 128 bits to a possibly unaligned address.
 */
_INTRINSATTR
static __inline void
_mm_storeu_si128(__m128i_u *__p, __m128i __v)
{
#if defined(__GNUC__) && !defined(__clang__)
	*__p = __v;
#else
	((struct { __m128i_u __v; } _PACKALIAS *)__p)->__v = __v;
#endif
}

/*
 * Store the low 32-bit lane of __v to a possibly unaligned address.
 */
_INTRINSATTR
static __inline void
_mm_storeu_si32(void *__p, __m128i __v)
{
#if defined(__GNUC__) && !defined(__clang__)
	*(__m32_u *)__p = (__m32)((__v4si)__v)[0];
#else
	((struct { int32_t __v; } _PACKALIAS *)__p)->__v = ((__v4si)__v)[0];
#endif
}

/*
 * Store the low 64-bit lane of __v to a possibly unaligned address.
 */
_INTRINSATTR
static __inline void
_mm_storeu_si64(void *__p, __m128i __v)
{
#if defined(__GNUC__) && !defined(__clang__)
	*(__m64_u *)__p = (__m64)((__v2di)__v)[0];
#else
	((struct { int64_t __v; } _PACKALIAS *)__p)->__v = ((__v2di)__v)[0];
#endif
}

/*
 * Store 128 bits by assigning through __p as an __m128i directly, so
 * __p must satisfy the alignment of __m128i.
 */
_INTRINSATTR
static __inline void
_mm_store_si128(__m128i *__p, __m128i __v)
{
	*__p = __v;
}

/*
 * Subtract the two 64-bit lanes of __y from those of __x.  Done on
 * unsigned lanes so that overflow wraps.
 */
_INTRINSATTR
static __inline __m128i
_mm_sub_epi64(__m128i __x, __m128i __y)
{
	return (__m128i)((__v2du)__x - (__v2du)__y);
}

/*
 * Interleave the high 32-bit lanes:
 * { __lo[2], __hi[2], __lo[3], __hi[3] }.
 */
_INTRINSATTR
static __inline __m128i
_mm_unpackhi_epi32(__m128i __lo, __m128i __hi)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128i)__builtin_ia32_punpckhdq128((__v4si)__lo,
	    (__v4si)__hi);
#elif defined(__clang__)
	return (__m128i)__builtin_shufflevector((__v4si)__lo, (__v4si)__hi,
	    2,6,3,7);
#endif
}

/*
 * Interleave the low 32-bit lanes:
 * { __lo[0], __hi[0], __lo[1], __hi[1] }.
 */
_INTRINSATTR
static __inline __m128i
_mm_unpacklo_epi32(__m128i __lo, __m128i __hi)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128i)__builtin_ia32_punpckldq128((__v4si)__lo,
	    (__v4si)__hi);
#elif defined(__clang__)
	return (__m128i)__builtin_shufflevector((__v4si)__lo, (__v4si)__hi,
	    0,4,1,5);
#endif
}

/*
 * Combine the low 64-bit lanes: { __lo[0], __hi[0] }.
 */
_INTRINSATTR
static __inline __m128i
_mm_unpacklo_epi64(__m128i __lo, __m128i __hi)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128i)__builtin_ia32_punpcklqdq128((__v2di)__lo,
	    (__v2di)__hi);
#elif defined(__clang__)
	return (__m128i)__builtin_shufflevector((__v2di)__lo, (__v2di)__hi,
	    0,2);
#endif
}

#endif	/* _SYS_CRYPTO_ARCH_X86_IMMINTRIN_H */