/*	$NetBSD: immintrin.h,v 1.4 2024/07/16 15:27:40 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef	_SYS_CRYPTO_ARCH_X86_IMMINTRIN_H
#define	_SYS_CRYPTO_ARCH_X86_IMMINTRIN_H

#include <sys/types.h>

/*
 * This kludgerous header file provides definitions for the Intel
 * intrinsics that work with GCC and Clang, because <immintrin.h> is
 * not available during the kernel build and arranging to make it
 * available is complicated.  Please fix this properly!
 */
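
/*
 * Illustrative usage sketch (not part of this header): a kernel
 * crypto routine might XOR two 16-byte blocks like so, using the
 * GNU vector extension `^' since no _mm_xor_si128 is defined here.
 * The function name is hypothetical.
 *
 *	static void
 *	xor16(uint8_t *out, const uint8_t *a, const uint8_t *b)
 *	{
 *		__m128i x = _mm_loadu_si128((const __m128i_u *)a);
 *		__m128i y = _mm_loadu_si128((const __m128i_u *)b);
 *
 *		_mm_storeu_si128((__m128i_u *)out, x ^ y);
 *	}
 */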

#if defined(__GNUC__) && !defined(__clang__)

#define	_INTRINSATTR							      \
	__attribute__((__gnu_inline__, __always_inline__, __artificial__))

/*
 * Define the vector types by hand.  The `_u' variants are aligned to
 * one byte so they can be used for unaligned loads and stores.
 */
typedef short __m16 __attribute__((__vector_size__(2), __may_alias__));
typedef short __m16_u
    __attribute__((__vector_size__(2), __may_alias__, __aligned__(1)));
typedef int __m32 __attribute__((__vector_size__(4), __may_alias__));
typedef int __m32_u
    __attribute__((__vector_size__(4), __may_alias__, __aligned__(1)));
typedef int __m64 __attribute__((__vector_size__(8), __may_alias__));
typedef int __m64_u
    __attribute__((__vector_size__(8), __may_alias__, __aligned__(1)));
typedef float __m128 __attribute__((__vector_size__(16), __may_alias__));
typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef int __v4si __attribute__((__vector_size__(16)));
typedef unsigned __v4su __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));
typedef char __v16qi_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));

#elif defined(__clang__)

typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i
    __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef int __v4si __attribute__((__vector_size__(16)));
typedef unsigned __v4su __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

#define	_INTRINSATTR							      \
	__attribute__((__always_inline__, __nodebug__, __target__("sse2"),    \
		__min_vector_width__(128)))
#define	_PACKALIAS							      \
	__attribute__((__packed__, __may_alias__))

#else

#error Please teach me how to do Intel intrinsics for your compiler!

#endif

/* Additional attribute for intrinsics that require SSSE3, e.g. pshufb. */
#define	_SSSE3_ATTR	__attribute__((target("ssse3")))

_INTRINSATTR
static __inline __m128i
_mm_add_epi32(__m128i __a, __m128i __b)
{
	return (__m128i)((__v4su)__a + (__v4su)__b);
}

#if defined(__GNUC__) && !defined(__clang__)
#define	_mm_alignr_epi8(hi,lo,bytes)					      \
	(__m128i)__builtin_ia32_palignr128((__v2di)(__m128i)(hi),	      \
	    (__v2di)(__m128i)(lo), 8*(int)(bytes))
#elif defined(__clang__)
#define	_mm_alignr_epi8(hi,lo,bytes)					      \
	(__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(hi),	      \
	    (__v16qi)(__m128i)(lo), (int)(bytes))
#endif
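
/*
 * Example (illustrative): _mm_alignr_epi8(hi, lo, n) takes the
 * 32-byte concatenation hi:lo and returns its bytes n through n+15,
 * i.e. the concatenation shifted right by n bytes.  GCC's palignr
 * builtin counts bits, hence the 8*(int)(bytes) above; Clang's
 * counts bytes.
 */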

_INTRINSATTR
static __inline __m128
_mm_load1_ps(const float *__p)
{
	return __extension__ (__m128)(__v4sf) { *__p, *__p, *__p, *__p };
}

_INTRINSATTR
static __inline __m128i
_mm_loadu_si128(const __m128i_u *__p)
{
#if defined(__GNUC__) && !defined(__clang__)
	return *__p;
#else
	return ((const struct { __m128i_u __v; } _PACKALIAS *)__p)->__v;
#endif
}
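
/*
 * Note on the struct dance above: on Clang, reading through a packed,
 * may_alias struct member tells the compiler the pointer may be
 * unaligned and may alias anything, which is how Clang's own
 * <emmintrin.h> implements unaligned loads and stores; GCC instead
 * uses the byte-aligned may_alias vector typedefs directly.
 */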

_INTRINSATTR
static __inline __m128i
_mm_loadu_si32(const void *__p)
{
#if defined(__GNUC__) && !defined(__clang__)
	int32_t __v = (*(const __m32_u *)__p)[0];
#else
	int32_t __v = ((const struct { int32_t __v; } _PACKALIAS *)__p)->__v;
#endif
	return __extension__ (__m128i)(__v4si){ __v, 0, 0, 0 };
}

_INTRINSATTR
static __inline __m128i
_mm_loadu_si64(const void *__p)
{
#if defined(__GNUC__) && !defined(__clang__)
	int64_t __v = (int64_t)*(const __m64_u *)__p;
#else
	int64_t __v = ((const struct { int64_t __v; } _PACKALIAS *)__p)->__v;
#endif
	return __extension__ (__m128i)(__v2di){ __v, 0 };
}

_INTRINSATTR
static __inline __m128i
_mm_load_si128(const __m128i *__p)
{
	return *__p;
}

_INTRINSATTR
static __inline __m128
_mm_movehl_ps(__m128 __v0, __m128 __v1)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128)__builtin_ia32_movhlps((__v4sf)__v0, (__v4sf)__v1);
#elif defined(__clang__)
	return __builtin_shufflevector((__v4sf)__v0, (__v4sf)__v1, 6,7,2,3);
#endif
}

_INTRINSATTR
static __inline __m128
_mm_movelh_ps(__m128 __v0, __m128 __v1)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128)__builtin_ia32_movlhps((__v4sf)__v0, (__v4sf)__v1);
#elif defined(__clang__)
	return __builtin_shufflevector((__v4sf)__v0, (__v4sf)__v1, 0,1,4,5);
#endif
}

_INTRINSATTR
static __inline __m128i
_mm_set1_epi16(int16_t __v)
{
	return __extension__ (__m128i)(__v8hi){
	    __v, __v, __v, __v, __v, __v, __v, __v
	};
}

_INTRINSATTR
static __inline __m128i
_mm_set1_epi32(int32_t __v)
{
	return __extension__ (__m128i)(__v4si){ __v, __v, __v, __v };
}

_INTRINSATTR
static __inline __m128i
_mm_set1_epi64x(int64_t __v)
{
	return __extension__ (__m128i)(__v2di){ __v, __v };
}

_INTRINSATTR
static __inline __m128i
_mm_set_epi32(int32_t __v3, int32_t __v2, int32_t __v1, int32_t __v0)
{
	return __extension__ (__m128i)(__v4si){ __v0, __v1, __v2, __v3 };
}

_INTRINSATTR
static __inline __m128i
_mm_set_epi64x(int64_t __v1, int64_t __v0)
{
	return __extension__ (__m128i)(__v2di){ __v0, __v1 };
}

_INTRINSATTR
static __inline __m128
_mm_setzero_ps(void)
{
	return __extension__ (__m128){ 0, 0, 0, 0 };
}

_INTRINSATTR
static __inline __m128i
_mm_setzero_si128(void)
{
	return _mm_set1_epi64x(0);
}

_INTRINSATTR _SSSE3_ATTR
static __inline __m128i
_mm_shuffle_epi8(__m128i __vtbl, __m128i __vidx)
{
	return (__m128i)__builtin_ia32_pshufb128((__v16qi)__vtbl,
	    (__v16qi)__vidx);
}
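
/*
 * Example (illustrative): _mm_shuffle_epi8 is a byte table lookup:
 * for each i, out[i] = (idx[i] & 0x80) ? 0 : tbl[idx[i] & 0xf].
 * E.g., an index vector of { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 }
 * byte-swaps each 32-bit lane.
 */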

#define	_mm_shuffle_epi32(v,m)						      \
	(__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(v), (int)(m))

#define	_mm_shuffle_ps(x,y,m)						      \
	(__m128)__builtin_ia32_shufps((__v4sf)(__m128)(x),		      \
	    (__v4sf)(__m128)(y), (int)(m))

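/*
 * Example (illustrative): the shuffle mask selects one source lane
 * per result lane, two bits each: result[i] = src[(m >> 2*i) & 3],
 * so _mm_shuffle_epi32(v, 0x1b) reverses the four 32-bit lanes.  For
 * _mm_shuffle_ps, the low two result lanes come from x and the high
 * two from y.
 */
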
_INTRINSATTR
static __inline __m128i
_mm_slli_epi32(__m128i __v, uint8_t __bits)
{
	return (__m128i)__builtin_ia32_pslldi128((__v4si)__v, (int)__bits);
}

_INTRINSATTR
static __inline __m128i
_mm_slli_epi64(__m128i __v, uint8_t __bits)
{
	return (__m128i)__builtin_ia32_psllqi128((__v2di)__v, (int)__bits);
}

#if defined(__GNUC__) && !defined(__clang__)
#define	_mm_slli_si128(v,bytes)						      \
	(__m128i)__builtin_ia32_pslldqi128((__v2di)(__m128i)(v),	      \
	    8*(int)(bytes))
#elif defined(__clang__)
#define	_mm_slli_si128(v,bytes)						      \
	(__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(v),    \
	    (int)(bytes))
#endif

_INTRINSATTR
static __inline __m128i
_mm_srli_epi32(__m128i __v, uint8_t __bits)
{
	return (__m128i)__builtin_ia32_psrldi128((__v4si)__v, (int)__bits);
}

_INTRINSATTR
static __inline __m128i
_mm_srli_epi64(__m128i __v, uint8_t __bits)
{
	return (__m128i)__builtin_ia32_psrlqi128((__v2di)__v, (int)__bits);
}

#if defined(__GNUC__) && !defined(__clang__)
#define	_mm_srli_si128(v,bytes)						      \
	(__m128i)__builtin_ia32_psrldqi128((__v2di)(__m128i)(v),	      \
	    8*(int)(bytes))
#elif defined(__clang__)
#define	_mm_srli_si128(v,bytes)						      \
	(__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(v),    \
	    (int)(bytes))
#endif
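
/*
 * Example (illustrative): _mm_slli_si128/_mm_srli_si128 shift the
 * whole 128-bit value left/right by a byte count, filling with
 * zeros, so _mm_srli_si128(v, 4) discards the low 32-bit lane.  As
 * with palignr, GCC's builtins count bits (hence 8*(int)(bytes))
 * while Clang's _byteshift builtins count bytes.
 */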

_INTRINSATTR
static __inline void
_mm_storeu_si128(__m128i_u *__p, __m128i __v)
{
#if defined(__GNUC__) && !defined(__clang__)
	*__p = __v;
#else
	((struct { __m128i_u __v; } _PACKALIAS *)__p)->__v = __v;
#endif
}

_INTRINSATTR
static __inline void
_mm_storeu_si32(void *__p, __m128i __v)
{
#if defined(__GNUC__) && !defined(__clang__)
	*(__m32_u *)__p = (__m32)((__v4si)__v)[0];
#else
	((struct { int32_t __v; } _PACKALIAS *)__p)->__v = ((__v4si)__v)[0];
#endif
}

_INTRINSATTR
static __inline void
_mm_storeu_si64(void *__p, __m128i __v)
{
#if defined(__GNUC__) && !defined(__clang__)
	*(__m64_u *)__p = (__m64)((__v2di)__v)[0];
#else
	((struct { int64_t __v; } _PACKALIAS *)__p)->__v = ((__v2di)__v)[0];
#endif
}

_INTRINSATTR
static __inline void
_mm_store_si128(__m128i *__p, __m128i __v)
{
	*__p = __v;
}

_INTRINSATTR
static __inline __m128i
_mm_sub_epi64(__m128i __x, __m128i __y)
{
	return (__m128i)((__v2du)__x - (__v2du)__y);
}

_INTRINSATTR
static __inline __m128i
_mm_unpackhi_epi32(__m128i __lo, __m128i __hi)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128i)__builtin_ia32_punpckhdq128((__v4si)__lo,
	    (__v4si)__hi);
#elif defined(__clang__)
	return (__m128i)__builtin_shufflevector((__v4si)__lo, (__v4si)__hi,
	    2,6,3,7);
#endif
}

_INTRINSATTR
static __inline __m128i
_mm_unpacklo_epi32(__m128i __lo, __m128i __hi)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128i)__builtin_ia32_punpckldq128((__v4si)__lo,
	    (__v4si)__hi);
#elif defined(__clang__)
	return (__m128i)__builtin_shufflevector((__v4si)__lo, (__v4si)__hi,
	    0,4,1,5);
#endif
}

_INTRINSATTR
static __inline __m128i
_mm_unpacklo_epi64(__m128i __lo, __m128i __hi)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128i)__builtin_ia32_punpcklqdq128((__v2di)__lo,
	    (__v2di)__hi);
#elif defined(__clang__)
	return (__m128i)__builtin_shufflevector((__v2di)__lo, (__v2di)__hi,
	    0,2);
#endif
}
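
/*
 * Example (illustrative): the unpack intrinsics interleave lanes
 * from the low or high half of each operand, e.g.
 * _mm_unpacklo_epi32(a, b) = { a[0], b[0], a[1], b[1] } and
 * _mm_unpackhi_epi32(a, b) = { a[2], b[2], a[3], b[3] }.
 */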

#endif	/* _SYS_CRYPTO_ARCH_X86_IMMINTRIN_H */