/* immintrin.h, revision 1.2 -- NetBSD sys/crypto/arch/x86 */
      1 /*	$NetBSD: immintrin.h,v 1.2 2024/07/15 13:51:10 riastradh Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2020 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * Redistribution and use in source and binary forms, with or without
      8  * modification, are permitted provided that the following conditions
      9  * are met:
     10  * 1. Redistributions of source code must retain the above copyright
     11  *    notice, this list of conditions and the following disclaimer.
     12  * 2. Redistributions in binary form must reproduce the above copyright
     13  *    notice, this list of conditions and the following disclaimer in the
     14  *    documentation and/or other materials provided with the distribution.
     15  *
     16  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     17  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     18  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     19  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     20  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     26  * POSSIBILITY OF SUCH DAMAGE.
     27  */
     28 
     29 #ifndef	_SYS_CRYPTO_ARCH_X86_IMMINTRIN_H
     30 #define	_SYS_CRYPTO_ARCH_X86_IMMINTRIN_H
     31 
     32 #include <sys/types.h>
     33 
     34 /*
     35  * This kludgerous header file provides definitions for the Intel
     36  * intrinsics that work with GCC and Clang, because <immintrin.h> is
     37  * not available during the kernel build and arranging to make it
     38  * available is complicated.  Please fix this properly!
     39  */
     40 
#if defined(__GNUC__) && !defined(__clang__)

/*
 * GCC: gnu_inline/always_inline/artificial make these wrappers act
 * like real intrinsics -- always inlined, no out-of-line copies
 * emitted, and hidden from single-stepping in the debugger.
 */
#define	_INTRINSATTR							      \
	__attribute__((__gnu_inline__, __always_inline__, __artificial__))

/*
 * Vector types.  The __mN types are N-bit vectors; the __mN_u
 * variants additionally have alignment 1 so they can be dereferenced
 * for unaligned loads and stores.  The __vNxx types name the lane
 * layout (e.g. __v4si = four signed ints, __v4su = four unsigned
 * ints, __v16qi = sixteen chars) for casts around the builtins.
 */
typedef short __m16 __attribute__((__vector_size__(2), __may_alias__));
typedef short __m16_u
    __attribute__((__vector_size__(2), __may_alias__, __aligned__(1)));
typedef int __m32 __attribute__((__vector_size__(4), __may_alias__));
typedef int __m32_u
    __attribute__((__vector_size__(4), __may_alias__, __aligned__(1)));
typedef int __m64 __attribute__((__vector_size__(8), __may_alias__));
typedef int __m64_u
    __attribute__((__vector_size__(8), __may_alias__, __aligned__(1)));
typedef float __m128 __attribute__((__vector_size__(16), __may_alias__));
typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef int __v4si __attribute__((__vector_size__(16)));
typedef unsigned __v4su __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

#elif defined(__clang__)

/* Clang: same 128-bit vector types as the GCC branch above. */
typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i
    __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef int __v4si __attribute__((__vector_size__(16)));
typedef unsigned __v4su __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

/*
 * Clang: nodebug hides the wrappers from the debugger; the target
 * and min_vector_width attributes let the SSE2 builtins compile even
 * if the translation unit is not built with -msse2 globally.
 */
#define	_INTRINSATTR							      \
	__attribute__((__always_inline__, __nodebug__, __target__("sse2"),    \
		__min_vector_width__(128)))
/*
 * Clang has no unaligned __mN_u scalar vector types like GCC's, so
 * unaligned accesses go through a packed, may_alias struct instead.
 */
#define	_PACKALIAS							      \
	__attribute__((__packed__, __may_alias__))

#else

#error Please teach me how to do Intel intrinsics for your compiler!

#endif

/* Extra target attribute for functions that need SSSE3 (e.g. pshufb). */
#define	_SSSE3_ATTR	__attribute__((target("ssse3")))
     95 
     96 _INTRINSATTR
     97 static __inline __m128i
     98 _mm_add_epi32(__m128i __a, __m128i __b)
     99 {
    100 	return (__m128i)((__v4su)__a + (__v4su)__b);
    101 }
    102 
/*
 * _mm_alignr_epi8(hi, lo, bytes): concatenate hi:lo into a 256-bit
 * value and return the 16 bytes starting `bytes' bytes up from the
 * bottom.  GCC's palignr builtin takes the shift in *bits* (hence
 * the 8*), Clang's takes it in bytes on __v16qi operands.
 */
#if defined(__GNUC__) && !defined(__clang__)
#define	_mm_alignr_epi8(hi,lo,bytes)					      \
	(__m128i)__builtin_ia32_palignr128((__v2di)(__m128i)(hi),	      \
	    (__v2di)(__m128i)(lo), 8*(int)(bytes))
#elif defined(__clang__)
#define	_mm_alignr_epi8(hi,lo,bytes)					      \
	(__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(hi),	      \
	    (__v16qi)(__m128i)(lo), (int)(bytes))
#endif
    112 
    113 _INTRINSATTR
    114 static __inline __m128
    115 _mm_load1_ps(const float *__p)
    116 {
    117 	return __extension__ (__m128)(__v4sf) { *__p, *__p, *__p, *__p };
    118 }
    119 
/*
 * Load 128 bits from __p, which need not be aligned.  On GCC the
 * aligned(1) type __m128i_u makes the plain dereference unaligned;
 * on Clang the packed/may_alias struct does the same job.
 */
_INTRINSATTR
static __inline __m128i
_mm_loadu_si128(const __m128i_u *__p)
{
#if defined(__GNUC__) && !defined(__clang__)
	return *__p;
#else
	return ((const struct { __m128i_u __v; } _PACKALIAS *)__p)->__v;
#endif
}
    130 
    131 _INTRINSATTR
    132 static __inline __m128i
    133 _mm_loadu_si32(const void *__p)
    134 {
    135 #if defined(__GNUC__) && !defined(__clang__)
    136 	int32_t __v = (*(__m32_u *)__p)[0];
    137 #else
    138 	int32_t __v = ((const struct { int32_t __v; } _PACKALIAS *)__p)->__v;
    139 #endif
    140 	return __extension__ (__m128i)(__v4si){ __v, 0, 0, 0 };
    141 }
    142 
    143 _INTRINSATTR
    144 static __inline __m128i
    145 _mm_loadu_si64(const void *__p)
    146 {
    147 #if defined(__GNUC__) && !defined(__clang__)
    148 	int64_t __v = (*(__m64_u *)__p)[0];
    149 #else
    150 	int64_t __v = ((const struct { int64_t __v; } _PACKALIAS *)__p)->__v;
    151 #endif
    152 	return __extension__ (__m128i)(__v2di){ __v, 0 };
    153 }
    154 
    155 _INTRINSATTR
    156 static __inline __m128i
    157 _mm_load_si128(const __m128i *__p)
    158 {
    159 	return *__p;
    160 }
    161 
/*
 * MOVHLPS: result = { __v1[2], __v1[3], __v0[2], __v0[3] }, i.e. the
 * high half of __v1 moves into the low half of the result and the
 * high half of __v0 is kept.
 */
_INTRINSATTR
static __inline __m128
_mm_movehl_ps(__m128 __v0, __m128 __v1)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128)__builtin_ia32_movhlps((__v4sf)__v0, (__v4sf)__v1);
#elif defined(__clang__)
	return __builtin_shufflevector((__v4sf)__v0, (__v4sf)__v1, 6,7,2,3);
#endif
}
    172 
/*
 * MOVLHPS: result = { __v0[0], __v0[1], __v1[0], __v1[1] }, i.e. the
 * low half of __v1 moves into the high half of the result and the
 * low half of __v0 is kept.
 */
_INTRINSATTR
static __inline __m128
_mm_movelh_ps(__m128 __v0, __m128 __v1)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128)__builtin_ia32_movlhps((__v4sf)__v0, (__v4sf)__v1);
#elif defined(__clang__)
	return __builtin_shufflevector((__v4sf)__v0, (__v4sf)__v1, 0,1,4,5);
#endif
}
    183 
    184 _INTRINSATTR
    185 static __inline __m128i
    186 _mm_set1_epi16(int16_t __v)
    187 {
    188 	return __extension__ (__m128i)(__v8hi){
    189 	    __v, __v, __v, __v, __v, __v, __v, __v
    190 	};
    191 }
    192 
    193 _INTRINSATTR
    194 static __inline __m128i
    195 _mm_set1_epi32(int32_t __v)
    196 {
    197 	return __extension__ (__m128i)(__v4si){ __v, __v, __v, __v };
    198 }
    199 
    200 _INTRINSATTR
    201 static __inline __m128i
    202 _mm_set1_epi64x(int64_t __v)
    203 {
    204 	return __extension__ (__m128i)(__v2di){ __v, __v };
    205 }
    206 
    207 _INTRINSATTR
    208 static __inline __m128i
    209 _mm_set_epi32(int32_t __v3, int32_t __v2, int32_t __v1, int32_t __v0)
    210 {
    211 	return __extension__ (__m128i)(__v4si){ __v0, __v1, __v2, __v3 };
    212 }
    213 
    214 _INTRINSATTR
    215 static __inline __m128i
    216 _mm_set_epi64x(int64_t __v1, int64_t __v0)
    217 {
    218 	return __extension__ (__m128i)(__v2di){ __v0, __v1 };
    219 }
    220 
    221 _INTRINSATTR
    222 static __inline __m128
    223 _mm_setzero_ps(void)
    224 {
    225 	return __extension__ (__m128){ 0, 0, 0, 0 };
    226 }
    227 
    228 _INTRINSATTR
    229 static __inline __m128i
    230 _mm_setzero_si128(void)
    231 {
    232 	return _mm_set1_epi64x(0);
    233 }
    234 
/*
 * PSHUFB (SSSE3): byte-wise table lookup.  Each result byte i is
 * __vtbl[__vidx[i] & 0xf], or zero if the top bit of __vidx[i] is
 * set.
 */
_INTRINSATTR _SSSE3_ATTR
static __inline __m128i
_mm_shuffle_epi8(__m128i __vtbl, __m128i __vidx)
{
	return (__m128i)__builtin_ia32_pshufb128((__v16qi)__vtbl,
	    (__v16qi)__vidx);
}
    242 
/*
 * PSHUFD: permute the four 32-bit lanes of v by the immediate m;
 * bits 2i+1:2i of m select the source lane for result lane i.
 * Macros because the shuffle control must be a compile-time
 * constant.
 */
#define	_mm_shuffle_epi32(v,m)						      \
	(__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(v), (int)(m))

/*
 * SHUFPS: low two result lanes come from x, high two from y, each
 * selected by two bits of m as above.
 */
#define	_mm_shuffle_ps(x,y,m)						      \
	(__m128)__builtin_ia32_shufps((__v4sf)(__m128)(x),		      \
	    (__v4sf)(__m128)(y), (int)(m))				      \
    249 
/* PSLLD: shift each 32-bit lane left by __bits, filling with zeros. */
_INTRINSATTR
static __inline __m128i
_mm_slli_epi32(__m128i __v, uint8_t __bits)
{
	return (__m128i)__builtin_ia32_pslldi128((__v4si)__v, (int)__bits);
}
    256 
/* PSLLQ: shift each 64-bit lane left by __bits, filling with zeros. */
_INTRINSATTR
static __inline __m128i
_mm_slli_epi64(__m128i __v, uint8_t __bits)
{
	return (__m128i)__builtin_ia32_psllqi128((__v2di)__v, (int)__bits);
}
    263 
/*
 * PSLLDQ: shift the whole 128-bit value left (toward the high lanes)
 * by `bytes' bytes, filling with zeros.  GCC's builtin counts bits
 * (hence 8*); Clang's _byteshift variant counts bytes.
 */
#if defined(__GNUC__) && !defined(__clang__)
#define	_mm_slli_si128(v,bytes)						      \
	(__m128i)__builtin_ia32_pslldqi128((__v2di)(__m128i)(v),	      \
	    8*(int)(bytes))
#elif defined(__clang__)
#define	_mm_slli_si128(v,bytes)						      \
	(__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(v),    \
	    (int)(bytes))
#endif
    273 
/* PSRLD: logical right shift of each 32-bit lane by __bits. */
_INTRINSATTR
static __inline __m128i
_mm_srli_epi32(__m128i __v, uint8_t __bits)
{
	return (__m128i)__builtin_ia32_psrldi128((__v4si)__v, (int)__bits);
}
    280 
/* PSRLQ: logical right shift of each 64-bit lane by __bits. */
_INTRINSATTR
static __inline __m128i
_mm_srli_epi64(__m128i __v, uint8_t __bits)
{
	return (__m128i)__builtin_ia32_psrlqi128((__v2di)__v, (int)__bits);
}
    287 
/*
 * PSRLDQ: shift the whole 128-bit value right (toward the low lanes)
 * by `bytes' bytes, filling with zeros.  GCC's builtin counts bits
 * (hence 8*); Clang's _byteshift variant counts bytes.
 *
 * Fixes vs. previous revision: the clang expansion ended in a stray
 * semicolon, which broke any use of _mm_srli_si128 as a
 * subexpression; and the GCC path now casts through (__v2di) like
 * _mm_slli_si128 does, matching the builtin's operand type.
 */
#if defined(__GNUC__) && !defined(__clang__)
#define	_mm_srli_si128(v,bytes)						      \
	(__m128i)__builtin_ia32_psrldqi128((__v2di)(__m128i)(v),	      \
	    8*(int)(bytes))
#elif defined(__clang__)
#define	_mm_srli_si128(v,bytes)						      \
	(__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(v),    \
	    (int)(bytes))
#endif
    296 
/*
 * Store 128 bits to __p, which need not be aligned; same unaligned
 * access technique as _mm_loadu_si128.
 */
_INTRINSATTR
static __inline void
_mm_storeu_si128(__m128i_u *__p, __m128i __v)
{
#if defined(__GNUC__) && !defined(__clang__)
	*__p = __v;
#else
	((struct { __m128i_u __v; } _PACKALIAS *)__p)->__v = __v;
#endif
}
    307 
/*
 * Store the low 32 bits of __v to __p, which need not be aligned.
 * GCC path: lane 0 is converted to the one-element vector __m32 and
 * written through the unaligned __m32_u type.
 */
_INTRINSATTR
static __inline void
_mm_storeu_si32(void *__p, __m128i __v)
{
#if defined(__GNUC__) && !defined(__clang__)
	*(__m32_u *)__p = (__m32)((__v4si)__v)[0];
#else
	((struct { int32_t __v; } _PACKALIAS *)__p)->__v = ((__v4si)__v)[0];
#endif
}
    318 
/*
 * Store the low 64 bits of __v to __p, which need not be aligned;
 * same technique as _mm_storeu_si32.
 */
_INTRINSATTR
static __inline void
_mm_storeu_si64(void *__p, __m128i __v)
{
#if defined(__GNUC__) && !defined(__clang__)
	*(__m64_u *)__p = (__m64)((__v2di)__v)[0];
#else
	((struct { int64_t __v; } _PACKALIAS *)__p)->__v = ((__v2di)__v)[0];
#endif
}
    329 
    330 _INTRINSATTR
    331 static __inline void
    332 _mm_store_si128(__m128i *__p, __m128i __v)
    333 {
    334 	*__p = __v;
    335 }
    336 
    337 _INTRINSATTR
    338 static __inline __m128i
    339 _mm_sub_epi64(__m128i __x, __m128i __y)
    340 {
    341 	return (__m128i)((__v2du)__x - (__v2du)__y);
    342 }
    343 
/*
 * PUNPCKHDQ: interleave the high 32-bit lanes of the two operands:
 * result = { __lo[2], __hi[2], __lo[3], __hi[3] }.
 */
_INTRINSATTR
static __inline __m128i
_mm_unpackhi_epi32(__m128i __lo, __m128i __hi)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128i)__builtin_ia32_punpckhdq128((__v4si)__lo,
	    (__v4si)__hi);
#elif defined(__clang__)
	return (__m128i)__builtin_shufflevector((__v4si)__lo, (__v4si)__hi,
	    2,6,3,7);
#endif
}
    356 
/*
 * PUNPCKLDQ: interleave the low 32-bit lanes of the two operands:
 * result = { __lo[0], __hi[0], __lo[1], __hi[1] }.
 */
_INTRINSATTR
static __inline __m128i
_mm_unpacklo_epi32(__m128i __lo, __m128i __hi)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128i)__builtin_ia32_punpckldq128((__v4si)__lo,
	    (__v4si)__hi);
#elif defined(__clang__)
	return (__m128i)__builtin_shufflevector((__v4si)__lo, (__v4si)__hi,
	    0,4,1,5);
#endif
}
    369 
/*
 * PUNPCKLQDQ: interleave the low 64-bit lanes of the two operands:
 * result = { __lo[0], __hi[0] }.
 */
_INTRINSATTR
static __inline __m128i
_mm_unpacklo_epi64(__m128i __lo, __m128i __hi)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128i)__builtin_ia32_punpcklqdq128((__v2di)__lo,
	    (__v2di)__hi);
#elif defined(__clang__)
	return (__m128i)__builtin_shufflevector((__v2di)__lo, (__v2di)__hi,
	    0,2);
#endif
}
    382 
    383 #endif	/* _SYS_CRYPTO_ARCH_X86_IMMINTRIN_H */
    384