/*	$NetBSD: immintrin.h,v 1.1 2023/08/07 01:07:36 rin Exp $	*/
2 1.1 rin
3 1.1 rin /*-
4 1.1 rin * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 1.1 rin * All rights reserved.
6 1.1 rin *
7 1.1 rin * Redistribution and use in source and binary forms, with or without
8 1.1 rin * modification, are permitted provided that the following conditions
9 1.1 rin * are met:
10 1.1 rin * 1. Redistributions of source code must retain the above copyright
11 1.1 rin * notice, this list of conditions and the following disclaimer.
12 1.1 rin * 2. Redistributions in binary form must reproduce the above copyright
13 1.1 rin * notice, this list of conditions and the following disclaimer in the
14 1.1 rin * documentation and/or other materials provided with the distribution.
15 1.1 rin *
16 1.1 rin * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 1.1 rin * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 1.1 rin * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 1.1 rin * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 1.1 rin * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 1.1 rin * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 1.1 rin * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 1.1 rin * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 1.1 rin * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 1.1 rin * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 1.1 rin * POSSIBILITY OF SUCH DAMAGE.
27 1.1 rin */
28 1.1 rin
29 1.1 rin #ifndef _SYS_CRYPTO_ARCH_X86_IMMINTRIN_H
30 1.1 rin #define _SYS_CRYPTO_ARCH_X86_IMMINTRIN_H
31 1.1 rin
32 1.1 rin #include <sys/types.h>
33 1.1 rin
34 1.1 rin /*
35 1.1 rin * This kludgerous header file provides definitions for the Intel
36 1.1 rin * intrinsics that work with GCC and Clang, because <immintrin.h> is
37 1.1 rin * not available during the kernel build and arranging to make it
38 1.1 rin * available is complicated. Please fix this properly!
39 1.1 rin */
40 1.1 rin
#if defined(__GNUC__) && !defined(__clang__)

/*
 * GCC spelling: __gnu_inline__ + __always_inline__ + __artificial__
 * matches how GCC's own intrinsic headers declare these wrappers, so
 * they always inline and emit no out-of-line copies.
 */
#define _INTRINSATTR \
	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
/* GCC honours __may_alias__ on the typedefs, so no packed wrapper needed. */
#define _PACKALIAS

/* 128-bit SSE register types; __may_alias__ permits type-punning. */
typedef float __m128 __attribute__((__vector_size__(16), __may_alias__));
typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
/* Unaligned variant: __aligned__(1) allows misaligned loads/stores. */
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
/* Element-wise views of a 128-bit vector, named in the Intel style. */
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef int __v4si __attribute__((__vector_size__(16)));
typedef unsigned __v4su __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

#elif defined(__clang__)

typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i
    __attribute__((__vector_size__(16), __aligned__(16)));
/* Unaligned variant: __aligned__(1) allows misaligned loads/stores. */
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
/* Element-wise views of a 128-bit vector, named in the Intel style. */
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef int __v4si __attribute__((__vector_size__(16)));
typedef unsigned __v4su __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

/*
 * Clang spelling: __nodebug__ plus an explicit sse2 target and
 * __min_vector_width__, as in clang's own intrinsic headers.
 */
#define _INTRINSATTR \
	__attribute__((__always_inline__, __nodebug__, __target__("sse2"), \
		__min_vector_width__(128)))
/* Clang needs a packed/may_alias struct wrapper for unaligned access. */
#define _PACKALIAS \
	__attribute__((__packed__, __may_alias__))

#else

#error Please teach me how to do Intel intrinsics for your compiler!

#endif

/* Functions that use pshufb must be compiled with SSSE3 enabled. */
#define _SSSE3_ATTR __attribute__((target("ssse3")))
87 1.1 rin
88 1.1 rin _INTRINSATTR
89 1.1 rin static __inline __m128i
90 1.1 rin _mm_add_epi32(__m128i __a, __m128i __b)
91 1.1 rin {
92 1.1 rin return (__m128i)((__v4su)__a + (__v4su)__b);
93 1.1 rin }
94 1.1 rin
/*
 * _mm_alignr_epi8(hi, lo, bytes): concatenate hi:lo as a 256-bit
 * value and extract the 128 bits starting `bytes` bytes up from the
 * bottom (SSSE3 palignr).  GCC's builtin takes the shift in bits
 * (hence the 8* scaling) on __v2di operands; clang's takes it in
 * bytes on __v16qi operands.
 */
#if defined(__GNUC__) && !defined(__clang__)
#define _mm_alignr_epi8(hi,lo,bytes) \
	(__m128i)__builtin_ia32_palignr128((__v2di)(__m128i)(hi), \
	    (__v2di)(__m128i)(lo), 8*(int)(bytes))
#elif defined(__clang__)
#define _mm_alignr_epi8(hi,lo,bytes) \
	(__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(hi), \
	    (__v16qi)(__m128i)(lo), (int)(bytes))
#endif
104 1.1 rin
105 1.1 rin _INTRINSATTR
106 1.1 rin static __inline __m128
107 1.1 rin _mm_load1_ps(const float *__p)
108 1.1 rin {
109 1.1 rin return __extension__ (__m128)(__v4sf) { *__p, *__p, *__p, *__p };
110 1.1 rin }
111 1.1 rin
/*
 * _mm_loadu_si128(p): load 128 bits from p with no alignment
 * requirement.  The _PACKALIAS struct wrapper tells the compiler the
 * access may be misaligned and may alias other types.
 */
_INTRINSATTR
static __inline __m128i
_mm_loadu_si128(const __m128i_u *__p)
{
	return ((const struct { __m128i_u __v; } _PACKALIAS *)__p)->__v;
}
118 1.1 rin
/*
 * _mm_loadu_si32(p): load an unaligned 32-bit value into lane 0 and
 * zero the upper three lanes.
 */
_INTRINSATTR
static __inline __m128i
_mm_loadu_si32(const void *__p)
{
	int32_t __v = ((const struct { int32_t __v; } _PACKALIAS *)__p)->__v;
	return __extension__ (__m128i)(__v4si){ __v, 0, 0, 0 };
}
126 1.1 rin
/*
 * _mm_loadu_si64(p): load an unaligned 64-bit value into the low
 * half of the result and zero the high half.
 */
_INTRINSATTR
static __inline __m128i
_mm_loadu_si64(const void *__p)
{
	int64_t __v = ((const struct { int64_t __v; } _PACKALIAS *)__p)->__v;
	return __extension__ (__m128i)(__v2di){ __v, 0 };
}
134 1.1 rin
135 1.1 rin _INTRINSATTR
136 1.1 rin static __inline __m128i
137 1.1 rin _mm_load_si128(const __m128i *__p)
138 1.1 rin {
139 1.1 rin return *__p;
140 1.1 rin }
141 1.1 rin
/*
 * _mm_movehl_ps(v0, v1): result lanes are {v1[2], v1[3], v0[2],
 * v0[3]} -- the high half of v1 in the low half of the result, the
 * high half of v0 preserved in the high half (SSE movhlps).
 */
_INTRINSATTR
static __inline __m128
_mm_movehl_ps(__m128 __v0, __m128 __v1)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128)__builtin_ia32_movhlps((__v4sf)__v0, (__v4sf)__v1);
#elif defined(__clang__)
	/* Indices 0-3 select from __v0, 4-7 from __v1. */
	return __builtin_shufflevector((__v4sf)__v0, (__v4sf)__v1, 6,7,2,3);
#endif
}
152 1.1 rin
/*
 * _mm_movelh_ps(v0, v1): result lanes are {v0[0], v0[1], v1[0],
 * v1[1]} -- the low half of v0 stays put and the low half of v1
 * moves into the high half of the result (SSE movlhps).
 */
_INTRINSATTR
static __inline __m128
_mm_movelh_ps(__m128 __v0, __m128 __v1)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128)__builtin_ia32_movlhps((__v4sf)__v0, (__v4sf)__v1);
#elif defined(__clang__)
	/* Indices 0-3 select from __v0, 4-7 from __v1. */
	return __builtin_shufflevector((__v4sf)__v0, (__v4sf)__v1, 0,1,4,5);
#endif
}
163 1.1 rin
164 1.1 rin _INTRINSATTR
165 1.1 rin static __inline __m128i
166 1.1 rin _mm_set1_epi16(int16_t __v)
167 1.1 rin {
168 1.1 rin return __extension__ (__m128i)(__v8hi){
169 1.1 rin __v, __v, __v, __v, __v, __v, __v, __v
170 1.1 rin };
171 1.1 rin }
172 1.1 rin
173 1.1 rin _INTRINSATTR
174 1.1 rin static __inline __m128i
175 1.1 rin _mm_set1_epi32(int32_t __v)
176 1.1 rin {
177 1.1 rin return __extension__ (__m128i)(__v4si){ __v, __v, __v, __v };
178 1.1 rin }
179 1.1 rin
180 1.1 rin _INTRINSATTR
181 1.1 rin static __inline __m128i
182 1.1 rin _mm_set1_epi64x(int64_t __v)
183 1.1 rin {
184 1.1 rin return __extension__ (__m128i)(__v2di){ __v, __v };
185 1.1 rin }
186 1.1 rin
187 1.1 rin _INTRINSATTR
188 1.1 rin static __inline __m128i
189 1.1 rin _mm_set_epi32(int32_t __v3, int32_t __v2, int32_t __v1, int32_t __v0)
190 1.1 rin {
191 1.1 rin return __extension__ (__m128i)(__v4si){ __v0, __v1, __v2, __v3 };
192 1.1 rin }
193 1.1 rin
194 1.1 rin _INTRINSATTR
195 1.1 rin static __inline __m128i
196 1.1 rin _mm_set_epi64x(int64_t __v1, int64_t __v0)
197 1.1 rin {
198 1.1 rin return __extension__ (__m128i)(__v2di){ __v0, __v1 };
199 1.1 rin }
200 1.1 rin
201 1.1 rin _INTRINSATTR
202 1.1 rin static __inline __m128
203 1.1 rin _mm_setzero_ps(void)
204 1.1 rin {
205 1.1 rin return __extension__ (__m128){ 0, 0, 0, 0 };
206 1.1 rin }
207 1.1 rin
208 1.1 rin _INTRINSATTR
209 1.1 rin static __inline __m128i
210 1.1 rin _mm_setzero_si128(void)
211 1.1 rin {
212 1.1 rin return _mm_set1_epi64x(0);
213 1.1 rin }
214 1.1 rin
/*
 * _mm_shuffle_epi8(vtbl, vidx): byte-wise table lookup (SSSE3
 * pshufb): each result byte is selected from vtbl by the low bits of
 * the corresponding vidx byte (zeroed when the index's top bit is
 * set, per the instruction's definition).
 */
_INTRINSATTR _SSSE3_ATTR
static __inline __m128i
_mm_shuffle_epi8(__m128i __vtbl, __m128i __vidx)
{
	return (__m128i)__builtin_ia32_pshufb128((__v16qi)__vtbl,
	    (__v16qi)__vidx);
}
222 1.1 rin
/*
 * _mm_shuffle_epi32(v, m): permute the four 32-bit lanes of v
 * according to the 8-bit immediate m, two bits per destination lane
 * (SSE2 pshufd).
 */
#define _mm_shuffle_epi32(v,m) \
	(__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(v), (int)(m))

/*
 * _mm_shuffle_ps(x, y, m): low two result lanes are selected from x
 * and high two from y, per the 8-bit immediate m (SSE shufps).
 *
 * Note: no trailing backslash after the last line -- the original had
 * a stray line continuation that pulled the following source line
 * into the macro definition.
 */
#define _mm_shuffle_ps(x,y,m) \
	(__m128)__builtin_ia32_shufps((__v4sf)(__m128)(x), \
	    (__v4sf)(__m128)(y), (int)(m))
/*
 * _mm_slli_epi32(v, bits): shift each 32-bit lane of v left by
 * `bits`, filling with zeros (pslld).
 */
_INTRINSATTR
static __inline __m128i
_mm_slli_epi32(__m128i __v, uint8_t __bits)
{
	return (__m128i)__builtin_ia32_pslldi128((__v4si)__v, (int)__bits);
}
236 1.1 rin
/*
 * _mm_slli_epi64(v, bits): shift each 64-bit lane of v left by
 * `bits`, filling with zeros (psllq).
 */
_INTRINSATTR
static __inline __m128i
_mm_slli_epi64(__m128i __v, uint8_t __bits)
{
	return (__m128i)__builtin_ia32_psllqi128((__v2di)__v, (int)__bits);
}
243 1.1 rin
/*
 * _mm_slli_si128(v, bytes): shift the whole 128-bit value left by
 * `bytes` bytes, filling with zeros (pslldq).  GCC's builtin takes
 * the shift in bits (hence 8*bytes); clang's _byteshift builtin
 * takes it in bytes.
 */
#if defined(__GNUC__) && !defined(__clang__)
#define _mm_slli_si128(v,bytes) \
	(__m128i)__builtin_ia32_pslldqi128((__v2di)(__m128i)(v), \
	    8*(int)(bytes))
#elif defined(__clang__)
#define _mm_slli_si128(v,bytes) \
	(__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(v), \
	    (int)(bytes))
#endif
253 1.1 rin
/*
 * _mm_srli_epi32(v, bits): logical right shift of each 32-bit lane
 * of v by `bits` (psrld).
 */
_INTRINSATTR
static __inline __m128i
_mm_srli_epi32(__m128i __v, uint8_t __bits)
{
	return (__m128i)__builtin_ia32_psrldi128((__v4si)__v, (int)__bits);
}
260 1.1 rin
/*
 * _mm_srli_epi64(v, bits): logical right shift of each 64-bit lane
 * of v by `bits` (psrlq).
 */
_INTRINSATTR
static __inline __m128i
_mm_srli_epi64(__m128i __v, uint8_t __bits)
{
	return (__m128i)__builtin_ia32_psrlqi128((__v2di)__v, (int)__bits);
}
267 1.1 rin
/*
 * _mm_srli_si128(v, bytes): shift the whole 128-bit value right by
 * `bytes` bytes, filling with zeros (psrldq).  GCC's builtin takes
 * the shift in bits (hence 8*bytes); clang's _byteshift builtin
 * takes it in bytes.
 *
 * Fixes relative to the previous revision: the clang branch ended in
 * a semicolon, which broke any use of the macro inside an expression
 * (and produced a double semicolon in statement position); the GCC
 * branch now casts the operand to __v2di for consistency with
 * _mm_slli_si128 and with GCC's own <emmintrin.h>.
 */
#if defined(__GNUC__) && !defined(__clang__)
#define _mm_srli_si128(v,bytes) \
	(__m128i)__builtin_ia32_psrldqi128((__v2di)(__m128i)(v), \
	    8*(int)(bytes))
#elif defined(__clang__)
#define _mm_srli_si128(v,bytes) \
	(__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(v), \
	    (int)(bytes))
#endif
276 1.1 rin
/*
 * _mm_storeu_si128(p, v): store 128 bits to p with no alignment
 * requirement; the _PACKALIAS struct wrapper permits the misaligned,
 * aliasing access.
 */
_INTRINSATTR
static __inline void
_mm_storeu_si128(__m128i_u *__p, __m128i __v)
{
	((struct { __m128i_u __v; } _PACKALIAS *)__p)->__v = __v;
}
283 1.1 rin
/*
 * _mm_storeu_si32(p, v): store the low 32-bit lane of v to the
 * (possibly unaligned) address p.
 */
_INTRINSATTR
static __inline void
_mm_storeu_si32(void *__p, __m128i __v)
{
	((struct { int32_t __v; } _PACKALIAS *)__p)->__v = ((__v4si)__v)[0];
}
290 1.1 rin
/*
 * _mm_storeu_si64(p, v): store the low 64-bit lane of v to the
 * (possibly unaligned) address p.
 */
_INTRINSATTR
static __inline void
_mm_storeu_si64(void *__p, __m128i __v)
{
	((struct { int64_t __v; } _PACKALIAS *)__p)->__v = ((__v2di)__v)[0];
}
297 1.1 rin
298 1.1 rin _INTRINSATTR
299 1.1 rin static __inline void
300 1.1 rin _mm_store_si128(__m128i *__p, __m128i __v)
301 1.1 rin {
302 1.1 rin *__p = __v;
303 1.1 rin }
304 1.1 rin
305 1.1 rin _INTRINSATTR
306 1.1 rin static __inline __m128i
307 1.1 rin _mm_sub_epi64(__m128i __x, __m128i __y)
308 1.1 rin {
309 1.1 rin return (__m128i)((__v2du)__x - (__v2du)__y);
310 1.1 rin }
311 1.1 rin
/*
 * _mm_unpackhi_epi32(lo, hi): interleave the high halves of the two
 * vectors: result is {lo[2], hi[2], lo[3], hi[3]} (punpckhdq).
 */
_INTRINSATTR
static __inline __m128i
_mm_unpackhi_epi32(__m128i __lo, __m128i __hi)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128i)__builtin_ia32_punpckhdq128((__v4si)__lo,
	    (__v4si)__hi);
#elif defined(__clang__)
	/* Indices 0-3 select from __lo, 4-7 from __hi. */
	return (__m128i)__builtin_shufflevector((__v4si)__lo, (__v4si)__hi,
	    2,6,3,7);
#endif
}
324 1.1 rin
/*
 * _mm_unpacklo_epi32(lo, hi): interleave the low halves of the two
 * vectors: result is {lo[0], hi[0], lo[1], hi[1]} (punpckldq).
 */
_INTRINSATTR
static __inline __m128i
_mm_unpacklo_epi32(__m128i __lo, __m128i __hi)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128i)__builtin_ia32_punpckldq128((__v4si)__lo,
	    (__v4si)__hi);
#elif defined(__clang__)
	/* Indices 0-3 select from __lo, 4-7 from __hi. */
	return (__m128i)__builtin_shufflevector((__v4si)__lo, (__v4si)__hi,
	    0,4,1,5);
#endif
}
337 1.1 rin
/*
 * _mm_unpacklo_epi64(lo, hi): result is {lo[0], hi[0]} -- the low
 * 64-bit lanes of the two operands side by side (punpcklqdq).
 */
_INTRINSATTR
static __inline __m128i
_mm_unpacklo_epi64(__m128i __lo, __m128i __hi)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128i)__builtin_ia32_punpcklqdq128((__v2di)__lo,
	    (__v2di)__hi);
#elif defined(__clang__)
	/* Index 0-1 selects from __lo, 2-3 from __hi. */
	return (__m128i)__builtin_shufflevector((__v2di)__lo, (__v2di)__hi,
	    0,2);
#endif
}
350 1.1 rin
351 1.1 rin #endif /* _SYS_CRYPTO_ARCH_X86_IMMINTRIN_H */
352