immintrin.h revision 1.2 1 1.2 riastrad /* $NetBSD: immintrin.h,v 1.2 2024/07/15 13:51:10 riastradh Exp $ */
2 1.1 rin
3 1.1 rin /*-
4 1.1 rin * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 1.1 rin * All rights reserved.
6 1.1 rin *
7 1.1 rin * Redistribution and use in source and binary forms, with or without
8 1.1 rin * modification, are permitted provided that the following conditions
9 1.1 rin * are met:
10 1.1 rin * 1. Redistributions of source code must retain the above copyright
11 1.1 rin * notice, this list of conditions and the following disclaimer.
12 1.1 rin * 2. Redistributions in binary form must reproduce the above copyright
13 1.1 rin * notice, this list of conditions and the following disclaimer in the
14 1.1 rin * documentation and/or other materials provided with the distribution.
15 1.1 rin *
16 1.1 rin * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 1.1 rin * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 1.1 rin * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 1.1 rin * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 1.1 rin * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 1.1 rin * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 1.1 rin * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 1.1 rin * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 1.1 rin * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 1.1 rin * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 1.1 rin * POSSIBILITY OF SUCH DAMAGE.
27 1.1 rin */
28 1.1 rin
29 1.1 rin #ifndef _SYS_CRYPTO_ARCH_X86_IMMINTRIN_H
30 1.1 rin #define _SYS_CRYPTO_ARCH_X86_IMMINTRIN_H
31 1.1 rin
32 1.1 rin #include <sys/types.h>
33 1.1 rin
34 1.1 rin /*
35 1.1 rin * This kludgerous header file provides definitions for the Intel
36 1.1 rin * intrinsics that work with GCC and Clang, because <immintrin.h> is
37 1.1 rin * not available during the kernel build and arranging to make it
38 1.1 rin * available is complicated. Please fix this properly!
39 1.1 rin */
40 1.1 rin
/*
 * Per-compiler setup.  Each supported compiler supplies _INTRINSATTR,
 * the attribute list placed on every intrinsic wrapper in this file,
 * together with the public vector types whose attributes differ
 * between compilers.  Types spelled identically for both compilers
 * follow the conditional.
 */
#if defined(__clang__)

#define	_INTRINSATTR \
	__attribute__((__always_inline__, __nodebug__, __target__("sse2"), \
	    __min_vector_width__(128)))

/* Attributes for the anonymous wrapper structs used by the loadu/storeu code. */
#define	_PACKALIAS \
	__attribute__((__packed__, __may_alias__))

typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i
    __attribute__((__vector_size__(16), __aligned__(16)));

#elif defined(__GNUC__)

#define	_INTRINSATTR \
	__attribute__((__gnu_inline__, __always_inline__, __artificial__))

/* 2-, 4- and 8-byte may-alias vectors, each with a byte-aligned _u twin. */
typedef short __m16 __attribute__((__vector_size__(2), __may_alias__));
typedef short __m16_u
    __attribute__((__vector_size__(2), __may_alias__, __aligned__(1)));
typedef int __m32 __attribute__((__vector_size__(4), __may_alias__));
typedef int __m32_u
    __attribute__((__vector_size__(4), __may_alias__, __aligned__(1)));
typedef int __m64 __attribute__((__vector_size__(8), __may_alias__));
typedef int __m64_u
    __attribute__((__vector_size__(8), __may_alias__, __aligned__(1)));

typedef float __m128 __attribute__((__vector_size__(16), __may_alias__));
typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));

#else

#error Please teach me how to do Intel intrinsics for your compiler!

#endif

/* Byte-aligned, may-alias 128-bit integer vector. */
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));

/* Element-typed 128-bit vectors used for the casts inside the wrappers. */
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef int __v4si __attribute__((__vector_size__(16)));
typedef unsigned __v4su __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));
93 1.1 rin
/*
 * Extra attribute for wrappers around SSSE3 builtins.  Spelled with the
 * reserved __target__ form, matching _INTRINSATTR, so that a macro
 * named `target' defined by an includer cannot interfere.
 */
#define	_SSSE3_ATTR	__attribute__((__target__("ssse3")))
95 1.1 rin
96 1.1 rin _INTRINSATTR
97 1.1 rin static __inline __m128i
98 1.1 rin _mm_add_epi32(__m128i __a, __m128i __b)
99 1.1 rin {
100 1.1 rin return (__m128i)((__v4su)__a + (__v4su)__b);
101 1.1 rin }
102 1.1 rin
/*
 * _mm_alignr_epi8(hi, lo, bytes)
 *
 *	Wrapper around __builtin_ia32_palignr128.  The GCC form of the
 *	builtin is handed the count scaled by 8, the Clang form the
 *	byte count itself.  The whole expansion is parenthesized so the
 *	macro can be used safely inside a larger expression.
 */
#if defined(__GNUC__) && !defined(__clang__)
#define	_mm_alignr_epi8(hi,lo,bytes) \
	((__m128i)__builtin_ia32_palignr128((__v2di)(__m128i)(hi), \
	    (__v2di)(__m128i)(lo), 8*(int)(bytes)))
#elif defined(__clang__)
#define	_mm_alignr_epi8(hi,lo,bytes) \
	((__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(hi), \
	    (__v16qi)(__m128i)(lo), (int)(bytes)))
#endif
112 1.1 rin
113 1.1 rin _INTRINSATTR
114 1.1 rin static __inline __m128
115 1.1 rin _mm_load1_ps(const float *__p)
116 1.1 rin {
117 1.1 rin return __extension__ (__m128)(__v4sf) { *__p, *__p, *__p, *__p };
118 1.1 rin }
119 1.1 rin
120 1.1 rin _INTRINSATTR
121 1.1 rin static __inline __m128i
122 1.1 rin _mm_loadu_si128(const __m128i_u *__p)
123 1.1 rin {
124 1.2 riastrad #if defined(__GNUC__) && !defined(__clang__)
125 1.2 riastrad return *__p;
126 1.2 riastrad #else
127 1.1 rin return ((const struct { __m128i_u __v; } _PACKALIAS *)__p)->__v;
128 1.2 riastrad #endif
129 1.1 rin }
130 1.1 rin
131 1.1 rin _INTRINSATTR
132 1.1 rin static __inline __m128i
133 1.1 rin _mm_loadu_si32(const void *__p)
134 1.1 rin {
135 1.2 riastrad #if defined(__GNUC__) && !defined(__clang__)
136 1.2 riastrad int32_t __v = (*(__m32_u *)__p)[0];
137 1.2 riastrad #else
138 1.1 rin int32_t __v = ((const struct { int32_t __v; } _PACKALIAS *)__p)->__v;
139 1.2 riastrad #endif
140 1.1 rin return __extension__ (__m128i)(__v4si){ __v, 0, 0, 0 };
141 1.1 rin }
142 1.1 rin
143 1.1 rin _INTRINSATTR
144 1.1 rin static __inline __m128i
145 1.1 rin _mm_loadu_si64(const void *__p)
146 1.1 rin {
147 1.2 riastrad #if defined(__GNUC__) && !defined(__clang__)
148 1.2 riastrad int64_t __v = (*(__m64_u *)__p)[0];
149 1.2 riastrad #else
150 1.1 rin int64_t __v = ((const struct { int64_t __v; } _PACKALIAS *)__p)->__v;
151 1.2 riastrad #endif
152 1.1 rin return __extension__ (__m128i)(__v2di){ __v, 0 };
153 1.1 rin }
154 1.1 rin
155 1.1 rin _INTRINSATTR
156 1.1 rin static __inline __m128i
157 1.1 rin _mm_load_si128(const __m128i *__p)
158 1.1 rin {
159 1.1 rin return *__p;
160 1.1 rin }
161 1.1 rin
162 1.1 rin _INTRINSATTR
163 1.1 rin static __inline __m128
164 1.1 rin _mm_movehl_ps(__m128 __v0, __m128 __v1)
165 1.1 rin {
166 1.1 rin #if defined(__GNUC__) && !defined(__clang__)
167 1.1 rin return (__m128)__builtin_ia32_movhlps((__v4sf)__v0, (__v4sf)__v1);
168 1.1 rin #elif defined(__clang__)
169 1.1 rin return __builtin_shufflevector((__v4sf)__v0, (__v4sf)__v1, 6,7,2,3);
170 1.1 rin #endif
171 1.1 rin }
172 1.1 rin
173 1.1 rin _INTRINSATTR
174 1.1 rin static __inline __m128
175 1.1 rin _mm_movelh_ps(__m128 __v0, __m128 __v1)
176 1.1 rin {
177 1.1 rin #if defined(__GNUC__) && !defined(__clang__)
178 1.1 rin return (__m128)__builtin_ia32_movlhps((__v4sf)__v0, (__v4sf)__v1);
179 1.1 rin #elif defined(__clang__)
180 1.1 rin return __builtin_shufflevector((__v4sf)__v0, (__v4sf)__v1, 0,1,4,5);
181 1.1 rin #endif
182 1.1 rin }
183 1.1 rin
184 1.1 rin _INTRINSATTR
185 1.1 rin static __inline __m128i
186 1.1 rin _mm_set1_epi16(int16_t __v)
187 1.1 rin {
188 1.1 rin return __extension__ (__m128i)(__v8hi){
189 1.1 rin __v, __v, __v, __v, __v, __v, __v, __v
190 1.1 rin };
191 1.1 rin }
192 1.1 rin
193 1.1 rin _INTRINSATTR
194 1.1 rin static __inline __m128i
195 1.1 rin _mm_set1_epi32(int32_t __v)
196 1.1 rin {
197 1.1 rin return __extension__ (__m128i)(__v4si){ __v, __v, __v, __v };
198 1.1 rin }
199 1.1 rin
200 1.1 rin _INTRINSATTR
201 1.1 rin static __inline __m128i
202 1.1 rin _mm_set1_epi64x(int64_t __v)
203 1.1 rin {
204 1.1 rin return __extension__ (__m128i)(__v2di){ __v, __v };
205 1.1 rin }
206 1.1 rin
207 1.1 rin _INTRINSATTR
208 1.1 rin static __inline __m128i
209 1.1 rin _mm_set_epi32(int32_t __v3, int32_t __v2, int32_t __v1, int32_t __v0)
210 1.1 rin {
211 1.1 rin return __extension__ (__m128i)(__v4si){ __v0, __v1, __v2, __v3 };
212 1.1 rin }
213 1.1 rin
214 1.1 rin _INTRINSATTR
215 1.1 rin static __inline __m128i
216 1.1 rin _mm_set_epi64x(int64_t __v1, int64_t __v0)
217 1.1 rin {
218 1.1 rin return __extension__ (__m128i)(__v2di){ __v0, __v1 };
219 1.1 rin }
220 1.1 rin
221 1.1 rin _INTRINSATTR
222 1.1 rin static __inline __m128
223 1.1 rin _mm_setzero_ps(void)
224 1.1 rin {
225 1.1 rin return __extension__ (__m128){ 0, 0, 0, 0 };
226 1.1 rin }
227 1.1 rin
228 1.1 rin _INTRINSATTR
229 1.1 rin static __inline __m128i
230 1.1 rin _mm_setzero_si128(void)
231 1.1 rin {
232 1.1 rin return _mm_set1_epi64x(0);
233 1.1 rin }
234 1.1 rin
235 1.1 rin _INTRINSATTR _SSSE3_ATTR
236 1.1 rin static __inline __m128i
237 1.1 rin _mm_shuffle_epi8(__m128i __vtbl, __m128i __vidx)
238 1.1 rin {
239 1.1 rin return (__m128i)__builtin_ia32_pshufb128((__v16qi)__vtbl,
240 1.1 rin (__v16qi)__vidx);
241 1.1 rin }
242 1.1 rin
/*
 * _mm_shuffle_epi32(v, m)
 *
 *	Wrapper around __builtin_ia32_pshufd with v viewed as four
 *	32-bit lanes and m passed as the int selector.  The expansion
 *	is fully parenthesized so it composes with surrounding
 *	operators.
 */
#define	_mm_shuffle_epi32(v,m) \
	((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(v), (int)(m)))
245 1.1 rin
/*
 * _mm_shuffle_ps(x, y, m)
 *
 *	Wrapper around __builtin_ia32_shufps with x and y viewed as
 *	four floats and m passed as the int selector.  The definition
 *	ends on its last line (no trailing line continuation), and the
 *	expansion is fully parenthesized.
 */
#define	_mm_shuffle_ps(x,y,m) \
	((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(x), \
	    (__v4sf)(__m128)(y), (int)(m)))
249 1.1 rin
250 1.1 rin _INTRINSATTR
251 1.1 rin static __inline __m128i
252 1.1 rin _mm_slli_epi32(__m128i __v, uint8_t __bits)
253 1.1 rin {
254 1.1 rin return (__m128i)__builtin_ia32_pslldi128((__v4si)__v, (int)__bits);
255 1.1 rin }
256 1.1 rin
257 1.1 rin _INTRINSATTR
258 1.1 rin static __inline __m128i
259 1.1 rin _mm_slli_epi64(__m128i __v, uint8_t __bits)
260 1.1 rin {
261 1.1 rin return (__m128i)__builtin_ia32_psllqi128((__v2di)__v, (int)__bits);
262 1.1 rin }
263 1.1 rin
/*
 * _mm_slli_si128(v, bytes)
 *
 *	Whole-register left shift.  GCC's __builtin_ia32_pslldqi128 is
 *	handed the count scaled by 8; Clang's
 *	__builtin_ia32_pslldqi128_byteshift takes the byte count
 *	directly.  The expansion is fully parenthesized so it composes
 *	with surrounding operators.
 */
#if defined(__GNUC__) && !defined(__clang__)
#define	_mm_slli_si128(v,bytes) \
	((__m128i)__builtin_ia32_pslldqi128((__v2di)(__m128i)(v), \
	    8*(int)(bytes)))
#elif defined(__clang__)
#define	_mm_slli_si128(v,bytes) \
	((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(v), \
	    (int)(bytes)))
#endif
273 1.1 rin
274 1.1 rin _INTRINSATTR
275 1.1 rin static __inline __m128i
276 1.1 rin _mm_srli_epi32(__m128i __v, uint8_t __bits)
277 1.1 rin {
278 1.1 rin return (__m128i)__builtin_ia32_psrldi128((__v4si)__v, (int)__bits);
279 1.1 rin }
280 1.1 rin
281 1.1 rin _INTRINSATTR
282 1.1 rin static __inline __m128i
283 1.1 rin _mm_srli_epi64(__m128i __v, uint8_t __bits)
284 1.1 rin {
285 1.1 rin return (__m128i)__builtin_ia32_psrlqi128((__v2di)__v, (int)__bits);
286 1.1 rin }
287 1.1 rin
/*
 * _mm_srli_si128(v, bytes)
 *
 *	Whole-register right shift.  GCC's __builtin_ia32_psrldqi128 is
 *	handed the count scaled by 8; Clang's
 *	__builtin_ia32_psrldqi128_byteshift takes the byte count
 *	directly.
 *
 *	Neither expansion ends in a semicolon, so the macro is usable
 *	as a sub-expression with either compiler, and both are fully
 *	parenthesized.  The operand is cast the same way as in
 *	_mm_slli_si128.
 */
#if defined(__GNUC__) && !defined(__clang__)
#define	_mm_srli_si128(v,bytes) \
	((__m128i)__builtin_ia32_psrldqi128((__v2di)(__m128i)(v), \
	    8*(int)(bytes)))
#elif defined(__clang__)
#define	_mm_srli_si128(v,bytes) \
	((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(v), \
	    (int)(bytes)))
#endif
296 1.1 rin
297 1.1 rin _INTRINSATTR
298 1.1 rin static __inline void
299 1.1 rin _mm_storeu_si128(__m128i_u *__p, __m128i __v)
300 1.1 rin {
301 1.2 riastrad #if defined(__GNUC__) && !defined(__clang__)
302 1.2 riastrad *__p = __v;
303 1.2 riastrad #else
304 1.1 rin ((struct { __m128i_u __v; } _PACKALIAS *)__p)->__v = __v;
305 1.2 riastrad #endif
306 1.1 rin }
307 1.1 rin
308 1.1 rin _INTRINSATTR
309 1.1 rin static __inline void
310 1.1 rin _mm_storeu_si32(void *__p, __m128i __v)
311 1.1 rin {
312 1.2 riastrad #if defined(__GNUC__) && !defined(__clang__)
313 1.2 riastrad *(__m32_u *)__p = (__m32)((__v4si)__v)[0];
314 1.2 riastrad #else
315 1.1 rin ((struct { int32_t __v; } _PACKALIAS *)__p)->__v = ((__v4si)__v)[0];
316 1.2 riastrad #endif
317 1.1 rin }
318 1.1 rin
319 1.1 rin _INTRINSATTR
320 1.1 rin static __inline void
321 1.1 rin _mm_storeu_si64(void *__p, __m128i __v)
322 1.1 rin {
323 1.2 riastrad #if defined(__GNUC__) && !defined(__clang__)
324 1.2 riastrad *(__m64_u *)__p = (__m64)((__v2di)__v)[0];
325 1.2 riastrad #else
326 1.1 rin ((struct { int64_t __v; } _PACKALIAS *)__p)->__v = ((__v2di)__v)[0];
327 1.2 riastrad #endif
328 1.1 rin }
329 1.1 rin
330 1.1 rin _INTRINSATTR
331 1.1 rin static __inline void
332 1.1 rin _mm_store_si128(__m128i *__p, __m128i __v)
333 1.1 rin {
334 1.1 rin *__p = __v;
335 1.1 rin }
336 1.1 rin
337 1.1 rin _INTRINSATTR
338 1.1 rin static __inline __m128i
339 1.1 rin _mm_sub_epi64(__m128i __x, __m128i __y)
340 1.1 rin {
341 1.1 rin return (__m128i)((__v2du)__x - (__v2du)__y);
342 1.1 rin }
343 1.1 rin
344 1.1 rin _INTRINSATTR
345 1.1 rin static __inline __m128i
346 1.1 rin _mm_unpackhi_epi32(__m128i __lo, __m128i __hi)
347 1.1 rin {
348 1.1 rin #if defined(__GNUC__) && !defined(__clang__)
349 1.1 rin return (__m128i)__builtin_ia32_punpckhdq128((__v4si)__lo,
350 1.1 rin (__v4si)__hi);
351 1.1 rin #elif defined(__clang__)
352 1.1 rin return (__m128i)__builtin_shufflevector((__v4si)__lo, (__v4si)__hi,
353 1.1 rin 2,6,3,7);
354 1.1 rin #endif
355 1.1 rin }
356 1.1 rin
357 1.1 rin _INTRINSATTR
358 1.1 rin static __inline __m128i
359 1.1 rin _mm_unpacklo_epi32(__m128i __lo, __m128i __hi)
360 1.1 rin {
361 1.1 rin #if defined(__GNUC__) && !defined(__clang__)
362 1.1 rin return (__m128i)__builtin_ia32_punpckldq128((__v4si)__lo,
363 1.1 rin (__v4si)__hi);
364 1.1 rin #elif defined(__clang__)
365 1.1 rin return (__m128i)__builtin_shufflevector((__v4si)__lo, (__v4si)__hi,
366 1.1 rin 0,4,1,5);
367 1.1 rin #endif
368 1.1 rin }
369 1.1 rin
370 1.1 rin _INTRINSATTR
371 1.1 rin static __inline __m128i
372 1.1 rin _mm_unpacklo_epi64(__m128i __lo, __m128i __hi)
373 1.1 rin {
374 1.1 rin #if defined(__GNUC__) && !defined(__clang__)
375 1.1 rin return (__m128i)__builtin_ia32_punpcklqdq128((__v2di)__lo,
376 1.1 rin (__v2di)__hi);
377 1.1 rin #elif defined(__clang__)
378 1.1 rin return (__m128i)__builtin_shufflevector((__v2di)__lo, (__v2di)__hi,
379 1.1 rin 0,2);
380 1.1 rin #endif
381 1.1 rin }
382 1.1 rin
383 1.1 rin #endif /* _SYS_CRYPTO_ARCH_X86_IMMINTRIN_H */
384