immintrin.h revision 1.4 1 1.4 riastrad /* $NetBSD: immintrin.h,v 1.4 2024/07/16 15:27:40 riastradh Exp $ */
2 1.1 rin
3 1.1 rin /*-
4 1.1 rin * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 1.1 rin * All rights reserved.
6 1.1 rin *
7 1.1 rin * Redistribution and use in source and binary forms, with or without
8 1.1 rin * modification, are permitted provided that the following conditions
9 1.1 rin * are met:
10 1.1 rin * 1. Redistributions of source code must retain the above copyright
11 1.1 rin * notice, this list of conditions and the following disclaimer.
12 1.1 rin * 2. Redistributions in binary form must reproduce the above copyright
13 1.1 rin * notice, this list of conditions and the following disclaimer in the
14 1.1 rin * documentation and/or other materials provided with the distribution.
15 1.1 rin *
16 1.1 rin * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 1.1 rin * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 1.1 rin * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 1.1 rin * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 1.1 rin * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 1.1 rin * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 1.1 rin * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 1.1 rin * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 1.1 rin * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 1.1 rin * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 1.1 rin * POSSIBILITY OF SUCH DAMAGE.
27 1.1 rin */
28 1.1 rin
29 1.1 rin #ifndef _SYS_CRYPTO_ARCH_X86_IMMINTRIN_H
30 1.1 rin #define _SYS_CRYPTO_ARCH_X86_IMMINTRIN_H
31 1.1 rin
32 1.1 rin #include <sys/types.h>
33 1.1 rin
/*
 * This kludgerous header file provides definitions for the Intel
 * intrinsics that work with GCC and Clang, because <immintrin.h> is
 * not available during the kernel build and arranging to make it
 * available is complicated.  Please fix this properly!
 */
40 1.1 rin
/*
 * Per-compiler vector types plus the attribute set (_INTRINSATTR) that
 * is applied to every intrinsic defined below.  Clang also defines
 * __GNUC__, hence the explicit !defined(__clang__) on the GCC branch.
 * Any compiler that is neither GCC nor Clang is rejected with #error.
 */
#if defined(__GNUC__) && !defined(__clang__)

/* GCC: always inline, GNU inline semantics, marked artificial. */
#define	_INTRINSATTR \
	__attribute__((__gnu_inline__, __always_inline__, __artificial__))

/*
 * 2-, 4- and 8-byte may-alias vectors.  The *_u variants are declared
 * with alignment 1 and are used by the unaligned load/store helpers.
 */
typedef short __m16 __attribute__((__vector_size__(2), __may_alias__));
typedef short __m16_u
    __attribute__((__vector_size__(2), __may_alias__, __aligned__(1)));
typedef int __m32 __attribute__((__vector_size__(4), __may_alias__));
typedef int __m32_u
    __attribute__((__vector_size__(4), __may_alias__, __aligned__(1)));
typedef int __m64 __attribute__((__vector_size__(8), __may_alias__));
typedef int __m64_u
    __attribute__((__vector_size__(8), __may_alias__, __aligned__(1)));

/* 16-byte public vector types; __m128i_u has alignment 1. */
typedef float __m128 __attribute__((__vector_size__(16), __may_alias__));
typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));

/* 16-byte views with a specific element type, used for casts. */
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef int __v4si __attribute__((__vector_size__(16)));
typedef unsigned __v4su __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));
typedef char __v16qi_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));

#elif defined(__clang__)

/* 16-byte public vector types, explicitly 16-byte aligned. */
typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i
    __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));

/* 16-byte views with a specific element type, used for casts. */
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef int __v4si __attribute__((__vector_size__(16)));
typedef unsigned __v4su __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

/* Clang: always inline, no debug info, SSE2 target, 128-bit vectors. */
#define	_INTRINSATTR \
	__attribute__((__always_inline__, __nodebug__, __target__("sse2"), \
	    __min_vector_width__(128)))
/*
 * Attributes for the anonymous one-member wrapper structs through
 * which the non-GCC unaligned load/store paths access memory.
 */
#define	_PACKALIAS \
	__attribute__((__packed__, __may_alias__))

#else

#error Please teach me how to do Intel intrinsics for your compiler!

#endif
95 1.1 rin
/*
 * Function attribute that enables SSSE3 code generation for a single
 * function.  Spelled __target__ (reserved namespace), like the
 * attributes in _INTRINSATTR, so a user macro named `target' cannot
 * interfere with it.
 */
#define	_SSSE3_ATTR	__attribute__((__target__("ssse3")))
97 1.1 rin
98 1.1 rin _INTRINSATTR
99 1.1 rin static __inline __m128i
100 1.1 rin _mm_add_epi32(__m128i __a, __m128i __b)
101 1.1 rin {
102 1.1 rin return (__m128i)((__v4su)__a + (__v4su)__b);
103 1.1 rin }
104 1.1 rin
/*
 * _mm_alignr_epi8(hi, lo, bytes): wrapper around
 * __builtin_ia32_palignr128.  The two compilers declare the builtin
 * differently: the GCC form is handed __v2di operands and 8*bytes,
 * the Clang form __v16qi operands and bytes unscaled.
 *
 * NOTE(review): this is a macro rather than an inline function,
 * presumably because the builtin wants a compile-time constant count
 * -- confirm against the compiler documentation.
 *
 * The whole expansion is parenthesized so that it behaves as a single
 * operand wherever it is used.
 */
#if defined(__GNUC__) && !defined(__clang__)
#define	_mm_alignr_epi8(hi,lo,bytes) \
	((__m128i)__builtin_ia32_palignr128((__v2di)(__m128i)(hi), \
	    (__v2di)(__m128i)(lo), 8*(int)(bytes)))
#elif defined(__clang__)
#define	_mm_alignr_epi8(hi,lo,bytes) \
	((__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(hi), \
	    (__v16qi)(__m128i)(lo), (int)(bytes)))
#endif
114 1.1 rin
115 1.1 rin _INTRINSATTR
116 1.1 rin static __inline __m128
117 1.1 rin _mm_load1_ps(const float *__p)
118 1.1 rin {
119 1.1 rin return __extension__ (__m128)(__v4sf) { *__p, *__p, *__p, *__p };
120 1.1 rin }
121 1.1 rin
122 1.1 rin _INTRINSATTR
123 1.1 rin static __inline __m128i
124 1.1 rin _mm_loadu_si128(const __m128i_u *__p)
125 1.1 rin {
126 1.2 riastrad #if defined(__GNUC__) && !defined(__clang__)
127 1.2 riastrad return *__p;
128 1.2 riastrad #else
129 1.1 rin return ((const struct { __m128i_u __v; } _PACKALIAS *)__p)->__v;
130 1.2 riastrad #endif
131 1.1 rin }
132 1.1 rin
133 1.1 rin _INTRINSATTR
134 1.1 rin static __inline __m128i
135 1.1 rin _mm_loadu_si32(const void *__p)
136 1.1 rin {
137 1.2 riastrad #if defined(__GNUC__) && !defined(__clang__)
138 1.3 riastrad int32_t __v = (*(const __m32_u *)__p)[0];
139 1.2 riastrad #else
140 1.1 rin int32_t __v = ((const struct { int32_t __v; } _PACKALIAS *)__p)->__v;
141 1.2 riastrad #endif
142 1.1 rin return __extension__ (__m128i)(__v4si){ __v, 0, 0, 0 };
143 1.1 rin }
144 1.1 rin
145 1.1 rin _INTRINSATTR
146 1.1 rin static __inline __m128i
147 1.1 rin _mm_loadu_si64(const void *__p)
148 1.1 rin {
149 1.2 riastrad #if defined(__GNUC__) && !defined(__clang__)
150 1.4 riastrad int64_t __v = (int64_t)*(const __m64_u *)__p;
151 1.2 riastrad #else
152 1.1 rin int64_t __v = ((const struct { int64_t __v; } _PACKALIAS *)__p)->__v;
153 1.2 riastrad #endif
154 1.1 rin return __extension__ (__m128i)(__v2di){ __v, 0 };
155 1.1 rin }
156 1.1 rin
157 1.1 rin _INTRINSATTR
158 1.1 rin static __inline __m128i
159 1.1 rin _mm_load_si128(const __m128i *__p)
160 1.1 rin {
161 1.1 rin return *__p;
162 1.1 rin }
163 1.1 rin
164 1.1 rin _INTRINSATTR
165 1.1 rin static __inline __m128
166 1.1 rin _mm_movehl_ps(__m128 __v0, __m128 __v1)
167 1.1 rin {
168 1.1 rin #if defined(__GNUC__) && !defined(__clang__)
169 1.1 rin return (__m128)__builtin_ia32_movhlps((__v4sf)__v0, (__v4sf)__v1);
170 1.1 rin #elif defined(__clang__)
171 1.1 rin return __builtin_shufflevector((__v4sf)__v0, (__v4sf)__v1, 6,7,2,3);
172 1.1 rin #endif
173 1.1 rin }
174 1.1 rin
175 1.1 rin _INTRINSATTR
176 1.1 rin static __inline __m128
177 1.1 rin _mm_movelh_ps(__m128 __v0, __m128 __v1)
178 1.1 rin {
179 1.1 rin #if defined(__GNUC__) && !defined(__clang__)
180 1.1 rin return (__m128)__builtin_ia32_movlhps((__v4sf)__v0, (__v4sf)__v1);
181 1.1 rin #elif defined(__clang__)
182 1.1 rin return __builtin_shufflevector((__v4sf)__v0, (__v4sf)__v1, 0,1,4,5);
183 1.1 rin #endif
184 1.1 rin }
185 1.1 rin
186 1.1 rin _INTRINSATTR
187 1.1 rin static __inline __m128i
188 1.1 rin _mm_set1_epi16(int16_t __v)
189 1.1 rin {
190 1.1 rin return __extension__ (__m128i)(__v8hi){
191 1.1 rin __v, __v, __v, __v, __v, __v, __v, __v
192 1.1 rin };
193 1.1 rin }
194 1.1 rin
195 1.1 rin _INTRINSATTR
196 1.1 rin static __inline __m128i
197 1.1 rin _mm_set1_epi32(int32_t __v)
198 1.1 rin {
199 1.1 rin return __extension__ (__m128i)(__v4si){ __v, __v, __v, __v };
200 1.1 rin }
201 1.1 rin
202 1.1 rin _INTRINSATTR
203 1.1 rin static __inline __m128i
204 1.1 rin _mm_set1_epi64x(int64_t __v)
205 1.1 rin {
206 1.1 rin return __extension__ (__m128i)(__v2di){ __v, __v };
207 1.1 rin }
208 1.1 rin
209 1.1 rin _INTRINSATTR
210 1.1 rin static __inline __m128i
211 1.1 rin _mm_set_epi32(int32_t __v3, int32_t __v2, int32_t __v1, int32_t __v0)
212 1.1 rin {
213 1.1 rin return __extension__ (__m128i)(__v4si){ __v0, __v1, __v2, __v3 };
214 1.1 rin }
215 1.1 rin
216 1.1 rin _INTRINSATTR
217 1.1 rin static __inline __m128i
218 1.1 rin _mm_set_epi64x(int64_t __v1, int64_t __v0)
219 1.1 rin {
220 1.1 rin return __extension__ (__m128i)(__v2di){ __v0, __v1 };
221 1.1 rin }
222 1.1 rin
223 1.1 rin _INTRINSATTR
224 1.1 rin static __inline __m128
225 1.1 rin _mm_setzero_ps(void)
226 1.1 rin {
227 1.1 rin return __extension__ (__m128){ 0, 0, 0, 0 };
228 1.1 rin }
229 1.1 rin
230 1.1 rin _INTRINSATTR
231 1.1 rin static __inline __m128i
232 1.1 rin _mm_setzero_si128(void)
233 1.1 rin {
234 1.1 rin return _mm_set1_epi64x(0);
235 1.1 rin }
236 1.1 rin
237 1.1 rin _INTRINSATTR _SSSE3_ATTR
238 1.1 rin static __inline __m128i
239 1.1 rin _mm_shuffle_epi8(__m128i __vtbl, __m128i __vidx)
240 1.1 rin {
241 1.1 rin return (__m128i)__builtin_ia32_pshufb128((__v16qi)__vtbl,
242 1.1 rin (__v16qi)__vidx);
243 1.1 rin }
244 1.1 rin
/*
 * _mm_shuffle_epi32(v, m): wrapper around __builtin_ia32_pshufd with v
 * viewed as a __v4si and m converted to int.  The whole expansion is
 * parenthesized so that it behaves as a single operand wherever it is
 * used.
 */
#define	_mm_shuffle_epi32(v,m) \
	((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(v), (int)(m)))
247 1.1 rin
/*
 * _mm_shuffle_ps(x, y, m): wrapper around __builtin_ia32_shufps with x
 * and y viewed as __v4sf and m converted to int.  The expansion is
 * fully parenthesized and no longer ends in a line continuation, so
 * the macro stops at its own last line.
 */
#define	_mm_shuffle_ps(x,y,m) \
	((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(x), \
	    (__v4sf)(__m128)(y), (int)(m)))

252 1.1 rin _INTRINSATTR
253 1.1 rin static __inline __m128i
254 1.1 rin _mm_slli_epi32(__m128i __v, uint8_t __bits)
255 1.1 rin {
256 1.1 rin return (__m128i)__builtin_ia32_pslldi128((__v4si)__v, (int)__bits);
257 1.1 rin }
258 1.1 rin
259 1.1 rin _INTRINSATTR
260 1.1 rin static __inline __m128i
261 1.1 rin _mm_slli_epi64(__m128i __v, uint8_t __bits)
262 1.1 rin {
263 1.1 rin return (__m128i)__builtin_ia32_psllqi128((__v2di)__v, (int)__bits);
264 1.1 rin }
265 1.1 rin
/*
 * _mm_slli_si128(v, bytes): left shift of the whole vector.  GCC's
 * __builtin_ia32_pslldqi128 is handed 8*bytes; Clang's
 * __builtin_ia32_pslldqi128_byteshift is handed bytes unscaled.  The
 * whole expansion is parenthesized so that it behaves as a single
 * operand wherever it is used.
 */
#if defined(__GNUC__) && !defined(__clang__)
#define	_mm_slli_si128(v,bytes) \
	((__m128i)__builtin_ia32_pslldqi128((__v2di)(__m128i)(v), \
	    8*(int)(bytes)))
#elif defined(__clang__)
#define	_mm_slli_si128(v,bytes) \
	((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(v), \
	    (int)(bytes)))
#endif
275 1.1 rin
276 1.1 rin _INTRINSATTR
277 1.1 rin static __inline __m128i
278 1.1 rin _mm_srli_epi32(__m128i __v, uint8_t __bits)
279 1.1 rin {
280 1.1 rin return (__m128i)__builtin_ia32_psrldi128((__v4si)__v, (int)__bits);
281 1.1 rin }
282 1.1 rin
283 1.1 rin _INTRINSATTR
284 1.1 rin static __inline __m128i
285 1.1 rin _mm_srli_epi64(__m128i __v, uint8_t __bits)
286 1.1 rin {
287 1.1 rin return (__m128i)__builtin_ia32_psrlqi128((__v2di)__v, (int)__bits);
288 1.1 rin }
289 1.1 rin
/*
 * _mm_srli_si128(v, bytes): right shift of the whole vector.  GCC's
 * __builtin_ia32_psrldqi128 is handed 8*bytes; Clang's
 * __builtin_ia32_psrldqi128_byteshift is handed bytes unscaled.
 *
 * The expansion must not end in a semicolon: the macro is an
 * expression and has to be usable as a function argument or operand.
 * It is fully parenthesized for the same reason.  The GCC variant
 * casts its operand to __v2di like _mm_slli_si128 does.
 */
#if defined(__GNUC__) && !defined(__clang__)
#define	_mm_srli_si128(v,bytes) \
	((__m128i)__builtin_ia32_psrldqi128((__v2di)(__m128i)(v), \
	    8*(int)(bytes)))
#elif defined(__clang__)
#define	_mm_srli_si128(v,bytes) \
	((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(v), \
	    (int)(bytes)))
#endif
298 1.1 rin
299 1.1 rin _INTRINSATTR
300 1.1 rin static __inline void
301 1.1 rin _mm_storeu_si128(__m128i_u *__p, __m128i __v)
302 1.1 rin {
303 1.2 riastrad #if defined(__GNUC__) && !defined(__clang__)
304 1.2 riastrad *__p = __v;
305 1.2 riastrad #else
306 1.1 rin ((struct { __m128i_u __v; } _PACKALIAS *)__p)->__v = __v;
307 1.2 riastrad #endif
308 1.1 rin }
309 1.1 rin
310 1.1 rin _INTRINSATTR
311 1.1 rin static __inline void
312 1.1 rin _mm_storeu_si32(void *__p, __m128i __v)
313 1.1 rin {
314 1.2 riastrad #if defined(__GNUC__) && !defined(__clang__)
315 1.2 riastrad *(__m32_u *)__p = (__m32)((__v4si)__v)[0];
316 1.2 riastrad #else
317 1.1 rin ((struct { int32_t __v; } _PACKALIAS *)__p)->__v = ((__v4si)__v)[0];
318 1.2 riastrad #endif
319 1.1 rin }
320 1.1 rin
321 1.1 rin _INTRINSATTR
322 1.1 rin static __inline void
323 1.1 rin _mm_storeu_si64(void *__p, __m128i __v)
324 1.1 rin {
325 1.2 riastrad #if defined(__GNUC__) && !defined(__clang__)
326 1.2 riastrad *(__m64_u *)__p = (__m64)((__v2di)__v)[0];
327 1.2 riastrad #else
328 1.1 rin ((struct { int64_t __v; } _PACKALIAS *)__p)->__v = ((__v2di)__v)[0];
329 1.2 riastrad #endif
330 1.1 rin }
331 1.1 rin
332 1.1 rin _INTRINSATTR
333 1.1 rin static __inline void
334 1.1 rin _mm_store_si128(__m128i *__p, __m128i __v)
335 1.1 rin {
336 1.1 rin *__p = __v;
337 1.1 rin }
338 1.1 rin
339 1.1 rin _INTRINSATTR
340 1.1 rin static __inline __m128i
341 1.1 rin _mm_sub_epi64(__m128i __x, __m128i __y)
342 1.1 rin {
343 1.1 rin return (__m128i)((__v2du)__x - (__v2du)__y);
344 1.1 rin }
345 1.1 rin
346 1.1 rin _INTRINSATTR
347 1.1 rin static __inline __m128i
348 1.1 rin _mm_unpackhi_epi32(__m128i __lo, __m128i __hi)
349 1.1 rin {
350 1.1 rin #if defined(__GNUC__) && !defined(__clang__)
351 1.1 rin return (__m128i)__builtin_ia32_punpckhdq128((__v4si)__lo,
352 1.1 rin (__v4si)__hi);
353 1.1 rin #elif defined(__clang__)
354 1.1 rin return (__m128i)__builtin_shufflevector((__v4si)__lo, (__v4si)__hi,
355 1.1 rin 2,6,3,7);
356 1.1 rin #endif
357 1.1 rin }
358 1.1 rin
359 1.1 rin _INTRINSATTR
360 1.1 rin static __inline __m128i
361 1.1 rin _mm_unpacklo_epi32(__m128i __lo, __m128i __hi)
362 1.1 rin {
363 1.1 rin #if defined(__GNUC__) && !defined(__clang__)
364 1.1 rin return (__m128i)__builtin_ia32_punpckldq128((__v4si)__lo,
365 1.1 rin (__v4si)__hi);
366 1.1 rin #elif defined(__clang__)
367 1.1 rin return (__m128i)__builtin_shufflevector((__v4si)__lo, (__v4si)__hi,
368 1.1 rin 0,4,1,5);
369 1.1 rin #endif
370 1.1 rin }
371 1.1 rin
372 1.1 rin _INTRINSATTR
373 1.1 rin static __inline __m128i
374 1.1 rin _mm_unpacklo_epi64(__m128i __lo, __m128i __hi)
375 1.1 rin {
376 1.1 rin #if defined(__GNUC__) && !defined(__clang__)
377 1.1 rin return (__m128i)__builtin_ia32_punpcklqdq128((__v2di)__lo,
378 1.1 rin (__v2di)__hi);
379 1.1 rin #elif defined(__clang__)
380 1.1 rin return (__m128i)__builtin_shufflevector((__v2di)__lo, (__v2di)__hi,
381 1.1 rin 0,2);
382 1.1 rin #endif
383 1.1 rin }
384 1.1 rin
385 1.1 rin #endif /* _SYS_CRYPTO_ARCH_X86_IMMINTRIN_H */
386