/*	$NetBSD: immintrin.h,v 1.2 2024/07/15 13:51:10 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef _SYS_CRYPTO_ARCH_X86_IMMINTRIN_H
#define _SYS_CRYPTO_ARCH_X86_IMMINTRIN_H

#include <sys/types.h>

/*
 * This kludgerous header file provides definitions for the Intel
 * intrinsics that work with GCC and Clang, because <immintrin.h> is
 * not available during the kernel build and arranging to make it
 * available is complicated.  Please fix this properly!
 */

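/*
 * Minimal usage sketch (illustrative only, not part of this header):
 * bump each 32-bit lane of a 16-byte block in memory, using only the
 * unaligned load/store and arithmetic intrinsics defined below.  The
 * function name here is hypothetical.
 *
 *	static void
 *	incr32(void *p)
 *	{
 *		__m128i v = _mm_loadu_si128(p);
 *
 *		v = _mm_add_epi32(v, _mm_set1_epi32(1));
 *		_mm_storeu_si128(p, v);
 *	}
 */
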
#if defined(__GNUC__) && !defined(__clang__)

#define	_INTRINSATTR \
	__attribute__((__gnu_inline__, __always_inline__, __artificial__))

typedef short __m16 __attribute__((__vector_size__(2), __may_alias__));
typedef short __m16_u
    __attribute__((__vector_size__(2), __may_alias__, __aligned__(1)));
typedef int __m32 __attribute__((__vector_size__(4), __may_alias__));
typedef int __m32_u
    __attribute__((__vector_size__(4), __may_alias__, __aligned__(1)));
typedef int __m64 __attribute__((__vector_size__(8), __may_alias__));
typedef int __m64_u
    __attribute__((__vector_size__(8), __may_alias__, __aligned__(1)));
typedef float __m128 __attribute__((__vector_size__(16), __may_alias__));
typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef int __v4si __attribute__((__vector_size__(16)));
typedef unsigned __v4su __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

#elif defined(__clang__)

typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i
    __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef int __v4si __attribute__((__vector_size__(16)));
typedef unsigned __v4su __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

#define	_INTRINSATTR \
	__attribute__((__always_inline__, __nodebug__, __target__("sse2"), \
	    __min_vector_width__(128)))
#define	_PACKALIAS \
	__attribute__((__packed__, __may_alias__))

#else

#error Please teach me how to do Intel intrinsics for your compiler!

#endif

#define	_SSSE3_ATTR	__attribute__((target("ssse3")))

_INTRINSATTR
static __inline __m128i
_mm_add_epi32(__m128i __a, __m128i __b)
{
	return (__m128i)((__v4su)__a + (__v4su)__b);
}

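/*
 * Note the asymmetry below: GCC's palignr builtin counts the shift in
 * bits (hence the 8*), while Clang's counts it in bytes.
 */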
#if defined(__GNUC__) && !defined(__clang__)
#define	_mm_alignr_epi8(hi,lo,bytes) \
	(__m128i)__builtin_ia32_palignr128((__v2di)(__m128i)(hi), \
	    (__v2di)(__m128i)(lo), 8*(int)(bytes))
#elif defined(__clang__)
#define	_mm_alignr_epi8(hi,lo,bytes) \
	(__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(hi), \
	    (__v16qi)(__m128i)(lo), (int)(bytes))
#endif

_INTRINSATTR
static __inline __m128
_mm_load1_ps(const float *__p)
{
	return __extension__ (__m128)(__v4sf) { *__p, *__p, *__p, *__p };
}

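/*
 * Unaligned loads: unlike _mm_load_si128 below, these do not require
 * the pointer to be 16-byte aligned.
 */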
_INTRINSATTR
static __inline __m128i
_mm_loadu_si128(const __m128i_u *__p)
{
#if defined(__GNUC__) && !defined(__clang__)
	return *__p;
#else
	return ((const struct { __m128i_u __v; } _PACKALIAS *)__p)->__v;
#endif
}

_INTRINSATTR
static __inline __m128i
_mm_loadu_si32(const void *__p)
{
#if defined(__GNUC__) && !defined(__clang__)
	int32_t __v = (*(const __m32_u *)__p)[0];
#else
	int32_t __v = ((const struct { int32_t __v; } _PACKALIAS *)__p)->__v;
#endif
	return __extension__ (__m128i)(__v4si){ __v, 0, 0, 0 };
}

_INTRINSATTR
static __inline __m128i
_mm_loadu_si64(const void *__p)
{
#if defined(__GNUC__) && !defined(__clang__)
	int64_t __v = (*(const __m64_u *)__p)[0];
#else
	int64_t __v = ((const struct { int64_t __v; } _PACKALIAS *)__p)->__v;
#endif
	return __extension__ (__m128i)(__v2di){ __v, 0 };
}

_INTRINSATTR
static __inline __m128i
_mm_load_si128(const __m128i *__p)
{
	return *__p;
}

_INTRINSATTR
static __inline __m128
_mm_movehl_ps(__m128 __v0, __m128 __v1)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128)__builtin_ia32_movhlps((__v4sf)__v0, (__v4sf)__v1);
#elif defined(__clang__)
	return __builtin_shufflevector((__v4sf)__v0, (__v4sf)__v1, 6,7,2,3);
#endif
}

_INTRINSATTR
static __inline __m128
_mm_movelh_ps(__m128 __v0, __m128 __v1)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128)__builtin_ia32_movlhps((__v4sf)__v0, (__v4sf)__v1);
#elif defined(__clang__)
	return __builtin_shufflevector((__v4sf)__v0, (__v4sf)__v1, 0,1,4,5);
#endif
}

_INTRINSATTR
static __inline __m128i
_mm_set1_epi16(int16_t __v)
{
	return __extension__ (__m128i)(__v8hi){
	    __v, __v, __v, __v, __v, __v, __v, __v
	};
}

_INTRINSATTR
static __inline __m128i
_mm_set1_epi32(int32_t __v)
{
	return __extension__ (__m128i)(__v4si){ __v, __v, __v, __v };
}

_INTRINSATTR
static __inline __m128i
_mm_set1_epi64x(int64_t __v)
{
	return __extension__ (__m128i)(__v2di){ __v, __v };
}

_INTRINSATTR
static __inline __m128i
_mm_set_epi32(int32_t __v3, int32_t __v2, int32_t __v1, int32_t __v0)
{
	return __extension__ (__m128i)(__v4si){ __v0, __v1, __v2, __v3 };
}

_INTRINSATTR
static __inline __m128i
_mm_set_epi64x(int64_t __v1, int64_t __v0)
{
	return __extension__ (__m128i)(__v2di){ __v0, __v1 };
}

_INTRINSATTR
static __inline __m128
_mm_setzero_ps(void)
{
	return __extension__ (__m128){ 0, 0, 0, 0 };
}

_INTRINSATTR
static __inline __m128i
_mm_setzero_si128(void)
{
	return _mm_set1_epi64x(0);
}

_INTRINSATTR _SSSE3_ATTR
static __inline __m128i
_mm_shuffle_epi8(__m128i __vtbl, __m128i __vidx)
{
	return (__m128i)__builtin_ia32_pshufb128((__v16qi)__vtbl,
	    (__v16qi)__vidx);
}

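/*
 * The shuffle immediate selects 32-bit lanes, two bits per output
 * lane, low lane first; e.g., _mm_shuffle_epi32(v, 0x4e) swaps the
 * two 64-bit halves of v.
 */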
#define	_mm_shuffle_epi32(v,m) \
	(__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(v), (int)(m))

#define	_mm_shuffle_ps(x,y,m) \
	(__m128)__builtin_ia32_shufps((__v4sf)(__m128)(x), \
	    (__v4sf)(__m128)(y), (int)(m))

_INTRINSATTR
static __inline __m128i
_mm_slli_epi32(__m128i __v, uint8_t __bits)
{
	return (__m128i)__builtin_ia32_pslldi128((__v4si)__v, (int)__bits);
}

_INTRINSATTR
static __inline __m128i
_mm_slli_epi64(__m128i __v, uint8_t __bits)
{
	return (__m128i)__builtin_ia32_psllqi128((__v2di)__v, (int)__bits);
}

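/*
 * Whole-register byte shifts: as with _mm_alignr_epi8 above, GCC's
 * builtins count the shift in bits (hence the 8*), while Clang's
 * _byteshift variants count it in bytes.
 */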
#if defined(__GNUC__) && !defined(__clang__)
#define	_mm_slli_si128(v,bytes) \
	(__m128i)__builtin_ia32_pslldqi128((__v2di)(__m128i)(v), \
	    8*(int)(bytes))
#elif defined(__clang__)
#define	_mm_slli_si128(v,bytes) \
	(__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(v), \
	    (int)(bytes))
#endif

_INTRINSATTR
static __inline __m128i
_mm_srli_epi32(__m128i __v, uint8_t __bits)
{
	return (__m128i)__builtin_ia32_psrldi128((__v4si)__v, (int)__bits);
}

_INTRINSATTR
static __inline __m128i
_mm_srli_epi64(__m128i __v, uint8_t __bits)
{
	return (__m128i)__builtin_ia32_psrlqi128((__v2di)__v, (int)__bits);
}

#if defined(__GNUC__) && !defined(__clang__)
#define	_mm_srli_si128(v,bytes) \
	(__m128i)__builtin_ia32_psrldqi128((__v2di)(__m128i)(v), \
	    8*(int)(bytes))
#elif defined(__clang__)
#define	_mm_srli_si128(v,bytes) \
	(__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(v), \
	    (int)(bytes))
#endif

_INTRINSATTR
static __inline void
_mm_storeu_si128(__m128i_u *__p, __m128i __v)
{
#if defined(__GNUC__) && !defined(__clang__)
	*__p = __v;
#else
	((struct { __m128i_u __v; } _PACKALIAS *)__p)->__v = __v;
#endif
}

_INTRINSATTR
static __inline void
_mm_storeu_si32(void *__p, __m128i __v)
{
#if defined(__GNUC__) && !defined(__clang__)
	*(__m32_u *)__p = (__m32)((__v4si)__v)[0];
#else
	((struct { int32_t __v; } _PACKALIAS *)__p)->__v = ((__v4si)__v)[0];
#endif
}

_INTRINSATTR
static __inline void
_mm_storeu_si64(void *__p, __m128i __v)
{
#if defined(__GNUC__) && !defined(__clang__)
	*(__m64_u *)__p = (__m64)((__v2di)__v)[0];
#else
	((struct { int64_t __v; } _PACKALIAS *)__p)->__v = ((__v2di)__v)[0];
#endif
}

_INTRINSATTR
static __inline void
_mm_store_si128(__m128i *__p, __m128i __v)
{
	*__p = __v;
}

_INTRINSATTR
static __inline __m128i
_mm_sub_epi64(__m128i __x, __m128i __y)
{
	return (__m128i)((__v2du)__x - (__v2du)__y);
}

_INTRINSATTR
static __inline __m128i
_mm_unpackhi_epi32(__m128i __lo, __m128i __hi)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128i)__builtin_ia32_punpckhdq128((__v4si)__lo,
	    (__v4si)__hi);
#elif defined(__clang__)
	return (__m128i)__builtin_shufflevector((__v4si)__lo, (__v4si)__hi,
	    2,6,3,7);
#endif
}

_INTRINSATTR
static __inline __m128i
_mm_unpacklo_epi32(__m128i __lo, __m128i __hi)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128i)__builtin_ia32_punpckldq128((__v4si)__lo,
	    (__v4si)__hi);
#elif defined(__clang__)
	return (__m128i)__builtin_shufflevector((__v4si)__lo, (__v4si)__hi,
	    0,4,1,5);
#endif
}

_INTRINSATTR
static __inline __m128i
_mm_unpacklo_epi64(__m128i __lo, __m128i __hi)
{
#if defined(__GNUC__) && !defined(__clang__)
	return (__m128i)__builtin_ia32_punpcklqdq128((__v2di)__lo,
	    (__v2di)__hi);
#elif defined(__clang__)
	return (__m128i)__builtin_shufflevector((__v2di)__lo, (__v2di)__hi,
	    0,2);
#endif
}

#endif	/* _SYS_CRYPTO_ARCH_X86_IMMINTRIN_H */