/* Copyright (C) 2019-2022 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
23 1.1 mrg
24 1.1 mrg #ifndef _IMMINTRIN_H_INCLUDED
25 1.1 mrg #error "Never use <avx512bf16vlintrin.h> directly; include <immintrin.h> instead."
26 1.1 mrg #endif
27 1.1 mrg
28 1.1 mrg #ifndef _AVX512BF16VLINTRIN_H_INCLUDED
29 1.1 mrg #define _AVX512BF16VLINTRIN_H_INCLUDED
30 1.1 mrg
31 1.1 mrg #if !defined(__AVX512VL__) || !defined(__AVX512BF16__)
32 1.1 mrg #pragma GCC push_options
33 1.1 mrg #pragma GCC target("avx512bf16,avx512vl")
34 1.1 mrg #define __DISABLE_AVX512BF16VL__
35 1.1 mrg #endif /* __AVX512BF16__ */
36 1.1 mrg
37 1.1 mrg /* Internal data types for implementing the intrinsics. */
38 1.1 mrg typedef short __v16bh __attribute__ ((__vector_size__ (32)));
39 1.1 mrg typedef short __v8bh __attribute__ ((__vector_size__ (16)));
40 1.1 mrg
41 1.1 mrg /* The Intel API is flexible enough that we must allow aliasing with other
42 1.1 mrg vector types, and their scalar components. */
43 1.1 mrg typedef short __m256bh __attribute__ ((__vector_size__ (32), __may_alias__));
44 1.1 mrg typedef short __m128bh __attribute__ ((__vector_size__ (16), __may_alias__));
45 1.1 mrg
46 1.1.1.2 mrg typedef unsigned short __bfloat16;
47 1.1 mrg /* vcvtne2ps2bf16 */
48 1.1 mrg
49 1.1 mrg extern __inline __m256bh
50 1.1 mrg __attribute__((__gnu_inline__, __always_inline__, __artificial__))
51 1.1 mrg _mm256_cvtne2ps_pbh (__m256 __A, __m256 __B)
52 1.1 mrg {
53 1.1 mrg return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi(__A, __B);
54 1.1 mrg }
55 1.1 mrg
56 1.1 mrg extern __inline __m256bh
57 1.1 mrg __attribute__((__gnu_inline__, __always_inline__, __artificial__))
58 1.1 mrg _mm256_mask_cvtne2ps_pbh (__m256bh __A, __mmask16 __B, __m256 __C, __m256 __D)
59 1.1 mrg {
60 1.1 mrg return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi_mask(__C, __D, __A, __B);
61 1.1 mrg }
62 1.1 mrg
63 1.1 mrg extern __inline __m256bh
64 1.1 mrg __attribute__((__gnu_inline__, __always_inline__, __artificial__))
65 1.1 mrg _mm256_maskz_cvtne2ps_pbh (__mmask16 __A, __m256 __B, __m256 __C)
66 1.1 mrg {
67 1.1 mrg return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi_maskz(__B, __C, __A);
68 1.1 mrg }
69 1.1 mrg
70 1.1 mrg extern __inline __m128bh
71 1.1 mrg __attribute__((__gnu_inline__, __always_inline__, __artificial__))
72 1.1 mrg _mm_cvtne2ps_pbh (__m128 __A, __m128 __B)
73 1.1 mrg {
74 1.1 mrg return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi(__A, __B);
75 1.1 mrg }
76 1.1 mrg
77 1.1 mrg extern __inline __m128bh
78 1.1 mrg __attribute__((__gnu_inline__, __always_inline__, __artificial__))
79 1.1 mrg _mm_mask_cvtne2ps_pbh (__m128bh __A, __mmask8 __B, __m128 __C, __m128 __D)
80 1.1 mrg {
81 1.1 mrg return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi_mask(__C, __D, __A, __B);
82 1.1 mrg }
83 1.1 mrg
84 1.1 mrg extern __inline __m128bh
85 1.1 mrg __attribute__((__gnu_inline__, __always_inline__, __artificial__))
86 1.1 mrg _mm_maskz_cvtne2ps_pbh (__mmask8 __A, __m128 __B, __m128 __C)
87 1.1 mrg {
88 1.1 mrg return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi_maskz(__B, __C, __A);
89 1.1 mrg }
90 1.1 mrg
91 1.1 mrg /* vcvtneps2bf16 */
92 1.1 mrg
93 1.1 mrg extern __inline __m128bh
94 1.1 mrg __attribute__((__gnu_inline__, __always_inline__, __artificial__))
95 1.1 mrg _mm256_cvtneps_pbh (__m256 __A)
96 1.1 mrg {
97 1.1 mrg return (__m128bh)__builtin_ia32_cvtneps2bf16_v8sf(__A);
98 1.1 mrg }
99 1.1 mrg
100 1.1 mrg extern __inline __m128bh
101 1.1 mrg __attribute__((__gnu_inline__, __always_inline__, __artificial__))
102 1.1 mrg _mm256_mask_cvtneps_pbh (__m128bh __A, __mmask8 __B, __m256 __C)
103 1.1 mrg {
104 1.1 mrg return (__m128bh)__builtin_ia32_cvtneps2bf16_v8sf_mask(__C, __A, __B);
105 1.1 mrg }
106 1.1 mrg
107 1.1 mrg extern __inline __m128bh
108 1.1 mrg __attribute__((__gnu_inline__, __always_inline__, __artificial__))
109 1.1 mrg _mm256_maskz_cvtneps_pbh (__mmask8 __A, __m256 __B)
110 1.1 mrg {
111 1.1 mrg return (__m128bh)__builtin_ia32_cvtneps2bf16_v8sf_maskz(__B, __A);
112 1.1 mrg }
113 1.1 mrg
114 1.1 mrg extern __inline __m128bh
115 1.1 mrg __attribute__((__gnu_inline__, __always_inline__, __artificial__))
116 1.1 mrg _mm_cvtneps_pbh (__m128 __A)
117 1.1 mrg {
118 1.1 mrg return (__m128bh)__builtin_ia32_cvtneps2bf16_v4sf(__A);
119 1.1 mrg }
120 1.1 mrg
121 1.1 mrg extern __inline __m128bh
122 1.1 mrg __attribute__((__gnu_inline__, __always_inline__, __artificial__))
123 1.1 mrg _mm_mask_cvtneps_pbh (__m128bh __A, __mmask8 __B, __m128 __C)
124 1.1 mrg {
125 1.1 mrg return (__m128bh)__builtin_ia32_cvtneps2bf16_v4sf_mask(__C, __A, __B);
126 1.1 mrg }
127 1.1 mrg
128 1.1 mrg extern __inline __m128bh
129 1.1 mrg __attribute__((__gnu_inline__, __always_inline__, __artificial__))
130 1.1 mrg _mm_maskz_cvtneps_pbh (__mmask8 __A, __m128 __B)
131 1.1 mrg {
132 1.1 mrg return (__m128bh)__builtin_ia32_cvtneps2bf16_v4sf_maskz(__B, __A);
133 1.1 mrg }
134 1.1 mrg
135 1.1 mrg /* vdpbf16ps */
136 1.1 mrg
137 1.1 mrg extern __inline __m256
138 1.1 mrg __attribute__((__gnu_inline__, __always_inline__, __artificial__))
139 1.1 mrg _mm256_dpbf16_ps (__m256 __A, __m256bh __B, __m256bh __C)
140 1.1 mrg {
141 1.1 mrg return (__m256)__builtin_ia32_dpbf16ps_v8sf(__A, __B, __C);
142 1.1 mrg }
143 1.1 mrg
144 1.1 mrg extern __inline __m256
145 1.1 mrg __attribute__((__gnu_inline__, __always_inline__, __artificial__))
146 1.1 mrg _mm256_mask_dpbf16_ps (__m256 __A, __mmask8 __B, __m256bh __C, __m256bh __D)
147 1.1 mrg {
148 1.1 mrg return (__m256)__builtin_ia32_dpbf16ps_v8sf_mask(__A, __C, __D, __B);
149 1.1 mrg }
150 1.1 mrg
151 1.1 mrg extern __inline __m256
152 1.1 mrg __attribute__((__gnu_inline__, __always_inline__, __artificial__))
153 1.1 mrg _mm256_maskz_dpbf16_ps (__mmask8 __A, __m256 __B, __m256bh __C, __m256bh __D)
154 1.1 mrg {
155 1.1 mrg return (__m256)__builtin_ia32_dpbf16ps_v8sf_maskz(__B, __C, __D, __A);
156 1.1 mrg }
157 1.1 mrg
158 1.1 mrg extern __inline __m128
159 1.1 mrg __attribute__((__gnu_inline__, __always_inline__, __artificial__))
160 1.1 mrg _mm_dpbf16_ps (__m128 __A, __m128bh __B, __m128bh __C)
161 1.1 mrg {
162 1.1 mrg return (__m128)__builtin_ia32_dpbf16ps_v4sf(__A, __B, __C);
163 1.1 mrg }
164 1.1 mrg
165 1.1 mrg extern __inline __m128
166 1.1 mrg __attribute__((__gnu_inline__, __always_inline__, __artificial__))
167 1.1 mrg _mm_mask_dpbf16_ps (__m128 __A, __mmask8 __B, __m128bh __C, __m128bh __D)
168 1.1 mrg {
169 1.1 mrg return (__m128)__builtin_ia32_dpbf16ps_v4sf_mask(__A, __C, __D, __B);
170 1.1 mrg }
171 1.1 mrg
172 1.1 mrg extern __inline __m128
173 1.1 mrg __attribute__((__gnu_inline__, __always_inline__, __artificial__))
174 1.1 mrg _mm_maskz_dpbf16_ps (__mmask8 __A, __m128 __B, __m128bh __C, __m128bh __D)
175 1.1 mrg {
176 1.1 mrg return (__m128)__builtin_ia32_dpbf16ps_v4sf_maskz(__B, __C, __D, __A);
177 1.1 mrg }
178 1.1 mrg
179 1.1.1.2 mrg extern __inline __bfloat16
180 1.1.1.2 mrg __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
181 1.1.1.2 mrg _mm_cvtness_sbh (float __A)
182 1.1.1.2 mrg {
183 1.1.1.2 mrg __v4sf __V = {__A, 0, 0, 0};
184 1.1.1.2 mrg __v8hi __R = __builtin_ia32_cvtneps2bf16_v4sf_mask ((__v4sf)__V,
185 1.1.1.2 mrg (__v8hi)_mm_undefined_si128 (), (__mmask8)-1);
186 1.1.1.2 mrg return __R[0];
187 1.1.1.2 mrg }
188 1.1.1.2 mrg
189 1.1.1.2 mrg extern __inline __m128
190 1.1.1.2 mrg __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
191 1.1.1.2 mrg _mm_cvtpbh_ps (__m128bh __A)
192 1.1.1.2 mrg {
193 1.1.1.2 mrg return (__m128)_mm_castsi128_ps ((__m128i)_mm_slli_epi32 (
194 1.1.1.2 mrg (__m128i)_mm_cvtepi16_epi32 ((__m128i)__A), 16));
195 1.1.1.2 mrg }
196 1.1.1.2 mrg
197 1.1.1.2 mrg extern __inline __m256
198 1.1.1.2 mrg __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
199 1.1.1.2 mrg _mm256_cvtpbh_ps (__m128bh __A)
200 1.1.1.2 mrg {
201 1.1.1.2 mrg return (__m256)_mm256_castsi256_ps ((__m256i)_mm256_slli_epi32 (
202 1.1.1.2 mrg (__m256i)_mm256_cvtepi16_epi32 ((__m128i)__A), 16));
203 1.1.1.2 mrg }
204 1.1.1.2 mrg
205 1.1.1.2 mrg extern __inline __m128
206 1.1.1.2 mrg __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
207 1.1.1.2 mrg _mm_maskz_cvtpbh_ps (__mmask8 __U, __m128bh __A)
208 1.1.1.2 mrg {
209 1.1.1.2 mrg return (__m128)_mm_castsi128_ps ((__m128i)_mm_slli_epi32 (
210 1.1.1.2 mrg (__m128i)_mm_maskz_cvtepi16_epi32 (
211 1.1.1.2 mrg (__mmask8)__U, (__m128i)__A), 16));
212 1.1.1.2 mrg }
213 1.1.1.2 mrg
214 1.1.1.2 mrg extern __inline __m256
215 1.1.1.2 mrg __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
216 1.1.1.2 mrg _mm256_maskz_cvtpbh_ps (__mmask8 __U, __m128bh __A)
217 1.1.1.2 mrg {
218 1.1.1.2 mrg return (__m256)_mm256_castsi256_ps ((__m256i)_mm256_slli_epi32 (
219 1.1.1.2 mrg (__m256i)_mm256_maskz_cvtepi16_epi32 (
220 1.1.1.2 mrg (__mmask8)__U, (__m128i)__A), 16));
221 1.1.1.2 mrg }
222 1.1.1.2 mrg
223 1.1.1.2 mrg extern __inline __m128
224 1.1.1.2 mrg __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
225 1.1.1.2 mrg _mm_mask_cvtpbh_ps (__m128 __S, __mmask8 __U, __m128bh __A)
226 1.1.1.2 mrg {
227 1.1.1.2 mrg return (__m128)_mm_castsi128_ps ((__m128i)_mm_mask_slli_epi32 (
228 1.1.1.2 mrg (__m128i)__S, (__mmask8)__U, (__m128i)_mm_cvtepi16_epi32 (
229 1.1.1.2 mrg (__m128i)__A), 16));
230 1.1.1.2 mrg }
231 1.1.1.2 mrg
232 1.1.1.2 mrg extern __inline __m256
233 1.1.1.2 mrg __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
234 1.1.1.2 mrg _mm256_mask_cvtpbh_ps (__m256 __S, __mmask8 __U, __m128bh __A)
235 1.1.1.2 mrg {
236 1.1.1.2 mrg return (__m256)_mm256_castsi256_ps ((__m256i)_mm256_mask_slli_epi32 (
237 1.1.1.2 mrg (__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32 (
238 1.1.1.2 mrg (__m128i)__A), 16));
239 1.1.1.2 mrg }
240 1.1.1.2 mrg
241 1.1 mrg #ifdef __DISABLE_AVX512BF16VL__
242 1.1 mrg #undef __DISABLE_AVX512BF16VL__
243 1.1 mrg #pragma GCC pop_options
244 1.1 mrg #endif /* __DISABLE_AVX512BF16VL__ */
245 1.1 mrg
246 1.1 mrg #endif /* _AVX512BF16VLINTRIN_H_INCLUDED */
247