/* Copyright (C) 2019-2022 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* 128-bit and 256-bit intrinsics built on the vcvtne2ps2bf16,
   vcvtneps2bf16 and vdpbf16ps builtins, plus helpers that widen bf16
   elements back to single precision with integer extend-and-shift.  */

#ifndef _IMMINTRIN_H_INCLUDED
#error "Never use <avx512bf16vlintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef _AVX512BF16VLINTRIN_H_INCLUDED
#define _AVX512BF16VLINTRIN_H_INCLUDED

/* If either required ISA extension is not already enabled, enable both
   for the duration of this header and remember to restore the caller's
   options at the end.  */
#if !defined(__AVX512VL__) || !defined(__AVX512BF16__)
#pragma GCC push_options
#pragma GCC target("avx512bf16,avx512vl")
#define __DISABLE_AVX512BF16VL__
#endif /* !__AVX512VL__ || !__AVX512BF16__ */

/* Internal data types for implementing the intrinsics.  */
typedef short __v16bh __attribute__ ((__vector_size__ (32)));
typedef short __v8bh __attribute__ ((__vector_size__ (16)));

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef short __m256bh __attribute__ ((__vector_size__ (32), __may_alias__));
typedef short __m128bh __attribute__ ((__vector_size__ (16), __may_alias__));

/* Scalar container for one bf16 value; only the raw 16 bits are held.  */
typedef unsigned short __bfloat16;

/* vcvtne2ps2bf16 */

/* Each wrapper below forwards to the builtin of the matching width.
   In the _mask forms the builtin receives the two float sources first,
   then __A (the bf16 vector operand) and finally the mask __B; in the
   _maskz forms it receives the two float sources followed by the mask.  */

extern __inline __m256bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtne2ps_pbh (__m256 __A, __m256 __B)
{
  return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi(__A, __B);
}

extern __inline __m256bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cvtne2ps_pbh (__m256bh __A, __mmask16 __B, __m256 __C, __m256 __D)
{
  return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi_mask(__C, __D, __A, __B);
}

extern __inline __m256bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_cvtne2ps_pbh (__mmask16 __A, __m256 __B, __m256 __C)
{
  return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi_maskz(__B, __C, __A);
}

extern __inline __m128bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtne2ps_pbh (__m128 __A, __m128 __B)
{
  return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi(__A, __B);
}

extern __inline __m128bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cvtne2ps_pbh (__m128bh __A, __mmask8 __B, __m128 __C, __m128 __D)
{
  return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi_mask(__C, __D, __A, __B);
}

extern __inline __m128bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_cvtne2ps_pbh (__mmask8 __A, __m128 __B, __m128 __C)
{
  return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi_maskz(__B, __C, __A);
}

/* vcvtneps2bf16 */

/* Single-source conversions.  Both the 256-bit and the 128-bit source
   variants return a 128-bit __m128bh.  The _mask forms pass the float
   source, then the bf16 vector operand __A, then the mask __B.  */

extern __inline __m128bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtneps_pbh (__m256 __A)
{
  return (__m128bh)__builtin_ia32_cvtneps2bf16_v8sf(__A);
}

extern __inline __m128bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cvtneps_pbh (__m128bh __A, __mmask8 __B, __m256 __C)
{
  return (__m128bh)__builtin_ia32_cvtneps2bf16_v8sf_mask(__C, __A, __B);
}

extern __inline __m128bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_cvtneps_pbh (__mmask8 __A, __m256 __B)
{
  return (__m128bh)__builtin_ia32_cvtneps2bf16_v8sf_maskz(__B, __A);
}

extern __inline __m128bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtneps_pbh (__m128 __A)
{
  return (__m128bh)__builtin_ia32_cvtneps2bf16_v4sf(__A);
}

extern __inline __m128bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cvtneps_pbh (__m128bh __A, __mmask8 __B, __m128 __C)
{
  return (__m128bh)__builtin_ia32_cvtneps2bf16_v4sf_mask(__C, __A, __B);
}

extern __inline __m128bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_cvtneps_pbh (__mmask8 __A, __m128 __B)
{
  return (__m128bh)__builtin_ia32_cvtneps2bf16_v4sf_maskz(__B, __A);
}

/* vdpbf16ps */

/* Dot-product wrappers.  The float vector operand is always passed to
   the builtin first, followed by the two bf16 vectors; the masked
   builtins take the mask as their last argument.  */

extern __inline __m256
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_dpbf16_ps (__m256 __A, __m256bh __B, __m256bh __C)
{
  return (__m256)__builtin_ia32_dpbf16ps_v8sf(__A, __B, __C);
}

extern __inline __m256
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_dpbf16_ps (__m256 __A, __mmask8 __B, __m256bh __C, __m256bh __D)
{
  return (__m256)__builtin_ia32_dpbf16ps_v8sf_mask(__A, __C, __D, __B);
}

extern __inline __m256
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_dpbf16_ps (__mmask8 __A, __m256 __B, __m256bh __C, __m256bh __D)
{
  return (__m256)__builtin_ia32_dpbf16ps_v8sf_maskz(__B, __C, __D, __A);
}

extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_dpbf16_ps (__m128 __A, __m128bh __B, __m128bh __C)
{
  return (__m128)__builtin_ia32_dpbf16ps_v4sf(__A, __B, __C);
}

extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_dpbf16_ps (__m128 __A, __mmask8 __B, __m128bh __C, __m128bh __D)
{
  return (__m128)__builtin_ia32_dpbf16ps_v4sf_mask(__A, __C, __D, __B);
}

extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_dpbf16_ps (__mmask8 __A, __m128 __B, __m128bh __C, __m128bh __D)
{
  return (__m128)__builtin_ia32_dpbf16ps_v4sf_maskz(__B, __C, __D, __A);
}

/* Convert the scalar float __A to a __bfloat16.  __A is placed in
   element 0 of a four-float vector (the other elements are zero), the
   vector is run through the masked vcvtneps2bf16 builtin with every
   mask bit set and an undefined merge source, and element 0 of the
   result is returned.  */
extern __inline __bfloat16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtness_sbh (float __A)
{
  __v4sf __V = {__A, 0, 0, 0};
  __v8hi __R = __builtin_ia32_cvtneps2bf16_v4sf_mask ((__v4sf)__V,
	       (__v8hi)_mm_undefined_si128 (), (__mmask8)-1);
  return __R[0];
}

/* bf16 -> float widening.  These use integer operations only: each
   16-bit element is extended to 32 bits and shifted left by 16, which
   moves the bf16 bits into the upper half of the 32-bit lane (the
   shift discards whatever the extension put there), and the result is
   reinterpreted as packed floats.  */

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpbh_ps (__m128bh __A)
{
  return (__m128)_mm_castsi128_ps ((__m128i)_mm_slli_epi32 (
	 (__m128i)_mm_cvtepi16_epi32 ((__m128i)__A), 16));
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpbh_ps (__m128bh __A)
{
  return (__m256)_mm256_castsi256_ps ((__m256i)_mm256_slli_epi32 (
	 (__m256i)_mm256_cvtepi16_epi32 ((__m128i)__A), 16));
}

/* Zero-masking forms: the mask __U is applied at the extension step,
   before the shift.  */

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_cvtpbh_ps (__mmask8 __U, __m128bh __A)
{
  return (__m128)_mm_castsi128_ps ((__m128i)_mm_slli_epi32 (
	 (__m128i)_mm_maskz_cvtepi16_epi32 (
	 (__mmask8)__U, (__m128i)__A), 16));
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_cvtpbh_ps (__mmask8 __U, __m128bh __A)
{
  return (__m256)_mm256_castsi256_ps ((__m256i)_mm256_slli_epi32 (
	 (__m256i)_mm256_maskz_cvtepi16_epi32 (
	 (__mmask8)__U, (__m128i)__A), 16));
}

/* Merge-masking forms: the extension is unmasked and the mask __U is
   applied at the shift step, with __S (reinterpreted as integers) as
   the merge source.  */

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cvtpbh_ps (__m128 __S, __mmask8 __U, __m128bh __A)
{
  return (__m128)_mm_castsi128_ps ((__m128i)_mm_mask_slli_epi32 (
	 (__m128i)__S, (__mmask8)__U, (__m128i)_mm_cvtepi16_epi32 (
	 (__m128i)__A), 16));
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cvtpbh_ps (__m256 __S, __mmask8 __U, __m128bh __A)
{
  return (__m256)_mm256_castsi256_ps ((__m256i)_mm256_mask_slli_epi32 (
	 (__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32 (
	 (__m128i)__A), 16));
}

#ifdef __DISABLE_AVX512BF16VL__
#undef __DISABLE_AVX512BF16VL__
#pragma GCC pop_options
#endif /* __DISABLE_AVX512BF16VL__ */

#endif /* _AVX512BF16VLINTRIN_H_INCLUDED */