1 1.1 joerg /*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------=== 2 1.1 joerg * 3 1.1 joerg * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 1.1 joerg * See https://llvm.org/LICENSE.txt for license information. 5 1.1 joerg * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 1.1 joerg * 7 1.1 joerg *===------------------------------------------------------------------------=== 8 1.1 joerg */ 9 1.1 joerg 10 1.1 joerg #ifndef __IMMINTRIN_H 11 1.1 joerg #error "Never use <amxintrin.h> directly; include <immintrin.h> instead." 12 1.1 joerg #endif /* __IMMINTRIN_H */ 13 1.1 joerg 14 1.1 joerg #ifndef __AMXINTRIN_H 15 1.1 joerg #define __AMXINTRIN_H 16 1.1 joerg #ifdef __x86_64__ 17 1.1 joerg 18 1.1 joerg /* Define the default attributes for the functions in this file. */ 19 1.1 joerg #define __DEFAULT_FN_ATTRS_TILE \ 20 1.1 joerg __attribute__((__always_inline__, __nodebug__, __target__("amx-tile"))) 21 1.1 joerg #define __DEFAULT_FN_ATTRS_INT8 \ 22 1.1 joerg __attribute__((__always_inline__, __nodebug__, __target__("amx-int8"))) 23 1.1 joerg #define __DEFAULT_FN_ATTRS_BF16 \ 24 1.1 joerg __attribute__((__always_inline__, __nodebug__, __target__("amx-bf16"))) 25 1.1 joerg 26 1.1 joerg /// Load tile configuration from a 64-byte memory location specified by 27 1.1 joerg /// "mem_addr". The tile configuration includes the tile type palette, the 28 1.1 joerg /// number of bytes per row, and the number of rows. If the specified 29 1.1 joerg /// palette_id is zero, that signifies the init state for both the tile 30 1.1 joerg /// config and the tile data, and the tiles are zeroed. Any invalid 31 1.1 joerg /// configurations will result in #GP fault. 32 1.1 joerg /// 33 1.1 joerg /// \headerfile <immintrin.h> 34 1.1 joerg /// 35 1.1 joerg /// This intrinsic corresponds to the <c> LDTILECFG </c> instruction. 36 1.1 joerg /// 37 1.1 joerg /// \param __config 38 1.1 joerg /// A pointer to 512-bits configuration 39 1.1 joerg static __inline__ void __DEFAULT_FN_ATTRS_TILE 40 1.1 joerg _tile_loadconfig(const void *__config) { 41 1.1 joerg __builtin_ia32_tile_loadconfig(__config); 42 1.1 joerg } 43 1.1 joerg 44 1.1 joerg /// Stores the current tile configuration to a 64-byte memory location 45 1.1 joerg /// specified by "mem_addr". The tile configuration includes the tile type 46 1.1 joerg /// palette, the number of bytes per row, and the number of rows. If tiles 47 1.1 joerg /// are not configured, all zeroes will be stored to memory. 48 1.1 joerg /// 49 1.1 joerg /// \headerfile <immintrin.h> 50 1.1 joerg /// 51 1.1 joerg /// This intrinsic corresponds to the <c> STTILECFG </c> instruction. 52 1.1 joerg /// 53 1.1 joerg /// \param __config 54 1.1 joerg /// A pointer to 512-bits configuration 55 1.1 joerg static __inline__ void __DEFAULT_FN_ATTRS_TILE 56 1.1 joerg _tile_storeconfig(void *__config) { 57 1.1 joerg __builtin_ia32_tile_storeconfig(__config); 58 1.1 joerg } 59 1.1 joerg 60 1.1 joerg /// Release the tile configuration to return to the init state, which 61 1.1 joerg /// releases all storage it currently holds. 62 1.1 joerg /// 63 1.1 joerg /// \headerfile <immintrin.h> 64 1.1 joerg /// 65 1.1 joerg /// This intrinsic corresponds to the <c> TILERELEASE </c> instruction. 66 1.1 joerg static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) { 67 1.1 joerg __builtin_ia32_tilerelease(); 68 1.1 joerg } 69 1.1 joerg 70 1.1 joerg /// Load tile rows from memory specifieid by "base" address and "stride" into 71 1.1 joerg /// destination tile "dst" using the tile configuration previously configured 72 1.1 joerg /// via "_tile_loadconfig". 73 1.1 joerg /// 74 1.1 joerg /// \headerfile <immintrin.h> 75 1.1 joerg /// 76 1.1 joerg /// This intrinsic corresponds to the <c> TILELOADD </c> instruction. 77 1.1 joerg /// 78 1.1 joerg /// \param dst 79 1.1 joerg /// A destination tile. Max size is 1024 Bytes. 80 1.1 joerg /// \param base 81 1.1 joerg /// A pointer to base address. 82 1.1 joerg /// \param stride 83 1.1 joerg /// The stride between the rows' data to be loaded in memory. 84 1.1 joerg #define _tile_loadd(dst, base, stride) \ 85 1.1 joerg __builtin_ia32_tileloadd64((dst), ((const void *)(base)), \ 86 1.1 joerg (__SIZE_TYPE__)(stride)) 87 1.1 joerg 88 1.1 joerg /// Load tile rows from memory specifieid by "base" address and "stride" into 89 1.1 joerg /// destination tile "dst" using the tile configuration previously configured 90 1.1 joerg /// via "_tile_loadconfig". This intrinsic provides a hint to the implementation 91 1.1 joerg /// that the data will likely not be reused in the near future and the data 92 1.1 joerg /// caching can be optimized accordingly. 93 1.1 joerg /// 94 1.1 joerg /// \headerfile <immintrin.h> 95 1.1 joerg /// 96 1.1 joerg /// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction. 97 1.1 joerg /// 98 1.1 joerg /// \param dst 99 1.1 joerg /// A destination tile. Max size is 1024 Bytes. 100 1.1 joerg /// \param base 101 1.1 joerg /// A pointer to base address. 102 1.1 joerg /// \param stride 103 1.1 joerg /// The stride between the rows' data to be loaded in memory. 104 1.1 joerg #define _tile_stream_loadd(dst, base, stride) \ 105 1.1 joerg __builtin_ia32_tileloaddt164((dst), ((const void *)(base)), \ 106 1.1 joerg (__SIZE_TYPE__)(stride)) 107 1.1 joerg 108 1.1 joerg /// Store the tile specified by "src" to memory specifieid by "base" address and 109 1.1 joerg /// "stride" using the tile configuration previously configured via 110 1.1 joerg /// "_tile_loadconfig". 111 1.1 joerg /// 112 1.1 joerg /// \headerfile <immintrin.h> 113 1.1 joerg /// 114 1.1 joerg /// This intrinsic corresponds to the <c> TILESTORED </c> instruction. 115 1.1 joerg /// 116 1.1 joerg /// \param dst 117 1.1 joerg /// A destination tile. Max size is 1024 Bytes. 118 1.1 joerg /// \param base 119 1.1 joerg /// A pointer to base address. 120 1.1 joerg /// \param stride 121 1.1 joerg /// The stride between the rows' data to be stored in memory. 122 1.1 joerg #define _tile_stored(dst, base, stride) \ 123 1.1 joerg __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride)) 124 1.1 joerg 125 1.1 joerg /// Zero the tile specified by "tdest". 126 1.1 joerg /// 127 1.1 joerg /// \headerfile <immintrin.h> 128 1.1 joerg /// 129 1.1 joerg /// This intrinsic corresponds to the <c> TILEZERO </c> instruction. 130 1.1 joerg /// 131 1.1 joerg /// \param tile 132 1.1 joerg /// The destination tile to be zero. Max size is 1024 Bytes. 133 1.1 joerg #define _tile_zero(tile) __builtin_ia32_tilezero((tile)) 134 1.1 joerg 135 1.1 joerg /// Compute dot-product of bytes in tiles with a source/destination accumulator. 136 1.1 joerg /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with 137 1.1 joerg /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit 138 1.1 joerg /// results. Sum these 4 results with the corresponding 32-bit integer in "dst", 139 1.1 joerg /// and store the 32-bit result back to tile "dst". 140 1.1 joerg /// 141 1.1 joerg /// \headerfile <immintrin.h> 142 1.1 joerg /// 143 1.1 joerg /// This intrinsic corresponds to the <c> TDPBSSD </c> instruction. 144 1.1 joerg /// 145 1.1 joerg /// \param dst 146 1.1 joerg /// The destination tile. Max size is 1024 Bytes. 147 1.1 joerg /// \param src0 148 1.1 joerg /// The 1st source tile. Max size is 1024 Bytes. 149 1.1 joerg /// \param src1 150 1.1 joerg /// The 2nd source tile. Max size is 1024 Bytes. 151 1.1 joerg #define _tile_dpbssd(dst, src0, src1) \ 152 1.1 joerg __builtin_ia32_tdpbssd((dst), (src0), (src1)) 153 1.1 joerg 154 1.1 joerg /// Compute dot-product of bytes in tiles with a source/destination accumulator. 155 1.1 joerg /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with 156 1.1 joerg /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate 157 1.1 joerg /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer 158 1.1 joerg /// in "dst", and store the 32-bit result back to tile "dst". 159 1.1 joerg /// 160 1.1 joerg /// \headerfile <immintrin.h> 161 1.1 joerg /// 162 1.1 joerg /// This intrinsic corresponds to the <c> TDPBSUD </c> instruction. 163 1.1 joerg /// 164 1.1 joerg /// \param dst 165 1.1 joerg /// The destination tile. Max size is 1024 Bytes. 166 1.1 joerg /// \param src0 167 1.1 joerg /// The 1st source tile. Max size is 1024 Bytes. 168 1.1 joerg /// \param src1 169 1.1 joerg /// The 2nd source tile. Max size is 1024 Bytes. 170 1.1 joerg #define _tile_dpbsud(dst, src0, src1) \ 171 1.1 joerg __builtin_ia32_tdpbsud((dst), (src0), (src1)) 172 1.1 joerg 173 1.1 joerg /// Compute dot-product of bytes in tiles with a source/destination accumulator. 174 1.1 joerg /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with 175 1.1 joerg /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit 176 1.1 joerg /// results. Sum these 4 results with the corresponding 32-bit integer in "dst", 177 1.1 joerg /// and store the 32-bit result back to tile "dst". 178 1.1 joerg /// 179 1.1 joerg /// \headerfile <immintrin.h> 180 1.1 joerg /// 181 1.1 joerg /// This intrinsic corresponds to the <c> TDPBUSD </c> instruction. 182 1.1 joerg /// 183 1.1 joerg /// \param dst 184 1.1 joerg /// The destination tile. Max size is 1024 Bytes. 185 1.1 joerg /// \param src0 186 1.1 joerg /// The 1st source tile. Max size is 1024 Bytes. 187 1.1 joerg /// \param src1 188 1.1 joerg /// The 2nd source tile. Max size is 1024 Bytes. 189 1.1 joerg #define _tile_dpbusd(dst, src0, src1) \ 190 1.1 joerg __builtin_ia32_tdpbusd((dst), (src0), (src1)) 191 1.1 joerg 192 1.1 joerg /// Compute dot-product of bytes in tiles with a source/destination accumulator. 193 1.1 joerg /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with 194 1.1 joerg /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate 195 1.1 joerg /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in 196 1.1 joerg /// "dst", and store the 32-bit result back to tile "dst". 197 1.1 joerg /// 198 1.1 joerg /// \headerfile <immintrin.h> 199 1.1 joerg /// 200 1.1 joerg /// This intrinsic corresponds to the <c> TDPBUUD </c> instruction. 201 1.1 joerg /// 202 1.1 joerg /// \param dst 203 1.1 joerg /// The destination tile. Max size is 1024 Bytes. 204 1.1 joerg /// \param src0 205 1.1 joerg /// The 1st source tile. Max size is 1024 Bytes. 206 1.1 joerg /// \param src1 207 1.1 joerg /// The 2nd source tile. Max size is 1024 Bytes. 208 1.1 joerg #define _tile_dpbuud(dst, src0, src1) \ 209 1.1 joerg __builtin_ia32_tdpbuud((dst), (src0), (src1)) 210 1.1 joerg 211 1.1 joerg /// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and 212 1.1 joerg /// src1, accumulating the intermediate single-precision (32-bit) floating-point 213 1.1 joerg /// elements with elements in "dst", and store the 32-bit result back to tile 214 1.1 joerg /// "dst". 215 1.1 joerg /// 216 1.1 joerg /// \headerfile <immintrin.h> 217 1.1 joerg /// 218 1.1 joerg /// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction. 219 1.1 joerg /// 220 1.1 joerg /// \param dst 221 1.1 joerg /// The destination tile. Max size is 1024 Bytes. 222 1.1 joerg /// \param src0 223 1.1 joerg /// The 1st source tile. Max size is 1024 Bytes. 224 1.1 joerg /// \param src1 225 1.1 joerg /// The 2nd source tile. Max size is 1024 Bytes. 226 1.1 joerg #define _tile_dpbf16ps(dst, src0, src1) \ 227 1.1 joerg __builtin_ia32_tdpbf16ps((dst), (src0), (src1)) 228 1.1 joerg 229 1.1 joerg /// AMX tile register size can be configured, the maximum size is 16x64=1024 230 1.1 joerg /// bytes. Since there is no 2D type in llvm IR, we use vector type to 231 1.1 joerg /// represent 2D tile and the fixed size is maximum amx tile register size. 232 1.1 joerg typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64))); 233 1.1 joerg 234 1.1 joerg /// This is internal intrinsic. C/C++ user should avoid calling it directly. 235 1.1 joerg static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 236 1.1 joerg _tile_loadd_internal(unsigned short m, unsigned short n, const void *base, 237 1.1 joerg __SIZE_TYPE__ stride) { 238 1.1 joerg return __builtin_ia32_tileloadd64_internal(m, n, base, 239 1.1 joerg (__SIZE_TYPE__)(stride)); 240 1.1 joerg } 241 1.1 joerg 242 1.1 joerg /// This is internal intrinsic. C/C++ user should avoid calling it directly. 243 1.1 joerg static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 244 1.1 joerg _tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k, 245 1.1 joerg _tile1024i dst, _tile1024i src1, _tile1024i src2) { 246 1.1 joerg return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2); 247 1.1 joerg } 248 1.1 joerg 249 1.1 joerg /// This is internal intrinsic. C/C++ user should avoid calling it directly. 250 1.1 joerg static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 251 1.1 joerg _tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k, 252 1.1 joerg _tile1024i dst, _tile1024i src1, _tile1024i src2) { 253 1.1 joerg return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2); 254 1.1 joerg } 255 1.1 joerg 256 1.1 joerg /// This is internal intrinsic. C/C++ user should avoid calling it directly. 257 1.1 joerg static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 258 1.1 joerg _tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k, 259 1.1 joerg _tile1024i dst, _tile1024i src1, _tile1024i src2) { 260 1.1 joerg return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2); 261 1.1 joerg } 262 1.1 joerg 263 1.1 joerg /// This is internal intrinsic. C/C++ user should avoid calling it directly. 264 1.1 joerg static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 265 1.1 joerg _tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k, 266 1.1 joerg _tile1024i dst, _tile1024i src1, _tile1024i src2) { 267 1.1 joerg return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2); 268 1.1 joerg } 269 1.1 joerg 270 1.1 joerg /// This is internal intrinsic. C/C++ user should avoid calling it directly. 271 1.1 joerg static __inline__ void __DEFAULT_FN_ATTRS_INT8 272 1.1 joerg _tile_stored_internal(unsigned short m, unsigned short n, void *base, 273 1.1 joerg __SIZE_TYPE__ stride, _tile1024i tile) { 274 1.1 joerg return __builtin_ia32_tilestored64_internal(m, n, base, 275 1.1 joerg (__SIZE_TYPE__)(stride), tile); 276 1.1 joerg } 277 1.1 joerg 278 1.1 joerg /// This is internal intrinsic. C/C++ user should avoid calling it directly. 279 1.1 joerg static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16 280 1.1 joerg _tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k, 281 1.1 joerg _tile1024i dst, _tile1024i src1, _tile1024i src2) { 282 1.1 joerg return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2); 283 1.1 joerg } 284 1.1 joerg 285 1.1 joerg /// This struct pack the shape and tile data together for user. We suggest 286 1.1 joerg /// initializing the struct as early as possible, because compiler depends 287 1.1 joerg /// on the shape information to do configure. The constant value is preferred 288 1.1 joerg /// for optimization by compiler. 289 1.1 joerg typedef struct __tile1024i_str { 290 1.1 joerg const unsigned short row; 291 1.1 joerg const unsigned short col; 292 1.1 joerg _tile1024i tile; 293 1.1 joerg } __tile1024i; 294 1.1 joerg 295 1.1 joerg /// Load tile rows from memory specifieid by "base" address and "stride" into 296 1.1 joerg /// destination tile "dst". 297 1.1 joerg /// 298 1.1 joerg /// \headerfile <immintrin.h> 299 1.1 joerg /// 300 1.1 joerg /// This intrinsic corresponds to the <c> TILELOADD </c> instruction. 301 1.1 joerg /// 302 1.1 joerg /// \param dst 303 1.1 joerg /// A destination tile. Max size is 1024 Bytes. 304 1.1 joerg /// \param base 305 1.1 joerg /// A pointer to base address. 306 1.1 joerg /// \param stride 307 1.1 joerg /// The stride between the rows' data to be loaded in memory. 308 1.1 joerg __DEFAULT_FN_ATTRS_TILE 309 1.1 joerg static void __tile_loadd(__tile1024i *dst, const void *base, 310 1.1 joerg __SIZE_TYPE__ stride) { 311 1.1 joerg dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride); 312 1.1 joerg } 313 1.1 joerg 314 1.1 joerg /// Compute dot-product of bytes in tiles with a source/destination accumulator. 315 1.1 joerg /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with 316 1.1 joerg /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit 317 1.1 joerg /// results. Sum these 4 results with the corresponding 32-bit integer in "dst", 318 1.1 joerg /// and store the 32-bit result back to tile "dst". 319 1.1 joerg /// 320 1.1 joerg /// \headerfile <immintrin.h> 321 1.1 joerg /// 322 1.1 joerg /// This intrinsic corresponds to the <c> TDPBSSD </c> instruction. 323 1.1 joerg /// 324 1.1 joerg /// \param dst 325 1.1 joerg /// The destination tile. Max size is 1024 Bytes. 326 1.1 joerg /// \param src0 327 1.1 joerg /// The 1st source tile. Max size is 1024 Bytes. 328 1.1 joerg /// \param src1 329 1.1 joerg /// The 2nd source tile. Max size is 1024 Bytes. 330 1.1 joerg __DEFAULT_FN_ATTRS_INT8 331 1.1 joerg static void __tile_dpbssd(__tile1024i *dst, __tile1024i src0, 332 1.1 joerg __tile1024i src1) { 333 1.1 joerg dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile, 334 1.1 joerg src0.tile, src1.tile); 335 1.1 joerg } 336 1.1 joerg 337 1.1 joerg /// Compute dot-product of bytes in tiles with a source/destination accumulator. 338 1.1 joerg /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with 339 1.1 joerg /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate 340 1.1 joerg /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer 341 1.1 joerg /// in "dst", and store the 32-bit result back to tile "dst". 342 1.1 joerg /// 343 1.1 joerg /// \headerfile <immintrin.h> 344 1.1 joerg /// 345 1.1 joerg /// This intrinsic corresponds to the <c> TDPBSUD </c> instruction. 346 1.1 joerg /// 347 1.1 joerg /// \param dst 348 1.1 joerg /// The destination tile. Max size is 1024 Bytes. 349 1.1 joerg /// \param src0 350 1.1 joerg /// The 1st source tile. Max size is 1024 Bytes. 351 1.1 joerg /// \param src1 352 1.1 joerg /// The 2nd source tile. Max size is 1024 Bytes. 353 1.1 joerg __DEFAULT_FN_ATTRS_INT8 354 1.1 joerg static void __tile_dpbsud(__tile1024i *dst, __tile1024i src0, 355 1.1 joerg __tile1024i src1) { 356 1.1 joerg dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile, 357 1.1 joerg src0.tile, src1.tile); 358 1.1 joerg } 359 1.1 joerg 360 1.1 joerg /// Compute dot-product of bytes in tiles with a source/destination accumulator. 361 1.1 joerg /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with 362 1.1 joerg /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit 363 1.1 joerg /// results. Sum these 4 results with the corresponding 32-bit integer in "dst", 364 1.1 joerg /// and store the 32-bit result back to tile "dst". 365 1.1 joerg /// 366 1.1 joerg /// \headerfile <immintrin.h> 367 1.1 joerg /// 368 1.1 joerg /// This intrinsic corresponds to the <c> TDPBUSD </c> instruction. 369 1.1 joerg /// 370 1.1 joerg /// \param dst 371 1.1 joerg /// The destination tile. Max size is 1024 Bytes. 372 1.1 joerg /// \param src0 373 1.1 joerg /// The 1st source tile. Max size is 1024 Bytes. 374 1.1 joerg /// \param src1 375 1.1 joerg /// The 2nd source tile. Max size is 1024 Bytes. 376 1.1 joerg __DEFAULT_FN_ATTRS_INT8 377 1.1 joerg static void __tile_dpbusd(__tile1024i *dst, __tile1024i src0, 378 1.1 joerg __tile1024i src1) { 379 1.1 joerg dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile, 380 1.1 joerg src0.tile, src1.tile); 381 1.1 joerg } 382 1.1 joerg 383 1.1 joerg /// Compute dot-product of bytes in tiles with a source/destination accumulator. 384 1.1 joerg /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with 385 1.1 joerg /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate 386 1.1 joerg /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in 387 1.1 joerg /// "dst", and store the 32-bit result back to tile "dst". 388 1.1 joerg /// 389 1.1 joerg /// \headerfile <immintrin.h> 390 1.1 joerg /// 391 1.1 joerg /// This intrinsic corresponds to the <c> TDPBUUD </c> instruction. 392 1.1 joerg /// 393 1.1 joerg /// \param dst 394 1.1 joerg /// The destination tile. Max size is 1024 Bytes. 395 1.1 joerg /// \param src0 396 1.1 joerg /// The 1st source tile. Max size is 1024 Bytes. 397 1.1 joerg /// \param src1 398 1.1 joerg /// The 2nd source tile. Max size is 1024 Bytes. 399 1.1 joerg __DEFAULT_FN_ATTRS_INT8 400 1.1 joerg static void __tile_dpbuud(__tile1024i *dst, __tile1024i src0, 401 1.1 joerg __tile1024i src1) { 402 1.1 joerg dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile, 403 1.1 joerg src0.tile, src1.tile); 404 1.1 joerg } 405 1.1 joerg 406 1.1 joerg /// Store the tile specified by "src" to memory specifieid by "base" address and 407 1.1 joerg /// "stride". 408 1.1 joerg /// 409 1.1 joerg /// \headerfile <immintrin.h> 410 1.1 joerg /// 411 1.1 joerg /// This intrinsic corresponds to the <c> TILESTORED </c> instruction. 412 1.1 joerg /// 413 1.1 joerg /// \param dst 414 1.1 joerg /// A destination tile. Max size is 1024 Bytes. 415 1.1 joerg /// \param base 416 1.1 joerg /// A pointer to base address. 417 1.1 joerg /// \param stride 418 1.1 joerg /// The stride between the rows' data to be stored in memory. 419 1.1 joerg __DEFAULT_FN_ATTRS_TILE 420 1.1 joerg static void __tile_stored(void *base, __SIZE_TYPE__ stride, __tile1024i src) { 421 1.1 joerg _tile_stored_internal(src.row, src.col, base, stride, src.tile); 422 1.1 joerg } 423 1.1 joerg 424 1.1 joerg /// Zero the tile specified by "dst". 425 1.1 joerg /// 426 1.1 joerg /// \headerfile <immintrin.h> 427 1.1 joerg /// 428 1.1 joerg /// This intrinsic corresponds to the <c> TILEZERO </c> instruction. 429 1.1 joerg /// 430 1.1 joerg /// \param dst 431 1.1 joerg /// The destination tile to be zero. Max size is 1024 Bytes. 432 1.1 joerg __DEFAULT_FN_ATTRS_TILE 433 1.1 joerg static void __tile_zero(__tile1024i *dst) { 434 1.1 joerg dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col); 435 1.1 joerg } 436 1.1 joerg 437 1.1 joerg /// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and 438 1.1 joerg /// src1, accumulating the intermediate single-precision (32-bit) floating-point 439 1.1 joerg /// elements with elements in "dst", and store the 32-bit result back to tile 440 1.1 joerg /// "dst". 441 1.1 joerg /// 442 1.1 joerg /// \headerfile <immintrin.h> 443 1.1 joerg /// 444 1.1 joerg /// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction. 445 1.1 joerg /// 446 1.1 joerg /// \param dst 447 1.1 joerg /// The destination tile. Max size is 1024 Bytes. 448 1.1 joerg /// \param src0 449 1.1 joerg /// The 1st source tile. Max size is 1024 Bytes. 450 1.1 joerg /// \param src1 451 1.1 joerg /// The 2nd source tile. Max size is 1024 Bytes. 452 1.1 joerg __DEFAULT_FN_ATTRS_BF16 453 1.1 joerg static void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0, 454 1.1 joerg __tile1024i src1) { 455 1.1 joerg dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile, 456 1.1 joerg src0.tile, src1.tile); 457 1.1 joerg } 458 1.1 joerg 459 1.1 joerg #undef __DEFAULT_FN_ATTRS_TILE 460 1.1 joerg #undef __DEFAULT_FN_ATTRS_INT8 461 1.1 joerg #undef __DEFAULT_FN_ATTRS_BF16 462 1.1 joerg 463 1.1 joerg #endif /* __x86_64__ */ 464 1.1 joerg #endif /* __AMXINTRIN_H */ 465