Home | History | Annotate | Line # | Download | only in Headers
      1  1.1  joerg /*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------===
      2  1.1  joerg  *
      3  1.1  joerg  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4  1.1  joerg  * See https://llvm.org/LICENSE.txt for license information.
      5  1.1  joerg  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6  1.1  joerg  *
      7  1.1  joerg  *===------------------------------------------------------------------------===
      8  1.1  joerg  */
      9  1.1  joerg 
     10  1.1  joerg #ifndef __IMMINTRIN_H
     11  1.1  joerg #error "Never use <amxintrin.h> directly; include <immintrin.h> instead."
     12  1.1  joerg #endif /* __IMMINTRIN_H */
     13  1.1  joerg 
     14  1.1  joerg #ifndef __AMXINTRIN_H
     15  1.1  joerg #define __AMXINTRIN_H
     16  1.1  joerg #ifdef __x86_64__
     17  1.1  joerg 
     18  1.1  joerg /* Define the default attributes for the functions in this file. */
     19  1.1  joerg #define __DEFAULT_FN_ATTRS_TILE                                                \
     20  1.1  joerg   __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
     21  1.1  joerg #define __DEFAULT_FN_ATTRS_INT8                                                \
     22  1.1  joerg   __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
     23  1.1  joerg #define __DEFAULT_FN_ATTRS_BF16                                                \
     24  1.1  joerg   __attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
     25  1.1  joerg 
     26  1.1  joerg /// Load tile configuration from a 64-byte memory location specified by
     27  1.1  joerg /// "mem_addr". The tile configuration includes the tile type palette, the
     28  1.1  joerg /// number of bytes per row, and the number of rows. If the specified
     29  1.1  joerg /// palette_id is zero, that signifies the init state for both the tile
     30  1.1  joerg /// config and the tile data, and the tiles are zeroed. Any invalid
     31  1.1  joerg /// configurations will result in #GP fault.
     32  1.1  joerg ///
     33  1.1  joerg /// \headerfile <immintrin.h>
     34  1.1  joerg ///
     35  1.1  joerg /// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.
     36  1.1  joerg ///
     37  1.1  joerg /// \param __config
     38  1.1  joerg ///    A pointer to 512-bits configuration
     39  1.1  joerg static __inline__ void __DEFAULT_FN_ATTRS_TILE
     40  1.1  joerg _tile_loadconfig(const void *__config) {
     41  1.1  joerg   __builtin_ia32_tile_loadconfig(__config);
     42  1.1  joerg }
     43  1.1  joerg 
     44  1.1  joerg /// Stores the current tile configuration to a 64-byte memory location
     45  1.1  joerg /// specified by "mem_addr". The tile configuration includes the tile type
     46  1.1  joerg /// palette, the number of bytes per row, and the number of rows. If tiles
     47  1.1  joerg /// are not configured, all zeroes will be stored to memory.
     48  1.1  joerg ///
     49  1.1  joerg /// \headerfile <immintrin.h>
     50  1.1  joerg ///
     51  1.1  joerg /// This intrinsic corresponds to the <c> STTILECFG </c> instruction.
     52  1.1  joerg ///
     53  1.1  joerg /// \param __config
     54  1.1  joerg ///    A pointer to 512-bits configuration
     55  1.1  joerg static __inline__ void __DEFAULT_FN_ATTRS_TILE
     56  1.1  joerg _tile_storeconfig(void *__config) {
     57  1.1  joerg   __builtin_ia32_tile_storeconfig(__config);
     58  1.1  joerg }
     59  1.1  joerg 
     60  1.1  joerg /// Release the tile configuration to return to the init state, which
     61  1.1  joerg /// releases all storage it currently holds.
     62  1.1  joerg ///
     63  1.1  joerg /// \headerfile <immintrin.h>
     64  1.1  joerg ///
     65  1.1  joerg /// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
     66  1.1  joerg static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
     67  1.1  joerg   __builtin_ia32_tilerelease();
     68  1.1  joerg }
     69  1.1  joerg 
     70  1.1  joerg /// Load tile rows from memory specifieid by "base" address and "stride" into
     71  1.1  joerg /// destination tile "dst" using the tile configuration previously configured
     72  1.1  joerg /// via "_tile_loadconfig".
     73  1.1  joerg ///
     74  1.1  joerg /// \headerfile <immintrin.h>
     75  1.1  joerg ///
     76  1.1  joerg /// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
     77  1.1  joerg ///
     78  1.1  joerg /// \param dst
     79  1.1  joerg ///    A destination tile. Max size is 1024 Bytes.
     80  1.1  joerg /// \param base
     81  1.1  joerg ///    A pointer to base address.
     82  1.1  joerg /// \param stride
     83  1.1  joerg ///    The stride between the rows' data to be loaded in memory.
     84  1.1  joerg #define _tile_loadd(dst, base, stride)                                         \
     85  1.1  joerg   __builtin_ia32_tileloadd64((dst), ((const void *)(base)),                    \
     86  1.1  joerg                              (__SIZE_TYPE__)(stride))
     87  1.1  joerg 
     88  1.1  joerg /// Load tile rows from memory specifieid by "base" address and "stride" into
     89  1.1  joerg /// destination tile "dst" using the tile configuration previously configured
     90  1.1  joerg /// via "_tile_loadconfig". This intrinsic provides a hint to the implementation
     91  1.1  joerg /// that the data will likely not be reused in the near future and the data
     92  1.1  joerg /// caching can be optimized accordingly.
     93  1.1  joerg ///
     94  1.1  joerg /// \headerfile <immintrin.h>
     95  1.1  joerg ///
     96  1.1  joerg /// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
     97  1.1  joerg ///
     98  1.1  joerg /// \param dst
     99  1.1  joerg ///    A destination tile. Max size is 1024 Bytes.
    100  1.1  joerg /// \param base
    101  1.1  joerg ///    A pointer to base address.
    102  1.1  joerg /// \param stride
    103  1.1  joerg ///    The stride between the rows' data to be loaded in memory.
    104  1.1  joerg #define _tile_stream_loadd(dst, base, stride)                                  \
    105  1.1  joerg   __builtin_ia32_tileloaddt164((dst), ((const void *)(base)),                  \
    106  1.1  joerg                                (__SIZE_TYPE__)(stride))
    107  1.1  joerg 
    108  1.1  joerg /// Store the tile specified by "src" to memory specifieid by "base" address and
    109  1.1  joerg /// "stride" using the tile configuration previously configured via
    110  1.1  joerg /// "_tile_loadconfig".
    111  1.1  joerg ///
    112  1.1  joerg /// \headerfile <immintrin.h>
    113  1.1  joerg ///
    114  1.1  joerg /// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
    115  1.1  joerg ///
    116  1.1  joerg /// \param dst
    117  1.1  joerg ///    A destination tile. Max size is 1024 Bytes.
    118  1.1  joerg /// \param base
    119  1.1  joerg ///    A pointer to base address.
    120  1.1  joerg /// \param stride
    121  1.1  joerg ///    The stride between the rows' data to be stored in memory.
    122  1.1  joerg #define _tile_stored(dst, base, stride)                                        \
    123  1.1  joerg   __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))
    124  1.1  joerg 
    125  1.1  joerg /// Zero the tile specified by "tdest".
    126  1.1  joerg ///
    127  1.1  joerg /// \headerfile <immintrin.h>
    128  1.1  joerg ///
    129  1.1  joerg /// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
    130  1.1  joerg ///
    131  1.1  joerg /// \param tile
    132  1.1  joerg ///    The destination tile to be zero. Max size is 1024 Bytes.
    133  1.1  joerg #define _tile_zero(tile) __builtin_ia32_tilezero((tile))
    134  1.1  joerg 
    135  1.1  joerg /// Compute dot-product of bytes in tiles with a source/destination accumulator.
    136  1.1  joerg /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
    137  1.1  joerg /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
    138  1.1  joerg /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
    139  1.1  joerg /// and store the 32-bit result back to tile "dst".
    140  1.1  joerg ///
    141  1.1  joerg /// \headerfile <immintrin.h>
    142  1.1  joerg ///
    143  1.1  joerg /// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
    144  1.1  joerg ///
    145  1.1  joerg /// \param dst
    146  1.1  joerg ///    The destination tile. Max size is 1024 Bytes.
    147  1.1  joerg /// \param src0
    148  1.1  joerg ///    The 1st source tile. Max size is 1024 Bytes.
    149  1.1  joerg /// \param src1
    150  1.1  joerg ///    The 2nd source tile. Max size is 1024 Bytes.
    151  1.1  joerg #define _tile_dpbssd(dst, src0, src1)                                          \
    152  1.1  joerg   __builtin_ia32_tdpbssd((dst), (src0), (src1))
    153  1.1  joerg 
    154  1.1  joerg /// Compute dot-product of bytes in tiles with a source/destination accumulator.
    155  1.1  joerg /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
    156  1.1  joerg /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
    157  1.1  joerg /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
    158  1.1  joerg /// in "dst", and store the 32-bit result back to tile "dst".
    159  1.1  joerg ///
    160  1.1  joerg /// \headerfile <immintrin.h>
    161  1.1  joerg ///
    162  1.1  joerg /// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
    163  1.1  joerg ///
    164  1.1  joerg /// \param dst
    165  1.1  joerg ///    The destination tile. Max size is 1024 Bytes.
    166  1.1  joerg /// \param src0
    167  1.1  joerg ///    The 1st source tile. Max size is 1024 Bytes.
    168  1.1  joerg /// \param src1
    169  1.1  joerg ///    The 2nd source tile. Max size is 1024 Bytes.
    170  1.1  joerg #define _tile_dpbsud(dst, src0, src1)                                          \
    171  1.1  joerg   __builtin_ia32_tdpbsud((dst), (src0), (src1))
    172  1.1  joerg 
    173  1.1  joerg /// Compute dot-product of bytes in tiles with a source/destination accumulator.
    174  1.1  joerg /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
    175  1.1  joerg /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
    176  1.1  joerg /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
    177  1.1  joerg /// and store the 32-bit result back to tile "dst".
    178  1.1  joerg ///
    179  1.1  joerg /// \headerfile <immintrin.h>
    180  1.1  joerg ///
    181  1.1  joerg /// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
    182  1.1  joerg ///
    183  1.1  joerg /// \param dst
    184  1.1  joerg ///    The destination tile. Max size is 1024 Bytes.
    185  1.1  joerg /// \param src0
    186  1.1  joerg ///    The 1st source tile. Max size is 1024 Bytes.
    187  1.1  joerg /// \param src1
    188  1.1  joerg ///    The 2nd source tile. Max size is 1024 Bytes.
    189  1.1  joerg #define _tile_dpbusd(dst, src0, src1)                                          \
    190  1.1  joerg   __builtin_ia32_tdpbusd((dst), (src0), (src1))
    191  1.1  joerg 
    192  1.1  joerg /// Compute dot-product of bytes in tiles with a source/destination accumulator.
    193  1.1  joerg /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
    194  1.1  joerg /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
    195  1.1  joerg /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
    196  1.1  joerg /// "dst", and store the 32-bit result back to tile "dst".
    197  1.1  joerg ///
    198  1.1  joerg /// \headerfile <immintrin.h>
    199  1.1  joerg ///
    200  1.1  joerg /// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
    201  1.1  joerg ///
    202  1.1  joerg /// \param dst
    203  1.1  joerg ///    The destination tile. Max size is 1024 Bytes.
    204  1.1  joerg /// \param src0
    205  1.1  joerg ///    The 1st source tile. Max size is 1024 Bytes.
    206  1.1  joerg /// \param src1
    207  1.1  joerg ///    The 2nd source tile. Max size is 1024 Bytes.
    208  1.1  joerg #define _tile_dpbuud(dst, src0, src1)                                          \
    209  1.1  joerg   __builtin_ia32_tdpbuud((dst), (src0), (src1))
    210  1.1  joerg 
    211  1.1  joerg /// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
    212  1.1  joerg /// src1, accumulating the intermediate single-precision (32-bit) floating-point
    213  1.1  joerg /// elements with elements in "dst", and store the 32-bit result back to tile
    214  1.1  joerg /// "dst".
    215  1.1  joerg ///
    216  1.1  joerg /// \headerfile <immintrin.h>
    217  1.1  joerg ///
    218  1.1  joerg /// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
    219  1.1  joerg ///
    220  1.1  joerg /// \param dst
    221  1.1  joerg ///    The destination tile. Max size is 1024 Bytes.
    222  1.1  joerg /// \param src0
    223  1.1  joerg ///    The 1st source tile. Max size is 1024 Bytes.
    224  1.1  joerg /// \param src1
    225  1.1  joerg ///    The 2nd source tile. Max size is 1024 Bytes.
    226  1.1  joerg #define _tile_dpbf16ps(dst, src0, src1)                                        \
    227  1.1  joerg   __builtin_ia32_tdpbf16ps((dst), (src0), (src1))
    228  1.1  joerg 
    229  1.1  joerg /// AMX tile register size can be configured, the maximum size is 16x64=1024
    230  1.1  joerg /// bytes. Since there is no 2D type in llvm IR, we use vector type to
    231  1.1  joerg /// represent 2D tile and the fixed size is maximum amx tile register size.
    232  1.1  joerg typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
    233  1.1  joerg 
    234  1.1  joerg /// This is internal intrinsic. C/C++ user should avoid calling it directly.
    235  1.1  joerg static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
    236  1.1  joerg _tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
    237  1.1  joerg                      __SIZE_TYPE__ stride) {
    238  1.1  joerg   return __builtin_ia32_tileloadd64_internal(m, n, base,
    239  1.1  joerg                                              (__SIZE_TYPE__)(stride));
    240  1.1  joerg }
    241  1.1  joerg 
    242  1.1  joerg /// This is internal intrinsic. C/C++ user should avoid calling it directly.
    243  1.1  joerg static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
    244  1.1  joerg _tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
    245  1.1  joerg                       _tile1024i dst, _tile1024i src1, _tile1024i src2) {
    246  1.1  joerg   return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
    247  1.1  joerg }
    248  1.1  joerg 
    249  1.1  joerg /// This is internal intrinsic. C/C++ user should avoid calling it directly.
    250  1.1  joerg static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
    251  1.1  joerg _tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k,
    252  1.1  joerg                       _tile1024i dst, _tile1024i src1, _tile1024i src2) {
    253  1.1  joerg   return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2);
    254  1.1  joerg }
    255  1.1  joerg 
    256  1.1  joerg /// This is internal intrinsic. C/C++ user should avoid calling it directly.
    257  1.1  joerg static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
    258  1.1  joerg _tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k,
    259  1.1  joerg                       _tile1024i dst, _tile1024i src1, _tile1024i src2) {
    260  1.1  joerg   return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2);
    261  1.1  joerg }
    262  1.1  joerg 
    263  1.1  joerg /// This is internal intrinsic. C/C++ user should avoid calling it directly.
    264  1.1  joerg static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
    265  1.1  joerg _tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k,
    266  1.1  joerg                       _tile1024i dst, _tile1024i src1, _tile1024i src2) {
    267  1.1  joerg   return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2);
    268  1.1  joerg }
    269  1.1  joerg 
    270  1.1  joerg /// This is internal intrinsic. C/C++ user should avoid calling it directly.
    271  1.1  joerg static __inline__ void __DEFAULT_FN_ATTRS_INT8
    272  1.1  joerg _tile_stored_internal(unsigned short m, unsigned short n, void *base,
    273  1.1  joerg                       __SIZE_TYPE__ stride, _tile1024i tile) {
    274  1.1  joerg   return __builtin_ia32_tilestored64_internal(m, n, base,
    275  1.1  joerg                                               (__SIZE_TYPE__)(stride), tile);
    276  1.1  joerg }
    277  1.1  joerg 
    278  1.1  joerg /// This is internal intrinsic. C/C++ user should avoid calling it directly.
    279  1.1  joerg static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16
    280  1.1  joerg _tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
    281  1.1  joerg                         _tile1024i dst, _tile1024i src1, _tile1024i src2) {
    282  1.1  joerg   return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
    283  1.1  joerg }
    284  1.1  joerg 
    285  1.1  joerg /// This struct pack the shape and tile data together for user. We suggest
    286  1.1  joerg /// initializing the struct as early as possible, because compiler depends
    287  1.1  joerg /// on the shape information to do configure. The constant value is preferred
    288  1.1  joerg /// for optimization by compiler.
    289  1.1  joerg typedef struct __tile1024i_str {
    290  1.1  joerg   const unsigned short row;
    291  1.1  joerg   const unsigned short col;
    292  1.1  joerg   _tile1024i tile;
    293  1.1  joerg } __tile1024i;
    294  1.1  joerg 
    295  1.1  joerg /// Load tile rows from memory specifieid by "base" address and "stride" into
    296  1.1  joerg /// destination tile "dst".
    297  1.1  joerg ///
    298  1.1  joerg /// \headerfile <immintrin.h>
    299  1.1  joerg ///
    300  1.1  joerg /// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
    301  1.1  joerg ///
    302  1.1  joerg /// \param dst
    303  1.1  joerg ///    A destination tile. Max size is 1024 Bytes.
    304  1.1  joerg /// \param base
    305  1.1  joerg ///    A pointer to base address.
    306  1.1  joerg /// \param stride
    307  1.1  joerg ///    The stride between the rows' data to be loaded in memory.
    308  1.1  joerg __DEFAULT_FN_ATTRS_TILE
    309  1.1  joerg static void __tile_loadd(__tile1024i *dst, const void *base,
    310  1.1  joerg                          __SIZE_TYPE__ stride) {
    311  1.1  joerg   dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
    312  1.1  joerg }
    313  1.1  joerg 
    314  1.1  joerg /// Compute dot-product of bytes in tiles with a source/destination accumulator.
    315  1.1  joerg /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
    316  1.1  joerg /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
    317  1.1  joerg /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
    318  1.1  joerg /// and store the 32-bit result back to tile "dst".
    319  1.1  joerg ///
    320  1.1  joerg /// \headerfile <immintrin.h>
    321  1.1  joerg ///
    322  1.1  joerg /// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
    323  1.1  joerg ///
    324  1.1  joerg /// \param dst
    325  1.1  joerg ///    The destination tile. Max size is 1024 Bytes.
    326  1.1  joerg /// \param src0
    327  1.1  joerg ///    The 1st source tile. Max size is 1024 Bytes.
    328  1.1  joerg /// \param src1
    329  1.1  joerg ///    The 2nd source tile. Max size is 1024 Bytes.
    330  1.1  joerg __DEFAULT_FN_ATTRS_INT8
    331  1.1  joerg static void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
    332  1.1  joerg                           __tile1024i src1) {
    333  1.1  joerg   dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile,
    334  1.1  joerg                                     src0.tile, src1.tile);
    335  1.1  joerg }
    336  1.1  joerg 
    337  1.1  joerg /// Compute dot-product of bytes in tiles with a source/destination accumulator.
    338  1.1  joerg /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
    339  1.1  joerg /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
    340  1.1  joerg /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
    341  1.1  joerg /// in "dst", and store the 32-bit result back to tile "dst".
    342  1.1  joerg ///
    343  1.1  joerg /// \headerfile <immintrin.h>
    344  1.1  joerg ///
    345  1.1  joerg /// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
    346  1.1  joerg ///
    347  1.1  joerg /// \param dst
    348  1.1  joerg ///    The destination tile. Max size is 1024 Bytes.
    349  1.1  joerg /// \param src0
    350  1.1  joerg ///    The 1st source tile. Max size is 1024 Bytes.
    351  1.1  joerg /// \param src1
    352  1.1  joerg ///    The 2nd source tile. Max size is 1024 Bytes.
    353  1.1  joerg __DEFAULT_FN_ATTRS_INT8
    354  1.1  joerg static void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
    355  1.1  joerg                           __tile1024i src1) {
    356  1.1  joerg   dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile,
    357  1.1  joerg                                     src0.tile, src1.tile);
    358  1.1  joerg }
    359  1.1  joerg 
    360  1.1  joerg /// Compute dot-product of bytes in tiles with a source/destination accumulator.
    361  1.1  joerg /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
    362  1.1  joerg /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
    363  1.1  joerg /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
    364  1.1  joerg /// and store the 32-bit result back to tile "dst".
    365  1.1  joerg ///
    366  1.1  joerg /// \headerfile <immintrin.h>
    367  1.1  joerg ///
    368  1.1  joerg /// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
    369  1.1  joerg ///
    370  1.1  joerg /// \param dst
    371  1.1  joerg ///    The destination tile. Max size is 1024 Bytes.
    372  1.1  joerg /// \param src0
    373  1.1  joerg ///    The 1st source tile. Max size is 1024 Bytes.
    374  1.1  joerg /// \param src1
    375  1.1  joerg ///    The 2nd source tile. Max size is 1024 Bytes.
    376  1.1  joerg __DEFAULT_FN_ATTRS_INT8
    377  1.1  joerg static void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
    378  1.1  joerg                           __tile1024i src1) {
    379  1.1  joerg   dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile,
    380  1.1  joerg                                     src0.tile, src1.tile);
    381  1.1  joerg }
    382  1.1  joerg 
    383  1.1  joerg /// Compute dot-product of bytes in tiles with a source/destination accumulator.
    384  1.1  joerg /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
    385  1.1  joerg /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
    386  1.1  joerg /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
    387  1.1  joerg /// "dst", and store the 32-bit result back to tile "dst".
    388  1.1  joerg ///
    389  1.1  joerg /// \headerfile <immintrin.h>
    390  1.1  joerg ///
    391  1.1  joerg /// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
    392  1.1  joerg ///
    393  1.1  joerg /// \param dst
    394  1.1  joerg ///    The destination tile. Max size is 1024 Bytes.
    395  1.1  joerg /// \param src0
    396  1.1  joerg ///    The 1st source tile. Max size is 1024 Bytes.
    397  1.1  joerg /// \param src1
    398  1.1  joerg ///    The 2nd source tile. Max size is 1024 Bytes.
    399  1.1  joerg __DEFAULT_FN_ATTRS_INT8
    400  1.1  joerg static void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
    401  1.1  joerg                           __tile1024i src1) {
    402  1.1  joerg   dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile,
    403  1.1  joerg                                     src0.tile, src1.tile);
    404  1.1  joerg }
    405  1.1  joerg 
    406  1.1  joerg /// Store the tile specified by "src" to memory specifieid by "base" address and
    407  1.1  joerg /// "stride".
    408  1.1  joerg ///
    409  1.1  joerg /// \headerfile <immintrin.h>
    410  1.1  joerg ///
    411  1.1  joerg /// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
    412  1.1  joerg ///
    413  1.1  joerg /// \param dst
    414  1.1  joerg ///    A destination tile. Max size is 1024 Bytes.
    415  1.1  joerg /// \param base
    416  1.1  joerg ///    A pointer to base address.
    417  1.1  joerg /// \param stride
    418  1.1  joerg ///    The stride between the rows' data to be stored in memory.
    419  1.1  joerg __DEFAULT_FN_ATTRS_TILE
    420  1.1  joerg static void __tile_stored(void *base, __SIZE_TYPE__ stride, __tile1024i src) {
    421  1.1  joerg   _tile_stored_internal(src.row, src.col, base, stride, src.tile);
    422  1.1  joerg }
    423  1.1  joerg 
    424  1.1  joerg /// Zero the tile specified by "dst".
    425  1.1  joerg ///
    426  1.1  joerg /// \headerfile <immintrin.h>
    427  1.1  joerg ///
    428  1.1  joerg /// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
    429  1.1  joerg ///
    430  1.1  joerg /// \param dst
    431  1.1  joerg ///    The destination tile to be zero. Max size is 1024 Bytes.
    432  1.1  joerg __DEFAULT_FN_ATTRS_TILE
    433  1.1  joerg static void __tile_zero(__tile1024i *dst) {
    434  1.1  joerg   dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
    435  1.1  joerg }
    436  1.1  joerg 
    437  1.1  joerg /// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
    438  1.1  joerg /// src1, accumulating the intermediate single-precision (32-bit) floating-point
    439  1.1  joerg /// elements with elements in "dst", and store the 32-bit result back to tile
    440  1.1  joerg /// "dst".
    441  1.1  joerg ///
    442  1.1  joerg /// \headerfile <immintrin.h>
    443  1.1  joerg ///
    444  1.1  joerg /// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
    445  1.1  joerg ///
    446  1.1  joerg /// \param dst
    447  1.1  joerg ///    The destination tile. Max size is 1024 Bytes.
    448  1.1  joerg /// \param src0
    449  1.1  joerg ///    The 1st source tile. Max size is 1024 Bytes.
    450  1.1  joerg /// \param src1
    451  1.1  joerg ///    The 2nd source tile. Max size is 1024 Bytes.
    452  1.1  joerg __DEFAULT_FN_ATTRS_BF16
    453  1.1  joerg static void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
    454  1.1  joerg                             __tile1024i src1) {
    455  1.1  joerg   dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
    456  1.1  joerg                                       src0.tile, src1.tile);
    457  1.1  joerg }
    458  1.1  joerg 
    459  1.1  joerg #undef __DEFAULT_FN_ATTRS_TILE
    460  1.1  joerg #undef __DEFAULT_FN_ATTRS_INT8
    461  1.1  joerg #undef __DEFAULT_FN_ATTRS_BF16
    462  1.1  joerg 
    463  1.1  joerg #endif /* __x86_64__ */
    464  1.1  joerg #endif /* __AMXINTRIN_H */
    465