Home | History | Annotate | Line # | Download | only in m4
matmul.m4 revision 1.1.1.3
      1      1.1  mrg `/* Implementation of the MATMUL intrinsic
      2  1.1.1.3  mrg    Copyright (C) 2002-2022 Free Software Foundation, Inc.
      3      1.1  mrg    Contributed by Paul Brook <paul (a] nowt.org>
      4      1.1  mrg 
      5      1.1  mrg This file is part of the GNU Fortran runtime library (libgfortran).
      6      1.1  mrg 
      7      1.1  mrg Libgfortran is free software; you can redistribute it and/or
      8      1.1  mrg modify it under the terms of the GNU General Public
      9      1.1  mrg License as published by the Free Software Foundation; either
     10      1.1  mrg version 3 of the License, or (at your option) any later version.
     11      1.1  mrg 
     12      1.1  mrg Libgfortran is distributed in the hope that it will be useful,
     13      1.1  mrg but WITHOUT ANY WARRANTY; without even the implied warranty of
     14      1.1  mrg MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     15      1.1  mrg GNU General Public License for more details.
     16      1.1  mrg 
     17      1.1  mrg Under Section 7 of GPL version 3, you are granted additional
     18      1.1  mrg permissions described in the GCC Runtime Library Exception, version
     19      1.1  mrg 3.1, as published by the Free Software Foundation.
     20      1.1  mrg 
     21      1.1  mrg You should have received a copy of the GNU General Public License and
     22      1.1  mrg a copy of the GCC Runtime Library Exception along with this program;
     23      1.1  mrg see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     24      1.1  mrg <http://www.gnu.org/licenses/>.  */
     25      1.1  mrg 
     26      1.1  mrg #include "libgfortran.h"
     27      1.1  mrg #include <string.h>
     28      1.1  mrg #include <assert.h>'
     29      1.1  mrg 
     30      1.1  mrg include(iparm.m4)dnl
     31      1.1  mrg 
     32      1.1  mrg `#if defined (HAVE_'rtype_name`)
     33      1.1  mrg 
     34      1.1  mrg /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
     35      1.1  mrg    passed to us by the front-end, in which case we call it for large
     36      1.1  mrg    matrices.  */
     37      1.1  mrg 
     38      1.1  mrg typedef void (*blas_call)(const char *, const char *, const int *, const int *,
     39      1.1  mrg                           const int *, const 'rtype_name` *, const 'rtype_name` *,
     40      1.1  mrg                           const int *, const 'rtype_name` *, const int *,
     41      1.1  mrg                           const 'rtype_name` *, 'rtype_name` *, const int *,
     42      1.1  mrg                           int, int);
     43      1.1  mrg 
     44      1.1  mrg /* The order of loops is different in the case of plain matrix
     45      1.1  mrg    multiplication C=MATMUL(A,B), and in the frequent special case where
     46      1.1  mrg    the argument A is the temporary result of a TRANSPOSE intrinsic:
     47      1.1  mrg    C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
     48      1.1  mrg    looking at their strides.
     49      1.1  mrg 
     50      1.1  mrg    The equivalent Fortran pseudo-code is:
     51      1.1  mrg 
     52      1.1  mrg    DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
     53      1.1  mrg    IF (.NOT.IS_TRANSPOSED(A)) THEN
     54      1.1  mrg      C = 0
     55      1.1  mrg      DO J=1,N
     56      1.1  mrg        DO K=1,COUNT
     57      1.1  mrg          DO I=1,M
     58      1.1  mrg            C(I,J) = C(I,J)+A(I,K)*B(K,J)
     59      1.1  mrg    ELSE
     60      1.1  mrg      DO J=1,N
     61      1.1  mrg        DO I=1,M
     62      1.1  mrg          S = 0
     63      1.1  mrg          DO K=1,COUNT
     64      1.1  mrg            S = S+A(I,K)*B(K,J)
     65      1.1  mrg          C(I,J) = S
     66      1.1  mrg    ENDIF
     67      1.1  mrg */
     68      1.1  mrg 
     69      1.1  mrg /* If try_blas is set to a nonzero value, then the matmul function will
     70      1.1  mrg    see if there is a way to perform the matrix multiplication by a call
     71      1.1  mrg    to the BLAS gemm function.  */
     72      1.1  mrg 
     73      1.1  mrg extern void matmul_'rtype_code` ('rtype` * const restrict retarray, 
     74      1.1  mrg 	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
     75      1.1  mrg 	int blas_limit, blas_call gemm);
     76      1.1  mrg export_proto(matmul_'rtype_code`);
     77      1.1  mrg 
     78      1.1  mrg /* Put exhaustive list of possible architectures here here, ORed together.  */
     79      1.1  mrg 
     80      1.1  mrg #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
     81      1.1  mrg 
     82      1.1  mrg #ifdef HAVE_AVX
     83      1.1  mrg 'define(`matmul_name',`matmul_'rtype_code`_avx')dnl
     84      1.1  mrg `static void
     85      1.1  mrg 'matmul_name` ('rtype` * const restrict retarray, 
     86      1.1  mrg 	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
     87      1.1  mrg 	int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
     88      1.1  mrg static' include(matmul_internal.m4)dnl
     89      1.1  mrg `#endif /* HAVE_AVX */
     90      1.1  mrg 
     91      1.1  mrg #ifdef HAVE_AVX2
     92      1.1  mrg 'define(`matmul_name',`matmul_'rtype_code`_avx2')dnl
     93      1.1  mrg `static void
     94      1.1  mrg 'matmul_name` ('rtype` * const restrict retarray, 
     95      1.1  mrg 	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
     96      1.1  mrg 	int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
     97      1.1  mrg static' include(matmul_internal.m4)dnl
     98      1.1  mrg `#endif /* HAVE_AVX2 */
     99      1.1  mrg 
    100      1.1  mrg #ifdef HAVE_AVX512F
    101      1.1  mrg 'define(`matmul_name',`matmul_'rtype_code`_avx512f')dnl
    102      1.1  mrg `static void
    103      1.1  mrg 'matmul_name` ('rtype` * const restrict retarray, 
    104      1.1  mrg 	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
    105      1.1  mrg 	int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
    106      1.1  mrg static' include(matmul_internal.m4)dnl
    107      1.1  mrg `#endif  /* HAVE_AVX512F */
    108      1.1  mrg 
    109      1.1  mrg /* AMD-specifix funtions with AVX128 and FMA3/FMA4.  */
    110      1.1  mrg 
    111      1.1  mrg #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
    112      1.1  mrg 'define(`matmul_name',`matmul_'rtype_code`_avx128_fma3')dnl
    113      1.1  mrg `void
    114      1.1  mrg 'matmul_name` ('rtype` * const restrict retarray, 
    115      1.1  mrg 	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
    116      1.1  mrg 	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
    117      1.1  mrg internal_proto('matmul_name`);
    118      1.1  mrg #endif
    119      1.1  mrg 
    120      1.1  mrg #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
    121      1.1  mrg 'define(`matmul_name',`matmul_'rtype_code`_avx128_fma4')dnl
    122      1.1  mrg `void
    123      1.1  mrg 'matmul_name` ('rtype` * const restrict retarray, 
    124      1.1  mrg 	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
    125      1.1  mrg 	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
    126      1.1  mrg internal_proto('matmul_name`);
    127      1.1  mrg #endif
    128      1.1  mrg 
    129      1.1  mrg /* Function to fall back to if there is no special processor-specific version.  */
    130      1.1  mrg 'define(`matmul_name',`matmul_'rtype_code`_vanilla')dnl
    131      1.1  mrg `static' include(matmul_internal.m4)dnl
    132      1.1  mrg 
    133      1.1  mrg `/* Compiling main function, with selection code for the processor.  */
    134      1.1  mrg 
    135      1.1  mrg /* Currently, this is i386 only.  Adjust for other architectures.  */
    136      1.1  mrg 
    137      1.1  mrg void matmul_'rtype_code` ('rtype` * const restrict retarray, 
    138      1.1  mrg 	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
    139      1.1  mrg 	int blas_limit, blas_call gemm)
    140      1.1  mrg {
    141      1.1  mrg   static void (*matmul_p) ('rtype` * const restrict retarray, 
    142      1.1  mrg 	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
    143      1.1  mrg 	int blas_limit, blas_call gemm);
    144      1.1  mrg 
    145      1.1  mrg   void (*matmul_fn) ('rtype` * const restrict retarray, 
    146      1.1  mrg 	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
    147      1.1  mrg 	int blas_limit, blas_call gemm);
    148      1.1  mrg 
    149      1.1  mrg   matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
    150      1.1  mrg   if (matmul_fn == NULL)
    151      1.1  mrg     {
    152      1.1  mrg       matmul_fn = matmul_'rtype_code`_vanilla;
    153  1.1.1.3  mrg       if (__builtin_cpu_is ("intel"))
    154      1.1  mrg 	{
    155      1.1  mrg           /* Run down the available processors in order of preference.  */
    156      1.1  mrg #ifdef HAVE_AVX512F
    157  1.1.1.3  mrg 	  if (__builtin_cpu_supports ("avx512f"))
    158      1.1  mrg 	    {
    159      1.1  mrg 	      matmul_fn = matmul_'rtype_code`_avx512f;
    160      1.1  mrg 	      goto store;
    161      1.1  mrg 	    }
    162      1.1  mrg 
    163      1.1  mrg #endif  /* HAVE_AVX512F */
    164      1.1  mrg 
    165      1.1  mrg #ifdef HAVE_AVX2
    166  1.1.1.3  mrg 	  if (__builtin_cpu_supports ("avx2")
    167  1.1.1.3  mrg 	      && __builtin_cpu_supports ("fma"))
    168      1.1  mrg 	    {
    169      1.1  mrg 	      matmul_fn = matmul_'rtype_code`_avx2;
    170      1.1  mrg 	      goto store;
    171      1.1  mrg 	    }
    172      1.1  mrg 
    173      1.1  mrg #endif
    174      1.1  mrg 
    175      1.1  mrg #ifdef HAVE_AVX
    176  1.1.1.3  mrg 	  if (__builtin_cpu_supports ("avx"))
    177      1.1  mrg  	    {
    178      1.1  mrg               matmul_fn = matmul_'rtype_code`_avx;
    179      1.1  mrg 	      goto store;
    180      1.1  mrg 	    }
    181      1.1  mrg #endif  /* HAVE_AVX */
    182      1.1  mrg         }
    183  1.1.1.3  mrg     else if (__builtin_cpu_is ("amd"))
    184      1.1  mrg       {
    185      1.1  mrg #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
    186  1.1.1.3  mrg 	if (__builtin_cpu_supports ("avx")
    187  1.1.1.3  mrg 	    && __builtin_cpu_supports ("fma"))
    188      1.1  mrg 	  {
    189      1.1  mrg             matmul_fn = matmul_'rtype_code`_avx128_fma3;
    190      1.1  mrg 	    goto store;
    191      1.1  mrg 	  }
    192      1.1  mrg #endif
    193      1.1  mrg #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
    194  1.1.1.3  mrg 	if (__builtin_cpu_supports ("avx")
    195  1.1.1.3  mrg 	    && __builtin_cpu_supports ("fma4"))
    196      1.1  mrg 	  {
    197      1.1  mrg             matmul_fn = matmul_'rtype_code`_avx128_fma4;
    198      1.1  mrg 	    goto store;
    199      1.1  mrg 	  }
    200      1.1  mrg #endif
    201      1.1  mrg 
    202      1.1  mrg       }
    203      1.1  mrg    store:
    204      1.1  mrg       __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
    205      1.1  mrg    }
    206      1.1  mrg 
    207      1.1  mrg    (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
    208      1.1  mrg }
    209      1.1  mrg 
    210      1.1  mrg #else  /* Just the vanilla function.  */
    211      1.1  mrg 
    212      1.1  mrg 'define(`matmul_name',`matmul_'rtype_code)dnl
    213      1.1  mrg define(`target_attribute',`')dnl
    214      1.1  mrg include(matmul_internal.m4)dnl
    215      1.1  mrg `#endif
    216      1.1  mrg #endif
    217      1.1  mrg '
    218