Home | History | Annotate | Line # | Download | only in m4
      1 `/* Implementation of the MATMUL intrinsic
      2    Copyright (C) 2002-2022 Free Software Foundation, Inc.
      3    Contributed by Paul Brook <paul (a] nowt.org>
      4 
      5 This file is part of the GNU Fortran runtime library (libgfortran).
      6 
      7 Libgfortran is free software; you can redistribute it and/or
      8 modify it under the terms of the GNU General Public
      9 License as published by the Free Software Foundation; either
     10 version 3 of the License, or (at your option) any later version.
     11 
     12 Libgfortran is distributed in the hope that it will be useful,
     13 but WITHOUT ANY WARRANTY; without even the implied warranty of
     14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     15 GNU General Public License for more details.
     16 
     17 Under Section 7 of GPL version 3, you are granted additional
     18 permissions described in the GCC Runtime Library Exception, version
     19 3.1, as published by the Free Software Foundation.
     20 
     21 You should have received a copy of the GNU General Public License and
     22 a copy of the GCC Runtime Library Exception along with this program;
     23 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     24 <http://www.gnu.org/licenses/>.  */
     25 
     26 #include "libgfortran.h"
     27 #include <string.h>
     28 #include <assert.h>'
     29 
     30 include(iparm.m4)dnl
     31 
     32 `#if defined (HAVE_'rtype_name`)
     33 
     34 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
     35    passed to us by the front-end, in which case we call it for large
     36    matrices.  */
     37 
     38 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
     39                           const int *, const 'rtype_name` *, const 'rtype_name` *,
     40                           const int *, const 'rtype_name` *, const int *,
     41                           const 'rtype_name` *, 'rtype_name` *, const int *,
                                  /* NOTE(review): the trailing pair of ints looks like the
                                     hidden length arguments for the two leading character
                                     dummies (transa/transb) in the Fortran gemm binding --
                                     TODO confirm against the BLAS calling convention.  */
     42                           int, int);
     43 
     44 /* The order of loops is different in the case of plain matrix
     45    multiplication C=MATMUL(A,B), and in the frequent special case where
     46    the argument A is the temporary result of a TRANSPOSE intrinsic:
     47    C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
     48    looking at their strides.
     49 
     50    The equivalent Fortran pseudo-code is:
     51 
     52    DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
     53    IF (.NOT.IS_TRANSPOSED(A)) THEN
     54      C = 0
     55      DO J=1,N
     56        DO K=1,COUNT
     57          DO I=1,M
     58            C(I,J) = C(I,J)+A(I,K)*B(K,J)
     59    ELSE
     60      DO J=1,N
     61        DO I=1,M
     62          S = 0
     63          DO K=1,COUNT
     64            S = S+A(I,K)*B(K,J)
     65          C(I,J) = S
     66    ENDIF
     67 */
     68 
     69 /* If try_blas is set to a nonzero value, then the matmul function will
     70    see if there is a way to perform the matrix multiplication by a call
     71    to the BLAS gemm function.  */
     72 
     73 extern void matmul_'rtype_code` ('rtype` * const restrict retarray, 
     74 	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
     75 	int blas_limit, blas_call gemm);
     76 export_proto(matmul_'rtype_code`);
     77 
      78 /* Put exhaustive list of possible architectures here, ORed together.  */
     79 
     80 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
     81 
     82 #ifdef HAVE_AVX
     83 'define(`matmul_name',`matmul_'rtype_code`_avx')dnl
     84 `static void
     85 'matmul_name` ('rtype` * const restrict retarray, 
     86 	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
     87 	int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
     88 static' include(matmul_internal.m4)dnl
     89 `#endif /* HAVE_AVX */
     90 
     91 #ifdef HAVE_AVX2
     92 'define(`matmul_name',`matmul_'rtype_code`_avx2')dnl
     93 `static void
     94 'matmul_name` ('rtype` * const restrict retarray, 
     95 	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
     96 	int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
     97 static' include(matmul_internal.m4)dnl
     98 `#endif /* HAVE_AVX2 */
     99 
    100 #ifdef HAVE_AVX512F
    101 'define(`matmul_name',`matmul_'rtype_code`_avx512f')dnl
    102 `static void
    103 'matmul_name` ('rtype` * const restrict retarray, 
    104 	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
    105 	int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
    106 static' include(matmul_internal.m4)dnl
    107 `#endif  /* HAVE_AVX512F */
    108 
     109 /* AMD-specific functions with AVX128 and FMA3/FMA4.  */
    110 
    111 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
    112 'define(`matmul_name',`matmul_'rtype_code`_avx128_fma3')dnl
    113 `void
    114 'matmul_name` ('rtype` * const restrict retarray, 
    115 	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
    116 	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
    117 internal_proto('matmul_name`);
    118 #endif
    119 
    120 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
    121 'define(`matmul_name',`matmul_'rtype_code`_avx128_fma4')dnl
    122 `void
    123 'matmul_name` ('rtype` * const restrict retarray, 
    124 	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
    125 	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
    126 internal_proto('matmul_name`);
    127 #endif
    128 
    129 /* Function to fall back to if there is no special processor-specific version.  */
    130 'define(`matmul_name',`matmul_'rtype_code`_vanilla')dnl
    131 `static' include(matmul_internal.m4)dnl
    132 
    133 `/* Compiling main function, with selection code for the processor.  */
    134 
    135 /* Currently, this is i386 only.  Adjust for other architectures.  */
    136 
     137 void matmul_'rtype_code` ('rtype` * const restrict retarray, 
     138 	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
     139 	int blas_limit, blas_call gemm)
     140 {
           /* Cached pointer to the implementation selected for this CPU.
              Resolved lazily on the first call.  Relaxed atomics are enough
              here: every thread that races through the detection below
              computes and stores the same function pointer, so the only
              requirement is an atomic (tear-free) load/store.  */
     141   static void (*matmul_p) ('rtype` * const restrict retarray, 
     142 	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
     143 	int blas_limit, blas_call gemm);
     144 
     145   void (*matmul_fn) ('rtype` * const restrict retarray, 
     146 	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
     147 	int blas_limit, blas_call gemm);
     148 
     149   matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
     150   if (matmul_fn == NULL)
     151     {
           /* Start from the generic version and upgrade it if the runtime
              CPU advertises a better instruction set.  */
     152       matmul_fn = matmul_'rtype_code`_vanilla;
     153       if (__builtin_cpu_is ("intel"))
     154 	{
     155           /* Run down the available processors in order of preference.  */
     156 #ifdef HAVE_AVX512F
     157 	  if (__builtin_cpu_supports ("avx512f"))
     158 	    {
     159 	      matmul_fn = matmul_'rtype_code`_avx512f;
     160 	      goto store;
     161 	    }
     162 
     163 #endif  /* HAVE_AVX512F */
     164 
     165 #ifdef HAVE_AVX2
     166 	  if (__builtin_cpu_supports ("avx2")
     167 	      && __builtin_cpu_supports ("fma"))
     168 	    {
     169 	      matmul_fn = matmul_'rtype_code`_avx2;
     170 	      goto store;
     171 	    }
     172 
     173 #endif
     174 
     175 #ifdef HAVE_AVX
     176 	  if (__builtin_cpu_supports ("avx"))
     177  	    {
     178               matmul_fn = matmul_'rtype_code`_avx;
     179 	      goto store;
     180 	    }
     181 #endif  /* HAVE_AVX */
     182         }
           /* AMD path prefers the 128-bit AVX variants with FMA3/FMA4; see
              the prototypes above.  */
     183     else if (__builtin_cpu_is ("amd"))
     184       {
     185 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
     186 	if (__builtin_cpu_supports ("avx")
     187 	    && __builtin_cpu_supports ("fma"))
     188 	  {
     189             matmul_fn = matmul_'rtype_code`_avx128_fma3;
     190 	    goto store;
     191 	  }
     192 #endif
     193 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
     194 	if (__builtin_cpu_supports ("avx")
     195 	    && __builtin_cpu_supports ("fma4"))
     196 	  {
     197             matmul_fn = matmul_'rtype_code`_avx128_fma4;
     198 	    goto store;
     199 	  }
     200 #endif
     201 
     202       }
           /* Publish the selection so subsequent calls skip CPU detection.  */
     203    store:
     204       __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
     205    }
     206 
           /* Dispatch to the chosen implementation.  */
     207    (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
     208 }
    209 
    210 #else  /* Just the vanilla function.  */
    211 
    212 'define(`matmul_name',`matmul_'rtype_code)dnl
    213 define(`target_attribute',`')dnl
    214 include(matmul_internal.m4)dnl
    215 `#endif
    216 #endif
    217 '
    218