`/* Implementation of the MATMUL intrinsic
   Copyright (C) 2002-2024 Free Software Foundation, Inc.
   Contributed by Paul Brook <paul@nowt.org>

This file is part of the GNU Fortran runtime library (libgfortran).

Libgfortran is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public
License as published by the Free Software Foundation; either
version 3 of the License, or (at your option) any later version.

Libgfortran is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

#include "libgfortran.h"
#include <string.h>
#include <assert.h>'

include(iparm.m4)dnl

`#if defined (HAVE_'rtype_name`)

/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
   passed to us by the front-end, in which case we call it for large
   matrices.  */

typedef void (*blas_call)(const char *, const char *, const int *, const int *,
                          const int *, const 'rtype_name` *, const 'rtype_name` *,
                          const int *, const 'rtype_name` *, const int *,
                          const 'rtype_name` *, 'rtype_name` *, const int *,
                          int, int);

/* The order of loops is different in the case of plain matrix
   multiplication C=MATMUL(A,B), and in the frequent special case where
   the argument A is the temporary result of a TRANSPOSE intrinsic:
   C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
   looking at their strides.

   The equivalent Fortran pseudo-code is:

   DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
   IF (.NOT.IS_TRANSPOSED(A)) THEN
     C = 0
     DO J=1,N
       DO K=1,COUNT
         DO I=1,M
           C(I,J) = C(I,J)+A(I,K)*B(K,J)
   ELSE
     DO J=1,N
       DO I=1,M
         S = 0
         DO K=1,COUNT
           S = S+A(I,K)*B(K,J)
         C(I,J) = S
   ENDIF
*/

/* If try_blas is set to a nonzero value, then the matmul function will
   see if there is a way to perform the matrix multiplication by a call
   to the BLAS gemm function.  */

extern void matmul_'rtype_code` ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm);
export_proto(matmul_'rtype_code`);

/* Put exhaustive list of possible architectures here, ORed together.  */

#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)

#ifdef HAVE_AVX
'define(`matmul_name',`matmul_'rtype_code`_avx')dnl
`static void
'matmul_name` ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
static' include(matmul_internal.m4)dnl
`#endif /* HAVE_AVX */

#ifdef HAVE_AVX2
'define(`matmul_name',`matmul_'rtype_code`_avx2')dnl
`static void
'matmul_name` ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
static' include(matmul_internal.m4)dnl
`#endif /* HAVE_AVX2 */

#ifdef HAVE_AVX512F
'define(`matmul_name',`matmul_'rtype_code`_avx512f')dnl
`static void
'matmul_name` ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
static' include(matmul_internal.m4)dnl
`#endif /* HAVE_AVX512F */

/* AMD-specific functions with AVX128 and FMA3/FMA4.  */

#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
'define(`matmul_name',`matmul_'rtype_code`_avx128_fma3')dnl
`void
'matmul_name` ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto('matmul_name`);
#endif

#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
'define(`matmul_name',`matmul_'rtype_code`_avx128_fma4')dnl
`void
'matmul_name` ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto('matmul_name`);
#endif

/* Function to fall back to if there is no special processor-specific version.  */
'define(`matmul_name',`matmul_'rtype_code`_vanilla')dnl
`static' include(matmul_internal.m4)dnl

`/* Compiling main function, with selection code for the processor.  */

/* Currently, this is i386 only.  Adjust for other architectures.  */

void matmul_'rtype_code` ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm)
{
  /* Cached pointer to the best implementation for this CPU; written once,
     then reused by every subsequent call.  Relaxed atomics suffice because
     all racing writers store the same value.  */
  static void (*matmul_p) ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm);

  void (*matmul_fn) ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm);

  matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
  if (matmul_fn == NULL)
    {
      matmul_fn = matmul_'rtype_code`_vanilla;
      if (__builtin_cpu_is ("intel"))
	{
	  /* Run down the available processors in order of preference.  */
#ifdef HAVE_AVX512F
	  if (__builtin_cpu_supports ("avx512f"))
	    {
	      matmul_fn = matmul_'rtype_code`_avx512f;
	      goto store;
	    }

#endif  /* HAVE_AVX512F */

#ifdef HAVE_AVX2
	  if (__builtin_cpu_supports ("avx2")
	      && __builtin_cpu_supports ("fma"))
	    {
	      matmul_fn = matmul_'rtype_code`_avx2;
	      goto store;
	    }

#endif

#ifdef HAVE_AVX
	  if (__builtin_cpu_supports ("avx"))
	    {
	      matmul_fn = matmul_'rtype_code`_avx;
	      goto store;
	    }
#endif  /* HAVE_AVX */
	}
      else if (__builtin_cpu_is ("amd"))
	{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
	  if (__builtin_cpu_supports ("avx")
	      && __builtin_cpu_supports ("fma"))
	    {
	      matmul_fn = matmul_'rtype_code`_avx128_fma3;
	      goto store;
	    }
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
	  if (__builtin_cpu_supports ("avx")
	      && __builtin_cpu_supports ("fma4"))
	    {
	      matmul_fn = matmul_'rtype_code`_avx128_fma4;
	      goto store;
	    }
#endif

	}
   store:
      __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
   }

  (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
}

#else  /* Just the vanilla function.  */

'define(`matmul_name',`matmul_'rtype_code)dnl
define(`target_attribute',`')dnl
include(matmul_internal.m4)dnl
`#endif
#endif
'