`/* Implementation of the MATMUL intrinsic
   Copyright (C) 2002-2024 Free Software Foundation, Inc.
   Contributed by Paul Brook <paul@nowt.org>

This file is part of the GNU Fortran runtime library (libgfortran).

Libgfortran is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public
License as published by the Free Software Foundation; either
version 3 of the License, or (at your option) any later version.

Libgfortran is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

#include "libgfortran.h"
#include <string.h>
#include <assert.h>'

include(iparm.m4)dnl

`#if defined (HAVE_'rtype_name`)

/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
   passed to us by the front-end, in which case we call it for large
   matrices.  */

typedef void (*blas_call)(const char *, const char *, const int *, const int *,
                          const int *, const 'rtype_name` *, const 'rtype_name` *,
                          const int *, const 'rtype_name` *, const int *,
                          const 'rtype_name` *, 'rtype_name` *, const int *,
                          int, int);

/* The order of loops is different in the case of plain matrix
   multiplication C=MATMUL(A,B), and in the frequent special case where
   the argument A is the temporary result of a TRANSPOSE intrinsic:
   C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
   looking at their strides.

   The equivalent Fortran pseudo-code is:

   DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
   IF (.NOT.IS_TRANSPOSED(A)) THEN
     C = 0
     DO J=1,N
       DO K=1,COUNT
         DO I=1,M
           C(I,J) = C(I,J)+A(I,K)*B(K,J)
   ELSE
     DO J=1,N
       DO I=1,M
         S = 0
         DO K=1,COUNT
           S = S+A(I,K)*B(K,J)
         C(I,J) = S
   ENDIF
*/

/* If try_blas is set to a nonzero value, then the matmul function will
   see if there is a way to perform the matrix multiplication by a call
   to the BLAS gemm function.  */

extern void matmul_'rtype_code` ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm);
export_proto(matmul_'rtype_code`);

/* Put exhaustive list of possible architectures here, ORed together.  */

#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)

#ifdef HAVE_AVX
'define(`matmul_name',`matmul_'rtype_code`_avx')dnl
`static void
'matmul_name` ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
static' include(matmul_internal.m4)dnl
`#endif /* HAVE_AVX */

#ifdef HAVE_AVX2
'define(`matmul_name',`matmul_'rtype_code`_avx2')dnl
`static void
'matmul_name` ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
static' include(matmul_internal.m4)dnl
`#endif /* HAVE_AVX2 */

#ifdef HAVE_AVX512F
'define(`matmul_name',`matmul_'rtype_code`_avx512f')dnl
`static void
'matmul_name` ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
static' include(matmul_internal.m4)dnl
`#endif /* HAVE_AVX512F */

/* AMD-specific functions with AVX128 and FMA3/FMA4.  */

#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
'define(`matmul_name',`matmul_'rtype_code`_avx128_fma3')dnl
`void
'matmul_name` ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto('matmul_name`);
#endif

#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
'define(`matmul_name',`matmul_'rtype_code`_avx128_fma4')dnl
`void
'matmul_name` ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto('matmul_name`);
#endif

/* Function to fall back to if there is no special processor-specific version.  */
'define(`matmul_name',`matmul_'rtype_code`_vanilla')dnl
`static' include(matmul_internal.m4)dnl

`/* Compiling main function, with selection code for the processor.  */

/* Currently, this is i386 only.  Adjust for other architectures.  */

void matmul_'rtype_code` ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm)
{
  /* Cached pointer to the best implementation for this CPU; written once,
     then reused by every subsequent call.  Relaxed atomics suffice because
     all racing writers store the same value.  */
  static void (*matmul_p) ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm);

  void (*matmul_fn) ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm);

  matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
  if (matmul_fn == NULL)
    {
      matmul_fn = matmul_'rtype_code`_vanilla;
      if (__builtin_cpu_is ("intel"))
	{
	  /* Run down the available processors in order of preference.  */
#ifdef HAVE_AVX512F
	  if (__builtin_cpu_supports ("avx512f"))
	    {
	      matmul_fn = matmul_'rtype_code`_avx512f;
	      goto store;
	    }

#endif  /* HAVE_AVX512F */

#ifdef HAVE_AVX2
	  if (__builtin_cpu_supports ("avx2")
	      && __builtin_cpu_supports ("fma"))
	    {
	      matmul_fn = matmul_'rtype_code`_avx2;
	      goto store;
	    }

#endif

#ifdef HAVE_AVX
	  if (__builtin_cpu_supports ("avx"))
	    {
	      matmul_fn = matmul_'rtype_code`_avx;
	      goto store;
	    }
#endif  /* HAVE_AVX */
	}
      else if (__builtin_cpu_is ("amd"))
	{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
	  if (__builtin_cpu_supports ("avx")
	      && __builtin_cpu_supports ("fma"))
	    {
	      matmul_fn = matmul_'rtype_code`_avx128_fma3;
	      goto store;
	    }
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
	  if (__builtin_cpu_supports ("avx")
	      && __builtin_cpu_supports ("fma4"))
	    {
	      matmul_fn = matmul_'rtype_code`_avx128_fma4;
	      goto store;
	    }
#endif

	}
   store:
      __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
   }

  (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
}

#else  /* Just the vanilla function.  */

'define(`matmul_name',`matmul_'rtype_code)dnl
define(`target_attribute',`')dnl
include(matmul_internal.m4)dnl
`#endif
#endif
'