matmul.m4 revision 1.1.1.4 1 1.1 mrg `/* Implementation of the MATMUL intrinsic
2 1.1.1.4 mrg Copyright (C) 2002-2024 Free Software Foundation, Inc.
3 1.1 mrg Contributed by Paul Brook <paul (a] nowt.org>
4 1.1 mrg
5 1.1 mrg This file is part of the GNU Fortran runtime library (libgfortran).
6 1.1 mrg
7 1.1 mrg Libgfortran is free software; you can redistribute it and/or
8 1.1 mrg modify it under the terms of the GNU General Public
9 1.1 mrg License as published by the Free Software Foundation; either
10 1.1 mrg version 3 of the License, or (at your option) any later version.
11 1.1 mrg
12 1.1 mrg Libgfortran is distributed in the hope that it will be useful,
13 1.1 mrg but WITHOUT ANY WARRANTY; without even the implied warranty of
14 1.1 mrg MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 1.1 mrg GNU General Public License for more details.
16 1.1 mrg
17 1.1 mrg Under Section 7 of GPL version 3, you are granted additional
18 1.1 mrg permissions described in the GCC Runtime Library Exception, version
19 1.1 mrg 3.1, as published by the Free Software Foundation.
20 1.1 mrg
21 1.1 mrg You should have received a copy of the GNU General Public License and
22 1.1 mrg a copy of the GCC Runtime Library Exception along with this program;
23 1.1 mrg see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24 1.1 mrg <http://www.gnu.org/licenses/>. */
25 1.1 mrg
26 1.1 mrg #include "libgfortran.h"
27 1.1 mrg #include <string.h>
28 1.1 mrg #include <assert.h>'
29 1.1 mrg
30 1.1 mrg include(iparm.m4)dnl
31 1.1 mrg
32 1.1 mrg `#if defined (HAVE_'rtype_name`)
33 1.1 mrg
34 1.1 mrg /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
35 1.1 mrg passed to us by the front-end, in which case we call it for large
36 1.1 mrg matrices. */
37 1.1 mrg
/* Pointer-to-gemm type.  The argument list mirrors a Fortran gemm
   binding; the two trailing ints are presumably the hidden character
   lengths of the first two (character) arguments -- confirm against
   the front-end that passes this pointer in.  */
38 1.1 mrg typedef void (*blas_call)(const char *, const char *, const int *, const int *,
39 1.1 mrg                           const int *, const 'rtype_name` *, const 'rtype_name` *,
40 1.1 mrg                           const int *, const 'rtype_name` *, const int *,
41 1.1 mrg                           const 'rtype_name` *, 'rtype_name` *, const int *,
42 1.1 mrg                           int, int);
43 1.1 mrg
44 1.1 mrg /* The order of loops is different in the case of plain matrix
45 1.1 mrg multiplication C=MATMUL(A,B), and in the frequent special case where
46 1.1 mrg the argument A is the temporary result of a TRANSPOSE intrinsic:
47 1.1 mrg C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
48 1.1 mrg looking at their strides.
49 1.1 mrg
50 1.1 mrg The equivalent Fortran pseudo-code is:
51 1.1 mrg
52 1.1 mrg DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
53 1.1 mrg IF (.NOT.IS_TRANSPOSED(A)) THEN
54 1.1 mrg C = 0
55 1.1 mrg DO J=1,N
56 1.1 mrg DO K=1,COUNT
57 1.1 mrg DO I=1,M
58 1.1 mrg C(I,J) = C(I,J)+A(I,K)*B(K,J)
59 1.1 mrg ELSE
60 1.1 mrg DO J=1,N
61 1.1 mrg DO I=1,M
62 1.1 mrg S = 0
63 1.1 mrg DO K=1,COUNT
64 1.1 mrg S = S+A(I,K)*B(K,J)
65 1.1 mrg C(I,J) = S
66 1.1 mrg ENDIF
67 1.1 mrg */
68 1.1 mrg
69 1.1 mrg /* If try_blas is set to a nonzero value, then the matmul function will
70 1.1 mrg see if there is a way to perform the matrix multiplication by a call
71 1.1 mrg to the BLAS gemm function. */
72 1.1 mrg
73 1.1 mrg extern void matmul_'rtype_code` ('rtype` * const restrict retarray,
74 1.1 mrg 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
75 1.1 mrg int blas_limit, blas_call gemm);
76 1.1 mrg export_proto(matmul_'rtype_code`);
77 1.1 mrg
78 1.1 mrg /* Put exhaustive list of possible architectures here, ORed together. */
79 1.1 mrg
80 1.1 mrg #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
81 1.1 mrg
82 1.1 mrg #ifdef HAVE_AVX
83 1.1 mrg 'define(`matmul_name',`matmul_'rtype_code`_avx')dnl
84 1.1 mrg `static void
85 1.1 mrg 'matmul_name` ('rtype` * const restrict retarray,
86 1.1 mrg 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
87 1.1 mrg int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
88 1.1 mrg static' include(matmul_internal.m4)dnl
89 1.1 mrg `#endif /* HAVE_AVX */
90 1.1 mrg
91 1.1 mrg #ifdef HAVE_AVX2
92 1.1 mrg 'define(`matmul_name',`matmul_'rtype_code`_avx2')dnl
93 1.1 mrg `static void
94 1.1 mrg 'matmul_name` ('rtype` * const restrict retarray,
95 1.1 mrg 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
96 1.1 mrg int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
97 1.1 mrg static' include(matmul_internal.m4)dnl
98 1.1 mrg `#endif /* HAVE_AVX2 */
99 1.1 mrg
100 1.1 mrg #ifdef HAVE_AVX512F
101 1.1 mrg 'define(`matmul_name',`matmul_'rtype_code`_avx512f')dnl
102 1.1 mrg `static void
103 1.1 mrg 'matmul_name` ('rtype` * const restrict retarray,
104 1.1 mrg 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
105 1.1 mrg int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
106 1.1 mrg static' include(matmul_internal.m4)dnl
107 1.1 mrg `#endif /* HAVE_AVX512F */
108 1.1 mrg
109 1.1 mrg /* AMD-specific functions with AVX128 and FMA3/FMA4. */
110 1.1 mrg
111 1.1 mrg #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
112 1.1 mrg 'define(`matmul_name',`matmul_'rtype_code`_avx128_fma3')dnl
113 1.1 mrg `void
114 1.1 mrg 'matmul_name` ('rtype` * const restrict retarray,
115 1.1 mrg 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
116 1.1 mrg int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
117 1.1 mrg internal_proto('matmul_name`);
118 1.1 mrg #endif
119 1.1 mrg
120 1.1 mrg #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
121 1.1 mrg 'define(`matmul_name',`matmul_'rtype_code`_avx128_fma4')dnl
122 1.1 mrg `void
123 1.1 mrg 'matmul_name` ('rtype` * const restrict retarray,
124 1.1 mrg 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
125 1.1 mrg int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
126 1.1 mrg internal_proto('matmul_name`);
127 1.1 mrg #endif
128 1.1 mrg
129 1.1 mrg /* Function to fall back to if there is no special processor-specific version. */
130 1.1 mrg 'define(`matmul_name',`matmul_'rtype_code`_vanilla')dnl
131 1.1 mrg `static' include(matmul_internal.m4)dnl
132 1.1 mrg
133 1.1 mrg `/* Compiling main function, with selection code for the processor. */
134 1.1 mrg
135 1.1 mrg /* Currently, this is i386 only. Adjust for other architectures. */
136 1.1 mrg
137 1.1 mrg void matmul_'rtype_code` ('rtype` * const restrict retarray,
138 1.1 mrg 	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
139 1.1 mrg 	int blas_limit, blas_call gemm)
140 1.1 mrg {
  /* Cached pointer to the implementation selected for this CPU.
     Written once on the first call and reused by all later calls.  */
141 1.1 mrg   static void (*matmul_p) ('rtype` * const restrict retarray,
142 1.1 mrg 	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
143 1.1 mrg 	int blas_limit, blas_call gemm);
144 1.1 mrg 
  /* Local copy so the atomic load of the cache happens only once.  */
145 1.1 mrg   void (*matmul_fn) ('rtype` * const restrict retarray,
146 1.1 mrg 	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
147 1.1 mrg 	int blas_limit, blas_call gemm);
148 1.1 mrg 
  /* Relaxed ordering is sufficient here: every thread that races on
     the first call computes the same pointer value, so any winner of
     the store below is acceptable.  */
149 1.1 mrg   matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
150 1.1 mrg   if (matmul_fn == NULL)
151 1.1 mrg     {
  /* Start from the generic kernel, then upgrade if the running CPU
     offers a better vector extension.  */
152 1.1 mrg       matmul_fn = matmul_'rtype_code`_vanilla;
153 1.1.1.3 mrg       if (__builtin_cpu_is ("intel"))
154 1.1 mrg 	{
155 1.1 mrg           /* Run down the available processors in order of preference. */
156 1.1 mrg #ifdef HAVE_AVX512F
157 1.1.1.3 mrg 	  if (__builtin_cpu_supports ("avx512f"))
158 1.1 mrg 	    {
159 1.1 mrg 	      matmul_fn = matmul_'rtype_code`_avx512f;
160 1.1 mrg 	      goto store;
161 1.1 mrg 	    }
162 1.1 mrg 
163 1.1 mrg #endif  /* HAVE_AVX512F */
164 1.1 mrg 
165 1.1 mrg #ifdef HAVE_AVX2
166 1.1.1.3 mrg 	  if (__builtin_cpu_supports ("avx2")
167 1.1.1.3 mrg 	      && __builtin_cpu_supports ("fma"))
168 1.1 mrg 	    {
169 1.1 mrg 	      matmul_fn = matmul_'rtype_code`_avx2;
170 1.1 mrg 	      goto store;
171 1.1 mrg 	    }
172 1.1 mrg 
173 1.1 mrg #endif
174 1.1 mrg 
175 1.1 mrg #ifdef HAVE_AVX
176 1.1.1.3 mrg  	  if (__builtin_cpu_supports ("avx"))
177 1.1 mrg  	    {
178 1.1 mrg             matmul_fn = matmul_'rtype_code`_avx;
179 1.1 mrg 	      goto store;
180 1.1 mrg 	    }
181 1.1 mrg #endif  /* HAVE_AVX */
182 1.1 mrg         }
  /* On AMD hardware prefer the 128-bit AVX kernels with FMA3 or
     FMA4 over the wider kernels above.  */
183 1.1.1.3 mrg     else if (__builtin_cpu_is ("amd"))
184 1.1 mrg       {
185 1.1 mrg #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
186 1.1.1.3 mrg 	if (__builtin_cpu_supports ("avx")
187 1.1.1.3 mrg 	    && __builtin_cpu_supports ("fma"))
188 1.1 mrg 	  {
189 1.1 mrg             matmul_fn = matmul_'rtype_code`_avx128_fma3;
190 1.1 mrg 	    goto store;
191 1.1 mrg 	  }
192 1.1 mrg #endif
193 1.1 mrg #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
194 1.1.1.3 mrg 	if (__builtin_cpu_supports ("avx")
195 1.1.1.3 mrg 	    && __builtin_cpu_supports ("fma4"))
196 1.1 mrg 	  {
197 1.1 mrg             matmul_fn = matmul_'rtype_code`_avx128_fma4;
198 1.1 mrg 	    goto store;
199 1.1 mrg 	  }
200 1.1 mrg #endif
201 1.1 mrg 
202 1.1 mrg       }
  /* Publish the selection so later calls skip the detection above.  */
203 1.1 mrg    store:
204 1.1 mrg       __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
205 1.1 mrg    }
206 1.1 mrg 
  /* Dispatch to the selected implementation.  */
207 1.1 mrg    (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
208 1.1 mrg }
209 1.1 mrg
210 1.1 mrg #else /* Just the vanilla function. */
211 1.1 mrg
212 1.1 mrg 'define(`matmul_name',`matmul_'rtype_code)dnl
213 1.1 mrg define(`target_attribute',`')dnl
214 1.1 mrg include(matmul_internal.m4)dnl
215 1.1 mrg `#endif
216 1.1 mrg #endif
217 1.1 mrg '
218