1 1.1 mrg /* Implementation of the MATMUL intrinsic 2 1.1.1.4 mrg Copyright (C) 2002-2024 Free Software Foundation, Inc. 3 1.1 mrg Contributed by Thomas Koenig <tkoenig (at) gcc.gnu.org>. 4 1.1 mrg 5 1.1 mrg This file is part of the GNU Fortran runtime library (libgfortran). 6 1.1 mrg 7 1.1 mrg Libgfortran is free software; you can redistribute it and/or 8 1.1 mrg modify it under the terms of the GNU General Public 9 1.1 mrg License as published by the Free Software Foundation; either 10 1.1 mrg version 3 of the License, or (at your option) any later version. 11 1.1 mrg 12 1.1 mrg Libgfortran is distributed in the hope that it will be useful, 13 1.1 mrg but WITHOUT ANY WARRANTY; without even the implied warranty of 14 1.1 mrg MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 1.1 mrg GNU General Public License for more details. 16 1.1 mrg 17 1.1 mrg Under Section 7 of GPL version 3, you are granted additional 18 1.1 mrg permissions described in the GCC Runtime Library Exception, version 19 1.1 mrg 3.1, as published by the Free Software Foundation. 20 1.1 mrg 21 1.1 mrg You should have received a copy of the GNU General Public License and 22 1.1 mrg a copy of the GCC Runtime Library Exception along with this program; 23 1.1 mrg see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 24 1.1 mrg <http://www.gnu.org/licenses/>. */ 25 1.1 mrg 26 1.1 mrg #include "libgfortran.h" 27 1.1 mrg #include <string.h> 28 1.1 mrg #include <assert.h> 29 1.1 mrg 30 1.1 mrg 31 1.1 mrg /* These are the specific versions of matmul with -mprefer-avx128. */ 32 1.1 mrg 33 1.1 mrg #if defined (HAVE_GFC_REAL_10) 34 1.1 mrg 35 1.1 mrg /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be 36 1.1 mrg passed to us by the front-end, in which case we call it for large 37 1.1 mrg matrices. */ 38 1.1 mrg 39 1.1 mrg typedef void (*blas_call)(const char *, const char *, const int *, const int *, 40 1.1 mrg const int *, const GFC_REAL_10 *, const GFC_REAL_10 *, 41 1.1 mrg const int *, const GFC_REAL_10 *, const int *, 42 1.1 mrg const GFC_REAL_10 *, GFC_REAL_10 *, const int *, 43 1.1 mrg int, int); 44 1.1 mrg 45 1.1 mrg #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128) 46 1.1 mrg void 47 1.1 mrg matmul_r10_avx128_fma3 (gfc_array_r10 * const restrict retarray, 48 1.1 mrg gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas, 49 1.1 mrg int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma"))); 50 1.1 mrg internal_proto(matmul_r10_avx128_fma3); 51 1.1 mrg void 52 1.1 mrg matmul_r10_avx128_fma3 (gfc_array_r10 * const restrict retarray, 53 1.1 mrg gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas, 54 1.1 mrg int blas_limit, blas_call gemm) 55 1.1 mrg { 56 1.1 mrg const GFC_REAL_10 * restrict abase; 57 1.1 mrg const GFC_REAL_10 * restrict bbase; 58 1.1 mrg GFC_REAL_10 * restrict dest; 59 1.1 mrg 60 1.1 mrg index_type rxstride, rystride, axstride, aystride, bxstride, bystride; 61 1.1 mrg index_type x, y, n, count, xcount, ycount; 62 1.1 mrg 63 1.1 mrg assert (GFC_DESCRIPTOR_RANK (a) == 2 64 1.1 mrg || GFC_DESCRIPTOR_RANK (b) == 2); 65 1.1 mrg 66 1.1 mrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount] 67 1.1 mrg 68 1.1 mrg Either A or B (but not both) can be rank 1: 69 1.1 mrg 70 1.1 mrg o One-dimensional argument A is implicitly treated as a row matrix 71 1.1 mrg dimensioned [1,count], so xcount=1. 72 1.1 mrg 73 1.1 mrg o One-dimensional argument B is implicitly treated as a column matrix 74 1.1 mrg dimensioned [count, 1], so ycount=1. 75 1.1 mrg */ 76 1.1 mrg 77 1.1 mrg if (retarray->base_addr == NULL) 78 1.1 mrg { 79 1.1 mrg if (GFC_DESCRIPTOR_RANK (a) == 1) 80 1.1 mrg { 81 1.1 mrg GFC_DIMENSION_SET(retarray->dim[0], 0, 82 1.1 mrg GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); 83 1.1 mrg } 84 1.1 mrg else if (GFC_DESCRIPTOR_RANK (b) == 1) 85 1.1 mrg { 86 1.1 mrg GFC_DIMENSION_SET(retarray->dim[0], 0, 87 1.1 mrg GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 88 1.1 mrg } 89 1.1 mrg else 90 1.1 mrg { 91 1.1 mrg GFC_DIMENSION_SET(retarray->dim[0], 0, 92 1.1 mrg GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 93 1.1 mrg 94 1.1 mrg GFC_DIMENSION_SET(retarray->dim[1], 0, 95 1.1 mrg GFC_DESCRIPTOR_EXTENT(b,1) - 1, 96 1.1 mrg GFC_DESCRIPTOR_EXTENT(retarray,0)); 97 1.1 mrg } 98 1.1 mrg 99 1.1 mrg retarray->base_addr 100 1.1 mrg = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_10)); 101 1.1 mrg retarray->offset = 0; 102 1.1 mrg } 103 1.1 mrg else if (unlikely (compile_options.bounds_check)) 104 1.1 mrg { 105 1.1 mrg index_type ret_extent, arg_extent; 106 1.1 mrg 107 1.1 mrg if (GFC_DESCRIPTOR_RANK (a) == 1) 108 1.1 mrg { 109 1.1 mrg arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 110 1.1 mrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 111 1.1 mrg if (arg_extent != ret_extent) 112 1.1 mrg runtime_error ("Array bound mismatch for dimension 1 of " 113 1.1 mrg "array (%ld/%ld) ", 114 1.1 mrg (long int) ret_extent, (long int) arg_extent); 115 1.1 mrg } 116 1.1 mrg else if (GFC_DESCRIPTOR_RANK (b) == 1) 117 1.1 mrg { 118 1.1 mrg arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 119 1.1 mrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 120 1.1 mrg if (arg_extent != ret_extent) 121 1.1 mrg runtime_error ("Array bound mismatch for dimension 1 of " 122 1.1 mrg "array (%ld/%ld) ", 123 1.1 mrg (long int) ret_extent, (long int) arg_extent); 124 1.1 mrg } 125 1.1 mrg else 126 1.1 mrg { 127 1.1 mrg arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 128 1.1 mrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 129 1.1 mrg if (arg_extent != ret_extent) 130 1.1 mrg runtime_error ("Array bound mismatch for dimension 1 of " 131 1.1 mrg "array (%ld/%ld) ", 132 1.1 mrg (long int) ret_extent, (long int) arg_extent); 133 1.1 mrg 134 1.1 mrg arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 135 1.1 mrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); 136 1.1 mrg if (arg_extent != ret_extent) 137 1.1 mrg runtime_error ("Array bound mismatch for dimension 2 of " 138 1.1 mrg "array (%ld/%ld) ", 139 1.1 mrg (long int) ret_extent, (long int) arg_extent); 140 1.1 mrg } 141 1.1 mrg } 142 1.1 mrg 143 1.1 mrg 144 1.1 mrg if (GFC_DESCRIPTOR_RANK (retarray) == 1) 145 1.1 mrg { 146 1.1 mrg /* One-dimensional result may be addressed in the code below 147 1.1 mrg either as a row or a column matrix. We want both cases to 148 1.1 mrg work. */ 149 1.1 mrg rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); 150 1.1 mrg } 151 1.1 mrg else 152 1.1 mrg { 153 1.1 mrg rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); 154 1.1 mrg rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); 155 1.1 mrg } 156 1.1 mrg 157 1.1 mrg 158 1.1 mrg if (GFC_DESCRIPTOR_RANK (a) == 1) 159 1.1 mrg { 160 1.1 mrg /* Treat it as a a row matrix A[1,count]. */ 161 1.1 mrg axstride = GFC_DESCRIPTOR_STRIDE(a,0); 162 1.1 mrg aystride = 1; 163 1.1 mrg 164 1.1 mrg xcount = 1; 165 1.1 mrg count = GFC_DESCRIPTOR_EXTENT(a,0); 166 1.1 mrg } 167 1.1 mrg else 168 1.1 mrg { 169 1.1 mrg axstride = GFC_DESCRIPTOR_STRIDE(a,0); 170 1.1 mrg aystride = GFC_DESCRIPTOR_STRIDE(a,1); 171 1.1 mrg 172 1.1 mrg count = GFC_DESCRIPTOR_EXTENT(a,1); 173 1.1 mrg xcount = GFC_DESCRIPTOR_EXTENT(a,0); 174 1.1 mrg } 175 1.1 mrg 176 1.1 mrg if (count != GFC_DESCRIPTOR_EXTENT(b,0)) 177 1.1 mrg { 178 1.1 mrg if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) 179 1.1 mrg runtime_error ("Incorrect extent in argument B in MATMUL intrinsic " 180 1.1 mrg "in dimension 1: is %ld, should be %ld", 181 1.1 mrg (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count); 182 1.1 mrg } 183 1.1 mrg 184 1.1 mrg if (GFC_DESCRIPTOR_RANK (b) == 1) 185 1.1 mrg { 186 1.1 mrg /* Treat it as a column matrix B[count,1] */ 187 1.1 mrg bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 188 1.1 mrg 189 1.1 mrg /* bystride should never be used for 1-dimensional b. 190 1.1 mrg The value is only used for calculation of the 191 1.1 mrg memory by the buffer. */ 192 1.1 mrg bystride = 256; 193 1.1 mrg ycount = 1; 194 1.1 mrg } 195 1.1 mrg else 196 1.1 mrg { 197 1.1 mrg bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 198 1.1 mrg bystride = GFC_DESCRIPTOR_STRIDE(b,1); 199 1.1 mrg ycount = GFC_DESCRIPTOR_EXTENT(b,1); 200 1.1 mrg } 201 1.1 mrg 202 1.1 mrg abase = a->base_addr; 203 1.1 mrg bbase = b->base_addr; 204 1.1 mrg dest = retarray->base_addr; 205 1.1 mrg 206 1.1 mrg /* Now that everything is set up, we perform the multiplication 207 1.1 mrg itself. */ 208 1.1 mrg 209 1.1 mrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) 210 1.1 mrg #define min(a,b) ((a) <= (b) ? (a) : (b)) 211 1.1 mrg #define max(a,b) ((a) >= (b) ? (a) : (b)) 212 1.1 mrg 213 1.1 mrg if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) 214 1.1 mrg && (bxstride == 1 || bystride == 1) 215 1.1 mrg && (((float) xcount) * ((float) ycount) * ((float) count) 216 1.1 mrg > POW3(blas_limit))) 217 1.1 mrg { 218 1.1 mrg const int m = xcount, n = ycount, k = count, ldc = rystride; 219 1.1 mrg const GFC_REAL_10 one = 1, zero = 0; 220 1.1 mrg const int lda = (axstride == 1) ? aystride : axstride, 221 1.1 mrg ldb = (bxstride == 1) ? bystride : bxstride; 222 1.1 mrg 223 1.1 mrg if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) 224 1.1 mrg { 225 1.1 mrg assert (gemm != NULL); 226 1.1 mrg const char *transa, *transb; 227 1.1 mrg if (try_blas & 2) 228 1.1 mrg transa = "C"; 229 1.1 mrg else 230 1.1 mrg transa = axstride == 1 ? "N" : "T"; 231 1.1 mrg 232 1.1 mrg if (try_blas & 4) 233 1.1 mrg transb = "C"; 234 1.1 mrg else 235 1.1 mrg transb = bxstride == 1 ? "N" : "T"; 236 1.1 mrg 237 1.1 mrg gemm (transa, transb , &m, 238 1.1 mrg &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, 239 1.1 mrg &ldc, 1, 1); 240 1.1 mrg return; 241 1.1 mrg } 242 1.1 mrg } 243 1.1 mrg 244 1.1.1.2 mrg if (rxstride == 1 && axstride == 1 && bxstride == 1 245 1.1.1.2 mrg && GFC_DESCRIPTOR_RANK (b) != 1) 246 1.1 mrg { 247 1.1 mrg /* This block of code implements a tuned matmul, derived from 248 1.1 mrg Superscalar GEMM-based level 3 BLAS, Beta version 0.1 249 1.1 mrg 250 1.1 mrg Bo Kagstrom and Per Ling 251 1.1 mrg Department of Computing Science 252 1.1 mrg Umea University 253 1.1 mrg S-901 87 Umea, Sweden 254 1.1 mrg 255 1.1 mrg from netlib.org, translated to C, and modified for matmul.m4. */ 256 1.1 mrg 257 1.1 mrg const GFC_REAL_10 *a, *b; 258 1.1 mrg GFC_REAL_10 *c; 259 1.1 mrg const index_type m = xcount, n = ycount, k = count; 260 1.1 mrg 261 1.1 mrg /* System generated locals */ 262 1.1 mrg index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, 263 1.1 mrg i1, i2, i3, i4, i5, i6; 264 1.1 mrg 265 1.1 mrg /* Local variables */ 266 1.1 mrg GFC_REAL_10 f11, f12, f21, f22, f31, f32, f41, f42, 267 1.1 mrg f13, f14, f23, f24, f33, f34, f43, f44; 268 1.1 mrg index_type i, j, l, ii, jj, ll; 269 1.1 mrg index_type isec, jsec, lsec, uisec, ujsec, ulsec; 270 1.1 mrg GFC_REAL_10 *t1; 271 1.1 mrg 272 1.1 mrg a = abase; 273 1.1 mrg b = bbase; 274 1.1 mrg c = retarray->base_addr; 275 1.1 mrg 276 1.1 mrg /* Parameter adjustments */ 277 1.1 mrg c_dim1 = rystride; 278 1.1 mrg c_offset = 1 + c_dim1; 279 1.1 mrg c -= c_offset; 280 1.1 mrg a_dim1 = aystride; 281 1.1 mrg a_offset = 1 + a_dim1; 282 1.1 mrg a -= a_offset; 283 1.1 mrg b_dim1 = bystride; 284 1.1 mrg b_offset = 1 + b_dim1; 285 1.1 mrg b -= b_offset; 286 1.1 mrg 287 1.1 mrg /* Empty c first. */ 288 1.1 mrg for (j=1; j<=n; j++) 289 1.1 mrg for (i=1; i<=m; i++) 290 1.1 mrg c[i + j * c_dim1] = (GFC_REAL_10)0; 291 1.1 mrg 292 1.1 mrg /* Early exit if possible */ 293 1.1 mrg if (m == 0 || n == 0 || k == 0) 294 1.1 mrg return; 295 1.1 mrg 296 1.1 mrg /* Adjust size of t1 to what is needed. */ 297 1.1 mrg index_type t1_dim, a_sz; 298 1.1 mrg if (aystride == 1) 299 1.1 mrg a_sz = rystride; 300 1.1 mrg else 301 1.1 mrg a_sz = a_dim1; 302 1.1 mrg 303 1.1 mrg t1_dim = a_sz * 256 + b_dim1; 304 1.1 mrg if (t1_dim > 65536) 305 1.1 mrg t1_dim = 65536; 306 1.1 mrg 307 1.1 mrg t1 = malloc (t1_dim * sizeof(GFC_REAL_10)); 308 1.1 mrg 309 1.1 mrg /* Start turning the crank. */ 310 1.1 mrg i1 = n; 311 1.1 mrg for (jj = 1; jj <= i1; jj += 512) 312 1.1 mrg { 313 1.1 mrg /* Computing MIN */ 314 1.1 mrg i2 = 512; 315 1.1 mrg i3 = n - jj + 1; 316 1.1 mrg jsec = min(i2,i3); 317 1.1 mrg ujsec = jsec - jsec % 4; 318 1.1 mrg i2 = k; 319 1.1 mrg for (ll = 1; ll <= i2; ll += 256) 320 1.1 mrg { 321 1.1 mrg /* Computing MIN */ 322 1.1 mrg i3 = 256; 323 1.1 mrg i4 = k - ll + 1; 324 1.1 mrg lsec = min(i3,i4); 325 1.1 mrg ulsec = lsec - lsec % 2; 326 1.1 mrg 327 1.1 mrg i3 = m; 328 1.1 mrg for (ii = 1; ii <= i3; ii += 256) 329 1.1 mrg { 330 1.1 mrg /* Computing MIN */ 331 1.1 mrg i4 = 256; 332 1.1 mrg i5 = m - ii + 1; 333 1.1 mrg isec = min(i4,i5); 334 1.1 mrg uisec = isec - isec % 2; 335 1.1 mrg i4 = ll + ulsec - 1; 336 1.1 mrg for (l = ll; l <= i4; l += 2) 337 1.1 mrg { 338 1.1 mrg i5 = ii + uisec - 1; 339 1.1 mrg for (i = ii; i <= i5; i += 2) 340 1.1 mrg { 341 1.1 mrg t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = 342 1.1 mrg a[i + l * a_dim1]; 343 1.1 mrg t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = 344 1.1 mrg a[i + (l + 1) * a_dim1]; 345 1.1 mrg t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = 346 1.1 mrg a[i + 1 + l * a_dim1]; 347 1.1 mrg t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = 348 1.1 mrg a[i + 1 + (l + 1) * a_dim1]; 349 1.1 mrg } 350 1.1 mrg if (uisec < isec) 351 1.1 mrg { 352 1.1 mrg t1[l - ll + 1 + (isec << 8) - 257] = 353 1.1 mrg a[ii + isec - 1 + l * a_dim1]; 354 1.1 mrg t1[l - ll + 2 + (isec << 8) - 257] = 355 1.1 mrg a[ii + isec - 1 + (l + 1) * a_dim1]; 356 1.1 mrg } 357 1.1 mrg } 358 1.1 mrg if (ulsec < lsec) 359 1.1 mrg { 360 1.1 mrg i4 = ii + isec - 1; 361 1.1 mrg for (i = ii; i<= i4; ++i) 362 1.1 mrg { 363 1.1 mrg t1[lsec + ((i - ii + 1) << 8) - 257] = 364 1.1 mrg a[i + (ll + lsec - 1) * a_dim1]; 365 1.1 mrg } 366 1.1 mrg } 367 1.1 mrg 368 1.1 mrg uisec = isec - isec % 4; 369 1.1 mrg i4 = jj + ujsec - 1; 370 1.1 mrg for (j = jj; j <= i4; j += 4) 371 1.1 mrg { 372 1.1 mrg i5 = ii + uisec - 1; 373 1.1 mrg for (i = ii; i <= i5; i += 4) 374 1.1 mrg { 375 1.1 mrg f11 = c[i + j * c_dim1]; 376 1.1 mrg f21 = c[i + 1 + j * c_dim1]; 377 1.1 mrg f12 = c[i + (j + 1) * c_dim1]; 378 1.1 mrg f22 = c[i + 1 + (j + 1) * c_dim1]; 379 1.1 mrg f13 = c[i + (j + 2) * c_dim1]; 380 1.1 mrg f23 = c[i + 1 + (j + 2) * c_dim1]; 381 1.1 mrg f14 = c[i + (j + 3) * c_dim1]; 382 1.1 mrg f24 = c[i + 1 + (j + 3) * c_dim1]; 383 1.1 mrg f31 = c[i + 2 + j * c_dim1]; 384 1.1 mrg f41 = c[i + 3 + j * c_dim1]; 385 1.1 mrg f32 = c[i + 2 + (j + 1) * c_dim1]; 386 1.1 mrg f42 = c[i + 3 + (j + 1) * c_dim1]; 387 1.1 mrg f33 = c[i + 2 + (j + 2) * c_dim1]; 388 1.1 mrg f43 = c[i + 3 + (j + 2) * c_dim1]; 389 1.1 mrg f34 = c[i + 2 + (j + 3) * c_dim1]; 390 1.1 mrg f44 = c[i + 3 + (j + 3) * c_dim1]; 391 1.1 mrg i6 = ll + lsec - 1; 392 1.1 mrg for (l = ll; l <= i6; ++l) 393 1.1 mrg { 394 1.1 mrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 395 1.1 mrg * b[l + j * b_dim1]; 396 1.1 mrg f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 397 1.1 mrg * b[l + j * b_dim1]; 398 1.1 mrg f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 399 1.1 mrg * b[l + (j + 1) * b_dim1]; 400 1.1 mrg f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 401 1.1 mrg * b[l + (j + 1) * b_dim1]; 402 1.1 mrg f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 403 1.1 mrg * b[l + (j + 2) * b_dim1]; 404 1.1 mrg f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 405 1.1 mrg * b[l + (j + 2) * b_dim1]; 406 1.1 mrg f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 407 1.1 mrg * b[l + (j + 3) * b_dim1]; 408 1.1 mrg f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 409 1.1 mrg * b[l + (j + 3) * b_dim1]; 410 1.1 mrg f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 411 1.1 mrg * b[l + j * b_dim1]; 412 1.1 mrg f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 413 1.1 mrg * b[l + j * b_dim1]; 414 1.1 mrg f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 415 1.1 mrg * b[l + (j + 1) * b_dim1]; 416 1.1 mrg f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 417 1.1 mrg * b[l + (j + 1) * b_dim1]; 418 1.1 mrg f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 419 1.1 mrg * b[l + (j + 2) * b_dim1]; 420 1.1 mrg f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 421 1.1 mrg * b[l + (j + 2) * b_dim1]; 422 1.1 mrg f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 423 1.1 mrg * b[l + (j + 3) * b_dim1]; 424 1.1 mrg f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 425 1.1 mrg * b[l + (j + 3) * b_dim1]; 426 1.1 mrg } 427 1.1 mrg c[i + j * c_dim1] = f11; 428 1.1 mrg c[i + 1 + j * c_dim1] = f21; 429 1.1 mrg c[i + (j + 1) * c_dim1] = f12; 430 1.1 mrg c[i + 1 + (j + 1) * c_dim1] = f22; 431 1.1 mrg c[i + (j + 2) * c_dim1] = f13; 432 1.1 mrg c[i + 1 + (j + 2) * c_dim1] = f23; 433 1.1 mrg c[i + (j + 3) * c_dim1] = f14; 434 1.1 mrg c[i + 1 + (j + 3) * c_dim1] = f24; 435 1.1 mrg c[i + 2 + j * c_dim1] = f31; 436 1.1 mrg c[i + 3 + j * c_dim1] = f41; 437 1.1 mrg c[i + 2 + (j + 1) * c_dim1] = f32; 438 1.1 mrg c[i + 3 + (j + 1) * c_dim1] = f42; 439 1.1 mrg c[i + 2 + (j + 2) * c_dim1] = f33; 440 1.1 mrg c[i + 3 + (j + 2) * c_dim1] = f43; 441 1.1 mrg c[i + 2 + (j + 3) * c_dim1] = f34; 442 1.1 mrg c[i + 3 + (j + 3) * c_dim1] = f44; 443 1.1 mrg } 444 1.1 mrg if (uisec < isec) 445 1.1 mrg { 446 1.1 mrg i5 = ii + isec - 1; 447 1.1 mrg for (i = ii + uisec; i <= i5; ++i) 448 1.1 mrg { 449 1.1 mrg f11 = c[i + j * c_dim1]; 450 1.1 mrg f12 = c[i + (j + 1) * c_dim1]; 451 1.1 mrg f13 = c[i + (j + 2) * c_dim1]; 452 1.1 mrg f14 = c[i + (j + 3) * c_dim1]; 453 1.1 mrg i6 = ll + lsec - 1; 454 1.1 mrg for (l = ll; l <= i6; ++l) 455 1.1 mrg { 456 1.1 mrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 457 1.1 mrg 257] * b[l + j * b_dim1]; 458 1.1 mrg f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 459 1.1 mrg 257] * b[l + (j + 1) * b_dim1]; 460 1.1 mrg f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 461 1.1 mrg 257] * b[l + (j + 2) * b_dim1]; 462 1.1 mrg f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 463 1.1 mrg 257] * b[l + (j + 3) * b_dim1]; 464 1.1 mrg } 465 1.1 mrg c[i + j * c_dim1] = f11; 466 1.1 mrg c[i + (j + 1) * c_dim1] = f12; 467 1.1 mrg c[i + (j + 2) * c_dim1] = f13; 468 1.1 mrg c[i + (j + 3) * c_dim1] = f14; 469 1.1 mrg } 470 1.1 mrg } 471 1.1 mrg } 472 1.1 mrg if (ujsec < jsec) 473 1.1 mrg { 474 1.1 mrg i4 = jj + jsec - 1; 475 1.1 mrg for (j = jj + ujsec; j <= i4; ++j) 476 1.1 mrg { 477 1.1 mrg i5 = ii + uisec - 1; 478 1.1 mrg for (i = ii; i <= i5; i += 4) 479 1.1 mrg { 480 1.1 mrg f11 = c[i + j * c_dim1]; 481 1.1 mrg f21 = c[i + 1 + j * c_dim1]; 482 1.1 mrg f31 = c[i + 2 + j * c_dim1]; 483 1.1 mrg f41 = c[i + 3 + j * c_dim1]; 484 1.1 mrg i6 = ll + lsec - 1; 485 1.1 mrg for (l = ll; l <= i6; ++l) 486 1.1 mrg { 487 1.1 mrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 488 1.1 mrg 257] * b[l + j * b_dim1]; 489 1.1 mrg f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 490 1.1 mrg 257] * b[l + j * b_dim1]; 491 1.1 mrg f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 492 1.1 mrg 257] * b[l + j * b_dim1]; 493 1.1 mrg f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 494 1.1 mrg 257] * b[l + j * b_dim1]; 495 1.1 mrg } 496 1.1 mrg c[i + j * c_dim1] = f11; 497 1.1 mrg c[i + 1 + j * c_dim1] = f21; 498 1.1 mrg c[i + 2 + j * c_dim1] = f31; 499 1.1 mrg c[i + 3 + j * c_dim1] = f41; 500 1.1 mrg } 501 1.1 mrg i5 = ii + isec - 1; 502 1.1 mrg for (i = ii + uisec; i <= i5; ++i) 503 1.1 mrg { 504 1.1 mrg f11 = c[i + j * c_dim1]; 505 1.1 mrg i6 = ll + lsec - 1; 506 1.1 mrg for (l = ll; l <= i6; ++l) 507 1.1 mrg { 508 1.1 mrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 509 1.1 mrg 257] * b[l + j * b_dim1]; 510 1.1 mrg } 511 1.1 mrg c[i + j * c_dim1] = f11; 512 1.1 mrg } 513 1.1 mrg } 514 1.1 mrg } 515 1.1 mrg } 516 1.1 mrg } 517 1.1 mrg } 518 1.1 mrg free(t1); 519 1.1 mrg return; 520 1.1 mrg } 521 1.1 mrg else if (rxstride == 1 && aystride == 1 && bxstride == 1) 522 1.1 mrg { 523 1.1 mrg if (GFC_DESCRIPTOR_RANK (a) != 1) 524 1.1 mrg { 525 1.1 mrg const GFC_REAL_10 *restrict abase_x; 526 1.1 mrg const GFC_REAL_10 *restrict bbase_y; 527 1.1 mrg GFC_REAL_10 *restrict dest_y; 528 1.1 mrg GFC_REAL_10 s; 529 1.1 mrg 530 1.1 mrg for (y = 0; y < ycount; y++) 531 1.1 mrg { 532 1.1 mrg bbase_y = &bbase[y*bystride]; 533 1.1 mrg dest_y = &dest[y*rystride]; 534 1.1 mrg for (x = 0; x < xcount; x++) 535 1.1 mrg { 536 1.1 mrg abase_x = &abase[x*axstride]; 537 1.1 mrg s = (GFC_REAL_10) 0; 538 1.1 mrg for (n = 0; n < count; n++) 539 1.1 mrg s += abase_x[n] * bbase_y[n]; 540 1.1 mrg dest_y[x] = s; 541 1.1 mrg } 542 1.1 mrg } 543 1.1 mrg } 544 1.1 mrg else 545 1.1 mrg { 546 1.1 mrg const GFC_REAL_10 *restrict bbase_y; 547 1.1 mrg GFC_REAL_10 s; 548 1.1 mrg 549 1.1 mrg for (y = 0; y < ycount; y++) 550 1.1 mrg { 551 1.1 mrg bbase_y = &bbase[y*bystride]; 552 1.1 mrg s = (GFC_REAL_10) 0; 553 1.1 mrg for (n = 0; n < count; n++) 554 1.1 mrg s += abase[n*axstride] * bbase_y[n]; 555 1.1 mrg dest[y*rystride] = s; 556 1.1 mrg } 557 1.1 mrg } 558 1.1 mrg } 559 1.1 mrg else if (GFC_DESCRIPTOR_RANK (a) == 1) 560 1.1 mrg { 561 1.1 mrg const GFC_REAL_10 *restrict bbase_y; 562 1.1 mrg GFC_REAL_10 s; 563 1.1 mrg 564 1.1 mrg for (y = 0; y < ycount; y++) 565 1.1 mrg { 566 1.1 mrg bbase_y = &bbase[y*bystride]; 567 1.1 mrg s = (GFC_REAL_10) 0; 568 1.1 mrg for (n = 0; n < count; n++) 569 1.1 mrg s += abase[n*axstride] * bbase_y[n*bxstride]; 570 1.1 mrg dest[y*rxstride] = s; 571 1.1 mrg } 572 1.1 mrg } 573 1.1.1.2 mrg else if (axstride < aystride) 574 1.1.1.2 mrg { 575 1.1.1.2 mrg for (y = 0; y < ycount; y++) 576 1.1.1.2 mrg for (x = 0; x < xcount; x++) 577 1.1.1.2 mrg dest[x*rxstride + y*rystride] = (GFC_REAL_10)0; 578 1.1.1.2 mrg 579 1.1.1.2 mrg for (y = 0; y < ycount; y++) 580 1.1.1.2 mrg for (n = 0; n < count; n++) 581 1.1.1.2 mrg for (x = 0; x < xcount; x++) 582 1.1.1.2 mrg /* dest[x,y] += a[x,n] * b[n,y] */ 583 1.1.1.2 mrg dest[x*rxstride + y*rystride] += 584 1.1.1.2 mrg abase[x*axstride + n*aystride] * 585 1.1.1.2 mrg bbase[n*bxstride + y*bystride]; 586 1.1.1.2 mrg } 587 1.1 mrg else 588 1.1 mrg { 589 1.1 mrg const GFC_REAL_10 *restrict abase_x; 590 1.1 mrg const GFC_REAL_10 *restrict bbase_y; 591 1.1 mrg GFC_REAL_10 *restrict dest_y; 592 1.1 mrg GFC_REAL_10 s; 593 1.1 mrg 594 1.1 mrg for (y = 0; y < ycount; y++) 595 1.1 mrg { 596 1.1 mrg bbase_y = &bbase[y*bystride]; 597 1.1 mrg dest_y = &dest[y*rystride]; 598 1.1 mrg for (x = 0; x < xcount; x++) 599 1.1 mrg { 600 1.1 mrg abase_x = &abase[x*axstride]; 601 1.1 mrg s = (GFC_REAL_10) 0; 602 1.1 mrg for (n = 0; n < count; n++) 603 1.1 mrg s += abase_x[n*aystride] * bbase_y[n*bxstride]; 604 1.1 mrg dest_y[x*rxstride] = s; 605 1.1 mrg } 606 1.1 mrg } 607 1.1 mrg } 608 1.1 mrg } 609 1.1 mrg #undef POW3 610 1.1 mrg #undef min 611 1.1 mrg #undef max 612 1.1 mrg 613 1.1 mrg #endif 614 1.1 mrg 615 1.1 mrg #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128) 616 1.1 mrg void 617 1.1 mrg matmul_r10_avx128_fma4 (gfc_array_r10 * const restrict retarray, 618 1.1 mrg gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas, 619 1.1 mrg int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4"))); 620 1.1 mrg internal_proto(matmul_r10_avx128_fma4); 621 1.1 mrg void 622 1.1 mrg matmul_r10_avx128_fma4 (gfc_array_r10 * const restrict retarray, 623 1.1 mrg gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas, 624 1.1 mrg int blas_limit, blas_call gemm) 625 1.1 mrg { 626 1.1 mrg const GFC_REAL_10 * restrict abase; 627 1.1 mrg const GFC_REAL_10 * restrict bbase; 628 1.1 mrg GFC_REAL_10 * restrict dest; 629 1.1 mrg 630 1.1 mrg index_type rxstride, rystride, axstride, aystride, bxstride, bystride; 631 1.1 mrg index_type x, y, n, count, xcount, ycount; 632 1.1 mrg 633 1.1 mrg assert (GFC_DESCRIPTOR_RANK (a) == 2 634 1.1 mrg || GFC_DESCRIPTOR_RANK (b) == 2); 635 1.1 mrg 636 1.1 mrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount] 637 1.1 mrg 638 1.1 mrg Either A or B (but not both) can be rank 1: 639 1.1 mrg 640 1.1 mrg o One-dimensional argument A is implicitly treated as a row matrix 641 1.1 mrg dimensioned [1,count], so xcount=1. 642 1.1 mrg 643 1.1 mrg o One-dimensional argument B is implicitly treated as a column matrix 644 1.1 mrg dimensioned [count, 1], so ycount=1. 645 1.1 mrg */ 646 1.1 mrg 647 1.1 mrg if (retarray->base_addr == NULL) 648 1.1 mrg { 649 1.1 mrg if (GFC_DESCRIPTOR_RANK (a) == 1) 650 1.1 mrg { 651 1.1 mrg GFC_DIMENSION_SET(retarray->dim[0], 0, 652 1.1 mrg GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); 653 1.1 mrg } 654 1.1 mrg else if (GFC_DESCRIPTOR_RANK (b) == 1) 655 1.1 mrg { 656 1.1 mrg GFC_DIMENSION_SET(retarray->dim[0], 0, 657 1.1 mrg GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 658 1.1 mrg } 659 1.1 mrg else 660 1.1 mrg { 661 1.1 mrg GFC_DIMENSION_SET(retarray->dim[0], 0, 662 1.1 mrg GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 663 1.1 mrg 664 1.1 mrg GFC_DIMENSION_SET(retarray->dim[1], 0, 665 1.1 mrg GFC_DESCRIPTOR_EXTENT(b,1) - 1, 666 1.1 mrg GFC_DESCRIPTOR_EXTENT(retarray,0)); 667 1.1 mrg } 668 1.1 mrg 669 1.1 mrg retarray->base_addr 670 1.1 mrg = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_10)); 671 1.1 mrg retarray->offset = 0; 672 1.1 mrg } 673 1.1 mrg else if (unlikely (compile_options.bounds_check)) 674 1.1 mrg { 675 1.1 mrg index_type ret_extent, arg_extent; 676 1.1 mrg 677 1.1 mrg if (GFC_DESCRIPTOR_RANK (a) == 1) 678 1.1 mrg { 679 1.1 mrg arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 680 1.1 mrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 681 1.1 mrg if (arg_extent != ret_extent) 682 1.1 mrg runtime_error ("Array bound mismatch for dimension 1 of " 683 1.1 mrg "array (%ld/%ld) ", 684 1.1 mrg (long int) ret_extent, (long int) arg_extent); 685 1.1 mrg } 686 1.1 mrg else if (GFC_DESCRIPTOR_RANK (b) == 1) 687 1.1 mrg { 688 1.1 mrg arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 689 1.1 mrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 690 1.1 mrg if (arg_extent != ret_extent) 691 1.1 mrg runtime_error ("Array bound mismatch for dimension 1 of " 692 1.1 mrg "array (%ld/%ld) ", 693 1.1 mrg (long int) ret_extent, (long int) arg_extent); 694 1.1 mrg } 695 1.1 mrg else 696 1.1 mrg { 697 1.1 mrg arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 698 1.1 mrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 699 1.1 mrg if (arg_extent != ret_extent) 700 1.1 mrg runtime_error ("Array bound mismatch for dimension 1 of " 701 1.1 mrg "array (%ld/%ld) ", 702 1.1 mrg (long int) ret_extent, (long int) arg_extent); 703 1.1 mrg 704 1.1 mrg arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 705 1.1 mrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); 706 1.1 mrg if (arg_extent != ret_extent) 707 1.1 mrg runtime_error ("Array bound mismatch for dimension 2 of " 708 1.1 mrg "array (%ld/%ld) ", 709 1.1 mrg (long int) ret_extent, (long int) arg_extent); 710 1.1 mrg } 711 1.1 mrg } 712 1.1 mrg 713 1.1 mrg 714 1.1 mrg if (GFC_DESCRIPTOR_RANK (retarray) == 1) 715 1.1 mrg { 716 1.1 mrg /* One-dimensional result may be addressed in the code below 717 1.1 mrg either as a row or a column matrix. We want both cases to 718 1.1 mrg work. */ 719 1.1 mrg rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); 720 1.1 mrg } 721 1.1 mrg else 722 1.1 mrg { 723 1.1 mrg rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); 724 1.1 mrg rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); 725 1.1 mrg } 726 1.1 mrg 727 1.1 mrg 728 1.1 mrg if (GFC_DESCRIPTOR_RANK (a) == 1) 729 1.1 mrg { 730 1.1 mrg /* Treat it as a a row matrix A[1,count]. */ 731 1.1 mrg axstride = GFC_DESCRIPTOR_STRIDE(a,0); 732 1.1 mrg aystride = 1; 733 1.1 mrg 734 1.1 mrg xcount = 1; 735 1.1 mrg count = GFC_DESCRIPTOR_EXTENT(a,0); 736 1.1 mrg } 737 1.1 mrg else 738 1.1 mrg { 739 1.1 mrg axstride = GFC_DESCRIPTOR_STRIDE(a,0); 740 1.1 mrg aystride = GFC_DESCRIPTOR_STRIDE(a,1); 741 1.1 mrg 742 1.1 mrg count = GFC_DESCRIPTOR_EXTENT(a,1); 743 1.1 mrg xcount = GFC_DESCRIPTOR_EXTENT(a,0); 744 1.1 mrg } 745 1.1 mrg 746 1.1 mrg if (count != GFC_DESCRIPTOR_EXTENT(b,0)) 747 1.1 mrg { 748 1.1 mrg if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) 749 1.1 mrg runtime_error ("Incorrect extent in argument B in MATMUL intrinsic " 750 1.1 mrg "in dimension 1: is %ld, should be %ld", 751 1.1 mrg (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count); 752 1.1 mrg } 753 1.1 mrg 754 1.1 mrg if (GFC_DESCRIPTOR_RANK (b) == 1) 755 1.1 mrg { 756 1.1 mrg /* Treat it as a column matrix B[count,1] */ 757 1.1 mrg bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 758 1.1 mrg 759 1.1 mrg /* bystride should never be used for 1-dimensional b. 760 1.1 mrg The value is only used for calculation of the 761 1.1 mrg memory by the buffer. */ 762 1.1 mrg bystride = 256; 763 1.1 mrg ycount = 1; 764 1.1 mrg } 765 1.1 mrg else 766 1.1 mrg { 767 1.1 mrg bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 768 1.1 mrg bystride = GFC_DESCRIPTOR_STRIDE(b,1); 769 1.1 mrg ycount = GFC_DESCRIPTOR_EXTENT(b,1); 770 1.1 mrg } 771 1.1 mrg 772 1.1 mrg abase = a->base_addr; 773 1.1 mrg bbase = b->base_addr; 774 1.1 mrg dest = retarray->base_addr; 775 1.1 mrg 776 1.1 mrg /* Now that everything is set up, we perform the multiplication 777 1.1 mrg itself. */ 778 1.1 mrg 779 1.1 mrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) 780 1.1 mrg #define min(a,b) ((a) <= (b) ? (a) : (b)) 781 1.1 mrg #define max(a,b) ((a) >= (b) ? (a) : (b)) 782 1.1 mrg 783 1.1 mrg if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) 784 1.1 mrg && (bxstride == 1 || bystride == 1) 785 1.1 mrg && (((float) xcount) * ((float) ycount) * ((float) count) 786 1.1 mrg > POW3(blas_limit))) 787 1.1 mrg { 788 1.1 mrg const int m = xcount, n = ycount, k = count, ldc = rystride; 789 1.1 mrg const GFC_REAL_10 one = 1, zero = 0; 790 1.1 mrg const int lda = (axstride == 1) ? aystride : axstride, 791 1.1 mrg ldb = (bxstride == 1) ? bystride : bxstride; 792 1.1 mrg 793 1.1 mrg if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) 794 1.1 mrg { 795 1.1 mrg assert (gemm != NULL); 796 1.1 mrg const char *transa, *transb; 797 1.1 mrg if (try_blas & 2) 798 1.1 mrg transa = "C"; 799 1.1 mrg else 800 1.1 mrg transa = axstride == 1 ? "N" : "T"; 801 1.1 mrg 802 1.1 mrg if (try_blas & 4) 803 1.1 mrg transb = "C"; 804 1.1 mrg else 805 1.1 mrg transb = bxstride == 1 ? "N" : "T"; 806 1.1 mrg 807 1.1 mrg gemm (transa, transb , &m, 808 1.1 mrg &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, 809 1.1 mrg &ldc, 1, 1); 810 1.1 mrg return; 811 1.1 mrg } 812 1.1 mrg } 813 1.1 mrg 814 1.1.1.2 mrg if (rxstride == 1 && axstride == 1 && bxstride == 1 815 1.1.1.2 mrg && GFC_DESCRIPTOR_RANK (b) != 1) 816 1.1 mrg { 817 1.1 mrg /* This block of code implements a tuned matmul, derived from 818 1.1 mrg Superscalar GEMM-based level 3 BLAS, Beta version 0.1 819 1.1 mrg 820 1.1 mrg Bo Kagstrom and Per Ling 821 1.1 mrg Department of Computing Science 822 1.1 mrg Umea University 823 1.1 mrg S-901 87 Umea, Sweden 824 1.1 mrg 825 1.1 mrg from netlib.org, translated to C, and modified for matmul.m4. */ 826 1.1 mrg 827 1.1 mrg const GFC_REAL_10 *a, *b; 828 1.1 mrg GFC_REAL_10 *c; 829 1.1 mrg const index_type m = xcount, n = ycount, k = count; 830 1.1 mrg 831 1.1 mrg /* System generated locals */ 832 1.1 mrg index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, 833 1.1 mrg i1, i2, i3, i4, i5, i6; 834 1.1 mrg 835 1.1 mrg /* Local variables */ 836 1.1 mrg GFC_REAL_10 f11, f12, f21, f22, f31, f32, f41, f42, 837 1.1 mrg f13, f14, f23, f24, f33, f34, f43, f44; 838 1.1 mrg index_type i, j, l, ii, jj, ll; 839 1.1 mrg index_type isec, jsec, lsec, uisec, ujsec, ulsec; 840 1.1 mrg GFC_REAL_10 *t1; 841 1.1 mrg 842 1.1 mrg a = abase; 843 1.1 mrg b = bbase; 844 1.1 mrg c = retarray->base_addr; 845 1.1 mrg 846 1.1 mrg /* Parameter adjustments */ 847 1.1 mrg c_dim1 = rystride; 848 1.1 mrg c_offset = 1 + c_dim1; 849 1.1 mrg c -= c_offset; 850 1.1 mrg a_dim1 = aystride; 851 1.1 mrg a_offset = 1 + a_dim1; 852 1.1 mrg a -= a_offset; 853 1.1 mrg b_dim1 = bystride; 854 1.1 mrg b_offset = 1 + b_dim1; 855 1.1 mrg b -= b_offset; 856 1.1 mrg 857 1.1 mrg /* Empty c first. */ 858 1.1 mrg for (j=1; j<=n; j++) 859 1.1 mrg for (i=1; i<=m; i++) 860 1.1 mrg c[i + j * c_dim1] = (GFC_REAL_10)0; 861 1.1 mrg 862 1.1 mrg /* Early exit if possible */ 863 1.1 mrg if (m == 0 || n == 0 || k == 0) 864 1.1 mrg return; 865 1.1 mrg 866 1.1 mrg /* Adjust size of t1 to what is needed. */ 867 1.1 mrg index_type t1_dim, a_sz; 868 1.1 mrg if (aystride == 1) 869 1.1 mrg a_sz = rystride; 870 1.1 mrg else 871 1.1 mrg a_sz = a_dim1; 872 1.1 mrg 873 1.1 mrg t1_dim = a_sz * 256 + b_dim1; 874 1.1 mrg if (t1_dim > 65536) 875 1.1 mrg t1_dim = 65536; 876 1.1 mrg 877 1.1 mrg t1 = malloc (t1_dim * sizeof(GFC_REAL_10)); 878 1.1 mrg 879 1.1 mrg /* Start turning the crank. */ 880 1.1 mrg i1 = n; 881 1.1 mrg for (jj = 1; jj <= i1; jj += 512) 882 1.1 mrg { 883 1.1 mrg /* Computing MIN */ 884 1.1 mrg i2 = 512; 885 1.1 mrg i3 = n - jj + 1; 886 1.1 mrg jsec = min(i2,i3); 887 1.1 mrg ujsec = jsec - jsec % 4; 888 1.1 mrg i2 = k; 889 1.1 mrg for (ll = 1; ll <= i2; ll += 256) 890 1.1 mrg { 891 1.1 mrg /* Computing MIN */ 892 1.1 mrg i3 = 256; 893 1.1 mrg i4 = k - ll + 1; 894 1.1 mrg lsec = min(i3,i4); 895 1.1 mrg ulsec = lsec - lsec % 2; 896 1.1 mrg 897 1.1 mrg i3 = m; 898 1.1 mrg for (ii = 1; ii <= i3; ii += 256) 899 1.1 mrg { 900 1.1 mrg /* Computing MIN */ 901 1.1 mrg i4 = 256; 902 1.1 mrg i5 = m - ii + 1; 903 1.1 mrg isec = min(i4,i5); 904 1.1 mrg uisec = isec - isec % 2; 905 1.1 mrg i4 = ll + ulsec - 1; 906 1.1 mrg for (l = ll; l <= i4; l += 2) 907 1.1 mrg { 908 1.1 mrg i5 = ii + uisec - 1; 909 1.1 mrg for (i = ii; i <= i5; i += 2) 910 1.1 mrg { 911 1.1 mrg t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = 912 1.1 mrg a[i + l * a_dim1]; 913 1.1 mrg t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = 914 1.1 mrg a[i + (l + 1) * a_dim1]; 915 1.1 mrg t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = 916 1.1 mrg a[i + 1 + l * a_dim1]; 917 1.1 mrg t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = 918 1.1 mrg a[i + 1 + (l + 1) * a_dim1]; 919 1.1 mrg } 920 1.1 mrg if (uisec < isec) 921 1.1 mrg { 922 1.1 mrg t1[l - ll + 1 + (isec << 8) - 257] = 923 1.1 mrg a[ii + isec - 1 + l * a_dim1]; 924 1.1 mrg t1[l - ll + 2 + (isec << 8) - 257] = 925 1.1 mrg a[ii + isec - 1 + (l + 1) * a_dim1]; 926 1.1 mrg } 927 1.1 mrg } 928 1.1 mrg if (ulsec < lsec) 929 1.1 mrg { 930 1.1 mrg i4 = ii + isec - 1; 931 1.1 mrg for (i = ii; i<= i4; ++i) 932 1.1 mrg { 933 1.1 mrg t1[lsec + ((i - ii + 1) << 8) - 257] = 934 1.1 mrg a[i + (ll + lsec - 1) * a_dim1]; 935 1.1 mrg } 936 1.1 mrg } 937 1.1 mrg 938 1.1 mrg uisec = isec - isec % 4; 939 1.1 mrg i4 = jj + ujsec - 1; 940 1.1 mrg for (j = jj; j <= i4; j += 4) 941 1.1 mrg { 942 1.1 mrg i5 = ii + uisec - 1; 943 1.1 mrg for (i = ii; i <= i5; i += 4) 944 1.1 mrg { 945 1.1 mrg f11 = c[i + j * c_dim1]; 946 1.1 mrg f21 = c[i + 1 + j * c_dim1]; 947 1.1 mrg f12 = c[i + (j + 1) * c_dim1]; 948 1.1 mrg f22 = c[i + 1 + (j + 1) * c_dim1]; 949 1.1 mrg f13 = c[i + (j + 2) * c_dim1]; 950 1.1 mrg f23 = c[i + 1 + (j + 2) * c_dim1]; 951 1.1 mrg f14 = c[i + (j + 3) * c_dim1]; 952 1.1 mrg f24 = c[i + 1 + (j + 3) * c_dim1]; 953 1.1 mrg f31 = c[i + 2 + j * c_dim1]; 954 1.1 mrg f41 = c[i + 3 + j * c_dim1]; 955 1.1 mrg f32 = c[i + 2 + (j + 1) * c_dim1]; 956 1.1 mrg f42 = c[i + 3 + (j + 1) * c_dim1]; 957 1.1 mrg f33 = c[i + 2 + (j + 2) * c_dim1]; 958 1.1 mrg f43 = c[i + 3 + (j + 2) * c_dim1]; 959 1.1 mrg f34 = c[i + 2 + (j + 3) * c_dim1]; 960 1.1 mrg f44 = c[i + 3 + (j + 3) * c_dim1]; 961 1.1 mrg i6 = ll + lsec - 1; 962 1.1 mrg for (l = ll; l <= i6; ++l) 963 1.1 mrg { 964 1.1 mrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 965 1.1 mrg * b[l + j * b_dim1]; 966 1.1 mrg f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 967 1.1 mrg * b[l + j * b_dim1]; 968 1.1 mrg f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 969 1.1 mrg * b[l + (j + 1) * b_dim1]; 970 1.1 mrg f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 971 1.1 mrg * b[l + (j + 1) * b_dim1]; 972 1.1 mrg f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 973 1.1 mrg * b[l + (j + 2) * b_dim1]; 974 1.1 mrg f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 975 1.1 mrg * b[l + (j + 2) * b_dim1]; 976 1.1 mrg f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 977 1.1 mrg * b[l + (j + 3) * b_dim1]; 978 1.1 mrg f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 979 1.1 mrg * b[l + (j + 3) * b_dim1]; 980 1.1 mrg f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 981 1.1 mrg * b[l + j * b_dim1]; 982 1.1 mrg f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 983 1.1 mrg * b[l + j * b_dim1]; 984 1.1 mrg f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 985 1.1 mrg * b[l + (j + 1) * b_dim1]; 986 1.1 mrg f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 987 1.1 mrg * b[l + (j + 1) * b_dim1]; 988 1.1 mrg f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 989 1.1 mrg * b[l + (j + 2) * b_dim1]; 990 1.1 mrg f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 991 1.1 mrg * b[l + (j + 2) * b_dim1]; 992 1.1 mrg f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 993 1.1 mrg * b[l + (j + 3) * b_dim1]; 994 1.1 mrg f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 995 1.1 mrg * b[l + (j + 3) * b_dim1]; 996 1.1 mrg } 997 1.1 mrg c[i + j * c_dim1] = f11; 998 1.1 mrg c[i + 1 + j * c_dim1] = f21; 999 1.1 mrg c[i + (j + 1) * c_dim1] = f12; 1000 1.1 mrg c[i + 1 + (j + 1) * c_dim1] = f22; 1001 1.1 mrg c[i + (j + 2) * c_dim1] = f13; 1002 1.1 mrg c[i + 1 + (j + 2) * c_dim1] = f23; 1003 1.1 mrg c[i + (j + 3) * c_dim1] = f14; 1004 1.1 mrg c[i + 1 + (j + 3) * c_dim1] = f24; 1005 1.1 mrg c[i + 2 + j * c_dim1] = f31; 1006 1.1 mrg c[i + 3 + j * c_dim1] = f41; 1007 1.1 mrg c[i + 2 + (j + 1) * c_dim1] = f32; 1008 1.1 mrg c[i + 3 + (j + 1) * c_dim1] = f42; 1009 1.1 mrg c[i + 2 + (j + 2) * c_dim1] = f33; 1010 1.1 mrg c[i + 3 + (j + 2) * c_dim1] = f43; 1011 1.1 mrg c[i + 2 + (j + 3) * c_dim1] = f34; 1012 1.1 mrg c[i + 3 + (j + 3) * c_dim1] = f44; 1013 1.1 mrg } 1014 1.1 mrg if (uisec < isec) 1015 1.1 mrg { 1016 1.1 mrg i5 = ii + isec - 1; 1017 1.1 mrg for (i = ii + uisec; i <= i5; ++i) 1018 1.1 mrg { 1019 1.1 mrg f11 = c[i + j * c_dim1]; 1020 1.1 mrg f12 = c[i + (j + 1) * c_dim1]; 1021 1.1 mrg f13 = c[i + (j + 2) * c_dim1]; 1022 1.1 mrg f14 = c[i + (j + 3) * c_dim1]; 1023 1.1 mrg i6 = ll + lsec - 1; 1024 1.1 mrg for (l = ll; l <= i6; ++l) 1025 1.1 mrg { 1026 1.1 mrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1027 1.1 mrg 257] * b[l + j * b_dim1]; 1028 1.1 mrg f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1029 1.1 mrg 257] * b[l + (j + 1) * b_dim1]; 1030 1.1 mrg f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1031 1.1 mrg 257] * b[l + (j + 2) * b_dim1]; 1032 1.1 mrg f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1033 1.1 mrg 257] * b[l + (j + 3) * b_dim1]; 1034 1.1 mrg } 1035 1.1 mrg c[i + j * c_dim1] = f11; 1036 1.1 mrg c[i + (j + 1) * c_dim1] = f12; 1037 1.1 mrg c[i + (j + 2) * c_dim1] = f13; 1038 1.1 mrg c[i + (j + 3) * c_dim1] = f14; 1039 1.1 mrg } 1040 1.1 mrg } 1041 1.1 mrg } 1042 1.1 mrg if (ujsec < jsec) 1043 1.1 mrg { 1044 1.1 mrg i4 = jj + jsec - 1; 1045 1.1 mrg for (j = jj + ujsec; j <= i4; ++j) 1046 1.1 mrg { 1047 1.1 mrg i5 = ii + uisec - 1; 1048 1.1 mrg for (i = ii; i <= i5; i += 4) 1049 1.1 mrg { 1050 1.1 mrg f11 = c[i + j * c_dim1]; 1051 1.1 mrg f21 = c[i + 1 + j * c_dim1]; 1052 1.1 mrg f31 = c[i + 2 + j * c_dim1]; 1053 1.1 mrg f41 = c[i + 3 + j * c_dim1]; 1054 1.1 mrg i6 = ll + lsec - 1; 1055 1.1 mrg for (l = ll; l <= i6; ++l) 1056 1.1 mrg { 1057 1.1 mrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1058 1.1 mrg 257] * b[l + j * b_dim1]; 1059 1.1 mrg f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 1060 1.1 mrg 257] * b[l + j * b_dim1]; 1061 1.1 mrg f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 1062 1.1 mrg 257] * b[l + j * b_dim1]; 1063 1.1 mrg f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 1064 1.1 mrg 257] * b[l + j * b_dim1]; 1065 1.1 mrg } 1066 1.1 mrg c[i + j * c_dim1] = f11; 1067 1.1 mrg c[i + 1 + j * c_dim1] = f21; 1068 1.1 mrg c[i + 2 + j * c_dim1] = f31; 1069 1.1 mrg c[i + 3 + j * c_dim1] = f41; 1070 1.1 mrg } 1071 1.1 mrg i5 = ii + isec - 1; 1072 1.1 mrg for (i = ii + uisec; i <= i5; ++i) 1073 1.1 mrg { 1074 1.1 mrg f11 = c[i + j * c_dim1]; 1075 1.1 mrg i6 = ll + lsec - 1; 1076 1.1 mrg for (l = ll; l <= i6; ++l) 1077 1.1 mrg { 1078 1.1 mrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1079 1.1 mrg 257] * b[l + j * b_dim1]; 1080 1.1 mrg } 1081 1.1 mrg c[i + j * c_dim1] = f11; 1082 1.1 mrg } 1083 1.1 mrg } 1084 1.1 mrg } 1085 1.1 mrg } 1086 1.1 mrg } 1087 1.1 mrg } 1088 1.1 mrg free(t1); 1089 1.1 mrg return; 1090 1.1 mrg } 1091 1.1 mrg else if (rxstride == 1 && aystride == 1 && bxstride == 1) 1092 1.1 mrg { 1093 1.1 mrg if (GFC_DESCRIPTOR_RANK (a) != 1) 1094 1.1 mrg { 1095 1.1 mrg const GFC_REAL_10 *restrict abase_x; 1096 1.1 mrg const GFC_REAL_10 *restrict bbase_y; 1097 1.1 mrg GFC_REAL_10 *restrict dest_y; 1098 1.1 mrg GFC_REAL_10 s; 1099 1.1 mrg 1100 1.1 mrg for (y = 0; y < ycount; y++) 1101 1.1 mrg { 1102 1.1 mrg bbase_y = &bbase[y*bystride]; 1103 1.1 mrg dest_y = &dest[y*rystride]; 1104 1.1 mrg for (x = 0; x < xcount; x++) 1105 1.1 mrg { 1106 1.1 mrg abase_x = &abase[x*axstride]; 1107 1.1 mrg s = (GFC_REAL_10) 0; 1108 1.1 mrg for (n = 0; n < count; n++) 1109 1.1 mrg s += abase_x[n] * bbase_y[n]; 1110 1.1 mrg dest_y[x] = s; 1111 1.1 mrg } 1112 1.1 mrg } 1113 1.1 mrg } 1114 1.1 mrg else 1115 1.1 mrg { 1116 1.1 mrg const GFC_REAL_10 *restrict bbase_y; 1117 1.1 mrg GFC_REAL_10 s; 1118 1.1 mrg 1119 1.1 mrg for (y = 0; y < ycount; y++) 1120 1.1 mrg { 1121 1.1 mrg bbase_y = &bbase[y*bystride]; 1122 1.1 mrg s = (GFC_REAL_10) 0; 1123 1.1 mrg for (n = 0; n < count; n++) 1124 1.1 mrg s += abase[n*axstride] * bbase_y[n]; 1125 1.1 mrg dest[y*rystride] = s; 1126 1.1 mrg } 1127 1.1 mrg } 1128 1.1 mrg } 1129 1.1 mrg else if (GFC_DESCRIPTOR_RANK (a) == 1) 1130 1.1 mrg { 1131 1.1 mrg const GFC_REAL_10 *restrict bbase_y; 1132 1.1 mrg GFC_REAL_10 s; 1133 1.1 mrg 1134 1.1 mrg for (y = 0; y < ycount; y++) 1135 1.1 mrg { 1136 1.1 mrg bbase_y = &bbase[y*bystride]; 1137 1.1 mrg s = (GFC_REAL_10) 0; 1138 1.1 mrg for (n = 0; n < count; n++) 1139 1.1 mrg s += abase[n*axstride] * bbase_y[n*bxstride]; 1140 1.1 mrg dest[y*rxstride] = s; 1141 1.1 mrg } 1142 1.1 mrg } 1143 1.1.1.2 mrg else if (axstride < aystride) 1144 1.1.1.2 mrg { 1145 1.1.1.2 mrg for (y = 0; y < ycount; y++) 1146 1.1.1.2 mrg for (x = 0; x < xcount; x++) 1147 1.1.1.2 mrg dest[x*rxstride + y*rystride] = (GFC_REAL_10)0; 1148 1.1.1.2 mrg 1149 1.1.1.2 mrg for (y = 0; y < ycount; y++) 1150 1.1.1.2 mrg for (n = 0; n < count; n++) 1151 1.1.1.2 mrg for (x = 0; x < xcount; x++) 1152 1.1.1.2 mrg /* dest[x,y] += a[x,n] * b[n,y] */ 1153 1.1.1.2 mrg dest[x*rxstride + y*rystride] += 1154 1.1.1.2 mrg abase[x*axstride + n*aystride] * 1155 1.1.1.2 mrg bbase[n*bxstride + y*bystride]; 1156 1.1.1.2 mrg } 1157 1.1 mrg else 1158 1.1 mrg { 1159 1.1 mrg const GFC_REAL_10 *restrict abase_x; 1160 1.1 mrg const GFC_REAL_10 *restrict bbase_y; 1161 1.1 mrg GFC_REAL_10 *restrict dest_y; 1162 1.1 mrg GFC_REAL_10 s; 1163 1.1 mrg 1164 1.1 mrg for (y = 0; y < ycount; y++) 1165 1.1 mrg { 1166 1.1 mrg bbase_y = &bbase[y*bystride]; 1167 1.1 mrg dest_y = &dest[y*rystride]; 1168 1.1 mrg for (x = 0; x < xcount; x++) 1169 1.1 mrg { 1170 1.1 mrg abase_x = &abase[x*axstride]; 1171 1.1 mrg s = (GFC_REAL_10) 0; 1172 1.1 mrg for (n = 0; n < count; n++) 1173 1.1 mrg s += abase_x[n*aystride] * bbase_y[n*bxstride]; 1174 1.1 mrg dest_y[x*rxstride] = s; 1175 1.1 mrg } 1176 1.1 mrg } 1177 1.1 mrg } 1178 1.1 mrg } 1179 1.1 mrg #undef POW3 1180 1.1 mrg #undef min 1181 1.1 mrg #undef max 1182 1.1 mrg 1183 1.1 mrg #endif 1184 1.1 mrg 1185 1.1 mrg #endif 1186 1.1 mrg 1187