1 1.1 mrg /* Implementation of the MATMUL intrinsic 2 1.1.1.4 mrg Copyright (C) 2002-2024 Free Software Foundation, Inc. 3 1.1 mrg Contributed by Paul Brook <paul (at) nowt.org> 4 1.1 mrg 5 1.1 mrg This file is part of the GNU Fortran runtime library (libgfortran). 6 1.1 mrg 7 1.1 mrg Libgfortran is free software; you can redistribute it and/or 8 1.1 mrg modify it under the terms of the GNU General Public 9 1.1 mrg License as published by the Free Software Foundation; either 10 1.1 mrg version 3 of the License, or (at your option) any later version. 11 1.1 mrg 12 1.1 mrg Libgfortran is distributed in the hope that it will be useful, 13 1.1 mrg but WITHOUT ANY WARRANTY; without even the implied warranty of 14 1.1 mrg MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 1.1 mrg GNU General Public License for more details. 16 1.1 mrg 17 1.1 mrg Under Section 7 of GPL version 3, you are granted additional 18 1.1 mrg permissions described in the GCC Runtime Library Exception, version 19 1.1 mrg 3.1, as published by the Free Software Foundation. 20 1.1 mrg 21 1.1 mrg You should have received a copy of the GNU General Public License and 22 1.1 mrg a copy of the GCC Runtime Library Exception along with this program; 23 1.1 mrg see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 24 1.1 mrg <http://www.gnu.org/licenses/>. */ 25 1.1 mrg 26 1.1 mrg #include "libgfortran.h" 27 1.1 mrg #include <string.h> 28 1.1 mrg #include <assert.h> 29 1.1 mrg 30 1.1 mrg 31 1.1 mrg #if defined (HAVE_GFC_COMPLEX_16) 32 1.1 mrg 33 1.1 mrg /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be 34 1.1 mrg passed to us by the front-end, in which case we call it for large 35 1.1 mrg matrices. */ 36 1.1 mrg 37 1.1 mrg typedef void (*blas_call)(const char *, const char *, const int *, const int *, 38 1.1 mrg const int *, const GFC_COMPLEX_16 *, const GFC_COMPLEX_16 *, 39 1.1 mrg const int *, const GFC_COMPLEX_16 *, const int *, 40 1.1 mrg const GFC_COMPLEX_16 *, GFC_COMPLEX_16 *, const int *, 41 1.1 mrg int, int); 42 1.1 mrg 43 1.1 mrg /* The order of loops is different in the case of plain matrix 44 1.1 mrg multiplication C=MATMUL(A,B), and in the frequent special case where 45 1.1 mrg the argument A is the temporary result of a TRANSPOSE intrinsic: 46 1.1 mrg C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by 47 1.1 mrg looking at their strides. 48 1.1 mrg 49 1.1 mrg The equivalent Fortran pseudo-code is: 50 1.1 mrg 51 1.1 mrg DIMENSION A(M,COUNT), B(COUNT,N), C(M,N) 52 1.1 mrg IF (.NOT.IS_TRANSPOSED(A)) THEN 53 1.1 mrg C = 0 54 1.1 mrg DO J=1,N 55 1.1 mrg DO K=1,COUNT 56 1.1 mrg DO I=1,M 57 1.1 mrg C(I,J) = C(I,J)+A(I,K)*B(K,J) 58 1.1 mrg ELSE 59 1.1 mrg DO J=1,N 60 1.1 mrg DO I=1,M 61 1.1 mrg S = 0 62 1.1 mrg DO K=1,COUNT 63 1.1 mrg S = S+A(I,K)*B(K,J) 64 1.1 mrg C(I,J) = S 65 1.1 mrg ENDIF 66 1.1 mrg */ 67 1.1 mrg 68 1.1 mrg /* If try_blas is set to a nonzero value, then the matmul function will 69 1.1 mrg see if there is a way to perform the matrix multiplication by a call 70 1.1 mrg to the BLAS gemm function. */ 71 1.1 mrg 72 1.1 mrg extern void matmul_c16 (gfc_array_c16 * const restrict retarray, 73 1.1 mrg gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, 74 1.1 mrg int blas_limit, blas_call gemm); 75 1.1 mrg export_proto(matmul_c16); 76 1.1 mrg 77 1.1 mrg /* Put exhaustive list of possible architectures here here, ORed together. */ 78 1.1 mrg 79 1.1 mrg #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) 80 1.1 mrg 81 1.1 mrg #ifdef HAVE_AVX 82 1.1 mrg static void 83 1.1 mrg matmul_c16_avx (gfc_array_c16 * const restrict retarray, 84 1.1 mrg gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, 85 1.1 mrg int blas_limit, blas_call gemm) __attribute__((__target__("avx"))); 86 1.1 mrg static void 87 1.1 mrg matmul_c16_avx (gfc_array_c16 * const restrict retarray, 88 1.1 mrg gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, 89 1.1 mrg int blas_limit, blas_call gemm) 90 1.1 mrg { 91 1.1 mrg const GFC_COMPLEX_16 * restrict abase; 92 1.1 mrg const GFC_COMPLEX_16 * restrict bbase; 93 1.1 mrg GFC_COMPLEX_16 * restrict dest; 94 1.1 mrg 95 1.1 mrg index_type rxstride, rystride, axstride, aystride, bxstride, bystride; 96 1.1 mrg index_type x, y, n, count, xcount, ycount; 97 1.1 mrg 98 1.1 mrg assert (GFC_DESCRIPTOR_RANK (a) == 2 99 1.1 mrg || GFC_DESCRIPTOR_RANK (b) == 2); 100 1.1 mrg 101 1.1 mrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount] 102 1.1 mrg 103 1.1 mrg Either A or B (but not both) can be rank 1: 104 1.1 mrg 105 1.1 mrg o One-dimensional argument A is implicitly treated as a row matrix 106 1.1 mrg dimensioned [1,count], so xcount=1. 107 1.1 mrg 108 1.1 mrg o One-dimensional argument B is implicitly treated as a column matrix 109 1.1 mrg dimensioned [count, 1], so ycount=1. 110 1.1 mrg */ 111 1.1 mrg 112 1.1 mrg if (retarray->base_addr == NULL) 113 1.1 mrg { 114 1.1 mrg if (GFC_DESCRIPTOR_RANK (a) == 1) 115 1.1 mrg { 116 1.1 mrg GFC_DIMENSION_SET(retarray->dim[0], 0, 117 1.1 mrg GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); 118 1.1 mrg } 119 1.1 mrg else if (GFC_DESCRIPTOR_RANK (b) == 1) 120 1.1 mrg { 121 1.1 mrg GFC_DIMENSION_SET(retarray->dim[0], 0, 122 1.1 mrg GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 123 1.1 mrg } 124 1.1 mrg else 125 1.1 mrg { 126 1.1 mrg GFC_DIMENSION_SET(retarray->dim[0], 0, 127 1.1 mrg GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 128 1.1 mrg 129 1.1 mrg GFC_DIMENSION_SET(retarray->dim[1], 0, 130 1.1 mrg GFC_DESCRIPTOR_EXTENT(b,1) - 1, 131 1.1 mrg GFC_DESCRIPTOR_EXTENT(retarray,0)); 132 1.1 mrg } 133 1.1 mrg 134 1.1 mrg retarray->base_addr 135 1.1 mrg = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_16)); 136 1.1 mrg retarray->offset = 0; 137 1.1 mrg } 138 1.1 mrg else if (unlikely (compile_options.bounds_check)) 139 1.1 mrg { 140 1.1 mrg index_type ret_extent, arg_extent; 141 1.1 mrg 142 1.1 mrg if (GFC_DESCRIPTOR_RANK (a) == 1) 143 1.1 mrg { 144 1.1 mrg arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 145 1.1 mrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 146 1.1 mrg if (arg_extent != ret_extent) 147 1.1 mrg runtime_error ("Array bound mismatch for dimension 1 of " 148 1.1 mrg "array (%ld/%ld) ", 149 1.1 mrg (long int) ret_extent, (long int) arg_extent); 150 1.1 mrg } 151 1.1 mrg else if (GFC_DESCRIPTOR_RANK (b) == 1) 152 1.1 mrg { 153 1.1 mrg arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 154 1.1 mrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 155 1.1 mrg if (arg_extent != ret_extent) 156 1.1 mrg runtime_error ("Array bound mismatch for dimension 1 of " 157 1.1 mrg "array (%ld/%ld) ", 158 1.1 mrg (long int) ret_extent, (long int) arg_extent); 159 1.1 mrg } 160 1.1 mrg else 161 1.1 mrg { 162 1.1 mrg arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 163 1.1 mrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 164 1.1 mrg if (arg_extent != ret_extent) 165 1.1 mrg runtime_error ("Array bound mismatch for dimension 1 of " 166 1.1 mrg "array (%ld/%ld) ", 167 1.1 mrg (long int) ret_extent, (long int) arg_extent); 168 1.1 mrg 169 1.1 mrg arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 170 1.1 mrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); 171 1.1 mrg if (arg_extent != ret_extent) 172 1.1 mrg runtime_error ("Array bound mismatch for dimension 2 of " 173 1.1 mrg "array (%ld/%ld) ", 174 1.1 mrg (long int) ret_extent, (long int) arg_extent); 175 1.1 mrg } 176 1.1 mrg } 177 1.1 mrg 178 1.1 mrg 179 1.1 mrg if (GFC_DESCRIPTOR_RANK (retarray) == 1) 180 1.1 mrg { 181 1.1 mrg /* One-dimensional result may be addressed in the code below 182 1.1 mrg either as a row or a column matrix. We want both cases to 183 1.1 mrg work. */ 184 1.1 mrg rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); 185 1.1 mrg } 186 1.1 mrg else 187 1.1 mrg { 188 1.1 mrg rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); 189 1.1 mrg rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); 190 1.1 mrg } 191 1.1 mrg 192 1.1 mrg 193 1.1 mrg if (GFC_DESCRIPTOR_RANK (a) == 1) 194 1.1 mrg { 195 1.1 mrg /* Treat it as a a row matrix A[1,count]. */ 196 1.1 mrg axstride = GFC_DESCRIPTOR_STRIDE(a,0); 197 1.1 mrg aystride = 1; 198 1.1 mrg 199 1.1 mrg xcount = 1; 200 1.1 mrg count = GFC_DESCRIPTOR_EXTENT(a,0); 201 1.1 mrg } 202 1.1 mrg else 203 1.1 mrg { 204 1.1 mrg axstride = GFC_DESCRIPTOR_STRIDE(a,0); 205 1.1 mrg aystride = GFC_DESCRIPTOR_STRIDE(a,1); 206 1.1 mrg 207 1.1 mrg count = GFC_DESCRIPTOR_EXTENT(a,1); 208 1.1 mrg xcount = GFC_DESCRIPTOR_EXTENT(a,0); 209 1.1 mrg } 210 1.1 mrg 211 1.1 mrg if (count != GFC_DESCRIPTOR_EXTENT(b,0)) 212 1.1 mrg { 213 1.1 mrg if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) 214 1.1 mrg runtime_error ("Incorrect extent in argument B in MATMUL intrinsic " 215 1.1 mrg "in dimension 1: is %ld, should be %ld", 216 1.1 mrg (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count); 217 1.1 mrg } 218 1.1 mrg 219 1.1 mrg if (GFC_DESCRIPTOR_RANK (b) == 1) 220 1.1 mrg { 221 1.1 mrg /* Treat it as a column matrix B[count,1] */ 222 1.1 mrg bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 223 1.1 mrg 224 1.1 mrg /* bystride should never be used for 1-dimensional b. 225 1.1 mrg The value is only used for calculation of the 226 1.1 mrg memory by the buffer. */ 227 1.1 mrg bystride = 256; 228 1.1 mrg ycount = 1; 229 1.1 mrg } 230 1.1 mrg else 231 1.1 mrg { 232 1.1 mrg bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 233 1.1 mrg bystride = GFC_DESCRIPTOR_STRIDE(b,1); 234 1.1 mrg ycount = GFC_DESCRIPTOR_EXTENT(b,1); 235 1.1 mrg } 236 1.1 mrg 237 1.1 mrg abase = a->base_addr; 238 1.1 mrg bbase = b->base_addr; 239 1.1 mrg dest = retarray->base_addr; 240 1.1 mrg 241 1.1 mrg /* Now that everything is set up, we perform the multiplication 242 1.1 mrg itself. */ 243 1.1 mrg 244 1.1 mrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) 245 1.1 mrg #define min(a,b) ((a) <= (b) ? (a) : (b)) 246 1.1 mrg #define max(a,b) ((a) >= (b) ? (a) : (b)) 247 1.1 mrg 248 1.1 mrg if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) 249 1.1 mrg && (bxstride == 1 || bystride == 1) 250 1.1 mrg && (((float) xcount) * ((float) ycount) * ((float) count) 251 1.1 mrg > POW3(blas_limit))) 252 1.1 mrg { 253 1.1 mrg const int m = xcount, n = ycount, k = count, ldc = rystride; 254 1.1 mrg const GFC_COMPLEX_16 one = 1, zero = 0; 255 1.1 mrg const int lda = (axstride == 1) ? aystride : axstride, 256 1.1 mrg ldb = (bxstride == 1) ? bystride : bxstride; 257 1.1 mrg 258 1.1 mrg if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) 259 1.1 mrg { 260 1.1 mrg assert (gemm != NULL); 261 1.1 mrg const char *transa, *transb; 262 1.1 mrg if (try_blas & 2) 263 1.1 mrg transa = "C"; 264 1.1 mrg else 265 1.1 mrg transa = axstride == 1 ? "N" : "T"; 266 1.1 mrg 267 1.1 mrg if (try_blas & 4) 268 1.1 mrg transb = "C"; 269 1.1 mrg else 270 1.1 mrg transb = bxstride == 1 ? "N" : "T"; 271 1.1 mrg 272 1.1 mrg gemm (transa, transb , &m, 273 1.1 mrg &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, 274 1.1 mrg &ldc, 1, 1); 275 1.1 mrg return; 276 1.1 mrg } 277 1.1 mrg } 278 1.1 mrg 279 1.1.1.2 mrg if (rxstride == 1 && axstride == 1 && bxstride == 1 280 1.1.1.2 mrg && GFC_DESCRIPTOR_RANK (b) != 1) 281 1.1 mrg { 282 1.1 mrg /* This block of code implements a tuned matmul, derived from 283 1.1 mrg Superscalar GEMM-based level 3 BLAS, Beta version 0.1 284 1.1 mrg 285 1.1 mrg Bo Kagstrom and Per Ling 286 1.1 mrg Department of Computing Science 287 1.1 mrg Umea University 288 1.1 mrg S-901 87 Umea, Sweden 289 1.1 mrg 290 1.1 mrg from netlib.org, translated to C, and modified for matmul.m4. */ 291 1.1 mrg 292 1.1 mrg const GFC_COMPLEX_16 *a, *b; 293 1.1 mrg GFC_COMPLEX_16 *c; 294 1.1 mrg const index_type m = xcount, n = ycount, k = count; 295 1.1 mrg 296 1.1 mrg /* System generated locals */ 297 1.1 mrg index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, 298 1.1 mrg i1, i2, i3, i4, i5, i6; 299 1.1 mrg 300 1.1 mrg /* Local variables */ 301 1.1 mrg GFC_COMPLEX_16 f11, f12, f21, f22, f31, f32, f41, f42, 302 1.1 mrg f13, f14, f23, f24, f33, f34, f43, f44; 303 1.1 mrg index_type i, j, l, ii, jj, ll; 304 1.1 mrg index_type isec, jsec, lsec, uisec, ujsec, ulsec; 305 1.1 mrg GFC_COMPLEX_16 *t1; 306 1.1 mrg 307 1.1 mrg a = abase; 308 1.1 mrg b = bbase; 309 1.1 mrg c = retarray->base_addr; 310 1.1 mrg 311 1.1 mrg /* Parameter adjustments */ 312 1.1 mrg c_dim1 = rystride; 313 1.1 mrg c_offset = 1 + c_dim1; 314 1.1 mrg c -= c_offset; 315 1.1 mrg a_dim1 = aystride; 316 1.1 mrg a_offset = 1 + a_dim1; 317 1.1 mrg a -= a_offset; 318 1.1 mrg b_dim1 = bystride; 319 1.1 mrg b_offset = 1 + b_dim1; 320 1.1 mrg b -= b_offset; 321 1.1 mrg 322 1.1 mrg /* Empty c first. */ 323 1.1 mrg for (j=1; j<=n; j++) 324 1.1 mrg for (i=1; i<=m; i++) 325 1.1 mrg c[i + j * c_dim1] = (GFC_COMPLEX_16)0; 326 1.1 mrg 327 1.1 mrg /* Early exit if possible */ 328 1.1 mrg if (m == 0 || n == 0 || k == 0) 329 1.1 mrg return; 330 1.1 mrg 331 1.1 mrg /* Adjust size of t1 to what is needed. */ 332 1.1 mrg index_type t1_dim, a_sz; 333 1.1 mrg if (aystride == 1) 334 1.1 mrg a_sz = rystride; 335 1.1 mrg else 336 1.1 mrg a_sz = a_dim1; 337 1.1 mrg 338 1.1 mrg t1_dim = a_sz * 256 + b_dim1; 339 1.1 mrg if (t1_dim > 65536) 340 1.1 mrg t1_dim = 65536; 341 1.1 mrg 342 1.1 mrg t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_16)); 343 1.1 mrg 344 1.1 mrg /* Start turning the crank. */ 345 1.1 mrg i1 = n; 346 1.1 mrg for (jj = 1; jj <= i1; jj += 512) 347 1.1 mrg { 348 1.1 mrg /* Computing MIN */ 349 1.1 mrg i2 = 512; 350 1.1 mrg i3 = n - jj + 1; 351 1.1 mrg jsec = min(i2,i3); 352 1.1 mrg ujsec = jsec - jsec % 4; 353 1.1 mrg i2 = k; 354 1.1 mrg for (ll = 1; ll <= i2; ll += 256) 355 1.1 mrg { 356 1.1 mrg /* Computing MIN */ 357 1.1 mrg i3 = 256; 358 1.1 mrg i4 = k - ll + 1; 359 1.1 mrg lsec = min(i3,i4); 360 1.1 mrg ulsec = lsec - lsec % 2; 361 1.1 mrg 362 1.1 mrg i3 = m; 363 1.1 mrg for (ii = 1; ii <= i3; ii += 256) 364 1.1 mrg { 365 1.1 mrg /* Computing MIN */ 366 1.1 mrg i4 = 256; 367 1.1 mrg i5 = m - ii + 1; 368 1.1 mrg isec = min(i4,i5); 369 1.1 mrg uisec = isec - isec % 2; 370 1.1 mrg i4 = ll + ulsec - 1; 371 1.1 mrg for (l = ll; l <= i4; l += 2) 372 1.1 mrg { 373 1.1 mrg i5 = ii + uisec - 1; 374 1.1 mrg for (i = ii; i <= i5; i += 2) 375 1.1 mrg { 376 1.1 mrg t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = 377 1.1 mrg a[i + l * a_dim1]; 378 1.1 mrg t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = 379 1.1 mrg a[i + (l + 1) * a_dim1]; 380 1.1 mrg t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = 381 1.1 mrg a[i + 1 + l * a_dim1]; 382 1.1 mrg t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = 383 1.1 mrg a[i + 1 + (l + 1) * a_dim1]; 384 1.1 mrg } 385 1.1 mrg if (uisec < isec) 386 1.1 mrg { 387 1.1 mrg t1[l - ll + 1 + (isec << 8) - 257] = 388 1.1 mrg a[ii + isec - 1 + l * a_dim1]; 389 1.1 mrg t1[l - ll + 2 + (isec << 8) - 257] = 390 1.1 mrg a[ii + isec - 1 + (l + 1) * a_dim1]; 391 1.1 mrg } 392 1.1 mrg } 393 1.1 mrg if (ulsec < lsec) 394 1.1 mrg { 395 1.1 mrg i4 = ii + isec - 1; 396 1.1 mrg for (i = ii; i<= i4; ++i) 397 1.1 mrg { 398 1.1 mrg t1[lsec + ((i - ii + 1) << 8) - 257] = 399 1.1 mrg a[i + (ll + lsec - 1) * a_dim1]; 400 1.1 mrg } 401 1.1 mrg } 402 1.1 mrg 403 1.1 mrg uisec = isec - isec % 4; 404 1.1 mrg i4 = jj + ujsec - 1; 405 1.1 mrg for (j = jj; j <= i4; j += 4) 406 1.1 mrg { 407 1.1 mrg i5 = ii + uisec - 1; 408 1.1 mrg for (i = ii; i <= i5; i += 4) 409 1.1 mrg { 410 1.1 mrg f11 = c[i + j * c_dim1]; 411 1.1 mrg f21 = c[i + 1 + j * c_dim1]; 412 1.1 mrg f12 = c[i + (j + 1) * c_dim1]; 413 1.1 mrg f22 = c[i + 1 + (j + 1) * c_dim1]; 414 1.1 mrg f13 = c[i + (j + 2) * c_dim1]; 415 1.1 mrg f23 = c[i + 1 + (j + 2) * c_dim1]; 416 1.1 mrg f14 = c[i + (j + 3) * c_dim1]; 417 1.1 mrg f24 = c[i + 1 + (j + 3) * c_dim1]; 418 1.1 mrg f31 = c[i + 2 + j * c_dim1]; 419 1.1 mrg f41 = c[i + 3 + j * c_dim1]; 420 1.1 mrg f32 = c[i + 2 + (j + 1) * c_dim1]; 421 1.1 mrg f42 = c[i + 3 + (j + 1) * c_dim1]; 422 1.1 mrg f33 = c[i + 2 + (j + 2) * c_dim1]; 423 1.1 mrg f43 = c[i + 3 + (j + 2) * c_dim1]; 424 1.1 mrg f34 = c[i + 2 + (j + 3) * c_dim1]; 425 1.1 mrg f44 = c[i + 3 + (j + 3) * c_dim1]; 426 1.1 mrg i6 = ll + lsec - 1; 427 1.1 mrg for (l = ll; l <= i6; ++l) 428 1.1 mrg { 429 1.1 mrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 430 1.1 mrg * b[l + j * b_dim1]; 431 1.1 mrg f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 432 1.1 mrg * b[l + j * b_dim1]; 433 1.1 mrg f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 434 1.1 mrg * b[l + (j + 1) * b_dim1]; 435 1.1 mrg f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 436 1.1 mrg * b[l + (j + 1) * b_dim1]; 437 1.1 mrg f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 438 1.1 mrg * b[l + (j + 2) * b_dim1]; 439 1.1 mrg f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 440 1.1 mrg * b[l + (j + 2) * b_dim1]; 441 1.1 mrg f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 442 1.1 mrg * b[l + (j + 3) * b_dim1]; 443 1.1 mrg f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 444 1.1 mrg * b[l + (j + 3) * b_dim1]; 445 1.1 mrg f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 446 1.1 mrg * b[l + j * b_dim1]; 447 1.1 mrg f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 448 1.1 mrg * b[l + j * b_dim1]; 449 1.1 mrg f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 450 1.1 mrg * b[l + (j + 1) * b_dim1]; 451 1.1 mrg f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 452 1.1 mrg * b[l + (j + 1) * b_dim1]; 453 1.1 mrg f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 454 1.1 mrg * b[l + (j + 2) * b_dim1]; 455 1.1 mrg f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 456 1.1 mrg * b[l + (j + 2) * b_dim1]; 457 1.1 mrg f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 458 1.1 mrg * b[l + (j + 3) * b_dim1]; 459 1.1 mrg f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 460 1.1 mrg * b[l + (j + 3) * b_dim1]; 461 1.1 mrg } 462 1.1 mrg c[i + j * c_dim1] = f11; 463 1.1 mrg c[i + 1 + j * c_dim1] = f21; 464 1.1 mrg c[i + (j + 1) * c_dim1] = f12; 465 1.1 mrg c[i + 1 + (j + 1) * c_dim1] = f22; 466 1.1 mrg c[i + (j + 2) * c_dim1] = f13; 467 1.1 mrg c[i + 1 + (j + 2) * c_dim1] = f23; 468 1.1 mrg c[i + (j + 3) * c_dim1] = f14; 469 1.1 mrg c[i + 1 + (j + 3) * c_dim1] = f24; 470 1.1 mrg c[i + 2 + j * c_dim1] = f31; 471 1.1 mrg c[i + 3 + j * c_dim1] = f41; 472 1.1 mrg c[i + 2 + (j + 1) * c_dim1] = f32; 473 1.1 mrg c[i + 3 + (j + 1) * c_dim1] = f42; 474 1.1 mrg c[i + 2 + (j + 2) * c_dim1] = f33; 475 1.1 mrg c[i + 3 + (j + 2) * c_dim1] = f43; 476 1.1 mrg c[i + 2 + (j + 3) * c_dim1] = f34; 477 1.1 mrg c[i + 3 + (j + 3) * c_dim1] = f44; 478 1.1 mrg } 479 1.1 mrg if (uisec < isec) 480 1.1 mrg { 481 1.1 mrg i5 = ii + isec - 1; 482 1.1 mrg for (i = ii + uisec; i <= i5; ++i) 483 1.1 mrg { 484 1.1 mrg f11 = c[i + j * c_dim1]; 485 1.1 mrg f12 = c[i + (j + 1) * c_dim1]; 486 1.1 mrg f13 = c[i + (j + 2) * c_dim1]; 487 1.1 mrg f14 = c[i + (j + 3) * c_dim1]; 488 1.1 mrg i6 = ll + lsec - 1; 489 1.1 mrg for (l = ll; l <= i6; ++l) 490 1.1 mrg { 491 1.1 mrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 492 1.1 mrg 257] * b[l + j * b_dim1]; 493 1.1 mrg f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 494 1.1 mrg 257] * b[l + (j + 1) * b_dim1]; 495 1.1 mrg f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 496 1.1 mrg 257] * b[l + (j + 2) * b_dim1]; 497 1.1 mrg f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 498 1.1 mrg 257] * b[l + (j + 3) * b_dim1]; 499 1.1 mrg } 500 1.1 mrg c[i + j * c_dim1] = f11; 501 1.1 mrg c[i + (j + 1) * c_dim1] = f12; 502 1.1 mrg c[i + (j + 2) * c_dim1] = f13; 503 1.1 mrg c[i + (j + 3) * c_dim1] = f14; 504 1.1 mrg } 505 1.1 mrg } 506 1.1 mrg } 507 1.1 mrg if (ujsec < jsec) 508 1.1 mrg { 509 1.1 mrg i4 = jj + jsec - 1; 510 1.1 mrg for (j = jj + ujsec; j <= i4; ++j) 511 1.1 mrg { 512 1.1 mrg i5 = ii + uisec - 1; 513 1.1 mrg for (i = ii; i <= i5; i += 4) 514 1.1 mrg { 515 1.1 mrg f11 = c[i + j * c_dim1]; 516 1.1 mrg f21 = c[i + 1 + j * c_dim1]; 517 1.1 mrg f31 = c[i + 2 + j * c_dim1]; 518 1.1 mrg f41 = c[i + 3 + j * c_dim1]; 519 1.1 mrg i6 = ll + lsec - 1; 520 1.1 mrg for (l = ll; l <= i6; ++l) 521 1.1 mrg { 522 1.1 mrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 523 1.1 mrg 257] * b[l + j * b_dim1]; 524 1.1 mrg f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 525 1.1 mrg 257] * b[l + j * b_dim1]; 526 1.1 mrg f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 527 1.1 mrg 257] * b[l + j * b_dim1]; 528 1.1 mrg f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 529 1.1 mrg 257] * b[l + j * b_dim1]; 530 1.1 mrg } 531 1.1 mrg c[i + j * c_dim1] = f11; 532 1.1 mrg c[i + 1 + j * c_dim1] = f21; 533 1.1 mrg c[i + 2 + j * c_dim1] = f31; 534 1.1 mrg c[i + 3 + j * c_dim1] = f41; 535 1.1 mrg } 536 1.1 mrg i5 = ii + isec - 1; 537 1.1 mrg for (i = ii + uisec; i <= i5; ++i) 538 1.1 mrg { 539 1.1 mrg f11 = c[i + j * c_dim1]; 540 1.1 mrg i6 = ll + lsec - 1; 541 1.1 mrg for (l = ll; l <= i6; ++l) 542 1.1 mrg { 543 1.1 mrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 544 1.1 mrg 257] * b[l + j * b_dim1]; 545 1.1 mrg } 546 1.1 mrg c[i + j * c_dim1] = f11; 547 1.1 mrg } 548 1.1 mrg } 549 1.1 mrg } 550 1.1 mrg } 551 1.1 mrg } 552 1.1 mrg } 553 1.1 mrg free(t1); 554 1.1 mrg return; 555 1.1 mrg } 556 1.1 mrg else if (rxstride == 1 && aystride == 1 && bxstride == 1) 557 1.1 mrg { 558 1.1 mrg if (GFC_DESCRIPTOR_RANK (a) != 1) 559 1.1 mrg { 560 1.1 mrg const GFC_COMPLEX_16 *restrict abase_x; 561 1.1 mrg const GFC_COMPLEX_16 *restrict bbase_y; 562 1.1 mrg GFC_COMPLEX_16 *restrict dest_y; 563 1.1 mrg GFC_COMPLEX_16 s; 564 1.1 mrg 565 1.1 mrg for (y = 0; y < ycount; y++) 566 1.1 mrg { 567 1.1 mrg bbase_y = &bbase[y*bystride]; 568 1.1 mrg dest_y = &dest[y*rystride]; 569 1.1 mrg for (x = 0; x < xcount; x++) 570 1.1 mrg { 571 1.1 mrg abase_x = &abase[x*axstride]; 572 1.1 mrg s = (GFC_COMPLEX_16) 0; 573 1.1 mrg for (n = 0; n < count; n++) 574 1.1 mrg s += abase_x[n] * bbase_y[n]; 575 1.1 mrg dest_y[x] = s; 576 1.1 mrg } 577 1.1 mrg } 578 1.1 mrg } 579 1.1 mrg else 580 1.1 mrg { 581 1.1 mrg const GFC_COMPLEX_16 *restrict bbase_y; 582 1.1 mrg GFC_COMPLEX_16 s; 583 1.1 mrg 584 1.1 mrg for (y = 0; y < ycount; y++) 585 1.1 mrg { 586 1.1 mrg bbase_y = &bbase[y*bystride]; 587 1.1 mrg s = (GFC_COMPLEX_16) 0; 588 1.1 mrg for (n = 0; n < count; n++) 589 1.1 mrg s += abase[n*axstride] * bbase_y[n]; 590 1.1 mrg dest[y*rystride] = s; 591 1.1 mrg } 592 1.1 mrg } 593 1.1 mrg } 594 1.1 mrg else if (GFC_DESCRIPTOR_RANK (a) == 1) 595 1.1 mrg { 596 1.1 mrg const GFC_COMPLEX_16 *restrict bbase_y; 597 1.1 mrg GFC_COMPLEX_16 s; 598 1.1 mrg 599 1.1 mrg for (y = 0; y < ycount; y++) 600 1.1 mrg { 601 1.1 mrg bbase_y = &bbase[y*bystride]; 602 1.1 mrg s = (GFC_COMPLEX_16) 0; 603 1.1 mrg for (n = 0; n < count; n++) 604 1.1 mrg s += abase[n*axstride] * bbase_y[n*bxstride]; 605 1.1 mrg dest[y*rxstride] = s; 606 1.1 mrg } 607 1.1 mrg } 608 1.1.1.2 mrg else if (axstride < aystride) 609 1.1.1.2 mrg { 610 1.1.1.2 mrg for (y = 0; y < ycount; y++) 611 1.1.1.2 mrg for (x = 0; x < xcount; x++) 612 1.1.1.2 mrg dest[x*rxstride + y*rystride] = (GFC_COMPLEX_16)0; 613 1.1.1.2 mrg 614 1.1.1.2 mrg for (y = 0; y < ycount; y++) 615 1.1.1.2 mrg for (n = 0; n < count; n++) 616 1.1.1.2 mrg for (x = 0; x < xcount; x++) 617 1.1.1.2 mrg /* dest[x,y] += a[x,n] * b[n,y] */ 618 1.1.1.2 mrg dest[x*rxstride + y*rystride] += 619 1.1.1.2 mrg abase[x*axstride + n*aystride] * 620 1.1.1.2 mrg bbase[n*bxstride + y*bystride]; 621 1.1.1.2 mrg } 622 1.1 mrg else 623 1.1 mrg { 624 1.1 mrg const GFC_COMPLEX_16 *restrict abase_x; 625 1.1 mrg const GFC_COMPLEX_16 *restrict bbase_y; 626 1.1 mrg GFC_COMPLEX_16 *restrict dest_y; 627 1.1 mrg GFC_COMPLEX_16 s; 628 1.1 mrg 629 1.1 mrg for (y = 0; y < ycount; y++) 630 1.1 mrg { 631 1.1 mrg bbase_y = &bbase[y*bystride]; 632 1.1 mrg dest_y = &dest[y*rystride]; 633 1.1 mrg for (x = 0; x < xcount; x++) 634 1.1 mrg { 635 1.1 mrg abase_x = &abase[x*axstride]; 636 1.1 mrg s = (GFC_COMPLEX_16) 0; 637 1.1 mrg for (n = 0; n < count; n++) 638 1.1 mrg s += abase_x[n*aystride] * bbase_y[n*bxstride]; 639 1.1 mrg dest_y[x*rxstride] = s; 640 1.1 mrg } 641 1.1 mrg } 642 1.1 mrg } 643 1.1 mrg } 644 1.1 mrg #undef POW3 645 1.1 mrg #undef min 646 1.1 mrg #undef max 647 1.1 mrg 648 1.1 mrg #endif /* HAVE_AVX */ 649 1.1 mrg 650 1.1 mrg #ifdef HAVE_AVX2 651 1.1 mrg static void 652 1.1 mrg matmul_c16_avx2 (gfc_array_c16 * const restrict retarray, 653 1.1 mrg gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, 654 1.1 mrg int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma"))); 655 1.1 mrg static void 656 1.1 mrg matmul_c16_avx2 (gfc_array_c16 * const restrict retarray, 657 1.1 mrg gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, 658 1.1 mrg int blas_limit, blas_call gemm) 659 1.1 mrg { 660 1.1 mrg const GFC_COMPLEX_16 * restrict abase; 661 1.1 mrg const GFC_COMPLEX_16 * restrict bbase; 662 1.1 mrg GFC_COMPLEX_16 * restrict dest; 663 1.1 mrg 664 1.1 mrg index_type rxstride, rystride, axstride, aystride, bxstride, bystride; 665 1.1 mrg index_type x, y, n, count, xcount, ycount; 666 1.1 mrg 667 1.1 mrg assert (GFC_DESCRIPTOR_RANK (a) == 2 668 1.1 mrg || GFC_DESCRIPTOR_RANK (b) == 2); 669 1.1 mrg 670 1.1 mrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount] 671 1.1 mrg 672 1.1 mrg Either A or B (but not both) can be rank 1: 673 1.1 mrg 674 1.1 mrg o One-dimensional argument A is implicitly treated as a row matrix 675 1.1 mrg dimensioned [1,count], so xcount=1. 676 1.1 mrg 677 1.1 mrg o One-dimensional argument B is implicitly treated as a column matrix 678 1.1 mrg dimensioned [count, 1], so ycount=1. 679 1.1 mrg */ 680 1.1 mrg 681 1.1 mrg if (retarray->base_addr == NULL) 682 1.1 mrg { 683 1.1 mrg if (GFC_DESCRIPTOR_RANK (a) == 1) 684 1.1 mrg { 685 1.1 mrg GFC_DIMENSION_SET(retarray->dim[0], 0, 686 1.1 mrg GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); 687 1.1 mrg } 688 1.1 mrg else if (GFC_DESCRIPTOR_RANK (b) == 1) 689 1.1 mrg { 690 1.1 mrg GFC_DIMENSION_SET(retarray->dim[0], 0, 691 1.1 mrg GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 692 1.1 mrg } 693 1.1 mrg else 694 1.1 mrg { 695 1.1 mrg GFC_DIMENSION_SET(retarray->dim[0], 0, 696 1.1 mrg GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 697 1.1 mrg 698 1.1 mrg GFC_DIMENSION_SET(retarray->dim[1], 0, 699 1.1 mrg GFC_DESCRIPTOR_EXTENT(b,1) - 1, 700 1.1 mrg GFC_DESCRIPTOR_EXTENT(retarray,0)); 701 1.1 mrg } 702 1.1 mrg 703 1.1 mrg retarray->base_addr 704 1.1 mrg = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_16)); 705 1.1 mrg retarray->offset = 0; 706 1.1 mrg } 707 1.1 mrg else if (unlikely (compile_options.bounds_check)) 708 1.1 mrg { 709 1.1 mrg index_type ret_extent, arg_extent; 710 1.1 mrg 711 1.1 mrg if (GFC_DESCRIPTOR_RANK (a) == 1) 712 1.1 mrg { 713 1.1 mrg arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 714 1.1 mrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 715 1.1 mrg if (arg_extent != ret_extent) 716 1.1 mrg runtime_error ("Array bound mismatch for dimension 1 of " 717 1.1 mrg "array (%ld/%ld) ", 718 1.1 mrg (long int) ret_extent, (long int) arg_extent); 719 1.1 mrg } 720 1.1 mrg else if (GFC_DESCRIPTOR_RANK (b) == 1) 721 1.1 mrg { 722 1.1 mrg arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 723 1.1 mrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 724 1.1 mrg if (arg_extent != ret_extent) 725 1.1 mrg runtime_error ("Array bound mismatch for dimension 1 of " 726 1.1 mrg "array (%ld/%ld) ", 727 1.1 mrg (long int) ret_extent, (long int) arg_extent); 728 1.1 mrg } 729 1.1 mrg else 730 1.1 mrg { 731 1.1 mrg arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 732 1.1 mrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 733 1.1 mrg if (arg_extent != ret_extent) 734 1.1 mrg runtime_error ("Array bound mismatch for dimension 1 of " 735 1.1 mrg "array (%ld/%ld) ", 736 1.1 mrg (long int) ret_extent, (long int) arg_extent); 737 1.1 mrg 738 1.1 mrg arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 739 1.1 mrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); 740 1.1 mrg if (arg_extent != ret_extent) 741 1.1 mrg runtime_error ("Array bound mismatch for dimension 2 of " 742 1.1 mrg "array (%ld/%ld) ", 743 1.1 mrg (long int) ret_extent, (long int) arg_extent); 744 1.1 mrg } 745 1.1 mrg } 746 1.1 mrg 747 1.1 mrg 748 1.1 mrg if (GFC_DESCRIPTOR_RANK (retarray) == 1) 749 1.1 mrg { 750 1.1 mrg /* One-dimensional result may be addressed in the code below 751 1.1 mrg either as a row or a column matrix. We want both cases to 752 1.1 mrg work. */ 753 1.1 mrg rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); 754 1.1 mrg } 755 1.1 mrg else 756 1.1 mrg { 757 1.1 mrg rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); 758 1.1 mrg rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); 759 1.1 mrg } 760 1.1 mrg 761 1.1 mrg 762 1.1 mrg if (GFC_DESCRIPTOR_RANK (a) == 1) 763 1.1 mrg { 764 1.1 mrg /* Treat it as a a row matrix A[1,count]. */ 765 1.1 mrg axstride = GFC_DESCRIPTOR_STRIDE(a,0); 766 1.1 mrg aystride = 1; 767 1.1 mrg 768 1.1 mrg xcount = 1; 769 1.1 mrg count = GFC_DESCRIPTOR_EXTENT(a,0); 770 1.1 mrg } 771 1.1 mrg else 772 1.1 mrg { 773 1.1 mrg axstride = GFC_DESCRIPTOR_STRIDE(a,0); 774 1.1 mrg aystride = GFC_DESCRIPTOR_STRIDE(a,1); 775 1.1 mrg 776 1.1 mrg count = GFC_DESCRIPTOR_EXTENT(a,1); 777 1.1 mrg xcount = GFC_DESCRIPTOR_EXTENT(a,0); 778 1.1 mrg } 779 1.1 mrg 780 1.1 mrg if (count != GFC_DESCRIPTOR_EXTENT(b,0)) 781 1.1 mrg { 782 1.1 mrg if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) 783 1.1 mrg runtime_error ("Incorrect extent in argument B in MATMUL intrinsic " 784 1.1 mrg "in dimension 1: is %ld, should be %ld", 785 1.1 mrg (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count); 786 1.1 mrg } 787 1.1 mrg 788 1.1 mrg if (GFC_DESCRIPTOR_RANK (b) == 1) 789 1.1 mrg { 790 1.1 mrg /* Treat it as a column matrix B[count,1] */ 791 1.1 mrg bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 792 1.1 mrg 793 1.1 mrg /* bystride should never be used for 1-dimensional b. 794 1.1 mrg The value is only used for calculation of the 795 1.1 mrg memory by the buffer. */ 796 1.1 mrg bystride = 256; 797 1.1 mrg ycount = 1; 798 1.1 mrg } 799 1.1 mrg else 800 1.1 mrg { 801 1.1 mrg bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 802 1.1 mrg bystride = GFC_DESCRIPTOR_STRIDE(b,1); 803 1.1 mrg ycount = GFC_DESCRIPTOR_EXTENT(b,1); 804 1.1 mrg } 805 1.1 mrg 806 1.1 mrg abase = a->base_addr; 807 1.1 mrg bbase = b->base_addr; 808 1.1 mrg dest = retarray->base_addr; 809 1.1 mrg 810 1.1 mrg /* Now that everything is set up, we perform the multiplication 811 1.1 mrg itself. */ 812 1.1 mrg 813 1.1 mrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) 814 1.1 mrg #define min(a,b) ((a) <= (b) ? (a) : (b)) 815 1.1 mrg #define max(a,b) ((a) >= (b) ? (a) : (b)) 816 1.1 mrg 817 1.1 mrg if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) 818 1.1 mrg && (bxstride == 1 || bystride == 1) 819 1.1 mrg && (((float) xcount) * ((float) ycount) * ((float) count) 820 1.1 mrg > POW3(blas_limit))) 821 1.1 mrg { 822 1.1 mrg const int m = xcount, n = ycount, k = count, ldc = rystride; 823 1.1 mrg const GFC_COMPLEX_16 one = 1, zero = 0; 824 1.1 mrg const int lda = (axstride == 1) ? aystride : axstride, 825 1.1 mrg ldb = (bxstride == 1) ? bystride : bxstride; 826 1.1 mrg 827 1.1 mrg if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) 828 1.1 mrg { 829 1.1 mrg assert (gemm != NULL); 830 1.1 mrg const char *transa, *transb; 831 1.1 mrg if (try_blas & 2) 832 1.1 mrg transa = "C"; 833 1.1 mrg else 834 1.1 mrg transa = axstride == 1 ? "N" : "T"; 835 1.1 mrg 836 1.1 mrg if (try_blas & 4) 837 1.1 mrg transb = "C"; 838 1.1 mrg else 839 1.1 mrg transb = bxstride == 1 ? "N" : "T"; 840 1.1 mrg 841 1.1 mrg gemm (transa, transb , &m, 842 1.1 mrg &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, 843 1.1 mrg &ldc, 1, 1); 844 1.1 mrg return; 845 1.1 mrg } 846 1.1 mrg } 847 1.1 mrg 848 1.1.1.2 mrg if (rxstride == 1 && axstride == 1 && bxstride == 1 849 1.1.1.2 mrg && GFC_DESCRIPTOR_RANK (b) != 1) 850 1.1 mrg { 851 1.1 mrg /* This block of code implements a tuned matmul, derived from 852 1.1 mrg Superscalar GEMM-based level 3 BLAS, Beta version 0.1 853 1.1 mrg 854 1.1 mrg Bo Kagstrom and Per Ling 855 1.1 mrg Department of Computing Science 856 1.1 mrg Umea University 857 1.1 mrg S-901 87 Umea, Sweden 858 1.1 mrg 859 1.1 mrg from netlib.org, translated to C, and modified for matmul.m4. */ 860 1.1 mrg 861 1.1 mrg const GFC_COMPLEX_16 *a, *b; 862 1.1 mrg GFC_COMPLEX_16 *c; 863 1.1 mrg const index_type m = xcount, n = ycount, k = count; 864 1.1 mrg 865 1.1 mrg /* System generated locals */ 866 1.1 mrg index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, 867 1.1 mrg i1, i2, i3, i4, i5, i6; 868 1.1 mrg 869 1.1 mrg /* Local variables */ 870 1.1 mrg GFC_COMPLEX_16 f11, f12, f21, f22, f31, f32, f41, f42, 871 1.1 mrg f13, f14, f23, f24, f33, f34, f43, f44; 872 1.1 mrg index_type i, j, l, ii, jj, ll; 873 1.1 mrg index_type isec, jsec, lsec, uisec, ujsec, ulsec; 874 1.1 mrg GFC_COMPLEX_16 *t1; 875 1.1 mrg 876 1.1 mrg a = abase; 877 1.1 mrg b = bbase; 878 1.1 mrg c = retarray->base_addr; 879 1.1 mrg 880 1.1 mrg /* Parameter adjustments */ 881 1.1 mrg c_dim1 = rystride; 882 1.1 mrg c_offset = 1 + c_dim1; 883 1.1 mrg c -= c_offset; 884 1.1 mrg a_dim1 = aystride; 885 1.1 mrg a_offset = 1 + a_dim1; 886 1.1 mrg a -= a_offset; 887 1.1 mrg b_dim1 = bystride; 888 1.1 mrg b_offset = 1 + b_dim1; 889 1.1 mrg b -= b_offset; 890 1.1 mrg 891 1.1 mrg /* Empty c first. */ 892 1.1 mrg for (j=1; j<=n; j++) 893 1.1 mrg for (i=1; i<=m; i++) 894 1.1 mrg c[i + j * c_dim1] = (GFC_COMPLEX_16)0; 895 1.1 mrg 896 1.1 mrg /* Early exit if possible */ 897 1.1 mrg if (m == 0 || n == 0 || k == 0) 898 1.1 mrg return; 899 1.1 mrg 900 1.1 mrg /* Adjust size of t1 to what is needed. */ 901 1.1 mrg index_type t1_dim, a_sz; 902 1.1 mrg if (aystride == 1) 903 1.1 mrg a_sz = rystride; 904 1.1 mrg else 905 1.1 mrg a_sz = a_dim1; 906 1.1 mrg 907 1.1 mrg t1_dim = a_sz * 256 + b_dim1; 908 1.1 mrg if (t1_dim > 65536) 909 1.1 mrg t1_dim = 65536; 910 1.1 mrg 911 1.1 mrg t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_16)); 912 1.1 mrg 913 1.1 mrg /* Start turning the crank. */ 914 1.1 mrg i1 = n; 915 1.1 mrg for (jj = 1; jj <= i1; jj += 512) 916 1.1 mrg { 917 1.1 mrg /* Computing MIN */ 918 1.1 mrg i2 = 512; 919 1.1 mrg i3 = n - jj + 1; 920 1.1 mrg jsec = min(i2,i3); 921 1.1 mrg ujsec = jsec - jsec % 4; 922 1.1 mrg i2 = k; 923 1.1 mrg for (ll = 1; ll <= i2; ll += 256) 924 1.1 mrg { 925 1.1 mrg /* Computing MIN */ 926 1.1 mrg i3 = 256; 927 1.1 mrg i4 = k - ll + 1; 928 1.1 mrg lsec = min(i3,i4); 929 1.1 mrg ulsec = lsec - lsec % 2; 930 1.1 mrg 931 1.1 mrg i3 = m; 932 1.1 mrg for (ii = 1; ii <= i3; ii += 256) 933 1.1 mrg { 934 1.1 mrg /* Computing MIN */ 935 1.1 mrg i4 = 256; 936 1.1 mrg i5 = m - ii + 1; 937 1.1 mrg isec = min(i4,i5); 938 1.1 mrg uisec = isec - isec % 2; 939 1.1 mrg i4 = ll + ulsec - 1; 940 1.1 mrg for (l = ll; l <= i4; l += 2) 941 1.1 mrg { 942 1.1 mrg i5 = ii + uisec - 1; 943 1.1 mrg for (i = ii; i <= i5; i += 2) 944 1.1 mrg { 945 1.1 mrg t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = 946 1.1 mrg a[i + l * a_dim1]; 947 1.1 mrg t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = 948 1.1 mrg a[i + (l + 1) * a_dim1]; 949 1.1 mrg t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = 950 1.1 mrg a[i + 1 + l * a_dim1]; 951 1.1 mrg t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = 952 1.1 mrg a[i + 1 + (l + 1) * a_dim1]; 953 1.1 mrg } 954 1.1 mrg if (uisec < isec) 955 1.1 mrg { 956 1.1 mrg t1[l - ll + 1 + (isec << 8) - 257] = 957 1.1 mrg a[ii + isec - 1 + l * a_dim1]; 958 1.1 mrg t1[l - ll + 2 + (isec << 8) - 257] = 959 1.1 mrg a[ii + isec - 1 + (l + 1) * a_dim1]; 960 1.1 mrg } 961 1.1 mrg } 962 1.1 mrg if (ulsec < lsec) 963 1.1 mrg { 964 1.1 mrg i4 = ii + isec - 1; 965 1.1 mrg for (i = ii; i<= i4; ++i) 966 1.1 mrg { 967 1.1 mrg t1[lsec + ((i - ii + 1) << 8) - 257] = 968 1.1 mrg a[i + (ll + lsec - 1) * a_dim1]; 969 1.1 mrg } 970 1.1 mrg } 971 1.1 mrg 972 1.1 mrg uisec = isec - isec % 4; 973 1.1 mrg i4 = jj + ujsec - 1; 974 1.1 mrg for (j = jj; j <= i4; j += 4) 975 1.1 mrg { 976 1.1 mrg i5 = ii + uisec - 1; 977 1.1 mrg for (i = ii; i <= i5; i += 4) 978 1.1 mrg { 979 1.1 mrg f11 = c[i + j * c_dim1]; 980 1.1 mrg f21 = c[i + 1 + j * c_dim1]; 981 1.1 mrg f12 = c[i + (j + 1) * c_dim1]; 982 1.1 mrg f22 = c[i + 1 + (j + 1) * c_dim1]; 983 1.1 mrg f13 = c[i + (j + 2) * c_dim1]; 984 1.1 mrg f23 = c[i + 1 + (j + 2) * c_dim1]; 985 1.1 mrg f14 = c[i + (j + 3) * c_dim1]; 986 1.1 mrg f24 = c[i + 1 + (j + 3) * c_dim1]; 987 1.1 mrg f31 = c[i + 2 + j * c_dim1]; 988 1.1 mrg f41 = c[i + 3 + j * c_dim1]; 989 1.1 mrg f32 = c[i + 2 + (j + 1) * c_dim1]; 990 1.1 mrg f42 = c[i + 3 + (j + 1) * c_dim1]; 991 1.1 mrg f33 = c[i + 2 + (j + 2) * c_dim1]; 992 1.1 mrg f43 = c[i + 3 + (j + 2) * c_dim1]; 993 1.1 mrg f34 = c[i + 2 + (j + 3) * c_dim1]; 994 1.1 mrg f44 = c[i + 3 + (j + 3) * c_dim1]; 995 1.1 mrg i6 = ll + lsec - 1; 996 1.1 mrg for (l = ll; l <= i6; ++l) 997 1.1 mrg { 998 1.1 mrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 999 1.1 mrg * b[l + j * b_dim1]; 1000 1.1 mrg f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 1001 1.1 mrg * b[l + j * b_dim1]; 1002 1.1 mrg f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 1003 1.1 mrg * b[l + (j + 1) * b_dim1]; 1004 1.1 mrg f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 1005 1.1 mrg * b[l + (j + 1) * b_dim1]; 1006 1.1 mrg f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 1007 1.1 mrg * b[l + (j + 2) * b_dim1]; 1008 1.1 mrg f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 1009 1.1 mrg * b[l + (j + 2) * b_dim1]; 1010 1.1 mrg f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 1011 1.1 mrg * b[l + (j + 3) * b_dim1]; 1012 1.1 mrg f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 1013 1.1 mrg * b[l + (j + 3) * b_dim1]; 1014 1.1 mrg f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 1015 1.1 mrg * b[l + j * b_dim1]; 1016 1.1 mrg f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 1017 1.1 mrg * b[l + j * b_dim1]; 1018 1.1 mrg f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 1019 1.1 mrg * b[l + (j + 1) * b_dim1]; 1020 1.1 mrg f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 1021 1.1 mrg * b[l + (j + 1) * b_dim1]; 1022 1.1 mrg f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 1023 1.1 mrg * b[l + (j + 2) * b_dim1]; 1024 1.1 mrg f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 1025 1.1 mrg * b[l + (j + 2) * b_dim1]; 1026 1.1 mrg f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 1027 1.1 mrg * b[l + (j + 3) * b_dim1]; 1028 1.1 mrg f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 1029 1.1 mrg * b[l + (j + 3) * b_dim1]; 1030 1.1 mrg } 1031 1.1 mrg c[i + j * c_dim1] = f11; 1032 1.1 mrg c[i + 1 + j * c_dim1] = f21; 1033 1.1 mrg c[i + (j + 1) * c_dim1] = f12; 1034 1.1 mrg c[i + 1 + (j + 1) * c_dim1] = f22; 1035 1.1 mrg c[i + (j + 2) * c_dim1] = f13; 1036 1.1 mrg c[i + 1 + (j + 2) * c_dim1] = f23; 1037 1.1 mrg c[i + (j + 3) * c_dim1] = f14; 1038 1.1 mrg c[i + 1 + (j + 3) * c_dim1] = f24; 1039 1.1 mrg c[i + 2 + j * c_dim1] = f31; 1040 1.1 mrg c[i + 3 + j * c_dim1] = f41; 1041 1.1 mrg c[i + 2 + (j + 1) * c_dim1] = f32; 1042 1.1 mrg c[i + 3 + (j + 1) * c_dim1] = f42; 1043 1.1 mrg c[i + 2 + (j + 2) * c_dim1] = f33; 1044 1.1 mrg c[i + 3 + (j + 2) * c_dim1] = f43; 1045 1.1 mrg c[i + 2 + (j + 3) * c_dim1] = f34; 1046 1.1 mrg c[i + 3 + (j + 3) * c_dim1] = f44; 1047 1.1 mrg } 1048 1.1 mrg if (uisec < isec) 1049 1.1 mrg { 1050 1.1 mrg i5 = ii + isec - 1; 1051 1.1 mrg for (i = ii + uisec; i <= i5; ++i) 1052 1.1 mrg { 1053 1.1 mrg f11 = c[i + j * c_dim1]; 1054 1.1 mrg f12 = c[i + (j + 1) * c_dim1]; 1055 1.1 mrg f13 = c[i + (j + 2) * c_dim1]; 1056 1.1 mrg f14 = c[i + (j + 3) * c_dim1]; 1057 1.1 mrg i6 = ll + lsec - 1; 1058 1.1 mrg for (l = ll; l <= i6; ++l) 1059 1.1 mrg { 1060 1.1 mrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1061 1.1 mrg 257] * b[l + j * b_dim1]; 1062 1.1 mrg f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1063 1.1 mrg 257] * b[l + (j + 1) * b_dim1]; 1064 1.1 mrg f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1065 1.1 mrg 257] * b[l + (j + 2) * b_dim1]; 1066 1.1 mrg f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1067 1.1 mrg 257] * b[l + (j + 3) * b_dim1]; 1068 1.1 mrg } 1069 1.1 mrg c[i + j * c_dim1] = f11; 1070 1.1 mrg c[i + (j + 1) * c_dim1] = f12; 1071 1.1 mrg c[i + (j + 2) * c_dim1] = f13; 1072 1.1 mrg c[i + (j + 3) * c_dim1] = f14; 1073 1.1 mrg } 1074 1.1 mrg } 1075 1.1 mrg } 1076 1.1 mrg if (ujsec < jsec) 1077 1.1 mrg { 1078 1.1 mrg i4 = jj + jsec - 1; 1079 1.1 mrg for (j = jj + ujsec; j <= i4; ++j) 1080 1.1 mrg { 1081 1.1 mrg i5 = ii + uisec - 1; 1082 1.1 mrg for (i = ii; i <= i5; i += 4) 1083 1.1 mrg { 1084 1.1 mrg f11 = c[i + j * c_dim1]; 1085 1.1 mrg f21 = c[i + 1 + j * c_dim1]; 1086 1.1 mrg f31 = c[i + 2 + j * c_dim1]; 1087 1.1 mrg f41 = c[i + 3 + j * c_dim1]; 1088 1.1 mrg i6 = ll + lsec - 1; 1089 1.1 mrg for (l = ll; l <= i6; ++l) 1090 1.1 mrg { 1091 1.1 mrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1092 1.1 mrg 257] * b[l + j * b_dim1]; 1093 1.1 mrg f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 1094 1.1 mrg 257] * b[l + j * b_dim1]; 1095 1.1 mrg f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 1096 1.1 mrg 257] * b[l + j * b_dim1]; 1097 1.1 mrg f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 1098 1.1 mrg 257] * b[l + j * b_dim1]; 1099 1.1 mrg } 1100 1.1 mrg c[i + j * c_dim1] = f11; 1101 1.1 mrg c[i + 1 + j * c_dim1] = f21; 1102 1.1 mrg c[i + 2 + j * c_dim1] = f31; 1103 1.1 mrg c[i + 3 + j * c_dim1] = f41; 1104 1.1 mrg } 1105 1.1 mrg i5 = ii + isec - 1; 1106 1.1 mrg for (i = ii + uisec; i <= i5; ++i) 1107 1.1 mrg { 1108 1.1 mrg f11 = c[i + j * c_dim1]; 1109 1.1 mrg i6 = ll + lsec - 1; 1110 1.1 mrg for (l = ll; l <= i6; ++l) 1111 1.1 mrg { 1112 1.1 mrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1113 1.1 mrg 257] * b[l + j * b_dim1]; 1114 1.1 mrg } 1115 1.1 mrg c[i + j * c_dim1] = f11; 1116 1.1 mrg } 1117 1.1 mrg } 1118 1.1 mrg } 1119 1.1 mrg } 1120 1.1 mrg } 1121 1.1 mrg } 1122 1.1 mrg free(t1); 1123 1.1 mrg return; 1124 1.1 mrg } 1125 1.1 mrg else if (rxstride == 1 && aystride == 1 && bxstride == 1) 1126 1.1 mrg { 1127 1.1 mrg if (GFC_DESCRIPTOR_RANK (a) != 1) 1128 1.1 mrg { 1129 1.1 mrg const GFC_COMPLEX_16 *restrict abase_x; 1130 1.1 mrg const GFC_COMPLEX_16 *restrict bbase_y; 1131 1.1 mrg GFC_COMPLEX_16 *restrict dest_y; 1132 1.1 mrg GFC_COMPLEX_16 s; 1133 1.1 mrg 1134 1.1 mrg for (y = 0; y < ycount; y++) 1135 1.1 mrg { 1136 1.1 mrg bbase_y = &bbase[y*bystride]; 1137 1.1 mrg dest_y = &dest[y*rystride]; 1138 1.1 mrg for (x = 0; x < xcount; x++) 1139 1.1 mrg { 1140 1.1 mrg abase_x = &abase[x*axstride]; 1141 1.1 mrg s = (GFC_COMPLEX_16) 0; 1142 1.1 mrg for (n = 0; n < count; n++) 1143 1.1 mrg s += abase_x[n] * bbase_y[n]; 1144 1.1 mrg dest_y[x] = s; 1145 1.1 mrg } 1146 1.1 mrg } 1147 1.1 mrg } 1148 1.1 mrg else 1149 1.1 mrg { 1150 1.1 mrg const GFC_COMPLEX_16 *restrict bbase_y; 1151 1.1 mrg GFC_COMPLEX_16 s; 1152 1.1 mrg 1153 1.1 mrg for (y = 0; y < ycount; y++) 1154 1.1 mrg { 1155 1.1 mrg bbase_y = &bbase[y*bystride]; 1156 1.1 mrg s = (GFC_COMPLEX_16) 0; 1157 1.1 mrg for (n = 0; n < count; n++) 1158 1.1 mrg s += abase[n*axstride] * bbase_y[n]; 1159 1.1 mrg dest[y*rystride] = s; 1160 1.1 mrg } 1161 1.1 mrg } 1162 1.1 mrg } 1163 1.1 mrg else if (GFC_DESCRIPTOR_RANK (a) == 1) 1164 1.1 mrg { 1165 1.1 mrg const GFC_COMPLEX_16 *restrict bbase_y; 1166 1.1 mrg GFC_COMPLEX_16 s; 1167 1.1 mrg 1168 1.1 mrg for (y = 0; y < ycount; y++) 1169 1.1 mrg { 1170 1.1 mrg bbase_y = &bbase[y*bystride]; 1171 1.1 mrg s = (GFC_COMPLEX_16) 0; 1172 1.1 mrg for (n = 0; n < count; n++) 1173 1.1 mrg s += abase[n*axstride] * bbase_y[n*bxstride]; 1174 1.1 mrg dest[y*rxstride] = s; 1175 1.1 mrg } 1176 1.1 mrg } 1177 1.1.1.2 mrg else if (axstride < aystride) 1178 1.1.1.2 mrg { 1179 1.1.1.2 mrg for (y = 0; y < ycount; y++) 1180 1.1.1.2 mrg for (x = 0; x < xcount; x++) 1181 1.1.1.2 mrg dest[x*rxstride + y*rystride] = (GFC_COMPLEX_16)0; 1182 1.1.1.2 mrg 1183 1.1.1.2 mrg for (y = 0; y < ycount; y++) 1184 1.1.1.2 mrg for (n = 0; n < count; n++) 1185 1.1.1.2 mrg for (x = 0; x < xcount; x++) 1186 1.1.1.2 mrg /* dest[x,y] += a[x,n] * b[n,y] */ 1187 1.1.1.2 mrg dest[x*rxstride + y*rystride] += 1188 1.1.1.2 mrg abase[x*axstride + n*aystride] * 1189 1.1.1.2 mrg bbase[n*bxstride + y*bystride]; 1190 1.1.1.2 mrg } 1191 1.1 mrg else 1192 1.1 mrg { 1193 1.1 mrg const GFC_COMPLEX_16 *restrict abase_x; 1194 1.1 mrg const GFC_COMPLEX_16 *restrict bbase_y; 1195 1.1 mrg GFC_COMPLEX_16 *restrict dest_y; 1196 1.1 mrg GFC_COMPLEX_16 s; 1197 1.1 mrg 1198 1.1 mrg for (y = 0; y < ycount; y++) 1199 1.1 mrg { 1200 1.1 mrg bbase_y = &bbase[y*bystride]; 1201 1.1 mrg dest_y = &dest[y*rystride]; 1202 1.1 mrg for (x = 0; x < xcount; x++) 1203 1.1 mrg { 1204 1.1 mrg abase_x = &abase[x*axstride]; 1205 1.1 mrg s = (GFC_COMPLEX_16) 0; 1206 1.1 mrg for (n = 0; n < count; n++) 1207 1.1 mrg s += abase_x[n*aystride] * bbase_y[n*bxstride]; 1208 1.1 mrg dest_y[x*rxstride] = s; 1209 1.1 mrg } 1210 1.1 mrg } 1211 1.1 mrg } 1212 1.1 mrg } 1213 1.1 mrg #undef POW3 1214 1.1 mrg #undef min 1215 1.1 mrg #undef max 1216 1.1 mrg 1217 1.1 mrg #endif /* HAVE_AVX2 */ 1218 1.1 mrg 1219 1.1 mrg #ifdef HAVE_AVX512F 1220 1.1 mrg static void 1221 1.1 mrg matmul_c16_avx512f (gfc_array_c16 * const restrict retarray, 1222 1.1 mrg gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, 1223 1.1 mrg int blas_limit, blas_call gemm) __attribute__((__target__("avx512f"))); 1224 1.1 mrg static void 1225 1.1 mrg matmul_c16_avx512f (gfc_array_c16 * const restrict retarray, 1226 1.1 mrg gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, 1227 1.1 mrg int blas_limit, blas_call gemm) 1228 1.1 mrg { 1229 1.1 mrg const GFC_COMPLEX_16 * restrict abase; 1230 1.1 mrg const GFC_COMPLEX_16 * restrict bbase; 1231 1.1 mrg GFC_COMPLEX_16 * restrict dest; 1232 1.1 mrg 1233 1.1 mrg index_type rxstride, rystride, axstride, aystride, bxstride, bystride; 1234 1.1 mrg index_type x, y, n, count, xcount, ycount; 1235 1.1 mrg 1236 1.1 mrg assert (GFC_DESCRIPTOR_RANK (a) == 2 1237 1.1 mrg || GFC_DESCRIPTOR_RANK (b) == 2); 1238 1.1 mrg 1239 1.1 mrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount] 1240 1.1 mrg 1241 1.1 mrg Either A or B (but not both) can be rank 1: 1242 1.1 mrg 1243 1.1 mrg o One-dimensional argument A is implicitly treated as a row matrix 1244 1.1 mrg dimensioned [1,count], so xcount=1. 1245 1.1 mrg 1246 1.1 mrg o One-dimensional argument B is implicitly treated as a column matrix 1247 1.1 mrg dimensioned [count, 1], so ycount=1. 1248 1.1 mrg */ 1249 1.1 mrg 1250 1.1 mrg if (retarray->base_addr == NULL) 1251 1.1 mrg { 1252 1.1 mrg if (GFC_DESCRIPTOR_RANK (a) == 1) 1253 1.1 mrg { 1254 1.1 mrg GFC_DIMENSION_SET(retarray->dim[0], 0, 1255 1.1 mrg GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); 1256 1.1 mrg } 1257 1.1 mrg else if (GFC_DESCRIPTOR_RANK (b) == 1) 1258 1.1 mrg { 1259 1.1 mrg GFC_DIMENSION_SET(retarray->dim[0], 0, 1260 1.1 mrg GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 1261 1.1 mrg } 1262 1.1 mrg else 1263 1.1 mrg { 1264 1.1 mrg GFC_DIMENSION_SET(retarray->dim[0], 0, 1265 1.1 mrg GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 1266 1.1 mrg 1267 1.1 mrg GFC_DIMENSION_SET(retarray->dim[1], 0, 1268 1.1 mrg GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1269 1.1 mrg GFC_DESCRIPTOR_EXTENT(retarray,0)); 1270 1.1 mrg } 1271 1.1 mrg 1272 1.1 mrg retarray->base_addr 1273 1.1 mrg = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_16)); 1274 1.1 mrg retarray->offset = 0; 1275 1.1 mrg } 1276 1.1 mrg else if (unlikely (compile_options.bounds_check)) 1277 1.1 mrg { 1278 1.1 mrg index_type ret_extent, arg_extent; 1279 1.1 mrg 1280 1.1 mrg if (GFC_DESCRIPTOR_RANK (a) == 1) 1281 1.1 mrg { 1282 1.1 mrg arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 1283 1.1 mrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 1284 1.1 mrg if (arg_extent != ret_extent) 1285 1.1 mrg runtime_error ("Array bound mismatch for dimension 1 of " 1286 1.1 mrg "array (%ld/%ld) ", 1287 1.1 mrg (long int) ret_extent, (long int) arg_extent); 1288 1.1 mrg } 1289 1.1 mrg else if (GFC_DESCRIPTOR_RANK (b) == 1) 1290 1.1 mrg { 1291 1.1 mrg arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 1292 1.1 mrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 1293 1.1 mrg if (arg_extent != ret_extent) 1294 1.1 mrg runtime_error ("Array bound mismatch for dimension 1 of " 1295 1.1 mrg "array (%ld/%ld) ", 1296 1.1 mrg (long int) ret_extent, (long int) arg_extent); 1297 1.1 mrg } 1298 1.1 mrg else 1299 1.1 mrg { 1300 1.1 mrg arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 1301 1.1 mrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 1302 1.1 mrg if (arg_extent != ret_extent) 1303 1.1 mrg runtime_error ("Array bound mismatch for dimension 1 of " 1304 1.1 mrg "array (%ld/%ld) ", 1305 1.1 mrg (long int) ret_extent, (long int) arg_extent); 1306 1.1 mrg 1307 1.1 mrg arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 1308 1.1 mrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); 1309 1.1 mrg if (arg_extent != ret_extent) 1310 1.1 mrg runtime_error ("Array bound mismatch for dimension 2 of " 1311 1.1 mrg "array (%ld/%ld) ", 1312 1.1 mrg (long int) ret_extent, (long int) arg_extent); 1313 1.1 mrg } 1314 1.1 mrg } 1315 1.1 mrg 1316 1.1 mrg 1317 1.1 mrg if (GFC_DESCRIPTOR_RANK (retarray) == 1) 1318 1.1 mrg { 1319 1.1 mrg /* One-dimensional result may be addressed in the code below 1320 1.1 mrg either as a row or a column matrix. We want both cases to 1321 1.1 mrg work. */ 1322 1.1 mrg rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); 1323 1.1 mrg } 1324 1.1 mrg else 1325 1.1 mrg { 1326 1.1 mrg rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); 1327 1.1 mrg rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); 1328 1.1 mrg } 1329 1.1 mrg 1330 1.1 mrg 1331 1.1 mrg if (GFC_DESCRIPTOR_RANK (a) == 1) 1332 1.1 mrg { 1333 1.1 mrg /* Treat it as a a row matrix A[1,count]. */ 1334 1.1 mrg axstride = GFC_DESCRIPTOR_STRIDE(a,0); 1335 1.1 mrg aystride = 1; 1336 1.1 mrg 1337 1.1 mrg xcount = 1; 1338 1.1 mrg count = GFC_DESCRIPTOR_EXTENT(a,0); 1339 1.1 mrg } 1340 1.1 mrg else 1341 1.1 mrg { 1342 1.1 mrg axstride = GFC_DESCRIPTOR_STRIDE(a,0); 1343 1.1 mrg aystride = GFC_DESCRIPTOR_STRIDE(a,1); 1344 1.1 mrg 1345 1.1 mrg count = GFC_DESCRIPTOR_EXTENT(a,1); 1346 1.1 mrg xcount = GFC_DESCRIPTOR_EXTENT(a,0); 1347 1.1 mrg } 1348 1.1 mrg 1349 1.1 mrg if (count != GFC_DESCRIPTOR_EXTENT(b,0)) 1350 1.1 mrg { 1351 1.1 mrg if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) 1352 1.1 mrg runtime_error ("Incorrect extent in argument B in MATMUL intrinsic " 1353 1.1 mrg "in dimension 1: is %ld, should be %ld", 1354 1.1 mrg (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count); 1355 1.1 mrg } 1356 1.1 mrg 1357 1.1 mrg if (GFC_DESCRIPTOR_RANK (b) == 1) 1358 1.1 mrg { 1359 1.1 mrg /* Treat it as a column matrix B[count,1] */ 1360 1.1 mrg bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 1361 1.1 mrg 1362 1.1 mrg /* bystride should never be used for 1-dimensional b. 1363 1.1 mrg The value is only used for calculation of the 1364 1.1 mrg memory by the buffer. */ 1365 1.1 mrg bystride = 256; 1366 1.1 mrg ycount = 1; 1367 1.1 mrg } 1368 1.1 mrg else 1369 1.1 mrg { 1370 1.1 mrg bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 1371 1.1 mrg bystride = GFC_DESCRIPTOR_STRIDE(b,1); 1372 1.1 mrg ycount = GFC_DESCRIPTOR_EXTENT(b,1); 1373 1.1 mrg } 1374 1.1 mrg 1375 1.1 mrg abase = a->base_addr; 1376 1.1 mrg bbase = b->base_addr; 1377 1.1 mrg dest = retarray->base_addr; 1378 1.1 mrg 1379 1.1 mrg /* Now that everything is set up, we perform the multiplication 1380 1.1 mrg itself. */ 1381 1.1 mrg 1382 1.1 mrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) 1383 1.1 mrg #define min(a,b) ((a) <= (b) ? (a) : (b)) 1384 1.1 mrg #define max(a,b) ((a) >= (b) ? (a) : (b)) 1385 1.1 mrg 1386 1.1 mrg if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) 1387 1.1 mrg && (bxstride == 1 || bystride == 1) 1388 1.1 mrg && (((float) xcount) * ((float) ycount) * ((float) count) 1389 1.1 mrg > POW3(blas_limit))) 1390 1.1 mrg { 1391 1.1 mrg const int m = xcount, n = ycount, k = count, ldc = rystride; 1392 1.1 mrg const GFC_COMPLEX_16 one = 1, zero = 0; 1393 1.1 mrg const int lda = (axstride == 1) ? aystride : axstride, 1394 1.1 mrg ldb = (bxstride == 1) ? bystride : bxstride; 1395 1.1 mrg 1396 1.1 mrg if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) 1397 1.1 mrg { 1398 1.1 mrg assert (gemm != NULL); 1399 1.1 mrg const char *transa, *transb; 1400 1.1 mrg if (try_blas & 2) 1401 1.1 mrg transa = "C"; 1402 1.1 mrg else 1403 1.1 mrg transa = axstride == 1 ? "N" : "T"; 1404 1.1 mrg 1405 1.1 mrg if (try_blas & 4) 1406 1.1 mrg transb = "C"; 1407 1.1 mrg else 1408 1.1 mrg transb = bxstride == 1 ? "N" : "T"; 1409 1.1 mrg 1410 1.1 mrg gemm (transa, transb , &m, 1411 1.1 mrg &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, 1412 1.1 mrg &ldc, 1, 1); 1413 1.1 mrg return; 1414 1.1 mrg } 1415 1.1 mrg } 1416 1.1 mrg 1417 1.1.1.2 mrg if (rxstride == 1 && axstride == 1 && bxstride == 1 1418 1.1.1.2 mrg && GFC_DESCRIPTOR_RANK (b) != 1) 1419 1.1 mrg { 1420 1.1 mrg /* This block of code implements a tuned matmul, derived from 1421 1.1 mrg Superscalar GEMM-based level 3 BLAS, Beta version 0.1 1422 1.1 mrg 1423 1.1 mrg Bo Kagstrom and Per Ling 1424 1.1 mrg Department of Computing Science 1425 1.1 mrg Umea University 1426 1.1 mrg S-901 87 Umea, Sweden 1427 1.1 mrg 1428 1.1 mrg from netlib.org, translated to C, and modified for matmul.m4. */ 1429 1.1 mrg 1430 1.1 mrg const GFC_COMPLEX_16 *a, *b; 1431 1.1 mrg GFC_COMPLEX_16 *c; 1432 1.1 mrg const index_type m = xcount, n = ycount, k = count; 1433 1.1 mrg 1434 1.1 mrg /* System generated locals */ 1435 1.1 mrg index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, 1436 1.1 mrg i1, i2, i3, i4, i5, i6; 1437 1.1 mrg 1438 1.1 mrg /* Local variables */ 1439 1.1 mrg GFC_COMPLEX_16 f11, f12, f21, f22, f31, f32, f41, f42, 1440 1.1 mrg f13, f14, f23, f24, f33, f34, f43, f44; 1441 1.1 mrg index_type i, j, l, ii, jj, ll; 1442 1.1 mrg index_type isec, jsec, lsec, uisec, ujsec, ulsec; 1443 1.1 mrg GFC_COMPLEX_16 *t1; 1444 1.1 mrg 1445 1.1 mrg a = abase; 1446 1.1 mrg b = bbase; 1447 1.1 mrg c = retarray->base_addr; 1448 1.1 mrg 1449 1.1 mrg /* Parameter adjustments */ 1450 1.1 mrg c_dim1 = rystride; 1451 1.1 mrg c_offset = 1 + c_dim1; 1452 1.1 mrg c -= c_offset; 1453 1.1 mrg a_dim1 = aystride; 1454 1.1 mrg a_offset = 1 + a_dim1; 1455 1.1 mrg a -= a_offset; 1456 1.1 mrg b_dim1 = bystride; 1457 1.1 mrg b_offset = 1 + b_dim1; 1458 1.1 mrg b -= b_offset; 1459 1.1 mrg 1460 1.1 mrg /* Empty c first. */ 1461 1.1 mrg for (j=1; j<=n; j++) 1462 1.1 mrg for (i=1; i<=m; i++) 1463 1.1 mrg c[i + j * c_dim1] = (GFC_COMPLEX_16)0; 1464 1.1 mrg 1465 1.1 mrg /* Early exit if possible */ 1466 1.1 mrg if (m == 0 || n == 0 || k == 0) 1467 1.1 mrg return; 1468 1.1 mrg 1469 1.1 mrg /* Adjust size of t1 to what is needed. */ 1470 1.1 mrg index_type t1_dim, a_sz; 1471 1.1 mrg if (aystride == 1) 1472 1.1 mrg a_sz = rystride; 1473 1.1 mrg else 1474 1.1 mrg a_sz = a_dim1; 1475 1.1 mrg 1476 1.1 mrg t1_dim = a_sz * 256 + b_dim1; 1477 1.1 mrg if (t1_dim > 65536) 1478 1.1 mrg t1_dim = 65536; 1479 1.1 mrg 1480 1.1 mrg t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_16)); 1481 1.1 mrg 1482 1.1 mrg /* Start turning the crank. */ 1483 1.1 mrg i1 = n; 1484 1.1 mrg for (jj = 1; jj <= i1; jj += 512) 1485 1.1 mrg { 1486 1.1 mrg /* Computing MIN */ 1487 1.1 mrg i2 = 512; 1488 1.1 mrg i3 = n - jj + 1; 1489 1.1 mrg jsec = min(i2,i3); 1490 1.1 mrg ujsec = jsec - jsec % 4; 1491 1.1 mrg i2 = k; 1492 1.1 mrg for (ll = 1; ll <= i2; ll += 256) 1493 1.1 mrg { 1494 1.1 mrg /* Computing MIN */ 1495 1.1 mrg i3 = 256; 1496 1.1 mrg i4 = k - ll + 1; 1497 1.1 mrg lsec = min(i3,i4); 1498 1.1 mrg ulsec = lsec - lsec % 2; 1499 1.1 mrg 1500 1.1 mrg i3 = m; 1501 1.1 mrg for (ii = 1; ii <= i3; ii += 256) 1502 1.1 mrg { 1503 1.1 mrg /* Computing MIN */ 1504 1.1 mrg i4 = 256; 1505 1.1 mrg i5 = m - ii + 1; 1506 1.1 mrg isec = min(i4,i5); 1507 1.1 mrg uisec = isec - isec % 2; 1508 1.1 mrg i4 = ll + ulsec - 1; 1509 1.1 mrg for (l = ll; l <= i4; l += 2) 1510 1.1 mrg { 1511 1.1 mrg i5 = ii + uisec - 1; 1512 1.1 mrg for (i = ii; i <= i5; i += 2) 1513 1.1 mrg { 1514 1.1 mrg t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = 1515 1.1 mrg a[i + l * a_dim1]; 1516 1.1 mrg t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = 1517 1.1 mrg a[i + (l + 1) * a_dim1]; 1518 1.1 mrg t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = 1519 1.1 mrg a[i + 1 + l * a_dim1]; 1520 1.1 mrg t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = 1521 1.1 mrg a[i + 1 + (l + 1) * a_dim1]; 1522 1.1 mrg } 1523 1.1 mrg if (uisec < isec) 1524 1.1 mrg { 1525 1.1 mrg t1[l - ll + 1 + (isec << 8) - 257] = 1526 1.1 mrg a[ii + isec - 1 + l * a_dim1]; 1527 1.1 mrg t1[l - ll + 2 + (isec << 8) - 257] = 1528 1.1 mrg a[ii + isec - 1 + (l + 1) * a_dim1]; 1529 1.1 mrg } 1530 1.1 mrg } 1531 1.1 mrg if (ulsec < lsec) 1532 1.1 mrg { 1533 1.1 mrg i4 = ii + isec - 1; 1534 1.1 mrg for (i = ii; i<= i4; ++i) 1535 1.1 mrg { 1536 1.1 mrg t1[lsec + ((i - ii + 1) << 8) - 257] = 1537 1.1 mrg a[i + (ll + lsec - 1) * a_dim1]; 1538 1.1 mrg } 1539 1.1 mrg } 1540 1.1 mrg 1541 1.1 mrg uisec = isec - isec % 4; 1542 1.1 mrg i4 = jj + ujsec - 1; 1543 1.1 mrg for (j = jj; j <= i4; j += 4) 1544 1.1 mrg { 1545 1.1 mrg i5 = ii + uisec - 1; 1546 1.1 mrg for (i = ii; i <= i5; i += 4) 1547 1.1 mrg { 1548 1.1 mrg f11 = c[i + j * c_dim1]; 1549 1.1 mrg f21 = c[i + 1 + j * c_dim1]; 1550 1.1 mrg f12 = c[i + (j + 1) * c_dim1]; 1551 1.1 mrg f22 = c[i + 1 + (j + 1) * c_dim1]; 1552 1.1 mrg f13 = c[i + (j + 2) * c_dim1]; 1553 1.1 mrg f23 = c[i + 1 + (j + 2) * c_dim1]; 1554 1.1 mrg f14 = c[i + (j + 3) * c_dim1]; 1555 1.1 mrg f24 = c[i + 1 + (j + 3) * c_dim1]; 1556 1.1 mrg f31 = c[i + 2 + j * c_dim1]; 1557 1.1 mrg f41 = c[i + 3 + j * c_dim1]; 1558 1.1 mrg f32 = c[i + 2 + (j + 1) * c_dim1]; 1559 1.1 mrg f42 = c[i + 3 + (j + 1) * c_dim1]; 1560 1.1 mrg f33 = c[i + 2 + (j + 2) * c_dim1]; 1561 1.1 mrg f43 = c[i + 3 + (j + 2) * c_dim1]; 1562 1.1 mrg f34 = c[i + 2 + (j + 3) * c_dim1]; 1563 1.1 mrg f44 = c[i + 3 + (j + 3) * c_dim1]; 1564 1.1 mrg i6 = ll + lsec - 1; 1565 1.1 mrg for (l = ll; l <= i6; ++l) 1566 1.1 mrg { 1567 1.1 mrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 1568 1.1 mrg * b[l + j * b_dim1]; 1569 1.1 mrg f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 1570 1.1 mrg * b[l + j * b_dim1]; 1571 1.1 mrg f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 1572 1.1 mrg * b[l + (j + 1) * b_dim1]; 1573 1.1 mrg f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 1574 1.1 mrg * b[l + (j + 1) * b_dim1]; 1575 1.1 mrg f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 1576 1.1 mrg * b[l + (j + 2) * b_dim1]; 1577 1.1 mrg f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 1578 1.1 mrg * b[l + (j + 2) * b_dim1]; 1579 1.1 mrg f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 1580 1.1 mrg * b[l + (j + 3) * b_dim1]; 1581 1.1 mrg f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 1582 1.1 mrg * b[l + (j + 3) * b_dim1]; 1583 1.1 mrg f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 1584 1.1 mrg * b[l + j * b_dim1]; 1585 1.1 mrg f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 1586 1.1 mrg * b[l + j * b_dim1]; 1587 1.1 mrg f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 1588 1.1 mrg * b[l + (j + 1) * b_dim1]; 1589 1.1 mrg f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 1590 1.1 mrg * b[l + (j + 1) * b_dim1]; 1591 1.1 mrg f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 1592 1.1 mrg * b[l + (j + 2) * b_dim1]; 1593 1.1 mrg f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 1594 1.1 mrg * b[l + (j + 2) * b_dim1]; 1595 1.1 mrg f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 1596 1.1 mrg * b[l + (j + 3) * b_dim1]; 1597 1.1 mrg f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 1598 1.1 mrg * b[l + (j + 3) * b_dim1]; 1599 1.1 mrg } 1600 1.1 mrg c[i + j * c_dim1] = f11; 1601 1.1 mrg c[i + 1 + j * c_dim1] = f21; 1602 1.1 mrg c[i + (j + 1) * c_dim1] = f12; 1603 1.1 mrg c[i + 1 + (j + 1) * c_dim1] = f22; 1604 1.1 mrg c[i + (j + 2) * c_dim1] = f13; 1605 1.1 mrg c[i + 1 + (j + 2) * c_dim1] = f23; 1606 1.1 mrg c[i + (j + 3) * c_dim1] = f14; 1607 1.1 mrg c[i + 1 + (j + 3) * c_dim1] = f24; 1608 1.1 mrg c[i + 2 + j * c_dim1] = f31; 1609 1.1 mrg c[i + 3 + j * c_dim1] = f41; 1610 1.1 mrg c[i + 2 + (j + 1) * c_dim1] = f32; 1611 1.1 mrg c[i + 3 + (j + 1) * c_dim1] = f42; 1612 1.1 mrg c[i + 2 + (j + 2) * c_dim1] = f33; 1613 1.1 mrg c[i + 3 + (j + 2) * c_dim1] = f43; 1614 1.1 mrg c[i + 2 + (j + 3) * c_dim1] = f34; 1615 1.1 mrg c[i + 3 + (j + 3) * c_dim1] = f44; 1616 1.1 mrg } 1617 1.1 mrg if (uisec < isec) 1618 1.1 mrg { 1619 1.1 mrg i5 = ii + isec - 1; 1620 1.1 mrg for (i = ii + uisec; i <= i5; ++i) 1621 1.1 mrg { 1622 1.1 mrg f11 = c[i + j * c_dim1]; 1623 1.1 mrg f12 = c[i + (j + 1) * c_dim1]; 1624 1.1 mrg f13 = c[i + (j + 2) * c_dim1]; 1625 1.1 mrg f14 = c[i + (j + 3) * c_dim1]; 1626 1.1 mrg i6 = ll + lsec - 1; 1627 1.1 mrg for (l = ll; l <= i6; ++l) 1628 1.1 mrg { 1629 1.1 mrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1630 1.1 mrg 257] * b[l + j * b_dim1]; 1631 1.1 mrg f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1632 1.1 mrg 257] * b[l + (j + 1) * b_dim1]; 1633 1.1 mrg f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1634 1.1 mrg 257] * b[l + (j + 2) * b_dim1]; 1635 1.1 mrg f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1636 1.1 mrg 257] * b[l + (j + 3) * b_dim1]; 1637 1.1 mrg } 1638 1.1 mrg c[i + j * c_dim1] = f11; 1639 1.1 mrg c[i + (j + 1) * c_dim1] = f12; 1640 1.1 mrg c[i + (j + 2) * c_dim1] = f13; 1641 1.1 mrg c[i + (j + 3) * c_dim1] = f14; 1642 1.1 mrg } 1643 1.1 mrg } 1644 1.1 mrg } 1645 1.1 mrg if (ujsec < jsec) 1646 1.1 mrg { 1647 1.1 mrg i4 = jj + jsec - 1; 1648 1.1 mrg for (j = jj + ujsec; j <= i4; ++j) 1649 1.1 mrg { 1650 1.1 mrg i5 = ii + uisec - 1; 1651 1.1 mrg for (i = ii; i <= i5; i += 4) 1652 1.1 mrg { 1653 1.1 mrg f11 = c[i + j * c_dim1]; 1654 1.1 mrg f21 = c[i + 1 + j * c_dim1]; 1655 1.1 mrg f31 = c[i + 2 + j * c_dim1]; 1656 1.1 mrg f41 = c[i + 3 + j * c_dim1]; 1657 1.1 mrg i6 = ll + lsec - 1; 1658 1.1 mrg for (l = ll; l <= i6; ++l) 1659 1.1 mrg { 1660 1.1 mrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1661 1.1 mrg 257] * b[l + j * b_dim1]; 1662 1.1 mrg f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 1663 1.1 mrg 257] * b[l + j * b_dim1]; 1664 1.1 mrg f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 1665 1.1 mrg 257] * b[l + j * b_dim1]; 1666 1.1 mrg f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 1667 1.1 mrg 257] * b[l + j * b_dim1]; 1668 1.1 mrg } 1669 1.1 mrg c[i + j * c_dim1] = f11; 1670 1.1 mrg c[i + 1 + j * c_dim1] = f21; 1671 1.1 mrg c[i + 2 + j * c_dim1] = f31; 1672 1.1 mrg c[i + 3 + j * c_dim1] = f41; 1673 1.1 mrg } 1674 1.1 mrg i5 = ii + isec - 1; 1675 1.1 mrg for (i = ii + uisec; i <= i5; ++i) 1676 1.1 mrg { 1677 1.1 mrg f11 = c[i + j * c_dim1]; 1678 1.1 mrg i6 = ll + lsec - 1; 1679 1.1 mrg for (l = ll; l <= i6; ++l) 1680 1.1 mrg { 1681 1.1 mrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1682 1.1 mrg 257] * b[l + j * b_dim1]; 1683 1.1 mrg } 1684 1.1 mrg c[i + j * c_dim1] = f11; 1685 1.1 mrg } 1686 1.1 mrg } 1687 1.1 mrg } 1688 1.1 mrg } 1689 1.1 mrg } 1690 1.1 mrg } 1691 1.1 mrg free(t1); 1692 1.1 mrg return; 1693 1.1 mrg } 1694 1.1 mrg else if (rxstride == 1 && aystride == 1 && bxstride == 1) 1695 1.1 mrg { 1696 1.1 mrg if (GFC_DESCRIPTOR_RANK (a) != 1) 1697 1.1 mrg { 1698 1.1 mrg const GFC_COMPLEX_16 *restrict abase_x; 1699 1.1 mrg const GFC_COMPLEX_16 *restrict bbase_y; 1700 1.1 mrg GFC_COMPLEX_16 *restrict dest_y; 1701 1.1 mrg GFC_COMPLEX_16 s; 1702 1.1 mrg 1703 1.1 mrg for (y = 0; y < ycount; y++) 1704 1.1 mrg { 1705 1.1 mrg bbase_y = &bbase[y*bystride]; 1706 1.1 mrg dest_y = &dest[y*rystride]; 1707 1.1 mrg for (x = 0; x < xcount; x++) 1708 1.1 mrg { 1709 1.1 mrg abase_x = &abase[x*axstride]; 1710 1.1 mrg s = (GFC_COMPLEX_16) 0; 1711 1.1 mrg for (n = 0; n < count; n++) 1712 1.1 mrg s += abase_x[n] * bbase_y[n]; 1713 1.1 mrg dest_y[x] = s; 1714 1.1 mrg } 1715 1.1 mrg } 1716 1.1 mrg } 1717 1.1 mrg else 1718 1.1 mrg { 1719 1.1 mrg const GFC_COMPLEX_16 *restrict bbase_y; 1720 1.1 mrg GFC_COMPLEX_16 s; 1721 1.1 mrg 1722 1.1 mrg for (y = 0; y < ycount; y++) 1723 1.1 mrg { 1724 1.1 mrg bbase_y = &bbase[y*bystride]; 1725 1.1 mrg s = (GFC_COMPLEX_16) 0; 1726 1.1 mrg for (n = 0; n < count; n++) 1727 1.1 mrg s += abase[n*axstride] * bbase_y[n]; 1728 1.1 mrg dest[y*rystride] = s; 1729 1.1 mrg } 1730 1.1 mrg } 1731 1.1 mrg } 1732 1.1 mrg else if (GFC_DESCRIPTOR_RANK (a) == 1) 1733 1.1 mrg { 1734 1.1 mrg const GFC_COMPLEX_16 *restrict bbase_y; 1735 1.1 mrg GFC_COMPLEX_16 s; 1736 1.1 mrg 1737 1.1 mrg for (y = 0; y < ycount; y++) 1738 1.1 mrg { 1739 1.1 mrg bbase_y = &bbase[y*bystride]; 1740 1.1 mrg s = (GFC_COMPLEX_16) 0; 1741 1.1 mrg for (n = 0; n < count; n++) 1742 1.1 mrg s += abase[n*axstride] * bbase_y[n*bxstride]; 1743 1.1 mrg dest[y*rxstride] = s; 1744 1.1 mrg } 1745 1.1 mrg } 1746 1.1.1.2 mrg else if (axstride < aystride) 1747 1.1.1.2 mrg { 1748 1.1.1.2 mrg for (y = 0; y < ycount; y++) 1749 1.1.1.2 mrg for (x = 0; x < xcount; x++) 1750 1.1.1.2 mrg dest[x*rxstride + y*rystride] = (GFC_COMPLEX_16)0; 1751 1.1.1.2 mrg 1752 1.1.1.2 mrg for (y = 0; y < ycount; y++) 1753 1.1.1.2 mrg for (n = 0; n < count; n++) 1754 1.1.1.2 mrg for (x = 0; x < xcount; x++) 1755 1.1.1.2 mrg /* dest[x,y] += a[x,n] * b[n,y] */ 1756 1.1.1.2 mrg dest[x*rxstride + y*rystride] += 1757 1.1.1.2 mrg abase[x*axstride + n*aystride] * 1758 1.1.1.2 mrg bbase[n*bxstride + y*bystride]; 1759 1.1.1.2 mrg } 1760 1.1 mrg else 1761 1.1 mrg { 1762 1.1 mrg const GFC_COMPLEX_16 *restrict abase_x; 1763 1.1 mrg const GFC_COMPLEX_16 *restrict bbase_y; 1764 1.1 mrg GFC_COMPLEX_16 *restrict dest_y; 1765 1.1 mrg GFC_COMPLEX_16 s; 1766 1.1 mrg 1767 1.1 mrg for (y = 0; y < ycount; y++) 1768 1.1 mrg { 1769 1.1 mrg bbase_y = &bbase[y*bystride]; 1770 1.1 mrg dest_y = &dest[y*rystride]; 1771 1.1 mrg for (x = 0; x < xcount; x++) 1772 1.1 mrg { 1773 1.1 mrg abase_x = &abase[x*axstride]; 1774 1.1 mrg s = (GFC_COMPLEX_16) 0; 1775 1.1 mrg for (n = 0; n < count; n++) 1776 1.1 mrg s += abase_x[n*aystride] * bbase_y[n*bxstride]; 1777 1.1 mrg dest_y[x*rxstride] = s; 1778 1.1 mrg } 1779 1.1 mrg } 1780 1.1 mrg } 1781 1.1 mrg } 1782 1.1 mrg #undef POW3 1783 1.1 mrg #undef min 1784 1.1 mrg #undef max 1785 1.1 mrg 1786 1.1 mrg #endif /* HAVE_AVX512F */ 1787 1.1 mrg 1788 1.1 mrg /* AMD-specifix funtions with AVX128 and FMA3/FMA4. */ 1789 1.1 mrg 1790 1.1 mrg #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128) 1791 1.1 mrg void 1792 1.1 mrg matmul_c16_avx128_fma3 (gfc_array_c16 * const restrict retarray, 1793 1.1 mrg gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, 1794 1.1 mrg int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma"))); 1795 1.1 mrg internal_proto(matmul_c16_avx128_fma3); 1796 1.1 mrg #endif 1797 1.1 mrg 1798 1.1 mrg #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128) 1799 1.1 mrg void 1800 1.1 mrg matmul_c16_avx128_fma4 (gfc_array_c16 * const restrict retarray, 1801 1.1 mrg gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, 1802 1.1 mrg int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4"))); 1803 1.1 mrg internal_proto(matmul_c16_avx128_fma4); 1804 1.1 mrg #endif 1805 1.1 mrg 1806 1.1 mrg /* Function to fall back to if there is no special processor-specific version. */ 1807 1.1 mrg static void 1808 1.1 mrg matmul_c16_vanilla (gfc_array_c16 * const restrict retarray, 1809 1.1 mrg gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, 1810 1.1 mrg int blas_limit, blas_call gemm) 1811 1.1 mrg { 1812 1.1 mrg const GFC_COMPLEX_16 * restrict abase; 1813 1.1 mrg const GFC_COMPLEX_16 * restrict bbase; 1814 1.1 mrg GFC_COMPLEX_16 * restrict dest; 1815 1.1 mrg 1816 1.1 mrg index_type rxstride, rystride, axstride, aystride, bxstride, bystride; 1817 1.1 mrg index_type x, y, n, count, xcount, ycount; 1818 1.1 mrg 1819 1.1 mrg assert (GFC_DESCRIPTOR_RANK (a) == 2 1820 1.1 mrg || GFC_DESCRIPTOR_RANK (b) == 2); 1821 1.1 mrg 1822 1.1 mrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount] 1823 1.1 mrg 1824 1.1 mrg Either A or B (but not both) can be rank 1: 1825 1.1 mrg 1826 1.1 mrg o One-dimensional argument A is implicitly treated as a row matrix 1827 1.1 mrg dimensioned [1,count], so xcount=1. 1828 1.1 mrg 1829 1.1 mrg o One-dimensional argument B is implicitly treated as a column matrix 1830 1.1 mrg dimensioned [count, 1], so ycount=1. 1831 1.1 mrg */ 1832 1.1 mrg 1833 1.1 mrg if (retarray->base_addr == NULL) 1834 1.1 mrg { 1835 1.1 mrg if (GFC_DESCRIPTOR_RANK (a) == 1) 1836 1.1 mrg { 1837 1.1 mrg GFC_DIMENSION_SET(retarray->dim[0], 0, 1838 1.1 mrg GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); 1839 1.1 mrg } 1840 1.1 mrg else if (GFC_DESCRIPTOR_RANK (b) == 1) 1841 1.1 mrg { 1842 1.1 mrg GFC_DIMENSION_SET(retarray->dim[0], 0, 1843 1.1 mrg GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 1844 1.1 mrg } 1845 1.1 mrg else 1846 1.1 mrg { 1847 1.1 mrg GFC_DIMENSION_SET(retarray->dim[0], 0, 1848 1.1 mrg GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 1849 1.1 mrg 1850 1.1 mrg GFC_DIMENSION_SET(retarray->dim[1], 0, 1851 1.1 mrg GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1852 1.1 mrg GFC_DESCRIPTOR_EXTENT(retarray,0)); 1853 1.1 mrg } 1854 1.1 mrg 1855 1.1 mrg retarray->base_addr 1856 1.1 mrg = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_16)); 1857 1.1 mrg retarray->offset = 0; 1858 1.1 mrg } 1859 1.1 mrg else if (unlikely (compile_options.bounds_check)) 1860 1.1 mrg { 1861 1.1 mrg index_type ret_extent, arg_extent; 1862 1.1 mrg 1863 1.1 mrg if (GFC_DESCRIPTOR_RANK (a) == 1) 1864 1.1 mrg { 1865 1.1 mrg arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 1866 1.1 mrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 1867 1.1 mrg if (arg_extent != ret_extent) 1868 1.1 mrg runtime_error ("Array bound mismatch for dimension 1 of " 1869 1.1 mrg "array (%ld/%ld) ", 1870 1.1 mrg (long int) ret_extent, (long int) arg_extent); 1871 1.1 mrg } 1872 1.1 mrg else if (GFC_DESCRIPTOR_RANK (b) == 1) 1873 1.1 mrg { 1874 1.1 mrg arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 1875 1.1 mrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 1876 1.1 mrg if (arg_extent != ret_extent) 1877 1.1 mrg runtime_error ("Array bound mismatch for dimension 1 of " 1878 1.1 mrg "array (%ld/%ld) ", 1879 1.1 mrg (long int) ret_extent, (long int) arg_extent); 1880 1.1 mrg } 1881 1.1 mrg else 1882 1.1 mrg { 1883 1.1 mrg arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 1884 1.1 mrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 1885 1.1 mrg if (arg_extent != ret_extent) 1886 1.1 mrg runtime_error ("Array bound mismatch for dimension 1 of " 1887 1.1 mrg "array (%ld/%ld) ", 1888 1.1 mrg (long int) ret_extent, (long int) arg_extent); 1889 1.1 mrg 1890 1.1 mrg arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 1891 1.1 mrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); 1892 1.1 mrg if (arg_extent != ret_extent) 1893 1.1 mrg runtime_error ("Array bound mismatch for dimension 2 of " 1894 1.1 mrg "array (%ld/%ld) ", 1895 1.1 mrg (long int) ret_extent, (long int) arg_extent); 1896 1.1 mrg } 1897 1.1 mrg } 1898 1.1 mrg 1899 1.1 mrg 1900 1.1 mrg if (GFC_DESCRIPTOR_RANK (retarray) == 1) 1901 1.1 mrg { 1902 1.1 mrg /* One-dimensional result may be addressed in the code below 1903 1.1 mrg either as a row or a column matrix. We want both cases to 1904 1.1 mrg work. */ 1905 1.1 mrg rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); 1906 1.1 mrg } 1907 1.1 mrg else 1908 1.1 mrg { 1909 1.1 mrg rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); 1910 1.1 mrg rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); 1911 1.1 mrg } 1912 1.1 mrg 1913 1.1 mrg 1914 1.1 mrg if (GFC_DESCRIPTOR_RANK (a) == 1) 1915 1.1 mrg { 1916 1.1 mrg /* Treat it as a a row matrix A[1,count]. */ 1917 1.1 mrg axstride = GFC_DESCRIPTOR_STRIDE(a,0); 1918 1.1 mrg aystride = 1; 1919 1.1 mrg 1920 1.1 mrg xcount = 1; 1921 1.1 mrg count = GFC_DESCRIPTOR_EXTENT(a,0); 1922 1.1 mrg } 1923 1.1 mrg else 1924 1.1 mrg { 1925 1.1 mrg axstride = GFC_DESCRIPTOR_STRIDE(a,0); 1926 1.1 mrg aystride = GFC_DESCRIPTOR_STRIDE(a,1); 1927 1.1 mrg 1928 1.1 mrg count = GFC_DESCRIPTOR_EXTENT(a,1); 1929 1.1 mrg xcount = GFC_DESCRIPTOR_EXTENT(a,0); 1930 1.1 mrg } 1931 1.1 mrg 1932 1.1 mrg if (count != GFC_DESCRIPTOR_EXTENT(b,0)) 1933 1.1 mrg { 1934 1.1 mrg if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) 1935 1.1 mrg runtime_error ("Incorrect extent in argument B in MATMUL intrinsic " 1936 1.1 mrg "in dimension 1: is %ld, should be %ld", 1937 1.1 mrg (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count); 1938 1.1 mrg } 1939 1.1 mrg 1940 1.1 mrg if (GFC_DESCRIPTOR_RANK (b) == 1) 1941 1.1 mrg { 1942 1.1 mrg /* Treat it as a column matrix B[count,1] */ 1943 1.1 mrg bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 1944 1.1 mrg 1945 1.1 mrg /* bystride should never be used for 1-dimensional b. 1946 1.1 mrg The value is only used for calculation of the 1947 1.1 mrg memory by the buffer. */ 1948 1.1 mrg bystride = 256; 1949 1.1 mrg ycount = 1; 1950 1.1 mrg } 1951 1.1 mrg else 1952 1.1 mrg { 1953 1.1 mrg bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 1954 1.1 mrg bystride = GFC_DESCRIPTOR_STRIDE(b,1); 1955 1.1 mrg ycount = GFC_DESCRIPTOR_EXTENT(b,1); 1956 1.1 mrg } 1957 1.1 mrg 1958 1.1 mrg abase = a->base_addr; 1959 1.1 mrg bbase = b->base_addr; 1960 1.1 mrg dest = retarray->base_addr; 1961 1.1 mrg 1962 1.1 mrg /* Now that everything is set up, we perform the multiplication 1963 1.1 mrg itself. */ 1964 1.1 mrg 1965 1.1 mrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) 1966 1.1 mrg #define min(a,b) ((a) <= (b) ? (a) : (b)) 1967 1.1 mrg #define max(a,b) ((a) >= (b) ? (a) : (b)) 1968 1.1 mrg 1969 1.1 mrg if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) 1970 1.1 mrg && (bxstride == 1 || bystride == 1) 1971 1.1 mrg && (((float) xcount) * ((float) ycount) * ((float) count) 1972 1.1 mrg > POW3(blas_limit))) 1973 1.1 mrg { 1974 1.1 mrg const int m = xcount, n = ycount, k = count, ldc = rystride; 1975 1.1 mrg const GFC_COMPLEX_16 one = 1, zero = 0; 1976 1.1 mrg const int lda = (axstride == 1) ? aystride : axstride, 1977 1.1 mrg ldb = (bxstride == 1) ? bystride : bxstride; 1978 1.1 mrg 1979 1.1 mrg if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) 1980 1.1 mrg { 1981 1.1 mrg assert (gemm != NULL); 1982 1.1 mrg const char *transa, *transb; 1983 1.1 mrg if (try_blas & 2) 1984 1.1 mrg transa = "C"; 1985 1.1 mrg else 1986 1.1 mrg transa = axstride == 1 ? "N" : "T"; 1987 1.1 mrg 1988 1.1 mrg if (try_blas & 4) 1989 1.1 mrg transb = "C"; 1990 1.1 mrg else 1991 1.1 mrg transb = bxstride == 1 ? "N" : "T"; 1992 1.1 mrg 1993 1.1 mrg gemm (transa, transb , &m, 1994 1.1 mrg &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, 1995 1.1 mrg &ldc, 1, 1); 1996 1.1 mrg return; 1997 1.1 mrg } 1998 1.1 mrg } 1999 1.1 mrg 2000 1.1.1.2 mrg if (rxstride == 1 && axstride == 1 && bxstride == 1 2001 1.1.1.2 mrg && GFC_DESCRIPTOR_RANK (b) != 1) 2002 1.1 mrg { 2003 1.1 mrg /* This block of code implements a tuned matmul, derived from 2004 1.1 mrg Superscalar GEMM-based level 3 BLAS, Beta version 0.1 2005 1.1 mrg 2006 1.1 mrg Bo Kagstrom and Per Ling 2007 1.1 mrg Department of Computing Science 2008 1.1 mrg Umea University 2009 1.1 mrg S-901 87 Umea, Sweden 2010 1.1 mrg 2011 1.1 mrg from netlib.org, translated to C, and modified for matmul.m4. */ 2012 1.1 mrg 2013 1.1 mrg const GFC_COMPLEX_16 *a, *b; 2014 1.1 mrg GFC_COMPLEX_16 *c; 2015 1.1 mrg const index_type m = xcount, n = ycount, k = count; 2016 1.1 mrg 2017 1.1 mrg /* System generated locals */ 2018 1.1 mrg index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, 2019 1.1 mrg i1, i2, i3, i4, i5, i6; 2020 1.1 mrg 2021 1.1 mrg /* Local variables */ 2022 1.1 mrg GFC_COMPLEX_16 f11, f12, f21, f22, f31, f32, f41, f42, 2023 1.1 mrg f13, f14, f23, f24, f33, f34, f43, f44; 2024 1.1 mrg index_type i, j, l, ii, jj, ll; 2025 1.1 mrg index_type isec, jsec, lsec, uisec, ujsec, ulsec; 2026 1.1 mrg GFC_COMPLEX_16 *t1; 2027 1.1 mrg 2028 1.1 mrg a = abase; 2029 1.1 mrg b = bbase; 2030 1.1 mrg c = retarray->base_addr; 2031 1.1 mrg 2032 1.1 mrg /* Parameter adjustments */ 2033 1.1 mrg c_dim1 = rystride; 2034 1.1 mrg c_offset = 1 + c_dim1; 2035 1.1 mrg c -= c_offset; 2036 1.1 mrg a_dim1 = aystride; 2037 1.1 mrg a_offset = 1 + a_dim1; 2038 1.1 mrg a -= a_offset; 2039 1.1 mrg b_dim1 = bystride; 2040 1.1 mrg b_offset = 1 + b_dim1; 2041 1.1 mrg b -= b_offset; 2042 1.1 mrg 2043 1.1 mrg /* Empty c first. */ 2044 1.1 mrg for (j=1; j<=n; j++) 2045 1.1 mrg for (i=1; i<=m; i++) 2046 1.1 mrg c[i + j * c_dim1] = (GFC_COMPLEX_16)0; 2047 1.1 mrg 2048 1.1 mrg /* Early exit if possible */ 2049 1.1 mrg if (m == 0 || n == 0 || k == 0) 2050 1.1 mrg return; 2051 1.1 mrg 2052 1.1 mrg /* Adjust size of t1 to what is needed. */ 2053 1.1 mrg index_type t1_dim, a_sz; 2054 1.1 mrg if (aystride == 1) 2055 1.1 mrg a_sz = rystride; 2056 1.1 mrg else 2057 1.1 mrg a_sz = a_dim1; 2058 1.1 mrg 2059 1.1 mrg t1_dim = a_sz * 256 + b_dim1; 2060 1.1 mrg if (t1_dim > 65536) 2061 1.1 mrg t1_dim = 65536; 2062 1.1 mrg 2063 1.1 mrg t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_16)); 2064 1.1 mrg 2065 1.1 mrg /* Start turning the crank. */ 2066 1.1 mrg i1 = n; 2067 1.1 mrg for (jj = 1; jj <= i1; jj += 512) 2068 1.1 mrg { 2069 1.1 mrg /* Computing MIN */ 2070 1.1 mrg i2 = 512; 2071 1.1 mrg i3 = n - jj + 1; 2072 1.1 mrg jsec = min(i2,i3); 2073 1.1 mrg ujsec = jsec - jsec % 4; 2074 1.1 mrg i2 = k; 2075 1.1 mrg for (ll = 1; ll <= i2; ll += 256) 2076 1.1 mrg { 2077 1.1 mrg /* Computing MIN */ 2078 1.1 mrg i3 = 256; 2079 1.1 mrg i4 = k - ll + 1; 2080 1.1 mrg lsec = min(i3,i4); 2081 1.1 mrg ulsec = lsec - lsec % 2; 2082 1.1 mrg 2083 1.1 mrg i3 = m; 2084 1.1 mrg for (ii = 1; ii <= i3; ii += 256) 2085 1.1 mrg { 2086 1.1 mrg /* Computing MIN */ 2087 1.1 mrg i4 = 256; 2088 1.1 mrg i5 = m - ii + 1; 2089 1.1 mrg isec = min(i4,i5); 2090 1.1 mrg uisec = isec - isec % 2; 2091 1.1 mrg i4 = ll + ulsec - 1; 2092 1.1 mrg for (l = ll; l <= i4; l += 2) 2093 1.1 mrg { 2094 1.1 mrg i5 = ii + uisec - 1; 2095 1.1 mrg for (i = ii; i <= i5; i += 2) 2096 1.1 mrg { 2097 1.1 mrg t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = 2098 1.1 mrg a[i + l * a_dim1]; 2099 1.1 mrg t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = 2100 1.1 mrg a[i + (l + 1) * a_dim1]; 2101 1.1 mrg t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = 2102 1.1 mrg a[i + 1 + l * a_dim1]; 2103 1.1 mrg t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = 2104 1.1 mrg a[i + 1 + (l + 1) * a_dim1]; 2105 1.1 mrg } 2106 1.1 mrg if (uisec < isec) 2107 1.1 mrg { 2108 1.1 mrg t1[l - ll + 1 + (isec << 8) - 257] = 2109 1.1 mrg a[ii + isec - 1 + l * a_dim1]; 2110 1.1 mrg t1[l - ll + 2 + (isec << 8) - 257] = 2111 1.1 mrg a[ii + isec - 1 + (l + 1) * a_dim1]; 2112 1.1 mrg } 2113 1.1 mrg } 2114 1.1 mrg if (ulsec < lsec) 2115 1.1 mrg { 2116 1.1 mrg i4 = ii + isec - 1; 2117 1.1 mrg for (i = ii; i<= i4; ++i) 2118 1.1 mrg { 2119 1.1 mrg t1[lsec + ((i - ii + 1) << 8) - 257] = 2120 1.1 mrg a[i + (ll + lsec - 1) * a_dim1]; 2121 1.1 mrg } 2122 1.1 mrg } 2123 1.1 mrg 2124 1.1 mrg uisec = isec - isec % 4; 2125 1.1 mrg i4 = jj + ujsec - 1; 2126 1.1 mrg for (j = jj; j <= i4; j += 4) 2127 1.1 mrg { 2128 1.1 mrg i5 = ii + uisec - 1; 2129 1.1 mrg for (i = ii; i <= i5; i += 4) 2130 1.1 mrg { 2131 1.1 mrg f11 = c[i + j * c_dim1]; 2132 1.1 mrg f21 = c[i + 1 + j * c_dim1]; 2133 1.1 mrg f12 = c[i + (j + 1) * c_dim1]; 2134 1.1 mrg f22 = c[i + 1 + (j + 1) * c_dim1]; 2135 1.1 mrg f13 = c[i + (j + 2) * c_dim1]; 2136 1.1 mrg f23 = c[i + 1 + (j + 2) * c_dim1]; 2137 1.1 mrg f14 = c[i + (j + 3) * c_dim1]; 2138 1.1 mrg f24 = c[i + 1 + (j + 3) * c_dim1]; 2139 1.1 mrg f31 = c[i + 2 + j * c_dim1]; 2140 1.1 mrg f41 = c[i + 3 + j * c_dim1]; 2141 1.1 mrg f32 = c[i + 2 + (j + 1) * c_dim1]; 2142 1.1 mrg f42 = c[i + 3 + (j + 1) * c_dim1]; 2143 1.1 mrg f33 = c[i + 2 + (j + 2) * c_dim1]; 2144 1.1 mrg f43 = c[i + 3 + (j + 2) * c_dim1]; 2145 1.1 mrg f34 = c[i + 2 + (j + 3) * c_dim1]; 2146 1.1 mrg f44 = c[i + 3 + (j + 3) * c_dim1]; 2147 1.1 mrg i6 = ll + lsec - 1; 2148 1.1 mrg for (l = ll; l <= i6; ++l) 2149 1.1 mrg { 2150 1.1 mrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2151 1.1 mrg * b[l + j * b_dim1]; 2152 1.1 mrg f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2153 1.1 mrg * b[l + j * b_dim1]; 2154 1.1 mrg f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2155 1.1 mrg * b[l + (j + 1) * b_dim1]; 2156 1.1 mrg f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2157 1.1 mrg * b[l + (j + 1) * b_dim1]; 2158 1.1 mrg f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2159 1.1 mrg * b[l + (j + 2) * b_dim1]; 2160 1.1 mrg f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2161 1.1 mrg * b[l + (j + 2) * b_dim1]; 2162 1.1 mrg f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2163 1.1 mrg * b[l + (j + 3) * b_dim1]; 2164 1.1 mrg f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2165 1.1 mrg * b[l + (j + 3) * b_dim1]; 2166 1.1 mrg f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 2167 1.1 mrg * b[l + j * b_dim1]; 2168 1.1 mrg f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 2169 1.1 mrg * b[l + j * b_dim1]; 2170 1.1 mrg f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 2171 1.1 mrg * b[l + (j + 1) * b_dim1]; 2172 1.1 mrg f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 2173 1.1 mrg * b[l + (j + 1) * b_dim1]; 2174 1.1 mrg f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 2175 1.1 mrg * b[l + (j + 2) * b_dim1]; 2176 1.1 mrg f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 2177 1.1 mrg * b[l + (j + 2) * b_dim1]; 2178 1.1 mrg f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 2179 1.1 mrg * b[l + (j + 3) * b_dim1]; 2180 1.1 mrg f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 2181 1.1 mrg * b[l + (j + 3) * b_dim1]; 2182 1.1 mrg } 2183 1.1 mrg c[i + j * c_dim1] = f11; 2184 1.1 mrg c[i + 1 + j * c_dim1] = f21; 2185 1.1 mrg c[i + (j + 1) * c_dim1] = f12; 2186 1.1 mrg c[i + 1 + (j + 1) * c_dim1] = f22; 2187 1.1 mrg c[i + (j + 2) * c_dim1] = f13; 2188 1.1 mrg c[i + 1 + (j + 2) * c_dim1] = f23; 2189 1.1 mrg c[i + (j + 3) * c_dim1] = f14; 2190 1.1 mrg c[i + 1 + (j + 3) * c_dim1] = f24; 2191 1.1 mrg c[i + 2 + j * c_dim1] = f31; 2192 1.1 mrg c[i + 3 + j * c_dim1] = f41; 2193 1.1 mrg c[i + 2 + (j + 1) * c_dim1] = f32; 2194 1.1 mrg c[i + 3 + (j + 1) * c_dim1] = f42; 2195 1.1 mrg c[i + 2 + (j + 2) * c_dim1] = f33; 2196 1.1 mrg c[i + 3 + (j + 2) * c_dim1] = f43; 2197 1.1 mrg c[i + 2 + (j + 3) * c_dim1] = f34; 2198 1.1 mrg c[i + 3 + (j + 3) * c_dim1] = f44; 2199 1.1 mrg } 2200 1.1 mrg if (uisec < isec) 2201 1.1 mrg { 2202 1.1 mrg i5 = ii + isec - 1; 2203 1.1 mrg for (i = ii + uisec; i <= i5; ++i) 2204 1.1 mrg { 2205 1.1 mrg f11 = c[i + j * c_dim1]; 2206 1.1 mrg f12 = c[i + (j + 1) * c_dim1]; 2207 1.1 mrg f13 = c[i + (j + 2) * c_dim1]; 2208 1.1 mrg f14 = c[i + (j + 3) * c_dim1]; 2209 1.1 mrg i6 = ll + lsec - 1; 2210 1.1 mrg for (l = ll; l <= i6; ++l) 2211 1.1 mrg { 2212 1.1 mrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2213 1.1 mrg 257] * b[l + j * b_dim1]; 2214 1.1 mrg f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2215 1.1 mrg 257] * b[l + (j + 1) * b_dim1]; 2216 1.1 mrg f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2217 1.1 mrg 257] * b[l + (j + 2) * b_dim1]; 2218 1.1 mrg f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2219 1.1 mrg 257] * b[l + (j + 3) * b_dim1]; 2220 1.1 mrg } 2221 1.1 mrg c[i + j * c_dim1] = f11; 2222 1.1 mrg c[i + (j + 1) * c_dim1] = f12; 2223 1.1 mrg c[i + (j + 2) * c_dim1] = f13; 2224 1.1 mrg c[i + (j + 3) * c_dim1] = f14; 2225 1.1 mrg } 2226 1.1 mrg } 2227 1.1 mrg } 2228 1.1 mrg if (ujsec < jsec) 2229 1.1 mrg { 2230 1.1 mrg i4 = jj + jsec - 1; 2231 1.1 mrg for (j = jj + ujsec; j <= i4; ++j) 2232 1.1 mrg { 2233 1.1 mrg i5 = ii + uisec - 1; 2234 1.1 mrg for (i = ii; i <= i5; i += 4) 2235 1.1 mrg { 2236 1.1 mrg f11 = c[i + j * c_dim1]; 2237 1.1 mrg f21 = c[i + 1 + j * c_dim1]; 2238 1.1 mrg f31 = c[i + 2 + j * c_dim1]; 2239 1.1 mrg f41 = c[i + 3 + j * c_dim1]; 2240 1.1 mrg i6 = ll + lsec - 1; 2241 1.1 mrg for (l = ll; l <= i6; ++l) 2242 1.1 mrg { 2243 1.1 mrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2244 1.1 mrg 257] * b[l + j * b_dim1]; 2245 1.1 mrg f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 2246 1.1 mrg 257] * b[l + j * b_dim1]; 2247 1.1 mrg f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 2248 1.1 mrg 257] * b[l + j * b_dim1]; 2249 1.1 mrg f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 2250 1.1 mrg 257] * b[l + j * b_dim1]; 2251 1.1 mrg } 2252 1.1 mrg c[i + j * c_dim1] = f11; 2253 1.1 mrg c[i + 1 + j * c_dim1] = f21; 2254 1.1 mrg c[i + 2 + j * c_dim1] = f31; 2255 1.1 mrg c[i + 3 + j * c_dim1] = f41; 2256 1.1 mrg } 2257 1.1 mrg i5 = ii + isec - 1; 2258 1.1 mrg for (i = ii + uisec; i <= i5; ++i) 2259 1.1 mrg { 2260 1.1 mrg f11 = c[i + j * c_dim1]; 2261 1.1 mrg i6 = ll + lsec - 1; 2262 1.1 mrg for (l = ll; l <= i6; ++l) 2263 1.1 mrg { 2264 1.1 mrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2265 1.1 mrg 257] * b[l + j * b_dim1]; 2266 1.1 mrg } 2267 1.1 mrg c[i + j * c_dim1] = f11; 2268 1.1 mrg } 2269 1.1 mrg } 2270 1.1 mrg } 2271 1.1 mrg } 2272 1.1 mrg } 2273 1.1 mrg } 2274 1.1 mrg free(t1); 2275 1.1 mrg return; 2276 1.1 mrg } 2277 1.1 mrg else if (rxstride == 1 && aystride == 1 && bxstride == 1) 2278 1.1 mrg { 2279 1.1 mrg if (GFC_DESCRIPTOR_RANK (a) != 1) 2280 1.1 mrg { 2281 1.1 mrg const GFC_COMPLEX_16 *restrict abase_x; 2282 1.1 mrg const GFC_COMPLEX_16 *restrict bbase_y; 2283 1.1 mrg GFC_COMPLEX_16 *restrict dest_y; 2284 1.1 mrg GFC_COMPLEX_16 s; 2285 1.1 mrg 2286 1.1 mrg for (y = 0; y < ycount; y++) 2287 1.1 mrg { 2288 1.1 mrg bbase_y = &bbase[y*bystride]; 2289 1.1 mrg dest_y = &dest[y*rystride]; 2290 1.1 mrg for (x = 0; x < xcount; x++) 2291 1.1 mrg { 2292 1.1 mrg abase_x = &abase[x*axstride]; 2293 1.1 mrg s = (GFC_COMPLEX_16) 0; 2294 1.1 mrg for (n = 0; n < count; n++) 2295 1.1 mrg s += abase_x[n] * bbase_y[n]; 2296 1.1 mrg dest_y[x] = s; 2297 1.1 mrg } 2298 1.1 mrg } 2299 1.1 mrg } 2300 1.1 mrg else 2301 1.1 mrg { 2302 1.1 mrg const GFC_COMPLEX_16 *restrict bbase_y; 2303 1.1 mrg GFC_COMPLEX_16 s; 2304 1.1 mrg 2305 1.1 mrg for (y = 0; y < ycount; y++) 2306 1.1 mrg { 2307 1.1 mrg bbase_y = &bbase[y*bystride]; 2308 1.1 mrg s = (GFC_COMPLEX_16) 0; 2309 1.1 mrg for (n = 0; n < count; n++) 2310 1.1 mrg s += abase[n*axstride] * bbase_y[n]; 2311 1.1 mrg dest[y*rystride] = s; 2312 1.1 mrg } 2313 1.1 mrg } 2314 1.1 mrg } 2315 1.1 mrg else if (GFC_DESCRIPTOR_RANK (a) == 1) 2316 1.1 mrg { 2317 1.1 mrg const GFC_COMPLEX_16 *restrict bbase_y; 2318 1.1 mrg GFC_COMPLEX_16 s; 2319 1.1 mrg 2320 1.1 mrg for (y = 0; y < ycount; y++) 2321 1.1 mrg { 2322 1.1 mrg bbase_y = &bbase[y*bystride]; 2323 1.1 mrg s = (GFC_COMPLEX_16) 0; 2324 1.1 mrg for (n = 0; n < count; n++) 2325 1.1 mrg s += abase[n*axstride] * bbase_y[n*bxstride]; 2326 1.1 mrg dest[y*rxstride] = s; 2327 1.1 mrg } 2328 1.1 mrg } 2329 1.1.1.2 mrg else if (axstride < aystride) 2330 1.1.1.2 mrg { 2331 1.1.1.2 mrg for (y = 0; y < ycount; y++) 2332 1.1.1.2 mrg for (x = 0; x < xcount; x++) 2333 1.1.1.2 mrg dest[x*rxstride + y*rystride] = (GFC_COMPLEX_16)0; 2334 1.1.1.2 mrg 2335 1.1.1.2 mrg for (y = 0; y < ycount; y++) 2336 1.1.1.2 mrg for (n = 0; n < count; n++) 2337 1.1.1.2 mrg for (x = 0; x < xcount; x++) 2338 1.1.1.2 mrg /* dest[x,y] += a[x,n] * b[n,y] */ 2339 1.1.1.2 mrg dest[x*rxstride + y*rystride] += 2340 1.1.1.2 mrg abase[x*axstride + n*aystride] * 2341 1.1.1.2 mrg bbase[n*bxstride + y*bystride]; 2342 1.1.1.2 mrg } 2343 1.1 mrg else 2344 1.1 mrg { 2345 1.1 mrg const GFC_COMPLEX_16 *restrict abase_x; 2346 1.1 mrg const GFC_COMPLEX_16 *restrict bbase_y; 2347 1.1 mrg GFC_COMPLEX_16 *restrict dest_y; 2348 1.1 mrg GFC_COMPLEX_16 s; 2349 1.1 mrg 2350 1.1 mrg for (y = 0; y < ycount; y++) 2351 1.1 mrg { 2352 1.1 mrg bbase_y = &bbase[y*bystride]; 2353 1.1 mrg dest_y = &dest[y*rystride]; 2354 1.1 mrg for (x = 0; x < xcount; x++) 2355 1.1 mrg { 2356 1.1 mrg abase_x = &abase[x*axstride]; 2357 1.1 mrg s = (GFC_COMPLEX_16) 0; 2358 1.1 mrg for (n = 0; n < count; n++) 2359 1.1 mrg s += abase_x[n*aystride] * bbase_y[n*bxstride]; 2360 1.1 mrg dest_y[x*rxstride] = s; 2361 1.1 mrg } 2362 1.1 mrg } 2363 1.1 mrg } 2364 1.1 mrg } 2365 1.1 mrg #undef POW3 2366 1.1 mrg #undef min 2367 1.1 mrg #undef max 2368 1.1 mrg 2369 1.1 mrg 2370 1.1 mrg /* Compiling main function, with selection code for the processor. */ 2371 1.1 mrg 2372 1.1 mrg /* Currently, this is i386 only. Adjust for other architectures. */ 2373 1.1 mrg 2374 1.1 mrg void matmul_c16 (gfc_array_c16 * const restrict retarray, 2375 1.1 mrg gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, 2376 1.1 mrg int blas_limit, blas_call gemm) 2377 1.1 mrg { 2378 1.1 mrg static void (*matmul_p) (gfc_array_c16 * const restrict retarray, 2379 1.1 mrg gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, 2380 1.1 mrg int blas_limit, blas_call gemm); 2381 1.1 mrg 2382 1.1 mrg void (*matmul_fn) (gfc_array_c16 * const restrict retarray, 2383 1.1 mrg gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, 2384 1.1 mrg int blas_limit, blas_call gemm); 2385 1.1 mrg 2386 1.1 mrg matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED); 2387 1.1 mrg if (matmul_fn == NULL) 2388 1.1 mrg { 2389 1.1 mrg matmul_fn = matmul_c16_vanilla; 2390 1.1.1.3 mrg if (__builtin_cpu_is ("intel")) 2391 1.1 mrg { 2392 1.1 mrg /* Run down the available processors in order of preference. */ 2393 1.1 mrg #ifdef HAVE_AVX512F 2394 1.1.1.3 mrg if (__builtin_cpu_supports ("avx512f")) 2395 1.1 mrg { 2396 1.1 mrg matmul_fn = matmul_c16_avx512f; 2397 1.1 mrg goto store; 2398 1.1 mrg } 2399 1.1 mrg 2400 1.1 mrg #endif /* HAVE_AVX512F */ 2401 1.1 mrg 2402 1.1 mrg #ifdef HAVE_AVX2 2403 1.1.1.3 mrg if (__builtin_cpu_supports ("avx2") 2404 1.1.1.3 mrg && __builtin_cpu_supports ("fma")) 2405 1.1 mrg { 2406 1.1 mrg matmul_fn = matmul_c16_avx2; 2407 1.1 mrg goto store; 2408 1.1 mrg } 2409 1.1 mrg 2410 1.1 mrg #endif 2411 1.1 mrg 2412 1.1 mrg #ifdef HAVE_AVX 2413 1.1.1.3 mrg if (__builtin_cpu_supports ("avx")) 2414 1.1 mrg { 2415 1.1 mrg matmul_fn = matmul_c16_avx; 2416 1.1 mrg goto store; 2417 1.1 mrg } 2418 1.1 mrg #endif /* HAVE_AVX */ 2419 1.1 mrg } 2420 1.1.1.3 mrg else if (__builtin_cpu_is ("amd")) 2421 1.1 mrg { 2422 1.1 mrg #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128) 2423 1.1.1.3 mrg if (__builtin_cpu_supports ("avx") 2424 1.1.1.3 mrg && __builtin_cpu_supports ("fma")) 2425 1.1 mrg { 2426 1.1 mrg matmul_fn = matmul_c16_avx128_fma3; 2427 1.1 mrg goto store; 2428 1.1 mrg } 2429 1.1 mrg #endif 2430 1.1 mrg #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128) 2431 1.1.1.3 mrg if (__builtin_cpu_supports ("avx") 2432 1.1.1.3 mrg && __builtin_cpu_supports ("fma4")) 2433 1.1 mrg { 2434 1.1 mrg matmul_fn = matmul_c16_avx128_fma4; 2435 1.1 mrg goto store; 2436 1.1 mrg } 2437 1.1 mrg #endif 2438 1.1 mrg 2439 1.1 mrg } 2440 1.1 mrg store: 2441 1.1 mrg __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED); 2442 1.1 mrg } 2443 1.1 mrg 2444 1.1 mrg (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm); 2445 1.1 mrg } 2446 1.1 mrg 2447 1.1 mrg #else /* Just the vanilla function. */ 2448 1.1 mrg 2449 1.1 mrg void 2450 1.1 mrg matmul_c16 (gfc_array_c16 * const restrict retarray, 2451 1.1 mrg gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, 2452 1.1 mrg int blas_limit, blas_call gemm) 2453 1.1 mrg { 2454 1.1 mrg const GFC_COMPLEX_16 * restrict abase; 2455 1.1 mrg const GFC_COMPLEX_16 * restrict bbase; 2456 1.1 mrg GFC_COMPLEX_16 * restrict dest; 2457 1.1 mrg 2458 1.1 mrg index_type rxstride, rystride, axstride, aystride, bxstride, bystride; 2459 1.1 mrg index_type x, y, n, count, xcount, ycount; 2460 1.1 mrg 2461 1.1 mrg assert (GFC_DESCRIPTOR_RANK (a) == 2 2462 1.1 mrg || GFC_DESCRIPTOR_RANK (b) == 2); 2463 1.1 mrg 2464 1.1 mrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount] 2465 1.1 mrg 2466 1.1 mrg Either A or B (but not both) can be rank 1: 2467 1.1 mrg 2468 1.1 mrg o One-dimensional argument A is implicitly treated as a row matrix 2469 1.1 mrg dimensioned [1,count], so xcount=1. 2470 1.1 mrg 2471 1.1 mrg o One-dimensional argument B is implicitly treated as a column matrix 2472 1.1 mrg dimensioned [count, 1], so ycount=1. 2473 1.1 mrg */ 2474 1.1 mrg 2475 1.1 mrg if (retarray->base_addr == NULL) 2476 1.1 mrg { 2477 1.1 mrg if (GFC_DESCRIPTOR_RANK (a) == 1) 2478 1.1 mrg { 2479 1.1 mrg GFC_DIMENSION_SET(retarray->dim[0], 0, 2480 1.1 mrg GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); 2481 1.1 mrg } 2482 1.1 mrg else if (GFC_DESCRIPTOR_RANK (b) == 1) 2483 1.1 mrg { 2484 1.1 mrg GFC_DIMENSION_SET(retarray->dim[0], 0, 2485 1.1 mrg GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 2486 1.1 mrg } 2487 1.1 mrg else 2488 1.1 mrg { 2489 1.1 mrg GFC_DIMENSION_SET(retarray->dim[0], 0, 2490 1.1 mrg GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 2491 1.1 mrg 2492 1.1 mrg GFC_DIMENSION_SET(retarray->dim[1], 0, 2493 1.1 mrg GFC_DESCRIPTOR_EXTENT(b,1) - 1, 2494 1.1 mrg GFC_DESCRIPTOR_EXTENT(retarray,0)); 2495 1.1 mrg } 2496 1.1 mrg 2497 1.1 mrg retarray->base_addr 2498 1.1 mrg = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_16)); 2499 1.1 mrg retarray->offset = 0; 2500 1.1 mrg } 2501 1.1 mrg else if (unlikely (compile_options.bounds_check)) 2502 1.1 mrg { 2503 1.1 mrg index_type ret_extent, arg_extent; 2504 1.1 mrg 2505 1.1 mrg if (GFC_DESCRIPTOR_RANK (a) == 1) 2506 1.1 mrg { 2507 1.1 mrg arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 2508 1.1 mrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 2509 1.1 mrg if (arg_extent != ret_extent) 2510 1.1 mrg runtime_error ("Array bound mismatch for dimension 1 of " 2511 1.1 mrg "array (%ld/%ld) ", 2512 1.1 mrg (long int) ret_extent, (long int) arg_extent); 2513 1.1 mrg } 2514 1.1 mrg else if (GFC_DESCRIPTOR_RANK (b) == 1) 2515 1.1 mrg { 2516 1.1 mrg arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 2517 1.1 mrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 2518 1.1 mrg if (arg_extent != ret_extent) 2519 1.1 mrg runtime_error ("Array bound mismatch for dimension 1 of " 2520 1.1 mrg "array (%ld/%ld) ", 2521 1.1 mrg (long int) ret_extent, (long int) arg_extent); 2522 1.1 mrg } 2523 1.1 mrg else 2524 1.1 mrg { 2525 1.1 mrg arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 2526 1.1 mrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 2527 1.1 mrg if (arg_extent != ret_extent) 2528 1.1 mrg runtime_error ("Array bound mismatch for dimension 1 of " 2529 1.1 mrg "array (%ld/%ld) ", 2530 1.1 mrg (long int) ret_extent, (long int) arg_extent); 2531 1.1 mrg 2532 1.1 mrg arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 2533 1.1 mrg ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); 2534 1.1 mrg if (arg_extent != ret_extent) 2535 1.1 mrg runtime_error ("Array bound mismatch for dimension 2 of " 2536 1.1 mrg "array (%ld/%ld) ", 2537 1.1 mrg (long int) ret_extent, (long int) arg_extent); 2538 1.1 mrg } 2539 1.1 mrg } 2540 1.1 mrg 2541 1.1 mrg 2542 1.1 mrg if (GFC_DESCRIPTOR_RANK (retarray) == 1) 2543 1.1 mrg { 2544 1.1 mrg /* One-dimensional result may be addressed in the code below 2545 1.1 mrg either as a row or a column matrix. We want both cases to 2546 1.1 mrg work. */ 2547 1.1 mrg rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); 2548 1.1 mrg } 2549 1.1 mrg else 2550 1.1 mrg { 2551 1.1 mrg rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); 2552 1.1 mrg rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); 2553 1.1 mrg } 2554 1.1 mrg 2555 1.1 mrg 2556 1.1 mrg if (GFC_DESCRIPTOR_RANK (a) == 1) 2557 1.1 mrg { 2558 1.1 mrg /* Treat it as a a row matrix A[1,count]. */ 2559 1.1 mrg axstride = GFC_DESCRIPTOR_STRIDE(a,0); 2560 1.1 mrg aystride = 1; 2561 1.1 mrg 2562 1.1 mrg xcount = 1; 2563 1.1 mrg count = GFC_DESCRIPTOR_EXTENT(a,0); 2564 1.1 mrg } 2565 1.1 mrg else 2566 1.1 mrg { 2567 1.1 mrg axstride = GFC_DESCRIPTOR_STRIDE(a,0); 2568 1.1 mrg aystride = GFC_DESCRIPTOR_STRIDE(a,1); 2569 1.1 mrg 2570 1.1 mrg count = GFC_DESCRIPTOR_EXTENT(a,1); 2571 1.1 mrg xcount = GFC_DESCRIPTOR_EXTENT(a,0); 2572 1.1 mrg } 2573 1.1 mrg 2574 1.1 mrg if (count != GFC_DESCRIPTOR_EXTENT(b,0)) 2575 1.1 mrg { 2576 1.1 mrg if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) 2577 1.1 mrg runtime_error ("Incorrect extent in argument B in MATMUL intrinsic " 2578 1.1 mrg "in dimension 1: is %ld, should be %ld", 2579 1.1 mrg (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count); 2580 1.1 mrg } 2581 1.1 mrg 2582 1.1 mrg if (GFC_DESCRIPTOR_RANK (b) == 1) 2583 1.1 mrg { 2584 1.1 mrg /* Treat it as a column matrix B[count,1] */ 2585 1.1 mrg bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 2586 1.1 mrg 2587 1.1 mrg /* bystride should never be used for 1-dimensional b. 2588 1.1 mrg The value is only used for calculation of the 2589 1.1 mrg memory by the buffer. */ 2590 1.1 mrg bystride = 256; 2591 1.1 mrg ycount = 1; 2592 1.1 mrg } 2593 1.1 mrg else 2594 1.1 mrg { 2595 1.1 mrg bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 2596 1.1 mrg bystride = GFC_DESCRIPTOR_STRIDE(b,1); 2597 1.1 mrg ycount = GFC_DESCRIPTOR_EXTENT(b,1); 2598 1.1 mrg } 2599 1.1 mrg 2600 1.1 mrg abase = a->base_addr; 2601 1.1 mrg bbase = b->base_addr; 2602 1.1 mrg dest = retarray->base_addr; 2603 1.1 mrg 2604 1.1 mrg /* Now that everything is set up, we perform the multiplication 2605 1.1 mrg itself. */ 2606 1.1 mrg 2607 1.1 mrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) 2608 1.1 mrg #define min(a,b) ((a) <= (b) ? (a) : (b)) 2609 1.1 mrg #define max(a,b) ((a) >= (b) ? (a) : (b)) 2610 1.1 mrg 2611 1.1 mrg if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) 2612 1.1 mrg && (bxstride == 1 || bystride == 1) 2613 1.1 mrg && (((float) xcount) * ((float) ycount) * ((float) count) 2614 1.1 mrg > POW3(blas_limit))) 2615 1.1 mrg { 2616 1.1 mrg const int m = xcount, n = ycount, k = count, ldc = rystride; 2617 1.1 mrg const GFC_COMPLEX_16 one = 1, zero = 0; 2618 1.1 mrg const int lda = (axstride == 1) ? aystride : axstride, 2619 1.1 mrg ldb = (bxstride == 1) ? bystride : bxstride; 2620 1.1 mrg 2621 1.1 mrg if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) 2622 1.1 mrg { 2623 1.1 mrg assert (gemm != NULL); 2624 1.1 mrg const char *transa, *transb; 2625 1.1 mrg if (try_blas & 2) 2626 1.1 mrg transa = "C"; 2627 1.1 mrg else 2628 1.1 mrg transa = axstride == 1 ? "N" : "T"; 2629 1.1 mrg 2630 1.1 mrg if (try_blas & 4) 2631 1.1 mrg transb = "C"; 2632 1.1 mrg else 2633 1.1 mrg transb = bxstride == 1 ? "N" : "T"; 2634 1.1 mrg 2635 1.1 mrg gemm (transa, transb , &m, 2636 1.1 mrg &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, 2637 1.1 mrg &ldc, 1, 1); 2638 1.1 mrg return; 2639 1.1 mrg } 2640 1.1 mrg } 2641 1.1 mrg 2642 1.1.1.2 mrg if (rxstride == 1 && axstride == 1 && bxstride == 1 2643 1.1.1.2 mrg && GFC_DESCRIPTOR_RANK (b) != 1) 2644 1.1 mrg { 2645 1.1 mrg /* This block of code implements a tuned matmul, derived from 2646 1.1 mrg Superscalar GEMM-based level 3 BLAS, Beta version 0.1 2647 1.1 mrg 2648 1.1 mrg Bo Kagstrom and Per Ling 2649 1.1 mrg Department of Computing Science 2650 1.1 mrg Umea University 2651 1.1 mrg S-901 87 Umea, Sweden 2652 1.1 mrg 2653 1.1 mrg from netlib.org, translated to C, and modified for matmul.m4. */ 2654 1.1 mrg 2655 1.1 mrg const GFC_COMPLEX_16 *a, *b; 2656 1.1 mrg GFC_COMPLEX_16 *c; 2657 1.1 mrg const index_type m = xcount, n = ycount, k = count; 2658 1.1 mrg 2659 1.1 mrg /* System generated locals */ 2660 1.1 mrg index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, 2661 1.1 mrg i1, i2, i3, i4, i5, i6; 2662 1.1 mrg 2663 1.1 mrg /* Local variables */ 2664 1.1 mrg GFC_COMPLEX_16 f11, f12, f21, f22, f31, f32, f41, f42, 2665 1.1 mrg f13, f14, f23, f24, f33, f34, f43, f44; 2666 1.1 mrg index_type i, j, l, ii, jj, ll; 2667 1.1 mrg index_type isec, jsec, lsec, uisec, ujsec, ulsec; 2668 1.1 mrg GFC_COMPLEX_16 *t1; 2669 1.1 mrg 2670 1.1 mrg a = abase; 2671 1.1 mrg b = bbase; 2672 1.1 mrg c = retarray->base_addr; 2673 1.1 mrg 2674 1.1 mrg /* Parameter adjustments */ 2675 1.1 mrg c_dim1 = rystride; 2676 1.1 mrg c_offset = 1 + c_dim1; 2677 1.1 mrg c -= c_offset; 2678 1.1 mrg a_dim1 = aystride; 2679 1.1 mrg a_offset = 1 + a_dim1; 2680 1.1 mrg a -= a_offset; 2681 1.1 mrg b_dim1 = bystride; 2682 1.1 mrg b_offset = 1 + b_dim1; 2683 1.1 mrg b -= b_offset; 2684 1.1 mrg 2685 1.1 mrg /* Empty c first. */ 2686 1.1 mrg for (j=1; j<=n; j++) 2687 1.1 mrg for (i=1; i<=m; i++) 2688 1.1 mrg c[i + j * c_dim1] = (GFC_COMPLEX_16)0; 2689 1.1 mrg 2690 1.1 mrg /* Early exit if possible */ 2691 1.1 mrg if (m == 0 || n == 0 || k == 0) 2692 1.1 mrg return; 2693 1.1 mrg 2694 1.1 mrg /* Adjust size of t1 to what is needed. */ 2695 1.1 mrg index_type t1_dim, a_sz; 2696 1.1 mrg if (aystride == 1) 2697 1.1 mrg a_sz = rystride; 2698 1.1 mrg else 2699 1.1 mrg a_sz = a_dim1; 2700 1.1 mrg 2701 1.1 mrg t1_dim = a_sz * 256 + b_dim1; 2702 1.1 mrg if (t1_dim > 65536) 2703 1.1 mrg t1_dim = 65536; 2704 1.1 mrg 2705 1.1 mrg t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_16)); 2706 1.1 mrg 2707 1.1 mrg /* Start turning the crank. */ 2708 1.1 mrg i1 = n; 2709 1.1 mrg for (jj = 1; jj <= i1; jj += 512) 2710 1.1 mrg { 2711 1.1 mrg /* Computing MIN */ 2712 1.1 mrg i2 = 512; 2713 1.1 mrg i3 = n - jj + 1; 2714 1.1 mrg jsec = min(i2,i3); 2715 1.1 mrg ujsec = jsec - jsec % 4; 2716 1.1 mrg i2 = k; 2717 1.1 mrg for (ll = 1; ll <= i2; ll += 256) 2718 1.1 mrg { 2719 1.1 mrg /* Computing MIN */ 2720 1.1 mrg i3 = 256; 2721 1.1 mrg i4 = k - ll + 1; 2722 1.1 mrg lsec = min(i3,i4); 2723 1.1 mrg ulsec = lsec - lsec % 2; 2724 1.1 mrg 2725 1.1 mrg i3 = m; 2726 1.1 mrg for (ii = 1; ii <= i3; ii += 256) 2727 1.1 mrg { 2728 1.1 mrg /* Computing MIN */ 2729 1.1 mrg i4 = 256; 2730 1.1 mrg i5 = m - ii + 1; 2731 1.1 mrg isec = min(i4,i5); 2732 1.1 mrg uisec = isec - isec % 2; 2733 1.1 mrg i4 = ll + ulsec - 1; 2734 1.1 mrg for (l = ll; l <= i4; l += 2) 2735 1.1 mrg { 2736 1.1 mrg i5 = ii + uisec - 1; 2737 1.1 mrg for (i = ii; i <= i5; i += 2) 2738 1.1 mrg { 2739 1.1 mrg t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = 2740 1.1 mrg a[i + l * a_dim1]; 2741 1.1 mrg t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = 2742 1.1 mrg a[i + (l + 1) * a_dim1]; 2743 1.1 mrg t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = 2744 1.1 mrg a[i + 1 + l * a_dim1]; 2745 1.1 mrg t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = 2746 1.1 mrg a[i + 1 + (l + 1) * a_dim1]; 2747 1.1 mrg } 2748 1.1 mrg if (uisec < isec) 2749 1.1 mrg { 2750 1.1 mrg t1[l - ll + 1 + (isec << 8) - 257] = 2751 1.1 mrg a[ii + isec - 1 + l * a_dim1]; 2752 1.1 mrg t1[l - ll + 2 + (isec << 8) - 257] = 2753 1.1 mrg a[ii + isec - 1 + (l + 1) * a_dim1]; 2754 1.1 mrg } 2755 1.1 mrg } 2756 1.1 mrg if (ulsec < lsec) 2757 1.1 mrg { 2758 1.1 mrg i4 = ii + isec - 1; 2759 1.1 mrg for (i = ii; i<= i4; ++i) 2760 1.1 mrg { 2761 1.1 mrg t1[lsec + ((i - ii + 1) << 8) - 257] = 2762 1.1 mrg a[i + (ll + lsec - 1) * a_dim1]; 2763 1.1 mrg } 2764 1.1 mrg } 2765 1.1 mrg 2766 1.1 mrg uisec = isec - isec % 4; 2767 1.1 mrg i4 = jj + ujsec - 1; 2768 1.1 mrg for (j = jj; j <= i4; j += 4) 2769 1.1 mrg { 2770 1.1 mrg i5 = ii + uisec - 1; 2771 1.1 mrg for (i = ii; i <= i5; i += 4) 2772 1.1 mrg { 2773 1.1 mrg f11 = c[i + j * c_dim1]; 2774 1.1 mrg f21 = c[i + 1 + j * c_dim1]; 2775 1.1 mrg f12 = c[i + (j + 1) * c_dim1]; 2776 1.1 mrg f22 = c[i + 1 + (j + 1) * c_dim1]; 2777 1.1 mrg f13 = c[i + (j + 2) * c_dim1]; 2778 1.1 mrg f23 = c[i + 1 + (j + 2) * c_dim1]; 2779 1.1 mrg f14 = c[i + (j + 3) * c_dim1]; 2780 1.1 mrg f24 = c[i + 1 + (j + 3) * c_dim1]; 2781 1.1 mrg f31 = c[i + 2 + j * c_dim1]; 2782 1.1 mrg f41 = c[i + 3 + j * c_dim1]; 2783 1.1 mrg f32 = c[i + 2 + (j + 1) * c_dim1]; 2784 1.1 mrg f42 = c[i + 3 + (j + 1) * c_dim1]; 2785 1.1 mrg f33 = c[i + 2 + (j + 2) * c_dim1]; 2786 1.1 mrg f43 = c[i + 3 + (j + 2) * c_dim1]; 2787 1.1 mrg f34 = c[i + 2 + (j + 3) * c_dim1]; 2788 1.1 mrg f44 = c[i + 3 + (j + 3) * c_dim1]; 2789 1.1 mrg i6 = ll + lsec - 1; 2790 1.1 mrg for (l = ll; l <= i6; ++l) 2791 1.1 mrg { 2792 1.1 mrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2793 1.1 mrg * b[l + j * b_dim1]; 2794 1.1 mrg f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2795 1.1 mrg * b[l + j * b_dim1]; 2796 1.1 mrg f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2797 1.1 mrg * b[l + (j + 1) * b_dim1]; 2798 1.1 mrg f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2799 1.1 mrg * b[l + (j + 1) * b_dim1]; 2800 1.1 mrg f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2801 1.1 mrg * b[l + (j + 2) * b_dim1]; 2802 1.1 mrg f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2803 1.1 mrg * b[l + (j + 2) * b_dim1]; 2804 1.1 mrg f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2805 1.1 mrg * b[l + (j + 3) * b_dim1]; 2806 1.1 mrg f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2807 1.1 mrg * b[l + (j + 3) * b_dim1]; 2808 1.1 mrg f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 2809 1.1 mrg * b[l + j * b_dim1]; 2810 1.1 mrg f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 2811 1.1 mrg * b[l + j * b_dim1]; 2812 1.1 mrg f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 2813 1.1 mrg * b[l + (j + 1) * b_dim1]; 2814 1.1 mrg f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 2815 1.1 mrg * b[l + (j + 1) * b_dim1]; 2816 1.1 mrg f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 2817 1.1 mrg * b[l + (j + 2) * b_dim1]; 2818 1.1 mrg f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 2819 1.1 mrg * b[l + (j + 2) * b_dim1]; 2820 1.1 mrg f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 2821 1.1 mrg * b[l + (j + 3) * b_dim1]; 2822 1.1 mrg f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 2823 1.1 mrg * b[l + (j + 3) * b_dim1]; 2824 1.1 mrg } 2825 1.1 mrg c[i + j * c_dim1] = f11; 2826 1.1 mrg c[i + 1 + j * c_dim1] = f21; 2827 1.1 mrg c[i + (j + 1) * c_dim1] = f12; 2828 1.1 mrg c[i + 1 + (j + 1) * c_dim1] = f22; 2829 1.1 mrg c[i + (j + 2) * c_dim1] = f13; 2830 1.1 mrg c[i + 1 + (j + 2) * c_dim1] = f23; 2831 1.1 mrg c[i + (j + 3) * c_dim1] = f14; 2832 1.1 mrg c[i + 1 + (j + 3) * c_dim1] = f24; 2833 1.1 mrg c[i + 2 + j * c_dim1] = f31; 2834 1.1 mrg c[i + 3 + j * c_dim1] = f41; 2835 1.1 mrg c[i + 2 + (j + 1) * c_dim1] = f32; 2836 1.1 mrg c[i + 3 + (j + 1) * c_dim1] = f42; 2837 1.1 mrg c[i + 2 + (j + 2) * c_dim1] = f33; 2838 1.1 mrg c[i + 3 + (j + 2) * c_dim1] = f43; 2839 1.1 mrg c[i + 2 + (j + 3) * c_dim1] = f34; 2840 1.1 mrg c[i + 3 + (j + 3) * c_dim1] = f44; 2841 1.1 mrg } 2842 1.1 mrg if (uisec < isec) 2843 1.1 mrg { 2844 1.1 mrg i5 = ii + isec - 1; 2845 1.1 mrg for (i = ii + uisec; i <= i5; ++i) 2846 1.1 mrg { 2847 1.1 mrg f11 = c[i + j * c_dim1]; 2848 1.1 mrg f12 = c[i + (j + 1) * c_dim1]; 2849 1.1 mrg f13 = c[i + (j + 2) * c_dim1]; 2850 1.1 mrg f14 = c[i + (j + 3) * c_dim1]; 2851 1.1 mrg i6 = ll + lsec - 1; 2852 1.1 mrg for (l = ll; l <= i6; ++l) 2853 1.1 mrg { 2854 1.1 mrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2855 1.1 mrg 257] * b[l + j * b_dim1]; 2856 1.1 mrg f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2857 1.1 mrg 257] * b[l + (j + 1) * b_dim1]; 2858 1.1 mrg f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2859 1.1 mrg 257] * b[l + (j + 2) * b_dim1]; 2860 1.1 mrg f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2861 1.1 mrg 257] * b[l + (j + 3) * b_dim1]; 2862 1.1 mrg } 2863 1.1 mrg c[i + j * c_dim1] = f11; 2864 1.1 mrg c[i + (j + 1) * c_dim1] = f12; 2865 1.1 mrg c[i + (j + 2) * c_dim1] = f13; 2866 1.1 mrg c[i + (j + 3) * c_dim1] = f14; 2867 1.1 mrg } 2868 1.1 mrg } 2869 1.1 mrg } 2870 1.1 mrg if (ujsec < jsec) 2871 1.1 mrg { 2872 1.1 mrg i4 = jj + jsec - 1; 2873 1.1 mrg for (j = jj + ujsec; j <= i4; ++j) 2874 1.1 mrg { 2875 1.1 mrg i5 = ii + uisec - 1; 2876 1.1 mrg for (i = ii; i <= i5; i += 4) 2877 1.1 mrg { 2878 1.1 mrg f11 = c[i + j * c_dim1]; 2879 1.1 mrg f21 = c[i + 1 + j * c_dim1]; 2880 1.1 mrg f31 = c[i + 2 + j * c_dim1]; 2881 1.1 mrg f41 = c[i + 3 + j * c_dim1]; 2882 1.1 mrg i6 = ll + lsec - 1; 2883 1.1 mrg for (l = ll; l <= i6; ++l) 2884 1.1 mrg { 2885 1.1 mrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2886 1.1 mrg 257] * b[l + j * b_dim1]; 2887 1.1 mrg f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 2888 1.1 mrg 257] * b[l + j * b_dim1]; 2889 1.1 mrg f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 2890 1.1 mrg 257] * b[l + j * b_dim1]; 2891 1.1 mrg f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 2892 1.1 mrg 257] * b[l + j * b_dim1]; 2893 1.1 mrg } 2894 1.1 mrg c[i + j * c_dim1] = f11; 2895 1.1 mrg c[i + 1 + j * c_dim1] = f21; 2896 1.1 mrg c[i + 2 + j * c_dim1] = f31; 2897 1.1 mrg c[i + 3 + j * c_dim1] = f41; 2898 1.1 mrg } 2899 1.1 mrg i5 = ii + isec - 1; 2900 1.1 mrg for (i = ii + uisec; i <= i5; ++i) 2901 1.1 mrg { 2902 1.1 mrg f11 = c[i + j * c_dim1]; 2903 1.1 mrg i6 = ll + lsec - 1; 2904 1.1 mrg for (l = ll; l <= i6; ++l) 2905 1.1 mrg { 2906 1.1 mrg f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2907 1.1 mrg 257] * b[l + j * b_dim1]; 2908 1.1 mrg } 2909 1.1 mrg c[i + j * c_dim1] = f11; 2910 1.1 mrg } 2911 1.1 mrg } 2912 1.1 mrg } 2913 1.1 mrg } 2914 1.1 mrg } 2915 1.1 mrg } 2916 1.1 mrg free(t1); 2917 1.1 mrg return; 2918 1.1 mrg } 2919 1.1 mrg else if (rxstride == 1 && aystride == 1 && bxstride == 1) 2920 1.1 mrg { 2921 1.1 mrg if (GFC_DESCRIPTOR_RANK (a) != 1) 2922 1.1 mrg { 2923 1.1 mrg const GFC_COMPLEX_16 *restrict abase_x; 2924 1.1 mrg const GFC_COMPLEX_16 *restrict bbase_y; 2925 1.1 mrg GFC_COMPLEX_16 *restrict dest_y; 2926 1.1 mrg GFC_COMPLEX_16 s; 2927 1.1 mrg 2928 1.1 mrg for (y = 0; y < ycount; y++) 2929 1.1 mrg { 2930 1.1 mrg bbase_y = &bbase[y*bystride]; 2931 1.1 mrg dest_y = &dest[y*rystride]; 2932 1.1 mrg for (x = 0; x < xcount; x++) 2933 1.1 mrg { 2934 1.1 mrg abase_x = &abase[x*axstride]; 2935 1.1 mrg s = (GFC_COMPLEX_16) 0; 2936 1.1 mrg for (n = 0; n < count; n++) 2937 1.1 mrg s += abase_x[n] * bbase_y[n]; 2938 1.1 mrg dest_y[x] = s; 2939 1.1 mrg } 2940 1.1 mrg } 2941 1.1 mrg } 2942 1.1 mrg else 2943 1.1 mrg { 2944 1.1 mrg const GFC_COMPLEX_16 *restrict bbase_y; 2945 1.1 mrg GFC_COMPLEX_16 s; 2946 1.1 mrg 2947 1.1 mrg for (y = 0; y < ycount; y++) 2948 1.1 mrg { 2949 1.1 mrg bbase_y = &bbase[y*bystride]; 2950 1.1 mrg s = (GFC_COMPLEX_16) 0; 2951 1.1 mrg for (n = 0; n < count; n++) 2952 1.1 mrg s += abase[n*axstride] * bbase_y[n]; 2953 1.1 mrg dest[y*rystride] = s; 2954 1.1 mrg } 2955 1.1 mrg } 2956 1.1 mrg } 2957 1.1 mrg else if (GFC_DESCRIPTOR_RANK (a) == 1) 2958 1.1 mrg { 2959 1.1 mrg const GFC_COMPLEX_16 *restrict bbase_y; 2960 1.1 mrg GFC_COMPLEX_16 s; 2961 1.1 mrg 2962 1.1 mrg for (y = 0; y < ycount; y++) 2963 1.1 mrg { 2964 1.1 mrg bbase_y = &bbase[y*bystride]; 2965 1.1 mrg s = (GFC_COMPLEX_16) 0; 2966 1.1 mrg for (n = 0; n < count; n++) 2967 1.1 mrg s += abase[n*axstride] * bbase_y[n*bxstride]; 2968 1.1 mrg dest[y*rxstride] = s; 2969 1.1 mrg } 2970 1.1 mrg } 2971 1.1.1.2 mrg else if (axstride < aystride) 2972 1.1.1.2 mrg { 2973 1.1.1.2 mrg for (y = 0; y < ycount; y++) 2974 1.1.1.2 mrg for (x = 0; x < xcount; x++) 2975 1.1.1.2 mrg dest[x*rxstride + y*rystride] = (GFC_COMPLEX_16)0; 2976 1.1.1.2 mrg 2977 1.1.1.2 mrg for (y = 0; y < ycount; y++) 2978 1.1.1.2 mrg for (n = 0; n < count; n++) 2979 1.1.1.2 mrg for (x = 0; x < xcount; x++) 2980 1.1.1.2 mrg /* dest[x,y] += a[x,n] * b[n,y] */ 2981 1.1.1.2 mrg dest[x*rxstride + y*rystride] += 2982 1.1.1.2 mrg abase[x*axstride + n*aystride] * 2983 1.1.1.2 mrg bbase[n*bxstride + y*bystride]; 2984 1.1.1.2 mrg } 2985 1.1 mrg else 2986 1.1 mrg { 2987 1.1 mrg const GFC_COMPLEX_16 *restrict abase_x; 2988 1.1 mrg const GFC_COMPLEX_16 *restrict bbase_y; 2989 1.1 mrg GFC_COMPLEX_16 *restrict dest_y; 2990 1.1 mrg GFC_COMPLEX_16 s; 2991 1.1 mrg 2992 1.1 mrg for (y = 0; y < ycount; y++) 2993 1.1 mrg { 2994 1.1 mrg bbase_y = &bbase[y*bystride]; 2995 1.1 mrg dest_y = &dest[y*rystride]; 2996 1.1 mrg for (x = 0; x < xcount; x++) 2997 1.1 mrg { 2998 1.1 mrg abase_x = &abase[x*axstride]; 2999 1.1 mrg s = (GFC_COMPLEX_16) 0; 3000 1.1 mrg for (n = 0; n < count; n++) 3001 1.1 mrg s += abase_x[n*aystride] * bbase_y[n*bxstride]; 3002 1.1 mrg dest_y[x*rxstride] = s; 3003 1.1 mrg } 3004 1.1 mrg } 3005 1.1 mrg } 3006 1.1 mrg } 3007 1.1 mrg #undef POW3 3008 1.1 mrg #undef min 3009 1.1 mrg #undef max 3010 1.1 mrg 3011 1.1 mrg #endif 3012 1.1 mrg #endif 3013 1.1 mrg 3014