Home | History | Annotate | Line # | Download | only in generated
      1 /* Implementation of the MATMUL intrinsic
      2    Copyright (C) 2002-2024 Free Software Foundation, Inc.
      3    Contributed by Thomas Koenig <tkoenig (at) gcc.gnu.org>.
      4 
      5 This file is part of the GNU Fortran runtime library (libgfortran).
      6 
      7 Libgfortran is free software; you can redistribute it and/or
      8 modify it under the terms of the GNU General Public
      9 License as published by the Free Software Foundation; either
     10 version 3 of the License, or (at your option) any later version.
     11 
     12 Libgfortran is distributed in the hope that it will be useful,
     13 but WITHOUT ANY WARRANTY; without even the implied warranty of
     14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     15 GNU General Public License for more details.
     16 
     17 Under Section 7 of GPL version 3, you are granted additional
     18 permissions described in the GCC Runtime Library Exception, version
     19 3.1, as published by the Free Software Foundation.
     20 
     21 You should have received a copy of the GNU General Public License and
     22 a copy of the GCC Runtime Library Exception along with this program;
     23 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     24 <http://www.gnu.org/licenses/>.  */
     25 
     26 #include "libgfortran.h"
     27 #include <string.h>
     28 #include <assert.h>
     29 
     30 
     31 /* These are the specific versions of matmul with -mprefer-avx128.  */
     32 
     33 #if defined (HAVE_GFC_REAL_16)
     34 
     35 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
     36    passed to us by the front-end, in which case we call it for large
     37    matrices.  */
     38 
     39 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
     40                           const int *, const GFC_REAL_16 *, const GFC_REAL_16 *,
     41                           const int *, const GFC_REAL_16 *, const int *,
     42                           const GFC_REAL_16 *, GFC_REAL_16 *, const int *,
     43                           int, int);
     44 
     45 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
     46 void
     47 matmul_r16_avx128_fma3 (gfc_array_r16 * const restrict retarray,
     48 	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
     49 	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
     50 internal_proto(matmul_r16_avx128_fma3);
     51 void
     52 matmul_r16_avx128_fma3 (gfc_array_r16 * const restrict retarray,
     53 	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
     54 	int blas_limit, blas_call gemm)
     55 {
     56   const GFC_REAL_16 * restrict abase;
     57   const GFC_REAL_16 * restrict bbase;
     58   GFC_REAL_16 * restrict dest;
     59 
     60   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
     61   index_type x, y, n, count, xcount, ycount;
     62 
     63   assert (GFC_DESCRIPTOR_RANK (a) == 2
     64           || GFC_DESCRIPTOR_RANK (b) == 2);
     65 
     66 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
     67 
     68    Either A or B (but not both) can be rank 1:
     69 
     70    o One-dimensional argument A is implicitly treated as a row matrix
     71      dimensioned [1,count], so xcount=1.
     72 
     73    o One-dimensional argument B is implicitly treated as a column matrix
     74      dimensioned [count, 1], so ycount=1.
     75 */
     76 
     77   if (retarray->base_addr == NULL)
     78     {
     79       if (GFC_DESCRIPTOR_RANK (a) == 1)
     80         {
     81 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
     82 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
     83         }
     84       else if (GFC_DESCRIPTOR_RANK (b) == 1)
     85         {
     86 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
     87 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
     88         }
     89       else
     90         {
     91 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
     92 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
     93 
     94           GFC_DIMENSION_SET(retarray->dim[1], 0,
     95 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
     96 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
     97         }
     98 
     99       retarray->base_addr
    100 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_16));
    101       retarray->offset = 0;
    102     }
    103   else if (unlikely (compile_options.bounds_check))
    104     {
    105       index_type ret_extent, arg_extent;
    106 
    107       if (GFC_DESCRIPTOR_RANK (a) == 1)
    108 	{
    109 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
    110 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
    111 	  if (arg_extent != ret_extent)
    112 	    runtime_error ("Array bound mismatch for dimension 1 of "
    113 	    		   "array (%ld/%ld) ",
    114 			   (long int) ret_extent, (long int) arg_extent);
    115 	}
    116       else if (GFC_DESCRIPTOR_RANK (b) == 1)
    117 	{
    118 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
    119 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
    120 	  if (arg_extent != ret_extent)
    121 	    runtime_error ("Array bound mismatch for dimension 1 of "
    122 	    		   "array (%ld/%ld) ",
    123 			   (long int) ret_extent, (long int) arg_extent);
    124 	}
    125       else
    126 	{
    127 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
    128 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
    129 	  if (arg_extent != ret_extent)
    130 	    runtime_error ("Array bound mismatch for dimension 1 of "
    131 	    		   "array (%ld/%ld) ",
    132 			   (long int) ret_extent, (long int) arg_extent);
    133 
    134 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
    135 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
    136 	  if (arg_extent != ret_extent)
    137 	    runtime_error ("Array bound mismatch for dimension 2 of "
    138 	    		   "array (%ld/%ld) ",
    139 			   (long int) ret_extent, (long int) arg_extent);
    140 	}
    141     }
    142 
    143 
    144   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
    145     {
    146       /* One-dimensional result may be addressed in the code below
    147 	 either as a row or a column matrix. We want both cases to
    148 	 work. */
    149       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
    150     }
    151   else
    152     {
    153       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
    154       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
    155     }
    156 
    157 
    158   if (GFC_DESCRIPTOR_RANK (a) == 1)
    159     {
    160       /* Treat it as a a row matrix A[1,count]. */
    161       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
    162       aystride = 1;
    163 
    164       xcount = 1;
    165       count = GFC_DESCRIPTOR_EXTENT(a,0);
    166     }
    167   else
    168     {
    169       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
    170       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
    171 
    172       count = GFC_DESCRIPTOR_EXTENT(a,1);
    173       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
    174     }
    175 
    176   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
    177     {
    178       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
    179 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
    180 		       "in dimension 1: is %ld, should be %ld",
    181 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
    182     }
    183 
    184   if (GFC_DESCRIPTOR_RANK (b) == 1)
    185     {
    186       /* Treat it as a column matrix B[count,1] */
    187       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
    188 
    189       /* bystride should never be used for 1-dimensional b.
    190          The value is only used for calculation of the
    191          memory by the buffer.  */
    192       bystride = 256;
    193       ycount = 1;
    194     }
    195   else
    196     {
    197       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
    198       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
    199       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
    200     }
    201 
    202   abase = a->base_addr;
    203   bbase = b->base_addr;
    204   dest = retarray->base_addr;
    205 
    206   /* Now that everything is set up, we perform the multiplication
    207      itself.  */
    208 
    209 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
    210 #define min(a,b) ((a) <= (b) ? (a) : (b))
    211 #define max(a,b) ((a) >= (b) ? (a) : (b))
    212 
    213   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
    214       && (bxstride == 1 || bystride == 1)
    215       && (((float) xcount) * ((float) ycount) * ((float) count)
    216           > POW3(blas_limit)))
    217     {
    218       const int m = xcount, n = ycount, k = count, ldc = rystride;
    219       const GFC_REAL_16 one = 1, zero = 0;
    220       const int lda = (axstride == 1) ? aystride : axstride,
    221 		ldb = (bxstride == 1) ? bystride : bxstride;
    222 
    223       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
    224 	{
    225 	  assert (gemm != NULL);
    226 	  const char *transa, *transb;
    227 	  if (try_blas & 2)
    228 	    transa = "C";
    229 	  else
    230 	    transa = axstride == 1 ? "N" : "T";
    231 
    232 	  if (try_blas & 4)
    233 	    transb = "C";
    234 	  else
    235 	    transb = bxstride == 1 ? "N" : "T";
    236 
    237 	  gemm (transa, transb , &m,
    238 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
    239 		&ldc, 1, 1);
    240 	  return;
    241 	}
    242     }
    243 
    244   if (rxstride == 1 && axstride == 1 && bxstride == 1
    245       && GFC_DESCRIPTOR_RANK (b) != 1)
    246     {
    247       /* This block of code implements a tuned matmul, derived from
    248          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
    249 
    250                Bo Kagstrom and Per Ling
    251                Department of Computing Science
    252                Umea University
    253                S-901 87 Umea, Sweden
    254 
    255 	 from netlib.org, translated to C, and modified for matmul.m4.  */
    256 
    257       const GFC_REAL_16 *a, *b;
    258       GFC_REAL_16 *c;
    259       const index_type m = xcount, n = ycount, k = count;
    260 
    261       /* System generated locals */
    262       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
    263 		 i1, i2, i3, i4, i5, i6;
    264 
    265       /* Local variables */
    266       GFC_REAL_16 f11, f12, f21, f22, f31, f32, f41, f42,
    267 		 f13, f14, f23, f24, f33, f34, f43, f44;
    268       index_type i, j, l, ii, jj, ll;
    269       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
    270       GFC_REAL_16 *t1;
    271 
    272       a = abase;
    273       b = bbase;
    274       c = retarray->base_addr;
    275 
    276       /* Parameter adjustments */
    277       c_dim1 = rystride;
    278       c_offset = 1 + c_dim1;
    279       c -= c_offset;
    280       a_dim1 = aystride;
    281       a_offset = 1 + a_dim1;
    282       a -= a_offset;
    283       b_dim1 = bystride;
    284       b_offset = 1 + b_dim1;
    285       b -= b_offset;
    286 
    287       /* Empty c first.  */
    288       for (j=1; j<=n; j++)
    289 	for (i=1; i<=m; i++)
    290 	  c[i + j * c_dim1] = (GFC_REAL_16)0;
    291 
    292       /* Early exit if possible */
    293       if (m == 0 || n == 0 || k == 0)
    294 	return;
    295 
    296       /* Adjust size of t1 to what is needed.  */
    297       index_type t1_dim, a_sz;
    298       if (aystride == 1)
    299         a_sz = rystride;
    300       else
    301         a_sz = a_dim1;
    302 
    303       t1_dim = a_sz * 256 + b_dim1;
    304       if (t1_dim > 65536)
    305 	t1_dim = 65536;
    306 
    307       t1 = malloc (t1_dim * sizeof(GFC_REAL_16));
    308 
    309       /* Start turning the crank. */
    310       i1 = n;
    311       for (jj = 1; jj <= i1; jj += 512)
    312 	{
    313 	  /* Computing MIN */
    314 	  i2 = 512;
    315 	  i3 = n - jj + 1;
    316 	  jsec = min(i2,i3);
    317 	  ujsec = jsec - jsec % 4;
    318 	  i2 = k;
    319 	  for (ll = 1; ll <= i2; ll += 256)
    320 	    {
    321 	      /* Computing MIN */
    322 	      i3 = 256;
    323 	      i4 = k - ll + 1;
    324 	      lsec = min(i3,i4);
    325 	      ulsec = lsec - lsec % 2;
    326 
    327 	      i3 = m;
    328 	      for (ii = 1; ii <= i3; ii += 256)
    329 		{
    330 		  /* Computing MIN */
    331 		  i4 = 256;
    332 		  i5 = m - ii + 1;
    333 		  isec = min(i4,i5);
    334 		  uisec = isec - isec % 2;
    335 		  i4 = ll + ulsec - 1;
    336 		  for (l = ll; l <= i4; l += 2)
    337 		    {
    338 		      i5 = ii + uisec - 1;
    339 		      for (i = ii; i <= i5; i += 2)
    340 			{
    341 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
    342 					a[i + l * a_dim1];
    343 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
    344 					a[i + (l + 1) * a_dim1];
    345 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
    346 					a[i + 1 + l * a_dim1];
    347 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
    348 					a[i + 1 + (l + 1) * a_dim1];
    349 			}
    350 		      if (uisec < isec)
    351 			{
    352 			  t1[l - ll + 1 + (isec << 8) - 257] =
    353 				    a[ii + isec - 1 + l * a_dim1];
    354 			  t1[l - ll + 2 + (isec << 8) - 257] =
    355 				    a[ii + isec - 1 + (l + 1) * a_dim1];
    356 			}
    357 		    }
    358 		  if (ulsec < lsec)
    359 		    {
    360 		      i4 = ii + isec - 1;
    361 		      for (i = ii; i<= i4; ++i)
    362 			{
    363 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
    364 				    a[i + (ll + lsec - 1) * a_dim1];
    365 			}
    366 		    }
    367 
    368 		  uisec = isec - isec % 4;
    369 		  i4 = jj + ujsec - 1;
    370 		  for (j = jj; j <= i4; j += 4)
    371 		    {
    372 		      i5 = ii + uisec - 1;
    373 		      for (i = ii; i <= i5; i += 4)
    374 			{
    375 			  f11 = c[i + j * c_dim1];
    376 			  f21 = c[i + 1 + j * c_dim1];
    377 			  f12 = c[i + (j + 1) * c_dim1];
    378 			  f22 = c[i + 1 + (j + 1) * c_dim1];
    379 			  f13 = c[i + (j + 2) * c_dim1];
    380 			  f23 = c[i + 1 + (j + 2) * c_dim1];
    381 			  f14 = c[i + (j + 3) * c_dim1];
    382 			  f24 = c[i + 1 + (j + 3) * c_dim1];
    383 			  f31 = c[i + 2 + j * c_dim1];
    384 			  f41 = c[i + 3 + j * c_dim1];
    385 			  f32 = c[i + 2 + (j + 1) * c_dim1];
    386 			  f42 = c[i + 3 + (j + 1) * c_dim1];
    387 			  f33 = c[i + 2 + (j + 2) * c_dim1];
    388 			  f43 = c[i + 3 + (j + 2) * c_dim1];
    389 			  f34 = c[i + 2 + (j + 3) * c_dim1];
    390 			  f44 = c[i + 3 + (j + 3) * c_dim1];
    391 			  i6 = ll + lsec - 1;
    392 			  for (l = ll; l <= i6; ++l)
    393 			    {
    394 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
    395 				      * b[l + j * b_dim1];
    396 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
    397 				      * b[l + j * b_dim1];
    398 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
    399 				      * b[l + (j + 1) * b_dim1];
    400 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
    401 				      * b[l + (j + 1) * b_dim1];
    402 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
    403 				      * b[l + (j + 2) * b_dim1];
    404 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
    405 				      * b[l + (j + 2) * b_dim1];
    406 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
    407 				      * b[l + (j + 3) * b_dim1];
    408 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
    409 				      * b[l + (j + 3) * b_dim1];
    410 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
    411 				      * b[l + j * b_dim1];
    412 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
    413 				      * b[l + j * b_dim1];
    414 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
    415 				      * b[l + (j + 1) * b_dim1];
    416 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
    417 				      * b[l + (j + 1) * b_dim1];
    418 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
    419 				      * b[l + (j + 2) * b_dim1];
    420 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
    421 				      * b[l + (j + 2) * b_dim1];
    422 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
    423 				      * b[l + (j + 3) * b_dim1];
    424 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
    425 				      * b[l + (j + 3) * b_dim1];
    426 			    }
    427 			  c[i + j * c_dim1] = f11;
    428 			  c[i + 1 + j * c_dim1] = f21;
    429 			  c[i + (j + 1) * c_dim1] = f12;
    430 			  c[i + 1 + (j + 1) * c_dim1] = f22;
    431 			  c[i + (j + 2) * c_dim1] = f13;
    432 			  c[i + 1 + (j + 2) * c_dim1] = f23;
    433 			  c[i + (j + 3) * c_dim1] = f14;
    434 			  c[i + 1 + (j + 3) * c_dim1] = f24;
    435 			  c[i + 2 + j * c_dim1] = f31;
    436 			  c[i + 3 + j * c_dim1] = f41;
    437 			  c[i + 2 + (j + 1) * c_dim1] = f32;
    438 			  c[i + 3 + (j + 1) * c_dim1] = f42;
    439 			  c[i + 2 + (j + 2) * c_dim1] = f33;
    440 			  c[i + 3 + (j + 2) * c_dim1] = f43;
    441 			  c[i + 2 + (j + 3) * c_dim1] = f34;
    442 			  c[i + 3 + (j + 3) * c_dim1] = f44;
    443 			}
    444 		      if (uisec < isec)
    445 			{
    446 			  i5 = ii + isec - 1;
    447 			  for (i = ii + uisec; i <= i5; ++i)
    448 			    {
    449 			      f11 = c[i + j * c_dim1];
    450 			      f12 = c[i + (j + 1) * c_dim1];
    451 			      f13 = c[i + (j + 2) * c_dim1];
    452 			      f14 = c[i + (j + 3) * c_dim1];
    453 			      i6 = ll + lsec - 1;
    454 			      for (l = ll; l <= i6; ++l)
    455 				{
    456 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
    457 					  257] * b[l + j * b_dim1];
    458 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
    459 					  257] * b[l + (j + 1) * b_dim1];
    460 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
    461 					  257] * b[l + (j + 2) * b_dim1];
    462 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
    463 					  257] * b[l + (j + 3) * b_dim1];
    464 				}
    465 			      c[i + j * c_dim1] = f11;
    466 			      c[i + (j + 1) * c_dim1] = f12;
    467 			      c[i + (j + 2) * c_dim1] = f13;
    468 			      c[i + (j + 3) * c_dim1] = f14;
    469 			    }
    470 			}
    471 		    }
    472 		  if (ujsec < jsec)
    473 		    {
    474 		      i4 = jj + jsec - 1;
    475 		      for (j = jj + ujsec; j <= i4; ++j)
    476 			{
    477 			  i5 = ii + uisec - 1;
    478 			  for (i = ii; i <= i5; i += 4)
    479 			    {
    480 			      f11 = c[i + j * c_dim1];
    481 			      f21 = c[i + 1 + j * c_dim1];
    482 			      f31 = c[i + 2 + j * c_dim1];
    483 			      f41 = c[i + 3 + j * c_dim1];
    484 			      i6 = ll + lsec - 1;
    485 			      for (l = ll; l <= i6; ++l)
    486 				{
    487 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
    488 					  257] * b[l + j * b_dim1];
    489 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
    490 					  257] * b[l + j * b_dim1];
    491 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
    492 					  257] * b[l + j * b_dim1];
    493 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
    494 					  257] * b[l + j * b_dim1];
    495 				}
    496 			      c[i + j * c_dim1] = f11;
    497 			      c[i + 1 + j * c_dim1] = f21;
    498 			      c[i + 2 + j * c_dim1] = f31;
    499 			      c[i + 3 + j * c_dim1] = f41;
    500 			    }
    501 			  i5 = ii + isec - 1;
    502 			  for (i = ii + uisec; i <= i5; ++i)
    503 			    {
    504 			      f11 = c[i + j * c_dim1];
    505 			      i6 = ll + lsec - 1;
    506 			      for (l = ll; l <= i6; ++l)
    507 				{
    508 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
    509 					  257] * b[l + j * b_dim1];
    510 				}
    511 			      c[i + j * c_dim1] = f11;
    512 			    }
    513 			}
    514 		    }
    515 		}
    516 	    }
    517 	}
    518       free(t1);
    519       return;
    520     }
    521   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
    522     {
    523       if (GFC_DESCRIPTOR_RANK (a) != 1)
    524 	{
    525 	  const GFC_REAL_16 *restrict abase_x;
    526 	  const GFC_REAL_16 *restrict bbase_y;
    527 	  GFC_REAL_16 *restrict dest_y;
    528 	  GFC_REAL_16 s;
    529 
    530 	  for (y = 0; y < ycount; y++)
    531 	    {
    532 	      bbase_y = &bbase[y*bystride];
    533 	      dest_y = &dest[y*rystride];
    534 	      for (x = 0; x < xcount; x++)
    535 		{
    536 		  abase_x = &abase[x*axstride];
    537 		  s = (GFC_REAL_16) 0;
    538 		  for (n = 0; n < count; n++)
    539 		    s += abase_x[n] * bbase_y[n];
    540 		  dest_y[x] = s;
    541 		}
    542 	    }
    543 	}
    544       else
    545 	{
    546 	  const GFC_REAL_16 *restrict bbase_y;
    547 	  GFC_REAL_16 s;
    548 
    549 	  for (y = 0; y < ycount; y++)
    550 	    {
    551 	      bbase_y = &bbase[y*bystride];
    552 	      s = (GFC_REAL_16) 0;
    553 	      for (n = 0; n < count; n++)
    554 		s += abase[n*axstride] * bbase_y[n];
    555 	      dest[y*rystride] = s;
    556 	    }
    557 	}
    558     }
    559   else if (GFC_DESCRIPTOR_RANK (a) == 1)
    560     {
    561       const GFC_REAL_16 *restrict bbase_y;
    562       GFC_REAL_16 s;
    563 
    564       for (y = 0; y < ycount; y++)
    565 	{
    566 	  bbase_y = &bbase[y*bystride];
    567 	  s = (GFC_REAL_16) 0;
    568 	  for (n = 0; n < count; n++)
    569 	    s += abase[n*axstride] * bbase_y[n*bxstride];
    570 	  dest[y*rxstride] = s;
    571 	}
    572     }
    573   else if (axstride < aystride)
    574     {
    575       for (y = 0; y < ycount; y++)
    576 	for (x = 0; x < xcount; x++)
    577 	  dest[x*rxstride + y*rystride] = (GFC_REAL_16)0;
    578 
    579       for (y = 0; y < ycount; y++)
    580 	for (n = 0; n < count; n++)
    581 	  for (x = 0; x < xcount; x++)
    582 	    /* dest[x,y] += a[x,n] * b[n,y] */
    583 	    dest[x*rxstride + y*rystride] +=
    584 					abase[x*axstride + n*aystride] *
    585 					bbase[n*bxstride + y*bystride];
    586     }
    587   else
    588     {
    589       const GFC_REAL_16 *restrict abase_x;
    590       const GFC_REAL_16 *restrict bbase_y;
    591       GFC_REAL_16 *restrict dest_y;
    592       GFC_REAL_16 s;
    593 
    594       for (y = 0; y < ycount; y++)
    595 	{
    596 	  bbase_y = &bbase[y*bystride];
    597 	  dest_y = &dest[y*rystride];
    598 	  for (x = 0; x < xcount; x++)
    599 	    {
    600 	      abase_x = &abase[x*axstride];
    601 	      s = (GFC_REAL_16) 0;
    602 	      for (n = 0; n < count; n++)
    603 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
    604 	      dest_y[x*rxstride] = s;
    605 	    }
    606 	}
    607     }
    608 }
    609 #undef POW3
    610 #undef min
    611 #undef max
    612 
    613 #endif
    614 
    615 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
    616 void
    617 matmul_r16_avx128_fma4 (gfc_array_r16 * const restrict retarray,
    618 	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
    619 	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
    620 internal_proto(matmul_r16_avx128_fma4);
    621 void
    622 matmul_r16_avx128_fma4 (gfc_array_r16 * const restrict retarray,
    623 	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
    624 	int blas_limit, blas_call gemm)
    625 {
    626   const GFC_REAL_16 * restrict abase;
    627   const GFC_REAL_16 * restrict bbase;
    628   GFC_REAL_16 * restrict dest;
    629 
    630   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
    631   index_type x, y, n, count, xcount, ycount;
    632 
    633   assert (GFC_DESCRIPTOR_RANK (a) == 2
    634           || GFC_DESCRIPTOR_RANK (b) == 2);
    635 
    636 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
    637 
    638    Either A or B (but not both) can be rank 1:
    639 
    640    o One-dimensional argument A is implicitly treated as a row matrix
    641      dimensioned [1,count], so xcount=1.
    642 
    643    o One-dimensional argument B is implicitly treated as a column matrix
    644      dimensioned [count, 1], so ycount=1.
    645 */
    646 
    647   if (retarray->base_addr == NULL)
    648     {
    649       if (GFC_DESCRIPTOR_RANK (a) == 1)
    650         {
    651 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
    652 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
    653         }
    654       else if (GFC_DESCRIPTOR_RANK (b) == 1)
    655         {
    656 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
    657 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
    658         }
    659       else
    660         {
    661 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
    662 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
    663 
    664           GFC_DIMENSION_SET(retarray->dim[1], 0,
    665 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
    666 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
    667         }
    668 
    669       retarray->base_addr
    670 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_16));
    671       retarray->offset = 0;
    672     }
    673   else if (unlikely (compile_options.bounds_check))
    674     {
    675       index_type ret_extent, arg_extent;
    676 
    677       if (GFC_DESCRIPTOR_RANK (a) == 1)
    678 	{
    679 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
    680 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
    681 	  if (arg_extent != ret_extent)
    682 	    runtime_error ("Array bound mismatch for dimension 1 of "
    683 	    		   "array (%ld/%ld) ",
    684 			   (long int) ret_extent, (long int) arg_extent);
    685 	}
    686       else if (GFC_DESCRIPTOR_RANK (b) == 1)
    687 	{
    688 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
    689 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
    690 	  if (arg_extent != ret_extent)
    691 	    runtime_error ("Array bound mismatch for dimension 1 of "
    692 	    		   "array (%ld/%ld) ",
    693 			   (long int) ret_extent, (long int) arg_extent);
    694 	}
    695       else
    696 	{
    697 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
    698 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
    699 	  if (arg_extent != ret_extent)
    700 	    runtime_error ("Array bound mismatch for dimension 1 of "
    701 	    		   "array (%ld/%ld) ",
    702 			   (long int) ret_extent, (long int) arg_extent);
    703 
    704 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
    705 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
    706 	  if (arg_extent != ret_extent)
    707 	    runtime_error ("Array bound mismatch for dimension 2 of "
    708 	    		   "array (%ld/%ld) ",
    709 			   (long int) ret_extent, (long int) arg_extent);
    710 	}
    711     }
    712 
    713 
    714   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
    715     {
    716       /* One-dimensional result may be addressed in the code below
    717 	 either as a row or a column matrix. We want both cases to
    718 	 work. */
    719       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
    720     }
    721   else
    722     {
    723       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
    724       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
    725     }
    726 
    727 
    728   if (GFC_DESCRIPTOR_RANK (a) == 1)
    729     {
    730       /* Treat it as a a row matrix A[1,count]. */
    731       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
    732       aystride = 1;
    733 
    734       xcount = 1;
    735       count = GFC_DESCRIPTOR_EXTENT(a,0);
    736     }
    737   else
    738     {
    739       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
    740       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
    741 
    742       count = GFC_DESCRIPTOR_EXTENT(a,1);
    743       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
    744     }
    745 
    746   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
    747     {
    748       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
    749 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
    750 		       "in dimension 1: is %ld, should be %ld",
    751 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
    752     }
    753 
    754   if (GFC_DESCRIPTOR_RANK (b) == 1)
    755     {
    756       /* Treat it as a column matrix B[count,1] */
    757       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
    758 
    759       /* bystride should never be used for 1-dimensional b.
    760          The value is only used for calculation of the
    761          memory by the buffer.  */
    762       bystride = 256;
    763       ycount = 1;
    764     }
    765   else
    766     {
    767       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
    768       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
    769       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
    770     }
    771 
    772   abase = a->base_addr;
    773   bbase = b->base_addr;
    774   dest = retarray->base_addr;
    775 
    776   /* Now that everything is set up, we perform the multiplication
    777      itself.  */
    778 
    779 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
    780 #define min(a,b) ((a) <= (b) ? (a) : (b))
    781 #define max(a,b) ((a) >= (b) ? (a) : (b))
    782 
    783   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
    784       && (bxstride == 1 || bystride == 1)
    785       && (((float) xcount) * ((float) ycount) * ((float) count)
    786           > POW3(blas_limit)))
    787     {
    788       const int m = xcount, n = ycount, k = count, ldc = rystride;
    789       const GFC_REAL_16 one = 1, zero = 0;
    790       const int lda = (axstride == 1) ? aystride : axstride,
    791 		ldb = (bxstride == 1) ? bystride : bxstride;
    792 
    793       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
    794 	{
    795 	  assert (gemm != NULL);
    796 	  const char *transa, *transb;
    797 	  if (try_blas & 2)
    798 	    transa = "C";
    799 	  else
    800 	    transa = axstride == 1 ? "N" : "T";
    801 
    802 	  if (try_blas & 4)
    803 	    transb = "C";
    804 	  else
    805 	    transb = bxstride == 1 ? "N" : "T";
    806 
    807 	  gemm (transa, transb , &m,
    808 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
    809 		&ldc, 1, 1);
    810 	  return;
    811 	}
    812     }
    813 
    814   if (rxstride == 1 && axstride == 1 && bxstride == 1
    815       && GFC_DESCRIPTOR_RANK (b) != 1)
    816     {
    817       /* This block of code implements a tuned matmul, derived from
    818          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
    819 
    820                Bo Kagstrom and Per Ling
    821                Department of Computing Science
    822                Umea University
    823                S-901 87 Umea, Sweden
    824 
    825 	 from netlib.org, translated to C, and modified for matmul.m4.  */
    826 
    827       const GFC_REAL_16 *a, *b;
    828       GFC_REAL_16 *c;
    829       const index_type m = xcount, n = ycount, k = count;
    830 
    831       /* System generated locals */
    832       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
    833 		 i1, i2, i3, i4, i5, i6;
    834 
    835       /* Local variables */
    836       GFC_REAL_16 f11, f12, f21, f22, f31, f32, f41, f42,
    837 		 f13, f14, f23, f24, f33, f34, f43, f44;
    838       index_type i, j, l, ii, jj, ll;
    839       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
    840       GFC_REAL_16 *t1;
    841 
    842       a = abase;
    843       b = bbase;
    844       c = retarray->base_addr;
    845 
    846       /* Parameter adjustments */
    847       c_dim1 = rystride;
    848       c_offset = 1 + c_dim1;
    849       c -= c_offset;
    850       a_dim1 = aystride;
    851       a_offset = 1 + a_dim1;
    852       a -= a_offset;
    853       b_dim1 = bystride;
    854       b_offset = 1 + b_dim1;
    855       b -= b_offset;
    856 
    857       /* Empty c first.  */
    858       for (j=1; j<=n; j++)
    859 	for (i=1; i<=m; i++)
    860 	  c[i + j * c_dim1] = (GFC_REAL_16)0;
    861 
    862       /* Early exit if possible */
    863       if (m == 0 || n == 0 || k == 0)
    864 	return;
    865 
    866       /* Adjust size of t1 to what is needed.  */
    867       index_type t1_dim, a_sz;
    868       if (aystride == 1)
    869         a_sz = rystride;
    870       else
    871         a_sz = a_dim1;
    872 
    873       t1_dim = a_sz * 256 + b_dim1;
    874       if (t1_dim > 65536)
    875 	t1_dim = 65536;
    876 
    877       t1 = malloc (t1_dim * sizeof(GFC_REAL_16));
    878 
    879       /* Start turning the crank. */
    880       i1 = n;
    881       for (jj = 1; jj <= i1; jj += 512)
    882 	{
    883 	  /* Computing MIN */
    884 	  i2 = 512;
    885 	  i3 = n - jj + 1;
    886 	  jsec = min(i2,i3);
    887 	  ujsec = jsec - jsec % 4;
    888 	  i2 = k;
    889 	  for (ll = 1; ll <= i2; ll += 256)
    890 	    {
    891 	      /* Computing MIN */
    892 	      i3 = 256;
    893 	      i4 = k - ll + 1;
    894 	      lsec = min(i3,i4);
    895 	      ulsec = lsec - lsec % 2;
    896 
    897 	      i3 = m;
    898 	      for (ii = 1; ii <= i3; ii += 256)
    899 		{
    900 		  /* Computing MIN */
    901 		  i4 = 256;
    902 		  i5 = m - ii + 1;
    903 		  isec = min(i4,i5);
    904 		  uisec = isec - isec % 2;
    905 		  i4 = ll + ulsec - 1;
    906 		  for (l = ll; l <= i4; l += 2)
    907 		    {
    908 		      i5 = ii + uisec - 1;
    909 		      for (i = ii; i <= i5; i += 2)
    910 			{
    911 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
    912 					a[i + l * a_dim1];
    913 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
    914 					a[i + (l + 1) * a_dim1];
    915 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
    916 					a[i + 1 + l * a_dim1];
    917 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
    918 					a[i + 1 + (l + 1) * a_dim1];
    919 			}
    920 		      if (uisec < isec)
    921 			{
    922 			  t1[l - ll + 1 + (isec << 8) - 257] =
    923 				    a[ii + isec - 1 + l * a_dim1];
    924 			  t1[l - ll + 2 + (isec << 8) - 257] =
    925 				    a[ii + isec - 1 + (l + 1) * a_dim1];
    926 			}
    927 		    }
    928 		  if (ulsec < lsec)
    929 		    {
    930 		      i4 = ii + isec - 1;
    931 		      for (i = ii; i<= i4; ++i)
    932 			{
    933 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
    934 				    a[i + (ll + lsec - 1) * a_dim1];
    935 			}
    936 		    }
    937 
    938 		  uisec = isec - isec % 4;
    939 		  i4 = jj + ujsec - 1;
    940 		  for (j = jj; j <= i4; j += 4)
    941 		    {
    942 		      i5 = ii + uisec - 1;
    943 		      for (i = ii; i <= i5; i += 4)
    944 			{
    945 			  f11 = c[i + j * c_dim1];
    946 			  f21 = c[i + 1 + j * c_dim1];
    947 			  f12 = c[i + (j + 1) * c_dim1];
    948 			  f22 = c[i + 1 + (j + 1) * c_dim1];
    949 			  f13 = c[i + (j + 2) * c_dim1];
    950 			  f23 = c[i + 1 + (j + 2) * c_dim1];
    951 			  f14 = c[i + (j + 3) * c_dim1];
    952 			  f24 = c[i + 1 + (j + 3) * c_dim1];
    953 			  f31 = c[i + 2 + j * c_dim1];
    954 			  f41 = c[i + 3 + j * c_dim1];
    955 			  f32 = c[i + 2 + (j + 1) * c_dim1];
    956 			  f42 = c[i + 3 + (j + 1) * c_dim1];
    957 			  f33 = c[i + 2 + (j + 2) * c_dim1];
    958 			  f43 = c[i + 3 + (j + 2) * c_dim1];
    959 			  f34 = c[i + 2 + (j + 3) * c_dim1];
    960 			  f44 = c[i + 3 + (j + 3) * c_dim1];
    961 			  i6 = ll + lsec - 1;
    962 			  for (l = ll; l <= i6; ++l)
    963 			    {
    964 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
    965 				      * b[l + j * b_dim1];
    966 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
    967 				      * b[l + j * b_dim1];
    968 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
    969 				      * b[l + (j + 1) * b_dim1];
    970 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
    971 				      * b[l + (j + 1) * b_dim1];
    972 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
    973 				      * b[l + (j + 2) * b_dim1];
    974 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
    975 				      * b[l + (j + 2) * b_dim1];
    976 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
    977 				      * b[l + (j + 3) * b_dim1];
    978 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
    979 				      * b[l + (j + 3) * b_dim1];
    980 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
    981 				      * b[l + j * b_dim1];
    982 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
    983 				      * b[l + j * b_dim1];
    984 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
    985 				      * b[l + (j + 1) * b_dim1];
    986 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
    987 				      * b[l + (j + 1) * b_dim1];
    988 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
    989 				      * b[l + (j + 2) * b_dim1];
    990 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
    991 				      * b[l + (j + 2) * b_dim1];
    992 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
    993 				      * b[l + (j + 3) * b_dim1];
    994 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
    995 				      * b[l + (j + 3) * b_dim1];
    996 			    }
    997 			  c[i + j * c_dim1] = f11;
    998 			  c[i + 1 + j * c_dim1] = f21;
    999 			  c[i + (j + 1) * c_dim1] = f12;
   1000 			  c[i + 1 + (j + 1) * c_dim1] = f22;
   1001 			  c[i + (j + 2) * c_dim1] = f13;
   1002 			  c[i + 1 + (j + 2) * c_dim1] = f23;
   1003 			  c[i + (j + 3) * c_dim1] = f14;
   1004 			  c[i + 1 + (j + 3) * c_dim1] = f24;
   1005 			  c[i + 2 + j * c_dim1] = f31;
   1006 			  c[i + 3 + j * c_dim1] = f41;
   1007 			  c[i + 2 + (j + 1) * c_dim1] = f32;
   1008 			  c[i + 3 + (j + 1) * c_dim1] = f42;
   1009 			  c[i + 2 + (j + 2) * c_dim1] = f33;
   1010 			  c[i + 3 + (j + 2) * c_dim1] = f43;
   1011 			  c[i + 2 + (j + 3) * c_dim1] = f34;
   1012 			  c[i + 3 + (j + 3) * c_dim1] = f44;
   1013 			}
   1014 		      if (uisec < isec)
   1015 			{
   1016 			  i5 = ii + isec - 1;
   1017 			  for (i = ii + uisec; i <= i5; ++i)
   1018 			    {
   1019 			      f11 = c[i + j * c_dim1];
   1020 			      f12 = c[i + (j + 1) * c_dim1];
   1021 			      f13 = c[i + (j + 2) * c_dim1];
   1022 			      f14 = c[i + (j + 3) * c_dim1];
   1023 			      i6 = ll + lsec - 1;
   1024 			      for (l = ll; l <= i6; ++l)
   1025 				{
   1026 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
   1027 					  257] * b[l + j * b_dim1];
   1028 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
   1029 					  257] * b[l + (j + 1) * b_dim1];
   1030 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
   1031 					  257] * b[l + (j + 2) * b_dim1];
   1032 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
   1033 					  257] * b[l + (j + 3) * b_dim1];
   1034 				}
   1035 			      c[i + j * c_dim1] = f11;
   1036 			      c[i + (j + 1) * c_dim1] = f12;
   1037 			      c[i + (j + 2) * c_dim1] = f13;
   1038 			      c[i + (j + 3) * c_dim1] = f14;
   1039 			    }
   1040 			}
   1041 		    }
   1042 		  if (ujsec < jsec)
   1043 		    {
   1044 		      i4 = jj + jsec - 1;
   1045 		      for (j = jj + ujsec; j <= i4; ++j)
   1046 			{
   1047 			  i5 = ii + uisec - 1;
   1048 			  for (i = ii; i <= i5; i += 4)
   1049 			    {
   1050 			      f11 = c[i + j * c_dim1];
   1051 			      f21 = c[i + 1 + j * c_dim1];
   1052 			      f31 = c[i + 2 + j * c_dim1];
   1053 			      f41 = c[i + 3 + j * c_dim1];
   1054 			      i6 = ll + lsec - 1;
   1055 			      for (l = ll; l <= i6; ++l)
   1056 				{
   1057 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
   1058 					  257] * b[l + j * b_dim1];
   1059 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
   1060 					  257] * b[l + j * b_dim1];
   1061 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
   1062 					  257] * b[l + j * b_dim1];
   1063 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
   1064 					  257] * b[l + j * b_dim1];
   1065 				}
   1066 			      c[i + j * c_dim1] = f11;
   1067 			      c[i + 1 + j * c_dim1] = f21;
   1068 			      c[i + 2 + j * c_dim1] = f31;
   1069 			      c[i + 3 + j * c_dim1] = f41;
   1070 			    }
   1071 			  i5 = ii + isec - 1;
   1072 			  for (i = ii + uisec; i <= i5; ++i)
   1073 			    {
   1074 			      f11 = c[i + j * c_dim1];
   1075 			      i6 = ll + lsec - 1;
   1076 			      for (l = ll; l <= i6; ++l)
   1077 				{
   1078 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
   1079 					  257] * b[l + j * b_dim1];
   1080 				}
   1081 			      c[i + j * c_dim1] = f11;
   1082 			    }
   1083 			}
   1084 		    }
   1085 		}
   1086 	    }
   1087 	}
   1088       free(t1);
   1089       return;
   1090     }
   1091   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
   1092     {
   1093       if (GFC_DESCRIPTOR_RANK (a) != 1)
   1094 	{
   1095 	  const GFC_REAL_16 *restrict abase_x;
   1096 	  const GFC_REAL_16 *restrict bbase_y;
   1097 	  GFC_REAL_16 *restrict dest_y;
   1098 	  GFC_REAL_16 s;
   1099 
   1100 	  for (y = 0; y < ycount; y++)
   1101 	    {
   1102 	      bbase_y = &bbase[y*bystride];
   1103 	      dest_y = &dest[y*rystride];
   1104 	      for (x = 0; x < xcount; x++)
   1105 		{
   1106 		  abase_x = &abase[x*axstride];
   1107 		  s = (GFC_REAL_16) 0;
   1108 		  for (n = 0; n < count; n++)
   1109 		    s += abase_x[n] * bbase_y[n];
   1110 		  dest_y[x] = s;
   1111 		}
   1112 	    }
   1113 	}
   1114       else
   1115 	{
   1116 	  const GFC_REAL_16 *restrict bbase_y;
   1117 	  GFC_REAL_16 s;
   1118 
   1119 	  for (y = 0; y < ycount; y++)
   1120 	    {
   1121 	      bbase_y = &bbase[y*bystride];
   1122 	      s = (GFC_REAL_16) 0;
   1123 	      for (n = 0; n < count; n++)
   1124 		s += abase[n*axstride] * bbase_y[n];
   1125 	      dest[y*rystride] = s;
   1126 	    }
   1127 	}
   1128     }
   1129   else if (GFC_DESCRIPTOR_RANK (a) == 1)
   1130     {
   1131       const GFC_REAL_16 *restrict bbase_y;
   1132       GFC_REAL_16 s;
   1133 
   1134       for (y = 0; y < ycount; y++)
   1135 	{
   1136 	  bbase_y = &bbase[y*bystride];
   1137 	  s = (GFC_REAL_16) 0;
   1138 	  for (n = 0; n < count; n++)
   1139 	    s += abase[n*axstride] * bbase_y[n*bxstride];
   1140 	  dest[y*rxstride] = s;
   1141 	}
   1142     }
   1143   else if (axstride < aystride)
   1144     {
   1145       for (y = 0; y < ycount; y++)
   1146 	for (x = 0; x < xcount; x++)
   1147 	  dest[x*rxstride + y*rystride] = (GFC_REAL_16)0;
   1148 
   1149       for (y = 0; y < ycount; y++)
   1150 	for (n = 0; n < count; n++)
   1151 	  for (x = 0; x < xcount; x++)
   1152 	    /* dest[x,y] += a[x,n] * b[n,y] */
   1153 	    dest[x*rxstride + y*rystride] +=
   1154 					abase[x*axstride + n*aystride] *
   1155 					bbase[n*bxstride + y*bystride];
   1156     }
   1157   else
   1158     {
   1159       const GFC_REAL_16 *restrict abase_x;
   1160       const GFC_REAL_16 *restrict bbase_y;
   1161       GFC_REAL_16 *restrict dest_y;
   1162       GFC_REAL_16 s;
   1163 
   1164       for (y = 0; y < ycount; y++)
   1165 	{
   1166 	  bbase_y = &bbase[y*bystride];
   1167 	  dest_y = &dest[y*rystride];
   1168 	  for (x = 0; x < xcount; x++)
   1169 	    {
   1170 	      abase_x = &abase[x*axstride];
   1171 	      s = (GFC_REAL_16) 0;
   1172 	      for (n = 0; n < count; n++)
   1173 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
   1174 	      dest_y[x*rxstride] = s;
   1175 	    }
   1176 	}
   1177     }
   1178 }
   1179 #undef POW3
   1180 #undef min
   1181 #undef max
   1182 
   1183 #endif
   1184 
   1185 #endif
   1186 
   1187