Home | History | Annotate | Line # | Download | only in generated
      1      1.1  mrg /* Implementation of the MATMUL intrinsic
      2  1.1.1.4  mrg    Copyright (C) 2002-2024 Free Software Foundation, Inc.
      3      1.1  mrg    Contributed by Thomas Koenig <tkoenig (at) gcc.gnu.org>.
      4      1.1  mrg 
      5      1.1  mrg This file is part of the GNU Fortran runtime library (libgfortran).
      6      1.1  mrg 
      7      1.1  mrg Libgfortran is free software; you can redistribute it and/or
      8      1.1  mrg modify it under the terms of the GNU General Public
      9      1.1  mrg License as published by the Free Software Foundation; either
     10      1.1  mrg version 3 of the License, or (at your option) any later version.
     11      1.1  mrg 
     12      1.1  mrg Libgfortran is distributed in the hope that it will be useful,
     13      1.1  mrg but WITHOUT ANY WARRANTY; without even the implied warranty of
     14      1.1  mrg MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     15      1.1  mrg GNU General Public License for more details.
     16      1.1  mrg 
     17      1.1  mrg Under Section 7 of GPL version 3, you are granted additional
     18      1.1  mrg permissions described in the GCC Runtime Library Exception, version
     19      1.1  mrg 3.1, as published by the Free Software Foundation.
     20      1.1  mrg 
     21      1.1  mrg You should have received a copy of the GNU General Public License and
     22      1.1  mrg a copy of the GCC Runtime Library Exception along with this program;
     23      1.1  mrg see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     24      1.1  mrg <http://www.gnu.org/licenses/>.  */
     25      1.1  mrg 
     26      1.1  mrg #include "libgfortran.h"
     27      1.1  mrg #include <string.h>
     28      1.1  mrg #include <assert.h>
     29      1.1  mrg 
     30      1.1  mrg 
     31      1.1  mrg /* These are the specific versions of matmul with -mprefer-avx128.  */
     32      1.1  mrg 
     33      1.1  mrg #if defined (HAVE_GFC_REAL_10)
     34      1.1  mrg 
     35      1.1  mrg /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
     36      1.1  mrg    passed to us by the front-end, in which case we call it for large
     37      1.1  mrg    matrices.  */
     38      1.1  mrg 
     39      1.1  mrg typedef void (*blas_call)(const char *, const char *, const int *, const int *,
     40      1.1  mrg                           const int *, const GFC_REAL_10 *, const GFC_REAL_10 *,
     41      1.1  mrg                           const int *, const GFC_REAL_10 *, const int *,
     42      1.1  mrg                           const GFC_REAL_10 *, GFC_REAL_10 *, const int *,
     43      1.1  mrg                           int, int);
     44      1.1  mrg 
     45      1.1  mrg #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
     46      1.1  mrg void
     47      1.1  mrg matmul_r10_avx128_fma3 (gfc_array_r10 * const restrict retarray,
     48      1.1  mrg 	gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
     49      1.1  mrg 	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
     50      1.1  mrg internal_proto(matmul_r10_avx128_fma3);
     51      1.1  mrg void
     52      1.1  mrg matmul_r10_avx128_fma3 (gfc_array_r10 * const restrict retarray,
     53      1.1  mrg 	gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
     54      1.1  mrg 	int blas_limit, blas_call gemm)
     55      1.1  mrg {
     56      1.1  mrg   const GFC_REAL_10 * restrict abase;
     57      1.1  mrg   const GFC_REAL_10 * restrict bbase;
     58      1.1  mrg   GFC_REAL_10 * restrict dest;
     59      1.1  mrg 
     60      1.1  mrg   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
     61      1.1  mrg   index_type x, y, n, count, xcount, ycount;
     62      1.1  mrg 
     63      1.1  mrg   assert (GFC_DESCRIPTOR_RANK (a) == 2
     64      1.1  mrg           || GFC_DESCRIPTOR_RANK (b) == 2);
     65      1.1  mrg 
     66      1.1  mrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
     67      1.1  mrg 
     68      1.1  mrg    Either A or B (but not both) can be rank 1:
     69      1.1  mrg 
     70      1.1  mrg    o One-dimensional argument A is implicitly treated as a row matrix
     71      1.1  mrg      dimensioned [1,count], so xcount=1.
     72      1.1  mrg 
     73      1.1  mrg    o One-dimensional argument B is implicitly treated as a column matrix
     74      1.1  mrg      dimensioned [count, 1], so ycount=1.
     75      1.1  mrg */
     76      1.1  mrg 
     77      1.1  mrg   if (retarray->base_addr == NULL)
     78      1.1  mrg     {
     79      1.1  mrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
     80      1.1  mrg         {
     81      1.1  mrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
     82      1.1  mrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
     83      1.1  mrg         }
     84      1.1  mrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
     85      1.1  mrg         {
     86      1.1  mrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
     87      1.1  mrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
     88      1.1  mrg         }
     89      1.1  mrg       else
     90      1.1  mrg         {
     91      1.1  mrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
     92      1.1  mrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
     93      1.1  mrg 
     94      1.1  mrg           GFC_DIMENSION_SET(retarray->dim[1], 0,
     95      1.1  mrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
     96      1.1  mrg 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
     97      1.1  mrg         }
     98      1.1  mrg 
     99      1.1  mrg       retarray->base_addr
    100      1.1  mrg 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_10));
    101      1.1  mrg       retarray->offset = 0;
    102      1.1  mrg     }
    103      1.1  mrg   else if (unlikely (compile_options.bounds_check))
    104      1.1  mrg     {
    105      1.1  mrg       index_type ret_extent, arg_extent;
    106      1.1  mrg 
    107      1.1  mrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
    108      1.1  mrg 	{
    109      1.1  mrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
    110      1.1  mrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
    111      1.1  mrg 	  if (arg_extent != ret_extent)
    112      1.1  mrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
    113      1.1  mrg 	    		   "array (%ld/%ld) ",
    114      1.1  mrg 			   (long int) ret_extent, (long int) arg_extent);
    115      1.1  mrg 	}
    116      1.1  mrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
    117      1.1  mrg 	{
    118      1.1  mrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
    119      1.1  mrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
    120      1.1  mrg 	  if (arg_extent != ret_extent)
    121      1.1  mrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
    122      1.1  mrg 	    		   "array (%ld/%ld) ",
    123      1.1  mrg 			   (long int) ret_extent, (long int) arg_extent);
    124      1.1  mrg 	}
    125      1.1  mrg       else
    126      1.1  mrg 	{
    127      1.1  mrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
    128      1.1  mrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
    129      1.1  mrg 	  if (arg_extent != ret_extent)
    130      1.1  mrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
    131      1.1  mrg 	    		   "array (%ld/%ld) ",
    132      1.1  mrg 			   (long int) ret_extent, (long int) arg_extent);
    133      1.1  mrg 
    134      1.1  mrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
    135      1.1  mrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
    136      1.1  mrg 	  if (arg_extent != ret_extent)
    137      1.1  mrg 	    runtime_error ("Array bound mismatch for dimension 2 of "
    138      1.1  mrg 	    		   "array (%ld/%ld) ",
    139      1.1  mrg 			   (long int) ret_extent, (long int) arg_extent);
    140      1.1  mrg 	}
    141      1.1  mrg     }
    142      1.1  mrg 
    143      1.1  mrg 
    144      1.1  mrg   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
    145      1.1  mrg     {
    146      1.1  mrg       /* One-dimensional result may be addressed in the code below
    147      1.1  mrg 	 either as a row or a column matrix. We want both cases to
    148      1.1  mrg 	 work. */
    149      1.1  mrg       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
    150      1.1  mrg     }
    151      1.1  mrg   else
    152      1.1  mrg     {
    153      1.1  mrg       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
    154      1.1  mrg       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
    155      1.1  mrg     }
    156      1.1  mrg 
    157      1.1  mrg 
    158      1.1  mrg   if (GFC_DESCRIPTOR_RANK (a) == 1)
    159      1.1  mrg     {
    160      1.1  mrg       /* Treat it as a a row matrix A[1,count]. */
    161      1.1  mrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
    162      1.1  mrg       aystride = 1;
    163      1.1  mrg 
    164      1.1  mrg       xcount = 1;
    165      1.1  mrg       count = GFC_DESCRIPTOR_EXTENT(a,0);
    166      1.1  mrg     }
    167      1.1  mrg   else
    168      1.1  mrg     {
    169      1.1  mrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
    170      1.1  mrg       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
    171      1.1  mrg 
    172      1.1  mrg       count = GFC_DESCRIPTOR_EXTENT(a,1);
    173      1.1  mrg       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
    174      1.1  mrg     }
    175      1.1  mrg 
    176      1.1  mrg   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
    177      1.1  mrg     {
    178      1.1  mrg       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
    179      1.1  mrg 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
    180      1.1  mrg 		       "in dimension 1: is %ld, should be %ld",
    181      1.1  mrg 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
    182      1.1  mrg     }
    183      1.1  mrg 
    184      1.1  mrg   if (GFC_DESCRIPTOR_RANK (b) == 1)
    185      1.1  mrg     {
    186      1.1  mrg       /* Treat it as a column matrix B[count,1] */
    187      1.1  mrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
    188      1.1  mrg 
    189      1.1  mrg       /* bystride should never be used for 1-dimensional b.
    190      1.1  mrg          The value is only used for calculation of the
    191      1.1  mrg          memory by the buffer.  */
    192      1.1  mrg       bystride = 256;
    193      1.1  mrg       ycount = 1;
    194      1.1  mrg     }
    195      1.1  mrg   else
    196      1.1  mrg     {
    197      1.1  mrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
    198      1.1  mrg       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
    199      1.1  mrg       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
    200      1.1  mrg     }
    201      1.1  mrg 
    202      1.1  mrg   abase = a->base_addr;
    203      1.1  mrg   bbase = b->base_addr;
    204      1.1  mrg   dest = retarray->base_addr;
    205      1.1  mrg 
    206      1.1  mrg   /* Now that everything is set up, we perform the multiplication
    207      1.1  mrg      itself.  */
    208      1.1  mrg 
    209      1.1  mrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
    210      1.1  mrg #define min(a,b) ((a) <= (b) ? (a) : (b))
    211      1.1  mrg #define max(a,b) ((a) >= (b) ? (a) : (b))
    212      1.1  mrg 
    213      1.1  mrg   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
    214      1.1  mrg       && (bxstride == 1 || bystride == 1)
    215      1.1  mrg       && (((float) xcount) * ((float) ycount) * ((float) count)
    216      1.1  mrg           > POW3(blas_limit)))
    217      1.1  mrg     {
    218      1.1  mrg       const int m = xcount, n = ycount, k = count, ldc = rystride;
    219      1.1  mrg       const GFC_REAL_10 one = 1, zero = 0;
    220      1.1  mrg       const int lda = (axstride == 1) ? aystride : axstride,
    221      1.1  mrg 		ldb = (bxstride == 1) ? bystride : bxstride;
    222      1.1  mrg 
    223      1.1  mrg       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
    224      1.1  mrg 	{
    225      1.1  mrg 	  assert (gemm != NULL);
    226      1.1  mrg 	  const char *transa, *transb;
    227      1.1  mrg 	  if (try_blas & 2)
    228      1.1  mrg 	    transa = "C";
    229      1.1  mrg 	  else
    230      1.1  mrg 	    transa = axstride == 1 ? "N" : "T";
    231      1.1  mrg 
    232      1.1  mrg 	  if (try_blas & 4)
    233      1.1  mrg 	    transb = "C";
    234      1.1  mrg 	  else
    235      1.1  mrg 	    transb = bxstride == 1 ? "N" : "T";
    236      1.1  mrg 
    237      1.1  mrg 	  gemm (transa, transb , &m,
    238      1.1  mrg 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
    239      1.1  mrg 		&ldc, 1, 1);
    240      1.1  mrg 	  return;
    241      1.1  mrg 	}
    242      1.1  mrg     }
    243      1.1  mrg 
    244  1.1.1.2  mrg   if (rxstride == 1 && axstride == 1 && bxstride == 1
    245  1.1.1.2  mrg       && GFC_DESCRIPTOR_RANK (b) != 1)
    246      1.1  mrg     {
    247      1.1  mrg       /* This block of code implements a tuned matmul, derived from
    248      1.1  mrg          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
    249      1.1  mrg 
    250      1.1  mrg                Bo Kagstrom and Per Ling
    251      1.1  mrg                Department of Computing Science
    252      1.1  mrg                Umea University
    253      1.1  mrg                S-901 87 Umea, Sweden
    254      1.1  mrg 
    255      1.1  mrg 	 from netlib.org, translated to C, and modified for matmul.m4.  */
    256      1.1  mrg 
    257      1.1  mrg       const GFC_REAL_10 *a, *b;
    258      1.1  mrg       GFC_REAL_10 *c;
    259      1.1  mrg       const index_type m = xcount, n = ycount, k = count;
    260      1.1  mrg 
    261      1.1  mrg       /* System generated locals */
    262      1.1  mrg       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
    263      1.1  mrg 		 i1, i2, i3, i4, i5, i6;
    264      1.1  mrg 
    265      1.1  mrg       /* Local variables */
    266      1.1  mrg       GFC_REAL_10 f11, f12, f21, f22, f31, f32, f41, f42,
    267      1.1  mrg 		 f13, f14, f23, f24, f33, f34, f43, f44;
    268      1.1  mrg       index_type i, j, l, ii, jj, ll;
    269      1.1  mrg       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
    270      1.1  mrg       GFC_REAL_10 *t1;
    271      1.1  mrg 
    272      1.1  mrg       a = abase;
    273      1.1  mrg       b = bbase;
    274      1.1  mrg       c = retarray->base_addr;
    275      1.1  mrg 
    276      1.1  mrg       /* Parameter adjustments */
    277      1.1  mrg       c_dim1 = rystride;
    278      1.1  mrg       c_offset = 1 + c_dim1;
    279      1.1  mrg       c -= c_offset;
    280      1.1  mrg       a_dim1 = aystride;
    281      1.1  mrg       a_offset = 1 + a_dim1;
    282      1.1  mrg       a -= a_offset;
    283      1.1  mrg       b_dim1 = bystride;
    284      1.1  mrg       b_offset = 1 + b_dim1;
    285      1.1  mrg       b -= b_offset;
    286      1.1  mrg 
    287      1.1  mrg       /* Empty c first.  */
    288      1.1  mrg       for (j=1; j<=n; j++)
    289      1.1  mrg 	for (i=1; i<=m; i++)
    290      1.1  mrg 	  c[i + j * c_dim1] = (GFC_REAL_10)0;
    291      1.1  mrg 
    292      1.1  mrg       /* Early exit if possible */
    293      1.1  mrg       if (m == 0 || n == 0 || k == 0)
    294      1.1  mrg 	return;
    295      1.1  mrg 
    296      1.1  mrg       /* Adjust size of t1 to what is needed.  */
    297      1.1  mrg       index_type t1_dim, a_sz;
    298      1.1  mrg       if (aystride == 1)
    299      1.1  mrg         a_sz = rystride;
    300      1.1  mrg       else
    301      1.1  mrg         a_sz = a_dim1;
    302      1.1  mrg 
    303      1.1  mrg       t1_dim = a_sz * 256 + b_dim1;
    304      1.1  mrg       if (t1_dim > 65536)
    305      1.1  mrg 	t1_dim = 65536;
    306      1.1  mrg 
    307      1.1  mrg       t1 = malloc (t1_dim * sizeof(GFC_REAL_10));
    308      1.1  mrg 
    309      1.1  mrg       /* Start turning the crank. */
    310      1.1  mrg       i1 = n;
    311      1.1  mrg       for (jj = 1; jj <= i1; jj += 512)
    312      1.1  mrg 	{
    313      1.1  mrg 	  /* Computing MIN */
    314      1.1  mrg 	  i2 = 512;
    315      1.1  mrg 	  i3 = n - jj + 1;
    316      1.1  mrg 	  jsec = min(i2,i3);
    317      1.1  mrg 	  ujsec = jsec - jsec % 4;
    318      1.1  mrg 	  i2 = k;
    319      1.1  mrg 	  for (ll = 1; ll <= i2; ll += 256)
    320      1.1  mrg 	    {
    321      1.1  mrg 	      /* Computing MIN */
    322      1.1  mrg 	      i3 = 256;
    323      1.1  mrg 	      i4 = k - ll + 1;
    324      1.1  mrg 	      lsec = min(i3,i4);
    325      1.1  mrg 	      ulsec = lsec - lsec % 2;
    326      1.1  mrg 
    327      1.1  mrg 	      i3 = m;
    328      1.1  mrg 	      for (ii = 1; ii <= i3; ii += 256)
    329      1.1  mrg 		{
    330      1.1  mrg 		  /* Computing MIN */
    331      1.1  mrg 		  i4 = 256;
    332      1.1  mrg 		  i5 = m - ii + 1;
    333      1.1  mrg 		  isec = min(i4,i5);
    334      1.1  mrg 		  uisec = isec - isec % 2;
    335      1.1  mrg 		  i4 = ll + ulsec - 1;
    336      1.1  mrg 		  for (l = ll; l <= i4; l += 2)
    337      1.1  mrg 		    {
    338      1.1  mrg 		      i5 = ii + uisec - 1;
    339      1.1  mrg 		      for (i = ii; i <= i5; i += 2)
    340      1.1  mrg 			{
    341      1.1  mrg 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
    342      1.1  mrg 					a[i + l * a_dim1];
    343      1.1  mrg 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
    344      1.1  mrg 					a[i + (l + 1) * a_dim1];
    345      1.1  mrg 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
    346      1.1  mrg 					a[i + 1 + l * a_dim1];
    347      1.1  mrg 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
    348      1.1  mrg 					a[i + 1 + (l + 1) * a_dim1];
    349      1.1  mrg 			}
    350      1.1  mrg 		      if (uisec < isec)
    351      1.1  mrg 			{
    352      1.1  mrg 			  t1[l - ll + 1 + (isec << 8) - 257] =
    353      1.1  mrg 				    a[ii + isec - 1 + l * a_dim1];
    354      1.1  mrg 			  t1[l - ll + 2 + (isec << 8) - 257] =
    355      1.1  mrg 				    a[ii + isec - 1 + (l + 1) * a_dim1];
    356      1.1  mrg 			}
    357      1.1  mrg 		    }
    358      1.1  mrg 		  if (ulsec < lsec)
    359      1.1  mrg 		    {
    360      1.1  mrg 		      i4 = ii + isec - 1;
    361      1.1  mrg 		      for (i = ii; i<= i4; ++i)
    362      1.1  mrg 			{
    363      1.1  mrg 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
    364      1.1  mrg 				    a[i + (ll + lsec - 1) * a_dim1];
    365      1.1  mrg 			}
    366      1.1  mrg 		    }
    367      1.1  mrg 
    368      1.1  mrg 		  uisec = isec - isec % 4;
    369      1.1  mrg 		  i4 = jj + ujsec - 1;
    370      1.1  mrg 		  for (j = jj; j <= i4; j += 4)
    371      1.1  mrg 		    {
    372      1.1  mrg 		      i5 = ii + uisec - 1;
    373      1.1  mrg 		      for (i = ii; i <= i5; i += 4)
    374      1.1  mrg 			{
    375      1.1  mrg 			  f11 = c[i + j * c_dim1];
    376      1.1  mrg 			  f21 = c[i + 1 + j * c_dim1];
    377      1.1  mrg 			  f12 = c[i + (j + 1) * c_dim1];
    378      1.1  mrg 			  f22 = c[i + 1 + (j + 1) * c_dim1];
    379      1.1  mrg 			  f13 = c[i + (j + 2) * c_dim1];
    380      1.1  mrg 			  f23 = c[i + 1 + (j + 2) * c_dim1];
    381      1.1  mrg 			  f14 = c[i + (j + 3) * c_dim1];
    382      1.1  mrg 			  f24 = c[i + 1 + (j + 3) * c_dim1];
    383      1.1  mrg 			  f31 = c[i + 2 + j * c_dim1];
    384      1.1  mrg 			  f41 = c[i + 3 + j * c_dim1];
    385      1.1  mrg 			  f32 = c[i + 2 + (j + 1) * c_dim1];
    386      1.1  mrg 			  f42 = c[i + 3 + (j + 1) * c_dim1];
    387      1.1  mrg 			  f33 = c[i + 2 + (j + 2) * c_dim1];
    388      1.1  mrg 			  f43 = c[i + 3 + (j + 2) * c_dim1];
    389      1.1  mrg 			  f34 = c[i + 2 + (j + 3) * c_dim1];
    390      1.1  mrg 			  f44 = c[i + 3 + (j + 3) * c_dim1];
    391      1.1  mrg 			  i6 = ll + lsec - 1;
    392      1.1  mrg 			  for (l = ll; l <= i6; ++l)
    393      1.1  mrg 			    {
    394      1.1  mrg 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
    395      1.1  mrg 				      * b[l + j * b_dim1];
    396      1.1  mrg 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
    397      1.1  mrg 				      * b[l + j * b_dim1];
    398      1.1  mrg 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
    399      1.1  mrg 				      * b[l + (j + 1) * b_dim1];
    400      1.1  mrg 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
    401      1.1  mrg 				      * b[l + (j + 1) * b_dim1];
    402      1.1  mrg 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
    403      1.1  mrg 				      * b[l + (j + 2) * b_dim1];
    404      1.1  mrg 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
    405      1.1  mrg 				      * b[l + (j + 2) * b_dim1];
    406      1.1  mrg 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
    407      1.1  mrg 				      * b[l + (j + 3) * b_dim1];
    408      1.1  mrg 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
    409      1.1  mrg 				      * b[l + (j + 3) * b_dim1];
    410      1.1  mrg 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
    411      1.1  mrg 				      * b[l + j * b_dim1];
    412      1.1  mrg 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
    413      1.1  mrg 				      * b[l + j * b_dim1];
    414      1.1  mrg 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
    415      1.1  mrg 				      * b[l + (j + 1) * b_dim1];
    416      1.1  mrg 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
    417      1.1  mrg 				      * b[l + (j + 1) * b_dim1];
    418      1.1  mrg 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
    419      1.1  mrg 				      * b[l + (j + 2) * b_dim1];
    420      1.1  mrg 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
    421      1.1  mrg 				      * b[l + (j + 2) * b_dim1];
    422      1.1  mrg 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
    423      1.1  mrg 				      * b[l + (j + 3) * b_dim1];
    424      1.1  mrg 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
    425      1.1  mrg 				      * b[l + (j + 3) * b_dim1];
    426      1.1  mrg 			    }
    427      1.1  mrg 			  c[i + j * c_dim1] = f11;
    428      1.1  mrg 			  c[i + 1 + j * c_dim1] = f21;
    429      1.1  mrg 			  c[i + (j + 1) * c_dim1] = f12;
    430      1.1  mrg 			  c[i + 1 + (j + 1) * c_dim1] = f22;
    431      1.1  mrg 			  c[i + (j + 2) * c_dim1] = f13;
    432      1.1  mrg 			  c[i + 1 + (j + 2) * c_dim1] = f23;
    433      1.1  mrg 			  c[i + (j + 3) * c_dim1] = f14;
    434      1.1  mrg 			  c[i + 1 + (j + 3) * c_dim1] = f24;
    435      1.1  mrg 			  c[i + 2 + j * c_dim1] = f31;
    436      1.1  mrg 			  c[i + 3 + j * c_dim1] = f41;
    437      1.1  mrg 			  c[i + 2 + (j + 1) * c_dim1] = f32;
    438      1.1  mrg 			  c[i + 3 + (j + 1) * c_dim1] = f42;
    439      1.1  mrg 			  c[i + 2 + (j + 2) * c_dim1] = f33;
    440      1.1  mrg 			  c[i + 3 + (j + 2) * c_dim1] = f43;
    441      1.1  mrg 			  c[i + 2 + (j + 3) * c_dim1] = f34;
    442      1.1  mrg 			  c[i + 3 + (j + 3) * c_dim1] = f44;
    443      1.1  mrg 			}
    444      1.1  mrg 		      if (uisec < isec)
    445      1.1  mrg 			{
    446      1.1  mrg 			  i5 = ii + isec - 1;
    447      1.1  mrg 			  for (i = ii + uisec; i <= i5; ++i)
    448      1.1  mrg 			    {
    449      1.1  mrg 			      f11 = c[i + j * c_dim1];
    450      1.1  mrg 			      f12 = c[i + (j + 1) * c_dim1];
    451      1.1  mrg 			      f13 = c[i + (j + 2) * c_dim1];
    452      1.1  mrg 			      f14 = c[i + (j + 3) * c_dim1];
    453      1.1  mrg 			      i6 = ll + lsec - 1;
    454      1.1  mrg 			      for (l = ll; l <= i6; ++l)
    455      1.1  mrg 				{
    456      1.1  mrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
    457      1.1  mrg 					  257] * b[l + j * b_dim1];
    458      1.1  mrg 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
    459      1.1  mrg 					  257] * b[l + (j + 1) * b_dim1];
    460      1.1  mrg 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
    461      1.1  mrg 					  257] * b[l + (j + 2) * b_dim1];
    462      1.1  mrg 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
    463      1.1  mrg 					  257] * b[l + (j + 3) * b_dim1];
    464      1.1  mrg 				}
    465      1.1  mrg 			      c[i + j * c_dim1] = f11;
    466      1.1  mrg 			      c[i + (j + 1) * c_dim1] = f12;
    467      1.1  mrg 			      c[i + (j + 2) * c_dim1] = f13;
    468      1.1  mrg 			      c[i + (j + 3) * c_dim1] = f14;
    469      1.1  mrg 			    }
    470      1.1  mrg 			}
    471      1.1  mrg 		    }
    472      1.1  mrg 		  if (ujsec < jsec)
    473      1.1  mrg 		    {
    474      1.1  mrg 		      i4 = jj + jsec - 1;
    475      1.1  mrg 		      for (j = jj + ujsec; j <= i4; ++j)
    476      1.1  mrg 			{
    477      1.1  mrg 			  i5 = ii + uisec - 1;
    478      1.1  mrg 			  for (i = ii; i <= i5; i += 4)
    479      1.1  mrg 			    {
    480      1.1  mrg 			      f11 = c[i + j * c_dim1];
    481      1.1  mrg 			      f21 = c[i + 1 + j * c_dim1];
    482      1.1  mrg 			      f31 = c[i + 2 + j * c_dim1];
    483      1.1  mrg 			      f41 = c[i + 3 + j * c_dim1];
    484      1.1  mrg 			      i6 = ll + lsec - 1;
    485      1.1  mrg 			      for (l = ll; l <= i6; ++l)
    486      1.1  mrg 				{
    487      1.1  mrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
    488      1.1  mrg 					  257] * b[l + j * b_dim1];
    489      1.1  mrg 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
    490      1.1  mrg 					  257] * b[l + j * b_dim1];
    491      1.1  mrg 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
    492      1.1  mrg 					  257] * b[l + j * b_dim1];
    493      1.1  mrg 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
    494      1.1  mrg 					  257] * b[l + j * b_dim1];
    495      1.1  mrg 				}
    496      1.1  mrg 			      c[i + j * c_dim1] = f11;
    497      1.1  mrg 			      c[i + 1 + j * c_dim1] = f21;
    498      1.1  mrg 			      c[i + 2 + j * c_dim1] = f31;
    499      1.1  mrg 			      c[i + 3 + j * c_dim1] = f41;
    500      1.1  mrg 			    }
    501      1.1  mrg 			  i5 = ii + isec - 1;
    502      1.1  mrg 			  for (i = ii + uisec; i <= i5; ++i)
    503      1.1  mrg 			    {
    504      1.1  mrg 			      f11 = c[i + j * c_dim1];
    505      1.1  mrg 			      i6 = ll + lsec - 1;
    506      1.1  mrg 			      for (l = ll; l <= i6; ++l)
    507      1.1  mrg 				{
    508      1.1  mrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
    509      1.1  mrg 					  257] * b[l + j * b_dim1];
    510      1.1  mrg 				}
    511      1.1  mrg 			      c[i + j * c_dim1] = f11;
    512      1.1  mrg 			    }
    513      1.1  mrg 			}
    514      1.1  mrg 		    }
    515      1.1  mrg 		}
    516      1.1  mrg 	    }
    517      1.1  mrg 	}
    518      1.1  mrg       free(t1);
    519      1.1  mrg       return;
    520      1.1  mrg     }
    521      1.1  mrg   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
    522      1.1  mrg     {
    523      1.1  mrg       if (GFC_DESCRIPTOR_RANK (a) != 1)
    524      1.1  mrg 	{
    525      1.1  mrg 	  const GFC_REAL_10 *restrict abase_x;
    526      1.1  mrg 	  const GFC_REAL_10 *restrict bbase_y;
    527      1.1  mrg 	  GFC_REAL_10 *restrict dest_y;
    528      1.1  mrg 	  GFC_REAL_10 s;
    529      1.1  mrg 
    530      1.1  mrg 	  for (y = 0; y < ycount; y++)
    531      1.1  mrg 	    {
    532      1.1  mrg 	      bbase_y = &bbase[y*bystride];
    533      1.1  mrg 	      dest_y = &dest[y*rystride];
    534      1.1  mrg 	      for (x = 0; x < xcount; x++)
    535      1.1  mrg 		{
    536      1.1  mrg 		  abase_x = &abase[x*axstride];
    537      1.1  mrg 		  s = (GFC_REAL_10) 0;
    538      1.1  mrg 		  for (n = 0; n < count; n++)
    539      1.1  mrg 		    s += abase_x[n] * bbase_y[n];
    540      1.1  mrg 		  dest_y[x] = s;
    541      1.1  mrg 		}
    542      1.1  mrg 	    }
    543      1.1  mrg 	}
    544      1.1  mrg       else
    545      1.1  mrg 	{
    546      1.1  mrg 	  const GFC_REAL_10 *restrict bbase_y;
    547      1.1  mrg 	  GFC_REAL_10 s;
    548      1.1  mrg 
    549      1.1  mrg 	  for (y = 0; y < ycount; y++)
    550      1.1  mrg 	    {
    551      1.1  mrg 	      bbase_y = &bbase[y*bystride];
    552      1.1  mrg 	      s = (GFC_REAL_10) 0;
    553      1.1  mrg 	      for (n = 0; n < count; n++)
    554      1.1  mrg 		s += abase[n*axstride] * bbase_y[n];
    555      1.1  mrg 	      dest[y*rystride] = s;
    556      1.1  mrg 	    }
    557      1.1  mrg 	}
    558      1.1  mrg     }
    559      1.1  mrg   else if (GFC_DESCRIPTOR_RANK (a) == 1)
    560      1.1  mrg     {
    561      1.1  mrg       const GFC_REAL_10 *restrict bbase_y;
    562      1.1  mrg       GFC_REAL_10 s;
    563      1.1  mrg 
    564      1.1  mrg       for (y = 0; y < ycount; y++)
    565      1.1  mrg 	{
    566      1.1  mrg 	  bbase_y = &bbase[y*bystride];
    567      1.1  mrg 	  s = (GFC_REAL_10) 0;
    568      1.1  mrg 	  for (n = 0; n < count; n++)
    569      1.1  mrg 	    s += abase[n*axstride] * bbase_y[n*bxstride];
    570      1.1  mrg 	  dest[y*rxstride] = s;
    571      1.1  mrg 	}
    572      1.1  mrg     }
    573  1.1.1.2  mrg   else if (axstride < aystride)
    574  1.1.1.2  mrg     {
    575  1.1.1.2  mrg       for (y = 0; y < ycount; y++)
    576  1.1.1.2  mrg 	for (x = 0; x < xcount; x++)
    577  1.1.1.2  mrg 	  dest[x*rxstride + y*rystride] = (GFC_REAL_10)0;
    578  1.1.1.2  mrg 
    579  1.1.1.2  mrg       for (y = 0; y < ycount; y++)
    580  1.1.1.2  mrg 	for (n = 0; n < count; n++)
    581  1.1.1.2  mrg 	  for (x = 0; x < xcount; x++)
    582  1.1.1.2  mrg 	    /* dest[x,y] += a[x,n] * b[n,y] */
    583  1.1.1.2  mrg 	    dest[x*rxstride + y*rystride] +=
    584  1.1.1.2  mrg 					abase[x*axstride + n*aystride] *
    585  1.1.1.2  mrg 					bbase[n*bxstride + y*bystride];
    586  1.1.1.2  mrg     }
    587      1.1  mrg   else
    588      1.1  mrg     {
    589      1.1  mrg       const GFC_REAL_10 *restrict abase_x;
    590      1.1  mrg       const GFC_REAL_10 *restrict bbase_y;
    591      1.1  mrg       GFC_REAL_10 *restrict dest_y;
    592      1.1  mrg       GFC_REAL_10 s;
    593      1.1  mrg 
    594      1.1  mrg       for (y = 0; y < ycount; y++)
    595      1.1  mrg 	{
    596      1.1  mrg 	  bbase_y = &bbase[y*bystride];
    597      1.1  mrg 	  dest_y = &dest[y*rystride];
    598      1.1  mrg 	  for (x = 0; x < xcount; x++)
    599      1.1  mrg 	    {
    600      1.1  mrg 	      abase_x = &abase[x*axstride];
    601      1.1  mrg 	      s = (GFC_REAL_10) 0;
    602      1.1  mrg 	      for (n = 0; n < count; n++)
    603      1.1  mrg 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
    604      1.1  mrg 	      dest_y[x*rxstride] = s;
    605      1.1  mrg 	    }
    606      1.1  mrg 	}
    607      1.1  mrg     }
    608      1.1  mrg }
    609      1.1  mrg #undef POW3
    610      1.1  mrg #undef min
    611      1.1  mrg #undef max
    612      1.1  mrg 
    613      1.1  mrg #endif
    614      1.1  mrg 
    615      1.1  mrg #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
    616      1.1  mrg void
    617      1.1  mrg matmul_r10_avx128_fma4 (gfc_array_r10 * const restrict retarray,
    618      1.1  mrg 	gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
    619      1.1  mrg 	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
    620      1.1  mrg internal_proto(matmul_r10_avx128_fma4);
    621      1.1  mrg void
    622      1.1  mrg matmul_r10_avx128_fma4 (gfc_array_r10 * const restrict retarray,
    623      1.1  mrg 	gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
    624      1.1  mrg 	int blas_limit, blas_call gemm)
    625      1.1  mrg {
    626      1.1  mrg   const GFC_REAL_10 * restrict abase;
    627      1.1  mrg   const GFC_REAL_10 * restrict bbase;
    628      1.1  mrg   GFC_REAL_10 * restrict dest;
    629      1.1  mrg 
    630      1.1  mrg   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
    631      1.1  mrg   index_type x, y, n, count, xcount, ycount;
    632      1.1  mrg 
    633      1.1  mrg   assert (GFC_DESCRIPTOR_RANK (a) == 2
    634      1.1  mrg           || GFC_DESCRIPTOR_RANK (b) == 2);
    635      1.1  mrg 
    636      1.1  mrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
    637      1.1  mrg 
    638      1.1  mrg    Either A or B (but not both) can be rank 1:
    639      1.1  mrg 
    640      1.1  mrg    o One-dimensional argument A is implicitly treated as a row matrix
    641      1.1  mrg      dimensioned [1,count], so xcount=1.
    642      1.1  mrg 
    643      1.1  mrg    o One-dimensional argument B is implicitly treated as a column matrix
    644      1.1  mrg      dimensioned [count, 1], so ycount=1.
    645      1.1  mrg */
    646      1.1  mrg 
    647      1.1  mrg   if (retarray->base_addr == NULL)
    648      1.1  mrg     {
    649      1.1  mrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
    650      1.1  mrg         {
    651      1.1  mrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
    652      1.1  mrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
    653      1.1  mrg         }
    654      1.1  mrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
    655      1.1  mrg         {
    656      1.1  mrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
    657      1.1  mrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
    658      1.1  mrg         }
    659      1.1  mrg       else
    660      1.1  mrg         {
    661      1.1  mrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
    662      1.1  mrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
    663      1.1  mrg 
    664      1.1  mrg           GFC_DIMENSION_SET(retarray->dim[1], 0,
    665      1.1  mrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
    666      1.1  mrg 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
    667      1.1  mrg         }
    668      1.1  mrg 
    669      1.1  mrg       retarray->base_addr
    670      1.1  mrg 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_10));
    671      1.1  mrg       retarray->offset = 0;
    672      1.1  mrg     }
    673      1.1  mrg   else if (unlikely (compile_options.bounds_check))
    674      1.1  mrg     {
    675      1.1  mrg       index_type ret_extent, arg_extent;
    676      1.1  mrg 
    677      1.1  mrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
    678      1.1  mrg 	{
    679      1.1  mrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
    680      1.1  mrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
    681      1.1  mrg 	  if (arg_extent != ret_extent)
    682      1.1  mrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
    683      1.1  mrg 	    		   "array (%ld/%ld) ",
    684      1.1  mrg 			   (long int) ret_extent, (long int) arg_extent);
    685      1.1  mrg 	}
    686      1.1  mrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
    687      1.1  mrg 	{
    688      1.1  mrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
    689      1.1  mrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
    690      1.1  mrg 	  if (arg_extent != ret_extent)
    691      1.1  mrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
    692      1.1  mrg 	    		   "array (%ld/%ld) ",
    693      1.1  mrg 			   (long int) ret_extent, (long int) arg_extent);
    694      1.1  mrg 	}
    695      1.1  mrg       else
    696      1.1  mrg 	{
    697      1.1  mrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
    698      1.1  mrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
    699      1.1  mrg 	  if (arg_extent != ret_extent)
    700      1.1  mrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
    701      1.1  mrg 	    		   "array (%ld/%ld) ",
    702      1.1  mrg 			   (long int) ret_extent, (long int) arg_extent);
    703      1.1  mrg 
    704      1.1  mrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
    705      1.1  mrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
    706      1.1  mrg 	  if (arg_extent != ret_extent)
    707      1.1  mrg 	    runtime_error ("Array bound mismatch for dimension 2 of "
    708      1.1  mrg 	    		   "array (%ld/%ld) ",
    709      1.1  mrg 			   (long int) ret_extent, (long int) arg_extent);
    710      1.1  mrg 	}
    711      1.1  mrg     }
    712      1.1  mrg 
    713      1.1  mrg 
    714      1.1  mrg   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
    715      1.1  mrg     {
    716      1.1  mrg       /* One-dimensional result may be addressed in the code below
    717      1.1  mrg 	 either as a row or a column matrix. We want both cases to
    718      1.1  mrg 	 work. */
    719      1.1  mrg       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
    720      1.1  mrg     }
    721      1.1  mrg   else
    722      1.1  mrg     {
    723      1.1  mrg       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
    724      1.1  mrg       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
    725      1.1  mrg     }
    726      1.1  mrg 
    727      1.1  mrg 
    728      1.1  mrg   if (GFC_DESCRIPTOR_RANK (a) == 1)
    729      1.1  mrg     {
    730      1.1  mrg       /* Treat it as a a row matrix A[1,count]. */
    731      1.1  mrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
    732      1.1  mrg       aystride = 1;
    733      1.1  mrg 
    734      1.1  mrg       xcount = 1;
    735      1.1  mrg       count = GFC_DESCRIPTOR_EXTENT(a,0);
    736      1.1  mrg     }
    737      1.1  mrg   else
    738      1.1  mrg     {
    739      1.1  mrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
    740      1.1  mrg       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
    741      1.1  mrg 
    742      1.1  mrg       count = GFC_DESCRIPTOR_EXTENT(a,1);
    743      1.1  mrg       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
    744      1.1  mrg     }
    745      1.1  mrg 
    746      1.1  mrg   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
    747      1.1  mrg     {
    748      1.1  mrg       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
    749      1.1  mrg 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
    750      1.1  mrg 		       "in dimension 1: is %ld, should be %ld",
    751      1.1  mrg 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
    752      1.1  mrg     }
    753      1.1  mrg 
    754      1.1  mrg   if (GFC_DESCRIPTOR_RANK (b) == 1)
    755      1.1  mrg     {
    756      1.1  mrg       /* Treat it as a column matrix B[count,1] */
    757      1.1  mrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
    758      1.1  mrg 
    759      1.1  mrg       /* bystride should never be used for 1-dimensional b.
    760      1.1  mrg          The value is only used for calculation of the
    761      1.1  mrg          memory by the buffer.  */
    762      1.1  mrg       bystride = 256;
    763      1.1  mrg       ycount = 1;
    764      1.1  mrg     }
    765      1.1  mrg   else
    766      1.1  mrg     {
    767      1.1  mrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
    768      1.1  mrg       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
    769      1.1  mrg       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
    770      1.1  mrg     }
    771      1.1  mrg 
    772      1.1  mrg   abase = a->base_addr;
    773      1.1  mrg   bbase = b->base_addr;
    774      1.1  mrg   dest = retarray->base_addr;
    775      1.1  mrg 
    776      1.1  mrg   /* Now that everything is set up, we perform the multiplication
    777      1.1  mrg      itself.  */
    778      1.1  mrg 
    779      1.1  mrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
    780      1.1  mrg #define min(a,b) ((a) <= (b) ? (a) : (b))
    781      1.1  mrg #define max(a,b) ((a) >= (b) ? (a) : (b))
    782      1.1  mrg 
    783      1.1  mrg   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
    784      1.1  mrg       && (bxstride == 1 || bystride == 1)
    785      1.1  mrg       && (((float) xcount) * ((float) ycount) * ((float) count)
    786      1.1  mrg           > POW3(blas_limit)))
    787      1.1  mrg     {
    788      1.1  mrg       const int m = xcount, n = ycount, k = count, ldc = rystride;
    789      1.1  mrg       const GFC_REAL_10 one = 1, zero = 0;
    790      1.1  mrg       const int lda = (axstride == 1) ? aystride : axstride,
    791      1.1  mrg 		ldb = (bxstride == 1) ? bystride : bxstride;
    792      1.1  mrg 
    793      1.1  mrg       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
    794      1.1  mrg 	{
    795      1.1  mrg 	  assert (gemm != NULL);
    796      1.1  mrg 	  const char *transa, *transb;
    797      1.1  mrg 	  if (try_blas & 2)
    798      1.1  mrg 	    transa = "C";
    799      1.1  mrg 	  else
    800      1.1  mrg 	    transa = axstride == 1 ? "N" : "T";
    801      1.1  mrg 
    802      1.1  mrg 	  if (try_blas & 4)
    803      1.1  mrg 	    transb = "C";
    804      1.1  mrg 	  else
    805      1.1  mrg 	    transb = bxstride == 1 ? "N" : "T";
    806      1.1  mrg 
    807      1.1  mrg 	  gemm (transa, transb , &m,
    808      1.1  mrg 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
    809      1.1  mrg 		&ldc, 1, 1);
    810      1.1  mrg 	  return;
    811      1.1  mrg 	}
    812      1.1  mrg     }
    813      1.1  mrg 
    814  1.1.1.2  mrg   if (rxstride == 1 && axstride == 1 && bxstride == 1
    815  1.1.1.2  mrg       && GFC_DESCRIPTOR_RANK (b) != 1)
    816      1.1  mrg     {
    817      1.1  mrg       /* This block of code implements a tuned matmul, derived from
    818      1.1  mrg          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
    819      1.1  mrg 
    820      1.1  mrg                Bo Kagstrom and Per Ling
    821      1.1  mrg                Department of Computing Science
    822      1.1  mrg                Umea University
    823      1.1  mrg                S-901 87 Umea, Sweden
    824      1.1  mrg 
    825      1.1  mrg 	 from netlib.org, translated to C, and modified for matmul.m4.  */
    826      1.1  mrg 
    827      1.1  mrg       const GFC_REAL_10 *a, *b;
    828      1.1  mrg       GFC_REAL_10 *c;
    829      1.1  mrg       const index_type m = xcount, n = ycount, k = count;
    830      1.1  mrg 
    831      1.1  mrg       /* System generated locals */
    832      1.1  mrg       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
    833      1.1  mrg 		 i1, i2, i3, i4, i5, i6;
    834      1.1  mrg 
    835      1.1  mrg       /* Local variables */
    836      1.1  mrg       GFC_REAL_10 f11, f12, f21, f22, f31, f32, f41, f42,
    837      1.1  mrg 		 f13, f14, f23, f24, f33, f34, f43, f44;
    838      1.1  mrg       index_type i, j, l, ii, jj, ll;
    839      1.1  mrg       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
    840      1.1  mrg       GFC_REAL_10 *t1;
    841      1.1  mrg 
    842      1.1  mrg       a = abase;
    843      1.1  mrg       b = bbase;
    844      1.1  mrg       c = retarray->base_addr;
    845      1.1  mrg 
    846      1.1  mrg       /* Parameter adjustments */
    847      1.1  mrg       c_dim1 = rystride;
    848      1.1  mrg       c_offset = 1 + c_dim1;
    849      1.1  mrg       c -= c_offset;
    850      1.1  mrg       a_dim1 = aystride;
    851      1.1  mrg       a_offset = 1 + a_dim1;
    852      1.1  mrg       a -= a_offset;
    853      1.1  mrg       b_dim1 = bystride;
    854      1.1  mrg       b_offset = 1 + b_dim1;
    855      1.1  mrg       b -= b_offset;
    856      1.1  mrg 
    857      1.1  mrg       /* Empty c first.  */
    858      1.1  mrg       for (j=1; j<=n; j++)
    859      1.1  mrg 	for (i=1; i<=m; i++)
    860      1.1  mrg 	  c[i + j * c_dim1] = (GFC_REAL_10)0;
    861      1.1  mrg 
    862      1.1  mrg       /* Early exit if possible */
    863      1.1  mrg       if (m == 0 || n == 0 || k == 0)
    864      1.1  mrg 	return;
    865      1.1  mrg 
    866      1.1  mrg       /* Adjust size of t1 to what is needed.  */
    867      1.1  mrg       index_type t1_dim, a_sz;
    868      1.1  mrg       if (aystride == 1)
    869      1.1  mrg         a_sz = rystride;
    870      1.1  mrg       else
    871      1.1  mrg         a_sz = a_dim1;
    872      1.1  mrg 
    873      1.1  mrg       t1_dim = a_sz * 256 + b_dim1;
    874      1.1  mrg       if (t1_dim > 65536)
    875      1.1  mrg 	t1_dim = 65536;
    876      1.1  mrg 
    877      1.1  mrg       t1 = malloc (t1_dim * sizeof(GFC_REAL_10));
    878      1.1  mrg 
    879      1.1  mrg       /* Start turning the crank. */
    880      1.1  mrg       i1 = n;
    881      1.1  mrg       for (jj = 1; jj <= i1; jj += 512)
    882      1.1  mrg 	{
    883      1.1  mrg 	  /* Computing MIN */
    884      1.1  mrg 	  i2 = 512;
    885      1.1  mrg 	  i3 = n - jj + 1;
    886      1.1  mrg 	  jsec = min(i2,i3);
    887      1.1  mrg 	  ujsec = jsec - jsec % 4;
    888      1.1  mrg 	  i2 = k;
    889      1.1  mrg 	  for (ll = 1; ll <= i2; ll += 256)
    890      1.1  mrg 	    {
    891      1.1  mrg 	      /* Computing MIN */
    892      1.1  mrg 	      i3 = 256;
    893      1.1  mrg 	      i4 = k - ll + 1;
    894      1.1  mrg 	      lsec = min(i3,i4);
    895      1.1  mrg 	      ulsec = lsec - lsec % 2;
    896      1.1  mrg 
    897      1.1  mrg 	      i3 = m;
    898      1.1  mrg 	      for (ii = 1; ii <= i3; ii += 256)
    899      1.1  mrg 		{
    900      1.1  mrg 		  /* Computing MIN */
    901      1.1  mrg 		  i4 = 256;
    902      1.1  mrg 		  i5 = m - ii + 1;
    903      1.1  mrg 		  isec = min(i4,i5);
    904      1.1  mrg 		  uisec = isec - isec % 2;
    905      1.1  mrg 		  i4 = ll + ulsec - 1;
    906      1.1  mrg 		  for (l = ll; l <= i4; l += 2)
    907      1.1  mrg 		    {
    908      1.1  mrg 		      i5 = ii + uisec - 1;
    909      1.1  mrg 		      for (i = ii; i <= i5; i += 2)
    910      1.1  mrg 			{
    911      1.1  mrg 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
    912      1.1  mrg 					a[i + l * a_dim1];
    913      1.1  mrg 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
    914      1.1  mrg 					a[i + (l + 1) * a_dim1];
    915      1.1  mrg 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
    916      1.1  mrg 					a[i + 1 + l * a_dim1];
    917      1.1  mrg 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
    918      1.1  mrg 					a[i + 1 + (l + 1) * a_dim1];
    919      1.1  mrg 			}
    920      1.1  mrg 		      if (uisec < isec)
    921      1.1  mrg 			{
    922      1.1  mrg 			  t1[l - ll + 1 + (isec << 8) - 257] =
    923      1.1  mrg 				    a[ii + isec - 1 + l * a_dim1];
    924      1.1  mrg 			  t1[l - ll + 2 + (isec << 8) - 257] =
    925      1.1  mrg 				    a[ii + isec - 1 + (l + 1) * a_dim1];
    926      1.1  mrg 			}
    927      1.1  mrg 		    }
    928      1.1  mrg 		  if (ulsec < lsec)
    929      1.1  mrg 		    {
    930      1.1  mrg 		      i4 = ii + isec - 1;
    931      1.1  mrg 		      for (i = ii; i<= i4; ++i)
    932      1.1  mrg 			{
    933      1.1  mrg 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
    934      1.1  mrg 				    a[i + (ll + lsec - 1) * a_dim1];
    935      1.1  mrg 			}
    936      1.1  mrg 		    }
    937      1.1  mrg 
    938      1.1  mrg 		  uisec = isec - isec % 4;
    939      1.1  mrg 		  i4 = jj + ujsec - 1;
    940      1.1  mrg 		  for (j = jj; j <= i4; j += 4)
    941      1.1  mrg 		    {
    942      1.1  mrg 		      i5 = ii + uisec - 1;
    943      1.1  mrg 		      for (i = ii; i <= i5; i += 4)
    944      1.1  mrg 			{
    945      1.1  mrg 			  f11 = c[i + j * c_dim1];
    946      1.1  mrg 			  f21 = c[i + 1 + j * c_dim1];
    947      1.1  mrg 			  f12 = c[i + (j + 1) * c_dim1];
    948      1.1  mrg 			  f22 = c[i + 1 + (j + 1) * c_dim1];
    949      1.1  mrg 			  f13 = c[i + (j + 2) * c_dim1];
    950      1.1  mrg 			  f23 = c[i + 1 + (j + 2) * c_dim1];
    951      1.1  mrg 			  f14 = c[i + (j + 3) * c_dim1];
    952      1.1  mrg 			  f24 = c[i + 1 + (j + 3) * c_dim1];
    953      1.1  mrg 			  f31 = c[i + 2 + j * c_dim1];
    954      1.1  mrg 			  f41 = c[i + 3 + j * c_dim1];
    955      1.1  mrg 			  f32 = c[i + 2 + (j + 1) * c_dim1];
    956      1.1  mrg 			  f42 = c[i + 3 + (j + 1) * c_dim1];
    957      1.1  mrg 			  f33 = c[i + 2 + (j + 2) * c_dim1];
    958      1.1  mrg 			  f43 = c[i + 3 + (j + 2) * c_dim1];
    959      1.1  mrg 			  f34 = c[i + 2 + (j + 3) * c_dim1];
    960      1.1  mrg 			  f44 = c[i + 3 + (j + 3) * c_dim1];
    961      1.1  mrg 			  i6 = ll + lsec - 1;
    962      1.1  mrg 			  for (l = ll; l <= i6; ++l)
    963      1.1  mrg 			    {
    964      1.1  mrg 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
    965      1.1  mrg 				      * b[l + j * b_dim1];
    966      1.1  mrg 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
    967      1.1  mrg 				      * b[l + j * b_dim1];
    968      1.1  mrg 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
    969      1.1  mrg 				      * b[l + (j + 1) * b_dim1];
    970      1.1  mrg 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
    971      1.1  mrg 				      * b[l + (j + 1) * b_dim1];
    972      1.1  mrg 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
    973      1.1  mrg 				      * b[l + (j + 2) * b_dim1];
    974      1.1  mrg 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
    975      1.1  mrg 				      * b[l + (j + 2) * b_dim1];
    976      1.1  mrg 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
    977      1.1  mrg 				      * b[l + (j + 3) * b_dim1];
    978      1.1  mrg 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
    979      1.1  mrg 				      * b[l + (j + 3) * b_dim1];
    980      1.1  mrg 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
    981      1.1  mrg 				      * b[l + j * b_dim1];
    982      1.1  mrg 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
    983      1.1  mrg 				      * b[l + j * b_dim1];
    984      1.1  mrg 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
    985      1.1  mrg 				      * b[l + (j + 1) * b_dim1];
    986      1.1  mrg 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
    987      1.1  mrg 				      * b[l + (j + 1) * b_dim1];
    988      1.1  mrg 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
    989      1.1  mrg 				      * b[l + (j + 2) * b_dim1];
    990      1.1  mrg 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
    991      1.1  mrg 				      * b[l + (j + 2) * b_dim1];
    992      1.1  mrg 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
    993      1.1  mrg 				      * b[l + (j + 3) * b_dim1];
    994      1.1  mrg 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
    995      1.1  mrg 				      * b[l + (j + 3) * b_dim1];
    996      1.1  mrg 			    }
    997      1.1  mrg 			  c[i + j * c_dim1] = f11;
    998      1.1  mrg 			  c[i + 1 + j * c_dim1] = f21;
    999      1.1  mrg 			  c[i + (j + 1) * c_dim1] = f12;
   1000      1.1  mrg 			  c[i + 1 + (j + 1) * c_dim1] = f22;
   1001      1.1  mrg 			  c[i + (j + 2) * c_dim1] = f13;
   1002      1.1  mrg 			  c[i + 1 + (j + 2) * c_dim1] = f23;
   1003      1.1  mrg 			  c[i + (j + 3) * c_dim1] = f14;
   1004      1.1  mrg 			  c[i + 1 + (j + 3) * c_dim1] = f24;
   1005      1.1  mrg 			  c[i + 2 + j * c_dim1] = f31;
   1006      1.1  mrg 			  c[i + 3 + j * c_dim1] = f41;
   1007      1.1  mrg 			  c[i + 2 + (j + 1) * c_dim1] = f32;
   1008      1.1  mrg 			  c[i + 3 + (j + 1) * c_dim1] = f42;
   1009      1.1  mrg 			  c[i + 2 + (j + 2) * c_dim1] = f33;
   1010      1.1  mrg 			  c[i + 3 + (j + 2) * c_dim1] = f43;
   1011      1.1  mrg 			  c[i + 2 + (j + 3) * c_dim1] = f34;
   1012      1.1  mrg 			  c[i + 3 + (j + 3) * c_dim1] = f44;
   1013      1.1  mrg 			}
   1014      1.1  mrg 		      if (uisec < isec)
   1015      1.1  mrg 			{
   1016      1.1  mrg 			  i5 = ii + isec - 1;
   1017      1.1  mrg 			  for (i = ii + uisec; i <= i5; ++i)
   1018      1.1  mrg 			    {
   1019      1.1  mrg 			      f11 = c[i + j * c_dim1];
   1020      1.1  mrg 			      f12 = c[i + (j + 1) * c_dim1];
   1021      1.1  mrg 			      f13 = c[i + (j + 2) * c_dim1];
   1022      1.1  mrg 			      f14 = c[i + (j + 3) * c_dim1];
   1023      1.1  mrg 			      i6 = ll + lsec - 1;
   1024      1.1  mrg 			      for (l = ll; l <= i6; ++l)
   1025      1.1  mrg 				{
   1026      1.1  mrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
   1027      1.1  mrg 					  257] * b[l + j * b_dim1];
   1028      1.1  mrg 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
   1029      1.1  mrg 					  257] * b[l + (j + 1) * b_dim1];
   1030      1.1  mrg 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
   1031      1.1  mrg 					  257] * b[l + (j + 2) * b_dim1];
   1032      1.1  mrg 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
   1033      1.1  mrg 					  257] * b[l + (j + 3) * b_dim1];
   1034      1.1  mrg 				}
   1035      1.1  mrg 			      c[i + j * c_dim1] = f11;
   1036      1.1  mrg 			      c[i + (j + 1) * c_dim1] = f12;
   1037      1.1  mrg 			      c[i + (j + 2) * c_dim1] = f13;
   1038      1.1  mrg 			      c[i + (j + 3) * c_dim1] = f14;
   1039      1.1  mrg 			    }
   1040      1.1  mrg 			}
   1041      1.1  mrg 		    }
   1042      1.1  mrg 		  if (ujsec < jsec)
   1043      1.1  mrg 		    {
   1044      1.1  mrg 		      i4 = jj + jsec - 1;
   1045      1.1  mrg 		      for (j = jj + ujsec; j <= i4; ++j)
   1046      1.1  mrg 			{
   1047      1.1  mrg 			  i5 = ii + uisec - 1;
   1048      1.1  mrg 			  for (i = ii; i <= i5; i += 4)
   1049      1.1  mrg 			    {
   1050      1.1  mrg 			      f11 = c[i + j * c_dim1];
   1051      1.1  mrg 			      f21 = c[i + 1 + j * c_dim1];
   1052      1.1  mrg 			      f31 = c[i + 2 + j * c_dim1];
   1053      1.1  mrg 			      f41 = c[i + 3 + j * c_dim1];
   1054      1.1  mrg 			      i6 = ll + lsec - 1;
   1055      1.1  mrg 			      for (l = ll; l <= i6; ++l)
   1056      1.1  mrg 				{
   1057      1.1  mrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
   1058      1.1  mrg 					  257] * b[l + j * b_dim1];
   1059      1.1  mrg 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
   1060      1.1  mrg 					  257] * b[l + j * b_dim1];
   1061      1.1  mrg 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
   1062      1.1  mrg 					  257] * b[l + j * b_dim1];
   1063      1.1  mrg 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
   1064      1.1  mrg 					  257] * b[l + j * b_dim1];
   1065      1.1  mrg 				}
   1066      1.1  mrg 			      c[i + j * c_dim1] = f11;
   1067      1.1  mrg 			      c[i + 1 + j * c_dim1] = f21;
   1068      1.1  mrg 			      c[i + 2 + j * c_dim1] = f31;
   1069      1.1  mrg 			      c[i + 3 + j * c_dim1] = f41;
   1070      1.1  mrg 			    }
   1071      1.1  mrg 			  i5 = ii + isec - 1;
   1072      1.1  mrg 			  for (i = ii + uisec; i <= i5; ++i)
   1073      1.1  mrg 			    {
   1074      1.1  mrg 			      f11 = c[i + j * c_dim1];
   1075      1.1  mrg 			      i6 = ll + lsec - 1;
   1076      1.1  mrg 			      for (l = ll; l <= i6; ++l)
   1077      1.1  mrg 				{
   1078      1.1  mrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
   1079      1.1  mrg 					  257] * b[l + j * b_dim1];
   1080      1.1  mrg 				}
   1081      1.1  mrg 			      c[i + j * c_dim1] = f11;
   1082      1.1  mrg 			    }
   1083      1.1  mrg 			}
   1084      1.1  mrg 		    }
   1085      1.1  mrg 		}
   1086      1.1  mrg 	    }
   1087      1.1  mrg 	}
   1088      1.1  mrg       free(t1);
   1089      1.1  mrg       return;
   1090      1.1  mrg     }
   1091      1.1  mrg   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
   1092      1.1  mrg     {
   1093      1.1  mrg       if (GFC_DESCRIPTOR_RANK (a) != 1)
   1094      1.1  mrg 	{
   1095      1.1  mrg 	  const GFC_REAL_10 *restrict abase_x;
   1096      1.1  mrg 	  const GFC_REAL_10 *restrict bbase_y;
   1097      1.1  mrg 	  GFC_REAL_10 *restrict dest_y;
   1098      1.1  mrg 	  GFC_REAL_10 s;
   1099      1.1  mrg 
   1100      1.1  mrg 	  for (y = 0; y < ycount; y++)
   1101      1.1  mrg 	    {
   1102      1.1  mrg 	      bbase_y = &bbase[y*bystride];
   1103      1.1  mrg 	      dest_y = &dest[y*rystride];
   1104      1.1  mrg 	      for (x = 0; x < xcount; x++)
   1105      1.1  mrg 		{
   1106      1.1  mrg 		  abase_x = &abase[x*axstride];
   1107      1.1  mrg 		  s = (GFC_REAL_10) 0;
   1108      1.1  mrg 		  for (n = 0; n < count; n++)
   1109      1.1  mrg 		    s += abase_x[n] * bbase_y[n];
   1110      1.1  mrg 		  dest_y[x] = s;
   1111      1.1  mrg 		}
   1112      1.1  mrg 	    }
   1113      1.1  mrg 	}
   1114      1.1  mrg       else
   1115      1.1  mrg 	{
   1116      1.1  mrg 	  const GFC_REAL_10 *restrict bbase_y;
   1117      1.1  mrg 	  GFC_REAL_10 s;
   1118      1.1  mrg 
   1119      1.1  mrg 	  for (y = 0; y < ycount; y++)
   1120      1.1  mrg 	    {
   1121      1.1  mrg 	      bbase_y = &bbase[y*bystride];
   1122      1.1  mrg 	      s = (GFC_REAL_10) 0;
   1123      1.1  mrg 	      for (n = 0; n < count; n++)
   1124      1.1  mrg 		s += abase[n*axstride] * bbase_y[n];
   1125      1.1  mrg 	      dest[y*rystride] = s;
   1126      1.1  mrg 	    }
   1127      1.1  mrg 	}
   1128      1.1  mrg     }
   1129      1.1  mrg   else if (GFC_DESCRIPTOR_RANK (a) == 1)
   1130      1.1  mrg     {
   1131      1.1  mrg       const GFC_REAL_10 *restrict bbase_y;
   1132      1.1  mrg       GFC_REAL_10 s;
   1133      1.1  mrg 
   1134      1.1  mrg       for (y = 0; y < ycount; y++)
   1135      1.1  mrg 	{
   1136      1.1  mrg 	  bbase_y = &bbase[y*bystride];
   1137      1.1  mrg 	  s = (GFC_REAL_10) 0;
   1138      1.1  mrg 	  for (n = 0; n < count; n++)
   1139      1.1  mrg 	    s += abase[n*axstride] * bbase_y[n*bxstride];
   1140      1.1  mrg 	  dest[y*rxstride] = s;
   1141      1.1  mrg 	}
   1142      1.1  mrg     }
   1143  1.1.1.2  mrg   else if (axstride < aystride)
   1144  1.1.1.2  mrg     {
   1145  1.1.1.2  mrg       for (y = 0; y < ycount; y++)
   1146  1.1.1.2  mrg 	for (x = 0; x < xcount; x++)
   1147  1.1.1.2  mrg 	  dest[x*rxstride + y*rystride] = (GFC_REAL_10)0;
   1148  1.1.1.2  mrg 
   1149  1.1.1.2  mrg       for (y = 0; y < ycount; y++)
   1150  1.1.1.2  mrg 	for (n = 0; n < count; n++)
   1151  1.1.1.2  mrg 	  for (x = 0; x < xcount; x++)
   1152  1.1.1.2  mrg 	    /* dest[x,y] += a[x,n] * b[n,y] */
   1153  1.1.1.2  mrg 	    dest[x*rxstride + y*rystride] +=
   1154  1.1.1.2  mrg 					abase[x*axstride + n*aystride] *
   1155  1.1.1.2  mrg 					bbase[n*bxstride + y*bystride];
   1156  1.1.1.2  mrg     }
   1157      1.1  mrg   else
   1158      1.1  mrg     {
   1159      1.1  mrg       const GFC_REAL_10 *restrict abase_x;
   1160      1.1  mrg       const GFC_REAL_10 *restrict bbase_y;
   1161      1.1  mrg       GFC_REAL_10 *restrict dest_y;
   1162      1.1  mrg       GFC_REAL_10 s;
   1163      1.1  mrg 
   1164      1.1  mrg       for (y = 0; y < ycount; y++)
   1165      1.1  mrg 	{
   1166      1.1  mrg 	  bbase_y = &bbase[y*bystride];
   1167      1.1  mrg 	  dest_y = &dest[y*rystride];
   1168      1.1  mrg 	  for (x = 0; x < xcount; x++)
   1169      1.1  mrg 	    {
   1170      1.1  mrg 	      abase_x = &abase[x*axstride];
   1171      1.1  mrg 	      s = (GFC_REAL_10) 0;
   1172      1.1  mrg 	      for (n = 0; n < count; n++)
   1173      1.1  mrg 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
   1174      1.1  mrg 	      dest_y[x*rxstride] = s;
   1175      1.1  mrg 	    }
   1176      1.1  mrg 	}
   1177      1.1  mrg     }
   1178      1.1  mrg }
   1179      1.1  mrg #undef POW3
   1180      1.1  mrg #undef min
   1181      1.1  mrg #undef max
   1182      1.1  mrg 
   1183      1.1  mrg #endif
   1184      1.1  mrg 
   1185      1.1  mrg #endif
   1186      1.1  mrg 
   1187