Home | History | Annotate | Line # | Download | only in generated
      1      1.1  mrg /* Implementation of the MATMUL intrinsic
      2  1.1.1.4  mrg    Copyright (C) 2002-2024 Free Software Foundation, Inc.
      3      1.1  mrg    Contributed by Paul Brook <paul (at) nowt.org>
      4      1.1  mrg 
      5      1.1  mrg This file is part of the GNU Fortran runtime library (libgfortran).
      6      1.1  mrg 
      7      1.1  mrg Libgfortran is free software; you can redistribute it and/or
      8      1.1  mrg modify it under the terms of the GNU General Public
      9      1.1  mrg License as published by the Free Software Foundation; either
     10      1.1  mrg version 3 of the License, or (at your option) any later version.
     11      1.1  mrg 
     12      1.1  mrg Libgfortran is distributed in the hope that it will be useful,
     13      1.1  mrg but WITHOUT ANY WARRANTY; without even the implied warranty of
     14      1.1  mrg MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     15      1.1  mrg GNU General Public License for more details.
     16      1.1  mrg 
     17      1.1  mrg Under Section 7 of GPL version 3, you are granted additional
     18      1.1  mrg permissions described in the GCC Runtime Library Exception, version
     19      1.1  mrg 3.1, as published by the Free Software Foundation.
     20      1.1  mrg 
     21      1.1  mrg You should have received a copy of the GNU General Public License and
     22      1.1  mrg a copy of the GCC Runtime Library Exception along with this program;
     23      1.1  mrg see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     24      1.1  mrg <http://www.gnu.org/licenses/>.  */
     25      1.1  mrg 
     26      1.1  mrg #include "libgfortran.h"
     27      1.1  mrg #include <string.h>
     28      1.1  mrg #include <assert.h>
     29      1.1  mrg 
     30      1.1  mrg 
     31      1.1  mrg #if defined (HAVE_GFC_COMPLEX_16)
     32      1.1  mrg 
     33      1.1  mrg /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
     34      1.1  mrg    passed to us by the front-end, in which case we call it for large
     35      1.1  mrg    matrices.  */
     36      1.1  mrg 
     37      1.1  mrg typedef void (*blas_call)(const char *, const char *, const int *, const int *,
     38      1.1  mrg                           const int *, const GFC_COMPLEX_16 *, const GFC_COMPLEX_16 *,
     39      1.1  mrg                           const int *, const GFC_COMPLEX_16 *, const int *,
     40      1.1  mrg                           const GFC_COMPLEX_16 *, GFC_COMPLEX_16 *, const int *,
     41      1.1  mrg                           int, int);
     42      1.1  mrg 
     43      1.1  mrg /* The order of loops is different in the case of plain matrix
     44      1.1  mrg    multiplication C=MATMUL(A,B), and in the frequent special case where
     45      1.1  mrg    the argument A is the temporary result of a TRANSPOSE intrinsic:
     46      1.1  mrg    C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
     47      1.1  mrg    looking at their strides.
     48      1.1  mrg 
     49      1.1  mrg    The equivalent Fortran pseudo-code is:
     50      1.1  mrg 
     51      1.1  mrg    DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
     52      1.1  mrg    IF (.NOT.IS_TRANSPOSED(A)) THEN
     53      1.1  mrg      C = 0
     54      1.1  mrg      DO J=1,N
     55      1.1  mrg        DO K=1,COUNT
     56      1.1  mrg          DO I=1,M
     57      1.1  mrg            C(I,J) = C(I,J)+A(I,K)*B(K,J)
     58      1.1  mrg    ELSE
     59      1.1  mrg      DO J=1,N
     60      1.1  mrg        DO I=1,M
     61      1.1  mrg          S = 0
     62      1.1  mrg          DO K=1,COUNT
     63      1.1  mrg            S = S+A(I,K)*B(K,J)
     64      1.1  mrg          C(I,J) = S
     65      1.1  mrg    ENDIF
     66      1.1  mrg */
     67      1.1  mrg 
     68      1.1  mrg /* If try_blas is set to a nonzero value, then the matmul function will
     69      1.1  mrg    see if there is a way to perform the matrix multiplication by a call
     70      1.1  mrg    to the BLAS gemm function.  */
     71      1.1  mrg 
     72      1.1  mrg extern void matmul_c16 (gfc_array_c16 * const restrict retarray,
     73      1.1  mrg 	gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
     74      1.1  mrg 	int blas_limit, blas_call gemm);
     75      1.1  mrg export_proto(matmul_c16);
     76      1.1  mrg 
     77      1.1  mrg /* Put exhaustive list of possible architectures here here, ORed together.  */
     78      1.1  mrg 
     79      1.1  mrg #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
     80      1.1  mrg 
     81      1.1  mrg #ifdef HAVE_AVX
     82      1.1  mrg static void
     83      1.1  mrg matmul_c16_avx (gfc_array_c16 * const restrict retarray,
     84      1.1  mrg 	gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
     85      1.1  mrg 	int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
     86      1.1  mrg static void
     87      1.1  mrg matmul_c16_avx (gfc_array_c16 * const restrict retarray,
     88      1.1  mrg 	gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
     89      1.1  mrg 	int blas_limit, blas_call gemm)
     90      1.1  mrg {
     91      1.1  mrg   const GFC_COMPLEX_16 * restrict abase;
     92      1.1  mrg   const GFC_COMPLEX_16 * restrict bbase;
     93      1.1  mrg   GFC_COMPLEX_16 * restrict dest;
     94      1.1  mrg 
     95      1.1  mrg   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
     96      1.1  mrg   index_type x, y, n, count, xcount, ycount;
     97      1.1  mrg 
     98      1.1  mrg   assert (GFC_DESCRIPTOR_RANK (a) == 2
     99      1.1  mrg           || GFC_DESCRIPTOR_RANK (b) == 2);
    100      1.1  mrg 
    101      1.1  mrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
    102      1.1  mrg 
    103      1.1  mrg    Either A or B (but not both) can be rank 1:
    104      1.1  mrg 
    105      1.1  mrg    o One-dimensional argument A is implicitly treated as a row matrix
    106      1.1  mrg      dimensioned [1,count], so xcount=1.
    107      1.1  mrg 
    108      1.1  mrg    o One-dimensional argument B is implicitly treated as a column matrix
    109      1.1  mrg      dimensioned [count, 1], so ycount=1.
    110      1.1  mrg */
    111      1.1  mrg 
    112      1.1  mrg   if (retarray->base_addr == NULL)
    113      1.1  mrg     {
    114      1.1  mrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
    115      1.1  mrg         {
    116      1.1  mrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
    117      1.1  mrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
    118      1.1  mrg         }
    119      1.1  mrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
    120      1.1  mrg         {
    121      1.1  mrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
    122      1.1  mrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
    123      1.1  mrg         }
    124      1.1  mrg       else
    125      1.1  mrg         {
    126      1.1  mrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
    127      1.1  mrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
    128      1.1  mrg 
    129      1.1  mrg           GFC_DIMENSION_SET(retarray->dim[1], 0,
    130      1.1  mrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
    131      1.1  mrg 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
    132      1.1  mrg         }
    133      1.1  mrg 
    134      1.1  mrg       retarray->base_addr
    135      1.1  mrg 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_16));
    136      1.1  mrg       retarray->offset = 0;
    137      1.1  mrg     }
    138      1.1  mrg   else if (unlikely (compile_options.bounds_check))
    139      1.1  mrg     {
    140      1.1  mrg       index_type ret_extent, arg_extent;
    141      1.1  mrg 
    142      1.1  mrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
    143      1.1  mrg 	{
    144      1.1  mrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
    145      1.1  mrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
    146      1.1  mrg 	  if (arg_extent != ret_extent)
    147      1.1  mrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
    148      1.1  mrg 	    		   "array (%ld/%ld) ",
    149      1.1  mrg 			   (long int) ret_extent, (long int) arg_extent);
    150      1.1  mrg 	}
    151      1.1  mrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
    152      1.1  mrg 	{
    153      1.1  mrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
    154      1.1  mrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
    155      1.1  mrg 	  if (arg_extent != ret_extent)
    156      1.1  mrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
    157      1.1  mrg 	    		   "array (%ld/%ld) ",
    158      1.1  mrg 			   (long int) ret_extent, (long int) arg_extent);
    159      1.1  mrg 	}
    160      1.1  mrg       else
    161      1.1  mrg 	{
    162      1.1  mrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
    163      1.1  mrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
    164      1.1  mrg 	  if (arg_extent != ret_extent)
    165      1.1  mrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
    166      1.1  mrg 	    		   "array (%ld/%ld) ",
    167      1.1  mrg 			   (long int) ret_extent, (long int) arg_extent);
    168      1.1  mrg 
    169      1.1  mrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
    170      1.1  mrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
    171      1.1  mrg 	  if (arg_extent != ret_extent)
    172      1.1  mrg 	    runtime_error ("Array bound mismatch for dimension 2 of "
    173      1.1  mrg 	    		   "array (%ld/%ld) ",
    174      1.1  mrg 			   (long int) ret_extent, (long int) arg_extent);
    175      1.1  mrg 	}
    176      1.1  mrg     }
    177      1.1  mrg 
    178      1.1  mrg 
    179      1.1  mrg   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
    180      1.1  mrg     {
    181      1.1  mrg       /* One-dimensional result may be addressed in the code below
    182      1.1  mrg 	 either as a row or a column matrix. We want both cases to
    183      1.1  mrg 	 work. */
    184      1.1  mrg       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
    185      1.1  mrg     }
    186      1.1  mrg   else
    187      1.1  mrg     {
    188      1.1  mrg       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
    189      1.1  mrg       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
    190      1.1  mrg     }
    191      1.1  mrg 
    192      1.1  mrg 
    193      1.1  mrg   if (GFC_DESCRIPTOR_RANK (a) == 1)
    194      1.1  mrg     {
    195      1.1  mrg       /* Treat it as a a row matrix A[1,count]. */
    196      1.1  mrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
    197      1.1  mrg       aystride = 1;
    198      1.1  mrg 
    199      1.1  mrg       xcount = 1;
    200      1.1  mrg       count = GFC_DESCRIPTOR_EXTENT(a,0);
    201      1.1  mrg     }
    202      1.1  mrg   else
    203      1.1  mrg     {
    204      1.1  mrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
    205      1.1  mrg       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
    206      1.1  mrg 
    207      1.1  mrg       count = GFC_DESCRIPTOR_EXTENT(a,1);
    208      1.1  mrg       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
    209      1.1  mrg     }
    210      1.1  mrg 
    211      1.1  mrg   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
    212      1.1  mrg     {
    213      1.1  mrg       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
    214      1.1  mrg 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
    215      1.1  mrg 		       "in dimension 1: is %ld, should be %ld",
    216      1.1  mrg 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
    217      1.1  mrg     }
    218      1.1  mrg 
    219      1.1  mrg   if (GFC_DESCRIPTOR_RANK (b) == 1)
    220      1.1  mrg     {
    221      1.1  mrg       /* Treat it as a column matrix B[count,1] */
    222      1.1  mrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
    223      1.1  mrg 
    224      1.1  mrg       /* bystride should never be used for 1-dimensional b.
    225      1.1  mrg          The value is only used for calculation of the
    226      1.1  mrg          memory by the buffer.  */
    227      1.1  mrg       bystride = 256;
    228      1.1  mrg       ycount = 1;
    229      1.1  mrg     }
    230      1.1  mrg   else
    231      1.1  mrg     {
    232      1.1  mrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
    233      1.1  mrg       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
    234      1.1  mrg       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
    235      1.1  mrg     }
    236      1.1  mrg 
    237      1.1  mrg   abase = a->base_addr;
    238      1.1  mrg   bbase = b->base_addr;
    239      1.1  mrg   dest = retarray->base_addr;
    240      1.1  mrg 
    241      1.1  mrg   /* Now that everything is set up, we perform the multiplication
    242      1.1  mrg      itself.  */
    243      1.1  mrg 
    244      1.1  mrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
    245      1.1  mrg #define min(a,b) ((a) <= (b) ? (a) : (b))
    246      1.1  mrg #define max(a,b) ((a) >= (b) ? (a) : (b))
    247      1.1  mrg 
    248      1.1  mrg   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
    249      1.1  mrg       && (bxstride == 1 || bystride == 1)
    250      1.1  mrg       && (((float) xcount) * ((float) ycount) * ((float) count)
    251      1.1  mrg           > POW3(blas_limit)))
    252      1.1  mrg     {
    253      1.1  mrg       const int m = xcount, n = ycount, k = count, ldc = rystride;
    254      1.1  mrg       const GFC_COMPLEX_16 one = 1, zero = 0;
    255      1.1  mrg       const int lda = (axstride == 1) ? aystride : axstride,
    256      1.1  mrg 		ldb = (bxstride == 1) ? bystride : bxstride;
    257      1.1  mrg 
    258      1.1  mrg       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
    259      1.1  mrg 	{
    260      1.1  mrg 	  assert (gemm != NULL);
    261      1.1  mrg 	  const char *transa, *transb;
    262      1.1  mrg 	  if (try_blas & 2)
    263      1.1  mrg 	    transa = "C";
    264      1.1  mrg 	  else
    265      1.1  mrg 	    transa = axstride == 1 ? "N" : "T";
    266      1.1  mrg 
    267      1.1  mrg 	  if (try_blas & 4)
    268      1.1  mrg 	    transb = "C";
    269      1.1  mrg 	  else
    270      1.1  mrg 	    transb = bxstride == 1 ? "N" : "T";
    271      1.1  mrg 
    272      1.1  mrg 	  gemm (transa, transb , &m,
    273      1.1  mrg 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
    274      1.1  mrg 		&ldc, 1, 1);
    275      1.1  mrg 	  return;
    276      1.1  mrg 	}
    277      1.1  mrg     }
    278      1.1  mrg 
    279  1.1.1.2  mrg   if (rxstride == 1 && axstride == 1 && bxstride == 1
    280  1.1.1.2  mrg       && GFC_DESCRIPTOR_RANK (b) != 1)
    281      1.1  mrg     {
    282      1.1  mrg       /* This block of code implements a tuned matmul, derived from
    283      1.1  mrg          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
    284      1.1  mrg 
    285      1.1  mrg                Bo Kagstrom and Per Ling
    286      1.1  mrg                Department of Computing Science
    287      1.1  mrg                Umea University
    288      1.1  mrg                S-901 87 Umea, Sweden
    289      1.1  mrg 
    290      1.1  mrg 	 from netlib.org, translated to C, and modified for matmul.m4.  */
    291      1.1  mrg 
    292      1.1  mrg       const GFC_COMPLEX_16 *a, *b;
    293      1.1  mrg       GFC_COMPLEX_16 *c;
    294      1.1  mrg       const index_type m = xcount, n = ycount, k = count;
    295      1.1  mrg 
    296      1.1  mrg       /* System generated locals */
    297      1.1  mrg       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
    298      1.1  mrg 		 i1, i2, i3, i4, i5, i6;
    299      1.1  mrg 
    300      1.1  mrg       /* Local variables */
    301      1.1  mrg       GFC_COMPLEX_16 f11, f12, f21, f22, f31, f32, f41, f42,
    302      1.1  mrg 		 f13, f14, f23, f24, f33, f34, f43, f44;
    303      1.1  mrg       index_type i, j, l, ii, jj, ll;
    304      1.1  mrg       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
    305      1.1  mrg       GFC_COMPLEX_16 *t1;
    306      1.1  mrg 
    307      1.1  mrg       a = abase;
    308      1.1  mrg       b = bbase;
    309      1.1  mrg       c = retarray->base_addr;
    310      1.1  mrg 
    311      1.1  mrg       /* Parameter adjustments */
    312      1.1  mrg       c_dim1 = rystride;
    313      1.1  mrg       c_offset = 1 + c_dim1;
    314      1.1  mrg       c -= c_offset;
    315      1.1  mrg       a_dim1 = aystride;
    316      1.1  mrg       a_offset = 1 + a_dim1;
    317      1.1  mrg       a -= a_offset;
    318      1.1  mrg       b_dim1 = bystride;
    319      1.1  mrg       b_offset = 1 + b_dim1;
    320      1.1  mrg       b -= b_offset;
    321      1.1  mrg 
    322      1.1  mrg       /* Empty c first.  */
    323      1.1  mrg       for (j=1; j<=n; j++)
    324      1.1  mrg 	for (i=1; i<=m; i++)
    325      1.1  mrg 	  c[i + j * c_dim1] = (GFC_COMPLEX_16)0;
    326      1.1  mrg 
    327      1.1  mrg       /* Early exit if possible */
    328      1.1  mrg       if (m == 0 || n == 0 || k == 0)
    329      1.1  mrg 	return;
    330      1.1  mrg 
    331      1.1  mrg       /* Adjust size of t1 to what is needed.  */
    332      1.1  mrg       index_type t1_dim, a_sz;
    333      1.1  mrg       if (aystride == 1)
    334      1.1  mrg         a_sz = rystride;
    335      1.1  mrg       else
    336      1.1  mrg         a_sz = a_dim1;
    337      1.1  mrg 
    338      1.1  mrg       t1_dim = a_sz * 256 + b_dim1;
    339      1.1  mrg       if (t1_dim > 65536)
    340      1.1  mrg 	t1_dim = 65536;
    341      1.1  mrg 
    342      1.1  mrg       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_16));
    343      1.1  mrg 
    344      1.1  mrg       /* Start turning the crank. */
    345      1.1  mrg       i1 = n;
    346      1.1  mrg       for (jj = 1; jj <= i1; jj += 512)
    347      1.1  mrg 	{
    348      1.1  mrg 	  /* Computing MIN */
    349      1.1  mrg 	  i2 = 512;
    350      1.1  mrg 	  i3 = n - jj + 1;
    351      1.1  mrg 	  jsec = min(i2,i3);
    352      1.1  mrg 	  ujsec = jsec - jsec % 4;
    353      1.1  mrg 	  i2 = k;
    354      1.1  mrg 	  for (ll = 1; ll <= i2; ll += 256)
    355      1.1  mrg 	    {
    356      1.1  mrg 	      /* Computing MIN */
    357      1.1  mrg 	      i3 = 256;
    358      1.1  mrg 	      i4 = k - ll + 1;
    359      1.1  mrg 	      lsec = min(i3,i4);
    360      1.1  mrg 	      ulsec = lsec - lsec % 2;
    361      1.1  mrg 
    362      1.1  mrg 	      i3 = m;
    363      1.1  mrg 	      for (ii = 1; ii <= i3; ii += 256)
    364      1.1  mrg 		{
    365      1.1  mrg 		  /* Computing MIN */
    366      1.1  mrg 		  i4 = 256;
    367      1.1  mrg 		  i5 = m - ii + 1;
    368      1.1  mrg 		  isec = min(i4,i5);
    369      1.1  mrg 		  uisec = isec - isec % 2;
    370      1.1  mrg 		  i4 = ll + ulsec - 1;
    371      1.1  mrg 		  for (l = ll; l <= i4; l += 2)
    372      1.1  mrg 		    {
    373      1.1  mrg 		      i5 = ii + uisec - 1;
    374      1.1  mrg 		      for (i = ii; i <= i5; i += 2)
    375      1.1  mrg 			{
    376      1.1  mrg 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
    377      1.1  mrg 					a[i + l * a_dim1];
    378      1.1  mrg 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
    379      1.1  mrg 					a[i + (l + 1) * a_dim1];
    380      1.1  mrg 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
    381      1.1  mrg 					a[i + 1 + l * a_dim1];
    382      1.1  mrg 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
    383      1.1  mrg 					a[i + 1 + (l + 1) * a_dim1];
    384      1.1  mrg 			}
    385      1.1  mrg 		      if (uisec < isec)
    386      1.1  mrg 			{
    387      1.1  mrg 			  t1[l - ll + 1 + (isec << 8) - 257] =
    388      1.1  mrg 				    a[ii + isec - 1 + l * a_dim1];
    389      1.1  mrg 			  t1[l - ll + 2 + (isec << 8) - 257] =
    390      1.1  mrg 				    a[ii + isec - 1 + (l + 1) * a_dim1];
    391      1.1  mrg 			}
    392      1.1  mrg 		    }
    393      1.1  mrg 		  if (ulsec < lsec)
    394      1.1  mrg 		    {
    395      1.1  mrg 		      i4 = ii + isec - 1;
    396      1.1  mrg 		      for (i = ii; i<= i4; ++i)
    397      1.1  mrg 			{
    398      1.1  mrg 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
    399      1.1  mrg 				    a[i + (ll + lsec - 1) * a_dim1];
    400      1.1  mrg 			}
    401      1.1  mrg 		    }
    402      1.1  mrg 
    403      1.1  mrg 		  uisec = isec - isec % 4;
    404      1.1  mrg 		  i4 = jj + ujsec - 1;
    405      1.1  mrg 		  for (j = jj; j <= i4; j += 4)
    406      1.1  mrg 		    {
    407      1.1  mrg 		      i5 = ii + uisec - 1;
    408      1.1  mrg 		      for (i = ii; i <= i5; i += 4)
    409      1.1  mrg 			{
    410      1.1  mrg 			  f11 = c[i + j * c_dim1];
    411      1.1  mrg 			  f21 = c[i + 1 + j * c_dim1];
    412      1.1  mrg 			  f12 = c[i + (j + 1) * c_dim1];
    413      1.1  mrg 			  f22 = c[i + 1 + (j + 1) * c_dim1];
    414      1.1  mrg 			  f13 = c[i + (j + 2) * c_dim1];
    415      1.1  mrg 			  f23 = c[i + 1 + (j + 2) * c_dim1];
    416      1.1  mrg 			  f14 = c[i + (j + 3) * c_dim1];
    417      1.1  mrg 			  f24 = c[i + 1 + (j + 3) * c_dim1];
    418      1.1  mrg 			  f31 = c[i + 2 + j * c_dim1];
    419      1.1  mrg 			  f41 = c[i + 3 + j * c_dim1];
    420      1.1  mrg 			  f32 = c[i + 2 + (j + 1) * c_dim1];
    421      1.1  mrg 			  f42 = c[i + 3 + (j + 1) * c_dim1];
    422      1.1  mrg 			  f33 = c[i + 2 + (j + 2) * c_dim1];
    423      1.1  mrg 			  f43 = c[i + 3 + (j + 2) * c_dim1];
    424      1.1  mrg 			  f34 = c[i + 2 + (j + 3) * c_dim1];
    425      1.1  mrg 			  f44 = c[i + 3 + (j + 3) * c_dim1];
    426      1.1  mrg 			  i6 = ll + lsec - 1;
    427      1.1  mrg 			  for (l = ll; l <= i6; ++l)
    428      1.1  mrg 			    {
    429      1.1  mrg 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
    430      1.1  mrg 				      * b[l + j * b_dim1];
    431      1.1  mrg 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
    432      1.1  mrg 				      * b[l + j * b_dim1];
    433      1.1  mrg 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
    434      1.1  mrg 				      * b[l + (j + 1) * b_dim1];
    435      1.1  mrg 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
    436      1.1  mrg 				      * b[l + (j + 1) * b_dim1];
    437      1.1  mrg 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
    438      1.1  mrg 				      * b[l + (j + 2) * b_dim1];
    439      1.1  mrg 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
    440      1.1  mrg 				      * b[l + (j + 2) * b_dim1];
    441      1.1  mrg 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
    442      1.1  mrg 				      * b[l + (j + 3) * b_dim1];
    443      1.1  mrg 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
    444      1.1  mrg 				      * b[l + (j + 3) * b_dim1];
    445      1.1  mrg 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
    446      1.1  mrg 				      * b[l + j * b_dim1];
    447      1.1  mrg 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
    448      1.1  mrg 				      * b[l + j * b_dim1];
    449      1.1  mrg 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
    450      1.1  mrg 				      * b[l + (j + 1) * b_dim1];
    451      1.1  mrg 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
    452      1.1  mrg 				      * b[l + (j + 1) * b_dim1];
    453      1.1  mrg 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
    454      1.1  mrg 				      * b[l + (j + 2) * b_dim1];
    455      1.1  mrg 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
    456      1.1  mrg 				      * b[l + (j + 2) * b_dim1];
    457      1.1  mrg 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
    458      1.1  mrg 				      * b[l + (j + 3) * b_dim1];
    459      1.1  mrg 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
    460      1.1  mrg 				      * b[l + (j + 3) * b_dim1];
    461      1.1  mrg 			    }
    462      1.1  mrg 			  c[i + j * c_dim1] = f11;
    463      1.1  mrg 			  c[i + 1 + j * c_dim1] = f21;
    464      1.1  mrg 			  c[i + (j + 1) * c_dim1] = f12;
    465      1.1  mrg 			  c[i + 1 + (j + 1) * c_dim1] = f22;
    466      1.1  mrg 			  c[i + (j + 2) * c_dim1] = f13;
    467      1.1  mrg 			  c[i + 1 + (j + 2) * c_dim1] = f23;
    468      1.1  mrg 			  c[i + (j + 3) * c_dim1] = f14;
    469      1.1  mrg 			  c[i + 1 + (j + 3) * c_dim1] = f24;
    470      1.1  mrg 			  c[i + 2 + j * c_dim1] = f31;
    471      1.1  mrg 			  c[i + 3 + j * c_dim1] = f41;
    472      1.1  mrg 			  c[i + 2 + (j + 1) * c_dim1] = f32;
    473      1.1  mrg 			  c[i + 3 + (j + 1) * c_dim1] = f42;
    474      1.1  mrg 			  c[i + 2 + (j + 2) * c_dim1] = f33;
    475      1.1  mrg 			  c[i + 3 + (j + 2) * c_dim1] = f43;
    476      1.1  mrg 			  c[i + 2 + (j + 3) * c_dim1] = f34;
    477      1.1  mrg 			  c[i + 3 + (j + 3) * c_dim1] = f44;
    478      1.1  mrg 			}
    479      1.1  mrg 		      if (uisec < isec)
    480      1.1  mrg 			{
    481      1.1  mrg 			  i5 = ii + isec - 1;
    482      1.1  mrg 			  for (i = ii + uisec; i <= i5; ++i)
    483      1.1  mrg 			    {
    484      1.1  mrg 			      f11 = c[i + j * c_dim1];
    485      1.1  mrg 			      f12 = c[i + (j + 1) * c_dim1];
    486      1.1  mrg 			      f13 = c[i + (j + 2) * c_dim1];
    487      1.1  mrg 			      f14 = c[i + (j + 3) * c_dim1];
    488      1.1  mrg 			      i6 = ll + lsec - 1;
    489      1.1  mrg 			      for (l = ll; l <= i6; ++l)
    490      1.1  mrg 				{
    491      1.1  mrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
    492      1.1  mrg 					  257] * b[l + j * b_dim1];
    493      1.1  mrg 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
    494      1.1  mrg 					  257] * b[l + (j + 1) * b_dim1];
    495      1.1  mrg 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
    496      1.1  mrg 					  257] * b[l + (j + 2) * b_dim1];
    497      1.1  mrg 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
    498      1.1  mrg 					  257] * b[l + (j + 3) * b_dim1];
    499      1.1  mrg 				}
    500      1.1  mrg 			      c[i + j * c_dim1] = f11;
    501      1.1  mrg 			      c[i + (j + 1) * c_dim1] = f12;
    502      1.1  mrg 			      c[i + (j + 2) * c_dim1] = f13;
    503      1.1  mrg 			      c[i + (j + 3) * c_dim1] = f14;
    504      1.1  mrg 			    }
    505      1.1  mrg 			}
    506      1.1  mrg 		    }
    507      1.1  mrg 		  if (ujsec < jsec)
    508      1.1  mrg 		    {
    509      1.1  mrg 		      i4 = jj + jsec - 1;
    510      1.1  mrg 		      for (j = jj + ujsec; j <= i4; ++j)
    511      1.1  mrg 			{
    512      1.1  mrg 			  i5 = ii + uisec - 1;
    513      1.1  mrg 			  for (i = ii; i <= i5; i += 4)
    514      1.1  mrg 			    {
    515      1.1  mrg 			      f11 = c[i + j * c_dim1];
    516      1.1  mrg 			      f21 = c[i + 1 + j * c_dim1];
    517      1.1  mrg 			      f31 = c[i + 2 + j * c_dim1];
    518      1.1  mrg 			      f41 = c[i + 3 + j * c_dim1];
    519      1.1  mrg 			      i6 = ll + lsec - 1;
    520      1.1  mrg 			      for (l = ll; l <= i6; ++l)
    521      1.1  mrg 				{
    522      1.1  mrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
    523      1.1  mrg 					  257] * b[l + j * b_dim1];
    524      1.1  mrg 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
    525      1.1  mrg 					  257] * b[l + j * b_dim1];
    526      1.1  mrg 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
    527      1.1  mrg 					  257] * b[l + j * b_dim1];
    528      1.1  mrg 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
    529      1.1  mrg 					  257] * b[l + j * b_dim1];
    530      1.1  mrg 				}
    531      1.1  mrg 			      c[i + j * c_dim1] = f11;
    532      1.1  mrg 			      c[i + 1 + j * c_dim1] = f21;
    533      1.1  mrg 			      c[i + 2 + j * c_dim1] = f31;
    534      1.1  mrg 			      c[i + 3 + j * c_dim1] = f41;
    535      1.1  mrg 			    }
    536      1.1  mrg 			  i5 = ii + isec - 1;
    537      1.1  mrg 			  for (i = ii + uisec; i <= i5; ++i)
    538      1.1  mrg 			    {
    539      1.1  mrg 			      f11 = c[i + j * c_dim1];
    540      1.1  mrg 			      i6 = ll + lsec - 1;
    541      1.1  mrg 			      for (l = ll; l <= i6; ++l)
    542      1.1  mrg 				{
    543      1.1  mrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
    544      1.1  mrg 					  257] * b[l + j * b_dim1];
    545      1.1  mrg 				}
    546      1.1  mrg 			      c[i + j * c_dim1] = f11;
    547      1.1  mrg 			    }
    548      1.1  mrg 			}
    549      1.1  mrg 		    }
    550      1.1  mrg 		}
    551      1.1  mrg 	    }
    552      1.1  mrg 	}
    553      1.1  mrg       free(t1);
    554      1.1  mrg       return;
    555      1.1  mrg     }
    556      1.1  mrg   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
    557      1.1  mrg     {
    558      1.1  mrg       if (GFC_DESCRIPTOR_RANK (a) != 1)
    559      1.1  mrg 	{
    560      1.1  mrg 	  const GFC_COMPLEX_16 *restrict abase_x;
    561      1.1  mrg 	  const GFC_COMPLEX_16 *restrict bbase_y;
    562      1.1  mrg 	  GFC_COMPLEX_16 *restrict dest_y;
    563      1.1  mrg 	  GFC_COMPLEX_16 s;
    564      1.1  mrg 
    565      1.1  mrg 	  for (y = 0; y < ycount; y++)
    566      1.1  mrg 	    {
    567      1.1  mrg 	      bbase_y = &bbase[y*bystride];
    568      1.1  mrg 	      dest_y = &dest[y*rystride];
    569      1.1  mrg 	      for (x = 0; x < xcount; x++)
    570      1.1  mrg 		{
    571      1.1  mrg 		  abase_x = &abase[x*axstride];
    572      1.1  mrg 		  s = (GFC_COMPLEX_16) 0;
    573      1.1  mrg 		  for (n = 0; n < count; n++)
    574      1.1  mrg 		    s += abase_x[n] * bbase_y[n];
    575      1.1  mrg 		  dest_y[x] = s;
    576      1.1  mrg 		}
    577      1.1  mrg 	    }
    578      1.1  mrg 	}
    579      1.1  mrg       else
    580      1.1  mrg 	{
    581      1.1  mrg 	  const GFC_COMPLEX_16 *restrict bbase_y;
    582      1.1  mrg 	  GFC_COMPLEX_16 s;
    583      1.1  mrg 
    584      1.1  mrg 	  for (y = 0; y < ycount; y++)
    585      1.1  mrg 	    {
    586      1.1  mrg 	      bbase_y = &bbase[y*bystride];
    587      1.1  mrg 	      s = (GFC_COMPLEX_16) 0;
    588      1.1  mrg 	      for (n = 0; n < count; n++)
    589      1.1  mrg 		s += abase[n*axstride] * bbase_y[n];
    590      1.1  mrg 	      dest[y*rystride] = s;
    591      1.1  mrg 	    }
    592      1.1  mrg 	}
    593      1.1  mrg     }
    594      1.1  mrg   else if (GFC_DESCRIPTOR_RANK (a) == 1)
    595      1.1  mrg     {
    596      1.1  mrg       const GFC_COMPLEX_16 *restrict bbase_y;
    597      1.1  mrg       GFC_COMPLEX_16 s;
    598      1.1  mrg 
    599      1.1  mrg       for (y = 0; y < ycount; y++)
    600      1.1  mrg 	{
    601      1.1  mrg 	  bbase_y = &bbase[y*bystride];
    602      1.1  mrg 	  s = (GFC_COMPLEX_16) 0;
    603      1.1  mrg 	  for (n = 0; n < count; n++)
    604      1.1  mrg 	    s += abase[n*axstride] * bbase_y[n*bxstride];
    605      1.1  mrg 	  dest[y*rxstride] = s;
    606      1.1  mrg 	}
    607      1.1  mrg     }
    608  1.1.1.2  mrg   else if (axstride < aystride)
    609  1.1.1.2  mrg     {
    610  1.1.1.2  mrg       for (y = 0; y < ycount; y++)
    611  1.1.1.2  mrg 	for (x = 0; x < xcount; x++)
    612  1.1.1.2  mrg 	  dest[x*rxstride + y*rystride] = (GFC_COMPLEX_16)0;
    613  1.1.1.2  mrg 
    614  1.1.1.2  mrg       for (y = 0; y < ycount; y++)
    615  1.1.1.2  mrg 	for (n = 0; n < count; n++)
    616  1.1.1.2  mrg 	  for (x = 0; x < xcount; x++)
    617  1.1.1.2  mrg 	    /* dest[x,y] += a[x,n] * b[n,y] */
    618  1.1.1.2  mrg 	    dest[x*rxstride + y*rystride] +=
    619  1.1.1.2  mrg 					abase[x*axstride + n*aystride] *
    620  1.1.1.2  mrg 					bbase[n*bxstride + y*bystride];
    621  1.1.1.2  mrg     }
    622      1.1  mrg   else
    623      1.1  mrg     {
    624      1.1  mrg       const GFC_COMPLEX_16 *restrict abase_x;
    625      1.1  mrg       const GFC_COMPLEX_16 *restrict bbase_y;
    626      1.1  mrg       GFC_COMPLEX_16 *restrict dest_y;
    627      1.1  mrg       GFC_COMPLEX_16 s;
    628      1.1  mrg 
    629      1.1  mrg       for (y = 0; y < ycount; y++)
    630      1.1  mrg 	{
    631      1.1  mrg 	  bbase_y = &bbase[y*bystride];
    632      1.1  mrg 	  dest_y = &dest[y*rystride];
    633      1.1  mrg 	  for (x = 0; x < xcount; x++)
    634      1.1  mrg 	    {
    635      1.1  mrg 	      abase_x = &abase[x*axstride];
    636      1.1  mrg 	      s = (GFC_COMPLEX_16) 0;
    637      1.1  mrg 	      for (n = 0; n < count; n++)
    638      1.1  mrg 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
    639      1.1  mrg 	      dest_y[x*rxstride] = s;
    640      1.1  mrg 	    }
    641      1.1  mrg 	}
    642      1.1  mrg     }
    643      1.1  mrg }
    644      1.1  mrg #undef POW3
    645      1.1  mrg #undef min
    646      1.1  mrg #undef max
    647      1.1  mrg 
    648      1.1  mrg #endif /* HAVE_AVX */
    649      1.1  mrg 
    650      1.1  mrg #ifdef HAVE_AVX2
    651      1.1  mrg static void
    652      1.1  mrg matmul_c16_avx2 (gfc_array_c16 * const restrict retarray,
    653      1.1  mrg 	gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
    654      1.1  mrg 	int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
    655      1.1  mrg static void
    656      1.1  mrg matmul_c16_avx2 (gfc_array_c16 * const restrict retarray,
    657      1.1  mrg 	gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
    658      1.1  mrg 	int blas_limit, blas_call gemm)
    659      1.1  mrg {
    660      1.1  mrg   const GFC_COMPLEX_16 * restrict abase;
    661      1.1  mrg   const GFC_COMPLEX_16 * restrict bbase;
    662      1.1  mrg   GFC_COMPLEX_16 * restrict dest;
    663      1.1  mrg 
    664      1.1  mrg   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
    665      1.1  mrg   index_type x, y, n, count, xcount, ycount;
    666      1.1  mrg 
    667      1.1  mrg   assert (GFC_DESCRIPTOR_RANK (a) == 2
    668      1.1  mrg           || GFC_DESCRIPTOR_RANK (b) == 2);
    669      1.1  mrg 
    670      1.1  mrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
    671      1.1  mrg 
    672      1.1  mrg    Either A or B (but not both) can be rank 1:
    673      1.1  mrg 
    674      1.1  mrg    o One-dimensional argument A is implicitly treated as a row matrix
    675      1.1  mrg      dimensioned [1,count], so xcount=1.
    676      1.1  mrg 
    677      1.1  mrg    o One-dimensional argument B is implicitly treated as a column matrix
    678      1.1  mrg      dimensioned [count, 1], so ycount=1.
    679      1.1  mrg */
    680      1.1  mrg 
    681      1.1  mrg   if (retarray->base_addr == NULL)
    682      1.1  mrg     {
    683      1.1  mrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
    684      1.1  mrg         {
    685      1.1  mrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
    686      1.1  mrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
    687      1.1  mrg         }
    688      1.1  mrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
    689      1.1  mrg         {
    690      1.1  mrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
    691      1.1  mrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
    692      1.1  mrg         }
    693      1.1  mrg       else
    694      1.1  mrg         {
    695      1.1  mrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
    696      1.1  mrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
    697      1.1  mrg 
    698      1.1  mrg           GFC_DIMENSION_SET(retarray->dim[1], 0,
    699      1.1  mrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
    700      1.1  mrg 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
    701      1.1  mrg         }
    702      1.1  mrg 
    703      1.1  mrg       retarray->base_addr
    704      1.1  mrg 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_16));
    705      1.1  mrg       retarray->offset = 0;
    706      1.1  mrg     }
    707      1.1  mrg   else if (unlikely (compile_options.bounds_check))
    708      1.1  mrg     {
    709      1.1  mrg       index_type ret_extent, arg_extent;
    710      1.1  mrg 
    711      1.1  mrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
    712      1.1  mrg 	{
    713      1.1  mrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
    714      1.1  mrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
    715      1.1  mrg 	  if (arg_extent != ret_extent)
    716      1.1  mrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
    717      1.1  mrg 	    		   "array (%ld/%ld) ",
    718      1.1  mrg 			   (long int) ret_extent, (long int) arg_extent);
    719      1.1  mrg 	}
    720      1.1  mrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
    721      1.1  mrg 	{
    722      1.1  mrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
    723      1.1  mrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
    724      1.1  mrg 	  if (arg_extent != ret_extent)
    725      1.1  mrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
    726      1.1  mrg 	    		   "array (%ld/%ld) ",
    727      1.1  mrg 			   (long int) ret_extent, (long int) arg_extent);
    728      1.1  mrg 	}
    729      1.1  mrg       else
    730      1.1  mrg 	{
    731      1.1  mrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
    732      1.1  mrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
    733      1.1  mrg 	  if (arg_extent != ret_extent)
    734      1.1  mrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
    735      1.1  mrg 	    		   "array (%ld/%ld) ",
    736      1.1  mrg 			   (long int) ret_extent, (long int) arg_extent);
    737      1.1  mrg 
    738      1.1  mrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
    739      1.1  mrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
    740      1.1  mrg 	  if (arg_extent != ret_extent)
    741      1.1  mrg 	    runtime_error ("Array bound mismatch for dimension 2 of "
    742      1.1  mrg 	    		   "array (%ld/%ld) ",
    743      1.1  mrg 			   (long int) ret_extent, (long int) arg_extent);
    744      1.1  mrg 	}
    745      1.1  mrg     }
    746      1.1  mrg 
    747      1.1  mrg 
    748      1.1  mrg   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
    749      1.1  mrg     {
    750      1.1  mrg       /* One-dimensional result may be addressed in the code below
    751      1.1  mrg 	 either as a row or a column matrix. We want both cases to
    752      1.1  mrg 	 work. */
    753      1.1  mrg       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
    754      1.1  mrg     }
    755      1.1  mrg   else
    756      1.1  mrg     {
    757      1.1  mrg       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
    758      1.1  mrg       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
    759      1.1  mrg     }
    760      1.1  mrg 
    761      1.1  mrg 
    762      1.1  mrg   if (GFC_DESCRIPTOR_RANK (a) == 1)
    763      1.1  mrg     {
    764      1.1  mrg       /* Treat it as a a row matrix A[1,count]. */
    765      1.1  mrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
    766      1.1  mrg       aystride = 1;
    767      1.1  mrg 
    768      1.1  mrg       xcount = 1;
    769      1.1  mrg       count = GFC_DESCRIPTOR_EXTENT(a,0);
    770      1.1  mrg     }
    771      1.1  mrg   else
    772      1.1  mrg     {
    773      1.1  mrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
    774      1.1  mrg       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
    775      1.1  mrg 
    776      1.1  mrg       count = GFC_DESCRIPTOR_EXTENT(a,1);
    777      1.1  mrg       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
    778      1.1  mrg     }
    779      1.1  mrg 
    780      1.1  mrg   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
    781      1.1  mrg     {
    782      1.1  mrg       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
    783      1.1  mrg 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
    784      1.1  mrg 		       "in dimension 1: is %ld, should be %ld",
    785      1.1  mrg 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
    786      1.1  mrg     }
    787      1.1  mrg 
    788      1.1  mrg   if (GFC_DESCRIPTOR_RANK (b) == 1)
    789      1.1  mrg     {
    790      1.1  mrg       /* Treat it as a column matrix B[count,1] */
    791      1.1  mrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
    792      1.1  mrg 
    793      1.1  mrg       /* bystride should never be used for 1-dimensional b.
    794      1.1  mrg          The value is only used for calculation of the
    795      1.1  mrg          memory by the buffer.  */
    796      1.1  mrg       bystride = 256;
    797      1.1  mrg       ycount = 1;
    798      1.1  mrg     }
    799      1.1  mrg   else
    800      1.1  mrg     {
    801      1.1  mrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
    802      1.1  mrg       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
    803      1.1  mrg       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
    804      1.1  mrg     }
    805      1.1  mrg 
    806      1.1  mrg   abase = a->base_addr;
    807      1.1  mrg   bbase = b->base_addr;
    808      1.1  mrg   dest = retarray->base_addr;
    809      1.1  mrg 
    810      1.1  mrg   /* Now that everything is set up, we perform the multiplication
    811      1.1  mrg      itself.  */
    812      1.1  mrg 
    813      1.1  mrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
    814      1.1  mrg #define min(a,b) ((a) <= (b) ? (a) : (b))
    815      1.1  mrg #define max(a,b) ((a) >= (b) ? (a) : (b))
    816      1.1  mrg 
    817      1.1  mrg   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
    818      1.1  mrg       && (bxstride == 1 || bystride == 1)
    819      1.1  mrg       && (((float) xcount) * ((float) ycount) * ((float) count)
    820      1.1  mrg           > POW3(blas_limit)))
    821      1.1  mrg     {
    822      1.1  mrg       const int m = xcount, n = ycount, k = count, ldc = rystride;
    823      1.1  mrg       const GFC_COMPLEX_16 one = 1, zero = 0;
    824      1.1  mrg       const int lda = (axstride == 1) ? aystride : axstride,
    825      1.1  mrg 		ldb = (bxstride == 1) ? bystride : bxstride;
    826      1.1  mrg 
    827      1.1  mrg       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
    828      1.1  mrg 	{
    829      1.1  mrg 	  assert (gemm != NULL);
    830      1.1  mrg 	  const char *transa, *transb;
    831      1.1  mrg 	  if (try_blas & 2)
    832      1.1  mrg 	    transa = "C";
    833      1.1  mrg 	  else
    834      1.1  mrg 	    transa = axstride == 1 ? "N" : "T";
    835      1.1  mrg 
    836      1.1  mrg 	  if (try_blas & 4)
    837      1.1  mrg 	    transb = "C";
    838      1.1  mrg 	  else
    839      1.1  mrg 	    transb = bxstride == 1 ? "N" : "T";
    840      1.1  mrg 
    841      1.1  mrg 	  gemm (transa, transb , &m,
    842      1.1  mrg 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
    843      1.1  mrg 		&ldc, 1, 1);
    844      1.1  mrg 	  return;
    845      1.1  mrg 	}
    846      1.1  mrg     }
    847      1.1  mrg 
    848  1.1.1.2  mrg   if (rxstride == 1 && axstride == 1 && bxstride == 1
    849  1.1.1.2  mrg       && GFC_DESCRIPTOR_RANK (b) != 1)
    850      1.1  mrg     {
    851      1.1  mrg       /* This block of code implements a tuned matmul, derived from
    852      1.1  mrg          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
    853      1.1  mrg 
    854      1.1  mrg                Bo Kagstrom and Per Ling
    855      1.1  mrg                Department of Computing Science
    856      1.1  mrg                Umea University
    857      1.1  mrg                S-901 87 Umea, Sweden
    858      1.1  mrg 
    859      1.1  mrg 	 from netlib.org, translated to C, and modified for matmul.m4.  */
    860      1.1  mrg 
    861      1.1  mrg       const GFC_COMPLEX_16 *a, *b;
    862      1.1  mrg       GFC_COMPLEX_16 *c;
    863      1.1  mrg       const index_type m = xcount, n = ycount, k = count;
    864      1.1  mrg 
    865      1.1  mrg       /* System generated locals */
    866      1.1  mrg       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
    867      1.1  mrg 		 i1, i2, i3, i4, i5, i6;
    868      1.1  mrg 
    869      1.1  mrg       /* Local variables */
    870      1.1  mrg       GFC_COMPLEX_16 f11, f12, f21, f22, f31, f32, f41, f42,
    871      1.1  mrg 		 f13, f14, f23, f24, f33, f34, f43, f44;
    872      1.1  mrg       index_type i, j, l, ii, jj, ll;
    873      1.1  mrg       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
    874      1.1  mrg       GFC_COMPLEX_16 *t1;
    875      1.1  mrg 
    876      1.1  mrg       a = abase;
    877      1.1  mrg       b = bbase;
    878      1.1  mrg       c = retarray->base_addr;
    879      1.1  mrg 
    880      1.1  mrg       /* Parameter adjustments */
    881      1.1  mrg       c_dim1 = rystride;
    882      1.1  mrg       c_offset = 1 + c_dim1;
    883      1.1  mrg       c -= c_offset;
    884      1.1  mrg       a_dim1 = aystride;
    885      1.1  mrg       a_offset = 1 + a_dim1;
    886      1.1  mrg       a -= a_offset;
    887      1.1  mrg       b_dim1 = bystride;
    888      1.1  mrg       b_offset = 1 + b_dim1;
    889      1.1  mrg       b -= b_offset;
    890      1.1  mrg 
    891      1.1  mrg       /* Empty c first.  */
    892      1.1  mrg       for (j=1; j<=n; j++)
    893      1.1  mrg 	for (i=1; i<=m; i++)
    894      1.1  mrg 	  c[i + j * c_dim1] = (GFC_COMPLEX_16)0;
    895      1.1  mrg 
    896      1.1  mrg       /* Early exit if possible */
    897      1.1  mrg       if (m == 0 || n == 0 || k == 0)
    898      1.1  mrg 	return;
    899      1.1  mrg 
    900      1.1  mrg       /* Adjust size of t1 to what is needed.  */
    901      1.1  mrg       index_type t1_dim, a_sz;
    902      1.1  mrg       if (aystride == 1)
    903      1.1  mrg         a_sz = rystride;
    904      1.1  mrg       else
    905      1.1  mrg         a_sz = a_dim1;
    906      1.1  mrg 
    907      1.1  mrg       t1_dim = a_sz * 256 + b_dim1;
    908      1.1  mrg       if (t1_dim > 65536)
    909      1.1  mrg 	t1_dim = 65536;
    910      1.1  mrg 
    911      1.1  mrg       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_16));
    912      1.1  mrg 
    913      1.1  mrg       /* Start turning the crank. */
    914      1.1  mrg       i1 = n;
    915      1.1  mrg       for (jj = 1; jj <= i1; jj += 512)
    916      1.1  mrg 	{
    917      1.1  mrg 	  /* Computing MIN */
    918      1.1  mrg 	  i2 = 512;
    919      1.1  mrg 	  i3 = n - jj + 1;
    920      1.1  mrg 	  jsec = min(i2,i3);
    921      1.1  mrg 	  ujsec = jsec - jsec % 4;
    922      1.1  mrg 	  i2 = k;
    923      1.1  mrg 	  for (ll = 1; ll <= i2; ll += 256)
    924      1.1  mrg 	    {
    925      1.1  mrg 	      /* Computing MIN */
    926      1.1  mrg 	      i3 = 256;
    927      1.1  mrg 	      i4 = k - ll + 1;
    928      1.1  mrg 	      lsec = min(i3,i4);
    929      1.1  mrg 	      ulsec = lsec - lsec % 2;
    930      1.1  mrg 
    931      1.1  mrg 	      i3 = m;
    932      1.1  mrg 	      for (ii = 1; ii <= i3; ii += 256)
    933      1.1  mrg 		{
    934      1.1  mrg 		  /* Computing MIN */
    935      1.1  mrg 		  i4 = 256;
    936      1.1  mrg 		  i5 = m - ii + 1;
    937      1.1  mrg 		  isec = min(i4,i5);
    938      1.1  mrg 		  uisec = isec - isec % 2;
    939      1.1  mrg 		  i4 = ll + ulsec - 1;
    940      1.1  mrg 		  for (l = ll; l <= i4; l += 2)
    941      1.1  mrg 		    {
    942      1.1  mrg 		      i5 = ii + uisec - 1;
    943      1.1  mrg 		      for (i = ii; i <= i5; i += 2)
    944      1.1  mrg 			{
    945      1.1  mrg 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
    946      1.1  mrg 					a[i + l * a_dim1];
    947      1.1  mrg 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
    948      1.1  mrg 					a[i + (l + 1) * a_dim1];
    949      1.1  mrg 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
    950      1.1  mrg 					a[i + 1 + l * a_dim1];
    951      1.1  mrg 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
    952      1.1  mrg 					a[i + 1 + (l + 1) * a_dim1];
    953      1.1  mrg 			}
    954      1.1  mrg 		      if (uisec < isec)
    955      1.1  mrg 			{
    956      1.1  mrg 			  t1[l - ll + 1 + (isec << 8) - 257] =
    957      1.1  mrg 				    a[ii + isec - 1 + l * a_dim1];
    958      1.1  mrg 			  t1[l - ll + 2 + (isec << 8) - 257] =
    959      1.1  mrg 				    a[ii + isec - 1 + (l + 1) * a_dim1];
    960      1.1  mrg 			}
    961      1.1  mrg 		    }
    962      1.1  mrg 		  if (ulsec < lsec)
    963      1.1  mrg 		    {
    964      1.1  mrg 		      i4 = ii + isec - 1;
    965      1.1  mrg 		      for (i = ii; i<= i4; ++i)
    966      1.1  mrg 			{
    967      1.1  mrg 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
    968      1.1  mrg 				    a[i + (ll + lsec - 1) * a_dim1];
    969      1.1  mrg 			}
    970      1.1  mrg 		    }
    971      1.1  mrg 
    972      1.1  mrg 		  uisec = isec - isec % 4;
    973      1.1  mrg 		  i4 = jj + ujsec - 1;
    974      1.1  mrg 		  for (j = jj; j <= i4; j += 4)
    975      1.1  mrg 		    {
    976      1.1  mrg 		      i5 = ii + uisec - 1;
    977      1.1  mrg 		      for (i = ii; i <= i5; i += 4)
    978      1.1  mrg 			{
    979      1.1  mrg 			  f11 = c[i + j * c_dim1];
    980      1.1  mrg 			  f21 = c[i + 1 + j * c_dim1];
    981      1.1  mrg 			  f12 = c[i + (j + 1) * c_dim1];
    982      1.1  mrg 			  f22 = c[i + 1 + (j + 1) * c_dim1];
    983      1.1  mrg 			  f13 = c[i + (j + 2) * c_dim1];
    984      1.1  mrg 			  f23 = c[i + 1 + (j + 2) * c_dim1];
    985      1.1  mrg 			  f14 = c[i + (j + 3) * c_dim1];
    986      1.1  mrg 			  f24 = c[i + 1 + (j + 3) * c_dim1];
    987      1.1  mrg 			  f31 = c[i + 2 + j * c_dim1];
    988      1.1  mrg 			  f41 = c[i + 3 + j * c_dim1];
    989      1.1  mrg 			  f32 = c[i + 2 + (j + 1) * c_dim1];
    990      1.1  mrg 			  f42 = c[i + 3 + (j + 1) * c_dim1];
    991      1.1  mrg 			  f33 = c[i + 2 + (j + 2) * c_dim1];
    992      1.1  mrg 			  f43 = c[i + 3 + (j + 2) * c_dim1];
    993      1.1  mrg 			  f34 = c[i + 2 + (j + 3) * c_dim1];
    994      1.1  mrg 			  f44 = c[i + 3 + (j + 3) * c_dim1];
    995      1.1  mrg 			  i6 = ll + lsec - 1;
    996      1.1  mrg 			  for (l = ll; l <= i6; ++l)
    997      1.1  mrg 			    {
    998      1.1  mrg 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
    999      1.1  mrg 				      * b[l + j * b_dim1];
   1000      1.1  mrg 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
   1001      1.1  mrg 				      * b[l + j * b_dim1];
   1002      1.1  mrg 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
   1003      1.1  mrg 				      * b[l + (j + 1) * b_dim1];
   1004      1.1  mrg 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
   1005      1.1  mrg 				      * b[l + (j + 1) * b_dim1];
   1006      1.1  mrg 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
   1007      1.1  mrg 				      * b[l + (j + 2) * b_dim1];
   1008      1.1  mrg 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
   1009      1.1  mrg 				      * b[l + (j + 2) * b_dim1];
   1010      1.1  mrg 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
   1011      1.1  mrg 				      * b[l + (j + 3) * b_dim1];
   1012      1.1  mrg 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
   1013      1.1  mrg 				      * b[l + (j + 3) * b_dim1];
   1014      1.1  mrg 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
   1015      1.1  mrg 				      * b[l + j * b_dim1];
   1016      1.1  mrg 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
   1017      1.1  mrg 				      * b[l + j * b_dim1];
   1018      1.1  mrg 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
   1019      1.1  mrg 				      * b[l + (j + 1) * b_dim1];
   1020      1.1  mrg 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
   1021      1.1  mrg 				      * b[l + (j + 1) * b_dim1];
   1022      1.1  mrg 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
   1023      1.1  mrg 				      * b[l + (j + 2) * b_dim1];
   1024      1.1  mrg 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
   1025      1.1  mrg 				      * b[l + (j + 2) * b_dim1];
   1026      1.1  mrg 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
   1027      1.1  mrg 				      * b[l + (j + 3) * b_dim1];
   1028      1.1  mrg 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
   1029      1.1  mrg 				      * b[l + (j + 3) * b_dim1];
   1030      1.1  mrg 			    }
   1031      1.1  mrg 			  c[i + j * c_dim1] = f11;
   1032      1.1  mrg 			  c[i + 1 + j * c_dim1] = f21;
   1033      1.1  mrg 			  c[i + (j + 1) * c_dim1] = f12;
   1034      1.1  mrg 			  c[i + 1 + (j + 1) * c_dim1] = f22;
   1035      1.1  mrg 			  c[i + (j + 2) * c_dim1] = f13;
   1036      1.1  mrg 			  c[i + 1 + (j + 2) * c_dim1] = f23;
   1037      1.1  mrg 			  c[i + (j + 3) * c_dim1] = f14;
   1038      1.1  mrg 			  c[i + 1 + (j + 3) * c_dim1] = f24;
   1039      1.1  mrg 			  c[i + 2 + j * c_dim1] = f31;
   1040      1.1  mrg 			  c[i + 3 + j * c_dim1] = f41;
   1041      1.1  mrg 			  c[i + 2 + (j + 1) * c_dim1] = f32;
   1042      1.1  mrg 			  c[i + 3 + (j + 1) * c_dim1] = f42;
   1043      1.1  mrg 			  c[i + 2 + (j + 2) * c_dim1] = f33;
   1044      1.1  mrg 			  c[i + 3 + (j + 2) * c_dim1] = f43;
   1045      1.1  mrg 			  c[i + 2 + (j + 3) * c_dim1] = f34;
   1046      1.1  mrg 			  c[i + 3 + (j + 3) * c_dim1] = f44;
   1047      1.1  mrg 			}
   1048      1.1  mrg 		      if (uisec < isec)
   1049      1.1  mrg 			{
   1050      1.1  mrg 			  i5 = ii + isec - 1;
   1051      1.1  mrg 			  for (i = ii + uisec; i <= i5; ++i)
   1052      1.1  mrg 			    {
   1053      1.1  mrg 			      f11 = c[i + j * c_dim1];
   1054      1.1  mrg 			      f12 = c[i + (j + 1) * c_dim1];
   1055      1.1  mrg 			      f13 = c[i + (j + 2) * c_dim1];
   1056      1.1  mrg 			      f14 = c[i + (j + 3) * c_dim1];
   1057      1.1  mrg 			      i6 = ll + lsec - 1;
   1058      1.1  mrg 			      for (l = ll; l <= i6; ++l)
   1059      1.1  mrg 				{
   1060      1.1  mrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
   1061      1.1  mrg 					  257] * b[l + j * b_dim1];
   1062      1.1  mrg 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
   1063      1.1  mrg 					  257] * b[l + (j + 1) * b_dim1];
   1064      1.1  mrg 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
   1065      1.1  mrg 					  257] * b[l + (j + 2) * b_dim1];
   1066      1.1  mrg 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
   1067      1.1  mrg 					  257] * b[l + (j + 3) * b_dim1];
   1068      1.1  mrg 				}
   1069      1.1  mrg 			      c[i + j * c_dim1] = f11;
   1070      1.1  mrg 			      c[i + (j + 1) * c_dim1] = f12;
   1071      1.1  mrg 			      c[i + (j + 2) * c_dim1] = f13;
   1072      1.1  mrg 			      c[i + (j + 3) * c_dim1] = f14;
   1073      1.1  mrg 			    }
   1074      1.1  mrg 			}
   1075      1.1  mrg 		    }
   1076      1.1  mrg 		  if (ujsec < jsec)
   1077      1.1  mrg 		    {
   1078      1.1  mrg 		      i4 = jj + jsec - 1;
   1079      1.1  mrg 		      for (j = jj + ujsec; j <= i4; ++j)
   1080      1.1  mrg 			{
   1081      1.1  mrg 			  i5 = ii + uisec - 1;
   1082      1.1  mrg 			  for (i = ii; i <= i5; i += 4)
   1083      1.1  mrg 			    {
   1084      1.1  mrg 			      f11 = c[i + j * c_dim1];
   1085      1.1  mrg 			      f21 = c[i + 1 + j * c_dim1];
   1086      1.1  mrg 			      f31 = c[i + 2 + j * c_dim1];
   1087      1.1  mrg 			      f41 = c[i + 3 + j * c_dim1];
   1088      1.1  mrg 			      i6 = ll + lsec - 1;
   1089      1.1  mrg 			      for (l = ll; l <= i6; ++l)
   1090      1.1  mrg 				{
   1091      1.1  mrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
   1092      1.1  mrg 					  257] * b[l + j * b_dim1];
   1093      1.1  mrg 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
   1094      1.1  mrg 					  257] * b[l + j * b_dim1];
   1095      1.1  mrg 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
   1096      1.1  mrg 					  257] * b[l + j * b_dim1];
   1097      1.1  mrg 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
   1098      1.1  mrg 					  257] * b[l + j * b_dim1];
   1099      1.1  mrg 				}
   1100      1.1  mrg 			      c[i + j * c_dim1] = f11;
   1101      1.1  mrg 			      c[i + 1 + j * c_dim1] = f21;
   1102      1.1  mrg 			      c[i + 2 + j * c_dim1] = f31;
   1103      1.1  mrg 			      c[i + 3 + j * c_dim1] = f41;
   1104      1.1  mrg 			    }
   1105      1.1  mrg 			  i5 = ii + isec - 1;
   1106      1.1  mrg 			  for (i = ii + uisec; i <= i5; ++i)
   1107      1.1  mrg 			    {
   1108      1.1  mrg 			      f11 = c[i + j * c_dim1];
   1109      1.1  mrg 			      i6 = ll + lsec - 1;
   1110      1.1  mrg 			      for (l = ll; l <= i6; ++l)
   1111      1.1  mrg 				{
   1112      1.1  mrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
   1113      1.1  mrg 					  257] * b[l + j * b_dim1];
   1114      1.1  mrg 				}
   1115      1.1  mrg 			      c[i + j * c_dim1] = f11;
   1116      1.1  mrg 			    }
   1117      1.1  mrg 			}
   1118      1.1  mrg 		    }
   1119      1.1  mrg 		}
   1120      1.1  mrg 	    }
   1121      1.1  mrg 	}
   1122      1.1  mrg       free(t1);
   1123      1.1  mrg       return;
   1124      1.1  mrg     }
   1125      1.1  mrg   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
   1126      1.1  mrg     {
   1127      1.1  mrg       if (GFC_DESCRIPTOR_RANK (a) != 1)
   1128      1.1  mrg 	{
   1129      1.1  mrg 	  const GFC_COMPLEX_16 *restrict abase_x;
   1130      1.1  mrg 	  const GFC_COMPLEX_16 *restrict bbase_y;
   1131      1.1  mrg 	  GFC_COMPLEX_16 *restrict dest_y;
   1132      1.1  mrg 	  GFC_COMPLEX_16 s;
   1133      1.1  mrg 
   1134      1.1  mrg 	  for (y = 0; y < ycount; y++)
   1135      1.1  mrg 	    {
   1136      1.1  mrg 	      bbase_y = &bbase[y*bystride];
   1137      1.1  mrg 	      dest_y = &dest[y*rystride];
   1138      1.1  mrg 	      for (x = 0; x < xcount; x++)
   1139      1.1  mrg 		{
   1140      1.1  mrg 		  abase_x = &abase[x*axstride];
   1141      1.1  mrg 		  s = (GFC_COMPLEX_16) 0;
   1142      1.1  mrg 		  for (n = 0; n < count; n++)
   1143      1.1  mrg 		    s += abase_x[n] * bbase_y[n];
   1144      1.1  mrg 		  dest_y[x] = s;
   1145      1.1  mrg 		}
   1146      1.1  mrg 	    }
   1147      1.1  mrg 	}
   1148      1.1  mrg       else
   1149      1.1  mrg 	{
   1150      1.1  mrg 	  const GFC_COMPLEX_16 *restrict bbase_y;
   1151      1.1  mrg 	  GFC_COMPLEX_16 s;
   1152      1.1  mrg 
   1153      1.1  mrg 	  for (y = 0; y < ycount; y++)
   1154      1.1  mrg 	    {
   1155      1.1  mrg 	      bbase_y = &bbase[y*bystride];
   1156      1.1  mrg 	      s = (GFC_COMPLEX_16) 0;
   1157      1.1  mrg 	      for (n = 0; n < count; n++)
   1158      1.1  mrg 		s += abase[n*axstride] * bbase_y[n];
   1159      1.1  mrg 	      dest[y*rystride] = s;
   1160      1.1  mrg 	    }
   1161      1.1  mrg 	}
   1162      1.1  mrg     }
   1163      1.1  mrg   else if (GFC_DESCRIPTOR_RANK (a) == 1)
   1164      1.1  mrg     {
   1165      1.1  mrg       const GFC_COMPLEX_16 *restrict bbase_y;
   1166      1.1  mrg       GFC_COMPLEX_16 s;
   1167      1.1  mrg 
   1168      1.1  mrg       for (y = 0; y < ycount; y++)
   1169      1.1  mrg 	{
   1170      1.1  mrg 	  bbase_y = &bbase[y*bystride];
   1171      1.1  mrg 	  s = (GFC_COMPLEX_16) 0;
   1172      1.1  mrg 	  for (n = 0; n < count; n++)
   1173      1.1  mrg 	    s += abase[n*axstride] * bbase_y[n*bxstride];
   1174      1.1  mrg 	  dest[y*rxstride] = s;
   1175      1.1  mrg 	}
   1176      1.1  mrg     }
   1177  1.1.1.2  mrg   else if (axstride < aystride)
   1178  1.1.1.2  mrg     {
   1179  1.1.1.2  mrg       for (y = 0; y < ycount; y++)
   1180  1.1.1.2  mrg 	for (x = 0; x < xcount; x++)
   1181  1.1.1.2  mrg 	  dest[x*rxstride + y*rystride] = (GFC_COMPLEX_16)0;
   1182  1.1.1.2  mrg 
   1183  1.1.1.2  mrg       for (y = 0; y < ycount; y++)
   1184  1.1.1.2  mrg 	for (n = 0; n < count; n++)
   1185  1.1.1.2  mrg 	  for (x = 0; x < xcount; x++)
   1186  1.1.1.2  mrg 	    /* dest[x,y] += a[x,n] * b[n,y] */
   1187  1.1.1.2  mrg 	    dest[x*rxstride + y*rystride] +=
   1188  1.1.1.2  mrg 					abase[x*axstride + n*aystride] *
   1189  1.1.1.2  mrg 					bbase[n*bxstride + y*bystride];
   1190  1.1.1.2  mrg     }
   1191      1.1  mrg   else
   1192      1.1  mrg     {
   1193      1.1  mrg       const GFC_COMPLEX_16 *restrict abase_x;
   1194      1.1  mrg       const GFC_COMPLEX_16 *restrict bbase_y;
   1195      1.1  mrg       GFC_COMPLEX_16 *restrict dest_y;
   1196      1.1  mrg       GFC_COMPLEX_16 s;
   1197      1.1  mrg 
   1198      1.1  mrg       for (y = 0; y < ycount; y++)
   1199      1.1  mrg 	{
   1200      1.1  mrg 	  bbase_y = &bbase[y*bystride];
   1201      1.1  mrg 	  dest_y = &dest[y*rystride];
   1202      1.1  mrg 	  for (x = 0; x < xcount; x++)
   1203      1.1  mrg 	    {
   1204      1.1  mrg 	      abase_x = &abase[x*axstride];
   1205      1.1  mrg 	      s = (GFC_COMPLEX_16) 0;
   1206      1.1  mrg 	      for (n = 0; n < count; n++)
   1207      1.1  mrg 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
   1208      1.1  mrg 	      dest_y[x*rxstride] = s;
   1209      1.1  mrg 	    }
   1210      1.1  mrg 	}
   1211      1.1  mrg     }
   1212      1.1  mrg }
   1213      1.1  mrg #undef POW3
   1214      1.1  mrg #undef min
   1215      1.1  mrg #undef max
   1216      1.1  mrg 
   1217      1.1  mrg #endif /* HAVE_AVX2 */
   1218      1.1  mrg 
   1219      1.1  mrg #ifdef HAVE_AVX512F
   1220      1.1  mrg static void
   1221      1.1  mrg matmul_c16_avx512f (gfc_array_c16 * const restrict retarray,
   1222      1.1  mrg 	gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
   1223      1.1  mrg 	int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
   1224      1.1  mrg static void
   1225      1.1  mrg matmul_c16_avx512f (gfc_array_c16 * const restrict retarray,
   1226      1.1  mrg 	gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
   1227      1.1  mrg 	int blas_limit, blas_call gemm)
   1228      1.1  mrg {
   1229      1.1  mrg   const GFC_COMPLEX_16 * restrict abase;
   1230      1.1  mrg   const GFC_COMPLEX_16 * restrict bbase;
   1231      1.1  mrg   GFC_COMPLEX_16 * restrict dest;
   1232      1.1  mrg 
   1233      1.1  mrg   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
   1234      1.1  mrg   index_type x, y, n, count, xcount, ycount;
   1235      1.1  mrg 
   1236      1.1  mrg   assert (GFC_DESCRIPTOR_RANK (a) == 2
   1237      1.1  mrg           || GFC_DESCRIPTOR_RANK (b) == 2);
   1238      1.1  mrg 
   1239      1.1  mrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
   1240      1.1  mrg 
   1241      1.1  mrg    Either A or B (but not both) can be rank 1:
   1242      1.1  mrg 
   1243      1.1  mrg    o One-dimensional argument A is implicitly treated as a row matrix
   1244      1.1  mrg      dimensioned [1,count], so xcount=1.
   1245      1.1  mrg 
   1246      1.1  mrg    o One-dimensional argument B is implicitly treated as a column matrix
   1247      1.1  mrg      dimensioned [count, 1], so ycount=1.
   1248      1.1  mrg */
   1249      1.1  mrg 
   1250      1.1  mrg   if (retarray->base_addr == NULL)
   1251      1.1  mrg     {
   1252      1.1  mrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
   1253      1.1  mrg         {
   1254      1.1  mrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
   1255      1.1  mrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
   1256      1.1  mrg         }
   1257      1.1  mrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
   1258      1.1  mrg         {
   1259      1.1  mrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
   1260      1.1  mrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
   1261      1.1  mrg         }
   1262      1.1  mrg       else
   1263      1.1  mrg         {
   1264      1.1  mrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
   1265      1.1  mrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
   1266      1.1  mrg 
   1267      1.1  mrg           GFC_DIMENSION_SET(retarray->dim[1], 0,
   1268      1.1  mrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
   1269      1.1  mrg 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
   1270      1.1  mrg         }
   1271      1.1  mrg 
   1272      1.1  mrg       retarray->base_addr
   1273      1.1  mrg 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_16));
   1274      1.1  mrg       retarray->offset = 0;
   1275      1.1  mrg     }
   1276      1.1  mrg   else if (unlikely (compile_options.bounds_check))
   1277      1.1  mrg     {
   1278      1.1  mrg       index_type ret_extent, arg_extent;
   1279      1.1  mrg 
   1280      1.1  mrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
   1281      1.1  mrg 	{
   1282      1.1  mrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
   1283      1.1  mrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
   1284      1.1  mrg 	  if (arg_extent != ret_extent)
   1285      1.1  mrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
   1286      1.1  mrg 	    		   "array (%ld/%ld) ",
   1287      1.1  mrg 			   (long int) ret_extent, (long int) arg_extent);
   1288      1.1  mrg 	}
   1289      1.1  mrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
   1290      1.1  mrg 	{
   1291      1.1  mrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
   1292      1.1  mrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
   1293      1.1  mrg 	  if (arg_extent != ret_extent)
   1294      1.1  mrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
   1295      1.1  mrg 	    		   "array (%ld/%ld) ",
   1296      1.1  mrg 			   (long int) ret_extent, (long int) arg_extent);
   1297      1.1  mrg 	}
   1298      1.1  mrg       else
   1299      1.1  mrg 	{
   1300      1.1  mrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
   1301      1.1  mrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
   1302      1.1  mrg 	  if (arg_extent != ret_extent)
   1303      1.1  mrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
   1304      1.1  mrg 	    		   "array (%ld/%ld) ",
   1305      1.1  mrg 			   (long int) ret_extent, (long int) arg_extent);
   1306      1.1  mrg 
   1307      1.1  mrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
   1308      1.1  mrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
   1309      1.1  mrg 	  if (arg_extent != ret_extent)
   1310      1.1  mrg 	    runtime_error ("Array bound mismatch for dimension 2 of "
   1311      1.1  mrg 	    		   "array (%ld/%ld) ",
   1312      1.1  mrg 			   (long int) ret_extent, (long int) arg_extent);
   1313      1.1  mrg 	}
   1314      1.1  mrg     }
   1315      1.1  mrg 
   1316      1.1  mrg 
   1317      1.1  mrg   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
   1318      1.1  mrg     {
   1319      1.1  mrg       /* One-dimensional result may be addressed in the code below
   1320      1.1  mrg 	 either as a row or a column matrix. We want both cases to
   1321      1.1  mrg 	 work. */
   1322      1.1  mrg       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
   1323      1.1  mrg     }
   1324      1.1  mrg   else
   1325      1.1  mrg     {
   1326      1.1  mrg       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
   1327      1.1  mrg       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
   1328      1.1  mrg     }
   1329      1.1  mrg 
   1330      1.1  mrg 
   1331      1.1  mrg   if (GFC_DESCRIPTOR_RANK (a) == 1)
   1332      1.1  mrg     {
   1333      1.1  mrg       /* Treat it as a a row matrix A[1,count]. */
   1334      1.1  mrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
   1335      1.1  mrg       aystride = 1;
   1336      1.1  mrg 
   1337      1.1  mrg       xcount = 1;
   1338      1.1  mrg       count = GFC_DESCRIPTOR_EXTENT(a,0);
   1339      1.1  mrg     }
   1340      1.1  mrg   else
   1341      1.1  mrg     {
   1342      1.1  mrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
   1343      1.1  mrg       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
   1344      1.1  mrg 
   1345      1.1  mrg       count = GFC_DESCRIPTOR_EXTENT(a,1);
   1346      1.1  mrg       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
   1347      1.1  mrg     }
   1348      1.1  mrg 
   1349      1.1  mrg   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
   1350      1.1  mrg     {
   1351      1.1  mrg       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
   1352      1.1  mrg 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
   1353      1.1  mrg 		       "in dimension 1: is %ld, should be %ld",
   1354      1.1  mrg 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
   1355      1.1  mrg     }
   1356      1.1  mrg 
   1357      1.1  mrg   if (GFC_DESCRIPTOR_RANK (b) == 1)
   1358      1.1  mrg     {
   1359      1.1  mrg       /* Treat it as a column matrix B[count,1] */
   1360      1.1  mrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
   1361      1.1  mrg 
   1362      1.1  mrg       /* bystride should never be used for 1-dimensional b.
   1363      1.1  mrg          The value is only used for calculation of the
   1364      1.1  mrg          memory by the buffer.  */
   1365      1.1  mrg       bystride = 256;
   1366      1.1  mrg       ycount = 1;
   1367      1.1  mrg     }
   1368      1.1  mrg   else
   1369      1.1  mrg     {
   1370      1.1  mrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
   1371      1.1  mrg       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
   1372      1.1  mrg       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
   1373      1.1  mrg     }
   1374      1.1  mrg 
   1375      1.1  mrg   abase = a->base_addr;
   1376      1.1  mrg   bbase = b->base_addr;
   1377      1.1  mrg   dest = retarray->base_addr;
   1378      1.1  mrg 
   1379      1.1  mrg   /* Now that everything is set up, we perform the multiplication
   1380      1.1  mrg      itself.  */
   1381      1.1  mrg 
   1382      1.1  mrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
   1383      1.1  mrg #define min(a,b) ((a) <= (b) ? (a) : (b))
   1384      1.1  mrg #define max(a,b) ((a) >= (b) ? (a) : (b))
   1385      1.1  mrg 
   1386      1.1  mrg   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
   1387      1.1  mrg       && (bxstride == 1 || bystride == 1)
   1388      1.1  mrg       && (((float) xcount) * ((float) ycount) * ((float) count)
   1389      1.1  mrg           > POW3(blas_limit)))
   1390      1.1  mrg     {
   1391      1.1  mrg       const int m = xcount, n = ycount, k = count, ldc = rystride;
   1392      1.1  mrg       const GFC_COMPLEX_16 one = 1, zero = 0;
   1393      1.1  mrg       const int lda = (axstride == 1) ? aystride : axstride,
   1394      1.1  mrg 		ldb = (bxstride == 1) ? bystride : bxstride;
   1395      1.1  mrg 
   1396      1.1  mrg       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
   1397      1.1  mrg 	{
   1398      1.1  mrg 	  assert (gemm != NULL);
   1399      1.1  mrg 	  const char *transa, *transb;
   1400      1.1  mrg 	  if (try_blas & 2)
   1401      1.1  mrg 	    transa = "C";
   1402      1.1  mrg 	  else
   1403      1.1  mrg 	    transa = axstride == 1 ? "N" : "T";
   1404      1.1  mrg 
   1405      1.1  mrg 	  if (try_blas & 4)
   1406      1.1  mrg 	    transb = "C";
   1407      1.1  mrg 	  else
   1408      1.1  mrg 	    transb = bxstride == 1 ? "N" : "T";
   1409      1.1  mrg 
   1410      1.1  mrg 	  gemm (transa, transb , &m,
   1411      1.1  mrg 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
   1412      1.1  mrg 		&ldc, 1, 1);
   1413      1.1  mrg 	  return;
   1414      1.1  mrg 	}
   1415      1.1  mrg     }
   1416      1.1  mrg 
   1417  1.1.1.2  mrg   if (rxstride == 1 && axstride == 1 && bxstride == 1
   1418  1.1.1.2  mrg       && GFC_DESCRIPTOR_RANK (b) != 1)
   1419      1.1  mrg     {
   1420      1.1  mrg       /* This block of code implements a tuned matmul, derived from
   1421      1.1  mrg          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
   1422      1.1  mrg 
   1423      1.1  mrg                Bo Kagstrom and Per Ling
   1424      1.1  mrg                Department of Computing Science
   1425      1.1  mrg                Umea University
   1426      1.1  mrg                S-901 87 Umea, Sweden
   1427      1.1  mrg 
   1428      1.1  mrg 	 from netlib.org, translated to C, and modified for matmul.m4.  */
   1429      1.1  mrg 
   1430      1.1  mrg       const GFC_COMPLEX_16 *a, *b;
   1431      1.1  mrg       GFC_COMPLEX_16 *c;
   1432      1.1  mrg       const index_type m = xcount, n = ycount, k = count;
   1433      1.1  mrg 
   1434      1.1  mrg       /* System generated locals */
   1435      1.1  mrg       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
   1436      1.1  mrg 		 i1, i2, i3, i4, i5, i6;
   1437      1.1  mrg 
   1438      1.1  mrg       /* Local variables */
   1439      1.1  mrg       GFC_COMPLEX_16 f11, f12, f21, f22, f31, f32, f41, f42,
   1440      1.1  mrg 		 f13, f14, f23, f24, f33, f34, f43, f44;
   1441      1.1  mrg       index_type i, j, l, ii, jj, ll;
   1442      1.1  mrg       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
   1443      1.1  mrg       GFC_COMPLEX_16 *t1;
   1444      1.1  mrg 
   1445      1.1  mrg       a = abase;
   1446      1.1  mrg       b = bbase;
   1447      1.1  mrg       c = retarray->base_addr;
   1448      1.1  mrg 
   1449      1.1  mrg       /* Parameter adjustments */
   1450      1.1  mrg       c_dim1 = rystride;
   1451      1.1  mrg       c_offset = 1 + c_dim1;
   1452      1.1  mrg       c -= c_offset;
   1453      1.1  mrg       a_dim1 = aystride;
   1454      1.1  mrg       a_offset = 1 + a_dim1;
   1455      1.1  mrg       a -= a_offset;
   1456      1.1  mrg       b_dim1 = bystride;
   1457      1.1  mrg       b_offset = 1 + b_dim1;
   1458      1.1  mrg       b -= b_offset;
   1459      1.1  mrg 
   1460      1.1  mrg       /* Empty c first.  */
   1461      1.1  mrg       for (j=1; j<=n; j++)
   1462      1.1  mrg 	for (i=1; i<=m; i++)
   1463      1.1  mrg 	  c[i + j * c_dim1] = (GFC_COMPLEX_16)0;
   1464      1.1  mrg 
   1465      1.1  mrg       /* Early exit if possible */
   1466      1.1  mrg       if (m == 0 || n == 0 || k == 0)
   1467      1.1  mrg 	return;
   1468      1.1  mrg 
   1469      1.1  mrg       /* Adjust size of t1 to what is needed.  */
   1470      1.1  mrg       index_type t1_dim, a_sz;
   1471      1.1  mrg       if (aystride == 1)
   1472      1.1  mrg         a_sz = rystride;
   1473      1.1  mrg       else
   1474      1.1  mrg         a_sz = a_dim1;
   1475      1.1  mrg 
   1476      1.1  mrg       t1_dim = a_sz * 256 + b_dim1;
   1477      1.1  mrg       if (t1_dim > 65536)
   1478      1.1  mrg 	t1_dim = 65536;
   1479      1.1  mrg 
   1480      1.1  mrg       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_16));
   1481      1.1  mrg 
   1482      1.1  mrg       /* Start turning the crank. */
   1483      1.1  mrg       i1 = n;
   1484      1.1  mrg       for (jj = 1; jj <= i1; jj += 512)
   1485      1.1  mrg 	{
   1486      1.1  mrg 	  /* Computing MIN */
   1487      1.1  mrg 	  i2 = 512;
   1488      1.1  mrg 	  i3 = n - jj + 1;
   1489      1.1  mrg 	  jsec = min(i2,i3);
   1490      1.1  mrg 	  ujsec = jsec - jsec % 4;
   1491      1.1  mrg 	  i2 = k;
   1492      1.1  mrg 	  for (ll = 1; ll <= i2; ll += 256)
   1493      1.1  mrg 	    {
   1494      1.1  mrg 	      /* Computing MIN */
   1495      1.1  mrg 	      i3 = 256;
   1496      1.1  mrg 	      i4 = k - ll + 1;
   1497      1.1  mrg 	      lsec = min(i3,i4);
   1498      1.1  mrg 	      ulsec = lsec - lsec % 2;
   1499      1.1  mrg 
   1500      1.1  mrg 	      i3 = m;
   1501      1.1  mrg 	      for (ii = 1; ii <= i3; ii += 256)
   1502      1.1  mrg 		{
   1503      1.1  mrg 		  /* Computing MIN */
   1504      1.1  mrg 		  i4 = 256;
   1505      1.1  mrg 		  i5 = m - ii + 1;
   1506      1.1  mrg 		  isec = min(i4,i5);
   1507      1.1  mrg 		  uisec = isec - isec % 2;
   1508      1.1  mrg 		  i4 = ll + ulsec - 1;
   1509      1.1  mrg 		  for (l = ll; l <= i4; l += 2)
   1510      1.1  mrg 		    {
   1511      1.1  mrg 		      i5 = ii + uisec - 1;
   1512      1.1  mrg 		      for (i = ii; i <= i5; i += 2)
   1513      1.1  mrg 			{
   1514      1.1  mrg 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
   1515      1.1  mrg 					a[i + l * a_dim1];
   1516      1.1  mrg 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
   1517      1.1  mrg 					a[i + (l + 1) * a_dim1];
   1518      1.1  mrg 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
   1519      1.1  mrg 					a[i + 1 + l * a_dim1];
   1520      1.1  mrg 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
   1521      1.1  mrg 					a[i + 1 + (l + 1) * a_dim1];
   1522      1.1  mrg 			}
   1523      1.1  mrg 		      if (uisec < isec)
   1524      1.1  mrg 			{
   1525      1.1  mrg 			  t1[l - ll + 1 + (isec << 8) - 257] =
   1526      1.1  mrg 				    a[ii + isec - 1 + l * a_dim1];
   1527      1.1  mrg 			  t1[l - ll + 2 + (isec << 8) - 257] =
   1528      1.1  mrg 				    a[ii + isec - 1 + (l + 1) * a_dim1];
   1529      1.1  mrg 			}
   1530      1.1  mrg 		    }
   1531      1.1  mrg 		  if (ulsec < lsec)
   1532      1.1  mrg 		    {
   1533      1.1  mrg 		      i4 = ii + isec - 1;
   1534      1.1  mrg 		      for (i = ii; i<= i4; ++i)
   1535      1.1  mrg 			{
   1536      1.1  mrg 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
   1537      1.1  mrg 				    a[i + (ll + lsec - 1) * a_dim1];
   1538      1.1  mrg 			}
   1539      1.1  mrg 		    }
   1540      1.1  mrg 
   1541      1.1  mrg 		  uisec = isec - isec % 4;
   1542      1.1  mrg 		  i4 = jj + ujsec - 1;
   1543      1.1  mrg 		  for (j = jj; j <= i4; j += 4)
   1544      1.1  mrg 		    {
   1545      1.1  mrg 		      i5 = ii + uisec - 1;
   1546      1.1  mrg 		      for (i = ii; i <= i5; i += 4)
   1547      1.1  mrg 			{
   1548      1.1  mrg 			  f11 = c[i + j * c_dim1];
   1549      1.1  mrg 			  f21 = c[i + 1 + j * c_dim1];
   1550      1.1  mrg 			  f12 = c[i + (j + 1) * c_dim1];
   1551      1.1  mrg 			  f22 = c[i + 1 + (j + 1) * c_dim1];
   1552      1.1  mrg 			  f13 = c[i + (j + 2) * c_dim1];
   1553      1.1  mrg 			  f23 = c[i + 1 + (j + 2) * c_dim1];
   1554      1.1  mrg 			  f14 = c[i + (j + 3) * c_dim1];
   1555      1.1  mrg 			  f24 = c[i + 1 + (j + 3) * c_dim1];
   1556      1.1  mrg 			  f31 = c[i + 2 + j * c_dim1];
   1557      1.1  mrg 			  f41 = c[i + 3 + j * c_dim1];
   1558      1.1  mrg 			  f32 = c[i + 2 + (j + 1) * c_dim1];
   1559      1.1  mrg 			  f42 = c[i + 3 + (j + 1) * c_dim1];
   1560      1.1  mrg 			  f33 = c[i + 2 + (j + 2) * c_dim1];
   1561      1.1  mrg 			  f43 = c[i + 3 + (j + 2) * c_dim1];
   1562      1.1  mrg 			  f34 = c[i + 2 + (j + 3) * c_dim1];
   1563      1.1  mrg 			  f44 = c[i + 3 + (j + 3) * c_dim1];
   1564      1.1  mrg 			  i6 = ll + lsec - 1;
   1565      1.1  mrg 			  for (l = ll; l <= i6; ++l)
   1566      1.1  mrg 			    {
   1567      1.1  mrg 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
   1568      1.1  mrg 				      * b[l + j * b_dim1];
   1569      1.1  mrg 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
   1570      1.1  mrg 				      * b[l + j * b_dim1];
   1571      1.1  mrg 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
   1572      1.1  mrg 				      * b[l + (j + 1) * b_dim1];
   1573      1.1  mrg 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
   1574      1.1  mrg 				      * b[l + (j + 1) * b_dim1];
   1575      1.1  mrg 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
   1576      1.1  mrg 				      * b[l + (j + 2) * b_dim1];
   1577      1.1  mrg 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
   1578      1.1  mrg 				      * b[l + (j + 2) * b_dim1];
   1579      1.1  mrg 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
   1580      1.1  mrg 				      * b[l + (j + 3) * b_dim1];
   1581      1.1  mrg 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
   1582      1.1  mrg 				      * b[l + (j + 3) * b_dim1];
   1583      1.1  mrg 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
   1584      1.1  mrg 				      * b[l + j * b_dim1];
   1585      1.1  mrg 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
   1586      1.1  mrg 				      * b[l + j * b_dim1];
   1587      1.1  mrg 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
   1588      1.1  mrg 				      * b[l + (j + 1) * b_dim1];
   1589      1.1  mrg 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
   1590      1.1  mrg 				      * b[l + (j + 1) * b_dim1];
   1591      1.1  mrg 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
   1592      1.1  mrg 				      * b[l + (j + 2) * b_dim1];
   1593      1.1  mrg 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
   1594      1.1  mrg 				      * b[l + (j + 2) * b_dim1];
   1595      1.1  mrg 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
   1596      1.1  mrg 				      * b[l + (j + 3) * b_dim1];
   1597      1.1  mrg 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
   1598      1.1  mrg 				      * b[l + (j + 3) * b_dim1];
   1599      1.1  mrg 			    }
   1600      1.1  mrg 			  c[i + j * c_dim1] = f11;
   1601      1.1  mrg 			  c[i + 1 + j * c_dim1] = f21;
   1602      1.1  mrg 			  c[i + (j + 1) * c_dim1] = f12;
   1603      1.1  mrg 			  c[i + 1 + (j + 1) * c_dim1] = f22;
   1604      1.1  mrg 			  c[i + (j + 2) * c_dim1] = f13;
   1605      1.1  mrg 			  c[i + 1 + (j + 2) * c_dim1] = f23;
   1606      1.1  mrg 			  c[i + (j + 3) * c_dim1] = f14;
   1607      1.1  mrg 			  c[i + 1 + (j + 3) * c_dim1] = f24;
   1608      1.1  mrg 			  c[i + 2 + j * c_dim1] = f31;
   1609      1.1  mrg 			  c[i + 3 + j * c_dim1] = f41;
   1610      1.1  mrg 			  c[i + 2 + (j + 1) * c_dim1] = f32;
   1611      1.1  mrg 			  c[i + 3 + (j + 1) * c_dim1] = f42;
   1612      1.1  mrg 			  c[i + 2 + (j + 2) * c_dim1] = f33;
   1613      1.1  mrg 			  c[i + 3 + (j + 2) * c_dim1] = f43;
   1614      1.1  mrg 			  c[i + 2 + (j + 3) * c_dim1] = f34;
   1615      1.1  mrg 			  c[i + 3 + (j + 3) * c_dim1] = f44;
   1616      1.1  mrg 			}
   1617      1.1  mrg 		      if (uisec < isec)
   1618      1.1  mrg 			{
   1619      1.1  mrg 			  i5 = ii + isec - 1;
   1620      1.1  mrg 			  for (i = ii + uisec; i <= i5; ++i)
   1621      1.1  mrg 			    {
   1622      1.1  mrg 			      f11 = c[i + j * c_dim1];
   1623      1.1  mrg 			      f12 = c[i + (j + 1) * c_dim1];
   1624      1.1  mrg 			      f13 = c[i + (j + 2) * c_dim1];
   1625      1.1  mrg 			      f14 = c[i + (j + 3) * c_dim1];
   1626      1.1  mrg 			      i6 = ll + lsec - 1;
   1627      1.1  mrg 			      for (l = ll; l <= i6; ++l)
   1628      1.1  mrg 				{
   1629      1.1  mrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
   1630      1.1  mrg 					  257] * b[l + j * b_dim1];
   1631      1.1  mrg 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
   1632      1.1  mrg 					  257] * b[l + (j + 1) * b_dim1];
   1633      1.1  mrg 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
   1634      1.1  mrg 					  257] * b[l + (j + 2) * b_dim1];
   1635      1.1  mrg 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
   1636      1.1  mrg 					  257] * b[l + (j + 3) * b_dim1];
   1637      1.1  mrg 				}
   1638      1.1  mrg 			      c[i + j * c_dim1] = f11;
   1639      1.1  mrg 			      c[i + (j + 1) * c_dim1] = f12;
   1640      1.1  mrg 			      c[i + (j + 2) * c_dim1] = f13;
   1641      1.1  mrg 			      c[i + (j + 3) * c_dim1] = f14;
   1642      1.1  mrg 			    }
   1643      1.1  mrg 			}
   1644      1.1  mrg 		    }
   1645      1.1  mrg 		  if (ujsec < jsec)
   1646      1.1  mrg 		    {
   1647      1.1  mrg 		      i4 = jj + jsec - 1;
   1648      1.1  mrg 		      for (j = jj + ujsec; j <= i4; ++j)
   1649      1.1  mrg 			{
   1650      1.1  mrg 			  i5 = ii + uisec - 1;
   1651      1.1  mrg 			  for (i = ii; i <= i5; i += 4)
   1652      1.1  mrg 			    {
   1653      1.1  mrg 			      f11 = c[i + j * c_dim1];
   1654      1.1  mrg 			      f21 = c[i + 1 + j * c_dim1];
   1655      1.1  mrg 			      f31 = c[i + 2 + j * c_dim1];
   1656      1.1  mrg 			      f41 = c[i + 3 + j * c_dim1];
   1657      1.1  mrg 			      i6 = ll + lsec - 1;
   1658      1.1  mrg 			      for (l = ll; l <= i6; ++l)
   1659      1.1  mrg 				{
   1660      1.1  mrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
   1661      1.1  mrg 					  257] * b[l + j * b_dim1];
   1662      1.1  mrg 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
   1663      1.1  mrg 					  257] * b[l + j * b_dim1];
   1664      1.1  mrg 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
   1665      1.1  mrg 					  257] * b[l + j * b_dim1];
   1666      1.1  mrg 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
   1667      1.1  mrg 					  257] * b[l + j * b_dim1];
   1668      1.1  mrg 				}
   1669      1.1  mrg 			      c[i + j * c_dim1] = f11;
   1670      1.1  mrg 			      c[i + 1 + j * c_dim1] = f21;
   1671      1.1  mrg 			      c[i + 2 + j * c_dim1] = f31;
   1672      1.1  mrg 			      c[i + 3 + j * c_dim1] = f41;
   1673      1.1  mrg 			    }
   1674      1.1  mrg 			  i5 = ii + isec - 1;
   1675      1.1  mrg 			  for (i = ii + uisec; i <= i5; ++i)
   1676      1.1  mrg 			    {
   1677      1.1  mrg 			      f11 = c[i + j * c_dim1];
   1678      1.1  mrg 			      i6 = ll + lsec - 1;
   1679      1.1  mrg 			      for (l = ll; l <= i6; ++l)
   1680      1.1  mrg 				{
   1681      1.1  mrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
   1682      1.1  mrg 					  257] * b[l + j * b_dim1];
   1683      1.1  mrg 				}
   1684      1.1  mrg 			      c[i + j * c_dim1] = f11;
   1685      1.1  mrg 			    }
   1686      1.1  mrg 			}
   1687      1.1  mrg 		    }
   1688      1.1  mrg 		}
   1689      1.1  mrg 	    }
   1690      1.1  mrg 	}
   1691      1.1  mrg       free(t1);
   1692      1.1  mrg       return;
   1693      1.1  mrg     }
   1694      1.1  mrg   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
   1695      1.1  mrg     {
   1696      1.1  mrg       if (GFC_DESCRIPTOR_RANK (a) != 1)
   1697      1.1  mrg 	{
   1698      1.1  mrg 	  const GFC_COMPLEX_16 *restrict abase_x;
   1699      1.1  mrg 	  const GFC_COMPLEX_16 *restrict bbase_y;
   1700      1.1  mrg 	  GFC_COMPLEX_16 *restrict dest_y;
   1701      1.1  mrg 	  GFC_COMPLEX_16 s;
   1702      1.1  mrg 
   1703      1.1  mrg 	  for (y = 0; y < ycount; y++)
   1704      1.1  mrg 	    {
   1705      1.1  mrg 	      bbase_y = &bbase[y*bystride];
   1706      1.1  mrg 	      dest_y = &dest[y*rystride];
   1707      1.1  mrg 	      for (x = 0; x < xcount; x++)
   1708      1.1  mrg 		{
   1709      1.1  mrg 		  abase_x = &abase[x*axstride];
   1710      1.1  mrg 		  s = (GFC_COMPLEX_16) 0;
   1711      1.1  mrg 		  for (n = 0; n < count; n++)
   1712      1.1  mrg 		    s += abase_x[n] * bbase_y[n];
   1713      1.1  mrg 		  dest_y[x] = s;
   1714      1.1  mrg 		}
   1715      1.1  mrg 	    }
   1716      1.1  mrg 	}
   1717      1.1  mrg       else
   1718      1.1  mrg 	{
   1719      1.1  mrg 	  const GFC_COMPLEX_16 *restrict bbase_y;
   1720      1.1  mrg 	  GFC_COMPLEX_16 s;
   1721      1.1  mrg 
   1722      1.1  mrg 	  for (y = 0; y < ycount; y++)
   1723      1.1  mrg 	    {
   1724      1.1  mrg 	      bbase_y = &bbase[y*bystride];
   1725      1.1  mrg 	      s = (GFC_COMPLEX_16) 0;
   1726      1.1  mrg 	      for (n = 0; n < count; n++)
   1727      1.1  mrg 		s += abase[n*axstride] * bbase_y[n];
   1728      1.1  mrg 	      dest[y*rystride] = s;
   1729      1.1  mrg 	    }
   1730      1.1  mrg 	}
   1731      1.1  mrg     }
   1732      1.1  mrg   else if (GFC_DESCRIPTOR_RANK (a) == 1)
   1733      1.1  mrg     {
   1734      1.1  mrg       const GFC_COMPLEX_16 *restrict bbase_y;
   1735      1.1  mrg       GFC_COMPLEX_16 s;
   1736      1.1  mrg 
   1737      1.1  mrg       for (y = 0; y < ycount; y++)
   1738      1.1  mrg 	{
   1739      1.1  mrg 	  bbase_y = &bbase[y*bystride];
   1740      1.1  mrg 	  s = (GFC_COMPLEX_16) 0;
   1741      1.1  mrg 	  for (n = 0; n < count; n++)
   1742      1.1  mrg 	    s += abase[n*axstride] * bbase_y[n*bxstride];
   1743      1.1  mrg 	  dest[y*rxstride] = s;
   1744      1.1  mrg 	}
   1745      1.1  mrg     }
   1746  1.1.1.2  mrg   else if (axstride < aystride)
   1747  1.1.1.2  mrg     {
   1748  1.1.1.2  mrg       for (y = 0; y < ycount; y++)
   1749  1.1.1.2  mrg 	for (x = 0; x < xcount; x++)
   1750  1.1.1.2  mrg 	  dest[x*rxstride + y*rystride] = (GFC_COMPLEX_16)0;
   1751  1.1.1.2  mrg 
   1752  1.1.1.2  mrg       for (y = 0; y < ycount; y++)
   1753  1.1.1.2  mrg 	for (n = 0; n < count; n++)
   1754  1.1.1.2  mrg 	  for (x = 0; x < xcount; x++)
   1755  1.1.1.2  mrg 	    /* dest[x,y] += a[x,n] * b[n,y] */
   1756  1.1.1.2  mrg 	    dest[x*rxstride + y*rystride] +=
   1757  1.1.1.2  mrg 					abase[x*axstride + n*aystride] *
   1758  1.1.1.2  mrg 					bbase[n*bxstride + y*bystride];
   1759  1.1.1.2  mrg     }
   1760      1.1  mrg   else
   1761      1.1  mrg     {
   1762      1.1  mrg       const GFC_COMPLEX_16 *restrict abase_x;
   1763      1.1  mrg       const GFC_COMPLEX_16 *restrict bbase_y;
   1764      1.1  mrg       GFC_COMPLEX_16 *restrict dest_y;
   1765      1.1  mrg       GFC_COMPLEX_16 s;
   1766      1.1  mrg 
   1767      1.1  mrg       for (y = 0; y < ycount; y++)
   1768      1.1  mrg 	{
   1769      1.1  mrg 	  bbase_y = &bbase[y*bystride];
   1770      1.1  mrg 	  dest_y = &dest[y*rystride];
   1771      1.1  mrg 	  for (x = 0; x < xcount; x++)
   1772      1.1  mrg 	    {
   1773      1.1  mrg 	      abase_x = &abase[x*axstride];
   1774      1.1  mrg 	      s = (GFC_COMPLEX_16) 0;
   1775      1.1  mrg 	      for (n = 0; n < count; n++)
   1776      1.1  mrg 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
   1777      1.1  mrg 	      dest_y[x*rxstride] = s;
   1778      1.1  mrg 	    }
   1779      1.1  mrg 	}
   1780      1.1  mrg     }
   1781      1.1  mrg }
   1782      1.1  mrg #undef POW3
   1783      1.1  mrg #undef min
   1784      1.1  mrg #undef max
   1785      1.1  mrg 
   1786      1.1  mrg #endif  /* HAVE_AVX512F */
   1787      1.1  mrg 
   1788      1.1  mrg /* AMD-specifix funtions with AVX128 and FMA3/FMA4.  */
   1789      1.1  mrg 
   1790      1.1  mrg #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
   1791      1.1  mrg void
   1792      1.1  mrg matmul_c16_avx128_fma3 (gfc_array_c16 * const restrict retarray,
   1793      1.1  mrg 	gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
   1794      1.1  mrg 	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
   1795      1.1  mrg internal_proto(matmul_c16_avx128_fma3);
   1796      1.1  mrg #endif
   1797      1.1  mrg 
   1798      1.1  mrg #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
   1799      1.1  mrg void
   1800      1.1  mrg matmul_c16_avx128_fma4 (gfc_array_c16 * const restrict retarray,
   1801      1.1  mrg 	gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
   1802      1.1  mrg 	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
   1803      1.1  mrg internal_proto(matmul_c16_avx128_fma4);
   1804      1.1  mrg #endif
   1805      1.1  mrg 
   1806      1.1  mrg /* Function to fall back to if there is no special processor-specific version.  */
   1807      1.1  mrg static void
   1808      1.1  mrg matmul_c16_vanilla (gfc_array_c16 * const restrict retarray,
   1809      1.1  mrg 	gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
   1810      1.1  mrg 	int blas_limit, blas_call gemm)
   1811      1.1  mrg {
   1812      1.1  mrg   const GFC_COMPLEX_16 * restrict abase;
   1813      1.1  mrg   const GFC_COMPLEX_16 * restrict bbase;
   1814      1.1  mrg   GFC_COMPLEX_16 * restrict dest;
   1815      1.1  mrg 
   1816      1.1  mrg   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
   1817      1.1  mrg   index_type x, y, n, count, xcount, ycount;
   1818      1.1  mrg 
   1819      1.1  mrg   assert (GFC_DESCRIPTOR_RANK (a) == 2
   1820      1.1  mrg           || GFC_DESCRIPTOR_RANK (b) == 2);
   1821      1.1  mrg 
   1822      1.1  mrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
   1823      1.1  mrg 
   1824      1.1  mrg    Either A or B (but not both) can be rank 1:
   1825      1.1  mrg 
   1826      1.1  mrg    o One-dimensional argument A is implicitly treated as a row matrix
   1827      1.1  mrg      dimensioned [1,count], so xcount=1.
   1828      1.1  mrg 
   1829      1.1  mrg    o One-dimensional argument B is implicitly treated as a column matrix
   1830      1.1  mrg      dimensioned [count, 1], so ycount=1.
   1831      1.1  mrg */
   1832      1.1  mrg 
   1833      1.1  mrg   if (retarray->base_addr == NULL)
   1834      1.1  mrg     {
   1835      1.1  mrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
   1836      1.1  mrg         {
   1837      1.1  mrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
   1838      1.1  mrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
   1839      1.1  mrg         }
   1840      1.1  mrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
   1841      1.1  mrg         {
   1842      1.1  mrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
   1843      1.1  mrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
   1844      1.1  mrg         }
   1845      1.1  mrg       else
   1846      1.1  mrg         {
   1847      1.1  mrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
   1848      1.1  mrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
   1849      1.1  mrg 
   1850      1.1  mrg           GFC_DIMENSION_SET(retarray->dim[1], 0,
   1851      1.1  mrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
   1852      1.1  mrg 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
   1853      1.1  mrg         }
   1854      1.1  mrg 
   1855      1.1  mrg       retarray->base_addr
   1856      1.1  mrg 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_16));
   1857      1.1  mrg       retarray->offset = 0;
   1858      1.1  mrg     }
   1859      1.1  mrg   else if (unlikely (compile_options.bounds_check))
   1860      1.1  mrg     {
   1861      1.1  mrg       index_type ret_extent, arg_extent;
   1862      1.1  mrg 
   1863      1.1  mrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
   1864      1.1  mrg 	{
   1865      1.1  mrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
   1866      1.1  mrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
   1867      1.1  mrg 	  if (arg_extent != ret_extent)
   1868      1.1  mrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
   1869      1.1  mrg 	    		   "array (%ld/%ld) ",
   1870      1.1  mrg 			   (long int) ret_extent, (long int) arg_extent);
   1871      1.1  mrg 	}
   1872      1.1  mrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
   1873      1.1  mrg 	{
   1874      1.1  mrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
   1875      1.1  mrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
   1876      1.1  mrg 	  if (arg_extent != ret_extent)
   1877      1.1  mrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
   1878      1.1  mrg 	    		   "array (%ld/%ld) ",
   1879      1.1  mrg 			   (long int) ret_extent, (long int) arg_extent);
   1880      1.1  mrg 	}
   1881      1.1  mrg       else
   1882      1.1  mrg 	{
   1883      1.1  mrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
   1884      1.1  mrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
   1885      1.1  mrg 	  if (arg_extent != ret_extent)
   1886      1.1  mrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
   1887      1.1  mrg 	    		   "array (%ld/%ld) ",
   1888      1.1  mrg 			   (long int) ret_extent, (long int) arg_extent);
   1889      1.1  mrg 
   1890      1.1  mrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
   1891      1.1  mrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
   1892      1.1  mrg 	  if (arg_extent != ret_extent)
   1893      1.1  mrg 	    runtime_error ("Array bound mismatch for dimension 2 of "
   1894      1.1  mrg 	    		   "array (%ld/%ld) ",
   1895      1.1  mrg 			   (long int) ret_extent, (long int) arg_extent);
   1896      1.1  mrg 	}
   1897      1.1  mrg     }
   1898      1.1  mrg 
   1899      1.1  mrg 
   1900      1.1  mrg   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
   1901      1.1  mrg     {
   1902      1.1  mrg       /* One-dimensional result may be addressed in the code below
   1903      1.1  mrg 	 either as a row or a column matrix. We want both cases to
   1904      1.1  mrg 	 work. */
   1905      1.1  mrg       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
   1906      1.1  mrg     }
   1907      1.1  mrg   else
   1908      1.1  mrg     {
   1909      1.1  mrg       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
   1910      1.1  mrg       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
   1911      1.1  mrg     }
   1912      1.1  mrg 
   1913      1.1  mrg 
   1914      1.1  mrg   if (GFC_DESCRIPTOR_RANK (a) == 1)
   1915      1.1  mrg     {
   1916      1.1  mrg       /* Treat it as a a row matrix A[1,count]. */
   1917      1.1  mrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
   1918      1.1  mrg       aystride = 1;
   1919      1.1  mrg 
   1920      1.1  mrg       xcount = 1;
   1921      1.1  mrg       count = GFC_DESCRIPTOR_EXTENT(a,0);
   1922      1.1  mrg     }
   1923      1.1  mrg   else
   1924      1.1  mrg     {
   1925      1.1  mrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
   1926      1.1  mrg       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
   1927      1.1  mrg 
   1928      1.1  mrg       count = GFC_DESCRIPTOR_EXTENT(a,1);
   1929      1.1  mrg       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
   1930      1.1  mrg     }
   1931      1.1  mrg 
   1932      1.1  mrg   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
   1933      1.1  mrg     {
   1934      1.1  mrg       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
   1935      1.1  mrg 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
   1936      1.1  mrg 		       "in dimension 1: is %ld, should be %ld",
   1937      1.1  mrg 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
   1938      1.1  mrg     }
   1939      1.1  mrg 
   1940      1.1  mrg   if (GFC_DESCRIPTOR_RANK (b) == 1)
   1941      1.1  mrg     {
   1942      1.1  mrg       /* Treat it as a column matrix B[count,1] */
   1943      1.1  mrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
   1944      1.1  mrg 
   1945      1.1  mrg       /* bystride should never be used for 1-dimensional b.
   1946      1.1  mrg          The value is only used for calculation of the
   1947      1.1  mrg          memory by the buffer.  */
   1948      1.1  mrg       bystride = 256;
   1949      1.1  mrg       ycount = 1;
   1950      1.1  mrg     }
   1951      1.1  mrg   else
   1952      1.1  mrg     {
   1953      1.1  mrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
   1954      1.1  mrg       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
   1955      1.1  mrg       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
   1956      1.1  mrg     }
   1957      1.1  mrg 
   1958      1.1  mrg   abase = a->base_addr;
   1959      1.1  mrg   bbase = b->base_addr;
   1960      1.1  mrg   dest = retarray->base_addr;
   1961      1.1  mrg 
   1962      1.1  mrg   /* Now that everything is set up, we perform the multiplication
   1963      1.1  mrg      itself.  */
   1964      1.1  mrg 
   1965      1.1  mrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
   1966      1.1  mrg #define min(a,b) ((a) <= (b) ? (a) : (b))
   1967      1.1  mrg #define max(a,b) ((a) >= (b) ? (a) : (b))
   1968      1.1  mrg 
   1969      1.1  mrg   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
   1970      1.1  mrg       && (bxstride == 1 || bystride == 1)
   1971      1.1  mrg       && (((float) xcount) * ((float) ycount) * ((float) count)
   1972      1.1  mrg           > POW3(blas_limit)))
   1973      1.1  mrg     {
   1974      1.1  mrg       const int m = xcount, n = ycount, k = count, ldc = rystride;
   1975      1.1  mrg       const GFC_COMPLEX_16 one = 1, zero = 0;
   1976      1.1  mrg       const int lda = (axstride == 1) ? aystride : axstride,
   1977      1.1  mrg 		ldb = (bxstride == 1) ? bystride : bxstride;
   1978      1.1  mrg 
   1979      1.1  mrg       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
   1980      1.1  mrg 	{
   1981      1.1  mrg 	  assert (gemm != NULL);
   1982      1.1  mrg 	  const char *transa, *transb;
   1983      1.1  mrg 	  if (try_blas & 2)
   1984      1.1  mrg 	    transa = "C";
   1985      1.1  mrg 	  else
   1986      1.1  mrg 	    transa = axstride == 1 ? "N" : "T";
   1987      1.1  mrg 
   1988      1.1  mrg 	  if (try_blas & 4)
   1989      1.1  mrg 	    transb = "C";
   1990      1.1  mrg 	  else
   1991      1.1  mrg 	    transb = bxstride == 1 ? "N" : "T";
   1992      1.1  mrg 
   1993      1.1  mrg 	  gemm (transa, transb , &m,
   1994      1.1  mrg 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
   1995      1.1  mrg 		&ldc, 1, 1);
   1996      1.1  mrg 	  return;
   1997      1.1  mrg 	}
   1998      1.1  mrg     }
   1999      1.1  mrg 
   2000  1.1.1.2  mrg   if (rxstride == 1 && axstride == 1 && bxstride == 1
   2001  1.1.1.2  mrg       && GFC_DESCRIPTOR_RANK (b) != 1)
   2002      1.1  mrg     {
   2003      1.1  mrg       /* This block of code implements a tuned matmul, derived from
   2004      1.1  mrg          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
   2005      1.1  mrg 
   2006      1.1  mrg                Bo Kagstrom and Per Ling
   2007      1.1  mrg                Department of Computing Science
   2008      1.1  mrg                Umea University
   2009      1.1  mrg                S-901 87 Umea, Sweden
   2010      1.1  mrg 
   2011      1.1  mrg 	 from netlib.org, translated to C, and modified for matmul.m4.  */
   2012      1.1  mrg 
   2013      1.1  mrg       const GFC_COMPLEX_16 *a, *b;
   2014      1.1  mrg       GFC_COMPLEX_16 *c;
   2015      1.1  mrg       const index_type m = xcount, n = ycount, k = count;
   2016      1.1  mrg 
   2017      1.1  mrg       /* System generated locals */
   2018      1.1  mrg       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
   2019      1.1  mrg 		 i1, i2, i3, i4, i5, i6;
   2020      1.1  mrg 
   2021      1.1  mrg       /* Local variables */
   2022      1.1  mrg       GFC_COMPLEX_16 f11, f12, f21, f22, f31, f32, f41, f42,
   2023      1.1  mrg 		 f13, f14, f23, f24, f33, f34, f43, f44;
   2024      1.1  mrg       index_type i, j, l, ii, jj, ll;
   2025      1.1  mrg       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
   2026      1.1  mrg       GFC_COMPLEX_16 *t1;
   2027      1.1  mrg 
   2028      1.1  mrg       a = abase;
   2029      1.1  mrg       b = bbase;
   2030      1.1  mrg       c = retarray->base_addr;
   2031      1.1  mrg 
   2032      1.1  mrg       /* Parameter adjustments */
   2033      1.1  mrg       c_dim1 = rystride;
   2034      1.1  mrg       c_offset = 1 + c_dim1;
   2035      1.1  mrg       c -= c_offset;
   2036      1.1  mrg       a_dim1 = aystride;
   2037      1.1  mrg       a_offset = 1 + a_dim1;
   2038      1.1  mrg       a -= a_offset;
   2039      1.1  mrg       b_dim1 = bystride;
   2040      1.1  mrg       b_offset = 1 + b_dim1;
   2041      1.1  mrg       b -= b_offset;
   2042      1.1  mrg 
   2043      1.1  mrg       /* Empty c first.  */
   2044      1.1  mrg       for (j=1; j<=n; j++)
   2045      1.1  mrg 	for (i=1; i<=m; i++)
   2046      1.1  mrg 	  c[i + j * c_dim1] = (GFC_COMPLEX_16)0;
   2047      1.1  mrg 
   2048      1.1  mrg       /* Early exit if possible */
   2049      1.1  mrg       if (m == 0 || n == 0 || k == 0)
   2050      1.1  mrg 	return;
   2051      1.1  mrg 
   2052      1.1  mrg       /* Adjust size of t1 to what is needed.  */
   2053      1.1  mrg       index_type t1_dim, a_sz;
   2054      1.1  mrg       if (aystride == 1)
   2055      1.1  mrg         a_sz = rystride;
   2056      1.1  mrg       else
   2057      1.1  mrg         a_sz = a_dim1;
   2058      1.1  mrg 
   2059      1.1  mrg       t1_dim = a_sz * 256 + b_dim1;
   2060      1.1  mrg       if (t1_dim > 65536)
   2061      1.1  mrg 	t1_dim = 65536;
   2062      1.1  mrg 
   2063      1.1  mrg       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_16));
   2064      1.1  mrg 
   2065      1.1  mrg       /* Start turning the crank. */
   2066      1.1  mrg       i1 = n;
   2067      1.1  mrg       for (jj = 1; jj <= i1; jj += 512)
   2068      1.1  mrg 	{
   2069      1.1  mrg 	  /* Computing MIN */
   2070      1.1  mrg 	  i2 = 512;
   2071      1.1  mrg 	  i3 = n - jj + 1;
   2072      1.1  mrg 	  jsec = min(i2,i3);
   2073      1.1  mrg 	  ujsec = jsec - jsec % 4;
   2074      1.1  mrg 	  i2 = k;
   2075      1.1  mrg 	  for (ll = 1; ll <= i2; ll += 256)
   2076      1.1  mrg 	    {
   2077      1.1  mrg 	      /* Computing MIN */
   2078      1.1  mrg 	      i3 = 256;
   2079      1.1  mrg 	      i4 = k - ll + 1;
   2080      1.1  mrg 	      lsec = min(i3,i4);
   2081      1.1  mrg 	      ulsec = lsec - lsec % 2;
   2082      1.1  mrg 
   2083      1.1  mrg 	      i3 = m;
   2084      1.1  mrg 	      for (ii = 1; ii <= i3; ii += 256)
   2085      1.1  mrg 		{
   2086      1.1  mrg 		  /* Computing MIN */
   2087      1.1  mrg 		  i4 = 256;
   2088      1.1  mrg 		  i5 = m - ii + 1;
   2089      1.1  mrg 		  isec = min(i4,i5);
   2090      1.1  mrg 		  uisec = isec - isec % 2;
   2091      1.1  mrg 		  i4 = ll + ulsec - 1;
   2092      1.1  mrg 		  for (l = ll; l <= i4; l += 2)
   2093      1.1  mrg 		    {
   2094      1.1  mrg 		      i5 = ii + uisec - 1;
   2095      1.1  mrg 		      for (i = ii; i <= i5; i += 2)
   2096      1.1  mrg 			{
   2097      1.1  mrg 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
   2098      1.1  mrg 					a[i + l * a_dim1];
   2099      1.1  mrg 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
   2100      1.1  mrg 					a[i + (l + 1) * a_dim1];
   2101      1.1  mrg 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
   2102      1.1  mrg 					a[i + 1 + l * a_dim1];
   2103      1.1  mrg 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
   2104      1.1  mrg 					a[i + 1 + (l + 1) * a_dim1];
   2105      1.1  mrg 			}
   2106      1.1  mrg 		      if (uisec < isec)
   2107      1.1  mrg 			{
   2108      1.1  mrg 			  t1[l - ll + 1 + (isec << 8) - 257] =
   2109      1.1  mrg 				    a[ii + isec - 1 + l * a_dim1];
   2110      1.1  mrg 			  t1[l - ll + 2 + (isec << 8) - 257] =
   2111      1.1  mrg 				    a[ii + isec - 1 + (l + 1) * a_dim1];
   2112      1.1  mrg 			}
   2113      1.1  mrg 		    }
   2114      1.1  mrg 		  if (ulsec < lsec)
   2115      1.1  mrg 		    {
   2116      1.1  mrg 		      i4 = ii + isec - 1;
   2117      1.1  mrg 		      for (i = ii; i<= i4; ++i)
   2118      1.1  mrg 			{
   2119      1.1  mrg 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
   2120      1.1  mrg 				    a[i + (ll + lsec - 1) * a_dim1];
   2121      1.1  mrg 			}
   2122      1.1  mrg 		    }
   2123      1.1  mrg 
   2124      1.1  mrg 		  uisec = isec - isec % 4;
   2125      1.1  mrg 		  i4 = jj + ujsec - 1;
   2126      1.1  mrg 		  for (j = jj; j <= i4; j += 4)
   2127      1.1  mrg 		    {
   2128      1.1  mrg 		      i5 = ii + uisec - 1;
   2129      1.1  mrg 		      for (i = ii; i <= i5; i += 4)
   2130      1.1  mrg 			{
   2131      1.1  mrg 			  f11 = c[i + j * c_dim1];
   2132      1.1  mrg 			  f21 = c[i + 1 + j * c_dim1];
   2133      1.1  mrg 			  f12 = c[i + (j + 1) * c_dim1];
   2134      1.1  mrg 			  f22 = c[i + 1 + (j + 1) * c_dim1];
   2135      1.1  mrg 			  f13 = c[i + (j + 2) * c_dim1];
   2136      1.1  mrg 			  f23 = c[i + 1 + (j + 2) * c_dim1];
   2137      1.1  mrg 			  f14 = c[i + (j + 3) * c_dim1];
   2138      1.1  mrg 			  f24 = c[i + 1 + (j + 3) * c_dim1];
   2139      1.1  mrg 			  f31 = c[i + 2 + j * c_dim1];
   2140      1.1  mrg 			  f41 = c[i + 3 + j * c_dim1];
   2141      1.1  mrg 			  f32 = c[i + 2 + (j + 1) * c_dim1];
   2142      1.1  mrg 			  f42 = c[i + 3 + (j + 1) * c_dim1];
   2143      1.1  mrg 			  f33 = c[i + 2 + (j + 2) * c_dim1];
   2144      1.1  mrg 			  f43 = c[i + 3 + (j + 2) * c_dim1];
   2145      1.1  mrg 			  f34 = c[i + 2 + (j + 3) * c_dim1];
   2146      1.1  mrg 			  f44 = c[i + 3 + (j + 3) * c_dim1];
   2147      1.1  mrg 			  i6 = ll + lsec - 1;
   2148      1.1  mrg 			  for (l = ll; l <= i6; ++l)
   2149      1.1  mrg 			    {
   2150      1.1  mrg 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
   2151      1.1  mrg 				      * b[l + j * b_dim1];
   2152      1.1  mrg 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
   2153      1.1  mrg 				      * b[l + j * b_dim1];
   2154      1.1  mrg 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
   2155      1.1  mrg 				      * b[l + (j + 1) * b_dim1];
   2156      1.1  mrg 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
   2157      1.1  mrg 				      * b[l + (j + 1) * b_dim1];
   2158      1.1  mrg 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
   2159      1.1  mrg 				      * b[l + (j + 2) * b_dim1];
   2160      1.1  mrg 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
   2161      1.1  mrg 				      * b[l + (j + 2) * b_dim1];
   2162      1.1  mrg 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
   2163      1.1  mrg 				      * b[l + (j + 3) * b_dim1];
   2164      1.1  mrg 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
   2165      1.1  mrg 				      * b[l + (j + 3) * b_dim1];
   2166      1.1  mrg 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
   2167      1.1  mrg 				      * b[l + j * b_dim1];
   2168      1.1  mrg 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
   2169      1.1  mrg 				      * b[l + j * b_dim1];
   2170      1.1  mrg 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
   2171      1.1  mrg 				      * b[l + (j + 1) * b_dim1];
   2172      1.1  mrg 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
   2173      1.1  mrg 				      * b[l + (j + 1) * b_dim1];
   2174      1.1  mrg 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
   2175      1.1  mrg 				      * b[l + (j + 2) * b_dim1];
   2176      1.1  mrg 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
   2177      1.1  mrg 				      * b[l + (j + 2) * b_dim1];
   2178      1.1  mrg 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
   2179      1.1  mrg 				      * b[l + (j + 3) * b_dim1];
   2180      1.1  mrg 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
   2181      1.1  mrg 				      * b[l + (j + 3) * b_dim1];
   2182      1.1  mrg 			    }
   2183      1.1  mrg 			  c[i + j * c_dim1] = f11;
   2184      1.1  mrg 			  c[i + 1 + j * c_dim1] = f21;
   2185      1.1  mrg 			  c[i + (j + 1) * c_dim1] = f12;
   2186      1.1  mrg 			  c[i + 1 + (j + 1) * c_dim1] = f22;
   2187      1.1  mrg 			  c[i + (j + 2) * c_dim1] = f13;
   2188      1.1  mrg 			  c[i + 1 + (j + 2) * c_dim1] = f23;
   2189      1.1  mrg 			  c[i + (j + 3) * c_dim1] = f14;
   2190      1.1  mrg 			  c[i + 1 + (j + 3) * c_dim1] = f24;
   2191      1.1  mrg 			  c[i + 2 + j * c_dim1] = f31;
   2192      1.1  mrg 			  c[i + 3 + j * c_dim1] = f41;
   2193      1.1  mrg 			  c[i + 2 + (j + 1) * c_dim1] = f32;
   2194      1.1  mrg 			  c[i + 3 + (j + 1) * c_dim1] = f42;
   2195      1.1  mrg 			  c[i + 2 + (j + 2) * c_dim1] = f33;
   2196      1.1  mrg 			  c[i + 3 + (j + 2) * c_dim1] = f43;
   2197      1.1  mrg 			  c[i + 2 + (j + 3) * c_dim1] = f34;
   2198      1.1  mrg 			  c[i + 3 + (j + 3) * c_dim1] = f44;
   2199      1.1  mrg 			}
   2200      1.1  mrg 		      if (uisec < isec)
   2201      1.1  mrg 			{
   2202      1.1  mrg 			  i5 = ii + isec - 1;
   2203      1.1  mrg 			  for (i = ii + uisec; i <= i5; ++i)
   2204      1.1  mrg 			    {
   2205      1.1  mrg 			      f11 = c[i + j * c_dim1];
   2206      1.1  mrg 			      f12 = c[i + (j + 1) * c_dim1];
   2207      1.1  mrg 			      f13 = c[i + (j + 2) * c_dim1];
   2208      1.1  mrg 			      f14 = c[i + (j + 3) * c_dim1];
   2209      1.1  mrg 			      i6 = ll + lsec - 1;
   2210      1.1  mrg 			      for (l = ll; l <= i6; ++l)
   2211      1.1  mrg 				{
   2212      1.1  mrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
   2213      1.1  mrg 					  257] * b[l + j * b_dim1];
   2214      1.1  mrg 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
   2215      1.1  mrg 					  257] * b[l + (j + 1) * b_dim1];
   2216      1.1  mrg 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
   2217      1.1  mrg 					  257] * b[l + (j + 2) * b_dim1];
   2218      1.1  mrg 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
   2219      1.1  mrg 					  257] * b[l + (j + 3) * b_dim1];
   2220      1.1  mrg 				}
   2221      1.1  mrg 			      c[i + j * c_dim1] = f11;
   2222      1.1  mrg 			      c[i + (j + 1) * c_dim1] = f12;
   2223      1.1  mrg 			      c[i + (j + 2) * c_dim1] = f13;
   2224      1.1  mrg 			      c[i + (j + 3) * c_dim1] = f14;
   2225      1.1  mrg 			    }
   2226      1.1  mrg 			}
   2227      1.1  mrg 		    }
   2228      1.1  mrg 		  if (ujsec < jsec)
   2229      1.1  mrg 		    {
   2230      1.1  mrg 		      i4 = jj + jsec - 1;
   2231      1.1  mrg 		      for (j = jj + ujsec; j <= i4; ++j)
   2232      1.1  mrg 			{
   2233      1.1  mrg 			  i5 = ii + uisec - 1;
   2234      1.1  mrg 			  for (i = ii; i <= i5; i += 4)
   2235      1.1  mrg 			    {
   2236      1.1  mrg 			      f11 = c[i + j * c_dim1];
   2237      1.1  mrg 			      f21 = c[i + 1 + j * c_dim1];
   2238      1.1  mrg 			      f31 = c[i + 2 + j * c_dim1];
   2239      1.1  mrg 			      f41 = c[i + 3 + j * c_dim1];
   2240      1.1  mrg 			      i6 = ll + lsec - 1;
   2241      1.1  mrg 			      for (l = ll; l <= i6; ++l)
   2242      1.1  mrg 				{
   2243      1.1  mrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
   2244      1.1  mrg 					  257] * b[l + j * b_dim1];
   2245      1.1  mrg 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
   2246      1.1  mrg 					  257] * b[l + j * b_dim1];
   2247      1.1  mrg 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
   2248      1.1  mrg 					  257] * b[l + j * b_dim1];
   2249      1.1  mrg 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
   2250      1.1  mrg 					  257] * b[l + j * b_dim1];
   2251      1.1  mrg 				}
   2252      1.1  mrg 			      c[i + j * c_dim1] = f11;
   2253      1.1  mrg 			      c[i + 1 + j * c_dim1] = f21;
   2254      1.1  mrg 			      c[i + 2 + j * c_dim1] = f31;
   2255      1.1  mrg 			      c[i + 3 + j * c_dim1] = f41;
   2256      1.1  mrg 			    }
   2257      1.1  mrg 			  i5 = ii + isec - 1;
   2258      1.1  mrg 			  for (i = ii + uisec; i <= i5; ++i)
   2259      1.1  mrg 			    {
   2260      1.1  mrg 			      f11 = c[i + j * c_dim1];
   2261      1.1  mrg 			      i6 = ll + lsec - 1;
   2262      1.1  mrg 			      for (l = ll; l <= i6; ++l)
   2263      1.1  mrg 				{
   2264      1.1  mrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
   2265      1.1  mrg 					  257] * b[l + j * b_dim1];
   2266      1.1  mrg 				}
   2267      1.1  mrg 			      c[i + j * c_dim1] = f11;
   2268      1.1  mrg 			    }
   2269      1.1  mrg 			}
   2270      1.1  mrg 		    }
   2271      1.1  mrg 		}
   2272      1.1  mrg 	    }
   2273      1.1  mrg 	}
   2274      1.1  mrg       free(t1);
   2275      1.1  mrg       return;
   2276      1.1  mrg     }
   2277      1.1  mrg   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
   2278      1.1  mrg     {
   2279      1.1  mrg       if (GFC_DESCRIPTOR_RANK (a) != 1)
   2280      1.1  mrg 	{
   2281      1.1  mrg 	  const GFC_COMPLEX_16 *restrict abase_x;
   2282      1.1  mrg 	  const GFC_COMPLEX_16 *restrict bbase_y;
   2283      1.1  mrg 	  GFC_COMPLEX_16 *restrict dest_y;
   2284      1.1  mrg 	  GFC_COMPLEX_16 s;
   2285      1.1  mrg 
   2286      1.1  mrg 	  for (y = 0; y < ycount; y++)
   2287      1.1  mrg 	    {
   2288      1.1  mrg 	      bbase_y = &bbase[y*bystride];
   2289      1.1  mrg 	      dest_y = &dest[y*rystride];
   2290      1.1  mrg 	      for (x = 0; x < xcount; x++)
   2291      1.1  mrg 		{
   2292      1.1  mrg 		  abase_x = &abase[x*axstride];
   2293      1.1  mrg 		  s = (GFC_COMPLEX_16) 0;
   2294      1.1  mrg 		  for (n = 0; n < count; n++)
   2295      1.1  mrg 		    s += abase_x[n] * bbase_y[n];
   2296      1.1  mrg 		  dest_y[x] = s;
   2297      1.1  mrg 		}
   2298      1.1  mrg 	    }
   2299      1.1  mrg 	}
   2300      1.1  mrg       else
   2301      1.1  mrg 	{
   2302      1.1  mrg 	  const GFC_COMPLEX_16 *restrict bbase_y;
   2303      1.1  mrg 	  GFC_COMPLEX_16 s;
   2304      1.1  mrg 
   2305      1.1  mrg 	  for (y = 0; y < ycount; y++)
   2306      1.1  mrg 	    {
   2307      1.1  mrg 	      bbase_y = &bbase[y*bystride];
   2308      1.1  mrg 	      s = (GFC_COMPLEX_16) 0;
   2309      1.1  mrg 	      for (n = 0; n < count; n++)
   2310      1.1  mrg 		s += abase[n*axstride] * bbase_y[n];
   2311      1.1  mrg 	      dest[y*rystride] = s;
   2312      1.1  mrg 	    }
   2313      1.1  mrg 	}
   2314      1.1  mrg     }
   2315      1.1  mrg   else if (GFC_DESCRIPTOR_RANK (a) == 1)
   2316      1.1  mrg     {
   2317      1.1  mrg       const GFC_COMPLEX_16 *restrict bbase_y;
   2318      1.1  mrg       GFC_COMPLEX_16 s;
   2319      1.1  mrg 
   2320      1.1  mrg       for (y = 0; y < ycount; y++)
   2321      1.1  mrg 	{
   2322      1.1  mrg 	  bbase_y = &bbase[y*bystride];
   2323      1.1  mrg 	  s = (GFC_COMPLEX_16) 0;
   2324      1.1  mrg 	  for (n = 0; n < count; n++)
   2325      1.1  mrg 	    s += abase[n*axstride] * bbase_y[n*bxstride];
   2326      1.1  mrg 	  dest[y*rxstride] = s;
   2327      1.1  mrg 	}
   2328      1.1  mrg     }
   2329  1.1.1.2  mrg   else if (axstride < aystride)
   2330  1.1.1.2  mrg     {
   2331  1.1.1.2  mrg       for (y = 0; y < ycount; y++)
   2332  1.1.1.2  mrg 	for (x = 0; x < xcount; x++)
   2333  1.1.1.2  mrg 	  dest[x*rxstride + y*rystride] = (GFC_COMPLEX_16)0;
   2334  1.1.1.2  mrg 
   2335  1.1.1.2  mrg       for (y = 0; y < ycount; y++)
   2336  1.1.1.2  mrg 	for (n = 0; n < count; n++)
   2337  1.1.1.2  mrg 	  for (x = 0; x < xcount; x++)
   2338  1.1.1.2  mrg 	    /* dest[x,y] += a[x,n] * b[n,y] */
   2339  1.1.1.2  mrg 	    dest[x*rxstride + y*rystride] +=
   2340  1.1.1.2  mrg 					abase[x*axstride + n*aystride] *
   2341  1.1.1.2  mrg 					bbase[n*bxstride + y*bystride];
   2342  1.1.1.2  mrg     }
   2343      1.1  mrg   else
   2344      1.1  mrg     {
   2345      1.1  mrg       const GFC_COMPLEX_16 *restrict abase_x;
   2346      1.1  mrg       const GFC_COMPLEX_16 *restrict bbase_y;
   2347      1.1  mrg       GFC_COMPLEX_16 *restrict dest_y;
   2348      1.1  mrg       GFC_COMPLEX_16 s;
   2349      1.1  mrg 
   2350      1.1  mrg       for (y = 0; y < ycount; y++)
   2351      1.1  mrg 	{
   2352      1.1  mrg 	  bbase_y = &bbase[y*bystride];
   2353      1.1  mrg 	  dest_y = &dest[y*rystride];
   2354      1.1  mrg 	  for (x = 0; x < xcount; x++)
   2355      1.1  mrg 	    {
   2356      1.1  mrg 	      abase_x = &abase[x*axstride];
   2357      1.1  mrg 	      s = (GFC_COMPLEX_16) 0;
   2358      1.1  mrg 	      for (n = 0; n < count; n++)
   2359      1.1  mrg 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
   2360      1.1  mrg 	      dest_y[x*rxstride] = s;
   2361      1.1  mrg 	    }
   2362      1.1  mrg 	}
   2363      1.1  mrg     }
   2364      1.1  mrg }
   2365      1.1  mrg #undef POW3
   2366      1.1  mrg #undef min
   2367      1.1  mrg #undef max
   2368      1.1  mrg 
   2369      1.1  mrg 
   2370      1.1  mrg /* Compiling main function, with selection code for the processor.  */
   2371      1.1  mrg 
   2372      1.1  mrg /* Currently, this is i386 only.  Adjust for other architectures.  */
   2373      1.1  mrg 
   2374      1.1  mrg void matmul_c16 (gfc_array_c16 * const restrict retarray,
   2375      1.1  mrg 	gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
   2376      1.1  mrg 	int blas_limit, blas_call gemm)
   2377      1.1  mrg {
   2378      1.1  mrg   static void (*matmul_p) (gfc_array_c16 * const restrict retarray,
   2379      1.1  mrg 	gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
   2380      1.1  mrg 	int blas_limit, blas_call gemm);
   2381      1.1  mrg 
   2382      1.1  mrg   void (*matmul_fn) (gfc_array_c16 * const restrict retarray,
   2383      1.1  mrg 	gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
   2384      1.1  mrg 	int blas_limit, blas_call gemm);
   2385      1.1  mrg 
   2386      1.1  mrg   matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
   2387      1.1  mrg   if (matmul_fn == NULL)
   2388      1.1  mrg     {
   2389      1.1  mrg       matmul_fn = matmul_c16_vanilla;
   2390  1.1.1.3  mrg       if (__builtin_cpu_is ("intel"))
   2391      1.1  mrg 	{
   2392      1.1  mrg           /* Run down the available processors in order of preference.  */
   2393      1.1  mrg #ifdef HAVE_AVX512F
   2394  1.1.1.3  mrg 	  if (__builtin_cpu_supports ("avx512f"))
   2395      1.1  mrg 	    {
   2396      1.1  mrg 	      matmul_fn = matmul_c16_avx512f;
   2397      1.1  mrg 	      goto store;
   2398      1.1  mrg 	    }
   2399      1.1  mrg 
   2400      1.1  mrg #endif  /* HAVE_AVX512F */
   2401      1.1  mrg 
   2402      1.1  mrg #ifdef HAVE_AVX2
   2403  1.1.1.3  mrg 	  if (__builtin_cpu_supports ("avx2")
   2404  1.1.1.3  mrg 	      && __builtin_cpu_supports ("fma"))
   2405      1.1  mrg 	    {
   2406      1.1  mrg 	      matmul_fn = matmul_c16_avx2;
   2407      1.1  mrg 	      goto store;
   2408      1.1  mrg 	    }
   2409      1.1  mrg 
   2410      1.1  mrg #endif
   2411      1.1  mrg 
   2412      1.1  mrg #ifdef HAVE_AVX
   2413  1.1.1.3  mrg 	  if (__builtin_cpu_supports ("avx"))
   2414      1.1  mrg  	    {
   2415      1.1  mrg               matmul_fn = matmul_c16_avx;
   2416      1.1  mrg 	      goto store;
   2417      1.1  mrg 	    }
   2418      1.1  mrg #endif  /* HAVE_AVX */
   2419      1.1  mrg         }
   2420  1.1.1.3  mrg     else if (__builtin_cpu_is ("amd"))
   2421      1.1  mrg       {
   2422      1.1  mrg #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
   2423  1.1.1.3  mrg 	if (__builtin_cpu_supports ("avx")
   2424  1.1.1.3  mrg 	    && __builtin_cpu_supports ("fma"))
   2425      1.1  mrg 	  {
   2426      1.1  mrg             matmul_fn = matmul_c16_avx128_fma3;
   2427      1.1  mrg 	    goto store;
   2428      1.1  mrg 	  }
   2429      1.1  mrg #endif
   2430      1.1  mrg #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
   2431  1.1.1.3  mrg 	if (__builtin_cpu_supports ("avx")
   2432  1.1.1.3  mrg 	    && __builtin_cpu_supports ("fma4"))
   2433      1.1  mrg 	  {
   2434      1.1  mrg             matmul_fn = matmul_c16_avx128_fma4;
   2435      1.1  mrg 	    goto store;
   2436      1.1  mrg 	  }
   2437      1.1  mrg #endif
   2438      1.1  mrg 
   2439      1.1  mrg       }
   2440      1.1  mrg    store:
   2441      1.1  mrg       __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
   2442      1.1  mrg    }
   2443      1.1  mrg 
   2444      1.1  mrg    (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
   2445      1.1  mrg }
   2446      1.1  mrg 
   2447      1.1  mrg #else  /* Just the vanilla function.  */
   2448      1.1  mrg 
   2449      1.1  mrg void
   2450      1.1  mrg matmul_c16 (gfc_array_c16 * const restrict retarray,
   2451      1.1  mrg 	gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
   2452      1.1  mrg 	int blas_limit, blas_call gemm)
   2453      1.1  mrg {
   2454      1.1  mrg   const GFC_COMPLEX_16 * restrict abase;
   2455      1.1  mrg   const GFC_COMPLEX_16 * restrict bbase;
   2456      1.1  mrg   GFC_COMPLEX_16 * restrict dest;
   2457      1.1  mrg 
   2458      1.1  mrg   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
   2459      1.1  mrg   index_type x, y, n, count, xcount, ycount;
   2460      1.1  mrg 
   2461      1.1  mrg   assert (GFC_DESCRIPTOR_RANK (a) == 2
   2462      1.1  mrg           || GFC_DESCRIPTOR_RANK (b) == 2);
   2463      1.1  mrg 
   2464      1.1  mrg /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
   2465      1.1  mrg 
   2466      1.1  mrg    Either A or B (but not both) can be rank 1:
   2467      1.1  mrg 
   2468      1.1  mrg    o One-dimensional argument A is implicitly treated as a row matrix
   2469      1.1  mrg      dimensioned [1,count], so xcount=1.
   2470      1.1  mrg 
   2471      1.1  mrg    o One-dimensional argument B is implicitly treated as a column matrix
   2472      1.1  mrg      dimensioned [count, 1], so ycount=1.
   2473      1.1  mrg */
   2474      1.1  mrg 
   2475      1.1  mrg   if (retarray->base_addr == NULL)
   2476      1.1  mrg     {
   2477      1.1  mrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
   2478      1.1  mrg         {
   2479      1.1  mrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
   2480      1.1  mrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
   2481      1.1  mrg         }
   2482      1.1  mrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
   2483      1.1  mrg         {
   2484      1.1  mrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
   2485      1.1  mrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
   2486      1.1  mrg         }
   2487      1.1  mrg       else
   2488      1.1  mrg         {
   2489      1.1  mrg 	  GFC_DIMENSION_SET(retarray->dim[0], 0,
   2490      1.1  mrg 	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
   2491      1.1  mrg 
   2492      1.1  mrg           GFC_DIMENSION_SET(retarray->dim[1], 0,
   2493      1.1  mrg 	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
   2494      1.1  mrg 			    GFC_DESCRIPTOR_EXTENT(retarray,0));
   2495      1.1  mrg         }
   2496      1.1  mrg 
   2497      1.1  mrg       retarray->base_addr
   2498      1.1  mrg 	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_16));
   2499      1.1  mrg       retarray->offset = 0;
   2500      1.1  mrg     }
   2501      1.1  mrg   else if (unlikely (compile_options.bounds_check))
   2502      1.1  mrg     {
   2503      1.1  mrg       index_type ret_extent, arg_extent;
   2504      1.1  mrg 
   2505      1.1  mrg       if (GFC_DESCRIPTOR_RANK (a) == 1)
   2506      1.1  mrg 	{
   2507      1.1  mrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
   2508      1.1  mrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
   2509      1.1  mrg 	  if (arg_extent != ret_extent)
   2510      1.1  mrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
   2511      1.1  mrg 	    		   "array (%ld/%ld) ",
   2512      1.1  mrg 			   (long int) ret_extent, (long int) arg_extent);
   2513      1.1  mrg 	}
   2514      1.1  mrg       else if (GFC_DESCRIPTOR_RANK (b) == 1)
   2515      1.1  mrg 	{
   2516      1.1  mrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
   2517      1.1  mrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
   2518      1.1  mrg 	  if (arg_extent != ret_extent)
   2519      1.1  mrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
   2520      1.1  mrg 	    		   "array (%ld/%ld) ",
   2521      1.1  mrg 			   (long int) ret_extent, (long int) arg_extent);
   2522      1.1  mrg 	}
   2523      1.1  mrg       else
   2524      1.1  mrg 	{
   2525      1.1  mrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
   2526      1.1  mrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
   2527      1.1  mrg 	  if (arg_extent != ret_extent)
   2528      1.1  mrg 	    runtime_error ("Array bound mismatch for dimension 1 of "
   2529      1.1  mrg 	    		   "array (%ld/%ld) ",
   2530      1.1  mrg 			   (long int) ret_extent, (long int) arg_extent);
   2531      1.1  mrg 
   2532      1.1  mrg 	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
   2533      1.1  mrg 	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
   2534      1.1  mrg 	  if (arg_extent != ret_extent)
   2535      1.1  mrg 	    runtime_error ("Array bound mismatch for dimension 2 of "
   2536      1.1  mrg 	    		   "array (%ld/%ld) ",
   2537      1.1  mrg 			   (long int) ret_extent, (long int) arg_extent);
   2538      1.1  mrg 	}
   2539      1.1  mrg     }
   2540      1.1  mrg 
   2541      1.1  mrg 
   2542      1.1  mrg   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
   2543      1.1  mrg     {
   2544      1.1  mrg       /* One-dimensional result may be addressed in the code below
   2545      1.1  mrg 	 either as a row or a column matrix. We want both cases to
   2546      1.1  mrg 	 work. */
   2547      1.1  mrg       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
   2548      1.1  mrg     }
   2549      1.1  mrg   else
   2550      1.1  mrg     {
   2551      1.1  mrg       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
   2552      1.1  mrg       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
   2553      1.1  mrg     }
   2554      1.1  mrg 
   2555      1.1  mrg 
   2556      1.1  mrg   if (GFC_DESCRIPTOR_RANK (a) == 1)
   2557      1.1  mrg     {
   2558      1.1  mrg       /* Treat it as a a row matrix A[1,count]. */
   2559      1.1  mrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
   2560      1.1  mrg       aystride = 1;
   2561      1.1  mrg 
   2562      1.1  mrg       xcount = 1;
   2563      1.1  mrg       count = GFC_DESCRIPTOR_EXTENT(a,0);
   2564      1.1  mrg     }
   2565      1.1  mrg   else
   2566      1.1  mrg     {
   2567      1.1  mrg       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
   2568      1.1  mrg       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
   2569      1.1  mrg 
   2570      1.1  mrg       count = GFC_DESCRIPTOR_EXTENT(a,1);
   2571      1.1  mrg       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
   2572      1.1  mrg     }
   2573      1.1  mrg 
   2574      1.1  mrg   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
   2575      1.1  mrg     {
   2576      1.1  mrg       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
   2577      1.1  mrg 	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
   2578      1.1  mrg 		       "in dimension 1: is %ld, should be %ld",
   2579      1.1  mrg 		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
   2580      1.1  mrg     }
   2581      1.1  mrg 
   2582      1.1  mrg   if (GFC_DESCRIPTOR_RANK (b) == 1)
   2583      1.1  mrg     {
   2584      1.1  mrg       /* Treat it as a column matrix B[count,1] */
   2585      1.1  mrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
   2586      1.1  mrg 
   2587      1.1  mrg       /* bystride should never be used for 1-dimensional b.
   2588      1.1  mrg          The value is only used for calculation of the
   2589      1.1  mrg          memory by the buffer.  */
   2590      1.1  mrg       bystride = 256;
   2591      1.1  mrg       ycount = 1;
   2592      1.1  mrg     }
   2593      1.1  mrg   else
   2594      1.1  mrg     {
   2595      1.1  mrg       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
   2596      1.1  mrg       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
   2597      1.1  mrg       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
   2598      1.1  mrg     }
   2599      1.1  mrg 
   2600      1.1  mrg   abase = a->base_addr;
   2601      1.1  mrg   bbase = b->base_addr;
   2602      1.1  mrg   dest = retarray->base_addr;
   2603      1.1  mrg 
   2604      1.1  mrg   /* Now that everything is set up, we perform the multiplication
   2605      1.1  mrg      itself.  */
   2606      1.1  mrg 
   2607      1.1  mrg #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
   2608      1.1  mrg #define min(a,b) ((a) <= (b) ? (a) : (b))
   2609      1.1  mrg #define max(a,b) ((a) >= (b) ? (a) : (b))
   2610      1.1  mrg 
   2611      1.1  mrg   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
   2612      1.1  mrg       && (bxstride == 1 || bystride == 1)
   2613      1.1  mrg       && (((float) xcount) * ((float) ycount) * ((float) count)
   2614      1.1  mrg           > POW3(blas_limit)))
   2615      1.1  mrg     {
   2616      1.1  mrg       const int m = xcount, n = ycount, k = count, ldc = rystride;
   2617      1.1  mrg       const GFC_COMPLEX_16 one = 1, zero = 0;
   2618      1.1  mrg       const int lda = (axstride == 1) ? aystride : axstride,
   2619      1.1  mrg 		ldb = (bxstride == 1) ? bystride : bxstride;
   2620      1.1  mrg 
   2621      1.1  mrg       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
   2622      1.1  mrg 	{
   2623      1.1  mrg 	  assert (gemm != NULL);
   2624      1.1  mrg 	  const char *transa, *transb;
   2625      1.1  mrg 	  if (try_blas & 2)
   2626      1.1  mrg 	    transa = "C";
   2627      1.1  mrg 	  else
   2628      1.1  mrg 	    transa = axstride == 1 ? "N" : "T";
   2629      1.1  mrg 
   2630      1.1  mrg 	  if (try_blas & 4)
   2631      1.1  mrg 	    transb = "C";
   2632      1.1  mrg 	  else
   2633      1.1  mrg 	    transb = bxstride == 1 ? "N" : "T";
   2634      1.1  mrg 
   2635      1.1  mrg 	  gemm (transa, transb , &m,
   2636      1.1  mrg 		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
   2637      1.1  mrg 		&ldc, 1, 1);
   2638      1.1  mrg 	  return;
   2639      1.1  mrg 	}
   2640      1.1  mrg     }
   2641      1.1  mrg 
   2642  1.1.1.2  mrg   if (rxstride == 1 && axstride == 1 && bxstride == 1
   2643  1.1.1.2  mrg       && GFC_DESCRIPTOR_RANK (b) != 1)
   2644      1.1  mrg     {
   2645      1.1  mrg       /* This block of code implements a tuned matmul, derived from
   2646      1.1  mrg          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
   2647      1.1  mrg 
   2648      1.1  mrg                Bo Kagstrom and Per Ling
   2649      1.1  mrg                Department of Computing Science
   2650      1.1  mrg                Umea University
   2651      1.1  mrg                S-901 87 Umea, Sweden
   2652      1.1  mrg 
   2653      1.1  mrg 	 from netlib.org, translated to C, and modified for matmul.m4.  */
   2654      1.1  mrg 
   2655      1.1  mrg       const GFC_COMPLEX_16 *a, *b;
   2656      1.1  mrg       GFC_COMPLEX_16 *c;
   2657      1.1  mrg       const index_type m = xcount, n = ycount, k = count;
   2658      1.1  mrg 
   2659      1.1  mrg       /* System generated locals */
   2660      1.1  mrg       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
   2661      1.1  mrg 		 i1, i2, i3, i4, i5, i6;
   2662      1.1  mrg 
   2663      1.1  mrg       /* Local variables */
   2664      1.1  mrg       GFC_COMPLEX_16 f11, f12, f21, f22, f31, f32, f41, f42,
   2665      1.1  mrg 		 f13, f14, f23, f24, f33, f34, f43, f44;
   2666      1.1  mrg       index_type i, j, l, ii, jj, ll;
   2667      1.1  mrg       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
   2668      1.1  mrg       GFC_COMPLEX_16 *t1;
   2669      1.1  mrg 
   2670      1.1  mrg       a = abase;
   2671      1.1  mrg       b = bbase;
   2672      1.1  mrg       c = retarray->base_addr;
   2673      1.1  mrg 
   2674      1.1  mrg       /* Parameter adjustments */
   2675      1.1  mrg       c_dim1 = rystride;
   2676      1.1  mrg       c_offset = 1 + c_dim1;
   2677      1.1  mrg       c -= c_offset;
   2678      1.1  mrg       a_dim1 = aystride;
   2679      1.1  mrg       a_offset = 1 + a_dim1;
   2680      1.1  mrg       a -= a_offset;
   2681      1.1  mrg       b_dim1 = bystride;
   2682      1.1  mrg       b_offset = 1 + b_dim1;
   2683      1.1  mrg       b -= b_offset;
   2684      1.1  mrg 
   2685      1.1  mrg       /* Empty c first.  */
   2686      1.1  mrg       for (j=1; j<=n; j++)
   2687      1.1  mrg 	for (i=1; i<=m; i++)
   2688      1.1  mrg 	  c[i + j * c_dim1] = (GFC_COMPLEX_16)0;
   2689      1.1  mrg 
   2690      1.1  mrg       /* Early exit if possible */
   2691      1.1  mrg       if (m == 0 || n == 0 || k == 0)
   2692      1.1  mrg 	return;
   2693      1.1  mrg 
   2694      1.1  mrg       /* Adjust size of t1 to what is needed.  */
   2695      1.1  mrg       index_type t1_dim, a_sz;
   2696      1.1  mrg       if (aystride == 1)
   2697      1.1  mrg         a_sz = rystride;
   2698      1.1  mrg       else
   2699      1.1  mrg         a_sz = a_dim1;
   2700      1.1  mrg 
   2701      1.1  mrg       t1_dim = a_sz * 256 + b_dim1;
   2702      1.1  mrg       if (t1_dim > 65536)
   2703      1.1  mrg 	t1_dim = 65536;
   2704      1.1  mrg 
   2705      1.1  mrg       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_16));
   2706      1.1  mrg 
   2707      1.1  mrg       /* Start turning the crank. */
   2708      1.1  mrg       i1 = n;
   2709      1.1  mrg       for (jj = 1; jj <= i1; jj += 512)
   2710      1.1  mrg 	{
   2711      1.1  mrg 	  /* Computing MIN */
   2712      1.1  mrg 	  i2 = 512;
   2713      1.1  mrg 	  i3 = n - jj + 1;
   2714      1.1  mrg 	  jsec = min(i2,i3);
   2715      1.1  mrg 	  ujsec = jsec - jsec % 4;
   2716      1.1  mrg 	  i2 = k;
   2717      1.1  mrg 	  for (ll = 1; ll <= i2; ll += 256)
   2718      1.1  mrg 	    {
   2719      1.1  mrg 	      /* Computing MIN */
   2720      1.1  mrg 	      i3 = 256;
   2721      1.1  mrg 	      i4 = k - ll + 1;
   2722      1.1  mrg 	      lsec = min(i3,i4);
   2723      1.1  mrg 	      ulsec = lsec - lsec % 2;
   2724      1.1  mrg 
   2725      1.1  mrg 	      i3 = m;
   2726      1.1  mrg 	      for (ii = 1; ii <= i3; ii += 256)
   2727      1.1  mrg 		{
   2728      1.1  mrg 		  /* Computing MIN */
   2729      1.1  mrg 		  i4 = 256;
   2730      1.1  mrg 		  i5 = m - ii + 1;
   2731      1.1  mrg 		  isec = min(i4,i5);
   2732      1.1  mrg 		  uisec = isec - isec % 2;
   2733      1.1  mrg 		  i4 = ll + ulsec - 1;
   2734      1.1  mrg 		  for (l = ll; l <= i4; l += 2)
   2735      1.1  mrg 		    {
   2736      1.1  mrg 		      i5 = ii + uisec - 1;
   2737      1.1  mrg 		      for (i = ii; i <= i5; i += 2)
   2738      1.1  mrg 			{
   2739      1.1  mrg 			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
   2740      1.1  mrg 					a[i + l * a_dim1];
   2741      1.1  mrg 			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
   2742      1.1  mrg 					a[i + (l + 1) * a_dim1];
   2743      1.1  mrg 			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
   2744      1.1  mrg 					a[i + 1 + l * a_dim1];
   2745      1.1  mrg 			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
   2746      1.1  mrg 					a[i + 1 + (l + 1) * a_dim1];
   2747      1.1  mrg 			}
   2748      1.1  mrg 		      if (uisec < isec)
   2749      1.1  mrg 			{
   2750      1.1  mrg 			  t1[l - ll + 1 + (isec << 8) - 257] =
   2751      1.1  mrg 				    a[ii + isec - 1 + l * a_dim1];
   2752      1.1  mrg 			  t1[l - ll + 2 + (isec << 8) - 257] =
   2753      1.1  mrg 				    a[ii + isec - 1 + (l + 1) * a_dim1];
   2754      1.1  mrg 			}
   2755      1.1  mrg 		    }
   2756      1.1  mrg 		  if (ulsec < lsec)
   2757      1.1  mrg 		    {
   2758      1.1  mrg 		      i4 = ii + isec - 1;
   2759      1.1  mrg 		      for (i = ii; i<= i4; ++i)
   2760      1.1  mrg 			{
   2761      1.1  mrg 			  t1[lsec + ((i - ii + 1) << 8) - 257] =
   2762      1.1  mrg 				    a[i + (ll + lsec - 1) * a_dim1];
   2763      1.1  mrg 			}
   2764      1.1  mrg 		    }
   2765      1.1  mrg 
   2766      1.1  mrg 		  uisec = isec - isec % 4;
   2767      1.1  mrg 		  i4 = jj + ujsec - 1;
   2768      1.1  mrg 		  for (j = jj; j <= i4; j += 4)
   2769      1.1  mrg 		    {
   2770      1.1  mrg 		      i5 = ii + uisec - 1;
   2771      1.1  mrg 		      for (i = ii; i <= i5; i += 4)
   2772      1.1  mrg 			{
   2773      1.1  mrg 			  f11 = c[i + j * c_dim1];
   2774      1.1  mrg 			  f21 = c[i + 1 + j * c_dim1];
   2775      1.1  mrg 			  f12 = c[i + (j + 1) * c_dim1];
   2776      1.1  mrg 			  f22 = c[i + 1 + (j + 1) * c_dim1];
   2777      1.1  mrg 			  f13 = c[i + (j + 2) * c_dim1];
   2778      1.1  mrg 			  f23 = c[i + 1 + (j + 2) * c_dim1];
   2779      1.1  mrg 			  f14 = c[i + (j + 3) * c_dim1];
   2780      1.1  mrg 			  f24 = c[i + 1 + (j + 3) * c_dim1];
   2781      1.1  mrg 			  f31 = c[i + 2 + j * c_dim1];
   2782      1.1  mrg 			  f41 = c[i + 3 + j * c_dim1];
   2783      1.1  mrg 			  f32 = c[i + 2 + (j + 1) * c_dim1];
   2784      1.1  mrg 			  f42 = c[i + 3 + (j + 1) * c_dim1];
   2785      1.1  mrg 			  f33 = c[i + 2 + (j + 2) * c_dim1];
   2786      1.1  mrg 			  f43 = c[i + 3 + (j + 2) * c_dim1];
   2787      1.1  mrg 			  f34 = c[i + 2 + (j + 3) * c_dim1];
   2788      1.1  mrg 			  f44 = c[i + 3 + (j + 3) * c_dim1];
   2789      1.1  mrg 			  i6 = ll + lsec - 1;
   2790      1.1  mrg 			  for (l = ll; l <= i6; ++l)
   2791      1.1  mrg 			    {
   2792      1.1  mrg 			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
   2793      1.1  mrg 				      * b[l + j * b_dim1];
   2794      1.1  mrg 			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
   2795      1.1  mrg 				      * b[l + j * b_dim1];
   2796      1.1  mrg 			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
   2797      1.1  mrg 				      * b[l + (j + 1) * b_dim1];
   2798      1.1  mrg 			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
   2799      1.1  mrg 				      * b[l + (j + 1) * b_dim1];
   2800      1.1  mrg 			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
   2801      1.1  mrg 				      * b[l + (j + 2) * b_dim1];
   2802      1.1  mrg 			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
   2803      1.1  mrg 				      * b[l + (j + 2) * b_dim1];
   2804      1.1  mrg 			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
   2805      1.1  mrg 				      * b[l + (j + 3) * b_dim1];
   2806      1.1  mrg 			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
   2807      1.1  mrg 				      * b[l + (j + 3) * b_dim1];
   2808      1.1  mrg 			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
   2809      1.1  mrg 				      * b[l + j * b_dim1];
   2810      1.1  mrg 			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
   2811      1.1  mrg 				      * b[l + j * b_dim1];
   2812      1.1  mrg 			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
   2813      1.1  mrg 				      * b[l + (j + 1) * b_dim1];
   2814      1.1  mrg 			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
   2815      1.1  mrg 				      * b[l + (j + 1) * b_dim1];
   2816      1.1  mrg 			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
   2817      1.1  mrg 				      * b[l + (j + 2) * b_dim1];
   2818      1.1  mrg 			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
   2819      1.1  mrg 				      * b[l + (j + 2) * b_dim1];
   2820      1.1  mrg 			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
   2821      1.1  mrg 				      * b[l + (j + 3) * b_dim1];
   2822      1.1  mrg 			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
   2823      1.1  mrg 				      * b[l + (j + 3) * b_dim1];
   2824      1.1  mrg 			    }
   2825      1.1  mrg 			  c[i + j * c_dim1] = f11;
   2826      1.1  mrg 			  c[i + 1 + j * c_dim1] = f21;
   2827      1.1  mrg 			  c[i + (j + 1) * c_dim1] = f12;
   2828      1.1  mrg 			  c[i + 1 + (j + 1) * c_dim1] = f22;
   2829      1.1  mrg 			  c[i + (j + 2) * c_dim1] = f13;
   2830      1.1  mrg 			  c[i + 1 + (j + 2) * c_dim1] = f23;
   2831      1.1  mrg 			  c[i + (j + 3) * c_dim1] = f14;
   2832      1.1  mrg 			  c[i + 1 + (j + 3) * c_dim1] = f24;
   2833      1.1  mrg 			  c[i + 2 + j * c_dim1] = f31;
   2834      1.1  mrg 			  c[i + 3 + j * c_dim1] = f41;
   2835      1.1  mrg 			  c[i + 2 + (j + 1) * c_dim1] = f32;
   2836      1.1  mrg 			  c[i + 3 + (j + 1) * c_dim1] = f42;
   2837      1.1  mrg 			  c[i + 2 + (j + 2) * c_dim1] = f33;
   2838      1.1  mrg 			  c[i + 3 + (j + 2) * c_dim1] = f43;
   2839      1.1  mrg 			  c[i + 2 + (j + 3) * c_dim1] = f34;
   2840      1.1  mrg 			  c[i + 3 + (j + 3) * c_dim1] = f44;
   2841      1.1  mrg 			}
   2842      1.1  mrg 		      if (uisec < isec)
   2843      1.1  mrg 			{
   2844      1.1  mrg 			  i5 = ii + isec - 1;
   2845      1.1  mrg 			  for (i = ii + uisec; i <= i5; ++i)
   2846      1.1  mrg 			    {
   2847      1.1  mrg 			      f11 = c[i + j * c_dim1];
   2848      1.1  mrg 			      f12 = c[i + (j + 1) * c_dim1];
   2849      1.1  mrg 			      f13 = c[i + (j + 2) * c_dim1];
   2850      1.1  mrg 			      f14 = c[i + (j + 3) * c_dim1];
   2851      1.1  mrg 			      i6 = ll + lsec - 1;
   2852      1.1  mrg 			      for (l = ll; l <= i6; ++l)
   2853      1.1  mrg 				{
   2854      1.1  mrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
   2855      1.1  mrg 					  257] * b[l + j * b_dim1];
   2856      1.1  mrg 				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
   2857      1.1  mrg 					  257] * b[l + (j + 1) * b_dim1];
   2858      1.1  mrg 				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
   2859      1.1  mrg 					  257] * b[l + (j + 2) * b_dim1];
   2860      1.1  mrg 				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
   2861      1.1  mrg 					  257] * b[l + (j + 3) * b_dim1];
   2862      1.1  mrg 				}
   2863      1.1  mrg 			      c[i + j * c_dim1] = f11;
   2864      1.1  mrg 			      c[i + (j + 1) * c_dim1] = f12;
   2865      1.1  mrg 			      c[i + (j + 2) * c_dim1] = f13;
   2866      1.1  mrg 			      c[i + (j + 3) * c_dim1] = f14;
   2867      1.1  mrg 			    }
   2868      1.1  mrg 			}
   2869      1.1  mrg 		    }
   2870      1.1  mrg 		  if (ujsec < jsec)
   2871      1.1  mrg 		    {
   2872      1.1  mrg 		      i4 = jj + jsec - 1;
   2873      1.1  mrg 		      for (j = jj + ujsec; j <= i4; ++j)
   2874      1.1  mrg 			{
   2875      1.1  mrg 			  i5 = ii + uisec - 1;
   2876      1.1  mrg 			  for (i = ii; i <= i5; i += 4)
   2877      1.1  mrg 			    {
   2878      1.1  mrg 			      f11 = c[i + j * c_dim1];
   2879      1.1  mrg 			      f21 = c[i + 1 + j * c_dim1];
   2880      1.1  mrg 			      f31 = c[i + 2 + j * c_dim1];
   2881      1.1  mrg 			      f41 = c[i + 3 + j * c_dim1];
   2882      1.1  mrg 			      i6 = ll + lsec - 1;
   2883      1.1  mrg 			      for (l = ll; l <= i6; ++l)
   2884      1.1  mrg 				{
   2885      1.1  mrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
   2886      1.1  mrg 					  257] * b[l + j * b_dim1];
   2887      1.1  mrg 				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
   2888      1.1  mrg 					  257] * b[l + j * b_dim1];
   2889      1.1  mrg 				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
   2890      1.1  mrg 					  257] * b[l + j * b_dim1];
   2891      1.1  mrg 				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
   2892      1.1  mrg 					  257] * b[l + j * b_dim1];
   2893      1.1  mrg 				}
   2894      1.1  mrg 			      c[i + j * c_dim1] = f11;
   2895      1.1  mrg 			      c[i + 1 + j * c_dim1] = f21;
   2896      1.1  mrg 			      c[i + 2 + j * c_dim1] = f31;
   2897      1.1  mrg 			      c[i + 3 + j * c_dim1] = f41;
   2898      1.1  mrg 			    }
   2899      1.1  mrg 			  i5 = ii + isec - 1;
   2900      1.1  mrg 			  for (i = ii + uisec; i <= i5; ++i)
   2901      1.1  mrg 			    {
   2902      1.1  mrg 			      f11 = c[i + j * c_dim1];
   2903      1.1  mrg 			      i6 = ll + lsec - 1;
   2904      1.1  mrg 			      for (l = ll; l <= i6; ++l)
   2905      1.1  mrg 				{
   2906      1.1  mrg 				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
   2907      1.1  mrg 					  257] * b[l + j * b_dim1];
   2908      1.1  mrg 				}
   2909      1.1  mrg 			      c[i + j * c_dim1] = f11;
   2910      1.1  mrg 			    }
   2911      1.1  mrg 			}
   2912      1.1  mrg 		    }
   2913      1.1  mrg 		}
   2914      1.1  mrg 	    }
   2915      1.1  mrg 	}
   2916      1.1  mrg       free(t1);
   2917      1.1  mrg       return;
   2918      1.1  mrg     }
   2919      1.1  mrg   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
   2920      1.1  mrg     {
   2921      1.1  mrg       if (GFC_DESCRIPTOR_RANK (a) != 1)
   2922      1.1  mrg 	{
   2923      1.1  mrg 	  const GFC_COMPLEX_16 *restrict abase_x;
   2924      1.1  mrg 	  const GFC_COMPLEX_16 *restrict bbase_y;
   2925      1.1  mrg 	  GFC_COMPLEX_16 *restrict dest_y;
   2926      1.1  mrg 	  GFC_COMPLEX_16 s;
   2927      1.1  mrg 
   2928      1.1  mrg 	  for (y = 0; y < ycount; y++)
   2929      1.1  mrg 	    {
   2930      1.1  mrg 	      bbase_y = &bbase[y*bystride];
   2931      1.1  mrg 	      dest_y = &dest[y*rystride];
   2932      1.1  mrg 	      for (x = 0; x < xcount; x++)
   2933      1.1  mrg 		{
   2934      1.1  mrg 		  abase_x = &abase[x*axstride];
   2935      1.1  mrg 		  s = (GFC_COMPLEX_16) 0;
   2936      1.1  mrg 		  for (n = 0; n < count; n++)
   2937      1.1  mrg 		    s += abase_x[n] * bbase_y[n];
   2938      1.1  mrg 		  dest_y[x] = s;
   2939      1.1  mrg 		}
   2940      1.1  mrg 	    }
   2941      1.1  mrg 	}
   2942      1.1  mrg       else
   2943      1.1  mrg 	{
   2944      1.1  mrg 	  const GFC_COMPLEX_16 *restrict bbase_y;
   2945      1.1  mrg 	  GFC_COMPLEX_16 s;
   2946      1.1  mrg 
   2947      1.1  mrg 	  for (y = 0; y < ycount; y++)
   2948      1.1  mrg 	    {
   2949      1.1  mrg 	      bbase_y = &bbase[y*bystride];
   2950      1.1  mrg 	      s = (GFC_COMPLEX_16) 0;
   2951      1.1  mrg 	      for (n = 0; n < count; n++)
   2952      1.1  mrg 		s += abase[n*axstride] * bbase_y[n];
   2953      1.1  mrg 	      dest[y*rystride] = s;
   2954      1.1  mrg 	    }
   2955      1.1  mrg 	}
   2956      1.1  mrg     }
   2957      1.1  mrg   else if (GFC_DESCRIPTOR_RANK (a) == 1)
   2958      1.1  mrg     {
   2959      1.1  mrg       const GFC_COMPLEX_16 *restrict bbase_y;
   2960      1.1  mrg       GFC_COMPLEX_16 s;
   2961      1.1  mrg 
   2962      1.1  mrg       for (y = 0; y < ycount; y++)
   2963      1.1  mrg 	{
   2964      1.1  mrg 	  bbase_y = &bbase[y*bystride];
   2965      1.1  mrg 	  s = (GFC_COMPLEX_16) 0;
   2966      1.1  mrg 	  for (n = 0; n < count; n++)
   2967      1.1  mrg 	    s += abase[n*axstride] * bbase_y[n*bxstride];
   2968      1.1  mrg 	  dest[y*rxstride] = s;
   2969      1.1  mrg 	}
   2970      1.1  mrg     }
   2971  1.1.1.2  mrg   else if (axstride < aystride)
   2972  1.1.1.2  mrg     {
   2973  1.1.1.2  mrg       for (y = 0; y < ycount; y++)
   2974  1.1.1.2  mrg 	for (x = 0; x < xcount; x++)
   2975  1.1.1.2  mrg 	  dest[x*rxstride + y*rystride] = (GFC_COMPLEX_16)0;
   2976  1.1.1.2  mrg 
   2977  1.1.1.2  mrg       for (y = 0; y < ycount; y++)
   2978  1.1.1.2  mrg 	for (n = 0; n < count; n++)
   2979  1.1.1.2  mrg 	  for (x = 0; x < xcount; x++)
   2980  1.1.1.2  mrg 	    /* dest[x,y] += a[x,n] * b[n,y] */
   2981  1.1.1.2  mrg 	    dest[x*rxstride + y*rystride] +=
   2982  1.1.1.2  mrg 					abase[x*axstride + n*aystride] *
   2983  1.1.1.2  mrg 					bbase[n*bxstride + y*bystride];
   2984  1.1.1.2  mrg     }
   2985      1.1  mrg   else
   2986      1.1  mrg     {
   2987      1.1  mrg       const GFC_COMPLEX_16 *restrict abase_x;
   2988      1.1  mrg       const GFC_COMPLEX_16 *restrict bbase_y;
   2989      1.1  mrg       GFC_COMPLEX_16 *restrict dest_y;
   2990      1.1  mrg       GFC_COMPLEX_16 s;
   2991      1.1  mrg 
   2992      1.1  mrg       for (y = 0; y < ycount; y++)
   2993      1.1  mrg 	{
   2994      1.1  mrg 	  bbase_y = &bbase[y*bystride];
   2995      1.1  mrg 	  dest_y = &dest[y*rystride];
   2996      1.1  mrg 	  for (x = 0; x < xcount; x++)
   2997      1.1  mrg 	    {
   2998      1.1  mrg 	      abase_x = &abase[x*axstride];
   2999      1.1  mrg 	      s = (GFC_COMPLEX_16) 0;
   3000      1.1  mrg 	      for (n = 0; n < count; n++)
   3001      1.1  mrg 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
   3002      1.1  mrg 	      dest_y[x*rxstride] = s;
   3003      1.1  mrg 	    }
   3004      1.1  mrg 	}
   3005      1.1  mrg     }
   3006      1.1  mrg }
   3007      1.1  mrg #undef POW3
   3008      1.1  mrg #undef min
   3009      1.1  mrg #undef max
   3010      1.1  mrg 
   3011      1.1  mrg #endif
   3012      1.1  mrg #endif
   3013      1.1  mrg 
   3014