Home | History | Annotate | Line # | Download | only in ieee
      1      1.1  mrg /* Cray PVP/IEEE mpn_submul_1 -- multiply a limb vector with a limb and
      2      1.1  mrg    subtract the result from a second limb vector.
      3      1.1  mrg 
      4  1.1.1.2  mrg Copyright 2000-2002 Free Software Foundation, Inc.
      5      1.1  mrg 
      6      1.1  mrg This file is part of the GNU MP Library.
      7      1.1  mrg 
      8      1.1  mrg The GNU MP Library is free software; you can redistribute it and/or modify
      9  1.1.1.2  mrg it under the terms of either:
     10  1.1.1.2  mrg 
     11  1.1.1.2  mrg   * the GNU Lesser General Public License as published by the Free
     12  1.1.1.2  mrg     Software Foundation; either version 3 of the License, or (at your
     13  1.1.1.2  mrg     option) any later version.
     14  1.1.1.2  mrg 
     15  1.1.1.2  mrg or
     16  1.1.1.2  mrg 
     17  1.1.1.2  mrg   * the GNU General Public License as published by the Free Software
     18  1.1.1.2  mrg     Foundation; either version 2 of the License, or (at your option) any
     19  1.1.1.2  mrg     later version.
     20  1.1.1.2  mrg 
     21  1.1.1.2  mrg or both in parallel, as here.
     22      1.1  mrg 
     23      1.1  mrg The GNU MP Library is distributed in the hope that it will be useful, but
     24      1.1  mrg WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     25  1.1.1.2  mrg or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     26  1.1.1.2  mrg for more details.
     27      1.1  mrg 
     28  1.1.1.2  mrg You should have received copies of the GNU General Public License and the
     29  1.1.1.2  mrg GNU Lesser General Public License along with the GNU MP Library.  If not,
     30  1.1.1.2  mrg see https://www.gnu.org/licenses/.  */
     31      1.1  mrg 
     32      1.1  mrg /* This code runs at just under 9 cycles/limb on a T90.  That is not perfect,
     33      1.1  mrg    mainly due to vector register shortage in the main loop.  Assembly code
     34      1.1  mrg    should bring it down to perhaps 7 cycles/limb.  */
     35      1.1  mrg 
     36      1.1  mrg #include <intrinsics.h>
     37      1.1  mrg #include "gmp-impl.h"
     38      1.1  mrg 
     39      1.1  mrg mp_limb_t
     40      1.1  mrg mpn_submul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t vl)
     41      1.1  mrg {
     42      1.1  mrg   mp_limb_t cy[n];
     43      1.1  mrg   mp_limb_t a, b, r, s0, s1, c0, c1;
     44      1.1  mrg   mp_size_t i;
     45      1.1  mrg   int more_carries;
     46      1.1  mrg 
     47      1.1  mrg   if (up == rp)
     48      1.1  mrg     {
     49      1.1  mrg       /* The algorithm used below cannot handle overlap.  Handle it here by
     50      1.1  mrg 	 making a temporary copy of the source vector, then call ourselves.  */
     51      1.1  mrg       mp_limb_t xp[n];
     52      1.1  mrg       MPN_COPY (xp, up, n);
     53      1.1  mrg       return mpn_submul_1 (rp, xp, n, vl);
     54      1.1  mrg     }
     55      1.1  mrg 
     56      1.1  mrg   a = up[0] * vl;
     57      1.1  mrg   r = rp[0];
     58      1.1  mrg   s0 = r - a;
     59      1.1  mrg   rp[0] = s0;
     60      1.1  mrg   c1 = ((s0 & a) | ((s0 | a) & ~r)) >> 63;
     61      1.1  mrg   cy[0] = c1;
     62      1.1  mrg 
     63      1.1  mrg   /* Main multiply loop.  Generate a raw accumulated output product in rp[]
     64      1.1  mrg      and a carry vector in cy[].  */
     65      1.1  mrg #pragma _CRI ivdep
     66      1.1  mrg   for (i = 1; i < n; i++)
     67      1.1  mrg     {
     68      1.1  mrg       a = up[i] * vl;
     69      1.1  mrg       b = _int_mult_upper (up[i - 1], vl);
     70      1.1  mrg       s0 = a + b;
     71      1.1  mrg       c0 = ((a & b) | ((a | b) & ~s0)) >> 63;
     72      1.1  mrg       r = rp[i];
     73      1.1  mrg       s1 = r - s0;
     74      1.1  mrg       rp[i] = s1;
     75      1.1  mrg       c1 = ((s1 & s0) | ((s1 | s0) & ~r)) >> 63;
     76      1.1  mrg       cy[i] = c0 + c1;
     77      1.1  mrg     }
     78      1.1  mrg   /* Carry subtract loop.  Subtract the carry vector cy[] from the raw result
     79      1.1  mrg      rp[] and store the new result back to rp[].  */
     80      1.1  mrg   more_carries = 0;
     81      1.1  mrg #pragma _CRI ivdep
     82      1.1  mrg   for (i = 1; i < n; i++)
     83      1.1  mrg     {
     84      1.1  mrg       r = rp[i];
     85      1.1  mrg       c0 = cy[i - 1];
     86      1.1  mrg       s0 = r - c0;
     87      1.1  mrg       rp[i] = s0;
     88      1.1  mrg       c0 = (s0 & ~r) >> 63;
     89      1.1  mrg       more_carries += c0;
     90      1.1  mrg     }
     91      1.1  mrg   /* If that second loop generated carry, handle that in scalar loop.  */
     92      1.1  mrg   if (more_carries)
     93      1.1  mrg     {
     94      1.1  mrg       mp_limb_t cyrec = 0;
     95      1.1  mrg       /* Look for places where rp[k] == ~0 and cy[k-1] == 1 or
     96      1.1  mrg 	 rp[k] == ~1 and cy[k-1] == 2.
     97      1.1  mrg 	 These are where we got a recurrency carry.  */
     98      1.1  mrg       for (i = 1; i < n; i++)
     99      1.1  mrg 	{
    100      1.1  mrg 	  r = rp[i];
    101      1.1  mrg 	  c0 = ~r < cy[i - 1];
    102      1.1  mrg 	  s0 = r - cyrec;
    103      1.1  mrg 	  rp[i] = s0;
    104      1.1  mrg 	  c1 = (s0 & ~r) >> 63;
    105      1.1  mrg 	  cyrec = c0 | c1;
    106      1.1  mrg 	}
    107      1.1  mrg       return _int_mult_upper (up[n - 1], vl) + cyrec + cy[n - 1];
    108      1.1  mrg     }
    109      1.1  mrg 
    110      1.1  mrg   return _int_mult_upper (up[n - 1], vl) + cy[n - 1];
    111      1.1  mrg }
    112