cray/ieee/mul_1.c

    1.1  mrg /* Cray PVP/IEEE mpn_mul_1 -- multiply a limb vector with a limb and store the
    1.1  mrg    result in a second limb vector.
    1.1  mrg
    1.1  mrg Copyright 2000, 2001 Free Software Foundation, Inc.
    1.1  mrg
    1.1  mrg This file is part of the GNU MP Library.
    1.1  mrg
    1.1  mrg The GNU MP Library is free software; you can redistribute it and/or modify
1.1.1.2  mrg it under the terms of either:
1.1.1.2  mrg
1.1.1.2  mrg   * the GNU Lesser General Public License as published by the Free
1.1.1.2  mrg     Software Foundation; either version 3 of the License, or (at your
1.1.1.2  mrg     option) any later version.
1.1.1.2  mrg
1.1.1.2  mrg or
1.1.1.2  mrg
1.1.1.2  mrg   * the GNU General Public License as published by the Free Software
1.1.1.2  mrg     Foundation; either version 2 of the License, or (at your option) any
1.1.1.2  mrg     later version.
1.1.1.2  mrg
1.1.1.2  mrg or both in parallel, as here.
    1.1  mrg
    1.1  mrg The GNU MP Library is distributed in the hope that it will be useful, but
    1.1  mrg WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
1.1.1.2  mrg or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
1.1.1.2  mrg for more details.
    1.1  mrg
1.1.1.2  mrg You should have received copies of the GNU General Public License and the
1.1.1.2  mrg GNU Lesser General Public License along with the GNU MP Library.  If not,
1.1.1.2  mrg see https://www.gnu.org/licenses/.  */
    1.1  mrg
    1.1  mrg /* This code runs at 5 cycles/limb on a T90.  That would probably
    1.1  mrg    be hard to improve upon, even with assembly code.  */
    1.1  mrg
    1.1  mrg #include <intrinsics.h>
    1.1  mrg #include "gmp-impl.h"
    1.1  mrg
    1.1  mrg mp_limb_t
    1.1  mrg mpn_mul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t vl)
    1.1  mrg {
               /* Multiply the n-limb vector up[0..n-1] by the single limb vl, store
                  the low n limbs of the product in rp[0..n-1], and return the limb
                  that carries out of the top.
                  n must be at least 1: up[0] and up[n - 1] are read unconditionally.
                  The ">> 63" carry extractions below treat bit 63 as the top bit of
                  a limb, i.e. the code assumes 64-bit limbs.  _int_mult_upper comes
                  from <intrinsics.h>; it is used here as the high limb of the
                  double-limb product of its arguments (TODO confirm against the
                  Cray intrinsics documentation).
                  The work is split into passes so that no carry has to ripple from
                  limb to limb inside the first two loops (both carry the Cray ivdep
                  pragma):
                    1. rp[i] = low (up[i] * vl) + high (up[i - 1] * vl), with the
                       carry out of that addition saved in cy[i];
                    2. rp[i] += cy[i - 1], counting the positions that wrap again;
                    3. only if pass 2 wrapped somewhere, a sequential loop ripples
                       the remaining carries upward.  */
               /* cy[] is an automatic variable-length array of n limbs; cy[i] holds
                  the carry out of the raw sum stored in rp[i].
                  NOTE(review): s1 is declared below but never used.  */
    1.1  mrg   mp_limb_t cy[n];
    1.1  mrg   mp_limb_t a, b, r, s0, s1, c0, c1;
    1.1  mrg   mp_size_t i;
    1.1  mrg   int more_carries;
    1.1  mrg
    1.1  mrg   if (up == rp)
    1.1  mrg     {
    1.1  mrg       /* The algorithm used below cannot handle overlap.  Handle it here by
    1.1  mrg 	 making a temporary copy of the source vector, then call ourselves.
                      (The main loop reads up[i - 1] after rp[i - 1] has already been
                      written, so with up == rp it would see a product limb instead
                      of the source limb.)  Only exact aliasing is detected by this
                      test.  The nested call takes the normal path because xp is a
                      fresh local array, distinct from rp.  */
    1.1  mrg       mp_limb_t xp[n];
    1.1  mrg       MPN_COPY (xp, up, n);
    1.1  mrg       return mpn_mul_1 (rp, xp, n, vl);
    1.1  mrg     }
    1.1  mrg
               /* Limb 0 has no lower neighbour: it is just up[0] * vl reduced to
                  limb width (assuming mp_limb_t is an unsigned type, so that the
                  multiplication wraps), and no carry is associated with it.  */
    1.1  mrg   a = up[0] * vl;
    1.1  mrg   rp[0] = a;
    1.1  mrg   cy[0] = 0;
    1.1  mrg
    1.1  mrg   /* Main multiply loop.  Generate a raw accumulated output product in rp[]
    1.1  mrg      and a carry vector in cy[].  Each rp[i] is the low limb of up[i] * vl
                  plus the _int_mult_upper result for up[i - 1]; cy[i] is the carry
                  out of that addition and is not added to anything yet.
                  NOTE(review): the ivdep pragma is a Cray compiler directive;
                  presumably it tells the vectorizer to ignore assumed dependencies
                  between rp[], cy[] and up[] -- confirm in the Cray C manual.  */
    1.1  mrg #pragma _CRI ivdep
    1.1  mrg   for (i = 1; i < n; i++)
    1.1  mrg     {
    1.1  mrg       a = up[i] * vl;
    1.1  mrg       b = _int_mult_upper (up[i - 1], vl);
    1.1  mrg       s0 = a + b;
                   /* Carry out of bit 63 of a + b: both top bits set, or at least
                      one of them set while the top bit of the sum came out clear.  */
    1.1  mrg       c0 = ((a & b) | ((a | b) & ~s0)) >> 63;
    1.1  mrg       rp[i] = s0;
    1.1  mrg       cy[i] = c0;
    1.1  mrg     }
    1.1  mrg   /* Carry add loop.  Add the carry vector cy[] to the raw sum rp[] and
    1.1  mrg      store the new sum back to rp[].  The carry out of limb i - 1 belongs
                  in limb i, hence cy[i - 1].  The loop starts at 2 because cy[0] is
                  zero, so rp[1] needs no adjustment.  Carries produced by this
                  addition itself are not propagated here; they are only counted in
                  more_carries.  */
    1.1  mrg   more_carries = 0;
    1.1  mrg #pragma _CRI ivdep
    1.1  mrg   for (i = 2; i < n; i++)
    1.1  mrg     {
    1.1  mrg       r = rp[i];
    1.1  mrg       c0 = cy[i - 1];
    1.1  mrg       s0 = r + c0;
    1.1  mrg       rp[i] = s0;
                   /* Top bit set before and clear after the addition means the sum
                      wrapped (the addend cy[i - 1] is a saved carry bit).  */
    1.1  mrg       c0 = (r & ~s0) >> 63;
    1.1  mrg       more_carries += c0;
    1.1  mrg     }
    1.1  mrg   /* If that second loop generated carry, handle that in scalar loop.
                  Only whether more_carries is non-zero matters, not its value.  */
    1.1  mrg   if (more_carries)
    1.1  mrg     {
    1.1  mrg       mp_limb_t cyrec = 0;
    1.1  mrg       /* Look for places where rp[k] is zero and cy[k-1] is non-zero.
    1.1  mrg 	 These are where we got a recurrence carry, i.e. where the carry add
                      loop above wrapped rp[k] around.  cyrec is the carry moving from
                      limb i - 1 into limb i during this pass: c0 is the carry
                      re-detected from the previous pass, c1 is a carry caused by
                      adding cyrec itself.  When c0 is set r is zero, so adding cyrec
                      cannot wrap as well.  */
    1.1  mrg       for (i = 2; i < n; i++)
    1.1  mrg 	{
    1.1  mrg 	  r = rp[i];
    1.1  mrg 	  c0 = (r == 0 && cy[i - 1] != 0);
    1.1  mrg 	  s0 = r + cyrec;
    1.1  mrg 	  rp[i] = s0;
    1.1  mrg 	  c1 = (r & ~s0) >> 63;
    1.1  mrg 	  cyrec = c0 | c1;
    1.1  mrg 	}
                   /* Top limb: the _int_mult_upper result for the last source limb,
                      plus the carries out of rp[n - 1] left over from the passes
                      above (cy[n - 1] from the multiply loop, cyrec from the carry
                      handling).  */
    1.1  mrg       return _int_mult_upper (up[n - 1], vl) + cyrec + cy[n - 1];
    1.1  mrg     }
    1.1  mrg
               /* The carry add loop did not wrap anywhere, so only the multiply-loop
                  carry out of rp[n - 1] remains to be added to the top limb.  */
    1.1  mrg   return _int_mult_upper (up[n - 1], vl) + cy[n - 1];
    1.1  mrg }