1 1.1 mrg /* Cray PVP/IEEE mpn_submul_1 -- multiply a limb vector with a limb and 2 1.1 mrg subtract the result from a second limb vector. 3 1.1 mrg 4 1.1.1.2 mrg Copyright 2000-2002 Free Software Foundation, Inc. 5 1.1 mrg 6 1.1 mrg This file is part of the GNU MP Library. 7 1.1 mrg 8 1.1 mrg The GNU MP Library is free software; you can redistribute it and/or modify 9 1.1.1.2 mrg it under the terms of either: 10 1.1.1.2 mrg 11 1.1.1.2 mrg * the GNU Lesser General Public License as published by the Free 12 1.1.1.2 mrg Software Foundation; either version 3 of the License, or (at your 13 1.1.1.2 mrg option) any later version. 14 1.1.1.2 mrg 15 1.1.1.2 mrg or 16 1.1.1.2 mrg 17 1.1.1.2 mrg * the GNU General Public License as published by the Free Software 18 1.1.1.2 mrg Foundation; either version 2 of the License, or (at your option) any 19 1.1.1.2 mrg later version. 20 1.1.1.2 mrg 21 1.1.1.2 mrg or both in parallel, as here. 22 1.1 mrg 23 1.1 mrg The GNU MP Library is distributed in the hope that it will be useful, but 24 1.1 mrg WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 25 1.1.1.2 mrg or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 26 1.1.1.2 mrg for more details. 27 1.1 mrg 28 1.1.1.2 mrg You should have received copies of the GNU General Public License and the 29 1.1.1.2 mrg GNU Lesser General Public License along with the GNU MP Library. If not, 30 1.1.1.2 mrg see https://www.gnu.org/licenses/. */ 31 1.1 mrg 32 1.1 mrg /* This code runs at just under 9 cycles/limb on a T90. That is not perfect, 33 1.1 mrg mainly due to vector register shortage in the main loop. Assembly code 34 1.1 mrg should bring it down to perhaps 7 cycles/limb. */ 35 1.1 mrg 36 1.1 mrg #include <intrinsics.h> 37 1.1 mrg #include "gmp-impl.h" 38 1.1 mrg 39 1.1 mrg mp_limb_t 40 1.1 mrg mpn_submul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t vl) 41 1.1 mrg { 42 1.1 mrg mp_limb_t cy[n]; 43 1.1 mrg mp_limb_t a, b, r, s0, s1, c0, c1; 44 1.1 mrg mp_size_t i; 45 1.1 mrg int more_carries; 46 1.1 mrg 47 1.1 mrg if (up == rp) 48 1.1 mrg { 49 1.1 mrg /* The algorithm used below cannot handle overlap. Handle it here by 50 1.1 mrg making a temporary copy of the source vector, then call ourselves. */ 51 1.1 mrg mp_limb_t xp[n]; 52 1.1 mrg MPN_COPY (xp, up, n); 53 1.1 mrg return mpn_submul_1 (rp, xp, n, vl); 54 1.1 mrg } 55 1.1 mrg 56 1.1 mrg a = up[0] * vl; 57 1.1 mrg r = rp[0]; 58 1.1 mrg s0 = r - a; 59 1.1 mrg rp[0] = s0; 60 1.1 mrg c1 = ((s0 & a) | ((s0 | a) & ~r)) >> 63; 61 1.1 mrg cy[0] = c1; 62 1.1 mrg 63 1.1 mrg /* Main multiply loop. Generate a raw accumulated output product in rp[] 64 1.1 mrg and a carry vector in cy[]. */ 65 1.1 mrg #pragma _CRI ivdep 66 1.1 mrg for (i = 1; i < n; i++) 67 1.1 mrg { 68 1.1 mrg a = up[i] * vl; 69 1.1 mrg b = _int_mult_upper (up[i - 1], vl); 70 1.1 mrg s0 = a + b; 71 1.1 mrg c0 = ((a & b) | ((a | b) & ~s0)) >> 63; 72 1.1 mrg r = rp[i]; 73 1.1 mrg s1 = r - s0; 74 1.1 mrg rp[i] = s1; 75 1.1 mrg c1 = ((s1 & s0) | ((s1 | s0) & ~r)) >> 63; 76 1.1 mrg cy[i] = c0 + c1; 77 1.1 mrg } 78 1.1 mrg /* Carry subtract loop. Subtract the carry vector cy[] from the raw result 79 1.1 mrg rp[] and store the new result back to rp[]. */ 80 1.1 mrg more_carries = 0; 81 1.1 mrg #pragma _CRI ivdep 82 1.1 mrg for (i = 1; i < n; i++) 83 1.1 mrg { 84 1.1 mrg r = rp[i]; 85 1.1 mrg c0 = cy[i - 1]; 86 1.1 mrg s0 = r - c0; 87 1.1 mrg rp[i] = s0; 88 1.1 mrg c0 = (s0 & ~r) >> 63; 89 1.1 mrg more_carries += c0; 90 1.1 mrg } 91 1.1 mrg /* If that second loop generated carry, handle that in scalar loop. */ 92 1.1 mrg if (more_carries) 93 1.1 mrg { 94 1.1 mrg mp_limb_t cyrec = 0; 95 1.1 mrg /* Look for places where rp[k] == ~0 and cy[k-1] == 1 or 96 1.1 mrg rp[k] == ~1 and cy[k-1] == 2. 97 1.1 mrg These are where we got a recurrency carry. */ 98 1.1 mrg for (i = 1; i < n; i++) 99 1.1 mrg { 100 1.1 mrg r = rp[i]; 101 1.1 mrg c0 = ~r < cy[i - 1]; 102 1.1 mrg s0 = r - cyrec; 103 1.1 mrg rp[i] = s0; 104 1.1 mrg c1 = (s0 & ~r) >> 63; 105 1.1 mrg cyrec = c0 | c1; 106 1.1 mrg } 107 1.1 mrg return _int_mult_upper (up[n - 1], vl) + cyrec + cy[n - 1]; 108 1.1 mrg } 109 1.1 mrg 110 1.1 mrg return _int_mult_upper (up[n - 1], vl) + cy[n - 1]; 111 1.1 mrg } 112