1 /* Mulders' short product, square and division. 2 3 Copyright 2005-2023 Free Software Foundation, Inc. 4 Contributed by the AriC and Caramba projects, INRIA. 5 6 This file is part of the GNU MPFR Library. 7 8 The GNU MPFR Library is free software; you can redistribute it and/or modify 9 it under the terms of the GNU Lesser General Public License as published by 10 the Free Software Foundation; either version 3 of the License, or (at your 11 option) any later version. 12 13 The GNU MPFR Library is distributed in the hope that it will be useful, but 14 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 15 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 16 License for more details. 17 18 You should have received a copy of the GNU Lesser General Public License 19 along with the GNU MPFR Library; see the file COPYING.LESSER. If not, see 20 https://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 21 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ 22 23 /* References: 24 [1] Short Division of Long Integers, David Harvey and Paul Zimmermann, 25 Proceedings of the 20th Symposium on Computer Arithmetic (ARITH-20), 26 July 25-27, 2011, pages 7-14. 27 */ 28 29 #define MPFR_NEED_LONGLONG_H 30 #include "mpfr-impl.h" 31 32 /* Don't use MPFR_MULHIGH_SIZE since it is handled by tuneup */ 33 #ifdef MPFR_MULHIGH_TAB_SIZE 34 static short mulhigh_ktab[MPFR_MULHIGH_TAB_SIZE]; 35 #else 36 static short mulhigh_ktab[] = {MPFR_MULHIGH_TAB}; 37 #define MPFR_MULHIGH_TAB_SIZE (numberof_const (mulhigh_ktab)) 38 #endif 39 40 /* Put in rp[n..2n-1] an approximation of the n high limbs 41 of {up, n} * {vp, n}. The error is less than n ulps of rp[n] (and the 42 approximation is always less or equal to the truncated full product). 43 Assume 2n limbs are allocated at rp. 44 45 Implements Algorithm ShortMulNaive from [1]. 46 */ 47 static void 48 mpfr_mulhigh_n_basecase (mpfr_limb_ptr rp, mpfr_limb_srcptr up, 49 mpfr_limb_srcptr vp, mp_size_t n) 50 { 51 mp_size_t i; 52 53 rp += n - 1; 54 umul_ppmm (rp[1], rp[0], up[n-1], vp[0]); /* we neglect up[0..n-2]*vp[0], 55 which is less than B^n */ 56 for (i = 1 ; i < n ; i++) 57 /* here, we neglect up[0..n-i-2] * vp[i], which is less than B^n too */ 58 rp[i + 1] = mpn_addmul_1 (rp, up + (n - i - 1), i + 1, vp[i]); 59 /* in total, we neglect less than n*B^n, i.e., n ulps of rp[n]. */ 60 } 61 62 /* Put in rp[n..2n-1] an approximation of the n high limbs 63 of {np, n} * {mp, n}. The error is less than n ulps of rp[n] (and the 64 approximation is always less or equal to the truncated full product). 65 66 Implements Algorithm ShortMul from [1]. 67 */ 68 void 69 mpfr_mulhigh_n (mpfr_limb_ptr rp, mpfr_limb_srcptr np, mpfr_limb_srcptr mp, 70 mp_size_t n) 71 { 72 mp_size_t k; 73 74 MPFR_STAT_STATIC_ASSERT (MPFR_MULHIGH_TAB_SIZE >= 8); /* so that 3*(n/4) > n/2 */ 75 k = MPFR_LIKELY (n < MPFR_MULHIGH_TAB_SIZE) ? mulhigh_ktab[n] : 3*(n/4); 76 /* Algorithm ShortMul from [1] requires k >= (n+3)/2, which translates 77 into k >= (n+4)/2 in the C language. */ 78 MPFR_ASSERTD (k == -1 || k == 0 || (k >= (n+4)/2 && k < n)); 79 if (k < 0) 80 mpn_mul_basecase (rp, np, n, mp, n); /* result is exact, no error */ 81 else if (k == 0) 82 mpfr_mulhigh_n_basecase (rp, np, mp, n); /* basecase error < n ulps */ 83 else if (n > MUL_FFT_THRESHOLD) 84 mpn_mul_n (rp, np, mp, n); /* result is exact, no error */ 85 else 86 { 87 mp_size_t l = n - k; 88 mp_limb_t cy; 89 90 mpn_mul_n (rp + 2 * l, np + l, mp + l, k); /* fills rp[2l..2n-1] */ 91 mpfr_mulhigh_n (rp, np + k, mp, l); /* fills rp[l-1..2l-1] */ 92 cy = mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1); 93 mpfr_mulhigh_n (rp, np, mp + k, l); /* fills rp[l-1..2l-1] */ 94 cy += mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1); 95 mpn_add_1 (rp + n + l, rp + n + l, k, cy); /* propagate carry */ 96 } 97 } 98 99 #ifdef MPFR_SQRHIGH_TAB_SIZE 100 static short sqrhigh_ktab[MPFR_SQRHIGH_TAB_SIZE]; 101 #else 102 static short sqrhigh_ktab[] = {MPFR_SQRHIGH_TAB}; 103 #define MPFR_SQRHIGH_TAB_SIZE (numberof_const (sqrhigh_ktab)) 104 #endif 105 106 /* Put in rp[n..2n-1] an approximation of the n high limbs 107 of {np, n}^2. The error is less than n ulps of rp[n]. */ 108 void 109 mpfr_sqrhigh_n (mpfr_limb_ptr rp, mpfr_limb_srcptr np, mp_size_t n) 110 { 111 mp_size_t k; 112 113 MPFR_STAT_STATIC_ASSERT (MPFR_SQRHIGH_TAB_SIZE > 2); /* ensures k < n */ 114 k = MPFR_LIKELY (n < MPFR_SQRHIGH_TAB_SIZE) ? sqrhigh_ktab[n] 115 : (n+4)/2; /* ensures that k >= (n+3)/2 */ 116 MPFR_ASSERTD (k == -1 || k == 0 || (k >= (n+4)/2 && k < n)); 117 if (k < 0) 118 /* we can't use mpn_sqr_basecase here, since it requires 119 n <= SQR_KARATSUBA_THRESHOLD, where SQR_KARATSUBA_THRESHOLD 120 is not exported by GMP */ 121 mpn_sqr (rp, np, n); 122 else if (k == 0) 123 mpfr_mulhigh_n_basecase (rp, np, np, n); 124 else 125 { 126 mp_size_t l = n - k; 127 mp_limb_t cy; 128 129 mpn_sqr (rp + 2 * l, np + l, k); /* fills rp[2l..2n-1] */ 130 mpfr_mulhigh_n (rp, np, np + k, l); /* fills rp[l-1..2l-1] */ 131 /* {rp+n-1,l+1} += 2 * {rp+l-1,l+1} */ 132 cy = mpn_lshift (rp + l - 1, rp + l - 1, l + 1, 1); 133 cy += mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1); 134 mpn_add_1 (rp + n + l, rp + n + l, k, cy); /* propagate carry */ 135 } 136 } 137 138 #ifdef MPFR_DIVHIGH_TAB_SIZE 139 static short divhigh_ktab[MPFR_DIVHIGH_TAB_SIZE]; 140 #else 141 static short divhigh_ktab[] = {MPFR_DIVHIGH_TAB}; 142 #define MPFR_DIVHIGH_TAB_SIZE (numberof_const (divhigh_ktab)) 143 #endif 144 145 /* Put in Q={qp, n} an approximation of N={np, 2*n} divided by D={dp, n}, 146 with the most significant limb of the quotient as return value (0 or 1). 147 Assumes the most significant bit of D is set. Clobbers N. 148 149 The approximate quotient Q satisfies - 2(n-1) < N/D - Q <= 4. 150 151 Assumes n >= 2. 152 */ 153 static mp_limb_t 154 mpfr_divhigh_n_basecase (mpfr_limb_ptr qp, mpfr_limb_ptr np, 155 mpfr_limb_srcptr dp, mp_size_t n) 156 { 157 mp_limb_t qh, d1, d0, q2, q1, q0; 158 mpfr_pi1_t dinv2; 159 160 MPFR_ASSERTD(n >= 2); 161 162 np += n; 163 164 if ((qh = (mpn_cmp (np, dp, n) >= 0))) 165 mpn_sub_n (np, np, dp, n); 166 167 /* now {np, n} is less than D={dp, n}, which implies np[n-1] <= dp[n-1] */ 168 169 d1 = dp[n - 1]; 170 171 /* we assumed n >= 2 */ 172 d0 = dp[n - 2]; 173 invert_pi1 (dinv2, d1, d0); 174 /* dinv2.inv32 = floor ((B^3 - 1) / (d0 + d1 B)) - B */ 175 while (n > 1) 176 { 177 /* Invariant: it remains to reduce n limbs from N (in addition to the 178 initial low n limbs). 179 Since n >= 2 here, necessarily we had n >= 2 initially, which means 180 that in addition to the limb np[n-1] to reduce, we have at least 2 181 extra limbs, thus accessing np[n-3] is valid. */ 182 183 /* Warning: we can have np[n-1]>d1 or (np[n-1]=d1 and np[n-2]>=d0) here, 184 since we truncate the divisor at each step, but since {np,n} < D 185 originally, the largest possible partial quotient is B-1. */ 186 if (MPFR_UNLIKELY(np[n-1] > d1 || (np[n-1] == d1 && np[n-2] >= d0))) 187 q2 = MPFR_LIMB_MAX; 188 else 189 udiv_qr_3by2 (q2, q1, q0, np[n - 1], np[n - 2], np[n - 3], 190 d1, d0, dinv2.inv32); 191 /* since q2 = floor((np[n-1]*B^2+np[n-2]*B+np[n-3])/(d1*B+d0)), 192 we have q2 <= (np[n-1]*B^2+np[n-2]*B+np[n-3])/(d1*B+d0), 193 thus np[n-1]*B^2+np[n-2]*B+np[n-3] >= q2*(d1*B+d0) 194 and {np-1, n} >= q2*D - q2*B^(n-2) >= q2*D - B^(n-1) 195 thus {np-1, n} - (q2-1)*D >= D - B^(n-1) >= 0 196 which proves that at most one correction is needed */ 197 q0 = mpn_submul_1 (np - 1, dp, n, q2); 198 if (MPFR_UNLIKELY(q0 > np[n - 1])) 199 { 200 mpn_add_n (np - 1, np - 1, dp, n); 201 q2 --; 202 } 203 qp[--n] = q2; 204 dp ++; 205 } 206 207 /* we have B+dinv2 = floor((B^3-1)/(d1*B+d0)) < B^2/d1 208 q1 = floor(np[0]*(B+dinv2)/B) <= floor(np[0]*B/d1) 209 <= floor((np[0]*B+np[1])/d1) 210 thus q1 is not larger than the true quotient. 211 q1 > np[0]*(B+dinv2)/B - 1 > np[0]*(B^3-1)/(d1*B+d0)/B - 2 212 For d1*B+d0 <> B^2/2, we have B+dinv2 = floor(B^3/(d1*B+d0)) 213 thus q1 > np[0]*B^2/(d1*B+d0) - 2, i.e., 214 (d1*B+d0)*q1 > np[0]*B^2 - 2*(d1*B+d0) 215 d1*B*q1 > np[0]*B^2 - 2*d1*B - 2*d0 - d0*q1 >= np[0]*B^2 - 2*d1*B - B^2 216 thus q1 > np[0]*B/d1 - 2 - B/d1 > np[0]*B/d1 - 4. 217 218 For d1*B+d0 = B^2/2, dinv2 = B-1 thus q1 > np[0]*(2B-1)/B - 1 > 219 np[0]*B/d1 - 2. 220 221 In all cases, if q = floor((np[0]*B+np[1])/d1), we have: 222 q - 4 <= q1 <= q 223 */ 224 umul_ppmm (q1, q0, np[0], dinv2.inv32); 225 qp[0] = np[0] + q1; 226 227 return qh; 228 } 229 230 /* Put in {qp, n} an approximation of N={np, 2*n} divided by D={dp, n}, 231 with the most significant limb of the quotient as return value (0 or 1). 232 Assumes the most significant bit of D is set. Clobbers N. 233 234 This implements the ShortDiv algorithm from reference [1]. 235 236 Assumes n >= 2 (which should be fulfilled also in the recursive calls). 237 */ 238 mp_limb_t 239 mpfr_divhigh_n (mpfr_limb_ptr qp, mpfr_limb_ptr np, mpfr_limb_ptr dp, 240 mp_size_t n) 241 { 242 mp_size_t k, l; 243 mp_limb_t qh, cy; 244 mpfr_limb_ptr tp; 245 MPFR_TMP_DECL(marker); 246 247 MPFR_STAT_STATIC_ASSERT (MPFR_DIVHIGH_TAB_SIZE >= 15); /* so that 2*(n/3) >= (n+4)/2 */ 248 MPFR_ASSERTD(n >= 2); 249 k = MPFR_LIKELY (n < MPFR_DIVHIGH_TAB_SIZE) ? divhigh_ktab[n] : 2*(n/3); 250 251 if (k == 0) 252 { 253 #if defined(WANT_GMP_INTERNALS) && defined(HAVE___GMPN_SBPI1_DIVAPPR_Q) 254 mpfr_pi1_t dinv2; 255 invert_pi1 (dinv2, dp[n - 1], dp[n - 2]); 256 if (n > 2) /* sbpi1_divappr_q wants n > 2 */ 257 return __gmpn_sbpi1_divappr_q (qp, np, n + n, dp, n, dinv2.inv32); 258 else 259 return mpfr_divhigh_n_basecase (qp, np, dp, n); 260 #else /* use our own code for base-case short division */ 261 return mpfr_divhigh_n_basecase (qp, np, dp, n); 262 #endif 263 } 264 265 /* Check the bounds from [1]. In addition, we forbid k=n-1, which would 266 give l=1 in the recursive call. It follows n >= 5. */ 267 MPFR_ASSERTD ((n+4)/2 <= k && k < n-1); 268 269 MPFR_TMP_MARK (marker); 270 l = n - k; 271 /* first divide the most significant 2k limbs from N by the most significant 272 k limbs of D */ 273 qh = mpn_divrem (qp + l, 0, np + 2 * l, 2 * k, dp + l, k); /* exact */ 274 275 /* it remains {np,2l+k} = {np,n+l} as remainder */ 276 277 /* now we have to subtract high(Q1)*D0 where Q1=qh*B^k+{qp+l,k} and 278 D0={dp,l} */ 279 tp = MPFR_TMP_LIMBS_ALLOC (2 * l); 280 mpfr_mulhigh_n (tp, qp + k, dp, l); 281 /* we are only interested in the upper l limbs from {tp,2l} */ 282 cy = mpn_sub_n (np + n, np + n, tp + l, l); 283 if (qh) 284 cy += mpn_sub_n (np + n, np + n, dp, l); 285 while (cy > 0) /* Q1 was too large: subtract 1 to Q1 and add D to np+l */ 286 { 287 qh -= mpn_sub_1 (qp + l, qp + l, k, MPFR_LIMB_ONE); 288 cy -= mpn_add_n (np + l, np + l, dp, n); 289 } 290 291 /* now it remains {np,n+l} to divide by D */ 292 cy = mpfr_divhigh_n (qp, np + k, dp + k, l); 293 qh += mpn_add_1 (qp + l, qp + l, k, cy); 294 MPFR_TMP_FREE(marker); 295 296 return qh; 297 } 298