/* mpn_mu_div_qr, mpn_preinv_mu_div_qr.

   Compute Q = floor(N / D) and R = N-QD.  N is nn limbs and D is dn limbs and
   must be normalized, and Q must be nn-dn limbs.  The requirement that Q is
   nn-dn limbs (and not nn-dn+1 limbs) was put in place in order to allow us to
   let N be unmodified during the operation.

   Contributed to the GNU project by Torbjorn Granlund.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.

Copyright 2005-2007, 2009, 2010 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.
If not, 40 1.1.1.3 mrg see https://www.gnu.org/licenses/. */ 41 1.1 mrg 42 1.1 mrg 43 1.1 mrg /* 44 1.1 mrg The idea of the algorithm used herein is to compute a smaller inverted value 45 1.1 mrg than used in the standard Barrett algorithm, and thus save time in the 46 1.1 mrg Newton iterations, and pay just a small price when using the inverted value 47 1.1 mrg for developing quotient bits. This algorithm was presented at ICMS 2006. 48 1.1 mrg */ 49 1.1 mrg 50 1.1 mrg /* CAUTION: This code and the code in mu_divappr_q.c should be edited in sync. 51 1.1 mrg 52 1.1 mrg Things to work on: 53 1.1 mrg 54 1.1 mrg * This isn't optimal when the quotient isn't needed, as it might take a lot 55 1.1 mrg of space. The computation is always needed, though, so there is no time to 56 1.1 mrg save with special code. 57 1.1 mrg 58 1.1 mrg * The itch/scratch scheme isn't perhaps such a good idea as it once seemed, 59 1.1 mrg demonstrated by the fact that the mpn_invertappr function's scratch needs 60 1.1 mrg mean that we need to keep a large allocation long after it is needed. 61 1.1 mrg Things are worse as mpn_mul_fft does not accept any scratch parameter, 62 1.1 mrg which means we'll have a large memory hole while in mpn_mul_fft. In 63 1.1 mrg general, a peak scratch need in the beginning of a function isn't 64 1.1 mrg well-handled by the itch/scratch scheme. 65 1.1 mrg */ 66 1.1 mrg 67 1.1 mrg #ifdef STAT 68 1.1 mrg #undef STAT 69 1.1 mrg #define STAT(x) x 70 1.1 mrg #else 71 1.1 mrg #define STAT(x) 72 1.1 mrg #endif 73 1.1 mrg 74 1.1 mrg #include <stdlib.h> /* for NULL */ 75 1.1 mrg #include "gmp-impl.h" 76 1.1 mrg 77 1.1 mrg 78 1.1 mrg /* FIXME: The MU_DIV_QR_SKEW_THRESHOLD was not analysed properly. It gives a 79 1.1 mrg speedup according to old measurements, but does the decision mechanism 80 1.1 mrg really make sense? It seem like the quotient between dn and qn might be 81 1.1 mrg what we really should be checking. 
*/ 82 1.1 mrg #ifndef MU_DIV_QR_SKEW_THRESHOLD 83 1.1 mrg #define MU_DIV_QR_SKEW_THRESHOLD 100 84 1.1 mrg #endif 85 1.1 mrg 86 1.1 mrg #ifdef CHECK /* FIXME: Enable in minithres */ 87 1.1 mrg #undef MU_DIV_QR_SKEW_THRESHOLD 88 1.1 mrg #define MU_DIV_QR_SKEW_THRESHOLD 1 89 1.1 mrg #endif 90 1.1 mrg 91 1.1 mrg 92 1.1 mrg static mp_limb_t mpn_mu_div_qr2 (mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t, mp_ptr); 93 1.1.1.4 mrg static mp_size_t mpn_mu_div_qr_choose_in (mp_size_t, mp_size_t, int); 94 1.1 mrg 95 1.1 mrg 96 1.1 mrg mp_limb_t 97 1.1 mrg mpn_mu_div_qr (mp_ptr qp, 98 1.1 mrg mp_ptr rp, 99 1.1 mrg mp_srcptr np, 100 1.1 mrg mp_size_t nn, 101 1.1 mrg mp_srcptr dp, 102 1.1 mrg mp_size_t dn, 103 1.1 mrg mp_ptr scratch) 104 1.1 mrg { 105 1.1 mrg mp_size_t qn; 106 1.1 mrg mp_limb_t cy, qh; 107 1.1 mrg 108 1.1 mrg qn = nn - dn; 109 1.1 mrg if (qn + MU_DIV_QR_SKEW_THRESHOLD < dn) 110 1.1 mrg { 111 1.1 mrg /* |______________|_ign_first__| dividend nn 112 1.1 mrg |_______|_ign_first__| divisor dn 113 1.1 mrg 114 1.1 mrg |______| quotient (prel) qn 115 1.1 mrg 116 1.1 mrg |___________________| quotient * ignored-divisor-part dn-1 117 1.1 mrg */ 118 1.1 mrg 119 1.1 mrg /* Compute a preliminary quotient and a partial remainder by dividing the 120 1.1 mrg most significant limbs of each operand. */ 121 1.1 mrg qh = mpn_mu_div_qr2 (qp, rp + nn - (2 * qn + 1), 122 1.1 mrg np + nn - (2 * qn + 1), 2 * qn + 1, 123 1.1 mrg dp + dn - (qn + 1), qn + 1, 124 1.1 mrg scratch); 125 1.1 mrg 126 1.1 mrg /* Multiply the quotient by the divisor limbs ignored above. 
*/ 127 1.1 mrg if (dn - (qn + 1) > qn) 128 1.1 mrg mpn_mul (scratch, dp, dn - (qn + 1), qp, qn); /* prod is dn-1 limbs */ 129 1.1 mrg else 130 1.1 mrg mpn_mul (scratch, qp, qn, dp, dn - (qn + 1)); /* prod is dn-1 limbs */ 131 1.1 mrg 132 1.1 mrg if (qh) 133 1.1 mrg cy = mpn_add_n (scratch + qn, scratch + qn, dp, dn - (qn + 1)); 134 1.1 mrg else 135 1.1 mrg cy = 0; 136 1.1 mrg scratch[dn - 1] = cy; 137 1.1 mrg 138 1.1 mrg cy = mpn_sub_n (rp, np, scratch, nn - (2 * qn + 1)); 139 1.1 mrg cy = mpn_sub_nc (rp + nn - (2 * qn + 1), 140 1.1 mrg rp + nn - (2 * qn + 1), 141 1.1 mrg scratch + nn - (2 * qn + 1), 142 1.1 mrg qn + 1, cy); 143 1.1 mrg if (cy) 144 1.1 mrg { 145 1.1 mrg qh -= mpn_sub_1 (qp, qp, qn, 1); 146 1.1 mrg mpn_add_n (rp, rp, dp, dn); 147 1.1 mrg } 148 1.1 mrg } 149 1.1 mrg else 150 1.1 mrg { 151 1.1 mrg qh = mpn_mu_div_qr2 (qp, rp, np, nn, dp, dn, scratch); 152 1.1 mrg } 153 1.1 mrg 154 1.1 mrg return qh; 155 1.1 mrg } 156 1.1 mrg 157 1.1 mrg static mp_limb_t 158 1.1 mrg mpn_mu_div_qr2 (mp_ptr qp, 159 1.1 mrg mp_ptr rp, 160 1.1 mrg mp_srcptr np, 161 1.1 mrg mp_size_t nn, 162 1.1 mrg mp_srcptr dp, 163 1.1 mrg mp_size_t dn, 164 1.1 mrg mp_ptr scratch) 165 1.1 mrg { 166 1.1 mrg mp_size_t qn, in; 167 1.1 mrg mp_limb_t cy, qh; 168 1.1 mrg mp_ptr ip, tp; 169 1.1 mrg 170 1.1 mrg ASSERT (dn > 1); 171 1.1 mrg 172 1.1 mrg qn = nn - dn; 173 1.1 mrg 174 1.1 mrg /* Compute the inverse size. */ 175 1.1 mrg in = mpn_mu_div_qr_choose_in (qn, dn, 0); 176 1.1 mrg ASSERT (in <= dn); 177 1.1 mrg 178 1.1 mrg #if 1 179 1.1 mrg /* This alternative inverse computation method gets slightly more accurate 180 1.1 mrg results. FIXMEs: (1) Temp allocation needs not analysed (2) itch function 181 1.1 mrg not adapted (3) mpn_invertappr scratch needs not met. 
*/ 182 1.1 mrg ip = scratch; 183 1.1 mrg tp = scratch + in + 1; 184 1.1 mrg 185 1.1 mrg /* compute an approximate inverse on (in+1) limbs */ 186 1.1 mrg if (dn == in) 187 1.1 mrg { 188 1.1 mrg MPN_COPY (tp + 1, dp, in); 189 1.1 mrg tp[0] = 1; 190 1.1.1.3 mrg mpn_invertappr (ip, tp, in + 1, tp + in + 1); 191 1.1 mrg MPN_COPY_INCR (ip, ip + 1, in); 192 1.1 mrg } 193 1.1 mrg else 194 1.1 mrg { 195 1.1 mrg cy = mpn_add_1 (tp, dp + dn - (in + 1), in + 1, 1); 196 1.1 mrg if (UNLIKELY (cy != 0)) 197 1.1 mrg MPN_ZERO (ip, in); 198 1.1 mrg else 199 1.1 mrg { 200 1.1.1.3 mrg mpn_invertappr (ip, tp, in + 1, tp + in + 1); 201 1.1 mrg MPN_COPY_INCR (ip, ip + 1, in); 202 1.1 mrg } 203 1.1 mrg } 204 1.1 mrg #else 205 1.1 mrg /* This older inverse computation method gets slightly worse results than the 206 1.1 mrg one above. */ 207 1.1 mrg ip = scratch; 208 1.1 mrg tp = scratch + in; 209 1.1 mrg 210 1.1 mrg /* Compute inverse of D to in+1 limbs, then round to 'in' limbs. Ideally the 211 1.1 mrg inversion function should do this automatically. 
*/ 212 1.1 mrg if (dn == in) 213 1.1 mrg { 214 1.1 mrg tp[in + 1] = 0; 215 1.1 mrg MPN_COPY (tp + in + 2, dp, in); 216 1.1 mrg mpn_invertappr (tp, tp + in + 1, in + 1, NULL); 217 1.1 mrg } 218 1.1 mrg else 219 1.1 mrg { 220 1.1 mrg mpn_invertappr (tp, dp + dn - (in + 1), in + 1, NULL); 221 1.1 mrg } 222 1.1 mrg cy = mpn_sub_1 (tp, tp, in + 1, GMP_NUMB_HIGHBIT); 223 1.1 mrg if (UNLIKELY (cy != 0)) 224 1.1 mrg MPN_ZERO (tp + 1, in); 225 1.1 mrg MPN_COPY (ip, tp + 1, in); 226 1.1 mrg #endif 227 1.1 mrg 228 1.1 mrg qh = mpn_preinv_mu_div_qr (qp, rp, np, nn, dp, dn, ip, in, scratch + in); 229 1.1 mrg 230 1.1 mrg return qh; 231 1.1 mrg } 232 1.1 mrg 233 1.1 mrg mp_limb_t 234 1.1 mrg mpn_preinv_mu_div_qr (mp_ptr qp, 235 1.1 mrg mp_ptr rp, 236 1.1 mrg mp_srcptr np, 237 1.1 mrg mp_size_t nn, 238 1.1 mrg mp_srcptr dp, 239 1.1 mrg mp_size_t dn, 240 1.1 mrg mp_srcptr ip, 241 1.1 mrg mp_size_t in, 242 1.1 mrg mp_ptr scratch) 243 1.1 mrg { 244 1.1 mrg mp_size_t qn; 245 1.1 mrg mp_limb_t cy, cx, qh; 246 1.1 mrg mp_limb_t r; 247 1.1 mrg mp_size_t tn, wn; 248 1.1 mrg 249 1.1 mrg #define tp scratch 250 1.1 mrg #define scratch_out (scratch + tn) 251 1.1 mrg 252 1.1 mrg qn = nn - dn; 253 1.1 mrg 254 1.1 mrg np += qn; 255 1.1 mrg qp += qn; 256 1.1 mrg 257 1.1 mrg qh = mpn_cmp (np, dp, dn) >= 0; 258 1.1 mrg if (qh != 0) 259 1.1 mrg mpn_sub_n (rp, np, dp, dn); 260 1.1 mrg else 261 1.1.1.2 mrg MPN_COPY_INCR (rp, np, dn); 262 1.1 mrg 263 1.1.1.3 mrg /* if (qn == 0) */ /* The while below handles this case */ 264 1.1.1.3 mrg /* return qh; */ /* Degenerate use. Should we allow this? */ 265 1.1 mrg 266 1.1 mrg while (qn > 0) 267 1.1 mrg { 268 1.1 mrg if (qn < in) 269 1.1 mrg { 270 1.1 mrg ip += in - qn; 271 1.1 mrg in = qn; 272 1.1 mrg } 273 1.1 mrg np -= in; 274 1.1 mrg qp -= in; 275 1.1 mrg 276 1.1 mrg /* Compute the next block of quotient limbs by multiplying the inverse I 277 1.1 mrg by the upper part of the partial remainder R. 
*/ 278 1.1 mrg mpn_mul_n (tp, rp + dn - in, ip, in); /* mulhi */ 279 1.1 mrg cy = mpn_add_n (qp, tp + in, rp + dn - in, in); /* I's msb implicit */ 280 1.1 mrg ASSERT_ALWAYS (cy == 0); 281 1.1 mrg 282 1.1 mrg qn -= in; 283 1.1 mrg 284 1.1 mrg /* Compute the product of the quotient block and the divisor D, to be 285 1.1 mrg subtracted from the partial remainder combined with new limbs from the 286 1.1 mrg dividend N. We only really need the low dn+1 limbs. */ 287 1.1 mrg 288 1.1 mrg if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD)) 289 1.1 mrg mpn_mul (tp, dp, dn, qp, in); /* dn+in limbs, high 'in' cancels */ 290 1.1 mrg else 291 1.1 mrg { 292 1.1 mrg tn = mpn_mulmod_bnm1_next_size (dn + 1); 293 1.1 mrg mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out); 294 1.1 mrg wn = dn + in - tn; /* number of wrapped limbs */ 295 1.1 mrg if (wn > 0) 296 1.1 mrg { 297 1.1 mrg cy = mpn_sub_n (tp, tp, rp + dn - wn, wn); 298 1.1 mrg cy = mpn_sub_1 (tp + wn, tp + wn, tn - wn, cy); 299 1.1 mrg cx = mpn_cmp (rp + dn - in, tp + dn, tn - dn) < 0; 300 1.1 mrg ASSERT_ALWAYS (cx >= cy); 301 1.1 mrg mpn_incr_u (tp, cx - cy); 302 1.1 mrg } 303 1.1 mrg } 304 1.1 mrg 305 1.1 mrg r = rp[dn - in] - tp[dn]; 306 1.1 mrg 307 1.1 mrg /* Subtract the product from the partial remainder combined with new 308 1.1 mrg limbs from the dividend N, generating a new partial remainder R. */ 309 1.1 mrg if (dn != in) 310 1.1 mrg { 311 1.1 mrg cy = mpn_sub_n (tp, np, tp, in); /* get next 'in' limbs from N */ 312 1.1 mrg cy = mpn_sub_nc (tp + in, rp, tp + in, dn - in, cy); 313 1.1 mrg MPN_COPY (rp, tp, dn); /* FIXME: try to avoid this */ 314 1.1 mrg } 315 1.1 mrg else 316 1.1 mrg { 317 1.1 mrg cy = mpn_sub_n (rp, np, tp, in); /* get next 'in' limbs from N */ 318 1.1 mrg } 319 1.1 mrg 320 1.1 mrg STAT (int i; int err = 0; 321 1.1 mrg static int errarr[5]; static int err_rec; static int tot); 322 1.1 mrg 323 1.1 mrg /* Check the remainder R and adjust the quotient as needed. 
*/ 324 1.1 mrg r -= cy; 325 1.1 mrg while (r != 0) 326 1.1 mrg { 327 1.1 mrg /* We loop 0 times with about 69% probability, 1 time with about 31% 328 1.1 mrg probability, 2 times with about 0.6% probability, if inverse is 329 1.1 mrg computed as recommended. */ 330 1.1 mrg mpn_incr_u (qp, 1); 331 1.1 mrg cy = mpn_sub_n (rp, rp, dp, dn); 332 1.1 mrg r -= cy; 333 1.1 mrg STAT (err++); 334 1.1 mrg } 335 1.1 mrg if (mpn_cmp (rp, dp, dn) >= 0) 336 1.1 mrg { 337 1.1 mrg /* This is executed with about 76% probability. */ 338 1.1 mrg mpn_incr_u (qp, 1); 339 1.1 mrg cy = mpn_sub_n (rp, rp, dp, dn); 340 1.1 mrg STAT (err++); 341 1.1 mrg } 342 1.1 mrg 343 1.1 mrg STAT ( 344 1.1 mrg tot++; 345 1.1 mrg errarr[err]++; 346 1.1 mrg if (err > err_rec) 347 1.1 mrg err_rec = err; 348 1.1 mrg if (tot % 0x10000 == 0) 349 1.1 mrg { 350 1.1 mrg for (i = 0; i <= err_rec; i++) 351 1.1 mrg printf (" %d(%.1f%%)", errarr[i], 100.0*errarr[i]/tot); 352 1.1 mrg printf ("\n"); 353 1.1 mrg } 354 1.1 mrg ); 355 1.1 mrg } 356 1.1 mrg 357 1.1 mrg return qh; 358 1.1 mrg } 359 1.1 mrg 360 1.1 mrg /* In case k=0 (automatic choice), we distinguish 3 cases: 361 1.1 mrg (a) dn < qn: in = ceil(qn / ceil(qn/dn)) 362 1.1 mrg (b) dn/3 < qn <= dn: in = ceil(qn / 2) 363 1.1 mrg (c) qn < dn/3: in = qn 364 1.1 mrg In all cases we have in <= dn. 365 1.1 mrg */ 366 1.1.1.4 mrg static mp_size_t 367 1.1 mrg mpn_mu_div_qr_choose_in (mp_size_t qn, mp_size_t dn, int k) 368 1.1 mrg { 369 1.1 mrg mp_size_t in; 370 1.1 mrg 371 1.1 mrg if (k == 0) 372 1.1 mrg { 373 1.1 mrg mp_size_t b; 374 1.1 mrg if (qn > dn) 375 1.1 mrg { 376 1.1 mrg /* Compute an inverse size that is a nice partition of the quotient. 
*/ 377 1.1 mrg b = (qn - 1) / dn + 1; /* ceil(qn/dn), number of blocks */ 378 1.1 mrg in = (qn - 1) / b + 1; /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */ 379 1.1 mrg } 380 1.1 mrg else if (3 * qn > dn) 381 1.1 mrg { 382 1.1 mrg in = (qn - 1) / 2 + 1; /* b = 2 */ 383 1.1 mrg } 384 1.1 mrg else 385 1.1 mrg { 386 1.1 mrg in = (qn - 1) / 1 + 1; /* b = 1 */ 387 1.1 mrg } 388 1.1 mrg } 389 1.1 mrg else 390 1.1 mrg { 391 1.1 mrg mp_size_t xn; 392 1.1 mrg xn = MIN (dn, qn); 393 1.1 mrg in = (xn - 1) / k + 1; 394 1.1 mrg } 395 1.1 mrg 396 1.1 mrg return in; 397 1.1 mrg } 398 1.1 mrg 399 1.1 mrg mp_size_t 400 1.1 mrg mpn_mu_div_qr_itch (mp_size_t nn, mp_size_t dn, int mua_k) 401 1.1 mrg { 402 1.1 mrg mp_size_t in = mpn_mu_div_qr_choose_in (nn - dn, dn, mua_k); 403 1.1.1.3 mrg mp_size_t itch_preinv = mpn_preinv_mu_div_qr_itch (nn, dn, in); 404 1.1.1.3 mrg mp_size_t itch_invapp = mpn_invertappr_itch (in + 1) + in + 2; /* 3in + 4 */ 405 1.1 mrg 406 1.1.1.3 mrg ASSERT (itch_preinv >= itch_invapp); 407 1.1.1.3 mrg return in + MAX (itch_invapp, itch_preinv); 408 1.1 mrg } 409 1.1 mrg 410 1.1 mrg mp_size_t 411 1.1 mrg mpn_preinv_mu_div_qr_itch (mp_size_t nn, mp_size_t dn, mp_size_t in) 412 1.1 mrg { 413 1.1 mrg mp_size_t itch_local = mpn_mulmod_bnm1_next_size (dn + 1); 414 1.1 mrg mp_size_t itch_out = mpn_mulmod_bnm1_itch (itch_local, dn, in); 415 1.1 mrg 416 1.1 mrg return itch_local + itch_out; 417 1.1 mrg } 418