mpn/generic/invertappr.c

    1.1  mrg /* mpn_invertappr and helper functions.  Compute I such that
    1.1  mrg    floor((B^{2n}-1)/U - 1 <= I + B^n <= floor((B^{2n}-1)/U.
    1.1  mrg
    1.1  mrg    Contributed to the GNU project by Marco Bodrato.
    1.1  mrg
    1.1  mrg    The algorithm used here was inspired by ApproximateReciprocal from "Modern
    1.1  mrg    Computer Arithmetic", by Richard P. Brent and Paul Zimmermann.  Special
    1.1  mrg    thanks to Paul Zimmermann for his very valuable suggestions on all the
    1.1  mrg    theoretical aspects during the work on this code.
    1.1  mrg
    1.1  mrg    THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
    1.1  mrg    SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
    1.1  mrg    GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
    1.1  mrg
1.1.1.4  mrg Copyright (C) 2007, 2009, 2010, 2012, 2015, 2016 Free Software
1.1.1.4  mrg Foundation, Inc.
    1.1  mrg
    1.1  mrg This file is part of the GNU MP Library.
    1.1  mrg
    1.1  mrg The GNU MP Library is free software; you can redistribute it and/or modify
1.1.1.3  mrg it under the terms of either:
1.1.1.3  mrg
1.1.1.3  mrg   * the GNU Lesser General Public License as published by the Free
1.1.1.3  mrg     Software Foundation; either version 3 of the License, or (at your
1.1.1.3  mrg     option) any later version.
1.1.1.3  mrg
1.1.1.3  mrg or
1.1.1.3  mrg
1.1.1.3  mrg   * the GNU General Public License as published by the Free Software
1.1.1.3  mrg     Foundation; either version 2 of the License, or (at your option) any
1.1.1.3  mrg     later version.
1.1.1.3  mrg
1.1.1.3  mrg or both in parallel, as here.
    1.1  mrg
    1.1  mrg The GNU MP Library is distributed in the hope that it will be useful, but
    1.1  mrg WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
1.1.1.3  mrg or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
1.1.1.3  mrg for more details.
    1.1  mrg
1.1.1.3  mrg You should have received copies of the GNU General Public License and the
1.1.1.3  mrg GNU Lesser General Public License along with the GNU MP Library.  If not,
1.1.1.3  mrg see https://www.gnu.org/licenses/.  */
    1.1  mrg
    1.1  mrg #include "gmp-impl.h"
    1.1  mrg #include "longlong.h"
    1.1  mrg
1.1.1.3  mrg /* FIXME: The iterative version splits the operand in two slightly unbalanced
    1.1  mrg    parts, the use of log_2 (or counting the bits) underestimate the maximum
    1.1  mrg    number of iterations.  */
    1.1  mrg
    1.1  mrg #if TUNE_PROGRAM_BUILD
    1.1  mrg #define NPOWS \
    1.1  mrg  ((sizeof(mp_size_t) > 6 ? 48 : 8*sizeof(mp_size_t)))
    1.1  mrg #define MAYBE_dcpi1_divappr   1
    1.1  mrg #else
    1.1  mrg #define NPOWS \
    1.1  mrg  ((sizeof(mp_size_t) > 6 ? 48 : 8*sizeof(mp_size_t)) - LOG2C (INV_NEWTON_THRESHOLD))
    1.1  mrg #define MAYBE_dcpi1_divappr \
    1.1  mrg   (INV_NEWTON_THRESHOLD < DC_DIVAPPR_Q_THRESHOLD)
    1.1  mrg #if (INV_NEWTON_THRESHOLD > INV_MULMOD_BNM1_THRESHOLD) && \
    1.1  mrg     (INV_APPR_THRESHOLD > INV_MULMOD_BNM1_THRESHOLD)
    1.1  mrg #undef  INV_MULMOD_BNM1_THRESHOLD
    1.1  mrg #define INV_MULMOD_BNM1_THRESHOLD 0 /* always when Newton */
    1.1  mrg #endif
    1.1  mrg #endif
    1.1  mrg
    1.1  mrg /* All the three functions mpn{,_bc,_ni}_invertappr (ip, dp, n, scratch), take
    1.1  mrg    the strictly normalised value {dp,n} (i.e., most significant bit must be set)
    1.1  mrg    as an input, and compute {ip,n}: the approximate reciprocal of {dp,n}.
    1.1  mrg
    1.1  mrg    Let e = mpn*_invertappr (ip, dp, n, scratch) be the returned value; the
    1.1  mrg    following conditions are satisfied by the output:
    1.1  mrg      0 <= e <= 1;
    1.1  mrg      {dp,n}*(B^n+{ip,n}) < B^{2n} <= {dp,n}*(B^n+{ip,n}+1+e) .
    1.1  mrg    I.e. e=0 means that the result {ip,n} equals the one given by mpn_invert.
    1.1  mrg 	e=1 means that the result _may_ be one less than expected.
    1.1  mrg
    1.1  mrg    The _bc version returns e=1 most of the time.
    1.1  mrg    The _ni version should return e=0 most of the time; only about 1% of
    1.1  mrg    possible random input should give e=1.
    1.1  mrg
    1.1  mrg    When the strict result is needed, i.e., e=0 in the relation above:
    1.1  mrg      {dp,n}*(B^n+{ip,n}) < B^{2n} <= {dp,n}*(B^n+{ip,n}+1) ;
    1.1  mrg    the function mpn_invert (ip, dp, n, scratch) should be used instead.  */
    1.1  mrg
1.1.1.3  mrg /* Maximum scratch needed by this branch (at xp): 2*n */
    1.1  mrg static mp_limb_t
1.1.1.3  mrg mpn_bc_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr xp)
    1.1  mrg {
    1.1  mrg   ASSERT (n > 0);
    1.1  mrg   ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
    1.1  mrg   ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
1.1.1.3  mrg   ASSERT (! MPN_OVERLAP_P (ip, n, xp, mpn_invertappr_itch(n)));
1.1.1.3  mrg   ASSERT (! MPN_OVERLAP_P (dp, n, xp, mpn_invertappr_itch(n)));
    1.1  mrg
    1.1  mrg   /* Compute a base value of r limbs. */
    1.1  mrg   if (n == 1)
    1.1  mrg     invert_limb (*ip, *dp);
    1.1  mrg   else {
1.1.1.3  mrg     /* n > 1 here */
1.1.1.4  mrg     MPN_FILL (xp, n, GMP_NUMB_MAX);
    1.1  mrg     mpn_com (xp + n, dp, n);
    1.1  mrg
    1.1  mrg     /* Now xp contains B^2n - {dp,n}*B^n - 1 */
    1.1  mrg
    1.1  mrg     /* FIXME: if mpn_*pi1_divappr_q handles n==2, use it! */
    1.1  mrg     if (n == 2) {
    1.1  mrg       mpn_divrem_2 (ip, 0, xp, 4, dp);
    1.1  mrg     } else {
    1.1  mrg       gmp_pi1_t inv;
    1.1  mrg       invert_pi1 (inv, dp[n-1], dp[n-2]);
    1.1  mrg       if (! MAYBE_dcpi1_divappr
    1.1  mrg 	  || BELOW_THRESHOLD (n, DC_DIVAPPR_Q_THRESHOLD))
    1.1  mrg 	mpn_sbpi1_divappr_q (ip, xp, 2 * n, dp, n, inv.inv32);
    1.1  mrg       else
    1.1  mrg 	mpn_dcpi1_divappr_q (ip, xp, 2 * n, dp, n, &inv);
1.1.1.3  mrg       MPN_DECR_U(ip, n, CNST_LIMB (1));
    1.1  mrg       return 1;
    1.1  mrg     }
    1.1  mrg   }
    1.1  mrg   return 0;
    1.1  mrg }
    1.1  mrg
    1.1  mrg /* mpn_ni_invertappr: computes the approximate reciprocal using Newton's
    1.1  mrg    iterations (at least one).
    1.1  mrg
    1.1  mrg    Inspired by Algorithm "ApproximateReciprocal", published in "Modern Computer
    1.1  mrg    Arithmetic" by Richard P. Brent and Paul Zimmermann, algorithm 3.5, page 121
    1.1  mrg    in version 0.4 of the book.
    1.1  mrg
    1.1  mrg    Some adaptations were introduced, to allow product mod B^m-1 and return the
    1.1  mrg    value e.
    1.1  mrg
1.1.1.3  mrg    We introduced a correction in such a way that "the value of
1.1.1.3  mrg    B^{n+h}-T computed at step 8 cannot exceed B^n-1" (the book reads
1.1.1.3  mrg    "2B^n-1").
1.1.1.3  mrg
1.1.1.3  mrg    Maximum scratch needed by this branch <= 2*n, but have to fit 3*rn
1.1.1.3  mrg    in the scratch, i.e. 3*rn <= 2*n: we require n>4.
    1.1  mrg
    1.1  mrg    We use a wrapped product modulo B^m-1.  NOTE: is there any normalisation
    1.1  mrg    problem for the [0] class?  It shouldn't: we compute 2*|A*X_h - B^{n+h}| <
    1.1  mrg    B^m-1.  We may get [0] if and only if we get AX_h = B^{n+h}.  This can
    1.1  mrg    happen only if A=B^{n}/2, but this implies X_h = B^{h}*2-1 i.e., AX_h =
    1.1  mrg    B^{n+h} - A, then we get into the "negative" branch, where X_h is not
    1.1  mrg    incremented (because A < B^n).
    1.1  mrg
    1.1  mrg    FIXME: the scratch for mulmod_bnm1 does not currently fit in the scratch, it
1.1.1.3  mrg    is allocated apart.
1.1.1.3  mrg  */
    1.1  mrg
    1.1  mrg mp_limb_t
    1.1  mrg mpn_ni_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
    1.1  mrg {
    1.1  mrg   mp_limb_t cy;
    1.1  mrg   mp_size_t rn, mn;
    1.1  mrg   mp_size_t sizes[NPOWS], *sizp;
    1.1  mrg   mp_ptr tp;
    1.1  mrg   TMP_DECL;
1.1.1.3  mrg #define xp scratch
    1.1  mrg
1.1.1.3  mrg   ASSERT (n > 4);
    1.1  mrg   ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
    1.1  mrg   ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
    1.1  mrg   ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n)));
    1.1  mrg   ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n)));
    1.1  mrg
    1.1  mrg   /* Compute the computation precisions from highest to lowest, leaving the
    1.1  mrg      base case size in 'rn'.  */
    1.1  mrg   sizp = sizes;
    1.1  mrg   rn = n;
    1.1  mrg   do {
    1.1  mrg     *sizp = rn;
1.1.1.3  mrg     rn = (rn >> 1) + 1;
1.1.1.3  mrg     ++sizp;
    1.1  mrg   } while (ABOVE_THRESHOLD (rn, INV_NEWTON_THRESHOLD));
    1.1  mrg
    1.1  mrg   /* We search the inverse of 0.{dp,n}, we compute it as 1.{ip,n} */
    1.1  mrg   dp += n;
    1.1  mrg   ip += n;
    1.1  mrg
    1.1  mrg   /* Compute a base value of rn limbs. */
    1.1  mrg   mpn_bc_invertappr (ip - rn, dp - rn, rn, scratch);
    1.1  mrg
    1.1  mrg   TMP_MARK;
    1.1  mrg
    1.1  mrg   if (ABOVE_THRESHOLD (n, INV_MULMOD_BNM1_THRESHOLD))
    1.1  mrg     {
    1.1  mrg       mn = mpn_mulmod_bnm1_next_size (n + 1);
    1.1  mrg       tp = TMP_ALLOC_LIMBS (mpn_mulmod_bnm1_itch (mn, n, (n >> 1) + 1));
    1.1  mrg     }
    1.1  mrg   /* Use Newton's iterations to get the desired precision.*/
    1.1  mrg
    1.1  mrg   while (1) {
    1.1  mrg     n = *--sizp;
    1.1  mrg     /*
    1.1  mrg       v    n  v
    1.1  mrg       +----+--+
    1.1  mrg       ^ rn ^
    1.1  mrg     */
    1.1  mrg
    1.1  mrg     /* Compute i_jd . */
    1.1  mrg     if (BELOW_THRESHOLD (n, INV_MULMOD_BNM1_THRESHOLD)
    1.1  mrg 	|| ((mn = mpn_mulmod_bnm1_next_size (n + 1)) > (n + rn))) {
    1.1  mrg       /* FIXME: We do only need {xp,n+1}*/
    1.1  mrg       mpn_mul (xp, dp - n, n, ip - rn, rn);
    1.1  mrg       mpn_add_n (xp + rn, xp + rn, dp - n, n - rn + 1);
1.1.1.3  mrg       cy = CNST_LIMB(1); /* Remember we truncated, Mod B^(n+1) */
1.1.1.3  mrg       /* We computed (truncated) {xp,n+1} <- 1.{ip,rn} * 0.{dp,n} */
1.1.1.3  mrg     } else { /* Use B^mn-1 wraparound */
    1.1  mrg       mpn_mulmod_bnm1 (xp, mn, dp - n, n, ip - rn, rn, tp);
    1.1  mrg       /* We computed {xp,mn} <- {ip,rn} * {dp,n} mod (B^mn-1) */
    1.1  mrg       /* We know that 2*|ip*dp + dp*B^rn - B^{rn+n}| < B^mn-1 */
    1.1  mrg       /* Add dp*B^rn mod (B^mn-1) */
    1.1  mrg       ASSERT (n >= mn - rn);
1.1.1.3  mrg       cy = mpn_add_n (xp + rn, xp + rn, dp - n, mn - rn);
1.1.1.3  mrg       cy = mpn_add_nc (xp, xp, dp - (n - (mn - rn)), n - (mn - rn), cy);
1.1.1.3  mrg       /* Subtract B^{rn+n}, maybe only compensate the carry*/
1.1.1.3  mrg       xp[mn] = CNST_LIMB (1); /* set a limit for DECR_U */
1.1.1.3  mrg       MPN_DECR_U (xp + rn + n - mn, 2 * mn + 1 - rn - n, CNST_LIMB (1) - cy);
1.1.1.3  mrg       MPN_DECR_U (xp, mn, CNST_LIMB (1) - xp[mn]); /* if DECR_U eroded xp[mn] */
1.1.1.3  mrg       cy = CNST_LIMB(0); /* Remember we are working Mod B^mn-1 */
    1.1  mrg     }
    1.1  mrg
1.1.1.3  mrg     if (xp[n] < CNST_LIMB (2)) { /* "positive" residue class */
1.1.1.3  mrg       cy = xp[n]; /* 0 <= cy <= 1 here. */
1.1.1.3  mrg #if HAVE_NATIVE_mpn_sublsh1_n
1.1.1.3  mrg       if (cy++) {
1.1.1.3  mrg 	if (mpn_cmp (xp, dp - n, n) > 0) {
1.1.1.3  mrg 	  mp_limb_t chk;
1.1.1.3  mrg 	  chk = mpn_sublsh1_n (xp, xp, dp - n, n);
1.1.1.3  mrg 	  ASSERT (chk == xp[n]);
1.1.1.3  mrg 	  ++ cy;
1.1.1.3  mrg 	} else
1.1.1.3  mrg 	  ASSERT_CARRY (mpn_sub_n (xp, xp, dp - n, n));
    1.1  mrg       }
1.1.1.3  mrg #else /* no mpn_sublsh1_n*/
1.1.1.3  mrg       if (cy++ && !mpn_sub_n (xp, xp, dp - n, n)) {
    1.1  mrg 	ASSERT_CARRY (mpn_sub_n (xp, xp, dp - n, n));
1.1.1.3  mrg 	++cy;
1.1.1.3  mrg       }
1.1.1.3  mrg #endif
1.1.1.3  mrg       /* 1 <= cy <= 3 here. */
1.1.1.3  mrg #if HAVE_NATIVE_mpn_rsblsh1_n
1.1.1.3  mrg       if (mpn_cmp (xp, dp - n, n) > 0) {
1.1.1.3  mrg 	ASSERT_NOCARRY (mpn_rsblsh1_n (xp + n, xp, dp - n, n));
1.1.1.3  mrg 	++cy;
1.1.1.3  mrg       } else
1.1.1.3  mrg 	ASSERT_NOCARRY (mpn_sub_nc (xp + 2 * n - rn, dp - rn, xp + n - rn, rn, mpn_cmp (xp, dp - n, n - rn) > 0));
1.1.1.3  mrg #else /* no mpn_rsblsh1_n*/
1.1.1.3  mrg       if (mpn_cmp (xp, dp - n, n) > 0) {
1.1.1.3  mrg 	ASSERT_NOCARRY (mpn_sub_n (xp, xp, dp - n, n));
1.1.1.3  mrg 	++cy;
    1.1  mrg       }
1.1.1.3  mrg       ASSERT_NOCARRY (mpn_sub_nc (xp + 2 * n - rn, dp - rn, xp + n - rn, rn, mpn_cmp (xp, dp - n, n - rn) > 0));
    1.1  mrg #endif
1.1.1.3  mrg       MPN_DECR_U(ip - rn, rn, cy); /* 1 <= cy <= 4 here. */
1.1.1.3  mrg     } else { /* "negative" residue class */
1.1.1.3  mrg       ASSERT (xp[n] >= GMP_NUMB_MAX - CNST_LIMB(1));
1.1.1.3  mrg       MPN_DECR_U(xp, n + 1, cy);
1.1.1.3  mrg       if (xp[n] != GMP_NUMB_MAX) {
1.1.1.3  mrg 	MPN_INCR_U(ip - rn, rn, CNST_LIMB (1));
1.1.1.3  mrg 	ASSERT_CARRY (mpn_add_n (xp, xp, dp - n, n));
1.1.1.3  mrg       }
1.1.1.3  mrg       mpn_com (xp + 2 * n - rn, xp + n - rn, rn);
    1.1  mrg     }
    1.1  mrg
1.1.1.3  mrg     /* Compute x_ju_j. FIXME:We need {xp+rn,rn}, mulhi? */
1.1.1.3  mrg     mpn_mul_n (xp, xp + 2 * n - rn, ip - rn, rn);
1.1.1.3  mrg     cy = mpn_add_n (xp + rn, xp + rn, xp + 2 * n - rn, 2 * rn - n);
1.1.1.3  mrg     cy = mpn_add_nc (ip - n, xp + 3 * rn - n, xp + n + rn, n - rn, cy);
1.1.1.3  mrg     MPN_INCR_U (ip - rn, rn, cy);
    1.1  mrg     if (sizp == sizes) { /* Get out of the cycle */
    1.1  mrg       /* Check for possible carry propagation from below. */
1.1.1.3  mrg       cy = xp[3 * rn - n - 1] > GMP_NUMB_MAX - CNST_LIMB (7); /* Be conservative. */
1.1.1.3  mrg       /*    cy = mpn_add_1 (xp + rn, xp + rn, 2*rn - n, 4); */
    1.1  mrg       break;
    1.1  mrg     }
    1.1  mrg     rn = n;
    1.1  mrg   }
    1.1  mrg   TMP_FREE;
    1.1  mrg
    1.1  mrg   return cy;
1.1.1.3  mrg #undef xp
    1.1  mrg }
    1.1  mrg
    1.1  mrg mp_limb_t
    1.1  mrg mpn_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
    1.1  mrg {
    1.1  mrg   ASSERT (n > 0);
    1.1  mrg   ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
    1.1  mrg   ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
    1.1  mrg   ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n)));
    1.1  mrg   ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n)));
    1.1  mrg
    1.1  mrg   if (BELOW_THRESHOLD (n, INV_NEWTON_THRESHOLD))
1.1.1.3  mrg     return mpn_bc_invertappr (ip, dp, n, scratch);
    1.1  mrg   else
1.1.1.3  mrg     return mpn_ni_invertappr (ip, dp, n, scratch);
    1.1  mrg }