/* mpn_mu_div_qr, mpn_preinv_mu_div_qr.

   Compute Q = floor(N / D) and R = N-QD.  N is nn limbs and D is dn limbs and
   must be normalized, and Q must be nn-dn limbs.  The requirement that Q is
   nn-dn limbs (and not nn-dn+1 limbs) was put in place in order to allow us to
   let N be unmodified during the operation.

   Contributed to the GNU project by Torbjorn Granlund.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.

Copyright 2005-2007, 2009, 2010 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.
If not, 40 1.1.1.3 mrg see https://www.gnu.org/licenses/. */ 41 1.1 mrg 42 1.1 mrg 43 1.1 mrg /* 44 1.1 mrg The idea of the algorithm used herein is to compute a smaller inverted value 45 1.1 mrg than used in the standard Barrett algorithm, and thus save time in the 46 1.1 mrg Newton iterations, and pay just a small price when using the inverted value 47 1.1 mrg for developing quotient bits. This algorithm was presented at ICMS 2006. 48 1.1 mrg */ 49 1.1 mrg 50 1.1 mrg /* CAUTION: This code and the code in mu_divappr_q.c should be edited in sync. 51 1.1 mrg 52 1.1 mrg Things to work on: 53 1.1 mrg 54 1.1 mrg * This isn't optimal when the quotient isn't needed, as it might take a lot 55 1.1 mrg of space. The computation is always needed, though, so there is no time to 56 1.1 mrg save with special code. 57 1.1 mrg 58 1.1 mrg * The itch/scratch scheme isn't perhaps such a good idea as it once seemed, 59 1.1 mrg demonstrated by the fact that the mpn_invertappr function's scratch needs 60 1.1 mrg mean that we need to keep a large allocation long after it is needed. 61 1.1 mrg Things are worse as mpn_mul_fft does not accept any scratch parameter, 62 1.1 mrg which means we'll have a large memory hole while in mpn_mul_fft. In 63 1.1 mrg general, a peak scratch need in the beginning of a function isn't 64 1.1 mrg well-handled by the itch/scratch scheme. 65 1.1 mrg */ 66 1.1 mrg 67 1.1 mrg #ifdef STAT 68 1.1 mrg #undef STAT 69 1.1 mrg #define STAT(x) x 70 1.1 mrg #else 71 1.1 mrg #define STAT(x) 72 1.1 mrg #endif 73 1.1 mrg 74 1.1 mrg #include <stdlib.h> /* for NULL */ 75 1.1 mrg #include "gmp-impl.h" 76 1.1 mrg 77 1.1 mrg 78 1.1 mrg /* FIXME: The MU_DIV_QR_SKEW_THRESHOLD was not analysed properly. It gives a 79 1.1 mrg speedup according to old measurements, but does the decision mechanism 80 1.1 mrg really make sense? It seem like the quotient between dn and qn might be 81 1.1 mrg what we really should be checking. 
*/ 82 1.1 mrg #ifndef MU_DIV_QR_SKEW_THRESHOLD 83 1.1 mrg #define MU_DIV_QR_SKEW_THRESHOLD 100 84 1.1 mrg #endif 85 1.1 mrg 86 1.1 mrg #ifdef CHECK /* FIXME: Enable in minithres */ 87 1.1 mrg #undef MU_DIV_QR_SKEW_THRESHOLD 88 1.1 mrg #define MU_DIV_QR_SKEW_THRESHOLD 1 89 1.1 mrg #endif 90 1.1 mrg 91 1.1 mrg 92 1.1 mrg static mp_limb_t mpn_mu_div_qr2 (mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t, mp_ptr); 93 1.1.1.4 mrg static mp_size_t mpn_mu_div_qr_choose_in (mp_size_t, mp_size_t, int); 94 1.1 mrg 95 1.1 mrg 96 1.1 mrg mp_limb_t 97 1.1 mrg mpn_mu_div_qr (mp_ptr qp, 98 1.1 mrg mp_ptr rp, 99 1.1 mrg mp_srcptr np, 100 1.1 mrg mp_size_t nn, 101 1.1 mrg mp_srcptr dp, 102 1.1 mrg mp_size_t dn, 103 1.1 mrg mp_ptr scratch) 104 1.1 mrg { 105 1.1 mrg mp_size_t qn; 106 1.1 mrg mp_limb_t cy, qh; 107 1.1 mrg 108 1.1 mrg qn = nn - dn; 109 1.1 mrg if (qn + MU_DIV_QR_SKEW_THRESHOLD < dn) 110 1.1 mrg { 111 1.1 mrg /* |______________|_ign_first__| dividend nn 112 1.1 mrg |_______|_ign_first__| divisor dn 113 1.1 mrg 114 1.1 mrg |______| quotient (prel) qn 115 1.1 mrg 116 1.1 mrg |___________________| quotient * ignored-divisor-part dn-1 117 1.1 mrg */ 118 1.1 mrg 119 1.1 mrg /* Compute a preliminary quotient and a partial remainder by dividing the 120 1.1 mrg most significant limbs of each operand. */ 121 1.1 mrg qh = mpn_mu_div_qr2 (qp, rp + nn - (2 * qn + 1), 122 1.1 mrg np + nn - (2 * qn + 1), 2 * qn + 1, 123 1.1 mrg dp + dn - (qn + 1), qn + 1, 124 1.1 mrg scratch); 125 1.1 mrg 126 1.1 mrg /* Multiply the quotient by the divisor limbs ignored above. 
*/ 127 1.1 mrg if (dn - (qn + 1) > qn) 128 1.1 mrg mpn_mul (scratch, dp, dn - (qn + 1), qp, qn); /* prod is dn-1 limbs */ 129 1.1 mrg else 130 1.1 mrg mpn_mul (scratch, qp, qn, dp, dn - (qn + 1)); /* prod is dn-1 limbs */ 131 1.1 mrg 132 1.1 mrg if (qh) 133 1.1 mrg cy = mpn_add_n (scratch + qn, scratch + qn, dp, dn - (qn + 1)); 134 1.1 mrg else 135 1.1 mrg cy = 0; 136 1.1 mrg scratch[dn - 1] = cy; 137 1.1 mrg 138 1.1 mrg cy = mpn_sub_n (rp, np, scratch, nn - (2 * qn + 1)); 139 1.1 mrg cy = mpn_sub_nc (rp + nn - (2 * qn + 1), 140 1.1 mrg rp + nn - (2 * qn + 1), 141 1.1 mrg scratch + nn - (2 * qn + 1), 142 1.1 mrg qn + 1, cy); 143 1.1 mrg if (cy) 144 1.1 mrg { 145 1.1 mrg qh -= mpn_sub_1 (qp, qp, qn, 1); 146 1.1 mrg mpn_add_n (rp, rp, dp, dn); 147 1.1 mrg } 148 1.1 mrg } 149 1.1 mrg else 150 1.1 mrg { 151 1.1 mrg qh = mpn_mu_div_qr2 (qp, rp, np, nn, dp, dn, scratch); 152 1.1 mrg } 153 1.1 mrg 154 1.1 mrg return qh; 155 1.1 mrg } 156 1.1 mrg 157 1.1 mrg static mp_limb_t 158 1.1 mrg mpn_mu_div_qr2 (mp_ptr qp, 159 1.1 mrg mp_ptr rp, 160 1.1 mrg mp_srcptr np, 161 1.1 mrg mp_size_t nn, 162 1.1 mrg mp_srcptr dp, 163 1.1 mrg mp_size_t dn, 164 1.1 mrg mp_ptr scratch) 165 1.1 mrg { 166 1.1 mrg mp_size_t qn, in; 167 1.1 mrg mp_limb_t cy, qh; 168 1.1 mrg mp_ptr ip, tp; 169 1.1 mrg 170 1.1 mrg ASSERT (dn > 1); 171 1.1 mrg 172 1.1 mrg qn = nn - dn; 173 1.1 mrg 174 1.1 mrg /* Compute the inverse size. */ 175 1.1 mrg in = mpn_mu_div_qr_choose_in (qn, dn, 0); 176 1.1 mrg ASSERT (in <= dn); 177 1.1 mrg 178 1.1 mrg #if 1 179 1.1 mrg /* This alternative inverse computation method gets slightly more accurate 180 1.1 mrg results. FIXMEs: (1) Temp allocation needs not analysed (2) itch function 181 1.1 mrg not adapted (3) mpn_invertappr scratch needs not met. 
*/ 182 1.1 mrg ip = scratch; 183 1.1 mrg tp = scratch + in + 1; 184 1.1 mrg 185 1.1 mrg /* compute an approximate inverse on (in+1) limbs */ 186 1.1 mrg if (dn == in) 187 1.1 mrg { 188 1.1 mrg MPN_COPY (tp + 1, dp, in); 189 1.1 mrg tp[0] = 1; 190 1.1.1.3 mrg mpn_invertappr (ip, tp, in + 1, tp + in + 1); 191 1.1 mrg MPN_COPY_INCR (ip, ip + 1, in); 192 1.1 mrg } 193 1.1 mrg else 194 1.1 mrg { 195 1.1 mrg cy = mpn_add_1 (tp, dp + dn - (in + 1), in + 1, 1); 196 1.1 mrg if (UNLIKELY (cy != 0)) 197 1.1 mrg MPN_ZERO (ip, in); 198 1.1 mrg else 199 1.1 mrg { 200 1.1.1.3 mrg mpn_invertappr (ip, tp, in + 1, tp + in + 1); 201 1.1 mrg MPN_COPY_INCR (ip, ip + 1, in); 202 1.1 mrg } 203 1.1 mrg } 204 1.1 mrg #else 205 1.1 mrg /* This older inverse computation method gets slightly worse results than the 206 1.1 mrg one above. */ 207 1.1 mrg ip = scratch; 208 1.1 mrg tp = scratch + in; 209 1.1 mrg 210 1.1 mrg /* Compute inverse of D to in+1 limbs, then round to 'in' limbs. Ideally the 211 1.1 mrg inversion function should do this automatically. 
*/ 212 1.1 mrg if (dn == in) 213 1.1 mrg { 214 1.1 mrg tp[in + 1] = 0; 215 1.1 mrg MPN_COPY (tp + in + 2, dp, in); 216 1.1 mrg mpn_invertappr (tp, tp + in + 1, in + 1, NULL); 217 1.1 mrg } 218 1.1 mrg else 219 1.1 mrg { 220 1.1 mrg mpn_invertappr (tp, dp + dn - (in + 1), in + 1, NULL); 221 1.1 mrg } 222 1.1 mrg cy = mpn_sub_1 (tp, tp, in + 1, GMP_NUMB_HIGHBIT); 223 1.1 mrg if (UNLIKELY (cy != 0)) 224 1.1 mrg MPN_ZERO (tp + 1, in); 225 1.1 mrg MPN_COPY (ip, tp + 1, in); 226 1.1 mrg #endif 227 1.1 mrg 228 1.1 mrg qh = mpn_preinv_mu_div_qr (qp, rp, np, nn, dp, dn, ip, in, scratch + in); 229 1.1 mrg 230 1.1 mrg return qh; 231 1.1 mrg } 232 1.1 mrg 233 1.1 mrg mp_limb_t 234 1.1 mrg mpn_preinv_mu_div_qr (mp_ptr qp, 235 1.1 mrg mp_ptr rp, 236 1.1 mrg mp_srcptr np, 237 1.1 mrg mp_size_t nn, 238 1.1 mrg mp_srcptr dp, 239 1.1 mrg mp_size_t dn, 240 1.1 mrg mp_srcptr ip, 241 1.1 mrg mp_size_t in, 242 1.1 mrg mp_ptr scratch) 243 1.1 mrg { 244 1.1 mrg mp_size_t qn; 245 1.1 mrg mp_limb_t cy, cx, qh; 246 1.1 mrg mp_limb_t r; 247 1.1 mrg mp_size_t tn, wn; 248 1.1 mrg 249 1.1 mrg #define tp scratch 250 1.1 mrg #define scratch_out (scratch + tn) 251 1.1 mrg 252 1.1 mrg qn = nn - dn; 253 1.1 mrg 254 1.1 mrg np += qn; 255 1.1 mrg qp += qn; 256 1.1 mrg 257 1.1 mrg qh = mpn_cmp (np, dp, dn) >= 0; 258 1.1 mrg if (qh != 0) 259 1.1 mrg mpn_sub_n (rp, np, dp, dn); 260 1.1 mrg else 261 1.1.1.2 mrg MPN_COPY_INCR (rp, np, dn); 262 1.1 mrg 263 1.1.1.3 mrg /* if (qn == 0) */ /* The while below handles this case */ 264 1.1.1.3 mrg /* return qh; */ /* Degenerate use. Should we allow this? */ 265 1.1 mrg 266 1.1 mrg while (qn > 0) 267 1.1 mrg { 268 1.1 mrg if (qn < in) 269 1.1 mrg { 270 1.1 mrg ip += in - qn; 271 1.1 mrg in = qn; 272 1.1 mrg } 273 1.1 mrg np -= in; 274 1.1 mrg qp -= in; 275 1.1 mrg 276 1.1 mrg /* Compute the next block of quotient limbs by multiplying the inverse I 277 1.1 mrg by the upper part of the partial remainder R. 
*/ 278 1.1 mrg mpn_mul_n (tp, rp + dn - in, ip, in); /* mulhi */ 279 1.1 mrg cy = mpn_add_n (qp, tp + in, rp + dn - in, in); /* I's msb implicit */ 280 1.1 mrg ASSERT_ALWAYS (cy == 0); 281 1.1 mrg 282 1.1 mrg qn -= in; 283 1.1 mrg 284 1.1 mrg /* Compute the product of the quotient block and the divisor D, to be 285 1.1 mrg subtracted from the partial remainder combined with new limbs from the 286 1.1 mrg dividend N. We only really need the low dn+1 limbs. */ 287 1.1 mrg 288 1.1 mrg if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD)) 289 1.1 mrg mpn_mul (tp, dp, dn, qp, in); /* dn+in limbs, high 'in' cancels */ 290 1.1 mrg else 291 1.1 mrg { 292 1.1 mrg tn = mpn_mulmod_bnm1_next_size (dn + 1); 293 1.1 mrg mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out); 294 1.1 mrg wn = dn + in - tn; /* number of wrapped limbs */ 295 1.1 mrg if (wn > 0) 296 1.1 mrg { 297 1.1 mrg cy = mpn_sub_n (tp, tp, rp + dn - wn, wn); 298 1.1 mrg cy = mpn_sub_1 (tp + wn, tp + wn, tn - wn, cy); 299 1.1 mrg cx = mpn_cmp (rp + dn - in, tp + dn, tn - dn) < 0; 300 1.1 mrg ASSERT_ALWAYS (cx >= cy); 301 1.1 mrg mpn_incr_u (tp, cx - cy); 302 1.1 mrg } 303 1.1 mrg } 304 1.1 mrg 305 1.1 mrg r = rp[dn - in] - tp[dn]; 306 1.1 mrg 307 1.1 mrg /* Subtract the product from the partial remainder combined with new 308 1.1 mrg limbs from the dividend N, generating a new partial remainder R. */ 309 1.1 mrg if (dn != in) 310 1.1 mrg { 311 1.1 mrg cy = mpn_sub_n (tp, np, tp, in); /* get next 'in' limbs from N */ 312 1.1 mrg cy = mpn_sub_nc (tp + in, rp, tp + in, dn - in, cy); 313 1.1 mrg MPN_COPY (rp, tp, dn); /* FIXME: try to avoid this */ 314 1.1 mrg } 315 1.1 mrg else 316 1.1 mrg { 317 1.1 mrg cy = mpn_sub_n (rp, np, tp, in); /* get next 'in' limbs from N */ 318 1.1 mrg } 319 1.1 mrg 320 1.1 mrg STAT (int i; int err = 0; 321 1.1 mrg static int errarr[5]; static int err_rec; static int tot); 322 1.1 mrg 323 1.1 mrg /* Check the remainder R and adjust the quotient as needed. 
*/ 324 1.1 mrg r -= cy; 325 1.1 mrg while (r != 0) 326 1.1 mrg { 327 1.1 mrg /* We loop 0 times with about 69% probability, 1 time with about 31% 328 1.1 mrg probability, 2 times with about 0.6% probability, if inverse is 329 1.1 mrg computed as recommended. */ 330 1.1 mrg mpn_incr_u (qp, 1); 331 1.1 mrg cy = mpn_sub_n (rp, rp, dp, dn); 332 1.1 mrg r -= cy; 333 1.1 mrg STAT (err++); 334 1.1 mrg } 335 1.1 mrg if (mpn_cmp (rp, dp, dn) >= 0) 336 1.1 mrg { 337 1.1 mrg /* This is executed with about 76% probability. */ 338 1.1 mrg mpn_incr_u (qp, 1); 339 1.1 mrg cy = mpn_sub_n (rp, rp, dp, dn); 340 1.1 mrg STAT (err++); 341 1.1 mrg } 342 1.1 mrg 343 1.1 mrg STAT ( 344 1.1 mrg tot++; 345 1.1 mrg errarr[err]++; 346 1.1 mrg if (err > err_rec) 347 1.1 mrg err_rec = err; 348 1.1 mrg if (tot % 0x10000 == 0) 349 1.1 mrg { 350 1.1 mrg for (i = 0; i <= err_rec; i++) 351 1.1 mrg printf (" %d(%.1f%%)", errarr[i], 100.0*errarr[i]/tot); 352 1.1 mrg printf ("\n"); 353 1.1 mrg } 354 1.1 mrg ); 355 1.1 mrg } 356 1.1 mrg 357 1.1 mrg return qh; 358 1.1 mrg } 359 1.1 mrg 360 1.1 mrg /* In case k=0 (automatic choice), we distinguish 3 cases: 361 1.1 mrg (a) dn < qn: in = ceil(qn / ceil(qn/dn)) 362 1.1 mrg (b) dn/3 < qn <= dn: in = ceil(qn / 2) 363 1.1 mrg (c) qn < dn/3: in = qn 364 1.1 mrg In all cases we have in <= dn. 365 1.1 mrg */ 366 1.1.1.4 mrg static mp_size_t 367 1.1 mrg mpn_mu_div_qr_choose_in (mp_size_t qn, mp_size_t dn, int k) 368 1.1 mrg { 369 1.1 mrg mp_size_t in; 370 1.1 mrg 371 1.1 mrg if (k == 0) 372 1.1 mrg { 373 1.1 mrg mp_size_t b; 374 1.1 mrg if (qn > dn) 375 1.1 mrg { 376 1.1 mrg /* Compute an inverse size that is a nice partition of the quotient. 
*/ 377 1.1 mrg b = (qn - 1) / dn + 1; /* ceil(qn/dn), number of blocks */ 378 1.1 mrg in = (qn - 1) / b + 1; /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */ 379 1.1 mrg } 380 1.1 mrg else if (3 * qn > dn) 381 1.1 mrg { 382 1.1 mrg in = (qn - 1) / 2 + 1; /* b = 2 */ 383 1.1 mrg } 384 1.1 mrg else 385 1.1 mrg { 386 1.1 mrg in = (qn - 1) / 1 + 1; /* b = 1 */ 387 1.1 mrg } 388 1.1 mrg } 389 1.1 mrg else 390 1.1 mrg { 391 1.1 mrg mp_size_t xn; 392 1.1 mrg xn = MIN (dn, qn); 393 1.1 mrg in = (xn - 1) / k + 1; 394 1.1 mrg } 395 1.1 mrg 396 1.1 mrg return in; 397 1.1 mrg } 398 1.1 mrg 399 1.1 mrg mp_size_t 400 1.1 mrg mpn_mu_div_qr_itch (mp_size_t nn, mp_size_t dn, int mua_k) 401 1.1 mrg { 402 1.1 mrg mp_size_t in = mpn_mu_div_qr_choose_in (nn - dn, dn, mua_k); 403 1.1.1.3 mrg mp_size_t itch_preinv = mpn_preinv_mu_div_qr_itch (nn, dn, in); 404 1.1.1.3 mrg mp_size_t itch_invapp = mpn_invertappr_itch (in + 1) + in + 2; /* 3in + 4 */ 405 1.1 mrg 406 1.1.1.3 mrg ASSERT (itch_preinv >= itch_invapp); 407 1.1.1.3 mrg return in + MAX (itch_invapp, itch_preinv); 408 1.1 mrg } 409 1.1 mrg 410 1.1 mrg mp_size_t 411 1.1 mrg mpn_preinv_mu_div_qr_itch (mp_size_t nn, mp_size_t dn, mp_size_t in) 412 1.1 mrg { 413 1.1 mrg mp_size_t itch_local = mpn_mulmod_bnm1_next_size (dn + 1); 414 1.1 mrg mp_size_t itch_out = mpn_mulmod_bnm1_itch (itch_local, dn, in); 415 1.1 mrg 416 1.1 mrg return itch_local + itch_out; 417 1.1 mrg } 418