Home | History | Annotate | Line # | Download | only in generic
mu_divappr_q.c revision 1.1
      1 /* mpn_mu_divappr_q, mpn_preinv_mu_divappr_q.
      2 
      3    Compute Q = floor(N / D) + e.  N is nn limbs, D is dn limbs and must be
      4    normalized, and Q must be nn-dn limbs, 0 <= e <= 4.  The requirement that Q
      5    is nn-dn limbs (and not nn-dn+1 limbs) was put in place in order to allow us
      6    to let N be unmodified during the operation.
      7 
      8    Contributed to the GNU project by Torbjorn Granlund.
      9 
     10    THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
     11    SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
     12    GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
     13 
     14 Copyright 2005, 2006, 2007, 2009, 2010 Free Software Foundation, Inc.
     15 
     16 This file is part of the GNU MP Library.
     17 
     18 The GNU MP Library is free software; you can redistribute it and/or modify
     19 it under the terms of the GNU Lesser General Public License as published by
     20 the Free Software Foundation; either version 3 of the License, or (at your
     21 option) any later version.
     22 
     23 The GNU MP Library is distributed in the hope that it will be useful, but
     24 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     25 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
     26 License for more details.
     27 
     28 You should have received a copy of the GNU Lesser General Public License
     29 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
     30 
     31 
     32 /*
     33    The idea of the algorithm used herein is to compute a smaller inverted value
     34    than used in the standard Barrett algorithm, and thus save time in the
     35    Newton iterations, and pay just a small price when using the inverted value
     36    for developing quotient bits.  This algorithm was presented at ICMS 2006.
     37 */
     38 
     39 /* CAUTION: This code and the code in mu_div_qr.c should be edited in sync.
     40 
     41  Things to work on:
     42 
     43   * The itch/scratch scheme isn't perhaps such a good idea as it once seemed,
     44     demonstrated by the fact that the mpn_invertappr function's scratch needs
     45     mean that we need to keep a large allocation long after it is needed.
     46     Things are worse as mpn_mul_fft does not accept any scratch parameter,
     47     which means we'll have a large memory hole while in mpn_mul_fft.  In
     48     general, a peak scratch need in the beginning of a function isn't
     49     well-handled by the itch/scratch scheme.
     50 */
     51 
     52 #ifdef STAT
     53 #undef STAT
     54 #define STAT(x) x
     55 #else
     56 #define STAT(x)
     57 #endif
     58 
     59 #include <stdlib.h>		/* for NULL */
     60 #include "gmp.h"
     61 #include "gmp-impl.h"
     62 
     63 
/* Top-level entry: compute an approximate quotient Q = floor(N/D) + e,
   0 <= e <= 4 (see file header).  Builds an 'in'-limb approximate inverse
   of the high part of D in SCRATCH, then delegates the block-wise quotient
   development to mpn_preinv_mu_divappr_q.  Returns the "high quotient
   limb" qh (0 or 1); the remaining nn-dn quotient limbs go to QP.  */
mp_limb_t
mpn_mu_divappr_q (mp_ptr qp,
		  mp_srcptr np,
		  mp_size_t nn,
		  mp_srcptr dp,
		  mp_size_t dn,
		  mp_ptr scratch)
{
  mp_size_t qn, in;
  mp_limb_t cy, qh;
  mp_ptr ip, tp;

  ASSERT (dn > 1);

  qn = nn - dn;

  /* If Q is smaller than D, truncate operands.  Only the high qn+1 limbs
     of D (and the corresponding part of N) can influence the quotient, so
     advance both pointers past the ignored low limbs.  */
  if (qn + 1 < dn)
    {
      np += dn - (qn + 1);
      nn -= dn - (qn + 1);
      dp += dn - (qn + 1);
      dn = qn + 1;
    }

  /* Compute the inverse size.  */
  in = mpn_mu_divappr_q_choose_in (qn, dn, 0);
  ASSERT (in <= dn);

#if 1
  /* This alternative inverse computation method gets slightly more accurate
     results.  FIXMEs: (1) Temp allocation needs not analysed (2) itch function
     not adapted (3) mpn_invertappr scratch needs not met.  */
  ip = scratch;
  tp = scratch + in + 1;

  /* compute an approximate inverse on (in+1) limbs */
  if (dn == in)
    {
      /* D has exactly 'in' limbs: invert 1.D (a one prepended as the
	 high limb), then drop the low limb of the (in+1)-limb result.  */
      MPN_COPY (tp + 1, dp, in);
      tp[0] = 1;
      mpn_invertappr (ip, tp, in + 1, NULL);
      MPN_COPY_INCR (ip, ip + 1, in);
    }
  else
    {
      /* Invert the top in+1 limbs of D, rounded up by one ulp.  */
      cy = mpn_add_1 (tp, dp + dn - (in + 1), in + 1, 1);
      if (UNLIKELY (cy != 0))
	/* The +1 overflowed, i.e. the top limbs of D are all ones; the
	   approximate inverse is then 0 (its implicit msb excepted).  */
	MPN_ZERO (ip, in);
      else
	{
	  mpn_invertappr (ip, tp, in + 1, NULL);
	  MPN_COPY_INCR (ip, ip + 1, in);
	}
    }
#else
  /* This older inverse computation method gets slightly worse results than the
     one above.  */
  ip = scratch;
  tp = scratch + in;

  /* Compute inverse of D to in+1 limbs, then round to 'in' limbs.  Ideally the
     inversion function should do this automatically.  */
  if (dn == in)
    {
      tp[in + 1] = 0;
      MPN_COPY (tp + in + 2, dp, in);
      mpn_invertappr (tp, tp + in + 1, in + 1, NULL);
    }
  else
    {
      mpn_invertappr (tp, dp + dn - (in + 1), in + 1, NULL);
    }
  /* Round the (in+1)-limb inverse down to 'in' limbs by adding half an ulp
     of the truncated value (GMP_NUMB_HIGHBIT at the discarded limb).  */
  cy = mpn_sub_1 (tp, tp, in + 1, GMP_NUMB_HIGHBIT);
  if (UNLIKELY (cy != 0))
    MPN_ZERO (tp + 1, in);
  MPN_COPY (ip, tp + 1, in);
#endif

  /* NOTE(review): scratch + in overlaps tp from the #if 1 branch above
     (which starts at scratch + in + 1); the inverse at scratch[0..in-1]
     is preserved, the rest is dead by now.  */
  qh = mpn_preinv_mu_divappr_q (qp, np, nn, dp, dn, ip, in, scratch + in);

  return qh;
}
    147 
/* Barrett-style approximate division with a precomputed approximate
   inverse IP of 'in' limbs (implicit most significant bit).  Develops the
   quotient in blocks of 'in' limbs, high block first, maintaining a dn-limb
   partial remainder R in scratch.  Returns qh (the quotient limb above the
   nn-dn limbs stored at QP).  SCRATCH must hold dn (for R) + tn (for the
   product T) + mpn_mulmod_bnm1 scratch limbs.  */
mp_limb_t
mpn_preinv_mu_divappr_q (mp_ptr qp,
			 mp_srcptr np,
			 mp_size_t nn,
			 mp_srcptr dp,
			 mp_size_t dn,
			 mp_srcptr ip,
			 mp_size_t in,
			 mp_ptr scratch)
{
  mp_size_t qn;
  mp_limb_t cy, cx, qh;
  mp_limb_t r;
  mp_size_t tn, wn;

/* Scratch layout: R (partial remainder, dn limbs), then T (product area,
   tn limbs), then scratch for mpn_mulmod_bnm1.  Note scratch_out depends
   on the runtime value of tn.  */
#define rp           scratch
#define tp           (scratch + dn)
#define scratch_out  (scratch + dn + tn)

  qn = nn - dn;

  /* Point at the high ends; both pointers walk downwards, block by block.  */
  np += qn;
  qp += qn;

  /* Initial partial remainder: the top dn limbs of N, reduced mod D once
     if necessary; qh records that reduction.  */
  qh = mpn_cmp (np, dp, dn) >= 0;
  if (qh != 0)
    mpn_sub_n (rp, np, dp, dn);
  else
    MPN_COPY (rp, np, dn);

  if (qn == 0)
    return qh;			/* Degenerate use.  Should we allow this? */

  while (qn > 0)
    {
      /* Last (lowest) block may be short; use the low end of the inverse.  */
      if (qn < in)
	{
	  ip += in - qn;
	  in = qn;
	}
      np -= in;
      qp -= in;

      /* Compute the next block of quotient limbs by multiplying the inverse I
	 by the upper part of the partial remainder R.  */
      mpn_mul_n (tp, rp + dn - in, ip, in);		/* mulhi  */
      cy = mpn_add_n (qp, tp + in, rp + dn - in, in);	/* I's msb implicit */
      ASSERT_ALWAYS (cy == 0);

      qn -= in;
      if (qn == 0)
	break;

      /* Compute the product of the quotient block and the divisor D, to be
	 subtracted from the partial remainder combined with new limbs from the
	 dividend N.  We only really need the low dn limbs.  */

      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	mpn_mul (tp, dp, dn, qp, in);		/* dn+in limbs, high 'in' cancels */
      else
	{
	  /* Compute Q*D mod (B^tn - 1); then undo the wrap-around of the
	     high wn limbs so that tp holds the low dn+1 limbs of Q*D.  */
	  tn = mpn_mulmod_bnm1_next_size (dn + 1);
	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
	  wn = dn + in - tn;			/* number of wrapped limbs */
	  if (wn > 0)
	    {
	      /* The wrapped high limbs of Q*D equal the corresponding limbs
		 of R (they cancel in the upcoming subtraction); use them to
		 recover the true low part, fixing up borrow/wrap with cx-cy.  */
	      cy = mpn_sub_n (tp, tp, rp + dn - wn, wn);
	      cy = mpn_sub_1 (tp + wn, tp + wn, tn - wn, cy);
	      cx = mpn_cmp (rp + dn - in, tp + dn, tn - dn) < 0;
	      ASSERT_ALWAYS (cx >= cy);
	      mpn_incr_u (tp, cx - cy);
	    }
	}

      /* Top remainder limb after subtracting the product's high limb.  */
      r = rp[dn - in] - tp[dn];

      /* Subtract the product from the partial remainder combined with new
	 limbs from the dividend N, generating a new partial remainder R.  */
      if (dn != in)
	{
	  cy = mpn_sub_n (tp, np, tp, in);	/* get next 'in' limbs from N */
	  cy = mpn_sub_nc (tp + in, rp, tp + in, dn - in, cy);
	  MPN_COPY (rp, tp, dn);		/* FIXME: try to avoid this */
	}
      else
	{
	  cy = mpn_sub_n (rp, np, tp, in);	/* get next 'in' limbs from N */
	}

      STAT (int i; int err = 0;
	    static int errarr[5]; static int err_rec; static int tot);

      /* Check the remainder R and adjust the quotient as needed.  */
      r -= cy;
      while (r != 0)
	{
	  /* We loop 0 times with about 69% probability, 1 time with about 31%
	     probability, 2 times with about 0.6% probability, if inverse is
	     computed as recommended.  */
	  mpn_incr_u (qp, 1);
	  cy = mpn_sub_n (rp, rp, dp, dn);
	  r -= cy;
	  STAT (err++);
	}
      if (mpn_cmp (rp, dp, dn) >= 0)
	{
	  /* This is executed with about 76% probability.  */
	  mpn_incr_u (qp, 1);
	  cy = mpn_sub_n (rp, rp, dp, dn);
	  STAT (err++);
	}

      STAT (
	    tot++;
	    errarr[err]++;
	    if (err > err_rec)
	      err_rec = err;
	    if (tot % 0x10000 == 0)
	      {
		for (i = 0; i <= err_rec; i++)
		  printf ("  %d(%.1f%%)", errarr[i], 100.0*errarr[i]/tot);
		printf ("\n");
	      }
	    );
    }

  /* FIXME: We should perhaps be somewhat more elegant in our rounding of the
     quotient.  For now, just make sure the returned quotient is >= the real
     quotient; add 3 with saturating arithmetic.  */
  /* NOTE(review): 'cy +=' relies on cy being 0 here; the loop always exits
     via the break right after ASSERT_ALWAYS (cy == 0), so this holds, but
     a plain assignment would be clearer — confirm against later GMP.  */
  qn = nn - dn;
  cy += mpn_add_1 (qp, qp, qn, 3);
  if (cy != 0)
    {
      if (qh != 0)
	{
	  /* Return a quotient of just 1-bits, with qh set.  */
	  mp_size_t i;
	  for (i = 0; i < qn; i++)
	    qp[i] = GMP_NUMB_MAX;
	}
      else
	{
	  /* Propagate carry into qh.  */
	  qh = 1;
	}
    }

  return qh;
}
    297 
    298 /* In case k=0 (automatic choice), we distinguish 3 cases:
    299    (a) dn < qn:         in = ceil(qn / ceil(qn/dn))
    300    (b) dn/3 < qn <= dn: in = ceil(qn / 2)
    301    (c) qn < dn/3:       in = qn
    302    In all cases we have in <= dn.
    303  */
    304 mp_size_t
    305 mpn_mu_divappr_q_choose_in (mp_size_t qn, mp_size_t dn, int k)
    306 {
    307   mp_size_t in;
    308 
    309   if (k == 0)
    310     {
    311       mp_size_t b;
    312       if (qn > dn)
    313 	{
    314 	  /* Compute an inverse size that is a nice partition of the quotient.  */
    315 	  b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
    316 	  in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
    317 	}
    318       else if (3 * qn > dn)
    319 	{
    320 	  in = (qn - 1) / 2 + 1;	/* b = 2 */
    321 	}
    322       else
    323 	{
    324 	  in = (qn - 1) / 1 + 1;	/* b = 1 */
    325 	}
    326     }
    327   else
    328     {
    329       mp_size_t xn;
    330       xn = MIN (dn, qn);
    331       in = (xn - 1) / k + 1;
    332     }
    333 
    334   return in;
    335 }
    336 
    337 mp_size_t
    338 mpn_mu_divappr_q_itch (mp_size_t nn, mp_size_t dn, int mua_k)
    339 {
    340   mp_size_t qn, in, itch_local, itch_out;
    341 
    342   qn = nn - dn;
    343   if (qn + 1 < dn)
    344     {
    345       dn = qn + 1;
    346     }
    347   in = mpn_mu_divappr_q_choose_in (qn, dn, mua_k);
    348 
    349   itch_local = mpn_mulmod_bnm1_next_size (dn + 1);
    350   itch_out = mpn_mulmod_bnm1_itch (itch_local, dn, in);
    351   return in + dn + itch_local + itch_out;
    352 }
    353