/* mu_divappr_q.c, revision 1.1.1.3 (generic mpn implementation).  */
      1 /* mpn_mu_divappr_q, mpn_preinv_mu_divappr_q.
      2 
      3    Compute Q = floor(N / D) + e.  N is nn limbs, D is dn limbs and must be
      4    normalized, and Q must be nn-dn limbs, 0 <= e <= 4.  The requirement that Q
      5    is nn-dn limbs (and not nn-dn+1 limbs) was put in place in order to allow us
      6    to let N be unmodified during the operation.
      7 
      8    Contributed to the GNU project by Torbjorn Granlund.
      9 
     10    THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
     11    SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
     12    GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
     13 
     14 Copyright 2005-2007, 2009, 2010 Free Software Foundation, Inc.
     15 
     16 This file is part of the GNU MP Library.
     17 
     18 The GNU MP Library is free software; you can redistribute it and/or modify
     19 it under the terms of either:
     20 
     21   * the GNU Lesser General Public License as published by the Free
     22     Software Foundation; either version 3 of the License, or (at your
     23     option) any later version.
     24 
     25 or
     26 
     27   * the GNU General Public License as published by the Free Software
     28     Foundation; either version 2 of the License, or (at your option) any
     29     later version.
     30 
     31 or both in parallel, as here.
     32 
     33 The GNU MP Library is distributed in the hope that it will be useful, but
     34 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     35 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     36 for more details.
     37 
     38 You should have received copies of the GNU General Public License and the
     39 GNU Lesser General Public License along with the GNU MP Library.  If not,
     40 see https://www.gnu.org/licenses/.  */
     41 
     42 
     43 /*
     44    The idea of the algorithm used herein is to compute a smaller inverted value
     45    than used in the standard Barrett algorithm, and thus save time in the
     46    Newton iterations, and pay just a small price when using the inverted value
     47    for developing quotient bits.  This algorithm was presented at ICMS 2006.
     48 */
     49 
     50 /* CAUTION: This code and the code in mu_div_qr.c should be edited in sync.
     51 
     52  Things to work on:
     53 
     54   * The itch/scratch scheme isn't perhaps such a good idea as it once seemed,
     55     demonstrated by the fact that the mpn_invertappr function's scratch needs
     56     mean that we need to keep a large allocation long after it is needed.
     57     Things are worse as mpn_mul_fft does not accept any scratch parameter,
     58     which means we'll have a large memory hole while in mpn_mul_fft.  In
     59     general, a peak scratch need in the beginning of a function isn't
     60     well-handled by the itch/scratch scheme.
     61 */
     62 
/* STAT(x) expands to its argument only when this file is compiled with
   -DSTAT, enabling the quotient-adjustment statistics instrumentation in
   mpn_preinv_mu_divappr_q below; otherwise it expands to nothing.  */
#ifdef STAT
#undef STAT
#define STAT(x) x
#else
#define STAT(x)
#endif

#include <stdlib.h>		/* for NULL */
#include "gmp-impl.h"

/* Barrett division kernel operating on a precomputed 'in'-limb
   approximate inverse; defined below.  */
static mp_limb_t mpn_preinv_mu_divappr_q (mp_ptr, mp_srcptr, mp_size_t,
			 mp_srcptr, mp_size_t, mp_srcptr, mp_size_t, mp_ptr);
/* Choose the inverse size 'in' for given quotient/divisor sizes.  */
static mp_size_t mpn_mu_divappr_q_choose_in (mp_size_t, mp_size_t, int);
     76 
/* Compute an approximate quotient Q = floor({np,nn} / {dp,dn}) + e,
   0 <= e <= 4, into {qp, nn-dn}.  D must be normalized and dn > 1; N is
   left unmodified.  Returns the "quotient high" limb qh.  scratch must
   provide mpn_mu_divappr_q_itch (nn, dn, 0) limbs.  */
mp_limb_t
mpn_mu_divappr_q (mp_ptr qp,
		  mp_srcptr np,
		  mp_size_t nn,
		  mp_srcptr dp,
		  mp_size_t dn,
		  mp_ptr scratch)
{
  mp_size_t qn, in;
  mp_limb_t cy, qh;
  mp_ptr ip, tp;

  ASSERT (dn > 1);

  qn = nn - dn;

  /* If Q is smaller than D, truncate operands: only the high qn+1 limbs
     of D (and the corresponding part of N) can affect the quotient.  */
  if (qn + 1 < dn)
    {
      np += dn - (qn + 1);
      nn -= dn - (qn + 1);
      dp += dn - (qn + 1);
      dn = qn + 1;
    }

  /* Compute the inverse size.  */
  in = mpn_mu_divappr_q_choose_in (qn, dn, 0);
  ASSERT (in <= dn);

#if 1
  /* This alternative inverse computation method gets slightly more accurate
     results.  FIXMEs: (1) Temp allocation needs not analysed (2) itch function
     not adapted (3) mpn_invertappr scratch needs not met.  */
  ip = scratch;
  tp = scratch + in + 1;

  /* compute an approximate inverse on (in+1) limbs */
  if (dn == in)
    {
      /* D is exactly 'in' limbs: prepend a low limb of 1 so the inversion
	 operates on an (in+1)-limb value slightly above D*B.  */
      MPN_COPY (tp + 1, dp, in);
      tp[0] = 1;
      mpn_invertappr (ip, tp, in + 1, tp + in + 1);
      MPN_COPY_INCR (ip, ip + 1, in);	/* keep the high 'in' inverse limbs */
    }
  else
    {
      /* Round the top in+1 limbs of D upward by 1; on carry-out those limbs
	 were all ones, and the wanted approximate inverse is zero.  */
      cy = mpn_add_1 (tp, dp + dn - (in + 1), in + 1, 1);
      if (UNLIKELY (cy != 0))
	MPN_ZERO (ip, in);
      else
	{
	  mpn_invertappr (ip, tp, in + 1, tp + in + 1);
	  MPN_COPY_INCR (ip, ip + 1, in);	/* keep the high 'in' limbs */
	}
    }
#else
  /* This older inverse computation method gets slightly worse results than the
     one above.  */
  ip = scratch;
  tp = scratch + in;

  /* Compute inverse of D to in+1 limbs, then round to 'in' limbs.  Ideally the
     inversion function should do this automatically.  */
  if (dn == in)
    {
      tp[in + 1] = 0;
      MPN_COPY (tp + in + 2, dp, in);
      mpn_invertappr (tp, tp + in + 1, in + 1, NULL);
    }
  else
    {
      mpn_invertappr (tp, dp + dn - (in + 1), in + 1, NULL);
    }
  cy = mpn_sub_1 (tp, tp, in + 1, GMP_NUMB_HIGHBIT);
  if (UNLIKELY (cy != 0))
    MPN_ZERO (tp + 1, in);
  MPN_COPY (ip, tp + 1, in);
#endif

  /* The inverse occupies the first 'in' scratch limbs; hand the rest to
     the preinverted kernel.  */
  qh = mpn_preinv_mu_divappr_q (qp, np, nn, dp, dn, ip, in, scratch + in);

  return qh;
}
    160 
/* Barrett division kernel.  Given a precomputed 'in'-limb approximate
   inverse {ip,in} of the high part of D, develop the nn-dn quotient limbs
   of {np,nn} / {dp,dn} into qp, one 'in'-limb block at a time from the
   most significant end.  Returns qh.  scratch holds the dn-limb partial
   remainder R, a product area T, and mulmod_bnm1 workspace.  */
static mp_limb_t
mpn_preinv_mu_divappr_q (mp_ptr qp,
			 mp_srcptr np,
			 mp_size_t nn,
			 mp_srcptr dp,
			 mp_size_t dn,
			 mp_srcptr ip,
			 mp_size_t in,
			 mp_ptr scratch)
{
  mp_size_t qn;
  mp_limb_t cy, cx, qh;
  mp_limb_t r;
  mp_size_t tn, wn;

/* Scratch layout: R (dn limbs), then T, then mulmod workspace.  */
#define rp           scratch
#define tp           (scratch + dn)
#define scratch_out  (scratch + dn + tn)

  qn = nn - dn;

  /* Point np/qp just past the area the main loop walks down through.  */
  np += qn;
  qp += qn;

  /* Initial partial remainder R = (top dn limbs of N) mod D; qh records
     whether one D had to be subtracted.  */
  qh = mpn_cmp (np, dp, dn) >= 0;
  if (qh != 0)
    mpn_sub_n (rp, np, dp, dn);
  else
    MPN_COPY (rp, np, dn);

  if (qn == 0)
    return qh;			/* Degenerate use.  Should we allow this? */

  while (qn > 0)
    {
      /* Last, possibly partial, block: shrink the inverse to qn limbs by
	 using only its most significant part.  */
      if (qn < in)
	{
	  ip += in - qn;
	  in = qn;
	}
      np -= in;
      qp -= in;

      /* Compute the next block of quotient limbs by multiplying the inverse I
	 by the upper part of the partial remainder R.  */
      mpn_mul_n (tp, rp + dn - in, ip, in);		/* mulhi  */
      cy = mpn_add_n (qp, tp + in, rp + dn - in, in);	/* I's msb implicit */
      ASSERT_ALWAYS (cy == 0);

      qn -= in;
      if (qn == 0)
	break;

      /* Compute the product of the quotient block and the divisor D, to be
	 subtracted from the partial remainder combined with new limbs from the
	 dividend N.  We only really need the low dn limbs.  */

      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	mpn_mul (tp, dp, dn, qp, in);		/* dn+in limbs, high 'in' cancels */
      else
	{
	  /* Wraparound multiply mod B^tn-1; undo the wrap by adjusting the
	     low wn limbs with the known high part of the true product.  */
	  tn = mpn_mulmod_bnm1_next_size (dn + 1);
	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
	  wn = dn + in - tn;			/* number of wrapped limbs */
	  if (wn > 0)
	    {
	      cy = mpn_sub_n (tp, tp, rp + dn - wn, wn);
	      cy = mpn_sub_1 (tp + wn, tp + wn, tn - wn, cy);
	      cx = mpn_cmp (rp + dn - in, tp + dn, tn - dn) < 0;
	      ASSERT_ALWAYS (cx >= cy);
	      mpn_incr_u (tp, cx - cy);
	    }
	}

      /* Top limb of R minus top limb of Q*D: the quotient-error indicator.  */
      r = rp[dn - in] - tp[dn];

      /* Subtract the product from the partial remainder combined with new
	 limbs from the dividend N, generating a new partial remainder R.  */
      if (dn != in)
	{
	  cy = mpn_sub_n (tp, np, tp, in);	/* get next 'in' limbs from N */
	  cy = mpn_sub_nc (tp + in, rp, tp + in, dn - in, cy);
	  MPN_COPY (rp, tp, dn);		/* FIXME: try to avoid this */
	}
      else
	{
	  cy = mpn_sub_n (rp, np, tp, in);	/* get next 'in' limbs from N */
	}

      STAT (int i; int err = 0;
	    static int errarr[5]; static int err_rec; static int tot);

      /* Check the remainder R and adjust the quotient as needed.  */
      r -= cy;
      while (r != 0)
	{
	  /* We loop 0 times with about 69% probability, 1 time with about 31%
	     probability, 2 times with about 0.6% probability, if inverse is
	     computed as recommended.  */
	  mpn_incr_u (qp, 1);
	  cy = mpn_sub_n (rp, rp, dp, dn);
	  r -= cy;
	  STAT (err++);
	}
      if (mpn_cmp (rp, dp, dn) >= 0)
	{
	  /* This is executed with about 76% probability.  */
	  mpn_incr_u (qp, 1);
	  cy = mpn_sub_n (rp, rp, dp, dn);
	  STAT (err++);
	}

      STAT (
	    tot++;
	    errarr[err]++;
	    if (err > err_rec)
	      err_rec = err;
	    if (tot % 0x10000 == 0)
	      {
		for (i = 0; i <= err_rec; i++)
		  printf ("  %d(%.1f%%)", errarr[i], 100.0*errarr[i]/tot);
		printf ("\n");
	      }
	    );
    }

  /* FIXME: We should perhaps be somewhat more elegant in our rounding of the
     quotient.  For now, just make sure the returned quotient is >= the real
     quotient; add 3 with saturating arithmetic.  */
  qn = nn - dn;
  cy += mpn_add_1 (qp, qp, qn, 3);
  if (cy != 0)
    {
      if (qh != 0)
	{
	  /* Return a quotient of just 1-bits, with qh set.  */
	  mp_size_t i;
	  for (i = 0; i < qn; i++)
	    qp[i] = GMP_NUMB_MAX;
	}
      else
	{
	  /* Propagate carry into qh.  */
	  qh = 1;
	}
    }

  return qh;
}
    310 
    311 /* In case k=0 (automatic choice), we distinguish 3 cases:
    312    (a) dn < qn:         in = ceil(qn / ceil(qn/dn))
    313    (b) dn/3 < qn <= dn: in = ceil(qn / 2)
    314    (c) qn < dn/3:       in = qn
    315    In all cases we have in <= dn.
    316  */
    317 static mp_size_t
    318 mpn_mu_divappr_q_choose_in (mp_size_t qn, mp_size_t dn, int k)
    319 {
    320   mp_size_t in;
    321 
    322   if (k == 0)
    323     {
    324       mp_size_t b;
    325       if (qn > dn)
    326 	{
    327 	  /* Compute an inverse size that is a nice partition of the quotient.  */
    328 	  b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
    329 	  in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
    330 	}
    331       else if (3 * qn > dn)
    332 	{
    333 	  in = (qn - 1) / 2 + 1;	/* b = 2 */
    334 	}
    335       else
    336 	{
    337 	  in = (qn - 1) / 1 + 1;	/* b = 1 */
    338 	}
    339     }
    340   else
    341     {
    342       mp_size_t xn;
    343       xn = MIN (dn, qn);
    344       in = (xn - 1) / k + 1;
    345     }
    346 
    347   return in;
    348 }
    349 
    350 mp_size_t
    351 mpn_mu_divappr_q_itch (mp_size_t nn, mp_size_t dn, int mua_k)
    352 {
    353   mp_size_t qn, in, itch_local, itch_out, itch_invapp;
    354 
    355   qn = nn - dn;
    356   if (qn + 1 < dn)
    357     {
    358       dn = qn + 1;
    359     }
    360   in = mpn_mu_divappr_q_choose_in (qn, dn, mua_k);
    361 
    362   itch_local = mpn_mulmod_bnm1_next_size (dn + 1);
    363   itch_out = mpn_mulmod_bnm1_itch (itch_local, dn, in);
    364   itch_invapp = mpn_invertappr_itch (in + 1) + in + 2; /* 3in + 4 */
    365 
    366   ASSERT (dn + itch_local + itch_out >= itch_invapp);
    367   return in + MAX (dn + itch_local + itch_out, itch_invapp);
    368 }
    369