Home | History | Annotate | Line # | Download | only in mpz
n_pow_ui.c revision 1.1.1.1.8.1
      1 /* mpz_n_pow_ui -- mpn raised to ulong.
      2 
      3    THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
      4    CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
      5    FUTURE GNU MP RELEASES.
      6 
      7 Copyright 2001, 2002, 2005, 2012 Free Software Foundation, Inc.
      8 
      9 This file is part of the GNU MP Library.
     10 
     11 The GNU MP Library is free software; you can redistribute it and/or modify
     12 it under the terms of the GNU Lesser General Public License as published by
     13 the Free Software Foundation; either version 3 of the License, or (at your
     14 option) any later version.
     15 
     16 The GNU MP Library is distributed in the hope that it will be useful, but
     17 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     18 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
     19 License for more details.
     20 
     21 You should have received a copy of the GNU Lesser General Public License
     22 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
     23 
     24 #include "gmp.h"
     25 #include "gmp-impl.h"
     26 #include "longlong.h"
     27 
     28 
/* Change this to "#define TRACE(x) x" for some traces.
   By default TRACE(x) expands to nothing, so the trace statements
   throughout this file cost nothing in a normal build.  */
#define TRACE(x)
     31 
     32 
/* Use this to test the mul_2 code on a CPU without a native version of that
   routine.  Disabled by default; change "#if 0" to "#if 1" to force the
   reference implementation and pretend a native mpn_mul_2 exists.  */
#if 0
#define mpn_mul_2  refmpn_mul_2
#define HAVE_NATIVE_mpn_mul_2  1
#endif
     39 
     40 
     41 /* mpz_pow_ui and mpz_ui_pow_ui want to share almost all of this code.
     42    ui_pow_ui doesn't need the mpn_mul based powering loop or the tests on
     43    bsize==2 or >2, but separating that isn't easy because there's shared
     44    code both before and after (the size calculations and the powers of 2
     45    handling).
     46 
     47    Alternatives:
     48 
     49    It would work to just use the mpn_mul powering loop for 1 and 2 limb
     50    bases, but the current separate loop allows mul_1 and mul_2 to be done
     51    in-place, which might help cache locality a bit.  If mpn_mul was relaxed
     52    to allow source==dest when vn==1 or 2 then some pointer twiddling might
     53    let us get the same effect in one loop.
     54 
     55    The initial powering for bsize==1 into blimb or blimb:blimb_low doesn't
     56    form the biggest possible power of b that fits, only the biggest power of
     57    2 power, ie. b^(2^n).  It'd be possible to choose a bigger power, perhaps
     58    using mp_bases[b].big_base for small b, and thereby get better value
     59    from mpn_mul_1 or mpn_mul_2 in the bignum powering.  It's felt that doing
     60    so would be more complicated than it's worth, and could well end up being
     61    a slowdown for small e.  For big e on the other hand the algorithm is
     62    dominated by mpn_sqr so there wouldn't be much of a saving.  The current
     63    code can be viewed as simply doing the first few steps of the powering in
     64    a single or double limb where possible.
     65 
     66    If r==b, and blow_twos==0, and r must be realloc'ed, then the temporary
     67    copy made of b is unnecessary.  We could just use the old alloc'ed block
     68    and free it at the end.  But arranging this seems like a lot more trouble
     69    than it's worth.  */
     70 
     71 
/* floor(sqrt(GMP_NUMB_MAX)), ie. the biggest value that can be squared in
   a limb without overflowing.
   FIXME: This formula is an underestimate when GMP_NUMB_BITS is odd,
   since it only uses GMP_NUMB_BITS/2 (rounded down) bits; that's safe,
   merely conservative.  */

#define GMP_NUMB_HALFMAX  (((mp_limb_t) 1 << GMP_NUMB_BITS/2) - 1)
     77 
     78 
     79 /* The following are for convenience, they update the size and check the
     80    alloc.  */
     81 
/* dst = {src,size}^2, and update "size" to the size of the result.
   mpn_sqr produces 2*size limbs; if the most significant limb came out
   zero the stored size is reduced by one so the top limb is non-zero.  */
#define MPN_SQR(dst, alloc, src, size)          \
  do {                                          \
    ASSERT (2*(size) <= (alloc));               \
    mpn_sqr (dst, src, size);                   \
    (size) *= 2;                                \
    (size) -= ((dst)[(size)-1] == 0);           \
  } while (0)
     89 
/* dst = {src,size} * {src2,size2}, and update "size" to the size of the
   result.  mpn_mul returns the most significant limb of the product; when
   that limb is zero the product is only size+size2-1 limbs, hence the
   "- (cy == 0)" adjustment.  */
#define MPN_MUL(dst, alloc, src, size, src2, size2)     \
  do {                                                  \
    mp_limb_t  cy;                                      \
    ASSERT ((size) + (size2) <= (alloc));               \
    cy = mpn_mul (dst, src, size, src2, size2);         \
    (size) += (size2) - (cy == 0);                      \
  } while (0)
     97 
/* Multiply {ptr,size} in place by the two-limb value {mult,2}, and update
   "size" to the size of the result.  As used here, mpn_mul_2 writes
   size+1 limbs at ptr and returns a further carry limb; that carry is
   stored above them and counted only when non-zero, so the final size is
   size+1 or size+2.  */
#define MPN_MUL_2(ptr, size, alloc, mult)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+2 <= (alloc));               \
    cy = mpn_mul_2 (ptr, ptr, size, mult);      \
    (size)++;                                   \
    (ptr)[(size)] = cy;                         \
    (size) += (cy != 0);                        \
  } while (0)
    107 
/* Multiply {ptr,size} in place by the single limb "limb", and update
   "size" to the size of the result.  The carry out of mpn_mul_1 is
   stored as a new top limb and counted only when non-zero.  */
#define MPN_MUL_1(ptr, size, alloc, limb)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_mul_1 (ptr, ptr, size, limb);      \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)
    116 
/* Left shift {ptr,size} in place by "shift" bits (0 < shift <
   GMP_NUMB_BITS for mpn_lshift), and update "size" to the size of the
   result.  Bits shifted out the top become a new top limb when
   non-zero.  */
#define MPN_LSHIFT(ptr, size, alloc, shift)     \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_lshift (ptr, ptr, size, shift);    \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)
    125 
/* dst = {src,size} >> shift, where shift may be 0 (mpn_rshift requires a
   non-zero shift, so that case is a plain copy).  After a real shift the
   top limb may have become zero, in which case "size" is reduced by one.
   Note the copy branch never changes "size": the caller's top limb is
   already known non-zero.  */
#define MPN_RSHIFT_OR_COPY(dst, src, size, shift)       \
  do {                                                  \
    if ((shift) == 0)                                   \
      MPN_COPY (dst, src, size);                        \
    else                                                \
      {                                                 \
        mpn_rshift (dst, src, size, shift);             \
        (size) -= ((dst)[(size)-1] == 0);               \
      }                                                 \
  } while (0)
    136 
    137 
    138 /* ralloc and talloc are only wanted for ASSERTs, after the initial space
    139    allocations.  Avoid writing values to them in a normal build, to ensure
    140    the compiler lets them go dead.  gcc already figures this out itself
    141    actually.  */
    142 
/* Exchange the roles of the result area rp and the temporary area tp.
   The corresponding alloc sizes are swapped only under ASSERT_CODE, since
   ralloc/talloc are used solely in ASSERTs after the initial allocation
   (see the comment above).  */
#define SWAP_RP_TP                                      \
  do {                                                  \
    MP_PTR_SWAP (rp, tp);                               \
    ASSERT_CODE (MP_SIZE_T_SWAP (ralloc, talloc));      \
  } while (0)
    148 
    149 
/* mpz_n_pow_ui (r, bp, bsize, e) -- set r = b^e, where the base b is the
   limb vector {bp, ABS(bsize)} and the sign of bsize gives the sign of b.

   Internal routine shared by mpz_pow_ui and mpz_ui_pow_ui (see the file
   head comment).  The strategy:

     1. e==0 and b==0 special cases are handled up front.
     2. Low zero limbs and low zero bits of b are stripped; each stripped
        limb contributes e zero limbs to the result, each stripped bit
        contributes e bits, accounted in rtwos_limbs/rtwos_bits and
        applied at the end by MPN_ZERO and a final MPN_LSHIFT.
     3. For small bases the first squarings are done in single (or, with a
        native mpn_mul_2, double) limb arithmetic, accumulating odd-bit
        factors in rl (and rl_high).
     4. The remaining power is computed by binary exponentiation from the
        high bit of e downward, ping-ponging between the result area and a
        temp area via SWAP_RP_TP so the final product lands in r.

   Note: the e==0 path stores one limb through PTR(r) without a realloc —
   fine since an mpz always has at least one limb allocated.  */
void
mpz_n_pow_ui (mpz_ptr r, mp_srcptr bp, mp_size_t bsize, unsigned long int e)
{
  mp_ptr         rp;
  mp_size_t      rtwos_limbs, ralloc, rsize;
  int            rneg, i, cnt, btwos, r_bp_overlap;
  mp_limb_t      blimb, rl;
  mp_bitcnt_t    rtwos_bits;
#if HAVE_NATIVE_mpn_mul_2
  mp_limb_t      blimb_low, rl_high;
#else
  mp_limb_t      b_twolimbs[2];
#endif
  TMP_DECL;

  TRACE (printf ("mpz_n_pow_ui rp=0x%lX bp=0x%lX bsize=%ld e=%lu (0x%lX)\n",
		 PTR(r), bp, bsize, e, e);
	 mpn_trace ("b", bp, bsize));

  /* The base must be normalized (non-zero top limb), and r may alias b
     only when they start at the same pointer, not partially overlap.  */
  ASSERT (bsize == 0 || bp[ABS(bsize)-1] != 0);
  ASSERT (MPN_SAME_OR_SEPARATE2_P (PTR(r), ALLOC(r), bp, ABS(bsize)));

  /* b^0 == 1, including 0^0 == 1 */
  if (e == 0)
    {
      PTR(r)[0] = 1;
      SIZ(r) = 1;
      return;
    }

  /* 0^e == 0 apart from 0^0 above */
  if (bsize == 0)
    {
      SIZ(r) = 0;
      return;
    }

  /* Sign of the final result: negative iff b<0 and e is odd. */
  rneg = (bsize < 0 && (e & 1) != 0);
  bsize = ABS (bsize);
  TRACE (printf ("rneg %d\n", rneg));

  r_bp_overlap = (PTR(r) == bp);

  /* Strip low zero limbs from b.  Each stripped limb of b contributes e
     zero limbs to b^e.  The bsize>=1 ASSERT holds because b!=0 here.  */
  rtwos_limbs = 0;
  for (blimb = *bp; blimb == 0; blimb = *++bp)
    {
      rtwos_limbs += e;
      bsize--; ASSERT (bsize >= 1);
    }
  TRACE (printf ("trailing zero rtwos_limbs=%ld\n", rtwos_limbs));

  /* Strip low zero bits from b.  Each stripped bit contributes e bits to
     b^e; convert the bit count to whole limbs plus a sub-limb shift.  */
  count_trailing_zeros (btwos, blimb);
  blimb >>= btwos;
  rtwos_bits = e * btwos;
  rtwos_limbs += rtwos_bits / GMP_NUMB_BITS;
  rtwos_bits %= GMP_NUMB_BITS;
  TRACE (printf ("trailing zero btwos=%d rtwos_limbs=%ld rtwos_bits=%lu\n",
		 btwos, rtwos_limbs, rtwos_bits));

  TMP_MARK;

  /* rl (and rl_high when mul_2 is native) accumulates the odd-bit factors
     of the single-limb powering below; 1 means "no factor yet".  */
  rl = 1;
#if HAVE_NATIVE_mpn_mul_2
  rl_high = 0;
#endif

  if (bsize == 1)
    {
    bsize_1:
      /* Power up as far as possible within blimb.  We start here with e!=0,
	 but if e is small then we might reach e==0 and the whole b^e in rl.
	 Notice this code works when blimb==1 too, reaching e==0.  */

      while (blimb <= GMP_NUMB_HALFMAX)
	{
	  TRACE (printf ("small e=0x%lX blimb=0x%lX rl=0x%lX\n",
			 e, blimb, rl));
	  ASSERT (e != 0);
	  if ((e & 1) != 0)
	    rl *= blimb;
	  e >>= 1;
	  if (e == 0)
	    goto got_rl;
	  blimb *= blimb;
	}

#if HAVE_NATIVE_mpn_mul_2
      TRACE (printf ("single power, e=0x%lX b=0x%lX rl=0x%lX\n",
		     e, blimb, rl));

      /* Can power b once more into blimb:blimb_low.  The GMP_NAIL_BITS
	 shifts position the numb bits for umul_ppmm, which works on full
	 limbs.  */
      bsize = 2;
      ASSERT (e != 0);
      if ((e & 1) != 0)
	{
	  umul_ppmm (rl_high, rl, rl, blimb << GMP_NAIL_BITS);
	  rl >>= GMP_NAIL_BITS;
	}
      e >>= 1;
      umul_ppmm (blimb, blimb_low, blimb, blimb << GMP_NAIL_BITS);
      blimb_low >>= GMP_NAIL_BITS;

    got_rl:
      TRACE (printf ("double power e=0x%lX blimb=0x%lX:0x%lX rl=0x%lX:%lX\n",
		     e, blimb, blimb_low, rl_high, rl));

      /* Combine left-over rtwos_bits into rl_high:rl to be handled by the
	 final mul_1 or mul_2 rather than a separate lshift.
	 - rl_high:rl mustn't be 1 (since then there's no final mul)
	 - rl_high mustn't overflow
	 - rl_high mustn't change to non-zero, since mul_1+lshift is
	 probably faster than mul_2 (FIXME: is this true?)  */

      if (rtwos_bits != 0
	  && ! (rl_high == 0 && rl == 1)
	  && (rl_high >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
	{
	  mp_limb_t  new_rl_high = (rl_high << rtwos_bits)
	    | (rl >> (GMP_NUMB_BITS-rtwos_bits));
	  if (! (rl_high == 0 && new_rl_high != 0))
	    {
	      rl_high = new_rl_high;
	      rl <<= rtwos_bits;
	      rtwos_bits = 0;
	      TRACE (printf ("merged rtwos_bits, rl=0x%lX:%lX\n",
			     rl_high, rl));
	    }
	}
#else
    got_rl:
      TRACE (printf ("small power e=0x%lX blimb=0x%lX rl=0x%lX\n",
		     e, blimb, rl));

      /* Combine left-over rtwos_bits into rl to be handled by the final
	 mul_1 rather than a separate lshift.
	 - rl mustn't be 1 (since then there's no final mul)
	 - rl mustn't overflow	*/

      if (rtwos_bits != 0
	  && rl != 1
	  && (rl >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
	{
	  rl <<= rtwos_bits;
	  rtwos_bits = 0;
	  TRACE (printf ("merged rtwos_bits, rl=0x%lX\n", rl));
	}
#endif
    }
  else if (bsize == 2)
    {
      /* Fold the stripped low bits out of the two limbs of b.  */
      mp_limb_t  bsecond = bp[1];
      if (btwos != 0)
	blimb |= (bsecond << (GMP_NUMB_BITS - btwos)) & GMP_NUMB_MASK;
      bsecond >>= btwos;
      if (bsecond == 0)
	{
	  /* Two limbs became one after rshift. */
	  bsize = 1;
	  goto bsize_1;
	}

      TRACE (printf ("bsize==2 using b=0x%lX:%lX", bsecond, blimb));
#if HAVE_NATIVE_mpn_mul_2
      blimb_low = blimb;
#else
      /* Without native mul_2 the shifted base is kept in a local two-limb
	 buffer and fed to the general mpn_mul loop below.  */
      bp = b_twolimbs;
      b_twolimbs[0] = blimb;
      b_twolimbs[1] = bsecond;
#endif
      blimb = bsecond;
    }
  else
    {
      /* bsize > 2: take a shifted (or plain) copy of b when r overlaps b
	 or low bits were stripped, so b survives the powering into r.  */
      if (r_bp_overlap || btwos != 0)
	{
	  mp_ptr tp = TMP_ALLOC_LIMBS (bsize);
	  MPN_RSHIFT_OR_COPY (tp, bp, bsize, btwos);
	  bp = tp;
	  TRACE (printf ("rshift or copy bp,bsize, new bsize=%ld\n", bsize));
	}
#if HAVE_NATIVE_mpn_mul_2
      /* in case 3 limbs rshift to 2 and hence use the mul_2 loop below */
      blimb_low = bp[0];
#endif
      blimb = bp[bsize-1];

      TRACE (printf ("big bsize=%ld  ", bsize);
	     mpn_trace ("b", bp, bsize));
    }

  /* At this point blimb is the most significant limb of the base to use.

     Each factor of b takes (bsize*BPML-cnt) bits and there's e of them; +1
     limb to round up the division; +1 for multiplies all using an extra
     limb over the true size; +2 for rl at the end; +1 for lshift at the
     end.

     The size calculation here is reasonably accurate.  The base is at least
     half a limb, so in 32 bits the worst case is 2^16+1 treated as 17 bits
     when it will power up as just over 16, an overestimate of 17/16 =
     6.25%.  For a 64-bit limb it's half that.

     If e==0 then blimb won't be anything useful (though it will be
     non-zero), but that doesn't matter since we just end up with ralloc==5,
     and that's fine for 2 limbs of rl and 1 of lshift.  */

  ASSERT (blimb != 0);
  count_leading_zeros (cnt, blimb);
  ralloc = (bsize*GMP_NUMB_BITS - cnt + GMP_NAIL_BITS) * e / GMP_NUMB_BITS + 5;
  TRACE (printf ("ralloc %ld, from bsize=%ld blimb=0x%lX cnt=%d\n",
		 ralloc, bsize, blimb, cnt));
  rp = MPZ_REALLOC (r, ralloc + rtwos_limbs);

  /* Low zero limbs resulting from powers of 2. */
  MPN_ZERO (rp, rtwos_limbs);
  rp += rtwos_limbs;

  if (e == 0)
    {
      /* Any e==0 other than via bsize==1 or bsize==2 is covered at the
	 start.  The whole b^e is already in rl (and possibly rl_high).  */
      rp[0] = rl;
      rsize = 1;
#if HAVE_NATIVE_mpn_mul_2
      rp[1] = rl_high;
      rsize += (rl_high != 0);
#endif
      ASSERT (rp[rsize-1] != 0);
    }
  else
    {
      mp_ptr     tp;
      mp_size_t  talloc;

      /* In the mpn_mul_1 or mpn_mul_2 loops or in the mpn_mul loop when the
	 low bit of e is zero, tp only has to hold the second last power
	 step, which is half the size of the final result.  There's no need
	 to round up the divide by 2, since ralloc includes a +2 for rl
	 which not needed by tp.  In the mpn_mul loop when the low bit of e
	 is 1, tp must hold nearly the full result, so just size it the same
	 as rp.  */

      talloc = ralloc;
#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2 || (e & 1) == 0)
	talloc /= 2;
#else
      if (bsize <= 1 || (e & 1) == 0)
	talloc /= 2;
#endif
      TRACE (printf ("talloc %ld\n", talloc));
      tp = TMP_ALLOC_LIMBS (talloc);

      /* Go from high to low over the bits of e, starting with i pointing at
	 the bit below the highest 1 (which will mean i==-1 if e==1).  */
      count_leading_zeros (cnt, (mp_limb_t) e);
      i = GMP_LIMB_BITS - cnt - 2;

#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2)
	{
	  mp_limb_t  mult[2];

	  /* Any bsize==1 will have been powered above to be two limbs. */
	  ASSERT (bsize == 2);
	  ASSERT (blimb != 0);

	  /* Arrange the final result ends up in r, not in the temp space.
	     Each loop iteration does one SWAP_RP_TP (in MPN_SQR's wake),
	     so start swapped when the iteration count i+1 is odd.  */
	  if ((i & 1) == 0)
	    SWAP_RP_TP;

	  rp[0] = blimb_low;
	  rp[1] = blimb;
	  rsize = 2;

	  mult[0] = blimb_low;
	  mult[1] = blimb;

	  for ( ; i >= 0; i--)
	    {
	      TRACE (printf ("mul_2 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
			     i, e, rsize, ralloc, talloc);
		     mpn_trace ("r", rp, rsize));

	      /* Square into tp, swap so the result is back in rp, then a
		 conditional in-place mul_2 for a set bit of e.  */
	      MPN_SQR (tp, talloc, rp, rsize);
	      SWAP_RP_TP;
	      if ((e & (1L << i)) != 0)
		MPN_MUL_2 (rp, rsize, ralloc, mult);
	    }

	  /* Apply the accumulated single/double limb factor, if any. */
	  TRACE (mpn_trace ("mul_2 before rl, r", rp, rsize));
	  if (rl_high != 0)
	    {
	      mult[0] = rl;
	      mult[1] = rl_high;
	      MPN_MUL_2 (rp, rsize, ralloc, mult);
	    }
	  else if (rl != 1)
	    MPN_MUL_1 (rp, rsize, ralloc, rl);
	}
#else
      if (bsize == 1)
	{
	  /* Arrange the final result ends up in r, not in the temp space.
	     One SWAP_RP_TP happens per iteration, so start swapped when
	     the iteration count i+1 is odd.  */
	  if ((i & 1) == 0)
	    SWAP_RP_TP;

	  rp[0] = blimb;
	  rsize = 1;

	  for ( ; i >= 0; i--)
	    {
	      TRACE (printf ("mul_1 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
			     i, e, rsize, ralloc, talloc);
		     mpn_trace ("r", rp, rsize));

	      /* Square into tp, swap back into rp, then a conditional
		 in-place mul_1 for a set bit of e.  */
	      MPN_SQR (tp, talloc, rp, rsize);
	      SWAP_RP_TP;
	      if ((e & (1L << i)) != 0)
		MPN_MUL_1 (rp, rsize, ralloc, blimb);
	    }

	  /* Apply the accumulated single-limb factor, if any. */
	  TRACE (mpn_trace ("mul_1 before rl, r", rp, rsize));
	  if (rl != 1)
	    MPN_MUL_1 (rp, rsize, ralloc, rl);
	}
#endif
      else
	{
	  int  parity;

	  /* Arrange the final result ends up in r, not in the temp space.
	     Each squaring swaps once and each set bit of e below the top
	     swaps once more, so the total swap count has the parity of
	     (number of loop iterations i+1) + (popcount(e) - 1); start
	     swapped when that total is odd.  */
	  ULONG_PARITY (parity, e);
	  if (((parity ^ i) & 1) != 0)
	    SWAP_RP_TP;

	  MPN_COPY (rp, bp, bsize);
	  rsize = bsize;

	  for ( ; i >= 0; i--)
	    {
	      TRACE (printf ("mul loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
			     i, e, rsize, ralloc, talloc);
		     mpn_trace ("r", rp, rsize));

	      MPN_SQR (tp, talloc, rp, rsize);
	      SWAP_RP_TP;
	      if ((e & (1L << i)) != 0)
		{
		  /* mpn_mul can't work in place, so multiply into tp and
		     swap again.  */
		  MPN_MUL (tp, talloc, rp, rsize, bp, bsize);
		  SWAP_RP_TP;
		}
	    }
	}
    }

  /* The swap parity arranged above guarantees the result is in r's block. */
  ASSERT (rp == PTR(r) + rtwos_limbs);
  TRACE (mpn_trace ("end loop r", rp, rsize));
  TMP_FREE;

  /* Apply any partial limb factors of 2. */
  if (rtwos_bits != 0)
    {
      MPN_LSHIFT (rp, rsize, ralloc, (unsigned) rtwos_bits);
      TRACE (mpn_trace ("lshift r", rp, rsize));
    }

  rsize += rtwos_limbs;
  SIZ(r) = (rneg ? -rsize : rsize);
}
    523