/* mpz_n_pow_ui -- mpn raised to ulong.

   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
   FUTURE GNU MP RELEASES.

Copyright 2001, 2002, 2005, 2012 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"


/* Change this to "#define TRACE(x) x" for some traces. */
#define TRACE(x)


/* Use this to test the mul_2 code on a CPU without a native version of that
   routine.  */
#if 0
#define mpn_mul_2  refmpn_mul_2
#define HAVE_NATIVE_mpn_mul_2  1
#endif

/* mpz_pow_ui and mpz_ui_pow_ui want to share almost all of this code.
   ui_pow_ui doesn't need the mpn_mul based powering loop or the tests on
   bsize==2 or >2, but separating that isn't easy because there's shared
   code both before and after (the size calculations and the powers of 2
   handling).

   Alternatives:

   It would work to just use the mpn_mul powering loop for 1 and 2 limb
   bases, but the current separate loop allows mul_1 and mul_2 to be done
   in-place, which might help cache locality a bit.  If mpn_mul was relaxed
   to allow source==dest when vn==1 or 2 then some pointer twiddling might
   let us get the same effect in one loop.

   The initial powering for bsize==1 into blimb or blimb:blimb_low doesn't
   form the biggest possible power of b that fits, only the biggest
   power-of-2 exponent, ie. b^(2^n).  It'd be possible to choose a bigger
   power, perhaps using mp_bases[b].big_base for small b, and thereby get
   better value from mpn_mul_1 or mpn_mul_2 in the bignum powering.  It's
   felt that doing so would be more complicated than it's worth, and could
   well end up being a slowdown for small e.  For big e on the other hand
   the algorithm is dominated by mpn_sqr so there wouldn't be much of a
   saving.  The current code can be viewed as simply doing the first few
   steps of the powering in a single or double limb where possible.

   If r==b, and blow_twos==0, and r must be realloc'ed, then the temporary
   copy made of b is unnecessary.  We could just use the old alloc'ed block
   and free it at the end.  But arranging this seems like a lot more trouble
   than it's worth.  */
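
/* For reference, the overall method below is plain left-to-right binary
   exponentiation: square for every bit of e, multiply by b for every 1
   bit.  The following is a minimal sketch of that idea on single limbs
   (illustration only, not part of the library; example_pow is a
   hypothetical name and overflow is ignored).  */
#if 0
static mp_limb_t
example_pow (mp_limb_t b, unsigned long e)
{
  mp_limb_t  r = 1;
  int        i = 0;
  while ((e >> i) > 1)          /* find the position of the highest 1 bit */
    i++;
  for ( ; i >= 0; i--)
    {
      r = r * r;                /* square for every bit of e */
      if ((e >> i) & 1)
        r = r * b;              /* multiply by b for every 1 bit */
    }
  return r;
}
#endif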


/* floor(sqrt(GMP_NUMB_MAX)), ie. the biggest value that can be squared in
   a limb without overflowing.
   FIXME: This formula is an underestimate when GMP_NUMB_BITS is odd. */

#define GMP_NUMB_HALFMAX  (((mp_limb_t) 1 << GMP_NUMB_BITS/2) - 1)
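
/* For example, with GMP_NUMB_BITS==32 this gives 0xFFFF, and squaring it
   stays within a limb: (2^16-1)^2 = 2^32 - 2^17 + 1 < 2^32.  For odd
   GMP_NUMB_BITS the shift rounds down, hence the FIXME above: with
   GMP_NUMB_BITS==31 the formula gives 2^15-1 = 32767, whereas
   floor(sqrt(2^31-1)) = 46340.  */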


/* The following are for convenience, they update the size and check the
   alloc.  */

#define MPN_SQR(dst, alloc, src, size)          \
  do {                                          \
    ASSERT (2*(size) <= (alloc));               \
    mpn_sqr (dst, src, size);                   \
    (size) *= 2;                                \
    (size) -= ((dst)[(size)-1] == 0);           \
  } while (0)

#define MPN_MUL(dst, alloc, src, size, src2, size2)     \
  do {                                                  \
    mp_limb_t  cy;                                      \
    ASSERT ((size) + (size2) <= (alloc));               \
    cy = mpn_mul (dst, src, size, src2, size2);         \
    (size) += (size2) - (cy == 0);                      \
  } while (0)

#define MPN_MUL_2(ptr, size, alloc, mult)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+2 <= (alloc));               \
    cy = mpn_mul_2 (ptr, ptr, size, mult);      \
    (size)++;                                   \
    (ptr)[(size)] = cy;                         \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_MUL_1(ptr, size, alloc, limb)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_mul_1 (ptr, ptr, size, limb);      \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_LSHIFT(ptr, size, alloc, shift)     \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_lshift (ptr, ptr, size, shift);    \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_RSHIFT_OR_COPY(dst, src, size, shift)       \
  do {                                                  \
    if ((shift) == 0)                                   \
      MPN_COPY (dst, src, size);                        \
    else                                                \
      {                                                 \
        mpn_rshift (dst, src, size, shift);             \
        (size) -= ((dst)[(size)-1] == 0);               \
      }                                                 \
  } while (0)


/* ralloc and talloc are only wanted for ASSERTs, after the initial space
   allocations.  Avoid writing values to them in a normal build, to ensure
   the compiler lets them go dead.  gcc already figures this out itself
   actually.  */

#define SWAP_RP_TP                                      \
  do {                                                  \
    MP_PTR_SWAP (rp, tp);                               \
    ASSERT_CODE (MP_SIZE_T_SWAP (ralloc, talloc));      \
  } while (0)
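
/* Illustrative shape of the powering loops below (a sketch, not extra
   library code): rp and tp ping-pong so each squaring writes to fresh
   space, and the size-updating macros keep rsize current:

       MPN_SQR (tp, talloc, rp, rsize);    squaring into the other buffer
       SWAP_RP_TP;                         result becomes the new rp
       if ((e & (1L << i)) != 0)
         MPN_MUL_1 (rp, rsize, ralloc, blimb);

   The initial SWAP_RP_TP parity is chosen so the last operation lands in
   r's own block.  */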


void
mpz_n_pow_ui (mpz_ptr r, mp_srcptr bp, mp_size_t bsize, unsigned long int e)
{
  mp_ptr         rp;
  mp_size_t      rtwos_limbs, ralloc, rsize;
  int            rneg, i, cnt, btwos, r_bp_overlap;
  mp_limb_t      blimb, rl;
  mp_bitcnt_t    rtwos_bits;
#if HAVE_NATIVE_mpn_mul_2
  mp_limb_t      blimb_low, rl_high;
#else
  mp_limb_t      b_twolimbs[2];
#endif
  TMP_DECL;

  TRACE (printf ("mpz_n_pow_ui rp=0x%lX bp=0x%lX bsize=%ld e=%lu (0x%lX)\n",
                 PTR(r), bp, bsize, e, e);
         mpn_trace ("b", bp, bsize));

  ASSERT (bsize == 0 || bp[ABS(bsize)-1] != 0);
  ASSERT (MPN_SAME_OR_SEPARATE2_P (PTR(r), ALLOC(r), bp, ABS(bsize)));

  /* b^0 == 1, including 0^0 == 1 */
  if (e == 0)
    {
      PTR(r)[0] = 1;
      SIZ(r) = 1;
      return;
    }

  /* 0^e == 0 apart from 0^0 above */
  if (bsize == 0)
    {
      SIZ(r) = 0;
      return;
    }

  /* Sign of the final result. */
  rneg = (bsize < 0 && (e & 1) != 0);
  bsize = ABS (bsize);
  TRACE (printf ("rneg %d\n", rneg));

  r_bp_overlap = (PTR(r) == bp);

  /* Strip low zero limbs from b. */
  rtwos_limbs = 0;
  for (blimb = *bp; blimb == 0; blimb = *++bp)
    {
      rtwos_limbs += e;
      bsize--; ASSERT (bsize >= 1);
    }
  TRACE (printf ("trailing zero rtwos_limbs=%ld\n", rtwos_limbs));

  /* Strip low zero bits from b. */
  count_trailing_zeros (btwos, blimb);
  blimb >>= btwos;
  rtwos_bits = e * btwos;
  rtwos_limbs += rtwos_bits / GMP_NUMB_BITS;
  rtwos_bits %= GMP_NUMB_BITS;
  TRACE (printf ("trailing zero btwos=%d rtwos_limbs=%ld rtwos_bits=%lu\n",
                 btwos, rtwos_limbs, rtwos_bits));
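
  /* Worked example of the powers-of-2 accounting (illustration only):
     with GMP_NUMB_BITS==32, b = 12 = 2^2 * 3 and e = 20, btwos is 2, so
     b^e carries a factor 2^40: rtwos_bits = 40, giving rtwos_limbs += 1
     and rtwos_bits = 8 left over for the final lshift.  */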

  TMP_MARK;

  rl = 1;
#if HAVE_NATIVE_mpn_mul_2
  rl_high = 0;
#endif

  if (bsize == 1)
    {
    bsize_1:
      /* Power up as far as possible within blimb.  We start here with e!=0,
         but if e is small then we might reach e==0 with the whole b^e in
         rl.  Notice this code works when blimb==1 too, reaching e==0.  */

      while (blimb <= GMP_NUMB_HALFMAX)
        {
          TRACE (printf ("small e=0x%lX blimb=0x%lX rl=0x%lX\n",
                         e, blimb, rl));
          ASSERT (e != 0);
          if ((e & 1) != 0)
            rl *= blimb;
          e >>= 1;
          if (e == 0)
            goto got_rl;
          blimb *= blimb;
        }
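
      /* For instance (illustration only): b==3, e==5 runs this loop as
         e=5: rl=3, blimb=9; e=2: blimb=81; e=1: rl=3*81=243, e=0, so
         b^e = 243 lands in rl via got_rl.  */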

#if HAVE_NATIVE_mpn_mul_2
      TRACE (printf ("single power, e=0x%lX b=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Can power b once more into blimb:blimb_low */
      bsize = 2;
      ASSERT (e != 0);
      if ((e & 1) != 0)
        {
          umul_ppmm (rl_high, rl, rl, blimb << GMP_NAIL_BITS);
          rl >>= GMP_NAIL_BITS;
        }
      e >>= 1;
      umul_ppmm (blimb, blimb_low, blimb, blimb << GMP_NAIL_BITS);
      blimb_low >>= GMP_NAIL_BITS;

    got_rl:
      TRACE (printf ("double power e=0x%lX blimb=0x%lX:0x%lX rl=0x%lX:%lX\n",
                     e, blimb, blimb_low, rl_high, rl));

      /* Combine left-over rtwos_bits into rl_high:rl to be handled by the
         final mul_1 or mul_2 rather than a separate lshift.
         - rl_high:rl mustn't be 1 (since then there's no final mul)
         - rl_high mustn't overflow
         - rl_high mustn't change to non-zero, since mul_1+lshift is
           probably faster than mul_2 (FIXME: is this true?)  */

      if (rtwos_bits != 0
          && ! (rl_high == 0 && rl == 1)
          && (rl_high >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          mp_limb_t  new_rl_high = (rl_high << rtwos_bits)
            | (rl >> (GMP_NUMB_BITS-rtwos_bits));
          if (! (rl_high == 0 && new_rl_high != 0))
            {
              rl_high = new_rl_high;
              rl <<= rtwos_bits;
              rtwos_bits = 0;
              TRACE (printf ("merged rtwos_bits, rl=0x%lX:%lX\n",
                             rl_high, rl));
            }
        }
#else
    got_rl:
      TRACE (printf ("small power e=0x%lX blimb=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Combine left-over rtwos_bits into rl to be handled by the final
         mul_1 rather than a separate lshift.
         - rl mustn't be 1 (since then there's no final mul)
         - rl mustn't overflow  */

      if (rtwos_bits != 0
          && rl != 1
          && (rl >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          rl <<= rtwos_bits;
          rtwos_bits = 0;
          TRACE (printf ("merged rtwos_bits, rl=0x%lX\n", rl));
        }
#endif
    }
  else if (bsize == 2)
    {
      mp_limb_t  bsecond = bp[1];
      if (btwos != 0)
        blimb |= (bsecond << (GMP_NUMB_BITS - btwos)) & GMP_NUMB_MASK;
      bsecond >>= btwos;
      if (bsecond == 0)
        {
          /* Two limbs became one after rshift. */
          bsize = 1;
          goto bsize_1;
        }

      TRACE (printf ("bsize==2 using b=0x%lX:%lX\n", bsecond, blimb));
#if HAVE_NATIVE_mpn_mul_2
      blimb_low = blimb;
#else
      bp = b_twolimbs;
      b_twolimbs[0] = blimb;
      b_twolimbs[1] = bsecond;
#endif
      blimb = bsecond;
    }
  else
    {
      if (r_bp_overlap || btwos != 0)
        {
          mp_ptr tp = TMP_ALLOC_LIMBS (bsize);
          MPN_RSHIFT_OR_COPY (tp, bp, bsize, btwos);
          bp = tp;
          TRACE (printf ("rshift or copy bp,bsize, new bsize=%ld\n", bsize));
        }
#if HAVE_NATIVE_mpn_mul_2
      /* in case 3 limbs rshift to 2 and hence use the mul_2 loop below */
      blimb_low = bp[0];
#endif
      blimb = bp[bsize-1];

      TRACE (printf ("big bsize=%ld  ", bsize);
             mpn_trace ("b", bp, bsize));
    }

  /* At this point blimb is the most significant limb of the base to use.

     Each factor of b takes (bsize*GMP_NUMB_BITS - cnt) bits and there's e
     of them; +1 limb to round up the division; +1 for multiplies all using
     an extra limb over the true size; +2 for rl at the end; +1 for lshift
     at the end.

     The size calculation here is reasonably accurate.  The base is at least
     half a limb, so in 32 bits the worst case is 2^16+1 treated as 17 bits
     when it will power up as just over 16, an overestimate of 17/16 =
     6.25%.  For a 64-bit limb it's half that.

     If e==0 then blimb won't be anything useful (though it will be
     non-zero), but that doesn't matter since we just end up with ralloc==5,
     and that's fine for 2 limbs of rl and 1 of lshift.  */
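
  /* Worked instance of the estimate (illustration only; 32-bit limbs, no
     nails): b = 0x10001 so bsize=1 and cnt=15, e = 32 gives
     ralloc = 17*32/32 + 5 = 22 limbs, while b^32 is just over 2^512,
     ie. 17 limbs; the slack is the intentional overestimate above.  */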

  ASSERT (blimb != 0);
  count_leading_zeros (cnt, blimb);
  ralloc = (bsize*GMP_NUMB_BITS - cnt + GMP_NAIL_BITS) * e / GMP_NUMB_BITS + 5;
  TRACE (printf ("ralloc %ld, from bsize=%ld blimb=0x%lX cnt=%d\n",
                 ralloc, bsize, blimb, cnt));
  rp = MPZ_REALLOC (r, ralloc + rtwos_limbs);

  /* Low zero limbs resulting from powers of 2. */
  MPN_ZERO (rp, rtwos_limbs);
  rp += rtwos_limbs;

  if (e == 0)
    {
      /* Any e==0 other than via bsize==1 or bsize==2 is covered at the
         start. */
      rp[0] = rl;
      rsize = 1;
#if HAVE_NATIVE_mpn_mul_2
      rp[1] = rl_high;
      rsize += (rl_high != 0);
#endif
      ASSERT (rp[rsize-1] != 0);
    }
  else
    {
      mp_ptr     tp;
      mp_size_t  talloc;

      /* In the mpn_mul_1 or mpn_mul_2 loops or in the mpn_mul loop when the
         low bit of e is zero, tp only has to hold the second last power
         step, which is half the size of the final result.  There's no need
         to round up the divide by 2, since ralloc includes a +2 for rl
         which is not needed by tp.  In the mpn_mul loop when the low bit
         of e is 1, tp must hold nearly the full result, so just size it
         the same as rp.  */
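
      /* E.g. (illustration only) with ralloc==22 as in the example above
         and e==32, the low bit of e is zero, so talloc = 22/2 = 11 limbs;
         tp never holds more than the half-size second-to-last power.  */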

      talloc = ralloc;
#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2 || (e & 1) == 0)
        talloc /= 2;
#else
      if (bsize <= 1 || (e & 1) == 0)
        talloc /= 2;
#endif
      TRACE (printf ("talloc %ld\n", talloc));
      tp = TMP_ALLOC_LIMBS (talloc);

      /* Go from high to low over the bits of e, starting with i pointing at
         the bit below the highest 1 (which will mean i==-1 if e==1).  */
      count_leading_zeros (cnt, (mp_limb_t) e);
      i = GMP_LIMB_BITS - cnt - 2;
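
      /* E.g. e==1 gives cnt==GMP_LIMB_BITS-1 and i==-1 (no loop
         iterations); e==6 (binary 110) gives i==1, so the loops below
         handle bits 1 and 0.  */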

#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2)
        {
          mp_limb_t  mult[2];

          /* Any bsize==1 will have been powered above to be two limbs. */
          ASSERT (bsize == 2);
          ASSERT (blimb != 0);

          /* Arrange that the final result ends up in r, not in the temp
             space */
          if ((i & 1) == 0)
            SWAP_RP_TP;

          rp[0] = blimb_low;
          rp[1] = blimb;
          rsize = 2;

          mult[0] = blimb_low;
          mult[1] = blimb;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_2 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_2 (rp, rsize, ralloc, mult);
            }

          TRACE (mpn_trace ("mul_2 before rl, r", rp, rsize));
          if (rl_high != 0)
            {
              mult[0] = rl;
              mult[1] = rl_high;
              MPN_MUL_2 (rp, rsize, ralloc, mult);
            }
          else if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#else
      if (bsize == 1)
        {
          /* Arrange that the final result ends up in r, not in the temp
             space */
          if ((i & 1) == 0)
            SWAP_RP_TP;

          rp[0] = blimb;
          rsize = 1;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_1 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_1 (rp, rsize, ralloc, blimb);
            }

          TRACE (mpn_trace ("mul_1 before rl, r", rp, rsize));
          if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#endif
      else
        {
          int  parity;

          /* Arrange that the final result ends up in r, not in the temp
             space */
          ULONG_PARITY (parity, e);
          if (((parity ^ i) & 1) != 0)
            SWAP_RP_TP;

          MPN_COPY (rp, bp, bsize);
          rsize = bsize;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                {
                  MPN_MUL (tp, talloc, rp, rsize, bp, bsize);
                  SWAP_RP_TP;
                }
            }
        }
    }

  ASSERT (rp == PTR(r) + rtwos_limbs);
  TRACE (mpn_trace ("end loop r", rp, rsize));
  TMP_FREE;

  /* Apply any partial limb factors of 2. */
  if (rtwos_bits != 0)
    {
      MPN_LSHIFT (rp, rsize, ralloc, (unsigned) rtwos_bits);
      TRACE (mpn_trace ("lshift r", rp, rsize));
    }

  rsize += rtwos_limbs;
  SIZ(r) = (rneg ? -rsize : rsize);
}
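
/* A hedged caller sketch (illustration only, not the library's own code):
   mpz_pow_ui could wrap this routine by passing the base's limb pointer
   and signed size, as the sharing comment near the top suggests.  */
#if 0
void
example_mpz_pow_ui (mpz_ptr r, mpz_srcptr b, unsigned long e)
{
  mpz_n_pow_ui (r, PTR(b), (mp_size_t) SIZ(b), e);
}
#endif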