Home | History | Annotate | Line # | Download | only in mpz
n_pow_ui.c revision 1.1.1.4
      1 /* mpz_n_pow_ui -- mpn raised to ulong.
      2 
      3    THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
      4    CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
      5    FUTURE GNU MP RELEASES.
      6 
      7 Copyright 2001, 2002, 2005, 2012, 2015 Free Software Foundation, Inc.
      8 
      9 This file is part of the GNU MP Library.
     10 
     11 The GNU MP Library is free software; you can redistribute it and/or modify
     12 it under the terms of either:
     13 
     14   * the GNU Lesser General Public License as published by the Free
     15     Software Foundation; either version 3 of the License, or (at your
     16     option) any later version.
     17 
     18 or
     19 
     20   * the GNU General Public License as published by the Free Software
     21     Foundation; either version 2 of the License, or (at your option) any
     22     later version.
     23 
     24 or both in parallel, as here.
     25 
     26 The GNU MP Library is distributed in the hope that it will be useful, but
     27 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     28 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     29 for more details.
     30 
     31 You should have received copies of the GNU General Public License and the
     32 GNU Lesser General Public License along with the GNU MP Library.  If not,
     33 see https://www.gnu.org/licenses/.  */
     34 
     35 #include "gmp-impl.h"
     36 #include "longlong.h"
     37 
     38 
/* Debug tracing hook: expands to nothing in a normal build.
   Change this to "#define TRACE(x) x" for some traces. */
#define TRACE(x)


/* Use this to test the mul_2 code on a CPU without a native version of that
   routine.  Enabling it substitutes the reference implementation and forces
   the HAVE_NATIVE_mpn_mul_2 code paths below to be compiled.  */
#if 0
#define mpn_mul_2  refmpn_mul_2
#define HAVE_NATIVE_mpn_mul_2  1
#endif
     49 
     50 
     51 /* mpz_pow_ui and mpz_ui_pow_ui want to share almost all of this code.
     52    ui_pow_ui doesn't need the mpn_mul based powering loop or the tests on
     53    bsize==2 or >2, but separating that isn't easy because there's shared
     54    code both before and after (the size calculations and the powers of 2
     55    handling).
     56 
     57    Alternatives:
     58 
     59    It would work to just use the mpn_mul powering loop for 1 and 2 limb
     60    bases, but the current separate loop allows mul_1 and mul_2 to be done
     61    in-place, which might help cache locality a bit.  If mpn_mul was relaxed
     62    to allow source==dest when vn==1 or 2 then some pointer twiddling might
     63    let us get the same effect in one loop.
     64 
     65    The initial powering for bsize==1 into blimb or blimb:blimb_low doesn't
     66    form the biggest possible power of b that fits, only the biggest power of
     67    2 power, ie. b^(2^n).  It'd be possible to choose a bigger power, perhaps
     68    using mp_bases[b].big_base for small b, and thereby get better value
     69    from mpn_mul_1 or mpn_mul_2 in the bignum powering.  It's felt that doing
     70    so would be more complicated than it's worth, and could well end up being
     71    a slowdown for small e.  For big e on the other hand the algorithm is
   dominated by mpn_sqr so there wouldn't be much of a saving.  The current
     73    code can be viewed as simply doing the first few steps of the powering in
     74    a single or double limb where possible.
     75 
     76    If r==b, and blow_twos==0, and r must be realloc'ed, then the temporary
     77    copy made of b is unnecessary.  We could just use the old alloc'ed block
     78    and free it at the end.  But arranging this seems like a lot more trouble
     79    than it's worth.  */
     80 
     81 
/* floor(sqrt(GMP_NUMB_MAX)), ie. the biggest value that can be squared in
   a limb without overflowing.
   FIXME: This formula is an underestimate when GMP_NUMB_BITS is odd.
   NOTE: an underestimate is safe for the use below — it only bounds the
   "blimb <= GMP_NUMB_HALFMAX" test guarding blimb *= blimb, so being too
   small merely stops the single-limb powering a step early.  */

#define GMP_NUMB_HALFMAX  (((mp_limb_t) 1 << GMP_NUMB_BITS/2) - 1)
     87 
     88 
     89 /* The following are for convenience, they update the size and check the
     90    alloc.  */
     91 
/* Square {src,size} into dst, updating size to the size of the product.
   The product of size limbs is at most 2*size limbs; trim one high zero
   limb if it didn't reach that.  alloc is checked only under ASSERT.  */
#define MPN_SQR(dst, alloc, src, size)                  \
  do {                                                  \
    ASSERT (2*(size) <= (alloc));                       \
    mpn_sqr (dst, src, size);                           \
    (size) = 2*(size) - ((dst)[2*(size)-1] == 0);       \
  } while (0)
     99 
/* Multiply {src,size} by {src2,size2} into dst and set size to the product
   size.  mpn_mul returns the most significant limb of the size+size2 limb
   product; when that limb is zero the product is one limb shorter.  */
#define MPN_MUL(dst, alloc, src, size, src2, size2)     \
  do {                                                  \
    mp_limb_t  high_limb;                               \
    ASSERT ((size) + (size2) <= (alloc));               \
    high_limb = mpn_mul (dst, src, size, src2, size2);  \
    (size) += (size2);                                  \
    if (high_limb == 0)                                 \
      (size) -= 1;                                      \
  } while (0)
    107 
/* Multiply {ptr,size} in place by the 2-limb value mult[], updating size.
   mpn_mul_2 stores the low size+1 limbs and returns the top limb of the
   size+2 limb product; store that above and count it only if non-zero.  */
#define MPN_MUL_2(ptr, size, alloc, mult)               \
  do {                                                  \
    mp_limb_t  high_limb;                               \
    ASSERT ((size)+2 <= (alloc));                       \
    high_limb = mpn_mul_2 (ptr, ptr, size, mult);       \
    (size) += 1;                                        \
    (ptr)[(size)] = high_limb;                          \
    if (high_limb != 0)                                 \
      (size) += 1;                                      \
  } while (0)
    117 
/* Multiply {ptr,size} in place by the single limb, updating size.  The
   carry limb returned by mpn_mul_1 is stored above the old high limb and
   counted in size only when it is non-zero.  */
#define MPN_MUL_1(ptr, size, alloc, limb)               \
  do {                                                  \
    mp_limb_t  high_limb;                               \
    ASSERT ((size)+1 <= (alloc));                       \
    high_limb = mpn_mul_1 (ptr, ptr, size, limb);       \
    (ptr)[size] = high_limb;                            \
    if (high_limb != 0)                                 \
      (size) += 1;                                      \
  } while (0)
    126 
/* Left shift {ptr,size} in place by shift bits (0 < shift < limb bits),
   updating size.  The bits shifted out the top are returned by mpn_lshift;
   store them above and count an extra limb only when they are non-zero.  */
#define MPN_LSHIFT(ptr, size, alloc, shift)             \
  do {                                                  \
    mp_limb_t  high_limb;                               \
    ASSERT ((size)+1 <= (alloc));                       \
    high_limb = mpn_lshift (ptr, ptr, size, shift);     \
    (ptr)[size] = high_limb;                            \
    if (high_limb != 0)                                 \
      (size) += 1;                                      \
  } while (0)
    135 
/* Copy {src,size} to dst, right-shifted by shift bits.  A zero shift is a
   plain limb copy and cannot change size; a non-zero shift may turn the
   high limb to zero, in which case size is reduced by one.  */
#define MPN_RSHIFT_OR_COPY(dst, src, size, shift)       \
  do {                                                  \
    if ((shift) != 0)                                   \
      {                                                 \
        mpn_rshift (dst, src, size, shift);             \
        if ((dst)[(size)-1] == 0)                       \
          (size) -= 1;                                  \
      }                                                 \
    else                                                \
      MPN_COPY (dst, src, size);                        \
  } while (0)
    146 
    147 
/* ralloc and talloc are only wanted for ASSERTs, after the initial space
   allocations.  Avoid writing values to them in a normal build, to ensure
   the compiler lets them go dead.  gcc already figures this out itself
   actually.  */

/* Exchange the roles of the result area rp and the temp area tp (and, in
   ASSERT builds only, their alloc sizes).  This relies on the enclosing
   function's locals rp, tp, ralloc, talloc by name, so it is only usable
   inside mpz_n_pow_ui below.  */
#define SWAP_RP_TP                                      \
  do {                                                  \
    MP_PTR_SWAP (rp, tp);                               \
    ASSERT_CODE (MP_SIZE_T_SWAP (ralloc, talloc));      \
  } while (0)
    158 
    159 
/* Set r = b^e, where the base b is the limb array {bp,bsize} and a negative
   bsize indicates a negative base.  Internal backend shared by mpz_pow_ui
   and mpz_ui_pow_ui (see the comments at the top of this file).

   Outline, as implemented below:
   - powers of 2 in b (whole zero limbs, then low zero bits) are stripped
     first and re-applied at the very end as a block of zero limbs plus a
     final lshift;
   - a small base is powered up inside a single limb (or a limb pair when
     mpn_mul_2 is available) as far as possible, leaving any odd-bits-of-e
     factor accumulated in rl (and rl_high);
   - the remaining power is formed by binary exponentiation: square with
     mpn_sqr for every bit of e, multiply in the base on 1-bits, ping-ponging
     between rp and tp (via SWAP_RP_TP) so the final product lands in r.  */
void
mpz_n_pow_ui (mpz_ptr r, mp_srcptr bp, mp_size_t bsize, unsigned long int e)
{
  mp_ptr         rp;
  mp_size_t      rtwos_limbs, ralloc, rsize;
  int            rneg, i, cnt, btwos, r_bp_overlap;
  mp_limb_t      blimb, rl;
  mp_bitcnt_t    rtwos_bits;
#if HAVE_NATIVE_mpn_mul_2
  mp_limb_t      blimb_low, rl_high;
#else
  mp_limb_t      b_twolimbs[2];
#endif
  TMP_DECL;

  TRACE (printf ("mpz_n_pow_ui rp=0x%lX bp=0x%lX bsize=%ld e=%lu (0x%lX)\n",
		 PTR(r), bp, bsize, e, e);
	 mpn_trace ("b", bp, bsize));

  ASSERT (bsize == 0 || bp[ABS(bsize)-1] != 0);
  ASSERT (MPN_SAME_OR_SEPARATE2_P (PTR(r), ALLOC(r), bp, ABS(bsize)));

  /* b^0 == 1, including 0^0 == 1 */
  if (e == 0)
    {
      MPZ_NEWALLOC (r, 1)[0] = 1;
      SIZ(r) = 1;
      return;
    }

  /* 0^e == 0 apart from 0^0 above */
  if (bsize == 0)
    {
      SIZ(r) = 0;
      return;
    }

  /* Sign of the final result: negative only for a negative base raised to
     an odd power. */
  rneg = (bsize < 0 && (e & 1) != 0);
  bsize = ABS (bsize);
  TRACE (printf ("rneg %d\n", rneg));

  /* Note whether r and b share storage; if so b must be copied to temp
     space (below, in the bsize>2 case) before r is reallocated. */
  r_bp_overlap = (PTR(r) == bp);

  /* Strip low zero limbs from b.  Each such limb contributes e whole zero
     limbs to b^e. */
  rtwos_limbs = 0;
  for (blimb = *bp; blimb == 0; blimb = *++bp)
    {
      rtwos_limbs += e;
      bsize--; ASSERT (bsize >= 1);
    }
  TRACE (printf ("trailing zero rtwos_limbs=%ld\n", rtwos_limbs));

  /* Strip low zero bits from b; e*btwos bits of shift are split into whole
     result limbs and a sub-limb left shift applied at the end. */
  count_trailing_zeros (btwos, blimb);
  blimb >>= btwos;
  rtwos_bits = e * btwos;
  rtwos_limbs += rtwos_bits / GMP_NUMB_BITS;
  rtwos_bits %= GMP_NUMB_BITS;
  TRACE (printf ("trailing zero btwos=%d rtwos_limbs=%ld rtwos_bits=%lu\n",
		 btwos, rtwos_limbs, rtwos_bits));

  TMP_MARK;

  rl = 1;
#if HAVE_NATIVE_mpn_mul_2
  rl_high = 0;
#endif

  if (bsize == 1)
    {
    bsize_1:
      /* Power up as far as possible within blimb.  We start here with e!=0,
	 but if e is small then we might reach e==0 and the whole b^e in rl.
	 Notice this code works when blimb==1 too, reaching e==0.  */

      while (blimb <= GMP_NUMB_HALFMAX)
	{
	  TRACE (printf ("small e=0x%lX blimb=0x%lX rl=0x%lX\n",
			 e, blimb, rl));
	  ASSERT (e != 0);
	  if ((e & 1) != 0)
	    rl *= blimb;
	  e >>= 1;
	  if (e == 0)
	    goto got_rl;
	  blimb *= blimb;
	}

#if HAVE_NATIVE_mpn_mul_2
      TRACE (printf ("single power, e=0x%lX b=0x%lX rl=0x%lX\n",
		     e, blimb, rl));

      /* Can power b once more into blimb:blimb_low */
      bsize = 2;
      ASSERT (e != 0);
      if ((e & 1) != 0)
	{
	  umul_ppmm (rl_high, rl, rl, blimb << GMP_NAIL_BITS);
	  rl >>= GMP_NAIL_BITS;
	}
      e >>= 1;
      umul_ppmm (blimb, blimb_low, blimb, blimb << GMP_NAIL_BITS);
      blimb_low >>= GMP_NAIL_BITS;

    got_rl:
      TRACE (printf ("double power e=0x%lX blimb=0x%lX:0x%lX rl=0x%lX:%lX\n",
		     e, blimb, blimb_low, rl_high, rl));

      /* Combine left-over rtwos_bits into rl_high:rl to be handled by the
	 final mul_1 or mul_2 rather than a separate lshift.
	 - rl_high:rl mustn't be 1 (since then there's no final mul)
	 - rl_high mustn't overflow
	 - rl_high mustn't change to non-zero, since mul_1+lshift is
	 probably faster than mul_2 (FIXME: is this true?)  */

      if (rtwos_bits != 0
	  && ! (rl_high == 0 && rl == 1)
	  && (rl_high >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
	{
	  mp_limb_t  new_rl_high = (rl_high << rtwos_bits)
	    | (rl >> (GMP_NUMB_BITS-rtwos_bits));
	  if (! (rl_high == 0 && new_rl_high != 0))
	    {
	      rl_high = new_rl_high;
	      rl <<= rtwos_bits;
	      rtwos_bits = 0;
	      TRACE (printf ("merged rtwos_bits, rl=0x%lX:%lX\n",
			     rl_high, rl));
	    }
	}
#else
    got_rl:
      TRACE (printf ("small power e=0x%lX blimb=0x%lX rl=0x%lX\n",
		     e, blimb, rl));

      /* Combine left-over rtwos_bits into rl to be handled by the final
	 mul_1 rather than a separate lshift.
	 - rl mustn't be 1 (since then there's no final mul)
	 - rl mustn't overflow	*/

      if (rtwos_bits != 0
	  && rl != 1
	  && (rl >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
	{
	  rl <<= rtwos_bits;
	  rtwos_bits = 0;
	  TRACE (printf ("merged rtwos_bits, rl=0x%lX\n", rl));
	}
#endif
    }
  else if (bsize == 2)
    {
      /* Fold the btwos right shift across the two limbs of b. */
      mp_limb_t  bsecond = bp[1];
      if (btwos != 0)
	blimb |= (bsecond << (GMP_NUMB_BITS - btwos)) & GMP_NUMB_MASK;
      bsecond >>= btwos;
      if (bsecond == 0)
	{
	  /* Two limbs became one after rshift. */
	  bsize = 1;
	  goto bsize_1;
	}

      TRACE (printf ("bsize==2 using b=0x%lX:%lX", bsecond, blimb));
#if HAVE_NATIVE_mpn_mul_2
      blimb_low = blimb;
#else
      bp = b_twolimbs;
      b_twolimbs[0] = blimb;
      b_twolimbs[1] = bsecond;
#endif
      blimb = bsecond;
    }
  else
    {
      /* bsize > 2: copy b to temp space when it must be shifted, or when r
	 shares its storage (r gets reallocated below). */
      if (r_bp_overlap || btwos != 0)
	{
	  mp_ptr tp = TMP_ALLOC_LIMBS (bsize);
	  MPN_RSHIFT_OR_COPY (tp, bp, bsize, btwos);
	  bp = tp;
	  TRACE (printf ("rshift or copy bp,bsize, new bsize=%ld\n", bsize));
	}
#if HAVE_NATIVE_mpn_mul_2
      /* in case 3 limbs rshift to 2 and hence use the mul_2 loop below */
      blimb_low = bp[0];
#endif
      blimb = bp[bsize-1];

      TRACE (printf ("big bsize=%ld  ", bsize);
	     mpn_trace ("b", bp, bsize));
    }

  /* At this point blimb is the most significant limb of the base to use.

     Each factor of b takes (bsize*BPML-cnt) bits and there's e of them; +1
     limb to round up the division; +1 for multiplies all using an extra
     limb over the true size; +2 for rl at the end; +1 for lshift at the
     end.

     The size calculation here is reasonably accurate.  The base is at least
     half a limb, so in 32 bits the worst case is 2^16+1 treated as 17 bits
     when it will power up as just over 16, an overestimate of 17/16 =
     6.25%.  For a 64-bit limb it's half that.

     If e==0 then blimb won't be anything useful (though it will be
     non-zero), but that doesn't matter since we just end up with ralloc==5,
     and that's fine for 2 limbs of rl and 1 of lshift.  */

  ASSERT (blimb != 0);
  count_leading_zeros (cnt, blimb);
  ralloc = (bsize*GMP_NUMB_BITS - cnt + GMP_NAIL_BITS) * e / GMP_NUMB_BITS + 5;
  TRACE (printf ("ralloc %ld, from bsize=%ld blimb=0x%lX cnt=%d\n",
		 ralloc, bsize, blimb, cnt));
  rp = MPZ_NEWALLOC (r, ralloc + rtwos_limbs);

  /* Low zero limbs resulting from powers of 2. */
  MPN_ZERO (rp, rtwos_limbs);
  rp += rtwos_limbs;

  if (e == 0)
    {
      /* Any e==0 other than via bsize==1 or bsize==2 is covered at the
	 start.  The whole b^e fitted in rl (and rl_high). */
      rp[0] = rl;
      rsize = 1;
#if HAVE_NATIVE_mpn_mul_2
      rp[1] = rl_high;
      rsize += (rl_high != 0);
#endif
      ASSERT (rp[rsize-1] != 0);
    }
  else
    {
      mp_ptr     tp;
      mp_size_t  talloc;

      /* In the mpn_mul_1 or mpn_mul_2 loops or in the mpn_mul loop when the
	 low bit of e is zero, tp only has to hold the second last power
	 step, which is half the size of the final result.  There's no need
	 to round up the divide by 2, since ralloc includes a +2 for rl
	 which is not needed by tp.  In the mpn_mul loop when the low bit of
	 e is 1, tp must hold nearly the full result, so just size it the
	 same as rp.  */

      talloc = ralloc;
#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2 || (e & 1) == 0)
	talloc /= 2;
#else
      if (bsize <= 1 || (e & 1) == 0)
	talloc /= 2;
#endif
      TRACE (printf ("talloc %ld\n", talloc));
      tp = TMP_ALLOC_LIMBS (talloc);

      /* Go from high to low over the bits of e, starting with i pointing at
	 the bit below the highest 1 (which will mean i==-1 if e==1).  */
      count_leading_zeros (cnt, (mp_limb_t) e);
      i = GMP_LIMB_BITS - cnt - 2;

#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2)
	{
	  mp_limb_t  mult[2];

	  /* Any bsize==1 will have been powered above to be two limbs. */
	  ASSERT (bsize == 2);
	  ASSERT (blimb != 0);

	  /* Arrange the final result ends up in r, not in the temp space */
	  if ((i & 1) == 0)
	    SWAP_RP_TP;

	  rp[0] = blimb_low;
	  rp[1] = blimb;
	  rsize = 2;

	  mult[0] = blimb_low;
	  mult[1] = blimb;

	  for ( ; i >= 0; i--)
	    {
	      TRACE (printf ("mul_2 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
			     i, e, rsize, ralloc, talloc);
		     mpn_trace ("r", rp, rsize));

	      MPN_SQR (tp, talloc, rp, rsize);
	      SWAP_RP_TP;
	      if ((e & (1L << i)) != 0)
		MPN_MUL_2 (rp, rsize, ralloc, mult);
	    }

	  /* Apply the left-over single/double limb factor, if any. */
	  TRACE (mpn_trace ("mul_2 before rl, r", rp, rsize));
	  if (rl_high != 0)
	    {
	      mult[0] = rl;
	      mult[1] = rl_high;
	      MPN_MUL_2 (rp, rsize, ralloc, mult);
	    }
	  else if (rl != 1)
	    MPN_MUL_1 (rp, rsize, ralloc, rl);
	}
#else
      if (bsize == 1)
	{
	  /* Arrange the final result ends up in r, not in the temp space */
	  if ((i & 1) == 0)
	    SWAP_RP_TP;

	  rp[0] = blimb;
	  rsize = 1;

	  for ( ; i >= 0; i--)
	    {
	      TRACE (printf ("mul_1 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
			     i, e, rsize, ralloc, talloc);
		     mpn_trace ("r", rp, rsize));

	      MPN_SQR (tp, talloc, rp, rsize);
	      SWAP_RP_TP;
	      if ((e & (1L << i)) != 0)
		MPN_MUL_1 (rp, rsize, ralloc, blimb);
	    }

	  /* Apply the left-over single limb factor, if any. */
	  TRACE (mpn_trace ("mul_1 before rl, r", rp, rsize));
	  if (rl != 1)
	    MPN_MUL_1 (rp, rsize, ralloc, rl);
	}
#endif
      else
	{
	  int  parity;

	  /* Arrange the final result ends up in r, not in the temp space */
	  ULONG_PARITY (parity, e);
	  if (((parity ^ i) & 1) != 0)
	    SWAP_RP_TP;

	  MPN_COPY (rp, bp, bsize);
	  rsize = bsize;

	  for ( ; i >= 0; i--)
	    {
	      TRACE (printf ("mul loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
			     i, e, rsize, ralloc, talloc);
		     mpn_trace ("r", rp, rsize));

	      MPN_SQR (tp, talloc, rp, rsize);
	      SWAP_RP_TP;
	      if ((e & (1L << i)) != 0)
		{
		  MPN_MUL (tp, talloc, rp, rsize, bp, bsize);
		  SWAP_RP_TP;
		}
	    }
	}
    }

  ASSERT (rp == PTR(r) + rtwos_limbs);
  TRACE (mpn_trace ("end loop r", rp, rsize));
  TMP_FREE;

  /* Apply any partial limb factors of 2. */
  if (rtwos_bits != 0)
    {
      MPN_LSHIFT (rp, rsize, ralloc, (unsigned) rtwos_bits);
      TRACE (mpn_trace ("lshift r", rp, rsize));
    }

  /* Count the stripped low zero limbs back into the size, and apply the
     sign computed at the start. */
  rsize += rtwos_limbs;
  SIZ(r) = (rneg ? -rsize : rsize);
}
    533