Home | History | Annotate | Line # | Download | only in mpz
n_pow_ui.c revision 1.1.1.5
      1 /* mpz_n_pow_ui -- mpn raised to ulong.
      2 
      3    THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
      4    CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
      5    FUTURE GNU MP RELEASES.
      6 
      7 Copyright 2001, 2002, 2005, 2012, 2015, 2020 Free Software Foundation, Inc.
      8 
      9 This file is part of the GNU MP Library.
     10 
     11 The GNU MP Library is free software; you can redistribute it and/or modify
     12 it under the terms of either:
     13 
     14   * the GNU Lesser General Public License as published by the Free
     15     Software Foundation; either version 3 of the License, or (at your
     16     option) any later version.
     17 
     18 or
     19 
     20   * the GNU General Public License as published by the Free Software
     21     Foundation; either version 2 of the License, or (at your option) any
     22     later version.
     23 
     24 or both in parallel, as here.
     25 
     26 The GNU MP Library is distributed in the hope that it will be useful, but
     27 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     28 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     29 for more details.
     30 
     31 You should have received copies of the GNU General Public License and the
     32 GNU Lesser General Public License along with the GNU MP Library.  If not,
     33 see https://www.gnu.org/licenses/.  */
     34 
     35 #include <stdlib.h>
     36 #include <stdio.h>
     37 #include "gmp-impl.h"
     38 #include "longlong.h"
     39 
     40 
     41 /* Change this to "#define TRACE(x) x" for some traces. */
     42 #define TRACE(x)
     43 
     44 
     45 /* Use this to test the mul_2 code on a CPU without a native version of that
     46    routine.  */
     47 #if 0
     48 #define mpn_mul_2  refmpn_mul_2
     49 #define HAVE_NATIVE_mpn_mul_2  1
     50 #endif
     51 
     52 
     53 /* mpz_pow_ui and mpz_ui_pow_ui want to share almost all of this code.
     54    ui_pow_ui doesn't need the mpn_mul based powering loop or the tests on
     55    bsize==2 or >2, but separating that isn't easy because there's shared
     56    code both before and after (the size calculations and the powers of 2
     57    handling).
     58 
     59    Alternatives:
     60 
     61    It would work to just use the mpn_mul powering loop for 1 and 2 limb
     62    bases, but the current separate loop allows mul_1 and mul_2 to be done
     63    in-place, which might help cache locality a bit.  If mpn_mul was relaxed
     64    to allow source==dest when vn==1 or 2 then some pointer twiddling might
     65    let us get the same effect in one loop.
     66 
     67    The initial powering for bsize==1 into blimb or blimb:blimb_low doesn't
     68    form the biggest possible power of b that fits, only the biggest power of
     69    2 power, ie. b^(2^n).  It'd be possible to choose a bigger power, perhaps
     70    using mp_bases[b].big_base for small b, and thereby get better value
     71    from mpn_mul_1 or mpn_mul_2 in the bignum powering.  It's felt that doing
     72    so would be more complicated than it's worth, and could well end up being
     73    a slowdown for small e.  For big e on the other hand the algorithm is
     74    dominated by mpn_sqr so there wouldn't much of a saving.  The current
     75    code can be viewed as simply doing the first few steps of the powering in
     76    a single or double limb where possible.
     77 
     78    If r==b, and blow_twos==0, and r must be realloc'ed, then the temporary
     79    copy made of b is unnecessary.  We could just use the old alloc'ed block
     80    and free it at the end.  But arranging this seems like a lot more trouble
     81    than it's worth.  */
     82 
     83 
     84 /* floor(sqrt(GMP_NUMB_MAX)), ie. the biggest value that can be squared in
     85    a limb without overflowing.
     86    FIXME: This formula is an underestimate when GMP_NUMB_BITS is odd. */
     87 
     88 #define GMP_NUMB_HALFMAX  (((mp_limb_t) 1 << GMP_NUMB_BITS/2) - 1)
     89 
     90 
     91 /* The following are for convenience, they update the size and check the
     92    alloc.  */
     93 
     94 #define MPN_SQR(dst, alloc, src, size)          \
     95   do {                                          \
     96     ASSERT (2*(size) <= (alloc));               \
     97     mpn_sqr (dst, src, size);                   \
     98     (size) *= 2;                                \
     99     (size) -= ((dst)[(size)-1] == 0);           \
    100   } while (0)
    101 
    102 #define MPN_MUL(dst, alloc, src, size, src2, size2)     \
    103   do {                                                  \
    104     mp_limb_t  cy;                                      \
    105     ASSERT ((size) + (size2) <= (alloc));               \
    106     cy = mpn_mul (dst, src, size, src2, size2);         \
    107     (size) += (size2) - (cy == 0);                      \
    108   } while (0)
    109 
    110 #define MPN_MUL_2(ptr, size, alloc, mult)       \
    111   do {                                          \
    112     mp_limb_t  cy;                              \
    113     ASSERT ((size)+2 <= (alloc));               \
    114     cy = mpn_mul_2 (ptr, ptr, size, mult);      \
    115     (size)++;                                   \
    116     (ptr)[(size)] = cy;                         \
    117     (size) += (cy != 0);                        \
    118   } while (0)
    119 
    120 #define MPN_MUL_1(ptr, size, alloc, limb)       \
    121   do {                                          \
    122     mp_limb_t  cy;                              \
    123     ASSERT ((size)+1 <= (alloc));               \
    124     cy = mpn_mul_1 (ptr, ptr, size, limb);      \
    125     (ptr)[size] = cy;                           \
    126     (size) += (cy != 0);                        \
    127   } while (0)
    128 
    129 #define MPN_LSHIFT(ptr, size, alloc, shift)     \
    130   do {                                          \
    131     mp_limb_t  cy;                              \
    132     ASSERT ((size)+1 <= (alloc));               \
    133     cy = mpn_lshift (ptr, ptr, size, shift);    \
    134     (ptr)[size] = cy;                           \
    135     (size) += (cy != 0);                        \
    136   } while (0)
    137 
    138 #define MPN_RSHIFT_OR_COPY(dst, src, size, shift)       \
    139   do {                                                  \
    140     if ((shift) == 0)                                   \
    141       MPN_COPY (dst, src, size);                        \
    142     else                                                \
    143       {                                                 \
    144         mpn_rshift (dst, src, size, shift);             \
    145         (size) -= ((dst)[(size)-1] == 0);               \
    146       }                                                 \
    147   } while (0)
    148 
    149 
    150 /* ralloc and talloc are only wanted for ASSERTs, after the initial space
    151    allocations.  Avoid writing values to them in a normal build, to ensure
    152    the compiler lets them go dead.  gcc already figures this out itself
    153    actually.  */
    154 
    155 #define SWAP_RP_TP                                      \
    156   do {                                                  \
    157     MP_PTR_SWAP (rp, tp);                               \
    158     ASSERT_CODE (MP_SIZE_T_SWAP (ralloc, talloc));      \
    159   } while (0)
    160 
    161 
    162 void
    163 mpz_n_pow_ui (mpz_ptr r, mp_srcptr bp, mp_size_t bsize, unsigned long int e)
    164 {
    165   mp_ptr         rp;
    166   mp_size_t      rtwos_limbs, ralloc, rsize;
    167   int            rneg, i, cnt, btwos, r_bp_overlap;
    168   mp_limb_t      blimb, rl;
    169   mp_bitcnt_t    rtwos_bits;
    170 #if HAVE_NATIVE_mpn_mul_2
    171   mp_limb_t      blimb_low, rl_high;
    172 #else
    173   mp_limb_t      b_twolimbs[2];
    174 #endif
    175   mp_limb_t ovfl;
    176   TMP_DECL;
    177 
    178   TRACE (printf ("mpz_n_pow_ui rp=0x%lX bp=0x%lX bsize=%ld e=%lu (0x%lX)\n",
    179 		 PTR(r), bp, bsize, e, e);
    180 	 mpn_trace ("b", bp, bsize));
    181 
    182   ASSERT (bsize == 0 || bp[ABS(bsize)-1] != 0);
    183   ASSERT (MPN_SAME_OR_SEPARATE2_P (PTR(r), ALLOC(r), bp, ABS(bsize)));
    184 
    185   /* b^0 == 1, including 0^0 == 1 */
    186   if (e == 0)
    187     {
    188       MPZ_NEWALLOC (r, 1)[0] = 1;
    189       SIZ(r) = 1;
    190       return;
    191     }
    192 
    193   /* 0^e == 0 apart from 0^0 above */
    194   if (bsize == 0)
    195     {
    196       SIZ(r) = 0;
    197       return;
    198     }
    199 
    200   /* Sign of the final result. */
    201   rneg = (bsize < 0 && (e & 1) != 0);
    202   bsize = ABS (bsize);
    203   TRACE (printf ("rneg %d\n", rneg));
    204 
    205   r_bp_overlap = (PTR(r) == bp);
    206 
    207   /* Strip low zero limbs from b. */
    208   rtwos_limbs = 0;
    209   for (blimb = *bp; blimb == 0; blimb = *++bp)
    210     {
    211       rtwos_limbs += e;
    212       bsize--; ASSERT (bsize >= 1);
    213     }
    214   TRACE (printf ("trailing zero rtwos_limbs=%ld\n", rtwos_limbs));
    215 
    216   /* Strip low zero bits from b. */
    217   count_trailing_zeros (btwos, blimb);
    218   blimb >>= btwos;
    219 
    220   umul_ppmm (ovfl, rtwos_bits, e, btwos);
    221   if (ovfl)
    222     {
    223       fprintf (stderr, "gmp: overflow in mpz type\n");
    224       abort ();
    225     }
    226 
    227   rtwos_limbs += rtwos_bits / GMP_NUMB_BITS;
    228   rtwos_bits %= GMP_NUMB_BITS;
    229   TRACE (printf ("trailing zero btwos=%d rtwos_limbs=%ld rtwos_bits=%lu\n",
    230 		 btwos, rtwos_limbs, rtwos_bits));
    231 
    232   TMP_MARK;
    233 
    234   rl = 1;
    235 #if HAVE_NATIVE_mpn_mul_2
    236   rl_high = 0;
    237 #endif
    238 
    239   if (bsize == 1)
    240     {
    241     bsize_1:
    242       /* Power up as far as possible within blimb.  We start here with e!=0,
    243 	 but if e is small then we might reach e==0 and the whole b^e in rl.
    244 	 Notice this code works when blimb==1 too, reaching e==0.  */
    245 
    246       while (blimb <= GMP_NUMB_HALFMAX)
    247 	{
    248 	  TRACE (printf ("small e=0x%lX blimb=0x%lX rl=0x%lX\n",
    249 			 e, blimb, rl));
    250 	  ASSERT (e != 0);
    251 	  if ((e & 1) != 0)
    252 	    rl *= blimb;
    253 	  e >>= 1;
    254 	  if (e == 0)
    255 	    goto got_rl;
    256 	  blimb *= blimb;
    257 	}
    258 
    259 #if HAVE_NATIVE_mpn_mul_2
    260       TRACE (printf ("single power, e=0x%lX b=0x%lX rl=0x%lX\n",
    261 		     e, blimb, rl));
    262 
    263       /* Can power b once more into blimb:blimb_low */
    264       bsize = 2;
    265       ASSERT (e != 0);
    266       if ((e & 1) != 0)
    267 	{
    268 	  umul_ppmm (rl_high, rl, rl, blimb << GMP_NAIL_BITS);
    269 	  rl >>= GMP_NAIL_BITS;
    270 	}
    271       e >>= 1;
    272       umul_ppmm (blimb, blimb_low, blimb, blimb << GMP_NAIL_BITS);
    273       blimb_low >>= GMP_NAIL_BITS;
    274 
    275     got_rl:
    276       TRACE (printf ("double power e=0x%lX blimb=0x%lX:0x%lX rl=0x%lX:%lX\n",
    277 		     e, blimb, blimb_low, rl_high, rl));
    278 
    279       /* Combine left-over rtwos_bits into rl_high:rl to be handled by the
    280 	 final mul_1 or mul_2 rather than a separate lshift.
    281 	 - rl_high:rl mustn't be 1 (since then there's no final mul)
    282 	 - rl_high mustn't overflow
    283 	 - rl_high mustn't change to non-zero, since mul_1+lshift is
    284 	 probably faster than mul_2 (FIXME: is this true?)  */
    285 
    286       if (rtwos_bits != 0
    287 	  && ! (rl_high == 0 && rl == 1)
    288 	  && (rl_high >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
    289 	{
    290 	  mp_limb_t  new_rl_high = (rl_high << rtwos_bits)
    291 	    | (rl >> (GMP_NUMB_BITS-rtwos_bits));
    292 	  if (! (rl_high == 0 && new_rl_high != 0))
    293 	    {
    294 	      rl_high = new_rl_high;
    295 	      rl <<= rtwos_bits;
    296 	      rtwos_bits = 0;
    297 	      TRACE (printf ("merged rtwos_bits, rl=0x%lX:%lX\n",
    298 			     rl_high, rl));
    299 	    }
    300 	}
    301 #else
    302     got_rl:
    303       TRACE (printf ("small power e=0x%lX blimb=0x%lX rl=0x%lX\n",
    304 		     e, blimb, rl));
    305 
    306       /* Combine left-over rtwos_bits into rl to be handled by the final
    307 	 mul_1 rather than a separate lshift.
    308 	 - rl mustn't be 1 (since then there's no final mul)
    309 	 - rl mustn't overflow	*/
    310 
    311       if (rtwos_bits != 0
    312 	  && rl != 1
    313 	  && (rl >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
    314 	{
    315 	  rl <<= rtwos_bits;
    316 	  rtwos_bits = 0;
    317 	  TRACE (printf ("merged rtwos_bits, rl=0x%lX\n", rl));
    318 	}
    319 #endif
    320     }
    321   else if (bsize == 2)
    322     {
    323       mp_limb_t  bsecond = bp[1];
    324       if (btwos != 0)
    325 	blimb |= (bsecond << (GMP_NUMB_BITS - btwos)) & GMP_NUMB_MASK;
    326       bsecond >>= btwos;
    327       if (bsecond == 0)
    328 	{
    329 	  /* Two limbs became one after rshift. */
    330 	  bsize = 1;
    331 	  goto bsize_1;
    332 	}
    333 
    334       TRACE (printf ("bsize==2 using b=0x%lX:%lX", bsecond, blimb));
    335 #if HAVE_NATIVE_mpn_mul_2
    336       blimb_low = blimb;
    337 #else
    338       bp = b_twolimbs;
    339       b_twolimbs[0] = blimb;
    340       b_twolimbs[1] = bsecond;
    341 #endif
    342       blimb = bsecond;
    343     }
    344   else
    345     {
    346       if (r_bp_overlap || btwos != 0)
    347 	{
    348 	  mp_ptr tp = TMP_ALLOC_LIMBS (bsize);
    349 	  MPN_RSHIFT_OR_COPY (tp, bp, bsize, btwos);
    350 	  bp = tp;
    351 	  TRACE (printf ("rshift or copy bp,bsize, new bsize=%ld\n", bsize));
    352 	}
    353 #if HAVE_NATIVE_mpn_mul_2
    354       /* in case 3 limbs rshift to 2 and hence use the mul_2 loop below */
    355       blimb_low = bp[0];
    356 #endif
    357       blimb = bp[bsize-1];
    358 
    359       TRACE (printf ("big bsize=%ld  ", bsize);
    360 	     mpn_trace ("b", bp, bsize));
    361     }
    362 
    363   /* At this point blimb is the most significant limb of the base to use.
    364 
    365      Each factor of b takes (bsize*BPML-cnt) bits and there's e of them; +1
    366      limb to round up the division; +1 for multiplies all using an extra
    367      limb over the true size; +2 for rl at the end; +1 for lshift at the
    368      end.
    369 
    370      The size calculation here is reasonably accurate.  The base is at least
    371      half a limb, so in 32 bits the worst case is 2^16+1 treated as 17 bits
    372      when it will power up as just over 16, an overestimate of 17/16 =
    373      6.25%.  For a 64-bit limb it's half that.
    374 
    375      If e==0 then blimb won't be anything useful (though it will be
    376      non-zero), but that doesn't matter since we just end up with ralloc==5,
    377      and that's fine for 2 limbs of rl and 1 of lshift.  */
    378 
    379   ASSERT (blimb != 0);
    380   count_leading_zeros (cnt, blimb);
    381 
    382   umul_ppmm (ovfl, ralloc, (bsize*GMP_NUMB_BITS - cnt + GMP_NAIL_BITS), e);
    383   if (ovfl)
    384     {
    385       fprintf (stderr, "gmp: overflow in mpz type\n");
    386       abort ();
    387     }
    388   ralloc = ralloc / GMP_NUMB_BITS + 5;
    389 
    390   TRACE (printf ("ralloc %ld, from bsize=%ld blimb=0x%lX cnt=%d\n",
    391 		 ralloc, bsize, blimb, cnt));
    392   rp = MPZ_NEWALLOC (r, ralloc + rtwos_limbs);
    393 
    394   /* Low zero limbs resulting from powers of 2. */
    395   MPN_ZERO (rp, rtwos_limbs);
    396   rp += rtwos_limbs;
    397 
    398   if (e == 0)
    399     {
    400       /* Any e==0 other than via bsize==1 or bsize==2 is covered at the
    401 	 start. */
    402       rp[0] = rl;
    403       rsize = 1;
    404 #if HAVE_NATIVE_mpn_mul_2
    405       rp[1] = rl_high;
    406       rsize += (rl_high != 0);
    407 #endif
    408       ASSERT (rp[rsize-1] != 0);
    409     }
    410   else
    411     {
    412       mp_ptr     tp;
    413       mp_size_t  talloc;
    414 
    415       /* In the mpn_mul_1 or mpn_mul_2 loops or in the mpn_mul loop when the
    416 	 low bit of e is zero, tp only has to hold the second last power
    417 	 step, which is half the size of the final result.  There's no need
    418 	 to round up the divide by 2, since ralloc includes a +2 for rl
    419 	 which not needed by tp.  In the mpn_mul loop when the low bit of e
    420 	 is 1, tp must hold nearly the full result, so just size it the same
    421 	 as rp.  */
    422 
    423       talloc = ralloc;
    424 #if HAVE_NATIVE_mpn_mul_2
    425       if (bsize <= 2 || (e & 1) == 0)
    426 	talloc /= 2;
    427 #else
    428       if (bsize <= 1 || (e & 1) == 0)
    429 	talloc /= 2;
    430 #endif
    431       TRACE (printf ("talloc %ld\n", talloc));
    432       tp = TMP_ALLOC_LIMBS (talloc);
    433 
    434       /* Go from high to low over the bits of e, starting with i pointing at
    435 	 the bit below the highest 1 (which will mean i==-1 if e==1).  */
    436       count_leading_zeros (cnt, (mp_limb_t) e);
    437       i = GMP_LIMB_BITS - cnt - 2;
    438 
    439 #if HAVE_NATIVE_mpn_mul_2
    440       if (bsize <= 2)
    441 	{
    442 	  mp_limb_t  mult[2];
    443 
    444 	  /* Any bsize==1 will have been powered above to be two limbs. */
    445 	  ASSERT (bsize == 2);
    446 	  ASSERT (blimb != 0);
    447 
    448 	  /* Arrange the final result ends up in r, not in the temp space */
    449 	  if ((i & 1) == 0)
    450 	    SWAP_RP_TP;
    451 
    452 	  rp[0] = blimb_low;
    453 	  rp[1] = blimb;
    454 	  rsize = 2;
    455 
    456 	  mult[0] = blimb_low;
    457 	  mult[1] = blimb;
    458 
    459 	  for ( ; i >= 0; i--)
    460 	    {
    461 	      TRACE (printf ("mul_2 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
    462 			     i, e, rsize, ralloc, talloc);
    463 		     mpn_trace ("r", rp, rsize));
    464 
    465 	      MPN_SQR (tp, talloc, rp, rsize);
    466 	      SWAP_RP_TP;
    467 	      if ((e & (1L << i)) != 0)
    468 		MPN_MUL_2 (rp, rsize, ralloc, mult);
    469 	    }
    470 
    471 	  TRACE (mpn_trace ("mul_2 before rl, r", rp, rsize));
    472 	  if (rl_high != 0)
    473 	    {
    474 	      mult[0] = rl;
    475 	      mult[1] = rl_high;
    476 	      MPN_MUL_2 (rp, rsize, ralloc, mult);
    477 	    }
    478 	  else if (rl != 1)
    479 	    MPN_MUL_1 (rp, rsize, ralloc, rl);
    480 	}
    481 #else
    482       if (bsize == 1)
    483 	{
    484 	  /* Arrange the final result ends up in r, not in the temp space */
    485 	  if ((i & 1) == 0)
    486 	    SWAP_RP_TP;
    487 
    488 	  rp[0] = blimb;
    489 	  rsize = 1;
    490 
    491 	  for ( ; i >= 0; i--)
    492 	    {
    493 	      TRACE (printf ("mul_1 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
    494 			     i, e, rsize, ralloc, talloc);
    495 		     mpn_trace ("r", rp, rsize));
    496 
    497 	      MPN_SQR (tp, talloc, rp, rsize);
    498 	      SWAP_RP_TP;
    499 	      if ((e & (1L << i)) != 0)
    500 		MPN_MUL_1 (rp, rsize, ralloc, blimb);
    501 	    }
    502 
    503 	  TRACE (mpn_trace ("mul_1 before rl, r", rp, rsize));
    504 	  if (rl != 1)
    505 	    MPN_MUL_1 (rp, rsize, ralloc, rl);
    506 	}
    507 #endif
    508       else
    509 	{
    510 	  int  parity;
    511 
    512 	  /* Arrange the final result ends up in r, not in the temp space */
    513 	  ULONG_PARITY (parity, e);
    514 	  if (((parity ^ i) & 1) != 0)
    515 	    SWAP_RP_TP;
    516 
    517 	  MPN_COPY (rp, bp, bsize);
    518 	  rsize = bsize;
    519 
    520 	  for ( ; i >= 0; i--)
    521 	    {
    522 	      TRACE (printf ("mul loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
    523 			     i, e, rsize, ralloc, talloc);
    524 		     mpn_trace ("r", rp, rsize));
    525 
    526 	      MPN_SQR (tp, talloc, rp, rsize);
    527 	      SWAP_RP_TP;
    528 	      if ((e & (1L << i)) != 0)
    529 		{
    530 		  MPN_MUL (tp, talloc, rp, rsize, bp, bsize);
    531 		  SWAP_RP_TP;
    532 		}
    533 	    }
    534 	}
    535     }
    536 
    537   ASSERT (rp == PTR(r) + rtwos_limbs);
    538   TRACE (mpn_trace ("end loop r", rp, rsize));
    539   TMP_FREE;
    540 
    541   /* Apply any partial limb factors of 2. */
    542   if (rtwos_bits != 0)
    543     {
    544       MPN_LSHIFT (rp, rsize, ralloc, (unsigned) rtwos_bits);
    545       TRACE (mpn_trace ("lshift r", rp, rsize));
    546     }
    547 
    548   rsize += rtwos_limbs;
    549   SIZ(r) = (rneg ? -rsize : rsize);
    550 }
    551