Home | History | Annotate | Line # | Download | only in mpz
n_pow_ui.c revision 1.1.1.1.8.1
      1          1.1  mrg /* mpz_n_pow_ui -- mpn raised to ulong.
      2          1.1  mrg 
      3          1.1  mrg    THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
      4          1.1  mrg    CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
      5          1.1  mrg    FUTURE GNU MP RELEASES.
      6          1.1  mrg 
      7  1.1.1.1.8.1  tls Copyright 2001, 2002, 2005, 2012 Free Software Foundation, Inc.
      8          1.1  mrg 
      9          1.1  mrg This file is part of the GNU MP Library.
     10          1.1  mrg 
     11          1.1  mrg The GNU MP Library is free software; you can redistribute it and/or modify
     12          1.1  mrg it under the terms of the GNU Lesser General Public License as published by
     13          1.1  mrg the Free Software Foundation; either version 3 of the License, or (at your
     14          1.1  mrg option) any later version.
     15          1.1  mrg 
     16          1.1  mrg The GNU MP Library is distributed in the hope that it will be useful, but
     17          1.1  mrg WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     18          1.1  mrg or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
     19          1.1  mrg License for more details.
     20          1.1  mrg 
     21          1.1  mrg You should have received a copy of the GNU Lesser General Public License
     22          1.1  mrg along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
     23          1.1  mrg 
     24          1.1  mrg #include "gmp.h"
     25          1.1  mrg #include "gmp-impl.h"
     26          1.1  mrg #include "longlong.h"
     27          1.1  mrg 
     28          1.1  mrg 
/* Change this to "#define TRACE(x) x" for some traces.  With the default
   empty expansion all TRACE(...) calls below compile away to nothing.  */
#define TRACE(x)


/* Use this to test the mul_2 code on a CPU without a native version of that
   routine.  */
#if 0
#define mpn_mul_2  refmpn_mul_2
#define HAVE_NATIVE_mpn_mul_2  1
#endif
     39          1.1  mrg 
     40          1.1  mrg 
     41          1.1  mrg /* mpz_pow_ui and mpz_ui_pow_ui want to share almost all of this code.
     42          1.1  mrg    ui_pow_ui doesn't need the mpn_mul based powering loop or the tests on
     43          1.1  mrg    bsize==2 or >2, but separating that isn't easy because there's shared
     44          1.1  mrg    code both before and after (the size calculations and the powers of 2
     45          1.1  mrg    handling).
     46          1.1  mrg 
     47          1.1  mrg    Alternatives:
     48          1.1  mrg 
     49          1.1  mrg    It would work to just use the mpn_mul powering loop for 1 and 2 limb
     50          1.1  mrg    bases, but the current separate loop allows mul_1 and mul_2 to be done
     51          1.1  mrg    in-place, which might help cache locality a bit.  If mpn_mul was relaxed
     52          1.1  mrg    to allow source==dest when vn==1 or 2 then some pointer twiddling might
     53          1.1  mrg    let us get the same effect in one loop.
     54          1.1  mrg 
     55          1.1  mrg    The initial powering for bsize==1 into blimb or blimb:blimb_low doesn't
     56          1.1  mrg    form the biggest possible power of b that fits, only the biggest power of
     57          1.1  mrg    2 power, ie. b^(2^n).  It'd be possible to choose a bigger power, perhaps
     58          1.1  mrg    using mp_bases[b].big_base for small b, and thereby get better value
     59          1.1  mrg    from mpn_mul_1 or mpn_mul_2 in the bignum powering.  It's felt that doing
     60          1.1  mrg    so would be more complicated than it's worth, and could well end up being
     61          1.1  mrg    a slowdown for small e.  For big e on the other hand the algorithm is
     62          1.1  mrg    dominated by mpn_sqr so there wouldn't be much of a saving.  The current
     63          1.1  mrg    code can be viewed as simply doing the first few steps of the powering in
     64          1.1  mrg    a single or double limb where possible.
     65          1.1  mrg 
     66          1.1  mrg    If r==b, and blow_twos==0, and r must be realloc'ed, then the temporary
     67          1.1  mrg    copy made of b is unnecessary.  We could just use the old alloc'ed block
     68          1.1  mrg    and free it at the end.  But arranging this seems like a lot more trouble
     69          1.1  mrg    than it's worth.  */
     70          1.1  mrg 
     71          1.1  mrg 
/* floor(sqrt(GMP_NUMB_MAX)), ie. the biggest value that can be squared in
   a limb without overflowing.
   FIXME: This formula is an underestimate when GMP_NUMB_BITS is odd. */

#define GMP_NUMB_HALFMAX  (((mp_limb_t) 1 << GMP_NUMB_BITS/2) - 1)
     77          1.1  mrg 
     78          1.1  mrg 
/* The following are for convenience, they update the size and check the
   alloc.  */

/* dst = {src,size}^2, updating size to the size of the square.  mpn_sqr
   writes 2*size limbs; the high one may be zero, in which case size is
   reduced by 1 so {dst,size} stays normalized.  */
#define MPN_SQR(dst, alloc, src, size)          \
  do {                                          \
    ASSERT (2*(size) <= (alloc));               \
    mpn_sqr (dst, src, size);                   \
    (size) *= 2;                                \
    (size) -= ((dst)[(size)-1] == 0);           \
  } while (0)
     89          1.1  mrg 
/* dst = {src,size} * {src2,size2}, updating size to the product size.
   mpn_mul returns the most significant limb of the product; when that
   limb is zero the product is one limb shorter than size+size2.  */
#define MPN_MUL(dst, alloc, src, size, src2, size2)     \
  do {                                                  \
    mp_limb_t  cy;                                      \
    ASSERT ((size) + (size2) <= (alloc));               \
    cy = mpn_mul (dst, src, size, src2, size2);         \
    (size) += (size2) - (cy == 0);                      \
  } while (0)
     97          1.1  mrg 
/* {ptr,size} *= {mult,2}, in place, updating size.  mpn_mul_2 stores the
   low size+1 limbs at ptr and returns a further carry limb, stored at
   ptr[size+1]; size grows by 1 or 2 accordingly.  Note (size)++ must
   happen before the (ptr)[(size)] store - statement order matters.  */
#define MPN_MUL_2(ptr, size, alloc, mult)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+2 <= (alloc));               \
    cy = mpn_mul_2 (ptr, ptr, size, mult);      \
    (size)++;                                   \
    (ptr)[(size)] = cy;                         \
    (size) += (cy != 0);                        \
  } while (0)
    107          1.1  mrg 
/* {ptr,size} *= limb, in place, updating size.  The carry limb returned
   by mpn_mul_1 is appended and counted only when non-zero.  */
#define MPN_MUL_1(ptr, size, alloc, limb)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_mul_1 (ptr, ptr, size, limb);      \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)
    116          1.1  mrg 
/* {ptr,size} <<= shift, in place, updating size.  The bits shifted out
   the top are appended as a new high limb when non-zero.  */
#define MPN_LSHIFT(ptr, size, alloc, shift)     \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_lshift (ptr, ptr, size, shift);    \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)
    125          1.1  mrg 
/* dst = {src,size} >> shift, or a plain copy when shift==0 (mpn_rshift
   requires 1 <= shift).  After a real shift the high limb may become
   zero, in which case size is reduced to keep {dst,size} normalized.  */
#define MPN_RSHIFT_OR_COPY(dst, src, size, shift)       \
  do {                                                  \
    if ((shift) == 0)                                   \
      MPN_COPY (dst, src, size);                        \
    else                                                \
      {                                                 \
        mpn_rshift (dst, src, size, shift);             \
        (size) -= ((dst)[(size)-1] == 0);               \
      }                                                 \
  } while (0)
    136          1.1  mrg 
    137          1.1  mrg 
/* ralloc and talloc are only wanted for ASSERTs, after the initial space
   allocations.  Avoid writing values to them in a normal build, to ensure
   the compiler lets them go dead.  gcc already figures this out itself
   actually.  */

/* Exchange the rp and tp pointers (and, in ASSERT builds only, their
   alloc sizes) so the result of the last mpn call becomes "rp".  */
#define SWAP_RP_TP                                      \
  do {                                                  \
    MP_PTR_SWAP (rp, tp);                               \
    ASSERT_CODE (MP_SIZE_T_SWAP (ralloc, talloc));      \
  } while (0)
    148          1.1  mrg 
    149          1.1  mrg 
/* Set r to {bp,bsize} raised to the power e.  A negative bsize means a
   negative base; the result is negative exactly when the base is negative
   and e is odd.  Factors of 2 in the base are stripped up front and
   reapplied to the result at the end as whole zero limbs (rtwos_limbs)
   plus a sub-limb left shift (rtwos_bits), so the powering loops work on
   an odd base.  */
void
mpz_n_pow_ui (mpz_ptr r, mp_srcptr bp, mp_size_t bsize, unsigned long int e)
{
  mp_ptr         rp;
  mp_size_t      rtwos_limbs, ralloc, rsize;
  int            rneg, i, cnt, btwos, r_bp_overlap;
  mp_limb_t      blimb, rl;
  mp_bitcnt_t    rtwos_bits;
#if HAVE_NATIVE_mpn_mul_2
  mp_limb_t      blimb_low, rl_high;
#else
  mp_limb_t      b_twolimbs[2];
#endif
  TMP_DECL;

  TRACE (printf ("mpz_n_pow_ui rp=0x%lX bp=0x%lX bsize=%ld e=%lu (0x%lX)\n",
		 PTR(r), bp, bsize, e, e);
	 mpn_trace ("b", bp, bsize));

  ASSERT (bsize == 0 || bp[ABS(bsize)-1] != 0);
  ASSERT (MPN_SAME_OR_SEPARATE2_P (PTR(r), ALLOC(r), bp, ABS(bsize)));

  /* b^0 == 1, including 0^0 == 1 */
  if (e == 0)
    {
      PTR(r)[0] = 1;
      SIZ(r) = 1;
      return;
    }

  /* 0^e == 0 apart from 0^0 above */
  if (bsize == 0)
    {
      SIZ(r) = 0;
      return;
    }

  /* Sign of the final result. */
  rneg = (bsize < 0 && (e & 1) != 0);
  bsize = ABS (bsize);
  TRACE (printf ("rneg %d\n", rneg));

  /* Remember overlap before bp is advanced past low zero limbs below.  */
  r_bp_overlap = (PTR(r) == bp);

  /* Strip low zero limbs from b.  Each stripped limb contributes e whole
     zero limbs to the result.  */
  rtwos_limbs = 0;
  for (blimb = *bp; blimb == 0; blimb = *++bp)
    {
      rtwos_limbs += e;
      bsize--; ASSERT (bsize >= 1);
    }
  TRACE (printf ("trailing zero rtwos_limbs=%ld\n", rtwos_limbs));

  /* Strip low zero bits from b.  The e*btwos zero bits they contribute to
     the result are split into whole limbs plus a partial-limb shift.  */
  count_trailing_zeros (btwos, blimb);
  blimb >>= btwos;
  rtwos_bits = e * btwos;
  rtwos_limbs += rtwos_bits / GMP_NUMB_BITS;
  rtwos_bits %= GMP_NUMB_BITS;
  TRACE (printf ("trailing zero btwos=%d rtwos_limbs=%ld rtwos_bits=%lu\n",
		 btwos, rtwos_limbs, rtwos_bits));

  TMP_MARK;

  /* rl (and rl_high when a native mul_2 exists) accumulates the small odd
     powers picked up while powering within a limb; it's applied with a
     final mul_1 (or mul_2) after the main loop.  */
  rl = 1;
#if HAVE_NATIVE_mpn_mul_2
  rl_high = 0;
#endif

  if (bsize == 1)
    {
    bsize_1:
      /* Power up as far as possible within blimb.  We start here with e!=0,
	 but if e is small then we might reach e==0 and the whole b^e in rl.
	 Notice this code works when blimb==1 too, reaching e==0.  */

      while (blimb <= GMP_NUMB_HALFMAX)
	{
	  TRACE (printf ("small e=0x%lX blimb=0x%lX rl=0x%lX\n",
			 e, blimb, rl));
	  ASSERT (e != 0);
	  if ((e & 1) != 0)
	    rl *= blimb;
	  e >>= 1;
	  if (e == 0)
	    goto got_rl;
	  blimb *= blimb;
	}

#if HAVE_NATIVE_mpn_mul_2
      TRACE (printf ("single power, e=0x%lX b=0x%lX rl=0x%lX\n",
		     e, blimb, rl));

      /* Can power b once more into blimb:blimb_low */
      bsize = 2;
      ASSERT (e != 0);
      if ((e & 1) != 0)
	{
	  /* The GMP_NAIL_BITS shifts keep umul_ppmm working on full limbs
	     in a nails build; with no nails they are no-ops.  */
	  umul_ppmm (rl_high, rl, rl, blimb << GMP_NAIL_BITS);
	  rl >>= GMP_NAIL_BITS;
	}
      e >>= 1;
      umul_ppmm (blimb, blimb_low, blimb, blimb << GMP_NAIL_BITS);
      blimb_low >>= GMP_NAIL_BITS;

    got_rl:
      TRACE (printf ("double power e=0x%lX blimb=0x%lX:0x%lX rl=0x%lX:%lX\n",
		     e, blimb, blimb_low, rl_high, rl));

      /* Combine left-over rtwos_bits into rl_high:rl to be handled by the
	 final mul_1 or mul_2 rather than a separate lshift.
	 - rl_high:rl mustn't be 1 (since then there's no final mul)
	 - rl_high mustn't overflow
	 - rl_high mustn't change to non-zero, since mul_1+lshift is
	 probably faster than mul_2 (FIXME: is this true?)  */

      if (rtwos_bits != 0
	  && ! (rl_high == 0 && rl == 1)
	  && (rl_high >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
	{
	  mp_limb_t  new_rl_high = (rl_high << rtwos_bits)
	    | (rl >> (GMP_NUMB_BITS-rtwos_bits));
	  if (! (rl_high == 0 && new_rl_high != 0))
	    {
	      rl_high = new_rl_high;
	      rl <<= rtwos_bits;
	      rtwos_bits = 0;
	      TRACE (printf ("merged rtwos_bits, rl=0x%lX:%lX\n",
			     rl_high, rl));
	    }
	}
#else
    got_rl:
      TRACE (printf ("small power e=0x%lX blimb=0x%lX rl=0x%lX\n",
		     e, blimb, rl));

      /* Combine left-over rtwos_bits into rl to be handled by the final
	 mul_1 rather than a separate lshift.
	 - rl mustn't be 1 (since then there's no final mul)
	 - rl mustn't overflow	*/

      if (rtwos_bits != 0
	  && rl != 1
	  && (rl >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
	{
	  rl <<= rtwos_bits;
	  rtwos_bits = 0;
	  TRACE (printf ("merged rtwos_bits, rl=0x%lX\n", rl));
	}
#endif
    }
  else if (bsize == 2)
    {
      /* Shift the two-limb base right by btwos, folding the bits coming
	 down from the second limb into blimb.  */
      mp_limb_t  bsecond = bp[1];
      if (btwos != 0)
	blimb |= (bsecond << (GMP_NUMB_BITS - btwos)) & GMP_NUMB_MASK;
      bsecond >>= btwos;
      if (bsecond == 0)
	{
	  /* Two limbs became one after rshift. */
	  bsize = 1;
	  goto bsize_1;
	}

      TRACE (printf ("bsize==2 using b=0x%lX:%lX", bsecond, blimb));
#if HAVE_NATIVE_mpn_mul_2
      blimb_low = blimb;
#else
      /* No native mul_2: materialize the shifted base in a small stack
	 buffer and let the general mpn_mul loop below handle it.  */
      bp = b_twolimbs;
      b_twolimbs[0] = blimb;
      b_twolimbs[1] = bsecond;
#endif
      blimb = bsecond;
    }
  else
    {
      /* bsize > 2: make a temporary shifted (or plain) copy of b when it
	 overlaps r or needs its low zero bits removed.  */
      if (r_bp_overlap || btwos != 0)
	{
	  mp_ptr tp = TMP_ALLOC_LIMBS (bsize);
	  MPN_RSHIFT_OR_COPY (tp, bp, bsize, btwos);
	  bp = tp;
	  TRACE (printf ("rshift or copy bp,bsize, new bsize=%ld\n", bsize));
	}
#if HAVE_NATIVE_mpn_mul_2
      /* in case 3 limbs rshift to 2 and hence use the mul_2 loop below */
      blimb_low = bp[0];
#endif
      blimb = bp[bsize-1];

      TRACE (printf ("big bsize=%ld  ", bsize);
	     mpn_trace ("b", bp, bsize));
    }

  /* At this point blimb is the most significant limb of the base to use.

     Each factor of b takes (bsize*BPML-cnt) bits and there's e of them; +1
     limb to round up the division; +1 for multiplies all using an extra
     limb over the true size; +2 for rl at the end; +1 for lshift at the
     end.

     The size calculation here is reasonably accurate.  The base is at least
     half a limb, so in 32 bits the worst case is 2^16+1 treated as 17 bits
     when it will power up as just over 16, an overestimate of 17/16 =
     6.25%.  For a 64-bit limb it's half that.

     If e==0 then blimb won't be anything useful (though it will be
     non-zero), but that doesn't matter since we just end up with ralloc==5,
     and that's fine for 2 limbs of rl and 1 of lshift.  */

  ASSERT (blimb != 0);
  count_leading_zeros (cnt, blimb);
  ralloc = (bsize*GMP_NUMB_BITS - cnt + GMP_NAIL_BITS) * e / GMP_NUMB_BITS + 5;
  TRACE (printf ("ralloc %ld, from bsize=%ld blimb=0x%lX cnt=%d\n",
		 ralloc, bsize, blimb, cnt));
  rp = MPZ_REALLOC (r, ralloc + rtwos_limbs);

  /* Low zero limbs resulting from powers of 2. */
  MPN_ZERO (rp, rtwos_limbs);
  rp += rtwos_limbs;

  if (e == 0)
    {
      /* Any e==0 other than via bsize==1 or bsize==2 is covered at the
	 start.  Here the whole b^e fitted in rl (and rl_high).  */
      rp[0] = rl;
      rsize = 1;
#if HAVE_NATIVE_mpn_mul_2
      rp[1] = rl_high;
      rsize += (rl_high != 0);
#endif
      ASSERT (rp[rsize-1] != 0);
    }
  else
    {
      mp_ptr     tp;
      mp_size_t  talloc;

      /* In the mpn_mul_1 or mpn_mul_2 loops or in the mpn_mul loop when the
	 low bit of e is zero, tp only has to hold the second last power
	 step, which is half the size of the final result.  There's no need
	 to round up the divide by 2, since ralloc includes a +2 for rl
	 which not needed by tp.  In the mpn_mul loop when the low bit of e
	 is 1, tp must hold nearly the full result, so just size it the same
	 as rp.  */

      talloc = ralloc;
#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2 || (e & 1) == 0)
	talloc /= 2;
#else
      if (bsize <= 1 || (e & 1) == 0)
	talloc /= 2;
#endif
      TRACE (printf ("talloc %ld\n", talloc));
      tp = TMP_ALLOC_LIMBS (talloc);

      /* Go from high to low over the bits of e, starting with i pointing at
	 the bit below the highest 1 (which will mean i==-1 if e==1).  */
      count_leading_zeros (cnt, (mp_limb_t) e);
      i = GMP_LIMB_BITS - cnt - 2;

#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2)
	{
	  mp_limb_t  mult[2];

	  /* Any bsize==1 will have been powered above to be two limbs. */
	  ASSERT (bsize == 2);
	  ASSERT (blimb != 0);

	  /* Arrange the final result ends up in r, not in the temp space.
	     Each iteration below does exactly one SWAP_RP_TP, so i+1
	     iterations swap i+1 times; start swapped when that's odd.  */
	  if ((i & 1) == 0)
	    SWAP_RP_TP;

	  rp[0] = blimb_low;
	  rp[1] = blimb;
	  rsize = 2;

	  mult[0] = blimb_low;
	  mult[1] = blimb;

	  for ( ; i >= 0; i--)
	    {
	      TRACE (printf ("mul_2 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
			     i, e, rsize, ralloc, talloc);
		     mpn_trace ("r", rp, rsize));

	      MPN_SQR (tp, talloc, rp, rsize);
	      SWAP_RP_TP;
	      if ((e & (1L << i)) != 0)
		MPN_MUL_2 (rp, rsize, ralloc, mult);
	    }

	  /* Apply the accumulated small factor rl_high:rl.  */
	  TRACE (mpn_trace ("mul_2 before rl, r", rp, rsize));
	  if (rl_high != 0)
	    {
	      mult[0] = rl;
	      mult[1] = rl_high;
	      MPN_MUL_2 (rp, rsize, ralloc, mult);
	    }
	  else if (rl != 1)
	    MPN_MUL_1 (rp, rsize, ralloc, rl);
	}
#else
      if (bsize == 1)
	{
	  /* Arrange the final result ends up in r, not in the temp space.
	     One SWAP_RP_TP per iteration, i+1 iterations.  */
	  if ((i & 1) == 0)
	    SWAP_RP_TP;

	  rp[0] = blimb;
	  rsize = 1;

	  for ( ; i >= 0; i--)
	    {
	      TRACE (printf ("mul_1 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
			     i, e, rsize, ralloc, talloc);
		     mpn_trace ("r", rp, rsize));

	      MPN_SQR (tp, talloc, rp, rsize);
	      SWAP_RP_TP;
	      if ((e & (1L << i)) != 0)
		MPN_MUL_1 (rp, rsize, ralloc, blimb);
	    }

	  /* Apply the accumulated small factor rl.  */
	  TRACE (mpn_trace ("mul_1 before rl, r", rp, rsize));
	  if (rl != 1)
	    MPN_MUL_1 (rp, rsize, ralloc, rl);
	}
#endif
      else
	{
	  int  parity;

	  /* Arrange the final result ends up in r, not in the temp space.
	     Here each square swaps once and each multiply by b swaps
	     again, so the total swap count is i+1 plus the number of 1
	     bits of e below the top; hence the parity test.  */
	  ULONG_PARITY (parity, e);
	  if (((parity ^ i) & 1) != 0)
	    SWAP_RP_TP;

	  MPN_COPY (rp, bp, bsize);
	  rsize = bsize;

	  for ( ; i >= 0; i--)
	    {
	      TRACE (printf ("mul loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
			     i, e, rsize, ralloc, talloc);
		     mpn_trace ("r", rp, rsize));

	      MPN_SQR (tp, talloc, rp, rsize);
	      SWAP_RP_TP;
	      if ((e & (1L << i)) != 0)
		{
		  MPN_MUL (tp, talloc, rp, rsize, bp, bsize);
		  SWAP_RP_TP;
		}
	    }
	}
    }

  ASSERT (rp == PTR(r) + rtwos_limbs);
  TRACE (mpn_trace ("end loop r", rp, rsize));
  TMP_FREE;

  /* Apply any partial limb factors of 2. */
  if (rtwos_bits != 0)
    {
      MPN_LSHIFT (rp, rsize, ralloc, (unsigned) rtwos_bits);
      TRACE (mpn_trace ("lshift r", rp, rsize));
    }

  rsize += rtwos_limbs;
  SIZ(r) = (rneg ? -rsize : rsize);
}
    523