Home | History | Annotate | Line # | Download | only in mpz
n_pow_ui.c revision 1.1
      1  1.1  mrg /* mpz_n_pow_ui -- mpn raised to ulong.
      2  1.1  mrg 
      3  1.1  mrg    THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
      4  1.1  mrg    CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
      5  1.1  mrg    FUTURE GNU MP RELEASES.
      6  1.1  mrg 
      7  1.1  mrg Copyright 2001, 2002, 2005 Free Software Foundation, Inc.
      8  1.1  mrg 
      9  1.1  mrg This file is part of the GNU MP Library.
     10  1.1  mrg 
     11  1.1  mrg The GNU MP Library is free software; you can redistribute it and/or modify
     12  1.1  mrg it under the terms of the GNU Lesser General Public License as published by
     13  1.1  mrg the Free Software Foundation; either version 3 of the License, or (at your
     14  1.1  mrg option) any later version.
     15  1.1  mrg 
     16  1.1  mrg The GNU MP Library is distributed in the hope that it will be useful, but
     17  1.1  mrg WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     18  1.1  mrg or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
     19  1.1  mrg License for more details.
     20  1.1  mrg 
     21  1.1  mrg You should have received a copy of the GNU Lesser General Public License
     22  1.1  mrg along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
     23  1.1  mrg 
     24  1.1  mrg #include "gmp.h"
     25  1.1  mrg #include "gmp-impl.h"
     26  1.1  mrg #include "longlong.h"
     27  1.1  mrg 
     28  1.1  mrg 
     29  1.1  mrg /* Change this to "#define TRACE(x) x" for some traces. */
     30  1.1  mrg #define TRACE(x)
     31  1.1  mrg 
     32  1.1  mrg 
     33  1.1  mrg /* Use this to test the mul_2 code on a CPU without a native version of that
     34  1.1  mrg    routine.  */
     35  1.1  mrg #if 0
     36  1.1  mrg #define mpn_mul_2  refmpn_mul_2
     37  1.1  mrg #define HAVE_NATIVE_mpn_mul_2  1
     38  1.1  mrg #endif
     39  1.1  mrg 
     40  1.1  mrg 
     41  1.1  mrg /* mpz_pow_ui and mpz_ui_pow_ui want to share almost all of this code.
     42  1.1  mrg    ui_pow_ui doesn't need the mpn_mul based powering loop or the tests on
     43  1.1  mrg    bsize==2 or >2, but separating that isn't easy because there's shared
     44  1.1  mrg    code both before and after (the size calculations and the powers of 2
     45  1.1  mrg    handling).
     46  1.1  mrg 
     47  1.1  mrg    Alternatives:
     48  1.1  mrg 
     49  1.1  mrg    It would work to just use the mpn_mul powering loop for 1 and 2 limb
     50  1.1  mrg    bases, but the current separate loop allows mul_1 and mul_2 to be done
     51  1.1  mrg    in-place, which might help cache locality a bit.  If mpn_mul was relaxed
     52  1.1  mrg    to allow source==dest when vn==1 or 2 then some pointer twiddling might
     53  1.1  mrg    let us get the same effect in one loop.
     54  1.1  mrg 
     55  1.1  mrg    The initial powering for bsize==1 into blimb or blimb:blimb_low doesn't
     56  1.1  mrg    form the biggest possible power of b that fits, only the biggest power of
     57  1.1  mrg    2 power, ie. b^(2^n).  It'd be possible to choose a bigger power, perhaps
     58  1.1  mrg    using mp_bases[b].big_base for small b, and thereby get better value
     59  1.1  mrg    from mpn_mul_1 or mpn_mul_2 in the bignum powering.  It's felt that doing
     60  1.1  mrg    so would be more complicated than it's worth, and could well end up being
     61  1.1  mrg    a slowdown for small e.  For big e on the other hand the algorithm is
     62  1.1  mrg    dominated by mpn_sqr so there wouldn't much of a saving.  The current
     63  1.1  mrg    code can be viewed as simply doing the first few steps of the powering in
     64  1.1  mrg    a single or double limb where possible.
     65  1.1  mrg 
     66  1.1  mrg    If r==b, and blow_twos==0, and r must be realloc'ed, then the temporary
     67  1.1  mrg    copy made of b is unnecessary.  We could just use the old alloc'ed block
     68  1.1  mrg    and free it at the end.  But arranging this seems like a lot more trouble
     69  1.1  mrg    than it's worth.  */
     70  1.1  mrg 
     71  1.1  mrg 
     72  1.1  mrg /* floor(sqrt(GMP_NUMB_MAX)), ie. the biggest value that can be squared in
     73  1.1  mrg    a limb without overflowing.
     74  1.1  mrg    FIXME: This formula is an underestimate when GMP_NUMB_BITS is odd. */
     75  1.1  mrg 
     76  1.1  mrg #define GMP_NUMB_HALFMAX  (((mp_limb_t) 1 << GMP_NUMB_BITS/2) - 1)
     77  1.1  mrg 
     78  1.1  mrg 
     79  1.1  mrg /* The following are for convenience, they update the size and check the
     80  1.1  mrg    alloc.  */
     81  1.1  mrg 
     82  1.1  mrg #define MPN_SQR(dst, alloc, src, size)          \
     83  1.1  mrg   do {                                          \
     84  1.1  mrg     ASSERT (2*(size) <= (alloc));               \
     85  1.1  mrg     mpn_sqr (dst, src, size);                   \
     86  1.1  mrg     (size) *= 2;                                \
     87  1.1  mrg     (size) -= ((dst)[(size)-1] == 0);           \
     88  1.1  mrg   } while (0)
     89  1.1  mrg 
     90  1.1  mrg #define MPN_MUL(dst, alloc, src, size, src2, size2)     \
     91  1.1  mrg   do {                                                  \
     92  1.1  mrg     mp_limb_t  cy;                                      \
     93  1.1  mrg     ASSERT ((size) + (size2) <= (alloc));               \
     94  1.1  mrg     cy = mpn_mul (dst, src, size, src2, size2);         \
     95  1.1  mrg     (size) += (size2) - (cy == 0);                      \
     96  1.1  mrg   } while (0)
     97  1.1  mrg 
     98  1.1  mrg #define MPN_MUL_2(ptr, size, alloc, mult)       \
     99  1.1  mrg   do {                                          \
    100  1.1  mrg     mp_limb_t  cy;                              \
    101  1.1  mrg     ASSERT ((size)+2 <= (alloc));               \
    102  1.1  mrg     cy = mpn_mul_2 (ptr, ptr, size, mult);      \
    103  1.1  mrg     (size)++;                                   \
    104  1.1  mrg     (ptr)[(size)] = cy;                         \
    105  1.1  mrg     (size) += (cy != 0);                        \
    106  1.1  mrg   } while (0)
    107  1.1  mrg 
    108  1.1  mrg #define MPN_MUL_1(ptr, size, alloc, limb)       \
    109  1.1  mrg   do {                                          \
    110  1.1  mrg     mp_limb_t  cy;                              \
    111  1.1  mrg     ASSERT ((size)+1 <= (alloc));               \
    112  1.1  mrg     cy = mpn_mul_1 (ptr, ptr, size, limb);      \
    113  1.1  mrg     (ptr)[size] = cy;                           \
    114  1.1  mrg     (size) += (cy != 0);                        \
    115  1.1  mrg   } while (0)
    116  1.1  mrg 
    117  1.1  mrg #define MPN_LSHIFT(ptr, size, alloc, shift)     \
    118  1.1  mrg   do {                                          \
    119  1.1  mrg     mp_limb_t  cy;                              \
    120  1.1  mrg     ASSERT ((size)+1 <= (alloc));               \
    121  1.1  mrg     cy = mpn_lshift (ptr, ptr, size, shift);    \
    122  1.1  mrg     (ptr)[size] = cy;                           \
    123  1.1  mrg     (size) += (cy != 0);                        \
    124  1.1  mrg   } while (0)
    125  1.1  mrg 
    126  1.1  mrg #define MPN_RSHIFT_OR_COPY(dst, src, size, shift)       \
    127  1.1  mrg   do {                                                  \
    128  1.1  mrg     if ((shift) == 0)                                   \
    129  1.1  mrg       MPN_COPY (dst, src, size);                        \
    130  1.1  mrg     else                                                \
    131  1.1  mrg       {                                                 \
    132  1.1  mrg         mpn_rshift (dst, src, size, shift);             \
    133  1.1  mrg         (size) -= ((dst)[(size)-1] == 0);               \
    134  1.1  mrg       }                                                 \
    135  1.1  mrg   } while (0)
    136  1.1  mrg 
    137  1.1  mrg 
    138  1.1  mrg /* ralloc and talloc are only wanted for ASSERTs, after the initial space
    139  1.1  mrg    allocations.  Avoid writing values to them in a normal build, to ensure
    140  1.1  mrg    the compiler lets them go dead.  gcc already figures this out itself
    141  1.1  mrg    actually.  */
    142  1.1  mrg 
    143  1.1  mrg #define SWAP_RP_TP                                      \
    144  1.1  mrg   do {                                                  \
    145  1.1  mrg     MP_PTR_SWAP (rp, tp);                               \
    146  1.1  mrg     ASSERT_CODE (MP_SIZE_T_SWAP (ralloc, talloc));      \
    147  1.1  mrg   } while (0)
    148  1.1  mrg 
    149  1.1  mrg 
    150  1.1  mrg void
    151  1.1  mrg mpz_n_pow_ui (mpz_ptr r, mp_srcptr bp, mp_size_t bsize, unsigned long int e)
    152  1.1  mrg {
    153  1.1  mrg   mp_ptr         rp;
    154  1.1  mrg   mp_size_t      rtwos_limbs, ralloc, rsize;
    155  1.1  mrg   int            rneg, i, cnt, btwos, r_bp_overlap;
    156  1.1  mrg   mp_limb_t      blimb, rl;
    157  1.1  mrg   mp_bitcnt_t    rtwos_bits;
    158  1.1  mrg #if HAVE_NATIVE_mpn_mul_2
    159  1.1  mrg   mp_limb_t      blimb_low, rl_high;
    160  1.1  mrg #else
    161  1.1  mrg   mp_limb_t      b_twolimbs[2];
    162  1.1  mrg #endif
    163  1.1  mrg   TMP_DECL;
    164  1.1  mrg 
    165  1.1  mrg   TRACE (printf ("mpz_n_pow_ui rp=0x%lX bp=0x%lX bsize=%ld e=%lu (0x%lX)\n",
    166  1.1  mrg                  PTR(r), bp, bsize, e, e);
    167  1.1  mrg          mpn_trace ("b", bp, bsize));
    168  1.1  mrg 
    169  1.1  mrg   ASSERT (bsize == 0 || bp[ABS(bsize)-1] != 0);
    170  1.1  mrg   ASSERT (MPN_SAME_OR_SEPARATE2_P (PTR(r), ABSIZ(r), bp, bsize));
    171  1.1  mrg 
    172  1.1  mrg   /* b^0 == 1, including 0^0 == 1 */
    173  1.1  mrg   if (e == 0)
    174  1.1  mrg     {
    175  1.1  mrg       PTR(r)[0] = 1;
    176  1.1  mrg       SIZ(r) = 1;
    177  1.1  mrg       return;
    178  1.1  mrg     }
    179  1.1  mrg 
    180  1.1  mrg   /* 0^e == 0 apart from 0^0 above */
    181  1.1  mrg   if (bsize == 0)
    182  1.1  mrg     {
    183  1.1  mrg       SIZ(r) = 0;
    184  1.1  mrg       return;
    185  1.1  mrg     }
    186  1.1  mrg 
    187  1.1  mrg   /* Sign of the final result. */
    188  1.1  mrg   rneg = (bsize < 0 && (e & 1) != 0);
    189  1.1  mrg   bsize = ABS (bsize);
    190  1.1  mrg   TRACE (printf ("rneg %d\n", rneg));
    191  1.1  mrg 
    192  1.1  mrg   r_bp_overlap = (PTR(r) == bp);
    193  1.1  mrg 
    194  1.1  mrg   /* Strip low zero limbs from b. */
    195  1.1  mrg   rtwos_limbs = 0;
    196  1.1  mrg   for (blimb = *bp; blimb == 0; blimb = *++bp)
    197  1.1  mrg     {
    198  1.1  mrg       rtwos_limbs += e;
    199  1.1  mrg       bsize--; ASSERT (bsize >= 1);
    200  1.1  mrg     }
    201  1.1  mrg   TRACE (printf ("trailing zero rtwos_limbs=%ld\n", rtwos_limbs));
    202  1.1  mrg 
    203  1.1  mrg   /* Strip low zero bits from b. */
    204  1.1  mrg   count_trailing_zeros (btwos, blimb);
    205  1.1  mrg   blimb >>= btwos;
    206  1.1  mrg   rtwos_bits = e * btwos;
    207  1.1  mrg   rtwos_limbs += rtwos_bits / GMP_NUMB_BITS;
    208  1.1  mrg   rtwos_bits %= GMP_NUMB_BITS;
    209  1.1  mrg   TRACE (printf ("trailing zero btwos=%d rtwos_limbs=%ld rtwos_bits=%lu\n",
    210  1.1  mrg                  btwos, rtwos_limbs, rtwos_bits));
    211  1.1  mrg 
    212  1.1  mrg   TMP_MARK;
    213  1.1  mrg 
    214  1.1  mrg   rl = 1;
    215  1.1  mrg #if HAVE_NATIVE_mpn_mul_2
    216  1.1  mrg   rl_high = 0;
    217  1.1  mrg #endif
    218  1.1  mrg 
    219  1.1  mrg   if (bsize == 1)
    220  1.1  mrg     {
    221  1.1  mrg     bsize_1:
    222  1.1  mrg       /* Power up as far as possible within blimb.  We start here with e!=0,
    223  1.1  mrg          but if e is small then we might reach e==0 and the whole b^e in rl.
    224  1.1  mrg          Notice this code works when blimb==1 too, reaching e==0.  */
    225  1.1  mrg 
    226  1.1  mrg       while (blimb <= GMP_NUMB_HALFMAX)
    227  1.1  mrg         {
    228  1.1  mrg           TRACE (printf ("small e=0x%lX blimb=0x%lX rl=0x%lX\n",
    229  1.1  mrg                          e, blimb, rl));
    230  1.1  mrg           ASSERT (e != 0);
    231  1.1  mrg           if ((e & 1) != 0)
    232  1.1  mrg             rl *= blimb;
    233  1.1  mrg           e >>= 1;
    234  1.1  mrg           if (e == 0)
    235  1.1  mrg             goto got_rl;
    236  1.1  mrg           blimb *= blimb;
    237  1.1  mrg         }
    238  1.1  mrg 
    239  1.1  mrg #if HAVE_NATIVE_mpn_mul_2
    240  1.1  mrg       TRACE (printf ("single power, e=0x%lX b=0x%lX rl=0x%lX\n",
    241  1.1  mrg                      e, blimb, rl));
    242  1.1  mrg 
    243  1.1  mrg       /* Can power b once more into blimb:blimb_low */
    244  1.1  mrg       bsize = 2;
    245  1.1  mrg       ASSERT (e != 0);
    246  1.1  mrg       if ((e & 1) != 0)
    247  1.1  mrg 	{
    248  1.1  mrg 	  umul_ppmm (rl_high, rl, rl, blimb << GMP_NAIL_BITS);
    249  1.1  mrg 	  rl >>= GMP_NAIL_BITS;
    250  1.1  mrg 	}
    251  1.1  mrg       e >>= 1;
    252  1.1  mrg       umul_ppmm (blimb, blimb_low, blimb, blimb << GMP_NAIL_BITS);
    253  1.1  mrg       blimb_low >>= GMP_NAIL_BITS;
    254  1.1  mrg 
    255  1.1  mrg     got_rl:
    256  1.1  mrg       TRACE (printf ("double power e=0x%lX blimb=0x%lX:0x%lX rl=0x%lX:%lX\n",
    257  1.1  mrg                      e, blimb, blimb_low, rl_high, rl));
    258  1.1  mrg 
    259  1.1  mrg       /* Combine left-over rtwos_bits into rl_high:rl to be handled by the
    260  1.1  mrg          final mul_1 or mul_2 rather than a separate lshift.
    261  1.1  mrg          - rl_high:rl mustn't be 1 (since then there's no final mul)
    262  1.1  mrg          - rl_high mustn't overflow
    263  1.1  mrg          - rl_high mustn't change to non-zero, since mul_1+lshift is
    264  1.1  mrg          probably faster than mul_2 (FIXME: is this true?)  */
    265  1.1  mrg 
    266  1.1  mrg       if (rtwos_bits != 0
    267  1.1  mrg           && ! (rl_high == 0 && rl == 1)
    268  1.1  mrg           && (rl_high >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
    269  1.1  mrg         {
    270  1.1  mrg           mp_limb_t  new_rl_high = (rl_high << rtwos_bits)
    271  1.1  mrg             | (rl >> (GMP_NUMB_BITS-rtwos_bits));
    272  1.1  mrg           if (! (rl_high == 0 && new_rl_high != 0))
    273  1.1  mrg             {
    274  1.1  mrg               rl_high = new_rl_high;
    275  1.1  mrg               rl <<= rtwos_bits;
    276  1.1  mrg               rtwos_bits = 0;
    277  1.1  mrg               TRACE (printf ("merged rtwos_bits, rl=0x%lX:%lX\n",
    278  1.1  mrg                              rl_high, rl));
    279  1.1  mrg             }
    280  1.1  mrg         }
    281  1.1  mrg #else
    282  1.1  mrg     got_rl:
    283  1.1  mrg       TRACE (printf ("small power e=0x%lX blimb=0x%lX rl=0x%lX\n",
    284  1.1  mrg                      e, blimb, rl));
    285  1.1  mrg 
    286  1.1  mrg       /* Combine left-over rtwos_bits into rl to be handled by the final
    287  1.1  mrg          mul_1 rather than a separate lshift.
    288  1.1  mrg          - rl mustn't be 1 (since then there's no final mul)
    289  1.1  mrg          - rl mustn't overflow  */
    290  1.1  mrg 
    291  1.1  mrg       if (rtwos_bits != 0
    292  1.1  mrg           && rl != 1
    293  1.1  mrg           && (rl >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
    294  1.1  mrg         {
    295  1.1  mrg           rl <<= rtwos_bits;
    296  1.1  mrg           rtwos_bits = 0;
    297  1.1  mrg           TRACE (printf ("merged rtwos_bits, rl=0x%lX\n", rl));
    298  1.1  mrg         }
    299  1.1  mrg #endif
    300  1.1  mrg     }
    301  1.1  mrg   else if (bsize == 2)
    302  1.1  mrg     {
    303  1.1  mrg       mp_limb_t  bsecond = bp[1];
    304  1.1  mrg       if (btwos != 0)
    305  1.1  mrg         blimb |= (bsecond << (GMP_NUMB_BITS - btwos)) & GMP_NUMB_MASK;
    306  1.1  mrg       bsecond >>= btwos;
    307  1.1  mrg       if (bsecond == 0)
    308  1.1  mrg         {
    309  1.1  mrg           /* Two limbs became one after rshift. */
    310  1.1  mrg           bsize = 1;
    311  1.1  mrg           goto bsize_1;
    312  1.1  mrg         }
    313  1.1  mrg 
    314  1.1  mrg       TRACE (printf ("bsize==2 using b=0x%lX:%lX", bsecond, blimb));
    315  1.1  mrg #if HAVE_NATIVE_mpn_mul_2
    316  1.1  mrg       blimb_low = blimb;
    317  1.1  mrg #else
    318  1.1  mrg       bp = b_twolimbs;
    319  1.1  mrg       b_twolimbs[0] = blimb;
    320  1.1  mrg       b_twolimbs[1] = bsecond;
    321  1.1  mrg #endif
    322  1.1  mrg       blimb = bsecond;
    323  1.1  mrg     }
    324  1.1  mrg   else
    325  1.1  mrg     {
    326  1.1  mrg       if (r_bp_overlap || btwos != 0)
    327  1.1  mrg         {
    328  1.1  mrg           mp_ptr tp = TMP_ALLOC_LIMBS (bsize);
    329  1.1  mrg           MPN_RSHIFT_OR_COPY (tp, bp, bsize, btwos);
    330  1.1  mrg           bp = tp;
    331  1.1  mrg           TRACE (printf ("rshift or copy bp,bsize, new bsize=%ld\n", bsize));
    332  1.1  mrg         }
    333  1.1  mrg #if HAVE_NATIVE_mpn_mul_2
    334  1.1  mrg       /* in case 3 limbs rshift to 2 and hence use the mul_2 loop below */
    335  1.1  mrg       blimb_low = bp[0];
    336  1.1  mrg #endif
    337  1.1  mrg       blimb = bp[bsize-1];
    338  1.1  mrg 
    339  1.1  mrg       TRACE (printf ("big bsize=%ld  ", bsize);
    340  1.1  mrg              mpn_trace ("b", bp, bsize));
    341  1.1  mrg     }
    342  1.1  mrg 
    343  1.1  mrg   /* At this point blimb is the most significant limb of the base to use.
    344  1.1  mrg 
    345  1.1  mrg      Each factor of b takes (bsize*BPML-cnt) bits and there's e of them; +1
    346  1.1  mrg      limb to round up the division; +1 for multiplies all using an extra
    347  1.1  mrg      limb over the true size; +2 for rl at the end; +1 for lshift at the
    348  1.1  mrg      end.
    349  1.1  mrg 
    350  1.1  mrg      The size calculation here is reasonably accurate.  The base is at least
    351  1.1  mrg      half a limb, so in 32 bits the worst case is 2^16+1 treated as 17 bits
    352  1.1  mrg      when it will power up as just over 16, an overestimate of 17/16 =
    353  1.1  mrg      6.25%.  For a 64-bit limb it's half that.
    354  1.1  mrg 
    355  1.1  mrg      If e==0 then blimb won't be anything useful (though it will be
    356  1.1  mrg      non-zero), but that doesn't matter since we just end up with ralloc==5,
    357  1.1  mrg      and that's fine for 2 limbs of rl and 1 of lshift.  */
    358  1.1  mrg 
    359  1.1  mrg   ASSERT (blimb != 0);
    360  1.1  mrg   count_leading_zeros (cnt, blimb);
    361  1.1  mrg   ralloc = (bsize*GMP_NUMB_BITS - cnt + GMP_NAIL_BITS) * e / GMP_NUMB_BITS + 5;
    362  1.1  mrg   TRACE (printf ("ralloc %ld, from bsize=%ld blimb=0x%lX cnt=%d\n",
    363  1.1  mrg                  ralloc, bsize, blimb, cnt));
    364  1.1  mrg   MPZ_REALLOC (r, ralloc + rtwos_limbs);
    365  1.1  mrg   rp = PTR(r);
    366  1.1  mrg 
    367  1.1  mrg   /* Low zero limbs resulting from powers of 2. */
    368  1.1  mrg   MPN_ZERO (rp, rtwos_limbs);
    369  1.1  mrg   rp += rtwos_limbs;
    370  1.1  mrg 
    371  1.1  mrg   if (e == 0)
    372  1.1  mrg     {
    373  1.1  mrg       /* Any e==0 other than via bsize==1 or bsize==2 is covered at the
    374  1.1  mrg          start. */
    375  1.1  mrg       rp[0] = rl;
    376  1.1  mrg       rsize = 1;
    377  1.1  mrg #if HAVE_NATIVE_mpn_mul_2
    378  1.1  mrg       rp[1] = rl_high;
    379  1.1  mrg       rsize += (rl_high != 0);
    380  1.1  mrg #endif
    381  1.1  mrg       ASSERT (rp[rsize-1] != 0);
    382  1.1  mrg     }
    383  1.1  mrg   else
    384  1.1  mrg     {
    385  1.1  mrg       mp_ptr     tp;
    386  1.1  mrg       mp_size_t  talloc;
    387  1.1  mrg 
    388  1.1  mrg       /* In the mpn_mul_1 or mpn_mul_2 loops or in the mpn_mul loop when the
    389  1.1  mrg          low bit of e is zero, tp only has to hold the second last power
    390  1.1  mrg          step, which is half the size of the final result.  There's no need
    391  1.1  mrg          to round up the divide by 2, since ralloc includes a +2 for rl
    392  1.1  mrg          which not needed by tp.  In the mpn_mul loop when the low bit of e
    393  1.1  mrg          is 1, tp must hold nearly the full result, so just size it the same
    394  1.1  mrg          as rp.  */
    395  1.1  mrg 
    396  1.1  mrg       talloc = ralloc;
    397  1.1  mrg #if HAVE_NATIVE_mpn_mul_2
    398  1.1  mrg       if (bsize <= 2 || (e & 1) == 0)
    399  1.1  mrg         talloc /= 2;
    400  1.1  mrg #else
    401  1.1  mrg       if (bsize <= 1 || (e & 1) == 0)
    402  1.1  mrg         talloc /= 2;
    403  1.1  mrg #endif
    404  1.1  mrg       TRACE (printf ("talloc %ld\n", talloc));
    405  1.1  mrg       tp = TMP_ALLOC_LIMBS (talloc);
    406  1.1  mrg 
    407  1.1  mrg       /* Go from high to low over the bits of e, starting with i pointing at
    408  1.1  mrg          the bit below the highest 1 (which will mean i==-1 if e==1).  */
    409  1.1  mrg       count_leading_zeros (cnt, e);
    410  1.1  mrg       i = GMP_LIMB_BITS - cnt - 2;
    411  1.1  mrg 
    412  1.1  mrg #if HAVE_NATIVE_mpn_mul_2
    413  1.1  mrg       if (bsize <= 2)
    414  1.1  mrg         {
    415  1.1  mrg           mp_limb_t  mult[2];
    416  1.1  mrg 
    417  1.1  mrg           /* Any bsize==1 will have been powered above to be two limbs. */
    418  1.1  mrg           ASSERT (bsize == 2);
    419  1.1  mrg           ASSERT (blimb != 0);
    420  1.1  mrg 
    421  1.1  mrg           /* Arrange the final result ends up in r, not in the temp space */
    422  1.1  mrg           if ((i & 1) == 0)
    423  1.1  mrg             SWAP_RP_TP;
    424  1.1  mrg 
    425  1.1  mrg           rp[0] = blimb_low;
    426  1.1  mrg           rp[1] = blimb;
    427  1.1  mrg           rsize = 2;
    428  1.1  mrg 
    429  1.1  mrg           mult[0] = blimb_low;
    430  1.1  mrg           mult[1] = blimb;
    431  1.1  mrg 
    432  1.1  mrg           for ( ; i >= 0; i--)
    433  1.1  mrg             {
    434  1.1  mrg               TRACE (printf ("mul_2 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
    435  1.1  mrg                              i, e, rsize, ralloc, talloc);
    436  1.1  mrg                      mpn_trace ("r", rp, rsize));
    437  1.1  mrg 
    438  1.1  mrg               MPN_SQR (tp, talloc, rp, rsize);
    439  1.1  mrg               SWAP_RP_TP;
    440  1.1  mrg               if ((e & (1L << i)) != 0)
    441  1.1  mrg                 MPN_MUL_2 (rp, rsize, ralloc, mult);
    442  1.1  mrg             }
    443  1.1  mrg 
    444  1.1  mrg           TRACE (mpn_trace ("mul_2 before rl, r", rp, rsize));
    445  1.1  mrg           if (rl_high != 0)
    446  1.1  mrg             {
    447  1.1  mrg               mult[0] = rl;
    448  1.1  mrg               mult[1] = rl_high;
    449  1.1  mrg               MPN_MUL_2 (rp, rsize, ralloc, mult);
    450  1.1  mrg             }
    451  1.1  mrg           else if (rl != 1)
    452  1.1  mrg             MPN_MUL_1 (rp, rsize, ralloc, rl);
    453  1.1  mrg         }
    454  1.1  mrg #else
    455  1.1  mrg       if (bsize == 1)
    456  1.1  mrg         {
    457  1.1  mrg           /* Arrange the final result ends up in r, not in the temp space */
    458  1.1  mrg           if ((i & 1) == 0)
    459  1.1  mrg             SWAP_RP_TP;
    460  1.1  mrg 
    461  1.1  mrg           rp[0] = blimb;
    462  1.1  mrg           rsize = 1;
    463  1.1  mrg 
    464  1.1  mrg           for ( ; i >= 0; i--)
    465  1.1  mrg             {
    466  1.1  mrg               TRACE (printf ("mul_1 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
    467  1.1  mrg                              i, e, rsize, ralloc, talloc);
    468  1.1  mrg                      mpn_trace ("r", rp, rsize));
    469  1.1  mrg 
    470  1.1  mrg               MPN_SQR (tp, talloc, rp, rsize);
    471  1.1  mrg               SWAP_RP_TP;
    472  1.1  mrg               if ((e & (1L << i)) != 0)
    473  1.1  mrg                 MPN_MUL_1 (rp, rsize, ralloc, blimb);
    474  1.1  mrg             }
    475  1.1  mrg 
    476  1.1  mrg           TRACE (mpn_trace ("mul_1 before rl, r", rp, rsize));
    477  1.1  mrg           if (rl != 1)
    478  1.1  mrg             MPN_MUL_1 (rp, rsize, ralloc, rl);
    479  1.1  mrg         }
    480  1.1  mrg #endif
    481  1.1  mrg       else
    482  1.1  mrg         {
    483  1.1  mrg           int  parity;
    484  1.1  mrg 
    485  1.1  mrg           /* Arrange the final result ends up in r, not in the temp space */
    486  1.1  mrg           ULONG_PARITY (parity, e);
    487  1.1  mrg           if (((parity ^ i) & 1) != 0)
    488  1.1  mrg             SWAP_RP_TP;
    489  1.1  mrg 
    490  1.1  mrg           MPN_COPY (rp, bp, bsize);
    491  1.1  mrg           rsize = bsize;
    492  1.1  mrg 
    493  1.1  mrg           for ( ; i >= 0; i--)
    494  1.1  mrg             {
    495  1.1  mrg               TRACE (printf ("mul loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
    496  1.1  mrg                              i, e, rsize, ralloc, talloc);
    497  1.1  mrg                      mpn_trace ("r", rp, rsize));
    498  1.1  mrg 
    499  1.1  mrg               MPN_SQR (tp, talloc, rp, rsize);
    500  1.1  mrg               SWAP_RP_TP;
    501  1.1  mrg               if ((e & (1L << i)) != 0)
    502  1.1  mrg                 {
    503  1.1  mrg                   MPN_MUL (tp, talloc, rp, rsize, bp, bsize);
    504  1.1  mrg                   SWAP_RP_TP;
    505  1.1  mrg                 }
    506  1.1  mrg             }
    507  1.1  mrg         }
    508  1.1  mrg     }
    509  1.1  mrg 
    510  1.1  mrg   ASSERT (rp == PTR(r) + rtwos_limbs);
    511  1.1  mrg   TRACE (mpn_trace ("end loop r", rp, rsize));
    512  1.1  mrg   TMP_FREE;
    513  1.1  mrg 
    514  1.1  mrg   /* Apply any partial limb factors of 2. */
    515  1.1  mrg   if (rtwos_bits != 0)
    516  1.1  mrg     {
    517  1.1  mrg       MPN_LSHIFT (rp, rsize, ralloc, (unsigned) rtwos_bits);
    518  1.1  mrg       TRACE (mpn_trace ("lshift r", rp, rsize));
    519  1.1  mrg     }
    520  1.1  mrg 
    521  1.1  mrg   rsize += rtwos_limbs;
    522  1.1  mrg   SIZ(r) = (rneg ? -rsize : rsize);
    523  1.1  mrg }
    524