1 1.1 mrg /* mpz_n_pow_ui -- mpn raised to ulong.
2 1.1 mrg
3 1.1 mrg THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST
4 1.1 mrg CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
5 1.1 mrg FUTURE GNU MP RELEASES.
6 1.1 mrg
7 1.1.1.1.8.1 tls Copyright 2001, 2002, 2005, 2012 Free Software Foundation, Inc.
8 1.1 mrg
9 1.1 mrg This file is part of the GNU MP Library.
10 1.1 mrg
11 1.1 mrg The GNU MP Library is free software; you can redistribute it and/or modify
12 1.1 mrg it under the terms of the GNU Lesser General Public License as published by
13 1.1 mrg the Free Software Foundation; either version 3 of the License, or (at your
14 1.1 mrg option) any later version.
15 1.1 mrg
16 1.1 mrg The GNU MP Library is distributed in the hope that it will be useful, but
17 1.1 mrg WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
18 1.1 mrg or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
19 1.1 mrg License for more details.
20 1.1 mrg
21 1.1 mrg You should have received a copy of the GNU Lesser General Public License
22 1.1 mrg along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
23 1.1 mrg
24 1.1 mrg #include "gmp.h"
25 1.1 mrg #include "gmp-impl.h"
26 1.1 mrg #include "longlong.h"
27 1.1 mrg
28 1.1 mrg
/* Change this to "#define TRACE(x) x" for some traces.
   In a normal build TRACE(x) expands to nothing at all, so its argument
   (printf calls, mpn_trace calls) is discarded entirely.  */
#define TRACE(x)


/* Use this to test the mul_2 code on a CPU without a native version of that
   routine.  Enabling the #if forces the reference implementation and makes
   the HAVE_NATIVE_mpn_mul_2 code paths below compile in.  */
#if 0
#define mpn_mul_2  refmpn_mul_2
#define HAVE_NATIVE_mpn_mul_2  1
#endif
39 1.1 mrg
40 1.1 mrg
41 1.1 mrg /* mpz_pow_ui and mpz_ui_pow_ui want to share almost all of this code.
42 1.1 mrg ui_pow_ui doesn't need the mpn_mul based powering loop or the tests on
43 1.1 mrg bsize==2 or >2, but separating that isn't easy because there's shared
44 1.1 mrg code both before and after (the size calculations and the powers of 2
45 1.1 mrg handling).
46 1.1 mrg
47 1.1 mrg Alternatives:
48 1.1 mrg
49 1.1 mrg It would work to just use the mpn_mul powering loop for 1 and 2 limb
50 1.1 mrg bases, but the current separate loop allows mul_1 and mul_2 to be done
51 1.1 mrg in-place, which might help cache locality a bit. If mpn_mul was relaxed
52 1.1 mrg to allow source==dest when vn==1 or 2 then some pointer twiddling might
53 1.1 mrg let us get the same effect in one loop.
54 1.1 mrg
55 1.1 mrg The initial powering for bsize==1 into blimb or blimb:blimb_low doesn't
56 1.1 mrg    form the biggest possible power of b that fits, only the biggest
57 1.1.1.1.8.1 tls    power-of-2 power, ie. b^(2^n).  It'd be possible to choose a bigger power, perhaps
58 1.1 mrg using mp_bases[b].big_base for small b, and thereby get better value
59 1.1 mrg from mpn_mul_1 or mpn_mul_2 in the bignum powering. It's felt that doing
60 1.1 mrg so would be more complicated than it's worth, and could well end up being
61 1.1 mrg a slowdown for small e. For big e on the other hand the algorithm is
62 1.1 mrg    dominated by mpn_sqr so there wouldn't be much of a saving.  The current
63 1.1 mrg code can be viewed as simply doing the first few steps of the powering in
64 1.1 mrg a single or double limb where possible.
65 1.1 mrg
66 1.1 mrg If r==b, and blow_twos==0, and r must be realloc'ed, then the temporary
67 1.1 mrg copy made of b is unnecessary. We could just use the old alloc'ed block
68 1.1 mrg and free it at the end. But arranging this seems like a lot more trouble
69 1.1 mrg than it's worth. */
70 1.1 mrg
71 1.1 mrg
/* floor(sqrt(GMP_NUMB_MAX)), ie. the biggest value that can be squared in
   a limb without overflowing.
   FIXME: This formula is an underestimate when GMP_NUMB_BITS is odd.
   (An underestimate is safe here: it only makes the small powering loop
   below stop one step early, never overflow.)  */

#define GMP_NUMB_HALFMAX  (((mp_limb_t) 1 << GMP_NUMB_BITS/2) - 1)
77 1.1 mrg
78 1.1 mrg
79 1.1 mrg /* The following are for convenience, they update the size and check the
80 1.1 mrg alloc. */
81 1.1 mrg
/* Square {src,size} into dst, and update size to the size of the result.
   mpn_sqr always writes 2*size limbs; the top limb may be zero, in which
   case it's trimmed from size.  alloc is checked by ASSERT only.  */
#define MPN_SQR(dst, alloc, src, size)          \
  do {                                          \
    ASSERT (2*(size) <= (alloc));               \
    mpn_sqr (dst, src, size);                   \
    (size) *= 2;                                \
    (size) -= ((dst)[(size)-1] == 0);           \
  } while (0)
89 1.1 mrg
/* Multiply {src,size} by {src2,size2} into dst, and update size to the
   result size.  mpn_mul returns the most significant limb written, so
   size+size2 limbs are used when that's non-zero, one fewer otherwise.
   alloc is checked by ASSERT only.  */
#define MPN_MUL(dst, alloc, src, size, src2, size2)     \
  do {                                                  \
    mp_limb_t  cy;                                      \
    ASSERT ((size) + (size2) <= (alloc));               \
    cy = mpn_mul (dst, src, size, src2, size2);         \
    (size) += (size2) - (cy == 0);                      \
  } while (0)
97 1.1 mrg
/* Multiply {ptr,size} in place by the two-limb {mult,2}, updating size.
   The product grows by one limb always, plus one more if the returned
   top limb cy is non-zero.  Note the store of cy lands at the index
   *after* the (size)++ — assumes mpn_mul_2 stores size+1 limbs and
   returns the limb above them (NOTE(review): confirm against the
   mpn_mul_2 definition in gmp-impl.h).  alloc checked by ASSERT only.  */
#define MPN_MUL_2(ptr, size, alloc, mult)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+2 <= (alloc));               \
    cy = mpn_mul_2 (ptr, ptr, size, mult);      \
    (size)++;                                   \
    (ptr)[(size)] = cy;                         \
    (size) += (cy != 0);                        \
  } while (0)
107 1.1 mrg
/* Multiply {ptr,size} in place by the single limb "limb", updating size.
   The carry out of mpn_mul_1 is stored as a new high limb and counted
   in size only when non-zero.  alloc checked by ASSERT only.  */
#define MPN_MUL_1(ptr, size, alloc, limb)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_mul_1 (ptr, ptr, size, limb);      \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)
116 1.1 mrg
/* Left shift {ptr,size} in place by "shift" bits (0 < shift < limb bits,
   per mpn_lshift's requirements), updating size.  Bits shifted out the
   top become a new high limb when non-zero.  alloc checked by ASSERT
   only.  */
#define MPN_LSHIFT(ptr, size, alloc, shift)     \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_lshift (ptr, ptr, size, shift);    \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)
125 1.1 mrg
/* Put {src,size} right-shifted by "shift" bits into dst, updating size
   if the high limb becomes zero.  A zero shift is a plain copy, since
   mpn_rshift requires a non-zero shift count.  */
#define MPN_RSHIFT_OR_COPY(dst, src, size, shift)       \
  do {                                                  \
    if ((shift) != 0)                                   \
      {                                                 \
        mpn_rshift (dst, src, size, shift);             \
        (size) -= ((dst)[(size)-1] == 0);               \
      }                                                 \
    else                                                \
      MPN_COPY (dst, src, size);                        \
  } while (0)
136 1.1 mrg
137 1.1 mrg
138 1.1 mrg /* ralloc and talloc are only wanted for ASSERTs, after the initial space
139 1.1 mrg allocations. Avoid writing values to them in a normal build, to ensure
140 1.1 mrg the compiler lets them go dead. gcc already figures this out itself
141 1.1 mrg actually. */
142 1.1 mrg
/* Exchange the rp and tp pointers; in an assertion-enabled build also
   exchange ralloc and talloc so the MPN_* alloc ASSERTs keep tracking
   the right buffer.  (See the comment above: ralloc/talloc are dead in
   a normal build.)  */
#define SWAP_RP_TP                                      \
  do {                                                  \
    MP_PTR_SWAP (rp, tp);                               \
    ASSERT_CODE (MP_SIZE_T_SWAP (ralloc, talloc));      \
  } while (0)
148 1.1 mrg
149 1.1 mrg
/* Set r to {bp,bsize} raised to e.  A negative bsize means a negative
   base, so the result is negative exactly when bsize<0 and e is odd.
   e==0 gives r=1, including the 0^0==1 case.  Internal-use entry shared
   by mpz_pow_ui and mpz_ui_pow_ui (see the file-top comments).

   Method: factors of 2 in the base are stripped first — whole zero limbs
   accumulate into rtwos_limbs and a sub-limb shift into rtwos_bits — and
   are reinstated at the very end.  The remaining odd base is powered by
   binary square-and-multiply over the bits of e, ping-ponging the result
   between rp and temporary space tp.  Small bases are first powered up
   within a single limb (blimb, with odd-bit factors collected in rl), or
   within a double limb when a native mpn_mul_2 exists.  */
void
mpz_n_pow_ui (mpz_ptr r, mp_srcptr bp, mp_size_t bsize, unsigned long int e)
{
  mp_ptr rp;
  mp_size_t rtwos_limbs, ralloc, rsize;
  int rneg, i, cnt, btwos, r_bp_overlap;
  mp_limb_t blimb, rl;
  mp_bitcnt_t rtwos_bits;
#if HAVE_NATIVE_mpn_mul_2
  mp_limb_t blimb_low, rl_high;
#else
  mp_limb_t b_twolimbs[2];
#endif
  TMP_DECL;

  TRACE (printf ("mpz_n_pow_ui rp=0x%lX bp=0x%lX bsize=%ld e=%lu (0x%lX)\n",
		 PTR(r), bp, bsize, e, e);
	 mpn_trace ("b", bp, bsize));

  ASSERT (bsize == 0 || bp[ABS(bsize)-1] != 0);
  ASSERT (MPN_SAME_OR_SEPARATE2_P (PTR(r), ALLOC(r), bp, ABS(bsize)));

  /* b^0 == 1, including 0^0 == 1 */
  if (e == 0)
    {
      PTR(r)[0] = 1;
      SIZ(r) = 1;
      return;
    }

  /* 0^e == 0 apart from 0^0 above */
  if (bsize == 0)
    {
      SIZ(r) = 0;
      return;
    }

  /* Sign of the final result. */
  rneg = (bsize < 0 && (e & 1) != 0);
  bsize = ABS (bsize);
  TRACE (printf ("rneg %d\n", rneg));

  /* Whether b must be copied aside before r is realloc'ed/overwritten.  */
  r_bp_overlap = (PTR(r) == bp);

  /* Strip low zero limbs from b.  Each stripped limb contributes e zero
     limbs to the final result.  */
  rtwos_limbs = 0;
  for (blimb = *bp; blimb == 0; blimb = *++bp)
    {
      rtwos_limbs += e;
      bsize--; ASSERT (bsize >= 1);
    }
  TRACE (printf ("trailing zero rtwos_limbs=%ld\n", rtwos_limbs));

  /* Strip low zero bits from b.  The e*btwos result bits are split into
     whole limbs and a final sub-limb shift.  */
  count_trailing_zeros (btwos, blimb);
  blimb >>= btwos;
  rtwos_bits = e * btwos;
  rtwos_limbs += rtwos_bits / GMP_NUMB_BITS;
  rtwos_bits %= GMP_NUMB_BITS;
  TRACE (printf ("trailing zero btwos=%d rtwos_limbs=%ld rtwos_bits=%lu\n",
		 btwos, rtwos_limbs, rtwos_bits));

  TMP_MARK;

  /* rl (and rl_high with native mul_2) collects the factors of b picked
     off for the low set bits of e during the in-limb powering below.  */
  rl = 1;
#if HAVE_NATIVE_mpn_mul_2
  rl_high = 0;
#endif

  if (bsize == 1)
    {
    bsize_1:   /* also reached from bsize==2 when the rshift leaves one limb */
      /* Power up as far as possible within blimb.  We start here with e!=0,
	 but if e is small then we might reach e==0 and the whole b^e in rl.
	 Notice this code works when blimb==1 too, reaching e==0.  */

      while (blimb <= GMP_NUMB_HALFMAX)
	{
	  TRACE (printf ("small e=0x%lX blimb=0x%lX rl=0x%lX\n",
			 e, blimb, rl));
	  ASSERT (e != 0);
	  if ((e & 1) != 0)
	    rl *= blimb;     /* fold this bit's factor into rl; rl is a
				smaller power of b than blimb, so no
				overflow here */
	  e >>= 1;
	  if (e == 0)
	    goto got_rl;
	  blimb *= blimb;    /* no overflow: blimb <= GMP_NUMB_HALFMAX */
	}

#if HAVE_NATIVE_mpn_mul_2
      TRACE (printf ("single power, e=0x%lX b=0x%lX rl=0x%lX\n",
		     e, blimb, rl));

      /* Can power b once more into blimb:blimb_low.  umul_ppmm works on
	 full limbs, so shift one operand up by the nail bits and shift the
	 low product limb back down.  */
      bsize = 2;
      ASSERT (e != 0);
      if ((e & 1) != 0)
	{
	  umul_ppmm (rl_high, rl, rl, blimb << GMP_NAIL_BITS);
	  rl >>= GMP_NAIL_BITS;
	}
      e >>= 1;
      umul_ppmm (blimb, blimb_low, blimb, blimb << GMP_NAIL_BITS);
      blimb_low >>= GMP_NAIL_BITS;

    got_rl:
      TRACE (printf ("double power e=0x%lX blimb=0x%lX:0x%lX rl=0x%lX:%lX\n",
		     e, blimb, blimb_low, rl_high, rl));

      /* Combine left-over rtwos_bits into rl_high:rl to be handled by the
	 final mul_1 or mul_2 rather than a separate lshift.
	 - rl_high:rl mustn't be 1 (since then there's no final mul)
	 - rl_high mustn't overflow
	 - rl_high mustn't change to non-zero, since mul_1+lshift is
	   probably faster than mul_2 (FIXME: is this true?)  */

      if (rtwos_bits != 0
	  && ! (rl_high == 0 && rl == 1)
	  && (rl_high >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
	{
	  mp_limb_t  new_rl_high = (rl_high << rtwos_bits)
	    | (rl >> (GMP_NUMB_BITS-rtwos_bits));
	  if (! (rl_high == 0 && new_rl_high != 0))
	    {
	      rl_high = new_rl_high;
	      rl <<= rtwos_bits;
	      rtwos_bits = 0;
	      TRACE (printf ("merged rtwos_bits, rl=0x%lX:%lX\n",
			     rl_high, rl));
	    }
	}
#else
    got_rl:
      TRACE (printf ("small power e=0x%lX blimb=0x%lX rl=0x%lX\n",
		     e, blimb, rl));

      /* Combine left-over rtwos_bits into rl to be handled by the final
	 mul_1 rather than a separate lshift.
	 - rl mustn't be 1 (since then there's no final mul)
	 - rl mustn't overflow  */

      if (rtwos_bits != 0
	  && rl != 1
	  && (rl >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
	{
	  rl <<= rtwos_bits;
	  rtwos_bits = 0;
	  TRACE (printf ("merged rtwos_bits, rl=0x%lX\n", rl));
	}
#endif
    }
  else if (bsize == 2)
    {
      /* Right shift the two-limb base by btwos in place in blimb/bsecond,
	 the low bits of bp[1] moving down into blimb.  */
      mp_limb_t  bsecond = bp[1];
      if (btwos != 0)
	blimb |= (bsecond << (GMP_NUMB_BITS - btwos)) & GMP_NUMB_MASK;
      bsecond >>= btwos;
      if (bsecond == 0)
	{
	  /* Two limbs became one after rshift. */
	  bsize = 1;
	  goto bsize_1;
	}

      TRACE (printf ("bsize==2 using b=0x%lX:%lX", bsecond, blimb));
#if HAVE_NATIVE_mpn_mul_2
      blimb_low = blimb;
#else
      /* No mul_2 loop below, so present the shifted base as an mpn.  */
      bp = b_twolimbs;
      b_twolimbs[0] = blimb;
      b_twolimbs[1] = bsecond;
#endif
      blimb = bsecond;
    }
  else
    {
      /* Big base: make a shifted (or plain) copy of b when the original
	 overlaps r (which is about to be realloc'ed) or needs the rshift.  */
      if (r_bp_overlap || btwos != 0)
	{
	  mp_ptr tp = TMP_ALLOC_LIMBS (bsize);
	  MPN_RSHIFT_OR_COPY (tp, bp, bsize, btwos);
	  bp = tp;
	  TRACE (printf ("rshift or copy bp,bsize, new bsize=%ld\n", bsize));
	}
#if HAVE_NATIVE_mpn_mul_2
      /* in case 3 limbs rshift to 2 and hence use the mul_2 loop below */
      blimb_low = bp[0];
#endif
      blimb = bp[bsize-1];

      TRACE (printf ("big bsize=%ld  ", bsize);
	     mpn_trace ("b", bp, bsize));
    }

  /* At this point blimb is the most significant limb of the base to use.

     Each factor of b takes (bsize*BPML-cnt) bits and there's e of them; +1
     limb to round up the division; +1 for multiplies all using an extra
     limb over the true size; +2 for rl at the end; +1 for lshift at the
     end.

     The size calculation here is reasonably accurate.  The base is at least
     half a limb, so in 32 bits the worst case is 2^16+1 treated as 17 bits
     when it will power up as just over 16, an overestimate of 17/16 =
     6.25%.  For a 64-bit limb it's half that.

     If e==0 then blimb won't be anything useful (though it will be
     non-zero), but that doesn't matter since we just end up with ralloc==5,
     and that's fine for 2 limbs of rl and 1 of lshift.  */

  ASSERT (blimb != 0);
  count_leading_zeros (cnt, blimb);
  ralloc = (bsize*GMP_NUMB_BITS - cnt + GMP_NAIL_BITS) * e / GMP_NUMB_BITS + 5;
  TRACE (printf ("ralloc %ld, from bsize=%ld blimb=0x%lX cnt=%d\n",
		 ralloc, bsize, blimb, cnt));
  rp = MPZ_REALLOC (r, ralloc + rtwos_limbs);

  /* Low zero limbs resulting from powers of 2. */
  MPN_ZERO (rp, rtwos_limbs);
  rp += rtwos_limbs;

  if (e == 0)
    {
      /* Any e==0 other than via bsize==1 or bsize==2 is covered at the
	 start, so the whole power is already in rl (and rl_high).  */
      rp[0] = rl;
      rsize = 1;
#if HAVE_NATIVE_mpn_mul_2
      rp[1] = rl_high;
      rsize += (rl_high != 0);
#endif
      ASSERT (rp[rsize-1] != 0);
    }
  else
    {
      mp_ptr     tp;
      mp_size_t  talloc;

      /* In the mpn_mul_1 or mpn_mul_2 loops or in the mpn_mul loop when the
	 low bit of e is zero, tp only has to hold the second last power
	 step, which is half the size of the final result.  There's no need
	 to round up the divide by 2, since ralloc includes a +2 for rl
	 which not needed by tp.  In the mpn_mul loop when the low bit of e
	 is 1, tp must hold nearly the full result, so just size it the same
	 as rp.  */

      talloc = ralloc;
#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2 || (e & 1) == 0)
	talloc /= 2;
#else
      if (bsize <= 1 || (e & 1) == 0)
	talloc /= 2;
#endif
      TRACE (printf ("talloc %ld\n", talloc));
      tp = TMP_ALLOC_LIMBS (talloc);

      /* Go from high to low over the bits of e, starting with i pointing at
	 the bit below the highest 1 (which will mean i==-1 if e==1).  */
      count_leading_zeros (cnt, (mp_limb_t) e);
      i = GMP_LIMB_BITS - cnt - 2;

#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2)
	{
	  mp_limb_t  mult[2];

	  /* Any bsize==1 will have been powered above to be two limbs. */
	  ASSERT (bsize == 2);
	  ASSERT (blimb != 0);

	  /* Arrange the final result ends up in r, not in the temp space.
	     Each MPN_SQR swaps rp and tp, so start swapped when the loop
	     would do an even number of swaps.  */
	  if ((i & 1) == 0)
	    SWAP_RP_TP;

	  rp[0] = blimb_low;
	  rp[1] = blimb;
	  rsize = 2;

	  mult[0] = blimb_low;
	  mult[1] = blimb;

	  for ( ; i >= 0; i--)
	    {
	      TRACE (printf ("mul_2 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
			     i, e, rsize, ralloc, talloc);
		     mpn_trace ("r", rp, rsize));

	      MPN_SQR (tp, talloc, rp, rsize);
	      SWAP_RP_TP;
	      if ((e & (1L << i)) != 0)
		MPN_MUL_2 (rp, rsize, ralloc, mult);
	    }

	  /* Apply the factors collected during the in-limb powering.  */
	  TRACE (mpn_trace ("mul_2 before rl, r", rp, rsize));
	  if (rl_high != 0)
	    {
	      mult[0] = rl;
	      mult[1] = rl_high;
	      MPN_MUL_2 (rp, rsize, ralloc, mult);
	    }
	  else if (rl != 1)
	    MPN_MUL_1 (rp, rsize, ralloc, rl);
	}
#else
      if (bsize == 1)
	{
	  /* Arrange the final result ends up in r, not in the temp space.
	     Each MPN_SQR swaps rp and tp, so start swapped when the loop
	     would do an even number of swaps.  */
	  if ((i & 1) == 0)
	    SWAP_RP_TP;

	  rp[0] = blimb;
	  rsize = 1;

	  for ( ; i >= 0; i--)
	    {
	      TRACE (printf ("mul_1 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
			     i, e, rsize, ralloc, talloc);
		     mpn_trace ("r", rp, rsize));

	      MPN_SQR (tp, talloc, rp, rsize);
	      SWAP_RP_TP;
	      if ((e & (1L << i)) != 0)
		MPN_MUL_1 (rp, rsize, ralloc, blimb);
	    }

	  /* Apply the factor collected during the in-limb powering.  */
	  TRACE (mpn_trace ("mul_1 before rl, r", rp, rsize));
	  if (rl != 1)
	    MPN_MUL_1 (rp, rsize, ralloc, rl);
	}
#endif
      else
	{
	  int  parity;

	  /* Arrange the final result ends up in r, not in the temp space.
	     Here each loop iteration does one swap for the square plus one
	     more for each set bit of e, hence the parity calculation.  */
	  ULONG_PARITY (parity, e);
	  if (((parity ^ i) & 1) != 0)
	    SWAP_RP_TP;

	  MPN_COPY (rp, bp, bsize);
	  rsize = bsize;

	  for ( ; i >= 0; i--)
	    {
	      TRACE (printf ("mul loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
			     i, e, rsize, ralloc, talloc);
		     mpn_trace ("r", rp, rsize));

	      MPN_SQR (tp, talloc, rp, rsize);
	      SWAP_RP_TP;
	      if ((e & (1L << i)) != 0)
		{
		  MPN_MUL (tp, talloc, rp, rsize, bp, bsize);
		  SWAP_RP_TP;
		}
	    }
	}
    }

  /* The swap parity above guarantees the result landed back in r.  */
  ASSERT (rp == PTR(r) + rtwos_limbs);
  TRACE (mpn_trace ("end loop r", rp, rsize));
  TMP_FREE;

  /* Apply any partial limb factors of 2. */
  if (rtwos_bits != 0)
    {
      MPN_LSHIFT (rp, rsize, ralloc, (unsigned) rtwos_bits);
      TRACE (mpn_trace ("lshift r", rp, rsize));
    }

  rsize += rtwos_limbs;
  SIZ(r) = (rneg ? -rsize : rsize);
}
523