/* mpz_n_pow_ui -- mpn raised to ulong.

   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
   FUTURE GNU MP RELEASES.

Copyright 2001, 2002, 2005, 2012, 2015, 2020 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include <stdlib.h>
#include <stdio.h>
#include "gmp-impl.h"
#include "longlong.h"


/* Change this to "#define TRACE(x) x" for some traces. */
#define TRACE(x)


/* Use this to test the mul_2 code on a CPU without a native version of that
   routine.  */
#if 0
#define mpn_mul_2  refmpn_mul_2
#define HAVE_NATIVE_mpn_mul_2  1
#endif

/* mpz_pow_ui and mpz_ui_pow_ui want to share almost all of this code.
   ui_pow_ui doesn't need the mpn_mul based powering loop or the tests on
   bsize==2 or >2, but separating that isn't easy because there's shared
   code both before and after (the size calculations and the powers of 2
   handling).

   Alternatives:

   It would work to just use the mpn_mul powering loop for 1 and 2 limb
   bases, but the current separate loop allows mul_1 and mul_2 to be done
   in-place, which might help cache locality a bit.  If mpn_mul was relaxed
   to allow source==dest when vn==1 or 2 then some pointer twiddling might
   let us get the same effect in one loop.

   The initial powering for bsize==1 into blimb or blimb:blimb_low doesn't
   form the biggest possible power of b that fits, only the biggest power
   with a power-of-2 exponent, ie. b^(2^n).  It'd be possible to choose a
   bigger power, perhaps using mp_bases[b].big_base for small b, and
   thereby get better value from mpn_mul_1 or mpn_mul_2 in the bignum
   powering.  It's felt that doing so would be more complicated than it's
   worth, and could well end up being a slowdown for small e.  For big e on
   the other hand the algorithm is dominated by mpn_sqr so there wouldn't
   be much of a saving.  The current code can be viewed as simply doing the
   first few steps of the powering in a single or double limb where
   possible.

   If r==b, and btwos==0, and r must be realloc'ed, then the temporary
   copy made of b is unnecessary.  We could just use the old alloc'ed block
   and free it at the end.  But arranging this seems like a lot more trouble
   than it's worth.  */
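
/* For orientation: the rest of this file implements binary ("square and
   multiply") powering on mpn values.  The sketch below shows the same
   scheme on plain unsigned longs.  It is illustrative only and not part of
   the build; ref_pow_ui is a hypothetical helper and assumes b^e fits in
   an unsigned long.  */
#if 0
static unsigned long
ref_pow_ui (unsigned long b, unsigned long e)
{
  unsigned long r = 1;
  while (e != 0)
    {
      if ((e & 1) != 0)
        r *= b;            /* fold in the current power for a set bit */
      e >>= 1;
      if (e == 0)
        break;             /* skip the last, unneeded squaring */
      b *= b;              /* square for the next bit */
    }
  return r;
}
#endif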

/* floor(sqrt(GMP_NUMB_MAX)), ie. the biggest value that can be squared in
   a limb without overflowing.
   FIXME: This formula is an underestimate when GMP_NUMB_BITS is odd.  */

#define GMP_NUMB_HALFMAX  (((mp_limb_t) 1 << GMP_NUMB_BITS/2) - 1)
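
/* For instance (illustrative): with nail-less 64-bit limbs
   GMP_NUMB_HALFMAX is 0xFFFFFFFF, whose square 0xFFFFFFFE00000001 still
   fits in one limb; with 32-bit limbs it is 0xFFFF, whose square is
   0xFFFE0001.  */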

/* The following are for convenience, they update the size and check the
   alloc.  */

#define MPN_SQR(dst, alloc, src, size)          \
  do {                                          \
    ASSERT (2*(size) <= (alloc));               \
    mpn_sqr (dst, src, size);                   \
    (size) *= 2;                                \
    (size) -= ((dst)[(size)-1] == 0);           \
  } while (0)
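
/* Example of the size bookkeeping in MPN_SQR (illustrative): squaring a
   3-limb src whose product occupies only 5 limbs leaves (dst)[5] == 0, so
   the adjustment above ends with size == 5 rather than 6.  */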

#define MPN_MUL(dst, alloc, src, size, src2, size2)     \
  do {                                                  \
    mp_limb_t cy;                                       \
    ASSERT ((size) + (size2) <= (alloc));               \
    cy = mpn_mul (dst, src, size, src2, size2);         \
    (size) += (size2) - (cy == 0);                      \
  } while (0)

#define MPN_MUL_2(ptr, size, alloc, mult)       \
  do {                                          \
    mp_limb_t cy;                               \
    ASSERT ((size)+2 <= (alloc));               \
    cy = mpn_mul_2 (ptr, ptr, size, mult);      \
    (size)++;                                   \
    (ptr)[(size)] = cy;                         \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_MUL_1(ptr, size, alloc, limb)       \
  do {                                          \
    mp_limb_t cy;                               \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_mul_1 (ptr, ptr, size, limb);      \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_LSHIFT(ptr, size, alloc, shift)     \
  do {                                          \
    mp_limb_t cy;                               \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_lshift (ptr, ptr, size, shift);    \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_RSHIFT_OR_COPY(dst, src, size, shift)       \
  do {                                                  \
    if ((shift) == 0)                                   \
      MPN_COPY (dst, src, size);                        \
    else                                                \
      {                                                 \
        mpn_rshift (dst, src, size, shift);             \
        (size) -= ((dst)[(size)-1] == 0);               \
      }                                                 \
  } while (0)


/* ralloc and talloc are only wanted for ASSERTs, after the initial space
   allocations.  Avoid writing values to them in a normal build, to ensure
   the compiler lets them go dead.  gcc already figures this out itself
   actually.  */

#define SWAP_RP_TP                                      \
  do {                                                  \
    MP_PTR_SWAP (rp, tp);                               \
    ASSERT_CODE (MP_SIZE_T_SWAP (ralloc, talloc));      \
  } while (0)


void
mpz_n_pow_ui (mpz_ptr r, mp_srcptr bp, mp_size_t bsize, unsigned long int e)
{
  mp_ptr rp;
  mp_size_t rtwos_limbs, ralloc, rsize;
  int rneg, i, cnt, btwos, r_bp_overlap;
  mp_limb_t blimb, rl;
  mp_bitcnt_t rtwos_bits;
#if HAVE_NATIVE_mpn_mul_2
  mp_limb_t blimb_low, rl_high;
#else
  mp_limb_t b_twolimbs[2];
#endif
  mp_limb_t ovfl;
  TMP_DECL;

  TRACE (printf ("mpz_n_pow_ui rp=0x%lX bp=0x%lX bsize=%ld e=%lu (0x%lX)\n",
                 PTR(r), bp, bsize, e, e);
         mpn_trace ("b", bp, bsize));

  ASSERT (bsize == 0 || bp[ABS(bsize)-1] != 0);
  ASSERT (MPN_SAME_OR_SEPARATE2_P (PTR(r), ALLOC(r), bp, ABS(bsize)));

  /* b^0 == 1, including 0^0 == 1 */
  if (e == 0)
    {
      MPZ_NEWALLOC (r, 1)[0] = 1;
      SIZ(r) = 1;
      return;
    }

  /* 0^e == 0 apart from 0^0 above */
  if (bsize == 0)
    {
      SIZ(r) = 0;
      return;
    }

  /* Sign of the final result. */
  rneg = (bsize < 0 && (e & 1) != 0);
  bsize = ABS (bsize);
  TRACE (printf ("rneg %d\n", rneg));

  r_bp_overlap = (PTR(r) == bp);

  /* Strip low zero limbs from b. */
  rtwos_limbs = 0;
  for (blimb = *bp; blimb == 0; blimb = *++bp)
    {
      rtwos_limbs += e;
      bsize--; ASSERT (bsize >= 1);
    }
  TRACE (printf ("trailing zero rtwos_limbs=%ld\n", rtwos_limbs));

  /* Strip low zero bits from b. */
  count_trailing_zeros (btwos, blimb);
  blimb >>= btwos;

  umul_ppmm (ovfl, rtwos_bits, e, btwos);
  if (ovfl)
    {
      fprintf (stderr, "gmp: overflow in mpz type\n");
      abort ();
    }

  rtwos_limbs += rtwos_bits / GMP_NUMB_BITS;
  rtwos_bits %= GMP_NUMB_BITS;
  TRACE (printf ("trailing zero btwos=%d rtwos_limbs=%ld rtwos_bits=%lu\n",
                 btwos, rtwos_limbs, rtwos_bits));
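
  /* Illustrative example, assuming nail-less 64-bit limbs: with btwos == 3
     low zero bits in b and e == 100, the result gains e*btwos == 300
     trailing zero bits, so rtwos_limbs grows by 4 and rtwos_bits ends up
     as 44 (300 == 4*64 + 44).  */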

  TMP_MARK;

  rl = 1;
#if HAVE_NATIVE_mpn_mul_2
  rl_high = 0;
#endif

  if (bsize == 1)
    {
    bsize_1:
      /* Power up as far as possible within blimb.  We start here with e!=0,
         but if e is small then we might reach e==0 with the whole b^e in rl.
         Notice this code works when blimb==1 too, reaching e==0.  */

      while (blimb <= GMP_NUMB_HALFMAX)
        {
          TRACE (printf ("small e=0x%lX blimb=0x%lX rl=0x%lX\n",
                         e, blimb, rl));
          ASSERT (e != 0);
          if ((e & 1) != 0)
            rl *= blimb;
          e >>= 1;
          if (e == 0)
            goto got_rl;
          blimb *= blimb;
        }

#if HAVE_NATIVE_mpn_mul_2
      TRACE (printf ("single power, e=0x%lX b=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Can power b once more into blimb:blimb_low */
      bsize = 2;
      ASSERT (e != 0);
      if ((e & 1) != 0)
        {
          umul_ppmm (rl_high, rl, rl, blimb << GMP_NAIL_BITS);
          rl >>= GMP_NAIL_BITS;
        }
      e >>= 1;
      umul_ppmm (blimb, blimb_low, blimb, blimb << GMP_NAIL_BITS);
      blimb_low >>= GMP_NAIL_BITS;

    got_rl:
      TRACE (printf ("double power e=0x%lX blimb=0x%lX:0x%lX rl=0x%lX:%lX\n",
                     e, blimb, blimb_low, rl_high, rl));

      /* Combine left-over rtwos_bits into rl_high:rl to be handled by the
         final mul_1 or mul_2 rather than a separate lshift.
         - rl_high:rl mustn't be 1 (since then there's no final mul)
         - rl_high mustn't overflow
         - rl_high mustn't change to non-zero, since mul_1+lshift is
           probably faster than mul_2 (FIXME: is this true?)  */

      if (rtwos_bits != 0
          && ! (rl_high == 0 && rl == 1)
          && (rl_high >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          mp_limb_t new_rl_high = (rl_high << rtwos_bits)
            | (rl >> (GMP_NUMB_BITS-rtwos_bits));
          if (! (rl_high == 0 && new_rl_high != 0))
            {
              rl_high = new_rl_high;
              rl <<= rtwos_bits;
              rtwos_bits = 0;
              TRACE (printf ("merged rtwos_bits, rl=0x%lX:%lX\n",
                             rl_high, rl));
            }
        }
#else
    got_rl:
      TRACE (printf ("small power e=0x%lX blimb=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Combine left-over rtwos_bits into rl to be handled by the final
         mul_1 rather than a separate lshift.
         - rl mustn't be 1 (since then there's no final mul)
         - rl mustn't overflow  */

      if (rtwos_bits != 0
          && rl != 1
          && (rl >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          rl <<= rtwos_bits;
          rtwos_bits = 0;
          TRACE (printf ("merged rtwos_bits, rl=0x%lX\n", rl));
        }
#endif
    }
  else if (bsize == 2)
    {
      mp_limb_t bsecond = bp[1];
      if (btwos != 0)
        blimb |= (bsecond << (GMP_NUMB_BITS - btwos)) & GMP_NUMB_MASK;
      bsecond >>= btwos;
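
      /* Illustrative example, assuming nail-less 64-bit limbs: for
         b == {0x10, 0x1} (i.e. 2^64 + 16) we have btwos == 4, so blimb
         becomes 0x1000000000000001 (2^60 + 1) and bsecond becomes 0; the
         check below then falls back to the single-limb code.  */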
      if (bsecond == 0)
        {
          /* Two limbs became one after rshift. */
          bsize = 1;
          goto bsize_1;
        }

      TRACE (printf ("bsize==2 using b=0x%lX:%lX", bsecond, blimb));
#if HAVE_NATIVE_mpn_mul_2
      blimb_low = blimb;
#else
      bp = b_twolimbs;
      b_twolimbs[0] = blimb;
      b_twolimbs[1] = bsecond;
#endif
      blimb = bsecond;
    }
  else
    {
      if (r_bp_overlap || btwos != 0)
        {
          mp_ptr tp = TMP_ALLOC_LIMBS (bsize);
          MPN_RSHIFT_OR_COPY (tp, bp, bsize, btwos);
          bp = tp;
          TRACE (printf ("rshift or copy bp,bsize, new bsize=%ld\n", bsize));
        }
#if HAVE_NATIVE_mpn_mul_2
      /* in case 3 limbs rshift to 2 and hence use the mul_2 loop below */
      blimb_low = bp[0];
#endif
      blimb = bp[bsize-1];

      TRACE (printf ("big bsize=%ld ", bsize);
             mpn_trace ("b", bp, bsize));
    }

  /* At this point blimb is the most significant limb of the base to use.

     Each factor of b takes (bsize*BPML-cnt) bits and there's e of them; +1
     limb to round up the division; +1 for multiplies all using an extra
     limb over the true size; +2 for rl at the end; +1 for lshift at the
     end.

     The size calculation here is reasonably accurate.  The base is at least
     half a limb, so in 32 bits the worst case is 2^16+1 treated as 17 bits
     when it will power up as just over 16, an overestimate of 17/16 =
     6.25%.  For a 64-bit limb it's half that.

     If e==0 then blimb won't be anything useful (though it will be
     non-zero), but that doesn't matter since we just end up with ralloc==5,
     and that's fine for 2 limbs of rl and 1 of lshift.  */
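
  /* Illustrative example, assuming nail-less 64-bit limbs: bsize == 1,
     blimb occupying 17 bits (cnt == 47) and e == 1000 gives 17*1000 ==
     17000 bits, hence ralloc == 17000/64 + 5 == 270 limbs, comfortably
     above the roughly 251 limbs the true result needs.  */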

  ASSERT (blimb != 0);
  count_leading_zeros (cnt, blimb);

  umul_ppmm (ovfl, ralloc, (bsize*GMP_NUMB_BITS - cnt + GMP_NAIL_BITS), e);
  if (ovfl)
    {
      fprintf (stderr, "gmp: overflow in mpz type\n");
      abort ();
    }
  ralloc = ralloc / GMP_NUMB_BITS + 5;

  TRACE (printf ("ralloc %ld, from bsize=%ld blimb=0x%lX cnt=%d\n",
                 ralloc, bsize, blimb, cnt));
  rp = MPZ_NEWALLOC (r, ralloc + rtwos_limbs);

  /* Low zero limbs resulting from powers of 2. */
  MPN_ZERO (rp, rtwos_limbs);
  rp += rtwos_limbs;

  if (e == 0)
    {
      /* Any e==0 other than via bsize==1 or bsize==2 is covered at the
         start. */
      rp[0] = rl;
      rsize = 1;
#if HAVE_NATIVE_mpn_mul_2
      rp[1] = rl_high;
      rsize += (rl_high != 0);
#endif
      ASSERT (rp[rsize-1] != 0);
    }
  else
    {
      mp_ptr tp;
      mp_size_t talloc;

      /* In the mpn_mul_1 or mpn_mul_2 loops or in the mpn_mul loop when the
         low bit of e is zero, tp only has to hold the second last power
         step, which is half the size of the final result.  There's no need
         to round up the divide by 2, since ralloc includes a +2 for rl
         which is not needed by tp.  In the mpn_mul loop when the low bit of
         e is 1, tp must hold nearly the full result, so just size it the
         same as rp.  */

      talloc = ralloc;
#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2 || (e & 1) == 0)
        talloc /= 2;
#else
      if (bsize <= 1 || (e & 1) == 0)
        talloc /= 2;
#endif
      TRACE (printf ("talloc %ld\n", talloc));
      tp = TMP_ALLOC_LIMBS (talloc);

      /* Go from high to low over the bits of e, starting with i pointing at
         the bit below the highest 1 (which will mean i==-1 if e==1).  */
      count_leading_zeros (cnt, (mp_limb_t) e);
      i = GMP_LIMB_BITS - cnt - 2;
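
      /* Illustrative trace, assuming 64-bit limbs: for e == 13 (binary
         1101), count_leading_zeros gives cnt == 60 so i starts at 2; the
         powering loop below squares for bits 2, 1, 0 and multiplies for
         the set bits 2 and 0, taking the initial b through b^3, b^6, b^12
         and finally b^13.  */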

#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2)
        {
          mp_limb_t mult[2];

          /* Any bsize==1 will have been powered above to be two limbs. */
          ASSERT (bsize == 2);
          ASSERT (blimb != 0);

          /* Arrange the final result ends up in r, not in the temp space */
          if ((i & 1) == 0)
            SWAP_RP_TP;

          rp[0] = blimb_low;
          rp[1] = blimb;
          rsize = 2;

          mult[0] = blimb_low;
          mult[1] = blimb;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_2 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_2 (rp, rsize, ralloc, mult);
            }

          TRACE (mpn_trace ("mul_2 before rl, r", rp, rsize));
          if (rl_high != 0)
            {
              mult[0] = rl;
              mult[1] = rl_high;
              MPN_MUL_2 (rp, rsize, ralloc, mult);
            }
          else if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#else
      if (bsize == 1)
        {
          /* Arrange the final result ends up in r, not in the temp space */
          if ((i & 1) == 0)
            SWAP_RP_TP;

          rp[0] = blimb;
          rsize = 1;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_1 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_1 (rp, rsize, ralloc, blimb);
            }

          TRACE (mpn_trace ("mul_1 before rl, r", rp, rsize));
          if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#endif
      else
        {
          int parity;

          /* Arrange the final result ends up in r, not in the temp space */
          ULONG_PARITY (parity, e);
          if (((parity ^ i) & 1) != 0)
            SWAP_RP_TP;

          MPN_COPY (rp, bp, bsize);
          rsize = bsize;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                {
                  MPN_MUL (tp, talloc, rp, rsize, bp, bsize);
                  SWAP_RP_TP;
                }
            }
        }
    }

  ASSERT (rp == PTR(r) + rtwos_limbs);
  TRACE (mpn_trace ("end loop r", rp, rsize));
  TMP_FREE;

  /* Apply any partial limb factors of 2. */
  if (rtwos_bits != 0)
    {
      MPN_LSHIFT (rp, rsize, ralloc, (unsigned) rtwos_bits);
      TRACE (mpn_trace ("lshift r", rp, rsize));
    }

  rsize += rtwos_limbs;
  SIZ(r) = (rneg ? -rsize : rsize);
}