/* longlong.h -- definitions for mixed size 32/64 bit arithmetic.
2
3 Copyright 1991-1994, 1996, 1997, 1999-2005, 2007-2009, 2011-2020 Free Software
4 Foundation, Inc.
5
6 This file is part of the GNU MP Library.
7
8 The GNU MP Library is free software; you can redistribute it and/or modify
9 it under the terms of either:
10
11 * the GNU Lesser General Public License as published by the Free
12 Software Foundation; either version 3 of the License, or (at your
13 option) any later version.
14
15 or
16
17 * the GNU General Public License as published by the Free Software
18 Foundation; either version 2 of the License, or (at your option) any
19 later version.
20
21 or both in parallel, as here.
22
23 The GNU MP Library is distributed in the hope that it will be useful, but
24 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
26 for more details.
27
28 You should have received copies of the GNU General Public License and the
29 GNU Lesser General Public License along with the GNU MP Library. If not,
30 see https://www.gnu.org/licenses/. */
31
32 /* You have to define the following before including this file:
33
34 UWtype -- An unsigned type, default type for operations (typically a "word")
35 UHWtype -- An unsigned type, at least half the size of UWtype
   UDWtype -- An unsigned type, at least twice as large as UWtype
37 W_TYPE_SIZE -- size in bits of UWtype
38
39 SItype, USItype -- Signed and unsigned 32 bit types
40 DItype, UDItype -- Signed and unsigned 64 bit types
41
42 On a 32 bit machine UWtype should typically be USItype;
43 on a 64 bit machine, UWtype should typically be UDItype.
44
45 Optionally, define:
46
47 LONGLONG_STANDALONE -- Avoid code that needs machine-dependent support files
48 NO_ASM -- Disable inline asm
49
50
51 CAUTION! Using this version of longlong.h outside of GMP is not safe. You
52 need to include gmp.h and gmp-impl.h, or certain things might not work as
53 expected.
54 */
55
56 #define __BITS4 (W_TYPE_SIZE / 4)
57 #define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
58 #define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
59 #define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
60
61 /* This is used to make sure no undesirable sharing between different libraries
62 that use this file takes place. */
63 #ifndef __MPN
64 #define __MPN(x) __##x
65 #endif
66
67 /* Define auxiliary asm macros.
68
69 1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
70 UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
71 word product in HIGH_PROD and LOW_PROD.
72
73 2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
74 UDWtype product. This is just a variant of umul_ppmm.
75
76 3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
77 denominator) divides a UDWtype, composed by the UWtype integers
78 HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
79 in QUOTIENT and the remainder in REMAINDER. HIGH_NUMERATOR must be less
80 than DENOMINATOR for correct operation. If, in addition, the most
81 significant bit of DENOMINATOR must be 1, then the pre-processor symbol
82 UDIV_NEEDS_NORMALIZATION is defined to 1.
83
84 4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
85 denominator). Like udiv_qrnnd but the numbers are signed. The quotient
86 is rounded towards 0.
87
88 5) count_leading_zeros(count, x) counts the number of zero-bits from the
89 msb to the first non-zero bit in the UWtype X. This is the number of
90 steps X needs to be shifted left to set the msb. Undefined for X == 0,
91 unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.
92
93 6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
94 from the least significant end.
95
96 7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
97 high_addend_2, low_addend_2) adds two UWtype integers, composed by
98 HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
99 respectively. The result is placed in HIGH_SUM and LOW_SUM. Overflow
100 (i.e. carry out) is not stored anywhere, and is lost.
101
   8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
   high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
   composed by HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
   LOW_SUBTRAHEND respectively.  The result is placed in HIGH_DIFFERENCE
   and LOW_DIFFERENCE.  Overflow (i.e. carry out) is not stored anywhere,
   and is lost.
108
109 If any of these macros are left undefined for a particular CPU,
110 C macros are used.
111
112
113 Notes:
114
115 For add_ssaaaa the two high and two low addends can both commute, but
116 unfortunately gcc only supports one "%" commutative in each asm block.
117 This has always been so but is only documented in recent versions
118 (eg. pre-release 3.3). Having two or more "%"s can cause an internal
119 compiler error in certain rare circumstances.
120
121 Apparently it was only the last "%" that was ever actually respected, so
122 the code has been updated to leave just that. Clearly there's a free
123 choice whether high or low should get it, if there's a reason to favour
124 one over the other. Also obviously when the constraints on the two
125 operands are identical there's no benefit to the reloader in any "%" at
126 all.
127
128 */
129
130 /* The CPUs come in alphabetical order below.
131
132 Please add support for more CPUs here, or improve the current support
133 for the CPUs below! */
134
135
136 /* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc
137 3.4 __builtin_clzl or __builtin_clzll, according to our limb size.
138 Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or
139 __builtin_ctzll.
140
141 These builtins are only used when we check what code comes out, on some
142 chips they're merely libgcc calls, where we will instead want an inline
143 in that case (either asm or generic C).
144
145 These builtins are better than an asm block of the same insn, since an
146 asm block doesn't give gcc any information about scheduling or resource
147 usage. We keep an asm block for use on prior versions of gcc though.
148
149 For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but
150 it's not used (for count_leading_zeros) because it generally gives extra
151 code to ensure the result is 0 when the input is 0, which we don't need
152 or want. */
153
154 #ifdef _LONG_LONG_LIMB
155 #define count_leading_zeros_gcc_clz(count,x) \
156 do { \
157 ASSERT ((x) != 0); \
158 (count) = __builtin_clzll (x); \
159 } while (0)
160 #else
161 #define count_leading_zeros_gcc_clz(count,x) \
162 do { \
163 ASSERT ((x) != 0); \
164 (count) = __builtin_clzl (x); \
165 } while (0)
166 #endif
167
168 #ifdef _LONG_LONG_LIMB
169 #define count_trailing_zeros_gcc_ctz(count,x) \
170 do { \
171 ASSERT ((x) != 0); \
172 (count) = __builtin_ctzll (x); \
173 } while (0)
174 #else
175 #define count_trailing_zeros_gcc_ctz(count,x) \
176 do { \
177 ASSERT ((x) != 0); \
178 (count) = __builtin_ctzl (x); \
179 } while (0)
180 #endif
181
182
183 /* FIXME: The macros using external routines like __MPN(count_leading_zeros)
184 don't need to be under !NO_ASM */
185 #if ! defined (NO_ASM)
186
187 #if defined (__alpha) && W_TYPE_SIZE == 64
188 /* Most alpha-based machines, except Cray systems. */
189 #if defined (__GNUC__)
190 #if __GMP_GNUC_PREREQ (3,3)
191 #define umul_ppmm(ph, pl, m0, m1) \
192 do { \
193 UDItype __m0 = (m0), __m1 = (m1); \
194 (ph) = __builtin_alpha_umulh (__m0, __m1); \
195 (pl) = __m0 * __m1; \
196 } while (0)
197 #else
198 #define umul_ppmm(ph, pl, m0, m1) \
199 do { \
200 UDItype __m0 = (m0), __m1 = (m1); \
201 __asm__ ("umulh %r1,%2,%0" \
202 : "=r" (ph) \
203 : "%rJ" (__m0), "rI" (__m1)); \
204 (pl) = __m0 * __m1; \
205 } while (0)
206 #endif
207 #else /* ! __GNUC__ */
208 #include <machine/builtins.h>
209 #define umul_ppmm(ph, pl, m0, m1) \
210 do { \
211 UDItype __m0 = (m0), __m1 = (m1); \
212 (ph) = __UMULH (__m0, __m1); \
213 (pl) = __m0 * __m1; \
214 } while (0)
215 #endif
216 #ifndef LONGLONG_STANDALONE
217 #define udiv_qrnnd(q, r, n1, n0, d) \
218 do { UWtype __di; \
219 __di = __MPN(invert_limb) (d); \
220 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
221 } while (0)
222 #define UDIV_PREINV_ALWAYS 1
223 #define UDIV_NEEDS_NORMALIZATION 1
224 #endif /* LONGLONG_STANDALONE */
225
226 /* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm
227 always goes into libgmp.so, even when not actually used. */
228 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
229
230 #if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX
231 #define count_leading_zeros(COUNT,X) \
232 __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X))
233 #define count_trailing_zeros(COUNT,X) \
234 __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X))
235 #endif /* clz/ctz using cix */
236
237 #if ! defined (count_leading_zeros) \
238 && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE)
239 /* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0.
240 "$31" is written explicitly in the asm, since an "r" constraint won't
241 select reg 31. There seems no need to worry about "r31" syntax for cray,
242 since gcc itself (pre-release 3.4) emits just $31 in various places. */
243 #define ALPHA_CMPBGE_0(dst, src) \
244 do { asm ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0)
245 /* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts
246 them, locating the highest non-zero byte. A second __clz_tab lookup
247 counts the leading zero bits in that byte, giving the result. */
248 #define count_leading_zeros(count, x) \
249 do { \
250 UWtype __clz__b, __clz__c, __clz__x = (x); \
251 ALPHA_CMPBGE_0 (__clz__b, __clz__x); /* zero bytes */ \
252 __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F]; /* 8 to 1 byte */ \
253 __clz__b = __clz__b * 8 - 7; /* 57 to 1 shift */ \
254 __clz__x >>= __clz__b; \
255 __clz__c = __clz_tab [__clz__x]; /* 8 to 1 bit */ \
256 __clz__b = 65 - __clz__b; \
257 (count) = __clz__b - __clz__c; \
258 } while (0)
259 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
260 #endif /* clz using cmpbge */
261
262 #if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE)
263 #if HAVE_ATTRIBUTE_CONST
264 long __MPN(count_leading_zeros) (UDItype) __attribute__ ((const));
265 #else
266 long __MPN(count_leading_zeros) (UDItype);
267 #endif
268 #define count_leading_zeros(count, x) \
269 ((count) = __MPN(count_leading_zeros) (x))
270 #endif /* clz using mpn */
271 #endif /* __alpha */
272
273 #if defined (__AVR) && W_TYPE_SIZE == 8
274 #define umul_ppmm(ph, pl, m0, m1) \
275 do { \
276 unsigned short __p = (unsigned short) (m0) * (m1); \
277 (ph) = __p >> 8; \
278 (pl) = __p; \
279 } while (0)
280 #endif /* AVR */
281
282 #if defined (_CRAY) && W_TYPE_SIZE == 64
283 #include <intrinsics.h>
284 #define UDIV_PREINV_ALWAYS 1
285 #define UDIV_NEEDS_NORMALIZATION 1
286 long __MPN(count_leading_zeros) (UDItype);
287 #define count_leading_zeros(count, x) \
288 ((count) = _leadz ((UWtype) (x)))
289 #if defined (_CRAYIEEE) /* I.e., Cray T90/ieee, T3D, and T3E */
290 #define umul_ppmm(ph, pl, m0, m1) \
291 do { \
292 UDItype __m0 = (m0), __m1 = (m1); \
293 (ph) = _int_mult_upper (__m0, __m1); \
294 (pl) = __m0 * __m1; \
295 } while (0)
296 #ifndef LONGLONG_STANDALONE
297 #define udiv_qrnnd(q, r, n1, n0, d) \
298 do { UWtype __di; \
299 __di = __MPN(invert_limb) (d); \
300 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
301 } while (0)
302 #endif /* LONGLONG_STANDALONE */
303 #endif /* _CRAYIEEE */
304 #endif /* _CRAY */
305
306 #if defined (__ia64) && W_TYPE_SIZE == 64
307 /* This form encourages gcc (pre-release 3.4 at least) to emit predicated
308 "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency. The generic
309 code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
310 register, which takes an extra cycle. */
311 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
312 do { \
313 UWtype __x; \
314 __x = (al) - (bl); \
315 if ((al) < (bl)) \
316 (sh) = (ah) - (bh) - 1; \
317 else \
318 (sh) = (ah) - (bh); \
319 (sl) = __x; \
320 } while (0)
321 #if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
322 /* Do both product parts in assembly, since that gives better code with
323 all gcc versions. Some callers will just use the upper part, and in
324 that situation we waste an instruction, but not any cycles. */
325 #define umul_ppmm(ph, pl, m0, m1) \
326 __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0" \
327 : "=&f" (ph), "=f" (pl) \
328 : "f" (m0), "f" (m1))
329 #define count_leading_zeros(count, x) \
330 do { \
331 UWtype _x = (x), _y, _a, _c; \
332 __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x)); \
333 __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y)); \
334 _c = (_a - 1) << 3; \
335 _x >>= _c; \
336 if (_x >= 1 << 4) \
337 _x >>= 4, _c += 4; \
338 if (_x >= 1 << 2) \
339 _x >>= 2, _c += 2; \
340 _c += _x >> 1; \
341 (count) = W_TYPE_SIZE - 1 - _c; \
342 } while (0)
343 /* similar to what gcc does for __builtin_ffs, but 0 based rather than 1
344 based, and we don't need a special case for x==0 here */
345 #define count_trailing_zeros(count, x) \
346 do { \
347 UWtype __ctz_x = (x); \
348 __asm__ ("popcnt %0 = %1" \
349 : "=r" (count) \
350 : "r" ((__ctz_x-1) & ~__ctz_x)); \
351 } while (0)
352 #endif
353 #if defined (__INTEL_COMPILER)
354 #include <ia64intrin.h>
355 #define umul_ppmm(ph, pl, m0, m1) \
356 do { \
357 UWtype __m0 = (m0), __m1 = (m1); \
358 ph = _m64_xmahu (__m0, __m1, 0); \
359 pl = __m0 * __m1; \
360 } while (0)
361 #endif
362 #ifndef LONGLONG_STANDALONE
363 #define udiv_qrnnd(q, r, n1, n0, d) \
364 do { UWtype __di; \
365 __di = __MPN(invert_limb) (d); \
366 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
367 } while (0)
368 #define UDIV_PREINV_ALWAYS 1
369 #define UDIV_NEEDS_NORMALIZATION 1
370 #endif
371 #endif
372
373
374 #if defined (__GNUC__) || defined(__lint__)
375
376 /* We sometimes need to clobber "cc" with gcc2, but that would not be
377 understood by gcc1. Use cpp to avoid major code duplication. */
378 #if __GNUC__ < 2
379 #define __CLOBBER_CC
380 #define __AND_CLOBBER_CC
381 #else /* __GNUC__ >= 2 */
382 #define __CLOBBER_CC : "cc"
383 #define __AND_CLOBBER_CC , "cc"
384 #endif /* __GNUC__ < 2 */
385
386 #if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
387 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
388 __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3" \
389 : "=r" (sh), "=&r" (sl) \
390 : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl))
391 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
392 __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3" \
393 : "=r" (sh), "=&r" (sl) \
394 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl))
395 #define umul_ppmm(xh, xl, m0, m1) \
396 do { \
397 USItype __m0 = (m0), __m1 = (m1); \
398 __asm__ ("multiplu %0,%1,%2" \
399 : "=r" (xl) \
400 : "r" (__m0), "r" (__m1)); \
401 __asm__ ("multmu %0,%1,%2" \
402 : "=r" (xh) \
403 : "r" (__m0), "r" (__m1)); \
404 } while (0)
405 #define udiv_qrnnd(q, r, n1, n0, d) \
406 __asm__ ("dividu %0,%3,%4" \
407 : "=r" (q), "=q" (r) \
408 : "1" (n1), "r" (n0), "r" (d))
409 #define count_leading_zeros(count, x) \
410 __asm__ ("clz %0,%1" \
411 : "=r" (count) \
412 : "r" (x))
413 #define COUNT_LEADING_ZEROS_0 32
414 #endif /* __a29k__ */
415
416 #if defined (__arc__)
417 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
418 __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3" \
419 : "=r" (sh), \
420 "=&r" (sl) \
421 : "r" ((USItype) (ah)), \
422 "rICal" ((USItype) (bh)), \
423 "%r" ((USItype) (al)), \
424 "rICal" ((USItype) (bl)))
425 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
426 __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
427 : "=r" (sh), \
428 "=&r" (sl) \
429 : "r" ((USItype) (ah)), \
430 "rICal" ((USItype) (bh)), \
431 "r" ((USItype) (al)), \
432 "rICal" ((USItype) (bl)))
433 #endif
434
435 #if defined (__arm__) && (defined (__thumb2__) || !defined (__thumb__)) \
436 && W_TYPE_SIZE == 32
437 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
438 do { \
439 if (__builtin_constant_p (bl) && -(USItype)(bl) < (USItype)(bl)) \
440 __asm__ ("subs\t%1, %4, %5\n\tadc\t%0, %2, %3" \
441 : "=r" (sh), "=&r" (sl) \
442 : "r" (ah), "rI" (bh), \
443 "%r" (al), "rI" (-(USItype)(bl)) __CLOBBER_CC); \
444 else \
445 __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3" \
446 : "=r" (sh), "=&r" (sl) \
447 : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC); \
448 } while (0)
449 /* FIXME: Extend the immediate range for the low word by using both ADDS and
450 SUBS, since they set carry in the same way. We need separate definitions
451 for thumb and non-thumb since thumb lacks RSC. */
452 #if defined (__thumb__)
453 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
454 do { \
455 if (__builtin_constant_p (ah) && __builtin_constant_p (bh) \
456 && (ah) == (bh)) \
457 __asm__ ("subs\t%1, %2, %3\n\tsbc\t%0, %0, %0" \
458 : "=r" (sh), "=r" (sl) \
459 : "r" (al), "rI" (bl) __CLOBBER_CC); \
460 else if (__builtin_constant_p (al)) \
461 __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3" \
462 : "=r" (sh), "=&r" (sl) \
463 : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
464 else if (__builtin_constant_p (bl)) \
465 __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
466 : "=r" (sh), "=&r" (sl) \
467 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
468 else \
469 __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
470 : "=r" (sh), "=&r" (sl) \
471 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
472 } while (0)
473 #else
474 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
475 do { \
476 if (__builtin_constant_p (ah) && __builtin_constant_p (bh) \
477 && (ah) == (bh)) \
478 __asm__ ("subs\t%1, %2, %3\n\tsbc\t%0, %0, %0" \
479 : "=r" (sh), "=r" (sl) \
480 : "r" (al), "rI" (bl) __CLOBBER_CC); \
481 else if (__builtin_constant_p (al)) \
482 { \
483 if (__builtin_constant_p (ah)) \
484 __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \
485 : "=r" (sh), "=&r" (sl) \
486 : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
487 else \
488 __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3" \
489 : "=r" (sh), "=&r" (sl) \
490 : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
491 } \
492 else if (__builtin_constant_p (ah)) \
493 { \
494 if (__builtin_constant_p (bl)) \
495 __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2" \
496 : "=r" (sh), "=&r" (sl) \
497 : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
498 else \
499 __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \
500 : "=r" (sh), "=&r" (sl) \
501 : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
502 } \
503 else if (__builtin_constant_p (bl)) \
504 __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
505 : "=r" (sh), "=&r" (sl) \
506 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
507 else \
508 __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
509 : "=r" (sh), "=&r" (sl) \
510 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
511 } while (0)
512 #endif
513 #if defined (__ARM_ARCH_2__) || defined (__ARM_ARCH_2A__) \
514 || defined (__ARM_ARCH_3__)
515 #define umul_ppmm(xh, xl, a, b) \
516 do { \
517 register USItype __t0, __t1, __t2; \
518 __asm__ ("%@ Inlined umul_ppmm\n" \
519 " mov %2, %5, lsr #16\n" \
520 " mov %0, %6, lsr #16\n" \
521 " bic %3, %5, %2, lsl #16\n" \
522 " bic %4, %6, %0, lsl #16\n" \
523 " mul %1, %3, %4\n" \
524 " mul %4, %2, %4\n" \
525 " mul %3, %0, %3\n" \
526 " mul %0, %2, %0\n" \
527 " adds %3, %4, %3\n" \
528 " addcs %0, %0, #65536\n" \
529 " adds %1, %1, %3, lsl #16\n" \
530 " adc %0, %0, %3, lsr #16" \
531 : "=&r" ((USItype) (xh)), "=r" ((USItype) (xl)), \
532 "=&r" (__t0), "=&r" (__t1), "=r" (__t2) \
533 : "r" ((USItype) (a)), "r" ((USItype) (b)) __CLOBBER_CC); \
534 } while (0)
535 #ifndef LONGLONG_STANDALONE
536 #define udiv_qrnnd(q, r, n1, n0, d) \
537 do { UWtype __r; \
538 (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \
539 (r) = __r; \
540 } while (0)
541 extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
542 #endif /* LONGLONG_STANDALONE */
543 #else /* ARMv4 or newer */
544 #define umul_ppmm(xh, xl, a, b) \
545 __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
546 #define smul_ppmm(xh, xl, a, b) \
547 __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
548 #ifndef LONGLONG_STANDALONE
549 #define udiv_qrnnd(q, r, n1, n0, d) \
550 do { UWtype __di; \
551 __di = __MPN(invert_limb) (d); \
552 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
553 } while (0)
554 #define UDIV_PREINV_ALWAYS 1
555 #define UDIV_NEEDS_NORMALIZATION 1
556 #endif /* LONGLONG_STANDALONE */
557 #endif /* defined(__ARM_ARCH_2__) ... */
558 #define count_leading_zeros(count, x) count_leading_zeros_gcc_clz(count, x)
559 #define count_trailing_zeros(count, x) count_trailing_zeros_gcc_ctz(count, x)
560 #endif /* __arm__ */
561
562 #if defined (__aarch64__) && W_TYPE_SIZE == 64
563 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
564 do { \
565 if (__builtin_constant_p (bl) && ~(UDItype)(bl) <= (UDItype)(bl)) \
566 __asm__ ("subs\t%1, %x4, %5\n\tadc\t%0, %x2, %x3" \
567 : "=r" (sh), "=&r" (sl) \
568 : "rZ" ((UDItype)(ah)), "rZ" ((UDItype)(bh)), \
569 "%r" ((UDItype)(al)), "rI" (-(UDItype)(bl)) __CLOBBER_CC);\
570 else \
571 __asm__ ("adds\t%1, %x4, %5\n\tadc\t%0, %x2, %x3" \
572 : "=r" (sh), "=&r" (sl) \
573 : "rZ" ((UDItype)(ah)), "rZ" ((UDItype)(bh)), \
574 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) __CLOBBER_CC);\
575 } while (0)
576 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
577 do { \
578 if (__builtin_constant_p (bl) && ~(UDItype)(bl) <= (UDItype)(bl)) \
579 __asm__ ("adds\t%1, %x4, %5\n\tsbc\t%0, %x2, %x3" \
580 : "=r,r" (sh), "=&r,&r" (sl) \
581 : "rZ,rZ" ((UDItype)(ah)), "rZ,rZ" ((UDItype)(bh)), \
582 "r,Z" ((UDItype)(al)), "rI,r" (-(UDItype)(bl)) __CLOBBER_CC);\
583 else \
584 __asm__ ("subs\t%1, %x4, %5\n\tsbc\t%0, %x2, %x3" \
585 : "=r,r" (sh), "=&r,&r" (sl) \
586 : "rZ,rZ" ((UDItype)(ah)), "rZ,rZ" ((UDItype)(bh)), \
587 "r,Z" ((UDItype)(al)), "rI,r" ((UDItype)(bl)) __CLOBBER_CC);\
588 } while(0);
589 #if __GMP_GNUC_PREREQ (4,9)
590 #define umul_ppmm(w1, w0, u, v) \
591 do { \
592 typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \
593 __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \
594 w1 = __ll >> 64; \
595 w0 = __ll; \
596 } while (0)
597 #endif
598 #if !defined (umul_ppmm)
599 #define umul_ppmm(ph, pl, m0, m1) \
600 do { \
601 UDItype __m0 = (m0), __m1 = (m1); \
602 __asm__ ("umulh\t%0, %1, %2" : "=r" (ph) : "r" (__m0), "r" (__m1)); \
603 (pl) = __m0 * __m1; \
604 } while (0)
605 #endif
606 #define count_leading_zeros(count, x) count_leading_zeros_gcc_clz(count, x)
607 #define count_trailing_zeros(count, x) count_trailing_zeros_gcc_ctz(count, x)
608 #endif /* __aarch64__ */
609
610 #if defined (__clipper__) && W_TYPE_SIZE == 32
611 #define umul_ppmm(w1, w0, u, v) \
612 ({union {UDItype __ll; \
613 struct {USItype __l, __h;} __i; \
614 } __x; \
615 __asm__ ("mulwux %2,%0" \
616 : "=r" (__x.__ll) \
617 : "%0" ((USItype)(u)), "r" ((USItype)(v))); \
618 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
619 #define smul_ppmm(w1, w0, u, v) \
620 ({union {DItype __ll; \
621 struct {SItype __l, __h;} __i; \
622 } __x; \
623 __asm__ ("mulwx %2,%0" \
624 : "=r" (__x.__ll) \
625 : "%0" ((SItype)(u)), "r" ((SItype)(v))); \
626 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
627 #define __umulsidi3(u, v) \
628 ({UDItype __w; \
629 __asm__ ("mulwux %2,%0" \
630 : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v))); \
631 __w; })
632 #endif /* __clipper__ */
633
634 /* Fujitsu vector computers. */
635 #if defined (__uxp__) && W_TYPE_SIZE == 32
636 #define umul_ppmm(ph, pl, u, v) \
637 do { \
638 union {UDItype __ll; \
639 struct {USItype __h, __l;} __i; \
640 } __x; \
641 __asm__ ("mult.lu %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));\
642 (ph) = __x.__i.__h; \
643 (pl) = __x.__i.__l; \
644 } while (0)
645 #define smul_ppmm(ph, pl, u, v) \
646 do { \
647 union {UDItype __ll; \
648 struct {USItype __h, __l;} __i; \
649 } __x; \
650 __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v)); \
651 (ph) = __x.__i.__h; \
652 (pl) = __x.__i.__l; \
653 } while (0)
654 #endif
655
656 #if defined (__gmicro__) && W_TYPE_SIZE == 32
657 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
658 __asm__ ("add.w %5,%1\n\taddx %3,%0" \
659 : "=g" (sh), "=&g" (sl) \
660 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
661 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
662 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
663 __asm__ ("sub.w %5,%1\n\tsubx %3,%0" \
664 : "=g" (sh), "=&g" (sl) \
665 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
666 "1" ((USItype)(al)), "g" ((USItype)(bl)))
667 #define umul_ppmm(ph, pl, m0, m1) \
668 __asm__ ("mulx %3,%0,%1" \
669 : "=g" (ph), "=r" (pl) \
670 : "%0" ((USItype)(m0)), "g" ((USItype)(m1)))
671 #define udiv_qrnnd(q, r, nh, nl, d) \
672 __asm__ ("divx %4,%0,%1" \
673 : "=g" (q), "=r" (r) \
674 : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d)))
675 #define count_leading_zeros(count, x) \
676 __asm__ ("bsch/1 %1,%0" \
677 : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0))
678 #endif
679
680 #if defined (__hppa) && W_TYPE_SIZE == 32
681 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
682 __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0" \
683 : "=r" (sh), "=&r" (sl) \
684 : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
685 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
686 __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0" \
687 : "=r" (sh), "=&r" (sl) \
688 : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
689 #if defined (_PA_RISC1_1)
690 #define umul_ppmm(wh, wl, u, v) \
691 do { \
692 union {UDItype __ll; \
693 struct {USItype __h, __l;} __i; \
694 } __x; \
695 __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v)); \
696 (wh) = __x.__i.__h; \
697 (wl) = __x.__i.__l; \
698 } while (0)
699 #endif
700 #define count_leading_zeros(count, x) \
701 do { \
702 USItype __tmp; \
703 __asm__ ( \
704 "ldi 1,%0\n" \
705 " extru,= %1,15,16,%%r0 ; Bits 31..16 zero?\n" \
706 " extru,tr %1,15,16,%1 ; No. Shift down, skip add.\n" \
707 " ldo 16(%0),%0 ; Yes. Perform add.\n" \
708 " extru,= %1,23,8,%%r0 ; Bits 15..8 zero?\n" \
709 " extru,tr %1,23,8,%1 ; No. Shift down, skip add.\n" \
710 " ldo 8(%0),%0 ; Yes. Perform add.\n" \
711 " extru,= %1,27,4,%%r0 ; Bits 7..4 zero?\n" \
712 " extru,tr %1,27,4,%1 ; No. Shift down, skip add.\n" \
713 " ldo 4(%0),%0 ; Yes. Perform add.\n" \
714 " extru,= %1,29,2,%%r0 ; Bits 3..2 zero?\n" \
715 " extru,tr %1,29,2,%1 ; No. Shift down, skip add.\n" \
716 " ldo 2(%0),%0 ; Yes. Perform add.\n" \
717 " extru %1,30,1,%1 ; Extract bit 1.\n" \
718 " sub %0,%1,%0 ; Subtract it.\n" \
719 : "=r" (count), "=r" (__tmp) : "1" (x)); \
720 } while (0)
721 #endif /* hppa */
722
723 /* These macros are for ABI=2.0w. In ABI=2.0n they can't be used, since GCC
724 (3.2) puts longlong into two adjacent 32-bit registers. Presumably this
725 is just a case of no direct support for 2.0n but treating it like 1.0. */
726 #if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB)
727 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
728 __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0" \
729 : "=r" (sh), "=&r" (sl) \
730 : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
731 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
732 __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0" \
733 : "=r" (sh), "=&r" (sl) \
734 : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
735 #endif /* hppa */
736
737 #if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
738 #if defined (__zarch__) || defined (HAVE_HOST_CPU_s390_zarch)
739 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
740 do { \
741 /* if (__builtin_constant_p (bl)) \
742 __asm__ ("alfi\t%1,%o5\n\talcr\t%0,%3" \
743 : "=r" (sh), "=&r" (sl) \
744 : "0" (ah), "r" (bh), "%1" (al), "n" (bl) __CLOBBER_CC);\
745 else \
746 */ __asm__ ("alr\t%1,%5\n\talcr\t%0,%3" \
747 : "=r" (sh), "=&r" (sl) \
748 : "0" (ah), "r" (bh), "%1" (al), "r" (bl)__CLOBBER_CC); \
749 } while (0)
750 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
751 do { \
752 /* if (__builtin_constant_p (bl)) \
753 __asm__ ("slfi\t%1,%o5\n\tslbr\t%0,%3" \
754 : "=r" (sh), "=&r" (sl) \
755 : "0" (ah), "r" (bh), "1" (al), "n" (bl) __CLOBBER_CC); \
756 else \
757 */ __asm__ ("slr\t%1,%5\n\tslbr\t%0,%3" \
758 : "=r" (sh), "=&r" (sl) \
759 : "0" (ah), "r" (bh), "1" (al), "r" (bl) __CLOBBER_CC); \
760 } while (0)
761 #if __GMP_GNUC_PREREQ (4,5)
762 #define umul_ppmm(xh, xl, m0, m1) \
763 do { \
764 union {UDItype __ll; \
765 struct {USItype __h, __l;} __i; \
766 } __x; \
767 __x.__ll = (UDItype) (m0) * (UDItype) (m1); \
768 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
769 } while (0)
770 #else
771 #if 0
772 /* FIXME: this fails if gcc knows about the 64-bit registers. Use only
773 with a new enough processor pretending we have 32-bit registers. */
774 #define umul_ppmm(xh, xl, m0, m1) \
775 do { \
776 union {UDItype __ll; \
777 struct {USItype __h, __l;} __i; \
778 } __x; \
779 __asm__ ("mlr\t%0,%2" \
780 : "=r" (__x.__ll) \
781 : "%0" (m0), "r" (m1)); \
782 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
783 } while (0)
784 #else
785 #define umul_ppmm(xh, xl, m0, m1) \
786 do { \
787 /* When we have 64-bit regs and gcc is aware of that, we cannot simply use
788 DImode for the product, since that would be allocated to a single 64-bit
789 register, whereas mlr uses the low 32-bits of an even-odd register pair.
790 */ \
791 register USItype __r0 __asm__ ("0"); \
792 register USItype __r1 __asm__ ("1") = (m0); \
793 __asm__ ("mlr\t%0,%3" \
794 : "=r" (__r0), "=r" (__r1) \
795 : "r" (__r1), "r" (m1)); \
796 (xh) = __r0; (xl) = __r1; \
797 } while (0)
798 #endif /* if 0 */
799 #endif
800 #if 0
801 /* FIXME: this fails if gcc knows about the 64-bit registers. Use only
802 with a new enough processor pretending we have 32-bit registers. */
803 #define udiv_qrnnd(q, r, n1, n0, d) \
804 do { \
805 union {UDItype __ll; \
806 struct {USItype __h, __l;} __i; \
807 } __x; \
808 __x.__i.__h = n1; __x.__i.__l = n0; \
809 __asm__ ("dlr\t%0,%2" \
810 : "=r" (__x.__ll) \
811 : "0" (__x.__ll), "r" (d)); \
812 (q) = __x.__i.__l; (r) = __x.__i.__h; \
813 } while (0)
814 #else
815 #define udiv_qrnnd(q, r, n1, n0, d) \
816 do { \
817 register USItype __r0 __asm__ ("0") = (n1); \
818 register USItype __r1 __asm__ ("1") = (n0); \
819 __asm__ ("dlr\t%0,%4" \
820 : "=r" (__r0), "=r" (__r1) \
821 : "r" (__r0), "r" (__r1), "r" (d)); \
822 (q) = __r1; (r) = __r0; \
823 } while (0)
824 #endif /* if 0 */
825 #else /* if __zarch__ */
826 /* FIXME: this fails if gcc knows about the 64-bit registers. */
827 #define smul_ppmm(xh, xl, m0, m1) \
828 do { \
829 union {DItype __ll; \
830 struct {USItype __h, __l;} __i; \
831 } __x; \
832 __asm__ ("mr\t%0,%2" \
833 : "=r" (__x.__ll) \
834 : "%0" (m0), "r" (m1)); \
835 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
836 } while (0)
837 /* FIXME: this fails if gcc knows about the 64-bit registers. */
838 #define sdiv_qrnnd(q, r, n1, n0, d) \
839 do { \
840 union {DItype __ll; \
841 struct {USItype __h, __l;} __i; \
842 } __x; \
843 __x.__i.__h = n1; __x.__i.__l = n0; \
844 __asm__ ("dr\t%0,%2" \
845 : "=r" (__x.__ll) \
846 : "0" (__x.__ll), "r" (d)); \
847 (q) = __x.__i.__l; (r) = __x.__i.__h; \
848 } while (0)
849 #endif /* if __zarch__ */
850 #endif
851
#if defined (__s390x__) && W_TYPE_SIZE == 64
/* We need to cast operands with register constraints, otherwise their types
   will be assumed to be SImode by gcc.  For these machines, such operations
   will insert a value into the low 32 bits, and leave the high 32 bits with
   garbage.  */
/* Two-limb add: ALGR sets the carry flag, ALCGR adds with that carry.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do { \
    __asm__ ("algr\t%1,%5\n\talcgr\t%0,%3" \
             : "=r" (sh), "=&r" (sl) \
             : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
               "%1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \
  } while (0)
/* Two-limb subtract: SLGR sets the borrow, SLBGR subtracts with borrow.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do { \
    __asm__ ("slgr\t%1,%5\n\tslbgr\t%0,%3" \
             : "=r" (sh), "=&r" (sl) \
             : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
               "1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \
  } while (0)
/* 64x64->128 unsigned multiply; MLGR puts the product in an even/odd
   register pair, mapped here through a TImode union.  */
#define umul_ppmm(xh, xl, m0, m1) \
  do { \
    union {unsigned int __attribute__ ((mode(TI))) __ll; \
           struct {UDItype __h, __l;} __i; \
          } __x; \
    __asm__ ("mlgr\t%0,%2" \
             : "=r" (__x.__ll) \
             : "%0" ((UDItype)(m0)), "r" ((UDItype)(m1))); \
    (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
  } while (0)
/* 128/64 unsigned divide via DLGR: quotient lands in the low half of the
   pair, remainder in the high half.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { \
    union {unsigned int __attribute__ ((mode(TI))) __ll; \
           struct {UDItype __h, __l;} __i; \
          } __x; \
    __x.__i.__h = n1; __x.__i.__l = n0; \
    __asm__ ("dlgr\t%0,%2" \
             : "=r" (__x.__ll) \
             : "0" (__x.__ll), "r" ((UDItype)(d))); \
    (q) = __x.__i.__l; (r) = __x.__i.__h; \
  } while (0)
#if 0 /* FIXME: Enable for z10 (?) */
/* FLOGR (z9-109 and later) finds the leftmost one bit; result comes back
   in the high register of the pair.  */
#define count_leading_zeros(cnt, x) \
  do { \
    union {unsigned int __attribute__ ((mode(TI))) __ll; \
           struct {UDItype __h, __l;} __i; \
          } __clr_cnt; \
    __asm__ ("flogr\t%0,%1" \
             : "=r" (__clr_cnt.__ll) \
             : "r" (x) __CLOBBER_CC); \
    (cnt) = __clr_cnt.__i.__h; \
  } while (0)
#endif
#endif
905
/* On x86 and x86_64, every asm implicitly clobbers "flags" and "fpsr",
   so we don't need __CLOBBER_CC.  */
#if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
/* Two-limb add/sub using the carry flag (addl/adcl, subl/sbbl).  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addl %5,%k1\n\tadcl %3,%k0" \
           : "=r" (sh), "=&r" (sl) \
           : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
             "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subl %5,%k1\n\tsbbl %3,%k0" \
           : "=r" (sh), "=&r" (sl) \
           : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
             "1" ((USItype)(al)), "g" ((USItype)(bl)))
/* mull: 32x32->64 product in edx:eax.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mull %3" \
           : "=a" (w0), "=d" (w1) \
           : "%0" ((USItype)(u)), "rm" ((USItype)(v)))
/* divl: edx:eax / dx -> quotient in eax, remainder in edx.
   NOTE(review): presumably requires n1 < dx, or divl faults -- as with
   the other udiv_qrnnd variants, callers are expected to guarantee it.  */
#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
  __asm__ ("divl %4" /* stringification in K&R C */ \
           : "=a" (q), "=d" (r) \
           : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx)))
927
#if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx
/* Pentium bsrl takes between 10 and 72 cycles depending where the most
   significant 1 bit is, hence the use of the following alternatives.  bsfl
   is slow too, between 18 and 42 depending where the least significant 1
   bit is, so let the generic count_trailing_zeros below make use of the
   count_leading_zeros here too.  */

#if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE)
/* The following should be a fixed 14 or 15 cycles, but possibly plus an L1
   cache miss reading from __clz_tab.  For P55 it's favoured over the float
   below so as to avoid mixing MMX and x87, since the penalty for switching
   between the two is about 100 cycles.

   The asm block sets __shift to -3 if the high 24 bits are clear, -2 for
   16, -1 for 8, or 0 otherwise.  This could be written equivalently as
   follows, but as of gcc 2.95.2 it results in conditional jumps.

       __shift = -(__n < 0x1000000);
       __shift -= (__n < 0x10000);
       __shift -= (__n < 0x100);

   The middle two sbbl and cmpl's pair, and with luck something gcc
   generates might pair with the first cmpl and the last sbbl.  The "32+1"
   constant could be folded into __clz_tab[], but it doesn't seem worth
   making a different table just for that.  */

/* After the fixup below, __shift is 1, 9, 17 or 25: the bit offset of the
   byte (plus one) that holds the most significant set bit.  */
#define count_leading_zeros(c,n) \
  do { \
    USItype __n = (n); \
    USItype __shift; \
    __asm__ ("cmpl $0x1000000, %1\n" \
             "sbbl %0, %0\n" \
             "cmpl $0x10000, %1\n" \
             "sbbl $0, %0\n" \
             "cmpl $0x100, %1\n" \
             "sbbl $0, %0\n" \
             : "=&r" (__shift) : "r" (__n)); \
    __shift = __shift*8 + 24 + 1; \
    (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift]; \
  } while (0)
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#define COUNT_LEADING_ZEROS_0 31 /* n==0 indistinguishable from n==1 */
970
#else /* ! pentiummmx || LONGLONG_STANDALONE */
/* The following should be a fixed 14 cycles or so.  Some scheduling
   opportunities should be available between the float load/store too.  This
   sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is
   apparently suggested by the Intel optimizing manual (don't know exactly
   where).  gcc 2.95 or up will be best for this, so the "double" is
   correctly aligned on the stack.  */
/* Converts n to double and reads the biased IEEE exponent out of the high
   word (__u.a[1] on little-endian x86); 0x3FF is the exponent bias.  */
#define count_leading_zeros(c,n) \
  do { \
    union { \
      double    d; \
      unsigned  a[2]; \
    } __u; \
    __u.d = (UWtype) (n); \
    (c) = 0x3FF + 31 - (__u.a[1] >> 20); \
  } while (0)
#define COUNT_LEADING_ZEROS_0   (0x3FF + 31)
#endif /* pentiummx */
#else /* ! pentium */

#if __GMP_GNUC_PREREQ (3,4)  /* using bsrl */
#define count_leading_zeros(count,x)  count_leading_zeros_gcc_clz(count,x)
#endif /* gcc clz */

/* On P6, gcc prior to 3.0 generates a partial register stall for
   __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former
   being 1 code byte smaller.  "31-__cbtmp" is a workaround, probably at the
   cost of one extra instruction.  Do this for "i386" too, since that means
   generic x86.  */
#if ! defined (count_leading_zeros) && __GNUC__ < 3 \
  && (HAVE_HOST_CPU_i386 \
      || HAVE_HOST_CPU_i686 \
      || HAVE_HOST_CPU_pentiumpro \
      || HAVE_HOST_CPU_pentium2 \
      || HAVE_HOST_CPU_pentium3)
/* bsrl gives the index of the highest set bit; 31 - that index is the
   leading-zero count.  Undefined for x == 0, hence the ASSERT.  */
#define count_leading_zeros(count, x) \
  do { \
    USItype __cbtmp; \
    ASSERT ((x) != 0); \
    __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \
    (count) = 31 - __cbtmp; \
  } while (0)
#endif /* gcc<3 asm bsrl */

#ifndef count_leading_zeros
/* Same bsrl idea, but using xor (__cbtmp ^ 31 == 31 - __cbtmp for
   0 <= __cbtmp <= 31).  */
#define count_leading_zeros(count, x) \
  do { \
    USItype __cbtmp; \
    ASSERT ((x) != 0); \
    __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \
    (count) = __cbtmp ^ 31; \
  } while (0)
#endif /* asm bsrl */

#if __GMP_GNUC_PREREQ (3,4)  /* using bsfl */
#define count_trailing_zeros(count,x)  count_trailing_zeros_gcc_ctz(count,x)
#endif /* gcc ctz */

#ifndef count_trailing_zeros
/* bsfl: index of the lowest set bit, which IS the trailing-zero count.
   Undefined for x == 0, hence the ASSERT.  */
#define count_trailing_zeros(count, x) \
  do { \
    ASSERT ((x) != 0); \
    __asm__ ("bsfl %1,%k0" : "=r" (count) : "rm" ((USItype)(x))); \
  } while (0)
#endif /* asm bsfl */

#endif /* ! pentium */

#endif /* 80x86 */
1041
#if defined (__amd64__) && W_TYPE_SIZE == 64
/* Two-limb add/sub through the carry flag (addq/adcq, subq/sbbq).  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addq %5,%q1\n\tadcq %3,%q0" \
           : "=r" (sh), "=&r" (sl) \
           : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \
             "%1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subq %5,%q1\n\tsbbq %3,%q0" \
           : "=r" (sh), "=&r" (sl) \
           : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \
             "1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
#if X86_ASM_MULX \
   && (HAVE_HOST_CPU_haswell || HAVE_HOST_CPU_broadwell \
       || HAVE_HOST_CPU_skylake || HAVE_HOST_CPU_bd4 || HAVE_HOST_CPU_zen)
/* mulx (BMI2): flag-less 64x64->128 multiply; one input is fixed in rdx
   ("d" constraint), outputs may be any registers.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mulx\t%3, %q0, %q1" \
           : "=r" (w0), "=r" (w1) \
           : "%d" ((UDItype)(u)), "rm" ((UDItype)(v)))
#else
/* mulq: 64x64->128 product in rdx:rax.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mulq\t%3" \
           : "=a" (w0), "=d" (w1) \
           : "%0" ((UDItype)(u)), "rm" ((UDItype)(v)))
#endif
/* divq: rdx:rax / dx -> quotient rax, remainder rdx.  NOTE(review):
   presumably requires n1 < dx or divq faults -- callers must guarantee.  */
#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
  __asm__ ("divq %4" /* stringification in K&R C */ \
           : "=a" (q), "=d" (r) \
           : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx)))

#if HAVE_HOST_CPU_haswell || HAVE_HOST_CPU_broadwell || HAVE_HOST_CPU_skylake \
  || HAVE_HOST_CPU_k10 || HAVE_HOST_CPU_bd1 || HAVE_HOST_CPU_bd2 \
  || HAVE_HOST_CPU_bd3 || HAVE_HOST_CPU_bd4 || HAVE_HOST_CPU_zen \
  || HAVE_HOST_CPU_bobcat || HAVE_HOST_CPU_jaguar
#define count_leading_zeros(count, x) \
  do { \
    /* This is lzcnt, spelled for older assemblers.  Destination and */ \
    /* source must be a 64-bit registers, hence cast and %q.         */ \
    __asm__ ("rep;bsr\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x))); \
  } while (0)
#define COUNT_LEADING_ZEROS_0 64
#else
/* Plain bsr; undefined for x == 0, hence the ASSERT.  */
#define count_leading_zeros(count, x) \
  do { \
    UDItype __cbtmp; \
    ASSERT ((x) != 0); \
    __asm__ ("bsr\t%1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x))); \
    (count) = __cbtmp ^ 63; \
  } while (0)
#endif

#if HAVE_HOST_CPU_bd2 || HAVE_HOST_CPU_bd3 || HAVE_HOST_CPU_bd4 \
  || HAVE_HOST_CPU_zen || HAVE_HOST_CPU_jaguar
#define count_trailing_zeros(count, x) \
  do { \
    /* This is tzcnt, spelled for older assemblers.  Destination and */ \
    /* source must be a 64-bit registers, hence cast and %q.         */ \
    __asm__ ("rep;bsf\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x))); \
  } while (0)
#define COUNT_TRAILING_ZEROS_0 64
#else
/* Plain bsf; undefined for x == 0, hence the ASSERT.  */
#define count_trailing_zeros(count, x) \
  do { \
    ASSERT ((x) != 0); \
    __asm__ ("bsf\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x))); \
  } while (0)
#endif
#endif /* __amd64__ */
1109
#if defined (__i860__) && W_TYPE_SIZE == 32
/* r = low word of the double-word (h:l) shifted right by c.
   Fix: the original was missing the ":" separating the asm template from
   the output constraint, making the macro a syntax error at any use.  */
#define rshift_rhlc(r,h,l,c) \
  __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0" \
           : "=r" (r) : "r" (h), "r" (l), "rn" (c))
#endif /* i860 */
1115
#if defined (__i960__) && W_TYPE_SIZE == 32
/* The leading cmpo forces the condition code to a known state so addc/subc
   start with carry clear (resp. borrow set for subtraction).  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0" \
           : "=r" (sh), "=&r" (sl) \
           : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0" \
           : "=r" (sh), "=&r" (sl) \
           : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl))
/* emul: 32x32->64 product into a register pair, viewed through a union
   (note __l is the first member, i.e. the low word comes first here).  */
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll; \
           struct {USItype __l, __h;} __i; \
          } __x; \
  __asm__ ("emul %2,%1,%0" \
           : "=d" (__x.__ll) : "%dI" (u), "dI" (v)); \
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
/* Full 64-bit product as a single UDItype value.  */
#define __umulsidi3(u, v) \
  ({UDItype __w; \
    __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v)); \
    __w; })
/* 64/32 unsigned divide via ediv, which leaves remainder and quotient in a
   register pair (remainder in the lower register, quotient in the upper).
   Fixes over the original: the result union __rq was used but never
   declared, and the template operand references "%d,%n" were not valid
   asm operand numbers -- replaced by %2 (divisor d) and %1 (dividend
   __nn), matching the intent of the original mnemonic names.  */
#define udiv_qrnnd(q, r, nh, nl, d) \
  do { \
    union {UDItype __ll; \
           struct {USItype __l, __h;} __i; \
          } __nn, __rq; \
    __nn.__i.__h = (nh); __nn.__i.__l = (nl); \
    __asm__ ("ediv %2,%1,%0" \
             : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d)); \
    (r) = __rq.__i.__l; (q) = __rq.__i.__h; \
  } while (0)
/* scanbit returns the index of the most significant set bit; xor with 31
   converts that to a leading-zero count.  */
#define count_leading_zeros(count, x) \
  do { \
    USItype __cbtmp; \
    __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x)); \
    (count) = __cbtmp ^ 31; \
  } while (0)
#define COUNT_LEADING_ZEROS_0 (-32) /* sic */
1153 #if defined (__i960mx) /* what is the proper symbol to test??? */
1154 #define rshift_rhlc(r,h,l,c) \
1155 do { \
1156 union {UDItype __ll; \
1157 struct {USItype __l, __h;} __i; \
1158 } __nn; \
1159 __nn.__i.__h = (h); __nn.__i.__l = (l); \
1160 __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c)); \
1161 }
1162 #endif /* i960mx */
1163 #endif /* i960 */
1164
#if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \
     || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \
     || defined (__mc5307__)) && W_TYPE_SIZE == 32
/* Two-limb add/sub via the extend bit (add/addx, sub/subx); "%.l" lets
   gcc emit the size suffix in the configured syntax.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0" \
           : "=d" (sh), "=&d" (sl) \
           : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \
             "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0" \
           : "=d" (sh), "=&d" (sl) \
           : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \
             "1" ((USItype)(al)), "g" ((USItype)(bl)))
/* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r.  */
#if defined (__mc68020__) || defined(mc68020) \
     || defined (__mc68030__) || defined (mc68030) \
     || defined (__mc68040__) || defined (mc68040) \
     || defined (__mcpu32__) || defined (mcpu32) \
     || defined (__NeXT__)
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mulu%.l %3,%1:%0" \
           : "=d" (w0), "=d" (w1) \
           : "%0" ((USItype)(u)), "dmi" ((USItype)(v)))
#define udiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("divu%.l %4,%1:%0" \
           : "=d" (q), "=d" (r) \
           : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
#define sdiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("divs%.l %4,%1:%0" \
           : "=d" (q), "=d" (r) \
           : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
#else /* for other 68k family members use 16x16->32 multiplication */
/* Synthesize the 32x32->64 product from four mulu.w 16x16->32 partial
   products, handling the cross-term carry with the jcc/add 0x10000 pair.  */
#define umul_ppmm(xh, xl, a, b) \
  do { USItype __umul_tmp1, __umul_tmp2; \
       __asm__ ("| Inlined umul_ppmm\n" \
"        move%.l %5,%3\n" \
"        move%.l %2,%0\n" \
"        move%.w %3,%1\n" \
"        swap    %3\n" \
"        swap    %0\n" \
"        mulu%.w %2,%1\n" \
"        mulu%.w %3,%0\n" \
"        mulu%.w %2,%3\n" \
"        swap    %2\n" \
"        mulu%.w %5,%2\n" \
"        add%.l  %3,%2\n" \
"        jcc     1f\n" \
"        add%.l  %#0x10000,%0\n" \
"1:      move%.l %2,%3\n" \
"        clr%.w  %2\n" \
"        swap    %2\n" \
"        swap    %3\n" \
"        clr%.w  %3\n" \
"        add%.l  %3,%1\n" \
"        addx%.l %2,%0\n" \
"        | End inlined umul_ppmm" \
           : "=&d" (xh), "=&d" (xl), \
             "=d" (__umul_tmp1), "=&d" (__umul_tmp2) \
           : "%2" ((USItype)(a)), "d" ((USItype)(b))); \
  } while (0)
#endif /* not mc68020 */
/* The '020, '030, '040 and '060 have bitfield insns.
   GCC 3.4 defines __mc68020__ when in CPU32 mode, check for __mcpu32__ to
   exclude bfffo on that chip (bitfield insns not available).  */
#if (defined (__mc68020__) || defined (mc68020) \
     || defined (__mc68030__) || defined (mc68030) \
     || defined (__mc68040__) || defined (mc68040) \
     || defined (__mc68060__) || defined (mc68060) \
     || defined (__NeXT__)) \
  && ! defined (__mcpu32__)
/* bfffo finds the first one bit scanning from the msb, which is exactly
   the leading-zero count.  */
#define count_leading_zeros(count, x) \
  __asm__ ("bfffo %1{%b2:%b2},%0" \
           : "=d" (count) \
           : "od" ((USItype) (x)), "n" (0))
#define COUNT_LEADING_ZEROS_0 32
#endif
#endif /* mc68000 */
1242
#if defined (__m88000__) && W_TYPE_SIZE == 32
/* Two-limb add/sub with carry-out/carry-in forms (addu.co/addu.ci); the
   "rJ" constraint also accepts the constant 0 via register r0.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3" \
           : "=r" (sh), "=&r" (sl) \
           : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3" \
           : "=r" (sh), "=&r" (sl) \
           : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl))
/* ff1 gives the index of the highest set bit; xor 31 converts to clz.  */
#define count_leading_zeros(count, x) \
  do { \
    USItype __cbtmp; \
    __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x)); \
    (count) = __cbtmp ^ 31; \
  } while (0)
#define COUNT_LEADING_ZEROS_0 63 /* sic */
#if defined (__m88110__)
/* mulu.d: 32x32->64 product into a register pair.  */
#define umul_ppmm(wh, wl, u, v) \
  do { \
    union {UDItype __ll; \
           struct {USItype __h, __l;} __i; \
          } __x; \
    __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v)); \
    (wh) = __x.__i.__h; \
    (wl) = __x.__i.__l; \
  } while (0)
1269 #define udiv_qrnnd(q, r, n1, n0, d) \
1270 ({union {UDItype __ll; \
1271 struct {USItype __h, __l;} __i; \
1272 } __x, __q; \
1273 __x.__i.__h = (n1); __x.__i.__l = (n0); \
1274 __asm__ ("divu.d %0,%1,%2" \
1275 : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d)); \
1276 (r) = (n0) - __q.__l * (d); (q) = __q.__l; })
1277 #endif /* __m88110__ */
1278 #endif /* __m88000__ */
1279
1280 #if defined (__mips) && W_TYPE_SIZE == 32
1281 #if __GMP_GNUC_PREREQ (4,4) || defined(__clang__)
/* 32x32->64 multiply in plain C, letting the compiler pick the insn.
   w1 receives the high 32 bits of the product, w0 the low 32 bits.
   Fix: parenthesize the w1/w0 macro parameters on the left-hand sides,
   matching the convention used by every other macro in this file.  */
#define umul_ppmm(w1, w0, u, v) \
  do { \
    UDItype __ll = (UDItype)(u) * (v); \
    (w1) = __ll >> 32; \
    (w0) = __ll; \
  } while (0)
#endif
/* Older gcc (>= 2.7, < 4.4): read HI/LO directly via the "h"/"l"
   constraints; clang does not support those constraints.  */
#if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__)
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
#endif
/* Last resort: explicit mflo/mfhi after multu.  */
#if !defined (umul_ppmm)
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1" \
           : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
#endif
#endif /* __mips */
1299
#if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
#if defined (_MIPS_ARCH_MIPS64R6)
/* MIPS64r6 removed HI/LO; use dmuhu for the high part and a plain C
   multiply for the low part.  */
#define umul_ppmm(w1, w0, u, v) \
  do { \
    UDItype __m0 = (u), __m1 = (v); \
    (w0) = __m0 * __m1; \
    __asm__ ("dmuhu\t%0, %1, %2" : "=d" (w1) : "d" (__m0), "d" (__m1)); \
  } while (0)
#endif
/* Newer gcc/clang: plain C through a 128-bit (TImode) product.  */
#if !defined (umul_ppmm) && (__GMP_GNUC_PREREQ (4,4) || defined(__clang__))
#define umul_ppmm(w1, w0, u, v) \
  do { \
    typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \
    __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \
    w1 = __ll >> 64; \
    w0 = __ll; \
  } while (0)
#endif
/* Older gcc: read HI/LO via the "h"/"l" constraints.  */
#if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__)
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("dmultu %2,%3" \
           : "=l" (w0), "=h" (w1) \
           : "d" ((UDItype)(u)), "d" ((UDItype)(v)))
#endif
/* Last resort: explicit mflo/mfhi after dmultu.  */
#if !defined (umul_ppmm)
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1" \
           : "=d" (w0), "=d" (w1) \
           : "d" ((UDItype)(u)), "d" ((UDItype)(v)))
#endif
#endif /* __mips */
1331
#if defined (__mmix__) && W_TYPE_SIZE == 64
/* MULU leaves the high 64 bits of the product in rH ("z" constraint).  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("MULU %0,%2,%3" : "=r" (w0), "=z" (w1) : "r" (u), "r" (v))
#endif
1336
#if defined (__ns32000__) && W_TYPE_SIZE == 32
/* meid: 32x32->64 extended multiply into a register pair (low word first
   in the union layout).  */
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll; \
           struct {USItype __l, __h;} __i; \
          } __x; \
  __asm__ ("meid %2,%0" \
           : "=g" (__x.__ll) \
           : "%0" ((USItype)(u)), "g" ((USItype)(v))); \
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#define __umulsidi3(u, v) \
  ({UDItype __w; \
    __asm__ ("meid %2,%0" \
             : "=g" (__w) \
             : "%0" ((USItype)(u)), "g" ((USItype)(v))); \
    __w; })
/* deid: 64/32 extended divide; remainder in the low word, quotient in
   the high word of the result pair.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  ({union {UDItype __ll; \
           struct {USItype __l, __h;} __i; \
          } __x; \
  __x.__i.__h = (n1); __x.__i.__l = (n0); \
  __asm__ ("deid %2,%0" \
           : "=g" (__x.__ll) \
           : "0" (__x.__ll), "g" ((USItype)(d))); \
  (r) = __x.__i.__l; (q) = __x.__i.__h; })
/* ffsd: find first set bit, starting the scan after the initial 0.  */
#define count_trailing_zeros(count,x) \
  do { \
    __asm__ ("ffsd %2,%0" \
             : "=r" (count) \
             : "0" ((USItype) 0), "r" ((USItype) (x))); \
  } while (0)
#endif /* __ns32000__ */
1368
1369 /* In the past we had a block of various #defines tested
1370 _ARCH_PPC - AIX
1371 _ARCH_PWR - AIX
1372 __powerpc__ - gcc
1373 __POWERPC__ - BEOS
1374 __ppc__ - Darwin
1375 PPC - old gcc, GNU/Linux, SysV
1376 The plain PPC test was not good for vxWorks, since PPC is defined on all
1377 CPUs there (eg. m68k too), as a constant one is expected to compare
1378 CPU_FAMILY against.
1379
1380 At any rate, this was pretty unattractive and a bit fragile. The use of
1381 HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of
1382 getting the desired effect.
1383
1384 ENHANCE-ME: We should test _IBMR2 here when we add assembly support for
1385 the system vendor compilers. (Is that vendor compilers with inline asm,
1386 or what?) */
1387
#if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc) \
  && W_TYPE_SIZE == 32
/* Two-limb add: when bh is a compile-time 0 or ~0 the second instruction
   degenerates to addze/addme and bh needs no register; otherwise the full
   addc/adde pair is used.  "%I4c" lets gcc pick addic for immediate bl.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do { \
    if (__builtin_constant_p (bh) && (bh) == 0) \
      __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2" \
               : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl) \
               __CLOBBER_CC); \
    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \
      __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2" \
               : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl) \
               __CLOBBER_CC); \
    else \
      __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3" \
               : "=r" (sh), "=&r" (sl) \
               : "r" (ah), "r" (bh), "%r" (al), "rI" (bl) \
               __CLOBBER_CC); \
  } while (0)
/* Two-limb subtract, with the analogous special cases for constant ah
   and bh equal to 0 or ~0 (subfze/subfme/addme/addze variants).  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do { \
    if (__builtin_constant_p (ah) && (ah) == 0) \
      __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2" \
               : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl) \
               __CLOBBER_CC); \
    else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0) \
      __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2" \
               : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl) \
               __CLOBBER_CC); \
    else if (__builtin_constant_p (bh) && (bh) == 0) \
      __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2" \
               : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl) \
               __CLOBBER_CC); \
    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \
      __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2" \
               : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl) \
               __CLOBBER_CC); \
    else \
      __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2" \
               : "=r" (sh), "=&r" (sl) \
               : "r" (ah), "r" (bh), "rI" (al), "r" (bl) \
               __CLOBBER_CC); \
  } while (0)
#define count_leading_zeros(count, x) \
  __asm__ ("cntlzw %0,%1" : "=r" (count) : "r" (x))
#define COUNT_LEADING_ZEROS_0 32
1433 #if HAVE_HOST_CPU_FAMILY_powerpc
1434 #if __GMP_GNUC_PREREQ (4,4) || defined(__clang__)
/* 32x32->64 multiply in plain C; w1 gets the high 32 bits, w0 the low.
   Fix: parenthesize the w1/w0 macro parameters on the left-hand sides,
   matching the convention used by every other macro in this file.  */
#define umul_ppmm(w1, w0, u, v) \
  do { \
    UDItype __ll = (UDItype)(u) * (v); \
    (w1) = __ll >> 32; \
    (w0) = __ll; \
  } while (0)
1441 #endif
1442 #if !defined (umul_ppmm)
/* 32x32->64 multiply: mulhwu for the high half, plain C for the low half.
   Fix: the original passed (m0)/(m1) to the asm while also copying them
   into __m0/__m1, evaluating each macro argument twice (wrong for
   side-effecting arguments).  Use the locals in the asm, as the 64-bit
   mulhdu variant below already does.  */
#define umul_ppmm(ph, pl, m0, m1) \
  do { \
    USItype __m0 = (m0), __m1 = (m1); \
    __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1)); \
    (pl) = __m0 * __m1; \
  } while (0)
1449 #endif
/* Signed 32x32->64 multiply: mulhw for the high half, plain C for the low
   half.  Fix: same double-evaluation defect as umul_ppmm above -- the asm
   now reads the __m0/__m1 locals so each argument is evaluated once.  */
#define smul_ppmm(ph, pl, m0, m1) \
  do { \
    SItype __m0 = (m0), __m1 = (m1); \
    __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1)); \
    (pl) = __m0 * __m1; \
  } while (0)
#else
/* Original POWER (not PowerPC): mul/div use the MQ register for the low
   product / remainder, selected via the "q" constraint.  */
#define smul_ppmm(xh, xl, m0, m1) \
  __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1))
#define sdiv_qrnnd(q, r, nh, nl, d) \
  __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d))
#endif
#endif /* 32-bit POWER architecture variants.  */
1463
/* We should test _IBMR2 here when we add assembly support for the system
   vendor compilers.  */
#if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64
#if !defined (_LONG_LONG_LIMB)
/* _LONG_LONG_LIMB is ABI=mode32 where adde operates on 32-bit values.  So
   use adde etc only when not _LONG_LONG_LIMB.  */
/* Two-limb add; constant bh of 0 or ~0 collapses the high-word add to
   addze/addme respectively, saving a register for bh.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do { \
    if (__builtin_constant_p (bh) && (bh) == 0) \
      __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2" \
               : "=r" (sh), "=&r" (sl) \
               : "r" ((UDItype)(ah)), \
                 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) \
                 __CLOBBER_CC); \
    else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
      __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2" \
               : "=r" (sh), "=&r" (sl) \
               : "r" ((UDItype)(ah)), \
                 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) \
                 __CLOBBER_CC); \
    else \
      __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3" \
               : "=r" (sh), "=&r" (sl) \
               : "r" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
                 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) \
                 __CLOBBER_CC); \
  } while (0)
/* We use "*rI" for the constant operand here, since with just "I", gcc barfs.
   This might seem strange, but gcc folds away the dead code late.  */
/* Two-limb subtract.  When bl is a small nonzero constant the low-word
   subtract is done as addic with -bl; otherwise subfc is used.  In each
   arm, a constant ah or bh of 0 or ~0 collapses the high-word step to a
   subfze/subfme/addme/addze form.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do { \
    if (__builtin_constant_p (bl) \
        && (bl) > -0x8000 && (bl) <= 0x8000 && (bl) != 0) { \
        if (__builtin_constant_p (ah) && (ah) == 0) \
          __asm__ ("addic %1,%3,%4\n\tsubfze %0,%2" \
                   : "=r" (sh), "=&r" (sl) \
                   : "r" ((UDItype)(bh)), \
                     "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \
                     __CLOBBER_CC); \
        else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \
          __asm__ ("addic %1,%3,%4\n\tsubfme %0,%2" \
                   : "=r" (sh), "=&r" (sl) \
                   : "r" ((UDItype)(bh)), \
                     "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \
                     __CLOBBER_CC); \
        else if (__builtin_constant_p (bh) && (bh) == 0) \
          __asm__ ("addic %1,%3,%4\n\taddme %0,%2" \
                   : "=r" (sh), "=&r" (sl) \
                   : "r" ((UDItype)(ah)), \
                     "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \
                     __CLOBBER_CC); \
        else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
          __asm__ ("addic %1,%3,%4\n\taddze %0,%2" \
                   : "=r" (sh), "=&r" (sl) \
                   : "r" ((UDItype)(ah)), \
                     "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \
                     __CLOBBER_CC); \
        else \
          __asm__ ("addic %1,%4,%5\n\tsubfe %0,%3,%2" \
                   : "=r" (sh), "=&r" (sl) \
                   : "r" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
                     "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \
                     __CLOBBER_CC); \
    } else { \
        if (__builtin_constant_p (ah) && (ah) == 0) \
          __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2" \
                   : "=r" (sh), "=&r" (sl) \
                   : "r" ((UDItype)(bh)), \
                     "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \
                     __CLOBBER_CC); \
        else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \
          __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2" \
                   : "=r" (sh), "=&r" (sl) \
                   : "r" ((UDItype)(bh)), \
                     "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \
                     __CLOBBER_CC); \
        else if (__builtin_constant_p (bh) && (bh) == 0) \
          __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2" \
                   : "=r" (sh), "=&r" (sl) \
                   : "r" ((UDItype)(ah)), \
                     "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \
                     __CLOBBER_CC); \
        else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
          __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2" \
                   : "=r" (sh), "=&r" (sl) \
                   : "r" ((UDItype)(ah)), \
                     "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \
                     __CLOBBER_CC); \
        else \
          __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2" \
                   : "=r" (sh), "=&r" (sl) \
                   : "r" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
                     "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \
                     __CLOBBER_CC); \
    } \
  } while (0)
1561 #define count_leading_zeros(count, x) \
1562 __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
1563 #define COUNT_LEADING_ZEROS_0 64
1564 #if __GMP_GNUC_PREREQ (4,8)
1565 #define umul_ppmm(w1, w0, u, v) \
1566 do { \
1567 typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \
1568 __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \
1569 w1 = __ll >> 64; \
1570 w0 = __ll; \
1571 } while (0)
1572 #endif
1573 #if !defined (umul_ppmm)
1574 #define umul_ppmm(ph, pl, m0, m1) \
1575 do { \
1576 UDItype __m0 = (m0), __m1 = (m1); \
1577 __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1)); \
1578 (pl) = __m0 * __m1; \
1579 } while (0)
1580 #endif
1581 #define smul_ppmm(ph, pl, m0, m1) \
1582 do { \
1583 DItype __m0 = (m0), __m1 = (m1); \
1584 __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1)); \
1585 (pl) = __m0 * __m1; \
1586 } while (0)
1587 #endif /* 64-bit PowerPC. */
1588
#if defined (__pyr__) && W_TYPE_SIZE == 32
/* Pyramid: two-limb add/sub via addw/addwc and subw/subwb.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addw %5,%1\n\taddwc %3,%0" \
           : "=r" (sh), "=&r" (sl) \
           : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
             "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subw %5,%1\n\tsubwb %3,%0" \
           : "=r" (sh), "=&r" (sl) \
           : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
             "1" ((USItype)(al)), "g" ((USItype)(bl)))
/* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP.  */
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll; \
           struct {USItype __h, __l;} __i; \
          } __x; \
  __asm__ ("movw %1,%R0\n\tuemul %2,%0" \
           : "=&r" (__x.__ll) \
           : "g" ((USItype) (u)), "g" ((USItype)(v))); \
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#endif /* __pyr__ */
1610
#if defined (__ibm032__) /* RT/ROMP */ && W_TYPE_SIZE == 32
/* ROMP: two-limb add/sub via a/ae and s/se (with extend).  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("a %1,%5\n\tae %0,%3" \
           : "=r" (sh), "=&r" (sl) \
           : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \
             "%1" ((USItype)(al)), "r" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("s %1,%5\n\tse %0,%3" \
           : "=r" (sh), "=&r" (sl) \
           : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \
             "1" ((USItype)(al)), "r" ((USItype)(bl)))
/* ROMP's "m" multiply-step instruction develops 2 bits of product per
   step, hence the 16 repetitions for a full 32x32->64 signed product;
   r10 (MQ) holds the running low part.  */
#define smul_ppmm(ph, pl, m0, m1) \
  __asm__ ( \
       "s       r2,r2\n" \
"        mts r10,%2\n" \
"        m       r2,%3\n" \
"        m       r2,%3\n" \
"        m       r2,%3\n" \
"        m       r2,%3\n" \
"        m       r2,%3\n" \
"        m       r2,%3\n" \
"        m       r2,%3\n" \
"        m       r2,%3\n" \
"        m       r2,%3\n" \
"        m       r2,%3\n" \
"        m       r2,%3\n" \
"        m       r2,%3\n" \
"        m       r2,%3\n" \
"        m       r2,%3\n" \
"        m       r2,%3\n" \
"        m       r2,%3\n" \
"        cas     %0,r2,r0\n" \
"        mfs     r10,%1" \
           : "=r" (ph), "=r" (pl) \
           : "%r" ((USItype)(m0)), "r" ((USItype)(m1)) \
           : "r2")
/* clz counts within a 16-bit halfword, so pick the halfword first.  */
#define count_leading_zeros(count, x) \
  do { \
    if ((x) >= 0x10000) \
      __asm__ ("clz     %0,%1" \
               : "=r" (count) : "r" ((USItype)(x) >> 16)); \
    else \
      { \
        __asm__ ("clz   %0,%1" \
                 : "=r" (count) : "r" ((USItype)(x))); \
        (count) += 16; \
      } \
  } while (0)
#endif /* RT/ROMP */
1660
#if defined (__riscv64) && W_TYPE_SIZE == 64
/* 64x64->128 multiply: plain C for the low half, mulhu for the high half.
   Fix: RISC-V mulhu writes its FIRST operand (mulhu rd, rs1, rs2); the
   original template "mulhu\t%2, %1, %0" placed the output %0 in the rs2
   slot, so ph was never written and an input register was clobbered.  */
#define umul_ppmm(ph, pl, u, v) \
  do { \
    UDItype __u = (u), __v = (v); \
    (pl) = __u * __v; \
    __asm__ ("mulhu\t%0, %1, %2" : "=r" (ph) : "%r" (__u), "r" (__v)); \
  } while (0)
#endif
1669
#if (defined (__SH2__) || defined (__SH3__) || defined (__SH4__)) && W_TYPE_SIZE == 32
/* dmulu.l leaves the 64-bit product in the MACH:MACL pair, copied out
   with sts; both are declared clobbered.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0" \
           : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")
#endif
1675
#if defined (__sparc__) && W_TYPE_SIZE == 32
/* Two-limb add/sub via the carry flag (addcc/addx, subcc/subx); "%rJ"
   also allows the constant 0 through register %g0.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0" \
           : "=r" (sh), "=&r" (sl) \
           : "rJ" (ah), "rI" (bh),"%rJ" (al), "rI" (bl) \
           __CLOBBER_CC)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0" \
           : "=r" (sh), "=&r" (sl) \
           : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl) \
           __CLOBBER_CC)
/* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h
   doesn't define anything to indicate that to us, it only sets __sparcv8.  */
#if defined (__sparc_v9__) || defined (__sparcv9)
/* Perhaps we should use floating-point operations here?  */
#if 0
/* Triggers a bug making mpz/tests/t-gcd.c fail.
   Perhaps we simply need explicitly zero-extend the inputs?  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" : \
           "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1")
#else
/* Use v8 umul until above bug is fixed.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
#endif
/* Use a plain v8 divide for v9.  */
/* The nops give the Y register write time to complete before udiv reads
   it; the remainder is reconstructed from the quotient in C.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { \
    USItype __q; \
    __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \
             : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \
    (r) = (n0) - __q * (d); \
    (q) = __q; \
  } while (0)
#else
#if defined (__sparc_v8__)   /* gcc normal */ \
  || defined (__sparcv8)     /* gcc solaris */ \
  || HAVE_HOST_CPU_supersparc
/* Don't match immediate range because, 1) it is not often useful,
   2) the 'I' flag thinks of the range as a 13 bit signed interval,
   while we want to match a 13 bit interval, sign extended to 32 bits,
   but INTERPRETED AS UNSIGNED.  */
/* umul leaves the low product in %1 and the high product in Y.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))

#if HAVE_HOST_CPU_supersparc
#else
/* Don't use this on SuperSPARC because its udiv only handles 53 bit
   dividends and will trap to the kernel for the rest.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { \
    USItype __q; \
    __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \
             : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \
    (r) = (n0) - __q * (d); \
    (q) = __q; \
  } while (0)
#endif /* HAVE_HOST_CPU_supersparc */
1736 #else /* ! __sparc_v8__ */
#if defined (__sparclite__)
/* This has hardware multiply but not divide.  It also has two additional
   instructions scan (ffs from high bit) and divscc.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
/* 64/32 division built from 32 divscc (divide step) iterations, one per
   quotient bit.  %y is loaded with the high dividend word n1 first; after
   the steps, the remainder is read from %y and, if the final step left the
   negative flag set, fixed up by adding back the divisor (the annulled
   branch executes the add only when taken).  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("! Inlined udiv_qrnnd\n" \
" wr %%g0,%2,%%y ! Not a delayed write for sparclite\n" \
" tst %%g0\n" \
" divscc %3,%4,%%g1\n" \
" divscc %%g1,%4,%%g1\n" \
" divscc %%g1,%4,%%g1\n" \
" divscc %%g1,%4,%%g1\n" \
" divscc %%g1,%4,%%g1\n" \
" divscc %%g1,%4,%%g1\n" \
" divscc %%g1,%4,%%g1\n" \
" divscc %%g1,%4,%%g1\n" \
" divscc %%g1,%4,%%g1\n" \
" divscc %%g1,%4,%%g1\n" \
" divscc %%g1,%4,%%g1\n" \
" divscc %%g1,%4,%%g1\n" \
" divscc %%g1,%4,%%g1\n" \
" divscc %%g1,%4,%%g1\n" \
" divscc %%g1,%4,%%g1\n" \
" divscc %%g1,%4,%%g1\n" \
" divscc %%g1,%4,%%g1\n" \
" divscc %%g1,%4,%%g1\n" \
" divscc %%g1,%4,%%g1\n" \
" divscc %%g1,%4,%%g1\n" \
" divscc %%g1,%4,%%g1\n" \
" divscc %%g1,%4,%%g1\n" \
" divscc %%g1,%4,%%g1\n" \
" divscc %%g1,%4,%%g1\n" \
" divscc %%g1,%4,%%g1\n" \
" divscc %%g1,%4,%%g1\n" \
" divscc %%g1,%4,%%g1\n" \
" divscc %%g1,%4,%%g1\n" \
" divscc %%g1,%4,%%g1\n" \
" divscc %%g1,%4,%%g1\n" \
" divscc %%g1,%4,%%g1\n" \
" divscc %%g1,%4,%0\n" \
" rd %%y,%1\n" \
" bl,a 1f\n" \
" add %1,%4,%1\n" \
"1: ! End of inline udiv_qrnnd" \
	   : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d) \
	   : "%g1" __AND_CLOBBER_CC)
#define count_leading_zeros(count, x) \
  __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x))
/* Early sparclites return 63 for an argument of 0, but they warn that future
   implementations might change this.  Therefore, leave COUNT_LEADING_ZEROS_0
   undefined. */
#endif /* __sparclite__ */
1790 #endif /* __sparc_v8__ */
1791 #endif /* __sparc_v9__ */
1792 /* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd. */
#ifndef umul_ppmm
/* Default to sparc v7 shift-and-add multiply: 32 mulscc (multiply step)
   iterations accumulate the product, with the low bits collecting in %y.
   mulscc computes a signed product, so the sra/and prologue forms the
   correction term (v < 0 ? u : 0) in %g2, added to the high word at the
   end to give the unsigned result.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("! Inlined umul_ppmm\n" \
" wr %%g0,%2,%%y ! SPARC has 0-3 delay insn after a wr\n" \
" sra %3,31,%%g2 ! Don't move this insn\n" \
" and %2,%%g2,%%g2 ! Don't move this insn\n" \
" andcc %%g0,0,%%g1 ! Don't move this insn\n" \
" mulscc %%g1,%3,%%g1\n" \
" mulscc %%g1,%3,%%g1\n" \
" mulscc %%g1,%3,%%g1\n" \
" mulscc %%g1,%3,%%g1\n" \
" mulscc %%g1,%3,%%g1\n" \
" mulscc %%g1,%3,%%g1\n" \
" mulscc %%g1,%3,%%g1\n" \
" mulscc %%g1,%3,%%g1\n" \
" mulscc %%g1,%3,%%g1\n" \
" mulscc %%g1,%3,%%g1\n" \
" mulscc %%g1,%3,%%g1\n" \
" mulscc %%g1,%3,%%g1\n" \
" mulscc %%g1,%3,%%g1\n" \
" mulscc %%g1,%3,%%g1\n" \
" mulscc %%g1,%3,%%g1\n" \
" mulscc %%g1,%3,%%g1\n" \
" mulscc %%g1,%3,%%g1\n" \
" mulscc %%g1,%3,%%g1\n" \
" mulscc %%g1,%3,%%g1\n" \
" mulscc %%g1,%3,%%g1\n" \
" mulscc %%g1,%3,%%g1\n" \
" mulscc %%g1,%3,%%g1\n" \
" mulscc %%g1,%3,%%g1\n" \
" mulscc %%g1,%3,%%g1\n" \
" mulscc %%g1,%3,%%g1\n" \
" mulscc %%g1,%3,%%g1\n" \
" mulscc %%g1,%3,%%g1\n" \
" mulscc %%g1,%3,%%g1\n" \
" mulscc %%g1,%3,%%g1\n" \
" mulscc %%g1,%3,%%g1\n" \
" mulscc %%g1,%3,%%g1\n" \
" mulscc %%g1,%3,%%g1\n" \
" mulscc %%g1,0,%%g1\n" \
" add %%g1,%%g2,%0\n" \
" rd %%y,%1" \
	   : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v) \
	   : "%g1", "%g2" __AND_CLOBBER_CC)
#endif
#ifndef udiv_qrnnd
#ifndef LONGLONG_STANDALONE
/* No inline divide was defined above: fall back to GMP's out-of-line
   assembly routine __MPN(udiv_qrnnd), which returns the quotient and
   stores the remainder through the pointer.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __r;							\
    (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));			\
    (r) = __r;								\
  } while (0)
extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
#endif /* LONGLONG_STANDALONE */
#endif /* udiv_qrnnd */
1848 #endif /* __sparc__ */
1849
#if defined (__sparc__) && W_TYPE_SIZE == 64
/* 64-bit SPARC.  The base ISA exposes only the 32-bit carry flag, so the
   64-bit carry out of al+bl is synthesized: the middle instruction re-adds
   the upper halves (operands 6/7 are al>>32 and bl>>32) with carry-in,
   discarding the result into %g0 but leaving the correct carry for the
   final high-word add.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ( \
       "addcc %r4,%5,%1\n" \
      " addccc %r6,%7,%%g0\n" \
      " addc %r2,%3,%0" \
	  : "=r" (sh), "=&r" (sl) \
	  : "rJ" ((UDItype)(ah)), "rI" ((UDItype)(bh)), \
	    "%rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)), \
	    "%rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32) \
	   __CLOBBER_CC)
/* Same trick for subtraction, propagating the borrow instead.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ( \
       "subcc %r4,%5,%1\n" \
      " subccc %r6,%7,%%g0\n" \
      " subc %r2,%3,%0" \
	  : "=r" (sh), "=&r" (sl) \
	  : "rJ" ((UDItype)(ah)), "rI" ((UDItype)(bh)), \
	    "rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)), \
	    "rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32) \
	   __CLOBBER_CC)
#if __VIS__ >= 0x300
/* VIS 3.0 adds addxc (add with 64-bit carry), umulxhi (high 64 bits of an
   unsigned 64x64 product) and lzd, so simpler forms are possible.  */
#undef add_ssaaaa
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ( \
       "addcc %r4, %5, %1\n" \
      " addxc %r2, %r3, %0" \
	  : "=r" (sh), "=&r" (sl) \
	  : "rJ" ((UDItype)(ah)), "rJ" ((UDItype)(bh)), \
	    "%rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)) __CLOBBER_CC)
#define umul_ppmm(ph, pl, m0, m1) \
  do { \
    UDItype __m0 = (m0), __m1 = (m1); \
    (pl) = __m0 * __m1; \
    __asm__ ("umulxhi\t%2, %1, %0" \
	     : "=r" (ph) \
	     : "%r" (__m0), "r" (__m1)); \
  } while (0)
#define count_leading_zeros(count, x) \
  __asm__ ("lzd\t%1,%0" : "=r" (count) : "r" (x))
/* Needed by count_leading_zeros_32 in sparc64.h. */
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#endif
#endif
1894
#if (defined (__vax) || defined (__vax__)) && W_TYPE_SIZE == 32
/* VAX: addl2/adwc and subl2/sbwc are two-operand add/subtract with carry
   (resp. borrow) propagation into the high word.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addl2 %5,%1\n\tadwc %3,%0" \
	   : "=g" (sh), "=&g" (sl) \
	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subl2 %5,%1\n\tsbwc %3,%0" \
	   : "=g" (sh), "=&g" (sl) \
	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
/* emul forms the full 64-bit signed product (the $0 literal is the addend);
   the union splits it into 32-bit halves.  */
#define smul_ppmm(xh, xl, m0, m1) \
  do { \
    union {UDItype __ll; \
	   struct {USItype __l, __h;} __i; \
	  } __x; \
    USItype __m0 = (m0), __m1 = (m1); \
    __asm__ ("emul %1,%2,$0,%0" \
	     : "=g" (__x.__ll) : "g" (__m0), "g" (__m1)); \
    (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
  } while (0)
/* ediv: 64-bit signed dividend (assembled in the union) divided by a 32-bit
   divisor, producing quotient and remainder directly.  */
#define sdiv_qrnnd(q, r, n1, n0, d) \
  do { \
    union {DItype __ll; \
	   struct {SItype __l, __h;} __i; \
	  } __x; \
    __x.__i.__h = n1; __x.__i.__l = n0; \
    __asm__ ("ediv %3,%2,%0,%1" \
	     : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d)); \
  } while (0)
#if 0
/* FIXME: This instruction appears to be unimplemented on some systems (vax
   8800 maybe). */
#define count_trailing_zeros(count,x) \
  do { \
    __asm__ ("ffs 0, 31, %1, %0" \
	     : "=g" (count) \
	     : "g" ((USItype) (x))); \
  } while (0)
#endif
#endif /* vax */
1936
#if defined (__z8000__) && W_TYPE_SIZE == 16
/* Z8000: 16-bit word size.  add/adc and sub/sbc propagate carry/borrow
   into the high word; %H selects the 16-bit register form.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add %H1,%H5\n\tadc %H0,%H3" \
	   : "=r" (sh), "=&r" (sl) \
	   : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \
	     "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub %H1,%H5\n\tsbc %H0,%H3" \
	   : "=r" (sh), "=&r" (sl) \
	   : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \
	     "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
/* mult produces a signed 16x16->32 product; the final two lines add the
   correction terms (m0 < 0 ? m1 : 0) + (m1 < 0 ? m0 : 0) to the high word
   to convert it to the unsigned product.
   Bug fix: the asm operands previously re-evaluated the raw macro
   arguments (m0)/(m1) even though they had already been captured in
   __m0/__m1 precisely to evaluate them once — arguments with side effects
   were evaluated twice.  Use the locals in the operand list.  */
#define umul_ppmm(xh, xl, m0, m1) \
  do { \
    union {long int __ll; \
	   struct {unsigned int __h, __l;} __i; \
	  } __x; \
    unsigned int __m0 = (m0), __m1 = (m1); \
    __asm__ ("mult %S0,%H3" \
	     : "=r" (__x.__i.__h), "=r" (__x.__i.__l) \
	     : "%1" (__m0), "rQR" (__m1)); \
    (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
    (xh) += ((((signed int) __m0 >> 15) & __m1) \
	     + (((signed int) __m1 >> 15) & __m0)); \
  } while (0)
#endif /* __z8000__ */
1962
1963 #endif /* __GNUC__ */
1964
1965 #endif /* NO_ASM */
1966
1967
/* FIXME: "sidi" here is highly doubtful, should sometimes be "diti". */
#if !defined (umul_ppmm) && defined (__umulsidi3)
/* If no umul_ppmm was defined above but the compiler supplies a
   double-word multiply builtin, split its result into two words.  */
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    UDWtype __ll = __umulsidi3 (m0, m1);				\
    ph = (UWtype) (__ll >> W_TYPE_SIZE);				\
    pl = (UWtype) __ll;							\
  } while (0)
#endif

#if !defined (__umulsidi3)
/* Conversely, synthesize __umulsidi3 from umul_ppmm using a GNU statement
   expression; yields the full double-word product.  */
#define __umulsidi3(u, v) \
  ({UWtype __hi, __lo;							\
    umul_ppmm (__hi, __lo, u, v);					\
    ((UDWtype) __hi << W_TYPE_SIZE) | __lo; })
#endif
1984
1985
#if defined (__cplusplus)
/* Expands to `extern "C"`-style linkage for the mpn prototypes below when
   this header is compiled as C++; empty for plain C.  */
#define __longlong_h_C "C"
#else
#define __longlong_h_C
#endif
1991
/* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist.  The "_r"
   forms have "reversed" arguments, meaning the pointer is last, which
   sometimes allows better parameter passing, in particular on 64-bit
   hppa. */

/* Out-of-line umul: returns the high word, stores the low word through the
   pointer.  */
#define mpn_umul_ppmm  __MPN(umul_ppmm)
extern __longlong_h_C UWtype mpn_umul_ppmm (UWtype *, UWtype, UWtype);

#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm \
  && ! defined (LONGLONG_STANDALONE)
#define umul_ppmm(wh, wl, u, v) \
  do { \
    UWtype __umul_ppmm__p0; \
    (wh) = mpn_umul_ppmm (&__umul_ppmm__p0, (UWtype) (u), (UWtype) (v)); \
    (wl) = __umul_ppmm__p0; \
  } while (0)
#endif

/* As above but with the result pointer last.  */
#define mpn_umul_ppmm_r  __MPN(umul_ppmm_r)
extern __longlong_h_C UWtype mpn_umul_ppmm_r (UWtype, UWtype, UWtype *);

#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r \
  && ! defined (LONGLONG_STANDALONE)
#define umul_ppmm(wh, wl, u, v) \
  do { \
    UWtype __umul_p0; \
    (wh) = mpn_umul_ppmm_r ((UWtype) (u), (UWtype) (v), &__umul_p0); \
    (wl) = __umul_p0; \
  } while (0)
#endif

/* Out-of-line udiv: returns the quotient, stores the remainder through the
   pointer.  */
#define mpn_udiv_qrnnd  __MPN(udiv_qrnnd)
extern __longlong_h_C UWtype mpn_udiv_qrnnd (UWtype *, UWtype, UWtype, UWtype);

#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd \
  && ! defined (LONGLONG_STANDALONE)
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { \
    UWtype __udiv_qrnnd_r; \
    (q) = mpn_udiv_qrnnd (&__udiv_qrnnd_r, \
			  (UWtype) (n1), (UWtype) (n0), (UWtype) d); \
    (r) = __udiv_qrnnd_r; \
  } while (0)
#endif

/* As above but with the remainder pointer last.  */
#define mpn_udiv_qrnnd_r  __MPN(udiv_qrnnd_r)
extern __longlong_h_C UWtype mpn_udiv_qrnnd_r (UWtype, UWtype, UWtype, UWtype *);

#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r \
  && ! defined (LONGLONG_STANDALONE)
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { \
    UWtype __udiv_qrnnd_r; \
    (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) d, \
			    &__udiv_qrnnd_r); \
    (r) = __udiv_qrnnd_r; \
  } while (0)
#endif
2050
2051
2052 /* If this machine has no inline assembler, use C macros. */
2053
#if !defined (add_ssaaaa)
/* Generic C double-word addition: (sh,sl) = (ah,al) + (bh,bl).
   The carry out of the low-word add is detected by the standard unsigned
   wrap-around test (sum < addend).  al and bl are read into locals first
   so each macro argument is evaluated exactly once before sh is written.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do { \
    UWtype __add_a = (al); \
    UWtype __add_b = (bl); \
    UWtype __add_s = __add_a + __add_b; \
    (sh) = (ah) + (bh) + (__add_s < __add_a); \
    (sl) = __add_s; \
  } while (0)
#endif
2065
#if !defined (sub_ddmmss)
/* Generic C double-word subtraction: (sh,sl) = (ah,al) - (bh,bl).
   A borrow out of the low word occurs exactly when the low minuend is
   smaller than the low subtrahend.  al and bl are captured in locals so
   each argument is evaluated once before sh is written.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do { \
    UWtype __sub_m = (al); \
    UWtype __sub_s = (bl); \
    (sh) = (ah) - (bh) - (__sub_m < __sub_s); \
    (sl) = __sub_m - __sub_s; \
  } while (0)
#endif
2077
/* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
   smul_ppmm. */
#if !defined (umul_ppmm) && defined (smul_ppmm)
/* The unsigned product differs from the signed one by u*2^W when v < 0 and
   by v*2^W when u < 0, affecting only the high word.  The arithmetic shift
   by W_TYPE_SIZE-1 turns the sign bit into an all-ones/all-zeros mask, so
   the two masked additions add exactly those correction terms.  */
#define umul_ppmm(w1, w0, u, v) \
  do { \
    UWtype __w1; \
    UWtype __xm0 = (u), __xm1 = (v); \
    smul_ppmm (__w1, w0, __xm0, __xm1); \
    (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \
		+ (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \
  } while (0)
#endif
2090
/* If we still don't have umul_ppmm, define it using plain C.

   For reference, when this code is used for squaring (ie. u and v identical
   expressions), gcc recognises __x1 and __x2 are the same and generates 3
   multiplies, not 4.  The subsequent additions could be optimized a bit,
   but the only place GMP currently uses such a square is mpn_sqr_basecase,
   and chips obliged to use this generic C umul will have plenty of worse
   performance problems than a couple of extra instructions on the diagonal
   of sqr_basecase. */

#if !defined (umul_ppmm)
/* Schoolbook half-word multiplication: split u and v into halves with
   __ll_lowpart/__ll_highpart, form the four cross products, then combine
   them while tracking the one possible carry out of the middle sum.  */
#define umul_ppmm(w1, w0, u, v) \
  do { \
    UWtype __x0, __x1, __x2, __x3; \
    UHWtype __ul, __vl, __uh, __vh; \
    UWtype __u = (u), __v = (v); \
 \
    __ul = __ll_lowpart (__u); \
    __uh = __ll_highpart (__u); \
    __vl = __ll_lowpart (__v); \
    __vh = __ll_highpart (__v); \
 \
    __x0 = (UWtype) __ul * __vl; \
    __x1 = (UWtype) __ul * __vh; \
    __x2 = (UWtype) __uh * __vl; \
    __x3 = (UWtype) __uh * __vh; \
 \
    __x1 += __ll_highpart (__x0);/* this can't give carry */ \
    __x1 += __x2;		/* but this indeed can */ \
    if (__x1 < __x2)		/* did we get it? */ \
      __x3 += __ll_B;		/* yes, add it in the proper pos. */ \
 \
    (w1) = __x3 + __ll_highpart (__x1); \
    (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0); \
  } while (0)
#endif
2127
/* If we don't have smul_ppmm, define it using umul_ppmm (which surely will
   exist in one form or another). */
#if !defined (smul_ppmm)
/* Inverse of the umul-from-smul conversion above: subtract the same
   sign-mask correction terms from the unsigned high word to get the
   signed product's high word.  */
#define smul_ppmm(w1, w0, u, v) \
  do { \
    UWtype __w1; \
    UWtype __xm0 = (u), __xm1 = (v); \
    umul_ppmm (__w1, w0, __xm0, __xm1); \
    (w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \
		- (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \
  } while (0)
#endif
2140
/* Define this unconditionally, so it can be used for debugging. */
/* Generic 2-word-by-1-word division via two half-word schoolbook steps:
   first compute the high quotient digit __q1 (and remainder), then the low
   digit __q0.  Each step estimates the digit by dividing by the divisor's
   high half __d1, then corrects the estimate downward at most twice when
   the __q*__d0 low-half contribution overshoots.  Requires n1 < d
   (asserted); NOTE(review): when used as udiv_qrnnd it is paired with
   UDIV_NEEDS_NORMALIZATION, so d is presumably normalized by callers.  */
#define __udiv_qrnnd_c(q, r, n1, n0, d) \
  do { \
    UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m; \
 \
    ASSERT ((d) != 0); \
    ASSERT ((n1) < (d)); \
 \
    __d1 = __ll_highpart (d); \
    __d0 = __ll_lowpart (d); \
 \
    __q1 = (n1) / __d1; \
    __r1 = (n1) - __q1 * __d1; \
    __m = __q1 * __d0; \
    __r1 = __r1 * __ll_B | __ll_highpart (n0); \
    if (__r1 < __m) \
      { \
	__q1--, __r1 += (d); \
	if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
	  if (__r1 < __m) \
	    __q1--, __r1 += (d); \
      } \
    __r1 -= __m; \
 \
    __q0 = __r1 / __d1; \
    __r0 = __r1 - __q0 * __d1; \
    __m = __q0 * __d0; \
    __r0 = __r0 * __ll_B | __ll_lowpart (n0); \
    if (__r0 < __m) \
      { \
	__q0--, __r0 += (d); \
	if (__r0 >= (d)) \
	  if (__r0 < __m) \
	    __q0--, __r0 += (d); \
      } \
    __r0 -= __m; \
 \
    (q) = __q1 * __ll_B | __q0; \
    (r) = __r0; \
  } while (0)
2181
/* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through
   __udiv_w_sdiv (defined in libgcc or elsewhere). */
#if !defined (udiv_qrnnd) && defined (sdiv_qrnnd) \
  && ! defined (LONGLONG_STANDALONE)
/* __MPN(udiv_w_sdiv) returns the quotient and stores the remainder through
   the pointer, using the signed divide internally.  */
#define udiv_qrnnd(q, r, nh, nl, d) \
  do { \
    UWtype __r; \
    (q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d); \
    (r) = __r; \
  } while (0)
__GMP_DECLSPEC UWtype __MPN(udiv_w_sdiv) (UWtype *, UWtype, UWtype, UWtype);
#endif

/* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c. */
/* The generic code presumably requires a normalized (high-bit-set) divisor,
   hence UDIV_NEEDS_NORMALIZATION is forced on here.  */
#if !defined (udiv_qrnnd)
#define UDIV_NEEDS_NORMALIZATION 1
#define udiv_qrnnd __udiv_qrnnd_c
#endif
2200
#if !defined (count_leading_zeros)
/* Generic count_leading_zeros: locate the most significant nonzero byte
   group, then finish with the __clz_tab lookup table.  For 32-bit words a
   branchless-ish nested conditional narrows to a quarter-word (__BITS4 is
   presumably W_TYPE_SIZE/4 — defined elsewhere); otherwise a byte-stepping
   loop is used.  */
#define count_leading_zeros(count, x) \
  do { \
    UWtype __xr = (x); \
    UWtype __a; \
 \
    if (W_TYPE_SIZE == 32) \
      { \
	__a = __xr < ((UWtype) 1 << 2*__BITS4) \
	  ? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1) \
	  : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 + 1 \
	  : 3*__BITS4 + 1); \
      } \
    else \
      { \
	for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8) \
	  if (((__xr >> __a) & 0xff) != 0) \
	    break; \
	++__a; \
      } \
 \
    (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a]; \
  } while (0)
/* This version gives a well-defined value for zero. */
#define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1)
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#define COUNT_LEADING_ZEROS_SLOW
#endif

/* clz_tab needed by mpn/x86/pentium/mod_1.asm in a fat binary */
#if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#endif

#ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
extern const unsigned char __GMP_DECLSPEC __clz_tab[129];
#endif
2238
#if !defined (count_trailing_zeros)
#if !defined (COUNT_LEADING_ZEROS_SLOW)
/* Define count_trailing_zeros using an asm count_leading_zeros. */
/* x & -x isolates the lowest set bit; its leading-zero count then gives
   the trailing-zero count of x.  Undefined for x == 0 (asserted).  */
#define count_trailing_zeros(count, x) \
  do { \
    UWtype __ctz_x = (x); \
    UWtype __ctz_c; \
    ASSERT (__ctz_x != 0); \
    count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x); \
    (count) = W_TYPE_SIZE - 1 - __ctz_c; \
  } while (0)
#else
/* Define count_trailing_zeros in plain C, assuming small counts are common.
   We use clz_tab without ado, since the C count_leading_zeros above will have
   pulled it in. */
/* Fast path: low byte nonzero, answer comes straight from the table on the
   isolated low bit.  Otherwise step up a byte at a time.  */
#define count_trailing_zeros(count, x) \
  do { \
    UWtype __ctz_x = (x); \
    int __ctz_c; \
 \
    if (LIKELY ((__ctz_x & 0xff) != 0)) \
      (count) = __clz_tab[__ctz_x & -__ctz_x] - 2; \
    else \
      { \
	for (__ctz_c = 8 - 2; __ctz_c < W_TYPE_SIZE - 2; __ctz_c += 8) \
	  { \
	    __ctz_x >>= 8; \
	    if (LIKELY ((__ctz_x & 0xff) != 0)) \
	      break; \
	  } \
 \
	(count) = __ctz_c + __clz_tab[__ctz_x & -__ctz_x]; \
      } \
  } while (0)
#endif
#endif
2275
/* Default: the selected udiv_qrnnd accepts unnormalized divisors unless a
   definition above said otherwise.  */
#ifndef UDIV_NEEDS_NORMALIZATION
#define UDIV_NEEDS_NORMALIZATION 0
#endif

/* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, and
   that hence the latter should always be used. */
#ifndef UDIV_PREINV_ALWAYS
#define UDIV_PREINV_ALWAYS 0
#endif
2285