/* longlong.h -- definitions for mixed size 32/64 bit arithmetic.
2
3 Copyright 1991, 1992, 1993, 1994, 1996, 1997, 1999, 2000, 2001, 2002, 2003,
4 2004, 2005, 2007, 2008, 2009 Free Software Foundation, Inc.
5
6 This file is free software; you can redistribute it and/or modify it under the
7 terms of the GNU Lesser General Public License as published by the Free
8 Software Foundation; either version 3 of the License, or (at your option) any
9 later version.
10
11 This file is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
13 PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
14 details.
15
16 You should have received a copy of the GNU Lesser General Public License
17 along with this file. If not, see http://www.gnu.org/licenses/. */
18
19 /* You have to define the following before including this file:
20
21 UWtype -- An unsigned type, default type for operations (typically a "word")
22 UHWtype -- An unsigned type, at least half the size of UWtype
UDWtype -- An unsigned type, at least twice as large as UWtype
24 W_TYPE_SIZE -- size in bits of UWtype
25
26 SItype, USItype -- Signed and unsigned 32 bit types
27 DItype, UDItype -- Signed and unsigned 64 bit types
28
29 On a 32 bit machine UWtype should typically be USItype;
30 on a 64 bit machine, UWtype should typically be UDItype.
31
32 Optionally, define:
33
34 LONGLONG_STANDALONE -- Avoid code that needs machine-dependent support files
35 NO_ASM -- Disable inline asm
36
37
38 CAUTION! Using this version of longlong.h outside of GMP is not safe. You
39 need to include gmp.h and gmp-impl.h, or certain things might not work as
40 expected.
41 */
42
/* Half- and quarter-word helpers.  __ll_B is the half-word base 2^(W/2);
   __ll_lowpart/__ll_highpart extract the low and high halves of a word t.
   __BITS4 (a quarter of the word size) is used by some division code.  */
#define __BITS4 (W_TYPE_SIZE / 4)
#define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
#define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
#define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
47
48 /* This is used to make sure no undesirable sharing between different libraries
49 that use this file takes place. */
50 #ifndef __MPN
51 #define __MPN(x) __##x
52 #endif
53
54 #ifndef _PROTO
55 #if (__STDC__-0) || defined (__cplusplus)
56 #define _PROTO(x) x
57 #else
58 #define _PROTO(x) ()
59 #endif
60 #endif
61
62 /* Define auxiliary asm macros.
63
64 1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
65 UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
66 word product in HIGH_PROD and LOW_PROD.
67
68 2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
69 UDWtype product. This is just a variant of umul_ppmm.
70
71 3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
72 denominator) divides a UDWtype, composed by the UWtype integers
73 HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
74 in QUOTIENT and the remainder in REMAINDER. HIGH_NUMERATOR must be less
than DENOMINATOR for correct operation. If the macro in addition requires
the most significant bit of DENOMINATOR to be 1 (i.e. the divisor to be
normalized), the pre-processor symbol UDIV_NEEDS_NORMALIZATION is defined to 1.
78
79 4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
80 denominator). Like udiv_qrnnd but the numbers are signed. The quotient
81 is rounded towards 0.
82
83 5) count_leading_zeros(count, x) counts the number of zero-bits from the
84 msb to the first non-zero bit in the UWtype X. This is the number of
85 steps X needs to be shifted left to set the msb. Undefined for X == 0,
86 unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.
87
88 6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
89 from the least significant end.
90
91 7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
92 high_addend_2, low_addend_2) adds two UWtype integers, composed by
93 HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
94 respectively. The result is placed in HIGH_SUM and LOW_SUM. Overflow
95 (i.e. carry out) is not stored anywhere, and is lost.
96
97 8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
98 high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
99 composed by HIGH_MINUEND_1 and LOW_MINUEND_1, and HIGH_SUBTRAHEND_2 and
100 LOW_SUBTRAHEND_2 respectively. The result is placed in HIGH_DIFFERENCE
101 and LOW_DIFFERENCE. Overflow (i.e. carry out) is not stored anywhere,
102 and is lost.
103
104 If any of these macros are left undefined for a particular CPU,
105 C macros are used.
106
107
108 Notes:
109
110 For add_ssaaaa the two high and two low addends can both commute, but
111 unfortunately gcc only supports one "%" commutative in each asm block.
112 This has always been so but is only documented in recent versions
113 (eg. pre-release 3.3). Having two or more "%"s can cause an internal
114 compiler error in certain rare circumstances.
115
116 Apparently it was only the last "%" that was ever actually respected, so
117 the code has been updated to leave just that. Clearly there's a free
118 choice whether high or low should get it, if there's a reason to favour
119 one over the other. Also obviously when the constraints on the two
120 operands are identical there's no benefit to the reloader in any "%" at
121 all.
122
123 */
124
125 /* The CPUs come in alphabetical order below.
126
127 Please add support for more CPUs here, or improve the current support
128 for the CPUs below! */
129
130
131 /* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc
132 3.4 __builtin_clzl or __builtin_clzll, according to our limb size.
133 Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or
134 __builtin_ctzll.
135
136 These builtins are only used when we check what code comes out, on some
137 chips they're merely libgcc calls, where we will instead want an inline
138 in that case (either asm or generic C).
139
140 These builtins are better than an asm block of the same insn, since an
141 asm block doesn't give gcc any information about scheduling or resource
142 usage. We keep an asm block for use on prior versions of gcc though.
143
144 For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but
145 it's not used (for count_leading_zeros) because it generally gives extra
146 code to ensure the result is 0 when the input is 0, which we don't need
147 or want. */
148
149 #ifdef _LONG_LONG_LIMB
/* count_leading_zeros via the gcc builtin.  _LONG_LONG_LIMB selects the
   "long long" flavour so the builtin matches the limb size.  The builtin
   result is undefined for x==0, hence the ASSERT.  */
#ifdef _LONG_LONG_LIMB
#define count_leading_zeros_gcc_clz(count,x) \
  do { \
    ASSERT ((x) != 0); \
    (count) = __builtin_clzll (x); \
  } while (0)
#else
#define count_leading_zeros_gcc_clz(count,x) \
  do { \
    ASSERT ((x) != 0); \
    (count) = __builtin_clzl (x); \
  } while (0)
#endif
162
/* count_trailing_zeros via the gcc builtin; same limb-size selection and
   same x!=0 precondition as count_leading_zeros_gcc_clz above.  */
#ifdef _LONG_LONG_LIMB
#define count_trailing_zeros_gcc_ctz(count,x) \
  do { \
    ASSERT ((x) != 0); \
    (count) = __builtin_ctzll (x); \
  } while (0)
#else
#define count_trailing_zeros_gcc_ctz(count,x) \
  do { \
    ASSERT ((x) != 0); \
    (count) = __builtin_ctzl (x); \
  } while (0)
#endif
176
177
178 /* FIXME: The macros using external routines like __MPN(count_leading_zeros)
179 don't need to be under !NO_ASM */
180 #if ! defined (NO_ASM)
181
182 #if defined (__alpha) && W_TYPE_SIZE == 64
183 /* Most alpha-based machines, except Cray systems. */
184 #if defined (__GNUC__)
185 #if __GMP_GNUC_PREREQ (3,3)
186 #define umul_ppmm(ph, pl, m0, m1) \
187 do { \
188 UDItype __m0 = (m0), __m1 = (m1); \
189 (ph) = __builtin_alpha_umulh (__m0, __m1); \
190 (pl) = __m0 * __m1; \
191 } while (0)
192 #else
/* 64x64->128 multiply for pre-3.3 gcc: "umulh" produces the high word,
   plain C multiplication the low word.  The arguments are copied into the
   locals __m0/__m1 so that side-effecting arguments are evaluated exactly
   once; previously the asm constraints re-evaluated m0/m1 directly.  */
#define umul_ppmm(ph, pl, m0, m1) \
  do { \
    UDItype __m0 = (m0), __m1 = (m1); \
    __asm__ ("umulh %r1,%2,%0" \
	     : "=r" (ph) \
	     : "%rJ" (__m0), "rI" (__m1)); \
    (pl) = __m0 * __m1; \
  } while (0)
201 #endif
202 #define UMUL_TIME 18
203 #else /* ! __GNUC__ */
204 #include <machine/builtins.h>
/* Non-gcc alpha compilers (Compaq/DEC C) supply the __UMULH builtin via
   <machine/builtins.h>.  Use the locals __m0/__m1 for the high product as
   well as the low, so each argument is evaluated exactly once.  */
#define umul_ppmm(ph, pl, m0, m1) \
  do { \
    UDItype __m0 = (m0), __m1 = (m1); \
    (ph) = __UMULH (__m0, __m1); \
    (pl) = __m0 * __m1; \
  } while (0)
211 #endif
/* Division via a precomputed reciprocal: __MPN(invert_limb) computes the
   2/1 inverse of d once, then udiv_qrnnd_preinv replaces the divide with
   multiplications.  These are GMP support routines, hence unavailable when
   LONGLONG_STANDALONE.  UDIV_NEEDS_NORMALIZATION==1: d must have its high
   bit set.  */
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di; \
    __di = __MPN(invert_limb) (d); \
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
  } while (0)
#define UDIV_PREINV_ALWAYS 1
#define UDIV_NEEDS_NORMALIZATION 1
#define UDIV_TIME 220
#endif /* LONGLONG_STANDALONE */
222
223 /* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm
224 always goes into libgmp.so, even when not actually used. */
225 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
226
227 #if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX
228 #define count_leading_zeros(COUNT,X) \
229 __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X))
230 #define count_trailing_zeros(COUNT,X) \
231 __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X))
232 #endif /* clz/ctz using cix */
233
234 #if ! defined (count_leading_zeros) \
235 && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE)
236 /* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0.
237 "$31" is written explicitly in the asm, since an "r" constraint won't
238 select reg 31. There seems no need to worry about "r31" syntax for cray,
239 since gcc itself (pre-release 3.4) emits just $31 in various places. */
240 #define ALPHA_CMPBGE_0(dst, src) \
241 do { asm ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0)
242 /* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts
243 them, locating the highest non-zero byte. A second __clz_tab lookup
244 counts the leading zero bits in that byte, giving the result. */
245 #define count_leading_zeros(count, x) \
246 do { \
247 UWtype __clz__b, __clz__c, __clz__x = (x); \
248 ALPHA_CMPBGE_0 (__clz__b, __clz__x); /* zero bytes */ \
249 __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F]; /* 8 to 1 byte */ \
250 __clz__b = __clz__b * 8 - 7; /* 57 to 1 shift */ \
251 __clz__x >>= __clz__b; \
252 __clz__c = __clz_tab [__clz__x]; /* 8 to 1 bit */ \
253 __clz__b = 65 - __clz__b; \
254 (count) = __clz__b - __clz__c; \
255 } while (0)
256 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
257 #endif /* clz using cmpbge */
258
259 #if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE)
260 #if HAVE_ATTRIBUTE_CONST
261 long __MPN(count_leading_zeros) _PROTO ((UDItype)) __attribute__ ((const));
262 #else
263 long __MPN(count_leading_zeros) _PROTO ((UDItype));
264 #endif
265 #define count_leading_zeros(count, x) \
266 ((count) = __MPN(count_leading_zeros) (x))
267 #endif /* clz using mpn */
268 #endif /* __alpha */
269
270 #if defined (_CRAY) && W_TYPE_SIZE == 64
271 #include <intrinsics.h>
272 #define UDIV_PREINV_ALWAYS 1
273 #define UDIV_NEEDS_NORMALIZATION 1
274 #define UDIV_TIME 220
275 long __MPN(count_leading_zeros) _PROTO ((UDItype));
276 #define count_leading_zeros(count, x) \
277 ((count) = _leadz ((UWtype) (x)))
278 #if defined (_CRAYIEEE) /* I.e., Cray T90/ieee, T3D, and T3E */
/* Cray IEEE systems: _int_mult_upper gives the high 64 bits of the
   product.  Pass the locals __m0/__m1 to it (not the raw macro args) so
   side-effecting arguments are evaluated exactly once.  */
#define umul_ppmm(ph, pl, m0, m1) \
  do { \
    UDItype __m0 = (m0), __m1 = (m1); \
    (ph) = _int_mult_upper (__m0, __m1); \
    (pl) = __m0 * __m1; \
  } while (0)
285 #ifndef LONGLONG_STANDALONE
286 #define udiv_qrnnd(q, r, n1, n0, d) \
287 do { UWtype __di; \
288 __di = __MPN(invert_limb) (d); \
289 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
290 } while (0)
291 #endif /* LONGLONG_STANDALONE */
292 #endif /* _CRAYIEEE */
293 #endif /* _CRAY */
294
295 #if defined (__ia64) && W_TYPE_SIZE == 64
296 /* This form encourages gcc (pre-release 3.4 at least) to emit predicated
297 "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency. The generic
298 code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
299 register, which takes an extra cycle. */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do { \
    UWtype __x; \
    __x = (al) - (bl); /* low difference (wraps mod 2^W) */ \
    if ((al) < (bl)) /* borrow out of the low word? */ \
      (sh) = (ah) - (bh) - 1; \
    else \
      (sh) = (ah) - (bh); \
    (sl) = __x; /* stored last, so sl may alias ah/al/bh/bl */ \
  } while (0)
310 #if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
311 /* Do both product parts in assembly, since that gives better code with
312 all gcc versions. Some callers will just use the upper part, and in
313 that situation we waste an instruction, but not any cycles. */
314 #define umul_ppmm(ph, pl, m0, m1) \
315 __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0" \
316 : "=&f" (ph), "=f" (pl) \
317 : "f" (m0), "f" (m1))
318 #define UMUL_TIME 14
319 #define count_leading_zeros(count, x) \
320 do { \
321 UWtype _x = (x), _y, _a, _c; \
322 __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x)); \
323 __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y)); \
324 _c = (_a - 1) << 3; \
325 _x >>= _c; \
326 if (_x >= 1 << 4) \
327 _x >>= 4, _c += 4; \
328 if (_x >= 1 << 2) \
329 _x >>= 2, _c += 2; \
330 _c += _x >> 1; \
331 (count) = W_TYPE_SIZE - 1 - _c; \
332 } while (0)
333 /* similar to what gcc does for __builtin_ffs, but 0 based rather than 1
334 based, and we don't need a special case for x==0 here */
335 #define count_trailing_zeros(count, x) \
336 do { \
337 UWtype __ctz_x = (x); \
338 __asm__ ("popcnt %0 = %1" \
339 : "=r" (count) \
340 : "r" ((__ctz_x-1) & ~__ctz_x)); \
341 } while (0)
342 #endif
343 #if defined (__INTEL_COMPILER)
344 #include <ia64intrin.h>
/* ia64 with the Intel compiler: _m64_xmahu is the xma.hu intrinsic
   (high unsigned product plus 0).  ph/pl are parenthesized like every
   other macro here, so expression arguments (e.g. "*p") expand safely.  */
#define umul_ppmm(ph, pl, m0, m1) \
  do { \
    UWtype _m0 = (m0), _m1 = (m1); \
    (ph) = _m64_xmahu (_m0, _m1, 0); \
    (pl) = _m0 * _m1; \
  } while (0)
351 #endif
352 #ifndef LONGLONG_STANDALONE
353 #define udiv_qrnnd(q, r, n1, n0, d) \
354 do { UWtype __di; \
355 __di = __MPN(invert_limb) (d); \
356 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
357 } while (0)
358 #define UDIV_PREINV_ALWAYS 1
359 #define UDIV_NEEDS_NORMALIZATION 1
360 #endif
361 #define UDIV_TIME 220
362 #endif
363
364
365 #if defined (__GNUC__)
366
367 /* We sometimes need to clobber "cc" with gcc2, but that would not be
368 understood by gcc1. Use cpp to avoid major code duplication. */
369 #if __GNUC__ < 2
370 #define __CLOBBER_CC
371 #define __AND_CLOBBER_CC
372 #else /* __GNUC__ >= 2 */
373 #define __CLOBBER_CC : "cc"
374 #define __AND_CLOBBER_CC , "cc"
375 #endif /* __GNUC__ < 2 */
376
377 #if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
378 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
379 __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3" \
380 : "=r" (sh), "=&r" (sl) \
381 : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl))
382 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
383 __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3" \
384 : "=r" (sh), "=&r" (sl) \
385 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl))
386 #define umul_ppmm(xh, xl, m0, m1) \
387 do { \
388 USItype __m0 = (m0), __m1 = (m1); \
389 __asm__ ("multiplu %0,%1,%2" \
390 : "=r" (xl) \
391 : "r" (__m0), "r" (__m1)); \
392 __asm__ ("multmu %0,%1,%2" \
393 : "=r" (xh) \
394 : "r" (__m0), "r" (__m1)); \
395 } while (0)
396 #define udiv_qrnnd(q, r, n1, n0, d) \
397 __asm__ ("dividu %0,%3,%4" \
398 : "=r" (q), "=q" (r) \
399 : "1" (n1), "r" (n0), "r" (d))
400 #define count_leading_zeros(count, x) \
401 __asm__ ("clz %0,%1" \
402 : "=r" (count) \
403 : "r" (x))
404 #define COUNT_LEADING_ZEROS_0 32
405 #endif /* __a29k__ */
406
407 #if defined (__arc__)
408 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
409 __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3" \
410 : "=r" (sh), \
411 "=&r" (sl) \
412 : "r" ((USItype) (ah)), \
413 "rIJ" ((USItype) (bh)), \
414 "%r" ((USItype) (al)), \
415 "rIJ" ((USItype) (bl)))
416 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
417 __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
418 : "=r" (sh), \
419 "=&r" (sl) \
420 : "r" ((USItype) (ah)), \
421 "rIJ" ((USItype) (bh)), \
422 "r" ((USItype) (al)), \
423 "rIJ" ((USItype) (bl)))
424 #endif
425
426 #if defined (__arm__) && W_TYPE_SIZE == 32
/* Two-word add: "adds" sets carry from the low words, "adc" folds it into
   the high sum.  "%" marks al/bh... the low pair as commutative; "I" lets
   gcc pass suitable immediates directly.  Flags are clobbered ("cc").  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3" \
	   : "=r" (sh), "=&r" (sl) \
	   : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
/* Two-word subtract with borrow.  ARM has both sbc (a-b-borrow) and the
   reversed rsc (b-a-borrow), so the subtraction can be written either way
   round; the __builtin_constant_p tests pick whichever variant places each
   compile-time-constant operand where an "I" immediate constraint can take
   it, saving gcc a constant load into a register.  All variants compute
   the same {sh,sl} = {ah,al} - {bh,bl}.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do { \
    if (__builtin_constant_p (al)) \
      { \
	if (__builtin_constant_p (ah)) \
	  __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \
		   : "=r" (sh), "=&r" (sl) \
		   : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
	else \
	  __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3" \
		   : "=r" (sh), "=&r" (sl) \
		   : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
      } \
    else if (__builtin_constant_p (ah)) \
      { \
	if (__builtin_constant_p (bl)) \
	  __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2" \
		   : "=r" (sh), "=&r" (sl) \
		   : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
	else \
	  __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \
		   : "=r" (sh), "=&r" (sl) \
		   : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
      } \
    else if (__builtin_constant_p (bl)) \
      { \
	if (__builtin_constant_p (bh)) \
	  __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
		   : "=r" (sh), "=&r" (sl) \
		   : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
	else \
	  __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2" \
		   : "=r" (sh), "=&r" (sl) \
		   : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
      } \
    else /* only bh might be a constant */ \
      __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
	       : "=r" (sh), "=&r" (sl) \
	       : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC);\
  } while (0)
471 #if 1 || defined (__arm_m__) /* `M' series has widening multiply support */
472 #define umul_ppmm(xh, xl, a, b) \
473 __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
474 #define UMUL_TIME 5
475 #define smul_ppmm(xh, xl, a, b) \
476 __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
477 #ifndef LONGLONG_STANDALONE
478 #define udiv_qrnnd(q, r, n1, n0, d) \
479 do { UWtype __di; \
480 __di = __MPN(invert_limb) (d); \
481 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
482 } while (0)
483 #define UDIV_PREINV_ALWAYS 1
484 #define UDIV_NEEDS_NORMALIZATION 1
485 #define UDIV_TIME 70
486 #endif /* LONGLONG_STANDALONE */
487 #else
488 #define umul_ppmm(xh, xl, a, b) \
489 __asm__ ("%@ Inlined umul_ppmm\n" \
490 " mov %|r0, %2, lsr #16\n" \
491 " mov %|r2, %3, lsr #16\n" \
492 " bic %|r1, %2, %|r0, lsl #16\n" \
493 " bic %|r2, %3, %|r2, lsl #16\n" \
494 " mul %1, %|r1, %|r2\n" \
495 " mul %|r2, %|r0, %|r2\n" \
496 " mul %|r1, %0, %|r1\n" \
497 " mul %0, %|r0, %0\n" \
498 " adds %|r1, %|r2, %|r1\n" \
499 " addcs %0, %0, #65536\n" \
500 " adds %1, %1, %|r1, lsl #16\n" \
501 " adc %0, %0, %|r1, lsr #16" \
502 : "=&r" (xh), "=r" (xl) \
503 : "r" (a), "r" (b) \
504 : "r0", "r1", "r2")
505 #define UMUL_TIME 20
506 #ifndef LONGLONG_STANDALONE
507 #define udiv_qrnnd(q, r, n1, n0, d) \
508 do { UWtype __r; \
509 (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \
510 (r) = __r; \
511 } while (0)
512 extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype));
513 #define UDIV_TIME 200
514 #endif /* LONGLONG_STANDALONE */
515 #endif
516 #if defined (__ARM_ARCH_5__)
517 /* This actually requires arm 5 */
518 #define count_leading_zeros(count, x) \
519 __asm__ ("clz\t%0, %1" : "=r" (count) : "r" (x))
520 #define COUNT_LEADING_ZEROS_0 32
521 #endif
522 #endif /* __arm__ */
523
524 #if defined (__clipper__) && W_TYPE_SIZE == 32
525 #define umul_ppmm(w1, w0, u, v) \
526 ({union {UDItype __ll; \
527 struct {USItype __l, __h;} __i; \
528 } __x; \
529 __asm__ ("mulwux %2,%0" \
530 : "=r" (__x.__ll) \
531 : "%0" ((USItype)(u)), "r" ((USItype)(v))); \
532 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
533 #define smul_ppmm(w1, w0, u, v) \
534 ({union {DItype __ll; \
535 struct {SItype __l, __h;} __i; \
536 } __x; \
537 __asm__ ("mulwx %2,%0" \
538 : "=r" (__x.__ll) \
539 : "%0" ((SItype)(u)), "r" ((SItype)(v))); \
540 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
541 #define __umulsidi3(u, v) \
542 ({UDItype __w; \
543 __asm__ ("mulwux %2,%0" \
544 : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v))); \
545 __w; })
546 #endif /* __clipper__ */
547
548 /* Fujitsu vector computers. */
549 #if defined (__uxp__) && W_TYPE_SIZE == 32
550 #define umul_ppmm(ph, pl, u, v) \
551 do { \
552 union {UDItype __ll; \
553 struct {USItype __h, __l;} __i; \
554 } __x; \
555 __asm__ ("mult.lu %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));\
556 (ph) = __x.__i.__h; \
557 (pl) = __x.__i.__l; \
558 } while (0)
559 #define smul_ppmm(ph, pl, u, v) \
560 do { \
561 union {UDItype __ll; \
562 struct {USItype __h, __l;} __i; \
563 } __x; \
564 __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v)); \
565 (ph) = __x.__i.__h; \
566 (pl) = __x.__i.__l; \
567 } while (0)
568 #endif
569
570 #if defined (__gmicro__) && W_TYPE_SIZE == 32
571 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
572 __asm__ ("add.w %5,%1\n\taddx %3,%0" \
573 : "=g" (sh), "=&g" (sl) \
574 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
575 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
576 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
577 __asm__ ("sub.w %5,%1\n\tsubx %3,%0" \
578 : "=g" (sh), "=&g" (sl) \
579 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
580 "1" ((USItype)(al)), "g" ((USItype)(bl)))
581 #define umul_ppmm(ph, pl, m0, m1) \
582 __asm__ ("mulx %3,%0,%1" \
583 : "=g" (ph), "=r" (pl) \
584 : "%0" ((USItype)(m0)), "g" ((USItype)(m1)))
585 #define udiv_qrnnd(q, r, nh, nl, d) \
586 __asm__ ("divx %4,%0,%1" \
587 : "=g" (q), "=r" (r) \
588 : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d)))
589 #define count_leading_zeros(count, x) \
590 __asm__ ("bsch/1 %1,%0" \
591 : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0))
592 #endif
593
594 #if defined (__hppa) && W_TYPE_SIZE == 32
595 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
596 __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0" \
597 : "=r" (sh), "=&r" (sl) \
598 : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
599 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
600 __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0" \
601 : "=r" (sh), "=&r" (sl) \
602 : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
603 #if defined (_PA_RISC1_1)
604 #define umul_ppmm(wh, wl, u, v) \
605 do { \
606 union {UDItype __ll; \
607 struct {USItype __h, __l;} __i; \
608 } __x; \
609 __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v)); \
610 (wh) = __x.__i.__h; \
611 (wl) = __x.__i.__l; \
612 } while (0)
613 #define UMUL_TIME 8
614 #define UDIV_TIME 60
615 #else
616 #define UMUL_TIME 40
617 #define UDIV_TIME 80
618 #endif
619 #define count_leading_zeros(count, x) \
620 do { \
621 USItype __tmp; \
622 __asm__ ( \
623 "ldi 1,%0\n" \
624 " extru,= %1,15,16,%%r0 ; Bits 31..16 zero?\n" \
625 " extru,tr %1,15,16,%1 ; No. Shift down, skip add.\n" \
626 " ldo 16(%0),%0 ; Yes. Perform add.\n" \
627 " extru,= %1,23,8,%%r0 ; Bits 15..8 zero?\n" \
628 " extru,tr %1,23,8,%1 ; No. Shift down, skip add.\n" \
629 " ldo 8(%0),%0 ; Yes. Perform add.\n" \
630 " extru,= %1,27,4,%%r0 ; Bits 7..4 zero?\n" \
631 " extru,tr %1,27,4,%1 ; No. Shift down, skip add.\n" \
632 " ldo 4(%0),%0 ; Yes. Perform add.\n" \
633 " extru,= %1,29,2,%%r0 ; Bits 3..2 zero?\n" \
634 " extru,tr %1,29,2,%1 ; No. Shift down, skip add.\n" \
635 " ldo 2(%0),%0 ; Yes. Perform add.\n" \
636 " extru %1,30,1,%1 ; Extract bit 1.\n" \
637 " sub %0,%1,%0 ; Subtract it.\n" \
638 : "=r" (count), "=r" (__tmp) : "1" (x)); \
639 } while (0)
640 #endif /* hppa */
641
642 /* These macros are for ABI=2.0w. In ABI=2.0n they can't be used, since GCC
643 (3.2) puts longlong into two adjacent 32-bit registers. Presumably this
644 is just a case of no direct support for 2.0n but treating it like 1.0. */
645 #if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB)
646 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
647 __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0" \
648 : "=r" (sh), "=&r" (sl) \
649 : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
650 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
651 __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0" \
652 : "=r" (sh), "=&r" (sl) \
653 : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
654 #endif /* hppa */
655
656 #if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
657 #define smul_ppmm(xh, xl, m0, m1) \
658 do { \
659 union {DItype __ll; \
660 struct {USItype __h, __l;} __i; \
661 } __x; \
662 __asm__ ("lr %N0,%1\n\tmr %0,%2" \
663 : "=&r" (__x.__ll) \
664 : "r" (m0), "r" (m1)); \
665 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
666 } while (0)
667 #define sdiv_qrnnd(q, r, n1, n0, d) \
668 do { \
669 union {DItype __ll; \
670 struct {USItype __h, __l;} __i; \
671 } __x; \
672 __x.__i.__h = n1; __x.__i.__l = n0; \
673 __asm__ ("dr %0,%2" \
674 : "=r" (__x.__ll) \
675 : "0" (__x.__ll), "r" (d)); \
676 (q) = __x.__i.__l; (r) = __x.__i.__h; \
677 } while (0)
678 #endif
679
680 #if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
/* Two-word add: addl sets the carry flag, adcl folds it into the high
   word.  "0"/"1" matching constraints make sh/ah and sl/al share a
   register; "%" marks al/bl as commutative.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addl %5,%k1\n\tadcl %3,%k0" \
	   : "=r" (sh), "=&r" (sl) \
	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
/* Two-word subtract: subl sets borrow (CF), sbbl propagates it.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subl %5,%k1\n\tsbbl %3,%k0" \
	   : "=r" (sh), "=&r" (sl) \
	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
/* mull produces the full 64-bit product in edx:eax, hence the fixed
   "=a"/"=d" outputs.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mull %3" \
	   : "=a" (w0), "=d" (w1) \
	   : "%0" ((USItype)(u)), "rm" ((USItype)(v)))
/* divl divides edx:eax by the operand; quotient in eax, remainder in edx.
   Requires n1 < dx, else the divide faults (#DE) on overflow.  */
#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
  __asm__ ("divl %4" /* stringification in K&R C */ \
	   : "=a" (q), "=d" (r) \
	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx)))
699
700 #if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx
701 /* Pentium bsrl takes between 10 and 72 cycles depending where the most
702 significant 1 bit is, hence the use of the following alternatives. bsfl
703 is slow too, between 18 and 42 depending where the least significant 1
704 bit is, so let the generic count_trailing_zeros below make use of the
705 count_leading_zeros here too. */
706
707 #if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE)
708 /* The following should be a fixed 14 or 15 cycles, but possibly plus an L1
709 cache miss reading from __clz_tab. For P55 it's favoured over the float
710 below so as to avoid mixing MMX and x87, since the penalty for switching
711 between the two is about 100 cycles.
712
713 The asm block sets __shift to -3 if the high 24 bits are clear, -2 for
714 16, -1 for 8, or 0 otherwise. This could be written equivalently as
715 follows, but as of gcc 2.95.2 it results in conditional jumps.
716
717 __shift = -(__n < 0x1000000);
718 __shift -= (__n < 0x10000);
719 __shift -= (__n < 0x100);
720
721 The middle two sbbl and cmpl's pair, and with luck something gcc
722 generates might pair with the first cmpl and the last sbbl. The "32+1"
723 constant could be folded into __clz_tab[], but it doesn't seem worth
724 making a different table just for that. */
725
/* The cmpl/sbbl ladder leaves __shift = -3, -2, -1 or 0 according to
   which byte of __n holds the highest set bit (see the comment above);
   scaling by 8 and adding 24+1 turns that into a right-shift that brings
   the top byte (plus guard bit) into range for the __clz_tab lookup.  */
#define count_leading_zeros(c,n) \
  do { \
    USItype __n = (n); \
    USItype __shift; \
    __asm__ ("cmpl $0x1000000, %1\n" \
	     "sbbl %0, %0\n" \
	     "cmpl $0x10000, %1\n" \
	     "sbbl $0, %0\n" \
	     "cmpl $0x100, %1\n" \
	     "sbbl $0, %0\n" \
	     : "=&r" (__shift) : "r" (__n)); \
    __shift = __shift*8 + 24 + 1; \
    (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift]; \
  } while (0)
740 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
741 #define COUNT_LEADING_ZEROS_0 31 /* n==0 indistinguishable from n==1 */
742
743 #else /* ! pentiummmx || LONGLONG_STANDALONE */
744 /* The following should be a fixed 14 cycles or so. Some scheduling
745 opportunities should be available between the float load/store too. This
746 sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is
747 apparently suggested by the Intel optimizing manual (don't know exactly
748 where). gcc 2.95 or up will be best for this, so the "double" is
749 correctly aligned on the stack. */
/* Convert n to double and read the biased exponent from the upper 32 bits
   of the IEEE representation: for nonzero n the exponent field is
   0x3FF + floor(log2 n), so subtracting it from 0x3FF + 31 yields the
   leading zero count.  Assumes IEEE double with the high word in a[1]
   (little-endian) -- x86 only, which is the context here.  Undefined for
   n == 0, hence the ASSERT.  */
#define count_leading_zeros(c,n) \
  do { \
    union { \
      double d; \
      unsigned a[2]; \
    } __u; \
    ASSERT ((n) != 0); \
    __u.d = (UWtype) (n); \
    (c) = 0x3FF + 31 - (__u.a[1] >> 20); \
  } while (0)
760 #define COUNT_LEADING_ZEROS_0 (0x3FF + 31)
761 #endif /* pentiummx */
762
763 #else /* ! pentium */
764
765 #if __GMP_GNUC_PREREQ (3,4) /* using bsrl */
766 #define count_leading_zeros(count,x) count_leading_zeros_gcc_clz(count,x)
767 #endif /* gcc clz */
768
769 /* On P6, gcc prior to 3.0 generates a partial register stall for
770 __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former
771 being 1 code byte smaller. "31-__cbtmp" is a workaround, probably at the
772 cost of one extra instruction. Do this for "i386" too, since that means
773 generic x86. */
774 #if ! defined (count_leading_zeros) && __GNUC__ < 3 \
775 && (HAVE_HOST_CPU_i386 \
776 || HAVE_HOST_CPU_i686 \
777 || HAVE_HOST_CPU_pentiumpro \
778 || HAVE_HOST_CPU_pentium2 \
779 || HAVE_HOST_CPU_pentium3)
780 #define count_leading_zeros(count, x) \
781 do { \
782 USItype __cbtmp; \
783 ASSERT ((x) != 0); \
784 __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \
785 (count) = 31 - __cbtmp; \
786 } while (0)
787 #endif /* gcc<3 asm bsrl */
788
789 #ifndef count_leading_zeros
790 #define count_leading_zeros(count, x) \
791 do { \
792 USItype __cbtmp; \
793 ASSERT ((x) != 0); \
794 __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \
795 (count) = __cbtmp ^ 31; \
796 } while (0)
797 #endif /* asm bsrl */
798
799 #if __GMP_GNUC_PREREQ (3,4) /* using bsfl */
800 #define count_trailing_zeros(count,x) count_trailing_zeros_gcc_ctz(count,x)
801 #endif /* gcc ctz */
802
803 #ifndef count_trailing_zeros
804 #define count_trailing_zeros(count, x) \
805 do { \
806 ASSERT ((x) != 0); \
807 __asm__ ("bsfl %1,%k0" : "=r" (count) : "rm" ((USItype)(x))); \
808 } while (0)
809 #endif /* asm bsfl */
810
811 #endif /* ! pentium */
812
813 #ifndef UMUL_TIME
814 #define UMUL_TIME 10
815 #endif
816 #ifndef UDIV_TIME
817 #define UDIV_TIME 40
818 #endif
819 #endif /* 80x86 */
820
821 #if defined (__amd64__) && W_TYPE_SIZE == 64
822 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
823 __asm__ ("addq %5,%q1\n\tadcq %3,%q0" \
824 : "=r" (sh), "=&r" (sl) \
825 : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \
826 "%1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
827 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
828 __asm__ ("subq %5,%q1\n\tsbbq %3,%q0" \
829 : "=r" (sh), "=&r" (sl) \
830 : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \
831 "1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
832 #define umul_ppmm(w1, w0, u, v) \
833 __asm__ ("mulq %3" \
834 : "=a" (w0), "=d" (w1) \
835 : "%0" ((UDItype)(u)), "rm" ((UDItype)(v)))
836 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
837 __asm__ ("divq %4" /* stringification in K&R C */ \
838 : "=a" (q), "=d" (r) \
839 : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx)))
/* bsrq destination must be a 64-bit register, hence UDItype for __cbtmp. */
/* bsrq yields the index of the highest set bit; XOR with 63 converts that
   to a leading-zero count.  bsrq/bsfq leave the destination undefined when
   the source is zero, hence the ASSERTs.  */
#define count_leading_zeros(count, x) \
  do { \
    UDItype __cbtmp; \
    ASSERT ((x) != 0); \
    __asm__ ("bsrq %1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x))); \
    (count) = __cbtmp ^ 63; \
  } while (0)
/* bsfq destination must be a 64-bit register, "%q0" forces this in case
   count is only an int. */
#define count_trailing_zeros(count, x) \
  do { \
    ASSERT ((x) != 0); \
    __asm__ ("bsfq %1,%q0" : "=r" (count) : "rm" ((UDItype)(x))); \
  } while (0)
855 #endif /* x86_64 */
856
857 #if defined (__i860__) && W_TYPE_SIZE == 32
/* Double-word right shift: r = low word of {h,l} >> c.  The previous
   definition omitted the ":" before the output constraint, so "=r" was
   string-concatenated onto the asm template and any expansion was a
   syntax error.  */
#define rshift_rhlc(r,h,l,c) \
  __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0" \
	   : "=r" (r) : "r" (h), "r" (l), "rn" (c))
861 #endif /* i860 */
862
#if defined (__i960__) && W_TYPE_SIZE == 32
/* Double-word add; the leading "cmpo 1,0" clears the condition-code carry
   before the addc chain (grounded by the parallel "cmpo 0,0" in sub). */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0" \
	   : "=r" (sh), "=&r" (sl) \
	   : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl))
/* Double-word subtract using the subc borrow chain. */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0" \
	   : "=r" (sh), "=&r" (sl) \
	   : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl))
/* 32x32->64 via emul; result split through a union (low word first,
   i.e. little-endian struct layout on i960). */
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll; \
	   struct {USItype __l, __h;} __i; \
	  } __x; \
  __asm__ ("emul %2,%1,%0" \
	   : "=d" (__x.__ll) : "%dI" (u), "dI" (v)); \
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
/* Same emul, returning the full 64-bit product. */
#define __umulsidi3(u, v) \
  ({UDItype __w; \
    __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v)); \
    __w; })
/* 64/32 divide via ediv.  BUG FIX: the original referenced __rq without
   declaring it (only __nn was declared), so the macro could never compile
   when expanded; declare __rq alongside __nn.
   NOTE(review): the template "ediv %d,%n,%0" looks suspect — bare "%d" and
   "%n" are not numbered operand references; probably should be "%2,%1,%0".
   Left unchanged pending verification against i960 assembler docs. */
#define udiv_qrnnd(q, r, nh, nl, d) \
  do { \
    union {UDItype __ll; \
	   struct {USItype __l, __h;} __i; \
	  } __nn, __rq; \
    __nn.__i.__h = (nh); __nn.__i.__l = (nl); \
    __asm__ ("ediv %d,%n,%0" \
	     : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d)); \
    (r) = __rq.__i.__l; (q) = __rq.__i.__h; \
  } while (0)
/* scanbit finds the highest set bit; xor with 31 converts bit index to
   leading-zero count. */
#define count_leading_zeros(count, x) \
  do { \
    USItype __cbtmp; \
    __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x)); \
    (count) = __cbtmp ^ 31; \
  } while (0)
#define COUNT_LEADING_ZEROS_0 (-32) /* sic */
900 #if defined (__i960mx) /* what is the proper symbol to test??? */
901 #define rshift_rhlc(r,h,l,c) \
902 do { \
903 union {UDItype __ll; \
904 struct {USItype __l, __h;} __i; \
905 } __nn; \
906 __nn.__i.__h = (h); __nn.__i.__l = (l); \
907 __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c)); \
908 }
909 #endif /* i960mx */
910 #endif /* i960 */
911
#if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \
     || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \
     || defined (__mc5307__)) && W_TYPE_SIZE == 32
/* Double-word add: add low words, then addx carries into the high words. */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0" \
	   : "=d" (sh), "=&d" (sl) \
	   : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \
	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
/* Double-word subtract with subx borrow. */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0" \
	   : "=d" (sh), "=&d" (sl) \
	   : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \
	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
/* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r.  */
#if defined (__mc68020__) || defined(mc68020) \
     || defined (__mc68030__) || defined (mc68030) \
     || defined (__mc68040__) || defined (mc68040) \
     || defined (__mcpu32__) || defined (mcpu32) \
     || defined (__NeXT__)
/* Single-instruction 32x32->64: mulu.l puts the product in the %1:%0 pair. */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mulu%.l %3,%1:%0" \
	   : "=d" (w0), "=d" (w1) \
	   : "%0" ((USItype)(u)), "dmi" ((USItype)(v)))
#define UMUL_TIME 45
/* 64/32 unsigned divide: quotient to %0, remainder to %1. */
#define udiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("divu%.l %4,%1:%0" \
	   : "=d" (q), "=d" (r) \
	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
#define UDIV_TIME 90
/* Signed counterpart of udiv_qrnnd. */
#define sdiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("divs%.l %4,%1:%0" \
	   : "=d" (q), "=d" (r) \
	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
#else /* for other 68k family members use 16x16->32 multiplication */
/* Schoolbook 32x32->64 built from four 16x16->32 mulu.w partial products;
   the jcc/add pair fixes up the carry out of the middle sum. */
#define umul_ppmm(xh, xl, a, b) \
  do { USItype __umul_tmp1, __umul_tmp2; \
	__asm__ ("| Inlined umul_ppmm\n" \
"	move%.l %5,%3\n" \
"	move%.l %2,%0\n" \
"	move%.w %3,%1\n" \
"	swap	%3\n" \
"	swap	%0\n" \
"	mulu%.w %2,%1\n" \
"	mulu%.w %3,%0\n" \
"	mulu%.w %2,%3\n" \
"	swap	%2\n" \
"	mulu%.w %5,%2\n" \
"	add%.l	%3,%2\n" \
"	jcc	1f\n" \
"	add%.l	%#0x10000,%0\n" \
"1:	move%.l %2,%3\n" \
"	clr%.w	%2\n" \
"	swap	%2\n" \
"	swap	%3\n" \
"	clr%.w	%3\n" \
"	add%.l	%3,%1\n" \
"	addx%.l %2,%0\n" \
"	| End inlined umul_ppmm" \
	      : "=&d" (xh), "=&d" (xl), \
		"=d" (__umul_tmp1), "=&d" (__umul_tmp2) \
	      : "%2" ((USItype)(a)), "d" ((USItype)(b))); \
  } while (0)
#define UMUL_TIME 100
#define UDIV_TIME 400
#endif /* not mc68020 */
/* The '020, '030, '040 and '060 have bitfield insns.
   GCC 3.4 defines __mc68020__ when in CPU32 mode, check for __mcpu32__ to
   exclude bfffo on that chip (bitfield insns not available).  */
#if (defined (__mc68020__) || defined (mc68020) \
     || defined (__mc68030__) || defined (mc68030) \
     || defined (__mc68040__) || defined (mc68040) \
     || defined (__mc68060__) || defined (mc68060) \
     || defined (__NeXT__)) \
  && ! defined (__mcpu32__)
/* bfffo returns the offset of the first set bit, which is directly the
   leading-zero count for a 32-bit field. */
#define count_leading_zeros(count, x) \
  __asm__ ("bfffo %1{%b2:%b2},%0" \
	   : "=d" (count) \
	   : "od" ((USItype) (x)), "n" (0))
#define COUNT_LEADING_ZEROS_0 32
#endif
#endif /* mc68000 */
993
#if defined (__m88000__) && W_TYPE_SIZE == 32
/* Double-word add: addu.co sets carry, addu.ci consumes it. */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3" \
	   : "=r" (sh), "=&r" (sl) \
	   : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl))
/* Double-word subtract with subu.co/subu.ci borrow chain. */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3" \
	   : "=r" (sh), "=&r" (sl) \
	   : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl))
/* ff1 finds the highest set bit; xor 31 converts to leading-zero count. */
#define count_leading_zeros(count, x) \
  do { \
    USItype __cbtmp; \
    __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x)); \
    (count) = __cbtmp ^ 31; \
  } while (0)
#define COUNT_LEADING_ZEROS_0 63 /* sic */
#if defined (__m88110__)
/* 32x32->64 via mulu.d; note big-endian struct layout (__h first). */
#define umul_ppmm(wh, wl, u, v) \
  do { \
    union {UDItype __ll; \
	   struct {USItype __h, __l;} __i; \
	  } __x; \
    __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v)); \
    (wh) = __x.__i.__h; \
    (wl) = __x.__i.__l; \
  } while (0)
1020 #define udiv_qrnnd(q, r, n1, n0, d) \
1021 ({union {UDItype __ll; \
1022 struct {USItype __h, __l;} __i; \
1023 } __x, __q; \
1024 __x.__i.__h = (n1); __x.__i.__l = (n0); \
1025 __asm__ ("divu.d %0,%1,%2" \
1026 : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d)); \
1027 (r) = (n0) - __q.__l * (d); (q) = __q.__l; })
1028 #define UMUL_TIME 5
1029 #define UDIV_TIME 25
1030 #else
1031 #define UMUL_TIME 17
1032 #define UDIV_TIME 150
1033 #endif /* __m88110__ */
1034 #endif /* __m88000__ */
1035
#if defined (__mips) && W_TYPE_SIZE == 32
#if __GMP_GNUC_PREREQ (4,4)
/* gcc >= 4.4 generates good code for a widening multiply by itself. */
#define umul_ppmm(w1, w0, u, v) \
  do { \
    UDItype __ll = (UDItype)(u) * (v); \
    w1 = __ll >> 32; \
    w0 = __ll; \
  } while (0)
#endif
/* gcc >= 2.7 can allocate HI/LO directly via the "h"/"l" constraints. */
#if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7)
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
#endif
/* Fallback for older gcc: explicit mflo/mfhi moves out of HI/LO. */
#if !defined (umul_ppmm)
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1" \
	   : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
#endif
#define UMUL_TIME 10
#define UDIV_TIME 100
#endif /* __mips */
1057
#if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
#if __GMP_GNUC_PREREQ (4,4)
/* 64x64->128 via gcc's TImode multiply. */
#define umul_ppmm(w1, w0, u, v) \
  do { \
    typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \
    __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \
    w1 = __ll >> 64; \
    w0 = __ll; \
  } while (0)
#endif
/* gcc >= 2.7: dmultu with direct HI/LO register constraints. */
#if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7)
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("dmultu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
#endif
/* Fallback: explicit mflo/mfhi after dmultu. */
#if !defined (umul_ppmm)
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1" \
	   : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
#endif
#define UMUL_TIME 20
#define UDIV_TIME 140
#endif /* __mips */
1080
#if defined (__mmix__) && W_TYPE_SIZE == 64
/* MULU leaves the high half of the product in rH ("z" constraint). */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("MULU %0,%2,%3" : "=r" (w0), "=z" (w1) : "r" (u), "r" (v))
#endif
1085
#if defined (__ns32000__) && W_TYPE_SIZE == 32
/* 32x32->64 via meid; union splits the double-word result. */
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll; \
	   struct {USItype __l, __h;} __i; \
	  } __x; \
  __asm__ ("meid %2,%0" \
	   : "=g" (__x.__ll) \
	   : "%0" ((USItype)(u)), "g" ((USItype)(v))); \
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
/* Same meid, returning the full 64-bit product. */
#define __umulsidi3(u, v) \
  ({UDItype __w; \
    __asm__ ("meid %2,%0" \
	     : "=g" (__w) \
	     : "%0" ((USItype)(u)), "g" ((USItype)(v))); \
    __w; })
/* 64/32 divide via deid; quotient ends up in the high word, remainder in
   the low word of the in/out union. */
#define udiv_qrnnd(q, r, n1, n0, d) \
  ({union {UDItype __ll; \
	   struct {USItype __l, __h;} __i; \
	  } __x; \
  __x.__i.__h = (n1); __x.__i.__l = (n0); \
  __asm__ ("deid %2,%0" \
	   : "=g" (__x.__ll) \
	   : "0" (__x.__ll), "g" ((USItype)(d))); \
  (r) = __x.__i.__l; (q) = __x.__i.__h; })
/* ffsd continues a find-first-set from the value in %0 (seeded with 0). */
#define count_trailing_zeros(count,x) \
  do { \
    __asm__ ("ffsd %2,%0" \
	     : "=r" (count) \
	     : "0" ((USItype) 0), "r" ((USItype) (x))); \
  } while (0)
#endif /* __ns32000__ */
1117
1118 /* In the past we had a block of various #defines tested
1119 _ARCH_PPC - AIX
1120 _ARCH_PWR - AIX
1121 __powerpc__ - gcc
1122 __POWERPC__ - BEOS
1123 __ppc__ - Darwin
1124 PPC - old gcc, GNU/Linux, SysV
1125 The plain PPC test was not good for vxWorks, since PPC is defined on all
1126 CPUs there (eg. m68k too), as a constant one is expected to compare
1127 CPU_FAMILY against.
1128
1129 At any rate, this was pretty unattractive and a bit fragile. The use of
1130 HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of
1131 getting the desired effect.
1132
1133 ENHANCE-ME: We should test _IBMR2 here when we add assembly support for
1134 the system vendor compilers. (Is that vendor compilers with inline asm,
1135 or what?) */
1136
#if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc) \
  && W_TYPE_SIZE == 32
/* Double-word add.  Special-cases bh == 0 (addze) and bh == all-ones
   (addme) when bh is a compile-time constant, else the generic adde form.
   The {old|new} braces select POWER vs PowerPC mnemonics. */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do { \
    if (__builtin_constant_p (bh) && (bh) == 0) \
      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2" \
	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \
      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2" \
	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    else \
      __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3" \
	     : "=r" (sh), "=&r" (sl) \
	     : "r" (ah), "r" (bh), "%r" (al), "rI" (bl)); \
  } while (0)
/* Double-word subtract, with the analogous constant-operand shortcuts
   for ah/bh being 0 or all-ones. */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do { \
    if (__builtin_constant_p (ah) && (ah) == 0) \
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2" \
	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0) \
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2" \
	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == 0) \
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2" \
	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2" \
	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    else \
      __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2" \
	       : "=r" (sh), "=&r" (sl) \
	       : "r" (ah), "r" (bh), "rI" (al), "r" (bl)); \
  } while (0)
/* Hardware count-leading-zeros. */
#define count_leading_zeros(count, x) \
  __asm__ ("{cntlz|cntlzw} %0,%1" : "=r" (count) : "r" (x))
#define COUNT_LEADING_ZEROS_0 32
1174 #if HAVE_HOST_CPU_FAMILY_powerpc
1175 #if __GMP_GNUC_PREREQ (4,4)
1176 #define umul_ppmm(w1, w0, u, v) \
1177 do { \
1178 UDItype __ll = (UDItype)(u) * (v); \
1179 w1 = __ll >> 32; \
1180 w0 = __ll; \
1181 } while (0)
1182 #endif
1183 #if !defined (umul_ppmm)
1184 #define umul_ppmm(ph, pl, m0, m1) \
1185 do { \
1186 USItype __m0 = (m0), __m1 = (m1); \
1187 __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \
1188 (pl) = __m0 * __m1; \
1189 } while (0)
1190 #endif
1191 #define UMUL_TIME 15
/* Signed 32x32->64: mulhw for the high half, C multiply for the low half.
   BUG FIX: use the local copies __m0/__m1 in the asm so m0/m1 are
   evaluated exactly once (the original evaluated them twice). */
#define smul_ppmm(ph, pl, m0, m1) \
  do { \
    SItype __m0 = (m0), __m1 = (m1); \
    __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1)); \
    (pl) = __m0 * __m1; \
  } while (0)
#define SMUL_TIME 14
#define UDIV_TIME 120
#else
#define UMUL_TIME 8
/* Classic POWER signed multiply: mul leaves the halves in a register
   pair ("q" constraint for the MQ-paired register). */
#define smul_ppmm(xh, xl, m0, m1) \
  __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1))
#define SMUL_TIME 4
/* Classic POWER signed divide of the (nh,nl) double word by d. */
#define sdiv_qrnnd(q, r, nh, nl, d) \
  __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d))
#define UDIV_TIME 100
#endif
#endif /* 32-bit POWER architecture variants.  */
1210
/* We should test _IBMR2 here when we add assembly support for the system
   vendor compilers.  */
#if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64
#if !defined (_LONG_LONG_LIMB)
/* _LONG_LONG_LIMB is ABI=mode32 where adde operates on 32-bit values.  So
   use adde etc only when not _LONG_LONG_LIMB.  */
/* Double-word add with addze/addme shortcuts when bh is a compile-time
   constant 0 or all-ones. */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do { \
    if (__builtin_constant_p (bh) && (bh) == 0) \
      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2" \
	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2" \
	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    else \
      __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3" \
	     : "=r" (sh), "=&r" (sl) \
	     : "r" (ah), "r" (bh), "%r" (al), "rI" (bl)); \
  } while (0)
/* We use "*rI" for the constant operand here, since with just "I", gcc barfs.
   This might seem strange, but gcc folds away the dead code late.  */
/* Double-word subtract.  When bl is a small constant it is negated and
   handled via addic; otherwise the generic subfc/subfe chain is used,
   again with constant-operand shortcuts for ah/bh being 0 or all-ones. */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do { \
    if (__builtin_constant_p (bl) && bl > -0x8000 && bl <= 0x8000) { \
	if (__builtin_constant_p (ah) && (ah) == 0) \
	  __asm__ ("{ai|addic} %1,%3,%4\n\t{sfze|subfze} %0,%2" \
		   : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "*rI" (-bl)); \
	else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \
	  __asm__ ("{ai|addic} %1,%3,%4\n\t{sfme|subfme} %0,%2" \
		   : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "*rI" (-bl)); \
	else if (__builtin_constant_p (bh) && (bh) == 0) \
	  __asm__ ("{ai|addic} %1,%3,%4\n\t{ame|addme} %0,%2" \
		   : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "*rI" (-bl)); \
	else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
	  __asm__ ("{ai|addic} %1,%3,%4\n\t{aze|addze} %0,%2" \
		   : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "*rI" (-bl)); \
	else \
	  __asm__ ("{ai|addic} %1,%4,%5\n\t{sfe|subfe} %0,%3,%2" \
		   : "=r" (sh), "=&r" (sl) \
		   : "r" (ah), "r" (bh), "rI" (al), "*rI" (-bl)); \
      } else { \
	if (__builtin_constant_p (ah) && (ah) == 0) \
	  __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2" \
		   : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl)); \
	else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \
	  __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2" \
		   : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl)); \
	else if (__builtin_constant_p (bh) && (bh) == 0) \
	  __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2" \
		   : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl)); \
	else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
	  __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2" \
		   : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl)); \
	else \
	  __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2" \
		   : "=r" (sh), "=&r" (sl) \
		   : "r" (ah), "r" (bh), "rI" (al), "r" (bl)); \
      } \
  } while (0)
#endif /* ! _LONG_LONG_LIMB */
/* Hardware 64-bit count-leading-zeros. */
#define count_leading_zeros(count, x) \
  __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
#define COUNT_LEADING_ZEROS_0 64
1274 #if __GMP_GNUC_PREREQ (4,4)
1275 #define umul_ppmm(w1, w0, u, v) \
1276 do { \
1277 typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \
1278 __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \
1279 w1 = __ll >> 64; \
1280 w0 = __ll; \
1281 } while (0)
1282 #endif
1283 #if !defined (umul_ppmm)
1284 #define umul_ppmm(ph, pl, m0, m1) \
1285 do { \
1286 UDItype __m0 = (m0), __m1 = (m1); \
1287 __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \
1288 (pl) = __m0 * __m1; \
1289 } while (0)
1290 #endif
1291 #define UMUL_TIME 15
1292 #define smul_ppmm(ph, pl, m0, m1) \
1293 do { \
1294 DItype __m0 = (m0), __m1 = (m1); \
1295 __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \
1296 (pl) = __m0 * __m1; \
1297 } while (0)
1298 #define SMUL_TIME 14 /* ??? */
1299 #define UDIV_TIME 120 /* ??? */
1300 #endif /* 64-bit PowerPC. */
1301
#if defined (__pyr__) && W_TYPE_SIZE == 32
/* Double-word add: addw then addwc (add with carry). */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addw %5,%1\n\taddwc %3,%0" \
	   : "=r" (sh), "=&r" (sl) \
	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
/* Double-word subtract: subw then subwb (subtract with borrow). */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subw %5,%1\n\tsubwb %3,%0" \
	   : "=r" (sh), "=&r" (sl) \
	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
/* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP.  */
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll; \
	   struct {USItype __h, __l;} __i; \
	  } __x; \
  __asm__ ("movw %1,%R0\n\tuemul %2,%0" \
	   : "=&r" (__x.__ll) \
	   : "g" ((USItype) (u)), "g" ((USItype)(v))); \
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#endif /* __pyr__ */
1323
#if defined (__ibm032__) /* RT/ROMP */	&& W_TYPE_SIZE == 32
/* Double-word add: a then ae (add extended, consuming carry). */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("a %1,%5\n\tae %0,%3" \
	   : "=r" (sh), "=&r" (sl) \
	   : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \
	     "%1" ((USItype)(al)), "r" ((USItype)(bl)))
/* Double-word subtract: s then se (subtract extended). */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("s %1,%5\n\tse %0,%3" \
	   : "=r" (sh), "=&r" (sl) \
	   : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \
	     "1" ((USItype)(al)), "r" ((USItype)(bl)))
/* ROMP has no full multiply insn; sixteen "m" (multiply-step) instructions
   develop a 32x32 product, using r2 and the MQ register (r10). */
#define smul_ppmm(ph, pl, m0, m1) \
  __asm__ ( \
       "s	r2,r2\n" \
"	mts r10,%2\n" \
"	m	r2,%3\n" \
"	m	r2,%3\n" \
"	m	r2,%3\n" \
"	m	r2,%3\n" \
"	m	r2,%3\n" \
"	m	r2,%3\n" \
"	m	r2,%3\n" \
"	m	r2,%3\n" \
"	m	r2,%3\n" \
"	m	r2,%3\n" \
"	m	r2,%3\n" \
"	m	r2,%3\n" \
"	m	r2,%3\n" \
"	m	r2,%3\n" \
"	m	r2,%3\n" \
"	m	r2,%3\n" \
"	cas	%0,r2,r0\n" \
"	mfs	r10,%1" \
	   : "=r" (ph), "=r" (pl) \
	   : "%r" ((USItype)(m0)), "r" ((USItype)(m1)) \
	   : "r2")
#define UMUL_TIME 20
#define UDIV_TIME 200
/* clz works on a halfword here: count the high half directly, or count
   the low half and add 16. */
#define count_leading_zeros(count, x) \
  do { \
    if ((x) >= 0x10000) \
      __asm__ ("clz	%0,%1" \
	       : "=r" (count) : "r" ((USItype)(x) >> 16)); \
    else \
      { \
	__asm__ ("clz	%0,%1" \
		 : "=r" (count) : "r" ((USItype)(x))); \
	(count) += 16; \
      } \
  } while (0)
#endif /* RT/ROMP */
1375
#if defined (__sh2__) && W_TYPE_SIZE == 32
/* dmulu.l leaves the 64-bit product in mach:macl; sts moves it out. */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("dmulu.l %2,%3\n\tsts	macl,%1\n\tsts	mach,%0" \
	   : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")
#define UMUL_TIME 5
#endif
1382
#if defined (__sparc__) && W_TYPE_SIZE == 32
/* Double-word add: addcc sets carry, addx consumes it. */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0" \
	   : "=r" (sh), "=&r" (sl) \
	   : "rJ" (ah), "rI" (bh),"%rJ" (al), "rI" (bl) \
	   __CLOBBER_CC)
/* Double-word subtract with subcc/subx borrow chain. */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0" \
	   : "=r" (sh), "=&r" (sl) \
	   : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl) \
	   __CLOBBER_CC)
/* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h
   doesn't define anything to indicate that to us, it only sets __sparcv8.  */
#if defined (__sparc_v9__) || defined (__sparcv9)
/* Perhaps we should use floating-point operations here?  */
#if 0
/* Triggers a bug making mpz/tests/t-gcd.c fail.
   Perhaps we simply need explicitly zero-extend the inputs?  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" : \
	   "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1")
#else
/* Use v8 umul until above bug is fixed.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
#endif
/* Use a plain v8 divide for v9.  The nops cover the %y write latency. */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { \
    USItype __q; \
    __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \
	     : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \
    (r) = (n0) - __q * (d); \
    (q) = __q; \
  } while (0)
#else
#if defined (__sparc_v8__)   /* gcc normal */ \
  || defined (__sparcv8)     /* gcc solaris */ \
  || HAVE_HOST_CPU_supersparc
/* Don't match immediate range because, 1) it is not often useful,
   2) the 'I' flag thinks of the range as a 13 bit signed interval,
   while we want to match a 13 bit interval, sign extended to 32 bits,
   but INTERPRETED AS UNSIGNED.  */
/* v8 hardware umul: low word in %1, high word read from %y. */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
#define UMUL_TIME 5

#if HAVE_HOST_CPU_supersparc
#define UDIV_TIME 60		/* SuperSPARC timing */
#else
/* Don't use this on SuperSPARC because its udiv only handles 53 bit
   dividends and will trap to the kernel for the rest.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { \
    USItype __q; \
    __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \
	     : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \
    (r) = (n0) - __q * (d); \
    (q) = __q; \
  } while (0)
#define UDIV_TIME 25
#endif /* HAVE_HOST_CPU_supersparc */
1445
#else /* ! __sparc_v8__ */
#if defined (__sparclite__)
/* This has hardware multiply but not divide.  It also has two additional
   instructions scan (ffs from high bit) and divscc.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
#define UMUL_TIME 5
/* 64/32 divide built from 32 divscc (divide-step) instructions; the final
   branch corrects a negative remainder by adding back the divisor. */
#define udiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("! Inlined udiv_qrnnd\n" \
"	wr	%%g0,%2,%%y	! Not a delayed write for sparclite\n" \
"	tst	%%g0\n" \
"	divscc	%3,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%%g1\n" \
"	divscc	%%g1,%4,%0\n" \
"	rd	%%y,%1\n" \
"	bl,a 1f\n" \
"	add	%1,%4,%1\n" \
"1:	! End of inline udiv_qrnnd" \
	   : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d) \
	   : "%g1" __AND_CLOBBER_CC)
#define UDIV_TIME 37
/* scan finds the first set bit from the top. */
#define count_leading_zeros(count, x) \
  __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x))
/* Early sparclites return 63 for an argument of 0, but they warn that future
   implementations might change this.  Therefore, leave COUNT_LEADING_ZEROS_0
   undefined.  */
#endif /* __sparclite__ */
#endif /* __sparc_v8__ */
#endif /* __sparc_v9__ */
/* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd.  */
#ifndef umul_ppmm
/* 32x32->64 from 32+1 mulscc (multiply-step) instructions; the initial
   sra/and pair prepares a correction term for a "negative" multiplier,
   added in at the end.  High word in %0, low word read from %y. */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("! Inlined umul_ppmm\n" \
"	wr	%%g0,%2,%%y	! SPARC has 0-3 delay insn after a wr\n" \
"	sra	%3,31,%%g2	! Don't move this insn\n" \
"	and	%2,%%g2,%%g2	! Don't move this insn\n" \
"	andcc	%%g0,0,%%g1	! Don't move this insn\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,%3,%%g1\n" \
"	mulscc	%%g1,0,%%g1\n" \
"	add	%%g1,%%g2,%0\n" \
"	rd	%%y,%1" \
	   : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v) \
	   : "%g1", "%g2" __AND_CLOBBER_CC)
#define UMUL_TIME 39		/* 39 instructions */
#endif
#ifndef udiv_qrnnd
#ifndef LONGLONG_STANDALONE
/* No usable divide insn: call GMP's out-of-line udiv_qrnnd routine. */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __r; \
    (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \
    (r) = __r; \
  } while (0)
extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype));
#ifndef UDIV_TIME
#define UDIV_TIME 140
#endif
#endif /* LONGLONG_STANDALONE */
#endif /* udiv_qrnnd */
#endif /* __sparc__ */
1565
#if defined (__sparc__) && W_TYPE_SIZE == 64
/* 64-bit add built from 32-bit condition codes: the middle addccc on the
   shifted-down halves regenerates the carry out of the full low word. */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ( \
       "addcc	%r4,%5,%1\n" \
      "	addccc	%r6,%7,%%g0\n" \
      "	addc	%r2,%3,%0" \
	  : "=r" (sh), "=&r" (sl) \
	  : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl), \
	    "%rJ" ((al) >> 32), "rI" ((bl) >> 32) \
	   __CLOBBER_CC)
/* Matching 64-bit subtract using subcc/subccc/subc. */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ( \
       "subcc	%r4,%5,%1\n" \
      "	subccc	%r6,%7,%%g0\n" \
      "	subc	%r2,%3,%0" \
	  : "=r" (sh), "=&r" (sl) \
	  : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl), \
	    "rJ" ((al) >> 32), "rI" ((bl) >> 32) \
	   __CLOBBER_CC)
#endif
1586
#if defined (__vax__) && W_TYPE_SIZE == 32
/* Double-word add: addl2 then adwc (add with carry). */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addl2 %5,%1\n\tadwc %3,%0" \
	   : "=g" (sh), "=&g" (sl) \
	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
/* Double-word subtract: subl2 then sbwc (subtract with carry). */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subl2 %5,%1\n\tsbwc %3,%0" \
	   : "=g" (sh), "=&g" (sl) \
	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
/* Signed 32x32->64 via emul (the $0 addend keeps it a pure product). */
#define smul_ppmm(xh, xl, m0, m1) \
  do { \
    union {UDItype __ll; \
	   struct {USItype __l, __h;} __i; \
	  } __x; \
    USItype __m0 = (m0), __m1 = (m1); \
    __asm__ ("emul %1,%2,$0,%0" \
	     : "=g" (__x.__ll) : "g" (__m0), "g" (__m1)); \
    (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
  } while (0)
/* Signed 64/32 divide via ediv: quotient to %0, remainder to %1. */
#define sdiv_qrnnd(q, r, n1, n0, d) \
  do { \
    union {DItype __ll; \
	   struct {SItype __l, __h;} __i; \
	  } __x; \
    __x.__i.__h = n1; __x.__i.__l = n0; \
    __asm__ ("ediv %3,%2,%0,%1" \
	     : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d)); \
  } while (0)
#if 0
/* FIXME: This instruction appears to be unimplemented on some systems (vax
   8800 maybe). */
#define count_trailing_zeros(count,x) \
  do { \
    __asm__ ("ffs 0, 31, %1, %0" \
	     : "=g" (count) \
	     : "g" ((USItype) (x))); \
  } while (0)
#endif
#endif /* __vax__ */
1628
#if defined (__z8000__) && W_TYPE_SIZE == 16
/* Double-word (16+16) add: add then adc on the high halves. */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add	%H1,%H5\n\tadc	%H0,%H3" \
	   : "=r" (sh), "=&r" (sl) \
	   : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \
	     "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
/* Double-word subtract: sub then sbc. */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub	%H1,%H5\n\tsbc	%H0,%H3" \
	   : "=r" (sh), "=&r" (sl) \
	   : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \
	     "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
1640 #define umul_ppmm(xh, xl, m0, m1) \
1641 do { \
1642 union {long int __ll; \
1643 struct {unsigned int __h, __l;} __i; \
1644 } __x; \
1645 unsigned int __m0 = (m0), __m1 = (m1); \
1646 __asm__ ("mult %S0,%H3" \
1647 : "=r" (__x.__i.__h), "=r" (__x.__i.__l) \
1648 : "%1" (m0), "rQR" (m1)); \
1649 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
1650 (xh) += ((((signed int) __m0 >> 15) & __m1) \
1651 + (((signed int) __m1 >> 15) & __m0)); \
1652 } while (0)
1653 #endif /* __z8000__ */
1654
1655 #endif /* __GNUC__ */
1656
1657 #endif /* NO_ASM */
1658
1659
#if !defined (umul_ppmm) && defined (__umulsidi3)
/* Build umul_ppmm from __umulsidi3.  Wrapped in do { } while (0) so the
   macro expands to a single statement; the original bare { } block broke
   when used as "umul_ppmm (...);" in an if/else (the trailing ";" became
   a separate empty statement). */
#define umul_ppmm(ph, pl, m0, m1) \
  do { \
    UDWtype __ll = __umulsidi3 (m0, m1); \
    ph = (UWtype) (__ll >> W_TYPE_SIZE); \
    pl = (UWtype) __ll; \
  } while (0)
#endif
1668
#if !defined (__umulsidi3)
/* Build __umulsidi3 from umul_ppmm: join the two half-products into one
   double-word value (statement-expression, gcc extension). */
#define __umulsidi3(u, v) \
  ({UWtype __hi, __lo; \
    umul_ppmm (__hi, __lo, u, v); \
    ((UDWtype) __hi << W_TYPE_SIZE) | __lo; })
#endif
1675
1676
/* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist.  The "_r"
   forms have "reversed" arguments, meaning the pointer is last, which
   sometimes allows better parameter passing, in particular on 64-bit
   hppa. */

#define mpn_umul_ppmm  __MPN(umul_ppmm)
extern UWtype mpn_umul_ppmm _PROTO ((UWtype *, UWtype, UWtype));

/* Out-of-line umul: returns the high word, stores the low word through
   the pointer. */
#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm  \
  && ! defined (LONGLONG_STANDALONE)
#define umul_ppmm(wh, wl, u, v)					      \
  do {								      \
    UWtype __umul_ppmm__p0;					      \
    (wh) = mpn_umul_ppmm (&__umul_ppmm__p0, (UWtype) (u), (UWtype) (v)); \
    (wl) = __umul_ppmm__p0;					      \
  } while (0)
#endif

#define mpn_umul_ppmm_r  __MPN(umul_ppmm_r)
extern UWtype mpn_umul_ppmm_r _PROTO ((UWtype, UWtype, UWtype *));

/* Reversed-argument variant: pointer last. */
#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r	\
  && ! defined (LONGLONG_STANDALONE)
#define umul_ppmm(wh, wl, u, v)					      \
  do {								      \
    UWtype __umul_ppmm__p0;					      \
    (wh) = mpn_umul_ppmm_r ((UWtype) (u), (UWtype) (v), &__umul_ppmm__p0); \
    (wl) = __umul_ppmm__p0;					      \
  } while (0)
#endif
1707
#define mpn_udiv_qrnnd  __MPN(udiv_qrnnd)
extern UWtype mpn_udiv_qrnnd _PROTO ((UWtype *, UWtype, UWtype, UWtype));

/* Out-of-line udiv: returns the quotient, stores the remainder through
   the pointer. */
#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd	\
  && ! defined (LONGLONG_STANDALONE)
#define udiv_qrnnd(q, r, n1, n0, d)				      \
  do {								      \
    UWtype __udiv_qrnnd__r;					      \
    (q) = mpn_udiv_qrnnd (&__udiv_qrnnd__r,			      \
			  (UWtype) (n1), (UWtype) (n0), (UWtype) d);  \
    (r) = __udiv_qrnnd__r;					      \
  } while (0)
#endif

#define mpn_udiv_qrnnd_r  __MPN(udiv_qrnnd_r)
extern UWtype mpn_udiv_qrnnd_r _PROTO ((UWtype, UWtype, UWtype, UWtype *));

/* Reversed-argument variant: remainder pointer last. */
#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r	\
  && ! defined (LONGLONG_STANDALONE)
#define udiv_qrnnd(q, r, n1, n0, d)				      \
  do {								      \
    UWtype __udiv_qrnnd__r;					      \
    (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) d, \
			    &__udiv_qrnnd__r);			      \
    (r) = __udiv_qrnnd__r;					      \
  } while (0)
#endif
1735
1736
1737 /* If this machine has no inline assembler, use C macros. */
1738
/* Generic C add_ssaaaa: add the two-limb numbers (ah,al) and (bh,bl),
   giving (sh,sl).  Carry out of the high limb is lost.  sh is written
   before sl, so the macro still works when sl aliases an input limb.
   Fix: the original evaluated the macro argument al twice (once to form
   the sum, once for the carry test), a hazard for arguments with side
   effects; cache it in a local so every argument is evaluated once.  */
#if !defined (add_ssaaaa)
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do { \
    UWtype __al, __x; \
    __al = (al); \
    __x = __al + (bl); \
    (sh) = (ah) + (bh) + (__x < __al);	/* __x < __al detects the carry */ \
    (sl) = __x; \
  } while (0)
#endif
1748
/* Generic C sub_ddmmss: subtract the two-limb number (bh,bl) from
   (ah,al), giving (sh,sl).  Borrow out of the high limb is lost.  sh is
   written before sl, so the macro still works when sl aliases an input
   limb.
   Fix: the original evaluated the macro arguments al and bl twice each
   (once for the difference, once for the borrow test), a hazard for
   arguments with side effects; cache them so each is evaluated once.  */
#if !defined (sub_ddmmss)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do { \
    UWtype __al, __bl; \
    __al = (al); \
    __bl = (bl); \
    (sh) = (ah) - (bh) - (__al < __bl);	/* __al < __bl is the borrow */ \
    (sl) = __al - __bl; \
  } while (0)
#endif
1758
/* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
   smul_ppmm.

   The two products differ only in the high word: an operand whose top
   bit is set is worth 2^W_TYPE_SIZE more under the unsigned reading than
   under the signed one, so the unsigned high word is the signed high
   word plus v when u is "negative" and plus u when v is "negative"
   (mod 2^W_TYPE_SIZE).  -(x >> (W_TYPE_SIZE - 1)) is an all-ones mask
   exactly when the top bit of x is set, making each correction term
   branch-free.  The low word w0 comes straight from smul_ppmm.  */
#if !defined (umul_ppmm) && defined (smul_ppmm)
#define umul_ppmm(w1, w0, u, v)						\
  do {									\
    UWtype __w1;							\
    UWtype __xm0 = (u), __xm1 = (v);	/* evaluate args once */	\
    smul_ppmm (__w1, w0, __xm0, __xm1);					\
    (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)		\
      + (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);			\
  } while (0)
#endif
1771
1772 /* If we still don't have umul_ppmm, define it using plain C.
1773
1774 For reference, when this code is used for squaring (ie. u and v identical
1775 expressions), gcc recognises __x1 and __x2 are the same and generates 3
1776 multiplies, not 4. The subsequent additions could be optimized a bit,
1777 but the only place GMP currently uses such a square is mpn_sqr_basecase,
1778 and chips obliged to use this generic C umul will have plenty of worse
1779 performance problems than a couple of extra instructions on the diagonal
1780 of sqr_basecase. */
1781
#if !defined (umul_ppmm)
/* Plain C umul_ppmm: full double-limb product of u and v by schoolbook
   multiplication of half-limbs.  Four partial products are formed, the
   two "middle" ones are summed with explicit carry detection, and the
   result is recombined into the high limb w1 and low limb w0.  */
#define umul_ppmm(w1, w0, u, v)						\
  do {									\
    UWtype __p00, __p01, __p10, __p11;					\
    UHWtype __ulo, __vlo, __uhi, __vhi;					\
    UWtype __uu = (u), __vv = (v);	/* evaluate args once */	\
									\
    __ulo = __ll_lowpart (__uu);					\
    __uhi = __ll_highpart (__uu);					\
    __vlo = __ll_lowpart (__vv);					\
    __vhi = __ll_highpart (__vv);					\
									\
    __p00 = (UWtype) __ulo * __vlo;	/* low * low */			\
    __p01 = (UWtype) __ulo * __vhi;	/* low * high */		\
    __p10 = (UWtype) __uhi * __vlo;	/* high * low */		\
    __p11 = (UWtype) __uhi * __vhi;	/* high * high */		\
									\
    __p01 += __ll_highpart (__p00);	/* cannot overflow */		\
    __p01 += __p10;			/* but this add can */		\
    if (__p01 < __p10)			/* carried out of the middle? */ \
      __p11 += __ll_B;			/* propagate into the top */	\
									\
    (w1) = __p11 + __ll_highpart (__p01);				\
    (w0) = (__p01 << W_TYPE_SIZE/2) + __ll_lowpart (__p00);		\
  } while (0)
#endif
1808
/* If we don't have smul_ppmm, define it using umul_ppmm (which surely
   will exist in one form or another).

   This inverts the umul-from-smul identity above: the signed high word
   is the unsigned high word minus v when u is "negative" and minus u
   when v is "negative" (mod 2^W_TYPE_SIZE).  -(x >> (W_TYPE_SIZE - 1))
   is an all-ones mask exactly when the top bit of x is set.  The low
   word w0 comes straight from umul_ppmm.  */
#if !defined (smul_ppmm)
#define smul_ppmm(w1, w0, u, v)						\
  do {									\
    UWtype __w1;							\
    UWtype __xm0 = (u), __xm1 = (v);	/* evaluate args once */	\
    umul_ppmm (__w1, w0, __xm0, __xm1);					\
    (w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)		\
      - (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);			\
  } while (0)
#endif
1821
/* Define this unconditionally, so it can be used for debugging.

   Generic two-limb by one-limb unsigned division: divides (n1,n0) by d,
   storing the quotient in q and the remainder in r.  Requires n1 < d so
   that the quotient fits in a single limb (ASSERTed below).

   Works half-limb at a time, schoolbook style: a trial quotient half
   __q1 (then __q0) is formed against the high half of d, followed by at
   most two rounds of "decrement and add back" correction when the trial
   was too large (cf. the classical algorithm in Knuth, Seminumerical
   Algorithms, section 4.3.1).

   NOTE(review): when this macro is installed as udiv_qrnnd below,
   UDIV_NEEDS_NORMALIZATION is set to 1, i.e. callers are expected to
   pass d shifted so its most significant bit is set.  */
#define __udiv_qrnnd_c(q, r, n1, n0, d) \
  do { \
    UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m; \
 \
    ASSERT ((d) != 0); \
    ASSERT ((n1) < (d)); \
 \
    __d1 = __ll_highpart (d); \
    __d0 = __ll_lowpart (d); \
 \
    __q1 = (n1) / __d1;			/* trial high quotient half */ \
    __r1 = (n1) - __q1 * __d1; \
    __m = __q1 * __d0; \
    __r1 = __r1 * __ll_B | __ll_highpart (n0); \
    if (__r1 < __m)			/* trial too large?  add back */ \
      { \
	__q1--, __r1 += (d); \
	if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
	  if (__r1 < __m) \
	    __q1--, __r1 += (d); \
      } \
    __r1 -= __m; \
 \
    __q0 = __r1 / __d1;			/* trial low quotient half */ \
    __r0 = __r1 - __q0 * __d1; \
    __m = __q0 * __d0; \
    __r0 = __r0 * __ll_B | __ll_lowpart (n0); \
    if (__r0 < __m)			/* trial too large?  add back */ \
      { \
	__q0--, __r0 += (d); \
	if (__r0 >= (d)) \
	  if (__r0 < __m) \
	    __q0--, __r0 += (d); \
      } \
    __r0 -= __m; \
 \
    (q) = __q1 * __ll_B | __q0;		/* join the quotient halves */ \
    (r) = __r0; \
  } while (0)
1862
/* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through
   __udiv_w_sdiv (defined in libgcc or elsewhere).  That support routine
   builds the unsigned 2x1 division out of the signed one; this wrapper
   merely adapts its return-quotient / store-remainder convention to
   udiv_qrnnd's two output arguments.  */
#if !defined (udiv_qrnnd) && defined (sdiv_qrnnd)
#define udiv_qrnnd(q, r, nh, nl, d) \
  do { \
    UWtype __r;				/* receives the remainder */ \
    (q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d); \
    (r) = __r; \
  } while (0)
#endif
1873
/* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c.
   The generic code wants the divisor pre-shifted so its high bit is set,
   hence UDIV_NEEDS_NORMALIZATION is forced to 1 here (the default set
   further down is 0).  */
#if !defined (udiv_qrnnd)
#define UDIV_NEEDS_NORMALIZATION 1
#define udiv_qrnnd __udiv_qrnnd_c
#endif
1879
#if !defined (count_leading_zeros)
/* Generic count_leading_zeros: sets count to the number of zero bits
   above the highest set bit of x.  First the highest non-zero 8-bit
   chunk is located -- by a compare tree on 32-bit words (__BITS4 is
   W_TYPE_SIZE/4, i.e. 8), or by a byte-scan loop for other sizes --
   leaving __a as a shift that brings that chunk to the bottom; then a
   table lookup finishes the job.

   NOTE(review): the final formula relies on __clz_tab[i] holding
   (bit length of i) + 1, with __clz_tab[0] = 1, so that
   W_TYPE_SIZE + 1 - __a - __clz_tab[x >> __a] is the leading zero
   count, and x == 0 gives W_TYPE_SIZE - 1 (COUNT_LEADING_ZEROS_0);
   confirm against mp_clz_tab.c.  */
#define count_leading_zeros(count, x) \
  do { \
    UWtype __xr = (x); \
    UWtype __a; \
 \
    if (W_TYPE_SIZE == 32) \
      { \
	__a = __xr < ((UWtype) 1 << 2*__BITS4) \
	  ? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1) \
	  : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 + 1 \
	  : 3*__BITS4 + 1); \
      } \
    else \
      { \
	for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8) \
	  if (((__xr >> __a) & 0xff) != 0) \
	    break; \
	++__a; \
      } \
 \
    (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a]; \
  } while (0)
/* This version gives a well-defined value for zero. */
#define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1)
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#endif

/* clz_tab needed by mpn/x86/pentium/mod_1.asm in a fat binary */
#if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#endif

/* Lookup table used by the generic count_leading_zeros above (and by
   some asm code); defined elsewhere in GMP.  */
#ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
extern const unsigned char __GMP_DECLSPEC __clz_tab[128];
#endif
1916
#if !defined (count_trailing_zeros)
/* Define count_trailing_zeros using count_leading_zeros.  The latter might be
   defined in asm, but if it is not, the C version above is good enough.
   x & -x isolates the lowest set bit of x; its position from the bottom
   is W_TYPE_SIZE - 1 minus its leading zero count.  x must be non-zero
   (ASSERTed).  */
#define count_trailing_zeros(count, x)					\
  do {									\
    UWtype __ctz_n = (x);						\
    UWtype __ctz_clz;							\
    ASSERT (__ctz_n != 0);						\
    count_leading_zeros (__ctz_clz, __ctz_n & -__ctz_n);		\
    (count) = W_TYPE_SIZE - 1 - __ctz_clz;				\
  } while (0)
#endif
1929
/* Default: udiv_qrnnd takes an arbitrary divisor.  It is set to 1 (above)
   only when the generic __udiv_qrnnd_c, which wants a normalized
   divisor, is installed.  */
#ifndef UDIV_NEEDS_NORMALIZATION
#define UDIV_NEEDS_NORMALIZATION 0
#endif

/* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, in
   which case the latter should always be used.  */
#ifndef UDIV_PREINV_ALWAYS
#define UDIV_PREINV_ALWAYS 0
#endif

/* Give defaults for UMUL_TIME and UDIV_TIME.  NOTE(review): these look
   like relative cost estimates consulted by code choosing between
   multiply- and divide-based strategies -- confirm against users of
   these constants.  */
#ifndef UMUL_TIME
#define UMUL_TIME 1
#endif

#ifndef UDIV_TIME
#define UDIV_TIME UMUL_TIME
#endif
1948