/* longlong.h, revision 1.1 (imported GMP source) */
      1 /* longlong.h -- definitions for mixed size 32/64 bit arithmetic.
      2 
      3 Copyright 1991, 1992, 1993, 1994, 1996, 1997, 1999, 2000, 2001, 2002, 2003,
      4 2004, 2005, 2007, 2008, 2009 Free Software Foundation, Inc.
      5 
      6 This file is free software; you can redistribute it and/or modify it under the
      7 terms of the GNU Lesser General Public License as published by the Free
      8 Software Foundation; either version 3 of the License, or (at your option) any
      9 later version.
     10 
     11 This file is distributed in the hope that it will be useful, but WITHOUT ANY
     12 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
     13 PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
     14 details.
     15 
     16 You should have received a copy of the GNU Lesser General Public License
     17 along with this file.  If not, see http://www.gnu.org/licenses/.  */
     18 
     19 /* You have to define the following before including this file:
     20 
     21    UWtype -- An unsigned type, default type for operations (typically a "word")
     22    UHWtype -- An unsigned type, at least half the size of UWtype
   UDWtype -- An unsigned type, at least twice as large as UWtype
     24    W_TYPE_SIZE -- size in bits of UWtype
     25 
     26    SItype, USItype -- Signed and unsigned 32 bit types
     27    DItype, UDItype -- Signed and unsigned 64 bit types
     28 
     29    On a 32 bit machine UWtype should typically be USItype;
     30    on a 64 bit machine, UWtype should typically be UDItype.
     31 
     32    Optionally, define:
     33 
     34    LONGLONG_STANDALONE -- Avoid code that needs machine-dependent support files
     35    NO_ASM -- Disable inline asm
     36 
     37 
     38    CAUTION!  Using this version of longlong.h outside of GMP is not safe.  You
     39    need to include gmp.h and gmp-impl.h, or certain things might not work as
     40    expected.
     41 */
     42 
/* Half-word helpers: __ll_B is the half-word base 2^(W_TYPE_SIZE/2);
   __ll_lowpart and __ll_highpart extract the low and high halves of t.  */
#define __BITS4 (W_TYPE_SIZE / 4)
#define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
#define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
#define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
     47 
/* This is used to make sure no undesirable sharing between different libraries
   that use this file takes place.  Each library can override __MPN to apply
   its own prefix to the external routine names used below.  */
#ifndef __MPN
#define __MPN(x) __##x
#endif
     53 
/* _PROTO(x) expands to a full prototype parameter list under ISO C or C++,
   and to an empty (K&R style) parameter list otherwise.  */
#ifndef _PROTO
#if (__STDC__-0) || defined (__cplusplus)
#define _PROTO(x) x
#else
#define _PROTO(x) ()
#endif
#endif
     61 
     62 /* Define auxiliary asm macros.
     63 
     64    1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
     65    UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
     66    word product in HIGH_PROD and LOW_PROD.
     67 
     68    2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
     69    UDWtype product.  This is just a variant of umul_ppmm.
     70 
     71    3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
     72    denominator) divides a UDWtype, composed by the UWtype integers
     73    HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
     74    in QUOTIENT and the remainder in REMAINDER.  HIGH_NUMERATOR must be less
   than DENOMINATOR for correct operation.  If, in addition, the operation
   requires the most significant bit of DENOMINATOR to be 1, then the
   pre-processor symbol UDIV_NEEDS_NORMALIZATION is defined to 1.
     78 
     79    4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
     80    denominator).  Like udiv_qrnnd but the numbers are signed.  The quotient
     81    is rounded towards 0.
     82 
     83    5) count_leading_zeros(count, x) counts the number of zero-bits from the
     84    msb to the first non-zero bit in the UWtype X.  This is the number of
     85    steps X needs to be shifted left to set the msb.  Undefined for X == 0,
     86    unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.
     87 
     88    6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
     89    from the least significant end.
     90 
     91    7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
     92    high_addend_2, low_addend_2) adds two UWtype integers, composed by
     93    HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
     94    respectively.  The result is placed in HIGH_SUM and LOW_SUM.  Overflow
     95    (i.e. carry out) is not stored anywhere, and is lost.
     96 
     97    8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
     98    high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
   composed by HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
   LOW_SUBTRAHEND respectively.  The result is placed in HIGH_DIFFERENCE
    101    and LOW_DIFFERENCE.  Overflow (i.e. carry out) is not stored anywhere,
    102    and is lost.
    103 
    104    If any of these macros are left undefined for a particular CPU,
    105    C macros are used.
    106 
    107 
    108    Notes:
    109 
    110    For add_ssaaaa the two high and two low addends can both commute, but
    111    unfortunately gcc only supports one "%" commutative in each asm block.
    112    This has always been so but is only documented in recent versions
    113    (eg. pre-release 3.3).  Having two or more "%"s can cause an internal
    114    compiler error in certain rare circumstances.
    115 
    116    Apparently it was only the last "%" that was ever actually respected, so
    117    the code has been updated to leave just that.  Clearly there's a free
    118    choice whether high or low should get it, if there's a reason to favour
    119    one over the other.  Also obviously when the constraints on the two
    120    operands are identical there's no benefit to the reloader in any "%" at
    121    all.
    122 
    123    */
    124 
    125 /* The CPUs come in alphabetical order below.
    126 
    127    Please add support for more CPUs here, or improve the current support
    128    for the CPUs below!  */
    129 
    130 
    131 /* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc
    132    3.4 __builtin_clzl or __builtin_clzll, according to our limb size.
    133    Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or
    134    __builtin_ctzll.
    135 
    136    These builtins are only used when we check what code comes out, on some
    137    chips they're merely libgcc calls, where we will instead want an inline
    138    in that case (either asm or generic C).
    139 
    140    These builtins are better than an asm block of the same insn, since an
    141    asm block doesn't give gcc any information about scheduling or resource
    142    usage.  We keep an asm block for use on prior versions of gcc though.
    143 
    144    For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but
    145    it's not used (for count_leading_zeros) because it generally gives extra
    146    code to ensure the result is 0 when the input is 0, which we don't need
    147    or want.  */
    148 
/* count_leading_zeros_gcc_clz: pick the gcc builtin matching the limb
   type -- __builtin_clzll for long long limbs, __builtin_clzl otherwise.
   The builtins are undefined for x == 0, hence the ASSERT.  */
#ifdef _LONG_LONG_LIMB
#define count_leading_zeros_gcc_clz(count,x)    \
  do {                                          \
    ASSERT ((x) != 0);                          \
    (count) = __builtin_clzll (x);              \
  } while (0)
#else
#define count_leading_zeros_gcc_clz(count,x)    \
  do {                                          \
    ASSERT ((x) != 0);                          \
    (count) = __builtin_clzl (x);               \
  } while (0)
#endif

/* count_trailing_zeros_gcc_ctz: the same limb-size selection for the
   __builtin_ctzll / __builtin_ctzl builtins.  */
#ifdef _LONG_LONG_LIMB
#define count_trailing_zeros_gcc_ctz(count,x)   \
  do {                                          \
    ASSERT ((x) != 0);                          \
    (count) = __builtin_ctzll (x);              \
  } while (0)
#else
#define count_trailing_zeros_gcc_ctz(count,x)   \
  do {                                          \
    ASSERT ((x) != 0);                          \
    (count) = __builtin_ctzl (x);               \
  } while (0)
#endif


/* FIXME: The macros using external routines like __MPN(count_leading_zeros)
   don't need to be under !NO_ASM */
#if ! defined (NO_ASM)
    181 
#if defined (__alpha) && W_TYPE_SIZE == 64
/* Most alpha-based machines, except Cray systems. */
#if defined (__GNUC__)
#if __GMP_GNUC_PREREQ (3,3)
/* High product via the gcc builtin, low product in plain C.  */
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    UDItype __m0 = (m0), __m1 = (m1);					\
    (ph) = __builtin_alpha_umulh (__m0, __m1);				\
    (pl) = __m0 * __m1;							\
  } while (0)
#else
/* High product via the umulh instruction.  Pass the __m0/__m1
   temporaries to the asm operands too, so the macro arguments are
   evaluated exactly once.  */
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    UDItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("umulh %r1,%2,%0"						\
	     : "=r" (ph)						\
	     : "%rJ" (__m0), "rI" (__m1));				\
    (pl) = __m0 * __m1;							\
  } while (0)
#endif
#define UMUL_TIME 18
#else /* ! __GNUC__ */
#include <machine/builtins.h>
/* Pass the single-evaluation temporaries to __UMULH, not the raw macro
   arguments, so side effects in the arguments happen only once.  */
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    UDItype __m0 = (m0), __m1 = (m1);					\
    (ph) = __UMULH (__m0, __m1);					\
    (pl) = __m0 * __m1;							\
  } while (0)
#endif
#ifndef LONGLONG_STANDALONE
/* Division via a precomputed inverse of the normalized divisor.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di;							\
    __di = __MPN(invert_limb) (d);					\
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
  } while (0)
#define UDIV_PREINV_ALWAYS  1
#define UDIV_NEEDS_NORMALIZATION 1
#define UDIV_TIME 220
#endif /* LONGLONG_STANDALONE */

/* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm
   always goes into libgmp.so, even when not actually used.  */
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB

#if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX
#define count_leading_zeros(COUNT,X) \
  __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X))
#define count_trailing_zeros(COUNT,X) \
  __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X))
#endif /* clz/ctz using cix */

#if ! defined (count_leading_zeros)                             \
  && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE)
/* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0.
   "$31" is written explicitly in the asm, since an "r" constraint won't
   select reg 31.  There seems no need to worry about "r31" syntax for cray,
   since gcc itself (pre-release 3.4) emits just $31 in various places.  */
#define ALPHA_CMPBGE_0(dst, src)                                        \
  do { asm ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0)
/* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts
   them, locating the highest non-zero byte.  A second __clz_tab lookup
   counts the leading zero bits in that byte, giving the result.  */
#define count_leading_zeros(count, x)                                   \
  do {                                                                  \
    UWtype  __clz__b, __clz__c, __clz__x = (x);                         \
    ALPHA_CMPBGE_0 (__clz__b,  __clz__x);           /* zero bytes */    \
    __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F];  /* 8 to 1 byte */   \
    __clz__b = __clz__b * 8 - 7;                    /* 57 to 1 shift */ \
    __clz__x >>= __clz__b;                                              \
    __clz__c = __clz_tab [__clz__x];                /* 8 to 1 bit */    \
    __clz__b = 65 - __clz__b;                                           \
    (count) = __clz__b - __clz__c;                                      \
  } while (0)
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#endif /* clz using cmpbge */

#if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE)
#if HAVE_ATTRIBUTE_CONST
long __MPN(count_leading_zeros) _PROTO ((UDItype)) __attribute__ ((const));
#else
long __MPN(count_leading_zeros) _PROTO ((UDItype));
#endif
#define count_leading_zeros(count, x) \
  ((count) = __MPN(count_leading_zeros) (x))
#endif /* clz using mpn */
#endif /* __alpha */
    269 
#if defined (_CRAY) && W_TYPE_SIZE == 64
#include <intrinsics.h>
#define UDIV_PREINV_ALWAYS  1
#define UDIV_NEEDS_NORMALIZATION 1
#define UDIV_TIME 220
long __MPN(count_leading_zeros) _PROTO ((UDItype));
#define count_leading_zeros(count, x) \
  ((count) = _leadz ((UWtype) (x)))
#if defined (_CRAYIEEE)		/* I.e., Cray T90/ieee, T3D, and T3E */
/* Pass the single-evaluation temporaries to _int_mult_upper, not the raw
   macro arguments, so side effects in the arguments happen only once.  */
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    UDItype __m0 = (m0), __m1 = (m1);					\
    (ph) = _int_mult_upper (__m0, __m1);				\
    (pl) = __m0 * __m1;							\
  } while (0)
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di;							\
    __di = __MPN(invert_limb) (d);					\
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
  } while (0)
#endif /* LONGLONG_STANDALONE */
#endif /* _CRAYIEEE */
#endif /* _CRAY */
    294 
#if defined (__ia64) && W_TYPE_SIZE == 64
/* This form encourages gcc (pre-release 3.4 at least) to emit predicated
   "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency.  The generic
   code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
   register, which takes an extra cycle.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl)      \
  do {                                          \
    UWtype __x;                                 \
    __x = (al) - (bl);                          \
    if ((al) < (bl))                            \
      (sh) = (ah) - (bh) - 1;                   \
    else                                        \
      (sh) = (ah) - (bh);                       \
    (sl) = __x;                                 \
  } while (0)
#if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
/* Do both product parts in assembly, since that gives better code with
   all gcc versions.  Some callers will just use the upper part, and in
   that situation we waste an instruction, but not any cycles.  */
#define umul_ppmm(ph, pl, m0, m1) \
    __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0"		\
	     : "=&f" (ph), "=f" (pl)					\
	     : "f" (m0), "f" (m1))
#define UMUL_TIME 14
#define count_leading_zeros(count, x) \
  do {									\
    UWtype _x = (x), _y, _a, _c;					\
    __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x));		\
    __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y));		\
    _c = (_a - 1) << 3;							\
    _x >>= _c;								\
    if (_x >= 1 << 4)							\
      _x >>= 4, _c += 4;						\
    if (_x >= 1 << 2)							\
      _x >>= 2, _c += 2;						\
    _c += _x >> 1;							\
    (count) =  W_TYPE_SIZE - 1 - _c;					\
  } while (0)
/* similar to what gcc does for __builtin_ffs, but 0 based rather than 1
   based, and we don't need a special case for x==0 here */
#define count_trailing_zeros(count, x)					\
  do {									\
    UWtype __ctz_x = (x);						\
    __asm__ ("popcnt %0 = %1"						\
	     : "=r" (count)						\
	     : "r" ((__ctz_x-1) & ~__ctz_x));				\
  } while (0)
#endif
#if defined (__INTEL_COMPILER)
#include <ia64intrin.h>
/* Parenthesize the ph/pl output parameters, standard macro hygiene and
   consistent with every other umul_ppmm variant in this file.  */
#define umul_ppmm(ph, pl, m0, m1)					\
  do {									\
    UWtype _m0 = (m0), _m1 = (m1);					\
    (ph) = _m64_xmahu (_m0, _m1, 0);					\
    (pl) = _m0 * _m1;							\
  } while (0)
#endif
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di;							\
    __di = __MPN(invert_limb) (d);					\
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
  } while (0)
#define UDIV_PREINV_ALWAYS  1
#define UDIV_NEEDS_NORMALIZATION 1
#endif
#define UDIV_TIME 220
#endif
    363 
    364 
#if defined (__GNUC__)

/* We sometimes need to clobber "cc" with gcc2, but that would not be
   understood by gcc1.  Use cpp to avoid major code duplication.
   __CLOBBER_CC starts an asm clobber list; __AND_CLOBBER_CC appends to
   an existing one.  */
#if __GNUC__ < 2
#define __CLOBBER_CC
#define __AND_CLOBBER_CC
#else /* __GNUC__ >= 2 */
#define __CLOBBER_CC : "cc"
#define __AND_CLOBBER_CC , "cc"
#endif /* __GNUC__ < 2 */
    376 
/* AMD 29K family.  umul_ppmm uses separate multiplu/multmu insns for the
   low and high product words, on the __m0/__m1 temporaries so the macro
   arguments are evaluated once.  */
#if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3"				\
	   : "=r" (sh), "=&r" (sl)					\
	   : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3"				\
	   : "=r" (sh), "=&r" (sl)					\
	   : "r" (ah), "rI" (bh), "r" (al), "rI" (bl))
#define umul_ppmm(xh, xl, m0, m1) \
  do {									\
    USItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("multiplu %0,%1,%2"					\
	     : "=r" (xl)						\
	     : "r" (__m0), "r" (__m1));					\
    __asm__ ("multmu %0,%1,%2"						\
	     : "=r" (xh)						\
	     : "r" (__m0), "r" (__m1));					\
  } while (0)
#define udiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("dividu %0,%3,%4"						\
	   : "=r" (q), "=q" (r)						\
	   : "1" (n1), "r" (n0), "r" (d))
#define count_leading_zeros(count, x) \
    __asm__ ("clz %0,%1"						\
	     : "=r" (count)						\
	     : "r" (x))
#define COUNT_LEADING_ZEROS_0 32
#endif /* __a29k__ */
    406 
/* ARC.  add.f/adc and sub.f/sbc carry-propagating pairs.
   NOTE(review): unlike the other 32-bit entries this section doesn't test
   W_TYPE_SIZE == 32 -- confirm that's intentional.  */
#if defined (__arc__)
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3"			\
	   : "=r" (sh),							\
	     "=&r" (sl)							\
	   : "r"  ((USItype) (ah)),					\
	     "rIJ" ((USItype) (bh)),					\
	     "%r" ((USItype) (al)),					\
	     "rIJ" ((USItype) (bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3"			\
	   : "=r" (sh),							\
	     "=&r" (sl)							\
	   : "r" ((USItype) (ah)),					\
	     "rIJ" ((USItype) (bh)),					\
	     "r" ((USItype) (al)),					\
	     "rIJ" ((USItype) (bl)))
#endif
    425 
/* 32-bit ARM.  */
#if defined (__arm__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3"			\
	   : "=r" (sh), "=&r" (sl)					\
	   : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
/* Pick the subs/rsbs and sbc/rsc combination whose "I" immediate
   constraints line up with whichever operands are compile-time
   constants, so constants can go in immediates rather than registers.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {									\
    if (__builtin_constant_p (al))					\
      {									\
	if (__builtin_constant_p (ah))					\
	  __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"		\
		   : "=r" (sh), "=&r" (sl)				\
		   : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
	else								\
	  __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3"		\
		   : "=r" (sh), "=&r" (sl)				\
		   : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
      }									\
    else if (__builtin_constant_p (ah))					\
      {									\
	if (__builtin_constant_p (bl))					\
	  __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2"		\
		   : "=r" (sh), "=&r" (sl)				\
		   : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
	else								\
	  __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"		\
		   : "=r" (sh), "=&r" (sl)				\
		   : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
      }									\
    else if (__builtin_constant_p (bl))					\
      {									\
	if (__builtin_constant_p (bh))					\
	  __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"		\
		   : "=r" (sh), "=&r" (sl)				\
		   : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
	else								\
	  __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2"		\
		   : "=r" (sh), "=&r" (sl)				\
		   : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
      }									\
    else /* only bh might be a constant */				\
      __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"			\
	       : "=r" (sh), "=&r" (sl)					\
	       : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC);\
    } while (0)
/* NOTE(review): the "1 ||" forces the widening-multiply path for every
   ARM, making the pre-`M'-series fallback below dead code -- confirm.  */
#if 1 || defined (__arm_m__)	/* `M' series has widening multiply support */
#define umul_ppmm(xh, xl, a, b) \
  __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
#define UMUL_TIME 5
#define smul_ppmm(xh, xl, a, b) \
  __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di;							\
    __di = __MPN(invert_limb) (d);					\
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
  } while (0)
#define UDIV_PREINV_ALWAYS  1
#define UDIV_NEEDS_NORMALIZATION 1
#define UDIV_TIME 70
#endif /* LONGLONG_STANDALONE */
#else
/* No widening multiply insn: build the 64-bit product from 16x16
   partial products in r0-r2.  */
#define umul_ppmm(xh, xl, a, b) \
  __asm__ ("%@ Inlined umul_ppmm\n"					\
"	mov	%|r0, %2, lsr #16\n"					\
"	mov	%|r2, %3, lsr #16\n"					\
"	bic	%|r1, %2, %|r0, lsl #16\n"				\
"	bic	%|r2, %3, %|r2, lsl #16\n"				\
"	mul	%1, %|r1, %|r2\n"					\
"	mul	%|r2, %|r0, %|r2\n"					\
"	mul	%|r1, %0, %|r1\n"					\
"	mul	%0, %|r0, %0\n"						\
"	adds	%|r1, %|r2, %|r1\n"					\
"	addcs	%0, %0, #65536\n"					\
"	adds	%1, %1, %|r1, lsl #16\n"				\
"	adc	%0, %0, %|r1, lsr #16"					\
	   : "=&r" (xh), "=r" (xl)					\
	   : "r" (a), "r" (b)						\
	   : "r0", "r1", "r2")
#define UMUL_TIME 20
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __r;							\
    (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));			\
    (r) = __r;								\
  } while (0)
extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype));
#define UDIV_TIME 200
#endif /* LONGLONG_STANDALONE */
#endif
#if defined (__ARM_ARCH_5__)
/* This actually requires arm 5 */
#define count_leading_zeros(count, x) \
  __asm__ ("clz\t%0, %1" : "=r" (count) : "r" (x))
#define COUNT_LEADING_ZEROS_0 32
#endif
#endif /* __arm__ */
    523 
/* Clipper.  mulwux/mulwx produce a double-word unsigned/signed product;
   the union splits it into 32-bit low/high halves.  */
#if defined (__clipper__) && W_TYPE_SIZE == 32
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll;						\
	   struct {USItype __l, __h;} __i;				\
	  } __x;							\
  __asm__ ("mulwux %2,%0"						\
	   : "=r" (__x.__ll)						\
	   : "%0" ((USItype)(u)), "r" ((USItype)(v)));			\
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#define smul_ppmm(w1, w0, u, v) \
  ({union {DItype __ll;							\
	   struct {SItype __l, __h;} __i;				\
	  } __x;							\
  __asm__ ("mulwx %2,%0"						\
	   : "=r" (__x.__ll)						\
	   : "%0" ((SItype)(u)), "r" ((SItype)(v)));			\
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#define __umulsidi3(u, v) \
  ({UDItype __w;							\
    __asm__ ("mulwux %2,%0"						\
	     : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v)));	\
    __w; })
#endif /* __clipper__ */
    547 
/* Fujitsu vector computers.  mult.lu/mult.l give the full unsigned/signed
   double-word product; note the union struct puts the high word first.  */
#if defined (__uxp__) && W_TYPE_SIZE == 32
#define umul_ppmm(ph, pl, u, v) \
  do {									\
    union {UDItype __ll;						\
	   struct {USItype __h, __l;} __i;				\
	  } __x;							\
    __asm__ ("mult.lu %1,%2,%0"	: "=r" (__x.__ll) : "%r" (u), "rK" (v));\
    (ph) = __x.__i.__h;							\
    (pl) = __x.__i.__l;							\
  } while (0)
#define smul_ppmm(ph, pl, u, v) \
  do {									\
    union {UDItype __ll;						\
	   struct {USItype __h, __l;} __i;				\
	  } __x;							\
    __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));	\
    (ph) = __x.__i.__h;							\
    (pl) = __x.__i.__l;							\
  } while (0)
#endif
    569 
/* Gmicro.  add.w/addx and sub.w/subx are carry-propagating pairs;
   count_leading_zeros seeds the bsch/1 bit search with a 0 count via the
   "0" ((USItype)0) tied operand.  */
#if defined (__gmicro__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add.w %5,%1\n\taddx %3,%0"					\
	   : "=g" (sh), "=&g" (sl)					\
	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub.w %5,%1\n\tsubx %3,%0"					\
	   : "=g" (sh), "=&g" (sl)					\
	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
#define umul_ppmm(ph, pl, m0, m1) \
  __asm__ ("mulx %3,%0,%1"						\
	   : "=g" (ph), "=r" (pl)					\
	   : "%0" ((USItype)(m0)), "g" ((USItype)(m1)))
#define udiv_qrnnd(q, r, nh, nl, d) \
  __asm__ ("divx %4,%0,%1"						\
	   : "=g" (q), "=r" (r)						\
	   : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d)))
#define count_leading_zeros(count, x) \
  __asm__ ("bsch/1 %1,%0"						\
	   : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0))
#endif
    593 
/* HP PA-RISC, 32-bit.  */
#if defined (__hppa) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0"			\
	   : "=r" (sh), "=&r" (sl)					\
	   : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0"			\
	   : "=r" (sh), "=&r" (sl)					\
	   : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
#if defined (_PA_RISC1_1)
/* PA 1.1 has xmpyu in the FPU; the product lands in an FP register pair,
   split via the union.  */
#define umul_ppmm(wh, wl, u, v) \
  do {									\
    union {UDItype __ll;						\
	   struct {USItype __h, __l;} __i;				\
	  } __x;							\
    __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v));	\
    (wh) = __x.__i.__h;							\
    (wl) = __x.__i.__l;							\
  } while (0)
#define UMUL_TIME 8
#define UDIV_TIME 60
#else
#define UMUL_TIME 40
#define UDIV_TIME 80
#endif
/* Binary-search count_leading_zeros using extru bit-field extracts; the
   per-line asm comments below describe each step.  */
#define count_leading_zeros(count, x) \
  do {									\
    USItype __tmp;							\
    __asm__ (								\
       "ldi		1,%0\n"						\
"	extru,=		%1,15,16,%%r0	; Bits 31..16 zero?\n"		\
"	extru,tr	%1,15,16,%1	; No.  Shift down, skip add.\n"	\
"	ldo		16(%0),%0	; Yes.  Perform add.\n"		\
"	extru,=		%1,23,8,%%r0	; Bits 15..8 zero?\n"		\
"	extru,tr	%1,23,8,%1	; No.  Shift down, skip add.\n"	\
"	ldo		8(%0),%0	; Yes.  Perform add.\n"		\
"	extru,=		%1,27,4,%%r0	; Bits 7..4 zero?\n"		\
"	extru,tr	%1,27,4,%1	; No.  Shift down, skip add.\n"	\
"	ldo		4(%0),%0	; Yes.  Perform add.\n"		\
"	extru,=		%1,29,2,%%r0	; Bits 3..2 zero?\n"		\
"	extru,tr	%1,29,2,%1	; No.  Shift down, skip add.\n"	\
"	ldo		2(%0),%0	; Yes.  Perform add.\n"		\
"	extru		%1,30,1,%1	; Extract bit 1.\n"		\
"	sub		%0,%1,%0	; Subtract it.\n"		\
	: "=r" (count), "=r" (__tmp) : "1" (x));			\
  } while (0)
#endif /* hppa */
    641 
/* These macros are for ABI=2.0w.  In ABI=2.0n they can't be used, since GCC
   (3.2) puts longlong into two adjacent 32-bit registers.  Presumably this
   is just a case of no direct support for 2.0n but treating it like 1.0.
   add,dc / sub,db are the 64-bit carry/borrow-propagating forms.  */
#if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB)
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0"			\
	   : "=r" (sh), "=&r" (sl)					\
	   : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0"			\
	   : "=r" (sh), "=&r" (sl)					\
	   : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
#endif /* hppa */
    655 
/* IBM 370/390, 32-bit.  mr gives a 64-bit signed product in an even/odd
   register pair; dr divides that pair, leaving the remainder in the high
   half and the quotient in the low half (see the union field order).  */
#if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
#define smul_ppmm(xh, xl, m0, m1) \
  do {									\
    union {DItype __ll;							\
	   struct {USItype __h, __l;} __i;				\
	  } __x;							\
    __asm__ ("lr %N0,%1\n\tmr %0,%2"					\
	     : "=&r" (__x.__ll)						\
	     : "r" (m0), "r" (m1));					\
    (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
  } while (0)
#define sdiv_qrnnd(q, r, n1, n0, d) \
  do {									\
    union {DItype __ll;							\
	   struct {USItype __h, __l;} __i;				\
	  } __x;							\
    __x.__i.__h = n1; __x.__i.__l = n0;					\
    __asm__ ("dr %0,%2"							\
	     : "=r" (__x.__ll)						\
	     : "0" (__x.__ll), "r" (d));				\
    (q) = __x.__i.__l; (r) = __x.__i.__h;				\
  } while (0)
#endif
    679 
/* x86, 32-bit.  mull leaves the 64-bit product in edx:eax ("=d"/"=a");
   divl divides edx:eax by the operand -- per the udiv_qrnnd contract
   above, n1 < d is required or the insn faults on overflow.  */
#if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addl %5,%k1\n\tadcl %3,%k0"					\
	   : "=r" (sh), "=&r" (sl)					\
	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subl %5,%k1\n\tsbbl %3,%k0"					\
	   : "=r" (sh), "=&r" (sl)					\
	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mull %3"							\
	   : "=a" (w0), "=d" (w1)					\
	   : "%0" ((USItype)(u)), "rm" ((USItype)(v)))
#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
  __asm__ ("divl %4"		     /* stringification in K&R C */	\
	   : "=a" (q), "=d" (r)						\
	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx)))
    699 
    700 #if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx
    701 /* Pentium bsrl takes between 10 and 72 cycles depending where the most
    702    significant 1 bit is, hence the use of the following alternatives.  bsfl
    703    is slow too, between 18 and 42 depending where the least significant 1
    704    bit is, so let the generic count_trailing_zeros below make use of the
    705    count_leading_zeros here too.  */
    706 
    707 #if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE)
    708 /* The following should be a fixed 14 or 15 cycles, but possibly plus an L1
    709    cache miss reading from __clz_tab.  For P55 it's favoured over the float
    710    below so as to avoid mixing MMX and x87, since the penalty for switching
    711    between the two is about 100 cycles.
    712 
    713    The asm block sets __shift to -3 if the high 24 bits are clear, -2 for
    714    16, -1 for 8, or 0 otherwise.  This could be written equivalently as
    715    follows, but as of gcc 2.95.2 it results in conditional jumps.
    716 
    717        __shift = -(__n < 0x1000000);
    718        __shift -= (__n < 0x10000);
    719        __shift -= (__n < 0x100);
    720 
    721    The middle two sbbl and cmpl's pair, and with luck something gcc
    722    generates might pair with the first cmpl and the last sbbl.  The "32+1"
    723    constant could be folded into __clz_tab[], but it doesn't seem worth
    724    making a different table just for that.  */
    725 
    726 #define count_leading_zeros(c,n)					\
    727   do {									\
    728     USItype  __n = (n);							\
    729     USItype  __shift;							\
    730     __asm__ ("cmpl  $0x1000000, %1\n"					\
    731 	     "sbbl  %0, %0\n"						\
    732 	     "cmpl  $0x10000, %1\n"					\
    733 	     "sbbl  $0, %0\n"						\
    734 	     "cmpl  $0x100, %1\n"					\
    735 	     "sbbl  $0, %0\n"						\
    736 	     : "=&r" (__shift) : "r"  (__n));				\
    737     __shift = __shift*8 + 24 + 1;					\
    738     (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift];			\
    739   } while (0)
    740 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
    741 #define COUNT_LEADING_ZEROS_0   31   /* n==0 indistinguishable from n==1 */
    742 
    743 #else /* ! pentiummmx || LONGLONG_STANDALONE */
    744 /* The following should be a fixed 14 cycles or so.  Some scheduling
    745    opportunities should be available between the float load/store too.  This
    746    sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is
    747    apparently suggested by the Intel optimizing manual (don't know exactly
    748    where).  gcc 2.95 or up will be best for this, so the "double" is
    749    correctly aligned on the stack.  */
    750 #define count_leading_zeros(c,n)					\
    751   do {									\
    752     union {								\
    753       double    d;							\
    754       unsigned  a[2];							\
    755     } __u;								\
    756     ASSERT ((n) != 0);							\
    757     __u.d = (UWtype) (n);						\
    758     (c) = 0x3FF + 31 - (__u.a[1] >> 20);				\
    759   } while (0)
    760 #define COUNT_LEADING_ZEROS_0   (0x3FF + 31)
    761 #endif /* pentiummx */
    762 
    763 #else /* ! pentium */
    764 
    765 #if __GMP_GNUC_PREREQ (3,4)  /* using bsrl */
    766 #define count_leading_zeros(count,x)  count_leading_zeros_gcc_clz(count,x)
    767 #endif /* gcc clz */
    768 
    769 /* On P6, gcc prior to 3.0 generates a partial register stall for
    770    __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former
    771    being 1 code byte smaller.  "31-__cbtmp" is a workaround, probably at the
    772    cost of one extra instruction.  Do this for "i386" too, since that means
    773    generic x86.  */
    774 #if ! defined (count_leading_zeros) && __GNUC__ < 3                     \
    775   && (HAVE_HOST_CPU_i386						\
    776       || HAVE_HOST_CPU_i686						\
    777       || HAVE_HOST_CPU_pentiumpro					\
    778       || HAVE_HOST_CPU_pentium2						\
    779       || HAVE_HOST_CPU_pentium3)
    780 #define count_leading_zeros(count, x)					\
    781   do {									\
    782     USItype __cbtmp;							\
    783     ASSERT ((x) != 0);							\
    784     __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));	\
    785     (count) = 31 - __cbtmp;						\
    786   } while (0)
    787 #endif /* gcc<3 asm bsrl */
    788 
    789 #ifndef count_leading_zeros
    790 #define count_leading_zeros(count, x)					\
    791   do {									\
    792     USItype __cbtmp;							\
    793     ASSERT ((x) != 0);							\
    794     __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));	\
    795     (count) = __cbtmp ^ 31;						\
    796   } while (0)
    797 #endif /* asm bsrl */
    798 
    799 #if __GMP_GNUC_PREREQ (3,4)  /* using bsfl */
    800 #define count_trailing_zeros(count,x)  count_trailing_zeros_gcc_ctz(count,x)
    801 #endif /* gcc ctz */
    802 
    803 #ifndef count_trailing_zeros
    804 #define count_trailing_zeros(count, x)					\
    805   do {									\
    806     ASSERT ((x) != 0);							\
    807     __asm__ ("bsfl %1,%k0" : "=r" (count) : "rm" ((USItype)(x)));	\
    808   } while (0)
    809 #endif /* asm bsfl */
    810 
    811 #endif /* ! pentium */
    812 
    813 #ifndef UMUL_TIME
    814 #define UMUL_TIME 10
    815 #endif
    816 #ifndef UDIV_TIME
    817 #define UDIV_TIME 40
    818 #endif
    819 #endif /* 80x86 */
    820 
#if defined (__amd64__) && W_TYPE_SIZE == 64
/* 128-bit add/sub via the carry flag; "rme" also allows 32-bit sign
   extended immediates.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addq %5,%q1\n\tadcq %3,%q0"					\
	   : "=r" (sh), "=&r" (sl)					\
	   : "0"  ((UDItype)(ah)), "rme" ((UDItype)(bh)),		\
	     "%1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subq %5,%q1\n\tsbbq %3,%q0"					\
	   : "=r" (sh), "=&r" (sl)					\
	   : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)),		\
	     "1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
/* mulq produces the 128-bit product in rdx:rax.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mulq %3"							\
	   : "=a" (w0), "=d" (w1)					\
	   : "%0" ((UDItype)(u)), "rm" ((UDItype)(v)))
#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
  __asm__ ("divq %4"		     /* stringification in K&R C */	\
	   : "=a" (q), "=d" (r)						\
	   : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx)))
/* bsrq destination must be a 64-bit register, hence UDItype for __cbtmp. */
#define count_leading_zeros(count, x)					\
  do {									\
    UDItype __cbtmp;							\
    ASSERT ((x) != 0);							\
    __asm__ ("bsrq %1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x)));	\
    (count) = __cbtmp ^ 63;						\
  } while (0)
/* bsfq destination must be a 64-bit register, "%q0" forces this in case
   count is only an int. */
#define count_trailing_zeros(count, x)					\
  do {									\
    ASSERT ((x) != 0);							\
    __asm__ ("bsfq %1,%q0" : "=r" (count) : "rm" ((UDItype)(x)));	\
  } while (0)
#endif /* x86_64 */
    856 
#if defined (__i860__) && W_TYPE_SIZE == 32
/* Double-word right shift: r = low word of {h,l} >> c.
   Fixed: the original was missing the ':' separating the asm template
   from the output operand, so the template and "=r" were concatenated
   as string literals and the expansion could not compile.  */
#define rshift_rhlc(r,h,l,c) \
  __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0"				\
	   : "=r" (r) : "r" (h), "r" (l), "rn" (c))
#endif /* i860 */
    862 
#if defined (__i960__) && W_TYPE_SIZE == 32
/* cmpo presets the condition code so the first addc/subc sees a clean
   carry/borrow before chaining through the high words.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0"			\
	   : "=r" (sh), "=&r" (sl)					\
	   : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0"			\
	   : "=r" (sh), "=&r" (sl)					\
	   : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl))
/* emul: 32x32->64 into a register pair, overlaid by the union.  */
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll;						\
	   struct {USItype __l, __h;} __i;				\
	  } __x;							\
  __asm__ ("emul %2,%1,%0"						\
	   : "=d" (__x.__ll) : "%dI" (u), "dI" (v));			\
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#define __umulsidi3(u, v) \
  ({UDItype __w;							\
    __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v));	\
    __w; })
/* ediv: 64/32 extended divide.  Fixed: declare the result union __rq,
   which the original used without declaring, so the macro could never
   compile when expanded.
   NOTE(review): the template "ediv %d,%n,%0" looks suspicious (%d and
   %n are not numbered operand references) -- confirm against an i960
   gcc before relying on this path.  */
#define udiv_qrnnd(q, r, nh, nl, d) \
  do {									\
    union {UDItype __ll;						\
	   struct {USItype __l, __h;} __i;				\
	  } __nn, __rq;							\
    __nn.__i.__h = (nh); __nn.__i.__l = (nl);				\
    __asm__ ("ediv %d,%n,%0"						\
	   : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d));		\
    (r) = __rq.__i.__l; (q) = __rq.__i.__h;				\
  } while (0)
/* scanbit returns the index of the most significant set bit; xor with
   31 converts that to a leading-zero count.  */
#define count_leading_zeros(count, x) \
  do {									\
    USItype __cbtmp;							\
    __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x));		\
    (count) = __cbtmp ^ 31;						\
  } while (0)
#define COUNT_LEADING_ZEROS_0 (-32) /* sic */
#if defined (__i960mx)		/* what is the proper symbol to test??? */
/* Fixed: the do-block was missing its "while (0)", which would break a
   use of this macro followed by ';' inside an if/else.  */
#define rshift_rhlc(r,h,l,c) \
  do {									\
    union {UDItype __ll;						\
	   struct {USItype __l, __h;} __i;				\
	  } __nn;							\
    __nn.__i.__h = (h); __nn.__i.__l = (l);				\
    __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c));	\
  } while (0)
#endif /* i960mx */
#endif /* i960 */
    911 
#if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \
     || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \
     || defined (__mc5307__)) && W_TYPE_SIZE == 32
/* Two-word add/sub via the extend bit (addx/subx chains the carry).  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0"				\
	   : "=d" (sh), "=&d" (sl)					\
	   : "0"  ((USItype)(ah)), "d" ((USItype)(bh)),			\
	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0"				\
	   : "=d" (sh), "=&d" (sl)					\
	   : "0" ((USItype)(ah)), "d" ((USItype)(bh)),			\
	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
/* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r.  */
#if defined (__mc68020__) || defined(mc68020) \
     || defined (__mc68030__) || defined (mc68030) \
     || defined (__mc68040__) || defined (mc68040) \
     || defined (__mcpu32__) || defined (mcpu32) \
     || defined (__NeXT__)
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mulu%.l %3,%1:%0"						\
	   : "=d" (w0), "=d" (w1)					\
	   : "%0" ((USItype)(u)), "dmi" ((USItype)(v)))
#define UMUL_TIME 45
#define udiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("divu%.l %4,%1:%0"						\
	   : "=d" (q), "=d" (r)						\
	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
#define UDIV_TIME 90
#define sdiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("divs%.l %4,%1:%0"						\
	   : "=d" (q), "=d" (r)						\
	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
#else /* for other 68k family members use 16x16->32 multiplication */
/* Builds the 64-bit product from four 16x16->32 partial products, with
   an explicit carry fix-up (the jcc/add%.l %#0x10000 pair).  */
#define umul_ppmm(xh, xl, a, b) \
  do { USItype __umul_tmp1, __umul_tmp2;				\
	__asm__ ("| Inlined umul_ppmm\n"				\
"	move%.l	%5,%3\n"						\
"	move%.l	%2,%0\n"						\
"	move%.w	%3,%1\n"						\
"	swap	%3\n"							\
"	swap	%0\n"							\
"	mulu%.w	%2,%1\n"						\
"	mulu%.w	%3,%0\n"						\
"	mulu%.w	%2,%3\n"						\
"	swap	%2\n"							\
"	mulu%.w	%5,%2\n"						\
"	add%.l	%3,%2\n"						\
"	jcc	1f\n"							\
"	add%.l	%#0x10000,%0\n"						\
"1:	move%.l	%2,%3\n"						\
"	clr%.w	%2\n"							\
"	swap	%2\n"							\
"	swap	%3\n"							\
"	clr%.w	%3\n"							\
"	add%.l	%3,%1\n"						\
"	addx%.l	%2,%0\n"						\
"	| End inlined umul_ppmm"					\
	      : "=&d" (xh), "=&d" (xl),					\
		"=d" (__umul_tmp1), "=&d" (__umul_tmp2)			\
	      : "%2" ((USItype)(a)), "d" ((USItype)(b)));		\
  } while (0)
#define UMUL_TIME 100
#define UDIV_TIME 400
#endif /* not mc68020 */
/* The '020, '030, '040 and '060 have bitfield insns.
   GCC 3.4 defines __mc68020__ when in CPU32 mode, check for __mcpu32__ to
   exclude bfffo on that chip (bitfield insns not available).  */
#if (defined (__mc68020__) || defined (mc68020)    \
     || defined (__mc68030__) || defined (mc68030) \
     || defined (__mc68040__) || defined (mc68040) \
     || defined (__mc68060__) || defined (mc68060) \
     || defined (__NeXT__))                        \
  && ! defined (__mcpu32__)
#define count_leading_zeros(count, x) \
  __asm__ ("bfffo %1{%b2:%b2},%0"					\
	   : "=d" (count)						\
	   : "od" ((USItype) (x)), "n" (0))
#define COUNT_LEADING_ZEROS_0 32
#endif
#endif /* mc68000 */
    993 
#if defined (__m88000__) && W_TYPE_SIZE == 32
/* Two-word add/sub with carry-out/carry-in (addu.co/addu.ci).  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3"			\
	   : "=r" (sh), "=&r" (sl)					\
	   : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3"			\
	   : "=r" (sh), "=&r" (sl)					\
	   : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl))
/* ff1 finds the highest set bit; xor with 31 gives the zero count.  */
#define count_leading_zeros(count, x) \
  do {									\
    USItype __cbtmp;							\
    __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x));			\
    (count) = __cbtmp ^ 31;						\
  } while (0)
#define COUNT_LEADING_ZEROS_0 63 /* sic */
#if defined (__m88110__)
#define umul_ppmm(wh, wl, u, v) \
  do {									\
    union {UDItype __ll;						\
	   struct {USItype __h, __l;} __i;				\
	  } __x;							\
    __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v));	\
    (wh) = __x.__i.__h;							\
    (wl) = __x.__i.__l;							\
  } while (0)
/* divu.d yields the 64-bit quotient; the remainder is reconstructed as
   n0 - q*d.  Fixed: the quotient's low word is __q.__i.__l -- the
   original read __q.__l, which is not a member of the union and could
   not compile when this macro was expanded.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  ({union {UDItype __ll;						\
	   struct {USItype __h, __l;} __i;				\
	  } __x, __q;							\
  __x.__i.__h = (n1); __x.__i.__l = (n0);				\
  __asm__ ("divu.d %0,%1,%2"						\
	   : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d));		\
  (r) = (n0) - __q.__i.__l * (d); (q) = __q.__i.__l; })
#define UMUL_TIME 5
#define UDIV_TIME 25
#else
#define UMUL_TIME 17
#define UDIV_TIME 150
#endif /* __m88110__ */
#endif /* __m88000__ */
   1035 
#if defined (__mips) && W_TYPE_SIZE == 32
/* gcc >= 4.4 expands a widening 32x32->64 multiply well by itself.  */
#if __GMP_GNUC_PREREQ (4,4)
#define umul_ppmm(w1, w0, u, v) \
  do {									\
    UDItype __ll = (UDItype)(u) * (v);					\
    w1 = __ll >> 32;							\
    w0 = __ll;								\
  } while (0)
#endif
/* Older gcc: read the HI/LO result registers via constraints.  */
#if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7)
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
#endif
/* Fallback: move the result out of HI/LO explicitly.  */
#if !defined (umul_ppmm)
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1"				\
	   : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
#endif
#define UMUL_TIME 10
#define UDIV_TIME 100
#endif /* __mips */
   1057 
#if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
/* gcc >= 4.4: use a 128-bit (TImode) multiply and split the halves.  */
#if __GMP_GNUC_PREREQ (4,4)
#define umul_ppmm(w1, w0, u, v) \
  do {									\
    typedef unsigned int __ll_UTItype __attribute__((mode(TI)));	\
    __ll_UTItype __ll = (__ll_UTItype)(u) * (v);			\
    w1 = __ll >> 64;							\
    w0 = __ll;								\
  } while (0)
#endif
/* Older gcc: 64x64->128 via dmultu, result in HI/LO.  */
#if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7)
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("dmultu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
#endif
#if !defined (umul_ppmm)
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1"				\
	   : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
#endif
#define UMUL_TIME 20
#define UDIV_TIME 140
#endif /* __mips */
   1080 
#if defined (__mmix__) && W_TYPE_SIZE == 64
/* MULU: low product in a general register, high product via the "z"
   constraint (the special high-multiply register).  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("MULU %0,%2,%3" : "=r" (w0), "=z" (w1) : "r" (u), "r" (v))
#endif
   1085 
#if defined (__ns32000__) && W_TYPE_SIZE == 32
/* meid: 32x32->64 extended multiply into the union overlay.  */
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll;						\
	   struct {USItype __l, __h;} __i;				\
	  } __x;							\
  __asm__ ("meid %2,%0"							\
	   : "=g" (__x.__ll)						\
	   : "%0" ((USItype)(u)), "g" ((USItype)(v)));			\
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#define __umulsidi3(u, v) \
  ({UDItype __w;							\
    __asm__ ("meid %2,%0"						\
	     : "=g" (__w)						\
	     : "%0" ((USItype)(u)), "g" ((USItype)(v)));		\
    __w; })
/* deid: 64/32 extended divide; remainder in the low word, quotient in
   the high word of the result pair.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  ({union {UDItype __ll;						\
	   struct {USItype __l, __h;} __i;				\
	  } __x;							\
  __x.__i.__h = (n1); __x.__i.__l = (n0);				\
  __asm__ ("deid %2,%0"							\
	   : "=g" (__x.__ll)						\
	   : "0" (__x.__ll), "g" ((USItype)(d)));			\
  (r) = __x.__i.__l; (q) = __x.__i.__h; })
/* ffsd starts searching from the initial count of 0 in operand 0.  */
#define count_trailing_zeros(count,x) \
  do {									\
    __asm__ ("ffsd	%2,%0"						\
	     : "=r" (count)						\
	     : "0" ((USItype) 0), "r" ((USItype) (x)));			\
  } while (0)
#endif /* __ns32000__ */
   1117 
   1118 /* In the past we had a block of various #defines tested
   1119        _ARCH_PPC    - AIX
   1120        _ARCH_PWR    - AIX
   1121        __powerpc__  - gcc
   1122        __POWERPC__  - BEOS
   1123        __ppc__      - Darwin
   1124        PPC          - old gcc, GNU/Linux, SysV
   1125    The plain PPC test was not good for vxWorks, since PPC is defined on all
   1126    CPUs there (eg. m68k too), as a constant one is expected to compare
   1127    CPU_FAMILY against.
   1128 
   1129    At any rate, this was pretty unattractive and a bit fragile.  The use of
   1130    HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of
   1131    getting the desired effect.
   1132 
   1133    ENHANCE-ME: We should test _IBMR2 here when we add assembly support for
   1134    the system vendor compilers.  (Is that vendor compilers with inline asm,
   1135    or what?)  */
   1136 
#if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc)        \
  && W_TYPE_SIZE == 32
/* The __builtin_constant_p branches pick shorter carry sequences when
   the high word of one operand is a compile-time 0 or ~0 (addze/addme
   instead of a full adde).  The "{old|new}" braces select between POWER
   and PowerPC mnemonics.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do {									\
    if (__builtin_constant_p (bh) && (bh) == 0)				\
      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2"		\
	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)		\
      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2"		\
	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    else								\
      __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3"		\
	     : "=r" (sh), "=&r" (sl)					\
	     : "r" (ah), "r" (bh), "%r" (al), "rI" (bl));		\
  } while (0)
/* Same idea for subtract, special-casing constant 0/~0 in either high
   word.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {									\
    if (__builtin_constant_p (ah) && (ah) == 0)				\
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2"	\
	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0)		\
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2"	\
	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == 0)			\
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2"		\
	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)		\
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2"		\
	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    else								\
      __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2"	\
	       : "=r" (sh), "=&r" (sl)					\
	       : "r" (ah), "r" (bh), "rI" (al), "r" (bl));		\
  } while (0)
#define count_leading_zeros(count, x) \
  __asm__ ("{cntlz|cntlzw} %0,%1" : "=r" (count) : "r" (x))
#define COUNT_LEADING_ZEROS_0 32
   1174 #if HAVE_HOST_CPU_FAMILY_powerpc
   1175 #if __GMP_GNUC_PREREQ (4,4)
   1176 #define umul_ppmm(w1, w0, u, v) \
   1177   do {									\
   1178     UDItype __ll = (UDItype)(u) * (v);					\
   1179     w1 = __ll >> 32;							\
   1180     w0 = __ll;								\
   1181   } while (0)
   1182 #endif
   1183 #if !defined (umul_ppmm)
   1184 #define umul_ppmm(ph, pl, m0, m1) \
   1185   do {									\
   1186     USItype __m0 = (m0), __m1 = (m1);					\
   1187     __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1));	\
   1188     (pl) = __m0 * __m1;							\
   1189   } while (0)
   1190 #endif
   1191 #define UMUL_TIME 15
   1192 #define smul_ppmm(ph, pl, m0, m1) \
   1193   do {									\
   1194     SItype __m0 = (m0), __m1 = (m1);					\
   1195     __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1));	\
   1196     (pl) = __m0 * __m1;							\
   1197   } while (0)
   1198 #define SMUL_TIME 14
   1199 #define UDIV_TIME 120
   1200 #else
   1201 #define UMUL_TIME 8
   1202 #define smul_ppmm(xh, xl, m0, m1) \
   1203   __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1))
   1204 #define SMUL_TIME 4
   1205 #define sdiv_qrnnd(q, r, nh, nl, d) \
   1206   __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d))
   1207 #define UDIV_TIME 100
   1208 #endif
   1209 #endif /* 32-bit POWER architecture variants.  */
   1210 
   1211 /* We should test _IBMR2 here when we add assembly support for the system
   1212    vendor compilers.  */
#if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64
#if !defined (_LONG_LONG_LIMB)
/* _LONG_LONG_LIMB is ABI=mode32 where adde operates on 32-bit values.  So
   use adde etc only when not _LONG_LONG_LIMB.  */
/* As in the 32-bit version: constant 0/~0 high words get the shorter
   addze/addme carry sequences.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do {									\
    if (__builtin_constant_p (bh) && (bh) == 0)				\
      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2"		\
	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)		\
      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2"		\
	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    else								\
      __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3"		\
	     : "=r" (sh), "=&r" (sl)					\
	     : "r" (ah), "r" (bh), "%r" (al), "rI" (bl));		\
  } while (0)
/* We use "*rI" for the constant operand here, since with just "I", gcc barfs.
   This might seem strange, but gcc folds away the dead code late.  */
/* The first branch handles a small-constant bl by negating it into an
   addic immediate; otherwise fall back to subfc sequences.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {									      \
    if (__builtin_constant_p (bl) && bl > -0x8000 && bl <= 0x8000) {	      \
	if (__builtin_constant_p (ah) && (ah) == 0)			      \
	  __asm__ ("{ai|addic} %1,%3,%4\n\t{sfze|subfze} %0,%2"		      \
		   : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "*rI" (-bl)); \
	else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)	      \
	  __asm__ ("{ai|addic} %1,%3,%4\n\t{sfme|subfme} %0,%2"		      \
		   : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "*rI" (-bl)); \
	else if (__builtin_constant_p (bh) && (bh) == 0)		      \
	  __asm__ ("{ai|addic} %1,%3,%4\n\t{ame|addme} %0,%2"		      \
		   : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "*rI" (-bl)); \
	else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)	      \
	  __asm__ ("{ai|addic} %1,%3,%4\n\t{aze|addze} %0,%2"		      \
		   : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "*rI" (-bl)); \
	else								      \
	  __asm__ ("{ai|addic} %1,%4,%5\n\t{sfe|subfe} %0,%3,%2"	      \
		   : "=r" (sh), "=&r" (sl)				      \
		   : "r" (ah), "r" (bh), "rI" (al), "*rI" (-bl));	      \
      } else {								      \
	if (__builtin_constant_p (ah) && (ah) == 0)			      \
	  __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2"	      \
		   : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));  \
	else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)	      \
	  __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2"	      \
		   : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));  \
	else if (__builtin_constant_p (bh) && (bh) == 0)		      \
	  __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2"	      \
		   : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));  \
	else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)	      \
	  __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2"	      \
		   : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));  \
	else								      \
	  __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2"	      \
		   : "=r" (sh), "=&r" (sl)				      \
		   : "r" (ah), "r" (bh), "rI" (al), "r" (bl));		      \
      }									      \
  } while (0)
#endif /* ! _LONG_LONG_LIMB */
#define count_leading_zeros(count, x) \
  __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
#define COUNT_LEADING_ZEROS_0 64
   1274 #if __GMP_GNUC_PREREQ (4,4)
   1275 #define umul_ppmm(w1, w0, u, v) \
   1276   do {									\
   1277     typedef unsigned int __ll_UTItype __attribute__((mode(TI)));	\
   1278     __ll_UTItype __ll = (__ll_UTItype)(u) * (v);			\
   1279     w1 = __ll >> 64;							\
   1280     w0 = __ll;								\
   1281   } while (0)
   1282 #endif
   1283 #if !defined (umul_ppmm)
   1284 #define umul_ppmm(ph, pl, m0, m1) \
   1285   do {									\
   1286     UDItype __m0 = (m0), __m1 = (m1);					\
   1287     __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1));	\
   1288     (pl) = __m0 * __m1;							\
   1289   } while (0)
   1290 #endif
   1291 #define UMUL_TIME 15
   1292 #define smul_ppmm(ph, pl, m0, m1) \
   1293   do {									\
   1294     DItype __m0 = (m0), __m1 = (m1);					\
   1295     __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1));	\
   1296     (pl) = __m0 * __m1;							\
   1297   } while (0)
   1298 #define SMUL_TIME 14  /* ??? */
   1299 #define UDIV_TIME 120 /* ??? */
   1300 #endif /* 64-bit PowerPC.  */
   1301 
#if defined (__pyr__) && W_TYPE_SIZE == 32
/* Two-word add/sub with carry (addw/addwc, subw/subwb).  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addw %5,%1\n\taddwc %3,%0"					\
	   : "=r" (sh), "=&r" (sl)					\
	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subw %5,%1\n\tsubwb %3,%0"					\
	   : "=r" (sh), "=&r" (sl)					\
	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
/* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP.  */
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll;						\
	   struct {USItype __h, __l;} __i;				\
	  } __x;							\
  __asm__ ("movw %1,%R0\n\tuemul %2,%0"					\
	   : "=&r" (__x.__ll)						\
	   : "g" ((USItype) (u)), "g" ((USItype)(v)));			\
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#endif /* __pyr__ */
   1323 
#if defined (__ibm032__) /* RT/ROMP */  && W_TYPE_SIZE == 32
/* Two-word add/sub with carry extend ("a"/"ae", "s"/"se").  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("a %1,%5\n\tae %0,%3"					\
	   : "=r" (sh), "=&r" (sl)					\
	   : "0"  ((USItype)(ah)), "r" ((USItype)(bh)),			\
	     "%1" ((USItype)(al)), "r" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("s %1,%5\n\tse %0,%3"					\
	   : "=r" (sh), "=&r" (sl)					\
	   : "0" ((USItype)(ah)), "r" ((USItype)(bh)),			\
	     "1" ((USItype)(al)), "r" ((USItype)(bl)))
/* ROMP has no full multiply: sixteen "m" multiply steps build the
   32x32->64 product, accumulating through r2 and the MQ register
   (mts/mfs r10).  r2 is clobbered, as declared below.  */
#define smul_ppmm(ph, pl, m0, m1) \
  __asm__ (								\
       "s	r2,r2\n"						\
"	mts r10,%2\n"							\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	cas	%0,r2,r0\n"						\
"	mfs	r10,%1"							\
	   : "=r" (ph), "=r" (pl)					\
	   : "%r" ((USItype)(m0)), "r" ((USItype)(m1))			\
	   : "r2")
#define UMUL_TIME 20
#define UDIV_TIME 200
/* clz only examines 16 bits at a time, so split on the halfword.  */
#define count_leading_zeros(count, x) \
  do {									\
    if ((x) >= 0x10000)							\
      __asm__ ("clz	%0,%1"						\
	       : "=r" (count) : "r" ((USItype)(x) >> 16));		\
    else								\
      {									\
	__asm__ ("clz	%0,%1"						\
		 : "=r" (count) : "r" ((USItype)(x)));			\
	(count) += 16;							\
      }									\
  } while (0)
#endif /* RT/ROMP */
   1375 
#if defined (__sh2__) && W_TYPE_SIZE == 32
/* dmulu.l leaves the 64-bit product in MACH:MACL, moved out with sts;
   both MAC registers are declared clobbered.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0"		\
	   : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")
#define UMUL_TIME 5
#endif
   1382 
#if defined (__sparc__) && W_TYPE_SIZE == 32
/* Two-word add/sub; addcc sets the carry that addx consumes, hence the
   condition-code clobber.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0"				\
	   : "=r" (sh), "=&r" (sl)					\
	   : "rJ" (ah), "rI" (bh),"%rJ" (al), "rI" (bl)			\
	   __CLOBBER_CC)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0"				\
	   : "=r" (sh), "=&r" (sl)					\
	   : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl)	\
	   __CLOBBER_CC)
/* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h
   doesn't define anything to indicate that to us, it only sets __sparcv8. */
#if defined (__sparc_v9__) || defined (__sparcv9)
/* Perhaps we should use floating-point operations here?  */
#if 0
/* Triggers a bug making mpz/tests/t-gcd.c fail.
   Perhaps we simply need explicitly zero-extend the inputs?  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" :		\
	   "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1")
#else
/* Use v8 umul until above bug is fixed.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
#endif
/* Use a plain v8 divide for v9.  */
/* udiv needs the high dividend word in %y; the nops cover the write
   latency before the divide.  Remainder is reconstructed as n0 - q*d.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do {									\
    USItype __q;							\
    __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"			\
	     : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));		\
    (r) = (n0) - __q * (d);						\
    (q) = __q;								\
  } while (0)
   1418 #else
   1419 #if defined (__sparc_v8__)   /* gcc normal */				\
   1420   || defined (__sparcv8)     /* gcc solaris */				\
   1421   || HAVE_HOST_CPU_supersparc
/* Don't match the immediate range because: 1) it is not often useful,
   2) the 'I' flag thinks of the range as a 13 bit signed interval,
   while we want to match a 13 bit interval, sign extended to 32 bits,
   but INTERPRETED AS UNSIGNED.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
#define UMUL_TIME 5

#if HAVE_HOST_CPU_supersparc
#define UDIV_TIME 60		/* SuperSPARC timing */
#else
/* Don't use this on SuperSPARC because its udiv only handles 53 bit
   dividends and will trap to the kernel for the rest. */
/* udiv yields only the quotient; the remainder is recovered as
   n0 - q*d, exact modulo 2^32 because the true remainder is < d.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do {									\
    USItype __q;							\
    __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"			\
	     : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));		\
    (r) = (n0) - __q * (d);						\
    (q) = __q;								\
  } while (0)
#define UDIV_TIME 25
#endif /* HAVE_HOST_CPU_supersparc */
   1445 
   1446 #else /* ! __sparc_v8__ */
#if defined (__sparclite__)
/* This has hardware multiply but not divide.  It also has two additional
   instructions scan (ffs from high bit) and divscc.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
#define UMUL_TIME 5
/* Division built from 32 divscc steps -- apparently one quotient bit
   per step, with the running remainder kept in %y.  The annulled
   "bl,a" adds the divisor back into the remainder when the last step
   left it negative.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("! Inlined udiv_qrnnd\n"					\
"	wr	%%g0,%2,%%y	! Not a delayed write for sparclite\n"	\
"	tst	%%g0\n"							\
"	divscc	%3,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%0\n"						\
"	rd	%%y,%1\n"						\
"	bl,a 1f\n"							\
"	add	%1,%4,%1\n"						\
"1:	! End of inline udiv_qrnnd"					\
	   : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d)		\
	   : "%g1" __AND_CLOBBER_CC)
#define UDIV_TIME 37
#define count_leading_zeros(count, x) \
  __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x))
/* Early sparclites return 63 for an argument of 0, but they warn that future
   implementations might change this.  Therefore, leave COUNT_LEADING_ZEROS_0
   undefined.  */
#endif /* __sparclite__ */
   1502 #endif /* __sparc_v8__ */
   1503 #endif /* __sparc_v9__ */
   1504 /* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd.  */
#ifndef umul_ppmm
/* Classic v7 shift-and-add multiply: one mulscc partial-product step
   per bit of %3 (32 steps, plus a final step with operand 0).  mulscc
   treats %3 as signed, so %g2 = (v < 0 ? u : 0) -- built by the sra/and
   pair -- is added to the high word to turn the signed product into the
   unsigned one.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("! Inlined umul_ppmm\n"					\
"	wr	%%g0,%2,%%y	! SPARC has 0-3 delay insn after a wr\n" \
"	sra	%3,31,%%g2	! Don't move this insn\n"		\
"	and	%2,%%g2,%%g2	! Don't move this insn\n"		\
"	andcc	%%g0,0,%%g1	! Don't move this insn\n"		\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,0,%%g1\n"						\
"	add	%%g1,%%g2,%0\n"						\
"	rd	%%y,%1"							\
	   : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v)			\
	   : "%g1", "%g2" __AND_CLOBBER_CC)
#define UMUL_TIME 39		/* 39 instructions */
#endif
#ifndef udiv_qrnnd
#ifndef LONGLONG_STANDALONE
/* No inline divide available: call the out-of-line mpn routine, which
   returns the quotient and stores the remainder through the pointer.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __r;							\
    (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));			\
    (r) = __r;								\
  } while (0)
extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype));
#ifndef UDIV_TIME
#define UDIV_TIME 140
#endif
#endif /* LONGLONG_STANDALONE */
#endif /* udiv_qrnnd */
   1564 #endif /* __sparc__ */
   1565 
#if defined (__sparc__) && W_TYPE_SIZE == 64
/* 64-bit SPARC double-word add/sub.  The middle addccc/subccc redoes
   the operation on the high 32-bit halves of al/bl, discarding the
   result into %g0, apparently to leave the carry/borrow of the full
   64-bit low-word operation in the condition codes for the final
   addc/subc.  NOTE(review): assumes (al)/(bl) are 64-bit expressions
   so the ">> 32" operands are meaningful -- confirm against callers.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ (								\
       "addcc	%r4,%5,%1\n"						\
      "	addccc	%r6,%7,%%g0\n"						\
      "	addc	%r2,%3,%0"						\
	  : "=r" (sh), "=&r" (sl)					\
	  : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl),		\
	    "%rJ" ((al) >> 32), "rI" ((bl) >> 32)			\
	   __CLOBBER_CC)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ (								\
       "subcc	%r4,%5,%1\n"						\
      "	subccc	%r6,%7,%%g0\n"						\
      "	subc	%r2,%3,%0"						\
	  : "=r" (sh), "=&r" (sl)					\
	  : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl),		\
	    "rJ" ((al) >> 32), "rI" ((bl) >> 32)			\
	   __CLOBBER_CC)
#endif
   1586 
#if defined (__vax__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addl2 %5,%1\n\tadwc %3,%0"					\
	   : "=g" (sh), "=&g" (sl)					\
	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subl2 %5,%1\n\tsbwc %3,%0"					\
	   : "=g" (sh), "=&g" (sl)					\
	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
/* emul produces the signed 64-bit product (with addend $0 here); the
   union splits it into the low (__l) and high (__h) words.  */
#define smul_ppmm(xh, xl, m0, m1) \
  do {									\
    union {UDItype __ll;						\
	   struct {USItype __l, __h;} __i;				\
	  } __x;							\
    USItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("emul %1,%2,$0,%0"						\
	     : "=g" (__x.__ll) : "g" (__m0), "g" (__m1));		\
    (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
  } while (0)
/* ediv divides the 64-bit dividend assembled in __x by d, producing a
   signed quotient and remainder in one instruction.  */
#define sdiv_qrnnd(q, r, n1, n0, d) \
  do {									\
    union {DItype __ll;							\
	   struct {SItype __l, __h;} __i;				\
	  } __x;							\
    __x.__i.__h = n1; __x.__i.__l = n0;					\
    __asm__ ("ediv %3,%2,%0,%1"						\
	     : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d));		\
  } while (0)
#if 0
/* FIXME: This instruction appears to be unimplemented on some systems (vax
   8800 maybe). */
#define count_trailing_zeros(count,x)					\
  do {									\
    __asm__ ("ffs 0, 31, %1, %0"					\
	     : "=g" (count)						\
	     : "g" ((USItype) (x)));					\
  } while (0)
#endif
#endif /* __vax__ */
   1628 
#if defined (__z8000__) && W_TYPE_SIZE == 16
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add	%H1,%H5\n\tadc	%H0,%H3"				\
	   : "=r" (sh), "=&r" (sl)					\
	   : "0"  ((unsigned int)(ah)), "r" ((unsigned int)(bh)),	\
	     "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub	%H1,%H5\n\tsbc	%H0,%H3"				\
	   : "=r" (sh), "=&r" (sl)					\
	   : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)),	\
	     "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
/* Only a signed multiply ("mult") is used here; the final adjustment
   turns the signed high word into the unsigned one by adding
   (m1 if m0's sign bit is set) + (m0 if m1's sign bit is set).
   NOTE(review): the asm inputs use m0/m1 while the fixup uses the
   cached __m0/__m1 -- same values, but worth confirming this asymmetry
   is intentional.  */
#define umul_ppmm(xh, xl, m0, m1) \
  do {									\
    union {long int __ll;						\
	   struct {unsigned int __h, __l;} __i;				\
	  } __x;							\
    unsigned int __m0 = (m0), __m1 = (m1);				\
    __asm__ ("mult	%S0,%H3"					\
	     : "=r" (__x.__i.__h), "=r" (__x.__i.__l)			\
	     : "%1" (m0), "rQR" (m1));					\
    (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
    (xh) += ((((signed int) __m0 >> 15) & __m1)				\
	     + (((signed int) __m1 >> 15) & __m0));			\
  } while (0)
#endif /* __z8000__ */
   1654 
   1655 #endif /* __GNUC__ */
   1656 
   1657 #endif /* NO_ASM */
   1658 
   1659 
#if !defined (umul_ppmm) && defined (__umulsidi3)
/* Build umul_ppmm from a double-word multiply primitive: the high and
   low halves of the UDWtype product become ph and pl.  Wrapped in
   do-while so the macro expands to a single statement and stays safe
   inside unbraced if/else bodies (the previous bare { } form made
   "if (c) umul_ppmm (...); else ..." a syntax error), matching the
   style of every other multi-statement macro in this file.  */
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    UDWtype __ll = __umulsidi3 (m0, m1);				\
    ph = (UWtype) (__ll >> W_TYPE_SIZE);				\
    pl = (UWtype) __ll;							\
  } while (0)
#endif
   1668 
#if !defined (__umulsidi3)
/* Fallback as a GNU statement expression: assemble the double-word
   product from umul_ppmm's two halves.  */
#define __umulsidi3(u, v) \
  ({UWtype __hi, __lo;							\
    umul_ppmm (__hi, __lo, u, v);					\
    ((UDWtype) __hi << W_TYPE_SIZE) | __lo; })
#endif
   1675 
   1676 
   1677 /* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist.  The "_r"
   1678    forms have "reversed" arguments, meaning the pointer is last, which
   1679    sometimes allows better parameter passing, in particular on 64-bit
   1680    hppa. */
   1681 
#define mpn_umul_ppmm  __MPN(umul_ppmm)
extern UWtype mpn_umul_ppmm _PROTO ((UWtype *, UWtype, UWtype));

#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm  \
  && ! defined (LONGLONG_STANDALONE)
/* The mpn function returns the high word and stores the low word
   through its pointer argument.  */
#define umul_ppmm(wh, wl, u, v)						      \
  do {									      \
    UWtype __umul_ppmm__p0;						      \
    (wh) = mpn_umul_ppmm (&__umul_ppmm__p0, (UWtype) (u), (UWtype) (v));      \
    (wl) = __umul_ppmm__p0;						      \
  } while (0)
#endif

#define mpn_umul_ppmm_r  __MPN(umul_ppmm_r)
extern UWtype mpn_umul_ppmm_r _PROTO ((UWtype, UWtype, UWtype *));

#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r	\
  && ! defined (LONGLONG_STANDALONE)
/* "_r" variant: pointer argument last, which the comment above notes
   can give better parameter passing on some ABIs.  */
#define umul_ppmm(wh, wl, u, v)						      \
  do {									      \
    UWtype __umul_ppmm__p0;						      \
    (wh) = mpn_umul_ppmm_r ((UWtype) (u), (UWtype) (v), &__umul_ppmm__p0);    \
    (wl) = __umul_ppmm__p0;						      \
  } while (0)
#endif

#define mpn_udiv_qrnnd  __MPN(udiv_qrnnd)
extern UWtype mpn_udiv_qrnnd _PROTO ((UWtype *, UWtype, UWtype, UWtype));

#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd	\
  && ! defined (LONGLONG_STANDALONE)
/* Returns the quotient; the remainder comes back through the pointer.  */
#define udiv_qrnnd(q, r, n1, n0, d)					\
  do {									\
    UWtype __udiv_qrnnd__r;						\
    (q) = mpn_udiv_qrnnd (&__udiv_qrnnd__r,				\
			  (UWtype) (n1), (UWtype) (n0), (UWtype) d);	\
    (r) = __udiv_qrnnd__r;						\
  } while (0)
#endif

#define mpn_udiv_qrnnd_r  __MPN(udiv_qrnnd_r)
extern UWtype mpn_udiv_qrnnd_r _PROTO ((UWtype, UWtype, UWtype, UWtype *));

#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r	\
  && ! defined (LONGLONG_STANDALONE)
/* "_r" variant of the above, remainder pointer last.  */
#define udiv_qrnnd(q, r, n1, n0, d)					\
  do {									\
    UWtype __udiv_qrnnd__r;						\
    (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) d,	\
			    &__udiv_qrnnd__r);				\
    (r) = __udiv_qrnnd__r;						\
  } while (0)
#endif
   1735 
   1736 
   1737 /* If this machine has no inline assembler, use C macros.  */
   1738 
#if !defined (add_ssaaaa)
/* Generic C double-word add: (sh,sl) = (ah,al) + (bh,bl).  An unsigned
   low-word sum wrapped around exactly when it came out smaller than an
   addend, which gives the carry into the high word.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do {									\
    UWtype __lo_sum;							\
    __lo_sum = (al) + (bl);						\
    (sh) = (ah) + (bh) + (__lo_sum < (al));				\
    (sl) = __lo_sum;							\
  } while (0)
#endif
   1748 
#if !defined (sub_ddmmss)
/* Generic C double-word subtract: (sh,sl) = (ah,al) - (bh,bl).  The
   borrow out of the low word is simply (al) < (bl).  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {									\
    UWtype __lo_diff;							\
    __lo_diff = (al) - (bl);						\
    (sh) = (ah) - (bh) - ((al) < (bl));                                 \
    (sl) = __lo_diff;							\
  } while (0)
#endif
   1758 
   1759 /* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
   1760    smul_ppmm.  */
#if !defined (umul_ppmm) && defined (smul_ppmm)
/* Unsigned product from a signed multiply: the unsigned high word
   exceeds the signed one by (v if u's sign bit is set) plus (u if v's
   sign bit is set); -(x >> (W_TYPE_SIZE-1)) is an all-ones mask when
   the sign bit of x is set, zero otherwise.  The low word is the same
   either way.  */
#define umul_ppmm(w1, w0, u, v)						\
  do {									\
    UWtype __w1;							\
    UWtype __xm0 = (u), __xm1 = (v);					\
    smul_ppmm (__w1, w0, __xm0, __xm1);					\
    (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)		\
		+ (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);		\
  } while (0)
#endif
   1771 
   1772 /* If we still don't have umul_ppmm, define it using plain C.
   1773 
   1774    For reference, when this code is used for squaring (ie. u and v identical
   1775    expressions), gcc recognises __x1 and __x2 are the same and generates 3
   1776    multiplies, not 4.  The subsequent additions could be optimized a bit,
   1777    but the only place GMP currently uses such a square is mpn_sqr_basecase,
   1778    and chips obliged to use this generic C umul will have plenty of worse
   1779    performance problems than a couple of extra instructions on the diagonal
   1780    of sqr_basecase.  */
   1781 
#if !defined (umul_ppmm)
/* Portable schoolbook multiply: split u and v into half-words and
   combine the four half-word products.  __ll_lowpart, __ll_highpart
   and __ll_B come from gmp-impl.h -- from their use here they are
   presumably the low half, the high half, and 2^(W_TYPE_SIZE/2).  */
#define umul_ppmm(w1, w0, u, v)						\
  do {									\
    UWtype __x0, __x1, __x2, __x3;					\
    UHWtype __ul, __vl, __uh, __vh;					\
    UWtype __u = (u), __v = (v);					\
									\
    __ul = __ll_lowpart (__u);						\
    __uh = __ll_highpart (__u);						\
    __vl = __ll_lowpart (__v);						\
    __vh = __ll_highpart (__v);						\
									\
    __x0 = (UWtype) __ul * __vl;					\
    __x1 = (UWtype) __ul * __vh;					\
    __x2 = (UWtype) __uh * __vl;					\
    __x3 = (UWtype) __uh * __vh;					\
									\
    __x1 += __ll_highpart (__x0);/* this can't give carry */		\
    __x1 += __x2;		/* but this indeed can */		\
    if (__x1 < __x2)		/* did we get it? */			\
      __x3 += __ll_B;		/* yes, add it in the proper pos. */	\
									\
    (w1) = __x3 + __ll_highpart (__x1);					\
    (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0);		\
  } while (0)
#endif
   1808 
/* If we don't have smul_ppmm, define it using umul_ppmm (which surely will
   exist in one form or another).  */
#if !defined (smul_ppmm)
/* Signed product from the unsigned one -- the inverse of the fixup in
   umul-from-smul above: subtract (v if u is negative) plus (u if v is
   negative) from the unsigned high word.  The low word is unchanged.  */
#define smul_ppmm(w1, w0, u, v)						\
  do {									\
    UWtype __w1;							\
    UWtype __xm0 = (u), __xm1 = (v);					\
    umul_ppmm (__w1, w0, __xm0, __xm1);					\
    (w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)		\
		- (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);		\
  } while (0)
#endif
   1821 
   1822 /* Define this unconditionally, so it can be used for debugging.  */
/* Knuth-style division developing the quotient as two half-word digits
   __q1,__q0, each followed by at most two correction steps (the nested
   ifs).  Requires (d) != 0 and (n1) < (d), as asserted; it is selected
   together with UDIV_NEEDS_NORMALIZATION below, so callers are expected
   to normalize d first.  */
#define __udiv_qrnnd_c(q, r, n1, n0, d) \
  do {									\
    UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m;			\
									\
    ASSERT ((d) != 0);							\
    ASSERT ((n1) < (d));						\
									\
    __d1 = __ll_highpart (d);						\
    __d0 = __ll_lowpart (d);						\
									\
    __q1 = (n1) / __d1;							\
    __r1 = (n1) - __q1 * __d1;						\
    __m = __q1 * __d0;							\
    __r1 = __r1 * __ll_B | __ll_highpart (n0);				\
    if (__r1 < __m)							\
      {									\
	__q1--, __r1 += (d);						\
	if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
	  if (__r1 < __m)						\
	    __q1--, __r1 += (d);					\
      }									\
    __r1 -= __m;							\
									\
    __q0 = __r1 / __d1;							\
    __r0 = __r1  - __q0 * __d1;						\
    __m = __q0 * __d0;							\
    __r0 = __r0 * __ll_B | __ll_lowpart (n0);				\
    if (__r0 < __m)							\
      {									\
	__q0--, __r0 += (d);						\
	if (__r0 >= (d))						\
	  if (__r0 < __m)						\
	    __q0--, __r0 += (d);					\
      }									\
    __r0 -= __m;							\
									\
    (q) = __q1 * __ll_B | __q0;						\
    (r) = __r0;								\
  } while (0)
   1862 
   1863 /* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through
   1864    __udiv_w_sdiv (defined in libgcc or elsewhere).  */
#if !defined (udiv_qrnnd) && defined (sdiv_qrnnd)
/* Out-of-line helper (see the comment above); returns the quotient and
   stores the remainder through the pointer.  */
#define udiv_qrnnd(q, r, nh, nl, d) \
  do {									\
    UWtype __r;								\
    (q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d);				\
    (r) = __r;								\
  } while (0)
#endif
   1873 
   1874 /* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c.  */
#if !defined (udiv_qrnnd)
/* Fall back to the C implementation, and flag that it needs a
   normalized divisor (see __udiv_qrnnd_c above).  */
#define UDIV_NEEDS_NORMALIZATION 1
#define udiv_qrnnd __udiv_qrnnd_c
#endif
   1879 
#if !defined (count_leading_zeros)
/* Generic count_leading_zeros.  First locate the shift __a that brings
   the top set bit into the low 8-ish bits: for 32-bit words via a
   three-way compare against powers of 2^__BITS4 (presumably
   W_TYPE_SIZE/4), otherwise via a byte-wise scan from the top.  Then
   __clz_tab -- 128 entries, indexed by the at-most-7-bit value
   __xr >> __a -- supplies the final count.  */
#define count_leading_zeros(count, x) \
  do {									\
    UWtype __xr = (x);							\
    UWtype __a;								\
									\
    if (W_TYPE_SIZE == 32)						\
      {									\
	__a = __xr < ((UWtype) 1 << 2*__BITS4)				\
	  ? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1)		\
	  : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 + 1		\
	  : 3*__BITS4 + 1);						\
      }									\
    else								\
      {									\
	for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8)			\
	  if (((__xr >> __a) & 0xff) != 0)				\
	    break;							\
	++__a;								\
      }									\
									\
    (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a];		\
  } while (0)
/* This version gives a well-defined value for zero. */
#define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1)
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#endif
   1907 
/* clz_tab needed by mpn/x86/pentium/mod_1.asm in a fat binary */
#if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#endif

#ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
/* 128 entries: indices fit in 7 bits (see count_leading_zeros above).  */
extern const unsigned char __GMP_DECLSPEC __clz_tab[128];
#endif
   1916 
#if !defined (count_trailing_zeros)
/* Define count_trailing_zeros using count_leading_zeros.  The latter might be
   defined in asm, but if it is not, the C version above is good enough.
   __ctz_x & -__ctz_x isolates the least significant set bit, whose
   leading-zero count then pins its position.  Requires x != 0.  */
#define count_trailing_zeros(count, x) \
  do {									\
    UWtype __ctz_x = (x);						\
    UWtype __ctz_c;							\
    ASSERT (__ctz_x != 0);						\
    count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x);			\
    (count) = W_TYPE_SIZE - 1 - __ctz_c;				\
  } while (0)
#endif
   1929 
/* If no implementation above requested normalization, default to none
   needed.  */
#ifndef UDIV_NEEDS_NORMALIZATION
#define UDIV_NEEDS_NORMALIZATION 0
#endif

/* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, and
   that hence the latter should always be used.  */
#ifndef UDIV_PREINV_ALWAYS
#define UDIV_PREINV_ALWAYS 0
#endif

/* Give defaults for UMUL_TIME and UDIV_TIME.  */
#ifndef UMUL_TIME
#define UMUL_TIME 1
#endif

#ifndef UDIV_TIME
#define UDIV_TIME UMUL_TIME
#endif
   1948