/* longlong.h, revision 1.1.1.3 */
      1 /* longlong.h -- definitions for mixed size 32/64 bit arithmetic.
      2 
      3 Copyright 1991-1994, 1996, 1997, 1999-2005, 2007-2009, 2011-2016 Free Software
      4 Foundation, Inc.
      5 
      6 This file is part of the GNU MP Library.
      7 
      8 The GNU MP Library is free software; you can redistribute it and/or modify
      9 it under the terms of either:
     10 
     11   * the GNU Lesser General Public License as published by the Free
     12     Software Foundation; either version 3 of the License, or (at your
     13     option) any later version.
     14 
     15 or
     16 
     17   * the GNU General Public License as published by the Free Software
     18     Foundation; either version 2 of the License, or (at your option) any
     19     later version.
     20 
     21 or both in parallel, as here.
     22 
     23 The GNU MP Library is distributed in the hope that it will be useful, but
     24 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     25 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     26 for more details.
     27 
     28 You should have received copies of the GNU General Public License and the
     29 GNU Lesser General Public License along with the GNU MP Library.  If not,
     30 see https://www.gnu.org/licenses/.  */
     31 
     32 /* You have to define the following before including this file:
     33 
     34    UWtype -- An unsigned type, default type for operations (typically a "word")
     35    UHWtype -- An unsigned type, at least half the size of UWtype
   UDWtype -- An unsigned type, at least twice as large as UWtype
     37    W_TYPE_SIZE -- size in bits of UWtype
     38 
     39    SItype, USItype -- Signed and unsigned 32 bit types
     40    DItype, UDItype -- Signed and unsigned 64 bit types
     41 
     42    On a 32 bit machine UWtype should typically be USItype;
     43    on a 64 bit machine, UWtype should typically be UDItype.
     44 
     45    Optionally, define:
     46 
     47    LONGLONG_STANDALONE -- Avoid code that needs machine-dependent support files
     48    NO_ASM -- Disable inline asm
     49 
     50 
     51    CAUTION!  Using this version of longlong.h outside of GMP is not safe.  You
     52    need to include gmp.h and gmp-impl.h, or certain things might not work as
     53    expected.
     54 */
     55 
/* Word-splitting helpers for the generic C implementations below:
   __ll_B is the half-word base 2^(W_TYPE_SIZE/2); __ll_lowpart and
   __ll_highpart extract the low and high halves of a UWtype value t.
   __BITS4 is a quarter of the word size.  */
#define __BITS4 (W_TYPE_SIZE / 4)
#define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
#define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
#define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
     60 
/* This is used to make sure no undesirable sharing between different libraries
   that use this file takes place.  */
#ifndef __MPN
/* Name-mangle external mpn routines, e.g. __MPN(invert_limb) -> __invert_limb,
   so each library embedding this header gets its own copies.  */
#define __MPN(x) __##x
#endif
     66 
     67 /* Define auxiliary asm macros.
     68 
     69    1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
     70    UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
     71    word product in HIGH_PROD and LOW_PROD.
     72 
     73    2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
     74    UDWtype product.  This is just a variant of umul_ppmm.
     75 
     76    3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
     77    denominator) divides a UDWtype, composed by the UWtype integers
     78    HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
     79    in QUOTIENT and the remainder in REMAINDER.  HIGH_NUMERATOR must be less
     80    than DENOMINATOR for correct operation.  If, in addition, the most
     81    significant bit of DENOMINATOR must be 1, then the pre-processor symbol
     82    UDIV_NEEDS_NORMALIZATION is defined to 1.
     83 
     84    4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
     85    denominator).  Like udiv_qrnnd but the numbers are signed.  The quotient
     86    is rounded towards 0.
     87 
     88    5) count_leading_zeros(count, x) counts the number of zero-bits from the
     89    msb to the first non-zero bit in the UWtype X.  This is the number of
     90    steps X needs to be shifted left to set the msb.  Undefined for X == 0,
     91    unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.
     92 
     93    6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
     94    from the least significant end.
     95 
     96    7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
     97    high_addend_2, low_addend_2) adds two UWtype integers, composed by
     98    HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
     99    respectively.  The result is placed in HIGH_SUM and LOW_SUM.  Overflow
    100    (i.e. carry out) is not stored anywhere, and is lost.
    101 
   8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
   high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
   composed by HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
   LOW_SUBTRAHEND respectively.  The result is placed in HIGH_DIFFERENCE
   and LOW_DIFFERENCE.  Overflow (i.e. borrow out) is not stored anywhere,
   and is lost.
    108 
    109    If any of these macros are left undefined for a particular CPU,
    110    C macros are used.
    111 
    112 
    113    Notes:
    114 
    115    For add_ssaaaa the two high and two low addends can both commute, but
    116    unfortunately gcc only supports one "%" commutative in each asm block.
    117    This has always been so but is only documented in recent versions
    118    (eg. pre-release 3.3).  Having two or more "%"s can cause an internal
    119    compiler error in certain rare circumstances.
    120 
    121    Apparently it was only the last "%" that was ever actually respected, so
    122    the code has been updated to leave just that.  Clearly there's a free
    123    choice whether high or low should get it, if there's a reason to favour
    124    one over the other.  Also obviously when the constraints on the two
    125    operands are identical there's no benefit to the reloader in any "%" at
    126    all.
    127 
    128    */
    129 
    130 /* The CPUs come in alphabetical order below.
    131 
    132    Please add support for more CPUs here, or improve the current support
    133    for the CPUs below!  */
    134 
    135 
    136 /* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc
    137    3.4 __builtin_clzl or __builtin_clzll, according to our limb size.
    138    Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or
    139    __builtin_ctzll.
    140 
    141    These builtins are only used when we check what code comes out, on some
    142    chips they're merely libgcc calls, where we will instead want an inline
    143    in that case (either asm or generic C).
    144 
    145    These builtins are better than an asm block of the same insn, since an
    146    asm block doesn't give gcc any information about scheduling or resource
    147    usage.  We keep an asm block for use on prior versions of gcc though.
    148 
    149    For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but
    150    it's not used (for count_leading_zeros) because it generally gives extra
    151    code to ensure the result is 0 when the input is 0, which we don't need
    152    or want.  */
    153 
/* count_leading_zeros via the gcc builtin matching the limb type:
   __builtin_clzll when limbs are "long long" (_LONG_LONG_LIMB), otherwise
   __builtin_clzl.  The builtin is undefined for x==0, hence the ASSERT.  */
#ifdef _LONG_LONG_LIMB
#define count_leading_zeros_gcc_clz(count,x)	\
  do {						\
    ASSERT ((x) != 0);				\
    (count) = __builtin_clzll (x);		\
  } while (0)
#else
#define count_leading_zeros_gcc_clz(count,x)	\
  do {						\
    ASSERT ((x) != 0);				\
    (count) = __builtin_clzl (x);		\
  } while (0)
#endif
    167 
/* count_trailing_zeros via the gcc builtin matching the limb type:
   __builtin_ctzll when limbs are "long long" (_LONG_LONG_LIMB), otherwise
   __builtin_ctzl.  The builtin is undefined for x==0, hence the ASSERT.  */
#ifdef _LONG_LONG_LIMB
#define count_trailing_zeros_gcc_ctz(count,x)	\
  do {						\
    ASSERT ((x) != 0);				\
    (count) = __builtin_ctzll (x);		\
  } while (0)
#else
#define count_trailing_zeros_gcc_ctz(count,x)	\
  do {						\
    ASSERT ((x) != 0);				\
    (count) = __builtin_ctzl (x);		\
  } while (0)
#endif
    181 
    182 
    183 /* FIXME: The macros using external routines like __MPN(count_leading_zeros)
    184    don't need to be under !NO_ASM */
    185 #if ! defined (NO_ASM)
    186 
#if defined (__alpha) && W_TYPE_SIZE == 64
/* Most alpha-based machines, except Cray systems. */
#if defined (__GNUC__)
#if __GMP_GNUC_PREREQ (3,3)
/* gcc 3.3 and up have a builtin for the umulh high-product insn; the low
   product is a plain C multiply.  */
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    UDItype __m0 = (m0), __m1 = (m1);					\
    (ph) = __builtin_alpha_umulh (__m0, __m1);				\
    (pl) = __m0 * __m1;							\
  } while (0)
#else
/* Older gcc: emit umulh directly.  */
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    UDItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("umulh %r1,%2,%0"						\
	     : "=r" (ph)						\
	     : "%rJ" (__m0), "rI" (__m1));				\
    (pl) = __m0 * __m1;							\
  } while (0)
#endif
#define UMUL_TIME 18
#else /* ! __GNUC__ */
/* Non-gcc compilers: use the __UMULH intrinsic from <machine/builtins.h>.  */
#include <machine/builtins.h>
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    UDItype __m0 = (m0), __m1 = (m1);					\
    (ph) = __UMULH (__m0, __m1);					\
    (pl) = __m0 * __m1;							\
  } while (0)
#endif
#ifndef LONGLONG_STANDALONE
/* Division via a precomputed inverse of the (normalized) divisor.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di;							\
    __di = __MPN(invert_limb) (d);					\
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
  } while (0)
#define UDIV_PREINV_ALWAYS  1
#define UDIV_NEEDS_NORMALIZATION 1
#define UDIV_TIME 220
#endif /* LONGLONG_STANDALONE */

/* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm
   always goes into libgmp.so, even when not actually used.  */
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB

#if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX
/* EV67 and up (CIX extension) have dedicated count insns.  */
#define count_leading_zeros(COUNT,X) \
  __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X))
#define count_trailing_zeros(COUNT,X) \
  __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X))
#endif /* clz/ctz using cix */

#if ! defined (count_leading_zeros)				\
  && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE)
/* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0.
   "$31" is written explicitly in the asm, since an "r" constraint won't
   select reg 31.  There seems no need to worry about "r31" syntax for cray,
   since gcc itself (pre-release 3.4) emits just $31 in various places.	 */
#define ALPHA_CMPBGE_0(dst, src)					\
  do { asm ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0)
/* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts
   them, locating the highest non-zero byte.  A second __clz_tab lookup
   counts the leading zero bits in that byte, giving the result.  */
#define count_leading_zeros(count, x)					\
  do {									\
    UWtype  __clz__b, __clz__c, __clz__x = (x);				\
    ALPHA_CMPBGE_0 (__clz__b,  __clz__x);	    /* zero bytes */	\
    __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F];  /* 8 to 1 byte */	\
    __clz__b = __clz__b * 8 - 7;		    /* 57 to 1 shift */ \
    __clz__x >>= __clz__b;						\
    __clz__c = __clz_tab [__clz__x];		    /* 8 to 1 bit */	\
    __clz__b = 65 - __clz__b;						\
    (count) = __clz__b - __clz__c;					\
  } while (0)
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#endif /* clz using cmpbge */

/* Fall back to the out-of-line mpn routine.  */
#if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE)
#if HAVE_ATTRIBUTE_CONST
long __MPN(count_leading_zeros) (UDItype) __attribute__ ((const));
#else
long __MPN(count_leading_zeros) (UDItype);
#endif
#define count_leading_zeros(count, x) \
  ((count) = __MPN(count_leading_zeros) (x))
#endif /* clz using mpn */
#endif /* __alpha */
    274 
#if defined (__AVR) && W_TYPE_SIZE == 8
/* 8-bit limbs: form the 16-bit product in C and split it into halves.  */
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    unsigned short __p = (unsigned short) (m0) * (m1);			\
    (ph) = __p >> 8;							\
    (pl) = __p;								\
  } while (0)
#endif /* AVR */
    283 
#if defined (_CRAY) && W_TYPE_SIZE == 64
#include <intrinsics.h>
#define UDIV_PREINV_ALWAYS  1
#define UDIV_NEEDS_NORMALIZATION 1
#define UDIV_TIME 220
long __MPN(count_leading_zeros) (UDItype);
/* _leadz is the Cray compiler intrinsic for leading-zero count.  */
#define count_leading_zeros(count, x) \
  ((count) = _leadz ((UWtype) (x)))
#if defined (_CRAYIEEE)		/* I.e., Cray T90/ieee, T3D, and T3E */
/* _int_mult_upper gives the high 64 bits of the product; the low part is
   a plain C multiply.  */
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    UDItype __m0 = (m0), __m1 = (m1);					\
    (ph) = _int_mult_upper (__m0, __m1);				\
    (pl) = __m0 * __m1;							\
  } while (0)
#ifndef LONGLONG_STANDALONE
/* Division via a precomputed inverse of the divisor.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di;							\
    __di = __MPN(invert_limb) (d);					\
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
  } while (0)
#endif /* LONGLONG_STANDALONE */
#endif /* _CRAYIEEE */
#endif /* _CRAY */
    308 
#if defined (__ia64) && W_TYPE_SIZE == 64
/* This form encourages gcc (pre-release 3.4 at least) to emit predicated
   "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency.  The generic
   code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
   register, which takes an extra cycle.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl)      \
  do {						\
    UWtype __x;					\
    __x = (al) - (bl);				\
    if ((al) < (bl))				\
      (sh) = (ah) - (bh) - 1;			\
    else					\
      (sh) = (ah) - (bh);			\
    (sl) = __x;					\
  } while (0)
#if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
/* Do both product parts in assembly, since that gives better code with
   all gcc versions.  Some callers will just use the upper part, and in
   that situation we waste an instruction, but not any cycles.  */
#define umul_ppmm(ph, pl, m0, m1) \
    __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0"		\
	     : "=&f" (ph), "=f" (pl)					\
	     : "f" (m0), "f" (m1))
#define UMUL_TIME 14
/* NOTE(review): relies on mux1@rev byte reversal plus czx1.l to locate the
   top non-zero byte; the shifts then binary-search the bit within that
   byte — confirm against the Itanium ISA manual.  */
#define count_leading_zeros(count, x) \
  do {									\
    UWtype _x = (x), _y, _a, _c;					\
    __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x));		\
    __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y));		\
    _c = (_a - 1) << 3;							\
    _x >>= _c;								\
    if (_x >= 1 << 4)							\
      _x >>= 4, _c += 4;						\
    if (_x >= 1 << 2)							\
      _x >>= 2, _c += 2;						\
    _c += _x >> 1;							\
    (count) =  W_TYPE_SIZE - 1 - _c;					\
  } while (0)
/* similar to what gcc does for __builtin_ffs, but 0 based rather than 1
   based, and we don't need a special case for x==0 here */
#define count_trailing_zeros(count, x)					\
  do {									\
    UWtype __ctz_x = (x);						\
    __asm__ ("popcnt %0 = %1"						\
	     : "=r" (count)						\
	     : "r" ((__ctz_x-1) & ~__ctz_x));				\
  } while (0)
#endif
#if defined (__INTEL_COMPILER)
/* icc: use the xma.hu intrinsic for the high product.  */
#include <ia64intrin.h>
#define umul_ppmm(ph, pl, m0, m1)					\
  do {									\
    UWtype __m0 = (m0), __m1 = (m1);					\
    ph = _m64_xmahu (__m0, __m1, 0);					\
    pl = __m0 * __m1;							\
  } while (0)
#endif
#ifndef LONGLONG_STANDALONE
/* Division via a precomputed inverse of the divisor.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di;							\
    __di = __MPN(invert_limb) (d);					\
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
  } while (0)
#define UDIV_PREINV_ALWAYS  1
#define UDIV_NEEDS_NORMALIZATION 1
#endif
#define UDIV_TIME 220
#endif
    377 
    378 
    379 #if defined (__GNUC__)
    380 
/* We sometimes need to clobber "cc" with gcc2, but that would not be
   understood by gcc1.  Use cpp to avoid major code duplication.
   __CLOBBER_CC is appended after the input operands of an asm with no
   other clobbers; __AND_CLOBBER_CC after an existing clobber list.  */
#if __GNUC__ < 2
#define __CLOBBER_CC
#define __AND_CLOBBER_CC
#else /* __GNUC__ >= 2 */
#define __CLOBBER_CC : "cc"
#define __AND_CLOBBER_CC , "cc"
#endif /* __GNUC__ < 2 */
    390 
/* AMD 29000 family.  */
#if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3"				\
	   : "=r" (sh), "=&r" (sl)					\
	   : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3"				\
	   : "=r" (sh), "=&r" (sl)					\
	   : "r" (ah), "rI" (bh), "r" (al), "rI" (bl))
/* multiplu gives the low product word, multmu the high word.  */
#define umul_ppmm(xh, xl, m0, m1) \
  do {									\
    USItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("multiplu %0,%1,%2"					\
	     : "=r" (xl)						\
	     : "r" (__m0), "r" (__m1));					\
    __asm__ ("multmu %0,%1,%2"						\
	     : "=r" (xh)						\
	     : "r" (__m0), "r" (__m1));					\
  } while (0)
#define udiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("dividu %0,%3,%4"						\
	   : "=r" (q), "=q" (r)						\
	   : "1" (n1), "r" (n0), "r" (d))
#define count_leading_zeros(count, x) \
    __asm__ ("clz %0,%1"						\
	     : "=r" (count)						\
	     : "r" (x))
#define COUNT_LEADING_ZEROS_0 32
#endif /* __a29k__ */
    420 
#if defined (__arc__)
/* ARC: add.f/sub.f set the flags, adc/sbc then fold the carry/borrow into
   the high word.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3"			\
	   : "=r" (sh),							\
	     "=&r" (sl)							\
	   : "r"  ((USItype) (ah)),					\
	     "rICal" ((USItype) (bh)),					\
	     "%r" ((USItype) (al)),					\
	     "rICal" ((USItype) (bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3"			\
	   : "=r" (sh),							\
	     "=&r" (sl)							\
	   : "r" ((USItype) (ah)),					\
	     "rICal" ((USItype) (bh)),					\
	     "r" ((USItype) (al)),					\
	     "rICal" ((USItype) (bl)))
#endif
    439 
#if defined (__arm__) && (defined (__thumb2__) || !defined (__thumb__)) \
    && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3"			\
	   : "=r" (sh), "=&r" (sl)					\
	   : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
/* Pick among subs/rsbs and sbc/rsc forms according to which operands are
   compile-time constants, so constants can sit in immediate ("I") operand
   slots.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {									\
    if (__builtin_constant_p (al))					\
      {									\
	if (__builtin_constant_p (ah))					\
	  __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"		\
		   : "=r" (sh), "=&r" (sl)				\
		   : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
	else								\
	  __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3"		\
		   : "=r" (sh), "=&r" (sl)				\
		   : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
      }									\
    else if (__builtin_constant_p (ah))					\
      {									\
	if (__builtin_constant_p (bl))					\
	  __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2"		\
		   : "=r" (sh), "=&r" (sl)				\
		   : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
	else								\
	  __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"		\
		   : "=r" (sh), "=&r" (sl)				\
		   : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
      }									\
    else if (__builtin_constant_p (bl))					\
      {									\
	if (__builtin_constant_p (bh))					\
	  __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"		\
		   : "=r" (sh), "=&r" (sl)				\
		   : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
	else								\
	  __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2"		\
		   : "=r" (sh), "=&r" (sl)				\
		   : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
      }									\
    else /* only bh might be a constant */				\
      __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"			\
	       : "=r" (sh), "=&r" (sl)					\
	       : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC);\
    } while (0)
#if defined (__ARM_ARCH_2__) || defined (__ARM_ARCH_2A__) \
    || defined (__ARM_ARCH_3__)
/* No umull on these CPUs: build the 64-bit product from 16x16->32 partial
   products.  */
#define umul_ppmm(xh, xl, a, b)						\
  do {									\
    register USItype __t0, __t1, __t2;					\
    __asm__ ("%@ Inlined umul_ppmm\n"					\
	   "	mov	%2, %5, lsr #16\n"				\
	   "	mov	%0, %6, lsr #16\n"				\
	   "	bic	%3, %5, %2, lsl #16\n"				\
	   "	bic	%4, %6, %0, lsl #16\n"				\
	   "	mul	%1, %3, %4\n"					\
	   "	mul	%4, %2, %4\n"					\
	   "	mul	%3, %0, %3\n"					\
	   "	mul	%0, %2, %0\n"					\
	   "	adds	%3, %4, %3\n"					\
	   "	addcs	%0, %0, #65536\n"				\
	   "	adds	%1, %1, %3, lsl #16\n"				\
	   "	adc	%0, %0, %3, lsr #16"				\
	   : "=&r" ((USItype) (xh)), "=r" ((USItype) (xl)),		\
	     "=&r" (__t0), "=&r" (__t1), "=r" (__t2)			\
	   : "r" ((USItype) (a)), "r" ((USItype) (b)) __CLOBBER_CC);	\
  } while (0)
#define UMUL_TIME 20
#ifndef LONGLONG_STANDALONE
/* Out-of-line division routine.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __r;							\
    (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));			\
    (r) = __r;								\
  } while (0)
extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
#define UDIV_TIME 200
#endif /* LONGLONG_STANDALONE */
#else /* ARMv4 or newer */
#define umul_ppmm(xh, xl, a, b) \
  __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
#define UMUL_TIME 5
#define smul_ppmm(xh, xl, a, b) \
  __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
#ifndef LONGLONG_STANDALONE
/* Division via a precomputed inverse of the divisor.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di;							\
    __di = __MPN(invert_limb) (d);					\
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
  } while (0)
#define UDIV_PREINV_ALWAYS  1
#define UDIV_NEEDS_NORMALIZATION 1
#define UDIV_TIME 70
#endif /* LONGLONG_STANDALONE */
#endif /* defined(__ARM_ARCH_2__) ... */
#define count_leading_zeros(count, x)  count_leading_zeros_gcc_clz(count, x)
#define count_trailing_zeros(count, x)  count_trailing_zeros_gcc_ctz(count, x)
#define COUNT_LEADING_ZEROS_0 32
#endif /* __arm__ */
    539 
#if defined (__aarch64__) && W_TYPE_SIZE == 64
/* FIXME: Extend the immediate range for the low word by using both
   ADDS and SUBS, since they set carry in the same way.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("adds\t%1, %x4, %5\n\tadc\t%0, %x2, %x3"			\
	   : "=r" (sh), "=&r" (sl)					\
	   : "rZ" ((UDItype)(ah)), "rZ" ((UDItype)(bh)),		\
	     "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) __CLOBBER_CC)
/* Two constraint alternatives so bl can be either a register/immediate or,
   with al zero ("Z"), a plain register.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subs\t%1, %x4, %5\n\tsbc\t%0, %x2, %x3"			\
	   : "=r,r" (sh), "=&r,&r" (sl)					\
	   : "rZ,rZ" ((UDItype)(ah)), "rZ,rZ" ((UDItype)(bh)),		\
	     "r,Z"   ((UDItype)(al)), "rI,r"  ((UDItype)(bl)) __CLOBBER_CC)
/* umulh gives the high 64 bits; the low part is a plain C multiply.  */
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    UDItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("umulh\t%0, %1, %2" : "=r" (ph) : "r" (__m0), "r" (__m1));	\
    (pl) = __m0 * __m1;							\
  } while (0)
#define count_leading_zeros(count, x)  count_leading_zeros_gcc_clz(count, x)
#define count_trailing_zeros(count, x)  count_trailing_zeros_gcc_ctz(count, x)
#define COUNT_LEADING_ZEROS_0 64
#endif /* __aarch64__ */
    563 
#if defined (__clipper__) && W_TYPE_SIZE == 32
/* 32x32->64 multiply (mulwux unsigned, mulwx signed); the 64-bit result is
   split into halves through a union.  */
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll;						\
	   struct {USItype __l, __h;} __i;				\
	  } __x;							\
  __asm__ ("mulwux %2,%0"						\
	   : "=r" (__x.__ll)						\
	   : "%0" ((USItype)(u)), "r" ((USItype)(v)));			\
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#define smul_ppmm(w1, w0, u, v) \
  ({union {DItype __ll;							\
	   struct {SItype __l, __h;} __i;				\
	  } __x;							\
  __asm__ ("mulwx %2,%0"						\
	   : "=r" (__x.__ll)						\
	   : "%0" ((SItype)(u)), "r" ((SItype)(v)));			\
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#define __umulsidi3(u, v) \
  ({UDItype __w;							\
    __asm__ ("mulwux %2,%0"						\
	     : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v)));	\
    __w; })
#endif /* __clipper__ */
    587 
/* Fujitsu vector computers.  */
#if defined (__uxp__) && W_TYPE_SIZE == 32
/* mult.lu / mult.l produce the full 64-bit product, split through a union
   (note the high half is first in this struct layout).  */
#define umul_ppmm(ph, pl, u, v) \
  do {									\
    union {UDItype __ll;						\
	   struct {USItype __h, __l;} __i;				\
	  } __x;							\
    __asm__ ("mult.lu %1,%2,%0"	: "=r" (__x.__ll) : "%r" (u), "rK" (v));\
    (ph) = __x.__i.__h;							\
    (pl) = __x.__i.__l;							\
  } while (0)
#define smul_ppmm(ph, pl, u, v) \
  do {									\
    union {UDItype __ll;						\
	   struct {USItype __h, __l;} __i;				\
	  } __x;							\
    __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));	\
    (ph) = __x.__i.__h;							\
    (pl) = __x.__i.__l;							\
  } while (0)
#endif
    609 
/* Gmicro (TRON) series.  */
#if defined (__gmicro__) && W_TYPE_SIZE == 32
/* add.w/sub.w set the flags; addx/subx fold the carry/borrow into the
   high word.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add.w %5,%1\n\taddx %3,%0"					\
	   : "=g" (sh), "=&g" (sl)					\
	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub.w %5,%1\n\tsubx %3,%0"					\
	   : "=g" (sh), "=&g" (sl)					\
	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
#define umul_ppmm(ph, pl, m0, m1) \
  __asm__ ("mulx %3,%0,%1"						\
	   : "=g" (ph), "=r" (pl)					\
	   : "%0" ((USItype)(m0)), "g" ((USItype)(m1)))
#define udiv_qrnnd(q, r, nh, nl, d) \
  __asm__ ("divx %4,%0,%1"						\
	   : "=g" (q), "=r" (r)						\
	   : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d)))
#define count_leading_zeros(count, x) \
  __asm__ ("bsch/1 %1,%0"						\
	   : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0))
#endif
    633 
/* HP PA-RISC, 32-bit.  */
#if defined (__hppa) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0"			\
	   : "=r" (sh), "=&r" (sl)					\
	   : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0"			\
	   : "=r" (sh), "=&r" (sl)					\
	   : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
#if defined (_PA_RISC1_1)
/* xmpyu multiplies in the FP registers ("*f" constraints), producing the
   full 64-bit product.  */
#define umul_ppmm(wh, wl, u, v) \
  do {									\
    union {UDItype __ll;						\
	   struct {USItype __h, __l;} __i;				\
	  } __x;							\
    __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v));	\
    (wh) = __x.__i.__h;							\
    (wl) = __x.__i.__l;							\
  } while (0)
#define UMUL_TIME 8
#define UDIV_TIME 60
#else
#define UMUL_TIME 40
#define UDIV_TIME 80
#endif
/* Binary search for the top bit using extru's conditional nullification;
   the inline ';' comments document each step.  */
#define count_leading_zeros(count, x) \
  do {									\
    USItype __tmp;							\
    __asm__ (								\
       "ldi		1,%0\n"						\
"	extru,=		%1,15,16,%%r0	; Bits 31..16 zero?\n"		\
"	extru,tr	%1,15,16,%1	; No.  Shift down, skip add.\n"	\
"	ldo		16(%0),%0	; Yes.  Perform add.\n"		\
"	extru,=		%1,23,8,%%r0	; Bits 15..8 zero?\n"		\
"	extru,tr	%1,23,8,%1	; No.  Shift down, skip add.\n"	\
"	ldo		8(%0),%0	; Yes.  Perform add.\n"		\
"	extru,=		%1,27,4,%%r0	; Bits 7..4 zero?\n"		\
"	extru,tr	%1,27,4,%1	; No.  Shift down, skip add.\n"	\
"	ldo		4(%0),%0	; Yes.  Perform add.\n"		\
"	extru,=		%1,29,2,%%r0	; Bits 3..2 zero?\n"		\
"	extru,tr	%1,29,2,%1	; No.  Shift down, skip add.\n"	\
"	ldo		2(%0),%0	; Yes.  Perform add.\n"		\
"	extru		%1,30,1,%1	; Extract bit 1.\n"		\
"	sub		%0,%1,%0	; Subtract it.\n"		\
	: "=r" (count), "=r" (__tmp) : "1" (x));			\
  } while (0)
#endif /* hppa */
    681 
    682 /* These macros are for ABI=2.0w.  In ABI=2.0n they can't be used, since GCC
    683    (3.2) puts longlong into two adjacent 32-bit registers.  Presumably this
    684    is just a case of no direct support for 2.0n but treating it like 1.0. */
#if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB)
/* 64-bit forms: add,dc and sub,db are the carry/borrow-propagating
   variants used for the high word.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0"			\
	   : "=r" (sh), "=&r" (sl)					\
	   : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0"			\
	   : "=r" (sh), "=&r" (sl)					\
	   : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
#endif /* hppa */
    695 
    696 #if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
    697 #if defined (__zarch__) || defined (HAVE_HOST_CPU_s390_zarch)
/* 64-bit add via 32-bit halves: "alr" adds the low words and sets the
   carry flag, "alcr" then adds the high words plus carry.  The "alfi"
   variant for constant bl is kept commented out, as in the sibling
   sub_ddmmss below.  (Fixed: missing space before __CLOBBER_CC, for
   consistency with sub_ddmmss.)  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl)				\
  do {									\
/*  if (__builtin_constant_p (bl))					\
      __asm__ ("alfi\t%1,%o5\n\talcr\t%0,%3"				\
	       : "=r" (sh), "=&r" (sl)					\
	       : "0"  (ah), "r" (bh), "%1" (al), "n" (bl) __CLOBBER_CC);\
    else								\
*/    __asm__ ("alr\t%1,%5\n\talcr\t%0,%3"				\
	       : "=r" (sh), "=&r" (sl)					\
	       : "0"  (ah), "r" (bh), "%1" (al), "r" (bl) __CLOBBER_CC); \
  } while (0)
    709 #define sub_ddmmss(sh, sl, ah, al, bh, bl)				\
    710   do {									\
    711 /*  if (__builtin_constant_p (bl))					\
    712       __asm__ ("slfi\t%1,%o5\n\tslbr\t%0,%3"				\
    713 	       : "=r" (sh), "=&r" (sl)					\
    714 	       : "0" (ah), "r" (bh), "1" (al), "n" (bl) __CLOBBER_CC);	\
    715     else								\
    716 */    __asm__ ("slr\t%1,%5\n\tslbr\t%0,%3"				\
    717 	       : "=r" (sh), "=&r" (sl)					\
    718 	       : "0" (ah), "r" (bh), "1" (al), "r" (bl) __CLOBBER_CC);	\
    719   } while (0)
    720 #if __GMP_GNUC_PREREQ (4,5)
/* 32x32->64 multiply in plain C; gcc >= 4.5 generates the right code
   from the widened product.  The union splits the 64-bit product into
   halves; __h being the first struct member maps it onto the high word
   because S/390 is big-endian -- this layout is NOT portable to
   little-endian targets.  */
#define umul_ppmm(xh, xl, m0, m1)					\
  do {									\
    union {UDItype __ll;						\
	   struct {USItype __h, __l;} __i;				\
	  } __x;							\
    __x.__ll = (UDItype) (m0) * (UDItype) (m1);				\
    (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
  } while (0)
    729 #else
    730 #if 0
    731 /* FIXME: this fails if gcc knows about the 64-bit registers.  Use only
    732    with a new enough processor pretending we have 32-bit registers.  */
    733 #define umul_ppmm(xh, xl, m0, m1)					\
    734   do {									\
    735     union {UDItype __ll;						\
    736 	   struct {USItype __h, __l;} __i;				\
    737 	  } __x;							\
    738     __asm__ ("mlr\t%0,%2"						\
    739 	     : "=r" (__x.__ll)						\
    740 	     : "%0" (m0), "r" (m1));					\
    741     (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
    742   } while (0)
    743 #else
    744 #define umul_ppmm(xh, xl, m0, m1)					\
    745   do {									\
    746   /* When we have 64-bit regs and gcc is aware of that, we cannot simply use
    747      DImode for the product, since that would be allocated to a single 64-bit
    748      register, whereas mlr uses the low 32-bits of an even-odd register pair.
    749   */									\
    750     register USItype __r0 __asm__ ("0");				\
    751     register USItype __r1 __asm__ ("1") = (m0);				\
    752     __asm__ ("mlr\t%0,%3"						\
    753 	     : "=r" (__r0), "=r" (__r1)					\
    754 	     : "r" (__r1), "r" (m1));					\
    755     (xh) = __r0; (xl) = __r1;						\
    756   } while (0)
    757 #endif /* if 0 */
    758 #endif
    759 #if 0
    760 /* FIXME: this fails if gcc knows about the 64-bit registers.  Use only
    761    with a new enough processor pretending we have 32-bit registers.  */
    762 #define udiv_qrnnd(q, r, n1, n0, d)					\
    763   do {									\
    764     union {UDItype __ll;						\
    765 	   struct {USItype __h, __l;} __i;				\
    766 	  } __x;							\
    767     __x.__i.__h = n1; __x.__i.__l = n0;					\
    768     __asm__ ("dlr\t%0,%2"						\
    769 	     : "=r" (__x.__ll)						\
    770 	     : "0" (__x.__ll), "r" (d));				\
    771     (q) = __x.__i.__l; (r) = __x.__i.__h;				\
    772   } while (0)
    773 #else
    774 #define udiv_qrnnd(q, r, n1, n0, d)					\
    775   do {									\
    776     register USItype __r0 __asm__ ("0") = (n1);				\
    777     register USItype __r1 __asm__ ("1") = (n0);				\
    778     __asm__ ("dlr\t%0,%4"						\
    779 	     : "=r" (__r0), "=r" (__r1)					\
    780 	     : "r" (__r0), "r" (__r1), "r" (d));			\
    781     (q) = __r1; (r) = __r0;						\
    782   } while (0)
    783 #endif /* if 0 */
    784 #else /* if __zarch__ */
    785 /* FIXME: this fails if gcc knows about the 64-bit registers.  */
    786 #define smul_ppmm(xh, xl, m0, m1)					\
    787   do {									\
    788     union {DItype __ll;							\
    789 	   struct {USItype __h, __l;} __i;				\
    790 	  } __x;							\
    791     __asm__ ("mr\t%0,%2"						\
    792 	     : "=r" (__x.__ll)						\
    793 	     : "%0" (m0), "r" (m1));					\
    794     (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
    795   } while (0)
    796 /* FIXME: this fails if gcc knows about the 64-bit registers.  */
    797 #define sdiv_qrnnd(q, r, n1, n0, d)					\
    798   do {									\
    799     union {DItype __ll;							\
    800 	   struct {USItype __h, __l;} __i;				\
    801 	  } __x;							\
    802     __x.__i.__h = n1; __x.__i.__l = n0;					\
    803     __asm__ ("dr\t%0,%2"						\
    804 	     : "=r" (__x.__ll)						\
    805 	     : "0" (__x.__ll), "r" (d));				\
    806     (q) = __x.__i.__l; (r) = __x.__i.__h;				\
    807   } while (0)
    808 #endif /* if __zarch__ */
    809 #endif
    810 
    811 #if defined (__s390x__) && W_TYPE_SIZE == 64
    812 /* We need to cast operands with register constraints, otherwise their types
    813    will be assumed to be SImode by gcc.  For these machines, such operations
    814    will insert a value into the low 32 bits, and leave the high 32 bits with
    815    garbage.  */
    816 #define add_ssaaaa(sh, sl, ah, al, bh, bl)				\
    817   do {									\
    818     __asm__ ("algr\t%1,%5\n\talcgr\t%0,%3"				\
    819 	       : "=r" (sh), "=&r" (sl)					\
    820 	       : "0"  ((UDItype)(ah)), "r" ((UDItype)(bh)),		\
    821 		 "%1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \
    822   } while (0)
    823 #define sub_ddmmss(sh, sl, ah, al, bh, bl)				\
    824   do {									\
    825     __asm__ ("slgr\t%1,%5\n\tslbgr\t%0,%3"				\
    826 	     : "=r" (sh), "=&r" (sl)					\
    827 	     : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)),		\
    828 	       "1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC);	\
    829   } while (0)
    830 #define umul_ppmm(xh, xl, m0, m1)					\
    831   do {									\
    832     union {unsigned int __attribute__ ((mode(TI))) __ll;		\
    833 	   struct {UDItype __h, __l;} __i;				\
    834 	  } __x;							\
    835     __asm__ ("mlgr\t%0,%2"						\
    836 	     : "=r" (__x.__ll)						\
    837 	     : "%0" ((UDItype)(m0)), "r" ((UDItype)(m1)));		\
    838     (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
    839   } while (0)
    840 #define udiv_qrnnd(q, r, n1, n0, d)					\
    841   do {									\
    842     union {unsigned int __attribute__ ((mode(TI))) __ll;		\
    843 	   struct {UDItype __h, __l;} __i;				\
    844 	  } __x;							\
    845     __x.__i.__h = n1; __x.__i.__l = n0;					\
    846     __asm__ ("dlgr\t%0,%2"						\
    847 	     : "=r" (__x.__ll)						\
    848 	     : "0" (__x.__ll), "r" ((UDItype)(d)));			\
    849     (q) = __x.__i.__l; (r) = __x.__i.__h;				\
    850   } while (0)
    851 #if 0 /* FIXME: Enable for z10 (?) */
    852 #define count_leading_zeros(cnt, x)					\
    853   do {									\
    854     union {unsigned int __attribute__ ((mode(TI))) __ll;		\
    855 	   struct {UDItype __h, __l;} __i;				\
    856 	  } __clr_cnt;							\
    857     __asm__ ("flogr\t%0,%1"						\
    858 	     : "=r" (__clr_cnt.__ll)					\
    859 	     : "r" (x) __CLOBBER_CC);					\
    860     (cnt) = __clr_cnt.__i.__h;						\
    861   } while (0)
    862 #endif
    863 #endif
    864 
    865 /* On x86 and x86_64, every asm implicitly clobbers "flags" and "fpsr",
    866    so we don't need __CLOBBER_CC.  */
    867 #if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
    868 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
    869   __asm__ ("addl %5,%k1\n\tadcl %3,%k0"					\
    870 	   : "=r" (sh), "=&r" (sl)					\
    871 	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
    872 	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
    873 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
    874   __asm__ ("subl %5,%k1\n\tsbbl %3,%k0"					\
    875 	   : "=r" (sh), "=&r" (sl)					\
    876 	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
    877 	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
    878 #define umul_ppmm(w1, w0, u, v) \
    879   __asm__ ("mull %3"							\
    880 	   : "=a" (w0), "=d" (w1)					\
    881 	   : "%0" ((USItype)(u)), "rm" ((USItype)(v)))
    882 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
    883   __asm__ ("divl %4"		     /* stringification in K&R C */	\
    884 	   : "=a" (q), "=d" (r)						\
    885 	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx)))
    886 
    887 #if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx
    888 /* Pentium bsrl takes between 10 and 72 cycles depending where the most
    889    significant 1 bit is, hence the use of the following alternatives.  bsfl
    890    is slow too, between 18 and 42 depending where the least significant 1
    891    bit is, so let the generic count_trailing_zeros below make use of the
    892    count_leading_zeros here too.  */
    893 
    894 #if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE)
    895 /* The following should be a fixed 14 or 15 cycles, but possibly plus an L1
    896    cache miss reading from __clz_tab.  For P55 it's favoured over the float
    897    below so as to avoid mixing MMX and x87, since the penalty for switching
    898    between the two is about 100 cycles.
    899 
    900    The asm block sets __shift to -3 if the high 24 bits are clear, -2 for
    901    16, -1 for 8, or 0 otherwise.  This could be written equivalently as
    902    follows, but as of gcc 2.95.2 it results in conditional jumps.
    903 
    904        __shift = -(__n < 0x1000000);
    905        __shift -= (__n < 0x10000);
    906        __shift -= (__n < 0x100);
    907 
    908    The middle two sbbl and cmpl's pair, and with luck something gcc
    909    generates might pair with the first cmpl and the last sbbl.  The "32+1"
    910    constant could be folded into __clz_tab[], but it doesn't seem worth
    911    making a different table just for that.  */
    912 
    913 #define count_leading_zeros(c,n)					\
    914   do {									\
    915     USItype  __n = (n);							\
    916     USItype  __shift;							\
    917     __asm__ ("cmpl  $0x1000000, %1\n"					\
    918 	     "sbbl  %0, %0\n"						\
    919 	     "cmpl  $0x10000, %1\n"					\
    920 	     "sbbl  $0, %0\n"						\
    921 	     "cmpl  $0x100, %1\n"					\
    922 	     "sbbl  $0, %0\n"						\
    923 	     : "=&r" (__shift) : "r"  (__n));				\
    924     __shift = __shift*8 + 24 + 1;					\
    925     (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift];			\
    926   } while (0)
    927 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
    928 #define COUNT_LEADING_ZEROS_0   31   /* n==0 indistinguishable from n==1 */
    929 
    930 #else /* ! pentiummmx || LONGLONG_STANDALONE */
    931 /* The following should be a fixed 14 cycles or so.  Some scheduling
    932    opportunities should be available between the float load/store too.  This
    933    sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is
    934    apparently suggested by the Intel optimizing manual (don't know exactly
    935    where).  gcc 2.95 or up will be best for this, so the "double" is
    936    correctly aligned on the stack.  */
/* Convert n to double and read the IEEE-754 biased exponent out of the
   high 32-bit word (a[1] on little-endian x86).  For n != 0 the exponent
   field is 0x3FF + floor(log2(n)), so c = 0x3FF + 31 - exponent is the
   leading-zero count.  n must be nonzero: (double) 0 has a zero exponent
   field, hence the ASSERT.  */
#define count_leading_zeros(c,n)					\
  do {									\
    union {								\
      double    d;							\
      unsigned  a[2];							\
    } __u;								\
    ASSERT ((n) != 0);							\
    __u.d = (UWtype) (n);						\
    (c) = 0x3FF + 31 - (__u.a[1] >> 20);				\
  } while (0)
#define COUNT_LEADING_ZEROS_0   (0x3FF + 31)
    948 #endif /* pentiummx */
    949 
    950 #else /* ! pentium */
    951 
    952 #if __GMP_GNUC_PREREQ (3,4)  /* using bsrl */
    953 #define count_leading_zeros(count,x)  count_leading_zeros_gcc_clz(count,x)
    954 #endif /* gcc clz */
    955 
    956 /* On P6, gcc prior to 3.0 generates a partial register stall for
    957    __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former
    958    being 1 code byte smaller.  "31-__cbtmp" is a workaround, probably at the
    959    cost of one extra instruction.  Do this for "i386" too, since that means
    960    generic x86.  */
    961 #if ! defined (count_leading_zeros) && __GNUC__ < 3			\
    962   && (HAVE_HOST_CPU_i386						\
    963       || HAVE_HOST_CPU_i686						\
    964       || HAVE_HOST_CPU_pentiumpro					\
    965       || HAVE_HOST_CPU_pentium2						\
    966       || HAVE_HOST_CPU_pentium3)
    967 #define count_leading_zeros(count, x)					\
    968   do {									\
    969     USItype __cbtmp;							\
    970     ASSERT ((x) != 0);							\
    971     __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));	\
    972     (count) = 31 - __cbtmp;						\
    973   } while (0)
    974 #endif /* gcc<3 asm bsrl */
    975 
    976 #ifndef count_leading_zeros
    977 #define count_leading_zeros(count, x)					\
    978   do {									\
    979     USItype __cbtmp;							\
    980     ASSERT ((x) != 0);							\
    981     __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));	\
    982     (count) = __cbtmp ^ 31;						\
    983   } while (0)
    984 #endif /* asm bsrl */
    985 
    986 #if __GMP_GNUC_PREREQ (3,4)  /* using bsfl */
    987 #define count_trailing_zeros(count,x)  count_trailing_zeros_gcc_ctz(count,x)
    988 #endif /* gcc ctz */
    989 
    990 #ifndef count_trailing_zeros
    991 #define count_trailing_zeros(count, x)					\
    992   do {									\
    993     ASSERT ((x) != 0);							\
    994     __asm__ ("bsfl %1,%k0" : "=r" (count) : "rm" ((USItype)(x)));	\
    995   } while (0)
    996 #endif /* asm bsfl */
    997 
    998 #endif /* ! pentium */
    999 
   1000 #ifndef UMUL_TIME
   1001 #define UMUL_TIME 10
   1002 #endif
   1003 #ifndef UDIV_TIME
   1004 #define UDIV_TIME 40
   1005 #endif
   1006 #endif /* 80x86 */
   1007 
   1008 #if defined (__amd64__) && W_TYPE_SIZE == 64
   1009 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
   1010   __asm__ ("addq %5,%q1\n\tadcq %3,%q0"					\
   1011 	   : "=r" (sh), "=&r" (sl)					\
   1012 	   : "0"  ((UDItype)(ah)), "rme" ((UDItype)(bh)),		\
   1013 	     "%1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
   1014 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
   1015   __asm__ ("subq %5,%q1\n\tsbbq %3,%q0"					\
   1016 	   : "=r" (sh), "=&r" (sl)					\
   1017 	   : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)),		\
   1018 	     "1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
   1019 #define umul_ppmm(w1, w0, u, v) \
   1020   __asm__ ("mulq %3"							\
   1021 	   : "=a" (w0), "=d" (w1)					\
   1022 	   : "%0" ((UDItype)(u)), "rm" ((UDItype)(v)))
   1023 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
   1024   __asm__ ("divq %4"		     /* stringification in K&R C */	\
   1025 	   : "=a" (q), "=d" (r)						\
   1026 	   : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx)))
/* bsrq destination must be a 64-bit register, hence UDItype for __cbtmp. */
/* bsrq yields the bit index (0..63) of the highest set bit; XOR with 63
   turns that index into a leading-zero count without needing a subtract.
   bsrq leaves its destination undefined when the source is zero, hence
   the ASSERT.  */
#define count_leading_zeros(count, x)					\
  do {									\
    UDItype __cbtmp;							\
    ASSERT ((x) != 0);							\
    __asm__ ("bsrq %1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x)));	\
    (count) = __cbtmp ^ 63;						\
  } while (0)
/* bsfq destination must be a 64-bit register, "%q0" forces this in case
   count is only an int. */
/* bsfq gives the index of the lowest set bit directly, which is exactly
   the trailing-zero count; undefined for x == 0, hence the ASSERT.  */
#define count_trailing_zeros(count, x)					\
  do {									\
    ASSERT ((x) != 0);							\
    __asm__ ("bsfq %1,%q0" : "=r" (count) : "rm" ((UDItype)(x)));	\
  } while (0)
   1042 #endif /* __amd64__ */
   1043 
   1044 #if defined (__i860__) && W_TYPE_SIZE == 32
/* Double-word right shift: r = low word of {h,l} >> c.  NOTE(review):
   the original was missing the ":" separating the asm template from its
   output operand, so "=r" was string-concatenated into the template and
   the asm was malformed; fixed.  (i860 code is untestable here.)  */
#define rshift_rhlc(r,h,l,c) \
  __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0"				\
	   : "=r" (r) : "r" (h), "r" (l), "rn" (c))
   1048 #endif /* i860 */
   1049 
   1050 #if defined (__i960__) && W_TYPE_SIZE == 32
   1051 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
   1052   __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0"			\
   1053 	   : "=r" (sh), "=&r" (sl)					\
   1054 	   : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl))
   1055 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
   1056   __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0"			\
   1057 	   : "=r" (sh), "=&r" (sl)					\
   1058 	   : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl))
   1059 #define umul_ppmm(w1, w0, u, v) \
   1060   ({union {UDItype __ll;						\
   1061 	   struct {USItype __l, __h;} __i;				\
   1062 	  } __x;							\
   1063   __asm__ ("emul %2,%1,%0"						\
   1064 	   : "=d" (__x.__ll) : "%dI" (u), "dI" (v));			\
   1065   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
   1066 #define __umulsidi3(u, v) \
   1067   ({UDItype __w;							\
   1068     __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v));	\
   1069     __w; })
/* 64/32 -> 32-bit quotient and remainder via the i960 "ediv"
   instruction.  NOTE(review): the original referenced an undeclared
   union "__rq" for the result, and its asm template contained literal
   "%d,%n" rather than numbered operands; __rq is now declared and the
   template uses %2,%1,%0 following the emul macros above -- confirm
   ediv operand order against i960 docs before enabling.  */
#define udiv_qrnnd(q, r, nh, nl, d) \
  do {									\
    union {UDItype __ll;						\
	   struct {USItype __l, __h;} __i;				\
	  } __nn, __rq;							\
    __nn.__i.__h = (nh); __nn.__i.__l = (nl);				\
    __asm__ ("ediv %2,%1,%0"						\
	   : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d));		\
    (r) = __rq.__i.__l; (q) = __rq.__i.__h;				\
  } while (0)
   1080 #define count_leading_zeros(count, x) \
   1081   do {									\
   1082     USItype __cbtmp;							\
   1083     __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x));		\
   1084     (count) = __cbtmp ^ 31;						\
   1085   } while (0)
   1086 #define COUNT_LEADING_ZEROS_0 (-32) /* sic */
   1087 #if defined (__i960mx)		/* what is the proper symbol to test??? */
/* Extended right shift of the double word {h,l} by c via "shre".
   NOTE(review): the original "do { ... }" lacked the closing
   "while (0)", which breaks expansion inside if/else statements; fixed. */
#define rshift_rhlc(r,h,l,c) \
  do {									\
    union {UDItype __ll;						\
	   struct {USItype __l, __h;} __i;				\
	  } __nn;							\
    __nn.__i.__h = (h); __nn.__i.__l = (l);				\
    __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c));	\
  } while (0)
   1096 #endif /* i960mx */
   1097 #endif /* i960 */
   1098 
   1099 #if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \
   1100      || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \
   1101      || defined (__mc5307__)) && W_TYPE_SIZE == 32
   1102 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
   1103   __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0"				\
   1104 	   : "=d" (sh), "=&d" (sl)					\
   1105 	   : "0"  ((USItype)(ah)), "d" ((USItype)(bh)),			\
   1106 	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
   1107 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
   1108   __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0"				\
   1109 	   : "=d" (sh), "=&d" (sl)					\
   1110 	   : "0" ((USItype)(ah)), "d" ((USItype)(bh)),			\
   1111 	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
   1112 /* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r.  */
   1113 #if defined (__mc68020__) || defined(mc68020) \
   1114      || defined (__mc68030__) || defined (mc68030) \
   1115      || defined (__mc68040__) || defined (mc68040) \
   1116      || defined (__mcpu32__) || defined (mcpu32) \
   1117      || defined (__NeXT__)
   1118 #define umul_ppmm(w1, w0, u, v) \
   1119   __asm__ ("mulu%.l %3,%1:%0"						\
   1120 	   : "=d" (w0), "=d" (w1)					\
   1121 	   : "%0" ((USItype)(u)), "dmi" ((USItype)(v)))
   1122 #define UMUL_TIME 45
   1123 #define udiv_qrnnd(q, r, n1, n0, d) \
   1124   __asm__ ("divu%.l %4,%1:%0"						\
   1125 	   : "=d" (q), "=d" (r)						\
   1126 	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
   1127 #define UDIV_TIME 90
   1128 #define sdiv_qrnnd(q, r, n1, n0, d) \
   1129   __asm__ ("divs%.l %4,%1:%0"						\
   1130 	   : "=d" (q), "=d" (r)						\
   1131 	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
   1132 #else /* for other 68k family members use 16x16->32 multiplication */
   1133 #define umul_ppmm(xh, xl, a, b) \
   1134   do { USItype __umul_tmp1, __umul_tmp2;				\
   1135 	__asm__ ("| Inlined umul_ppmm\n"				\
   1136 "	move%.l	%5,%3\n"						\
   1137 "	move%.l	%2,%0\n"						\
   1138 "	move%.w	%3,%1\n"						\
   1139 "	swap	%3\n"							\
   1140 "	swap	%0\n"							\
   1141 "	mulu%.w	%2,%1\n"						\
   1142 "	mulu%.w	%3,%0\n"						\
   1143 "	mulu%.w	%2,%3\n"						\
   1144 "	swap	%2\n"							\
   1145 "	mulu%.w	%5,%2\n"						\
   1146 "	add%.l	%3,%2\n"						\
   1147 "	jcc	1f\n"							\
   1148 "	add%.l	%#0x10000,%0\n"						\
   1149 "1:	move%.l	%2,%3\n"						\
   1150 "	clr%.w	%2\n"							\
   1151 "	swap	%2\n"							\
   1152 "	swap	%3\n"							\
   1153 "	clr%.w	%3\n"							\
   1154 "	add%.l	%3,%1\n"						\
   1155 "	addx%.l	%2,%0\n"						\
   1156 "	| End inlined umul_ppmm"					\
   1157 	      : "=&d" (xh), "=&d" (xl),					\
   1158 		"=d" (__umul_tmp1), "=&d" (__umul_tmp2)			\
   1159 	      : "%2" ((USItype)(a)), "d" ((USItype)(b)));		\
   1160   } while (0)
   1161 #define UMUL_TIME 100
   1162 #define UDIV_TIME 400
   1163 #endif /* not mc68020 */
   1164 /* The '020, '030, '040 and '060 have bitfield insns.
   1165    GCC 3.4 defines __mc68020__ when in CPU32 mode, check for __mcpu32__ to
   1166    exclude bfffo on that chip (bitfield insns not available).  */
   1167 #if (defined (__mc68020__) || defined (mc68020)    \
   1168      || defined (__mc68030__) || defined (mc68030) \
   1169      || defined (__mc68040__) || defined (mc68040) \
   1170      || defined (__mc68060__) || defined (mc68060) \
   1171      || defined (__NeXT__))			   \
   1172   && ! defined (__mcpu32__)
   1173 #define count_leading_zeros(count, x) \
   1174   __asm__ ("bfffo %1{%b2:%b2},%0"					\
   1175 	   : "=d" (count)						\
   1176 	   : "od" ((USItype) (x)), "n" (0))
   1177 #define COUNT_LEADING_ZEROS_0 32
   1178 #endif
   1179 #endif /* mc68000 */
   1180 
   1181 #if defined (__m88000__) && W_TYPE_SIZE == 32
   1182 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
   1183   __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3"			\
   1184 	   : "=r" (sh), "=&r" (sl)					\
   1185 	   : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl))
   1186 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
   1187   __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3"			\
   1188 	   : "=r" (sh), "=&r" (sl)					\
   1189 	   : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl))
   1190 #define count_leading_zeros(count, x) \
   1191   do {									\
   1192     USItype __cbtmp;							\
   1193     __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x));			\
   1194     (count) = __cbtmp ^ 31;						\
   1195   } while (0)
   1196 #define COUNT_LEADING_ZEROS_0 63 /* sic */
   1197 #if defined (__m88110__)
   1198 #define umul_ppmm(wh, wl, u, v) \
   1199   do {									\
   1200     union {UDItype __ll;						\
   1201 	   struct {USItype __h, __l;} __i;				\
   1202 	  } __x;							\
   1203     __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v));	\
   1204     (wh) = __x.__i.__h;							\
   1205     (wl) = __x.__i.__l;							\
   1206   } while (0)
/* 64/32 division via "divu.d"; the remainder is reconstructed in C as
   n0 - q*d (mod 2^32).  NOTE(review): the original read "__q.__l", but
   the union has no direct __l member -- the quotient word is
   __q.__i.__l; fixed so the macro compiles if this path is enabled.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  ({union {UDItype __ll;						\
	   struct {USItype __h, __l;} __i;				\
	  } __x, __q;							\
  __x.__i.__h = (n1); __x.__i.__l = (n0);				\
  __asm__ ("divu.d %0,%1,%2"						\
	   : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d));		\
  (r) = (n0) - __q.__i.__l * (d); (q) = __q.__i.__l; })
   1215 #define UMUL_TIME 5
   1216 #define UDIV_TIME 25
   1217 #else
   1218 #define UMUL_TIME 17
   1219 #define UDIV_TIME 150
   1220 #endif /* __m88110__ */
   1221 #endif /* __m88000__ */
   1222 
   1223 #if defined (__mips) && W_TYPE_SIZE == 32
   1224 #if __GMP_GNUC_PREREQ (4,4)
/* 32x32->64 unsigned multiply written in plain C; gcc >= 4.4 compiles
   the widened product straight to multu without the historical =h/=l
   register constraints.  High word goes to w1, low word to w0.  */
#define umul_ppmm(w1, w0, u, v) \
  do {									\
    UDItype __prod = (UDItype) (u) * (v);				\
    (w1) = __prod >> 32;						\
    (w0) = __prod;							\
  } while (0)
   1231 #endif
   1232 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__)
   1233 #define umul_ppmm(w1, w0, u, v) \
   1234   __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
   1235 #endif
   1236 #if !defined (umul_ppmm)
   1237 #define umul_ppmm(w1, w0, u, v) \
   1238   __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1"				\
   1239 	   : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
   1240 #endif
   1241 #define UMUL_TIME 10
   1242 #define UDIV_TIME 100
   1243 #endif /* __mips */
   1244 
   1245 #if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
   1246 #if __GMP_GNUC_PREREQ (4,4)
   1247 #define umul_ppmm(w1, w0, u, v) \
   1248   do {									\
   1249     typedef unsigned int __ll_UTItype __attribute__((mode(TI)));	\
   1250     __ll_UTItype __ll = (__ll_UTItype)(u) * (v);			\
   1251     w1 = __ll >> 64;							\
   1252     w0 = __ll;								\
   1253   } while (0)
   1254 #endif
   1255 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__)
   1256 #define umul_ppmm(w1, w0, u, v) \
   1257   __asm__ ("dmultu %2,%3"						\
   1258 	   : "=l" (w0), "=h" (w1)					\
   1259 	   : "d" ((UDItype)(u)), "d" ((UDItype)(v)))
   1260 #endif
   1261 #if !defined (umul_ppmm)
   1262 #define umul_ppmm(w1, w0, u, v) \
   1263   __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1"				\
   1264 	   : "=d" (w0), "=d" (w1)					\
   1265 	   : "d" ((UDItype)(u)), "d" ((UDItype)(v)))
   1266 #endif
   1267 #define UMUL_TIME 20
   1268 #define UDIV_TIME 140
   1269 #endif /* __mips */
   1270 
   1271 #if defined (__mmix__) && W_TYPE_SIZE == 64
   1272 #define umul_ppmm(w1, w0, u, v) \
   1273   __asm__ ("MULU %0,%2,%3" : "=r" (w0), "=z" (w1) : "r" (u), "r" (v))
   1274 #endif
   1275 
   1276 #if defined (__ns32000__) && W_TYPE_SIZE == 32
   1277 #define umul_ppmm(w1, w0, u, v) \
   1278   ({union {UDItype __ll;						\
   1279 	   struct {USItype __l, __h;} __i;				\
   1280 	  } __x;							\
   1281   __asm__ ("meid %2,%0"							\
   1282 	   : "=g" (__x.__ll)						\
   1283 	   : "%0" ((USItype)(u)), "g" ((USItype)(v)));			\
   1284   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
   1285 #define __umulsidi3(u, v) \
   1286   ({UDItype __w;							\
   1287     __asm__ ("meid %2,%0"						\
   1288 	     : "=g" (__w)						\
   1289 	     : "%0" ((USItype)(u)), "g" ((USItype)(v)));		\
   1290     __w; })
   1291 #define udiv_qrnnd(q, r, n1, n0, d) \
   1292   ({union {UDItype __ll;						\
   1293 	   struct {USItype __l, __h;} __i;				\
   1294 	  } __x;							\
   1295   __x.__i.__h = (n1); __x.__i.__l = (n0);				\
   1296   __asm__ ("deid %2,%0"							\
   1297 	   : "=g" (__x.__ll)						\
   1298 	   : "0" (__x.__ll), "g" ((USItype)(d)));			\
   1299   (r) = __x.__i.__l; (q) = __x.__i.__h; })
   1300 #define count_trailing_zeros(count,x) \
   1301   do {									\
   1302     __asm__ ("ffsd	%2,%0"						\
   1303 	     : "=r" (count)						\
   1304 	     : "0" ((USItype) 0), "r" ((USItype) (x)));			\
   1305   } while (0)
   1306 #endif /* __ns32000__ */
   1307 
   1308 /* In the past we had a block of various #defines tested
   1309        _ARCH_PPC    - AIX
   1310        _ARCH_PWR    - AIX
   1311        __powerpc__  - gcc
   1312        __POWERPC__  - BEOS
   1313        __ppc__      - Darwin
   1314        PPC          - old gcc, GNU/Linux, SysV
   1315    The plain PPC test was not good for vxWorks, since PPC is defined on all
   1316    CPUs there (eg. m68k too), as a constant one is expected to compare
   1317    CPU_FAMILY against.
   1318 
   1319    At any rate, this was pretty unattractive and a bit fragile.  The use of
   1320    HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of
   1321    getting the desired effect.
   1322 
   1323    ENHANCE-ME: We should test _IBMR2 here when we add assembly support for
   1324    the system vendor compilers.  (Is that vendor compilers with inline asm,
   1325    or what?)  */
   1326 
   1327 #if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc)	\
   1328   && W_TYPE_SIZE == 32
   1329 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
   1330   do {									\
   1331     if (__builtin_constant_p (bh) && (bh) == 0)				\
   1332       __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2"			\
   1333 	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));	\
   1334     else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)		\
   1335       __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2"			\
   1336 	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));	\
   1337     else								\
   1338       __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3"			\
   1339 	     : "=r" (sh), "=&r" (sl)					\
   1340 	     : "r" (ah), "r" (bh), "%r" (al), "rI" (bl));		\
   1341   } while (0)
   1342 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
   1343   do {									\
   1344     if (__builtin_constant_p (ah) && (ah) == 0)				\
   1345       __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2"			\
   1346 	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
   1347     else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0)		\
   1348       __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2"			\
   1349 	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
   1350     else if (__builtin_constant_p (bh) && (bh) == 0)			\
   1351       __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2"			\
   1352 	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
   1353     else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)		\
   1354       __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2"			\
   1355 	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
   1356     else								\
   1357       __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2"			\
   1358 	       : "=r" (sh), "=&r" (sl)					\
   1359 	       : "r" (ah), "r" (bh), "rI" (al), "r" (bl));		\
   1360   } while (0)
   1361 #define count_leading_zeros(count, x) \
   1362   __asm__ ("cntlzw %0,%1" : "=r" (count) : "r" (x))
   1363 #define COUNT_LEADING_ZEROS_0 32
   1364 #if HAVE_HOST_CPU_FAMILY_powerpc
   1365 #if __GMP_GNUC_PREREQ (4,4)
   1366 #define umul_ppmm(w1, w0, u, v) \
   1367   do {									\
   1368     UDItype __ll = (UDItype)(u) * (v);					\
   1369     w1 = __ll >> 32;							\
   1370     w0 = __ll;								\
   1371   } while (0)
   1372 #endif
   1373 #if !defined (umul_ppmm)
/* 32x32->64 unsigned multiply: mulhwu computes the high word in asm,
   plain C multiplication yields the (truncated) low word.  Fixed: the
   asm inputs now use the local copies __m0/__m1 so the m0/m1 argument
   expressions are evaluated exactly once (the original re-evaluated
   them, double-evaluating any side effects), matching gcc's longlong.h. */
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    USItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
    (pl) = __m0 * __m1;							\
  } while (0)
   1380 #endif
   1381 #define UMUL_TIME 15
/* 32x32->64 signed multiply: mulhw computes the signed high word, C
   multiplication the low word.  Fixed: the asm inputs now use the local
   copies __m0/__m1 so the m0/m1 argument expressions are evaluated
   exactly once (the original re-evaluated them), matching gcc's
   longlong.h.  */
#define smul_ppmm(ph, pl, m0, m1) \
  do {									\
    SItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
    (pl) = __m0 * __m1;							\
  } while (0)
   1388 #define SMUL_TIME 14
   1389 #define UDIV_TIME 120
   1390 #else
   1391 #define UMUL_TIME 8
   1392 #define smul_ppmm(xh, xl, m0, m1) \
   1393   __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1))
   1394 #define SMUL_TIME 4
   1395 #define sdiv_qrnnd(q, r, nh, nl, d) \
   1396   __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d))
   1397 #define UDIV_TIME 100
   1398 #endif
   1399 #endif /* 32-bit POWER architecture variants.  */
   1400 
   1401 /* We should test _IBMR2 here when we add assembly support for the system
   1402    vendor compilers.  */
   1403 #if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64
   1404 #if !defined (_LONG_LONG_LIMB)
   1405 /* _LONG_LONG_LIMB is ABI=mode32 where adde operates on 32-bit values.  So
   1406    use adde etc only when not _LONG_LONG_LIMB.  */
/* 64-bit double-word add: the low add (add%I4c/add%I5c, immediate form
   allowed for a 16-bit bl) sets carry, which the second instruction
   consumes.  When bh is a compile-time constant 0 or ~0 the high-part
   add collapses to addze/addme, saving one input register; gcc folds
   the untaken branches away.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do {									\
    if (__builtin_constant_p (bh) && (bh) == 0)				\
      __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2"			\
	       : "=r" (sh), "=&r" (sl)					\
	       : "r"  ((UDItype)(ah)),					\
		 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)));		\
    else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)		\
      __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2"			\
	       : "=r" (sh), "=&r" (sl)					\
	       : "r"  ((UDItype)(ah)),					\
		 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)));		\
    else								\
      __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3"			\
	       : "=r" (sh), "=&r" (sl)					\
	       : "r"  ((UDItype)(ah)), "r"  ((UDItype)(bh)),		\
		 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)));		\
  } while (0)
/* We use "*rI" for the constant operand here, since with just "I", gcc barfs.
   This might seem strange, but gcc folds away the dead code late.  */
/* 64-bit double-word subtract.  Outer split: when bl is a compile-time
   constant whose negation fits addic's 16-bit signed immediate
   (bl in (-0x8000, 0x8000] <=> -bl in [-0x8000, 0x7fff]), subtract the
   low words as "addic sl,al,-bl"; otherwise use the subf?c forms.
   Within each arm, ah or bh equal to a constant 0 or ~0 collapses the
   borrow-propagating high instruction to subfze/subfme/addme/addze.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {									\
    if (__builtin_constant_p (bl) && bl > -0x8000 && bl <= 0x8000) {	\
	if (__builtin_constant_p (ah) && (ah) == 0)			\
	  __asm__ ("addic %1,%3,%4\n\tsubfze %0,%2"			\
		   : "=r" (sh), "=&r" (sl)				\
		   :                       "r" ((UDItype)(bh)),		\
		     "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl))));	\
	else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)	\
	  __asm__ ("addic %1,%3,%4\n\tsubfme %0,%2"			\
		   : "=r" (sh), "=&r" (sl)				\
		   :                       "r" ((UDItype)(bh)),		\
		     "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl))));	\
	else if (__builtin_constant_p (bh) && (bh) == 0)		\
	  __asm__ ("addic %1,%3,%4\n\taddme %0,%2"			\
		   : "=r" (sh), "=&r" (sl)				\
		   : "r"  ((UDItype)(ah)),				\
		     "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl))));	\
	else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)	\
	  __asm__ ("addic %1,%3,%4\n\taddze %0,%2"			\
		   : "=r" (sh), "=&r" (sl)				\
		   : "r"  ((UDItype)(ah)),				\
		     "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl))));	\
	else								\
	  __asm__ ("addic %1,%4,%5\n\tsubfe %0,%3,%2"			\
		   : "=r" (sh), "=&r" (sl)				\
		   : "r"  ((UDItype)(ah)), "r" ((UDItype)(bh)),		\
		     "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl))));	\
    } else {								\
	if (__builtin_constant_p (ah) && (ah) == 0)			\
	  __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2"			\
		   : "=r" (sh), "=&r" (sl)				\
		   :                       "r" ((UDItype)(bh)),		\
		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl)));	\
	else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)	\
	  __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2"			\
		   : "=r" (sh), "=&r" (sl)				\
		   :                       "r" ((UDItype)(bh)),		\
		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl)));	\
	else if (__builtin_constant_p (bh) && (bh) == 0)		\
	  __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2"			\
		   : "=r" (sh), "=&r" (sl)				\
		   : "r"  ((UDItype)(ah)),				\
		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl)));	\
	else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)	\
	  __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2"			\
		   : "=r" (sh), "=&r" (sl)				\
		   : "r"  ((UDItype)(ah)),				\
		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl)));	\
	else								\
	  __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2"		\
		   : "=r" (sh), "=&r" (sl)				\
		   : "r"  ((UDItype)(ah)), "r" ((UDItype)(bh)),		\
		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl)));	\
    }									\
  } while (0)
   1483 #endif /* ! _LONG_LONG_LIMB */
#define count_leading_zeros(count, x) \
  __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
/* cntlzd of a zero operand yields the full width, 64.  */
#define COUNT_LEADING_ZEROS_0 64
#if 0 && __GMP_GNUC_PREREQ (4,4) /* Disable, this results in libcalls! */
#define umul_ppmm(w1, w0, u, v) \
  do {									\
    typedef unsigned int __ll_UTItype __attribute__((mode(TI)));	\
    __ll_UTItype __ll = (__ll_UTItype)(u) * (v);			\
    w1 = __ll >> 64;							\
    w0 = __ll;								\
  } while (0)
#endif
#if !defined (umul_ppmm)
/* mulhdu gives the high 64 bits of the unsigned 64x64 product; the low
   half is a plain C multiply.  __m0/__m1 keep the arguments evaluated
   exactly once.  */
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    UDItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
    (pl) = __m0 * __m1;							\
  } while (0)
#endif
#define UMUL_TIME 15
/* mulhd: high 64 bits of the signed product.  */
#define smul_ppmm(ph, pl, m0, m1) \
  do {									\
    DItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
    (pl) = __m0 * __m1;							\
  } while (0)
#define SMUL_TIME 14  /* ??? */
#define UDIV_TIME 120 /* ??? */
#endif /* 64-bit PowerPC.  */
   1514 
#if defined (__pyr__) && W_TYPE_SIZE == 32
/* Pyramid: addw/addwc (subw/subwb) chain the carry (borrow) between the
   two word halves.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addw %5,%1\n\taddwc %3,%0"					\
	   : "=r" (sh), "=&r" (sl)					\
	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subw %5,%1\n\tsubwb %3,%0"					\
	   : "=r" (sh), "=&r" (sl)					\
	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
/* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP.  */
/* NOTE(review): uemul presumably leaves the 64-bit product in the
   register pair addressed as %0/%R0 -- confirm against the Pyramid ISA
   manual.  */
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll;						\
	   struct {USItype __h, __l;} __i;				\
	  } __x;							\
  __asm__ ("movw %1,%R0\n\tuemul %2,%0"					\
	   : "=&r" (__x.__ll)						\
	   : "g" ((USItype) (u)), "g" ((USItype)(v)));			\
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#endif /* __pyr__ */
   1536 
#if defined (__ibm032__) /* RT/ROMP */  && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("a %1,%5\n\tae %0,%3"					\
	   : "=r" (sh), "=&r" (sl)					\
	   : "0"  ((USItype)(ah)), "r" ((USItype)(bh)),			\
	     "%1" ((USItype)(al)), "r" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("s %1,%5\n\tse %0,%3"					\
	   : "=r" (sh), "=&r" (sl)					\
	   : "0" ((USItype)(ah)), "r" ((USItype)(bh)),			\
	     "1" ((USItype)(al)), "r" ((USItype)(bl)))
/* Multiply-step sequence: load the multiplier via mts r10, issue 16 "m"
   step instructions (16 steps for a 32-bit multiplier, so presumably two
   bits per step -- NOTE(review): confirm against ROMP docs), then read
   the high half with cas and the low half back from r10 with mfs.  */
#define smul_ppmm(ph, pl, m0, m1) \
  __asm__ (								\
       "s	r2,r2\n"						\
"	mts r10,%2\n"							\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	cas	%0,r2,r0\n"						\
"	mfs	r10,%1"							\
	   : "=r" (ph), "=r" (pl)					\
	   : "%r" ((USItype)(m0)), "r" ((USItype)(m1))			\
	   : "r2")
#define UMUL_TIME 20
#define UDIV_TIME 200
/* The clz instruction appears to count within 16 bits only, so a 32-bit
   operand is handled in two halves, adding 16 for the low half.  */
#define count_leading_zeros(count, x) \
  do {									\
    if ((x) >= 0x10000)							\
      __asm__ ("clz	%0,%1"						\
	       : "=r" (count) : "r" ((USItype)(x) >> 16));		\
    else								\
      {									\
	__asm__ ("clz	%0,%1"						\
		 : "=r" (count) : "r" ((USItype)(x)));			\
	(count) += 16;							\
      }									\
  } while (0)
#endif /* RT/ROMP */
   1588 
#if (defined (__SH2__) || defined (__SH3__) || defined (__SH4__)) && W_TYPE_SIZE == 32
/* dmulu.l leaves the 64-bit unsigned product in mach:macl; read both
   halves back with sts.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0"		\
	   : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")
#define UMUL_TIME 5
#endif
   1595 
#if defined (__sparc__) && W_TYPE_SIZE == 32
/* addcc/addx (subcc/subx) chain the carry (borrow) between the words.
   "rJ" permits a register or the constant zero, "rI" a 13-bit signed
   immediate (cf. the immediate-range comment further down).  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0"				\
	   : "=r" (sh), "=&r" (sl)					\
	   : "rJ" (ah), "rI" (bh),"%rJ" (al), "rI" (bl)			\
	   __CLOBBER_CC)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0"				\
	   : "=r" (sh), "=&r" (sl)					\
	   : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl)	\
	   __CLOBBER_CC)
/* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h
   doesn't define anything to indicate that to us, it only sets __sparcv8. */
#if defined (__sparc_v9__) || defined (__sparcv9)
/* Perhaps we should use floating-point operations here?  */
#if 0
/* Triggers a bug making mpz/tests/t-gcd.c fail.
   Perhaps we simply need explicitly zero-extend the inputs?  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" :		\
	   "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1")
#else
/* Use v8 umul until above bug is fixed.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
#endif
/* Use a plain v8 divide for v9.  */
/* Load n1 into %y (allowing for the delayed %y write with three nops),
   then udiv computes q = (n1:n0)/d; the remainder is recovered in C as
   n0 - q*d.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do {									\
    USItype __q;							\
    __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"			\
	     : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));		\
    (r) = (n0) - __q * (d);						\
    (q) = __q;								\
  } while (0)
#else
#if defined (__sparc_v8__)   /* gcc normal */				\
  || defined (__sparcv8)     /* gcc solaris */				\
  || HAVE_HOST_CPU_supersparc
/* Don't match immediate range because, 1) it is not often useful,
   2) the 'I' flag thinks of the range as a 13 bit signed interval,
   while we want to match a 13 bit interval, sign extended to 32 bits,
   but INTERPRETED AS UNSIGNED.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
#define UMUL_TIME 5

#if HAVE_HOST_CPU_supersparc
#define UDIV_TIME 60		/* SuperSPARC timing */
#else
/* Don't use this on SuperSPARC because its udiv only handles 53 bit
   dividends and will trap to the kernel for the rest. */
/* Same scheme as the v9 variant above: %y holds n1, remainder done in C.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do {									\
    USItype __q;							\
    __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"			\
	     : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));		\
    (r) = (n0) - __q * (d);						\
    (q) = __q;								\
  } while (0)
#define UDIV_TIME 25
#endif /* HAVE_HOST_CPU_supersparc */

#else /* ! __sparc_v8__ */
#if defined (__sparclite__)
/* This has hardware multiply but not divide.  It also has two additional
   instructions scan (ffs from high bit) and divscc.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
#define UMUL_TIME 5
/* 32 divide-step (divscc) instructions develop one quotient bit each;
   the remainder comes back from %y, and the trailing bl,a/add appears to
   fix up a negative partial remainder by adding d back.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("! Inlined udiv_qrnnd\n"					\
"	wr	%%g0,%2,%%y	! Not a delayed write for sparclite\n"	\
"	tst	%%g0\n"							\
"	divscc	%3,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%0\n"						\
"	rd	%%y,%1\n"						\
"	bl,a 1f\n"							\
"	add	%1,%4,%1\n"						\
"1:	! End of inline udiv_qrnnd"					\
	   : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d)		\
	   : "%g1" __AND_CLOBBER_CC)
#define UDIV_TIME 37
#define count_leading_zeros(count, x) \
  __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x))
/* Early sparclites return 63 for an argument of 0, but they warn that future
   implementations might change this.  Therefore, leave COUNT_LEADING_ZEROS_0
   undefined.  */
#endif /* __sparclite__ */
#endif /* __sparc_v8__ */
#endif /* __sparc_v9__ */
/* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd.  */
#ifndef umul_ppmm
/* v7 software multiply: 32 multiply-step (mulscc) instructions plus a
   final shift step.  mulscc is a signed step, so the prologue computes
   g2 = u & (v < 0 ? ~0 : 0) and adds it to the high word at the end to
   turn the signed product into the unsigned one.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("! Inlined umul_ppmm\n"					\
"	wr	%%g0,%2,%%y	! SPARC has 0-3 delay insn after a wr\n" \
"	sra	%3,31,%%g2	! Don't move this insn\n"		\
"	and	%2,%%g2,%%g2	! Don't move this insn\n"		\
"	andcc	%%g0,0,%%g1	! Don't move this insn\n"		\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,0,%%g1\n"						\
"	add	%%g1,%%g2,%0\n"						\
"	rd	%%y,%1"							\
	   : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v)			\
	   : "%g1", "%g2" __AND_CLOBBER_CC)
#define UMUL_TIME 39		/* 39 instructions */
#endif
#ifndef udiv_qrnnd
#ifndef LONGLONG_STANDALONE
/* No usable divide instruction: call mpn's out-of-line udiv_qrnnd,
   which returns the quotient and stores the remainder through the
   pointer.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __r;							\
    (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));			\
    (r) = __r;								\
  } while (0)
extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
#ifndef UDIV_TIME
#define UDIV_TIME 140
#endif
#endif /* LONGLONG_STANDALONE */
#endif /* udiv_qrnnd */
#endif /* __sparc__ */
   1778 
#if defined (__sparc__) && W_TYPE_SIZE == 64
/* NOTE(review): pre-VIS3 v9 lacks a 64-bit add-with-carry reading the
   extended condition codes, so the middle addccc/subccc re-adds the high
   32-bit halves of al/bl to reproduce the 64-bit carry in the 32-bit
   icc that addc/subc read -- confirm against the SPARC V9 manual.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ (								\
       "addcc	%r4,%5,%1\n"						\
      "	addccc	%r6,%7,%%g0\n"						\
      "	addc	%r2,%3,%0"						\
       : "=r" (sh), "=&r" (sl)						\
       : "rJ"  ((UDItype)(ah)), "rI" ((UDItype)(bh)),			\
	 "%rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)),			\
	 "%rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32)	\
	   __CLOBBER_CC)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ (								\
       "subcc	%r4,%5,%1\n"						\
      "	subccc	%r6,%7,%%g0\n"						\
      "	subc	%r2,%3,%0"						\
       : "=r" (sh), "=&r" (sl)						\
       : "rJ" ((UDItype)(ah)), "rI" ((UDItype)(bh)),			\
	 "rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)),			\
	 "rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32)		\
	   __CLOBBER_CC)
#if __VIS__ >= 0x300
/* VIS3 adds addxc (add with 64-bit carry), umulxhi (high word of the
   unsigned 64x64 product) and lzd -- use them directly.  */
#undef add_ssaaaa
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ (								\
       "addcc	%r4, %5, %1\n"						\
      "	addxc	%r2, %r3, %0"						\
	  : "=r" (sh), "=&r" (sl)					\
       : "rJ"  ((UDItype)(ah)), "rJ" ((UDItype)(bh)),			\
	 "%rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)) __CLOBBER_CC)
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    UDItype __m0 = (m0), __m1 = (m1);					\
    (pl) = __m0 * __m1;							\
    __asm__ ("umulxhi\t%2, %1, %0"					\
	     : "=r" (ph)						\
	     : "%r" (__m0), "r" (__m1));				\
  } while (0)
#define count_leading_zeros(count, x) \
  __asm__ ("lzd\t%1,%0" : "=r" (count) : "r" (x))
/* Needed by count_leading_zeros_32 in sparc64.h.  */
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#endif
#endif
   1823 
#if (defined (__vax) || defined (__vax__)) && W_TYPE_SIZE == 32
/* addl2/adwc (subl2/sbwc) chain the carry (borrow) between the words.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addl2 %5,%1\n\tadwc %3,%0"					\
	   : "=g" (sh), "=&g" (sl)					\
	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subl2 %5,%1\n\tsbwc %3,%0"					\
	   : "=g" (sh), "=&g" (sl)					\
	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
/* emul: signed 32x32->64 multiply-and-add; the literal $0 supplies a
   zero addend.  The low word comes first in the union's struct (VAX is
   little-endian).  */
#define smul_ppmm(xh, xl, m0, m1) \
  do {									\
    union {UDItype __ll;						\
	   struct {USItype __l, __h;} __i;				\
	  } __x;							\
    USItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("emul %1,%2,$0,%0"						\
	     : "=g" (__x.__ll) : "g" (__m0), "g" (__m1));		\
    (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
  } while (0)
/* ediv: signed 64/32 divide giving quotient and remainder.  */
#define sdiv_qrnnd(q, r, n1, n0, d) \
  do {									\
    union {DItype __ll;							\
	   struct {SItype __l, __h;} __i;				\
	  } __x;							\
    __x.__i.__h = n1; __x.__i.__l = n0;					\
    __asm__ ("ediv %3,%2,%0,%1"						\
	     : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d));		\
  } while (0)
#if 0
/* FIXME: This instruction appears to be unimplemented on some systems (vax
   8800 maybe). */
#define count_trailing_zeros(count,x)					\
  do {									\
    __asm__ ("ffs 0, 31, %1, %0"					\
	     : "=g" (count)						\
	     : "g" ((USItype) (x)));					\
  } while (0)
#endif
#endif /* vax */
   1865 
#if defined (__z8000__) && W_TYPE_SIZE == 16
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add	%H1,%H5\n\tadc	%H0,%H3"				\
	   : "=r" (sh), "=&r" (sl)					\
	   : "0"  ((unsigned int)(ah)), "r" ((unsigned int)(bh)),	\
	     "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub	%H1,%H5\n\tsbc	%H0,%H3"				\
	   : "=r" (sh), "=&r" (sl)					\
	   : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)),	\
	     "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
/* Signed "mult" with sign-correction terms added afterwards to obtain
   the unsigned high word: for each negative operand the other operand is
   added back into xh.
   NOTE(review): the asm operands are the raw (m0)/(m1), so the arguments
   are evaluated a second time after initializing __m0/__m1 -- arguments
   with side effects would misbehave; consider passing __m0/__m1 as the
   PowerPC variants do.  */
#define umul_ppmm(xh, xl, m0, m1) \
  do {									\
    union {long int __ll;						\
	   struct {unsigned int __h, __l;} __i;				\
	  } __x;							\
    unsigned int __m0 = (m0), __m1 = (m1);				\
    __asm__ ("mult	%S0,%H3"					\
	     : "=r" (__x.__i.__h), "=r" (__x.__i.__l)			\
	     : "%1" (m0), "rQR" (m1));					\
    (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
    (xh) += ((((signed int) __m0 >> 15) & __m1)				\
	     + (((signed int) __m1 >> 15) & __m0));			\
  } while (0)
#endif /* __z8000__ */
   1891 
   1892 #endif /* __GNUC__ */
   1893 
   1894 #endif /* NO_ASM */
   1895 
   1896 
/* FIXME: "sidi" here is highly doubtful, should sometimes be "diti".  */
#if !defined (umul_ppmm) && defined (__umulsidi3)
/* Derive umul_ppmm by splitting the double-word result of __umulsidi3.  */
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    UDWtype __ll = __umulsidi3 (m0, m1);				\
    ph = (UWtype) (__ll >> W_TYPE_SIZE);				\
    pl = (UWtype) __ll;							\
  } while (0)
#endif

#if !defined (__umulsidi3)
/* And conversely: build __umulsidi3 from whatever umul_ppmm we ended up
   with.  */
#define __umulsidi3(u, v) \
  ({UWtype __hi, __lo;							\
    umul_ppmm (__hi, __lo, u, v);					\
    ((UDWtype) __hi << W_TYPE_SIZE) | __lo; })
#endif
   1913 
   1914 
/* The extern declarations below need C linkage when this header is
   compiled as C++.  */
#if defined (__cplusplus)
#define __longlong_h_C "C"
#else
#define __longlong_h_C
#endif

/* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist.  The "_r"
   forms have "reversed" arguments, meaning the pointer is last, which
   sometimes allows better parameter passing, in particular on 64-bit
   hppa. */

#define mpn_umul_ppmm  __MPN(umul_ppmm)
extern __longlong_h_C UWtype mpn_umul_ppmm (UWtype *, UWtype, UWtype);

/* Returns the high word; the low word is stored through the pointer.  */
#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm  \
  && ! defined (LONGLONG_STANDALONE)
#define umul_ppmm(wh, wl, u, v)						\
  do {									\
    UWtype __umul_ppmm__p0;						\
    (wh) = mpn_umul_ppmm (&__umul_ppmm__p0, (UWtype) (u), (UWtype) (v));\
    (wl) = __umul_ppmm__p0;						\
  } while (0)
#endif

#define mpn_umul_ppmm_r  __MPN(umul_ppmm_r)
extern __longlong_h_C UWtype mpn_umul_ppmm_r (UWtype, UWtype, UWtype *);

#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r	\
  && ! defined (LONGLONG_STANDALONE)
#define umul_ppmm(wh, wl, u, v)						\
  do {									\
    UWtype __umul_p0;							\
    (wh) = mpn_umul_ppmm_r ((UWtype) (u), (UWtype) (v), &__umul_p0);	\
    (wl) = __umul_p0;							\
  } while (0)
#endif

#define mpn_udiv_qrnnd  __MPN(udiv_qrnnd)
extern __longlong_h_C UWtype mpn_udiv_qrnnd (UWtype *, UWtype, UWtype, UWtype);

/* Returns the quotient; the remainder is stored through the pointer.  */
#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd	\
  && ! defined (LONGLONG_STANDALONE)
#define udiv_qrnnd(q, r, n1, n0, d)					\
  do {									\
    UWtype __udiv_qrnnd_r;						\
    (q) = mpn_udiv_qrnnd (&__udiv_qrnnd_r,				\
			  (UWtype) (n1), (UWtype) (n0), (UWtype) d);	\
    (r) = __udiv_qrnnd_r;						\
  } while (0)
#endif

#define mpn_udiv_qrnnd_r  __MPN(udiv_qrnnd_r)
extern __longlong_h_C UWtype mpn_udiv_qrnnd_r (UWtype, UWtype, UWtype, UWtype *);

#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r	\
  && ! defined (LONGLONG_STANDALONE)
#define udiv_qrnnd(q, r, n1, n0, d)					\
  do {									\
    UWtype __udiv_qrnnd_r;						\
    (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) d,	\
			    &__udiv_qrnnd_r);				\
    (r) = __udiv_qrnnd_r;						\
  } while (0)
#endif
   1979 
   1980 
   1981 /* If this machine has no inline assembler, use C macros.  */
   1982 
#if !defined (add_ssaaaa)
/* Generic C double-word add: carry out of the low word is detected as
   (low sum < al).  (sl) is stored last, so sl may name the same variable
   as any of the inputs.  NOTE: (al) is evaluated twice -- arguments must
   be free of side effects, as throughout this header.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do {									\
    UWtype __x;								\
    __x = (al) + (bl);							\
    (sh) = (ah) + (bh) + (__x < (al));					\
    (sl) = __x;								\
  } while (0)
#endif
   1992 
#if !defined (sub_ddmmss)
/* Generic C double-word subtract: borrow out of the low word is simply
   (al) < (bl).  (sl) is stored last, so sl may name the same variable as
   any of the inputs.  NOTE: (al) and (bl) are evaluated twice -- no side
   effects allowed.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {									\
    UWtype __x;								\
    __x = (al) - (bl);							\
    (sh) = (ah) - (bh) - ((al) < (bl));					\
    (sl) = __x;								\
  } while (0)
#endif
   2002 
   2003 /* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
   2004    smul_ppmm.  */
#if !defined (umul_ppmm) && defined (smul_ppmm)
/* The unsigned high word differs from the signed one by u whenever the
   top bit of v is set, and by v whenever the top bit of u is set: the
   two mask-selected terms add those corrections back.  */
#define umul_ppmm(w1, w0, u, v)						\
  do {									\
    UWtype __w1;							\
    UWtype __xm0 = (u), __xm1 = (v);					\
    smul_ppmm (__w1, w0, __xm0, __xm1);					\
    (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)		\
		+ (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);		\
  } while (0)
#endif
   2015 
   2016 /* If we still don't have umul_ppmm, define it using plain C.
   2017 
   2018    For reference, when this code is used for squaring (ie. u and v identical
   2019    expressions), gcc recognises __x1 and __x2 are the same and generates 3
   2020    multiplies, not 4.  The subsequent additions could be optimized a bit,
   2021    but the only place GMP currently uses such a square is mpn_sqr_basecase,
   2022    and chips obliged to use this generic C umul will have plenty of worse
   2023    performance problems than a couple of extra instructions on the diagonal
   2024    of sqr_basecase.  */
   2025 
#if !defined (umul_ppmm)
/* Schoolbook 2x2 half-word multiply: split u and v into half words and
   combine the four partial products, propagating the one possible carry
   out of the middle sum into the top product.  */
#define umul_ppmm(w1, w0, u, v)						\
  do {									\
    UWtype __x0, __x1, __x2, __x3;					\
    UHWtype __ul, __vl, __uh, __vh;					\
    UWtype __u = (u), __v = (v);					\
									\
    __ul = __ll_lowpart (__u);						\
    __uh = __ll_highpart (__u);						\
    __vl = __ll_lowpart (__v);						\
    __vh = __ll_highpart (__v);						\
									\
    __x0 = (UWtype) __ul * __vl;					\
    __x1 = (UWtype) __ul * __vh;					\
    __x2 = (UWtype) __uh * __vl;					\
    __x3 = (UWtype) __uh * __vh;					\
									\
    __x1 += __ll_highpart (__x0);/* this can't give carry */		\
    __x1 += __x2;		/* but this indeed can */		\
    if (__x1 < __x2)		/* did we get it? */			\
      __x3 += __ll_B;		/* yes, add it in the proper pos. */	\
									\
    (w1) = __x3 + __ll_highpart (__x1);					\
    (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0);		\
  } while (0)
#endif
   2052 
   2053 /* If we don't have smul_ppmm, define it using umul_ppmm (which surely will
   2054    exist in one form or another.  */
#if !defined (smul_ppmm)
/* Inverse of the umul-from-smul correction above: subtract u (when v is
   negative) and v (when u is negative) from the unsigned high word to
   obtain the signed one.  */
#define smul_ppmm(w1, w0, u, v)						\
  do {									\
    UWtype __w1;							\
    UWtype __xm0 = (u), __xm1 = (v);					\
    umul_ppmm (__w1, w0, __xm0, __xm1);					\
    (w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)		\
		- (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);		\
  } while (0)
#endif
   2065 
   2066 /* Define this unconditionally, so it can be used for debugging.  */
/* Define this unconditionally, so it can be used for debugging.  */
/* Plain C two-word by one-word division by half words, essentially
   Knuth's Algorithm D with base __ll_B: develop the quotient a half word
   at a time (__q1 then __q0), each time over-estimating from the divisor
   high half and correcting by at most two decrements.  Requires (d) != 0
   and (n1) < (d); when installed as udiv_qrnnd below,
   UDIV_NEEDS_NORMALIZATION is set to 1, i.e. callers pre-shift d so its
   high bit is set.  */
#define __udiv_qrnnd_c(q, r, n1, n0, d) \
  do {									\
    UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m;			\
									\
    ASSERT ((d) != 0);							\
    ASSERT ((n1) < (d));						\
									\
    __d1 = __ll_highpart (d);						\
    __d0 = __ll_lowpart (d);						\
									\
    __q1 = (n1) / __d1;							\
    __r1 = (n1) - __q1 * __d1;						\
    __m = __q1 * __d0;							\
    __r1 = __r1 * __ll_B | __ll_highpart (n0);				\
    if (__r1 < __m)							\
      {									\
	__q1--, __r1 += (d);						\
	if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
	  if (__r1 < __m)						\
	    __q1--, __r1 += (d);					\
      }									\
    __r1 -= __m;							\
									\
    __q0 = __r1 / __d1;							\
    __r0 = __r1  - __q0 * __d1;						\
    __m = __q0 * __d0;							\
    __r0 = __r0 * __ll_B | __ll_lowpart (n0);				\
    if (__r0 < __m)							\
      {									\
	__q0--, __r0 += (d);						\
	if (__r0 >= (d))						\
	  if (__r0 < __m)						\
	    __q0--, __r0 += (d);					\
      }									\
    __r0 -= __m;							\
									\
    (q) = __q1 * __ll_B | __q0;						\
    (r) = __r0;								\
  } while (0)
   2106 
/* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through
   __udiv_w_sdiv (defined in libgcc or elsewhere).  */
#if !defined (udiv_qrnnd) && defined (sdiv_qrnnd) \
  && ! defined (LONGLONG_STANDALONE)
#define udiv_qrnnd(q, r, nh, nl, d) \
  do {									\
    UWtype __r;								\
    (q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d);				\
    (r) = __r;								\
  } while (0)
__GMP_DECLSPEC UWtype __MPN(udiv_w_sdiv) (UWtype *, UWtype, UWtype, UWtype);
#endif

/* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c.  */
/* The C fallback needs a normalized divisor -- advertise that.  */
#if !defined (udiv_qrnnd)
#define UDIV_NEEDS_NORMALIZATION 1
#define udiv_qrnnd __udiv_qrnnd_c
#endif
   2125 
#if !defined (count_leading_zeros)
/* Generic C clz: narrow the search to an 8-bit window, then finish with
   a table lookup.  For 32-bit words the window is found by comparing
   against powers of 2^__BITS4 (presumably W_TYPE_SIZE/4, i.e. 8 --
   defined earlier in this file); for other sizes a byte-wise scan from
   the top is used.  __a ends up one past the bottom bit of the window.  */
#define count_leading_zeros(count, x) \
  do {									\
    UWtype __xr = (x);							\
    UWtype __a;								\
									\
    if (W_TYPE_SIZE == 32)						\
      {									\
	__a = __xr < ((UWtype) 1 << 2*__BITS4)				\
	  ? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1)		\
	  : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 + 1		\
	  : 3*__BITS4 + 1);						\
      }									\
    else								\
      {									\
	for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8)			\
	  if (((__xr >> __a) & 0xff) != 0)				\
	    break;							\
	++__a;								\
      }									\
									\
    (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a];		\
  } while (0)
/* This version gives a well-defined value for zero. */
#define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1)
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#define COUNT_LEADING_ZEROS_SLOW
#endif

/* clz_tab needed by mpn/x86/pentium/mod_1.asm in a fat binary */
#if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#endif

#ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
extern const unsigned char __GMP_DECLSPEC __clz_tab[129];
#endif
   2163 
#if !defined (count_trailing_zeros)
#if !defined (COUNT_LEADING_ZEROS_SLOW)
/* Define count_trailing_zeros using an asm count_leading_zeros.  */
/* x & -x isolates the lowest set bit; clz of that bit gives its position
   from the top, which converts to a trailing-zero count.  x must be
   nonzero (asserted).  */
#define count_trailing_zeros(count, x)					\
  do {									\
    UWtype __ctz_x = (x);						\
    UWtype __ctz_c;							\
    ASSERT (__ctz_x != 0);						\
    count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x);			\
    (count) = W_TYPE_SIZE - 1 - __ctz_c;				\
  } while (0)
#else
/* Define count_trailing_zeros in plain C, assuming small counts are common.
   We use clz_tab without ado, since the C count_leading_zeros above will have
   pulled it in.  */
/* NOTE(review): the "- 2" offset implies __clz_tab maps 2^k to k+2,
   consistent with the clz formula above.  */
#define count_trailing_zeros(count, x)					\
  do {									\
    UWtype __ctz_x = (x);						\
    int __ctz_c;							\
									\
    if (LIKELY ((__ctz_x & 0xff) != 0))					\
      (count) = __clz_tab[__ctz_x & -__ctz_x] - 2;			\
    else								\
      {									\
	for (__ctz_c = 8 - 2; __ctz_c < W_TYPE_SIZE - 2; __ctz_c += 8)	\
	  {								\
	    __ctz_x >>= 8;						\
	    if (LIKELY ((__ctz_x & 0xff) != 0))				\
	      break;							\
	  }								\
									\
	(count) = __ctz_c + __clz_tab[__ctz_x & -__ctz_x];		\
      }									\
  } while (0)
#endif
#endif
   2200 
/* Defaults, so later code may rely on these macros being defined.  */
#ifndef UDIV_NEEDS_NORMALIZATION
#define UDIV_NEEDS_NORMALIZATION 0
#endif

/* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, and
   that hence the latter should always be used.  */
#ifndef UDIV_PREINV_ALWAYS
#define UDIV_PREINV_ALWAYS 0
#endif

/* Give defaults for UMUL_TIME and UDIV_TIME.  */
#ifndef UMUL_TIME
#define UMUL_TIME 1
#endif

#ifndef UDIV_TIME
#define UDIV_TIME UMUL_TIME
#endif
   2219