/* longlong.h -- definitions for mixed size 32/64 bit arithmetic.

Copyright 1991-1994, 1996, 1997, 1999-2005, 2007-2009, 2011-2020 Free Software
Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

/* You have to define the following before including this file:

   UWtype -- An unsigned type, default type for operations (typically a "word")
   UHWtype -- An unsigned type, at least half the size of UWtype
   UDWtype -- An unsigned type, at least twice as large as UWtype
   W_TYPE_SIZE -- size in bits of UWtype

   SItype, USItype -- Signed and unsigned 32 bit types
   DItype, UDItype -- Signed and unsigned 64 bit types

   On a 32 bit machine UWtype should typically be USItype;
   on a 64 bit machine, UWtype should typically be UDItype.

   Optionally, define:

   LONGLONG_STANDALONE -- Avoid code that needs machine-dependent support files
   NO_ASM -- Disable inline asm


   CAUTION!  Using this version of longlong.h outside of GMP is not safe.  You
   need to include gmp.h and gmp-impl.h, or certain things might not work as
   expected.
*/
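
/* A minimal sketch (an illustrative assumption, not this header's actual
   requirements beyond the list above) of what a typical 64-bit
   configuration might supply before including this file:

     typedef int		SItype	__attribute__ ((mode (SI)));
     typedef unsigned int	USItype	__attribute__ ((mode (SI)));
     typedef int		DItype	__attribute__ ((mode (DI)));
     typedef unsigned int	UDItype	__attribute__ ((mode (DI)));
     typedef UDItype		UWtype;
     typedef USItype		UHWtype;
     typedef unsigned int	UDWtype	__attribute__ ((mode (TI)));
     #define W_TYPE_SIZE 64
*/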

#define __BITS4 (W_TYPE_SIZE / 4)
#define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
#define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
#define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))

/* This is used to make sure no undesirable sharing between different libraries
   that use this file takes place.  */
#ifndef __MPN
#define __MPN(x) __##x
#endif
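
/* For example: with the default definition above, __MPN(invert_limb)
   expands to __invert_limb, while gmp.h maps it to the library's prefixed
   name __gmpn_invert_limb, so a copy of these routines embedded elsewhere
   can't collide with GMP's own symbols.  */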

/* Define auxiliary asm macros.

   1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
   UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
   word product in HIGH_PROD and LOW_PROD.

   2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
   UDWtype product.  This is just a variant of umul_ppmm.

   3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
   denominator) divides a UDWtype, composed of the UWtype integers
   HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
   in QUOTIENT and the remainder in REMAINDER.  HIGH_NUMERATOR must be less
   than DENOMINATOR for correct operation.  If, in addition, the most
   significant bit of DENOMINATOR is required to be 1, then the pre-processor
   symbol UDIV_NEEDS_NORMALIZATION is defined to 1.

   4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
   denominator).  Like udiv_qrnnd but the numbers are signed.  The quotient
   is rounded towards 0.

   5) count_leading_zeros(count, x) counts the number of zero-bits from the
   msb to the first non-zero bit in the UWtype X.  This is the number of
   steps X needs to be shifted left to set the msb.  Undefined for X == 0,
   unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.

   6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
   from the least significant end.

   7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
   high_addend_2, low_addend_2) adds two two-word UWtype integers, composed
   of HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
   respectively.  The result is placed in HIGH_SUM and LOW_SUM.  Overflow
   (i.e. carry out) is not stored anywhere, and is lost.

   8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
   high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
   composed of HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
   LOW_SUBTRAHEND respectively.  The result is placed in HIGH_DIFFERENCE
   and LOW_DIFFERENCE.  Overflow (i.e. borrow out) is not stored anywhere,
   and is lost.

   If any of these macros are left undefined for a particular CPU,
   C macros are used.


   Notes:

   For add_ssaaaa the two high and two low addends can both commute, but
   unfortunately gcc only supports one "%" commutative in each asm block.
   This has always been so but is only documented in recent versions
   (e.g. pre-release 3.3).  Having two or more "%"s can cause an internal
   compiler error in certain rare circumstances.

   Apparently it was only the last "%" that was ever actually respected, so
   the code has been updated to leave just that.  Clearly there's a free
   choice whether high or low should get it, if there's a reason to favour
   one over the other.  Also obviously when the constraints on the two
   operands are identical there's no benefit to the reloader in any "%" at
   all.

   */
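
/* Usage sketch (an illustration, not part of this header; variable names
   are hypothetical): a widening multiply followed by dividing the
   double-word product back down.

     UWtype ph, pl, q, r;
     umul_ppmm (ph, pl, a, b);       ... ph:pl = a * b
     ASSERT (ph < d);                ... required: high word < divisor
     udiv_qrnnd (q, r, ph, pl, d);   ... q,r = ph:pl / d, ph:pl % d

   If UDIV_NEEDS_NORMALIZATION is 1, d must additionally have its most
   significant bit set; see the normalization sketch further below.  */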

/* The CPUs come in alphabetical order below.

   Please add support for more CPUs here, or improve the current support
   for the CPUs below!  */


/* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc
   3.4 __builtin_clzl or __builtin_clzll, according to our limb size.
   Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or
   __builtin_ctzll.

   These builtins are only used when we have checked what code comes out;
   on some chips they're merely libgcc calls, in which case we instead want
   an inline version (either asm or generic C).

   These builtins are better than an asm block of the same insn, since an
   asm block doesn't give gcc any information about scheduling or resource
   usage.  We keep an asm block for use on prior versions of gcc though.

   For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but
   it's not used (for count_leading_zeros) because it generally gives extra
   code to ensure the result is 0 when the input is 0, which we don't need
   or want.  */

#ifdef _LONG_LONG_LIMB
#define count_leading_zeros_gcc_clz(count,x)	\
  do {						\
    ASSERT ((x) != 0);				\
    (count) = __builtin_clzll (x);		\
  } while (0)
#else
#define count_leading_zeros_gcc_clz(count,x)	\
  do {						\
    ASSERT ((x) != 0);				\
    (count) = __builtin_clzl (x);		\
  } while (0)
#endif

#ifdef _LONG_LONG_LIMB
#define count_trailing_zeros_gcc_ctz(count,x)	\
  do {						\
    ASSERT ((x) != 0);				\
    (count) = __builtin_ctzll (x);		\
  } while (0)
#else
#define count_trailing_zeros_gcc_ctz(count,x)	\
  do {						\
    ASSERT ((x) != 0);				\
    (count) = __builtin_ctzl (x);		\
  } while (0)
#endif
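
/* Sketch of the normalization dance (illustrative only, with hypothetical
   variables): when UDIV_NEEDS_NORMALIZATION is 1, scale the divisor and
   numerator up by 2^cnt before udiv_qrnnd and scale the remainder back
   afterwards.

     int cnt;
     count_leading_zeros (cnt, d);
     if (cnt != 0)
       {
	 d <<= cnt;
	 n1 = (n1 << cnt) | (n0 >> (W_TYPE_SIZE - cnt));
	 n0 <<= cnt;
       }
     udiv_qrnnd (q, r, n1, n0, d);
     r >>= cnt;
*/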


/* FIXME: The macros using external routines like __MPN(count_leading_zeros)
   don't need to be under !NO_ASM */
#if ! defined (NO_ASM)

#if defined (__alpha) && W_TYPE_SIZE == 64
/* Most alpha-based machines, except Cray systems. */
#if defined (__GNUC__)
#if __GMP_GNUC_PREREQ (3,3)
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    UDItype __m0 = (m0), __m1 = (m1);					\
    (ph) = __builtin_alpha_umulh (__m0, __m1);				\
    (pl) = __m0 * __m1;							\
  } while (0)
#else
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    UDItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("umulh %r1,%2,%0"						\
	     : "=r" (ph)						\
	     : "%rJ" (__m0), "rI" (__m1));				\
    (pl) = __m0 * __m1;							\
  } while (0)
#endif
#else /* ! __GNUC__ */
#include <machine/builtins.h>
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    UDItype __m0 = (m0), __m1 = (m1);					\
    (ph) = __UMULH (__m0, __m1);					\
    (pl) = __m0 * __m1;							\
  } while (0)
#endif
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di;							\
    __di = __MPN(invert_limb) (d);					\
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
  } while (0)
#define UDIV_PREINV_ALWAYS  1
#define UDIV_NEEDS_NORMALIZATION 1
#endif /* LONGLONG_STANDALONE */
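
/* Gloss on the preinv scheme (our summary of gmp-impl.h, stated here as an
   assumption): with B = 2^W_TYPE_SIZE and a normalized divisor d,
   __MPN(invert_limb) returns floor ((B*B - 1) / d) - B, and
   udiv_qrnnd_preinv uses that precomputed value to replace the hardware
   divide with a multiply and a couple of adjustment steps.  */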

/* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm
   always goes into libgmp.so, even when not actually used.  */
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB

#if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX
#define count_leading_zeros(COUNT,X) \
  __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X))
#define count_trailing_zeros(COUNT,X) \
  __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X))
#endif /* clz/ctz using cix */

#if ! defined (count_leading_zeros)				\
  && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE)
/* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", i.e. test src bytes == 0.
   "$31" is written explicitly in the asm, since an "r" constraint won't
   select reg 31.  There seems no need to worry about "r31" syntax for cray,
   since gcc itself (pre-release 3.4) emits just $31 in various places.  */
#define ALPHA_CMPBGE_0(dst, src)					\
  do { asm ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0)
/* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts
   them, locating the highest non-zero byte.  A second __clz_tab lookup
   counts the leading zero bits in that byte, giving the result.  */
#define count_leading_zeros(count, x)					\
  do {									\
    UWtype  __clz__b, __clz__c, __clz__x = (x);				\
    ALPHA_CMPBGE_0 (__clz__b,  __clz__x);	    /* zero bytes */	\
    __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F];  /* 8 to 1 byte */	\
    __clz__b = __clz__b * 8 - 7;		    /* 57 to 1 shift */ \
    __clz__x >>= __clz__b;						\
    __clz__c = __clz_tab [__clz__x];		    /* 8 to 1 bit */	\
    __clz__b = 65 - __clz__b;						\
    (count) = __clz__b - __clz__c;					\
  } while (0)
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#endif /* clz using cmpbge */

#if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE)
#if HAVE_ATTRIBUTE_CONST
long __MPN(count_leading_zeros) (UDItype) __attribute__ ((const));
#else
long __MPN(count_leading_zeros) (UDItype);
#endif
#define count_leading_zeros(count, x) \
  ((count) = __MPN(count_leading_zeros) (x))
#endif /* clz using mpn */
#endif /* __alpha */

#if defined (__AVR) && W_TYPE_SIZE == 8
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    unsigned short __p = (unsigned short) (m0) * (m1);			\
    (ph) = __p >> 8;							\
    (pl) = __p;								\
  } while (0)
#endif /* AVR */

#if defined (_CRAY) && W_TYPE_SIZE == 64
#include <intrinsics.h>
#define UDIV_PREINV_ALWAYS  1
#define UDIV_NEEDS_NORMALIZATION 1
long __MPN(count_leading_zeros) (UDItype);
#define count_leading_zeros(count, x) \
  ((count) = _leadz ((UWtype) (x)))
#if defined (_CRAYIEEE)		/* I.e., Cray T90/ieee, T3D, and T3E */
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    UDItype __m0 = (m0), __m1 = (m1);					\
    (ph) = _int_mult_upper (__m0, __m1);				\
    (pl) = __m0 * __m1;							\
  } while (0)
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di;							\
    __di = __MPN(invert_limb) (d);					\
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
  } while (0)
#endif /* LONGLONG_STANDALONE */
#endif /* _CRAYIEEE */
#endif /* _CRAY */

#if defined (__ia64) && W_TYPE_SIZE == 64
/* This form encourages gcc (pre-release 3.4 at least) to emit predicated
   "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency.  The generic
   code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
   register, which takes an extra cycle.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl)      \
  do {						\
    UWtype __x;					\
    __x = (al) - (bl);				\
    if ((al) < (bl))				\
      (sh) = (ah) - (bh) - 1;			\
    else					\
      (sh) = (ah) - (bh);			\
    (sl) = __x;					\
  } while (0)
#if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
/* Do both product parts in assembly, since that gives better code with
   all gcc versions.  Some callers will just use the upper part, and in
   that situation we waste an instruction, but not any cycles.  */
#define umul_ppmm(ph, pl, m0, m1) \
    __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0"		\
	     : "=&f" (ph), "=f" (pl)					\
	     : "f" (m0), "f" (m1))
#define count_leading_zeros(count, x) \
  do {									\
    UWtype _x = (x), _y, _a, _c;					\
    __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x));		\
    __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y));		\
    _c = (_a - 1) << 3;							\
    _x >>= _c;								\
    if (_x >= 1 << 4)							\
      _x >>= 4, _c += 4;						\
    if (_x >= 1 << 2)							\
      _x >>= 2, _c += 2;						\
    _c += _x >> 1;							\
    (count) = W_TYPE_SIZE - 1 - _c;					\
  } while (0)
/* Similar to what gcc does for __builtin_ffs, but 0-based rather than
   1-based, and we don't need a special case for x==0 here.  */
#define count_trailing_zeros(count, x)					\
  do {									\
    UWtype __ctz_x = (x);						\
    __asm__ ("popcnt %0 = %1"						\
	     : "=r" (count)						\
	     : "r" ((__ctz_x-1) & ~__ctz_x));				\
  } while (0)
#endif
#if defined (__INTEL_COMPILER)
#include <ia64intrin.h>
#define umul_ppmm(ph, pl, m0, m1)					\
  do {									\
    UWtype __m0 = (m0), __m1 = (m1);					\
    ph = _m64_xmahu (__m0, __m1, 0);					\
    pl = __m0 * __m1;							\
  } while (0)
#endif
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di;							\
    __di = __MPN(invert_limb) (d);					\
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
  } while (0)
#define UDIV_PREINV_ALWAYS  1
#define UDIV_NEEDS_NORMALIZATION 1
#endif
#endif


#if defined (__GNUC__)

/* We sometimes need to clobber "cc" with gcc2, but that would not be
   understood by gcc1.  Use cpp to avoid major code duplication.  */
#if __GNUC__ < 2
#define __CLOBBER_CC
#define __AND_CLOBBER_CC
#else /* __GNUC__ >= 2 */
#define __CLOBBER_CC : "cc"
#define __AND_CLOBBER_CC , "cc"
#endif /* __GNUC__ < 2 */

#if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3"				\
	   : "=r" (sh), "=&r" (sl)					\
	   : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3"				\
	   : "=r" (sh), "=&r" (sl)					\
	   : "r" (ah), "rI" (bh), "r" (al), "rI" (bl))
#define umul_ppmm(xh, xl, m0, m1) \
  do {									\
    USItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("multiplu %0,%1,%2"					\
	     : "=r" (xl)						\
	     : "r" (__m0), "r" (__m1));					\
    __asm__ ("multmu %0,%1,%2"						\
	     : "=r" (xh)						\
	     : "r" (__m0), "r" (__m1));					\
  } while (0)
#define udiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("dividu %0,%3,%4"						\
	   : "=r" (q), "=q" (r)						\
	   : "1" (n1), "r" (n0), "r" (d))
#define count_leading_zeros(count, x) \
    __asm__ ("clz %0,%1"						\
	     : "=r" (count)						\
	     : "r" (x))
#define COUNT_LEADING_ZEROS_0 32
#endif /* __a29k__ */

#if defined (__arc__)
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3"			\
	   : "=r" (sh),							\
	     "=&r" (sl)							\
	   : "r"  ((USItype) (ah)),					\
	     "rICal" ((USItype) (bh)),					\
	     "%r" ((USItype) (al)),					\
	     "rICal" ((USItype) (bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3"			\
	   : "=r" (sh),							\
	     "=&r" (sl)							\
	   : "r" ((USItype) (ah)),					\
	     "rICal" ((USItype) (bh)),					\
	     "r" ((USItype) (al)),					\
	     "rICal" ((USItype) (bl)))
#endif

#if defined (__arm__) && (defined (__thumb2__) || !defined (__thumb__)) \
    && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do {									\
    if (__builtin_constant_p (bl) && -(USItype)(bl) < 0x100)		\
      __asm__ ("subs\t%1, %4, %5\n\tadc\t%0, %2, %3"			\
	       : "=r" (sh), "=&r" (sl)					\
	       : "r" (ah), "rI" (bh),					\
		 "%r" (al), "rI" (-(USItype)(bl)) __CLOBBER_CC);	\
    else								\
      __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3"			\
	       : "=r" (sh), "=&r" (sl)					\
	       : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC); \
  } while (0)
/* FIXME: Extend the immediate range for the low word by using both ADDS and
   SUBS, since they set carry in the same way.  Note: We need separate
   definitions for thumb and non-thumb due to the absence of RSC under
   thumb.  */
#if defined (__thumb__)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {									\
    if (__builtin_constant_p (ah) && __builtin_constant_p (bh)		\
	&& (ah) == (bh))						\
      __asm__ ("subs\t%1, %2, %3\n\tsbc\t%0, %0, %0"			\
	       : "=r" (sh), "=r" (sl)					\
	       : "r" (al), "rI" (bl) __CLOBBER_CC);			\
    else if (__builtin_constant_p (al))					\
      __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3"			\
	       : "=r" (sh), "=&r" (sl)					\
	       : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
    else if (__builtin_constant_p (bl))					\
      __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"			\
	       : "=r" (sh), "=&r" (sl)					\
	       : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
    else								\
      __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"			\
	       : "=r" (sh), "=&r" (sl)					\
	       : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
    } while (0)
#else
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {									\
    if (__builtin_constant_p (ah) && __builtin_constant_p (bh)		\
	&& (ah) == (bh))						\
      __asm__ ("subs\t%1, %2, %3\n\tsbc\t%0, %0, %0"			\
	       : "=r" (sh), "=r" (sl)					\
	       : "r" (al), "rI" (bl) __CLOBBER_CC);			\
    else if (__builtin_constant_p (al))					\
      {									\
	if (__builtin_constant_p (ah))					\
	  __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"		\
		   : "=r" (sh), "=&r" (sl)				\
		   : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
	else								\
	  __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3"		\
		   : "=r" (sh), "=&r" (sl)				\
		   : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
      }									\
    else if (__builtin_constant_p (ah))					\
      {									\
	if (__builtin_constant_p (bl))					\
	  __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2"		\
		   : "=r" (sh), "=&r" (sl)				\
		   : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
	else								\
	  __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"		\
		   : "=r" (sh), "=&r" (sl)				\
		   : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
      }									\
    else if (__builtin_constant_p (bl))					\
      __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"			\
	       : "=r" (sh), "=&r" (sl)					\
	       : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
    else /* only bh might be a constant */				\
      __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"			\
	       : "=r" (sh), "=&r" (sl)					\
	       : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
    } while (0)
#endif
#if defined (__ARM_ARCH_2__) || defined (__ARM_ARCH_2A__) \
    || defined (__ARM_ARCH_3__)
#define umul_ppmm(xh, xl, a, b)						\
  do {									\
    register USItype __t0, __t1, __t2;					\
    __asm__ ("%@ Inlined umul_ppmm\n"					\
	   "	mov	%2, %5, lsr #16\n"				\
	   "	mov	%0, %6, lsr #16\n"				\
	   "	bic	%3, %5, %2, lsl #16\n"				\
	   "	bic	%4, %6, %0, lsl #16\n"				\
	   "	mul	%1, %3, %4\n"					\
	   "	mul	%4, %2, %4\n"					\
	   "	mul	%3, %0, %3\n"					\
	   "	mul	%0, %2, %0\n"					\
	   "	adds	%3, %4, %3\n"					\
	   "	addcs	%0, %0, #65536\n"				\
	   "	adds	%1, %1, %3, lsl #16\n"				\
	   "	adc	%0, %0, %3, lsr #16"				\
	   : "=&r" ((USItype) (xh)), "=r" ((USItype) (xl)),		\
	     "=&r" (__t0), "=&r" (__t1), "=r" (__t2)			\
	   : "r" ((USItype) (a)), "r" ((USItype) (b)) __CLOBBER_CC);	\
  } while (0)
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __r;							\
    (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));			\
    (r) = __r;								\
  } while (0)
extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
#endif /* LONGLONG_STANDALONE */
#else /* ARMv4 or newer */
#define umul_ppmm(xh, xl, a, b) \
  __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
#define smul_ppmm(xh, xl, a, b) \
  __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di;							\
    __di = __MPN(invert_limb) (d);					\
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
  } while (0)
#define UDIV_PREINV_ALWAYS  1
#define UDIV_NEEDS_NORMALIZATION 1
#endif /* LONGLONG_STANDALONE */
#endif /* defined(__ARM_ARCH_2__) ... */
#define count_leading_zeros(count, x)  count_leading_zeros_gcc_clz(count, x)
#define count_trailing_zeros(count, x)  count_trailing_zeros_gcc_ctz(count, x)
#endif /* __arm__ */

#if defined (__aarch64__) && W_TYPE_SIZE == 64
/* FIXME: Extend the immediate range for the low word by using both
   ADDS and SUBS, since they set carry in the same way.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do {									\
    if (__builtin_constant_p (bl) && -(UDItype)(bl) < 0x1000)		\
      __asm__ ("subs\t%1, %x4, %5\n\tadc\t%0, %x2, %x3"			\
	       : "=r" (sh), "=&r" (sl)					\
	       : "rZ" ((UDItype)(ah)), "rZ" ((UDItype)(bh)),		\
		 "%r" ((UDItype)(al)), "rI" (-(UDItype)(bl)) __CLOBBER_CC);\
    else								\
      __asm__ ("adds\t%1, %x4, %5\n\tadc\t%0, %x2, %x3"			\
	       : "=r" (sh), "=&r" (sl)					\
	       : "rZ" ((UDItype)(ah)), "rZ" ((UDItype)(bh)),		\
		 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) __CLOBBER_CC);\
  } while (0)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {									\
    if (__builtin_constant_p (bl) && -(UDItype)(bl) < 0x1000)		\
      __asm__ ("adds\t%1, %x4, %5\n\tsbc\t%0, %x2, %x3"			\
	       : "=r,r" (sh), "=&r,&r" (sl)				\
	       : "rZ,rZ" ((UDItype)(ah)), "rZ,rZ" ((UDItype)(bh)),	\
		 "r,Z"   ((UDItype)(al)), "rI,r" (-(UDItype)(bl)) __CLOBBER_CC);\
    else								\
      __asm__ ("subs\t%1, %x4, %5\n\tsbc\t%0, %x2, %x3"			\
	       : "=r,r" (sh), "=&r,&r" (sl)				\
	       : "rZ,rZ" ((UDItype)(ah)), "rZ,rZ" ((UDItype)(bh)),	\
		 "r,Z"   ((UDItype)(al)), "rI,r"  ((UDItype)(bl)) __CLOBBER_CC);\
  } while (0)
#if __GMP_GNUC_PREREQ (4,9)
#define umul_ppmm(w1, w0, u, v) \
  do {									\
    typedef unsigned int __ll_UTItype __attribute__((mode(TI)));	\
    __ll_UTItype __ll = (__ll_UTItype)(u) * (v);			\
    w1 = __ll >> 64;							\
    w0 = __ll;								\
  } while (0)
#endif
#if !defined (umul_ppmm)
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    UDItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("umulh\t%0, %1, %2" : "=r" (ph) : "r" (__m0), "r" (__m1));	\
    (pl) = __m0 * __m1;							\
  } while (0)
#endif
#define count_leading_zeros(count, x)  count_leading_zeros_gcc_clz(count, x)
#define count_trailing_zeros(count, x)  count_trailing_zeros_gcc_ctz(count, x)
#endif /* __aarch64__ */

#if defined (__clipper__) && W_TYPE_SIZE == 32
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll;						\
	   struct {USItype __l, __h;} __i;				\
	  } __x;							\
  __asm__ ("mulwux %2,%0"						\
	   : "=r" (__x.__ll)						\
	   : "%0" ((USItype)(u)), "r" ((USItype)(v)));			\
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#define smul_ppmm(w1, w0, u, v) \
  ({union {DItype __ll;							\
	   struct {SItype __l, __h;} __i;				\
	  } __x;							\
  __asm__ ("mulwx %2,%0"						\
	   : "=r" (__x.__ll)						\
	   : "%0" ((SItype)(u)), "r" ((SItype)(v)));			\
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#define __umulsidi3(u, v) \
  ({UDItype __w;							\
    __asm__ ("mulwux %2,%0"						\
	     : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v)));	\
    __w; })
#endif /* __clipper__ */

/* Fujitsu vector computers.  */
#if defined (__uxp__) && W_TYPE_SIZE == 32
#define umul_ppmm(ph, pl, u, v) \
  do {									\
    union {UDItype __ll;						\
	   struct {USItype __h, __l;} __i;				\
	  } __x;							\
    __asm__ ("mult.lu %1,%2,%0"	: "=r" (__x.__ll) : "%r" (u), "rK" (v));\
    (ph) = __x.__i.__h;							\
    (pl) = __x.__i.__l;							\
  } while (0)
#define smul_ppmm(ph, pl, u, v) \
  do {									\
    union {UDItype __ll;						\
	   struct {USItype __h, __l;} __i;				\
	  } __x;							\
    __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));	\
    (ph) = __x.__i.__h;							\
    (pl) = __x.__i.__l;							\
  } while (0)
#endif

#if defined (__gmicro__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add.w %5,%1\n\taddx %3,%0"					\
	   : "=g" (sh), "=&g" (sl)					\
	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub.w %5,%1\n\tsubx %3,%0"					\
	   : "=g" (sh), "=&g" (sl)					\
	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
#define umul_ppmm(ph, pl, m0, m1) \
  __asm__ ("mulx %3,%0,%1"						\
	   : "=g" (ph), "=r" (pl)					\
	   : "%0" ((USItype)(m0)), "g" ((USItype)(m1)))
#define udiv_qrnnd(q, r, nh, nl, d) \
  __asm__ ("divx %4,%0,%1"						\
	   : "=g" (q), "=r" (r)						\
	   : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d)))
#define count_leading_zeros(count, x) \
  __asm__ ("bsch/1 %1,%0"						\
	   : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0))
#endif

#if defined (__hppa) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0"			\
	   : "=r" (sh), "=&r" (sl)					\
	   : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0"			\
	   : "=r" (sh), "=&r" (sl)					\
	   : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
#if defined (_PA_RISC1_1)
#define umul_ppmm(wh, wl, u, v) \
  do {									\
    union {UDItype __ll;						\
	   struct {USItype __h, __l;} __i;				\
	  } __x;							\
    __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v));	\
    (wh) = __x.__i.__h;							\
    (wl) = __x.__i.__l;							\
  } while (0)
#endif
#define count_leading_zeros(count, x) \
  do {									\
    USItype __tmp;							\
    __asm__ (								\
       "ldi		1,%0\n"						\
"	extru,=		%1,15,16,%%r0	; Bits 31..16 zero?\n"		\
"	extru,tr	%1,15,16,%1	; No.  Shift down, skip add.\n"	\
"	ldo		16(%0),%0	; Yes.  Perform add.\n"		\
"	extru,=		%1,23,8,%%r0	; Bits 15..8 zero?\n"		\
"	extru,tr	%1,23,8,%1	; No.  Shift down, skip add.\n"	\
"	ldo		8(%0),%0	; Yes.  Perform add.\n"		\
"	extru,=		%1,27,4,%%r0	; Bits 7..4 zero?\n"		\
"	extru,tr	%1,27,4,%1	; No.  Shift down, skip add.\n"	\
"	ldo		4(%0),%0	; Yes.  Perform add.\n"		\
"	extru,=		%1,29,2,%%r0	; Bits 3..2 zero?\n"		\
"	extru,tr	%1,29,2,%1	; No.  Shift down, skip add.\n"	\
"	ldo		2(%0),%0	; Yes.  Perform add.\n"		\
"	extru		%1,30,1,%1	; Extract bit 1.\n"		\
"	sub		%0,%1,%0	; Subtract it.\n"		\
	: "=r" (count), "=r" (__tmp) : "1" (x));			\
  } while (0)
#endif /* hppa */

/* These macros are for ABI=2.0w.  In ABI=2.0n they can't be used, since GCC
   (3.2) puts long long into two adjacent 32-bit registers.  Presumably this
   is just a case of no direct support for 2.0n, it being treated like 1.0. */
#if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB)
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0"			\
	   : "=r" (sh), "=&r" (sl)					\
	   : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0"			\
	   : "=r" (sh), "=&r" (sl)					\
	   : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
#endif /* hppa */

#if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
#if defined (__zarch__) || defined (HAVE_HOST_CPU_s390_zarch)
#define add_ssaaaa(sh, sl, ah, al, bh, bl)				\
  do {									\
/*  if (__builtin_constant_p (bl))					\
      __asm__ ("alfi\t%1,%o5\n\talcr\t%0,%3"				\
	       : "=r" (sh), "=&r" (sl)					\
	       : "0"  (ah), "r" (bh), "%1" (al), "n" (bl) __CLOBBER_CC);\
    else								\
*/    __asm__ ("alr\t%1,%5\n\talcr\t%0,%3"				\
	       : "=r" (sh), "=&r" (sl)					\
	       : "0"  (ah), "r" (bh), "%1" (al), "r" (bl) __CLOBBER_CC);\
  } while (0)
#define sub_ddmmss(sh, sl, ah, al, bh, bl)				\
  do {									\
/*  if (__builtin_constant_p (bl))					\
      __asm__ ("slfi\t%1,%o5\n\tslbr\t%0,%3"				\
	       : "=r" (sh), "=&r" (sl)					\
	       : "0" (ah), "r" (bh), "1" (al), "n" (bl) __CLOBBER_CC);	\
    else								\
*/    __asm__ ("slr\t%1,%5\n\tslbr\t%0,%3"				\
	       : "=r" (sh), "=&r" (sl)					\
	       : "0" (ah), "r" (bh), "1" (al), "r" (bl) __CLOBBER_CC);	\
  } while (0)
#if __GMP_GNUC_PREREQ (4,5)
#define umul_ppmm(xh, xl, m0, m1)					\
  do {									\
    union {UDItype __ll;						\
	   struct {USItype __h, __l;} __i;				\
	  } __x;							\
    __x.__ll = (UDItype) (m0) * (UDItype) (m1);				\
    (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
  } while (0)
#else
#if 0
/* FIXME: this fails if gcc knows about the 64-bit registers.  Use only
   with a new enough processor pretending we have 32-bit registers.  */
#define umul_ppmm(xh, xl, m0, m1)					\
  do {									\
    union {UDItype __ll;						\
	   struct {USItype __h, __l;} __i;				\
	  } __x;							\
    __asm__ ("mlr\t%0,%2"						\
	     : "=r" (__x.__ll)						\
	     : "%0" (m0), "r" (m1));					\
    (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
  } while (0)
#else
#define umul_ppmm(xh, xl, m0, m1)					\
  do {									\
  /* When we have 64-bit regs and gcc is aware of that, we cannot simply use
     DImode for the product, since that would be allocated to a single 64-bit
     register, whereas mlr uses the low 32-bits of an even-odd register pair.
  */									\
    register USItype __r0 __asm__ ("0");				\
    register USItype __r1 __asm__ ("1") = (m0);				\
    __asm__ ("mlr\t%0,%3"						\
	     : "=r" (__r0), "=r" (__r1)					\
	     : "r" (__r1), "r" (m1));					\
    (xh) = __r0; (xl) = __r1;						\
  } while (0)
#endif /* if 0 */
#endif
#if 0
/* FIXME: this fails if gcc knows about the 64-bit registers.  Use only
   with a new enough processor pretending we have 32-bit registers.  */
#define udiv_qrnnd(q, r, n1, n0, d)					\
  do {									\
    union {UDItype __ll;						\
	   struct {USItype __h, __l;} __i;				\
	  } __x;							\
    __x.__i.__h = n1; __x.__i.__l = n0;					\
    __asm__ ("dlr\t%0,%2"						\
	     : "=r" (__x.__ll)						\
	     : "0" (__x.__ll), "r" (d));				\
    (q) = __x.__i.__l; (r) = __x.__i.__h;				\
  } while (0)
#else
#define udiv_qrnnd(q, r, n1, n0, d)					\
  do {									\
    register USItype __r0 __asm__ ("0") = (n1);				\
    register USItype __r1 __asm__ ("1") = (n0);				\
    __asm__ ("dlr\t%0,%4"						\
	     : "=r" (__r0), "=r" (__r1)					\
	     : "r" (__r0), "r" (__r1), "r" (d));			\
    (q) = __r1; (r) = __r0;						\
  } while (0)
#endif /* if 0 */
#else /* if __zarch__ */
/* FIXME: this fails if gcc knows about the 64-bit registers.  */
#define smul_ppmm(xh, xl, m0, m1)					\
  do {									\
    union {DItype __ll;							\
	   struct {USItype __h, __l;} __i;				\
	  } __x;							\
    __asm__ ("mr\t%0,%2"						\
	     : "=r" (__x.__ll)						\
	     : "%0" (m0), "r" (m1));					\
    (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
  } while (0)
/* FIXME: this fails if gcc knows about the 64-bit registers.  */
#define sdiv_qrnnd(q, r, n1, n0, d)					\
  do {									\
    union {DItype __ll;							\
	   struct {USItype __h, __l;} __i;				\
	  } __x;							\
    __x.__i.__h = n1; __x.__i.__l = n0;					\
    __asm__ ("dr\t%0,%2"						\
	     : "=r" (__x.__ll)						\
	     : "0" (__x.__ll), "r" (d));				\
    (q) = __x.__i.__l; (r) = __x.__i.__h;				\
  } while (0)
#endif /* if __zarch__ */
#endif

#if defined (__s390x__) && W_TYPE_SIZE == 64
/* We need to cast operands with register constraints, otherwise their types
   will be assumed to be SImode by gcc.  For these machines, such operations
   will insert a value into the low 32 bits, and leave the high 32 bits with
   garbage.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl)				\
  do {									\
    __asm__ ("algr\t%1,%5\n\talcgr\t%0,%3"				\
	       : "=r" (sh), "=&r" (sl)					\
	       : "0"  ((UDItype)(ah)), "r" ((UDItype)(bh)),		\
		 "%1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \
  } while (0)
#define sub_ddmmss(sh, sl, ah, al, bh, bl)				\
  do {									\
    __asm__ ("slgr\t%1,%5\n\tslbgr\t%0,%3"				\
	     : "=r" (sh), "=&r" (sl)					\
	     : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)),		\
	       "1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC);	\
  } while (0)
#define umul_ppmm(xh, xl, m0, m1)					\
  do {									\
    union {unsigned int __attribute__ ((mode(TI))) __ll;		\
	   struct {UDItype __h, __l;} __i;				\
	  } __x;							\
    __asm__ ("mlgr\t%0,%2"						\
	     : "=r" (__x.__ll)						\
	     : "%0" ((UDItype)(m0)), "r" ((UDItype)(m1)));		\
    (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
  } while (0)
#define udiv_qrnnd(q, r, n1, n0, d)					\
  do {									\
    union {unsigned int __attribute__ ((mode(TI))) __ll;		\
	   struct {UDItype __h, __l;} __i;				\
	  } __x;							\
    __x.__i.__h = n1; __x.__i.__l = n0;					\
    __asm__ ("dlgr\t%0,%2"						\
	     : "=r" (__x.__ll)						\
	     : "0" (__x.__ll), "r" ((UDItype)(d)));			\
    (q) = __x.__i.__l; (r) = __x.__i.__h;				\
  } while (0)
#if 0 /* FIXME: Enable for z10 (?) */
#define count_leading_zeros(cnt, x)					\
  do {									\
    union {unsigned int __attribute__ ((mode(TI))) __ll;		\
	   struct {UDItype __h, __l;} __i;				\
	  } __clr_cnt;							\
    __asm__ ("flogr\t%0,%1"						\
	     : "=r" (__clr_cnt.__ll)					\
	     : "r" (x) __CLOBBER_CC);					\
    (cnt) = __clr_cnt.__i.__h;						\
  } while (0)
#endif
#endif

/* On x86 and x86_64, every asm implicitly clobbers "flags" and "fpsr",
   so we don't need __CLOBBER_CC.  */
#if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addl %5,%k1\n\tadcl %3,%k0"					\
	   : "=r" (sh), "=&r" (sl)					\
	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subl %5,%k1\n\tsbbl %3,%k0"					\
	   : "=r" (sh), "=&r" (sl)					\
	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mull %3"							\
	   : "=a" (w0), "=d" (w1)					\
	   : "%0" ((USItype)(u)), "rm" ((USItype)(v)))
#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
  __asm__ ("divl %4"		     /* stringification in K&R C */	\
	   : "=a" (q), "=d" (r)						\
	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx)))

#if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx
/* Pentium bsrl takes between 10 and 72 cycles depending on where the most
   significant 1 bit is, hence the use of the following alternatives.  bsfl
   is slow too, between 18 and 42 cycles depending on where the least
   significant 1 bit is, so let the generic count_trailing_zeros below make
   use of the count_leading_zeros here too.  */

#if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE)
/* The following should be a fixed 14 or 15 cycles, but possibly plus an L1
   cache miss reading from __clz_tab.  For P55 it's favoured over the float
   below so as to avoid mixing MMX and x87, since the penalty for switching
   between the two is about 100 cycles.

   The asm block sets __shift to -3 if the high 24 bits are clear, -2 for
   16, -1 for 8, or 0 otherwise.  This could be written equivalently as
   follows, but as of gcc 2.95.2 it results in conditional jumps.

       __shift = -(__n < 0x1000000);
       __shift -= (__n < 0x10000);
       __shift -= (__n < 0x100);

   The middle two sbbl and cmpl's pair, and with luck something gcc
   generates might pair with the first cmpl and the last sbbl.  The "32+1"
   constant could be folded into __clz_tab[], but it doesn't seem worth
   making a different table just for that.  */

#define count_leading_zeros(c,n)					\
  do {									\
    USItype  __n = (n);							\
    USItype  __shift;							\
    __asm__ ("cmpl  $0x1000000, %1\n"					\
	     "sbbl  %0, %0\n"						\
	     "cmpl  $0x10000, %1\n"					\
	     "sbbl  $0, %0\n"						\
	     "cmpl  $0x100, %1\n"					\
	     "sbbl  $0, %0\n"						\
	     : "=&r" (__shift) : "r"  (__n));				\
    __shift = __shift*8 + 24 + 1;					\
    (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift];			\
  } while (0)
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#define COUNT_LEADING_ZEROS_0   31   /* n==0 indistinguishable from n==1 */

#else /* ! pentiummmx || LONGLONG_STANDALONE */
/* The following should be a fixed 14 cycles or so.  Some scheduling
   opportunities should be available between the float load/store too.  This
   sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is
   apparently suggested by the Intel optimizing manual (don't know exactly
   where).  gcc 2.95 or up will be best for this, so that the "double" is
   correctly aligned on the stack.  */
#define count_leading_zeros(c,n)					\
  do {									\
    union {								\
      double    d;							\
      unsigned  a[2];							\
    } __u;								\
    __u.d = (UWtype) (n);						\
    (c) = 0x3FF + 31 - (__u.a[1] >> 20);				\
  } while (0)
#define COUNT_LEADING_ZEROS_0   (0x3FF + 31)
#endif /* pentiummmx */

#else /* ! pentium */

#if __GMP_GNUC_PREREQ (3,4)  /* using bsrl */
#define count_leading_zeros(count,x)  count_leading_zeros_gcc_clz(count,x)
#endif /* gcc clz */

/* On P6, gcc prior to 3.0 generates a partial register stall for
   __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former
   being 1 code byte smaller.  "31-__cbtmp" is a workaround, probably at the
   cost of one extra instruction.  Do this for "i386" too, since that means
   generic x86.  */
#if ! defined (count_leading_zeros) && __GNUC__ < 3			\
  && (HAVE_HOST_CPU_i386						\
      || HAVE_HOST_CPU_i686						\
      || HAVE_HOST_CPU_pentiumpro					\
      || HAVE_HOST_CPU_pentium2						\
      || HAVE_HOST_CPU_pentium3)
#define count_leading_zeros(count, x)					\
  do {									\
    USItype __cbtmp;							\
    ASSERT ((x) != 0);							\
    __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));	\
    (count) = 31 - __cbtmp;						\
  } while (0)
#endif /* gcc<3 asm bsrl */

#ifndef count_leading_zeros
#define count_leading_zeros(count, x)					\
  do {									\
    USItype __cbtmp;							\
    ASSERT ((x) != 0);							\
    __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));	\
    (count) = __cbtmp ^ 31;						\
  } while (0)
#endif /* asm bsrl */

#if __GMP_GNUC_PREREQ (3,4)  /* using bsfl */
#define count_trailing_zeros(count,x)  count_trailing_zeros_gcc_ctz(count,x)
#endif /* gcc ctz */

#ifndef count_trailing_zeros
#define count_trailing_zeros(count, x)					\
  do {									\
    ASSERT ((x) != 0);							\
    __asm__ ("bsfl %1,%k0" : "=r" (count) : "rm" ((USItype)(x)));	\
  } while (0)
#endif /* asm bsfl */

#endif /* ! pentium */

#endif /* 80x86 */

#if defined (__amd64__) && W_TYPE_SIZE == 64
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addq %5,%q1\n\tadcq %3,%q0"					\
	   : "=r" (sh), "=&r" (sl)					\
	   : "0"  ((UDItype)(ah)), "rme" ((UDItype)(bh)),		\
	     "%1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subq %5,%q1\n\tsbbq %3,%q0"					\
	   : "=r" (sh), "=&r" (sl)					\
	   : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)),		\
	     "1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
#if X86_ASM_MULX \
   && (HAVE_HOST_CPU_haswell || HAVE_HOST_CPU_broadwell \
       || HAVE_HOST_CPU_skylake || HAVE_HOST_CPU_bd4 || HAVE_HOST_CPU_zen)
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mulx\t%3, %0, %1"						\
	   : "=r" (w0), "=r" (w1)					\
	   : "%d" ((UDItype)(u)), "rm" ((UDItype)(v)))
#else
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mulq\t%3"							\
	   : "=a" (w0), "=d" (w1)					\
	   : "%0" ((UDItype)(u)), "rm" ((UDItype)(v)))
#endif
#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
  __asm__ ("divq %4"		     /* stringification in K&R C */	\
	   : "=a" (q), "=d" (r)						\
	   : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx)))

#if HAVE_HOST_CPU_haswell || HAVE_HOST_CPU_broadwell || HAVE_HOST_CPU_skylake \
  || HAVE_HOST_CPU_k10 || HAVE_HOST_CPU_bd1 || HAVE_HOST_CPU_bd2	\
  || HAVE_HOST_CPU_bd3 || HAVE_HOST_CPU_bd4 || HAVE_HOST_CPU_zen	\
  || HAVE_HOST_CPU_bobcat || HAVE_HOST_CPU_jaguar
#define count_leading_zeros(count, x)					\
  do {									\
    /* This is lzcnt, spelled for older assemblers.  Destination and */	\
    /* source must be 64-bit registers, hence the cast and %q.       */	\
    __asm__ ("rep;bsr\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x)));	\
  } while (0)
#define COUNT_LEADING_ZEROS_0 64
#else
#define count_leading_zeros(count, x)					\
  do {									\
    UDItype __cbtmp;							\
    ASSERT ((x) != 0);							\
    __asm__ ("bsr\t%1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x)));	\
    (count) = __cbtmp ^ 63;						\
  } while (0)
#endif

#if HAVE_HOST_CPU_bd2 || HAVE_HOST_CPU_bd3 || HAVE_HOST_CPU_bd4 \
  || HAVE_HOST_CPU_zen || HAVE_HOST_CPU_jaguar
#define count_trailing_zeros(count, x)					\
  do {									\
    /* This is tzcnt, spelled for older assemblers.  Destination and */	\
    /* source must be 64-bit registers, hence the cast and %q.       */	\
    __asm__ ("rep;bsf\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x)));	\
  } while (0)
#define COUNT_TRAILING_ZEROS_0 64
#else
#define count_trailing_zeros(count, x)					\
  do {									\
    ASSERT ((x) != 0);							\
    __asm__ ("bsf\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x)));	\
  } while (0)
#endif
#endif /* __amd64__ */

#if defined (__i860__) && W_TYPE_SIZE == 32
#define rshift_rhlc(r,h,l,c) \
  __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0"				\
	   : "=r" (r) : "r" (h), "r" (l), "rn" (c))
#endif /* i860 */

#if defined (__i960__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0"			\
	   : "=r" (sh), "=&r" (sl)					\
	   : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0"			\
	   : "=r" (sh), "=&r" (sl)					\
	   : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl))
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll;						\
	   struct {USItype __l, __h;} __i;				\
	  } __x;							\
  __asm__ ("emul %2,%1,%0"						\
	   : "=d" (__x.__ll) : "%dI" (u), "dI" (v));			\
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#define __umulsidi3(u, v) \
  ({UDItype __w;							\
    __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v));	\
    __w; })
#define udiv_qrnnd(q, r, nh, nl, d) \
  do {									\
    union {UDItype __ll;						\
	   struct {USItype __l, __h;} __i;				\
	  } __nn, __rq;							\
    __nn.__i.__h = (nh); __nn.__i.__l = (nl);				\
    __asm__ ("ediv %d,%n,%0"						\
	   : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d));		\
    (r) = __rq.__i.__l; (q) = __rq.__i.__h;				\
  } while (0)
#define count_leading_zeros(count, x) \
  do {									\
    USItype __cbtmp;							\
    __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x));		\
    (count) = __cbtmp ^ 31;						\
  } while (0)
#define COUNT_LEADING_ZEROS_0 (-32) /* sic */
#if defined (__i960mx)		/* what is the proper symbol to test??? */
#define rshift_rhlc(r,h,l,c) \
  do {									\
    union {UDItype __ll;						\
	   struct {USItype __l, __h;} __i;				\
	  } __nn;							\
    __nn.__i.__h = (h); __nn.__i.__l = (l);				\
    __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c));	\
  } while (0)
#endif /* i960mx */
#endif /* i960 */

#if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \
     || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \
     || defined (__mc5307__)) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0"				\
	   : "=d" (sh), "=&d" (sl)					\
	   : "0"  ((USItype)(ah)), "d" ((USItype)(bh)),			\
	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0"				\
	   : "=d" (sh), "=&d" (sl)					\
	   : "0" ((USItype)(ah)), "d" ((USItype)(bh)),			\
	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
/* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r.  */
#if defined (__mc68020__) || defined(mc68020) \
     || defined (__mc68030__) || defined (mc68030) \
     || defined (__mc68040__) || defined (mc68040) \
     || defined (__mcpu32__) || defined (mcpu32) \
     || defined (__NeXT__)
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mulu%.l %3,%1:%0"						\
	   : "=d" (w0), "=d" (w1)					\
	   : "%0" ((USItype)(u)), "dmi" ((USItype)(v)))
#define udiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("divu%.l %4,%1:%0"						\
	   : "=d" (q), "=d" (r)						\
	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
#define sdiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("divs%.l %4,%1:%0"						\
	   : "=d" (q), "=d" (r)						\
	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
#else /* for other 68k family members use 16x16->32 multiplication */
#define umul_ppmm(xh, xl, a, b) \
  do { USItype __umul_tmp1, __umul_tmp2;				\
	__asm__ ("| Inlined umul_ppmm\n"				\
"	move%.l	%5,%3\n"						\
"	move%.l	%2,%0\n"						\
"	move%.w	%3,%1\n"						\
"	swap	%3\n"							\
"	swap	%0\n"							\
"	mulu%.w	%2,%1\n"						\
"	mulu%.w	%3,%0\n"						\
"	mulu%.w	%2,%3\n"						\
"	swap	%2\n"							\
"	mulu%.w	%5,%2\n"						\
"	add%.l	%3,%2\n"						\
"	jcc	1f\n"							\
"	add%.l	%#0x10000,%0\n"						\
"1:	move%.l	%2,%3\n"						\
"	clr%.w	%2\n"							\
"	swap	%2\n"							\
"	swap	%3\n"							\
"	clr%.w	%3\n"							\
"	add%.l	%3,%1\n"						\
"	addx%.l	%2,%0\n"						\
"	| End inlined umul_ppmm"					\
	      : "=&d" (xh), "=&d" (xl),					\
		"=d" (__umul_tmp1), "=&d" (__umul_tmp2)			\
	      : "%2" ((USItype)(a)), "d" ((USItype)(b)));		\
  } while (0)
#endif /* not mc68020 */
/* The '020, '030, '040 and '060 have bitfield insns.
   GCC 3.4 defines __mc68020__ when in CPU32 mode, check for __mcpu32__ to
   exclude bfffo on that chip (bitfield insns not available).  */
#if (defined (__mc68020__) || defined (mc68020)    \
     || defined (__mc68030__) || defined (mc68030) \
     || defined (__mc68040__) || defined (mc68040) \
     || defined (__mc68060__) || defined (mc68060) \
     || defined (__NeXT__))			   \
  && ! defined (__mcpu32__)
#define count_leading_zeros(count, x) \
  __asm__ ("bfffo %1{%b2:%b2},%0"					\
	   : "=d" (count)						\
	   : "od" ((USItype) (x)), "n" (0))
#define COUNT_LEADING_ZEROS_0 32
#endif
#endif /* mc68000 */
   1244 
   1245 #if defined (__m88000__) && W_TYPE_SIZE == 32
   1246 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
   1247   __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3"			\
   1248 	   : "=r" (sh), "=&r" (sl)					\
   1249 	   : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl))
   1250 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
   1251   __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3"			\
   1252 	   : "=r" (sh), "=&r" (sl)					\
   1253 	   : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl))
   1254 #define count_leading_zeros(count, x) \
   1255   do {									\
   1256     USItype __cbtmp;							\
   1257     __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x));			\
   1258     (count) = __cbtmp ^ 31;						\
   1259   } while (0)
   1260 #define COUNT_LEADING_ZEROS_0 63 /* sic */
   1261 #if defined (__m88110__)
   1262 #define umul_ppmm(wh, wl, u, v) \
   1263   do {									\
   1264     union {UDItype __ll;						\
   1265 	   struct {USItype __h, __l;} __i;				\
   1266 	  } __x;							\
   1267     __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v));	\
   1268     (wh) = __x.__i.__h;							\
   1269     (wl) = __x.__i.__l;							\
   1270   } while (0)
   1271 #define udiv_qrnnd(q, r, n1, n0, d) \
   1272   ({union {UDItype __ll;						\
   1273 	   struct {USItype __h, __l;} __i;				\
   1274 	  } __x, __q;							\
   1275   __x.__i.__h = (n1); __x.__i.__l = (n0);				\
   1276   __asm__ ("divu.d %0,%1,%2"						\
   1277 	   : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d));		\
   1278   (r) = (n0) - __q.__l * (d); (q) = __q.__l; })
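        /* Added note: with the usual udiv_qrnnd precondition n1 < d, the true
           remainder is below d and fits a single word, so computing it as
           n0 - q*d modulo 2^32 is exact.  */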
   1279 #endif /* __m88110__ */
   1280 #endif /* __m88000__ */
   1281 
   1282 #if defined (__mips) && W_TYPE_SIZE == 32
   1283 #if __GMP_GNUC_PREREQ (4,4)
   1284 #define umul_ppmm(w1, w0, u, v) \
   1285   do {									\
   1286     UDItype __ll = (UDItype)(u) * (v);					\
   1287     w1 = __ll >> 32;							\
   1288     w0 = __ll;								\
   1289   } while (0)
   1290 #endif
   1291 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__)
   1292 #define umul_ppmm(w1, w0, u, v) \
   1293   __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
   1294 #endif
   1295 #if !defined (umul_ppmm)
   1296 #define umul_ppmm(w1, w0, u, v) \
   1297   __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1"				\
   1298 	   : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
   1299 #endif
   1300 #endif /* __mips */
   1301 
   1302 #if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
   1303 #if defined (_MIPS_ARCH_MIPS64R6)
   1304 #define umul_ppmm(w1, w0, u, v) \
   1305   do {									\
   1306     UDItype __m0 = (u), __m1 = (v);					\
   1307     (w0) = __m0 * __m1;							\
   1308     __asm__ ("dmuhu\t%0, %1, %2" : "=d" (w1) : "d" (__m0), "d" (__m1));	\
   1309   } while (0)
   1310 #endif
   1311 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (4,4)
   1312 #define umul_ppmm(w1, w0, u, v) \
   1313   do {									\
   1314     typedef unsigned int __ll_UTItype __attribute__((mode(TI)));	\
   1315     __ll_UTItype __ll = (__ll_UTItype)(u) * (v);			\
   1316     w1 = __ll >> 64;							\
   1317     w0 = __ll;								\
   1318   } while (0)
   1319 #endif
   1320 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__)
   1321 #define umul_ppmm(w1, w0, u, v) \
   1322   __asm__ ("dmultu %2,%3"						\
   1323 	   : "=l" (w0), "=h" (w1)					\
   1324 	   : "d" ((UDItype)(u)), "d" ((UDItype)(v)))
   1325 #endif
   1326 #if !defined (umul_ppmm)
   1327 #define umul_ppmm(w1, w0, u, v) \
   1328   __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1"				\
   1329 	   : "=d" (w0), "=d" (w1)					\
   1330 	   : "d" ((UDItype)(u)), "d" ((UDItype)(v)))
   1331 #endif
   1332 #endif /* __mips */
   1333 
   1334 #if defined (__mmix__) && W_TYPE_SIZE == 64
   1335 #define umul_ppmm(w1, w0, u, v) \
   1336   __asm__ ("MULU %0,%2,%3" : "=r" (w0), "=z" (w1) : "r" (u), "r" (v))
   1337 #endif
   1338 
   1339 #if defined (__ns32000__) && W_TYPE_SIZE == 32
   1340 #define umul_ppmm(w1, w0, u, v) \
   1341   ({union {UDItype __ll;						\
   1342 	   struct {USItype __l, __h;} __i;				\
   1343 	  } __x;							\
   1344   __asm__ ("meid %2,%0"							\
   1345 	   : "=g" (__x.__ll)						\
   1346 	   : "%0" ((USItype)(u)), "g" ((USItype)(v)));			\
   1347   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
   1348 #define __umulsidi3(u, v) \
   1349   ({UDItype __w;							\
   1350     __asm__ ("meid %2,%0"						\
   1351 	     : "=g" (__w)						\
   1352 	     : "%0" ((USItype)(u)), "g" ((USItype)(v)));		\
   1353     __w; })
   1354 #define udiv_qrnnd(q, r, n1, n0, d) \
   1355   ({union {UDItype __ll;						\
   1356 	   struct {USItype __l, __h;} __i;				\
   1357 	  } __x;							\
   1358   __x.__i.__h = (n1); __x.__i.__l = (n0);				\
   1359   __asm__ ("deid %2,%0"							\
   1360 	   : "=g" (__x.__ll)						\
   1361 	   : "0" (__x.__ll), "g" ((USItype)(d)));			\
   1362   (r) = __x.__i.__l; (q) = __x.__i.__h; })
   1363 #define count_trailing_zeros(count,x) \
   1364   do {									\
   1365     __asm__ ("ffsd	%2,%0"						\
   1366 	     : "=r" (count)						\
   1367 	     : "0" ((USItype) 0), "r" ((USItype) (x)));			\
   1368   } while (0)
   1369 #endif /* __ns32000__ */
   1370 
   1371 /* In the past we had a block of various #defines tested
   1372        _ARCH_PPC    - AIX
   1373        _ARCH_PWR    - AIX
   1374        __powerpc__  - gcc
   1375        __POWERPC__  - BEOS
   1376        __ppc__      - Darwin
   1377        PPC          - old gcc, GNU/Linux, SysV
   1378    The plain PPC test was not usable for vxWorks, since PPC is defined on
   1379    all CPUs there (e.g. on m68k too), as a constant against which
   1380    CPU_FAMILY is expected to be compared.
   1381 
   1382    At any rate, this was pretty unattractive and a bit fragile.  The use of
   1383    HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of
   1384    getting the desired effect.
   1385 
   1386    ENHANCE-ME: We should test _IBMR2 here when we add assembly support for
   1387    the system vendor compilers.  (Is that vendor compilers with inline asm,
   1388    or what?)  */
   1389 
   1390 #if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc)	\
   1391   && W_TYPE_SIZE == 32
   1392 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
   1393   do {									\
   1394     if (__builtin_constant_p (bh) && (bh) == 0)				\
   1395       __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2"			\
   1396 	       : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl)	\
   1397 		 __CLOBBER_CC);						\
   1398     else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)		\
   1399       __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2"			\
   1400 	       : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl)	\
   1401 		 __CLOBBER_CC);						\
   1402     else								\
   1403       __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3"			\
   1404 	       : "=r" (sh), "=&r" (sl)					\
   1405 	       : "r" (ah), "r" (bh), "%r" (al), "rI" (bl)		\
   1406 		 __CLOBBER_CC);						\
   1407   } while (0)
   1408 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
   1409   do {									\
   1410     if (__builtin_constant_p (ah) && (ah) == 0)				\
   1411       __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2"			\
   1412 	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl)	\
   1413 		 __CLOBBER_CC);						\
   1414     else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0)		\
   1415       __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2"			\
   1416 	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl)	\
   1417 		 __CLOBBER_CC);						\
   1418     else if (__builtin_constant_p (bh) && (bh) == 0)			\
   1419       __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2"			\
   1420 	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl)	\
   1421 		 __CLOBBER_CC);						\
   1422     else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)		\
   1423       __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2"			\
   1424 	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl)	\
   1425 		 __CLOBBER_CC);						\
   1426     else								\
   1427       __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2"			\
   1428 	       : "=r" (sh), "=&r" (sl)					\
   1429 	       : "r" (ah), "r" (bh), "rI" (al), "r" (bl)		\
   1430 		 __CLOBBER_CC);						\
   1431   } while (0)
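        /* Added note, not from the original source: the __builtin_constant_p
           dispatch above costs nothing at run time.  For a call such as
           add_ssaaaa (sh, sl, ah, al, 0, bl), gcc resolves the tests at compile
           time, keeps only the addze variant, and folds the other branches away
           as dead code.  */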
   1432 #define count_leading_zeros(count, x) \
   1433   __asm__ ("cntlzw %0,%1" : "=r" (count) : "r" (x))
   1434 #define COUNT_LEADING_ZEROS_0 32
   1435 #if HAVE_HOST_CPU_FAMILY_powerpc
   1436 #if __GMP_GNUC_PREREQ (4,4)
   1437 #define umul_ppmm(w1, w0, u, v) \
   1438   do {									\
   1439     UDItype __ll = (UDItype)(u) * (v);					\
   1440     w1 = __ll >> 32;							\
   1441     w0 = __ll;								\
   1442   } while (0)
   1443 #endif
   1444 #if !defined (umul_ppmm)
   1445 #define umul_ppmm(ph, pl, m0, m1) \
   1446   do {									\
   1447     USItype __m0 = (m0), __m1 = (m1);					\
   1448     __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
   1449     (pl) = __m0 * __m1;							\
   1450   } while (0)
   1451 #endif
   1452 #define smul_ppmm(ph, pl, m0, m1) \
   1453   do {									\
   1454     SItype __m0 = (m0), __m1 = (m1);					\
   1455     __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
   1456     (pl) = __m0 * __m1;							\
   1457   } while (0)
   1458 #else
   1459 #define smul_ppmm(xh, xl, m0, m1) \
   1460   __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1))
   1461 #define sdiv_qrnnd(q, r, nh, nl, d) \
   1462   __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d))
   1463 #endif
   1464 #endif /* 32-bit POWER architecture variants.  */
   1465 
   1466 /* We should test _IBMR2 here when we add assembly support for the system
   1467    vendor compilers.  */
   1468 #if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64
   1469 #if !defined (_LONG_LONG_LIMB)
   1470 /* _LONG_LONG_LIMB means ABI=mode32, where adde operates on 32-bit values,
   1471    so use adde etc. only when _LONG_LONG_LIMB is not defined.  */
   1472 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
   1473   do {									\
   1474     if (__builtin_constant_p (bh) && (bh) == 0)				\
   1475       __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2"			\
   1476 	       : "=r" (sh), "=&r" (sl)					\
   1477 	       : "r"  ((UDItype)(ah)),					\
   1478 		 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl))		\
   1479 		 __CLOBBER_CC);						\
   1480     else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)		\
   1481       __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2"			\
   1482 	       : "=r" (sh), "=&r" (sl)					\
   1483 	       : "r"  ((UDItype)(ah)),					\
   1484 		 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl))		\
   1485 		 __CLOBBER_CC);						\
   1486     else								\
   1487       __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3"			\
   1488 	       : "=r" (sh), "=&r" (sl)					\
   1489 	       : "r"  ((UDItype)(ah)), "r"  ((UDItype)(bh)),		\
   1490 		 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl))		\
   1491 		 __CLOBBER_CC);						\
   1492   } while (0)
   1493 /* We use "*rI" for the constant operand here, since with just "I", gcc barfs.
   1494    This might seem strange, but gcc folds away the dead code late.  */
   1495 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
   1496   do {									\
   1497     if (__builtin_constant_p (bl) && bl > -0x8000 && bl <= 0x8000) {	\
   1498 	if (__builtin_constant_p (ah) && (ah) == 0)			\
   1499 	  __asm__ ("addic %1,%3,%4\n\tsubfze %0,%2"			\
   1500 		   : "=r" (sh), "=&r" (sl)				\
   1501 		   :                       "r" ((UDItype)(bh)),		\
   1502 		     "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl)))	\
   1503 		     __CLOBBER_CC);					\
   1504 	else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)	\
   1505 	  __asm__ ("addic %1,%3,%4\n\tsubfme %0,%2"			\
   1506 		   : "=r" (sh), "=&r" (sl)				\
   1507 		   :                       "r" ((UDItype)(bh)),		\
   1508 		     "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl)))	\
   1509 		     __CLOBBER_CC);					\
   1510 	else if (__builtin_constant_p (bh) && (bh) == 0)		\
   1511 	  __asm__ ("addic %1,%3,%4\n\taddme %0,%2"			\
   1512 		   : "=r" (sh), "=&r" (sl)				\
   1513 		   : "r"  ((UDItype)(ah)),				\
   1514 		     "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl)))	\
   1515 		     __CLOBBER_CC);					\
   1516 	else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)	\
   1517 	  __asm__ ("addic %1,%3,%4\n\taddze %0,%2"			\
   1518 		   : "=r" (sh), "=&r" (sl)				\
   1519 		   : "r"  ((UDItype)(ah)),				\
   1520 		     "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl)))	\
   1521 		     __CLOBBER_CC);					\
   1522 	else								\
   1523 	  __asm__ ("addic %1,%4,%5\n\tsubfe %0,%3,%2"			\
   1524 		   : "=r" (sh), "=&r" (sl)				\
   1525 		   : "r"  ((UDItype)(ah)), "r" ((UDItype)(bh)),		\
   1526 		     "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl)))	\
   1527 		     __CLOBBER_CC);					\
   1528     } else {								\
   1529 	if (__builtin_constant_p (ah) && (ah) == 0)			\
   1530 	  __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2"			\
   1531 		   : "=r" (sh), "=&r" (sl)				\
   1532 		   :                       "r" ((UDItype)(bh)),		\
   1533 		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl))		\
   1534 		     __CLOBBER_CC);					\
   1535 	else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)	\
   1536 	  __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2"			\
   1537 		   : "=r" (sh), "=&r" (sl)				\
   1538 		   :                       "r" ((UDItype)(bh)),		\
   1539 		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl))		\
   1540 		     __CLOBBER_CC);					\
   1541 	else if (__builtin_constant_p (bh) && (bh) == 0)		\
   1542 	  __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2"			\
   1543 		   : "=r" (sh), "=&r" (sl)				\
   1544 		   : "r"  ((UDItype)(ah)),				\
   1545 		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl))		\
   1546 		     __CLOBBER_CC);					\
   1547 	else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)	\
   1548 	  __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2"			\
   1549 		   : "=r" (sh), "=&r" (sl)				\
   1550 		   : "r"  ((UDItype)(ah)),				\
   1551 		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl))		\
   1552 		     __CLOBBER_CC);					\
   1553 	else								\
   1554 	  __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2"		\
   1555 		   : "=r" (sh), "=&r" (sl)				\
   1556 		   : "r"  ((UDItype)(ah)), "r" ((UDItype)(bh)),		\
   1557 		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl))		\
   1558 		     __CLOBBER_CC);					\
   1559     }									\
   1560   } while (0)
   1561 #endif /* ! _LONG_LONG_LIMB */
   1562 #define count_leading_zeros(count, x) \
   1563   __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
   1564 #define COUNT_LEADING_ZEROS_0 64
   1565 #if __GMP_GNUC_PREREQ (4,8)
   1566 #define umul_ppmm(w1, w0, u, v) \
   1567   do {									\
   1568     typedef unsigned int __ll_UTItype __attribute__((mode(TI)));	\
   1569     __ll_UTItype __ll = (__ll_UTItype)(u) * (v);			\
   1570     w1 = __ll >> 64;							\
   1571     w0 = __ll;								\
   1572   } while (0)
   1573 #endif
   1574 #if !defined (umul_ppmm)
   1575 #define umul_ppmm(ph, pl, m0, m1) \
   1576   do {									\
   1577     UDItype __m0 = (m0), __m1 = (m1);					\
   1578     __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
   1579     (pl) = __m0 * __m1;							\
   1580   } while (0)
   1581 #endif
   1582 #define smul_ppmm(ph, pl, m0, m1) \
   1583   do {									\
   1584     DItype __m0 = (m0), __m1 = (m1);					\
   1585     __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
   1586     (pl) = __m0 * __m1;							\
   1587   } while (0)
   1588 #endif /* 64-bit PowerPC.  */
   1589 
   1590 #if defined (__pyr__) && W_TYPE_SIZE == 32
   1591 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
   1592   __asm__ ("addw %5,%1\n\taddwc %3,%0"					\
   1593 	   : "=r" (sh), "=&r" (sl)					\
   1594 	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
   1595 	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
   1596 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
   1597   __asm__ ("subw %5,%1\n\tsubwb %3,%0"					\
   1598 	   : "=r" (sh), "=&r" (sl)					\
   1599 	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
   1600 	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
   1601 /* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP.  */
   1602 #define umul_ppmm(w1, w0, u, v) \
   1603   ({union {UDItype __ll;						\
   1604 	   struct {USItype __h, __l;} __i;				\
   1605 	  } __x;							\
   1606   __asm__ ("movw %1,%R0\n\tuemul %2,%0"					\
   1607 	   : "=&r" (__x.__ll)						\
   1608 	   : "g" ((USItype) (u)), "g" ((USItype)(v)));			\
   1609   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
   1610 #endif /* __pyr__ */
   1611 
   1612 #if defined (__ibm032__) /* RT/ROMP */  && W_TYPE_SIZE == 32
   1613 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
   1614   __asm__ ("a %1,%5\n\tae %0,%3"					\
   1615 	   : "=r" (sh), "=&r" (sl)					\
   1616 	   : "0"  ((USItype)(ah)), "r" ((USItype)(bh)),			\
   1617 	     "%1" ((USItype)(al)), "r" ((USItype)(bl)))
   1618 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
   1619   __asm__ ("s %1,%5\n\tse %0,%3"					\
   1620 	   : "=r" (sh), "=&r" (sl)					\
   1621 	   : "0" ((USItype)(ah)), "r" ((USItype)(bh)),			\
   1622 	     "1" ((USItype)(al)), "r" ((USItype)(bl)))
   1623 #define smul_ppmm(ph, pl, m0, m1) \
   1624   __asm__ (								\
   1625        "s	r2,r2\n"						\
   1626 "	mts r10,%2\n"							\
   1627 "	m	r2,%3\n"						\
   1628 "	m	r2,%3\n"						\
   1629 "	m	r2,%3\n"						\
   1630 "	m	r2,%3\n"						\
   1631 "	m	r2,%3\n"						\
   1632 "	m	r2,%3\n"						\
   1633 "	m	r2,%3\n"						\
   1634 "	m	r2,%3\n"						\
   1635 "	m	r2,%3\n"						\
   1636 "	m	r2,%3\n"						\
   1637 "	m	r2,%3\n"						\
   1638 "	m	r2,%3\n"						\
   1639 "	m	r2,%3\n"						\
   1640 "	m	r2,%3\n"						\
   1641 "	m	r2,%3\n"						\
   1642 "	m	r2,%3\n"						\
   1643 "	cas	%0,r2,r0\n"						\
   1644 "	mfs	r10,%1"							\
   1645 	   : "=r" (ph), "=r" (pl)					\
   1646 	   : "%r" ((USItype)(m0)), "r" ((USItype)(m1))			\
   1647 	   : "r2")
   1648 #define count_leading_zeros(count, x) \
   1649   do {									\
   1650     if ((x) >= 0x10000)							\
   1651       __asm__ ("clz	%0,%1"						\
   1652 	       : "=r" (count) : "r" ((USItype)(x) >> 16));		\
   1653     else								\
   1654       {									\
   1655 	__asm__ ("clz	%0,%1"						\
   1656 		 : "=r" (count) : "r" ((USItype)(x)));			\
   1657 	(count) += 16;							\
   1658       }									\
   1659   } while (0)
   1660 #endif /* RT/ROMP */
   1661 
   1662 #if defined (__riscv64) && W_TYPE_SIZE == 64
   1663 #define umul_ppmm(ph, pl, u, v) \
   1664   do {									\
   1665     UDItype __u = (u), __v = (v);					\
   1666     (pl) = __u * __v;							\
   1667     __asm__ ("mulhu\t%2, %1, %0" : "=r" (ph) : "%r" (__u), "r" (__v));	\
   1668   } while (0)
   1669 #endif
   1670 
   1671 #if (defined (__SH2__) || defined (__SH3__) || defined (__SH4__)) && W_TYPE_SIZE == 32
   1672 #define umul_ppmm(w1, w0, u, v) \
   1673   __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0"		\
   1674 	   : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")
   1675 #endif
   1676 
   1677 #if defined (__sparc__) && W_TYPE_SIZE == 32
   1678 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
   1679   __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0"				\
   1680 	   : "=r" (sh), "=&r" (sl)					\
   1681 	   : "rJ" (ah), "rI" (bh),"%rJ" (al), "rI" (bl)			\
   1682 	   __CLOBBER_CC)
   1683 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
   1684   __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0"				\
   1685 	   : "=r" (sh), "=&r" (sl)					\
   1686 	   : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl)	\
   1687 	   __CLOBBER_CC)
   1688 /* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h
   1689    doesn't define anything to indicate that to us; it only sets __sparcv8. */
   1690 #if defined (__sparc_v9__) || defined (__sparcv9)
   1691 /* Perhaps we should use floating-point operations here?  */
   1692 #if 0
   1693 /* Triggers a bug making mpz/tests/t-gcd.c fail.
   1694    Perhaps we simply need to explicitly zero-extend the inputs?  */
   1695 #define umul_ppmm(w1, w0, u, v) \
   1696   __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" :		\
   1697 	   "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1")
   1698 #else
   1699 /* Use v8 umul until above bug is fixed.  */
   1700 #define umul_ppmm(w1, w0, u, v) \
   1701   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
   1702 #endif
   1703 /* Use a plain v8 divide for v9.  */
   1704 #define udiv_qrnnd(q, r, n1, n0, d) \
   1705   do {									\
   1706     USItype __q;							\
   1707     __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"			\
   1708 	     : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));		\
   1709     (r) = (n0) - __q * (d);						\
   1710     (q) = __q;								\
   1711   } while (0)
   1712 #else
   1713 #if defined (__sparc_v8__)   /* gcc normal */				\
   1714   || defined (__sparcv8)     /* gcc solaris */				\
   1715   || HAVE_HOST_CPU_supersparc
   1716 /* Don't match immediate range because: 1) it is not often useful,
   1717    2) the 'I' flag thinks of the range as a 13 bit signed interval,
   1718    while we want to match a 13 bit interval, sign extended to 32 bits,
   1719    but INTERPRETED AS UNSIGNED.  */
   1720 #define umul_ppmm(w1, w0, u, v) \
   1721   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
   1722 
   1723 #if HAVE_HOST_CPU_supersparc
   1724 #else
   1725 /* Don't use this on SuperSPARC because its udiv only handles 53 bit
   1726    dividends and will trap to the kernel for the rest. */
   1727 #define udiv_qrnnd(q, r, n1, n0, d) \
   1728   do {									\
   1729     USItype __q;							\
   1730     __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"			\
   1731 	     : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));		\
   1732     (r) = (n0) - __q * (d);						\
   1733     (q) = __q;								\
   1734   } while (0)
   1735 #endif /* HAVE_HOST_CPU_supersparc */
   1736 
   1737 #else /* ! __sparc_v8__ */
   1738 #if defined (__sparclite__)
   1739 /* This has hardware multiply but not divide.  It also has two additional
   1740    instructions, scan (ffs from the high bit) and divscc.  */
   1741 #define umul_ppmm(w1, w0, u, v) \
   1742   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
   1743 #define udiv_qrnnd(q, r, n1, n0, d) \
   1744   __asm__ ("! Inlined udiv_qrnnd\n"					\
   1745 "	wr	%%g0,%2,%%y	! Not a delayed write for sparclite\n"	\
   1746 "	tst	%%g0\n"							\
   1747 "	divscc	%3,%4,%%g1\n"						\
   1748 "	divscc	%%g1,%4,%%g1\n"						\
   1749 "	divscc	%%g1,%4,%%g1\n"						\
   1750 "	divscc	%%g1,%4,%%g1\n"						\
   1751 "	divscc	%%g1,%4,%%g1\n"						\
   1752 "	divscc	%%g1,%4,%%g1\n"						\
   1753 "	divscc	%%g1,%4,%%g1\n"						\
   1754 "	divscc	%%g1,%4,%%g1\n"						\
   1755 "	divscc	%%g1,%4,%%g1\n"						\
   1756 "	divscc	%%g1,%4,%%g1\n"						\
   1757 "	divscc	%%g1,%4,%%g1\n"						\
   1758 "	divscc	%%g1,%4,%%g1\n"						\
   1759 "	divscc	%%g1,%4,%%g1\n"						\
   1760 "	divscc	%%g1,%4,%%g1\n"						\
   1761 "	divscc	%%g1,%4,%%g1\n"						\
   1762 "	divscc	%%g1,%4,%%g1\n"						\
   1763 "	divscc	%%g1,%4,%%g1\n"						\
   1764 "	divscc	%%g1,%4,%%g1\n"						\
   1765 "	divscc	%%g1,%4,%%g1\n"						\
   1766 "	divscc	%%g1,%4,%%g1\n"						\
   1767 "	divscc	%%g1,%4,%%g1\n"						\
   1768 "	divscc	%%g1,%4,%%g1\n"						\
   1769 "	divscc	%%g1,%4,%%g1\n"						\
   1770 "	divscc	%%g1,%4,%%g1\n"						\
   1771 "	divscc	%%g1,%4,%%g1\n"						\
   1772 "	divscc	%%g1,%4,%%g1\n"						\
   1773 "	divscc	%%g1,%4,%%g1\n"						\
   1774 "	divscc	%%g1,%4,%%g1\n"						\
   1775 "	divscc	%%g1,%4,%%g1\n"						\
   1776 "	divscc	%%g1,%4,%%g1\n"						\
   1777 "	divscc	%%g1,%4,%%g1\n"						\
   1778 "	divscc	%%g1,%4,%0\n"						\
   1779 "	rd	%%y,%1\n"						\
   1780 "	bl,a 1f\n"							\
   1781 "	add	%1,%4,%1\n"						\
   1782 "1:	! End of inline udiv_qrnnd"					\
   1783 	   : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d)		\
   1784 	   : "%g1" __AND_CLOBBER_CC)
   1785 #define count_leading_zeros(count, x) \
   1786   __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x))
   1787 /* Early sparclites return 63 for an argument of 0, but they warn that future
   1788    implementations might change this.  Therefore, leave COUNT_LEADING_ZEROS_0
   1789    undefined.  */
   1790 #endif /* __sparclite__ */
   1791 #endif /* __sparc_v8__ */
   1792 #endif /* __sparc_v9__ */
   1793 /* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd.  */
   1794 #ifndef umul_ppmm
   1795 #define umul_ppmm(w1, w0, u, v) \
   1796   __asm__ ("! Inlined umul_ppmm\n"					\
   1797 "	wr	%%g0,%2,%%y	! SPARC has 0-3 delay insn after a wr\n" \
   1798 "	sra	%3,31,%%g2	! Don't move this insn\n"		\
   1799 "	and	%2,%%g2,%%g2	! Don't move this insn\n"		\
   1800 "	andcc	%%g0,0,%%g1	! Don't move this insn\n"		\
   1801 "	mulscc	%%g1,%3,%%g1\n"						\
   1802 "	mulscc	%%g1,%3,%%g1\n"						\
   1803 "	mulscc	%%g1,%3,%%g1\n"						\
   1804 "	mulscc	%%g1,%3,%%g1\n"						\
   1805 "	mulscc	%%g1,%3,%%g1\n"						\
   1806 "	mulscc	%%g1,%3,%%g1\n"						\
   1807 "	mulscc	%%g1,%3,%%g1\n"						\
   1808 "	mulscc	%%g1,%3,%%g1\n"						\
   1809 "	mulscc	%%g1,%3,%%g1\n"						\
   1810 "	mulscc	%%g1,%3,%%g1\n"						\
   1811 "	mulscc	%%g1,%3,%%g1\n"						\
   1812 "	mulscc	%%g1,%3,%%g1\n"						\
   1813 "	mulscc	%%g1,%3,%%g1\n"						\
   1814 "	mulscc	%%g1,%3,%%g1\n"						\
   1815 "	mulscc	%%g1,%3,%%g1\n"						\
   1816 "	mulscc	%%g1,%3,%%g1\n"						\
   1817 "	mulscc	%%g1,%3,%%g1\n"						\
   1818 "	mulscc	%%g1,%3,%%g1\n"						\
   1819 "	mulscc	%%g1,%3,%%g1\n"						\
   1820 "	mulscc	%%g1,%3,%%g1\n"						\
   1821 "	mulscc	%%g1,%3,%%g1\n"						\
   1822 "	mulscc	%%g1,%3,%%g1\n"						\
   1823 "	mulscc	%%g1,%3,%%g1\n"						\
   1824 "	mulscc	%%g1,%3,%%g1\n"						\
   1825 "	mulscc	%%g1,%3,%%g1\n"						\
   1826 "	mulscc	%%g1,%3,%%g1\n"						\
   1827 "	mulscc	%%g1,%3,%%g1\n"						\
   1828 "	mulscc	%%g1,%3,%%g1\n"						\
   1829 "	mulscc	%%g1,%3,%%g1\n"						\
   1830 "	mulscc	%%g1,%3,%%g1\n"						\
   1831 "	mulscc	%%g1,%3,%%g1\n"						\
   1832 "	mulscc	%%g1,%3,%%g1\n"						\
   1833 "	mulscc	%%g1,0,%%g1\n"						\
   1834 "	add	%%g1,%%g2,%0\n"						\
   1835 "	rd	%%y,%1"							\
   1836 	   : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v)			\
   1837 	   : "%g1", "%g2" __AND_CLOBBER_CC)
   1838 #endif
   1839 #ifndef udiv_qrnnd
   1840 #ifndef LONGLONG_STANDALONE
   1841 #define udiv_qrnnd(q, r, n1, n0, d) \
   1842   do { UWtype __r;							\
   1843     (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));			\
   1844     (r) = __r;								\
   1845   } while (0)
   1846 extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
   1847 #endif /* LONGLONG_STANDALONE */
   1848 #endif /* udiv_qrnnd */
   1849 #endif /* __sparc__ */
   1850 
   1851 #if defined (__sparc__) && W_TYPE_SIZE == 64
   1852 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
   1853   __asm__ (								\
   1854        "addcc	%r4,%5,%1\n"						\
   1855       "	addccc	%r6,%7,%%g0\n"						\
   1856       "	addc	%r2,%3,%0"						\
   1857        : "=r" (sh), "=&r" (sl)						\
   1858        : "rJ"  ((UDItype)(ah)), "rI" ((UDItype)(bh)),			\
   1859 	 "%rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)),			\
   1860 	 "%rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32)	\
   1861 	   __CLOBBER_CC)
   1862 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
   1863   __asm__ (								\
   1864        "subcc	%r4,%5,%1\n"						\
   1865       "	subccc	%r6,%7,%%g0\n"						\
   1866       "	subc	%r2,%3,%0"						\
   1867        : "=r" (sh), "=&r" (sl)						\
   1868        : "rJ" ((UDItype)(ah)), "rI" ((UDItype)(bh)),			\
   1869 	 "rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)),			\
   1870 	 "rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32)		\
   1871 	   __CLOBBER_CC)
   1872 #if __VIS__ >= 0x300
   1873 #undef add_ssaaaa
   1874 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
   1875   __asm__ (								\
   1876        "addcc	%r4, %5, %1\n"						\
   1877       "	addxc	%r2, %r3, %0"						\
   1878 	  : "=r" (sh), "=&r" (sl)					\
   1879        : "rJ"  ((UDItype)(ah)), "rJ" ((UDItype)(bh)),			\
   1880 	 "%rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)) __CLOBBER_CC)
   1881 #define umul_ppmm(ph, pl, m0, m1) \
   1882   do {									\
   1883     UDItype __m0 = (m0), __m1 = (m1);					\
   1884     (pl) = __m0 * __m1;							\
   1885     __asm__ ("umulxhi\t%2, %1, %0"					\
   1886 	     : "=r" (ph)						\
   1887 	     : "%r" (__m0), "r" (__m1));				\
   1888   } while (0)
   1889 #define count_leading_zeros(count, x) \
   1890   __asm__ ("lzd\t%1,%0" : "=r" (count) : "r" (x))
   1891 /* Needed by count_leading_zeros_32 in sparc64.h.  */
   1892 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
   1893 #endif
   1894 #endif
   1895 
   1896 #if (defined (__vax) || defined (__vax__)) && W_TYPE_SIZE == 32
   1897 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
   1898   __asm__ ("addl2 %5,%1\n\tadwc %3,%0"					\
   1899 	   : "=g" (sh), "=&g" (sl)					\
   1900 	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
   1901 	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
   1902 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
   1903   __asm__ ("subl2 %5,%1\n\tsbwc %3,%0"					\
   1904 	   : "=g" (sh), "=&g" (sl)					\
   1905 	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
   1906 	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
   1907 #define smul_ppmm(xh, xl, m0, m1) \
   1908   do {									\
   1909     union {UDItype __ll;						\
   1910 	   struct {USItype __l, __h;} __i;				\
   1911 	  } __x;							\
   1912     USItype __m0 = (m0), __m1 = (m1);					\
   1913     __asm__ ("emul %1,%2,$0,%0"						\
   1914 	     : "=g" (__x.__ll) : "g" (__m0), "g" (__m1));		\
   1915     (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
   1916   } while (0)
   1917 #define sdiv_qrnnd(q, r, n1, n0, d) \
   1918   do {									\
   1919     union {DItype __ll;							\
   1920 	   struct {SItype __l, __h;} __i;				\
   1921 	  } __x;							\
   1922     __x.__i.__h = n1; __x.__i.__l = n0;					\
   1923     __asm__ ("ediv %3,%2,%0,%1"						\
   1924 	     : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d));		\
   1925   } while (0)
   1926 #if 0
   1927 /* FIXME: This instruction appears to be unimplemented on some systems (vax
   1928    8800 maybe). */
   1929 #define count_trailing_zeros(count,x)					\
   1930   do {									\
   1931     __asm__ ("ffs 0, 31, %1, %0"					\
   1932 	     : "=g" (count)						\
   1933 	     : "g" ((USItype) (x)));					\
   1934   } while (0)
   1935 #endif
   1936 #endif /* vax */
   1937 
   1938 #if defined (__z8000__) && W_TYPE_SIZE == 16
   1939 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
   1940   __asm__ ("add	%H1,%H5\n\tadc	%H0,%H3"				\
   1941 	   : "=r" (sh), "=&r" (sl)					\
   1942 	   : "0"  ((unsigned int)(ah)), "r" ((unsigned int)(bh)),	\
   1943 	     "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
   1944 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
   1945   __asm__ ("sub	%H1,%H5\n\tsbc	%H0,%H3"				\
   1946 	   : "=r" (sh), "=&r" (sl)					\
   1947 	   : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)),	\
   1948 	     "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
   1949 #define umul_ppmm(xh, xl, m0, m1) \
   1950   do {									\
   1951     union {long int __ll;						\
   1952 	   struct {unsigned int __h, __l;} __i;				\
   1953 	  } __x;							\
   1954     unsigned int __m0 = (m0), __m1 = (m1);				\
   1955     __asm__ ("mult	%S0,%H3"					\
   1956 	     : "=r" (__x.__i.__h), "=r" (__x.__i.__l)			\
   1957 	     : "%1" (m0), "rQR" (m1));					\
   1958     (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
   1959     (xh) += ((((signed int) __m0 >> 15) & __m1)				\
   1960 	     + (((signed int) __m1 >> 15) & __m0));			\
   1961   } while (0)
   1962 #endif /* __z8000__ */
   1963 
   1964 #endif /* __GNUC__ */
   1965 
   1966 #endif /* NO_ASM */
   1967 
   1968 
   1969 /* FIXME: "sidi" here is highly doubtful, should sometimes be "diti".  */
   1970 #if !defined (umul_ppmm) && defined (__umulsidi3)
   1971 #define umul_ppmm(ph, pl, m0, m1) \
   1972   do {									\
   1973     UDWtype __ll = __umulsidi3 (m0, m1);				\
   1974     ph = (UWtype) (__ll >> W_TYPE_SIZE);				\
   1975     pl = (UWtype) __ll;							\
   1976   } while (0)
   1977 #endif
   1978 
   1979 #if !defined (__umulsidi3)
   1980 #define __umulsidi3(u, v) \
   1981   ({UWtype __hi, __lo;							\
   1982     umul_ppmm (__hi, __lo, u, v);					\
   1983     ((UDWtype) __hi << W_TYPE_SIZE) | __lo; })
   1984 #endif
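        /* Illustrative usage (ours; the helper name is hypothetical): whichever
           of the two interfaces was native and whichever was derived from it,
           they agree.  */
        #if 0
        static void
        example_umulsidi3 (UWtype a, UWtype b)
        {
          UWtype hi, lo;
          UDWtype p;

          umul_ppmm (hi, lo, a, b);     /* product as two words, hi:lo */
          p = __umulsidi3 (a, b);       /* the same product as one UDWtype */
          /* Invariant: p == ((UDWtype) hi << W_TYPE_SIZE) | lo.  */
        }
        #endif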
   1985 
   1986 
   1987 #if defined (__cplusplus)
   1988 #define __longlong_h_C "C"
   1989 #else
   1990 #define __longlong_h_C
   1991 #endif
   1992 
   1993 /* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist.  The "_r"
   1994    forms have "reversed" arguments, meaning the pointer is last, which
   1995    sometimes allows better parameter passing, in particular on 64-bit
   1996    hppa. */
   1997 
   1998 #define mpn_umul_ppmm  __MPN(umul_ppmm)
   1999 extern __longlong_h_C UWtype mpn_umul_ppmm (UWtype *, UWtype, UWtype);
   2000 
   2001 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm  \
   2002   && ! defined (LONGLONG_STANDALONE)
   2003 #define umul_ppmm(wh, wl, u, v)						\
   2004   do {									\
   2005     UWtype __umul_ppmm__p0;						\
   2006     (wh) = mpn_umul_ppmm (&__umul_ppmm__p0, (UWtype) (u), (UWtype) (v));\
   2007     (wl) = __umul_ppmm__p0;						\
   2008   } while (0)
   2009 #endif
   2010 
   2011 #define mpn_umul_ppmm_r  __MPN(umul_ppmm_r)
   2012 extern __longlong_h_C UWtype mpn_umul_ppmm_r (UWtype, UWtype, UWtype *);
   2013 
   2014 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r	\
   2015   && ! defined (LONGLONG_STANDALONE)
   2016 #define umul_ppmm(wh, wl, u, v)						\
   2017   do {									\
   2018     UWtype __umul_p0;							\
   2019     (wh) = mpn_umul_ppmm_r ((UWtype) (u), (UWtype) (v), &__umul_p0);	\
   2020     (wl) = __umul_p0;							\
   2021   } while (0)
   2022 #endif
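        /* Illustrative only (hypothetical caller, not from the original source):
           both entry points compute the same product and differ only in where
           the low-word pointer sits.  */
        #if 0
        static void
        example_mpn_umul_ppmm (UWtype u, UWtype v)
        {
          UWtype lo1, lo2, hi1, hi2;

          hi1 = mpn_umul_ppmm (&lo1, u, v);     /* pointer first */
          hi2 = mpn_umul_ppmm_r (u, v, &lo2);   /* pointer last, the "_r" form */
          /* hi1 == hi2 and lo1 == lo2 */
        }
        #endif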
   2023 
   2024 #define mpn_udiv_qrnnd  __MPN(udiv_qrnnd)
   2025 extern __longlong_h_C UWtype mpn_udiv_qrnnd (UWtype *, UWtype, UWtype, UWtype);
   2026 
   2027 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd	\
   2028   && ! defined (LONGLONG_STANDALONE)
   2029 #define udiv_qrnnd(q, r, n1, n0, d)					\
   2030   do {									\
   2031     UWtype __udiv_qrnnd_r;						\
   2032     (q) = mpn_udiv_qrnnd (&__udiv_qrnnd_r,				\
   2033 			  (UWtype) (n1), (UWtype) (n0), (UWtype) d);	\
   2034     (r) = __udiv_qrnnd_r;						\
   2035   } while (0)
   2036 #endif
   2037 
   2038 #define mpn_udiv_qrnnd_r  __MPN(udiv_qrnnd_r)
   2039 extern __longlong_h_C UWtype mpn_udiv_qrnnd_r (UWtype, UWtype, UWtype, UWtype *);
   2040 
   2041 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r	\
   2042   && ! defined (LONGLONG_STANDALONE)
   2043 #define udiv_qrnnd(q, r, n1, n0, d)					\
   2044   do {									\
   2045     UWtype __udiv_qrnnd_r;						\
   2046     (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) d,	\
   2047 			    &__udiv_qrnnd_r);				\
   2048     (r) = __udiv_qrnnd_r;						\
   2049   } while (0)
   2050 #endif
   2051 
   2052 
   2053 /* If this machine has no inline assembler, use C macros.  */
   2054 
   2055 #if !defined (add_ssaaaa)
   2056 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
   2057   do {									\
   2058     UWtype __x;								\
   2059     __x = (al) + (bl);							\
   2060     (sh) = (ah) + (bh) + (__x < (al));					\
   2061     (sl) = __x;								\
   2062   } while (0)
   2063 #endif
   2064 
   2065 #if !defined (sub_ddmmss)
   2066 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
   2067   do {									\
   2068     UWtype __x;								\
   2069     __x = (al) - (bl);							\
   2070     (sh) = (ah) - (bh) - ((al) < (bl));					\
   2071     (sl) = __x;								\
   2072   } while (0)
   2073 #endif
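        /* Worked example (ours, illustrative), assuming W_TYPE_SIZE == 32: the
           low-word comparison is what propagates the carry.  */
        #if 0
        static void
        example_add_ssaaaa (void)
        {
          UWtype sh, sl;

          /* 0x00000001:0xffffffff + 0x00000000:0x00000001: the low words wrap
             to 0, __x < al signals the carry, giving 0x00000002:0x00000000.  */
          add_ssaaaa (sh, sl, (UWtype) 1, (UWtype) 0xffffffff,
                      (UWtype) 0, (UWtype) 1);
          /* sh == 2, sl == 0 */
        }
        #endif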
   2074 
   2075 /* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
   2076    smul_ppmm.  */
   2077 #if !defined (umul_ppmm) && defined (smul_ppmm)
   2078 #define umul_ppmm(w1, w0, u, v)						\
   2079   do {									\
   2080     UWtype __w1;							\
   2081     UWtype __xm0 = (u), __xm1 = (v);					\
   2082     smul_ppmm (__w1, w0, __xm0, __xm1);					\
   2083     (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)		\
   2084 		+ (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);		\
   2085   } while (0)
   2086 #endif
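        /* Why the correction above works (added sketch): read the operand bit
           patterns both ways, U,V unsigned and u,v signed, so that with W =
           W_TYPE_SIZE we have U = u + 2^W*[u<0] and V = v + 2^W*[v<0].  Then

             U*V = u*v + 2^W*([u<0]*v + [v<0]*u) + 2^(2W)*[u<0]*[v<0]

           and taking the high word modulo 2^W kills the last term, leaving
           exactly the two masked additions: -(__xm0 >> (W-1)) is an all-ones
           mask when the sign bit is set and 0 otherwise.  */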
   2087 
   2088 /* If we still don't have umul_ppmm, define it using plain C.
   2089 
   2090    For reference, when this code is used for squaring (i.e. u and v are
   2091    identical expressions), gcc recognises that __x1 and __x2 are the same and
   2092    generates 3 multiplies, not 4.  The subsequent additions could be optimized,
   2093    but the only place GMP currently uses such a square is mpn_sqr_basecase,
   2094    and chips obliged to use this generic C umul will have plenty of worse
   2095    performance problems than a couple of extra instructions on the diagonal
   2096    of sqr_basecase.  */
   2097 
   2098 #if !defined (umul_ppmm)
   2099 #define umul_ppmm(w1, w0, u, v)						\
   2100   do {									\
   2101     UWtype __x0, __x1, __x2, __x3;					\
   2102     UHWtype __ul, __vl, __uh, __vh;					\
   2103     UWtype __u = (u), __v = (v);					\
   2104 									\
   2105     __ul = __ll_lowpart (__u);						\
   2106     __uh = __ll_highpart (__u);						\
   2107     __vl = __ll_lowpart (__v);						\
   2108     __vh = __ll_highpart (__v);						\
   2109 									\
   2110     __x0 = (UWtype) __ul * __vl;					\
   2111     __x1 = (UWtype) __ul * __vh;					\
   2112     __x2 = (UWtype) __uh * __vl;					\
   2113     __x3 = (UWtype) __uh * __vh;					\
   2114 									\
   2115     __x1 += __ll_highpart (__x0);/* this can't give carry */		\
   2116     __x1 += __x2;		/* but this indeed can */		\
   2117     if (__x1 < __x2)		/* did we get it? */			\
   2118       __x3 += __ll_B;		/* yes, add it in the proper pos. */	\
   2119 									\
   2120     (w1) = __x3 + __ll_highpart (__x1);					\
   2121     (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0);		\
   2122   } while (0)
   2123 #endif
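        /* Quick illustrative check (ours), assuming W_TYPE_SIZE == 32:
           0xffffffff * 0xffffffff = 0xfffffffe00000001, so the macro must
           deliver w1 = 0xfffffffe and w0 = 0x00000001.  */
        #if 0
        static void
        example_umul_ppmm (void)
        {
          UWtype w1, w0;
          umul_ppmm (w1, w0, (UWtype) 0xffffffff, (UWtype) 0xffffffff);
          /* w1 == 0xfffffffe, w0 == 0x00000001 */
        }
        #endif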
   2124 
   2125 /* If we don't have smul_ppmm, define it using umul_ppmm (which surely will
   2126    exist in one form or another).  */
   2127 #if !defined (smul_ppmm)
   2128 #define smul_ppmm(w1, w0, u, v)						\
   2129   do {									\
   2130     UWtype __w1;							\
   2131     UWtype __xm0 = (u), __xm1 = (v);					\
   2132     umul_ppmm (__w1, w0, __xm0, __xm1);					\
   2133     (w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)		\
   2134 		- (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);		\
   2135   } while (0)
   2136 #endif
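        /* Added note: this is the mirror of the umul-from-smul correction above;
           the same sign-adjustment identity solved for the signed high word,
           hence the masked terms are subtracted rather than added.  */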
   2137 
   2138 /* Define this unconditionally, so it can be used for debugging.  */
   2139 #define __udiv_qrnnd_c(q, r, n1, n0, d) \
   2140   do {									\
   2141     UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m;			\
   2142 									\
   2143     ASSERT ((d) != 0);							\
   2144     ASSERT ((n1) < (d));						\
   2145 									\
   2146     __d1 = __ll_highpart (d);						\
   2147     __d0 = __ll_lowpart (d);						\
   2148 									\
   2149     __q1 = (n1) / __d1;							\
   2150     __r1 = (n1) - __q1 * __d1;						\
   2151     __m = __q1 * __d0;							\
   2152     __r1 = __r1 * __ll_B | __ll_highpart (n0);				\
   2153     if (__r1 < __m)							\
   2154       {									\
   2155 	__q1--, __r1 += (d);						\
   2156 	if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
   2157 	  if (__r1 < __m)						\
   2158 	    __q1--, __r1 += (d);					\
   2159       }									\
   2160     __r1 -= __m;							\
   2161 									\
   2162     __q0 = __r1 / __d1;							\
   2163     __r0 = __r1  - __q0 * __d1;						\
   2164     __m = __q0 * __d0;							\
   2165     __r0 = __r0 * __ll_B | __ll_lowpart (n0);				\
   2166     if (__r0 < __m)							\
   2167       {									\
   2168 	__q0--, __r0 += (d);						\
   2169 	if (__r0 >= (d))						\
   2170 	  if (__r0 < __m)						\
   2171 	    __q0--, __r0 += (d);					\
   2172       }									\
   2173     __r0 -= __m;							\
   2174 									\
   2175     (q) = __q1 * __ll_B | __q0;						\
   2176     (r) = __r0;								\
   2177   } while (0)
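        /* Quick illustrative check (ours), assuming W_TYPE_SIZE == 32: dividing
           0x00000001:0x00000000, i.e. 2^32, by the normalized divisor 0x80000000
           gives q = 2, r = 0.  */
        #if 0
        static void
        example_udiv_qrnnd_c (void)
        {
          UWtype q, r;
          __udiv_qrnnd_c (q, r, (UWtype) 1, (UWtype) 0, (UWtype) 0x80000000);
          /* q == 2, r == 0 */
        }
        #endif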
   2178 
   2179 /* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through
   2180    __udiv_w_sdiv (defined in libgcc or elsewhere).  */
   2181 #if !defined (udiv_qrnnd) && defined (sdiv_qrnnd) \
   2182   && ! defined (LONGLONG_STANDALONE)
   2183 #define udiv_qrnnd(q, r, nh, nl, d) \
   2184   do {									\
   2185     UWtype __r;								\
   2186     (q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d);				\
   2187     (r) = __r;								\
   2188   } while (0)
   2189 __GMP_DECLSPEC UWtype __MPN(udiv_w_sdiv) (UWtype *, UWtype, UWtype, UWtype);
   2190 #endif
   2191 
   2192 /* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c.  */
   2193 #if !defined (udiv_qrnnd)
   2194 #define UDIV_NEEDS_NORMALIZATION 1
   2195 #define udiv_qrnnd __udiv_qrnnd_c
   2196 #endif
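        /* Added sketch, not from the original source: UDIV_NEEDS_NORMALIZATION
           means the divisor must have its high bit set.  A hypothetical caller
           with an arbitrary nonzero d (and n1 < d) would pre-shift and fix up
           the remainder, assuming count_leading_zeros is available:  */
        #if 0
        static void
        example_udiv_normalize (UWtype n1, UWtype n0, UWtype d, UWtype *q, UWtype *r)
        {
          int cnt;

          count_leading_zeros (cnt, d);
          if (cnt != 0)
            {
              /* Shift divisor and numerator left by the same amount ...  */
              d <<= cnt;
              n1 = (n1 << cnt) | (n0 >> (W_TYPE_SIZE - cnt));
              n0 <<= cnt;
            }
          udiv_qrnnd (*q, *r, n1, n0, d);
          *r >>= cnt;   /* ... and shift the remainder back.  */
        }
        #endif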
   2197 
   2198 #if !defined (count_leading_zeros)
   2199 #define count_leading_zeros(count, x) \
   2200   do {									\
   2201     UWtype __xr = (x);							\
   2202     UWtype __a;								\
   2203 									\
   2204     if (W_TYPE_SIZE == 32)						\
   2205       {									\
   2206 	__a = __xr < ((UWtype) 1 << 2*__BITS4)				\
   2207 	  ? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1)		\
   2208 	  : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 + 1		\
   2209 	  : 3*__BITS4 + 1);						\
   2210       }									\
   2211     else								\
   2212       {									\
   2213 	for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8)			\
   2214 	  if (((__xr >> __a) & 0xff) != 0)				\
   2215 	    break;							\
   2216 	++__a;								\
   2217       }									\
   2218 									\
   2219     (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a];		\
   2220   } while (0)
   2221 /* This version gives a well-defined value for zero. */
   2222 #define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1)
   2223 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
   2224 #define COUNT_LEADING_ZEROS_SLOW
   2225 #endif
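        /* Worked example (ours): with W_TYPE_SIZE == 32 and x = 0x00ffffff the
           ladder picks __a = 2*__BITS4 + 1 = 17, and __clz_tab[x >> 17] =
           __clz_tab[127] = 8, so count = 32 + 1 - 17 - 8 = 8, matching the
           eight leading zero bits.  For x == 0 the same formula gives
           32 + 1 - 1 - __clz_tab[0] = 31, the COUNT_LEADING_ZEROS_0 value
           promised above.  */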
   2226 
   2227 /* clz_tab needed by mpn/x86/pentium/mod_1.asm in a fat binary */
   2228 #if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY
   2229 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
   2230 #endif
   2231 
   2232 #ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
   2233 extern const unsigned char __GMP_DECLSPEC __clz_tab[129];
   2234 #endif
   2235 
   2236 #if !defined (count_trailing_zeros)
   2237 #if !defined (COUNT_LEADING_ZEROS_SLOW)
   2238 /* Define count_trailing_zeros using an asm count_leading_zeros.  */
   2239 #define count_trailing_zeros(count, x)					\
   2240   do {									\
   2241     UWtype __ctz_x = (x);						\
   2242     UWtype __ctz_c;							\
   2243     ASSERT (__ctz_x != 0);						\
   2244     count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x);			\
   2245     (count) = W_TYPE_SIZE - 1 - __ctz_c;				\
   2246   } while (0)
   2247 #else
   2248 /* Define count_trailing_zeros in plain C, assuming small counts are common.
   2249    We use __clz_tab freely, since the C count_leading_zeros above will have
   2250    pulled it in already.  */
   2251 #define count_trailing_zeros(count, x)					\
   2252   do {									\
   2253     UWtype __ctz_x = (x);						\
   2254     int __ctz_c;							\
   2255 									\
   2256     if (LIKELY ((__ctz_x & 0xff) != 0))					\
   2257       (count) = __clz_tab[__ctz_x & -__ctz_x] - 2;			\
   2258     else								\
   2259       {									\
   2260 	for (__ctz_c = 8 - 2; __ctz_c < W_TYPE_SIZE - 2; __ctz_c += 8)	\
   2261 	  {								\
   2262 	    __ctz_x >>= 8;						\
   2263 	    if (LIKELY ((__ctz_x & 0xff) != 0))				\
   2264 	      break;							\
   2265 	  }								\
   2266 									\
   2267 	(count) = __ctz_c + __clz_tab[__ctz_x & -__ctz_x];		\
   2268       }									\
   2269   } while (0)
   2270 #endif
   2271 #endif
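        /* Worked example (ours): x & -x isolates the lowest set bit, e.g. for
           x = 24 (binary 11000) it yields 8 = 2^3.  The asm branch then gets
           W_TYPE_SIZE - 1 - 3 from count_leading_zeros and reports 3 trailing
           zeros; the table branch gets the same answer as __clz_tab[8] - 2 =
           5 - 2 = 3.  */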
   2272 
   2273 #ifndef UDIV_NEEDS_NORMALIZATION
   2274 #define UDIV_NEEDS_NORMALIZATION 0
   2275 #endif
   2276 
   2277 /* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, in
   2278    which case the latter should always be used.  */
   2279 #ifndef UDIV_PREINV_ALWAYS
   2280 #define UDIV_PREINV_ALWAYS 0
   2281 #endif
   2282