Home | History | Annotate | Line # | Download | only in dist
longlong.h revision 1.5
      1 /* longlong.h -- definitions for mixed size 32/64 bit arithmetic.
      2 
      3 Copyright 1991-1994, 1996, 1997, 1999-2005, 2007-2009, 2011-2020 Free Software
      4 Foundation, Inc.
      5 
      6 This file is part of the GNU MP Library.
      7 
      8 The GNU MP Library is free software; you can redistribute it and/or modify
      9 it under the terms of either:
     10 
     11   * the GNU Lesser General Public License as published by the Free
     12     Software Foundation; either version 3 of the License, or (at your
     13     option) any later version.
     14 
     15 or
     16 
     17   * the GNU General Public License as published by the Free Software
     18     Foundation; either version 2 of the License, or (at your option) any
     19     later version.
     20 
     21 or both in parallel, as here.
     22 
     23 The GNU MP Library is distributed in the hope that it will be useful, but
     24 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     25 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     26 for more details.
     27 
     28 You should have received copies of the GNU General Public License and the
     29 GNU Lesser General Public License along with the GNU MP Library.  If not,
     30 see https://www.gnu.org/licenses/.  */
     31 
     32 /* You have to define the following before including this file:
     33 
     34    UWtype -- An unsigned type, default type for operations (typically a "word")
     35    UHWtype -- An unsigned type, at least half the size of UWtype
      36    UDWtype -- An unsigned type, at least twice as large as UWtype
     37    W_TYPE_SIZE -- size in bits of UWtype
     38 
     39    SItype, USItype -- Signed and unsigned 32 bit types
     40    DItype, UDItype -- Signed and unsigned 64 bit types
     41 
     42    On a 32 bit machine UWtype should typically be USItype;
     43    on a 64 bit machine, UWtype should typically be UDItype.
     44 
     45    Optionally, define:
     46 
     47    LONGLONG_STANDALONE -- Avoid code that needs machine-dependent support files
     48    NO_ASM -- Disable inline asm
     49 
     50 
     51    CAUTION!  Using this version of longlong.h outside of GMP is not safe.  You
     52    need to include gmp.h and gmp-impl.h, or certain things might not work as
     53    expected.
     54 */
     55 
     56 #define __BITS4 (W_TYPE_SIZE / 4)
     57 #define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
     58 #define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
     59 #define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
     60 
     61 /* This is used to make sure no undesirable sharing between different libraries
     62    that use this file takes place.  */
     63 #ifndef __MPN
     64 #define __MPN(x) __##x
     65 #endif
     66 
     67 /* Define auxiliary asm macros.
     68 
     69    1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
     70    UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
     71    word product in HIGH_PROD and LOW_PROD.
     72 
     73    2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
     74    UDWtype product.  This is just a variant of umul_ppmm.
     75 
     76    3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
     77    denominator) divides a UDWtype, composed by the UWtype integers
     78    HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
     79    in QUOTIENT and the remainder in REMAINDER.  HIGH_NUMERATOR must be less
     80    than DENOMINATOR for correct operation.  If, in addition, the most
     81    significant bit of DENOMINATOR must be 1, then the pre-processor symbol
     82    UDIV_NEEDS_NORMALIZATION is defined to 1.
     83 
     84    4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
     85    denominator).  Like udiv_qrnnd but the numbers are signed.  The quotient
     86    is rounded towards 0.
     87 
     88    5) count_leading_zeros(count, x) counts the number of zero-bits from the
     89    msb to the first non-zero bit in the UWtype X.  This is the number of
     90    steps X needs to be shifted left to set the msb.  Undefined for X == 0,
     91    unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.
     92 
     93    6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
     94    from the least significant end.
     95 
     96    7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
     97    high_addend_2, low_addend_2) adds two UWtype integers, composed by
     98    HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
     99    respectively.  The result is placed in HIGH_SUM and LOW_SUM.  Overflow
    100    (i.e. carry out) is not stored anywhere, and is lost.
    101 
    102    8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
    103    high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
     104    composed by HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
     105    LOW_SUBTRAHEND respectively.  The result is placed in HIGH_DIFFERENCE
    106    and LOW_DIFFERENCE.  Overflow (i.e. carry out) is not stored anywhere,
    107    and is lost.
    108 
    109    If any of these macros are left undefined for a particular CPU,
    110    C macros are used.
    111 
    112 
    113    Notes:
    114 
    115    For add_ssaaaa the two high and two low addends can both commute, but
    116    unfortunately gcc only supports one "%" commutative in each asm block.
    117    This has always been so but is only documented in recent versions
    118    (eg. pre-release 3.3).  Having two or more "%"s can cause an internal
    119    compiler error in certain rare circumstances.
    120 
    121    Apparently it was only the last "%" that was ever actually respected, so
    122    the code has been updated to leave just that.  Clearly there's a free
    123    choice whether high or low should get it, if there's a reason to favour
    124    one over the other.  Also obviously when the constraints on the two
    125    operands are identical there's no benefit to the reloader in any "%" at
    126    all.
    127 
    128    */
    129 
    130 /* The CPUs come in alphabetical order below.
    131 
    132    Please add support for more CPUs here, or improve the current support
    133    for the CPUs below!  */
    134 
    135 
    136 /* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc
    137    3.4 __builtin_clzl or __builtin_clzll, according to our limb size.
    138    Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or
    139    __builtin_ctzll.
    140 
    141    These builtins are only used when we check what code comes out, on some
    142    chips they're merely libgcc calls, where we will instead want an inline
    143    in that case (either asm or generic C).
    144 
    145    These builtins are better than an asm block of the same insn, since an
    146    asm block doesn't give gcc any information about scheduling or resource
    147    usage.  We keep an asm block for use on prior versions of gcc though.
    148 
    149    For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but
    150    it's not used (for count_leading_zeros) because it generally gives extra
    151    code to ensure the result is 0 when the input is 0, which we don't need
    152    or want.  */
    153 
    154 #ifdef _LONG_LONG_LIMB
    155 #define count_leading_zeros_gcc_clz(count,x)	\
    156   do {						\
    157     ASSERT ((x) != 0);				\
    158     (count) = __builtin_clzll (x);		\
    159   } while (0)
    160 #else
    161 #define count_leading_zeros_gcc_clz(count,x)	\
    162   do {						\
    163     ASSERT ((x) != 0);				\
    164     (count) = __builtin_clzl (x);		\
    165   } while (0)
    166 #endif
    167 
    168 #ifdef _LONG_LONG_LIMB
    169 #define count_trailing_zeros_gcc_ctz(count,x)	\
    170   do {						\
    171     ASSERT ((x) != 0);				\
    172     (count) = __builtin_ctzll (x);		\
    173   } while (0)
    174 #else
    175 #define count_trailing_zeros_gcc_ctz(count,x)	\
    176   do {						\
    177     ASSERT ((x) != 0);				\
    178     (count) = __builtin_ctzl (x);		\
    179   } while (0)
    180 #endif
    181 
    182 
    183 /* FIXME: The macros using external routines like __MPN(count_leading_zeros)
    184    don't need to be under !NO_ASM */
    185 #if ! defined (NO_ASM)
    186 
    187 #if defined (__alpha) && W_TYPE_SIZE == 64
    188 /* Most alpha-based machines, except Cray systems. */
    189 #if defined (__GNUC__)
    190 #if __GMP_GNUC_PREREQ (3,3)
    191 #define umul_ppmm(ph, pl, m0, m1) \
    192   do {									\
    193     UDItype __m0 = (m0), __m1 = (m1);					\
    194     (ph) = __builtin_alpha_umulh (__m0, __m1);				\
    195     (pl) = __m0 * __m1;							\
    196   } while (0)
    197 #else
    198 #define umul_ppmm(ph, pl, m0, m1) \
    199   do {									\
    200     UDItype __m0 = (m0), __m1 = (m1);					\
    201     __asm__ ("umulh %r1,%2,%0"						\
    202 	     : "=r" (ph)						\
    203 	     : "%rJ" (__m0), "rI" (__m1));				\
    204     (pl) = __m0 * __m1;							\
    205   } while (0)
    206 #endif
    207 #else /* ! __GNUC__ */
    208 #include <machine/builtins.h>
    209 #define umul_ppmm(ph, pl, m0, m1) \
    210   do {									\
    211     UDItype __m0 = (m0), __m1 = (m1);					\
    212     (ph) = __UMULH (__m0, __m1);					\
    213     (pl) = __m0 * __m1;							\
    214   } while (0)
    215 #endif
    216 #ifndef LONGLONG_STANDALONE
    217 #define udiv_qrnnd(q, r, n1, n0, d) \
    218   do { UWtype __di;							\
    219     __di = __MPN(invert_limb) (d);					\
    220     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
    221   } while (0)
    222 #define UDIV_PREINV_ALWAYS  1
    223 #define UDIV_NEEDS_NORMALIZATION 1
    224 #endif /* LONGLONG_STANDALONE */
    225 
    226 /* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm
    227    always goes into libgmp.so, even when not actually used.  */
    228 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
    229 
    230 #if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX
    231 #define count_leading_zeros(COUNT,X) \
    232   __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X))
    233 #define count_trailing_zeros(COUNT,X) \
    234   __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X))
    235 #endif /* clz/ctz using cix */
    236 
    237 #if ! defined (count_leading_zeros)				\
    238   && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE)
    239 /* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0.
    240    "$31" is written explicitly in the asm, since an "r" constraint won't
    241    select reg 31.  There seems no need to worry about "r31" syntax for cray,
    242    since gcc itself (pre-release 3.4) emits just $31 in various places.	 */
    243 #define ALPHA_CMPBGE_0(dst, src)					\
    244   do { asm ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0)
    245 /* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts
    246    them, locating the highest non-zero byte.  A second __clz_tab lookup
    247    counts the leading zero bits in that byte, giving the result.  */
    248 #define count_leading_zeros(count, x)					\
    249   do {									\
    250     UWtype  __clz__b, __clz__c, __clz__x = (x);				\
    251     ALPHA_CMPBGE_0 (__clz__b,  __clz__x);	    /* zero bytes */	\
    252     __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F];  /* 8 to 1 byte */	\
    253     __clz__b = __clz__b * 8 - 7;		    /* 57 to 1 shift */ \
    254     __clz__x >>= __clz__b;						\
    255     __clz__c = __clz_tab [__clz__x];		    /* 8 to 1 bit */	\
    256     __clz__b = 65 - __clz__b;						\
    257     (count) = __clz__b - __clz__c;					\
    258   } while (0)
    259 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
    260 #endif /* clz using cmpbge */
    261 
    262 #if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE)
    263 #if HAVE_ATTRIBUTE_CONST
    264 long __MPN(count_leading_zeros) (UDItype) __attribute__ ((const));
    265 #else
    266 long __MPN(count_leading_zeros) (UDItype);
    267 #endif
    268 #define count_leading_zeros(count, x) \
    269   ((count) = __MPN(count_leading_zeros) (x))
    270 #endif /* clz using mpn */
    271 #endif /* __alpha */
    272 
    273 #if defined (__AVR) && W_TYPE_SIZE == 8
    274 #define umul_ppmm(ph, pl, m0, m1) \
    275   do {									\
    276     unsigned short __p = (unsigned short) (m0) * (m1);			\
    277     (ph) = __p >> 8;							\
    278     (pl) = __p;								\
    279   } while (0)
    280 #endif /* AVR */
    281 
    282 #if defined (_CRAY) && W_TYPE_SIZE == 64
    283 #include <intrinsics.h>
    284 #define UDIV_PREINV_ALWAYS  1
    285 #define UDIV_NEEDS_NORMALIZATION 1
    286 long __MPN(count_leading_zeros) (UDItype);
    287 #define count_leading_zeros(count, x) \
    288   ((count) = _leadz ((UWtype) (x)))
    289 #if defined (_CRAYIEEE)		/* I.e., Cray T90/ieee, T3D, and T3E */
    290 #define umul_ppmm(ph, pl, m0, m1) \
    291   do {									\
    292     UDItype __m0 = (m0), __m1 = (m1);					\
    293     (ph) = _int_mult_upper (__m0, __m1);				\
    294     (pl) = __m0 * __m1;							\
    295   } while (0)
    296 #ifndef LONGLONG_STANDALONE
    297 #define udiv_qrnnd(q, r, n1, n0, d) \
    298   do { UWtype __di;							\
    299     __di = __MPN(invert_limb) (d);					\
    300     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
    301   } while (0)
    302 #endif /* LONGLONG_STANDALONE */
    303 #endif /* _CRAYIEEE */
    304 #endif /* _CRAY */
    305 
    306 #if defined (__ia64) && W_TYPE_SIZE == 64
    307 /* This form encourages gcc (pre-release 3.4 at least) to emit predicated
    308    "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency.  The generic
    309    code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
    310    register, which takes an extra cycle.  */
    311 #define sub_ddmmss(sh, sl, ah, al, bh, bl)      \
    312   do {						\
    313     UWtype __x;					\
    314     __x = (al) - (bl);				\
    315     if ((al) < (bl))				\
    316       (sh) = (ah) - (bh) - 1;			\
    317     else					\
    318       (sh) = (ah) - (bh);			\
    319     (sl) = __x;					\
    320   } while (0)
    321 #if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
    322 /* Do both product parts in assembly, since that gives better code with
    323    all gcc versions.  Some callers will just use the upper part, and in
    324    that situation we waste an instruction, but not any cycles.  */
    325 #define umul_ppmm(ph, pl, m0, m1) \
    326     __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0"		\
    327 	     : "=&f" (ph), "=f" (pl)					\
    328 	     : "f" (m0), "f" (m1))
    329 #define count_leading_zeros(count, x) \
    330   do {									\
    331     UWtype _x = (x), _y, _a, _c;					\
    332     __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x));		\
    333     __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y));		\
    334     _c = (_a - 1) << 3;							\
    335     _x >>= _c;								\
    336     if (_x >= 1 << 4)							\
    337       _x >>= 4, _c += 4;						\
    338     if (_x >= 1 << 2)							\
    339       _x >>= 2, _c += 2;						\
    340     _c += _x >> 1;							\
    341     (count) =  W_TYPE_SIZE - 1 - _c;					\
    342   } while (0)
    343 /* similar to what gcc does for __builtin_ffs, but 0 based rather than 1
    344    based, and we don't need a special case for x==0 here */
    345 #define count_trailing_zeros(count, x)					\
    346   do {									\
    347     UWtype __ctz_x = (x);						\
    348     __asm__ ("popcnt %0 = %1"						\
    349 	     : "=r" (count)						\
    350 	     : "r" ((__ctz_x-1) & ~__ctz_x));				\
    351   } while (0)
    352 #endif
    353 #if defined (__INTEL_COMPILER)
    354 #include <ia64intrin.h>
    355 #define umul_ppmm(ph, pl, m0, m1)					\
    356   do {									\
    357     UWtype __m0 = (m0), __m1 = (m1);					\
    358     ph = _m64_xmahu (__m0, __m1, 0);					\
    359     pl = __m0 * __m1;							\
    360   } while (0)
    361 #endif
    362 #ifndef LONGLONG_STANDALONE
    363 #define udiv_qrnnd(q, r, n1, n0, d) \
    364   do { UWtype __di;							\
    365     __di = __MPN(invert_limb) (d);					\
    366     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
    367   } while (0)
    368 #define UDIV_PREINV_ALWAYS  1
    369 #define UDIV_NEEDS_NORMALIZATION 1
    370 #endif
    371 #endif
    372 
    373 
    374 #if defined (__GNUC__)
    375 
    376 /* We sometimes need to clobber "cc" with gcc2, but that would not be
    377    understood by gcc1.  Use cpp to avoid major code duplication.  */
    378 #if __GNUC__ < 2
    379 #define __CLOBBER_CC
    380 #define __AND_CLOBBER_CC
    381 #else /* __GNUC__ >= 2 */
    382 #define __CLOBBER_CC : "cc"
    383 #define __AND_CLOBBER_CC , "cc"
    384 #endif /* __GNUC__ < 2 */
    385 
    386 #if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
    387 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
    388   __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3"				\
    389 	   : "=r" (sh), "=&r" (sl)					\
    390 	   : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl))
    391 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
    392   __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3"				\
    393 	   : "=r" (sh), "=&r" (sl)					\
    394 	   : "r" (ah), "rI" (bh), "r" (al), "rI" (bl))
    395 #define umul_ppmm(xh, xl, m0, m1) \
    396   do {									\
    397     USItype __m0 = (m0), __m1 = (m1);					\
    398     __asm__ ("multiplu %0,%1,%2"					\
    399 	     : "=r" (xl)						\
    400 	     : "r" (__m0), "r" (__m1));					\
    401     __asm__ ("multmu %0,%1,%2"						\
    402 	     : "=r" (xh)						\
    403 	     : "r" (__m0), "r" (__m1));					\
    404   } while (0)
    405 #define udiv_qrnnd(q, r, n1, n0, d) \
    406   __asm__ ("dividu %0,%3,%4"						\
    407 	   : "=r" (q), "=q" (r)						\
    408 	   : "1" (n1), "r" (n0), "r" (d))
    409 #define count_leading_zeros(count, x) \
    410     __asm__ ("clz %0,%1"						\
    411 	     : "=r" (count)						\
    412 	     : "r" (x))
    413 #define COUNT_LEADING_ZEROS_0 32
    414 #endif /* __a29k__ */
    415 
    416 #if defined (__arc__)
    417 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
    418   __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3"			\
    419 	   : "=r" (sh),							\
    420 	     "=&r" (sl)							\
    421 	   : "r"  ((USItype) (ah)),					\
    422 	     "rICal" ((USItype) (bh)),					\
    423 	     "%r" ((USItype) (al)),					\
    424 	     "rICal" ((USItype) (bl)))
    425 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
    426   __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3"			\
    427 	   : "=r" (sh),							\
    428 	     "=&r" (sl)							\
    429 	   : "r" ((USItype) (ah)),					\
    430 	     "rICal" ((USItype) (bh)),					\
    431 	     "r" ((USItype) (al)),					\
    432 	     "rICal" ((USItype) (bl)))
    433 #endif
    434 
    435 #if defined (__arm__) && (defined (__thumb2__) || !defined (__thumb__)) \
    436     && W_TYPE_SIZE == 32
    437 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
    438   do {									\
    439     if (__builtin_constant_p (bl) && -(USItype)(bl) < (USItype)(bl))	\
    440       __asm__ ("subs\t%1, %4, %5\n\tadc\t%0, %2, %3"			\
    441 	   : "=r" (sh), "=&r" (sl)					\
    442 	       : "r" (ah), "rI" (bh),					\
    443 		 "%r" (al), "rI" (-(USItype)(bl)) __CLOBBER_CC);	\
    444     else								\
    445       __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3"			\
    446 	   : "=r" (sh), "=&r" (sl)					\
    447 	   : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC);	\
    448   } while (0)
    449 /* FIXME: Extend the immediate range for the low word by using both ADDS and
    450    SUBS, since they set carry in the same way.  We need separate definitions
    451    for thumb and non-thumb since thumb lacks RSC.  */
    452 #if defined (__thumb__)
    453 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
    454   do {									\
    455     if (__builtin_constant_p (ah) && __builtin_constant_p (bh)		\
    456 	&& (ah) == (bh))						\
    457       __asm__ ("subs\t%1, %2, %3\n\tsbc\t%0, %0, %0"			\
    458 	       : "=r" (sh), "=r" (sl)					\
    459 	       : "r" (al), "rI" (bl) __CLOBBER_CC);			\
    460     else if (__builtin_constant_p (al))					\
    461       __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3"			\
    462 	       : "=r" (sh), "=&r" (sl)					\
    463 	       : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
    464     else if (__builtin_constant_p (bl))					\
    465       __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"			\
    466 	       : "=r" (sh), "=&r" (sl)					\
    467 	       : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
    468     else								\
    469       __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"			\
    470 	       : "=r" (sh), "=&r" (sl)					\
    471 	       : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
    472     } while (0)
    473 #else
    474 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
    475   do {									\
    476     if (__builtin_constant_p (ah) && __builtin_constant_p (bh)		\
    477 	&& (ah) == (bh))						\
    478       __asm__ ("subs\t%1, %2, %3\n\tsbc\t%0, %0, %0"			\
    479 	       : "=r" (sh), "=r" (sl)					\
    480 	       : "r" (al), "rI" (bl) __CLOBBER_CC);			\
    481     else if (__builtin_constant_p (al))					\
    482       {									\
    483 	if (__builtin_constant_p (ah))					\
    484 	  __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"		\
    485 		   : "=r" (sh), "=&r" (sl)				\
    486 		   : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
    487 	else								\
    488 	  __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3"		\
    489 		   : "=r" (sh), "=&r" (sl)				\
    490 		   : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
    491       }									\
    492     else if (__builtin_constant_p (ah))					\
    493       {									\
    494 	if (__builtin_constant_p (bl))					\
    495 	  __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2"		\
    496 		   : "=r" (sh), "=&r" (sl)				\
    497 		   : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
    498 	else								\
    499 	  __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"		\
    500 		   : "=r" (sh), "=&r" (sl)				\
    501 		   : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
    502       }									\
    503     else if (__builtin_constant_p (bl))					\
    504       __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"			\
    505 	       : "=r" (sh), "=&r" (sl)					\
    506 	       : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
    507     else								\
    508       __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"			\
    509 	       : "=r" (sh), "=&r" (sl)					\
    510 	       : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
    511     } while (0)
    512 #endif
    513 #if defined (__ARM_ARCH_2__) || defined (__ARM_ARCH_2A__) \
    514     || defined (__ARM_ARCH_3__)
    515 #define umul_ppmm(xh, xl, a, b)						\
    516   do {									\
    517     register USItype __t0, __t1, __t2;					\
    518     __asm__ ("%@ Inlined umul_ppmm\n"					\
    519 	   "	mov	%2, %5, lsr #16\n"				\
    520 	   "	mov	%0, %6, lsr #16\n"				\
    521 	   "	bic	%3, %5, %2, lsl #16\n"				\
    522 	   "	bic	%4, %6, %0, lsl #16\n"				\
    523 	   "	mul	%1, %3, %4\n"					\
    524 	   "	mul	%4, %2, %4\n"					\
    525 	   "	mul	%3, %0, %3\n"					\
    526 	   "	mul	%0, %2, %0\n"					\
    527 	   "	adds	%3, %4, %3\n"					\
    528 	   "	addcs	%0, %0, #65536\n"				\
    529 	   "	adds	%1, %1, %3, lsl #16\n"				\
    530 	   "	adc	%0, %0, %3, lsr #16"				\
    531 	   : "=&r" ((USItype) (xh)), "=r" ((USItype) (xl)),		\
    532 	     "=&r" (__t0), "=&r" (__t1), "=r" (__t2)			\
    533 	   : "r" ((USItype) (a)), "r" ((USItype) (b)) __CLOBBER_CC);	\
    534   } while (0)
    535 #ifndef LONGLONG_STANDALONE
    536 #define udiv_qrnnd(q, r, n1, n0, d) \
    537   do { UWtype __r;							\
    538     (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));			\
    539     (r) = __r;								\
    540   } while (0)
    541 extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
    542 #endif /* LONGLONG_STANDALONE */
    543 #else /* ARMv4 or newer */
    544 #define umul_ppmm(xh, xl, a, b) \
    545   __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
    546 #define smul_ppmm(xh, xl, a, b) \
    547   __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
    548 #ifndef LONGLONG_STANDALONE
    549 #define udiv_qrnnd(q, r, n1, n0, d) \
    550   do { UWtype __di;							\
    551     __di = __MPN(invert_limb) (d);					\
    552     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
    553   } while (0)
    554 #define UDIV_PREINV_ALWAYS  1
    555 #define UDIV_NEEDS_NORMALIZATION 1
    556 #endif /* LONGLONG_STANDALONE */
    557 #endif /* defined(__ARM_ARCH_2__) ... */
    558 #define count_leading_zeros(count, x)  count_leading_zeros_gcc_clz(count, x)
    559 #define count_trailing_zeros(count, x)  count_trailing_zeros_gcc_ctz(count, x)
    560 #endif /* __arm__ */
    561 
    562 #if defined (__aarch64__) && W_TYPE_SIZE == 64
    563 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
    564   do {									\
    565     if (__builtin_constant_p (bl) && ~(UDItype)(bl) <= (UDItype)(bl))	\
    566       __asm__ ("subs\t%1, %x4, %5\n\tadc\t%0, %x2, %x3"			\
    567 	       : "=r" (sh), "=&r" (sl)					\
    568 	       : "rZ" ((UDItype)(ah)), "rZ" ((UDItype)(bh)),		\
    569 		 "%r" ((UDItype)(al)), "rI" (-(UDItype)(bl)) __CLOBBER_CC);\
    570     else								\
    571       __asm__ ("adds\t%1, %x4, %5\n\tadc\t%0, %x2, %x3"			\
    572 	       : "=r" (sh), "=&r" (sl)					\
    573 	       : "rZ" ((UDItype)(ah)), "rZ" ((UDItype)(bh)),		\
    574 		 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) __CLOBBER_CC);\
    575   } while (0)
    576 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
    577   do {									\
    578     if (__builtin_constant_p (bl) && ~(UDItype)(bl) <= (UDItype)(bl))	\
    579       __asm__ ("adds\t%1, %x4, %5\n\tsbc\t%0, %x2, %x3"			\
    580 	       : "=r,r" (sh), "=&r,&r" (sl)				\
    581 	       : "rZ,rZ" ((UDItype)(ah)), "rZ,rZ" ((UDItype)(bh)),	\
    582 		 "r,Z"   ((UDItype)(al)), "rI,r" (-(UDItype)(bl)) __CLOBBER_CC);\
    583     else								\
    584       __asm__ ("subs\t%1, %x4, %5\n\tsbc\t%0, %x2, %x3"			\
    585 	       : "=r,r" (sh), "=&r,&r" (sl)				\
    586 	       : "rZ,rZ" ((UDItype)(ah)), "rZ,rZ" ((UDItype)(bh)),	\
    587 		 "r,Z"   ((UDItype)(al)), "rI,r"  ((UDItype)(bl)) __CLOBBER_CC);\
    588   } while(0);
    589 #if __GMP_GNUC_PREREQ (4,9)
    590 #define umul_ppmm(w1, w0, u, v) \
    591   do {									\
    592     typedef unsigned int __ll_UTItype __attribute__((mode(TI)));	\
    593     __ll_UTItype __ll = (__ll_UTItype)(u) * (v);			\
    594     w1 = __ll >> 64;							\
    595     w0 = __ll;								\
    596   } while (0)
    597 #endif
    598 #if !defined (umul_ppmm)
    599 #define umul_ppmm(ph, pl, m0, m1) \
    600   do {									\
    601     UDItype __m0 = (m0), __m1 = (m1);					\
    602     __asm__ ("umulh\t%0, %1, %2" : "=r" (ph) : "r" (__m0), "r" (__m1));	\
    603     (pl) = __m0 * __m1;							\
    604   } while (0)
    605 #endif
    606 #define count_leading_zeros(count, x)  count_leading_zeros_gcc_clz(count, x)
    607 #define count_trailing_zeros(count, x)  count_trailing_zeros_gcc_ctz(count, x)
    608 #endif /* __aarch64__ */
    609 
    610 #if defined (__clipper__) && W_TYPE_SIZE == 32
    611 #define umul_ppmm(w1, w0, u, v) \
    612   ({union {UDItype __ll;						\
    613 	   struct {USItype __l, __h;} __i;				\
    614 	  } __x;							\
    615   __asm__ ("mulwux %2,%0"						\
    616 	   : "=r" (__x.__ll)						\
    617 	   : "%0" ((USItype)(u)), "r" ((USItype)(v)));			\
    618   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
    619 #define smul_ppmm(w1, w0, u, v) \
    620   ({union {DItype __ll;							\
    621 	   struct {SItype __l, __h;} __i;				\
    622 	  } __x;							\
    623   __asm__ ("mulwx %2,%0"						\
    624 	   : "=r" (__x.__ll)						\
    625 	   : "%0" ((SItype)(u)), "r" ((SItype)(v)));			\
    626   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
    627 #define __umulsidi3(u, v) \
    628   ({UDItype __w;							\
    629     __asm__ ("mulwux %2,%0"						\
    630 	     : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v)));	\
    631     __w; })
    632 #endif /* __clipper__ */
    633 
    634 /* Fujitsu vector computers.  */
    635 #if defined (__uxp__) && W_TYPE_SIZE == 32
    636 #define umul_ppmm(ph, pl, u, v) \
    637   do {									\
    638     union {UDItype __ll;						\
    639 	   struct {USItype __h, __l;} __i;				\
    640 	  } __x;							\
    641     __asm__ ("mult.lu %1,%2,%0"	: "=r" (__x.__ll) : "%r" (u), "rK" (v));\
    642     (ph) = __x.__i.__h;							\
    643     (pl) = __x.__i.__l;							\
    644   } while (0)
    645 #define smul_ppmm(ph, pl, u, v) \
    646   do {									\
    647     union {UDItype __ll;						\
    648 	   struct {USItype __h, __l;} __i;				\
    649 	  } __x;							\
    650     __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));	\
    651     (ph) = __x.__i.__h;							\
    652     (pl) = __x.__i.__l;							\
    653   } while (0)
    654 #endif
    655 
    656 #if defined (__gmicro__) && W_TYPE_SIZE == 32
    657 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
    658   __asm__ ("add.w %5,%1\n\taddx %3,%0"					\
    659 	   : "=g" (sh), "=&g" (sl)					\
    660 	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
    661 	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
    662 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
    663   __asm__ ("sub.w %5,%1\n\tsubx %3,%0"					\
    664 	   : "=g" (sh), "=&g" (sl)					\
    665 	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
    666 	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
    667 #define umul_ppmm(ph, pl, m0, m1) \
    668   __asm__ ("mulx %3,%0,%1"						\
    669 	   : "=g" (ph), "=r" (pl)					\
    670 	   : "%0" ((USItype)(m0)), "g" ((USItype)(m1)))
    671 #define udiv_qrnnd(q, r, nh, nl, d) \
    672   __asm__ ("divx %4,%0,%1"						\
    673 	   : "=g" (q), "=r" (r)						\
    674 	   : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d)))
    675 #define count_leading_zeros(count, x) \
    676   __asm__ ("bsch/1 %1,%0"						\
    677 	   : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0))
    678 #endif
    679 
/* HP PA-RISC, 32-bit limbs.  */
#if defined (__hppa) && W_TYPE_SIZE == 32
/* (sh,sl) = (ah,al) + (bh,bl): "add" generates the carry, "addc" adds it in.
   %I5 selects an immediate-form opcode when bl is a constant; %rN prints
   register zero for a constant-0 operand (the "M" constraint).  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0"			\
	   : "=r" (sh), "=&r" (sl)					\
	   : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
/* (sh,sl) = (ah,al) - (bh,bl): "sub" generates the borrow, "subb" uses it.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0"			\
	   : "=r" (sh), "=&r" (sl)					\
	   : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
#if defined (_PA_RISC1_1)
/* PA1.1 has xmpyu, a 32x32->64 multiply in the FPU; the product comes back
   through a float register, hence the union to split high/low words.  */
#define umul_ppmm(wh, wl, u, v) \
  do {									\
    union {UDItype __ll;						\
	   struct {USItype __h, __l;} __i;				\
	  } __x;							\
    __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v));	\
    (wh) = __x.__i.__h;							\
    (wl) = __x.__i.__l;							\
  } while (0)
#endif
/* Branch-free count-leading-zeros: binary search with extru (extract) and
   conditional-nullify; each stage tests a bit group, shifts it down if
   nonzero or adds its width to the count if zero.  */
#define count_leading_zeros(count, x) \
  do {									\
    USItype __tmp;							\
    __asm__ (								\
       "ldi		1,%0\n"						\
"	extru,=		%1,15,16,%%r0	; Bits 31..16 zero?\n"		\
"	extru,tr	%1,15,16,%1	; No.  Shift down, skip add.\n"	\
"	ldo		16(%0),%0	; Yes.  Perform add.\n"		\
"	extru,=		%1,23,8,%%r0	; Bits 15..8 zero?\n"		\
"	extru,tr	%1,23,8,%1	; No.  Shift down, skip add.\n"	\
"	ldo		8(%0),%0	; Yes.  Perform add.\n"		\
"	extru,=		%1,27,4,%%r0	; Bits 7..4 zero?\n"		\
"	extru,tr	%1,27,4,%1	; No.  Shift down, skip add.\n"	\
"	ldo		4(%0),%0	; Yes.  Perform add.\n"		\
"	extru,=		%1,29,2,%%r0	; Bits 3..2 zero?\n"		\
"	extru,tr	%1,29,2,%1	; No.  Shift down, skip add.\n"	\
"	ldo		2(%0),%0	; Yes.  Perform add.\n"		\
"	extru		%1,30,1,%1	; Extract bit 1.\n"		\
"	sub		%0,%1,%0	; Subtract it.\n"		\
	: "=r" (count), "=r" (__tmp) : "1" (x));			\
  } while (0)
#endif /* hppa */
    722 
/* These macros are for ABI=2.0w.  In ABI=2.0n they can't be used, since GCC
   (3.2) puts longlong into two adjacent 32-bit registers.  Presumably this
   is just a case of no direct support for 2.0n but treating it like 1.0. */
#if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB)
/* 64-bit (sh,sl) = (ah,al) + (bh,bl) using PA2.0 "add,dc" (add with carry).  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0"			\
	   : "=r" (sh), "=&r" (sl)					\
	   : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
/* 64-bit (sh,sl) = (ah,al) - (bh,bl) using "sub,db" (subtract with borrow).  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0"			\
	   : "=r" (sh), "=&r" (sl)					\
	   : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
#endif /* hppa */
    736 
/* IBM 370 / S/390 / MVS, 32-bit limbs.  */
#if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
#if defined (__zarch__) || defined (HAVE_HOST_CPU_s390_zarch)
/* (sh,sl) = (ah,al) + (bh,bl): "alr" (add logical) sets the carry flag,
   "alcr" (add logical with carry) folds it into the high word.  The
   commented-out branch is a disabled "alfi" immediate variant for constant
   bl, kept for reference.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl)				\
  do {									\
/*  if (__builtin_constant_p (bl))					\
      __asm__ ("alfi\t%1,%o5\n\talcr\t%0,%3"				\
	       : "=r" (sh), "=&r" (sl)					\
	       : "0"  (ah), "r" (bh), "%1" (al), "n" (bl) __CLOBBER_CC);\
    else								\
*/    __asm__ ("alr\t%1,%5\n\talcr\t%0,%3"				\
	       : "=r" (sh), "=&r" (sl)					\
	       : "0"  (ah), "r" (bh), "%1" (al), "r" (bl)__CLOBBER_CC);	\
  } while (0)
/* (sh,sl) = (ah,al) - (bh,bl): "slr" sets the borrow, "slbr" (subtract
   logical with borrow) applies it.  Disabled "slfi" immediate variant kept
   for reference, as above.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl)				\
  do {									\
/*  if (__builtin_constant_p (bl))					\
      __asm__ ("slfi\t%1,%o5\n\tslbr\t%0,%3"				\
	       : "=r" (sh), "=&r" (sl)					\
	       : "0" (ah), "r" (bh), "1" (al), "n" (bl) __CLOBBER_CC);	\
    else								\
*/    __asm__ ("slr\t%1,%5\n\tslbr\t%0,%3"				\
	       : "=r" (sh), "=&r" (sl)					\
	       : "0" (ah), "r" (bh), "1" (al), "r" (bl) __CLOBBER_CC);	\
  } while (0)
/* umul_ppmm: prefer plain C on gcc >= 4.5 (gcc emits the right multiply);
   otherwise fall back to explicit "mlr", which needs an even-odd register
   pair (here hard-wired to r0/r1).  */
#if __GMP_GNUC_PREREQ (4,5)
#define umul_ppmm(xh, xl, m0, m1)					\
  do {									\
    union {UDItype __ll;						\
	   struct {USItype __h, __l;} __i;				\
	  } __x;							\
    __x.__ll = (UDItype) (m0) * (UDItype) (m1);				\
    (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
  } while (0)
#else
#if 0
/* FIXME: this fails if gcc knows about the 64-bit registers.  Use only
   with a new enough processor pretending we have 32-bit registers.  */
#define umul_ppmm(xh, xl, m0, m1)					\
  do {									\
    union {UDItype __ll;						\
	   struct {USItype __h, __l;} __i;				\
	  } __x;							\
    __asm__ ("mlr\t%0,%2"						\
	     : "=r" (__x.__ll)						\
	     : "%0" (m0), "r" (m1));					\
    (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
  } while (0)
#else
#define umul_ppmm(xh, xl, m0, m1)					\
  do {									\
  /* When we have 64-bit regs and gcc is aware of that, we cannot simply use
     DImode for the product, since that would be allocated to a single 64-bit
     register, whereas mlr uses the low 32-bits of an even-odd register pair.
  */									\
    register USItype __r0 __asm__ ("0");				\
    register USItype __r1 __asm__ ("1") = (m0);				\
    __asm__ ("mlr\t%0,%3"						\
	     : "=r" (__r0), "=r" (__r1)					\
	     : "r" (__r1), "r" (m1));					\
    (xh) = __r0; (xl) = __r1;						\
  } while (0)
#endif /* if 0 */
#endif
/* udiv_qrnnd via "dlr" (divide logical), which likewise operates on an
   even-odd register pair; the active variant pins r0/r1 explicitly.  */
#if 0
/* FIXME: this fails if gcc knows about the 64-bit registers.  Use only
   with a new enough processor pretending we have 32-bit registers.  */
#define udiv_qrnnd(q, r, n1, n0, d)					\
  do {									\
    union {UDItype __ll;						\
	   struct {USItype __h, __l;} __i;				\
	  } __x;							\
    __x.__i.__h = n1; __x.__i.__l = n0;					\
    __asm__ ("dlr\t%0,%2"						\
	     : "=r" (__x.__ll)						\
	     : "0" (__x.__ll), "r" (d));				\
    (q) = __x.__i.__l; (r) = __x.__i.__h;				\
  } while (0)
#else
#define udiv_qrnnd(q, r, n1, n0, d)					\
  do {									\
    register USItype __r0 __asm__ ("0") = (n1);				\
    register USItype __r1 __asm__ ("1") = (n0);				\
    __asm__ ("dlr\t%0,%4"						\
	     : "=r" (__r0), "=r" (__r1)					\
	     : "r" (__r0), "r" (__r1), "r" (d));			\
    (q) = __r1; (r) = __r0;						\
  } while (0)
#endif /* if 0 */
#else /* if __zarch__ */
/* Pre-z/Architecture (ESA mode): only the signed "mr"/"dr" pair is
   available, so provide smul_ppmm/sdiv_qrnnd instead of the unsigned ones.  */
/* FIXME: this fails if gcc knows about the 64-bit registers.  */
#define smul_ppmm(xh, xl, m0, m1)					\
  do {									\
    union {DItype __ll;							\
	   struct {USItype __h, __l;} __i;				\
	  } __x;							\
    __asm__ ("mr\t%0,%2"						\
	     : "=r" (__x.__ll)						\
	     : "%0" (m0), "r" (m1));					\
    (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
  } while (0)
/* FIXME: this fails if gcc knows about the 64-bit registers.  */
#define sdiv_qrnnd(q, r, n1, n0, d)					\
  do {									\
    union {DItype __ll;							\
	   struct {USItype __h, __l;} __i;				\
	  } __x;							\
    __x.__i.__h = n1; __x.__i.__l = n0;					\
    __asm__ ("dr\t%0,%2"						\
	     : "=r" (__x.__ll)						\
	     : "0" (__x.__ll), "r" (d));				\
    (q) = __x.__i.__l; (r) = __x.__i.__h;				\
  } while (0)
#endif /* if __zarch__ */
#endif
    851 
/* 64-bit z/Architecture.  */
#if defined (__s390x__) && W_TYPE_SIZE == 64
/* We need to cast operands with register constraints, otherwise their types
   will be assumed to be SImode by gcc.  For these machines, such operations
   will insert a value into the low 32 bits, and leave the high 32 bits with
   garbage.  */
/* (sh,sl) = (ah,al) + (bh,bl): "algr" sets carry, "alcgr" consumes it.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl)				\
  do {									\
    __asm__ ("algr\t%1,%5\n\talcgr\t%0,%3"				\
	       : "=r" (sh), "=&r" (sl)					\
	       : "0"  ((UDItype)(ah)), "r" ((UDItype)(bh)),		\
		 "%1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \
  } while (0)
/* (sh,sl) = (ah,al) - (bh,bl): "slgr" sets borrow, "slbgr" consumes it.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl)				\
  do {									\
    __asm__ ("slgr\t%1,%5\n\tslbgr\t%0,%3"				\
	     : "=r" (sh), "=&r" (sl)					\
	     : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)),		\
	       "1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC);	\
  } while (0)
/* 64x64->128 multiply via "mlgr"; the TImode union splits the product.  */
#define umul_ppmm(xh, xl, m0, m1)					\
  do {									\
    union {unsigned int __attribute__ ((mode(TI))) __ll;		\
	   struct {UDItype __h, __l;} __i;				\
	  } __x;							\
    __asm__ ("mlgr\t%0,%2"						\
	     : "=r" (__x.__ll)						\
	     : "%0" ((UDItype)(m0)), "r" ((UDItype)(m1)));		\
    (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
  } while (0)
/* 128/64 divide via "dlgr": quotient in the low half, remainder in the
   high half of the register pair.  */
#define udiv_qrnnd(q, r, n1, n0, d)					\
  do {									\
    union {unsigned int __attribute__ ((mode(TI))) __ll;		\
	   struct {UDItype __h, __l;} __i;				\
	  } __x;							\
    __x.__i.__h = n1; __x.__i.__l = n0;					\
    __asm__ ("dlgr\t%0,%2"						\
	     : "=r" (__x.__ll)						\
	     : "0" (__x.__ll), "r" ((UDItype)(d)));			\
    (q) = __x.__i.__l; (r) = __x.__i.__h;				\
  } while (0)
#if 0 /* FIXME: Enable for z10 (?) */
/* Disabled "flogr" (find leftmost one) count_leading_zeros.  */
#define count_leading_zeros(cnt, x)					\
  do {									\
    union {unsigned int __attribute__ ((mode(TI))) __ll;		\
	   struct {UDItype __h, __l;} __i;				\
	  } __clr_cnt;							\
    __asm__ ("flogr\t%0,%1"						\
	     : "=r" (__clr_cnt.__ll)					\
	     : "r" (x) __CLOBBER_CC);					\
    (cnt) = __clr_cnt.__i.__h;						\
  } while (0)
#endif
#endif
    905 
/* On x86 and x86_64, every asm implicitly clobbers "flags" and "fpsr",
   so we don't need __CLOBBER_CC.  */
#if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
/* (sh,sl) = (ah,al) + (bh,bl): addl/adcl carry chain.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addl %5,%k1\n\tadcl %3,%k0"					\
	   : "=r" (sh), "=&r" (sl)					\
	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
/* (sh,sl) = (ah,al) - (bh,bl): subl/sbbl borrow chain.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subl %5,%k1\n\tsbbl %3,%k0"					\
	   : "=r" (sh), "=&r" (sl)					\
	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
/* 32x32->64 multiply: one-operand mull puts the product in edx:eax.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mull %3"							\
	   : "=a" (w0), "=d" (w1)					\
	   : "%0" ((USItype)(u)), "rm" ((USItype)(v)))
/* 64/32 divide: divl takes the dividend in edx:eax, returns q in eax,
   r in edx.  Caller must ensure n1 < dx or the divide faults.  */
#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
  __asm__ ("divl %4"		     /* stringification in K&R C */	\
	   : "=a" (q), "=d" (r)						\
	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx)))
    927 
#if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx
/* Pentium bsrl takes between 10 and 72 cycles depending where the most
   significant 1 bit is, hence the use of the following alternatives.  bsfl
   is slow too, between 18 and 42 depending where the least significant 1
   bit is, so let the generic count_trailing_zeros below make use of the
   count_leading_zeros here too.  */

#if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE)
/* The following should be a fixed 14 or 15 cycles, but possibly plus an L1
   cache miss reading from __clz_tab.  For P55 it's favoured over the float
   below so as to avoid mixing MMX and x87, since the penalty for switching
   between the two is about 100 cycles.

   The asm block sets __shift to -3 if the high 24 bits are clear, -2 for
   16, -1 for 8, or 0 otherwise.  This could be written equivalently as
   follows, but as of gcc 2.95.2 it results in conditional jumps.

       __shift = -(__n < 0x1000000);
       __shift -= (__n < 0x10000);
       __shift -= (__n < 0x100);

   The middle two sbbl and cmpl's pair, and with luck something gcc
   generates might pair with the first cmpl and the last sbbl.  The "32+1"
   constant could be folded into __clz_tab[], but it doesn't seem worth
   making a different table just for that.  */

#define count_leading_zeros(c,n)					\
  do {									\
    USItype  __n = (n);							\
    USItype  __shift;							\
    __asm__ ("cmpl  $0x1000000, %1\n"					\
	     "sbbl  %0, %0\n"						\
	     "cmpl  $0x10000, %1\n"					\
	     "sbbl  $0, %0\n"						\
	     "cmpl  $0x100, %1\n"					\
	     "sbbl  $0, %0\n"						\
	     : "=&r" (__shift) : "r"  (__n));				\
    __shift = __shift*8 + 24 + 1;					\
    (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift];			\
  } while (0)
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#define COUNT_LEADING_ZEROS_0   31   /* n==0 indistinguishable from n==1 */

#else /* ! pentiummmx || LONGLONG_STANDALONE */
/* The following should be a fixed 14 cycles or so.  Some scheduling
   opportunities should be available between the float load/store too.  This
   sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is
   apparently suggested by the Intel optimizing manual (don't know exactly
   where).  gcc 2.95 or up will be best for this, so the "double" is
   correctly aligned on the stack.  */
/* clz via the int->double conversion: the biased exponent in the high
   word of the IEEE double (bits 30..20) reveals the MSB position.  */
#define count_leading_zeros(c,n)					\
  do {									\
    union {								\
      double    d;							\
      unsigned  a[2];							\
    } __u;								\
    __u.d = (UWtype) (n);						\
    (c) = 0x3FF + 31 - (__u.a[1] >> 20);				\
  } while (0)
#define COUNT_LEADING_ZEROS_0   (0x3FF + 31)
#endif /* pentiummx */
#else /* ! pentium */

/* On gcc >= 3.4 use __builtin_clz via the generic wrapper macro.  */
#if __GMP_GNUC_PREREQ (3,4)  /* using bsrl */
#define count_leading_zeros(count,x)  count_leading_zeros_gcc_clz(count,x)
#endif /* gcc clz */

/* On P6, gcc prior to 3.0 generates a partial register stall for
   __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former
   being 1 code byte smaller.  "31-__cbtmp" is a workaround, probably at the
   cost of one extra instruction.  Do this for "i386" too, since that means
   generic x86.  */
#if ! defined (count_leading_zeros) && __GNUC__ < 3			\
  && (HAVE_HOST_CPU_i386						\
      || HAVE_HOST_CPU_i686						\
      || HAVE_HOST_CPU_pentiumpro					\
      || HAVE_HOST_CPU_pentium2						\
      || HAVE_HOST_CPU_pentium3)
/* bsrl yields the MSB index; 31 - index = leading zeros.  x must be
   nonzero (bsrl leaves the destination undefined for 0).  */
#define count_leading_zeros(count, x)					\
  do {									\
    USItype __cbtmp;							\
    ASSERT ((x) != 0);							\
    __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));	\
    (count) = 31 - __cbtmp;						\
  } while (0)
#endif /* gcc<3 asm bsrl */

/* Generic x86 fallback: same bsrl, using xor-with-31 instead.  */
#ifndef count_leading_zeros
#define count_leading_zeros(count, x)					\
  do {									\
    USItype __cbtmp;							\
    ASSERT ((x) != 0);							\
    __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));	\
    (count) = __cbtmp ^ 31;						\
  } while (0)
#endif /* asm bsrl */

#if __GMP_GNUC_PREREQ (3,4)  /* using bsfl */
#define count_trailing_zeros(count,x)  count_trailing_zeros_gcc_ctz(count,x)
#endif /* gcc ctz */

/* bsfl yields the LSB index directly; x must be nonzero.  */
#ifndef count_trailing_zeros
#define count_trailing_zeros(count, x)					\
  do {									\
    ASSERT ((x) != 0);							\
    __asm__ ("bsfl %1,%k0" : "=r" (count) : "rm" ((USItype)(x)));	\
  } while (0)
#endif /* asm bsfl */

#endif /* ! pentium */

#endif /* 80x86 */
   1041 
/* x86-64, 64-bit limbs.  */
#if defined (__amd64__) && W_TYPE_SIZE == 64
/* (sh,sl) = (ah,al) + (bh,bl): addq/adcq carry chain.  "rme" also allows
   32-bit sign-extended immediates.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addq %5,%q1\n\tadcq %3,%q0"					\
	   : "=r" (sh), "=&r" (sl)					\
	   : "0"  ((UDItype)(ah)), "rme" ((UDItype)(bh)),		\
	     "%1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
/* (sh,sl) = (ah,al) - (bh,bl): subq/sbbq borrow chain.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subq %5,%q1\n\tsbbq %3,%q0"					\
	   : "=r" (sh), "=&r" (sl)					\
	   : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)),		\
	     "1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
/* 64x64->128 multiply: mulx (BMI2, flag-free, free register choice) on
   CPUs known to have it, else classic one-operand mulq in rdx:rax.  */
#if X86_ASM_MULX \
   && (HAVE_HOST_CPU_haswell || HAVE_HOST_CPU_broadwell \
       || HAVE_HOST_CPU_skylake || HAVE_HOST_CPU_bd4 || HAVE_HOST_CPU_zen)
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mulx\t%3, %q0, %q1"						\
	   : "=r" (w0), "=r" (w1)					\
	   : "%d" ((UDItype)(u)), "rm" ((UDItype)(v)))
#else
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mulq\t%3"							\
	   : "=a" (w0), "=d" (w1)					\
	   : "%0" ((UDItype)(u)), "rm" ((UDItype)(v)))
#endif
/* 128/64 divide: divq with dividend in rdx:rax; q in rax, r in rdx.
   Caller must ensure n1 < dx or the divide faults.  */
#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
  __asm__ ("divq %4"		     /* stringification in K&R C */	\
	   : "=a" (q), "=d" (r)						\
	   : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx)))

#if HAVE_HOST_CPU_haswell || HAVE_HOST_CPU_broadwell || HAVE_HOST_CPU_skylake \
  || HAVE_HOST_CPU_k10 || HAVE_HOST_CPU_bd1 || HAVE_HOST_CPU_bd2	\
  || HAVE_HOST_CPU_bd3 || HAVE_HOST_CPU_bd4 || HAVE_HOST_CPU_zen	\
  || HAVE_HOST_CPU_bobcat || HAVE_HOST_CPU_jaguar
#define count_leading_zeros(count, x)					\
  do {									\
    /* This is lzcnt, spelled for older assemblers.  Destination and */	\
    /* source must be a 64-bit registers, hence cast and %q.         */	\
    __asm__ ("rep;bsr\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x)));	\
  } while (0)
#define COUNT_LEADING_ZEROS_0 64
#else
/* bsr fallback: MSB index xor 63 = leading zeros; x must be nonzero.  */
#define count_leading_zeros(count, x)					\
  do {									\
    UDItype __cbtmp;							\
    ASSERT ((x) != 0);							\
    __asm__ ("bsr\t%1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x)));	\
    (count) = __cbtmp ^ 63;						\
  } while (0)
#endif

#if HAVE_HOST_CPU_bd2 || HAVE_HOST_CPU_bd3 || HAVE_HOST_CPU_bd4 \
  || HAVE_HOST_CPU_zen || HAVE_HOST_CPU_jaguar
#define count_trailing_zeros(count, x)					\
  do {									\
    /* This is tzcnt, spelled for older assemblers.  Destination and */	\
    /* source must be a 64-bit registers, hence cast and %q.         */	\
    __asm__ ("rep;bsf\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x)));	\
  } while (0)
#define COUNT_TRAILING_ZEROS_0 64
#else
/* bsf fallback; x must be nonzero.  */
#define count_trailing_zeros(count, x)					\
  do {									\
    ASSERT ((x) != 0);							\
    __asm__ ("bsf\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x)));	\
  } while (0)
#endif
#endif /* __amd64__ */
   1109 
#if defined (__i860__) && W_TYPE_SIZE == 32
/* r = low word of ((h,l) >> c): shr loads the shift count into SC, shrd
   does the double-word shift.
   Fix: the output section was missing its leading ':', so "=r" was being
   string-concatenated into the asm template, making the macro a syntax
   error whenever expanded.  */
#define rshift_rhlc(r,h,l,c) \
  __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0"				\
	   : "=r" (r) : "r" (h), "r" (l), "rn" (c))
#endif /* i860 */
   1115 
/* Intel i960, 32-bit limbs.  */
#if defined (__i960__) && W_TYPE_SIZE == 32
/* (sh,sl) = (ah,al) + (bh,bl): "cmpo 1,0" clears the condition-code carry
   bit first, then two addc's chain the carry.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0"			\
	   : "=r" (sh), "=&r" (sl)					\
	   : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl))
/* (sh,sl) = (ah,al) - (bh,bl): "cmpo 0,0" presets the borrow, then subc.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0"			\
	   : "=r" (sh), "=&r" (sl)					\
	   : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl))
/* 32x32->64 multiply via "emul" into a register pair; note __l is the
   FIRST union member here (little-endian pair layout on i960).  */
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll;						\
	   struct {USItype __l, __h;} __i;				\
	  } __x;							\
  __asm__ ("emul %2,%1,%0"						\
	   : "=d" (__x.__ll) : "%dI" (u), "dI" (v));			\
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
/* Full 32x32->64 multiply returning the UDItype product.  */
#define __umulsidi3(u, v) \
  ({UDItype __w;							\
    __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v));	\
    __w; })
/* 64/32 divide via "ediv": q = (nh,nl) / d, r = (nh,nl) % d.
   Fix: the result union __rq was used but never declared (only __nn was),
   so the macro could not compile if expanded; declare both.
   NOTE(review): the template "ediv %d,%n,%0" looks suspect ('%d'/'%n' are
   not operand references for the three operands given) — left untouched,
   as the correct i960 spelling cannot be confirmed from this file.  */
#define udiv_qrnnd(q, r, nh, nl, d) \
  do {									\
    union {UDItype __ll;						\
	   struct {USItype __l, __h;} __i;				\
	  } __nn, __rq;							\
    __nn.__i.__h = (nh); __nn.__i.__l = (nl);				\
    __asm__ ("ediv %d,%n,%0"						\
	   : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d));		\
    (r) = __rq.__i.__l; (q) = __rq.__i.__h;				\
  } while (0)
/* clz via "scanbit" (finds the MSB index); index ^ 31 = leading zeros.  */
#define count_leading_zeros(count, x) \
  do {									\
    USItype __cbtmp;							\
    __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x));		\
    (count) = __cbtmp ^ 31;						\
  } while (0)
#define COUNT_LEADING_ZEROS_0 (-32) /* sic */
#if defined (__i960mx)		/* what is the proper symbol to test??? */
/* r = low word of ((h,l) >> c) via "shre" (extended shift right).
   NOTE(review): no do/while(0) trailing "while" here — body ends with a
   bare brace, unlike the file's usual idiom.  */
#define rshift_rhlc(r,h,l,c) \
  do {									\
    union {UDItype __ll;						\
	   struct {USItype __l, __h;} __i;				\
	  } __nn;							\
    __nn.__i.__h = (h); __nn.__i.__l = (l);				\
    __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c));	\
  }
#endif /* i960mx */
#endif /* i960 */
   1164 
/* Motorola 68k family (including ColdFire 52xx/53xx), 32-bit limbs.  */
#if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \
     || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \
     || defined (__mc5307__)) && W_TYPE_SIZE == 32
/* (sh,sl) = (ah,al) + (bh,bl): add.l sets X (extend), addx.l adds it in.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0"				\
	   : "=d" (sh), "=&d" (sl)					\
	   : "0"  ((USItype)(ah)), "d" ((USItype)(bh)),			\
	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
/* (sh,sl) = (ah,al) - (bh,bl): sub.l sets X, subx.l subtracts it.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0"				\
	   : "=d" (sh), "=&d" (sl)					\
	   : "0" ((USItype)(ah)), "d" ((USItype)(bh)),			\
	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
/* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r.  */
#if defined (__mc68020__) || defined(mc68020) \
     || defined (__mc68030__) || defined (mc68030) \
     || defined (__mc68040__) || defined (mc68040) \
     || defined (__mcpu32__) || defined (mcpu32) \
     || defined (__NeXT__)
/* 32x32->64 unsigned multiply: mulu.l puts high:low in a register pair.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mulu%.l %3,%1:%0"						\
	   : "=d" (w0), "=d" (w1)					\
	   : "%0" ((USItype)(u)), "dmi" ((USItype)(v)))
/* 64/32 unsigned divide: divu.l, quotient and remainder in the pair.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("divu%.l %4,%1:%0"						\
	   : "=d" (q), "=d" (r)						\
	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
/* 64/32 signed divide: divs.l.  */
#define sdiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("divs%.l %4,%1:%0"						\
	   : "=d" (q), "=d" (r)						\
	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
#else /* for other 68k family members use 16x16->32 multiplication */
/* Synthesize the 32x32->64 product from four mulu.w 16x16->32 multiplies,
   with swap to access half-words and an explicit carry fixup (jcc/add).  */
#define umul_ppmm(xh, xl, a, b) \
  do { USItype __umul_tmp1, __umul_tmp2;				\
	__asm__ ("| Inlined umul_ppmm\n"				\
"	move%.l	%5,%3\n"						\
"	move%.l	%2,%0\n"						\
"	move%.w	%3,%1\n"						\
"	swap	%3\n"							\
"	swap	%0\n"							\
"	mulu%.w	%2,%1\n"						\
"	mulu%.w	%3,%0\n"						\
"	mulu%.w	%2,%3\n"						\
"	swap	%2\n"							\
"	mulu%.w	%5,%2\n"						\
"	add%.l	%3,%2\n"						\
"	jcc	1f\n"							\
"	add%.l	%#0x10000,%0\n"						\
"1:	move%.l	%2,%3\n"						\
"	clr%.w	%2\n"							\
"	swap	%2\n"							\
"	swap	%3\n"							\
"	clr%.w	%3\n"							\
"	add%.l	%3,%1\n"						\
"	addx%.l	%2,%0\n"						\
"	| End inlined umul_ppmm"					\
	      : "=&d" (xh), "=&d" (xl),					\
		"=d" (__umul_tmp1), "=&d" (__umul_tmp2)			\
	      : "%2" ((USItype)(a)), "d" ((USItype)(b)));		\
  } while (0)
#endif /* not mc68020 */
/* The '020, '030, '040 and '060 have bitfield insns.
   GCC 3.4 defines __mc68020__ when in CPU32 mode, check for __mcpu32__ to
   exclude bfffo on that chip (bitfield insns not available).  */
#if (defined (__mc68020__) || defined (mc68020)    \
     || defined (__mc68030__) || defined (mc68030) \
     || defined (__mc68040__) || defined (mc68040) \
     || defined (__mc68060__) || defined (mc68060) \
     || defined (__NeXT__))			   \
  && ! defined (__mcpu32__)
/* clz via bfffo (find first one in a bitfield).  */
#define count_leading_zeros(count, x) \
  __asm__ ("bfffo %1{%b2:%b2},%0"					\
	   : "=d" (count)						\
	   : "od" ((USItype) (x)), "n" (0))
#define COUNT_LEADING_ZEROS_0 32
#endif
#endif /* mc68000 */
   1242 
/* Motorola 88k, 32-bit limbs.  */
#if defined (__m88000__) && W_TYPE_SIZE == 32
/* (sh,sl) = (ah,al) + (bh,bl): addu.co sets carry-out, addu.ci adds
   carry-in.  "rJ" allows register or constant zero.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3"			\
	   : "=r" (sh), "=&r" (sl)					\
	   : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl))
/* (sh,sl) = (ah,al) - (bh,bl): subu.co / subu.ci borrow chain.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3"			\
	   : "=r" (sh), "=&r" (sl)					\
	   : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl))
/* clz via ff1 (find first one): MSB index xor 31.  */
#define count_leading_zeros(count, x) \
  do {									\
    USItype __cbtmp;							\
    __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x));			\
    (count) = __cbtmp ^ 31;						\
  } while (0)
#define COUNT_LEADING_ZEROS_0 63 /* sic */
#if defined (__m88110__)
/* 88110 has a true 64-bit multiply: mulu.d into a register pair.  */
#define umul_ppmm(wh, wl, u, v) \
  do {									\
    union {UDItype __ll;						\
	   struct {USItype __h, __l;} __i;				\
	  } __x;							\
    __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v));	\
    (wh) = __x.__i.__h;							\
    (wl) = __x.__i.__l;							\
  } while (0)
   1269 #define udiv_qrnnd(q, r, n1, n0, d) \
   1270   ({union {UDItype __ll;						\
   1271 	   struct {USItype __h, __l;} __i;				\
   1272 	  } __x, __q;							\
   1273   __x.__i.__h = (n1); __x.__i.__l = (n0);				\
   1274   __asm__ ("divu.d %0,%1,%2"						\
   1275 	   : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d));		\
   1276   (r) = (n0) - __q.__l * (d); (q) = __q.__l; })
   1277 #endif /* __m88110__ */
   1278 #endif /* __m88000__ */
   1279 
/* MIPS, 32-bit limbs: three umul_ppmm variants selected by compiler
   capability — plain C (gcc >= 4.4 / clang), =l/=h hi-lo register
   constraints (older gcc), or explicit mflo/mfhi.  */
#if defined (__mips) && W_TYPE_SIZE == 32
#if __GMP_GNUC_PREREQ (4,4) || defined(__clang__)
#define umul_ppmm(w1, w0, u, v) \
  do {									\
    UDItype __ll = (UDItype)(u) * (v);					\
    w1 = __ll >> 32;							\
    w0 = __ll;								\
  } while (0)
#endif
#if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__)
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
#endif
#if !defined (umul_ppmm)
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1"				\
	   : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
#endif
#endif /* __mips */

/* MIPS III+, 64-bit limbs: r6 split dmuhu + C low product, then TImode C,
   then =l/=h dmultu, then explicit mflo/mfhi, in order of preference.  */
#if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
#if defined (_MIPS_ARCH_MIPS64R6)
/* r6 removed hi/lo: low product in C, high product via dmuhu.  */
#define umul_ppmm(w1, w0, u, v) \
  do {									\
    UDItype __m0 = (u), __m1 = (v);					\
    (w0) = __m0 * __m1;							\
    __asm__ ("dmuhu\t%0, %1, %2" : "=d" (w1) : "d" (__m0), "d" (__m1));	\
  } while (0)
#endif
#if !defined (umul_ppmm) && (__GMP_GNUC_PREREQ (4,4) || defined(__clang__))
#define umul_ppmm(w1, w0, u, v) \
  do {									\
    typedef unsigned int __ll_UTItype __attribute__((mode(TI)));	\
    __ll_UTItype __ll = (__ll_UTItype)(u) * (v);			\
    w1 = __ll >> 64;							\
    w0 = __ll;								\
  } while (0)
#endif
#if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__)
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("dmultu %2,%3"						\
	   : "=l" (w0), "=h" (w1)					\
	   : "d" ((UDItype)(u)), "d" ((UDItype)(v)))
#endif
#if !defined (umul_ppmm)
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1"				\
	   : "=d" (w0), "=d" (w1)					\
	   : "d" ((UDItype)(u)), "d" ((UDItype)(v)))
#endif
#endif /* __mips */
   1331 
/* MMIX: MULU leaves the high 64 bits of the product in rH ("z" constraint).  */
#if defined (__mmix__) && W_TYPE_SIZE == 64
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("MULU %0,%2,%3" : "=r" (w0), "=z" (w1) : "r" (u), "r" (v))
#endif
   1336 
/* National Semiconductor 32000 series, 32-bit limbs.  */
#if defined (__ns32000__) && W_TYPE_SIZE == 32
/* 32x32->64 multiply via "meid" (multiply extended); the 64-bit result
   overwrites the double-register operand, low word first.  */
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll;						\
	   struct {USItype __l, __h;} __i;				\
	  } __x;							\
  __asm__ ("meid %2,%0"							\
	   : "=g" (__x.__ll)						\
	   : "%0" ((USItype)(u)), "g" ((USItype)(v)));			\
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
/* Full 32x32->64 multiply returning the UDItype product.  */
#define __umulsidi3(u, v) \
  ({UDItype __w;							\
    __asm__ ("meid %2,%0"						\
	     : "=g" (__w)						\
	     : "%0" ((USItype)(u)), "g" ((USItype)(v)));		\
    __w; })
/* 64/32 divide via "deid" (divide extended): remainder in the low word,
   quotient in the high word of the pair.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  ({union {UDItype __ll;						\
	   struct {USItype __l, __h;} __i;				\
	  } __x;							\
  __x.__i.__h = (n1); __x.__i.__l = (n0);				\
  __asm__ ("deid %2,%0"							\
	   : "=g" (__x.__ll)						\
	   : "0" (__x.__ll), "g" ((USItype)(d)));			\
  (r) = __x.__i.__l; (q) = __x.__i.__h; })
/* ctz via "ffsd" (find first set), seeded with 0.  */
#define count_trailing_zeros(count,x) \
  do {									\
    __asm__ ("ffsd	%2,%0"						\
	     : "=r" (count)						\
	     : "0" ((USItype) 0), "r" ((USItype) (x)));			\
  } while (0)
#endif /* __ns32000__ */
   1368 
   1369 /* In the past we had a block of various #defines tested
   1370        _ARCH_PPC    - AIX
   1371        _ARCH_PWR    - AIX
   1372        __powerpc__  - gcc
   1373        __POWERPC__  - BEOS
   1374        __ppc__      - Darwin
   1375        PPC          - old gcc, GNU/Linux, SysV
   1376    The plain PPC test was not good for vxWorks, since PPC is defined on all
   1377    CPUs there (eg. m68k too), as a constant one is expected to compare
   1378    CPU_FAMILY against.
   1379 
   1380    At any rate, this was pretty unattractive and a bit fragile.  The use of
   1381    HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of
   1382    getting the desired effect.
   1383 
   1384    ENHANCE-ME: We should test _IBMR2 here when we add assembly support for
   1385    the system vendor compilers.  (Is that vendor compilers with inline asm,
   1386    or what?)  */
   1387 
/* POWER / PowerPC, 32-bit limbs.  */
#if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc)	\
  && W_TYPE_SIZE == 32
/* (sh,sl) = (ah,al) + (bh,bl).  Special-cases constant bh of 0 (addze)
   and ~0 (addme) to save an operand; otherwise addc/adde carry chain.
   %I4/%I5 select the immediate opcode form when bl is a constant.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do {									\
    if (__builtin_constant_p (bh) && (bh) == 0)				\
      __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2"			\
	       : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl)	\
		 __CLOBBER_CC);						\
    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)		\
      __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2"			\
	       : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl)	\
		 __CLOBBER_CC);						\
    else								\
      __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3"			\
	       : "=r" (sh), "=&r" (sl)					\
	       : "r" (ah), "r" (bh), "%r" (al), "rI" (bl)		\
		 __CLOBBER_CC);						\
  } while (0)
/* (sh,sl) = (ah,al) - (bh,bl), via subtract-from (subfc/subfe).  Constant
   ah/bh of 0 or ~0 get shorter subfze/subfme/addme/addze forms.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {									\
    if (__builtin_constant_p (ah) && (ah) == 0)				\
      __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2"			\
	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl)	\
		 __CLOBBER_CC);						\
    else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0)		\
      __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2"			\
	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl)	\
		 __CLOBBER_CC);						\
    else if (__builtin_constant_p (bh) && (bh) == 0)			\
      __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2"			\
	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl)	\
		 __CLOBBER_CC);						\
    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)		\
      __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2"			\
	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl)	\
		 __CLOBBER_CC);						\
    else								\
      __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2"			\
	       : "=r" (sh), "=&r" (sl)					\
	       : "r" (ah), "r" (bh), "rI" (al), "r" (bl)		\
		 __CLOBBER_CC);						\
  } while (0)
/* cntlzw gives 32 for x == 0, hence COUNT_LEADING_ZEROS_0 below.  */
#define count_leading_zeros(count, x) \
  __asm__ ("cntlzw %0,%1" : "=r" (count) : "r" (x))
#define COUNT_LEADING_ZEROS_0 32
#if HAVE_HOST_CPU_FAMILY_powerpc
/* On gcc >= 4.4 / clang, plain C: the compiler emits mulhwu itself.  */
#if __GMP_GNUC_PREREQ (4,4) || defined(__clang__)
#define umul_ppmm(w1, w0, u, v) \
  do {									\
    UDItype __ll = (UDItype)(u) * (v);					\
    w1 = __ll >> 32;							\
    w0 = __ll;								\
  } while (0)
#endif
#if !defined (umul_ppmm)
/* 32x32->64 unsigned multiply: mulhwu forms the high word, a plain
   truncated product the low word.  Use the local copies __m0/__m1 in
   the asm as well, so each macro argument is evaluated exactly once
   (the 64-bit PowerPC variant of this macro already does so).  */
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    USItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
    (pl) = __m0 * __m1;							\
  } while (0)
#endif
/* Signed 32x32->64 multiply: mulhw forms the high word.  The local
   copies __m0/__m1 are used in the asm too, so each macro argument is
   evaluated exactly once, and the low word is computed in unsigned
   arithmetic to avoid signed-overflow undefined behaviour (the bit
   pattern is the same either way).  */
#define smul_ppmm(ph, pl, m0, m1) \
  do {									\
    SItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
    (pl) = (USItype) __m0 * (USItype) __m1;				\
  } while (0)
#else
/* Old POWER (pre-PowerPC): mul and div deliver their second result in
   the MQ special register, matched by the "q" constraint.  */
#define smul_ppmm(xh, xl, m0, m1) \
  __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1))
#define sdiv_qrnnd(q, r, nh, nl, d) \
  __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d))
#endif
#endif /* 32-bit POWER architecture variants.  */
   1463 
   1464 /* We should test _IBMR2 here when we add assembly support for the system
   1465    vendor compilers.  */
#if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64
#if !defined (_LONG_LONG_LIMB)
/* _LONG_LONG_LIMB is ABI=mode32 where adde operates on 32-bit values.  So
   use adde etc only when not _LONG_LONG_LIMB.  */
/* 64-bit double-word add; same compile-time constant folding of bh as
   the 32-bit PowerPC add_ssaaaa (addze for 0, addme for ~0), with all
   operands cast to UDItype.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do {									\
    if (__builtin_constant_p (bh) && (bh) == 0)				\
      __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2"			\
	       : "=r" (sh), "=&r" (sl)					\
	       : "r"  ((UDItype)(ah)),					\
		 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl))		\
		 __CLOBBER_CC);						\
    else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)		\
      __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2"			\
	       : "=r" (sh), "=&r" (sl)					\
	       : "r"  ((UDItype)(ah)),					\
		 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl))		\
		 __CLOBBER_CC);						\
    else								\
      __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3"			\
	       : "=r" (sh), "=&r" (sl)					\
	       : "r"  ((UDItype)(ah)), "r"  ((UDItype)(bh)),		\
		 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl))		\
		 __CLOBBER_CC);						\
  } while (0)
/* We use "*rI" for the constant operand here, since with just "I", gcc barfs.
   This might seem strange, but gcc folds away the dead code late.  */
/* The first branch handles a small nonzero compile-time-constant bl by
   adding its negation with addic; the second branch is the general
   subf-based form.  Within each, a constant ah or bh equal to 0 or ~0
   shortens the high-word instruction as in the 32-bit version.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {									\
    if (__builtin_constant_p (bl)					\
	&& (bl) > -0x8000 && (bl) <= 0x8000 && (bl) != 0) {		\
	if (__builtin_constant_p (ah) && (ah) == 0)			\
	  __asm__ ("addic %1,%3,%4\n\tsubfze %0,%2"			\
		   : "=r" (sh), "=&r" (sl)				\
		   :                       "r" ((UDItype)(bh)),		\
		     "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl)))	\
		     __CLOBBER_CC);					\
	else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)	\
	  __asm__ ("addic %1,%3,%4\n\tsubfme %0,%2"			\
		   : "=r" (sh), "=&r" (sl)				\
		   :                       "r" ((UDItype)(bh)),		\
		     "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl)))	\
		     __CLOBBER_CC);					\
	else if (__builtin_constant_p (bh) && (bh) == 0)		\
	  __asm__ ("addic %1,%3,%4\n\taddme %0,%2"			\
		   : "=r" (sh), "=&r" (sl)				\
		   : "r" ((UDItype)(ah)),				\
		     "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl)))	\
		     __CLOBBER_CC);					\
	else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)	\
	  __asm__ ("addic %1,%3,%4\n\taddze %0,%2"			\
		   : "=r" (sh), "=&r" (sl)				\
		   : "r" ((UDItype)(ah)),				\
		     "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl)))	\
		     __CLOBBER_CC);					\
	else								\
	  __asm__ ("addic %1,%4,%5\n\tsubfe %0,%3,%2"			\
		   : "=r" (sh), "=&r" (sl)				\
		   : "r" ((UDItype)(ah)), "r" ((UDItype)(bh)),		\
		     "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl)))	\
		     __CLOBBER_CC);					\
    } else {								\
	if (__builtin_constant_p (ah) && (ah) == 0)			\
	  __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2"			\
		   : "=r" (sh), "=&r" (sl)				\
		   :                       "r" ((UDItype)(bh)),		\
		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl))		\
		     __CLOBBER_CC);					\
	else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)	\
	  __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2"			\
		   : "=r" (sh), "=&r" (sl)				\
		   :                       "r" ((UDItype)(bh)),		\
		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl))		\
		     __CLOBBER_CC);					\
	else if (__builtin_constant_p (bh) && (bh) == 0)		\
	  __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2"			\
		   : "=r" (sh), "=&r" (sl)				\
		   : "r"  ((UDItype)(ah)),				\
		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl))		\
		     __CLOBBER_CC);					\
	else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)	\
	  __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2"			\
		   : "=r" (sh), "=&r" (sl)				\
		   : "r"  ((UDItype)(ah)),				\
		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl))		\
		     __CLOBBER_CC);					\
	else								\
	  __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2"		\
		   : "=r" (sh), "=&r" (sl)				\
		   : "r"  ((UDItype)(ah)), "r" ((UDItype)(bh)),		\
		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl))		\
		     __CLOBBER_CC);					\
    }									\
  } while (0)
#endif /* ! _LONG_LONG_LIMB */
/* cntlzd counts leading zeros of a 64-bit doubleword; 64 for x == 0.  */
#define count_leading_zeros(count, x) \
  __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
#define COUNT_LEADING_ZEROS_0 64
/* gcc >= 4.8 supports __int128 (TImode) on 64-bit PowerPC; let it
   expand the 64x64->128 product itself.  */
#if __GMP_GNUC_PREREQ (4,8)
#define umul_ppmm(w1, w0, u, v) \
  do {									\
    typedef unsigned int __ll_UTItype __attribute__((mode(TI)));	\
    __ll_UTItype __ll = (__ll_UTItype)(u) * (v);			\
    w1 = __ll >> 64;							\
    w0 = __ll;								\
  } while (0)
#endif
#if !defined (umul_ppmm)
/* Fallback: mulhdu for the high word, plain product for the low.  */
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    UDItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
    (pl) = __m0 * __m1;							\
  } while (0)
#endif
/* Signed 64x64->128 multiply via mulhd.
   NOTE(review): the DItype low-word product can overflow, which is
   undefined behaviour in C; consider computing it in UDItype.  */
#define smul_ppmm(ph, pl, m0, m1) \
  do {									\
    DItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
    (pl) = __m0 * __m1;							\
  } while (0)
#endif /* 64-bit PowerPC.  */
   1588 
#if defined (__pyr__) && W_TYPE_SIZE == 32
/* Pyramid: addw/addwc and subw/subwb chain the carry/borrow from the
   low word into the high word (two-operand, destination = operand 1).  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addw %5,%1\n\taddwc %3,%0"					\
	   : "=r" (sh), "=&r" (sl)					\
	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subw %5,%1\n\tsubwb %3,%0"					\
	   : "=r" (sh), "=&r" (sl)					\
	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
/* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP.  */
/* uemul leaves the 64-bit product in a register pair; the struct
   overlay then splits it into high and low words.  */
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll;						\
	   struct {USItype __h, __l;} __i;				\
	  } __x;							\
  __asm__ ("movw %1,%R0\n\tuemul %2,%0"					\
	   : "=&r" (__x.__ll)						\
	   : "g" ((USItype) (u)), "g" ((USItype)(v)));			\
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#endif /* __pyr__ */
   1610 
#if defined (__ibm032__) /* RT/ROMP */  && W_TYPE_SIZE == 32
/* ROMP: a/ae (add, add-extended) and s/se chain the carry/borrow from
   the low word into the high word.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("a %1,%5\n\tae %0,%3"					\
	   : "=r" (sh), "=&r" (sl)					\
	   : "0"  ((USItype)(ah)), "r" ((USItype)(bh)),			\
	     "%1" ((USItype)(al)), "r" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("s %1,%5\n\tse %0,%3"					\
	   : "=r" (sh), "=&r" (sl)					\
	   : "0" ((USItype)(ah)), "r" ((USItype)(bh)),			\
	     "1" ((USItype)(al)), "r" ((USItype)(bl)))
/* ROMP has no full multiply instruction: the multiplier is loaded into
   the MQ system register (mts r10), then sixteen "m" multiply-step
   instructions build the 32x32 product (presumably two bits per step —
   NOTE(review): per ROMP ISA, confirm against the processor manual),
   and cas/mfs extract the high and low words.  */
#define smul_ppmm(ph, pl, m0, m1) \
  __asm__ (								\
       "s	r2,r2\n"						\
"	mts r10,%2\n"							\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	cas	%0,r2,r0\n"						\
"	mfs	r10,%1"							\
	   : "=r" (ph), "=r" (pl)					\
	   : "%r" ((USItype)(m0)), "r" ((USItype)(m1))			\
	   : "r2")
/* clz operates on 16-bit halves, hence the split on x >= 0x10000 and
   the +16 adjustment when only the low half is nonzero.  */
#define count_leading_zeros(count, x) \
  do {									\
    if ((x) >= 0x10000)							\
      __asm__ ("clz	%0,%1"						\
	       : "=r" (count) : "r" ((USItype)(x) >> 16));		\
    else								\
      {									\
	__asm__ ("clz	%0,%1"						\
		 : "=r" (count) : "r" ((USItype)(x)));			\
	(count) += 16;							\
      }									\
  } while (0)
#endif /* RT/ROMP */
   1660 
#if defined (__riscv64) && W_TYPE_SIZE == 64
/* 64x64->128 unsigned multiply: the low word is the plain truncated
   product, the high word comes from mulhu.  RISC-V assembly places the
   destination register first ("mulhu rd, rs1, rs2"), so the output
   operand %0 (ph) must be the first operand in the template; the
   previous template "mulhu\t%2, %1, %0" named the input __v as the
   destination, clobbering it and leaving ph unwritten.  */
#define umul_ppmm(ph, pl, u, v) \
  do {									\
    UDItype __u = (u), __v = (v);					\
    (pl) = __u * __v;							\
    __asm__ ("mulhu\t%0, %1, %2" : "=r" (ph) : "%r" (__u), "r" (__v));	\
  } while (0)
#endif
   1669 
#if (defined (__SH2__) || defined (__SH3__) || defined (__SH4__)) && W_TYPE_SIZE == 32
/* SuperH: dmulu.l leaves the 64-bit product in the mach:macl register
   pair, copied to the outputs with sts.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0"		\
	   : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")
#endif
   1675 
#if defined (__sparc__) && W_TYPE_SIZE == 32
/* addcc/addx (and subcc/subx) chain the carry/borrow from the low word
   into the high word.  The "rJ" constraint also accepts the constant 0,
   emitted as %g0 via the %rN template spelling.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0"				\
	   : "=r" (sh), "=&r" (sl)					\
	   : "rJ" (ah), "rI" (bh),"%rJ" (al), "rI" (bl)			\
	   __CLOBBER_CC)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0"				\
	   : "=r" (sh), "=&r" (sl)					\
	   : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl)	\
	   __CLOBBER_CC)
/* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h
   doesn't define anything to indicate that to us, it only sets __sparcv8. */
#if defined (__sparc_v9__) || defined (__sparcv9)
/* Perhaps we should use floating-point operations here?  */
#if 0
/* Triggers a bug making mpz/tests/t-gcd.c fail.
   Perhaps we simply need explicitly zero-extend the inputs?  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" :		\
	   "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1")
#else
/* Use v8 umul until above bug is fixed.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
#endif
/* Use a plain v8 divide for v9.  */
/* The Y register holds the high word of the dividend; the nops allow
   for the write delay after wr %y (see the v7 umul_ppmm comment below).
   udiv yields only the quotient, so the remainder is recomputed.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do {									\
    USItype __q;							\
    __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"			\
	     : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));		\
    (r) = (n0) - __q * (d);						\
    (q) = __q;								\
  } while (0)
#else
#if defined (__sparc_v8__)   /* gcc normal */				\
  || defined (__sparcv8)     /* gcc solaris */				\
  || HAVE_HOST_CPU_supersparc
/* Don't match immediate range because, 1) it is not often useful,
   2) the 'I' flag thinks of the range as a 13 bit signed interval,
   while we want to match a 13 bit interval, sign extended to 32 bits,
   but INTERPRETED AS UNSIGNED.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))

#if HAVE_HOST_CPU_supersparc
#else
/* Don't use this on SuperSPARC because its udiv only handles 53 bit
   dividends and will trap to the kernel for the rest. */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do {									\
    USItype __q;							\
    __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"			\
	     : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));		\
    (r) = (n0) - __q * (d);						\
    (q) = __q;								\
  } while (0)
#endif /* HAVE_HOST_CPU_supersparc */

#else /* ! __sparc_v8__ */
#if defined (__sparclite__)
/* This has hardware multiply but not divide.  It also has two additional
   instructions scan (ffs from high bit) and divscc.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
/* 32 divscc divide-step instructions produce one quotient bit each;
   the final bl,a/add fixes up a negative partial remainder.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("! Inlined udiv_qrnnd\n"					\
"	wr	%%g0,%2,%%y	! Not a delayed write for sparclite\n"	\
"	tst	%%g0\n"							\
"	divscc	%3,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%0\n"						\
"	rd	%%y,%1\n"						\
"	bl,a 1f\n"							\
"	add	%1,%4,%1\n"						\
"1:	! End of inline udiv_qrnnd"					\
	   : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d)		\
	   : "%g1" __AND_CLOBBER_CC)
#define count_leading_zeros(count, x) \
  __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x))
/* Early sparclites return 63 for an argument of 0, but they warn that future
   implementations might change this.  Therefore, leave COUNT_LEADING_ZEROS_0
   undefined.  */
#endif /* __sparclite__ */
#endif /* __sparc_v8__ */
#endif /* __sparc_v9__ */
/* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd.  */
/* 32 mulscc multiply steps plus a final shift step; since mulscc is a
   signed step, %g2 = u & sign-mask(v) is added to the high word to
   correct the product for a negative v.  */
#ifndef umul_ppmm
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("! Inlined umul_ppmm\n"					\
"	wr	%%g0,%2,%%y	! SPARC has 0-3 delay insn after a wr\n" \
"	sra	%3,31,%%g2	! Don't move this insn\n"		\
"	and	%2,%%g2,%%g2	! Don't move this insn\n"		\
"	andcc	%%g0,0,%%g1	! Don't move this insn\n"		\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,0,%%g1\n"						\
"	add	%%g1,%%g2,%0\n"						\
"	rd	%%y,%1"							\
	   : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v)			\
	   : "%g1", "%g2" __AND_CLOBBER_CC)
#endif
#ifndef udiv_qrnnd
#ifndef LONGLONG_STANDALONE
/* No usable divide instruction: call the out-of-line assembly routine
   __MPN(udiv_qrnnd), which returns the quotient and stores the
   remainder through the pointer.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __r;							\
    (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));			\
    (r) = __r;								\
  } while (0)
extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
#endif /* LONGLONG_STANDALONE */
#endif /* udiv_qrnnd */
#endif /* __sparc__ */
   1849 
#if defined (__sparc__) && W_TYPE_SIZE == 64
/* 64-bit double-word add.  NOTE(review): addx consumes the 32-bit
   (icc) carry, so the carry out of the full 64-bit low-word add is
   apparently reconstructed by re-adding the high halves of al and bl
   (operands 6 and 7) with addccc before the high-word addc — confirm
   against the SPARC v9 manual.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ (								\
       "addcc	%r4,%5,%1\n"						\
      "	addccc	%r6,%7,%%g0\n"						\
      "	addc	%r2,%3,%0"						\
       : "=r" (sh), "=&r" (sl)						\
       : "rJ"  ((UDItype)(ah)), "rI" ((UDItype)(bh)),			\
	 "%rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)),			\
	 "%rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32)	\
	   __CLOBBER_CC)
/* Double-word subtract with the same high-half borrow reconstruction.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ (								\
       "subcc	%r4,%5,%1\n"						\
      "	subccc	%r6,%7,%%g0\n"						\
      "	subc	%r2,%3,%0"						\
       : "=r" (sh), "=&r" (sl)						\
       : "rJ" ((UDItype)(ah)), "rI" ((UDItype)(bh)),			\
	 "rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)),			\
	 "rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32)		\
	   __CLOBBER_CC)
#if __VIS__ >= 0x300
#undef add_ssaaaa
/* VIS 3.0 addxc consumes the 64-bit carry directly, so no
   reconstruction step is needed.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ (								\
       "addcc	%r4, %5, %1\n"						\
      "	addxc	%r2, %r3, %0"						\
	  : "=r" (sh), "=&r" (sl)					\
       : "rJ"  ((UDItype)(ah)), "rJ" ((UDItype)(bh)),			\
	 "%rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)) __CLOBBER_CC)
/* umulxhi gives the high 64 bits of the product; the low word is the
   plain truncated product.  */
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    UDItype __m0 = (m0), __m1 = (m1);					\
    (pl) = __m0 * __m1;							\
    __asm__ ("umulxhi\t%2, %1, %0"					\
	     : "=r" (ph)						\
	     : "%r" (__m0), "r" (__m1));				\
  } while (0)
#define count_leading_zeros(count, x) \
  __asm__ ("lzd\t%1,%0" : "=r" (count) : "r" (x))
/* Needed by count_leading_zeros_32 in sparc64.h.  */
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#endif
#endif
   1894 
#if (defined (__vax) || defined (__vax__)) && W_TYPE_SIZE == 32
/* VAX: addl2/adwc and subl2/sbwc chain the carry/borrow from the low
   word into the high word (two-operand, destination last).  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addl2 %5,%1\n\tadwc %3,%0"					\
	   : "=g" (sh), "=&g" (sl)					\
	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subl2 %5,%1\n\tsbwc %3,%0"					\
	   : "=g" (sh), "=&g" (sl)					\
	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
/* emul forms the signed 64-bit product (plus the literal addend $0);
   the little-endian union overlay splits it into the two words.  */
#define smul_ppmm(xh, xl, m0, m1) \
  do {									\
    union {UDItype __ll;						\
	   struct {USItype __l, __h;} __i;				\
	  } __x;							\
    USItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("emul %1,%2,$0,%0"						\
	     : "=g" (__x.__ll) : "g" (__m0), "g" (__m1));		\
    (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
  } while (0)
/* ediv divides the 64-bit value (n1,n0) by d, yielding a signed
   quotient and remainder.  */
#define sdiv_qrnnd(q, r, n1, n0, d) \
  do {									\
    union {DItype __ll;							\
	   struct {SItype __l, __h;} __i;				\
	  } __x;							\
    __x.__i.__h = n1; __x.__i.__l = n0;					\
    __asm__ ("ediv %3,%2,%0,%1"						\
	     : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d));		\
  } while (0)
#if 0
/* FIXME: This instruction appears to be unimplemented on some systems (vax
   8800 maybe). */
#define count_trailing_zeros(count,x)					\
  do {									\
    __asm__ ("ffs 0, 31, %1, %0"					\
	     : "=g" (count)						\
	     : "g" ((USItype) (x)));					\
  } while (0)
#endif
#endif /* vax */
   1936 
#if defined (__z8000__) && W_TYPE_SIZE == 16
/* Z8000, 16-bit words: add/adc and sub/sbc chain the carry/borrow from
   the low word into the high word.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add	%H1,%H5\n\tadc	%H0,%H3"				\
	   : "=r" (sh), "=&r" (sl)					\
	   : "0"  ((unsigned int)(ah)), "r" ((unsigned int)(bh)),	\
	     "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub	%H1,%H5\n\tsbc	%H0,%H3"				\
	   : "=r" (sh), "=&r" (sl)					\
	   : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)),	\
	     "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
/* mult produces a signed 16x16->32 product into a register pair; the
   two masked additions afterwards convert the signed high word to the
   unsigned high word (add back one copy of the other operand for each
   negative input).  */
#define umul_ppmm(xh, xl, m0, m1) \
  do {									\
    union {long int __ll;						\
	   struct {unsigned int __h, __l;} __i;				\
	  } __x;							\
    unsigned int __m0 = (m0), __m1 = (m1);				\
    __asm__ ("mult	%S0,%H3"					\
	     : "=r" (__x.__i.__h), "=r" (__x.__i.__l)			\
	     : "%1" (m0), "rQR" (m1));					\
    (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
    (xh) += ((((signed int) __m0 >> 15) & __m1)				\
	     + (((signed int) __m1 >> 15) & __m0));			\
  } while (0)
#endif /* __z8000__ */
   1962 
   1963 #endif /* __GNUC__ */
   1964 
   1965 #endif /* NO_ASM */
   1966 
   1967 
/* FIXME: "sidi" here is highly doubtful, should sometimes be "diti".  */
/* Derive umul_ppmm from the compiler-provided __umulsidi3: form the
   full double-word product, then split it into high and low words.  */
#if !defined (umul_ppmm) && defined (__umulsidi3)
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    UDWtype __ll = __umulsidi3 (m0, m1);				\
    ph = (UWtype) (__ll >> W_TYPE_SIZE);				\
    pl = (UWtype) __ll;							\
  } while (0)
#endif
   1977 
/* Inverse fallback: synthesize __umulsidi3 from umul_ppmm, using a GNU
   C statement expression to return the assembled double word.  */
#if !defined (__umulsidi3)
#define __umulsidi3(u, v) \
  ({UWtype __hi, __lo;							\
    umul_ppmm (__hi, __lo, u, v);					\
    ((UDWtype) __hi << W_TYPE_SIZE) | __lo; })
#endif
   1984 
   1985 
#if defined (__cplusplus)
/* Expands to "C" so the extern mpn_* declarations below get C linkage
   when this header is compiled as C++.  */
#define __longlong_h_C "C"
#else
#define __longlong_h_C
#endif
   1991 
/* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist.  The "_r"
   forms have "reversed" arguments, meaning the pointer is last, which
   sometimes allows better parameter passing, in particular on 64-bit
   hppa. */

/* mpn_umul_ppmm returns the high word of the product and stores the
   low word through the pointer.  */
#define mpn_umul_ppmm  __MPN(umul_ppmm)
extern __longlong_h_C UWtype mpn_umul_ppmm (UWtype *, UWtype, UWtype);

#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm  \
  && ! defined (LONGLONG_STANDALONE)
#define umul_ppmm(wh, wl, u, v)						\
  do {									\
    UWtype __umul_ppmm__p0;						\
    (wh) = mpn_umul_ppmm (&__umul_ppmm__p0, (UWtype) (u), (UWtype) (v));\
    (wl) = __umul_ppmm__p0;						\
  } while (0)
#endif
   2009 
/* Reversed-argument variant: the low-word pointer is passed last.  */
#define mpn_umul_ppmm_r  __MPN(umul_ppmm_r)
extern __longlong_h_C UWtype mpn_umul_ppmm_r (UWtype, UWtype, UWtype *);

#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r	\
  && ! defined (LONGLONG_STANDALONE)
#define umul_ppmm(wh, wl, u, v)						\
  do {									\
    UWtype __umul_p0;							\
    (wh) = mpn_umul_ppmm_r ((UWtype) (u), (UWtype) (v), &__umul_p0);	\
    (wl) = __umul_p0;							\
  } while (0)
#endif
   2022 
/* mpn_udiv_qrnnd divides the two-word value (n1,n0) by d, returning
   the quotient and storing the remainder through the pointer.  */
#define mpn_udiv_qrnnd  __MPN(udiv_qrnnd)
extern __longlong_h_C UWtype mpn_udiv_qrnnd (UWtype *, UWtype, UWtype, UWtype);

#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd	\
  && ! defined (LONGLONG_STANDALONE)
#define udiv_qrnnd(q, r, n1, n0, d)					\
  do {									\
    UWtype __udiv_qrnnd_r;						\
    (q) = mpn_udiv_qrnnd (&__udiv_qrnnd_r,				\
			  (UWtype) (n1), (UWtype) (n0), (UWtype) d);	\
    (r) = __udiv_qrnnd_r;						\
  } while (0)
#endif
   2036 
/* Reversed-argument variant: the remainder pointer is passed last.  */
#define mpn_udiv_qrnnd_r  __MPN(udiv_qrnnd_r)
extern __longlong_h_C UWtype mpn_udiv_qrnnd_r (UWtype, UWtype, UWtype, UWtype *);

#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r	\
  && ! defined (LONGLONG_STANDALONE)
#define udiv_qrnnd(q, r, n1, n0, d)					\
  do {									\
    UWtype __udiv_qrnnd_r;						\
    (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) d,	\
			    &__udiv_qrnnd_r);				\
    (r) = __udiv_qrnnd_r;						\
  } while (0)
#endif
   2050 
   2051 
   2052 /* If this machine has no inline assembler, use C macros.  */
   2053 
#if !defined (add_ssaaaa)
/* Generic C double-word add, (sh,sl) = (ah,al) + (bh,bl).  The carry
   out of the low-word add is detected by the unsigned-wraparound test
   __lo_sum < __a_lo.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do {									\
    UWtype __a_lo = (al);						\
    UWtype __b_lo = (bl);						\
    UWtype __lo_sum = __a_lo + __b_lo;					\
    (sh) = (ah) + (bh) + (__lo_sum < __a_lo);				\
    (sl) = __lo_sum;							\
  } while (0)
#endif
   2065 
#if !defined (sub_ddmmss)
/* Generic C double-word subtract, (sh,sl) = (ah,al) - (bh,bl).  A
   borrow out of the low word occurs exactly when __a_lo < __b_lo.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {									\
    UWtype __a_lo = (al);						\
    UWtype __b_lo = (bl);						\
    UWtype __lo_diff = __a_lo - __b_lo;					\
    (sh) = (ah) - (bh) - (__a_lo < __b_lo);				\
    (sl) = __lo_diff;							\
  } while (0)
#endif
   2077 
   2078 /* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
   2079    smul_ppmm.  */
#if !defined (umul_ppmm) && defined (smul_ppmm)
/* Correct the signed high word to the unsigned high word: for each
   operand that is negative (top bit set), the signed product's high
   word is low by one copy of the other operand, added back here via
   the all-ones/all-zeros mask -(x >> (W_TYPE_SIZE-1)).  */
#define umul_ppmm(w1, w0, u, v)						\
  do {									\
    UWtype __w1;							\
    UWtype __xm0 = (u), __xm1 = (v);					\
    smul_ppmm (__w1, w0, __xm0, __xm1);					\
    (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)		\
		+ (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);		\
  } while (0)
#endif
   2090 
   2091 /* If we still don't have umul_ppmm, define it using plain C.
   2092 
   2093    For reference, when this code is used for squaring (ie. u and v identical
   2094    expressions), gcc recognises __x1 and __x2 are the same and generates 3
   2095    multiplies, not 4.  The subsequent additions could be optimized a bit,
   2096    but the only place GMP currently uses such a square is mpn_sqr_basecase,
   2097    and chips obliged to use this generic C umul will have plenty of worse
   2098    performance problems than a couple of extra instructions on the diagonal
   2099    of sqr_basecase.  */
   2100 
#if !defined (umul_ppmm)
/* Schoolbook multiply on half-words: four partial products, with the
   two cross terms accumulated into __x1 and any carry out of that
   accumulation propagated into __x3.  */
#define umul_ppmm(w1, w0, u, v)						\
  do {									\
    UWtype __x0, __x1, __x2, __x3;					\
    UHWtype __ul, __vl, __uh, __vh;					\
    UWtype __u = (u), __v = (v);					\
									\
    __ul = __ll_lowpart (__u);						\
    __uh = __ll_highpart (__u);						\
    __vl = __ll_lowpart (__v);						\
    __vh = __ll_highpart (__v);						\
									\
    __x0 = (UWtype) __ul * __vl;					\
    __x1 = (UWtype) __ul * __vh;					\
    __x2 = (UWtype) __uh * __vl;					\
    __x3 = (UWtype) __uh * __vh;					\
									\
    __x1 += __ll_highpart (__x0);/* this can't give carry */		\
    __x1 += __x2;		/* but this indeed can */		\
    if (__x1 < __x2)		/* did we get it? */			\
      __x3 += __ll_B;		/* yes, add it in the proper pos. */	\
									\
    (w1) = __x3 + __ll_highpart (__x1);					\
    (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0);		\
  } while (0)
#endif
   2127 
   2128 /* If we don't have smul_ppmm, define it using umul_ppmm (which surely will
   2129    exist in one form or another.  */
#if !defined (smul_ppmm)
/* Inverse of the umul-from-smul correction above: subtract one copy of
   the other operand from the unsigned high word for each operand whose
   top bit is set, giving the signed high word.  */
#define smul_ppmm(w1, w0, u, v)						\
  do {									\
    UWtype __w1;							\
    UWtype __xm0 = (u), __xm1 = (v);					\
    umul_ppmm (__w1, w0, __xm0, __xm1);					\
    (w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)		\
		- (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);		\
  } while (0)
#endif
   2140 
/* Define this unconditionally, so it can be used for debugging.  */
/* Divide the two-word number (N1,N0) by the single word D, producing the
   one-word quotient Q and remainder R.  Requires D != 0 and N1 < D (both
   ASSERTed) so that the quotient fits in one word.  The quotient is
   built one half-word digit at a time: each digit is first estimated
   from the divisor's high half __d1 alone, then the low-half
   contribution __m = digit * __d0 is checked against the partial
   remainder, decrementing the digit (at most twice) when the estimate
   was one or two too large.  NOTE(review): the digit estimates appear to
   rely on D being pre-normalized (high bit set), matching
   UDIV_NEEDS_NORMALIZATION below — confirm against GMP's callers.  */
#define __udiv_qrnnd_c(q, r, n1, n0, d) \
  do {									\
    UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m;			\
									\
    ASSERT ((d) != 0);							\
    ASSERT ((n1) < (d));						\
									\
    __d1 = __ll_highpart (d);						\
    __d0 = __ll_lowpart (d);						\
									\
    __q1 = (n1) / __d1;		/* estimate of high quotient digit */	\
    __r1 = (n1) - __q1 * __d1;						\
    __m = __q1 * __d0;							\
    __r1 = __r1 * __ll_B | __ll_highpart (n0);				\
    if (__r1 < __m)							\
      {									\
	__q1--, __r1 += (d);						\
	if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
	  if (__r1 < __m)						\
	    __q1--, __r1 += (d);					\
      }									\
    __r1 -= __m;							\
									\
    __q0 = __r1 / __d1;		/* estimate of low quotient digit */	\
    __r0 = __r1  - __q0 * __d1;						\
    __m = __q0 * __d0;							\
    __r0 = __r0 * __ll_B | __ll_lowpart (n0);				\
    if (__r0 < __m)							\
      {									\
	__q0--, __r0 += (d);						\
	if (__r0 >= (d))						\
	  if (__r0 < __m)						\
	    __q0--, __r0 += (d);					\
      }									\
    __r0 -= __m;							\
									\
    (q) = __q1 * __ll_B | __q0;	/* assemble the two quotient digits */	\
    (r) = __r0;								\
  } while (0)
   2181 
/* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through
   __udiv_w_sdiv (defined in libgcc or elsewhere).  */
#if !defined (udiv_qrnnd) && defined (sdiv_qrnnd) \
  && ! defined (LONGLONG_STANDALONE)
/* Unsigned (NH,NL) / D via the out-of-line helper __MPN(udiv_w_sdiv),
   which returns the quotient and stores the remainder through its first
   argument.  The local __r avoids evaluating the caller's R lvalue more
   than once and lets us take its address safely.  */
#define udiv_qrnnd(q, r, nh, nl, d) \
  do {									\
    UWtype __r;								\
    (q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d);				\
    (r) = __r;								\
  } while (0)
__GMP_DECLSPEC UWtype __MPN(udiv_w_sdiv) (UWtype *, UWtype, UWtype, UWtype);
#endif
   2194 
/* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c.  */
#if !defined (udiv_qrnnd)
/* The generic C routine estimates quotient digits from the divisor's
   high half, which presumably requires the divisor to be normalized
   (high bit set) before the call — hence UDIV_NEEDS_NORMALIZATION is
   forced to 1 here; confirm against GMP's division callers.  */
#define UDIV_NEEDS_NORMALIZATION 1
#define udiv_qrnnd __udiv_qrnnd_c
#endif
   2200 
#if !defined (count_leading_zeros)
/* Generic C count_leading_zeros: set COUNT to the number of zero bits
   above the highest set bit of X.  For 32-bit words a compare ladder in
   quarter-word (__BITS4) steps locates the region holding the top set
   bit; otherwise a loop scans down a byte at a time.  Either way __a
   ends up as 1 + the bit offset of that region, and the __clz_tab
   lookup on (X >> __a) supplies the position within it, combined in the
   final W_TYPE_SIZE + 1 - __a - tab formula.  */
#define count_leading_zeros(count, x) \
  do {									\
    UWtype __xr = (x);							\
    UWtype __a;								\
									\
    if (W_TYPE_SIZE == 32)						\
      {									\
	__a = __xr < ((UWtype) 1 << 2*__BITS4)				\
	  ? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1)		\
	  : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 + 1		\
	  : 3*__BITS4 + 1);						\
      }									\
    else								\
      {									\
	for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8)			\
	  if (((__xr >> __a) & 0xff) != 0)				\
	    break;							\
	++__a;								\
      }									\
									\
    (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a];		\
  } while (0)
/* This version gives a well-defined value for zero. */
#define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1)
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#define COUNT_LEADING_ZEROS_SLOW
#endif
   2229 
/* clz_tab needed by mpn/x86/pentium/mod_1.asm in a fat binary */
#if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#endif

#ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
/* Lookup table used by the generic count_leading_zeros and
   count_trailing_zeros macros above; 129 entries, so indices 0..128 are
   valid.  Defined elsewhere in GMP.  */
extern const unsigned char __GMP_DECLSPEC __clz_tab[129];
#endif
   2238 
#if !defined (count_trailing_zeros)
#if !defined (COUNT_LEADING_ZEROS_SLOW)
/* Define count_trailing_zeros using an asm count_leading_zeros.  */
/* X & -X isolates the lowest set bit of X, so its leading-zero count
   directly gives the trailing-zero count of X as W_TYPE_SIZE - 1 - clz.
   X must be non-zero (ASSERTed).  */
#define count_trailing_zeros(count, x)					\
  do {									\
    UWtype __ctz_x = (x);						\
    UWtype __ctz_c;							\
    ASSERT (__ctz_x != 0);						\
    count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x);			\
    (count) = W_TYPE_SIZE - 1 - __ctz_c;				\
  } while (0)
#else
/* Define count_trailing_zeros in plain C, assuming small counts are common.
   We use clz_tab without ado, since the C count_leading_zeros above will have
   pulled it in.  */
/* Fast path: the lowest set bit lies in the bottom byte, answered by a
   single __clz_tab lookup on the isolated bit X & -X (the -2 and the
   8 - 2 loop start compensate for __clz_tab's bias — see the clz
   formula above).  Otherwise shift down a byte at a time until a
   non-zero byte appears, accumulating 8 per step in __ctz_c.  X must be
   non-zero.  */
#define count_trailing_zeros(count, x)					\
  do {									\
    UWtype __ctz_x = (x);						\
    int __ctz_c;							\
									\
    if (LIKELY ((__ctz_x & 0xff) != 0))					\
      (count) = __clz_tab[__ctz_x & -__ctz_x] - 2;			\
    else								\
      {									\
	for (__ctz_c = 8 - 2; __ctz_c < W_TYPE_SIZE - 2; __ctz_c += 8)	\
	  {								\
	    __ctz_x >>= 8;						\
	    if (LIKELY ((__ctz_x & 0xff) != 0))				\
	      break;							\
	  }								\
									\
	(count) = __ctz_c + __clz_tab[__ctz_x & -__ctz_x];		\
      }									\
  } while (0)
#endif
#endif
   2275 
/* Default: a processor-specific udiv_qrnnd that did not set the flag
   above is taken to accept an unnormalized divisor.  */
#ifndef UDIV_NEEDS_NORMALIZATION
#define UDIV_NEEDS_NORMALIZATION 0
#endif

/* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, and
   that hence the latter should always be used.  */
#ifndef UDIV_PREINV_ALWAYS
#define UDIV_PREINV_ALWAYS 0
#endif
   2285