Home | History | Annotate | Line # | Download | only in include
longlong.h revision 1.1.1.6
      1 /* longlong.h -- definitions for mixed size 32/64 bit arithmetic.
      2    Copyright (C) 1991-2020 Free Software Foundation, Inc.
      3 
      4    This file is part of the GNU C Library.
      5 
      6    The GNU C Library is free software; you can redistribute it and/or
      7    modify it under the terms of the GNU Lesser General Public
      8    License as published by the Free Software Foundation; either
      9    version 2.1 of the License, or (at your option) any later version.
     10 
     11    In addition to the permissions in the GNU Lesser General Public
     12    License, the Free Software Foundation gives you unlimited
     13    permission to link the compiled version of this file into
     14    combinations with other programs, and to distribute those
     15    combinations without any restriction coming from the use of this
     16    file.  (The Lesser General Public License restrictions do apply in
     17    other respects; for example, they cover modification of the file,
     18    and distribution when not linked into a combine executable.)
     19 
     20    The GNU C Library is distributed in the hope that it will be useful,
     21    but WITHOUT ANY WARRANTY; without even the implied warranty of
     22    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     23    Lesser General Public License for more details.
     24 
     25    You should have received a copy of the GNU Lesser General Public
     26    License along with the GNU C Library; if not, see
     27    <http://www.gnu.org/licenses/>.  */
     28 
     29 /* You have to define the following before including this file:
     30 
     31    UWtype -- An unsigned type, default type for operations (typically a "word")
     32    UHWtype -- An unsigned type, at least half the size of UWtype.
   UDWtype -- An unsigned type, at least twice as large as UWtype
     34    W_TYPE_SIZE -- size in bits of UWtype
     35 
     36    UQItype -- Unsigned 8 bit type.
     37    SItype, USItype -- Signed and unsigned 32 bit types.
     38    DItype, UDItype -- Signed and unsigned 64 bit types.
     39 
     40    On a 32 bit machine UWtype should typically be USItype;
     41    on a 64 bit machine, UWtype should typically be UDItype.  */
     42 
/* Half-word helpers.  A UWtype is viewed as two "digits" in base __ll_B,
   where __ll_B is 2 raised to half the word size.  */
#define __BITS4 (W_TYPE_SIZE / 4)
#define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
/* Low digit: the value modulo the base.  */
#define __ll_lowpart(t) ((UWtype) (t) % __ll_B)
/* High digit: the value divided by the base.  */
#define __ll_highpart(t) ((UWtype) (t) / __ll_B)
     47 
/* If the includer did not choose a word size, fall back to a 32-bit word
   with a 64-bit double word.  */
#if !defined (W_TYPE_SIZE)
# define W_TYPE_SIZE	32
# define UWtype		USItype
# define UHWtype	USItype
# define UDWtype	UDItype
#endif /* !W_TYPE_SIZE */
     54 
/* attribute_hidden is only meaningful inside glibc; make it expand to
   nothing when the includer has not provided a definition.  */
#if !defined (attribute_hidden)
# define attribute_hidden
#endif
     59 
/* 256-entry lookup table indexed by a byte value.  It is only declared
   here (the definition lives elsewhere); the table-driven
   count_leading_zeros implementations below read from it.  */
extern const UQItype __clz_tab[256] attribute_hidden;
     61 
     62 /* Define auxiliary asm macros.
     63 
     64    1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
     65    UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
     66    word product in HIGH_PROD and LOW_PROD.
     67 
     68    2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
     69    UDWtype product.  This is just a variant of umul_ppmm.
     70 
     71    3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
     72    denominator) divides a UDWtype, composed by the UWtype integers
     73    HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
     74    in QUOTIENT and the remainder in REMAINDER.  HIGH_NUMERATOR must be less
     75    than DENOMINATOR for correct operation.  If, in addition, the most
     76    significant bit of DENOMINATOR must be 1, then the pre-processor symbol
     77    UDIV_NEEDS_NORMALIZATION is defined to 1.
     78 
     79    4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
     80    denominator).  Like udiv_qrnnd but the numbers are signed.  The quotient
     81    is rounded towards 0.
     82 
     83    5) count_leading_zeros(count, x) counts the number of zero-bits from the
     84    msb to the first nonzero bit in the UWtype X.  This is the number of
     85    steps X needs to be shifted left to set the msb.  Undefined for X == 0,
     86    unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.
     87 
     88    6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
     89    from the least significant end.
     90 
   7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
   high_addend_2, low_addend_2) adds two two-word UWtype integers, composed
   by HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
   respectively.  The result is placed in HIGH_SUM and LOW_SUM.  Overflow
   (i.e. carry out) is not stored anywhere, and is lost.
     96 
   8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
   high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
   composed by HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
   LOW_SUBTRAHEND respectively.  The result is placed in HIGH_DIFFERENCE
   and LOW_DIFFERENCE.  Overflow (i.e. borrow out) is not stored anywhere,
   and is lost.
    103 
    104    If any of these macros are left undefined for a particular CPU,
    105    C macros are used.  */
    106 
    107 /* The CPUs come in alphabetical order below.
    108 
    109    Please add support for more CPUs here, or improve the current support
    110    for the CPUs below!
    111    (E.g. WE32100, IBM360.)  */
    112 
    113 #if defined (__GNUC__) && !defined (NO_ASM)
    114 
/* gcc2 and later sometimes need a "cc" clobber on an asm, but gcc1 does not
   understand one.  These helpers expand to the clobber (introducing the
   clobber list, or appending to an existing one) only where it is
   understood, so each asm needs to be written just once.  */
#if __GNUC__ >= 2
# define __CLOBBER_CC : "cc"
# define __AND_CLOBBER_CC , "cc"
#else /* __GNUC__ < 2 */
# define __CLOBBER_CC
# define __AND_CLOBBER_CC
#endif /* __GNUC__ >= 2 */
    124 
#if defined (__aarch64__)

/* Bit scans map directly onto the compiler builtins; the builtin variant
   is picked to match the configured word size.  */
# if W_TYPE_SIZE == 32
#  define count_leading_zeros(COUNT, X)	((COUNT) = __builtin_clz (X))
#  define count_trailing_zeros(COUNT, X)	((COUNT) = __builtin_ctz (X))
#  define COUNT_LEADING_ZEROS_0 32
# elif W_TYPE_SIZE == 64
#  define count_leading_zeros(COUNT, X)	((COUNT) = __builtin_clzll (X))
#  define count_trailing_zeros(COUNT, X)	((COUNT) = __builtin_ctzll (X))
#  define COUNT_LEADING_ZEROS_0 64
# endif /* W_TYPE_SIZE */

#endif /* __aarch64__ */
    140 
    141 #if defined (__alpha) && W_TYPE_SIZE == 64
/* g++ releases before version 5 reject __builtin_alpha_umulh, so this
   definition is offered only to C, or to g++ 5 and later.  The high word
   comes from the builtin, the low word from an ordinary multiply.  */
#if __GNUC__ >= 5 || !defined(__cplusplus)
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    UDItype __mul0 = (m0), __mul1 = (m1);				\
    (ph) = __builtin_alpha_umulh (__mul0, __mul1);			\
    (pl) = __mul0 * __mul1;						\
  } while (0)
#define UMUL_TIME 46
#endif /* !c++ */
    153 #ifndef LONGLONG_STANDALONE
    154 #define udiv_qrnnd(q, r, n1, n0, d) \
    155   do { UDItype __r;							\
    156     (q) = __udiv_qrnnd (&__r, (n1), (n0), (d));				\
    157     (r) = __r;								\
    158   } while (0)
    159 extern UDItype __udiv_qrnnd (UDItype *, UDItype, UDItype, UDItype);
    160 #define UDIV_TIME 220
    161 #endif /* LONGLONG_STANDALONE */
#ifdef __alpha_cix__
/* With the CIX extension the compiler builtins are used directly.  */
#define count_leading_zeros(COUNT,X)	((COUNT) = __builtin_clzl (X))
#define count_trailing_zeros(COUNT,X)	((COUNT) = __builtin_ctzl (X))
#define COUNT_LEADING_ZEROS_0 64
#else
/* Without CIX the scan is done byte-wise.  cmpbge (0, x) is used as a
   per-byte "this byte is zero" mask (NOTE(review): per the Alpha CMPBGE
   definition -- confirm), so __t ^ 0xff flags the nonzero bytes.  A
   __clz_tab lookup on that mask gives the index __a of the most
   significant nonzero byte, extbl extracts that byte, and a second
   __clz_tab lookup locates the top set bit inside it.  */
#define count_leading_zeros(COUNT,X) \
  do {									\
    UDItype __xr = (X), __t, __a;					\
    __t = __builtin_alpha_cmpbge (0, __xr);				\
    __a = __clz_tab[__t ^ 0xff] - 1;					\
    __t = __builtin_alpha_extbl (__xr, __a);				\
    (COUNT) = 64 - (__clz_tab[__t] + __a*8);				\
  } while (0)
/* Same byte-wise idea from the low end.  ~__t & -~__t keeps only the
   lowest set bit of the inverted cmpbge mask, i.e. a one-hot marker for
   the lowest nonzero byte.  Three mask tests (0xCC, 0xF0, 0xAA) turn a
   one-hot 8-bit value into its bit index; that is done once to get the
   byte index (scaled to a bit offset by __a <<= 3) and once more on the
   lowest set bit of the extracted byte.  */
#define count_trailing_zeros(COUNT,X) \
  do {									\
    UDItype __xr = (X), __t, __a;					\
    __t = __builtin_alpha_cmpbge (0, __xr);				\
    __t = ~__t & -~__t;							\
    __a = ((__t & 0xCC) != 0) * 2;					\
    __a += ((__t & 0xF0) != 0) * 4;					\
    __a += ((__t & 0xAA) != 0);						\
    __t = __builtin_alpha_extbl (__xr, __a);				\
    __a <<= 3;								\
    __t &= -__t;							\
    __a += ((__t & 0xCC) != 0) * 2;					\
    __a += ((__t & 0xF0) != 0) * 4;					\
    __a += ((__t & 0xAA) != 0);						\
    (COUNT) = __a;							\
  } while (0)
#endif /* __alpha_cix__ */
    192 #endif /* __alpha */
    193 
    194 #if defined (__arc__) && W_TYPE_SIZE == 32
/* Two-word add.  The low words are added first with add.f, then adc adds
   the high words; the asm declares "cc" as clobbered.  SL is early-clobber
   ("=&r") because it is written before AH and BH are read.  AH and AL are
   marked commutative ("%") with the operand that follows them.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add.f	%1, %4, %5\n\tadc	%0, %2, %3"		\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "%r" ((USItype) (ah)),					\
	     "rICal" ((USItype) (bh)),					\
	     "%r" ((USItype) (al)),					\
	     "rICal" ((USItype) (bl))					\
	   : "cc")
/* Two-word subtract, same shape as add_ssaaaa: sub.f on the low words
   followed by sbc on the high words.  Subtraction is not commutative, so
   no "%" modifiers are used.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub.f	%1, %4, %5\n\tsbc	%0, %2, %3"		\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "r" ((USItype) (ah)),					\
	     "rICal" ((USItype) (bh)),					\
	     "r" ((USItype) (al)),					\
	     "rICal" ((USItype) (bl))					\
	   : "cc")
    213 
/* Unsigned 32x32->64 multiply written as a plain C expression.  Both
   arguments are parenthesized so that compound expressions such as
   "a + b" are converted and multiplied as a whole.  */
#define __umulsidi3(u,v) ((UDItype)(USItype)(u)*(USItype)(v))
#ifdef __ARC_NORM__
/* Leading-zero count built on the norm instruction.  norm.f produces c_
   and sets the flags; when the "mi" (negative) condition holds, c_ is
   forced to -1.  The count is c_ + 1, so that case yields 0.
   NOTE(review): assumes norm returns the number of redundant sign bits,
   which would give 32 for a zero input as COUNT_LEADING_ZEROS_0 states --
   confirm against the ARC ISA reference.  */
#define count_leading_zeros(count, x) \
  do									\
    {									\
      SItype c_;							\
									\
      __asm__ ("norm.f\t%0,%1\n\tmov.mi\t%0,-1" : "=r" (c_) : "r" (x) : "cc");\
      (count) = c_ + 1;							\
    }									\
  while (0)
#define COUNT_LEADING_ZEROS_0 32
#endif /* __ARC_NORM__ */
    227 #endif /* __arc__ */
    228 
    229 #if defined (__arm__) && (defined (__thumb2__) || !defined (__thumb__)) \
    230  && W_TYPE_SIZE == 32
/* Two-word add: adds on the low words, then adc on the high words.  The
   clobber list is supplied by __CLOBBER_CC.  SL is early-clobber because
   it is written before AH and BH are read.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("adds	%1, %4, %5\n\tadc	%0, %2, %3"		\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "%r" ((USItype) (ah)),					\
	     "rI" ((USItype) (bh)),					\
	     "%r" ((USItype) (al)),					\
	     "rI" ((USItype) (bl)) __CLOBBER_CC)
/* Two-word subtract: subs on the low words, then sbc on the high words.
   Same constraints as add_ssaaaa, minus the commutative markers.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subs	%1, %4, %5\n\tsbc	%0, %2, %3"		\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "r" ((USItype) (ah)),					\
	     "rI" ((USItype) (bh)),					\
	     "r" ((USItype) (al)),					\
	     "rI" ((USItype) (bl)) __CLOBBER_CC)
    247 # if defined(__ARM_ARCH_2__) || defined(__ARM_ARCH_2A__) \
    248      || defined(__ARM_ARCH_3__)
/* 32x32->64 multiply built from four 32-bit "mul" instructions on 16-bit
   halves.  Step by step:
     %2 = a >> 16, %0 = b >> 16, %3 = low half of a, %4 = low half of b;
     %1 = alo*blo, %4 = ahi*blo, %3 = bhi*alo, %0 = ahi*bhi;
     the two cross products are summed into %3, and a carry out of that
       sum adds 0x10000 to the high word;
     the cross sum is then folded in: its low half (shifted left 16) into
       the low word, its high half plus carry into the high word.
   __t0..__t2 are scratch registers; XH, __t0 and __t1 are early-clobber
   because they are written while A and B are still needed.  */
#  define umul_ppmm(xh, xl, a, b)					\
  do {									\
    register USItype __t0, __t1, __t2;					\
    __asm__ ("%@ Inlined umul_ppmm\n"					\
	   "	mov	%2, %5, lsr #16\n"				\
	   "	mov	%0, %6, lsr #16\n"				\
	   "	bic	%3, %5, %2, lsl #16\n"				\
	   "	bic	%4, %6, %0, lsl #16\n"				\
	   "	mul	%1, %3, %4\n"					\
	   "	mul	%4, %2, %4\n"					\
	   "	mul	%3, %0, %3\n"					\
	   "	mul	%0, %2, %0\n"					\
	   "	adds	%3, %4, %3\n"					\
	   "	addcs	%0, %0, #65536\n"				\
	   "	adds	%1, %1, %3, lsl #16\n"				\
	   "	adc	%0, %0, %3, lsr #16"				\
	   : "=&r" ((USItype) (xh)),					\
	     "=r" ((USItype) (xl)),					\
	     "=&r" (__t0), "=&r" (__t1), "=r" (__t2)			\
	   : "r" ((USItype) (a)),					\
	     "r" ((USItype) (b)) __CLOBBER_CC );			\
  } while (0)
#  define UMUL_TIME 20
    272 # else
/* 32x32->64 multiply expressed in C: the widening product is formed in a
   64-bit temporary and then split, leaving instruction selection (umull)
   to the compiler.  The low word is stored before the high word.  */
#  define umul_ppmm(xh, xl, a, b)					\
  do {									\
    register UDItype __prod = (UDItype)(USItype)(a) * (USItype)(b);	\
    (xl) = (USItype)__prod;						\
    (xh) = (USItype)(__prod >> 32);					\
  } while (0)
#  define UMUL_TIME 3
    281 # endif
    282 # define UDIV_TIME 100
    283 #endif /* __arm__ */
    284 
#ifdef __arm__
/* Leave the choice of bit-scan implementation to the compiler builtins.  */
# define count_leading_zeros(COUNT,X)	((COUNT) = __builtin_clz (X))
# define count_trailing_zeros(COUNT,X)	((COUNT) = __builtin_ctz (X))
# define COUNT_LEADING_ZEROS_0 32
#endif /* __arm__ */
    291 
#if defined (__AVR__)

/* Pick the builtin whose operand width matches the configured word size:
   the plain, "l" and "ll" variants for 16, 32 and 64 bits respectively.  */
# if W_TYPE_SIZE == 16
#  define count_leading_zeros(COUNT,X)  ((COUNT) = __builtin_clz (X))
#  define count_trailing_zeros(COUNT,X) ((COUNT) = __builtin_ctz (X))
#  define COUNT_LEADING_ZEROS_0 16
# elif W_TYPE_SIZE == 32
#  define count_leading_zeros(COUNT,X)  ((COUNT) = __builtin_clzl (X))
#  define count_trailing_zeros(COUNT,X) ((COUNT) = __builtin_ctzl (X))
#  define COUNT_LEADING_ZEROS_0 32
# elif W_TYPE_SIZE == 64
#  define count_leading_zeros(COUNT,X)  ((COUNT) = __builtin_clzll (X))
#  define count_trailing_zeros(COUNT,X) ((COUNT) = __builtin_ctzll (X))
#  define COUNT_LEADING_ZEROS_0 64
# endif /* W_TYPE_SIZE */

#endif /* defined (__AVR__) */
    313 
    314 #if defined (__CRIS__)
    315 
    316 #if __CRIS_arch_version >= 3
    317 #define count_leading_zeros(COUNT, X) ((COUNT) = __builtin_clz (X))
    318 #define COUNT_LEADING_ZEROS_0 32
    319 #endif /* __CRIS_arch_version >= 3 */
    320 
    321 #if __CRIS_arch_version >= 8
    322 #define count_trailing_zeros(COUNT, X) ((COUNT) = __builtin_ctz (X))
    323 #endif /* __CRIS_arch_version >= 8 */
    324 
    325 #if __CRIS_arch_version >= 10
    326 #define __umulsidi3(u,v) ((UDItype)(USItype) (u) * (UDItype)(USItype) (v))
    327 #else
    328 #define __umulsidi3 __umulsidi3
    329 extern UDItype __umulsidi3 (USItype, USItype);
    330 #endif /* __CRIS_arch_version >= 10 */
    331 
/* Obtain the 64-bit product from __umulsidi3 and split it: W0 receives the
   low 32 bits, then W1 the high 32 bits.  */
#define umul_ppmm(w1, w0, u, v)			\
  do {						\
    UDItype __prod = __umulsidi3 (u, v);	\
    (w0) = (USItype) (__prod);			\
    (w1) = (USItype) (__prod >> 32);		\
  } while (0)
    338 
    339 /* FIXME: defining add_ssaaaa and sub_ddmmss should be advantageous for
    340    DFmode ("double" intrinsics, avoiding two of the three insns handling
    341    carry), but defining them as open-code C composing and doing the
    342    operation in DImode (UDImode) shows that the DImode needs work:
    343    register pressure from requiring neighboring registers and the
    344    traffic to and from them come to dominate, in the 4.7 series.  */
    345 
    346 #endif /* defined (__CRIS__) */
    347 
    348 #if defined (__hppa) && W_TYPE_SIZE == 32
/* Two-word add: "add" on the low words, then "addc" on the high words.
   The destination is the last operand of each instruction.  SL is
   early-clobber because it is written before AH and BH are read.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add %4,%5,%1\n\taddc %2,%3,%0"				\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "%rM" ((USItype) (ah)),					\
	     "rM" ((USItype) (bh)),					\
	     "%rM" ((USItype) (al)),					\
	     "rM" ((USItype) (bl)))
/* Two-word subtract: "sub" on the low words, then "subb" on the high
   words.  Same operand layout as add_ssaaaa.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub %4,%5,%1\n\tsubb %2,%3,%0"				\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "rM" ((USItype) (ah)),					\
	     "rM" ((USItype) (bh)),					\
	     "rM" ((USItype) (al)),					\
	     "rM" ((USItype) (bl)))
#if defined (_PA_RISC1_1)
/* 32x32->64 multiply via xmpyu, with all operands in "x"-constraint
   registers.  The 64-bit result is written into a union and read back as
   two 32-bit words; the first struct member (__w1) is taken as the high
   word and the second (__w0) as the low word.  */
#define umul_ppmm(w1, w0, u, v) \
  do {									\
    union								\
      {									\
	UDItype __f;							\
	struct {USItype __w1, __w0;} __w1w0;				\
      } __t;								\
    __asm__ ("xmpyu %1,%2,%0"						\
	     : "=x" (__t.__f)						\
	     : "x" ((USItype) (u)),					\
	       "x" ((USItype) (v)));					\
    (w1) = __t.__w1w0.__w1;						\
    (w0) = __t.__w1w0.__w0;						\
     } while (0)
#define UMUL_TIME 8
#else
/* No umul_ppmm is defined here without _PA_RISC1_1; only the cost
   estimate is provided.  */
#define UMUL_TIME 30
#endif
#define UDIV_TIME 40
/* Leading-zero count by binary search.  The result register starts at 1.
   For each field width of 16, 8, 4 and 2 bits, the upper part of the
   remaining value is tested: if it is zero the width is added to the
   count, otherwise the value is shifted down (see the per-instruction
   comments in the asm).  Finally the remaining bit 1 is subtracted.
   __tmp is a scratch copy of X, tied to it by the "1" matching
   constraint, so X itself is not modified.  */
#define count_leading_zeros(count, x) \
  do {									\
    USItype __tmp;							\
    __asm__ (								\
       "ldi		1,%0\n"						\
"	extru,=		%1,15,16,%%r0		; Bits 31..16 zero?\n"	\
"	extru,tr	%1,15,16,%1		; No.  Shift down, skip add.\n"\
"	ldo		16(%0),%0		; Yes.  Perform add.\n"	\
"	extru,=		%1,23,8,%%r0		; Bits 15..8 zero?\n"	\
"	extru,tr	%1,23,8,%1		; No.  Shift down, skip add.\n"\
"	ldo		8(%0),%0		; Yes.  Perform add.\n"	\
"	extru,=		%1,27,4,%%r0		; Bits 7..4 zero?\n"	\
"	extru,tr	%1,27,4,%1		; No.  Shift down, skip add.\n"\
"	ldo		4(%0),%0		; Yes.  Perform add.\n"	\
"	extru,=		%1,29,2,%%r0		; Bits 3..2 zero?\n"	\
"	extru,tr	%1,29,2,%1		; No.  Shift down, skip add.\n"\
"	ldo		2(%0),%0		; Yes.  Perform add.\n"	\
"	extru		%1,30,1,%1		; Extract bit 1.\n"	\
"	sub		%0,%1,%0		; Subtract it.\n"	\
	: "=r" (count), "=r" (__tmp) : "1" (x));			\
  } while (0)
    406 #endif
    407 
    408 #if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
    409 #if !defined (__zarch__)
/* Signed 32x32->64 multiply.  The 64-bit result lives in a union so that
   its halves can be read back as two words (__h first, then __l).  "lr"
   first copies M0 into %N0, then "mr" multiplies by M1.
   NOTE(review): %N0 is taken to name the second register of the pair
   holding operand 0 -- confirm against the s390 backend.  */
#define smul_ppmm(xh, xl, m0, m1) \
  do {									\
    union {DItype __ll;							\
	   struct {USItype __h, __l;} __i;				\
	  } __x;							\
    __asm__ ("lr %N0,%1\n\tmr %0,%2"					\
	     : "=&r" (__x.__ll)						\
	     : "r" (m0), "r" (m1));					\
    (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
  } while (0)
/* Signed 64/32 divide.  N1:N0 is assembled into the union, "dr" divides
   it in place by D, and afterwards the quotient is read from the __l
   half and the remainder from the __h half.  */
#define sdiv_qrnnd(q, r, n1, n0, d) \
  do {									\
    union {DItype __ll;							\
	   struct {USItype __h, __l;} __i;				\
	  } __x;							\
    __x.__i.__h = n1; __x.__i.__l = n0;					\
    __asm__ ("dr %0,%2"							\
	     : "=r" (__x.__ll)						\
	     : "0" (__x.__ll), "r" (d));				\
    (q) = __x.__i.__l; (r) = __x.__i.__h;				\
  } while (0)
    431 #else
/* Signed 32x32->64 multiply for __zarch__.  The asm template names %r0
   explicitly, so the operands are pinned to hard registers 0 and 1 with
   register-asm variables: M0 goes in via __r1, and after "mr" the high
   word is read from __r0 and the low word from __r1.  */
#define smul_ppmm(xh, xl, m0, m1) \
  do {                                                                  \
    register SItype __r0 __asm__ ("0");					\
    register SItype __r1 __asm__ ("1") = (m0);				\
									\
    __asm__ ("mr\t%%r0,%3"                                              \
	     : "=r" (__r0), "=r" (__r1)					\
	     : "r"  (__r1),  "r" (m1));					\
    (xh) = __r0; (xl) = __r1;						\
  } while (0)

/* Signed 64/32 divide for __zarch__.  N1 and N0 are loaded into hard
   registers 0 and 1; after "dr" the quotient is read from __r1 and the
   remainder from __r0.  */
#define sdiv_qrnnd(q, r, n1, n0, d) \
  do {									\
    register SItype __r0 __asm__ ("0") = (n1);				\
    register SItype __r1 __asm__ ("1") = (n0);				\
									\
    __asm__ ("dr\t%%r0,%4"                                              \
	     : "=r" (__r0), "=r" (__r1)					\
	     : "r" (__r0), "r" (__r1), "r" (d));			\
    (q) = __r1; (r) = __r0;						\
  } while (0)
    453 #endif /* __zarch__ */
    454 #endif
    455 
    456 #if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
/* Two-word add.  AH and AL are tied to the output registers (matching
   constraints "0" and "1"), so the asm adds BL into SL and then BH plus
   carry into SH.  The {att|intel} braces let the one template serve both
   assembler dialects.  SL is early-clobber because it is written before
   BH is read.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add{l} {%5,%1|%1,%5}\n\tadc{l} {%3,%0|%0,%3}"		\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "%0" ((USItype) (ah)),					\
	     "g" ((USItype) (bh)),					\
	     "%1" ((USItype) (al)),					\
	     "g" ((USItype) (bl)))
/* Two-word subtract: sub on the low words, sbb (subtract with borrow) on
   the high words.  Same operand layout as add_ssaaaa, without the
   commutative markers.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub{l} {%5,%1|%1,%5}\n\tsbb{l} {%3,%0|%0,%3}"		\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "0" ((USItype) (ah)),					\
	     "g" ((USItype) (bh)),					\
	     "1" ((USItype) (al)),					\
	     "g" ((USItype) (bl)))
/* 32x32->64 multiply with the one-operand "mul".  U is placed in the "a"
   register (tied to W0); the low word comes back in "a" (W0) and the high
   word in "d" (W1).  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mul{l} %3"							\
	   : "=a" ((USItype) (w0)),					\
	     "=d" ((USItype) (w1))					\
	   : "%0" ((USItype) (u)),					\
	     "rm" ((USItype) (v)))
/* 64/32 divide with "div".  N0 goes in via "a" and N1 via "d"; the
   quotient comes back in "a" (Q) and the remainder in "d" (R).  */
#define udiv_qrnnd(q, r, n1, n0, dv) \
  __asm__ ("div{l} %4"							\
	   : "=a" ((USItype) (q)),					\
	     "=d" ((USItype) (r))					\
	   : "0" ((USItype) (n0)),					\
	     "1" ((USItype) (n1)),					\
	     "rm" ((USItype) (dv)))
/* Bit scans use the compiler builtins.  No COUNT_LEADING_ZEROS_0 is
   defined here, so the result for a zero input is left undefined.  */
#define count_leading_zeros(count, x)	((count) = __builtin_clz (x))
#define count_trailing_zeros(count, x)	((count) = __builtin_ctz (x))
#define UMUL_TIME 40
#define UDIV_TIME 40
    490 #endif /* 80x86 */
    491 
    492 #if defined (__x86_64__) && W_TYPE_SIZE == 64
/* Two-word (128-bit) add.  AH and AL are tied to the output registers, so
   the asm adds BL into SL and then BH plus carry into SH.  The
   {att|intel} braces let the one template serve both assembler dialects.
   SL is early-clobber because it is written before BH is read.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add{q} {%5,%1|%1,%5}\n\tadc{q} {%3,%0|%0,%3}"		\
	   : "=r" ((UDItype) (sh)),					\
	     "=&r" ((UDItype) (sl))					\
	   : "%0" ((UDItype) (ah)),					\
	     "rme" ((UDItype) (bh)),					\
	     "%1" ((UDItype) (al)),					\
	     "rme" ((UDItype) (bl)))
/* Two-word subtract: sub on the low words, sbb (subtract with borrow) on
   the high words.  Same operand layout as add_ssaaaa, without the
   commutative markers.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub{q} {%5,%1|%1,%5}\n\tsbb{q} {%3,%0|%0,%3}"		\
	   : "=r" ((UDItype) (sh)),					\
	     "=&r" ((UDItype) (sl))					\
	   : "0" ((UDItype) (ah)),					\
	     "rme" ((UDItype) (bh)),					\
	     "1" ((UDItype) (al)),					\
	     "rme" ((UDItype) (bl)))
/* 64x64->128 multiply with the one-operand "mul".  U is placed in the "a"
   register (tied to W0); the low word comes back in "a" (W0) and the high
   word in "d" (W1).  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mul{q} %3"							\
	   : "=a" ((UDItype) (w0)),					\
	     "=d" ((UDItype) (w1))					\
	   : "%0" ((UDItype) (u)),					\
	     "rm" ((UDItype) (v)))
/* 128/64 divide with "div".  N0 goes in via "a" and N1 via "d"; the
   quotient comes back in "a" (Q) and the remainder in "d" (R).  */
#define udiv_qrnnd(q, r, n1, n0, dv) \
  __asm__ ("div{q} %4"							\
	   : "=a" ((UDItype) (q)),					\
	     "=d" ((UDItype) (r))					\
	   : "0" ((UDItype) (n0)),					\
	     "1" ((UDItype) (n1)),					\
	     "rm" ((UDItype) (dv)))
/* Bit scans use the 64-bit compiler builtins.  No COUNT_LEADING_ZEROS_0
   is defined here, so the result for a zero input is left undefined.  */
#define count_leading_zeros(count, x)	((count) = __builtin_clzll (x))
#define count_trailing_zeros(count, x)	((count) = __builtin_ctzll (x))
#define UMUL_TIME 40
#define UDIV_TIME 40
    526 #endif /* x86_64 */
    527 
    528 #if defined (__i960__) && W_TYPE_SIZE == 32
/* 32x32->64 multiply with "emul".  The 64-bit result is written into a
   union and read back as two words; here the first struct member is the
   low word (__l) and the second the high word (__h).  Implemented as a
   GNU statement expression.  */
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll;						\
	   struct {USItype __l, __h;} __i;				\
	  } __xx;							\
  __asm__ ("emul	%2,%1,%0"					\
	   : "=d" (__xx.__ll)						\
	   : "%dI" ((USItype) (u)),					\
	     "dI" ((USItype) (v)));					\
  (w1) = __xx.__i.__h; (w0) = __xx.__i.__l;})
/* Same "emul" multiply, but yielding the whole 64-bit product as the
   value of the statement expression.  */
#define __umulsidi3(u, v) \
  ({UDItype __w;							\
    __asm__ ("emul	%2,%1,%0"					\
	     : "=d" (__w)						\
	     : "%dI" ((USItype) (u)),					\
	       "dI" ((USItype) (v)));					\
    __w; })
    545 #endif /* __i960__ */
    546 
    547 #if defined (__ia64) && W_TYPE_SIZE == 64
/* Two-word subtract written with an explicit branch on the borrow.  This
   shape encourages gcc (pre-release 3.4 at least) to emit predicated
   "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency; the generic
   code that uses "al<bl" arithmetically materializes an actual 0 or 1 in a
   register, which takes an extra cycle.  The low difference is held in a
   temporary and stored last.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl)				\
  do {									\
    UWtype __lo_diff = (al) - (bl);					\
    if ((al) >= (bl))							\
      (sh) = (ah) - (bh);						\
    else								\
      (sh) = (ah) - (bh) - 1;						\
    (sl) = __lo_diff;							\
  } while (0)
    562 
/* Do both product parts in assembly, since that gives better code with
   all gcc versions.  Some callers will just use the upper part, and in
   that situation we waste an instruction, but not any cycles.

   xma.hu produces the high word and xma.l the low word, both with f0 as
   the addend operand.  All operands use the "f" constraint.  PH is
   early-clobber because it is written before the second instruction reads
   M0 and M1.  */
#define umul_ppmm(ph, pl, m0, m1)					\
  __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0"		\
	   : "=&f" (ph), "=f" (pl)					\
	   : "f" (m0), "f" (m1))
/* Leading-zero count in two stages.  First, mux1 @rev and czx1.l are used
   to find a byte position _a, from which _c = (_a - 1) * 8 is derived and
   _x is shifted right by _c.  NOTE(review): this assumes the shift leaves
   the most significant nonzero byte in the low 8 bits of _x -- confirm
   against the IA-64 mux1/czx1 definitions.  Second, the position of the
   top bit within that byte is added to _c with 4-, 2- and 1-bit steps.
   The result is W_TYPE_SIZE - 1 - _c.  */
#define count_leading_zeros(count, x)					\
  do {									\
    UWtype _x = (x), _y, _a, _c;					\
    __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x));		\
    __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y));		\
    _c = (_a - 1) << 3;							\
    _x >>= _c;								\
    if (_x >= 1 << 4)							\
      _x >>= 4, _c += 4;						\
    if (_x >= 1 << 2)							\
      _x >>= 2, _c += 2;						\
    _c += _x >> 1;							\
    (count) =  W_TYPE_SIZE - 1 - _c;					\
  } while (0)
/* Similar to what gcc does for __builtin_ffs, but 0 based rather than 1
   based, and we don't need a special case for x==0 here.

   (x - 1) & ~x has a one in exactly the positions below the lowest set
   bit of x, so its population count is the number of trailing zeros.  */
#define count_trailing_zeros(count, x)					\
  do {									\
    UWtype __ctz_x = (x);						\
    __asm__ ("popcnt %0 = %1"						\
	     : "=r" (count)						\
	     : "r" ((__ctz_x-1) & ~__ctz_x));				\
  } while (0)
#define UMUL_TIME 14
    594 #endif
    595 
    596 #if defined (__M32R__) && W_TYPE_SIZE == 32
/* Two-word add using addx.  AH and AL are tied to the outputs, so after
   the condition bit is cleared, BL is added into SL and BH into SH with
   addx.  The condition bit is listed as clobbered ("cbit").  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  /* The cmp clears the condition bit.  */ \
  __asm__ ("cmp %0,%0\n\taddx %1,%5\n\taddx %0,%3"			\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "0" ((USItype) (ah)),					\
	     "r" ((USItype) (bh)),					\
	     "1" ((USItype) (al)),					\
	     "r" ((USItype) (bl))					\
	   : "cbit")
/* Two-word subtract using subx; same structure as add_ssaaaa.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  /* The cmp clears the condition bit.  */ \
  __asm__ ("cmp %0,%0\n\tsubx %1,%5\n\tsubx %0,%3"			\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "0" ((USItype) (ah)),					\
	     "r" ((USItype) (bh)),					\
	     "1" ((USItype) (al)),					\
	     "r" ((USItype) (bl))					\
	   : "cbit")
    617 #endif /* __M32R__ */
    618 
    619 #if defined (__mc68000__) && W_TYPE_SIZE == 32
/* Two-word add: add.l on the low words, then addx.l on the high words.
   AH and AL are tied to the outputs.  BH is restricted to a data register
   ("d") while BL may be any general operand ("g").  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0"				\
	   : "=d" ((USItype) (sh)),					\
	     "=&d" ((USItype) (sl))					\
	   : "%0" ((USItype) (ah)),					\
	     "d" ((USItype) (bh)),					\
	     "%1" ((USItype) (al)),					\
	     "g" ((USItype) (bl)))
/* Two-word subtract: sub.l on the low words, then subx.l on the high
   words.  Same operand layout as add_ssaaaa.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0"				\
	   : "=d" ((USItype) (sh)),					\
	     "=&d" ((USItype) (sl))					\
	   : "0" ((USItype) (ah)),					\
	     "d" ((USItype) (bh)),					\
	     "1" ((USItype) (al)),					\
	     "g" ((USItype) (bl)))
    636 
/* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r.
   The '060 is excluded by the test below (it does not implement these
   64-bit forms in hardware).  */
    638 #if (defined (__mc68020__) && !defined (__mc68060__))
/* 32x32->64 multiply with mulu.l and a register pair destination
   (%1:%0).  U is tied to W0; the low word comes back in W0 and the high
   word in W1.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mulu%.l %3,%1:%0"						\
	   : "=d" ((USItype) (w0)),					\
	     "=d" ((USItype) (w1))					\
	   : "%0" ((USItype) (u)),					\
	     "dmi" ((USItype) (v)))
#define UMUL_TIME 45
/* Unsigned 64/32 divide with divu.l.  N0 is tied to Q and N1 to R on
   input; on output Q holds the quotient and R the remainder.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("divu%.l %4,%1:%0"						\
	   : "=d" ((USItype) (q)),					\
	     "=d" ((USItype) (r))					\
	   : "0" ((USItype) (n0)),					\
	     "1" ((USItype) (n1)),					\
	     "dmi" ((USItype) (d)))
#define UDIV_TIME 90
/* Signed counterpart of udiv_qrnnd, using divs.l with the same operand
   layout.  */
#define sdiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("divs%.l %4,%1:%0"						\
	   : "=d" ((USItype) (q)),					\
	     "=d" ((USItype) (r))					\
	   : "0" ((USItype) (n0)),					\
	     "1" ((USItype) (n1)),					\
	     "dmi" ((USItype) (d)))
    661 
    662 #elif defined (__mcoldfire__) /* not mc68020 */
    663 
/* 32x32->64 multiply for ColdFire, built from four 16x16 "mulu" partial
   products with d0-d4 as scratch (all listed as clobbered).
     d0/d1 receive A and B, and the swaps bring their high halves into the
       low word; d2/d3 keep copies with the low halves in the low word;
     d4 = alo*blo, d2 = alo*bhi, d3 = ahi*blo, d1 = ahi*bhi;
     the upper half of d4 and the cross product d3 are added into d2; a
       carry out of the last add adds 65536 to d1;
     XL is assembled from the low half of d2 (upper word) and the low half
       of d4 (lower word); XH is d1 plus the high half of d2.
   A and B are read only by the first two instructions, before either
   output is written.  */
#define umul_ppmm(xh, xl, a, b) \
  __asm__ ("| Inlined umul_ppmm\n"					\
	   "	move%.l	%2,%/d0\n"					\
	   "	move%.l	%3,%/d1\n"					\
	   "	move%.l	%/d0,%/d2\n"					\
	   "	swap	%/d0\n"						\
	   "	move%.l	%/d1,%/d3\n"					\
	   "	swap	%/d1\n"						\
	   "	move%.w	%/d2,%/d4\n"					\
	   "	mulu	%/d3,%/d4\n"					\
	   "	mulu	%/d1,%/d2\n"					\
	   "	mulu	%/d0,%/d3\n"					\
	   "	mulu	%/d0,%/d1\n"					\
	   "	move%.l	%/d4,%/d0\n"					\
	   "	clr%.w	%/d0\n"						\
	   "	swap	%/d0\n"						\
	   "	add%.l	%/d0,%/d2\n"					\
	   "	add%.l	%/d3,%/d2\n"					\
	   "	jcc	1f\n"						\
	   "	add%.l	%#65536,%/d1\n"					\
	   "1:	swap	%/d2\n"						\
	   "	moveq	%#0,%/d0\n"					\
	   "	move%.w	%/d2,%/d0\n"					\
	   "	move%.w	%/d4,%/d2\n"					\
	   "	move%.l	%/d2,%1\n"					\
	   "	add%.l	%/d1,%/d0\n"					\
	   "	move%.l	%/d0,%0"					\
	   : "=g" ((USItype) (xh)),					\
	     "=g" ((USItype) (xl))					\
	   : "g" ((USItype) (a)),					\
	     "g" ((USItype) (b))					\
	   : "d0", "d1", "d2", "d3", "d4")
#define UMUL_TIME 100
#define UDIV_TIME 400
    698 #else /* not ColdFire */
/* %/ inserts REGISTER_PREFIX, %# inserts IMMEDIATE_PREFIX.

   32x32->64 multiply for the plain 68000, built from four 16x16 "mulu"
   partial products with d0-d4 as scratch (all listed as clobbered).  The
   sequence matches the ColdFire variant above, except that the low word
   of d0 is cleared with eor.w instead of clr.w.
     d4 = alo*blo, d2 = alo*bhi, d3 = ahi*blo, d1 = ahi*bhi;
     the upper half of d4 and the cross product d3 are added into d2; a
       carry out of the last add adds 65536 to d1;
     XL is assembled from the low half of d2 (upper word) and the low half
       of d4 (lower word); XH is d1 plus the high half of d2.  */
#define umul_ppmm(xh, xl, a, b) \
  __asm__ ("| Inlined umul_ppmm\n"					\
	   "	move%.l	%2,%/d0\n"					\
	   "	move%.l	%3,%/d1\n"					\
	   "	move%.l	%/d0,%/d2\n"					\
	   "	swap	%/d0\n"						\
	   "	move%.l	%/d1,%/d3\n"					\
	   "	swap	%/d1\n"						\
	   "	move%.w	%/d2,%/d4\n"					\
	   "	mulu	%/d3,%/d4\n"					\
	   "	mulu	%/d1,%/d2\n"					\
	   "	mulu	%/d0,%/d3\n"					\
	   "	mulu	%/d0,%/d1\n"					\
	   "	move%.l	%/d4,%/d0\n"					\
	   "	eor%.w	%/d0,%/d0\n"					\
	   "	swap	%/d0\n"						\
	   "	add%.l	%/d0,%/d2\n"					\
	   "	add%.l	%/d3,%/d2\n"					\
	   "	jcc	1f\n"						\
	   "	add%.l	%#65536,%/d1\n"					\
	   "1:	swap	%/d2\n"						\
	   "	moveq	%#0,%/d0\n"					\
	   "	move%.w	%/d2,%/d0\n"					\
	   "	move%.w	%/d4,%/d2\n"					\
	   "	move%.l	%/d2,%1\n"					\
	   "	add%.l	%/d1,%/d0\n"					\
	   "	move%.l	%/d0,%0"					\
	   : "=g" ((USItype) (xh)),					\
	     "=g" ((USItype) (xl))					\
	   : "g" ((USItype) (a)),					\
	     "g" ((USItype) (b))					\
	   : "d0", "d1", "d2", "d3", "d4")
#define UMUL_TIME 100
#define UDIV_TIME 400
    734 
    735 #endif /* not mc68020 */
    736 
    737 /* The '020, '030, '040 and '060 have bitfield insns.
    738    cpu32 disguises as a 68020, but lacks them.  */
    739 #if defined (__mc68020__) && !defined (__mcpu32__)
    /* count_leading_zeros (count, x): store in COUNT (a data register) the
       result of the bit-field instruction "bfffo" applied to X.  Operand %2
       is the constant 0 and is used for both bit-field specifiers.
       NOTE(review): a zero width presumably selects the whole 32-bit
       operand -- confirm against the 68020 manual.  */
    740 #define count_leading_zeros(count, x) \
    741   __asm__ ("bfffo %1{%b2:%b2},%0"					\
    742 	   : "=d" ((USItype) (count))					\
    743 	   : "od" ((USItype) (x)), "n" (0))
    744 /* Some ColdFire architectures have a ff1 instruction supported via
    745    __builtin_clz. */
    746 #elif defined (__mcfisaaplus__) || defined (__mcfisac__)
    747 #define count_leading_zeros(count,x) ((count) = __builtin_clz (x))
    748 #define COUNT_LEADING_ZEROS_0 32
    749 #endif
    750 #endif /* mc68000 */
    751 
    752 #if defined (__m88000__) && W_TYPE_SIZE == 32
    /* add_ssaaaa (sh, sl, ah, al, bh, bl): two-word addition.  The first
       instruction (addu.co) adds AL and BL into SL, the second (addu.ci)
       adds AH and BH into SH.  NOTE(review): .co/.ci presumably mean
       carry-out / carry-in -- confirm against the m88k manual.  SL is an
       earlyclobber output because it is written before AH/BH are read; the
       '%' modifier marks each operand pair as interchangeable.  */
    753 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
    754   __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3"			\
    755 	   : "=r" ((USItype) (sh)),					\
    756 	     "=&r" ((USItype) (sl))					\
    757 	   : "%rJ" ((USItype) (ah)),					\
    758 	     "rJ" ((USItype) (bh)),					\
    759 	     "%rJ" ((USItype) (al)),					\
    760 	     "rJ" ((USItype) (bl)))
    /* sub_ddmmss (sh, sl, ah, al, bh, bl): two-word subtraction.  The first
       instruction (subu.co) computes AL - BL into SL, the second (subu.ci)
       computes AH - BH into SH.  NOTE(review): .co/.ci presumably mean
       carry-out / carry-in -- confirm.  Unlike add_ssaaaa, no operand is
       marked commutative.  SL is earlyclobber because it is written before
       AH/BH are read.  */
    761 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
    762   __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3"			\
    763 	   : "=r" ((USItype) (sh)),					\
    764 	     "=&r" ((USItype) (sl))					\
    765 	   : "rJ" ((USItype) (ah)),					\
    766 	     "rJ" ((USItype) (bh)),					\
    767 	     "rJ" ((USItype) (al)),					\
    768 	     "rJ" ((USItype) (bl)))
    /* count_leading_zeros (count, x): run the "ff1" instruction on X into the
       temporary __cbtmp and store __cbtmp XOR 31 in COUNT.
       NOTE(review): ff1 presumably yields the index of the most significant
       set bit, so the XOR equals 31 - index -- confirm.  */
    769 #define count_leading_zeros(count, x) \
    770   do {									\
    771     USItype __cbtmp;							\
    772     __asm__ ("ff1 %0,%1"						\
    773 	     : "=r" (__cbtmp)						\
    774 	     : "r" ((USItype) (x)));					\
    775     (count) = __cbtmp ^ 31;						\
    776   } while (0)
    777 #define COUNT_LEADING_ZEROS_0 63 /* sic */
    778 #if defined (__mc88110__)
    /* umul_ppmm (wh, wl, u, v): "mulu.d" writes a UDItype result from the
       USItype operands U and V.  The union overlays that value with a pair
       of USItype members; WH is taken from the first member (__h) and WL
       from the second (__l).  NOTE(review): this member order presumably
       relies on a big-endian word layout -- confirm.  */
    779 #define umul_ppmm(wh, wl, u, v) \
    780   do {									\
    781     union {UDItype __ll;						\
    782 	   struct {USItype __h, __l;} __i;				\
    783 	  } __xx;							\
    784     __asm__ ("mulu.d	%0,%1,%2"					\
    785 	     : "=r" (__xx.__ll)						\
    786 	     : "r" ((USItype) (u)),					\
    787 	       "r" ((USItype) (v)));					\
    788     (wh) = __xx.__i.__h;						\
    789     (wl) = __xx.__i.__l;						\
    790   } while (0)
    /* udiv_qrnnd (q, r, n1, n0, d): the union packs N1 (member __h) and N0
       (member __l) into one UDItype, which "divu.d" divides by D to give the
       quotient __q.  The remainder is then computed in C as
       N0 - __q * D and stored in R; Q receives __q.  Written as a GNU
       statement expression; N0 and D are each evaluated twice.  */
    791 #define udiv_qrnnd(q, r, n1, n0, d) \
    792   ({union {UDItype __ll;						\
    793 	   struct {USItype __h, __l;} __i;				\
    794 	  } __xx;							\
    795   USItype __q;								\
    796   __xx.__i.__h = (n1); __xx.__i.__l = (n0);				\
    797   __asm__ ("divu.d %0,%1,%2"						\
    798 	   : "=r" (__q)							\
    799 	   : "r" (__xx.__ll),						\
    800 	     "r" ((USItype) (d)));					\
    801   (r) = (n0) - __q * (d); (q) = __q; })
    802 #define UMUL_TIME 5
    803 #define UDIV_TIME 25
    804 #else
    805 #define UMUL_TIME 17
    806 #define UDIV_TIME 150
    807 #endif /* __mc88110__ */
    808 #endif /* __m88000__ */
    809 
    810 #if defined (__mn10300__)
    811 # if defined (__AM33__)
    812 #  define count_leading_zeros(COUNT,X)	((COUNT) = __builtin_clz (X))
    /* umul_ppmm (w1, w0, u, v): a single AM33 "mulu" instruction.  Operand %0
       is W0 and operand %1 is W1; both are outputs, computed from U (%2)
       and V (%3), all in general registers.  */
    813 #  define umul_ppmm(w1, w0, u, v)		\
    814     asm("mulu %3,%2,%1,%0" : "=r"(w0), "=r"(w1) : "r"(u), "r"(v))
    /* smul_ppmm (w1, w0, u, v): same operand layout as the AM33 umul_ppmm
       but using the "mul" instruction (NOTE(review): presumably the signed
       multiply -- confirm).  %0 is W0, %1 is W1.  */
    815 #  define smul_ppmm(w1, w0, u, v)		\
    816     asm("mul %3,%2,%1,%0" : "=r"(w0), "=r"(w1) : "r"(u), "r"(v))
    817 # else
    /* umul_ppmm (w1, w0, u, v): non-AM33 variant.  U is tied to the W0 output
       register (constraint "0", commutative with V), V lives in a "d"
       register, and W1 is returned through the register class "z".  The
       template issues two nops before "mulu".  */
    818 #  define umul_ppmm(w1, w0, u, v)		\
    819     asm("nop; nop; mulu %3,%0" : "=d"(w0), "=z"(w1) : "%0"(u), "d"(v))
    /* smul_ppmm (w1, w0, u, v): non-AM33 variant using "mul"; operand layout
       is identical to the non-AM33 umul_ppmm (U tied to W0, W1 in class
       "z", two leading nops).  */
    820 #  define smul_ppmm(w1, w0, u, v)		\
    821     asm("nop; nop; mul %3,%0" : "=d"(w0), "=z"(w1) : "%0"(u), "d"(v))
    822 # endif
    823 # define add_ssaaaa(sh, sl, ah, al, bh, bl)	\
    824   do {						\
    825     DWunion __s, __a, __b;			\
    826     __a.s.low = (al); __a.s.high = (ah);	\
    827     __b.s.low = (bl); __b.s.high = (bh);	\
    828     __s.ll = __a.ll + __b.ll;			\
    829     (sl) = __s.s.low; (sh) = __s.s.high;	\
    830   } while (0)
    831 # define sub_ddmmss(sh, sl, ah, al, bh, bl)	\
    832   do {						\
    833     DWunion __s, __a, __b;			\
    834     __a.s.low = (al); __a.s.high = (ah);	\
    835     __b.s.low = (bl); __b.s.high = (bh);	\
    836     __s.ll = __a.ll - __b.ll;			\
    837     (sl) = __s.s.low; (sh) = __s.s.high;	\
    838   } while (0)
    /* udiv_qrnnd (q, r, nh, nl, d): a single "divu" instruction.  NL is tied
       to the Q output register and NH to the R output register (class "z"),
       so the dividend words enter in the registers that receive the
       results; D is operand %2.  */
    839 # define udiv_qrnnd(q, r, nh, nl, d)		\
    840   asm("divu %2,%0" : "=D"(q), "=z"(r) : "D"(d), "0"(nl), "1"(nh))
    /* sdiv_qrnnd (q, r, nh, nl, d): same operand layout as udiv_qrnnd above
       it (NL tied to Q, NH tied to R) but using the "div" instruction.  */
    841 # define sdiv_qrnnd(q, r, nh, nl, d)		\
    842   asm("div %2,%0" : "=D"(q), "=z"(r) : "D"(d), "0"(nl), "1"(nh))
    843 # define UMUL_TIME 3
    844 # define UDIV_TIME 38
    845 #endif
    846 
    847 #if defined (__mips__) && W_TYPE_SIZE == 32
    848 #define umul_ppmm(w1, w0, u, v)						\
    849   do {									\
    850     UDItype __x = (UDItype) (USItype) (u) * (USItype) (v);		\
    851     (w1) = (USItype) (__x >> 32);					\
    852     (w0) = (USItype) (__x);						\
    853   } while (0)
    854 #define UMUL_TIME 10
    855 #define UDIV_TIME 100
    856 
    857 #if (__mips == 32 || __mips == 64) && ! defined (__mips16)
    858 #define count_leading_zeros(COUNT,X)	((COUNT) = __builtin_clz (X))
    859 #define COUNT_LEADING_ZEROS_0 32
    860 #endif
    861 #endif /* __mips__ */
    862 
    863 /* FIXME: We should test _IBMR2 here when we add assembly support for the
    864    system vendor compilers.
    865    FIXME: What's needed for gcc PowerPC VxWorks?  __vxworks__ is not good
    866    enough, since that hits ARM and m68k too.  */
    867 #if (defined (_ARCH_PPC)	/* AIX */				\
    868      || defined (__powerpc__)	/* gcc */				\
    869      || defined (__POWERPC__)	/* BEOS */				\
    870      || defined (__ppc__)	/* Darwin */				\
    871      || (defined (PPC) && ! defined (CPU_FAMILY)) /* gcc 2.7.x GNU&SysV */    \
    872      || (defined (PPC) && defined (CPU_FAMILY)    /* VxWorks */               \
    873 	 && CPU_FAMILY == PPC)                                                \
    874      ) && W_TYPE_SIZE == 32
    /* add_ssaaaa (sh, sl, ah, al, bh, bl): two-word addition with three
       compile-time variants.  The low words are always added with a
       carry-setting add (the %I modifier is applied to the BL operand, which
       may satisfy the "I" constraint).  The high word is then formed with
       "addze" when BH is the constant 0, with "addme" when BH is the
       constant ~0, and with "adde" on AH and BH otherwise.  SL is
       earlyclobber because it is written before AH/BH are read.  */
    875 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
    876   do {									\
    877     if (__builtin_constant_p (bh) && (bh) == 0)				\
    878       __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2"		\
    879 	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    880     else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)		\
    881       __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2"		\
    882 	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    883     else								\
    884       __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3"		\
    885 	     : "=r" (sh), "=&r" (sl)					\
    886 	     : "%r" (ah), "r" (bh), "%r" (al), "rI" (bl));		\
    887   } while (0)
    /* sub_ddmmss (sh, sl, ah, al, bh, bl): two-word subtraction with five
       compile-time variants.  The low word is always AL - BL via a
       carry-setting "subf" (operands reversed in the template because subf
       subtracts its first source from its second).  The high word uses:
         - subfze on BH when AH is the constant 0,
         - subfme on BH when AH is the constant ~0,
         - addme on AH when BH is the constant 0,
         - addze on AH when BH is the constant ~0,
         - subfe on BH and AH otherwise.
       SL is earlyclobber because it is written before the high-word inputs
       are read.  */
    888 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
    889   do {									\
    890     if (__builtin_constant_p (ah) && (ah) == 0)				\
    891       __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2"	\
    892 	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    893     else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0)		\
    894       __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2"	\
    895 	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    896     else if (__builtin_constant_p (bh) && (bh) == 0)			\
    897       __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2"		\
    898 	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    899     else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)		\
    900       __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2"		\
    901 	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    902     else								\
    903       __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2"	\
    904 	       : "=r" (sh), "=&r" (sl)					\
    905 	       : "r" (ah), "r" (bh), "rI" (al), "r" (bl));		\
    906   } while (0)
    /* count_leading_zeros (count, x): store in COUNT the result of the
       "cntlzw" instruction applied to X (both in general registers).  */
    907 #define count_leading_zeros(count, x) \
    908   __asm__ ("cntlzw %0,%1" : "=r" (count) : "r" (x))
    909 #define COUNT_LEADING_ZEROS_0 32
    910 #if defined (_ARCH_PPC) || defined (__powerpc__) || defined (__POWERPC__) \
    911   || defined (__ppc__)                                                    \
    912   || (defined (PPC) && ! defined (CPU_FAMILY)) /* gcc 2.7.x GNU&SysV */       \
    913   || (defined (PPC) && defined (CPU_FAMILY)    /* VxWorks */                  \
    914 	 && CPU_FAMILY == PPC)
    915 #define umul_ppmm(ph, pl, m0, m1) \
    916   do {									\
    917     USItype __m0 = (m0), __m1 = (m1);					\
    918     __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1));	\
    919     (pl) = __m0 * __m1;							\
    920   } while (0)
    921 #define UMUL_TIME 15
    922 #define smul_ppmm(ph, pl, m0, m1) \
    923   do {									\
    924     SItype __m0 = (m0), __m1 = (m1);					\
    925     __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1));	\
    926     (pl) = __m0 * __m1;							\
    927   } while (0)
    928 #define SMUL_TIME 14
    929 #define UDIV_TIME 120
    930 #endif
    931 #endif /* 32-bit POWER architecture variants.  */
    932 
    933 /* We should test _IBMR2 here when we add assembly support for the system
    934    vendor compilers.  */
    935 #if (defined (_ARCH_PPC64) || defined (__powerpc64__)) && W_TYPE_SIZE == 64
    /* add_ssaaaa (sh, sl, ah, al, bh, bl): two-word addition with three
       compile-time variants, mirroring the 32-bit definition but comparing
       BH against UDItype constants.  The low words are added with a
       carry-setting add; the high word is formed with "addze" when BH is
       the constant 0, "addme" when BH is the constant ~0, and "adde" on AH
       and BH otherwise.  SL is earlyclobber because it is written before
       AH/BH are read.  */
    936 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
    937   do {									\
    938     if (__builtin_constant_p (bh) && (bh) == 0)				\
    939       __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2"		\
    940 	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    941     else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)		\
    942       __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2"		\
    943 	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    944     else								\
    945       __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3"		\
    946 	     : "=r" (sh), "=&r" (sl)					\
    947 	     : "%r" (ah), "r" (bh), "%r" (al), "rI" (bl));		\
    948   } while (0)
    /* sub_ddmmss (sh, sl, ah, al, bh, bl): two-word subtraction with five
       compile-time variants, mirroring the 32-bit definition but comparing
       against UDItype constants.  The low word is always AL - BL via a
       carry-setting "subf".  The high word uses subfze / subfme on BH when
       AH is the constant 0 / ~0, addme / addze on AH when BH is the
       constant 0 / ~0, and subfe on BH and AH otherwise.  SL is
       earlyclobber because it is written before the high-word inputs are
       read.  */
    949 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
    950   do {									\
    951     if (__builtin_constant_p (ah) && (ah) == 0)				\
    952       __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2"	\
    953 	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    954     else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)		\
    955       __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2"	\
    956 	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    957     else if (__builtin_constant_p (bh) && (bh) == 0)			\
    958       __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2"		\
    959 	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    960     else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)		\
    961       __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2"		\
    962 	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    963     else								\
    964       __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2"	\
    965 	       : "=r" (sh), "=&r" (sl)					\
    966 	       : "r" (ah), "r" (bh), "rI" (al), "r" (bl));		\
    967   } while (0)
    /* count_leading_zeros (count, x): store in COUNT the result of the
       "cntlzd" instruction applied to X (both in general registers).  */
    968 #define count_leading_zeros(count, x) \
    969   __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
    970 #define COUNT_LEADING_ZEROS_0 64
    971 #define umul_ppmm(ph, pl, m0, m1) \
    972   do {									\
    973     UDItype __m0 = (m0), __m1 = (m1);					\
    974     __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1));	\
    975     (pl) = __m0 * __m1;							\
    976   } while (0)
    977 #define UMUL_TIME 15
    978 #define smul_ppmm(ph, pl, m0, m1) \
    979   do {									\
    980     DItype __m0 = (m0), __m1 = (m1);					\
    981     __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1));	\
    982     (pl) = __m0 * __m1;							\
    983   } while (0)
    984 #define SMUL_TIME 14  /* ??? */
    985 #define UDIV_TIME 120 /* ??? */
    986 #endif /* 64-bit PowerPC.  */
    987 
    988 #if defined (__ibm032__) /* RT/ROMP */ && W_TYPE_SIZE == 32
    /* add_ssaaaa (sh, sl, ah, al, bh, bl): two-word addition in two
       instructions: "a" adds BL (%5) into SL, then "ae" adds BH (%3) into
       SH.  AH and AL are tied to the SH and SL output registers (matching
       constraints "0" and "1", marked commutative with '%').
       NOTE(review): "ae" is presumably add-with-carry -- confirm.  */
    989 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
    990   __asm__ ("a %1,%5\n\tae %0,%3"					\
    991 	   : "=r" ((USItype) (sh)),					\
    992 	     "=&r" ((USItype) (sl))					\
    993 	   : "%0" ((USItype) (ah)),					\
    994 	     "r" ((USItype) (bh)),					\
    995 	     "%1" ((USItype) (al)),					\
    996 	     "r" ((USItype) (bl)))
    /* sub_ddmmss (sh, sl, ah, al, bh, bl): two-word subtraction in two
       instructions: "s" subtracts BL (%5) from SL, then "se" subtracts BH
       (%3) from SH.  AH and AL are tied to the SH and SL output registers
       (matching constraints "0" and "1"; not commutative).
       NOTE(review): "se" is presumably subtract-with-borrow -- confirm.  */
    997 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
    998   __asm__ ("s %1,%5\n\tse %0,%3"					\
    999 	   : "=r" ((USItype) (sh)),					\
   1000 	     "=&r" ((USItype) (sl))					\
   1001 	   : "0" ((USItype) (ah)),					\
   1002 	     "r" ((USItype) (bh)),					\
   1003 	     "1" ((USItype) (al)),					\
   1004 	     "r" ((USItype) (bl)))
   /* umul_ppmm (ph, pl, m0, m1): word multiply built from multiply steps.
      The asm clears r2 ("s r2,r2"), moves __m0 into r10 with "mts", issues
      sixteen "m r2,%3" steps against __m1, then copies r2 to PH ("cas") and
      reads r10 into PL ("mfs").  r2 is declared clobbered.
      Afterwards PH is adjusted in C: __m1 is added when the top bit of __m0
      is set, and __m0 is added when the top bit of __m1 is set.
      NOTE(review): this presumably converts a signed product's high word
      into the unsigned one -- confirm.  */
   1005 #define umul_ppmm(ph, pl, m0, m1) \
   1006   do {									\
   1007     USItype __m0 = (m0), __m1 = (m1);					\
   1008     __asm__ (								\
   1009        "s	r2,r2\n"						\
   1010 "	mts	r10,%2\n"						\
   1011 "	m	r2,%3\n"						\
   1012 "	m	r2,%3\n"						\
   1013 "	m	r2,%3\n"						\
   1014 "	m	r2,%3\n"						\
   1015 "	m	r2,%3\n"						\
   1016 "	m	r2,%3\n"						\
   1017 "	m	r2,%3\n"						\
   1018 "	m	r2,%3\n"						\
   1019 "	m	r2,%3\n"						\
   1020 "	m	r2,%3\n"						\
   1021 "	m	r2,%3\n"						\
   1022 "	m	r2,%3\n"						\
   1023 "	m	r2,%3\n"						\
   1024 "	m	r2,%3\n"						\
   1025 "	m	r2,%3\n"						\
   1026 "	m	r2,%3\n"						\
   1027 "	cas	%0,r2,r0\n"						\
   1028 "	mfs	r10,%1"							\
   1029 	     : "=r" ((USItype) (ph)),					\
   1030 	       "=r" ((USItype) (pl))					\
   1031 	     : "%r" (__m0),						\
   1032 		"r" (__m1)						\
   1033 	     : "r2");							\
   1034     (ph) += ((((SItype) __m0 >> 31) & __m1)				\
   1035 	     + (((SItype) __m1 >> 31) & __m0));				\
   1036   } while (0)
   1037 #define UMUL_TIME 20
   1038 #define UDIV_TIME 200
   /* count_leading_zeros (count, x): the "clz" instruction is applied to a
      16-bit-wide slice of X.  When X >= 0x10000 the upper half (X >> 16) is
      passed and the result is stored directly; otherwise X itself is
      passed and 16 is added to the result.  X is evaluated more than once
      and the range test is done on X without a cast.  */
   1039 #define count_leading_zeros(count, x) \
   1040   do {									\
   1041     if ((x) >= 0x10000)							\
   1042       __asm__ ("clz	%0,%1"						\
   1043 	       : "=r" ((USItype) (count))				\
   1044 	       : "r" ((USItype) (x) >> 16));				\
   1045     else								\
   1046       {									\
   1047 	__asm__ ("clz	%0,%1"						\
   1048 		 : "=r" ((USItype) (count))				\
   1049 		 : "r" ((USItype) (x)));					\
   1050 	(count) += 16;							\
   1051       }									\
   1052   } while (0)
   1053 #endif
   1054 
   1055 #if defined(__riscv)
   1056 #ifdef __riscv_mul
   1057 #define __umulsidi3(u,v) ((UDWtype)(UWtype)(u) * (UWtype)(v))
   1058 #define __muluw3(a, b) ((UWtype)(a) * (UWtype)(b))
   1059 #else
   1060 #if __riscv_xlen == 32
   1061   #define MULUW3 "call __mulsi3"
   1062 #elif __riscv_xlen == 64
   1063   #define MULUW3 "call __muldi3"
   1064 #else
   1065 #error unsupport xlen
   1066 #endif /* __riscv_xlen */
   1067 /* We rely on the fact that MULUW3 doesn't clobber the t-registers,
   1068    which allows better register allocation.  */
   /* __muluw3 (a, b): word multiply through the out-of-line routine named by
      MULUW3.  A and B are pinned to registers a0 and a1 (both read-write
      operands of the asm); the value left in a0 is the result of the
      statement expression.  ra, a2 and a3 are declared clobbered.  */
   1069 #define __muluw3(a, b) \
   1070   ({ \
   1071     register UWtype __op0 asm ("a0") = a; \
   1072     register UWtype __op1 asm ("a1") = b; \
   1073     asm volatile (MULUW3 \
   1074                   : "+r" (__op0), "+r" (__op1) \
   1075                   : \
   1076                   : "ra", "a2", "a3"); \
   1077     __op0; \
   1078   })
   1079 #endif /* __riscv_mul */
   1080 #define umul_ppmm(w1, w0, u, v) \
   1081   do { \
   1082     UWtype __x0, __x1, __x2, __x3; \
   1083     UHWtype __ul, __vl, __uh, __vh; \
   1084  \
   1085     __ul = __ll_lowpart (u); \
   1086     __uh = __ll_highpart (u); \
   1087     __vl = __ll_lowpart (v); \
   1088     __vh = __ll_highpart (v); \
   1089  \
   1090     __x0 = __muluw3 (__ul, __vl); \
   1091     __x1 = __muluw3 (__ul, __vh); \
   1092     __x2 = __muluw3 (__uh, __vl); \
   1093     __x3 = __muluw3 (__uh, __vh); \
   1094  \
   1095     __x1 += __ll_highpart (__x0);/* this can't give carry */ \
   1096     __x1 += __x2; /* but this indeed can */ \
   1097     if (__x1 < __x2) /* did we get it? */ \
   1098       __x3 += __ll_B; /* yes, add it in the proper pos.  */ \
   1099  \
   1100     (w1) = __x3 + __ll_highpart (__x1); \
   1101     (w0) = __ll_lowpart (__x1) * __ll_B + __ll_lowpart (__x0); \
   1102   } while (0)
   1103 #endif /* __riscv */
   1104 
   1105 #if defined(__sh__) && W_TYPE_SIZE == 32
   1106 #ifndef __sh1__
   /* umul_ppmm (w1, w0, u, v): "dmulu.l" multiplies U and V; the result is
      then copied out of the macl register into W0 and out of mach into W1
      with "sts".  macl and mach are declared clobbered.  The "<" in the
      output constraints additionally allows autodecrement memory operands
      (hence the %M modifiers on the sts mnemonics).  */
   1107 #define umul_ppmm(w1, w0, u, v) \
   1108   __asm__ (								\
   1109        "dmulu.l	%2,%3\n\tsts%M1	macl,%1\n\tsts%M0	mach,%0"	\
   1110 	   : "=r<" ((USItype)(w1)),					\
   1111 	     "=r<" ((USItype)(w0))					\
   1112 	   : "r" ((USItype)(u)),					\
   1113 	     "r" ((USItype)(v))						\
   1114 	   : "macl", "mach")
   1115 #define UMUL_TIME 5
   1116 #endif
   1117 
   1118 /* This is the same algorithm as __udiv_qrnnd_c.  */
   1119 #define UDIV_NEEDS_NORMALIZATION 1
   1120 
   1121 #ifdef __FDPIC__
   1122 /* FDPIC needs a special version of the asm fragment to extract the
   1123    code address from the function descriptor. __udiv_qrnnd_16 is
   1124    assumed to be local and not to use the GOT, so loading r12 is
   1125    not needed. */
   /* udiv_qrnnd (q, r, n1, n0, d): FDPIC variant.  The asm copies D to r5,
      prepares r4/r6 with swap.w/shll16, and calls the hidden helper
      __udiv_qrnnd_16 twice.  Operand %5 is the address of the helper's
      function descriptor, so before each call the code address is loaded
      from it into r2 ("mov.l @%5,r2") and the call is made with
      "jsr @r2".  The two partial results left in r1 are merged into Q
      (swap.w, or); R is tied to N1 through the "z" register class.
      r1, r2, r4, r5, r6, pr and the T bit are declared clobbered.  */
   1126 #define udiv_qrnnd(q, r, n1, n0, d) \
   1127   do {									\
   1128     extern UWtype __udiv_qrnnd_16 (UWtype, UWtype)			\
   1129 			__attribute__ ((visibility ("hidden")));	\
   1130     /* r0: rn r1: qn */ /* r0: n1 r4: n0 r5: d r6: d1 */ /* r2: __m */	\
   1131     __asm__ (								\
   1132 	"mov%M4	%4,r5\n"						\
   1133 "	swap.w	%3,r4\n"						\
   1134 "	swap.w	r5,r6\n"						\
   1135 "	mov.l	@%5,r2\n"						\
   1136 "	jsr	@r2\n"							\
   1137 "	shll16	r6\n"							\
   1138 "	swap.w	r4,r4\n"						\
   1139 "	mov.l	@%5,r2\n"						\
   1140 "	jsr	@r2\n"							\
   1141 "	swap.w	r1,%0\n"						\
   1142 "	or	r1,%0"							\
   1143 	: "=r" (q), "=&z" (r)						\
   1144 	: "1" (n1), "r" (n0), "rm" (d), "r" (&__udiv_qrnnd_16)		\
   1145 	: "r1", "r2", "r4", "r5", "r6", "pr", "t");			\
   1146   } while (0)
   1147 #else
   /* udiv_qrnnd (q, r, n1, n0, d): non-FDPIC variant.  The asm copies D to
      r5, prepares r4/r6 with swap.w/shll16, and calls the hidden helper
      __udiv_qrnnd_16 twice, jumping directly through operand %5 (the
      helper's address).  The two partial results left in r1 are merged
      into Q (swap.w, or); R is tied to N1 through the "z" register class.
      r1, r2, r4, r5, r6, pr and the T bit are declared clobbered.  */
   1148 #define udiv_qrnnd(q, r, n1, n0, d) \
   1149   do {									\
   1150     extern UWtype __udiv_qrnnd_16 (UWtype, UWtype)			\
   1151 			__attribute__ ((visibility ("hidden")));	\
   1152     /* r0: rn r1: qn */ /* r0: n1 r4: n0 r5: d r6: d1 */ /* r2: __m */	\
   1153     __asm__ (								\
   1154 	"mov%M4 %4,r5\n"						\
   1155 "	swap.w %3,r4\n"							\
   1156 "	swap.w r5,r6\n"							\
   1157 "	jsr @%5\n"							\
   1158 "	shll16 r6\n"							\
   1159 "	swap.w r4,r4\n"							\
   1160 "	jsr @%5\n"							\
   1161 "	swap.w r1,%0\n"							\
   1162 "	or r1,%0"							\
   1163 	: "=r" (q), "=&z" (r)						\
   1164 	: "1" (n1), "r" (n0), "rm" (d), "r" (&__udiv_qrnnd_16)		\
   1165 	: "r1", "r2", "r4", "r5", "r6", "pr", "t");			\
   1166   } while (0)
   1167 #endif /* __FDPIC__  */
   1168 
   1169 #define UDIV_TIME 80
   1170 
   1171 #define sub_ddmmss(sh, sl, ah, al, bh, bl)				\
   1172   __asm__ ("clrt;subc %5,%1; subc %4,%0"				\
   1173 	   : "=r" (sh), "=r" (sl)					\
   1174 	   : "0" (ah), "1" (al), "r" (bh), "r" (bl) : "t")
   1175 
   1176 #endif /* __sh__ */
   1177 
   1178 #if defined (__sparc__) && !defined (__arch64__) && !defined (__sparcv9) \
   1179     && W_TYPE_SIZE == 32
   /* add_ssaaaa (sh, sl, ah, al, bh, bl): two-word addition.  "addcc" adds AL
      and BL into SL and sets the condition codes; "addx" then adds AH and
      BH into SH.  NOTE(review): addx presumably adds the carry produced by
      addcc -- confirm.  Each operand pair is marked commutative ('%'), SL
      is earlyclobber, and __CLOBBER_CC supplies the condition-code
      clobber.  */
   1180 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
   1181   __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0"				\
   1182 	   : "=r" ((USItype) (sh)),					\
   1183 	     "=&r" ((USItype) (sl))					\
   1184 	   : "%rJ" ((USItype) (ah)),					\
   1185 	     "rI" ((USItype) (bh)),					\
   1186 	     "%rJ" ((USItype) (al)),					\
   1187 	     "rI" ((USItype) (bl))					\
   1188 	   __CLOBBER_CC)
   /* sub_ddmmss (sh, sl, ah, al, bh, bl): two-word subtraction.  "subcc"
      computes AL - BL into SL and sets the condition codes; "subx" then
      computes AH - BH into SH.  NOTE(review): subx presumably subtracts
      the borrow produced by subcc -- confirm.  No operand is marked
      commutative; SL is earlyclobber and __CLOBBER_CC supplies the
      condition-code clobber.  */
   1189 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
   1190   __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0"				\
   1191 	   : "=r" ((USItype) (sh)),					\
   1192 	     "=&r" ((USItype) (sl))					\
   1193 	   : "rJ" ((USItype) (ah)),					\
   1194 	     "rI" ((USItype) (bh)),					\
   1195 	     "rJ" ((USItype) (al)),					\
   1196 	     "rI" ((USItype) (bl))					\
   1197 	   __CLOBBER_CC)
   1198 #if defined (__sparc_v9__)
   /* umul_ppmm (w1, w0, u, v): "umul" writes its result to the local
      variable __g1, which is pinned to register g1; "srlx" then shifts
      that register right by 32 into W1.  W0 is assigned from __g1 in C
      after the asm.  */
   1199 #define umul_ppmm(w1, w0, u, v) \
   1200   do {									\
   1201     register USItype __g1 asm ("g1");					\
   1202     __asm__ ("umul\t%2,%3,%1\n\t"					\
   1203 	     "srlx\t%1, 32, %0"						\
   1204 	     : "=r" ((USItype) (w1)),					\
   1205 	       "=r" (__g1)						\
   1206 	     : "r" ((USItype) (u)),					\
   1207 	       "r" ((USItype) (v)));					\
   1208     (w0) = __g1;							\
   1209   } while (0)
   /* udiv_qrnnd (__q, __r, __n1, __n0, __d): __n1 is moved into the %y
      register, "udiv" divides __n0 by __d into __q, and the remainder is
      reconstructed as __n0 - __q * __d ("umul" followed by "sub") into
      __r.  Both outputs are earlyclobber because they are written before
      the last reads of the inputs.  */
   1210 #define udiv_qrnnd(__q, __r, __n1, __n0, __d) \
   1211   __asm__ ("mov\t%2,%%y\n\t"						\
   1212 	   "udiv\t%3,%4,%0\n\t"						\
   1213 	   "umul\t%0,%4,%1\n\t"						\
   1214 	   "sub\t%3,%1,%1"						\
   1215 	   : "=&r" ((USItype) (__q)),					\
   1216 	     "=&r" ((USItype) (__r))					\
   1217 	   : "r" ((USItype) (__n1)),					\
   1218 	     "r" ((USItype) (__n0)),					\
   1219 	     "r" ((USItype) (__d)))
   1220 #else
   1221 #if defined (__sparc_v8__)
   /* umul_ppmm (w1, w0, u, v): "umul" multiplies U and V into W0; W1 is then
      read from the %y register with "rd".  */
   1222 #define umul_ppmm(w1, w0, u, v) \
   1223   __asm__ ("umul %2,%3,%1;rd %%y,%0"					\
   1224 	   : "=r" ((USItype) (w1)),					\
   1225 	     "=r" ((USItype) (w0))					\
   1226 	   : "r" ((USItype) (u)),					\
   1227 	     "r" ((USItype) (v)))
   /* udiv_qrnnd (__q, __r, __n1, __n0, __d): __n1 is moved into the %y
      register, three nops follow, "udiv" divides __n0 by __d into __q, and
      the remainder is reconstructed as __n0 - __q * __d ("umul" followed
      by "sub") into __r.  Both outputs are earlyclobber because they are
      written before the last reads of the inputs.  */
   1228 #define udiv_qrnnd(__q, __r, __n1, __n0, __d) \
   1229   __asm__ ("mov %2,%%y;nop;nop;nop;udiv %3,%4,%0;umul %0,%4,%1;sub %3,%1,%1"\
   1230 	   : "=&r" ((USItype) (__q)),					\
   1231 	     "=&r" ((USItype) (__r))					\
   1232 	   : "r" ((USItype) (__n1)),					\
   1233 	     "r" ((USItype) (__n0)),					\
   1234 	     "r" ((USItype) (__d)))
   1235 #else
   1236 #if defined (__sparclite__)
   1237 /* This has hardware multiply but not divide.  It also has two additional
   1238    instructions scan (ffs from high bit) and divscc.  */
   /* umul_ppmm (w1, w0, u, v): "umul" multiplies U and V into W0; W1 is then
      read from the %y register with "rd".  */
   1239 #define umul_ppmm(w1, w0, u, v) \
   1240   __asm__ ("umul %2,%3,%1;rd %%y,%0"					\
   1241 	   : "=r" ((USItype) (w1)),					\
   1242 	     "=r" ((USItype) (w0))					\
   1243 	   : "r" ((USItype) (u)),					\
   1244 	     "r" ((USItype) (v)))
   1245 #define udiv_qrnnd(q, r, n1, n0, d) \
   1246   __asm__ ("! Inlined udiv_qrnnd\n"					\
   1247 "	wr	%%g0,%2,%%y	! Not a delayed write for sparclite\n"	\
   1248 "	tst	%%g0\n"							\
   1249 "	divscc	%3,%4,%%g1\n"						\
   1250 "	divscc	%%g1,%4,%%g1\n"						\
   1251 "	divscc	%%g1,%4,%%g1\n"						\
   1252 "	divscc	%%g1,%4,%%g1\n"						\
   1253 "	divscc	%%g1,%4,%%g1\n"						\
   1254 "	divscc	%%g1,%4,%%g1\n"						\
   1255 "	divscc	%%g1,%4,%%g1\n"						\
   1256 "	divscc	%%g1,%4,%%g1\n"						\
   1257 "	divscc	%%g1,%4,%%g1\n"						\
   1258 "	divscc	%%g1,%4,%%g1\n"						\
   1259 "	divscc	%%g1,%4,%%g1\n"						\
   1260 "	divscc	%%g1,%4,%%g1\n"						\
   1261 "	divscc	%%g1,%4,%%g1\n"						\
   1262 "	divscc	%%g1,%4,%%g1\n"						\
   1263 "	divscc	%%g1,%4,%%g1\n"						\
   1264 "	divscc	%%g1,%4,%%g1\n"						\
   1265 "	divscc	%%g1,%4,%%g1\n"						\
   1266 "	divscc	%%g1,%4,%%g1\n"						\
   1267 "	divscc	%%g1,%4,%%g1\n"						\
   1268 "	divscc	%%g1,%4,%%g1\n"						\
   1269 "	divscc	%%g1,%4,%%g1\n"						\
   1270 "	divscc	%%g1,%4,%%g1\n"						\
   1271 "	divscc	%%g1,%4,%%g1\n"						\
   1272 "	divscc	%%g1,%4,%%g1\n"						\
   1273 "	divscc	%%g1,%4,%%g1\n"						\
   1274 "	divscc	%%g1,%4,%%g1\n"						\
   1275 "	divscc	%%g1,%4,%%g1\n"						\
   1276 "	divscc	%%g1,%4,%%g1\n"						\
   1277 "	divscc	%%g1,%4,%%g1\n"						\
   1278 "	divscc	%%g1,%4,%%g1\n"						\
   1279 "	divscc	%%g1,%4,%%g1\n"						\
   1280 "	divscc	%%g1,%4,%0\n"						\
   1281 "	rd	%%y,%1\n"						\
   1282 "	bl,a 1f\n"							\
   1283 "	add	%1,%4,%1\n"						\
   1284 "1:	! End of inline udiv_qrnnd"					\
   1285 	   : "=r" ((USItype) (q)),					\
   1286 	     "=r" ((USItype) (r))					\
   1287 	   : "r" ((USItype) (n1)),					\
   1288 	     "r" ((USItype) (n0)),					\
   1289 	     "rI" ((USItype) (d))					\
   1290 	   : "g1" __AND_CLOBBER_CC)
   1291 #define UDIV_TIME 37
   /* count_leading_zeros (count, x): store in COUNT the result of the
      sparclite "scan" instruction applied to X with second operand 1.  */
   1292 #define count_leading_zeros(count, x) \
   1293   do {                                                                  \
   1294   __asm__ ("scan %1,1,%0"                                               \
   1295 	   : "=r" ((USItype) (count))                                   \
   1296 	   : "r" ((USItype) (x)));					\
   1297   } while (0)
   1298 /* Early sparclites return 63 for an argument of 0, but they warn that future
   1299    implementations might change this.  Therefore, leave COUNT_LEADING_ZEROS_0
   1300    undefined.  */
   1301 #else
   1302 /* SPARC without integer multiplication and divide instructions.
   1303    (i.e. at least Sun4/20,40,60,65,75,110,260,280,330,360,380,470,490) */
   /* umul_ppmm (w1, w0, u, v): multiply by 32 "mulscc" steps plus a final
      step with a zero operand.  U is written to %y; %o5 is set to U when
      the top bit of V is set and to 0 otherwise (sra 31, then and); %g1
      is cleared by "andcc", which also sets the condition codes used by
      the first step.  After the steps, W1 = %g1 + %o5 and W0 is read
      from %y.  NOTE(review): the %o5 term presumably corrects the signed
      multiply steps for an unsigned V -- confirm.  %g1, %o5 and the
      condition codes are clobbered.  */
   1304 #define umul_ppmm(w1, w0, u, v) \
   1305   __asm__ ("! Inlined umul_ppmm\n"					\
   1306 "	wr	%%g0,%2,%%y	! SPARC has 0-3 delay insn after a wr\n"\
   1307 "	sra	%3,31,%%o5	! Don't move this insn\n"		\
   1308 "	and	%2,%%o5,%%o5	! Don't move this insn\n"		\
   1309 "	andcc	%%g0,0,%%g1	! Don't move this insn\n"		\
   1310 "	mulscc	%%g1,%3,%%g1\n"						\
   1311 "	mulscc	%%g1,%3,%%g1\n"						\
   1312 "	mulscc	%%g1,%3,%%g1\n"						\
   1313 "	mulscc	%%g1,%3,%%g1\n"						\
   1314 "	mulscc	%%g1,%3,%%g1\n"						\
   1315 "	mulscc	%%g1,%3,%%g1\n"						\
   1316 "	mulscc	%%g1,%3,%%g1\n"						\
   1317 "	mulscc	%%g1,%3,%%g1\n"						\
   1318 "	mulscc	%%g1,%3,%%g1\n"						\
   1319 "	mulscc	%%g1,%3,%%g1\n"						\
   1320 "	mulscc	%%g1,%3,%%g1\n"						\
   1321 "	mulscc	%%g1,%3,%%g1\n"						\
   1322 "	mulscc	%%g1,%3,%%g1\n"						\
   1323 "	mulscc	%%g1,%3,%%g1\n"						\
   1324 "	mulscc	%%g1,%3,%%g1\n"						\
   1325 "	mulscc	%%g1,%3,%%g1\n"						\
   1326 "	mulscc	%%g1,%3,%%g1\n"						\
   1327 "	mulscc	%%g1,%3,%%g1\n"						\
   1328 "	mulscc	%%g1,%3,%%g1\n"						\
   1329 "	mulscc	%%g1,%3,%%g1\n"						\
   1330 "	mulscc	%%g1,%3,%%g1\n"						\
   1331 "	mulscc	%%g1,%3,%%g1\n"						\
   1332 "	mulscc	%%g1,%3,%%g1\n"						\
   1333 "	mulscc	%%g1,%3,%%g1\n"						\
   1334 "	mulscc	%%g1,%3,%%g1\n"						\
   1335 "	mulscc	%%g1,%3,%%g1\n"						\
   1336 "	mulscc	%%g1,%3,%%g1\n"						\
   1337 "	mulscc	%%g1,%3,%%g1\n"						\
   1338 "	mulscc	%%g1,%3,%%g1\n"						\
   1339 "	mulscc	%%g1,%3,%%g1\n"						\
   1340 "	mulscc	%%g1,%3,%%g1\n"						\
   1341 "	mulscc	%%g1,%3,%%g1\n"						\
   1342 "	mulscc	%%g1,0,%%g1\n"						\
   1343 "	add	%%g1,%%o5,%0\n"						\
   1344 "	rd	%%y,%1"							\
   1345 	   : "=r" ((USItype) (w1)),					\
   1346 	     "=r" ((USItype) (w0))					\
   1347 	   : "%rI" ((USItype) (u)),					\
   1348 	     "r" ((USItype) (v))						\
   1349 	   : "g1", "o5" __AND_CLOBBER_CC)
   1350 #define UMUL_TIME 39		/* 39 instructions */
   1351 /* It's quite necessary to add this much assembler for the sparc.
   1352    The default udiv_qrnnd (in C) is more than 10 times slower!  */
   /* udiv_qrnnd (__q, __r, __n1, __n0, __d): software division loop.
      __q (%0) starts out holding __n0 and __r (%1) holds __n1 (matching
      constraints "0" and "1"); __d is operand %2.  %g1 is loaded with 32
      and decremented once per iteration.  Each iteration compares %1 with
      %2, shifts the %1:%0 pair left by one bit while shifting a result bit
      into the bottom of %0, and subtracts %2 from %1 on the paths marked in
      the template.  The final "xnor %0,0,%0" complements the accumulated
      bits to give the quotient.  %g1 and the condition codes are
      clobbered.  */
   1353 #define udiv_qrnnd(__q, __r, __n1, __n0, __d) \
   1354   __asm__ ("! Inlined udiv_qrnnd\n"					\
   1355 "	mov	32,%%g1\n"						\
   1356 "	subcc	%1,%2,%%g0\n"						\
   1357 "1:	bcs	5f\n"							\
   1358 "	 addxcc %0,%0,%0	! shift n1n0 and a q-bit in lsb\n"	\
   1359 "	sub	%1,%2,%1	! this kills msb of n\n"		\
   1360 "	addx	%1,%1,%1	! so this can't give carry\n"		\
   1361 "	subcc	%%g1,1,%%g1\n"						\
   1362 "2:	bne	1b\n"							\
   1363 "	 subcc	%1,%2,%%g0\n"						\
   1364 "	bcs	3f\n"							\
   1365 "	 addxcc %0,%0,%0	! shift n1n0 and a q-bit in lsb\n"	\
   1366 "	b	3f\n"							\
   1367 "	 sub	%1,%2,%1	! this kills msb of n\n"		\
   1368 "4:	sub	%1,%2,%1\n"						\
   1369 "5:	addxcc	%1,%1,%1\n"						\
   1370 "	bcc	2b\n"							\
   1371 "	 subcc	%%g1,1,%%g1\n"						\
   1372 "! Got carry from n.  Subtract next step to cancel this carry.\n"	\
   1373 "	bne	4b\n"							\
   1374 "	 addcc	%0,%0,%0	! shift n1n0 and a 0-bit in lsb\n"	\
   1375 "	sub	%1,%2,%1\n"						\
   1376 "3:	xnor	%0,0,%0\n"						\
   1377 "	! End of inline udiv_qrnnd"					\
   1378 	   : "=&r" ((USItype) (__q)),					\
   1379 	     "=&r" ((USItype) (__r))					\
   1380 	   : "r" ((USItype) (__d)),					\
   1381 	     "1" ((USItype) (__n1)),					\
   1382 	     "0" ((USItype) (__n0)) : "g1" __AND_CLOBBER_CC)
   1383 #define UDIV_TIME (3+7*32)	/* 7 instructions/iteration. 32 iterations.  */
   1384 #endif /* __sparclite__ */
   1385 #endif /* __sparc_v8__ */
   1386 #endif /* __sparc_v9__ */
   1387 #endif /* sparc32 */
   1388 
   1389 #if ((defined (__sparc__) && defined (__arch64__)) || defined (__sparcv9)) \
   1390     && W_TYPE_SIZE == 64
   /* add_ssaaaa (sh, sl, ah, al, bh, bl): two-word addition.  "addcc" adds
      AL and BL into SL, "add" adds AH and BH into SH, "movcs %xcc" sets
      the zero-initialised local __carry to 1 when the 64-bit carry
      condition from the low addition is set, and the final "add" folds
      __carry into SH.  Each operand pair is marked commutative ('%'), SL
      is earlyclobber, and __CLOBBER_CC supplies the condition-code
      clobber.  */
   1391 #define add_ssaaaa(sh, sl, ah, al, bh, bl)				\
   1392   do {									\
   1393     UDItype __carry = 0;						\
   1394     __asm__ ("addcc\t%r5,%6,%1\n\t"					\
   1395 	     "add\t%r3,%4,%0\n\t"					\
   1396 	     "movcs\t%%xcc, 1, %2\n\t"					\
   1397 	     "add\t%0, %2, %0"						\
   1398 	     : "=r" ((UDItype)(sh)),				      	\
   1399 	       "=&r" ((UDItype)(sl)),				      	\
   1400 	       "+r" (__carry)				      		\
   1401 	     : "%rJ" ((UDItype)(ah)),				     	\
   1402 	       "rI" ((UDItype)(bh)),				      	\
   1403 	       "%rJ" ((UDItype)(al)),				     	\
   1404 	       "rI" ((UDItype)(bl))				       	\
   1405 	     __CLOBBER_CC);						\
   1406   } while (0)
   1407 
   1408 #define sub_ddmmss(sh, sl, ah, al, bh, bl)				\
   1409   do {									\
   1410     UDItype __carry = 0;						\
   1411     __asm__ ("subcc\t%r5,%6,%1\n\t"					\
   1412 	     "sub\t%r3,%4,%0\n\t"					\
   1413 	     "movcs\t%%xcc, 1, %2\n\t"					\
   1414 	     "sub\t%0, %2, %0"						\
   1415 	     : "=r" ((UDItype)(sh)),				      	\
   1416 	       "=&r" ((UDItype)(sl)),				      	\
   1417 	       "+r" (__carry)				      		\
   1418 	     : "%rJ" ((UDItype)(ah)),				     	\
   1419 	       "rI" ((UDItype)(bh)),				      	\
   1420 	       "%rJ" ((UDItype)(al)),				     	\
   1421 	       "rI" ((UDItype)(bl))				       	\
   1422 	     __CLOBBER_CC);						\
   1423   } while (0)
   1424 
   /* umul_ppmm (wh, wl, u, v): 64x64 multiply assembled from "mulx"
      products of the 32-bit halves of U (%6) and V (%7); the halves are
      extracted with "srl ...,0" (low) and "srlx ...,32" (high).  Four
      earlyclobber temporaries (tmp1..tmp4, operands %2..%5) hold the
      intermediate values.  The low word WL is accumulated in %1 and the
      high word WH is produced by the last instruction.  A carry out of
      the middle sum is injected through %2, which is loaded with
      0x80000000 doubled and reset to %g0 by "movcc" when no carry
      occurred.  The asm is volatile and clobbers the condition codes.  */
   1425 #define umul_ppmm(wh, wl, u, v)						\
   1426   do {									\
   1427 	  UDItype tmp1, tmp2, tmp3, tmp4;				\
   1428 	  __asm__ __volatile__ (					\
   1429 		   "srl %7,0,%3\n\t"					\
   1430 		   "mulx %3,%6,%1\n\t"					\
   1431 		   "srlx %6,32,%2\n\t"					\
   1432 		   "mulx %2,%3,%4\n\t"					\
   1433 		   "sllx %4,32,%5\n\t"					\
   1434 		   "srl %6,0,%3\n\t"					\
   1435 		   "sub %1,%5,%5\n\t"					\
   1436 		   "srlx %5,32,%5\n\t"					\
   1437 		   "addcc %4,%5,%4\n\t"					\
   1438 		   "srlx %7,32,%5\n\t"					\
   1439 		   "mulx %3,%5,%3\n\t"					\
   1440 		   "mulx %2,%5,%5\n\t"					\
   1441 		   "sethi %%hi(0x80000000),%2\n\t"			\
   1442 		   "addcc %4,%3,%4\n\t"					\
   1443 		   "srlx %4,32,%4\n\t"					\
   1444 		   "add %2,%2,%2\n\t"					\
   1445 		   "movcc %%xcc,%%g0,%2\n\t"				\
   1446 		   "addcc %5,%4,%5\n\t"					\
   1447 		   "sllx %3,32,%3\n\t"					\
   1448 		   "add %1,%3,%1\n\t"					\
   1449 		   "add %5,%2,%0"					\
   1450 	   : "=r" ((UDItype)(wh)),					\
   1451 	     "=&r" ((UDItype)(wl)),					\
   1452 	     "=&r" (tmp1), "=&r" (tmp2), "=&r" (tmp3), "=&r" (tmp4)	\
   1453 	   : "r" ((UDItype)(u)),					\
   1454 	     "r" ((UDItype)(v))						\
   1455 	   __CLOBBER_CC);						\
   1456   } while (0)
   1457 #define UMUL_TIME 96
   1458 #define UDIV_TIME 230
   1459 #endif /* sparc64 */
   1460 
   1461 #if defined (__vax__) && W_TYPE_SIZE == 32
   /* add_ssaaaa (sh, sl, ah, al, bh, bl): two-word addition.  "addl2" adds
      BL (%5) into SL and "adwc" then adds BH (%3) into SH.  AH and AL are
      tied to the SH and SL outputs (matching constraints "0" and "1",
      marked commutative with '%'); all operands may be general operands
      ("g").  NOTE(review): adwc is presumably add-with-carry -- confirm.  */
   1462 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
   1463   __asm__ ("addl2 %5,%1\n\tadwc %3,%0"					\
   1464 	   : "=g" ((USItype) (sh)),					\
   1465 	     "=&g" ((USItype) (sl))					\
   1466 	   : "%0" ((USItype) (ah)),					\
   1467 	     "g" ((USItype) (bh)),					\
   1468 	     "%1" ((USItype) (al)),					\
   1469 	     "g" ((USItype) (bl)))
   /* sub_ddmmss (sh, sl, ah, al, bh, bl): two-word subtraction.  "subl2"
      subtracts BL (%5) from SL and "sbwc" then subtracts BH (%3) from SH.
      AH and AL are tied to the SH and SL outputs (matching constraints "0"
      and "1"; not commutative).  NOTE(review): sbwc is presumably
      subtract-with-carry -- confirm.  */
   1470 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
   1471   __asm__ ("subl2 %5,%1\n\tsbwc %3,%0"					\
   1472 	   : "=g" ((USItype) (sh)),					\
   1473 	     "=&g" ((USItype) (sl))					\
   1474 	   : "0" ((USItype) (ah)),					\
   1475 	     "g" ((USItype) (bh)),					\
   1476 	     "1" ((USItype) (al)),					\
   1477 	     "g" ((USItype) (bl)))
   /* umul_ppmm (xh, xl, m0, m1): "emul" multiplies the local copies __m0 and
      __m1 (with a literal 0 as its third operand) into a UDItype held in a
      union; XL is taken from the first USItype member (__l) and XH from the
      second (__h).  XH is then adjusted in C: __m1 is added when the top
      bit of __m0 is set, and __m0 is added when the top bit of __m1 is
      set.  NOTE(review): this presumably converts the high word of a
      signed product into the unsigned one -- confirm.  */
   1478 #define umul_ppmm(xh, xl, m0, m1) \
   1479   do {									\
   1480     union {								\
   1481 	UDItype __ll;							\
   1482 	struct {USItype __l, __h;} __i;					\
   1483       } __xx;								\
   1484     USItype __m0 = (m0), __m1 = (m1);					\
   1485     __asm__ ("emul %1,%2,$0,%0"						\
   1486 	     : "=r" (__xx.__ll)						\
   1487 	     : "g" (__m0),						\
   1488 	       "g" (__m1));						\
   1489     (xh) = __xx.__i.__h;						\
   1490     (xl) = __xx.__i.__l;						\
   1491     (xh) += ((((SItype) __m0 >> 31) & __m1)				\
   1492 	     + (((SItype) __m1 >> 31) & __m0));				\
   1493   } while (0)
   /* vax sdiv_qrnnd: divide the two-word value (n1,n0) by d, storing the
      quotient in q and the remainder in r.  The union packs n0 into the
      low word (__l) and n1 into the high word (__h) of a DItype, which is
      passed to ediv as operand %2; %3 is the divisor d, and %0 / %1
      receive the quotient and the remainder.  All words are handled as
      signed (SItype / DItype), hence the "s" in the name.
      NOTE(review): n1 and n0 are used without parentheses on the
      assignment lines.  */
   1494 #define sdiv_qrnnd(q, r, n1, n0, d) \
   1495   do {									\
   1496     union {DItype __ll;							\
   1497 	   struct {SItype __l, __h;} __i;				\
   1498 	  } __xx;							\
   1499     __xx.__i.__h = n1; __xx.__i.__l = n0;				\
   1500     __asm__ ("ediv %3,%2,%0,%1"						\
   1501 	     : "=g" (q), "=g" (r)					\
   1502 	     : "g" (__xx.__ll), "g" (d));				\
   1503   } while (0)
   1504 #endif /* __vax__ */
   1505 
   1506 #ifdef _TMS320C6X
   /* TMS320C6X add_ssaaaa: two-word addition, (sh,sl) = (ah,al) + (bh,bl).
      The addu instruction adds al and bl into the double-word temporary
      __ll.  The low 32 bits of __ll become sl; the bits above bit 31
      (the carry out of the low-word add) are added to ah + bh to form sh.
      NOTE(review): this relies on addu delivering a result wider than
      32 bits into __ll -- confirm against the C6X instruction set.  */
   1507 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
   1508   do									\
   1509     {									\
   1510       UDItype __ll;							\
   1511       __asm__ ("addu .l1 %1, %2, %0"					\
   1512 	       : "=a" (__ll) : "a" (al), "a" (bl));			\
   1513       (sl) = (USItype)__ll;						\
   1514       (sh) = ((USItype)(__ll >> 32)) + (ah) + (bh);			\
   1515     }									\
   1516   while (0)
   1517 
   1518 #ifdef _TMS320C6400_PLUS
   1519 #define __umulsidi3(u,v) ((UDItype)(USItype)u*(USItype)v)
   /* TMS320C6400+ umul_ppmm: (w1,w0) = u * v as an unsigned two-word
      product.  u is truncated to USItype and widened to UDItype so the
      multiplication is carried out in 64 bits; w1 receives the upper 32
      bits of the product and w0 the lower 32 bits.  u and v are each
      evaluated once.  */
   1520 #define umul_ppmm(w1, w0, u, v)						\
   1521   do {									\
   1522     UDItype __x = (UDItype) (USItype) (u) * (USItype) (v);		\
   1523     (w1) = (USItype) (__x >> 32);					\
   1524     (w0) = (USItype) (__x);						\
   1525   } while (0)
   1526 #endif  /* _TMS320C6400_PLUS */
   1527 
   /* TMS320C6X bit counting: count_leading_zeros always maps to
      __builtin_clz; count_trailing_zeros maps to __builtin_ctz only when
      _TMS320C6400 is defined, otherwise the generic definition further
      down (built on count_leading_zeros) is used.
      NOTE(review): no COUNT_LEADING_ZEROS_0 is defined in this section,
      so callers should not rely on the result for x == 0.
      NOTE(review): UMUL_TIME / UDIV_TIME look like relative cost
      estimates for multiply and divide; nothing in this part of the file
      reads them -- confirm their meaning with the users of this header.  */
   1528 #define count_leading_zeros(count, x)	((count) = __builtin_clz (x))
   1529 #ifdef _TMS320C6400
   1530 #define count_trailing_zeros(count, x)	((count) = __builtin_ctz (x))
   1531 #endif
   1532 #define UMUL_TIME 4
   1533 #define UDIV_TIME 40
   1534 #endif /* _TMS320C6X */
   1535 
   1536 #if defined (__xtensa__) && W_TYPE_SIZE == 32
   1537 /* This code is not Xtensa-configuration-specific, so rely on the compiler
   1538    to expand builtin functions depending on what configuration features
   1539    are available.  This avoids library calls when the operation can be
   1540    performed in-line.  */
   /* xtensa umul_ppmm: (w1,w0) = u * v as an unsigned two-word product.
      The double-word result of __builtin_umulsidi3 is stored in the ll
      member of a DWunion and read back through its s.high / s.low
      members.  DWunion is declared outside this part of the file.
      NOTE(review): w1 and w0 are assigned without parentheses, unlike
      the other umul_ppmm variants in this file.  */
   1541 #define umul_ppmm(w1, w0, u, v)						\
   1542   do {									\
   1543     DWunion __w;							\
   1544     __w.ll = __builtin_umulsidi3 (u, v);				\
   1545     w1 = __w.s.high;							\
   1546     w0 = __w.s.low;							\
   1547   } while (0)
   /* xtensa: the double-word multiply and the leading / trailing zero
      counts are forwarded directly to compiler builtins; the count macros
      store the builtin's result in their first argument.  */
   1548 #define __umulsidi3(u, v)		__builtin_umulsidi3 (u, v)
   1549 #define count_leading_zeros(COUNT, X)	((COUNT) = __builtin_clz (X))
   1550 #define count_trailing_zeros(COUNT, X)	((COUNT) = __builtin_ctz (X))
   1551 #endif /* __xtensa__ */
   1552 
   1553 #if defined xstormy16
   /* xstormy16 count_leading_zeros: store in count the number of leading
      zero bits of x, examining x in 16-bit pieces from the most
      significant piece down.  Each piece's count (from __clzhi2) is added
      to the running total; the loop stops at the first piece whose count
      is not 16, i.e. the first piece that is not entirely zero.  If every
      piece is zero the total reaches W_TYPE_SIZE, which is what
      COUNT_LEADING_ZEROS_0 advertises.
      NOTE(review): the prototype below declares
      __stormy16_count_leading_zeros, but the macro calls __clzhi2, which
      is not declared in this part of the file -- confirm which helper is
      intended.  The code also presumes __clzhi2 looks only at the low 16
      bits of its argument and returns 16 for 0.
      x is evaluated once per loop iteration.  */
   1554 extern UHItype __stormy16_count_leading_zeros (UHItype);
   1555 #define count_leading_zeros(count, x)					\
   1556   do									\
   1557     {									\
   1558       UHItype size;							\
   1559 									\
   1560       /* We assume that W_TYPE_SIZE is a multiple of 16...  */		\
   1561       for ((count) = 0, size = W_TYPE_SIZE; size; size -= 16)		\
   1562 	{								\
   1563 	  UHItype c;							\
   1564 									\
   1565 	  c = __clzhi2 ((x) >> (size - 16));				\
   1566 	  (count) += c;							\
   1567 	  if (c != 16)							\
   1568 	    break;							\
   1569 	}								\
   1570     }									\
   1571   while (0)
   1572 #define COUNT_LEADING_ZEROS_0 W_TYPE_SIZE
   1573 #endif
   1574 
   1575 #if defined (__z8000__) && W_TYPE_SIZE == 16
   /* z8000 add_ssaaaa (W_TYPE_SIZE == 16): two-word addition,
      (sh,sl) = (ah,al) + (bh,bl).  Operand map: %0 = sh (preloaded with
      ah), %1 = sl (preloaded with al), %3 = bh, %5 = bl.  The add
      instruction combines the low words and adc then combines the high
      words, presumably including the carry from the add.  bl may be a
      register or one of the "Q" / "R" operand kinds; bh must be a
      register.
      NOTE(review): the meaning of the %H operand modifier is defined by
      the z8000 back end, not by this file.  */
   1576 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
   1577   __asm__ ("add	%H1,%H5\n\tadc	%H0,%H3"				\
   1578 	   : "=r" ((unsigned int)(sh)),					\
   1579 	     "=&r" ((unsigned int)(sl))					\
   1580 	   : "%0" ((unsigned int)(ah)),					\
   1581 	     "r" ((unsigned int)(bh)),					\
   1582 	     "%1" ((unsigned int)(al)),					\
   1583 	     "rQR" ((unsigned int)(bl)))
   /* z8000 sub_ddmmss (W_TYPE_SIZE == 16): two-word subtraction,
      (sh,sl) = (ah,al) - (bh,bl).  Operand map: %0 = sh (preloaded with
      ah), %1 = sl (preloaded with al), %3 = bh, %5 = bl.  sub handles the
      low words and sbc then handles the high words, presumably including
      the borrow from the sub.  The matching constraints carry no "%"
      marker because the operands of a subtraction may not be swapped.  */
   1584 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
   1585   __asm__ ("sub	%H1,%H5\n\tsbc	%H0,%H3"				\
   1586 	   : "=r" ((unsigned int)(sh)),					\
   1587 	     "=&r" ((unsigned int)(sl))					\
   1588 	   : "0" ((unsigned int)(ah)),					\
   1589 	     "r" ((unsigned int)(bh)),					\
   1590 	     "1" ((unsigned int)(al)),					\
   1591 	     "rQR" ((unsigned int)(bl)))
   /* z8000 umul_ppmm (W_TYPE_SIZE == 16): (xh,xl) = m0 * m1 as an
      unsigned two-word product.  The asm has two outputs, the high word
      __xx.__i.__h (%0) and the low word __xx.__i.__l (%1); __m0 enters in
      the same location as the low word (matching constraint "%1") and
      __m1 is operand %3.
      After the multiply, the high word gets __m1 added when bit 15 of
      __m0 is set and __m0 added when bit 15 of __m1 is set.  That is the
      correction from a signed to an unsigned product, so mult is
      presumably a signed multiply -- confirm.
      NOTE(review): the __ll member of the union is never accessed by
      name; presumably it only gives the two halves the size and alignment
      of one double word.  */
   1592 #define umul_ppmm(xh, xl, m0, m1) \
   1593   do {									\
   1594     union {long int __ll;						\
   1595 	   struct {unsigned int __h, __l;} __i;				\
   1596 	  } __xx;							\
   1597     unsigned int __m0 = (m0), __m1 = (m1);				\
   1598     __asm__ ("mult	%S0,%H3"					\
   1599 	     : "=r" (__xx.__i.__h),					\
   1600 	       "=r" (__xx.__i.__l)					\
   1601 	     : "%1" (__m0),						\
   1602 	       "rQR" (__m1));						\
   1603     (xh) = __xx.__i.__h; (xl) = __xx.__i.__l;				\
   1604     (xh) += ((((signed int) __m0 >> 15) & __m1)				\
   1605 	     + (((signed int) __m1 >> 15) & __m0));			\
   1606   } while (0)
   1607 #endif /* __z8000__ */
   1608 
   1609 #endif /* __GNUC__ */
   1610 
   1611 /* If this machine has no inline assembler, use C macros.  */
   1612 
   1613 #if !defined (add_ssaaaa)
   1614 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
   1615   do {									\
   1616     UWtype __x;								\
   1617     __x = (al) + (bl);							\
   1618     (sh) = (ah) + (bh) + (__x < (al));					\
   1619     (sl) = __x;								\
   1620   } while (0)
   1621 #endif
   1622 
   1623 #if !defined (sub_ddmmss)
   1624 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
   1625   do {									\
   1626     UWtype __x;								\
   1627     __x = (al) - (bl);							\
   1628     (sh) = (ah) - (bh) - (__x > (al));					\
   1629     (sl) = __x;								\
   1630   } while (0)
   1631 #endif
   1632 
   1633 /* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
   1634    smul_ppmm.  */
   1635 #if !defined (umul_ppmm) && defined (smul_ppmm)
   /* umul_ppmm built on smul_ppmm: (w1,w0) = u * v, unsigned.
      smul_ppmm supplies the low word w0 directly and a signed high word
      __w1.  -(x >> (W_TYPE_SIZE - 1)) is an all-ones mask when the top
      bit of x is set and 0 otherwise, so the two masked terms add __xm1
      to the high word when the top bit of __xm0 is set, and __xm0 when
      the top bit of __xm1 is set -- the correction from the signed to the
      unsigned product.  u and v are copied into locals and evaluated
      once.  */
   1636 #define umul_ppmm(w1, w0, u, v)						\
   1637   do {									\
   1638     UWtype __w1;							\
   1639     UWtype __xm0 = (u), __xm1 = (v);					\
   1640     smul_ppmm (__w1, w0, __xm0, __xm1);					\
   1641     (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)		\
   1642 		+ (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);		\
   1643   } while (0)
   1644 #endif
   1645 
   1646 /* If we still don't have umul_ppmm, define it using plain C.  */
   1647 #if !defined (umul_ppmm)
   /* Portable umul_ppmm: (w1,w0) = u * v, unsigned, using only
      single-word arithmetic.  u and v are split into half-words with
      __ll_lowpart / __ll_highpart and the four partial products are
      formed:
	__x0 = ul*vl, __x1 = ul*vh, __x2 = uh*vl, __x3 = uh*vh.
      The two middle products and the high half of __x0 are accumulated
      in __x1; if that sum wraps, the lost carry is worth __ll_B in the
      high word and is added to __x3.  The result is then assembled as
	w1 = __x3 + high half of __x1,
	w0 = low half of __x1 * __ll_B + low half of __x0.
      __ll_lowpart, __ll_highpart and __ll_B are defined outside this part
      of the file; presumably __ll_B is 2^(W_TYPE_SIZE/2) -- confirm.
      u and v are each evaluated twice.  */
   1648 #define umul_ppmm(w1, w0, u, v)						\
   1649   do {									\
   1650     UWtype __x0, __x1, __x2, __x3;					\
   1651     UHWtype __ul, __vl, __uh, __vh;					\
   1652 									\
   1653     __ul = __ll_lowpart (u);						\
   1654     __uh = __ll_highpart (u);						\
   1655     __vl = __ll_lowpart (v);						\
   1656     __vh = __ll_highpart (v);						\
   1657 									\
   1658     __x0 = (UWtype) __ul * __vl;					\
   1659     __x1 = (UWtype) __ul * __vh;					\
   1660     __x2 = (UWtype) __uh * __vl;					\
   1661     __x3 = (UWtype) __uh * __vh;					\
   1662 									\
   1663     __x1 += __ll_highpart (__x0);/* this can't give carry */		\
   1664     __x1 += __x2;		/* but this indeed can */		\
   1665     if (__x1 < __x2)		/* did we get it? */			\
   1666       __x3 += __ll_B;		/* yes, add it in the proper pos.  */	\
   1667 									\
   1668     (w1) = __x3 + __ll_highpart (__x1);					\
   1669     (w0) = __ll_lowpart (__x1) * __ll_B + __ll_lowpart (__x0);		\
   1670   } while (0)
   1671 #endif
   1672 
   1673 #if !defined (__umulsidi3)
   /* Default __umulsidi3: the double-word product u * v as a value.
      umul_ppmm stores the high and low words straight into the s.high /
      s.low members of a DWunion, and the statement expression yields its
      ll member.  DWunion is declared outside this part of the file.  The
      ({ ... }) form is a GNU C statement expression.  */
   1674 #define __umulsidi3(u, v) \
   1675   ({DWunion __w;							\
   1676     umul_ppmm (__w.s.high, __w.s.low, u, v);				\
   1677     __w.ll; })
   1678 #endif
   1679 
   1680 /* Define this unconditionally, so it can be used for debugging.  */
   /* __udiv_qrnnd_c (q, r, n1, n0, d): portable division of the two-word
      value (n1,n0) by d, storing the quotient in q and the remainder in r.
      d is split into half-words __d1 (high) and __d0 (low) and the
      quotient is produced as two half-word digits: __q1 from n1 and the
      high half of n0, then __q0 from the running remainder __r1 and the
      low half of n0.
      Each digit is first estimated by dividing by __d1 alone;
      __m = digit * __d0 is the part of the product that estimate ignored.
      If the partial remainder is smaller than __m the estimate was too
      large, so the digit is decremented and d added back -- at most
      twice, and the second time only if the first addition did not wrap
      around.
      NOTE(review): presumably requires d to have its top bit set and
      n1 < d (the fallback that maps udiv_qrnnd to this macro also sets
      UDIV_NEEDS_NORMALIZATION to 1) -- confirm against the interface
      description at the top of the file.
      n1, n0 and d are evaluated more than once.  */
   1681 #define __udiv_qrnnd_c(q, r, n1, n0, d) \
   1682   do {									\
   1683     UWtype __d1, __d0, __q1, __q0;					\
   1684     UWtype __r1, __r0, __m;						\
   1685     __d1 = __ll_highpart (d);						\
   1686     __d0 = __ll_lowpart (d);						\
   1687 									\
   1688     __r1 = (n1) % __d1;							\
   1689     __q1 = (n1) / __d1;							\
   1690     __m = (UWtype) __q1 * __d0;						\
   1691     __r1 = __r1 * __ll_B | __ll_highpart (n0);				\
   1692     if (__r1 < __m)							\
   1693       {									\
   1694 	__q1--, __r1 += (d);						\
   1695 	if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
   1696 	  if (__r1 < __m)						\
   1697 	    __q1--, __r1 += (d);					\
   1698       }									\
   1699     __r1 -= __m;							\
   1700 									\
   1701     __r0 = __r1 % __d1;							\
   1702     __q0 = __r1 / __d1;							\
   1703     __m = (UWtype) __q0 * __d0;						\
   1704     __r0 = __r0 * __ll_B | __ll_lowpart (n0);				\
   1705     if (__r0 < __m)							\
   1706       {									\
   1707 	__q0--, __r0 += (d);						\
   1708 	if (__r0 >= (d))						\
   1709 	  if (__r0 < __m)						\
   1710 	    __q0--, __r0 += (d);					\
   1711       }									\
   1712     __r0 -= __m;							\
   1713 									\
   1714     (q) = (UWtype) __q1 * __ll_B | __q0;				\
   1715     (r) = __r0;								\
   1716   } while (0)
   1717 
   1718 /* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through
   1719    __udiv_w_sdiv (defined in libgcc or elsewhere).  */
   1720 #if !defined (udiv_qrnnd) && defined (sdiv_qrnnd)
   /* udiv_qrnnd for targets that only provide sdiv_qrnnd: the work is
      done by the external function __udiv_w_sdiv, which returns the
      quotient and stores the remainder through its first argument.  The
      remainder goes through the local __r so that r need not be an
      addressable UWtype object.  The extern declaration is repeated
      inside every expansion of the macro.  */
   1721 #define udiv_qrnnd(q, r, nh, nl, d) \
   1722   do {									\
   1723     extern UWtype __udiv_w_sdiv (UWtype *, UWtype, UWtype, UWtype);	\
   1724     UWtype __r;								\
   1725     (q) = __udiv_w_sdiv (&__r, nh, nl, d);				\
   1726     (r) = __r;								\
   1727   } while (0)
   1728 #endif
   1729 
   1730 /* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c.  */
   1731 #if !defined (udiv_qrnnd)
   /* Last resort: udiv_qrnnd becomes the portable __udiv_qrnnd_c, and
      UDIV_NEEDS_NORMALIZATION is set to 1 to tell callers about the extra
      requirement that comes with it (presumably that the divisor must be
      normalized, i.e. have its most significant bit set -- confirm).  */
   1732 #define UDIV_NEEDS_NORMALIZATION 1
   1733 #define udiv_qrnnd __udiv_qrnnd_c
   1734 #endif
   1735 
   1736 #if !defined (count_leading_zeros)
   /* Portable count_leading_zeros: store in count the number of zero bits
      above the most significant set bit of x.
      Step 1 picks a shift __a so that __xr >> __a is small enough to
      index __clz_tab.  For W_TYPE_SIZE <= 32 two nested comparisons
      against 1 << __BITS4, 1 << 2*__BITS4 and 1 << 3*__BITS4 select one
      of 0, __BITS4, 2*__BITS4, 3*__BITS4.  For wider words a loop walks
      down from the top byte in steps of 8 bits and stops at the first
      non-zero byte, or at 0.
      Step 2 computes count = W_TYPE_SIZE - (__clz_tab[__xr >> __a] + __a).
      __clz_tab and __BITS4 are defined outside this part of the file;
      presumably __clz_tab[i] is the number of significant bits in i and
      __BITS4 is W_TYPE_SIZE / 4 -- confirm.
      COUNT_LEADING_ZEROS_0 advertises the result for x == 0 as
      W_TYPE_SIZE, which requires __clz_tab[0] to be 0.  */
   1737 #define count_leading_zeros(count, x) \
   1738   do {									\
   1739     UWtype __xr = (x);							\
   1740     UWtype __a;								\
   1741 									\
   1742     if (W_TYPE_SIZE <= 32)						\
   1743       {									\
   1744 	__a = __xr < ((UWtype)1<<2*__BITS4)				\
   1745 	  ? (__xr < ((UWtype)1<<__BITS4) ? 0 : __BITS4)			\
   1746 	  : (__xr < ((UWtype)1<<3*__BITS4) ?  2*__BITS4 : 3*__BITS4);	\
   1747       }									\
   1748     else								\
   1749       {									\
   1750 	for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8)			\
   1751 	  if (((__xr >> __a) & 0xff) != 0)				\
   1752 	    break;							\
   1753       }									\
   1754 									\
   1755     (count) = W_TYPE_SIZE - (__clz_tab[__xr >> __a] + __a);		\
   1756   } while (0)
   1757 #define COUNT_LEADING_ZEROS_0 W_TYPE_SIZE
   1758 #endif
   1759 
   1760 #if !defined (count_trailing_zeros)
   1761 /* Define count_trailing_zeros using count_leading_zeros.  The latter might be
   1762    defined in asm, but if it is not, the C version above is good enough.  */
   /* __ctz_x & -__ctz_x keeps only the lowest set bit of x.  If that bit
      is at position p, count_leading_zeros reports W_TYPE_SIZE - 1 - p
      for it, so subtracting that from W_TYPE_SIZE - 1 gives p, the
      number of trailing zero bits.  x is evaluated once.
      The __ctz_ prefix on the locals presumably keeps them distinct from
      the locals of whichever count_leading_zeros is expanded inside.
      The result for x == 0 depends on what count_leading_zeros yields
      for 0.  */
   1763 #define count_trailing_zeros(count, x) \
   1764   do {									\
   1765     UWtype __ctz_x = (x);						\
   1766     UWtype __ctz_c;							\
   1767     count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x);			\
   1768     (count) = W_TYPE_SIZE - 1 - __ctz_c;				\
   1769   } while (0)
   1770 #endif
   1771 
   1772 #ifndef UDIV_NEEDS_NORMALIZATION
   /* Default when no earlier definition exists (the __udiv_qrnnd_c
      fallback above sets it to 1), so the macro is always defined and can
      be tested with a plain #if.  */
   1773 #define UDIV_NEEDS_NORMALIZATION 0
   1774 #endif
   1775