Home | History | Annotate | Line # | Download | only in include
longlong.h revision 1.1
      1 /* longlong.h -- definitions for mixed size 32/64 bit arithmetic.
      2    Copyright (C) 1991-2013 Free Software Foundation, Inc.
      3 
      4    This file is part of the GNU C Library.
      5 
      6    The GNU C Library is free software; you can redistribute it and/or
      7    modify it under the terms of the GNU Lesser General Public
      8    License as published by the Free Software Foundation; either
      9    version 2.1 of the License, or (at your option) any later version.
     10 
     11    In addition to the permissions in the GNU Lesser General Public
     12    License, the Free Software Foundation gives you unlimited
     13    permission to link the compiled version of this file into
     14    combinations with other programs, and to distribute those
     15    combinations without any restriction coming from the use of this
     16    file.  (The Lesser General Public License restrictions do apply in
     17    other respects; for example, they cover modification of the file,
     18    and distribution when not linked into a combine executable.)
     19 
     20    The GNU C Library is distributed in the hope that it will be useful,
     21    but WITHOUT ANY WARRANTY; without even the implied warranty of
     22    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     23    Lesser General Public License for more details.
     24 
     25    You should have received a copy of the GNU Lesser General Public
     26    License along with the GNU C Library; if not, see
     27    <http://www.gnu.org/licenses/>.  */
     28 
     29 /* You have to define the following before including this file:
     30 
     31    UWtype -- An unsigned type, default type for operations (typically a "word")
     32    UHWtype -- An unsigned type, at least half the size of UWtype.
   UDWtype -- An unsigned type, at least twice as large as UWtype
     34    W_TYPE_SIZE -- size in bits of UWtype
     35 
     36    UQItype -- Unsigned 8 bit type.
     37    SItype, USItype -- Signed and unsigned 32 bit types.
     38    DItype, UDItype -- Signed and unsigned 64 bit types.
     39 
     40    On a 32 bit machine UWtype should typically be USItype;
     41    on a 64 bit machine, UWtype should typically be UDItype.  */
     42 
/* Half-word helpers.  __ll_B is 2^(W_TYPE_SIZE/2); __ll_lowpart/__ll_highpart
   split a full word into its low and high halves.  These expand lazily, so
   W_TYPE_SIZE (and UWtype) need only be defined before the macros are USED,
   which is why the defaults below may legitimately appear after this point.  */
#define __BITS4 (W_TYPE_SIZE / 4)
#define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
#define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
#define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
     47 
/* Fall back to a 32-bit word size when the includer did not configure one.
   SItype/USItype/DItype/UDItype must still be supplied by the includer.  */
#ifndef W_TYPE_SIZE
#define W_TYPE_SIZE	32
#define UWtype		USItype
#define UHWtype		USItype
#define UDWtype		UDItype
#endif
     54 
/* Used in glibc only.  */
#ifndef attribute_hidden
#define attribute_hidden
#endif

/* Per-byte lookup table used by the generic count_leading_zeros
   implementations below (e.g. the non-CIX Alpha variant); defined in a
   separate translation unit.  */
extern const UQItype __clz_tab[256] attribute_hidden;
     61 
     62 /* Define auxiliary asm macros.
     63 
     64    1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
     65    UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
     66    word product in HIGH_PROD and LOW_PROD.
     67 
     68    2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
     69    UDWtype product.  This is just a variant of umul_ppmm.
     70 
     71    3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
     72    denominator) divides a UDWtype, composed by the UWtype integers
     73    HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
     74    in QUOTIENT and the remainder in REMAINDER.  HIGH_NUMERATOR must be less
     75    than DENOMINATOR for correct operation.  If, in addition, the most
     76    significant bit of DENOMINATOR must be 1, then the pre-processor symbol
     77    UDIV_NEEDS_NORMALIZATION is defined to 1.
     78 
     79    4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
     80    denominator).  Like udiv_qrnnd but the numbers are signed.  The quotient
     81    is rounded towards 0.
     82 
     83    5) count_leading_zeros(count, x) counts the number of zero-bits from the
     84    msb to the first nonzero bit in the UWtype X.  This is the number of
     85    steps X needs to be shifted left to set the msb.  Undefined for X == 0,
     86    unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.
     87 
     88    6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
     89    from the least significant end.
     90 
     91    7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
     92    high_addend_2, low_addend_2) adds two UWtype integers, composed by
     93    HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
     94    respectively.  The result is placed in HIGH_SUM and LOW_SUM.  Overflow
     95    (i.e. carry out) is not stored anywhere, and is lost.
     96 
   8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
   high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
   composed by HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
   LOW_SUBTRAHEND respectively.  The result is placed in HIGH_DIFFERENCE
   and LOW_DIFFERENCE.  Overflow (i.e. carry out) is not stored anywhere,
   and is lost.
    103 
    104    If any of these macros are left undefined for a particular CPU,
    105    C macros are used.  */
    106 
    107 /* The CPUs come in alphabetical order below.
    108 
    109    Please add support for more CPUs here, or improve the current support
    110    for the CPUs below!
    111    (E.g. WE32100, IBM360.)  */
    112 
    113 #if defined (__GNUC__) && !defined (NO_ASM)
    114 
/* We sometimes need to clobber "cc" with gcc2, but that would not be
   understood by gcc1.  Use cpp to avoid major code duplication.
   These tokens are appended to asm operand lists below: __CLOBBER_CC
   starts the clobber section, __AND_CLOBBER_CC extends an existing one.  */
#if __GNUC__ < 2
#define __CLOBBER_CC
#define __AND_CLOBBER_CC
#else /* __GNUC__ >= 2 */
#define __CLOBBER_CC : "cc"
#define __AND_CLOBBER_CC , "cc"
#endif /* __GNUC__ < 2 */
    124 
#if defined (__alpha) && W_TYPE_SIZE == 64
/* Alpha: umulh gives the high 64 bits of the product, a plain multiply
   the low 64 bits.  */
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    UDItype __m0 = (m0), __m1 = (m1);					\
    (ph) = __builtin_alpha_umulh (__m0, __m1);				\
    (pl) = __m0 * __m1;							\
  } while (0)
#define UMUL_TIME 46
#ifndef LONGLONG_STANDALONE
/* Alpha has no hardware divide; call the out-of-line helper.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UDItype __r;							\
    (q) = __udiv_qrnnd (&__r, (n1), (n0), (d));				\
    (r) = __r;								\
  } while (0)
extern UDItype __udiv_qrnnd (UDItype *, UDItype, UDItype, UDItype);
#define UDIV_TIME 220
#endif /* LONGLONG_STANDALONE */
#ifdef __alpha_cix__
/* EV67+ (CIX extension) has native count-leading/trailing-zero insns.  */
#define count_leading_zeros(COUNT,X)	((COUNT) = __builtin_clzl (X))
#define count_trailing_zeros(COUNT,X)	((COUNT) = __builtin_ctzl (X))
#define COUNT_LEADING_ZEROS_0 64
#else
/* No CIX: cmpbge against 0 yields a byte mask of zero bytes; use it to
   find the highest nonzero byte, then finish with the __clz_tab lookup.  */
#define count_leading_zeros(COUNT,X) \
  do {									\
    UDItype __xr = (X), __t, __a;					\
    __t = __builtin_alpha_cmpbge (0, __xr);				\
    __a = __clz_tab[__t ^ 0xff] - 1;					\
    __t = __builtin_alpha_extbl (__xr, __a);				\
    (COUNT) = 64 - (__clz_tab[__t] + __a*8);				\
  } while (0)
/* Trailing zeros: isolate the lowest nonzero byte, then the lowest set
   bit within it, accumulating the bit index branch-free.  */
#define count_trailing_zeros(COUNT,X) \
  do {									\
    UDItype __xr = (X), __t, __a;					\
    __t = __builtin_alpha_cmpbge (0, __xr);				\
    __t = ~__t & -~__t;							\
    __a = ((__t & 0xCC) != 0) * 2;					\
    __a += ((__t & 0xF0) != 0) * 4;					\
    __a += ((__t & 0xAA) != 0);						\
    __t = __builtin_alpha_extbl (__xr, __a);				\
    __a <<= 3;								\
    __t &= -__t;							\
    __a += ((__t & 0xCC) != 0) * 2;					\
    __a += ((__t & 0xF0) != 0) * 4;					\
    __a += ((__t & 0xAA) != 0);						\
    (COUNT) = __a;							\
  } while (0)
#endif /* __alpha_cix__ */
#endif /* __alpha */
    173 
#if defined (__arc__) && W_TYPE_SIZE == 32
/* ARC: add.f/sub.f set the carry flag, adc/sbc consume it.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add.f	%1, %4, %5\n\tadc	%0, %2, %3"		\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "%r" ((USItype) (ah)),					\
	     "rIJ" ((USItype) (bh)),					\
	     "%r" ((USItype) (al)),					\
	     "rIJ" ((USItype) (bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub.f	%1, %4, %5\n\tsbc	%0, %2, %3"		\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "r" ((USItype) (ah)),					\
	     "rIJ" ((USItype) (bh)),					\
	     "r" ((USItype) (al)),					\
	     "rIJ" ((USItype) (bl)))

/* Let the compiler synthesize the 32x32->64 multiply.  */
#define __umulsidi3(u,v) ((UDItype)(USItype)u*(USItype)v)
#ifdef __ARC_NORM__
/* norm.f returns (number of leading sign bits - 1); the mov.mi fixup and
   the +1 turn that into a leading-zero count for unsigned values.  */
#define count_leading_zeros(count, x) \
  do									\
    {									\
      SItype c_;							\
									\
      __asm__ ("norm.f\t%0,%1\n\tmov.mi\t%0,-1" : "=r" (c_) : "r" (x) : "cc");\
      (count) = c_ + 1;							\
    }									\
  while (0)
#define COUNT_LEADING_ZEROS_0 32
#endif
#endif
    206 
/* ARM (not Thumb-1; Thumb-2 is fine): adds/adc and subs/sbc carry chains.  */
#if defined (__arm__) && (defined (__thumb2__) || !defined (__thumb__)) \
 && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("adds	%1, %4, %5\n\tadc	%0, %2, %3"		\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "%r" ((USItype) (ah)),					\
	     "rI" ((USItype) (bh)),					\
	     "%r" ((USItype) (al)),					\
	     "rI" ((USItype) (bl)) __CLOBBER_CC)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subs	%1, %4, %5\n\tsbc	%0, %2, %3"		\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "r" ((USItype) (ah)),					\
	     "rI" ((USItype) (bh)),					\
	     "r" ((USItype) (al)),					\
	     "rI" ((USItype) (bl)) __CLOBBER_CC)
# if defined(__ARM_ARCH_2__) || defined(__ARM_ARCH_2A__) \
     || defined(__ARM_ARCH_3__)
/* Pre-v3M has no umull: build the 64-bit product from four 16x16
   partial products by hand.  */
#  define umul_ppmm(xh, xl, a, b)					\
  do {									\
    register USItype __t0, __t1, __t2;					\
    __asm__ ("%@ Inlined umul_ppmm\n"					\
	   "	mov	%2, %5, lsr #16\n"				\
	   "	mov	%0, %6, lsr #16\n"				\
	   "	bic	%3, %5, %2, lsl #16\n"				\
	   "	bic	%4, %6, %0, lsl #16\n"				\
	   "	mul	%1, %3, %4\n"					\
	   "	mul	%4, %2, %4\n"					\
	   "	mul	%3, %0, %3\n"					\
	   "	mul	%0, %2, %0\n"					\
	   "	adds	%3, %4, %3\n"					\
	   "	addcs	%0, %0, #65536\n"				\
	   "	adds	%1, %1, %3, lsl #16\n"				\
	   "	adc	%0, %0, %3, lsr #16"				\
	   : "=&r" ((USItype) (xh)),					\
	     "=r" ((USItype) (xl)),					\
	     "=&r" (__t0), "=&r" (__t1), "=r" (__t2)			\
	   : "r" ((USItype) (a)),					\
	     "r" ((USItype) (b)) __CLOBBER_CC );			\
  } while (0)
#  define UMUL_TIME 20
# else
#  define umul_ppmm(xh, xl, a, b)					\
  do {									\
    /* Generate umull, under compiler control.  */			\
    register UDItype __t0 = (UDItype)(USItype)(a) * (USItype)(b);	\
    (xl) = (USItype)__t0;						\
    (xh) = (USItype)(__t0 >> 32);					\
  } while (0)
#  define UMUL_TIME 3
# endif
# define UDIV_TIME 100
#endif /* __arm__ */
    262 
/* Applies to all ARM variants, including Thumb-1 (unlike the block above).  */
#if defined(__arm__)
/* Let gcc decide how best to implement count_leading_zeros.  */
#define count_leading_zeros(COUNT,X)	((COUNT) = __builtin_clz (X))
#define count_trailing_zeros(COUNT,X)   ((COUNT) = __builtin_ctz (X))
#define COUNT_LEADING_ZEROS_0 32
#endif
    269 
#if defined (__AVR__)

/* AVR: only bit-count macros; pick the builtin matching the word size
   (clz for int-sized, clzl for long, clzll for long long words).  */
#if W_TYPE_SIZE == 16
#define count_leading_zeros(COUNT,X)  ((COUNT) = __builtin_clz (X))
#define count_trailing_zeros(COUNT,X) ((COUNT) = __builtin_ctz (X))
#define COUNT_LEADING_ZEROS_0 16
#endif /* W_TYPE_SIZE == 16 */

#if W_TYPE_SIZE == 32
#define count_leading_zeros(COUNT,X)  ((COUNT) = __builtin_clzl (X))
#define count_trailing_zeros(COUNT,X) ((COUNT) = __builtin_ctzl (X))
#define COUNT_LEADING_ZEROS_0 32
#endif /* W_TYPE_SIZE == 32 */

#if W_TYPE_SIZE == 64
#define count_leading_zeros(COUNT,X)  ((COUNT) = __builtin_clzll (X))
#define count_trailing_zeros(COUNT,X) ((COUNT) = __builtin_ctzll (X))
#define COUNT_LEADING_ZEROS_0 64
#endif /* W_TYPE_SIZE == 64 */

#endif /* defined (__AVR__) */
    291 
#if defined (__CRIS__)

/* CRIS: feature availability depends on the architecture version.  */
#if __CRIS_arch_version >= 3
#define count_leading_zeros(COUNT, X) ((COUNT) = __builtin_clz (X))
#define COUNT_LEADING_ZEROS_0 32
#endif /* __CRIS_arch_version >= 3 */

#if __CRIS_arch_version >= 8
#define count_trailing_zeros(COUNT, X) ((COUNT) = __builtin_ctz (X))
#endif /* __CRIS_arch_version >= 8 */

#if __CRIS_arch_version >= 10
/* v10+: the compiler generates a good widening multiply inline.  */
#define __umulsidi3(u,v) ((UDItype)(USItype) (u) * (UDItype)(USItype) (v))
#else
/* Older CRIS: use the out-of-line helper.  The self-#define stops the
   generic code below from redefining __umulsidi3.  */
#define __umulsidi3 __umulsidi3
extern UDItype __umulsidi3 (USItype, USItype);
#endif /* __CRIS_arch_version >= 10 */

#define umul_ppmm(w1, w0, u, v)		\
  do {					\
    UDItype __x = __umulsidi3 (u, v);	\
    (w0) = (USItype) (__x);		\
    (w1) = (USItype) (__x >> 32);	\
  } while (0)

/* FIXME: defining add_ssaaaa and sub_ddmmss should be advantageous for
   DFmode ("double" intrinsics, avoiding two of the three insns handling
   carry), but defining them as open-code C composing and doing the
   operation in DImode (UDImode) shows that the DImode needs work:
   register pressure from requiring neighboring registers and the
   traffic to and from them come to dominate, in the 4.7 series.  */

#endif /* defined (__CRIS__) */
    325 
#if defined (__hppa) && W_TYPE_SIZE == 32
/* PA-RISC: add/addc and sub/subb carry chains ("M" allows register 0
   as a zero operand).  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add %4,%5,%1\n\taddc %2,%3,%0"				\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "%rM" ((USItype) (ah)),					\
	     "rM" ((USItype) (bh)),					\
	     "%rM" ((USItype) (al)),					\
	     "rM" ((USItype) (bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub %4,%5,%1\n\tsubb %2,%3,%0"				\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "rM" ((USItype) (ah)),					\
	     "rM" ((USItype) (bh)),					\
	     "rM" ((USItype) (al)),					\
	     "rM" ((USItype) (bl)))
#if defined (_PA_RISC1_1)
/* PA 1.1: the FPU's xmpyu does a full 32x32->64 unsigned multiply;
   the union moves the result between FP and integer views.  */
#define umul_ppmm(w1, w0, u, v) \
  do {									\
    union								\
      {									\
	UDItype __f;							\
	struct {USItype __w1, __w0;} __w1w0;				\
      } __t;								\
    __asm__ ("xmpyu %1,%2,%0"						\
	     : "=x" (__t.__f)						\
	     : "x" ((USItype) (u)),					\
	       "x" ((USItype) (v)));					\
    (w1) = __t.__w1w0.__w1;						\
    (w0) = __t.__w1w0.__w0;						\
     } while (0)
#define UMUL_TIME 8
#else
#define UMUL_TIME 30
#endif
#define UDIV_TIME 40
/* Binary-search clz using extru's nullification (",=" tests for zero,
   ",tr" always nullifies the next insn); comments in the asm explain
   each step.  */
#define count_leading_zeros(count, x) \
  do {									\
    USItype __tmp;							\
    __asm__ (								\
       "ldi		1,%0\n"						\
"	extru,=		%1,15,16,%%r0		; Bits 31..16 zero?\n"	\
"	extru,tr	%1,15,16,%1		; No.  Shift down, skip add.\n"\
"	ldo		16(%0),%0		; Yes.  Perform add.\n"	\
"	extru,=		%1,23,8,%%r0		; Bits 15..8 zero?\n"	\
"	extru,tr	%1,23,8,%1		; No.  Shift down, skip add.\n"\
"	ldo		8(%0),%0		; Yes.  Perform add.\n"	\
"	extru,=		%1,27,4,%%r0		; Bits 7..4 zero?\n"	\
"	extru,tr	%1,27,4,%1		; No.  Shift down, skip add.\n"\
"	ldo		4(%0),%0		; Yes.  Perform add.\n"	\
"	extru,=		%1,29,2,%%r0		; Bits 3..2 zero?\n"	\
"	extru,tr	%1,29,2,%1		; No.  Shift down, skip add.\n"\
"	ldo		2(%0),%0		; Yes.  Perform add.\n"	\
"	extru		%1,30,1,%1		; Extract bit 1.\n"	\
"	sub		%0,%1,%0		; Subtract it.\n"	\
	: "=r" (count), "=r" (__tmp) : "1" (x));			\
  } while (0)
#endif
    385 
#if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
#if !defined (__zarch__)
/* 31-bit S/390: mr/dr operate on an even/odd register pair.  The union
   maps a DItype onto that pair; the target is big-endian, so __h is the
   high word.  */
#define smul_ppmm(xh, xl, m0, m1) \
  do {									\
    union {DItype __ll;							\
	   struct {USItype __h, __l;} __i;				\
	  } __x;							\
    __asm__ ("lr %N0,%1\n\tmr %0,%2"					\
	     : "=&r" (__x.__ll)						\
	     : "r" (m0), "r" (m1));					\
    (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
  } while (0)
#define sdiv_qrnnd(q, r, n1, n0, d) \
  do {									\
    union {DItype __ll;							\
	   struct {USItype __h, __l;} __i;				\
	  } __x;							\
    __x.__i.__h = n1; __x.__i.__l = n0;					\
    __asm__ ("dr %0,%2"							\
	     : "=r" (__x.__ll)						\
	     : "0" (__x.__ll), "r" (d));				\
    (q) = __x.__i.__l; (r) = __x.__i.__h;				\
  } while (0)
#else
/* z/Architecture in 31-bit mode: pin the pair explicitly to r0/r1 with
   register-asm variables instead of relying on a union layout.  */
#define smul_ppmm(xh, xl, m0, m1) \
  do {                                                                  \
    register SItype __r0 __asm__ ("0");					\
    register SItype __r1 __asm__ ("1") = (m0);				\
									\
    __asm__ ("mr\t%%r0,%3"                                              \
	     : "=r" (__r0), "=r" (__r1)					\
	     : "r"  (__r1),  "r" (m1));					\
    (xh) = __r0; (xl) = __r1;						\
  } while (0)

#define sdiv_qrnnd(q, r, n1, n0, d) \
  do {									\
    register SItype __r0 __asm__ ("0") = (n1);				\
    register SItype __r1 __asm__ ("1") = (n0);				\
									\
    __asm__ ("dr\t%%r0,%4"                                              \
	     : "=r" (__r0), "=r" (__r1)					\
	     : "r" (__r0), "r" (__r1), "r" (d));			\
    (q) = __r1; (r) = __r0;						\
  } while (0)
#endif /* __zarch__ */
#endif
    433 
#if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
/* x86: the {AT&T|Intel} asm-dialect alternatives cover both syntaxes.
   add/adc and sub/sbb provide the carry chains; mull/divl use the
   fixed edx:eax register pair ("a"/"d" constraints).  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add{l} {%5,%1|%1,%5}\n\tadc{l} {%3,%0|%0,%3}"		\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "%0" ((USItype) (ah)),					\
	     "g" ((USItype) (bh)),					\
	     "%1" ((USItype) (al)),					\
	     "g" ((USItype) (bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub{l} {%5,%1|%1,%5}\n\tsbb{l} {%3,%0|%0,%3}"		\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "0" ((USItype) (ah)),					\
	     "g" ((USItype) (bh)),					\
	     "1" ((USItype) (al)),					\
	     "g" ((USItype) (bl)))
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mul{l} %3"							\
	   : "=a" ((USItype) (w0)),					\
	     "=d" ((USItype) (w1))					\
	   : "%0" ((USItype) (u)),					\
	     "rm" ((USItype) (v)))
#define udiv_qrnnd(q, r, n1, n0, dv) \
  __asm__ ("div{l} %4"							\
	   : "=a" ((USItype) (q)),					\
	     "=d" ((USItype) (r))					\
	   : "0" ((USItype) (n0)),					\
	     "1" ((USItype) (n1)),					\
	     "rm" ((USItype) (dv)))
#define count_leading_zeros(count, x)	((count) = __builtin_clz (x))
#define count_trailing_zeros(count, x)	((count) = __builtin_ctz (x))
#define UMUL_TIME 40
#define UDIV_TIME 40
#endif /* 80x86 */
    469 
#if (defined (__x86_64__) || defined (__i386__)) && W_TYPE_SIZE == 64
/* x86-64: same pattern as the 32-bit x86 block, with q-suffixed insns
   and rdx:rax for mulq/divq.  "e" additionally allows 32-bit
   sign-extended immediates.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add{q} {%5,%1|%1,%5}\n\tadc{q} {%3,%0|%0,%3}"		\
	   : "=r" ((UDItype) (sh)),					\
	     "=&r" ((UDItype) (sl))					\
	   : "%0" ((UDItype) (ah)),					\
	     "rme" ((UDItype) (bh)),					\
	     "%1" ((UDItype) (al)),					\
	     "rme" ((UDItype) (bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub{q} {%5,%1|%1,%5}\n\tsbb{q} {%3,%0|%0,%3}"		\
	   : "=r" ((UDItype) (sh)),					\
	     "=&r" ((UDItype) (sl))					\
	   : "0" ((UDItype) (ah)),					\
	     "rme" ((UDItype) (bh)),					\
	     "1" ((UDItype) (al)),					\
	     "rme" ((UDItype) (bl)))
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mul{q} %3"							\
	   : "=a" ((UDItype) (w0)),					\
	     "=d" ((UDItype) (w1))					\
	   : "%0" ((UDItype) (u)),					\
	     "rm" ((UDItype) (v)))
#define udiv_qrnnd(q, r, n1, n0, dv) \
  __asm__ ("div{q} %4"							\
	   : "=a" ((UDItype) (q)),					\
	     "=d" ((UDItype) (r))					\
	   : "0" ((UDItype) (n0)),					\
	     "1" ((UDItype) (n1)),					\
	     "rm" ((UDItype) (dv)))
#define count_leading_zeros(count, x)	((count) = __builtin_clzll (x))
#define count_trailing_zeros(count, x)	((count) = __builtin_ctzll (x))
#define UMUL_TIME 40
#define UDIV_TIME 40
#endif /* x86_64 */
    505 
#if defined (__i960__) && W_TYPE_SIZE == 32
/* i960: emul produces a 64-bit product in a register pair; the union
   here is little-endian (__l first).  Statement expressions are used
   instead of do/while.  */
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll;						\
	   struct {USItype __l, __h;} __i;				\
	  } __xx;							\
  __asm__ ("emul	%2,%1,%0"					\
	   : "=d" (__xx.__ll)						\
	   : "%dI" ((USItype) (u)),					\
	     "dI" ((USItype) (v)));					\
  (w1) = __xx.__i.__h; (w0) = __xx.__i.__l;})
#define __umulsidi3(u, v) \
  ({UDItype __w;							\
    __asm__ ("emul	%2,%1,%0"					\
	     : "=d" (__w)						\
	     : "%dI" ((USItype) (u)),					\
	       "dI" ((USItype) (v)));					\
    __w; })
#endif /* __i960__ */
    524 
#if defined (__ia64) && W_TYPE_SIZE == 64
/* This form encourages gcc (pre-release 3.4 at least) to emit predicated
   "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency.  The generic
   code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
   register, which takes an extra cycle.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl)				\
  do {									\
    UWtype __x;								\
    __x = (al) - (bl);							\
    if ((al) < (bl))							\
      (sh) = (ah) - (bh) - 1;						\
    else								\
      (sh) = (ah) - (bh);						\
    (sl) = __x;								\
  } while (0)

/* Do both product parts in assembly, since that gives better code with
   all gcc versions.  Some callers will just use the upper part, and in
   that situation we waste an instruction, but not any cycles.  */
#define umul_ppmm(ph, pl, m0, m1)					\
  __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0"		\
	   : "=&f" (ph), "=f" (pl)					\
	   : "f" (m0), "f" (m1))
/* clz via byte-reverse (mux1 @rev) + czx1.l to locate the highest
   nonzero byte, then a small binary search within that byte.  */
#define count_leading_zeros(count, x)					\
  do {									\
    UWtype _x = (x), _y, _a, _c;					\
    __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x));		\
    __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y));		\
    _c = (_a - 1) << 3;							\
    _x >>= _c;								\
    if (_x >= 1 << 4)							\
      _x >>= 4, _c += 4;						\
    if (_x >= 1 << 2)							\
      _x >>= 2, _c += 2;						\
    _c += _x >> 1;							\
    (count) =  W_TYPE_SIZE - 1 - _c;					\
  } while (0)
/* similar to what gcc does for __builtin_ffs, but 0 based rather than 1
   based, and we don't need a special case for x==0 here */
#define count_trailing_zeros(count, x)					\
  do {									\
    UWtype __ctz_x = (x);						\
    __asm__ ("popcnt %0 = %1"						\
	     : "=r" (count)						\
	     : "r" ((__ctz_x-1) & ~__ctz_x));				\
  } while (0)
#define UMUL_TIME 14
#endif
    573 
#if defined (__M32R__) && W_TYPE_SIZE == 32
/* M32R: addx/subx add/subtract with the condition bit as carry, so the
   cmp of a register against itself first clears that bit.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  /* The cmp clears the condition bit.  */ \
  __asm__ ("cmp %0,%0\n\taddx %1,%5\n\taddx %0,%3"			\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "0" ((USItype) (ah)),					\
	     "r" ((USItype) (bh)),					\
	     "1" ((USItype) (al)),					\
	     "r" ((USItype) (bl))					\
	   : "cbit")
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  /* The cmp clears the condition bit.  */ \
  __asm__ ("cmp %0,%0\n\tsubx %1,%5\n\tsubx %0,%3"			\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "0" ((USItype) (ah)),					\
	     "r" ((USItype) (bh)),					\
	     "1" ((USItype) (al)),					\
	     "r" ((USItype) (bl))					\
	   : "cbit")
#endif /* __M32R__ */
    596 
#if defined (__mc68000__) && W_TYPE_SIZE == 32
/* m68k: addx/subx use the X (extend) flag as the carry chain.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0"				\
	   : "=d" ((USItype) (sh)),					\
	     "=&d" ((USItype) (sl))					\
	   : "%0" ((USItype) (ah)),					\
	     "d" ((USItype) (bh)),					\
	     "%1" ((USItype) (al)),					\
	     "g" ((USItype) (bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0"				\
	   : "=d" ((USItype) (sh)),					\
	     "=&d" ((USItype) (sl))					\
	   : "0" ((USItype) (ah)),					\
	     "d" ((USItype) (bh)),					\
	     "1" ((USItype) (al)),					\
	     "g" ((USItype) (bl)))

/* The '020, '030, '040, '060 and CPU32 have 32x32->64 and 64/32->32q-32r.  */
#if (defined (__mc68020__) && !defined (__mc68060__))
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mulu%.l %3,%1:%0"						\
	   : "=d" ((USItype) (w0)),					\
	     "=d" ((USItype) (w1))					\
	   : "%0" ((USItype) (u)),					\
	     "dmi" ((USItype) (v)))
#define UMUL_TIME 45
#define udiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("divu%.l %4,%1:%0"						\
	   : "=d" ((USItype) (q)),					\
	     "=d" ((USItype) (r))					\
	   : "0" ((USItype) (n0)),					\
	     "1" ((USItype) (n1)),					\
	     "dmi" ((USItype) (d)))
#define UDIV_TIME 90
#define sdiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("divs%.l %4,%1:%0"						\
	   : "=d" ((USItype) (q)),					\
	     "=d" ((USItype) (r))					\
	   : "0" ((USItype) (n0)),					\
	     "1" ((USItype) (n1)),					\
	     "dmi" ((USItype) (d)))

#elif defined (__mcoldfire__) /* not mc68020 */

/* ColdFire: no 32x32->64 multiply insn; compose the product from four
   16x16 partial products using only mulu.w (note clr%.w, which ColdFire
   has, where the plain-68000 variant below uses eor%.w).  */
#define umul_ppmm(xh, xl, a, b) \
  __asm__ ("| Inlined umul_ppmm\n"					\
	   "	move%.l	%2,%/d0\n"					\
	   "	move%.l	%3,%/d1\n"					\
	   "	move%.l	%/d0,%/d2\n"					\
	   "	swap	%/d0\n"						\
	   "	move%.l	%/d1,%/d3\n"					\
	   "	swap	%/d1\n"						\
	   "	move%.w	%/d2,%/d4\n"					\
	   "	mulu	%/d3,%/d4\n"					\
	   "	mulu	%/d1,%/d2\n"					\
	   "	mulu	%/d0,%/d3\n"					\
	   "	mulu	%/d0,%/d1\n"					\
	   "	move%.l	%/d4,%/d0\n"					\
	   "	clr%.w	%/d0\n"						\
	   "	swap	%/d0\n"						\
	   "	add%.l	%/d0,%/d2\n"					\
	   "	add%.l	%/d3,%/d2\n"					\
	   "	jcc	1f\n"						\
	   "	add%.l	%#65536,%/d1\n"					\
	   "1:	swap	%/d2\n"						\
	   "	moveq	%#0,%/d0\n"					\
	   "	move%.w	%/d2,%/d0\n"					\
	   "	move%.w	%/d4,%/d2\n"					\
	   "	move%.l	%/d2,%1\n"					\
	   "	add%.l	%/d1,%/d0\n"					\
	   "	move%.l	%/d0,%0"					\
	   : "=g" ((USItype) (xh)),					\
	     "=g" ((USItype) (xl))					\
	   : "g" ((USItype) (a)),					\
	     "g" ((USItype) (b))					\
	   : "d0", "d1", "d2", "d3", "d4")
#define UMUL_TIME 100
#define UDIV_TIME 400
#else /* not ColdFire */
/* %/ inserts REGISTER_PREFIX, %# inserts IMMEDIATE_PREFIX.  */
#define umul_ppmm(xh, xl, a, b) \
  __asm__ ("| Inlined umul_ppmm\n"					\
	   "	move%.l	%2,%/d0\n"					\
	   "	move%.l	%3,%/d1\n"					\
	   "	move%.l	%/d0,%/d2\n"					\
	   "	swap	%/d0\n"						\
	   "	move%.l	%/d1,%/d3\n"					\
	   "	swap	%/d1\n"						\
	   "	move%.w	%/d2,%/d4\n"					\
	   "	mulu	%/d3,%/d4\n"					\
	   "	mulu	%/d1,%/d2\n"					\
	   "	mulu	%/d0,%/d3\n"					\
	   "	mulu	%/d0,%/d1\n"					\
	   "	move%.l	%/d4,%/d0\n"					\
	   "	eor%.w	%/d0,%/d0\n"					\
	   "	swap	%/d0\n"						\
	   "	add%.l	%/d0,%/d2\n"					\
	   "	add%.l	%/d3,%/d2\n"					\
	   "	jcc	1f\n"						\
	   "	add%.l	%#65536,%/d1\n"					\
	   "1:	swap	%/d2\n"						\
	   "	moveq	%#0,%/d0\n"					\
	   "	move%.w	%/d2,%/d0\n"					\
	   "	move%.w	%/d4,%/d2\n"					\
	   "	move%.l	%/d2,%1\n"					\
	   "	add%.l	%/d1,%/d0\n"					\
	   "	move%.l	%/d0,%0"					\
	   : "=g" ((USItype) (xh)),					\
	     "=g" ((USItype) (xl))					\
	   : "g" ((USItype) (a)),					\
	     "g" ((USItype) (b))					\
	   : "d0", "d1", "d2", "d3", "d4")
#define UMUL_TIME 100
#define UDIV_TIME 400

#endif /* not mc68020 */

/* The '020, '030, '040 and '060 have bitfield insns.
   cpu32 disguises as a 68020, but lacks them.  */
#if defined (__mc68020__) && !defined (__mcpu32__)
#define count_leading_zeros(count, x) \
  __asm__ ("bfffo %1{%b2:%b2},%0"					\
	   : "=d" ((USItype) (count))					\
	   : "od" ((USItype) (x)), "n" (0))
/* Some ColdFire architectures have a ff1 instruction supported via
   __builtin_clz. */
#elif defined (__mcfisaaplus__) || defined (__mcfisac__)
#define count_leading_zeros(count,x) ((count) = __builtin_clz (x))
#define COUNT_LEADING_ZEROS_0 32
#endif
#endif /* mc68000 */
    729 
/* Motorola 88000 family.  addu.co/subu.co set the carry bit; the .ci
   forms consume it, giving a two-insn double-word add/subtract.  */
#if defined (__m88000__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3"			\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "%rJ" ((USItype) (ah)),					\
	     "rJ" ((USItype) (bh)),					\
	     "%rJ" ((USItype) (al)),					\
	     "rJ" ((USItype) (bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3"			\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "rJ" ((USItype) (ah)),					\
	     "rJ" ((USItype) (bh)),					\
	     "rJ" ((USItype) (al)),					\
	     "rJ" ((USItype) (bl)))
/* ff1 yields the bit number of the highest set bit; XOR with 31
   converts that to a leading-zero count.  */
#define count_leading_zeros(count, x) \
  do {									\
    USItype __cbtmp;							\
    __asm__ ("ff1 %0,%1"						\
	     : "=r" (__cbtmp)						\
	     : "r" ((USItype) (x)));					\
    (count) = __cbtmp ^ 31;						\
  } while (0)
#define COUNT_LEADING_ZEROS_0 63 /* sic */
#if defined (__mc88110__)
/* The 88110 has full 64-bit multiply and divide (mulu.d / divu.d)
   writing a register pair, accessed here through a union.  */
#define umul_ppmm(wh, wl, u, v) \
  do {									\
    union {UDItype __ll;						\
	   struct {USItype __h, __l;} __i;				\
	  } __xx;							\
    __asm__ ("mulu.d	%0,%1,%2"					\
	     : "=r" (__xx.__ll)						\
	     : "r" ((USItype) (u)),					\
	       "r" ((USItype) (v)));					\
    (wh) = __xx.__i.__h;						\
    (wl) = __xx.__i.__l;						\
  } while (0)
#define udiv_qrnnd(q, r, n1, n0, d) \
  ({union {UDItype __ll;						\
	   struct {USItype __h, __l;} __i;				\
	  } __xx;							\
  USItype __q;								\
  __xx.__i.__h = (n1); __xx.__i.__l = (n0);				\
  __asm__ ("divu.d %0,%1,%2"						\
	   : "=r" (__q)							\
	   : "r" (__xx.__ll),						\
	     "r" ((USItype) (d)));					\
  (r) = (n0) - __q * (d); (q) = __q; })
#define UMUL_TIME 5
#define UDIV_TIME 25
#else
#define UMUL_TIME 17
#define UDIV_TIME 150
#endif /* __mc88110__ */
#endif /* __m88000__ */
    787 
/* Matsushita MN10300.  The AM33 extension has three-operand multiply
   insns; the base ISA uses the implicit MDR register (the "z"
   constraint) and needs two leading nops before the multiply.  */
#if defined (__mn10300__)
# if defined (__AM33__)
#  define count_leading_zeros(COUNT,X)	((COUNT) = __builtin_clz (X))
#  define umul_ppmm(w1, w0, u, v)		\
    asm("mulu %3,%2,%1,%0" : "=r"(w0), "=r"(w1) : "r"(u), "r"(v))
#  define smul_ppmm(w1, w0, u, v)		\
    asm("mul %3,%2,%1,%0" : "=r"(w0), "=r"(w1) : "r"(u), "r"(v))
# else
#  define umul_ppmm(w1, w0, u, v)		\
    asm("nop; nop; mulu %3,%0" : "=d"(w0), "=z"(w1) : "%0"(u), "d"(v))
#  define smul_ppmm(w1, w0, u, v)		\
    asm("nop; nop; mul %3,%0" : "=d"(w0), "=z"(w1) : "%0"(u), "d"(v))
# endif
/* Double-word add/sub done in C through DWunion (presumably defined by
   the including translation unit -- confirm); the compiler emits the
   carry handling itself.  */
# define add_ssaaaa(sh, sl, ah, al, bh, bl)	\
  do {						\
    DWunion __s, __a, __b;			\
    __a.s.low = (al); __a.s.high = (ah);	\
    __b.s.low = (bl); __b.s.high = (bh);	\
    __s.ll = __a.ll + __b.ll;			\
    (sl) = __s.s.low; (sh) = __s.s.high;	\
  } while (0)
# define sub_ddmmss(sh, sl, ah, al, bh, bl)	\
  do {						\
    DWunion __s, __a, __b;			\
    __a.s.low = (al); __a.s.high = (ah);	\
    __b.s.low = (bl); __b.s.high = (bh);	\
    __s.ll = __a.ll - __b.ll;			\
    (sl) = __s.s.low; (sh) = __s.s.high;	\
  } while (0)
/* 64/32 divide through the MDR register pair: nh:nl in, q and r out.  */
# define udiv_qrnnd(q, r, nh, nl, d)		\
  asm("divu %2,%0" : "=D"(q), "=z"(r) : "D"(d), "0"(nl), "1"(nh))
# define sdiv_qrnnd(q, r, nh, nl, d)		\
  asm("div %2,%0" : "=D"(q), "=z"(r) : "D"(d), "0"(nl), "1"(nh))
# define UMUL_TIME 3
# define UDIV_TIME 38
#endif
    824 
    825 #if defined (__mips__) && W_TYPE_SIZE == 32
/* 32x32->64 multiply via the compiler's native 64-bit support;
   w1 receives the high word, w0 the low word.  Operands are read
   into __x before the outputs are written, so w1/w0 may alias u/v.  */
#define umul_ppmm(w1, w0, u, v)						\
  do {									\
    UDItype __x = (UDItype) (USItype) (u) * (USItype) (v);		\
    (w1) = (USItype) (__x >> 32);					\
    (w0) = (USItype) (__x);						\
  } while (0)
    832 #define UMUL_TIME 10
    833 #define UDIV_TIME 100
    834 
    835 #if (__mips == 32 || __mips == 64) && ! __mips16
    836 #define count_leading_zeros(COUNT,X)	((COUNT) = __builtin_clz (X))
    837 #define COUNT_LEADING_ZEROS_0 32
    838 #endif
    839 #endif /* __mips__ */
    840 
/* National Semiconductor 32000 series.  meid/deid are the extended
   (64-bit) multiply and divide; results come back in a register pair,
   unpacked here through a little-endian union.  */
#if defined (__ns32000__) && W_TYPE_SIZE == 32
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll;						\
	   struct {USItype __l, __h;} __i;				\
	  } __xx;							\
  __asm__ ("meid %2,%0"							\
	   : "=g" (__xx.__ll)						\
	   : "%0" ((USItype) (u)),					\
	     "g" ((USItype) (v)));					\
  (w1) = __xx.__i.__h; (w0) = __xx.__i.__l;})
#define __umulsidi3(u, v) \
  ({UDItype __w;							\
    __asm__ ("meid %2,%0"						\
	     : "=g" (__w)						\
	     : "%0" ((USItype) (u)),					\
	       "g" ((USItype) (v)));					\
    __w; })
#define udiv_qrnnd(q, r, n1, n0, d) \
  ({union {UDItype __ll;						\
	   struct {USItype __l, __h;} __i;				\
	  } __xx;							\
  __xx.__i.__h = (n1); __xx.__i.__l = (n0);				\
  __asm__ ("deid %2,%0"							\
	   : "=g" (__xx.__ll)						\
	   : "0" (__xx.__ll),						\
	     "g" ((USItype) (d)));					\
  (r) = __xx.__i.__l; (q) = __xx.__i.__h; })
/* ffsd continues a find-first-set from a prior count; seeded with 0
   here it gives the trailing-zero count directly.  */
#define count_trailing_zeros(count,x) \
  do {									\
    __asm__ ("ffsd     %2,%0"						\
	    : "=r" ((USItype) (count))					\
	    : "0" ((USItype) 0),					\
	      "r" ((USItype) (x)));					\
  } while (0)
#endif /* __ns32000__ */
    876 
/* FIXME: We should test _IBMR2 here when we add assembly support for the
   system vendor compilers.
   FIXME: What's needed for gcc PowerPC VxWorks?  __vxworks__ is not good
   enough, since that hits ARM and m68k too.  */
#if (defined (_ARCH_PPC)	/* AIX */				\
     || defined (__powerpc__)	/* gcc */				\
     || defined (__POWERPC__)	/* BEOS */				\
     || defined (__ppc__)	/* Darwin */				\
     || (defined (PPC) && ! defined (CPU_FAMILY)) /* gcc 2.7.x GNU&SysV */    \
     || (defined (PPC) && defined (CPU_FAMILY)    /* VxWorks */               \
	 && CPU_FAMILY == PPC)                                                \
     ) && W_TYPE_SIZE == 32
/* When bh is a compile-time 0 or ~0 the high-part add collapses to
   addze/addme, saving a register; otherwise use the generic
   addc/adde pair.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do {									\
    if (__builtin_constant_p (bh) && (bh) == 0)				\
      __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2"		\
	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)		\
      __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2"		\
	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    else								\
      __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3"		\
	     : "=r" (sh), "=&r" (sl)					\
	     : "%r" (ah), "r" (bh), "%r" (al), "rI" (bl));		\
  } while (0)
/* Same idea for subtraction: constant ah or bh of 0 / ~0 lets the
   high part use subfze/subfme/addme/addze.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {									\
    if (__builtin_constant_p (ah) && (ah) == 0)				\
      __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2"	\
	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0)		\
      __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2"	\
	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == 0)			\
      __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2"		\
	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)		\
      __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2"		\
	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    else								\
      __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2"	\
	       : "=r" (sh), "=&r" (sl)					\
	       : "r" (ah), "r" (bh), "rI" (al), "r" (bl));		\
  } while (0)
#define count_leading_zeros(count, x) \
  __asm__ ("cntlzw %0,%1" : "=r" (count) : "r" (x))
#define COUNT_LEADING_ZEROS_0 32
#if defined (_ARCH_PPC) || defined (__powerpc__) || defined (__POWERPC__) \
  || defined (__ppc__)                                                    \
  || (defined (PPC) && ! defined (CPU_FAMILY)) /* gcc 2.7.x GNU&SysV */       \
  || (defined (PPC) && defined (CPU_FAMILY)    /* VxWorks */                  \
	 && CPU_FAMILY == PPC)
/* mulhwu produces the high 32 bits; the low product is a plain C
   multiply of the saved copies, correct even if ph aliases m0/m1.  */
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    USItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1));	\
    (pl) = __m0 * __m1;							\
  } while (0)
#define UMUL_TIME 15
#define smul_ppmm(ph, pl, m0, m1) \
  do {									\
    SItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1));	\
    (pl) = __m0 * __m1;							\
  } while (0)
#define SMUL_TIME 14
#define UDIV_TIME 120
#endif
#endif /* 32-bit POWER architecture variants.  */
    946 
/* We should test _IBMR2 here when we add assembly support for the system
   vendor compilers.  */
#if (defined (_ARCH_PPC64) || defined (__powerpc64__)) && W_TYPE_SIZE == 64
/* 64-bit twins of the 32-bit PowerPC macros above: same constant-high
   special cases, with cntlzd/mulhdu instead of cntlzw/mulhwu.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do {									\
    if (__builtin_constant_p (bh) && (bh) == 0)				\
      __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2"		\
	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)		\
      __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2"		\
	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    else								\
      __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3"		\
	     : "=r" (sh), "=&r" (sl)					\
	     : "%r" (ah), "r" (bh), "%r" (al), "rI" (bl));		\
  } while (0)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {									\
    if (__builtin_constant_p (ah) && (ah) == 0)				\
      __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2"	\
	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)		\
      __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2"	\
	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == 0)			\
      __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2"		\
	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)		\
      __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2"		\
	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    else								\
      __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2"	\
	       : "=r" (sh), "=&r" (sl)					\
	       : "r" (ah), "r" (bh), "rI" (al), "r" (bl));		\
  } while (0)
#define count_leading_zeros(count, x) \
  __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
#define COUNT_LEADING_ZEROS_0 64
/* mulhdu gives the high 64 bits; the low half is a C multiply of the
   saved copies, correct even if ph aliases m0/m1.  */
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    UDItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1));	\
    (pl) = __m0 * __m1;							\
  } while (0)
#define UMUL_TIME 15
#define smul_ppmm(ph, pl, m0, m1) \
  do {									\
    DItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1));	\
    (pl) = __m0 * __m1;							\
  } while (0)
#define SMUL_TIME 14  /* ??? */
#define UDIV_TIME 120 /* ??? */
#endif /* 64-bit PowerPC.  */
   1001 
/* IBM RT/ROMP.  Multiplication is built from sixteen multiply-step
   'm' instructions; the signed product is corrected to unsigned
   afterwards.  */
#if defined (__ibm032__) /* RT/ROMP */ && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("a %1,%5\n\tae %0,%3"					\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "%0" ((USItype) (ah)),					\
	     "r" ((USItype) (bh)),					\
	     "%1" ((USItype) (al)),					\
	     "r" ((USItype) (bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("s %1,%5\n\tse %0,%3"					\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "0" ((USItype) (ah)),					\
	     "r" ((USItype) (bh)),					\
	     "1" ((USItype) (al)),					\
	     "r" ((USItype) (bl)))
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    USItype __m0 = (m0), __m1 = (m1);					\
    __asm__ (								\
       "s	r2,r2\n"						\
"	mts	r10,%2\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	m	r2,%3\n"						\
"	cas	%0,r2,r0\n"						\
"	mfs	r10,%1"							\
	     : "=r" ((USItype) (ph)),					\
	       "=r" ((USItype) (pl))					\
	     : "%r" (__m0),						\
		"r" (__m1)						\
	     : "r2");							\
    /* Correct the signed product to unsigned: add back the operand	\
       masked by the other operand's sign bit.  */			\
    (ph) += ((((SItype) __m0 >> 31) & __m1)				\
	     + (((SItype) __m1 >> 31) & __m0));				\
  } while (0)
#define UMUL_TIME 20
#define UDIV_TIME 200
/* clz only examines 16 bits at a time, so dispatch on whether the
   high half is nonzero and bias by 16 for the low half.  */
#define count_leading_zeros(count, x) \
  do {									\
    if ((x) >= 0x10000)							\
      __asm__ ("clz	%0,%1"						\
	       : "=r" ((USItype) (count))				\
	       : "r" ((USItype) (x) >> 16));				\
    else								\
      {									\
	__asm__ ("clz	%0,%1"						\
		 : "=r" ((USItype) (count))				\
		 : "r" ((USItype) (x)));					\
	(count) += 16;							\
      }									\
  } while (0)
#endif
   1068 
/* Renesas/SuperH SH (non-SHmedia).  dmulu.l leaves the 64-bit product
   in the mach:macl register pair.  */
#if defined(__sh__) && !__SHMEDIA__ && W_TYPE_SIZE == 32
#ifndef __sh1__
#define umul_ppmm(w1, w0, u, v) \
  __asm__ (								\
       "dmulu.l	%2,%3\n\tsts%M1	macl,%1\n\tsts%M0	mach,%0"	\
	   : "=r<" ((USItype)(w1)),					\
	     "=r<" ((USItype)(w0))					\
	   : "r" ((USItype)(u)),					\
	     "r" ((USItype)(v))						\
	   : "macl", "mach")
#define UMUL_TIME 5
#endif

/* This is the same algorithm as __udiv_qrnnd_c.  */
#define UDIV_NEEDS_NORMALIZATION 1

/* Two calls to the out-of-line 16-bit division helper produce the two
   quotient halves; register usage is fixed by the helper's contract.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do {									\
    extern UWtype __udiv_qrnnd_16 (UWtype, UWtype)			\
			__attribute__ ((visibility ("hidden")));	\
    /* r0: rn r1: qn */ /* r0: n1 r4: n0 r5: d r6: d1 */ /* r2: __m */	\
    __asm__ (								\
	"mov%M4 %4,r5\n"						\
"	swap.w %3,r4\n"							\
"	swap.w r5,r6\n"							\
"	jsr @%5\n"							\
"	shll16 r6\n"							\
"	swap.w r4,r4\n"							\
"	jsr @%5\n"							\
"	swap.w r1,%0\n"							\
"	or r1,%0"							\
	: "=r" (q), "=&z" (r)						\
	: "1" (n1), "r" (n0), "rm" (d), "r" (&__udiv_qrnnd_16)		\
	: "r1", "r2", "r4", "r5", "r6", "pr", "t");			\
  } while (0)

#define UDIV_TIME 80

/* clrt clears the T (carry) bit before the chained subc pair.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl)				\
  __asm__ ("clrt;subc %5,%1; subc %4,%0"				\
	   : "=r" (sh), "=r" (sl)					\
	   : "0" (ah), "1" (al), "r" (bh), "r" (bl) : "t")

#endif /* __sh__ */
   1113 
   1114 #if defined (__SH5__) && __SHMEDIA__ && W_TYPE_SIZE == 32
/* 32x32->64 multiply.  Arguments are parenthesized so that expression
   operands (e.g. a + b) expand correctly inside the casts.  */
#define __umulsidi3(u,v) ((UDItype)(USItype)(u)*(USItype)(v))
   1116 #define count_leading_zeros(count, x) \
   1117   do									\
   1118     {									\
   1119       UDItype x_ = (USItype)(x);					\
   1120       SItype c_;							\
   1121 									\
   1122       __asm__ ("nsb %1, %0" : "=r" (c_) : "r" (x_));			\
   1123       (count) = c_ - 31;						\
   1124     }									\
   1125   while (0)
   1126 #define COUNT_LEADING_ZEROS_0 32
   1127 #endif
   1128 
/* 32-bit SPARC.  Four tiers, best first: V9 (64-bit umul), V8 (umul/
   udiv), sparclite (umul + divscc steps), and plain V7 (mulscc steps,
   software-style division).  */
#if defined (__sparc__) && !defined (__arch64__) && !defined (__sparcv9) \
    && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0"				\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "%rJ" ((USItype) (ah)),					\
	     "rI" ((USItype) (bh)),					\
	     "%rJ" ((USItype) (al)),					\
	     "rI" ((USItype) (bl))					\
	   __CLOBBER_CC)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0"				\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "rJ" ((USItype) (ah)),					\
	     "rI" ((USItype) (bh)),					\
	     "rJ" ((USItype) (al)),					\
	     "rI" ((USItype) (bl))					\
	   __CLOBBER_CC)
#if defined (__sparc_v9__)
/* V9 running 32-bit: umul leaves the full 64-bit product in one
   register, so extract the high word with srlx.  */
#define umul_ppmm(w1, w0, u, v) \
  do {									\
    register USItype __g1 asm ("g1");					\
    __asm__ ("umul\t%2,%3,%1\n\t"					\
	     "srlx\t%1, 32, %0"						\
	     : "=r" ((USItype) (w1)),					\
	       "=r" (__g1)						\
	     : "r" ((USItype) (u)),					\
	       "r" ((USItype) (v)));					\
    (w0) = __g1;							\
  } while (0)
/* Divide n1:n0 by d via the Y register; the remainder is recomputed
   as n0 - q*d.  */
#define udiv_qrnnd(__q, __r, __n1, __n0, __d) \
  __asm__ ("mov\t%2,%%y\n\t"						\
	   "udiv\t%3,%4,%0\n\t"						\
	   "umul\t%0,%4,%1\n\t"						\
	   "sub\t%3,%1,%1"						\
	   : "=&r" ((USItype) (__q)),					\
	     "=&r" ((USItype) (__r))					\
	   : "r" ((USItype) (__n1)),					\
	     "r" ((USItype) (__n0)),					\
	     "r" ((USItype) (__d)))
#else
#if defined (__sparc_v8__)
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0"					\
	   : "=r" ((USItype) (w1)),					\
	     "=r" ((USItype) (w0))					\
	   : "r" ((USItype) (u)),					\
	     "r" ((USItype) (v)))
/* The nops satisfy the delay between writing Y and the udiv.  */
#define udiv_qrnnd(__q, __r, __n1, __n0, __d) \
  __asm__ ("mov %2,%%y;nop;nop;nop;udiv %3,%4,%0;umul %0,%4,%1;sub %3,%1,%1"\
	   : "=&r" ((USItype) (__q)),					\
	     "=&r" ((USItype) (__r))					\
	   : "r" ((USItype) (__n1)),					\
	     "r" ((USItype) (__n0)),					\
	     "r" ((USItype) (__d)))
#else
#if defined (__sparclite__)
/* This has hardware multiply but not divide.  It also has two additional
   instructions scan (ffs from high bit) and divscc.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0"					\
	   : "=r" ((USItype) (w1)),					\
	     "=r" ((USItype) (w0))					\
	   : "r" ((USItype) (u)),					\
	     "r" ((USItype) (v)))
/* 32 divscc steps develop one quotient bit each; the final branch
   corrects a possibly negative remainder.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("! Inlined udiv_qrnnd\n"					\
"	wr	%%g0,%2,%%y	! Not a delayed write for sparclite\n"	\
"	tst	%%g0\n"							\
"	divscc	%3,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%0\n"						\
"	rd	%%y,%1\n"						\
"	bl,a 1f\n"							\
"	add	%1,%4,%1\n"						\
"1:	! End of inline udiv_qrnnd"					\
	   : "=r" ((USItype) (q)),					\
	     "=r" ((USItype) (r))					\
	   : "r" ((USItype) (n1)),					\
	     "r" ((USItype) (n0)),					\
	     "rI" ((USItype) (d))					\
	   : "g1" __AND_CLOBBER_CC)
#define UDIV_TIME 37
#define count_leading_zeros(count, x) \
  do {                                                                  \
  __asm__ ("scan %1,1,%0"                                               \
	   : "=r" ((USItype) (count))                                   \
	   : "r" ((USItype) (x)));					\
  } while (0)
/* Early sparclites return 63 for an argument of 0, but they warn that future
   implementations might change this.  Therefore, leave COUNT_LEADING_ZEROS_0
   undefined.  */
#else
/* SPARC without integer multiplication and divide instructions.
   (i.e. at least Sun4/20,40,60,65,75,110,260,280,330,360,380,470,490) */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("! Inlined umul_ppmm\n"					\
"	wr	%%g0,%2,%%y	! SPARC has 0-3 delay insn after a wr\n"\
"	sra	%3,31,%%o5	! Don't move this insn\n"		\
"	and	%2,%%o5,%%o5	! Don't move this insn\n"		\
"	andcc	%%g0,0,%%g1	! Don't move this insn\n"		\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,0,%%g1\n"						\
"	add	%%g1,%%o5,%0\n"						\
"	rd	%%y,%1"							\
	   : "=r" ((USItype) (w1)),					\
	     "=r" ((USItype) (w0))					\
	   : "%rI" ((USItype) (u)),					\
	     "r" ((USItype) (v))						\
	   : "g1", "o5" __AND_CLOBBER_CC)
#define UMUL_TIME 39		/* 39 instructions */
/* It's quite necessary to add this much assembler for the sparc.
   The default udiv_qrnnd (in C) is more than 10 times slower!  */
#define udiv_qrnnd(__q, __r, __n1, __n0, __d) \
  __asm__ ("! Inlined udiv_qrnnd\n"					\
"	mov	32,%%g1\n"						\
"	subcc	%1,%2,%%g0\n"						\
"1:	bcs	5f\n"							\
"	 addxcc %0,%0,%0	! shift n1n0 and a q-bit in lsb\n"	\
"	sub	%1,%2,%1	! this kills msb of n\n"		\
"	addx	%1,%1,%1	! so this can't give carry\n"		\
"	subcc	%%g1,1,%%g1\n"						\
"2:	bne	1b\n"							\
"	 subcc	%1,%2,%%g0\n"						\
"	bcs	3f\n"							\
"	 addxcc %0,%0,%0	! shift n1n0 and a q-bit in lsb\n"	\
"	b	3f\n"							\
"	 sub	%1,%2,%1	! this kills msb of n\n"		\
"4:	sub	%1,%2,%1\n"						\
"5:	addxcc	%1,%1,%1\n"						\
"	bcc	2b\n"							\
"	 subcc	%%g1,1,%%g1\n"						\
"! Got carry from n.  Subtract next step to cancel this carry.\n"	\
"	bne	4b\n"							\
"	 addcc	%0,%0,%0	! shift n1n0 and a 0-bit in lsb\n"	\
"	sub	%1,%2,%1\n"						\
"3:	xnor	%0,0,%0\n"						\
"	! End of inline udiv_qrnnd"					\
	   : "=&r" ((USItype) (__q)),					\
	     "=&r" ((USItype) (__r))					\
	   : "r" ((USItype) (__d)),					\
	     "1" ((USItype) (__n1)),					\
	     "0" ((USItype) (__n0)) : "g1" __AND_CLOBBER_CC)
#define UDIV_TIME (3+7*32)	/* 7 instructions/iteration. 32 iterations.  */
#endif /* __sparclite__ */
#endif /* __sparc_v8__ */
#endif /* __sparc_v9__ */
#endif /* sparc32 */
   1339 
/* 64-bit SPARC (V9).  Carries come from the 32-bit condition codes,
   so add/sub materialize the carry with a conditional move on %xcc.  */
#if ((defined (__sparc__) && defined (__arch64__)) || defined (__sparcv9)) \
    && W_TYPE_SIZE == 64
#define add_ssaaaa(sh, sl, ah, al, bh, bl)				\
  do {									\
    UDItype __carry = 0;						\
    __asm__ ("addcc\t%r5,%6,%1\n\t"					\
	     "add\t%r3,%4,%0\n\t"					\
	     "movcs\t%%xcc, 1, %2\n\t"					\
	     "add\t%0, %2, %0"						\
	     : "=r" ((UDItype)(sh)),				      	\
	       "=&r" ((UDItype)(sl)),				      	\
	       "+r" (__carry)				      		\
	     : "%rJ" ((UDItype)(ah)),				     	\
	       "rI" ((UDItype)(bh)),				      	\
	       "%rJ" ((UDItype)(al)),				     	\
	       "rI" ((UDItype)(bl))				       	\
	     __CLOBBER_CC);						\
  } while (0)

#define sub_ddmmss(sh, sl, ah, al, bh, bl)				\
  do {									\
    UDItype __carry = 0;						\
    __asm__ ("subcc\t%r5,%6,%1\n\t"					\
	     "sub\t%r3,%4,%0\n\t"					\
	     "movcs\t%%xcc, 1, %2\n\t"					\
	     "sub\t%0, %2, %0"						\
	     : "=r" ((UDItype)(sh)),				      	\
	       "=&r" ((UDItype)(sl)),				      	\
	       "+r" (__carry)				      		\
	     : "%rJ" ((UDItype)(ah)),				     	\
	       "rI" ((UDItype)(bh)),				      	\
	       "%rJ" ((UDItype)(al)),				     	\
	       "rI" ((UDItype)(bl))				       	\
	     __CLOBBER_CC);						\
  } while (0)

/* 64x64->128 multiply synthesized from four 32x32->64 mulx products
   with carry fix-ups; only the canonical cross-product schoolbook
   scheme, kept entirely in registers.  */
#define umul_ppmm(wh, wl, u, v)						\
  do {									\
	  UDItype tmp1, tmp2, tmp3, tmp4;				\
	  __asm__ __volatile__ (					\
		   "srl %7,0,%3\n\t"					\
		   "mulx %3,%6,%1\n\t"					\
		   "srlx %6,32,%2\n\t"					\
		   "mulx %2,%3,%4\n\t"					\
		   "sllx %4,32,%5\n\t"					\
		   "srl %6,0,%3\n\t"					\
		   "sub %1,%5,%5\n\t"					\
		   "srlx %5,32,%5\n\t"					\
		   "addcc %4,%5,%4\n\t"					\
		   "srlx %7,32,%5\n\t"					\
		   "mulx %3,%5,%3\n\t"					\
		   "mulx %2,%5,%5\n\t"					\
		   "sethi %%hi(0x80000000),%2\n\t"			\
		   "addcc %4,%3,%4\n\t"					\
		   "srlx %4,32,%4\n\t"					\
		   "add %2,%2,%2\n\t"					\
		   "movcc %%xcc,%%g0,%2\n\t"				\
		   "addcc %5,%4,%5\n\t"					\
		   "sllx %3,32,%3\n\t"					\
		   "add %1,%3,%1\n\t"					\
		   "add %5,%2,%0"					\
	   : "=r" ((UDItype)(wh)),					\
	     "=&r" ((UDItype)(wl)),					\
	     "=&r" (tmp1), "=&r" (tmp2), "=&r" (tmp3), "=&r" (tmp4)	\
	   : "r" ((UDItype)(u)),					\
	     "r" ((UDItype)(v))						\
	   __CLOBBER_CC);						\
  } while (0)
#define UMUL_TIME 96
#define UDIV_TIME 230
#endif /* sparc64 */
   1411 
/* DEC VAX.  addl2/adwc and subl2/sbwc chain the carry; emul/ediv do
   32x32->64 multiply and 64/32 divide, both signed.  */
#if defined (__vax__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addl2 %5,%1\n\tadwc %3,%0"					\
	   : "=g" ((USItype) (sh)),					\
	     "=&g" ((USItype) (sl))					\
	   : "%0" ((USItype) (ah)),					\
	     "g" ((USItype) (bh)),					\
	     "%1" ((USItype) (al)),					\
	     "g" ((USItype) (bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subl2 %5,%1\n\tsbwc %3,%0"					\
	   : "=g" ((USItype) (sh)),					\
	     "=&g" ((USItype) (sl))					\
	   : "0" ((USItype) (ah)),					\
	     "g" ((USItype) (bh)),					\
	     "1" ((USItype) (al)),					\
	     "g" ((USItype) (bl)))
/* emul is signed, so the high word is corrected to the unsigned
   product by adding back each operand under the other's sign mask.  */
#define umul_ppmm(xh, xl, m0, m1) \
  do {									\
    union {								\
	UDItype __ll;							\
	struct {USItype __l, __h;} __i;					\
      } __xx;								\
    USItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("emul %1,%2,$0,%0"						\
	     : "=r" (__xx.__ll)						\
	     : "g" (__m0),						\
	       "g" (__m1));						\
    (xh) = __xx.__i.__h;						\
    (xl) = __xx.__i.__l;						\
    (xh) += ((((SItype) __m0 >> 31) & __m1)				\
	     + (((SItype) __m1 >> 31) & __m0));				\
  } while (0)
#define sdiv_qrnnd(q, r, n1, n0, d) \
  do {									\
    union {DItype __ll;							\
	   struct {SItype __l, __h;} __i;				\
	  } __xx;							\
    __xx.__i.__h = n1; __xx.__i.__l = n0;				\
    __asm__ ("ediv %3,%2,%0,%1"						\
	     : "=g" (q), "=g" (r)					\
	     : "g" (__xx.__ll), "g" (d));				\
  } while (0)
#endif /* __vax__ */
   1456 
   1457 #ifdef _TMS320C6X
   1458 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
   1459   do									\
   1460     {									\
   1461       UDItype __ll;							\
   1462       __asm__ ("addu .l1 %1, %2, %0"					\
   1463 	       : "=a" (__ll) : "a" (al), "a" (bl));			\
   1464       (sl) = (USItype)__ll;						\
   1465       (sh) = ((USItype)(__ll >> 32)) + (ah) + (bh);			\
   1466     }									\
   1467   while (0)
   1468 
   1469 #ifdef _TMS320C6400_PLUS
/* 32x32->64 multiply.  Arguments are parenthesized so that expression
   operands (e.g. a + b) expand correctly inside the casts.  */
#define __umulsidi3(u,v) ((UDItype)(USItype)(u)*(USItype)(v))
/* 32x32->64 multiply via the compiler's 64-bit support; w1 gets the
   high word, w0 the low word.  Inputs are consumed before outputs are
   written, so w1/w0 may alias u/v.  */
#define umul_ppmm(w1, w0, u, v)						\
  do {									\
    UDItype __x = (UDItype) (USItype) (u) * (USItype) (v);		\
    (w1) = (USItype) (__x >> 32);					\
    (w0) = (USItype) (__x);						\
  } while (0)
   1477 #endif  /* _TMS320C6400_PLUS */
   1478 
   1479 #define count_leading_zeros(count, x)	((count) = __builtin_clz (x))
   1480 #ifdef _TMS320C6400
   1481 #define count_trailing_zeros(count, x)	((count) = __builtin_ctz (x))
   1482 #endif
   1483 #define UMUL_TIME 4
   1484 #define UDIV_TIME 40
   1485 #endif /* _TMS320C6X */
   1486 
   1487 #if defined (__xtensa__) && W_TYPE_SIZE == 32
   1488 /* This code is not Xtensa-configuration-specific, so rely on the compiler
   1489    to expand builtin functions depending on what configuration features
   1490    are available.  This avoids library calls when the operation can be
   1491    performed in-line.  */
/* 32x32 -> 64 unsigned multiply via the compiler builtin; the DWunion
   splits the 64-bit product into its high and low words.  */
#define umul_ppmm(w1, w0, u, v)						\
  do {									\
    DWunion __w;							\
    __w.ll = __builtin_umulsidi3 (u, v);				\
    w1 = __w.s.high;							\
    w0 = __w.s.low;							\
  } while (0)
#define __umulsidi3(u, v)		__builtin_umulsidi3 (u, v)
/* NOTE: both builtins below are undefined for a zero argument.  */
#define count_leading_zeros(COUNT, X)	((COUNT) = __builtin_clz (X))
#define count_trailing_zeros(COUNT, X)	((COUNT) = __builtin_ctz (X))
   1502 #endif /* __xtensa__ */
   1503 
   1504 #if defined xstormy16
/* NOTE(review): this declaration looks vestigial -- the macro below
   calls __clzhi2, not __stormy16_count_leading_zeros; confirm against
   libgcc before removing.  */
extern UHItype __stormy16_count_leading_zeros (UHItype);
/* Scan the operand in 16-bit chunks, starting at the most significant
   end.  __clzhi2 returns 16 for an all-zero chunk, in which case the
   whole chunk is leading zeros and the scan moves to the next chunk;
   any smaller return means the top set bit was found.  For x == 0 the
   loop runs to completion and accumulates W_TYPE_SIZE.  */
#define count_leading_zeros(count, x)					\
  do									\
    {									\
      UHItype size;							\
									\
      /* We assume that W_TYPE_SIZE is a multiple of 16...  */		\
      for ((count) = 0, size = W_TYPE_SIZE; size; size -= 16)		\
	{								\
	  UHItype c;							\
									\
	  c = __clzhi2 ((x) >> (size - 16));				\
	  (count) += c;							\
	  if (c != 16)							\
	    break;							\
	}								\
    }									\
  while (0)
/* Result of the macro above for a zero argument.  */
#define COUNT_LEADING_ZEROS_0 W_TYPE_SIZE
   1524 #endif
   1525 
   1526 #if defined (__z8000__) && W_TYPE_SIZE == 16
/* Two-word add, (sh:sl) = (ah:al) + (bh:bl): "add" on the low words
   followed by "adc" (add with carry) on the high words.  The "%" in
   the input constraints marks each operand pair as commutative.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add	%H1,%H5\n\tadc	%H0,%H3"				\
	   : "=r" ((unsigned int)(sh)),					\
	     "=&r" ((unsigned int)(sl))					\
	   : "%0" ((unsigned int)(ah)),					\
	     "r" ((unsigned int)(bh)),					\
	     "%1" ((unsigned int)(al)),					\
	     "rQR" ((unsigned int)(bl)))
/* Two-word subtract, (sh:sl) = (ah:al) - (bh:bl): "sub" on the low
   words, then "sbc" (subtract with borrow) on the high words.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub	%H1,%H5\n\tsbc	%H0,%H3"				\
	   : "=r" ((unsigned int)(sh)),					\
	     "=&r" ((unsigned int)(sl))					\
	   : "0" ((unsigned int)(ah)),					\
	     "r" ((unsigned int)(bh)),					\
	     "1" ((unsigned int)(al)),					\
	     "rQR" ((unsigned int)(bl)))
/* 16x16 -> 32 unsigned multiply.  The z8000 "mult" instruction is a
   signed multiply, so the high word is fixed up afterwards: whenever
   an operand has its sign bit set, the unsigned product exceeds the
   signed one by 2^16 times the other operand.  ((signed int) __m0
   >> 15) is an all-ones mask exactly when __m0's top bit is set
   (arithmetic right shift, as GCC implements for signed types), so
   each masked term adds the required correction.  NOTE(review): the
   union lays out __h before __l -- presumably matching the register
   pair the instruction writes; confirm against the z8000 ABI.  */
#define umul_ppmm(xh, xl, m0, m1) \
  do {									\
    union {long int __ll;						\
	   struct {unsigned int __h, __l;} __i;				\
	  } __xx;							\
    unsigned int __m0 = (m0), __m1 = (m1);				\
    __asm__ ("mult	%S0,%H3"					\
	     : "=r" (__xx.__i.__h),					\
	       "=r" (__xx.__i.__l)					\
	     : "%1" (__m0),						\
	       "rQR" (__m1));						\
    (xh) = __xx.__i.__h; (xl) = __xx.__i.__l;				\
    (xh) += ((((signed int) __m0 >> 15) & __m1)				\
	     + (((signed int) __m1 >> 15) & __m0));			\
  } while (0)
   1558 #endif /* __z8000__ */
   1559 
   1560 #endif /* __GNUC__ */
   1561 
   1562 /* If this machine has no inline assembler, use C macros.  */
   1563 
#if !defined (add_ssaaaa)
/* Generic C two-word add: (sh:sl) = (ah:al) + (bh:bl).  The low words
   are summed first; because unsigned arithmetic wraps, the sum is
   smaller than an addend exactly when a carry out occurred, and that
   carry is folded into the high-word sum.  sl is assigned last, which
   keeps the macro correct even when sl aliases ah or bh.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do {									\
    UWtype __lsum = (al) + (bl);					\
    (sh) = (ah) + (bh) + (UWtype) (__lsum < (al));			\
    (sl) = __lsum;							\
  } while (0)
#endif
   1573 
#if !defined (sub_ddmmss)
/* Generic C two-word subtract: (sh:sl) = (ah:al) - (bh:bl).  A borrow
   out of the low-word subtraction makes the unsigned difference larger
   than the minuend; that borrow is then taken out of the high-word
   difference.  sl is assigned last, so it may alias ah or bh.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {									\
    UWtype __ldiff = (al) - (bl);					\
    (sh) = (ah) - (bh) - (UWtype) (__ldiff > (al));			\
    (sl) = __ldiff;							\
  } while (0)
#endif
   1583 
/* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
   smul_ppmm.  */
#if !defined (umul_ppmm) && defined (smul_ppmm)
/* The unsigned product differs from the signed one by 2^W_TYPE_SIZE
   times each operand whose top bit is set.  -(__xm0 >> (W-1)) is an
   all-ones mask exactly when __xm0's top bit is set, so each masked
   term below adds the other operand into the high word as needed.
   w0 receives the low word directly from smul_ppmm (the low word of
   the product is the same for signed and unsigned interpretation).  */
#define umul_ppmm(w1, w0, u, v)						\
  do {									\
    UWtype __w1;							\
    UWtype __xm0 = (u), __xm1 = (v);					\
    smul_ppmm (__w1, w0, __xm0, __xm1);					\
    (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)		\
		+ (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);		\
  } while (0)
#endif
   1596 
/* If we still don't have umul_ppmm, define it using plain C.  */
#if !defined (umul_ppmm)
/* Schoolbook multiplication on half-words: split u and v into high and
   low halves, form the four partial products, and recombine.  The only
   carry that can occur is when the two middle partial products are
   summed into __x1; it is detected by the unsigned wrap-around test
   and propagated into the high word via __x3.  */
#define umul_ppmm(w1, w0, u, v)						\
  do {									\
    UWtype __x0, __x1, __x2, __x3;					\
    UHWtype __ul, __vl, __uh, __vh;					\
									\
    __ul = __ll_lowpart (u);						\
    __uh = __ll_highpart (u);						\
    __vl = __ll_lowpart (v);						\
    __vh = __ll_highpart (v);						\
									\
    __x0 = (UWtype) __ul * __vl;					\
    __x1 = (UWtype) __ul * __vh;					\
    __x2 = (UWtype) __uh * __vl;					\
    __x3 = (UWtype) __uh * __vh;					\
									\
    __x1 += __ll_highpart (__x0);/* this can't give carry */		\
    __x1 += __x2;		/* but this indeed can */		\
    if (__x1 < __x2)		/* did we get it? */			\
      __x3 += __ll_B;		/* yes, add it in the proper pos.  */	\
									\
    (w1) = __x3 + __ll_highpart (__x1);					\
    (w0) = __ll_lowpart (__x1) * __ll_B + __ll_lowpart (__x0);		\
  } while (0)
#endif
   1623 
#if !defined (__umulsidi3)
/* Widening multiply: returns the full double-word product of u and v,
   built on umul_ppmm.  Uses a GNU statement expression to yield the
   DWunion's integral value.  */
#define __umulsidi3(u, v) \
  ({DWunion __w;							\
    umul_ppmm (__w.s.high, __w.s.low, u, v);				\
    __w.ll; })
#endif
   1630 
/* Define this unconditionally, so it can be used for debugging.  */
/* Divide the two-word value n1:n0 by d, producing quotient q and
   remainder r.  The quotient is developed one half-word digit at a
   time: each digit is estimated by dividing by the high half of d,
   then corrected -- at most twice -- when the estimate turns out too
   large.  When installed as udiv_qrnnd this sets
   UDIV_NEEDS_NORMALIZATION to 1, i.e. callers must pre-shift so the
   most significant bit of d is set.  */
#define __udiv_qrnnd_c(q, r, n1, n0, d) \
  do {									\
    UWtype __d1, __d0, __q1, __q0;					\
    UWtype __r1, __r0, __m;						\
    __d1 = __ll_highpart (d);						\
    __d0 = __ll_lowpart (d);						\
									\
    __r1 = (n1) % __d1;							\
    __q1 = (n1) / __d1;							\
    __m = (UWtype) __q1 * __d0;						\
    __r1 = __r1 * __ll_B | __ll_highpart (n0);				\
    if (__r1 < __m)							\
      {									\
	__q1--, __r1 += (d);						\
	if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
	  if (__r1 < __m)						\
	    __q1--, __r1 += (d);					\
      }									\
    __r1 -= __m;							\
									\
    __r0 = __r1 % __d1;							\
    __q0 = __r1 / __d1;							\
    __m = (UWtype) __q0 * __d0;						\
    __r0 = __r0 * __ll_B | __ll_lowpart (n0);				\
    if (__r0 < __m)							\
      {									\
	__q0--, __r0 += (d);						\
	if (__r0 >= (d))						\
	  if (__r0 < __m)						\
	    __q0--, __r0 += (d);					\
      }									\
    __r0 -= __m;							\
									\
    (q) = (UWtype) __q1 * __ll_B | __q0;				\
    (r) = __r0;								\
  } while (0)
   1668 
   1669 /* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through
   1670    __udiv_w_sdiv (defined in libgcc or elsewhere).  */
#if !defined (udiv_qrnnd) && defined (sdiv_qrnnd)
/* Unsigned divide built on the target's signed divide: __udiv_w_sdiv
   returns the quotient and stores the remainder through its first
   argument.  NOTE(review): __r is USItype rather than UWtype --
   presumably this matches __udiv_w_sdiv's prototype; confirm on
   targets where UWtype is wider than 32 bits.  */
#define udiv_qrnnd(q, r, nh, nl, d) \
  do {									\
    USItype __r;							\
    (q) = __udiv_w_sdiv (&__r, nh, nl, d);				\
    (r) = __r;								\
  } while (0)
#endif
   1679 
/* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c.  */
#if !defined (udiv_qrnnd)
/* The generic __udiv_qrnnd_c requires a normalized divisor (most
   significant bit set), so announce that to callers.  */
#define UDIV_NEEDS_NORMALIZATION 1
#define udiv_qrnnd __udiv_qrnnd_c
#endif
   1685 
#if !defined (count_leading_zeros)
/* Portable leading-zero count via the __clz_tab lookup table (defined
   elsewhere; __clz_tab[v] is presumably the bit width of v -- confirm
   against its definition).  For W_TYPE_SIZE <= 32, a three-way
   comparison selects __a, the bit offset of the quarter-word
   (__BITS4 bits) holding the top set bit; for wider words, a
   byte-wise scan from the most significant end finds the first
   non-zero byte.  The table then resolves the position within that
   chunk.  */
#define count_leading_zeros(count, x) \
  do {									\
    UWtype __xr = (x);							\
    UWtype __a;								\
									\
    if (W_TYPE_SIZE <= 32)						\
      {									\
	__a = __xr < ((UWtype)1<<2*__BITS4)				\
	  ? (__xr < ((UWtype)1<<__BITS4) ? 0 : __BITS4)			\
	  : (__xr < ((UWtype)1<<3*__BITS4) ?  2*__BITS4 : 3*__BITS4);	\
      }									\
    else								\
      {									\
	for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8)			\
	  if (((__xr >> __a) & 0xff) != 0)				\
	    break;							\
      }									\
									\
    (count) = W_TYPE_SIZE - (__clz_tab[__xr >> __a] + __a);		\
  } while (0)
/* Value the macro produces for x == 0.  */
#define COUNT_LEADING_ZEROS_0 W_TYPE_SIZE
#endif
   1709 
#if !defined (count_trailing_zeros)
/* Define count_trailing_zeros using count_leading_zeros.  The latter might be
   defined in asm, but if it is not, the C version above is good enough.  */
#define count_trailing_zeros(count, x) \
  do {									\
    UWtype __tz_val = (x);						\
    UWtype __tz_lz;							\
    /* __tz_val & -__tz_val isolates the lowest set bit.  */		\
    count_leading_zeros (__tz_lz, __tz_val & -__tz_val);		\
    (count) = W_TYPE_SIZE - 1 - __tz_lz;				\
  } while (0)
#endif
   1721 
#ifndef UDIV_NEEDS_NORMALIZATION
/* Default: udiv_qrnnd does not require the divisor to be normalized
   (its most significant bit need not be set).  */
#define UDIV_NEEDS_NORMALIZATION 0
#endif
   1725