Home | History | Annotate | Line # | Download | only in xtensa
ieee754-sf.S revision 1.1.1.1.8.2
      1 /* IEEE-754 single-precision functions for Xtensa
      2    Copyright (C) 2006-2013 Free Software Foundation, Inc.
      3    Contributed by Bob Wilson (bwilson (at) tensilica.com) at Tensilica.
      4 
      5    This file is part of GCC.
      6 
      7    GCC is free software; you can redistribute it and/or modify it
      8    under the terms of the GNU General Public License as published by
      9    the Free Software Foundation; either version 3, or (at your option)
     10    any later version.
     11 
     12    GCC is distributed in the hope that it will be useful, but WITHOUT
     13    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     14    or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
     15    License for more details.
     16 
     17    Under Section 7 of GPL version 3, you are granted additional
     18    permissions described in the GCC Runtime Library Exception, version
     19    3.1, as published by the Free Software Foundation.
     20 
     21    You should have received a copy of the GNU General Public License and
     22    a copy of the GCC Runtime Library Exception along with this program;
     23    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     24    <http://www.gnu.org/licenses/>.  */
     25 
     26 #ifdef __XTENSA_EB__
     27 #define xh a2
     28 #define xl a3
     29 #define yh a4
     30 #define yl a5
     31 #else
     32 #define xh a3
     33 #define xl a2
     34 #define yh a5
     35 #define yl a4
     36 #endif
     37 
     38 /*  Warning!  The branch displacements for some Xtensa branch instructions
     39     are quite small, and this code has been carefully laid out to keep
     40     branch targets in range.  If you change anything, be sure to check that
     41     the assembler is not relaxing anything to branch over a jump.  */
     42 
      43 #ifdef L_negsf2

/* float __negsf2 (float x)
   In:    a2 = x (raw single-precision bit pattern)
   Out:   a2 = x with the sign bit (bit 31) flipped; applies equally
          to zeros, subnormals, NaNs and Infinities.
   Clobbers: a4.  */
      45 	.align	4
      46 	.global	__negsf2
      47 	.type	__negsf2, @function
      48 __negsf2:
      49 	leaf_entry sp, 16
      50 	movi	a4, 0x80000000
      51 	xor	a2, a2, a4
      52 	leaf_return
      53 
      54 #endif /* L_negsf2 */
     55 
     56 #ifdef L_addsubsf3
     57 
     58 	/* Addition */
      59 __addsf3_aux:
      60 
      61 	/* Handle NaNs and Infinities.  (This code is placed before the
      62 	   start of the function just to keep it in range of the limited
      63 	   branch displacements.)  */
      64 
      65 .Ladd_xnan_or_inf:
      66 	/* If y is neither Infinity nor NaN, return x.  */
      67 	bnall	a3, a6, 1f
      68 	/* If x is a NaN, return it.  Otherwise, return y.  */
      69 	slli	a7, a2, 9
      70 	beqz	a7, .Ladd_ynan_or_inf
      71 1:	leaf_return
      72 
      73 .Ladd_ynan_or_inf:
      74 	/* Return y.  */
      75 	mov	a2, a3
      76 	leaf_return
      77 
      78 .Ladd_opposite_signs:
      79 	/* Operand signs differ.  Do a subtraction.  */
      80 	slli	a7, a6, 8
      81 	xor	a3, a3, a7
      82 	j	.Lsub_same_sign
      83 
/* float __addsf3 (float x, float y)
   In:    a2 = x, a3 = y (raw single-precision bit patterns).
   Out:   a2 = x + y, rounded to nearest, ties to even.
   Register roles within this function:
     a6 = 0x7f800000 (exponent-field mask), a7/a8 = 9-bit
     sign+exponent fields of x/y, a9 = guard bits shifted out of the
     smaller operand (used for rounding), a10 = exponent difference /
     scratch.  Clobbers a6-a10.
   Mixed-sign inputs are rerouted to the subtraction code via
   .Ladd_opposite_signs / .Lsub_same_sign above.  */
      84 	.align	4
      85 	.global	__addsf3
      86 	.type	__addsf3, @function
      87 __addsf3:
      88 	leaf_entry sp, 16
      89 	movi	a6, 0x7f800000
      90 
      91 	/* Check if the two operands have the same sign.  */
      92 	xor	a7, a2, a3
      93 	bltz	a7, .Ladd_opposite_signs
      94 
      95 .Ladd_same_sign:
      96 	/* Check if either exponent == 0x7f8 (i.e., NaN or Infinity).  */
      97 	ball	a2, a6, .Ladd_xnan_or_inf
      98 	ball	a3, a6, .Ladd_ynan_or_inf
      99 
     100 	/* Compare the exponents.  The smaller operand will be shifted
     101 	   right by the exponent difference and added to the larger
     102 	   one.  */
     103 	extui	a7, a2, 23, 9
     104 	extui	a8, a3, 23, 9
     105 	bltu	a7, a8, .Ladd_shiftx
     106 
     107 .Ladd_shifty:
     108 	/* Check if the smaller (or equal) exponent is zero.  */
     109 	bnone	a3, a6, .Ladd_yexpzero
     110 
     111 	/* Replace y sign/exponent with 0x008.  */
     112 	or	a3, a3, a6
     113 	slli	a3, a3, 8
     114 	srli	a3, a3, 8
     115 
     116 .Ladd_yexpdiff:
     117 	/* Compute the exponent difference.  */
     118 	sub	a10, a7, a8
     119 
     120 	/* Exponent difference > 32 -- just return the bigger value.  */
     121 	bgeui	a10, 32, 1f
     122 
     123 	/* Shift y right by the exponent difference.  Any bits that are
     124 	   shifted out of y are saved in a9 for rounding the result.  */
     125 	ssr	a10
     126 	movi	a9, 0
     127 	src	a9, a3, a9
     128 	srl	a3, a3
     129 
     130 	/* Do the addition.  */
     131 	add	a2, a2, a3
     132 
     133 	/* Check if the add overflowed into the exponent.  */
     134 	extui	a10, a2, 23, 9
     135 	beq	a10, a7, .Ladd_round
     136 	mov	a8, a7
     137 	j	.Ladd_carry
     138 
     139 .Ladd_yexpzero:
     140 	/* y is a subnormal value.  Replace its sign/exponent with zero,
     141 	   i.e., no implicit "1.0", and increment the apparent exponent
     142 	   because subnormals behave as if they had the minimum (nonzero)
     143 	   exponent.  Test for the case when both exponents are zero.  */
     144 	slli	a3, a3, 9
     145 	srli	a3, a3, 9
     146 	bnone	a2, a6, .Ladd_bothexpzero
     147 	addi	a8, a8, 1
     148 	j	.Ladd_yexpdiff
     149 
     150 .Ladd_bothexpzero:
     151 	/* Both exponents are zero.  Handle this as a special case.  There
     152 	   is no need to shift or round, and the normal code for handling
     153 	   a carry into the exponent field will not work because it
     154 	   assumes there is an implicit "1.0" that needs to be added.  */
     155 	add	a2, a2, a3
     156 1:	leaf_return
     157 
     158 .Ladd_xexpzero:
     159 	/* Same as "yexpzero" except skip handling the case when both
     160 	   exponents are zero.  */
     161 	slli	a2, a2, 9
     162 	srli	a2, a2, 9
     163 	addi	a7, a7, 1
     164 	j	.Ladd_xexpdiff
     165 
     166 .Ladd_shiftx:
     167 	/* Same thing as the "shifty" code, but with x and y swapped.  Also,
     168 	   because the exponent difference is always nonzero in this version,
     169 	   the shift sequence can use SLL and skip loading a constant zero.  */
     170 	bnone	a2, a6, .Ladd_xexpzero
     171 
     172 	or	a2, a2, a6
     173 	slli	a2, a2, 8
     174 	srli	a2, a2, 8
     175 
     176 .Ladd_xexpdiff:
     177 	sub	a10, a8, a7
     178 	bgeui	a10, 32, .Ladd_returny
     179 
     180 	ssr	a10
     181 	sll	a9, a2
     182 	srl	a2, a2
     183 
     184 	add	a2, a2, a3
     185 
     186 	/* Check if the add overflowed into the exponent.  */
     187 	extui	a10, a2, 23, 9
     188 	bne	a10, a8, .Ladd_carry
     189 
     190 .Ladd_round:
	/* Round to nearest: a9 holds the guard bits left-aligned, so a
	   negative a9 means the discarded fraction is >= 1/2.  */
     191 	/* Round up if the leftover fraction is >= 1/2.  */
     192 	bgez	a9, 1f
     193 	addi	a2, a2, 1
     194 
     195 	/* Check if the leftover fraction is exactly 1/2.  */
     196 	slli	a9, a9, 1
     197 	beqz	a9, .Ladd_exactlyhalf
     198 1:	leaf_return
     199 
     200 .Ladd_returny:
     201 	mov	a2, a3
     202 	leaf_return
     203 
     204 .Ladd_carry:
     205 	/* The addition has overflowed into the exponent field, so the
     206 	   value needs to be renormalized.  The mantissa of the result
     207 	   can be recovered by subtracting the original exponent and
     208 	   adding 0x800000 (which is the explicit "1.0" for the
     209 	   mantissa of the non-shifted operand -- the "1.0" for the
     210 	   shifted operand was already added).  The mantissa can then
     211 	   be shifted right by one bit.  The explicit "1.0" of the
     212 	   shifted mantissa then needs to be replaced by the exponent,
     213 	   incremented by one to account for the normalizing shift.
     214 	   It is faster to combine these operations: do the shift first
     215 	   and combine the additions and subtractions.  If x is the
     216 	   original exponent, the result is:
     217 	       shifted mantissa - (x << 22) + (1 << 22) + (x << 23)
     218 	   or:
     219 	       shifted mantissa + ((x + 1) << 22)
     220 	   Note that the exponent is incremented here by leaving the
     221 	   explicit "1.0" of the mantissa in the exponent field.  */
     222 
     223 	/* Shift x right by one bit.  Save the lsb.  */
     224 	mov	a10, a2
     225 	srli	a2, a2, 1
     226 
     227 	/* See explanation above.  The original exponent is in a8.  */
     228 	addi	a8, a8, 1
     229 	slli	a8, a8, 22
     230 	add	a2, a2, a8
     231 
     232 	/* Return an Infinity if the exponent overflowed.  */
     233 	ball	a2, a6, .Ladd_infinity
     234 
     235 	/* Same thing as the "round" code except the msb of the leftover
     236 	   fraction is bit 0 of a10, with the rest of the fraction in a9.  */
     237 	bbci.l	a10, 0, 1f
     238 	addi	a2, a2, 1
     239 	beqz	a9, .Ladd_exactlyhalf
     240 1:	leaf_return
     241 
     242 .Ladd_infinity:
     243 	/* Clear the mantissa.  */
     244 	srli	a2, a2, 23
     245 	slli	a2, a2, 23
     246 
     247 	/* The sign bit may have been lost in a carry-out.  Put it back.  */
     248 	slli	a8, a8, 1
     249 	or	a2, a2, a8
     250 	leaf_return
     251 
     252 .Ladd_exactlyhalf:
	/* Tie case: clear the lsb so the result is the nearest even
	   value (round-half-to-even).  */
     253 	/* Round down to the nearest even value.  */
     254 	srli	a2, a2, 1
     255 	slli	a2, a2, 1
     256 	leaf_return
    257 
    258 
    259 	/* Subtraction */
     260 __subsf3_aux:
     261 
     262 	/* Handle NaNs and Infinities.  (This code is placed before the
     263 	   start of the function just to keep it in range of the limited
     264 	   branch displacements.)  */
     265 
     266 .Lsub_xnan_or_inf:
     267 	/* If y is neither Infinity nor NaN, return x.  */
     268 	bnall	a3, a6, 1f
     269 	/* Both x and y are either NaN or Inf, so the result is NaN.  */
     270 	movi	a4, 0x400000	/* make it a quiet NaN */
     271 	or	a2, a2, a4
     272 1:	leaf_return
     273 
     274 .Lsub_ynan_or_inf:
     275 	/* Negate y and return it.  */
     276 	slli	a7, a6, 8
     277 	xor	a2, a3, a7
     278 	leaf_return
     279 
     280 .Lsub_opposite_signs:
     281 	/* Operand signs differ.  Do an addition.  */
     282 	slli	a7, a6, 8
     283 	xor	a3, a3, a7
     284 	j	.Ladd_same_sign
     285 
/* float __subsf3 (float x, float y)
   In:    a2 = x, a3 = y (raw single-precision bit patterns).
   Out:   a2 = x - y, rounded to nearest, ties to even.
   Register roles within this function:
     a6 = 0x7f800000 (exponent-field mask), a7/a8 = exponent fields
     of x/y, a9 = guard bits shifted out of the smaller operand
     (negated so a borrow can be propagated), a10 = exponent
     difference / scratch, a11 = scratch.  Clobbers a6-a11.
   Mixed-sign inputs are rerouted to the addition code via
   .Lsub_opposite_signs / .Ladd_same_sign above.  */
     286 	.align	4
     287 	.global	__subsf3
     288 	.type	__subsf3, @function
     289 __subsf3:
     290 	leaf_entry sp, 16
     291 	movi	a6, 0x7f800000
     292 
     293 	/* Check if the two operands have the same sign.  */
     294 	xor	a7, a2, a3
     295 	bltz	a7, .Lsub_opposite_signs
     296 
     297 .Lsub_same_sign:
     298 	/* Check if either exponent == 0x7f8 (i.e., NaN or Infinity).  */
     299 	ball	a2, a6, .Lsub_xnan_or_inf
     300 	ball	a3, a6, .Lsub_ynan_or_inf
     301 
     302 	/* Compare the operands.  In contrast to addition, the entire
     303 	   value matters here.  */
     304 	extui	a7, a2, 23, 8
     305 	extui	a8, a3, 23, 8
     306 	bltu	a2, a3, .Lsub_xsmaller
     307 
     308 .Lsub_ysmaller:
     309 	/* Check if the smaller (or equal) exponent is zero.  */
     310 	bnone	a3, a6, .Lsub_yexpzero
     311 
     312 	/* Replace y sign/exponent with 0x008.  */
     313 	or	a3, a3, a6
     314 	slli	a3, a3, 8
     315 	srli	a3, a3, 8
     316 
     317 .Lsub_yexpdiff:
     318 	/* Compute the exponent difference.  */
     319 	sub	a10, a7, a8
     320 
     321 	/* Exponent difference > 32 -- just return the bigger value.  */
     322 	bgeui	a10, 32, 1f
     323 
     324 	/* Shift y right by the exponent difference.  Any bits that are
     325 	   shifted out of y are saved in a9 for rounding the result.  */
     326 	ssr	a10
     327 	movi	a9, 0
     328 	src	a9, a3, a9
     329 	srl	a3, a3
     330 
     331 	sub	a2, a2, a3
     332 
     333 	/* Subtract the leftover bits in a9 from zero and propagate any
     334 	   borrow from a2.  */
     335 	neg	a9, a9
     336 	addi	a10, a2, -1
     337 	movnez	a2, a10, a9
     338 
     339 	/* Check if the subtract underflowed into the exponent.  */
     340 	extui	a10, a2, 23, 8
     341 	beq	a10, a7, .Lsub_round
     342 	j	.Lsub_borrow
     343 
     344 .Lsub_yexpzero:
     345 	/* Return zero if the inputs are equal.  (For the non-subnormal
     346 	   case, subtracting the "1.0" will cause a borrow from the exponent
     347 	   and this case can be detected when handling the borrow.)  */
     348 	beq	a2, a3, .Lsub_return_zero
     349 
     350 	/* y is a subnormal value.  Replace its sign/exponent with zero,
     351 	   i.e., no implicit "1.0".  Unless x is also a subnormal, increment
     352 	   y's apparent exponent because subnormals behave as if they had
     353 	   the minimum (nonzero) exponent.  */
     354 	slli	a3, a3, 9
     355 	srli	a3, a3, 9
     356 	bnone	a2, a6, .Lsub_yexpdiff
     357 	addi	a8, a8, 1
     358 	j	.Lsub_yexpdiff
     359 
     360 .Lsub_returny:
     361 	/* Negate and return y.  */
     362 	slli	a7, a6, 8
     363 	xor	a2, a3, a7
     364 1:	leaf_return
     365 
     366 .Lsub_xsmaller:
     367 	/* Same thing as the "ysmaller" code, but with x and y swapped and
     368 	   with y negated.  */
     369 	bnone	a2, a6, .Lsub_xexpzero
     370 
     371 	or	a2, a2, a6
     372 	slli	a2, a2, 8
     373 	srli	a2, a2, 8
     374 
     375 .Lsub_xexpdiff:
     376 	sub	a10, a8, a7
     377 	bgeui	a10, 32, .Lsub_returny
     378 
     379 	ssr	a10
     380 	movi	a9, 0
     381 	src	a9, a2, a9
     382 	srl	a2, a2
     383 
     384 	/* Negate y.  */
     385 	slli	a11, a6, 8
     386 	xor	a3, a3, a11
     387 
     388 	sub	a2, a3, a2
     389 
     390 	neg	a9, a9
     391 	addi	a10, a2, -1
     392 	movnez	a2, a10, a9
     393 
     394 	/* Check if the subtract underflowed into the exponent.  */
     395 	extui	a10, a2, 23, 8
     396 	bne	a10, a8, .Lsub_borrow
     397 
     398 .Lsub_round:
	/* Round to nearest: a negative a9 (left-aligned guard bits)
	   means the discarded fraction is >= 1/2.  */
     399 	/* Round up if the leftover fraction is >= 1/2.  */
     400 	bgez	a9, 1f
     401 	addi	a2, a2, 1
     402 
     403 	/* Check if the leftover fraction is exactly 1/2.  */
     404 	slli	a9, a9, 1
     405 	beqz	a9, .Lsub_exactlyhalf
     406 1:	leaf_return
     407 
     408 .Lsub_xexpzero:
     409 	/* Same as "yexpzero".  */
     410 	beq	a2, a3, .Lsub_return_zero
     411 	slli	a2, a2, 9
     412 	srli	a2, a2, 9
     413 	bnone	a3, a6, .Lsub_xexpdiff
     414 	addi	a7, a7, 1
     415 	j	.Lsub_xexpdiff
     416 
     417 .Lsub_return_zero:
     418 	movi	a2, 0
     419 	leaf_return
     420 
     421 .Lsub_borrow:
     422 	/* The subtraction has underflowed into the exponent field, so the
     423 	   value needs to be renormalized.  Shift the mantissa left as
     424 	   needed to remove any leading zeros and adjust the exponent
     425 	   accordingly.  If the exponent is not large enough to remove
     426 	   all the leading zeros, the result will be a subnormal value.  */
     427 
     428 	slli	a8, a2, 9
     429 	beqz	a8, .Lsub_xzero
     430 	do_nsau	a6, a8, a7, a11
     431 	srli	a8, a8, 9
     432 	bge	a6, a10, .Lsub_subnormal
     433 	addi	a6, a6, 1
     434 
     435 .Lsub_normalize_shift:
     436 	/* Shift the mantissa (a8/a9) left by a6.  */
     437 	ssl	a6
     438 	src	a8, a8, a9
     439 	sll	a9, a9
     440 
     441 	/* Combine the shifted mantissa with the sign and exponent,
     442 	   decrementing the exponent by a6.  (The exponent has already
     443 	   been decremented by one due to the borrow from the subtraction,
     444 	   but adding the mantissa will increment the exponent by one.)  */
     445 	srli	a2, a2, 23
     446 	sub	a2, a2, a6
     447 	slli	a2, a2, 23
     448 	add	a2, a2, a8
     449 	j	.Lsub_round
     450 
     451 .Lsub_exactlyhalf:
	/* Tie case: clear the lsb so the result is the nearest even
	   value (round-half-to-even).  */
     452 	/* Round down to the nearest even value.  */
     453 	srli	a2, a2, 1
     454 	slli	a2, a2, 1
     455 	leaf_return
     456 
     457 .Lsub_xzero:
     458 	/* If there was a borrow from the exponent, and the mantissa and
     459 	   guard digits are all zero, then the inputs were equal and the
     460 	   result should be zero.  */
     461 	beqz	a9, .Lsub_return_zero
     462 
     463 	/* Only the guard digit is nonzero.  Shift by min(24, a10).  */
     464 	addi	a11, a10, -24
     465 	movi	a6, 24
     466 	movltz	a6, a10, a11
     467 	j	.Lsub_normalize_shift
     468 
     469 .Lsub_subnormal:
     470 	/* The exponent is too small to shift away all the leading zeros.
     471 	   Set a6 to the current exponent (which has already been
     472 	   decremented by the borrow) so that the exponent of the result
     473 	   will be zero.  Do not add 1 to a6 in this case, because: (1)
     474 	   adding the mantissa will not increment the exponent, so there is
     475 	   no need to subtract anything extra from the exponent to
     476 	   compensate, and (2) the effective exponent of a subnormal is 1
     477 	   not 0 so the shift amount must be 1 smaller than normal. */
     478 	mov	a6, a10
     479 	j	.Lsub_normalize_shift
    480 
    481 #endif /* L_addsubsf3 */
    482 
    483 #ifdef L_mulsf3
    484 
    485 	/* Multiplication */
    486 #if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
    487 #define XCHAL_NO_MUL 1
    488 #endif
    489 
     490 __mulsf3_aux:
     491 
     492 	/* Handle unusual cases (zeros, subnormals, NaNs and Infinities).
     493 	   (This code is placed before the start of the function just to
     494 	   keep it in range of the limited branch displacements.)  */
     495 
	/* On entry to these handlers: a2 = x, a3 = y,
	   a6 = 0x7f800000 (exponent mask), and a7 = x ^ y (bit 31 is
	   the sign of the result) -- all set up by __mulsf3 below.  */
     496 .Lmul_xexpzero:
     497 	/* Clear the sign bit of x.  */
     498 	slli	a2, a2, 1
     499 	srli	a2, a2, 1
     500 
     501 	/* If x is zero, return zero.  */
     502 	beqz	a2, .Lmul_return_zero
     503 
     504 	/* Normalize x.  Adjust the exponent in a8.  */
     505 	do_nsau	a10, a2, a11, a12
     506 	addi	a10, a10, -8
     507 	ssl	a10
     508 	sll	a2, a2
     509 	movi	a8, 1
     510 	sub	a8, a8, a10
     511 	j	.Lmul_xnormalized
     512 
     513 .Lmul_yexpzero:
     514 	/* Clear the sign bit of y.  */
     515 	slli	a3, a3, 1
     516 	srli	a3, a3, 1
     517 
     518 	/* If y is zero, return zero.  */
     519 	beqz	a3, .Lmul_return_zero
     520 
     521 	/* Normalize y.  Adjust the exponent in a9.  */
     522 	do_nsau	a10, a3, a11, a12
     523 	addi	a10, a10, -8
     524 	ssl	a10
     525 	sll	a3, a3
     526 	movi	a9, 1
     527 	sub	a9, a9, a10
     528 	j	.Lmul_ynormalized
     529 
     530 .Lmul_return_zero:
     531 	/* Return zero with the appropriate sign bit.  */
     532 	srli	a2, a7, 31
     533 	slli	a2, a2, 31
     534 	j	.Lmul_done
     535 
     536 .Lmul_xnan_or_inf:
     537 	/* If y is zero, return NaN.  */
     538 	slli	a8, a3, 1
     539 	bnez	a8, 1f
     540 	movi	a4, 0x400000	/* make it a quiet NaN */
     541 	or	a2, a2, a4
     542 	j	.Lmul_done
     543 1:
     544 	/* If y is NaN, return y.  */
     545 	bnall	a3, a6, .Lmul_returnx
     546 	slli	a8, a3, 9
     547 	beqz	a8, .Lmul_returnx
     548 
     549 .Lmul_returny:
     550 	mov	a2, a3
     551 
     552 .Lmul_returnx:
	/* Replace the sign bit of the value in a2 with the result sign
	   (bit 31 of a7), using a funnel shift to splice them.  */
     553 	/* Set the sign bit and return.  */
     554 	extui	a7, a7, 31, 1
     555 	slli	a2, a2, 1
     556 	ssai	1
     557 	src	a2, a7, a2
     558 	j	.Lmul_done
     559 
     560 .Lmul_ynan_or_inf:
     561 	/* If x is zero, return NaN.  */
     562 	slli	a8, a2, 1
     563 	bnez	a8, .Lmul_returny
     564 	movi	a7, 0x400000	/* make it a quiet NaN */
     565 	or	a2, a3, a7
     566 	j	.Lmul_done
    567 
     568 	.align	4
     569 	.global	__mulsf3
     570 	.type	__mulsf3, @function
/* float __mulsf3 (float x, float y)
   In:    a2 = x, a3 = y (raw single-precision bit patterns).
   Out:   a2 = x * y, rounded to nearest, ties to even.
   Register roles: a7 = x ^ y (result sign in bit 31), a8/a9 =
   exponents of x/y, a6 = 0x7f800000 mask and later the low word of
   the 64-bit mantissa product (high word in a2), a4/a5 = scratch.
   The 32x32->64 multiply uses whichever multiply option the core
   has (MUL32_HIGH, MUL16, MUL32, MAC16), or falls back to the
   software helper .Lmul_mulsi3 when there is none.  */
     571 __mulsf3:
     572 #if __XTENSA_CALL0_ABI__
	/* CALL0 ABI: a12-a15 are callee-saved, so spill them.  */
     573 	leaf_entry sp, 32
     574 	addi	sp, sp, -32
     575 	s32i	a12, sp, 16
     576 	s32i	a13, sp, 20
     577 	s32i	a14, sp, 24
     578 	s32i	a15, sp, 28
     579 #elif XCHAL_NO_MUL
     580 	/* This is not really a leaf function; allocate enough stack space
     581 	   to allow CALL12s to a helper function.  */
     582 	leaf_entry sp, 64
     583 #else
     584 	leaf_entry sp, 32
     585 #endif
     586 	movi	a6, 0x7f800000
     587 
     588 	/* Get the sign of the result.  */
     589 	xor	a7, a2, a3
     590 
     591 	/* Check for NaN and infinity.  */
     592 	ball	a2, a6, .Lmul_xnan_or_inf
     593 	ball	a3, a6, .Lmul_ynan_or_inf
     594 
     595 	/* Extract the exponents.  */
     596 	extui	a8, a2, 23, 8
     597 	extui	a9, a3, 23, 8
     598 
     599 	beqz	a8, .Lmul_xexpzero
     600 .Lmul_xnormalized:
     601 	beqz	a9, .Lmul_yexpzero
     602 .Lmul_ynormalized:
     603 
     604 	/* Add the exponents.  */
     605 	add	a8, a8, a9
     606 
     607 	/* Replace sign/exponent fields with explicit "1.0".  */
     608 	movi	a10, 0xffffff
     609 	or	a2, a2, a6
     610 	and	a2, a2, a10
     611 	or	a3, a3, a6
     612 	and	a3, a3, a10
     613 
     614 	/* Multiply 32x32 to 64 bits.  The result ends up in a2/a6.  */
     615 
     616 #if XCHAL_HAVE_MUL32_HIGH
     617 
     618 	mull	a6, a2, a3
     619 	muluh	a2, a2, a3
     620 
     621 #else
     622 
     623 	/* Break the inputs into 16-bit chunks and compute 4 32-bit partial
     624 	   products.  These partial products are:
     625 
     626 		0 xl * yl
     627 
     628 		1 xl * yh
     629 		2 xh * yl
     630 
     631 		3 xh * yh
     632 
     633 	   If using the Mul16 or Mul32 multiplier options, these input
     634 	   chunks must be stored in separate registers.  For Mac16, the
     635 	   UMUL.AA.* opcodes can specify that the inputs come from either
     636 	   half of the registers, so there is no need to shift them out
     637 	   ahead of time.  If there is no multiply hardware, the 16-bit
     638 	   chunks can be extracted when setting up the arguments to the
     639 	   separate multiply function.  */
     640 
     641 #if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
     642 	/* Calling a separate multiply function will clobber a0 and requires
     643 	   use of a8 as a temporary, so save those values now.  (The function
     644 	   uses a custom ABI so nothing else needs to be saved.)  */
     645 	s32i	a0, sp, 0
     646 	s32i	a8, sp, 4
     647 #endif
     648 
     649 #if XCHAL_HAVE_MUL16 || XCHAL_HAVE_MUL32
     650 
     651 #define a2h a4
     652 #define a3h a5
     653 
     654 	/* Get the high halves of the inputs into registers.  */
     655 	srli	a2h, a2, 16
     656 	srli	a3h, a3, 16
     657 
     658 #define a2l a2
     659 #define a3l a3
     660 
     661 #if XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MUL16
     662 	/* Clear the high halves of the inputs.  This does not matter
     663 	   for MUL16 because the high bits are ignored.  */
     664 	extui	a2, a2, 0, 16
     665 	extui	a3, a3, 0, 16
     666 #endif
     667 #endif /* MUL16 || MUL32 */
     668 
     669 
     670 #if XCHAL_HAVE_MUL16
     671 
     672 #define do_mul(dst, xreg, xhalf, yreg, yhalf) \
     673 	mul16u	dst, xreg ## xhalf, yreg ## yhalf
     674 
     675 #elif XCHAL_HAVE_MUL32
     676 
     677 #define do_mul(dst, xreg, xhalf, yreg, yhalf) \
     678 	mull	dst, xreg ## xhalf, yreg ## yhalf
     679 
     680 #elif XCHAL_HAVE_MAC16
     681 
     682 /* The preprocessor insists on inserting a space when concatenating after
     683    a period in the definition of do_mul below.  These macros are a workaround
     684    using underscores instead of periods when doing the concatenation.  */
     685 #define umul_aa_ll umul.aa.ll
     686 #define umul_aa_lh umul.aa.lh
     687 #define umul_aa_hl umul.aa.hl
     688 #define umul_aa_hh umul.aa.hh
     689 
     690 #define do_mul(dst, xreg, xhalf, yreg, yhalf) \
     691 	umul_aa_ ## xhalf ## yhalf	xreg, yreg; \
     692 	rsr	dst, ACCLO
     693 
     694 #else /* no multiply hardware */
     695 
     696 #define set_arg_l(dst, src) \
     697 	extui	dst, src, 0, 16
     698 #define set_arg_h(dst, src) \
     699 	srli	dst, src, 16
     700 
     701 #if __XTENSA_CALL0_ABI__
     702 #define do_mul(dst, xreg, xhalf, yreg, yhalf) \
     703 	set_arg_ ## xhalf (a13, xreg); \
     704 	set_arg_ ## yhalf (a14, yreg); \
     705 	call0	.Lmul_mulsi3; \
     706 	mov	dst, a12
     707 #else
     708 #define do_mul(dst, xreg, xhalf, yreg, yhalf) \
     709 	set_arg_ ## xhalf (a14, xreg); \
     710 	set_arg_ ## yhalf (a15, yreg); \
     711 	call12	.Lmul_mulsi3; \
     712 	mov	dst, a14
     713 #endif /* __XTENSA_CALL0_ABI__ */
     714 
     715 #endif /* no multiply hardware */
     716 
     717 	/* Add pp1 and pp2 into a6 with carry-out in a9.  */
     718 	do_mul(a6, a2, l, a3, h)	/* pp 1 */
     719 	do_mul(a11, a2, h, a3, l)	/* pp 2 */
     720 	movi	a9, 0
     721 	add	a6, a6, a11
     722 	bgeu	a6, a11, 1f
     723 	addi	a9, a9, 1
     724 1:
     725 	/* Shift the high half of a9/a6 into position in a9.  Note that
     726 	   this value can be safely incremented without any carry-outs.  */
     727 	ssai	16
     728 	src	a9, a9, a6
     729 
     730 	/* Compute the low word into a6.  */
     731 	do_mul(a11, a2, l, a3, l)	/* pp 0 */
     732 	sll	a6, a6
     733 	add	a6, a6, a11
     734 	bgeu	a6, a11, 1f
     735 	addi	a9, a9, 1
     736 1:
     737 	/* Compute the high word into a2.  */
     738 	do_mul(a2, a2, h, a3, h)	/* pp 3 */
     739 	add	a2, a2, a9
     740 
     741 #if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
     742 	/* Restore values saved on the stack during the multiplication.  */
     743 	l32i	a0, sp, 0
     744 	l32i	a8, sp, 4
     745 #endif
     746 #endif /* ! XCHAL_HAVE_MUL32_HIGH */
     747 
     748 	/* Shift left by 9 bits, unless there was a carry-out from the
     749 	   multiply, in which case, shift by 8 bits and increment the
     750 	   exponent.  */
     751 	movi	a4, 9
     752 	srli	a5, a2, 24 - 9
     753 	beqz	a5, 1f
     754 	addi	a4, a4, -1
     755 	addi	a8, a8, 1
     756 1:	ssl	a4
     757 	src	a2, a2, a6
     758 	sll	a6, a6
     759 
     760 	/* Subtract the extra bias from the exponent sum (plus one to account
     761 	   for the explicit "1.0" of the mantissa that will be added to the
     762 	   exponent in the final result).  */
     763 	movi	a4, 0x80
     764 	sub	a8, a8, a4
     765 
     766 	/* Check for over/underflow.  The value in a8 is one less than the
     767 	   final exponent, so values in the range 0..fd are OK here.  */
     768 	movi	a4, 0xfe
     769 	bgeu	a8, a4, .Lmul_overflow
     770 
     771 .Lmul_round:
	/* Round to nearest: a6 holds the discarded low bits
	   left-aligned, so a negative a6 means the fraction is >= 1/2.  */
     772 	/* Round.  */
     773 	bgez	a6, .Lmul_rounded
     774 	addi	a2, a2, 1
     775 	slli	a6, a6, 1
     776 	beqz	a6, .Lmul_exactlyhalf
     777 
     778 .Lmul_rounded:
     779 	/* Add the exponent to the mantissa.  */
     780 	slli	a8, a8, 23
     781 	add	a2, a2, a8
     782 
     783 .Lmul_addsign:
     784 	/* Add the sign bit.  */
     785 	srli	a7, a7, 31
     786 	slli	a7, a7, 31
     787 	or	a2, a2, a7
     788 
     789 .Lmul_done:
     790 #if __XTENSA_CALL0_ABI__
     791 	l32i	a12, sp, 16
     792 	l32i	a13, sp, 20
     793 	l32i	a14, sp, 24
     794 	l32i	a15, sp, 28
     795 	addi	sp, sp, 32
     796 #endif
     797 	leaf_return
     798 
     799 .Lmul_exactlyhalf:
	/* Tie case: clear the lsb (round-half-to-even).  */
     800 	/* Round down to the nearest even value.  */
     801 	srli	a2, a2, 1
     802 	slli	a2, a2, 1
     803 	j	.Lmul_rounded
     804 
     805 .Lmul_overflow:
     806 	bltz	a8, .Lmul_underflow
     807 	/* Return +/- Infinity.  */
     808 	movi	a8, 0xff
     809 	slli	a2, a8, 23
     810 	j	.Lmul_addsign
     811 
     812 .Lmul_underflow:
     813 	/* Create a subnormal value, where the exponent field contains zero,
     814 	   but the effective exponent is 1.  The value of a8 is one less than
     815 	   the actual exponent, so just negate it to get the shift amount.  */
     816 	neg	a8, a8
     817 	mov	a9, a6
     818 	ssr	a8
     819 	bgeui	a8, 32, .Lmul_flush_to_zero
     820 
     821 	/* Shift a2 right.  Any bits that are shifted out of a2 are saved
     822 	   in a6 (combined with the shifted-out bits currently in a6) for
     823 	   rounding the result.  */
     824 	sll	a6, a2
     825 	srl	a2, a2
     826 
     827 	/* Set the exponent to zero.  */
     828 	movi	a8, 0
     829 
     830 	/* Pack any nonzero bits shifted out into a6.  */
     831 	beqz	a9, .Lmul_round
     832 	movi	a9, 1
     833 	or	a6, a6, a9
     834 	j	.Lmul_round
     835 
     836 .Lmul_flush_to_zero:
     837 	/* Return zero with the appropriate sign bit.  */
     838 	srli	a2, a7, 31
     839 	slli	a2, a2, 31
     840 	j	.Lmul_done
    841 
     842 #if XCHAL_NO_MUL
     843 
     844 	/* For Xtensa processors with no multiply hardware, this simplified
     845 	   version of _mulsi3 is used for multiplying 16-bit chunks of
     846 	   the floating-point mantissas.  When using CALL0, this function
     847 	   uses a custom ABI: the inputs are passed in a13 and a14, the
     848 	   result is returned in a12, and a8 and a15 are clobbered.  */
     849 	.align	4
     850 .Lmul_mulsi3:
     851 	leaf_entry sp, 16
	/* Shift-and-add multiply: each loop iteration consumes four bits
	   of src1, conditionally accumulating src2, 2*src2, 4*src2 and
	   8*src2 into dst (via the do_addx* helper macros), then scales
	   src2 by 16.  Terminates when the remaining bits of src1 are
	   all zero.  Both src1 and src2 are destroyed.  */
     852 	.macro mul_mulsi3_body dst, src1, src2, tmp1, tmp2
     853 	movi	\dst, 0
     854 1:	add	\tmp1, \src2, \dst
     855 	extui	\tmp2, \src1, 0, 1
     856 	movnez	\dst, \tmp1, \tmp2
     857 
     858 	do_addx2 \tmp1, \src2, \dst, \tmp1
     859 	extui	\tmp2, \src1, 1, 1
     860 	movnez	\dst, \tmp1, \tmp2
     861 
     862 	do_addx4 \tmp1, \src2, \dst, \tmp1
     863 	extui	\tmp2, \src1, 2, 1
     864 	movnez	\dst, \tmp1, \tmp2
     865 
     866 	do_addx8 \tmp1, \src2, \dst, \tmp1
     867 	extui	\tmp2, \src1, 3, 1
     868 	movnez	\dst, \tmp1, \tmp2
     869 
     870 	srli	\src1, \src1, 4
     871 	slli	\src2, \src2, 4
     872 	bnez	\src1, 1b
     873 	.endm
     874 #if __XTENSA_CALL0_ABI__
     875 	mul_mulsi3_body a12, a13, a14, a15, a8
     876 #else
     877 	/* The result will be written into a2, so save that argument in a4.  */
     878 	mov	a4, a2
     879 	mul_mulsi3_body a2, a4, a3, a5, a6
     880 #endif
     881 	leaf_return
     882 #endif /* XCHAL_NO_MUL */
    883 #endif /* L_mulsf3 */
    884 
    885 #ifdef L_divsf3
    886 
    887 	/* Division */
     888 __divsf3_aux:
     889 
     890 	/* Handle unusual cases (zeros, subnormals, NaNs and Infinities).
     891 	   (This code is placed before the start of the function just to
     892 	   keep it in range of the limited branch displacements.)  */
     893 
	/* On entry to these handlers: a2 = x (dividend), a3 = y
	   (divisor), a6 = 0x7f800000 (exponent mask), and a7 = x ^ y
	   (bit 31 is the sign of the result) -- set up by __divsf3.  */
     894 .Ldiv_yexpzero:
     895 	/* Clear the sign bit of y.  */
     896 	slli	a3, a3, 1
     897 	srli	a3, a3, 1
     898 
     899 	/* Check for division by zero.  */
     900 	beqz	a3, .Ldiv_yzero
     901 
     902 	/* Normalize y.  Adjust the exponent in a9.  */
     903 	do_nsau	a10, a3, a4, a5
     904 	addi	a10, a10, -8
     905 	ssl	a10
     906 	sll	a3, a3
     907 	movi	a9, 1
     908 	sub	a9, a9, a10
     909 	j	.Ldiv_ynormalized
     910 
     911 .Ldiv_yzero:
     912 	/* y is zero.  Return NaN if x is also zero; otherwise, infinity.  */
     913 	slli	a4, a2, 1
     914 	srli	a4, a4, 1
     915 	srli	a2, a7, 31
     916 	slli	a2, a2, 31
     917 	or	a2, a2, a6
     918 	bnez	a4, 1f
     919 	movi	a4, 0x400000	/* make it a quiet NaN */
     920 	or	a2, a2, a4
     921 1:	leaf_return
     922 
     923 .Ldiv_xexpzero:
     924 	/* Clear the sign bit of x.  */
     925 	slli	a2, a2, 1
     926 	srli	a2, a2, 1
     927 
     928 	/* If x is zero, return zero.  */
     929 	beqz	a2, .Ldiv_return_zero
     930 
     931 	/* Normalize x.  Adjust the exponent in a8.  */
     932 	do_nsau	a10, a2, a4, a5
     933 	addi	a10, a10, -8
     934 	ssl	a10
     935 	sll	a2, a2
     936 	movi	a8, 1
     937 	sub	a8, a8, a10
     938 	j	.Ldiv_xnormalized
     939 
     940 .Ldiv_return_zero:
     941 	/* Return zero with the appropriate sign bit.  */
     942 	srli	a2, a7, 31
     943 	slli	a2, a2, 31
     944 	leaf_return
     945 
     946 .Ldiv_xnan_or_inf:
     947 	/* Set the sign bit of the result.  */
     948 	srli	a7, a3, 31
     949 	slli	a7, a7, 31
     950 	xor	a2, a2, a7
     951 	/* If y is NaN or Inf, return NaN.  */
     952 	bnall	a3, a6, 1f
     953 	movi	a4, 0x400000	/* make it a quiet NaN */
     954 	or	a2, a2, a4
     955 1:	leaf_return
     956 
     957 .Ldiv_ynan_or_inf:
     958 	/* If y is Infinity, return zero.  */
     959 	slli	a8, a3, 9
     960 	beqz	a8, .Ldiv_return_zero
     961 	/* y is NaN; return it.  */
     962 	mov	a2, a3
     963 	leaf_return
    964 
        	/* float __divsf3 (float x, float y)
        	   Single-precision division: returns x / y in a2, with x in a2
        	   and y in a3 on entry.  Uses a one-bit-at-a-time restoring
        	   division of the mantissas with round-to-nearest-even.
        	   Register roles below: a6 = 0x7f800000 exponent mask,
        	   a7 = x ^ y (result sign in bit 31), a8 = result exponent,
        	   a9 = loop counter, a10 = quotient, a2 = running remainder
        	   once the divide loop starts.  leaf_entry/leaf_return are
        	   ABI-selecting macros defined elsewhere in this file.  */
    965 	.align	4
    966 	.global	__divsf3
    967 	.type	__divsf3, @function
    968 __divsf3:
    969 	leaf_entry sp, 16
    970 	movi	a6, 0x7f800000
    971 
    972 	/* Get the sign of the result.  */
    973 	xor	a7, a2, a3
    974 
    975 	/* Check for NaN and infinity.  */
    976 	ball	a2, a6, .Ldiv_xnan_or_inf
    977 	ball	a3, a6, .Ldiv_ynan_or_inf
    978 
    979 	/* Extract the exponents.  */
    980 	extui	a8, a2, 23, 8
    981 	extui	a9, a3, 23, 8
    982 
    983 	beqz	a9, .Ldiv_yexpzero
    984 .Ldiv_ynormalized:
    985 	beqz	a8, .Ldiv_xexpzero
    986 .Ldiv_xnormalized:
    987 
    988 	/* Subtract the exponents.  */
    989 	sub	a8, a8, a9
    990 
    991 	/* Replace sign/exponent fields with explicit "1.0".  */
        	/* OR sets the exponent field to all ones; AND with 0xffffff
        	   then keeps only bits 0..23, leaving the 24-bit mantissa with
        	   the implicit one explicit at bit 23.  */
    992 	movi	a10, 0xffffff
    993 	or	a2, a2, a6
    994 	and	a2, a2, a10
    995 	or	a3, a3, a6
    996 	and	a3, a3, a10
    997 
    998 	/* The first digit of the mantissa division must be a one.
    999 	   Shift x (and adjust the exponent) as needed to make this true.  */
   1000 	bltu	a3, a2, 1f
   1001 	slli	a2, a2, 1
   1002 	addi	a8, a8, -1
   1003 1:
   1004 	/* Do the first subtraction and shift.  */
   1005 	sub	a2, a2, a3
   1006 	slli	a2, a2, 1
   1007 
   1008 	/* Put the quotient into a10.  */
   1009 	movi	a10, 1
   1010 
   1011 	/* Divide one bit at a time for 23 bits.  */
        	/* Uses the zero-overhead loop instruction when the core has
        	   it; otherwise an explicit decrement-and-branch loop.  */
   1012 	movi	a9, 23
   1013 #if XCHAL_HAVE_LOOPS
   1014 	loop	a9, .Ldiv_loopend
   1015 #endif
   1016 .Ldiv_loop:
   1017 	/* Shift the quotient << 1.  */
   1018 	slli	a10, a10, 1
   1019 
   1020 	/* Is this digit a 0 or 1?  */
   1021 	bltu	a2, a3, 1f
   1022 
   1023 	/* Output a 1 and subtract.  */
   1024 	addi	a10, a10, 1
   1025 	sub	a2, a2, a3
   1026 
   1027 	/* Shift the dividend << 1.  */
   1028 1:	slli	a2, a2, 1
   1029 
   1030 #if !XCHAL_HAVE_LOOPS
   1031 	addi	a9, a9, -1
   1032 	bnez	a9, .Ldiv_loop
   1033 #endif
   1034 .Ldiv_loopend:
   1035 
   1036 	/* Add the exponent bias (less one to account for the explicit "1.0"
   1037 	   of the mantissa that will be added to the exponent in the final
   1038 	   result).  */
   1039 	addi	a8, a8, 0x7e
   1040 
   1041 	/* Check for over/underflow.  The value in a8 is one less than the
   1042 	   final exponent, so values in the range 0..fd are OK here.  */
   1043 	movi	a4, 0xfe
   1044 	bgeu	a8, a4, .Ldiv_overflow
   1045 
   1046 .Ldiv_round:
   1047 	/* Round.  The remainder (<< 1) is in a2.  */
        	/* remainder*2 >= divisor means the discarded fraction is at
        	   least 1/2, so round up; exactly equal is the halfway case.  */
   1048 	bltu	a2, a3, .Ldiv_rounded
   1049 	addi	a10, a10, 1
   1050 	beq	a2, a3, .Ldiv_exactlyhalf
   1051 
   1052 .Ldiv_rounded:
   1053 	/* Add the exponent to the mantissa.  */
   1054 	slli	a8, a8, 23
   1055 	add	a2, a10, a8
   1056 
   1057 .Ldiv_addsign:
   1058 	/* Add the sign bit.  */
   1059 	srli	a7, a7, 31
   1060 	slli	a7, a7, 31
   1061 	or	a2, a2, a7
   1062 	leaf_return
   1063 
   1064 .Ldiv_overflow:
   1065 	bltz	a8, .Ldiv_underflow
   1066 	/* Return +/- Infinity.  */
   1067 	addi	a8, a4, 1	/* 0xff */
   1068 	slli	a2, a8, 23
   1069 	j	.Ldiv_addsign
   1070 
   1071 .Ldiv_exactlyhalf:
   1072 	/* Remainder is exactly half the divisor.  Round even.  */
   1073 	srli	a10, a10, 1
   1074 	slli	a10, a10, 1
   1075 	j	.Ldiv_rounded
   1076 
   1077 .Ldiv_underflow:
   1078 	/* Create a subnormal value, where the exponent field contains zero,
   1079 	   but the effective exponent is 1.  The value of a8 is one less than
   1080 	   the actual exponent, so just negate it to get the shift amount.  */
   1081 	neg	a8, a8
   1082 	ssr	a8
   1083 	bgeui	a8, 32, .Ldiv_flush_to_zero
   1084 
   1085 	/* Shift a10 right.  Any bits that are shifted out of a10 are
   1086 	   saved in a6 for rounding the result.  */
        	/* SAR was set by the ssr above; sll/srl use 32-shift / shift.  */
   1087 	sll	a6, a10
   1088 	srl	a10, a10
   1089 
   1090 	/* Set the exponent to zero.  */
   1091 	movi	a8, 0
   1092 
   1093 	/* Pack any nonzero remainder (in a2) into a6.  */
   1094 	beqz	a2, 1f
   1095 	movi	a9, 1
   1096 	or	a6, a6, a9
   1097 
   1098 	/* Round a10 based on the bits shifted out into a6.  */
        	/* a6 bit 31 set -> fraction >= 1/2, round up; if the remaining
        	   bits are all zero it was exactly 1/2, so round to even by
        	   clearing the low bit of a10.  */
   1099 1:	bgez	a6, .Ldiv_rounded
   1100 	addi	a10, a10, 1
   1101 	slli	a6, a6, 1
   1102 	bnez	a6, .Ldiv_rounded
   1103 	srli	a10, a10, 1
   1104 	slli	a10, a10, 1
   1105 	j	.Ldiv_rounded
   1106 
   1107 .Ldiv_flush_to_zero:
   1108 	/* Return zero with the appropriate sign bit.  */
   1109 	srli	a2, a7, 31
   1110 	slli	a2, a2, 31
   1111 	leaf_return
   1112 
   1113 #endif /* L_divsf3 */
   1114 
   1115 #ifdef L_cmpsf2
   1116 
   1117 	/* Equal and Not Equal */
   1118 
        	/* int __eqsf2 (float x, float y) / int __nesf2 (float x, float y)
        	   Returns 0 if x == y and 1 otherwise.  NaN compares unequal
        	   to everything (including itself); +0 and -0 compare equal.
        	   Callers test the result against zero, so a single routine
        	   serves both __eqsf2 and __nesf2.  */
   1119 	.align	4
   1120 	.global	__eqsf2
   1121 	.global	__nesf2
   1122 	.set	__nesf2, __eqsf2
   1123 	.type	__eqsf2, @function
   1124 __eqsf2:
   1125 	leaf_entry sp, 16
   1126 	bne	a2, a3, 4f
   1127 
   1128 	/* The values are equal but NaN != NaN.  Check the exponent.  */
   1129 	movi	a6, 0x7f800000
   1130 	ball	a2, a6, 3f
   1131 
   1132 	/* Equal.  */
   1133 	movi	a2, 0
   1134 	leaf_return
   1135 
   1136 	/* Not equal.  */
        	/* NOTE(review): no branch in this function targets label 2;
        	   this return path looks unreachable — confirm before removing.  */
   1137 2:	movi	a2, 1
   1138 	leaf_return
   1139 
   1140 	/* Check if the mantissas are nonzero.  */
   1141 3:	slli	a7, a2, 9
   1142 	j	5f
   1143 
   1144 	/* Check if x and y are zero with different signs.  */
   1145 4:	or	a7, a2, a3
   1146 	slli	a7, a7, 1
   1147 
   1148 	/* Equal if a7 == 0, where a7 is either abs(x | y) or the mantissa
   1149 	   of x when the exponent field of x is all ones and x == y.  */
   1150 5:	movi	a2, 0
   1151 	movi	a3, 1
   1152 	movnez	a2, a3, a7
   1153 	leaf_return
   1154 
   1155 
   1156 	/* Greater Than */
   1157 
        	/* int __gtsf2 (float x, float y)
        	   Returns 1 if x > y, 0 otherwise; 0 if either operand is NaN
        	   (callers test for > 0).  After the NaN screening below it
        	   falls into .Lle_cmp, the ordered-compare tail shared with
        	   __lesf2 (defined just after this function).  */
   1158 	.align	4
   1159 	.global	__gtsf2
   1160 	.type	__gtsf2, @function
   1161 __gtsf2:
   1162 	leaf_entry sp, 16
   1163 	movi	a6, 0x7f800000
   1164 	ball	a2, a6, 2f
   1165 1:	bnall	a3, a6, .Lle_cmp
   1166 
   1167 	/* Check if y is a NaN.  */
   1168 	slli	a7, a3, 9
   1169 	beqz	a7, .Lle_cmp
   1170 	movi	a2, 0
   1171 	leaf_return
   1172 
   1173 	/* Check if x is a NaN.  */
   1174 2:	slli	a7, a2, 9
   1175 	beqz	a7, 1b
   1176 	movi	a2, 0
   1177 	leaf_return
   1178 
   1179 
   1180 	/* Less Than or Equal */
   1181 
        	/* int __lesf2 (float x, float y)
        	   Returns 0 if x <= y, 1 otherwise; 1 if either operand is NaN
        	   (callers test for <= 0).  The .Lle_cmp tail is shared with
        	   __gtsf2, which wants the same 0/1 results.  */
   1182 	.align	4
   1183 	.global	__lesf2
   1184 	.type	__lesf2, @function
   1185 __lesf2:
   1186 	leaf_entry sp, 16
   1187 	movi	a6, 0x7f800000
   1188 	ball	a2, a6, 2f
   1189 1:	bnall	a3, a6, .Lle_cmp
   1190 
   1191 	/* Check if y is a NaN.  */
   1192 	slli	a7, a3, 9
   1193 	beqz	a7, .Lle_cmp
   1194 	movi	a2, 1
   1195 	leaf_return
   1196 
   1197 	/* Check if x is a NaN.  */
   1198 2:	slli	a7, a2, 9
   1199 	beqz	a7, 1b
   1200 	movi	a2, 1
   1201 	leaf_return
   1202 
        	/* Ordered compare for non-NaN operands, shared with __gtsf2.
        	   For operands of the same sign, IEEE floats order the same
        	   way as their bit patterns compared as unsigned integers
        	   (reversed when both are negative).  */
   1203 .Lle_cmp:
   1204 	/* Check if x and y have different signs.  */
   1205 	xor	a7, a2, a3
   1206 	bltz	a7, .Lle_diff_signs
   1207 
   1208 	/* Check if x is negative.  */
   1209 	bltz	a2, .Lle_xneg
   1210 
   1211 	/* Check if x <= y.  */
   1212 	bltu	a3, a2, 5f
   1213 4:	movi	a2, 0
   1214 	leaf_return
   1215 
   1216 .Lle_xneg:
   1217 	/* Check if y <= x.  */
   1218 	bgeu	a2, a3, 4b
   1219 5:	movi	a2, 1
   1220 	leaf_return
   1221 
        	/* Different signs: if x is negative then x <= y, unless both
        	   are zeros of opposite sign, which still compare equal.  */
   1222 .Lle_diff_signs:
   1223 	bltz	a2, 4b
   1224 
   1225 	/* Check if both x and y are zero.  */
   1226 	or	a7, a2, a3
   1227 	slli	a7, a7, 1
   1228 	movi	a2, 1
   1229 	movi	a3, 0
   1230 	moveqz	a2, a3, a7
   1231 	leaf_return
   1232 
   1233 
   1234 	/* Greater Than or Equal */
   1235 
        	/* int __gesf2 (float x, float y)
        	   Returns 0 if x >= y, -1 otherwise; -1 if either operand is
        	   NaN (callers test for >= 0).  Falls into .Llt_cmp, the
        	   ordered-compare tail shared with __ltsf2 below.  */
   1236 	.align	4
   1237 	.global	__gesf2
   1238 	.type	__gesf2, @function
   1239 __gesf2:
   1240 	leaf_entry sp, 16
   1241 	movi	a6, 0x7f800000
   1242 	ball	a2, a6, 2f
   1243 1:	bnall	a3, a6, .Llt_cmp
   1244 
   1245 	/* Check if y is a NaN.  */
   1246 	slli	a7, a3, 9
   1247 	beqz	a7, .Llt_cmp
   1248 	movi	a2, -1
   1249 	leaf_return
   1250 
   1251 	/* Check if x is a NaN.  */
   1252 2:	slli	a7, a2, 9
   1253 	beqz	a7, 1b
   1254 	movi	a2, -1
   1255 	leaf_return
   1256 
   1257 
   1258 	/* Less Than */
   1259 
        	/* int __ltsf2 (float x, float y)
        	   Returns -1 if x < y, 0 otherwise; 0 if either operand is NaN
        	   (callers test for < 0).  The .Llt_cmp tail is shared with
        	   __gesf2, which wants the same -1/0 results.  */
   1260 	.align	4
   1261 	.global	__ltsf2
   1262 	.type	__ltsf2, @function
   1263 __ltsf2:
   1264 	leaf_entry sp, 16
   1265 	movi	a6, 0x7f800000
   1266 	ball	a2, a6, 2f
   1267 1:	bnall	a3, a6, .Llt_cmp
   1268 
   1269 	/* Check if y is a NaN.  */
   1270 	slli	a7, a3, 9
   1271 	beqz	a7, .Llt_cmp
   1272 	movi	a2, 0
   1273 	leaf_return
   1274 
   1275 	/* Check if x is a NaN.  */
   1276 2:	slli	a7, a2, 9
   1277 	beqz	a7, 1b
   1278 	movi	a2, 0
   1279 	leaf_return
   1280 
        	/* Ordered compare for non-NaN operands, shared with __gesf2.
        	   Same-sign floats order like their bit patterns compared as
        	   unsigned integers (reversed when both are negative).  */
   1281 .Llt_cmp:
   1282 	/* Check if x and y have different signs.  */
   1283 	xor	a7, a2, a3
   1284 	bltz	a7, .Llt_diff_signs
   1285 
   1286 	/* Check if x is negative.  */
   1287 	bltz	a2, .Llt_xneg
   1288 
   1289 	/* Check if x < y.  */
   1290 	bgeu	a2, a3, 5f
   1291 4:	movi	a2, -1
   1292 	leaf_return
   1293 
   1294 .Llt_xneg:
   1295 	/* Check if y < x.  */
   1296 	bltu	a3, a2, 4b
   1297 5:	movi	a2, 0
   1298 	leaf_return
   1299 
        	/* Different signs: if x is positive then x >= y.  Otherwise x
        	   is negative and y positive, so x < y unless both are zeros
        	   of opposite sign (which compare equal).  */
   1300 .Llt_diff_signs:
   1301 	bgez	a2, 5b
   1302 
   1303 	/* x < y unless x and y are both zero (ignoring sign bits).  */
   1304 	or	a7, a2, a3
   1305 	slli	a7, a7, 1
   1306 	movi	a2, 0
   1307 	movi	a3, -1
   1308 	movnez	a2, a3, a7
   1309 	leaf_return
   1310 
   1311 
   1312 	/* Unordered */
   1313 
        	/* int __unordsf2 (float x, float y)
        	   Returns 1 if either operand is a NaN (exponent all ones and
        	   nonzero mantissa), 0 otherwise.  */
   1314 	.align	4
   1315 	.global	__unordsf2
   1316 	.type	__unordsf2, @function
   1317 __unordsf2:
   1318 	leaf_entry sp, 16
   1319 	movi	a6, 0x7f800000
   1320 	ball	a2, a6, 3f
   1321 1:	ball	a3, a6, 4f
   1322 2:	movi	a2, 0
   1323 	leaf_return
   1324 
        	/* x has an all-ones exponent: NaN if its mantissa is nonzero,
        	   otherwise it is Infinity, so go back and check y.  */
   1325 3:	slli	a7, a2, 9
   1326 	beqz	a7, 1b
   1327 	movi	a2, 1
   1328 	leaf_return
   1329 
        	/* Same check for y.  */
   1330 4:	slli	a7, a3, 9
   1331 	beqz	a7, 2b
   1332 	movi	a2, 1
   1333 	leaf_return
   1334 
   1335 #endif /* L_cmpsf2 */
   1336 
   1337 #ifdef L_fixsfsi
   1338 
        	/* int __fixsfsi (float x)
        	   Convert single-precision float to signed 32-bit integer,
        	   truncating toward zero.  NaN yields 0x7fffffff; values out
        	   of range (including +/-Inf) saturate to 0x7fffffff or
        	   0x80000000 according to the sign; |x| < 1 yields 0.  */
   1339 	.align	4
   1340 	.global	__fixsfsi
   1341 	.type	__fixsfsi, @function
   1342 __fixsfsi:
   1343 	leaf_entry sp, 16
   1344 
   1345 	/* Check for NaN and Infinity.  */
   1346 	movi	a6, 0x7f800000
   1347 	ball	a2, a6, .Lfixsfsi_nan_or_inf
   1348 
   1349 	/* Extract the exponent and check if 0 < (exp - 0x7e) < 32.  */
   1350 	extui	a4, a2, 23, 8
   1351 	addi	a4, a4, -0x7e
   1352 	bgei	a4, 32, .Lfixsfsi_maxint
   1353 	blti	a4, 1, .Lfixsfsi_zero
   1354 
   1355 	/* Add explicit "1.0" and shift << 8.  */
   1356 	or	a7, a2, a6
   1357 	slli	a5, a7, 8
   1358 
   1359 	/* Shift back to the right, based on the exponent.  */
   1360 	ssl	a4		/* shift by 32 - a4 */
   1361 	srl	a5, a5
   1362 
   1363 	/* Negate the result if sign != 0.  */
        	/* a7 still has the original sign bit; movgez keeps the
        	   positive value when a7 >= 0.  */
   1364 	neg	a2, a5
   1365 	movgez	a2, a5, a7
   1366 	leaf_return
   1367 
   1368 .Lfixsfsi_nan_or_inf:
   1369 	/* Handle Infinity and NaN.  */
   1370 	slli	a4, a2, 9
   1371 	beqz	a4, .Lfixsfsi_maxint
   1372 
   1373 	/* Translate NaN to +maxint.  */
   1374 	movi	a2, 0
   1375 
   1376 .Lfixsfsi_maxint:
        	/* Saturate: 0x7fffffff if the (possibly cleared) sign in a2
        	   is non-negative, else 0x80000000.  */
   1377 	slli	a4, a6, 8	/* 0x80000000 */
   1378 	addi	a5, a4, -1	/* 0x7fffffff */
   1379 	movgez	a4, a5, a2
   1380 	mov	a2, a4
   1381 	leaf_return
   1382 
   1383 .Lfixsfsi_zero:
   1384 	movi	a2, 0
   1385 	leaf_return
   1386 
   1387 #endif /* L_fixsfsi */
   1388 
   1389 #ifdef L_fixsfdi
   1390 
        	/* long long __fixsfdi (float x)
        	   Convert single-precision float to signed 64-bit integer,
        	   truncating toward zero.  The result is returned in the
        	   xh/xl register pair (a2/a3 ordered by endianness macros at
        	   the top of this file).  NaN yields 0x7fffffffffffffff;
        	   out-of-range values saturate to LLONG_MAX/LLONG_MIN by
        	   sign; |x| < 1 yields 0.  */
   1391 	.align	4
   1392 	.global	__fixsfdi
   1393 	.type	__fixsfdi, @function
   1394 __fixsfdi:
   1395 	leaf_entry sp, 16
   1396 
   1397 	/* Check for NaN and Infinity.  */
   1398 	movi	a6, 0x7f800000
   1399 	ball	a2, a6, .Lfixsfdi_nan_or_inf
   1400 
   1401 	/* Extract the exponent and check if 0 < (exp - 0x7e) < 64.  */
   1402 	extui	a4, a2, 23, 8
   1403 	addi	a4, a4, -0x7e
   1404 	bgei	a4, 64, .Lfixsfdi_maxint
   1405 	blti	a4, 1, .Lfixsfdi_zero
   1406 
   1407 	/* Add explicit "1.0" and shift << 8.  */
   1408 	or	a7, a2, a6
   1409 	slli	xh, a7, 8
   1410 
   1411 	/* Shift back to the right, based on the exponent.  */
   1412 	ssl	a4		/* shift by 64 - a4 */
   1413 	bgei	a4, 32, .Lfixsfdi_smallshift
   1414 	srl	xl, xh
   1415 	movi	xh, 0
   1416 
   1417 .Lfixsfdi_shifted:
   1418 	/* Negate the result if sign != 0.  */
        	/* Two's-complement negate of the 64-bit pair: negate both
        	   halves, then subtract the borrow from the high word when
        	   the low word is nonzero.  */
   1419 	bgez	a7, 1f
   1420 	neg	xl, xl
   1421 	neg	xh, xh
   1422 	beqz	xl, 1f
   1423 	addi	xh, xh, -1
   1424 1:	leaf_return
   1425 
        	/* Shift amount < 32: result spans both words.  */
   1426 .Lfixsfdi_smallshift:
   1427 	movi	xl, 0
   1428 	sll	xl, xh
   1429 	srl	xh, xh
   1430 	j	.Lfixsfdi_shifted
   1431 
   1432 .Lfixsfdi_nan_or_inf:
   1433 	/* Handle Infinity and NaN.  */
   1434 	slli	a4, a2, 9
   1435 	beqz	a4, .Lfixsfdi_maxint
   1436 
   1437 	/* Translate NaN to +maxint.  */
   1438 	movi	a2, 0
   1439 
   1440 .Lfixsfdi_maxint:
   1441 	slli	a7, a6, 8	/* 0x80000000 */
   1442 	bgez	a2, 1f
   1443 	mov	xh, a7
   1444 	movi	xl, 0
   1445 	leaf_return
   1446 
   1447 1:	addi	xh, a7, -1	/* 0x7fffffff */
   1448 	movi	xl, -1
   1449 	leaf_return
   1450 
   1451 .Lfixsfdi_zero:
   1452 	movi	xh, 0
   1453 	movi	xl, 0
   1454 	leaf_return
   1455 
   1456 #endif /* L_fixsfdi */
   1457 
   1458 #ifdef L_fixunssfsi
   1459 
        	/* unsigned __fixunssfsi (float x)
        	   Convert single-precision float to unsigned 32-bit integer,
        	   truncating toward zero.  NaN yields 0xffffffff; values too
        	   large yield 0xffffffff (0x80000000 when negative); |x| < 1
        	   yields 0.  For in-range negative inputs (undefined in C)
        	   this returns the negation of the truncated magnitude.  */
   1460 	.align	4
   1461 	.global	__fixunssfsi
   1462 	.type	__fixunssfsi, @function
   1463 __fixunssfsi:
   1464 	leaf_entry sp, 16
   1465 
   1466 	/* Check for NaN and Infinity.  */
   1467 	movi	a6, 0x7f800000
   1468 	ball	a2, a6, .Lfixunssfsi_nan_or_inf
   1469 
   1470 	/* Extract the exponent and check if 0 <= (exp - 0x7f) < 32.  */
   1471 	extui	a4, a2, 23, 8
   1472 	addi	a4, a4, -0x7f
   1473 	bgei	a4, 32, .Lfixunssfsi_maxint
   1474 	bltz	a4, .Lfixunssfsi_zero
   1475 
   1476 	/* Add explicit "1.0" and shift << 8.  */
   1477 	or	a7, a2, a6
   1478 	slli	a5, a7, 8
   1479 
   1480 	/* Shift back to the right, based on the exponent.  */
        	/* A shift count of 32 would be out of range for ssl/srl, so
        	   the 2^31 <= |x| < 2^32 case is handled separately.  */
   1481 	addi	a4, a4, 1
   1482 	beqi	a4, 32, .Lfixunssfsi_bigexp
   1483 	ssl	a4		/* shift by 32 - a4 */
   1484 	srl	a5, a5
   1485 
   1486 	/* Negate the result if sign != 0.  */
   1487 	neg	a2, a5
   1488 	movgez	a2, a5, a7
   1489 	leaf_return
   1490 
   1491 .Lfixunssfsi_nan_or_inf:
   1492 	/* Handle Infinity and NaN.  */
   1493 	slli	a4, a2, 9
   1494 	beqz	a4, .Lfixunssfsi_maxint
   1495 
   1496 	/* Translate NaN to 0xffffffff.  */
   1497 	movi	a2, -1
   1498 	leaf_return
   1499 
   1500 .Lfixunssfsi_maxint:
   1501 	slli	a4, a6, 8	/* 0x80000000 */
   1502 	movi	a5, -1		/* 0xffffffff */
   1503 	movgez	a4, a5, a2
   1504 	mov	a2, a4
   1505 	leaf_return
   1506 
   1507 .Lfixunssfsi_zero:
   1508 	movi	a2, 0
   1509 	leaf_return
   1510 
   1511 .Lfixunssfsi_bigexp:
   1512 	/* Handle unsigned maximum exponent case.  */
   1513 	bltz	a2, 1f
   1514 	mov	a2, a5		/* no shift needed */
   1515 	leaf_return
   1516 
   1517 	/* Return 0x80000000 if negative.  */
   1518 1:	slli	a2, a6, 8
   1519 	leaf_return
   1520 
   1521 #endif /* L_fixunssfsi */
   1522 
   1523 #ifdef L_fixunssfdi
   1524 
        	/* unsigned long long __fixunssfdi (float x)
        	   Convert single-precision float to unsigned 64-bit integer,
        	   truncating toward zero.  Result in the xh/xl pair.  NaN and
        	   too-large positive values yield all ones; too-large negative
        	   values yield 0x8000000000000000; |x| < 1 yields 0.  In-range
        	   negative inputs (undefined in C) return the negation of the
        	   truncated magnitude.  */
   1525 	.align	4
   1526 	.global	__fixunssfdi
   1527 	.type	__fixunssfdi, @function
   1528 __fixunssfdi:
   1529 	leaf_entry sp, 16
   1530 
   1531 	/* Check for NaN and Infinity.  */
   1532 	movi	a6, 0x7f800000
   1533 	ball	a2, a6, .Lfixunssfdi_nan_or_inf
   1534 
   1535 	/* Extract the exponent and check if 0 <= (exp - 0x7f) < 64.  */
   1536 	extui	a4, a2, 23, 8
   1537 	addi	a4, a4, -0x7f
   1538 	bgei	a4, 64, .Lfixunssfdi_maxint
   1539 	bltz	a4, .Lfixunssfdi_zero
   1540 
   1541 	/* Add explicit "1.0" and shift << 8.  */
   1542 	or	a7, a2, a6
   1543 	slli	xh, a7, 8
   1544 
   1545 	/* Shift back to the right, based on the exponent.  */
        	/* A shift count of 64 is out of range, so 2^63 <= |x| < 2^64
        	   is handled separately at .Lfixunssfdi_bigexp.  */
   1546 	addi	a4, a4, 1
   1547 	beqi	a4, 64, .Lfixunssfdi_bigexp
   1548 	ssl	a4		/* shift by 64 - a4 */
   1549 	bgei	a4, 32, .Lfixunssfdi_smallshift
   1550 	srl	xl, xh
   1551 	movi	xh, 0
   1552 
   1553 .Lfixunssfdi_shifted:
   1554 	/* Negate the result if sign != 0.  */
        	/* 64-bit two's-complement negate: negate both halves and
        	   propagate the borrow into the high word.  */
   1555 	bgez	a7, 1f
   1556 	neg	xl, xl
   1557 	neg	xh, xh
   1558 	beqz	xl, 1f
   1559 	addi	xh, xh, -1
   1560 1:	leaf_return
   1561 
        	/* Shift amount < 32: result spans both words.  */
   1562 .Lfixunssfdi_smallshift:
   1563 	movi	xl, 0
   1564 	src	xl, xh, xl
   1565 	srl	xh, xh
   1566 	j	.Lfixunssfdi_shifted
   1567 
   1568 .Lfixunssfdi_nan_or_inf:
   1569 	/* Handle Infinity and NaN.  */
   1570 	slli	a4, a2, 9
   1571 	beqz	a4, .Lfixunssfdi_maxint
   1572 
   1573 	/* Translate NaN to 0xffffffff.... */
   1574 1:	movi	xh, -1
   1575 	movi	xl, -1
   1576 	leaf_return
   1577 
   1578 .Lfixunssfdi_maxint:
   1579 	bgez	a2, 1b
   1580 2:	slli	xh, a6, 8	/* 0x80000000 */
   1581 	movi	xl, 0
   1582 	leaf_return
   1583 
   1584 .Lfixunssfdi_zero:
   1585 	movi	xh, 0
   1586 	movi	xl, 0
   1587 	leaf_return
   1588 
   1589 .Lfixunssfdi_bigexp:
   1590 	/* Handle unsigned maximum exponent case.  */
   1591 	bltz	a7, 2b
   1592 	movi	xl, 0
   1593 	leaf_return		/* no shift needed */
   1594 
   1595 #endif /* L_fixunssfdi */
   1596 
   1597 #ifdef L_floatsisf
   1598 
        	/* float __floatunsisf (unsigned x) / float __floatsisf (int x)
        	   Convert a 32-bit integer (in a2) to single-precision float
        	   (in a2), rounding to nearest with ties to even.  The
        	   unsigned entry forces the sign to zero and shares all of
        	   the normalization/rounding code with the signed entry.  */
   1599 	.align	4
   1600 	.global	__floatunsisf
   1601 	.type	__floatunsisf, @function
   1602 __floatunsisf:
   1603 	leaf_entry sp, 16
   1604 	beqz	a2, .Lfloatsisf_return
   1605 
   1606 	/* Set the sign to zero and jump to the floatsisf code.  */
   1607 	movi	a7, 0
   1608 	j	.Lfloatsisf_normalize
   1609 
   1610 	.align	4
   1611 	.global	__floatsisf
   1612 	.type	__floatsisf, @function
   1613 __floatsisf:
   1614 	leaf_entry sp, 16
   1615 
   1616 	/* Check for zero.  */
   1617 	beqz	a2, .Lfloatsisf_return
   1618 
   1619 	/* Save the sign.  */
   1620 	extui	a7, a2, 31, 1
   1621 
   1622 	/* Get the absolute value.  */
   1623 #if XCHAL_HAVE_ABS
   1624 	abs	a2, a2
   1625 #else
   1626 	neg	a4, a2
   1627 	movltz	a2, a4, a2
   1628 #endif
   1629 
   1630 .Lfloatsisf_normalize:
   1631 	/* Normalize with the first 1 bit in the msb.  */
   1632 	do_nsau	a4, a2, a5, a6
   1633 	ssl	a4
   1634 	sll	a5, a2
   1635 
   1636 	/* Shift the mantissa into position, with rounding bits in a6.  */
   1637 	srli	a2, a5, 8
   1638 	slli	a6, a5, (32 - 8)
   1639 
   1640 	/* Set the exponent.  */
        	/* Leading-zero count a4 reduces the exponent: the value is
        	   mantissa * 2^(31 - a4), so biased exponent = 0x9d - a4.  */
   1641 	movi	a5, 0x9d	/* 0x7e + 31 */
   1642 	sub	a5, a5, a4
   1643 	slli	a5, a5, 23
   1644 	add	a2, a2, a5
   1645 
   1646 	/* Add the sign.  */
   1647 	slli	a7, a7, 31
   1648 	or	a2, a2, a7
   1649 
   1650 	/* Round up if the leftover fraction is >= 1/2.  */
        	/* a6 bit 31 holds the guard bit; the remaining bits are the
        	   sticky bits for the round-to-even tie check below.  */
   1651 	bgez	a6, .Lfloatsisf_return
   1652 	addi	a2, a2, 1	/* Overflow to the exponent is OK.  */
   1653 
   1654 	/* Check if the leftover fraction is exactly 1/2.  */
   1655 	slli	a6, a6, 1
   1656 	beqz	a6, .Lfloatsisf_exactlyhalf
   1657 
   1658 .Lfloatsisf_return:
   1659 	leaf_return
   1660 
   1661 .Lfloatsisf_exactlyhalf:
   1662 	/* Round down to the nearest even value.  */
   1663 	srli	a2, a2, 1
   1664 	slli	a2, a2, 1
   1665 	leaf_return
   1666 
   1667 #endif /* L_floatsisf */
   1668 
   1669 #ifdef L_floatdisf
   1670 
        	/* float __floatundisf (unsigned long long x)
        	   float __floatdisf (long long x)
        	   Convert a 64-bit integer (in the xh/xl pair) to a
        	   single-precision float (in a2), rounding to nearest with
        	   ties to even.  The unsigned entry forces the sign to zero
        	   and shares the normalization/rounding code.  */
   1671 	.align	4
   1672 	.global	__floatundisf
   1673 	.type	__floatundisf, @function
   1674 __floatundisf:
   1675 	leaf_entry sp, 16
   1676 
   1677 	/* Check for zero.  */
   1678 	or	a4, xh, xl
   1679 	beqz	a4, 2f
   1680 
   1681 	/* Set the sign to zero and jump to the floatdisf code.  */
   1682 	movi	a7, 0
   1683 	j	.Lfloatdisf_normalize
   1684 
   1685 	.align	4
   1686 	.global	__floatdisf
   1687 	.type	__floatdisf, @function
   1688 __floatdisf:
   1689 	leaf_entry sp, 16
   1690 
   1691 	/* Check for zero.  */
   1692 	or	a4, xh, xl
   1693 	beqz	a4, 2f
   1694 
   1695 	/* Save the sign.  */
   1696 	extui	a7, xh, 31, 1
   1697 
   1698 	/* Get the absolute value.  */
        	/* 64-bit negate: flip both halves and propagate the borrow
        	   into the high word when the low word is nonzero.  */
   1699 	bgez	xh, .Lfloatdisf_normalize
   1700 	neg	xl, xl
   1701 	neg	xh, xh
   1702 	beqz	xl, .Lfloatdisf_normalize
   1703 	addi	xh, xh, -1
   1704 
   1705 .Lfloatdisf_normalize:
   1706 	/* Normalize with the first 1 bit in the msb of xh.  */
   1707 	beqz	xh, .Lfloatdisf_bigshift
   1708 	do_nsau	a4, xh, a5, a6
   1709 	ssl	a4
   1710 	src	xh, xh, xl
   1711 	sll	xl, xl
   1712 
   1713 .Lfloatdisf_shifted:
   1714 	/* Shift the mantissa into position, with rounding bits in a6.  */
        	/* After ssai 8: xh keeps the top 24 bits, a6 gets the next 32
        	   (guard + sticky), and a5 catches the low 8 bits of xl so no
        	   set bit is lost for the sticky computation.  */
   1715 	ssai	8
   1716 	sll	a5, xl
   1717 	src	a6, xh, xl
   1718 	srl	xh, xh
   1719 	beqz	a5, 1f
   1720 	movi	a5, 1
   1721 	or	a6, a6, a5
   1722 1:
   1723 	/* Set the exponent.  */
        	/* The value is mantissa * 2^(63 - a4), so the biased exponent
        	   is 0xbd - a4.  */
   1724 	movi	a5, 0xbd	/* 0x7e + 63 */
   1725 	sub	a5, a5, a4
   1726 	slli	a5, a5, 23
   1727 	add	a2, xh, a5
   1728 
   1729 	/* Add the sign.  */
   1730 	slli	a7, a7, 31
   1731 	or	a2, a2, a7
   1732 
   1733 	/* Round up if the leftover fraction is >= 1/2.  */
   1734 	bgez	a6, 2f
   1735 	addi	a2, a2, 1	/* Overflow to the exponent is OK.  */
   1736 
   1737 	/* Check if the leftover fraction is exactly 1/2.  */
   1738 	slli	a6, a6, 1
   1739 	beqz	a6, .Lfloatdisf_exactlyhalf
   1740 2:	leaf_return
   1741 
   1742 .Lfloatdisf_bigshift:
   1743 	/* xh is zero.  Normalize with first 1 bit of xl in the msb of xh.  */
   1744 	do_nsau	a4, xl, a5, a6
   1745 	ssl	a4
   1746 	sll	xh, xl
   1747 	movi	xl, 0
   1748 	addi	a4, a4, 32
   1749 	j	.Lfloatdisf_shifted
   1750 
   1751 .Lfloatdisf_exactlyhalf:
   1752 	/* Round down to the nearest even value.  */
   1753 	srli	a2, a2, 1
   1754 	slli	a2, a2, 1
   1755 	leaf_return
   1756 
   1757 #endif /* L_floatdisf */
   1758