Home | History | Annotate | Line # | Download | only in libkern
      1 /* $NetBSD: softfloat.c,v 1.8 2020/09/02 04:06:43 thorpej Exp $ */
      2 
      3 /*
      4  * This version hacked for use with gcc -msoft-float by bjh21.
      5  * (Mostly a case of #ifdefing out things GCC doesn't need or provides
      6  *  itself).
      7  */
      8 
      9 /*
     10  * Things you may want to define:
     11  *
     12  * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
     13  *   -msoft-float) to work.  Include "softfloat-for-gcc.h" to get them
     14  *   properly renamed.
     15  */
     16 
     17 /*============================================================================
     18 
     19 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
     20 Package, Release 2b.
     21 
     22 Written by John R. Hauser.  This work was made possible in part by the
     23 International Computer Science Institute, located at Suite 600, 1947 Center
     24 Street, Berkeley, California 94704.  Funding was partially provided by the
     25 National Science Foundation under grant MIP-9311980.  The original version
     26 of this code was written as part of a project to build a fixed-point vector
     27 processor in collaboration with the University of California at Berkeley,
     28 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
     29 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
     30 arithmetic/SoftFloat.html'.
     31 
     32 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
     33 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
     34 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
     35 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
     36 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
     37 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
     38 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
     39 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
     40 
     41 Derivative works are acceptable, even for commercial purposes, so long as
     42 (1) the source code for the derivative work includes prominent notice that
     43 the work is derivative, and (2) the source code includes prominent notice with
     44 these four paragraphs for those parts of this code that are retained.
     45 
     46 =============================================================================*/
     47 
     48 /* If you need this in a boot program, you have bigger problems... */
     49 #ifndef _STANDALONE
     50 
     51 #include <sys/cdefs.h>
     52 #if defined(LIBC_SCCS) && !defined(lint)
     53 __RCSID("$NetBSD: softfloat.c,v 1.8 2020/09/02 04:06:43 thorpej Exp $");
     54 #endif /* LIBC_SCCS and not lint */
     55 
     56 #ifdef SOFTFLOAT_FOR_GCC
     57 #include "softfloat-for-gcc.h"
     58 #endif
     59 
     60 #include "milieu.h"
     61 #include "softfloat.h"
     62 
     63 /*----------------------------------------------------------------------------
     64 | Floating-point rounding mode, extended double-precision rounding precision,
     65 | and exception flags.
     66 *----------------------------------------------------------------------------*/
     67 /*
     68  * XXX: This may cause options-MULTIPROCESSOR or thread problems someday.
     69  * 	Right now, it does not.  I've removed all other dynamic global
     70  * 	variables. [ross]
     71  */
     72 #ifdef FLOATX80
     73 int8 floatx80_rounding_precision = 80;
     74 #endif
     75 
     76 /*----------------------------------------------------------------------------
     77 | Primitive arithmetic functions, including multi-word arithmetic, and
     78 | division and square root approximations.  (Can be specialized to target if
     79 | desired.)
     80 *----------------------------------------------------------------------------*/
     81 #include "softfloat-macros.h"
     82 
     83 /*----------------------------------------------------------------------------
     84 | Functions and definitions to determine:  (1) whether tininess for underflow
     85 | is detected before or after rounding by default, (2) what (if anything)
     86 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
     87 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
     88 | are propagated from function inputs to output.  These details are target-
     89 | specific.
     90 *----------------------------------------------------------------------------*/
     91 #include "softfloat-specialize.h"
     92 
     93 #ifndef SOFTFLOAT_FOR_GCC /* Not used */
     94 /*----------------------------------------------------------------------------
     95 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
     96 | and 7, and returns the properly rounded 32-bit integer corresponding to the
     97 | input.  If `zSign' is 1, the input is negated before being converted to an
     98 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
     99 | is simply rounded to an integer, with the inexact exception raised if the
    100 | input cannot be represented exactly as an integer.  However, if the fixed-
    101 | point input is too large, the invalid exception is raised and the largest
    102 | positive or negative integer is returned.
    103 *----------------------------------------------------------------------------*/
    104 
    105 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
    106 {
    107     int8 roundingMode;
    108     flag roundNearestEven;
    109     int8 roundIncrement, roundBits;
    110     int32 z;
    111 
    112     roundingMode = float_rounding_mode();
    113     roundNearestEven = ( roundingMode == float_round_nearest_even );
    114     roundIncrement = 0x40;
    115     if ( ! roundNearestEven ) {
    116         if ( roundingMode == float_round_to_zero ) {
    117             roundIncrement = 0;
    118         }
    119         else {
    120             roundIncrement = 0x7F;
    121             if ( zSign ) {
    122                 if ( roundingMode == float_round_up ) roundIncrement = 0;
    123             }
    124             else {
    125                 if ( roundingMode == float_round_down ) roundIncrement = 0;
    126             }
    127         }
    128     }
    129     roundBits = absZ & 0x7F;
    130     absZ = ( absZ + roundIncrement )>>7;
    131     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
    132     z = absZ;
    133     if ( zSign ) z = - z;
    134     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
    135         float_raise( float_flag_invalid );
    136         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
    137     }
    138     if ( roundBits ) float_set_inexact();
    139     return z;
    140 
    141 }
    142 
    143 /*----------------------------------------------------------------------------
    144 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
    145 | `absZ1', with binary point between bits 63 and 64 (between the input words),
    146 | and returns the properly rounded 64-bit integer corresponding to the input.
    147 | If `zSign' is 1, the input is negated before being converted to an integer.
    148 | Ordinarily, the fixed-point input is simply rounded to an integer, with
    149 | the inexact exception raised if the input cannot be represented exactly as
    150 | an integer.  However, if the fixed-point input is too large, the invalid
    151 | exception is raised and the largest positive or negative integer is
    152 | returned.
    153 *----------------------------------------------------------------------------*/
    154 
    155 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
    156 {
    157     int8 roundingMode;
    158     flag roundNearestEven, increment;
    159     int64 z;
    160 
    161     roundingMode = float_rounding_mode();
    162     roundNearestEven = ( roundingMode == float_round_nearest_even );
    163     increment = ( (sbits64) absZ1 < 0 );
    164     if ( ! roundNearestEven ) {
    165         if ( roundingMode == float_round_to_zero ) {
    166             increment = 0;
    167         }
    168         else {
    169             if ( zSign ) {
    170                 increment = ( roundingMode == float_round_down ) && absZ1;
    171             }
    172             else {
    173                 increment = ( roundingMode == float_round_up ) && absZ1;
    174             }
    175         }
    176     }
    177     if ( increment ) {
    178         ++absZ0;
    179         if ( absZ0 == 0 ) goto overflow;
    180         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
    181     }
    182     z = absZ0;
    183     if ( zSign ) z = - z;
    184     if ( z && ( ( z < 0 ) ^ zSign ) ) {
    185  overflow:
    186         float_raise( float_flag_invalid );
    187         return
    188               zSign ? (sbits64) LIT64( 0x8000000000000000 )
    189             : LIT64( 0x7FFFFFFFFFFFFFFF );
    190     }
    191     if ( absZ1 ) float_set_inexact();
    192     return z;
    193 
    194 }
    195 
    196 /* same as above, but for unsigned values */
    197 static uint64 roundAndPackUInt64( bits64 absZ0, bits64 absZ1 )
    198 {
    199     int8 roundingMode;
    200     flag roundNearestEven, increment;
    201     uint64 z;
    202 
    203     roundingMode = float_rounding_mode();
    204     roundNearestEven = ( roundingMode == float_round_nearest_even );
    205     increment = ( (sbits64) absZ1 < 0 );
    206     if ( ! roundNearestEven ) {
    207         if ( roundingMode == float_round_to_zero ) {
    208             increment = 0;
    209         }
    210         else {
    211             increment = ( roundingMode == float_round_up ) && absZ1;
    212         }
    213     }
    214     if ( increment ) {
    215         ++absZ0;
    216         if ( absZ0 == 0 ) {
    217             float_raise( float_flag_invalid );
    218             return LIT64( 0x7FFFFFFFFFFFFFFF );
    219 	}
    220         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
    221     }
    222     z = absZ0;
    223     if ( absZ1 ) float_set_inexact();
    224     return z;
    225 
    226 }
    227 #endif /* SOFTFLOAT_FOR_GCC */
    228 
    229 /*----------------------------------------------------------------------------
    230 | Returns the fraction bits of the single-precision floating-point value `a'.
    231 *----------------------------------------------------------------------------*/
    232 
    233 INLINE bits32 extractFloat32Frac( float32 a )
    234 {
    235 
    236     return a & 0x007FFFFF;
    237 
    238 }
    239 
    240 /*----------------------------------------------------------------------------
    241 | Returns the exponent bits of the single-precision floating-point value `a'.
    242 *----------------------------------------------------------------------------*/
    243 
    244 INLINE int16 extractFloat32Exp( float32 a )
    245 {
    246 
    247     return ( a>>23 ) & 0xFF;
    248 
    249 }
    250 
    251 /*----------------------------------------------------------------------------
    252 | Returns the sign bit of the single-precision floating-point value `a'.
    253 *----------------------------------------------------------------------------*/
    254 
    255 INLINE flag extractFloat32Sign( float32 a )
    256 {
    257 
    258     return a>>31;
    259 
    260 }
    261 
    262 /*----------------------------------------------------------------------------
    263 | Normalizes the subnormal single-precision floating-point value represented
    264 | by the denormalized significand `aSig'.  The normalized exponent and
    265 | significand are stored at the locations pointed to by `zExpPtr' and
    266 | `zSigPtr', respectively.
    267 *----------------------------------------------------------------------------*/
    268 
    269 static void
    270  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
    271 {
    272     int8 shiftCount;
    273 
    274     shiftCount = countLeadingZeros32( aSig ) - 8;
    275     *zSigPtr = aSig<<shiftCount;
    276     *zExpPtr = 1 - shiftCount;
    277 
    278 }
    279 
    280 /*----------------------------------------------------------------------------
    281 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
    282 | single-precision floating-point value, returning the result.  After being
    283 | shifted into the proper positions, the three fields are simply added
    284 | together to form the result.  This means that any integer portion of `zSig'
    285 | will be added into the exponent.  Since a properly normalized significand
    286 | will have an integer portion equal to 1, the `zExp' input should be 1 less
    287 | than the desired result exponent whenever `zSig' is a complete, normalized
    288 | significand.
    289 *----------------------------------------------------------------------------*/
    290 
    291 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
    292 {
    293 
    294     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
    295 
    296 }
    297 
    298 /*----------------------------------------------------------------------------
    299 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    300 | and significand `zSig', and returns the proper single-precision floating-
    301 | point value corresponding to the abstract input.  Ordinarily, the abstract
    302 | value is simply rounded and packed into the single-precision format, with
    303 | the inexact exception raised if the abstract input cannot be represented
    304 | exactly.  However, if the abstract value is too large, the overflow and
    305 | inexact exceptions are raised and an infinity or maximal finite value is
    306 | returned.  If the abstract value is too small, the input value is rounded to
    307 | a subnormal number, and the underflow and inexact exceptions are raised if
    308 | the abstract input cannot be represented exactly as a subnormal single-
    309 | precision floating-point number.
    310 |     The input significand `zSig' has its binary point between bits 30
    311 | and 29, which is 7 bits to the left of the usual location.  This shifted
    312 | significand must be normalized or smaller.  If `zSig' is not normalized,
    313 | `zExp' must be 0; in that case, the result returned is a subnormal number,
    314 | and it must not require rounding.  In the usual case that `zSig' is
    315 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
    316 | The handling of underflow and overflow follows the IEC/IEEE Standard for
    317 | Binary Floating-Point Arithmetic.
    318 *----------------------------------------------------------------------------*/
    319 
    320 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
    321 {
    322     int8 roundingMode;
    323     flag roundNearestEven;
    324     int8 roundIncrement, roundBits;
    325     flag isTiny;
    326 
    327     roundingMode = float_rounding_mode();
    328     roundNearestEven = ( roundingMode == float_round_nearest_even );
    329     roundIncrement = 0x40;
    330     if ( ! roundNearestEven ) {
    331         if ( roundingMode == float_round_to_zero ) {
    332             roundIncrement = 0;
    333         }
    334         else {
    335             roundIncrement = 0x7F;
    336             if ( zSign ) {
    337                 if ( roundingMode == float_round_up ) roundIncrement = 0;
    338             }
    339             else {
    340                 if ( roundingMode == float_round_down ) roundIncrement = 0;
    341             }
    342         }
    343     }
    344     roundBits = zSig & 0x7F;
    345     if ( 0xFD <= (bits16) zExp ) {
    346         if (    ( 0xFD < zExp )
    347              || (    ( zExp == 0xFD )
    348                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
    349            ) {
    350             float_raise( float_flag_overflow | float_flag_inexact );
    351             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
    352         }
    353         if ( zExp < 0 ) {
    354             isTiny =
    355                    ( float_detect_tininess == float_tininess_before_rounding )
    356                 || ( zExp < -1 )
    357                 || ( zSig + roundIncrement < 0x80000000 );
    358             shift32RightJamming( zSig, - zExp, &zSig );
    359             zExp = 0;
    360             roundBits = zSig & 0x7F;
    361             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
    362         }
    363     }
    364     if ( roundBits ) float_set_inexact();
    365     zSig = ( zSig + roundIncrement )>>7;
    366     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
    367     if ( zSig == 0 ) zExp = 0;
    368     return packFloat32( zSign, zExp, zSig );
    369 
    370 }
    371 
    372 /*----------------------------------------------------------------------------
    373 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    374 | and significand `zSig', and returns the proper single-precision floating-
    375 | point value corresponding to the abstract input.  This routine is just like
    376 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
    377 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
    378 | floating-point exponent.
    379 *----------------------------------------------------------------------------*/
    380 
    381 static float32
    382  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
    383 {
    384     int8 shiftCount;
    385 
    386     shiftCount = countLeadingZeros32( zSig ) - 1;
    387     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
    388 
    389 }
    390 
    391 /*----------------------------------------------------------------------------
    392 | Returns the fraction bits of the double-precision floating-point value `a'.
    393 *----------------------------------------------------------------------------*/
    394 
    395 INLINE bits64 extractFloat64Frac( float64 a )
    396 {
    397 
    398     return a & LIT64( 0x000FFFFFFFFFFFFF );
    399 
    400 }
    401 
    402 /*----------------------------------------------------------------------------
    403 | Returns the exponent bits of the double-precision floating-point value `a'.
    404 *----------------------------------------------------------------------------*/
    405 
    406 INLINE int16 extractFloat64Exp( float64 a )
    407 {
    408 
    409     return ( a>>52 ) & 0x7FF;
    410 
    411 }
    412 /*----------------------------------------------------------------------------
    413 | Returns the sign bit of the double-precision floating-point value `a'.
    414 *----------------------------------------------------------------------------*/
    415 
    416 INLINE flag extractFloat64Sign( float64 a )
    417 {
    418 
    419     return a>>63;
    420 
    421 }
    422 
    423 /*----------------------------------------------------------------------------
    424 | Normalizes the subnormal double-precision floating-point value represented
    425 | by the denormalized significand `aSig'.  The normalized exponent and
    426 | significand are stored at the locations pointed to by `zExpPtr' and
    427 | `zSigPtr', respectively.
    428 *----------------------------------------------------------------------------*/
    429 
    430 static void
    431  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
    432 {
    433     int8 shiftCount;
    434 
    435     shiftCount = countLeadingZeros64( aSig ) - 11;
    436     *zSigPtr = aSig<<shiftCount;
    437     *zExpPtr = 1 - shiftCount;
    438 
    439 }
    440 
    441 /*----------------------------------------------------------------------------
    442 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
    443 | double-precision floating-point value, returning the result.  After being
    444 | shifted into the proper positions, the three fields are simply added
    445 | together to form the result.  This means that any integer portion of `zSig'
    446 | will be added into the exponent.  Since a properly normalized significand
    447 | will have an integer portion equal to 1, the `zExp' input should be 1 less
    448 | than the desired result exponent whenever `zSig' is a complete, normalized
    449 | significand.
    450 *----------------------------------------------------------------------------*/
    451 
    452 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
    453 {
    454 
    455     return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
    456 
    457 }
    458 
    459 /*----------------------------------------------------------------------------
    460 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    461 | and significand `zSig', and returns the proper double-precision floating-
    462 | point value corresponding to the abstract input.  Ordinarily, the abstract
    463 | value is simply rounded and packed into the double-precision format, with
    464 | the inexact exception raised if the abstract input cannot be represented
    465 | exactly.  However, if the abstract value is too large, the overflow and
    466 | inexact exceptions are raised and an infinity or maximal finite value is
    467 | returned.  If the abstract value is too small, the input value is rounded
    468 | to a subnormal number, and the underflow and inexact exceptions are raised
    469 | if the abstract input cannot be represented exactly as a subnormal double-
    470 | precision floating-point number.
    471 |     The input significand `zSig' has its binary point between bits 62
    472 | and 61, which is 10 bits to the left of the usual location.  This shifted
    473 | significand must be normalized or smaller.  If `zSig' is not normalized,
    474 | `zExp' must be 0; in that case, the result returned is a subnormal number,
    475 | and it must not require rounding.  In the usual case that `zSig' is
    476 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
    477 | The handling of underflow and overflow follows the IEC/IEEE Standard for
    478 | Binary Floating-Point Arithmetic.
    479 *----------------------------------------------------------------------------*/
    480 
    481 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
    482 {
    483     int8 roundingMode;
    484     flag roundNearestEven;
    485     int16 roundIncrement, roundBits;
    486     flag isTiny;
    487 
    488     roundingMode = float_rounding_mode();
    489     roundNearestEven = ( roundingMode == float_round_nearest_even );
    490     roundIncrement = 0x200;
    491     if ( ! roundNearestEven ) {
    492         if ( roundingMode == float_round_to_zero ) {
    493             roundIncrement = 0;
    494         }
    495         else {
    496             roundIncrement = 0x3FF;
    497             if ( zSign ) {
    498                 if ( roundingMode == float_round_up ) roundIncrement = 0;
    499             }
    500             else {
    501                 if ( roundingMode == float_round_down ) roundIncrement = 0;
    502             }
    503         }
    504     }
    505     roundBits = zSig & 0x3FF;
    506     if ( 0x7FD <= (bits16) zExp ) {
    507         if (    ( 0x7FD < zExp )
    508              || (    ( zExp == 0x7FD )
    509                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
    510            ) {
    511             float_raise( float_flag_overflow | float_flag_inexact );
    512             return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
    513         }
    514         if ( zExp < 0 ) {
    515             isTiny =
    516                    ( float_detect_tininess == float_tininess_before_rounding )
    517                 || ( zExp < -1 )
    518                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
    519             shift64RightJamming( zSig, - zExp, &zSig );
    520             zExp = 0;
    521             roundBits = zSig & 0x3FF;
    522             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
    523         }
    524     }
    525     if ( roundBits ) float_set_inexact();
    526     zSig = ( zSig + roundIncrement )>>10;
    527     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
    528     if ( zSig == 0 ) zExp = 0;
    529     return packFloat64( zSign, zExp, zSig );
    530 
    531 }
    532 
    533 /*----------------------------------------------------------------------------
    534 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    535 | and significand `zSig', and returns the proper double-precision floating-
    536 | point value corresponding to the abstract input.  This routine is just like
    537 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
    538 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
    539 | floating-point exponent.
    540 *----------------------------------------------------------------------------*/
    541 
    542 static float64
    543  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
    544 {
    545     int8 shiftCount;
    546 
    547     shiftCount = countLeadingZeros64( zSig ) - 1;
    548     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
    549 
    550 }
    551 
    552 #ifdef FLOATX80
    553 
    554 /*----------------------------------------------------------------------------
    555 | Returns the fraction bits of the extended double-precision floating-point
    556 | value `a'.
    557 *----------------------------------------------------------------------------*/
    558 
    559 INLINE bits64 extractFloatx80Frac( floatx80 a )
    560 {
    561 
    562     return a.low;
    563 
    564 }
    565 
    566 /*----------------------------------------------------------------------------
    567 | Returns the exponent bits of the extended double-precision floating-point
    568 | value `a'.
    569 *----------------------------------------------------------------------------*/
    570 
    571 INLINE int32 extractFloatx80Exp( floatx80 a )
    572 {
    573 
    574     return a.high & 0x7FFF;
    575 
    576 }
    577 
    578 /*----------------------------------------------------------------------------
    579 | Returns the sign bit of the extended double-precision floating-point value
    580 | `a'.
    581 *----------------------------------------------------------------------------*/
    582 
    583 INLINE flag extractFloatx80Sign( floatx80 a )
    584 {
    585 
    586     return a.high>>15;
    587 
    588 }
    589 
    590 /*----------------------------------------------------------------------------
    591 | Normalizes the subnormal extended double-precision floating-point value
    592 | represented by the denormalized significand `aSig'.  The normalized exponent
    593 | and significand are stored at the locations pointed to by `zExpPtr' and
    594 | `zSigPtr', respectively.
    595 *----------------------------------------------------------------------------*/
    596 
    597 static void
    598  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
    599 {
    600     int8 shiftCount;
    601 
    602     shiftCount = countLeadingZeros64( aSig );
    603     *zSigPtr = aSig<<shiftCount;
    604     *zExpPtr = 1 - shiftCount;
    605 
    606 }
    607 
    608 /*----------------------------------------------------------------------------
    609 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
    610 | extended double-precision floating-point value, returning the result.
    611 *----------------------------------------------------------------------------*/
    612 
    613 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
    614 {
    615     floatx80 z;
    616 
    617     z.low = zSig;
    618     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
    619     return z;
    620 
    621 }
    622 
    623 /*----------------------------------------------------------------------------
    624 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    625 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
    626 | and returns the proper extended double-precision floating-point value
    627 | corresponding to the abstract input.  Ordinarily, the abstract value is
    628 | rounded and packed into the extended double-precision format, with the
    629 | inexact exception raised if the abstract input cannot be represented
    630 | exactly.  However, if the abstract value is too large, the overflow and
    631 | inexact exceptions are raised and an infinity or maximal finite value is
    632 | returned.  If the abstract value is too small, the input value is rounded to
    633 | a subnormal number, and the underflow and inexact exceptions are raised if
    634 | the abstract input cannot be represented exactly as a subnormal extended
    635 | double-precision floating-point number.
    636 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
    637 | number of bits as single or double precision, respectively.  Otherwise, the
    638 | result is rounded to the full precision of the extended double-precision
    639 | format.
    640 |     The input significand must be normalized or smaller.  If the input
    641 | significand is not normalized, `zExp' must be 0; in that case, the result
    642 | returned is a subnormal number, and it must not require rounding.  The
    643 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
    644 | Floating-Point Arithmetic.
    645 *----------------------------------------------------------------------------*/
    646 
    647 static floatx80
    648  roundAndPackFloatx80(
    649      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
    650  )
    651 {
    652     int8 roundingMode;
    653     flag roundNearestEven, increment, isTiny;
    654     int64 roundIncrement, roundMask, roundBits;
    655 
    656     roundingMode = float_rounding_mode();
    657     roundNearestEven = ( roundingMode == float_round_nearest_even );
    658     if ( roundingPrecision == 80 ) goto precision80;
    659     if ( roundingPrecision == 64 ) {
    660         roundIncrement = LIT64( 0x0000000000000400 );
    661         roundMask = LIT64( 0x00000000000007FF );
    662     }
    663     else if ( roundingPrecision == 32 ) {
    664         roundIncrement = LIT64( 0x0000008000000000 );
    665         roundMask = LIT64( 0x000000FFFFFFFFFF );
    666     }
    667     else {
    668         goto precision80;
    669     }
    670     zSig0 |= ( zSig1 != 0 );
    671     if ( ! roundNearestEven ) {
    672         if ( roundingMode == float_round_to_zero ) {
    673             roundIncrement = 0;
    674         }
    675         else {
    676             roundIncrement = roundMask;
    677             if ( zSign ) {
    678                 if ( roundingMode == float_round_up ) roundIncrement = 0;
    679             }
    680             else {
    681                 if ( roundingMode == float_round_down ) roundIncrement = 0;
    682             }
    683         }
    684     }
    685     roundBits = zSig0 & roundMask;
    686     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
    687         if (    ( 0x7FFE < zExp )
    688              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
    689            ) {
    690             goto overflow;
    691         }
    692         if ( zExp <= 0 ) {
    693             isTiny =
    694                    ( float_detect_tininess == float_tininess_before_rounding )
    695                 || ( zExp < 0 )
    696                 || ( zSig0 <= zSig0 + roundIncrement );
    697             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
    698             zExp = 0;
    699             roundBits = zSig0 & roundMask;
    700             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
    701             if ( roundBits ) float_set_inexact();
    702             zSig0 += roundIncrement;
    703             if ( (sbits64) zSig0 < 0 ) zExp = 1;
    704             roundIncrement = roundMask + 1;
    705             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
    706                 roundMask |= roundIncrement;
    707             }
    708             zSig0 &= ~ roundMask;
    709             return packFloatx80( zSign, zExp, zSig0 );
    710         }
    711     }
    712     if ( roundBits ) float_set_inexact();
    713     zSig0 += roundIncrement;
    714     if ( zSig0 < roundIncrement ) {
    715         ++zExp;
    716         zSig0 = LIT64( 0x8000000000000000 );
    717     }
    718     roundIncrement = roundMask + 1;
    719     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
    720         roundMask |= roundIncrement;
    721     }
    722     zSig0 &= ~ roundMask;
    723     if ( zSig0 == 0 ) zExp = 0;
    724     return packFloatx80( zSign, zExp, zSig0 );
    725  precision80:
    726     increment = ( (sbits64) zSig1 < 0 );
    727     if ( ! roundNearestEven ) {
    728         if ( roundingMode == float_round_to_zero ) {
    729             increment = 0;
    730         }
    731         else {
    732             if ( zSign ) {
    733                 increment = ( roundingMode == float_round_down ) && zSig1;
    734             }
    735             else {
    736                 increment = ( roundingMode == float_round_up ) && zSig1;
    737             }
    738         }
    739     }
    740     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
    741         if (    ( 0x7FFE < zExp )
    742              || (    ( zExp == 0x7FFE )
    743                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
    744                   && increment
    745                 )
    746            ) {
    747             roundMask = 0;
    748  overflow:
    749             float_raise( float_flag_overflow | float_flag_inexact );
    750             if (    ( roundingMode == float_round_to_zero )
    751                  || ( zSign && ( roundingMode == float_round_up ) )
    752                  || ( ! zSign && ( roundingMode == float_round_down ) )
    753                ) {
    754                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
    755             }
    756             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
    757         }
    758         if ( zExp <= 0 ) {
    759             isTiny =
    760                    ( float_detect_tininess == float_tininess_before_rounding )
    761                 || ( zExp < 0 )
    762                 || ! increment
    763                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
    764             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
    765             zExp = 0;
    766             if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
    767             if ( zSig1 ) float_set_inexact();
    768             if ( roundNearestEven ) {
    769                 increment = ( (sbits64) zSig1 < 0 );
    770             }
    771             else {
    772                 if ( zSign ) {
    773                     increment = ( roundingMode == float_round_down ) && zSig1;
    774                 }
    775                 else {
    776                     increment = ( roundingMode == float_round_up ) && zSig1;
    777                 }
    778             }
    779             if ( increment ) {
    780                 ++zSig0;
    781                 zSig0 &=
    782                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
    783                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
    784             }
    785             return packFloatx80( zSign, zExp, zSig0 );
    786         }
    787     }
    788     if ( zSig1 ) float_set_inexact();
    789     if ( increment ) {
    790         ++zSig0;
    791         if ( zSig0 == 0 ) {
    792             ++zExp;
    793             zSig0 = LIT64( 0x8000000000000000 );
    794         }
    795         else {
    796             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
    797         }
    798     }
    799     else {
    800         if ( zSig0 == 0 ) zExp = 0;
    801     }
    802     return packFloatx80( zSign, zExp, zSig0 );
    803 
    804 }
    805 
    806 /*----------------------------------------------------------------------------
    807 | Takes an abstract floating-point value having sign `zSign', exponent
    808 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
    809 | and returns the proper extended double-precision floating-point value
    810 | corresponding to the abstract input.  This routine is just like
    811 | `roundAndPackFloatx80' except that the input significand does not have to be
    812 | normalized.
    813 *----------------------------------------------------------------------------*/
    814 
    815 static floatx80
    816  normalizeRoundAndPackFloatx80(
    817      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
    818  )
    819 {
    820     int8 shiftCount;
    821 
    822     if ( zSig0 == 0 ) {
    823         zSig0 = zSig1;
    824         zSig1 = 0;
    825         zExp -= 64;
    826     }
    827     shiftCount = countLeadingZeros64( zSig0 );
    828     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    829     zExp -= shiftCount;
    830     return
    831         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
    832 
    833 }
    834 
    835 #endif
    836 
    837 #ifdef FLOAT128
    838 
    839 /*----------------------------------------------------------------------------
    840 | Returns the least-significant 64 fraction bits of the quadruple-precision
    841 | floating-point value `a'.
    842 *----------------------------------------------------------------------------*/
    843 
    844 INLINE bits64 extractFloat128Frac1( float128 a )
    845 {
    846 
    847     return a.low;
    848 
    849 }
    850 
    851 /*----------------------------------------------------------------------------
    852 | Returns the most-significant 48 fraction bits of the quadruple-precision
    853 | floating-point value `a'.
    854 *----------------------------------------------------------------------------*/
    855 
    856 INLINE bits64 extractFloat128Frac0( float128 a )
    857 {
    858 
    859     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
    860 
    861 }
    862 
    863 /*----------------------------------------------------------------------------
    864 | Returns the exponent bits of the quadruple-precision floating-point value
    865 | `a'.
    866 *----------------------------------------------------------------------------*/
    867 
    868 INLINE int32 extractFloat128Exp( float128 a )
    869 {
    870 
    871     return ( a.high>>48 ) & 0x7FFF;
    872 
    873 }
    874 
    875 /*----------------------------------------------------------------------------
    876 | Returns the sign bit of the quadruple-precision floating-point value `a'.
    877 *----------------------------------------------------------------------------*/
    878 
    879 INLINE flag extractFloat128Sign( float128 a )
    880 {
    881 
    882     return a.high>>63;
    883 
    884 }
    885 
    886 /*----------------------------------------------------------------------------
    887 | Normalizes the subnormal quadruple-precision floating-point value
    888 | represented by the denormalized significand formed by the concatenation of
    889 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
    890 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
    891 | significand are stored at the location pointed to by `zSig0Ptr', and the
    892 | least significant 64 bits of the normalized significand are stored at the
    893 | location pointed to by `zSig1Ptr'.
    894 *----------------------------------------------------------------------------*/
    895 
    896 static void
    897  normalizeFloat128Subnormal(
    898      bits64 aSig0,
    899      bits64 aSig1,
    900      int32 *zExpPtr,
    901      bits64 *zSig0Ptr,
    902      bits64 *zSig1Ptr
    903  )
    904 {
    905     int8 shiftCount;
    906 
    907     if ( aSig0 == 0 ) {
    908         shiftCount = countLeadingZeros64( aSig1 ) - 15;
    909         if ( shiftCount < 0 ) {
    910             *zSig0Ptr = aSig1>>( - shiftCount );
    911             *zSig1Ptr = aSig1<<( shiftCount & 63 );
    912         }
    913         else {
    914             *zSig0Ptr = aSig1<<shiftCount;
    915             *zSig1Ptr = 0;
    916         }
    917         *zExpPtr = - shiftCount - 63;
    918     }
    919     else {
    920         shiftCount = countLeadingZeros64( aSig0 ) - 15;
    921         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
    922         *zExpPtr = 1 - shiftCount;
    923     }
    924 
    925 }
    926 
    927 /*----------------------------------------------------------------------------
    928 | Packs the sign `zSign', the exponent `zExp', and the significand formed
    929 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
    930 | floating-point value, returning the result.  After being shifted into the
    931 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
    932 | added together to form the most significant 32 bits of the result.  This
    933 | means that any integer portion of `zSig0' will be added into the exponent.
    934 | Since a properly normalized significand will have an integer portion equal
    935 | to 1, the `zExp' input should be 1 less than the desired result exponent
    936 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
    937 | significand.
    938 *----------------------------------------------------------------------------*/
    939 
    940 INLINE float128
    941  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
    942 {
    943     float128 z;
    944 
    945     z.low = zSig1;
    946     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
    947     return z;
    948 
    949 }
    950 
    951 /*----------------------------------------------------------------------------
    952 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    953 | and extended significand formed by the concatenation of `zSig0', `zSig1',
    954 | and `zSig2', and returns the proper quadruple-precision floating-point value
    955 | corresponding to the abstract input.  Ordinarily, the abstract value is
    956 | simply rounded and packed into the quadruple-precision format, with the
    957 | inexact exception raised if the abstract input cannot be represented
    958 | exactly.  However, if the abstract value is too large, the overflow and
    959 | inexact exceptions are raised and an infinity or maximal finite value is
    960 | returned.  If the abstract value is too small, the input value is rounded to
    961 | a subnormal number, and the underflow and inexact exceptions are raised if
    962 | the abstract input cannot be represented exactly as a subnormal quadruple-
    963 | precision floating-point number.
    964 |     The input significand must be normalized or smaller.  If the input
    965 | significand is not normalized, `zExp' must be 0; in that case, the result
    966 | returned is a subnormal number, and it must not require rounding.  In the
    967 | usual case that the input significand is normalized, `zExp' must be 1 less
    968 | than the ``true'' floating-point exponent.  The handling of underflow and
    969 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
    970 *----------------------------------------------------------------------------*/
    971 
    972 static float128
    973  roundAndPackFloat128(
    974      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
    975 {
    976     int8 roundingMode;
    977     flag roundNearestEven, increment, isTiny;
    978 
    979     roundingMode = float_rounding_mode();
    980     roundNearestEven = ( roundingMode == float_round_nearest_even );
    981     increment = ( (sbits64) zSig2 < 0 );
    982     if ( ! roundNearestEven ) {
    983         if ( roundingMode == float_round_to_zero ) {
    984             increment = 0;
    985         }
    986         else {
    987             if ( zSign ) {
    988                 increment = ( roundingMode == float_round_down ) && zSig2;
    989             }
    990             else {
    991                 increment = ( roundingMode == float_round_up ) && zSig2;
    992             }
    993         }
    994     }
    995     if ( 0x7FFD <= (bits32) zExp ) {
    996         if (    ( 0x7FFD < zExp )
    997              || (    ( zExp == 0x7FFD )
    998                   && eq128(
    999                          LIT64( 0x0001FFFFFFFFFFFF ),
   1000                          LIT64( 0xFFFFFFFFFFFFFFFF ),
   1001                          zSig0,
   1002                          zSig1
   1003                      )
   1004                   && increment
   1005                 )
   1006            ) {
   1007             float_raise( float_flag_overflow | float_flag_inexact );
   1008             if (    ( roundingMode == float_round_to_zero )
   1009                  || ( zSign && ( roundingMode == float_round_up ) )
   1010                  || ( ! zSign && ( roundingMode == float_round_down ) )
   1011                ) {
   1012                 return
   1013                     packFloat128(
   1014                         zSign,
   1015                         0x7FFE,
   1016                         LIT64( 0x0000FFFFFFFFFFFF ),
   1017                         LIT64( 0xFFFFFFFFFFFFFFFF )
   1018                     );
   1019             }
   1020             return packFloat128( zSign, 0x7FFF, 0, 0 );
   1021         }
   1022         if ( zExp < 0 ) {
   1023             isTiny =
   1024                    ( float_detect_tininess == float_tininess_before_rounding )
   1025                 || ( zExp < -1 )
   1026                 || ! increment
   1027                 || lt128(
   1028                        zSig0,
   1029                        zSig1,
   1030                        LIT64( 0x0001FFFFFFFFFFFF ),
   1031                        LIT64( 0xFFFFFFFFFFFFFFFF )
   1032                    );
   1033             shift128ExtraRightJamming(
   1034                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
   1035             zExp = 0;
   1036             if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
   1037             if ( roundNearestEven ) {
   1038                 increment = ( (sbits64) zSig2 < 0 );
   1039             }
   1040             else {
   1041                 if ( zSign ) {
   1042                     increment = ( roundingMode == float_round_down ) && zSig2;
   1043                 }
   1044                 else {
   1045                     increment = ( roundingMode == float_round_up ) && zSig2;
   1046                 }
   1047             }
   1048         }
   1049     }
   1050     if ( zSig2 ) float_set_inexact();
   1051     if ( increment ) {
   1052         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
   1053         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
   1054     }
   1055     else {
   1056         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
   1057     }
   1058     return packFloat128( zSign, zExp, zSig0, zSig1 );
   1059 
   1060 }
   1061 
   1062 /*----------------------------------------------------------------------------
   1063 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
   1064 | and significand formed by the concatenation of `zSig0' and `zSig1', and
   1065 | returns the proper quadruple-precision floating-point value corresponding
   1066 | to the abstract input.  This routine is just like `roundAndPackFloat128'
   1067 | except that the input significand has fewer bits and does not have to be
   1068 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
   1069 | point exponent.
   1070 *----------------------------------------------------------------------------*/
   1071 
   1072 static float128
   1073  normalizeRoundAndPackFloat128(
   1074      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
   1075 {
   1076     int8 shiftCount;
   1077     bits64 zSig2;
   1078 
   1079     if ( zSig0 == 0 ) {
   1080         zSig0 = zSig1;
   1081         zSig1 = 0;
   1082         zExp -= 64;
   1083     }
   1084     shiftCount = countLeadingZeros64( zSig0 ) - 15;
   1085     if ( 0 <= shiftCount ) {
   1086         zSig2 = 0;
   1087         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
   1088     }
   1089     else {
   1090         shift128ExtraRightJamming(
   1091             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
   1092     }
   1093     zExp -= shiftCount;
   1094     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
   1095 
   1096 }
   1097 
   1098 #endif
   1099 
   1100 /*----------------------------------------------------------------------------
   1101 | Returns the result of converting the 32-bit two's complement integer `a'
   1102 | to the single-precision floating-point format.  The conversion is performed
   1103 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1104 *----------------------------------------------------------------------------*/
   1105 
   1106 float32 int32_to_float32( int32 a )
   1107 {
   1108     flag zSign;
   1109 
   1110     if ( a == 0 ) return 0;
   1111     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
   1112     zSign = ( a < 0 );
   1113     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
   1114 
   1115 }
   1116 
   1117 /*----------------------------------------------------------------------------
   1118 | Returns the result of converting the 32-bit two's complement integer `a'
   1119 | to the double-precision floating-point format.  The conversion is performed
   1120 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1121 *----------------------------------------------------------------------------*/
   1122 
   1123 float64 int32_to_float64( int32 a )
   1124 {
   1125     flag zSign;
   1126     uint32 absA;
   1127     int8 shiftCount;
   1128     bits64 zSig;
   1129 
   1130     if ( a == 0 ) return 0;
   1131     zSign = ( a < 0 );
   1132     absA = zSign ? - a : a;
   1133     shiftCount = countLeadingZeros32( absA ) + 21;
   1134     zSig = absA;
   1135     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
   1136 
   1137 }
   1138 
   1139 #ifdef FLOATX80
   1140 
   1141 /*----------------------------------------------------------------------------
   1142 | Returns the result of converting the 32-bit two's complement integer `a'
   1143 | to the extended double-precision floating-point format.  The conversion
   1144 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   1145 | Arithmetic.
   1146 *----------------------------------------------------------------------------*/
   1147 
   1148 floatx80 int32_to_floatx80( int32 a )
   1149 {
   1150     flag zSign;
   1151     uint32 absA;
   1152     int8 shiftCount;
   1153     bits64 zSig;
   1154 
   1155     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
   1156     zSign = ( a < 0 );
   1157     absA = zSign ? - a : a;
   1158     shiftCount = countLeadingZeros32( absA ) + 32;
   1159     zSig = absA;
   1160     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
   1161 
   1162 }
   1163 
   1164 #endif
   1165 
   1166 #ifdef FLOAT128
   1167 
   1168 /*----------------------------------------------------------------------------
   1169 | Returns the result of converting the 32-bit two's complement integer `a' to
   1170 | the quadruple-precision floating-point format.  The conversion is performed
   1171 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1172 *----------------------------------------------------------------------------*/
   1173 
   1174 float128 int32_to_float128( int32 a )
   1175 {
   1176     flag zSign;
   1177     uint32 absA;
   1178     int8 shiftCount;
   1179     bits64 zSig0;
   1180 
   1181     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
   1182     zSign = ( a < 0 );
   1183     absA = zSign ? - a : a;
   1184     shiftCount = countLeadingZeros32( absA ) + 17;
   1185     zSig0 = absA;
   1186     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
   1187 
   1188 }
   1189 
   1190 #endif
   1191 
   1192 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
   1193 /*----------------------------------------------------------------------------
   1194 | Returns the result of converting the 64-bit two's complement integer `a'
   1195 | to the single-precision floating-point format.  The conversion is performed
   1196 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1197 *----------------------------------------------------------------------------*/
   1198 
   1199 float32 int64_to_float32( int64 a )
   1200 {
   1201     flag zSign;
   1202     uint64 absA;
   1203     int8 shiftCount;
   1204 
   1205     if ( a == 0 ) return 0;
   1206     zSign = ( a < 0 );
   1207     absA = zSign ? - a : a;
   1208     shiftCount = countLeadingZeros64( absA ) - 40;
   1209     if ( 0 <= shiftCount ) {
   1210         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
   1211     }
   1212     else {
   1213         shiftCount += 7;
   1214         if ( shiftCount < 0 ) {
   1215             shift64RightJamming( absA, - shiftCount, &absA );
   1216         }
   1217         else {
   1218             absA <<= shiftCount;
   1219         }
   1220         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
   1221     }
   1222 
   1223 }
   1224 
   1225 /*----------------------------------------------------------------------------
   1226 | Returns the result of converting the 64-bit two's complement integer `a'
   1227 | to the double-precision floating-point format.  The conversion is performed
   1228 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1229 *----------------------------------------------------------------------------*/
   1230 
   1231 float64 int64_to_float64( int64 a )
   1232 {
   1233     flag zSign;
   1234 
   1235     if ( a == 0 ) return 0;
   1236     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
   1237         return packFloat64( 1, 0x43E, 0 );
   1238     }
   1239     zSign = ( a < 0 );
   1240     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
   1241 
   1242 }
   1243 
   1244 #ifdef FLOATX80
   1245 
   1246 /*----------------------------------------------------------------------------
   1247 | Returns the result of converting the 64-bit two's complement integer `a'
   1248 | to the extended double-precision floating-point format.  The conversion
   1249 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   1250 | Arithmetic.
   1251 *----------------------------------------------------------------------------*/
   1252 
   1253 floatx80 int64_to_floatx80( int64 a )
   1254 {
   1255     flag zSign;
   1256     uint64 absA;
   1257     int8 shiftCount;
   1258 
   1259     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
   1260     zSign = ( a < 0 );
   1261     absA = zSign ? - a : a;
   1262     shiftCount = countLeadingZeros64( absA );
   1263     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
   1264 
   1265 }
   1266 
   1267 #endif
   1268 
   1269 #ifdef FLOAT128
   1270 
   1271 /*----------------------------------------------------------------------------
   1272 | Returns the result of converting the 64-bit two's complement integer `a' to
   1273 | the quadruple-precision floating-point format.  The conversion is performed
   1274 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1275 *----------------------------------------------------------------------------*/
   1276 
   1277 float128 int64_to_float128( int64 a )
   1278 {
   1279     flag zSign;
   1280     uint64 absA;
   1281     int8 shiftCount;
   1282     int32 zExp;
   1283     bits64 zSig0, zSig1;
   1284 
   1285     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
   1286     zSign = ( a < 0 );
   1287     absA = zSign ? - a : a;
   1288     shiftCount = countLeadingZeros64( absA ) + 49;
   1289     zExp = 0x406E - shiftCount;
   1290     if ( 64 <= shiftCount ) {
   1291         zSig1 = 0;
   1292         zSig0 = absA;
   1293         shiftCount -= 64;
   1294     }
   1295     else {
   1296         zSig1 = absA;
   1297         zSig0 = 0;
   1298     }
   1299     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
   1300     return packFloat128( zSign, zExp, zSig0, zSig1 );
   1301 
   1302 }
   1303 
   1304 #endif
   1305 #endif /* !SOFTFLOAT_FOR_GCC */
   1306 
   1307 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
   1308 /*----------------------------------------------------------------------------
   1309 | Returns the result of converting the single-precision floating-point value
   1310 | `a' to the 32-bit two's complement integer format.  The conversion is
   1311 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   1312 | Arithmetic---which means in particular that the conversion is rounded
   1313 | according to the current rounding mode.  If `a' is a NaN, the largest
   1314 | positive integer is returned.  Otherwise, if the conversion overflows, the
   1315 | largest integer with the same sign as `a' is returned.
   1316 *----------------------------------------------------------------------------*/
   1317 
   1318 int32 float32_to_int32( float32 a )
   1319 {
   1320     flag aSign;
   1321     int16 aExp, shiftCount;
   1322     bits32 aSig;
   1323     bits64 aSig64;
   1324 
   1325     aSig = extractFloat32Frac( a );
   1326     aExp = extractFloat32Exp( a );
   1327     aSign = extractFloat32Sign( a );
   1328     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
   1329     if ( aExp ) aSig |= 0x00800000;
   1330     shiftCount = 0xAF - aExp;
   1331     aSig64 = aSig;
   1332     aSig64 <<= 32;
   1333     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
   1334     return roundAndPackInt32( aSign, aSig64 );
   1335 
   1336 }
   1337 #endif /* !SOFTFLOAT_FOR_GCC */
   1338 
   1339 /*----------------------------------------------------------------------------
   1340 | Returns the result of converting the single-precision floating-point value
   1341 | `a' to the 32-bit two's complement integer format.  The conversion is
   1342 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   1343 | Arithmetic, except that the conversion is always rounded toward zero.
   1344 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   1345 | the conversion overflows, the largest integer with the same sign as `a' is
   1346 | returned.
   1347 *----------------------------------------------------------------------------*/
   1348 
   1349 int32 float32_to_int32_round_to_zero( float32 a )
   1350 {
   1351     flag aSign;
   1352     int16 aExp, shiftCount;
   1353     bits32 aSig;
   1354     int32 z;
   1355 
   1356     aSig = extractFloat32Frac( a );
   1357     aExp = extractFloat32Exp( a );
   1358     aSign = extractFloat32Sign( a );
   1359     shiftCount = aExp - 0x9E;
   1360     if ( 0 <= shiftCount ) {
   1361         if ( a != 0xCF000000 ) {
   1362             float_raise( float_flag_invalid );
   1363             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
   1364         }
   1365         return (sbits32) 0x80000000;
   1366     }
   1367     else if ( aExp <= 0x7E ) {
   1368         if ( aExp | aSig ) float_set_inexact();
   1369         return 0;
   1370     }
   1371     aSig = ( aSig | 0x00800000 )<<8;
   1372     z = aSig>>( - shiftCount );
   1373     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
   1374         float_set_inexact();
   1375     }
   1376     if ( aSign ) z = - z;
   1377     return z;
   1378 
   1379 }
   1380 
   1381 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
   1382 /*----------------------------------------------------------------------------
   1383 | Returns the result of converting the single-precision floating-point value
   1384 | `a' to the 64-bit two's complement integer format.  The conversion is
   1385 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   1386 | Arithmetic---which means in particular that the conversion is rounded
   1387 | according to the current rounding mode.  If `a' is a NaN, the largest
   1388 | positive integer is returned.  Otherwise, if the conversion overflows, the
   1389 | largest integer with the same sign as `a' is returned.
   1390 *----------------------------------------------------------------------------*/
   1391 
   1392 int64 float32_to_int64( float32 a )
   1393 {
   1394     flag aSign;
   1395     int16 aExp, shiftCount;
   1396     bits32 aSig;
   1397     bits64 aSig64, aSigExtra;
   1398 
   1399     aSig = extractFloat32Frac( a );
   1400     aExp = extractFloat32Exp( a );
   1401     aSign = extractFloat32Sign( a );
   1402     shiftCount = 0xBE - aExp;
   1403     if ( shiftCount < 0 ) {
   1404         float_raise( float_flag_invalid );
   1405         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
   1406             return LIT64( 0x7FFFFFFFFFFFFFFF );
   1407         }
   1408         return (sbits64) LIT64( 0x8000000000000000 );
   1409     }
   1410     if ( aExp ) aSig |= 0x00800000;
   1411     aSig64 = aSig;
   1412     aSig64 <<= 40;
   1413     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
   1414     return roundAndPackInt64( aSign, aSig64, aSigExtra );
   1415 
   1416 }
   1417 
   1418 /*----------------------------------------------------------------------------
   1419 | Returns the result of converting the single-precision floating-point value
   1420 | `a' to the 64-bit two's complement integer format.  The conversion is
   1421 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   1422 | Arithmetic, except that the conversion is always rounded toward zero.  If
   1423 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
   1424 | conversion overflows, the largest integer with the same sign as `a' is
   1425 | returned.
   1426 *----------------------------------------------------------------------------*/
   1427 
   1428 int64 float32_to_int64_round_to_zero( float32 a )
   1429 {
   1430     flag aSign;
   1431     int16 aExp, shiftCount;
   1432     bits32 aSig;
   1433     bits64 aSig64;
   1434     int64 z;
   1435 
   1436     aSig = extractFloat32Frac( a );
   1437     aExp = extractFloat32Exp( a );
   1438     aSign = extractFloat32Sign( a );
   1439     shiftCount = aExp - 0xBE;
   1440     if ( 0 <= shiftCount ) {
   1441         if ( a != 0xDF000000 ) {
   1442             float_raise( float_flag_invalid );
   1443             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
   1444                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   1445             }
   1446         }
   1447         return (sbits64) LIT64( 0x8000000000000000 );
   1448     }
   1449     else if ( aExp <= 0x7E ) {
   1450         if ( aExp | aSig ) float_set_inexact();
   1451         return 0;
   1452     }
   1453     aSig64 = aSig | 0x00800000;
   1454     aSig64 <<= 40;
   1455     z = aSig64>>( - shiftCount );
   1456     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
   1457         float_set_inexact();
   1458     }
   1459     if ( aSign ) z = - z;
   1460     return z;
   1461 
   1462 }
   1463 #endif /* !SOFTFLOAT_FOR_GCC */
   1464 
   1465 /*----------------------------------------------------------------------------
   1466 | Returns the result of converting the single-precision floating-point value
   1467 | `a' to the double-precision floating-point format.  The conversion is
   1468 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   1469 | Arithmetic.
   1470 *----------------------------------------------------------------------------*/
   1471 
   1472 float64 float32_to_float64( float32 a )
   1473 {
   1474     flag aSign;
   1475     int16 aExp;
   1476     bits32 aSig;
   1477 
   1478     aSig = extractFloat32Frac( a );
   1479     aExp = extractFloat32Exp( a );
   1480     aSign = extractFloat32Sign( a );
   1481     if ( aExp == 0xFF ) {
   1482         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
   1483         return packFloat64( aSign, 0x7FF, 0 );
   1484     }
   1485     if ( aExp == 0 ) {
   1486         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
   1487         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1488         --aExp;
   1489     }
   1490     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
   1491 
   1492 }
   1493 
   1494 #ifdef FLOATX80
   1495 
   1496 /*----------------------------------------------------------------------------
   1497 | Returns the result of converting the single-precision floating-point value
   1498 | `a' to the extended double-precision floating-point format.  The conversion
   1499 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   1500 | Arithmetic.
   1501 *----------------------------------------------------------------------------*/
   1502 
   1503 floatx80 float32_to_floatx80( float32 a )
   1504 {
   1505     flag aSign;
   1506     int16 aExp;
   1507     bits32 aSig;
   1508 
   1509     aSig = extractFloat32Frac( a );
   1510     aExp = extractFloat32Exp( a );
   1511     aSign = extractFloat32Sign( a );
   1512     if ( aExp == 0xFF ) {
   1513         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
   1514         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   1515     }
   1516     if ( aExp == 0 ) {
   1517         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
   1518         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1519     }
   1520     aSig |= 0x00800000;
   1521     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
   1522 
   1523 }
   1524 
   1525 #endif
   1526 
   1527 #ifdef FLOAT128
   1528 
   1529 /*----------------------------------------------------------------------------
   1530 | Returns the result of converting the single-precision floating-point value
   1531 | `a' to the double-precision floating-point format.  The conversion is
   1532 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   1533 | Arithmetic.
   1534 *----------------------------------------------------------------------------*/
   1535 
   1536 float128 float32_to_float128( float32 a )
   1537 {
   1538     flag aSign;
   1539     int16 aExp;
   1540     bits32 aSig;
   1541 
   1542     aSig = extractFloat32Frac( a );
   1543     aExp = extractFloat32Exp( a );
   1544     aSign = extractFloat32Sign( a );
   1545     if ( aExp == 0xFF ) {
   1546         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
   1547         return packFloat128( aSign, 0x7FFF, 0, 0 );
   1548     }
   1549     if ( aExp == 0 ) {
   1550         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
   1551         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1552         --aExp;
   1553     }
   1554     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
   1555 
   1556 }
   1557 
   1558 #endif
   1559 
   1560 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
   1561 /*----------------------------------------------------------------------------
   1562 | Rounds the single-precision floating-point value `a' to an integer, and
   1563 | returns the result as a single-precision floating-point value.  The
   1564 | operation is performed according to the IEC/IEEE Standard for Binary
   1565 | Floating-Point Arithmetic.
   1566 *----------------------------------------------------------------------------*/
   1567 
   1568 float32 float32_round_to_int( float32 a )
   1569 {
   1570     flag aSign;
   1571     int16 aExp;
   1572     bits32 lastBitMask, roundBitsMask;
   1573     int8 roundingMode;
   1574     float32 z;
   1575 
   1576     aExp = extractFloat32Exp( a );
   1577     if ( 0x96 <= aExp ) {
   1578         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
   1579             return propagateFloat32NaN( a, a );
   1580         }
   1581         return a;
   1582     }
   1583     if ( aExp <= 0x7E ) {
   1584         if ( (bits32) ( a<<1 ) == 0 ) return a;
   1585         float_set_inexact();
   1586         aSign = extractFloat32Sign( a );
   1587         switch ( float_rounding_mode() ) {
   1588          case float_round_nearest_even:
   1589             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
   1590                 return packFloat32( aSign, 0x7F, 0 );
   1591             }
   1592             break;
   1593          case float_round_down:
   1594             return aSign ? 0xBF800000 : 0;
   1595          case float_round_up:
   1596             return aSign ? 0x80000000 : 0x3F800000;
   1597         }
   1598         return packFloat32( aSign, 0, 0 );
   1599     }
   1600     lastBitMask = 1;
   1601     lastBitMask <<= 0x96 - aExp;
   1602     roundBitsMask = lastBitMask - 1;
   1603     z = a;
   1604     roundingMode = float_rounding_mode();
   1605     if ( roundingMode == float_round_nearest_even ) {
   1606         z += lastBitMask>>1;
   1607         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
   1608     }
   1609     else if ( roundingMode != float_round_to_zero ) {
   1610         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
   1611             z += roundBitsMask;
   1612         }
   1613     }
   1614     z &= ~ roundBitsMask;
   1615     if ( z != a ) float_set_inexact();
   1616     return z;
   1617 
   1618 }
   1619 #endif /* !SOFTFLOAT_FOR_GCC */
   1620 
   1621 /*----------------------------------------------------------------------------
   1622 | Returns the result of adding the absolute values of the single-precision
   1623 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
   1624 | before being returned.  `zSign' is ignored if the result is a NaN.
   1625 | The addition is performed according to the IEC/IEEE Standard for Binary
   1626 | Floating-Point Arithmetic.
   1627 *----------------------------------------------------------------------------*/
   1628 
   1629 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
   1630 {
   1631     int16 aExp, bExp, zExp;
   1632     bits32 aSig, bSig, zSig;
   1633     int16 expDiff;
   1634 
   1635     aSig = extractFloat32Frac( a );
   1636     aExp = extractFloat32Exp( a );
   1637     bSig = extractFloat32Frac( b );
   1638     bExp = extractFloat32Exp( b );
   1639     expDiff = aExp - bExp;
   1640     aSig <<= 6;
   1641     bSig <<= 6;
   1642     if ( 0 < expDiff ) {
   1643         if ( aExp == 0xFF ) {
   1644             if ( aSig ) return propagateFloat32NaN( a, b );
   1645             return a;
   1646         }
   1647         if ( bExp == 0 ) {
   1648             --expDiff;
   1649         }
   1650         else {
   1651             bSig |= 0x20000000;
   1652         }
   1653         shift32RightJamming( bSig, expDiff, &bSig );
   1654         zExp = aExp;
   1655     }
   1656     else if ( expDiff < 0 ) {
   1657         if ( bExp == 0xFF ) {
   1658             if ( bSig ) return propagateFloat32NaN( a, b );
   1659             return packFloat32( zSign, 0xFF, 0 );
   1660         }
   1661         if ( aExp == 0 ) {
   1662             ++expDiff;
   1663         }
   1664         else {
   1665             aSig |= 0x20000000;
   1666         }
   1667         shift32RightJamming( aSig, - expDiff, &aSig );
   1668         zExp = bExp;
   1669     }
   1670     else {
   1671         if ( aExp == 0xFF ) {
   1672             if ( aSig | bSig ) return propagateFloat32NaN( a, b );
   1673             return a;
   1674         }
   1675         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
   1676         zSig = 0x40000000 + aSig + bSig;
   1677         zExp = aExp;
   1678         goto roundAndPack;
   1679     }
   1680     aSig |= 0x20000000;
   1681     zSig = ( aSig + bSig )<<1;
   1682     --zExp;
   1683     if ( (sbits32) zSig < 0 ) {
   1684         zSig = aSig + bSig;
   1685         ++zExp;
   1686     }
   1687  roundAndPack:
   1688     return roundAndPackFloat32( zSign, zExp, zSig );
   1689 
   1690 }
   1691 
   1692 /*----------------------------------------------------------------------------
   1693 | Returns the result of subtracting the absolute values of the single-
   1694 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
   1695 | difference is negated before being returned.  `zSign' is ignored if the
   1696 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
   1697 | Standard for Binary Floating-Point Arithmetic.
   1698 *----------------------------------------------------------------------------*/
   1699 
   1700 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
   1701 {
   1702     int16 aExp, bExp, zExp;
   1703     bits32 aSig, bSig, zSig;
   1704     int16 expDiff;
   1705 
   1706     aSig = extractFloat32Frac( a );
   1707     aExp = extractFloat32Exp( a );
   1708     bSig = extractFloat32Frac( b );
   1709     bExp = extractFloat32Exp( b );
   1710     expDiff = aExp - bExp;
   1711     aSig <<= 7;
   1712     bSig <<= 7;
   1713     if ( 0 < expDiff ) goto aExpBigger;
   1714     if ( expDiff < 0 ) goto bExpBigger;
   1715     if ( aExp == 0xFF ) {
   1716         if ( aSig | bSig ) return propagateFloat32NaN( a, b );
   1717         float_raise( float_flag_invalid );
   1718         return float32_default_nan;
   1719     }
   1720     if ( aExp == 0 ) {
   1721         aExp = 1;
   1722         bExp = 1;
   1723     }
   1724     if ( bSig < aSig ) goto aBigger;
   1725     if ( aSig < bSig ) goto bBigger;
   1726     return packFloat32( float_rounding_mode() == float_round_down, 0, 0 );
   1727  bExpBigger:
   1728     if ( bExp == 0xFF ) {
   1729         if ( bSig ) return propagateFloat32NaN( a, b );
   1730         return packFloat32( zSign ^ 1, 0xFF, 0 );
   1731     }
   1732     if ( aExp == 0 ) {
   1733         ++expDiff;
   1734     }
   1735     else {
   1736         aSig |= 0x40000000;
   1737     }
   1738     shift32RightJamming( aSig, - expDiff, &aSig );
   1739     bSig |= 0x40000000;
   1740  bBigger:
   1741     zSig = bSig - aSig;
   1742     zExp = bExp;
   1743     zSign ^= 1;
   1744     goto normalizeRoundAndPack;
   1745  aExpBigger:
   1746     if ( aExp == 0xFF ) {
   1747         if ( aSig ) return propagateFloat32NaN( a, b );
   1748         return a;
   1749     }
   1750     if ( bExp == 0 ) {
   1751         --expDiff;
   1752     }
   1753     else {
   1754         bSig |= 0x40000000;
   1755     }
   1756     shift32RightJamming( bSig, expDiff, &bSig );
   1757     aSig |= 0x40000000;
   1758  aBigger:
   1759     zSig = aSig - bSig;
   1760     zExp = aExp;
   1761  normalizeRoundAndPack:
   1762     --zExp;
   1763     return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
   1764 
   1765 }
   1766 
   1767 /*----------------------------------------------------------------------------
   1768 | Returns the result of adding the single-precision floating-point values `a'
   1769 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
   1770 | Binary Floating-Point Arithmetic.
   1771 *----------------------------------------------------------------------------*/
   1772 
   1773 float32 float32_add( float32 a, float32 b )
   1774 {
   1775     flag aSign, bSign;
   1776 
   1777     aSign = extractFloat32Sign( a );
   1778     bSign = extractFloat32Sign( b );
   1779     if ( aSign == bSign ) {
   1780         return addFloat32Sigs( a, b, aSign );
   1781     }
   1782     else {
   1783         return subFloat32Sigs( a, b, aSign );
   1784     }
   1785 
   1786 }
   1787 
   1788 /*----------------------------------------------------------------------------
   1789 | Returns the result of subtracting the single-precision floating-point values
   1790 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   1791 | for Binary Floating-Point Arithmetic.
   1792 *----------------------------------------------------------------------------*/
   1793 
   1794 float32 float32_sub( float32 a, float32 b )
   1795 {
   1796     flag aSign, bSign;
   1797 
   1798     aSign = extractFloat32Sign( a );
   1799     bSign = extractFloat32Sign( b );
   1800     if ( aSign == bSign ) {
   1801         return subFloat32Sigs( a, b, aSign );
   1802     }
   1803     else {
   1804         return addFloat32Sigs( a, b, aSign );
   1805     }
   1806 
   1807 }
   1808 
   1809 /*----------------------------------------------------------------------------
   1810 | Returns the result of multiplying the single-precision floating-point values
   1811 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   1812 | for Binary Floating-Point Arithmetic.
   1813 *----------------------------------------------------------------------------*/
   1814 
   1815 float32 float32_mul( float32 a, float32 b )
   1816 {
   1817     flag aSign, bSign, zSign;
   1818     int16 aExp, bExp, zExp;
   1819     bits32 aSig, bSig;
   1820     bits64 zSig64;
   1821     bits32 zSig;
   1822 
   1823     aSig = extractFloat32Frac( a );
   1824     aExp = extractFloat32Exp( a );
   1825     aSign = extractFloat32Sign( a );
   1826     bSig = extractFloat32Frac( b );
   1827     bExp = extractFloat32Exp( b );
   1828     bSign = extractFloat32Sign( b );
   1829     zSign = aSign ^ bSign;
   1830     if ( aExp == 0xFF ) {
   1831         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
   1832             return propagateFloat32NaN( a, b );
   1833         }
   1834         if ( ( bExp | bSig ) == 0 ) {
   1835             float_raise( float_flag_invalid );
   1836             return float32_default_nan;
   1837         }
   1838         return packFloat32( zSign, 0xFF, 0 );
   1839     }
   1840     if ( bExp == 0xFF ) {
   1841         if ( bSig ) return propagateFloat32NaN( a, b );
   1842         if ( ( aExp | aSig ) == 0 ) {
   1843             float_raise( float_flag_invalid );
   1844             return float32_default_nan;
   1845         }
   1846         return packFloat32( zSign, 0xFF, 0 );
   1847     }
   1848     if ( aExp == 0 ) {
   1849         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
   1850         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1851     }
   1852     if ( bExp == 0 ) {
   1853         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
   1854         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
   1855     }
   1856     zExp = aExp + bExp - 0x7F;
   1857     aSig = ( aSig | 0x00800000 )<<7;
   1858     bSig = ( bSig | 0x00800000 )<<8;
   1859     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
   1860     zSig = zSig64;
   1861     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
   1862         zSig <<= 1;
   1863         --zExp;
   1864     }
   1865     return roundAndPackFloat32( zSign, zExp, zSig );
   1866 
   1867 }
   1868 
   1869 /*----------------------------------------------------------------------------
   1870 | Returns the result of dividing the single-precision floating-point value `a'
   1871 | by the corresponding value `b'.  The operation is performed according to the
   1872 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1873 *----------------------------------------------------------------------------*/
   1874 
   1875 float32 float32_div( float32 a, float32 b )
   1876 {
   1877     flag aSign, bSign, zSign;
   1878     int16 aExp, bExp, zExp;
   1879     bits32 aSig, bSig, zSig;
   1880 
   1881     aSig = extractFloat32Frac( a );
   1882     aExp = extractFloat32Exp( a );
   1883     aSign = extractFloat32Sign( a );
   1884     bSig = extractFloat32Frac( b );
   1885     bExp = extractFloat32Exp( b );
   1886     bSign = extractFloat32Sign( b );
   1887     zSign = aSign ^ bSign;
   1888     if ( aExp == 0xFF ) {
   1889         if ( aSig ) return propagateFloat32NaN( a, b );
   1890         if ( bExp == 0xFF ) {
   1891             if ( bSig ) return propagateFloat32NaN( a, b );
   1892             float_raise( float_flag_invalid );
   1893             return float32_default_nan;
   1894         }
   1895         return packFloat32( zSign, 0xFF, 0 );
   1896     }
   1897     if ( bExp == 0xFF ) {
   1898         if ( bSig ) return propagateFloat32NaN( a, b );
   1899         return packFloat32( zSign, 0, 0 );
   1900     }
   1901     if ( bExp == 0 ) {
   1902         if ( bSig == 0 ) {
   1903             if ( ( aExp | aSig ) == 0 ) {
   1904                 float_raise( float_flag_invalid );
   1905                 return float32_default_nan;
   1906             }
   1907             float_raise( float_flag_divbyzero );
   1908             return packFloat32( zSign, 0xFF, 0 );
   1909         }
   1910         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
   1911     }
   1912     if ( aExp == 0 ) {
   1913         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
   1914         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1915     }
   1916     zExp = aExp - bExp + 0x7D;
   1917     aSig = ( aSig | 0x00800000 )<<7;
   1918     bSig = ( bSig | 0x00800000 )<<8;
   1919     if ( bSig <= ( aSig + aSig ) ) {
   1920         aSig >>= 1;
   1921         ++zExp;
   1922     }
   1923     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
   1924     if ( ( zSig & 0x3F ) == 0 ) {
   1925         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
   1926     }
   1927     return roundAndPackFloat32( zSign, zExp, zSig );
   1928 
   1929 }
   1930 
   1931 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
   1932 /*----------------------------------------------------------------------------
   1933 | Returns the remainder of the single-precision floating-point value `a'
   1934 | with respect to the corresponding value `b'.  The operation is performed
   1935 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1936 *----------------------------------------------------------------------------*/
   1937 
   1938 float32 float32_rem( float32 a, float32 b )
   1939 {
   1940     flag aSign, bSign __unused, zSign;
   1941     int16 aExp, bExp, expDiff;
   1942     bits32 aSig, bSig;
   1943     bits32 q;
   1944     bits64 aSig64, bSig64, q64;
   1945     bits32 alternateASig;
   1946     sbits32 sigMean;
   1947 
   1948     aSig = extractFloat32Frac( a );
   1949     aExp = extractFloat32Exp( a );
   1950     aSign = extractFloat32Sign( a );
   1951     bSig = extractFloat32Frac( b );
   1952     bExp = extractFloat32Exp( b );
   1953     bSign = extractFloat32Sign( b );
   1954     if ( aExp == 0xFF ) {
   1955         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
   1956             return propagateFloat32NaN( a, b );
   1957         }
   1958         float_raise( float_flag_invalid );
   1959         return float32_default_nan;
   1960     }
   1961     if ( bExp == 0xFF ) {
   1962         if ( bSig ) return propagateFloat32NaN( a, b );
   1963         return a;
   1964     }
   1965     if ( bExp == 0 ) {
   1966         if ( bSig == 0 ) {
   1967             float_raise( float_flag_invalid );
   1968             return float32_default_nan;
   1969         }
   1970         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
   1971     }
   1972     if ( aExp == 0 ) {
   1973         if ( aSig == 0 ) return a;
   1974         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1975     }
   1976     expDiff = aExp - bExp;
   1977     aSig |= 0x00800000;
   1978     bSig |= 0x00800000;
   1979     if ( expDiff < 32 ) {
   1980         aSig <<= 8;
   1981         bSig <<= 8;
   1982         if ( expDiff < 0 ) {
   1983             if ( expDiff < -1 ) return a;
   1984             aSig >>= 1;
   1985         }
   1986         q = ( bSig <= aSig );
   1987         if ( q ) aSig -= bSig;
   1988         if ( 0 < expDiff ) {
   1989             q = ( ( (bits64) aSig )<<32 ) / bSig;
   1990             q >>= 32 - expDiff;
   1991             bSig >>= 2;
   1992             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
   1993         }
   1994         else {
   1995             aSig >>= 2;
   1996             bSig >>= 2;
   1997         }
   1998     }
   1999     else {
   2000         if ( bSig <= aSig ) aSig -= bSig;
   2001         aSig64 = ( (bits64) aSig )<<40;
   2002         bSig64 = ( (bits64) bSig )<<40;
   2003         expDiff -= 64;
   2004         while ( 0 < expDiff ) {
   2005             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
   2006             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
   2007             aSig64 = - ( ( bSig * q64 )<<38 );
   2008             expDiff -= 62;
   2009         }
   2010         expDiff += 64;
   2011         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
   2012         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
   2013         q = q64>>( 64 - expDiff );
   2014         bSig <<= 6;
   2015         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
   2016     }
   2017     do {
   2018         alternateASig = aSig;
   2019         ++q;
   2020         aSig -= bSig;
   2021     } while ( 0 <= (sbits32) aSig );
   2022     sigMean = aSig + alternateASig;
   2023     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
   2024         aSig = alternateASig;
   2025     }
   2026     zSign = ( (sbits32) aSig < 0 );
   2027     if ( zSign ) aSig = - aSig;
   2028     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
   2029 
   2030 }
   2031 #endif /* !SOFTFLOAT_FOR_GCC */
   2032 
   2033 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
   2034 
   2035 /*----------------------------------------------------------------------------
   2036 | Returns the square root of the single-precision floating-point value `a'.
   2037 | The operation is performed according to the IEC/IEEE Standard for Binary
   2038 | Floating-Point Arithmetic.
   2039 *----------------------------------------------------------------------------*/
   2040 
   2041 float32 float32_sqrt( float32 a )
   2042 {
   2043     flag aSign;
   2044     int16 aExp, zExp;
   2045     bits32 aSig, zSig;
   2046     bits64 rem, term;
   2047 
   2048     aSig = extractFloat32Frac( a );
   2049     aExp = extractFloat32Exp( a );
   2050     aSign = extractFloat32Sign( a );
   2051     if ( aExp == 0xFF ) {
   2052         if ( aSig ) return propagateFloat32NaN( a, 0 );
   2053         if ( ! aSign ) return a;
   2054         float_raise( float_flag_invalid );
   2055         return float32_default_nan;
   2056     }
   2057     if ( aSign ) {
   2058         if ( ( aExp | aSig ) == 0 ) return a;
   2059         float_raise( float_flag_invalid );
   2060         return float32_default_nan;
   2061     }
   2062     if ( aExp == 0 ) {
   2063         if ( aSig == 0 ) return 0;
   2064         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   2065     }
   2066     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
   2067     aSig = ( aSig | 0x00800000 )<<8;
   2068     zSig = estimateSqrt32( aExp, aSig ) + 2;
   2069     if ( ( zSig & 0x7F ) <= 5 ) {
   2070         if ( zSig < 2 ) {
   2071             zSig = 0x7FFFFFFF;
   2072             goto roundAndPack;
   2073         }
   2074         aSig >>= aExp & 1;
   2075         term = ( (bits64) zSig ) * zSig;
   2076         rem = ( ( (bits64) aSig )<<32 ) - term;
   2077         while ( (sbits64) rem < 0 ) {
   2078             --zSig;
   2079             rem += ( ( (bits64) zSig )<<1 ) | 1;
   2080         }
   2081         zSig |= ( rem != 0 );
   2082     }
   2083     shift32RightJamming( zSig, 1, &zSig );
   2084  roundAndPack:
   2085     return roundAndPackFloat32( 0, zExp, zSig );
   2086 
   2087 }
   2088 #endif /* !SOFTFLOAT_FOR_GCC */
   2089 
   2090 /*----------------------------------------------------------------------------
   2091 | Returns 1 if the single-precision floating-point value `a' is equal to
   2092 | the corresponding value `b', and 0 otherwise.  The comparison is performed
   2093 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2094 *----------------------------------------------------------------------------*/
   2095 
   2096 flag float32_eq( float32 a, float32 b )
   2097 {
   2098 
   2099     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2100          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2101        ) {
   2102         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
   2103             float_raise( float_flag_invalid );
   2104         }
   2105         return 0;
   2106     }
   2107     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
   2108 
   2109 }
   2110 
   2111 /*----------------------------------------------------------------------------
   2112 | Returns 1 if the single-precision floating-point value `a' is less than
   2113 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
   2114 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   2115 | Arithmetic.
   2116 *----------------------------------------------------------------------------*/
   2117 
   2118 flag float32_le( float32 a, float32 b )
   2119 {
   2120     flag aSign, bSign;
   2121 
   2122     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2123          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2124        ) {
   2125         float_raise( float_flag_invalid );
   2126         return 0;
   2127     }
   2128     aSign = extractFloat32Sign( a );
   2129     bSign = extractFloat32Sign( b );
   2130     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
   2131     return ( a == b ) || ( aSign ^ ( a < b ) );
   2132 
   2133 }
   2134 
   2135 /*----------------------------------------------------------------------------
   2136 | Returns 1 if the single-precision floating-point value `a' is less than
   2137 | the corresponding value `b', and 0 otherwise.  The comparison is performed
   2138 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2139 *----------------------------------------------------------------------------*/
   2140 
   2141 flag float32_lt( float32 a, float32 b )
   2142 {
   2143     flag aSign, bSign;
   2144 
   2145     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2146          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2147        ) {
   2148         float_raise( float_flag_invalid );
   2149         return 0;
   2150     }
   2151     aSign = extractFloat32Sign( a );
   2152     bSign = extractFloat32Sign( b );
   2153     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
   2154     return ( a != b ) && ( aSign ^ ( a < b ) );
   2155 
   2156 }
   2157 
   2158 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
   2159 /*----------------------------------------------------------------------------
   2160 | Returns 1 if the single-precision floating-point value `a' is equal to
   2161 | the corresponding value `b', and 0 otherwise.  The invalid exception is
   2162 | raised if either operand is a NaN.  Otherwise, the comparison is performed
   2163 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2164 *----------------------------------------------------------------------------*/
   2165 
   2166 flag float32_eq_signaling( float32 a, float32 b )
   2167 {
   2168 
   2169     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2170          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2171        ) {
   2172         float_raise( float_flag_invalid );
   2173         return 0;
   2174     }
   2175     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
   2176 
   2177 }
   2178 
   2179 /*----------------------------------------------------------------------------
   2180 | Returns 1 if the single-precision floating-point value `a' is less than or
   2181 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
   2182 | cause an exception.  Otherwise, the comparison is performed according to the
   2183 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2184 *----------------------------------------------------------------------------*/
   2185 
   2186 flag float32_le_quiet( float32 a, float32 b )
   2187 {
   2188     flag aSign, bSign;
   2189 
   2190     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2191          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2192        ) {
   2193         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
   2194             float_raise( float_flag_invalid );
   2195         }
   2196         return 0;
   2197     }
   2198     aSign = extractFloat32Sign( a );
   2199     bSign = extractFloat32Sign( b );
   2200     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
   2201     return ( a == b ) || ( aSign ^ ( a < b ) );
   2202 
   2203 }
   2204 
   2205 /*----------------------------------------------------------------------------
   2206 | Returns 1 if the single-precision floating-point value `a' is less than
   2207 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
   2208 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
   2209 | Standard for Binary Floating-Point Arithmetic.
   2210 *----------------------------------------------------------------------------*/
   2211 
   2212 flag float32_lt_quiet( float32 a, float32 b )
   2213 {
   2214     flag aSign, bSign;
   2215 
   2216     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2217          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2218        ) {
   2219         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
   2220             float_raise( float_flag_invalid );
   2221         }
   2222         return 0;
   2223     }
   2224     aSign = extractFloat32Sign( a );
   2225     bSign = extractFloat32Sign( b );
   2226     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
   2227     return ( a != b ) && ( aSign ^ ( a < b ) );
   2228 
   2229 }
   2230 #endif /* !SOFTFLOAT_FOR_GCC */
   2231 
   2232 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
   2233 /*----------------------------------------------------------------------------
   2234 | Returns the result of converting the double-precision floating-point value
   2235 | `a' to the 32-bit two's complement integer format.  The conversion is
   2236 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   2237 | Arithmetic---which means in particular that the conversion is rounded
   2238 | according to the current rounding mode.  If `a' is a NaN, the largest
   2239 | positive integer is returned.  Otherwise, if the conversion overflows, the
   2240 | largest integer with the same sign as `a' is returned.
   2241 *----------------------------------------------------------------------------*/
   2242 
   2243 int32 float64_to_int32( float64 a )
   2244 {
   2245     flag aSign;
   2246     int16 aExp, shiftCount;
   2247     bits64 aSig;
   2248 
   2249     aSig = extractFloat64Frac( a );
   2250     aExp = extractFloat64Exp( a );
   2251     aSign = extractFloat64Sign( a );
   2252     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
   2253     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
   2254     shiftCount = 0x42C - aExp;
   2255     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
   2256     return roundAndPackInt32( aSign, aSig );
   2257 
   2258 }
   2259 #endif /* !SOFTFLOAT_FOR_GCC */
   2260 
   2261 /*----------------------------------------------------------------------------
   2262 | Returns the result of converting the double-precision floating-point value
   2263 | `a' to the 32-bit two's complement integer format.  The conversion is
   2264 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   2265 | Arithmetic, except that the conversion is always rounded toward zero.
   2266 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   2267 | the conversion overflows, the largest integer with the same sign as `a' is
   2268 | returned.
   2269 *----------------------------------------------------------------------------*/
   2270 
   2271 int32 float64_to_int32_round_to_zero( float64 a )
   2272 {
   2273     flag aSign;
   2274     int16 aExp, shiftCount;
   2275     bits64 aSig, savedASig;
   2276     int32 z;
   2277 
   2278     aSig = extractFloat64Frac( a );
   2279     aExp = extractFloat64Exp( a );
   2280     aSign = extractFloat64Sign( a );
   2281     if ( 0x41E < aExp ) {
   2282         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
   2283         goto invalid;
   2284     }
   2285     else if ( aExp < 0x3FF ) {
   2286         if ( aExp || aSig ) float_set_inexact();
   2287         return 0;
   2288     }
   2289     aSig |= LIT64( 0x0010000000000000 );
   2290     shiftCount = 0x433 - aExp;
   2291     savedASig = aSig;
   2292     aSig >>= shiftCount;
   2293     z = aSig;
   2294     if ( aSign ) z = - z;
   2295     if ( ( z < 0 ) ^ aSign ) {
   2296  invalid:
   2297         float_raise( float_flag_invalid );
   2298         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
   2299     }
   2300     if ( ( aSig<<shiftCount ) != savedASig ) {
   2301         float_set_inexact();
   2302     }
   2303     return z;
   2304 
   2305 }
   2306 
   2307 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
   2308 /*----------------------------------------------------------------------------
   2309 | Returns the result of converting the double-precision floating-point value
   2310 | `a' to the 64-bit two's complement integer format.  The conversion is
   2311 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   2312 | Arithmetic---which means in particular that the conversion is rounded
   2313 | according to the current rounding mode.  If `a' is a NaN, the largest
   2314 | positive integer is returned.  Otherwise, if the conversion overflows, the
   2315 | largest integer with the same sign as `a' is returned.
   2316 *----------------------------------------------------------------------------*/
   2317 
   2318 int64 float64_to_int64( float64 a )
   2319 {
   2320     flag aSign;
   2321     int16 aExp, shiftCount;
   2322     bits64 aSig, aSigExtra;
   2323 
   2324     aSig = extractFloat64Frac( a );
   2325     aExp = extractFloat64Exp( a );
   2326     aSign = extractFloat64Sign( a );
   2327     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
   2328     shiftCount = 0x433 - aExp;
   2329     if ( shiftCount <= 0 ) {
   2330         if ( 0x43E < aExp ) {
   2331             float_raise( float_flag_invalid );
   2332             if (    ! aSign
   2333                  || (    ( aExp == 0x7FF )
   2334                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
   2335                ) {
   2336                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   2337             }
   2338             return (sbits64) LIT64( 0x8000000000000000 );
   2339         }
   2340         aSigExtra = 0;
   2341         aSig <<= - shiftCount;
   2342     }
   2343     else {
   2344         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
   2345     }
   2346     return roundAndPackInt64( aSign, aSig, aSigExtra );
   2347 
   2348 }
   2349 
   2350 /* like above, but result is unsigned */
   2351 uint64 float64_to_uint64( float64 a )
   2352 {
   2353     flag aSign;
   2354     int16 aExp, shiftCount;
   2355     bits64 aSig, aSigExtra;
   2356 
   2357     aSig = extractFloat64Frac( a );
   2358     aExp = extractFloat64Exp( a );
   2359     aSign = extractFloat64Sign( a );
   2360 
   2361     if (aSign) {
   2362 	return float64_to_int64(a);
   2363     }
   2364 
   2365     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
   2366     shiftCount = 0x433 - aExp;
   2367     if ( shiftCount <= 0 ) {
   2368         if ( 0x43E < aExp ) {
   2369             float_raise( float_flag_invalid );
   2370             if (    ! aSign
   2371                  || (    ( aExp == 0x7FF )
   2372                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
   2373                ) {
   2374                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   2375             }
   2376             return (sbits64) LIT64( 0x8000000000000000 );
   2377         }
   2378         aSigExtra = 0;
   2379         aSig <<= - shiftCount;
   2380     }
   2381     else {
   2382         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
   2383     }
   2384     return roundAndPackUInt64( aSig, aSigExtra );
   2385 
   2386 }
   2387 
   2388 /*----------------------------------------------------------------------------
   2389 | Returns the result of converting the double-precision floating-point value
   2390 | `a' to the 64-bit two's complement integer format.  The conversion is
   2391 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   2392 | Arithmetic, except that the conversion is always rounded toward zero.
   2393 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   2394 | the conversion overflows, the largest integer with the same sign as `a' is
   2395 | returned.
   2396 *----------------------------------------------------------------------------*/
   2397 
   2398 int64 float64_to_int64_round_to_zero( float64 a )
   2399 {
   2400     flag aSign;
   2401     int16 aExp, shiftCount;
   2402     bits64 aSig;
   2403     int64 z;
   2404 
   2405     aSig = extractFloat64Frac( a );
   2406     aExp = extractFloat64Exp( a );
   2407     aSign = extractFloat64Sign( a );
   2408     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
   2409     shiftCount = aExp - 0x433;
   2410     if ( 0 <= shiftCount ) {
   2411         if ( 0x43E <= aExp ) {
   2412             if ( a != LIT64( 0xC3E0000000000000 ) ) {
   2413                 float_raise( float_flag_invalid );
   2414                 if (    ! aSign
   2415                      || (    ( aExp == 0x7FF )
   2416                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
   2417                    ) {
   2418                     return LIT64( 0x7FFFFFFFFFFFFFFF );
   2419                 }
   2420             }
   2421             return (sbits64) LIT64( 0x8000000000000000 );
   2422         }
   2423         z = aSig<<shiftCount;
   2424     }
   2425     else {
   2426         if ( aExp < 0x3FE ) {
   2427             if ( aExp | aSig ) float_set_inexact();
   2428             return 0;
   2429         }
   2430         z = aSig>>( - shiftCount );
   2431         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
   2432             float_set_inexact();
   2433         }
   2434     }
   2435     if ( aSign ) z = - z;
   2436     return z;
   2437 
   2438 }
   2439 #endif /* !SOFTFLOAT_FOR_GCC */
   2440 
   2441 /*----------------------------------------------------------------------------
   2442 | Returns the result of converting the double-precision floating-point value
   2443 | `a' to the single-precision floating-point format.  The conversion is
   2444 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   2445 | Arithmetic.
   2446 *----------------------------------------------------------------------------*/
   2447 
   2448 float32 float64_to_float32( float64 a )
   2449 {
   2450     flag aSign;
   2451     int16 aExp;
   2452     bits64 aSig;
   2453     bits32 zSig;
   2454 
   2455     aSig = extractFloat64Frac( a );
   2456     aExp = extractFloat64Exp( a );
   2457     aSign = extractFloat64Sign( a );
   2458     if ( aExp == 0x7FF ) {
   2459         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
   2460         return packFloat32( aSign, 0xFF, 0 );
   2461     }
   2462     shift64RightJamming( aSig, 22, &aSig );
   2463     zSig = aSig;
   2464     if ( aExp || zSig ) {
   2465         zSig |= 0x40000000;
   2466         aExp -= 0x381;
   2467     }
   2468     return roundAndPackFloat32( aSign, aExp, zSig );
   2469 
   2470 }
   2471 
   2472 #ifdef FLOATX80
   2473 
   2474 /*----------------------------------------------------------------------------
   2475 | Returns the result of converting the double-precision floating-point value
   2476 | `a' to the extended double-precision floating-point format.  The conversion
   2477 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   2478 | Arithmetic.
   2479 *----------------------------------------------------------------------------*/
   2480 
   2481 floatx80 float64_to_floatx80( float64 a )
   2482 {
   2483     flag aSign;
   2484     int16 aExp;
   2485     bits64 aSig;
   2486 
   2487     aSig = extractFloat64Frac( a );
   2488     aExp = extractFloat64Exp( a );
   2489     aSign = extractFloat64Sign( a );
   2490     if ( aExp == 0x7FF ) {
   2491         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
   2492         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   2493     }
   2494     if ( aExp == 0 ) {
   2495         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
   2496         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   2497     }
   2498     return
   2499         packFloatx80(
   2500             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
   2501 
   2502 }
   2503 
   2504 #endif
   2505 
   2506 #ifdef FLOAT128
   2507 
   2508 /*----------------------------------------------------------------------------
   2509 | Returns the result of converting the double-precision floating-point value
   2510 | `a' to the quadruple-precision floating-point format.  The conversion is
   2511 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   2512 | Arithmetic.
   2513 *----------------------------------------------------------------------------*/
   2514 
   2515 float128 float64_to_float128( float64 a )
   2516 {
   2517     flag aSign;
   2518     int16 aExp;
   2519     bits64 aSig, zSig0, zSig1;
   2520 
   2521     aSig = extractFloat64Frac( a );
   2522     aExp = extractFloat64Exp( a );
   2523     aSign = extractFloat64Sign( a );
   2524     if ( aExp == 0x7FF ) {
   2525         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
   2526         return packFloat128( aSign, 0x7FFF, 0, 0 );
   2527     }
   2528     if ( aExp == 0 ) {
   2529         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
   2530         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   2531         --aExp;
   2532     }
   2533     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
   2534     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
   2535 
   2536 }
   2537 
   2538 #endif
   2539 
   2540 #ifndef SOFTFLOAT_FOR_GCC
   2541 /*----------------------------------------------------------------------------
   2542 | Rounds the double-precision floating-point value `a' to an integer, and
   2543 | returns the result as a double-precision floating-point value.  The
   2544 | operation is performed according to the IEC/IEEE Standard for Binary
   2545 | Floating-Point Arithmetic.
   2546 *----------------------------------------------------------------------------*/
   2547 
   2548 float64 float64_round_to_int( float64 a )
   2549 {
   2550     flag aSign;
   2551     int16 aExp;
   2552     bits64 lastBitMask, roundBitsMask;
   2553     int8 roundingMode;
   2554     float64 z;
   2555 
   2556     aExp = extractFloat64Exp( a );
   2557     if ( 0x433 <= aExp ) {
   2558         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
   2559             return propagateFloat64NaN( a, a );
   2560         }
   2561         return a;
   2562     }
   2563     if ( aExp < 0x3FF ) {
   2564         if ( (bits64) ( a<<1 ) == 0 ) return a;
   2565         float_set_inexact();
   2566         aSign = extractFloat64Sign( a );
   2567         switch ( float_rounding_mode() ) {
   2568          case float_round_nearest_even:
   2569             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
   2570                 return packFloat64( aSign, 0x3FF, 0 );
   2571             }
   2572             break;
   2573          case float_round_down:
   2574             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
   2575          case float_round_up:
   2576             return
   2577             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
   2578         }
   2579         return packFloat64( aSign, 0, 0 );
   2580     }
   2581     lastBitMask = 1;
   2582     lastBitMask <<= 0x433 - aExp;
   2583     roundBitsMask = lastBitMask - 1;
   2584     z = a;
   2585     roundingMode = float_rounding_mode();
   2586     if ( roundingMode == float_round_nearest_even ) {
   2587         z += lastBitMask>>1;
   2588         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
   2589     }
   2590     else if ( roundingMode != float_round_to_zero ) {
   2591         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
   2592             z += roundBitsMask;
   2593         }
   2594     }
   2595     z &= ~ roundBitsMask;
   2596     if ( z != a ) float_set_inexact();
   2597     return z;
   2598 
   2599 }
   2600 #endif
   2601 
   2602 /*----------------------------------------------------------------------------
   2603 | Returns the result of adding the absolute values of the double-precision
   2604 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
   2605 | before being returned.  `zSign' is ignored if the result is a NaN.
   2606 | The addition is performed according to the IEC/IEEE Standard for Binary
   2607 | Floating-Point Arithmetic.
   2608 *----------------------------------------------------------------------------*/
   2609 
   2610 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
   2611 {
   2612     int16 aExp, bExp, zExp;
   2613     bits64 aSig, bSig, zSig;
   2614     int16 expDiff;
   2615 
   2616     aSig = extractFloat64Frac( a );
   2617     aExp = extractFloat64Exp( a );
   2618     bSig = extractFloat64Frac( b );
   2619     bExp = extractFloat64Exp( b );
   2620     expDiff = aExp - bExp;
   2621     aSig <<= 9;
   2622     bSig <<= 9;
   2623     if ( 0 < expDiff ) {
   2624         if ( aExp == 0x7FF ) {
   2625             if ( aSig ) return propagateFloat64NaN( a, b );
   2626             return a;
   2627         }
   2628         if ( bExp == 0 ) {
   2629             --expDiff;
   2630         }
   2631         else {
   2632             bSig |= LIT64( 0x2000000000000000 );
   2633         }
   2634         shift64RightJamming( bSig, expDiff, &bSig );
   2635         zExp = aExp;
   2636     }
   2637     else if ( expDiff < 0 ) {
   2638         if ( bExp == 0x7FF ) {
   2639             if ( bSig ) return propagateFloat64NaN( a, b );
   2640             return packFloat64( zSign, 0x7FF, 0 );
   2641         }
   2642         if ( aExp == 0 ) {
   2643             ++expDiff;
   2644         }
   2645         else {
   2646             aSig |= LIT64( 0x2000000000000000 );
   2647         }
   2648         shift64RightJamming( aSig, - expDiff, &aSig );
   2649         zExp = bExp;
   2650     }
   2651     else {
   2652         if ( aExp == 0x7FF ) {
   2653             if ( aSig | bSig ) return propagateFloat64NaN( a, b );
   2654             return a;
   2655         }
   2656         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
   2657         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
   2658         zExp = aExp;
   2659         goto roundAndPack;
   2660     }
   2661     aSig |= LIT64( 0x2000000000000000 );
   2662     zSig = ( aSig + bSig )<<1;
   2663     --zExp;
   2664     if ( (sbits64) zSig < 0 ) {
   2665         zSig = aSig + bSig;
   2666         ++zExp;
   2667     }
   2668  roundAndPack:
   2669     return roundAndPackFloat64( zSign, zExp, zSig );
   2670 
   2671 }
   2672 
   2673 /*----------------------------------------------------------------------------
   2674 | Returns the result of subtracting the absolute values of the double-
   2675 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
   2676 | difference is negated before being returned.  `zSign' is ignored if the
   2677 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
   2678 | Standard for Binary Floating-Point Arithmetic.
   2679 *----------------------------------------------------------------------------*/
   2680 
   2681 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
   2682 {
   2683     int16 aExp, bExp, zExp;
   2684     bits64 aSig, bSig, zSig;
   2685     int16 expDiff;
   2686 
   2687     aSig = extractFloat64Frac( a );
   2688     aExp = extractFloat64Exp( a );
   2689     bSig = extractFloat64Frac( b );
   2690     bExp = extractFloat64Exp( b );
   2691     expDiff = aExp - bExp;
   2692     aSig <<= 10;
   2693     bSig <<= 10;
   2694     if ( 0 < expDiff ) goto aExpBigger;
   2695     if ( expDiff < 0 ) goto bExpBigger;
   2696     if ( aExp == 0x7FF ) {
   2697         if ( aSig | bSig ) return propagateFloat64NaN( a, b );
   2698         float_raise( float_flag_invalid );
   2699         return float64_default_nan;
   2700     }
   2701     if ( aExp == 0 ) {
   2702         aExp = 1;
   2703         bExp = 1;
   2704     }
   2705     if ( bSig < aSig ) goto aBigger;
   2706     if ( aSig < bSig ) goto bBigger;
   2707     return packFloat64( float_rounding_mode() == float_round_down, 0, 0 );
   2708  bExpBigger:
   2709     if ( bExp == 0x7FF ) {
   2710         if ( bSig ) return propagateFloat64NaN( a, b );
   2711         return packFloat64( zSign ^ 1, 0x7FF, 0 );
   2712     }
   2713     if ( aExp == 0 ) {
   2714         ++expDiff;
   2715     }
   2716     else {
   2717         aSig |= LIT64( 0x4000000000000000 );
   2718     }
   2719     shift64RightJamming( aSig, - expDiff, &aSig );
   2720     bSig |= LIT64( 0x4000000000000000 );
   2721  bBigger:
   2722     zSig = bSig - aSig;
   2723     zExp = bExp;
   2724     zSign ^= 1;
   2725     goto normalizeRoundAndPack;
   2726  aExpBigger:
   2727     if ( aExp == 0x7FF ) {
   2728         if ( aSig ) return propagateFloat64NaN( a, b );
   2729         return a;
   2730     }
   2731     if ( bExp == 0 ) {
   2732         --expDiff;
   2733     }
   2734     else {
   2735         bSig |= LIT64( 0x4000000000000000 );
   2736     }
   2737     shift64RightJamming( bSig, expDiff, &bSig );
   2738     aSig |= LIT64( 0x4000000000000000 );
   2739  aBigger:
   2740     zSig = aSig - bSig;
   2741     zExp = aExp;
   2742  normalizeRoundAndPack:
   2743     --zExp;
   2744     return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
   2745 
   2746 }
   2747 
   2748 /*----------------------------------------------------------------------------
   2749 | Returns the result of adding the double-precision floating-point values `a'
   2750 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
   2751 | Binary Floating-Point Arithmetic.
   2752 *----------------------------------------------------------------------------*/
   2753 
   2754 float64 float64_add( float64 a, float64 b )
   2755 {
   2756     flag aSign, bSign;
   2757 
   2758     aSign = extractFloat64Sign( a );
   2759     bSign = extractFloat64Sign( b );
   2760     if ( aSign == bSign ) {
   2761         return addFloat64Sigs( a, b, aSign );
   2762     }
   2763     else {
   2764         return subFloat64Sigs( a, b, aSign );
   2765     }
   2766 
   2767 }
   2768 
   2769 /*----------------------------------------------------------------------------
   2770 | Returns the result of subtracting the double-precision floating-point values
   2771 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   2772 | for Binary Floating-Point Arithmetic.
   2773 *----------------------------------------------------------------------------*/
   2774 
   2775 float64 float64_sub( float64 a, float64 b )
   2776 {
   2777     flag aSign, bSign;
   2778 
   2779     aSign = extractFloat64Sign( a );
   2780     bSign = extractFloat64Sign( b );
   2781     if ( aSign == bSign ) {
   2782         return subFloat64Sigs( a, b, aSign );
   2783     }
   2784     else {
   2785         return addFloat64Sigs( a, b, aSign );
   2786     }
   2787 
   2788 }
   2789 
   2790 /*----------------------------------------------------------------------------
   2791 | Returns the result of multiplying the double-precision floating-point values
   2792 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   2793 | for Binary Floating-Point Arithmetic.
   2794 *----------------------------------------------------------------------------*/
   2795 
   2796 float64 float64_mul( float64 a, float64 b )
   2797 {
   2798     flag aSign, bSign, zSign;
   2799     int16 aExp, bExp, zExp;
   2800     bits64 aSig, bSig, zSig0, zSig1;
   2801 
   2802     aSig = extractFloat64Frac( a );
   2803     aExp = extractFloat64Exp( a );
   2804     aSign = extractFloat64Sign( a );
   2805     bSig = extractFloat64Frac( b );
   2806     bExp = extractFloat64Exp( b );
   2807     bSign = extractFloat64Sign( b );
   2808     zSign = aSign ^ bSign;
   2809     if ( aExp == 0x7FF ) {
   2810         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
   2811             return propagateFloat64NaN( a, b );
   2812         }
   2813         if ( ( bExp | bSig ) == 0 ) {
   2814             float_raise( float_flag_invalid );
   2815             return float64_default_nan;
   2816         }
   2817         return packFloat64( zSign, 0x7FF, 0 );
   2818     }
   2819     if ( bExp == 0x7FF ) {
   2820         if ( bSig ) return propagateFloat64NaN( a, b );
   2821         if ( ( aExp | aSig ) == 0 ) {
   2822             float_raise( float_flag_invalid );
   2823             return float64_default_nan;
   2824         }
   2825         return packFloat64( zSign, 0x7FF, 0 );
   2826     }
   2827     if ( aExp == 0 ) {
   2828         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
   2829         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   2830     }
   2831     if ( bExp == 0 ) {
   2832         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
   2833         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
   2834     }
   2835     zExp = aExp + bExp - 0x3FF;
   2836     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
   2837     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
   2838     mul64To128( aSig, bSig, &zSig0, &zSig1 );
   2839     zSig0 |= ( zSig1 != 0 );
   2840     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
   2841         zSig0 <<= 1;
   2842         --zExp;
   2843     }
   2844     return roundAndPackFloat64( zSign, zExp, zSig0 );
   2845 
   2846 }
   2847 
   2848 /*----------------------------------------------------------------------------
   2849 | Returns the result of dividing the double-precision floating-point value `a'
   2850 | by the corresponding value `b'.  The operation is performed according to
   2851 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2852 *----------------------------------------------------------------------------*/
   2853 
   2854 float64 float64_div( float64 a, float64 b )
   2855 {
   2856     flag aSign, bSign, zSign;
   2857     int16 aExp, bExp, zExp;
   2858     bits64 aSig, bSig, zSig;
   2859     bits64 rem0, rem1;
   2860     bits64 term0, term1;
   2861 
   2862     aSig = extractFloat64Frac( a );
   2863     aExp = extractFloat64Exp( a );
   2864     aSign = extractFloat64Sign( a );
   2865     bSig = extractFloat64Frac( b );
   2866     bExp = extractFloat64Exp( b );
   2867     bSign = extractFloat64Sign( b );
   2868     zSign = aSign ^ bSign;
   2869     if ( aExp == 0x7FF ) {
   2870         if ( aSig ) return propagateFloat64NaN( a, b );
   2871         if ( bExp == 0x7FF ) {
   2872             if ( bSig ) return propagateFloat64NaN( a, b );
   2873             float_raise( float_flag_invalid );
   2874             return float64_default_nan;
   2875         }
   2876         return packFloat64( zSign, 0x7FF, 0 );
   2877     }
   2878     if ( bExp == 0x7FF ) {
   2879         if ( bSig ) return propagateFloat64NaN( a, b );
   2880         return packFloat64( zSign, 0, 0 );
   2881     }
   2882     if ( bExp == 0 ) {
   2883         if ( bSig == 0 ) {
   2884             if ( ( aExp | aSig ) == 0 ) {
   2885                 float_raise( float_flag_invalid );
   2886                 return float64_default_nan;
   2887             }
   2888             float_raise( float_flag_divbyzero );
   2889             return packFloat64( zSign, 0x7FF, 0 );
   2890         }
   2891         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
   2892     }
   2893     if ( aExp == 0 ) {
   2894         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
   2895         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   2896     }
   2897     zExp = aExp - bExp + 0x3FD;
   2898     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
   2899     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
   2900     if ( bSig <= ( aSig + aSig ) ) {
   2901         aSig >>= 1;
   2902         ++zExp;
   2903     }
   2904     zSig = estimateDiv128To64( aSig, 0, bSig );
   2905     if ( ( zSig & 0x1FF ) <= 2 ) {
   2906         mul64To128( bSig, zSig, &term0, &term1 );
   2907         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
   2908         while ( (sbits64) rem0 < 0 ) {
   2909             --zSig;
   2910             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
   2911         }
   2912         zSig |= ( rem1 != 0 );
   2913     }
   2914     return roundAndPackFloat64( zSign, zExp, zSig );
   2915 
   2916 }
   2917 
   2918 #ifndef SOFTFLOAT_FOR_GCC
   2919 /*----------------------------------------------------------------------------
   2920 | Returns the remainder of the double-precision floating-point value `a'
   2921 | with respect to the corresponding value `b'.  The operation is performed
   2922 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2923 *----------------------------------------------------------------------------*/
   2924 
   2925 float64 float64_rem( float64 a, float64 b )
   2926 {
   2927     flag aSign, bSign __unused, zSign;
   2928     int16 aExp, bExp, expDiff;
   2929     bits64 aSig, bSig;
   2930     bits64 q, alternateASig;
   2931     sbits64 sigMean;
   2932 
   2933     aSig = extractFloat64Frac( a );
   2934     aExp = extractFloat64Exp( a );
   2935     aSign = extractFloat64Sign( a );
   2936     bSig = extractFloat64Frac( b );
   2937     bExp = extractFloat64Exp( b );
   2938     bSign = extractFloat64Sign( b );
   2939     if ( aExp == 0x7FF ) {
   2940         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
   2941             return propagateFloat64NaN( a, b );
   2942         }
   2943         float_raise( float_flag_invalid );
   2944         return float64_default_nan;
   2945     }
   2946     if ( bExp == 0x7FF ) {
   2947         if ( bSig ) return propagateFloat64NaN( a, b );
   2948         return a;
   2949     }
   2950     if ( bExp == 0 ) {
   2951         if ( bSig == 0 ) {
   2952             float_raise( float_flag_invalid );
   2953             return float64_default_nan;
   2954         }
   2955         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
   2956     }
   2957     if ( aExp == 0 ) {
   2958         if ( aSig == 0 ) return a;
   2959         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   2960     }
   2961     expDiff = aExp - bExp;
   2962     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
   2963     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
   2964     if ( expDiff < 0 ) {
   2965         if ( expDiff < -1 ) return a;
   2966         aSig >>= 1;
   2967     }
   2968     q = ( bSig <= aSig );
   2969     if ( q ) aSig -= bSig;
   2970     expDiff -= 64;
   2971     while ( 0 < expDiff ) {
   2972         q = estimateDiv128To64( aSig, 0, bSig );
   2973         q = ( 2 < q ) ? q - 2 : 0;
   2974         aSig = - ( ( bSig>>2 ) * q );
   2975         expDiff -= 62;
   2976     }
   2977     expDiff += 64;
   2978     if ( 0 < expDiff ) {
   2979         q = estimateDiv128To64( aSig, 0, bSig );
   2980         q = ( 2 < q ) ? q - 2 : 0;
   2981         q >>= 64 - expDiff;
   2982         bSig >>= 2;
   2983         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
   2984     }
   2985     else {
   2986         aSig >>= 2;
   2987         bSig >>= 2;
   2988     }
   2989     do {
   2990         alternateASig = aSig;
   2991         ++q;
   2992         aSig -= bSig;
   2993     } while ( 0 <= (sbits64) aSig );
   2994     sigMean = aSig + alternateASig;
   2995     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
   2996         aSig = alternateASig;
   2997     }
   2998     zSign = ( (sbits64) aSig < 0 );
   2999     if ( zSign ) aSig = - aSig;
   3000     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
   3001 
   3002 }
   3003 
   3004 /*----------------------------------------------------------------------------
   3005 | Returns the square root of the double-precision floating-point value `a'.
   3006 | The operation is performed according to the IEC/IEEE Standard for Binary
   3007 | Floating-Point Arithmetic.
   3008 *----------------------------------------------------------------------------*/
   3009 
   3010 float64 float64_sqrt( float64 a )
   3011 {
   3012     flag aSign;
   3013     int16 aExp, zExp;
   3014     bits64 aSig, zSig, doubleZSig;
   3015     bits64 rem0, rem1, term0, term1;
   3016 
   3017     aSig = extractFloat64Frac( a );
   3018     aExp = extractFloat64Exp( a );
   3019     aSign = extractFloat64Sign( a );
   3020     if ( aExp == 0x7FF ) {
   3021         if ( aSig ) return propagateFloat64NaN( a, a );
   3022         if ( ! aSign ) return a;
   3023         float_raise( float_flag_invalid );
   3024         return float64_default_nan;
   3025     }
   3026     if ( aSign ) {
   3027         if ( ( aExp | aSig ) == 0 ) return a;
   3028         float_raise( float_flag_invalid );
   3029         return float64_default_nan;
   3030     }
   3031     if ( aExp == 0 ) {
   3032         if ( aSig == 0 ) return 0;
   3033         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   3034     }
   3035     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
   3036     aSig |= LIT64( 0x0010000000000000 );
   3037     zSig = estimateSqrt32( aExp, aSig>>21 );
   3038     aSig <<= 9 - ( aExp & 1 );
   3039     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
   3040     if ( ( zSig & 0x1FF ) <= 5 ) {
   3041         doubleZSig = zSig<<1;
   3042         mul64To128( zSig, zSig, &term0, &term1 );
   3043         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
   3044         while ( (sbits64) rem0 < 0 ) {
   3045             --zSig;
   3046             doubleZSig -= 2;
   3047             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
   3048         }
   3049         zSig |= ( ( rem0 | rem1 ) != 0 );
   3050     }
   3051     return roundAndPackFloat64( 0, zExp, zSig );
   3052 
   3053 }
   3054 #endif
   3055 
   3056 /*----------------------------------------------------------------------------
   3057 | Returns 1 if the double-precision floating-point value `a' is equal to the
   3058 | corresponding value `b', and 0 otherwise.  The comparison is performed
   3059 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3060 *----------------------------------------------------------------------------*/
   3061 
   3062 flag float64_eq( float64 a, float64 b )
   3063 {
   3064 
   3065     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3066          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3067        ) {
   3068         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
   3069             float_raise( float_flag_invalid );
   3070         }
   3071         return 0;
   3072     }
   3073     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
   3074 
   3075 }
   3076 
   3077 /*----------------------------------------------------------------------------
   3078 | Returns 1 if the double-precision floating-point value `a' is less than or
   3079 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
   3080 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   3081 | Arithmetic.
   3082 *----------------------------------------------------------------------------*/
   3083 
   3084 flag float64_le( float64 a, float64 b )
   3085 {
   3086     flag aSign, bSign;
   3087 
   3088     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3089          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3090        ) {
   3091         float_raise( float_flag_invalid );
   3092         return 0;
   3093     }
   3094     aSign = extractFloat64Sign( a );
   3095     bSign = extractFloat64Sign( b );
   3096     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
   3097     return ( a == b ) || ( aSign ^ ( a < b ) );
   3098 
   3099 }
   3100 
   3101 /*----------------------------------------------------------------------------
   3102 | Returns 1 if the double-precision floating-point value `a' is less than
   3103 | the corresponding value `b', and 0 otherwise.  The comparison is performed
   3104 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3105 *----------------------------------------------------------------------------*/
   3106 
   3107 flag float64_lt( float64 a, float64 b )
   3108 {
   3109     flag aSign, bSign;
   3110 
   3111     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3112          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3113        ) {
   3114         float_raise( float_flag_invalid );
   3115         return 0;
   3116     }
   3117     aSign = extractFloat64Sign( a );
   3118     bSign = extractFloat64Sign( b );
   3119     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
   3120     return ( a != b ) && ( aSign ^ ( a < b ) );
   3121 
   3122 }
   3123 
   3124 #ifndef SOFTFLOAT_FOR_GCC
   3125 /*----------------------------------------------------------------------------
   3126 | Returns 1 if the double-precision floating-point value `a' is equal to the
   3127 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
   3128 | if either operand is a NaN.  Otherwise, the comparison is performed
   3129 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3130 *----------------------------------------------------------------------------*/
   3131 
   3132 flag float64_eq_signaling( float64 a, float64 b )
   3133 {
   3134 
   3135     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3136          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3137        ) {
   3138         float_raise( float_flag_invalid );
   3139         return 0;
   3140     }
   3141     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
   3142 
   3143 }
   3144 
   3145 /*----------------------------------------------------------------------------
   3146 | Returns 1 if the double-precision floating-point value `a' is less than or
   3147 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
   3148 | cause an exception.  Otherwise, the comparison is performed according to the
   3149 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3150 *----------------------------------------------------------------------------*/
   3151 
   3152 flag float64_le_quiet( float64 a, float64 b )
   3153 {
   3154     flag aSign, bSign;
   3155 
   3156     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3157          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3158        ) {
   3159         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
   3160             float_raise( float_flag_invalid );
   3161         }
   3162         return 0;
   3163     }
   3164     aSign = extractFloat64Sign( a );
   3165     bSign = extractFloat64Sign( b );
   3166     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
   3167     return ( a == b ) || ( aSign ^ ( a < b ) );
   3168 
   3169 }
   3170 
   3171 /*----------------------------------------------------------------------------
   3172 | Returns 1 if the double-precision floating-point value `a' is less than
   3173 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
   3174 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
   3175 | Standard for Binary Floating-Point Arithmetic.
   3176 *----------------------------------------------------------------------------*/
   3177 
   3178 flag float64_lt_quiet( float64 a, float64 b )
   3179 {
   3180     flag aSign, bSign;
   3181 
   3182     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3183          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3184        ) {
   3185         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
   3186             float_raise( float_flag_invalid );
   3187         }
   3188         return 0;
   3189     }
   3190     aSign = extractFloat64Sign( a );
   3191     bSign = extractFloat64Sign( b );
   3192     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
   3193     return ( a != b ) && ( aSign ^ ( a < b ) );
   3194 
   3195 }
   3196 #endif
   3197 
   3198 #ifdef FLOATX80
   3199 
   3200 /*----------------------------------------------------------------------------
   3201 | Returns the result of converting the extended double-precision floating-
   3202 | point value `a' to the 32-bit two's complement integer format.  The
   3203 | conversion is performed according to the IEC/IEEE Standard for Binary
   3204 | Floating-Point Arithmetic---which means in particular that the conversion
   3205 | is rounded according to the current rounding mode.  If `a' is a NaN, the
   3206 | largest positive integer is returned.  Otherwise, if the conversion
   3207 | overflows, the largest integer with the same sign as `a' is returned.
   3208 *----------------------------------------------------------------------------*/
   3209 
   3210 int32 floatx80_to_int32( floatx80 a )
   3211 {
   3212     flag aSign;
   3213     int32 aExp, shiftCount;
   3214     bits64 aSig;
   3215 
   3216     aSig = extractFloatx80Frac( a );
   3217     aExp = extractFloatx80Exp( a );
   3218     aSign = extractFloatx80Sign( a );
   3219     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
   3220     shiftCount = 0x4037 - aExp;
   3221     if ( shiftCount <= 0 ) shiftCount = 1;
   3222     shift64RightJamming( aSig, shiftCount, &aSig );
   3223     return roundAndPackInt32( aSign, aSig );
   3224 
   3225 }
   3226 
   3227 /*----------------------------------------------------------------------------
   3228 | Returns the result of converting the extended double-precision floating-
   3229 | point value `a' to the 32-bit two's complement integer format.  The
   3230 | conversion is performed according to the IEC/IEEE Standard for Binary
   3231 | Floating-Point Arithmetic, except that the conversion is always rounded
   3232 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
   3233 | Otherwise, if the conversion overflows, the largest integer with the same
   3234 | sign as `a' is returned.
   3235 *----------------------------------------------------------------------------*/
   3236 
   3237 int32 floatx80_to_int32_round_to_zero( floatx80 a )
   3238 {
   3239     flag aSign;
   3240     int32 aExp, shiftCount;
   3241     bits64 aSig, savedASig;
   3242     int32 z;
   3243 
   3244     aSig = extractFloatx80Frac( a );
   3245     aExp = extractFloatx80Exp( a );
   3246     aSign = extractFloatx80Sign( a );
   3247     if ( 0x401E < aExp ) {
   3248         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
   3249         goto invalid;
   3250     }
   3251     else if ( aExp < 0x3FFF ) {
   3252         if ( aExp || aSig ) float_set_inexact();
   3253         return 0;
   3254     }
   3255     shiftCount = 0x403E - aExp;
   3256     savedASig = aSig;
   3257     aSig >>= shiftCount;
   3258     z = aSig;
   3259     if ( aSign ) z = - z;
   3260     if ( ( z < 0 ) ^ aSign ) {
   3261  invalid:
   3262         float_raise( float_flag_invalid );
   3263         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
   3264     }
   3265     if ( ( aSig<<shiftCount ) != savedASig ) {
   3266         float_set_inexact();
   3267     }
   3268     return z;
   3269 
   3270 }
   3271 
   3272 /*----------------------------------------------------------------------------
   3273 | Returns the result of converting the extended double-precision floating-
   3274 | point value `a' to the 64-bit two's complement integer format.  The
   3275 | conversion is performed according to the IEC/IEEE Standard for Binary
   3276 | Floating-Point Arithmetic---which means in particular that the conversion
   3277 | is rounded according to the current rounding mode.  If `a' is a NaN,
   3278 | the largest positive integer is returned.  Otherwise, if the conversion
   3279 | overflows, the largest integer with the same sign as `a' is returned.
   3280 *----------------------------------------------------------------------------*/
   3281 
   3282 int64 floatx80_to_int64( floatx80 a )
   3283 {
   3284     flag aSign;
   3285     int32 aExp, shiftCount;
   3286     bits64 aSig, aSigExtra;
   3287 
   3288     aSig = extractFloatx80Frac( a );
   3289     aExp = extractFloatx80Exp( a );
   3290     aSign = extractFloatx80Sign( a );
   3291     shiftCount = 0x403E - aExp;
   3292     if ( shiftCount <= 0 ) {
   3293         if ( shiftCount ) {
   3294             float_raise( float_flag_invalid );
   3295             if (    ! aSign
   3296                  || (    ( aExp == 0x7FFF )
   3297                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
   3298                ) {
   3299                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   3300             }
   3301             return (sbits64) LIT64( 0x8000000000000000 );
   3302         }
   3303         aSigExtra = 0;
   3304     }
   3305     else {
   3306         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
   3307     }
   3308     return roundAndPackInt64( aSign, aSig, aSigExtra );
   3309 
   3310 }
   3311 
   3312 /*----------------------------------------------------------------------------
   3313 | Returns the result of converting the extended double-precision floating-
   3314 | point value `a' to the 64-bit two's complement integer format.  The
   3315 | conversion is performed according to the IEC/IEEE Standard for Binary
   3316 | Floating-Point Arithmetic, except that the conversion is always rounded
   3317 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
   3318 | Otherwise, if the conversion overflows, the largest integer with the same
   3319 | sign as `a' is returned.
   3320 *----------------------------------------------------------------------------*/
   3321 
   3322 int64 floatx80_to_int64_round_to_zero( floatx80 a )
   3323 {
   3324     flag aSign;
   3325     int32 aExp, shiftCount;
   3326     bits64 aSig;
   3327     int64 z;
   3328 
   3329     aSig = extractFloatx80Frac( a );
   3330     aExp = extractFloatx80Exp( a );
   3331     aSign = extractFloatx80Sign( a );
   3332     shiftCount = aExp - 0x403E;
   3333     if ( 0 <= shiftCount ) {
   3334         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
   3335         if ( ( a.high != 0xC03E ) || aSig ) {
   3336             float_raise( float_flag_invalid );
   3337             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
   3338                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   3339             }
   3340         }
   3341         return (sbits64) LIT64( 0x8000000000000000 );
   3342     }
   3343     else if ( aExp < 0x3FFF ) {
   3344         if ( aExp | aSig ) float_set_inexact();
   3345         return 0;
   3346     }
   3347     z = aSig>>( - shiftCount );
   3348     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
   3349         float_set_inexact();
   3350     }
   3351     if ( aSign ) z = - z;
   3352     return z;
   3353 
   3354 }
   3355 
   3356 /*----------------------------------------------------------------------------
   3357 | Returns the result of converting the extended double-precision floating-
   3358 | point value `a' to the single-precision floating-point format.  The
   3359 | conversion is performed according to the IEC/IEEE Standard for Binary
   3360 | Floating-Point Arithmetic.
   3361 *----------------------------------------------------------------------------*/
   3362 
   3363 float32 floatx80_to_float32( floatx80 a )
   3364 {
   3365     flag aSign;
   3366     int32 aExp;
   3367     bits64 aSig;
   3368 
   3369     aSig = extractFloatx80Frac( a );
   3370     aExp = extractFloatx80Exp( a );
   3371     aSign = extractFloatx80Sign( a );
   3372     if ( aExp == 0x7FFF ) {
   3373         if ( (bits64) ( aSig<<1 ) ) {
   3374             return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
   3375         }
   3376         return packFloat32( aSign, 0xFF, 0 );
   3377     }
   3378     shift64RightJamming( aSig, 33, &aSig );
   3379     if ( aExp || aSig ) aExp -= 0x3F81;
   3380     return roundAndPackFloat32( aSign, aExp, aSig );
   3381 
   3382 }
   3383 
   3384 /*----------------------------------------------------------------------------
   3385 | Returns the result of converting the extended double-precision floating-
   3386 | point value `a' to the double-precision floating-point format.  The
   3387 | conversion is performed according to the IEC/IEEE Standard for Binary
   3388 | Floating-Point Arithmetic.
   3389 *----------------------------------------------------------------------------*/
   3390 
   3391 float64 floatx80_to_float64( floatx80 a )
   3392 {
   3393     flag aSign;
   3394     int32 aExp;
   3395     bits64 aSig, zSig;
   3396 
   3397     aSig = extractFloatx80Frac( a );
   3398     aExp = extractFloatx80Exp( a );
   3399     aSign = extractFloatx80Sign( a );
   3400     if ( aExp == 0x7FFF ) {
   3401         if ( (bits64) ( aSig<<1 ) ) {
   3402             return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
   3403         }
   3404         return packFloat64( aSign, 0x7FF, 0 );
   3405     }
   3406     shift64RightJamming( aSig, 1, &zSig );
   3407     if ( aExp || aSig ) aExp -= 0x3C01;
   3408     return roundAndPackFloat64( aSign, aExp, zSig );
   3409 
   3410 }
   3411 
   3412 #ifdef FLOAT128
   3413 
   3414 /*----------------------------------------------------------------------------
   3415 | Returns the result of converting the extended double-precision floating-
   3416 | point value `a' to the quadruple-precision floating-point format.  The
   3417 | conversion is performed according to the IEC/IEEE Standard for Binary
   3418 | Floating-Point Arithmetic.
   3419 *----------------------------------------------------------------------------*/
   3420 
   3421 float128 floatx80_to_float128( floatx80 a )
   3422 {
   3423     flag aSign;
   3424     int16 aExp;
   3425     bits64 aSig, zSig0, zSig1;
   3426 
   3427     aSig = extractFloatx80Frac( a );
   3428     aExp = extractFloatx80Exp( a );
   3429     aSign = extractFloatx80Sign( a );
   3430     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
   3431         return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
   3432     }
   3433     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
   3434     return packFloat128( aSign, aExp, zSig0, zSig1 );
   3435 
   3436 }
   3437 
   3438 #endif
   3439 
   3440 /*----------------------------------------------------------------------------
   3441 | Rounds the extended double-precision floating-point value `a' to an integer,
   3442 | and returns the result as an extended quadruple-precision floating-point
   3443 | value.  The operation is performed according to the IEC/IEEE Standard for
   3444 | Binary Floating-Point Arithmetic.
   3445 *----------------------------------------------------------------------------*/
   3446 
   3447 floatx80 floatx80_round_to_int( floatx80 a )
   3448 {
   3449     flag aSign;
   3450     int32 aExp;
   3451     bits64 lastBitMask, roundBitsMask;
   3452     int8 roundingMode;
   3453     floatx80 z;
   3454 
   3455     aExp = extractFloatx80Exp( a );
   3456     if ( 0x403E <= aExp ) {
   3457         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
   3458             return propagateFloatx80NaN( a, a );
   3459         }
   3460         return a;
   3461     }
   3462     if ( aExp < 0x3FFF ) {
   3463         if (    ( aExp == 0 )
   3464              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
   3465             return a;
   3466         }
   3467         float_set_inexact();
   3468         aSign = extractFloatx80Sign( a );
   3469         switch ( float_rounding_mode() ) {
   3470          case float_round_nearest_even:
   3471             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
   3472                ) {
   3473                 return
   3474                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
   3475             }
   3476             break;
   3477          case float_round_down:
   3478             return
   3479                   aSign ?
   3480                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
   3481                 : packFloatx80( 0, 0, 0 );
   3482          case float_round_up:
   3483             return
   3484                   aSign ? packFloatx80( 1, 0, 0 )
   3485                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
   3486         }
   3487         return packFloatx80( aSign, 0, 0 );
   3488     }
   3489     lastBitMask = 1;
   3490     lastBitMask <<= 0x403E - aExp;
   3491     roundBitsMask = lastBitMask - 1;
   3492     z = a;
   3493     roundingMode = float_rounding_mode();
   3494     if ( roundingMode == float_round_nearest_even ) {
   3495         z.low += lastBitMask>>1;
   3496         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
   3497     }
   3498     else if ( roundingMode != float_round_to_zero ) {
   3499         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
   3500             z.low += roundBitsMask;
   3501         }
   3502     }
   3503     z.low &= ~ roundBitsMask;
   3504     if ( z.low == 0 ) {
   3505         ++z.high;
   3506         z.low = LIT64( 0x8000000000000000 );
   3507     }
   3508     if ( z.low != a.low ) float_set_inexact();
   3509     return z;
   3510 
   3511 }
   3512 
   3513 /*----------------------------------------------------------------------------
   3514 | Returns the result of adding the absolute values of the extended double-
   3515 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
   3516 | negated before being returned.  `zSign' is ignored if the result is a NaN.
   3517 | The addition is performed according to the IEC/IEEE Standard for Binary
   3518 | Floating-Point Arithmetic.
   3519 *----------------------------------------------------------------------------*/
   3520 
   3521 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
   3522 {
   3523     int32 aExp, bExp, zExp;
   3524     bits64 aSig, bSig, zSig0, zSig1;
   3525     int32 expDiff;
   3526 
   3527     aSig = extractFloatx80Frac( a );
   3528     aExp = extractFloatx80Exp( a );
   3529     bSig = extractFloatx80Frac( b );
   3530     bExp = extractFloatx80Exp( b );
   3531     expDiff = aExp - bExp;
   3532     if ( 0 < expDiff ) {
   3533         if ( aExp == 0x7FFF ) {
   3534             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3535             return a;
   3536         }
   3537         if ( bExp == 0 ) --expDiff;
   3538         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
   3539         zExp = aExp;
   3540     }
   3541     else if ( expDiff < 0 ) {
   3542         if ( bExp == 0x7FFF ) {
   3543             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3544             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3545         }
   3546         if ( aExp == 0 ) ++expDiff;
   3547         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
   3548         zExp = bExp;
   3549     }
   3550     else {
   3551         if ( aExp == 0x7FFF ) {
   3552             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
   3553                 return propagateFloatx80NaN( a, b );
   3554             }
   3555             return a;
   3556         }
   3557         zSig1 = 0;
   3558         zSig0 = aSig + bSig;
   3559         if ( aExp == 0 ) {
   3560             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
   3561             goto roundAndPack;
   3562         }
   3563         zExp = aExp;
   3564         goto shiftRight1;
   3565     }
   3566     zSig0 = aSig + bSig;
   3567     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
   3568  shiftRight1:
   3569     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
   3570     zSig0 |= LIT64( 0x8000000000000000 );
   3571     ++zExp;
   3572  roundAndPack:
   3573     return
   3574         roundAndPackFloatx80(
   3575             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
   3576 
   3577 }
   3578 
   3579 /*----------------------------------------------------------------------------
   3580 | Returns the result of subtracting the absolute values of the extended
   3581 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
   3582 | difference is negated before being returned.  `zSign' is ignored if the
   3583 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
   3584 | Standard for Binary Floating-Point Arithmetic.
   3585 *----------------------------------------------------------------------------*/
   3586 
   3587 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
   3588 {
   3589     int32 aExp, bExp, zExp;
   3590     bits64 aSig, bSig, zSig0, zSig1;
   3591     int32 expDiff;
   3592     floatx80 z;
   3593 
   3594     aSig = extractFloatx80Frac( a );
   3595     aExp = extractFloatx80Exp( a );
   3596     bSig = extractFloatx80Frac( b );
   3597     bExp = extractFloatx80Exp( b );
   3598     expDiff = aExp - bExp;
   3599     if ( 0 < expDiff ) goto aExpBigger;
   3600     if ( expDiff < 0 ) goto bExpBigger;
   3601     if ( aExp == 0x7FFF ) {
   3602         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
   3603             return propagateFloatx80NaN( a, b );
   3604         }
   3605         float_raise( float_flag_invalid );
   3606         z.low = floatx80_default_nan_low;
   3607         z.high = floatx80_default_nan_high;
   3608         return z;
   3609     }
   3610     if ( aExp == 0 ) {
   3611         aExp = 1;
   3612         bExp = 1;
   3613     }
   3614     zSig1 = 0;
   3615     if ( bSig < aSig ) goto aBigger;
   3616     if ( aSig < bSig ) goto bBigger;
   3617     return packFloatx80( float_rounding_mode() == float_round_down, 0, 0 );
   3618  bExpBigger:
   3619     if ( bExp == 0x7FFF ) {
   3620         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3621         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3622     }
   3623     if ( aExp == 0 ) ++expDiff;
   3624     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
   3625  bBigger:
   3626     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
   3627     zExp = bExp;
   3628     zSign ^= 1;
   3629     goto normalizeRoundAndPack;
   3630  aExpBigger:
   3631     if ( aExp == 0x7FFF ) {
   3632         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3633         return a;
   3634     }
   3635     if ( bExp == 0 ) --expDiff;
   3636     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
   3637  aBigger:
   3638     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
   3639     zExp = aExp;
   3640  normalizeRoundAndPack:
   3641     return
   3642         normalizeRoundAndPackFloatx80(
   3643             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
   3644 
   3645 }
   3646 
   3647 /*----------------------------------------------------------------------------
   3648 | Returns the result of adding the extended double-precision floating-point
   3649 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
   3650 | Standard for Binary Floating-Point Arithmetic.
   3651 *----------------------------------------------------------------------------*/
   3652 
   3653 floatx80 floatx80_add( floatx80 a, floatx80 b )
   3654 {
   3655     flag aSign, bSign;
   3656 
   3657     aSign = extractFloatx80Sign( a );
   3658     bSign = extractFloatx80Sign( b );
   3659     if ( aSign == bSign ) {
   3660         return addFloatx80Sigs( a, b, aSign );
   3661     }
   3662     else {
   3663         return subFloatx80Sigs( a, b, aSign );
   3664     }
   3665 
   3666 }
   3667 
   3668 /*----------------------------------------------------------------------------
   3669 | Returns the result of subtracting the extended double-precision floating-
   3670 | point values `a' and `b'.  The operation is performed according to the
   3671 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3672 *----------------------------------------------------------------------------*/
   3673 
   3674 floatx80 floatx80_sub( floatx80 a, floatx80 b )
   3675 {
   3676     flag aSign, bSign;
   3677 
   3678     aSign = extractFloatx80Sign( a );
   3679     bSign = extractFloatx80Sign( b );
   3680     if ( aSign == bSign ) {
   3681         return subFloatx80Sigs( a, b, aSign );
   3682     }
   3683     else {
   3684         return addFloatx80Sigs( a, b, aSign );
   3685     }
   3686 
   3687 }
   3688 
   3689 /*----------------------------------------------------------------------------
   3690 | Returns the result of multiplying the extended double-precision floating-
   3691 | point values `a' and `b'.  The operation is performed according to the
   3692 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3693 *----------------------------------------------------------------------------*/
   3694 
   3695 floatx80 floatx80_mul( floatx80 a, floatx80 b )
   3696 {
   3697     flag aSign, bSign, zSign;
   3698     int32 aExp, bExp, zExp;
   3699     bits64 aSig, bSig, zSig0, zSig1;
   3700     floatx80 z;
   3701 
   3702     aSig = extractFloatx80Frac( a );
   3703     aExp = extractFloatx80Exp( a );
   3704     aSign = extractFloatx80Sign( a );
   3705     bSig = extractFloatx80Frac( b );
   3706     bExp = extractFloatx80Exp( b );
   3707     bSign = extractFloatx80Sign( b );
   3708     zSign = aSign ^ bSign;
   3709     if ( aExp == 0x7FFF ) {
   3710         if (    (bits64) ( aSig<<1 )
   3711              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
   3712             return propagateFloatx80NaN( a, b );
   3713         }
   3714         if ( ( bExp | bSig ) == 0 ) goto invalid;
   3715         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3716     }
   3717     if ( bExp == 0x7FFF ) {
   3718         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3719         if ( ( aExp | aSig ) == 0 ) {
   3720  invalid:
   3721             float_raise( float_flag_invalid );
   3722             z.low = floatx80_default_nan_low;
   3723             z.high = floatx80_default_nan_high;
   3724             return z;
   3725         }
   3726         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3727     }
   3728     if ( aExp == 0 ) {
   3729         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
   3730         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
   3731     }
   3732     if ( bExp == 0 ) {
   3733         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
   3734         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
   3735     }
   3736     zExp = aExp + bExp - 0x3FFE;
   3737     mul64To128( aSig, bSig, &zSig0, &zSig1 );
   3738     if ( 0 < (sbits64) zSig0 ) {
   3739         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
   3740         --zExp;
   3741     }
   3742     return
   3743         roundAndPackFloatx80(
   3744             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
   3745 
   3746 }
   3747 
   3748 /*----------------------------------------------------------------------------
   3749 | Returns the result of dividing the extended double-precision floating-point
   3750 | value `a' by the corresponding value `b'.  The operation is performed
   3751 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3752 *----------------------------------------------------------------------------*/
   3753 
   3754 floatx80 floatx80_div( floatx80 a, floatx80 b )
   3755 {
   3756     flag aSign, bSign, zSign;
   3757     int32 aExp, bExp, zExp;
   3758     bits64 aSig, bSig, zSig0, zSig1;
   3759     bits64 rem0, rem1, rem2, term0, term1, term2;
   3760     floatx80 z;
   3761 
   3762     aSig = extractFloatx80Frac( a );
   3763     aExp = extractFloatx80Exp( a );
   3764     aSign = extractFloatx80Sign( a );
   3765     bSig = extractFloatx80Frac( b );
   3766     bExp = extractFloatx80Exp( b );
   3767     bSign = extractFloatx80Sign( b );
   3768     zSign = aSign ^ bSign;
   3769     if ( aExp == 0x7FFF ) {
   3770         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3771         if ( bExp == 0x7FFF ) {
   3772             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3773             goto invalid;
   3774         }
   3775         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3776     }
   3777     if ( bExp == 0x7FFF ) {
   3778         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3779         return packFloatx80( zSign, 0, 0 );
   3780     }
   3781     if ( bExp == 0 ) {
   3782         if ( bSig == 0 ) {
   3783             if ( ( aExp | aSig ) == 0 ) {
   3784  invalid:
   3785                 float_raise( float_flag_invalid );
   3786                 z.low = floatx80_default_nan_low;
   3787                 z.high = floatx80_default_nan_high;
   3788                 return z;
   3789             }
   3790             float_raise( float_flag_divbyzero );
   3791             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3792         }
   3793         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
   3794     }
   3795     if ( aExp == 0 ) {
   3796         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
   3797         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
   3798     }
   3799     zExp = aExp - bExp + 0x3FFE;
   3800     rem1 = 0;
   3801     if ( bSig <= aSig ) {
   3802         shift128Right( aSig, 0, 1, &aSig, &rem1 );
   3803         ++zExp;
   3804     }
   3805     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
   3806     mul64To128( bSig, zSig0, &term0, &term1 );
   3807     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
   3808     while ( (sbits64) rem0 < 0 ) {
   3809         --zSig0;
   3810         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
   3811     }
   3812     zSig1 = estimateDiv128To64( rem1, 0, bSig );
   3813     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
   3814         mul64To128( bSig, zSig1, &term1, &term2 );
   3815         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
   3816         while ( (sbits64) rem1 < 0 ) {
   3817             --zSig1;
   3818             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
   3819         }
   3820         zSig1 |= ( ( rem1 | rem2 ) != 0 );
   3821     }
   3822     return
   3823         roundAndPackFloatx80(
   3824             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
   3825 
   3826 }
   3827 
   3828 /*----------------------------------------------------------------------------
   3829 | Returns the remainder of the extended double-precision floating-point value
   3830 | `a' with respect to the corresponding value `b'.  The operation is performed
   3831 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3832 *----------------------------------------------------------------------------*/
   3833 
   3834 floatx80 floatx80_rem( floatx80 a, floatx80 b )
   3835 {
   3836     flag aSign, bSign, zSign;
   3837     int32 aExp, bExp, expDiff;
   3838     bits64 aSig0, aSig1, bSig;
   3839     bits64 q, term0, term1, alternateASig0, alternateASig1;
   3840     floatx80 z;
   3841 
   3842     aSig0 = extractFloatx80Frac( a );
   3843     aExp = extractFloatx80Exp( a );
   3844     aSign = extractFloatx80Sign( a );
   3845     bSig = extractFloatx80Frac( b );
   3846     bExp = extractFloatx80Exp( b );
   3847     bSign = extractFloatx80Sign( b );
   3848     if ( aExp == 0x7FFF ) {
   3849         if (    (bits64) ( aSig0<<1 )
   3850              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
   3851             return propagateFloatx80NaN( a, b );
   3852         }
   3853         goto invalid;
   3854     }
   3855     if ( bExp == 0x7FFF ) {
   3856         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3857         return a;
   3858     }
   3859     if ( bExp == 0 ) {
   3860         if ( bSig == 0 ) {
   3861  invalid:
   3862             float_raise( float_flag_invalid );
   3863             z.low = floatx80_default_nan_low;
   3864             z.high = floatx80_default_nan_high;
   3865             return z;
   3866         }
   3867         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
   3868     }
   3869     if ( aExp == 0 ) {
   3870         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
   3871         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
   3872     }
   3873     bSig |= LIT64( 0x8000000000000000 );
   3874     zSign = aSign;
   3875     expDiff = aExp - bExp;
   3876     aSig1 = 0;
   3877     if ( expDiff < 0 ) {
   3878         if ( expDiff < -1 ) return a;
   3879         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
   3880         expDiff = 0;
   3881     }
   3882     q = ( bSig <= aSig0 );
   3883     if ( q ) aSig0 -= bSig;
   3884     expDiff -= 64;
   3885     while ( 0 < expDiff ) {
   3886         q = estimateDiv128To64( aSig0, aSig1, bSig );
   3887         q = ( 2 < q ) ? q - 2 : 0;
   3888         mul64To128( bSig, q, &term0, &term1 );
   3889         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
   3890         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
   3891         expDiff -= 62;
   3892     }
   3893     expDiff += 64;
   3894     if ( 0 < expDiff ) {
   3895         q = estimateDiv128To64( aSig0, aSig1, bSig );
   3896         q = ( 2 < q ) ? q - 2 : 0;
   3897         q >>= 64 - expDiff;
   3898         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
   3899         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
   3900         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
   3901         while ( le128( term0, term1, aSig0, aSig1 ) ) {
   3902             ++q;
   3903             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
   3904         }
   3905     }
   3906     else {
   3907         term1 = 0;
   3908         term0 = bSig;
   3909     }
   3910     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
   3911     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
   3912          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
   3913               && ( q & 1 ) )
   3914        ) {
   3915         aSig0 = alternateASig0;
   3916         aSig1 = alternateASig1;
   3917         zSign = ! zSign;
   3918     }
   3919     return
   3920         normalizeRoundAndPackFloatx80(
   3921             80, zSign, bExp + expDiff, aSig0, aSig1 );
   3922 
   3923 }
   3924 
   3925 /*----------------------------------------------------------------------------
   3926 | Returns the square root of the extended double-precision floating-point
   3927 | value `a'.  The operation is performed according to the IEC/IEEE Standard
   3928 | for Binary Floating-Point Arithmetic.
   3929 *----------------------------------------------------------------------------*/
   3930 
   3931 floatx80 floatx80_sqrt( floatx80 a )
   3932 {
   3933     flag aSign;
   3934     int32 aExp, zExp;
   3935     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
   3936     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
   3937     floatx80 z;
   3938 
   3939     aSig0 = extractFloatx80Frac( a );
   3940     aExp = extractFloatx80Exp( a );
   3941     aSign = extractFloatx80Sign( a );
   3942     if ( aExp == 0x7FFF ) {
   3943         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
   3944         if ( ! aSign ) return a;
   3945         goto invalid;
   3946     }
   3947     if ( aSign ) {
   3948         if ( ( aExp | aSig0 ) == 0 ) return a;
   3949  invalid:
   3950         float_raise( float_flag_invalid );
   3951         z.low = floatx80_default_nan_low;
   3952         z.high = floatx80_default_nan_high;
   3953         return z;
   3954     }
   3955     if ( aExp == 0 ) {
   3956         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
   3957         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
   3958     }
   3959     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
   3960     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
   3961     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
   3962     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
   3963     doubleZSig0 = zSig0<<1;
   3964     mul64To128( zSig0, zSig0, &term0, &term1 );
   3965     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
   3966     while ( (sbits64) rem0 < 0 ) {
   3967         --zSig0;
   3968         doubleZSig0 -= 2;
   3969         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
   3970     }
   3971     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
   3972     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
   3973         if ( zSig1 == 0 ) zSig1 = 1;
   3974         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
   3975         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
   3976         mul64To128( zSig1, zSig1, &term2, &term3 );
   3977         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
   3978         while ( (sbits64) rem1 < 0 ) {
   3979             --zSig1;
   3980             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
   3981             term3 |= 1;
   3982             term2 |= doubleZSig0;
   3983             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
   3984         }
   3985         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
   3986     }
   3987     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
   3988     zSig0 |= doubleZSig0;
   3989     return
   3990         roundAndPackFloatx80(
   3991             floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
   3992 
   3993 }
   3994 
   3995 /*----------------------------------------------------------------------------
   3996 | Returns 1 if the extended double-precision floating-point value `a' is
   3997 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
   3998 | performed according to the IEC/IEEE Standard for Binary Floating-Point
   3999 | Arithmetic.
   4000 *----------------------------------------------------------------------------*/
   4001 
   4002 flag floatx80_eq( floatx80 a, floatx80 b )
   4003 {
   4004 
   4005     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4006               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4007          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4008               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4009        ) {
   4010         if (    floatx80_is_signaling_nan( a )
   4011              || floatx80_is_signaling_nan( b ) ) {
   4012             float_raise( float_flag_invalid );
   4013         }
   4014         return 0;
   4015     }
   4016     return
   4017            ( a.low == b.low )
   4018         && (    ( a.high == b.high )
   4019              || (    ( a.low == 0 )
   4020                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
   4021            );
   4022 
   4023 }
   4024 
   4025 /*----------------------------------------------------------------------------
   4026 | Returns 1 if the extended double-precision floating-point value `a' is
   4027 | less than or equal to the corresponding value `b', and 0 otherwise.  The
   4028 | comparison is performed according to the IEC/IEEE Standard for Binary
   4029 | Floating-Point Arithmetic.
   4030 *----------------------------------------------------------------------------*/
   4031 
   4032 flag floatx80_le( floatx80 a, floatx80 b )
   4033 {
   4034     flag aSign, bSign;
   4035 
   4036     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4037               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4038          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4039               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4040        ) {
   4041         float_raise( float_flag_invalid );
   4042         return 0;
   4043     }
   4044     aSign = extractFloatx80Sign( a );
   4045     bSign = extractFloatx80Sign( b );
   4046     if ( aSign != bSign ) {
   4047         return
   4048                aSign
   4049             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   4050                  == 0 );
   4051     }
   4052     return
   4053           aSign ? le128( b.high, b.low, a.high, a.low )
   4054         : le128( a.high, a.low, b.high, b.low );
   4055 
   4056 }
   4057 
   4058 /*----------------------------------------------------------------------------
   4059 | Returns 1 if the extended double-precision floating-point value `a' is
   4060 | less than the corresponding value `b', and 0 otherwise.  The comparison
   4061 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4062 | Arithmetic.
   4063 *----------------------------------------------------------------------------*/
   4064 
   4065 flag floatx80_lt( floatx80 a, floatx80 b )
   4066 {
   4067     flag aSign, bSign;
   4068 
   4069     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4070               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4071          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4072               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4073        ) {
   4074         float_raise( float_flag_invalid );
   4075         return 0;
   4076     }
   4077     aSign = extractFloatx80Sign( a );
   4078     bSign = extractFloatx80Sign( b );
   4079     if ( aSign != bSign ) {
   4080         return
   4081                aSign
   4082             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   4083                  != 0 );
   4084     }
   4085     return
   4086           aSign ? lt128( b.high, b.low, a.high, a.low )
   4087         : lt128( a.high, a.low, b.high, b.low );
   4088 
   4089 }
   4090 
   4091 /*----------------------------------------------------------------------------
   4092 | Returns 1 if the extended double-precision floating-point value `a' is equal
   4093 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
   4094 | raised if either operand is a NaN.  Otherwise, the comparison is performed
   4095 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4096 *----------------------------------------------------------------------------*/
   4097 
   4098 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
   4099 {
   4100 
   4101     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4102               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4103          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4104               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4105        ) {
   4106         float_raise( float_flag_invalid );
   4107         return 0;
   4108     }
   4109     return
   4110            ( a.low == b.low )
   4111         && (    ( a.high == b.high )
   4112              || (    ( a.low == 0 )
   4113                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
   4114            );
   4115 
   4116 }
   4117 
   4118 /*----------------------------------------------------------------------------
   4119 | Returns 1 if the extended double-precision floating-point value `a' is less
   4120 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
   4121 | do not cause an exception.  Otherwise, the comparison is performed according
   4122 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4123 *----------------------------------------------------------------------------*/
   4124 
   4125 flag floatx80_le_quiet( floatx80 a, floatx80 b )
   4126 {
   4127     flag aSign, bSign;
   4128 
   4129     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4130               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4131          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4132               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4133        ) {
   4134         if (    floatx80_is_signaling_nan( a )
   4135              || floatx80_is_signaling_nan( b ) ) {
   4136             float_raise( float_flag_invalid );
   4137         }
   4138         return 0;
   4139     }
   4140     aSign = extractFloatx80Sign( a );
   4141     bSign = extractFloatx80Sign( b );
   4142     if ( aSign != bSign ) {
   4143         return
   4144                aSign
   4145             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   4146                  == 0 );
   4147     }
   4148     return
   4149           aSign ? le128( b.high, b.low, a.high, a.low )
   4150         : le128( a.high, a.low, b.high, b.low );
   4151 
   4152 }
   4153 
   4154 /*----------------------------------------------------------------------------
   4155 | Returns 1 if the extended double-precision floating-point value `a' is less
   4156 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
   4157 | an exception.  Otherwise, the comparison is performed according to the
   4158 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4159 *----------------------------------------------------------------------------*/
   4160 
   4161 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
   4162 {
   4163     flag aSign, bSign;
   4164 
   4165     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4166               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4167          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4168               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4169        ) {
   4170         if (    floatx80_is_signaling_nan( a )
   4171              || floatx80_is_signaling_nan( b ) ) {
   4172             float_raise( float_flag_invalid );
   4173         }
   4174         return 0;
   4175     }
   4176     aSign = extractFloatx80Sign( a );
   4177     bSign = extractFloatx80Sign( b );
   4178     if ( aSign != bSign ) {
   4179         return
   4180                aSign
   4181             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   4182                  != 0 );
   4183     }
   4184     return
   4185           aSign ? lt128( b.high, b.low, a.high, a.low )
   4186         : lt128( a.high, a.low, b.high, b.low );
   4187 
   4188 }
   4189 
   4190 #endif
   4191 
   4192 #ifdef FLOAT128
   4193 
   4194 /*----------------------------------------------------------------------------
   4195 | Returns the result of converting the quadruple-precision floating-point
   4196 | value `a' to the 32-bit two's complement integer format.  The conversion
   4197 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4198 | Arithmetic---which means in particular that the conversion is rounded
   4199 | according to the current rounding mode.  If `a' is a NaN, the largest
   4200 | positive integer is returned.  Otherwise, if the conversion overflows, the
   4201 | largest integer with the same sign as `a' is returned.
   4202 *----------------------------------------------------------------------------*/
   4203 
   4204 int32 float128_to_int32( float128 a )
   4205 {
   4206     flag aSign;
   4207     int32 aExp, shiftCount;
   4208     bits64 aSig0, aSig1;
   4209 
   4210     aSig1 = extractFloat128Frac1( a );
   4211     aSig0 = extractFloat128Frac0( a );
   4212     aExp = extractFloat128Exp( a );
   4213     aSign = extractFloat128Sign( a );
   4214     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
   4215     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
   4216     aSig0 |= ( aSig1 != 0 );
   4217     shiftCount = 0x4028 - aExp;
   4218     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
   4219     return roundAndPackInt32( aSign, aSig0 );
   4220 
   4221 }
   4222 
   4223 /*----------------------------------------------------------------------------
   4224 | Returns the result of converting the quadruple-precision floating-point
   4225 | value `a' to the 32-bit two's complement integer format.  The conversion
   4226 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4227 | Arithmetic, except that the conversion is always rounded toward zero.  If
   4228 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
   4229 | conversion overflows, the largest integer with the same sign as `a' is
   4230 | returned.
   4231 *----------------------------------------------------------------------------*/
   4232 
   4233 int32 float128_to_int32_round_to_zero( float128 a )
   4234 {
   4235     flag aSign;
   4236     int32 aExp, shiftCount;
   4237     bits64 aSig0, aSig1, savedASig;
   4238     int32 z;
   4239 
   4240     aSig1 = extractFloat128Frac1( a );
   4241     aSig0 = extractFloat128Frac0( a );
   4242     aExp = extractFloat128Exp( a );
   4243     aSign = extractFloat128Sign( a );
   4244     aSig0 |= ( aSig1 != 0 );
   4245     if ( 0x401E < aExp ) {
   4246         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
   4247         goto invalid;
   4248     }
   4249     else if ( aExp < 0x3FFF ) {
   4250         if ( aExp || aSig0 ) float_set_inexact();
   4251         return 0;
   4252     }
   4253     aSig0 |= LIT64( 0x0001000000000000 );
   4254     shiftCount = 0x402F - aExp;
   4255     savedASig = aSig0;
   4256     aSig0 >>= shiftCount;
   4257     z = aSig0;
   4258     if ( aSign ) z = - z;
   4259     if ( ( z < 0 ) ^ aSign ) {
   4260  invalid:
   4261         float_raise( float_flag_invalid );
   4262         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
   4263     }
   4264     if ( ( aSig0<<shiftCount ) != savedASig ) {
   4265         float_set_inexact();
   4266     }
   4267     return z;
   4268 
   4269 }
   4270 
   4271 /*----------------------------------------------------------------------------
   4272 | Returns the result of converting the quadruple-precision floating-point
   4273 | value `a' to the 64-bit two's complement integer format.  The conversion
   4274 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4275 | Arithmetic---which means in particular that the conversion is rounded
   4276 | according to the current rounding mode.  If `a' is a NaN, the largest
   4277 | positive integer is returned.  Otherwise, if the conversion overflows, the
   4278 | largest integer with the same sign as `a' is returned.
   4279 *----------------------------------------------------------------------------*/
   4280 
   4281 int64 float128_to_int64( float128 a )
   4282 {
   4283     flag aSign;
   4284     int32 aExp, shiftCount;
   4285     bits64 aSig0, aSig1;
   4286 
   4287     aSig1 = extractFloat128Frac1( a );
   4288     aSig0 = extractFloat128Frac0( a );
   4289     aExp = extractFloat128Exp( a );
   4290     aSign = extractFloat128Sign( a );
   4291     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
   4292     shiftCount = 0x402F - aExp;
   4293     if ( shiftCount <= 0 ) {
   4294         if ( 0x403E < aExp ) {
   4295             float_raise( float_flag_invalid );
   4296             if (    ! aSign
   4297                  || (    ( aExp == 0x7FFF )
   4298                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
   4299                     )
   4300                ) {
   4301                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   4302             }
   4303             return (sbits64) LIT64( 0x8000000000000000 );
   4304         }
   4305         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
   4306     }
   4307     else {
   4308         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
   4309     }
   4310     return roundAndPackInt64( aSign, aSig0, aSig1 );
   4311 
   4312 }
   4313 
   4314 /*----------------------------------------------------------------------------
   4315 | Returns the result of converting the quadruple-precision floating-point
   4316 | value `a' to the 64-bit two's complement integer format.  The conversion
   4317 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4318 | Arithmetic, except that the conversion is always rounded toward zero.
   4319 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   4320 | the conversion overflows, the largest integer with the same sign as `a' is
   4321 | returned.
   4322 *----------------------------------------------------------------------------*/
   4323 
   4324 int64 float128_to_int64_round_to_zero( float128 a )
   4325 {
   4326     flag aSign;
   4327     int32 aExp, shiftCount;
   4328     bits64 aSig0, aSig1;
   4329     int64 z;
   4330 
   4331     aSig1 = extractFloat128Frac1( a );
   4332     aSig0 = extractFloat128Frac0( a );
   4333     aExp = extractFloat128Exp( a );
   4334     aSign = extractFloat128Sign( a );
   4335     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
   4336     shiftCount = aExp - 0x402F;
   4337     if ( 0 < shiftCount ) {
   4338         if ( 0x403E <= aExp ) {
   4339             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
   4340             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
   4341                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
   4342                 if ( aSig1 ) float_set_inexact();
   4343             }
   4344             else {
   4345                 float_raise( float_flag_invalid );
   4346                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
   4347                     return LIT64( 0x7FFFFFFFFFFFFFFF );
   4348                 }
   4349             }
   4350             return (sbits64) LIT64( 0x8000000000000000 );
   4351         }
   4352         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
   4353         if ( (bits64) ( aSig1<<shiftCount ) ) {
   4354             float_set_inexact();
   4355         }
   4356     }
   4357     else {
   4358         if ( aExp < 0x3FFF ) {
   4359             if ( aExp | aSig0 | aSig1 ) {
   4360                 float_set_inexact();
   4361             }
   4362             return 0;
   4363         }
   4364         z = aSig0>>( - shiftCount );
   4365         if (    aSig1
   4366              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
   4367             float_set_inexact();
   4368         }
   4369     }
   4370     if ( aSign ) z = - z;
   4371     return z;
   4372 
   4373 }
   4374 
   4375 /*----------------------------------------------------------------------------
   4376 | Returns the result of converting the quadruple-precision floating-point
   4377 | value `a' to the single-precision floating-point format.  The conversion
   4378 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4379 | Arithmetic.
   4380 *----------------------------------------------------------------------------*/
   4381 
   4382 float32 float128_to_float32( float128 a )
   4383 {
   4384     flag aSign;
   4385     int32 aExp;
   4386     bits64 aSig0, aSig1;
   4387     bits32 zSig;
   4388 
   4389     aSig1 = extractFloat128Frac1( a );
   4390     aSig0 = extractFloat128Frac0( a );
   4391     aExp = extractFloat128Exp( a );
   4392     aSign = extractFloat128Sign( a );
   4393     if ( aExp == 0x7FFF ) {
   4394         if ( aSig0 | aSig1 ) {
   4395             return commonNaNToFloat32( float128ToCommonNaN( a ) );
   4396         }
   4397         return packFloat32( aSign, 0xFF, 0 );
   4398     }
   4399     aSig0 |= ( aSig1 != 0 );
   4400     shift64RightJamming( aSig0, 18, &aSig0 );
   4401     zSig = aSig0;
   4402     if ( aExp || zSig ) {
   4403         zSig |= 0x40000000;
   4404         aExp -= 0x3F81;
   4405     }
   4406     return roundAndPackFloat32( aSign, aExp, zSig );
   4407 
   4408 }
   4409 
   4410 /*----------------------------------------------------------------------------
   4411 | Returns the result of converting the quadruple-precision floating-point
   4412 | value `a' to the double-precision floating-point format.  The conversion
   4413 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4414 | Arithmetic.
   4415 *----------------------------------------------------------------------------*/
   4416 
   4417 float64 float128_to_float64( float128 a )
   4418 {
   4419     flag aSign;
   4420     int32 aExp;
   4421     bits64 aSig0, aSig1;
   4422 
   4423     aSig1 = extractFloat128Frac1( a );
   4424     aSig0 = extractFloat128Frac0( a );
   4425     aExp = extractFloat128Exp( a );
   4426     aSign = extractFloat128Sign( a );
   4427     if ( aExp == 0x7FFF ) {
   4428         if ( aSig0 | aSig1 ) {
   4429             return commonNaNToFloat64( float128ToCommonNaN( a ) );
   4430         }
   4431         return packFloat64( aSign, 0x7FF, 0 );
   4432     }
   4433     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
   4434     aSig0 |= ( aSig1 != 0 );
   4435     if ( aExp || aSig0 ) {
   4436         aSig0 |= LIT64( 0x4000000000000000 );
   4437         aExp -= 0x3C01;
   4438     }
   4439     return roundAndPackFloat64( aSign, aExp, aSig0 );
   4440 
   4441 }
   4442 
   4443 #ifdef FLOATX80
   4444 
   4445 /*----------------------------------------------------------------------------
   4446 | Returns the result of converting the quadruple-precision floating-point
   4447 | value `a' to the extended double-precision floating-point format.  The
   4448 | conversion is performed according to the IEC/IEEE Standard for Binary
   4449 | Floating-Point Arithmetic.
   4450 *----------------------------------------------------------------------------*/
   4451 
   4452 floatx80 float128_to_floatx80( float128 a )
   4453 {
   4454     flag aSign;
   4455     int32 aExp;
   4456     bits64 aSig0, aSig1;
   4457 
   4458     aSig1 = extractFloat128Frac1( a );
   4459     aSig0 = extractFloat128Frac0( a );
   4460     aExp = extractFloat128Exp( a );
   4461     aSign = extractFloat128Sign( a );
   4462     if ( aExp == 0x7FFF ) {
   4463         if ( aSig0 | aSig1 ) {
   4464             return commonNaNToFloatx80( float128ToCommonNaN( a ) );
   4465         }
   4466         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   4467     }
   4468     if ( aExp == 0 ) {
   4469         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
   4470         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
   4471     }
   4472     else {
   4473         aSig0 |= LIT64( 0x0001000000000000 );
   4474     }
   4475     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
   4476     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
   4477 
   4478 }
   4479 
   4480 #endif
   4481 
   4482 /*----------------------------------------------------------------------------
   4483 | Rounds the quadruple-precision floating-point value `a' to an integer, and
   4484 | returns the result as a quadruple-precision floating-point value.  The
   4485 | operation is performed according to the IEC/IEEE Standard for Binary
   4486 | Floating-Point Arithmetic.
   4487 *----------------------------------------------------------------------------*/
   4488 
   4489 float128 float128_round_to_int( float128 a )
   4490 {
   4491     flag aSign;
   4492     int32 aExp;
   4493     bits64 lastBitMask, roundBitsMask;
   4494     int8 roundingMode;
   4495     float128 z;
   4496 
   4497     aExp = extractFloat128Exp( a );
   4498     if ( 0x402F <= aExp ) {
   4499         if ( 0x406F <= aExp ) {
   4500             if (    ( aExp == 0x7FFF )
   4501                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
   4502                ) {
   4503                 return propagateFloat128NaN( a, a );
   4504             }
   4505             return a;
   4506         }
   4507         lastBitMask = 1;
   4508         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
   4509         roundBitsMask = lastBitMask - 1;
   4510         z = a;
   4511         roundingMode = float_rounding_mode();
   4512         if ( roundingMode == float_round_nearest_even ) {
   4513             if ( lastBitMask ) {
   4514                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
   4515                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
   4516             }
   4517             else {
   4518                 if ( (sbits64) z.low < 0 ) {
   4519                     ++z.high;
   4520                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
   4521                 }
   4522             }
   4523         }
   4524         else if ( roundingMode != float_round_to_zero ) {
   4525             if (   extractFloat128Sign( z )
   4526                  ^ ( roundingMode == float_round_up ) ) {
   4527                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
   4528             }
   4529         }
   4530         z.low &= ~ roundBitsMask;
   4531     }
   4532     else {
   4533         if ( aExp < 0x3FFF ) {
   4534             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
   4535             float_set_inexact();
   4536             aSign = extractFloat128Sign( a );
   4537             switch ( float_rounding_mode() ) {
   4538              case float_round_nearest_even:
   4539                 if (    ( aExp == 0x3FFE )
   4540                      && (   extractFloat128Frac0( a )
   4541                           | extractFloat128Frac1( a ) )
   4542                    ) {
   4543                     return packFloat128( aSign, 0x3FFF, 0, 0 );
   4544                 }
   4545                 break;
   4546              case float_round_down:
   4547                 return
   4548                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
   4549                     : packFloat128( 0, 0, 0, 0 );
   4550              case float_round_up:
   4551                 return
   4552                       aSign ? packFloat128( 1, 0, 0, 0 )
   4553                     : packFloat128( 0, 0x3FFF, 0, 0 );
   4554             }
   4555             return packFloat128( aSign, 0, 0, 0 );
   4556         }
   4557         lastBitMask = 1;
   4558         lastBitMask <<= 0x402F - aExp;
   4559         roundBitsMask = lastBitMask - 1;
   4560         z.low = 0;
   4561         z.high = a.high;
   4562         roundingMode = float_rounding_mode();
   4563         if ( roundingMode == float_round_nearest_even ) {
   4564             z.high += lastBitMask>>1;
   4565             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
   4566                 z.high &= ~ lastBitMask;
   4567             }
   4568         }
   4569         else if ( roundingMode != float_round_to_zero ) {
   4570             if (   extractFloat128Sign( z )
   4571                  ^ ( roundingMode == float_round_up ) ) {
   4572                 z.high |= ( a.low != 0 );
   4573                 z.high += roundBitsMask;
   4574             }
   4575         }
   4576         z.high &= ~ roundBitsMask;
   4577     }
   4578     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
   4579         float_set_inexact();
   4580     }
   4581     return z;
   4582 
   4583 }
   4584 
   4585 /*----------------------------------------------------------------------------
   4586 | Returns the result of adding the absolute values of the quadruple-precision
   4587 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
   4588 | before being returned.  `zSign' is ignored if the result is a NaN.
   4589 | The addition is performed according to the IEC/IEEE Standard for Binary
   4590 | Floating-Point Arithmetic.
   4591 *----------------------------------------------------------------------------*/
   4592 
   4593 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
   4594 {
   4595     int32 aExp, bExp, zExp;
   4596     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
   4597     int32 expDiff;
   4598 
   4599     aSig1 = extractFloat128Frac1( a );
   4600     aSig0 = extractFloat128Frac0( a );
   4601     aExp = extractFloat128Exp( a );
   4602     bSig1 = extractFloat128Frac1( b );
   4603     bSig0 = extractFloat128Frac0( b );
   4604     bExp = extractFloat128Exp( b );
   4605     expDiff = aExp - bExp;
   4606     if ( 0 < expDiff ) {
   4607         if ( aExp == 0x7FFF ) {
   4608             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
   4609             return a;
   4610         }
   4611         if ( bExp == 0 ) {
   4612             --expDiff;
   4613         }
   4614         else {
   4615             bSig0 |= LIT64( 0x0001000000000000 );
   4616         }
   4617         shift128ExtraRightJamming(
   4618             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
   4619         zExp = aExp;
   4620     }
   4621     else if ( expDiff < 0 ) {
   4622         if ( bExp == 0x7FFF ) {
   4623             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
   4624             return packFloat128( zSign, 0x7FFF, 0, 0 );
   4625         }
   4626         if ( aExp == 0 ) {
   4627             ++expDiff;
   4628         }
   4629         else {
   4630             aSig0 |= LIT64( 0x0001000000000000 );
   4631         }
   4632         shift128ExtraRightJamming(
   4633             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
   4634         zExp = bExp;
   4635     }
   4636     else {
   4637         if ( aExp == 0x7FFF ) {
   4638             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
   4639                 return propagateFloat128NaN( a, b );
   4640             }
   4641             return a;
   4642         }
   4643         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
   4644         if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
   4645         zSig2 = 0;
   4646         zSig0 |= LIT64( 0x0002000000000000 );
   4647         zExp = aExp;
   4648         goto shiftRight1;
   4649     }
   4650     aSig0 |= LIT64( 0x0001000000000000 );
   4651     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
   4652     --zExp;
   4653     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
   4654     ++zExp;
   4655  shiftRight1:
   4656     shift128ExtraRightJamming(
   4657         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
   4658  roundAndPack:
   4659     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
   4660 
   4661 }
   4662 
   4663 /*----------------------------------------------------------------------------
   4664 | Returns the result of subtracting the absolute values of the quadruple-
   4665 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
   4666 | difference is negated before being returned.  `zSign' is ignored if the
   4667 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
   4668 | Standard for Binary Floating-Point Arithmetic.
   4669 *----------------------------------------------------------------------------*/
   4670 
   4671 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
   4672 {
   4673     int32 aExp, bExp, zExp;
   4674     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
   4675     int32 expDiff;
   4676     float128 z;
   4677 
   4678     aSig1 = extractFloat128Frac1( a );
   4679     aSig0 = extractFloat128Frac0( a );
   4680     aExp = extractFloat128Exp( a );
   4681     bSig1 = extractFloat128Frac1( b );
   4682     bSig0 = extractFloat128Frac0( b );
   4683     bExp = extractFloat128Exp( b );
   4684     expDiff = aExp - bExp;
   4685     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
   4686     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
   4687     if ( 0 < expDiff ) goto aExpBigger;
   4688     if ( expDiff < 0 ) goto bExpBigger;
   4689     if ( aExp == 0x7FFF ) {
   4690         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
   4691             return propagateFloat128NaN( a, b );
   4692         }
   4693         float_raise( float_flag_invalid );
   4694         z.low = float128_default_nan_low;
   4695         z.high = float128_default_nan_high;
   4696         return z;
   4697     }
   4698     if ( aExp == 0 ) {
   4699         aExp = 1;
   4700         bExp = 1;
   4701     }
   4702     if ( bSig0 < aSig0 ) goto aBigger;
   4703     if ( aSig0 < bSig0 ) goto bBigger;
   4704     if ( bSig1 < aSig1 ) goto aBigger;
   4705     if ( aSig1 < bSig1 ) goto bBigger;
   4706     return packFloat128( float_rounding_mode() == float_round_down, 0, 0, 0 );
   4707  bExpBigger:
   4708     if ( bExp == 0x7FFF ) {
   4709         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
   4710         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
   4711     }
   4712     if ( aExp == 0 ) {
   4713         ++expDiff;
   4714     }
   4715     else {
   4716         aSig0 |= LIT64( 0x4000000000000000 );
   4717     }
   4718     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
   4719     bSig0 |= LIT64( 0x4000000000000000 );
   4720  bBigger:
   4721     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
   4722     zExp = bExp;
   4723     zSign ^= 1;
   4724     goto normalizeRoundAndPack;
   4725  aExpBigger:
   4726     if ( aExp == 0x7FFF ) {
   4727         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
   4728         return a;
   4729     }
   4730     if ( bExp == 0 ) {
   4731         --expDiff;
   4732     }
   4733     else {
   4734         bSig0 |= LIT64( 0x4000000000000000 );
   4735     }
   4736     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
   4737     aSig0 |= LIT64( 0x4000000000000000 );
   4738  aBigger:
   4739     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
   4740     zExp = aExp;
   4741  normalizeRoundAndPack:
   4742     --zExp;
   4743     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
   4744 
   4745 }
   4746 
   4747 /*----------------------------------------------------------------------------
   4748 | Returns the result of adding the quadruple-precision floating-point values
   4749 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   4750 | for Binary Floating-Point Arithmetic.
   4751 *----------------------------------------------------------------------------*/
   4752 
   4753 float128 float128_add( float128 a, float128 b )
   4754 {
   4755     flag aSign, bSign;
   4756 
   4757     aSign = extractFloat128Sign( a );
   4758     bSign = extractFloat128Sign( b );
   4759     if ( aSign == bSign ) {
   4760         return addFloat128Sigs( a, b, aSign );
   4761     }
   4762     else {
   4763         return subFloat128Sigs( a, b, aSign );
   4764     }
   4765 
   4766 }
   4767 
   4768 /*----------------------------------------------------------------------------
   4769 | Returns the result of subtracting the quadruple-precision floating-point
   4770 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
   4771 | Standard for Binary Floating-Point Arithmetic.
   4772 *----------------------------------------------------------------------------*/
   4773 
   4774 float128 float128_sub( float128 a, float128 b )
   4775 {
   4776     flag aSign, bSign;
   4777 
   4778     aSign = extractFloat128Sign( a );
   4779     bSign = extractFloat128Sign( b );
   4780     if ( aSign == bSign ) {
   4781         return subFloat128Sigs( a, b, aSign );
   4782     }
   4783     else {
   4784         return addFloat128Sigs( a, b, aSign );
   4785     }
   4786 
   4787 }
   4788 
   4789 /*----------------------------------------------------------------------------
   4790 | Returns the result of multiplying the quadruple-precision floating-point
   4791 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
   4792 | Standard for Binary Floating-Point Arithmetic.
   4793 *----------------------------------------------------------------------------*/
   4794 
   4795 float128 float128_mul( float128 a, float128 b )
   4796 {
   4797     flag aSign, bSign, zSign;
   4798     int32 aExp, bExp, zExp;
   4799     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
   4800     float128 z;
   4801 
   4802     aSig1 = extractFloat128Frac1( a );
   4803     aSig0 = extractFloat128Frac0( a );
   4804     aExp = extractFloat128Exp( a );
   4805     aSign = extractFloat128Sign( a );
   4806     bSig1 = extractFloat128Frac1( b );
   4807     bSig0 = extractFloat128Frac0( b );
   4808     bExp = extractFloat128Exp( b );
   4809     bSign = extractFloat128Sign( b );
   4810     zSign = aSign ^ bSign;
   4811     if ( aExp == 0x7FFF ) {
   4812         if (    ( aSig0 | aSig1 )
   4813              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
   4814             return propagateFloat128NaN( a, b );
   4815         }
   4816         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
   4817         return packFloat128( zSign, 0x7FFF, 0, 0 );
   4818     }
   4819     if ( bExp == 0x7FFF ) {
   4820         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
   4821         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
   4822  invalid:
   4823             float_raise( float_flag_invalid );
   4824             z.low = float128_default_nan_low;
   4825             z.high = float128_default_nan_high;
   4826             return z;
   4827         }
   4828         return packFloat128( zSign, 0x7FFF, 0, 0 );
   4829     }
   4830     if ( aExp == 0 ) {
   4831         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
   4832         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
   4833     }
   4834     if ( bExp == 0 ) {
   4835         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
   4836         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
   4837     }
   4838     zExp = aExp + bExp - 0x4000;
   4839     aSig0 |= LIT64( 0x0001000000000000 );
   4840     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
   4841     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
   4842     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
   4843     zSig2 |= ( zSig3 != 0 );
   4844     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
   4845         shift128ExtraRightJamming(
   4846             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
   4847         ++zExp;
   4848     }
   4849     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
   4850 
   4851 }
   4852 
   4853 /*----------------------------------------------------------------------------
   4854 | Returns the result of dividing the quadruple-precision floating-point value
   4855 | `a' by the corresponding value `b'.  The operation is performed according to
   4856 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4857 *----------------------------------------------------------------------------*/
   4858 
   4859 float128 float128_div( float128 a, float128 b )
   4860 {
   4861     flag aSign, bSign, zSign;
   4862     int32 aExp, bExp, zExp;
   4863     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
   4864     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
   4865     float128 z;
   4866 
   4867     aSig1 = extractFloat128Frac1( a );
   4868     aSig0 = extractFloat128Frac0( a );
   4869     aExp = extractFloat128Exp( a );
   4870     aSign = extractFloat128Sign( a );
   4871     bSig1 = extractFloat128Frac1( b );
   4872     bSig0 = extractFloat128Frac0( b );
   4873     bExp = extractFloat128Exp( b );
   4874     bSign = extractFloat128Sign( b );
   4875     zSign = aSign ^ bSign;
   4876     if ( aExp == 0x7FFF ) {
   4877         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
   4878         if ( bExp == 0x7FFF ) {
   4879             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
   4880             goto invalid;
   4881         }
   4882         return packFloat128( zSign, 0x7FFF, 0, 0 );
   4883     }
   4884     if ( bExp == 0x7FFF ) {
   4885         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
   4886         return packFloat128( zSign, 0, 0, 0 );
   4887     }
   4888     if ( bExp == 0 ) {
   4889         if ( ( bSig0 | bSig1 ) == 0 ) {
   4890             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
   4891  invalid:
   4892                 float_raise( float_flag_invalid );
   4893                 z.low = float128_default_nan_low;
   4894                 z.high = float128_default_nan_high;
   4895                 return z;
   4896             }
   4897             float_raise( float_flag_divbyzero );
   4898             return packFloat128( zSign, 0x7FFF, 0, 0 );
   4899         }
   4900         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
   4901     }
   4902     if ( aExp == 0 ) {
   4903         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
   4904         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
   4905     }
   4906     zExp = aExp - bExp + 0x3FFD;
   4907     shortShift128Left(
   4908         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
   4909     shortShift128Left(
   4910         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
   4911     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
   4912         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
   4913         ++zExp;
   4914     }
   4915     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
   4916     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
   4917     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
   4918     while ( (sbits64) rem0 < 0 ) {
   4919         --zSig0;
   4920         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
   4921     }
   4922     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
   4923     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
   4924         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
   4925         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
   4926         while ( (sbits64) rem1 < 0 ) {
   4927             --zSig1;
   4928             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
   4929         }
   4930         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
   4931     }
   4932     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
   4933     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
   4934 
   4935 }
   4936 
   4937 /*----------------------------------------------------------------------------
   4938 | Returns the remainder of the quadruple-precision floating-point value `a'
   4939 | with respect to the corresponding value `b'.  The operation is performed
   4940 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4941 *----------------------------------------------------------------------------*/
   4942 
   4943 float128 float128_rem( float128 a, float128 b )
   4944 {
   4945     flag aSign, bSign, zSign;
   4946     int32 aExp, bExp, expDiff;
   4947     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
   4948     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
   4949     sbits64 sigMean0;
   4950     float128 z;
   4951 
   4952     aSig1 = extractFloat128Frac1( a );
   4953     aSig0 = extractFloat128Frac0( a );
   4954     aExp = extractFloat128Exp( a );
   4955     aSign = extractFloat128Sign( a );
   4956     bSig1 = extractFloat128Frac1( b );
   4957     bSig0 = extractFloat128Frac0( b );
   4958     bExp = extractFloat128Exp( b );
   4959     bSign = extractFloat128Sign( b );
   4960     if ( aExp == 0x7FFF ) {
   4961         if (    ( aSig0 | aSig1 )
   4962              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
   4963             return propagateFloat128NaN( a, b );
   4964         }
   4965         goto invalid;
   4966     }
   4967     if ( bExp == 0x7FFF ) {
   4968         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
   4969         return a;
   4970     }
   4971     if ( bExp == 0 ) {
   4972         if ( ( bSig0 | bSig1 ) == 0 ) {
   4973  invalid:
   4974             float_raise( float_flag_invalid );
   4975             z.low = float128_default_nan_low;
   4976             z.high = float128_default_nan_high;
   4977             return z;
   4978         }
   4979         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
   4980     }
   4981     if ( aExp == 0 ) {
   4982         if ( ( aSig0 | aSig1 ) == 0 ) return a;
   4983         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
   4984     }
   4985     expDiff = aExp - bExp;
   4986     if ( expDiff < -1 ) return a;
   4987     shortShift128Left(
   4988         aSig0 | LIT64( 0x0001000000000000 ),
   4989         aSig1,
   4990         15 - ( expDiff < 0 ),
   4991         &aSig0,
   4992         &aSig1
   4993     );
   4994     shortShift128Left(
   4995         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
   4996     q = le128( bSig0, bSig1, aSig0, aSig1 );
   4997     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
   4998     expDiff -= 64;
   4999     while ( 0 < expDiff ) {
   5000         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
   5001         q = ( 4 < q ) ? q - 4 : 0;
   5002         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
   5003         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
   5004         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
   5005         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
   5006         expDiff -= 61;
   5007     }
   5008     if ( -64 < expDiff ) {
   5009         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
   5010         q = ( 4 < q ) ? q - 4 : 0;
   5011         q >>= - expDiff;
   5012         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
   5013         expDiff += 52;
   5014         if ( expDiff < 0 ) {
   5015             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
   5016         }
   5017         else {
   5018             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
   5019         }
   5020         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
   5021         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
   5022     }
   5023     else {
   5024         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
   5025         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
   5026     }
   5027     do {
   5028         alternateASig0 = aSig0;
   5029         alternateASig1 = aSig1;
   5030         ++q;
   5031         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
   5032     } while ( 0 <= (sbits64) aSig0 );
   5033     add128(
   5034         aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
   5035     if (    ( sigMean0 < 0 )
   5036          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
   5037         aSig0 = alternateASig0;
   5038         aSig1 = alternateASig1;
   5039     }
   5040     zSign = ( (sbits64) aSig0 < 0 );
   5041     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
   5042     return
   5043         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
   5044 
   5045 }
   5046 
   5047 /*----------------------------------------------------------------------------
   5048 | Returns the square root of the quadruple-precision floating-point value `a'.
   5049 | The operation is performed according to the IEC/IEEE Standard for Binary
   5050 | Floating-Point Arithmetic.
   5051 *----------------------------------------------------------------------------*/
   5052 
   5053 float128 float128_sqrt( float128 a )
   5054 {
   5055     flag aSign;
   5056     int32 aExp, zExp;
   5057     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
   5058     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
   5059     float128 z;
   5060 
   5061     aSig1 = extractFloat128Frac1( a );
   5062     aSig0 = extractFloat128Frac0( a );
   5063     aExp = extractFloat128Exp( a );
   5064     aSign = extractFloat128Sign( a );
   5065     if ( aExp == 0x7FFF ) {
   5066         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
   5067         if ( ! aSign ) return a;
   5068         goto invalid;
   5069     }
   5070     if ( aSign ) {
   5071         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
   5072  invalid:
   5073         float_raise( float_flag_invalid );
   5074         z.low = float128_default_nan_low;
   5075         z.high = float128_default_nan_high;
   5076         return z;
   5077     }
   5078     if ( aExp == 0 ) {
   5079         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
   5080         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
   5081     }
   5082     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
   5083     aSig0 |= LIT64( 0x0001000000000000 );
   5084     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
   5085     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
   5086     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
   5087     doubleZSig0 = zSig0<<1;
   5088     mul64To128( zSig0, zSig0, &term0, &term1 );
   5089     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
   5090     while ( (sbits64) rem0 < 0 ) {
   5091         --zSig0;
   5092         doubleZSig0 -= 2;
   5093         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
   5094     }
   5095     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
   5096     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
   5097         if ( zSig1 == 0 ) zSig1 = 1;
   5098         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
   5099         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
   5100         mul64To128( zSig1, zSig1, &term2, &term3 );
   5101         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
   5102         while ( (sbits64) rem1 < 0 ) {
   5103             --zSig1;
   5104             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
   5105             term3 |= 1;
   5106             term2 |= doubleZSig0;
   5107             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
   5108         }
   5109         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
   5110     }
   5111     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
   5112     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
   5113 
   5114 }
   5115 
   5116 /*----------------------------------------------------------------------------
   5117 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
   5118 | the corresponding value `b', and 0 otherwise.  The comparison is performed
   5119 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5120 *----------------------------------------------------------------------------*/
   5121 
   5122 flag float128_eq( float128 a, float128 b )
   5123 {
   5124 
   5125     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5126               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5127          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5128               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5129        ) {
   5130         if (    float128_is_signaling_nan( a )
   5131              || float128_is_signaling_nan( b ) ) {
   5132             float_raise( float_flag_invalid );
   5133         }
   5134         return 0;
   5135     }
   5136     return
   5137            ( a.low == b.low )
   5138         && (    ( a.high == b.high )
   5139              || (    ( a.low == 0 )
   5140                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
   5141            );
   5142 
   5143 }
   5144 
   5145 /*----------------------------------------------------------------------------
   5146 | Returns 1 if the quadruple-precision floating-point value `a' is less than
   5147 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
   5148 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   5149 | Arithmetic.
   5150 *----------------------------------------------------------------------------*/
   5151 
   5152 flag float128_le( float128 a, float128 b )
   5153 {
   5154     flag aSign, bSign;
   5155 
   5156     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5157               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5158          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5159               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5160        ) {
   5161         float_raise( float_flag_invalid );
   5162         return 0;
   5163     }
   5164     aSign = extractFloat128Sign( a );
   5165     bSign = extractFloat128Sign( b );
   5166     if ( aSign != bSign ) {
   5167         return
   5168                aSign
   5169             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   5170                  == 0 );
   5171     }
   5172     return
   5173           aSign ? le128( b.high, b.low, a.high, a.low )
   5174         : le128( a.high, a.low, b.high, b.low );
   5175 
   5176 }
   5177 
   5178 /*----------------------------------------------------------------------------
   5179 | Returns 1 if the quadruple-precision floating-point value `a' is less than
   5180 | the corresponding value `b', and 0 otherwise.  The comparison is performed
   5181 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5182 *----------------------------------------------------------------------------*/
   5183 
   5184 flag float128_lt( float128 a, float128 b )
   5185 {
   5186     flag aSign, bSign;
   5187 
   5188     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5189               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5190          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5191               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5192        ) {
   5193         float_raise( float_flag_invalid );
   5194         return 0;
   5195     }
   5196     aSign = extractFloat128Sign( a );
   5197     bSign = extractFloat128Sign( b );
   5198     if ( aSign != bSign ) {
   5199         return
   5200                aSign
   5201             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   5202                  != 0 );
   5203     }
   5204     return
   5205           aSign ? lt128( b.high, b.low, a.high, a.low )
   5206         : lt128( a.high, a.low, b.high, b.low );
   5207 
   5208 }
   5209 
   5210 /*----------------------------------------------------------------------------
   5211 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
   5212 | the corresponding value `b', and 0 otherwise.  The invalid exception is
   5213 | raised if either operand is a NaN.  Otherwise, the comparison is performed
   5214 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5215 *----------------------------------------------------------------------------*/
   5216 
   5217 flag float128_eq_signaling( float128 a, float128 b )
   5218 {
   5219 
   5220     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5221               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5222          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5223               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5224        ) {
   5225         float_raise( float_flag_invalid );
   5226         return 0;
   5227     }
   5228     return
   5229            ( a.low == b.low )
   5230         && (    ( a.high == b.high )
   5231              || (    ( a.low == 0 )
   5232                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
   5233            );
   5234 
   5235 }
   5236 
   5237 /*----------------------------------------------------------------------------
   5238 | Returns 1 if the quadruple-precision floating-point value `a' is less than
   5239 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
   5240 | cause an exception.  Otherwise, the comparison is performed according to the
   5241 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5242 *----------------------------------------------------------------------------*/
   5243 
   5244 flag float128_le_quiet( float128 a, float128 b )
   5245 {
   5246     flag aSign, bSign;
   5247 
   5248     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5249               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5250          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5251               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5252        ) {
   5253         if (    float128_is_signaling_nan( a )
   5254              || float128_is_signaling_nan( b ) ) {
   5255             float_raise( float_flag_invalid );
   5256         }
   5257         return 0;
   5258     }
   5259     aSign = extractFloat128Sign( a );
   5260     bSign = extractFloat128Sign( b );
   5261     if ( aSign != bSign ) {
   5262         return
   5263                aSign
   5264             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   5265                  == 0 );
   5266     }
   5267     return
   5268           aSign ? le128( b.high, b.low, a.high, a.low )
   5269         : le128( a.high, a.low, b.high, b.low );
   5270 
   5271 }
   5272 
   5273 /*----------------------------------------------------------------------------
   5274 | Returns 1 if the quadruple-precision floating-point value `a' is less than
   5275 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
   5276 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
   5277 | Standard for Binary Floating-Point Arithmetic.
   5278 *----------------------------------------------------------------------------*/
   5279 
   5280 flag float128_lt_quiet( float128 a, float128 b )
   5281 {
   5282     flag aSign, bSign;
   5283 
   5284     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5285               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5286          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5287               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5288        ) {
   5289         if (    float128_is_signaling_nan( a )
   5290              || float128_is_signaling_nan( b ) ) {
   5291             float_raise( float_flag_invalid );
   5292         }
   5293         return 0;
   5294     }
   5295     aSign = extractFloat128Sign( a );
   5296     bSign = extractFloat128Sign( b );
   5297     if ( aSign != bSign ) {
   5298         return
   5299                aSign
   5300             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   5301                  != 0 );
   5302     }
   5303     return
   5304           aSign ? lt128( b.high, b.low, a.high, a.low )
   5305         : lt128( a.high, a.low, b.high, b.low );
   5306 
   5307 }
   5308 
   5309 #endif
   5310 
   5311 
   5312 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
   5313 
   5314 /*
   5315  * These two routines are not part of the original softfloat distribution.
   5316  *
   5317  * They are based on the corresponding conversions to integer but return
   5318  * unsigned numbers instead since these functions are required by GCC.
   5319  *
   5320  * Added by Mark Brinicombe <mark (at) NetBSD.org>	27/09/97
   5321  *
   5322  * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
   5323  */
   5324 
   5325 /*----------------------------------------------------------------------------
   5326 | Returns the result of converting the double-precision floating-point value
   5327 | `a' to the 32-bit unsigned integer format.  The conversion is
   5328 | performed according to the IEC/IEEE Standard for Binary Floating-point
   5329 | Arithmetic, except that the conversion is always rounded toward zero.  If
   5330 | `a' is a NaN, the largest positive integer is returned.  If the conversion
   5331 | overflows, the largest integer positive is returned.
   5332 *----------------------------------------------------------------------------*/
   5333 
   5334 uint32 float64_to_uint32_round_to_zero( float64 a )
   5335 {
   5336     flag aSign;
   5337     int16 aExp, shiftCount;
   5338     bits64 aSig, savedASig;
   5339     uint32 z;
   5340 
   5341     aSig = extractFloat64Frac( a );
   5342     aExp = extractFloat64Exp( a );
   5343     aSign = extractFloat64Sign( a );
   5344 
   5345     if (aSign) {
   5346         float_raise( float_flag_invalid );
   5347     	return(0);
   5348     }
   5349 
   5350     if ( 0x41E < aExp ) {
   5351         float_raise( float_flag_invalid );
   5352         return 0xffffffff;
   5353     }
   5354     else if ( aExp < 0x3FF ) {
   5355         if ( aExp || aSig ) float_set_inexact();
   5356         return 0;
   5357     }
   5358     aSig |= LIT64( 0x0010000000000000 );
   5359     shiftCount = 0x433 - aExp;
   5360     savedASig = aSig;
   5361     aSig >>= shiftCount;
   5362     z = aSig;
   5363     if ( ( aSig<<shiftCount ) != savedASig ) {
   5364         float_set_inexact();
   5365     }
   5366     return z;
   5367 
   5368 }
   5369 
   5370 /*----------------------------------------------------------------------------
   5371 | Returns the result of converting the single-precision floating-point value
   5372 | `a' to the 32-bit unsigned integer format.  The conversion is
   5373 | performed according to the IEC/IEEE Standard for Binary Floating-point
   5374 | Arithmetic, except that the conversion is always rounded toward zero.  If
   5375 | `a' is a NaN, the largest positive integer is returned.  If the conversion
   5376 | overflows, the largest positive integer is returned.
   5377 *----------------------------------------------------------------------------*/
   5378 
   5379 uint32 float32_to_uint32_round_to_zero( float32 a )
   5380 {
   5381     flag aSign;
   5382     int16 aExp, shiftCount;
   5383     bits32 aSig;
   5384     uint32 z;
   5385 
   5386     aSig = extractFloat32Frac( a );
   5387     aExp = extractFloat32Exp( a );
   5388     aSign = extractFloat32Sign( a );
   5389     shiftCount = aExp - 0x9E;
   5390 
   5391     if (aSign) {
   5392         float_raise( float_flag_invalid );
   5393     	return(0);
   5394     }
   5395     if ( 0 < shiftCount ) {
   5396         float_raise( float_flag_invalid );
   5397         return 0xFFFFFFFF;
   5398     }
   5399     else if ( aExp <= 0x7E ) {
   5400         if ( aExp | aSig ) float_set_inexact();
   5401         return 0;
   5402     }
   5403     aSig = ( aSig | 0x800000 )<<8;
   5404     z = aSig>>( - shiftCount );
   5405     if ( aSig<<( shiftCount & 31 ) ) {
   5406         float_set_inexact();
   5407     }
   5408     return z;
   5409 
   5410 }
   5411 
   5412 #endif
   5413 
   5414 #endif /* _STANDALONE */
   5415