Home | History | Annotate | Line # | Download | only in libkern
softfloat.c revision 1.4.94.1
      1 /* $NetBSD: softfloat.c,v 1.4.94.1 2011/02/05 06:00:14 cliff Exp $ */
      2 
      3 /*
      4  * This version hacked for use with gcc -msoft-float by bjh21.
      5  * (Mostly a case of #ifdefing out things GCC doesn't need or provides
      6  *  itself).
      7  */
      8 
      9 /*
     10  * Things you may want to define:
     11  *
     12  * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
     13  *   -msoft-float) to work.  Include "softfloat-for-gcc.h" to get them
     14  *   properly renamed.
     15  */
     16 
     17 /*
     18 ===============================================================================
     19 
     20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
     21 Arithmetic Package, Release 2a.
     22 
     23 Written by John R. Hauser.  This work was made possible in part by the
     24 International Computer Science Institute, located at Suite 600, 1947 Center
     25 Street, Berkeley, California 94704.  Funding was partially provided by the
     26 National Science Foundation under grant MIP-9311980.  The original version
     27 of this code was written as part of a project to build a fixed-point vector
     28 processor in collaboration with the University of California at Berkeley,
     29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
     30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
     31 arithmetic/SoftFloat.html'.
     32 
     33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
     34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
     35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
     36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
     37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
     38 
     39 Derivative works are acceptable, even for commercial purposes, so long as
     40 (1) they include prominent notice that the work is derivative, and (2) they
     41 include prominent notice akin to these four paragraphs for those parts of
     42 this code that are retained.
     43 
     44 ===============================================================================
     45 */
     46 
     47 /* If you need this in a boot program, you have bigger problems... */
     48 #ifndef _STANDALONE
     49 
     50 #include "opt_multiprocessor.h"
     51 
     52 #include <sys/cdefs.h>
     53 #if defined(LIBC_SCCS) && !defined(lint)
     54 __RCSID("$NetBSD: softfloat.c,v 1.4.94.1 2011/02/05 06:00:14 cliff Exp $");
     55 #endif /* LIBC_SCCS and not lint */
     56 
     57 #ifdef SOFTFLOAT_FOR_GCC
     58 #include "softfloat-for-gcc.h"
     59 #endif
     60 
     61 #include "milieu.h"
     62 #include "softfloat.h"
     63 
     64 /*
     65  * Conversions between floats as stored in memory and floats as
     66  * SoftFloat uses them
     67  */
     68 #ifndef FLOAT64_DEMANGLE
     69 #define FLOAT64_DEMANGLE(a)	(a)
     70 #endif
     71 #ifndef FLOAT64_MANGLE
     72 #define FLOAT64_MANGLE(a)	(a)
     73 #endif
     74 
     75 /*
     76 -------------------------------------------------------------------------------
     77 Floating-point rounding mode, extended double-precision rounding precision,
     78 and exception flags.
     79 -------------------------------------------------------------------------------
     80 */
     81 
     82 /*
     83  * XXX: This may cause options-MULTIPROCESSOR or thread problems someday.
     84  * 	Right now, it does not.  I've removed all other dynamic global
     85  * 	variables. [ross]
     86  */
     87 #ifdef FLOATX80
     88 int8 floatx80_rounding_precision = 80;
     89 #endif
     90 
     91 /*
     92 -------------------------------------------------------------------------------
     93 Primitive arithmetic functions, including multi-word arithmetic, and
     94 division and square root approximations.  (Can be specialized to target if
     95 desired.)
     96 -------------------------------------------------------------------------------
     97 */
     98 #include "softfloat-macros.h"
     99 
    100 /*
    101 -------------------------------------------------------------------------------
    102 Functions and definitions to determine:  (1) whether tininess for underflow
    103 is detected before or after rounding by default, (2) what (if anything)
    104 happens when exceptions are raised, (3) how signaling NaNs are distinguished
    105 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
    106 are propagated from function inputs to output.  These details are target-
    107 specific.
    108 -------------------------------------------------------------------------------
    109 */
    110 #include "softfloat-specialize.h"
    111 
    112 #ifndef SOFTFLOAT_FOR_GCC /* Not used */
    113 /*
    114 -------------------------------------------------------------------------------
    115 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
    116 and 7, and returns the properly rounded 32-bit integer corresponding to the
    117 input.  If `zSign' is 1, the input is negated before being converted to an
    118 integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
    119 is simply rounded to an integer, with the inexact exception raised if the
    120 input cannot be represented exactly as an integer.  However, if the fixed-
    121 point input is too large, the invalid exception is raised and the largest
    122 positive or negative integer is returned.
    123 -------------------------------------------------------------------------------
    124 */
    125 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
    126 {
    127     int8 roundingMode;
    128     flag roundNearestEven;
    129     int8 roundIncrement, roundBits;
    130     int32 z;
    131 
    132     roundingMode = float_rounding_mode();
    133     roundNearestEven = ( roundingMode == float_round_nearest_even );
    134     roundIncrement = 0x40;
    135     if ( ! roundNearestEven ) {
    136         if ( roundingMode == float_round_to_zero ) {
    137             roundIncrement = 0;
    138         }
    139         else {
    140             roundIncrement = 0x7F;
    141             if ( zSign ) {
    142                 if ( roundingMode == float_round_up ) roundIncrement = 0;
    143             }
    144             else {
    145                 if ( roundingMode == float_round_down ) roundIncrement = 0;
    146             }
    147         }
    148     }
    149     roundBits = absZ & 0x7F;
    150     absZ = ( absZ + roundIncrement )>>7;
    151     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
    152     z = absZ;
    153     if ( zSign ) z = - z;
    154     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
    155         float_raise( float_flag_invalid );
    156         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
    157     }
    158     if ( roundBits ) float_set_inexact();
    159     return z;
    160 
    161 }
    162 
    163 /*
    164 -------------------------------------------------------------------------------
    165 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
    166 `absZ1', with binary point between bits 63 and 64 (between the input words),
    167 and returns the properly rounded 64-bit integer corresponding to the input.
    168 If `zSign' is 1, the input is negated before being converted to an integer.
    169 Ordinarily, the fixed-point input is simply rounded to an integer, with
    170 the inexact exception raised if the input cannot be represented exactly as
    171 an integer.  However, if the fixed-point input is too large, the invalid
    172 exception is raised and the largest positive or negative integer is
    173 returned.
    174 -------------------------------------------------------------------------------
    175 */
    176 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
    177 {
    178     int8 roundingMode;
    179     flag roundNearestEven, increment;
    180     int64 z;
    181 
    182     roundingMode = float_rounding_mode();
    183     roundNearestEven = ( roundingMode == float_round_nearest_even );
    184     increment = ( (sbits64) absZ1 < 0 );
    185     if ( ! roundNearestEven ) {
    186         if ( roundingMode == float_round_to_zero ) {
    187             increment = 0;
    188         }
    189         else {
    190             if ( zSign ) {
    191                 increment = ( roundingMode == float_round_down ) && absZ1;
    192             }
    193             else {
    194                 increment = ( roundingMode == float_round_up ) && absZ1;
    195             }
    196         }
    197     }
    198     if ( increment ) {
    199         ++absZ0;
    200         if ( absZ0 == 0 ) goto overflow;
    201         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
    202     }
    203     z = absZ0;
    204     if ( zSign ) z = - z;
    205     if ( z && ( ( z < 0 ) ^ zSign ) ) {
    206  overflow:
    207         float_raise( float_flag_invalid );
    208         return
    209               zSign ? (sbits64) LIT64( 0x8000000000000000 )
    210             : LIT64( 0x7FFFFFFFFFFFFFFF );
    211     }
    212     if ( absZ1 ) float_set_inexact();
    213     return z;
    214 
    215 }
    216 #endif
    217 
    218 /*
    219 -------------------------------------------------------------------------------
    220 Returns the fraction bits of the single-precision floating-point value `a'.
    221 -------------------------------------------------------------------------------
    222 */
    223 INLINE bits32 extractFloat32Frac( float32 a )
    224 {
    225 
    226     return a & 0x007FFFFF;
    227 
    228 }
    229 
    230 /*
    231 -------------------------------------------------------------------------------
    232 Returns the exponent bits of the single-precision floating-point value `a'.
    233 -------------------------------------------------------------------------------
    234 */
    235 INLINE int16 extractFloat32Exp( float32 a )
    236 {
    237 
    238     return ( a>>23 ) & 0xFF;
    239 
    240 }
    241 
    242 /*
    243 -------------------------------------------------------------------------------
    244 Returns the sign bit of the single-precision floating-point value `a'.
    245 -------------------------------------------------------------------------------
    246 */
    247 INLINE flag extractFloat32Sign( float32 a )
    248 {
    249 
    250     return a>>31;
    251 
    252 }
    253 
    254 /*
    255 -------------------------------------------------------------------------------
    256 Normalizes the subnormal single-precision floating-point value represented
    257 by the denormalized significand `aSig'.  The normalized exponent and
    258 significand are stored at the locations pointed to by `zExpPtr' and
    259 `zSigPtr', respectively.
    260 -------------------------------------------------------------------------------
    261 */
    262 static void
    263  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
    264 {
    265     int8 shiftCount;
    266 
    267     shiftCount = countLeadingZeros32( aSig ) - 8;
    268     *zSigPtr = aSig<<shiftCount;
    269     *zExpPtr = 1 - shiftCount;
    270 
    271 }
    272 
    273 /*
    274 -------------------------------------------------------------------------------
    275 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
    276 single-precision floating-point value, returning the result.  After being
    277 shifted into the proper positions, the three fields are simply added
    278 together to form the result.  This means that any integer portion of `zSig'
    279 will be added into the exponent.  Since a properly normalized significand
    280 will have an integer portion equal to 1, the `zExp' input should be 1 less
    281 than the desired result exponent whenever `zSig' is a complete, normalized
    282 significand.
    283 -------------------------------------------------------------------------------
    284 */
    285 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
    286 {
    287 
    288     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
    289 
    290 }
    291 
    292 /*
    293 -------------------------------------------------------------------------------
    294 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    295 and significand `zSig', and returns the proper single-precision floating-
    296 point value corresponding to the abstract input.  Ordinarily, the abstract
    297 value is simply rounded and packed into the single-precision format, with
    298 the inexact exception raised if the abstract input cannot be represented
    299 exactly.  However, if the abstract value is too large, the overflow and
    300 inexact exceptions are raised and an infinity or maximal finite value is
    301 returned.  If the abstract value is too small, the input value is rounded to
    302 a subnormal number, and the underflow and inexact exceptions are raised if
    303 the abstract input cannot be represented exactly as a subnormal single-
    304 precision floating-point number.
    305     The input significand `zSig' has its binary point between bits 30
    306 and 29, which is 7 bits to the left of the usual location.  This shifted
    307 significand must be normalized or smaller.  If `zSig' is not normalized,
    308 `zExp' must be 0; in that case, the result returned is a subnormal number,
    309 and it must not require rounding.  In the usual case that `zSig' is
    310 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
    311 The handling of underflow and overflow follows the IEC/IEEE Standard for
    312 Binary Floating-Point Arithmetic.
    313 -------------------------------------------------------------------------------
    314 */
    315 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
    316 {
    317     int8 roundingMode;
    318     flag roundNearestEven;
    319     int8 roundIncrement, roundBits;
    320     flag isTiny;
    321 
    322     roundingMode = float_rounding_mode();
    323     roundNearestEven = ( roundingMode == float_round_nearest_even );
    324     roundIncrement = 0x40;
    325     if ( ! roundNearestEven ) {
    326         if ( roundingMode == float_round_to_zero ) {
    327             roundIncrement = 0;
    328         }
    329         else {
    330             roundIncrement = 0x7F;
    331             if ( zSign ) {
    332                 if ( roundingMode == float_round_up ) roundIncrement = 0;
    333             }
    334             else {
    335                 if ( roundingMode == float_round_down ) roundIncrement = 0;
    336             }
    337         }
    338     }
    339     roundBits = zSig & 0x7F;
    340     if ( 0xFD <= (bits16) zExp ) {
    341         if (    ( 0xFD < zExp )
    342              || (    ( zExp == 0xFD )
    343                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
    344            ) {
    345             float_raise( float_flag_overflow | float_flag_inexact );
    346             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
    347         }
    348         if ( zExp < 0 ) {
    349             isTiny =
    350                    ( float_detect_tininess == float_tininess_before_rounding )
    351                 || ( zExp < -1 )
    352                 || ( zSig + roundIncrement < 0x80000000 );
    353             shift32RightJamming( zSig, - zExp, &zSig );
    354             zExp = 0;
    355             roundBits = zSig & 0x7F;
    356             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
    357         }
    358     }
    359     if ( roundBits ) float_set_inexact();
    360     zSig = ( zSig + roundIncrement )>>7;
    361     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
    362     if ( zSig == 0 ) zExp = 0;
    363     return packFloat32( zSign, zExp, zSig );
    364 
    365 }
    366 
    367 /*
    368 -------------------------------------------------------------------------------
    369 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    370 and significand `zSig', and returns the proper single-precision floating-
    371 point value corresponding to the abstract input.  This routine is just like
    372 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
    373 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
    374 floating-point exponent.
    375 -------------------------------------------------------------------------------
    376 */
    377 static float32
    378  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
    379 {
    380     int8 shiftCount;
    381 
    382     shiftCount = countLeadingZeros32( zSig ) - 1;
    383     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
    384 
    385 }
    386 
    387 /*
    388 -------------------------------------------------------------------------------
    389 Returns the fraction bits of the double-precision floating-point value `a'.
    390 -------------------------------------------------------------------------------
    391 */
    392 INLINE bits64 extractFloat64Frac( float64 a )
    393 {
    394 
    395     return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
    396 
    397 }
    398 
    399 /*
    400 -------------------------------------------------------------------------------
    401 Returns the exponent bits of the double-precision floating-point value `a'.
    402 -------------------------------------------------------------------------------
    403 */
    404 INLINE int16 extractFloat64Exp( float64 a )
    405 {
    406 
    407     return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
    408 
    409 }
    410 
    411 /*
    412 -------------------------------------------------------------------------------
    413 Returns the sign bit of the double-precision floating-point value `a'.
    414 -------------------------------------------------------------------------------
    415 */
    416 INLINE flag extractFloat64Sign( float64 a )
    417 {
    418 
    419     return FLOAT64_DEMANGLE(a)>>63;
    420 
    421 }
    422 
    423 /*
    424 -------------------------------------------------------------------------------
    425 Normalizes the subnormal double-precision floating-point value represented
    426 by the denormalized significand `aSig'.  The normalized exponent and
    427 significand are stored at the locations pointed to by `zExpPtr' and
    428 `zSigPtr', respectively.
    429 -------------------------------------------------------------------------------
    430 */
    431 static void
    432  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
    433 {
    434     int8 shiftCount;
    435 
    436     shiftCount = countLeadingZeros64( aSig ) - 11;
    437     *zSigPtr = aSig<<shiftCount;
    438     *zExpPtr = 1 - shiftCount;
    439 
    440 }
    441 
    442 /*
    443 -------------------------------------------------------------------------------
    444 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
    445 double-precision floating-point value, returning the result.  After being
    446 shifted into the proper positions, the three fields are simply added
    447 together to form the result.  This means that any integer portion of `zSig'
    448 will be added into the exponent.  Since a properly normalized significand
    449 will have an integer portion equal to 1, the `zExp' input should be 1 less
    450 than the desired result exponent whenever `zSig' is a complete, normalized
    451 significand.
    452 -------------------------------------------------------------------------------
    453 */
    454 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
    455 {
    456 
    457     return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
    458 			   ( ( (bits64) zExp )<<52 ) + zSig );
    459 
    460 }
    461 
    462 /*
    463 -------------------------------------------------------------------------------
    464 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    465 and significand `zSig', and returns the proper double-precision floating-
    466 point value corresponding to the abstract input.  Ordinarily, the abstract
    467 value is simply rounded and packed into the double-precision format, with
    468 the inexact exception raised if the abstract input cannot be represented
    469 exactly.  However, if the abstract value is too large, the overflow and
    470 inexact exceptions are raised and an infinity or maximal finite value is
    471 returned.  If the abstract value is too small, the input value is rounded to
    472 a subnormal number, and the underflow and inexact exceptions are raised if
    473 the abstract input cannot be represented exactly as a subnormal double-
    474 precision floating-point number.
    475     The input significand `zSig' has its binary point between bits 62
    476 and 61, which is 10 bits to the left of the usual location.  This shifted
    477 significand must be normalized or smaller.  If `zSig' is not normalized,
    478 `zExp' must be 0; in that case, the result returned is a subnormal number,
    479 and it must not require rounding.  In the usual case that `zSig' is
    480 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
    481 The handling of underflow and overflow follows the IEC/IEEE Standard for
    482 Binary Floating-Point Arithmetic.
    483 -------------------------------------------------------------------------------
    484 */
    485 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
    486 {
    487     int8 roundingMode;
    488     flag roundNearestEven;
    489     int16 roundIncrement, roundBits;
    490     flag isTiny;
    491 
    492     roundingMode = float_rounding_mode();
    493     roundNearestEven = ( roundingMode == float_round_nearest_even );
    494     roundIncrement = 0x200;
    495     if ( ! roundNearestEven ) {
    496         if ( roundingMode == float_round_to_zero ) {
    497             roundIncrement = 0;
    498         }
    499         else {
    500             roundIncrement = 0x3FF;
    501             if ( zSign ) {
    502                 if ( roundingMode == float_round_up ) roundIncrement = 0;
    503             }
    504             else {
    505                 if ( roundingMode == float_round_down ) roundIncrement = 0;
    506             }
    507         }
    508     }
    509     roundBits = zSig & 0x3FF;
    510     if ( 0x7FD <= (bits16) zExp ) {
    511         if (    ( 0x7FD < zExp )
    512              || (    ( zExp == 0x7FD )
    513                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
    514            ) {
    515             float_raise( float_flag_overflow | float_flag_inexact );
    516             return FLOAT64_MANGLE(
    517 		FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
    518 		( roundIncrement == 0 ));
    519         }
    520         if ( zExp < 0 ) {
    521             isTiny =
    522                    ( float_detect_tininess == float_tininess_before_rounding )
    523                 || ( zExp < -1 )
    524                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
    525             shift64RightJamming( zSig, - zExp, &zSig );
    526             zExp = 0;
    527             roundBits = zSig & 0x3FF;
    528             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
    529         }
    530     }
    531     if ( roundBits ) float_set_inexact();
    532     zSig = ( zSig + roundIncrement )>>10;
    533     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
    534     if ( zSig == 0 ) zExp = 0;
    535     return packFloat64( zSign, zExp, zSig );
    536 
    537 }
    538 
    539 /*
    540 -------------------------------------------------------------------------------
    541 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    542 and significand `zSig', and returns the proper double-precision floating-
    543 point value corresponding to the abstract input.  This routine is just like
    544 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
    545 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
    546 floating-point exponent.
    547 -------------------------------------------------------------------------------
    548 */
    549 static float64
    550  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
    551 {
    552     int8 shiftCount;
    553 
    554     shiftCount = countLeadingZeros64( zSig ) - 1;
    555     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
    556 
    557 }
    558 
    559 #ifdef FLOATX80
    560 
    561 /*
    562 -------------------------------------------------------------------------------
    563 Returns the fraction bits of the extended double-precision floating-point
    564 value `a'.
    565 -------------------------------------------------------------------------------
    566 */
    567 INLINE bits64 extractFloatx80Frac( floatx80 a )
    568 {
    569 
    570     return a.low;
    571 
    572 }
    573 
    574 /*
    575 -------------------------------------------------------------------------------
    576 Returns the exponent bits of the extended double-precision floating-point
    577 value `a'.
    578 -------------------------------------------------------------------------------
    579 */
    580 INLINE int32 extractFloatx80Exp( floatx80 a )
    581 {
    582 
    583     return a.high & 0x7FFF;
    584 
    585 }
    586 
    587 /*
    588 -------------------------------------------------------------------------------
    589 Returns the sign bit of the extended double-precision floating-point value
    590 `a'.
    591 -------------------------------------------------------------------------------
    592 */
    593 INLINE flag extractFloatx80Sign( floatx80 a )
    594 {
    595 
    596     return a.high>>15;
    597 
    598 }
    599 
    600 /*
    601 -------------------------------------------------------------------------------
    602 Normalizes the subnormal extended double-precision floating-point value
    603 represented by the denormalized significand `aSig'.  The normalized exponent
    604 and significand are stored at the locations pointed to by `zExpPtr' and
    605 `zSigPtr', respectively.
    606 -------------------------------------------------------------------------------
    607 */
    608 static void
    609  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
    610 {
    611     int8 shiftCount;
    612 
    613     shiftCount = countLeadingZeros64( aSig );
    614     *zSigPtr = aSig<<shiftCount;
    615     *zExpPtr = 1 - shiftCount;
    616 
    617 }
    618 
    619 /*
    620 -------------------------------------------------------------------------------
    621 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
    622 extended double-precision floating-point value, returning the result.
    623 -------------------------------------------------------------------------------
    624 */
    625 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
    626 {
    627     floatx80 z;
    628 
    629     z.low = zSig;
    630     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
    631     return z;
    632 
    633 }
    634 
    635 /*
    636 -------------------------------------------------------------------------------
    637 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    638 and extended significand formed by the concatenation of `zSig0' and `zSig1',
    639 and returns the proper extended double-precision floating-point value
    640 corresponding to the abstract input.  Ordinarily, the abstract value is
    641 rounded and packed into the extended double-precision format, with the
    642 inexact exception raised if the abstract input cannot be represented
    643 exactly.  However, if the abstract value is too large, the overflow and
    644 inexact exceptions are raised and an infinity or maximal finite value is
    645 returned.  If the abstract value is too small, the input value is rounded to
    646 a subnormal number, and the underflow and inexact exceptions are raised if
    647 the abstract input cannot be represented exactly as a subnormal extended
    648 double-precision floating-point number.
    649     If `roundingPrecision' is 32 or 64, the result is rounded to the same
    650 number of bits as single or double precision, respectively.  Otherwise, the
    651 result is rounded to the full precision of the extended double-precision
    652 format.
    653     The input significand must be normalized or smaller.  If the input
    654 significand is not normalized, `zExp' must be 0; in that case, the result
    655 returned is a subnormal number, and it must not require rounding.  The
    656 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
    657 Floating-Point Arithmetic.
    658 -------------------------------------------------------------------------------
    659 */
    660 static floatx80
    661  roundAndPackFloatx80(
    662      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
    663  )
    664 {
    665     int8 roundingMode;
    666     flag roundNearestEven, increment, isTiny;
    667     int64 roundIncrement, roundMask, roundBits;
    668 
    669     roundingMode = float_rounding_mode();
    670     roundNearestEven = ( roundingMode == float_round_nearest_even );
    671     if ( roundingPrecision == 80 ) goto precision80;
    672     if ( roundingPrecision == 64 ) {
    673         roundIncrement = LIT64( 0x0000000000000400 );
    674         roundMask = LIT64( 0x00000000000007FF );
    675     }
    676     else if ( roundingPrecision == 32 ) {
    677         roundIncrement = LIT64( 0x0000008000000000 );
    678         roundMask = LIT64( 0x000000FFFFFFFFFF );
    679     }
    680     else {
    681         goto precision80;
    682     }
    683     zSig0 |= ( zSig1 != 0 );
    684     if ( ! roundNearestEven ) {
    685         if ( roundingMode == float_round_to_zero ) {
    686             roundIncrement = 0;
    687         }
    688         else {
    689             roundIncrement = roundMask;
    690             if ( zSign ) {
    691                 if ( roundingMode == float_round_up ) roundIncrement = 0;
    692             }
    693             else {
    694                 if ( roundingMode == float_round_down ) roundIncrement = 0;
    695             }
    696         }
    697     }
    698     roundBits = zSig0 & roundMask;
    699     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
    700         if (    ( 0x7FFE < zExp )
    701              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
    702            ) {
    703             goto overflow;
    704         }
    705         if ( zExp <= 0 ) {
    706             isTiny =
    707                    ( float_detect_tininess == float_tininess_before_rounding )
    708                 || ( zExp < 0 )
    709                 || ( zSig0 <= zSig0 + roundIncrement );
    710             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
    711             zExp = 0;
    712             roundBits = zSig0 & roundMask;
    713             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
    714             if ( roundBits ) float_set_inexact();
    715             zSig0 += roundIncrement;
    716             if ( (sbits64) zSig0 < 0 ) zExp = 1;
    717             roundIncrement = roundMask + 1;
    718             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
    719                 roundMask |= roundIncrement;
    720             }
    721             zSig0 &= ~ roundMask;
    722             return packFloatx80( zSign, zExp, zSig0 );
    723         }
    724     }
    725     if ( roundBits ) float_set_inexact();
    726     zSig0 += roundIncrement;
    727     if ( zSig0 < roundIncrement ) {
    728         ++zExp;
    729         zSig0 = LIT64( 0x8000000000000000 );
    730     }
    731     roundIncrement = roundMask + 1;
    732     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
    733         roundMask |= roundIncrement;
    734     }
    735     zSig0 &= ~ roundMask;
    736     if ( zSig0 == 0 ) zExp = 0;
    737     return packFloatx80( zSign, zExp, zSig0 );
    738  precision80:
    739     increment = ( (sbits64) zSig1 < 0 );
    740     if ( ! roundNearestEven ) {
    741         if ( roundingMode == float_round_to_zero ) {
    742             increment = 0;
    743         }
    744         else {
    745             if ( zSign ) {
    746                 increment = ( roundingMode == float_round_down ) && zSig1;
    747             }
    748             else {
    749                 increment = ( roundingMode == float_round_up ) && zSig1;
    750             }
    751         }
    752     }
    753     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
    754         if (    ( 0x7FFE < zExp )
    755              || (    ( zExp == 0x7FFE )
    756                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
    757                   && increment
    758                 )
    759            ) {
    760             roundMask = 0;
    761  overflow:
    762             float_raise( float_flag_overflow | float_flag_inexact );
    763             if (    ( roundingMode == float_round_to_zero )
    764                  || ( zSign && ( roundingMode == float_round_up ) )
    765                  || ( ! zSign && ( roundingMode == float_round_down ) )
    766                ) {
    767                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
    768             }
    769             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
    770         }
    771         if ( zExp <= 0 ) {
    772             isTiny =
    773                    ( float_detect_tininess == float_tininess_before_rounding )
    774                 || ( zExp < 0 )
    775                 || ! increment
    776                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
    777             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
    778             zExp = 0;
    779             if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
    780             if ( zSig1 ) float_set_inexact();
    781             if ( roundNearestEven ) {
    782                 increment = ( (sbits64) zSig1 < 0 );
    783             }
    784             else {
    785                 if ( zSign ) {
    786                     increment = ( roundingMode == float_round_down ) && zSig1;
    787                 }
    788                 else {
    789                     increment = ( roundingMode == float_round_up ) && zSig1;
    790                 }
    791             }
    792             if ( increment ) {
    793                 ++zSig0;
    794                 zSig0 &=
    795                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
    796                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
    797             }
    798             return packFloatx80( zSign, zExp, zSig0 );
    799         }
    800     }
    801     if ( zSig1 ) float_set_inexact();
    802     if ( increment ) {
    803         ++zSig0;
    804         if ( zSig0 == 0 ) {
    805             ++zExp;
    806             zSig0 = LIT64( 0x8000000000000000 );
    807         }
    808         else {
    809             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
    810         }
    811     }
    812     else {
    813         if ( zSig0 == 0 ) zExp = 0;
    814     }
    815     return packFloatx80( zSign, zExp, zSig0 );
    816 
    817 }
    818 
    819 /*
    820 -------------------------------------------------------------------------------
    821 Takes an abstract floating-point value having sign `zSign', exponent
    822 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
    823 and returns the proper extended double-precision floating-point value
    824 corresponding to the abstract input.  This routine is just like
    825 `roundAndPackFloatx80' except that the input significand does not have to be
    826 normalized.
    827 -------------------------------------------------------------------------------
    828 */
    829 static floatx80
    830  normalizeRoundAndPackFloatx80(
    831      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
    832  )
    833 {
    834     int8 shiftCount;
    835 
    836     if ( zSig0 == 0 ) {
    837         zSig0 = zSig1;
    838         zSig1 = 0;
    839         zExp -= 64;
    840     }
    841     shiftCount = countLeadingZeros64( zSig0 );
    842     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    843     zExp -= shiftCount;
    844     return
    845         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
    846 
    847 }
    848 
    849 #endif
    850 
    851 #ifdef FLOAT128
    852 
    853 /*
    854 -------------------------------------------------------------------------------
    855 Returns the least-significant 64 fraction bits of the quadruple-precision
    856 floating-point value `a'.
    857 -------------------------------------------------------------------------------
    858 */
    859 INLINE bits64 extractFloat128Frac1( float128 a )
    860 {
    861 
    862     return a.low;
    863 
    864 }
    865 
    866 /*
    867 -------------------------------------------------------------------------------
    868 Returns the most-significant 48 fraction bits of the quadruple-precision
    869 floating-point value `a'.
    870 -------------------------------------------------------------------------------
    871 */
    872 INLINE bits64 extractFloat128Frac0( float128 a )
    873 {
    874 
    875     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
    876 
    877 }
    878 
    879 /*
    880 -------------------------------------------------------------------------------
    881 Returns the exponent bits of the quadruple-precision floating-point value
    882 `a'.
    883 -------------------------------------------------------------------------------
    884 */
    885 INLINE int32 extractFloat128Exp( float128 a )
    886 {
    887 
    888     return ( a.high>>48 ) & 0x7FFF;
    889 
    890 }
    891 
    892 /*
    893 -------------------------------------------------------------------------------
    894 Returns the sign bit of the quadruple-precision floating-point value `a'.
    895 -------------------------------------------------------------------------------
    896 */
    897 INLINE flag extractFloat128Sign( float128 a )
    898 {
    899 
    900     return a.high>>63;
    901 
    902 }
    903 
    904 /*
    905 -------------------------------------------------------------------------------
    906 Normalizes the subnormal quadruple-precision floating-point value
    907 represented by the denormalized significand formed by the concatenation of
    908 `aSig0' and `aSig1'.  The normalized exponent is stored at the location
    909 pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
    910 significand are stored at the location pointed to by `zSig0Ptr', and the
    911 least significant 64 bits of the normalized significand are stored at the
    912 location pointed to by `zSig1Ptr'.
    913 -------------------------------------------------------------------------------
    914 */
    915 static void
    916  normalizeFloat128Subnormal(
    917      bits64 aSig0,
    918      bits64 aSig1,
    919      int32 *zExpPtr,
    920      bits64 *zSig0Ptr,
    921      bits64 *zSig1Ptr
    922  )
    923 {
    924     int8 shiftCount;
    925 
    926     if ( aSig0 == 0 ) {
    927         shiftCount = countLeadingZeros64( aSig1 ) - 15;
    928         if ( shiftCount < 0 ) {
    929             *zSig0Ptr = aSig1>>( - shiftCount );
    930             *zSig1Ptr = aSig1<<( shiftCount & 63 );
    931         }
    932         else {
    933             *zSig0Ptr = aSig1<<shiftCount;
    934             *zSig1Ptr = 0;
    935         }
    936         *zExpPtr = - shiftCount - 63;
    937     }
    938     else {
    939         shiftCount = countLeadingZeros64( aSig0 ) - 15;
    940         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
    941         *zExpPtr = 1 - shiftCount;
    942     }
    943 
    944 }
    945 
    946 /*
    947 -------------------------------------------------------------------------------
    948 Packs the sign `zSign', the exponent `zExp', and the significand formed
    949 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
    950 floating-point value, returning the result.  After being shifted into the
    951 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
    952 added together to form the most significant 32 bits of the result.  This
    953 means that any integer portion of `zSig0' will be added into the exponent.
    954 Since a properly normalized significand will have an integer portion equal
    955 to 1, the `zExp' input should be 1 less than the desired result exponent
    956 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
    957 significand.
    958 -------------------------------------------------------------------------------
    959 */
    960 INLINE float128
    961  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
    962 {
    963     float128 z;
    964 
    965     z.low = zSig1;
    966     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
    967     return z;
    968 
    969 }
    970 
    971 /*
    972 -------------------------------------------------------------------------------
    973 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    974 and extended significand formed by the concatenation of `zSig0', `zSig1',
    975 and `zSig2', and returns the proper quadruple-precision floating-point value
    976 corresponding to the abstract input.  Ordinarily, the abstract value is
    977 simply rounded and packed into the quadruple-precision format, with the
    978 inexact exception raised if the abstract input cannot be represented
    979 exactly.  However, if the abstract value is too large, the overflow and
    980 inexact exceptions are raised and an infinity or maximal finite value is
    981 returned.  If the abstract value is too small, the input value is rounded to
    982 a subnormal number, and the underflow and inexact exceptions are raised if
    983 the abstract input cannot be represented exactly as a subnormal quadruple-
    984 precision floating-point number.
    985     The input significand must be normalized or smaller.  If the input
    986 significand is not normalized, `zExp' must be 0; in that case, the result
    987 returned is a subnormal number, and it must not require rounding.  In the
    988 usual case that the input significand is normalized, `zExp' must be 1 less
    989 than the ``true'' floating-point exponent.  The handling of underflow and
    990 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
    991 -------------------------------------------------------------------------------
    992 */
    993 static float128
    994  roundAndPackFloat128(
    995      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
    996 {
    997     int8 roundingMode;
    998     flag roundNearestEven, increment, isTiny;
    999 
   1000     roundingMode = float_rounding_mode();
   1001     roundNearestEven = ( roundingMode == float_round_nearest_even );
   1002     increment = ( (sbits64) zSig2 < 0 );
   1003     if ( ! roundNearestEven ) {
   1004         if ( roundingMode == float_round_to_zero ) {
   1005             increment = 0;
   1006         }
   1007         else {
   1008             if ( zSign ) {
   1009                 increment = ( roundingMode == float_round_down ) && zSig2;
   1010             }
   1011             else {
   1012                 increment = ( roundingMode == float_round_up ) && zSig2;
   1013             }
   1014         }
   1015     }
   1016     if ( 0x7FFD <= (bits32) zExp ) {
   1017         if (    ( 0x7FFD < zExp )
   1018              || (    ( zExp == 0x7FFD )
   1019                   && eq128(
   1020                          LIT64( 0x0001FFFFFFFFFFFF ),
   1021                          LIT64( 0xFFFFFFFFFFFFFFFF ),
   1022                          zSig0,
   1023                          zSig1
   1024                      )
   1025                   && increment
   1026                 )
   1027            ) {
   1028             float_raise( float_flag_overflow | float_flag_inexact );
   1029             if (    ( roundingMode == float_round_to_zero )
   1030                  || ( zSign && ( roundingMode == float_round_up ) )
   1031                  || ( ! zSign && ( roundingMode == float_round_down ) )
   1032                ) {
   1033                 return
   1034                     packFloat128(
   1035                         zSign,
   1036                         0x7FFE,
   1037                         LIT64( 0x0000FFFFFFFFFFFF ),
   1038                         LIT64( 0xFFFFFFFFFFFFFFFF )
   1039                     );
   1040             }
   1041             return packFloat128( zSign, 0x7FFF, 0, 0 );
   1042         }
   1043         if ( zExp < 0 ) {
   1044             isTiny =
   1045                    ( float_detect_tininess == float_tininess_before_rounding )
   1046                 || ( zExp < -1 )
   1047                 || ! increment
   1048                 || lt128(
   1049                        zSig0,
   1050                        zSig1,
   1051                        LIT64( 0x0001FFFFFFFFFFFF ),
   1052                        LIT64( 0xFFFFFFFFFFFFFFFF )
   1053                    );
   1054             shift128ExtraRightJamming(
   1055                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
   1056             zExp = 0;
   1057             if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
   1058             if ( roundNearestEven ) {
   1059                 increment = ( (sbits64) zSig2 < 0 );
   1060             }
   1061             else {
   1062                 if ( zSign ) {
   1063                     increment = ( roundingMode == float_round_down ) && zSig2;
   1064                 }
   1065                 else {
   1066                     increment = ( roundingMode == float_round_up ) && zSig2;
   1067                 }
   1068             }
   1069         }
   1070     }
   1071     if ( zSig2 ) float_set_inexact();
   1072     if ( increment ) {
   1073         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
   1074         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
   1075     }
   1076     else {
   1077         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
   1078     }
   1079     return packFloat128( zSign, zExp, zSig0, zSig1 );
   1080 
   1081 }
   1082 
   1083 /*
   1084 -------------------------------------------------------------------------------
   1085 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
   1086 and significand formed by the concatenation of `zSig0' and `zSig1', and
   1087 returns the proper quadruple-precision floating-point value corresponding
   1088 to the abstract input.  This routine is just like `roundAndPackFloat128'
   1089 except that the input significand has fewer bits and does not have to be
   1090 normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
   1091 point exponent.
   1092 -------------------------------------------------------------------------------
   1093 */
   1094 static float128
   1095  normalizeRoundAndPackFloat128(
   1096      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
   1097 {
   1098     int8 shiftCount;
   1099     bits64 zSig2;
   1100 
   1101     if ( zSig0 == 0 ) {
   1102         zSig0 = zSig1;
   1103         zSig1 = 0;
   1104         zExp -= 64;
   1105     }
   1106     shiftCount = countLeadingZeros64( zSig0 ) - 15;
   1107     if ( 0 <= shiftCount ) {
   1108         zSig2 = 0;
   1109         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
   1110     }
   1111     else {
   1112         shift128ExtraRightJamming(
   1113             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
   1114     }
   1115     zExp -= shiftCount;
   1116     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
   1117 
   1118 }
   1119 
   1120 #endif
   1121 
   1122 /*
   1123 -------------------------------------------------------------------------------
   1124 Returns the result of converting the 32-bit two's complement integer `a'
   1125 to the single-precision floating-point format.  The conversion is performed
   1126 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1127 -------------------------------------------------------------------------------
   1128 */
   1129 float32 int32_to_float32( int32 a )
   1130 {
   1131     flag zSign;
   1132 
   1133     if ( a == 0 ) return 0;
   1134     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
   1135     zSign = ( a < 0 );
   1136     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
   1137 
   1138 }
   1139 
   1140 /*
   1141 -------------------------------------------------------------------------------
   1142 Returns the result of converting the 32-bit two's complement integer `a'
   1143 to the double-precision floating-point format.  The conversion is performed
   1144 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1145 -------------------------------------------------------------------------------
   1146 */
   1147 float64 int32_to_float64( int32 a )
   1148 {
   1149     flag zSign;
   1150     uint32 absA;
   1151     int8 shiftCount;
   1152     bits64 zSig;
   1153 
   1154     if ( a == 0 ) return 0;
   1155     zSign = ( a < 0 );
   1156     absA = zSign ? - a : a;
   1157     shiftCount = countLeadingZeros32( absA ) + 21;
   1158     zSig = absA;
   1159     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
   1160 
   1161 }
   1162 
   1163 #ifdef FLOATX80
   1164 
   1165 /*
   1166 -------------------------------------------------------------------------------
   1167 Returns the result of converting the 32-bit two's complement integer `a'
   1168 to the extended double-precision floating-point format.  The conversion
   1169 is performed according to the IEC/IEEE Standard for Binary Floating-Point
   1170 Arithmetic.
   1171 -------------------------------------------------------------------------------
   1172 */
   1173 floatx80 int32_to_floatx80( int32 a )
   1174 {
   1175     flag zSign;
   1176     uint32 absA;
   1177     int8 shiftCount;
   1178     bits64 zSig;
   1179 
   1180     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
   1181     zSign = ( a < 0 );
   1182     absA = zSign ? - a : a;
   1183     shiftCount = countLeadingZeros32( absA ) + 32;
   1184     zSig = absA;
   1185     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
   1186 
   1187 }
   1188 
   1189 #endif
   1190 
   1191 #ifdef FLOAT128
   1192 
   1193 /*
   1194 -------------------------------------------------------------------------------
   1195 Returns the result of converting the 32-bit two's complement integer `a' to
   1196 the quadruple-precision floating-point format.  The conversion is performed
   1197 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1198 -------------------------------------------------------------------------------
   1199 */
   1200 float128 int32_to_float128( int32 a )
   1201 {
   1202     flag zSign;
   1203     uint32 absA;
   1204     int8 shiftCount;
   1205     bits64 zSig0;
   1206 
   1207     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
   1208     zSign = ( a < 0 );
   1209     absA = zSign ? - a : a;
   1210     shiftCount = countLeadingZeros32( absA ) + 17;
   1211     zSig0 = absA;
   1212     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
   1213 
   1214 }
   1215 
   1216 #endif
   1217 
   1218 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
   1219 /*
   1220 -------------------------------------------------------------------------------
   1221 Returns the result of converting the 64-bit two's complement integer `a'
   1222 to the single-precision floating-point format.  The conversion is performed
   1223 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1224 -------------------------------------------------------------------------------
   1225 */
   1226 float32 int64_to_float32( int64 a )
   1227 {
   1228     flag zSign;
   1229     uint64 absA;
   1230     int8 shiftCount;
   1231 
   1232     if ( a == 0 ) return 0;
   1233     zSign = ( a < 0 );
   1234     absA = zSign ? - a : a;
   1235     shiftCount = countLeadingZeros64( absA ) - 40;
   1236     if ( 0 <= shiftCount ) {
   1237         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
   1238     }
   1239     else {
   1240         shiftCount += 7;
   1241         if ( shiftCount < 0 ) {
   1242             shift64RightJamming( absA, - shiftCount, &absA );
   1243         }
   1244         else {
   1245             absA <<= shiftCount;
   1246         }
   1247         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
   1248     }
   1249 
   1250 }
   1251 
   1252 /*
   1253 -------------------------------------------------------------------------------
   1254 Returns the result of converting the 64-bit two's complement integer `a'
   1255 to the double-precision floating-point format.  The conversion is performed
   1256 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1257 -------------------------------------------------------------------------------
   1258 */
   1259 float64 int64_to_float64( int64 a )
   1260 {
   1261     flag zSign;
   1262 
   1263     if ( a == 0 ) return 0;
   1264     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
   1265         return packFloat64( 1, 0x43E, 0 );
   1266     }
   1267     zSign = ( a < 0 );
   1268     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
   1269 
   1270 }
   1271 
   1272 #ifdef FLOATX80
   1273 
   1274 /*
   1275 -------------------------------------------------------------------------------
   1276 Returns the result of converting the 64-bit two's complement integer `a'
   1277 to the extended double-precision floating-point format.  The conversion
   1278 is performed according to the IEC/IEEE Standard for Binary Floating-Point
   1279 Arithmetic.
   1280 -------------------------------------------------------------------------------
   1281 */
   1282 floatx80 int64_to_floatx80( int64 a )
   1283 {
   1284     flag zSign;
   1285     uint64 absA;
   1286     int8 shiftCount;
   1287 
   1288     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
   1289     zSign = ( a < 0 );
   1290     absA = zSign ? - a : a;
   1291     shiftCount = countLeadingZeros64( absA );
   1292     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
   1293 
   1294 }
   1295 
   1296 #endif
   1297 
   1298 #ifdef FLOAT128
   1299 
   1300 /*
   1301 -------------------------------------------------------------------------------
   1302 Returns the result of converting the 64-bit two's complement integer `a' to
   1303 the quadruple-precision floating-point format.  The conversion is performed
   1304 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1305 -------------------------------------------------------------------------------
   1306 */
   1307 float128 int64_to_float128( int64 a )
   1308 {
   1309     flag zSign;
   1310     uint64 absA;
   1311     int8 shiftCount;
   1312     int32 zExp;
   1313     bits64 zSig0, zSig1;
   1314 
   1315     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
   1316     zSign = ( a < 0 );
   1317     absA = zSign ? - a : a;
   1318     shiftCount = countLeadingZeros64( absA ) + 49;
   1319     zExp = 0x406E - shiftCount;
   1320     if ( 64 <= shiftCount ) {
   1321         zSig1 = 0;
   1322         zSig0 = absA;
   1323         shiftCount -= 64;
   1324     }
   1325     else {
   1326         zSig1 = absA;
   1327         zSig0 = 0;
   1328     }
   1329     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
   1330     return packFloat128( zSign, zExp, zSig0, zSig1 );
   1331 
   1332 }
   1333 
   1334 #endif
   1335 #endif /* !SOFTFLOAT_FOR_GCC */
   1336 
   1337 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
   1338 /*
   1339 -------------------------------------------------------------------------------
   1340 Returns the result of converting the single-precision floating-point value
   1341 `a' to the 32-bit two's complement integer format.  The conversion is
   1342 performed according to the IEC/IEEE Standard for Binary Floating-Point
   1343 Arithmetic---which means in particular that the conversion is rounded
   1344 according to the current rounding mode.  If `a' is a NaN, the largest
   1345 positive integer is returned.  Otherwise, if the conversion overflows, the
   1346 largest integer with the same sign as `a' is returned.
   1347 -------------------------------------------------------------------------------
   1348 */
   1349 int32 float32_to_int32( float32 a )
   1350 {
   1351     flag aSign;
   1352     int16 aExp, shiftCount;
   1353     bits32 aSig;
   1354     bits64 aSig64;
   1355 
   1356     aSig = extractFloat32Frac( a );
   1357     aExp = extractFloat32Exp( a );
   1358     aSign = extractFloat32Sign( a );
   1359     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
   1360     if ( aExp ) aSig |= 0x00800000;
   1361     shiftCount = 0xAF - aExp;
   1362     aSig64 = aSig;
   1363     aSig64 <<= 32;
   1364     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
   1365     return roundAndPackInt32( aSign, aSig64 );
   1366 
   1367 }
   1368 #endif /* !SOFTFLOAT_FOR_GCC */
   1369 
   1370 /*
   1371 -------------------------------------------------------------------------------
   1372 Returns the result of converting the single-precision floating-point value
   1373 `a' to the 32-bit two's complement integer format.  The conversion is
   1374 performed according to the IEC/IEEE Standard for Binary Floating-Point
   1375 Arithmetic, except that the conversion is always rounded toward zero.
   1376 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   1377 the conversion overflows, the largest integer with the same sign as `a' is
   1378 returned.
   1379 -------------------------------------------------------------------------------
   1380 */
   1381 int32 float32_to_int32_round_to_zero( float32 a )
   1382 {
   1383     flag aSign;
   1384     int16 aExp, shiftCount;
   1385     bits32 aSig;
   1386     int32 z;
   1387 
   1388     aSig = extractFloat32Frac( a );
   1389     aExp = extractFloat32Exp( a );
   1390     aSign = extractFloat32Sign( a );
   1391     shiftCount = aExp - 0x9E;
   1392     if ( 0 <= shiftCount ) {
   1393         if ( a != 0xCF000000 ) {
   1394             float_raise( float_flag_invalid );
   1395             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
   1396         }
   1397         return (sbits32) 0x80000000;
   1398     }
   1399     else if ( aExp <= 0x7E ) {
   1400         if ( aExp | aSig ) float_set_inexact();
   1401         return 0;
   1402     }
   1403     aSig = ( aSig | 0x00800000 )<<8;
   1404     z = aSig>>( - shiftCount );
   1405     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
   1406         float_set_inexact();
   1407     }
   1408     if ( aSign ) z = - z;
   1409     return z;
   1410 
   1411 }
   1412 
   1413 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
   1414 /*
   1415 -------------------------------------------------------------------------------
   1416 Returns the result of converting the single-precision floating-point value
   1417 `a' to the 64-bit two's complement integer format.  The conversion is
   1418 performed according to the IEC/IEEE Standard for Binary Floating-Point
   1419 Arithmetic---which means in particular that the conversion is rounded
   1420 according to the current rounding mode.  If `a' is a NaN, the largest
   1421 positive integer is returned.  Otherwise, if the conversion overflows, the
   1422 largest integer with the same sign as `a' is returned.
   1423 -------------------------------------------------------------------------------
   1424 */
   1425 int64 float32_to_int64( float32 a )
   1426 {
   1427     flag aSign;
   1428     int16 aExp, shiftCount;
   1429     bits32 aSig;
   1430     bits64 aSig64, aSigExtra;
   1431 
   1432     aSig = extractFloat32Frac( a );
   1433     aExp = extractFloat32Exp( a );
   1434     aSign = extractFloat32Sign( a );
   1435     shiftCount = 0xBE - aExp;
   1436     if ( shiftCount < 0 ) {
   1437         float_raise( float_flag_invalid );
   1438         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
   1439             return LIT64( 0x7FFFFFFFFFFFFFFF );
   1440         }
   1441         return (sbits64) LIT64( 0x8000000000000000 );
   1442     }
   1443     if ( aExp ) aSig |= 0x00800000;
   1444     aSig64 = aSig;
   1445     aSig64 <<= 40;
   1446     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
   1447     return roundAndPackInt64( aSign, aSig64, aSigExtra );
   1448 
   1449 }
   1450 
   1451 /*
   1452 -------------------------------------------------------------------------------
   1453 Returns the result of converting the single-precision floating-point value
   1454 `a' to the 64-bit two's complement integer format.  The conversion is
   1455 performed according to the IEC/IEEE Standard for Binary Floating-Point
   1456 Arithmetic, except that the conversion is always rounded toward zero.  If
   1457 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
   1458 conversion overflows, the largest integer with the same sign as `a' is
   1459 returned.
   1460 -------------------------------------------------------------------------------
   1461 */
   1462 int64 float32_to_int64_round_to_zero( float32 a )
   1463 {
   1464     flag aSign;
   1465     int16 aExp, shiftCount;
   1466     bits32 aSig;
   1467     bits64 aSig64;
   1468     int64 z;
   1469 
   1470     aSig = extractFloat32Frac( a );
   1471     aExp = extractFloat32Exp( a );
   1472     aSign = extractFloat32Sign( a );
   1473     shiftCount = aExp - 0xBE;
   1474     if ( 0 <= shiftCount ) {
   1475         if ( a != 0xDF000000 ) {
   1476             float_raise( float_flag_invalid );
   1477             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
   1478                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   1479             }
   1480         }
   1481         return (sbits64) LIT64( 0x8000000000000000 );
   1482     }
   1483     else if ( aExp <= 0x7E ) {
   1484         if ( aExp | aSig ) float_set_inexact();
   1485         return 0;
   1486     }
   1487     aSig64 = aSig | 0x00800000;
   1488     aSig64 <<= 40;
   1489     z = aSig64>>( - shiftCount );
   1490     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
   1491         float_set_inexact();
   1492     }
   1493     if ( aSign ) z = - z;
   1494     return z;
   1495 
   1496 }
   1497 #endif /* !SOFTFLOAT_FOR_GCC */
   1498 
   1499 /*
   1500 -------------------------------------------------------------------------------
   1501 Returns the result of converting the single-precision floating-point value
   1502 `a' to the double-precision floating-point format.  The conversion is
   1503 performed according to the IEC/IEEE Standard for Binary Floating-Point
   1504 Arithmetic.
   1505 -------------------------------------------------------------------------------
   1506 */
   1507 float64 float32_to_float64( float32 a )
   1508 {
   1509     flag aSign;
   1510     int16 aExp;
   1511     bits32 aSig;
   1512 
   1513     aSig = extractFloat32Frac( a );
   1514     aExp = extractFloat32Exp( a );
   1515     aSign = extractFloat32Sign( a );
   1516     if ( aExp == 0xFF ) {
   1517         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
   1518         return packFloat64( aSign, 0x7FF, 0 );
   1519     }
   1520     if ( aExp == 0 ) {
   1521         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
   1522         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1523         --aExp;
   1524     }
   1525     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
   1526 
   1527 }
   1528 
   1529 #ifdef FLOATX80
   1530 
   1531 /*
   1532 -------------------------------------------------------------------------------
   1533 Returns the result of converting the single-precision floating-point value
   1534 `a' to the extended double-precision floating-point format.  The conversion
   1535 is performed according to the IEC/IEEE Standard for Binary Floating-Point
   1536 Arithmetic.
   1537 -------------------------------------------------------------------------------
   1538 */
   1539 floatx80 float32_to_floatx80( float32 a )
   1540 {
   1541     flag aSign;
   1542     int16 aExp;
   1543     bits32 aSig;
   1544 
   1545     aSig = extractFloat32Frac( a );
   1546     aExp = extractFloat32Exp( a );
   1547     aSign = extractFloat32Sign( a );
   1548     if ( aExp == 0xFF ) {
   1549         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
   1550         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   1551     }
   1552     if ( aExp == 0 ) {
   1553         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
   1554         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1555     }
   1556     aSig |= 0x00800000;
   1557     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
   1558 
   1559 }
   1560 
   1561 #endif
   1562 
   1563 #ifdef FLOAT128
   1564 
   1565 /*
   1566 -------------------------------------------------------------------------------
   1567 Returns the result of converting the single-precision floating-point value
   1568 `a' to the double-precision floating-point format.  The conversion is
   1569 performed according to the IEC/IEEE Standard for Binary Floating-Point
   1570 Arithmetic.
   1571 -------------------------------------------------------------------------------
   1572 */
   1573 float128 float32_to_float128( float32 a )
   1574 {
   1575     flag aSign;
   1576     int16 aExp;
   1577     bits32 aSig;
   1578 
   1579     aSig = extractFloat32Frac( a );
   1580     aExp = extractFloat32Exp( a );
   1581     aSign = extractFloat32Sign( a );
   1582     if ( aExp == 0xFF ) {
   1583         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
   1584         return packFloat128( aSign, 0x7FFF, 0, 0 );
   1585     }
   1586     if ( aExp == 0 ) {
   1587         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
   1588         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1589         --aExp;
   1590     }
   1591     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
   1592 
   1593 }
   1594 
   1595 #endif
   1596 
   1597 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
   1598 /*
   1599 -------------------------------------------------------------------------------
   1600 Rounds the single-precision floating-point value `a' to an integer, and
   1601 returns the result as a single-precision floating-point value.  The
   1602 operation is performed according to the IEC/IEEE Standard for Binary
   1603 Floating-Point Arithmetic.
   1604 -------------------------------------------------------------------------------
   1605 */
   1606 float32 float32_round_to_int( float32 a )
   1607 {
   1608     flag aSign;
   1609     int16 aExp;
   1610     bits32 lastBitMask, roundBitsMask;
   1611     int8 roundingMode;
   1612     float32 z;
   1613 
   1614     aExp = extractFloat32Exp( a );
   1615     if ( 0x96 <= aExp ) {
   1616         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
   1617             return propagateFloat32NaN( a, a );
   1618         }
   1619         return a;
   1620     }
   1621     if ( aExp <= 0x7E ) {
   1622         if ( (bits32) ( a<<1 ) == 0 ) return a;
   1623         float_set_inexact();
   1624         aSign = extractFloat32Sign( a );
   1625         switch ( float_rounding_mode() ) {
   1626          case float_round_nearest_even:
   1627             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
   1628                 return packFloat32( aSign, 0x7F, 0 );
   1629             }
   1630             break;
   1631          case float_round_down:
   1632             return aSign ? 0xBF800000 : 0;
   1633          case float_round_up:
   1634             return aSign ? 0x80000000 : 0x3F800000;
   1635         }
   1636         return packFloat32( aSign, 0, 0 );
   1637     }
   1638     lastBitMask = 1;
   1639     lastBitMask <<= 0x96 - aExp;
   1640     roundBitsMask = lastBitMask - 1;
   1641     z = a;
   1642     roundingMode = float_rounding_mode();
   1643     if ( roundingMode == float_round_nearest_even ) {
   1644         z += lastBitMask>>1;
   1645         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
   1646     }
   1647     else if ( roundingMode != float_round_to_zero ) {
   1648         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
   1649             z += roundBitsMask;
   1650         }
   1651     }
   1652     z &= ~ roundBitsMask;
   1653     if ( z != a ) float_set_inexact();
   1654     return z;
   1655 
   1656 }
   1657 #endif /* !SOFTFLOAT_FOR_GCC */
   1658 
   1659 /*
   1660 -------------------------------------------------------------------------------
   1661 Returns the result of adding the absolute values of the single-precision
   1662 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
   1663 before being returned.  `zSign' is ignored if the result is a NaN.
   1664 The addition is performed according to the IEC/IEEE Standard for Binary
   1665 Floating-Point Arithmetic.
   1666 -------------------------------------------------------------------------------
   1667 */
   1668 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
   1669 {
   1670     int16 aExp, bExp, zExp;
   1671     bits32 aSig, bSig, zSig;
   1672     int16 expDiff;
   1673 
   1674     aSig = extractFloat32Frac( a );
   1675     aExp = extractFloat32Exp( a );
   1676     bSig = extractFloat32Frac( b );
   1677     bExp = extractFloat32Exp( b );
   1678     expDiff = aExp - bExp;
   1679     aSig <<= 6;
   1680     bSig <<= 6;
   1681     if ( 0 < expDiff ) {
   1682         if ( aExp == 0xFF ) {
   1683             if ( aSig ) return propagateFloat32NaN( a, b );
   1684             return a;
   1685         }
   1686         if ( bExp == 0 ) {
   1687             --expDiff;
   1688         }
   1689         else {
   1690             bSig |= 0x20000000;
   1691         }
   1692         shift32RightJamming( bSig, expDiff, &bSig );
   1693         zExp = aExp;
   1694     }
   1695     else if ( expDiff < 0 ) {
   1696         if ( bExp == 0xFF ) {
   1697             if ( bSig ) return propagateFloat32NaN( a, b );
   1698             return packFloat32( zSign, 0xFF, 0 );
   1699         }
   1700         if ( aExp == 0 ) {
   1701             ++expDiff;
   1702         }
   1703         else {
   1704             aSig |= 0x20000000;
   1705         }
   1706         shift32RightJamming( aSig, - expDiff, &aSig );
   1707         zExp = bExp;
   1708     }
   1709     else {
   1710         if ( aExp == 0xFF ) {
   1711             if ( aSig | bSig ) return propagateFloat32NaN( a, b );
   1712             return a;
   1713         }
   1714         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
   1715         zSig = 0x40000000 + aSig + bSig;
   1716         zExp = aExp;
   1717         goto roundAndPack;
   1718     }
   1719     aSig |= 0x20000000;
   1720     zSig = ( aSig + bSig )<<1;
   1721     --zExp;
   1722     if ( (sbits32) zSig < 0 ) {
   1723         zSig = aSig + bSig;
   1724         ++zExp;
   1725     }
   1726  roundAndPack:
   1727     return roundAndPackFloat32( zSign, zExp, zSig );
   1728 
   1729 }
   1730 
   1731 /*
   1732 -------------------------------------------------------------------------------
   1733 Returns the result of subtracting the absolute values of the single-
   1734 precision floating-point values `a' and `b'.  If `zSign' is 1, the
   1735 difference is negated before being returned.  `zSign' is ignored if the
   1736 result is a NaN.  The subtraction is performed according to the IEC/IEEE
   1737 Standard for Binary Floating-Point Arithmetic.
   1738 -------------------------------------------------------------------------------
   1739 */
   1740 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
   1741 {
   1742     int16 aExp, bExp, zExp;
   1743     bits32 aSig, bSig, zSig;
   1744     int16 expDiff;
   1745 
   1746     aSig = extractFloat32Frac( a );
   1747     aExp = extractFloat32Exp( a );
   1748     bSig = extractFloat32Frac( b );
   1749     bExp = extractFloat32Exp( b );
   1750     expDiff = aExp - bExp;
   1751     aSig <<= 7;
   1752     bSig <<= 7;
   1753     if ( 0 < expDiff ) goto aExpBigger;
   1754     if ( expDiff < 0 ) goto bExpBigger;
   1755     if ( aExp == 0xFF ) {
   1756         if ( aSig | bSig ) return propagateFloat32NaN( a, b );
   1757         float_raise( float_flag_invalid );
   1758         return float32_default_nan;
   1759     }
   1760     if ( aExp == 0 ) {
   1761         aExp = 1;
   1762         bExp = 1;
   1763     }
   1764     if ( bSig < aSig ) goto aBigger;
   1765     if ( aSig < bSig ) goto bBigger;
   1766     return packFloat32( float_rounding_mode() == float_round_down, 0, 0 );
   1767  bExpBigger:
   1768     if ( bExp == 0xFF ) {
   1769         if ( bSig ) return propagateFloat32NaN( a, b );
   1770         return packFloat32( zSign ^ 1, 0xFF, 0 );
   1771     }
   1772     if ( aExp == 0 ) {
   1773         ++expDiff;
   1774     }
   1775     else {
   1776         aSig |= 0x40000000;
   1777     }
   1778     shift32RightJamming( aSig, - expDiff, &aSig );
   1779     bSig |= 0x40000000;
   1780  bBigger:
   1781     zSig = bSig - aSig;
   1782     zExp = bExp;
   1783     zSign ^= 1;
   1784     goto normalizeRoundAndPack;
   1785  aExpBigger:
   1786     if ( aExp == 0xFF ) {
   1787         if ( aSig ) return propagateFloat32NaN( a, b );
   1788         return a;
   1789     }
   1790     if ( bExp == 0 ) {
   1791         --expDiff;
   1792     }
   1793     else {
   1794         bSig |= 0x40000000;
   1795     }
   1796     shift32RightJamming( bSig, expDiff, &bSig );
   1797     aSig |= 0x40000000;
   1798  aBigger:
   1799     zSig = aSig - bSig;
   1800     zExp = aExp;
   1801  normalizeRoundAndPack:
   1802     --zExp;
   1803     return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
   1804 
   1805 }
   1806 
   1807 /*
   1808 -------------------------------------------------------------------------------
   1809 Returns the result of adding the single-precision floating-point values `a'
   1810 and `b'.  The operation is performed according to the IEC/IEEE Standard for
   1811 Binary Floating-Point Arithmetic.
   1812 -------------------------------------------------------------------------------
   1813 */
   1814 float32 float32_add( float32 a, float32 b )
   1815 {
   1816     flag aSign, bSign;
   1817 
   1818     aSign = extractFloat32Sign( a );
   1819     bSign = extractFloat32Sign( b );
   1820     if ( aSign == bSign ) {
   1821         return addFloat32Sigs( a, b, aSign );
   1822     }
   1823     else {
   1824         return subFloat32Sigs( a, b, aSign );
   1825     }
   1826 
   1827 }
   1828 
   1829 /*
   1830 -------------------------------------------------------------------------------
   1831 Returns the result of subtracting the single-precision floating-point values
   1832 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   1833 for Binary Floating-Point Arithmetic.
   1834 -------------------------------------------------------------------------------
   1835 */
   1836 float32 float32_sub( float32 a, float32 b )
   1837 {
   1838     flag aSign, bSign;
   1839 
   1840     aSign = extractFloat32Sign( a );
   1841     bSign = extractFloat32Sign( b );
   1842     if ( aSign == bSign ) {
   1843         return subFloat32Sigs( a, b, aSign );
   1844     }
   1845     else {
   1846         return addFloat32Sigs( a, b, aSign );
   1847     }
   1848 
   1849 }
   1850 
   1851 /*
   1852 -------------------------------------------------------------------------------
   1853 Returns the result of multiplying the single-precision floating-point values
   1854 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   1855 for Binary Floating-Point Arithmetic.
   1856 -------------------------------------------------------------------------------
   1857 */
   1858 float32 float32_mul( float32 a, float32 b )
   1859 {
   1860     flag aSign, bSign, zSign;
   1861     int16 aExp, bExp, zExp;
   1862     bits32 aSig, bSig;
   1863     bits64 zSig64;
   1864     bits32 zSig;
   1865 
   1866     aSig = extractFloat32Frac( a );
   1867     aExp = extractFloat32Exp( a );
   1868     aSign = extractFloat32Sign( a );
   1869     bSig = extractFloat32Frac( b );
   1870     bExp = extractFloat32Exp( b );
   1871     bSign = extractFloat32Sign( b );
   1872     zSign = aSign ^ bSign;
   1873     if ( aExp == 0xFF ) {
   1874         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
   1875             return propagateFloat32NaN( a, b );
   1876         }
   1877         if ( ( bExp | bSig ) == 0 ) {
   1878             float_raise( float_flag_invalid );
   1879             return float32_default_nan;
   1880         }
   1881         return packFloat32( zSign, 0xFF, 0 );
   1882     }
   1883     if ( bExp == 0xFF ) {
   1884         if ( bSig ) return propagateFloat32NaN( a, b );
   1885         if ( ( aExp | aSig ) == 0 ) {
   1886             float_raise( float_flag_invalid );
   1887             return float32_default_nan;
   1888         }
   1889         return packFloat32( zSign, 0xFF, 0 );
   1890     }
   1891     if ( aExp == 0 ) {
   1892         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
   1893         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1894     }
   1895     if ( bExp == 0 ) {
   1896         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
   1897         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
   1898     }
   1899     zExp = aExp + bExp - 0x7F;
   1900     aSig = ( aSig | 0x00800000 )<<7;
   1901     bSig = ( bSig | 0x00800000 )<<8;
   1902     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
   1903     zSig = zSig64;
   1904     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
   1905         zSig <<= 1;
   1906         --zExp;
   1907     }
   1908     return roundAndPackFloat32( zSign, zExp, zSig );
   1909 
   1910 }
   1911 
   1912 /*
   1913 -------------------------------------------------------------------------------
   1914 Returns the result of dividing the single-precision floating-point value `a'
   1915 by the corresponding value `b'.  The operation is performed according to the
   1916 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1917 -------------------------------------------------------------------------------
   1918 */
   1919 float32 float32_div( float32 a, float32 b )
   1920 {
   1921     flag aSign, bSign, zSign;
   1922     int16 aExp, bExp, zExp;
   1923     bits32 aSig, bSig, zSig;
   1924 
   1925     aSig = extractFloat32Frac( a );
   1926     aExp = extractFloat32Exp( a );
   1927     aSign = extractFloat32Sign( a );
   1928     bSig = extractFloat32Frac( b );
   1929     bExp = extractFloat32Exp( b );
   1930     bSign = extractFloat32Sign( b );
   1931     zSign = aSign ^ bSign;
   1932     if ( aExp == 0xFF ) {
   1933         if ( aSig ) return propagateFloat32NaN( a, b );
   1934         if ( bExp == 0xFF ) {
   1935             if ( bSig ) return propagateFloat32NaN( a, b );
   1936             float_raise( float_flag_invalid );
   1937             return float32_default_nan;
   1938         }
   1939         return packFloat32( zSign, 0xFF, 0 );
   1940     }
   1941     if ( bExp == 0xFF ) {
   1942         if ( bSig ) return propagateFloat32NaN( a, b );
   1943         return packFloat32( zSign, 0, 0 );
   1944     }
   1945     if ( bExp == 0 ) {
   1946         if ( bSig == 0 ) {
   1947             if ( ( aExp | aSig ) == 0 ) {
   1948                 float_raise( float_flag_invalid );
   1949                 return float32_default_nan;
   1950             }
   1951             float_raise( float_flag_divbyzero );
   1952             return packFloat32( zSign, 0xFF, 0 );
   1953         }
   1954         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
   1955     }
   1956     if ( aExp == 0 ) {
   1957         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
   1958         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1959     }
   1960     zExp = aExp - bExp + 0x7D;
   1961     aSig = ( aSig | 0x00800000 )<<7;
   1962     bSig = ( bSig | 0x00800000 )<<8;
   1963     if ( bSig <= ( aSig + aSig ) ) {
   1964         aSig >>= 1;
   1965         ++zExp;
   1966     }
   1967     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
   1968     if ( ( zSig & 0x3F ) == 0 ) {
   1969         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
   1970     }
   1971     return roundAndPackFloat32( zSign, zExp, zSig );
   1972 
   1973 }
   1974 
   1975 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
   1976 /*
   1977 -------------------------------------------------------------------------------
   1978 Returns the remainder of the single-precision floating-point value `a'
   1979 with respect to the corresponding value `b'.  The operation is performed
   1980 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1981 -------------------------------------------------------------------------------
   1982 */
   1983 float32 float32_rem( float32 a, float32 b )
   1984 {
   1985     flag aSign, bSign, zSign;
   1986     int16 aExp, bExp, expDiff;
   1987     bits32 aSig, bSig;
   1988     bits32 q;
   1989     bits64 aSig64, bSig64, q64;
   1990     bits32 alternateASig;
   1991     sbits32 sigMean;
   1992 
   1993     aSig = extractFloat32Frac( a );
   1994     aExp = extractFloat32Exp( a );
   1995     aSign = extractFloat32Sign( a );
   1996     bSig = extractFloat32Frac( b );
   1997     bExp = extractFloat32Exp( b );
   1998     bSign = extractFloat32Sign( b );
   1999     if ( aExp == 0xFF ) {
   2000         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
   2001             return propagateFloat32NaN( a, b );
   2002         }
   2003         float_raise( float_flag_invalid );
   2004         return float32_default_nan;
   2005     }
   2006     if ( bExp == 0xFF ) {
   2007         if ( bSig ) return propagateFloat32NaN( a, b );
   2008         return a;
   2009     }
   2010     if ( bExp == 0 ) {
   2011         if ( bSig == 0 ) {
   2012             float_raise( float_flag_invalid );
   2013             return float32_default_nan;
   2014         }
   2015         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
   2016     }
   2017     if ( aExp == 0 ) {
   2018         if ( aSig == 0 ) return a;
   2019         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   2020     }
   2021     expDiff = aExp - bExp;
   2022     aSig |= 0x00800000;
   2023     bSig |= 0x00800000;
   2024     if ( expDiff < 32 ) {
   2025         aSig <<= 8;
   2026         bSig <<= 8;
   2027         if ( expDiff < 0 ) {
   2028             if ( expDiff < -1 ) return a;
   2029             aSig >>= 1;
   2030         }
   2031         q = ( bSig <= aSig );
   2032         if ( q ) aSig -= bSig;
   2033         if ( 0 < expDiff ) {
   2034             q = ( ( (bits64) aSig )<<32 ) / bSig;
   2035             q >>= 32 - expDiff;
   2036             bSig >>= 2;
   2037             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
   2038         }
   2039         else {
   2040             aSig >>= 2;
   2041             bSig >>= 2;
   2042         }
   2043     }
   2044     else {
   2045         if ( bSig <= aSig ) aSig -= bSig;
   2046         aSig64 = ( (bits64) aSig )<<40;
   2047         bSig64 = ( (bits64) bSig )<<40;
   2048         expDiff -= 64;
   2049         while ( 0 < expDiff ) {
   2050             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
   2051             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
   2052             aSig64 = - ( ( bSig * q64 )<<38 );
   2053             expDiff -= 62;
   2054         }
   2055         expDiff += 64;
   2056         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
   2057         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
   2058         q = q64>>( 64 - expDiff );
   2059         bSig <<= 6;
   2060         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
   2061     }
   2062     do {
   2063         alternateASig = aSig;
   2064         ++q;
   2065         aSig -= bSig;
   2066     } while ( 0 <= (sbits32) aSig );
   2067     sigMean = aSig + alternateASig;
   2068     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
   2069         aSig = alternateASig;
   2070     }
   2071     zSign = ( (sbits32) aSig < 0 );
   2072     if ( zSign ) aSig = - aSig;
   2073     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
   2074 
   2075 }
   2076 #endif /* !SOFTFLOAT_FOR_GCC */
   2077 
   2078 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
   2079 /*
   2080 -------------------------------------------------------------------------------
   2081 Returns the square root of the single-precision floating-point value `a'.
   2082 The operation is performed according to the IEC/IEEE Standard for Binary
   2083 Floating-Point Arithmetic.
   2084 -------------------------------------------------------------------------------
   2085 */
   2086 float32 float32_sqrt( float32 a )
   2087 {
   2088     flag aSign;
   2089     int16 aExp, zExp;
   2090     bits32 aSig, zSig;
   2091     bits64 rem, term;
   2092 
   2093     aSig = extractFloat32Frac( a );
   2094     aExp = extractFloat32Exp( a );
   2095     aSign = extractFloat32Sign( a );
   2096     if ( aExp == 0xFF ) {
   2097         if ( aSig ) return propagateFloat32NaN( a, 0 );
   2098         if ( ! aSign ) return a;
   2099         float_raise( float_flag_invalid );
   2100         return float32_default_nan;
   2101     }
   2102     if ( aSign ) {
   2103         if ( ( aExp | aSig ) == 0 ) return a;
   2104         float_raise( float_flag_invalid );
   2105         return float32_default_nan;
   2106     }
   2107     if ( aExp == 0 ) {
   2108         if ( aSig == 0 ) return 0;
   2109         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   2110     }
   2111     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
   2112     aSig = ( aSig | 0x00800000 )<<8;
   2113     zSig = estimateSqrt32( aExp, aSig ) + 2;
   2114     if ( ( zSig & 0x7F ) <= 5 ) {
   2115         if ( zSig < 2 ) {
   2116             zSig = 0x7FFFFFFF;
   2117             goto roundAndPack;
   2118         }
   2119         aSig >>= aExp & 1;
   2120         term = ( (bits64) zSig ) * zSig;
   2121         rem = ( ( (bits64) aSig )<<32 ) - term;
   2122         while ( (sbits64) rem < 0 ) {
   2123             --zSig;
   2124             rem += ( ( (bits64) zSig )<<1 ) | 1;
   2125         }
   2126         zSig |= ( rem != 0 );
   2127     }
   2128     shift32RightJamming( zSig, 1, &zSig );
   2129  roundAndPack:
   2130     return roundAndPackFloat32( 0, zExp, zSig );
   2131 
   2132 }
   2133 #endif /* !SOFTFLOAT_FOR_GCC */
   2134 
   2135 /*
   2136 -------------------------------------------------------------------------------
   2137 Returns 1 if the single-precision floating-point value `a' is equal to
   2138 the corresponding value `b', and 0 otherwise.  The comparison is performed
   2139 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2140 -------------------------------------------------------------------------------
   2141 */
   2142 flag float32_eq( float32 a, float32 b )
   2143 {
   2144 
   2145     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2146          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2147        ) {
   2148         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
   2149             float_raise( float_flag_invalid );
   2150         }
   2151         return 0;
   2152     }
   2153     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
   2154 
   2155 }
   2156 
   2157 /*
   2158 -------------------------------------------------------------------------------
   2159 Returns 1 if the single-precision floating-point value `a' is less than
   2160 or equal to the corresponding value `b', and 0 otherwise.  The comparison
   2161 is performed according to the IEC/IEEE Standard for Binary Floating-Point
   2162 Arithmetic.
   2163 -------------------------------------------------------------------------------
   2164 */
   2165 flag float32_le( float32 a, float32 b )
   2166 {
   2167     flag aSign, bSign;
   2168 
   2169     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2170          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2171        ) {
   2172         float_raise( float_flag_invalid );
   2173         return 0;
   2174     }
   2175     aSign = extractFloat32Sign( a );
   2176     bSign = extractFloat32Sign( b );
   2177     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
   2178     return ( a == b ) || ( aSign ^ ( a < b ) );
   2179 
   2180 }
   2181 
   2182 /*
   2183 -------------------------------------------------------------------------------
   2184 Returns 1 if the single-precision floating-point value `a' is less than
   2185 the corresponding value `b', and 0 otherwise.  The comparison is performed
   2186 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2187 -------------------------------------------------------------------------------
   2188 */
   2189 flag float32_lt( float32 a, float32 b )
   2190 {
   2191     flag aSign, bSign;
   2192 
   2193     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2194          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2195        ) {
   2196         float_raise( float_flag_invalid );
   2197         return 0;
   2198     }
   2199     aSign = extractFloat32Sign( a );
   2200     bSign = extractFloat32Sign( b );
   2201     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
   2202     return ( a != b ) && ( aSign ^ ( a < b ) );
   2203 
   2204 }
   2205 
   2206 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
   2207 /*
   2208 -------------------------------------------------------------------------------
   2209 Returns 1 if the single-precision floating-point value `a' is equal to
   2210 the corresponding value `b', and 0 otherwise.  The invalid exception is
   2211 raised if either operand is a NaN.  Otherwise, the comparison is performed
   2212 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2213 -------------------------------------------------------------------------------
   2214 */
   2215 flag float32_eq_signaling( float32 a, float32 b )
   2216 {
   2217 
   2218     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2219          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2220        ) {
   2221         float_raise( float_flag_invalid );
   2222         return 0;
   2223     }
   2224     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
   2225 
   2226 }
   2227 
   2228 /*
   2229 -------------------------------------------------------------------------------
   2230 Returns 1 if the single-precision floating-point value `a' is less than or
   2231 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
   2232 cause an exception.  Otherwise, the comparison is performed according to the
   2233 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2234 -------------------------------------------------------------------------------
   2235 */
   2236 flag float32_le_quiet( float32 a, float32 b )
   2237 {
   2238     flag aSign, bSign;
   2239 
   2240     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2241          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2242        ) {
   2243         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
   2244             float_raise( float_flag_invalid );
   2245         }
   2246         return 0;
   2247     }
   2248     aSign = extractFloat32Sign( a );
   2249     bSign = extractFloat32Sign( b );
   2250     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
   2251     return ( a == b ) || ( aSign ^ ( a < b ) );
   2252 
   2253 }
   2254 
   2255 /*
   2256 -------------------------------------------------------------------------------
   2257 Returns 1 if the single-precision floating-point value `a' is less than
   2258 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
   2259 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
   2260 Standard for Binary Floating-Point Arithmetic.
   2261 -------------------------------------------------------------------------------
   2262 */
   2263 flag float32_lt_quiet( float32 a, float32 b )
   2264 {
   2265     flag aSign, bSign;
   2266 
   2267     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2268          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2269        ) {
   2270         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
   2271             float_raise( float_flag_invalid );
   2272         }
   2273         return 0;
   2274     }
   2275     aSign = extractFloat32Sign( a );
   2276     bSign = extractFloat32Sign( b );
   2277     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
   2278     return ( a != b ) && ( aSign ^ ( a < b ) );
   2279 
   2280 }
   2281 #endif /* !SOFTFLOAT_FOR_GCC */
   2282 
   2283 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
   2284 /*
   2285 -------------------------------------------------------------------------------
   2286 Returns the result of converting the double-precision floating-point value
   2287 `a' to the 32-bit two's complement integer format.  The conversion is
   2288 performed according to the IEC/IEEE Standard for Binary Floating-Point
   2289 Arithmetic---which means in particular that the conversion is rounded
   2290 according to the current rounding mode.  If `a' is a NaN, the largest
   2291 positive integer is returned.  Otherwise, if the conversion overflows, the
   2292 largest integer with the same sign as `a' is returned.
   2293 -------------------------------------------------------------------------------
   2294 */
   2295 int32 float64_to_int32( float64 a )
   2296 {
   2297     flag aSign;
   2298     int16 aExp, shiftCount;
   2299     bits64 aSig;
   2300 
   2301     aSig = extractFloat64Frac( a );
   2302     aExp = extractFloat64Exp( a );
   2303     aSign = extractFloat64Sign( a );
   2304     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
   2305     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
   2306     shiftCount = 0x42C - aExp;
   2307     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
   2308     return roundAndPackInt32( aSign, aSig );
   2309 
   2310 }
   2311 #endif /* !SOFTFLOAT_FOR_GCC */
   2312 
   2313 /*
   2314 -------------------------------------------------------------------------------
   2315 Returns the result of converting the double-precision floating-point value
   2316 `a' to the 32-bit two's complement integer format.  The conversion is
   2317 performed according to the IEC/IEEE Standard for Binary Floating-Point
   2318 Arithmetic, except that the conversion is always rounded toward zero.
   2319 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   2320 the conversion overflows, the largest integer with the same sign as `a' is
   2321 returned.
   2322 -------------------------------------------------------------------------------
   2323 */
   2324 int32 float64_to_int32_round_to_zero( float64 a )
   2325 {
   2326     flag aSign;
   2327     int16 aExp, shiftCount;
   2328     bits64 aSig, savedASig;
   2329     int32 z;
   2330 
   2331     aSig = extractFloat64Frac( a );
   2332     aExp = extractFloat64Exp( a );
   2333     aSign = extractFloat64Sign( a );
   2334     if ( 0x41E < aExp ) {
   2335         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
   2336         goto invalid;
   2337     }
   2338     else if ( aExp < 0x3FF ) {
   2339         if ( aExp || aSig ) float_set_inexact();
   2340         return 0;
   2341     }
   2342     aSig |= LIT64( 0x0010000000000000 );
   2343     shiftCount = 0x433 - aExp;
   2344     savedASig = aSig;
   2345     aSig >>= shiftCount;
   2346     z = aSig;
   2347     if ( aSign ) z = - z;
   2348     if ( ( z < 0 ) ^ aSign ) {
   2349  invalid:
   2350         float_raise( float_flag_invalid );
   2351         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
   2352     }
   2353     if ( ( aSig<<shiftCount ) != savedASig ) {
   2354         float_set_inexact();
   2355     }
   2356     return z;
   2357 
   2358 }
   2359 
   2360 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
   2361 /*
   2362 -------------------------------------------------------------------------------
   2363 Returns the result of converting the double-precision floating-point value
   2364 `a' to the 64-bit two's complement integer format.  The conversion is
   2365 performed according to the IEC/IEEE Standard for Binary Floating-Point
   2366 Arithmetic---which means in particular that the conversion is rounded
   2367 according to the current rounding mode.  If `a' is a NaN, the largest
   2368 positive integer is returned.  Otherwise, if the conversion overflows, the
   2369 largest integer with the same sign as `a' is returned.
   2370 -------------------------------------------------------------------------------
   2371 */
   2372 int64 float64_to_int64( float64 a )
   2373 {
   2374     flag aSign;
   2375     int16 aExp, shiftCount;
   2376     bits64 aSig, aSigExtra;
   2377 
   2378     aSig = extractFloat64Frac( a );
   2379     aExp = extractFloat64Exp( a );
   2380     aSign = extractFloat64Sign( a );
   2381     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
   2382     shiftCount = 0x433 - aExp;
   2383     if ( shiftCount <= 0 ) {
   2384         if ( 0x43E < aExp ) {
   2385             float_raise( float_flag_invalid );
   2386             if (    ! aSign
   2387                  || (    ( aExp == 0x7FF )
   2388                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
   2389                ) {
   2390                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   2391             }
   2392             return (sbits64) LIT64( 0x8000000000000000 );
   2393         }
   2394         aSigExtra = 0;
   2395         aSig <<= - shiftCount;
   2396     }
   2397     else {
   2398         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
   2399     }
   2400     return roundAndPackInt64( aSign, aSig, aSigExtra );
   2401 
   2402 }
   2403 
   2404 /*
   2405 -------------------------------------------------------------------------------
   2406 Returns the result of converting the double-precision floating-point value
   2407 `a' to the 64-bit two's complement integer format.  The conversion is
   2408 performed according to the IEC/IEEE Standard for Binary Floating-Point
   2409 Arithmetic, except that the conversion is always rounded toward zero.
   2410 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   2411 the conversion overflows, the largest integer with the same sign as `a' is
   2412 returned.
   2413 -------------------------------------------------------------------------------
   2414 */
   2415 int64 float64_to_int64_round_to_zero( float64 a )
   2416 {
   2417     flag aSign;
   2418     int16 aExp, shiftCount;
   2419     bits64 aSig;
   2420     int64 z;
   2421 
   2422     aSig = extractFloat64Frac( a );
   2423     aExp = extractFloat64Exp( a );
   2424     aSign = extractFloat64Sign( a );
   2425     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
   2426     shiftCount = aExp - 0x433;
   2427     if ( 0 <= shiftCount ) {
   2428         if ( 0x43E <= aExp ) {
   2429             if ( a != LIT64( 0xC3E0000000000000 ) ) {
   2430                 float_raise( float_flag_invalid );
   2431                 if (    ! aSign
   2432                      || (    ( aExp == 0x7FF )
   2433                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
   2434                    ) {
   2435                     return LIT64( 0x7FFFFFFFFFFFFFFF );
   2436                 }
   2437             }
   2438             return (sbits64) LIT64( 0x8000000000000000 );
   2439         }
   2440         z = aSig<<shiftCount;
   2441     }
   2442     else {
   2443         if ( aExp < 0x3FE ) {
   2444             if ( aExp | aSig ) float_set_inexact();
   2445             return 0;
   2446         }
   2447         z = aSig>>( - shiftCount );
   2448         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
   2449             float_set_inexact();
   2450         }
   2451     }
   2452     if ( aSign ) z = - z;
   2453     return z;
   2454 
   2455 }
   2456 #endif /* !SOFTFLOAT_FOR_GCC */
   2457 
   2458 /*
   2459 -------------------------------------------------------------------------------
   2460 Returns the result of converting the double-precision floating-point value
   2461 `a' to the single-precision floating-point format.  The conversion is
   2462 performed according to the IEC/IEEE Standard for Binary Floating-Point
   2463 Arithmetic.
   2464 -------------------------------------------------------------------------------
   2465 */
   2466 float32 float64_to_float32( float64 a )
   2467 {
   2468     flag aSign;
   2469     int16 aExp;
   2470     bits64 aSig;
   2471     bits32 zSig;
   2472 
   2473     aSig = extractFloat64Frac( a );
   2474     aExp = extractFloat64Exp( a );
   2475     aSign = extractFloat64Sign( a );
   2476     if ( aExp == 0x7FF ) {
   2477         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
   2478         return packFloat32( aSign, 0xFF, 0 );
   2479     }
   2480     shift64RightJamming( aSig, 22, &aSig );
   2481     zSig = aSig;
   2482     if ( aExp || zSig ) {
   2483         zSig |= 0x40000000;
   2484         aExp -= 0x381;
   2485     }
   2486     return roundAndPackFloat32( aSign, aExp, zSig );
   2487 
   2488 }
   2489 
   2490 #ifdef FLOATX80
   2491 
   2492 /*
   2493 -------------------------------------------------------------------------------
   2494 Returns the result of converting the double-precision floating-point value
   2495 `a' to the extended double-precision floating-point format.  The conversion
   2496 is performed according to the IEC/IEEE Standard for Binary Floating-Point
   2497 Arithmetic.
   2498 -------------------------------------------------------------------------------
   2499 */
   2500 floatx80 float64_to_floatx80( float64 a )
   2501 {
   2502     flag aSign;
   2503     int16 aExp;
   2504     bits64 aSig;
   2505 
   2506     aSig = extractFloat64Frac( a );
   2507     aExp = extractFloat64Exp( a );
   2508     aSign = extractFloat64Sign( a );
   2509     if ( aExp == 0x7FF ) {
   2510         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
   2511         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   2512     }
   2513     if ( aExp == 0 ) {
   2514         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
   2515         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   2516     }
   2517     return
   2518         packFloatx80(
   2519             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
   2520 
   2521 }
   2522 
   2523 #endif
   2524 
   2525 #ifdef FLOAT128
   2526 
   2527 /*
   2528 -------------------------------------------------------------------------------
   2529 Returns the result of converting the double-precision floating-point value
   2530 `a' to the quadruple-precision floating-point format.  The conversion is
   2531 performed according to the IEC/IEEE Standard for Binary Floating-Point
   2532 Arithmetic.
   2533 -------------------------------------------------------------------------------
   2534 */
   2535 float128 float64_to_float128( float64 a )
   2536 {
   2537     flag aSign;
   2538     int16 aExp;
   2539     bits64 aSig, zSig0, zSig1;
   2540 
   2541     aSig = extractFloat64Frac( a );
   2542     aExp = extractFloat64Exp( a );
   2543     aSign = extractFloat64Sign( a );
   2544     if ( aExp == 0x7FF ) {
   2545         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
   2546         return packFloat128( aSign, 0x7FFF, 0, 0 );
   2547     }
   2548     if ( aExp == 0 ) {
   2549         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
   2550         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   2551         --aExp;
   2552     }
   2553     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
   2554     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
   2555 
   2556 }
   2557 
   2558 #endif
   2559 
   2560 #ifndef SOFTFLOAT_FOR_GCC
   2561 /*
   2562 -------------------------------------------------------------------------------
   2563 Rounds the double-precision floating-point value `a' to an integer, and
   2564 returns the result as a double-precision floating-point value.  The
   2565 operation is performed according to the IEC/IEEE Standard for Binary
   2566 Floating-Point Arithmetic.
   2567 -------------------------------------------------------------------------------
   2568 */
   2569 float64 float64_round_to_int( float64 a )
   2570 {
   2571     flag aSign;
   2572     int16 aExp;
   2573     bits64 lastBitMask, roundBitsMask;
   2574     int8 roundingMode;
   2575     float64 z;
   2576 
   2577     aExp = extractFloat64Exp( a );
   2578     if ( 0x433 <= aExp ) {
   2579         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
   2580             return propagateFloat64NaN( a, a );
   2581         }
   2582         return a;
   2583     }
   2584     if ( aExp < 0x3FF ) {
   2585         if ( (bits64) ( a<<1 ) == 0 ) return a;
   2586         float_set_inexact();
   2587         aSign = extractFloat64Sign( a );
   2588         switch ( float_rounding_mode() ) {
   2589          case float_round_nearest_even:
   2590             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
   2591                 return packFloat64( aSign, 0x3FF, 0 );
   2592             }
   2593             break;
   2594          case float_round_down:
   2595             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
   2596          case float_round_up:
   2597             return
   2598             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
   2599         }
   2600         return packFloat64( aSign, 0, 0 );
   2601     }
   2602     lastBitMask = 1;
   2603     lastBitMask <<= 0x433 - aExp;
   2604     roundBitsMask = lastBitMask - 1;
   2605     z = a;
   2606     roundingMode = float_rounding_mode();
   2607     if ( roundingMode == float_round_nearest_even ) {
   2608         z += lastBitMask>>1;
   2609         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
   2610     }
   2611     else if ( roundingMode != float_round_to_zero ) {
   2612         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
   2613             z += roundBitsMask;
   2614         }
   2615     }
   2616     z &= ~ roundBitsMask;
   2617     if ( z != a ) float_set_inexact();
   2618     return z;
   2619 
   2620 }
   2621 #endif
   2622 
   2623 /*
   2624 -------------------------------------------------------------------------------
   2625 Returns the result of adding the absolute values of the double-precision
   2626 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
   2627 before being returned.  `zSign' is ignored if the result is a NaN.
   2628 The addition is performed according to the IEC/IEEE Standard for Binary
   2629 Floating-Point Arithmetic.
   2630 -------------------------------------------------------------------------------
   2631 */
   2632 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
   2633 {
   2634     int16 aExp, bExp, zExp;
   2635     bits64 aSig, bSig, zSig;
   2636     int16 expDiff;
   2637 
   2638     aSig = extractFloat64Frac( a );
   2639     aExp = extractFloat64Exp( a );
   2640     bSig = extractFloat64Frac( b );
   2641     bExp = extractFloat64Exp( b );
   2642     expDiff = aExp - bExp;
   2643     aSig <<= 9;
   2644     bSig <<= 9;
   2645     if ( 0 < expDiff ) {
   2646         if ( aExp == 0x7FF ) {
   2647             if ( aSig ) return propagateFloat64NaN( a, b );
   2648             return a;
   2649         }
   2650         if ( bExp == 0 ) {
   2651             --expDiff;
   2652         }
   2653         else {
   2654             bSig |= LIT64( 0x2000000000000000 );
   2655         }
   2656         shift64RightJamming( bSig, expDiff, &bSig );
   2657         zExp = aExp;
   2658     }
   2659     else if ( expDiff < 0 ) {
   2660         if ( bExp == 0x7FF ) {
   2661             if ( bSig ) return propagateFloat64NaN( a, b );
   2662             return packFloat64( zSign, 0x7FF, 0 );
   2663         }
   2664         if ( aExp == 0 ) {
   2665             ++expDiff;
   2666         }
   2667         else {
   2668             aSig |= LIT64( 0x2000000000000000 );
   2669         }
   2670         shift64RightJamming( aSig, - expDiff, &aSig );
   2671         zExp = bExp;
   2672     }
   2673     else {
   2674         if ( aExp == 0x7FF ) {
   2675             if ( aSig | bSig ) return propagateFloat64NaN( a, b );
   2676             return a;
   2677         }
   2678         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
   2679         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
   2680         zExp = aExp;
   2681         goto roundAndPack;
   2682     }
   2683     aSig |= LIT64( 0x2000000000000000 );
   2684     zSig = ( aSig + bSig )<<1;
   2685     --zExp;
   2686     if ( (sbits64) zSig < 0 ) {
   2687         zSig = aSig + bSig;
   2688         ++zExp;
   2689     }
   2690  roundAndPack:
   2691     return roundAndPackFloat64( zSign, zExp, zSig );
   2692 
   2693 }
   2694 
   2695 /*
   2696 -------------------------------------------------------------------------------
   2697 Returns the result of subtracting the absolute values of the double-
   2698 precision floating-point values `a' and `b'.  If `zSign' is 1, the
   2699 difference is negated before being returned.  `zSign' is ignored if the
   2700 result is a NaN.  The subtraction is performed according to the IEC/IEEE
   2701 Standard for Binary Floating-Point Arithmetic.
   2702 -------------------------------------------------------------------------------
   2703 */
   2704 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
   2705 {
   2706     int16 aExp, bExp, zExp;
   2707     bits64 aSig, bSig, zSig;
   2708     int16 expDiff;
   2709 
   2710     aSig = extractFloat64Frac( a );
   2711     aExp = extractFloat64Exp( a );
   2712     bSig = extractFloat64Frac( b );
   2713     bExp = extractFloat64Exp( b );
   2714     expDiff = aExp - bExp;
   2715     aSig <<= 10;
   2716     bSig <<= 10;
   2717     if ( 0 < expDiff ) goto aExpBigger;
   2718     if ( expDiff < 0 ) goto bExpBigger;
   2719     if ( aExp == 0x7FF ) {
   2720         if ( aSig | bSig ) return propagateFloat64NaN( a, b );
   2721         float_raise( float_flag_invalid );
   2722         return float64_default_nan;
   2723     }
   2724     if ( aExp == 0 ) {
   2725         aExp = 1;
   2726         bExp = 1;
   2727     }
   2728     if ( bSig < aSig ) goto aBigger;
   2729     if ( aSig < bSig ) goto bBigger;
   2730     return packFloat64( float_rounding_mode() == float_round_down, 0, 0 );
   2731  bExpBigger:
   2732     if ( bExp == 0x7FF ) {
   2733         if ( bSig ) return propagateFloat64NaN( a, b );
   2734         return packFloat64( zSign ^ 1, 0x7FF, 0 );
   2735     }
   2736     if ( aExp == 0 ) {
   2737         ++expDiff;
   2738     }
   2739     else {
   2740         aSig |= LIT64( 0x4000000000000000 );
   2741     }
   2742     shift64RightJamming( aSig, - expDiff, &aSig );
   2743     bSig |= LIT64( 0x4000000000000000 );
   2744  bBigger:
   2745     zSig = bSig - aSig;
   2746     zExp = bExp;
   2747     zSign ^= 1;
   2748     goto normalizeRoundAndPack;
   2749  aExpBigger:
   2750     if ( aExp == 0x7FF ) {
   2751         if ( aSig ) return propagateFloat64NaN( a, b );
   2752         return a;
   2753     }
   2754     if ( bExp == 0 ) {
   2755         --expDiff;
   2756     }
   2757     else {
   2758         bSig |= LIT64( 0x4000000000000000 );
   2759     }
   2760     shift64RightJamming( bSig, expDiff, &bSig );
   2761     aSig |= LIT64( 0x4000000000000000 );
   2762  aBigger:
   2763     zSig = aSig - bSig;
   2764     zExp = aExp;
   2765  normalizeRoundAndPack:
   2766     --zExp;
   2767     return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
   2768 
   2769 }
   2770 
   2771 /*
   2772 -------------------------------------------------------------------------------
   2773 Returns the result of adding the double-precision floating-point values `a'
   2774 and `b'.  The operation is performed according to the IEC/IEEE Standard for
   2775 Binary Floating-Point Arithmetic.
   2776 -------------------------------------------------------------------------------
   2777 */
   2778 float64 float64_add( float64 a, float64 b )
   2779 {
   2780     flag aSign, bSign;
   2781 
   2782     aSign = extractFloat64Sign( a );
   2783     bSign = extractFloat64Sign( b );
   2784     if ( aSign == bSign ) {
   2785         return addFloat64Sigs( a, b, aSign );
   2786     }
   2787     else {
   2788         return subFloat64Sigs( a, b, aSign );
   2789     }
   2790 
   2791 }
   2792 
   2793 /*
   2794 -------------------------------------------------------------------------------
   2795 Returns the result of subtracting the double-precision floating-point values
   2796 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   2797 for Binary Floating-Point Arithmetic.
   2798 -------------------------------------------------------------------------------
   2799 */
   2800 float64 float64_sub( float64 a, float64 b )
   2801 {
   2802     flag aSign, bSign;
   2803 
   2804     aSign = extractFloat64Sign( a );
   2805     bSign = extractFloat64Sign( b );
   2806     if ( aSign == bSign ) {
   2807         return subFloat64Sigs( a, b, aSign );
   2808     }
   2809     else {
   2810         return addFloat64Sigs( a, b, aSign );
   2811     }
   2812 
   2813 }
   2814 
   2815 /*
   2816 -------------------------------------------------------------------------------
   2817 Returns the result of multiplying the double-precision floating-point values
   2818 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   2819 for Binary Floating-Point Arithmetic.
   2820 -------------------------------------------------------------------------------
   2821 */
   2822 float64 float64_mul( float64 a, float64 b )
   2823 {
   2824     flag aSign, bSign, zSign;
   2825     int16 aExp, bExp, zExp;
   2826     bits64 aSig, bSig, zSig0, zSig1;
   2827 
   2828     aSig = extractFloat64Frac( a );
   2829     aExp = extractFloat64Exp( a );
   2830     aSign = extractFloat64Sign( a );
   2831     bSig = extractFloat64Frac( b );
   2832     bExp = extractFloat64Exp( b );
   2833     bSign = extractFloat64Sign( b );
   2834     zSign = aSign ^ bSign;
   2835     if ( aExp == 0x7FF ) {
   2836         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
   2837             return propagateFloat64NaN( a, b );
   2838         }
   2839         if ( ( bExp | bSig ) == 0 ) {
   2840             float_raise( float_flag_invalid );
   2841             return float64_default_nan;
   2842         }
   2843         return packFloat64( zSign, 0x7FF, 0 );
   2844     }
   2845     if ( bExp == 0x7FF ) {
   2846         if ( bSig ) return propagateFloat64NaN( a, b );
   2847         if ( ( aExp | aSig ) == 0 ) {
   2848             float_raise( float_flag_invalid );
   2849             return float64_default_nan;
   2850         }
   2851         return packFloat64( zSign, 0x7FF, 0 );
   2852     }
   2853     if ( aExp == 0 ) {
   2854         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
   2855         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   2856     }
   2857     if ( bExp == 0 ) {
   2858         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
   2859         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
   2860     }
   2861     zExp = aExp + bExp - 0x3FF;
   2862     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
   2863     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
   2864     mul64To128( aSig, bSig, &zSig0, &zSig1 );
   2865     zSig0 |= ( zSig1 != 0 );
   2866     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
   2867         zSig0 <<= 1;
   2868         --zExp;
   2869     }
   2870     return roundAndPackFloat64( zSign, zExp, zSig0 );
   2871 
   2872 }
   2873 
   2874 /*
   2875 -------------------------------------------------------------------------------
   2876 Returns the result of dividing the double-precision floating-point value `a'
   2877 by the corresponding value `b'.  The operation is performed according to
   2878 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2879 -------------------------------------------------------------------------------
   2880 */
   2881 float64 float64_div( float64 a, float64 b )
   2882 {
   2883     flag aSign, bSign, zSign;
   2884     int16 aExp, bExp, zExp;
   2885     bits64 aSig, bSig, zSig;
   2886     bits64 rem0, rem1;
   2887     bits64 term0, term1;
   2888 
   2889     aSig = extractFloat64Frac( a );
   2890     aExp = extractFloat64Exp( a );
   2891     aSign = extractFloat64Sign( a );
   2892     bSig = extractFloat64Frac( b );
   2893     bExp = extractFloat64Exp( b );
   2894     bSign = extractFloat64Sign( b );
   2895     zSign = aSign ^ bSign;
   2896     if ( aExp == 0x7FF ) {
   2897         if ( aSig ) return propagateFloat64NaN( a, b );
   2898         if ( bExp == 0x7FF ) {
   2899             if ( bSig ) return propagateFloat64NaN( a, b );
   2900             float_raise( float_flag_invalid );
   2901             return float64_default_nan;
   2902         }
   2903         return packFloat64( zSign, 0x7FF, 0 );
   2904     }
   2905     if ( bExp == 0x7FF ) {
   2906         if ( bSig ) return propagateFloat64NaN( a, b );
   2907         return packFloat64( zSign, 0, 0 );
   2908     }
   2909     if ( bExp == 0 ) {
   2910         if ( bSig == 0 ) {
   2911             if ( ( aExp | aSig ) == 0 ) {
   2912                 float_raise( float_flag_invalid );
   2913                 return float64_default_nan;
   2914             }
   2915             float_raise( float_flag_divbyzero );
   2916             return packFloat64( zSign, 0x7FF, 0 );
   2917         }
   2918         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
   2919     }
   2920     if ( aExp == 0 ) {
   2921         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
   2922         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   2923     }
   2924     zExp = aExp - bExp + 0x3FD;
   2925     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
   2926     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
   2927     if ( bSig <= ( aSig + aSig ) ) {
   2928         aSig >>= 1;
   2929         ++zExp;
   2930     }
   2931     zSig = estimateDiv128To64( aSig, 0, bSig );
   2932     if ( ( zSig & 0x1FF ) <= 2 ) {
   2933         mul64To128( bSig, zSig, &term0, &term1 );
   2934         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
   2935         while ( (sbits64) rem0 < 0 ) {
   2936             --zSig;
   2937             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
   2938         }
   2939         zSig |= ( rem1 != 0 );
   2940     }
   2941     return roundAndPackFloat64( zSign, zExp, zSig );
   2942 
   2943 }
   2944 
   2945 #ifndef SOFTFLOAT_FOR_GCC
   2946 /*
   2947 -------------------------------------------------------------------------------
   2948 Returns the remainder of the double-precision floating-point value `a'
   2949 with respect to the corresponding value `b'.  The operation is performed
   2950 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2951 -------------------------------------------------------------------------------
   2952 */
   2953 float64 float64_rem( float64 a, float64 b )
   2954 {
   2955     flag aSign, bSign, zSign;
   2956     int16 aExp, bExp, expDiff;
   2957     bits64 aSig, bSig;
   2958     bits64 q, alternateASig;
   2959     sbits64 sigMean;
   2960 
   2961     aSig = extractFloat64Frac( a );
   2962     aExp = extractFloat64Exp( a );
   2963     aSign = extractFloat64Sign( a );
   2964     bSig = extractFloat64Frac( b );
   2965     bExp = extractFloat64Exp( b );
   2966     bSign = extractFloat64Sign( b );
   2967     if ( aExp == 0x7FF ) {
   2968         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
   2969             return propagateFloat64NaN( a, b );
   2970         }
   2971         float_raise( float_flag_invalid );
   2972         return float64_default_nan;
   2973     }
   2974     if ( bExp == 0x7FF ) {
   2975         if ( bSig ) return propagateFloat64NaN( a, b );
   2976         return a;
   2977     }
   2978     if ( bExp == 0 ) {
   2979         if ( bSig == 0 ) {
   2980             float_raise( float_flag_invalid );
   2981             return float64_default_nan;
   2982         }
   2983         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
   2984     }
   2985     if ( aExp == 0 ) {
   2986         if ( aSig == 0 ) return a;
   2987         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   2988     }
   2989     expDiff = aExp - bExp;
   2990     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
   2991     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
   2992     if ( expDiff < 0 ) {
   2993         if ( expDiff < -1 ) return a;
   2994         aSig >>= 1;
   2995     }
   2996     q = ( bSig <= aSig );
   2997     if ( q ) aSig -= bSig;
   2998     expDiff -= 64;
   2999     while ( 0 < expDiff ) {
   3000         q = estimateDiv128To64( aSig, 0, bSig );
   3001         q = ( 2 < q ) ? q - 2 : 0;
   3002         aSig = - ( ( bSig>>2 ) * q );
   3003         expDiff -= 62;
   3004     }
   3005     expDiff += 64;
   3006     if ( 0 < expDiff ) {
   3007         q = estimateDiv128To64( aSig, 0, bSig );
   3008         q = ( 2 < q ) ? q - 2 : 0;
   3009         q >>= 64 - expDiff;
   3010         bSig >>= 2;
   3011         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
   3012     }
   3013     else {
   3014         aSig >>= 2;
   3015         bSig >>= 2;
   3016     }
   3017     do {
   3018         alternateASig = aSig;
   3019         ++q;
   3020         aSig -= bSig;
   3021     } while ( 0 <= (sbits64) aSig );
   3022     sigMean = aSig + alternateASig;
   3023     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
   3024         aSig = alternateASig;
   3025     }
   3026     zSign = ( (sbits64) aSig < 0 );
   3027     if ( zSign ) aSig = - aSig;
   3028     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
   3029 
   3030 }
   3031 
   3032 /*
   3033 -------------------------------------------------------------------------------
   3034 Returns the square root of the double-precision floating-point value `a'.
   3035 The operation is performed according to the IEC/IEEE Standard for Binary
   3036 Floating-Point Arithmetic.
   3037 -------------------------------------------------------------------------------
   3038 */
   3039 float64 float64_sqrt( float64 a )
   3040 {
   3041     flag aSign;
   3042     int16 aExp, zExp;
   3043     bits64 aSig, zSig, doubleZSig;
   3044     bits64 rem0, rem1, term0, term1;
   3045 
   3046     aSig = extractFloat64Frac( a );
   3047     aExp = extractFloat64Exp( a );
   3048     aSign = extractFloat64Sign( a );
   3049     if ( aExp == 0x7FF ) {
   3050         if ( aSig ) return propagateFloat64NaN( a, a );
   3051         if ( ! aSign ) return a;
   3052         float_raise( float_flag_invalid );
   3053         return float64_default_nan;
   3054     }
   3055     if ( aSign ) {
   3056         if ( ( aExp | aSig ) == 0 ) return a;
   3057         float_raise( float_flag_invalid );
   3058         return float64_default_nan;
   3059     }
   3060     if ( aExp == 0 ) {
   3061         if ( aSig == 0 ) return 0;
   3062         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   3063     }
   3064     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
   3065     aSig |= LIT64( 0x0010000000000000 );
   3066     zSig = estimateSqrt32( aExp, aSig>>21 );
   3067     aSig <<= 9 - ( aExp & 1 );
   3068     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
   3069     if ( ( zSig & 0x1FF ) <= 5 ) {
   3070         doubleZSig = zSig<<1;
   3071         mul64To128( zSig, zSig, &term0, &term1 );
   3072         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
   3073         while ( (sbits64) rem0 < 0 ) {
   3074             --zSig;
   3075             doubleZSig -= 2;
   3076             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
   3077         }
   3078         zSig |= ( ( rem0 | rem1 ) != 0 );
   3079     }
   3080     return roundAndPackFloat64( 0, zExp, zSig );
   3081 
   3082 }
   3083 #endif
   3084 
   3085 /*
   3086 -------------------------------------------------------------------------------
   3087 Returns 1 if the double-precision floating-point value `a' is equal to the
   3088 corresponding value `b', and 0 otherwise.  The comparison is performed
   3089 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3090 -------------------------------------------------------------------------------
   3091 */
   3092 flag float64_eq( float64 a, float64 b )
   3093 {
   3094 
   3095     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3096          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3097        ) {
   3098         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
   3099             float_raise( float_flag_invalid );
   3100         }
   3101         return 0;
   3102     }
   3103     return ( a == b ) ||
   3104 	( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
   3105 
   3106 }
   3107 
   3108 /*
   3109 -------------------------------------------------------------------------------
   3110 Returns 1 if the double-precision floating-point value `a' is less than or
   3111 equal to the corresponding value `b', and 0 otherwise.  The comparison is
   3112 performed according to the IEC/IEEE Standard for Binary Floating-Point
   3113 Arithmetic.
   3114 -------------------------------------------------------------------------------
   3115 */
   3116 flag float64_le( float64 a, float64 b )
   3117 {
   3118     flag aSign, bSign;
   3119 
   3120     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3121          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3122        ) {
   3123         float_raise( float_flag_invalid );
   3124         return 0;
   3125     }
   3126     aSign = extractFloat64Sign( a );
   3127     bSign = extractFloat64Sign( b );
   3128     if ( aSign != bSign )
   3129 	return aSign ||
   3130 	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
   3131 	      0 );
   3132     return ( a == b ) ||
   3133 	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
   3134 
   3135 }
   3136 
   3137 /*
   3138 -------------------------------------------------------------------------------
   3139 Returns 1 if the double-precision floating-point value `a' is less than
   3140 the corresponding value `b', and 0 otherwise.  The comparison is performed
   3141 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3142 -------------------------------------------------------------------------------
   3143 */
   3144 flag float64_lt( float64 a, float64 b )
   3145 {
   3146     flag aSign, bSign;
   3147 
   3148     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3149          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3150        ) {
   3151         float_raise( float_flag_invalid );
   3152         return 0;
   3153     }
   3154     aSign = extractFloat64Sign( a );
   3155     bSign = extractFloat64Sign( b );
   3156     if ( aSign != bSign )
   3157 	return aSign &&
   3158 	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
   3159 	      0 );
   3160     return ( a != b ) &&
   3161 	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
   3162 
   3163 }
   3164 
   3165 #ifndef SOFTFLOAT_FOR_GCC
   3166 /*
   3167 -------------------------------------------------------------------------------
   3168 Returns 1 if the double-precision floating-point value `a' is equal to the
   3169 corresponding value `b', and 0 otherwise.  The invalid exception is raised
   3170 if either operand is a NaN.  Otherwise, the comparison is performed
   3171 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3172 -------------------------------------------------------------------------------
   3173 */
   3174 flag float64_eq_signaling( float64 a, float64 b )
   3175 {
   3176 
   3177     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3178          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3179        ) {
   3180         float_raise( float_flag_invalid );
   3181         return 0;
   3182     }
   3183     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
   3184 
   3185 }
   3186 
   3187 /*
   3188 -------------------------------------------------------------------------------
   3189 Returns 1 if the double-precision floating-point value `a' is less than or
   3190 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
   3191 cause an exception.  Otherwise, the comparison is performed according to the
   3192 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3193 -------------------------------------------------------------------------------
   3194 */
   3195 flag float64_le_quiet( float64 a, float64 b )
   3196 {
   3197     flag aSign, bSign;
   3198 
   3199     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3200          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3201        ) {
   3202         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
   3203             float_raise( float_flag_invalid );
   3204         }
   3205         return 0;
   3206     }
   3207     aSign = extractFloat64Sign( a );
   3208     bSign = extractFloat64Sign( b );
   3209     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
   3210     return ( a == b ) || ( aSign ^ ( a < b ) );
   3211 
   3212 }
   3213 
   3214 /*
   3215 -------------------------------------------------------------------------------
   3216 Returns 1 if the double-precision floating-point value `a' is less than
   3217 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
   3218 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
   3219 Standard for Binary Floating-Point Arithmetic.
   3220 -------------------------------------------------------------------------------
   3221 */
   3222 flag float64_lt_quiet( float64 a, float64 b )
   3223 {
   3224     flag aSign, bSign;
   3225 
   3226     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3227          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3228        ) {
   3229         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
   3230             float_raise( float_flag_invalid );
   3231         }
   3232         return 0;
   3233     }
   3234     aSign = extractFloat64Sign( a );
   3235     bSign = extractFloat64Sign( b );
   3236     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
   3237     return ( a != b ) && ( aSign ^ ( a < b ) );
   3238 
   3239 }
   3240 #endif
   3241 
   3242 #ifdef FLOATX80
   3243 
   3244 /*
   3245 -------------------------------------------------------------------------------
   3246 Returns the result of converting the extended double-precision floating-
   3247 point value `a' to the 32-bit two's complement integer format.  The
   3248 conversion is performed according to the IEC/IEEE Standard for Binary
   3249 Floating-Point Arithmetic---which means in particular that the conversion
   3250 is rounded according to the current rounding mode.  If `a' is a NaN, the
   3251 largest positive integer is returned.  Otherwise, if the conversion
   3252 overflows, the largest integer with the same sign as `a' is returned.
   3253 -------------------------------------------------------------------------------
   3254 */
   3255 int32 floatx80_to_int32( floatx80 a )
   3256 {
   3257     flag aSign;
   3258     int32 aExp, shiftCount;
   3259     bits64 aSig;
   3260 
   3261     aSig = extractFloatx80Frac( a );
   3262     aExp = extractFloatx80Exp( a );
   3263     aSign = extractFloatx80Sign( a );
   3264     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
   3265     shiftCount = 0x4037 - aExp;
   3266     if ( shiftCount <= 0 ) shiftCount = 1;
   3267     shift64RightJamming( aSig, shiftCount, &aSig );
   3268     return roundAndPackInt32( aSign, aSig );
   3269 
   3270 }
   3271 
   3272 /*
   3273 -------------------------------------------------------------------------------
   3274 Returns the result of converting the extended double-precision floating-
   3275 point value `a' to the 32-bit two's complement integer format.  The
   3276 conversion is performed according to the IEC/IEEE Standard for Binary
   3277 Floating-Point Arithmetic, except that the conversion is always rounded
   3278 toward zero.  If `a' is a NaN, the largest positive integer is returned.
   3279 Otherwise, if the conversion overflows, the largest integer with the same
   3280 sign as `a' is returned.
   3281 -------------------------------------------------------------------------------
   3282 */
   3283 int32 floatx80_to_int32_round_to_zero( floatx80 a )
   3284 {
   3285     flag aSign;
   3286     int32 aExp, shiftCount;
   3287     bits64 aSig, savedASig;
   3288     int32 z;
   3289 
   3290     aSig = extractFloatx80Frac( a );
   3291     aExp = extractFloatx80Exp( a );
   3292     aSign = extractFloatx80Sign( a );
   3293     if ( 0x401E < aExp ) {
   3294         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
   3295         goto invalid;
   3296     }
   3297     else if ( aExp < 0x3FFF ) {
   3298         if ( aExp || aSig ) float_set_inexact();
   3299         return 0;
   3300     }
   3301     shiftCount = 0x403E - aExp;
   3302     savedASig = aSig;
   3303     aSig >>= shiftCount;
   3304     z = aSig;
   3305     if ( aSign ) z = - z;
   3306     if ( ( z < 0 ) ^ aSign ) {
   3307  invalid:
   3308         float_raise( float_flag_invalid );
   3309         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
   3310     }
   3311     if ( ( aSig<<shiftCount ) != savedASig ) {
   3312         float_set_inexact();
   3313     }
   3314     return z;
   3315 
   3316 }
   3317 
   3318 /*
   3319 -------------------------------------------------------------------------------
   3320 Returns the result of converting the extended double-precision floating-
   3321 point value `a' to the 64-bit two's complement integer format.  The
   3322 conversion is performed according to the IEC/IEEE Standard for Binary
   3323 Floating-Point Arithmetic---which means in particular that the conversion
   3324 is rounded according to the current rounding mode.  If `a' is a NaN,
   3325 the largest positive integer is returned.  Otherwise, if the conversion
   3326 overflows, the largest integer with the same sign as `a' is returned.
   3327 -------------------------------------------------------------------------------
   3328 */
   3329 int64 floatx80_to_int64( floatx80 a )
   3330 {
   3331     flag aSign;
   3332     int32 aExp, shiftCount;
   3333     bits64 aSig, aSigExtra;
   3334 
   3335     aSig = extractFloatx80Frac( a );
   3336     aExp = extractFloatx80Exp( a );
   3337     aSign = extractFloatx80Sign( a );
   3338     shiftCount = 0x403E - aExp;
   3339     if ( shiftCount <= 0 ) {
   3340         if ( shiftCount ) {
   3341             float_raise( float_flag_invalid );
   3342             if (    ! aSign
   3343                  || (    ( aExp == 0x7FFF )
   3344                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
   3345                ) {
   3346                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   3347             }
   3348             return (sbits64) LIT64( 0x8000000000000000 );
   3349         }
   3350         aSigExtra = 0;
   3351     }
   3352     else {
   3353         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
   3354     }
   3355     return roundAndPackInt64( aSign, aSig, aSigExtra );
   3356 
   3357 }
   3358 
   3359 /*
   3360 -------------------------------------------------------------------------------
   3361 Returns the result of converting the extended double-precision floating-
   3362 point value `a' to the 64-bit two's complement integer format.  The
   3363 conversion is performed according to the IEC/IEEE Standard for Binary
   3364 Floating-Point Arithmetic, except that the conversion is always rounded
   3365 toward zero.  If `a' is a NaN, the largest positive integer is returned.
   3366 Otherwise, if the conversion overflows, the largest integer with the same
   3367 sign as `a' is returned.
   3368 -------------------------------------------------------------------------------
   3369 */
   3370 int64 floatx80_to_int64_round_to_zero( floatx80 a )
   3371 {
   3372     flag aSign;
   3373     int32 aExp, shiftCount;
   3374     bits64 aSig;
   3375     int64 z;
   3376 
   3377     aSig = extractFloatx80Frac( a );
   3378     aExp = extractFloatx80Exp( a );
   3379     aSign = extractFloatx80Sign( a );
   3380     shiftCount = aExp - 0x403E;
   3381     if ( 0 <= shiftCount ) {
   3382         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
   3383         if ( ( a.high != 0xC03E ) || aSig ) {
   3384             float_raise( float_flag_invalid );
   3385             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
   3386                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   3387             }
   3388         }
   3389         return (sbits64) LIT64( 0x8000000000000000 );
   3390     }
   3391     else if ( aExp < 0x3FFF ) {
   3392         if ( aExp | aSig ) float_set_inexact();
   3393         return 0;
   3394     }
   3395     z = aSig>>( - shiftCount );
   3396     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
   3397         float_set_inexact();
   3398     }
   3399     if ( aSign ) z = - z;
   3400     return z;
   3401 
   3402 }
   3403 
   3404 /*
   3405 -------------------------------------------------------------------------------
   3406 Returns the result of converting the extended double-precision floating-
   3407 point value `a' to the single-precision floating-point format.  The
   3408 conversion is performed according to the IEC/IEEE Standard for Binary
   3409 Floating-Point Arithmetic.
   3410 -------------------------------------------------------------------------------
   3411 */
   3412 float32 floatx80_to_float32( floatx80 a )
   3413 {
   3414     flag aSign;
   3415     int32 aExp;
   3416     bits64 aSig;
   3417 
   3418     aSig = extractFloatx80Frac( a );
   3419     aExp = extractFloatx80Exp( a );
   3420     aSign = extractFloatx80Sign( a );
   3421     if ( aExp == 0x7FFF ) {
   3422         if ( (bits64) ( aSig<<1 ) ) {
   3423             return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
   3424         }
   3425         return packFloat32( aSign, 0xFF, 0 );
   3426     }
   3427     shift64RightJamming( aSig, 33, &aSig );
   3428     if ( aExp || aSig ) aExp -= 0x3F81;
   3429     return roundAndPackFloat32( aSign, aExp, aSig );
   3430 
   3431 }
   3432 
   3433 /*
   3434 -------------------------------------------------------------------------------
   3435 Returns the result of converting the extended double-precision floating-
   3436 point value `a' to the double-precision floating-point format.  The
   3437 conversion is performed according to the IEC/IEEE Standard for Binary
   3438 Floating-Point Arithmetic.
   3439 -------------------------------------------------------------------------------
   3440 */
   3441 float64 floatx80_to_float64( floatx80 a )
   3442 {
   3443     flag aSign;
   3444     int32 aExp;
   3445     bits64 aSig, zSig;
   3446 
   3447     aSig = extractFloatx80Frac( a );
   3448     aExp = extractFloatx80Exp( a );
   3449     aSign = extractFloatx80Sign( a );
   3450     if ( aExp == 0x7FFF ) {
   3451         if ( (bits64) ( aSig<<1 ) ) {
   3452             return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
   3453         }
   3454         return packFloat64( aSign, 0x7FF, 0 );
   3455     }
   3456     shift64RightJamming( aSig, 1, &zSig );
   3457     if ( aExp || aSig ) aExp -= 0x3C01;
   3458     return roundAndPackFloat64( aSign, aExp, zSig );
   3459 
   3460 }
   3461 
   3462 #ifdef FLOAT128
   3463 
   3464 /*
   3465 -------------------------------------------------------------------------------
   3466 Returns the result of converting the extended double-precision floating-
   3467 point value `a' to the quadruple-precision floating-point format.  The
   3468 conversion is performed according to the IEC/IEEE Standard for Binary
   3469 Floating-Point Arithmetic.
   3470 -------------------------------------------------------------------------------
   3471 */
   3472 float128 floatx80_to_float128( floatx80 a )
   3473 {
   3474     flag aSign;
   3475     int16 aExp;
   3476     bits64 aSig, zSig0, zSig1;
   3477 
   3478     aSig = extractFloatx80Frac( a );
   3479     aExp = extractFloatx80Exp( a );
   3480     aSign = extractFloatx80Sign( a );
   3481     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
   3482         return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
   3483     }
   3484     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
   3485     return packFloat128( aSign, aExp, zSig0, zSig1 );
   3486 
   3487 }
   3488 
   3489 #endif
   3490 
   3491 /*
   3492 -------------------------------------------------------------------------------
   3493 Rounds the extended double-precision floating-point value `a' to an integer,
   3494 and returns the result as an extended quadruple-precision floating-point
   3495 value.  The operation is performed according to the IEC/IEEE Standard for
   3496 Binary Floating-Point Arithmetic.
   3497 -------------------------------------------------------------------------------
   3498 */
   3499 floatx80 floatx80_round_to_int( floatx80 a )
   3500 {
   3501     flag aSign;
   3502     int32 aExp;
   3503     bits64 lastBitMask, roundBitsMask;
   3504     int8 roundingMode;
   3505     floatx80 z;
   3506 
   3507     aExp = extractFloatx80Exp( a );
   3508     if ( 0x403E <= aExp ) {
   3509         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
   3510             return propagateFloatx80NaN( a, a );
   3511         }
   3512         return a;
   3513     }
   3514     if ( aExp < 0x3FFF ) {
   3515         if (    ( aExp == 0 )
   3516              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
   3517             return a;
   3518         }
   3519         float_set_inexact();
   3520         aSign = extractFloatx80Sign( a );
   3521         switch ( float_rounding_mode() ) {
   3522          case float_round_nearest_even:
   3523             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
   3524                ) {
   3525                 return
   3526                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
   3527             }
   3528             break;
   3529          case float_round_down:
   3530             return
   3531                   aSign ?
   3532                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
   3533                 : packFloatx80( 0, 0, 0 );
   3534          case float_round_up:
   3535             return
   3536                   aSign ? packFloatx80( 1, 0, 0 )
   3537                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
   3538         }
   3539         return packFloatx80( aSign, 0, 0 );
   3540     }
   3541     lastBitMask = 1;
   3542     lastBitMask <<= 0x403E - aExp;
   3543     roundBitsMask = lastBitMask - 1;
   3544     z = a;
   3545     roundingMode = float_rounding_mode();
   3546     if ( roundingMode == float_round_nearest_even ) {
   3547         z.low += lastBitMask>>1;
   3548         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
   3549     }
   3550     else if ( roundingMode != float_round_to_zero ) {
   3551         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
   3552             z.low += roundBitsMask;
   3553         }
   3554     }
   3555     z.low &= ~ roundBitsMask;
   3556     if ( z.low == 0 ) {
   3557         ++z.high;
   3558         z.low = LIT64( 0x8000000000000000 );
   3559     }
   3560     if ( z.low != a.low ) float_set_inexact();
   3561     return z;
   3562 
   3563 }
   3564 
   3565 /*
   3566 -------------------------------------------------------------------------------
   3567 Returns the result of adding the absolute values of the extended double-
   3568 precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
   3569 negated before being returned.  `zSign' is ignored if the result is a NaN.
   3570 The addition is performed according to the IEC/IEEE Standard for Binary
   3571 Floating-Point Arithmetic.
   3572 -------------------------------------------------------------------------------
   3573 */
   3574 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
   3575 {
   3576     int32 aExp, bExp, zExp;
   3577     bits64 aSig, bSig, zSig0, zSig1;
   3578     int32 expDiff;
   3579 
   3580     aSig = extractFloatx80Frac( a );
   3581     aExp = extractFloatx80Exp( a );
   3582     bSig = extractFloatx80Frac( b );
   3583     bExp = extractFloatx80Exp( b );
   3584     expDiff = aExp - bExp;
   3585     if ( 0 < expDiff ) {
   3586         if ( aExp == 0x7FFF ) {
   3587             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3588             return a;
   3589         }
   3590         if ( bExp == 0 ) --expDiff;
   3591         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
   3592         zExp = aExp;
   3593     }
   3594     else if ( expDiff < 0 ) {
   3595         if ( bExp == 0x7FFF ) {
   3596             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3597             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3598         }
   3599         if ( aExp == 0 ) ++expDiff;
   3600         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
   3601         zExp = bExp;
   3602     }
   3603     else {
   3604         if ( aExp == 0x7FFF ) {
   3605             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
   3606                 return propagateFloatx80NaN( a, b );
   3607             }
   3608             return a;
   3609         }
   3610         zSig1 = 0;
   3611         zSig0 = aSig + bSig;
   3612         if ( aExp == 0 ) {
   3613             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
   3614             goto roundAndPack;
   3615         }
   3616         zExp = aExp;
   3617         goto shiftRight1;
   3618     }
   3619     zSig0 = aSig + bSig;
   3620     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
   3621  shiftRight1:
   3622     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
   3623     zSig0 |= LIT64( 0x8000000000000000 );
   3624     ++zExp;
   3625  roundAndPack:
   3626     return
   3627         roundAndPackFloatx80(
   3628             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
   3629 
   3630 }
   3631 
   3632 /*
   3633 -------------------------------------------------------------------------------
   3634 Returns the result of subtracting the absolute values of the extended
   3635 double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
   3636 difference is negated before being returned.  `zSign' is ignored if the
   3637 result is a NaN.  The subtraction is performed according to the IEC/IEEE
   3638 Standard for Binary Floating-Point Arithmetic.
   3639 -------------------------------------------------------------------------------
   3640 */
   3641 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
   3642 {
   3643     int32 aExp, bExp, zExp;
   3644     bits64 aSig, bSig, zSig0, zSig1;
   3645     int32 expDiff;
   3646     floatx80 z;
   3647 
   3648     aSig = extractFloatx80Frac( a );
   3649     aExp = extractFloatx80Exp( a );
   3650     bSig = extractFloatx80Frac( b );
   3651     bExp = extractFloatx80Exp( b );
   3652     expDiff = aExp - bExp;
   3653     if ( 0 < expDiff ) goto aExpBigger;
   3654     if ( expDiff < 0 ) goto bExpBigger;
   3655     if ( aExp == 0x7FFF ) {
   3656         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
   3657             return propagateFloatx80NaN( a, b );
   3658         }
   3659         float_raise( float_flag_invalid );
   3660         z.low = floatx80_default_nan_low;
   3661         z.high = floatx80_default_nan_high;
   3662         return z;
   3663     }
   3664     if ( aExp == 0 ) {
   3665         aExp = 1;
   3666         bExp = 1;
   3667     }
   3668     zSig1 = 0;
   3669     if ( bSig < aSig ) goto aBigger;
   3670     if ( aSig < bSig ) goto bBigger;
   3671     return packFloatx80( float_rounding_mode() == float_round_down, 0, 0 );
   3672  bExpBigger:
   3673     if ( bExp == 0x7FFF ) {
   3674         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3675         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3676     }
   3677     if ( aExp == 0 ) ++expDiff;
   3678     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
   3679  bBigger:
   3680     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
   3681     zExp = bExp;
   3682     zSign ^= 1;
   3683     goto normalizeRoundAndPack;
   3684  aExpBigger:
   3685     if ( aExp == 0x7FFF ) {
   3686         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3687         return a;
   3688     }
   3689     if ( bExp == 0 ) --expDiff;
   3690     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
   3691  aBigger:
   3692     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
   3693     zExp = aExp;
   3694  normalizeRoundAndPack:
   3695     return
   3696         normalizeRoundAndPackFloatx80(
   3697             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
   3698 
   3699 }
   3700 
   3701 /*
   3702 -------------------------------------------------------------------------------
   3703 Returns the result of adding the extended double-precision floating-point
   3704 values `a' and `b'.  The operation is performed according to the IEC/IEEE
   3705 Standard for Binary Floating-Point Arithmetic.
   3706 -------------------------------------------------------------------------------
   3707 */
   3708 floatx80 floatx80_add( floatx80 a, floatx80 b )
   3709 {
   3710     flag aSign, bSign;
   3711 
   3712     aSign = extractFloatx80Sign( a );
   3713     bSign = extractFloatx80Sign( b );
   3714     if ( aSign == bSign ) {
   3715         return addFloatx80Sigs( a, b, aSign );
   3716     }
   3717     else {
   3718         return subFloatx80Sigs( a, b, aSign );
   3719     }
   3720 
   3721 }
   3722 
   3723 /*
   3724 -------------------------------------------------------------------------------
   3725 Returns the result of subtracting the extended double-precision floating-
   3726 point values `a' and `b'.  The operation is performed according to the
   3727 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3728 -------------------------------------------------------------------------------
   3729 */
   3730 floatx80 floatx80_sub( floatx80 a, floatx80 b )
   3731 {
   3732     flag aSign, bSign;
   3733 
   3734     aSign = extractFloatx80Sign( a );
   3735     bSign = extractFloatx80Sign( b );
   3736     if ( aSign == bSign ) {
   3737         return subFloatx80Sigs( a, b, aSign );
   3738     }
   3739     else {
   3740         return addFloatx80Sigs( a, b, aSign );
   3741     }
   3742 
   3743 }
   3744 
   3745 /*
   3746 -------------------------------------------------------------------------------
   3747 Returns the result of multiplying the extended double-precision floating-
   3748 point values `a' and `b'.  The operation is performed according to the
   3749 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3750 -------------------------------------------------------------------------------
   3751 */
   3752 floatx80 floatx80_mul( floatx80 a, floatx80 b )
   3753 {
   3754     flag aSign, bSign, zSign;
   3755     int32 aExp, bExp, zExp;
   3756     bits64 aSig, bSig, zSig0, zSig1;
   3757     floatx80 z;
   3758 
   3759     aSig = extractFloatx80Frac( a );
   3760     aExp = extractFloatx80Exp( a );
   3761     aSign = extractFloatx80Sign( a );
   3762     bSig = extractFloatx80Frac( b );
   3763     bExp = extractFloatx80Exp( b );
   3764     bSign = extractFloatx80Sign( b );
   3765     zSign = aSign ^ bSign;
   3766     if ( aExp == 0x7FFF ) {
   3767         if (    (bits64) ( aSig<<1 )
   3768              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
   3769             return propagateFloatx80NaN( a, b );
   3770         }
   3771         if ( ( bExp | bSig ) == 0 ) goto invalid;
   3772         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3773     }
   3774     if ( bExp == 0x7FFF ) {
   3775         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3776         if ( ( aExp | aSig ) == 0 ) {
   3777  invalid:
   3778             float_raise( float_flag_invalid );
   3779             z.low = floatx80_default_nan_low;
   3780             z.high = floatx80_default_nan_high;
   3781             return z;
   3782         }
   3783         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3784     }
   3785     if ( aExp == 0 ) {
   3786         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
   3787         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
   3788     }
   3789     if ( bExp == 0 ) {
   3790         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
   3791         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
   3792     }
   3793     zExp = aExp + bExp - 0x3FFE;
   3794     mul64To128( aSig, bSig, &zSig0, &zSig1 );
   3795     if ( 0 < (sbits64) zSig0 ) {
   3796         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
   3797         --zExp;
   3798     }
   3799     return
   3800         roundAndPackFloatx80(
   3801             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
   3802 
   3803 }
   3804 
   3805 /*
   3806 -------------------------------------------------------------------------------
   3807 Returns the result of dividing the extended double-precision floating-point
   3808 value `a' by the corresponding value `b'.  The operation is performed
   3809 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3810 -------------------------------------------------------------------------------
   3811 */
   3812 floatx80 floatx80_div( floatx80 a, floatx80 b )
   3813 {
   3814     flag aSign, bSign, zSign;
   3815     int32 aExp, bExp, zExp;
   3816     bits64 aSig, bSig, zSig0, zSig1;
   3817     bits64 rem0, rem1, rem2, term0, term1, term2;
   3818     floatx80 z;
   3819 
   3820     aSig = extractFloatx80Frac( a );
   3821     aExp = extractFloatx80Exp( a );
   3822     aSign = extractFloatx80Sign( a );
   3823     bSig = extractFloatx80Frac( b );
   3824     bExp = extractFloatx80Exp( b );
   3825     bSign = extractFloatx80Sign( b );
   3826     zSign = aSign ^ bSign;
   3827     if ( aExp == 0x7FFF ) {
   3828         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3829         if ( bExp == 0x7FFF ) {
   3830             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3831             goto invalid;
   3832         }
   3833         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3834     }
   3835     if ( bExp == 0x7FFF ) {
   3836         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3837         return packFloatx80( zSign, 0, 0 );
   3838     }
   3839     if ( bExp == 0 ) {
   3840         if ( bSig == 0 ) {
   3841             if ( ( aExp | aSig ) == 0 ) {
   3842  invalid:
   3843                 float_raise( float_flag_invalid );
   3844                 z.low = floatx80_default_nan_low;
   3845                 z.high = floatx80_default_nan_high;
   3846                 return z;
   3847             }
   3848             float_raise( float_flag_divbyzero );
   3849             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3850         }
   3851         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
   3852     }
   3853     if ( aExp == 0 ) {
   3854         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
   3855         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
   3856     }
   3857     zExp = aExp - bExp + 0x3FFE;
   3858     rem1 = 0;
   3859     if ( bSig <= aSig ) {
   3860         shift128Right( aSig, 0, 1, &aSig, &rem1 );
   3861         ++zExp;
   3862     }
   3863     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
   3864     mul64To128( bSig, zSig0, &term0, &term1 );
   3865     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
   3866     while ( (sbits64) rem0 < 0 ) {
   3867         --zSig0;
   3868         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
   3869     }
   3870     zSig1 = estimateDiv128To64( rem1, 0, bSig );
   3871     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
   3872         mul64To128( bSig, zSig1, &term1, &term2 );
   3873         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
   3874         while ( (sbits64) rem1 < 0 ) {
   3875             --zSig1;
   3876             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
   3877         }
   3878         zSig1 |= ( ( rem1 | rem2 ) != 0 );
   3879     }
   3880     return
   3881         roundAndPackFloatx80(
   3882             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
   3883 
   3884 }
   3885 
   3886 /*
   3887 -------------------------------------------------------------------------------
   3888 Returns the remainder of the extended double-precision floating-point value
   3889 `a' with respect to the corresponding value `b'.  The operation is performed
   3890 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3891 -------------------------------------------------------------------------------
   3892 */
   3893 floatx80 floatx80_rem( floatx80 a, floatx80 b )
   3894 {
   3895     flag aSign, bSign, zSign;
   3896     int32 aExp, bExp, expDiff;
   3897     bits64 aSig0, aSig1, bSig;
   3898     bits64 q, term0, term1, alternateASig0, alternateASig1;
   3899     floatx80 z;
   3900 
   3901     aSig0 = extractFloatx80Frac( a );
   3902     aExp = extractFloatx80Exp( a );
   3903     aSign = extractFloatx80Sign( a );
   3904     bSig = extractFloatx80Frac( b );
   3905     bExp = extractFloatx80Exp( b );
   3906     bSign = extractFloatx80Sign( b );
   3907     if ( aExp == 0x7FFF ) {
   3908         if (    (bits64) ( aSig0<<1 )
   3909              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
   3910             return propagateFloatx80NaN( a, b );
   3911         }
   3912         goto invalid;
   3913     }
   3914     if ( bExp == 0x7FFF ) {
   3915         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3916         return a;
   3917     }
   3918     if ( bExp == 0 ) {
   3919         if ( bSig == 0 ) {
   3920  invalid:
   3921             float_raise( float_flag_invalid );
   3922             z.low = floatx80_default_nan_low;
   3923             z.high = floatx80_default_nan_high;
   3924             return z;
   3925         }
   3926         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
   3927     }
   3928     if ( aExp == 0 ) {
   3929         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
   3930         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
   3931     }
   3932     bSig |= LIT64( 0x8000000000000000 );
   3933     zSign = aSign;
   3934     expDiff = aExp - bExp;
   3935     aSig1 = 0;
   3936     if ( expDiff < 0 ) {
   3937         if ( expDiff < -1 ) return a;
   3938         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
   3939         expDiff = 0;
   3940     }
   3941     q = ( bSig <= aSig0 );
   3942     if ( q ) aSig0 -= bSig;
   3943     expDiff -= 64;
   3944     while ( 0 < expDiff ) {
   3945         q = estimateDiv128To64( aSig0, aSig1, bSig );
   3946         q = ( 2 < q ) ? q - 2 : 0;
   3947         mul64To128( bSig, q, &term0, &term1 );
   3948         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
   3949         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
   3950         expDiff -= 62;
   3951     }
   3952     expDiff += 64;
   3953     if ( 0 < expDiff ) {
   3954         q = estimateDiv128To64( aSig0, aSig1, bSig );
   3955         q = ( 2 < q ) ? q - 2 : 0;
   3956         q >>= 64 - expDiff;
   3957         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
   3958         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
   3959         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
   3960         while ( le128( term0, term1, aSig0, aSig1 ) ) {
   3961             ++q;
   3962             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
   3963         }
   3964     }
   3965     else {
   3966         term1 = 0;
   3967         term0 = bSig;
   3968     }
   3969     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
   3970     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
   3971          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
   3972               && ( q & 1 ) )
   3973        ) {
   3974         aSig0 = alternateASig0;
   3975         aSig1 = alternateASig1;
   3976         zSign = ! zSign;
   3977     }
   3978     return
   3979         normalizeRoundAndPackFloatx80(
   3980             80, zSign, bExp + expDiff, aSig0, aSig1 );
   3981 
   3982 }
   3983 
   3984 /*
   3985 -------------------------------------------------------------------------------
   3986 Returns the square root of the extended double-precision floating-point
   3987 value `a'.  The operation is performed according to the IEC/IEEE Standard
   3988 for Binary Floating-Point Arithmetic.
   3989 -------------------------------------------------------------------------------
   3990 */
   3991 floatx80 floatx80_sqrt( floatx80 a )
   3992 {
   3993     flag aSign;
   3994     int32 aExp, zExp;
   3995     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
   3996     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
   3997     floatx80 z;
   3998 
   3999     aSig0 = extractFloatx80Frac( a );
   4000     aExp = extractFloatx80Exp( a );
   4001     aSign = extractFloatx80Sign( a );
   4002     if ( aExp == 0x7FFF ) {
   4003         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
   4004         if ( ! aSign ) return a;
   4005         goto invalid;
   4006     }
   4007     if ( aSign ) {
   4008         if ( ( aExp | aSig0 ) == 0 ) return a;
   4009  invalid:
   4010         float_raise( float_flag_invalid );
   4011         z.low = floatx80_default_nan_low;
   4012         z.high = floatx80_default_nan_high;
   4013         return z;
   4014     }
   4015     if ( aExp == 0 ) {
   4016         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
   4017         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
   4018     }
   4019     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
   4020     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
   4021     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
   4022     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
   4023     doubleZSig0 = zSig0<<1;
   4024     mul64To128( zSig0, zSig0, &term0, &term1 );
   4025     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
   4026     while ( (sbits64) rem0 < 0 ) {
   4027         --zSig0;
   4028         doubleZSig0 -= 2;
   4029         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
   4030     }
   4031     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
   4032     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
   4033         if ( zSig1 == 0 ) zSig1 = 1;
   4034         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
   4035         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
   4036         mul64To128( zSig1, zSig1, &term2, &term3 );
   4037         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
   4038         while ( (sbits64) rem1 < 0 ) {
   4039             --zSig1;
   4040             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
   4041             term3 |= 1;
   4042             term2 |= doubleZSig0;
   4043             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
   4044         }
   4045         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
   4046     }
   4047     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
   4048     zSig0 |= doubleZSig0;
   4049     return
   4050         roundAndPackFloatx80(
   4051             floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
   4052 
   4053 }
   4054 
   4055 /*
   4056 -------------------------------------------------------------------------------
   4057 Returns 1 if the extended double-precision floating-point value `a' is
   4058 equal to the corresponding value `b', and 0 otherwise.  The comparison is
   4059 performed according to the IEC/IEEE Standard for Binary Floating-Point
   4060 Arithmetic.
   4061 -------------------------------------------------------------------------------
   4062 */
   4063 flag floatx80_eq( floatx80 a, floatx80 b )
   4064 {
   4065 
   4066     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4067               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4068          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4069               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4070        ) {
   4071         if (    floatx80_is_signaling_nan( a )
   4072              || floatx80_is_signaling_nan( b ) ) {
   4073             float_raise( float_flag_invalid );
   4074         }
   4075         return 0;
   4076     }
   4077     return
   4078            ( a.low == b.low )
   4079         && (    ( a.high == b.high )
   4080              || (    ( a.low == 0 )
   4081                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
   4082            );
   4083 
   4084 }
   4085 
   4086 /*
   4087 -------------------------------------------------------------------------------
   4088 Returns 1 if the extended double-precision floating-point value `a' is
   4089 less than or equal to the corresponding value `b', and 0 otherwise.  The
   4090 comparison is performed according to the IEC/IEEE Standard for Binary
   4091 Floating-Point Arithmetic.
   4092 -------------------------------------------------------------------------------
   4093 */
   4094 flag floatx80_le( floatx80 a, floatx80 b )
   4095 {
   4096     flag aSign, bSign;
   4097 
   4098     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4099               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4100          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4101               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4102        ) {
   4103         float_raise( float_flag_invalid );
   4104         return 0;
   4105     }
   4106     aSign = extractFloatx80Sign( a );
   4107     bSign = extractFloatx80Sign( b );
   4108     if ( aSign != bSign ) {
   4109         return
   4110                aSign
   4111             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   4112                  == 0 );
   4113     }
   4114     return
   4115           aSign ? le128( b.high, b.low, a.high, a.low )
   4116         : le128( a.high, a.low, b.high, b.low );
   4117 
   4118 }
   4119 
   4120 /*
   4121 -------------------------------------------------------------------------------
   4122 Returns 1 if the extended double-precision floating-point value `a' is
   4123 less than the corresponding value `b', and 0 otherwise.  The comparison
   4124 is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4125 Arithmetic.
   4126 -------------------------------------------------------------------------------
   4127 */
   4128 flag floatx80_lt( floatx80 a, floatx80 b )
   4129 {
   4130     flag aSign, bSign;
   4131 
   4132     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4133               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4134          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4135               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4136        ) {
   4137         float_raise( float_flag_invalid );
   4138         return 0;
   4139     }
   4140     aSign = extractFloatx80Sign( a );
   4141     bSign = extractFloatx80Sign( b );
   4142     if ( aSign != bSign ) {
   4143         return
   4144                aSign
   4145             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   4146                  != 0 );
   4147     }
   4148     return
   4149           aSign ? lt128( b.high, b.low, a.high, a.low )
   4150         : lt128( a.high, a.low, b.high, b.low );
   4151 
   4152 }
   4153 
   4154 /*
   4155 -------------------------------------------------------------------------------
   4156 Returns 1 if the extended double-precision floating-point value `a' is equal
   4157 to the corresponding value `b', and 0 otherwise.  The invalid exception is
   4158 raised if either operand is a NaN.  Otherwise, the comparison is performed
   4159 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4160 -------------------------------------------------------------------------------
   4161 */
   4162 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
   4163 {
   4164 
   4165     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4166               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4167          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4168               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4169        ) {
   4170         float_raise( float_flag_invalid );
   4171         return 0;
   4172     }
   4173     return
   4174            ( a.low == b.low )
   4175         && (    ( a.high == b.high )
   4176              || (    ( a.low == 0 )
   4177                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
   4178            );
   4179 
   4180 }
   4181 
   4182 /*
   4183 -------------------------------------------------------------------------------
   4184 Returns 1 if the extended double-precision floating-point value `a' is less
   4185 than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
   4186 do not cause an exception.  Otherwise, the comparison is performed according
   4187 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4188 -------------------------------------------------------------------------------
   4189 */
   4190 flag floatx80_le_quiet( floatx80 a, floatx80 b )
   4191 {
   4192     flag aSign, bSign;
   4193 
   4194     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4195               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4196          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4197               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4198        ) {
   4199         if (    floatx80_is_signaling_nan( a )
   4200              || floatx80_is_signaling_nan( b ) ) {
   4201             float_raise( float_flag_invalid );
   4202         }
   4203         return 0;
   4204     }
   4205     aSign = extractFloatx80Sign( a );
   4206     bSign = extractFloatx80Sign( b );
   4207     if ( aSign != bSign ) {
   4208         return
   4209                aSign
   4210             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   4211                  == 0 );
   4212     }
   4213     return
   4214           aSign ? le128( b.high, b.low, a.high, a.low )
   4215         : le128( a.high, a.low, b.high, b.low );
   4216 
   4217 }
   4218 
   4219 /*
   4220 -------------------------------------------------------------------------------
   4221 Returns 1 if the extended double-precision floating-point value `a' is less
   4222 than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
   4223 an exception.  Otherwise, the comparison is performed according to the
   4224 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4225 -------------------------------------------------------------------------------
   4226 */
   4227 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
   4228 {
   4229     flag aSign, bSign;
   4230 
   4231     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4232               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4233          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4234               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4235        ) {
   4236         if (    floatx80_is_signaling_nan( a )
   4237              || floatx80_is_signaling_nan( b ) ) {
   4238             float_raise( float_flag_invalid );
   4239         }
   4240         return 0;
   4241     }
   4242     aSign = extractFloatx80Sign( a );
   4243     bSign = extractFloatx80Sign( b );
   4244     if ( aSign != bSign ) {
   4245         return
   4246                aSign
   4247             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   4248                  != 0 );
   4249     }
   4250     return
   4251           aSign ? lt128( b.high, b.low, a.high, a.low )
   4252         : lt128( a.high, a.low, b.high, b.low );
   4253 
   4254 }
   4255 
   4256 #endif
   4257 
   4258 #ifdef FLOAT128
   4259 
   4260 /*
   4261 -------------------------------------------------------------------------------
   4262 Returns the result of converting the quadruple-precision floating-point
   4263 value `a' to the 32-bit two's complement integer format.  The conversion
   4264 is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4265 Arithmetic---which means in particular that the conversion is rounded
   4266 according to the current rounding mode.  If `a' is a NaN, the largest
   4267 positive integer is returned.  Otherwise, if the conversion overflows, the
   4268 largest integer with the same sign as `a' is returned.
   4269 -------------------------------------------------------------------------------
   4270 */
   4271 int32 float128_to_int32( float128 a )
   4272 {
   4273     flag aSign;
   4274     int32 aExp, shiftCount;
   4275     bits64 aSig0, aSig1;
   4276 
   4277     aSig1 = extractFloat128Frac1( a );
   4278     aSig0 = extractFloat128Frac0( a );
   4279     aExp = extractFloat128Exp( a );
   4280     aSign = extractFloat128Sign( a );
   4281     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
   4282     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
   4283     aSig0 |= ( aSig1 != 0 );
   4284     shiftCount = 0x4028 - aExp;
   4285     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
   4286     return roundAndPackInt32( aSign, aSig0 );
   4287 
   4288 }
   4289 
   4290 /*
   4291 -------------------------------------------------------------------------------
   4292 Returns the result of converting the quadruple-precision floating-point
   4293 value `a' to the 32-bit two's complement integer format.  The conversion
   4294 is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4295 Arithmetic, except that the conversion is always rounded toward zero.  If
   4296 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
   4297 conversion overflows, the largest integer with the same sign as `a' is
   4298 returned.
   4299 -------------------------------------------------------------------------------
   4300 */
   4301 int32 float128_to_int32_round_to_zero( float128 a )
   4302 {
   4303     flag aSign;
   4304     int32 aExp, shiftCount;
   4305     bits64 aSig0, aSig1, savedASig;
   4306     int32 z;
   4307 
   4308     aSig1 = extractFloat128Frac1( a );
   4309     aSig0 = extractFloat128Frac0( a );
   4310     aExp = extractFloat128Exp( a );
   4311     aSign = extractFloat128Sign( a );
   4312     aSig0 |= ( aSig1 != 0 );
   4313     if ( 0x401E < aExp ) {
   4314         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
   4315         goto invalid;
   4316     }
   4317     else if ( aExp < 0x3FFF ) {
   4318         if ( aExp || aSig0 ) float_set_inexact();
   4319         return 0;
   4320     }
   4321     aSig0 |= LIT64( 0x0001000000000000 );
   4322     shiftCount = 0x402F - aExp;
   4323     savedASig = aSig0;
   4324     aSig0 >>= shiftCount;
   4325     z = aSig0;
   4326     if ( aSign ) z = - z;
   4327     if ( ( z < 0 ) ^ aSign ) {
   4328  invalid:
   4329         float_raise( float_flag_invalid );
   4330         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
   4331     }
   4332     if ( ( aSig0<<shiftCount ) != savedASig ) {
   4333         float_set_inexact();
   4334     }
   4335     return z;
   4336 
   4337 }
   4338 
   4339 /*
   4340 -------------------------------------------------------------------------------
   4341 Returns the result of converting the quadruple-precision floating-point
   4342 value `a' to the 64-bit two's complement integer format.  The conversion
   4343 is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4344 Arithmetic---which means in particular that the conversion is rounded
   4345 according to the current rounding mode.  If `a' is a NaN, the largest
   4346 positive integer is returned.  Otherwise, if the conversion overflows, the
   4347 largest integer with the same sign as `a' is returned.
   4348 -------------------------------------------------------------------------------
   4349 */
   4350 int64 float128_to_int64( float128 a )
   4351 {
   4352     flag aSign;
   4353     int32 aExp, shiftCount;
   4354     bits64 aSig0, aSig1;
   4355 
   4356     aSig1 = extractFloat128Frac1( a );
   4357     aSig0 = extractFloat128Frac0( a );
   4358     aExp = extractFloat128Exp( a );
   4359     aSign = extractFloat128Sign( a );
   4360     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
   4361     shiftCount = 0x402F - aExp;
   4362     if ( shiftCount <= 0 ) {
   4363         if ( 0x403E < aExp ) {
   4364             float_raise( float_flag_invalid );
   4365             if (    ! aSign
   4366                  || (    ( aExp == 0x7FFF )
   4367                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
   4368                     )
   4369                ) {
   4370                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   4371             }
   4372             return (sbits64) LIT64( 0x8000000000000000 );
   4373         }
   4374         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
   4375     }
   4376     else {
   4377         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
   4378     }
   4379     return roundAndPackInt64( aSign, aSig0, aSig1 );
   4380 
   4381 }
   4382 
   4383 /*
   4384 -------------------------------------------------------------------------------
   4385 Returns the result of converting the quadruple-precision floating-point
   4386 value `a' to the 64-bit two's complement integer format.  The conversion
   4387 is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4388 Arithmetic, except that the conversion is always rounded toward zero.
   4389 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   4390 the conversion overflows, the largest integer with the same sign as `a' is
   4391 returned.
   4392 -------------------------------------------------------------------------------
   4393 */
   4394 int64 float128_to_int64_round_to_zero( float128 a )
   4395 {
   4396     flag aSign;
   4397     int32 aExp, shiftCount;
   4398     bits64 aSig0, aSig1;
   4399     int64 z;
   4400 
   4401     aSig1 = extractFloat128Frac1( a );
   4402     aSig0 = extractFloat128Frac0( a );
   4403     aExp = extractFloat128Exp( a );
   4404     aSign = extractFloat128Sign( a );
   4405     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
   4406     shiftCount = aExp - 0x402F;
   4407     if ( 0 < shiftCount ) {
   4408         if ( 0x403E <= aExp ) {
   4409             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
   4410             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
   4411                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
   4412                 if ( aSig1 ) float_set_inexact();
   4413             }
   4414             else {
   4415                 float_raise( float_flag_invalid );
   4416                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
   4417                     return LIT64( 0x7FFFFFFFFFFFFFFF );
   4418                 }
   4419             }
   4420             return (sbits64) LIT64( 0x8000000000000000 );
   4421         }
   4422         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
   4423         if ( (bits64) ( aSig1<<shiftCount ) ) {
   4424             float_set_inexact();
   4425         }
   4426     }
   4427     else {
   4428         if ( aExp < 0x3FFF ) {
   4429             if ( aExp | aSig0 | aSig1 ) {
   4430                 float_set_inexact();
   4431             }
   4432             return 0;
   4433         }
   4434         z = aSig0>>( - shiftCount );
   4435         if (    aSig1
   4436              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
   4437             float_set_inexact();
   4438         }
   4439     }
   4440     if ( aSign ) z = - z;
   4441     return z;
   4442 
   4443 }
   4444 
   4445 /*
   4446 -------------------------------------------------------------------------------
   4447 Returns the result of converting the quadruple-precision floating-point
   4448 value `a' to the single-precision floating-point format.  The conversion
   4449 is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4450 Arithmetic.
   4451 -------------------------------------------------------------------------------
   4452 */
   4453 float32 float128_to_float32( float128 a )
   4454 {
   4455     flag aSign;
   4456     int32 aExp;
   4457     bits64 aSig0, aSig1;
   4458     bits32 zSig;
   4459 
   4460     aSig1 = extractFloat128Frac1( a );
   4461     aSig0 = extractFloat128Frac0( a );
   4462     aExp = extractFloat128Exp( a );
   4463     aSign = extractFloat128Sign( a );
   4464     if ( aExp == 0x7FFF ) {
   4465         if ( aSig0 | aSig1 ) {
   4466             return commonNaNToFloat32( float128ToCommonNaN( a ) );
   4467         }
   4468         return packFloat32( aSign, 0xFF, 0 );
   4469     }
   4470     aSig0 |= ( aSig1 != 0 );
   4471     shift64RightJamming( aSig0, 18, &aSig0 );
   4472     zSig = aSig0;
   4473     if ( aExp || zSig ) {
   4474         zSig |= 0x40000000;
   4475         aExp -= 0x3F81;
   4476     }
   4477     return roundAndPackFloat32( aSign, aExp, zSig );
   4478 
   4479 }
   4480 
   4481 /*
   4482 -------------------------------------------------------------------------------
   4483 Returns the result of converting the quadruple-precision floating-point
   4484 value `a' to the double-precision floating-point format.  The conversion
   4485 is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4486 Arithmetic.
   4487 -------------------------------------------------------------------------------
   4488 */
   4489 float64 float128_to_float64( float128 a )
   4490 {
   4491     flag aSign;
   4492     int32 aExp;
   4493     bits64 aSig0, aSig1;
   4494 
   4495     aSig1 = extractFloat128Frac1( a );
   4496     aSig0 = extractFloat128Frac0( a );
   4497     aExp = extractFloat128Exp( a );
   4498     aSign = extractFloat128Sign( a );
   4499     if ( aExp == 0x7FFF ) {
   4500         if ( aSig0 | aSig1 ) {
   4501             return commonNaNToFloat64( float128ToCommonNaN( a ) );
   4502         }
   4503         return packFloat64( aSign, 0x7FF, 0 );
   4504     }
   4505     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
   4506     aSig0 |= ( aSig1 != 0 );
   4507     if ( aExp || aSig0 ) {
   4508         aSig0 |= LIT64( 0x4000000000000000 );
   4509         aExp -= 0x3C01;
   4510     }
   4511     return roundAndPackFloat64( aSign, aExp, aSig0 );
   4512 
   4513 }
   4514 
   4515 #ifdef FLOATX80
   4516 
   4517 /*
   4518 -------------------------------------------------------------------------------
   4519 Returns the result of converting the quadruple-precision floating-point
   4520 value `a' to the extended double-precision floating-point format.  The
   4521 conversion is performed according to the IEC/IEEE Standard for Binary
   4522 Floating-Point Arithmetic.
   4523 -------------------------------------------------------------------------------
   4524 */
   4525 floatx80 float128_to_floatx80( float128 a )
   4526 {
   4527     flag aSign;
   4528     int32 aExp;
   4529     bits64 aSig0, aSig1;
   4530 
   4531     aSig1 = extractFloat128Frac1( a );
   4532     aSig0 = extractFloat128Frac0( a );
   4533     aExp = extractFloat128Exp( a );
   4534     aSign = extractFloat128Sign( a );
   4535     if ( aExp == 0x7FFF ) {
   4536         if ( aSig0 | aSig1 ) {
   4537             return commonNaNToFloatx80( float128ToCommonNaN( a ) );
   4538         }
   4539         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   4540     }
   4541     if ( aExp == 0 ) {
   4542         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
   4543         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
   4544     }
   4545     else {
   4546         aSig0 |= LIT64( 0x0001000000000000 );
   4547     }
   4548     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
   4549     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
   4550 
   4551 }
   4552 
   4553 #endif
   4554 
   4555 /*
   4556 -------------------------------------------------------------------------------
   4557 Rounds the quadruple-precision floating-point value `a' to an integer, and
   4558 returns the result as a quadruple-precision floating-point value.  The
   4559 operation is performed according to the IEC/IEEE Standard for Binary
   4560 Floating-Point Arithmetic.
   4561 -------------------------------------------------------------------------------
   4562 */
   4563 float128 float128_round_to_int( float128 a )
   4564 {
   4565     flag aSign;
   4566     int32 aExp;
   4567     bits64 lastBitMask, roundBitsMask;
   4568     int8 roundingMode;
   4569     float128 z;
   4570 
   4571     aExp = extractFloat128Exp( a );
   4572     if ( 0x402F <= aExp ) {
   4573         if ( 0x406F <= aExp ) {
   4574             if (    ( aExp == 0x7FFF )
   4575                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
   4576                ) {
   4577                 return propagateFloat128NaN( a, a );
   4578             }
   4579             return a;
   4580         }
   4581         lastBitMask = 1;
   4582         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
   4583         roundBitsMask = lastBitMask - 1;
   4584         z = a;
   4585         roundingMode = float_rounding_mode();
   4586         if ( roundingMode == float_round_nearest_even ) {
   4587             if ( lastBitMask ) {
   4588                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
   4589                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
   4590             }
   4591             else {
   4592                 if ( (sbits64) z.low < 0 ) {
   4593                     ++z.high;
   4594                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
   4595                 }
   4596             }
   4597         }
   4598         else if ( roundingMode != float_round_to_zero ) {
   4599             if (   extractFloat128Sign( z )
   4600                  ^ ( roundingMode == float_round_up ) ) {
   4601                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
   4602             }
   4603         }
   4604         z.low &= ~ roundBitsMask;
   4605     }
   4606     else {
   4607         if ( aExp < 0x3FFF ) {
   4608             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
   4609             float_set_inexact();
   4610             aSign = extractFloat128Sign( a );
   4611             switch ( float_rounding_mode() ) {
   4612              case float_round_nearest_even:
   4613                 if (    ( aExp == 0x3FFE )
   4614                      && (   extractFloat128Frac0( a )
   4615                           | extractFloat128Frac1( a ) )
   4616                    ) {
   4617                     return packFloat128( aSign, 0x3FFF, 0, 0 );
   4618                 }
   4619                 break;
   4620              case float_round_down:
   4621                 return
   4622                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
   4623                     : packFloat128( 0, 0, 0, 0 );
   4624              case float_round_up:
   4625                 return
   4626                       aSign ? packFloat128( 1, 0, 0, 0 )
   4627                     : packFloat128( 0, 0x3FFF, 0, 0 );
   4628             }
   4629             return packFloat128( aSign, 0, 0, 0 );
   4630         }
   4631         lastBitMask = 1;
   4632         lastBitMask <<= 0x402F - aExp;
   4633         roundBitsMask = lastBitMask - 1;
   4634         z.low = 0;
   4635         z.high = a.high;
   4636         roundingMode = float_rounding_mode();
   4637         if ( roundingMode == float_round_nearest_even ) {
   4638             z.high += lastBitMask>>1;
   4639             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
   4640                 z.high &= ~ lastBitMask;
   4641             }
   4642         }
   4643         else if ( roundingMode != float_round_to_zero ) {
   4644             if (   extractFloat128Sign( z )
   4645                  ^ ( roundingMode == float_round_up ) ) {
   4646                 z.high |= ( a.low != 0 );
   4647                 z.high += roundBitsMask;
   4648             }
   4649         }
   4650         z.high &= ~ roundBitsMask;
   4651     }
   4652     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
   4653         float_set_inexact();
   4654     }
   4655     return z;
   4656 
   4657 }
   4658 
   4659 /*
   4660 -------------------------------------------------------------------------------
   4661 Returns the result of adding the absolute values of the quadruple-precision
   4662 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
   4663 before being returned.  `zSign' is ignored if the result is a NaN.
   4664 The addition is performed according to the IEC/IEEE Standard for Binary
   4665 Floating-Point Arithmetic.
   4666 -------------------------------------------------------------------------------
   4667 */
   4668 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
   4669 {
   4670     int32 aExp, bExp, zExp;
   4671     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
   4672     int32 expDiff;
   4673 
   4674     aSig1 = extractFloat128Frac1( a );
   4675     aSig0 = extractFloat128Frac0( a );
   4676     aExp = extractFloat128Exp( a );
   4677     bSig1 = extractFloat128Frac1( b );
   4678     bSig0 = extractFloat128Frac0( b );
   4679     bExp = extractFloat128Exp( b );
   4680     expDiff = aExp - bExp;
   4681     if ( 0 < expDiff ) {
   4682         if ( aExp == 0x7FFF ) {
   4683             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
   4684             return a;
   4685         }
   4686         if ( bExp == 0 ) {
   4687             --expDiff;
   4688         }
   4689         else {
   4690             bSig0 |= LIT64( 0x0001000000000000 );
   4691         }
   4692         shift128ExtraRightJamming(
   4693             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
   4694         zExp = aExp;
   4695     }
   4696     else if ( expDiff < 0 ) {
   4697         if ( bExp == 0x7FFF ) {
   4698             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
   4699             return packFloat128( zSign, 0x7FFF, 0, 0 );
   4700         }
   4701         if ( aExp == 0 ) {
   4702             ++expDiff;
   4703         }
   4704         else {
   4705             aSig0 |= LIT64( 0x0001000000000000 );
   4706         }
   4707         shift128ExtraRightJamming(
   4708             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
   4709         zExp = bExp;
   4710     }
   4711     else {
   4712         if ( aExp == 0x7FFF ) {
   4713             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
   4714                 return propagateFloat128NaN( a, b );
   4715             }
   4716             return a;
   4717         }
   4718         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
   4719         if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
   4720         zSig2 = 0;
   4721         zSig0 |= LIT64( 0x0002000000000000 );
   4722         zExp = aExp;
   4723         goto shiftRight1;
   4724     }
   4725     aSig0 |= LIT64( 0x0001000000000000 );
   4726     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
   4727     --zExp;
   4728     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
   4729     ++zExp;
   4730  shiftRight1:
   4731     shift128ExtraRightJamming(
   4732         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
   4733  roundAndPack:
   4734     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
   4735 
   4736 }
   4737 
   4738 /*
   4739 -------------------------------------------------------------------------------
   4740 Returns the result of subtracting the absolute values of the quadruple-
   4741 precision floating-point values `a' and `b'.  If `zSign' is 1, the
   4742 difference is negated before being returned.  `zSign' is ignored if the
   4743 result is a NaN.  The subtraction is performed according to the IEC/IEEE
   4744 Standard for Binary Floating-Point Arithmetic.
   4745 -------------------------------------------------------------------------------
   4746 */
   4747 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
   4748 {
   4749     int32 aExp, bExp, zExp;
   4750     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
   4751     int32 expDiff;
   4752     float128 z;
   4753 
   4754     aSig1 = extractFloat128Frac1( a );
   4755     aSig0 = extractFloat128Frac0( a );
   4756     aExp = extractFloat128Exp( a );
   4757     bSig1 = extractFloat128Frac1( b );
   4758     bSig0 = extractFloat128Frac0( b );
   4759     bExp = extractFloat128Exp( b );
   4760     expDiff = aExp - bExp;
   4761     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
   4762     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
   4763     if ( 0 < expDiff ) goto aExpBigger;
   4764     if ( expDiff < 0 ) goto bExpBigger;
   4765     if ( aExp == 0x7FFF ) {
   4766         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
   4767             return propagateFloat128NaN( a, b );
   4768         }
   4769         float_raise( float_flag_invalid );
   4770         z.low = float128_default_nan_low;
   4771         z.high = float128_default_nan_high;
   4772         return z;
   4773     }
   4774     if ( aExp == 0 ) {
   4775         aExp = 1;
   4776         bExp = 1;
   4777     }
   4778     if ( bSig0 < aSig0 ) goto aBigger;
   4779     if ( aSig0 < bSig0 ) goto bBigger;
   4780     if ( bSig1 < aSig1 ) goto aBigger;
   4781     if ( aSig1 < bSig1 ) goto bBigger;
   4782     return packFloat128( float_rounding_mode() == float_round_down, 0, 0, 0 );
   4783  bExpBigger:
   4784     if ( bExp == 0x7FFF ) {
   4785         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
   4786         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
   4787     }
   4788     if ( aExp == 0 ) {
   4789         ++expDiff;
   4790     }
   4791     else {
   4792         aSig0 |= LIT64( 0x4000000000000000 );
   4793     }
   4794     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
   4795     bSig0 |= LIT64( 0x4000000000000000 );
   4796  bBigger:
   4797     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
   4798     zExp = bExp;
   4799     zSign ^= 1;
   4800     goto normalizeRoundAndPack;
   4801  aExpBigger:
   4802     if ( aExp == 0x7FFF ) {
   4803         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
   4804         return a;
   4805     }
   4806     if ( bExp == 0 ) {
   4807         --expDiff;
   4808     }
   4809     else {
   4810         bSig0 |= LIT64( 0x4000000000000000 );
   4811     }
   4812     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
   4813     aSig0 |= LIT64( 0x4000000000000000 );
   4814  aBigger:
   4815     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
   4816     zExp = aExp;
   4817  normalizeRoundAndPack:
   4818     --zExp;
   4819     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
   4820 
   4821 }
   4822 
   4823 /*
   4824 -------------------------------------------------------------------------------
   4825 Returns the result of adding the quadruple-precision floating-point values
   4826 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   4827 for Binary Floating-Point Arithmetic.
   4828 -------------------------------------------------------------------------------
   4829 */
   4830 float128 float128_add( float128 a, float128 b )
   4831 {
   4832     flag aSign, bSign;
   4833 
   4834     aSign = extractFloat128Sign( a );
   4835     bSign = extractFloat128Sign( b );
   4836     if ( aSign == bSign ) {
   4837         return addFloat128Sigs( a, b, aSign );
   4838     }
   4839     else {
   4840         return subFloat128Sigs( a, b, aSign );
   4841     }
   4842 
   4843 }
   4844 
   4845 /*
   4846 -------------------------------------------------------------------------------
   4847 Returns the result of subtracting the quadruple-precision floating-point
   4848 values `a' and `b'.  The operation is performed according to the IEC/IEEE
   4849 Standard for Binary Floating-Point Arithmetic.
   4850 -------------------------------------------------------------------------------
   4851 */
   4852 float128 float128_sub( float128 a, float128 b )
   4853 {
   4854     flag aSign, bSign;
   4855 
   4856     aSign = extractFloat128Sign( a );
   4857     bSign = extractFloat128Sign( b );
   4858     if ( aSign == bSign ) {
   4859         return subFloat128Sigs( a, b, aSign );
   4860     }
   4861     else {
   4862         return addFloat128Sigs( a, b, aSign );
   4863     }
   4864 
   4865 }
   4866 
   4867 /*
   4868 -------------------------------------------------------------------------------
   4869 Returns the result of multiplying the quadruple-precision floating-point
   4870 values `a' and `b'.  The operation is performed according to the IEC/IEEE
   4871 Standard for Binary Floating-Point Arithmetic.
   4872 -------------------------------------------------------------------------------
   4873 */
   4874 float128 float128_mul( float128 a, float128 b )
   4875 {
   4876     flag aSign, bSign, zSign;
   4877     int32 aExp, bExp, zExp;
   4878     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
   4879     float128 z;
   4880 
   4881     aSig1 = extractFloat128Frac1( a );
   4882     aSig0 = extractFloat128Frac0( a );
   4883     aExp = extractFloat128Exp( a );
   4884     aSign = extractFloat128Sign( a );
   4885     bSig1 = extractFloat128Frac1( b );
   4886     bSig0 = extractFloat128Frac0( b );
   4887     bExp = extractFloat128Exp( b );
   4888     bSign = extractFloat128Sign( b );
   4889     zSign = aSign ^ bSign;
   4890     if ( aExp == 0x7FFF ) {
   4891         if (    ( aSig0 | aSig1 )
   4892              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
   4893             return propagateFloat128NaN( a, b );
   4894         }
   4895         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
   4896         return packFloat128( zSign, 0x7FFF, 0, 0 );
   4897     }
   4898     if ( bExp == 0x7FFF ) {
   4899         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
   4900         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
   4901  invalid:
   4902             float_raise( float_flag_invalid );
   4903             z.low = float128_default_nan_low;
   4904             z.high = float128_default_nan_high;
   4905             return z;
   4906         }
   4907         return packFloat128( zSign, 0x7FFF, 0, 0 );
   4908     }
   4909     if ( aExp == 0 ) {
   4910         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
   4911         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
   4912     }
   4913     if ( bExp == 0 ) {
   4914         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
   4915         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
   4916     }
   4917     zExp = aExp + bExp - 0x4000;
   4918     aSig0 |= LIT64( 0x0001000000000000 );
   4919     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
   4920     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
   4921     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
   4922     zSig2 |= ( zSig3 != 0 );
   4923     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
   4924         shift128ExtraRightJamming(
   4925             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
   4926         ++zExp;
   4927     }
   4928     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
   4929 
   4930 }
   4931 
   4932 /*
   4933 -------------------------------------------------------------------------------
   4934 Returns the result of dividing the quadruple-precision floating-point value
   4935 `a' by the corresponding value `b'.  The operation is performed according to
   4936 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4937 -------------------------------------------------------------------------------
   4938 */
   4939 float128 float128_div( float128 a, float128 b )
   4940 {
   4941     flag aSign, bSign, zSign;
   4942     int32 aExp, bExp, zExp;
   4943     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
   4944     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
   4945     float128 z;
   4946 
   4947     aSig1 = extractFloat128Frac1( a );
   4948     aSig0 = extractFloat128Frac0( a );
   4949     aExp = extractFloat128Exp( a );
   4950     aSign = extractFloat128Sign( a );
   4951     bSig1 = extractFloat128Frac1( b );
   4952     bSig0 = extractFloat128Frac0( b );
   4953     bExp = extractFloat128Exp( b );
   4954     bSign = extractFloat128Sign( b );
   4955     zSign = aSign ^ bSign;
   4956     if ( aExp == 0x7FFF ) {
   4957         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
   4958         if ( bExp == 0x7FFF ) {
   4959             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
   4960             goto invalid;
   4961         }
   4962         return packFloat128( zSign, 0x7FFF, 0, 0 );
   4963     }
   4964     if ( bExp == 0x7FFF ) {
   4965         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
   4966         return packFloat128( zSign, 0, 0, 0 );
   4967     }
   4968     if ( bExp == 0 ) {
   4969         if ( ( bSig0 | bSig1 ) == 0 ) {
   4970             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
   4971  invalid:
   4972                 float_raise( float_flag_invalid );
   4973                 z.low = float128_default_nan_low;
   4974                 z.high = float128_default_nan_high;
   4975                 return z;
   4976             }
   4977             float_raise( float_flag_divbyzero );
   4978             return packFloat128( zSign, 0x7FFF, 0, 0 );
   4979         }
   4980         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
   4981     }
   4982     if ( aExp == 0 ) {
   4983         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
   4984         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
   4985     }
   4986     zExp = aExp - bExp + 0x3FFD;
   4987     shortShift128Left(
   4988         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
   4989     shortShift128Left(
   4990         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
   4991     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
   4992         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
   4993         ++zExp;
   4994     }
   4995     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
   4996     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
   4997     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
   4998     while ( (sbits64) rem0 < 0 ) {
   4999         --zSig0;
   5000         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
   5001     }
   5002     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
   5003     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
   5004         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
   5005         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
   5006         while ( (sbits64) rem1 < 0 ) {
   5007             --zSig1;
   5008             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
   5009         }
   5010         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
   5011     }
   5012     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
   5013     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
   5014 
   5015 }
   5016 
   5017 /*
   5018 -------------------------------------------------------------------------------
   5019 Returns the remainder of the quadruple-precision floating-point value `a'
   5020 with respect to the corresponding value `b'.  The operation is performed
   5021 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5022 -------------------------------------------------------------------------------
   5023 */
   5024 float128 float128_rem( float128 a, float128 b )
   5025 {
   5026     flag aSign, bSign, zSign;
   5027     int32 aExp, bExp, expDiff;
   5028     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
   5029     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
   5030     sbits64 sigMean0;
   5031     float128 z;
   5032 
   5033     aSig1 = extractFloat128Frac1( a );
   5034     aSig0 = extractFloat128Frac0( a );
   5035     aExp = extractFloat128Exp( a );
   5036     aSign = extractFloat128Sign( a );
   5037     bSig1 = extractFloat128Frac1( b );
   5038     bSig0 = extractFloat128Frac0( b );
   5039     bExp = extractFloat128Exp( b );
   5040     bSign = extractFloat128Sign( b );
   5041     if ( aExp == 0x7FFF ) {
   5042         if (    ( aSig0 | aSig1 )
   5043              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
   5044             return propagateFloat128NaN( a, b );
   5045         }
   5046         goto invalid;
   5047     }
   5048     if ( bExp == 0x7FFF ) {
   5049         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
   5050         return a;
   5051     }
   5052     if ( bExp == 0 ) {
   5053         if ( ( bSig0 | bSig1 ) == 0 ) {
   5054  invalid:
   5055             float_raise( float_flag_invalid );
   5056             z.low = float128_default_nan_low;
   5057             z.high = float128_default_nan_high;
   5058             return z;
   5059         }
   5060         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
   5061     }
   5062     if ( aExp == 0 ) {
   5063         if ( ( aSig0 | aSig1 ) == 0 ) return a;
   5064         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
   5065     }
   5066     expDiff = aExp - bExp;
   5067     if ( expDiff < -1 ) return a;
   5068     shortShift128Left(
   5069         aSig0 | LIT64( 0x0001000000000000 ),
   5070         aSig1,
   5071         15 - ( expDiff < 0 ),
   5072         &aSig0,
   5073         &aSig1
   5074     );
   5075     shortShift128Left(
   5076         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
   5077     q = le128( bSig0, bSig1, aSig0, aSig1 );
   5078     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
   5079     expDiff -= 64;
   5080     while ( 0 < expDiff ) {
   5081         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
   5082         q = ( 4 < q ) ? q - 4 : 0;
   5083         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
   5084         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
   5085         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
   5086         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
   5087         expDiff -= 61;
   5088     }
   5089     if ( -64 < expDiff ) {
   5090         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
   5091         q = ( 4 < q ) ? q - 4 : 0;
   5092         q >>= - expDiff;
   5093         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
   5094         expDiff += 52;
   5095         if ( expDiff < 0 ) {
   5096             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
   5097         }
   5098         else {
   5099             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
   5100         }
   5101         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
   5102         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
   5103     }
   5104     else {
   5105         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
   5106         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
   5107     }
   5108     do {
   5109         alternateASig0 = aSig0;
   5110         alternateASig1 = aSig1;
   5111         ++q;
   5112         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
   5113     } while ( 0 <= (sbits64) aSig0 );
   5114     add128(
   5115         aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
   5116     if (    ( sigMean0 < 0 )
   5117          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
   5118         aSig0 = alternateASig0;
   5119         aSig1 = alternateASig1;
   5120     }
   5121     zSign = ( (sbits64) aSig0 < 0 );
   5122     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
   5123     return
   5124         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
   5125 
   5126 }
   5127 
   5128 /*
   5129 -------------------------------------------------------------------------------
   5130 Returns the square root of the quadruple-precision floating-point value `a'.
   5131 The operation is performed according to the IEC/IEEE Standard for Binary
   5132 Floating-Point Arithmetic.
   5133 -------------------------------------------------------------------------------
   5134 */
   5135 float128 float128_sqrt( float128 a )
   5136 {
   5137     flag aSign;
   5138     int32 aExp, zExp;
   5139     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
   5140     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
   5141     float128 z;
   5142 
   5143     aSig1 = extractFloat128Frac1( a );
   5144     aSig0 = extractFloat128Frac0( a );
   5145     aExp = extractFloat128Exp( a );
   5146     aSign = extractFloat128Sign( a );
   5147     if ( aExp == 0x7FFF ) {
   5148         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
   5149         if ( ! aSign ) return a;
   5150         goto invalid;
   5151     }
   5152     if ( aSign ) {
   5153         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
   5154  invalid:
   5155         float_raise( float_flag_invalid );
   5156         z.low = float128_default_nan_low;
   5157         z.high = float128_default_nan_high;
   5158         return z;
   5159     }
   5160     if ( aExp == 0 ) {
   5161         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
   5162         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
   5163     }
   5164     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
   5165     aSig0 |= LIT64( 0x0001000000000000 );
   5166     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
   5167     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
   5168     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
   5169     doubleZSig0 = zSig0<<1;
   5170     mul64To128( zSig0, zSig0, &term0, &term1 );
   5171     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
   5172     while ( (sbits64) rem0 < 0 ) {
   5173         --zSig0;
   5174         doubleZSig0 -= 2;
   5175         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
   5176     }
   5177     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
   5178     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
   5179         if ( zSig1 == 0 ) zSig1 = 1;
   5180         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
   5181         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
   5182         mul64To128( zSig1, zSig1, &term2, &term3 );
   5183         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
   5184         while ( (sbits64) rem1 < 0 ) {
   5185             --zSig1;
   5186             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
   5187             term3 |= 1;
   5188             term2 |= doubleZSig0;
   5189             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
   5190         }
   5191         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
   5192     }
   5193     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
   5194     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
   5195 
   5196 }
   5197 
   5198 /*
   5199 -------------------------------------------------------------------------------
   5200 Returns 1 if the quadruple-precision floating-point value `a' is equal to
   5201 the corresponding value `b', and 0 otherwise.  The comparison is performed
   5202 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5203 -------------------------------------------------------------------------------
   5204 */
   5205 flag float128_eq( float128 a, float128 b )
   5206 {
   5207 
   5208     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5209               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5210          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5211               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5212        ) {
   5213         if (    float128_is_signaling_nan( a )
   5214              || float128_is_signaling_nan( b ) ) {
   5215             float_raise( float_flag_invalid );
   5216         }
   5217         return 0;
   5218     }
   5219     return
   5220            ( a.low == b.low )
   5221         && (    ( a.high == b.high )
   5222              || (    ( a.low == 0 )
   5223                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
   5224            );
   5225 
   5226 }
   5227 
   5228 /*
   5229 -------------------------------------------------------------------------------
   5230 Returns 1 if the quadruple-precision floating-point value `a' is less than
   5231 or equal to the corresponding value `b', and 0 otherwise.  The comparison
   5232 is performed according to the IEC/IEEE Standard for Binary Floating-Point
   5233 Arithmetic.
   5234 -------------------------------------------------------------------------------
   5235 */
   5236 flag float128_le( float128 a, float128 b )
   5237 {
   5238     flag aSign, bSign;
   5239 
   5240     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5241               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5242          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5243               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5244        ) {
   5245         float_raise( float_flag_invalid );
   5246         return 0;
   5247     }
   5248     aSign = extractFloat128Sign( a );
   5249     bSign = extractFloat128Sign( b );
   5250     if ( aSign != bSign ) {
   5251         return
   5252                aSign
   5253             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   5254                  == 0 );
   5255     }
   5256     return
   5257           aSign ? le128( b.high, b.low, a.high, a.low )
   5258         : le128( a.high, a.low, b.high, b.low );
   5259 
   5260 }
   5261 
   5262 /*
   5263 -------------------------------------------------------------------------------
   5264 Returns 1 if the quadruple-precision floating-point value `a' is less than
   5265 the corresponding value `b', and 0 otherwise.  The comparison is performed
   5266 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5267 -------------------------------------------------------------------------------
   5268 */
   5269 flag float128_lt( float128 a, float128 b )
   5270 {
   5271     flag aSign, bSign;
   5272 
   5273     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5274               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5275          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5276               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5277        ) {
   5278         float_raise( float_flag_invalid );
   5279         return 0;
   5280     }
   5281     aSign = extractFloat128Sign( a );
   5282     bSign = extractFloat128Sign( b );
   5283     if ( aSign != bSign ) {
   5284         return
   5285                aSign
   5286             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   5287                  != 0 );
   5288     }
   5289     return
   5290           aSign ? lt128( b.high, b.low, a.high, a.low )
   5291         : lt128( a.high, a.low, b.high, b.low );
   5292 
   5293 }
   5294 
   5295 /*
   5296 -------------------------------------------------------------------------------
   5297 Returns 1 if the quadruple-precision floating-point value `a' is equal to
   5298 the corresponding value `b', and 0 otherwise.  The invalid exception is
   5299 raised if either operand is a NaN.  Otherwise, the comparison is performed
   5300 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5301 -------------------------------------------------------------------------------
   5302 */
   5303 flag float128_eq_signaling( float128 a, float128 b )
   5304 {
   5305 
   5306     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5307               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5308          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5309               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5310        ) {
   5311         float_raise( float_flag_invalid );
   5312         return 0;
   5313     }
   5314     return
   5315            ( a.low == b.low )
   5316         && (    ( a.high == b.high )
   5317              || (    ( a.low == 0 )
   5318                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
   5319            );
   5320 
   5321 }
   5322 
   5323 /*
   5324 -------------------------------------------------------------------------------
   5325 Returns 1 if the quadruple-precision floating-point value `a' is less than
   5326 or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
   5327 cause an exception.  Otherwise, the comparison is performed according to the
   5328 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5329 -------------------------------------------------------------------------------
   5330 */
   5331 flag float128_le_quiet( float128 a, float128 b )
   5332 {
   5333     flag aSign, bSign;
   5334 
   5335     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5336               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5337          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5338               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5339        ) {
   5340         if (    float128_is_signaling_nan( a )
   5341              || float128_is_signaling_nan( b ) ) {
   5342             float_raise( float_flag_invalid );
   5343         }
   5344         return 0;
   5345     }
   5346     aSign = extractFloat128Sign( a );
   5347     bSign = extractFloat128Sign( b );
   5348     if ( aSign != bSign ) {
   5349         return
   5350                aSign
   5351             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   5352                  == 0 );
   5353     }
   5354     return
   5355           aSign ? le128( b.high, b.low, a.high, a.low )
   5356         : le128( a.high, a.low, b.high, b.low );
   5357 
   5358 }
   5359 
   5360 /*
   5361 -------------------------------------------------------------------------------
   5362 Returns 1 if the quadruple-precision floating-point value `a' is less than
   5363 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
   5364 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
   5365 Standard for Binary Floating-Point Arithmetic.
   5366 -------------------------------------------------------------------------------
   5367 */
   5368 flag float128_lt_quiet( float128 a, float128 b )
   5369 {
   5370     flag aSign, bSign;
   5371 
   5372     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5373               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5374          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5375               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5376        ) {
   5377         if (    float128_is_signaling_nan( a )
   5378              || float128_is_signaling_nan( b ) ) {
   5379             float_raise( float_flag_invalid );
   5380         }
   5381         return 0;
   5382     }
   5383     aSign = extractFloat128Sign( a );
   5384     bSign = extractFloat128Sign( b );
   5385     if ( aSign != bSign ) {
   5386         return
   5387                aSign
   5388             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   5389                  != 0 );
   5390     }
   5391     return
   5392           aSign ? lt128( b.high, b.low, a.high, a.low )
   5393         : lt128( a.high, a.low, b.high, b.low );
   5394 
   5395 }
   5396 
   5397 #endif
   5398 
   5399 
   5400 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
   5401 
   5402 /*
   5403  * These two routines are not part of the original softfloat distribution.
   5404  *
   5405  * They are based on the corresponding conversions to integer but return
   5406  * unsigned numbers instead since these functions are required by GCC.
   5407  *
   5408  * Added by Mark Brinicombe <mark (at) NetBSD.org>	27/09/97
   5409  *
   5410  * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
   5411  */
   5412 
   5413 /*
   5414 -------------------------------------------------------------------------------
   5415 Returns the result of converting the double-precision floating-point value
   5416 `a' to the 32-bit unsigned integer format.  The conversion is
   5417 performed according to the IEC/IEEE Standard for Binary Floating-point
   5418 Arithmetic, except that the conversion is always rounded toward zero.  If
   5419 `a' is a NaN, the largest positive integer is returned.  If the conversion
   5420 overflows, the largest integer positive is returned.
   5421 -------------------------------------------------------------------------------
   5422 */
   5423 uint32 float64_to_uint32_round_to_zero( float64 a )
   5424 {
   5425     flag aSign;
   5426     int16 aExp, shiftCount;
   5427     bits64 aSig, savedASig;
   5428     uint32 z;
   5429 
   5430     aSig = extractFloat64Frac( a );
   5431     aExp = extractFloat64Exp( a );
   5432     aSign = extractFloat64Sign( a );
   5433 
   5434     if (aSign) {
   5435         float_raise( float_flag_invalid );
   5436     	return(0);
   5437     }
   5438 
   5439     if ( 0x41E < aExp ) {
   5440         float_raise( float_flag_invalid );
   5441         return 0xffffffff;
   5442     }
   5443     else if ( aExp < 0x3FF ) {
   5444         if ( aExp || aSig ) float_set_inexact();
   5445         return 0;
   5446     }
   5447     aSig |= LIT64( 0x0010000000000000 );
   5448     shiftCount = 0x433 - aExp;
   5449     savedASig = aSig;
   5450     aSig >>= shiftCount;
   5451     z = aSig;
   5452     if ( ( aSig<<shiftCount ) != savedASig ) {
   5453         float_set_inexact();
   5454     }
   5455     return z;
   5456 
   5457 }
   5458 
   5459 /*
   5460 -------------------------------------------------------------------------------
   5461 Returns the result of converting the single-precision floating-point value
   5462 `a' to the 32-bit unsigned integer format.  The conversion is
   5463 performed according to the IEC/IEEE Standard for Binary Floating-point
   5464 Arithmetic, except that the conversion is always rounded toward zero.  If
   5465 `a' is a NaN, the largest positive integer is returned.  If the conversion
   5466 overflows, the largest positive integer is returned.
   5467 -------------------------------------------------------------------------------
   5468 */
   5469 uint32 float32_to_uint32_round_to_zero( float32 a )
   5470 {
   5471     flag aSign;
   5472     int16 aExp, shiftCount;
   5473     bits32 aSig;
   5474     uint32 z;
   5475 
   5476     aSig = extractFloat32Frac( a );
   5477     aExp = extractFloat32Exp( a );
   5478     aSign = extractFloat32Sign( a );
   5479     shiftCount = aExp - 0x9E;
   5480 
   5481     if (aSign) {
   5482         float_raise( float_flag_invalid );
   5483     	return(0);
   5484     }
   5485     if ( 0 < shiftCount ) {
   5486         float_raise( float_flag_invalid );
   5487         return 0xFFFFFFFF;
   5488     }
   5489     else if ( aExp <= 0x7E ) {
   5490         if ( aExp | aSig ) float_set_inexact();
   5491         return 0;
   5492     }
   5493     aSig = ( aSig | 0x800000 )<<8;
   5494     z = aSig>>( - shiftCount );
   5495     if ( aSig<<( shiftCount & 31 ) ) {
   5496         float_set_inexact();
   5497     }
   5498     return z;
   5499 
   5500 }
   5501 
   5502 #endif
   5503 
   5504 #endif /* _STANDALONE */
   5505