Home | History | Annotate | Line # | Download | only in libkern
softfloat.c revision 1.6
      1  1.6    martin /* $NetBSD: softfloat.c,v 1.6 2017/12/31 11:43:42 martin Exp $ */
      2  1.1      ross 
      3  1.1      ross /*
      4  1.1      ross  * This version hacked for use with gcc -msoft-float by bjh21.
      5  1.1      ross  * (Mostly a case of #ifdefing out things GCC doesn't need or provides
      6  1.1      ross  *  itself).
      7  1.1      ross  */
      8  1.1      ross 
      9  1.1      ross /*
     10  1.1      ross  * Things you may want to define:
     11  1.1      ross  *
     12  1.1      ross  * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
     13  1.1      ross  *   -msoft-float) to work.  Include "softfloat-for-gcc.h" to get them
     14  1.1      ross  *   properly renamed.
     15  1.1      ross  */
     16  1.1      ross 
     17  1.1      ross /*
     18  1.1      ross ===============================================================================
     19  1.1      ross 
     20  1.1      ross This C source file is part of the SoftFloat IEC/IEEE Floating-point
     21  1.1      ross Arithmetic Package, Release 2a.
     22  1.1      ross 
     23  1.1      ross Written by John R. Hauser.  This work was made possible in part by the
     24  1.1      ross International Computer Science Institute, located at Suite 600, 1947 Center
     25  1.1      ross Street, Berkeley, California 94704.  Funding was partially provided by the
     26  1.1      ross National Science Foundation under grant MIP-9311980.  The original version
     27  1.1      ross of this code was written as part of a project to build a fixed-point vector
     28  1.1      ross processor in collaboration with the University of California at Berkeley,
     29  1.1      ross overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
     30  1.1      ross is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
     31  1.1      ross arithmetic/SoftFloat.html'.
     32  1.1      ross 
     33  1.1      ross THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
     34  1.1      ross has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
     35  1.1      ross TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
     36  1.1      ross PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
     37  1.1      ross AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
     38  1.1      ross 
     39  1.1      ross Derivative works are acceptable, even for commercial purposes, so long as
     40  1.1      ross (1) they include prominent notice that the work is derivative, and (2) they
     41  1.1      ross include prominent notice akin to these four paragraphs for those parts of
     42  1.1      ross this code that are retained.
     43  1.1      ross 
     44  1.1      ross ===============================================================================
     45  1.1      ross */
     46  1.1      ross 
     47  1.2   thorpej /* If you need this in a boot program, you have bigger problems... */
     48  1.2   thorpej #ifndef _STANDALONE
     49  1.2   thorpej 
     50  1.1      ross #include <sys/cdefs.h>
     51  1.1      ross #if defined(LIBC_SCCS) && !defined(lint)
     52  1.6    martin __RCSID("$NetBSD: softfloat.c,v 1.6 2017/12/31 11:43:42 martin Exp $");
     53  1.1      ross #endif /* LIBC_SCCS and not lint */
     54  1.1      ross 
     55  1.1      ross #ifdef SOFTFLOAT_FOR_GCC
     56  1.1      ross #include "softfloat-for-gcc.h"
     57  1.1      ross #endif
     58  1.1      ross 
     59  1.1      ross #include "milieu.h"
     60  1.1      ross #include "softfloat.h"
     61  1.1      ross 
     62  1.1      ross /*
     63  1.1      ross  * Conversions between floats as stored in memory and floats as
     64  1.1      ross  * SoftFloat uses them
     65  1.1      ross  */
     66  1.1      ross #ifndef FLOAT64_DEMANGLE
     67  1.1      ross #define FLOAT64_DEMANGLE(a)	(a)
     68  1.1      ross #endif
     69  1.1      ross #ifndef FLOAT64_MANGLE
     70  1.1      ross #define FLOAT64_MANGLE(a)	(a)
     71  1.1      ross #endif
     72  1.1      ross 
     73  1.1      ross /*
     74  1.1      ross -------------------------------------------------------------------------------
     75  1.1      ross Floating-point rounding mode, extended double-precision rounding precision,
     76  1.1      ross and exception flags.
     77  1.1      ross -------------------------------------------------------------------------------
     78  1.1      ross */
     79  1.1      ross 
     80  1.1      ross /*
     81  1.1      ross  * XXX: This may cause options-MULTIPROCESSOR or thread problems someday.
     82  1.1      ross  * 	Right now, it does not.  I've removed all other dynamic global
     83  1.1      ross  * 	variables. [ross]
     84  1.1      ross  */
     85  1.1      ross #ifdef FLOATX80
     86  1.1      ross int8 floatx80_rounding_precision = 80;
     87  1.1      ross #endif
     88  1.1      ross 
     89  1.1      ross /*
     90  1.1      ross -------------------------------------------------------------------------------
     91  1.1      ross Primitive arithmetic functions, including multi-word arithmetic, and
     92  1.1      ross division and square root approximations.  (Can be specialized to target if
     93  1.1      ross desired.)
     94  1.1      ross -------------------------------------------------------------------------------
     95  1.1      ross */
     96  1.1      ross #include "softfloat-macros.h"
     97  1.1      ross 
     98  1.1      ross /*
     99  1.1      ross -------------------------------------------------------------------------------
    100  1.1      ross Functions and definitions to determine:  (1) whether tininess for underflow
    101  1.1      ross is detected before or after rounding by default, (2) what (if anything)
    102  1.1      ross happens when exceptions are raised, (3) how signaling NaNs are distinguished
    103  1.1      ross from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
    104  1.1      ross are propagated from function inputs to output.  These details are target-
    105  1.1      ross specific.
    106  1.1      ross -------------------------------------------------------------------------------
    107  1.1      ross */
    108  1.1      ross #include "softfloat-specialize.h"
    109  1.1      ross 
    110  1.1      ross #ifndef SOFTFLOAT_FOR_GCC /* Not used */
    111  1.1      ross /*
    112  1.1      ross -------------------------------------------------------------------------------
    113  1.1      ross Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
    114  1.1      ross and 7, and returns the properly rounded 32-bit integer corresponding to the
    115  1.1      ross input.  If `zSign' is 1, the input is negated before being converted to an
    116  1.1      ross integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
    117  1.1      ross is simply rounded to an integer, with the inexact exception raised if the
    118  1.1      ross input cannot be represented exactly as an integer.  However, if the fixed-
    119  1.1      ross point input is too large, the invalid exception is raised and the largest
    120  1.1      ross positive or negative integer is returned.
    121  1.1      ross -------------------------------------------------------------------------------
    122  1.1      ross */
    123  1.1      ross static int32 roundAndPackInt32( flag zSign, bits64 absZ )
    124  1.1      ross {
    125  1.1      ross     int8 roundingMode;
    126  1.1      ross     flag roundNearestEven;
    127  1.1      ross     int8 roundIncrement, roundBits;
    128  1.1      ross     int32 z;
    129  1.1      ross 
    130  1.1      ross     roundingMode = float_rounding_mode();
    131  1.1      ross     roundNearestEven = ( roundingMode == float_round_nearest_even );
    132  1.1      ross     roundIncrement = 0x40;
    133  1.1      ross     if ( ! roundNearestEven ) {
    134  1.1      ross         if ( roundingMode == float_round_to_zero ) {
    135  1.1      ross             roundIncrement = 0;
    136  1.1      ross         }
    137  1.1      ross         else {
    138  1.1      ross             roundIncrement = 0x7F;
    139  1.1      ross             if ( zSign ) {
    140  1.1      ross                 if ( roundingMode == float_round_up ) roundIncrement = 0;
    141  1.1      ross             }
    142  1.1      ross             else {
    143  1.1      ross                 if ( roundingMode == float_round_down ) roundIncrement = 0;
    144  1.1      ross             }
    145  1.1      ross         }
    146  1.1      ross     }
    147  1.1      ross     roundBits = absZ & 0x7F;
    148  1.1      ross     absZ = ( absZ + roundIncrement )>>7;
    149  1.1      ross     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
    150  1.1      ross     z = absZ;
    151  1.1      ross     if ( zSign ) z = - z;
    152  1.1      ross     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
    153  1.1      ross         float_raise( float_flag_invalid );
    154  1.1      ross         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
    155  1.1      ross     }
    156  1.1      ross     if ( roundBits ) float_set_inexact();
    157  1.1      ross     return z;
    158  1.1      ross 
    159  1.1      ross }
    160  1.1      ross 
    161  1.1      ross /*
    162  1.1      ross -------------------------------------------------------------------------------
    163  1.1      ross Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
    164  1.1      ross `absZ1', with binary point between bits 63 and 64 (between the input words),
    165  1.1      ross and returns the properly rounded 64-bit integer corresponding to the input.
    166  1.1      ross If `zSign' is 1, the input is negated before being converted to an integer.
    167  1.1      ross Ordinarily, the fixed-point input is simply rounded to an integer, with
    168  1.1      ross the inexact exception raised if the input cannot be represented exactly as
    169  1.1      ross an integer.  However, if the fixed-point input is too large, the invalid
    170  1.1      ross exception is raised and the largest positive or negative integer is
    171  1.1      ross returned.
    172  1.1      ross -------------------------------------------------------------------------------
    173  1.1      ross */
    174  1.1      ross static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
    175  1.1      ross {
    176  1.1      ross     int8 roundingMode;
    177  1.1      ross     flag roundNearestEven, increment;
    178  1.1      ross     int64 z;
    179  1.1      ross 
    180  1.1      ross     roundingMode = float_rounding_mode();
    181  1.1      ross     roundNearestEven = ( roundingMode == float_round_nearest_even );
    182  1.1      ross     increment = ( (sbits64) absZ1 < 0 );
    183  1.1      ross     if ( ! roundNearestEven ) {
    184  1.1      ross         if ( roundingMode == float_round_to_zero ) {
    185  1.1      ross             increment = 0;
    186  1.1      ross         }
    187  1.1      ross         else {
    188  1.1      ross             if ( zSign ) {
    189  1.1      ross                 increment = ( roundingMode == float_round_down ) && absZ1;
    190  1.1      ross             }
    191  1.1      ross             else {
    192  1.1      ross                 increment = ( roundingMode == float_round_up ) && absZ1;
    193  1.1      ross             }
    194  1.1      ross         }
    195  1.1      ross     }
    196  1.1      ross     if ( increment ) {
    197  1.1      ross         ++absZ0;
    198  1.1      ross         if ( absZ0 == 0 ) goto overflow;
    199  1.1      ross         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
    200  1.1      ross     }
    201  1.1      ross     z = absZ0;
    202  1.1      ross     if ( zSign ) z = - z;
    203  1.1      ross     if ( z && ( ( z < 0 ) ^ zSign ) ) {
    204  1.1      ross  overflow:
    205  1.1      ross         float_raise( float_flag_invalid );
    206  1.1      ross         return
    207  1.1      ross               zSign ? (sbits64) LIT64( 0x8000000000000000 )
    208  1.1      ross             : LIT64( 0x7FFFFFFFFFFFFFFF );
    209  1.1      ross     }
    210  1.1      ross     if ( absZ1 ) float_set_inexact();
    211  1.1      ross     return z;
    212  1.1      ross 
    213  1.1      ross }
    214  1.6    martin 
    215  1.6    martin /* same as above, but for unsigned values */
    216  1.6    martin static uint64 roundAndPackUInt64( bits64 absZ0, bits64 absZ1 )
    217  1.6    martin {
    218  1.6    martin     int8 roundingMode;
    219  1.6    martin     flag roundNearestEven, increment;
    220  1.6    martin     uint64 z;
    221  1.6    martin 
    222  1.6    martin     roundingMode = float_rounding_mode();
    223  1.6    martin     roundNearestEven = ( roundingMode == float_round_nearest_even );
    224  1.6    martin     increment = ( (sbits64) absZ1 < 0 );
    225  1.6    martin     if ( ! roundNearestEven ) {
    226  1.6    martin         if ( roundingMode == float_round_to_zero ) {
    227  1.6    martin             increment = 0;
    228  1.6    martin         }
    229  1.6    martin         else {
    230  1.6    martin             increment = ( roundingMode == float_round_up ) && absZ1;
    231  1.6    martin         }
    232  1.6    martin     }
    233  1.6    martin     if ( increment ) {
    234  1.6    martin         ++absZ0;
    235  1.6    martin         if ( absZ0 == 0 ) {
    236  1.6    martin             float_raise( float_flag_invalid );
    237  1.6    martin             return LIT64( 0x7FFFFFFFFFFFFFFF );
    238  1.6    martin 	}
    239  1.6    martin         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
    240  1.6    martin     }
    241  1.6    martin     z = absZ0;
    242  1.6    martin     if ( absZ1 ) float_set_inexact();
    243  1.6    martin     return z;
    244  1.6    martin 
    245  1.6    martin }
    246  1.1      ross #endif
    247  1.1      ross 
    248  1.1      ross /*
    249  1.1      ross -------------------------------------------------------------------------------
    250  1.1      ross Returns the fraction bits of the single-precision floating-point value `a'.
    251  1.1      ross -------------------------------------------------------------------------------
    252  1.1      ross */
    253  1.1      ross INLINE bits32 extractFloat32Frac( float32 a )
    254  1.1      ross {
    255  1.1      ross 
    256  1.1      ross     return a & 0x007FFFFF;
    257  1.1      ross 
    258  1.1      ross }
    259  1.1      ross 
    260  1.1      ross /*
    261  1.1      ross -------------------------------------------------------------------------------
    262  1.1      ross Returns the exponent bits of the single-precision floating-point value `a'.
    263  1.1      ross -------------------------------------------------------------------------------
    264  1.1      ross */
    265  1.1      ross INLINE int16 extractFloat32Exp( float32 a )
    266  1.1      ross {
    267  1.1      ross 
    268  1.1      ross     return ( a>>23 ) & 0xFF;
    269  1.1      ross 
    270  1.1      ross }
    271  1.1      ross 
    272  1.1      ross /*
    273  1.1      ross -------------------------------------------------------------------------------
    274  1.1      ross Returns the sign bit of the single-precision floating-point value `a'.
    275  1.1      ross -------------------------------------------------------------------------------
    276  1.1      ross */
    277  1.1      ross INLINE flag extractFloat32Sign( float32 a )
    278  1.1      ross {
    279  1.1      ross 
    280  1.1      ross     return a>>31;
    281  1.1      ross 
    282  1.1      ross }
    283  1.1      ross 
    284  1.1      ross /*
    285  1.1      ross -------------------------------------------------------------------------------
    286  1.1      ross Normalizes the subnormal single-precision floating-point value represented
    287  1.1      ross by the denormalized significand `aSig'.  The normalized exponent and
    288  1.1      ross significand are stored at the locations pointed to by `zExpPtr' and
    289  1.1      ross `zSigPtr', respectively.
    290  1.1      ross -------------------------------------------------------------------------------
    291  1.1      ross */
    292  1.1      ross static void
    293  1.1      ross  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
    294  1.1      ross {
    295  1.1      ross     int8 shiftCount;
    296  1.1      ross 
    297  1.1      ross     shiftCount = countLeadingZeros32( aSig ) - 8;
    298  1.1      ross     *zSigPtr = aSig<<shiftCount;
    299  1.1      ross     *zExpPtr = 1 - shiftCount;
    300  1.1      ross 
    301  1.1      ross }
    302  1.1      ross 
    303  1.1      ross /*
    304  1.1      ross -------------------------------------------------------------------------------
    305  1.1      ross Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
    306  1.1      ross single-precision floating-point value, returning the result.  After being
    307  1.1      ross shifted into the proper positions, the three fields are simply added
    308  1.1      ross together to form the result.  This means that any integer portion of `zSig'
    309  1.1      ross will be added into the exponent.  Since a properly normalized significand
    310  1.1      ross will have an integer portion equal to 1, the `zExp' input should be 1 less
    311  1.1      ross than the desired result exponent whenever `zSig' is a complete, normalized
    312  1.1      ross significand.
    313  1.1      ross -------------------------------------------------------------------------------
    314  1.1      ross */
    315  1.1      ross INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
    316  1.1      ross {
    317  1.1      ross 
    318  1.1      ross     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
    319  1.1      ross 
    320  1.1      ross }
    321  1.1      ross 
    322  1.1      ross /*
    323  1.1      ross -------------------------------------------------------------------------------
    324  1.1      ross Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    325  1.1      ross and significand `zSig', and returns the proper single-precision floating-
    326  1.1      ross point value corresponding to the abstract input.  Ordinarily, the abstract
    327  1.1      ross value is simply rounded and packed into the single-precision format, with
    328  1.1      ross the inexact exception raised if the abstract input cannot be represented
    329  1.1      ross exactly.  However, if the abstract value is too large, the overflow and
    330  1.1      ross inexact exceptions are raised and an infinity or maximal finite value is
    331  1.1      ross returned.  If the abstract value is too small, the input value is rounded to
    332  1.1      ross a subnormal number, and the underflow and inexact exceptions are raised if
    333  1.1      ross the abstract input cannot be represented exactly as a subnormal single-
    334  1.1      ross precision floating-point number.
    335  1.1      ross     The input significand `zSig' has its binary point between bits 30
    336  1.1      ross and 29, which is 7 bits to the left of the usual location.  This shifted
    337  1.1      ross significand must be normalized or smaller.  If `zSig' is not normalized,
    338  1.1      ross `zExp' must be 0; in that case, the result returned is a subnormal number,
    339  1.1      ross and it must not require rounding.  In the usual case that `zSig' is
    340  1.1      ross normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
    341  1.1      ross The handling of underflow and overflow follows the IEC/IEEE Standard for
    342  1.1      ross Binary Floating-Point Arithmetic.
    343  1.1      ross -------------------------------------------------------------------------------
    344  1.1      ross */
    345  1.1      ross static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
    346  1.1      ross {
    347  1.1      ross     int8 roundingMode;
    348  1.1      ross     flag roundNearestEven;
    349  1.1      ross     int8 roundIncrement, roundBits;
    350  1.1      ross     flag isTiny;
    351  1.1      ross 
    352  1.1      ross     roundingMode = float_rounding_mode();
    353  1.1      ross     roundNearestEven = ( roundingMode == float_round_nearest_even );
    354  1.1      ross     roundIncrement = 0x40;
    355  1.1      ross     if ( ! roundNearestEven ) {
    356  1.1      ross         if ( roundingMode == float_round_to_zero ) {
    357  1.1      ross             roundIncrement = 0;
    358  1.1      ross         }
    359  1.1      ross         else {
    360  1.1      ross             roundIncrement = 0x7F;
    361  1.1      ross             if ( zSign ) {
    362  1.1      ross                 if ( roundingMode == float_round_up ) roundIncrement = 0;
    363  1.1      ross             }
    364  1.1      ross             else {
    365  1.1      ross                 if ( roundingMode == float_round_down ) roundIncrement = 0;
    366  1.1      ross             }
    367  1.1      ross         }
    368  1.1      ross     }
    369  1.1      ross     roundBits = zSig & 0x7F;
    370  1.1      ross     if ( 0xFD <= (bits16) zExp ) {
    371  1.1      ross         if (    ( 0xFD < zExp )
    372  1.1      ross              || (    ( zExp == 0xFD )
    373  1.1      ross                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
    374  1.1      ross            ) {
    375  1.1      ross             float_raise( float_flag_overflow | float_flag_inexact );
    376  1.1      ross             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
    377  1.1      ross         }
    378  1.1      ross         if ( zExp < 0 ) {
    379  1.1      ross             isTiny =
    380  1.1      ross                    ( float_detect_tininess == float_tininess_before_rounding )
    381  1.1      ross                 || ( zExp < -1 )
    382  1.1      ross                 || ( zSig + roundIncrement < 0x80000000 );
    383  1.1      ross             shift32RightJamming( zSig, - zExp, &zSig );
    384  1.1      ross             zExp = 0;
    385  1.1      ross             roundBits = zSig & 0x7F;
    386  1.1      ross             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
    387  1.1      ross         }
    388  1.1      ross     }
    389  1.1      ross     if ( roundBits ) float_set_inexact();
    390  1.1      ross     zSig = ( zSig + roundIncrement )>>7;
    391  1.1      ross     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
    392  1.1      ross     if ( zSig == 0 ) zExp = 0;
    393  1.1      ross     return packFloat32( zSign, zExp, zSig );
    394  1.1      ross 
    395  1.1      ross }
    396  1.1      ross 
    397  1.1      ross /*
    398  1.1      ross -------------------------------------------------------------------------------
    399  1.1      ross Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    400  1.1      ross and significand `zSig', and returns the proper single-precision floating-
    401  1.1      ross point value corresponding to the abstract input.  This routine is just like
    402  1.1      ross `roundAndPackFloat32' except that `zSig' does not have to be normalized.
    403  1.1      ross Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
    404  1.1      ross floating-point exponent.
    405  1.1      ross -------------------------------------------------------------------------------
    406  1.1      ross */
    407  1.1      ross static float32
    408  1.1      ross  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
    409  1.1      ross {
    410  1.1      ross     int8 shiftCount;
    411  1.1      ross 
    412  1.1      ross     shiftCount = countLeadingZeros32( zSig ) - 1;
    413  1.1      ross     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
    414  1.1      ross 
    415  1.1      ross }
    416  1.1      ross 
    417  1.1      ross /*
    418  1.1      ross -------------------------------------------------------------------------------
    419  1.1      ross Returns the fraction bits of the double-precision floating-point value `a'.
    420  1.1      ross -------------------------------------------------------------------------------
    421  1.1      ross */
    422  1.1      ross INLINE bits64 extractFloat64Frac( float64 a )
    423  1.1      ross {
    424  1.1      ross 
    425  1.1      ross     return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
    426  1.1      ross 
    427  1.1      ross }
    428  1.1      ross 
    429  1.1      ross /*
    430  1.1      ross -------------------------------------------------------------------------------
    431  1.1      ross Returns the exponent bits of the double-precision floating-point value `a'.
    432  1.1      ross -------------------------------------------------------------------------------
    433  1.1      ross */
    434  1.1      ross INLINE int16 extractFloat64Exp( float64 a )
    435  1.1      ross {
    436  1.1      ross 
    437  1.1      ross     return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
    438  1.1      ross 
    439  1.1      ross }
    440  1.1      ross 
    441  1.1      ross /*
    442  1.1      ross -------------------------------------------------------------------------------
    443  1.1      ross Returns the sign bit of the double-precision floating-point value `a'.
    444  1.1      ross -------------------------------------------------------------------------------
    445  1.1      ross */
    446  1.1      ross INLINE flag extractFloat64Sign( float64 a )
    447  1.1      ross {
    448  1.1      ross 
    449  1.1      ross     return FLOAT64_DEMANGLE(a)>>63;
    450  1.1      ross 
    451  1.1      ross }
    452  1.1      ross 
    453  1.1      ross /*
    454  1.1      ross -------------------------------------------------------------------------------
    455  1.1      ross Normalizes the subnormal double-precision floating-point value represented
    456  1.1      ross by the denormalized significand `aSig'.  The normalized exponent and
    457  1.1      ross significand are stored at the locations pointed to by `zExpPtr' and
    458  1.1      ross `zSigPtr', respectively.
    459  1.1      ross -------------------------------------------------------------------------------
    460  1.1      ross */
    461  1.1      ross static void
    462  1.1      ross  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
    463  1.1      ross {
    464  1.1      ross     int8 shiftCount;
    465  1.1      ross 
    466  1.1      ross     shiftCount = countLeadingZeros64( aSig ) - 11;
    467  1.1      ross     *zSigPtr = aSig<<shiftCount;
    468  1.1      ross     *zExpPtr = 1 - shiftCount;
    469  1.1      ross 
    470  1.1      ross }
    471  1.1      ross 
    472  1.1      ross /*
    473  1.1      ross -------------------------------------------------------------------------------
    474  1.1      ross Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
    475  1.1      ross double-precision floating-point value, returning the result.  After being
    476  1.1      ross shifted into the proper positions, the three fields are simply added
    477  1.1      ross together to form the result.  This means that any integer portion of `zSig'
    478  1.1      ross will be added into the exponent.  Since a properly normalized significand
    479  1.1      ross will have an integer portion equal to 1, the `zExp' input should be 1 less
    480  1.1      ross than the desired result exponent whenever `zSig' is a complete, normalized
    481  1.1      ross significand.
    482  1.1      ross -------------------------------------------------------------------------------
    483  1.1      ross */
    484  1.1      ross INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
    485  1.1      ross {
    486  1.1      ross 
    487  1.1      ross     return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
    488  1.1      ross 			   ( ( (bits64) zExp )<<52 ) + zSig );
    489  1.1      ross 
    490  1.1      ross }
    491  1.1      ross 
    492  1.1      ross /*
    493  1.1      ross -------------------------------------------------------------------------------
    494  1.1      ross Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    495  1.1      ross and significand `zSig', and returns the proper double-precision floating-
    496  1.1      ross point value corresponding to the abstract input.  Ordinarily, the abstract
    497  1.1      ross value is simply rounded and packed into the double-precision format, with
    498  1.1      ross the inexact exception raised if the abstract input cannot be represented
    499  1.1      ross exactly.  However, if the abstract value is too large, the overflow and
    500  1.1      ross inexact exceptions are raised and an infinity or maximal finite value is
    501  1.1      ross returned.  If the abstract value is too small, the input value is rounded to
    502  1.1      ross a subnormal number, and the underflow and inexact exceptions are raised if
    503  1.1      ross the abstract input cannot be represented exactly as a subnormal double-
    504  1.1      ross precision floating-point number.
    505  1.1      ross     The input significand `zSig' has its binary point between bits 62
    506  1.1      ross and 61, which is 10 bits to the left of the usual location.  This shifted
    507  1.1      ross significand must be normalized or smaller.  If `zSig' is not normalized,
    508  1.1      ross `zExp' must be 0; in that case, the result returned is a subnormal number,
    509  1.1      ross and it must not require rounding.  In the usual case that `zSig' is
    510  1.1      ross normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
    511  1.1      ross The handling of underflow and overflow follows the IEC/IEEE Standard for
    512  1.1      ross Binary Floating-Point Arithmetic.
    513  1.1      ross -------------------------------------------------------------------------------
    514  1.1      ross */
    515  1.1      ross static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
    516  1.1      ross {
    517  1.1      ross     int8 roundingMode;
    518  1.1      ross     flag roundNearestEven;
    519  1.1      ross     int16 roundIncrement, roundBits;
    520  1.1      ross     flag isTiny;
    521  1.1      ross 
    522  1.1      ross     roundingMode = float_rounding_mode();
    523  1.1      ross     roundNearestEven = ( roundingMode == float_round_nearest_even );
    524  1.1      ross     roundIncrement = 0x200;
    525  1.1      ross     if ( ! roundNearestEven ) {
    526  1.1      ross         if ( roundingMode == float_round_to_zero ) {
    527  1.1      ross             roundIncrement = 0;
    528  1.1      ross         }
    529  1.1      ross         else {
    530  1.1      ross             roundIncrement = 0x3FF;
    531  1.1      ross             if ( zSign ) {
    532  1.1      ross                 if ( roundingMode == float_round_up ) roundIncrement = 0;
    533  1.1      ross             }
    534  1.1      ross             else {
    535  1.1      ross                 if ( roundingMode == float_round_down ) roundIncrement = 0;
    536  1.1      ross             }
    537  1.1      ross         }
    538  1.1      ross     }
    539  1.1      ross     roundBits = zSig & 0x3FF;
    540  1.1      ross     if ( 0x7FD <= (bits16) zExp ) {
    541  1.1      ross         if (    ( 0x7FD < zExp )
    542  1.1      ross              || (    ( zExp == 0x7FD )
    543  1.1      ross                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
    544  1.1      ross            ) {
    545  1.1      ross             float_raise( float_flag_overflow | float_flag_inexact );
    546  1.1      ross             return FLOAT64_MANGLE(
    547  1.1      ross 		FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
    548  1.1      ross 		( roundIncrement == 0 ));
    549  1.1      ross         }
    550  1.1      ross         if ( zExp < 0 ) {
    551  1.1      ross             isTiny =
    552  1.1      ross                    ( float_detect_tininess == float_tininess_before_rounding )
    553  1.1      ross                 || ( zExp < -1 )
    554  1.1      ross                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
    555  1.1      ross             shift64RightJamming( zSig, - zExp, &zSig );
    556  1.1      ross             zExp = 0;
    557  1.1      ross             roundBits = zSig & 0x3FF;
    558  1.1      ross             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
    559  1.1      ross         }
    560  1.1      ross     }
    561  1.1      ross     if ( roundBits ) float_set_inexact();
    562  1.1      ross     zSig = ( zSig + roundIncrement )>>10;
    563  1.1      ross     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
    564  1.1      ross     if ( zSig == 0 ) zExp = 0;
    565  1.1      ross     return packFloat64( zSign, zExp, zSig );
    566  1.1      ross 
    567  1.1      ross }
    568  1.1      ross 
    569  1.1      ross /*
    570  1.1      ross -------------------------------------------------------------------------------
    571  1.1      ross Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    572  1.1      ross and significand `zSig', and returns the proper double-precision floating-
    573  1.1      ross point value corresponding to the abstract input.  This routine is just like
    574  1.1      ross `roundAndPackFloat64' except that `zSig' does not have to be normalized.
    575  1.1      ross Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
    576  1.1      ross floating-point exponent.
    577  1.1      ross -------------------------------------------------------------------------------
    578  1.1      ross */
    579  1.1      ross static float64
    580  1.1      ross  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
    581  1.1      ross {
    582  1.1      ross     int8 shiftCount;
    583  1.1      ross 
    584  1.1      ross     shiftCount = countLeadingZeros64( zSig ) - 1;
    585  1.1      ross     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
    586  1.1      ross 
    587  1.1      ross }
    588  1.1      ross 
    589  1.1      ross #ifdef FLOATX80
    590  1.1      ross 
    591  1.1      ross /*
    592  1.1      ross -------------------------------------------------------------------------------
    593  1.1      ross Returns the fraction bits of the extended double-precision floating-point
    594  1.1      ross value `a'.
    595  1.1      ross -------------------------------------------------------------------------------
    596  1.1      ross */
    597  1.1      ross INLINE bits64 extractFloatx80Frac( floatx80 a )
    598  1.1      ross {
    599  1.1      ross 
    600  1.1      ross     return a.low;
    601  1.1      ross 
    602  1.1      ross }
    603  1.1      ross 
    604  1.1      ross /*
    605  1.1      ross -------------------------------------------------------------------------------
    606  1.1      ross Returns the exponent bits of the extended double-precision floating-point
    607  1.1      ross value `a'.
    608  1.1      ross -------------------------------------------------------------------------------
    609  1.1      ross */
    610  1.1      ross INLINE int32 extractFloatx80Exp( floatx80 a )
    611  1.1      ross {
    612  1.1      ross 
    613  1.1      ross     return a.high & 0x7FFF;
    614  1.1      ross 
    615  1.1      ross }
    616  1.1      ross 
    617  1.1      ross /*
    618  1.1      ross -------------------------------------------------------------------------------
    619  1.1      ross Returns the sign bit of the extended double-precision floating-point value
    620  1.1      ross `a'.
    621  1.1      ross -------------------------------------------------------------------------------
    622  1.1      ross */
    623  1.1      ross INLINE flag extractFloatx80Sign( floatx80 a )
    624  1.1      ross {
    625  1.1      ross 
    626  1.1      ross     return a.high>>15;
    627  1.1      ross 
    628  1.1      ross }
    629  1.1      ross 
    630  1.1      ross /*
    631  1.1      ross -------------------------------------------------------------------------------
    632  1.1      ross Normalizes the subnormal extended double-precision floating-point value
    633  1.1      ross represented by the denormalized significand `aSig'.  The normalized exponent
    634  1.1      ross and significand are stored at the locations pointed to by `zExpPtr' and
    635  1.1      ross `zSigPtr', respectively.
    636  1.1      ross -------------------------------------------------------------------------------
    637  1.1      ross */
    638  1.1      ross static void
    639  1.1      ross  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
    640  1.1      ross {
    641  1.1      ross     int8 shiftCount;
    642  1.1      ross 
    643  1.1      ross     shiftCount = countLeadingZeros64( aSig );
    644  1.1      ross     *zSigPtr = aSig<<shiftCount;
    645  1.1      ross     *zExpPtr = 1 - shiftCount;
    646  1.1      ross 
    647  1.1      ross }
    648  1.1      ross 
    649  1.1      ross /*
    650  1.1      ross -------------------------------------------------------------------------------
    651  1.1      ross Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
    652  1.1      ross extended double-precision floating-point value, returning the result.
    653  1.1      ross -------------------------------------------------------------------------------
    654  1.1      ross */
    655  1.1      ross INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
    656  1.1      ross {
    657  1.1      ross     floatx80 z;
    658  1.1      ross 
    659  1.1      ross     z.low = zSig;
    660  1.1      ross     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
    661  1.1      ross     return z;
    662  1.1      ross 
    663  1.1      ross }
    664  1.1      ross 
    665  1.1      ross /*
    666  1.1      ross -------------------------------------------------------------------------------
    667  1.1      ross Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    668  1.1      ross and extended significand formed by the concatenation of `zSig0' and `zSig1',
    669  1.1      ross and returns the proper extended double-precision floating-point value
    670  1.1      ross corresponding to the abstract input.  Ordinarily, the abstract value is
    671  1.1      ross rounded and packed into the extended double-precision format, with the
    672  1.1      ross inexact exception raised if the abstract input cannot be represented
    673  1.1      ross exactly.  However, if the abstract value is too large, the overflow and
    674  1.1      ross inexact exceptions are raised and an infinity or maximal finite value is
    675  1.1      ross returned.  If the abstract value is too small, the input value is rounded to
    676  1.1      ross a subnormal number, and the underflow and inexact exceptions are raised if
    677  1.1      ross the abstract input cannot be represented exactly as a subnormal extended
    678  1.1      ross double-precision floating-point number.
    679  1.1      ross     If `roundingPrecision' is 32 or 64, the result is rounded to the same
    680  1.1      ross number of bits as single or double precision, respectively.  Otherwise, the
    681  1.1      ross result is rounded to the full precision of the extended double-precision
    682  1.1      ross format.
    683  1.1      ross     The input significand must be normalized or smaller.  If the input
    684  1.1      ross significand is not normalized, `zExp' must be 0; in that case, the result
    685  1.1      ross returned is a subnormal number, and it must not require rounding.  The
    686  1.1      ross handling of underflow and overflow follows the IEC/IEEE Standard for Binary
    687  1.1      ross Floating-Point Arithmetic.
    688  1.1      ross -------------------------------------------------------------------------------
    689  1.1      ross */
    690  1.1      ross static floatx80
    691  1.1      ross  roundAndPackFloatx80(
    692  1.1      ross      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
    693  1.1      ross  )
    694  1.1      ross {
                           /*
                            * Rounds the significand zSig0:zSig1 under the current rounding mode
                            * and packs it with zSign/zExp through packFloatx80().
                            * A roundingPrecision of 64 or 32 rounds inside zSig0 using a bit
                            * mask; 80 or any other value rounds at the zSig0/zSig1 boundary
                            * (label precision80).  Both paths share the code at label overflow.
                            */
    695  1.1      ross     int8 roundingMode;
    696  1.1      ross     flag roundNearestEven, increment, isTiny;
    697  1.1      ross     int64 roundIncrement, roundMask, roundBits;
    698  1.1      ross 
    699  1.1      ross     roundingMode = float_rounding_mode();
    700  1.1      ross     roundNearestEven = ( roundingMode == float_round_nearest_even );
    701  1.1      ross     if ( roundingPrecision == 80 ) goto precision80;
                           /* roundMask covers the bits to discard; roundIncrement is half of
                              the lowest kept bit (the nearest-even default). */
    702  1.1      ross     if ( roundingPrecision == 64 ) {
    703  1.1      ross         roundIncrement = LIT64( 0x0000000000000400 );
    704  1.1      ross         roundMask = LIT64( 0x00000000000007FF );
    705  1.1      ross     }
    706  1.1      ross     else if ( roundingPrecision == 32 ) {
    707  1.1      ross         roundIncrement = LIT64( 0x0000008000000000 );
    708  1.1      ross         roundMask = LIT64( 0x000000FFFFFFFFFF );
    709  1.1      ross     }
    710  1.1      ross     else {
    711  1.1      ross         goto precision80;
    712  1.1      ross     }
                           /* Fold a nonzero zSig1 into bit 0 of zSig0 so that it still shows
                              up in roundBits. */
    713  1.1      ross     zSig0 |= ( zSig1 != 0 );
    714  1.1      ross     if ( ! roundNearestEven ) {
    715  1.1      ross         if ( roundingMode == float_round_to_zero ) {
    716  1.1      ross             roundIncrement = 0;
    717  1.1      ross         }
    718  1.1      ross         else {
                                   /* Directed rounding: add the full mask unless the mode
                                      truncates for this sign. */
    719  1.1      ross             roundIncrement = roundMask;
    720  1.1      ross             if ( zSign ) {
    721  1.1      ross                 if ( roundingMode == float_round_up ) roundIncrement = 0;
    722  1.1      ross             }
    723  1.1      ross             else {
    724  1.1      ross                 if ( roundingMode == float_round_down ) roundIncrement = 0;
    725  1.1      ross             }
    726  1.1      ross         }
    727  1.1      ross     }
    728  1.1      ross     roundBits = zSig0 & roundMask;
                           /* Single range test (bits32 is presumably unsigned): true when
                              zExp >= 0x7FFE or zExp <= 0. */
    729  1.1      ross     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
    730  1.1      ross         if (    ( 0x7FFE < zExp )
    731  1.1      ross              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
    732  1.1      ross            ) {
                                   /* Jumps into the block below; roundMask keeps its
                                      reduced-precision value for the ~roundMask there. */
    733  1.1      ross             goto overflow;
    734  1.1      ross         }
    735  1.1      ross         if ( zExp <= 0 ) {
                                   /* Last isTiny term: adding the increment does not wrap
                                      past bit 63. */
    736  1.1      ross             isTiny =
    737  1.1      ross                    ( float_detect_tininess == float_tininess_before_rounding )
    738  1.1      ross                 || ( zExp < 0 )
    739  1.1      ross                 || ( zSig0 <= zSig0 + roundIncrement );
                                   /* Denormalize by 1 - zExp places and re-read the bits
                                      that will be dropped. */
    740  1.1      ross             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
    741  1.1      ross             zExp = 0;
    742  1.1      ross             roundBits = zSig0 & roundMask;
    743  1.1      ross             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
    744  1.1      ross             if ( roundBits ) float_set_inexact();
    745  1.1      ross             zSig0 += roundIncrement;
                                   /* Rounding carried into bit 63, so the packed exponent
                                      becomes 1. */
    746  1.1      ross             if ( (sbits64) zSig0 < 0 ) zExp = 1;
                                   /* roundMask + 1 is the lowest kept bit; on an exact tie
                                      under nearest-even that bit is cleared as well. */
    747  1.1      ross             roundIncrement = roundMask + 1;
    748  1.1      ross             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
    749  1.1      ross                 roundMask |= roundIncrement;
    750  1.1      ross             }
    751  1.1      ross             zSig0 &= ~ roundMask;
    752  1.1      ross             return packFloatx80( zSign, zExp, zSig0 );
    753  1.1      ross         }
    754  1.1      ross     }
    755  1.1      ross     if ( roundBits ) float_set_inexact();
    756  1.1      ross     zSig0 += roundIncrement;
                           /* Unsigned wrap-around means the addition carried out of bit 63:
                              bump the exponent and restart the significand at its top bit. */
    757  1.1      ross     if ( zSig0 < roundIncrement ) {
    758  1.1      ross         ++zExp;
    759  1.1      ross         zSig0 = LIT64( 0x8000000000000000 );
    760  1.1      ross     }
                           /* Same tie handling as in the subnormal branch above. */
    761  1.1      ross     roundIncrement = roundMask + 1;
    762  1.1      ross     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
    763  1.1      ross         roundMask |= roundIncrement;
    764  1.1      ross     }
    765  1.1      ross     zSig0 &= ~ roundMask;
    766  1.1      ross     if ( zSig0 == 0 ) zExp = 0;
    767  1.1      ross     return packFloatx80( zSign, zExp, zSig0 );
    768  1.1      ross  precision80:
                           /* Full precision: zSig1 holds the bits below the result.  The
                              nearest-even default increments when its top bit is set. */
    769  1.1      ross     increment = ( (sbits64) zSig1 < 0 );
    770  1.1      ross     if ( ! roundNearestEven ) {
    771  1.1      ross         if ( roundingMode == float_round_to_zero ) {
    772  1.1      ross             increment = 0;
    773  1.1      ross         }
    774  1.1      ross         else {
                                   /* Directed rounding: increment only when rounding away
                                      from zero for this sign and zSig1 is nonzero. */
    775  1.1      ross             if ( zSign ) {
    776  1.1      ross                 increment = ( roundingMode == float_round_down ) && zSig1;
    777  1.1      ross             }
    778  1.1      ross             else {
    779  1.1      ross                 increment = ( roundingMode == float_round_up ) && zSig1;
    780  1.1      ross             }
    781  1.1      ross         }
    782  1.1      ross     }
    783  1.1      ross     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
    784  1.1      ross         if (    ( 0x7FFE < zExp )
    785  1.1      ross              || (    ( zExp == 0x7FFE )
    786  1.1      ross                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
    787  1.1      ross                   && increment
    788  1.1      ross                 )
    789  1.1      ross            ) {
                                   /* With roundMask 0, ~roundMask below is an all-ones
                                      significand. */
    790  1.1      ross             roundMask = 0;
    791  1.1      ross  overflow:
                                   /* Also reached by the goto from the reduced-precision path.
                                      Modes that truncate for this sign return exponent 0x7FFE
                                      with significand ~roundMask; the others return exponent
                                      0x7FFF with only the top significand bit set. */
    792  1.1      ross             float_raise( float_flag_overflow | float_flag_inexact );
    793  1.1      ross             if (    ( roundingMode == float_round_to_zero )
    794  1.1      ross                  || ( zSign && ( roundingMode == float_round_up ) )
    795  1.1      ross                  || ( ! zSign && ( roundingMode == float_round_down ) )
    796  1.1      ross                ) {
    797  1.1      ross                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
    798  1.1      ross             }
    799  1.1      ross             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
    800  1.1      ross         }
    801  1.1      ross         if ( zExp <= 0 ) {
    802  1.1      ross             isTiny =
    803  1.1      ross                    ( float_detect_tininess == float_tininess_before_rounding )
    804  1.1      ross                 || ( zExp < 0 )
    805  1.1      ross                 || ! increment
    806  1.1      ross                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
                                   /* Denormalize both words by 1 - zExp places, then decide
                                      the increment again from the shifted zSig1. */
    807  1.1      ross             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
    808  1.1      ross             zExp = 0;
    809  1.1      ross             if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
    810  1.1      ross             if ( zSig1 ) float_set_inexact();
    811  1.1      ross             if ( roundNearestEven ) {
    812  1.1      ross                 increment = ( (sbits64) zSig1 < 0 );
    813  1.1      ross             }
    814  1.1      ross             else {
    815  1.1      ross                 if ( zSign ) {
    816  1.1      ross                     increment = ( roundingMode == float_round_down ) && zSig1;
    817  1.1      ross                 }
    818  1.1      ross                 else {
    819  1.1      ross                     increment = ( roundingMode == float_round_up ) && zSig1;
    820  1.1      ross                 }
    821  1.1      ross             }
    822  1.1      ross             if ( increment ) {
    823  1.1      ross                 ++zSig0;
                                       /* Exact tie (zSig1 is only its top bit) under
                                          nearest-even: clear bit 0 of the result. */
    824  1.1      ross                 zSig0 &=
    825  1.1      ross                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
    826  1.1      ross                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
    827  1.1      ross             }
    828  1.1      ross             return packFloatx80( zSign, zExp, zSig0 );
    829  1.1      ross         }
    830  1.1      ross     }
    831  1.1      ross     if ( zSig1 ) float_set_inexact();
    832  1.1      ross     if ( increment ) {
    833  1.1      ross         ++zSig0;
                               /* Wrapped to zero: carry into the exponent and restart the
                                  significand at its top bit. */
    834  1.1      ross         if ( zSig0 == 0 ) {
    835  1.1      ross             ++zExp;
    836  1.1      ross             zSig0 = LIT64( 0x8000000000000000 );
    837  1.1      ross         }
    838  1.1      ross         else {
                                   /* Exact tie under nearest-even: clear bit 0. */
    839  1.1      ross             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
    840  1.1      ross         }
    841  1.1      ross     }
    842  1.1      ross     else {
    843  1.1      ross         if ( zSig0 == 0 ) zExp = 0;
    844  1.1      ross     }
    845  1.1      ross     return packFloatx80( zSign, zExp, zSig0 );
    846  1.1      ross 
    847  1.1      ross }
    848  1.1      ross 
    849  1.1      ross /*
    850  1.1      ross -------------------------------------------------------------------------------
    851  1.1      ross Takes an abstract floating-point value having sign `zSign', exponent
    852  1.1      ross `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
    853  1.1      ross and returns the proper extended double-precision floating-point value
    854  1.1      ross corresponding to the abstract input.  This routine is just like
    855  1.1      ross `roundAndPackFloatx80' except that the input significand does not have to be
    856  1.1      ross normalized.
    857  1.1      ross -------------------------------------------------------------------------------
    858  1.1      ross */
    859  1.1      ross static floatx80
    860  1.1      ross  normalizeRoundAndPackFloatx80(
    861  1.1      ross      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
    862  1.1      ross  )
    863  1.1      ross {
    864  1.1      ross     int8 shiftCount;
    865  1.1      ross 
    866  1.1      ross     if ( zSig0 == 0 ) {
    867  1.1      ross         zSig0 = zSig1;
    868  1.1      ross         zSig1 = 0;
    869  1.1      ross         zExp -= 64;
    870  1.1      ross     }
    871  1.1      ross     shiftCount = countLeadingZeros64( zSig0 );
    872  1.1      ross     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    873  1.1      ross     zExp -= shiftCount;
    874  1.1      ross     return
    875  1.1      ross         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
    876  1.1      ross 
    877  1.1      ross }
    878  1.1      ross 
    879  1.1      ross #endif
    880  1.1      ross 
    881  1.1      ross #ifdef FLOAT128
    882  1.1      ross 
    883  1.1      ross /*
    884  1.1      ross -------------------------------------------------------------------------------
    885  1.1      ross Returns the least-significant 64 fraction bits of the quadruple-precision
    886  1.1      ross floating-point value `a'.
    887  1.1      ross -------------------------------------------------------------------------------
    888  1.1      ross */
    889  1.1      ross INLINE bits64 extractFloat128Frac1( float128 a )
    890  1.1      ross {
    891  1.1      ross 
    892  1.1      ross     return a.low;
    893  1.1      ross 
    894  1.1      ross }
    895  1.1      ross 
    896  1.1      ross /*
    897  1.1      ross -------------------------------------------------------------------------------
    898  1.1      ross Returns the most-significant 48 fraction bits of the quadruple-precision
    899  1.1      ross floating-point value `a'.
    900  1.1      ross -------------------------------------------------------------------------------
    901  1.1      ross */
    902  1.1      ross INLINE bits64 extractFloat128Frac0( float128 a )
    903  1.1      ross {
    904  1.1      ross 
    905  1.1      ross     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
    906  1.1      ross 
    907  1.1      ross }
    908  1.1      ross 
    909  1.1      ross /*
    910  1.1      ross -------------------------------------------------------------------------------
    911  1.1      ross Returns the exponent bits of the quadruple-precision floating-point value
    912  1.1      ross `a'.
    913  1.1      ross -------------------------------------------------------------------------------
    914  1.1      ross */
    915  1.1      ross INLINE int32 extractFloat128Exp( float128 a )
    916  1.1      ross {
    917  1.1      ross 
    918  1.1      ross     return ( a.high>>48 ) & 0x7FFF;
    919  1.1      ross 
    920  1.1      ross }
    921  1.1      ross 
    922  1.1      ross /*
    923  1.1      ross -------------------------------------------------------------------------------
    924  1.1      ross Returns the sign bit of the quadruple-precision floating-point value `a'.
    925  1.1      ross -------------------------------------------------------------------------------
    926  1.1      ross */
    927  1.1      ross INLINE flag extractFloat128Sign( float128 a )
    928  1.1      ross {
    929  1.1      ross 
    930  1.1      ross     return a.high>>63;
    931  1.1      ross 
    932  1.1      ross }
    933  1.1      ross 
    934  1.1      ross /*
    935  1.1      ross -------------------------------------------------------------------------------
    936  1.1      ross Normalizes the subnormal quadruple-precision floating-point value
    937  1.1      ross represented by the denormalized significand formed by the concatenation of
    938  1.1      ross `aSig0' and `aSig1'.  The normalized exponent is stored at the location
    939  1.1      ross pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
    940  1.1      ross significand are stored at the location pointed to by `zSig0Ptr', and the
    941  1.1      ross least significant 64 bits of the normalized significand are stored at the
    942  1.1      ross location pointed to by `zSig1Ptr'.
    943  1.1      ross -------------------------------------------------------------------------------
    944  1.1      ross */
    945  1.1      ross static void
    946  1.1      ross  normalizeFloat128Subnormal(
    947  1.1      ross      bits64 aSig0,
    948  1.1      ross      bits64 aSig1,
    949  1.1      ross      int32 *zExpPtr,
    950  1.1      ross      bits64 *zSig0Ptr,
    951  1.1      ross      bits64 *zSig1Ptr
    952  1.1      ross  )
    953  1.1      ross {
    954  1.1      ross     int8 shiftCount;
    955  1.1      ross 
    956  1.1      ross     if ( aSig0 == 0 ) {
    957  1.1      ross         shiftCount = countLeadingZeros64( aSig1 ) - 15;
    958  1.1      ross         if ( shiftCount < 0 ) {
    959  1.1      ross             *zSig0Ptr = aSig1>>( - shiftCount );
    960  1.1      ross             *zSig1Ptr = aSig1<<( shiftCount & 63 );
    961  1.1      ross         }
    962  1.1      ross         else {
    963  1.1      ross             *zSig0Ptr = aSig1<<shiftCount;
    964  1.1      ross             *zSig1Ptr = 0;
    965  1.1      ross         }
    966  1.1      ross         *zExpPtr = - shiftCount - 63;
    967  1.1      ross     }
    968  1.1      ross     else {
    969  1.1      ross         shiftCount = countLeadingZeros64( aSig0 ) - 15;
    970  1.1      ross         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
    971  1.1      ross         *zExpPtr = 1 - shiftCount;
    972  1.1      ross     }
    973  1.1      ross 
    974  1.1      ross }
    975  1.1      ross 
    976  1.1      ross /*
    977  1.1      ross -------------------------------------------------------------------------------
    978  1.1      ross Packs the sign `zSign', the exponent `zExp', and the significand formed
    979  1.1      ross by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
    980  1.1      ross floating-point value, returning the result.  After being shifted into the
    981  1.1      ross proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
    982  1.1      ross added together to form the most significant 64 bits of the result.  This
    983  1.1      ross means that any integer portion of `zSig0' will be added into the exponent.
    984  1.1      ross Since a properly normalized significand will have an integer portion equal
    985  1.1      ross to 1, the `zExp' input should be 1 less than the desired result exponent
    986  1.1      ross whenever `zSig0' and `zSig1' concatenated form a complete, normalized
    987  1.1      ross significand.
    988  1.1      ross -------------------------------------------------------------------------------
    989  1.1      ross */
    990  1.1      ross INLINE float128
    991  1.1      ross  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
    992  1.1      ross {
    993  1.1      ross     float128 z;
    994  1.1      ross 
    995  1.1      ross     z.low = zSig1;
    996  1.1      ross     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
    997  1.1      ross     return z;
    998  1.1      ross 
    999  1.1      ross }
   1000  1.1      ross 
   1001  1.1      ross /*
   1002  1.1      ross -------------------------------------------------------------------------------
   1003  1.1      ross Takes an abstract floating-point value having sign `zSign', exponent `zExp',
   1004  1.1      ross and extended significand formed by the concatenation of `zSig0', `zSig1',
   1005  1.1      ross and `zSig2', and returns the proper quadruple-precision floating-point value
   1006  1.1      ross corresponding to the abstract input.  Ordinarily, the abstract value is
   1007  1.1      ross simply rounded and packed into the quadruple-precision format, with the
   1008  1.1      ross inexact exception raised if the abstract input cannot be represented
   1009  1.1      ross exactly.  However, if the abstract value is too large, the overflow and
   1010  1.1      ross inexact exceptions are raised and an infinity or maximal finite value is
   1011  1.1      ross returned.  If the abstract value is too small, the input value is rounded to
   1012  1.1      ross a subnormal number, and the underflow and inexact exceptions are raised if
   1013  1.1      ross the abstract input cannot be represented exactly as a subnormal quadruple-
   1014  1.1      ross precision floating-point number.
   1015  1.1      ross     The input significand must be normalized or smaller.  If the input
   1016  1.1      ross significand is not normalized, `zExp' must be 0; in that case, the result
   1017  1.1      ross returned is a subnormal number, and it must not require rounding.  In the
   1018  1.1      ross usual case that the input significand is normalized, `zExp' must be 1 less
   1019  1.1      ross than the ``true'' floating-point exponent.  The handling of underflow and
   1020  1.1      ross overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1021  1.1      ross -------------------------------------------------------------------------------
   1022  1.1      ross */
   1023  1.1      ross static float128
   1024  1.1      ross  roundAndPackFloat128(
   1025  1.1      ross      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
   1026  1.1      ross {
   1027  1.1      ross     int8 roundingMode;
   1028  1.1      ross     flag roundNearestEven, increment, isTiny;
   1029  1.1      ross 
   1030  1.1      ross     roundingMode = float_rounding_mode();
   1031  1.1      ross     roundNearestEven = ( roundingMode == float_round_nearest_even );
   1032  1.1      ross     increment = ( (sbits64) zSig2 < 0 );
   1033  1.1      ross     if ( ! roundNearestEven ) {
   1034  1.1      ross         if ( roundingMode == float_round_to_zero ) {
   1035  1.1      ross             increment = 0;
   1036  1.1      ross         }
   1037  1.1      ross         else {
   1038  1.1      ross             if ( zSign ) {
   1039  1.1      ross                 increment = ( roundingMode == float_round_down ) && zSig2;
   1040  1.1      ross             }
   1041  1.1      ross             else {
   1042  1.1      ross                 increment = ( roundingMode == float_round_up ) && zSig2;
   1043  1.1      ross             }
   1044  1.1      ross         }
   1045  1.1      ross     }
   1046  1.1      ross     if ( 0x7FFD <= (bits32) zExp ) {
   1047  1.1      ross         if (    ( 0x7FFD < zExp )
   1048  1.1      ross              || (    ( zExp == 0x7FFD )
   1049  1.1      ross                   && eq128(
   1050  1.1      ross                          LIT64( 0x0001FFFFFFFFFFFF ),
   1051  1.1      ross                          LIT64( 0xFFFFFFFFFFFFFFFF ),
   1052  1.1      ross                          zSig0,
   1053  1.1      ross                          zSig1
   1054  1.1      ross                      )
   1055  1.1      ross                   && increment
   1056  1.1      ross                 )
   1057  1.1      ross            ) {
   1058  1.1      ross             float_raise( float_flag_overflow | float_flag_inexact );
   1059  1.1      ross             if (    ( roundingMode == float_round_to_zero )
   1060  1.1      ross                  || ( zSign && ( roundingMode == float_round_up ) )
   1061  1.1      ross                  || ( ! zSign && ( roundingMode == float_round_down ) )
   1062  1.1      ross                ) {
   1063  1.1      ross                 return
   1064  1.1      ross                     packFloat128(
   1065  1.1      ross                         zSign,
   1066  1.1      ross                         0x7FFE,
   1067  1.1      ross                         LIT64( 0x0000FFFFFFFFFFFF ),
   1068  1.1      ross                         LIT64( 0xFFFFFFFFFFFFFFFF )
   1069  1.1      ross                     );
   1070  1.1      ross             }
   1071  1.1      ross             return packFloat128( zSign, 0x7FFF, 0, 0 );
   1072  1.1      ross         }
   1073  1.1      ross         if ( zExp < 0 ) {
   1074  1.1      ross             isTiny =
   1075  1.1      ross                    ( float_detect_tininess == float_tininess_before_rounding )
   1076  1.1      ross                 || ( zExp < -1 )
   1077  1.1      ross                 || ! increment
   1078  1.1      ross                 || lt128(
   1079  1.1      ross                        zSig0,
   1080  1.1      ross                        zSig1,
   1081  1.1      ross                        LIT64( 0x0001FFFFFFFFFFFF ),
   1082  1.1      ross                        LIT64( 0xFFFFFFFFFFFFFFFF )
   1083  1.1      ross                    );
   1084  1.1      ross             shift128ExtraRightJamming(
   1085  1.1      ross                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
   1086  1.1      ross             zExp = 0;
   1087  1.1      ross             if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
   1088  1.1      ross             if ( roundNearestEven ) {
   1089  1.1      ross                 increment = ( (sbits64) zSig2 < 0 );
   1090  1.1      ross             }
   1091  1.1      ross             else {
   1092  1.1      ross                 if ( zSign ) {
   1093  1.1      ross                     increment = ( roundingMode == float_round_down ) && zSig2;
   1094  1.1      ross                 }
   1095  1.1      ross                 else {
   1096  1.1      ross                     increment = ( roundingMode == float_round_up ) && zSig2;
   1097  1.1      ross                 }
   1098  1.1      ross             }
   1099  1.1      ross         }
   1100  1.1      ross     }
   1101  1.1      ross     if ( zSig2 ) float_set_inexact();
   1102  1.1      ross     if ( increment ) {
   1103  1.1      ross         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
   1104  1.1      ross         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
   1105  1.1      ross     }
   1106  1.1      ross     else {
   1107  1.1      ross         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
   1108  1.1      ross     }
   1109  1.1      ross     return packFloat128( zSign, zExp, zSig0, zSig1 );
   1110  1.1      ross 
   1111  1.1      ross }
   1112  1.1      ross 
   1113  1.1      ross /*
   1114  1.1      ross -------------------------------------------------------------------------------
   1115  1.1      ross Takes an abstract floating-point value having sign `zSign', exponent `zExp',
   1116  1.1      ross and significand formed by the concatenation of `zSig0' and `zSig1', and
   1117  1.1      ross returns the proper quadruple-precision floating-point value corresponding
   1118  1.1      ross to the abstract input.  This routine is just like `roundAndPackFloat128'
   1119  1.1      ross except that the input significand has fewer bits and does not have to be
   1120  1.1      ross normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
   1121  1.1      ross point exponent.
   1122  1.1      ross -------------------------------------------------------------------------------
   1123  1.1      ross */
   1124  1.1      ross static float128
   1125  1.1      ross  normalizeRoundAndPackFloat128(
   1126  1.1      ross      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
   1127  1.1      ross {
   1128  1.1      ross     int8 shiftCount;
   1129  1.1      ross     bits64 zSig2;
   1130  1.1      ross 
   1131  1.1      ross     if ( zSig0 == 0 ) {
   1132  1.1      ross         zSig0 = zSig1;
   1133  1.1      ross         zSig1 = 0;
   1134  1.1      ross         zExp -= 64;
   1135  1.1      ross     }
   1136  1.1      ross     shiftCount = countLeadingZeros64( zSig0 ) - 15;
   1137  1.1      ross     if ( 0 <= shiftCount ) {
   1138  1.1      ross         zSig2 = 0;
   1139  1.1      ross         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
   1140  1.1      ross     }
   1141  1.1      ross     else {
   1142  1.1      ross         shift128ExtraRightJamming(
   1143  1.1      ross             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
   1144  1.1      ross     }
   1145  1.1      ross     zExp -= shiftCount;
   1146  1.1      ross     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
   1147  1.1      ross 
   1148  1.1      ross }
   1149  1.1      ross 
   1150  1.1      ross #endif
   1151  1.1      ross 
   1152  1.1      ross /*
   1153  1.1      ross -------------------------------------------------------------------------------
   1154  1.1      ross Returns the result of converting the 32-bit two's complement integer `a'
   1155  1.1      ross to the single-precision floating-point format.  The conversion is performed
   1156  1.1      ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1157  1.1      ross -------------------------------------------------------------------------------
   1158  1.1      ross */
   1159  1.1      ross float32 int32_to_float32( int32 a )
   1160  1.1      ross {
   1161  1.1      ross     flag zSign;
   1162  1.1      ross 
   1163  1.1      ross     if ( a == 0 ) return 0;
   1164  1.1      ross     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
   1165  1.1      ross     zSign = ( a < 0 );
   1166  1.1      ross     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
   1167  1.1      ross 
   1168  1.1      ross }
   1169  1.1      ross 
   1170  1.1      ross /*
   1171  1.1      ross -------------------------------------------------------------------------------
   1172  1.1      ross Returns the result of converting the 32-bit two's complement integer `a'
   1173  1.1      ross to the double-precision floating-point format.  The conversion is performed
   1174  1.1      ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1175  1.1      ross -------------------------------------------------------------------------------
   1176  1.1      ross */
   1177  1.1      ross float64 int32_to_float64( int32 a )
   1178  1.1      ross {
   1179  1.1      ross     flag zSign;
   1180  1.1      ross     uint32 absA;
   1181  1.1      ross     int8 shiftCount;
   1182  1.1      ross     bits64 zSig;
   1183  1.1      ross 
   1184  1.1      ross     if ( a == 0 ) return 0;
   1185  1.1      ross     zSign = ( a < 0 );
   1186  1.1      ross     absA = zSign ? - a : a;
   1187  1.1      ross     shiftCount = countLeadingZeros32( absA ) + 21;
   1188  1.1      ross     zSig = absA;
   1189  1.1      ross     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
   1190  1.1      ross 
   1191  1.1      ross }
   1192  1.1      ross 
   1193  1.1      ross #ifdef FLOATX80
   1194  1.1      ross 
   1195  1.1      ross /*
   1196  1.1      ross -------------------------------------------------------------------------------
   1197  1.1      ross Returns the result of converting the 32-bit two's complement integer `a'
   1198  1.1      ross to the extended double-precision floating-point format.  The conversion
   1199  1.1      ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
   1200  1.1      ross Arithmetic.
   1201  1.1      ross -------------------------------------------------------------------------------
   1202  1.1      ross */
   1203  1.1      ross floatx80 int32_to_floatx80( int32 a )
   1204  1.1      ross {
   1205  1.1      ross     flag zSign;
   1206  1.1      ross     uint32 absA;
   1207  1.1      ross     int8 shiftCount;
   1208  1.1      ross     bits64 zSig;
   1209  1.1      ross 
   1210  1.1      ross     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
   1211  1.1      ross     zSign = ( a < 0 );
   1212  1.1      ross     absA = zSign ? - a : a;
   1213  1.1      ross     shiftCount = countLeadingZeros32( absA ) + 32;
   1214  1.1      ross     zSig = absA;
   1215  1.1      ross     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
   1216  1.1      ross 
   1217  1.1      ross }
   1218  1.1      ross 
   1219  1.1      ross #endif
   1220  1.1      ross 
   1221  1.1      ross #ifdef FLOAT128
   1222  1.1      ross 
   1223  1.1      ross /*
   1224  1.1      ross -------------------------------------------------------------------------------
   1225  1.1      ross Returns the result of converting the 32-bit two's complement integer `a' to
   1226  1.1      ross the quadruple-precision floating-point format.  The conversion is performed
   1227  1.1      ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1228  1.1      ross -------------------------------------------------------------------------------
   1229  1.1      ross */
   1230  1.1      ross float128 int32_to_float128( int32 a )
   1231  1.1      ross {
   1232  1.1      ross     flag zSign;
   1233  1.1      ross     uint32 absA;
   1234  1.1      ross     int8 shiftCount;
   1235  1.1      ross     bits64 zSig0;
   1236  1.1      ross 
   1237  1.1      ross     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
   1238  1.1      ross     zSign = ( a < 0 );
   1239  1.1      ross     absA = zSign ? - a : a;
   1240  1.1      ross     shiftCount = countLeadingZeros32( absA ) + 17;
   1241  1.1      ross     zSig0 = absA;
   1242  1.1      ross     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
   1243  1.1      ross 
   1244  1.1      ross }
   1245  1.1      ross 
   1246  1.1      ross #endif
   1247  1.1      ross 
   1248  1.1      ross #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
   1249  1.1      ross /*
   1250  1.1      ross -------------------------------------------------------------------------------
   1251  1.1      ross Returns the result of converting the 64-bit two's complement integer `a'
   1252  1.1      ross to the single-precision floating-point format.  The conversion is performed
   1253  1.1      ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1254  1.1      ross -------------------------------------------------------------------------------
   1255  1.1      ross */
   1256  1.1      ross float32 int64_to_float32( int64 a )
   1257  1.1      ross {
   1258  1.1      ross     flag zSign;
   1259  1.1      ross     uint64 absA;
   1260  1.1      ross     int8 shiftCount;
   1261  1.1      ross 
   1262  1.1      ross     if ( a == 0 ) return 0;
   1263  1.1      ross     zSign = ( a < 0 );
   1264  1.1      ross     absA = zSign ? - a : a;
   1265  1.1      ross     shiftCount = countLeadingZeros64( absA ) - 40;
   1266  1.1      ross     if ( 0 <= shiftCount ) {
   1267  1.1      ross         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
   1268  1.1      ross     }
   1269  1.1      ross     else {
   1270  1.1      ross         shiftCount += 7;
   1271  1.1      ross         if ( shiftCount < 0 ) {
   1272  1.1      ross             shift64RightJamming( absA, - shiftCount, &absA );
   1273  1.1      ross         }
   1274  1.1      ross         else {
   1275  1.1      ross             absA <<= shiftCount;
   1276  1.1      ross         }
   1277  1.1      ross         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
   1278  1.1      ross     }
   1279  1.1      ross 
   1280  1.1      ross }
   1281  1.1      ross 
   1282  1.1      ross /*
   1283  1.1      ross -------------------------------------------------------------------------------
   1284  1.1      ross Returns the result of converting the 64-bit two's complement integer `a'
   1285  1.1      ross to the double-precision floating-point format.  The conversion is performed
   1286  1.1      ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1287  1.1      ross -------------------------------------------------------------------------------
   1288  1.1      ross */
   1289  1.1      ross float64 int64_to_float64( int64 a )
   1290  1.1      ross {
   1291  1.1      ross     flag zSign;
   1292  1.1      ross 
   1293  1.1      ross     if ( a == 0 ) return 0;
   1294  1.1      ross     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
   1295  1.1      ross         return packFloat64( 1, 0x43E, 0 );
   1296  1.1      ross     }
   1297  1.1      ross     zSign = ( a < 0 );
   1298  1.1      ross     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
   1299  1.1      ross 
   1300  1.1      ross }
   1301  1.1      ross 
   1302  1.1      ross #ifdef FLOATX80
   1303  1.1      ross 
   1304  1.1      ross /*
   1305  1.1      ross -------------------------------------------------------------------------------
   1306  1.1      ross Returns the result of converting the 64-bit two's complement integer `a'
   1307  1.1      ross to the extended double-precision floating-point format.  The conversion
   1308  1.1      ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
   1309  1.1      ross Arithmetic.
   1310  1.1      ross -------------------------------------------------------------------------------
   1311  1.1      ross */
   1312  1.1      ross floatx80 int64_to_floatx80( int64 a )
   1313  1.1      ross {
   1314  1.1      ross     flag zSign;
   1315  1.1      ross     uint64 absA;
   1316  1.1      ross     int8 shiftCount;
   1317  1.1      ross 
   1318  1.1      ross     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
   1319  1.1      ross     zSign = ( a < 0 );
   1320  1.1      ross     absA = zSign ? - a : a;
   1321  1.1      ross     shiftCount = countLeadingZeros64( absA );
   1322  1.1      ross     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
   1323  1.1      ross 
   1324  1.1      ross }
   1325  1.1      ross 
   1326  1.1      ross #endif
   1327  1.1      ross 
   1328  1.1      ross #ifdef FLOAT128
   1329  1.1      ross 
   1330  1.1      ross /*
   1331  1.1      ross -------------------------------------------------------------------------------
   1332  1.1      ross Returns the result of converting the 64-bit two's complement integer `a' to
   1333  1.1      ross the quadruple-precision floating-point format.  The conversion is performed
   1334  1.1      ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1335  1.1      ross -------------------------------------------------------------------------------
   1336  1.1      ross */
   1337  1.1      ross float128 int64_to_float128( int64 a )
   1338  1.1      ross {
   1339  1.1      ross     flag zSign;
   1340  1.1      ross     uint64 absA;
   1341  1.1      ross     int8 shiftCount;
   1342  1.1      ross     int32 zExp;
   1343  1.1      ross     bits64 zSig0, zSig1;
   1344  1.1      ross 
   1345  1.1      ross     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
   1346  1.1      ross     zSign = ( a < 0 );
   1347  1.1      ross     absA = zSign ? - a : a;
   1348  1.1      ross     shiftCount = countLeadingZeros64( absA ) + 49;
   1349  1.1      ross     zExp = 0x406E - shiftCount;
   1350  1.1      ross     if ( 64 <= shiftCount ) {
   1351  1.1      ross         zSig1 = 0;
   1352  1.1      ross         zSig0 = absA;
   1353  1.1      ross         shiftCount -= 64;
   1354  1.1      ross     }
   1355  1.1      ross     else {
   1356  1.1      ross         zSig1 = absA;
   1357  1.1      ross         zSig0 = 0;
   1358  1.1      ross     }
   1359  1.1      ross     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
   1360  1.1      ross     return packFloat128( zSign, zExp, zSig0, zSig1 );
   1361  1.1      ross 
   1362  1.1      ross }
   1363  1.1      ross 
   1364  1.1      ross #endif
   1365  1.1      ross #endif /* !SOFTFLOAT_FOR_GCC */
   1366  1.1      ross 
   1367  1.1      ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
   1368  1.1      ross /*
   1369  1.1      ross -------------------------------------------------------------------------------
   1370  1.1      ross Returns the result of converting the single-precision floating-point value
   1371  1.1      ross `a' to the 32-bit two's complement integer format.  The conversion is
   1372  1.1      ross performed according to the IEC/IEEE Standard for Binary Floating-Point
   1373  1.1      ross Arithmetic---which means in particular that the conversion is rounded
   1374  1.1      ross according to the current rounding mode.  If `a' is a NaN, the largest
   1375  1.1      ross positive integer is returned.  Otherwise, if the conversion overflows, the
   1376  1.1      ross largest integer with the same sign as `a' is returned.
   1377  1.1      ross -------------------------------------------------------------------------------
   1378  1.1      ross */
   1379  1.1      ross int32 float32_to_int32( float32 a )
   1380  1.1      ross {
   1381  1.1      ross     flag aSign;
   1382  1.1      ross     int16 aExp, shiftCount;
   1383  1.1      ross     bits32 aSig;
   1384  1.1      ross     bits64 aSig64;
   1385  1.1      ross 
   1386  1.1      ross     aSig = extractFloat32Frac( a );
   1387  1.1      ross     aExp = extractFloat32Exp( a );
   1388  1.1      ross     aSign = extractFloat32Sign( a );
   1389  1.1      ross     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
   1390  1.1      ross     if ( aExp ) aSig |= 0x00800000;
   1391  1.1      ross     shiftCount = 0xAF - aExp;
   1392  1.1      ross     aSig64 = aSig;
   1393  1.1      ross     aSig64 <<= 32;
   1394  1.1      ross     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
   1395  1.1      ross     return roundAndPackInt32( aSign, aSig64 );
   1396  1.1      ross 
   1397  1.1      ross }
   1398  1.1      ross #endif /* !SOFTFLOAT_FOR_GCC */
   1399  1.1      ross 
   1400  1.1      ross /*
   1401  1.1      ross -------------------------------------------------------------------------------
   1402  1.1      ross Returns the result of converting the single-precision floating-point value
   1403  1.1      ross `a' to the 32-bit two's complement integer format.  The conversion is
   1404  1.1      ross performed according to the IEC/IEEE Standard for Binary Floating-Point
   1405  1.1      ross Arithmetic, except that the conversion is always rounded toward zero.
   1406  1.1      ross If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   1407  1.1      ross the conversion overflows, the largest integer with the same sign as `a' is
   1408  1.1      ross returned.
   1409  1.1      ross -------------------------------------------------------------------------------
   1410  1.1      ross */
   1411  1.1      ross int32 float32_to_int32_round_to_zero( float32 a )
   1412  1.1      ross {
   1413  1.1      ross     flag aSign;
   1414  1.1      ross     int16 aExp, shiftCount;
   1415  1.1      ross     bits32 aSig;
   1416  1.1      ross     int32 z;
   1417  1.1      ross 
   1418  1.1      ross     aSig = extractFloat32Frac( a );
   1419  1.1      ross     aExp = extractFloat32Exp( a );
   1420  1.1      ross     aSign = extractFloat32Sign( a );
   1421  1.1      ross     shiftCount = aExp - 0x9E;
   1422  1.1      ross     if ( 0 <= shiftCount ) {
   1423  1.1      ross         if ( a != 0xCF000000 ) {
   1424  1.1      ross             float_raise( float_flag_invalid );
   1425  1.1      ross             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
   1426  1.1      ross         }
   1427  1.1      ross         return (sbits32) 0x80000000;
   1428  1.1      ross     }
   1429  1.1      ross     else if ( aExp <= 0x7E ) {
   1430  1.1      ross         if ( aExp | aSig ) float_set_inexact();
   1431  1.1      ross         return 0;
   1432  1.1      ross     }
   1433  1.1      ross     aSig = ( aSig | 0x00800000 )<<8;
   1434  1.1      ross     z = aSig>>( - shiftCount );
   1435  1.1      ross     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
   1436  1.1      ross         float_set_inexact();
   1437  1.1      ross     }
   1438  1.1      ross     if ( aSign ) z = - z;
   1439  1.1      ross     return z;
   1440  1.1      ross 
   1441  1.1      ross }
   1442  1.1      ross 
   1443  1.1      ross #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
   1444  1.1      ross /*
   1445  1.1      ross -------------------------------------------------------------------------------
   1446  1.1      ross Returns the result of converting the single-precision floating-point value
   1447  1.1      ross `a' to the 64-bit two's complement integer format.  The conversion is
   1448  1.1      ross performed according to the IEC/IEEE Standard for Binary Floating-Point
   1449  1.1      ross Arithmetic---which means in particular that the conversion is rounded
   1450  1.1      ross according to the current rounding mode.  If `a' is a NaN, the largest
   1451  1.1      ross positive integer is returned.  Otherwise, if the conversion overflows, the
   1452  1.1      ross largest integer with the same sign as `a' is returned.
   1453  1.1      ross -------------------------------------------------------------------------------
   1454  1.1      ross */
   1455  1.1      ross int64 float32_to_int64( float32 a )
   1456  1.1      ross {
   1457  1.1      ross     flag aSign;
   1458  1.1      ross     int16 aExp, shiftCount;
   1459  1.1      ross     bits32 aSig;
   1460  1.1      ross     bits64 aSig64, aSigExtra;
   1461  1.1      ross 
   1462  1.1      ross     aSig = extractFloat32Frac( a );
   1463  1.1      ross     aExp = extractFloat32Exp( a );
   1464  1.1      ross     aSign = extractFloat32Sign( a );
   1465  1.1      ross     shiftCount = 0xBE - aExp;
   1466  1.1      ross     if ( shiftCount < 0 ) {
   1467  1.1      ross         float_raise( float_flag_invalid );
   1468  1.1      ross         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
   1469  1.1      ross             return LIT64( 0x7FFFFFFFFFFFFFFF );
   1470  1.1      ross         }
   1471  1.1      ross         return (sbits64) LIT64( 0x8000000000000000 );
   1472  1.1      ross     }
   1473  1.1      ross     if ( aExp ) aSig |= 0x00800000;
   1474  1.1      ross     aSig64 = aSig;
   1475  1.1      ross     aSig64 <<= 40;
   1476  1.1      ross     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
   1477  1.1      ross     return roundAndPackInt64( aSign, aSig64, aSigExtra );
   1478  1.1      ross 
   1479  1.1      ross }
   1480  1.1      ross 
   1481  1.1      ross /*
   1482  1.1      ross -------------------------------------------------------------------------------
   1483  1.1      ross Returns the result of converting the single-precision floating-point value
   1484  1.1      ross `a' to the 64-bit two's complement integer format.  The conversion is
   1485  1.1      ross performed according to the IEC/IEEE Standard for Binary Floating-Point
   1486  1.1      ross Arithmetic, except that the conversion is always rounded toward zero.  If
   1487  1.1      ross `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
   1488  1.1      ross conversion overflows, the largest integer with the same sign as `a' is
   1489  1.1      ross returned.
   1490  1.1      ross -------------------------------------------------------------------------------
   1491  1.1      ross */
   1492  1.1      ross int64 float32_to_int64_round_to_zero( float32 a )
   1493  1.1      ross {
   1494  1.1      ross     flag aSign;
   1495  1.1      ross     int16 aExp, shiftCount;
   1496  1.1      ross     bits32 aSig;
   1497  1.1      ross     bits64 aSig64;
   1498  1.1      ross     int64 z;
   1499  1.1      ross 
   1500  1.1      ross     aSig = extractFloat32Frac( a );
   1501  1.1      ross     aExp = extractFloat32Exp( a );
   1502  1.1      ross     aSign = extractFloat32Sign( a );
   1503  1.1      ross     shiftCount = aExp - 0xBE;
   1504  1.1      ross     if ( 0 <= shiftCount ) {
   1505  1.1      ross         if ( a != 0xDF000000 ) {
   1506  1.1      ross             float_raise( float_flag_invalid );
   1507  1.1      ross             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
   1508  1.1      ross                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   1509  1.1      ross             }
   1510  1.1      ross         }
   1511  1.1      ross         return (sbits64) LIT64( 0x8000000000000000 );
   1512  1.1      ross     }
   1513  1.1      ross     else if ( aExp <= 0x7E ) {
   1514  1.1      ross         if ( aExp | aSig ) float_set_inexact();
   1515  1.1      ross         return 0;
   1516  1.1      ross     }
   1517  1.1      ross     aSig64 = aSig | 0x00800000;
   1518  1.1      ross     aSig64 <<= 40;
   1519  1.1      ross     z = aSig64>>( - shiftCount );
   1520  1.1      ross     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
   1521  1.1      ross         float_set_inexact();
   1522  1.1      ross     }
   1523  1.1      ross     if ( aSign ) z = - z;
   1524  1.1      ross     return z;
   1525  1.1      ross 
   1526  1.1      ross }
   1527  1.1      ross #endif /* !SOFTFLOAT_FOR_GCC */
   1528  1.1      ross 
   1529  1.1      ross /*
   1530  1.1      ross -------------------------------------------------------------------------------
   1531  1.1      ross Returns the result of converting the single-precision floating-point value
   1532  1.1      ross `a' to the double-precision floating-point format.  The conversion is
   1533  1.1      ross performed according to the IEC/IEEE Standard for Binary Floating-Point
   1534  1.1      ross Arithmetic.
   1535  1.1      ross -------------------------------------------------------------------------------
   1536  1.1      ross */
   1537  1.1      ross float64 float32_to_float64( float32 a )
   1538  1.1      ross {
   1539  1.1      ross     flag aSign;
   1540  1.1      ross     int16 aExp;
   1541  1.1      ross     bits32 aSig;
   1542  1.1      ross 
   1543  1.1      ross     aSig = extractFloat32Frac( a );
   1544  1.1      ross     aExp = extractFloat32Exp( a );
   1545  1.1      ross     aSign = extractFloat32Sign( a );
   1546  1.1      ross     if ( aExp == 0xFF ) {
   1547  1.1      ross         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
   1548  1.1      ross         return packFloat64( aSign, 0x7FF, 0 );
   1549  1.1      ross     }
   1550  1.1      ross     if ( aExp == 0 ) {
   1551  1.1      ross         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
   1552  1.1      ross         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1553  1.1      ross         --aExp;
   1554  1.1      ross     }
   1555  1.1      ross     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
   1556  1.1      ross 
   1557  1.1      ross }
   1558  1.1      ross 
   1559  1.1      ross #ifdef FLOATX80
   1560  1.1      ross 
   1561  1.1      ross /*
   1562  1.1      ross -------------------------------------------------------------------------------
   1563  1.1      ross Returns the result of converting the single-precision floating-point value
   1564  1.1      ross `a' to the extended double-precision floating-point format.  The conversion
   1565  1.1      ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
   1566  1.1      ross Arithmetic.
   1567  1.1      ross -------------------------------------------------------------------------------
   1568  1.1      ross */
   1569  1.1      ross floatx80 float32_to_floatx80( float32 a )
   1570  1.1      ross {
   1571  1.1      ross     flag aSign;
   1572  1.1      ross     int16 aExp;
   1573  1.1      ross     bits32 aSig;
   1574  1.1      ross 
   1575  1.1      ross     aSig = extractFloat32Frac( a );
   1576  1.1      ross     aExp = extractFloat32Exp( a );
   1577  1.1      ross     aSign = extractFloat32Sign( a );
   1578  1.1      ross     if ( aExp == 0xFF ) {
   1579  1.1      ross         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
   1580  1.1      ross         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   1581  1.1      ross     }
   1582  1.1      ross     if ( aExp == 0 ) {
   1583  1.1      ross         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
   1584  1.1      ross         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1585  1.1      ross     }
   1586  1.1      ross     aSig |= 0x00800000;
   1587  1.1      ross     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
   1588  1.1      ross 
   1589  1.1      ross }
   1590  1.1      ross 
   1591  1.1      ross #endif
   1592  1.1      ross 
   1593  1.1      ross #ifdef FLOAT128
   1594  1.1      ross 
   1595  1.1      ross /*
   1596  1.1      ross -------------------------------------------------------------------------------
   1597  1.1      ross Returns the result of converting the single-precision floating-point value
   1598  1.1      ross `a' to the double-precision floating-point format.  The conversion is
   1599  1.1      ross performed according to the IEC/IEEE Standard for Binary Floating-Point
   1600  1.1      ross Arithmetic.
   1601  1.1      ross -------------------------------------------------------------------------------
   1602  1.1      ross */
   1603  1.1      ross float128 float32_to_float128( float32 a )
   1604  1.1      ross {
   1605  1.1      ross     flag aSign;
   1606  1.1      ross     int16 aExp;
   1607  1.1      ross     bits32 aSig;
   1608  1.1      ross 
   1609  1.1      ross     aSig = extractFloat32Frac( a );
   1610  1.1      ross     aExp = extractFloat32Exp( a );
   1611  1.1      ross     aSign = extractFloat32Sign( a );
   1612  1.1      ross     if ( aExp == 0xFF ) {
   1613  1.1      ross         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
   1614  1.1      ross         return packFloat128( aSign, 0x7FFF, 0, 0 );
   1615  1.1      ross     }
   1616  1.1      ross     if ( aExp == 0 ) {
   1617  1.1      ross         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
   1618  1.1      ross         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1619  1.1      ross         --aExp;
   1620  1.1      ross     }
   1621  1.1      ross     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
   1622  1.1      ross 
   1623  1.1      ross }
   1624  1.1      ross 
   1625  1.1      ross #endif
   1626  1.1      ross 
   1627  1.1      ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
   1628  1.1      ross /*
   1629  1.1      ross -------------------------------------------------------------------------------
   1630  1.1      ross Rounds the single-precision floating-point value `a' to an integer, and
   1631  1.1      ross returns the result as a single-precision floating-point value.  The
   1632  1.1      ross operation is performed according to the IEC/IEEE Standard for Binary
   1633  1.1      ross Floating-Point Arithmetic.
   1634  1.1      ross -------------------------------------------------------------------------------
   1635  1.1      ross */
   1636  1.1      ross float32 float32_round_to_int( float32 a )
   1637  1.1      ross {
   1638  1.1      ross     flag aSign;
   1639  1.1      ross     int16 aExp;
   1640  1.1      ross     bits32 lastBitMask, roundBitsMask;
   1641  1.1      ross     int8 roundingMode;
   1642  1.1      ross     float32 z;
   1643  1.1      ross 
   1644  1.1      ross     aExp = extractFloat32Exp( a );
   1645  1.1      ross     if ( 0x96 <= aExp ) {
   1646  1.1      ross         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
   1647  1.1      ross             return propagateFloat32NaN( a, a );
   1648  1.1      ross         }
   1649  1.1      ross         return a;
   1650  1.1      ross     }
   1651  1.1      ross     if ( aExp <= 0x7E ) {
   1652  1.1      ross         if ( (bits32) ( a<<1 ) == 0 ) return a;
   1653  1.1      ross         float_set_inexact();
   1654  1.1      ross         aSign = extractFloat32Sign( a );
   1655  1.1      ross         switch ( float_rounding_mode() ) {
   1656  1.1      ross          case float_round_nearest_even:
   1657  1.1      ross             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
   1658  1.1      ross                 return packFloat32( aSign, 0x7F, 0 );
   1659  1.1      ross             }
   1660  1.1      ross             break;
   1661  1.1      ross          case float_round_down:
   1662  1.1      ross             return aSign ? 0xBF800000 : 0;
   1663  1.1      ross          case float_round_up:
   1664  1.1      ross             return aSign ? 0x80000000 : 0x3F800000;
   1665  1.1      ross         }
   1666  1.1      ross         return packFloat32( aSign, 0, 0 );
   1667  1.1      ross     }
   1668  1.1      ross     lastBitMask = 1;
   1669  1.1      ross     lastBitMask <<= 0x96 - aExp;
   1670  1.1      ross     roundBitsMask = lastBitMask - 1;
   1671  1.1      ross     z = a;
   1672  1.1      ross     roundingMode = float_rounding_mode();
   1673  1.1      ross     if ( roundingMode == float_round_nearest_even ) {
   1674  1.1      ross         z += lastBitMask>>1;
   1675  1.1      ross         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
   1676  1.1      ross     }
   1677  1.1      ross     else if ( roundingMode != float_round_to_zero ) {
   1678  1.1      ross         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
   1679  1.1      ross             z += roundBitsMask;
   1680  1.1      ross         }
   1681  1.1      ross     }
   1682  1.1      ross     z &= ~ roundBitsMask;
   1683  1.1      ross     if ( z != a ) float_set_inexact();
   1684  1.1      ross     return z;
   1685  1.1      ross 
   1686  1.1      ross }
   1687  1.1      ross #endif /* !SOFTFLOAT_FOR_GCC */
   1688  1.1      ross 
   1689  1.1      ross /*
   1690  1.1      ross -------------------------------------------------------------------------------
   1691  1.1      ross Returns the result of adding the absolute values of the single-precision
   1692  1.1      ross floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
   1693  1.1      ross before being returned.  `zSign' is ignored if the result is a NaN.
   1694  1.1      ross The addition is performed according to the IEC/IEEE Standard for Binary
   1695  1.1      ross Floating-Point Arithmetic.
   1696  1.1      ross -------------------------------------------------------------------------------
   1697  1.1      ross */
   1698  1.1      ross static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
   1699  1.1      ross {
   1700  1.1      ross     int16 aExp, bExp, zExp;
   1701  1.1      ross     bits32 aSig, bSig, zSig;
   1702  1.1      ross     int16 expDiff;
   1703  1.1      ross 
   1704  1.1      ross     aSig = extractFloat32Frac( a );
   1705  1.1      ross     aExp = extractFloat32Exp( a );
   1706  1.1      ross     bSig = extractFloat32Frac( b );
   1707  1.1      ross     bExp = extractFloat32Exp( b );
   1708  1.1      ross     expDiff = aExp - bExp;
   1709  1.1      ross     aSig <<= 6;
   1710  1.1      ross     bSig <<= 6;
   1711  1.1      ross     if ( 0 < expDiff ) {
   1712  1.1      ross         if ( aExp == 0xFF ) {
   1713  1.1      ross             if ( aSig ) return propagateFloat32NaN( a, b );
   1714  1.1      ross             return a;
   1715  1.1      ross         }
   1716  1.1      ross         if ( bExp == 0 ) {
   1717  1.1      ross             --expDiff;
   1718  1.1      ross         }
   1719  1.1      ross         else {
   1720  1.1      ross             bSig |= 0x20000000;
   1721  1.1      ross         }
   1722  1.1      ross         shift32RightJamming( bSig, expDiff, &bSig );
   1723  1.1      ross         zExp = aExp;
   1724  1.1      ross     }
   1725  1.1      ross     else if ( expDiff < 0 ) {
   1726  1.1      ross         if ( bExp == 0xFF ) {
   1727  1.1      ross             if ( bSig ) return propagateFloat32NaN( a, b );
   1728  1.1      ross             return packFloat32( zSign, 0xFF, 0 );
   1729  1.1      ross         }
   1730  1.1      ross         if ( aExp == 0 ) {
   1731  1.1      ross             ++expDiff;
   1732  1.1      ross         }
   1733  1.1      ross         else {
   1734  1.1      ross             aSig |= 0x20000000;
   1735  1.1      ross         }
   1736  1.1      ross         shift32RightJamming( aSig, - expDiff, &aSig );
   1737  1.1      ross         zExp = bExp;
   1738  1.1      ross     }
   1739  1.1      ross     else {
   1740  1.1      ross         if ( aExp == 0xFF ) {
   1741  1.1      ross             if ( aSig | bSig ) return propagateFloat32NaN( a, b );
   1742  1.1      ross             return a;
   1743  1.1      ross         }
   1744  1.1      ross         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
   1745  1.1      ross         zSig = 0x40000000 + aSig + bSig;
   1746  1.1      ross         zExp = aExp;
   1747  1.1      ross         goto roundAndPack;
   1748  1.1      ross     }
   1749  1.1      ross     aSig |= 0x20000000;
   1750  1.1      ross     zSig = ( aSig + bSig )<<1;
   1751  1.1      ross     --zExp;
   1752  1.1      ross     if ( (sbits32) zSig < 0 ) {
   1753  1.1      ross         zSig = aSig + bSig;
   1754  1.1      ross         ++zExp;
   1755  1.1      ross     }
   1756  1.1      ross  roundAndPack:
   1757  1.1      ross     return roundAndPackFloat32( zSign, zExp, zSig );
   1758  1.1      ross 
   1759  1.1      ross }
   1760  1.1      ross 
   1761  1.1      ross /*
   1762  1.1      ross -------------------------------------------------------------------------------
   1763  1.1      ross Returns the result of subtracting the absolute values of the single-
   1764  1.1      ross precision floating-point values `a' and `b'.  If `zSign' is 1, the
   1765  1.1      ross difference is negated before being returned.  `zSign' is ignored if the
   1766  1.1      ross result is a NaN.  The subtraction is performed according to the IEC/IEEE
   1767  1.1      ross Standard for Binary Floating-Point Arithmetic.
   1768  1.1      ross -------------------------------------------------------------------------------
   1769  1.1      ross */
   1770  1.1      ross static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
   1771  1.1      ross {
   1772  1.1      ross     int16 aExp, bExp, zExp;
   1773  1.1      ross     bits32 aSig, bSig, zSig;
   1774  1.1      ross     int16 expDiff;
   1775  1.1      ross 
   1776  1.1      ross     aSig = extractFloat32Frac( a );
   1777  1.1      ross     aExp = extractFloat32Exp( a );
   1778  1.1      ross     bSig = extractFloat32Frac( b );
   1779  1.1      ross     bExp = extractFloat32Exp( b );
   1780  1.1      ross     expDiff = aExp - bExp;
   1781  1.1      ross     aSig <<= 7;
   1782  1.1      ross     bSig <<= 7;
   1783  1.1      ross     if ( 0 < expDiff ) goto aExpBigger;
   1784  1.1      ross     if ( expDiff < 0 ) goto bExpBigger;
   1785  1.1      ross     if ( aExp == 0xFF ) {
   1786  1.1      ross         if ( aSig | bSig ) return propagateFloat32NaN( a, b );
   1787  1.1      ross         float_raise( float_flag_invalid );
   1788  1.1      ross         return float32_default_nan;
   1789  1.1      ross     }
   1790  1.1      ross     if ( aExp == 0 ) {
   1791  1.1      ross         aExp = 1;
   1792  1.1      ross         bExp = 1;
   1793  1.1      ross     }
   1794  1.1      ross     if ( bSig < aSig ) goto aBigger;
   1795  1.1      ross     if ( aSig < bSig ) goto bBigger;
   1796  1.1      ross     return packFloat32( float_rounding_mode() == float_round_down, 0, 0 );
   1797  1.1      ross  bExpBigger:
   1798  1.1      ross     if ( bExp == 0xFF ) {
   1799  1.1      ross         if ( bSig ) return propagateFloat32NaN( a, b );
   1800  1.1      ross         return packFloat32( zSign ^ 1, 0xFF, 0 );
   1801  1.1      ross     }
   1802  1.1      ross     if ( aExp == 0 ) {
   1803  1.1      ross         ++expDiff;
   1804  1.1      ross     }
   1805  1.1      ross     else {
   1806  1.1      ross         aSig |= 0x40000000;
   1807  1.1      ross     }
   1808  1.1      ross     shift32RightJamming( aSig, - expDiff, &aSig );
   1809  1.1      ross     bSig |= 0x40000000;
   1810  1.1      ross  bBigger:
   1811  1.1      ross     zSig = bSig - aSig;
   1812  1.1      ross     zExp = bExp;
   1813  1.1      ross     zSign ^= 1;
   1814  1.1      ross     goto normalizeRoundAndPack;
   1815  1.1      ross  aExpBigger:
   1816  1.1      ross     if ( aExp == 0xFF ) {
   1817  1.1      ross         if ( aSig ) return propagateFloat32NaN( a, b );
   1818  1.1      ross         return a;
   1819  1.1      ross     }
   1820  1.1      ross     if ( bExp == 0 ) {
   1821  1.1      ross         --expDiff;
   1822  1.1      ross     }
   1823  1.1      ross     else {
   1824  1.1      ross         bSig |= 0x40000000;
   1825  1.1      ross     }
   1826  1.1      ross     shift32RightJamming( bSig, expDiff, &bSig );
   1827  1.1      ross     aSig |= 0x40000000;
   1828  1.1      ross  aBigger:
   1829  1.1      ross     zSig = aSig - bSig;
   1830  1.1      ross     zExp = aExp;
   1831  1.1      ross  normalizeRoundAndPack:
   1832  1.1      ross     --zExp;
   1833  1.1      ross     return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
   1834  1.1      ross 
   1835  1.1      ross }
   1836  1.1      ross 
   1837  1.1      ross /*
   1838  1.1      ross -------------------------------------------------------------------------------
   1839  1.1      ross Returns the result of adding the single-precision floating-point values `a'
   1840  1.1      ross and `b'.  The operation is performed according to the IEC/IEEE Standard for
   1841  1.1      ross Binary Floating-Point Arithmetic.
   1842  1.1      ross -------------------------------------------------------------------------------
   1843  1.1      ross */
   1844  1.1      ross float32 float32_add( float32 a, float32 b )
   1845  1.1      ross {
   1846  1.1      ross     flag aSign, bSign;
   1847  1.1      ross 
   1848  1.1      ross     aSign = extractFloat32Sign( a );
   1849  1.1      ross     bSign = extractFloat32Sign( b );
   1850  1.1      ross     if ( aSign == bSign ) {
   1851  1.1      ross         return addFloat32Sigs( a, b, aSign );
   1852  1.1      ross     }
   1853  1.1      ross     else {
   1854  1.1      ross         return subFloat32Sigs( a, b, aSign );
   1855  1.1      ross     }
   1856  1.1      ross 
   1857  1.1      ross }
   1858  1.1      ross 
   1859  1.1      ross /*
   1860  1.1      ross -------------------------------------------------------------------------------
   1861  1.1      ross Returns the result of subtracting the single-precision floating-point values
   1862  1.1      ross `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   1863  1.1      ross for Binary Floating-Point Arithmetic.
   1864  1.1      ross -------------------------------------------------------------------------------
   1865  1.1      ross */
   1866  1.1      ross float32 float32_sub( float32 a, float32 b )
   1867  1.1      ross {
   1868  1.1      ross     flag aSign, bSign;
   1869  1.1      ross 
   1870  1.1      ross     aSign = extractFloat32Sign( a );
   1871  1.1      ross     bSign = extractFloat32Sign( b );
   1872  1.1      ross     if ( aSign == bSign ) {
   1873  1.1      ross         return subFloat32Sigs( a, b, aSign );
   1874  1.1      ross     }
   1875  1.1      ross     else {
   1876  1.1      ross         return addFloat32Sigs( a, b, aSign );
   1877  1.1      ross     }
   1878  1.1      ross 
   1879  1.1      ross }
   1880  1.1      ross 
   1881  1.1      ross /*
   1882  1.1      ross -------------------------------------------------------------------------------
   1883  1.1      ross Returns the result of multiplying the single-precision floating-point values
   1884  1.1      ross `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   1885  1.1      ross for Binary Floating-Point Arithmetic.
   1886  1.1      ross -------------------------------------------------------------------------------
   1887  1.1      ross */
   1888  1.1      ross float32 float32_mul( float32 a, float32 b )
   1889  1.1      ross {
   1890  1.1      ross     flag aSign, bSign, zSign;
   1891  1.1      ross     int16 aExp, bExp, zExp;
   1892  1.1      ross     bits32 aSig, bSig;
   1893  1.1      ross     bits64 zSig64;
   1894  1.1      ross     bits32 zSig;
   1895  1.1      ross 
   1896  1.1      ross     aSig = extractFloat32Frac( a );
   1897  1.1      ross     aExp = extractFloat32Exp( a );
   1898  1.1      ross     aSign = extractFloat32Sign( a );
   1899  1.1      ross     bSig = extractFloat32Frac( b );
   1900  1.1      ross     bExp = extractFloat32Exp( b );
   1901  1.1      ross     bSign = extractFloat32Sign( b );
   1902  1.1      ross     zSign = aSign ^ bSign;
   1903  1.1      ross     if ( aExp == 0xFF ) {
   1904  1.1      ross         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
   1905  1.1      ross             return propagateFloat32NaN( a, b );
   1906  1.1      ross         }
   1907  1.1      ross         if ( ( bExp | bSig ) == 0 ) {
   1908  1.1      ross             float_raise( float_flag_invalid );
   1909  1.1      ross             return float32_default_nan;
   1910  1.1      ross         }
   1911  1.1      ross         return packFloat32( zSign, 0xFF, 0 );
   1912  1.1      ross     }
   1913  1.1      ross     if ( bExp == 0xFF ) {
   1914  1.1      ross         if ( bSig ) return propagateFloat32NaN( a, b );
   1915  1.1      ross         if ( ( aExp | aSig ) == 0 ) {
   1916  1.1      ross             float_raise( float_flag_invalid );
   1917  1.1      ross             return float32_default_nan;
   1918  1.1      ross         }
   1919  1.1      ross         return packFloat32( zSign, 0xFF, 0 );
   1920  1.1      ross     }
   1921  1.1      ross     if ( aExp == 0 ) {
   1922  1.1      ross         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
   1923  1.1      ross         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1924  1.1      ross     }
   1925  1.1      ross     if ( bExp == 0 ) {
   1926  1.1      ross         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
   1927  1.1      ross         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
   1928  1.1      ross     }
   1929  1.1      ross     zExp = aExp + bExp - 0x7F;
   1930  1.1      ross     aSig = ( aSig | 0x00800000 )<<7;
   1931  1.1      ross     bSig = ( bSig | 0x00800000 )<<8;
   1932  1.1      ross     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
   1933  1.1      ross     zSig = zSig64;
   1934  1.1      ross     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
   1935  1.1      ross         zSig <<= 1;
   1936  1.1      ross         --zExp;
   1937  1.1      ross     }
   1938  1.1      ross     return roundAndPackFloat32( zSign, zExp, zSig );
   1939  1.1      ross 
   1940  1.1      ross }
   1941  1.1      ross 
   1942  1.1      ross /*
   1943  1.1      ross -------------------------------------------------------------------------------
   1944  1.1      ross Returns the result of dividing the single-precision floating-point value `a'
   1945  1.1      ross by the corresponding value `b'.  The operation is performed according to the
   1946  1.1      ross IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1947  1.1      ross -------------------------------------------------------------------------------
   1948  1.1      ross */
   1949  1.1      ross float32 float32_div( float32 a, float32 b )
   1950  1.1      ross {
   1951  1.1      ross     flag aSign, bSign, zSign;
   1952  1.1      ross     int16 aExp, bExp, zExp;
   1953  1.1      ross     bits32 aSig, bSig, zSig;
   1954  1.1      ross 
   1955  1.1      ross     aSig = extractFloat32Frac( a );
   1956  1.1      ross     aExp = extractFloat32Exp( a );
   1957  1.1      ross     aSign = extractFloat32Sign( a );
   1958  1.1      ross     bSig = extractFloat32Frac( b );
   1959  1.1      ross     bExp = extractFloat32Exp( b );
   1960  1.1      ross     bSign = extractFloat32Sign( b );
   1961  1.1      ross     zSign = aSign ^ bSign;
   1962  1.1      ross     if ( aExp == 0xFF ) {
   1963  1.1      ross         if ( aSig ) return propagateFloat32NaN( a, b );
   1964  1.1      ross         if ( bExp == 0xFF ) {
   1965  1.1      ross             if ( bSig ) return propagateFloat32NaN( a, b );
   1966  1.1      ross             float_raise( float_flag_invalid );
   1967  1.1      ross             return float32_default_nan;
   1968  1.1      ross         }
   1969  1.1      ross         return packFloat32( zSign, 0xFF, 0 );
   1970  1.1      ross     }
   1971  1.1      ross     if ( bExp == 0xFF ) {
   1972  1.1      ross         if ( bSig ) return propagateFloat32NaN( a, b );
   1973  1.1      ross         return packFloat32( zSign, 0, 0 );
   1974  1.1      ross     }
   1975  1.1      ross     if ( bExp == 0 ) {
   1976  1.1      ross         if ( bSig == 0 ) {
   1977  1.1      ross             if ( ( aExp | aSig ) == 0 ) {
   1978  1.1      ross                 float_raise( float_flag_invalid );
   1979  1.1      ross                 return float32_default_nan;
   1980  1.1      ross             }
   1981  1.1      ross             float_raise( float_flag_divbyzero );
   1982  1.1      ross             return packFloat32( zSign, 0xFF, 0 );
   1983  1.1      ross         }
   1984  1.1      ross         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
   1985  1.1      ross     }
   1986  1.1      ross     if ( aExp == 0 ) {
   1987  1.1      ross         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
   1988  1.1      ross         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1989  1.1      ross     }
   1990  1.1      ross     zExp = aExp - bExp + 0x7D;
   1991  1.1      ross     aSig = ( aSig | 0x00800000 )<<7;
   1992  1.1      ross     bSig = ( bSig | 0x00800000 )<<8;
   1993  1.1      ross     if ( bSig <= ( aSig + aSig ) ) {
   1994  1.1      ross         aSig >>= 1;
   1995  1.1      ross         ++zExp;
   1996  1.1      ross     }
   1997  1.1      ross     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
   1998  1.1      ross     if ( ( zSig & 0x3F ) == 0 ) {
   1999  1.1      ross         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
   2000  1.1      ross     }
   2001  1.1      ross     return roundAndPackFloat32( zSign, zExp, zSig );
   2002  1.1      ross 
   2003  1.1      ross }
   2004  1.1      ross 
   2005  1.1      ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
   2006  1.1      ross /*
   2007  1.1      ross -------------------------------------------------------------------------------
   2008  1.1      ross Returns the remainder of the single-precision floating-point value `a'
   2009  1.1      ross with respect to the corresponding value `b'.  The operation is performed
   2010  1.1      ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2011  1.1      ross -------------------------------------------------------------------------------
   2012  1.1      ross */
   2013  1.1      ross float32 float32_rem( float32 a, float32 b )
   2014  1.1      ross {
   2015  1.5  christos     flag aSign, bSign __unused, zSign;
   2016  1.1      ross     int16 aExp, bExp, expDiff;
   2017  1.1      ross     bits32 aSig, bSig;
   2018  1.1      ross     bits32 q;
   2019  1.1      ross     bits64 aSig64, bSig64, q64;
   2020  1.1      ross     bits32 alternateASig;
   2021  1.1      ross     sbits32 sigMean;
   2022  1.1      ross 
   2023  1.1      ross     aSig = extractFloat32Frac( a );
   2024  1.1      ross     aExp = extractFloat32Exp( a );
   2025  1.1      ross     aSign = extractFloat32Sign( a );
   2026  1.1      ross     bSig = extractFloat32Frac( b );
   2027  1.1      ross     bExp = extractFloat32Exp( b );
   2028  1.1      ross     bSign = extractFloat32Sign( b );
   2029  1.1      ross     if ( aExp == 0xFF ) {
   2030  1.1      ross         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
   2031  1.1      ross             return propagateFloat32NaN( a, b );
   2032  1.1      ross         }
   2033  1.1      ross         float_raise( float_flag_invalid );
   2034  1.1      ross         return float32_default_nan;
   2035  1.1      ross     }
   2036  1.1      ross     if ( bExp == 0xFF ) {
   2037  1.1      ross         if ( bSig ) return propagateFloat32NaN( a, b );
   2038  1.1      ross         return a;
   2039  1.1      ross     }
   2040  1.1      ross     if ( bExp == 0 ) {
   2041  1.1      ross         if ( bSig == 0 ) {
   2042  1.1      ross             float_raise( float_flag_invalid );
   2043  1.1      ross             return float32_default_nan;
   2044  1.1      ross         }
   2045  1.1      ross         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
   2046  1.1      ross     }
   2047  1.1      ross     if ( aExp == 0 ) {
   2048  1.1      ross         if ( aSig == 0 ) return a;
   2049  1.1      ross         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   2050  1.1      ross     }
   2051  1.1      ross     expDiff = aExp - bExp;
   2052  1.1      ross     aSig |= 0x00800000;
   2053  1.1      ross     bSig |= 0x00800000;
   2054  1.1      ross     if ( expDiff < 32 ) {
   2055  1.1      ross         aSig <<= 8;
   2056  1.1      ross         bSig <<= 8;
   2057  1.1      ross         if ( expDiff < 0 ) {
   2058  1.1      ross             if ( expDiff < -1 ) return a;
   2059  1.1      ross             aSig >>= 1;
   2060  1.1      ross         }
   2061  1.1      ross         q = ( bSig <= aSig );
   2062  1.1      ross         if ( q ) aSig -= bSig;
   2063  1.1      ross         if ( 0 < expDiff ) {
   2064  1.1      ross             q = ( ( (bits64) aSig )<<32 ) / bSig;
   2065  1.1      ross             q >>= 32 - expDiff;
   2066  1.1      ross             bSig >>= 2;
   2067  1.1      ross             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
   2068  1.1      ross         }
   2069  1.1      ross         else {
   2070  1.1      ross             aSig >>= 2;
   2071  1.1      ross             bSig >>= 2;
   2072  1.1      ross         }
   2073  1.1      ross     }
   2074  1.1      ross     else {
   2075  1.1      ross         if ( bSig <= aSig ) aSig -= bSig;
   2076  1.1      ross         aSig64 = ( (bits64) aSig )<<40;
   2077  1.1      ross         bSig64 = ( (bits64) bSig )<<40;
   2078  1.1      ross         expDiff -= 64;
   2079  1.1      ross         while ( 0 < expDiff ) {
   2080  1.1      ross             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
   2081  1.1      ross             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
   2082  1.1      ross             aSig64 = - ( ( bSig * q64 )<<38 );
   2083  1.1      ross             expDiff -= 62;
   2084  1.1      ross         }
   2085  1.1      ross         expDiff += 64;
   2086  1.1      ross         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
   2087  1.1      ross         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
   2088  1.1      ross         q = q64>>( 64 - expDiff );
   2089  1.1      ross         bSig <<= 6;
   2090  1.1      ross         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
   2091  1.1      ross     }
   2092  1.1      ross     do {
   2093  1.1      ross         alternateASig = aSig;
   2094  1.1      ross         ++q;
   2095  1.1      ross         aSig -= bSig;
   2096  1.1      ross     } while ( 0 <= (sbits32) aSig );
   2097  1.1      ross     sigMean = aSig + alternateASig;
   2098  1.1      ross     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
   2099  1.1      ross         aSig = alternateASig;
   2100  1.1      ross     }
   2101  1.1      ross     zSign = ( (sbits32) aSig < 0 );
   2102  1.1      ross     if ( zSign ) aSig = - aSig;
   2103  1.1      ross     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
   2104  1.1      ross 
   2105  1.1      ross }
   2106  1.1      ross #endif /* !SOFTFLOAT_FOR_GCC */
   2107  1.1      ross 
   2108  1.1      ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
   2109  1.1      ross /*
   2110  1.1      ross -------------------------------------------------------------------------------
   2111  1.1      ross Returns the square root of the single-precision floating-point value `a'.
   2112  1.1      ross The operation is performed according to the IEC/IEEE Standard for Binary
   2113  1.1      ross Floating-Point Arithmetic.
   2114  1.1      ross -------------------------------------------------------------------------------
   2115  1.1      ross */
   2116  1.1      ross float32 float32_sqrt( float32 a )
   2117  1.1      ross {
   2118  1.1      ross     flag aSign;
   2119  1.1      ross     int16 aExp, zExp;
   2120  1.1      ross     bits32 aSig, zSig;
   2121  1.1      ross     bits64 rem, term;
   2122  1.1      ross 
   2123  1.1      ross     aSig = extractFloat32Frac( a );
   2124  1.1      ross     aExp = extractFloat32Exp( a );
   2125  1.1      ross     aSign = extractFloat32Sign( a );
   2126  1.1      ross     if ( aExp == 0xFF ) {
   2127  1.1      ross         if ( aSig ) return propagateFloat32NaN( a, 0 );
   2128  1.1      ross         if ( ! aSign ) return a;
   2129  1.1      ross         float_raise( float_flag_invalid );
   2130  1.1      ross         return float32_default_nan;
   2131  1.1      ross     }
   2132  1.1      ross     if ( aSign ) {
   2133  1.1      ross         if ( ( aExp | aSig ) == 0 ) return a;
   2134  1.1      ross         float_raise( float_flag_invalid );
   2135  1.1      ross         return float32_default_nan;
   2136  1.1      ross     }
   2137  1.1      ross     if ( aExp == 0 ) {
   2138  1.1      ross         if ( aSig == 0 ) return 0;
   2139  1.1      ross         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   2140  1.1      ross     }
   2141  1.1      ross     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
   2142  1.1      ross     aSig = ( aSig | 0x00800000 )<<8;
   2143  1.1      ross     zSig = estimateSqrt32( aExp, aSig ) + 2;
   2144  1.1      ross     if ( ( zSig & 0x7F ) <= 5 ) {
   2145  1.1      ross         if ( zSig < 2 ) {
   2146  1.1      ross             zSig = 0x7FFFFFFF;
   2147  1.1      ross             goto roundAndPack;
   2148  1.1      ross         }
   2149  1.1      ross         aSig >>= aExp & 1;
   2150  1.1      ross         term = ( (bits64) zSig ) * zSig;
   2151  1.1      ross         rem = ( ( (bits64) aSig )<<32 ) - term;
   2152  1.1      ross         while ( (sbits64) rem < 0 ) {
   2153  1.1      ross             --zSig;
   2154  1.1      ross             rem += ( ( (bits64) zSig )<<1 ) | 1;
   2155  1.1      ross         }
   2156  1.1      ross         zSig |= ( rem != 0 );
   2157  1.1      ross     }
   2158  1.1      ross     shift32RightJamming( zSig, 1, &zSig );
   2159  1.1      ross  roundAndPack:
   2160  1.1      ross     return roundAndPackFloat32( 0, zExp, zSig );
   2161  1.1      ross 
   2162  1.1      ross }
   2163  1.1      ross #endif /* !SOFTFLOAT_FOR_GCC */
   2164  1.1      ross 
   2165  1.1      ross /*
   2166  1.1      ross -------------------------------------------------------------------------------
   2167  1.1      ross Returns 1 if the single-precision floating-point value `a' is equal to
   2168  1.1      ross the corresponding value `b', and 0 otherwise.  The comparison is performed
   2169  1.1      ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2170  1.1      ross -------------------------------------------------------------------------------
   2171  1.1      ross */
   2172  1.1      ross flag float32_eq( float32 a, float32 b )
   2173  1.1      ross {
   2174  1.1      ross 
   2175  1.1      ross     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2176  1.1      ross          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2177  1.1      ross        ) {
   2178  1.1      ross         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
   2179  1.1      ross             float_raise( float_flag_invalid );
   2180  1.1      ross         }
   2181  1.1      ross         return 0;
   2182  1.1      ross     }
   2183  1.1      ross     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
   2184  1.1      ross 
   2185  1.1      ross }
   2186  1.1      ross 
   2187  1.1      ross /*
   2188  1.1      ross -------------------------------------------------------------------------------
   2189  1.1      ross Returns 1 if the single-precision floating-point value `a' is less than
   2190  1.1      ross or equal to the corresponding value `b', and 0 otherwise.  The comparison
   2191  1.1      ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
   2192  1.1      ross Arithmetic.
   2193  1.1      ross -------------------------------------------------------------------------------
   2194  1.1      ross */
   2195  1.1      ross flag float32_le( float32 a, float32 b )
   2196  1.1      ross {
   2197  1.1      ross     flag aSign, bSign;
   2198  1.1      ross 
   2199  1.1      ross     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2200  1.1      ross          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2201  1.1      ross        ) {
   2202  1.1      ross         float_raise( float_flag_invalid );
   2203  1.1      ross         return 0;
   2204  1.1      ross     }
   2205  1.1      ross     aSign = extractFloat32Sign( a );
   2206  1.1      ross     bSign = extractFloat32Sign( b );
   2207  1.1      ross     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
   2208  1.1      ross     return ( a == b ) || ( aSign ^ ( a < b ) );
   2209  1.1      ross 
   2210  1.1      ross }
   2211  1.1      ross 
   2212  1.1      ross /*
   2213  1.1      ross -------------------------------------------------------------------------------
   2214  1.1      ross Returns 1 if the single-precision floating-point value `a' is less than
   2215  1.1      ross the corresponding value `b', and 0 otherwise.  The comparison is performed
   2216  1.1      ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2217  1.1      ross -------------------------------------------------------------------------------
   2218  1.1      ross */
   2219  1.1      ross flag float32_lt( float32 a, float32 b )
   2220  1.1      ross {
   2221  1.1      ross     flag aSign, bSign;
   2222  1.1      ross 
   2223  1.1      ross     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2224  1.1      ross          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2225  1.1      ross        ) {
   2226  1.1      ross         float_raise( float_flag_invalid );
   2227  1.1      ross         return 0;
   2228  1.1      ross     }
   2229  1.1      ross     aSign = extractFloat32Sign( a );
   2230  1.1      ross     bSign = extractFloat32Sign( b );
   2231  1.1      ross     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
   2232  1.1      ross     return ( a != b ) && ( aSign ^ ( a < b ) );
   2233  1.1      ross 
   2234  1.1      ross }
   2235  1.1      ross 
   2236  1.1      ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
   2237  1.1      ross /*
   2238  1.1      ross -------------------------------------------------------------------------------
   2239  1.1      ross Returns 1 if the single-precision floating-point value `a' is equal to
   2240  1.1      ross the corresponding value `b', and 0 otherwise.  The invalid exception is
   2241  1.1      ross raised if either operand is a NaN.  Otherwise, the comparison is performed
   2242  1.1      ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2243  1.1      ross -------------------------------------------------------------------------------
   2244  1.1      ross */
   2245  1.1      ross flag float32_eq_signaling( float32 a, float32 b )
   2246  1.1      ross {
   2247  1.1      ross 
   2248  1.1      ross     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2249  1.1      ross          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2250  1.1      ross        ) {
   2251  1.1      ross         float_raise( float_flag_invalid );
   2252  1.1      ross         return 0;
   2253  1.1      ross     }
   2254  1.1      ross     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
   2255  1.1      ross 
   2256  1.1      ross }
   2257  1.1      ross 
   2258  1.1      ross /*
   2259  1.1      ross -------------------------------------------------------------------------------
   2260  1.1      ross Returns 1 if the single-precision floating-point value `a' is less than or
   2261  1.1      ross equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
   2262  1.1      ross cause an exception.  Otherwise, the comparison is performed according to the
   2263  1.1      ross IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2264  1.1      ross -------------------------------------------------------------------------------
   2265  1.1      ross */
   2266  1.1      ross flag float32_le_quiet( float32 a, float32 b )
   2267  1.1      ross {
   2268  1.1      ross     flag aSign, bSign;
   2269  1.1      ross 
   2270  1.1      ross     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2271  1.1      ross          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2272  1.1      ross        ) {
   2273  1.1      ross         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
   2274  1.1      ross             float_raise( float_flag_invalid );
   2275  1.1      ross         }
   2276  1.1      ross         return 0;
   2277  1.1      ross     }
   2278  1.1      ross     aSign = extractFloat32Sign( a );
   2279  1.1      ross     bSign = extractFloat32Sign( b );
   2280  1.1      ross     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
   2281  1.1      ross     return ( a == b ) || ( aSign ^ ( a < b ) );
   2282  1.1      ross 
   2283  1.1      ross }
   2284  1.1      ross 
   2285  1.1      ross /*
   2286  1.1      ross -------------------------------------------------------------------------------
   2287  1.1      ross Returns 1 if the single-precision floating-point value `a' is less than
   2288  1.1      ross the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
   2289  1.1      ross exception.  Otherwise, the comparison is performed according to the IEC/IEEE
   2290  1.1      ross Standard for Binary Floating-Point Arithmetic.
   2291  1.1      ross -------------------------------------------------------------------------------
   2292  1.1      ross */
   2293  1.1      ross flag float32_lt_quiet( float32 a, float32 b )
   2294  1.1      ross {
   2295  1.1      ross     flag aSign, bSign;
   2296  1.1      ross 
   2297  1.1      ross     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2298  1.1      ross          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2299  1.1      ross        ) {
   2300  1.1      ross         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
   2301  1.1      ross             float_raise( float_flag_invalid );
   2302  1.1      ross         }
   2303  1.1      ross         return 0;
   2304  1.1      ross     }
   2305  1.1      ross     aSign = extractFloat32Sign( a );
   2306  1.1      ross     bSign = extractFloat32Sign( b );
   2307  1.1      ross     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
   2308  1.1      ross     return ( a != b ) && ( aSign ^ ( a < b ) );
   2309  1.1      ross 
   2310  1.1      ross }
   2311  1.1      ross #endif /* !SOFTFLOAT_FOR_GCC */
   2312  1.1      ross 
   2313  1.1      ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
   2314  1.1      ross /*
   2315  1.1      ross -------------------------------------------------------------------------------
   2316  1.1      ross Returns the result of converting the double-precision floating-point value
   2317  1.1      ross `a' to the 32-bit two's complement integer format.  The conversion is
   2318  1.1      ross performed according to the IEC/IEEE Standard for Binary Floating-Point
   2319  1.1      ross Arithmetic---which means in particular that the conversion is rounded
   2320  1.1      ross according to the current rounding mode.  If `a' is a NaN, the largest
   2321  1.1      ross positive integer is returned.  Otherwise, if the conversion overflows, the
   2322  1.1      ross largest integer with the same sign as `a' is returned.
   2323  1.1      ross -------------------------------------------------------------------------------
   2324  1.1      ross */
   2325  1.1      ross int32 float64_to_int32( float64 a )
   2326  1.1      ross {
   2327  1.1      ross     flag aSign;
   2328  1.1      ross     int16 aExp, shiftCount;
   2329  1.1      ross     bits64 aSig;
   2330  1.1      ross 
   2331  1.1      ross     aSig = extractFloat64Frac( a );
   2332  1.1      ross     aExp = extractFloat64Exp( a );
   2333  1.1      ross     aSign = extractFloat64Sign( a );
   2334  1.1      ross     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
   2335  1.1      ross     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
   2336  1.1      ross     shiftCount = 0x42C - aExp;
   2337  1.1      ross     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
   2338  1.1      ross     return roundAndPackInt32( aSign, aSig );
   2339  1.1      ross 
   2340  1.1      ross }
   2341  1.1      ross #endif /* !SOFTFLOAT_FOR_GCC */
   2342  1.1      ross 
   2343  1.1      ross /*
   2344  1.1      ross -------------------------------------------------------------------------------
   2345  1.1      ross Returns the result of converting the double-precision floating-point value
   2346  1.1      ross `a' to the 32-bit two's complement integer format.  The conversion is
   2347  1.1      ross performed according to the IEC/IEEE Standard for Binary Floating-Point
   2348  1.1      ross Arithmetic, except that the conversion is always rounded toward zero.
   2349  1.1      ross If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   2350  1.1      ross the conversion overflows, the largest integer with the same sign as `a' is
   2351  1.1      ross returned.
   2352  1.1      ross -------------------------------------------------------------------------------
   2353  1.1      ross */
   2354  1.1      ross int32 float64_to_int32_round_to_zero( float64 a )
   2355  1.1      ross {
   2356  1.1      ross     flag aSign;
   2357  1.1      ross     int16 aExp, shiftCount;
   2358  1.1      ross     bits64 aSig, savedASig;
   2359  1.1      ross     int32 z;
   2360  1.1      ross 
   2361  1.1      ross     aSig = extractFloat64Frac( a );
   2362  1.1      ross     aExp = extractFloat64Exp( a );
   2363  1.1      ross     aSign = extractFloat64Sign( a );
   2364  1.1      ross     if ( 0x41E < aExp ) {
   2365  1.1      ross         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
   2366  1.1      ross         goto invalid;
   2367  1.1      ross     }
   2368  1.1      ross     else if ( aExp < 0x3FF ) {
   2369  1.1      ross         if ( aExp || aSig ) float_set_inexact();
   2370  1.1      ross         return 0;
   2371  1.1      ross     }
   2372  1.1      ross     aSig |= LIT64( 0x0010000000000000 );
   2373  1.1      ross     shiftCount = 0x433 - aExp;
   2374  1.1      ross     savedASig = aSig;
   2375  1.1      ross     aSig >>= shiftCount;
   2376  1.1      ross     z = aSig;
   2377  1.1      ross     if ( aSign ) z = - z;
   2378  1.1      ross     if ( ( z < 0 ) ^ aSign ) {
   2379  1.1      ross  invalid:
   2380  1.1      ross         float_raise( float_flag_invalid );
   2381  1.1      ross         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
   2382  1.1      ross     }
   2383  1.1      ross     if ( ( aSig<<shiftCount ) != savedASig ) {
   2384  1.1      ross         float_set_inexact();
   2385  1.1      ross     }
   2386  1.1      ross     return z;
   2387  1.1      ross 
   2388  1.1      ross }
   2389  1.1      ross 
   2390  1.1      ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
   2391  1.1      ross /*
   2392  1.1      ross -------------------------------------------------------------------------------
   2393  1.1      ross Returns the result of converting the double-precision floating-point value
   2394  1.1      ross `a' to the 64-bit two's complement integer format.  The conversion is
   2395  1.1      ross performed according to the IEC/IEEE Standard for Binary Floating-Point
   2396  1.1      ross Arithmetic---which means in particular that the conversion is rounded
   2397  1.1      ross according to the current rounding mode.  If `a' is a NaN, the largest
   2398  1.1      ross positive integer is returned.  Otherwise, if the conversion overflows, the
   2399  1.1      ross largest integer with the same sign as `a' is returned.
   2400  1.1      ross -------------------------------------------------------------------------------
   2401  1.1      ross */
   2402  1.1      ross int64 float64_to_int64( float64 a )
   2403  1.1      ross {
   2404  1.1      ross     flag aSign;
   2405  1.1      ross     int16 aExp, shiftCount;
   2406  1.1      ross     bits64 aSig, aSigExtra;
   2407  1.1      ross 
   2408  1.1      ross     aSig = extractFloat64Frac( a );
   2409  1.1      ross     aExp = extractFloat64Exp( a );
   2410  1.1      ross     aSign = extractFloat64Sign( a );
   2411  1.1      ross     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
   2412  1.1      ross     shiftCount = 0x433 - aExp;
   2413  1.1      ross     if ( shiftCount <= 0 ) {
   2414  1.1      ross         if ( 0x43E < aExp ) {
   2415  1.1      ross             float_raise( float_flag_invalid );
   2416  1.1      ross             if (    ! aSign
   2417  1.1      ross                  || (    ( aExp == 0x7FF )
   2418  1.1      ross                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
   2419  1.1      ross                ) {
   2420  1.1      ross                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   2421  1.1      ross             }
   2422  1.1      ross             return (sbits64) LIT64( 0x8000000000000000 );
   2423  1.1      ross         }
   2424  1.1      ross         aSigExtra = 0;
   2425  1.1      ross         aSig <<= - shiftCount;
   2426  1.1      ross     }
   2427  1.1      ross     else {
   2428  1.1      ross         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
   2429  1.1      ross     }
   2430  1.1      ross     return roundAndPackInt64( aSign, aSig, aSigExtra );
   2431  1.1      ross 
   2432  1.1      ross }
   2433  1.1      ross 
   2434  1.6    martin /* like above, but result is unsigned */
   2435  1.6    martin uint64 float64_to_uint64( float64 a )
   2436  1.6    martin {
   2437  1.6    martin     flag aSign;
   2438  1.6    martin     int16 aExp, shiftCount;
   2439  1.6    martin     bits64 aSig, aSigExtra;
   2440  1.6    martin 
   2441  1.6    martin     aSig = extractFloat64Frac( a );
   2442  1.6    martin     aExp = extractFloat64Exp( a );
   2443  1.6    martin     aSign = extractFloat64Sign( a );
   2444  1.6    martin 
   2445  1.6    martin     if (aSign) {
   2446  1.6    martin 	return float64_to_int64(a);
   2447  1.6    martin     }
   2448  1.6    martin 
   2449  1.6    martin     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
   2450  1.6    martin     shiftCount = 0x433 - aExp;
   2451  1.6    martin     if ( shiftCount <= 0 ) {
   2452  1.6    martin         if ( 0x43E < aExp ) {
   2453  1.6    martin             float_raise( float_flag_invalid );
   2454  1.6    martin             if (    ! aSign
   2455  1.6    martin                  || (    ( aExp == 0x7FF )
   2456  1.6    martin                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
   2457  1.6    martin                ) {
   2458  1.6    martin                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   2459  1.6    martin             }
   2460  1.6    martin             return (sbits64) LIT64( 0x8000000000000000 );
   2461  1.6    martin         }
   2462  1.6    martin         aSigExtra = 0;
   2463  1.6    martin         aSig <<= - shiftCount;
   2464  1.6    martin     }
   2465  1.6    martin     else {
   2466  1.6    martin         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
   2467  1.6    martin     }
   2468  1.6    martin     return roundAndPackUInt64( aSig, aSigExtra );
   2469  1.6    martin 
   2470  1.6    martin }
   2471  1.6    martin 
   2472  1.1      ross /*
   2473  1.1      ross -------------------------------------------------------------------------------
   2474  1.1      ross Returns the result of converting the double-precision floating-point value
   2475  1.1      ross `a' to the 64-bit two's complement integer format.  The conversion is
   2476  1.1      ross performed according to the IEC/IEEE Standard for Binary Floating-Point
   2477  1.1      ross Arithmetic, except that the conversion is always rounded toward zero.
   2478  1.1      ross If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   2479  1.1      ross the conversion overflows, the largest integer with the same sign as `a' is
   2480  1.1      ross returned.
   2481  1.1      ross -------------------------------------------------------------------------------
   2482  1.1      ross */
   2483  1.1      ross int64 float64_to_int64_round_to_zero( float64 a )
   2484  1.1      ross {
   2485  1.1      ross     flag aSign;
   2486  1.1      ross     int16 aExp, shiftCount;
   2487  1.1      ross     bits64 aSig;
   2488  1.1      ross     int64 z;
   2489  1.1      ross 
   2490  1.1      ross     aSig = extractFloat64Frac( a );
   2491  1.1      ross     aExp = extractFloat64Exp( a );
   2492  1.1      ross     aSign = extractFloat64Sign( a );
   2493  1.1      ross     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
   2494  1.1      ross     shiftCount = aExp - 0x433;
   2495  1.1      ross     if ( 0 <= shiftCount ) {
   2496  1.1      ross         if ( 0x43E <= aExp ) {
   2497  1.1      ross             if ( a != LIT64( 0xC3E0000000000000 ) ) {
   2498  1.1      ross                 float_raise( float_flag_invalid );
   2499  1.1      ross                 if (    ! aSign
   2500  1.1      ross                      || (    ( aExp == 0x7FF )
   2501  1.1      ross                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
   2502  1.1      ross                    ) {
   2503  1.1      ross                     return LIT64( 0x7FFFFFFFFFFFFFFF );
   2504  1.1      ross                 }
   2505  1.1      ross             }
   2506  1.1      ross             return (sbits64) LIT64( 0x8000000000000000 );
   2507  1.1      ross         }
   2508  1.1      ross         z = aSig<<shiftCount;
   2509  1.1      ross     }
   2510  1.1      ross     else {
   2511  1.1      ross         if ( aExp < 0x3FE ) {
   2512  1.1      ross             if ( aExp | aSig ) float_set_inexact();
   2513  1.1      ross             return 0;
   2514  1.1      ross         }
   2515  1.1      ross         z = aSig>>( - shiftCount );
   2516  1.1      ross         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
   2517  1.1      ross             float_set_inexact();
   2518  1.1      ross         }
   2519  1.1      ross     }
   2520  1.1      ross     if ( aSign ) z = - z;
   2521  1.1      ross     return z;
   2522  1.1      ross 
   2523  1.1      ross }
   2524  1.1      ross #endif /* !SOFTFLOAT_FOR_GCC */
   2525  1.1      ross 
   2526  1.1      ross /*
   2527  1.1      ross -------------------------------------------------------------------------------
   2528  1.1      ross Returns the result of converting the double-precision floating-point value
   2529  1.1      ross `a' to the single-precision floating-point format.  The conversion is
   2530  1.1      ross performed according to the IEC/IEEE Standard for Binary Floating-Point
   2531  1.1      ross Arithmetic.
   2532  1.1      ross -------------------------------------------------------------------------------
   2533  1.1      ross */
   2534  1.1      ross float32 float64_to_float32( float64 a )
   2535  1.1      ross {
   2536  1.1      ross     flag aSign;
   2537  1.1      ross     int16 aExp;
   2538  1.1      ross     bits64 aSig;
   2539  1.1      ross     bits32 zSig;
   2540  1.1      ross 
   2541  1.1      ross     aSig = extractFloat64Frac( a );
   2542  1.1      ross     aExp = extractFloat64Exp( a );
   2543  1.1      ross     aSign = extractFloat64Sign( a );
   2544  1.1      ross     if ( aExp == 0x7FF ) {
   2545  1.1      ross         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
   2546  1.1      ross         return packFloat32( aSign, 0xFF, 0 );
   2547  1.1      ross     }
   2548  1.1      ross     shift64RightJamming( aSig, 22, &aSig );
   2549  1.1      ross     zSig = aSig;
   2550  1.1      ross     if ( aExp || zSig ) {
   2551  1.1      ross         zSig |= 0x40000000;
   2552  1.1      ross         aExp -= 0x381;
   2553  1.1      ross     }
   2554  1.1      ross     return roundAndPackFloat32( aSign, aExp, zSig );
   2555  1.1      ross 
   2556  1.1      ross }
   2557  1.1      ross 
   2558  1.1      ross #ifdef FLOATX80
   2559  1.1      ross 
   2560  1.1      ross /*
   2561  1.1      ross -------------------------------------------------------------------------------
   2562  1.1      ross Returns the result of converting the double-precision floating-point value
   2563  1.1      ross `a' to the extended double-precision floating-point format.  The conversion
   2564  1.1      ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
   2565  1.1      ross Arithmetic.
   2566  1.1      ross -------------------------------------------------------------------------------
   2567  1.1      ross */
   2568  1.1      ross floatx80 float64_to_floatx80( float64 a )
   2569  1.1      ross {
   2570  1.1      ross     flag aSign;
   2571  1.1      ross     int16 aExp;
   2572  1.1      ross     bits64 aSig;
   2573  1.1      ross 
   2574  1.1      ross     aSig = extractFloat64Frac( a );
   2575  1.1      ross     aExp = extractFloat64Exp( a );
   2576  1.1      ross     aSign = extractFloat64Sign( a );
   2577  1.1      ross     if ( aExp == 0x7FF ) {
   2578  1.1      ross         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
   2579  1.1      ross         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   2580  1.1      ross     }
   2581  1.1      ross     if ( aExp == 0 ) {
   2582  1.1      ross         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
   2583  1.1      ross         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   2584  1.1      ross     }
   2585  1.1      ross     return
   2586  1.1      ross         packFloatx80(
   2587  1.1      ross             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
   2588  1.1      ross 
   2589  1.1      ross }
   2590  1.1      ross 
   2591  1.1      ross #endif
   2592  1.1      ross 
   2593  1.1      ross #ifdef FLOAT128
   2594  1.1      ross 
   2595  1.1      ross /*
   2596  1.1      ross -------------------------------------------------------------------------------
   2597  1.1      ross Returns the result of converting the double-precision floating-point value
   2598  1.1      ross `a' to the quadruple-precision floating-point format.  The conversion is
   2599  1.1      ross performed according to the IEC/IEEE Standard for Binary Floating-Point
   2600  1.1      ross Arithmetic.
   2601  1.1      ross -------------------------------------------------------------------------------
   2602  1.1      ross */
   2603  1.1      ross float128 float64_to_float128( float64 a )
   2604  1.1      ross {
   2605  1.1      ross     flag aSign;
   2606  1.1      ross     int16 aExp;
   2607  1.1      ross     bits64 aSig, zSig0, zSig1;
   2608  1.1      ross 
   2609  1.1      ross     aSig = extractFloat64Frac( a );
   2610  1.1      ross     aExp = extractFloat64Exp( a );
   2611  1.1      ross     aSign = extractFloat64Sign( a );
   2612  1.1      ross     if ( aExp == 0x7FF ) {
   2613  1.1      ross         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
   2614  1.1      ross         return packFloat128( aSign, 0x7FFF, 0, 0 );
   2615  1.1      ross     }
   2616  1.1      ross     if ( aExp == 0 ) {
   2617  1.1      ross         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
   2618  1.1      ross         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   2619  1.1      ross         --aExp;
   2620  1.1      ross     }
   2621  1.1      ross     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
   2622  1.1      ross     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
   2623  1.1      ross 
   2624  1.1      ross }
   2625  1.1      ross 
   2626  1.1      ross #endif
   2627  1.1      ross 
   2628  1.1      ross #ifndef SOFTFLOAT_FOR_GCC
   2629  1.1      ross /*
   2630  1.1      ross -------------------------------------------------------------------------------
   2631  1.1      ross Rounds the double-precision floating-point value `a' to an integer, and
   2632  1.1      ross returns the result as a double-precision floating-point value.  The
   2633  1.1      ross operation is performed according to the IEC/IEEE Standard for Binary
   2634  1.1      ross Floating-Point Arithmetic.
   2635  1.1      ross -------------------------------------------------------------------------------
   2636  1.1      ross */
   2637  1.1      ross float64 float64_round_to_int( float64 a )
   2638  1.1      ross {
   2639  1.1      ross     flag aSign;
   2640  1.1      ross     int16 aExp;
   2641  1.1      ross     bits64 lastBitMask, roundBitsMask;
   2642  1.1      ross     int8 roundingMode;
   2643  1.1      ross     float64 z;
   2644  1.1      ross 
   2645  1.1      ross     aExp = extractFloat64Exp( a );
   2646  1.1      ross     if ( 0x433 <= aExp ) {
   2647  1.1      ross         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
   2648  1.1      ross             return propagateFloat64NaN( a, a );
   2649  1.1      ross         }
   2650  1.1      ross         return a;
   2651  1.1      ross     }
   2652  1.1      ross     if ( aExp < 0x3FF ) {
   2653  1.1      ross         if ( (bits64) ( a<<1 ) == 0 ) return a;
   2654  1.1      ross         float_set_inexact();
   2655  1.1      ross         aSign = extractFloat64Sign( a );
   2656  1.1      ross         switch ( float_rounding_mode() ) {
   2657  1.1      ross          case float_round_nearest_even:
   2658  1.1      ross             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
   2659  1.1      ross                 return packFloat64( aSign, 0x3FF, 0 );
   2660  1.1      ross             }
   2661  1.1      ross             break;
   2662  1.1      ross          case float_round_down:
   2663  1.1      ross             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
   2664  1.1      ross          case float_round_up:
   2665  1.1      ross             return
   2666  1.1      ross             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
   2667  1.1      ross         }
   2668  1.1      ross         return packFloat64( aSign, 0, 0 );
   2669  1.1      ross     }
   2670  1.1      ross     lastBitMask = 1;
   2671  1.1      ross     lastBitMask <<= 0x433 - aExp;
   2672  1.1      ross     roundBitsMask = lastBitMask - 1;
   2673  1.1      ross     z = a;
   2674  1.1      ross     roundingMode = float_rounding_mode();
   2675  1.1      ross     if ( roundingMode == float_round_nearest_even ) {
   2676  1.1      ross         z += lastBitMask>>1;
   2677  1.1      ross         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
   2678  1.1      ross     }
   2679  1.1      ross     else if ( roundingMode != float_round_to_zero ) {
   2680  1.1      ross         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
   2681  1.1      ross             z += roundBitsMask;
   2682  1.1      ross         }
   2683  1.1      ross     }
   2684  1.1      ross     z &= ~ roundBitsMask;
   2685  1.1      ross     if ( z != a ) float_set_inexact();
   2686  1.1      ross     return z;
   2687  1.1      ross 
   2688  1.1      ross }
   2689  1.1      ross #endif
   2690  1.1      ross 
   2691  1.1      ross /*
   2692  1.1      ross -------------------------------------------------------------------------------
   2693  1.1      ross Returns the result of adding the absolute values of the double-precision
   2694  1.1      ross floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
   2695  1.1      ross before being returned.  `zSign' is ignored if the result is a NaN.
   2696  1.1      ross The addition is performed according to the IEC/IEEE Standard for Binary
   2697  1.1      ross Floating-Point Arithmetic.
   2698  1.1      ross -------------------------------------------------------------------------------
   2699  1.1      ross */
   2700  1.1      ross static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
   2701  1.1      ross {
   2702  1.1      ross     int16 aExp, bExp, zExp;
   2703  1.1      ross     bits64 aSig, bSig, zSig;
   2704  1.1      ross     int16 expDiff;
   2705  1.1      ross 
   2706  1.1      ross     aSig = extractFloat64Frac( a );
   2707  1.1      ross     aExp = extractFloat64Exp( a );
   2708  1.1      ross     bSig = extractFloat64Frac( b );
   2709  1.1      ross     bExp = extractFloat64Exp( b );
   2710  1.1      ross     expDiff = aExp - bExp;
   2711  1.1      ross     aSig <<= 9;
   2712  1.1      ross     bSig <<= 9;
   2713  1.1      ross     if ( 0 < expDiff ) {
   2714  1.1      ross         if ( aExp == 0x7FF ) {
   2715  1.1      ross             if ( aSig ) return propagateFloat64NaN( a, b );
   2716  1.1      ross             return a;
   2717  1.1      ross         }
   2718  1.1      ross         if ( bExp == 0 ) {
   2719  1.1      ross             --expDiff;
   2720  1.1      ross         }
   2721  1.1      ross         else {
   2722  1.1      ross             bSig |= LIT64( 0x2000000000000000 );
   2723  1.1      ross         }
   2724  1.1      ross         shift64RightJamming( bSig, expDiff, &bSig );
   2725  1.1      ross         zExp = aExp;
   2726  1.1      ross     }
   2727  1.1      ross     else if ( expDiff < 0 ) {
   2728  1.1      ross         if ( bExp == 0x7FF ) {
   2729  1.1      ross             if ( bSig ) return propagateFloat64NaN( a, b );
   2730  1.1      ross             return packFloat64( zSign, 0x7FF, 0 );
   2731  1.1      ross         }
   2732  1.1      ross         if ( aExp == 0 ) {
   2733  1.1      ross             ++expDiff;
   2734  1.1      ross         }
   2735  1.1      ross         else {
   2736  1.1      ross             aSig |= LIT64( 0x2000000000000000 );
   2737  1.1      ross         }
   2738  1.1      ross         shift64RightJamming( aSig, - expDiff, &aSig );
   2739  1.1      ross         zExp = bExp;
   2740  1.1      ross     }
   2741  1.1      ross     else {
   2742  1.1      ross         if ( aExp == 0x7FF ) {
   2743  1.1      ross             if ( aSig | bSig ) return propagateFloat64NaN( a, b );
   2744  1.1      ross             return a;
   2745  1.1      ross         }
   2746  1.1      ross         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
   2747  1.1      ross         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
   2748  1.1      ross         zExp = aExp;
   2749  1.1      ross         goto roundAndPack;
   2750  1.1      ross     }
   2751  1.1      ross     aSig |= LIT64( 0x2000000000000000 );
   2752  1.1      ross     zSig = ( aSig + bSig )<<1;
   2753  1.1      ross     --zExp;
   2754  1.1      ross     if ( (sbits64) zSig < 0 ) {
   2755  1.1      ross         zSig = aSig + bSig;
   2756  1.1      ross         ++zExp;
   2757  1.1      ross     }
   2758  1.1      ross  roundAndPack:
   2759  1.1      ross     return roundAndPackFloat64( zSign, zExp, zSig );
   2760  1.1      ross 
   2761  1.1      ross }
   2762  1.1      ross 
   2763  1.1      ross /*
   2764  1.1      ross -------------------------------------------------------------------------------
   2765  1.1      ross Returns the result of subtracting the absolute values of the double-
   2766  1.1      ross precision floating-point values `a' and `b'.  If `zSign' is 1, the
   2767  1.1      ross difference is negated before being returned.  `zSign' is ignored if the
   2768  1.1      ross result is a NaN.  The subtraction is performed according to the IEC/IEEE
   2769  1.1      ross Standard for Binary Floating-Point Arithmetic.
   2770  1.1      ross -------------------------------------------------------------------------------
   2771  1.1      ross */
   2772  1.1      ross static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
   2773  1.1      ross {
   2774  1.1      ross     int16 aExp, bExp, zExp;
   2775  1.1      ross     bits64 aSig, bSig, zSig;
   2776  1.1      ross     int16 expDiff;
   2777  1.1      ross 
   2778  1.1      ross     aSig = extractFloat64Frac( a );
   2779  1.1      ross     aExp = extractFloat64Exp( a );
   2780  1.1      ross     bSig = extractFloat64Frac( b );
   2781  1.1      ross     bExp = extractFloat64Exp( b );
   2782  1.1      ross     expDiff = aExp - bExp;
   2783  1.1      ross     aSig <<= 10;
   2784  1.1      ross     bSig <<= 10;
   2785  1.1      ross     if ( 0 < expDiff ) goto aExpBigger;
   2786  1.1      ross     if ( expDiff < 0 ) goto bExpBigger;
   2787  1.1      ross     if ( aExp == 0x7FF ) {
   2788  1.1      ross         if ( aSig | bSig ) return propagateFloat64NaN( a, b );
   2789  1.1      ross         float_raise( float_flag_invalid );
   2790  1.1      ross         return float64_default_nan;
   2791  1.1      ross     }
   2792  1.1      ross     if ( aExp == 0 ) {
   2793  1.1      ross         aExp = 1;
   2794  1.1      ross         bExp = 1;
   2795  1.1      ross     }
   2796  1.1      ross     if ( bSig < aSig ) goto aBigger;
   2797  1.1      ross     if ( aSig < bSig ) goto bBigger;
   2798  1.1      ross     return packFloat64( float_rounding_mode() == float_round_down, 0, 0 );
   2799  1.1      ross  bExpBigger:
   2800  1.1      ross     if ( bExp == 0x7FF ) {
   2801  1.1      ross         if ( bSig ) return propagateFloat64NaN( a, b );
   2802  1.1      ross         return packFloat64( zSign ^ 1, 0x7FF, 0 );
   2803  1.1      ross     }
   2804  1.1      ross     if ( aExp == 0 ) {
   2805  1.1      ross         ++expDiff;
   2806  1.1      ross     }
   2807  1.1      ross     else {
   2808  1.1      ross         aSig |= LIT64( 0x4000000000000000 );
   2809  1.1      ross     }
   2810  1.1      ross     shift64RightJamming( aSig, - expDiff, &aSig );
   2811  1.1      ross     bSig |= LIT64( 0x4000000000000000 );
   2812  1.1      ross  bBigger:
   2813  1.1      ross     zSig = bSig - aSig;
   2814  1.1      ross     zExp = bExp;
   2815  1.1      ross     zSign ^= 1;
   2816  1.1      ross     goto normalizeRoundAndPack;
   2817  1.1      ross  aExpBigger:
   2818  1.1      ross     if ( aExp == 0x7FF ) {
   2819  1.1      ross         if ( aSig ) return propagateFloat64NaN( a, b );
   2820  1.1      ross         return a;
   2821  1.1      ross     }
   2822  1.1      ross     if ( bExp == 0 ) {
   2823  1.1      ross         --expDiff;
   2824  1.1      ross     }
   2825  1.1      ross     else {
   2826  1.1      ross         bSig |= LIT64( 0x4000000000000000 );
   2827  1.1      ross     }
   2828  1.1      ross     shift64RightJamming( bSig, expDiff, &bSig );
   2829  1.1      ross     aSig |= LIT64( 0x4000000000000000 );
   2830  1.1      ross  aBigger:
   2831  1.1      ross     zSig = aSig - bSig;
   2832  1.1      ross     zExp = aExp;
   2833  1.1      ross  normalizeRoundAndPack:
   2834  1.1      ross     --zExp;
   2835  1.1      ross     return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
   2836  1.1      ross 
   2837  1.1      ross }
   2838  1.1      ross 
   2839  1.1      ross /*
   2840  1.1      ross -------------------------------------------------------------------------------
   2841  1.1      ross Returns the result of adding the double-precision floating-point values `a'
   2842  1.1      ross and `b'.  The operation is performed according to the IEC/IEEE Standard for
   2843  1.1      ross Binary Floating-Point Arithmetic.
   2844  1.1      ross -------------------------------------------------------------------------------
   2845  1.1      ross */
   2846  1.1      ross float64 float64_add( float64 a, float64 b )  /* a + b, dispatched on the operand signs */
   2847  1.1      ross {
   2848  1.1      ross     flag aSign, bSign;
   2849  1.1      ross 
   2850  1.1      ross     aSign = extractFloat64Sign( a );
   2851  1.1      ross     bSign = extractFloat64Sign( b );
   2852  1.1      ross     if ( aSign == bSign ) {  /* same sign: delegate to addFloat64Sigs, passing a's sign */
   2853  1.1      ross         return addFloat64Sigs( a, b, aSign );
   2854  1.1      ross     }
   2855  1.1      ross     else {  /* signs differ: delegate to subFloat64Sigs, passing a's sign */
   2856  1.1      ross         return subFloat64Sigs( a, b, aSign );
   2857  1.1      ross     }
   2858  1.1      ross 
   2859  1.1      ross }
   2860  1.1      ross 
   2861  1.1      ross /*
   2862  1.1      ross -------------------------------------------------------------------------------
   2863  1.1      ross Returns the result of subtracting the double-precision floating-point value
   2864  1.1      ross `b' from `a'.  The operation is performed according to the IEC/IEEE Standard
   2865  1.1      ross for Binary Floating-Point Arithmetic.
   2866  1.1      ross -------------------------------------------------------------------------------
   2867  1.1      ross */
   2868  1.1      ross float64 float64_sub( float64 a, float64 b )  /* a - b, dispatched on the operand signs */
   2869  1.1      ross {
   2870  1.1      ross     flag aSign, bSign;
   2871  1.1      ross 
   2872  1.1      ross     aSign = extractFloat64Sign( a );
   2873  1.1      ross     bSign = extractFloat64Sign( b );
   2874  1.1      ross     if ( aSign == bSign ) {  /* same sign: delegate to subFloat64Sigs, passing a's sign */
   2875  1.1      ross         return subFloat64Sigs( a, b, aSign );
   2876  1.1      ross     }
   2877  1.1      ross     else {  /* signs differ: delegate to addFloat64Sigs, passing a's sign */
   2878  1.1      ross         return addFloat64Sigs( a, b, aSign );
   2879  1.1      ross     }
   2880  1.1      ross 
   2881  1.1      ross }
   2882  1.1      ross 
   2883  1.1      ross /*
   2884  1.1      ross -------------------------------------------------------------------------------
   2885  1.1      ross Returns the result of multiplying the double-precision floating-point values
   2886  1.1      ross `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   2887  1.1      ross for Binary Floating-Point Arithmetic.
   2888  1.1      ross -------------------------------------------------------------------------------
   2889  1.1      ross */
   2890  1.1      ross float64 float64_mul( float64 a, float64 b )  /* a times b: special operands first, then a 128-bit significand product */
   2891  1.1      ross {
   2892  1.1      ross     flag aSign, bSign, zSign;
   2893  1.1      ross     int16 aExp, bExp, zExp;
   2894  1.1      ross     bits64 aSig, bSig, zSig0, zSig1;
   2895  1.1      ross 
   2896  1.1      ross     aSig = extractFloat64Frac( a );
   2897  1.1      ross     aExp = extractFloat64Exp( a );
   2898  1.1      ross     aSign = extractFloat64Sign( a );
   2899  1.1      ross     bSig = extractFloat64Frac( b );
   2900  1.1      ross     bExp = extractFloat64Exp( b );
   2901  1.1      ross     bSign = extractFloat64Sign( b );
   2902  1.1      ross     zSign = aSign ^ bSign;  /* result sign is the XOR of the operand signs */
   2903  1.1      ross     if ( aExp == 0x7FF ) {  /* a has the all-ones exponent: infinity or NaN */
   2904  1.1      ross         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {  /* a or b is a NaN */
   2905  1.1      ross             return propagateFloat64NaN( a, b );
   2906  1.1      ross         }
   2907  1.1      ross         if ( ( bExp | bSig ) == 0 ) {  /* infinity times zero: invalid, default NaN */
   2908  1.1      ross             float_raise( float_flag_invalid );
   2909  1.1      ross             return float64_default_nan;
   2910  1.1      ross         }
   2911  1.1      ross         return packFloat64( zSign, 0x7FF, 0 );  /* infinity times nonzero: signed infinity */
   2912  1.1      ross     }
   2913  1.1      ross     if ( bExp == 0x7FF ) {  /* b is infinity or NaN; a is finite here */
   2914  1.1      ross         if ( bSig ) return propagateFloat64NaN( a, b );
   2915  1.1      ross         if ( ( aExp | aSig ) == 0 ) {  /* zero times infinity: invalid, default NaN */
   2916  1.1      ross             float_raise( float_flag_invalid );
   2917  1.1      ross             return float64_default_nan;
   2918  1.1      ross         }
   2919  1.1      ross         return packFloat64( zSign, 0x7FF, 0 );
   2920  1.1      ross     }
   2921  1.1      ross     if ( aExp == 0 ) {  /* a is zero or subnormal */
   2922  1.1      ross         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );  /* zero operand: signed zero result */
   2923  1.1      ross         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   2924  1.1      ross     }
   2925  1.1      ross     if ( bExp == 0 ) {  /* b is zero or subnormal */
   2926  1.1      ross         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
   2927  1.1      ross         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
   2928  1.1      ross     }
   2929  1.1      ross     zExp = aExp + bExp - 0x3FF;  /* sum of the exponent fields less 0x3FF */
   2930  1.1      ross     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;  /* set bit 52, then left-align by 10 */
   2931  1.1      ross     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;  /* set bit 52, then left-align by 11 */
   2932  1.1      ross     mul64To128( aSig, bSig, &zSig0, &zSig1 );  /* product delivered in zSig0 and zSig1 (presumably high and low halves) */
   2933  1.1      ross     zSig0 |= ( zSig1 != 0 );  /* fold any nonzero zSig1 into the lowest bit of zSig0 */
   2934  1.1      ross     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {  /* bit 62 of zSig0 is clear: shift up one and lower the exponent */
   2935  1.1      ross         zSig0 <<= 1;
   2936  1.1      ross         --zExp;
   2937  1.1      ross     }
   2938  1.1      ross     return roundAndPackFloat64( zSign, zExp, zSig0 );
   2939  1.1      ross 
   2940  1.1      ross }
   2941  1.1      ross 
   2942  1.1      ross /*
   2943  1.1      ross -------------------------------------------------------------------------------
   2944  1.1      ross Returns the result of dividing the double-precision floating-point value `a'
   2945  1.1      ross by the corresponding value `b'.  The operation is performed according to
   2946  1.1      ross the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2947  1.1      ross -------------------------------------------------------------------------------
   2948  1.1      ross */
   2949  1.1      ross float64 float64_div( float64 a, float64 b )  /* a divided by b: special operands first, then an estimated quotient with optional correction */
   2950  1.1      ross {
   2951  1.1      ross     flag aSign, bSign, zSign;
   2952  1.1      ross     int16 aExp, bExp, zExp;
   2953  1.1      ross     bits64 aSig, bSig, zSig;
   2954  1.1      ross     bits64 rem0, rem1;
   2955  1.1      ross     bits64 term0, term1;
   2956  1.1      ross 
   2957  1.1      ross     aSig = extractFloat64Frac( a );
   2958  1.1      ross     aExp = extractFloat64Exp( a );
   2959  1.1      ross     aSign = extractFloat64Sign( a );
   2960  1.1      ross     bSig = extractFloat64Frac( b );
   2961  1.1      ross     bExp = extractFloat64Exp( b );
   2962  1.1      ross     bSign = extractFloat64Sign( b );
   2963  1.1      ross     zSign = aSign ^ bSign;  /* result sign is the XOR of the operand signs */
   2964  1.1      ross     if ( aExp == 0x7FF ) {  /* a has the all-ones exponent: infinity or NaN */
   2965  1.1      ross         if ( aSig ) return propagateFloat64NaN( a, b );
   2966  1.1      ross         if ( bExp == 0x7FF ) {
   2967  1.1      ross             if ( bSig ) return propagateFloat64NaN( a, b );
   2968  1.1      ross             float_raise( float_flag_invalid );  /* infinity over infinity: invalid, default NaN */
   2969  1.1      ross             return float64_default_nan;
   2970  1.1      ross         }
   2971  1.1      ross         return packFloat64( zSign, 0x7FF, 0 );  /* infinity over finite: signed infinity */
   2972  1.1      ross     }
   2973  1.1      ross     if ( bExp == 0x7FF ) {  /* b is infinity or NaN; a is finite here */
   2974  1.1      ross         if ( bSig ) return propagateFloat64NaN( a, b );
   2975  1.1      ross         return packFloat64( zSign, 0, 0 );  /* finite over infinity: signed zero */
   2976  1.1      ross     }
   2977  1.1      ross     if ( bExp == 0 ) {  /* b is zero or subnormal */
   2978  1.1      ross         if ( bSig == 0 ) {  /* b is zero */
   2979  1.1      ross             if ( ( aExp | aSig ) == 0 ) {  /* zero over zero: invalid, default NaN */
   2980  1.1      ross                 float_raise( float_flag_invalid );
   2981  1.1      ross                 return float64_default_nan;
   2982  1.1      ross             }
   2983  1.1      ross             float_raise( float_flag_divbyzero );  /* nonzero over zero: divide-by-zero flag, signed infinity */
   2984  1.1      ross             return packFloat64( zSign, 0x7FF, 0 );
   2985  1.1      ross         }
   2986  1.1      ross         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
   2987  1.1      ross     }
   2988  1.1      ross     if ( aExp == 0 ) {  /* a is zero or subnormal */
   2989  1.1      ross         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );  /* zero dividend: signed zero result */
   2990  1.1      ross         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   2991  1.1      ross     }
   2992  1.1      ross     zExp = aExp - bExp + 0x3FD;  /* difference of the exponent fields plus 0x3FD */
   2993  1.1      ross     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;  /* set bit 52, then left-align by 10 */
   2994  1.1      ross     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;  /* set bit 52, then left-align by 11 */
   2995  1.1      ross     if ( bSig <= ( aSig + aSig ) ) {  /* dividend significand not smaller than the divisor's: halve it and raise the exponent */
   2996  1.1      ross         aSig >>= 1;
   2997  1.1      ross         ++zExp;
   2998  1.1      ross     }
   2999  1.1      ross     zSig = estimateDiv128To64( aSig, 0, bSig );  /* quotient estimate; accuracy bound is defined by the helper (not visible here) */
   3000  1.1      ross     if ( ( zSig & 0x1FF ) <= 2 ) {  /* correct the estimate only when its low nine bits are 0, 1 or 2 */
   3001  1.1      ross         mul64To128( bSig, zSig, &term0, &term1 );
   3002  1.1      ross         sub128( aSig, 0, term0, term1, &rem0, &rem1 );  /* remainder = (aSig, 0) minus bSig times zSig */
   3003  1.1      ross         while ( (sbits64) rem0 < 0 ) {  /* estimate too large: step zSig down, adding bSig back to the remainder */
   3004  1.1      ross             --zSig;
   3005  1.1      ross             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
   3006  1.1      ross         }
   3007  1.1      ross         zSig |= ( rem1 != 0 );  /* set the lowest bit when rem1 is nonzero */
   3008  1.1      ross     }
   3009  1.1      ross     return roundAndPackFloat64( zSign, zExp, zSig );
   3010  1.1      ross 
   3011  1.1      ross }
   3012  1.1      ross 
   3013  1.1      ross #ifndef SOFTFLOAT_FOR_GCC
   3014  1.1      ross /*
   3015  1.1      ross -------------------------------------------------------------------------------
   3016  1.1      ross Returns the remainder of the double-precision floating-point value `a'
   3017  1.1      ross with respect to the corresponding value `b'.  The operation is performed
   3018  1.1      ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3019  1.1      ross -------------------------------------------------------------------------------
   3020  1.1      ross */
   3021  1.1      ross float64 float64_rem( float64 a, float64 b )  /* remainder of a with respect to b */
   3022  1.1      ross {
   3023  1.5  christos     flag aSign, bSign __unused, zSign;  /* bSign is assigned below but never read afterwards */
   3024  1.1      ross     int16 aExp, bExp, expDiff;
   3025  1.1      ross     bits64 aSig, bSig;
   3026  1.1      ross     bits64 q, alternateASig;
   3027  1.1      ross     sbits64 sigMean;
   3028  1.1      ross 
   3029  1.1      ross     aSig = extractFloat64Frac( a );
   3030  1.1      ross     aExp = extractFloat64Exp( a );
   3031  1.1      ross     aSign = extractFloat64Sign( a );
   3032  1.1      ross     bSig = extractFloat64Frac( b );
   3033  1.1      ross     bExp = extractFloat64Exp( b );
   3034  1.1      ross     bSign = extractFloat64Sign( b );
   3035  1.1      ross     if ( aExp == 0x7FF ) {  /* a has the all-ones exponent: infinity or NaN */
   3036  1.1      ross         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {  /* a or b is a NaN */
   3037  1.1      ross             return propagateFloat64NaN( a, b );
   3038  1.1      ross         }
   3039  1.1      ross         float_raise( float_flag_invalid );  /* a is infinity: invalid, default NaN */
   3040  1.1      ross         return float64_default_nan;
   3041  1.1      ross     }
   3042  1.1      ross     if ( bExp == 0x7FF ) {  /* b is infinity or NaN; a is finite here */
   3043  1.1      ross         if ( bSig ) return propagateFloat64NaN( a, b );
   3044  1.1      ross         return a;  /* b is infinity: a is returned unchanged */
   3045  1.1      ross     }
   3046  1.1      ross     if ( bExp == 0 ) {  /* b is zero or subnormal */
   3047  1.1      ross         if ( bSig == 0 ) {  /* b is zero: invalid, default NaN */
   3048  1.1      ross             float_raise( float_flag_invalid );
   3049  1.1      ross             return float64_default_nan;
   3050  1.1      ross         }
   3051  1.1      ross         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
   3052  1.1      ross     }
   3053  1.1      ross     if ( aExp == 0 ) {  /* a is zero or subnormal */
   3054  1.1      ross         if ( aSig == 0 ) return a;  /* a is zero: returned unchanged */
   3055  1.1      ross         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   3056  1.1      ross     }
   3057  1.1      ross     expDiff = aExp - bExp;
   3058  1.1      ross     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;  /* set bit 52, then left-align both significands by 11 */
   3059  1.1      ross     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
   3060  1.1      ross     if ( expDiff < 0 ) {
   3061  1.1      ross         if ( expDiff < -1 ) return a;  /* a's exponent is at least two below b's: a is returned unchanged */
   3062  1.1      ross         aSig >>= 1;  /* expDiff is -1: align a by one bit */
   3063  1.1      ross     }
   3064  1.1      ross     q = ( bSig <= aSig );  /* first quotient bit */
   3065  1.1      ross     if ( q ) aSig -= bSig;
   3066  1.1      ross     expDiff -= 64;
   3067  1.1      ross     while ( 0 < expDiff ) {  /* each pass lowers expDiff by 62 */
   3068  1.1      ross         q = estimateDiv128To64( aSig, 0, bSig );
   3069  1.1      ross         q = ( 2 < q ) ? q - 2 : 0;  /* lower the estimate by 2, clamping at 0 */
   3070  1.1      ross         aSig = - ( ( bSig>>2 ) * q );  /* unsigned 64-bit arithmetic: the negation wraps modulo 2^64 */
   3071  1.1      ross         expDiff -= 62;
   3072  1.1      ross     }
   3073  1.1      ross     expDiff += 64;  /* undo the bias applied before the loop */
   3074  1.1      ross     if ( 0 < expDiff ) {  /* final partial step covering the remaining expDiff bits */
   3075  1.1      ross         q = estimateDiv128To64( aSig, 0, bSig );
   3076  1.1      ross         q = ( 2 < q ) ? q - 2 : 0;
   3077  1.1      ross         q >>= 64 - expDiff;
   3078  1.1      ross         bSig >>= 2;
   3079  1.1      ross         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
   3080  1.1      ross     }
   3081  1.1      ross     else {  /* no bits left: only rescale both significands by two bits */
   3082  1.1      ross         aSig >>= 2;
   3083  1.1      ross         bSig >>= 2;
   3084  1.1      ross     }
   3085  1.1      ross     do {  /* subtract bSig until aSig goes negative; alternateASig keeps the value before the last subtraction */
   3086  1.1      ross         alternateASig = aSig;
   3087  1.1      ross         ++q;
   3088  1.1      ross         aSig -= bSig;
   3089  1.1      ross     } while ( 0 <= (sbits64) aSig );
   3090  1.1      ross     sigMean = aSig + alternateASig;  /* sum of the negative and the non-negative candidate */
   3091  1.1      ross     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {  /* negative sum, or zero sum with odd q, selects alternateASig */
   3092  1.1      ross         aSig = alternateASig;
   3093  1.1      ross     }
   3094  1.1      ross     zSign = ( (sbits64) aSig < 0 );  /* chosen candidate is negative */
   3095  1.1      ross     if ( zSign ) aSig = - aSig;  /* keep its magnitude; the sign is applied below */
   3096  1.1      ross     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );  /* sign is a's sign flipped when the candidate was negative */
   3097  1.1      ross 
   3098  1.1      ross }
   3099  1.1      ross 
   3100  1.1      ross /*
   3101  1.1      ross -------------------------------------------------------------------------------
   3102  1.1      ross Returns the square root of the double-precision floating-point value `a'.
   3103  1.1      ross The operation is performed according to the IEC/IEEE Standard for Binary
   3104  1.1      ross Floating-Point Arithmetic.
   3105  1.1      ross -------------------------------------------------------------------------------
   3106  1.1      ross */
   3107  1.1      ross float64 float64_sqrt( float64 a )  /* square root of a: special operands first, then an estimate with optional correction */
   3108  1.1      ross {
   3109  1.1      ross     flag aSign;
   3110  1.1      ross     int16 aExp, zExp;
   3111  1.1      ross     bits64 aSig, zSig, doubleZSig;
   3112  1.1      ross     bits64 rem0, rem1, term0, term1;
   3113  1.1      ross 
   3114  1.1      ross     aSig = extractFloat64Frac( a );
   3115  1.1      ross     aExp = extractFloat64Exp( a );
   3116  1.1      ross     aSign = extractFloat64Sign( a );
   3117  1.1      ross     if ( aExp == 0x7FF ) {  /* a has the all-ones exponent: infinity or NaN */
   3118  1.1      ross         if ( aSig ) return propagateFloat64NaN( a, a );  /* NaN: a is passed as both operands */
   3119  1.1      ross         if ( ! aSign ) return a;  /* positive infinity is returned unchanged */
   3120  1.1      ross         float_raise( float_flag_invalid );  /* negative infinity: invalid, default NaN */
   3121  1.1      ross         return float64_default_nan;
   3122  1.1      ross     }
   3123  1.1      ross     if ( aSign ) {  /* negative finite input */
   3124  1.1      ross         if ( ( aExp | aSig ) == 0 ) return a;  /* negative zero is returned unchanged */
   3125  1.1      ross         float_raise( float_flag_invalid );  /* any other negative value: invalid, default NaN */
   3126  1.1      ross         return float64_default_nan;
   3127  1.1      ross     }
   3128  1.1      ross     if ( aExp == 0 ) {  /* a is positive zero or subnormal */
   3129  1.1      ross         if ( aSig == 0 ) return 0;  /* positive zero: returns the literal 0 */
   3130  1.1      ross         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   3131  1.1      ross     }
   3132  1.1      ross     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;  /* shift the exponent field less 0x3FF right by one, then add 0x3FE */
   3133  1.1      ross     aSig |= LIT64( 0x0010000000000000 );  /* set bit 52 */
   3134  1.1      ross     zSig = estimateSqrt32( aExp, aSig>>21 );  /* first estimate from the top bits of aSig */
   3135  1.1      ross     aSig <<= 9 - ( aExp & 1 );  /* shift by 9, or by 8 when the exponent field is odd */
   3136  1.1      ross     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );  /* widen the estimate using a division by the first estimate */
   3137  1.1      ross     if ( ( zSig & 0x1FF ) <= 5 ) {  /* correct the estimate only when its low nine bits are 5 or less */
   3138  1.1      ross         doubleZSig = zSig<<1;
   3139  1.1      ross         mul64To128( zSig, zSig, &term0, &term1 );
   3140  1.1      ross         sub128( aSig, 0, term0, term1, &rem0, &rem1 );  /* remainder = (aSig, 0) minus zSig squared */
   3141  1.1      ross         while ( (sbits64) rem0 < 0 ) {  /* estimate too large: step zSig down and adjust the remainder */
   3142  1.1      ross             --zSig;
   3143  1.1      ross             doubleZSig -= 2;
   3144  1.1      ross             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
   3145  1.1      ross         }
   3146  1.1      ross         zSig |= ( ( rem0 | rem1 ) != 0 );  /* set the lowest bit when any remainder is left */
   3147  1.1      ross     }
   3148  1.1      ross     return roundAndPackFloat64( 0, zExp, zSig );  /* result sign is always 0 here */
   3149  1.1      ross 
   3150  1.1      ross }
   3151  1.1      ross #endif
   3152  1.1      ross 
   3153  1.1      ross /*
   3154  1.1      ross -------------------------------------------------------------------------------
   3155  1.1      ross Returns 1 if the double-precision floating-point value `a' is equal to the
   3156  1.1      ross corresponding value `b', and 0 otherwise.  The comparison is performed
   3157  1.1      ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3158  1.1      ross -------------------------------------------------------------------------------
   3159  1.1      ross */
   3160  1.1      ross flag float64_eq( float64 a, float64 b )  /* a == b; only signaling NaNs raise invalid */
   3161  1.1      ross {
   3162  1.1      ross 
   3163  1.1      ross     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3164  1.1      ross          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3165  1.1      ross        ) {  /* either operand is a NaN: all-ones exponent with a nonzero fraction */
   3166  1.1      ross         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
   3167  1.1      ross             float_raise( float_flag_invalid );
   3168  1.1      ross         }
   3169  1.1      ross         return 0;  /* a NaN compares unequal to everything */
   3170  1.1      ross     }
   3171  1.1      ross     return ( a == b ) ||  /* identical bit patterns */
   3172  1.1      ross 	( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );  /* or both are zero once the top (sign) bit is shifted out */
   3173  1.1      ross 
   3174  1.1      ross }
   3175  1.1      ross 
   3176  1.1      ross /*
   3177  1.1      ross -------------------------------------------------------------------------------
   3178  1.1      ross Returns 1 if the double-precision floating-point value `a' is less than or
   3179  1.1      ross equal to the corresponding value `b', and 0 otherwise.  The comparison is
   3180  1.1      ross performed according to the IEC/IEEE Standard for Binary Floating-Point
   3181  1.1      ross Arithmetic.
   3182  1.1      ross -------------------------------------------------------------------------------
   3183  1.1      ross */
   3184  1.1      ross flag float64_le( float64 a, float64 b )  /* a <= b; any NaN operand raises invalid */
   3185  1.1      ross {
   3186  1.1      ross     flag aSign, bSign;
   3187  1.1      ross 
   3188  1.1      ross     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3189  1.1      ross          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3190  1.1      ross        ) {  /* either operand is a NaN: all-ones exponent with a nonzero fraction */
   3191  1.1      ross         float_raise( float_flag_invalid );
   3192  1.1      ross         return 0;
   3193  1.1      ross     }
   3194  1.1      ross     aSign = extractFloat64Sign( a );
   3195  1.1      ross     bSign = extractFloat64Sign( b );
   3196  1.1      ross     if ( aSign != bSign )  /* signs differ: true when a is the negative one, or when both are zero */
   3197  1.1      ross 	return aSign ||
   3198  1.1      ross 	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
   3199  1.1      ross 	      0 );
   3200  1.1      ross     return ( a == b ) ||  /* same sign: equal, or unsigned bit-pattern order flipped for negatives */
   3201  1.1      ross 	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
   3202  1.1      ross 
   3203  1.1      ross }
   3204  1.1      ross 
   3205  1.1      ross /*
   3206  1.1      ross -------------------------------------------------------------------------------
   3207  1.1      ross Returns 1 if the double-precision floating-point value `a' is less than
   3208  1.1      ross the corresponding value `b', and 0 otherwise.  The comparison is performed
   3209  1.1      ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3210  1.1      ross -------------------------------------------------------------------------------
   3211  1.1      ross */
   3212  1.1      ross flag float64_lt( float64 a, float64 b )  /* a < b; any NaN operand raises invalid */
   3213  1.1      ross {
   3214  1.1      ross     flag aSign, bSign;
   3215  1.1      ross 
   3216  1.1      ross     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3217  1.1      ross          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3218  1.1      ross        ) {  /* either operand is a NaN: all-ones exponent with a nonzero fraction */
   3219  1.1      ross         float_raise( float_flag_invalid );
   3220  1.1      ross         return 0;
   3221  1.1      ross     }
   3222  1.1      ross     aSign = extractFloat64Sign( a );
   3223  1.1      ross     bSign = extractFloat64Sign( b );
   3224  1.1      ross     if ( aSign != bSign )  /* signs differ: true when a is the negative one and the operands are not both zero */
   3225  1.1      ross 	return aSign &&
   3226  1.1      ross 	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
   3227  1.1      ross 	      0 );
   3228  1.1      ross     return ( a != b ) &&  /* same sign: not equal, and unsigned bit-pattern order flipped for negatives */
   3229  1.1      ross 	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
   3230  1.1      ross 
   3231  1.1      ross }
   3232  1.1      ross 
   3233  1.1      ross #ifndef SOFTFLOAT_FOR_GCC
   3234  1.1      ross /*
   3235  1.1      ross -------------------------------------------------------------------------------
   3236  1.1      ross Returns 1 if the double-precision floating-point value `a' is equal to the
   3237  1.1      ross corresponding value `b', and 0 otherwise.  The invalid exception is raised
   3238  1.1      ross if either operand is a NaN.  Otherwise, the comparison is performed
   3239  1.1      ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3240  1.1      ross -------------------------------------------------------------------------------
   3241  1.1      ross */
   3242  1.1      ross flag float64_eq_signaling( float64 a, float64 b )
   3243  1.1      ross {
   3244  1.1      ross 
   3245  1.1      ross     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3246  1.1      ross          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3247  1.1      ross        ) {
   3248  1.1      ross         float_raise( float_flag_invalid );
   3249  1.1      ross         return 0;
   3250  1.1      ross     }
   3251  1.1      ross     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
   3252  1.1      ross 
   3253  1.1      ross }
   3254  1.1      ross 
   3255  1.1      ross /*
   3256  1.1      ross -------------------------------------------------------------------------------
   3257  1.1      ross Returns 1 if the double-precision floating-point value `a' is less than or
   3258  1.1      ross equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
   3259  1.1      ross cause an exception.  Otherwise, the comparison is performed according to the
   3260  1.1      ross IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3261  1.1      ross -------------------------------------------------------------------------------
   3262  1.1      ross */
   3263  1.1      ross flag float64_le_quiet( float64 a, float64 b )
   3264  1.1      ross {
   3265  1.1      ross     flag aSign, bSign;
   3266  1.1      ross 
   3267  1.1      ross     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3268  1.1      ross          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3269  1.1      ross        ) {
   3270  1.1      ross         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
   3271  1.1      ross             float_raise( float_flag_invalid );
   3272  1.1      ross         }
   3273  1.1      ross         return 0;
   3274  1.1      ross     }
   3275  1.1      ross     aSign = extractFloat64Sign( a );
   3276  1.1      ross     bSign = extractFloat64Sign( b );
   3277  1.1      ross     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
   3278  1.1      ross     return ( a == b ) || ( aSign ^ ( a < b ) );
   3279  1.1      ross 
   3280  1.1      ross }
   3281  1.1      ross 
   3282  1.1      ross /*
   3283  1.1      ross -------------------------------------------------------------------------------
   3284  1.1      ross Returns 1 if the double-precision floating-point value `a' is less than
   3285  1.1      ross the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
   3286  1.1      ross exception.  Otherwise, the comparison is performed according to the IEC/IEEE
   3287  1.1      ross Standard for Binary Floating-Point Arithmetic.
   3288  1.1      ross -------------------------------------------------------------------------------
   3289  1.1      ross */
   3290  1.1      ross flag float64_lt_quiet( float64 a, float64 b )
   3291  1.1      ross {
   3292  1.1      ross     flag aSign, bSign;
   3293  1.1      ross 
   3294  1.1      ross     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3295  1.1      ross          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3296  1.1      ross        ) {
   3297  1.1      ross         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
   3298  1.1      ross             float_raise( float_flag_invalid );
   3299  1.1      ross         }
   3300  1.1      ross         return 0;
   3301  1.1      ross     }
   3302  1.1      ross     aSign = extractFloat64Sign( a );
   3303  1.1      ross     bSign = extractFloat64Sign( b );
   3304  1.1      ross     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
   3305  1.1      ross     return ( a != b ) && ( aSign ^ ( a < b ) );
   3306  1.1      ross 
   3307  1.1      ross }
   3308  1.1      ross #endif
   3309  1.1      ross 
   3310  1.1      ross #ifdef FLOATX80
   3311  1.1      ross 
   3312  1.1      ross /*
   3313  1.1      ross -------------------------------------------------------------------------------
   3314  1.1      ross Returns the result of converting the extended double-precision floating-
   3315  1.1      ross point value `a' to the 32-bit two's complement integer format.  The
   3316  1.1      ross conversion is performed according to the IEC/IEEE Standard for Binary
   3317  1.1      ross Floating-Point Arithmetic---which means in particular that the conversion
   3318  1.1      ross is rounded according to the current rounding mode.  If `a' is a NaN, the
   3319  1.1      ross largest positive integer is returned.  Otherwise, if the conversion
   3320  1.1      ross overflows, the largest integer with the same sign as `a' is returned.
   3321  1.1      ross -------------------------------------------------------------------------------
   3322  1.1      ross */
   3323  1.1      ross int32 floatx80_to_int32( floatx80 a )  /* extended double to int32; rounding and packing are done by roundAndPackInt32 */
   3324  1.1      ross {
   3325  1.1      ross     flag aSign;
   3326  1.1      ross     int32 aExp, shiftCount;
   3327  1.1      ross     bits64 aSig;
   3328  1.1      ross 
   3329  1.1      ross     aSig = extractFloatx80Frac( a );
   3330  1.1      ross     aExp = extractFloatx80Exp( a );
   3331  1.1      ross     aSign = extractFloatx80Sign( a );
   3332  1.1      ross     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;  /* NaN (bits below the top significand bit are nonzero): treat as positive */
   3333  1.1      ross     shiftCount = 0x4037 - aExp;
   3334  1.1      ross     if ( shiftCount <= 0 ) shiftCount = 1;  /* large exponents: clamp the shift to 1 */
   3335  1.1      ross     shift64RightJamming( aSig, shiftCount, &aSig );
   3336  1.1      ross     return roundAndPackInt32( aSign, aSig );
   3337  1.1      ross 
   3338  1.1      ross }
   3339  1.1      ross 
   3340  1.1      ross /*
   3341  1.1      ross -------------------------------------------------------------------------------
   3342  1.1      ross Returns the result of converting the extended double-precision floating-
   3343  1.1      ross point value `a' to the 32-bit two's complement integer format.  The
   3344  1.1      ross conversion is performed according to the IEC/IEEE Standard for Binary
   3345  1.1      ross Floating-Point Arithmetic, except that the conversion is always rounded
   3346  1.1      ross toward zero.  If `a' is a NaN, the largest positive integer is returned.
   3347  1.1      ross Otherwise, if the conversion overflows, the largest integer with the same
   3348  1.1      ross sign as `a' is returned.
   3349  1.1      ross -------------------------------------------------------------------------------
   3350  1.1      ross */
   3351  1.1      ross int32 floatx80_to_int32_round_to_zero( floatx80 a )  /* extended double to int32, truncating toward zero */
   3352  1.1      ross {
   3353  1.1      ross     flag aSign;
   3354  1.1      ross     int32 aExp, shiftCount;
   3355  1.1      ross     bits64 aSig, savedASig;
   3356  1.1      ross     int32 z;
   3357  1.1      ross 
   3358  1.1      ross     aSig = extractFloatx80Frac( a );
   3359  1.1      ross     aExp = extractFloatx80Exp( a );
   3360  1.1      ross     aSign = extractFloatx80Sign( a );
   3361  1.1      ross     if ( 0x401E < aExp ) {  /* exponent field above 0x401E: always takes the invalid path */
   3362  1.1      ross         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;  /* NaN: treat as positive so the maximum is returned */
   3363  1.1      ross         goto invalid;
   3364  1.1      ross     }
   3365  1.1      ross     else if ( aExp < 0x3FFF ) {  /* exponent field below 0x3FFF: the result is 0 */
   3366  1.1      ross         if ( aExp || aSig ) float_set_inexact();  /* nonzero input: report inexact */
   3367  1.1      ross         return 0;
   3368  1.1      ross     }
   3369  1.1      ross     shiftCount = 0x403E - aExp;
   3370  1.1      ross     savedASig = aSig;  /* kept to detect discarded bits below */
   3371  1.1      ross     aSig >>= shiftCount;  /* plain shift: low bits are dropped */
   3372  1.1      ross     z = aSig;
   3373  1.1      ross     if ( aSign ) z = - z;
   3374  1.1      ross     if ( ( z < 0 ) ^ aSign ) {  /* sign of z disagrees with the input sign: out of range */
   3375  1.1      ross  invalid:  /* also reached by the goto above */
   3376  1.1      ross         float_raise( float_flag_invalid );
   3377  1.1      ross         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;  /* most negative or most positive int32, by sign */
   3378  1.1      ross     }
   3379  1.1      ross     if ( ( aSig<<shiftCount ) != savedASig ) {  /* shifting back loses bits: the conversion was inexact */
   3380  1.1      ross         float_set_inexact();
   3381  1.1      ross     }
   3382  1.1      ross     return z;
   3383  1.1      ross 
   3384  1.1      ross }
   3385  1.1      ross 
   3386  1.1      ross /*
   3387  1.1      ross -------------------------------------------------------------------------------
   3388  1.1      ross Returns the result of converting the extended double-precision floating-
   3389  1.1      ross point value `a' to the 64-bit two's complement integer format.  The
   3390  1.1      ross conversion is performed according to the IEC/IEEE Standard for Binary
   3391  1.1      ross Floating-Point Arithmetic---which means in particular that the conversion
   3392  1.1      ross is rounded according to the current rounding mode.  If `a' is a NaN,
   3393  1.1      ross the largest positive integer is returned.  Otherwise, if the conversion
   3394  1.1      ross overflows, the largest integer with the same sign as `a' is returned.
   3395  1.1      ross -------------------------------------------------------------------------------
   3396  1.1      ross */
   3397  1.1      ross int64 floatx80_to_int64( floatx80 a )
   3398  1.1      ross {
   3399  1.1      ross     flag aSign;
   3400  1.1      ross     int32 aExp, shiftCount;
   3401  1.1      ross     bits64 aSig, aSigExtra;
   3402  1.1      ross 
   3403  1.1      ross     aSig = extractFloatx80Frac( a );
   3404  1.1      ross     aExp = extractFloatx80Exp( a );
   3405  1.1      ross     aSign = extractFloatx80Sign( a );
   3406  1.1      ross     shiftCount = 0x403E - aExp;
   3407  1.1      ross     if ( shiftCount <= 0 ) {
   3408  1.1      ross         if ( shiftCount ) {
   3409  1.1      ross             float_raise( float_flag_invalid );
   3410  1.1      ross             if (    ! aSign
   3411  1.1      ross                  || (    ( aExp == 0x7FFF )
   3412  1.1      ross                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
   3413  1.1      ross                ) {
   3414  1.1      ross                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   3415  1.1      ross             }
   3416  1.1      ross             return (sbits64) LIT64( 0x8000000000000000 );
   3417  1.1      ross         }
   3418  1.1      ross         aSigExtra = 0;
   3419  1.1      ross     }
   3420  1.1      ross     else {
   3421  1.1      ross         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
   3422  1.1      ross     }
   3423  1.1      ross     return roundAndPackInt64( aSign, aSig, aSigExtra );
   3424  1.1      ross 
   3425  1.1      ross }
   3426  1.1      ross 
   3427  1.1      ross /*
   3428  1.1      ross -------------------------------------------------------------------------------
   3429  1.1      ross Returns the result of converting the extended double-precision floating-
   3430  1.1      ross point value `a' to the 64-bit two's complement integer format.  The
   3431  1.1      ross conversion is performed according to the IEC/IEEE Standard for Binary
   3432  1.1      ross Floating-Point Arithmetic, except that the conversion is always rounded
   3433  1.1      ross toward zero.  If `a' is a NaN, the largest positive integer is returned.
   3434  1.1      ross Otherwise, if the conversion overflows, the largest integer with the same
   3435  1.1      ross sign as `a' is returned.
   3436  1.1      ross -------------------------------------------------------------------------------
   3437  1.1      ross */
   3438  1.1      ross int64 floatx80_to_int64_round_to_zero( floatx80 a )
   3439  1.1      ross {
   3440  1.1      ross     flag aSign;
   3441  1.1      ross     int32 aExp, shiftCount;
   3442  1.1      ross     bits64 aSig;
   3443  1.1      ross     int64 z;
   3444  1.1      ross 
   3445  1.1      ross     aSig = extractFloatx80Frac( a );
   3446  1.1      ross     aExp = extractFloatx80Exp( a );
   3447  1.1      ross     aSign = extractFloatx80Sign( a );
   3448  1.1      ross     shiftCount = aExp - 0x403E;
   3449  1.1      ross     if ( 0 <= shiftCount ) {
   3450  1.1      ross         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
   3451  1.1      ross         if ( ( a.high != 0xC03E ) || aSig ) {
   3452  1.1      ross             float_raise( float_flag_invalid );
   3453  1.1      ross             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
   3454  1.1      ross                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   3455  1.1      ross             }
   3456  1.1      ross         }
   3457  1.1      ross         return (sbits64) LIT64( 0x8000000000000000 );
   3458  1.1      ross     }
   3459  1.1      ross     else if ( aExp < 0x3FFF ) {
   3460  1.1      ross         if ( aExp | aSig ) float_set_inexact();
   3461  1.1      ross         return 0;
   3462  1.1      ross     }
   3463  1.1      ross     z = aSig>>( - shiftCount );
   3464  1.1      ross     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
   3465  1.1      ross         float_set_inexact();
   3466  1.1      ross     }
   3467  1.1      ross     if ( aSign ) z = - z;
   3468  1.1      ross     return z;
   3469  1.1      ross 
   3470  1.1      ross }
   3471  1.1      ross 
   3472  1.1      ross /*
   3473  1.1      ross -------------------------------------------------------------------------------
   3474  1.1      ross Returns the result of converting the extended double-precision floating-
   3475  1.1      ross point value `a' to the single-precision floating-point format.  The
   3476  1.1      ross conversion is performed according to the IEC/IEEE Standard for Binary
   3477  1.1      ross Floating-Point Arithmetic.
   3478  1.1      ross -------------------------------------------------------------------------------
   3479  1.1      ross */
   3480  1.1      ross float32 floatx80_to_float32( floatx80 a )
   3481  1.1      ross {
   3482  1.1      ross     flag aSign;
   3483  1.1      ross     int32 aExp;
   3484  1.1      ross     bits64 aSig;
   3485  1.1      ross 
   3486  1.1      ross     aSig = extractFloatx80Frac( a );
   3487  1.1      ross     aExp = extractFloatx80Exp( a );
   3488  1.1      ross     aSign = extractFloatx80Sign( a );
   3489  1.1      ross     if ( aExp == 0x7FFF ) {
   3490  1.1      ross         if ( (bits64) ( aSig<<1 ) ) {
   3491  1.1      ross             return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
   3492  1.1      ross         }
   3493  1.1      ross         return packFloat32( aSign, 0xFF, 0 );
   3494  1.1      ross     }
   3495  1.1      ross     shift64RightJamming( aSig, 33, &aSig );
   3496  1.1      ross     if ( aExp || aSig ) aExp -= 0x3F81;
   3497  1.1      ross     return roundAndPackFloat32( aSign, aExp, aSig );
   3498  1.1      ross 
   3499  1.1      ross }
   3500  1.1      ross 
   3501  1.1      ross /*
   3502  1.1      ross -------------------------------------------------------------------------------
   3503  1.1      ross Returns the result of converting the extended double-precision floating-
   3504  1.1      ross point value `a' to the double-precision floating-point format.  The
   3505  1.1      ross conversion is performed according to the IEC/IEEE Standard for Binary
   3506  1.1      ross Floating-Point Arithmetic.
   3507  1.1      ross -------------------------------------------------------------------------------
   3508  1.1      ross */
   3509  1.1      ross float64 floatx80_to_float64( floatx80 a )
   3510  1.1      ross {
   3511  1.1      ross     flag aSign;
   3512  1.1      ross     int32 aExp;
   3513  1.1      ross     bits64 aSig, zSig;
   3514  1.1      ross 
   3515  1.1      ross     aSig = extractFloatx80Frac( a );
   3516  1.1      ross     aExp = extractFloatx80Exp( a );
   3517  1.1      ross     aSign = extractFloatx80Sign( a );
   3518  1.1      ross     if ( aExp == 0x7FFF ) {
   3519  1.1      ross         if ( (bits64) ( aSig<<1 ) ) {
   3520  1.1      ross             return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
   3521  1.1      ross         }
   3522  1.1      ross         return packFloat64( aSign, 0x7FF, 0 );
   3523  1.1      ross     }
   3524  1.1      ross     shift64RightJamming( aSig, 1, &zSig );
   3525  1.1      ross     if ( aExp || aSig ) aExp -= 0x3C01;
   3526  1.1      ross     return roundAndPackFloat64( aSign, aExp, zSig );
   3527  1.1      ross 
   3528  1.1      ross }
   3529  1.1      ross 
#ifdef FLOAT128

/*
-------------------------------------------------------------------------------
Converts the extended double-precision floating-point value `a' to the
quadruple-precision floating-point format, following the IEC/IEEE Standard
for Binary Floating-Point Arithmetic.  NaNs are passed through the common
NaN representation; for every other input the sign and exponent field are
reused unchanged and the fraction is repositioned.
-------------------------------------------------------------------------------
*/
float128 floatx80_to_float128( floatx80 a )
{
    bits64 sig, zSig0, zSig1;
    int16 exp;
    flag sign;

    sig = extractFloatx80Frac( a );
    exp = extractFloatx80Exp( a );
    sign = extractFloatx80Sign( a );
    if ( exp == 0x7FFF ) {
        if ( (bits64) ( sig<<1 ) ) {
            return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
        }
    }
    /* `sig<<1' discards the explicit integer bit; the 128-bit right shift
       by 16 then places the remaining fraction bits for packing. */
    shift128Right( sig<<1, 0, 16, &zSig0, &zSig1 );
    return packFloat128( sign, exp, zSig0, zSig1 );

}

#endif
   3558  1.1      ross 
   3559  1.1      ross /*
   3560  1.1      ross -------------------------------------------------------------------------------
   3561  1.1      ross Rounds the extended double-precision floating-point value `a' to an integer,
   3562  1.1      ross and returns the result as an extended quadruple-precision floating-point
   3563  1.1      ross value.  The operation is performed according to the IEC/IEEE Standard for
   3564  1.1      ross Binary Floating-Point Arithmetic.
   3565  1.1      ross -------------------------------------------------------------------------------
   3566  1.1      ross */
   3567  1.1      ross floatx80 floatx80_round_to_int( floatx80 a )
   3568  1.1      ross {
   3569  1.1      ross     flag aSign;
   3570  1.1      ross     int32 aExp;
   3571  1.1      ross     bits64 lastBitMask, roundBitsMask;
   3572  1.1      ross     int8 roundingMode;
   3573  1.1      ross     floatx80 z;
   3574  1.1      ross 
   3575  1.1      ross     aExp = extractFloatx80Exp( a );
   3576  1.1      ross     if ( 0x403E <= aExp ) {
   3577  1.1      ross         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
   3578  1.1      ross             return propagateFloatx80NaN( a, a );
   3579  1.1      ross         }
   3580  1.1      ross         return a;
   3581  1.1      ross     }
   3582  1.1      ross     if ( aExp < 0x3FFF ) {
   3583  1.1      ross         if (    ( aExp == 0 )
   3584  1.1      ross              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
   3585  1.1      ross             return a;
   3586  1.1      ross         }
   3587  1.1      ross         float_set_inexact();
   3588  1.1      ross         aSign = extractFloatx80Sign( a );
   3589  1.1      ross         switch ( float_rounding_mode() ) {
   3590  1.1      ross          case float_round_nearest_even:
   3591  1.1      ross             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
   3592  1.1      ross                ) {
   3593  1.1      ross                 return
   3594  1.1      ross                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
   3595  1.1      ross             }
   3596  1.1      ross             break;
   3597  1.1      ross          case float_round_down:
   3598  1.1      ross             return
   3599  1.1      ross                   aSign ?
   3600  1.1      ross                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
   3601  1.1      ross                 : packFloatx80( 0, 0, 0 );
   3602  1.1      ross          case float_round_up:
   3603  1.1      ross             return
   3604  1.1      ross                   aSign ? packFloatx80( 1, 0, 0 )
   3605  1.1      ross                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
   3606  1.1      ross         }
   3607  1.1      ross         return packFloatx80( aSign, 0, 0 );
   3608  1.1      ross     }
   3609  1.1      ross     lastBitMask = 1;
   3610  1.1      ross     lastBitMask <<= 0x403E - aExp;
   3611  1.1      ross     roundBitsMask = lastBitMask - 1;
   3612  1.1      ross     z = a;
   3613  1.1      ross     roundingMode = float_rounding_mode();
   3614  1.1      ross     if ( roundingMode == float_round_nearest_even ) {
   3615  1.1      ross         z.low += lastBitMask>>1;
   3616  1.1      ross         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
   3617  1.1      ross     }
   3618  1.1      ross     else if ( roundingMode != float_round_to_zero ) {
   3619  1.1      ross         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
   3620  1.1      ross             z.low += roundBitsMask;
   3621  1.1      ross         }
   3622  1.1      ross     }
   3623  1.1      ross     z.low &= ~ roundBitsMask;
   3624  1.1      ross     if ( z.low == 0 ) {
   3625  1.1      ross         ++z.high;
   3626  1.1      ross         z.low = LIT64( 0x8000000000000000 );
   3627  1.1      ross     }
   3628  1.1      ross     if ( z.low != a.low ) float_set_inexact();
   3629  1.1      ross     return z;
   3630  1.1      ross 
   3631  1.1      ross }
   3632  1.1      ross 
   3633  1.1      ross /*
   3634  1.1      ross -------------------------------------------------------------------------------
   3635  1.1      ross Returns the result of adding the absolute values of the extended double-
   3636  1.1      ross precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
   3637  1.1      ross negated before being returned.  `zSign' is ignored if the result is a NaN.
   3638  1.1      ross The addition is performed according to the IEC/IEEE Standard for Binary
   3639  1.1      ross Floating-Point Arithmetic.
   3640  1.1      ross -------------------------------------------------------------------------------
   3641  1.1      ross */
   3642  1.1      ross static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
   3643  1.1      ross {
   3644  1.1      ross     int32 aExp, bExp, zExp;
   3645  1.1      ross     bits64 aSig, bSig, zSig0, zSig1;
   3646  1.1      ross     int32 expDiff;
   3647  1.1      ross 
   3648  1.1      ross     aSig = extractFloatx80Frac( a );
   3649  1.1      ross     aExp = extractFloatx80Exp( a );
   3650  1.1      ross     bSig = extractFloatx80Frac( b );
   3651  1.1      ross     bExp = extractFloatx80Exp( b );
   3652  1.1      ross     expDiff = aExp - bExp;
   3653  1.1      ross     if ( 0 < expDiff ) {
   3654  1.1      ross         if ( aExp == 0x7FFF ) {
   3655  1.1      ross             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3656  1.1      ross             return a;
   3657  1.1      ross         }
   3658  1.1      ross         if ( bExp == 0 ) --expDiff;
   3659  1.1      ross         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
   3660  1.1      ross         zExp = aExp;
   3661  1.1      ross     }
   3662  1.1      ross     else if ( expDiff < 0 ) {
   3663  1.1      ross         if ( bExp == 0x7FFF ) {
   3664  1.1      ross             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3665  1.1      ross             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3666  1.1      ross         }
   3667  1.1      ross         if ( aExp == 0 ) ++expDiff;
   3668  1.1      ross         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
   3669  1.1      ross         zExp = bExp;
   3670  1.1      ross     }
   3671  1.1      ross     else {
   3672  1.1      ross         if ( aExp == 0x7FFF ) {
   3673  1.1      ross             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
   3674  1.1      ross                 return propagateFloatx80NaN( a, b );
   3675  1.1      ross             }
   3676  1.1      ross             return a;
   3677  1.1      ross         }
   3678  1.1      ross         zSig1 = 0;
   3679  1.1      ross         zSig0 = aSig + bSig;
   3680  1.1      ross         if ( aExp == 0 ) {
   3681  1.1      ross             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
   3682  1.1      ross             goto roundAndPack;
   3683  1.1      ross         }
   3684  1.1      ross         zExp = aExp;
   3685  1.1      ross         goto shiftRight1;
   3686  1.1      ross     }
   3687  1.1      ross     zSig0 = aSig + bSig;
   3688  1.1      ross     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
   3689  1.1      ross  shiftRight1:
   3690  1.1      ross     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
   3691  1.1      ross     zSig0 |= LIT64( 0x8000000000000000 );
   3692  1.1      ross     ++zExp;
   3693  1.1      ross  roundAndPack:
   3694  1.1      ross     return
   3695  1.1      ross         roundAndPackFloatx80(
   3696  1.1      ross             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
   3697  1.1      ross 
   3698  1.1      ross }
   3699  1.1      ross 
   3700  1.1      ross /*
   3701  1.1      ross -------------------------------------------------------------------------------
   3702  1.1      ross Returns the result of subtracting the absolute values of the extended
   3703  1.1      ross double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
   3704  1.1      ross difference is negated before being returned.  `zSign' is ignored if the
   3705  1.1      ross result is a NaN.  The subtraction is performed according to the IEC/IEEE
   3706  1.1      ross Standard for Binary Floating-Point Arithmetic.
   3707  1.1      ross -------------------------------------------------------------------------------
   3708  1.1      ross */
   3709  1.1      ross static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
   3710  1.1      ross {
   3711  1.1      ross     int32 aExp, bExp, zExp;
   3712  1.1      ross     bits64 aSig, bSig, zSig0, zSig1;
   3713  1.1      ross     int32 expDiff;
   3714  1.1      ross     floatx80 z;
   3715  1.1      ross 
   3716  1.1      ross     aSig = extractFloatx80Frac( a );
   3717  1.1      ross     aExp = extractFloatx80Exp( a );
   3718  1.1      ross     bSig = extractFloatx80Frac( b );
   3719  1.1      ross     bExp = extractFloatx80Exp( b );
   3720  1.1      ross     expDiff = aExp - bExp;
   3721  1.1      ross     if ( 0 < expDiff ) goto aExpBigger;
   3722  1.1      ross     if ( expDiff < 0 ) goto bExpBigger;
   3723  1.1      ross     if ( aExp == 0x7FFF ) {
   3724  1.1      ross         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
   3725  1.1      ross             return propagateFloatx80NaN( a, b );
   3726  1.1      ross         }
   3727  1.1      ross         float_raise( float_flag_invalid );
   3728  1.1      ross         z.low = floatx80_default_nan_low;
   3729  1.1      ross         z.high = floatx80_default_nan_high;
   3730  1.1      ross         return z;
   3731  1.1      ross     }
   3732  1.1      ross     if ( aExp == 0 ) {
   3733  1.1      ross         aExp = 1;
   3734  1.1      ross         bExp = 1;
   3735  1.1      ross     }
   3736  1.1      ross     zSig1 = 0;
   3737  1.1      ross     if ( bSig < aSig ) goto aBigger;
   3738  1.1      ross     if ( aSig < bSig ) goto bBigger;
   3739  1.1      ross     return packFloatx80( float_rounding_mode() == float_round_down, 0, 0 );
   3740  1.1      ross  bExpBigger:
   3741  1.1      ross     if ( bExp == 0x7FFF ) {
   3742  1.1      ross         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3743  1.1      ross         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3744  1.1      ross     }
   3745  1.1      ross     if ( aExp == 0 ) ++expDiff;
   3746  1.1      ross     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
   3747  1.1      ross  bBigger:
   3748  1.1      ross     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
   3749  1.1      ross     zExp = bExp;
   3750  1.1      ross     zSign ^= 1;
   3751  1.1      ross     goto normalizeRoundAndPack;
   3752  1.1      ross  aExpBigger:
   3753  1.1      ross     if ( aExp == 0x7FFF ) {
   3754  1.1      ross         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3755  1.1      ross         return a;
   3756  1.1      ross     }
   3757  1.1      ross     if ( bExp == 0 ) --expDiff;
   3758  1.1      ross     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
   3759  1.1      ross  aBigger:
   3760  1.1      ross     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
   3761  1.1      ross     zExp = aExp;
   3762  1.1      ross  normalizeRoundAndPack:
   3763  1.1      ross     return
   3764  1.1      ross         normalizeRoundAndPackFloatx80(
   3765  1.1      ross             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
   3766  1.1      ross 
   3767  1.1      ross }
   3768  1.1      ross 
   3769  1.1      ross /*
   3770  1.1      ross -------------------------------------------------------------------------------
   3771  1.1      ross Returns the result of adding the extended double-precision floating-point
   3772  1.1      ross values `a' and `b'.  The operation is performed according to the IEC/IEEE
   3773  1.1      ross Standard for Binary Floating-Point Arithmetic.
   3774  1.1      ross -------------------------------------------------------------------------------
   3775  1.1      ross */
   3776  1.1      ross floatx80 floatx80_add( floatx80 a, floatx80 b )
   3777  1.1      ross {
   3778  1.1      ross     flag aSign, bSign;
   3779  1.1      ross 
   3780  1.1      ross     aSign = extractFloatx80Sign( a );
   3781  1.1      ross     bSign = extractFloatx80Sign( b );
   3782  1.1      ross     if ( aSign == bSign ) {
   3783  1.1      ross         return addFloatx80Sigs( a, b, aSign );
   3784  1.1      ross     }
   3785  1.1      ross     else {
   3786  1.1      ross         return subFloatx80Sigs( a, b, aSign );
   3787  1.1      ross     }
   3788  1.1      ross 
   3789  1.1      ross }
   3790  1.1      ross 
   3791  1.1      ross /*
   3792  1.1      ross -------------------------------------------------------------------------------
   3793  1.1      ross Returns the result of subtracting the extended double-precision floating-
   3794  1.1      ross point values `a' and `b'.  The operation is performed according to the
   3795  1.1      ross IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3796  1.1      ross -------------------------------------------------------------------------------
   3797  1.1      ross */
   3798  1.1      ross floatx80 floatx80_sub( floatx80 a, floatx80 b )
   3799  1.1      ross {
   3800  1.1      ross     flag aSign, bSign;
   3801  1.1      ross 
   3802  1.1      ross     aSign = extractFloatx80Sign( a );
   3803  1.1      ross     bSign = extractFloatx80Sign( b );
   3804  1.1      ross     if ( aSign == bSign ) {
   3805  1.1      ross         return subFloatx80Sigs( a, b, aSign );
   3806  1.1      ross     }
   3807  1.1      ross     else {
   3808  1.1      ross         return addFloatx80Sigs( a, b, aSign );
   3809  1.1      ross     }
   3810  1.1      ross 
   3811  1.1      ross }
   3812  1.1      ross 
   3813  1.1      ross /*
   3814  1.1      ross -------------------------------------------------------------------------------
   3815  1.1      ross Returns the result of multiplying the extended double-precision floating-
   3816  1.1      ross point values `a' and `b'.  The operation is performed according to the
   3817  1.1      ross IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3818  1.1      ross -------------------------------------------------------------------------------
   3819  1.1      ross */
   3820  1.1      ross floatx80 floatx80_mul( floatx80 a, floatx80 b )
   3821  1.1      ross {
   3822  1.1      ross     flag aSign, bSign, zSign;
   3823  1.1      ross     int32 aExp, bExp, zExp;
   3824  1.1      ross     bits64 aSig, bSig, zSig0, zSig1;
   3825  1.1      ross     floatx80 z;
   3826  1.1      ross 
   3827  1.1      ross     aSig = extractFloatx80Frac( a );
   3828  1.1      ross     aExp = extractFloatx80Exp( a );
   3829  1.1      ross     aSign = extractFloatx80Sign( a );
   3830  1.1      ross     bSig = extractFloatx80Frac( b );
   3831  1.1      ross     bExp = extractFloatx80Exp( b );
   3832  1.1      ross     bSign = extractFloatx80Sign( b );
   3833  1.1      ross     zSign = aSign ^ bSign;
   3834  1.1      ross     if ( aExp == 0x7FFF ) {
   3835  1.1      ross         if (    (bits64) ( aSig<<1 )
   3836  1.1      ross              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
   3837  1.1      ross             return propagateFloatx80NaN( a, b );
   3838  1.1      ross         }
   3839  1.1      ross         if ( ( bExp | bSig ) == 0 ) goto invalid;
   3840  1.1      ross         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3841  1.1      ross     }
   3842  1.1      ross     if ( bExp == 0x7FFF ) {
   3843  1.1      ross         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3844  1.1      ross         if ( ( aExp | aSig ) == 0 ) {
   3845  1.1      ross  invalid:
   3846  1.1      ross             float_raise( float_flag_invalid );
   3847  1.1      ross             z.low = floatx80_default_nan_low;
   3848  1.1      ross             z.high = floatx80_default_nan_high;
   3849  1.1      ross             return z;
   3850  1.1      ross         }
   3851  1.1      ross         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3852  1.1      ross     }
   3853  1.1      ross     if ( aExp == 0 ) {
   3854  1.1      ross         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
   3855  1.1      ross         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
   3856  1.1      ross     }
   3857  1.1      ross     if ( bExp == 0 ) {
   3858  1.1      ross         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
   3859  1.1      ross         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
   3860  1.1      ross     }
   3861  1.1      ross     zExp = aExp + bExp - 0x3FFE;
   3862  1.1      ross     mul64To128( aSig, bSig, &zSig0, &zSig1 );
   3863  1.1      ross     if ( 0 < (sbits64) zSig0 ) {
   3864  1.1      ross         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
   3865  1.1      ross         --zExp;
   3866  1.1      ross     }
   3867  1.1      ross     return
   3868  1.1      ross         roundAndPackFloatx80(
   3869  1.1      ross             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
   3870  1.1      ross 
   3871  1.1      ross }
   3872  1.1      ross 
   3873  1.1      ross /*
   3874  1.1      ross -------------------------------------------------------------------------------
   3875  1.1      ross Returns the result of dividing the extended double-precision floating-point
   3876  1.1      ross value `a' by the corresponding value `b'.  The operation is performed
   3877  1.1      ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3878  1.1      ross -------------------------------------------------------------------------------
   3879  1.1      ross */
   3880  1.1      ross floatx80 floatx80_div( floatx80 a, floatx80 b )
   3881  1.1      ross {
   3882  1.1      ross     flag aSign, bSign, zSign;
   3883  1.1      ross     int32 aExp, bExp, zExp;
   3884  1.1      ross     bits64 aSig, bSig, zSig0, zSig1;
   3885  1.1      ross     bits64 rem0, rem1, rem2, term0, term1, term2;
   3886  1.1      ross     floatx80 z;
   3887  1.1      ross 
   3888  1.1      ross     aSig = extractFloatx80Frac( a );
   3889  1.1      ross     aExp = extractFloatx80Exp( a );
   3890  1.1      ross     aSign = extractFloatx80Sign( a );
   3891  1.1      ross     bSig = extractFloatx80Frac( b );
   3892  1.1      ross     bExp = extractFloatx80Exp( b );
   3893  1.1      ross     bSign = extractFloatx80Sign( b );
   3894  1.1      ross     zSign = aSign ^ bSign;
   3895  1.1      ross     if ( aExp == 0x7FFF ) {
   3896  1.1      ross         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3897  1.1      ross         if ( bExp == 0x7FFF ) {
   3898  1.1      ross             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3899  1.1      ross             goto invalid;
   3900  1.1      ross         }
   3901  1.1      ross         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3902  1.1      ross     }
   3903  1.1      ross     if ( bExp == 0x7FFF ) {
   3904  1.1      ross         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3905  1.1      ross         return packFloatx80( zSign, 0, 0 );
   3906  1.1      ross     }
   3907  1.1      ross     if ( bExp == 0 ) {
   3908  1.1      ross         if ( bSig == 0 ) {
   3909  1.1      ross             if ( ( aExp | aSig ) == 0 ) {
   3910  1.1      ross  invalid:
   3911  1.1      ross                 float_raise( float_flag_invalid );
   3912  1.1      ross                 z.low = floatx80_default_nan_low;
   3913  1.1      ross                 z.high = floatx80_default_nan_high;
   3914  1.1      ross                 return z;
   3915  1.1      ross             }
   3916  1.1      ross             float_raise( float_flag_divbyzero );
   3917  1.1      ross             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3918  1.1      ross         }
   3919  1.1      ross         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
   3920  1.1      ross     }
   3921  1.1      ross     if ( aExp == 0 ) {
   3922  1.1      ross         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
   3923  1.1      ross         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
   3924  1.1      ross     }
   3925  1.1      ross     zExp = aExp - bExp + 0x3FFE;
   3926  1.1      ross     rem1 = 0;
   3927  1.1      ross     if ( bSig <= aSig ) {
   3928  1.1      ross         shift128Right( aSig, 0, 1, &aSig, &rem1 );
   3929  1.1      ross         ++zExp;
   3930  1.1      ross     }
   3931  1.1      ross     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
   3932  1.1      ross     mul64To128( bSig, zSig0, &term0, &term1 );
   3933  1.1      ross     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
   3934  1.1      ross     while ( (sbits64) rem0 < 0 ) {
   3935  1.1      ross         --zSig0;
   3936  1.1      ross         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
   3937  1.1      ross     }
   3938  1.1      ross     zSig1 = estimateDiv128To64( rem1, 0, bSig );
   3939  1.1      ross     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
   3940  1.1      ross         mul64To128( bSig, zSig1, &term1, &term2 );
   3941  1.1      ross         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
   3942  1.1      ross         while ( (sbits64) rem1 < 0 ) {
   3943  1.1      ross             --zSig1;
   3944  1.1      ross             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
   3945  1.1      ross         }
   3946  1.1      ross         zSig1 |= ( ( rem1 | rem2 ) != 0 );
   3947  1.1      ross     }
   3948  1.1      ross     return
   3949  1.1      ross         roundAndPackFloatx80(
   3950  1.1      ross             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
   3951  1.1      ross 
   3952  1.1      ross }
   3953  1.1      ross 
   3954  1.1      ross /*
   3955  1.1      ross -------------------------------------------------------------------------------
   3956  1.1      ross Returns the remainder of the extended double-precision floating-point value
   3957  1.1      ross `a' with respect to the corresponding value `b'.  The operation is performed
   3958  1.1      ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3959  1.1      ross -------------------------------------------------------------------------------
   3960  1.1      ross */
   3961  1.1      ross floatx80 floatx80_rem( floatx80 a, floatx80 b )
   3962  1.1      ross {
   3963  1.1      ross     flag aSign, bSign, zSign;
   3964  1.1      ross     int32 aExp, bExp, expDiff;
   3965  1.1      ross     bits64 aSig0, aSig1, bSig;
   3966  1.1      ross     bits64 q, term0, term1, alternateASig0, alternateASig1;
   3967  1.1      ross     floatx80 z;
   3968  1.1      ross 
   3969  1.1      ross     aSig0 = extractFloatx80Frac( a );
   3970  1.1      ross     aExp = extractFloatx80Exp( a );
   3971  1.1      ross     aSign = extractFloatx80Sign( a );
   3972  1.1      ross     bSig = extractFloatx80Frac( b );
   3973  1.1      ross     bExp = extractFloatx80Exp( b );
   3974  1.1      ross     bSign = extractFloatx80Sign( b );
   3975  1.1      ross     if ( aExp == 0x7FFF ) {
   3976  1.1      ross         if (    (bits64) ( aSig0<<1 )
   3977  1.1      ross              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
   3978  1.1      ross             return propagateFloatx80NaN( a, b );
   3979  1.1      ross         }
   3980  1.1      ross         goto invalid;
   3981  1.1      ross     }
   3982  1.1      ross     if ( bExp == 0x7FFF ) {
   3983  1.1      ross         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3984  1.1      ross         return a;
   3985  1.1      ross     }
   3986  1.1      ross     if ( bExp == 0 ) {
   3987  1.1      ross         if ( bSig == 0 ) {
   3988  1.1      ross  invalid:
   3989  1.1      ross             float_raise( float_flag_invalid );
   3990  1.1      ross             z.low = floatx80_default_nan_low;
   3991  1.1      ross             z.high = floatx80_default_nan_high;
   3992  1.1      ross             return z;
   3993  1.1      ross         }
   3994  1.1      ross         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
   3995  1.1      ross     }
   3996  1.1      ross     if ( aExp == 0 ) {
   3997  1.1      ross         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
   3998  1.1      ross         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
   3999  1.1      ross     }
   4000  1.1      ross     bSig |= LIT64( 0x8000000000000000 );
   4001  1.1      ross     zSign = aSign;
   4002  1.1      ross     expDiff = aExp - bExp;
   4003  1.1      ross     aSig1 = 0;
   4004  1.1      ross     if ( expDiff < 0 ) {
   4005  1.1      ross         if ( expDiff < -1 ) return a;
   4006  1.1      ross         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
   4007  1.1      ross         expDiff = 0;
   4008  1.1      ross     }
   4009  1.1      ross     q = ( bSig <= aSig0 );
   4010  1.1      ross     if ( q ) aSig0 -= bSig;
   4011  1.1      ross     expDiff -= 64;
   4012  1.1      ross     while ( 0 < expDiff ) {
   4013  1.1      ross         q = estimateDiv128To64( aSig0, aSig1, bSig );
   4014  1.1      ross         q = ( 2 < q ) ? q - 2 : 0;
   4015  1.1      ross         mul64To128( bSig, q, &term0, &term1 );
   4016  1.1      ross         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
   4017  1.1      ross         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
   4018  1.1      ross         expDiff -= 62;
   4019  1.1      ross     }
   4020  1.1      ross     expDiff += 64;
   4021  1.1      ross     if ( 0 < expDiff ) {
   4022  1.1      ross         q = estimateDiv128To64( aSig0, aSig1, bSig );
   4023  1.1      ross         q = ( 2 < q ) ? q - 2 : 0;
   4024  1.1      ross         q >>= 64 - expDiff;
   4025  1.1      ross         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
   4026  1.1      ross         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
   4027  1.1      ross         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
   4028  1.1      ross         while ( le128( term0, term1, aSig0, aSig1 ) ) {
   4029  1.1      ross             ++q;
   4030  1.1      ross             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
   4031  1.1      ross         }
   4032  1.1      ross     }
   4033  1.1      ross     else {
   4034  1.1      ross         term1 = 0;
   4035  1.1      ross         term0 = bSig;
   4036  1.1      ross     }
   4037  1.1      ross     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
   4038  1.1      ross     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
   4039  1.1      ross          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
   4040  1.1      ross               && ( q & 1 ) )
   4041  1.1      ross        ) {
   4042  1.1      ross         aSig0 = alternateASig0;
   4043  1.1      ross         aSig1 = alternateASig1;
   4044  1.1      ross         zSign = ! zSign;
   4045  1.1      ross     }
   4046  1.1      ross     return
   4047  1.1      ross         normalizeRoundAndPackFloatx80(
   4048  1.1      ross             80, zSign, bExp + expDiff, aSig0, aSig1 );
   4049  1.1      ross 
   4050  1.1      ross }
   4051  1.1      ross 
   4052  1.1      ross /*
   4053  1.1      ross -------------------------------------------------------------------------------
   4054  1.1      ross Returns the square root of the extended double-precision floating-point
   4055  1.1      ross value `a'.  The operation is performed according to the IEC/IEEE Standard
   4056  1.1      ross for Binary Floating-Point Arithmetic.
                       A NaN operand is handed to propagateFloatx80NaN.  An operand with the
                       maximum exponent and the sign bit clear (+infinity) is returned as is, as
                       is an operand with the sign bit set whose exponent and significand are
                       both zero.  Any other operand with the sign bit set raises the invalid
                       exception and yields the default NaN.  The result is rounded to
                       `floatx80_rounding_precision'.
   4057  1.1      ross -------------------------------------------------------------------------------
   4058  1.1      ross */
   4059  1.1      ross floatx80 floatx80_sqrt( floatx80 a )
   4060  1.1      ross {
   4061  1.1      ross     flag aSign;
   4062  1.1      ross     int32 aExp, zExp;
   4063  1.1      ross     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
   4064  1.1      ross     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
   4065  1.1      ross     floatx80 z;
   4066  1.1      ross 
   4067  1.1      ross     aSig0 = extractFloatx80Frac( a );
   4068  1.1      ross     aExp = extractFloatx80Exp( a );
   4069  1.1      ross     aSign = extractFloatx80Sign( a );
                           /* Maximum exponent: NaN (any bit set below the top significand
                              bit), otherwise an infinity -- returned if positive, invalid if
                              negative. */
   4070  1.1      ross     if ( aExp == 0x7FFF ) {
   4071  1.1      ross         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
   4072  1.1      ross         if ( ! aSign ) return a;
   4073  1.1      ross         goto invalid;
   4074  1.1      ross     }
                           /* Sign bit set: only an operand with zero exponent and zero
                              significand is returned; everything else is invalid. */
   4075  1.1      ross     if ( aSign ) {
   4076  1.1      ross         if ( ( aExp | aSig0 ) == 0 ) return a;
   4077  1.1      ross  invalid:
   4078  1.1      ross         float_raise( float_flag_invalid );
   4079  1.1      ross         z.low = floatx80_default_nan_low;
   4080  1.1      ross         z.high = floatx80_default_nan_high;
   4081  1.1      ross         return z;
   4082  1.1      ross     }
                           /* Zero exponent: a zero significand gives +0; otherwise the
                              operand goes through normalizeFloatx80Subnormal first. */
   4083  1.1      ross     if ( aExp == 0 ) {
   4084  1.1      ross         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
   4085  1.1      ross         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
   4086  1.1      ross     }
                           /* Result exponent: halve the exponent's offset from 0x3FFF. */
   4087  1.1      ross     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
                           /* Initial estimate from estimateSqrt32 on the top 32 significand
                              bits, then widened using a 128-by-64 division estimate.  The
                              significand is first shifted right by 2 or 3 bits depending on
                              the low bit of the exponent. */
   4088  1.1      ross     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
   4089  1.1      ross     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
   4090  1.1      ross     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
   4091  1.1      ross     doubleZSig0 = zSig0<<1;
                           /* rem0:rem1 = a - zSig0^2.  While it is negative, lower zSig0 by
                              one and add 2*zSig0 + 1 (for the new zSig0) back in. */
   4092  1.1      ross     mul64To128( zSig0, zSig0, &term0, &term1 );
   4093  1.1      ross     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
   4094  1.1      ross     while ( (sbits64) rem0 < 0 ) {
   4095  1.1      ross         --zSig0;
   4096  1.1      ross         doubleZSig0 -= 2;
   4097  1.1      ross         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
   4098  1.1      ross     }
                           /* Low 64 result bits, estimated from the remainder. */
   4099  1.1      ross     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
                           /* The exact remainder is worked out only when the low 62 bits of
                              the estimate are <= 5 -- presumably the only case in which the
                              estimate's error could change the rounding; confirm against
                              estimateDiv128To64's error bound. */
   4100  1.1      ross     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
   4101  1.1      ross         if ( zSig1 == 0 ) zSig1 = 1;
   4102  1.1      ross         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
   4103  1.1      ross         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
   4104  1.1      ross         mul64To128( zSig1, zSig1, &term2, &term3 );
   4105  1.1      ross         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
                               /* Negative remainder: step zSig1 down and add the
                                  corresponding correction term back. */
   4106  1.1      ross         while ( (sbits64) rem1 < 0 ) {
   4107  1.1      ross             --zSig1;
   4108  1.1      ross             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
   4109  1.1      ross             term3 |= 1;
   4110  1.1      ross             term2 |= doubleZSig0;
   4111  1.1      ross             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
   4112  1.1      ross         }
                               /* Any nonzero remainder sets the lowest bit of zSig1. */
   4113  1.1      ross         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
   4114  1.1      ross     }
                           /* Shift zSig1 left one bit across both result words and merge in
                              the doubled high part. */
   4115  1.1      ross     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
   4116  1.1      ross     zSig0 |= doubleZSig0;
   4117  1.1      ross     return
   4118  1.1      ross         roundAndPackFloatx80(
   4119  1.1      ross             floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
   4120  1.1      ross 
   4121  1.1      ross }
   4122  1.1      ross 
   4123  1.1      ross /*
   4124  1.1      ross -------------------------------------------------------------------------------
   4125  1.1      ross Returns 1 if the extended double-precision floating-point value `a' is
   4126  1.1      ross equal to the corresponding value `b', and 0 otherwise.  The comparison is
   4127  1.1      ross performed according to the IEC/IEEE Standard for Binary Floating-Point
   4128  1.1      ross Arithmetic.
   4129  1.1      ross -------------------------------------------------------------------------------
   4130  1.1      ross */
   4131  1.1      ross flag floatx80_eq( floatx80 a, floatx80 b )
   4132  1.1      ross {
   4133  1.1      ross 
   4134  1.1      ross     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4135  1.1      ross               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4136  1.1      ross          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4137  1.1      ross               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4138  1.1      ross        ) {
   4139  1.1      ross         if (    floatx80_is_signaling_nan( a )
   4140  1.1      ross              || floatx80_is_signaling_nan( b ) ) {
   4141  1.1      ross             float_raise( float_flag_invalid );
   4142  1.1      ross         }
   4143  1.1      ross         return 0;
   4144  1.1      ross     }
   4145  1.1      ross     return
   4146  1.1      ross            ( a.low == b.low )
   4147  1.1      ross         && (    ( a.high == b.high )
   4148  1.1      ross              || (    ( a.low == 0 )
   4149  1.1      ross                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
   4150  1.1      ross            );
   4151  1.1      ross 
   4152  1.1      ross }
   4153  1.1      ross 
   4154  1.1      ross /*
   4155  1.1      ross -------------------------------------------------------------------------------
   4156  1.1      ross Returns 1 if the extended double-precision floating-point value `a' is
   4157  1.1      ross less than or equal to the corresponding value `b', and 0 otherwise.  The
   4158  1.1      ross comparison is performed according to the IEC/IEEE Standard for Binary
   4159  1.1      ross Floating-Point Arithmetic.
   4160  1.1      ross -------------------------------------------------------------------------------
   4161  1.1      ross */
   4162  1.1      ross flag floatx80_le( floatx80 a, floatx80 b )
   4163  1.1      ross {
   4164  1.1      ross     flag aSign, bSign;
   4165  1.1      ross 
   4166  1.1      ross     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4167  1.1      ross               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4168  1.1      ross          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4169  1.1      ross               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4170  1.1      ross        ) {
   4171  1.1      ross         float_raise( float_flag_invalid );
   4172  1.1      ross         return 0;
   4173  1.1      ross     }
   4174  1.1      ross     aSign = extractFloatx80Sign( a );
   4175  1.1      ross     bSign = extractFloatx80Sign( b );
   4176  1.1      ross     if ( aSign != bSign ) {
   4177  1.1      ross         return
   4178  1.1      ross                aSign
   4179  1.1      ross             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   4180  1.1      ross                  == 0 );
   4181  1.1      ross     }
   4182  1.1      ross     return
   4183  1.1      ross           aSign ? le128( b.high, b.low, a.high, a.low )
   4184  1.1      ross         : le128( a.high, a.low, b.high, b.low );
   4185  1.1      ross 
   4186  1.1      ross }
   4187  1.1      ross 
   4188  1.1      ross /*
   4189  1.1      ross -------------------------------------------------------------------------------
   4190  1.1      ross Returns 1 if the extended double-precision floating-point value `a' is
   4191  1.1      ross less than the corresponding value `b', and 0 otherwise.  The comparison
   4192  1.1      ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4193  1.1      ross Arithmetic.
   4194  1.1      ross -------------------------------------------------------------------------------
   4195  1.1      ross */
   4196  1.1      ross flag floatx80_lt( floatx80 a, floatx80 b )
   4197  1.1      ross {
   4198  1.1      ross     flag aSign, bSign;
   4199  1.1      ross 
   4200  1.1      ross     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4201  1.1      ross               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4202  1.1      ross          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4203  1.1      ross               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4204  1.1      ross        ) {
   4205  1.1      ross         float_raise( float_flag_invalid );
   4206  1.1      ross         return 0;
   4207  1.1      ross     }
   4208  1.1      ross     aSign = extractFloatx80Sign( a );
   4209  1.1      ross     bSign = extractFloatx80Sign( b );
   4210  1.1      ross     if ( aSign != bSign ) {
   4211  1.1      ross         return
   4212  1.1      ross                aSign
   4213  1.1      ross             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   4214  1.1      ross                  != 0 );
   4215  1.1      ross     }
   4216  1.1      ross     return
   4217  1.1      ross           aSign ? lt128( b.high, b.low, a.high, a.low )
   4218  1.1      ross         : lt128( a.high, a.low, b.high, b.low );
   4219  1.1      ross 
   4220  1.1      ross }
   4221  1.1      ross 
   4222  1.1      ross /*
   4223  1.1      ross -------------------------------------------------------------------------------
   4224  1.1      ross Returns 1 if the extended double-precision floating-point value `a' is equal
   4225  1.1      ross to the corresponding value `b', and 0 otherwise.  The invalid exception is
   4226  1.1      ross raised if either operand is a NaN.  Otherwise, the comparison is performed
   4227  1.1      ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4228  1.1      ross -------------------------------------------------------------------------------
   4229  1.1      ross */
   4230  1.1      ross flag floatx80_eq_signaling( floatx80 a, floatx80 b )
   4231  1.1      ross {
   4232  1.1      ross 
   4233  1.1      ross     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4234  1.1      ross               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4235  1.1      ross          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4236  1.1      ross               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4237  1.1      ross        ) {
   4238  1.1      ross         float_raise( float_flag_invalid );
   4239  1.1      ross         return 0;
   4240  1.1      ross     }
   4241  1.1      ross     return
   4242  1.1      ross            ( a.low == b.low )
   4243  1.1      ross         && (    ( a.high == b.high )
   4244  1.1      ross              || (    ( a.low == 0 )
   4245  1.1      ross                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
   4246  1.1      ross            );
   4247  1.1      ross 
   4248  1.1      ross }
   4249  1.1      ross 
   4250  1.1      ross /*
   4251  1.1      ross -------------------------------------------------------------------------------
   4252  1.1      ross Returns 1 if the extended double-precision floating-point value `a' is less
   4253  1.1      ross than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
   4254  1.1      ross do not cause an exception.  Otherwise, the comparison is performed according
   4255  1.1      ross to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4256  1.1      ross -------------------------------------------------------------------------------
   4257  1.1      ross */
   4258  1.1      ross flag floatx80_le_quiet( floatx80 a, floatx80 b )
   4259  1.1      ross {
   4260  1.1      ross     flag aSign, bSign;
   4261  1.1      ross 
   4262  1.1      ross     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4263  1.1      ross               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4264  1.1      ross          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4265  1.1      ross               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4266  1.1      ross        ) {
   4267  1.1      ross         if (    floatx80_is_signaling_nan( a )
   4268  1.1      ross              || floatx80_is_signaling_nan( b ) ) {
   4269  1.1      ross             float_raise( float_flag_invalid );
   4270  1.1      ross         }
   4271  1.1      ross         return 0;
   4272  1.1      ross     }
   4273  1.1      ross     aSign = extractFloatx80Sign( a );
   4274  1.1      ross     bSign = extractFloatx80Sign( b );
   4275  1.1      ross     if ( aSign != bSign ) {
   4276  1.1      ross         return
   4277  1.1      ross                aSign
   4278  1.1      ross             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   4279  1.1      ross                  == 0 );
   4280  1.1      ross     }
   4281  1.1      ross     return
   4282  1.1      ross           aSign ? le128( b.high, b.low, a.high, a.low )
   4283  1.1      ross         : le128( a.high, a.low, b.high, b.low );
   4284  1.1      ross 
   4285  1.1      ross }
   4286  1.1      ross 
   4287  1.1      ross /*
   4288  1.1      ross -------------------------------------------------------------------------------
   4289  1.1      ross Returns 1 if the extended double-precision floating-point value `a' is less
   4290  1.1      ross than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
   4291  1.1      ross an exception.  Otherwise, the comparison is performed according to the
   4292  1.1      ross IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4293  1.1      ross -------------------------------------------------------------------------------
   4294  1.1      ross */
   4295  1.1      ross flag floatx80_lt_quiet( floatx80 a, floatx80 b )
   4296  1.1      ross {
   4297  1.1      ross     flag aSign, bSign;
   4298  1.1      ross 
   4299  1.1      ross     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4300  1.1      ross               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4301  1.1      ross          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4302  1.1      ross               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4303  1.1      ross        ) {
   4304  1.1      ross         if (    floatx80_is_signaling_nan( a )
   4305  1.1      ross              || floatx80_is_signaling_nan( b ) ) {
   4306  1.1      ross             float_raise( float_flag_invalid );
   4307  1.1      ross         }
   4308  1.1      ross         return 0;
   4309  1.1      ross     }
   4310  1.1      ross     aSign = extractFloatx80Sign( a );
   4311  1.1      ross     bSign = extractFloatx80Sign( b );
   4312  1.1      ross     if ( aSign != bSign ) {
   4313  1.1      ross         return
   4314  1.1      ross                aSign
   4315  1.1      ross             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   4316  1.1      ross                  != 0 );
   4317  1.1      ross     }
   4318  1.1      ross     return
   4319  1.1      ross           aSign ? lt128( b.high, b.low, a.high, a.low )
   4320  1.1      ross         : lt128( a.high, a.low, b.high, b.low );
   4321  1.1      ross 
   4322  1.1      ross }
   4323  1.1      ross 
   4324  1.1      ross #endif
   4325  1.1      ross 
   4326  1.1      ross #ifdef FLOAT128
   4327  1.1      ross 
   4328  1.1      ross /*
   4329  1.1      ross -------------------------------------------------------------------------------
   4330  1.1      ross Returns the result of converting the quadruple-precision floating-point
   4331  1.1      ross value `a' to the 32-bit two's complement integer format.  The conversion
   4332  1.1      ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4333  1.1      ross Arithmetic---which means in particular that the conversion is rounded
   4334  1.1      ross according to the current rounding mode.  If `a' is a NaN, the largest
   4335  1.1      ross positive integer is returned.  Otherwise, if the conversion overflows, the
   4336  1.1      ross largest integer with the same sign as `a' is returned.
   4337  1.1      ross -------------------------------------------------------------------------------
   4338  1.1      ross */
   4339  1.1      ross int32 float128_to_int32( float128 a )
   4340  1.1      ross {
   4341  1.1      ross     flag aSign;
   4342  1.1      ross     int32 aExp, shiftCount;
   4343  1.1      ross     bits64 aSig0, aSig1;
   4344  1.1      ross 
   4345  1.1      ross     aSig1 = extractFloat128Frac1( a );
   4346  1.1      ross     aSig0 = extractFloat128Frac0( a );
   4347  1.1      ross     aExp = extractFloat128Exp( a );
   4348  1.1      ross     aSign = extractFloat128Sign( a );
   4349  1.1      ross     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
   4350  1.1      ross     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
   4351  1.1      ross     aSig0 |= ( aSig1 != 0 );
   4352  1.1      ross     shiftCount = 0x4028 - aExp;
   4353  1.1      ross     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
   4354  1.1      ross     return roundAndPackInt32( aSign, aSig0 );
   4355  1.1      ross 
   4356  1.1      ross }
   4357  1.1      ross 
   4358  1.1      ross /*
   4359  1.1      ross -------------------------------------------------------------------------------
   4360  1.1      ross Returns the result of converting the quadruple-precision floating-point
   4361  1.1      ross value `a' to the 32-bit two's complement integer format.  The conversion
   4362  1.1      ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4363  1.1      ross Arithmetic, except that the conversion is always rounded toward zero.  If
   4364  1.1      ross `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
   4365  1.1      ross conversion overflows, the largest integer with the same sign as `a' is
   4366  1.1      ross returned.
   4367  1.1      ross -------------------------------------------------------------------------------
   4368  1.1      ross */
   4369  1.1      ross int32 float128_to_int32_round_to_zero( float128 a )
   4370  1.1      ross {
   4371  1.1      ross     flag aSign;
   4372  1.1      ross     int32 aExp, shiftCount;
   4373  1.1      ross     bits64 aSig0, aSig1, savedASig;
   4374  1.1      ross     int32 z;
   4375  1.1      ross 
   4376  1.1      ross     aSig1 = extractFloat128Frac1( a );
   4377  1.1      ross     aSig0 = extractFloat128Frac0( a );
   4378  1.1      ross     aExp = extractFloat128Exp( a );
   4379  1.1      ross     aSign = extractFloat128Sign( a );
   4380  1.1      ross     aSig0 |= ( aSig1 != 0 );
   4381  1.1      ross     if ( 0x401E < aExp ) {
   4382  1.1      ross         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
   4383  1.1      ross         goto invalid;
   4384  1.1      ross     }
   4385  1.1      ross     else if ( aExp < 0x3FFF ) {
   4386  1.1      ross         if ( aExp || aSig0 ) float_set_inexact();
   4387  1.1      ross         return 0;
   4388  1.1      ross     }
   4389  1.1      ross     aSig0 |= LIT64( 0x0001000000000000 );
   4390  1.1      ross     shiftCount = 0x402F - aExp;
   4391  1.1      ross     savedASig = aSig0;
   4392  1.1      ross     aSig0 >>= shiftCount;
   4393  1.1      ross     z = aSig0;
   4394  1.1      ross     if ( aSign ) z = - z;
   4395  1.1      ross     if ( ( z < 0 ) ^ aSign ) {
   4396  1.1      ross  invalid:
   4397  1.1      ross         float_raise( float_flag_invalid );
   4398  1.1      ross         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
   4399  1.1      ross     }
   4400  1.1      ross     if ( ( aSig0<<shiftCount ) != savedASig ) {
   4401  1.1      ross         float_set_inexact();
   4402  1.1      ross     }
   4403  1.1      ross     return z;
   4404  1.1      ross 
   4405  1.1      ross }
   4406  1.1      ross 
   4407  1.1      ross /*
   4408  1.1      ross -------------------------------------------------------------------------------
   4409  1.1      ross Returns the result of converting the quadruple-precision floating-point
   4410  1.1      ross value `a' to the 64-bit two's complement integer format.  The conversion
   4411  1.1      ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4412  1.1      ross Arithmetic---which means in particular that the conversion is rounded
   4413  1.1      ross according to the current rounding mode.  If `a' is a NaN, the largest
   4414  1.1      ross positive integer is returned.  Otherwise, if the conversion overflows, the
   4415  1.1      ross largest integer with the same sign as `a' is returned.
   4416  1.1      ross -------------------------------------------------------------------------------
   4417  1.1      ross */
   4418  1.1      ross int64 float128_to_int64( float128 a )
   4419  1.1      ross {
   4420  1.1      ross     flag aSign;
   4421  1.1      ross     int32 aExp, shiftCount;
   4422  1.1      ross     bits64 aSig0, aSig1;
   4423  1.1      ross 
   4424  1.1      ross     aSig1 = extractFloat128Frac1( a );
   4425  1.1      ross     aSig0 = extractFloat128Frac0( a );
   4426  1.1      ross     aExp = extractFloat128Exp( a );
   4427  1.1      ross     aSign = extractFloat128Sign( a );
   4428  1.1      ross     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
   4429  1.1      ross     shiftCount = 0x402F - aExp;
   4430  1.1      ross     if ( shiftCount <= 0 ) {
   4431  1.1      ross         if ( 0x403E < aExp ) {
   4432  1.1      ross             float_raise( float_flag_invalid );
   4433  1.1      ross             if (    ! aSign
   4434  1.1      ross                  || (    ( aExp == 0x7FFF )
   4435  1.1      ross                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
   4436  1.1      ross                     )
   4437  1.1      ross                ) {
   4438  1.1      ross                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   4439  1.1      ross             }
   4440  1.1      ross             return (sbits64) LIT64( 0x8000000000000000 );
   4441  1.1      ross         }
   4442  1.1      ross         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
   4443  1.1      ross     }
   4444  1.1      ross     else {
   4445  1.1      ross         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
   4446  1.1      ross     }
   4447  1.1      ross     return roundAndPackInt64( aSign, aSig0, aSig1 );
   4448  1.1      ross 
   4449  1.1      ross }
   4450  1.1      ross 
   4451  1.1      ross /*
   4452  1.1      ross -------------------------------------------------------------------------------
   4453  1.1      ross Returns the result of converting the quadruple-precision floating-point
   4454  1.1      ross value `a' to the 64-bit two's complement integer format.  The conversion
   4455  1.1      ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4456  1.1      ross Arithmetic, except that the conversion is always rounded toward zero.
   4457  1.1      ross If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   4458  1.1      ross the conversion overflows, the largest integer with the same sign as `a' is
   4459  1.1      ross returned.
   4460  1.1      ross -------------------------------------------------------------------------------
   4461  1.1      ross */
   4462  1.1      ross int64 float128_to_int64_round_to_zero( float128 a )
   4463  1.1      ross {
   4464  1.1      ross     flag aSign;
   4465  1.1      ross     int32 aExp, shiftCount;
   4466  1.1      ross     bits64 aSig0, aSig1;
   4467  1.1      ross     int64 z;
   4468  1.1      ross 
   4469  1.1      ross     aSig1 = extractFloat128Frac1( a );
   4470  1.1      ross     aSig0 = extractFloat128Frac0( a );
   4471  1.1      ross     aExp = extractFloat128Exp( a );
   4472  1.1      ross     aSign = extractFloat128Sign( a );
   4473  1.1      ross     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
   4474  1.1      ross     shiftCount = aExp - 0x402F;
   4475  1.1      ross     if ( 0 < shiftCount ) {
   4476  1.1      ross         if ( 0x403E <= aExp ) {
   4477  1.1      ross             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
   4478  1.1      ross             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
   4479  1.1      ross                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
   4480  1.1      ross                 if ( aSig1 ) float_set_inexact();
   4481  1.1      ross             }
   4482  1.1      ross             else {
   4483  1.1      ross                 float_raise( float_flag_invalid );
   4484  1.1      ross                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
   4485  1.1      ross                     return LIT64( 0x7FFFFFFFFFFFFFFF );
   4486  1.1      ross                 }
   4487  1.1      ross             }
   4488  1.1      ross             return (sbits64) LIT64( 0x8000000000000000 );
   4489  1.1      ross         }
   4490  1.1      ross         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
   4491  1.1      ross         if ( (bits64) ( aSig1<<shiftCount ) ) {
   4492  1.1      ross             float_set_inexact();
   4493  1.1      ross         }
   4494  1.1      ross     }
   4495  1.1      ross     else {
   4496  1.1      ross         if ( aExp < 0x3FFF ) {
   4497  1.1      ross             if ( aExp | aSig0 | aSig1 ) {
   4498  1.1      ross                 float_set_inexact();
   4499  1.1      ross             }
   4500  1.1      ross             return 0;
   4501  1.1      ross         }
   4502  1.1      ross         z = aSig0>>( - shiftCount );
   4503  1.1      ross         if (    aSig1
   4504  1.1      ross              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
   4505  1.1      ross             float_set_inexact();
   4506  1.1      ross         }
   4507  1.1      ross     }
   4508  1.1      ross     if ( aSign ) z = - z;
   4509  1.1      ross     return z;
   4510  1.1      ross 
   4511  1.1      ross }
   4512  1.1      ross 
   4513  1.1      ross /*
   4514  1.1      ross -------------------------------------------------------------------------------
   4515  1.1      ross Returns the result of converting the quadruple-precision floating-point
   4516  1.1      ross value `a' to the single-precision floating-point format.  The conversion
   4517  1.1      ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4518  1.1      ross Arithmetic.
   4519  1.1      ross -------------------------------------------------------------------------------
   4520  1.1      ross */
   4521  1.1      ross float32 float128_to_float32( float128 a )
   4522  1.1      ross {
   4523  1.1      ross     flag aSign;
   4524  1.1      ross     int32 aExp;
   4525  1.1      ross     bits64 aSig0, aSig1;
   4526  1.1      ross     bits32 zSig;
   4527  1.1      ross 
   4528  1.1      ross     aSig1 = extractFloat128Frac1( a );
   4529  1.1      ross     aSig0 = extractFloat128Frac0( a );
   4530  1.1      ross     aExp = extractFloat128Exp( a );
   4531  1.1      ross     aSign = extractFloat128Sign( a );
   4532  1.1      ross     if ( aExp == 0x7FFF ) {
   4533  1.1      ross         if ( aSig0 | aSig1 ) {
   4534  1.1      ross             return commonNaNToFloat32( float128ToCommonNaN( a ) );
   4535  1.1      ross         }
   4536  1.1      ross         return packFloat32( aSign, 0xFF, 0 );
   4537  1.1      ross     }
   4538  1.1      ross     aSig0 |= ( aSig1 != 0 );
   4539  1.1      ross     shift64RightJamming( aSig0, 18, &aSig0 );
   4540  1.1      ross     zSig = aSig0;
   4541  1.1      ross     if ( aExp || zSig ) {
   4542  1.1      ross         zSig |= 0x40000000;
   4543  1.1      ross         aExp -= 0x3F81;
   4544  1.1      ross     }
   4545  1.1      ross     return roundAndPackFloat32( aSign, aExp, zSig );
   4546  1.1      ross 
   4547  1.1      ross }
   4548  1.1      ross 
   4549  1.1      ross /*
   4550  1.1      ross -------------------------------------------------------------------------------
   4551  1.1      ross Returns the result of converting the quadruple-precision floating-point
   4552  1.1      ross value `a' to the double-precision floating-point format.  The conversion
   4553  1.1      ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4554  1.1      ross Arithmetic.
   4555  1.1      ross -------------------------------------------------------------------------------
   4556  1.1      ross */
   4557  1.1      ross float64 float128_to_float64( float128 a )
   4558  1.1      ross {
   4559  1.1      ross     flag aSign;
   4560  1.1      ross     int32 aExp;
   4561  1.1      ross     bits64 aSig0, aSig1;
   4562  1.1      ross 
   4563  1.1      ross     aSig1 = extractFloat128Frac1( a );
   4564  1.1      ross     aSig0 = extractFloat128Frac0( a );
   4565  1.1      ross     aExp = extractFloat128Exp( a );
   4566  1.1      ross     aSign = extractFloat128Sign( a );
   4567  1.1      ross     if ( aExp == 0x7FFF ) {
   4568  1.1      ross         if ( aSig0 | aSig1 ) {
   4569  1.1      ross             return commonNaNToFloat64( float128ToCommonNaN( a ) );
   4570  1.1      ross         }
   4571  1.1      ross         return packFloat64( aSign, 0x7FF, 0 );
   4572  1.1      ross     }
   4573  1.1      ross     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
   4574  1.1      ross     aSig0 |= ( aSig1 != 0 );
   4575  1.1      ross     if ( aExp || aSig0 ) {
   4576  1.1      ross         aSig0 |= LIT64( 0x4000000000000000 );
   4577  1.1      ross         aExp -= 0x3C01;
   4578  1.1      ross     }
   4579  1.1      ross     return roundAndPackFloat64( aSign, aExp, aSig0 );
   4580  1.1      ross 
   4581  1.1      ross }
   4582  1.1      ross 
   4583  1.1      ross #ifdef FLOATX80
   4584  1.1      ross 
   4585  1.1      ross /*
   4586  1.1      ross -------------------------------------------------------------------------------
   4587  1.1      ross Returns the result of converting the quadruple-precision floating-point
   4588  1.1      ross value `a' to the extended double-precision floating-point format.  The
   4589  1.1      ross conversion is performed according to the IEC/IEEE Standard for Binary
   4590  1.1      ross Floating-Point Arithmetic.
   4591  1.1      ross -------------------------------------------------------------------------------
   4592  1.1      ross */
   4593  1.1      ross floatx80 float128_to_floatx80( float128 a )
   4594  1.1      ross {
   4595  1.1      ross     flag aSign;
   4596  1.1      ross     int32 aExp;
   4597  1.1      ross     bits64 aSig0, aSig1;
   4598  1.1      ross 
   4599  1.1      ross     aSig1 = extractFloat128Frac1( a );
   4600  1.1      ross     aSig0 = extractFloat128Frac0( a );
   4601  1.1      ross     aExp = extractFloat128Exp( a );
   4602  1.1      ross     aSign = extractFloat128Sign( a );
   4603  1.1      ross     if ( aExp == 0x7FFF ) {
   4604  1.1      ross         if ( aSig0 | aSig1 ) {
   4605  1.1      ross             return commonNaNToFloatx80( float128ToCommonNaN( a ) );
   4606  1.1      ross         }
   4607  1.1      ross         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   4608  1.1      ross     }
   4609  1.1      ross     if ( aExp == 0 ) {
   4610  1.1      ross         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
   4611  1.1      ross         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
   4612  1.1      ross     }
   4613  1.1      ross     else {
   4614  1.1      ross         aSig0 |= LIT64( 0x0001000000000000 );
   4615  1.1      ross     }
   4616  1.1      ross     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
   4617  1.1      ross     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
   4618  1.1      ross 
   4619  1.1      ross }
   4620  1.1      ross 
   4621  1.1      ross #endif
   4622  1.1      ross 
   4623  1.1      ross /*
   4624  1.1      ross -------------------------------------------------------------------------------
   4625  1.1      ross Rounds the quadruple-precision floating-point value `a' to an integer, and
   4626  1.1      ross returns the result as a quadruple-precision floating-point value.  The
   4627  1.1      ross operation is performed according to the IEC/IEEE Standard for Binary
   4628  1.1      ross Floating-Point Arithmetic.
   4629  1.1      ross -------------------------------------------------------------------------------
   4630  1.1      ross */
   4631  1.1      ross float128 float128_round_to_int( float128 a )
   4632  1.1      ross { /* Rounds `a' to an integral value using the mode returned by float_rounding_mode(). */
   4633  1.1      ross     flag aSign;
   4634  1.1      ross     int32 aExp;
   4635  1.1      ross     bits64 lastBitMask, roundBitsMask; /* the last kept bit, and all bits below it */
   4636  1.1      ross     int8 roundingMode;
   4637  1.1      ross     float128 z;
   4638  1.1      ross 
   4639  1.1      ross     aExp = extractFloat128Exp( a );
   4640  1.1      ross     if ( 0x402F <= aExp ) { /* on this path the masks are applied to z.low only */
   4641  1.1      ross         if ( 0x406F <= aExp ) { /* nothing is rounded: return a, or a propagated NaN */
   4642  1.1      ross             if (    ( aExp == 0x7FFF )
   4643  1.1      ross                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
   4644  1.1      ross                ) {
   4645  1.1      ross                 return propagateFloat128NaN( a, a );
   4646  1.1      ross             }
   4647  1.1      ross             return a;
   4648  1.1      ross         }
   4649  1.1      ross         lastBitMask = 1;
   4650  1.1      ross         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; /* shifted in two steps; the result is 0 when aExp == 0x402F */
   4651  1.1      ross         roundBitsMask = lastBitMask - 1; /* all ones when lastBitMask is 0 */
   4652  1.1      ross         z = a;
   4653  1.1      ross         roundingMode = float_rounding_mode();
   4654  1.1      ross         if ( roundingMode == float_round_nearest_even ) {
   4655  1.1      ross             if ( lastBitMask ) {
   4656  1.1      ross                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); /* add half of the last kept bit */
   4657  1.1      ross                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; /* exact tie: clear the last kept bit */
   4658  1.1      ross             }
   4659  1.1      ross             else { /* lastBitMask == 0: bit 63 of z.low is the half bit, bit 0 of z.high the last kept bit */
   4660  1.1      ross                 if ( (sbits64) z.low < 0 ) {
   4661  1.1      ross                     ++z.high;
   4662  1.1      ross                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1; /* exactly half: force bit 0 of z.high to 0 */
   4663  1.1      ross                 }
   4664  1.1      ross             }
   4665  1.1      ross         }
   4666  1.1      ross         else if ( roundingMode != float_round_to_zero ) {
   4667  1.1      ross             if (   extractFloat128Sign( z )
   4668  1.1      ross                  ^ ( roundingMode == float_round_up ) ) { /* sign set and mode not round_up, or sign clear and mode round_up */
   4669  1.1      ross                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low ); /* bump so the mask below rounds the magnitude up */
   4670  1.1      ross             }
   4671  1.1      ross         }
   4672  1.1      ross         z.low &= ~ roundBitsMask; /* clear every bit below the last kept bit */
   4673  1.1      ross     }
   4674  1.1      ross     else {
   4675  1.1      ross         if ( aExp < 0x3FFF ) { /* result is a signed zero or a value packed with exponent 0x3FFF */
   4676  1.1      ross             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a; /* every bit except the top bit of a.high is zero */
   4677  1.1      ross             float_set_inexact();
   4678  1.1      ross             aSign = extractFloat128Sign( a );
   4679  1.1      ross             switch ( float_rounding_mode() ) {
   4680  1.1      ross              case float_round_nearest_even:
   4681  1.1      ross                 if (    ( aExp == 0x3FFE )
   4682  1.1      ross                      && (   extractFloat128Frac0( a )
   4683  1.1      ross                           | extractFloat128Frac1( a ) )
   4684  1.1      ross                    ) {
   4685  1.1      ross                     return packFloat128( aSign, 0x3FFF, 0, 0 );
   4686  1.1      ross                 }
   4687  1.1      ross                 break;
   4688  1.1      ross              case float_round_down:
   4689  1.1      ross                 return
   4690  1.1      ross                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
   4691  1.1      ross                     : packFloat128( 0, 0, 0, 0 );
   4692  1.1      ross              case float_round_up:
   4693  1.1      ross                 return
   4694  1.1      ross                       aSign ? packFloat128( 1, 0, 0, 0 )
   4695  1.1      ross                     : packFloat128( 0, 0x3FFF, 0, 0 );
   4696  1.1      ross             }
   4697  1.1      ross             return packFloat128( aSign, 0, 0, 0 ); /* no case above returned: signed zero */
   4698  1.1      ross         }
   4699  1.1      ross         lastBitMask = 1;
   4700  1.1      ross         lastBitMask <<= 0x402F - aExp; /* shift count is 1..48 here, since 0x3FFF <= aExp < 0x402F */
   4701  1.1      ross         roundBitsMask = lastBitMask - 1;
   4702  1.1      ross         z.low = 0; /* the low word is dropped entirely on this path */
   4703  1.1      ross         z.high = a.high;
   4704  1.1      ross         roundingMode = float_rounding_mode();
   4705  1.1      ross         if ( roundingMode == float_round_nearest_even ) {
   4706  1.1      ross             z.high += lastBitMask>>1;
   4707  1.1      ross             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { /* a tie also requires a.low == 0 */
   4708  1.1      ross                 z.high &= ~ lastBitMask;
   4709  1.1      ross             }
   4710  1.1      ross         }
   4711  1.1      ross         else if ( roundingMode != float_round_to_zero ) {
   4712  1.1      ross             if (   extractFloat128Sign( z )
   4713  1.1      ross                  ^ ( roundingMode == float_round_up ) ) {
   4714  1.1      ross                 z.high |= ( a.low != 0 ); /* let a nonzero low word count as a discarded bit */
   4715  1.1      ross                 z.high += roundBitsMask;
   4716  1.1      ross             }
   4717  1.1      ross         }
   4718  1.1      ross         z.high &= ~ roundBitsMask;
   4719  1.1      ross     }
   4720  1.1      ross     if ( ( z.low != a.low ) || ( z.high != a.high ) ) { /* the result differs from the input */
   4721  1.1      ross         float_set_inexact();
   4722  1.1      ross     }
   4723  1.1      ross     return z;
   4724  1.1      ross 
   4725  1.1      ross }
   4726  1.1      ross 
   4727  1.1      ross /*
   4728  1.1      ross -------------------------------------------------------------------------------
   4729  1.1      ross Returns the result of adding the absolute values of the quadruple-precision
   4730  1.1      ross floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
   4731  1.1      ross before being returned.  `zSign' is ignored if the result is a NaN.
   4732  1.1      ross The addition is performed according to the IEC/IEEE Standard for Binary
   4733  1.1      ross Floating-Point Arithmetic.
   4734  1.1      ross -------------------------------------------------------------------------------
   4735  1.1      ross */
   4736  1.1      ross static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
   4737  1.1      ross { /* Adds the magnitudes of `a' and `b'; `zSign' is passed through as the sign of the packed result. */
   4738  1.1      ross     int32 aExp, bExp, zExp;
   4739  1.1      ross     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
   4740  1.1      ross     int32 expDiff;
   4741  1.1      ross 
   4742  1.1      ross     aSig1 = extractFloat128Frac1( a );
   4743  1.1      ross     aSig0 = extractFloat128Frac0( a );
   4744  1.1      ross     aExp = extractFloat128Exp( a );
   4745  1.1      ross     bSig1 = extractFloat128Frac1( b );
   4746  1.1      ross     bSig0 = extractFloat128Frac0( b );
   4747  1.1      ross     bExp = extractFloat128Exp( b );
   4748  1.1      ross     expDiff = aExp - bExp;
   4749  1.1      ross     if ( 0 < expDiff ) { /* a has the larger exponent: b is shifted right to align */
   4750  1.1      ross         if ( aExp == 0x7FFF ) {
   4751  1.1      ross             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
   4752  1.1      ross             return a;
   4753  1.1      ross         }
   4754  1.1      ross         if ( bExp == 0 ) {
   4755  1.1      ross             --expDiff; /* b gets no leading-one bit here and is shifted one place less */
   4756  1.1      ross         }
   4757  1.1      ross         else {
   4758  1.1      ross             bSig0 |= LIT64( 0x0001000000000000 ); /* set bit 48 above b's fraction */
   4759  1.1      ross         }
   4760  1.1      ross         shift128ExtraRightJamming(
   4761  1.1      ross             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); /* the third output word lands in zSig2 */
   4762  1.1      ross         zExp = aExp;
   4763  1.1      ross     }
   4764  1.1      ross     else if ( expDiff < 0 ) { /* b has the larger exponent: a is shifted right to align */
   4765  1.1      ross         if ( bExp == 0x7FFF ) {
   4766  1.1      ross             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
   4767  1.1      ross             return packFloat128( zSign, 0x7FFF, 0, 0 );
   4768  1.1      ross         }
   4769  1.1      ross         if ( aExp == 0 ) {
   4770  1.1      ross             ++expDiff; /* expDiff is negative here, so this also shortens the shift by one */
   4771  1.1      ross         }
   4772  1.1      ross         else {
   4773  1.1      ross             aSig0 |= LIT64( 0x0001000000000000 );
   4774  1.1      ross         }
   4775  1.1      ross         shift128ExtraRightJamming(
   4776  1.1      ross             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
   4777  1.1      ross         zExp = bExp;
   4778  1.1      ross     }
   4779  1.1      ross     else { /* equal exponents: no alignment shift */
   4780  1.1      ross         if ( aExp == 0x7FFF ) {
   4781  1.1      ross             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
   4782  1.1      ross                 return propagateFloat128NaN( a, b );
   4783  1.1      ross             }
   4784  1.1      ross             return a;
   4785  1.1      ross         }
   4786  1.1      ross         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
   4787  1.1      ross         if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 ); /* both exponents zero: the raw sum is packed with no rounding step */
   4788  1.1      ross         zSig2 = 0;
   4789  1.1      ross         zSig0 |= LIT64( 0x0002000000000000 ); /* bit 49; presumably the sum of the two leading ones, which were not set on this path */
   4790  1.1      ross         zExp = aExp;
   4791  1.1      ross         goto shiftRight1;
   4792  1.1      ross     }
   4793  1.1      ross     aSig0 |= LIT64( 0x0001000000000000 ); /* reached from both unequal-exponent paths; always applied to aSig0 */
   4794  1.1      ross     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
   4795  1.1      ross     --zExp;
   4796  1.1      ross     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; /* sum stayed below bit 49: keep the decremented zExp and skip the shift */
   4797  1.1      ross     ++zExp;
   4798  1.1      ross  shiftRight1:
   4799  1.1      ross     shift128ExtraRightJamming(
   4800  1.1      ross         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
   4801  1.1      ross  roundAndPack:
   4802  1.1      ross     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
   4803  1.1      ross 
   4804  1.1      ross }
   4805  1.1      ross 
   4806  1.1      ross /*
   4807  1.1      ross -------------------------------------------------------------------------------
   4808  1.1      ross Returns the result of subtracting the absolute values of the quadruple-
   4809  1.1      ross precision floating-point values `a' and `b'.  If `zSign' is 1, the
   4810  1.1      ross difference is negated before being returned.  `zSign' is ignored if the
   4811  1.1      ross result is a NaN.  The subtraction is performed according to the IEC/IEEE
   4812  1.1      ross Standard for Binary Floating-Point Arithmetic.
   4813  1.1      ross -------------------------------------------------------------------------------
   4814  1.1      ross */
   4815  1.1      ross static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
   4816  1.1      ross { /* Subtracts the magnitude of `b' from that of `a'; `zSign' is inverted when b's magnitude is the larger. */
   4817  1.1      ross     int32 aExp, bExp, zExp;
   4818  1.1      ross     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
   4819  1.1      ross     int32 expDiff;
   4820  1.1      ross     float128 z;
   4821  1.1      ross 
   4822  1.1      ross     aSig1 = extractFloat128Frac1( a );
   4823  1.1      ross     aSig0 = extractFloat128Frac0( a );
   4824  1.1      ross     aExp = extractFloat128Exp( a );
   4825  1.1      ross     bSig1 = extractFloat128Frac1( b );
   4826  1.1      ross     bSig0 = extractFloat128Frac0( b );
   4827  1.1      ross     bExp = extractFloat128Exp( b );
   4828  1.1      ross     expDiff = aExp - bExp;
   4829  1.1      ross     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); /* both significands are worked on 14 bits left of their extracted position */
   4830  1.1      ross     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
   4831  1.1      ross     if ( 0 < expDiff ) goto aExpBigger;
   4832  1.1      ross     if ( expDiff < 0 ) goto bExpBigger;
   4833  1.1      ross     if ( aExp == 0x7FFF ) { /* equal exponents, both 0x7FFF */
   4834  1.1      ross         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
   4835  1.1      ross             return propagateFloat128NaN( a, b );
   4836  1.1      ross         }
   4837  1.1      ross         float_raise( float_flag_invalid ); /* both fractions zero: raise invalid and return the default NaN */
   4838  1.1      ross         z.low = float128_default_nan_low;
   4839  1.1      ross         z.high = float128_default_nan_high;
   4840  1.1      ross         return z;
   4841  1.1      ross     }
   4842  1.1      ross     if ( aExp == 0 ) { /* equal exponents, both 0: continue with exponent 1 for both */
   4843  1.1      ross         aExp = 1;
   4844  1.1      ross         bExp = 1;
   4845  1.1      ross     }
   4846  1.1      ross     if ( bSig0 < aSig0 ) goto aBigger; /* equal exponents: compare significands, high word first */
   4847  1.1      ross     if ( aSig0 < bSig0 ) goto bBigger;
   4848  1.1      ross     if ( bSig1 < aSig1 ) goto aBigger;
   4849  1.1      ross     if ( aSig1 < bSig1 ) goto bBigger;
   4850  1.1      ross     return packFloat128( float_rounding_mode() == float_round_down, 0, 0, 0 ); /* equal magnitudes: zero, sign bit set only in round_down mode */
   4851  1.1      ross  bExpBigger:
   4852  1.1      ross     if ( bExp == 0x7FFF ) {
   4853  1.1      ross         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
   4854  1.1      ross         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); /* sign is the inverse of zSign */
   4855  1.1      ross     }
   4856  1.1      ross     if ( aExp == 0 ) {
   4857  1.1      ross         ++expDiff; /* a gets no leading-one bit; expDiff is negative, so the shift shrinks by one */
   4858  1.1      ross     }
   4859  1.1      ross     else {
   4860  1.1      ross         aSig0 |= LIT64( 0x4000000000000000 ); /* bit 62, i.e. bit 48 moved up by the 14-bit shift above */
   4861  1.1      ross     }
   4862  1.1      ross     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
   4863  1.1      ross     bSig0 |= LIT64( 0x4000000000000000 );
   4864  1.1      ross  bBigger:
   4865  1.1      ross     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
   4866  1.1      ross     zExp = bExp;
   4867  1.1      ross     zSign ^= 1; /* the difference is taken as b - a, so the sign is flipped */
   4868  1.1      ross     goto normalizeRoundAndPack;
   4869  1.1      ross  aExpBigger:
   4870  1.1      ross     if ( aExp == 0x7FFF ) {
   4871  1.1      ross         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
   4872  1.1      ross         return a;
   4873  1.1      ross     }
   4874  1.1      ross     if ( bExp == 0 ) {
   4875  1.1      ross         --expDiff;
   4876  1.1      ross     }
   4877  1.1      ross     else {
   4878  1.1      ross         bSig0 |= LIT64( 0x4000000000000000 );
   4879  1.1      ross     }
   4880  1.1      ross     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
   4881  1.1      ross     aSig0 |= LIT64( 0x4000000000000000 );
   4882  1.1      ross  aBigger:
   4883  1.1      ross     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
   4884  1.1      ross     zExp = aExp;
   4885  1.1      ross  normalizeRoundAndPack:
   4886  1.1      ross     --zExp;
   4887  1.1      ross     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 ); /* the exponent passed on is reduced by 14, matching the left shift applied at the top */
   4888  1.1      ross 
   4889  1.1      ross }
   4890  1.1      ross 
   4891  1.1      ross /*
   4892  1.1      ross -------------------------------------------------------------------------------
   4893  1.1      ross Returns the result of adding the quadruple-precision floating-point values
   4894  1.1      ross `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   4895  1.1      ross for Binary Floating-Point Arithmetic.
   4896  1.1      ross -------------------------------------------------------------------------------
   4897  1.1      ross */
   4898  1.1      ross float128 float128_add( float128 a, float128 b )
   4899  1.1      ross {
   4900  1.1      ross     flag aSign, bSign;
   4901  1.1      ross 
   4902  1.1      ross     aSign = extractFloat128Sign( a );
   4903  1.1      ross     bSign = extractFloat128Sign( b );
   4904  1.1      ross     if ( aSign == bSign ) {
   4905  1.1      ross         return addFloat128Sigs( a, b, aSign );
   4906  1.1      ross     }
   4907  1.1      ross     else {
   4908  1.1      ross         return subFloat128Sigs( a, b, aSign );
   4909  1.1      ross     }
   4910  1.1      ross 
   4911  1.1      ross }
   4912  1.1      ross 
   4913  1.1      ross /*
   4914  1.1      ross -------------------------------------------------------------------------------
   4915  1.1      ross Returns the result of subtracting the quadruple-precision floating-point
   4916  1.1      ross values `a' and `b'.  The operation is performed according to the IEC/IEEE
   4917  1.1      ross Standard for Binary Floating-Point Arithmetic.
   4918  1.1      ross -------------------------------------------------------------------------------
   4919  1.1      ross */
   4920  1.1      ross float128 float128_sub( float128 a, float128 b )
   4921  1.1      ross {
   4922  1.1      ross     flag aSign, bSign;
   4923  1.1      ross 
   4924  1.1      ross     aSign = extractFloat128Sign( a );
   4925  1.1      ross     bSign = extractFloat128Sign( b );
   4926  1.1      ross     if ( aSign == bSign ) {
   4927  1.1      ross         return subFloat128Sigs( a, b, aSign );
   4928  1.1      ross     }
   4929  1.1      ross     else {
   4930  1.1      ross         return addFloat128Sigs( a, b, aSign );
   4931  1.1      ross     }
   4932  1.1      ross 
   4933  1.1      ross }
   4934  1.1      ross 
   4935  1.1      ross /*
   4936  1.1      ross -------------------------------------------------------------------------------
   4937  1.1      ross Returns the result of multiplying the quadruple-precision floating-point
   4938  1.1      ross values `a' and `b'.  The operation is performed according to the IEC/IEEE
   4939  1.1      ross Standard for Binary Floating-Point Arithmetic.
   4940  1.1      ross -------------------------------------------------------------------------------
   4941  1.1      ross */
   4942  1.1      ross float128 float128_mul( float128 a, float128 b )
   4943  1.1      ross { /* Multiplies `a' by `b'; the result sign is the XOR of the operand signs. */
   4944  1.1      ross     flag aSign, bSign, zSign;
   4945  1.1      ross     int32 aExp, bExp, zExp;
   4946  1.1      ross     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
   4947  1.1      ross     float128 z;
   4948  1.1      ross 
   4949  1.1      ross     aSig1 = extractFloat128Frac1( a );
   4950  1.1      ross     aSig0 = extractFloat128Frac0( a );
   4951  1.1      ross     aExp = extractFloat128Exp( a );
   4952  1.1      ross     aSign = extractFloat128Sign( a );
   4953  1.1      ross     bSig1 = extractFloat128Frac1( b );
   4954  1.1      ross     bSig0 = extractFloat128Frac0( b );
   4955  1.1      ross     bExp = extractFloat128Exp( b );
   4956  1.1      ross     bSign = extractFloat128Sign( b );
   4957  1.1      ross     zSign = aSign ^ bSign;
   4958  1.1      ross     if ( aExp == 0x7FFF ) {
   4959  1.1      ross         if (    ( aSig0 | aSig1 )
   4960  1.1      ross              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { /* either operand is a NaN */
   4961  1.1      ross             return propagateFloat128NaN( a, b );
   4962  1.1      ross         }
   4963  1.1      ross         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; /* a has exponent 0x7FFF with zero fraction and b is zero */
   4964  1.1      ross         return packFloat128( zSign, 0x7FFF, 0, 0 );
   4965  1.1      ross     }
   4966  1.1      ross     if ( bExp == 0x7FFF ) {
   4967  1.1      ross         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
   4968  1.1      ross         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
   4969  1.1      ross  invalid:
   4970  1.1      ross             float_raise( float_flag_invalid ); /* also reached by the goto above: raise invalid, return the default NaN */
   4971  1.1      ross             z.low = float128_default_nan_low;
   4972  1.1      ross             z.high = float128_default_nan_high;
   4973  1.1      ross             return z;
   4974  1.1      ross         }
   4975  1.1      ross         return packFloat128( zSign, 0x7FFF, 0, 0 );
   4976  1.1      ross     }
   4977  1.1      ross     if ( aExp == 0 ) {
   4978  1.1      ross         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); /* a is zero: signed zero result */
   4979  1.1      ross         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
   4980  1.1      ross     }
   4981  1.1      ross     if ( bExp == 0 ) {
   4982  1.1      ross         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); /* b is zero: signed zero result */
   4983  1.1      ross         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
   4984  1.1      ross     }
   4985  1.1      ross     zExp = aExp + bExp - 0x4000;
   4986  1.1      ross     aSig0 |= LIT64( 0x0001000000000000 ); /* set bit 48 above a's fraction */
   4987  1.1      ross     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); /* b is shifted without a leading-one bit being set */
   4988  1.1      ross     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
   4989  1.1      ross     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); /* NOTE(review): presumably supplies the term for b's leading one, which was never set - confirm */
   4990  1.1      ross     zSig2 |= ( zSig3 != 0 ); /* fold the lowest product word into a sticky bit */
   4991  1.1      ross     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { /* product reached bit 49: shift right once and bump the exponent */
   4992  1.1      ross         shift128ExtraRightJamming(
   4993  1.1      ross             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
   4994  1.1      ross         ++zExp;
   4995  1.1      ross     }
   4996  1.1      ross     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
   4997  1.1      ross 
   4998  1.1      ross }
   4999  1.1      ross 
   5000  1.1      ross /*
   5001  1.1      ross -------------------------------------------------------------------------------
   5002  1.1      ross Returns the result of dividing the quadruple-precision floating-point value
   5003  1.1      ross `a' by the corresponding value `b'.  The operation is performed according to
   5004  1.1      ross the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5005  1.1      ross -------------------------------------------------------------------------------
   5006  1.1      ross */
   5007  1.1      ross float128 float128_div( float128 a, float128 b )
   5008  1.1      ross { /* Divides `a' by `b'; the result sign is the XOR of the operand signs. */
   5009  1.1      ross     flag aSign, bSign, zSign;
   5010  1.1      ross     int32 aExp, bExp, zExp;
   5011  1.1      ross     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
   5012  1.1      ross     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
   5013  1.1      ross     float128 z;
   5014  1.1      ross 
   5015  1.1      ross     aSig1 = extractFloat128Frac1( a );
   5016  1.1      ross     aSig0 = extractFloat128Frac0( a );
   5017  1.1      ross     aExp = extractFloat128Exp( a );
   5018  1.1      ross     aSign = extractFloat128Sign( a );
   5019  1.1      ross     bSig1 = extractFloat128Frac1( b );
   5020  1.1      ross     bSig0 = extractFloat128Frac0( b );
   5021  1.1      ross     bExp = extractFloat128Exp( b );
   5022  1.1      ross     bSign = extractFloat128Sign( b );
   5023  1.1      ross     zSign = aSign ^ bSign;
   5024  1.1      ross     if ( aExp == 0x7FFF ) {
   5025  1.1      ross         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
   5026  1.1      ross         if ( bExp == 0x7FFF ) {
   5027  1.1      ross             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
   5028  1.1      ross             goto invalid; /* both exponents 0x7FFF with zero fractions */
   5029  1.1      ross         }
   5030  1.1      ross         return packFloat128( zSign, 0x7FFF, 0, 0 );
   5031  1.1      ross     }
   5032  1.1      ross     if ( bExp == 0x7FFF ) {
   5033  1.1      ross         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
   5034  1.1      ross         return packFloat128( zSign, 0, 0, 0 ); /* b has exponent 0x7FFF and zero fraction: signed zero result */
   5035  1.1      ross     }
   5036  1.1      ross     if ( bExp == 0 ) {
   5037  1.1      ross         if ( ( bSig0 | bSig1 ) == 0 ) { /* divisor is zero */
   5038  1.1      ross             if ( ( aExp | aSig0 | aSig1 ) == 0 ) { /* dividend is zero as well */
   5039  1.1      ross  invalid:
   5040  1.1      ross                 float_raise( float_flag_invalid ); /* also reached by the goto above: raise invalid, return the default NaN */
   5041  1.1      ross                 z.low = float128_default_nan_low;
   5042  1.1      ross                 z.high = float128_default_nan_high;
   5043  1.1      ross                 return z;
   5044  1.1      ross             }
   5045  1.1      ross             float_raise( float_flag_divbyzero );
   5046  1.1      ross             return packFloat128( zSign, 0x7FFF, 0, 0 );
   5047  1.1      ross         }
   5048  1.1      ross         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
   5049  1.1      ross     }
   5050  1.1      ross     if ( aExp == 0 ) {
   5051  1.1      ross         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
   5052  1.1      ross         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
   5053  1.1      ross     }
   5054  1.1      ross     zExp = aExp - bExp + 0x3FFD;
   5055  1.1      ross     shortShift128Left(
   5056  1.1      ross         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); /* set bit 48, then shift left 15 so it lands in bit 63 */
   5057  1.1      ross     shortShift128Left(
   5058  1.1      ross         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
   5059  1.1      ross     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { /* b <= a: halve a and bump the exponent */
   5060  1.1      ross         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
   5061  1.1      ross         ++zExp;
   5062  1.1      ross     }
   5063  1.1      ross     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); /* estimate of the high quotient word */
   5064  1.1      ross     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
   5065  1.1      ross     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
   5066  1.1      ross     while ( (sbits64) rem0 < 0 ) { /* remainder negative: lower the quotient word and add b back */
   5067  1.1      ross         --zSig0;
   5068  1.1      ross         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
   5069  1.1      ross     }
   5070  1.1      ross     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); /* estimate of the low quotient word from the remainder */
   5071  1.1      ross     if ( ( zSig1 & 0x3FFF ) <= 4 ) { /* the exact remainder is computed only when the low 14 bits of the estimate are <= 4 */
   5072  1.1      ross         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
   5073  1.1      ross         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
   5074  1.1      ross         while ( (sbits64) rem1 < 0 ) {
   5075  1.1      ross             --zSig1;
   5076  1.1      ross             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
   5077  1.1      ross         }
   5078  1.1      ross         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); /* nonzero remainder: set the lowest quotient bit as a sticky bit */
   5079  1.1      ross     }
   5080  1.1      ross     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); /* shift the quotient right 15; the third output word lands in zSig2 */
   5081  1.1      ross     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
   5082  1.1      ross 
   5083  1.1      ross }
   5084  1.1      ross 
   5085  1.1      ross /*
   5086  1.1      ross -------------------------------------------------------------------------------
   5087  1.1      ross Returns the remainder of the quadruple-precision floating-point value `a'
   5088  1.1      ross with respect to the corresponding value `b'.  The operation is performed
   5089  1.1      ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5090  1.1      ross -------------------------------------------------------------------------------
   5091  1.1      ross */
   5092  1.1      ross float128 float128_rem( float128 a, float128 b )
   5093  1.1      ross {
   5094  1.1      ross     flag aSign, bSign, zSign;
   5095  1.1      ross     int32 aExp, bExp, expDiff;
   5096  1.1      ross     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
   5097  1.1      ross     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
   5098  1.1      ross     sbits64 sigMean0;
   5099  1.1      ross     float128 z;
   5100  1.1      ross 
                           /* Split both operands into fraction words, exponent and sign. */
   5101  1.1      ross     aSig1 = extractFloat128Frac1( a );
   5102  1.1      ross     aSig0 = extractFloat128Frac0( a );
   5103  1.1      ross     aExp = extractFloat128Exp( a );
   5104  1.1      ross     aSign = extractFloat128Sign( a );
   5105  1.1      ross     bSig1 = extractFloat128Frac1( b );
   5106  1.1      ross     bSig0 = extractFloat128Frac0( b );
   5107  1.1      ross     bExp = extractFloat128Exp( b );
   5108  1.1      ross     bSign = extractFloat128Sign( b );
                           /*
                            * a has the maximum exponent: if a or b is a NaN, propagate it;
                            * otherwise a is an infinity and the operation is invalid.
                            */
   5109  1.1      ross     if ( aExp == 0x7FFF ) {
   5110  1.1      ross         if (    ( aSig0 | aSig1 )
   5111  1.1      ross              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
   5112  1.1      ross             return propagateFloat128NaN( a, b );
   5113  1.1      ross         }
   5114  1.1      ross         goto invalid;
   5115  1.1      ross     }
                           /* b is a NaN: propagate it.  b is an infinity: the result is a. */
   5116  1.1      ross     if ( bExp == 0x7FFF ) {
   5117  1.1      ross         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
   5118  1.1      ross         return a;
   5119  1.1      ross     }
                           /*
                            * b is zero: raise invalid and return the default NaN.  A nonzero
                            * b with a zero exponent field is subnormal and is normalized.
                            */
   5120  1.1      ross     if ( bExp == 0 ) {
   5121  1.1      ross         if ( ( bSig0 | bSig1 ) == 0 ) {
   5122  1.1      ross  invalid:
   5123  1.1      ross             float_raise( float_flag_invalid );
   5124  1.1      ross             z.low = float128_default_nan_low;
   5125  1.1      ross             z.high = float128_default_nan_high;
   5126  1.1      ross             return z;
   5127  1.1      ross         }
   5128  1.1      ross         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
   5129  1.1      ross     }
                           /* a is zero: return it unchanged.  A subnormal a is normalized. */
   5130  1.1      ross     if ( aExp == 0 ) {
   5131  1.1      ross         if ( ( aSig0 | aSig1 ) == 0 ) return a;
   5132  1.1      ross         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
   5133  1.1      ross     }
   5134  1.1      ross     expDiff = aExp - bExp;
                           /*
                            * a's exponent is at least two below b's, so |a| < |b|/2 and a is
                            * already the remainder.
                            */
   5135  1.1      ross     if ( expDiff < -1 ) return a;
                           /*
                            * Set the implicit leading bit and shift both significands left;
                            * a is shifted one bit less when expDiff is negative (i.e. -1).
                            */
   5136  1.1      ross     shortShift128Left(
   5137  1.1      ross         aSig0 | LIT64( 0x0001000000000000 ),
   5138  1.1      ross         aSig1,
   5139  1.1      ross         15 - ( expDiff < 0 ),
   5140  1.1      ross         &aSig0,
   5141  1.1      ross         &aSig1
   5142  1.1      ross     );
   5143  1.1      ross     shortShift128Left(
   5144  1.1      ross         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
                           /* First quotient bit: if b <= a, subtract b once. */
   5145  1.1      ross     q = le128( bSig0, bSig1, aSig0, aSig1 );
   5146  1.1      ross     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
                           /*
                            * Main reduction: each iteration consumes 61 units of expDiff.
                            * The quotient estimate is lowered by 4 (clamped at 0) before it
                            * is used -- presumably so that it never exceeds the true quotient;
                            * confirm against estimateDiv128To64.
                            */
   5147  1.1      ross     expDiff -= 64;
   5148  1.1      ross     while ( 0 < expDiff ) {
   5149  1.1      ross         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
   5150  1.1      ross         q = ( 4 < q ) ? q - 4 : 0;
   5151  1.1      ross         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
   5152  1.1      ross         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
   5153  1.1      ross         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
   5154  1.1      ross         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
   5155  1.1      ross         expDiff -= 61;
   5156  1.1      ross     }
                           /*
                            * Final partial step.  expDiff is <= 0 here; when it is above -64
                            * the estimate is shifted right by -expDiff before being applied.
                            * Otherwise both significands are only shifted right by 12 bits
                            * (b receives the same 12-bit shift in the first branch).
                            */
   5157  1.1      ross     if ( -64 < expDiff ) {
   5158  1.1      ross         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
   5159  1.1      ross         q = ( 4 < q ) ? q - 4 : 0;
   5160  1.1      ross         q >>= - expDiff;
   5161  1.1      ross         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
   5162  1.1      ross         expDiff += 52;
   5163  1.1      ross         if ( expDiff < 0 ) {
   5164  1.1      ross             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
   5165  1.1      ross         }
   5166  1.1      ross         else {
   5167  1.1      ross             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
   5168  1.1      ross         }
   5169  1.1      ross         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
   5170  1.1      ross         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
   5171  1.1      ross     }
   5172  1.1      ross     else {
   5173  1.1      ross         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
   5174  1.1      ross         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
   5175  1.1      ross     }
                           /*
                            * Subtract b until the remainder turns negative, keeping the last
                            * non-negative value in alternateASig and counting each subtraction
                            * in q.
                            */
   5176  1.1      ross     do {
   5177  1.1      ross         alternateASig0 = aSig0;
   5178  1.1      ross         alternateASig1 = aSig1;
   5179  1.1      ross         ++q;
   5180  1.1      ross         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
   5181  1.1      ross     } while ( 0 <= (sbits64) aSig0 );
                           /*
                            * sigMean is the sum of the negative remainder and the last
                            * non-negative one.  Keep the non-negative candidate when it is
                            * the smaller in magnitude (sum negative), or on an exact tie when
                            * q is odd (the quotient that goes with it, q - 1, is then even).
                            */
   5182  1.1      ross     add128(
   5183  1.1      ross         aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
   5184  1.1      ross     if (    ( sigMean0 < 0 )
   5185  1.1      ross          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
   5186  1.1      ross         aSig0 = alternateASig0;
   5187  1.1      ross         aSig1 = alternateASig1;
   5188  1.1      ross     }
                           /*
                            * Take the magnitude of the chosen remainder; a negative remainder
                            * flips the sign of the result relative to a.
                            */
   5189  1.1      ross     zSign = ( (sbits64) aSig0 < 0 );
   5190  1.1      ross     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
   5191  1.1      ross     return
   5192  1.1      ross         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
   5193  1.1      ross 
   5194  1.1      ross }
   5195  1.1      ross 
   5196  1.1      ross /*
   5197  1.1      ross -------------------------------------------------------------------------------
   5198  1.1      ross Returns the square root of the quadruple-precision floating-point value `a'.
   5199  1.1      ross The operation is performed according to the IEC/IEEE Standard for Binary
   5200  1.1      ross Floating-Point Arithmetic.
   5201  1.1      ross -------------------------------------------------------------------------------
   5202  1.1      ross */
   5203  1.1      ross float128 float128_sqrt( float128 a )
   5204  1.1      ross {
   5205  1.1      ross     flag aSign;
   5206  1.1      ross     int32 aExp, zExp;
   5207  1.1      ross     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
   5208  1.1      ross     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
   5209  1.1      ross     float128 z;
   5210  1.1      ross 
   5211  1.1      ross     aSig1 = extractFloat128Frac1( a );
   5212  1.1      ross     aSig0 = extractFloat128Frac0( a );
   5213  1.1      ross     aExp = extractFloat128Exp( a );
   5214  1.1      ross     aSign = extractFloat128Sign( a );
                           /*
                            * Maximum exponent: a NaN is propagated, +infinity is returned
                            * unchanged, -infinity is invalid.
                            */
   5215  1.1      ross     if ( aExp == 0x7FFF ) {
   5216  1.1      ross         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
   5217  1.1      ross         if ( ! aSign ) return a;
   5218  1.1      ross         goto invalid;
   5219  1.1      ross     }
                           /*
                            * Negative zero is returned unchanged; every other negative operand
                            * raises invalid and yields the default NaN.
                            */
   5220  1.1      ross     if ( aSign ) {
   5221  1.1      ross         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
   5222  1.1      ross  invalid:
   5223  1.1      ross         float_raise( float_flag_invalid );
   5224  1.1      ross         z.low = float128_default_nan_low;
   5225  1.1      ross         z.high = float128_default_nan_high;
   5226  1.1      ross         return z;
   5227  1.1      ross     }
                           /* Only non-negative values reach here: +0 gives +0, subnormals are normalized. */
   5228  1.1      ross     if ( aExp == 0 ) {
   5229  1.1      ross         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
   5230  1.1      ross         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
   5231  1.1      ross     }
                           /* Result exponent: half of the unbiased exponent, re-biased with 0x3FFE. */
   5232  1.1      ross     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
   5233  1.1      ross     aSig0 |= LIT64( 0x0001000000000000 );
                           /*
                            * First root word: start from estimateSqrt32 and refine it with one
                            * division of the shifted significand by that estimate.  The left
                            * shift is one bit shorter when the exponent is odd.
                            */
   5234  1.1      ross     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
   5235  1.1      ross     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
   5236  1.1      ross     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
   5237  1.1      ross     doubleZSig0 = zSig0<<1;
                           /*
                            * rem = a - zSig0^2.  While it is negative zSig0 was too large:
                            * decrement it and add back 2*zSig0 + 1 (zSig0>>63 is the bit
                            * carried out of the doubling).
                            */
   5238  1.1      ross     mul64To128( zSig0, zSig0, &term0, &term1 );
   5239  1.1      ross     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
   5240  1.1      ross     while ( (sbits64) rem0 < 0 ) {
   5241  1.1      ross         --zSig0;
   5242  1.1      ross         doubleZSig0 -= 2;
   5243  1.1      ross         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
   5244  1.1      ross     }
                           /*
                            * Second root word from the remainder.  The exact remainder is only
                            * computed when the low 13 bits of the estimate are <= 5; it is then
                            * used to correct zSig1 and to OR in a sticky bit when nonzero.
                            * NOTE(review): presumably the estimate's error bound makes the
                            * correction unnecessary otherwise -- confirm.
                            */
   5245  1.1      ross     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
   5246  1.1      ross     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
   5247  1.1      ross         if ( zSig1 == 0 ) zSig1 = 1;
   5248  1.1      ross         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
   5249  1.1      ross         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
   5250  1.1      ross         mul64To128( zSig1, zSig1, &term2, &term3 );
   5251  1.1      ross         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
   5252  1.1      ross         while ( (sbits64) rem1 < 0 ) {
   5253  1.1      ross             --zSig1;
   5254  1.1      ross             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
   5255  1.1      ross             term3 |= 1;
   5256  1.1      ross             term2 |= doubleZSig0;
   5257  1.1      ross             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
   5258  1.1      ross         }
   5259  1.1      ross         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
   5260  1.1      ross     }
                           /* Shift right by 14 with jamming into zSig2, then round and pack with sign 0. */
   5261  1.1      ross     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
   5262  1.1      ross     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
   5263  1.1      ross 
   5264  1.1      ross }
   5265  1.1      ross 
   5266  1.1      ross /*
   5267  1.1      ross -------------------------------------------------------------------------------
   5268  1.1      ross Returns 1 if the quadruple-precision floating-point value `a' is equal to
   5269  1.1      ross the corresponding value `b', and 0 otherwise.  The comparison is performed
   5270  1.1      ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5271  1.1      ross -------------------------------------------------------------------------------
   5272  1.1      ross */
   5273  1.1      ross flag float128_eq( float128 a, float128 b )
   5274  1.1      ross {
   5275  1.1      ross 
                           /*
                            * Either operand is a NaN (maximum exponent, nonzero fraction):
                            * the result is 0, and invalid is raised only for a signaling NaN.
                            */
   5276  1.1      ross     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5277  1.1      ross               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5278  1.1      ross          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5279  1.1      ross               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5280  1.1      ross        ) {
   5281  1.1      ross         if (    float128_is_signaling_nan( a )
   5282  1.1      ross              || float128_is_signaling_nan( b ) ) {
   5283  1.1      ross             float_raise( float_flag_invalid );
   5284  1.1      ross         }
   5285  1.1      ross         return 0;
   5286  1.1      ross     }
                           /*
                            * Equal when the bit patterns match, or when both operands are
                            * zero: low words zero and high words zero once the sign bit is
                            * shifted out, so +0 and -0 compare equal.
                            */
   5287  1.1      ross     return
   5288  1.1      ross            ( a.low == b.low )
   5289  1.1      ross         && (    ( a.high == b.high )
   5290  1.1      ross              || (    ( a.low == 0 )
   5291  1.1      ross                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
   5292  1.1      ross            );
   5293  1.1      ross 
   5294  1.1      ross }
   5295  1.1      ross 
   5296  1.1      ross /*
   5297  1.1      ross -------------------------------------------------------------------------------
   5298  1.1      ross Returns 1 if the quadruple-precision floating-point value `a' is less than
   5299  1.1      ross or equal to the corresponding value `b', and 0 otherwise.  The comparison
   5300  1.1      ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
   5301  1.1      ross Arithmetic.
   5302  1.1      ross -------------------------------------------------------------------------------
   5303  1.1      ross */
   5304  1.1      ross flag float128_le( float128 a, float128 b )
   5305  1.1      ross {
   5306  1.1      ross     flag aSign, bSign;
   5307  1.1      ross 
                           /* Any NaN operand (quiet or signaling) raises invalid; the result is 0. */
   5308  1.1      ross     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5309  1.1      ross               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5310  1.1      ross          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5311  1.1      ross               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5312  1.1      ross        ) {
   5313  1.1      ross         float_raise( float_flag_invalid );
   5314  1.1      ross         return 0;
   5315  1.1      ross     }
   5316  1.1      ross     aSign = extractFloat128Sign( a );
   5317  1.1      ross     bSign = extractFloat128Sign( b );
                           /*
                            * Signs differ: a <= b when a is the negative operand, or when both
                            * operands are zero (every bit other than the sign bits is clear).
                            */
   5318  1.1      ross     if ( aSign != bSign ) {
   5319  1.1      ross         return
   5320  1.1      ross                aSign
   5321  1.1      ross             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   5322  1.1      ross                  == 0 );
   5323  1.1      ross     }
                           /*
                            * Same sign: compare the raw 128-bit patterns with le128, with the
                            * operands swapped when both are negative.
                            */
   5324  1.1      ross     return
   5325  1.1      ross           aSign ? le128( b.high, b.low, a.high, a.low )
   5326  1.1      ross         : le128( a.high, a.low, b.high, b.low );
   5327  1.1      ross 
   5328  1.1      ross }
   5329  1.1      ross 
   5330  1.1      ross /*
   5331  1.1      ross -------------------------------------------------------------------------------
   5332  1.1      ross Returns 1 if the quadruple-precision floating-point value `a' is less than
   5333  1.1      ross the corresponding value `b', and 0 otherwise.  The comparison is performed
   5334  1.1      ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5335  1.1      ross -------------------------------------------------------------------------------
   5336  1.1      ross */
   5337  1.1      ross flag float128_lt( float128 a, float128 b )
   5338  1.1      ross {
   5339  1.1      ross     flag aSign, bSign;
   5340  1.1      ross 
                           /* Any NaN operand (quiet or signaling) raises invalid; the result is 0. */
   5341  1.1      ross     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5342  1.1      ross               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5343  1.1      ross          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5344  1.1      ross               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5345  1.1      ross        ) {
   5346  1.1      ross         float_raise( float_flag_invalid );
   5347  1.1      ross         return 0;
   5348  1.1      ross     }
   5349  1.1      ross     aSign = extractFloat128Sign( a );
   5350  1.1      ross     bSign = extractFloat128Sign( b );
                           /*
                            * Signs differ: a < b only when a is the negative operand and the
                            * operands are not both zero (-0 is not less than +0).
                            */
   5351  1.1      ross     if ( aSign != bSign ) {
   5352  1.1      ross         return
   5353  1.1      ross                aSign
   5354  1.1      ross             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   5355  1.1      ross                  != 0 );
   5356  1.1      ross     }
                           /*
                            * Same sign: compare the raw 128-bit patterns with lt128, with the
                            * operands swapped when both are negative.
                            */
   5357  1.1      ross     return
   5358  1.1      ross           aSign ? lt128( b.high, b.low, a.high, a.low )
   5359  1.1      ross         : lt128( a.high, a.low, b.high, b.low );
   5360  1.1      ross 
   5361  1.1      ross }
   5362  1.1      ross 
   5363  1.1      ross /*
   5364  1.1      ross -------------------------------------------------------------------------------
   5365  1.1      ross Returns 1 if the quadruple-precision floating-point value `a' is equal to
   5366  1.1      ross the corresponding value `b', and 0 otherwise.  The invalid exception is
   5367  1.1      ross raised if either operand is a NaN.  Otherwise, the comparison is performed
   5368  1.1      ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5369  1.1      ross -------------------------------------------------------------------------------
   5370  1.1      ross */
   5371  1.1      ross flag float128_eq_signaling( float128 a, float128 b )
   5372  1.1      ross {
   5373  1.1      ross 
                           /*
                            * Unlike float128_eq, any NaN operand -- quiet or signaling --
                            * raises invalid here; the result is 0.
                            */
   5374  1.1      ross     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5375  1.1      ross               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5376  1.1      ross          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5377  1.1      ross               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5378  1.1      ross        ) {
   5379  1.1      ross         float_raise( float_flag_invalid );
   5380  1.1      ross         return 0;
   5381  1.1      ross     }
                           /*
                            * Equal when the bit patterns match, or when both operands are
                            * zero regardless of sign (the sign bit is shifted out of the
                            * high words before the test).
                            */
   5382  1.1      ross     return
   5383  1.1      ross            ( a.low == b.low )
   5384  1.1      ross         && (    ( a.high == b.high )
   5385  1.1      ross              || (    ( a.low == 0 )
   5386  1.1      ross                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
   5387  1.1      ross            );
   5388  1.1      ross 
   5389  1.1      ross }
   5390  1.1      ross 
   5391  1.1      ross /*
   5392  1.1      ross -------------------------------------------------------------------------------
   5393  1.1      ross Returns 1 if the quadruple-precision floating-point value `a' is less than
   5394  1.1      ross or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
   5395  1.1      ross cause an exception.  Otherwise, the comparison is performed according to the
   5396  1.1      ross IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5397  1.1      ross -------------------------------------------------------------------------------
   5398  1.1      ross */
   5399  1.1      ross flag float128_le_quiet( float128 a, float128 b )
   5400  1.1      ross {
   5401  1.1      ross     flag aSign, bSign;
   5402  1.1      ross 
                           /*
                            * Either operand is a NaN: the result is 0, and invalid is raised
                            * only when one of them is a signaling NaN.
                            */
   5403  1.1      ross     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5404  1.1      ross               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5405  1.1      ross          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5406  1.1      ross               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5407  1.1      ross        ) {
   5408  1.1      ross         if (    float128_is_signaling_nan( a )
   5409  1.1      ross              || float128_is_signaling_nan( b ) ) {
   5410  1.1      ross             float_raise( float_flag_invalid );
   5411  1.1      ross         }
   5412  1.1      ross         return 0;
   5413  1.1      ross     }
   5414  1.1      ross     aSign = extractFloat128Sign( a );
   5415  1.1      ross     bSign = extractFloat128Sign( b );
                           /*
                            * Signs differ: a <= b when a is the negative operand, or when both
                            * operands are zero (every bit other than the sign bits is clear).
                            */
   5416  1.1      ross     if ( aSign != bSign ) {
   5417  1.1      ross         return
   5418  1.1      ross                aSign
   5419  1.1      ross             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   5420  1.1      ross                  == 0 );
   5421  1.1      ross     }
                           /*
                            * Same sign: compare the raw 128-bit patterns with le128, with the
                            * operands swapped when both are negative.
                            */
   5422  1.1      ross     return
   5423  1.1      ross           aSign ? le128( b.high, b.low, a.high, a.low )
   5424  1.1      ross         : le128( a.high, a.low, b.high, b.low );
   5425  1.1      ross 
   5426  1.1      ross }
   5427  1.1      ross 
   5428  1.1      ross /*
   5429  1.1      ross -------------------------------------------------------------------------------
   5430  1.1      ross Returns 1 if the quadruple-precision floating-point value `a' is less than
   5431  1.1      ross the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
   5432  1.1      ross exception.  Otherwise, the comparison is performed according to the IEC/IEEE
   5433  1.1      ross Standard for Binary Floating-Point Arithmetic.
   5434  1.1      ross -------------------------------------------------------------------------------
   5435  1.1      ross */
   5436  1.1      ross flag float128_lt_quiet( float128 a, float128 b )
   5437  1.1      ross {
   5438  1.1      ross     flag aSign, bSign;
   5439  1.1      ross 
                           /*
                            * Either operand is a NaN: the result is 0, and invalid is raised
                            * only when one of them is a signaling NaN.
                            */
   5440  1.1      ross     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5441  1.1      ross               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5442  1.1      ross          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5443  1.1      ross               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5444  1.1      ross        ) {
   5445  1.1      ross         if (    float128_is_signaling_nan( a )
   5446  1.1      ross              || float128_is_signaling_nan( b ) ) {
   5447  1.1      ross             float_raise( float_flag_invalid );
   5448  1.1      ross         }
   5449  1.1      ross         return 0;
   5450  1.1      ross     }
   5451  1.1      ross     aSign = extractFloat128Sign( a );
   5452  1.1      ross     bSign = extractFloat128Sign( b );
                           /*
                            * Signs differ: a < b only when a is the negative operand and the
                            * operands are not both zero (-0 is not less than +0).
                            */
   5453  1.1      ross     if ( aSign != bSign ) {
   5454  1.1      ross         return
   5455  1.1      ross                aSign
   5456  1.1      ross             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   5457  1.1      ross                  != 0 );
   5458  1.1      ross     }
                           /*
                            * Same sign: compare the raw 128-bit patterns with lt128, with the
                            * operands swapped when both are negative.
                            */
   5459  1.1      ross     return
   5460  1.1      ross           aSign ? lt128( b.high, b.low, a.high, a.low )
   5461  1.1      ross         : lt128( a.high, a.low, b.high, b.low );
   5462  1.1      ross 
   5463  1.1      ross }
   5464  1.1      ross 
   5465  1.1      ross #endif
   5466  1.1      ross 
   5467  1.1      ross 
   5468  1.1      ross #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
   5469  1.1      ross 
   5470  1.1      ross /*
   5471  1.1      ross  * These two routines are not part of the original softfloat distribution.
   5472  1.1      ross  *
   5473  1.1      ross  * They are based on the corresponding conversions to integer but return
   5474  1.1      ross  * unsigned numbers instead since these functions are required by GCC.
   5475  1.1      ross  *
   5476  1.3    keihan  * Added by Mark Brinicombe <mark (at) NetBSD.org>	27/09/97
   5477  1.1      ross  *
   5478  1.1      ross  * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
   5479  1.1      ross  */
   5480  1.1      ross 
   5481  1.1      ross /*
   5482  1.1      ross -------------------------------------------------------------------------------
   5483  1.1      ross Returns the result of converting the double-precision floating-point value
   5484  1.1      ross `a' to the 32-bit unsigned integer format.  The conversion is
   5485  1.1      ross performed according to the IEC/IEEE Standard for Binary Floating-point
   5486  1.1      ross Arithmetic, except that the conversion is always rounded toward zero.  If
   5487  1.1      ross `a' is a NaN, the largest positive integer is returned.  If the conversion
   5488  1.1      ross overflows, the largest positive integer is returned.
   5489  1.1      ross -------------------------------------------------------------------------------
   5490  1.1      ross */
   5491  1.1      ross uint32 float64_to_uint32_round_to_zero( float64 a )
   5492  1.1      ross {
   5493  1.1      ross     flag aSign;
   5494  1.1      ross     int16 aExp, shiftCount;
   5495  1.1      ross     bits64 aSig, savedASig;
   5496  1.1      ross     uint32 z;
   5497  1.1      ross 
   5498  1.1      ross     aSig = extractFloat64Frac( a );
   5499  1.1      ross     aExp = extractFloat64Exp( a );
   5500  1.1      ross     aSign = extractFloat64Sign( a );
   5501  1.1      ross 
   5502  1.1      ross     if (aSign) {
   5503  1.1      ross         float_raise( float_flag_invalid );
   5504  1.1      ross     	return(0);
   5505  1.1      ross     }
   5506  1.1      ross 
   5507  1.1      ross     if ( 0x41E < aExp ) {
   5508  1.1      ross         float_raise( float_flag_invalid );
   5509  1.1      ross         return 0xffffffff;
   5510  1.1      ross     }
   5511  1.1      ross     else if ( aExp < 0x3FF ) {
   5512  1.1      ross         if ( aExp || aSig ) float_set_inexact();
   5513  1.1      ross         return 0;
   5514  1.1      ross     }
   5515  1.1      ross     aSig |= LIT64( 0x0010000000000000 );
   5516  1.1      ross     shiftCount = 0x433 - aExp;
   5517  1.1      ross     savedASig = aSig;
   5518  1.1      ross     aSig >>= shiftCount;
   5519  1.1      ross     z = aSig;
   5520  1.1      ross     if ( ( aSig<<shiftCount ) != savedASig ) {
   5521  1.1      ross         float_set_inexact();
   5522  1.1      ross     }
   5523  1.1      ross     return z;
   5524  1.1      ross 
   5525  1.1      ross }
   5526  1.1      ross 
   5527  1.1      ross /*
   5528  1.1      ross -------------------------------------------------------------------------------
   5529  1.1      ross Returns the result of converting the single-precision floating-point value
   5530  1.1      ross `a' to the 32-bit unsigned integer format.  The conversion is
   5531  1.1      ross performed according to the IEC/IEEE Standard for Binary Floating-point
   5532  1.1      ross Arithmetic, except that the conversion is always rounded toward zero.  If
   5533  1.1      ross `a' is a NaN, the largest positive integer is returned.  If the conversion
   5534  1.1      ross overflows, the largest positive integer is returned.
   5535  1.1      ross -------------------------------------------------------------------------------
   5536  1.1      ross */
   5537  1.1      ross uint32 float32_to_uint32_round_to_zero( float32 a )
   5538  1.1      ross {
   5539  1.1      ross     flag aSign;
   5540  1.1      ross     int16 aExp, shiftCount;
   5541  1.1      ross     bits32 aSig;
   5542  1.1      ross     uint32 z;
   5543  1.1      ross 
   5544  1.1      ross     aSig = extractFloat32Frac( a );
   5545  1.1      ross     aExp = extractFloat32Exp( a );
   5546  1.1      ross     aSign = extractFloat32Sign( a );
   5547  1.1      ross     shiftCount = aExp - 0x9E;
   5548  1.1      ross 
   5549  1.1      ross     if (aSign) {
   5550  1.1      ross         float_raise( float_flag_invalid );
   5551  1.1      ross     	return(0);
   5552  1.1      ross     }
   5553  1.1      ross     if ( 0 < shiftCount ) {
   5554  1.1      ross         float_raise( float_flag_invalid );
   5555  1.1      ross         return 0xFFFFFFFF;
   5556  1.1      ross     }
   5557  1.1      ross     else if ( aExp <= 0x7E ) {
   5558  1.1      ross         if ( aExp | aSig ) float_set_inexact();
   5559  1.1      ross         return 0;
   5560  1.1      ross     }
   5561  1.1      ross     aSig = ( aSig | 0x800000 )<<8;
   5562  1.1      ross     z = aSig>>( - shiftCount );
   5563  1.1      ross     if ( aSig<<( shiftCount & 31 ) ) {
   5564  1.1      ross         float_set_inexact();
   5565  1.1      ross     }
   5566  1.1      ross     return z;
   5567  1.1      ross 
   5568  1.1      ross }
   5569  1.1      ross 
   5570  1.1      ross #endif
   5571  1.2   thorpej 
   5572  1.2   thorpej #endif /* _STANDALONE */
   5573