Home | History | Annotate | Line # | Download | only in libkern
softfloat.c revision 1.7
      1  1.7   thorpej /* $NetBSD: softfloat.c,v 1.7 2020/09/02 03:45:54 thorpej Exp $ */
      2  1.1      ross 
      3  1.1      ross /*
      4  1.1      ross  * This version hacked for use with gcc -msoft-float by bjh21.
      5  1.1      ross  * (Mostly a case of #ifdefing out things GCC doesn't need or provides
      6  1.1      ross  *  itself).
      7  1.1      ross  */
      8  1.1      ross 
      9  1.1      ross /*
     10  1.1      ross  * Things you may want to define:
     11  1.1      ross  *
     12  1.1      ross  * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
     13  1.1      ross  *   -msoft-float) to work.  Include "softfloat-for-gcc.h" to get them
     14  1.1      ross  *   properly renamed.
     15  1.1      ross  */
     16  1.1      ross 
     17  1.7   thorpej /*============================================================================
     18  1.1      ross 
     19  1.7   thorpej This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
     20  1.7   thorpej Package, Release 2b.
     21  1.1      ross 
     22  1.1      ross Written by John R. Hauser.  This work was made possible in part by the
     23  1.1      ross International Computer Science Institute, located at Suite 600, 1947 Center
     24  1.1      ross Street, Berkeley, California 94704.  Funding was partially provided by the
     25  1.1      ross National Science Foundation under grant MIP-9311980.  The original version
     26  1.1      ross of this code was written as part of a project to build a fixed-point vector
     27  1.1      ross processor in collaboration with the University of California at Berkeley,
     28  1.1      ross overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
     29  1.7   thorpej is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
     30  1.1      ross arithmetic/SoftFloat.html'.
     31  1.1      ross 
     32  1.7   thorpej THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
     33  1.7   thorpej been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
     34  1.7   thorpej RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
     35  1.7   thorpej AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
     36  1.7   thorpej COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
     37  1.7   thorpej EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
     38  1.7   thorpej INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
     39  1.7   thorpej OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
     40  1.1      ross 
     41  1.1      ross Derivative works are acceptable, even for commercial purposes, so long as
     42  1.7   thorpej (1) the source code for the derivative work includes prominent notice that
     43  1.7   thorpej the work is derivative, and (2) the source code includes prominent notice with
     44  1.7   thorpej these four paragraphs for those parts of this code that are retained.
     45  1.1      ross 
     46  1.7   thorpej =============================================================================*/
     47  1.1      ross 
     48  1.2   thorpej /* If you need this in a boot program, you have bigger problems... */
     49  1.2   thorpej #ifndef _STANDALONE
     50  1.2   thorpej 
     51  1.1      ross #include <sys/cdefs.h>
     52  1.1      ross #if defined(LIBC_SCCS) && !defined(lint)
     53  1.7   thorpej __RCSID("$NetBSD: softfloat.c,v 1.7 2020/09/02 03:45:54 thorpej Exp $");
     54  1.1      ross #endif /* LIBC_SCCS and not lint */
     55  1.1      ross 
     56  1.1      ross #ifdef SOFTFLOAT_FOR_GCC
     57  1.1      ross #include "softfloat-for-gcc.h"
     58  1.1      ross #endif
     59  1.1      ross 
     60  1.1      ross #include "milieu.h"
     61  1.1      ross #include "softfloat.h"
     62  1.1      ross 
     63  1.1      ross /*
     64  1.1      ross  * Conversions between floats as stored in memory and floats as
     65  1.1      ross  * SoftFloat uses them
     66  1.1      ross  */
     67  1.1      ross #ifndef FLOAT64_DEMANGLE
     68  1.1      ross #define FLOAT64_DEMANGLE(a)	(a)
     69  1.1      ross #endif
     70  1.1      ross #ifndef FLOAT64_MANGLE
     71  1.1      ross #define FLOAT64_MANGLE(a)	(a)
     72  1.1      ross #endif
     73  1.1      ross 
     74  1.7   thorpej /*----------------------------------------------------------------------------
     75  1.7   thorpej | Floating-point rounding mode, extended double-precision rounding precision,
     76  1.7   thorpej | and exception flags.
     77  1.7   thorpej *----------------------------------------------------------------------------*/
     78  1.1      ross /*
     79  1.1      ross  * XXX: This may cause options-MULTIPROCESSOR or thread problems someday.
     80  1.1      ross  * 	Right now, it does not.  I've removed all other dynamic global
     81  1.1      ross  * 	variables. [ross]
     82  1.1      ross  */
     83  1.1      ross #ifdef FLOATX80
     84  1.1      ross int8 floatx80_rounding_precision = 80;
     85  1.1      ross #endif
     86  1.1      ross 
     87  1.7   thorpej /*----------------------------------------------------------------------------
     88  1.7   thorpej | Primitive arithmetic functions, including multi-word arithmetic, and
     89  1.7   thorpej | division and square root approximations.  (Can be specialized to target if
     90  1.7   thorpej | desired.)
     91  1.7   thorpej *----------------------------------------------------------------------------*/
     92  1.1      ross #include "softfloat-macros.h"
     93  1.1      ross 
     94  1.7   thorpej /*----------------------------------------------------------------------------
     95  1.7   thorpej | Functions and definitions to determine:  (1) whether tininess for underflow
     96  1.7   thorpej | is detected before or after rounding by default, (2) what (if anything)
     97  1.7   thorpej | happens when exceptions are raised, (3) how signaling NaNs are distinguished
     98  1.7   thorpej | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
     99  1.7   thorpej | are propagated from function inputs to output.  These details are target-
    100  1.7   thorpej | specific.
    101  1.7   thorpej *----------------------------------------------------------------------------*/
    102  1.1      ross #include "softfloat-specialize.h"
    103  1.1      ross 
    104  1.1      ross #ifndef SOFTFLOAT_FOR_GCC /* Not used */
    105  1.7   thorpej /*----------------------------------------------------------------------------
    106  1.7   thorpej | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
    107  1.7   thorpej | and 7, and returns the properly rounded 32-bit integer corresponding to the
    108  1.7   thorpej | input.  If `zSign' is 1, the input is negated before being converted to an
    109  1.7   thorpej | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
    110  1.7   thorpej | is simply rounded to an integer, with the inexact exception raised if the
    111  1.7   thorpej | input cannot be represented exactly as an integer.  However, if the fixed-
    112  1.7   thorpej | point input is too large, the invalid exception is raised and the largest
    113  1.7   thorpej | positive or negative integer is returned.
    114  1.7   thorpej *----------------------------------------------------------------------------*/
    115  1.7   thorpej 
    116  1.1      ross static int32 roundAndPackInt32( flag zSign, bits64 absZ )
    117  1.1      ross {
    118  1.1      ross     int8 roundingMode;
    119  1.1      ross     flag roundNearestEven;
    120  1.1      ross     int8 roundIncrement, roundBits;
    121  1.1      ross     int32 z;
    122  1.1      ross 
    123  1.1      ross     roundingMode = float_rounding_mode();
    124  1.1      ross     roundNearestEven = ( roundingMode == float_round_nearest_even );
    125  1.1      ross     roundIncrement = 0x40;
    126  1.1      ross     if ( ! roundNearestEven ) {
    127  1.1      ross         if ( roundingMode == float_round_to_zero ) {
    128  1.1      ross             roundIncrement = 0;
    129  1.1      ross         }
    130  1.1      ross         else {
    131  1.1      ross             roundIncrement = 0x7F;
    132  1.1      ross             if ( zSign ) {
    133  1.1      ross                 if ( roundingMode == float_round_up ) roundIncrement = 0;
    134  1.1      ross             }
    135  1.1      ross             else {
    136  1.1      ross                 if ( roundingMode == float_round_down ) roundIncrement = 0;
    137  1.1      ross             }
    138  1.1      ross         }
    139  1.1      ross     }
    140  1.1      ross     roundBits = absZ & 0x7F;
    141  1.1      ross     absZ = ( absZ + roundIncrement )>>7;
    142  1.1      ross     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
    143  1.1      ross     z = absZ;
    144  1.1      ross     if ( zSign ) z = - z;
    145  1.1      ross     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
    146  1.1      ross         float_raise( float_flag_invalid );
    147  1.1      ross         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
    148  1.1      ross     }
    149  1.1      ross     if ( roundBits ) float_set_inexact();
    150  1.1      ross     return z;
    151  1.1      ross 
    152  1.1      ross }
    153  1.1      ross 
    154  1.7   thorpej /*----------------------------------------------------------------------------
    155  1.7   thorpej | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
    156  1.7   thorpej | `absZ1', with binary point between bits 63 and 64 (between the input words),
    157  1.7   thorpej | and returns the properly rounded 64-bit integer corresponding to the input.
    158  1.7   thorpej | If `zSign' is 1, the input is negated before being converted to an integer.
    159  1.7   thorpej | Ordinarily, the fixed-point input is simply rounded to an integer, with
    160  1.7   thorpej | the inexact exception raised if the input cannot be represented exactly as
    161  1.7   thorpej | an integer.  However, if the fixed-point input is too large, the invalid
    162  1.7   thorpej | exception is raised and the largest positive or negative integer is
    163  1.7   thorpej | returned.
    164  1.7   thorpej *----------------------------------------------------------------------------*/
    165  1.7   thorpej 
    166  1.1      ross static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
    167  1.1      ross {
    168  1.1      ross     int8 roundingMode;
    169  1.1      ross     flag roundNearestEven, increment;
    170  1.1      ross     int64 z;
    171  1.1      ross 
    172  1.1      ross     roundingMode = float_rounding_mode();
    173  1.1      ross     roundNearestEven = ( roundingMode == float_round_nearest_even );
    174  1.1      ross     increment = ( (sbits64) absZ1 < 0 );
    175  1.1      ross     if ( ! roundNearestEven ) {
    176  1.1      ross         if ( roundingMode == float_round_to_zero ) {
    177  1.1      ross             increment = 0;
    178  1.1      ross         }
    179  1.1      ross         else {
    180  1.1      ross             if ( zSign ) {
    181  1.1      ross                 increment = ( roundingMode == float_round_down ) && absZ1;
    182  1.1      ross             }
    183  1.1      ross             else {
    184  1.1      ross                 increment = ( roundingMode == float_round_up ) && absZ1;
    185  1.1      ross             }
    186  1.1      ross         }
    187  1.1      ross     }
    188  1.1      ross     if ( increment ) {
    189  1.1      ross         ++absZ0;
    190  1.1      ross         if ( absZ0 == 0 ) goto overflow;
    191  1.1      ross         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
    192  1.1      ross     }
    193  1.1      ross     z = absZ0;
    194  1.1      ross     if ( zSign ) z = - z;
    195  1.1      ross     if ( z && ( ( z < 0 ) ^ zSign ) ) {
    196  1.1      ross  overflow:
    197  1.1      ross         float_raise( float_flag_invalid );
    198  1.1      ross         return
    199  1.1      ross               zSign ? (sbits64) LIT64( 0x8000000000000000 )
    200  1.1      ross             : LIT64( 0x7FFFFFFFFFFFFFFF );
    201  1.1      ross     }
    202  1.1      ross     if ( absZ1 ) float_set_inexact();
    203  1.1      ross     return z;
    204  1.1      ross 
    205  1.1      ross }
    206  1.6    martin 
    207  1.6    martin /* same as above, but for unsigned values */
    208  1.6    martin static uint64 roundAndPackUInt64( bits64 absZ0, bits64 absZ1 )
    209  1.6    martin {
    210  1.6    martin     int8 roundingMode;
    211  1.6    martin     flag roundNearestEven, increment;
    212  1.6    martin     uint64 z;
    213  1.6    martin 
    214  1.6    martin     roundingMode = float_rounding_mode();
    215  1.6    martin     roundNearestEven = ( roundingMode == float_round_nearest_even );
    216  1.6    martin     increment = ( (sbits64) absZ1 < 0 );
    217  1.6    martin     if ( ! roundNearestEven ) {
    218  1.6    martin         if ( roundingMode == float_round_to_zero ) {
    219  1.6    martin             increment = 0;
    220  1.6    martin         }
    221  1.6    martin         else {
    222  1.6    martin             increment = ( roundingMode == float_round_up ) && absZ1;
    223  1.6    martin         }
    224  1.6    martin     }
    225  1.6    martin     if ( increment ) {
    226  1.6    martin         ++absZ0;
    227  1.6    martin         if ( absZ0 == 0 ) {
    228  1.6    martin             float_raise( float_flag_invalid );
    229  1.6    martin             return LIT64( 0x7FFFFFFFFFFFFFFF );
    230  1.6    martin 	}
    231  1.6    martin         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
    232  1.6    martin     }
    233  1.6    martin     z = absZ0;
    234  1.6    martin     if ( absZ1 ) float_set_inexact();
    235  1.6    martin     return z;
    236  1.6    martin 
    237  1.6    martin }
    238  1.7   thorpej #endif /* SOFTFLOAT_FOR_GCC */
    239  1.7   thorpej 
    240  1.7   thorpej /*----------------------------------------------------------------------------
    241  1.7   thorpej | Returns the fraction bits of the single-precision floating-point value `a'.
    242  1.7   thorpej *----------------------------------------------------------------------------*/
    243  1.1      ross 
    244  1.1      ross INLINE bits32 extractFloat32Frac( float32 a )
    245  1.1      ross {
    246  1.1      ross 
    247  1.1      ross     return a & 0x007FFFFF;
    248  1.1      ross 
    249  1.1      ross }
    250  1.1      ross 
    251  1.7   thorpej /*----------------------------------------------------------------------------
    252  1.7   thorpej | Returns the exponent bits of the single-precision floating-point value `a'.
    253  1.7   thorpej *----------------------------------------------------------------------------*/
    254  1.7   thorpej 
    255  1.1      ross INLINE int16 extractFloat32Exp( float32 a )
    256  1.1      ross {
    257  1.1      ross 
    258  1.1      ross     return ( a>>23 ) & 0xFF;
    259  1.1      ross 
    260  1.1      ross }
    261  1.1      ross 
    262  1.7   thorpej /*----------------------------------------------------------------------------
    263  1.7   thorpej | Returns the sign bit of the single-precision floating-point value `a'.
    264  1.7   thorpej *----------------------------------------------------------------------------*/
    265  1.7   thorpej 
    266  1.1      ross INLINE flag extractFloat32Sign( float32 a )
    267  1.1      ross {
    268  1.1      ross 
    269  1.1      ross     return a>>31;
    270  1.1      ross 
    271  1.1      ross }
    272  1.1      ross 
    273  1.7   thorpej /*----------------------------------------------------------------------------
    274  1.7   thorpej | Normalizes the subnormal single-precision floating-point value represented
    275  1.7   thorpej | by the denormalized significand `aSig'.  The normalized exponent and
    276  1.7   thorpej | significand are stored at the locations pointed to by `zExpPtr' and
    277  1.7   thorpej | `zSigPtr', respectively.
    278  1.7   thorpej *----------------------------------------------------------------------------*/
    279  1.7   thorpej 
    280  1.1      ross static void
    281  1.1      ross  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
    282  1.1      ross {
    283  1.1      ross     int8 shiftCount;
    284  1.1      ross 
    285  1.1      ross     shiftCount = countLeadingZeros32( aSig ) - 8;
    286  1.1      ross     *zSigPtr = aSig<<shiftCount;
    287  1.1      ross     *zExpPtr = 1 - shiftCount;
    288  1.1      ross 
    289  1.1      ross }
    290  1.1      ross 
    291  1.7   thorpej /*----------------------------------------------------------------------------
    292  1.7   thorpej | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
    293  1.7   thorpej | single-precision floating-point value, returning the result.  After being
    294  1.7   thorpej | shifted into the proper positions, the three fields are simply added
    295  1.7   thorpej | together to form the result.  This means that any integer portion of `zSig'
    296  1.7   thorpej | will be added into the exponent.  Since a properly normalized significand
    297  1.7   thorpej | will have an integer portion equal to 1, the `zExp' input should be 1 less
    298  1.7   thorpej | than the desired result exponent whenever `zSig' is a complete, normalized
    299  1.7   thorpej | significand.
    300  1.7   thorpej *----------------------------------------------------------------------------*/
    301  1.7   thorpej 
    302  1.1      ross INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
    303  1.1      ross {
    304  1.1      ross 
    305  1.1      ross     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
    306  1.1      ross 
    307  1.1      ross }
    308  1.1      ross 
    309  1.7   thorpej /*----------------------------------------------------------------------------
    310  1.7   thorpej | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    311  1.7   thorpej | and significand `zSig', and returns the proper single-precision floating-
    312  1.7   thorpej | point value corresponding to the abstract input.  Ordinarily, the abstract
    313  1.7   thorpej | value is simply rounded and packed into the single-precision format, with
    314  1.7   thorpej | the inexact exception raised if the abstract input cannot be represented
    315  1.7   thorpej | exactly.  However, if the abstract value is too large, the overflow and
    316  1.7   thorpej | inexact exceptions are raised and an infinity or maximal finite value is
    317  1.7   thorpej | returned.  If the abstract value is too small, the input value is rounded to
    318  1.7   thorpej | a subnormal number, and the underflow and inexact exceptions are raised if
    319  1.7   thorpej | the abstract input cannot be represented exactly as a subnormal single-
    320  1.7   thorpej | precision floating-point number.
    321  1.7   thorpej |     The input significand `zSig' has its binary point between bits 30
    322  1.7   thorpej | and 29, which is 7 bits to the left of the usual location.  This shifted
    323  1.7   thorpej | significand must be normalized or smaller.  If `zSig' is not normalized,
    324  1.7   thorpej | `zExp' must be 0; in that case, the result returned is a subnormal number,
    325  1.7   thorpej | and it must not require rounding.  In the usual case that `zSig' is
    326  1.7   thorpej | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
    327  1.7   thorpej | The handling of underflow and overflow follows the IEC/IEEE Standard for
    328  1.7   thorpej | Binary Floating-Point Arithmetic.
    329  1.7   thorpej *----------------------------------------------------------------------------*/
    330  1.7   thorpej 
    331  1.1      ross static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
    332  1.1      ross {
    333  1.1      ross     int8 roundingMode;
    334  1.1      ross     flag roundNearestEven;
    335  1.1      ross     int8 roundIncrement, roundBits;
    336  1.1      ross     flag isTiny;
    337  1.1      ross 
    338  1.1      ross     roundingMode = float_rounding_mode();
    339  1.1      ross     roundNearestEven = ( roundingMode == float_round_nearest_even );
    340  1.1      ross     roundIncrement = 0x40;
    341  1.1      ross     if ( ! roundNearestEven ) {
    342  1.1      ross         if ( roundingMode == float_round_to_zero ) {
    343  1.1      ross             roundIncrement = 0;
    344  1.1      ross         }
    345  1.1      ross         else {
    346  1.1      ross             roundIncrement = 0x7F;
    347  1.1      ross             if ( zSign ) {
    348  1.1      ross                 if ( roundingMode == float_round_up ) roundIncrement = 0;
    349  1.1      ross             }
    350  1.1      ross             else {
    351  1.1      ross                 if ( roundingMode == float_round_down ) roundIncrement = 0;
    352  1.1      ross             }
    353  1.1      ross         }
    354  1.1      ross     }
    355  1.1      ross     roundBits = zSig & 0x7F;
    356  1.1      ross     if ( 0xFD <= (bits16) zExp ) {
    357  1.1      ross         if (    ( 0xFD < zExp )
    358  1.1      ross              || (    ( zExp == 0xFD )
    359  1.1      ross                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
    360  1.1      ross            ) {
    361  1.1      ross             float_raise( float_flag_overflow | float_flag_inexact );
    362  1.1      ross             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
    363  1.1      ross         }
    364  1.1      ross         if ( zExp < 0 ) {
    365  1.1      ross             isTiny =
    366  1.1      ross                    ( float_detect_tininess == float_tininess_before_rounding )
    367  1.1      ross                 || ( zExp < -1 )
    368  1.1      ross                 || ( zSig + roundIncrement < 0x80000000 );
    369  1.1      ross             shift32RightJamming( zSig, - zExp, &zSig );
    370  1.1      ross             zExp = 0;
    371  1.1      ross             roundBits = zSig & 0x7F;
    372  1.1      ross             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
    373  1.1      ross         }
    374  1.1      ross     }
    375  1.1      ross     if ( roundBits ) float_set_inexact();
    376  1.1      ross     zSig = ( zSig + roundIncrement )>>7;
    377  1.1      ross     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
    378  1.1      ross     if ( zSig == 0 ) zExp = 0;
    379  1.1      ross     return packFloat32( zSign, zExp, zSig );
    380  1.1      ross 
    381  1.1      ross }
    382  1.1      ross 
    383  1.7   thorpej /*----------------------------------------------------------------------------
    384  1.7   thorpej | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    385  1.7   thorpej | and significand `zSig', and returns the proper single-precision floating-
    386  1.7   thorpej | point value corresponding to the abstract input.  This routine is just like
    387  1.7   thorpej | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
    388  1.7   thorpej | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
    389  1.7   thorpej | floating-point exponent.
    390  1.7   thorpej *----------------------------------------------------------------------------*/
    391  1.7   thorpej 
    392  1.1      ross static float32
    393  1.1      ross  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
    394  1.1      ross {
    395  1.1      ross     int8 shiftCount;
    396  1.1      ross 
    397  1.1      ross     shiftCount = countLeadingZeros32( zSig ) - 1;
    398  1.1      ross     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
    399  1.1      ross 
    400  1.1      ross }
    401  1.1      ross 
    402  1.7   thorpej /*----------------------------------------------------------------------------
    403  1.7   thorpej | Returns the fraction bits of the double-precision floating-point value `a'.
    404  1.7   thorpej *----------------------------------------------------------------------------*/
    405  1.7   thorpej 
    406  1.1      ross INLINE bits64 extractFloat64Frac( float64 a )
    407  1.1      ross {
    408  1.1      ross 
    409  1.1      ross     return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
    410  1.1      ross 
    411  1.1      ross }
    412  1.1      ross 
    413  1.7   thorpej /*----------------------------------------------------------------------------
    414  1.7   thorpej | Returns the exponent bits of the double-precision floating-point value `a'.
    415  1.7   thorpej *----------------------------------------------------------------------------*/
    416  1.7   thorpej 
    417  1.1      ross INLINE int16 extractFloat64Exp( float64 a )
    418  1.1      ross {
    419  1.1      ross 
    420  1.1      ross     return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
    421  1.1      ross 
    422  1.1      ross }
    423  1.7   thorpej /*----------------------------------------------------------------------------
    424  1.7   thorpej | Returns the sign bit of the double-precision floating-point value `a'.
    425  1.7   thorpej *----------------------------------------------------------------------------*/
    426  1.1      ross 
    427  1.1      ross INLINE flag extractFloat64Sign( float64 a )
    428  1.1      ross {
    429  1.1      ross 
    430  1.1      ross     return FLOAT64_DEMANGLE(a)>>63;
    431  1.1      ross 
    432  1.1      ross }
    433  1.1      ross 
    434  1.7   thorpej /*----------------------------------------------------------------------------
    435  1.7   thorpej | Normalizes the subnormal double-precision floating-point value represented
    436  1.7   thorpej | by the denormalized significand `aSig'.  The normalized exponent and
    437  1.7   thorpej | significand are stored at the locations pointed to by `zExpPtr' and
    438  1.7   thorpej | `zSigPtr', respectively.
    439  1.7   thorpej *----------------------------------------------------------------------------*/
    440  1.7   thorpej 
    441  1.1      ross static void
    442  1.1      ross  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
    443  1.1      ross {
    444  1.1      ross     int8 shiftCount;
    445  1.1      ross 
    446  1.1      ross     shiftCount = countLeadingZeros64( aSig ) - 11;
    447  1.1      ross     *zSigPtr = aSig<<shiftCount;
    448  1.1      ross     *zExpPtr = 1 - shiftCount;
    449  1.1      ross 
    450  1.1      ross }
    451  1.1      ross 
    452  1.7   thorpej /*----------------------------------------------------------------------------
    453  1.7   thorpej | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
    454  1.7   thorpej | double-precision floating-point value, returning the result.  After being
    455  1.7   thorpej | shifted into the proper positions, the three fields are simply added
    456  1.7   thorpej | together to form the result.  This means that any integer portion of `zSig'
    457  1.7   thorpej | will be added into the exponent.  Since a properly normalized significand
    458  1.7   thorpej | will have an integer portion equal to 1, the `zExp' input should be 1 less
    459  1.7   thorpej | than the desired result exponent whenever `zSig' is a complete, normalized
    460  1.7   thorpej | significand.
    461  1.7   thorpej *----------------------------------------------------------------------------*/
    462  1.7   thorpej 
    463  1.1      ross INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
    464  1.1      ross {
    465  1.1      ross 
    466  1.1      ross     return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
    467  1.1      ross 			   ( ( (bits64) zExp )<<52 ) + zSig );
    468  1.1      ross 
    469  1.1      ross }
    470  1.1      ross 
    471  1.7   thorpej /*----------------------------------------------------------------------------
    472  1.7   thorpej | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    473  1.7   thorpej | and significand `zSig', and returns the proper double-precision floating-
    474  1.7   thorpej | point value corresponding to the abstract input.  Ordinarily, the abstract
    475  1.7   thorpej | value is simply rounded and packed into the double-precision format, with
    476  1.7   thorpej | the inexact exception raised if the abstract input cannot be represented
    477  1.7   thorpej | exactly.  However, if the abstract value is too large, the overflow and
    478  1.7   thorpej | inexact exceptions are raised and an infinity or maximal finite value is
    479  1.7   thorpej | returned.  If the abstract value is too small, the input value is rounded
    480  1.7   thorpej | to a subnormal number, and the underflow and inexact exceptions are raised
    481  1.7   thorpej | if the abstract input cannot be represented exactly as a subnormal double-
    482  1.7   thorpej | precision floating-point number.
    483  1.7   thorpej |     The input significand `zSig' has its binary point between bits 62
    484  1.7   thorpej | and 61, which is 10 bits to the left of the usual location.  This shifted
    485  1.7   thorpej | significand must be normalized or smaller.  If `zSig' is not normalized,
    486  1.7   thorpej | `zExp' must be 0; in that case, the result returned is a subnormal number,
    487  1.7   thorpej | and it must not require rounding.  In the usual case that `zSig' is
    488  1.7   thorpej | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
    489  1.7   thorpej | The handling of underflow and overflow follows the IEC/IEEE Standard for
    490  1.7   thorpej | Binary Floating-Point Arithmetic.
    491  1.7   thorpej *----------------------------------------------------------------------------*/
    492  1.7   thorpej 
    493  1.1      ross static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
    494  1.1      ross {
    495  1.1      ross     int8 roundingMode;
    496  1.1      ross     flag roundNearestEven;
    497  1.1      ross     int16 roundIncrement, roundBits;
    498  1.1      ross     flag isTiny;
    499  1.1      ross 
    500  1.1      ross     roundingMode = float_rounding_mode();
    501  1.1      ross     roundNearestEven = ( roundingMode == float_round_nearest_even );
    502  1.1      ross     roundIncrement = 0x200;
    503  1.1      ross     if ( ! roundNearestEven ) {
    504  1.1      ross         if ( roundingMode == float_round_to_zero ) {
    505  1.1      ross             roundIncrement = 0;
    506  1.1      ross         }
    507  1.1      ross         else {
    508  1.1      ross             roundIncrement = 0x3FF;
    509  1.1      ross             if ( zSign ) {
    510  1.1      ross                 if ( roundingMode == float_round_up ) roundIncrement = 0;
    511  1.1      ross             }
    512  1.1      ross             else {
    513  1.1      ross                 if ( roundingMode == float_round_down ) roundIncrement = 0;
    514  1.1      ross             }
    515  1.1      ross         }
    516  1.1      ross     }
    517  1.1      ross     roundBits = zSig & 0x3FF;
    518  1.1      ross     if ( 0x7FD <= (bits16) zExp ) {
    519  1.1      ross         if (    ( 0x7FD < zExp )
    520  1.1      ross              || (    ( zExp == 0x7FD )
    521  1.1      ross                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
    522  1.1      ross            ) {
    523  1.1      ross             float_raise( float_flag_overflow | float_flag_inexact );
    524  1.1      ross             return FLOAT64_MANGLE(
    525  1.1      ross 		FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
    526  1.1      ross 		( roundIncrement == 0 ));
    527  1.1      ross         }
    528  1.1      ross         if ( zExp < 0 ) {
    529  1.1      ross             isTiny =
    530  1.1      ross                    ( float_detect_tininess == float_tininess_before_rounding )
    531  1.1      ross                 || ( zExp < -1 )
    532  1.1      ross                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
    533  1.1      ross             shift64RightJamming( zSig, - zExp, &zSig );
    534  1.1      ross             zExp = 0;
    535  1.1      ross             roundBits = zSig & 0x3FF;
    536  1.1      ross             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
    537  1.1      ross         }
    538  1.1      ross     }
    539  1.1      ross     if ( roundBits ) float_set_inexact();
    540  1.1      ross     zSig = ( zSig + roundIncrement )>>10;
    541  1.1      ross     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
    542  1.1      ross     if ( zSig == 0 ) zExp = 0;
    543  1.1      ross     return packFloat64( zSign, zExp, zSig );
    544  1.1      ross 
    545  1.1      ross }
    546  1.1      ross 
    547  1.7   thorpej /*----------------------------------------------------------------------------
    548  1.7   thorpej | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    549  1.7   thorpej | and significand `zSig', and returns the proper double-precision floating-
    550  1.7   thorpej | point value corresponding to the abstract input.  This routine is just like
    551  1.7   thorpej | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
    552  1.7   thorpej | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
    553  1.7   thorpej | floating-point exponent.
    554  1.7   thorpej *----------------------------------------------------------------------------*/
    555  1.7   thorpej 
    556  1.1      ross static float64
    557  1.1      ross  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
    558  1.1      ross {
    559  1.1      ross     int8 shiftCount;
    560  1.1      ross 
    561  1.1      ross     shiftCount = countLeadingZeros64( zSig ) - 1;
    562  1.1      ross     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
    563  1.1      ross 
    564  1.1      ross }
    565  1.1      ross 
    566  1.1      ross #ifdef FLOATX80
    567  1.1      ross 
    568  1.7   thorpej /*----------------------------------------------------------------------------
    569  1.7   thorpej | Returns the fraction bits of the extended double-precision floating-point
    570  1.7   thorpej | value `a'.
    571  1.7   thorpej *----------------------------------------------------------------------------*/
    572  1.7   thorpej 
    573  1.1      ross INLINE bits64 extractFloatx80Frac( floatx80 a )
    574  1.1      ross {
    575  1.1      ross 
    576  1.1      ross     return a.low;
    577  1.1      ross 
    578  1.1      ross }
    579  1.1      ross 
    580  1.7   thorpej /*----------------------------------------------------------------------------
    581  1.7   thorpej | Returns the exponent bits of the extended double-precision floating-point
    582  1.7   thorpej | value `a'.
    583  1.7   thorpej *----------------------------------------------------------------------------*/
    584  1.7   thorpej 
    585  1.1      ross INLINE int32 extractFloatx80Exp( floatx80 a )
    586  1.1      ross {
    587  1.1      ross 
    588  1.1      ross     return a.high & 0x7FFF;
    589  1.1      ross 
    590  1.1      ross }
    591  1.1      ross 
    592  1.7   thorpej /*----------------------------------------------------------------------------
    593  1.7   thorpej | Returns the sign bit of the extended double-precision floating-point value
    594  1.7   thorpej | `a'.
    595  1.7   thorpej *----------------------------------------------------------------------------*/
    596  1.7   thorpej 
    597  1.1      ross INLINE flag extractFloatx80Sign( floatx80 a )
    598  1.1      ross {
    599  1.1      ross 
    600  1.1      ross     return a.high>>15;
    601  1.1      ross 
    602  1.1      ross }
    603  1.1      ross 
    604  1.7   thorpej /*----------------------------------------------------------------------------
    605  1.7   thorpej | Normalizes the subnormal extended double-precision floating-point value
    606  1.7   thorpej | represented by the denormalized significand `aSig'.  The normalized exponent
    607  1.7   thorpej | and significand are stored at the locations pointed to by `zExpPtr' and
    608  1.7   thorpej | `zSigPtr', respectively.
    609  1.7   thorpej *----------------------------------------------------------------------------*/
    610  1.7   thorpej 
    611  1.1      ross static void
    612  1.1      ross  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
    613  1.1      ross {
    614  1.1      ross     int8 shiftCount;
    615  1.1      ross 
    616  1.1      ross     shiftCount = countLeadingZeros64( aSig );
    617  1.1      ross     *zSigPtr = aSig<<shiftCount;
    618  1.1      ross     *zExpPtr = 1 - shiftCount;
    619  1.1      ross 
    620  1.1      ross }
    621  1.1      ross 
    622  1.7   thorpej /*----------------------------------------------------------------------------
    623  1.7   thorpej | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
    624  1.7   thorpej | extended double-precision floating-point value, returning the result.
    625  1.7   thorpej *----------------------------------------------------------------------------*/
    626  1.7   thorpej 
    627  1.1      ross INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
    628  1.1      ross {
    629  1.1      ross     floatx80 z;
    630  1.1      ross 
    631  1.1      ross     z.low = zSig;
    632  1.1      ross     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
    633  1.1      ross     return z;
    634  1.1      ross 
    635  1.1      ross }
    636  1.1      ross 
    637  1.7   thorpej /*----------------------------------------------------------------------------
    638  1.7   thorpej | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    639  1.7   thorpej | and extended significand formed by the concatenation of `zSig0' and `zSig1',
    640  1.7   thorpej | and returns the proper extended double-precision floating-point value
    641  1.7   thorpej | corresponding to the abstract input.  Ordinarily, the abstract value is
    642  1.7   thorpej | rounded and packed into the extended double-precision format, with the
    643  1.7   thorpej | inexact exception raised if the abstract input cannot be represented
    644  1.7   thorpej | exactly.  However, if the abstract value is too large, the overflow and
    645  1.7   thorpej | inexact exceptions are raised and an infinity or maximal finite value is
    646  1.7   thorpej | returned.  If the abstract value is too small, the input value is rounded to
    647  1.7   thorpej | a subnormal number, and the underflow and inexact exceptions are raised if
    648  1.7   thorpej | the abstract input cannot be represented exactly as a subnormal extended
    649  1.7   thorpej | double-precision floating-point number.
    650  1.7   thorpej |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
    651  1.7   thorpej | number of bits as single or double precision, respectively.  Otherwise, the
    652  1.7   thorpej | result is rounded to the full precision of the extended double-precision
    653  1.7   thorpej | format.
    654  1.7   thorpej |     The input significand must be normalized or smaller.  If the input
    655  1.7   thorpej | significand is not normalized, `zExp' must be 0; in that case, the result
    656  1.7   thorpej | returned is a subnormal number, and it must not require rounding.  The
    657  1.7   thorpej | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
    658  1.7   thorpej | Floating-Point Arithmetic.
    659  1.7   thorpej *----------------------------------------------------------------------------*/
    660  1.7   thorpej 
    661  1.1      ross static floatx80
    662  1.1      ross  roundAndPackFloatx80(
    663  1.1      ross      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
    664  1.1      ross  )
    665  1.1      ross {
    666  1.1      ross     int8 roundingMode;
    667  1.1      ross     flag roundNearestEven, increment, isTiny;
    668  1.1      ross     int64 roundIncrement, roundMask, roundBits;
    669  1.1      ross 
    670  1.1      ross     roundingMode = float_rounding_mode();
    671  1.1      ross     roundNearestEven = ( roundingMode == float_round_nearest_even );
    672  1.1      ross     if ( roundingPrecision == 80 ) goto precision80;
    673  1.1      ross     if ( roundingPrecision == 64 ) {
    674  1.1      ross         roundIncrement = LIT64( 0x0000000000000400 );
    675  1.1      ross         roundMask = LIT64( 0x00000000000007FF );
    676  1.1      ross     }
    677  1.1      ross     else if ( roundingPrecision == 32 ) {
    678  1.1      ross         roundIncrement = LIT64( 0x0000008000000000 );
    679  1.1      ross         roundMask = LIT64( 0x000000FFFFFFFFFF );
    680  1.1      ross     }
    681  1.1      ross     else {
    682  1.1      ross         goto precision80;
    683  1.1      ross     }
    684  1.1      ross     zSig0 |= ( zSig1 != 0 );
    685  1.1      ross     if ( ! roundNearestEven ) {
    686  1.1      ross         if ( roundingMode == float_round_to_zero ) {
    687  1.1      ross             roundIncrement = 0;
    688  1.1      ross         }
    689  1.1      ross         else {
    690  1.1      ross             roundIncrement = roundMask;
    691  1.1      ross             if ( zSign ) {
    692  1.1      ross                 if ( roundingMode == float_round_up ) roundIncrement = 0;
    693  1.1      ross             }
    694  1.1      ross             else {
    695  1.1      ross                 if ( roundingMode == float_round_down ) roundIncrement = 0;
    696  1.1      ross             }
    697  1.1      ross         }
    698  1.1      ross     }
    699  1.1      ross     roundBits = zSig0 & roundMask;
    700  1.1      ross     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
    701  1.1      ross         if (    ( 0x7FFE < zExp )
    702  1.1      ross              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
    703  1.1      ross            ) {
    704  1.1      ross             goto overflow;
    705  1.1      ross         }
    706  1.1      ross         if ( zExp <= 0 ) {
    707  1.1      ross             isTiny =
    708  1.1      ross                    ( float_detect_tininess == float_tininess_before_rounding )
    709  1.1      ross                 || ( zExp < 0 )
    710  1.1      ross                 || ( zSig0 <= zSig0 + roundIncrement );
    711  1.1      ross             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
    712  1.1      ross             zExp = 0;
    713  1.1      ross             roundBits = zSig0 & roundMask;
    714  1.1      ross             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
    715  1.1      ross             if ( roundBits ) float_set_inexact();
    716  1.1      ross             zSig0 += roundIncrement;
    717  1.1      ross             if ( (sbits64) zSig0 < 0 ) zExp = 1;
    718  1.1      ross             roundIncrement = roundMask + 1;
    719  1.1      ross             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
    720  1.1      ross                 roundMask |= roundIncrement;
    721  1.1      ross             }
    722  1.1      ross             zSig0 &= ~ roundMask;
    723  1.1      ross             return packFloatx80( zSign, zExp, zSig0 );
    724  1.1      ross         }
    725  1.1      ross     }
    726  1.1      ross     if ( roundBits ) float_set_inexact();
    727  1.1      ross     zSig0 += roundIncrement;
    728  1.1      ross     if ( zSig0 < roundIncrement ) {
    729  1.1      ross         ++zExp;
    730  1.1      ross         zSig0 = LIT64( 0x8000000000000000 );
    731  1.1      ross     }
    732  1.1      ross     roundIncrement = roundMask + 1;
    733  1.1      ross     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
    734  1.1      ross         roundMask |= roundIncrement;
    735  1.1      ross     }
    736  1.1      ross     zSig0 &= ~ roundMask;
    737  1.1      ross     if ( zSig0 == 0 ) zExp = 0;
    738  1.1      ross     return packFloatx80( zSign, zExp, zSig0 );
    739  1.1      ross  precision80:
    740  1.1      ross     increment = ( (sbits64) zSig1 < 0 );
    741  1.1      ross     if ( ! roundNearestEven ) {
    742  1.1      ross         if ( roundingMode == float_round_to_zero ) {
    743  1.1      ross             increment = 0;
    744  1.1      ross         }
    745  1.1      ross         else {
    746  1.1      ross             if ( zSign ) {
    747  1.1      ross                 increment = ( roundingMode == float_round_down ) && zSig1;
    748  1.1      ross             }
    749  1.1      ross             else {
    750  1.1      ross                 increment = ( roundingMode == float_round_up ) && zSig1;
    751  1.1      ross             }
    752  1.1      ross         }
    753  1.1      ross     }
    754  1.1      ross     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
    755  1.1      ross         if (    ( 0x7FFE < zExp )
    756  1.1      ross              || (    ( zExp == 0x7FFE )
    757  1.1      ross                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
    758  1.1      ross                   && increment
    759  1.1      ross                 )
    760  1.1      ross            ) {
    761  1.1      ross             roundMask = 0;
    762  1.1      ross  overflow:
    763  1.1      ross             float_raise( float_flag_overflow | float_flag_inexact );
    764  1.1      ross             if (    ( roundingMode == float_round_to_zero )
    765  1.1      ross                  || ( zSign && ( roundingMode == float_round_up ) )
    766  1.1      ross                  || ( ! zSign && ( roundingMode == float_round_down ) )
    767  1.1      ross                ) {
    768  1.1      ross                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
    769  1.1      ross             }
    770  1.1      ross             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
    771  1.1      ross         }
    772  1.1      ross         if ( zExp <= 0 ) {
    773  1.1      ross             isTiny =
    774  1.1      ross                    ( float_detect_tininess == float_tininess_before_rounding )
    775  1.1      ross                 || ( zExp < 0 )
    776  1.1      ross                 || ! increment
    777  1.1      ross                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
    778  1.1      ross             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
    779  1.1      ross             zExp = 0;
    780  1.1      ross             if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
    781  1.1      ross             if ( zSig1 ) float_set_inexact();
    782  1.1      ross             if ( roundNearestEven ) {
    783  1.1      ross                 increment = ( (sbits64) zSig1 < 0 );
    784  1.1      ross             }
    785  1.1      ross             else {
    786  1.1      ross                 if ( zSign ) {
    787  1.1      ross                     increment = ( roundingMode == float_round_down ) && zSig1;
    788  1.1      ross                 }
    789  1.1      ross                 else {
    790  1.1      ross                     increment = ( roundingMode == float_round_up ) && zSig1;
    791  1.1      ross                 }
    792  1.1      ross             }
    793  1.1      ross             if ( increment ) {
    794  1.1      ross                 ++zSig0;
    795  1.1      ross                 zSig0 &=
    796  1.1      ross                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
    797  1.1      ross                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
    798  1.1      ross             }
    799  1.1      ross             return packFloatx80( zSign, zExp, zSig0 );
    800  1.1      ross         }
    801  1.1      ross     }
    802  1.1      ross     if ( zSig1 ) float_set_inexact();
    803  1.1      ross     if ( increment ) {
    804  1.1      ross         ++zSig0;
    805  1.1      ross         if ( zSig0 == 0 ) {
    806  1.1      ross             ++zExp;
    807  1.1      ross             zSig0 = LIT64( 0x8000000000000000 );
    808  1.1      ross         }
    809  1.1      ross         else {
    810  1.1      ross             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
    811  1.1      ross         }
    812  1.1      ross     }
    813  1.1      ross     else {
    814  1.1      ross         if ( zSig0 == 0 ) zExp = 0;
    815  1.1      ross     }
    816  1.1      ross     return packFloatx80( zSign, zExp, zSig0 );
    817  1.1      ross 
    818  1.1      ross }
    819  1.1      ross 
    820  1.7   thorpej /*----------------------------------------------------------------------------
    821  1.7   thorpej | Takes an abstract floating-point value having sign `zSign', exponent
    822  1.7   thorpej | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
    823  1.7   thorpej | and returns the proper extended double-precision floating-point value
    824  1.7   thorpej | corresponding to the abstract input.  This routine is just like
    825  1.7   thorpej | `roundAndPackFloatx80' except that the input significand does not have to be
    826  1.7   thorpej | normalized.
    827  1.7   thorpej *----------------------------------------------------------------------------*/
    828  1.7   thorpej 
    829  1.1      ross static floatx80
    830  1.1      ross  normalizeRoundAndPackFloatx80(
    831  1.1      ross      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
    832  1.1      ross  )
    833  1.1      ross {
    834  1.1      ross     int8 shiftCount;
    835  1.1      ross 
    836  1.1      ross     if ( zSig0 == 0 ) {
    837  1.1      ross         zSig0 = zSig1;
    838  1.1      ross         zSig1 = 0;
    839  1.1      ross         zExp -= 64;
    840  1.1      ross     }
    841  1.1      ross     shiftCount = countLeadingZeros64( zSig0 );
    842  1.1      ross     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    843  1.1      ross     zExp -= shiftCount;
    844  1.1      ross     return
    845  1.1      ross         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
    846  1.1      ross 
    847  1.1      ross }
    848  1.1      ross 
    849  1.1      ross #endif
    850  1.1      ross 
    851  1.1      ross #ifdef FLOAT128
    852  1.1      ross 
    853  1.7   thorpej /*----------------------------------------------------------------------------
    854  1.7   thorpej | Returns the least-significant 64 fraction bits of the quadruple-precision
    855  1.7   thorpej | floating-point value `a'.
    856  1.7   thorpej *----------------------------------------------------------------------------*/
    857  1.7   thorpej 
    858  1.1      ross INLINE bits64 extractFloat128Frac1( float128 a )
    859  1.1      ross {
    860  1.1      ross 
    861  1.1      ross     return a.low;
    862  1.1      ross 
    863  1.1      ross }
    864  1.1      ross 
    865  1.7   thorpej /*----------------------------------------------------------------------------
    866  1.7   thorpej | Returns the most-significant 48 fraction bits of the quadruple-precision
    867  1.7   thorpej | floating-point value `a'.
    868  1.7   thorpej *----------------------------------------------------------------------------*/
    869  1.7   thorpej 
    870  1.1      ross INLINE bits64 extractFloat128Frac0( float128 a )
    871  1.1      ross {
    872  1.1      ross 
    873  1.1      ross     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
    874  1.1      ross 
    875  1.1      ross }
    876  1.1      ross 
    877  1.7   thorpej /*----------------------------------------------------------------------------
    878  1.7   thorpej | Returns the exponent bits of the quadruple-precision floating-point value
    879  1.7   thorpej | `a'.
    880  1.7   thorpej *----------------------------------------------------------------------------*/
    881  1.7   thorpej 
    882  1.1      ross INLINE int32 extractFloat128Exp( float128 a )
    883  1.1      ross {
    884  1.1      ross 
    885  1.1      ross     return ( a.high>>48 ) & 0x7FFF;
    886  1.1      ross 
    887  1.1      ross }
    888  1.1      ross 
    889  1.7   thorpej 
    890  1.7   thorpej /*----------------------------------------------------------------------------
    891  1.7   thorpej | Returns the sign bit of the quadruple-precision floating-point value `a'.
    892  1.7   thorpej *----------------------------------------------------------------------------*/
    893  1.7   thorpej 
    894  1.1      ross INLINE flag extractFloat128Sign( float128 a )
    895  1.1      ross {
    896  1.1      ross 
    897  1.1      ross     return a.high>>63;
    898  1.1      ross 
    899  1.1      ross }
    900  1.1      ross 
    901  1.7   thorpej /*----------------------------------------------------------------------------
    902  1.7   thorpej | Normalizes the subnormal quadruple-precision floating-point value
    903  1.7   thorpej | represented by the denormalized significand formed by the concatenation of
    904  1.7   thorpej | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
    905  1.7   thorpej | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
    906  1.7   thorpej | significand are stored at the location pointed to by `zSig0Ptr', and the
    907  1.7   thorpej | least significant 64 bits of the normalized significand are stored at the
    908  1.7   thorpej | location pointed to by `zSig1Ptr'.
    909  1.7   thorpej *----------------------------------------------------------------------------*/
    910  1.7   thorpej 
    911  1.1      ross static void
    912  1.1      ross  normalizeFloat128Subnormal(
    913  1.1      ross      bits64 aSig0,
    914  1.1      ross      bits64 aSig1,
    915  1.1      ross      int32 *zExpPtr,
    916  1.1      ross      bits64 *zSig0Ptr,
    917  1.1      ross      bits64 *zSig1Ptr
    918  1.1      ross  )
    919  1.1      ross {
    920  1.1      ross     int8 shiftCount;
    921  1.1      ross 
    922  1.1      ross     if ( aSig0 == 0 ) {
    923  1.1      ross         shiftCount = countLeadingZeros64( aSig1 ) - 15;
    924  1.1      ross         if ( shiftCount < 0 ) {
    925  1.1      ross             *zSig0Ptr = aSig1>>( - shiftCount );
    926  1.1      ross             *zSig1Ptr = aSig1<<( shiftCount & 63 );
    927  1.1      ross         }
    928  1.1      ross         else {
    929  1.1      ross             *zSig0Ptr = aSig1<<shiftCount;
    930  1.1      ross             *zSig1Ptr = 0;
    931  1.1      ross         }
    932  1.1      ross         *zExpPtr = - shiftCount - 63;
    933  1.1      ross     }
    934  1.1      ross     else {
    935  1.1      ross         shiftCount = countLeadingZeros64( aSig0 ) - 15;
    936  1.1      ross         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
    937  1.1      ross         *zExpPtr = 1 - shiftCount;
    938  1.1      ross     }
    939  1.1      ross 
    940  1.1      ross }
    941  1.1      ross 
    942  1.7   thorpej /*----------------------------------------------------------------------------
    943  1.7   thorpej | Packs the sign `zSign', the exponent `zExp', and the significand formed
    944  1.7   thorpej | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
    945  1.7   thorpej | floating-point value, returning the result.  After being shifted into the
    946  1.7   thorpej | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
    947  1.7   thorpej | added together to form the most significant 32 bits of the result.  This
    948  1.7   thorpej | means that any integer portion of `zSig0' will be added into the exponent.
    949  1.7   thorpej | Since a properly normalized significand will have an integer portion equal
    950  1.7   thorpej | to 1, the `zExp' input should be 1 less than the desired result exponent
    951  1.7   thorpej | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
    952  1.7   thorpej | significand.
    953  1.7   thorpej *----------------------------------------------------------------------------*/
    954  1.7   thorpej 
    955  1.1      ross INLINE float128
    956  1.1      ross  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
    957  1.1      ross {
    958  1.1      ross     float128 z;
    959  1.1      ross 
    960  1.1      ross     z.low = zSig1;
    961  1.1      ross     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
    962  1.1      ross     return z;
    963  1.1      ross 
    964  1.1      ross }
    965  1.1      ross 
    966  1.7   thorpej /*----------------------------------------------------------------------------
    967  1.7   thorpej | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    968  1.7   thorpej | and extended significand formed by the concatenation of `zSig0', `zSig1',
    969  1.7   thorpej | and `zSig2', and returns the proper quadruple-precision floating-point value
    970  1.7   thorpej | corresponding to the abstract input.  Ordinarily, the abstract value is
    971  1.7   thorpej | simply rounded and packed into the quadruple-precision format, with the
    972  1.7   thorpej | inexact exception raised if the abstract input cannot be represented
    973  1.7   thorpej | exactly.  However, if the abstract value is too large, the overflow and
    974  1.7   thorpej | inexact exceptions are raised and an infinity or maximal finite value is
    975  1.7   thorpej | returned.  If the abstract value is too small, the input value is rounded to
    976  1.7   thorpej | a subnormal number, and the underflow and inexact exceptions are raised if
    977  1.7   thorpej | the abstract input cannot be represented exactly as a subnormal quadruple-
    978  1.7   thorpej | precision floating-point number.
    979  1.7   thorpej |     The input significand must be normalized or smaller.  If the input
    980  1.7   thorpej | significand is not normalized, `zExp' must be 0; in that case, the result
    981  1.7   thorpej | returned is a subnormal number, and it must not require rounding.  In the
    982  1.7   thorpej | usual case that the input significand is normalized, `zExp' must be 1 less
    983  1.7   thorpej | than the ``true'' floating-point exponent.  The handling of underflow and
    984  1.7   thorpej | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
    985  1.7   thorpej *----------------------------------------------------------------------------*/
    986  1.7   thorpej 
    987  1.1      ross static float128
    988  1.1      ross  roundAndPackFloat128(
    989  1.1      ross      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
    990  1.1      ross {
    991  1.1      ross     int8 roundingMode;
    992  1.1      ross     flag roundNearestEven, increment, isTiny;
    993  1.1      ross 
    994  1.1      ross     roundingMode = float_rounding_mode();
    995  1.1      ross     roundNearestEven = ( roundingMode == float_round_nearest_even );
    996  1.1      ross     increment = ( (sbits64) zSig2 < 0 );
    997  1.1      ross     if ( ! roundNearestEven ) {
    998  1.1      ross         if ( roundingMode == float_round_to_zero ) {
    999  1.1      ross             increment = 0;
   1000  1.1      ross         }
   1001  1.1      ross         else {
   1002  1.1      ross             if ( zSign ) {
   1003  1.1      ross                 increment = ( roundingMode == float_round_down ) && zSig2;
   1004  1.1      ross             }
   1005  1.1      ross             else {
   1006  1.1      ross                 increment = ( roundingMode == float_round_up ) && zSig2;
   1007  1.1      ross             }
   1008  1.1      ross         }
   1009  1.1      ross     }
   1010  1.1      ross     if ( 0x7FFD <= (bits32) zExp ) {
   1011  1.1      ross         if (    ( 0x7FFD < zExp )
   1012  1.1      ross              || (    ( zExp == 0x7FFD )
   1013  1.1      ross                   && eq128(
   1014  1.1      ross                          LIT64( 0x0001FFFFFFFFFFFF ),
   1015  1.1      ross                          LIT64( 0xFFFFFFFFFFFFFFFF ),
   1016  1.1      ross                          zSig0,
   1017  1.1      ross                          zSig1
   1018  1.1      ross                      )
   1019  1.1      ross                   && increment
   1020  1.1      ross                 )
   1021  1.1      ross            ) {
   1022  1.1      ross             float_raise( float_flag_overflow | float_flag_inexact );
   1023  1.1      ross             if (    ( roundingMode == float_round_to_zero )
   1024  1.1      ross                  || ( zSign && ( roundingMode == float_round_up ) )
   1025  1.1      ross                  || ( ! zSign && ( roundingMode == float_round_down ) )
   1026  1.1      ross                ) {
   1027  1.1      ross                 return
   1028  1.1      ross                     packFloat128(
   1029  1.1      ross                         zSign,
   1030  1.1      ross                         0x7FFE,
   1031  1.1      ross                         LIT64( 0x0000FFFFFFFFFFFF ),
   1032  1.1      ross                         LIT64( 0xFFFFFFFFFFFFFFFF )
   1033  1.1      ross                     );
   1034  1.1      ross             }
   1035  1.1      ross             return packFloat128( zSign, 0x7FFF, 0, 0 );
   1036  1.1      ross         }
   1037  1.1      ross         if ( zExp < 0 ) {
   1038  1.1      ross             isTiny =
   1039  1.1      ross                    ( float_detect_tininess == float_tininess_before_rounding )
   1040  1.1      ross                 || ( zExp < -1 )
   1041  1.1      ross                 || ! increment
   1042  1.1      ross                 || lt128(
   1043  1.1      ross                        zSig0,
   1044  1.1      ross                        zSig1,
   1045  1.1      ross                        LIT64( 0x0001FFFFFFFFFFFF ),
   1046  1.1      ross                        LIT64( 0xFFFFFFFFFFFFFFFF )
   1047  1.1      ross                    );
   1048  1.1      ross             shift128ExtraRightJamming(
   1049  1.1      ross                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
   1050  1.1      ross             zExp = 0;
   1051  1.1      ross             if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
   1052  1.1      ross             if ( roundNearestEven ) {
   1053  1.1      ross                 increment = ( (sbits64) zSig2 < 0 );
   1054  1.1      ross             }
   1055  1.1      ross             else {
   1056  1.1      ross                 if ( zSign ) {
   1057  1.1      ross                     increment = ( roundingMode == float_round_down ) && zSig2;
   1058  1.1      ross                 }
   1059  1.1      ross                 else {
   1060  1.1      ross                     increment = ( roundingMode == float_round_up ) && zSig2;
   1061  1.1      ross                 }
   1062  1.1      ross             }
   1063  1.1      ross         }
   1064  1.1      ross     }
   1065  1.1      ross     if ( zSig2 ) float_set_inexact();
   1066  1.1      ross     if ( increment ) {
   1067  1.1      ross         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
   1068  1.1      ross         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
   1069  1.1      ross     }
   1070  1.1      ross     else {
   1071  1.1      ross         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
   1072  1.1      ross     }
   1073  1.1      ross     return packFloat128( zSign, zExp, zSig0, zSig1 );
   1074  1.1      ross 
   1075  1.1      ross }
   1076  1.1      ross 
   1077  1.7   thorpej /*----------------------------------------------------------------------------
   1078  1.7   thorpej | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
   1079  1.7   thorpej | and significand formed by the concatenation of `zSig0' and `zSig1', and
   1080  1.7   thorpej | returns the proper quadruple-precision floating-point value corresponding
   1081  1.7   thorpej | to the abstract input.  This routine is just like `roundAndPackFloat128'
   1082  1.7   thorpej | except that the input significand has fewer bits and does not have to be
   1083  1.7   thorpej | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
   1084  1.7   thorpej | point exponent.
   1085  1.7   thorpej *----------------------------------------------------------------------------*/
   1086  1.7   thorpej 
   1087  1.1      ross static float128
   1088  1.1      ross  normalizeRoundAndPackFloat128(
   1089  1.1      ross      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
   1090  1.1      ross {
   1091  1.1      ross     int8 shiftCount;
   1092  1.1      ross     bits64 zSig2;
   1093  1.1      ross 
   1094  1.1      ross     if ( zSig0 == 0 ) {
   1095  1.1      ross         zSig0 = zSig1;
   1096  1.1      ross         zSig1 = 0;
   1097  1.1      ross         zExp -= 64;
   1098  1.1      ross     }
   1099  1.1      ross     shiftCount = countLeadingZeros64( zSig0 ) - 15;
   1100  1.1      ross     if ( 0 <= shiftCount ) {
   1101  1.1      ross         zSig2 = 0;
   1102  1.1      ross         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
   1103  1.1      ross     }
   1104  1.1      ross     else {
   1105  1.1      ross         shift128ExtraRightJamming(
   1106  1.1      ross             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
   1107  1.1      ross     }
   1108  1.1      ross     zExp -= shiftCount;
   1109  1.1      ross     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
   1110  1.1      ross 
   1111  1.1      ross }
   1112  1.1      ross 
   1113  1.1      ross #endif
   1114  1.1      ross 
   1115  1.7   thorpej /*----------------------------------------------------------------------------
   1116  1.7   thorpej | Returns the result of converting the 32-bit two's complement integer `a'
   1117  1.7   thorpej | to the single-precision floating-point format.  The conversion is performed
   1118  1.7   thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1119  1.7   thorpej *----------------------------------------------------------------------------*/
   1120  1.7   thorpej 
   1121  1.1      ross float32 int32_to_float32( int32 a )
   1122  1.1      ross {
   1123  1.1      ross     flag zSign;
   1124  1.1      ross 
   1125  1.1      ross     if ( a == 0 ) return 0;
   1126  1.1      ross     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
   1127  1.1      ross     zSign = ( a < 0 );
   1128  1.1      ross     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
   1129  1.1      ross 
   1130  1.1      ross }
   1131  1.1      ross 
   1132  1.7   thorpej /*----------------------------------------------------------------------------
   1133  1.7   thorpej | Returns the result of converting the 32-bit two's complement integer `a'
   1134  1.7   thorpej | to the double-precision floating-point format.  The conversion is performed
   1135  1.7   thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1136  1.7   thorpej *----------------------------------------------------------------------------*/
   1137  1.7   thorpej 
   1138  1.1      ross float64 int32_to_float64( int32 a )
   1139  1.1      ross {
   1140  1.1      ross     flag zSign;
   1141  1.1      ross     uint32 absA;
   1142  1.1      ross     int8 shiftCount;
   1143  1.1      ross     bits64 zSig;
   1144  1.1      ross 
   1145  1.1      ross     if ( a == 0 ) return 0;
   1146  1.1      ross     zSign = ( a < 0 );
   1147  1.1      ross     absA = zSign ? - a : a;
   1148  1.1      ross     shiftCount = countLeadingZeros32( absA ) + 21;
   1149  1.1      ross     zSig = absA;
   1150  1.1      ross     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
   1151  1.1      ross 
   1152  1.1      ross }
   1153  1.1      ross 
   1154  1.1      ross #ifdef FLOATX80
   1155  1.1      ross 
   1156  1.7   thorpej /*----------------------------------------------------------------------------
   1157  1.7   thorpej | Returns the result of converting the 32-bit two's complement integer `a'
   1158  1.7   thorpej | to the extended double-precision floating-point format.  The conversion
   1159  1.7   thorpej | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   1160  1.7   thorpej | Arithmetic.
   1161  1.7   thorpej *----------------------------------------------------------------------------*/
   1162  1.7   thorpej 
   1163  1.1      ross floatx80 int32_to_floatx80( int32 a )
   1164  1.1      ross {
   1165  1.1      ross     flag zSign;
   1166  1.1      ross     uint32 absA;
   1167  1.1      ross     int8 shiftCount;
   1168  1.1      ross     bits64 zSig;
   1169  1.1      ross 
   1170  1.1      ross     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
   1171  1.1      ross     zSign = ( a < 0 );
   1172  1.1      ross     absA = zSign ? - a : a;
   1173  1.1      ross     shiftCount = countLeadingZeros32( absA ) + 32;
   1174  1.1      ross     zSig = absA;
   1175  1.1      ross     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
   1176  1.1      ross 
   1177  1.1      ross }
   1178  1.1      ross 
   1179  1.1      ross #endif
   1180  1.1      ross 
   1181  1.1      ross #ifdef FLOAT128
   1182  1.1      ross 
   1183  1.7   thorpej /*----------------------------------------------------------------------------
   1184  1.7   thorpej | Returns the result of converting the 32-bit two's complement integer `a' to
   1185  1.7   thorpej | the quadruple-precision floating-point format.  The conversion is performed
   1186  1.7   thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1187  1.7   thorpej *----------------------------------------------------------------------------*/
   1188  1.7   thorpej 
   1189  1.1      ross float128 int32_to_float128( int32 a )
   1190  1.1      ross {
   1191  1.1      ross     flag zSign;
   1192  1.1      ross     uint32 absA;
   1193  1.1      ross     int8 shiftCount;
   1194  1.1      ross     bits64 zSig0;
   1195  1.1      ross 
   1196  1.1      ross     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
   1197  1.1      ross     zSign = ( a < 0 );
   1198  1.1      ross     absA = zSign ? - a : a;
   1199  1.1      ross     shiftCount = countLeadingZeros32( absA ) + 17;
   1200  1.1      ross     zSig0 = absA;
   1201  1.1      ross     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
   1202  1.1      ross 
   1203  1.1      ross }
   1204  1.1      ross 
   1205  1.1      ross #endif
   1206  1.1      ross 
   1207  1.1      ross #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
   1208  1.7   thorpej /*----------------------------------------------------------------------------
   1209  1.7   thorpej | Returns the result of converting the 64-bit two's complement integer `a'
   1210  1.7   thorpej | to the single-precision floating-point format.  The conversion is performed
   1211  1.7   thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1212  1.7   thorpej *----------------------------------------------------------------------------*/
   1213  1.7   thorpej 
   1214  1.1      ross float32 int64_to_float32( int64 a )
   1215  1.1      ross {
   1216  1.1      ross     flag zSign;
   1217  1.1      ross     uint64 absA;
   1218  1.1      ross     int8 shiftCount;
   1219  1.1      ross 
   1220  1.1      ross     if ( a == 0 ) return 0;
   1221  1.1      ross     zSign = ( a < 0 );
   1222  1.1      ross     absA = zSign ? - a : a;
   1223  1.1      ross     shiftCount = countLeadingZeros64( absA ) - 40;
   1224  1.1      ross     if ( 0 <= shiftCount ) {
   1225  1.1      ross         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
   1226  1.1      ross     }
   1227  1.1      ross     else {
   1228  1.1      ross         shiftCount += 7;
   1229  1.1      ross         if ( shiftCount < 0 ) {
   1230  1.1      ross             shift64RightJamming( absA, - shiftCount, &absA );
   1231  1.1      ross         }
   1232  1.1      ross         else {
   1233  1.1      ross             absA <<= shiftCount;
   1234  1.1      ross         }
   1235  1.1      ross         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
   1236  1.1      ross     }
   1237  1.1      ross 
   1238  1.1      ross }
   1239  1.1      ross 
   1240  1.7   thorpej /*----------------------------------------------------------------------------
   1241  1.7   thorpej | Returns the result of converting the 64-bit two's complement integer `a'
   1242  1.7   thorpej | to the double-precision floating-point format.  The conversion is performed
   1243  1.7   thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1244  1.7   thorpej *----------------------------------------------------------------------------*/
   1245  1.7   thorpej 
   1246  1.1      ross float64 int64_to_float64( int64 a )
   1247  1.1      ross {
   1248  1.1      ross     flag zSign;
   1249  1.1      ross 
   1250  1.1      ross     if ( a == 0 ) return 0;
   1251  1.1      ross     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
   1252  1.1      ross         return packFloat64( 1, 0x43E, 0 );
   1253  1.1      ross     }
   1254  1.1      ross     zSign = ( a < 0 );
   1255  1.1      ross     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
   1256  1.1      ross 
   1257  1.1      ross }
   1258  1.1      ross 
   1259  1.1      ross #ifdef FLOATX80
   1260  1.1      ross 
   1261  1.7   thorpej /*----------------------------------------------------------------------------
   1262  1.7   thorpej | Returns the result of converting the 64-bit two's complement integer `a'
   1263  1.7   thorpej | to the extended double-precision floating-point format.  The conversion
   1264  1.7   thorpej | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   1265  1.7   thorpej | Arithmetic.
   1266  1.7   thorpej *----------------------------------------------------------------------------*/
   1267  1.7   thorpej 
   1268  1.1      ross floatx80 int64_to_floatx80( int64 a )
   1269  1.1      ross {
   1270  1.1      ross     flag zSign;
   1271  1.1      ross     uint64 absA;
   1272  1.1      ross     int8 shiftCount;
   1273  1.1      ross 
   1274  1.1      ross     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
   1275  1.1      ross     zSign = ( a < 0 );
   1276  1.1      ross     absA = zSign ? - a : a;
   1277  1.1      ross     shiftCount = countLeadingZeros64( absA );
   1278  1.1      ross     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
   1279  1.1      ross 
   1280  1.1      ross }
   1281  1.1      ross 
   1282  1.1      ross #endif
   1283  1.1      ross 
   1284  1.1      ross #ifdef FLOAT128
   1285  1.1      ross 
   1286  1.7   thorpej /*----------------------------------------------------------------------------
   1287  1.7   thorpej | Returns the result of converting the 64-bit two's complement integer `a' to
   1288  1.7   thorpej | the quadruple-precision floating-point format.  The conversion is performed
   1289  1.7   thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1290  1.7   thorpej *----------------------------------------------------------------------------*/
   1291  1.7   thorpej 
   1292  1.1      ross float128 int64_to_float128( int64 a )
   1293  1.1      ross {
   1294  1.1      ross     flag zSign;
   1295  1.1      ross     uint64 absA;
   1296  1.1      ross     int8 shiftCount;
   1297  1.1      ross     int32 zExp;
   1298  1.1      ross     bits64 zSig0, zSig1;
   1299  1.1      ross 
   1300  1.1      ross     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
   1301  1.1      ross     zSign = ( a < 0 );
   1302  1.1      ross     absA = zSign ? - a : a;
   1303  1.1      ross     shiftCount = countLeadingZeros64( absA ) + 49;
   1304  1.1      ross     zExp = 0x406E - shiftCount;
   1305  1.1      ross     if ( 64 <= shiftCount ) {
   1306  1.1      ross         zSig1 = 0;
   1307  1.1      ross         zSig0 = absA;
   1308  1.1      ross         shiftCount -= 64;
   1309  1.1      ross     }
   1310  1.1      ross     else {
   1311  1.1      ross         zSig1 = absA;
   1312  1.1      ross         zSig0 = 0;
   1313  1.1      ross     }
   1314  1.1      ross     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
   1315  1.1      ross     return packFloat128( zSign, zExp, zSig0, zSig1 );
   1316  1.1      ross 
   1317  1.1      ross }
   1318  1.1      ross 
   1319  1.1      ross #endif
   1320  1.1      ross #endif /* !SOFTFLOAT_FOR_GCC */
   1321  1.1      ross 
   1322  1.1      ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
   1323  1.7   thorpej /*----------------------------------------------------------------------------
   1324  1.7   thorpej | Returns the result of converting the single-precision floating-point value
   1325  1.7   thorpej | `a' to the 32-bit two's complement integer format.  The conversion is
   1326  1.7   thorpej | performed according to the IEC/IEEE Standard for Binary Floating-Point
   1327  1.7   thorpej | Arithmetic---which means in particular that the conversion is rounded
   1328  1.7   thorpej | according to the current rounding mode.  If `a' is a NaN, the largest
   1329  1.7   thorpej | positive integer is returned.  Otherwise, if the conversion overflows, the
   1330  1.7   thorpej | largest integer with the same sign as `a' is returned.
   1331  1.7   thorpej *----------------------------------------------------------------------------*/
   1332  1.7   thorpej 
   1333  1.1      ross int32 float32_to_int32( float32 a )
   1334  1.1      ross {
   1335  1.1      ross     flag aSign;
   1336  1.1      ross     int16 aExp, shiftCount;
   1337  1.1      ross     bits32 aSig;
   1338  1.1      ross     bits64 aSig64;
   1339  1.1      ross 
   1340  1.1      ross     aSig = extractFloat32Frac( a );
   1341  1.1      ross     aExp = extractFloat32Exp( a );
   1342  1.1      ross     aSign = extractFloat32Sign( a );
   1343  1.1      ross     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
   1344  1.1      ross     if ( aExp ) aSig |= 0x00800000;
   1345  1.1      ross     shiftCount = 0xAF - aExp;
   1346  1.1      ross     aSig64 = aSig;
   1347  1.1      ross     aSig64 <<= 32;
   1348  1.1      ross     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
   1349  1.1      ross     return roundAndPackInt32( aSign, aSig64 );
   1350  1.1      ross 
   1351  1.1      ross }
   1352  1.1      ross #endif /* !SOFTFLOAT_FOR_GCC */
   1353  1.1      ross 
   1354  1.7   thorpej /*----------------------------------------------------------------------------
   1355  1.7   thorpej | Returns the result of converting the single-precision floating-point value
   1356  1.7   thorpej | `a' to the 32-bit two's complement integer format.  The conversion is
   1357  1.7   thorpej | performed according to the IEC/IEEE Standard for Binary Floating-Point
   1358  1.7   thorpej | Arithmetic, except that the conversion is always rounded toward zero.
   1359  1.7   thorpej | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   1360  1.7   thorpej | the conversion overflows, the largest integer with the same sign as `a' is
   1361  1.7   thorpej | returned.
   1362  1.7   thorpej *----------------------------------------------------------------------------*/
   1363  1.7   thorpej 
   1364  1.1      ross int32 float32_to_int32_round_to_zero( float32 a )
   1365  1.1      ross {
   1366  1.1      ross     flag aSign;
   1367  1.1      ross     int16 aExp, shiftCount;
   1368  1.1      ross     bits32 aSig;
   1369  1.1      ross     int32 z;
   1370  1.1      ross 
   1371  1.1      ross     aSig = extractFloat32Frac( a );
   1372  1.1      ross     aExp = extractFloat32Exp( a );
   1373  1.1      ross     aSign = extractFloat32Sign( a );
   1374  1.1      ross     shiftCount = aExp - 0x9E;
   1375  1.1      ross     if ( 0 <= shiftCount ) {
   1376  1.1      ross         if ( a != 0xCF000000 ) {
   1377  1.1      ross             float_raise( float_flag_invalid );
   1378  1.1      ross             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
   1379  1.1      ross         }
   1380  1.1      ross         return (sbits32) 0x80000000;
   1381  1.1      ross     }
   1382  1.1      ross     else if ( aExp <= 0x7E ) {
   1383  1.1      ross         if ( aExp | aSig ) float_set_inexact();
   1384  1.1      ross         return 0;
   1385  1.1      ross     }
   1386  1.1      ross     aSig = ( aSig | 0x00800000 )<<8;
   1387  1.1      ross     z = aSig>>( - shiftCount );
   1388  1.1      ross     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
   1389  1.1      ross         float_set_inexact();
   1390  1.1      ross     }
   1391  1.1      ross     if ( aSign ) z = - z;
   1392  1.1      ross     return z;
   1393  1.1      ross 
   1394  1.1      ross }
   1395  1.1      ross 
   1396  1.1      ross #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
   1397  1.7   thorpej /*----------------------------------------------------------------------------
   1398  1.7   thorpej | Returns the result of converting the single-precision floating-point value
   1399  1.7   thorpej | `a' to the 64-bit two's complement integer format.  The conversion is
   1400  1.7   thorpej | performed according to the IEC/IEEE Standard for Binary Floating-Point
   1401  1.7   thorpej | Arithmetic---which means in particular that the conversion is rounded
   1402  1.7   thorpej | according to the current rounding mode.  If `a' is a NaN, the largest
   1403  1.7   thorpej | positive integer is returned.  Otherwise, if the conversion overflows, the
   1404  1.7   thorpej | largest integer with the same sign as `a' is returned.
   1405  1.7   thorpej *----------------------------------------------------------------------------*/
   1406  1.7   thorpej 
   1407  1.1      ross int64 float32_to_int64( float32 a )
   1408  1.1      ross {
   1409  1.1      ross     flag aSign;
   1410  1.1      ross     int16 aExp, shiftCount;
   1411  1.1      ross     bits32 aSig;
   1412  1.1      ross     bits64 aSig64, aSigExtra;
   1413  1.1      ross 
   1414  1.1      ross     aSig = extractFloat32Frac( a );
   1415  1.1      ross     aExp = extractFloat32Exp( a );
   1416  1.1      ross     aSign = extractFloat32Sign( a );
   1417  1.1      ross     shiftCount = 0xBE - aExp;
   1418  1.1      ross     if ( shiftCount < 0 ) {
   1419  1.1      ross         float_raise( float_flag_invalid );
   1420  1.1      ross         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
   1421  1.1      ross             return LIT64( 0x7FFFFFFFFFFFFFFF );
   1422  1.1      ross         }
   1423  1.1      ross         return (sbits64) LIT64( 0x8000000000000000 );
   1424  1.1      ross     }
   1425  1.1      ross     if ( aExp ) aSig |= 0x00800000;
   1426  1.1      ross     aSig64 = aSig;
   1427  1.1      ross     aSig64 <<= 40;
   1428  1.1      ross     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
   1429  1.1      ross     return roundAndPackInt64( aSign, aSig64, aSigExtra );
   1430  1.1      ross 
   1431  1.1      ross }
   1432  1.1      ross 
   1433  1.7   thorpej /*----------------------------------------------------------------------------
   1434  1.7   thorpej | Returns the result of converting the single-precision floating-point value
   1435  1.7   thorpej | `a' to the 64-bit two's complement integer format.  The conversion is
   1436  1.7   thorpej | performed according to the IEC/IEEE Standard for Binary Floating-Point
   1437  1.7   thorpej | Arithmetic, except that the conversion is always rounded toward zero.  If
   1438  1.7   thorpej | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
   1439  1.7   thorpej | conversion overflows, the largest integer with the same sign as `a' is
   1440  1.7   thorpej | returned.
   1441  1.7   thorpej *----------------------------------------------------------------------------*/
   1442  1.7   thorpej 
   1443  1.1      ross int64 float32_to_int64_round_to_zero( float32 a )
   1444  1.1      ross {
   1445  1.1      ross     flag aSign;
   1446  1.1      ross     int16 aExp, shiftCount;
   1447  1.1      ross     bits32 aSig;
   1448  1.1      ross     bits64 aSig64;
   1449  1.1      ross     int64 z;
   1450  1.1      ross 
   1451  1.1      ross     aSig = extractFloat32Frac( a );
   1452  1.1      ross     aExp = extractFloat32Exp( a );
   1453  1.1      ross     aSign = extractFloat32Sign( a );
   1454  1.1      ross     shiftCount = aExp - 0xBE;
   1455  1.1      ross     if ( 0 <= shiftCount ) {
   1456  1.1      ross         if ( a != 0xDF000000 ) {
   1457  1.1      ross             float_raise( float_flag_invalid );
   1458  1.1      ross             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
   1459  1.1      ross                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   1460  1.1      ross             }
   1461  1.1      ross         }
   1462  1.1      ross         return (sbits64) LIT64( 0x8000000000000000 );
   1463  1.1      ross     }
   1464  1.1      ross     else if ( aExp <= 0x7E ) {
   1465  1.1      ross         if ( aExp | aSig ) float_set_inexact();
   1466  1.1      ross         return 0;
   1467  1.1      ross     }
   1468  1.1      ross     aSig64 = aSig | 0x00800000;
   1469  1.1      ross     aSig64 <<= 40;
   1470  1.1      ross     z = aSig64>>( - shiftCount );
   1471  1.1      ross     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
   1472  1.1      ross         float_set_inexact();
   1473  1.1      ross     }
   1474  1.1      ross     if ( aSign ) z = - z;
   1475  1.1      ross     return z;
   1476  1.1      ross 
   1477  1.1      ross }
   1478  1.1      ross #endif /* !SOFTFLOAT_FOR_GCC */
   1479  1.1      ross 
   1480  1.7   thorpej /*----------------------------------------------------------------------------
   1481  1.7   thorpej | Returns the result of converting the single-precision floating-point value
   1482  1.7   thorpej | `a' to the double-precision floating-point format.  The conversion is
   1483  1.7   thorpej | performed according to the IEC/IEEE Standard for Binary Floating-Point
   1484  1.7   thorpej | Arithmetic.
   1485  1.7   thorpej *----------------------------------------------------------------------------*/
   1486  1.7   thorpej 
   1487  1.1      ross float64 float32_to_float64( float32 a )
   1488  1.1      ross {
   1489  1.1      ross     flag aSign;
   1490  1.1      ross     int16 aExp;
   1491  1.1      ross     bits32 aSig;
   1492  1.1      ross 
   1493  1.1      ross     aSig = extractFloat32Frac( a );
   1494  1.1      ross     aExp = extractFloat32Exp( a );
   1495  1.1      ross     aSign = extractFloat32Sign( a );
   1496  1.1      ross     if ( aExp == 0xFF ) {
   1497  1.1      ross         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
   1498  1.1      ross         return packFloat64( aSign, 0x7FF, 0 );
   1499  1.1      ross     }
   1500  1.1      ross     if ( aExp == 0 ) {
   1501  1.1      ross         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
   1502  1.1      ross         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1503  1.1      ross         --aExp;
   1504  1.1      ross     }
   1505  1.1      ross     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
   1506  1.1      ross 
   1507  1.1      ross }
   1508  1.1      ross 
   1509  1.1      ross #ifdef FLOATX80
   1510  1.1      ross 
   1511  1.7   thorpej /*----------------------------------------------------------------------------
   1512  1.7   thorpej | Returns the result of converting the single-precision floating-point value
   1513  1.7   thorpej | `a' to the extended double-precision floating-point format.  The conversion
   1514  1.7   thorpej | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   1515  1.7   thorpej | Arithmetic.
   1516  1.7   thorpej *----------------------------------------------------------------------------*/
   1517  1.7   thorpej 
   1518  1.1      ross floatx80 float32_to_floatx80( float32 a )
   1519  1.1      ross {
   1520  1.1      ross     flag aSign;
   1521  1.1      ross     int16 aExp;
   1522  1.1      ross     bits32 aSig;
   1523  1.1      ross 
   1524  1.1      ross     aSig = extractFloat32Frac( a );
   1525  1.1      ross     aExp = extractFloat32Exp( a );
   1526  1.1      ross     aSign = extractFloat32Sign( a );
   1527  1.1      ross     if ( aExp == 0xFF ) {
   1528  1.1      ross         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
   1529  1.1      ross         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   1530  1.1      ross     }
   1531  1.1      ross     if ( aExp == 0 ) {
   1532  1.1      ross         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
   1533  1.1      ross         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1534  1.1      ross     }
   1535  1.1      ross     aSig |= 0x00800000;
   1536  1.1      ross     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
   1537  1.1      ross 
   1538  1.1      ross }
   1539  1.1      ross 
   1540  1.1      ross #endif
   1541  1.1      ross 
   1542  1.1      ross #ifdef FLOAT128
   1543  1.1      ross 
   1544  1.7   thorpej /*----------------------------------------------------------------------------
   1545  1.7   thorpej | Returns the result of converting the single-precision floating-point value
   1546  1.7   thorpej | `a' to the quadruple-precision floating-point format.  The conversion is
   1547  1.7   thorpej | performed according to the IEC/IEEE Standard for Binary Floating-Point
   1548  1.7   thorpej | Arithmetic.
   1549  1.7   thorpej *----------------------------------------------------------------------------*/
   1550  1.7   thorpej 
   1551  1.1      ross float128 float32_to_float128( float32 a )
   1552  1.1      ross {
   1553  1.1      ross     flag aSign;
   1554  1.1      ross     int16 aExp;
   1555  1.1      ross     bits32 aSig;
   1556  1.1      ross 
   1557  1.1      ross     aSig = extractFloat32Frac( a );
   1558  1.1      ross     aExp = extractFloat32Exp( a );
   1559  1.1      ross     aSign = extractFloat32Sign( a );
   1560  1.1      ross     if ( aExp == 0xFF ) {
   1561  1.1      ross         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
   1562  1.1      ross         return packFloat128( aSign, 0x7FFF, 0, 0 );
   1563  1.1      ross     }
   1564  1.1      ross     if ( aExp == 0 ) {
   1565  1.1      ross         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
   1566  1.1      ross         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1567  1.1      ross         --aExp;
   1568  1.1      ross     }
   1569  1.1      ross     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
   1570  1.1      ross 
   1571  1.1      ross }
   1572  1.1      ross 
   1573  1.1      ross #endif
   1574  1.1      ross 
   1575  1.1      ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
   1576  1.7   thorpej /*----------------------------------------------------------------------------
   1577  1.7   thorpej | Rounds the single-precision floating-point value `a' to an integer, and
   1578  1.7   thorpej | returns the result as a single-precision floating-point value.  The
   1579  1.7   thorpej | operation is performed according to the IEC/IEEE Standard for Binary
   1580  1.7   thorpej | Floating-Point Arithmetic.
   1581  1.7   thorpej *----------------------------------------------------------------------------*/
   1582  1.7   thorpej 
   1583  1.1      ross float32 float32_round_to_int( float32 a )
   1584  1.1      ross {
   1585  1.1      ross     flag aSign;
   1586  1.1      ross     int16 aExp;
   1587  1.1      ross     bits32 lastBitMask, roundBitsMask;
   1588  1.1      ross     int8 roundingMode;
   1589  1.1      ross     float32 z;
   1590  1.1      ross 
   1591  1.1      ross     aExp = extractFloat32Exp( a );
   1592  1.1      ross     if ( 0x96 <= aExp ) {
   1593  1.1      ross         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
   1594  1.1      ross             return propagateFloat32NaN( a, a );
   1595  1.1      ross         }
   1596  1.1      ross         return a;
   1597  1.1      ross     }
   1598  1.1      ross     if ( aExp <= 0x7E ) {
   1599  1.1      ross         if ( (bits32) ( a<<1 ) == 0 ) return a;
   1600  1.1      ross         float_set_inexact();
   1601  1.1      ross         aSign = extractFloat32Sign( a );
   1602  1.1      ross         switch ( float_rounding_mode() ) {
   1603  1.1      ross          case float_round_nearest_even:
   1604  1.1      ross             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
   1605  1.1      ross                 return packFloat32( aSign, 0x7F, 0 );
   1606  1.1      ross             }
   1607  1.1      ross             break;
   1608  1.1      ross          case float_round_down:
   1609  1.1      ross             return aSign ? 0xBF800000 : 0;
   1610  1.1      ross          case float_round_up:
   1611  1.1      ross             return aSign ? 0x80000000 : 0x3F800000;
   1612  1.1      ross         }
   1613  1.1      ross         return packFloat32( aSign, 0, 0 );
   1614  1.1      ross     }
   1615  1.1      ross     lastBitMask = 1;
   1616  1.1      ross     lastBitMask <<= 0x96 - aExp;
   1617  1.1      ross     roundBitsMask = lastBitMask - 1;
   1618  1.1      ross     z = a;
   1619  1.1      ross     roundingMode = float_rounding_mode();
   1620  1.1      ross     if ( roundingMode == float_round_nearest_even ) {
   1621  1.1      ross         z += lastBitMask>>1;
   1622  1.1      ross         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
   1623  1.1      ross     }
   1624  1.1      ross     else if ( roundingMode != float_round_to_zero ) {
   1625  1.1      ross         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
   1626  1.1      ross             z += roundBitsMask;
   1627  1.1      ross         }
   1628  1.1      ross     }
   1629  1.1      ross     z &= ~ roundBitsMask;
   1630  1.1      ross     if ( z != a ) float_set_inexact();
   1631  1.1      ross     return z;
   1632  1.1      ross 
   1633  1.1      ross }
   1634  1.1      ross #endif /* !SOFTFLOAT_FOR_GCC */
   1635  1.1      ross 
   1636  1.7   thorpej /*----------------------------------------------------------------------------
   1637  1.7   thorpej | Returns the result of adding the absolute values of the single-precision
   1638  1.7   thorpej | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
   1639  1.7   thorpej | before being returned.  `zSign' is ignored if the result is a NaN.
   1640  1.7   thorpej | The addition is performed according to the IEC/IEEE Standard for Binary
   1641  1.7   thorpej | Floating-Point Arithmetic.
   1642  1.7   thorpej *----------------------------------------------------------------------------*/
   1643  1.7   thorpej 
   1644  1.1      ross static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
   1645  1.1      ross {
   1646  1.1      ross     int16 aExp, bExp, zExp;
   1647  1.1      ross     bits32 aSig, bSig, zSig;
   1648  1.1      ross     int16 expDiff;
   1649  1.1      ross 
   1650  1.1      ross     aSig = extractFloat32Frac( a );
   1651  1.1      ross     aExp = extractFloat32Exp( a );
   1652  1.1      ross     bSig = extractFloat32Frac( b );
   1653  1.1      ross     bExp = extractFloat32Exp( b );
   1654  1.1      ross     expDiff = aExp - bExp;
   1655  1.1      ross     aSig <<= 6;
   1656  1.1      ross     bSig <<= 6;
   1657  1.1      ross     if ( 0 < expDiff ) {
   1658  1.1      ross         if ( aExp == 0xFF ) {
   1659  1.1      ross             if ( aSig ) return propagateFloat32NaN( a, b );
   1660  1.1      ross             return a;
   1661  1.1      ross         }
   1662  1.1      ross         if ( bExp == 0 ) {
   1663  1.1      ross             --expDiff;
   1664  1.1      ross         }
   1665  1.1      ross         else {
   1666  1.1      ross             bSig |= 0x20000000;
   1667  1.1      ross         }
   1668  1.1      ross         shift32RightJamming( bSig, expDiff, &bSig );
   1669  1.1      ross         zExp = aExp;
   1670  1.1      ross     }
   1671  1.1      ross     else if ( expDiff < 0 ) {
   1672  1.1      ross         if ( bExp == 0xFF ) {
   1673  1.1      ross             if ( bSig ) return propagateFloat32NaN( a, b );
   1674  1.1      ross             return packFloat32( zSign, 0xFF, 0 );
   1675  1.1      ross         }
   1676  1.1      ross         if ( aExp == 0 ) {
   1677  1.1      ross             ++expDiff;
   1678  1.1      ross         }
   1679  1.1      ross         else {
   1680  1.1      ross             aSig |= 0x20000000;
   1681  1.1      ross         }
   1682  1.1      ross         shift32RightJamming( aSig, - expDiff, &aSig );
   1683  1.1      ross         zExp = bExp;
   1684  1.1      ross     }
   1685  1.1      ross     else {
   1686  1.1      ross         if ( aExp == 0xFF ) {
   1687  1.1      ross             if ( aSig | bSig ) return propagateFloat32NaN( a, b );
   1688  1.1      ross             return a;
   1689  1.1      ross         }
   1690  1.1      ross         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
   1691  1.1      ross         zSig = 0x40000000 + aSig + bSig;
   1692  1.1      ross         zExp = aExp;
   1693  1.1      ross         goto roundAndPack;
   1694  1.1      ross     }
   1695  1.1      ross     aSig |= 0x20000000;
   1696  1.1      ross     zSig = ( aSig + bSig )<<1;
   1697  1.1      ross     --zExp;
   1698  1.1      ross     if ( (sbits32) zSig < 0 ) {
   1699  1.1      ross         zSig = aSig + bSig;
   1700  1.1      ross         ++zExp;
   1701  1.1      ross     }
   1702  1.1      ross  roundAndPack:
   1703  1.1      ross     return roundAndPackFloat32( zSign, zExp, zSig );
   1704  1.1      ross 
   1705  1.1      ross }
   1706  1.1      ross 
   1707  1.7   thorpej /*----------------------------------------------------------------------------
   1708  1.7   thorpej | Returns the result of subtracting the absolute values of the single-
   1709  1.7   thorpej | precision floating-point values `a' and `b'.  If `zSign' is 1, the
   1710  1.7   thorpej | difference is negated before being returned.  `zSign' is ignored if the
   1711  1.7   thorpej | result is a NaN.  The subtraction is performed according to the IEC/IEEE
   1712  1.7   thorpej | Standard for Binary Floating-Point Arithmetic.
   1713  1.7   thorpej *----------------------------------------------------------------------------*/
   1714  1.7   thorpej 
   1715  1.1      ross static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
   1716  1.1      ross {
   1717  1.1      ross     int16 aExp, bExp, zExp;
   1718  1.1      ross     bits32 aSig, bSig, zSig;
   1719  1.1      ross     int16 expDiff;
   1720  1.1      ross 
   1721  1.1      ross     aSig = extractFloat32Frac( a );
   1722  1.1      ross     aExp = extractFloat32Exp( a );
   1723  1.1      ross     bSig = extractFloat32Frac( b );
   1724  1.1      ross     bExp = extractFloat32Exp( b );
   1725  1.1      ross     expDiff = aExp - bExp;
   1726  1.1      ross     aSig <<= 7;
   1727  1.1      ross     bSig <<= 7;
   1728  1.1      ross     if ( 0 < expDiff ) goto aExpBigger;
   1729  1.1      ross     if ( expDiff < 0 ) goto bExpBigger;
   1730  1.1      ross     if ( aExp == 0xFF ) {
   1731  1.1      ross         if ( aSig | bSig ) return propagateFloat32NaN( a, b );
   1732  1.1      ross         float_raise( float_flag_invalid );
   1733  1.1      ross         return float32_default_nan;
   1734  1.1      ross     }
   1735  1.1      ross     if ( aExp == 0 ) {
   1736  1.1      ross         aExp = 1;
   1737  1.1      ross         bExp = 1;
   1738  1.1      ross     }
   1739  1.1      ross     if ( bSig < aSig ) goto aBigger;
   1740  1.1      ross     if ( aSig < bSig ) goto bBigger;
   1741  1.1      ross     return packFloat32( float_rounding_mode() == float_round_down, 0, 0 );
   1742  1.1      ross  bExpBigger:
   1743  1.1      ross     if ( bExp == 0xFF ) {
   1744  1.1      ross         if ( bSig ) return propagateFloat32NaN( a, b );
   1745  1.1      ross         return packFloat32( zSign ^ 1, 0xFF, 0 );
   1746  1.1      ross     }
   1747  1.1      ross     if ( aExp == 0 ) {
   1748  1.1      ross         ++expDiff;
   1749  1.1      ross     }
   1750  1.1      ross     else {
   1751  1.1      ross         aSig |= 0x40000000;
   1752  1.1      ross     }
   1753  1.1      ross     shift32RightJamming( aSig, - expDiff, &aSig );
   1754  1.1      ross     bSig |= 0x40000000;
   1755  1.1      ross  bBigger:
   1756  1.1      ross     zSig = bSig - aSig;
   1757  1.1      ross     zExp = bExp;
   1758  1.1      ross     zSign ^= 1;
   1759  1.1      ross     goto normalizeRoundAndPack;
   1760  1.1      ross  aExpBigger:
   1761  1.1      ross     if ( aExp == 0xFF ) {
   1762  1.1      ross         if ( aSig ) return propagateFloat32NaN( a, b );
   1763  1.1      ross         return a;
   1764  1.1      ross     }
   1765  1.1      ross     if ( bExp == 0 ) {
   1766  1.1      ross         --expDiff;
   1767  1.1      ross     }
   1768  1.1      ross     else {
   1769  1.1      ross         bSig |= 0x40000000;
   1770  1.1      ross     }
   1771  1.1      ross     shift32RightJamming( bSig, expDiff, &bSig );
   1772  1.1      ross     aSig |= 0x40000000;
   1773  1.1      ross  aBigger:
   1774  1.1      ross     zSig = aSig - bSig;
   1775  1.1      ross     zExp = aExp;
   1776  1.1      ross  normalizeRoundAndPack:
   1777  1.1      ross     --zExp;
   1778  1.1      ross     return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
   1779  1.1      ross 
   1780  1.1      ross }
   1781  1.1      ross 
   1782  1.7   thorpej /*----------------------------------------------------------------------------
   1783  1.7   thorpej | Returns the result of adding the single-precision floating-point values `a'
   1784  1.7   thorpej | and `b'.  The operation is performed according to the IEC/IEEE Standard for
   1785  1.7   thorpej | Binary Floating-Point Arithmetic.
   1786  1.7   thorpej *----------------------------------------------------------------------------*/
   1787  1.7   thorpej 
   1788  1.1      ross float32 float32_add( float32 a, float32 b )
   1789  1.1      ross {
   1790  1.1      ross     flag aSign, bSign;
   1791  1.1      ross 
   1792  1.1      ross     aSign = extractFloat32Sign( a );
   1793  1.1      ross     bSign = extractFloat32Sign( b );
   1794  1.1      ross     if ( aSign == bSign ) {
   1795  1.1      ross         return addFloat32Sigs( a, b, aSign );
   1796  1.1      ross     }
   1797  1.1      ross     else {
   1798  1.1      ross         return subFloat32Sigs( a, b, aSign );
   1799  1.1      ross     }
   1800  1.1      ross 
   1801  1.1      ross }
   1802  1.1      ross 
   1803  1.7   thorpej /*----------------------------------------------------------------------------
   1804  1.7   thorpej | Returns the result of subtracting the single-precision floating-point values
   1805  1.7   thorpej | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   1806  1.7   thorpej | for Binary Floating-Point Arithmetic.
   1807  1.7   thorpej *----------------------------------------------------------------------------*/
   1808  1.7   thorpej 
   1809  1.1      ross float32 float32_sub( float32 a, float32 b )
   1810  1.1      ross {
   1811  1.1      ross     flag aSign, bSign;
   1812  1.1      ross 
   1813  1.1      ross     aSign = extractFloat32Sign( a );
   1814  1.1      ross     bSign = extractFloat32Sign( b );
   1815  1.1      ross     if ( aSign == bSign ) {
   1816  1.1      ross         return subFloat32Sigs( a, b, aSign );
   1817  1.1      ross     }
   1818  1.1      ross     else {
   1819  1.1      ross         return addFloat32Sigs( a, b, aSign );
   1820  1.1      ross     }
   1821  1.1      ross 
   1822  1.1      ross }
   1823  1.1      ross 
   1824  1.7   thorpej /*----------------------------------------------------------------------------
   1825  1.7   thorpej | Returns the result of multiplying the single-precision floating-point values
   1826  1.7   thorpej | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   1827  1.7   thorpej | for Binary Floating-Point Arithmetic.
   1828  1.7   thorpej *----------------------------------------------------------------------------*/
   1829  1.7   thorpej 
   1830  1.1      ross float32 float32_mul( float32 a, float32 b )
   1831  1.1      ross {
   1832  1.1      ross     flag aSign, bSign, zSign;
   1833  1.1      ross     int16 aExp, bExp, zExp;
   1834  1.1      ross     bits32 aSig, bSig;
   1835  1.1      ross     bits64 zSig64;
   1836  1.1      ross     bits32 zSig;
   1837  1.1      ross 
   1838  1.1      ross     aSig = extractFloat32Frac( a );
   1839  1.1      ross     aExp = extractFloat32Exp( a );
   1840  1.1      ross     aSign = extractFloat32Sign( a );
   1841  1.1      ross     bSig = extractFloat32Frac( b );
   1842  1.1      ross     bExp = extractFloat32Exp( b );
   1843  1.1      ross     bSign = extractFloat32Sign( b );
   1844  1.1      ross     zSign = aSign ^ bSign;
   1845  1.1      ross     if ( aExp == 0xFF ) {
   1846  1.1      ross         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
   1847  1.1      ross             return propagateFloat32NaN( a, b );
   1848  1.1      ross         }
   1849  1.1      ross         if ( ( bExp | bSig ) == 0 ) {
   1850  1.1      ross             float_raise( float_flag_invalid );
   1851  1.1      ross             return float32_default_nan;
   1852  1.1      ross         }
   1853  1.1      ross         return packFloat32( zSign, 0xFF, 0 );
   1854  1.1      ross     }
   1855  1.1      ross     if ( bExp == 0xFF ) {
   1856  1.1      ross         if ( bSig ) return propagateFloat32NaN( a, b );
   1857  1.1      ross         if ( ( aExp | aSig ) == 0 ) {
   1858  1.1      ross             float_raise( float_flag_invalid );
   1859  1.1      ross             return float32_default_nan;
   1860  1.1      ross         }
   1861  1.1      ross         return packFloat32( zSign, 0xFF, 0 );
   1862  1.1      ross     }
   1863  1.1      ross     if ( aExp == 0 ) {
   1864  1.1      ross         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
   1865  1.1      ross         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1866  1.1      ross     }
   1867  1.1      ross     if ( bExp == 0 ) {
   1868  1.1      ross         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
   1869  1.1      ross         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
   1870  1.1      ross     }
   1871  1.1      ross     zExp = aExp + bExp - 0x7F;
   1872  1.1      ross     aSig = ( aSig | 0x00800000 )<<7;
   1873  1.1      ross     bSig = ( bSig | 0x00800000 )<<8;
   1874  1.1      ross     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
   1875  1.1      ross     zSig = zSig64;
   1876  1.1      ross     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
   1877  1.1      ross         zSig <<= 1;
   1878  1.1      ross         --zExp;
   1879  1.1      ross     }
   1880  1.1      ross     return roundAndPackFloat32( zSign, zExp, zSig );
   1881  1.1      ross 
   1882  1.1      ross }
   1883  1.1      ross 
   1884  1.7   thorpej /*----------------------------------------------------------------------------
   1885  1.7   thorpej | Returns the result of dividing the single-precision floating-point value `a'
   1886  1.7   thorpej | by the corresponding value `b'.  The operation is performed according to the
   1887  1.7   thorpej | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1888  1.7   thorpej *----------------------------------------------------------------------------*/
   1889  1.7   thorpej 
   1890  1.1      ross float32 float32_div( float32 a, float32 b )
   1891  1.1      ross {
   1892  1.1      ross     flag aSign, bSign, zSign;
   1893  1.1      ross     int16 aExp, bExp, zExp;
   1894  1.1      ross     bits32 aSig, bSig, zSig;
   1895  1.1      ross 
   1896  1.1      ross     aSig = extractFloat32Frac( a );
   1897  1.1      ross     aExp = extractFloat32Exp( a );
   1898  1.1      ross     aSign = extractFloat32Sign( a );
   1899  1.1      ross     bSig = extractFloat32Frac( b );
   1900  1.1      ross     bExp = extractFloat32Exp( b );
   1901  1.1      ross     bSign = extractFloat32Sign( b );
   1902  1.1      ross     zSign = aSign ^ bSign;
   1903  1.1      ross     if ( aExp == 0xFF ) {
   1904  1.1      ross         if ( aSig ) return propagateFloat32NaN( a, b );
   1905  1.1      ross         if ( bExp == 0xFF ) {
   1906  1.1      ross             if ( bSig ) return propagateFloat32NaN( a, b );
   1907  1.1      ross             float_raise( float_flag_invalid );
   1908  1.1      ross             return float32_default_nan;
   1909  1.1      ross         }
   1910  1.1      ross         return packFloat32( zSign, 0xFF, 0 );
   1911  1.1      ross     }
   1912  1.1      ross     if ( bExp == 0xFF ) {
   1913  1.1      ross         if ( bSig ) return propagateFloat32NaN( a, b );
   1914  1.1      ross         return packFloat32( zSign, 0, 0 );
   1915  1.1      ross     }
   1916  1.1      ross     if ( bExp == 0 ) {
   1917  1.1      ross         if ( bSig == 0 ) {
   1918  1.1      ross             if ( ( aExp | aSig ) == 0 ) {
   1919  1.1      ross                 float_raise( float_flag_invalid );
   1920  1.1      ross                 return float32_default_nan;
   1921  1.1      ross             }
   1922  1.1      ross             float_raise( float_flag_divbyzero );
   1923  1.1      ross             return packFloat32( zSign, 0xFF, 0 );
   1924  1.1      ross         }
   1925  1.1      ross         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
   1926  1.1      ross     }
   1927  1.1      ross     if ( aExp == 0 ) {
   1928  1.1      ross         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
   1929  1.1      ross         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1930  1.1      ross     }
   1931  1.1      ross     zExp = aExp - bExp + 0x7D;
   1932  1.1      ross     aSig = ( aSig | 0x00800000 )<<7;
   1933  1.1      ross     bSig = ( bSig | 0x00800000 )<<8;
   1934  1.1      ross     if ( bSig <= ( aSig + aSig ) ) {
   1935  1.1      ross         aSig >>= 1;
   1936  1.1      ross         ++zExp;
   1937  1.1      ross     }
   1938  1.1      ross     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
   1939  1.1      ross     if ( ( zSig & 0x3F ) == 0 ) {
   1940  1.1      ross         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
   1941  1.1      ross     }
   1942  1.1      ross     return roundAndPackFloat32( zSign, zExp, zSig );
   1943  1.1      ross 
   1944  1.1      ross }
   1945  1.1      ross 
   1946  1.1      ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
   1947  1.7   thorpej /*----------------------------------------------------------------------------
   1948  1.7   thorpej | Returns the remainder of the single-precision floating-point value `a'
   1949  1.7   thorpej | with respect to the corresponding value `b'.  The operation is performed
   1950  1.7   thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   1951  1.7   thorpej *----------------------------------------------------------------------------*/
   1952  1.7   thorpej 
   1953  1.1      ross float32 float32_rem( float32 a, float32 b )
   1954  1.1      ross {
   1955  1.5  christos     flag aSign, bSign __unused, zSign;
   1956  1.1      ross     int16 aExp, bExp, expDiff;
   1957  1.1      ross     bits32 aSig, bSig;
   1958  1.1      ross     bits32 q;
   1959  1.1      ross     bits64 aSig64, bSig64, q64;
   1960  1.1      ross     bits32 alternateASig;
   1961  1.1      ross     sbits32 sigMean;
   1962  1.1      ross 
   1963  1.1      ross     aSig = extractFloat32Frac( a );
   1964  1.1      ross     aExp = extractFloat32Exp( a );
   1965  1.1      ross     aSign = extractFloat32Sign( a );
   1966  1.1      ross     bSig = extractFloat32Frac( b );
   1967  1.1      ross     bExp = extractFloat32Exp( b );
   1968  1.1      ross     bSign = extractFloat32Sign( b );
   1969  1.1      ross     if ( aExp == 0xFF ) {
   1970  1.1      ross         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
   1971  1.1      ross             return propagateFloat32NaN( a, b );
   1972  1.1      ross         }
   1973  1.1      ross         float_raise( float_flag_invalid );
   1974  1.1      ross         return float32_default_nan;
   1975  1.1      ross     }
   1976  1.1      ross     if ( bExp == 0xFF ) {
   1977  1.1      ross         if ( bSig ) return propagateFloat32NaN( a, b );
   1978  1.1      ross         return a;
   1979  1.1      ross     }
   1980  1.1      ross     if ( bExp == 0 ) {
   1981  1.1      ross         if ( bSig == 0 ) {
   1982  1.1      ross             float_raise( float_flag_invalid );
   1983  1.1      ross             return float32_default_nan;
   1984  1.1      ross         }
   1985  1.1      ross         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
   1986  1.1      ross     }
   1987  1.1      ross     if ( aExp == 0 ) {
   1988  1.1      ross         if ( aSig == 0 ) return a;
   1989  1.1      ross         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1990  1.1      ross     }
   1991  1.1      ross     expDiff = aExp - bExp;
   1992  1.1      ross     aSig |= 0x00800000;
   1993  1.1      ross     bSig |= 0x00800000;
   1994  1.1      ross     if ( expDiff < 32 ) {
   1995  1.1      ross         aSig <<= 8;
   1996  1.1      ross         bSig <<= 8;
   1997  1.1      ross         if ( expDiff < 0 ) {
   1998  1.1      ross             if ( expDiff < -1 ) return a;
   1999  1.1      ross             aSig >>= 1;
   2000  1.1      ross         }
   2001  1.1      ross         q = ( bSig <= aSig );
   2002  1.1      ross         if ( q ) aSig -= bSig;
   2003  1.1      ross         if ( 0 < expDiff ) {
   2004  1.1      ross             q = ( ( (bits64) aSig )<<32 ) / bSig;
   2005  1.1      ross             q >>= 32 - expDiff;
   2006  1.1      ross             bSig >>= 2;
   2007  1.1      ross             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
   2008  1.1      ross         }
   2009  1.1      ross         else {
   2010  1.1      ross             aSig >>= 2;
   2011  1.1      ross             bSig >>= 2;
   2012  1.1      ross         }
   2013  1.1      ross     }
   2014  1.1      ross     else {
   2015  1.1      ross         if ( bSig <= aSig ) aSig -= bSig;
   2016  1.1      ross         aSig64 = ( (bits64) aSig )<<40;
   2017  1.1      ross         bSig64 = ( (bits64) bSig )<<40;
   2018  1.1      ross         expDiff -= 64;
   2019  1.1      ross         while ( 0 < expDiff ) {
   2020  1.1      ross             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
   2021  1.1      ross             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
   2022  1.1      ross             aSig64 = - ( ( bSig * q64 )<<38 );
   2023  1.1      ross             expDiff -= 62;
   2024  1.1      ross         }
   2025  1.1      ross         expDiff += 64;
   2026  1.1      ross         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
   2027  1.1      ross         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
   2028  1.1      ross         q = q64>>( 64 - expDiff );
   2029  1.1      ross         bSig <<= 6;
   2030  1.1      ross         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
   2031  1.1      ross     }
   2032  1.1      ross     do {
   2033  1.1      ross         alternateASig = aSig;
   2034  1.1      ross         ++q;
   2035  1.1      ross         aSig -= bSig;
   2036  1.1      ross     } while ( 0 <= (sbits32) aSig );
   2037  1.1      ross     sigMean = aSig + alternateASig;
   2038  1.1      ross     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
   2039  1.1      ross         aSig = alternateASig;
   2040  1.1      ross     }
   2041  1.1      ross     zSign = ( (sbits32) aSig < 0 );
   2042  1.1      ross     if ( zSign ) aSig = - aSig;
   2043  1.1      ross     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
   2044  1.1      ross 
   2045  1.1      ross }
   2046  1.1      ross #endif /* !SOFTFLOAT_FOR_GCC */
   2047  1.1      ross 
   2048  1.1      ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
   2049  1.7   thorpej 
   2050  1.7   thorpej /*----------------------------------------------------------------------------
   2051  1.7   thorpej | Returns the square root of the single-precision floating-point value `a'.
   2052  1.7   thorpej | The operation is performed according to the IEC/IEEE Standard for Binary
   2053  1.7   thorpej | Floating-Point Arithmetic.
   2054  1.7   thorpej *----------------------------------------------------------------------------*/
   2055  1.7   thorpej 
   2056  1.1      ross float32 float32_sqrt( float32 a )
   2057  1.1      ross {
   2058  1.1      ross     flag aSign;
   2059  1.1      ross     int16 aExp, zExp;
   2060  1.1      ross     bits32 aSig, zSig;
   2061  1.1      ross     bits64 rem, term;
   2062  1.1      ross 
   2063  1.1      ross     aSig = extractFloat32Frac( a );
   2064  1.1      ross     aExp = extractFloat32Exp( a );
   2065  1.1      ross     aSign = extractFloat32Sign( a );
   2066  1.1      ross     if ( aExp == 0xFF ) {
   2067  1.1      ross         if ( aSig ) return propagateFloat32NaN( a, 0 );
   2068  1.1      ross         if ( ! aSign ) return a;
   2069  1.1      ross         float_raise( float_flag_invalid );
   2070  1.1      ross         return float32_default_nan;
   2071  1.1      ross     }
   2072  1.1      ross     if ( aSign ) {
   2073  1.1      ross         if ( ( aExp | aSig ) == 0 ) return a;
   2074  1.1      ross         float_raise( float_flag_invalid );
   2075  1.1      ross         return float32_default_nan;
   2076  1.1      ross     }
   2077  1.1      ross     if ( aExp == 0 ) {
   2078  1.1      ross         if ( aSig == 0 ) return 0;
   2079  1.1      ross         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   2080  1.1      ross     }
   2081  1.1      ross     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
   2082  1.1      ross     aSig = ( aSig | 0x00800000 )<<8;
   2083  1.1      ross     zSig = estimateSqrt32( aExp, aSig ) + 2;
   2084  1.1      ross     if ( ( zSig & 0x7F ) <= 5 ) {
   2085  1.1      ross         if ( zSig < 2 ) {
   2086  1.1      ross             zSig = 0x7FFFFFFF;
   2087  1.1      ross             goto roundAndPack;
   2088  1.1      ross         }
   2089  1.1      ross         aSig >>= aExp & 1;
   2090  1.1      ross         term = ( (bits64) zSig ) * zSig;
   2091  1.1      ross         rem = ( ( (bits64) aSig )<<32 ) - term;
   2092  1.1      ross         while ( (sbits64) rem < 0 ) {
   2093  1.1      ross             --zSig;
   2094  1.1      ross             rem += ( ( (bits64) zSig )<<1 ) | 1;
   2095  1.1      ross         }
   2096  1.1      ross         zSig |= ( rem != 0 );
   2097  1.1      ross     }
   2098  1.1      ross     shift32RightJamming( zSig, 1, &zSig );
   2099  1.1      ross  roundAndPack:
   2100  1.1      ross     return roundAndPackFloat32( 0, zExp, zSig );
   2101  1.1      ross 
   2102  1.1      ross }
   2103  1.1      ross #endif /* !SOFTFLOAT_FOR_GCC */
   2104  1.1      ross 
   2105  1.7   thorpej /*----------------------------------------------------------------------------
   2106  1.7   thorpej | Returns 1 if the single-precision floating-point value `a' is equal to
   2107  1.7   thorpej | the corresponding value `b', and 0 otherwise.  The comparison is performed
   2108  1.7   thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2109  1.7   thorpej *----------------------------------------------------------------------------*/
   2110  1.7   thorpej 
   2111  1.1      ross flag float32_eq( float32 a, float32 b )
   2112  1.1      ross {
   2113  1.1      ross 
   2114  1.1      ross     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2115  1.1      ross          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2116  1.1      ross        ) {
   2117  1.1      ross         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
   2118  1.1      ross             float_raise( float_flag_invalid );
   2119  1.1      ross         }
   2120  1.1      ross         return 0;
   2121  1.1      ross     }
   2122  1.1      ross     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
   2123  1.1      ross 
   2124  1.1      ross }
   2125  1.1      ross 
   2126  1.7   thorpej /*----------------------------------------------------------------------------
   2127  1.7   thorpej | Returns 1 if the single-precision floating-point value `a' is less than
   2128  1.7   thorpej | or equal to the corresponding value `b', and 0 otherwise.  The comparison
   2129  1.7   thorpej | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   2130  1.7   thorpej | Arithmetic.
   2131  1.7   thorpej *----------------------------------------------------------------------------*/
   2132  1.7   thorpej 
   2133  1.1      ross flag float32_le( float32 a, float32 b )
   2134  1.1      ross {
   2135  1.1      ross     flag aSign, bSign;
   2136  1.1      ross 
   2137  1.1      ross     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2138  1.1      ross          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2139  1.1      ross        ) {
   2140  1.1      ross         float_raise( float_flag_invalid );
   2141  1.1      ross         return 0;
   2142  1.1      ross     }
   2143  1.1      ross     aSign = extractFloat32Sign( a );
   2144  1.1      ross     bSign = extractFloat32Sign( b );
   2145  1.1      ross     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
   2146  1.1      ross     return ( a == b ) || ( aSign ^ ( a < b ) );
   2147  1.1      ross 
   2148  1.1      ross }
   2149  1.1      ross 
   2150  1.7   thorpej /*----------------------------------------------------------------------------
   2151  1.7   thorpej | Returns 1 if the single-precision floating-point value `a' is less than
   2152  1.7   thorpej | the corresponding value `b', and 0 otherwise.  The comparison is performed
   2153  1.7   thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2154  1.7   thorpej *----------------------------------------------------------------------------*/
   2155  1.7   thorpej 
   2156  1.1      ross flag float32_lt( float32 a, float32 b )
   2157  1.1      ross {
   2158  1.1      ross     flag aSign, bSign;
   2159  1.1      ross 
   2160  1.1      ross     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2161  1.1      ross          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2162  1.1      ross        ) {
   2163  1.1      ross         float_raise( float_flag_invalid );
   2164  1.1      ross         return 0;
   2165  1.1      ross     }
   2166  1.1      ross     aSign = extractFloat32Sign( a );
   2167  1.1      ross     bSign = extractFloat32Sign( b );
   2168  1.1      ross     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
   2169  1.1      ross     return ( a != b ) && ( aSign ^ ( a < b ) );
   2170  1.1      ross 
   2171  1.1      ross }
   2172  1.1      ross 
   2173  1.1      ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
   2174  1.7   thorpej /*----------------------------------------------------------------------------
   2175  1.7   thorpej | Returns 1 if the single-precision floating-point value `a' is equal to
   2176  1.7   thorpej | the corresponding value `b', and 0 otherwise.  The invalid exception is
   2177  1.7   thorpej | raised if either operand is a NaN.  Otherwise, the comparison is performed
   2178  1.7   thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2179  1.7   thorpej *----------------------------------------------------------------------------*/
   2180  1.7   thorpej 
   2181  1.1      ross flag float32_eq_signaling( float32 a, float32 b )
   2182  1.1      ross {
   2183  1.1      ross 
   2184  1.1      ross     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2185  1.1      ross          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2186  1.1      ross        ) {
   2187  1.1      ross         float_raise( float_flag_invalid );
   2188  1.1      ross         return 0;
   2189  1.1      ross     }
   2190  1.1      ross     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
   2191  1.1      ross 
   2192  1.1      ross }
   2193  1.1      ross 
   2194  1.7   thorpej /*----------------------------------------------------------------------------
   2195  1.7   thorpej | Returns 1 if the single-precision floating-point value `a' is less than or
   2196  1.7   thorpej | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
   2197  1.7   thorpej | cause an exception.  Otherwise, the comparison is performed according to the
   2198  1.7   thorpej | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2199  1.7   thorpej *----------------------------------------------------------------------------*/
   2200  1.7   thorpej 
   2201  1.1      ross flag float32_le_quiet( float32 a, float32 b )
   2202  1.1      ross {
   2203  1.1      ross     flag aSign, bSign;
   2204  1.1      ross 
   2205  1.1      ross     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2206  1.1      ross          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2207  1.1      ross        ) {
   2208  1.1      ross         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
   2209  1.1      ross             float_raise( float_flag_invalid );
   2210  1.1      ross         }
   2211  1.1      ross         return 0;
   2212  1.1      ross     }
   2213  1.1      ross     aSign = extractFloat32Sign( a );
   2214  1.1      ross     bSign = extractFloat32Sign( b );
   2215  1.1      ross     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
   2216  1.1      ross     return ( a == b ) || ( aSign ^ ( a < b ) );
   2217  1.1      ross 
   2218  1.1      ross }
   2219  1.1      ross 
   2220  1.7   thorpej /*----------------------------------------------------------------------------
   2221  1.7   thorpej | Returns 1 if the single-precision floating-point value `a' is less than
   2222  1.7   thorpej | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
   2223  1.7   thorpej | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
   2224  1.7   thorpej | Standard for Binary Floating-Point Arithmetic.
   2225  1.7   thorpej *----------------------------------------------------------------------------*/
   2226  1.7   thorpej 
   2227  1.1      ross flag float32_lt_quiet( float32 a, float32 b )
   2228  1.1      ross {
   2229  1.1      ross     flag aSign, bSign;
   2230  1.1      ross 
   2231  1.1      ross     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   2232  1.1      ross          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   2233  1.1      ross        ) {
   2234  1.1      ross         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
   2235  1.1      ross             float_raise( float_flag_invalid );
   2236  1.1      ross         }
   2237  1.1      ross         return 0;
   2238  1.1      ross     }
   2239  1.1      ross     aSign = extractFloat32Sign( a );
   2240  1.1      ross     bSign = extractFloat32Sign( b );
   2241  1.1      ross     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
   2242  1.1      ross     return ( a != b ) && ( aSign ^ ( a < b ) );
   2243  1.1      ross 
   2244  1.1      ross }
   2245  1.1      ross #endif /* !SOFTFLOAT_FOR_GCC */
   2246  1.1      ross 
   2247  1.1      ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
   2248  1.7   thorpej /*----------------------------------------------------------------------------
   2249  1.7   thorpej | Returns the result of converting the double-precision floating-point value
   2250  1.7   thorpej | `a' to the 32-bit two's complement integer format.  The conversion is
   2251  1.7   thorpej | performed according to the IEC/IEEE Standard for Binary Floating-Point
   2252  1.7   thorpej | Arithmetic---which means in particular that the conversion is rounded
   2253  1.7   thorpej | according to the current rounding mode.  If `a' is a NaN, the largest
   2254  1.7   thorpej | positive integer is returned.  Otherwise, if the conversion overflows, the
   2255  1.7   thorpej | largest integer with the same sign as `a' is returned.
   2256  1.7   thorpej *----------------------------------------------------------------------------*/
   2257  1.7   thorpej 
   2258  1.1      ross int32 float64_to_int32( float64 a )
   2259  1.1      ross {
   2260  1.1      ross     flag aSign;
   2261  1.1      ross     int16 aExp, shiftCount;
   2262  1.1      ross     bits64 aSig;
   2263  1.1      ross 
   2264  1.1      ross     aSig = extractFloat64Frac( a );
   2265  1.1      ross     aExp = extractFloat64Exp( a );
   2266  1.1      ross     aSign = extractFloat64Sign( a );
   2267  1.1      ross     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
   2268  1.1      ross     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
   2269  1.1      ross     shiftCount = 0x42C - aExp;
   2270  1.1      ross     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
   2271  1.1      ross     return roundAndPackInt32( aSign, aSig );
   2272  1.1      ross 
   2273  1.1      ross }
   2274  1.1      ross #endif /* !SOFTFLOAT_FOR_GCC */
   2275  1.1      ross 
   2276  1.7   thorpej /*----------------------------------------------------------------------------
   2277  1.7   thorpej | Returns the result of converting the double-precision floating-point value
   2278  1.7   thorpej | `a' to the 32-bit two's complement integer format.  The conversion is
   2279  1.7   thorpej | performed according to the IEC/IEEE Standard for Binary Floating-Point
   2280  1.7   thorpej | Arithmetic, except that the conversion is always rounded toward zero.
   2281  1.7   thorpej | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   2282  1.7   thorpej | the conversion overflows, the largest integer with the same sign as `a' is
   2283  1.7   thorpej | returned.
   2284  1.7   thorpej *----------------------------------------------------------------------------*/
   2285  1.7   thorpej 
   2286  1.1      ross int32 float64_to_int32_round_to_zero( float64 a )
   2287  1.1      ross {
   2288  1.1      ross     flag aSign;
   2289  1.1      ross     int16 aExp, shiftCount;
   2290  1.1      ross     bits64 aSig, savedASig;
   2291  1.1      ross     int32 z;
   2292  1.1      ross 
   2293  1.1      ross     aSig = extractFloat64Frac( a );
   2294  1.1      ross     aExp = extractFloat64Exp( a );
   2295  1.1      ross     aSign = extractFloat64Sign( a );
   2296  1.1      ross     if ( 0x41E < aExp ) {
   2297  1.1      ross         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
   2298  1.1      ross         goto invalid;
   2299  1.1      ross     }
   2300  1.1      ross     else if ( aExp < 0x3FF ) {
   2301  1.1      ross         if ( aExp || aSig ) float_set_inexact();
   2302  1.1      ross         return 0;
   2303  1.1      ross     }
   2304  1.1      ross     aSig |= LIT64( 0x0010000000000000 );
   2305  1.1      ross     shiftCount = 0x433 - aExp;
   2306  1.1      ross     savedASig = aSig;
   2307  1.1      ross     aSig >>= shiftCount;
   2308  1.1      ross     z = aSig;
   2309  1.1      ross     if ( aSign ) z = - z;
   2310  1.1      ross     if ( ( z < 0 ) ^ aSign ) {
   2311  1.1      ross  invalid:
   2312  1.1      ross         float_raise( float_flag_invalid );
   2313  1.1      ross         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
   2314  1.1      ross     }
   2315  1.1      ross     if ( ( aSig<<shiftCount ) != savedASig ) {
   2316  1.1      ross         float_set_inexact();
   2317  1.1      ross     }
   2318  1.1      ross     return z;
   2319  1.1      ross 
   2320  1.1      ross }
   2321  1.1      ross 
   2322  1.1      ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
   2323  1.7   thorpej /*----------------------------------------------------------------------------
   2324  1.7   thorpej | Returns the result of converting the double-precision floating-point value
   2325  1.7   thorpej | `a' to the 64-bit two's complement integer format.  The conversion is
   2326  1.7   thorpej | performed according to the IEC/IEEE Standard for Binary Floating-Point
   2327  1.7   thorpej | Arithmetic---which means in particular that the conversion is rounded
   2328  1.7   thorpej | according to the current rounding mode.  If `a' is a NaN, the largest
   2329  1.7   thorpej | positive integer is returned.  Otherwise, if the conversion overflows, the
   2330  1.7   thorpej | largest integer with the same sign as `a' is returned.
   2331  1.7   thorpej *----------------------------------------------------------------------------*/
   2332  1.7   thorpej 
   2333  1.1      ross int64 float64_to_int64( float64 a )
   2334  1.1      ross {
   2335  1.1      ross     flag aSign;
   2336  1.1      ross     int16 aExp, shiftCount;
   2337  1.1      ross     bits64 aSig, aSigExtra;
   2338  1.1      ross 
   2339  1.1      ross     aSig = extractFloat64Frac( a );
   2340  1.1      ross     aExp = extractFloat64Exp( a );
   2341  1.1      ross     aSign = extractFloat64Sign( a );
   2342  1.1      ross     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
   2343  1.1      ross     shiftCount = 0x433 - aExp;
   2344  1.1      ross     if ( shiftCount <= 0 ) {
   2345  1.1      ross         if ( 0x43E < aExp ) {
   2346  1.1      ross             float_raise( float_flag_invalid );
   2347  1.1      ross             if (    ! aSign
   2348  1.1      ross                  || (    ( aExp == 0x7FF )
   2349  1.1      ross                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
   2350  1.1      ross                ) {
   2351  1.1      ross                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   2352  1.1      ross             }
   2353  1.1      ross             return (sbits64) LIT64( 0x8000000000000000 );
   2354  1.1      ross         }
   2355  1.1      ross         aSigExtra = 0;
   2356  1.1      ross         aSig <<= - shiftCount;
   2357  1.1      ross     }
   2358  1.1      ross     else {
   2359  1.1      ross         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
   2360  1.1      ross     }
   2361  1.1      ross     return roundAndPackInt64( aSign, aSig, aSigExtra );
   2362  1.1      ross 
   2363  1.1      ross }
   2364  1.1      ross 
   2365  1.6    martin /* like above, but result is unsigned */
   2366  1.6    martin uint64 float64_to_uint64( float64 a )
   2367  1.6    martin {
   2368  1.6    martin     flag aSign;
   2369  1.6    martin     int16 aExp, shiftCount;
   2370  1.6    martin     bits64 aSig, aSigExtra;
   2371  1.6    martin 
   2372  1.6    martin     aSig = extractFloat64Frac( a );
   2373  1.6    martin     aExp = extractFloat64Exp( a );
   2374  1.6    martin     aSign = extractFloat64Sign( a );
   2375  1.6    martin 
   2376  1.6    martin     if (aSign) {
   2377  1.6    martin 	return float64_to_int64(a);
   2378  1.6    martin     }
   2379  1.6    martin 
   2380  1.6    martin     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
   2381  1.6    martin     shiftCount = 0x433 - aExp;
   2382  1.6    martin     if ( shiftCount <= 0 ) {
   2383  1.6    martin         if ( 0x43E < aExp ) {
   2384  1.6    martin             float_raise( float_flag_invalid );
   2385  1.6    martin             if (    ! aSign
   2386  1.6    martin                  || (    ( aExp == 0x7FF )
   2387  1.6    martin                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
   2388  1.6    martin                ) {
   2389  1.6    martin                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   2390  1.6    martin             }
   2391  1.6    martin             return (sbits64) LIT64( 0x8000000000000000 );
   2392  1.6    martin         }
   2393  1.6    martin         aSigExtra = 0;
   2394  1.6    martin         aSig <<= - shiftCount;
   2395  1.6    martin     }
   2396  1.6    martin     else {
   2397  1.6    martin         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
   2398  1.6    martin     }
   2399  1.6    martin     return roundAndPackUInt64( aSig, aSigExtra );
   2400  1.6    martin 
   2401  1.6    martin }
   2402  1.6    martin 
   2403  1.7   thorpej /*----------------------------------------------------------------------------
   2404  1.7   thorpej | Returns the result of converting the double-precision floating-point value
   2405  1.7   thorpej | `a' to the 64-bit two's complement integer format.  The conversion is
   2406  1.7   thorpej | performed according to the IEC/IEEE Standard for Binary Floating-Point
   2407  1.7   thorpej | Arithmetic, except that the conversion is always rounded toward zero.
   2408  1.7   thorpej | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   2409  1.7   thorpej | the conversion overflows, the largest integer with the same sign as `a' is
   2410  1.7   thorpej | returned.
   2411  1.7   thorpej *----------------------------------------------------------------------------*/
   2412  1.7   thorpej 
   2413  1.1      ross int64 float64_to_int64_round_to_zero( float64 a )
   2414  1.1      ross {
   2415  1.1      ross     flag aSign;
   2416  1.1      ross     int16 aExp, shiftCount;
   2417  1.1      ross     bits64 aSig;
   2418  1.1      ross     int64 z;
   2419  1.1      ross 
   2420  1.1      ross     aSig = extractFloat64Frac( a );
   2421  1.1      ross     aExp = extractFloat64Exp( a );
   2422  1.1      ross     aSign = extractFloat64Sign( a );
   2423  1.1      ross     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
   2424  1.1      ross     shiftCount = aExp - 0x433;
   2425  1.1      ross     if ( 0 <= shiftCount ) {
   2426  1.1      ross         if ( 0x43E <= aExp ) {
   2427  1.1      ross             if ( a != LIT64( 0xC3E0000000000000 ) ) {
   2428  1.1      ross                 float_raise( float_flag_invalid );
   2429  1.1      ross                 if (    ! aSign
   2430  1.1      ross                      || (    ( aExp == 0x7FF )
   2431  1.1      ross                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
   2432  1.1      ross                    ) {
   2433  1.1      ross                     return LIT64( 0x7FFFFFFFFFFFFFFF );
   2434  1.1      ross                 }
   2435  1.1      ross             }
   2436  1.1      ross             return (sbits64) LIT64( 0x8000000000000000 );
   2437  1.1      ross         }
   2438  1.1      ross         z = aSig<<shiftCount;
   2439  1.1      ross     }
   2440  1.1      ross     else {
   2441  1.1      ross         if ( aExp < 0x3FE ) {
   2442  1.1      ross             if ( aExp | aSig ) float_set_inexact();
   2443  1.1      ross             return 0;
   2444  1.1      ross         }
   2445  1.1      ross         z = aSig>>( - shiftCount );
   2446  1.1      ross         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
   2447  1.1      ross             float_set_inexact();
   2448  1.1      ross         }
   2449  1.1      ross     }
   2450  1.1      ross     if ( aSign ) z = - z;
   2451  1.1      ross     return z;
   2452  1.1      ross 
   2453  1.1      ross }
   2454  1.1      ross #endif /* !SOFTFLOAT_FOR_GCC */
   2455  1.1      ross 
   2456  1.7   thorpej /*----------------------------------------------------------------------------
   2457  1.7   thorpej | Returns the result of converting the double-precision floating-point value
   2458  1.7   thorpej | `a' to the single-precision floating-point format.  The conversion is
   2459  1.7   thorpej | performed according to the IEC/IEEE Standard for Binary Floating-Point
   2460  1.7   thorpej | Arithmetic.
   2461  1.7   thorpej *----------------------------------------------------------------------------*/
   2462  1.7   thorpej 
   2463  1.1      ross float32 float64_to_float32( float64 a )
   2464  1.1      ross {
   2465  1.1      ross     flag aSign;
   2466  1.1      ross     int16 aExp;
   2467  1.1      ross     bits64 aSig;
   2468  1.1      ross     bits32 zSig;
   2469  1.1      ross 
   2470  1.1      ross     aSig = extractFloat64Frac( a );
   2471  1.1      ross     aExp = extractFloat64Exp( a );
   2472  1.1      ross     aSign = extractFloat64Sign( a );
   2473  1.1      ross     if ( aExp == 0x7FF ) {
   2474  1.1      ross         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
   2475  1.1      ross         return packFloat32( aSign, 0xFF, 0 );
   2476  1.1      ross     }
   2477  1.1      ross     shift64RightJamming( aSig, 22, &aSig );
   2478  1.1      ross     zSig = aSig;
   2479  1.1      ross     if ( aExp || zSig ) {
   2480  1.1      ross         zSig |= 0x40000000;
   2481  1.1      ross         aExp -= 0x381;
   2482  1.1      ross     }
   2483  1.1      ross     return roundAndPackFloat32( aSign, aExp, zSig );
   2484  1.1      ross 
   2485  1.1      ross }
   2486  1.1      ross 
   2487  1.1      ross #ifdef FLOATX80
   2488  1.1      ross 
   2489  1.7   thorpej /*----------------------------------------------------------------------------
   2490  1.7   thorpej | Returns the result of converting the double-precision floating-point value
   2491  1.7   thorpej | `a' to the extended double-precision floating-point format.  The conversion
   2492  1.7   thorpej | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   2493  1.7   thorpej | Arithmetic.
   2494  1.7   thorpej *----------------------------------------------------------------------------*/
   2495  1.7   thorpej 
   2496  1.1      ross floatx80 float64_to_floatx80( float64 a )
   2497  1.1      ross {
   2498  1.1      ross     flag aSign;
   2499  1.1      ross     int16 aExp;
   2500  1.1      ross     bits64 aSig;
   2501  1.1      ross 
   2502  1.1      ross     aSig = extractFloat64Frac( a );
   2503  1.1      ross     aExp = extractFloat64Exp( a );
   2504  1.1      ross     aSign = extractFloat64Sign( a );
   2505  1.1      ross     if ( aExp == 0x7FF ) {
   2506  1.1      ross         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
   2507  1.1      ross         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   2508  1.1      ross     }
   2509  1.1      ross     if ( aExp == 0 ) {
   2510  1.1      ross         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
   2511  1.1      ross         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   2512  1.1      ross     }
   2513  1.1      ross     return
   2514  1.1      ross         packFloatx80(
   2515  1.1      ross             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
   2516  1.1      ross 
   2517  1.1      ross }
   2518  1.1      ross 
   2519  1.1      ross #endif
   2520  1.1      ross 
   2521  1.1      ross #ifdef FLOAT128
   2522  1.1      ross 
   2523  1.7   thorpej /*----------------------------------------------------------------------------
   2524  1.7   thorpej | Returns the result of converting the double-precision floating-point value
   2525  1.7   thorpej | `a' to the quadruple-precision floating-point format.  The conversion is
   2526  1.7   thorpej | performed according to the IEC/IEEE Standard for Binary Floating-Point
   2527  1.7   thorpej | Arithmetic.
   2528  1.7   thorpej *----------------------------------------------------------------------------*/
   2529  1.7   thorpej 
   2530  1.1      ross float128 float64_to_float128( float64 a )
   2531  1.1      ross {
   2532  1.1      ross     flag aSign;
   2533  1.1      ross     int16 aExp;
   2534  1.1      ross     bits64 aSig, zSig0, zSig1;
   2535  1.1      ross 
   2536  1.1      ross     aSig = extractFloat64Frac( a );
   2537  1.1      ross     aExp = extractFloat64Exp( a );
   2538  1.1      ross     aSign = extractFloat64Sign( a );
   2539  1.1      ross     if ( aExp == 0x7FF ) {
   2540  1.1      ross         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
   2541  1.1      ross         return packFloat128( aSign, 0x7FFF, 0, 0 );
   2542  1.1      ross     }
   2543  1.1      ross     if ( aExp == 0 ) {
   2544  1.1      ross         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
   2545  1.1      ross         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   2546  1.1      ross         --aExp;
   2547  1.1      ross     }
   2548  1.1      ross     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
   2549  1.1      ross     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
   2550  1.1      ross 
   2551  1.1      ross }
   2552  1.1      ross 
   2553  1.1      ross #endif
   2554  1.1      ross 
   2555  1.1      ross #ifndef SOFTFLOAT_FOR_GCC
   2556  1.7   thorpej /*----------------------------------------------------------------------------
   2557  1.7   thorpej | Rounds the double-precision floating-point value `a' to an integer, and
   2558  1.7   thorpej | returns the result as a double-precision floating-point value.  The
   2559  1.7   thorpej | operation is performed according to the IEC/IEEE Standard for Binary
   2560  1.7   thorpej | Floating-Point Arithmetic.
   2561  1.7   thorpej *----------------------------------------------------------------------------*/
   2562  1.7   thorpej 
   2563  1.1      ross float64 float64_round_to_int( float64 a )
   2564  1.1      ross {
   2565  1.1      ross     flag aSign;
   2566  1.1      ross     int16 aExp;
   2567  1.1      ross     bits64 lastBitMask, roundBitsMask;
   2568  1.1      ross     int8 roundingMode;
   2569  1.1      ross     float64 z;
   2570  1.1      ross 
   2571  1.1      ross     aExp = extractFloat64Exp( a );
   2572  1.1      ross     if ( 0x433 <= aExp ) {
   2573  1.1      ross         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
   2574  1.1      ross             return propagateFloat64NaN( a, a );
   2575  1.1      ross         }
   2576  1.1      ross         return a;
   2577  1.1      ross     }
   2578  1.1      ross     if ( aExp < 0x3FF ) {
   2579  1.1      ross         if ( (bits64) ( a<<1 ) == 0 ) return a;
   2580  1.1      ross         float_set_inexact();
   2581  1.1      ross         aSign = extractFloat64Sign( a );
   2582  1.1      ross         switch ( float_rounding_mode() ) {
   2583  1.1      ross          case float_round_nearest_even:
   2584  1.1      ross             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
   2585  1.1      ross                 return packFloat64( aSign, 0x3FF, 0 );
   2586  1.1      ross             }
   2587  1.1      ross             break;
   2588  1.1      ross          case float_round_down:
   2589  1.1      ross             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
   2590  1.1      ross          case float_round_up:
   2591  1.1      ross             return
   2592  1.1      ross             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
   2593  1.1      ross         }
   2594  1.1      ross         return packFloat64( aSign, 0, 0 );
   2595  1.1      ross     }
   2596  1.1      ross     lastBitMask = 1;
   2597  1.1      ross     lastBitMask <<= 0x433 - aExp;
   2598  1.1      ross     roundBitsMask = lastBitMask - 1;
   2599  1.1      ross     z = a;
   2600  1.1      ross     roundingMode = float_rounding_mode();
   2601  1.1      ross     if ( roundingMode == float_round_nearest_even ) {
   2602  1.1      ross         z += lastBitMask>>1;
   2603  1.1      ross         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
   2604  1.1      ross     }
   2605  1.1      ross     else if ( roundingMode != float_round_to_zero ) {
   2606  1.1      ross         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
   2607  1.1      ross             z += roundBitsMask;
   2608  1.1      ross         }
   2609  1.1      ross     }
   2610  1.1      ross     z &= ~ roundBitsMask;
   2611  1.1      ross     if ( z != a ) float_set_inexact();
   2612  1.1      ross     return z;
   2613  1.1      ross 
   2614  1.1      ross }
   2615  1.1      ross #endif
   2616  1.1      ross 
   2617  1.7   thorpej /*----------------------------------------------------------------------------
   2618  1.7   thorpej | Returns the result of adding the absolute values of the double-precision
   2619  1.7   thorpej | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
   2620  1.7   thorpej | before being returned.  `zSign' is ignored if the result is a NaN.
   2621  1.7   thorpej | The addition is performed according to the IEC/IEEE Standard for Binary
   2622  1.7   thorpej | Floating-Point Arithmetic.
   2623  1.7   thorpej *----------------------------------------------------------------------------*/
   2624  1.7   thorpej 
   2625  1.1      ross static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
   2626  1.1      ross {
   2627  1.1      ross     int16 aExp, bExp, zExp;
   2628  1.1      ross     bits64 aSig, bSig, zSig;
   2629  1.1      ross     int16 expDiff;
   2630  1.1      ross 
   2631  1.1      ross     aSig = extractFloat64Frac( a );
   2632  1.1      ross     aExp = extractFloat64Exp( a );
   2633  1.1      ross     bSig = extractFloat64Frac( b );
   2634  1.1      ross     bExp = extractFloat64Exp( b );
   2635  1.1      ross     expDiff = aExp - bExp;
   2636  1.1      ross     aSig <<= 9;
   2637  1.1      ross     bSig <<= 9;
   2638  1.1      ross     if ( 0 < expDiff ) {
   2639  1.1      ross         if ( aExp == 0x7FF ) {
   2640  1.1      ross             if ( aSig ) return propagateFloat64NaN( a, b );
   2641  1.1      ross             return a;
   2642  1.1      ross         }
   2643  1.1      ross         if ( bExp == 0 ) {
   2644  1.1      ross             --expDiff;
   2645  1.1      ross         }
   2646  1.1      ross         else {
   2647  1.1      ross             bSig |= LIT64( 0x2000000000000000 );
   2648  1.1      ross         }
   2649  1.1      ross         shift64RightJamming( bSig, expDiff, &bSig );
   2650  1.1      ross         zExp = aExp;
   2651  1.1      ross     }
   2652  1.1      ross     else if ( expDiff < 0 ) {
   2653  1.1      ross         if ( bExp == 0x7FF ) {
   2654  1.1      ross             if ( bSig ) return propagateFloat64NaN( a, b );
   2655  1.1      ross             return packFloat64( zSign, 0x7FF, 0 );
   2656  1.1      ross         }
   2657  1.1      ross         if ( aExp == 0 ) {
   2658  1.1      ross             ++expDiff;
   2659  1.1      ross         }
   2660  1.1      ross         else {
   2661  1.1      ross             aSig |= LIT64( 0x2000000000000000 );
   2662  1.1      ross         }
   2663  1.1      ross         shift64RightJamming( aSig, - expDiff, &aSig );
   2664  1.1      ross         zExp = bExp;
   2665  1.1      ross     }
   2666  1.1      ross     else {
   2667  1.1      ross         if ( aExp == 0x7FF ) {
   2668  1.1      ross             if ( aSig | bSig ) return propagateFloat64NaN( a, b );
   2669  1.1      ross             return a;
   2670  1.1      ross         }
   2671  1.1      ross         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
   2672  1.1      ross         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
   2673  1.1      ross         zExp = aExp;
   2674  1.1      ross         goto roundAndPack;
   2675  1.1      ross     }
   2676  1.1      ross     aSig |= LIT64( 0x2000000000000000 );
   2677  1.1      ross     zSig = ( aSig + bSig )<<1;
   2678  1.1      ross     --zExp;
   2679  1.1      ross     if ( (sbits64) zSig < 0 ) {
   2680  1.1      ross         zSig = aSig + bSig;
   2681  1.1      ross         ++zExp;
   2682  1.1      ross     }
   2683  1.1      ross  roundAndPack:
   2684  1.1      ross     return roundAndPackFloat64( zSign, zExp, zSig );
   2685  1.1      ross 
   2686  1.1      ross }
   2687  1.1      ross 
   2688  1.7   thorpej /*----------------------------------------------------------------------------
   2689  1.7   thorpej | Returns the result of subtracting the absolute values of the double-
   2690  1.7   thorpej | precision floating-point values `a' and `b'.  If `zSign' is 1, the
   2691  1.7   thorpej | difference is negated before being returned.  `zSign' is ignored if the
   2692  1.7   thorpej | result is a NaN.  The subtraction is performed according to the IEC/IEEE
   2693  1.7   thorpej | Standard for Binary Floating-Point Arithmetic.
   2694  1.7   thorpej *----------------------------------------------------------------------------*/
   2695  1.7   thorpej 
   2696  1.1      ross static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
   2697  1.1      ross {
   2698  1.1      ross     int16 aExp, bExp, zExp;
   2699  1.1      ross     bits64 aSig, bSig, zSig;
   2700  1.1      ross     int16 expDiff;
   2701  1.1      ross 
   2702  1.1      ross     aSig = extractFloat64Frac( a );
   2703  1.1      ross     aExp = extractFloat64Exp( a );
   2704  1.1      ross     bSig = extractFloat64Frac( b );
   2705  1.1      ross     bExp = extractFloat64Exp( b );
   2706  1.1      ross     expDiff = aExp - bExp;
   2707  1.1      ross     aSig <<= 10;
   2708  1.1      ross     bSig <<= 10;
   2709  1.1      ross     if ( 0 < expDiff ) goto aExpBigger;
   2710  1.1      ross     if ( expDiff < 0 ) goto bExpBigger;
   2711  1.1      ross     if ( aExp == 0x7FF ) {
   2712  1.1      ross         if ( aSig | bSig ) return propagateFloat64NaN( a, b );
   2713  1.1      ross         float_raise( float_flag_invalid );
   2714  1.1      ross         return float64_default_nan;
   2715  1.1      ross     }
   2716  1.1      ross     if ( aExp == 0 ) {
   2717  1.1      ross         aExp = 1;
   2718  1.1      ross         bExp = 1;
   2719  1.1      ross     }
   2720  1.1      ross     if ( bSig < aSig ) goto aBigger;
   2721  1.1      ross     if ( aSig < bSig ) goto bBigger;
   2722  1.1      ross     return packFloat64( float_rounding_mode() == float_round_down, 0, 0 );
   2723  1.1      ross  bExpBigger:
   2724  1.1      ross     if ( bExp == 0x7FF ) {
   2725  1.1      ross         if ( bSig ) return propagateFloat64NaN( a, b );
   2726  1.1      ross         return packFloat64( zSign ^ 1, 0x7FF, 0 );
   2727  1.1      ross     }
   2728  1.1      ross     if ( aExp == 0 ) {
   2729  1.1      ross         ++expDiff;
   2730  1.1      ross     }
   2731  1.1      ross     else {
   2732  1.1      ross         aSig |= LIT64( 0x4000000000000000 );
   2733  1.1      ross     }
   2734  1.1      ross     shift64RightJamming( aSig, - expDiff, &aSig );
   2735  1.1      ross     bSig |= LIT64( 0x4000000000000000 );
   2736  1.1      ross  bBigger:
   2737  1.1      ross     zSig = bSig - aSig;
   2738  1.1      ross     zExp = bExp;
   2739  1.1      ross     zSign ^= 1;
   2740  1.1      ross     goto normalizeRoundAndPack;
   2741  1.1      ross  aExpBigger:
   2742  1.1      ross     if ( aExp == 0x7FF ) {
   2743  1.1      ross         if ( aSig ) return propagateFloat64NaN( a, b );
   2744  1.1      ross         return a;
   2745  1.1      ross     }
   2746  1.1      ross     if ( bExp == 0 ) {
   2747  1.1      ross         --expDiff;
   2748  1.1      ross     }
   2749  1.1      ross     else {
   2750  1.1      ross         bSig |= LIT64( 0x4000000000000000 );
   2751  1.1      ross     }
   2752  1.1      ross     shift64RightJamming( bSig, expDiff, &bSig );
   2753  1.1      ross     aSig |= LIT64( 0x4000000000000000 );
   2754  1.1      ross  aBigger:
   2755  1.1      ross     zSig = aSig - bSig;
   2756  1.1      ross     zExp = aExp;
   2757  1.1      ross  normalizeRoundAndPack:
   2758  1.1      ross     --zExp;
   2759  1.1      ross     return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
   2760  1.1      ross 
   2761  1.1      ross }
   2762  1.1      ross 
   2763  1.7   thorpej /*----------------------------------------------------------------------------
   2764  1.7   thorpej | Returns the result of adding the double-precision floating-point values `a'
   2765  1.7   thorpej | and `b'.  The operation is performed according to the IEC/IEEE Standard for
   2766  1.7   thorpej | Binary Floating-Point Arithmetic.
   2767  1.7   thorpej *----------------------------------------------------------------------------*/
   2768  1.7   thorpej 
   2769  1.1      ross float64 float64_add( float64 a, float64 b )
   2770  1.1      ross {
   2771  1.1      ross     flag aSign, bSign;
   2772  1.1      ross 
   2773  1.1      ross     aSign = extractFloat64Sign( a );
   2774  1.1      ross     bSign = extractFloat64Sign( b );
   2775  1.1      ross     if ( aSign == bSign ) {
   2776  1.1      ross         return addFloat64Sigs( a, b, aSign );
   2777  1.1      ross     }
   2778  1.1      ross     else {
   2779  1.1      ross         return subFloat64Sigs( a, b, aSign );
   2780  1.1      ross     }
   2781  1.1      ross 
   2782  1.1      ross }
   2783  1.1      ross 
   2784  1.7   thorpej /*----------------------------------------------------------------------------
   2785  1.7   thorpej | Returns the result of subtracting the double-precision floating-point values
   2786  1.7   thorpej | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   2787  1.7   thorpej | for Binary Floating-Point Arithmetic.
   2788  1.7   thorpej *----------------------------------------------------------------------------*/
   2789  1.7   thorpej 
   2790  1.1      ross float64 float64_sub( float64 a, float64 b )
   2791  1.1      ross {
   2792  1.1      ross     flag aSign, bSign;
   2793  1.1      ross 
   2794  1.1      ross     aSign = extractFloat64Sign( a );
   2795  1.1      ross     bSign = extractFloat64Sign( b );
   2796  1.1      ross     if ( aSign == bSign ) {
   2797  1.1      ross         return subFloat64Sigs( a, b, aSign );
   2798  1.1      ross     }
   2799  1.1      ross     else {
   2800  1.1      ross         return addFloat64Sigs( a, b, aSign );
   2801  1.1      ross     }
   2802  1.1      ross 
   2803  1.1      ross }
   2804  1.1      ross 
   2805  1.7   thorpej /*----------------------------------------------------------------------------
   2806  1.7   thorpej | Returns the result of multiplying the double-precision floating-point values
   2807  1.7   thorpej | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   2808  1.7   thorpej | for Binary Floating-Point Arithmetic.
   2809  1.7   thorpej *----------------------------------------------------------------------------*/
   2810  1.7   thorpej 
   2811  1.1      ross float64 float64_mul( float64 a, float64 b )
   2812  1.1      ross {
   2813  1.1      ross     flag aSign, bSign, zSign;
   2814  1.1      ross     int16 aExp, bExp, zExp;
   2815  1.1      ross     bits64 aSig, bSig, zSig0, zSig1;
   2816  1.1      ross 
   2817  1.1      ross     aSig = extractFloat64Frac( a );
   2818  1.1      ross     aExp = extractFloat64Exp( a );
   2819  1.1      ross     aSign = extractFloat64Sign( a );
   2820  1.1      ross     bSig = extractFloat64Frac( b );
   2821  1.1      ross     bExp = extractFloat64Exp( b );
   2822  1.1      ross     bSign = extractFloat64Sign( b );
   2823  1.1      ross     zSign = aSign ^ bSign;
   2824  1.1      ross     if ( aExp == 0x7FF ) {
   2825  1.1      ross         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
   2826  1.1      ross             return propagateFloat64NaN( a, b );
   2827  1.1      ross         }
   2828  1.1      ross         if ( ( bExp | bSig ) == 0 ) {
   2829  1.1      ross             float_raise( float_flag_invalid );
   2830  1.1      ross             return float64_default_nan;
   2831  1.1      ross         }
   2832  1.1      ross         return packFloat64( zSign, 0x7FF, 0 );
   2833  1.1      ross     }
   2834  1.1      ross     if ( bExp == 0x7FF ) {
   2835  1.1      ross         if ( bSig ) return propagateFloat64NaN( a, b );
   2836  1.1      ross         if ( ( aExp | aSig ) == 0 ) {
   2837  1.1      ross             float_raise( float_flag_invalid );
   2838  1.1      ross             return float64_default_nan;
   2839  1.1      ross         }
   2840  1.1      ross         return packFloat64( zSign, 0x7FF, 0 );
   2841  1.1      ross     }
   2842  1.1      ross     if ( aExp == 0 ) {
   2843  1.1      ross         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
   2844  1.1      ross         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   2845  1.1      ross     }
   2846  1.1      ross     if ( bExp == 0 ) {
   2847  1.1      ross         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
   2848  1.1      ross         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
   2849  1.1      ross     }
   2850  1.1      ross     zExp = aExp + bExp - 0x3FF;
   2851  1.1      ross     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
   2852  1.1      ross     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
   2853  1.1      ross     mul64To128( aSig, bSig, &zSig0, &zSig1 );
   2854  1.1      ross     zSig0 |= ( zSig1 != 0 );
   2855  1.1      ross     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
   2856  1.1      ross         zSig0 <<= 1;
   2857  1.1      ross         --zExp;
   2858  1.1      ross     }
   2859  1.1      ross     return roundAndPackFloat64( zSign, zExp, zSig0 );
   2860  1.1      ross 
   2861  1.1      ross }
   2862  1.1      ross 
   2863  1.7   thorpej /*----------------------------------------------------------------------------
   2864  1.7   thorpej | Returns the result of dividing the double-precision floating-point value `a'
   2865  1.7   thorpej | by the corresponding value `b'.  The operation is performed according to
   2866  1.7   thorpej | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2867  1.7   thorpej *----------------------------------------------------------------------------*/
   2868  1.7   thorpej 
   2869  1.1      ross float64 float64_div( float64 a, float64 b )
   2870  1.1      ross {
   2871  1.1      ross     flag aSign, bSign, zSign;
   2872  1.1      ross     int16 aExp, bExp, zExp;
   2873  1.1      ross     bits64 aSig, bSig, zSig;
   2874  1.1      ross     bits64 rem0, rem1;
   2875  1.1      ross     bits64 term0, term1;
   2876  1.1      ross 
   2877  1.1      ross     aSig = extractFloat64Frac( a );
   2878  1.1      ross     aExp = extractFloat64Exp( a );
   2879  1.1      ross     aSign = extractFloat64Sign( a );
   2880  1.1      ross     bSig = extractFloat64Frac( b );
   2881  1.1      ross     bExp = extractFloat64Exp( b );
   2882  1.1      ross     bSign = extractFloat64Sign( b );
   2883  1.1      ross     zSign = aSign ^ bSign;
   2884  1.1      ross     if ( aExp == 0x7FF ) {
   2885  1.1      ross         if ( aSig ) return propagateFloat64NaN( a, b );
   2886  1.1      ross         if ( bExp == 0x7FF ) {
   2887  1.1      ross             if ( bSig ) return propagateFloat64NaN( a, b );
   2888  1.1      ross             float_raise( float_flag_invalid );
   2889  1.1      ross             return float64_default_nan;
   2890  1.1      ross         }
   2891  1.1      ross         return packFloat64( zSign, 0x7FF, 0 );
   2892  1.1      ross     }
   2893  1.1      ross     if ( bExp == 0x7FF ) {
   2894  1.1      ross         if ( bSig ) return propagateFloat64NaN( a, b );
   2895  1.1      ross         return packFloat64( zSign, 0, 0 );
   2896  1.1      ross     }
   2897  1.1      ross     if ( bExp == 0 ) {
   2898  1.1      ross         if ( bSig == 0 ) {
   2899  1.1      ross             if ( ( aExp | aSig ) == 0 ) {
   2900  1.1      ross                 float_raise( float_flag_invalid );
   2901  1.1      ross                 return float64_default_nan;
   2902  1.1      ross             }
   2903  1.1      ross             float_raise( float_flag_divbyzero );
   2904  1.1      ross             return packFloat64( zSign, 0x7FF, 0 );
   2905  1.1      ross         }
   2906  1.1      ross         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
   2907  1.1      ross     }
   2908  1.1      ross     if ( aExp == 0 ) {
   2909  1.1      ross         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
   2910  1.1      ross         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   2911  1.1      ross     }
   2912  1.1      ross     zExp = aExp - bExp + 0x3FD;
   2913  1.1      ross     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
   2914  1.1      ross     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
   2915  1.1      ross     if ( bSig <= ( aSig + aSig ) ) {
   2916  1.1      ross         aSig >>= 1;
   2917  1.1      ross         ++zExp;
   2918  1.1      ross     }
   2919  1.1      ross     zSig = estimateDiv128To64( aSig, 0, bSig );
   2920  1.1      ross     if ( ( zSig & 0x1FF ) <= 2 ) {
   2921  1.1      ross         mul64To128( bSig, zSig, &term0, &term1 );
   2922  1.1      ross         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
   2923  1.1      ross         while ( (sbits64) rem0 < 0 ) {
   2924  1.1      ross             --zSig;
   2925  1.1      ross             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
   2926  1.1      ross         }
   2927  1.1      ross         zSig |= ( rem1 != 0 );
   2928  1.1      ross     }
   2929  1.1      ross     return roundAndPackFloat64( zSign, zExp, zSig );
   2930  1.1      ross 
   2931  1.1      ross }
   2932  1.1      ross 
   2933  1.1      ross #ifndef SOFTFLOAT_FOR_GCC
   2934  1.7   thorpej /*----------------------------------------------------------------------------
   2935  1.7   thorpej | Returns the remainder of the double-precision floating-point value `a'
   2936  1.7   thorpej | with respect to the corresponding value `b'.  The operation is performed
   2937  1.7   thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   2938  1.7   thorpej *----------------------------------------------------------------------------*/
   2939  1.7   thorpej 
   2940  1.1      ross float64 float64_rem( float64 a, float64 b )
   2941  1.1      ross {
   2942  1.5  christos     flag aSign, bSign __unused, zSign;
   2943  1.1      ross     int16 aExp, bExp, expDiff;
   2944  1.1      ross     bits64 aSig, bSig;
   2945  1.1      ross     bits64 q, alternateASig;
   2946  1.1      ross     sbits64 sigMean;
   2947  1.1      ross 
   2948  1.1      ross     aSig = extractFloat64Frac( a );
   2949  1.1      ross     aExp = extractFloat64Exp( a );
   2950  1.1      ross     aSign = extractFloat64Sign( a );
   2951  1.1      ross     bSig = extractFloat64Frac( b );
   2952  1.1      ross     bExp = extractFloat64Exp( b );
   2953  1.1      ross     bSign = extractFloat64Sign( b );
   2954  1.1      ross     if ( aExp == 0x7FF ) {
   2955  1.1      ross         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
   2956  1.1      ross             return propagateFloat64NaN( a, b );
   2957  1.1      ross         }
   2958  1.1      ross         float_raise( float_flag_invalid );
   2959  1.1      ross         return float64_default_nan;
   2960  1.1      ross     }
   2961  1.1      ross     if ( bExp == 0x7FF ) {
   2962  1.1      ross         if ( bSig ) return propagateFloat64NaN( a, b );
   2963  1.1      ross         return a;
   2964  1.1      ross     }
   2965  1.1      ross     if ( bExp == 0 ) {
   2966  1.1      ross         if ( bSig == 0 ) {
   2967  1.1      ross             float_raise( float_flag_invalid );
   2968  1.1      ross             return float64_default_nan;
   2969  1.1      ross         }
   2970  1.1      ross         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
   2971  1.1      ross     }
   2972  1.1      ross     if ( aExp == 0 ) {
   2973  1.1      ross         if ( aSig == 0 ) return a;
   2974  1.1      ross         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   2975  1.1      ross     }
   2976  1.1      ross     expDiff = aExp - bExp;
   2977  1.1      ross     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
   2978  1.1      ross     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
   2979  1.1      ross     if ( expDiff < 0 ) {
   2980  1.1      ross         if ( expDiff < -1 ) return a;
   2981  1.1      ross         aSig >>= 1;
   2982  1.1      ross     }
   2983  1.1      ross     q = ( bSig <= aSig );
   2984  1.1      ross     if ( q ) aSig -= bSig;
   2985  1.1      ross     expDiff -= 64;
   2986  1.1      ross     while ( 0 < expDiff ) {
   2987  1.1      ross         q = estimateDiv128To64( aSig, 0, bSig );
   2988  1.1      ross         q = ( 2 < q ) ? q - 2 : 0;
   2989  1.1      ross         aSig = - ( ( bSig>>2 ) * q );
   2990  1.1      ross         expDiff -= 62;
   2991  1.1      ross     }
   2992  1.1      ross     expDiff += 64;
   2993  1.1      ross     if ( 0 < expDiff ) {
   2994  1.1      ross         q = estimateDiv128To64( aSig, 0, bSig );
   2995  1.1      ross         q = ( 2 < q ) ? q - 2 : 0;
   2996  1.1      ross         q >>= 64 - expDiff;
   2997  1.1      ross         bSig >>= 2;
   2998  1.1      ross         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
   2999  1.1      ross     }
   3000  1.1      ross     else {
   3001  1.1      ross         aSig >>= 2;
   3002  1.1      ross         bSig >>= 2;
   3003  1.1      ross     }
   3004  1.1      ross     do {
   3005  1.1      ross         alternateASig = aSig;
   3006  1.1      ross         ++q;
   3007  1.1      ross         aSig -= bSig;
   3008  1.1      ross     } while ( 0 <= (sbits64) aSig );
   3009  1.1      ross     sigMean = aSig + alternateASig;
   3010  1.1      ross     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
   3011  1.1      ross         aSig = alternateASig;
   3012  1.1      ross     }
   3013  1.1      ross     zSign = ( (sbits64) aSig < 0 );
   3014  1.1      ross     if ( zSign ) aSig = - aSig;
   3015  1.1      ross     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
   3016  1.1      ross 
   3017  1.1      ross }
   3018  1.1      ross 
   3019  1.7   thorpej /*----------------------------------------------------------------------------
   3020  1.7   thorpej | Returns the square root of the double-precision floating-point value `a'.
   3021  1.7   thorpej | The operation is performed according to the IEC/IEEE Standard for Binary
   3022  1.7   thorpej | Floating-Point Arithmetic.
   3023  1.7   thorpej *----------------------------------------------------------------------------*/
   3024  1.7   thorpej 
   3025  1.1      ross float64 float64_sqrt( float64 a )
   3026  1.1      ross {
   3027  1.1      ross     flag aSign;
   3028  1.1      ross     int16 aExp, zExp;
   3029  1.1      ross     bits64 aSig, zSig, doubleZSig;
   3030  1.1      ross     bits64 rem0, rem1, term0, term1;
   3031  1.1      ross 
   3032  1.1      ross     aSig = extractFloat64Frac( a );
   3033  1.1      ross     aExp = extractFloat64Exp( a );
   3034  1.1      ross     aSign = extractFloat64Sign( a );
   3035  1.1      ross     if ( aExp == 0x7FF ) {
   3036  1.1      ross         if ( aSig ) return propagateFloat64NaN( a, a );
   3037  1.1      ross         if ( ! aSign ) return a;
   3038  1.1      ross         float_raise( float_flag_invalid );
   3039  1.1      ross         return float64_default_nan;
   3040  1.1      ross     }
   3041  1.1      ross     if ( aSign ) {
   3042  1.1      ross         if ( ( aExp | aSig ) == 0 ) return a;
   3043  1.1      ross         float_raise( float_flag_invalid );
   3044  1.1      ross         return float64_default_nan;
   3045  1.1      ross     }
   3046  1.1      ross     if ( aExp == 0 ) {
   3047  1.1      ross         if ( aSig == 0 ) return 0;
   3048  1.1      ross         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   3049  1.1      ross     }
   3050  1.1      ross     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
   3051  1.1      ross     aSig |= LIT64( 0x0010000000000000 );
   3052  1.1      ross     zSig = estimateSqrt32( aExp, aSig>>21 );
   3053  1.1      ross     aSig <<= 9 - ( aExp & 1 );
   3054  1.1      ross     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
   3055  1.1      ross     if ( ( zSig & 0x1FF ) <= 5 ) {
   3056  1.1      ross         doubleZSig = zSig<<1;
   3057  1.1      ross         mul64To128( zSig, zSig, &term0, &term1 );
   3058  1.1      ross         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
   3059  1.1      ross         while ( (sbits64) rem0 < 0 ) {
   3060  1.1      ross             --zSig;
   3061  1.1      ross             doubleZSig -= 2;
   3062  1.1      ross             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
   3063  1.1      ross         }
   3064  1.1      ross         zSig |= ( ( rem0 | rem1 ) != 0 );
   3065  1.1      ross     }
   3066  1.1      ross     return roundAndPackFloat64( 0, zExp, zSig );
   3067  1.1      ross 
   3068  1.1      ross }
   3069  1.1      ross #endif
   3070  1.1      ross 
   3071  1.7   thorpej /*----------------------------------------------------------------------------
   3072  1.7   thorpej | Returns 1 if the double-precision floating-point value `a' is equal to the
   3073  1.7   thorpej | corresponding value `b', and 0 otherwise.  The comparison is performed
   3074  1.7   thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3075  1.7   thorpej *----------------------------------------------------------------------------*/
   3076  1.7   thorpej 
   3077  1.1      ross flag float64_eq( float64 a, float64 b )
   3078  1.1      ross {
   3079  1.1      ross 
   3080  1.1      ross     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3081  1.1      ross          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3082  1.1      ross        ) {
   3083  1.1      ross         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
   3084  1.1      ross             float_raise( float_flag_invalid );
   3085  1.1      ross         }
   3086  1.1      ross         return 0;
   3087  1.1      ross     }
   3088  1.1      ross     return ( a == b ) ||
   3089  1.1      ross 	( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
   3090  1.1      ross 
   3091  1.1      ross }
   3092  1.1      ross 
   3093  1.7   thorpej /*----------------------------------------------------------------------------
   3094  1.7   thorpej | Returns 1 if the double-precision floating-point value `a' is less than or
   3095  1.7   thorpej | equal to the corresponding value `b', and 0 otherwise.  The comparison is
   3096  1.7   thorpej | performed according to the IEC/IEEE Standard for Binary Floating-Point
   3097  1.7   thorpej | Arithmetic.
   3098  1.7   thorpej *----------------------------------------------------------------------------*/
   3099  1.7   thorpej 
   3100  1.1      ross flag float64_le( float64 a, float64 b )
   3101  1.1      ross {
   3102  1.1      ross     flag aSign, bSign;
   3103  1.1      ross 
   3104  1.1      ross     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3105  1.1      ross          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3106  1.1      ross        ) {
   3107  1.1      ross         float_raise( float_flag_invalid );
   3108  1.1      ross         return 0;
   3109  1.1      ross     }
   3110  1.1      ross     aSign = extractFloat64Sign( a );
   3111  1.1      ross     bSign = extractFloat64Sign( b );
   3112  1.1      ross     if ( aSign != bSign )
   3113  1.1      ross 	return aSign ||
   3114  1.1      ross 	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
   3115  1.1      ross 	      0 );
   3116  1.1      ross     return ( a == b ) ||
   3117  1.1      ross 	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
   3118  1.1      ross 
   3119  1.1      ross }
   3120  1.1      ross 
   3121  1.7   thorpej /*----------------------------------------------------------------------------
   3122  1.7   thorpej | Returns 1 if the double-precision floating-point value `a' is less than
   3123  1.7   thorpej | the corresponding value `b', and 0 otherwise.  The comparison is performed
   3124  1.7   thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3125  1.7   thorpej *----------------------------------------------------------------------------*/
   3126  1.7   thorpej 
   3127  1.1      ross flag float64_lt( float64 a, float64 b )
   3128  1.1      ross {
   3129  1.1      ross     flag aSign, bSign;
   3130  1.1      ross 
   3131  1.1      ross     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3132  1.1      ross          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3133  1.1      ross        ) {
   3134  1.1      ross         float_raise( float_flag_invalid );
   3135  1.1      ross         return 0;
   3136  1.1      ross     }
   3137  1.1      ross     aSign = extractFloat64Sign( a );
   3138  1.1      ross     bSign = extractFloat64Sign( b );
   3139  1.1      ross     if ( aSign != bSign )
   3140  1.1      ross 	return aSign &&
   3141  1.1      ross 	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
   3142  1.1      ross 	      0 );
   3143  1.1      ross     return ( a != b ) &&
   3144  1.1      ross 	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
   3145  1.1      ross 
   3146  1.1      ross }
   3147  1.1      ross 
   3148  1.1      ross #ifndef SOFTFLOAT_FOR_GCC
   3149  1.7   thorpej /*----------------------------------------------------------------------------
   3150  1.7   thorpej | Returns 1 if the double-precision floating-point value `a' is equal to the
   3151  1.7   thorpej | corresponding value `b', and 0 otherwise.  The invalid exception is raised
   3152  1.7   thorpej | if either operand is a NaN.  Otherwise, the comparison is performed
   3153  1.7   thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3154  1.7   thorpej *----------------------------------------------------------------------------*/
   3155  1.7   thorpej 
   3156  1.1      ross flag float64_eq_signaling( float64 a, float64 b )
   3157  1.1      ross {
   3158  1.1      ross 
   3159  1.1      ross     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3160  1.1      ross          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3161  1.1      ross        ) {
   3162  1.1      ross         float_raise( float_flag_invalid );
   3163  1.1      ross         return 0;
   3164  1.1      ross     }
   3165  1.1      ross     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
   3166  1.1      ross 
   3167  1.1      ross }
   3168  1.1      ross 
   3169  1.7   thorpej /*----------------------------------------------------------------------------
   3170  1.7   thorpej | Returns 1 if the double-precision floating-point value `a' is less than or
   3171  1.7   thorpej | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
   3172  1.7   thorpej | cause an exception.  Otherwise, the comparison is performed according to the
   3173  1.7   thorpej | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3174  1.7   thorpej *----------------------------------------------------------------------------*/
   3175  1.7   thorpej 
   3176  1.1      ross flag float64_le_quiet( float64 a, float64 b )
   3177  1.1      ross {
   3178  1.1      ross     flag aSign, bSign;
   3179  1.1      ross 
   3180  1.1      ross     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3181  1.1      ross          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3182  1.1      ross        ) {
   3183  1.1      ross         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
   3184  1.1      ross             float_raise( float_flag_invalid );
   3185  1.1      ross         }
   3186  1.1      ross         return 0;
   3187  1.1      ross     }
   3188  1.1      ross     aSign = extractFloat64Sign( a );
   3189  1.1      ross     bSign = extractFloat64Sign( b );
   3190  1.1      ross     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
   3191  1.1      ross     return ( a == b ) || ( aSign ^ ( a < b ) );
   3192  1.1      ross 
   3193  1.1      ross }
   3194  1.1      ross 
   3195  1.7   thorpej /*----------------------------------------------------------------------------
   3196  1.7   thorpej | Returns 1 if the double-precision floating-point value `a' is less than
   3197  1.7   thorpej | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
   3198  1.7   thorpej | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
   3199  1.7   thorpej | Standard for Binary Floating-Point Arithmetic.
   3200  1.7   thorpej *----------------------------------------------------------------------------*/
   3201  1.7   thorpej 
   3202  1.1      ross flag float64_lt_quiet( float64 a, float64 b )
   3203  1.1      ross {
   3204  1.1      ross     flag aSign, bSign;
   3205  1.1      ross 
   3206  1.1      ross     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   3207  1.1      ross          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   3208  1.1      ross        ) {
   3209  1.1      ross         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
   3210  1.1      ross             float_raise( float_flag_invalid );
   3211  1.1      ross         }
   3212  1.1      ross         return 0;
   3213  1.1      ross     }
   3214  1.1      ross     aSign = extractFloat64Sign( a );
   3215  1.1      ross     bSign = extractFloat64Sign( b );
   3216  1.1      ross     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
   3217  1.1      ross     return ( a != b ) && ( aSign ^ ( a < b ) );
   3218  1.1      ross 
   3219  1.1      ross }
   3220  1.1      ross #endif
   3221  1.1      ross 
   3222  1.1      ross #ifdef FLOATX80
   3223  1.1      ross 
   3224  1.7   thorpej /*----------------------------------------------------------------------------
   3225  1.7   thorpej | Returns the result of converting the extended double-precision floating-
   3226  1.7   thorpej | point value `a' to the 32-bit two's complement integer format.  The
   3227  1.7   thorpej | conversion is performed according to the IEC/IEEE Standard for Binary
   3228  1.7   thorpej | Floating-Point Arithmetic---which means in particular that the conversion
   3229  1.7   thorpej | is rounded according to the current rounding mode.  If `a' is a NaN, the
   3230  1.7   thorpej | largest positive integer is returned.  Otherwise, if the conversion
   3231  1.7   thorpej | overflows, the largest integer with the same sign as `a' is returned.
   3232  1.7   thorpej *----------------------------------------------------------------------------*/
   3233  1.7   thorpej 
   3234  1.1      ross int32 floatx80_to_int32( floatx80 a )
   3235  1.1      ross {
   3236  1.1      ross     flag aSign;
   3237  1.1      ross     int32 aExp, shiftCount;
   3238  1.1      ross     bits64 aSig;
   3239  1.1      ross 
   3240  1.1      ross     aSig = extractFloatx80Frac( a );
   3241  1.1      ross     aExp = extractFloatx80Exp( a );
   3242  1.1      ross     aSign = extractFloatx80Sign( a );
   3243  1.1      ross     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
   3244  1.1      ross     shiftCount = 0x4037 - aExp;
   3245  1.1      ross     if ( shiftCount <= 0 ) shiftCount = 1;
   3246  1.1      ross     shift64RightJamming( aSig, shiftCount, &aSig );
   3247  1.1      ross     return roundAndPackInt32( aSign, aSig );
   3248  1.1      ross 
   3249  1.1      ross }
   3250  1.1      ross 
   3251  1.7   thorpej /*----------------------------------------------------------------------------
   3252  1.7   thorpej | Returns the result of converting the extended double-precision floating-
   3253  1.7   thorpej | point value `a' to the 32-bit two's complement integer format.  The
   3254  1.7   thorpej | conversion is performed according to the IEC/IEEE Standard for Binary
   3255  1.7   thorpej | Floating-Point Arithmetic, except that the conversion is always rounded
   3256  1.7   thorpej | toward zero.  If `a' is a NaN, the largest positive integer is returned.
   3257  1.7   thorpej | Otherwise, if the conversion overflows, the largest integer with the same
   3258  1.7   thorpej | sign as `a' is returned.
   3259  1.7   thorpej *----------------------------------------------------------------------------*/
   3260  1.7   thorpej 
   3261  1.1      ross int32 floatx80_to_int32_round_to_zero( floatx80 a )
   3262  1.1      ross {
   3263  1.1      ross     flag aSign;
   3264  1.1      ross     int32 aExp, shiftCount;
   3265  1.1      ross     bits64 aSig, savedASig;
   3266  1.1      ross     int32 z;
   3267  1.1      ross 
   3268  1.1      ross     aSig = extractFloatx80Frac( a );
   3269  1.1      ross     aExp = extractFloatx80Exp( a );
   3270  1.1      ross     aSign = extractFloatx80Sign( a );
   3271  1.1      ross     if ( 0x401E < aExp ) {
   3272  1.1      ross         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
   3273  1.1      ross         goto invalid;
   3274  1.1      ross     }
   3275  1.1      ross     else if ( aExp < 0x3FFF ) {
   3276  1.1      ross         if ( aExp || aSig ) float_set_inexact();
   3277  1.1      ross         return 0;
   3278  1.1      ross     }
   3279  1.1      ross     shiftCount = 0x403E - aExp;
   3280  1.1      ross     savedASig = aSig;
   3281  1.1      ross     aSig >>= shiftCount;
   3282  1.1      ross     z = aSig;
   3283  1.1      ross     if ( aSign ) z = - z;
   3284  1.1      ross     if ( ( z < 0 ) ^ aSign ) {
   3285  1.1      ross  invalid:
   3286  1.1      ross         float_raise( float_flag_invalid );
   3287  1.1      ross         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
   3288  1.1      ross     }
   3289  1.1      ross     if ( ( aSig<<shiftCount ) != savedASig ) {
   3290  1.1      ross         float_set_inexact();
   3291  1.1      ross     }
   3292  1.1      ross     return z;
   3293  1.1      ross 
   3294  1.1      ross }
   3295  1.1      ross 
   3296  1.7   thorpej /*----------------------------------------------------------------------------
   3297  1.7   thorpej | Returns the result of converting the extended double-precision floating-
   3298  1.7   thorpej | point value `a' to the 64-bit two's complement integer format.  The
   3299  1.7   thorpej | conversion is performed according to the IEC/IEEE Standard for Binary
   3300  1.7   thorpej | Floating-Point Arithmetic---which means in particular that the conversion
   3301  1.7   thorpej | is rounded according to the current rounding mode.  If `a' is a NaN,
   3302  1.7   thorpej | the largest positive integer is returned.  Otherwise, if the conversion
   3303  1.7   thorpej | overflows, the largest integer with the same sign as `a' is returned.
   3304  1.7   thorpej *----------------------------------------------------------------------------*/
   3305  1.7   thorpej 
   3306  1.1      ross int64 floatx80_to_int64( floatx80 a )
   3307  1.1      ross {
                           /* Converts a to a signed 64-bit integer.  The significand is shifted
                            * right by (0x403E - aExp) places and the final value is produced by
                            * roundAndPackInt64() from the sign, the shifted significand and the
                            * shifted-out bits held in aSigExtra. */
   3308  1.1      ross     flag aSign;
   3309  1.1      ross     int32 aExp, shiftCount;
   3310  1.1      ross     bits64 aSig, aSigExtra;
   3311  1.1      ross 
   3312  1.1      ross     aSig = extractFloatx80Frac( a );
   3313  1.1      ross     aExp = extractFloatx80Exp( a );
   3314  1.1      ross     aSign = extractFloatx80Sign( a );
   3315  1.1      ross     shiftCount = 0x403E - aExp;
                           /* aExp >= 0x403E: no right shift can be applied. */
   3316  1.1      ross     if ( shiftCount <= 0 ) {
                               /* aExp > 0x403E: raise invalid and return a saturated value. */
   3317  1.1      ross         if ( shiftCount ) {
   3318  1.1      ross             float_raise( float_flag_invalid );
                                   /* Positive operands, and operands with aExp == 0x7FFF whose
                                    * significand is not exactly 0x8000000000000000, give the
                                    * largest positive integer; everything else gives the most
                                    * negative integer. */
   3319  1.1      ross             if (    ! aSign
   3320  1.1      ross                  || (    ( aExp == 0x7FFF )
   3321  1.1      ross                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
   3322  1.1      ross                ) {
   3323  1.1      ross                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   3324  1.1      ross             }
   3325  1.1      ross             return (sbits64) LIT64( 0x8000000000000000 );
   3326  1.1      ross         }
                               /* aExp == 0x403E: the significand is used unshifted, so there
                                * are no shifted-out bits. */
   3327  1.1      ross         aSigExtra = 0;
   3328  1.1      ross     }
   3329  1.1      ross     else {
                               /* aExp < 0x403E: shift right by shiftCount; the helper returns the
                                * shifted value in aSig and the extra word in aSigExtra. */
   3330  1.1      ross         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
   3331  1.1      ross     }
   3332  1.1      ross     return roundAndPackInt64( aSign, aSig, aSigExtra );
   3333  1.1      ross 
   3334  1.1      ross }
   3335  1.1      ross 
   3336  1.7   thorpej /*----------------------------------------------------------------------------
   3337  1.7   thorpej | Returns the result of converting the extended double-precision floating-
   3338  1.7   thorpej | point value `a' to the 64-bit two's complement integer format.  The
   3339  1.7   thorpej | conversion is performed according to the IEC/IEEE Standard for Binary
   3340  1.7   thorpej | Floating-Point Arithmetic, except that the conversion is always rounded
   3341  1.7   thorpej | toward zero.  If `a' is a NaN, the largest positive integer is returned.
   3342  1.7   thorpej | Otherwise, if the conversion overflows, the largest integer with the same
   3343  1.7   thorpej | sign as `a' is returned.
   3344  1.7   thorpej *----------------------------------------------------------------------------*/
   3345  1.7   thorpej 
   3346  1.1      ross int64 floatx80_to_int64_round_to_zero( floatx80 a )
   3347  1.1      ross {
                           /* Converts a to a signed 64-bit integer by truncation: the significand
                            * is shifted right and the dropped bits only affect the inexact flag.
                            * The current rounding mode is not consulted in this function. */
   3348  1.1      ross     flag aSign;
   3349  1.1      ross     int32 aExp, shiftCount;
   3350  1.1      ross     bits64 aSig;
   3351  1.1      ross     int64 z;
   3352  1.1      ross 
   3353  1.1      ross     aSig = extractFloatx80Frac( a );
   3354  1.1      ross     aExp = extractFloatx80Exp( a );
   3355  1.1      ross     aSign = extractFloatx80Sign( a );
   3356  1.1      ross     shiftCount = aExp - 0x403E;
                           /* aExp >= 0x403E: the magnitude does not fit in 63 bits. */
   3357  1.1      ross     if ( 0 <= shiftCount ) {
                               /* Keep only the low 63 significand bits for the tests below. */
   3358  1.1      ross         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
                               /* Only a.high == 0xC03E with those 63 bits clear skips the invalid
                                * flag; presumably that encodes exactly -2^63 -- TODO confirm. */
   3359  1.1      ross         if ( ( a.high != 0xC03E ) || aSig ) {
   3360  1.1      ross             float_raise( float_flag_invalid );
                                   /* Positive operands, and operands with aExp == 0x7FFF and a
                                    * nonzero masked significand, give the largest positive
                                    * integer. */
   3361  1.1      ross             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
   3362  1.1      ross                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   3363  1.1      ross             }
   3364  1.1      ross         }
   3365  1.1      ross         return (sbits64) LIT64( 0x8000000000000000 );
   3366  1.1      ross     }
                           /* aExp < 0x3FFF: the result is 0; inexact is set unless both the
                            * exponent and the significand are zero. */
   3367  1.1      ross     else if ( aExp < 0x3FFF ) {
   3368  1.1      ross         if ( aExp | aSig ) float_set_inexact();
   3369  1.1      ross         return 0;
   3370  1.1      ross     }
                           /* Here shiftCount is in -63..-1, so -shiftCount is a valid shift. */
   3371  1.1      ross     z = aSig>>( - shiftCount );
                           /* (shiftCount & 63) equals 64 - (-shiftCount), so this left shift keeps
                            * exactly the bits the right shift above discarded. */
   3372  1.1      ross     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
   3373  1.1      ross         float_set_inexact();
   3374  1.1      ross     }
   3375  1.1      ross     if ( aSign ) z = - z;
   3376  1.1      ross     return z;
   3377  1.1      ross 
   3378  1.1      ross }
   3379  1.1      ross 
   3380  1.7   thorpej /*----------------------------------------------------------------------------
   3381  1.7   thorpej | Returns the result of converting the extended double-precision floating-
   3382  1.7   thorpej | point value `a' to the single-precision floating-point format.  The
   3383  1.7   thorpej | conversion is performed according to the IEC/IEEE Standard for Binary
   3384  1.7   thorpej | Floating-Point Arithmetic.
   3385  1.7   thorpej *----------------------------------------------------------------------------*/
   3386  1.7   thorpej 
   3387  1.1      ross float32 floatx80_to_float32( floatx80 a )
   3388  1.1      ross {
                           /* Narrows a to single precision.  Operands with aExp == 0x7FFF are
                            * handled directly; everything else is rebiased and passed to
                            * roundAndPackFloat32(). */
   3389  1.1      ross     flag aSign;
   3390  1.1      ross     int32 aExp;
   3391  1.1      ross     bits64 aSig;
   3392  1.1      ross 
   3393  1.1      ross     aSig = extractFloatx80Frac( a );
   3394  1.1      ross     aExp = extractFloatx80Exp( a );
   3395  1.1      ross     aSign = extractFloatx80Sign( a );
   3396  1.1      ross     if ( aExp == 0x7FFF ) {
                               /* Any bit set below bit 63 of the significand selects the NaN
                                * conversion path. */
   3397  1.1      ross         if ( (bits64) ( aSig<<1 ) ) {
   3398  1.1      ross             return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
   3399  1.1      ross         }
                               /* Otherwise pack the sign with exponent field 0xFF and a zero
                                * fraction. */
   3400  1.1      ross         return packFloat32( aSign, 0xFF, 0 );
   3401  1.1      ross     }
                           /* Shift the significand right by 33 places.  NOTE(review): the helper
                            * name suggests shifted-out bits are folded into the result; it is
                            * defined outside this section -- confirm. */
   3402  1.1      ross     shift64RightJamming( aSig, 33, &aSig );
                           /* Rebias the exponent by 0x3F81 unless both the exponent and the
                            * (already shifted) significand are zero. */
   3403  1.1      ross     if ( aExp || aSig ) aExp -= 0x3F81;
   3404  1.1      ross     return roundAndPackFloat32( aSign, aExp, aSig );
   3405  1.1      ross 
   3406  1.1      ross }
   3407  1.1      ross 
   3408  1.7   thorpej /*----------------------------------------------------------------------------
   3409  1.7   thorpej | Returns the result of converting the extended double-precision floating-
   3410  1.7   thorpej | point value `a' to the double-precision floating-point format.  The
   3411  1.7   thorpej | conversion is performed according to the IEC/IEEE Standard for Binary
   3412  1.7   thorpej | Floating-Point Arithmetic.
   3413  1.7   thorpej *----------------------------------------------------------------------------*/
   3414  1.7   thorpej 
   3415  1.1      ross float64 floatx80_to_float64( floatx80 a )
   3416  1.1      ross {
                           /* Narrows a to double precision.  Operands with aExp == 0x7FFF are
                            * handled directly; everything else is rebiased and passed to
                            * roundAndPackFloat64(). */
   3417  1.1      ross     flag aSign;
   3418  1.1      ross     int32 aExp;
   3419  1.1      ross     bits64 aSig, zSig;
   3420  1.1      ross 
   3421  1.1      ross     aSig = extractFloatx80Frac( a );
   3422  1.1      ross     aExp = extractFloatx80Exp( a );
   3423  1.1      ross     aSign = extractFloatx80Sign( a );
   3424  1.1      ross     if ( aExp == 0x7FFF ) {
                               /* Any bit set below bit 63 of the significand selects the NaN
                                * conversion path. */
   3425  1.1      ross         if ( (bits64) ( aSig<<1 ) ) {
   3426  1.1      ross             return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
   3427  1.1      ross         }
                               /* Otherwise pack the sign with exponent field 0x7FF and a zero
                                * fraction. */
   3428  1.1      ross         return packFloat64( aSign, 0x7FF, 0 );
   3429  1.1      ross     }
                           /* Shift the significand right by one place into zSig; aSig itself is
                            * left untouched and is what the zero test below examines. */
   3430  1.1      ross     shift64RightJamming( aSig, 1, &zSig );
                           /* Rebias the exponent by 0x3C01 unless the operand is all zero. */
   3431  1.1      ross     if ( aExp || aSig ) aExp -= 0x3C01;
   3432  1.1      ross     return roundAndPackFloat64( aSign, aExp, zSig );
   3433  1.1      ross 
   3434  1.1      ross }
   3435  1.1      ross 
   3436  1.1      ross #ifdef FLOAT128
   3437  1.1      ross 
   3438  1.7   thorpej /*----------------------------------------------------------------------------
   3439  1.7   thorpej | Returns the result of converting the extended double-precision floating-
   3440  1.7   thorpej | point value `a' to the quadruple-precision floating-point format.  The
   3441  1.7   thorpej | conversion is performed according to the IEC/IEEE Standard for Binary
   3442  1.7   thorpej | Floating-Point Arithmetic.
   3443  1.7   thorpej *----------------------------------------------------------------------------*/
   3444  1.7   thorpej 
   3445  1.1      ross float128 floatx80_to_float128( floatx80 a )
   3446  1.1      ross {
                           /* Widens a to quadruple precision.  No rounding routine is called:
                            * the result is packed directly from the sign, the unchanged
                            * exponent and the repositioned significand. */
   3447  1.1      ross     flag aSign;
   3448  1.1      ross     int16 aExp;
   3449  1.1      ross     bits64 aSig, zSig0, zSig1;
   3450  1.1      ross 
   3451  1.1      ross     aSig = extractFloatx80Frac( a );
   3452  1.1      ross     aExp = extractFloatx80Exp( a );
   3453  1.1      ross     aSign = extractFloatx80Sign( a );
                           /* aExp == 0x7FFF with any bit set below bit 63 takes the NaN
                            * conversion path. */
   3454  1.1      ross     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
   3455  1.1      ross         return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
   3456  1.1      ross     }
                           /* aSig<<1 drops bit 63 of the significand; the remaining bits are
                            * then moved right by 16 places across the 128-bit pair
                            * zSig0:zSig1. */
   3457  1.1      ross     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
   3458  1.1      ross     return packFloat128( aSign, aExp, zSig0, zSig1 );
   3459  1.1      ross 
   3460  1.1      ross }
   3461  1.1      ross 
   3462  1.1      ross #endif
   3463  1.1      ross 
   3464  1.7   thorpej /*----------------------------------------------------------------------------
   3465  1.7   thorpej | Rounds the extended double-precision floating-point value `a' to an integer,
   3466  1.7   thorpej | and returns the result as an extended double-precision floating-point
   3467  1.7   thorpej | value.  The operation is performed according to the IEC/IEEE Standard for
   3468  1.7   thorpej | Binary Floating-Point Arithmetic.
   3469  1.7   thorpej *----------------------------------------------------------------------------*/
   3470  1.7   thorpej 
   3471  1.1      ross floatx80 floatx80_round_to_int( floatx80 a )
   3472  1.1      ross {
                           /* Rounds a to an integral value and returns it as a floatx80.  Three
                            * ranges are handled separately: aExp >= 0x403E (returned unchanged,
                            * NaNs propagated), aExp < 0x3FFF (result chosen directly from the
                            * rounding mode), and the range in between (low significand bits are
                            * rounded away with masks). */
   3473  1.1      ross     flag aSign;
   3474  1.1      ross     int32 aExp;
   3475  1.1      ross     bits64 lastBitMask, roundBitsMask;
   3476  1.1      ross     int8 roundingMode;
   3477  1.1      ross     floatx80 z;
   3478  1.1      ross 
   3479  1.1      ross     aExp = extractFloatx80Exp( a );
   3480  1.1      ross     if ( 0x403E <= aExp ) {
   3481  1.1      ross         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
   3482  1.1      ross             return propagateFloatx80NaN( a, a );
   3483  1.1      ross         }
   3484  1.1      ross         return a;
   3485  1.1      ross     }
   3486  1.1      ross     if ( aExp < 0x3FFF ) {
                               /* Exponent 0 with no significand bits below bit 63: returned
                                * unchanged and without setting inexact. */
   3487  1.1      ross         if (    ( aExp == 0 )
   3488  1.1      ross              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
   3489  1.1      ross             return a;
   3490  1.1      ross         }
   3491  1.1      ross         float_set_inexact();
   3492  1.1      ross         aSign = extractFloatx80Sign( a );
                               /* Each case returns either a zero or the value packed from
                                * exponent 0x3FFF and significand 0x8000000000000000, with the
                                * appropriate sign. */
   3493  1.1      ross         switch ( float_rounding_mode() ) {
   3494  1.1      ross          case float_round_nearest_even:
                                   /* Only aExp == 0x3FFE with a nonzero fraction rounds away
                                    * from zero; all other inputs fall out to the signed zero
                                    * after the switch. */
   3495  1.1      ross             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
   3496  1.1      ross                ) {
   3497  1.1      ross                 return
   3498  1.1      ross                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
   3499  1.1      ross             }
   3500  1.1      ross             break;
   3501  1.1      ross          case float_round_down:
   3502  1.1      ross             return
   3503  1.1      ross                   aSign ?
   3504  1.1      ross                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
   3505  1.1      ross                 : packFloatx80( 0, 0, 0 );
   3506  1.1      ross          case float_round_up:
   3507  1.1      ross             return
   3508  1.1      ross                   aSign ? packFloatx80( 1, 0, 0 )
   3509  1.1      ross                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
   3510  1.1      ross         }
                               /* Reached for nearest-even (via break) and for any rounding mode
                                * without a case above. */
   3511  1.1      ross         return packFloatx80( aSign, 0, 0 );
   3512  1.1      ross     }
                           /* 0x3FFF <= aExp < 0x403E: lastBitMask selects the lowest significand
                            * bit that is kept; roundBitsMask covers every bit below it. */
   3513  1.1      ross     lastBitMask = 1;
   3514  1.1      ross     lastBitMask <<= 0x403E - aExp;
   3515  1.1      ross     roundBitsMask = lastBitMask - 1;
   3516  1.1      ross     z = a;
   3517  1.1      ross     roundingMode = float_rounding_mode();
   3518  1.1      ross     if ( roundingMode == float_round_nearest_even ) {
                               /* Add half of the last kept bit; if the discarded bits are then
                                * all zero the input was an exact tie, so clear the last kept
                                * bit to make the result even. */
   3519  1.1      ross         z.low += lastBitMask>>1;
   3520  1.1      ross         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
   3521  1.1      ross     }
   3522  1.1      ross     else if ( roundingMode != float_round_to_zero ) {
                               /* Directed rounding: bump the magnitude when the sign bit and
                                * (mode == round_up) differ. */
   3523  1.1      ross         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
   3524  1.1      ross             z.low += roundBitsMask;
   3525  1.1      ross         }
   3526  1.1      ross     }
   3527  1.1      ross     z.low &= ~ roundBitsMask;
                           /* A zero significand here means the additions above wrapped; bump
                            * the high word and reset the significand to its top bit. */
   3528  1.1      ross     if ( z.low == 0 ) {
   3529  1.1      ross         ++z.high;
   3530  1.1      ross         z.low = LIT64( 0x8000000000000000 );
   3531  1.1      ross     }
   3532  1.1      ross     if ( z.low != a.low ) float_set_inexact();
   3533  1.1      ross     return z;
   3534  1.1      ross 
   3535  1.1      ross }
   3536  1.1      ross 
   3537  1.7   thorpej /*----------------------------------------------------------------------------
   3538  1.7   thorpej | Returns the result of adding the absolute values of the extended double-
   3539  1.7   thorpej | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
   3540  1.7   thorpej | negated before being returned.  `zSign' is ignored if the result is a NaN.
   3541  1.7   thorpej | The addition is performed according to the IEC/IEEE Standard for Binary
   3542  1.7   thorpej | Floating-Point Arithmetic.
   3543  1.7   thorpej *----------------------------------------------------------------------------*/
   3544  1.7   thorpej 
   3545  1.1      ross static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
   3546  1.1      ross {
                           /* Adds the magnitudes of a and b and gives the result the sign zSign.
                            * The operand with the smaller exponent is shifted right to align it,
                            * the significands are summed, and the sum is handed to
                            * roundAndPackFloatx80() with floatx80_rounding_precision. */
   3547  1.1      ross     int32 aExp, bExp, zExp;
   3548  1.1      ross     bits64 aSig, bSig, zSig0, zSig1;
   3549  1.1      ross     int32 expDiff;
   3550  1.1      ross 
   3551  1.1      ross     aSig = extractFloatx80Frac( a );
   3552  1.1      ross     aExp = extractFloatx80Exp( a );
   3553  1.1      ross     bSig = extractFloatx80Frac( b );
   3554  1.1      ross     bExp = extractFloatx80Exp( b );
   3555  1.1      ross     expDiff = aExp - bExp;
                           /* Case 1: a has the larger exponent. */
   3556  1.1      ross     if ( 0 < expDiff ) {
   3557  1.1      ross         if ( aExp == 0x7FFF ) {
   3558  1.1      ross             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3559  1.1      ross             return a;
   3560  1.1      ross         }
                               /* b has exponent field 0: align with one place less.  Presumably
                                * such values are scaled as if the exponent were 1 -- TODO
                                * confirm. */
   3561  1.1      ross         if ( bExp == 0 ) --expDiff;
   3562  1.1      ross         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
   3563  1.1      ross         zExp = aExp;
   3564  1.1      ross     }
                           /* Case 2: b has the larger exponent (mirror image of case 1). */
   3565  1.1      ross     else if ( expDiff < 0 ) {
   3566  1.1      ross         if ( bExp == 0x7FFF ) {
   3567  1.1      ross             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
                                   /* The result is rebuilt with zSign rather than returning b. */
   3568  1.1      ross             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3569  1.1      ross         }
   3570  1.1      ross         if ( aExp == 0 ) ++expDiff;
   3571  1.1      ross         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
   3572  1.1      ross         zExp = bExp;
   3573  1.1      ross     }
                           /* Case 3: equal exponents, no alignment needed. */
   3574  1.1      ross     else {
   3575  1.1      ross         if ( aExp == 0x7FFF ) {
   3576  1.1      ross             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
   3577  1.1      ross                 return propagateFloatx80NaN( a, b );
   3578  1.1      ross             }
   3579  1.1      ross             return a;
   3580  1.1      ross         }
   3581  1.1      ross         zSig1 = 0;
   3582  1.1      ross         zSig0 = aSig + bSig;
                               /* Both exponent fields are 0: normalize the sum and round.
                                * NOTE(review): when both significands are also 0, zSig0 is 0
                                * here; confirm normalizeFloatx80Subnormal (defined elsewhere)
                                * tolerates a zero significand. */
   3583  1.1      ross         if ( aExp == 0 ) {
   3584  1.1      ross             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
   3585  1.1      ross             goto roundAndPack;
   3586  1.1      ross         }
   3587  1.1      ross         zExp = aExp;
   3588  1.1      ross         goto shiftRight1;
   3589  1.1      ross     }
                           /* Cases 1 and 2 join here.  If bit 63 of the sum is set it is used
                            * as is; otherwise control falls through to shiftRight1. */
   3590  1.1      ross     zSig0 = aSig + bSig;
   3591  1.1      ross     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
                           /* Shift the 128-bit sum right one place, force bit 63 on, and bump
                            * the exponent to compensate. */
   3592  1.1      ross  shiftRight1:
   3593  1.1      ross     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
   3594  1.1      ross     zSig0 |= LIT64( 0x8000000000000000 );
   3595  1.1      ross     ++zExp;
   3596  1.1      ross  roundAndPack:
   3597  1.1      ross     return
   3598  1.1      ross         roundAndPackFloatx80(
   3599  1.1      ross             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
   3600  1.1      ross 
   3601  1.1      ross }
   3602  1.1      ross 
   3603  1.7   thorpej /*----------------------------------------------------------------------------
   3604  1.7   thorpej | Returns the result of subtracting the absolute values of the extended
   3605  1.7   thorpej | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
   3606  1.7   thorpej | difference is negated before being returned.  `zSign' is ignored if the
   3607  1.7   thorpej | result is a NaN.  The subtraction is performed according to the IEC/IEEE
   3608  1.7   thorpej | Standard for Binary Floating-Point Arithmetic.
   3609  1.7   thorpej *----------------------------------------------------------------------------*/
   3610  1.7   thorpej 
   3611  1.1      ross static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
   3612  1.1      ross {
                           /* Subtracts the magnitude of b from the magnitude of a.  The smaller
                            * magnitude is always subtracted from the larger one; when b is the
                            * larger, zSign is flipped.  The difference is finished by
                            * normalizeRoundAndPackFloatx80(). */
   3613  1.1      ross     int32 aExp, bExp, zExp;
   3614  1.1      ross     bits64 aSig, bSig, zSig0, zSig1;
   3615  1.1      ross     int32 expDiff;
   3616  1.1      ross     floatx80 z;
   3617  1.1      ross 
   3618  1.1      ross     aSig = extractFloatx80Frac( a );
   3619  1.1      ross     aExp = extractFloatx80Exp( a );
   3620  1.1      ross     bSig = extractFloatx80Frac( b );
   3621  1.1      ross     bExp = extractFloatx80Exp( b );
   3622  1.1      ross     expDiff = aExp - bExp;
   3623  1.1      ross     if ( 0 < expDiff ) goto aExpBigger;
   3624  1.1      ross     if ( expDiff < 0 ) goto bExpBigger;
                           /* Equal exponents from here down to the bExpBigger label. */
   3625  1.1      ross     if ( aExp == 0x7FFF ) {
   3626  1.1      ross         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
   3627  1.1      ross             return propagateFloatx80NaN( a, b );
   3628  1.1      ross         }
                               /* Both exponents are 0x7FFF and neither operand is a NaN: raise
                                * invalid and return the default NaN. */
   3629  1.1      ross         float_raise( float_flag_invalid );
   3630  1.1      ross         z.low = floatx80_default_nan_low;
   3631  1.1      ross         z.high = floatx80_default_nan_high;
   3632  1.1      ross         return z;
   3633  1.1      ross     }
                           /* Exponent field 0 is treated as exponent 1 for the result. */
   3634  1.1      ross     if ( aExp == 0 ) {
   3635  1.1      ross         aExp = 1;
   3636  1.1      ross         bExp = 1;
   3637  1.1      ross     }
   3638  1.1      ross     zSig1 = 0;
   3639  1.1      ross     if ( bSig < aSig ) goto aBigger;
   3640  1.1      ross     if ( aSig < bSig ) goto bBigger;
                           /* Equal magnitudes: the result is zero, with the sign bit set only
                            * when the rounding mode is float_round_down. */
   3641  1.1      ross     return packFloatx80( float_rounding_mode() == float_round_down, 0, 0 );
   3642  1.1      ross  bExpBigger:
   3643  1.1      ross     if ( bExp == 0x7FFF ) {
   3644  1.1      ross         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
                               /* b dominates, so the result takes the opposite of zSign. */
   3645  1.1      ross         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3646  1.1      ross     }
                           /* a has exponent field 0: align with one place less. */
   3647  1.1      ross     if ( aExp == 0 ) ++expDiff;
   3648  1.1      ross     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
   3649  1.1      ross  bBigger:
                           /* 128-bit subtract b - a; zSig1 holds the low word of a on entry. */
   3650  1.1      ross     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
   3651  1.1      ross     zExp = bExp;
   3652  1.1      ross     zSign ^= 1;
   3653  1.1      ross     goto normalizeRoundAndPack;
   3654  1.1      ross  aExpBigger:
   3655  1.1      ross     if ( aExp == 0x7FFF ) {
   3656  1.1      ross         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3657  1.1      ross         return a;
   3658  1.1      ross     }
                           /* b has exponent field 0: align with one place less. */
   3659  1.1      ross     if ( bExp == 0 ) --expDiff;
   3660  1.1      ross     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
   3661  1.1      ross  aBigger:
                           /* 128-bit subtract a - b; zSig1 holds the low word of b on entry. */
   3662  1.1      ross     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
   3663  1.1      ross     zExp = aExp;
   3664  1.1      ross  normalizeRoundAndPack:
   3665  1.1      ross     return
   3666  1.1      ross         normalizeRoundAndPackFloatx80(
   3667  1.1      ross             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
   3668  1.1      ross 
   3669  1.1      ross }
   3670  1.1      ross 
   3671  1.7   thorpej /*----------------------------------------------------------------------------
   3672  1.7   thorpej | Returns the result of adding the extended double-precision floating-point
   3673  1.7   thorpej | values `a' and `b'.  The operation is performed according to the IEC/IEEE
   3674  1.7   thorpej | Standard for Binary Floating-Point Arithmetic.
   3675  1.7   thorpej *----------------------------------------------------------------------------*/
   3676  1.7   thorpej 
   3677  1.1      ross floatx80 floatx80_add( floatx80 a, floatx80 b )
   3678  1.1      ross {
                           /* Dispatches on the operand signs: equal signs add magnitudes,
                            * differing signs subtract magnitudes.  The sign of a is passed as
                            * zSign in both cases. */
   3679  1.1      ross     flag aSign, bSign;
   3680  1.1      ross 
   3681  1.1      ross     aSign = extractFloatx80Sign( a );
   3682  1.1      ross     bSign = extractFloatx80Sign( b );
   3683  1.1      ross     if ( aSign == bSign ) {
   3684  1.1      ross         return addFloatx80Sigs( a, b, aSign );
   3685  1.1      ross     }
   3686  1.1      ross     else {
   3687  1.1      ross         return subFloatx80Sigs( a, b, aSign );
   3688  1.1      ross     }
   3689  1.1      ross 
   3690  1.1      ross }
   3691  1.1      ross 
   3692  1.7   thorpej /*----------------------------------------------------------------------------
   3693  1.7   thorpej | Returns the result of subtracting the extended double-precision floating-
   3694  1.7   thorpej | point values `a' and `b'.  The operation is performed according to the
   3695  1.7   thorpej | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3696  1.7   thorpej *----------------------------------------------------------------------------*/
   3697  1.7   thorpej 
   3698  1.1      ross floatx80 floatx80_sub( floatx80 a, floatx80 b )
   3699  1.1      ross {
                           /* Dispatches on the operand signs, opposite to floatx80_add: equal
                            * signs subtract magnitudes, differing signs add magnitudes.  The
                            * sign of a is passed as zSign in both cases. */
   3700  1.1      ross     flag aSign, bSign;
   3701  1.1      ross 
   3702  1.1      ross     aSign = extractFloatx80Sign( a );
   3703  1.1      ross     bSign = extractFloatx80Sign( b );
   3704  1.1      ross     if ( aSign == bSign ) {
   3705  1.1      ross         return subFloatx80Sigs( a, b, aSign );
   3706  1.1      ross     }
   3707  1.1      ross     else {
   3708  1.1      ross         return addFloatx80Sigs( a, b, aSign );
   3709  1.1      ross     }
   3710  1.1      ross 
   3711  1.1      ross }
   3712  1.1      ross 
   3713  1.7   thorpej /*----------------------------------------------------------------------------
   3714  1.7   thorpej | Returns the result of multiplying the extended double-precision floating-
   3715  1.7   thorpej | point values `a' and `b'.  The operation is performed according to the
   3716  1.7   thorpej | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3717  1.7   thorpej *----------------------------------------------------------------------------*/
   3718  1.7   thorpej 
   3719  1.1      ross floatx80 floatx80_mul( floatx80 a, floatx80 b )
   3720  1.1      ross {
                           /* Multiplies a by b.  Operands with exponent 0x7FFF and zero operands
                            * are handled first; otherwise the significands are multiplied into
                            * a 128-bit product that is rounded by roundAndPackFloatx80(). */
   3721  1.1      ross     flag aSign, bSign, zSign;
   3722  1.1      ross     int32 aExp, bExp, zExp;
   3723  1.1      ross     bits64 aSig, bSig, zSig0, zSig1;
   3724  1.1      ross     floatx80 z;
   3725  1.1      ross 
   3726  1.1      ross     aSig = extractFloatx80Frac( a );
   3727  1.1      ross     aExp = extractFloatx80Exp( a );
   3728  1.1      ross     aSign = extractFloatx80Sign( a );
   3729  1.1      ross     bSig = extractFloatx80Frac( b );
   3730  1.1      ross     bExp = extractFloatx80Exp( b );
   3731  1.1      ross     bSign = extractFloatx80Sign( b );
                           /* The result sign is the XOR of the operand signs. */
   3732  1.1      ross     zSign = aSign ^ bSign;
   3733  1.1      ross     if ( aExp == 0x7FFF ) {
   3734  1.1      ross         if (    (bits64) ( aSig<<1 )
   3735  1.1      ross              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
   3736  1.1      ross             return propagateFloatx80NaN( a, b );
   3737  1.1      ross         }
                               /* a has exponent 0x7FFF and is not a NaN; an all-zero b makes the
                                * product invalid. */
   3738  1.1      ross         if ( ( bExp | bSig ) == 0 ) goto invalid;
   3739  1.1      ross         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3740  1.1      ross     }
   3741  1.1      ross     if ( bExp == 0x7FFF ) {
   3742  1.1      ross         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3743  1.1      ross         if ( ( aExp | aSig ) == 0 ) {
                                   /* Shared exit for both invalid cases: raise invalid and
                                    * return the default NaN. */
   3744  1.1      ross  invalid:
   3745  1.1      ross             float_raise( float_flag_invalid );
   3746  1.1      ross             z.low = floatx80_default_nan_low;
   3747  1.1      ross             z.high = floatx80_default_nan_high;
   3748  1.1      ross             return z;
   3749  1.1      ross         }
   3750  1.1      ross         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3751  1.1      ross     }
                           /* Exponent field 0: a zero significand gives a signed zero result;
                            * otherwise the operand is normalized before multiplying. */
   3752  1.1      ross     if ( aExp == 0 ) {
   3753  1.1      ross         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
   3754  1.1      ross         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
   3755  1.1      ross     }
   3756  1.1      ross     if ( bExp == 0 ) {
   3757  1.1      ross         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
   3758  1.1      ross         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
   3759  1.1      ross     }
   3760  1.1      ross     zExp = aExp + bExp - 0x3FFE;
   3761  1.1      ross     mul64To128( aSig, bSig, &zSig0, &zSig1 );
                           /* If bit 63 of the high product word is clear (and the word is
                            * nonzero), shift the product left one place and lower the exponent
                            * to match. */
   3762  1.1      ross     if ( 0 < (sbits64) zSig0 ) {
   3763  1.1      ross         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
   3764  1.1      ross         --zExp;
   3765  1.1      ross     }
   3766  1.1      ross     return
   3767  1.1      ross         roundAndPackFloatx80(
   3768  1.1      ross             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
   3769  1.1      ross 
   3770  1.1      ross }
   3771  1.1      ross 
   3772  1.7   thorpej /*----------------------------------------------------------------------------
   3773  1.7   thorpej | Returns the result of dividing the extended double-precision floating-point
   3774  1.7   thorpej | value `a' by the corresponding value `b'.  The operation is performed
   3775  1.7   thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3776  1.7   thorpej *----------------------------------------------------------------------------*/
   3777  1.7   thorpej 
   3778  1.1      ross floatx80 floatx80_div( floatx80 a, floatx80 b )
   3779  1.1      ross {
                           /* Divides a by b.  Operands with exponent 0x7FFF and zero operands
                            * are handled first; otherwise a two-word quotient zSig0:zSig1 is
                            * built from estimates that are corrected against the remainder,
                            * then rounded by roundAndPackFloatx80(). */
   3780  1.1      ross     flag aSign, bSign, zSign;
   3781  1.1      ross     int32 aExp, bExp, zExp;
   3782  1.1      ross     bits64 aSig, bSig, zSig0, zSig1;
   3783  1.1      ross     bits64 rem0, rem1, rem2, term0, term1, term2;
   3784  1.1      ross     floatx80 z;
   3785  1.1      ross 
   3786  1.1      ross     aSig = extractFloatx80Frac( a );
   3787  1.1      ross     aExp = extractFloatx80Exp( a );
   3788  1.1      ross     aSign = extractFloatx80Sign( a );
   3789  1.1      ross     bSig = extractFloatx80Frac( b );
   3790  1.1      ross     bExp = extractFloatx80Exp( b );
   3791  1.1      ross     bSign = extractFloatx80Sign( b );
                           /* The result sign is the XOR of the operand signs. */
   3792  1.1      ross     zSign = aSign ^ bSign;
   3793  1.1      ross     if ( aExp == 0x7FFF ) {
   3794  1.1      ross         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3795  1.1      ross         if ( bExp == 0x7FFF ) {
   3796  1.1      ross             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
                                   /* Both exponents are 0x7FFF and neither operand is a NaN. */
   3797  1.1      ross             goto invalid;
   3798  1.1      ross         }
   3799  1.1      ross         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3800  1.1      ross     }
   3801  1.1      ross     if ( bExp == 0x7FFF ) {
   3802  1.1      ross         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
                               /* Divisor has exponent 0x7FFF and is not a NaN: signed zero. */
   3803  1.1      ross         return packFloatx80( zSign, 0, 0 );
   3804  1.1      ross     }
   3805  1.1      ross     if ( bExp == 0 ) {
   3806  1.1      ross         if ( bSig == 0 ) {
                                   /* Zero divisor: an all-zero dividend is invalid, anything
                                    * else raises divide-by-zero. */
   3807  1.1      ross             if ( ( aExp | aSig ) == 0 ) {
   3808  1.1      ross  invalid:
   3809  1.1      ross                 float_raise( float_flag_invalid );
   3810  1.1      ross                 z.low = floatx80_default_nan_low;
   3811  1.1      ross                 z.high = floatx80_default_nan_high;
   3812  1.1      ross                 return z;
   3813  1.1      ross             }
   3814  1.1      ross             float_raise( float_flag_divbyzero );
   3815  1.1      ross             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3816  1.1      ross         }
   3817  1.1      ross         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
   3818  1.1      ross     }
   3819  1.1      ross     if ( aExp == 0 ) {
   3820  1.1      ross         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
   3821  1.1      ross         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
   3822  1.1      ross     }
   3823  1.1      ross     zExp = aExp - bExp + 0x3FFE;
   3824  1.1      ross     rem1 = 0;
                           /* When the dividend significand is not smaller than the divisor,
                            * halve it (low bit kept in rem1) and bump the exponent.
                            * Presumably this keeps the quotient within 64 bits -- TODO
                            * confirm. */
   3825  1.1      ross     if ( bSig <= aSig ) {
   3826  1.1      ross         shift128Right( aSig, 0, 1, &aSig, &rem1 );
   3827  1.1      ross         ++zExp;
   3828  1.1      ross     }
                           /* High quotient word: estimate, compute the remainder, then step
                            * the estimate down while the remainder is negative. */
   3829  1.1      ross     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
   3830  1.1      ross     mul64To128( bSig, zSig0, &term0, &term1 );
   3831  1.1      ross     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
   3832  1.1      ross     while ( (sbits64) rem0 < 0 ) {
   3833  1.1      ross         --zSig0;
   3834  1.1      ross         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
   3835  1.1      ross     }
                           /* Low quotient word, estimated from the remaining remainder. */
   3836  1.1      ross     zSig1 = estimateDiv128To64( rem1, 0, bSig );
                           /* The estimate is only corrected when (zSig1<<1) <= 8; in that case
                            * the low bit is also set if any remainder is left. */
   3837  1.1      ross     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
   3838  1.1      ross         mul64To128( bSig, zSig1, &term1, &term2 );
   3839  1.1      ross         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
   3840  1.1      ross         while ( (sbits64) rem1 < 0 ) {
   3841  1.1      ross             --zSig1;
   3842  1.1      ross             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
   3843  1.1      ross         }
   3844  1.1      ross         zSig1 |= ( ( rem1 | rem2 ) != 0 );
   3845  1.1      ross     }
   3846  1.1      ross     return
   3847  1.1      ross         roundAndPackFloatx80(
   3848  1.1      ross             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
   3849  1.1      ross 
   3850  1.1      ross }
   3851  1.1      ross 
   3852  1.7   thorpej /*----------------------------------------------------------------------------
   3853  1.7   thorpej | Returns the remainder of the extended double-precision floating-point value
   3854  1.7   thorpej | `a' with respect to the corresponding value `b'.  The operation is performed
   3855  1.7   thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   3856  1.7   thorpej *----------------------------------------------------------------------------*/
   3857  1.7   thorpej 
   3858  1.1      ross floatx80 floatx80_rem( floatx80 a, floatx80 b )
   3859  1.1      ross {
   3860  1.1      ross     flag aSign, bSign, zSign;
   3861  1.1      ross     int32 aExp, bExp, expDiff;
   3862  1.1      ross     bits64 aSig0, aSig1, bSig;
   3863  1.1      ross     bits64 q, term0, term1, alternateASig0, alternateASig1;
   3864  1.1      ross     floatx80 z;
   3865  1.1      ross 
                           /* Split both operands into significand, exponent and sign.  bSign is
                            * assigned below but never read again in this function. */
   3866  1.1      ross     aSig0 = extractFloatx80Frac( a );
   3867  1.1      ross     aExp = extractFloatx80Exp( a );
   3868  1.1      ross     aSign = extractFloatx80Sign( a );
   3869  1.1      ross     bSig = extractFloatx80Frac( b );
   3870  1.1      ross     bExp = extractFloatx80Exp( b );
   3871  1.1      ross     bSign = extractFloatx80Sign( b );
                           /* a has the maximum exponent.  If a has any significand bit set below
                            * the top one, or b is a NaN by the same test, propagate the NaN;
                            * otherwise the operation is invalid. */
   3872  1.1      ross     if ( aExp == 0x7FFF ) {
   3873  1.1      ross         if (    (bits64) ( aSig0<<1 )
   3874  1.1      ross              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
   3875  1.1      ross             return propagateFloatx80NaN( a, b );
   3876  1.1      ross         }
   3877  1.1      ross         goto invalid;
   3878  1.1      ross     }
                           /* b has the maximum exponent: propagate if it is a NaN by the same
                            * test, otherwise return a unchanged. */
   3879  1.1      ross     if ( bExp == 0x7FFF ) {
   3880  1.1      ross         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3881  1.1      ross         return a;
   3882  1.1      ross     }
                           /* A zero b is invalid (the label is also the target of the goto above):
                            * raise float_flag_invalid and return the default NaN.  A nonzero b
                            * with a zero exponent field is normalized before use. */
   3883  1.1      ross     if ( bExp == 0 ) {
   3884  1.1      ross         if ( bSig == 0 ) {
   3885  1.1      ross  invalid:
   3886  1.1      ross             float_raise( float_flag_invalid );
   3887  1.1      ross             z.low = floatx80_default_nan_low;
   3888  1.1      ross             z.high = floatx80_default_nan_high;
   3889  1.1      ross             return z;
   3890  1.1      ross         }
   3891  1.1      ross         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
   3892  1.1      ross     }
                           /* With a zero exponent field, a is returned as is when every
                            * significand bit below the top one is clear; otherwise it is
                            * normalized. */
   3893  1.1      ross     if ( aExp == 0 ) {
   3894  1.1      ross         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
   3895  1.1      ross         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
   3896  1.1      ross     }
   3897  1.1      ross     bSig |= LIT64( 0x8000000000000000 );
                           /* The result keeps a's sign unless the alternate remainder is chosen
                            * at the end of the function. */
   3898  1.1      ross     zSign = aSign;
   3899  1.1      ross     expDiff = aExp - bExp;
   3900  1.1      ross     aSig1 = 0;
                           /* a's exponent is smaller than b's.  Two or more below: a is returned
                            * unchanged.  Exactly one below: shift a right by one bit into
                            * aSig0:aSig1 and continue with expDiff = 0. */
   3901  1.1      ross     if ( expDiff < 0 ) {
   3902  1.1      ross         if ( expDiff < -1 ) return a;
   3903  1.1      ross         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
   3904  1.1      ross         expDiff = 0;
   3905  1.1      ross     }
                           /* First quotient bit: subtract bSig once if it is not larger than
                            * aSig0. */
   3906  1.1      ross     q = ( bSig <= aSig0 );
   3907  1.1      ross     if ( q ) aSig0 -= bSig;
                           /* Reduce in steps of 62 bits.  Each quotient estimate is lowered by 2
                            * (clamped at 0) before it is multiplied back and subtracted. */
   3908  1.1      ross     expDiff -= 64;
   3909  1.1      ross     while ( 0 < expDiff ) {
   3910  1.1      ross         q = estimateDiv128To64( aSig0, aSig1, bSig );
   3911  1.1      ross         q = ( 2 < q ) ? q - 2 : 0;
   3912  1.1      ross         mul64To128( bSig, q, &term0, &term1 );
   3913  1.1      ross         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
   3914  1.1      ross         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
   3915  1.1      ross         expDiff -= 62;
   3916  1.1      ross     }
                           /* Last, partial step: keep only the top expDiff bits of the lowered
                            * estimate, subtract, then step q up while the shifted divisor
                            * (term0:term1) still fits in the remainder. */
   3917  1.1      ross     expDiff += 64;
   3918  1.1      ross     if ( 0 < expDiff ) {
   3919  1.1      ross         q = estimateDiv128To64( aSig0, aSig1, bSig );
   3920  1.1      ross         q = ( 2 < q ) ? q - 2 : 0;
   3921  1.1      ross         q >>= 64 - expDiff;
   3922  1.1      ross         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
   3923  1.1      ross         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
   3924  1.1      ross         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
   3925  1.1      ross         while ( le128( term0, term1, aSig0, aSig1 ) ) {
   3926  1.1      ross             ++q;
   3927  1.1      ross             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
   3928  1.1      ross         }
   3929  1.1      ross     }
   3930  1.1      ross     else {
                               /* No partial step: the divisor used for the final comparison
                                * is bSig itself. */
   3931  1.1      ross         term1 = 0;
   3932  1.1      ross         term0 = bSig;
   3933  1.1      ross     }
                           /* alternate = divisor - remainder.  Use it, with the sign flipped,
                            * when it is smaller than the remainder, or when the two are equal
                            * and q is odd. */
   3934  1.1      ross     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
   3935  1.1      ross     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
   3936  1.1      ross          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
   3937  1.1      ross               && ( q & 1 ) )
   3938  1.1      ross        ) {
   3939  1.1      ross         aSig0 = alternateASig0;
   3940  1.1      ross         aSig1 = alternateASig1;
   3941  1.1      ross         zSign = ! zSign;
   3942  1.1      ross     }
   3943  1.1      ross     return
   3944  1.1      ross         normalizeRoundAndPackFloatx80(
   3945  1.1      ross             80, zSign, bExp + expDiff, aSig0, aSig1 );
   3946  1.1      ross 
   3947  1.1      ross }
   3948  1.1      ross 
   3949  1.7   thorpej /*----------------------------------------------------------------------------
   3950  1.7   thorpej | Returns the square root of the extended double-precision floating-point
   3951  1.7   thorpej | value `a'.  The operation is performed according to the IEC/IEEE Standard
   3952  1.7   thorpej | for Binary Floating-Point Arithmetic.
   3953  1.7   thorpej *----------------------------------------------------------------------------*/
   3954  1.7   thorpej 
   3955  1.1      ross floatx80 floatx80_sqrt( floatx80 a )
   3956  1.1      ross {
   3957  1.1      ross     flag aSign;
   3958  1.1      ross     int32 aExp, zExp;
   3959  1.1      ross     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
   3960  1.1      ross     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
   3961  1.1      ross     floatx80 z;
   3962  1.1      ross 
   3963  1.1      ross     aSig0 = extractFloatx80Frac( a );
   3964  1.1      ross     aExp = extractFloatx80Exp( a );
   3965  1.1      ross     aSign = extractFloatx80Sign( a );
                           /* Maximum exponent: propagate if any significand bit below the top one
                            * is set; otherwise return a when its sign is clear and treat a
                            * negative value as invalid. */
   3966  1.1      ross     if ( aExp == 0x7FFF ) {
   3967  1.1      ross         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
   3968  1.1      ross         if ( ! aSign ) return a;
   3969  1.1      ross         goto invalid;
   3970  1.1      ross     }
                           /* Negative input: a value whose exponent and significand are both zero
                            * is returned unchanged; anything else raises float_flag_invalid and
                            * yields the default NaN. */
   3971  1.1      ross     if ( aSign ) {
   3972  1.1      ross         if ( ( aExp | aSig0 ) == 0 ) return a;
   3973  1.1      ross  invalid:
   3974  1.1      ross         float_raise( float_flag_invalid );
   3975  1.1      ross         z.low = floatx80_default_nan_low;
   3976  1.1      ross         z.high = floatx80_default_nan_high;
   3977  1.1      ross         return z;
   3978  1.1      ross     }
                           /* Zero exponent field: a zero significand gives packFloatx80( 0, 0, 0 );
                            * a nonzero one is normalized first. */
   3979  1.1      ross     if ( aExp == 0 ) {
   3980  1.1      ross         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
   3981  1.1      ross         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
   3982  1.1      ross     }
                           /* Halve (arithmetic shift) the offset of aExp from 0x3FFF. */
   3983  1.1      ross     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
                           /* Initial estimate from the top 32 significand bits, then one
                            * refinement using estimateDiv128To64() on the significand shifted
                            * right by 2 or 3 bits depending on the low exponent bit. */
   3984  1.1      ross     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
   3985  1.1      ross     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
   3986  1.1      ross     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
   3987  1.1      ross     doubleZSig0 = zSig0<<1;
                           /* rem0:rem1 = aSig0:aSig1 - zSig0 * zSig0.  While that is negative,
                            * step zSig0 down by one and add the matching amount back. */
   3988  1.1      ross     mul64To128( zSig0, zSig0, &term0, &term1 );
   3989  1.1      ross     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
   3990  1.1      ross     while ( (sbits64) rem0 < 0 ) {
   3991  1.1      ross         --zSig0;
   3992  1.1      ross         doubleZSig0 -= 2;
   3993  1.1      ross         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
   3994  1.1      ross     }
                           /* Estimate the low 64 result bits from the remainder. */
   3995  1.1      ross     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
                           /* Only when the low 62 bits of that estimate are 5 or less is the
                            * remainder computed exactly: zSig1 is stepped down while the
                            * remainder is negative, and its low bit is set if any remainder
                            * bits are left over. */
   3996  1.1      ross     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
   3997  1.1      ross         if ( zSig1 == 0 ) zSig1 = 1;
   3998  1.1      ross         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
   3999  1.1      ross         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
   4000  1.1      ross         mul64To128( zSig1, zSig1, &term2, &term3 );
   4001  1.1      ross         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
   4002  1.1      ross         while ( (sbits64) rem1 < 0 ) {
   4003  1.1      ross             --zSig1;
   4004  1.1      ross             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
   4005  1.1      ross             term3 |= 1;
   4006  1.1      ross             term2 |= doubleZSig0;
   4007  1.1      ross             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
   4008  1.1      ross         }
   4009  1.1      ross         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
   4010  1.1      ross     }
                           /* Assemble zSig0:zSig1 and round at floatx80_rounding_precision; the
                            * sign passed for the result is always 0. */
   4011  1.1      ross     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
   4012  1.1      ross     zSig0 |= doubleZSig0;
   4013  1.1      ross     return
   4014  1.1      ross         roundAndPackFloatx80(
   4015  1.1      ross             floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
   4016  1.1      ross 
   4017  1.1      ross }
   4018  1.1      ross 
   4019  1.7   thorpej /*----------------------------------------------------------------------------
   4020  1.7   thorpej | Returns 1 if the extended double-precision floating-point value `a' is
   4021  1.7   thorpej | equal to the corresponding value `b', and 0 otherwise.  The comparison is
   4022  1.7   thorpej | performed according to the IEC/IEEE Standard for Binary Floating-Point
   4023  1.7   thorpej | Arithmetic.
   4024  1.7   thorpej *----------------------------------------------------------------------------*/
   4025  1.7   thorpej 
   4026  1.1      ross flag floatx80_eq( floatx80 a, floatx80 b )
   4027  1.1      ross {
   4028  1.1      ross 
                           /* NaN test: exponent 0x7FFF with any fraction bit set below the top
                            * one, for either operand. */
   4029  1.1      ross     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4030  1.1      ross               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4031  1.1      ross          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4032  1.1      ross               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4033  1.1      ross        ) {
                               /* Only a signaling NaN raises float_flag_invalid; the result
                                * is 0 for any NaN. */
   4034  1.1      ross         if (    floatx80_is_signaling_nan( a )
   4035  1.1      ross              || floatx80_is_signaling_nan( b ) ) {
   4036  1.1      ross             float_raise( float_flag_invalid );
   4037  1.1      ross         }
   4038  1.1      ross         return 0;
   4039  1.1      ross     }
                           /* Equal when the low words match and either the high words match as
                            * well, or the low word is 0 and both high words are 0 once the bit
                            * dropped by the <<1 / bits16 cast is ignored (presumably the sign
                            * bit, so that zeros of either sign compare equal). */
   4040  1.1      ross     return
   4041  1.1      ross            ( a.low == b.low )
   4042  1.1      ross         && (    ( a.high == b.high )
   4043  1.1      ross              || (    ( a.low == 0 )
   4044  1.1      ross                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
   4045  1.1      ross            );
   4046  1.1      ross 
   4047  1.1      ross }
   4048  1.1      ross 
   4049  1.7   thorpej /*----------------------------------------------------------------------------
   4050  1.7   thorpej | Returns 1 if the extended double-precision floating-point value `a' is
   4051  1.7   thorpej | less than or equal to the corresponding value `b', and 0 otherwise.  The
   4052  1.7   thorpej | comparison is performed according to the IEC/IEEE Standard for Binary
   4053  1.7   thorpej | Floating-Point Arithmetic.
   4054  1.7   thorpej *----------------------------------------------------------------------------*/
   4055  1.7   thorpej 
   4056  1.1      ross flag floatx80_le( floatx80 a, floatx80 b )
   4057  1.1      ross {
   4058  1.1      ross     flag aSign, bSign;
   4059  1.1      ross 
                           /* Any NaN operand (exponent 0x7FFF with a fraction bit set below the
                            * top one) raises float_flag_invalid unconditionally and gives 0. */
   4060  1.1      ross     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4061  1.1      ross               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4062  1.1      ross          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4063  1.1      ross               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4064  1.1      ross        ) {
   4065  1.1      ross         float_raise( float_flag_invalid );
   4066  1.1      ross         return 0;
   4067  1.1      ross     }
   4068  1.1      ross     aSign = extractFloatx80Sign( a );
   4069  1.1      ross     bSign = extractFloatx80Sign( b );
                           /* Signs differ: true when a is the negative operand, or when every
                            * bit of both operands other than the one dropped by the <<1 / bits16
                            * cast is zero (both operands are zero). */
   4070  1.1      ross     if ( aSign != bSign ) {
   4071  1.1      ross         return
   4072  1.1      ross                aSign
   4073  1.1      ross             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   4074  1.1      ross                  == 0 );
   4075  1.1      ross     }
                           /* Same sign: compare high:low as a 128-bit pair, with the operands
                            * swapped when both are negative. */
   4076  1.1      ross     return
   4077  1.1      ross           aSign ? le128( b.high, b.low, a.high, a.low )
   4078  1.1      ross         : le128( a.high, a.low, b.high, b.low );
   4079  1.1      ross 
   4080  1.1      ross }
   4081  1.1      ross 
   4082  1.7   thorpej /*----------------------------------------------------------------------------
   4083  1.7   thorpej | Returns 1 if the extended double-precision floating-point value `a' is
   4084  1.7   thorpej | less than the corresponding value `b', and 0 otherwise.  The comparison
   4085  1.7   thorpej | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4086  1.7   thorpej | Arithmetic.
   4087  1.7   thorpej *----------------------------------------------------------------------------*/
   4088  1.7   thorpej 
   4089  1.1      ross flag floatx80_lt( floatx80 a, floatx80 b )
   4090  1.1      ross {
   4091  1.1      ross     flag aSign, bSign;
   4092  1.1      ross 
                           /* Any NaN operand (exponent 0x7FFF with a fraction bit set below the
                            * top one) raises float_flag_invalid unconditionally and gives 0. */
   4093  1.1      ross     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4094  1.1      ross               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4095  1.1      ross          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4096  1.1      ross               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4097  1.1      ross        ) {
   4098  1.1      ross         float_raise( float_flag_invalid );
   4099  1.1      ross         return 0;
   4100  1.1      ross     }
   4101  1.1      ross     aSign = extractFloatx80Sign( a );
   4102  1.1      ross     bSign = extractFloatx80Sign( b );
                           /* Signs differ: true only when a is the negative operand and the two
                            * operands are not both zero (some bit other than the one dropped by
                            * the <<1 / bits16 cast is set). */
   4103  1.1      ross     if ( aSign != bSign ) {
   4104  1.1      ross         return
   4105  1.1      ross                aSign
   4106  1.1      ross             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   4107  1.1      ross                  != 0 );
   4108  1.1      ross     }
                           /* Same sign: strict 128-bit comparison of high:low, with the operands
                            * swapped when both are negative. */
   4109  1.1      ross     return
   4110  1.1      ross           aSign ? lt128( b.high, b.low, a.high, a.low )
   4111  1.1      ross         : lt128( a.high, a.low, b.high, b.low );
   4112  1.1      ross 
   4113  1.1      ross }
   4114  1.1      ross 
   4115  1.7   thorpej /*----------------------------------------------------------------------------
   4116  1.7   thorpej | Returns 1 if the extended double-precision floating-point value `a' is equal
   4117  1.7   thorpej | to the corresponding value `b', and 0 otherwise.  The invalid exception is
   4118  1.7   thorpej | raised if either operand is a NaN.  Otherwise, the comparison is performed
   4119  1.7   thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4120  1.7   thorpej *----------------------------------------------------------------------------*/
   4121  1.7   thorpej 
   4122  1.1      ross flag floatx80_eq_signaling( floatx80 a, floatx80 b )
   4123  1.1      ross {
   4124  1.1      ross 
                           /* Any NaN operand (exponent 0x7FFF with a fraction bit set below the
                            * top one) raises float_flag_invalid, with no signaling/quiet
                            * distinction, and gives 0. */
   4125  1.1      ross     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4126  1.1      ross               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4127  1.1      ross          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4128  1.1      ross               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4129  1.1      ross        ) {
   4130  1.1      ross         float_raise( float_flag_invalid );
   4131  1.1      ross         return 0;
   4132  1.1      ross     }
                           /* Equal when the low words match and either the high words match as
                            * well, or the low word is 0 and both high words are 0 once the bit
                            * dropped by the <<1 / bits16 cast is ignored (presumably the sign
                            * bit, so that zeros of either sign compare equal). */
   4133  1.1      ross     return
   4134  1.1      ross            ( a.low == b.low )
   4135  1.1      ross         && (    ( a.high == b.high )
   4136  1.1      ross              || (    ( a.low == 0 )
   4137  1.1      ross                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
   4138  1.1      ross            );
   4139  1.1      ross 
   4140  1.1      ross }
   4141  1.1      ross 
   4142  1.7   thorpej /*----------------------------------------------------------------------------
   4143  1.7   thorpej | Returns 1 if the extended double-precision floating-point value `a' is less
   4144  1.7   thorpej | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
   4145  1.7   thorpej | do not cause an exception.  Otherwise, the comparison is performed according
   4146  1.7   thorpej | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4147  1.7   thorpej *----------------------------------------------------------------------------*/
   4148  1.7   thorpej 
   4149  1.1      ross flag floatx80_le_quiet( floatx80 a, floatx80 b )
   4150  1.1      ross {
   4151  1.1      ross     flag aSign, bSign;
   4152  1.1      ross 
                           /* NaN test: exponent 0x7FFF with any fraction bit set below the top
                            * one, for either operand. */
   4153  1.1      ross     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4154  1.1      ross               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4155  1.1      ross          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4156  1.1      ross               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4157  1.1      ross        ) {
                               /* Only a signaling NaN raises float_flag_invalid; the result
                                * is 0 for any NaN. */
   4158  1.1      ross         if (    floatx80_is_signaling_nan( a )
   4159  1.1      ross              || floatx80_is_signaling_nan( b ) ) {
   4160  1.1      ross             float_raise( float_flag_invalid );
   4161  1.1      ross         }
   4162  1.1      ross         return 0;
   4163  1.1      ross     }
   4164  1.1      ross     aSign = extractFloatx80Sign( a );
   4165  1.1      ross     bSign = extractFloatx80Sign( b );
                           /* Signs differ: true when a is the negative operand, or when every
                            * bit of both operands other than the one dropped by the <<1 / bits16
                            * cast is zero (both operands are zero). */
   4166  1.1      ross     if ( aSign != bSign ) {
   4167  1.1      ross         return
   4168  1.1      ross                aSign
   4169  1.1      ross             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   4170  1.1      ross                  == 0 );
   4171  1.1      ross     }
                           /* Same sign: compare high:low as a 128-bit pair, with the operands
                            * swapped when both are negative. */
   4172  1.1      ross     return
   4173  1.1      ross           aSign ? le128( b.high, b.low, a.high, a.low )
   4174  1.1      ross         : le128( a.high, a.low, b.high, b.low );
   4175  1.1      ross 
   4176  1.1      ross }
   4177  1.1      ross 
   4178  1.7   thorpej /*----------------------------------------------------------------------------
   4179  1.7   thorpej | Returns 1 if the extended double-precision floating-point value `a' is less
   4180  1.7   thorpej | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
   4181  1.7   thorpej | an exception.  Otherwise, the comparison is performed according to the
   4182  1.7   thorpej | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4183  1.7   thorpej *----------------------------------------------------------------------------*/
   4184  1.7   thorpej 
   4185  1.1      ross flag floatx80_lt_quiet( floatx80 a, floatx80 b )
   4186  1.1      ross {
   4187  1.1      ross     flag aSign, bSign;
   4188  1.1      ross 
                           /* NaN test: exponent 0x7FFF with any fraction bit set below the top
                            * one, for either operand. */
   4189  1.1      ross     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   4190  1.1      ross               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   4191  1.1      ross          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   4192  1.1      ross               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   4193  1.1      ross        ) {
                               /* Only a signaling NaN raises float_flag_invalid; the result
                                * is 0 for any NaN. */
   4194  1.1      ross         if (    floatx80_is_signaling_nan( a )
   4195  1.1      ross              || floatx80_is_signaling_nan( b ) ) {
   4196  1.1      ross             float_raise( float_flag_invalid );
   4197  1.1      ross         }
   4198  1.1      ross         return 0;
   4199  1.1      ross     }
   4200  1.1      ross     aSign = extractFloatx80Sign( a );
   4201  1.1      ross     bSign = extractFloatx80Sign( b );
                           /* Signs differ: true only when a is the negative operand and the two
                            * operands are not both zero (some bit other than the one dropped by
                            * the <<1 / bits16 cast is set). */
   4202  1.1      ross     if ( aSign != bSign ) {
   4203  1.1      ross         return
   4204  1.1      ross                aSign
   4205  1.1      ross             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   4206  1.1      ross                  != 0 );
   4207  1.1      ross     }
                           /* Same sign: strict 128-bit comparison of high:low, with the operands
                            * swapped when both are negative. */
   4208  1.1      ross     return
   4209  1.1      ross           aSign ? lt128( b.high, b.low, a.high, a.low )
   4210  1.1      ross         : lt128( a.high, a.low, b.high, b.low );
   4211  1.1      ross 
   4212  1.1      ross }
   4213  1.1      ross 
   4214  1.1      ross #endif
   4215  1.1      ross 
   4216  1.1      ross #ifdef FLOAT128
   4217  1.1      ross 
   4218  1.7   thorpej /*----------------------------------------------------------------------------
   4219  1.7   thorpej | Returns the result of converting the quadruple-precision floating-point
   4220  1.7   thorpej | value `a' to the 32-bit two's complement integer format.  The conversion
   4221  1.7   thorpej | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4222  1.7   thorpej | Arithmetic---which means in particular that the conversion is rounded
   4223  1.7   thorpej | according to the current rounding mode.  If `a' is a NaN, the largest
   4224  1.7   thorpej | positive integer is returned.  Otherwise, if the conversion overflows, the
   4225  1.7   thorpej | largest integer with the same sign as `a' is returned.
   4226  1.7   thorpej *----------------------------------------------------------------------------*/
   4227  1.7   thorpej 
   4228  1.1      ross int32 float128_to_int32( float128 a )
   4229  1.1      ross {
   4230  1.1      ross     flag aSign;
   4231  1.1      ross     int32 aExp, shiftCount;
   4232  1.1      ross     bits64 aSig0, aSig1;
   4233  1.1      ross 
   4234  1.1      ross     aSig1 = extractFloat128Frac1( a );
   4235  1.1      ross     aSig0 = extractFloat128Frac0( a );
   4236  1.1      ross     aExp = extractFloat128Exp( a );
   4237  1.1      ross     aSign = extractFloat128Sign( a );
                           /* NaN (maximum exponent, nonzero fraction): clear the sign so the
                            * value is handled as positive from here on. */
   4238  1.1      ross     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
                           /* Nonzero exponent field: set bit 48 of the high fraction word
                            * (presumably the implicit leading significand bit). */
   4239  1.1      ross     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
                           /* Fold a nonzero low fraction word into bit 0 as a sticky bit. */
   4240  1.1      ross     aSig0 |= ( aSig1 != 0 );
                           /* Shift right, jamming the lost bits, only when aExp is below 0x4028;
                            * the final rounding and range handling are left to
                            * roundAndPackInt32(). */
   4241  1.1      ross     shiftCount = 0x4028 - aExp;
   4242  1.1      ross     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
   4243  1.1      ross     return roundAndPackInt32( aSign, aSig0 );
   4244  1.1      ross 
   4245  1.1      ross }
   4246  1.1      ross 
   4247  1.7   thorpej /*----------------------------------------------------------------------------
   4248  1.7   thorpej | Returns the result of converting the quadruple-precision floating-point
   4249  1.7   thorpej | value `a' to the 32-bit two's complement integer format.  The conversion
   4250  1.7   thorpej | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4251  1.7   thorpej | Arithmetic, except that the conversion is always rounded toward zero.  If
   4252  1.7   thorpej | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
   4253  1.7   thorpej | conversion overflows, the largest integer with the same sign as `a' is
   4254  1.7   thorpej | returned.
   4255  1.7   thorpej *----------------------------------------------------------------------------*/
   4256  1.7   thorpej 
   4257  1.1      ross int32 float128_to_int32_round_to_zero( float128 a )
   4258  1.1      ross {
   4259  1.1      ross     flag aSign;
   4260  1.1      ross     int32 aExp, shiftCount;
   4261  1.1      ross     bits64 aSig0, aSig1, savedASig;
   4262  1.1      ross     int32 z;
   4263  1.1      ross 
   4264  1.1      ross     aSig1 = extractFloat128Frac1( a );
   4265  1.1      ross     aSig0 = extractFloat128Frac0( a );
   4266  1.1      ross     aExp = extractFloat128Exp( a );
   4267  1.1      ross     aSign = extractFloat128Sign( a );
                           /* Fold a nonzero low fraction word into bit 0 as a sticky bit. */
   4268  1.1      ross     aSig0 |= ( aSig1 != 0 );
                           /* Exponent above 0x401E is always invalid; when it is the maximum
                            * exponent with a nonzero fraction (NaN) the sign is cleared first so
                            * the positive limit is returned. */
   4269  1.1      ross     if ( 0x401E < aExp ) {
   4270  1.1      ross         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
   4271  1.1      ross         goto invalid;
   4272  1.1      ross     }
                           /* Exponent below 0x3FFF: the result is 0, and inexact is flagged
                            * unless the exponent and fraction are both zero. */
   4273  1.1      ross     else if ( aExp < 0x3FFF ) {
   4274  1.1      ross         if ( aExp || aSig0 ) float_set_inexact();
   4275  1.1      ross         return 0;
   4276  1.1      ross     }
                           /* Set bit 48 and truncate by shifting right; savedASig keeps the
                            * unshifted value so discarded bits can be detected afterwards. */
   4277  1.1      ross     aSig0 |= LIT64( 0x0001000000000000 );
   4278  1.1      ross     shiftCount = 0x402F - aExp;
   4279  1.1      ross     savedASig = aSig0;
   4280  1.1      ross     aSig0 >>= shiftCount;
   4281  1.1      ross     z = aSig0;
                           /* NOTE(review): assuming int32 is a 32-bit signed type, aExp == 0x401E
                            * with a zero fraction makes z the most negative value here, and the
                            * negation then relies on signed wraparound -- confirm the build
                            * guarantees that behavior. */
   4282  1.1      ross     if ( aSign ) z = - z;
                           /* A sign of z that disagrees with aSign is treated as overflow.  The
                            * invalid path raises float_flag_invalid and returns 0x80000000 for a
                            * negative input, 0x7FFFFFFF otherwise. */
   4283  1.1      ross     if ( ( z < 0 ) ^ aSign ) {
   4284  1.1      ross  invalid:
   4285  1.1      ross         float_raise( float_flag_invalid );
   4286  1.1      ross         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
   4287  1.1      ross     }
                           /* Shifting back does not reproduce the saved value: bits were
                            * discarded, so the conversion is inexact. */
   4288  1.1      ross     if ( ( aSig0<<shiftCount ) != savedASig ) {
   4289  1.1      ross         float_set_inexact();
   4290  1.1      ross     }
   4291  1.1      ross     return z;
   4292  1.1      ross 
   4293  1.1      ross }
   4294  1.1      ross 
   4295  1.7   thorpej /*----------------------------------------------------------------------------
   4296  1.7   thorpej | Returns the result of converting the quadruple-precision floating-point
   4297  1.7   thorpej | value `a' to the 64-bit two's complement integer format.  The conversion
   4298  1.7   thorpej | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4299  1.7   thorpej | Arithmetic---which means in particular that the conversion is rounded
   4300  1.7   thorpej | according to the current rounding mode.  If `a' is a NaN, the largest
   4301  1.7   thorpej | positive integer is returned.  Otherwise, if the conversion overflows, the
   4302  1.7   thorpej | largest integer with the same sign as `a' is returned.
   4303  1.7   thorpej *----------------------------------------------------------------------------*/
   4304  1.7   thorpej 
   4305  1.1      ross int64 float128_to_int64( float128 a )
   4306  1.1      ross {
   4307  1.1      ross     flag aSign;
   4308  1.1      ross     int32 aExp, shiftCount;
   4309  1.1      ross     bits64 aSig0, aSig1;
   4310  1.1      ross 
   4311  1.1      ross     aSig1 = extractFloat128Frac1( a );
   4312  1.1      ross     aSig0 = extractFloat128Frac0( a );
   4313  1.1      ross     aExp = extractFloat128Exp( a );
   4314  1.1      ross     aSign = extractFloat128Sign( a );
                           /* Nonzero exponent field: set bit 48 of the high fraction word
                            * (presumably the implicit leading significand bit). */
   4315  1.1      ross     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
   4316  1.1      ross     shiftCount = 0x402F - aExp;
   4317  1.1      ross     if ( shiftCount <= 0 ) {
                               /* Exponent above 0x403E: raise float_flag_invalid.  Return
                                * the largest positive value when the sign is clear or a is
                                * a NaN (maximum exponent with a fraction bit set), and the
                                * most negative value otherwise. */
   4318  1.1      ross         if ( 0x403E < aExp ) {
   4319  1.1      ross             float_raise( float_flag_invalid );
   4320  1.1      ross             if (    ! aSign
   4321  1.1      ross                  || (    ( aExp == 0x7FFF )
   4322  1.1      ross                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
   4323  1.1      ross                     )
   4324  1.1      ross                ) {
   4325  1.1      ross                 return LIT64( 0x7FFFFFFFFFFFFFFF );
   4326  1.1      ross             }
   4327  1.1      ross             return (sbits64) LIT64( 0x8000000000000000 );
   4328  1.1      ross         }
                               /* Otherwise shift the 128-bit significand left by the
                                * (non-negative) amount -shiftCount. */
   4329  1.1      ross         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
   4330  1.1      ross     }
   4331  1.1      ross     else {
                               /* Positive shiftCount: shift right, jamming the bits that
                                * fall off into the extra word. */
   4332  1.1      ross         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
   4333  1.1      ross     }
                           /* Final rounding and packing are delegated to roundAndPackInt64(). */
   4334  1.1      ross     return roundAndPackInt64( aSign, aSig0, aSig1 );
   4335  1.1      ross 
   4336  1.1      ross }
   4337  1.1      ross 
   4338  1.7   thorpej /*----------------------------------------------------------------------------
   4339  1.7   thorpej | Returns the result of converting the quadruple-precision floating-point
   4340  1.7   thorpej | value `a' to the 64-bit two's complement integer format.  The conversion
   4341  1.7   thorpej | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4342  1.7   thorpej | Arithmetic, except that the conversion is always rounded toward zero.
   4343  1.7   thorpej | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
   4344  1.7   thorpej | the conversion overflows, the largest integer with the same sign as `a' is
   4345  1.7   thorpej | returned.
   4346  1.7   thorpej *----------------------------------------------------------------------------*/
   4347  1.7   thorpej 
   4348  1.1      ross int64 float128_to_int64_round_to_zero( float128 a )
   4349  1.1      ross {
   4350  1.1      ross     flag aSign;
   4351  1.1      ross     int32 aExp, shiftCount;
   4352  1.1      ross     bits64 aSig0, aSig1;
   4353  1.1      ross     int64 z;
   4354  1.1      ross 
   4355  1.1      ross     aSig1 = extractFloat128Frac1( a );
   4356  1.1      ross     aSig0 = extractFloat128Frac0( a );
   4357  1.1      ross     aExp = extractFloat128Exp( a );
   4358  1.1      ross     aSign = extractFloat128Sign( a );
                           /* Nonzero exponent field: set bit 48 of the high fraction word
                            * (presumably the implicit leading significand bit). */
   4359  1.1      ross     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
                           /* Here a positive shiftCount means a left shift is needed. */
   4360  1.1      ross     shiftCount = aExp - 0x402F;
   4361  1.1      ross     if ( 0 < shiftCount ) {
                               /* Exponent 0x403E or above.  The one accepted input has
                                * a.high == 0xC03E000000000000 and aSig1 below
                                * 0x0002000000000000: it returns the most negative value
                                * and flags inexact when aSig1 is nonzero.  Everything
                                * else raises float_flag_invalid and returns the largest
                                * positive value for a clear sign or a NaN, the most
                                * negative value otherwise. */
   4362  1.1      ross         if ( 0x403E <= aExp ) {
   4363  1.1      ross             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
   4364  1.1      ross             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
   4365  1.1      ross                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
   4366  1.1      ross                 if ( aSig1 ) float_set_inexact();
   4367  1.1      ross             }
   4368  1.1      ross             else {
   4369  1.1      ross                 float_raise( float_flag_invalid );
   4370  1.1      ross                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
   4371  1.1      ross                     return LIT64( 0x7FFFFFFFFFFFFFFF );
   4372  1.1      ross                 }
   4373  1.1      ross             }
   4374  1.1      ross             return (sbits64) LIT64( 0x8000000000000000 );
   4375  1.1      ross         }
                               /* Shift the high word left and bring in the top shiftCount
                                * bits of the low word; any low-word bits left behind make
                                * the result inexact. */
   4376  1.1      ross         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
   4377  1.1      ross         if ( (bits64) ( aSig1<<shiftCount ) ) {
   4378  1.1      ross             float_set_inexact();
   4379  1.1      ross         }
   4380  1.1      ross     }
   4381  1.1      ross     else {
                               /* Exponent below 0x3FFF: the result is 0, and inexact is
                                * flagged unless exponent and both fraction words are 0. */
   4382  1.1      ross         if ( aExp < 0x3FFF ) {
   4383  1.1      ross             if ( aExp | aSig0 | aSig1 ) {
   4384  1.1      ross                 float_set_inexact();
   4385  1.1      ross             }
   4386  1.1      ross             return 0;
   4387  1.1      ross         }
                               /* Truncate by shifting the high word right by -shiftCount.
                                * Inexact if the low word is nonzero or, for a nonzero
                                * shift, if any of the shifted-out high-word bits is set. */
   4388  1.1      ross         z = aSig0>>( - shiftCount );
   4389  1.1      ross         if (    aSig1
   4390  1.1      ross              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
   4391  1.1      ross             float_set_inexact();
   4392  1.1      ross         }
   4393  1.1      ross     }
                           /* Apply the sign to the truncated magnitude. */
   4394  1.1      ross     if ( aSign ) z = - z;
   4395  1.1      ross     return z;
   4396  1.1      ross 
   4397  1.1      ross }
   4398  1.1      ross 
   4399  1.7   thorpej /*----------------------------------------------------------------------------
   4400  1.7   thorpej | Returns the result of converting the quadruple-precision floating-point
   4401  1.7   thorpej | value `a' to the single-precision floating-point format.  The conversion
   4402  1.7   thorpej | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4403  1.7   thorpej | Arithmetic.
   4404  1.7   thorpej *----------------------------------------------------------------------------*/
   4405  1.7   thorpej 
   4406  1.1      ross float32 float128_to_float32( float128 a )
   4407  1.1      ross {
   4408  1.1      ross     flag aSign;
   4409  1.1      ross     int32 aExp;
   4410  1.1      ross     bits64 aSig0, aSig1;
   4411  1.1      ross     bits32 zSig;
   4412  1.1      ross 
   4413  1.1      ross     aSig1 = extractFloat128Frac1( a );
   4414  1.1      ross     aSig0 = extractFloat128Frac0( a );
   4415  1.1      ross     aExp = extractFloat128Exp( a );
   4416  1.1      ross     aSign = extractFloat128Sign( a );
                           /* Maximum exponent: a nonzero fraction (NaN) is converted through the
                            * common NaN form; otherwise the result is packed with exponent 0xFF
                            * and a zero fraction, keeping the sign. */
   4417  1.1      ross     if ( aExp == 0x7FFF ) {
   4418  1.1      ross         if ( aSig0 | aSig1 ) {
   4419  1.1      ross             return commonNaNToFloat32( float128ToCommonNaN( a ) );
   4420  1.1      ross         }
   4421  1.1      ross         return packFloat32( aSign, 0xFF, 0 );
   4422  1.1      ross     }
                           /* Fold the low fraction word into a sticky bit, then shift right by
                            * 18 with jamming and keep the low 32 bits as the significand. */
   4423  1.1      ross     aSig0 |= ( aSig1 != 0 );
   4424  1.1      ross     shift64RightJamming( aSig0, 18, &aSig0 );
   4425  1.1      ross     zSig = aSig0;
                           /* Unless both exponent and significand are zero, set bit 30 of the
                            * significand and lower the exponent by 0x3F81 before rounding. */
   4426  1.1      ross     if ( aExp || zSig ) {
   4427  1.1      ross         zSig |= 0x40000000;
   4428  1.1      ross         aExp -= 0x3F81;
   4429  1.1      ross     }
   4430  1.1      ross     return roundAndPackFloat32( aSign, aExp, zSig );
   4431  1.1      ross 
   4432  1.1      ross }
   4433  1.1      ross 
   4434  1.7   thorpej /*----------------------------------------------------------------------------
   4435  1.7   thorpej | Returns the result of converting the quadruple-precision floating-point
   4436  1.7   thorpej | value `a' to the double-precision floating-point format.  The conversion
   4437  1.7   thorpej | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   4438  1.7   thorpej | Arithmetic.
   4439  1.7   thorpej *----------------------------------------------------------------------------*/
   4440  1.7   thorpej 
   4441  1.1      ross float64 float128_to_float64( float128 a )
   4442  1.1      ross {
   4443  1.1      ross     flag aSign;
   4444  1.1      ross     int32 aExp;
   4445  1.1      ross     bits64 aSig0, aSig1;
   4446  1.1      ross 
                           /* Unpack `a' into its two fraction words, exponent field and sign. */
   4447  1.1      ross     aSig1 = extractFloat128Frac1( a );
   4448  1.1      ross     aSig0 = extractFloat128Frac0( a );
   4449  1.1      ross     aExp = extractFloat128Exp( a );
   4450  1.1      ross     aSign = extractFloat128Sign( a );
                           /* All-ones exponent: a non-zero fraction is converted through the
                            * common-NaN helpers; a zero fraction returns
                            * packFloat64( aSign, 0x7FF, 0 ). */
   4451  1.1      ross     if ( aExp == 0x7FFF ) {
   4452  1.1      ross         if ( aSig0 | aSig1 ) {
   4453  1.1      ross             return commonNaNToFloat64( float128ToCommonNaN( a ) );
   4454  1.1      ross         }
   4455  1.1      ross         return packFloat64( aSign, 0x7FF, 0 );
   4456  1.1      ross     }
                           /* Shift the 128-bit fraction left by 14, then record "anything
                            * left in aSig1" in bit 0 of aSig0; only aSig0 is passed on. */
   4457  1.1      ross     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
   4458  1.1      ross     aSig0 |= ( aSig1 != 0 );
                           /* For any non-zero input, set bit 62 of aSig0 and rebias the
                            * exponent.  0x3C01 == 0x3FFF - 0x3FF + 1; presumably the form
                            * roundAndPackFloat64 expects -- confirm against that helper. */
   4459  1.1      ross     if ( aExp || aSig0 ) {
   4460  1.1      ross         aSig0 |= LIT64( 0x4000000000000000 );
   4461  1.1      ross         aExp -= 0x3C01;
   4462  1.1      ross     }
   4463  1.1      ross     return roundAndPackFloat64( aSign, aExp, aSig0 );
   4464  1.1      ross 
   4465  1.1      ross }
   4466  1.1      ross 
   4467  1.1      ross #ifdef FLOATX80
   4468  1.1      ross 
   4469  1.7   thorpej /*----------------------------------------------------------------------------
   4470  1.7   thorpej | Returns the result of converting the quadruple-precision floating-point
   4471  1.7   thorpej | value `a' to the extended double-precision floating-point format.  The
   4472  1.7   thorpej | conversion is performed according to the IEC/IEEE Standard for Binary
   4473  1.7   thorpej | Floating-Point Arithmetic.
   4474  1.7   thorpej *----------------------------------------------------------------------------*/
   4475  1.7   thorpej 
   4476  1.1      ross floatx80 float128_to_floatx80( float128 a )
   4477  1.1      ross {
   4478  1.1      ross     flag aSign;
   4479  1.1      ross     int32 aExp;
   4480  1.1      ross     bits64 aSig0, aSig1;
   4481  1.1      ross 
                           /* Unpack `a' into its two fraction words, exponent field and sign. */
   4482  1.1      ross     aSig1 = extractFloat128Frac1( a );
   4483  1.1      ross     aSig0 = extractFloat128Frac0( a );
   4484  1.1      ross     aExp = extractFloat128Exp( a );
   4485  1.1      ross     aSign = extractFloat128Sign( a );
                           /* All-ones exponent: a non-zero fraction is converted through the
                            * common-NaN helpers; a zero fraction returns exponent 0x7FFF with
                            * only the top significand bit set. */
   4486  1.1      ross     if ( aExp == 0x7FFF ) {
   4487  1.1      ross         if ( aSig0 | aSig1 ) {
   4488  1.1      ross             return commonNaNToFloatx80( float128ToCommonNaN( a ) );
   4489  1.1      ross         }
   4490  1.1      ross         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   4491  1.1      ross     }
                           /* Zero exponent: an all-zero fraction returns a signed zero;
                            * otherwise the helper rewrites aExp/aSig0/aSig1 through the
                            * pointers (presumably normalizing the subnormal -- confirm). */
   4492  1.1      ross     if ( aExp == 0 ) {
   4493  1.1      ross         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
   4494  1.1      ross         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
   4495  1.1      ross     }
   4496  1.1      ross     else {
                               /* Non-zero exponent: set bit 48 of the high fraction word. */
   4497  1.1      ross         aSig0 |= LIT64( 0x0001000000000000 );
   4498  1.1      ross     }
                           /* Shift left by 15, which moves bit 48 of aSig0 up to bit 63. */
   4499  1.1      ross     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
                           /* First argument 80 is presumably the rounding precision -- confirm
                            * against roundAndPackFloatx80. */
   4500  1.1      ross     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
   4501  1.1      ross 
   4502  1.1      ross }
   4503  1.1      ross 
   4504  1.1      ross #endif
   4505  1.1      ross 
   4506  1.7   thorpej /*----------------------------------------------------------------------------
   4507  1.7   thorpej | Rounds the quadruple-precision floating-point value `a' to an integer, and
   4508  1.7   thorpej | returns the result as a quadruple-precision floating-point value.  The
   4509  1.7   thorpej | operation is performed according to the IEC/IEEE Standard for Binary
   4510  1.7   thorpej | Floating-Point Arithmetic.
   4511  1.7   thorpej *----------------------------------------------------------------------------*/
   4512  1.7   thorpej 
   4513  1.1      ross float128 float128_round_to_int( float128 a )
   4514  1.1      ross {
   4515  1.1      ross     flag aSign;
   4516  1.1      ross     int32 aExp;
   4517  1.1      ross     bits64 lastBitMask, roundBitsMask;
   4518  1.1      ross     int8 roundingMode;
   4519  1.1      ross     float128 z;
   4520  1.1      ross 
                           /*
                            * Three exponent ranges are handled separately:
                            *   aExp >= 0x402F          : rounding position is in a.low
                            *                             (or nothing to round, see below);
                            *   aExp <  0x3FFF          : result is chosen directly from the
                            *                             rounding mode;
                            *   0x3FFF <= aExp < 0x402F : rounding position is in a.high.
                            */
   4521  1.1      ross     aExp = extractFloat128Exp( a );
   4522  1.1      ross     if ( 0x402F <= aExp ) {
   4523  1.1      ross         if ( 0x406F <= aExp ) {
                                   /* Returned unchanged, except that a NaN (all-ones exponent,
                                    * non-zero fraction) goes through propagateFloat128NaN. */
   4524  1.1      ross             if (    ( aExp == 0x7FFF )
   4525  1.1      ross                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
   4526  1.1      ross                ) {
   4527  1.1      ross                 return propagateFloat128NaN( a, a );
   4528  1.1      ross             }
   4529  1.1      ross             return a;
   4530  1.1      ross         }
                               /* The shift is split in two steps so that aExp == 0x402F
                                * (63 + 1) produces lastBitMask == 0 instead of a single
                                * shift by 64 (assumes bits64 is a 64-bit unsigned type). */
   4531  1.1      ross         lastBitMask = 1;
   4532  1.1      ross         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
                               /* All bits below lastBitMask; all ones when lastBitMask == 0. */
   4533  1.1      ross         roundBitsMask = lastBitMask - 1;
   4534  1.1      ross         z = a;
   4535  1.1      ross         roundingMode = float_rounding_mode();
   4536  1.1      ross         if ( roundingMode == float_round_nearest_even ) {
   4537  1.1      ross             if ( lastBitMask ) {
                                       /* Add half of lastBitMask across both words; if the
                                        * bits to be discarded are then all zero, clear the
                                        * lastBitMask bit. */
   4538  1.1      ross                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
   4539  1.1      ross                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
   4540  1.1      ross             }
   4541  1.1      ross             else {
                                       /* lastBitMask == 0: the whole of z.low is discarded.
                                        * If its top bit is set, increment z.high; if the
                                        * remaining 63 bits are zero, clear bit 0 of z.high. */
   4542  1.1      ross                 if ( (sbits64) z.low < 0 ) {
   4543  1.1      ross                     ++z.high;
   4544  1.1      ross                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
   4545  1.1      ross                 }
   4546  1.1      ross             }
   4547  1.1      ross         }
   4548  1.1      ross         else if ( roundingMode != float_round_to_zero ) {
                                   /* Remaining modes: add roundBitsMask when exactly one of
                                    * "sign bit set" and "mode is round_up" holds. */
   4549  1.1      ross             if (   extractFloat128Sign( z )
   4550  1.1      ross                  ^ ( roundingMode == float_round_up ) ) {
   4551  1.1      ross                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
   4552  1.1      ross             }
   4553  1.1      ross         }
                               /* Discard the bits below the rounding position. */
   4554  1.1      ross         z.low &= ~ roundBitsMask;
   4555  1.1      ross     }
   4556  1.1      ross     else {
   4557  1.1      ross         if ( aExp < 0x3FFF ) {
                                   /* Everything except the sign bit is zero: return `a'
                                    * unchanged without touching the inexact flag. */
   4558  1.1      ross             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
   4559  1.1      ross             float_set_inexact();
   4560  1.1      ross             aSign = extractFloat128Sign( a );
                                   /* Modes without a case below, and nearest-even when its
                                    * condition fails, fall out of the switch to the
                                    * signed-zero return. */
   4561  1.1      ross             switch ( float_rounding_mode() ) {
   4562  1.1      ross              case float_round_nearest_even:
                                       /* Exponent 0x3FFE with a non-zero fraction returns
                                        * exponent 0x3FFF with a zero fraction, sign kept. */
   4563  1.1      ross                 if (    ( aExp == 0x3FFE )
   4564  1.1      ross                      && (   extractFloat128Frac0( a )
   4565  1.1      ross                           | extractFloat128Frac1( a ) )
   4566  1.1      ross                    ) {
   4567  1.1      ross                     return packFloat128( aSign, 0x3FFF, 0, 0 );
   4568  1.1      ross                 }
   4569  1.1      ross                 break;
   4570  1.1      ross              case float_round_down:
   4571  1.1      ross                 return
   4572  1.1      ross                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
   4573  1.1      ross                     : packFloat128( 0, 0, 0, 0 );
   4574  1.1      ross              case float_round_up:
   4575  1.1      ross                 return
   4576  1.1      ross                       aSign ? packFloat128( 1, 0, 0, 0 )
   4577  1.1      ross                     : packFloat128( 0, 0x3FFF, 0, 0 );
   4578  1.1      ross             }
   4579  1.1      ross             return packFloat128( aSign, 0, 0, 0 );
   4580  1.1      ross         }
                               /* Rounding position is in the high word: the result's low
                                * word is zero and a.low only contributes as "non-zero". */
   4581  1.1      ross         lastBitMask = 1;
   4582  1.1      ross         lastBitMask <<= 0x402F - aExp;
   4583  1.1      ross         roundBitsMask = lastBitMask - 1;
   4584  1.1      ross         z.low = 0;
   4585  1.1      ross         z.high = a.high;
   4586  1.1      ross         roundingMode = float_rounding_mode();
   4587  1.1      ross         if ( roundingMode == float_round_nearest_even ) {
                                   /* Add half of lastBitMask; if the discarded high-word bits
                                    * and all of a.low are zero, clear the lastBitMask bit. */
   4588  1.1      ross             z.high += lastBitMask>>1;
   4589  1.1      ross             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
   4590  1.1      ross                 z.high &= ~ lastBitMask;
   4591  1.1      ross             }
   4592  1.1      ross         }
   4593  1.1      ross         else if ( roundingMode != float_round_to_zero ) {
   4594  1.1      ross             if (   extractFloat128Sign( z )
   4595  1.1      ross                  ^ ( roundingMode == float_round_up ) ) {
                                       /* Set bit 0 when a.low is non-zero before adding
                                        * roundBitsMask. */
   4596  1.1      ross                 z.high |= ( a.low != 0 );
   4597  1.1      ross                 z.high += roundBitsMask;
   4598  1.1      ross             }
   4599  1.1      ross         }
   4600  1.1      ross         z.high &= ~ roundBitsMask;
   4601  1.1      ross     }
                           /* Any difference between the result and the input sets inexact. */
   4602  1.1      ross     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
   4603  1.1      ross         float_set_inexact();
   4604  1.1      ross     }
   4605  1.1      ross     return z;
   4606  1.1      ross 
   4607  1.1      ross }
   4608  1.1      ross 
   4609  1.7   thorpej /*----------------------------------------------------------------------------
   4610  1.7   thorpej | Returns the result of adding the absolute values of the quadruple-precision
   4611  1.7   thorpej | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
   4612  1.7   thorpej | before being returned.  `zSign' is ignored if the result is a NaN.
   4613  1.7   thorpej | The addition is performed according to the IEC/IEEE Standard for Binary
   4614  1.7   thorpej | Floating-Point Arithmetic.
   4615  1.7   thorpej *----------------------------------------------------------------------------*/
   4616  1.7   thorpej 
   4617  1.1      ross static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
   4618  1.1      ross {
   4619  1.1      ross     int32 aExp, bExp, zExp;
   4620  1.1      ross     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
   4621  1.1      ross     int32 expDiff;
   4622  1.1      ross 
                           /* Unpack fractions and exponents; the operands' own sign bits are
                            * not read here, the result sign comes from zSign. */
   4623  1.1      ross     aSig1 = extractFloat128Frac1( a );
   4624  1.1      ross     aSig0 = extractFloat128Frac0( a );
   4625  1.1      ross     aExp = extractFloat128Exp( a );
   4626  1.1      ross     bSig1 = extractFloat128Frac1( b );
   4627  1.1      ross     bSig0 = extractFloat128Frac0( b );
   4628  1.1      ross     bExp = extractFloat128Exp( b );
   4629  1.1      ross     expDiff = aExp - bExp;
   4630  1.1      ross     if ( 0 < expDiff ) {
                               /* `a' has the larger exponent.  All-ones exponent: propagate
                                * a NaN, otherwise return `a' itself. */
   4631  1.1      ross         if ( aExp == 0x7FFF ) {
   4632  1.1      ross             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
   4633  1.1      ross             return a;
   4634  1.1      ross         }
                               /* Zero exponent in `b': shift one place less and do not set
                                * bit 48; otherwise set bit 48 of bSig0. */
   4635  1.1      ross         if ( bExp == 0 ) {
   4636  1.1      ross             --expDiff;
   4637  1.1      ross         }
   4638  1.1      ross         else {
   4639  1.1      ross             bSig0 |= LIT64( 0x0001000000000000 );
   4640  1.1      ross         }
                               /* Align `b' to a's exponent; zSig2 receives the helper's
                                * third output word (presumably the shifted-out bits). */
   4641  1.1      ross         shift128ExtraRightJamming(
   4642  1.1      ross             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
   4643  1.1      ross         zExp = aExp;
   4644  1.1      ross     }
   4645  1.1      ross     else if ( expDiff < 0 ) {
                               /* Mirror case: `b' has the larger exponent.  For an all-ones
                                * exponent with zero fraction the result is built from zSign
                                * rather than returning `b'. */
   4646  1.1      ross         if ( bExp == 0x7FFF ) {
   4647  1.1      ross             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
   4648  1.1      ross             return packFloat128( zSign, 0x7FFF, 0, 0 );
   4649  1.1      ross         }
   4650  1.1      ross         if ( aExp == 0 ) {
   4651  1.1      ross             ++expDiff;
   4652  1.1      ross         }
   4653  1.1      ross         else {
   4654  1.1      ross             aSig0 |= LIT64( 0x0001000000000000 );
   4655  1.1      ross         }
   4656  1.1      ross         shift128ExtraRightJamming(
   4657  1.1      ross             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
   4658  1.1      ross         zExp = bExp;
   4659  1.1      ross     }
   4660  1.1      ross     else {
                               /* Equal exponents. */
   4661  1.1      ross         if ( aExp == 0x7FFF ) {
   4662  1.1      ross             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
   4663  1.1      ross                 return propagateFloat128NaN( a, b );
   4664  1.1      ross             }
   4665  1.1      ross             return a;
   4666  1.1      ross         }
   4667  1.1      ross         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
                               /* Both exponents zero: the raw fraction sum is packed with
                                * exponent 0, with no rounding step. */
   4668  1.1      ross         if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
                               /* Neither fraction had bit 48 set before the add, so bit 49
                                * is ORed into the sum instead, and the common one-bit right
                                * shift below is taken. */
   4669  1.1      ross         zSig2 = 0;
   4670  1.1      ross         zSig0 |= LIT64( 0x0002000000000000 );
   4671  1.1      ross         zExp = aExp;
   4672  1.1      ross         goto shiftRight1;
   4673  1.1      ross     }
                           /* Reached from both unequal-exponent branches.  Bit 48 is ORed
                            * into aSig0 here; when `b' was the larger operand bSig0 never had
                            * it set, so the sum is the same either way. */
   4674  1.1      ross     aSig0 |= LIT64( 0x0001000000000000 );
   4675  1.1      ross     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
                           /* zExp is decremented for the direct roundAndPack path; if the sum
                            * reached bit 49 the decrement is undone and the sum is shifted
                            * right by one instead. */
   4676  1.1      ross     --zExp;
   4677  1.1      ross     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
   4678  1.1      ross     ++zExp;
   4679  1.1      ross  shiftRight1:
   4680  1.1      ross     shift128ExtraRightJamming(
   4681  1.1      ross         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
   4682  1.1      ross  roundAndPack:
   4683  1.1      ross     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
   4684  1.1      ross 
   4685  1.1      ross }
   4686  1.1      ross 
   4687  1.7   thorpej /*----------------------------------------------------------------------------
   4688  1.7   thorpej | Returns the result of subtracting the absolute values of the quadruple-
   4689  1.7   thorpej | precision floating-point values `a' and `b'.  If `zSign' is 1, the
   4690  1.7   thorpej | difference is negated before being returned.  `zSign' is ignored if the
   4691  1.7   thorpej | result is a NaN.  The subtraction is performed according to the IEC/IEEE
   4692  1.7   thorpej | Standard for Binary Floating-Point Arithmetic.
   4693  1.7   thorpej *----------------------------------------------------------------------------*/
   4694  1.7   thorpej 
   4695  1.1      ross static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
   4696  1.1      ross {
   4697  1.1      ross     int32 aExp, bExp, zExp;
   4698  1.1      ross     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
   4699  1.1      ross     int32 expDiff;
   4700  1.1      ross     float128 z;
   4701  1.1      ross 
                           /* Unpack fractions and exponents; the operands' own sign bits are
                            * not read here, the result sign is derived from zSign. */
   4702  1.1      ross     aSig1 = extractFloat128Frac1( a );
   4703  1.1      ross     aSig0 = extractFloat128Frac0( a );
   4704  1.1      ross     aExp = extractFloat128Exp( a );
   4705  1.1      ross     bSig1 = extractFloat128Frac1( b );
   4706  1.1      ross     bSig0 = extractFloat128Frac0( b );
   4707  1.1      ross     bExp = extractFloat128Exp( b );
   4708  1.1      ross     expDiff = aExp - bExp;
                           /* Both fractions are pre-shifted left by 14; the matching
                            * "zExp - 14" adjustment is applied at normalizeRoundAndPack. */
   4709  1.1      ross     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
   4710  1.1      ross     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
   4711  1.1      ross     if ( 0 < expDiff ) goto aExpBigger;
   4712  1.1      ross     if ( expDiff < 0 ) goto bExpBigger;
                           /* Equal exponents from here down to the bExpBigger label. */
   4713  1.1      ross     if ( aExp == 0x7FFF ) {
   4714  1.1      ross         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
   4715  1.1      ross             return propagateFloat128NaN( a, b );
   4716  1.1      ross         }
                               /* Both have an all-ones exponent and zero fraction: raise
                                * invalid and return the default NaN. */
   4717  1.1      ross         float_raise( float_flag_invalid );
   4718  1.1      ross         z.low = float128_default_nan_low;
   4719  1.1      ross         z.high = float128_default_nan_high;
   4720  1.1      ross         return z;
   4721  1.1      ross     }
                           /* Both exponents zero: continue as if both were 1. */
   4722  1.1      ross     if ( aExp == 0 ) {
   4723  1.1      ross         aExp = 1;
   4724  1.1      ross         bExp = 1;
   4725  1.1      ross     }
                           /* Compare the fractions, high word first, to decide which operand
                            * is subtracted from which. */
   4726  1.1      ross     if ( bSig0 < aSig0 ) goto aBigger;
   4727  1.1      ross     if ( aSig0 < bSig0 ) goto bBigger;
   4728  1.1      ross     if ( bSig1 < aSig1 ) goto aBigger;
   4729  1.1      ross     if ( aSig1 < bSig1 ) goto bBigger;
                           /* Identical fractions: return zero, with the sign bit set only
                            * when the rounding mode is float_round_down. */
   4730  1.1      ross     return packFloat128( float_rounding_mode() == float_round_down, 0, 0, 0 );
   4731  1.1      ross  bExpBigger:
   4732  1.1      ross     if ( bExp == 0x7FFF ) {
   4733  1.1      ross         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
                               /* `b' dominates, so the result takes the opposite of zSign. */
   4734  1.1      ross         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
   4735  1.1      ross     }
                           /* Zero exponent in `a': shift one place less and do not set bit 62;
                            * otherwise set bit 62 of aSig0 (bit 48 before the 14-bit shift). */
   4736  1.1      ross     if ( aExp == 0 ) {
   4737  1.1      ross         ++expDiff;
   4738  1.1      ross     }
   4739  1.1      ross     else {
   4740  1.1      ross         aSig0 |= LIT64( 0x4000000000000000 );
   4741  1.1      ross     }
   4742  1.1      ross     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
   4743  1.1      ross     bSig0 |= LIT64( 0x4000000000000000 );
   4744  1.1      ross  bBigger:
                           /* Compute b - a and flip the result sign. */
   4745  1.1      ross     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
   4746  1.1      ross     zExp = bExp;
   4747  1.1      ross     zSign ^= 1;
   4748  1.1      ross     goto normalizeRoundAndPack;
   4749  1.1      ross  aExpBigger:
   4750  1.1      ross     if ( aExp == 0x7FFF ) {
   4751  1.1      ross         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
   4752  1.1      ross         return a;
   4753  1.1      ross     }
                           /* Mirror of the bExpBigger alignment, applied to `b'. */
   4754  1.1      ross     if ( bExp == 0 ) {
   4755  1.1      ross         --expDiff;
   4756  1.1      ross     }
   4757  1.1      ross     else {
   4758  1.1      ross         bSig0 |= LIT64( 0x4000000000000000 );
   4759  1.1      ross     }
   4760  1.1      ross     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
   4761  1.1      ross     aSig0 |= LIT64( 0x4000000000000000 );
   4762  1.1      ross  aBigger:
                           /* Compute a - b; zSign is kept as passed in. */
   4763  1.1      ross     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
   4764  1.1      ross     zExp = aExp;
   4765  1.1      ross  normalizeRoundAndPack:
   4766  1.1      ross     --zExp;
   4767  1.1      ross     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
   4768  1.1      ross 
   4769  1.1      ross }
   4770  1.1      ross 
   4771  1.7   thorpej /*----------------------------------------------------------------------------
   4772  1.7   thorpej | Returns the result of adding the quadruple-precision floating-point values
   4773  1.7   thorpej | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   4774  1.7   thorpej | for Binary Floating-Point Arithmetic.
   4775  1.7   thorpej *----------------------------------------------------------------------------*/
   4776  1.7   thorpej 
   4777  1.1      ross float128 float128_add( float128 a, float128 b )
   4778  1.1      ross {
   4779  1.1      ross     flag aSign, bSign;
   4780  1.1      ross 
                           /* Equal signs: add the magnitudes.  Differing signs: subtract the
                            * magnitudes.  Either way a's sign is passed as zSign. */
   4781  1.1      ross     aSign = extractFloat128Sign( a );
   4782  1.1      ross     bSign = extractFloat128Sign( b );
   4783  1.1      ross     if ( aSign == bSign ) {
   4784  1.1      ross         return addFloat128Sigs( a, b, aSign );
   4785  1.1      ross     }
   4786  1.1      ross     else {
   4787  1.1      ross         return subFloat128Sigs( a, b, aSign );
   4788  1.1      ross     }
   4789  1.1      ross 
   4790  1.1      ross }
   4791  1.1      ross 
   4792  1.7   thorpej /*----------------------------------------------------------------------------
   4793  1.7   thorpej | Returns the result of subtracting the quadruple-precision floating-point
   4794  1.7   thorpej | values `a' and `b'.  The operation is performed according to the IEC/IEEE
   4795  1.7   thorpej | Standard for Binary Floating-Point Arithmetic.
   4796  1.7   thorpej *----------------------------------------------------------------------------*/
   4797  1.7   thorpej 
   4798  1.1      ross float128 float128_sub( float128 a, float128 b )
   4799  1.1      ross {
   4800  1.1      ross     flag aSign, bSign;
   4801  1.1      ross 
                           /* Equal signs: subtract the magnitudes.  Differing signs: add the
                            * magnitudes.  Either way a's sign is passed as zSign. */
   4802  1.1      ross     aSign = extractFloat128Sign( a );
   4803  1.1      ross     bSign = extractFloat128Sign( b );
   4804  1.1      ross     if ( aSign == bSign ) {
   4805  1.1      ross         return subFloat128Sigs( a, b, aSign );
   4806  1.1      ross     }
   4807  1.1      ross     else {
   4808  1.1      ross         return addFloat128Sigs( a, b, aSign );
   4809  1.1      ross     }
   4810  1.1      ross 
   4811  1.1      ross }
   4812  1.1      ross 
   4813  1.7   thorpej /*----------------------------------------------------------------------------
   4814  1.7   thorpej | Returns the result of multiplying the quadruple-precision floating-point
   4815  1.7   thorpej | values `a' and `b'.  The operation is performed according to the IEC/IEEE
   4816  1.7   thorpej | Standard for Binary Floating-Point Arithmetic.
   4817  1.7   thorpej *----------------------------------------------------------------------------*/
   4818  1.7   thorpej 
   4819  1.1      ross float128 float128_mul( float128 a, float128 b )
   4820  1.1      ross {
   4821  1.1      ross     flag aSign, bSign, zSign;
   4822  1.1      ross     int32 aExp, bExp, zExp;
   4823  1.1      ross     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
   4824  1.1      ross     float128 z;
   4825  1.1      ross 
                           /* Unpack both operands; the result sign is the XOR of the signs. */
   4826  1.1      ross     aSig1 = extractFloat128Frac1( a );
   4827  1.1      ross     aSig0 = extractFloat128Frac0( a );
   4828  1.1      ross     aExp = extractFloat128Exp( a );
   4829  1.1      ross     aSign = extractFloat128Sign( a );
   4830  1.1      ross     bSig1 = extractFloat128Frac1( b );
   4831  1.1      ross     bSig0 = extractFloat128Frac0( b );
   4832  1.1      ross     bExp = extractFloat128Exp( b );
   4833  1.1      ross     bSign = extractFloat128Sign( b );
   4834  1.1      ross     zSign = aSign ^ bSign;
                           /* `a' has an all-ones exponent: propagate if either operand is a
                            * NaN; if `b' is entirely zero jump to the shared invalid handler;
                            * otherwise return exponent 0x7FFF, zero fraction, sign zSign. */
   4835  1.1      ross     if ( aExp == 0x7FFF ) {
   4836  1.1      ross         if (    ( aSig0 | aSig1 )
   4837  1.1      ross              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
   4838  1.1      ross             return propagateFloat128NaN( a, b );
   4839  1.1      ross         }
   4840  1.1      ross         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
   4841  1.1      ross         return packFloat128( zSign, 0x7FFF, 0, 0 );
   4842  1.1      ross     }
                           /* Same checks with the roles swapped; this block also hosts the
                            * `invalid' label targeted from the block above. */
   4843  1.1      ross     if ( bExp == 0x7FFF ) {
   4844  1.1      ross         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
   4845  1.1      ross         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
   4846  1.1      ross  invalid:
   4847  1.1      ross             float_raise( float_flag_invalid );
   4848  1.1      ross             z.low = float128_default_nan_low;
   4849  1.1      ross             z.high = float128_default_nan_high;
   4850  1.1      ross             return z;
   4851  1.1      ross         }
   4852  1.1      ross         return packFloat128( zSign, 0x7FFF, 0, 0 );
   4853  1.1      ross     }
                           /* Zero exponent: an all-zero fraction returns a signed zero;
                            * otherwise the helper rewrites the exponent and fraction through
                            * the pointers (presumably normalizing the subnormal -- confirm). */
   4854  1.1      ross     if ( aExp == 0 ) {
   4855  1.1      ross         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
   4856  1.1      ross         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
   4857  1.1      ross     }
   4858  1.1      ross     if ( bExp == 0 ) {
   4859  1.1      ross         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
   4860  1.1      ross         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
   4861  1.1      ross     }
   4862  1.1      ross     zExp = aExp + bExp - 0x4000;
                           /* Bit 48 is set on aSig0 only; b's fraction is shifted left by 16
                            * without that bit.  The add128 after the multiply adds the `a'
                            * significand to the upper product words, presumably to account
                            * for b's omitted leading bit -- confirm. */
   4863  1.1      ross     aSig0 |= LIT64( 0x0001000000000000 );
   4864  1.1      ross     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
   4865  1.1      ross     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
   4866  1.1      ross     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
                           /* zSig3 is not passed on; record "zSig3 non-zero" in bit 0 of zSig2. */
   4867  1.1      ross     zSig2 |= ( zSig3 != 0 );
                           /* If the result reached bit 49, shift right by one and bump zExp. */
   4868  1.1      ross     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
   4869  1.1      ross         shift128ExtraRightJamming(
   4870  1.1      ross             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
   4871  1.1      ross         ++zExp;
   4872  1.1      ross     }
   4873  1.1      ross     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
   4874  1.1      ross 
   4875  1.1      ross }
   4876  1.1      ross 
   4877  1.7   thorpej /*----------------------------------------------------------------------------
   4878  1.7   thorpej | Returns the result of dividing the quadruple-precision floating-point value
   4879  1.7   thorpej | `a' by the corresponding value `b'.  The operation is performed according to
   4880  1.7   thorpej | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4881  1.7   thorpej *----------------------------------------------------------------------------*/
   4882  1.7   thorpej 
   4883  1.1      ross float128 float128_div( float128 a, float128 b )
   4884  1.1      ross {
   4885  1.1      ross     flag aSign, bSign, zSign;
   4886  1.1      ross     int32 aExp, bExp, zExp;
   4887  1.1      ross     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
   4888  1.1      ross     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
   4889  1.1      ross     float128 z;
   4890  1.1      ross 
                           /* Unpack both operands; the result sign is the XOR of the signs. */
   4891  1.1      ross     aSig1 = extractFloat128Frac1( a );
   4892  1.1      ross     aSig0 = extractFloat128Frac0( a );
   4893  1.1      ross     aExp = extractFloat128Exp( a );
   4894  1.1      ross     aSign = extractFloat128Sign( a );
   4895  1.1      ross     bSig1 = extractFloat128Frac1( b );
   4896  1.1      ross     bSig0 = extractFloat128Frac0( b );
   4897  1.1      ross     bExp = extractFloat128Exp( b );
   4898  1.1      ross     bSign = extractFloat128Sign( b );
   4899  1.1      ross     zSign = aSign ^ bSign;
                           /* `a' has an all-ones exponent: propagate NaNs; if `b' also has an
                            * all-ones exponent with zero fraction go to the invalid handler;
                            * otherwise return exponent 0x7FFF, zero fraction, sign zSign. */
   4900  1.1      ross     if ( aExp == 0x7FFF ) {
   4901  1.1      ross         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
   4902  1.1      ross         if ( bExp == 0x7FFF ) {
   4903  1.1      ross             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
   4904  1.1      ross             goto invalid;
   4905  1.1      ross         }
   4906  1.1      ross         return packFloat128( zSign, 0x7FFF, 0, 0 );
   4907  1.1      ross     }
                           /* Only `b' has an all-ones exponent: NaN propagates, otherwise the
                            * result is a signed zero. */
   4908  1.1      ross     if ( bExp == 0x7FFF ) {
   4909  1.1      ross         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
   4910  1.1      ross         return packFloat128( zSign, 0, 0, 0 );
   4911  1.1      ross     }
   4912  1.1      ross     if ( bExp == 0 ) {
   4913  1.1      ross         if ( ( bSig0 | bSig1 ) == 0 ) {
                                   /* Divisor is zero.  If the dividend is also entirely zero
                                    * raise invalid and return the default NaN; otherwise
                                    * raise divbyzero and return exponent 0x7FFF with a zero
                                    * fraction and sign zSign. */
   4914  1.1      ross             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
   4915  1.1      ross  invalid:
   4916  1.1      ross                 float_raise( float_flag_invalid );
   4917  1.1      ross                 z.low = float128_default_nan_low;
   4918  1.1      ross                 z.high = float128_default_nan_high;
   4919  1.1      ross                 return z;
   4920  1.1      ross             }
   4921  1.1      ross             float_raise( float_flag_divbyzero );
   4922  1.1      ross             return packFloat128( zSign, 0x7FFF, 0, 0 );
   4923  1.1      ross         }
   4924  1.1      ross         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
   4925  1.1      ross     }
   4926  1.1      ross     if ( aExp == 0 ) {
   4927  1.1      ross         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
   4928  1.1      ross         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
   4929  1.1      ross     }
   4930  1.1      ross     zExp = aExp - bExp + 0x3FFD;
                           /* Set bit 48 on both significands and shift them left by 15, which
                            * moves that bit up to bit 63 of the high word. */
   4931  1.1      ross     shortShift128Left(
   4932  1.1      ross         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
   4933  1.1      ross     shortShift128Left(
   4934  1.1      ross         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
                           /* If b <= a, halve `a' and bump zExp, so that `a' is below `b'
                            * before the quotient digits are estimated. */
   4935  1.1      ross     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
   4936  1.1      ross         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
   4937  1.1      ross         ++zExp;
   4938  1.1      ross     }
                           /* First quotient word: take the estimate, form the remainder
                            * a - b * zSig0, and while that remainder is negative decrement
                            * zSig0 and add `b' back. */
   4939  1.1      ross     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
   4940  1.1      ross     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
   4941  1.1      ross     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
   4942  1.1      ross     while ( (sbits64) rem0 < 0 ) {
   4943  1.1      ross         --zSig0;
   4944  1.1      ross         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
   4945  1.1      ross     }
                           /* Second quotient word, estimated from the remainder.  The exact
                            * correction and the "remainder non-zero" bit are applied only
                            * when the low 14 bits of the estimate are <= 4; presumably the
                            * estimate's error cannot affect rounding otherwise -- confirm
                            * against estimateDiv128To64's error bound. */
   4946  1.1      ross     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
   4947  1.1      ross     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
   4948  1.1      ross         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
   4949  1.1      ross         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
   4950  1.1      ross         while ( (sbits64) rem1 < 0 ) {
   4951  1.1      ross             --zSig1;
   4952  1.1      ross             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
   4953  1.1      ross         }
   4954  1.1      ross         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
   4955  1.1      ross     }
                           /* Shift the 128-bit quotient right by 15; zSig2 receives the
                            * helper's third output word. */
   4956  1.1      ross     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
   4957  1.1      ross     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
   4958  1.1      ross 
   4959  1.1      ross }
   4960  1.1      ross 
   4961  1.7   thorpej /*----------------------------------------------------------------------------
   4962  1.7   thorpej | Returns the remainder of the quadruple-precision floating-point value `a'
   4963  1.7   thorpej | with respect to the corresponding value `b'.  The operation is performed
   4964  1.7   thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   4965  1.7   thorpej *----------------------------------------------------------------------------*/
   4966  1.7   thorpej 
   4967  1.1      ross float128 float128_rem( float128 a, float128 b )
   4968  1.1      ross {
   4969  1.1      ross     flag aSign, bSign, zSign;
   4970  1.1      ross     int32 aExp, bExp, expDiff;
   4971  1.1      ross     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
   4972  1.1      ross     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
   4973  1.1      ross     sbits64 sigMean0;
   4974  1.1      ross     float128 z;
   4975  1.1      ross 
   4976  1.1      ross     aSig1 = extractFloat128Frac1( a );
   4977  1.1      ross     aSig0 = extractFloat128Frac0( a );
   4978  1.1      ross     aExp = extractFloat128Exp( a );
   4979  1.1      ross     aSign = extractFloat128Sign( a );
   4980  1.1      ross     bSig1 = extractFloat128Frac1( b );
   4981  1.1      ross     bSig0 = extractFloat128Frac0( b );
   4982  1.1      ross     bExp = extractFloat128Exp( b );
   4983  1.1      ross     bSign = extractFloat128Sign( b );
   4984  1.1      ross     if ( aExp == 0x7FFF ) {
   4985  1.1      ross         if (    ( aSig0 | aSig1 )
   4986  1.1      ross              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
   4987  1.1      ross             return propagateFloat128NaN( a, b );
   4988  1.1      ross         }
   4989  1.1      ross         goto invalid;
   4990  1.1      ross     }
   4991  1.1      ross     if ( bExp == 0x7FFF ) {
   4992  1.1      ross         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
   4993  1.1      ross         return a;
   4994  1.1      ross     }
   4995  1.1      ross     if ( bExp == 0 ) {
   4996  1.1      ross         if ( ( bSig0 | bSig1 ) == 0 ) {
   4997  1.1      ross  invalid:
   4998  1.1      ross             float_raise( float_flag_invalid );
   4999  1.1      ross             z.low = float128_default_nan_low;
   5000  1.1      ross             z.high = float128_default_nan_high;
   5001  1.1      ross             return z;
   5002  1.1      ross         }
   5003  1.1      ross         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
   5004  1.1      ross     }
   5005  1.1      ross     if ( aExp == 0 ) {
   5006  1.1      ross         if ( ( aSig0 | aSig1 ) == 0 ) return a;
   5007  1.1      ross         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
   5008  1.1      ross     }
   5009  1.1      ross     expDiff = aExp - bExp;
   5010  1.1      ross     if ( expDiff < -1 ) return a;
   5011  1.1      ross     shortShift128Left(
   5012  1.1      ross         aSig0 | LIT64( 0x0001000000000000 ),
   5013  1.1      ross         aSig1,
   5014  1.1      ross         15 - ( expDiff < 0 ),
   5015  1.1      ross         &aSig0,
   5016  1.1      ross         &aSig1
   5017  1.1      ross     );
   5018  1.1      ross     shortShift128Left(
   5019  1.1      ross         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
   5020  1.1      ross     q = le128( bSig0, bSig1, aSig0, aSig1 );
   5021  1.1      ross     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
   5022  1.1      ross     expDiff -= 64;
   5023  1.1      ross     while ( 0 < expDiff ) {
   5024  1.1      ross         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
   5025  1.1      ross         q = ( 4 < q ) ? q - 4 : 0;
   5026  1.1      ross         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
   5027  1.1      ross         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
   5028  1.1      ross         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
   5029  1.1      ross         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
   5030  1.1      ross         expDiff -= 61;
   5031  1.1      ross     }
   5032  1.1      ross     if ( -64 < expDiff ) {
   5033  1.1      ross         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
   5034  1.1      ross         q = ( 4 < q ) ? q - 4 : 0;
   5035  1.1      ross         q >>= - expDiff;
   5036  1.1      ross         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
   5037  1.1      ross         expDiff += 52;
   5038  1.1      ross         if ( expDiff < 0 ) {
   5039  1.1      ross             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
   5040  1.1      ross         }
   5041  1.1      ross         else {
   5042  1.1      ross             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
   5043  1.1      ross         }
   5044  1.1      ross         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
   5045  1.1      ross         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
   5046  1.1      ross     }
   5047  1.1      ross     else {
   5048  1.1      ross         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
   5049  1.1      ross         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
   5050  1.1      ross     }
   5051  1.1      ross     do {
   5052  1.1      ross         alternateASig0 = aSig0;
   5053  1.1      ross         alternateASig1 = aSig1;
   5054  1.1      ross         ++q;
   5055  1.1      ross         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
   5056  1.1      ross     } while ( 0 <= (sbits64) aSig0 );
   5057  1.1      ross     add128(
   5058  1.1      ross         aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
   5059  1.1      ross     if (    ( sigMean0 < 0 )
   5060  1.1      ross          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
   5061  1.1      ross         aSig0 = alternateASig0;
   5062  1.1      ross         aSig1 = alternateASig1;
   5063  1.1      ross     }
   5064  1.1      ross     zSign = ( (sbits64) aSig0 < 0 );
   5065  1.1      ross     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
   5066  1.1      ross     return
   5067  1.1      ross         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
   5068  1.1      ross 
   5069  1.1      ross }
   5070  1.1      ross 
   5071  1.7   thorpej /*----------------------------------------------------------------------------
   5072  1.7   thorpej | Returns the square root of the quadruple-precision floating-point value `a'.
   5073  1.7   thorpej | The operation is performed according to the IEC/IEEE Standard for Binary
   5074  1.7   thorpej | Floating-Point Arithmetic.
   5075  1.7   thorpej *----------------------------------------------------------------------------*/
   5076  1.7   thorpej 
   5077  1.1      ross float128 float128_sqrt( float128 a )
   5078  1.1      ross {
   5079  1.1      ross     flag aSign;
   5080  1.1      ross     int32 aExp, zExp;
   5081  1.1      ross     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
   5082  1.1      ross     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
   5083  1.1      ross     float128 z;
   5084  1.1      ross 
   5085  1.1      ross     aSig1 = extractFloat128Frac1( a );
   5086  1.1      ross     aSig0 = extractFloat128Frac0( a );
   5087  1.1      ross     aExp = extractFloat128Exp( a );
   5088  1.1      ross     aSign = extractFloat128Sign( a );
   5089  1.1      ross     if ( aExp == 0x7FFF ) {
   5090  1.1      ross         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
   5091  1.1      ross         if ( ! aSign ) return a;
   5092  1.1      ross         goto invalid;
   5093  1.1      ross     }
   5094  1.1      ross     if ( aSign ) {
   5095  1.1      ross         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
   5096  1.1      ross  invalid:
   5097  1.1      ross         float_raise( float_flag_invalid );
   5098  1.1      ross         z.low = float128_default_nan_low;
   5099  1.1      ross         z.high = float128_default_nan_high;
   5100  1.1      ross         return z;
   5101  1.1      ross     }
   5102  1.1      ross     if ( aExp == 0 ) {
   5103  1.1      ross         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
   5104  1.1      ross         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
   5105  1.1      ross     }
   5106  1.1      ross     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
   5107  1.1      ross     aSig0 |= LIT64( 0x0001000000000000 );
   5108  1.1      ross     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
   5109  1.1      ross     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
   5110  1.1      ross     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
   5111  1.1      ross     doubleZSig0 = zSig0<<1;
   5112  1.1      ross     mul64To128( zSig0, zSig0, &term0, &term1 );
   5113  1.1      ross     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
   5114  1.1      ross     while ( (sbits64) rem0 < 0 ) {
   5115  1.1      ross         --zSig0;
   5116  1.1      ross         doubleZSig0 -= 2;
   5117  1.1      ross         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
   5118  1.1      ross     }
   5119  1.1      ross     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
   5120  1.1      ross     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
   5121  1.1      ross         if ( zSig1 == 0 ) zSig1 = 1;
   5122  1.1      ross         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
   5123  1.1      ross         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
   5124  1.1      ross         mul64To128( zSig1, zSig1, &term2, &term3 );
   5125  1.1      ross         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
   5126  1.1      ross         while ( (sbits64) rem1 < 0 ) {
   5127  1.1      ross             --zSig1;
   5128  1.1      ross             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
   5129  1.1      ross             term3 |= 1;
   5130  1.1      ross             term2 |= doubleZSig0;
   5131  1.1      ross             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
   5132  1.1      ross         }
   5133  1.1      ross         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
   5134  1.1      ross     }
   5135  1.1      ross     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
   5136  1.1      ross     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
   5137  1.1      ross 
   5138  1.1      ross }
   5139  1.1      ross 
   5140  1.7   thorpej /*----------------------------------------------------------------------------
   5141  1.7   thorpej | Returns 1 if the quadruple-precision floating-point value `a' is equal to
   5142  1.7   thorpej | the corresponding value `b', and 0 otherwise.  The comparison is performed
   5143  1.7   thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5144  1.7   thorpej *----------------------------------------------------------------------------*/
   5145  1.7   thorpej 
   5146  1.1      ross flag float128_eq( float128 a, float128 b )
   5147  1.1      ross {
   5148  1.1      ross 
   5149  1.1      ross     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5150  1.1      ross               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5151  1.1      ross          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5152  1.1      ross               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5153  1.1      ross        ) {
   5154  1.1      ross         if (    float128_is_signaling_nan( a )
   5155  1.1      ross              || float128_is_signaling_nan( b ) ) {
   5156  1.1      ross             float_raise( float_flag_invalid );
   5157  1.1      ross         }
   5158  1.1      ross         return 0;
   5159  1.1      ross     }
   5160  1.1      ross     return
   5161  1.1      ross            ( a.low == b.low )
   5162  1.1      ross         && (    ( a.high == b.high )
   5163  1.1      ross              || (    ( a.low == 0 )
   5164  1.1      ross                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
   5165  1.1      ross            );
   5166  1.1      ross 
   5167  1.1      ross }
   5168  1.1      ross 
   5169  1.7   thorpej /*----------------------------------------------------------------------------
   5170  1.7   thorpej | Returns 1 if the quadruple-precision floating-point value `a' is less than
   5171  1.7   thorpej | or equal to the corresponding value `b', and 0 otherwise.  The comparison
   5172  1.7   thorpej | is performed according to the IEC/IEEE Standard for Binary Floating-Point
   5173  1.7   thorpej | Arithmetic.
   5174  1.7   thorpej *----------------------------------------------------------------------------*/
   5175  1.7   thorpej 
   5176  1.1      ross flag float128_le( float128 a, float128 b )
   5177  1.1      ross {
   5178  1.1      ross     flag aSign, bSign;
   5179  1.1      ross 
   5180  1.1      ross     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5181  1.1      ross               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5182  1.1      ross          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5183  1.1      ross               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5184  1.1      ross        ) {
   5185  1.1      ross         float_raise( float_flag_invalid );
   5186  1.1      ross         return 0;
   5187  1.1      ross     }
   5188  1.1      ross     aSign = extractFloat128Sign( a );
   5189  1.1      ross     bSign = extractFloat128Sign( b );
   5190  1.1      ross     if ( aSign != bSign ) {
   5191  1.1      ross         return
   5192  1.1      ross                aSign
   5193  1.1      ross             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   5194  1.1      ross                  == 0 );
   5195  1.1      ross     }
   5196  1.1      ross     return
   5197  1.1      ross           aSign ? le128( b.high, b.low, a.high, a.low )
   5198  1.1      ross         : le128( a.high, a.low, b.high, b.low );
   5199  1.1      ross 
   5200  1.1      ross }
   5201  1.1      ross 
   5202  1.7   thorpej /*----------------------------------------------------------------------------
   5203  1.7   thorpej | Returns 1 if the quadruple-precision floating-point value `a' is less than
   5204  1.7   thorpej | the corresponding value `b', and 0 otherwise.  The comparison is performed
   5205  1.7   thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5206  1.7   thorpej *----------------------------------------------------------------------------*/
   5207  1.7   thorpej 
   5208  1.1      ross flag float128_lt( float128 a, float128 b )
   5209  1.1      ross {
   5210  1.1      ross     flag aSign, bSign;
   5211  1.1      ross 
   5212  1.1      ross     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5213  1.1      ross               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5214  1.1      ross          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5215  1.1      ross               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5216  1.1      ross        ) {
   5217  1.1      ross         float_raise( float_flag_invalid );
   5218  1.1      ross         return 0;
   5219  1.1      ross     }
   5220  1.1      ross     aSign = extractFloat128Sign( a );
   5221  1.1      ross     bSign = extractFloat128Sign( b );
   5222  1.1      ross     if ( aSign != bSign ) {
   5223  1.1      ross         return
   5224  1.1      ross                aSign
   5225  1.1      ross             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   5226  1.1      ross                  != 0 );
   5227  1.1      ross     }
   5228  1.1      ross     return
   5229  1.1      ross           aSign ? lt128( b.high, b.low, a.high, a.low )
   5230  1.1      ross         : lt128( a.high, a.low, b.high, b.low );
   5231  1.1      ross 
   5232  1.1      ross }
   5233  1.1      ross 
   5234  1.7   thorpej /*----------------------------------------------------------------------------
   5235  1.7   thorpej | Returns 1 if the quadruple-precision floating-point value `a' is equal to
   5236  1.7   thorpej | the corresponding value `b', and 0 otherwise.  The invalid exception is
   5237  1.7   thorpej | raised if either operand is a NaN.  Otherwise, the comparison is performed
   5238  1.7   thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5239  1.7   thorpej *----------------------------------------------------------------------------*/
   5240  1.7   thorpej 
   5241  1.1      ross flag float128_eq_signaling( float128 a, float128 b )
   5242  1.1      ross {
   5243  1.1      ross 
   5244  1.1      ross     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5245  1.1      ross               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5246  1.1      ross          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5247  1.1      ross               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5248  1.1      ross        ) {
   5249  1.1      ross         float_raise( float_flag_invalid );
   5250  1.1      ross         return 0;
   5251  1.1      ross     }
   5252  1.1      ross     return
   5253  1.1      ross            ( a.low == b.low )
   5254  1.1      ross         && (    ( a.high == b.high )
   5255  1.1      ross              || (    ( a.low == 0 )
   5256  1.1      ross                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
   5257  1.1      ross            );
   5258  1.1      ross 
   5259  1.1      ross }
   5260  1.1      ross 
   5261  1.7   thorpej /*----------------------------------------------------------------------------
   5262  1.7   thorpej | Returns 1 if the quadruple-precision floating-point value `a' is less than
   5263  1.7   thorpej | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
   5264  1.7   thorpej | cause an exception.  Otherwise, the comparison is performed according to the
   5265  1.7   thorpej | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
   5266  1.7   thorpej *----------------------------------------------------------------------------*/
   5267  1.7   thorpej 
   5268  1.1      ross flag float128_le_quiet( float128 a, float128 b )
   5269  1.1      ross {
   5270  1.1      ross     flag aSign, bSign;
   5271  1.1      ross 
   5272  1.1      ross     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5273  1.1      ross               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5274  1.1      ross          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5275  1.1      ross               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5276  1.1      ross        ) {
   5277  1.1      ross         if (    float128_is_signaling_nan( a )
   5278  1.1      ross              || float128_is_signaling_nan( b ) ) {
   5279  1.1      ross             float_raise( float_flag_invalid );
   5280  1.1      ross         }
   5281  1.1      ross         return 0;
   5282  1.1      ross     }
   5283  1.1      ross     aSign = extractFloat128Sign( a );
   5284  1.1      ross     bSign = extractFloat128Sign( b );
   5285  1.1      ross     if ( aSign != bSign ) {
   5286  1.1      ross         return
   5287  1.1      ross                aSign
   5288  1.1      ross             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   5289  1.1      ross                  == 0 );
   5290  1.1      ross     }
   5291  1.1      ross     return
   5292  1.1      ross           aSign ? le128( b.high, b.low, a.high, a.low )
   5293  1.1      ross         : le128( a.high, a.low, b.high, b.low );
   5294  1.1      ross 
   5295  1.1      ross }
   5296  1.1      ross 
   5297  1.7   thorpej /*----------------------------------------------------------------------------
   5298  1.7   thorpej | Returns 1 if the quadruple-precision floating-point value `a' is less than
   5299  1.7   thorpej | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
   5300  1.7   thorpej | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
   5301  1.7   thorpej | Standard for Binary Floating-Point Arithmetic.
   5302  1.7   thorpej *----------------------------------------------------------------------------*/
   5303  1.7   thorpej 
   5304  1.1      ross flag float128_lt_quiet( float128 a, float128 b )
   5305  1.1      ross {
   5306  1.1      ross     flag aSign, bSign;
   5307  1.1      ross 
   5308  1.1      ross     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
   5309  1.1      ross               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
   5310  1.1      ross          || (    ( extractFloat128Exp( b ) == 0x7FFF )
   5311  1.1      ross               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
   5312  1.1      ross        ) {
   5313  1.1      ross         if (    float128_is_signaling_nan( a )
   5314  1.1      ross              || float128_is_signaling_nan( b ) ) {
   5315  1.1      ross             float_raise( float_flag_invalid );
   5316  1.1      ross         }
   5317  1.1      ross         return 0;
   5318  1.1      ross     }
   5319  1.1      ross     aSign = extractFloat128Sign( a );
   5320  1.1      ross     bSign = extractFloat128Sign( b );
   5321  1.1      ross     if ( aSign != bSign ) {
   5322  1.1      ross         return
   5323  1.1      ross                aSign
   5324  1.1      ross             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   5325  1.1      ross                  != 0 );
   5326  1.1      ross     }
   5327  1.1      ross     return
   5328  1.1      ross           aSign ? lt128( b.high, b.low, a.high, a.low )
   5329  1.1      ross         : lt128( a.high, a.low, b.high, b.low );
   5330  1.1      ross 
   5331  1.1      ross }
   5332  1.1      ross 
   5333  1.1      ross #endif
   5334  1.1      ross 
   5335  1.1      ross 
   5336  1.1      ross #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
   5337  1.1      ross 
   5338  1.1      ross /*
   5339  1.1      ross  * These two routines are not part of the original softfloat distribution.
   5340  1.1      ross  *
   5341  1.1      ross  * They are based on the corresponding conversions to integer but return
   5342  1.1      ross  * unsigned numbers instead since these functions are required by GCC.
   5343  1.1      ross  *
   5344  1.3    keihan  * Added by Mark Brinicombe <mark (at) NetBSD.org>	27/09/97
   5345  1.1      ross  *
   5346  1.1      ross  * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
   5347  1.1      ross  */
   5348  1.1      ross 
   5349  1.7   thorpej /*----------------------------------------------------------------------------
   5350  1.7   thorpej | Returns the result of converting the double-precision floating-point value
   5351  1.7   thorpej | `a' to the 32-bit unsigned integer format.  The conversion is
   5352  1.7   thorpej | performed according to the IEC/IEEE Standard for Binary Floating-point
   5353  1.7   thorpej | Arithmetic, except that the conversion is always rounded toward zero.  If
   5354  1.7   thorpej | `a' is a NaN, the largest positive integer is returned.  If the conversion
   5355  1.7   thorpej | overflows, the largest integer positive is returned.
   5356  1.7   thorpej *----------------------------------------------------------------------------*/
   5357  1.7   thorpej 
   5358  1.1      ross uint32 float64_to_uint32_round_to_zero( float64 a )
   5359  1.1      ross {
   5360  1.1      ross     flag aSign;
   5361  1.1      ross     int16 aExp, shiftCount;
   5362  1.1      ross     bits64 aSig, savedASig;
   5363  1.1      ross     uint32 z;
   5364  1.1      ross 
   5365  1.1      ross     aSig = extractFloat64Frac( a );
   5366  1.1      ross     aExp = extractFloat64Exp( a );
   5367  1.1      ross     aSign = extractFloat64Sign( a );
   5368  1.1      ross 
   5369  1.1      ross     if (aSign) {
   5370  1.1      ross         float_raise( float_flag_invalid );
   5371  1.1      ross     	return(0);
   5372  1.1      ross     }
   5373  1.1      ross 
   5374  1.1      ross     if ( 0x41E < aExp ) {
   5375  1.1      ross         float_raise( float_flag_invalid );
   5376  1.1      ross         return 0xffffffff;
   5377  1.1      ross     }
   5378  1.1      ross     else if ( aExp < 0x3FF ) {
   5379  1.1      ross         if ( aExp || aSig ) float_set_inexact();
   5380  1.1      ross         return 0;
   5381  1.1      ross     }
   5382  1.1      ross     aSig |= LIT64( 0x0010000000000000 );
   5383  1.1      ross     shiftCount = 0x433 - aExp;
   5384  1.1      ross     savedASig = aSig;
   5385  1.1      ross     aSig >>= shiftCount;
   5386  1.1      ross     z = aSig;
   5387  1.1      ross     if ( ( aSig<<shiftCount ) != savedASig ) {
   5388  1.1      ross         float_set_inexact();
   5389  1.1      ross     }
   5390  1.1      ross     return z;
   5391  1.1      ross 
   5392  1.1      ross }
   5393  1.1      ross 
   5394  1.7   thorpej /*----------------------------------------------------------------------------
   5395  1.7   thorpej | Returns the result of converting the single-precision floating-point value
   5396  1.7   thorpej | `a' to the 32-bit unsigned integer format.  The conversion is
   5397  1.7   thorpej | performed according to the IEC/IEEE Standard for Binary Floating-point
   5398  1.7   thorpej | Arithmetic, except that the conversion is always rounded toward zero.  If
   5399  1.7   thorpej | `a' is a NaN, the largest positive integer is returned.  If the conversion
   5400  1.7   thorpej | overflows, the largest positive integer is returned.
   5401  1.7   thorpej *----------------------------------------------------------------------------*/
   5402  1.7   thorpej 
   5403  1.1      ross uint32 float32_to_uint32_round_to_zero( float32 a )
   5404  1.1      ross {
   5405  1.1      ross     flag aSign;
   5406  1.1      ross     int16 aExp, shiftCount;
   5407  1.1      ross     bits32 aSig;
   5408  1.1      ross     uint32 z;
   5409  1.1      ross 
   5410  1.1      ross     aSig = extractFloat32Frac( a );
   5411  1.1      ross     aExp = extractFloat32Exp( a );
   5412  1.1      ross     aSign = extractFloat32Sign( a );
   5413  1.1      ross     shiftCount = aExp - 0x9E;
   5414  1.1      ross 
   5415  1.1      ross     if (aSign) {
   5416  1.1      ross         float_raise( float_flag_invalid );
   5417  1.1      ross     	return(0);
   5418  1.1      ross     }
   5419  1.1      ross     if ( 0 < shiftCount ) {
   5420  1.1      ross         float_raise( float_flag_invalid );
   5421  1.1      ross         return 0xFFFFFFFF;
   5422  1.1      ross     }
   5423  1.1      ross     else if ( aExp <= 0x7E ) {
   5424  1.1      ross         if ( aExp | aSig ) float_set_inexact();
   5425  1.1      ross         return 0;
   5426  1.1      ross     }
   5427  1.1      ross     aSig = ( aSig | 0x800000 )<<8;
   5428  1.1      ross     z = aSig>>( - shiftCount );
   5429  1.1      ross     if ( aSig<<( shiftCount & 31 ) ) {
   5430  1.1      ross         float_set_inexact();
   5431  1.1      ross     }
   5432  1.1      ross     return z;
   5433  1.1      ross 
   5434  1.1      ross }
   5435  1.1      ross 
   5436  1.1      ross #endif
   5437  1.2   thorpej 
   5438  1.2   thorpej #endif /* _STANDALONE */
   5439