1 /* $NetBSD: softfloat.c,v 1.18 2025/09/29 02:50:20 nat Exp $ */ 2 3 /* 4 * This version hacked for use with gcc -msoft-float by bjh21. 5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides 6 * itself). 7 */ 8 9 /* 10 * Things you may want to define: 11 * 12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with 13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them 14 * properly renamed. 15 */ 16 17 /* 18 =============================================================================== 19 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 #include <sys/cdefs.h> 48 #if defined(LIBC_SCCS) && !defined(lint) 49 __RCSID("$NetBSD: softfloat.c,v 1.18 2025/09/29 02:50:20 nat Exp $"); 50 #endif /* LIBC_SCCS and not lint */ 51 52 #ifdef SOFTFLOAT_FOR_GCC 53 #include "softfloat-for-gcc.h" 54 #endif 55 56 #include "milieu.h" 57 #include "softfloat.h" 58 59 /* 60 * Conversions between floats as stored in memory and floats as 61 * SoftFloat uses them 62 */ 63 #ifndef FLOAT64_DEMANGLE 64 #define FLOAT64_DEMANGLE(a) (a) 65 #endif 66 #ifndef FLOAT64_MANGLE 67 #define FLOAT64_MANGLE(a) (a) 68 #endif 69 70 #ifndef X80SHIFT 71 #define X80SHIFT 0 72 #endif 73 /* 74 ------------------------------------------------------------------------------- 75 Floating-point rounding mode, extended double-precision rounding precision, 76 and exception flags. 77 ------------------------------------------------------------------------------- 78 */ 79 #ifndef set_float_rounding_mode 80 fp_rnd float_rounding_mode = float_round_nearest_even; 81 fp_except float_exception_flags = 0; 82 #endif 83 #ifndef set_float_exception_inexact_flag 84 #define set_float_exception_inexact_flag() \ 85 ((void)(float_exception_flags |= float_flag_inexact)) 86 #endif 87 #ifdef FLOATX80 88 int8 floatx80_rounding_precision = 80; 89 #endif 90 91 /* 92 ------------------------------------------------------------------------------- 93 Primitive arithmetic functions, including multi-word arithmetic, and 94 division and square root approximations. (Can be specialized to target if 95 desired.) 96 ------------------------------------------------------------------------------- 97 */ 98 #include "softfloat-macros" 99 100 /* 101 ------------------------------------------------------------------------------- 102 Functions and definitions to determine: (1) whether tininess for underflow 103 is detected before or after rounding by default, (2) what (if anything) 104 happens when exceptions are raised, (3) how signaling NaNs are distinguished 105 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 106 are propagated from function inputs to output. These details are target- 107 specific. 108 ------------------------------------------------------------------------------- 109 */ 110 #include "softfloat-specialize" 111 112 #if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128) 113 /* 114 ------------------------------------------------------------------------------- 115 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 116 and 7, and returns the properly rounded 32-bit integer corresponding to the 117 input. If `zSign' is 1, the input is negated before being converted to an 118 integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 119 is simply rounded to an integer, with the inexact exception raised if the 120 input cannot be represented exactly as an integer. However, if the fixed- 121 point input is too large, the invalid exception is raised and the largest 122 positive or negative integer is returned. 123 ------------------------------------------------------------------------------- 124 */ 125 static int32 roundAndPackInt32( flag zSign, bits64 absZ ) 126 { 127 int8 roundingMode; 128 flag roundNearestEven; 129 int8 roundIncrement, roundBits; 130 int32 z; 131 132 roundingMode = float_rounding_mode; 133 roundNearestEven = ( roundingMode == float_round_nearest_even ); 134 roundIncrement = 0x40; 135 if ( ! roundNearestEven ) { 136 if ( roundingMode == float_round_to_zero ) { 137 roundIncrement = 0; 138 } 139 else { 140 roundIncrement = 0x7F; 141 if ( zSign ) { 142 if ( roundingMode == float_round_up ) roundIncrement = 0; 143 } 144 else { 145 if ( roundingMode == float_round_down ) roundIncrement = 0; 146 } 147 } 148 } 149 roundBits = (int8)(absZ & 0x7F); 150 absZ = ( absZ + roundIncrement )>>7; 151 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 152 z = (int32)absZ; 153 if ( zSign ) z = - z; 154 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 155 float_raise( float_flag_invalid ); 156 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 157 } 158 if ( roundBits ) set_float_exception_inexact_flag(); 159 return z; 160 161 } 162 163 /* 164 ------------------------------------------------------------------------------- 165 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 166 `absZ1', with binary point between bits 63 and 64 (between the input words), 167 and returns the properly rounded 64-bit integer corresponding to the input. 168 If `zSign' is 1, the input is negated before being converted to an integer. 169 Ordinarily, the fixed-point input is simply rounded to an integer, with 170 the inexact exception raised if the input cannot be represented exactly as 171 an integer. However, if the fixed-point input is too large, the invalid 172 exception is raised and the largest positive or negative integer is 173 returned. 174 ------------------------------------------------------------------------------- 175 */ 176 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 ) 177 { 178 int8 roundingMode; 179 flag roundNearestEven, increment; 180 int64 z; 181 182 roundingMode = float_rounding_mode; 183 roundNearestEven = ( roundingMode == float_round_nearest_even ); 184 increment = ( (sbits64) absZ1 < 0 ); 185 if ( ! roundNearestEven ) { 186 if ( roundingMode == float_round_to_zero ) { 187 increment = 0; 188 } 189 else { 190 if ( zSign ) { 191 increment = ( roundingMode == float_round_down ) && absZ1; 192 } 193 else { 194 increment = ( roundingMode == float_round_up ) && absZ1; 195 } 196 } 197 } 198 if ( increment ) { 199 ++absZ0; 200 if ( absZ0 == 0 ) goto overflow; 201 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 202 } 203 z = absZ0; 204 if ( zSign ) z = - z; 205 if ( z && ( ( z < 0 ) ^ zSign ) ) { 206 overflow: 207 float_raise( float_flag_invalid ); 208 return 209 zSign ? (sbits64) LIT64( 0x8000000000000000 ) 210 : LIT64( 0x7FFFFFFFFFFFFFFF ); 211 } 212 if ( absZ1 ) set_float_exception_inexact_flag(); 213 return z; 214 215 } 216 #endif 217 218 /* 219 ------------------------------------------------------------------------------- 220 Returns the fraction bits of the single-precision floating-point value `a'. 221 ------------------------------------------------------------------------------- 222 */ 223 INLINE bits32 extractFloat32Frac( float32 a ) 224 { 225 226 return a & 0x007FFFFF; 227 228 } 229 230 /* 231 ------------------------------------------------------------------------------- 232 Returns the exponent bits of the single-precision floating-point value `a'. 233 ------------------------------------------------------------------------------- 234 */ 235 INLINE int16 extractFloat32Exp( float32 a ) 236 { 237 238 return ( a>>23 ) & 0xFF; 239 240 } 241 242 /* 243 ------------------------------------------------------------------------------- 244 Returns the sign bit of the single-precision floating-point value `a'. 245 ------------------------------------------------------------------------------- 246 */ 247 INLINE flag extractFloat32Sign( float32 a ) 248 { 249 250 return a>>31; 251 252 } 253 254 /* 255 ------------------------------------------------------------------------------- 256 Normalizes the subnormal single-precision floating-point value represented 257 by the denormalized significand `aSig'. The normalized exponent and 258 significand are stored at the locations pointed to by `zExpPtr' and 259 `zSigPtr', respectively. 260 ------------------------------------------------------------------------------- 261 */ 262 static void 263 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr ) 264 { 265 int8 shiftCount; 266 267 shiftCount = countLeadingZeros32( aSig ) - 8; 268 *zSigPtr = aSig<<shiftCount; 269 *zExpPtr = 1 - shiftCount; 270 271 } 272 273 /* 274 ------------------------------------------------------------------------------- 275 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 276 single-precision floating-point value, returning the result. After being 277 shifted into the proper positions, the three fields are simply added 278 together to form the result. This means that any integer portion of `zSig' 279 will be added into the exponent. Since a properly normalized significand 280 will have an integer portion equal to 1, the `zExp' input should be 1 less 281 than the desired result exponent whenever `zSig' is a complete, normalized 282 significand. 283 ------------------------------------------------------------------------------- 284 */ 285 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig ) 286 { 287 288 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig; 289 290 } 291 292 /* 293 ------------------------------------------------------------------------------- 294 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 295 and significand `zSig', and returns the proper single-precision floating- 296 point value corresponding to the abstract input. Ordinarily, the abstract 297 value is simply rounded and packed into the single-precision format, with 298 the inexact exception raised if the abstract input cannot be represented 299 exactly. However, if the abstract value is too large, the overflow and 300 inexact exceptions are raised and an infinity or maximal finite value is 301 returned. If the abstract value is too small, the input value is rounded to 302 a subnormal number, and the underflow and inexact exceptions are raised if 303 the abstract input cannot be represented exactly as a subnormal single- 304 precision floating-point number. 305 The input significand `zSig' has its binary point between bits 30 306 and 29, which is 7 bits to the left of the usual location. This shifted 307 significand must be normalized or smaller. If `zSig' is not normalized, 308 `zExp' must be 0; in that case, the result returned is a subnormal number, 309 and it must not require rounding. In the usual case that `zSig' is 310 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 311 The handling of underflow and overflow follows the IEC/IEEE Standard for 312 Binary Floating-Point Arithmetic. 313 ------------------------------------------------------------------------------- 314 */ 315 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 316 { 317 int8 roundingMode; 318 flag roundNearestEven; 319 int8 roundIncrement, roundBits; 320 flag isTiny; 321 322 roundingMode = float_rounding_mode; 323 roundNearestEven = ( roundingMode == float_round_nearest_even ); 324 roundIncrement = 0x40; 325 if ( ! roundNearestEven ) { 326 if ( roundingMode == float_round_to_zero ) { 327 roundIncrement = 0; 328 } 329 else { 330 roundIncrement = 0x7F; 331 if ( zSign ) { 332 if ( roundingMode == float_round_up ) roundIncrement = 0; 333 } 334 else { 335 if ( roundingMode == float_round_down ) roundIncrement = 0; 336 } 337 } 338 } 339 roundBits = zSig & 0x7F; 340 if ( 0xFD <= (bits16) zExp ) { 341 if ( ( 0xFD < zExp ) 342 || ( ( zExp == 0xFD ) 343 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) ) 344 ) { 345 float_raise( float_flag_overflow | float_flag_inexact ); 346 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 ); 347 } 348 if ( zExp < 0 ) { 349 isTiny = 350 ( float_detect_tininess == float_tininess_before_rounding ) 351 || ( zExp < -1 ) 352 || ( zSig + roundIncrement < 0x80000000U ); 353 shift32RightJamming( zSig, - zExp, &zSig ); 354 zExp = 0; 355 roundBits = zSig & 0x7F; 356 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 357 } 358 } 359 if ( roundBits ) set_float_exception_inexact_flag(); 360 zSig = ( zSig + roundIncrement )>>7; 361 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 362 if ( zSig == 0 ) zExp = 0; 363 return packFloat32( zSign, zExp, zSig ); 364 365 } 366 367 /* 368 ------------------------------------------------------------------------------- 369 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 370 and significand `zSig', and returns the proper single-precision floating- 371 point value corresponding to the abstract input. This routine is just like 372 `roundAndPackFloat32' except that `zSig' does not have to be normalized. 373 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 374 floating-point exponent. 375 ------------------------------------------------------------------------------- 376 */ 377 static float32 378 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 379 { 380 int8 shiftCount; 381 382 shiftCount = countLeadingZeros32( zSig ) - 1; 383 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount ); 384 385 } 386 387 /* 388 ------------------------------------------------------------------------------- 389 Returns the fraction bits of the double-precision floating-point value `a'. 390 ------------------------------------------------------------------------------- 391 */ 392 INLINE bits64 extractFloat64Frac( float64 a ) 393 { 394 395 return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF ); 396 397 } 398 399 /* 400 ------------------------------------------------------------------------------- 401 Returns the exponent bits of the double-precision floating-point value `a'. 402 ------------------------------------------------------------------------------- 403 */ 404 INLINE int16 extractFloat64Exp( float64 a ) 405 { 406 407 return (int16)((FLOAT64_DEMANGLE(a) >> 52) & 0x7FF); 408 409 } 410 411 /* 412 ------------------------------------------------------------------------------- 413 Returns the sign bit of the double-precision floating-point value `a'. 414 ------------------------------------------------------------------------------- 415 */ 416 INLINE flag extractFloat64Sign( float64 a ) 417 { 418 419 return (flag)(FLOAT64_DEMANGLE(a) >> 63); 420 421 } 422 423 /* 424 ------------------------------------------------------------------------------- 425 Normalizes the subnormal double-precision floating-point value represented 426 by the denormalized significand `aSig'. The normalized exponent and 427 significand are stored at the locations pointed to by `zExpPtr' and 428 `zSigPtr', respectively. 429 ------------------------------------------------------------------------------- 430 */ 431 static void 432 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr ) 433 { 434 int8 shiftCount; 435 436 shiftCount = countLeadingZeros64( aSig ) - 11; 437 *zSigPtr = aSig<<shiftCount; 438 *zExpPtr = 1 - shiftCount; 439 440 } 441 442 /* 443 ------------------------------------------------------------------------------- 444 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 445 double-precision floating-point value, returning the result. After being 446 shifted into the proper positions, the three fields are simply added 447 together to form the result. This means that any integer portion of `zSig' 448 will be added into the exponent. Since a properly normalized significand 449 will have an integer portion equal to 1, the `zExp' input should be 1 less 450 than the desired result exponent whenever `zSig' is a complete, normalized 451 significand. 452 ------------------------------------------------------------------------------- 453 */ 454 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig ) 455 { 456 457 return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) + 458 ( ( (bits64) zExp )<<52 ) + zSig ); 459 460 } 461 462 /* 463 ------------------------------------------------------------------------------- 464 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 465 and significand `zSig', and returns the proper double-precision floating- 466 point value corresponding to the abstract input. Ordinarily, the abstract 467 value is simply rounded and packed into the double-precision format, with 468 the inexact exception raised if the abstract input cannot be represented 469 exactly. However, if the abstract value is too large, the overflow and 470 inexact exceptions are raised and an infinity or maximal finite value is 471 returned. If the abstract value is too small, the input value is rounded to 472 a subnormal number, and the underflow and inexact exceptions are raised if 473 the abstract input cannot be represented exactly as a subnormal double- 474 precision floating-point number. 475 The input significand `zSig' has its binary point between bits 62 476 and 61, which is 10 bits to the left of the usual location. This shifted 477 significand must be normalized or smaller. If `zSig' is not normalized, 478 `zExp' must be 0; in that case, the result returned is a subnormal number, 479 and it must not require rounding. In the usual case that `zSig' is 480 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 481 The handling of underflow and overflow follows the IEC/IEEE Standard for 482 Binary Floating-Point Arithmetic. 483 ------------------------------------------------------------------------------- 484 */ 485 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) 486 { 487 int8 roundingMode; 488 flag roundNearestEven; 489 int16 roundIncrement, roundBits; 490 flag isTiny; 491 492 roundingMode = float_rounding_mode; 493 roundNearestEven = ( roundingMode == float_round_nearest_even ); 494 roundIncrement = 0x200; 495 if ( ! roundNearestEven ) { 496 if ( roundingMode == float_round_to_zero ) { 497 roundIncrement = 0; 498 } 499 else { 500 roundIncrement = 0x3FF; 501 if ( zSign ) { 502 if ( roundingMode == float_round_up ) roundIncrement = 0; 503 } 504 else { 505 if ( roundingMode == float_round_down ) roundIncrement = 0; 506 } 507 } 508 } 509 roundBits = (int16)(zSig & 0x3FF); 510 if ( 0x7FD <= (bits16) zExp ) { 511 if ( ( 0x7FD < zExp ) 512 || ( ( zExp == 0x7FD ) 513 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) ) 514 ) { 515 float_raise( float_flag_overflow | float_flag_inexact ); 516 return FLOAT64_MANGLE( 517 FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) - 518 ( roundIncrement == 0 )); 519 } 520 if ( zExp < 0 ) { 521 isTiny = 522 ( float_detect_tininess == float_tininess_before_rounding ) 523 || ( zExp < -1 ) 524 || ( zSig + roundIncrement < (bits64)LIT64( 0x8000000000000000 ) ); 525 shift64RightJamming( zSig, - zExp, &zSig ); 526 zExp = 0; 527 roundBits = (int16)(zSig & 0x3FF); 528 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 529 } 530 } 531 if ( roundBits ) set_float_exception_inexact_flag(); 532 zSig = ( zSig + roundIncrement )>>10; 533 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 534 if ( zSig == 0 ) zExp = 0; 535 return packFloat64( zSign, zExp, zSig ); 536 537 } 538 539 /* 540 ------------------------------------------------------------------------------- 541 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 542 and significand `zSig', and returns the proper double-precision floating- 543 point value corresponding to the abstract input. This routine is just like 544 `roundAndPackFloat64' except that `zSig' does not have to be normalized. 545 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 546 floating-point exponent. 547 ------------------------------------------------------------------------------- 548 */ 549 static float64 550 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) 551 { 552 int8 shiftCount; 553 554 shiftCount = countLeadingZeros64( zSig ) - 1; 555 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount ); 556 557 } 558 559 #ifdef FLOATX80 560 561 /* 562 ------------------------------------------------------------------------------- 563 Returns the fraction bits of the extended double-precision floating-point 564 value `a'. 565 ------------------------------------------------------------------------------- 566 */ 567 INLINE bits64 extractFloatx80Frac( floatx80 a ) 568 { 569 570 return a.low; 571 572 } 573 574 /* 575 ------------------------------------------------------------------------------- 576 Returns the exponent bits of the extended double-precision floating-point 577 value `a'. 578 ------------------------------------------------------------------------------- 579 */ 580 INLINE int32 extractFloatx80Exp( floatx80 a ) 581 { 582 583 return (a.high>>X80SHIFT) & 0x7FFF; 584 585 } 586 587 /* 588 ------------------------------------------------------------------------------- 589 Returns the sign bit of the extended double-precision floating-point value 590 `a'. 591 ------------------------------------------------------------------------------- 592 */ 593 INLINE flag extractFloatx80Sign( floatx80 a ) 594 { 595 596 return a.high>>(15 + X80SHIFT); 597 598 } 599 600 /* 601 ------------------------------------------------------------------------------- 602 Normalizes the subnormal extended double-precision floating-point value 603 represented by the denormalized significand `aSig'. The normalized exponent 604 and significand are stored at the locations pointed to by `zExpPtr' and 605 `zSigPtr', respectively. 606 ------------------------------------------------------------------------------- 607 */ 608 static void 609 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr ) 610 { 611 int8 shiftCount; 612 613 shiftCount = countLeadingZeros64( aSig ); 614 *zSigPtr = aSig<<shiftCount; 615 *zExpPtr = 1 - shiftCount; 616 617 } 618 619 /* 620 ------------------------------------------------------------------------------- 621 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 622 extended double-precision floating-point value, returning the result. 623 ------------------------------------------------------------------------------- 624 */ 625 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig ) 626 { 627 floatx80 z; 628 629 z.low = (bits64)zSig; 630 z.high = (bits32)( ( ( (bits32) zSign )<<15 ) + zExp )<<X80SHIFT; 631 return z; 632 633 } 634 635 /* 636 ------------------------------------------------------------------------------- 637 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 638 and extended significand formed by the concatenation of `zSig0' and `zSig1', 639 and returns the proper extended double-precision floating-point value 640 corresponding to the abstract input. Ordinarily, the abstract value is 641 rounded and packed into the extended double-precision format, with the 642 inexact exception raised if the abstract input cannot be represented 643 exactly. However, if the abstract value is too large, the overflow and 644 inexact exceptions are raised and an infinity or maximal finite value is 645 returned. If the abstract value is too small, the input value is rounded to 646 a subnormal number, and the underflow and inexact exceptions are raised if 647 the abstract input cannot be represented exactly as a subnormal extended 648 double-precision floating-point number. 649 If `roundingPrecision' is 32 or 64, the result is rounded to the same 650 number of bits as single or double precision, respectively. Otherwise, the 651 result is rounded to the full precision of the extended double-precision 652 format. 653 The input significand must be normalized or smaller. If the input 654 significand is not normalized, `zExp' must be 0; in that case, the result 655 returned is a subnormal number, and it must not require rounding. The 656 handling of underflow and overflow follows the IEC/IEEE Standard for Binary 657 Floating-Point Arithmetic. 658 ------------------------------------------------------------------------------- 659 */ 660 static floatx80 661 roundAndPackFloatx80( 662 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 663 ) 664 { 665 int8 roundingMode; 666 flag roundNearestEven, increment, isTiny; 667 uint64 roundIncrement, roundMask, roundBits; 668 669 roundingMode = float_rounding_mode; 670 roundNearestEven = ( roundingMode == float_round_nearest_even ); 671 if ( roundingPrecision == 80 ) goto precision80; 672 if ( roundingPrecision == 64 ) { 673 roundIncrement = LIT64( 0x0000000000000400 ); 674 roundMask = LIT64( 0x00000000000007FF ); 675 } 676 else if ( roundingPrecision == 32 ) { 677 roundIncrement = LIT64( 0x0000008000000000 ); 678 roundMask = LIT64( 0x000000FFFFFFFFFF ); 679 } 680 else { 681 goto precision80; 682 } 683 zSig0 |= ( zSig1 != 0 ); 684 if ( ! roundNearestEven ) { 685 if ( roundingMode == float_round_to_zero ) { 686 roundIncrement = 0; 687 } 688 else { 689 roundIncrement = roundMask; 690 if ( zSign ) { 691 if ( roundingMode == float_round_up ) roundIncrement = 0; 692 } 693 else { 694 if ( roundingMode == float_round_down ) roundIncrement = 0; 695 } 696 } 697 } 698 roundBits = zSig0 & roundMask; 699 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 700 if ( ( 0x7FFE < zExp ) 701 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 702 ) { 703 goto overflow; 704 } 705 if ( zExp <= 0 ) { 706 isTiny = 707 ( float_detect_tininess == float_tininess_before_rounding ) 708 || ( zExp < 0 ) 709 || ( zSig0 <= zSig0 + roundIncrement ); 710 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 711 zExp = 0; 712 roundBits = zSig0 & roundMask; 713 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 714 if ( roundBits ) set_float_exception_inexact_flag(); 715 zSig0 += roundIncrement; 716 if ( (sbits64) zSig0 < 0 ) zExp = 1; 717 roundIncrement = roundMask + 1; 718 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 719 roundMask |= roundIncrement; 720 } 721 zSig0 &= ~ roundMask; 722 return packFloatx80( zSign, zExp, zSig0 ); 723 } 724 } 725 if ( roundBits ) set_float_exception_inexact_flag(); 726 zSig0 += roundIncrement; 727 if ( zSig0 < roundIncrement ) { 728 ++zExp; 729 zSig0 = LIT64( 0x8000000000000000 ); 730 } 731 roundIncrement = roundMask + 1; 732 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 733 roundMask |= roundIncrement; 734 } 735 zSig0 &= ~ roundMask; 736 if ( zSig0 == 0 ) zExp = 0; 737 return packFloatx80( zSign, zExp, zSig0 ); 738 precision80: 739 increment = ( (sbits64) zSig1 < 0 ); 740 if ( ! roundNearestEven ) { 741 if ( roundingMode == float_round_to_zero ) { 742 increment = 0; 743 } 744 else { 745 if ( zSign ) { 746 increment = ( roundingMode == float_round_down ) && zSig1; 747 } 748 else { 749 increment = ( roundingMode == float_round_up ) && zSig1; 750 } 751 } 752 } 753 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 754 if ( ( 0x7FFE < zExp ) 755 || ( ( zExp == 0x7FFE ) 756 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 757 && increment 758 ) 759 ) { 760 roundMask = 0; 761 overflow: 762 float_raise( float_flag_overflow | float_flag_inexact ); 763 if ( ( roundingMode == float_round_to_zero ) 764 || ( zSign && ( roundingMode == float_round_up ) ) 765 || ( ! zSign && ( roundingMode == float_round_down ) ) 766 ) { 767 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 768 } 769 return packFloatx80( zSign, 0x7FFF, 0 ); 770 } 771 if ( zExp <= 0 ) { 772 isTiny = 773 ( float_detect_tininess == float_tininess_before_rounding ) 774 || ( zExp < 0 ) 775 || ! increment 776 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 777 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 778 zExp = 0; 779 if ( isTiny && zSig1 ) float_raise( float_flag_underflow ); 780 if ( zSig1 ) set_float_exception_inexact_flag(); 781 if ( roundNearestEven ) { 782 increment = ( (sbits64) zSig1 < 0 ); 783 } 784 else { 785 if ( zSign ) { 786 increment = ( roundingMode == float_round_down ) && zSig1; 787 } 788 else { 789 increment = ( roundingMode == float_round_up ) && zSig1; 790 } 791 } 792 if ( increment ) { 793 ++zSig0; 794 zSig0 &= 795 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 796 if ( (sbits64) zSig0 < 0 ) zExp = 1; 797 } 798 return packFloatx80( zSign, zExp, zSig0 ); 799 } 800 } 801 if ( zSig1 ) set_float_exception_inexact_flag(); 802 if ( increment ) { 803 ++zSig0; 804 if ( zSig0 == 0 ) { 805 ++zExp; 806 zSig0 = LIT64( 0x8000000000000000 ); 807 } 808 else { 809 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 810 } 811 } 812 else { 813 if ( zSig0 == 0 ) zExp = 0; 814 } 815 return packFloatx80( zSign, zExp, zSig0 ); 816 817 } 818 819 /* 820 ------------------------------------------------------------------------------- 821 Takes an abstract floating-point value having sign `zSign', exponent 822 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 823 and returns the proper extended double-precision floating-point value 824 corresponding to the abstract input. This routine is just like 825 `roundAndPackFloatx80' except that the input significand does not have to be 826 normalized. 827 ------------------------------------------------------------------------------- 828 */ 829 static floatx80 830 normalizeRoundAndPackFloatx80( 831 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 832 ) 833 { 834 int8 shiftCount; 835 836 if ( zSig0 == 0 ) { 837 zSig0 = zSig1; 838 zSig1 = 0; 839 zExp -= 64; 840 } 841 shiftCount = countLeadingZeros64( zSig0 ); 842 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 843 zExp -= shiftCount; 844 return 845 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 ); 846 847 } 848 849 #endif 850 851 #ifdef FLOAT128 852 853 /* 854 ------------------------------------------------------------------------------- 855 Returns the least-significant 64 fraction bits of the quadruple-precision 856 floating-point value `a'. 857 ------------------------------------------------------------------------------- 858 */ 859 INLINE bits64 extractFloat128Frac1( float128 a ) 860 { 861 862 return a.low; 863 864 } 865 866 /* 867 ------------------------------------------------------------------------------- 868 Returns the most-significant 48 fraction bits of the quadruple-precision 869 floating-point value `a'. 870 ------------------------------------------------------------------------------- 871 */ 872 INLINE bits64 extractFloat128Frac0( float128 a ) 873 { 874 875 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 876 877 } 878 879 /* 880 ------------------------------------------------------------------------------- 881 Returns the exponent bits of the quadruple-precision floating-point value 882 `a'. 883 ------------------------------------------------------------------------------- 884 */ 885 INLINE int32 extractFloat128Exp( float128 a ) 886 { 887 888 return (int32)((a.high >> 48) & 0x7FFF); 889 890 } 891 892 /* 893 ------------------------------------------------------------------------------- 894 Returns the sign bit of the quadruple-precision floating-point value `a'. 895 ------------------------------------------------------------------------------- 896 */ 897 INLINE flag extractFloat128Sign( float128 a ) 898 { 899 900 return (flag)(a.high >> 63); 901 902 } 903 904 /* 905 ------------------------------------------------------------------------------- 906 Normalizes the subnormal quadruple-precision floating-point value 907 represented by the denormalized significand formed by the concatenation of 908 `aSig0' and `aSig1'. The normalized exponent is stored at the location 909 pointed to by `zExpPtr'. The most significant 49 bits of the normalized 910 significand are stored at the location pointed to by `zSig0Ptr', and the 911 least significant 64 bits of the normalized significand are stored at the 912 location pointed to by `zSig1Ptr'. 913 ------------------------------------------------------------------------------- 914 */ 915 static void 916 normalizeFloat128Subnormal( 917 bits64 aSig0, 918 bits64 aSig1, 919 int32 *zExpPtr, 920 bits64 *zSig0Ptr, 921 bits64 *zSig1Ptr 922 ) 923 { 924 int8 shiftCount; 925 926 if ( aSig0 == 0 ) { 927 shiftCount = countLeadingZeros64( aSig1 ) - 15; 928 if ( shiftCount < 0 ) { 929 *zSig0Ptr = aSig1>>( - shiftCount ); 930 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 931 } 932 else { 933 *zSig0Ptr = aSig1<<shiftCount; 934 *zSig1Ptr = 0; 935 } 936 *zExpPtr = - shiftCount - 63; 937 } 938 else { 939 shiftCount = countLeadingZeros64( aSig0 ) - 15; 940 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 941 *zExpPtr = 1 - shiftCount; 942 } 943 944 } 945 946 /* 947 ------------------------------------------------------------------------------- 948 Packs the sign `zSign', the exponent `zExp', and the significand formed 949 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 950 floating-point value, returning the result. After being shifted into the 951 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 952 added together to form the most significant 32 bits of the result. This 953 means that any integer portion of `zSig0' will be added into the exponent. 954 Since a properly normalized significand will have an integer portion equal 955 to 1, the `zExp' input should be 1 less than the desired result exponent 956 whenever `zSig0' and `zSig1' concatenated form a complete, normalized 957 significand. 958 ------------------------------------------------------------------------------- 959 */ 960 INLINE float128 961 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) 962 { 963 float128 z; 964 965 z.low = zSig1; 966 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0; 967 return z; 968 969 } 970 971 /* 972 ------------------------------------------------------------------------------- 973 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 974 and extended significand formed by the concatenation of `zSig0', `zSig1', 975 and `zSig2', and returns the proper quadruple-precision floating-point value 976 corresponding to the abstract input. Ordinarily, the abstract value is 977 simply rounded and packed into the quadruple-precision format, with the 978 inexact exception raised if the abstract input cannot be represented 979 exactly. However, if the abstract value is too large, the overflow and 980 inexact exceptions are raised and an infinity or maximal finite value is 981 returned. If the abstract value is too small, the input value is rounded to 982 a subnormal number, and the underflow and inexact exceptions are raised if 983 the abstract input cannot be represented exactly as a subnormal quadruple- 984 precision floating-point number. 985 The input significand must be normalized or smaller. If the input 986 significand is not normalized, `zExp' must be 0; in that case, the result 987 returned is a subnormal number, and it must not require rounding. In the 988 usual case that the input significand is normalized, `zExp' must be 1 less 989 than the ``true'' floating-point exponent. The handling of underflow and 990 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 991 ------------------------------------------------------------------------------- 992 */ 993 static float128 994 roundAndPackFloat128( 995 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 ) 996 { 997 int8 roundingMode; 998 flag roundNearestEven, increment, isTiny; 999 1000 roundingMode = float_rounding_mode; 1001 roundNearestEven = ( roundingMode == float_round_nearest_even ); 1002 increment = ( (sbits64) zSig2 < 0 ); 1003 if ( ! roundNearestEven ) { 1004 if ( roundingMode == float_round_to_zero ) { 1005 increment = 0; 1006 } 1007 else { 1008 if ( zSign ) { 1009 increment = ( roundingMode == float_round_down ) && zSig2; 1010 } 1011 else { 1012 increment = ( roundingMode == float_round_up ) && zSig2; 1013 } 1014 } 1015 } 1016 if ( 0x7FFD <= (bits32) zExp ) { 1017 if ( ( 0x7FFD < zExp ) 1018 || ( ( zExp == 0x7FFD ) 1019 && eq128( 1020 LIT64( 0x0001FFFFFFFFFFFF ), 1021 LIT64( 0xFFFFFFFFFFFFFFFF ), 1022 zSig0, 1023 zSig1 1024 ) 1025 && increment 1026 ) 1027 ) { 1028 float_raise( float_flag_overflow | float_flag_inexact ); 1029 if ( ( roundingMode == float_round_to_zero ) 1030 || ( zSign && ( roundingMode == float_round_up ) ) 1031 || ( ! zSign && ( roundingMode == float_round_down ) ) 1032 ) { 1033 return 1034 packFloat128( 1035 zSign, 1036 0x7FFE, 1037 LIT64( 0x0000FFFFFFFFFFFF ), 1038 LIT64( 0xFFFFFFFFFFFFFFFF ) 1039 ); 1040 } 1041 return packFloat128( zSign, 0x7FFF, 0, 0 ); 1042 } 1043 if ( zExp < 0 ) { 1044 isTiny = 1045 ( float_detect_tininess == float_tininess_before_rounding ) 1046 || ( zExp < -1 ) 1047 || ! increment 1048 || lt128( 1049 zSig0, 1050 zSig1, 1051 LIT64( 0x0001FFFFFFFFFFFF ), 1052 LIT64( 0xFFFFFFFFFFFFFFFF ) 1053 ); 1054 shift128ExtraRightJamming( 1055 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 1056 zExp = 0; 1057 if ( isTiny && zSig2 ) float_raise( float_flag_underflow ); 1058 if ( roundNearestEven ) { 1059 increment = ( (sbits64) zSig2 < 0 ); 1060 } 1061 else { 1062 if ( zSign ) { 1063 increment = ( roundingMode == float_round_down ) && zSig2; 1064 } 1065 else { 1066 increment = ( roundingMode == float_round_up ) && zSig2; 1067 } 1068 } 1069 } 1070 } 1071 if ( zSig2 ) set_float_exception_inexact_flag(); 1072 if ( increment ) { 1073 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 1074 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 1075 } 1076 else { 1077 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 1078 } 1079 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1080 1081 } 1082 1083 /* 1084 ------------------------------------------------------------------------------- 1085 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1086 and significand formed by the concatenation of `zSig0' and `zSig1', and 1087 returns the proper quadruple-precision floating-point value corresponding 1088 to the abstract input. This routine is just like `roundAndPackFloat128' 1089 except that the input significand has fewer bits and does not have to be 1090 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 1091 point exponent. 1092 ------------------------------------------------------------------------------- 1093 */ 1094 static float128 1095 normalizeRoundAndPackFloat128( 1096 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) 1097 { 1098 int8 shiftCount; 1099 bits64 zSig2; 1100 1101 if ( zSig0 == 0 ) { 1102 zSig0 = zSig1; 1103 zSig1 = 0; 1104 zExp -= 64; 1105 } 1106 shiftCount = countLeadingZeros64( zSig0 ) - 15; 1107 if ( 0 <= shiftCount ) { 1108 zSig2 = 0; 1109 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1110 } 1111 else { 1112 shift128ExtraRightJamming( 1113 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 1114 } 1115 zExp -= shiftCount; 1116 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 1117 1118 } 1119 1120 #endif 1121 1122 /* 1123 ------------------------------------------------------------------------------- 1124 Returns the result of converting the 32-bit two's complement integer `a' 1125 to the single-precision floating-point format. The conversion is performed 1126 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1127 ------------------------------------------------------------------------------- 1128 */ 1129 float32 int32_to_float32( int32 a ) 1130 { 1131 flag zSign; 1132 1133 if ( a == 0 ) return 0; 1134 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 1135 zSign = ( a < 0 ); 1136 return normalizeRoundAndPackFloat32(zSign, 0x9C, (uint32)(zSign ? - a : a)); 1137 1138 } 1139 1140 float32 uint32_to_float32( uint32 a ) 1141 { 1142 if ( a == 0 ) return 0; 1143 if ( a & (bits32) 0x80000000 ) 1144 return normalizeRoundAndPackFloat32( 0, 0x9D, a >> 1 ); 1145 return normalizeRoundAndPackFloat32( 0, 0x9C, a ); 1146 } 1147 1148 1149 /* 1150 ------------------------------------------------------------------------------- 1151 Returns the result of converting the 32-bit two's complement integer `a' 1152 to the double-precision floating-point format. The conversion is performed 1153 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1154 ------------------------------------------------------------------------------- 1155 */ 1156 float64 int32_to_float64( int32 a ) 1157 { 1158 flag zSign; 1159 uint32 absA; 1160 int8 shiftCount; 1161 bits64 zSig; 1162 1163 if ( a == 0 ) return 0; 1164 zSign = ( a < 0 ); 1165 absA = zSign ? - a : a; 1166 shiftCount = countLeadingZeros32( absA ) + 21; 1167 zSig = absA; 1168 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); 1169 1170 } 1171 1172 float64 uint32_to_float64( uint32 a ) 1173 { 1174 int8 shiftCount; 1175 bits64 zSig = a; 1176 1177 if ( a == 0 ) return 0; 1178 shiftCount = countLeadingZeros32( a ) + 21; 1179 return packFloat64( 0, 0x432 - shiftCount, zSig<<shiftCount ); 1180 1181 } 1182 1183 #ifdef FLOATX80 1184 1185 /* 1186 ------------------------------------------------------------------------------- 1187 Returns the result of converting the 32-bit two's complement integer `a' 1188 to the extended double-precision floating-point format. The conversion 1189 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1190 Arithmetic. 1191 ------------------------------------------------------------------------------- 1192 */ 1193 floatx80 int32_to_floatx80( int32 a ) 1194 { 1195 flag zSign; 1196 uint32 absA; 1197 int8 shiftCount; 1198 bits64 zSig; 1199 1200 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1201 zSign = ( a < 0 ); 1202 absA = zSign ? - a : a; 1203 shiftCount = countLeadingZeros32( absA ) + 32; 1204 zSig = absA; 1205 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 1206 1207 } 1208 1209 floatx80 uint32_to_floatx80( uint32 a ) 1210 { 1211 int8 shiftCount; 1212 bits64 zSig = a; 1213 1214 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1215 shiftCount = countLeadingZeros32( a ) + 32; 1216 return packFloatx80( 0, 0x403E - shiftCount, zSig<<shiftCount ); 1217 1218 } 1219 1220 #endif 1221 1222 #ifdef FLOAT128 1223 1224 /* 1225 ------------------------------------------------------------------------------- 1226 Returns the result of converting the 32-bit two's complement integer `a' to 1227 the quadruple-precision floating-point format. The conversion is performed 1228 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1229 ------------------------------------------------------------------------------- 1230 */ 1231 float128 int32_to_float128( int32 a ) 1232 { 1233 flag zSign; 1234 uint32 absA; 1235 int8 shiftCount; 1236 bits64 zSig0; 1237 1238 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1239 zSign = ( a < 0 ); 1240 absA = zSign ? - a : a; 1241 shiftCount = countLeadingZeros32( absA ) + 17; 1242 zSig0 = absA; 1243 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 1244 1245 } 1246 1247 float128 uint32_to_float128( uint32 a ) 1248 { 1249 int8 shiftCount; 1250 bits64 zSig0 = a; 1251 1252 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1253 shiftCount = countLeadingZeros32( a ) + 17; 1254 return packFloat128( 0, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 1255 1256 } 1257 1258 #endif 1259 1260 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */ 1261 /* 1262 ------------------------------------------------------------------------------- 1263 Returns the result of converting the 64-bit two's complement integer `a' 1264 to the single-precision floating-point format. The conversion is performed 1265 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1266 ------------------------------------------------------------------------------- 1267 */ 1268 float32 int64_to_float32( int64 a ) 1269 { 1270 flag zSign; 1271 uint64 absA; 1272 int8 shiftCount; 1273 1274 if ( a == 0 ) return 0; 1275 zSign = ( a < 0 ); 1276 absA = zSign ? - a : a; 1277 shiftCount = countLeadingZeros64( absA ) - 40; 1278 if ( 0 <= shiftCount ) { 1279 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); 1280 } 1281 else { 1282 shiftCount += 7; 1283 if ( shiftCount < 0 ) { 1284 shift64RightJamming( absA, - shiftCount, &absA ); 1285 } 1286 else { 1287 absA <<= shiftCount; 1288 } 1289 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA ); 1290 } 1291 1292 } 1293 1294 /* 1295 ------------------------------------------------------------------------------- 1296 Returns the result of converting the 64-bit two's complement integer `a' 1297 to the double-precision floating-point format. The conversion is performed 1298 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1299 ------------------------------------------------------------------------------- 1300 */ 1301 float64 int64_to_float64( int64 a ) 1302 { 1303 flag zSign; 1304 1305 if ( a == 0 ) return 0; 1306 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) { 1307 return packFloat64( 1, 0x43E, 0 ); 1308 } 1309 zSign = ( a < 0 ); 1310 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a ); 1311 1312 } 1313 1314 #ifdef FLOATX80 1315 1316 /* 1317 ------------------------------------------------------------------------------- 1318 Returns the result of converting the 64-bit two's complement integer `a' 1319 to the extended double-precision floating-point format. The conversion 1320 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1321 Arithmetic. 1322 ------------------------------------------------------------------------------- 1323 */ 1324 floatx80 int64_to_floatx80( int64 a ) 1325 { 1326 flag zSign; 1327 uint64 absA; 1328 int8 shiftCount; 1329 1330 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1331 zSign = ( a < 0 ); 1332 absA = zSign ? - a : a; 1333 shiftCount = countLeadingZeros64( absA ); 1334 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 1335 1336 } 1337 1338 #endif 1339 1340 #endif /* !SOFTFLOAT_FOR_GCC */ 1341 1342 #ifdef FLOAT128 1343 1344 /* 1345 ------------------------------------------------------------------------------- 1346 Returns the result of converting the 64-bit two's complement integer `a' to 1347 the quadruple-precision floating-point format. The conversion is performed 1348 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1349 ------------------------------------------------------------------------------- 1350 */ 1351 float128 int64_to_float128( int64 a ) 1352 { 1353 flag zSign; 1354 uint64 absA; 1355 int8 shiftCount; 1356 int32 zExp; 1357 bits64 zSig0, zSig1; 1358 1359 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1360 zSign = ( a < 0 ); 1361 absA = zSign ? - a : a; 1362 shiftCount = countLeadingZeros64( absA ) + 49; 1363 zExp = 0x406E - shiftCount; 1364 if ( 64 <= shiftCount ) { 1365 zSig1 = 0; 1366 zSig0 = absA; 1367 shiftCount -= 64; 1368 } 1369 else { 1370 zSig1 = absA; 1371 zSig0 = 0; 1372 } 1373 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1374 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1375 1376 } 1377 1378 #endif 1379 1380 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1381 /* 1382 ------------------------------------------------------------------------------- 1383 Returns the result of converting the single-precision floating-point value 1384 `a' to the 32-bit two's complement integer format. The conversion is 1385 performed according to the IEC/IEEE Standard for Binary Floating-Point 1386 Arithmetic---which means in particular that the conversion is rounded 1387 according to the current rounding mode. If `a' is a NaN, the largest 1388 positive integer is returned. Otherwise, if the conversion overflows, the 1389 largest integer with the same sign as `a' is returned. 1390 ------------------------------------------------------------------------------- 1391 */ 1392 int32 float32_to_int32( float32 a ) 1393 { 1394 flag aSign; 1395 int16 aExp, shiftCount; 1396 bits32 aSig; 1397 bits64 aSig64; 1398 1399 aSig = extractFloat32Frac( a ); 1400 aExp = extractFloat32Exp( a ); 1401 aSign = extractFloat32Sign( a ); 1402 if ( ( aExp == 0xFF ) && aSig ) aSign = 0; 1403 if ( aExp ) aSig |= 0x00800000; 1404 shiftCount = 0xAF - aExp; 1405 aSig64 = aSig; 1406 aSig64 <<= 32; 1407 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); 1408 return roundAndPackInt32( aSign, aSig64 ); 1409 1410 } 1411 #endif /* !SOFTFLOAT_FOR_GCC */ 1412 1413 /* 1414 ------------------------------------------------------------------------------- 1415 Returns the result of converting the single-precision floating-point value 1416 `a' to the 32-bit two's complement integer format. The conversion is 1417 performed according to the IEC/IEEE Standard for Binary Floating-Point 1418 Arithmetic, except that the conversion is always rounded toward zero. 1419 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1420 the conversion overflows, the largest integer with the same sign as `a' is 1421 returned. 1422 ------------------------------------------------------------------------------- 1423 */ 1424 int32 float32_to_int32_round_to_zero( float32 a ) 1425 { 1426 flag aSign; 1427 int16 aExp, shiftCount; 1428 bits32 aSig; 1429 int32 z; 1430 1431 aSig = extractFloat32Frac( a ); 1432 aExp = extractFloat32Exp( a ); 1433 aSign = extractFloat32Sign( a ); 1434 shiftCount = aExp - 0x9E; 1435 if ( 0 <= shiftCount ) { 1436 if ( a != 0xCF000000 ) { 1437 float_raise( float_flag_invalid ); 1438 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 1439 } 1440 return (sbits32) 0x80000000; 1441 } 1442 else if ( aExp <= 0x7E ) { 1443 if ( aExp | aSig ) set_float_exception_inexact_flag(); 1444 return 0; 1445 } 1446 aSig = ( aSig | 0x00800000 )<<8; 1447 z = aSig>>( - shiftCount ); 1448 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) { 1449 set_float_exception_inexact_flag(); 1450 } 1451 if ( aSign ) z = - z; 1452 return z; 1453 1454 } 1455 1456 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */ 1457 /* 1458 ------------------------------------------------------------------------------- 1459 Returns the result of converting the single-precision floating-point value 1460 `a' to the 64-bit two's complement integer format. The conversion is 1461 performed according to the IEC/IEEE Standard for Binary Floating-Point 1462 Arithmetic---which means in particular that the conversion is rounded 1463 according to the current rounding mode. If `a' is a NaN, the largest 1464 positive integer is returned. Otherwise, if the conversion overflows, the 1465 largest integer with the same sign as `a' is returned. 1466 ------------------------------------------------------------------------------- 1467 */ 1468 int64 float32_to_int64( float32 a ) 1469 { 1470 flag aSign; 1471 int16 aExp, shiftCount; 1472 bits32 aSig; 1473 bits64 aSig64, aSigExtra; 1474 1475 aSig = extractFloat32Frac( a ); 1476 aExp = extractFloat32Exp( a ); 1477 aSign = extractFloat32Sign( a ); 1478 shiftCount = 0xBE - aExp; 1479 if ( shiftCount < 0 ) { 1480 float_raise( float_flag_invalid ); 1481 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1482 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1483 } 1484 return (sbits64) LIT64( 0x8000000000000000 ); 1485 } 1486 if ( aExp ) aSig |= 0x00800000; 1487 aSig64 = aSig; 1488 aSig64 <<= 40; 1489 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); 1490 return roundAndPackInt64( aSign, aSig64, aSigExtra ); 1491 1492 } 1493 1494 /* 1495 ------------------------------------------------------------------------------- 1496 Returns the result of converting the single-precision floating-point value 1497 `a' to the 64-bit two's complement integer format. The conversion is 1498 performed according to the IEC/IEEE Standard for Binary Floating-Point 1499 Arithmetic, except that the conversion is always rounded toward zero. If 1500 `a' is a NaN, the largest positive integer is returned. Otherwise, if the 1501 conversion overflows, the largest integer with the same sign as `a' is 1502 returned. 1503 ------------------------------------------------------------------------------- 1504 */ 1505 int64 float32_to_int64_round_to_zero( float32 a ) 1506 { 1507 flag aSign; 1508 int16 aExp, shiftCount; 1509 bits32 aSig; 1510 bits64 aSig64; 1511 int64 z; 1512 1513 aSig = extractFloat32Frac( a ); 1514 aExp = extractFloat32Exp( a ); 1515 aSign = extractFloat32Sign( a ); 1516 shiftCount = aExp - 0xBE; 1517 if ( 0 <= shiftCount ) { 1518 if ( a != 0xDF000000 ) { 1519 float_raise( float_flag_invalid ); 1520 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1521 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1522 } 1523 } 1524 return (sbits64) LIT64( 0x8000000000000000 ); 1525 } 1526 else if ( aExp <= 0x7E ) { 1527 if ( aExp | aSig ) set_float_exception_inexact_flag(); 1528 return 0; 1529 } 1530 aSig64 = aSig | 0x00800000; 1531 aSig64 <<= 40; 1532 z = aSig64>>( - shiftCount ); 1533 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) { 1534 set_float_exception_inexact_flag(); 1535 } 1536 if ( aSign ) z = - z; 1537 return z; 1538 1539 } 1540 #endif /* !SOFTFLOAT_FOR_GCC */ 1541 1542 /* 1543 ------------------------------------------------------------------------------- 1544 Returns the result of converting the single-precision floating-point value 1545 `a' to the double-precision floating-point format. The conversion is 1546 performed according to the IEC/IEEE Standard for Binary Floating-Point 1547 Arithmetic. 1548 ------------------------------------------------------------------------------- 1549 */ 1550 float64 float32_to_float64( float32 a ) 1551 { 1552 flag aSign; 1553 int16 aExp; 1554 bits32 aSig; 1555 1556 aSig = extractFloat32Frac( a ); 1557 aExp = extractFloat32Exp( a ); 1558 aSign = extractFloat32Sign( a ); 1559 if ( aExp == 0xFF ) { 1560 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) ); 1561 return packFloat64( aSign, 0x7FF, 0 ); 1562 } 1563 if ( aExp == 0 ) { 1564 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 1565 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1566 --aExp; 1567 } 1568 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 ); 1569 1570 } 1571 1572 #ifdef FLOATX80 1573 1574 /* 1575 ------------------------------------------------------------------------------- 1576 Returns the result of converting the single-precision floating-point value 1577 `a' to the extended double-precision floating-point format. The conversion 1578 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1579 Arithmetic. 1580 ------------------------------------------------------------------------------- 1581 */ 1582 floatx80 float32_to_floatx80( float32 a ) 1583 { 1584 flag aSign; 1585 int16 aExp; 1586 bits32 aSig; 1587 1588 aSig = extractFloat32Frac( a ); 1589 aExp = extractFloat32Exp( a ); 1590 aSign = extractFloat32Sign( a ); 1591 if ( aExp == 0xFF ) { 1592 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) ); 1593 return packFloatx80( aSign, 0x7FFF, 0 ); 1594 } 1595 if ( aExp == 0 ) { 1596 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 1597 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1598 } 1599 aSig |= 0x00800000; 1600 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 ); 1601 1602 } 1603 1604 #endif 1605 1606 #ifdef FLOAT128 1607 1608 /* 1609 ------------------------------------------------------------------------------- 1610 Returns the result of converting the single-precision floating-point value 1611 `a' to the double-precision floating-point format. The conversion is 1612 performed according to the IEC/IEEE Standard for Binary Floating-Point 1613 Arithmetic. 1614 ------------------------------------------------------------------------------- 1615 */ 1616 float128 float32_to_float128( float32 a ) 1617 { 1618 flag aSign; 1619 int16 aExp; 1620 bits32 aSig; 1621 1622 aSig = extractFloat32Frac( a ); 1623 aExp = extractFloat32Exp( a ); 1624 aSign = extractFloat32Sign( a ); 1625 if ( aExp == 0xFF ) { 1626 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) ); 1627 return packFloat128( aSign, 0x7FFF, 0, 0 ); 1628 } 1629 if ( aExp == 0 ) { 1630 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 1631 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1632 --aExp; 1633 } 1634 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 ); 1635 1636 } 1637 1638 #endif 1639 1640 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1641 /* 1642 ------------------------------------------------------------------------------- 1643 Rounds the single-precision floating-point value `a' to an integer, and 1644 returns the result as a single-precision floating-point value. The 1645 operation is performed according to the IEC/IEEE Standard for Binary 1646 Floating-Point Arithmetic. 1647 ------------------------------------------------------------------------------- 1648 */ 1649 float32 float32_round_to_int( float32 a ) 1650 { 1651 flag aSign; 1652 int16 aExp; 1653 bits32 lastBitMask, roundBitsMask; 1654 int8 roundingMode; 1655 float32 z; 1656 1657 aExp = extractFloat32Exp( a ); 1658 if ( 0x96 <= aExp ) { 1659 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { 1660 return propagateFloat32NaN( a, a ); 1661 } 1662 return a; 1663 } 1664 if ( aExp <= 0x7E ) { 1665 if ( (bits32) ( a<<1 ) == 0 ) return a; 1666 set_float_exception_inexact_flag(); 1667 aSign = extractFloat32Sign( a ); 1668 switch ( float_rounding_mode ) { 1669 case float_round_nearest_even: 1670 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { 1671 return packFloat32( aSign, 0x7F, 0 ); 1672 } 1673 break; 1674 case float_round_to_zero: 1675 break; 1676 case float_round_down: 1677 return aSign ? 0xBF800000 : 0; 1678 case float_round_up: 1679 return aSign ? 0x80000000 : 0x3F800000; 1680 } 1681 return packFloat32( aSign, 0, 0 ); 1682 } 1683 lastBitMask = 1; 1684 lastBitMask <<= 0x96 - aExp; 1685 roundBitsMask = lastBitMask - 1; 1686 z = a; 1687 roundingMode = float_rounding_mode; 1688 if ( roundingMode == float_round_nearest_even ) { 1689 z += lastBitMask>>1; 1690 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 1691 } 1692 else if ( roundingMode != float_round_to_zero ) { 1693 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) { 1694 z += roundBitsMask; 1695 } 1696 } 1697 z &= ~ roundBitsMask; 1698 if ( z != a ) set_float_exception_inexact_flag(); 1699 return z; 1700 1701 } 1702 #endif /* !SOFTFLOAT_FOR_GCC */ 1703 1704 /* 1705 ------------------------------------------------------------------------------- 1706 Returns the result of adding the absolute values of the single-precision 1707 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 1708 before being returned. `zSign' is ignored if the result is a NaN. 1709 The addition is performed according to the IEC/IEEE Standard for Binary 1710 Floating-Point Arithmetic. 1711 ------------------------------------------------------------------------------- 1712 */ 1713 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign ) 1714 { 1715 int16 aExp, bExp, zExp; 1716 bits32 aSig, bSig, zSig; 1717 int16 expDiff; 1718 1719 aSig = extractFloat32Frac( a ); 1720 aExp = extractFloat32Exp( a ); 1721 bSig = extractFloat32Frac( b ); 1722 bExp = extractFloat32Exp( b ); 1723 expDiff = aExp - bExp; 1724 aSig <<= 6; 1725 bSig <<= 6; 1726 if ( 0 < expDiff ) { 1727 if ( aExp == 0xFF ) { 1728 if ( aSig ) return propagateFloat32NaN( a, b ); 1729 return a; 1730 } 1731 if ( bExp == 0 ) { 1732 --expDiff; 1733 } 1734 else { 1735 bSig |= 0x20000000; 1736 } 1737 shift32RightJamming( bSig, expDiff, &bSig ); 1738 zExp = aExp; 1739 } 1740 else if ( expDiff < 0 ) { 1741 if ( bExp == 0xFF ) { 1742 if ( bSig ) return propagateFloat32NaN( a, b ); 1743 return packFloat32( zSign, 0xFF, 0 ); 1744 } 1745 if ( aExp == 0 ) { 1746 ++expDiff; 1747 } 1748 else { 1749 aSig |= 0x20000000; 1750 } 1751 shift32RightJamming( aSig, - expDiff, &aSig ); 1752 zExp = bExp; 1753 } 1754 else { 1755 if ( aExp == 0xFF ) { 1756 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1757 return a; 1758 } 1759 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); 1760 zSig = 0x40000000 + aSig + bSig; 1761 zExp = aExp; 1762 goto roundAndPack; 1763 } 1764 aSig |= 0x20000000; 1765 zSig = ( aSig + bSig )<<1; 1766 --zExp; 1767 if ( (sbits32) zSig < 0 ) { 1768 zSig = aSig + bSig; 1769 ++zExp; 1770 } 1771 roundAndPack: 1772 return roundAndPackFloat32( zSign, zExp, zSig ); 1773 1774 } 1775 1776 /* 1777 ------------------------------------------------------------------------------- 1778 Returns the result of subtracting the absolute values of the single- 1779 precision floating-point values `a' and `b'. If `zSign' is 1, the 1780 difference is negated before being returned. `zSign' is ignored if the 1781 result is a NaN. The subtraction is performed according to the IEC/IEEE 1782 Standard for Binary Floating-Point Arithmetic. 1783 ------------------------------------------------------------------------------- 1784 */ 1785 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign ) 1786 { 1787 int16 aExp, bExp, zExp; 1788 bits32 aSig, bSig, zSig; 1789 int16 expDiff; 1790 1791 aSig = extractFloat32Frac( a ); 1792 aExp = extractFloat32Exp( a ); 1793 bSig = extractFloat32Frac( b ); 1794 bExp = extractFloat32Exp( b ); 1795 expDiff = aExp - bExp; 1796 aSig <<= 7; 1797 bSig <<= 7; 1798 if ( 0 < expDiff ) goto aExpBigger; 1799 if ( expDiff < 0 ) goto bExpBigger; 1800 if ( aExp == 0xFF ) { 1801 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1802 float_raise( float_flag_invalid ); 1803 return float32_default_nan; 1804 } 1805 if ( aExp == 0 ) { 1806 aExp = 1; 1807 bExp = 1; 1808 } 1809 if ( bSig < aSig ) goto aBigger; 1810 if ( aSig < bSig ) goto bBigger; 1811 return packFloat32( float_rounding_mode == float_round_down, 0, 0 ); 1812 bExpBigger: 1813 if ( bExp == 0xFF ) { 1814 if ( bSig ) return propagateFloat32NaN( a, b ); 1815 return packFloat32( zSign ^ 1, 0xFF, 0 ); 1816 } 1817 if ( aExp == 0 ) { 1818 ++expDiff; 1819 } 1820 else { 1821 aSig |= 0x40000000; 1822 } 1823 shift32RightJamming( aSig, - expDiff, &aSig ); 1824 bSig |= 0x40000000; 1825 bBigger: 1826 zSig = bSig - aSig; 1827 zExp = bExp; 1828 zSign ^= 1; 1829 goto normalizeRoundAndPack; 1830 aExpBigger: 1831 if ( aExp == 0xFF ) { 1832 if ( aSig ) return propagateFloat32NaN( a, b ); 1833 return a; 1834 } 1835 if ( bExp == 0 ) { 1836 --expDiff; 1837 } 1838 else { 1839 bSig |= 0x40000000; 1840 } 1841 shift32RightJamming( bSig, expDiff, &bSig ); 1842 aSig |= 0x40000000; 1843 aBigger: 1844 zSig = aSig - bSig; 1845 zExp = aExp; 1846 normalizeRoundAndPack: 1847 --zExp; 1848 return normalizeRoundAndPackFloat32( zSign, zExp, zSig ); 1849 1850 } 1851 1852 /* 1853 ------------------------------------------------------------------------------- 1854 Returns the result of adding the single-precision floating-point values `a' 1855 and `b'. The operation is performed according to the IEC/IEEE Standard for 1856 Binary Floating-Point Arithmetic. 1857 ------------------------------------------------------------------------------- 1858 */ 1859 float32 float32_add( float32 a, float32 b ) 1860 { 1861 flag aSign, bSign; 1862 1863 aSign = extractFloat32Sign( a ); 1864 bSign = extractFloat32Sign( b ); 1865 if ( aSign == bSign ) { 1866 return addFloat32Sigs( a, b, aSign ); 1867 } 1868 else { 1869 return subFloat32Sigs( a, b, aSign ); 1870 } 1871 1872 } 1873 1874 /* 1875 ------------------------------------------------------------------------------- 1876 Returns the result of subtracting the single-precision floating-point values 1877 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1878 for Binary Floating-Point Arithmetic. 1879 ------------------------------------------------------------------------------- 1880 */ 1881 float32 float32_sub( float32 a, float32 b ) 1882 { 1883 flag aSign, bSign; 1884 1885 aSign = extractFloat32Sign( a ); 1886 bSign = extractFloat32Sign( b ); 1887 if ( aSign == bSign ) { 1888 return subFloat32Sigs( a, b, aSign ); 1889 } 1890 else { 1891 return addFloat32Sigs( a, b, aSign ); 1892 } 1893 1894 } 1895 1896 /* 1897 ------------------------------------------------------------------------------- 1898 Returns the result of multiplying the single-precision floating-point values 1899 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1900 for Binary Floating-Point Arithmetic. 1901 ------------------------------------------------------------------------------- 1902 */ 1903 float32 float32_mul( float32 a, float32 b ) 1904 { 1905 flag aSign, bSign, zSign; 1906 int16 aExp, bExp, zExp; 1907 bits32 aSig, bSig; 1908 bits64 zSig64; 1909 bits32 zSig; 1910 1911 aSig = extractFloat32Frac( a ); 1912 aExp = extractFloat32Exp( a ); 1913 aSign = extractFloat32Sign( a ); 1914 bSig = extractFloat32Frac( b ); 1915 bExp = extractFloat32Exp( b ); 1916 bSign = extractFloat32Sign( b ); 1917 zSign = aSign ^ bSign; 1918 if ( aExp == 0xFF ) { 1919 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1920 return propagateFloat32NaN( a, b ); 1921 } 1922 if ( ( bExp | bSig ) == 0 ) { 1923 float_raise( float_flag_invalid ); 1924 return float32_default_nan; 1925 } 1926 return packFloat32( zSign, 0xFF, 0 ); 1927 } 1928 if ( bExp == 0xFF ) { 1929 if ( bSig ) return propagateFloat32NaN( a, b ); 1930 if ( ( aExp | aSig ) == 0 ) { 1931 float_raise( float_flag_invalid ); 1932 return float32_default_nan; 1933 } 1934 return packFloat32( zSign, 0xFF, 0 ); 1935 } 1936 if ( aExp == 0 ) { 1937 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1938 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1939 } 1940 if ( bExp == 0 ) { 1941 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); 1942 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1943 } 1944 zExp = aExp + bExp - 0x7F; 1945 aSig = ( aSig | 0x00800000 )<<7; 1946 bSig = ( bSig | 0x00800000 )<<8; 1947 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 ); 1948 zSig = (bits32)zSig64; 1949 if ( 0 <= (sbits32) ( zSig<<1 ) ) { 1950 zSig <<= 1; 1951 --zExp; 1952 } 1953 return roundAndPackFloat32( zSign, zExp, zSig ); 1954 1955 } 1956 1957 /* 1958 ------------------------------------------------------------------------------- 1959 Returns the result of dividing the single-precision floating-point value `a' 1960 by the corresponding value `b'. The operation is performed according to the 1961 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1962 ------------------------------------------------------------------------------- 1963 */ 1964 float32 float32_div( float32 a, float32 b ) 1965 { 1966 flag aSign, bSign, zSign; 1967 int16 aExp, bExp, zExp; 1968 bits32 aSig, bSig, zSig; 1969 1970 aSig = extractFloat32Frac( a ); 1971 aExp = extractFloat32Exp( a ); 1972 aSign = extractFloat32Sign( a ); 1973 bSig = extractFloat32Frac( b ); 1974 bExp = extractFloat32Exp( b ); 1975 bSign = extractFloat32Sign( b ); 1976 zSign = aSign ^ bSign; 1977 if ( aExp == 0xFF ) { 1978 if ( aSig ) return propagateFloat32NaN( a, b ); 1979 if ( bExp == 0xFF ) { 1980 if ( bSig ) return propagateFloat32NaN( a, b ); 1981 float_raise( float_flag_invalid ); 1982 return float32_default_nan; 1983 } 1984 return packFloat32( zSign, 0xFF, 0 ); 1985 } 1986 if ( bExp == 0xFF ) { 1987 if ( bSig ) return propagateFloat32NaN( a, b ); 1988 return packFloat32( zSign, 0, 0 ); 1989 } 1990 if ( bExp == 0 ) { 1991 if ( bSig == 0 ) { 1992 if ( ( aExp | aSig ) == 0 ) { 1993 float_raise( float_flag_invalid ); 1994 return float32_default_nan; 1995 } 1996 float_raise( float_flag_divbyzero ); 1997 return packFloat32( zSign, 0xFF, 0 ); 1998 } 1999 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2000 } 2001 if ( aExp == 0 ) { 2002 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 2003 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2004 } 2005 zExp = aExp - bExp + 0x7D; 2006 aSig = ( aSig | 0x00800000 )<<7; 2007 bSig = ( bSig | 0x00800000 )<<8; 2008 if ( bSig <= ( aSig + aSig ) ) { 2009 aSig >>= 1; 2010 ++zExp; 2011 } 2012 zSig = (bits32)((((bits64) aSig) << 32) / bSig); 2013 if ( ( zSig & 0x3F ) == 0 ) { 2014 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 ); 2015 } 2016 return roundAndPackFloat32( zSign, zExp, zSig ); 2017 2018 } 2019 2020 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2021 /* 2022 ------------------------------------------------------------------------------- 2023 Returns the remainder of the single-precision floating-point value `a' 2024 with respect to the corresponding value `b'. The operation is performed 2025 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2026 ------------------------------------------------------------------------------- 2027 */ 2028 float32 float32_rem( float32 a, float32 b ) 2029 { 2030 flag aSign, bSign, zSign; 2031 int16 aExp, bExp, expDiff; 2032 bits32 aSig, bSig; 2033 bits32 q; 2034 bits64 aSig64, bSig64, q64; 2035 bits32 alternateASig; 2036 sbits32 sigMean; 2037 2038 aSig = extractFloat32Frac( a ); 2039 aExp = extractFloat32Exp( a ); 2040 aSign = extractFloat32Sign( a ); 2041 bSig = extractFloat32Frac( b ); 2042 bExp = extractFloat32Exp( b ); 2043 bSign = extractFloat32Sign( b ); 2044 if ( aExp == 0xFF ) { 2045 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 2046 return propagateFloat32NaN( a, b ); 2047 } 2048 float_raise( float_flag_invalid ); 2049 return float32_default_nan; 2050 } 2051 if ( bExp == 0xFF ) { 2052 if ( bSig ) return propagateFloat32NaN( a, b ); 2053 return a; 2054 } 2055 if ( bExp == 0 ) { 2056 if ( bSig == 0 ) { 2057 float_raise( float_flag_invalid ); 2058 return float32_default_nan; 2059 } 2060 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2061 } 2062 if ( aExp == 0 ) { 2063 if ( aSig == 0 ) return a; 2064 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2065 } 2066 expDiff = aExp - bExp; 2067 aSig |= 0x00800000; 2068 bSig |= 0x00800000; 2069 if ( expDiff < 32 ) { 2070 aSig <<= 8; 2071 bSig <<= 8; 2072 if ( expDiff < 0 ) { 2073 if ( expDiff < -1 ) return a; 2074 aSig >>= 1; 2075 } 2076 q = ( bSig <= aSig ); 2077 if ( q ) aSig -= bSig; 2078 if ( 0 < expDiff ) { 2079 q = ( ( (bits64) aSig )<<32 ) / bSig; 2080 q >>= 32 - expDiff; 2081 bSig >>= 2; 2082 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 2083 } 2084 else { 2085 aSig >>= 2; 2086 bSig >>= 2; 2087 } 2088 } 2089 else { 2090 if ( bSig <= aSig ) aSig -= bSig; 2091 aSig64 = ( (bits64) aSig )<<40; 2092 bSig64 = ( (bits64) bSig )<<40; 2093 expDiff -= 64; 2094 while ( 0 < expDiff ) { 2095 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2096 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2097 aSig64 = - ( ( bSig * q64 )<<38 ); 2098 expDiff -= 62; 2099 } 2100 expDiff += 64; 2101 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2102 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2103 q = q64>>( 64 - expDiff ); 2104 bSig <<= 6; 2105 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 2106 } 2107 do { 2108 alternateASig = aSig; 2109 ++q; 2110 aSig -= bSig; 2111 } while ( 0 <= (sbits32) aSig ); 2112 sigMean = aSig + alternateASig; 2113 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 2114 aSig = alternateASig; 2115 } 2116 zSign = ( (sbits32) aSig < 0 ); 2117 if ( zSign ) aSig = - aSig; 2118 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig ); 2119 2120 } 2121 #endif /* !SOFTFLOAT_FOR_GCC */ 2122 2123 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2124 /* 2125 ------------------------------------------------------------------------------- 2126 Returns the square root of the single-precision floating-point value `a'. 2127 The operation is performed according to the IEC/IEEE Standard for Binary 2128 Floating-Point Arithmetic. 2129 ------------------------------------------------------------------------------- 2130 */ 2131 float32 float32_sqrt( float32 a ) 2132 { 2133 flag aSign; 2134 int16 aExp, zExp; 2135 bits32 aSig, zSig; 2136 bits64 rem, term; 2137 2138 aSig = extractFloat32Frac( a ); 2139 aExp = extractFloat32Exp( a ); 2140 aSign = extractFloat32Sign( a ); 2141 if ( aExp == 0xFF ) { 2142 if ( aSig ) return propagateFloat32NaN( a, 0 ); 2143 if ( ! aSign ) return a; 2144 float_raise( float_flag_invalid ); 2145 return float32_default_nan; 2146 } 2147 if ( aSign ) { 2148 if ( ( aExp | aSig ) == 0 ) return a; 2149 float_raise( float_flag_invalid ); 2150 return float32_default_nan; 2151 } 2152 if ( aExp == 0 ) { 2153 if ( aSig == 0 ) return 0; 2154 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2155 } 2156 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; 2157 aSig = ( aSig | 0x00800000 )<<8; 2158 zSig = estimateSqrt32( aExp, aSig ) + 2; 2159 if ( ( zSig & 0x7F ) <= 5 ) { 2160 if ( zSig < 2 ) { 2161 zSig = 0x7FFFFFFF; 2162 goto roundAndPack; 2163 } 2164 aSig >>= aExp & 1; 2165 term = ( (bits64) zSig ) * zSig; 2166 rem = ( ( (bits64) aSig )<<32 ) - term; 2167 while ( (sbits64) rem < 0 ) { 2168 --zSig; 2169 rem += ( ( (bits64) zSig )<<1 ) | 1; 2170 } 2171 zSig |= ( rem != 0 ); 2172 } 2173 shift32RightJamming( zSig, 1, &zSig ); 2174 roundAndPack: 2175 return roundAndPackFloat32( 0, zExp, zSig ); 2176 2177 } 2178 #endif /* !SOFTFLOAT_FOR_GCC */ 2179 2180 /* 2181 ------------------------------------------------------------------------------- 2182 Returns 1 if the single-precision floating-point value `a' is equal to 2183 the corresponding value `b', and 0 otherwise. The comparison is performed 2184 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2185 ------------------------------------------------------------------------------- 2186 */ 2187 flag float32_eq( float32 a, float32 b ) 2188 { 2189 2190 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2191 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2192 ) { 2193 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2194 float_raise( float_flag_invalid ); 2195 } 2196 return 0; 2197 } 2198 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2199 2200 } 2201 2202 /* 2203 ------------------------------------------------------------------------------- 2204 Returns 1 if the single-precision floating-point value `a' is less than 2205 or equal to the corresponding value `b', and 0 otherwise. The comparison 2206 is performed according to the IEC/IEEE Standard for Binary Floating-Point 2207 Arithmetic. 2208 ------------------------------------------------------------------------------- 2209 */ 2210 flag float32_le( float32 a, float32 b ) 2211 { 2212 flag aSign, bSign; 2213 2214 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2215 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2216 ) { 2217 float_raise( float_flag_invalid ); 2218 return 0; 2219 } 2220 aSign = extractFloat32Sign( a ); 2221 bSign = extractFloat32Sign( b ); 2222 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2223 return ( a == b ) || ( aSign ^ ( a < b ) ); 2224 2225 } 2226 2227 /* 2228 ------------------------------------------------------------------------------- 2229 Returns 1 if the single-precision floating-point value `a' is less than 2230 the corresponding value `b', and 0 otherwise. The comparison is performed 2231 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2232 ------------------------------------------------------------------------------- 2233 */ 2234 flag float32_lt( float32 a, float32 b ) 2235 { 2236 flag aSign, bSign; 2237 2238 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2239 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2240 ) { 2241 float_raise( float_flag_invalid ); 2242 return 0; 2243 } 2244 aSign = extractFloat32Sign( a ); 2245 bSign = extractFloat32Sign( b ); 2246 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 2247 return ( a != b ) && ( aSign ^ ( a < b ) ); 2248 2249 } 2250 2251 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2252 /* 2253 ------------------------------------------------------------------------------- 2254 Returns 1 if the single-precision floating-point value `a' is equal to 2255 the corresponding value `b', and 0 otherwise. The invalid exception is 2256 raised if either operand is a NaN. Otherwise, the comparison is performed 2257 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2258 ------------------------------------------------------------------------------- 2259 */ 2260 flag float32_eq_signaling( float32 a, float32 b ) 2261 { 2262 2263 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2264 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2265 ) { 2266 float_raise( float_flag_invalid ); 2267 return 0; 2268 } 2269 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2270 2271 } 2272 2273 /* 2274 ------------------------------------------------------------------------------- 2275 Returns 1 if the single-precision floating-point value `a' is less than or 2276 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 2277 cause an exception. Otherwise, the comparison is performed according to the 2278 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2279 ------------------------------------------------------------------------------- 2280 */ 2281 flag float32_le_quiet( float32 a, float32 b ) 2282 { 2283 flag aSign, bSign; 2284 2285 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2286 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2287 ) { 2288 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2289 float_raise( float_flag_invalid ); 2290 } 2291 return 0; 2292 } 2293 aSign = extractFloat32Sign( a ); 2294 bSign = extractFloat32Sign( b ); 2295 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2296 return ( a == b ) || ( aSign ^ ( a < b ) ); 2297 2298 } 2299 2300 /* 2301 ------------------------------------------------------------------------------- 2302 Returns 1 if the single-precision floating-point value `a' is less than 2303 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 2304 exception. Otherwise, the comparison is performed according to the IEC/IEEE 2305 Standard for Binary Floating-Point Arithmetic. 2306 ------------------------------------------------------------------------------- 2307 */ 2308 flag float32_lt_quiet( float32 a, float32 b ) 2309 { 2310 flag aSign, bSign; 2311 2312 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2313 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2314 ) { 2315 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2316 float_raise( float_flag_invalid ); 2317 } 2318 return 0; 2319 } 2320 aSign = extractFloat32Sign( a ); 2321 bSign = extractFloat32Sign( b ); 2322 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 2323 return ( a != b ) && ( aSign ^ ( a < b ) ); 2324 2325 } 2326 #endif /* !SOFTFLOAT_FOR_GCC */ 2327 2328 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2329 /* 2330 ------------------------------------------------------------------------------- 2331 Returns the result of converting the double-precision floating-point value 2332 `a' to the 32-bit two's complement integer format. The conversion is 2333 performed according to the IEC/IEEE Standard for Binary Floating-Point 2334 Arithmetic---which means in particular that the conversion is rounded 2335 according to the current rounding mode. If `a' is a NaN, the largest 2336 positive integer is returned. Otherwise, if the conversion overflows, the 2337 largest integer with the same sign as `a' is returned. 2338 ------------------------------------------------------------------------------- 2339 */ 2340 int32 float64_to_int32( float64 a ) 2341 { 2342 flag aSign; 2343 int16 aExp, shiftCount; 2344 bits64 aSig; 2345 2346 aSig = extractFloat64Frac( a ); 2347 aExp = extractFloat64Exp( a ); 2348 aSign = extractFloat64Sign( a ); 2349 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2350 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2351 shiftCount = 0x42C - aExp; 2352 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 2353 return roundAndPackInt32( aSign, aSig ); 2354 2355 } 2356 #endif /* !SOFTFLOAT_FOR_GCC */ 2357 2358 /* 2359 ------------------------------------------------------------------------------- 2360 Returns the result of converting the double-precision floating-point value 2361 `a' to the 32-bit two's complement integer format. The conversion is 2362 performed according to the IEC/IEEE Standard for Binary Floating-Point 2363 Arithmetic, except that the conversion is always rounded toward zero. 2364 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2365 the conversion overflows, the largest integer with the same sign as `a' is 2366 returned. 2367 ------------------------------------------------------------------------------- 2368 */ 2369 int32 float64_to_int32_round_to_zero( float64 a ) 2370 { 2371 flag aSign; 2372 int16 aExp, shiftCount; 2373 bits64 aSig, savedASig; 2374 int32 z; 2375 2376 aSig = extractFloat64Frac( a ); 2377 aExp = extractFloat64Exp( a ); 2378 aSign = extractFloat64Sign( a ); 2379 if ( 0x41E < aExp ) { 2380 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2381 goto invalid; 2382 } 2383 else if ( aExp < 0x3FF ) { 2384 if ( aExp || aSig ) set_float_exception_inexact_flag(); 2385 return 0; 2386 } 2387 aSig |= LIT64( 0x0010000000000000 ); 2388 shiftCount = 0x433 - aExp; 2389 savedASig = aSig; 2390 aSig >>= shiftCount; 2391 z = (int32)aSig; 2392 if ( aSign ) z = - z; 2393 if ( ( z < 0 ) ^ aSign ) { 2394 invalid: 2395 float_raise( float_flag_invalid ); 2396 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 2397 } 2398 if ( ( aSig<<shiftCount ) != savedASig ) { 2399 set_float_exception_inexact_flag(); 2400 } 2401 return z; 2402 2403 } 2404 2405 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2406 /* 2407 ------------------------------------------------------------------------------- 2408 Returns the result of converting the double-precision floating-point value 2409 `a' to the 64-bit two's complement integer format. The conversion is 2410 performed according to the IEC/IEEE Standard for Binary Floating-Point 2411 Arithmetic---which means in particular that the conversion is rounded 2412 according to the current rounding mode. If `a' is a NaN, the largest 2413 positive integer is returned. Otherwise, if the conversion overflows, the 2414 largest integer with the same sign as `a' is returned. 2415 ------------------------------------------------------------------------------- 2416 */ 2417 int64 float64_to_int64( float64 a ) 2418 { 2419 flag aSign; 2420 int16 aExp, shiftCount; 2421 bits64 aSig, aSigExtra; 2422 2423 aSig = extractFloat64Frac( a ); 2424 aExp = extractFloat64Exp( a ); 2425 aSign = extractFloat64Sign( a ); 2426 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2427 shiftCount = 0x433 - aExp; 2428 if ( shiftCount <= 0 ) { 2429 if ( 0x43E < aExp ) { 2430 float_raise( float_flag_invalid ); 2431 if ( ! aSign 2432 || ( ( aExp == 0x7FF ) 2433 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2434 ) { 2435 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2436 } 2437 return (sbits64) LIT64( 0x8000000000000000 ); 2438 } 2439 aSigExtra = 0; 2440 aSig <<= - shiftCount; 2441 } 2442 else { 2443 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 2444 } 2445 return roundAndPackInt64( aSign, aSig, aSigExtra ); 2446 2447 } 2448 2449 /* 2450 ------------------------------------------------------------------------------- 2451 Returns the result of converting the double-precision floating-point value 2452 `a' to the 64-bit two's complement integer format. The conversion is 2453 performed according to the IEC/IEEE Standard for Binary Floating-Point 2454 Arithmetic, except that the conversion is always rounded toward zero. 2455 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2456 the conversion overflows, the largest integer with the same sign as `a' is 2457 returned. 2458 ------------------------------------------------------------------------------- 2459 */ 2460 int64 float64_to_int64_round_to_zero( float64 a ) 2461 { 2462 flag aSign; 2463 int16 aExp, shiftCount; 2464 bits64 aSig; 2465 int64 z; 2466 2467 aSig = extractFloat64Frac( a ); 2468 aExp = extractFloat64Exp( a ); 2469 aSign = extractFloat64Sign( a ); 2470 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2471 shiftCount = aExp - 0x433; 2472 if ( 0 <= shiftCount ) { 2473 if ( 0x43E <= aExp ) { 2474 if ( a != LIT64( 0xC3E0000000000000 ) ) { 2475 float_raise( float_flag_invalid ); 2476 if ( ! aSign 2477 || ( ( aExp == 0x7FF ) 2478 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2479 ) { 2480 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2481 } 2482 } 2483 return (sbits64) LIT64( 0x8000000000000000 ); 2484 } 2485 z = aSig<<shiftCount; 2486 } 2487 else { 2488 if ( aExp < 0x3FE ) { 2489 if ( aExp | aSig ) set_float_exception_inexact_flag(); 2490 return 0; 2491 } 2492 z = aSig>>( - shiftCount ); 2493 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 2494 set_float_exception_inexact_flag(); 2495 } 2496 } 2497 if ( aSign ) z = - z; 2498 return z; 2499 2500 } 2501 #endif /* !SOFTFLOAT_FOR_GCC */ 2502 2503 /* 2504 ------------------------------------------------------------------------------- 2505 Returns the result of converting the double-precision floating-point value 2506 `a' to the single-precision floating-point format. The conversion is 2507 performed according to the IEC/IEEE Standard for Binary Floating-Point 2508 Arithmetic. 2509 ------------------------------------------------------------------------------- 2510 */ 2511 float32 float64_to_float32( float64 a ) 2512 { 2513 flag aSign; 2514 int16 aExp; 2515 bits64 aSig; 2516 bits32 zSig; 2517 2518 aSig = extractFloat64Frac( a ); 2519 aExp = extractFloat64Exp( a ); 2520 aSign = extractFloat64Sign( a ); 2521 if ( aExp == 0x7FF ) { 2522 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) ); 2523 return packFloat32( aSign, 0xFF, 0 ); 2524 } 2525 shift64RightJamming( aSig, 22, &aSig ); 2526 zSig = (bits32)aSig; 2527 if ( aExp || zSig ) { 2528 zSig |= 0x40000000; 2529 aExp -= 0x381; 2530 } 2531 return roundAndPackFloat32( aSign, aExp, zSig ); 2532 2533 } 2534 2535 #ifdef FLOATX80 2536 2537 /* 2538 ------------------------------------------------------------------------------- 2539 Returns the result of converting the double-precision floating-point value 2540 `a' to the extended double-precision floating-point format. The conversion 2541 is performed according to the IEC/IEEE Standard for Binary Floating-Point 2542 Arithmetic. 2543 ------------------------------------------------------------------------------- 2544 */ 2545 floatx80 float64_to_floatx80( float64 a ) 2546 { 2547 flag aSign; 2548 int16 aExp; 2549 bits64 aSig; 2550 2551 aSig = extractFloat64Frac( a ); 2552 aExp = extractFloat64Exp( a ); 2553 aSign = extractFloat64Sign( a ); 2554 if ( aExp == 0x7FF ) { 2555 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) ); 2556 return packFloatx80( aSign, 0x7FFF, 0 ); 2557 } 2558 if ( aExp == 0 ) { 2559 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 2560 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2561 } 2562 return 2563 packFloatx80( 2564 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 2565 2566 } 2567 2568 #endif 2569 2570 #ifdef FLOAT128 2571 2572 /* 2573 ------------------------------------------------------------------------------- 2574 Returns the result of converting the double-precision floating-point value 2575 `a' to the quadruple-precision floating-point format. The conversion is 2576 performed according to the IEC/IEEE Standard for Binary Floating-Point 2577 Arithmetic. 2578 ------------------------------------------------------------------------------- 2579 */ 2580 float128 float64_to_float128( float64 a ) 2581 { 2582 flag aSign; 2583 int16 aExp; 2584 bits64 aSig, zSig0, zSig1; 2585 2586 aSig = extractFloat64Frac( a ); 2587 aExp = extractFloat64Exp( a ); 2588 aSign = extractFloat64Sign( a ); 2589 if ( aExp == 0x7FF ) { 2590 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) ); 2591 return packFloat128( aSign, 0x7FFF, 0, 0 ); 2592 } 2593 if ( aExp == 0 ) { 2594 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 2595 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2596 --aExp; 2597 } 2598 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 2599 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 2600 2601 } 2602 2603 #endif 2604 2605 #ifndef SOFTFLOAT_FOR_GCC 2606 /* 2607 ------------------------------------------------------------------------------- 2608 Rounds the double-precision floating-point value `a' to an integer, and 2609 returns the result as a double-precision floating-point value. The 2610 operation is performed according to the IEC/IEEE Standard for Binary 2611 Floating-Point Arithmetic. 2612 ------------------------------------------------------------------------------- 2613 */ 2614 float64 float64_round_to_int( float64 a ) 2615 { 2616 flag aSign; 2617 int16 aExp; 2618 bits64 lastBitMask, roundBitsMask; 2619 int8 roundingMode; 2620 float64 z; 2621 2622 aExp = extractFloat64Exp( a ); 2623 if ( 0x433 <= aExp ) { 2624 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { 2625 return propagateFloat64NaN( a, a ); 2626 } 2627 return a; 2628 } 2629 if ( aExp < 0x3FF ) { 2630 if ( (bits64) ( a<<1 ) == 0 ) return a; 2631 set_float_exception_inexact_flag(); 2632 aSign = extractFloat64Sign( a ); 2633 switch ( float_rounding_mode ) { 2634 case float_round_nearest_even: 2635 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { 2636 return packFloat64( aSign, 0x3FF, 0 ); 2637 } 2638 break; 2639 case float_round_to_zero: 2640 break; 2641 case float_round_down: 2642 return aSign ? LIT64( 0xBFF0000000000000 ) : 0; 2643 case float_round_up: 2644 return 2645 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ); 2646 } 2647 return packFloat64( aSign, 0, 0 ); 2648 } 2649 lastBitMask = 1; 2650 lastBitMask <<= 0x433 - aExp; 2651 roundBitsMask = lastBitMask - 1; 2652 z = a; 2653 roundingMode = float_rounding_mode; 2654 if ( roundingMode == float_round_nearest_even ) { 2655 z += lastBitMask>>1; 2656 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 2657 } 2658 else if ( roundingMode != float_round_to_zero ) { 2659 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) { 2660 z += roundBitsMask; 2661 } 2662 } 2663 z &= ~ roundBitsMask; 2664 if ( z != a ) set_float_exception_inexact_flag(); 2665 return z; 2666 2667 } 2668 #endif 2669 2670 /* 2671 ------------------------------------------------------------------------------- 2672 Returns the result of adding the absolute values of the double-precision 2673 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 2674 before being returned. `zSign' is ignored if the result is a NaN. 2675 The addition is performed according to the IEC/IEEE Standard for Binary 2676 Floating-Point Arithmetic. 2677 ------------------------------------------------------------------------------- 2678 */ 2679 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign ) 2680 { 2681 int16 aExp, bExp, zExp; 2682 bits64 aSig, bSig, zSig; 2683 int16 expDiff; 2684 2685 aSig = extractFloat64Frac( a ); 2686 aExp = extractFloat64Exp( a ); 2687 bSig = extractFloat64Frac( b ); 2688 bExp = extractFloat64Exp( b ); 2689 expDiff = aExp - bExp; 2690 aSig <<= 9; 2691 bSig <<= 9; 2692 if ( 0 < expDiff ) { 2693 if ( aExp == 0x7FF ) { 2694 if ( aSig ) return propagateFloat64NaN( a, b ); 2695 return a; 2696 } 2697 if ( bExp == 0 ) { 2698 --expDiff; 2699 } 2700 else { 2701 bSig |= LIT64( 0x2000000000000000 ); 2702 } 2703 shift64RightJamming( bSig, expDiff, &bSig ); 2704 zExp = aExp; 2705 } 2706 else if ( expDiff < 0 ) { 2707 if ( bExp == 0x7FF ) { 2708 if ( bSig ) return propagateFloat64NaN( a, b ); 2709 return packFloat64( zSign, 0x7FF, 0 ); 2710 } 2711 if ( aExp == 0 ) { 2712 ++expDiff; 2713 } 2714 else { 2715 aSig |= LIT64( 0x2000000000000000 ); 2716 } 2717 shift64RightJamming( aSig, - expDiff, &aSig ); 2718 zExp = bExp; 2719 } 2720 else { 2721 if ( aExp == 0x7FF ) { 2722 if ( aSig | bSig ) return propagateFloat64NaN( a, b ); 2723 return a; 2724 } 2725 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); 2726 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; 2727 zExp = aExp; 2728 goto roundAndPack; 2729 } 2730 aSig |= LIT64( 0x2000000000000000 ); 2731 zSig = ( aSig + bSig )<<1; 2732 --zExp; 2733 if ( (sbits64) zSig < 0 ) { 2734 zSig = aSig + bSig; 2735 ++zExp; 2736 } 2737 roundAndPack: 2738 return roundAndPackFloat64( zSign, zExp, zSig ); 2739 2740 } 2741 2742 /* 2743 ------------------------------------------------------------------------------- 2744 Returns the result of subtracting the absolute values of the double- 2745 precision floating-point values `a' and `b'. If `zSign' is 1, the 2746 difference is negated before being returned. `zSign' is ignored if the 2747 result is a NaN. The subtraction is performed according to the IEC/IEEE 2748 Standard for Binary Floating-Point Arithmetic. 2749 ------------------------------------------------------------------------------- 2750 */ 2751 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign ) 2752 { 2753 int16 aExp, bExp, zExp; 2754 bits64 aSig, bSig, zSig; 2755 int16 expDiff; 2756 2757 aSig = extractFloat64Frac( a ); 2758 aExp = extractFloat64Exp( a ); 2759 bSig = extractFloat64Frac( b ); 2760 bExp = extractFloat64Exp( b ); 2761 expDiff = aExp - bExp; 2762 aSig <<= 10; 2763 bSig <<= 10; 2764 if ( 0 < expDiff ) goto aExpBigger; 2765 if ( expDiff < 0 ) goto bExpBigger; 2766 if ( aExp == 0x7FF ) { 2767 if ( aSig | bSig ) return propagateFloat64NaN( a, b ); 2768 float_raise( float_flag_invalid ); 2769 return float64_default_nan; 2770 } 2771 if ( aExp == 0 ) { 2772 aExp = 1; 2773 bExp = 1; 2774 } 2775 if ( bSig < aSig ) goto aBigger; 2776 if ( aSig < bSig ) goto bBigger; 2777 return packFloat64( float_rounding_mode == float_round_down, 0, 0 ); 2778 bExpBigger: 2779 if ( bExp == 0x7FF ) { 2780 if ( bSig ) return propagateFloat64NaN( a, b ); 2781 return packFloat64( zSign ^ 1, 0x7FF, 0 ); 2782 } 2783 if ( aExp == 0 ) { 2784 ++expDiff; 2785 } 2786 else { 2787 aSig |= LIT64( 0x4000000000000000 ); 2788 } 2789 shift64RightJamming( aSig, - expDiff, &aSig ); 2790 bSig |= LIT64( 0x4000000000000000 ); 2791 bBigger: 2792 zSig = bSig - aSig; 2793 zExp = bExp; 2794 zSign ^= 1; 2795 goto normalizeRoundAndPack; 2796 aExpBigger: 2797 if ( aExp == 0x7FF ) { 2798 if ( aSig ) return propagateFloat64NaN( a, b ); 2799 return a; 2800 } 2801 if ( bExp == 0 ) { 2802 --expDiff; 2803 } 2804 else { 2805 bSig |= LIT64( 0x4000000000000000 ); 2806 } 2807 shift64RightJamming( bSig, expDiff, &bSig ); 2808 aSig |= LIT64( 0x4000000000000000 ); 2809 aBigger: 2810 zSig = aSig - bSig; 2811 zExp = aExp; 2812 normalizeRoundAndPack: 2813 --zExp; 2814 return normalizeRoundAndPackFloat64( zSign, zExp, zSig ); 2815 2816 } 2817 2818 /* 2819 ------------------------------------------------------------------------------- 2820 Returns the result of adding the double-precision floating-point values `a' 2821 and `b'. The operation is performed according to the IEC/IEEE Standard for 2822 Binary Floating-Point Arithmetic. 2823 ------------------------------------------------------------------------------- 2824 */ 2825 float64 float64_add( float64 a, float64 b ) 2826 { 2827 flag aSign, bSign; 2828 2829 aSign = extractFloat64Sign( a ); 2830 bSign = extractFloat64Sign( b ); 2831 if ( aSign == bSign ) { 2832 return addFloat64Sigs( a, b, aSign ); 2833 } 2834 else { 2835 return subFloat64Sigs( a, b, aSign ); 2836 } 2837 2838 } 2839 2840 /* 2841 ------------------------------------------------------------------------------- 2842 Returns the result of subtracting the double-precision floating-point values 2843 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2844 for Binary Floating-Point Arithmetic. 2845 ------------------------------------------------------------------------------- 2846 */ 2847 float64 float64_sub( float64 a, float64 b ) 2848 { 2849 flag aSign, bSign; 2850 2851 aSign = extractFloat64Sign( a ); 2852 bSign = extractFloat64Sign( b ); 2853 if ( aSign == bSign ) { 2854 return subFloat64Sigs( a, b, aSign ); 2855 } 2856 else { 2857 return addFloat64Sigs( a, b, aSign ); 2858 } 2859 2860 } 2861 2862 /* 2863 ------------------------------------------------------------------------------- 2864 Returns the result of multiplying the double-precision floating-point values 2865 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2866 for Binary Floating-Point Arithmetic. 2867 ------------------------------------------------------------------------------- 2868 */ 2869 float64 float64_mul( float64 a, float64 b ) 2870 { 2871 flag aSign, bSign, zSign; 2872 int16 aExp, bExp, zExp; 2873 bits64 aSig, bSig, zSig0, zSig1; 2874 2875 aSig = extractFloat64Frac( a ); 2876 aExp = extractFloat64Exp( a ); 2877 aSign = extractFloat64Sign( a ); 2878 bSig = extractFloat64Frac( b ); 2879 bExp = extractFloat64Exp( b ); 2880 bSign = extractFloat64Sign( b ); 2881 zSign = aSign ^ bSign; 2882 if ( aExp == 0x7FF ) { 2883 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 2884 return propagateFloat64NaN( a, b ); 2885 } 2886 if ( ( bExp | bSig ) == 0 ) { 2887 float_raise( float_flag_invalid ); 2888 return float64_default_nan; 2889 } 2890 return packFloat64( zSign, 0x7FF, 0 ); 2891 } 2892 if ( bExp == 0x7FF ) { 2893 if ( bSig ) return propagateFloat64NaN( a, b ); 2894 if ( ( aExp | aSig ) == 0 ) { 2895 float_raise( float_flag_invalid ); 2896 return float64_default_nan; 2897 } 2898 return packFloat64( zSign, 0x7FF, 0 ); 2899 } 2900 if ( aExp == 0 ) { 2901 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2902 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2903 } 2904 if ( bExp == 0 ) { 2905 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); 2906 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2907 } 2908 zExp = aExp + bExp - 0x3FF; 2909 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2910 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2911 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 2912 zSig0 |= ( zSig1 != 0 ); 2913 if ( 0 <= (sbits64) ( zSig0<<1 ) ) { 2914 zSig0 <<= 1; 2915 --zExp; 2916 } 2917 return roundAndPackFloat64( zSign, zExp, zSig0 ); 2918 2919 } 2920 2921 /* 2922 ------------------------------------------------------------------------------- 2923 Returns the result of dividing the double-precision floating-point value `a' 2924 by the corresponding value `b'. The operation is performed according to 2925 the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2926 ------------------------------------------------------------------------------- 2927 */ 2928 float64 float64_div( float64 a, float64 b ) 2929 { 2930 flag aSign, bSign, zSign; 2931 int16 aExp, bExp, zExp; 2932 bits64 aSig, bSig, zSig; 2933 bits64 rem0, rem1; 2934 bits64 term0, term1; 2935 2936 aSig = extractFloat64Frac( a ); 2937 aExp = extractFloat64Exp( a ); 2938 aSign = extractFloat64Sign( a ); 2939 bSig = extractFloat64Frac( b ); 2940 bExp = extractFloat64Exp( b ); 2941 bSign = extractFloat64Sign( b ); 2942 zSign = aSign ^ bSign; 2943 if ( aExp == 0x7FF ) { 2944 if ( aSig ) return propagateFloat64NaN( a, b ); 2945 if ( bExp == 0x7FF ) { 2946 if ( bSig ) return propagateFloat64NaN( a, b ); 2947 float_raise( float_flag_invalid ); 2948 return float64_default_nan; 2949 } 2950 return packFloat64( zSign, 0x7FF, 0 ); 2951 } 2952 if ( bExp == 0x7FF ) { 2953 if ( bSig ) return propagateFloat64NaN( a, b ); 2954 return packFloat64( zSign, 0, 0 ); 2955 } 2956 if ( bExp == 0 ) { 2957 if ( bSig == 0 ) { 2958 if ( ( aExp | aSig ) == 0 ) { 2959 float_raise( float_flag_invalid ); 2960 return float64_default_nan; 2961 } 2962 float_raise( float_flag_divbyzero ); 2963 return packFloat64( zSign, 0x7FF, 0 ); 2964 } 2965 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2966 } 2967 if ( aExp == 0 ) { 2968 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2969 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2970 } 2971 zExp = aExp - bExp + 0x3FD; 2972 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2973 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2974 if ( bSig <= ( aSig + aSig ) ) { 2975 aSig >>= 1; 2976 ++zExp; 2977 } 2978 zSig = estimateDiv128To64( aSig, 0, bSig ); 2979 if ( ( zSig & 0x1FF ) <= 2 ) { 2980 mul64To128( bSig, zSig, &term0, &term1 ); 2981 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 2982 while ( (sbits64) rem0 < 0 ) { 2983 --zSig; 2984 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 2985 } 2986 zSig |= ( rem1 != 0 ); 2987 } 2988 return roundAndPackFloat64( zSign, zExp, zSig ); 2989 2990 } 2991 2992 #ifndef SOFTFLOAT_FOR_GCC 2993 /* 2994 ------------------------------------------------------------------------------- 2995 Returns the remainder of the double-precision floating-point value `a' 2996 with respect to the corresponding value `b'. The operation is performed 2997 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2998 ------------------------------------------------------------------------------- 2999 */ 3000 float64 float64_rem( float64 a, float64 b ) 3001 { 3002 flag aSign, bSign, zSign; 3003 int16 aExp, bExp, expDiff; 3004 bits64 aSig, bSig; 3005 bits64 q, alternateASig; 3006 sbits64 sigMean; 3007 3008 aSig = extractFloat64Frac( a ); 3009 aExp = extractFloat64Exp( a ); 3010 aSign = extractFloat64Sign( a ); 3011 bSig = extractFloat64Frac( b ); 3012 bExp = extractFloat64Exp( b ); 3013 bSign = extractFloat64Sign( b ); 3014 if ( aExp == 0x7FF ) { 3015 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 3016 return propagateFloat64NaN( a, b ); 3017 } 3018 float_raise( float_flag_invalid ); 3019 return float64_default_nan; 3020 } 3021 if ( bExp == 0x7FF ) { 3022 if ( bSig ) return propagateFloat64NaN( a, b ); 3023 return a; 3024 } 3025 if ( bExp == 0 ) { 3026 if ( bSig == 0 ) { 3027 float_raise( float_flag_invalid ); 3028 return float64_default_nan; 3029 } 3030 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 3031 } 3032 if ( aExp == 0 ) { 3033 if ( aSig == 0 ) return a; 3034 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3035 } 3036 expDiff = aExp - bExp; 3037 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 3038 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 3039 if ( expDiff < 0 ) { 3040 if ( expDiff < -1 ) return a; 3041 aSig >>= 1; 3042 } 3043 q = ( bSig <= aSig ); 3044 if ( q ) aSig -= bSig; 3045 expDiff -= 64; 3046 while ( 0 < expDiff ) { 3047 q = estimateDiv128To64( aSig, 0, bSig ); 3048 q = ( 2 < q ) ? q - 2 : 0; 3049 aSig = - ( ( bSig>>2 ) * q ); 3050 expDiff -= 62; 3051 } 3052 expDiff += 64; 3053 if ( 0 < expDiff ) { 3054 q = estimateDiv128To64( aSig, 0, bSig ); 3055 q = ( 2 < q ) ? q - 2 : 0; 3056 q >>= 64 - expDiff; 3057 bSig >>= 2; 3058 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 3059 } 3060 else { 3061 aSig >>= 2; 3062 bSig >>= 2; 3063 } 3064 do { 3065 alternateASig = aSig; 3066 ++q; 3067 aSig -= bSig; 3068 } while ( 0 <= (sbits64) aSig ); 3069 sigMean = aSig + alternateASig; 3070 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 3071 aSig = alternateASig; 3072 } 3073 zSign = ( (sbits64) aSig < 0 ); 3074 if ( zSign ) aSig = - aSig; 3075 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig ); 3076 3077 } 3078 3079 /* 3080 ------------------------------------------------------------------------------- 3081 Returns the square root of the double-precision floating-point value `a'. 3082 The operation is performed according to the IEC/IEEE Standard for Binary 3083 Floating-Point Arithmetic. 3084 ------------------------------------------------------------------------------- 3085 */ 3086 float64 float64_sqrt( float64 a ) 3087 { 3088 flag aSign; 3089 int16 aExp, zExp; 3090 bits64 aSig, zSig, doubleZSig; 3091 bits64 rem0, rem1, term0, term1; 3092 3093 aSig = extractFloat64Frac( a ); 3094 aExp = extractFloat64Exp( a ); 3095 aSign = extractFloat64Sign( a ); 3096 if ( aExp == 0x7FF ) { 3097 if ( aSig ) return propagateFloat64NaN( a, a ); 3098 if ( ! aSign ) return a; 3099 float_raise( float_flag_invalid ); 3100 return float64_default_nan; 3101 } 3102 if ( aSign ) { 3103 if ( ( aExp | aSig ) == 0 ) return a; 3104 float_raise( float_flag_invalid ); 3105 return float64_default_nan; 3106 } 3107 if ( aExp == 0 ) { 3108 if ( aSig == 0 ) return 0; 3109 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3110 } 3111 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 3112 aSig |= LIT64( 0x0010000000000000 ); 3113 zSig = estimateSqrt32( aExp, aSig>>21 ); 3114 aSig <<= 9 - ( aExp & 1 ); 3115 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); 3116 if ( ( zSig & 0x1FF ) <= 5 ) { 3117 doubleZSig = zSig<<1; 3118 mul64To128( zSig, zSig, &term0, &term1 ); 3119 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 3120 while ( (sbits64) rem0 < 0 ) { 3121 --zSig; 3122 doubleZSig -= 2; 3123 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); 3124 } 3125 zSig |= ( ( rem0 | rem1 ) != 0 ); 3126 } 3127 return roundAndPackFloat64( 0, zExp, zSig ); 3128 3129 } 3130 #endif 3131 3132 /* 3133 ------------------------------------------------------------------------------- 3134 Returns 1 if the double-precision floating-point value `a' is equal to the 3135 corresponding value `b', and 0 otherwise. The comparison is performed 3136 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3137 ------------------------------------------------------------------------------- 3138 */ 3139 flag float64_eq( float64 a, float64 b ) 3140 { 3141 3142 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3143 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3144 ) { 3145 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3146 float_raise( float_flag_invalid ); 3147 } 3148 return 0; 3149 } 3150 return ( a == b ) || 3151 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 ); 3152 3153 } 3154 3155 /* 3156 ------------------------------------------------------------------------------- 3157 Returns 1 if the double-precision floating-point value `a' is less than or 3158 equal to the corresponding value `b', and 0 otherwise. The comparison is 3159 performed according to the IEC/IEEE Standard for Binary Floating-Point 3160 Arithmetic. 3161 ------------------------------------------------------------------------------- 3162 */ 3163 flag float64_le( float64 a, float64 b ) 3164 { 3165 flag aSign, bSign; 3166 3167 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3168 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3169 ) { 3170 float_raise( float_flag_invalid ); 3171 return 0; 3172 } 3173 aSign = extractFloat64Sign( a ); 3174 bSign = extractFloat64Sign( b ); 3175 if ( aSign != bSign ) 3176 return aSign || 3177 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 3178 0 ); 3179 return ( a == b ) || 3180 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); 3181 3182 } 3183 3184 /* 3185 ------------------------------------------------------------------------------- 3186 Returns 1 if the double-precision floating-point value `a' is less than 3187 the corresponding value `b', and 0 otherwise. The comparison is performed 3188 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3189 ------------------------------------------------------------------------------- 3190 */ 3191 flag float64_lt( float64 a, float64 b ) 3192 { 3193 flag aSign, bSign; 3194 3195 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3196 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3197 ) { 3198 float_raise( float_flag_invalid ); 3199 return 0; 3200 } 3201 aSign = extractFloat64Sign( a ); 3202 bSign = extractFloat64Sign( b ); 3203 if ( aSign != bSign ) 3204 return aSign && 3205 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) != 3206 0 ); 3207 return ( a != b ) && 3208 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); 3209 3210 } 3211 3212 #ifndef SOFTFLOAT_FOR_GCC 3213 /* 3214 ------------------------------------------------------------------------------- 3215 Returns 1 if the double-precision floating-point value `a' is equal to the 3216 corresponding value `b', and 0 otherwise. The invalid exception is raised 3217 if either operand is a NaN. Otherwise, the comparison is performed 3218 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3219 ------------------------------------------------------------------------------- 3220 */ 3221 flag float64_eq_signaling( float64 a, float64 b ) 3222 { 3223 3224 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3225 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3226 ) { 3227 float_raise( float_flag_invalid ); 3228 return 0; 3229 } 3230 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 ); 3231 3232 } 3233 3234 /* 3235 ------------------------------------------------------------------------------- 3236 Returns 1 if the double-precision floating-point value `a' is less than or 3237 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 3238 cause an exception. Otherwise, the comparison is performed according to the 3239 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3240 ------------------------------------------------------------------------------- 3241 */ 3242 flag float64_le_quiet( float64 a, float64 b ) 3243 { 3244 flag aSign, bSign; 3245 3246 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3247 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3248 ) { 3249 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3250 float_raise( float_flag_invalid ); 3251 } 3252 return 0; 3253 } 3254 aSign = extractFloat64Sign( a ); 3255 bSign = extractFloat64Sign( b ); 3256 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 ); 3257 return ( a == b ) || ( aSign ^ ( a < b ) ); 3258 3259 } 3260 3261 /* 3262 ------------------------------------------------------------------------------- 3263 Returns 1 if the double-precision floating-point value `a' is less than 3264 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3265 exception. Otherwise, the comparison is performed according to the IEC/IEEE 3266 Standard for Binary Floating-Point Arithmetic. 3267 ------------------------------------------------------------------------------- 3268 */ 3269 flag float64_lt_quiet( float64 a, float64 b ) 3270 { 3271 flag aSign, bSign; 3272 3273 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3274 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3275 ) { 3276 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3277 float_raise( float_flag_invalid ); 3278 } 3279 return 0; 3280 } 3281 aSign = extractFloat64Sign( a ); 3282 bSign = extractFloat64Sign( b ); 3283 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 ); 3284 return ( a != b ) && ( aSign ^ ( a < b ) ); 3285 3286 } 3287 #endif 3288 3289 #ifdef FLOATX80 3290 #ifdef X80M68K 3291 flag floatx80_eq( floatx80 a, floatx80 b ) 3292 { 3293 return !floatx80_eq_internal(a, b); 3294 } 3295 3296 flag floatx80_le_quiet( floatx80 a, floatx80 b ) 3297 { 3298 return !floatx80_le_quiet_internal(a, b); 3299 } 3300 3301 flag floatx80_lt_quiet( floatx80 a, floatx80 b ) 3302 { 3303 return floatx80_lt_quiet_internal(a, b) ? -1 : 1; 3304 } 3305 3306 flag floatx80_le( floatx80 a, floatx80 b ) 3307 { 3308 return !floatx80_le_internal(a, b); 3309 } 3310 3311 flag floatx80_lt( floatx80 a, floatx80 b ) 3312 { 3313 return floatx80_lt_internal(a, b) ? -1 : 1; 3314 } 3315 3316 flag floatx80_eq_signaling( floatx80 a, floatx80 b ) 3317 { 3318 return !floatx80_eq_signaling_internal(a, b); 3319 } 3320 #else 3321 3322 #define floatx80_eq_internal floatx80_eq 3323 #define floatx80_le_internal floatx80_le 3324 #define floatx80_lt_internal floatx80_lt 3325 #define floatx80_le_quiet_internal floatx80_le_quiet 3326 #define floatx80_lt_quiet_internal floatx80_lt_quiet 3327 #define floatx80_eq_signaling_internal floatx80_eq_signaling 3328 3329 #endif 3330 3331 /* 3332 ------------------------------------------------------------------------------- 3333 Returns the result of converting the extended double-precision floating- 3334 point value `a' to the 32-bit two's complement integer format. The 3335 conversion is performed according to the IEC/IEEE Standard for Binary 3336 Floating-Point Arithmetic---which means in particular that the conversion 3337 is rounded according to the current rounding mode. If `a' is a NaN, the 3338 largest positive integer is returned. Otherwise, if the conversion 3339 overflows, the largest integer with the same sign as `a' is returned. 3340 ------------------------------------------------------------------------------- 3341 */ 3342 int32 floatx80_to_int32( floatx80 a ) 3343 { 3344 flag aSign; 3345 int32 aExp, shiftCount; 3346 bits64 aSig; 3347 3348 aSig = extractFloatx80Frac( a ); 3349 aExp = extractFloatx80Exp( a ); 3350 aSign = extractFloatx80Sign( a ); 3351 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3352 shiftCount = 0x4037 - aExp; 3353 if ( shiftCount <= 0 ) shiftCount = 1; 3354 shift64RightJamming( aSig, shiftCount, &aSig ); 3355 return roundAndPackInt32( aSign, aSig ); 3356 3357 } 3358 3359 /* 3360 ------------------------------------------------------------------------------- 3361 Returns the result of converting the extended double-precision floating- 3362 point value `a' to the 32-bit two's complement integer format. The 3363 conversion is performed according to the IEC/IEEE Standard for Binary 3364 Floating-Point Arithmetic, except that the conversion is always rounded 3365 toward zero. If `a' is a NaN, the largest positive integer is returned. 3366 Otherwise, if the conversion overflows, the largest integer with the same 3367 sign as `a' is returned. 3368 ------------------------------------------------------------------------------- 3369 */ 3370 int32 floatx80_to_int32_round_to_zero( floatx80 a ) 3371 { 3372 flag aSign; 3373 int32 aExp, shiftCount; 3374 bits64 aSig, savedASig; 3375 int32 z; 3376 3377 aSig = extractFloatx80Frac( a ); 3378 aExp = extractFloatx80Exp( a ); 3379 aSign = extractFloatx80Sign( a ); 3380 if ( 0x401E < aExp ) { 3381 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3382 goto invalid; 3383 } 3384 else if ( aExp < 0x3FFF ) { 3385 if ( aExp || aSig ) set_float_exception_inexact_flag(); 3386 return 0; 3387 } 3388 shiftCount = 0x403E - aExp; 3389 savedASig = aSig; 3390 aSig >>= shiftCount; 3391 z = aSig; 3392 if ( aSign ) z = - z; 3393 if ( ( z < 0 ) ^ aSign ) { 3394 invalid: 3395 float_raise( float_flag_invalid ); 3396 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 3397 } 3398 if ( ( aSig<<shiftCount ) != savedASig ) { 3399 set_float_exception_inexact_flag(); 3400 } 3401 return z; 3402 3403 } 3404 3405 /* 3406 ------------------------------------------------------------------------------- 3407 Returns the result of converting the extended double-precision floating- 3408 point value `a' to the 64-bit two's complement integer format. The 3409 conversion is performed according to the IEC/IEEE Standard for Binary 3410 Floating-Point Arithmetic---which means in particular that the conversion 3411 is rounded according to the current rounding mode. If `a' is a NaN, 3412 the largest positive integer is returned. Otherwise, if the conversion 3413 overflows, the largest integer with the same sign as `a' is returned. 3414 ------------------------------------------------------------------------------- 3415 */ 3416 int64 floatx80_to_int64( floatx80 a ) 3417 { 3418 flag aSign; 3419 int32 aExp, shiftCount; 3420 bits64 aSig, aSigExtra; 3421 3422 aSig = extractFloatx80Frac( a ); 3423 aExp = extractFloatx80Exp( a ); 3424 aSign = extractFloatx80Sign( a ); 3425 shiftCount = 0x403E - aExp; 3426 if ( shiftCount <= 0 ) { 3427 if ( shiftCount ) { 3428 float_raise( float_flag_invalid ); 3429 if ( ! aSign 3430 || ( ( aExp == 0x7FFF ) 3431 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 3432 ) { 3433 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3434 } 3435 return (sbits64) LIT64( 0x8000000000000000 ); 3436 } 3437 aSigExtra = 0; 3438 } 3439 else { 3440 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 3441 } 3442 return roundAndPackInt64( aSign, aSig, aSigExtra ); 3443 3444 } 3445 3446 /* 3447 ------------------------------------------------------------------------------- 3448 Returns the result of converting the extended double-precision floating- 3449 point value `a' to the 64-bit two's complement integer format. The 3450 conversion is performed according to the IEC/IEEE Standard for Binary 3451 Floating-Point Arithmetic, except that the conversion is always rounded 3452 toward zero. If `a' is a NaN, the largest positive integer is returned. 3453 Otherwise, if the conversion overflows, the largest integer with the same 3454 sign as `a' is returned. 3455 ------------------------------------------------------------------------------- 3456 */ 3457 int64 floatx80_to_int64_round_to_zero( floatx80 a ) 3458 { 3459 flag aSign; 3460 int32 aExp, shiftCount; 3461 bits64 aSig; 3462 int64 z; 3463 3464 aSig = extractFloatx80Frac( a ); 3465 aExp = extractFloatx80Exp( a ); 3466 aSign = extractFloatx80Sign( a ); 3467 shiftCount = aExp - 0x403E; 3468 if ( 0 <= shiftCount ) { 3469 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 3470 if ( ( (a.high>>X80SHIFT) != 0xC03E ) || aSig ) { 3471 float_raise( float_flag_invalid ); 3472 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 3473 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3474 } 3475 } 3476 return (sbits64) LIT64( 0x8000000000000000 ); 3477 } 3478 else if ( aExp < 0x3FFF ) { 3479 if ( aExp | aSig ) set_float_exception_inexact_flag(); 3480 return 0; 3481 } 3482 z = aSig>>( - shiftCount ); 3483 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 3484 set_float_exception_inexact_flag(); 3485 } 3486 if ( aSign ) z = - z; 3487 return z; 3488 3489 } 3490 3491 /* 3492 ------------------------------------------------------------------------------- 3493 Returns the result of converting the extended double-precision floating- 3494 point value `a' to the single-precision floating-point format. The 3495 conversion is performed according to the IEC/IEEE Standard for Binary 3496 Floating-Point Arithmetic. 3497 ------------------------------------------------------------------------------- 3498 */ 3499 float32 floatx80_to_float32( floatx80 a ) 3500 { 3501 flag aSign; 3502 int32 aExp; 3503 bits64 aSig; 3504 3505 aSig = extractFloatx80Frac( a ); 3506 aExp = extractFloatx80Exp( a ); 3507 aSign = extractFloatx80Sign( a ); 3508 if ( aExp == 0x7FFF ) { 3509 if ( (bits64) ( aSig<<1 ) ) { 3510 return commonNaNToFloat32( floatx80ToCommonNaN( a ) ); 3511 } 3512 return packFloat32( aSign, 0xFF, 0 ); 3513 } 3514 shift64RightJamming( aSig, 33, &aSig ); 3515 if ( aExp || aSig ) aExp -= 0x3F81; 3516 return roundAndPackFloat32( aSign, aExp, aSig ); 3517 3518 } 3519 3520 /* 3521 ------------------------------------------------------------------------------- 3522 Returns the result of converting the extended double-precision floating- 3523 point value `a' to the double-precision floating-point format. The 3524 conversion is performed according to the IEC/IEEE Standard for Binary 3525 Floating-Point Arithmetic. 3526 ------------------------------------------------------------------------------- 3527 */ 3528 float64 floatx80_to_float64( floatx80 a ) 3529 { 3530 flag aSign; 3531 int32 aExp; 3532 bits64 aSig, zSig; 3533 3534 aSig = extractFloatx80Frac( a ); 3535 aExp = extractFloatx80Exp( a ); 3536 aSign = extractFloatx80Sign( a ); 3537 if ( aExp == 0x7FFF ) { 3538 if ( (bits64) ( aSig<<1 ) ) { 3539 return commonNaNToFloat64( floatx80ToCommonNaN( a ) ); 3540 } 3541 return packFloat64( aSign, 0x7FF, 0 ); 3542 } 3543 shift64RightJamming( aSig, 1, &zSig ); 3544 if ( aExp || aSig ) aExp -= 0x3C01; 3545 return roundAndPackFloat64( aSign, aExp, zSig ); 3546 3547 } 3548 3549 #ifdef FLOAT128 3550 3551 /* 3552 ------------------------------------------------------------------------------- 3553 Returns the result of converting the extended double-precision floating- 3554 point value `a' to the quadruple-precision floating-point format. The 3555 conversion is performed according to the IEC/IEEE Standard for Binary 3556 Floating-Point Arithmetic. 3557 ------------------------------------------------------------------------------- 3558 */ 3559 float128 floatx80_to_float128( floatx80 a ) 3560 { 3561 flag aSign; 3562 int16 aExp; 3563 bits64 aSig, zSig0, zSig1; 3564 3565 aSig = extractFloatx80Frac( a ); 3566 aExp = extractFloatx80Exp( a ); 3567 aSign = extractFloatx80Sign( a ); 3568 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) { 3569 return commonNaNToFloat128( floatx80ToCommonNaN( a ) ); 3570 } 3571 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 3572 return packFloat128( aSign, aExp, zSig0, zSig1 ); 3573 3574 } 3575 3576 #endif 3577 3578 /* 3579 ------------------------------------------------------------------------------- 3580 Rounds the extended double-precision floating-point value `a' to an integer, 3581 and returns the result as an extended quadruple-precision floating-point 3582 value. The operation is performed according to the IEC/IEEE Standard for 3583 Binary Floating-Point Arithmetic. 3584 ------------------------------------------------------------------------------- 3585 */ 3586 floatx80 floatx80_round_to_int( floatx80 a ) 3587 { 3588 flag aSign; 3589 int32 aExp; 3590 bits64 lastBitMask, roundBitsMask; 3591 int8 roundingMode; 3592 floatx80 z; 3593 3594 aExp = extractFloatx80Exp( a ); 3595 if ( 0x403E <= aExp ) { 3596 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) { 3597 return propagateFloatx80NaN( a, a ); 3598 } 3599 return a; 3600 } 3601 if ( aExp < 0x3FFF ) { 3602 if ( ( aExp == 0 ) 3603 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 3604 return a; 3605 } 3606 set_float_exception_inexact_flag(); 3607 aSign = extractFloatx80Sign( a ); 3608 switch ( float_rounding_mode ) { 3609 case float_round_nearest_even: 3610 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 ) 3611 ) { 3612 return 3613 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3614 } 3615 break; 3616 case float_round_to_zero: 3617 break; 3618 case float_round_down: 3619 return 3620 aSign ? 3621 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 3622 : packFloatx80( 0, 0, 0 ); 3623 case float_round_up: 3624 return 3625 aSign ? packFloatx80( 1, 0, 0 ) 3626 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3627 } 3628 return packFloatx80( aSign, 0, 0 ); 3629 } 3630 lastBitMask = 1; 3631 lastBitMask <<= 0x403E - aExp; 3632 roundBitsMask = lastBitMask - 1; 3633 z = a; 3634 roundingMode = float_rounding_mode; 3635 if ( roundingMode == float_round_nearest_even ) { 3636 z.low += lastBitMask>>1; 3637 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 3638 } 3639 else if ( roundingMode != float_round_to_zero ) { 3640 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) { 3641 z.low += roundBitsMask; 3642 } 3643 } 3644 z.low &= ~ roundBitsMask; 3645 if ( z.low == 0 ) { 3646 ++z.high; 3647 z.low = LIT64( 0x8000000000000000 ); 3648 } 3649 z.high <<= X80SHIFT; 3650 if ( z.low != a.low ) set_float_exception_inexact_flag(); 3651 return z; 3652 3653 } 3654 3655 /* 3656 ------------------------------------------------------------------------------- 3657 Returns the result of adding the absolute values of the extended double- 3658 precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 3659 negated before being returned. `zSign' is ignored if the result is a NaN. 3660 The addition is performed according to the IEC/IEEE Standard for Binary 3661 Floating-Point Arithmetic. 3662 ------------------------------------------------------------------------------- 3663 */ 3664 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) 3665 { 3666 int32 aExp, bExp, zExp; 3667 bits64 aSig, bSig, zSig0, zSig1; 3668 int32 expDiff; 3669 3670 aSig = extractFloatx80Frac( a ); 3671 aExp = extractFloatx80Exp( a ); 3672 bSig = extractFloatx80Frac( b ); 3673 bExp = extractFloatx80Exp( b ); 3674 expDiff = aExp - bExp; 3675 if ( 0 < expDiff ) { 3676 if ( aExp == 0x7FFF ) { 3677 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3678 return a; 3679 } 3680 if ( bExp == 0 ) --expDiff; 3681 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3682 zExp = aExp; 3683 } 3684 else if ( expDiff < 0 ) { 3685 if ( bExp == 0x7FFF ) { 3686 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3687 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3688 } 3689 if ( aExp == 0 ) ++expDiff; 3690 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3691 zExp = bExp; 3692 } 3693 else { 3694 if ( aExp == 0x7FFF ) { 3695 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3696 return propagateFloatx80NaN( a, b ); 3697 } 3698 return a; 3699 } 3700 zSig1 = 0; 3701 zSig0 = aSig + bSig; 3702 if ( aExp == 0 ) { 3703 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 3704 goto roundAndPack; 3705 } 3706 zExp = aExp; 3707 goto shiftRight1; 3708 } 3709 zSig0 = aSig + bSig; 3710 if ( (sbits64) zSig0 < 0 ) goto roundAndPack; 3711 shiftRight1: 3712 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3713 zSig0 |= LIT64( 0x8000000000000000 ); 3714 ++zExp; 3715 roundAndPack: 3716 return 3717 roundAndPackFloatx80( 3718 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3719 3720 } 3721 3722 /* 3723 ------------------------------------------------------------------------------- 3724 Returns the result of subtracting the absolute values of the extended 3725 double-precision floating-point values `a' and `b'. If `zSign' is 1, the 3726 difference is negated before being returned. `zSign' is ignored if the 3727 result is a NaN. The subtraction is performed according to the IEC/IEEE 3728 Standard for Binary Floating-Point Arithmetic. 3729 ------------------------------------------------------------------------------- 3730 */ 3731 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) 3732 { 3733 int32 aExp, bExp, zExp; 3734 bits64 aSig, bSig, zSig0, zSig1; 3735 int32 expDiff; 3736 floatx80 z; 3737 3738 aSig = extractFloatx80Frac( a ); 3739 aExp = extractFloatx80Exp( a ); 3740 bSig = extractFloatx80Frac( b ); 3741 bExp = extractFloatx80Exp( b ); 3742 expDiff = aExp - bExp; 3743 if ( 0 < expDiff ) goto aExpBigger; 3744 if ( expDiff < 0 ) goto bExpBigger; 3745 if ( aExp == 0x7FFF ) { 3746 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3747 return propagateFloatx80NaN( a, b ); 3748 } 3749 float_raise( float_flag_invalid ); 3750 z.low = floatx80_default_nan_low; 3751 z.high = floatx80_default_nan_high<<X80SHIFT; 3752 return z; 3753 } 3754 if ( aExp == 0 ) { 3755 aExp = 1; 3756 bExp = 1; 3757 } 3758 zSig1 = 0; 3759 if ( bSig < aSig ) goto aBigger; 3760 if ( aSig < bSig ) goto bBigger; 3761 return packFloatx80( float_rounding_mode == float_round_down, 0, 0 ); 3762 bExpBigger: 3763 if ( bExp == 0x7FFF ) { 3764 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3765 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3766 } 3767 if ( aExp == 0 ) ++expDiff; 3768 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3769 bBigger: 3770 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 3771 zExp = bExp; 3772 zSign ^= 1; 3773 goto normalizeRoundAndPack; 3774 aExpBigger: 3775 if ( aExp == 0x7FFF ) { 3776 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3777 return a; 3778 } 3779 if ( bExp == 0 ) --expDiff; 3780 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3781 aBigger: 3782 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 3783 zExp = aExp; 3784 normalizeRoundAndPack: 3785 return 3786 normalizeRoundAndPackFloatx80( 3787 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3788 3789 } 3790 3791 /* 3792 ------------------------------------------------------------------------------- 3793 Returns the result of adding the extended double-precision floating-point 3794 values `a' and `b'. The operation is performed according to the IEC/IEEE 3795 Standard for Binary Floating-Point Arithmetic. 3796 ------------------------------------------------------------------------------- 3797 */ 3798 floatx80 floatx80_add( floatx80 a, floatx80 b ) 3799 { 3800 flag aSign, bSign; 3801 3802 aSign = extractFloatx80Sign( a ); 3803 bSign = extractFloatx80Sign( b ); 3804 if ( aSign == bSign ) { 3805 return addFloatx80Sigs( a, b, aSign ); 3806 } 3807 else { 3808 return subFloatx80Sigs( a, b, aSign ); 3809 } 3810 3811 } 3812 3813 /* 3814 ------------------------------------------------------------------------------- 3815 Returns the result of subtracting the extended double-precision floating- 3816 point values `a' and `b'. The operation is performed according to the 3817 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3818 ------------------------------------------------------------------------------- 3819 */ 3820 floatx80 floatx80_sub( floatx80 a, floatx80 b ) 3821 { 3822 flag aSign, bSign; 3823 3824 aSign = extractFloatx80Sign( a ); 3825 bSign = extractFloatx80Sign( b ); 3826 if ( aSign == bSign ) { 3827 return subFloatx80Sigs( a, b, aSign ); 3828 } 3829 else { 3830 return addFloatx80Sigs( a, b, aSign ); 3831 } 3832 3833 } 3834 3835 /* 3836 ------------------------------------------------------------------------------- 3837 Returns the result of multiplying the extended double-precision floating- 3838 point values `a' and `b'. The operation is performed according to the 3839 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3840 ------------------------------------------------------------------------------- 3841 */ 3842 floatx80 floatx80_mul( floatx80 a, floatx80 b ) 3843 { 3844 flag aSign, bSign, zSign; 3845 int32 aExp, bExp, zExp; 3846 bits64 aSig, bSig, zSig0, zSig1; 3847 floatx80 z; 3848 3849 aSig = extractFloatx80Frac( a ); 3850 aExp = extractFloatx80Exp( a ); 3851 aSign = extractFloatx80Sign( a ); 3852 bSig = extractFloatx80Frac( b ); 3853 bExp = extractFloatx80Exp( b ); 3854 bSign = extractFloatx80Sign( b ); 3855 zSign = aSign ^ bSign; 3856 if ( aExp == 0x7FFF ) { 3857 if ( (bits64) ( aSig<<1 ) 3858 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3859 return propagateFloatx80NaN( a, b ); 3860 } 3861 if ( ( bExp | bSig ) == 0 ) goto invalid; 3862 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3863 } 3864 if ( bExp == 0x7FFF ) { 3865 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3866 if ( ( aExp | aSig ) == 0 ) { 3867 invalid: 3868 float_raise( float_flag_invalid ); 3869 z.low = floatx80_default_nan_low; 3870 z.high = floatx80_default_nan_high<<X80SHIFT; 3871 return z; 3872 } 3873 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3874 } 3875 if ( aExp == 0 ) { 3876 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3877 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3878 } 3879 if ( bExp == 0 ) { 3880 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3881 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3882 } 3883 zExp = aExp + bExp - 0x3FFE; 3884 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 3885 if ( 0 < (sbits64) zSig0 ) { 3886 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3887 --zExp; 3888 } 3889 return 3890 roundAndPackFloatx80( 3891 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3892 3893 } 3894 3895 /* 3896 ------------------------------------------------------------------------------- 3897 Returns the result of dividing the extended double-precision floating-point 3898 value `a' by the corresponding value `b'. The operation is performed 3899 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3900 ------------------------------------------------------------------------------- 3901 */ 3902 floatx80 floatx80_div( floatx80 a, floatx80 b ) 3903 { 3904 flag aSign, bSign, zSign; 3905 int32 aExp, bExp, zExp; 3906 bits64 aSig, bSig, zSig0, zSig1; 3907 bits64 rem0, rem1, rem2, term0, term1, term2; 3908 floatx80 z; 3909 3910 aSig = extractFloatx80Frac( a ); 3911 aExp = extractFloatx80Exp( a ); 3912 aSign = extractFloatx80Sign( a ); 3913 bSig = extractFloatx80Frac( b ); 3914 bExp = extractFloatx80Exp( b ); 3915 bSign = extractFloatx80Sign( b ); 3916 zSign = aSign ^ bSign; 3917 if ( aExp == 0x7FFF ) { 3918 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3919 if ( bExp == 0x7FFF ) { 3920 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3921 goto invalid; 3922 } 3923 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3924 } 3925 if ( bExp == 0x7FFF ) { 3926 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3927 return packFloatx80( zSign, 0, 0 ); 3928 } 3929 if ( bExp == 0 ) { 3930 if ( bSig == 0 ) { 3931 if ( ( aExp | aSig ) == 0 ) { 3932 invalid: 3933 float_raise( float_flag_invalid ); 3934 z.low = floatx80_default_nan_low; 3935 z.high = floatx80_default_nan_high<<X80SHIFT; 3936 return z; 3937 } 3938 float_raise( float_flag_divbyzero ); 3939 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3940 } 3941 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3942 } 3943 if ( aExp == 0 ) { 3944 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3945 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3946 } 3947 zExp = aExp - bExp + 0x3FFE; 3948 rem1 = 0; 3949 if ( bSig <= aSig ) { 3950 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 3951 ++zExp; 3952 } 3953 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 3954 mul64To128( bSig, zSig0, &term0, &term1 ); 3955 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 3956 while ( (sbits64) rem0 < 0 ) { 3957 --zSig0; 3958 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 3959 } 3960 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 3961 if ( (bits64) ( zSig1<<1 ) <= 8 ) { 3962 mul64To128( bSig, zSig1, &term1, &term2 ); 3963 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 3964 while ( (sbits64) rem1 < 0 ) { 3965 --zSig1; 3966 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 3967 } 3968 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 3969 } 3970 return 3971 roundAndPackFloatx80( 3972 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3973 3974 } 3975 3976 /* 3977 ------------------------------------------------------------------------------- 3978 Returns the remainder of the extended double-precision floating-point value 3979 `a' with respect to the corresponding value `b'. The operation is performed 3980 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3981 ------------------------------------------------------------------------------- 3982 */ 3983 floatx80 floatx80_rem( floatx80 a, floatx80 b ) 3984 { 3985 flag aSign, zSign; 3986 int32 aExp, bExp, expDiff; 3987 bits64 aSig0, aSig1, bSig; 3988 bits64 q, term0, term1, alternateASig0, alternateASig1; 3989 floatx80 z; 3990 3991 aSig0 = extractFloatx80Frac( a ); 3992 aExp = extractFloatx80Exp( a ); 3993 aSign = extractFloatx80Sign( a ); 3994 bSig = extractFloatx80Frac( b ); 3995 bExp = extractFloatx80Exp( b ); 3996 3997 if ( aExp == 0x7FFF ) { 3998 if ( (bits64) ( aSig0<<1 ) 3999 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 4000 return propagateFloatx80NaN( a, b ); 4001 } 4002 goto invalid; 4003 } 4004 if ( bExp == 0x7FFF ) { 4005 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 4006 return a; 4007 } 4008 if ( bExp == 0 ) { 4009 if ( bSig == 0 ) { 4010 invalid: 4011 float_raise( float_flag_invalid ); 4012 z.low = floatx80_default_nan_low; 4013 z.high = floatx80_default_nan_high<<X80SHIFT; 4014 return z; 4015 } 4016 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 4017 } 4018 if ( aExp == 0 ) { 4019 if ( (bits64) ( aSig0<<1 ) == 0 ) return a; 4020 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 4021 } 4022 bSig |= LIT64( 0x8000000000000000 ); 4023 zSign = aSign; 4024 expDiff = aExp - bExp; 4025 aSig1 = 0; 4026 if ( expDiff < 0 ) { 4027 if ( expDiff < -1 ) return a; 4028 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 4029 expDiff = 0; 4030 } 4031 q = ( bSig <= aSig0 ); 4032 if ( q ) aSig0 -= bSig; 4033 expDiff -= 64; 4034 while ( 0 < expDiff ) { 4035 q = estimateDiv128To64( aSig0, aSig1, bSig ); 4036 q = ( 2 < q ) ? q - 2 : 0; 4037 mul64To128( bSig, q, &term0, &term1 ); 4038 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 4039 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 4040 expDiff -= 62; 4041 } 4042 expDiff += 64; 4043 if ( 0 < expDiff ) { 4044 q = estimateDiv128To64( aSig0, aSig1, bSig ); 4045 q = ( 2 < q ) ? q - 2 : 0; 4046 q >>= 64 - expDiff; 4047 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 4048 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 4049 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 4050 while ( le128( term0, term1, aSig0, aSig1 ) ) { 4051 ++q; 4052 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 4053 } 4054 } 4055 else { 4056 term1 = 0; 4057 term0 = bSig; 4058 } 4059 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 4060 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 4061 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 4062 && ( q & 1 ) ) 4063 ) { 4064 aSig0 = alternateASig0; 4065 aSig1 = alternateASig1; 4066 zSign = ! zSign; 4067 } 4068 return 4069 normalizeRoundAndPackFloatx80( 4070 80, zSign, bExp + expDiff, aSig0, aSig1 ); 4071 4072 } 4073 4074 /* 4075 ------------------------------------------------------------------------------- 4076 Returns the square root of the extended double-precision floating-point 4077 value `a'. The operation is performed according to the IEC/IEEE Standard 4078 for Binary Floating-Point Arithmetic. 4079 ------------------------------------------------------------------------------- 4080 */ 4081 floatx80 floatx80_sqrt( floatx80 a ) 4082 { 4083 flag aSign; 4084 int32 aExp, zExp; 4085 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0; 4086 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 4087 floatx80 z; 4088 4089 aSig0 = extractFloatx80Frac( a ); 4090 aExp = extractFloatx80Exp( a ); 4091 aSign = extractFloatx80Sign( a ); 4092 if ( aExp == 0x7FFF ) { 4093 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a ); 4094 if ( ! aSign ) return a; 4095 goto invalid; 4096 } 4097 if ( aSign ) { 4098 if ( ( aExp | aSig0 ) == 0 ) return a; 4099 invalid: 4100 float_raise( float_flag_invalid ); 4101 z.low = floatx80_default_nan_low; 4102 z.high = floatx80_default_nan_high<<X80SHIFT; 4103 return z; 4104 } 4105 if ( aExp == 0 ) { 4106 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 4107 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 4108 } 4109 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 4110 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 4111 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 4112 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 4113 doubleZSig0 = zSig0<<1; 4114 mul64To128( zSig0, zSig0, &term0, &term1 ); 4115 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 4116 while ( (sbits64) rem0 < 0 ) { 4117 --zSig0; 4118 doubleZSig0 -= 2; 4119 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 4120 } 4121 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 4122 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 4123 if ( zSig1 == 0 ) zSig1 = 1; 4124 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 4125 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 4126 mul64To128( zSig1, zSig1, &term2, &term3 ); 4127 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 4128 while ( (sbits64) rem1 < 0 ) { 4129 --zSig1; 4130 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 4131 term3 |= 1; 4132 term2 |= doubleZSig0; 4133 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 4134 } 4135 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 4136 } 4137 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 4138 zSig0 |= doubleZSig0; 4139 return 4140 roundAndPackFloatx80( 4141 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 ); 4142 4143 } 4144 4145 /* 4146 ------------------------------------------------------------------------------- 4147 Returns 1 if the extended double-precision floating-point value `a' is 4148 equal to the corresponding value `b', and 0 otherwise. The comparison is 4149 performed according to the IEC/IEEE Standard for Binary Floating-Point 4150 Arithmetic. 4151 ------------------------------------------------------------------------------- 4152 */ 4153 flag floatx80_eq_internal( floatx80 a, floatx80 b ) 4154 { 4155 4156 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4157 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4158 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4159 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4160 ) { 4161 if ( floatx80_is_signaling_nan( a ) 4162 || floatx80_is_signaling_nan( b ) ) { 4163 float_raise( float_flag_invalid ); 4164 } 4165 return 0; 4166 } 4167 return 4168 ( a.low == b.low ) 4169 && ( ( a.high == b.high ) 4170 || ( ( a.low == 0 ) 4171 && ( (bits16) ( ((bits32)( a.high | b.high )>>X80SHIFT)<<1 ) 4172 == 0 ) ) ); 4173 4174 } 4175 4176 /* 4177 ------------------------------------------------------------------------------- 4178 Returns 1 if the extended double-precision floating-point value `a' is 4179 less than or equal to the corresponding value `b', and 0 otherwise. The 4180 comparison is performed according to the IEC/IEEE Standard for Binary 4181 Floating-Point Arithmetic. 4182 ------------------------------------------------------------------------------- 4183 */ 4184 flag floatx80_le_internal( floatx80 a, floatx80 b ) 4185 { 4186 flag aSign, bSign; 4187 4188 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4189 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4190 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4191 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4192 ) { 4193 float_raise( float_flag_invalid ); 4194 return 0; 4195 } 4196 aSign = extractFloatx80Sign( a ); 4197 bSign = extractFloatx80Sign( b ); 4198 if ( aSign != bSign ) { 4199 return 4200 aSign 4201 || ( ( ( (bits16) ( ((bits32)( a.high | b.high )>>X80SHIFT) 4202 <<1 ) ) | a.low | b.low ) == 0 ); 4203 } 4204 return 4205 aSign ? le128( b.high>>X80SHIFT, b.low, a.high>>X80SHIFT, a.low ) 4206 : le128( a.high>>X80SHIFT, a.low, b.high>>X80SHIFT, b.low ); 4207 4208 } 4209 4210 /* 4211 ------------------------------------------------------------------------------- 4212 Returns 1 if the extended double-precision floating-point value `a' is 4213 less than the corresponding value `b', and 0 otherwise. The comparison 4214 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4215 Arithmetic. 4216 ------------------------------------------------------------------------------- 4217 */ 4218 flag floatx80_lt_internal( floatx80 a, floatx80 b ) 4219 { 4220 flag aSign, bSign; 4221 4222 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4223 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4224 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4225 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4226 ) { 4227 float_raise( float_flag_invalid ); 4228 return 0; 4229 } 4230 aSign = extractFloatx80Sign( a ); 4231 bSign = extractFloatx80Sign( b ); 4232 if ( aSign != bSign ) { 4233 return 4234 aSign 4235 && ( ( ( (bits16) ( ((bits32)( a.high | b.high )>>X80SHIFT) 4236 <<1 ) ) | a.low | b.low ) != 0 ); 4237 } 4238 return 4239 aSign ? lt128( b.high>>X80SHIFT, b.low, a.high>>X80SHIFT, a.low ) 4240 : lt128( a.high>>X80SHIFT, a.low, b.high>>X80SHIFT, b.low ); 4241 4242 } 4243 4244 /* 4245 ------------------------------------------------------------------------------- 4246 Returns 1 if the extended double-precision floating-point value `a' is equal 4247 to the corresponding value `b', and 0 otherwise. The invalid exception is 4248 raised if either operand is a NaN. Otherwise, the comparison is performed 4249 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4250 ------------------------------------------------------------------------------- 4251 */ 4252 flag floatx80_eq_signaling_internal( floatx80 a, floatx80 b ) 4253 { 4254 4255 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4256 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4257 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4258 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4259 ) { 4260 float_raise( float_flag_invalid ); 4261 return 0; 4262 } 4263 return 4264 ( a.low == b.low ) 4265 && ( ( (a.high>>X80SHIFT) == (b.high>>X80SHIFT) ) 4266 || ( ( a.low == 0 ) 4267 && ( (bits16) ( ((bits32)( a.high | b.high )>>X80SHIFT)<<1 ) 4268 == 0 ) ) ); 4269 4270 } 4271 4272 /* 4273 ------------------------------------------------------------------------------- 4274 Returns 1 if the extended double-precision floating-point value `a' is less 4275 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 4276 do not cause an exception. Otherwise, the comparison is performed according 4277 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4278 ------------------------------------------------------------------------------- 4279 */ 4280 flag floatx80_le_quiet_internal( floatx80 a, floatx80 b ) 4281 { 4282 flag aSign, bSign; 4283 4284 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4285 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4286 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4287 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4288 ) { 4289 if ( floatx80_is_signaling_nan( a ) 4290 || floatx80_is_signaling_nan( b ) ) { 4291 float_raise( float_flag_invalid ); 4292 } 4293 return 0; 4294 } 4295 aSign = extractFloatx80Sign( a ); 4296 bSign = extractFloatx80Sign( b ); 4297 if ( aSign != bSign ) { 4298 return 4299 aSign 4300 || ( ( ( (bits16) ( ((bits32)( a.high | b.high )>>X80SHIFT)<<1 4301 ) ) | a.low | b.low ) == 0 ); 4302 } 4303 return 4304 aSign ? le128( b.high>>X80SHIFT, b.low, a.high>>X80SHIFT, a.low ) 4305 : le128( a.high>>X80SHIFT, a.low, b.high>>X80SHIFT, b.low ); 4306 4307 } 4308 4309 /* 4310 ------------------------------------------------------------------------------- 4311 Returns 1 if the extended double-precision floating-point value `a' is less 4312 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 4313 an exception. Otherwise, the comparison is performed according to the 4314 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4315 ------------------------------------------------------------------------------- 4316 */ 4317 flag floatx80_lt_quiet_internal( floatx80 a, floatx80 b ) 4318 { 4319 flag aSign, bSign; 4320 4321 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4322 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4323 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4324 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4325 ) { 4326 if ( floatx80_is_signaling_nan( a ) 4327 || floatx80_is_signaling_nan( b ) ) { 4328 float_raise( float_flag_invalid ); 4329 } 4330 return 0; 4331 } 4332 aSign = extractFloatx80Sign( a ); 4333 bSign = extractFloatx80Sign( b ); 4334 if ( aSign != bSign ) { 4335 return 4336 aSign 4337 && ( ( ( (bits16) ( ((bits32)( a.high | b.high )>>X80SHIFT)<<1 ) 4338 ) | a.low | b.low ) != 0 ); 4339 } 4340 return 4341 aSign ? lt128( b.high>>X80SHIFT, b.low, a.high>>X80SHIFT, a.low ) 4342 : lt128( a.high>>X80SHIFT, a.low, b.high>>X80SHIFT, b.low ); 4343 4344 } 4345 4346 #endif 4347 4348 #ifdef FLOAT128 4349 4350 /* 4351 ------------------------------------------------------------------------------- 4352 Returns the result of converting the quadruple-precision floating-point 4353 value `a' to the 32-bit two's complement integer format. The conversion 4354 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4355 Arithmetic---which means in particular that the conversion is rounded 4356 according to the current rounding mode. If `a' is a NaN, the largest 4357 positive integer is returned. Otherwise, if the conversion overflows, the 4358 largest integer with the same sign as `a' is returned. 4359 ------------------------------------------------------------------------------- 4360 */ 4361 int32 float128_to_int32( float128 a ) 4362 { 4363 flag aSign; 4364 int32 aExp, shiftCount; 4365 bits64 aSig0, aSig1; 4366 4367 aSig1 = extractFloat128Frac1( a ); 4368 aSig0 = extractFloat128Frac0( a ); 4369 aExp = extractFloat128Exp( a ); 4370 aSign = extractFloat128Sign( a ); 4371 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 4372 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4373 aSig0 |= ( aSig1 != 0 ); 4374 shiftCount = 0x4028 - aExp; 4375 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 4376 return roundAndPackInt32( aSign, aSig0 ); 4377 4378 } 4379 4380 /* 4381 ------------------------------------------------------------------------------- 4382 Returns the result of converting the quadruple-precision floating-point 4383 value `a' to the 32-bit two's complement integer format. The conversion 4384 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4385 Arithmetic, except that the conversion is always rounded toward zero. If 4386 `a' is a NaN, the largest positive integer is returned. Otherwise, if the 4387 conversion overflows, the largest integer with the same sign as `a' is 4388 returned. 4389 ------------------------------------------------------------------------------- 4390 */ 4391 int32 float128_to_int32_round_to_zero( float128 a ) 4392 { 4393 flag aSign; 4394 int32 aExp, shiftCount; 4395 bits64 aSig0, aSig1, savedASig; 4396 int32 z; 4397 4398 aSig1 = extractFloat128Frac1( a ); 4399 aSig0 = extractFloat128Frac0( a ); 4400 aExp = extractFloat128Exp( a ); 4401 aSign = extractFloat128Sign( a ); 4402 aSig0 |= ( aSig1 != 0 ); 4403 if ( 0x401E < aExp ) { 4404 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 4405 goto invalid; 4406 } 4407 else if ( aExp < 0x3FFF ) { 4408 if ( aExp || aSig0 ) set_float_exception_inexact_flag(); 4409 return 0; 4410 } 4411 aSig0 |= LIT64( 0x0001000000000000 ); 4412 shiftCount = 0x402F - aExp; 4413 savedASig = aSig0; 4414 aSig0 >>= shiftCount; 4415 z = (int32)aSig0; 4416 if ( aSign ) z = - z; 4417 if ( ( z < 0 ) ^ aSign ) { 4418 invalid: 4419 float_raise( float_flag_invalid ); 4420 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 4421 } 4422 if ( ( aSig0<<shiftCount ) != savedASig ) { 4423 set_float_exception_inexact_flag(); 4424 } 4425 return z; 4426 4427 } 4428 4429 /* 4430 ------------------------------------------------------------------------------- 4431 Returns the result of converting the quadruple-precision floating-point 4432 value `a' to the 64-bit two's complement integer format. The conversion 4433 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4434 Arithmetic---which means in particular that the conversion is rounded 4435 according to the current rounding mode. If `a' is a NaN, the largest 4436 positive integer is returned. Otherwise, if the conversion overflows, the 4437 largest integer with the same sign as `a' is returned. 4438 ------------------------------------------------------------------------------- 4439 */ 4440 int64 float128_to_int64( float128 a ) 4441 { 4442 flag aSign; 4443 int32 aExp, shiftCount; 4444 bits64 aSig0, aSig1; 4445 4446 aSig1 = extractFloat128Frac1( a ); 4447 aSig0 = extractFloat128Frac0( a ); 4448 aExp = extractFloat128Exp( a ); 4449 aSign = extractFloat128Sign( a ); 4450 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4451 shiftCount = 0x402F - aExp; 4452 if ( shiftCount <= 0 ) { 4453 if ( 0x403E < aExp ) { 4454 float_raise( float_flag_invalid ); 4455 if ( ! aSign 4456 || ( ( aExp == 0x7FFF ) 4457 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 4458 ) 4459 ) { 4460 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4461 } 4462 return (sbits64) LIT64( 0x8000000000000000 ); 4463 } 4464 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 4465 } 4466 else { 4467 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 4468 } 4469 return roundAndPackInt64( aSign, aSig0, aSig1 ); 4470 4471 } 4472 4473 /* 4474 ------------------------------------------------------------------------------- 4475 Returns the result of converting the quadruple-precision floating-point 4476 value `a' to the 64-bit two's complement integer format. The conversion 4477 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4478 Arithmetic, except that the conversion is always rounded toward zero. 4479 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 4480 the conversion overflows, the largest integer with the same sign as `a' is 4481 returned. 4482 ------------------------------------------------------------------------------- 4483 */ 4484 int64 float128_to_int64_round_to_zero( float128 a ) 4485 { 4486 flag aSign; 4487 int32 aExp, shiftCount; 4488 bits64 aSig0, aSig1; 4489 int64 z; 4490 4491 aSig1 = extractFloat128Frac1( a ); 4492 aSig0 = extractFloat128Frac0( a ); 4493 aExp = extractFloat128Exp( a ); 4494 aSign = extractFloat128Sign( a ); 4495 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4496 shiftCount = aExp - 0x402F; 4497 if ( 0 < shiftCount ) { 4498 if ( 0x403E <= aExp ) { 4499 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 4500 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 4501 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 4502 if ( aSig1 ) set_float_exception_inexact_flag(); 4503 } 4504 else { 4505 float_raise( float_flag_invalid ); 4506 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 4507 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4508 } 4509 } 4510 return (sbits64) LIT64( 0x8000000000000000 ); 4511 } 4512 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 4513 if ( (bits64) ( aSig1<<shiftCount ) ) { 4514 set_float_exception_inexact_flag(); 4515 } 4516 } 4517 else { 4518 if ( aExp < 0x3FFF ) { 4519 if ( aExp | aSig0 | aSig1 ) { 4520 set_float_exception_inexact_flag(); 4521 } 4522 return 0; 4523 } 4524 z = aSig0>>( - shiftCount ); 4525 if ( aSig1 4526 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) { 4527 set_float_exception_inexact_flag(); 4528 } 4529 } 4530 if ( aSign ) z = - z; 4531 return z; 4532 4533 } 4534 4535 #if (defined(SOFTFLOATSPARC64_FOR_GCC) || defined(SOFTFLOAT_FOR_GCC)) \ 4536 && defined(SOFTFLOAT_NEED_FIXUNS) 4537 /* 4538 * just like above - but do not care for overflow of signed results 4539 */ 4540 uint64 float128_to_uint64_round_to_zero( float128 a ) 4541 { 4542 flag aSign; 4543 int32 aExp, shiftCount; 4544 bits64 aSig0, aSig1; 4545 uint64 z; 4546 4547 aSig1 = extractFloat128Frac1( a ); 4548 aSig0 = extractFloat128Frac0( a ); 4549 aExp = extractFloat128Exp( a ); 4550 aSign = extractFloat128Sign( a ); 4551 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4552 shiftCount = aExp - 0x402F; 4553 if ( 0 < shiftCount ) { 4554 if ( 0x403F <= aExp ) { 4555 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 4556 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 4557 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 4558 if ( aSig1 ) set_float_exception_inexact_flag(); 4559 } 4560 else { 4561 float_raise( float_flag_invalid ); 4562 } 4563 return LIT64( 0xFFFFFFFFFFFFFFFF ); 4564 } 4565 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 4566 if ( (bits64) ( aSig1<<shiftCount ) ) { 4567 set_float_exception_inexact_flag(); 4568 } 4569 } 4570 else { 4571 if ( aExp < 0x3FFF ) { 4572 if ( aExp | aSig0 | aSig1 ) { 4573 set_float_exception_inexact_flag(); 4574 } 4575 return 0; 4576 } 4577 z = aSig0>>( - shiftCount ); 4578 if (aSig1 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) { 4579 set_float_exception_inexact_flag(); 4580 } 4581 } 4582 if ( aSign ) z = - z; 4583 return z; 4584 4585 } 4586 #endif /* (SOFTFLOATSPARC64_FOR_GCC || SOFTFLOAT_FOR_GCC) && SOFTFLOAT_NEED_FIXUNS */ 4587 4588 /* 4589 ------------------------------------------------------------------------------- 4590 Returns the result of converting the quadruple-precision floating-point 4591 value `a' to the single-precision floating-point format. The conversion 4592 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4593 Arithmetic. 4594 ------------------------------------------------------------------------------- 4595 */ 4596 float32 float128_to_float32( float128 a ) 4597 { 4598 flag aSign; 4599 int32 aExp; 4600 bits64 aSig0, aSig1; 4601 bits32 zSig; 4602 4603 aSig1 = extractFloat128Frac1( a ); 4604 aSig0 = extractFloat128Frac0( a ); 4605 aExp = extractFloat128Exp( a ); 4606 aSign = extractFloat128Sign( a ); 4607 if ( aExp == 0x7FFF ) { 4608 if ( aSig0 | aSig1 ) { 4609 return commonNaNToFloat32( float128ToCommonNaN( a ) ); 4610 } 4611 return packFloat32( aSign, 0xFF, 0 ); 4612 } 4613 aSig0 |= ( aSig1 != 0 ); 4614 shift64RightJamming( aSig0, 18, &aSig0 ); 4615 zSig = (bits32)aSig0; 4616 if ( aExp || zSig ) { 4617 zSig |= 0x40000000; 4618 aExp -= 0x3F81; 4619 } 4620 return roundAndPackFloat32( aSign, aExp, zSig ); 4621 4622 } 4623 4624 /* 4625 ------------------------------------------------------------------------------- 4626 Returns the result of converting the quadruple-precision floating-point 4627 value `a' to the double-precision floating-point format. The conversion 4628 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4629 Arithmetic. 4630 ------------------------------------------------------------------------------- 4631 */ 4632 float64 float128_to_float64( float128 a ) 4633 { 4634 flag aSign; 4635 int32 aExp; 4636 bits64 aSig0, aSig1; 4637 4638 aSig1 = extractFloat128Frac1( a ); 4639 aSig0 = extractFloat128Frac0( a ); 4640 aExp = extractFloat128Exp( a ); 4641 aSign = extractFloat128Sign( a ); 4642 if ( aExp == 0x7FFF ) { 4643 if ( aSig0 | aSig1 ) { 4644 return commonNaNToFloat64( float128ToCommonNaN( a ) ); 4645 } 4646 return packFloat64( aSign, 0x7FF, 0 ); 4647 } 4648 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4649 aSig0 |= ( aSig1 != 0 ); 4650 if ( aExp || aSig0 ) { 4651 aSig0 |= LIT64( 0x4000000000000000 ); 4652 aExp -= 0x3C01; 4653 } 4654 return roundAndPackFloat64( aSign, aExp, aSig0 ); 4655 4656 } 4657 4658 #ifdef FLOATX80 4659 4660 /* 4661 ------------------------------------------------------------------------------- 4662 Returns the result of converting the quadruple-precision floating-point 4663 value `a' to the extended double-precision floating-point format. The 4664 conversion is performed according to the IEC/IEEE Standard for Binary 4665 Floating-Point Arithmetic. 4666 ------------------------------------------------------------------------------- 4667 */ 4668 floatx80 float128_to_floatx80( float128 a ) 4669 { 4670 flag aSign; 4671 int32 aExp; 4672 bits64 aSig0, aSig1; 4673 4674 aSig1 = extractFloat128Frac1( a ); 4675 aSig0 = extractFloat128Frac0( a ); 4676 aExp = extractFloat128Exp( a ); 4677 aSign = extractFloat128Sign( a ); 4678 if ( aExp == 0x7FFF ) { 4679 if ( aSig0 | aSig1 ) { 4680 return commonNaNToFloatx80( float128ToCommonNaN( a ) ); 4681 } 4682 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4683 } 4684 if ( aExp == 0 ) { 4685 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 4686 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4687 } 4688 else { 4689 aSig0 |= LIT64( 0x0001000000000000 ); 4690 } 4691 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 4692 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 ); 4693 4694 } 4695 4696 #endif 4697 4698 /* 4699 ------------------------------------------------------------------------------- 4700 Rounds the quadruple-precision floating-point value `a' to an integer, and 4701 returns the result as a quadruple-precision floating-point value. The 4702 operation is performed according to the IEC/IEEE Standard for Binary 4703 Floating-Point Arithmetic. 4704 ------------------------------------------------------------------------------- 4705 */ 4706 float128 float128_round_to_int( float128 a ) 4707 { 4708 flag aSign; 4709 int32 aExp; 4710 bits64 lastBitMask, roundBitsMask; 4711 int8 roundingMode; 4712 float128 z; 4713 4714 aExp = extractFloat128Exp( a ); 4715 if ( 0x402F <= aExp ) { 4716 if ( 0x406F <= aExp ) { 4717 if ( ( aExp == 0x7FFF ) 4718 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 4719 ) { 4720 return propagateFloat128NaN( a, a ); 4721 } 4722 return a; 4723 } 4724 lastBitMask = 1; 4725 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 4726 roundBitsMask = lastBitMask - 1; 4727 z = a; 4728 roundingMode = float_rounding_mode; 4729 if ( roundingMode == float_round_nearest_even ) { 4730 if ( lastBitMask ) { 4731 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 4732 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 4733 } 4734 else { 4735 if ( (sbits64) z.low < 0 ) { 4736 ++z.high; 4737 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1; 4738 } 4739 } 4740 } 4741 else if ( roundingMode != float_round_to_zero ) { 4742 if ( extractFloat128Sign( z ) 4743 ^ ( roundingMode == float_round_up ) ) { 4744 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low ); 4745 } 4746 } 4747 z.low &= ~ roundBitsMask; 4748 } 4749 else { 4750 if ( aExp < 0x3FFF ) { 4751 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 4752 set_float_exception_inexact_flag(); 4753 aSign = extractFloat128Sign( a ); 4754 switch ( float_rounding_mode ) { 4755 case float_round_nearest_even: 4756 if ( ( aExp == 0x3FFE ) 4757 && ( extractFloat128Frac0( a ) 4758 | extractFloat128Frac1( a ) ) 4759 ) { 4760 return packFloat128( aSign, 0x3FFF, 0, 0 ); 4761 } 4762 break; 4763 case float_round_to_zero: 4764 break; 4765 case float_round_down: 4766 return 4767 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 4768 : packFloat128( 0, 0, 0, 0 ); 4769 case float_round_up: 4770 return 4771 aSign ? packFloat128( 1, 0, 0, 0 ) 4772 : packFloat128( 0, 0x3FFF, 0, 0 ); 4773 } 4774 return packFloat128( aSign, 0, 0, 0 ); 4775 } 4776 lastBitMask = 1; 4777 lastBitMask <<= 0x402F - aExp; 4778 roundBitsMask = lastBitMask - 1; 4779 z.low = 0; 4780 z.high = a.high; 4781 roundingMode = float_rounding_mode; 4782 if ( roundingMode == float_round_nearest_even ) { 4783 z.high += lastBitMask>>1; 4784 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 4785 z.high &= ~ lastBitMask; 4786 } 4787 } 4788 else if ( roundingMode != float_round_to_zero ) { 4789 if ( extractFloat128Sign( z ) 4790 ^ ( roundingMode == float_round_up ) ) { 4791 z.high |= ( a.low != 0 ); 4792 z.high += roundBitsMask; 4793 } 4794 } 4795 z.high &= ~ roundBitsMask; 4796 } 4797 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 4798 set_float_exception_inexact_flag(); 4799 } 4800 return z; 4801 4802 } 4803 4804 /* 4805 ------------------------------------------------------------------------------- 4806 Returns the result of adding the absolute values of the quadruple-precision 4807 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 4808 before being returned. `zSign' is ignored if the result is a NaN. 4809 The addition is performed according to the IEC/IEEE Standard for Binary 4810 Floating-Point Arithmetic. 4811 ------------------------------------------------------------------------------- 4812 */ 4813 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign ) 4814 { 4815 int32 aExp, bExp, zExp; 4816 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 4817 int32 expDiff; 4818 4819 aSig1 = extractFloat128Frac1( a ); 4820 aSig0 = extractFloat128Frac0( a ); 4821 aExp = extractFloat128Exp( a ); 4822 bSig1 = extractFloat128Frac1( b ); 4823 bSig0 = extractFloat128Frac0( b ); 4824 bExp = extractFloat128Exp( b ); 4825 expDiff = aExp - bExp; 4826 if ( 0 < expDiff ) { 4827 if ( aExp == 0x7FFF ) { 4828 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4829 return a; 4830 } 4831 if ( bExp == 0 ) { 4832 --expDiff; 4833 } 4834 else { 4835 bSig0 |= LIT64( 0x0001000000000000 ); 4836 } 4837 shift128ExtraRightJamming( 4838 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 4839 zExp = aExp; 4840 } 4841 else if ( expDiff < 0 ) { 4842 if ( bExp == 0x7FFF ) { 4843 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4844 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4845 } 4846 if ( aExp == 0 ) { 4847 ++expDiff; 4848 } 4849 else { 4850 aSig0 |= LIT64( 0x0001000000000000 ); 4851 } 4852 shift128ExtraRightJamming( 4853 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 4854 zExp = bExp; 4855 } 4856 else { 4857 if ( aExp == 0x7FFF ) { 4858 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 4859 return propagateFloat128NaN( a, b ); 4860 } 4861 return a; 4862 } 4863 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4864 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 ); 4865 zSig2 = 0; 4866 zSig0 |= LIT64( 0x0002000000000000 ); 4867 zExp = aExp; 4868 goto shiftRight1; 4869 } 4870 aSig0 |= LIT64( 0x0001000000000000 ); 4871 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4872 --zExp; 4873 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 4874 ++zExp; 4875 shiftRight1: 4876 shift128ExtraRightJamming( 4877 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 4878 roundAndPack: 4879 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 4880 4881 } 4882 4883 /* 4884 ------------------------------------------------------------------------------- 4885 Returns the result of subtracting the absolute values of the quadruple- 4886 precision floating-point values `a' and `b'. If `zSign' is 1, the 4887 difference is negated before being returned. `zSign' is ignored if the 4888 result is a NaN. The subtraction is performed according to the IEC/IEEE 4889 Standard for Binary Floating-Point Arithmetic. 4890 ------------------------------------------------------------------------------- 4891 */ 4892 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign ) 4893 { 4894 int32 aExp, bExp, zExp; 4895 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 4896 int32 expDiff; 4897 float128 z; 4898 4899 aSig1 = extractFloat128Frac1( a ); 4900 aSig0 = extractFloat128Frac0( a ); 4901 aExp = extractFloat128Exp( a ); 4902 bSig1 = extractFloat128Frac1( b ); 4903 bSig0 = extractFloat128Frac0( b ); 4904 bExp = extractFloat128Exp( b ); 4905 expDiff = aExp - bExp; 4906 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4907 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 4908 if ( 0 < expDiff ) goto aExpBigger; 4909 if ( expDiff < 0 ) goto bExpBigger; 4910 if ( aExp == 0x7FFF ) { 4911 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 4912 return propagateFloat128NaN( a, b ); 4913 } 4914 float_raise( float_flag_invalid ); 4915 z.low = float128_default_nan_low; 4916 z.high = float128_default_nan_high; 4917 return z; 4918 } 4919 if ( aExp == 0 ) { 4920 aExp = 1; 4921 bExp = 1; 4922 } 4923 if ( bSig0 < aSig0 ) goto aBigger; 4924 if ( aSig0 < bSig0 ) goto bBigger; 4925 if ( bSig1 < aSig1 ) goto aBigger; 4926 if ( aSig1 < bSig1 ) goto bBigger; 4927 return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 ); 4928 bExpBigger: 4929 if ( bExp == 0x7FFF ) { 4930 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4931 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 4932 } 4933 if ( aExp == 0 ) { 4934 ++expDiff; 4935 } 4936 else { 4937 aSig0 |= LIT64( 0x4000000000000000 ); 4938 } 4939 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 4940 bSig0 |= LIT64( 0x4000000000000000 ); 4941 bBigger: 4942 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 4943 zExp = bExp; 4944 zSign ^= 1; 4945 goto normalizeRoundAndPack; 4946 aExpBigger: 4947 if ( aExp == 0x7FFF ) { 4948 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4949 return a; 4950 } 4951 if ( bExp == 0 ) { 4952 --expDiff; 4953 } 4954 else { 4955 bSig0 |= LIT64( 0x4000000000000000 ); 4956 } 4957 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 4958 aSig0 |= LIT64( 0x4000000000000000 ); 4959 aBigger: 4960 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4961 zExp = aExp; 4962 normalizeRoundAndPack: 4963 --zExp; 4964 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 ); 4965 4966 } 4967 4968 /* 4969 ------------------------------------------------------------------------------- 4970 Returns the result of adding the quadruple-precision floating-point values 4971 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 4972 for Binary Floating-Point Arithmetic. 4973 ------------------------------------------------------------------------------- 4974 */ 4975 float128 float128_add( float128 a, float128 b ) 4976 { 4977 flag aSign, bSign; 4978 4979 aSign = extractFloat128Sign( a ); 4980 bSign = extractFloat128Sign( b ); 4981 if ( aSign == bSign ) { 4982 return addFloat128Sigs( a, b, aSign ); 4983 } 4984 else { 4985 return subFloat128Sigs( a, b, aSign ); 4986 } 4987 4988 } 4989 4990 /* 4991 ------------------------------------------------------------------------------- 4992 Returns the result of subtracting the quadruple-precision floating-point 4993 values `a' and `b'. The operation is performed according to the IEC/IEEE 4994 Standard for Binary Floating-Point Arithmetic. 4995 ------------------------------------------------------------------------------- 4996 */ 4997 float128 float128_sub( float128 a, float128 b ) 4998 { 4999 flag aSign, bSign; 5000 5001 aSign = extractFloat128Sign( a ); 5002 bSign = extractFloat128Sign( b ); 5003 if ( aSign == bSign ) { 5004 return subFloat128Sigs( a, b, aSign ); 5005 } 5006 else { 5007 return addFloat128Sigs( a, b, aSign ); 5008 } 5009 5010 } 5011 5012 /* 5013 ------------------------------------------------------------------------------- 5014 Returns the result of multiplying the quadruple-precision floating-point 5015 values `a' and `b'. The operation is performed according to the IEC/IEEE 5016 Standard for Binary Floating-Point Arithmetic. 5017 ------------------------------------------------------------------------------- 5018 */ 5019 float128 float128_mul( float128 a, float128 b ) 5020 { 5021 flag aSign, bSign, zSign; 5022 int32 aExp, bExp, zExp; 5023 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 5024 float128 z; 5025 5026 aSig1 = extractFloat128Frac1( a ); 5027 aSig0 = extractFloat128Frac0( a ); 5028 aExp = extractFloat128Exp( a ); 5029 aSign = extractFloat128Sign( a ); 5030 bSig1 = extractFloat128Frac1( b ); 5031 bSig0 = extractFloat128Frac0( b ); 5032 bExp = extractFloat128Exp( b ); 5033 bSign = extractFloat128Sign( b ); 5034 zSign = aSign ^ bSign; 5035 if ( aExp == 0x7FFF ) { 5036 if ( ( aSig0 | aSig1 ) 5037 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 5038 return propagateFloat128NaN( a, b ); 5039 } 5040 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 5041 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5042 } 5043 if ( bExp == 0x7FFF ) { 5044 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 5045 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 5046 invalid: 5047 float_raise( float_flag_invalid ); 5048 z.low = float128_default_nan_low; 5049 z.high = float128_default_nan_high; 5050 return z; 5051 } 5052 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5053 } 5054 if ( aExp == 0 ) { 5055 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 5056 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5057 } 5058 if ( bExp == 0 ) { 5059 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 5060 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5061 } 5062 zExp = aExp + bExp - 0x4000; 5063 aSig0 |= LIT64( 0x0001000000000000 ); 5064 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 5065 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 5066 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 5067 zSig2 |= ( zSig3 != 0 ); 5068 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 5069 shift128ExtraRightJamming( 5070 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 5071 ++zExp; 5072 } 5073 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 5074 5075 } 5076 5077 /* 5078 ------------------------------------------------------------------------------- 5079 Returns the result of dividing the quadruple-precision floating-point value 5080 `a' by the corresponding value `b'. The operation is performed according to 5081 the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5082 ------------------------------------------------------------------------------- 5083 */ 5084 float128 float128_div( float128 a, float128 b ) 5085 { 5086 flag aSign, bSign, zSign; 5087 int32 aExp, bExp, zExp; 5088 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 5089 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5090 float128 z; 5091 5092 aSig1 = extractFloat128Frac1( a ); 5093 aSig0 = extractFloat128Frac0( a ); 5094 aExp = extractFloat128Exp( a ); 5095 aSign = extractFloat128Sign( a ); 5096 bSig1 = extractFloat128Frac1( b ); 5097 bSig0 = extractFloat128Frac0( b ); 5098 bExp = extractFloat128Exp( b ); 5099 bSign = extractFloat128Sign( b ); 5100 zSign = aSign ^ bSign; 5101 if ( aExp == 0x7FFF ) { 5102 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 5103 if ( bExp == 0x7FFF ) { 5104 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 5105 goto invalid; 5106 } 5107 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5108 } 5109 if ( bExp == 0x7FFF ) { 5110 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 5111 return packFloat128( zSign, 0, 0, 0 ); 5112 } 5113 if ( bExp == 0 ) { 5114 if ( ( bSig0 | bSig1 ) == 0 ) { 5115 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 5116 invalid: 5117 float_raise( float_flag_invalid ); 5118 z.low = float128_default_nan_low; 5119 z.high = float128_default_nan_high; 5120 return z; 5121 } 5122 float_raise( float_flag_divbyzero ); 5123 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5124 } 5125 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5126 } 5127 if ( aExp == 0 ) { 5128 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 5129 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5130 } 5131 zExp = aExp - bExp + 0x3FFD; 5132 shortShift128Left( 5133 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 5134 shortShift128Left( 5135 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 5136 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 5137 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 5138 ++zExp; 5139 } 5140 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5141 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 5142 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 5143 while ( (sbits64) rem0 < 0 ) { 5144 --zSig0; 5145 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 5146 } 5147 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 5148 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 5149 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 5150 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 5151 while ( (sbits64) rem1 < 0 ) { 5152 --zSig1; 5153 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 5154 } 5155 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5156 } 5157 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 5158 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 5159 5160 } 5161 5162 /* 5163 ------------------------------------------------------------------------------- 5164 Returns the remainder of the quadruple-precision floating-point value `a' 5165 with respect to the corresponding value `b'. The operation is performed 5166 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5167 ------------------------------------------------------------------------------- 5168 */ 5169 float128 float128_rem( float128 a, float128 b ) 5170 { 5171 flag aSign, zSign; 5172 int32 aExp, bExp, expDiff; 5173 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 5174 bits64 allZero, alternateASig0, alternateASig1, sigMean1; 5175 sbits64 sigMean0; 5176 float128 z; 5177 5178 aSig1 = extractFloat128Frac1( a ); 5179 aSig0 = extractFloat128Frac0( a ); 5180 aExp = extractFloat128Exp( a ); 5181 aSign = extractFloat128Sign( a ); 5182 bSig1 = extractFloat128Frac1( b ); 5183 bSig0 = extractFloat128Frac0( b ); 5184 bExp = extractFloat128Exp( b ); 5185 if ( aExp == 0x7FFF ) { 5186 if ( ( aSig0 | aSig1 ) 5187 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 5188 return propagateFloat128NaN( a, b ); 5189 } 5190 goto invalid; 5191 } 5192 if ( bExp == 0x7FFF ) { 5193 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 5194 return a; 5195 } 5196 if ( bExp == 0 ) { 5197 if ( ( bSig0 | bSig1 ) == 0 ) { 5198 invalid: 5199 float_raise( float_flag_invalid ); 5200 z.low = float128_default_nan_low; 5201 z.high = float128_default_nan_high; 5202 return z; 5203 } 5204 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5205 } 5206 if ( aExp == 0 ) { 5207 if ( ( aSig0 | aSig1 ) == 0 ) return a; 5208 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5209 } 5210 expDiff = aExp - bExp; 5211 if ( expDiff < -1 ) return a; 5212 shortShift128Left( 5213 aSig0 | LIT64( 0x0001000000000000 ), 5214 aSig1, 5215 15 - ( expDiff < 0 ), 5216 &aSig0, 5217 &aSig1 5218 ); 5219 shortShift128Left( 5220 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 5221 q = le128( bSig0, bSig1, aSig0, aSig1 ); 5222 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5223 expDiff -= 64; 5224 while ( 0 < expDiff ) { 5225 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5226 q = ( 4 < q ) ? q - 4 : 0; 5227 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5228 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 5229 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 5230 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 5231 expDiff -= 61; 5232 } 5233 if ( -64 < expDiff ) { 5234 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5235 q = ( 4 < q ) ? q - 4 : 0; 5236 q >>= - expDiff; 5237 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5238 expDiff += 52; 5239 if ( expDiff < 0 ) { 5240 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 5241 } 5242 else { 5243 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 5244 } 5245 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5246 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 5247 } 5248 else { 5249 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 5250 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5251 } 5252 do { 5253 alternateASig0 = aSig0; 5254 alternateASig1 = aSig1; 5255 ++q; 5256 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5257 } while ( 0 <= (sbits64) aSig0 ); 5258 add128( 5259 aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 ); 5260 if ( ( sigMean0 < 0 ) 5261 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 5262 aSig0 = alternateASig0; 5263 aSig1 = alternateASig1; 5264 } 5265 zSign = ( (sbits64) aSig0 < 0 ); 5266 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 5267 return 5268 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 ); 5269 5270 } 5271 5272 /* 5273 ------------------------------------------------------------------------------- 5274 Returns the square root of the quadruple-precision floating-point value `a'. 5275 The operation is performed according to the IEC/IEEE Standard for Binary 5276 Floating-Point Arithmetic. 5277 ------------------------------------------------------------------------------- 5278 */ 5279 float128 float128_sqrt( float128 a ) 5280 { 5281 flag aSign; 5282 int32 aExp, zExp; 5283 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 5284 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5285 float128 z; 5286 5287 aSig1 = extractFloat128Frac1( a ); 5288 aSig0 = extractFloat128Frac0( a ); 5289 aExp = extractFloat128Exp( a ); 5290 aSign = extractFloat128Sign( a ); 5291 if ( aExp == 0x7FFF ) { 5292 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a ); 5293 if ( ! aSign ) return a; 5294 goto invalid; 5295 } 5296 if ( aSign ) { 5297 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 5298 invalid: 5299 float_raise( float_flag_invalid ); 5300 z.low = float128_default_nan_low; 5301 z.high = float128_default_nan_high; 5302 return z; 5303 } 5304 if ( aExp == 0 ) { 5305 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 5306 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5307 } 5308 zExp = (int32) ( (bits32)(aExp - 0x3FFF) >> 1) + 0x3FFE; 5309 aSig0 |= LIT64( 0x0001000000000000 ); 5310 zSig0 = estimateSqrt32((int16)aExp, (bits32)(aSig0>>17)); 5311 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 5312 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5313 doubleZSig0 = zSig0<<1; 5314 mul64To128( zSig0, zSig0, &term0, &term1 ); 5315 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5316 while ( (sbits64) rem0 < 0 ) { 5317 --zSig0; 5318 doubleZSig0 -= 2; 5319 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5320 } 5321 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5322 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 5323 if ( zSig1 == 0 ) zSig1 = 1; 5324 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5325 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5326 mul64To128( zSig1, zSig1, &term2, &term3 ); 5327 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5328 while ( (sbits64) rem1 < 0 ) { 5329 --zSig1; 5330 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5331 term3 |= 1; 5332 term2 |= doubleZSig0; 5333 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5334 } 5335 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5336 } 5337 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 5338 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 ); 5339 5340 } 5341 5342 /* 5343 ------------------------------------------------------------------------------- 5344 Returns 1 if the quadruple-precision floating-point value `a' is equal to 5345 the corresponding value `b', and 0 otherwise. The comparison is performed 5346 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5347 ------------------------------------------------------------------------------- 5348 */ 5349 flag float128_eq( float128 a, float128 b ) 5350 { 5351 5352 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5353 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5354 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5355 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5356 ) { 5357 if ( float128_is_signaling_nan( a ) 5358 || float128_is_signaling_nan( b ) ) { 5359 float_raise( float_flag_invalid ); 5360 } 5361 return 0; 5362 } 5363 return 5364 ( a.low == b.low ) 5365 && ( ( a.high == b.high ) 5366 || ( ( a.low == 0 ) 5367 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5368 ); 5369 5370 } 5371 5372 /* 5373 ------------------------------------------------------------------------------- 5374 Returns 1 if the quadruple-precision floating-point value `a' is less than 5375 or equal to the corresponding value `b', and 0 otherwise. The comparison 5376 is performed according to the IEC/IEEE Standard for Binary Floating-Point 5377 Arithmetic. 5378 ------------------------------------------------------------------------------- 5379 */ 5380 flag float128_le( float128 a, float128 b ) 5381 { 5382 flag aSign, bSign; 5383 5384 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5385 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5386 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5387 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5388 ) { 5389 float_raise( float_flag_invalid ); 5390 return 0; 5391 } 5392 aSign = extractFloat128Sign( a ); 5393 bSign = extractFloat128Sign( b ); 5394 if ( aSign != bSign ) { 5395 return 5396 aSign 5397 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5398 == 0 ); 5399 } 5400 return 5401 aSign ? le128( b.high, b.low, a.high, a.low ) 5402 : le128( a.high, a.low, b.high, b.low ); 5403 5404 } 5405 5406 /* 5407 ------------------------------------------------------------------------------- 5408 Returns 1 if the quadruple-precision floating-point value `a' is less than 5409 the corresponding value `b', and 0 otherwise. The comparison is performed 5410 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5411 ------------------------------------------------------------------------------- 5412 */ 5413 flag float128_lt( float128 a, float128 b ) 5414 { 5415 flag aSign, bSign; 5416 5417 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5418 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5419 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5420 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5421 ) { 5422 float_raise( float_flag_invalid ); 5423 return 0; 5424 } 5425 aSign = extractFloat128Sign( a ); 5426 bSign = extractFloat128Sign( b ); 5427 if ( aSign != bSign ) { 5428 return 5429 aSign 5430 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5431 != 0 ); 5432 } 5433 return 5434 aSign ? lt128( b.high, b.low, a.high, a.low ) 5435 : lt128( a.high, a.low, b.high, b.low ); 5436 5437 } 5438 5439 /* 5440 ------------------------------------------------------------------------------- 5441 Returns 1 if the quadruple-precision floating-point value `a' is equal to 5442 the corresponding value `b', and 0 otherwise. The invalid exception is 5443 raised if either operand is a NaN. Otherwise, the comparison is performed 5444 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5445 ------------------------------------------------------------------------------- 5446 */ 5447 flag float128_eq_signaling( float128 a, float128 b ) 5448 { 5449 5450 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5451 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5452 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5453 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5454 ) { 5455 float_raise( float_flag_invalid ); 5456 return 0; 5457 } 5458 return 5459 ( a.low == b.low ) 5460 && ( ( a.high == b.high ) 5461 || ( ( a.low == 0 ) 5462 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5463 ); 5464 5465 } 5466 5467 /* 5468 ------------------------------------------------------------------------------- 5469 Returns 1 if the quadruple-precision floating-point value `a' is less than 5470 or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5471 cause an exception. Otherwise, the comparison is performed according to the 5472 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5473 ------------------------------------------------------------------------------- 5474 */ 5475 flag float128_le_quiet( float128 a, float128 b ) 5476 { 5477 flag aSign, bSign; 5478 5479 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5480 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5481 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5482 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5483 ) { 5484 if ( float128_is_signaling_nan( a ) 5485 || float128_is_signaling_nan( b ) ) { 5486 float_raise( float_flag_invalid ); 5487 } 5488 return 0; 5489 } 5490 aSign = extractFloat128Sign( a ); 5491 bSign = extractFloat128Sign( b ); 5492 if ( aSign != bSign ) { 5493 return 5494 aSign 5495 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5496 == 0 ); 5497 } 5498 return 5499 aSign ? le128( b.high, b.low, a.high, a.low ) 5500 : le128( a.high, a.low, b.high, b.low ); 5501 5502 } 5503 5504 /* 5505 ------------------------------------------------------------------------------- 5506 Returns 1 if the quadruple-precision floating-point value `a' is less than 5507 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 5508 exception. Otherwise, the comparison is performed according to the IEC/IEEE 5509 Standard for Binary Floating-Point Arithmetic. 5510 ------------------------------------------------------------------------------- 5511 */ 5512 flag float128_lt_quiet( float128 a, float128 b ) 5513 { 5514 flag aSign, bSign; 5515 5516 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5517 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5518 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5519 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5520 ) { 5521 if ( float128_is_signaling_nan( a ) 5522 || float128_is_signaling_nan( b ) ) { 5523 float_raise( float_flag_invalid ); 5524 } 5525 return 0; 5526 } 5527 aSign = extractFloat128Sign( a ); 5528 bSign = extractFloat128Sign( b ); 5529 if ( aSign != bSign ) { 5530 return 5531 aSign 5532 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5533 != 0 ); 5534 } 5535 return 5536 aSign ? lt128( b.high, b.low, a.high, a.low ) 5537 : lt128( a.high, a.low, b.high, b.low ); 5538 5539 } 5540 5541 #endif 5542 5543 5544 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS) 5545 5546 /* 5547 * These two routines are not part of the original softfloat distribution. 5548 * 5549 * They are based on the corresponding conversions to integer but return 5550 * unsigned numbers instead since these functions are required by GCC. 5551 * 5552 * Added by Mark Brinicombe <mark (at) NetBSD.org> 27/09/97 5553 * 5554 * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15] 5555 */ 5556 5557 /* 5558 ------------------------------------------------------------------------------- 5559 Returns the result of converting the double-precision floating-point value 5560 `a' to the 32-bit unsigned integer format. The conversion is 5561 performed according to the IEC/IEEE Standard for Binary Floating-point 5562 Arithmetic, except that the conversion is always rounded toward zero. If 5563 `a' is a NaN, the largest positive integer is returned. If the conversion 5564 overflows, the largest integer positive is returned. 5565 ------------------------------------------------------------------------------- 5566 */ 5567 uint32 float64_to_uint32_round_to_zero( float64 a ) 5568 { 5569 flag aSign; 5570 int16 aExp, shiftCount; 5571 bits64 aSig, savedASig; 5572 uint32 z; 5573 5574 aSig = extractFloat64Frac( a ); 5575 aExp = extractFloat64Exp( a ); 5576 aSign = extractFloat64Sign( a ); 5577 5578 if (aSign) { 5579 float_raise( float_flag_invalid ); 5580 return(0); 5581 } 5582 5583 if ( 0x41E < aExp ) { 5584 float_raise( float_flag_invalid ); 5585 return 0xffffffff; 5586 } 5587 else if ( aExp < 0x3FF ) { 5588 if ( aExp || aSig ) set_float_exception_inexact_flag(); 5589 return 0; 5590 } 5591 aSig |= LIT64( 0x0010000000000000 ); 5592 shiftCount = 0x433 - aExp; 5593 savedASig = aSig; 5594 aSig >>= shiftCount; 5595 z = (uint32)aSig; 5596 if ( ( aSig<<shiftCount ) != savedASig ) { 5597 set_float_exception_inexact_flag(); 5598 } 5599 return z; 5600 5601 } 5602 5603 /* 5604 ------------------------------------------------------------------------------- 5605 Returns the result of converting the single-precision floating-point value 5606 `a' to the 32-bit unsigned integer format. The conversion is 5607 performed according to the IEC/IEEE Standard for Binary Floating-point 5608 Arithmetic, except that the conversion is always rounded toward zero. If 5609 `a' is a NaN, the largest positive integer is returned. If the conversion 5610 overflows, the largest positive integer is returned. 5611 ------------------------------------------------------------------------------- 5612 */ 5613 uint32 float32_to_uint32_round_to_zero( float32 a ) 5614 { 5615 flag aSign; 5616 int16 aExp, shiftCount; 5617 bits32 aSig; 5618 uint32 z; 5619 5620 aSig = extractFloat32Frac( a ); 5621 aExp = extractFloat32Exp( a ); 5622 aSign = extractFloat32Sign( a ); 5623 shiftCount = aExp - 0x9E; 5624 5625 if (aSign) { 5626 float_raise( float_flag_invalid ); 5627 return(0); 5628 } 5629 if ( 0 < shiftCount ) { 5630 float_raise( float_flag_invalid ); 5631 return 0xFFFFFFFF; 5632 } 5633 else if ( aExp <= 0x7E ) { 5634 if ( aExp | aSig ) set_float_exception_inexact_flag(); 5635 return 0; 5636 } 5637 aSig = ( aSig | 0x800000 )<<8; 5638 z = aSig>>( - shiftCount ); 5639 if ( aSig<<( shiftCount & 31 ) ) { 5640 set_float_exception_inexact_flag(); 5641 } 5642 return z; 5643 5644 } 5645 5646 #endif 5647