softfloat.c revision 1.16 1 /* $NetBSD: softfloat.c,v 1.16 2025/09/17 11:37:38 nat Exp $ */
2
3 /*
4 * This version hacked for use with gcc -msoft-float by bjh21.
5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides
6 * itself).
7 */
8
9 /*
10 * Things you may want to define:
11 *
12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them
14 * properly renamed.
15 */
16
17 /*
18 ===============================================================================
19
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43
44 ===============================================================================
45 */
46
47 #include <sys/cdefs.h>
48 #if defined(LIBC_SCCS) && !defined(lint)
49 __RCSID("$NetBSD: softfloat.c,v 1.16 2025/09/17 11:37:38 nat Exp $");
50 #endif /* LIBC_SCCS and not lint */
51
52 #ifdef SOFTFLOAT_FOR_GCC
53 #include "softfloat-for-gcc.h"
54 #endif
55
56 #include "milieu.h"
57 #include "softfloat.h"
58
59 /*
60 * Conversions between floats as stored in memory and floats as
61 * SoftFloat uses them
62 */
63 #ifndef FLOAT64_DEMANGLE
64 #define FLOAT64_DEMANGLE(a) (a)
65 #endif
66 #ifndef FLOAT64_MANGLE
67 #define FLOAT64_MANGLE(a) (a)
68 #endif
69
70 #ifndef X80SHIFT
71 #define X80SHIFT 0
72 #endif
73 /*
74 -------------------------------------------------------------------------------
75 Floating-point rounding mode, extended double-precision rounding precision,
76 and exception flags.
77 -------------------------------------------------------------------------------
78 */
79 #ifndef set_float_rounding_mode
80 fp_rnd float_rounding_mode = float_round_nearest_even;
81 fp_except float_exception_flags = 0;
82 #endif
83 #ifndef set_float_exception_inexact_flag
84 #define set_float_exception_inexact_flag() \
85 ((void)(float_exception_flags |= float_flag_inexact))
86 #endif
87 #ifdef FLOATX80
88 int8 floatx80_rounding_precision = 80;
89 #endif
90
91 /*
92 -------------------------------------------------------------------------------
93 Primitive arithmetic functions, including multi-word arithmetic, and
94 division and square root approximations. (Can be specialized to target if
95 desired.)
96 -------------------------------------------------------------------------------
97 */
98 #include "softfloat-macros"
99
100 /*
101 -------------------------------------------------------------------------------
102 Functions and definitions to determine: (1) whether tininess for underflow
103 is detected before or after rounding by default, (2) what (if anything)
104 happens when exceptions are raised, (3) how signaling NaNs are distinguished
105 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
106 are propagated from function inputs to output. These details are target-
107 specific.
108 -------------------------------------------------------------------------------
109 */
110 #include "softfloat-specialize"
111
112 #if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128)
113 /*
114 -------------------------------------------------------------------------------
115 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
116 and 7, and returns the properly rounded 32-bit integer corresponding to the
117 input. If `zSign' is 1, the input is negated before being converted to an
118 integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
119 is simply rounded to an integer, with the inexact exception raised if the
120 input cannot be represented exactly as an integer. However, if the fixed-
121 point input is too large, the invalid exception is raised and the largest
122 positive or negative integer is returned.
123 -------------------------------------------------------------------------------
124 */
125 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
126 {
127 int8 roundingMode;
128 flag roundNearestEven;
129 int8 roundIncrement, roundBits;
130 int32 z;
131
132 roundingMode = float_rounding_mode;
133 roundNearestEven = ( roundingMode == float_round_nearest_even );
134 roundIncrement = 0x40;
135 if ( ! roundNearestEven ) {
136 if ( roundingMode == float_round_to_zero ) {
137 roundIncrement = 0;
138 }
139 else {
140 roundIncrement = 0x7F;
141 if ( zSign ) {
142 if ( roundingMode == float_round_up ) roundIncrement = 0;
143 }
144 else {
145 if ( roundingMode == float_round_down ) roundIncrement = 0;
146 }
147 }
148 }
149 roundBits = (int8)(absZ & 0x7F);
150 absZ = ( absZ + roundIncrement )>>7;
151 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
152 z = (int32)absZ;
153 if ( zSign ) z = - z;
154 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
155 float_raise( float_flag_invalid );
156 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
157 }
158 if ( roundBits ) set_float_exception_inexact_flag();
159 return z;
160
161 }
162
163 /*
164 -------------------------------------------------------------------------------
165 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
166 `absZ1', with binary point between bits 63 and 64 (between the input words),
167 and returns the properly rounded 64-bit integer corresponding to the input.
168 If `zSign' is 1, the input is negated before being converted to an integer.
169 Ordinarily, the fixed-point input is simply rounded to an integer, with
170 the inexact exception raised if the input cannot be represented exactly as
171 an integer. However, if the fixed-point input is too large, the invalid
172 exception is raised and the largest positive or negative integer is
173 returned.
174 -------------------------------------------------------------------------------
175 */
176 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
177 {
178 int8 roundingMode;
179 flag roundNearestEven, increment;
180 int64 z;
181
182 roundingMode = float_rounding_mode;
183 roundNearestEven = ( roundingMode == float_round_nearest_even );
184 increment = ( (sbits64) absZ1 < 0 );
185 if ( ! roundNearestEven ) {
186 if ( roundingMode == float_round_to_zero ) {
187 increment = 0;
188 }
189 else {
190 if ( zSign ) {
191 increment = ( roundingMode == float_round_down ) && absZ1;
192 }
193 else {
194 increment = ( roundingMode == float_round_up ) && absZ1;
195 }
196 }
197 }
198 if ( increment ) {
199 ++absZ0;
200 if ( absZ0 == 0 ) goto overflow;
201 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
202 }
203 z = absZ0;
204 if ( zSign ) z = - z;
205 if ( z && ( ( z < 0 ) ^ zSign ) ) {
206 overflow:
207 float_raise( float_flag_invalid );
208 return
209 zSign ? (sbits64) LIT64( 0x8000000000000000 )
210 : LIT64( 0x7FFFFFFFFFFFFFFF );
211 }
212 if ( absZ1 ) set_float_exception_inexact_flag();
213 return z;
214
215 }
216 #endif
217
218 /*
219 -------------------------------------------------------------------------------
220 Returns the fraction bits of the single-precision floating-point value `a'.
221 -------------------------------------------------------------------------------
222 */
223 INLINE bits32 extractFloat32Frac( float32 a )
224 {
225
226 return a & 0x007FFFFF;
227
228 }
229
230 /*
231 -------------------------------------------------------------------------------
232 Returns the exponent bits of the single-precision floating-point value `a'.
233 -------------------------------------------------------------------------------
234 */
235 INLINE int16 extractFloat32Exp( float32 a )
236 {
237
238 return ( a>>23 ) & 0xFF;
239
240 }
241
242 /*
243 -------------------------------------------------------------------------------
244 Returns the sign bit of the single-precision floating-point value `a'.
245 -------------------------------------------------------------------------------
246 */
247 INLINE flag extractFloat32Sign( float32 a )
248 {
249
250 return a>>31;
251
252 }
253
254 /*
255 -------------------------------------------------------------------------------
256 Normalizes the subnormal single-precision floating-point value represented
257 by the denormalized significand `aSig'. The normalized exponent and
258 significand are stored at the locations pointed to by `zExpPtr' and
259 `zSigPtr', respectively.
260 -------------------------------------------------------------------------------
261 */
262 static void
263 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
264 {
265 int8 shiftCount;
266
267 shiftCount = countLeadingZeros32( aSig ) - 8;
268 *zSigPtr = aSig<<shiftCount;
269 *zExpPtr = 1 - shiftCount;
270
271 }
272
273 /*
274 -------------------------------------------------------------------------------
275 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
276 single-precision floating-point value, returning the result. After being
277 shifted into the proper positions, the three fields are simply added
278 together to form the result. This means that any integer portion of `zSig'
279 will be added into the exponent. Since a properly normalized significand
280 will have an integer portion equal to 1, the `zExp' input should be 1 less
281 than the desired result exponent whenever `zSig' is a complete, normalized
282 significand.
283 -------------------------------------------------------------------------------
284 */
285 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
286 {
287
288 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
289
290 }
291
292 /*
293 -------------------------------------------------------------------------------
294 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
295 and significand `zSig', and returns the proper single-precision floating-
296 point value corresponding to the abstract input. Ordinarily, the abstract
297 value is simply rounded and packed into the single-precision format, with
298 the inexact exception raised if the abstract input cannot be represented
299 exactly. However, if the abstract value is too large, the overflow and
300 inexact exceptions are raised and an infinity or maximal finite value is
301 returned. If the abstract value is too small, the input value is rounded to
302 a subnormal number, and the underflow and inexact exceptions are raised if
303 the abstract input cannot be represented exactly as a subnormal single-
304 precision floating-point number.
305 The input significand `zSig' has its binary point between bits 30
306 and 29, which is 7 bits to the left of the usual location. This shifted
307 significand must be normalized or smaller. If `zSig' is not normalized,
308 `zExp' must be 0; in that case, the result returned is a subnormal number,
309 and it must not require rounding. In the usual case that `zSig' is
310 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
311 The handling of underflow and overflow follows the IEC/IEEE Standard for
312 Binary Floating-Point Arithmetic.
313 -------------------------------------------------------------------------------
314 */
315 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
316 {
317 int8 roundingMode;
318 flag roundNearestEven;
319 int8 roundIncrement, roundBits;
320 flag isTiny;
321
322 roundingMode = float_rounding_mode;
323 roundNearestEven = ( roundingMode == float_round_nearest_even );
324 roundIncrement = 0x40;
325 if ( ! roundNearestEven ) {
326 if ( roundingMode == float_round_to_zero ) {
327 roundIncrement = 0;
328 }
329 else {
330 roundIncrement = 0x7F;
331 if ( zSign ) {
332 if ( roundingMode == float_round_up ) roundIncrement = 0;
333 }
334 else {
335 if ( roundingMode == float_round_down ) roundIncrement = 0;
336 }
337 }
338 }
339 roundBits = zSig & 0x7F;
340 if ( 0xFD <= (bits16) zExp ) {
341 if ( ( 0xFD < zExp )
342 || ( ( zExp == 0xFD )
343 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
344 ) {
345 float_raise( float_flag_overflow | float_flag_inexact );
346 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
347 }
348 if ( zExp < 0 ) {
349 isTiny =
350 ( float_detect_tininess == float_tininess_before_rounding )
351 || ( zExp < -1 )
352 || ( zSig + roundIncrement < 0x80000000U );
353 shift32RightJamming( zSig, - zExp, &zSig );
354 zExp = 0;
355 roundBits = zSig & 0x7F;
356 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
357 }
358 }
359 if ( roundBits ) set_float_exception_inexact_flag();
360 zSig = ( zSig + roundIncrement )>>7;
361 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
362 if ( zSig == 0 ) zExp = 0;
363 return packFloat32( zSign, zExp, zSig );
364
365 }
366
367 /*
368 -------------------------------------------------------------------------------
369 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
370 and significand `zSig', and returns the proper single-precision floating-
371 point value corresponding to the abstract input. This routine is just like
372 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
373 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
374 floating-point exponent.
375 -------------------------------------------------------------------------------
376 */
377 static float32
378 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
379 {
380 int8 shiftCount;
381
382 shiftCount = countLeadingZeros32( zSig ) - 1;
383 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
384
385 }
386
387 /*
388 -------------------------------------------------------------------------------
389 Returns the fraction bits of the double-precision floating-point value `a'.
390 -------------------------------------------------------------------------------
391 */
392 INLINE bits64 extractFloat64Frac( float64 a )
393 {
394
395 return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
396
397 }
398
399 /*
400 -------------------------------------------------------------------------------
401 Returns the exponent bits of the double-precision floating-point value `a'.
402 -------------------------------------------------------------------------------
403 */
404 INLINE int16 extractFloat64Exp( float64 a )
405 {
406
407 return (int16)((FLOAT64_DEMANGLE(a) >> 52) & 0x7FF);
408
409 }
410
411 /*
412 -------------------------------------------------------------------------------
413 Returns the sign bit of the double-precision floating-point value `a'.
414 -------------------------------------------------------------------------------
415 */
416 INLINE flag extractFloat64Sign( float64 a )
417 {
418
419 return (flag)(FLOAT64_DEMANGLE(a) >> 63);
420
421 }
422
423 /*
424 -------------------------------------------------------------------------------
425 Normalizes the subnormal double-precision floating-point value represented
426 by the denormalized significand `aSig'. The normalized exponent and
427 significand are stored at the locations pointed to by `zExpPtr' and
428 `zSigPtr', respectively.
429 -------------------------------------------------------------------------------
430 */
431 static void
432 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
433 {
434 int8 shiftCount;
435
436 shiftCount = countLeadingZeros64( aSig ) - 11;
437 *zSigPtr = aSig<<shiftCount;
438 *zExpPtr = 1 - shiftCount;
439
440 }
441
442 /*
443 -------------------------------------------------------------------------------
444 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
445 double-precision floating-point value, returning the result. After being
446 shifted into the proper positions, the three fields are simply added
447 together to form the result. This means that any integer portion of `zSig'
448 will be added into the exponent. Since a properly normalized significand
449 will have an integer portion equal to 1, the `zExp' input should be 1 less
450 than the desired result exponent whenever `zSig' is a complete, normalized
451 significand.
452 -------------------------------------------------------------------------------
453 */
454 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
455 {
456
457 return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
458 ( ( (bits64) zExp )<<52 ) + zSig );
459
460 }
461
462 /*
463 -------------------------------------------------------------------------------
464 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
465 and significand `zSig', and returns the proper double-precision floating-
466 point value corresponding to the abstract input. Ordinarily, the abstract
467 value is simply rounded and packed into the double-precision format, with
468 the inexact exception raised if the abstract input cannot be represented
469 exactly. However, if the abstract value is too large, the overflow and
470 inexact exceptions are raised and an infinity or maximal finite value is
471 returned. If the abstract value is too small, the input value is rounded to
472 a subnormal number, and the underflow and inexact exceptions are raised if
473 the abstract input cannot be represented exactly as a subnormal double-
474 precision floating-point number.
475 The input significand `zSig' has its binary point between bits 62
476 and 61, which is 10 bits to the left of the usual location. This shifted
477 significand must be normalized or smaller. If `zSig' is not normalized,
478 `zExp' must be 0; in that case, the result returned is a subnormal number,
479 and it must not require rounding. In the usual case that `zSig' is
480 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
481 The handling of underflow and overflow follows the IEC/IEEE Standard for
482 Binary Floating-Point Arithmetic.
483 -------------------------------------------------------------------------------
484 */
485 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
486 {
487 int8 roundingMode;
488 flag roundNearestEven;
489 int16 roundIncrement, roundBits;
490 flag isTiny;
491
492 roundingMode = float_rounding_mode;
493 roundNearestEven = ( roundingMode == float_round_nearest_even );
494 roundIncrement = 0x200;
495 if ( ! roundNearestEven ) {
496 if ( roundingMode == float_round_to_zero ) {
497 roundIncrement = 0;
498 }
499 else {
500 roundIncrement = 0x3FF;
501 if ( zSign ) {
502 if ( roundingMode == float_round_up ) roundIncrement = 0;
503 }
504 else {
505 if ( roundingMode == float_round_down ) roundIncrement = 0;
506 }
507 }
508 }
509 roundBits = (int16)(zSig & 0x3FF);
510 if ( 0x7FD <= (bits16) zExp ) {
511 if ( ( 0x7FD < zExp )
512 || ( ( zExp == 0x7FD )
513 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
514 ) {
515 float_raise( float_flag_overflow | float_flag_inexact );
516 return FLOAT64_MANGLE(
517 FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
518 ( roundIncrement == 0 ));
519 }
520 if ( zExp < 0 ) {
521 isTiny =
522 ( float_detect_tininess == float_tininess_before_rounding )
523 || ( zExp < -1 )
524 || ( zSig + roundIncrement < (bits64)LIT64( 0x8000000000000000 ) );
525 shift64RightJamming( zSig, - zExp, &zSig );
526 zExp = 0;
527 roundBits = (int16)(zSig & 0x3FF);
528 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
529 }
530 }
531 if ( roundBits ) set_float_exception_inexact_flag();
532 zSig = ( zSig + roundIncrement )>>10;
533 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
534 if ( zSig == 0 ) zExp = 0;
535 return packFloat64( zSign, zExp, zSig );
536
537 }
538
539 /*
540 -------------------------------------------------------------------------------
541 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
542 and significand `zSig', and returns the proper double-precision floating-
543 point value corresponding to the abstract input. This routine is just like
544 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
545 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
546 floating-point exponent.
547 -------------------------------------------------------------------------------
548 */
549 static float64
550 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
551 {
552 int8 shiftCount;
553
554 shiftCount = countLeadingZeros64( zSig ) - 1;
555 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
556
557 }
558
559 #ifdef FLOATX80
560
561 /*
562 -------------------------------------------------------------------------------
563 Returns the fraction bits of the extended double-precision floating-point
564 value `a'.
565 -------------------------------------------------------------------------------
566 */
567 INLINE bits64 extractFloatx80Frac( floatx80 a )
568 {
569
570 return a.low;
571
572 }
573
574 /*
575 -------------------------------------------------------------------------------
576 Returns the exponent bits of the extended double-precision floating-point
577 value `a'.
578 -------------------------------------------------------------------------------
579 */
580 INLINE int32 extractFloatx80Exp( floatx80 a )
581 {
582
583 return (a.high>>X80SHIFT) & 0x7FFF;
584
585 }
586
587 /*
588 -------------------------------------------------------------------------------
589 Returns the sign bit of the extended double-precision floating-point value
590 `a'.
591 -------------------------------------------------------------------------------
592 */
593 INLINE flag extractFloatx80Sign( floatx80 a )
594 {
595
596 return a.high>>(15 + X80SHIFT);
597
598 }
599
600 /*
601 -------------------------------------------------------------------------------
602 Normalizes the subnormal extended double-precision floating-point value
603 represented by the denormalized significand `aSig'. The normalized exponent
604 and significand are stored at the locations pointed to by `zExpPtr' and
605 `zSigPtr', respectively.
606 -------------------------------------------------------------------------------
607 */
608 static void
609 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
610 {
611 int8 shiftCount;
612
613 shiftCount = countLeadingZeros64( aSig );
614 *zSigPtr = aSig<<shiftCount;
615 *zExpPtr = 1 - shiftCount;
616
617 }
618
619 /*
620 -------------------------------------------------------------------------------
621 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
622 extended double-precision floating-point value, returning the result.
623 -------------------------------------------------------------------------------
624 */
625 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
626 {
627 floatx80 z;
628
629 z.low = (bits64)zSig;
630 z.high = (bits32)( ( ( (bits32) zSign )<<15 ) + zExp )<<X80SHIFT;
631 return z;
632
633 }
634
635 /*
636 -------------------------------------------------------------------------------
637 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
638 and extended significand formed by the concatenation of `zSig0' and `zSig1',
639 and returns the proper extended double-precision floating-point value
640 corresponding to the abstract input. Ordinarily, the abstract value is
641 rounded and packed into the extended double-precision format, with the
642 inexact exception raised if the abstract input cannot be represented
643 exactly. However, if the abstract value is too large, the overflow and
644 inexact exceptions are raised and an infinity or maximal finite value is
645 returned. If the abstract value is too small, the input value is rounded to
646 a subnormal number, and the underflow and inexact exceptions are raised if
647 the abstract input cannot be represented exactly as a subnormal extended
648 double-precision floating-point number.
649 If `roundingPrecision' is 32 or 64, the result is rounded to the same
650 number of bits as single or double precision, respectively. Otherwise, the
651 result is rounded to the full precision of the extended double-precision
652 format.
653 The input significand must be normalized or smaller. If the input
654 significand is not normalized, `zExp' must be 0; in that case, the result
655 returned is a subnormal number, and it must not require rounding. The
656 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
657 Floating-Point Arithmetic.
658 -------------------------------------------------------------------------------
659 */
660 static floatx80
661 roundAndPackFloatx80(
662 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
663 )
664 {
665 int8 roundingMode;
666 flag roundNearestEven, increment, isTiny;
667 uint64 roundIncrement, roundMask, roundBits;
668
669 roundingMode = float_rounding_mode;
670 roundNearestEven = ( roundingMode == float_round_nearest_even );
671 if ( roundingPrecision == 80 ) goto precision80;
672 if ( roundingPrecision == 64 ) {
673 roundIncrement = LIT64( 0x0000000000000400 );
674 roundMask = LIT64( 0x00000000000007FF );
675 }
676 else if ( roundingPrecision == 32 ) {
677 roundIncrement = LIT64( 0x0000008000000000 );
678 roundMask = LIT64( 0x000000FFFFFFFFFF );
679 }
680 else {
681 goto precision80;
682 }
683 zSig0 |= ( zSig1 != 0 );
684 if ( ! roundNearestEven ) {
685 if ( roundingMode == float_round_to_zero ) {
686 roundIncrement = 0;
687 }
688 else {
689 roundIncrement = roundMask;
690 if ( zSign ) {
691 if ( roundingMode == float_round_up ) roundIncrement = 0;
692 }
693 else {
694 if ( roundingMode == float_round_down ) roundIncrement = 0;
695 }
696 }
697 }
698 roundBits = zSig0 & roundMask;
699 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
700 if ( ( 0x7FFE < zExp )
701 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
702 ) {
703 goto overflow;
704 }
705 if ( zExp <= 0 ) {
706 isTiny =
707 ( float_detect_tininess == float_tininess_before_rounding )
708 || ( zExp < 0 )
709 || ( zSig0 <= zSig0 + roundIncrement );
710 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
711 zExp = 0;
712 roundBits = zSig0 & roundMask;
713 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
714 if ( roundBits ) set_float_exception_inexact_flag();
715 zSig0 += roundIncrement;
716 if ( (sbits64) zSig0 < 0 ) zExp = 1;
717 roundIncrement = roundMask + 1;
718 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
719 roundMask |= roundIncrement;
720 }
721 zSig0 &= ~ roundMask;
722 return packFloatx80( zSign, zExp, zSig0 );
723 }
724 }
725 if ( roundBits ) set_float_exception_inexact_flag();
726 zSig0 += roundIncrement;
727 if ( zSig0 < roundIncrement ) {
728 ++zExp;
729 zSig0 = LIT64( 0x8000000000000000 );
730 }
731 roundIncrement = roundMask + 1;
732 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
733 roundMask |= roundIncrement;
734 }
735 zSig0 &= ~ roundMask;
736 if ( zSig0 == 0 ) zExp = 0;
737 return packFloatx80( zSign, zExp, zSig0 );
738 precision80:
739 increment = ( (sbits64) zSig1 < 0 );
740 if ( ! roundNearestEven ) {
741 if ( roundingMode == float_round_to_zero ) {
742 increment = 0;
743 }
744 else {
745 if ( zSign ) {
746 increment = ( roundingMode == float_round_down ) && zSig1;
747 }
748 else {
749 increment = ( roundingMode == float_round_up ) && zSig1;
750 }
751 }
752 }
753 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
754 if ( ( 0x7FFE < zExp )
755 || ( ( zExp == 0x7FFE )
756 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
757 && increment
758 )
759 ) {
760 roundMask = 0;
761 overflow:
762 float_raise( float_flag_overflow | float_flag_inexact );
763 if ( ( roundingMode == float_round_to_zero )
764 || ( zSign && ( roundingMode == float_round_up ) )
765 || ( ! zSign && ( roundingMode == float_round_down ) )
766 ) {
767 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
768 }
769 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
770 }
771 if ( zExp <= 0 ) {
772 isTiny =
773 ( float_detect_tininess == float_tininess_before_rounding )
774 || ( zExp < 0 )
775 || ! increment
776 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
777 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
778 zExp = 0;
779 if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
780 if ( zSig1 ) set_float_exception_inexact_flag();
781 if ( roundNearestEven ) {
782 increment = ( (sbits64) zSig1 < 0 );
783 }
784 else {
785 if ( zSign ) {
786 increment = ( roundingMode == float_round_down ) && zSig1;
787 }
788 else {
789 increment = ( roundingMode == float_round_up ) && zSig1;
790 }
791 }
792 if ( increment ) {
793 ++zSig0;
794 zSig0 &=
795 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
796 if ( (sbits64) zSig0 < 0 ) zExp = 1;
797 }
798 return packFloatx80( zSign, zExp, zSig0 );
799 }
800 }
801 if ( zSig1 ) set_float_exception_inexact_flag();
802 if ( increment ) {
803 ++zSig0;
804 if ( zSig0 == 0 ) {
805 ++zExp;
806 zSig0 = LIT64( 0x8000000000000000 );
807 }
808 else {
809 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
810 }
811 }
812 else {
813 if ( zSig0 == 0 ) zExp = 0;
814 }
815 return packFloatx80( zSign, zExp, zSig0 );
816
817 }
818
819 /*
820 -------------------------------------------------------------------------------
821 Takes an abstract floating-point value having sign `zSign', exponent
822 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
823 and returns the proper extended double-precision floating-point value
824 corresponding to the abstract input. This routine is just like
825 `roundAndPackFloatx80' except that the input significand does not have to be
826 normalized.
827 -------------------------------------------------------------------------------
828 */
829 static floatx80
830 normalizeRoundAndPackFloatx80(
831 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
832 )
833 {
834 int8 shiftCount;
835
836 if ( zSig0 == 0 ) {
837 zSig0 = zSig1;
838 zSig1 = 0;
839 zExp -= 64;
840 }
841 shiftCount = countLeadingZeros64( zSig0 );
842 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
843 zExp -= shiftCount;
844 return
845 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
846
847 }
848
849 #endif
850
851 #ifdef FLOAT128
852
853 /*
854 -------------------------------------------------------------------------------
855 Returns the least-significant 64 fraction bits of the quadruple-precision
856 floating-point value `a'.
857 -------------------------------------------------------------------------------
858 */
859 INLINE bits64 extractFloat128Frac1( float128 a )
860 {
861
862 return a.low;
863
864 }
865
866 /*
867 -------------------------------------------------------------------------------
868 Returns the most-significant 48 fraction bits of the quadruple-precision
869 floating-point value `a'.
870 -------------------------------------------------------------------------------
871 */
872 INLINE bits64 extractFloat128Frac0( float128 a )
873 {
874
875 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
876
877 }
878
879 /*
880 -------------------------------------------------------------------------------
881 Returns the exponent bits of the quadruple-precision floating-point value
882 `a'.
883 -------------------------------------------------------------------------------
884 */
885 INLINE int32 extractFloat128Exp( float128 a )
886 {
887
888 return (int32)((a.high >> 48) & 0x7FFF);
889
890 }
891
892 /*
893 -------------------------------------------------------------------------------
894 Returns the sign bit of the quadruple-precision floating-point value `a'.
895 -------------------------------------------------------------------------------
896 */
897 INLINE flag extractFloat128Sign( float128 a )
898 {
899
900 return (flag)(a.high >> 63);
901
902 }
903
904 /*
905 -------------------------------------------------------------------------------
906 Normalizes the subnormal quadruple-precision floating-point value
907 represented by the denormalized significand formed by the concatenation of
908 `aSig0' and `aSig1'. The normalized exponent is stored at the location
909 pointed to by `zExpPtr'. The most significant 49 bits of the normalized
910 significand are stored at the location pointed to by `zSig0Ptr', and the
911 least significant 64 bits of the normalized significand are stored at the
912 location pointed to by `zSig1Ptr'.
913 -------------------------------------------------------------------------------
914 */
915 static void
916 normalizeFloat128Subnormal(
917 bits64 aSig0,
918 bits64 aSig1,
919 int32 *zExpPtr,
920 bits64 *zSig0Ptr,
921 bits64 *zSig1Ptr
922 )
923 {
924 int8 shiftCount;
925
926 if ( aSig0 == 0 ) {
927 shiftCount = countLeadingZeros64( aSig1 ) - 15;
928 if ( shiftCount < 0 ) {
929 *zSig0Ptr = aSig1>>( - shiftCount );
930 *zSig1Ptr = aSig1<<( shiftCount & 63 );
931 }
932 else {
933 *zSig0Ptr = aSig1<<shiftCount;
934 *zSig1Ptr = 0;
935 }
936 *zExpPtr = - shiftCount - 63;
937 }
938 else {
939 shiftCount = countLeadingZeros64( aSig0 ) - 15;
940 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
941 *zExpPtr = 1 - shiftCount;
942 }
943
944 }
945
946 /*
947 -------------------------------------------------------------------------------
948 Packs the sign `zSign', the exponent `zExp', and the significand formed
949 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
950 floating-point value, returning the result. After being shifted into the
951 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
952 added together to form the most significant 32 bits of the result. This
953 means that any integer portion of `zSig0' will be added into the exponent.
954 Since a properly normalized significand will have an integer portion equal
955 to 1, the `zExp' input should be 1 less than the desired result exponent
956 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
957 significand.
958 -------------------------------------------------------------------------------
959 */
960 INLINE float128
961 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
962 {
963 float128 z;
964
965 z.low = zSig1;
966 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
967 return z;
968
969 }
970
971 /*
972 -------------------------------------------------------------------------------
973 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
974 and extended significand formed by the concatenation of `zSig0', `zSig1',
975 and `zSig2', and returns the proper quadruple-precision floating-point value
976 corresponding to the abstract input. Ordinarily, the abstract value is
977 simply rounded and packed into the quadruple-precision format, with the
978 inexact exception raised if the abstract input cannot be represented
979 exactly. However, if the abstract value is too large, the overflow and
980 inexact exceptions are raised and an infinity or maximal finite value is
981 returned. If the abstract value is too small, the input value is rounded to
982 a subnormal number, and the underflow and inexact exceptions are raised if
983 the abstract input cannot be represented exactly as a subnormal quadruple-
984 precision floating-point number.
985 The input significand must be normalized or smaller. If the input
986 significand is not normalized, `zExp' must be 0; in that case, the result
987 returned is a subnormal number, and it must not require rounding. In the
988 usual case that the input significand is normalized, `zExp' must be 1 less
989 than the ``true'' floating-point exponent. The handling of underflow and
990 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
991 -------------------------------------------------------------------------------
992 */
993 static float128
994 roundAndPackFloat128(
995 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
996 {
997 int8 roundingMode;
998 flag roundNearestEven, increment, isTiny;
999
1000 roundingMode = float_rounding_mode;
1001 roundNearestEven = ( roundingMode == float_round_nearest_even );
1002 increment = ( (sbits64) zSig2 < 0 );
1003 if ( ! roundNearestEven ) {
1004 if ( roundingMode == float_round_to_zero ) {
1005 increment = 0;
1006 }
1007 else {
1008 if ( zSign ) {
1009 increment = ( roundingMode == float_round_down ) && zSig2;
1010 }
1011 else {
1012 increment = ( roundingMode == float_round_up ) && zSig2;
1013 }
1014 }
1015 }
1016 if ( 0x7FFD <= (bits32) zExp ) {
1017 if ( ( 0x7FFD < zExp )
1018 || ( ( zExp == 0x7FFD )
1019 && eq128(
1020 LIT64( 0x0001FFFFFFFFFFFF ),
1021 LIT64( 0xFFFFFFFFFFFFFFFF ),
1022 zSig0,
1023 zSig1
1024 )
1025 && increment
1026 )
1027 ) {
1028 float_raise( float_flag_overflow | float_flag_inexact );
1029 if ( ( roundingMode == float_round_to_zero )
1030 || ( zSign && ( roundingMode == float_round_up ) )
1031 || ( ! zSign && ( roundingMode == float_round_down ) )
1032 ) {
1033 return
1034 packFloat128(
1035 zSign,
1036 0x7FFE,
1037 LIT64( 0x0000FFFFFFFFFFFF ),
1038 LIT64( 0xFFFFFFFFFFFFFFFF )
1039 );
1040 }
1041 return packFloat128( zSign, 0x7FFF, 0, 0 );
1042 }
1043 if ( zExp < 0 ) {
1044 isTiny =
1045 ( float_detect_tininess == float_tininess_before_rounding )
1046 || ( zExp < -1 )
1047 || ! increment
1048 || lt128(
1049 zSig0,
1050 zSig1,
1051 LIT64( 0x0001FFFFFFFFFFFF ),
1052 LIT64( 0xFFFFFFFFFFFFFFFF )
1053 );
1054 shift128ExtraRightJamming(
1055 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1056 zExp = 0;
1057 if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1058 if ( roundNearestEven ) {
1059 increment = ( (sbits64) zSig2 < 0 );
1060 }
1061 else {
1062 if ( zSign ) {
1063 increment = ( roundingMode == float_round_down ) && zSig2;
1064 }
1065 else {
1066 increment = ( roundingMode == float_round_up ) && zSig2;
1067 }
1068 }
1069 }
1070 }
1071 if ( zSig2 ) set_float_exception_inexact_flag();
1072 if ( increment ) {
1073 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1074 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1075 }
1076 else {
1077 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1078 }
1079 return packFloat128( zSign, zExp, zSig0, zSig1 );
1080
1081 }
1082
1083 /*
1084 -------------------------------------------------------------------------------
1085 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1086 and significand formed by the concatenation of `zSig0' and `zSig1', and
1087 returns the proper quadruple-precision floating-point value corresponding
1088 to the abstract input. This routine is just like `roundAndPackFloat128'
1089 except that the input significand has fewer bits and does not have to be
1090 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1091 point exponent.
1092 -------------------------------------------------------------------------------
1093 */
1094 static float128
1095 normalizeRoundAndPackFloat128(
1096 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1097 {
1098 int8 shiftCount;
1099 bits64 zSig2;
1100
1101 if ( zSig0 == 0 ) {
1102 zSig0 = zSig1;
1103 zSig1 = 0;
1104 zExp -= 64;
1105 }
1106 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1107 if ( 0 <= shiftCount ) {
1108 zSig2 = 0;
1109 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1110 }
1111 else {
1112 shift128ExtraRightJamming(
1113 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1114 }
1115 zExp -= shiftCount;
1116 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1117
1118 }
1119
1120 #endif
1121
1122 /*
1123 -------------------------------------------------------------------------------
1124 Returns the result of converting the 32-bit two's complement integer `a'
1125 to the single-precision floating-point format. The conversion is performed
1126 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1127 -------------------------------------------------------------------------------
1128 */
1129 float32 int32_to_float32( int32 a )
1130 {
1131 flag zSign;
1132
1133 if ( a == 0 ) return 0;
1134 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1135 zSign = ( a < 0 );
1136 return normalizeRoundAndPackFloat32(zSign, 0x9C, (uint32)(zSign ? - a : a));
1137
1138 }
1139
1140 float32 uint32_to_float32( uint32 a )
1141 {
1142 if ( a == 0 ) return 0;
1143 if ( a & (bits32) 0x80000000 )
1144 return normalizeRoundAndPackFloat32( 0, 0x9D, a >> 1 );
1145 return normalizeRoundAndPackFloat32( 0, 0x9C, a );
1146 }
1147
1148
1149 /*
1150 -------------------------------------------------------------------------------
1151 Returns the result of converting the 32-bit two's complement integer `a'
1152 to the double-precision floating-point format. The conversion is performed
1153 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1154 -------------------------------------------------------------------------------
1155 */
1156 float64 int32_to_float64( int32 a )
1157 {
1158 flag zSign;
1159 uint32 absA;
1160 int8 shiftCount;
1161 bits64 zSig;
1162
1163 if ( a == 0 ) return 0;
1164 zSign = ( a < 0 );
1165 absA = zSign ? - a : a;
1166 shiftCount = countLeadingZeros32( absA ) + 21;
1167 zSig = absA;
1168 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1169
1170 }
1171
1172 float64 uint32_to_float64( uint32 a )
1173 {
1174 int8 shiftCount;
1175 bits64 zSig = a;
1176
1177 if ( a == 0 ) return 0;
1178 shiftCount = countLeadingZeros32( a ) + 21;
1179 return packFloat64( 0, 0x432 - shiftCount, zSig<<shiftCount );
1180
1181 }
1182
1183 #ifdef FLOATX80
1184
1185 /*
1186 -------------------------------------------------------------------------------
1187 Returns the result of converting the 32-bit two's complement integer `a'
1188 to the extended double-precision floating-point format. The conversion
1189 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1190 Arithmetic.
1191 -------------------------------------------------------------------------------
1192 */
1193 floatx80 int32_to_floatx80( int32 a )
1194 {
1195 flag zSign;
1196 uint32 absA;
1197 int8 shiftCount;
1198 bits64 zSig;
1199
1200 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1201 zSign = ( a < 0 );
1202 absA = zSign ? - a : a;
1203 shiftCount = countLeadingZeros32( absA ) + 32;
1204 zSig = absA;
1205 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1206
1207 }
1208
1209 floatx80 uint32_to_floatx80( uint32 a )
1210 {
1211 int8 shiftCount;
1212 bits64 zSig = a;
1213
1214 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1215 shiftCount = countLeadingZeros32( a ) + 32;
1216 return packFloatx80( 0, 0x403E - shiftCount, zSig<<shiftCount );
1217
1218 }
1219
1220 #endif
1221
1222 #ifdef FLOAT128
1223
1224 /*
1225 -------------------------------------------------------------------------------
1226 Returns the result of converting the 32-bit two's complement integer `a' to
1227 the quadruple-precision floating-point format. The conversion is performed
1228 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1229 -------------------------------------------------------------------------------
1230 */
1231 float128 int32_to_float128( int32 a )
1232 {
1233 flag zSign;
1234 uint32 absA;
1235 int8 shiftCount;
1236 bits64 zSig0;
1237
1238 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1239 zSign = ( a < 0 );
1240 absA = zSign ? - a : a;
1241 shiftCount = countLeadingZeros32( absA ) + 17;
1242 zSig0 = absA;
1243 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1244
1245 }
1246
1247 float128 uint32_to_float128( uint32 a )
1248 {
1249 int8 shiftCount;
1250 bits64 zSig0 = a;
1251
1252 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1253 shiftCount = countLeadingZeros32( a ) + 17;
1254 return packFloat128( 0, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1255
1256 }
1257
1258 #endif
1259
1260 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1261 /*
1262 -------------------------------------------------------------------------------
1263 Returns the result of converting the 64-bit two's complement integer `a'
1264 to the single-precision floating-point format. The conversion is performed
1265 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1266 -------------------------------------------------------------------------------
1267 */
1268 float32 int64_to_float32( int64 a )
1269 {
1270 flag zSign;
1271 uint64 absA;
1272 int8 shiftCount;
1273
1274 if ( a == 0 ) return 0;
1275 zSign = ( a < 0 );
1276 absA = zSign ? - a : a;
1277 shiftCount = countLeadingZeros64( absA ) - 40;
1278 if ( 0 <= shiftCount ) {
1279 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1280 }
1281 else {
1282 shiftCount += 7;
1283 if ( shiftCount < 0 ) {
1284 shift64RightJamming( absA, - shiftCount, &absA );
1285 }
1286 else {
1287 absA <<= shiftCount;
1288 }
1289 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1290 }
1291
1292 }
1293
1294 /*
1295 -------------------------------------------------------------------------------
1296 Returns the result of converting the 64-bit two's complement integer `a'
1297 to the double-precision floating-point format. The conversion is performed
1298 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1299 -------------------------------------------------------------------------------
1300 */
1301 float64 int64_to_float64( int64 a )
1302 {
1303 flag zSign;
1304
1305 if ( a == 0 ) return 0;
1306 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1307 return packFloat64( 1, 0x43E, 0 );
1308 }
1309 zSign = ( a < 0 );
1310 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1311
1312 }
1313
1314 #ifdef FLOATX80
1315
1316 /*
1317 -------------------------------------------------------------------------------
1318 Returns the result of converting the 64-bit two's complement integer `a'
1319 to the extended double-precision floating-point format. The conversion
1320 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1321 Arithmetic.
1322 -------------------------------------------------------------------------------
1323 */
1324 floatx80 int64_to_floatx80( int64 a )
1325 {
1326 flag zSign;
1327 uint64 absA;
1328 int8 shiftCount;
1329
1330 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1331 zSign = ( a < 0 );
1332 absA = zSign ? - a : a;
1333 shiftCount = countLeadingZeros64( absA );
1334 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1335
1336 }
1337
1338 #endif
1339
1340 #endif /* !SOFTFLOAT_FOR_GCC */
1341
1342 #ifdef FLOAT128
1343
1344 /*
1345 -------------------------------------------------------------------------------
1346 Returns the result of converting the 64-bit two's complement integer `a' to
1347 the quadruple-precision floating-point format. The conversion is performed
1348 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1349 -------------------------------------------------------------------------------
1350 */
1351 float128 int64_to_float128( int64 a )
1352 {
1353 flag zSign;
1354 uint64 absA;
1355 int8 shiftCount;
1356 int32 zExp;
1357 bits64 zSig0, zSig1;
1358
1359 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1360 zSign = ( a < 0 );
1361 absA = zSign ? - a : a;
1362 shiftCount = countLeadingZeros64( absA ) + 49;
1363 zExp = 0x406E - shiftCount;
1364 if ( 64 <= shiftCount ) {
1365 zSig1 = 0;
1366 zSig0 = absA;
1367 shiftCount -= 64;
1368 }
1369 else {
1370 zSig1 = absA;
1371 zSig0 = 0;
1372 }
1373 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1374 return packFloat128( zSign, zExp, zSig0, zSig1 );
1375
1376 }
1377
1378 #endif
1379
1380 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1381 /*
1382 -------------------------------------------------------------------------------
1383 Returns the result of converting the single-precision floating-point value
1384 `a' to the 32-bit two's complement integer format. The conversion is
1385 performed according to the IEC/IEEE Standard for Binary Floating-Point
1386 Arithmetic---which means in particular that the conversion is rounded
1387 according to the current rounding mode. If `a' is a NaN, the largest
1388 positive integer is returned. Otherwise, if the conversion overflows, the
1389 largest integer with the same sign as `a' is returned.
1390 -------------------------------------------------------------------------------
1391 */
1392 int32 float32_to_int32( float32 a )
1393 {
1394 flag aSign;
1395 int16 aExp, shiftCount;
1396 bits32 aSig;
1397 bits64 aSig64;
1398
1399 aSig = extractFloat32Frac( a );
1400 aExp = extractFloat32Exp( a );
1401 aSign = extractFloat32Sign( a );
1402 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1403 if ( aExp ) aSig |= 0x00800000;
1404 shiftCount = 0xAF - aExp;
1405 aSig64 = aSig;
1406 aSig64 <<= 32;
1407 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1408 return roundAndPackInt32( aSign, aSig64 );
1409
1410 }
1411 #endif /* !SOFTFLOAT_FOR_GCC */
1412
1413 /*
1414 -------------------------------------------------------------------------------
1415 Returns the result of converting the single-precision floating-point value
1416 `a' to the 32-bit two's complement integer format. The conversion is
1417 performed according to the IEC/IEEE Standard for Binary Floating-Point
1418 Arithmetic, except that the conversion is always rounded toward zero.
1419 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1420 the conversion overflows, the largest integer with the same sign as `a' is
1421 returned.
1422 -------------------------------------------------------------------------------
1423 */
1424 int32 float32_to_int32_round_to_zero( float32 a )
1425 {
1426 flag aSign;
1427 int16 aExp, shiftCount;
1428 bits32 aSig;
1429 int32 z;
1430
1431 aSig = extractFloat32Frac( a );
1432 aExp = extractFloat32Exp( a );
1433 aSign = extractFloat32Sign( a );
1434 shiftCount = aExp - 0x9E;
1435 if ( 0 <= shiftCount ) {
1436 if ( a != 0xCF000000 ) {
1437 float_raise( float_flag_invalid );
1438 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1439 }
1440 return (sbits32) 0x80000000;
1441 }
1442 else if ( aExp <= 0x7E ) {
1443 if ( aExp | aSig ) set_float_exception_inexact_flag();
1444 return 0;
1445 }
1446 aSig = ( aSig | 0x00800000 )<<8;
1447 z = aSig>>( - shiftCount );
1448 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1449 set_float_exception_inexact_flag();
1450 }
1451 if ( aSign ) z = - z;
1452 return z;
1453
1454 }
1455
1456 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1457 /*
1458 -------------------------------------------------------------------------------
1459 Returns the result of converting the single-precision floating-point value
1460 `a' to the 64-bit two's complement integer format. The conversion is
1461 performed according to the IEC/IEEE Standard for Binary Floating-Point
1462 Arithmetic---which means in particular that the conversion is rounded
1463 according to the current rounding mode. If `a' is a NaN, the largest
1464 positive integer is returned. Otherwise, if the conversion overflows, the
1465 largest integer with the same sign as `a' is returned.
1466 -------------------------------------------------------------------------------
1467 */
1468 int64 float32_to_int64( float32 a )
1469 {
1470 flag aSign;
1471 int16 aExp, shiftCount;
1472 bits32 aSig;
1473 bits64 aSig64, aSigExtra;
1474
1475 aSig = extractFloat32Frac( a );
1476 aExp = extractFloat32Exp( a );
1477 aSign = extractFloat32Sign( a );
1478 shiftCount = 0xBE - aExp;
1479 if ( shiftCount < 0 ) {
1480 float_raise( float_flag_invalid );
1481 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1482 return LIT64( 0x7FFFFFFFFFFFFFFF );
1483 }
1484 return (sbits64) LIT64( 0x8000000000000000 );
1485 }
1486 if ( aExp ) aSig |= 0x00800000;
1487 aSig64 = aSig;
1488 aSig64 <<= 40;
1489 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1490 return roundAndPackInt64( aSign, aSig64, aSigExtra );
1491
1492 }
1493
1494 /*
1495 -------------------------------------------------------------------------------
1496 Returns the result of converting the single-precision floating-point value
1497 `a' to the 64-bit two's complement integer format. The conversion is
1498 performed according to the IEC/IEEE Standard for Binary Floating-Point
1499 Arithmetic, except that the conversion is always rounded toward zero. If
1500 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1501 conversion overflows, the largest integer with the same sign as `a' is
1502 returned.
1503 -------------------------------------------------------------------------------
1504 */
1505 int64 float32_to_int64_round_to_zero( float32 a )
1506 {
1507 flag aSign;
1508 int16 aExp, shiftCount;
1509 bits32 aSig;
1510 bits64 aSig64;
1511 int64 z;
1512
1513 aSig = extractFloat32Frac( a );
1514 aExp = extractFloat32Exp( a );
1515 aSign = extractFloat32Sign( a );
1516 shiftCount = aExp - 0xBE;
1517 if ( 0 <= shiftCount ) {
1518 if ( a != 0xDF000000 ) {
1519 float_raise( float_flag_invalid );
1520 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1521 return LIT64( 0x7FFFFFFFFFFFFFFF );
1522 }
1523 }
1524 return (sbits64) LIT64( 0x8000000000000000 );
1525 }
1526 else if ( aExp <= 0x7E ) {
1527 if ( aExp | aSig ) set_float_exception_inexact_flag();
1528 return 0;
1529 }
1530 aSig64 = aSig | 0x00800000;
1531 aSig64 <<= 40;
1532 z = aSig64>>( - shiftCount );
1533 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1534 set_float_exception_inexact_flag();
1535 }
1536 if ( aSign ) z = - z;
1537 return z;
1538
1539 }
1540 #endif /* !SOFTFLOAT_FOR_GCC */
1541
1542 /*
1543 -------------------------------------------------------------------------------
1544 Returns the result of converting the single-precision floating-point value
1545 `a' to the double-precision floating-point format. The conversion is
1546 performed according to the IEC/IEEE Standard for Binary Floating-Point
1547 Arithmetic.
1548 -------------------------------------------------------------------------------
1549 */
1550 float64 float32_to_float64( float32 a )
1551 {
1552 flag aSign;
1553 int16 aExp;
1554 bits32 aSig;
1555
1556 aSig = extractFloat32Frac( a );
1557 aExp = extractFloat32Exp( a );
1558 aSign = extractFloat32Sign( a );
1559 if ( aExp == 0xFF ) {
1560 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1561 return packFloat64( aSign, 0x7FF, 0 );
1562 }
1563 if ( aExp == 0 ) {
1564 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1565 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1566 --aExp;
1567 }
1568 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1569
1570 }
1571
1572 #ifdef FLOATX80
1573
1574 /*
1575 -------------------------------------------------------------------------------
1576 Returns the result of converting the single-precision floating-point value
1577 `a' to the extended double-precision floating-point format. The conversion
1578 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1579 Arithmetic.
1580 -------------------------------------------------------------------------------
1581 */
1582 floatx80 float32_to_floatx80( float32 a )
1583 {
1584 flag aSign;
1585 int16 aExp;
1586 bits32 aSig;
1587
1588 aSig = extractFloat32Frac( a );
1589 aExp = extractFloat32Exp( a );
1590 aSign = extractFloat32Sign( a );
1591 if ( aExp == 0xFF ) {
1592 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1593 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1594 }
1595 if ( aExp == 0 ) {
1596 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1597 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1598 }
1599 aSig |= 0x00800000;
1600 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1601
1602 }
1603
1604 #endif
1605
1606 #ifdef FLOAT128
1607
1608 /*
1609 -------------------------------------------------------------------------------
1610 Returns the result of converting the single-precision floating-point value
1611 `a' to the double-precision floating-point format. The conversion is
1612 performed according to the IEC/IEEE Standard for Binary Floating-Point
1613 Arithmetic.
1614 -------------------------------------------------------------------------------
1615 */
1616 float128 float32_to_float128( float32 a )
1617 {
1618 flag aSign;
1619 int16 aExp;
1620 bits32 aSig;
1621
1622 aSig = extractFloat32Frac( a );
1623 aExp = extractFloat32Exp( a );
1624 aSign = extractFloat32Sign( a );
1625 if ( aExp == 0xFF ) {
1626 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1627 return packFloat128( aSign, 0x7FFF, 0, 0 );
1628 }
1629 if ( aExp == 0 ) {
1630 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1631 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1632 --aExp;
1633 }
1634 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1635
1636 }
1637
1638 #endif
1639
1640 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1641 /*
1642 -------------------------------------------------------------------------------
1643 Rounds the single-precision floating-point value `a' to an integer, and
1644 returns the result as a single-precision floating-point value. The
1645 operation is performed according to the IEC/IEEE Standard for Binary
1646 Floating-Point Arithmetic.
1647 -------------------------------------------------------------------------------
1648 */
1649 float32 float32_round_to_int( float32 a )
1650 {
1651 flag aSign;
1652 int16 aExp;
1653 bits32 lastBitMask, roundBitsMask;
1654 int8 roundingMode;
1655 float32 z;
1656
1657 aExp = extractFloat32Exp( a );
1658 if ( 0x96 <= aExp ) {
1659 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1660 return propagateFloat32NaN( a, a );
1661 }
1662 return a;
1663 }
1664 if ( aExp <= 0x7E ) {
1665 if ( (bits32) ( a<<1 ) == 0 ) return a;
1666 set_float_exception_inexact_flag();
1667 aSign = extractFloat32Sign( a );
1668 switch ( float_rounding_mode ) {
1669 case float_round_nearest_even:
1670 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1671 return packFloat32( aSign, 0x7F, 0 );
1672 }
1673 break;
1674 case float_round_to_zero:
1675 break;
1676 case float_round_down:
1677 return aSign ? 0xBF800000 : 0;
1678 case float_round_up:
1679 return aSign ? 0x80000000 : 0x3F800000;
1680 }
1681 return packFloat32( aSign, 0, 0 );
1682 }
1683 lastBitMask = 1;
1684 lastBitMask <<= 0x96 - aExp;
1685 roundBitsMask = lastBitMask - 1;
1686 z = a;
1687 roundingMode = float_rounding_mode;
1688 if ( roundingMode == float_round_nearest_even ) {
1689 z += lastBitMask>>1;
1690 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1691 }
1692 else if ( roundingMode != float_round_to_zero ) {
1693 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1694 z += roundBitsMask;
1695 }
1696 }
1697 z &= ~ roundBitsMask;
1698 if ( z != a ) set_float_exception_inexact_flag();
1699 return z;
1700
1701 }
1702 #endif /* !SOFTFLOAT_FOR_GCC */
1703
1704 /*
1705 -------------------------------------------------------------------------------
1706 Returns the result of adding the absolute values of the single-precision
1707 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1708 before being returned. `zSign' is ignored if the result is a NaN.
1709 The addition is performed according to the IEC/IEEE Standard for Binary
1710 Floating-Point Arithmetic.
1711 -------------------------------------------------------------------------------
1712 */
1713 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1714 {
1715 int16 aExp, bExp, zExp;
1716 bits32 aSig, bSig, zSig;
1717 int16 expDiff;
1718
1719 aSig = extractFloat32Frac( a );
1720 aExp = extractFloat32Exp( a );
1721 bSig = extractFloat32Frac( b );
1722 bExp = extractFloat32Exp( b );
1723 expDiff = aExp - bExp;
1724 aSig <<= 6;
1725 bSig <<= 6;
1726 if ( 0 < expDiff ) {
1727 if ( aExp == 0xFF ) {
1728 if ( aSig ) return propagateFloat32NaN( a, b );
1729 return a;
1730 }
1731 if ( bExp == 0 ) {
1732 --expDiff;
1733 }
1734 else {
1735 bSig |= 0x20000000;
1736 }
1737 shift32RightJamming( bSig, expDiff, &bSig );
1738 zExp = aExp;
1739 }
1740 else if ( expDiff < 0 ) {
1741 if ( bExp == 0xFF ) {
1742 if ( bSig ) return propagateFloat32NaN( a, b );
1743 return packFloat32( zSign, 0xFF, 0 );
1744 }
1745 if ( aExp == 0 ) {
1746 ++expDiff;
1747 }
1748 else {
1749 aSig |= 0x20000000;
1750 }
1751 shift32RightJamming( aSig, - expDiff, &aSig );
1752 zExp = bExp;
1753 }
1754 else {
1755 if ( aExp == 0xFF ) {
1756 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1757 return a;
1758 }
1759 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1760 zSig = 0x40000000 + aSig + bSig;
1761 zExp = aExp;
1762 goto roundAndPack;
1763 }
1764 aSig |= 0x20000000;
1765 zSig = ( aSig + bSig )<<1;
1766 --zExp;
1767 if ( (sbits32) zSig < 0 ) {
1768 zSig = aSig + bSig;
1769 ++zExp;
1770 }
1771 roundAndPack:
1772 return roundAndPackFloat32( zSign, zExp, zSig );
1773
1774 }
1775
1776 /*
1777 -------------------------------------------------------------------------------
1778 Returns the result of subtracting the absolute values of the single-
1779 precision floating-point values `a' and `b'. If `zSign' is 1, the
1780 difference is negated before being returned. `zSign' is ignored if the
1781 result is a NaN. The subtraction is performed according to the IEC/IEEE
1782 Standard for Binary Floating-Point Arithmetic.
1783 -------------------------------------------------------------------------------
1784 */
1785 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1786 {
1787 int16 aExp, bExp, zExp;
1788 bits32 aSig, bSig, zSig;
1789 int16 expDiff;
1790
1791 aSig = extractFloat32Frac( a );
1792 aExp = extractFloat32Exp( a );
1793 bSig = extractFloat32Frac( b );
1794 bExp = extractFloat32Exp( b );
1795 expDiff = aExp - bExp;
1796 aSig <<= 7;
1797 bSig <<= 7;
1798 if ( 0 < expDiff ) goto aExpBigger;
1799 if ( expDiff < 0 ) goto bExpBigger;
1800 if ( aExp == 0xFF ) {
1801 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1802 float_raise( float_flag_invalid );
1803 return float32_default_nan;
1804 }
1805 if ( aExp == 0 ) {
1806 aExp = 1;
1807 bExp = 1;
1808 }
1809 if ( bSig < aSig ) goto aBigger;
1810 if ( aSig < bSig ) goto bBigger;
1811 return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1812 bExpBigger:
1813 if ( bExp == 0xFF ) {
1814 if ( bSig ) return propagateFloat32NaN( a, b );
1815 return packFloat32( zSign ^ 1, 0xFF, 0 );
1816 }
1817 if ( aExp == 0 ) {
1818 ++expDiff;
1819 }
1820 else {
1821 aSig |= 0x40000000;
1822 }
1823 shift32RightJamming( aSig, - expDiff, &aSig );
1824 bSig |= 0x40000000;
1825 bBigger:
1826 zSig = bSig - aSig;
1827 zExp = bExp;
1828 zSign ^= 1;
1829 goto normalizeRoundAndPack;
1830 aExpBigger:
1831 if ( aExp == 0xFF ) {
1832 if ( aSig ) return propagateFloat32NaN( a, b );
1833 return a;
1834 }
1835 if ( bExp == 0 ) {
1836 --expDiff;
1837 }
1838 else {
1839 bSig |= 0x40000000;
1840 }
1841 shift32RightJamming( bSig, expDiff, &bSig );
1842 aSig |= 0x40000000;
1843 aBigger:
1844 zSig = aSig - bSig;
1845 zExp = aExp;
1846 normalizeRoundAndPack:
1847 --zExp;
1848 return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1849
1850 }
1851
1852 /*
1853 -------------------------------------------------------------------------------
1854 Returns the result of adding the single-precision floating-point values `a'
1855 and `b'. The operation is performed according to the IEC/IEEE Standard for
1856 Binary Floating-Point Arithmetic.
1857 -------------------------------------------------------------------------------
1858 */
1859 float32 float32_add( float32 a, float32 b )
1860 {
1861 flag aSign, bSign;
1862
1863 aSign = extractFloat32Sign( a );
1864 bSign = extractFloat32Sign( b );
1865 if ( aSign == bSign ) {
1866 return addFloat32Sigs( a, b, aSign );
1867 }
1868 else {
1869 return subFloat32Sigs( a, b, aSign );
1870 }
1871
1872 }
1873
1874 /*
1875 -------------------------------------------------------------------------------
1876 Returns the result of subtracting the single-precision floating-point values
1877 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1878 for Binary Floating-Point Arithmetic.
1879 -------------------------------------------------------------------------------
1880 */
1881 float32 float32_sub( float32 a, float32 b )
1882 {
1883 flag aSign, bSign;
1884
1885 aSign = extractFloat32Sign( a );
1886 bSign = extractFloat32Sign( b );
1887 if ( aSign == bSign ) {
1888 return subFloat32Sigs( a, b, aSign );
1889 }
1890 else {
1891 return addFloat32Sigs( a, b, aSign );
1892 }
1893
1894 }
1895
1896 /*
1897 -------------------------------------------------------------------------------
1898 Returns the result of multiplying the single-precision floating-point values
1899 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1900 for Binary Floating-Point Arithmetic.
1901 -------------------------------------------------------------------------------
1902 */
1903 float32 float32_mul( float32 a, float32 b )
1904 {
1905 flag aSign, bSign, zSign;
1906 int16 aExp, bExp, zExp;
1907 bits32 aSig, bSig;
1908 bits64 zSig64;
1909 bits32 zSig;
1910
1911 aSig = extractFloat32Frac( a );
1912 aExp = extractFloat32Exp( a );
1913 aSign = extractFloat32Sign( a );
1914 bSig = extractFloat32Frac( b );
1915 bExp = extractFloat32Exp( b );
1916 bSign = extractFloat32Sign( b );
1917 zSign = aSign ^ bSign;
1918 if ( aExp == 0xFF ) {
1919 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1920 return propagateFloat32NaN( a, b );
1921 }
1922 if ( ( bExp | bSig ) == 0 ) {
1923 float_raise( float_flag_invalid );
1924 return float32_default_nan;
1925 }
1926 return packFloat32( zSign, 0xFF, 0 );
1927 }
1928 if ( bExp == 0xFF ) {
1929 if ( bSig ) return propagateFloat32NaN( a, b );
1930 if ( ( aExp | aSig ) == 0 ) {
1931 float_raise( float_flag_invalid );
1932 return float32_default_nan;
1933 }
1934 return packFloat32( zSign, 0xFF, 0 );
1935 }
1936 if ( aExp == 0 ) {
1937 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1938 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1939 }
1940 if ( bExp == 0 ) {
1941 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1942 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1943 }
1944 zExp = aExp + bExp - 0x7F;
1945 aSig = ( aSig | 0x00800000 )<<7;
1946 bSig = ( bSig | 0x00800000 )<<8;
1947 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1948 zSig = (bits32)zSig64;
1949 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1950 zSig <<= 1;
1951 --zExp;
1952 }
1953 return roundAndPackFloat32( zSign, zExp, zSig );
1954
1955 }
1956
1957 /*
1958 -------------------------------------------------------------------------------
1959 Returns the result of dividing the single-precision floating-point value `a'
1960 by the corresponding value `b'. The operation is performed according to the
1961 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1962 -------------------------------------------------------------------------------
1963 */
1964 float32 float32_div( float32 a, float32 b )
1965 {
1966 flag aSign, bSign, zSign;
1967 int16 aExp, bExp, zExp;
1968 bits32 aSig, bSig, zSig;
1969
1970 aSig = extractFloat32Frac( a );
1971 aExp = extractFloat32Exp( a );
1972 aSign = extractFloat32Sign( a );
1973 bSig = extractFloat32Frac( b );
1974 bExp = extractFloat32Exp( b );
1975 bSign = extractFloat32Sign( b );
1976 zSign = aSign ^ bSign;
1977 if ( aExp == 0xFF ) {
1978 if ( aSig ) return propagateFloat32NaN( a, b );
1979 if ( bExp == 0xFF ) {
1980 if ( bSig ) return propagateFloat32NaN( a, b );
1981 float_raise( float_flag_invalid );
1982 return float32_default_nan;
1983 }
1984 return packFloat32( zSign, 0xFF, 0 );
1985 }
1986 if ( bExp == 0xFF ) {
1987 if ( bSig ) return propagateFloat32NaN( a, b );
1988 return packFloat32( zSign, 0, 0 );
1989 }
1990 if ( bExp == 0 ) {
1991 if ( bSig == 0 ) {
1992 if ( ( aExp | aSig ) == 0 ) {
1993 float_raise( float_flag_invalid );
1994 return float32_default_nan;
1995 }
1996 float_raise( float_flag_divbyzero );
1997 return packFloat32( zSign, 0xFF, 0 );
1998 }
1999 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2000 }
2001 if ( aExp == 0 ) {
2002 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2003 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2004 }
2005 zExp = aExp - bExp + 0x7D;
2006 aSig = ( aSig | 0x00800000 )<<7;
2007 bSig = ( bSig | 0x00800000 )<<8;
2008 if ( bSig <= ( aSig + aSig ) ) {
2009 aSig >>= 1;
2010 ++zExp;
2011 }
2012 zSig = (bits32)((((bits64) aSig) << 32) / bSig);
2013 if ( ( zSig & 0x3F ) == 0 ) {
2014 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
2015 }
2016 return roundAndPackFloat32( zSign, zExp, zSig );
2017
2018 }
2019
2020 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2021 /*
2022 -------------------------------------------------------------------------------
2023 Returns the remainder of the single-precision floating-point value `a'
2024 with respect to the corresponding value `b'. The operation is performed
2025 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2026 -------------------------------------------------------------------------------
2027 */
2028 float32 float32_rem( float32 a, float32 b )
2029 {
2030 flag aSign, bSign, zSign;
2031 int16 aExp, bExp, expDiff;
2032 bits32 aSig, bSig;
2033 bits32 q;
2034 bits64 aSig64, bSig64, q64;
2035 bits32 alternateASig;
2036 sbits32 sigMean;
2037
2038 aSig = extractFloat32Frac( a );
2039 aExp = extractFloat32Exp( a );
2040 aSign = extractFloat32Sign( a );
2041 bSig = extractFloat32Frac( b );
2042 bExp = extractFloat32Exp( b );
2043 bSign = extractFloat32Sign( b );
2044 if ( aExp == 0xFF ) {
2045 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2046 return propagateFloat32NaN( a, b );
2047 }
2048 float_raise( float_flag_invalid );
2049 return float32_default_nan;
2050 }
2051 if ( bExp == 0xFF ) {
2052 if ( bSig ) return propagateFloat32NaN( a, b );
2053 return a;
2054 }
2055 if ( bExp == 0 ) {
2056 if ( bSig == 0 ) {
2057 float_raise( float_flag_invalid );
2058 return float32_default_nan;
2059 }
2060 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2061 }
2062 if ( aExp == 0 ) {
2063 if ( aSig == 0 ) return a;
2064 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2065 }
2066 expDiff = aExp - bExp;
2067 aSig |= 0x00800000;
2068 bSig |= 0x00800000;
2069 if ( expDiff < 32 ) {
2070 aSig <<= 8;
2071 bSig <<= 8;
2072 if ( expDiff < 0 ) {
2073 if ( expDiff < -1 ) return a;
2074 aSig >>= 1;
2075 }
2076 q = ( bSig <= aSig );
2077 if ( q ) aSig -= bSig;
2078 if ( 0 < expDiff ) {
2079 q = ( ( (bits64) aSig )<<32 ) / bSig;
2080 q >>= 32 - expDiff;
2081 bSig >>= 2;
2082 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2083 }
2084 else {
2085 aSig >>= 2;
2086 bSig >>= 2;
2087 }
2088 }
2089 else {
2090 if ( bSig <= aSig ) aSig -= bSig;
2091 aSig64 = ( (bits64) aSig )<<40;
2092 bSig64 = ( (bits64) bSig )<<40;
2093 expDiff -= 64;
2094 while ( 0 < expDiff ) {
2095 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2096 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2097 aSig64 = - ( ( bSig * q64 )<<38 );
2098 expDiff -= 62;
2099 }
2100 expDiff += 64;
2101 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2102 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2103 q = q64>>( 64 - expDiff );
2104 bSig <<= 6;
2105 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2106 }
2107 do {
2108 alternateASig = aSig;
2109 ++q;
2110 aSig -= bSig;
2111 } while ( 0 <= (sbits32) aSig );
2112 sigMean = aSig + alternateASig;
2113 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2114 aSig = alternateASig;
2115 }
2116 zSign = ( (sbits32) aSig < 0 );
2117 if ( zSign ) aSig = - aSig;
2118 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2119
2120 }
2121 #endif /* !SOFTFLOAT_FOR_GCC */
2122
2123 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2124 /*
2125 -------------------------------------------------------------------------------
2126 Returns the square root of the single-precision floating-point value `a'.
2127 The operation is performed according to the IEC/IEEE Standard for Binary
2128 Floating-Point Arithmetic.
2129 -------------------------------------------------------------------------------
2130 */
2131 float32 float32_sqrt( float32 a )
2132 {
2133 flag aSign;
2134 int16 aExp, zExp;
2135 bits32 aSig, zSig;
2136 bits64 rem, term;
2137
2138 aSig = extractFloat32Frac( a );
2139 aExp = extractFloat32Exp( a );
2140 aSign = extractFloat32Sign( a );
2141 if ( aExp == 0xFF ) {
2142 if ( aSig ) return propagateFloat32NaN( a, 0 );
2143 if ( ! aSign ) return a;
2144 float_raise( float_flag_invalid );
2145 return float32_default_nan;
2146 }
2147 if ( aSign ) {
2148 if ( ( aExp | aSig ) == 0 ) return a;
2149 float_raise( float_flag_invalid );
2150 return float32_default_nan;
2151 }
2152 if ( aExp == 0 ) {
2153 if ( aSig == 0 ) return 0;
2154 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2155 }
2156 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2157 aSig = ( aSig | 0x00800000 )<<8;
2158 zSig = estimateSqrt32( aExp, aSig ) + 2;
2159 if ( ( zSig & 0x7F ) <= 5 ) {
2160 if ( zSig < 2 ) {
2161 zSig = 0x7FFFFFFF;
2162 goto roundAndPack;
2163 }
2164 aSig >>= aExp & 1;
2165 term = ( (bits64) zSig ) * zSig;
2166 rem = ( ( (bits64) aSig )<<32 ) - term;
2167 while ( (sbits64) rem < 0 ) {
2168 --zSig;
2169 rem += ( ( (bits64) zSig )<<1 ) | 1;
2170 }
2171 zSig |= ( rem != 0 );
2172 }
2173 shift32RightJamming( zSig, 1, &zSig );
2174 roundAndPack:
2175 return roundAndPackFloat32( 0, zExp, zSig );
2176
2177 }
2178 #endif /* !SOFTFLOAT_FOR_GCC */
2179
2180 /*
2181 -------------------------------------------------------------------------------
2182 Returns 1 if the single-precision floating-point value `a' is equal to
2183 the corresponding value `b', and 0 otherwise. The comparison is performed
2184 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2185 -------------------------------------------------------------------------------
2186 */
2187 flag float32_eq( float32 a, float32 b )
2188 {
2189
2190 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2191 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2192 ) {
2193 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2194 float_raise( float_flag_invalid );
2195 }
2196 return 0;
2197 }
2198 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2199
2200 }
2201
2202 /*
2203 -------------------------------------------------------------------------------
2204 Returns 1 if the single-precision floating-point value `a' is less than
2205 or equal to the corresponding value `b', and 0 otherwise. The comparison
2206 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2207 Arithmetic.
2208 -------------------------------------------------------------------------------
2209 */
2210 flag float32_le( float32 a, float32 b )
2211 {
2212 flag aSign, bSign;
2213
2214 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2215 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2216 ) {
2217 float_raise( float_flag_invalid );
2218 return 0;
2219 }
2220 aSign = extractFloat32Sign( a );
2221 bSign = extractFloat32Sign( b );
2222 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2223 return ( a == b ) || ( aSign ^ ( a < b ) );
2224
2225 }
2226
2227 /*
2228 -------------------------------------------------------------------------------
2229 Returns 1 if the single-precision floating-point value `a' is less than
2230 the corresponding value `b', and 0 otherwise. The comparison is performed
2231 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2232 -------------------------------------------------------------------------------
2233 */
2234 flag float32_lt( float32 a, float32 b )
2235 {
2236 flag aSign, bSign;
2237
2238 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2239 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2240 ) {
2241 float_raise( float_flag_invalid );
2242 return 0;
2243 }
2244 aSign = extractFloat32Sign( a );
2245 bSign = extractFloat32Sign( b );
2246 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2247 return ( a != b ) && ( aSign ^ ( a < b ) );
2248
2249 }
2250
2251 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2252 /*
2253 -------------------------------------------------------------------------------
2254 Returns 1 if the single-precision floating-point value `a' is equal to
2255 the corresponding value `b', and 0 otherwise. The invalid exception is
2256 raised if either operand is a NaN. Otherwise, the comparison is performed
2257 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2258 -------------------------------------------------------------------------------
2259 */
2260 flag float32_eq_signaling( float32 a, float32 b )
2261 {
2262
2263 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2264 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2265 ) {
2266 float_raise( float_flag_invalid );
2267 return 0;
2268 }
2269 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2270
2271 }
2272
2273 /*
2274 -------------------------------------------------------------------------------
2275 Returns 1 if the single-precision floating-point value `a' is less than or
2276 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2277 cause an exception. Otherwise, the comparison is performed according to the
2278 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2279 -------------------------------------------------------------------------------
2280 */
2281 flag float32_le_quiet( float32 a, float32 b )
2282 {
2283 flag aSign, bSign;
2284
2285 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2286 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2287 ) {
2288 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2289 float_raise( float_flag_invalid );
2290 }
2291 return 0;
2292 }
2293 aSign = extractFloat32Sign( a );
2294 bSign = extractFloat32Sign( b );
2295 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2296 return ( a == b ) || ( aSign ^ ( a < b ) );
2297
2298 }
2299
2300 /*
2301 -------------------------------------------------------------------------------
2302 Returns 1 if the single-precision floating-point value `a' is less than
2303 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2304 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2305 Standard for Binary Floating-Point Arithmetic.
2306 -------------------------------------------------------------------------------
2307 */
2308 flag float32_lt_quiet( float32 a, float32 b )
2309 {
2310 flag aSign, bSign;
2311
2312 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2313 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2314 ) {
2315 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2316 float_raise( float_flag_invalid );
2317 }
2318 return 0;
2319 }
2320 aSign = extractFloat32Sign( a );
2321 bSign = extractFloat32Sign( b );
2322 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2323 return ( a != b ) && ( aSign ^ ( a < b ) );
2324
2325 }
2326 #endif /* !SOFTFLOAT_FOR_GCC */
2327
2328 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2329 /*
2330 -------------------------------------------------------------------------------
2331 Returns the result of converting the double-precision floating-point value
2332 `a' to the 32-bit two's complement integer format. The conversion is
2333 performed according to the IEC/IEEE Standard for Binary Floating-Point
2334 Arithmetic---which means in particular that the conversion is rounded
2335 according to the current rounding mode. If `a' is a NaN, the largest
2336 positive integer is returned. Otherwise, if the conversion overflows, the
2337 largest integer with the same sign as `a' is returned.
2338 -------------------------------------------------------------------------------
2339 */
2340 int32 float64_to_int32( float64 a )
2341 {
2342 flag aSign;
2343 int16 aExp, shiftCount;
2344 bits64 aSig;
2345
2346 aSig = extractFloat64Frac( a );
2347 aExp = extractFloat64Exp( a );
2348 aSign = extractFloat64Sign( a );
2349 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2350 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2351 shiftCount = 0x42C - aExp;
2352 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2353 return roundAndPackInt32( aSign, aSig );
2354
2355 }
2356 #endif /* !SOFTFLOAT_FOR_GCC */
2357
2358 /*
2359 -------------------------------------------------------------------------------
2360 Returns the result of converting the double-precision floating-point value
2361 `a' to the 32-bit two's complement integer format. The conversion is
2362 performed according to the IEC/IEEE Standard for Binary Floating-Point
2363 Arithmetic, except that the conversion is always rounded toward zero.
2364 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2365 the conversion overflows, the largest integer with the same sign as `a' is
2366 returned.
2367 -------------------------------------------------------------------------------
2368 */
2369 int32 float64_to_int32_round_to_zero( float64 a )
2370 {
2371 flag aSign;
2372 int16 aExp, shiftCount;
2373 bits64 aSig, savedASig;
2374 int32 z;
2375
2376 aSig = extractFloat64Frac( a );
2377 aExp = extractFloat64Exp( a );
2378 aSign = extractFloat64Sign( a );
2379 if ( 0x41E < aExp ) {
2380 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2381 goto invalid;
2382 }
2383 else if ( aExp < 0x3FF ) {
2384 if ( aExp || aSig ) set_float_exception_inexact_flag();
2385 return 0;
2386 }
2387 aSig |= LIT64( 0x0010000000000000 );
2388 shiftCount = 0x433 - aExp;
2389 savedASig = aSig;
2390 aSig >>= shiftCount;
2391 z = (int32)aSig;
2392 if ( aSign ) z = - z;
2393 if ( ( z < 0 ) ^ aSign ) {
2394 invalid:
2395 float_raise( float_flag_invalid );
2396 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2397 }
2398 if ( ( aSig<<shiftCount ) != savedASig ) {
2399 set_float_exception_inexact_flag();
2400 }
2401 return z;
2402
2403 }
2404
2405 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2406 /*
2407 -------------------------------------------------------------------------------
2408 Returns the result of converting the double-precision floating-point value
2409 `a' to the 64-bit two's complement integer format. The conversion is
2410 performed according to the IEC/IEEE Standard for Binary Floating-Point
2411 Arithmetic---which means in particular that the conversion is rounded
2412 according to the current rounding mode. If `a' is a NaN, the largest
2413 positive integer is returned. Otherwise, if the conversion overflows, the
2414 largest integer with the same sign as `a' is returned.
2415 -------------------------------------------------------------------------------
2416 */
2417 int64 float64_to_int64( float64 a )
2418 {
2419 flag aSign;
2420 int16 aExp, shiftCount;
2421 bits64 aSig, aSigExtra;
2422
2423 aSig = extractFloat64Frac( a );
2424 aExp = extractFloat64Exp( a );
2425 aSign = extractFloat64Sign( a );
2426 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2427 shiftCount = 0x433 - aExp;
2428 if ( shiftCount <= 0 ) {
2429 if ( 0x43E < aExp ) {
2430 float_raise( float_flag_invalid );
2431 if ( ! aSign
2432 || ( ( aExp == 0x7FF )
2433 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2434 ) {
2435 return LIT64( 0x7FFFFFFFFFFFFFFF );
2436 }
2437 return (sbits64) LIT64( 0x8000000000000000 );
2438 }
2439 aSigExtra = 0;
2440 aSig <<= - shiftCount;
2441 }
2442 else {
2443 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2444 }
2445 return roundAndPackInt64( aSign, aSig, aSigExtra );
2446
2447 }
2448
2449 /*
2450 -------------------------------------------------------------------------------
2451 Returns the result of converting the double-precision floating-point value
2452 `a' to the 64-bit two's complement integer format. The conversion is
2453 performed according to the IEC/IEEE Standard for Binary Floating-Point
2454 Arithmetic, except that the conversion is always rounded toward zero.
2455 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2456 the conversion overflows, the largest integer with the same sign as `a' is
2457 returned.
2458 -------------------------------------------------------------------------------
2459 */
2460 int64 float64_to_int64_round_to_zero( float64 a )
2461 {
2462 flag aSign;
2463 int16 aExp, shiftCount;
2464 bits64 aSig;
2465 int64 z;
2466
2467 aSig = extractFloat64Frac( a );
2468 aExp = extractFloat64Exp( a );
2469 aSign = extractFloat64Sign( a );
2470 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2471 shiftCount = aExp - 0x433;
2472 if ( 0 <= shiftCount ) {
2473 if ( 0x43E <= aExp ) {
2474 if ( a != LIT64( 0xC3E0000000000000 ) ) {
2475 float_raise( float_flag_invalid );
2476 if ( ! aSign
2477 || ( ( aExp == 0x7FF )
2478 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2479 ) {
2480 return LIT64( 0x7FFFFFFFFFFFFFFF );
2481 }
2482 }
2483 return (sbits64) LIT64( 0x8000000000000000 );
2484 }
2485 z = aSig<<shiftCount;
2486 }
2487 else {
2488 if ( aExp < 0x3FE ) {
2489 if ( aExp | aSig ) set_float_exception_inexact_flag();
2490 return 0;
2491 }
2492 z = aSig>>( - shiftCount );
2493 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2494 set_float_exception_inexact_flag();
2495 }
2496 }
2497 if ( aSign ) z = - z;
2498 return z;
2499
2500 }
2501 #endif /* !SOFTFLOAT_FOR_GCC */
2502
2503 /*
2504 -------------------------------------------------------------------------------
2505 Returns the result of converting the double-precision floating-point value
2506 `a' to the single-precision floating-point format. The conversion is
2507 performed according to the IEC/IEEE Standard for Binary Floating-Point
2508 Arithmetic.
2509 -------------------------------------------------------------------------------
2510 */
2511 float32 float64_to_float32( float64 a )
2512 {
2513 flag aSign;
2514 int16 aExp;
2515 bits64 aSig;
2516 bits32 zSig;
2517
2518 aSig = extractFloat64Frac( a );
2519 aExp = extractFloat64Exp( a );
2520 aSign = extractFloat64Sign( a );
2521 if ( aExp == 0x7FF ) {
2522 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2523 return packFloat32( aSign, 0xFF, 0 );
2524 }
2525 shift64RightJamming( aSig, 22, &aSig );
2526 zSig = (bits32)aSig;
2527 if ( aExp || zSig ) {
2528 zSig |= 0x40000000;
2529 aExp -= 0x381;
2530 }
2531 return roundAndPackFloat32( aSign, aExp, zSig );
2532
2533 }
2534
2535 #ifdef FLOATX80
2536
2537 /*
2538 -------------------------------------------------------------------------------
2539 Returns the result of converting the double-precision floating-point value
2540 `a' to the extended double-precision floating-point format. The conversion
2541 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2542 Arithmetic.
2543 -------------------------------------------------------------------------------
2544 */
2545 floatx80 float64_to_floatx80( float64 a )
2546 {
2547 flag aSign;
2548 int16 aExp;
2549 bits64 aSig;
2550
2551 aSig = extractFloat64Frac( a );
2552 aExp = extractFloat64Exp( a );
2553 aSign = extractFloat64Sign( a );
2554 if ( aExp == 0x7FF ) {
2555 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2556 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2557 }
2558 if ( aExp == 0 ) {
2559 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2560 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2561 }
2562 return
2563 packFloatx80(
2564 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2565
2566 }
2567
2568 #endif
2569
2570 #ifdef FLOAT128
2571
2572 /*
2573 -------------------------------------------------------------------------------
2574 Returns the result of converting the double-precision floating-point value
2575 `a' to the quadruple-precision floating-point format. The conversion is
2576 performed according to the IEC/IEEE Standard for Binary Floating-Point
2577 Arithmetic.
2578 -------------------------------------------------------------------------------
2579 */
2580 float128 float64_to_float128( float64 a )
2581 {
2582 flag aSign;
2583 int16 aExp;
2584 bits64 aSig, zSig0, zSig1;
2585
2586 aSig = extractFloat64Frac( a );
2587 aExp = extractFloat64Exp( a );
2588 aSign = extractFloat64Sign( a );
2589 if ( aExp == 0x7FF ) {
2590 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2591 return packFloat128( aSign, 0x7FFF, 0, 0 );
2592 }
2593 if ( aExp == 0 ) {
2594 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2595 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2596 --aExp;
2597 }
2598 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2599 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2600
2601 }
2602
2603 #endif
2604
2605 #ifndef SOFTFLOAT_FOR_GCC
2606 /*
2607 -------------------------------------------------------------------------------
2608 Rounds the double-precision floating-point value `a' to an integer, and
2609 returns the result as a double-precision floating-point value. The
2610 operation is performed according to the IEC/IEEE Standard for Binary
2611 Floating-Point Arithmetic.
2612 -------------------------------------------------------------------------------
2613 */
2614 float64 float64_round_to_int( float64 a )
2615 {
2616 flag aSign;
2617 int16 aExp;
2618 bits64 lastBitMask, roundBitsMask;
2619 int8 roundingMode;
2620 float64 z;
2621
2622 aExp = extractFloat64Exp( a );
2623 if ( 0x433 <= aExp ) {
2624 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2625 return propagateFloat64NaN( a, a );
2626 }
2627 return a;
2628 }
2629 if ( aExp < 0x3FF ) {
2630 if ( (bits64) ( a<<1 ) == 0 ) return a;
2631 set_float_exception_inexact_flag();
2632 aSign = extractFloat64Sign( a );
2633 switch ( float_rounding_mode ) {
2634 case float_round_nearest_even:
2635 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2636 return packFloat64( aSign, 0x3FF, 0 );
2637 }
2638 break;
2639 case float_round_to_zero:
2640 break;
2641 case float_round_down:
2642 return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2643 case float_round_up:
2644 return
2645 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2646 }
2647 return packFloat64( aSign, 0, 0 );
2648 }
2649 lastBitMask = 1;
2650 lastBitMask <<= 0x433 - aExp;
2651 roundBitsMask = lastBitMask - 1;
2652 z = a;
2653 roundingMode = float_rounding_mode;
2654 if ( roundingMode == float_round_nearest_even ) {
2655 z += lastBitMask>>1;
2656 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2657 }
2658 else if ( roundingMode != float_round_to_zero ) {
2659 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2660 z += roundBitsMask;
2661 }
2662 }
2663 z &= ~ roundBitsMask;
2664 if ( z != a ) set_float_exception_inexact_flag();
2665 return z;
2666
2667 }
2668 #endif
2669
2670 /*
2671 -------------------------------------------------------------------------------
2672 Returns the result of adding the absolute values of the double-precision
2673 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2674 before being returned. `zSign' is ignored if the result is a NaN.
2675 The addition is performed according to the IEC/IEEE Standard for Binary
2676 Floating-Point Arithmetic.
2677 -------------------------------------------------------------------------------
2678 */
2679 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2680 {
2681 int16 aExp, bExp, zExp;
2682 bits64 aSig, bSig, zSig;
2683 int16 expDiff;
2684
2685 aSig = extractFloat64Frac( a );
2686 aExp = extractFloat64Exp( a );
2687 bSig = extractFloat64Frac( b );
2688 bExp = extractFloat64Exp( b );
2689 expDiff = aExp - bExp;
2690 aSig <<= 9;
2691 bSig <<= 9;
2692 if ( 0 < expDiff ) {
2693 if ( aExp == 0x7FF ) {
2694 if ( aSig ) return propagateFloat64NaN( a, b );
2695 return a;
2696 }
2697 if ( bExp == 0 ) {
2698 --expDiff;
2699 }
2700 else {
2701 bSig |= LIT64( 0x2000000000000000 );
2702 }
2703 shift64RightJamming( bSig, expDiff, &bSig );
2704 zExp = aExp;
2705 }
2706 else if ( expDiff < 0 ) {
2707 if ( bExp == 0x7FF ) {
2708 if ( bSig ) return propagateFloat64NaN( a, b );
2709 return packFloat64( zSign, 0x7FF, 0 );
2710 }
2711 if ( aExp == 0 ) {
2712 ++expDiff;
2713 }
2714 else {
2715 aSig |= LIT64( 0x2000000000000000 );
2716 }
2717 shift64RightJamming( aSig, - expDiff, &aSig );
2718 zExp = bExp;
2719 }
2720 else {
2721 if ( aExp == 0x7FF ) {
2722 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2723 return a;
2724 }
2725 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2726 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2727 zExp = aExp;
2728 goto roundAndPack;
2729 }
2730 aSig |= LIT64( 0x2000000000000000 );
2731 zSig = ( aSig + bSig )<<1;
2732 --zExp;
2733 if ( (sbits64) zSig < 0 ) {
2734 zSig = aSig + bSig;
2735 ++zExp;
2736 }
2737 roundAndPack:
2738 return roundAndPackFloat64( zSign, zExp, zSig );
2739
2740 }
2741
2742 /*
2743 -------------------------------------------------------------------------------
2744 Returns the result of subtracting the absolute values of the double-
2745 precision floating-point values `a' and `b'. If `zSign' is 1, the
2746 difference is negated before being returned. `zSign' is ignored if the
2747 result is a NaN. The subtraction is performed according to the IEC/IEEE
2748 Standard for Binary Floating-Point Arithmetic.
2749 -------------------------------------------------------------------------------
2750 */
2751 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2752 {
2753 int16 aExp, bExp, zExp;
2754 bits64 aSig, bSig, zSig;
2755 int16 expDiff;
2756
2757 aSig = extractFloat64Frac( a );
2758 aExp = extractFloat64Exp( a );
2759 bSig = extractFloat64Frac( b );
2760 bExp = extractFloat64Exp( b );
2761 expDiff = aExp - bExp;
2762 aSig <<= 10;
2763 bSig <<= 10;
2764 if ( 0 < expDiff ) goto aExpBigger;
2765 if ( expDiff < 0 ) goto bExpBigger;
2766 if ( aExp == 0x7FF ) {
2767 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2768 float_raise( float_flag_invalid );
2769 return float64_default_nan;
2770 }
2771 if ( aExp == 0 ) {
2772 aExp = 1;
2773 bExp = 1;
2774 }
2775 if ( bSig < aSig ) goto aBigger;
2776 if ( aSig < bSig ) goto bBigger;
2777 return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2778 bExpBigger:
2779 if ( bExp == 0x7FF ) {
2780 if ( bSig ) return propagateFloat64NaN( a, b );
2781 return packFloat64( zSign ^ 1, 0x7FF, 0 );
2782 }
2783 if ( aExp == 0 ) {
2784 ++expDiff;
2785 }
2786 else {
2787 aSig |= LIT64( 0x4000000000000000 );
2788 }
2789 shift64RightJamming( aSig, - expDiff, &aSig );
2790 bSig |= LIT64( 0x4000000000000000 );
2791 bBigger:
2792 zSig = bSig - aSig;
2793 zExp = bExp;
2794 zSign ^= 1;
2795 goto normalizeRoundAndPack;
2796 aExpBigger:
2797 if ( aExp == 0x7FF ) {
2798 if ( aSig ) return propagateFloat64NaN( a, b );
2799 return a;
2800 }
2801 if ( bExp == 0 ) {
2802 --expDiff;
2803 }
2804 else {
2805 bSig |= LIT64( 0x4000000000000000 );
2806 }
2807 shift64RightJamming( bSig, expDiff, &bSig );
2808 aSig |= LIT64( 0x4000000000000000 );
2809 aBigger:
2810 zSig = aSig - bSig;
2811 zExp = aExp;
2812 normalizeRoundAndPack:
2813 --zExp;
2814 return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2815
2816 }
2817
2818 /*
2819 -------------------------------------------------------------------------------
2820 Returns the result of adding the double-precision floating-point values `a'
2821 and `b'. The operation is performed according to the IEC/IEEE Standard for
2822 Binary Floating-Point Arithmetic.
2823 -------------------------------------------------------------------------------
2824 */
2825 float64 float64_add( float64 a, float64 b )
2826 {
2827 flag aSign, bSign;
2828
2829 aSign = extractFloat64Sign( a );
2830 bSign = extractFloat64Sign( b );
2831 if ( aSign == bSign ) {
2832 return addFloat64Sigs( a, b, aSign );
2833 }
2834 else {
2835 return subFloat64Sigs( a, b, aSign );
2836 }
2837
2838 }
2839
2840 /*
2841 -------------------------------------------------------------------------------
2842 Returns the result of subtracting the double-precision floating-point values
2843 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2844 for Binary Floating-Point Arithmetic.
2845 -------------------------------------------------------------------------------
2846 */
2847 float64 float64_sub( float64 a, float64 b )
2848 {
2849 flag aSign, bSign;
2850
2851 aSign = extractFloat64Sign( a );
2852 bSign = extractFloat64Sign( b );
2853 if ( aSign == bSign ) {
2854 return subFloat64Sigs( a, b, aSign );
2855 }
2856 else {
2857 return addFloat64Sigs( a, b, aSign );
2858 }
2859
2860 }
2861
2862 /*
2863 -------------------------------------------------------------------------------
2864 Returns the result of multiplying the double-precision floating-point values
2865 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2866 for Binary Floating-Point Arithmetic.
2867 -------------------------------------------------------------------------------
2868 */
2869 float64 float64_mul( float64 a, float64 b )
2870 {
2871 flag aSign, bSign, zSign;
2872 int16 aExp, bExp, zExp;
2873 bits64 aSig, bSig, zSig0, zSig1;
2874
2875 aSig = extractFloat64Frac( a );
2876 aExp = extractFloat64Exp( a );
2877 aSign = extractFloat64Sign( a );
2878 bSig = extractFloat64Frac( b );
2879 bExp = extractFloat64Exp( b );
2880 bSign = extractFloat64Sign( b );
2881 zSign = aSign ^ bSign;
2882 if ( aExp == 0x7FF ) {
2883 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2884 return propagateFloat64NaN( a, b );
2885 }
2886 if ( ( bExp | bSig ) == 0 ) {
2887 float_raise( float_flag_invalid );
2888 return float64_default_nan;
2889 }
2890 return packFloat64( zSign, 0x7FF, 0 );
2891 }
2892 if ( bExp == 0x7FF ) {
2893 if ( bSig ) return propagateFloat64NaN( a, b );
2894 if ( ( aExp | aSig ) == 0 ) {
2895 float_raise( float_flag_invalid );
2896 return float64_default_nan;
2897 }
2898 return packFloat64( zSign, 0x7FF, 0 );
2899 }
2900 if ( aExp == 0 ) {
2901 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2902 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2903 }
2904 if ( bExp == 0 ) {
2905 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2906 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2907 }
2908 zExp = aExp + bExp - 0x3FF;
2909 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2910 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2911 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2912 zSig0 |= ( zSig1 != 0 );
2913 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2914 zSig0 <<= 1;
2915 --zExp;
2916 }
2917 return roundAndPackFloat64( zSign, zExp, zSig0 );
2918
2919 }
2920
2921 /*
2922 -------------------------------------------------------------------------------
2923 Returns the result of dividing the double-precision floating-point value `a'
2924 by the corresponding value `b'. The operation is performed according to
2925 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2926 -------------------------------------------------------------------------------
2927 */
2928 float64 float64_div( float64 a, float64 b )
2929 {
2930 flag aSign, bSign, zSign;
2931 int16 aExp, bExp, zExp;
2932 bits64 aSig, bSig, zSig;
2933 bits64 rem0, rem1;
2934 bits64 term0, term1;
2935
2936 aSig = extractFloat64Frac( a );
2937 aExp = extractFloat64Exp( a );
2938 aSign = extractFloat64Sign( a );
2939 bSig = extractFloat64Frac( b );
2940 bExp = extractFloat64Exp( b );
2941 bSign = extractFloat64Sign( b );
2942 zSign = aSign ^ bSign;
2943 if ( aExp == 0x7FF ) {
2944 if ( aSig ) return propagateFloat64NaN( a, b );
2945 if ( bExp == 0x7FF ) {
2946 if ( bSig ) return propagateFloat64NaN( a, b );
2947 float_raise( float_flag_invalid );
2948 return float64_default_nan;
2949 }
2950 return packFloat64( zSign, 0x7FF, 0 );
2951 }
2952 if ( bExp == 0x7FF ) {
2953 if ( bSig ) return propagateFloat64NaN( a, b );
2954 return packFloat64( zSign, 0, 0 );
2955 }
2956 if ( bExp == 0 ) {
2957 if ( bSig == 0 ) {
2958 if ( ( aExp | aSig ) == 0 ) {
2959 float_raise( float_flag_invalid );
2960 return float64_default_nan;
2961 }
2962 float_raise( float_flag_divbyzero );
2963 return packFloat64( zSign, 0x7FF, 0 );
2964 }
2965 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2966 }
2967 if ( aExp == 0 ) {
2968 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2969 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2970 }
2971 zExp = aExp - bExp + 0x3FD;
2972 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2973 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2974 if ( bSig <= ( aSig + aSig ) ) {
2975 aSig >>= 1;
2976 ++zExp;
2977 }
2978 zSig = estimateDiv128To64( aSig, 0, bSig );
2979 if ( ( zSig & 0x1FF ) <= 2 ) {
2980 mul64To128( bSig, zSig, &term0, &term1 );
2981 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2982 while ( (sbits64) rem0 < 0 ) {
2983 --zSig;
2984 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2985 }
2986 zSig |= ( rem1 != 0 );
2987 }
2988 return roundAndPackFloat64( zSign, zExp, zSig );
2989
2990 }
2991
2992 #ifndef SOFTFLOAT_FOR_GCC
2993 /*
2994 -------------------------------------------------------------------------------
2995 Returns the remainder of the double-precision floating-point value `a'
2996 with respect to the corresponding value `b'. The operation is performed
2997 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2998 -------------------------------------------------------------------------------
2999 */
3000 float64 float64_rem( float64 a, float64 b )
3001 {
3002 flag aSign, bSign, zSign;
3003 int16 aExp, bExp, expDiff;
3004 bits64 aSig, bSig;
3005 bits64 q, alternateASig;
3006 sbits64 sigMean;
3007
3008 aSig = extractFloat64Frac( a );
3009 aExp = extractFloat64Exp( a );
3010 aSign = extractFloat64Sign( a );
3011 bSig = extractFloat64Frac( b );
3012 bExp = extractFloat64Exp( b );
3013 bSign = extractFloat64Sign( b );
3014 if ( aExp == 0x7FF ) {
3015 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3016 return propagateFloat64NaN( a, b );
3017 }
3018 float_raise( float_flag_invalid );
3019 return float64_default_nan;
3020 }
3021 if ( bExp == 0x7FF ) {
3022 if ( bSig ) return propagateFloat64NaN( a, b );
3023 return a;
3024 }
3025 if ( bExp == 0 ) {
3026 if ( bSig == 0 ) {
3027 float_raise( float_flag_invalid );
3028 return float64_default_nan;
3029 }
3030 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3031 }
3032 if ( aExp == 0 ) {
3033 if ( aSig == 0 ) return a;
3034 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3035 }
3036 expDiff = aExp - bExp;
3037 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3038 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3039 if ( expDiff < 0 ) {
3040 if ( expDiff < -1 ) return a;
3041 aSig >>= 1;
3042 }
3043 q = ( bSig <= aSig );
3044 if ( q ) aSig -= bSig;
3045 expDiff -= 64;
3046 while ( 0 < expDiff ) {
3047 q = estimateDiv128To64( aSig, 0, bSig );
3048 q = ( 2 < q ) ? q - 2 : 0;
3049 aSig = - ( ( bSig>>2 ) * q );
3050 expDiff -= 62;
3051 }
3052 expDiff += 64;
3053 if ( 0 < expDiff ) {
3054 q = estimateDiv128To64( aSig, 0, bSig );
3055 q = ( 2 < q ) ? q - 2 : 0;
3056 q >>= 64 - expDiff;
3057 bSig >>= 2;
3058 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3059 }
3060 else {
3061 aSig >>= 2;
3062 bSig >>= 2;
3063 }
3064 do {
3065 alternateASig = aSig;
3066 ++q;
3067 aSig -= bSig;
3068 } while ( 0 <= (sbits64) aSig );
3069 sigMean = aSig + alternateASig;
3070 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3071 aSig = alternateASig;
3072 }
3073 zSign = ( (sbits64) aSig < 0 );
3074 if ( zSign ) aSig = - aSig;
3075 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3076
3077 }
3078
3079 /*
3080 -------------------------------------------------------------------------------
3081 Returns the square root of the double-precision floating-point value `a'.
3082 The operation is performed according to the IEC/IEEE Standard for Binary
3083 Floating-Point Arithmetic.
3084 -------------------------------------------------------------------------------
3085 */
3086 float64 float64_sqrt( float64 a )
3087 {
3088 flag aSign;
3089 int16 aExp, zExp;
3090 bits64 aSig, zSig, doubleZSig;
3091 bits64 rem0, rem1, term0, term1;
3092
3093 aSig = extractFloat64Frac( a );
3094 aExp = extractFloat64Exp( a );
3095 aSign = extractFloat64Sign( a );
3096 if ( aExp == 0x7FF ) {
3097 if ( aSig ) return propagateFloat64NaN( a, a );
3098 if ( ! aSign ) return a;
3099 float_raise( float_flag_invalid );
3100 return float64_default_nan;
3101 }
3102 if ( aSign ) {
3103 if ( ( aExp | aSig ) == 0 ) return a;
3104 float_raise( float_flag_invalid );
3105 return float64_default_nan;
3106 }
3107 if ( aExp == 0 ) {
3108 if ( aSig == 0 ) return 0;
3109 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3110 }
3111 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3112 aSig |= LIT64( 0x0010000000000000 );
3113 zSig = estimateSqrt32( aExp, aSig>>21 );
3114 aSig <<= 9 - ( aExp & 1 );
3115 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3116 if ( ( zSig & 0x1FF ) <= 5 ) {
3117 doubleZSig = zSig<<1;
3118 mul64To128( zSig, zSig, &term0, &term1 );
3119 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3120 while ( (sbits64) rem0 < 0 ) {
3121 --zSig;
3122 doubleZSig -= 2;
3123 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3124 }
3125 zSig |= ( ( rem0 | rem1 ) != 0 );
3126 }
3127 return roundAndPackFloat64( 0, zExp, zSig );
3128
3129 }
3130 #endif
3131
3132 /*
3133 -------------------------------------------------------------------------------
3134 Returns 1 if the double-precision floating-point value `a' is equal to the
3135 corresponding value `b', and 0 otherwise. The comparison is performed
3136 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3137 -------------------------------------------------------------------------------
3138 */
3139 flag float64_eq( float64 a, float64 b )
3140 {
3141
3142 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3143 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3144 ) {
3145 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3146 float_raise( float_flag_invalid );
3147 }
3148 return 0;
3149 }
3150 return ( a == b ) ||
3151 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3152
3153 }
3154
3155 /*
3156 -------------------------------------------------------------------------------
3157 Returns 1 if the double-precision floating-point value `a' is less than or
3158 equal to the corresponding value `b', and 0 otherwise. The comparison is
3159 performed according to the IEC/IEEE Standard for Binary Floating-Point
3160 Arithmetic.
3161 -------------------------------------------------------------------------------
3162 */
3163 flag float64_le( float64 a, float64 b )
3164 {
3165 flag aSign, bSign;
3166
3167 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3168 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3169 ) {
3170 float_raise( float_flag_invalid );
3171 return 0;
3172 }
3173 aSign = extractFloat64Sign( a );
3174 bSign = extractFloat64Sign( b );
3175 if ( aSign != bSign )
3176 return aSign ||
3177 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3178 0 );
3179 return ( a == b ) ||
3180 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3181
3182 }
3183
3184 /*
3185 -------------------------------------------------------------------------------
3186 Returns 1 if the double-precision floating-point value `a' is less than
3187 the corresponding value `b', and 0 otherwise. The comparison is performed
3188 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3189 -------------------------------------------------------------------------------
3190 */
3191 flag float64_lt( float64 a, float64 b )
3192 {
3193 flag aSign, bSign;
3194
3195 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3196 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3197 ) {
3198 float_raise( float_flag_invalid );
3199 return 0;
3200 }
3201 aSign = extractFloat64Sign( a );
3202 bSign = extractFloat64Sign( b );
3203 if ( aSign != bSign )
3204 return aSign &&
3205 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3206 0 );
3207 return ( a != b ) &&
3208 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3209
3210 }
3211
3212 #ifndef SOFTFLOAT_FOR_GCC
3213 /*
3214 -------------------------------------------------------------------------------
3215 Returns 1 if the double-precision floating-point value `a' is equal to the
3216 corresponding value `b', and 0 otherwise. The invalid exception is raised
3217 if either operand is a NaN. Otherwise, the comparison is performed
3218 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3219 -------------------------------------------------------------------------------
3220 */
3221 flag float64_eq_signaling( float64 a, float64 b )
3222 {
3223
3224 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3225 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3226 ) {
3227 float_raise( float_flag_invalid );
3228 return 0;
3229 }
3230 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3231
3232 }
3233
3234 /*
3235 -------------------------------------------------------------------------------
3236 Returns 1 if the double-precision floating-point value `a' is less than or
3237 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3238 cause an exception. Otherwise, the comparison is performed according to the
3239 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3240 -------------------------------------------------------------------------------
3241 */
3242 flag float64_le_quiet( float64 a, float64 b )
3243 {
3244 flag aSign, bSign;
3245
3246 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3247 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3248 ) {
3249 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3250 float_raise( float_flag_invalid );
3251 }
3252 return 0;
3253 }
3254 aSign = extractFloat64Sign( a );
3255 bSign = extractFloat64Sign( b );
3256 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3257 return ( a == b ) || ( aSign ^ ( a < b ) );
3258
3259 }
3260
3261 /*
3262 -------------------------------------------------------------------------------
3263 Returns 1 if the double-precision floating-point value `a' is less than
3264 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3265 exception. Otherwise, the comparison is performed according to the IEC/IEEE
3266 Standard for Binary Floating-Point Arithmetic.
3267 -------------------------------------------------------------------------------
3268 */
3269 flag float64_lt_quiet( float64 a, float64 b )
3270 {
3271 flag aSign, bSign;
3272
3273 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3274 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3275 ) {
3276 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3277 float_raise( float_flag_invalid );
3278 }
3279 return 0;
3280 }
3281 aSign = extractFloat64Sign( a );
3282 bSign = extractFloat64Sign( b );
3283 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3284 return ( a != b ) && ( aSign ^ ( a < b ) );
3285
3286 }
3287 #endif
3288
3289 #ifdef FLOATX80
3290
3291 /*
3292 -------------------------------------------------------------------------------
3293 Returns the result of converting the extended double-precision floating-
3294 point value `a' to the 32-bit two's complement integer format. The
3295 conversion is performed according to the IEC/IEEE Standard for Binary
3296 Floating-Point Arithmetic---which means in particular that the conversion
3297 is rounded according to the current rounding mode. If `a' is a NaN, the
3298 largest positive integer is returned. Otherwise, if the conversion
3299 overflows, the largest integer with the same sign as `a' is returned.
3300 -------------------------------------------------------------------------------
3301 */
3302 int32 floatx80_to_int32( floatx80 a )
3303 {
3304 flag aSign;
3305 int32 aExp, shiftCount;
3306 bits64 aSig;
3307
3308 aSig = extractFloatx80Frac( a );
3309 aExp = extractFloatx80Exp( a );
3310 aSign = extractFloatx80Sign( a );
3311 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3312 shiftCount = 0x4037 - aExp;
3313 if ( shiftCount <= 0 ) shiftCount = 1;
3314 shift64RightJamming( aSig, shiftCount, &aSig );
3315 return roundAndPackInt32( aSign, aSig );
3316
3317 }
3318
3319 /*
3320 -------------------------------------------------------------------------------
3321 Returns the result of converting the extended double-precision floating-
3322 point value `a' to the 32-bit two's complement integer format. The
3323 conversion is performed according to the IEC/IEEE Standard for Binary
3324 Floating-Point Arithmetic, except that the conversion is always rounded
3325 toward zero. If `a' is a NaN, the largest positive integer is returned.
3326 Otherwise, if the conversion overflows, the largest integer with the same
3327 sign as `a' is returned.
3328 -------------------------------------------------------------------------------
3329 */
3330 int32 floatx80_to_int32_round_to_zero( floatx80 a )
3331 {
3332 flag aSign;
3333 int32 aExp, shiftCount;
3334 bits64 aSig, savedASig;
3335 int32 z;
3336
3337 aSig = extractFloatx80Frac( a );
3338 aExp = extractFloatx80Exp( a );
3339 aSign = extractFloatx80Sign( a );
3340 if ( 0x401E < aExp ) {
3341 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3342 goto invalid;
3343 }
3344 else if ( aExp < 0x3FFF ) {
3345 if ( aExp || aSig ) set_float_exception_inexact_flag();
3346 return 0;
3347 }
3348 shiftCount = 0x403E - aExp;
3349 savedASig = aSig;
3350 aSig >>= shiftCount;
3351 z = aSig;
3352 if ( aSign ) z = - z;
3353 if ( ( z < 0 ) ^ aSign ) {
3354 invalid:
3355 float_raise( float_flag_invalid );
3356 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3357 }
3358 if ( ( aSig<<shiftCount ) != savedASig ) {
3359 set_float_exception_inexact_flag();
3360 }
3361 return z;
3362
3363 }
3364
3365 /*
3366 -------------------------------------------------------------------------------
3367 Returns the result of converting the extended double-precision floating-
3368 point value `a' to the 64-bit two's complement integer format. The
3369 conversion is performed according to the IEC/IEEE Standard for Binary
3370 Floating-Point Arithmetic---which means in particular that the conversion
3371 is rounded according to the current rounding mode. If `a' is a NaN,
3372 the largest positive integer is returned. Otherwise, if the conversion
3373 overflows, the largest integer with the same sign as `a' is returned.
3374 -------------------------------------------------------------------------------
3375 */
3376 int64 floatx80_to_int64( floatx80 a )
3377 {
3378 flag aSign;
3379 int32 aExp, shiftCount;
3380 bits64 aSig, aSigExtra;
3381
3382 aSig = extractFloatx80Frac( a );
3383 aExp = extractFloatx80Exp( a );
3384 aSign = extractFloatx80Sign( a );
3385 shiftCount = 0x403E - aExp;
3386 if ( shiftCount <= 0 ) {
3387 if ( shiftCount ) {
3388 float_raise( float_flag_invalid );
3389 if ( ! aSign
3390 || ( ( aExp == 0x7FFF )
3391 && ( aSig != LIT64( 0x8000000000000000 ) ) )
3392 ) {
3393 return LIT64( 0x7FFFFFFFFFFFFFFF );
3394 }
3395 return (sbits64) LIT64( 0x8000000000000000 );
3396 }
3397 aSigExtra = 0;
3398 }
3399 else {
3400 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3401 }
3402 return roundAndPackInt64( aSign, aSig, aSigExtra );
3403
3404 }
3405
3406 /*
3407 -------------------------------------------------------------------------------
3408 Returns the result of converting the extended double-precision floating-
3409 point value `a' to the 64-bit two's complement integer format. The
3410 conversion is performed according to the IEC/IEEE Standard for Binary
3411 Floating-Point Arithmetic, except that the conversion is always rounded
3412 toward zero. If `a' is a NaN, the largest positive integer is returned.
3413 Otherwise, if the conversion overflows, the largest integer with the same
3414 sign as `a' is returned.
3415 -------------------------------------------------------------------------------
3416 */
3417 int64 floatx80_to_int64_round_to_zero( floatx80 a )
3418 {
3419 flag aSign;
3420 int32 aExp, shiftCount;
3421 bits64 aSig;
3422 int64 z;
3423
3424 aSig = extractFloatx80Frac( a );
3425 aExp = extractFloatx80Exp( a );
3426 aSign = extractFloatx80Sign( a );
3427 shiftCount = aExp - 0x403E;
3428 if ( 0 <= shiftCount ) {
3429 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3430 if ( ( (a.high>>X80SHIFT) != 0xC03E ) || aSig ) {
3431 float_raise( float_flag_invalid );
3432 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3433 return LIT64( 0x7FFFFFFFFFFFFFFF );
3434 }
3435 }
3436 return (sbits64) LIT64( 0x8000000000000000 );
3437 }
3438 else if ( aExp < 0x3FFF ) {
3439 if ( aExp | aSig ) set_float_exception_inexact_flag();
3440 return 0;
3441 }
3442 z = aSig>>( - shiftCount );
3443 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3444 set_float_exception_inexact_flag();
3445 }
3446 if ( aSign ) z = - z;
3447 return z;
3448
3449 }
3450
3451 /*
3452 -------------------------------------------------------------------------------
3453 Returns the result of converting the extended double-precision floating-
3454 point value `a' to the single-precision floating-point format. The
3455 conversion is performed according to the IEC/IEEE Standard for Binary
3456 Floating-Point Arithmetic.
3457 -------------------------------------------------------------------------------
3458 */
3459 float32 floatx80_to_float32( floatx80 a )
3460 {
3461 flag aSign;
3462 int32 aExp;
3463 bits64 aSig;
3464
3465 aSig = extractFloatx80Frac( a );
3466 aExp = extractFloatx80Exp( a );
3467 aSign = extractFloatx80Sign( a );
3468 if ( aExp == 0x7FFF ) {
3469 if ( (bits64) ( aSig<<1 ) ) {
3470 return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3471 }
3472 return packFloat32( aSign, 0xFF, 0 );
3473 }
3474 shift64RightJamming( aSig, 33, &aSig );
3475 if ( aExp || aSig ) aExp -= 0x3F81;
3476 return roundAndPackFloat32( aSign, aExp, aSig );
3477
3478 }
3479
3480 /*
3481 -------------------------------------------------------------------------------
3482 Returns the result of converting the extended double-precision floating-
3483 point value `a' to the double-precision floating-point format. The
3484 conversion is performed according to the IEC/IEEE Standard for Binary
3485 Floating-Point Arithmetic.
3486 -------------------------------------------------------------------------------
3487 */
3488 float64 floatx80_to_float64( floatx80 a )
3489 {
3490 flag aSign;
3491 int32 aExp;
3492 bits64 aSig, zSig;
3493
3494 aSig = extractFloatx80Frac( a );
3495 aExp = extractFloatx80Exp( a );
3496 aSign = extractFloatx80Sign( a );
3497 if ( aExp == 0x7FFF ) {
3498 if ( (bits64) ( aSig<<1 ) ) {
3499 return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3500 }
3501 return packFloat64( aSign, 0x7FF, 0 );
3502 }
3503 shift64RightJamming( aSig, 1, &zSig );
3504 if ( aExp || aSig ) aExp -= 0x3C01;
3505 return roundAndPackFloat64( aSign, aExp, zSig );
3506
3507 }
3508
3509 #ifdef FLOAT128
3510
3511 /*
3512 -------------------------------------------------------------------------------
3513 Returns the result of converting the extended double-precision floating-
3514 point value `a' to the quadruple-precision floating-point format. The
3515 conversion is performed according to the IEC/IEEE Standard for Binary
3516 Floating-Point Arithmetic.
3517 -------------------------------------------------------------------------------
3518 */
3519 float128 floatx80_to_float128( floatx80 a )
3520 {
3521 flag aSign;
3522 int16 aExp;
3523 bits64 aSig, zSig0, zSig1;
3524
3525 aSig = extractFloatx80Frac( a );
3526 aExp = extractFloatx80Exp( a );
3527 aSign = extractFloatx80Sign( a );
3528 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3529 return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3530 }
3531 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3532 return packFloat128( aSign, aExp, zSig0, zSig1 );
3533
3534 }
3535
3536 #endif
3537
3538 /*
3539 -------------------------------------------------------------------------------
3540 Rounds the extended double-precision floating-point value `a' to an integer,
3541 and returns the result as an extended quadruple-precision floating-point
3542 value. The operation is performed according to the IEC/IEEE Standard for
3543 Binary Floating-Point Arithmetic.
3544 -------------------------------------------------------------------------------
3545 */
3546 floatx80 floatx80_round_to_int( floatx80 a )
3547 {
3548 flag aSign;
3549 int32 aExp;
3550 bits64 lastBitMask, roundBitsMask;
3551 int8 roundingMode;
3552 floatx80 z;
3553
3554 aExp = extractFloatx80Exp( a );
3555 if ( 0x403E <= aExp ) {
3556 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3557 return propagateFloatx80NaN( a, a );
3558 }
3559 return a;
3560 }
3561 if ( aExp < 0x3FFF ) {
3562 if ( ( aExp == 0 )
3563 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3564 return a;
3565 }
3566 set_float_exception_inexact_flag();
3567 aSign = extractFloatx80Sign( a );
3568 switch ( float_rounding_mode ) {
3569 case float_round_nearest_even:
3570 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3571 ) {
3572 return
3573 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3574 }
3575 break;
3576 case float_round_to_zero:
3577 break;
3578 case float_round_down:
3579 return
3580 aSign ?
3581 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3582 : packFloatx80( 0, 0, 0 );
3583 case float_round_up:
3584 return
3585 aSign ? packFloatx80( 1, 0, 0 )
3586 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3587 }
3588 return packFloatx80( aSign, 0, 0 );
3589 }
3590 lastBitMask = 1;
3591 lastBitMask <<= 0x403E - aExp;
3592 roundBitsMask = lastBitMask - 1;
3593 z = a;
3594 roundingMode = float_rounding_mode;
3595 if ( roundingMode == float_round_nearest_even ) {
3596 z.low += lastBitMask>>1;
3597 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3598 }
3599 else if ( roundingMode != float_round_to_zero ) {
3600 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3601 z.low += roundBitsMask;
3602 }
3603 }
3604 z.low &= ~ roundBitsMask;
3605 if ( z.low == 0 ) {
3606 ++z.high;
3607 z.low = LIT64( 0x8000000000000000 );
3608 }
3609 z.high <<= X80SHIFT;
3610 if ( z.low != a.low ) set_float_exception_inexact_flag();
3611 return z;
3612
3613 }
3614
3615 /*
3616 -------------------------------------------------------------------------------
3617 Returns the result of adding the absolute values of the extended double-
3618 precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3619 negated before being returned. `zSign' is ignored if the result is a NaN.
3620 The addition is performed according to the IEC/IEEE Standard for Binary
3621 Floating-Point Arithmetic.
3622 -------------------------------------------------------------------------------
3623 */
3624 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3625 {
3626 int32 aExp, bExp, zExp;
3627 bits64 aSig, bSig, zSig0, zSig1;
3628 int32 expDiff;
3629
3630 aSig = extractFloatx80Frac( a );
3631 aExp = extractFloatx80Exp( a );
3632 bSig = extractFloatx80Frac( b );
3633 bExp = extractFloatx80Exp( b );
3634 expDiff = aExp - bExp;
3635 if ( 0 < expDiff ) {
3636 if ( aExp == 0x7FFF ) {
3637 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3638 return a;
3639 }
3640 if ( bExp == 0 ) --expDiff;
3641 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3642 zExp = aExp;
3643 }
3644 else if ( expDiff < 0 ) {
3645 if ( bExp == 0x7FFF ) {
3646 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3647 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3648 }
3649 if ( aExp == 0 ) ++expDiff;
3650 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3651 zExp = bExp;
3652 }
3653 else {
3654 if ( aExp == 0x7FFF ) {
3655 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3656 return propagateFloatx80NaN( a, b );
3657 }
3658 return a;
3659 }
3660 zSig1 = 0;
3661 zSig0 = aSig + bSig;
3662 if ( aExp == 0 ) {
3663 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3664 goto roundAndPack;
3665 }
3666 zExp = aExp;
3667 goto shiftRight1;
3668 }
3669 zSig0 = aSig + bSig;
3670 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3671 shiftRight1:
3672 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3673 zSig0 |= LIT64( 0x8000000000000000 );
3674 ++zExp;
3675 roundAndPack:
3676 return
3677 roundAndPackFloatx80(
3678 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3679
3680 }
3681
3682 /*
3683 -------------------------------------------------------------------------------
3684 Returns the result of subtracting the absolute values of the extended
3685 double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3686 difference is negated before being returned. `zSign' is ignored if the
3687 result is a NaN. The subtraction is performed according to the IEC/IEEE
3688 Standard for Binary Floating-Point Arithmetic.
3689 -------------------------------------------------------------------------------
3690 */
3691 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3692 {
3693 int32 aExp, bExp, zExp;
3694 bits64 aSig, bSig, zSig0, zSig1;
3695 int32 expDiff;
3696 floatx80 z;
3697
3698 aSig = extractFloatx80Frac( a );
3699 aExp = extractFloatx80Exp( a );
3700 bSig = extractFloatx80Frac( b );
3701 bExp = extractFloatx80Exp( b );
3702 expDiff = aExp - bExp;
3703 if ( 0 < expDiff ) goto aExpBigger;
3704 if ( expDiff < 0 ) goto bExpBigger;
3705 if ( aExp == 0x7FFF ) {
3706 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3707 return propagateFloatx80NaN( a, b );
3708 }
3709 float_raise( float_flag_invalid );
3710 z.low = floatx80_default_nan_low;
3711 z.high = floatx80_default_nan_high<<X80SHIFT;
3712 return z;
3713 }
3714 if ( aExp == 0 ) {
3715 aExp = 1;
3716 bExp = 1;
3717 }
3718 zSig1 = 0;
3719 if ( bSig < aSig ) goto aBigger;
3720 if ( aSig < bSig ) goto bBigger;
3721 return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
3722 bExpBigger:
3723 if ( bExp == 0x7FFF ) {
3724 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3725 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3726 }
3727 if ( aExp == 0 ) ++expDiff;
3728 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3729 bBigger:
3730 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3731 zExp = bExp;
3732 zSign ^= 1;
3733 goto normalizeRoundAndPack;
3734 aExpBigger:
3735 if ( aExp == 0x7FFF ) {
3736 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3737 return a;
3738 }
3739 if ( bExp == 0 ) --expDiff;
3740 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3741 aBigger:
3742 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3743 zExp = aExp;
3744 normalizeRoundAndPack:
3745 return
3746 normalizeRoundAndPackFloatx80(
3747 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3748
3749 }
3750
3751 /*
3752 -------------------------------------------------------------------------------
3753 Returns the result of adding the extended double-precision floating-point
3754 values `a' and `b'. The operation is performed according to the IEC/IEEE
3755 Standard for Binary Floating-Point Arithmetic.
3756 -------------------------------------------------------------------------------
3757 */
3758 floatx80 floatx80_add( floatx80 a, floatx80 b )
3759 {
3760 flag aSign, bSign;
3761
3762 aSign = extractFloatx80Sign( a );
3763 bSign = extractFloatx80Sign( b );
3764 if ( aSign == bSign ) {
3765 return addFloatx80Sigs( a, b, aSign );
3766 }
3767 else {
3768 return subFloatx80Sigs( a, b, aSign );
3769 }
3770
3771 }
3772
3773 /*
3774 -------------------------------------------------------------------------------
3775 Returns the result of subtracting the extended double-precision floating-
3776 point values `a' and `b'. The operation is performed according to the
3777 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3778 -------------------------------------------------------------------------------
3779 */
3780 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3781 {
3782 flag aSign, bSign;
3783
3784 aSign = extractFloatx80Sign( a );
3785 bSign = extractFloatx80Sign( b );
3786 if ( aSign == bSign ) {
3787 return subFloatx80Sigs( a, b, aSign );
3788 }
3789 else {
3790 return addFloatx80Sigs( a, b, aSign );
3791 }
3792
3793 }
3794
3795 /*
3796 -------------------------------------------------------------------------------
3797 Returns the result of multiplying the extended double-precision floating-
3798 point values `a' and `b'. The operation is performed according to the
3799 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3800 -------------------------------------------------------------------------------
3801 */
3802 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3803 {
3804 flag aSign, bSign, zSign;
3805 int32 aExp, bExp, zExp;
3806 bits64 aSig, bSig, zSig0, zSig1;
3807 floatx80 z;
3808
3809 aSig = extractFloatx80Frac( a );
3810 aExp = extractFloatx80Exp( a );
3811 aSign = extractFloatx80Sign( a );
3812 bSig = extractFloatx80Frac( b );
3813 bExp = extractFloatx80Exp( b );
3814 bSign = extractFloatx80Sign( b );
3815 zSign = aSign ^ bSign;
3816 if ( aExp == 0x7FFF ) {
3817 if ( (bits64) ( aSig<<1 )
3818 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3819 return propagateFloatx80NaN( a, b );
3820 }
3821 if ( ( bExp | bSig ) == 0 ) goto invalid;
3822 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3823 }
3824 if ( bExp == 0x7FFF ) {
3825 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3826 if ( ( aExp | aSig ) == 0 ) {
3827 invalid:
3828 float_raise( float_flag_invalid );
3829 z.low = floatx80_default_nan_low;
3830 z.high = floatx80_default_nan_high<<X80SHIFT;
3831 return z;
3832 }
3833 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3834 }
3835 if ( aExp == 0 ) {
3836 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3837 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3838 }
3839 if ( bExp == 0 ) {
3840 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3841 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3842 }
3843 zExp = aExp + bExp - 0x3FFE;
3844 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3845 if ( 0 < (sbits64) zSig0 ) {
3846 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3847 --zExp;
3848 }
3849 return
3850 roundAndPackFloatx80(
3851 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3852
3853 }
3854
3855 /*
3856 -------------------------------------------------------------------------------
3857 Returns the result of dividing the extended double-precision floating-point
3858 value `a' by the corresponding value `b'. The operation is performed
3859 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3860 -------------------------------------------------------------------------------
3861 */
3862 floatx80 floatx80_div( floatx80 a, floatx80 b )
3863 {
3864 flag aSign, bSign, zSign;
3865 int32 aExp, bExp, zExp;
3866 bits64 aSig, bSig, zSig0, zSig1;
3867 bits64 rem0, rem1, rem2, term0, term1, term2;
3868 floatx80 z;
3869
3870 aSig = extractFloatx80Frac( a );
3871 aExp = extractFloatx80Exp( a );
3872 aSign = extractFloatx80Sign( a );
3873 bSig = extractFloatx80Frac( b );
3874 bExp = extractFloatx80Exp( b );
3875 bSign = extractFloatx80Sign( b );
3876 zSign = aSign ^ bSign;
3877 if ( aExp == 0x7FFF ) {
3878 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3879 if ( bExp == 0x7FFF ) {
3880 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3881 goto invalid;
3882 }
3883 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3884 }
3885 if ( bExp == 0x7FFF ) {
3886 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3887 return packFloatx80( zSign, 0, 0 );
3888 }
3889 if ( bExp == 0 ) {
3890 if ( bSig == 0 ) {
3891 if ( ( aExp | aSig ) == 0 ) {
3892 invalid:
3893 float_raise( float_flag_invalid );
3894 z.low = floatx80_default_nan_low;
3895 z.high = floatx80_default_nan_high<<X80SHIFT;
3896 return z;
3897 }
3898 float_raise( float_flag_divbyzero );
3899 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3900 }
3901 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3902 }
3903 if ( aExp == 0 ) {
3904 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3905 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3906 }
3907 zExp = aExp - bExp + 0x3FFE;
3908 rem1 = 0;
3909 if ( bSig <= aSig ) {
3910 shift128Right( aSig, 0, 1, &aSig, &rem1 );
3911 ++zExp;
3912 }
3913 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3914 mul64To128( bSig, zSig0, &term0, &term1 );
3915 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3916 while ( (sbits64) rem0 < 0 ) {
3917 --zSig0;
3918 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3919 }
3920 zSig1 = estimateDiv128To64( rem1, 0, bSig );
3921 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3922 mul64To128( bSig, zSig1, &term1, &term2 );
3923 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3924 while ( (sbits64) rem1 < 0 ) {
3925 --zSig1;
3926 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3927 }
3928 zSig1 |= ( ( rem1 | rem2 ) != 0 );
3929 }
3930 return
3931 roundAndPackFloatx80(
3932 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3933
3934 }
3935
3936 /*
3937 -------------------------------------------------------------------------------
3938 Returns the remainder of the extended double-precision floating-point value
3939 `a' with respect to the corresponding value `b'. The operation is performed
3940 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3941 -------------------------------------------------------------------------------
3942 */
3943 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3944 {
3945 flag aSign, zSign;
3946 int32 aExp, bExp, expDiff;
3947 bits64 aSig0, aSig1, bSig;
3948 bits64 q, term0, term1, alternateASig0, alternateASig1;
3949 floatx80 z;
3950
3951 aSig0 = extractFloatx80Frac( a );
3952 aExp = extractFloatx80Exp( a );
3953 aSign = extractFloatx80Sign( a );
3954 bSig = extractFloatx80Frac( b );
3955 bExp = extractFloatx80Exp( b );
3956
3957 if ( aExp == 0x7FFF ) {
3958 if ( (bits64) ( aSig0<<1 )
3959 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3960 return propagateFloatx80NaN( a, b );
3961 }
3962 goto invalid;
3963 }
3964 if ( bExp == 0x7FFF ) {
3965 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3966 return a;
3967 }
3968 if ( bExp == 0 ) {
3969 if ( bSig == 0 ) {
3970 invalid:
3971 float_raise( float_flag_invalid );
3972 z.low = floatx80_default_nan_low;
3973 z.high = floatx80_default_nan_high<<X80SHIFT;
3974 return z;
3975 }
3976 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3977 }
3978 if ( aExp == 0 ) {
3979 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3980 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3981 }
3982 bSig |= LIT64( 0x8000000000000000 );
3983 zSign = aSign;
3984 expDiff = aExp - bExp;
3985 aSig1 = 0;
3986 if ( expDiff < 0 ) {
3987 if ( expDiff < -1 ) return a;
3988 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3989 expDiff = 0;
3990 }
3991 q = ( bSig <= aSig0 );
3992 if ( q ) aSig0 -= bSig;
3993 expDiff -= 64;
3994 while ( 0 < expDiff ) {
3995 q = estimateDiv128To64( aSig0, aSig1, bSig );
3996 q = ( 2 < q ) ? q - 2 : 0;
3997 mul64To128( bSig, q, &term0, &term1 );
3998 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3999 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4000 expDiff -= 62;
4001 }
4002 expDiff += 64;
4003 if ( 0 < expDiff ) {
4004 q = estimateDiv128To64( aSig0, aSig1, bSig );
4005 q = ( 2 < q ) ? q - 2 : 0;
4006 q >>= 64 - expDiff;
4007 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4008 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4009 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4010 while ( le128( term0, term1, aSig0, aSig1 ) ) {
4011 ++q;
4012 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4013 }
4014 }
4015 else {
4016 term1 = 0;
4017 term0 = bSig;
4018 }
4019 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4020 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4021 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4022 && ( q & 1 ) )
4023 ) {
4024 aSig0 = alternateASig0;
4025 aSig1 = alternateASig1;
4026 zSign = ! zSign;
4027 }
4028 return
4029 normalizeRoundAndPackFloatx80(
4030 80, zSign, bExp + expDiff, aSig0, aSig1 );
4031
4032 }
4033
4034 /*
4035 -------------------------------------------------------------------------------
4036 Returns the square root of the extended double-precision floating-point
4037 value `a'. The operation is performed according to the IEC/IEEE Standard
4038 for Binary Floating-Point Arithmetic.
4039 -------------------------------------------------------------------------------
4040 */
4041 floatx80 floatx80_sqrt( floatx80 a )
4042 {
4043 flag aSign;
4044 int32 aExp, zExp;
4045 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4046 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4047 floatx80 z;
4048
4049 aSig0 = extractFloatx80Frac( a );
4050 aExp = extractFloatx80Exp( a );
4051 aSign = extractFloatx80Sign( a );
4052 if ( aExp == 0x7FFF ) {
4053 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
4054 if ( ! aSign ) return a;
4055 goto invalid;
4056 }
4057 if ( aSign ) {
4058 if ( ( aExp | aSig0 ) == 0 ) return a;
4059 invalid:
4060 float_raise( float_flag_invalid );
4061 z.low = floatx80_default_nan_low;
4062 z.high = floatx80_default_nan_high<<X80SHIFT;
4063 return z;
4064 }
4065 if ( aExp == 0 ) {
4066 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4067 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4068 }
4069 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4070 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4071 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4072 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4073 doubleZSig0 = zSig0<<1;
4074 mul64To128( zSig0, zSig0, &term0, &term1 );
4075 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4076 while ( (sbits64) rem0 < 0 ) {
4077 --zSig0;
4078 doubleZSig0 -= 2;
4079 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4080 }
4081 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4082 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4083 if ( zSig1 == 0 ) zSig1 = 1;
4084 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4085 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4086 mul64To128( zSig1, zSig1, &term2, &term3 );
4087 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4088 while ( (sbits64) rem1 < 0 ) {
4089 --zSig1;
4090 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4091 term3 |= 1;
4092 term2 |= doubleZSig0;
4093 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4094 }
4095 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4096 }
4097 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4098 zSig0 |= doubleZSig0;
4099 return
4100 roundAndPackFloatx80(
4101 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4102
4103 }
4104
4105 /*
4106 -------------------------------------------------------------------------------
4107 Returns 1 if the extended double-precision floating-point value `a' is
4108 equal to the corresponding value `b', and 0 otherwise. The comparison is
4109 performed according to the IEC/IEEE Standard for Binary Floating-Point
4110 Arithmetic.
4111 -------------------------------------------------------------------------------
4112 */
4113 flag floatx80_eq( floatx80 a, floatx80 b )
4114 {
4115
4116 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4117 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4118 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4119 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4120 ) {
4121 if ( floatx80_is_signaling_nan( a )
4122 || floatx80_is_signaling_nan( b ) ) {
4123 float_raise( float_flag_invalid );
4124 }
4125 return 0;
4126 }
4127 return
4128 ( a.low == b.low )
4129 && ( ( a.high == b.high )
4130 || ( ( a.low == 0 )
4131 && ( (bits16) ( ((bits32)( a.high | b.high )>>X80SHIFT)<<1 )
4132 == 0 ) ) );
4133
4134 }
4135
4136 /*
4137 -------------------------------------------------------------------------------
4138 Returns 1 if the extended double-precision floating-point value `a' is
4139 less than or equal to the corresponding value `b', and 0 otherwise. The
4140 comparison is performed according to the IEC/IEEE Standard for Binary
4141 Floating-Point Arithmetic.
4142 -------------------------------------------------------------------------------
4143 */
4144 flag floatx80_le( floatx80 a, floatx80 b )
4145 {
4146 flag aSign, bSign;
4147
4148 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4149 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4150 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4151 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4152 ) {
4153 float_raise( float_flag_invalid );
4154 return 0;
4155 }
4156 aSign = extractFloatx80Sign( a );
4157 bSign = extractFloatx80Sign( b );
4158 if ( aSign != bSign ) {
4159 return
4160 aSign
4161 || ( ( ( (bits16) ( ((bits32)( a.high | b.high )>>X80SHIFT)
4162 <<1 ) ) | a.low | b.low ) == 0 );
4163 }
4164 return
4165 aSign ? le128( b.high>>X80SHIFT, b.low, a.high>>X80SHIFT, a.low )
4166 : le128( a.high>>X80SHIFT, a.low, b.high>>X80SHIFT, b.low );
4167
4168 }
4169
4170 /*
4171 -------------------------------------------------------------------------------
4172 Returns 1 if the extended double-precision floating-point value `a' is
4173 less than the corresponding value `b', and 0 otherwise. The comparison
4174 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4175 Arithmetic.
4176 -------------------------------------------------------------------------------
4177 */
4178 flag floatx80_lt( floatx80 a, floatx80 b )
4179 {
4180 flag aSign, bSign;
4181
4182 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4183 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4184 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4185 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4186 ) {
4187 float_raise( float_flag_invalid );
4188 return 0;
4189 }
4190 aSign = extractFloatx80Sign( a );
4191 bSign = extractFloatx80Sign( b );
4192 if ( aSign != bSign ) {
4193 return
4194 aSign
4195 && ( ( ( (bits16) ( ((bits32)( a.high | b.high )>>X80SHIFT)
4196 <<1 ) ) | a.low | b.low ) != 0 );
4197 }
4198 return
4199 aSign ? lt128( b.high>>X80SHIFT, b.low, a.high>>X80SHIFT, a.low )
4200 : lt128( a.high>>X80SHIFT, a.low, b.high>>X80SHIFT, b.low );
4201
4202 }
4203
4204 /*
4205 -------------------------------------------------------------------------------
4206 Returns 1 if the extended double-precision floating-point value `a' is equal
4207 to the corresponding value `b', and 0 otherwise. The invalid exception is
4208 raised if either operand is a NaN. Otherwise, the comparison is performed
4209 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4210 -------------------------------------------------------------------------------
4211 */
4212 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4213 {
4214
4215 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4216 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4217 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4218 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4219 ) {
4220 float_raise( float_flag_invalid );
4221 return 0;
4222 }
4223 return
4224 ( a.low == b.low )
4225 && ( ( (a.high>>X80SHIFT) == (b.high>>X80SHIFT) )
4226 || ( ( a.low == 0 )
4227 && ( (bits16) ( ((bits32)( a.high | b.high )>>X80SHIFT)<<1 )
4228 == 0 ) ) );
4229
4230 }
4231
4232 /*
4233 -------------------------------------------------------------------------------
4234 Returns 1 if the extended double-precision floating-point value `a' is less
4235 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4236 do not cause an exception. Otherwise, the comparison is performed according
4237 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4238 -------------------------------------------------------------------------------
4239 */
4240 flag floatx80_le_quiet( floatx80 a, floatx80 b )
4241 {
4242 flag aSign, bSign;
4243
4244 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4245 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4246 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4247 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4248 ) {
4249 if ( floatx80_is_signaling_nan( a )
4250 || floatx80_is_signaling_nan( b ) ) {
4251 float_raise( float_flag_invalid );
4252 }
4253 return 0;
4254 }
4255 aSign = extractFloatx80Sign( a );
4256 bSign = extractFloatx80Sign( b );
4257 if ( aSign != bSign ) {
4258 return
4259 aSign
4260 || ( ( ( (bits16) ( ((bits32)( a.high | b.high )>>X80SHIFT)<<1
4261 ) ) | a.low | b.low ) == 0 );
4262 }
4263 return
4264 aSign ? le128( b.high>>X80SHIFT, b.low, a.high>>X80SHIFT, a.low )
4265 : le128( a.high>>X80SHIFT, a.low, b.high>>X80SHIFT, b.low );
4266
4267 }
4268
4269 /*
4270 -------------------------------------------------------------------------------
4271 Returns 1 if the extended double-precision floating-point value `a' is less
4272 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4273 an exception. Otherwise, the comparison is performed according to the
4274 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4275 -------------------------------------------------------------------------------
4276 */
4277 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4278 {
4279 flag aSign, bSign;
4280
4281 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4282 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4283 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4284 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4285 ) {
4286 if ( floatx80_is_signaling_nan( a )
4287 || floatx80_is_signaling_nan( b ) ) {
4288 float_raise( float_flag_invalid );
4289 }
4290 return 0;
4291 }
4292 aSign = extractFloatx80Sign( a );
4293 bSign = extractFloatx80Sign( b );
4294 if ( aSign != bSign ) {
4295 return
4296 aSign
4297 && ( ( ( (bits16) ( ((bits32)( a.high | b.high )>>X80SHIFT)<<1 )
4298 ) | a.low | b.low ) != 0 );
4299 }
4300 return
4301 aSign ? lt128( b.high>>X80SHIFT, b.low, a.high>>X80SHIFT, a.low )
4302 : lt128( a.high>>X80SHIFT, a.low, b.high>>X80SHIFT, b.low );
4303
4304 }
4305
4306 #endif
4307
4308 #ifdef FLOAT128
4309
4310 /*
4311 -------------------------------------------------------------------------------
4312 Returns the result of converting the quadruple-precision floating-point
4313 value `a' to the 32-bit two's complement integer format. The conversion
4314 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4315 Arithmetic---which means in particular that the conversion is rounded
4316 according to the current rounding mode. If `a' is a NaN, the largest
4317 positive integer is returned. Otherwise, if the conversion overflows, the
4318 largest integer with the same sign as `a' is returned.
4319 -------------------------------------------------------------------------------
4320 */
4321 int32 float128_to_int32( float128 a )
4322 {
4323 flag aSign;
4324 int32 aExp, shiftCount;
4325 bits64 aSig0, aSig1;
4326
4327 aSig1 = extractFloat128Frac1( a );
4328 aSig0 = extractFloat128Frac0( a );
4329 aExp = extractFloat128Exp( a );
4330 aSign = extractFloat128Sign( a );
4331 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4332 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4333 aSig0 |= ( aSig1 != 0 );
4334 shiftCount = 0x4028 - aExp;
4335 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4336 return roundAndPackInt32( aSign, aSig0 );
4337
4338 }
4339
4340 /*
4341 -------------------------------------------------------------------------------
4342 Returns the result of converting the quadruple-precision floating-point
4343 value `a' to the 32-bit two's complement integer format. The conversion
4344 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4345 Arithmetic, except that the conversion is always rounded toward zero. If
4346 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4347 conversion overflows, the largest integer with the same sign as `a' is
4348 returned.
4349 -------------------------------------------------------------------------------
4350 */
4351 int32 float128_to_int32_round_to_zero( float128 a )
4352 {
4353 flag aSign;
4354 int32 aExp, shiftCount;
4355 bits64 aSig0, aSig1, savedASig;
4356 int32 z;
4357
4358 aSig1 = extractFloat128Frac1( a );
4359 aSig0 = extractFloat128Frac0( a );
4360 aExp = extractFloat128Exp( a );
4361 aSign = extractFloat128Sign( a );
4362 aSig0 |= ( aSig1 != 0 );
4363 if ( 0x401E < aExp ) {
4364 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4365 goto invalid;
4366 }
4367 else if ( aExp < 0x3FFF ) {
4368 if ( aExp || aSig0 ) set_float_exception_inexact_flag();
4369 return 0;
4370 }
4371 aSig0 |= LIT64( 0x0001000000000000 );
4372 shiftCount = 0x402F - aExp;
4373 savedASig = aSig0;
4374 aSig0 >>= shiftCount;
4375 z = (int32)aSig0;
4376 if ( aSign ) z = - z;
4377 if ( ( z < 0 ) ^ aSign ) {
4378 invalid:
4379 float_raise( float_flag_invalid );
4380 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4381 }
4382 if ( ( aSig0<<shiftCount ) != savedASig ) {
4383 set_float_exception_inexact_flag();
4384 }
4385 return z;
4386
4387 }
4388
4389 /*
4390 -------------------------------------------------------------------------------
4391 Returns the result of converting the quadruple-precision floating-point
4392 value `a' to the 64-bit two's complement integer format. The conversion
4393 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4394 Arithmetic---which means in particular that the conversion is rounded
4395 according to the current rounding mode. If `a' is a NaN, the largest
4396 positive integer is returned. Otherwise, if the conversion overflows, the
4397 largest integer with the same sign as `a' is returned.
4398 -------------------------------------------------------------------------------
4399 */
4400 int64 float128_to_int64( float128 a )
4401 {
4402 flag aSign;
4403 int32 aExp, shiftCount;
4404 bits64 aSig0, aSig1;
4405
4406 aSig1 = extractFloat128Frac1( a );
4407 aSig0 = extractFloat128Frac0( a );
4408 aExp = extractFloat128Exp( a );
4409 aSign = extractFloat128Sign( a );
4410 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4411 shiftCount = 0x402F - aExp;
4412 if ( shiftCount <= 0 ) {
4413 if ( 0x403E < aExp ) {
4414 float_raise( float_flag_invalid );
4415 if ( ! aSign
4416 || ( ( aExp == 0x7FFF )
4417 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4418 )
4419 ) {
4420 return LIT64( 0x7FFFFFFFFFFFFFFF );
4421 }
4422 return (sbits64) LIT64( 0x8000000000000000 );
4423 }
4424 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4425 }
4426 else {
4427 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4428 }
4429 return roundAndPackInt64( aSign, aSig0, aSig1 );
4430
4431 }
4432
4433 /*
4434 -------------------------------------------------------------------------------
4435 Returns the result of converting the quadruple-precision floating-point
4436 value `a' to the 64-bit two's complement integer format. The conversion
4437 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4438 Arithmetic, except that the conversion is always rounded toward zero.
4439 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4440 the conversion overflows, the largest integer with the same sign as `a' is
4441 returned.
4442 -------------------------------------------------------------------------------
4443 */
4444 int64 float128_to_int64_round_to_zero( float128 a )
4445 {
4446 flag aSign;
4447 int32 aExp, shiftCount;
4448 bits64 aSig0, aSig1;
4449 int64 z;
4450
4451 aSig1 = extractFloat128Frac1( a );
4452 aSig0 = extractFloat128Frac0( a );
4453 aExp = extractFloat128Exp( a );
4454 aSign = extractFloat128Sign( a );
4455 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4456 shiftCount = aExp - 0x402F;
4457 if ( 0 < shiftCount ) {
4458 if ( 0x403E <= aExp ) {
4459 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4460 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4461 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4462 if ( aSig1 ) set_float_exception_inexact_flag();
4463 }
4464 else {
4465 float_raise( float_flag_invalid );
4466 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4467 return LIT64( 0x7FFFFFFFFFFFFFFF );
4468 }
4469 }
4470 return (sbits64) LIT64( 0x8000000000000000 );
4471 }
4472 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4473 if ( (bits64) ( aSig1<<shiftCount ) ) {
4474 set_float_exception_inexact_flag();
4475 }
4476 }
4477 else {
4478 if ( aExp < 0x3FFF ) {
4479 if ( aExp | aSig0 | aSig1 ) {
4480 set_float_exception_inexact_flag();
4481 }
4482 return 0;
4483 }
4484 z = aSig0>>( - shiftCount );
4485 if ( aSig1
4486 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4487 set_float_exception_inexact_flag();
4488 }
4489 }
4490 if ( aSign ) z = - z;
4491 return z;
4492
4493 }
4494
4495 #if (defined(SOFTFLOATSPARC64_FOR_GCC) || defined(SOFTFLOAT_FOR_GCC)) \
4496 && defined(SOFTFLOAT_NEED_FIXUNS)
4497 /*
4498 * just like above - but do not care for overflow of signed results
4499 */
4500 uint64 float128_to_uint64_round_to_zero( float128 a )
4501 {
4502 flag aSign;
4503 int32 aExp, shiftCount;
4504 bits64 aSig0, aSig1;
4505 uint64 z;
4506
4507 aSig1 = extractFloat128Frac1( a );
4508 aSig0 = extractFloat128Frac0( a );
4509 aExp = extractFloat128Exp( a );
4510 aSign = extractFloat128Sign( a );
4511 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4512 shiftCount = aExp - 0x402F;
4513 if ( 0 < shiftCount ) {
4514 if ( 0x403F <= aExp ) {
4515 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4516 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4517 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4518 if ( aSig1 ) set_float_exception_inexact_flag();
4519 }
4520 else {
4521 float_raise( float_flag_invalid );
4522 }
4523 return LIT64( 0xFFFFFFFFFFFFFFFF );
4524 }
4525 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4526 if ( (bits64) ( aSig1<<shiftCount ) ) {
4527 set_float_exception_inexact_flag();
4528 }
4529 }
4530 else {
4531 if ( aExp < 0x3FFF ) {
4532 if ( aExp | aSig0 | aSig1 ) {
4533 set_float_exception_inexact_flag();
4534 }
4535 return 0;
4536 }
4537 z = aSig0>>( - shiftCount );
4538 if (aSig1 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4539 set_float_exception_inexact_flag();
4540 }
4541 }
4542 if ( aSign ) z = - z;
4543 return z;
4544
4545 }
4546 #endif /* (SOFTFLOATSPARC64_FOR_GCC || SOFTFLOAT_FOR_GCC) && SOFTFLOAT_NEED_FIXUNS */
4547
4548 /*
4549 -------------------------------------------------------------------------------
4550 Returns the result of converting the quadruple-precision floating-point
4551 value `a' to the single-precision floating-point format. The conversion
4552 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4553 Arithmetic.
4554 -------------------------------------------------------------------------------
4555 */
4556 float32 float128_to_float32( float128 a )
4557 {
4558 flag aSign;
4559 int32 aExp;
4560 bits64 aSig0, aSig1;
4561 bits32 zSig;
4562
4563 aSig1 = extractFloat128Frac1( a );
4564 aSig0 = extractFloat128Frac0( a );
4565 aExp = extractFloat128Exp( a );
4566 aSign = extractFloat128Sign( a );
4567 if ( aExp == 0x7FFF ) {
4568 if ( aSig0 | aSig1 ) {
4569 return commonNaNToFloat32( float128ToCommonNaN( a ) );
4570 }
4571 return packFloat32( aSign, 0xFF, 0 );
4572 }
4573 aSig0 |= ( aSig1 != 0 );
4574 shift64RightJamming( aSig0, 18, &aSig0 );
4575 zSig = (bits32)aSig0;
4576 if ( aExp || zSig ) {
4577 zSig |= 0x40000000;
4578 aExp -= 0x3F81;
4579 }
4580 return roundAndPackFloat32( aSign, aExp, zSig );
4581
4582 }
4583
4584 /*
4585 -------------------------------------------------------------------------------
4586 Returns the result of converting the quadruple-precision floating-point
4587 value `a' to the double-precision floating-point format. The conversion
4588 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4589 Arithmetic.
4590 -------------------------------------------------------------------------------
4591 */
4592 float64 float128_to_float64( float128 a )
4593 {
4594 flag aSign;
4595 int32 aExp;
4596 bits64 aSig0, aSig1;
4597
4598 aSig1 = extractFloat128Frac1( a );
4599 aSig0 = extractFloat128Frac0( a );
4600 aExp = extractFloat128Exp( a );
4601 aSign = extractFloat128Sign( a );
4602 if ( aExp == 0x7FFF ) {
4603 if ( aSig0 | aSig1 ) {
4604 return commonNaNToFloat64( float128ToCommonNaN( a ) );
4605 }
4606 return packFloat64( aSign, 0x7FF, 0 );
4607 }
4608 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4609 aSig0 |= ( aSig1 != 0 );
4610 if ( aExp || aSig0 ) {
4611 aSig0 |= LIT64( 0x4000000000000000 );
4612 aExp -= 0x3C01;
4613 }
4614 return roundAndPackFloat64( aSign, aExp, aSig0 );
4615
4616 }
4617
4618 #ifdef FLOATX80
4619
4620 /*
4621 -------------------------------------------------------------------------------
4622 Returns the result of converting the quadruple-precision floating-point
4623 value `a' to the extended double-precision floating-point format. The
4624 conversion is performed according to the IEC/IEEE Standard for Binary
4625 Floating-Point Arithmetic.
4626 -------------------------------------------------------------------------------
4627 */
4628 floatx80 float128_to_floatx80( float128 a )
4629 {
4630 flag aSign;
4631 int32 aExp;
4632 bits64 aSig0, aSig1;
4633
4634 aSig1 = extractFloat128Frac1( a );
4635 aSig0 = extractFloat128Frac0( a );
4636 aExp = extractFloat128Exp( a );
4637 aSign = extractFloat128Sign( a );
4638 if ( aExp == 0x7FFF ) {
4639 if ( aSig0 | aSig1 ) {
4640 return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4641 }
4642 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4643 }
4644 if ( aExp == 0 ) {
4645 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4646 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4647 }
4648 else {
4649 aSig0 |= LIT64( 0x0001000000000000 );
4650 }
4651 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4652 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4653
4654 }
4655
4656 #endif
4657
4658 /*
4659 -------------------------------------------------------------------------------
4660 Rounds the quadruple-precision floating-point value `a' to an integer, and
4661 returns the result as a quadruple-precision floating-point value. The
4662 operation is performed according to the IEC/IEEE Standard for Binary
4663 Floating-Point Arithmetic.
4664 -------------------------------------------------------------------------------
4665 */
4666 float128 float128_round_to_int( float128 a )
4667 {
4668 flag aSign;
4669 int32 aExp;
4670 bits64 lastBitMask, roundBitsMask;
4671 int8 roundingMode;
4672 float128 z;
4673
4674 aExp = extractFloat128Exp( a );
4675 if ( 0x402F <= aExp ) {
4676 if ( 0x406F <= aExp ) {
4677 if ( ( aExp == 0x7FFF )
4678 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4679 ) {
4680 return propagateFloat128NaN( a, a );
4681 }
4682 return a;
4683 }
4684 lastBitMask = 1;
4685 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4686 roundBitsMask = lastBitMask - 1;
4687 z = a;
4688 roundingMode = float_rounding_mode;
4689 if ( roundingMode == float_round_nearest_even ) {
4690 if ( lastBitMask ) {
4691 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4692 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4693 }
4694 else {
4695 if ( (sbits64) z.low < 0 ) {
4696 ++z.high;
4697 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4698 }
4699 }
4700 }
4701 else if ( roundingMode != float_round_to_zero ) {
4702 if ( extractFloat128Sign( z )
4703 ^ ( roundingMode == float_round_up ) ) {
4704 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4705 }
4706 }
4707 z.low &= ~ roundBitsMask;
4708 }
4709 else {
4710 if ( aExp < 0x3FFF ) {
4711 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4712 set_float_exception_inexact_flag();
4713 aSign = extractFloat128Sign( a );
4714 switch ( float_rounding_mode ) {
4715 case float_round_nearest_even:
4716 if ( ( aExp == 0x3FFE )
4717 && ( extractFloat128Frac0( a )
4718 | extractFloat128Frac1( a ) )
4719 ) {
4720 return packFloat128( aSign, 0x3FFF, 0, 0 );
4721 }
4722 break;
4723 case float_round_to_zero:
4724 break;
4725 case float_round_down:
4726 return
4727 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4728 : packFloat128( 0, 0, 0, 0 );
4729 case float_round_up:
4730 return
4731 aSign ? packFloat128( 1, 0, 0, 0 )
4732 : packFloat128( 0, 0x3FFF, 0, 0 );
4733 }
4734 return packFloat128( aSign, 0, 0, 0 );
4735 }
4736 lastBitMask = 1;
4737 lastBitMask <<= 0x402F - aExp;
4738 roundBitsMask = lastBitMask - 1;
4739 z.low = 0;
4740 z.high = a.high;
4741 roundingMode = float_rounding_mode;
4742 if ( roundingMode == float_round_nearest_even ) {
4743 z.high += lastBitMask>>1;
4744 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4745 z.high &= ~ lastBitMask;
4746 }
4747 }
4748 else if ( roundingMode != float_round_to_zero ) {
4749 if ( extractFloat128Sign( z )
4750 ^ ( roundingMode == float_round_up ) ) {
4751 z.high |= ( a.low != 0 );
4752 z.high += roundBitsMask;
4753 }
4754 }
4755 z.high &= ~ roundBitsMask;
4756 }
4757 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4758 set_float_exception_inexact_flag();
4759 }
4760 return z;
4761
4762 }
4763
4764 /*
4765 -------------------------------------------------------------------------------
4766 Returns the result of adding the absolute values of the quadruple-precision
4767 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4768 before being returned. `zSign' is ignored if the result is a NaN.
4769 The addition is performed according to the IEC/IEEE Standard for Binary
4770 Floating-Point Arithmetic.
4771 -------------------------------------------------------------------------------
4772 */
4773 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4774 {
4775 int32 aExp, bExp, zExp;
4776 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4777 int32 expDiff;
4778
4779 aSig1 = extractFloat128Frac1( a );
4780 aSig0 = extractFloat128Frac0( a );
4781 aExp = extractFloat128Exp( a );
4782 bSig1 = extractFloat128Frac1( b );
4783 bSig0 = extractFloat128Frac0( b );
4784 bExp = extractFloat128Exp( b );
4785 expDiff = aExp - bExp;
4786 if ( 0 < expDiff ) {
4787 if ( aExp == 0x7FFF ) {
4788 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4789 return a;
4790 }
4791 if ( bExp == 0 ) {
4792 --expDiff;
4793 }
4794 else {
4795 bSig0 |= LIT64( 0x0001000000000000 );
4796 }
4797 shift128ExtraRightJamming(
4798 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4799 zExp = aExp;
4800 }
4801 else if ( expDiff < 0 ) {
4802 if ( bExp == 0x7FFF ) {
4803 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4804 return packFloat128( zSign, 0x7FFF, 0, 0 );
4805 }
4806 if ( aExp == 0 ) {
4807 ++expDiff;
4808 }
4809 else {
4810 aSig0 |= LIT64( 0x0001000000000000 );
4811 }
4812 shift128ExtraRightJamming(
4813 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4814 zExp = bExp;
4815 }
4816 else {
4817 if ( aExp == 0x7FFF ) {
4818 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4819 return propagateFloat128NaN( a, b );
4820 }
4821 return a;
4822 }
4823 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4824 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4825 zSig2 = 0;
4826 zSig0 |= LIT64( 0x0002000000000000 );
4827 zExp = aExp;
4828 goto shiftRight1;
4829 }
4830 aSig0 |= LIT64( 0x0001000000000000 );
4831 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4832 --zExp;
4833 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4834 ++zExp;
4835 shiftRight1:
4836 shift128ExtraRightJamming(
4837 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4838 roundAndPack:
4839 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4840
4841 }
4842
4843 /*
4844 -------------------------------------------------------------------------------
4845 Returns the result of subtracting the absolute values of the quadruple-
4846 precision floating-point values `a' and `b'. If `zSign' is 1, the
4847 difference is negated before being returned. `zSign' is ignored if the
4848 result is a NaN. The subtraction is performed according to the IEC/IEEE
4849 Standard for Binary Floating-Point Arithmetic.
4850 -------------------------------------------------------------------------------
4851 */
4852 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4853 {
4854 int32 aExp, bExp, zExp;
4855 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4856 int32 expDiff;
4857 float128 z;
4858
4859 aSig1 = extractFloat128Frac1( a );
4860 aSig0 = extractFloat128Frac0( a );
4861 aExp = extractFloat128Exp( a );
4862 bSig1 = extractFloat128Frac1( b );
4863 bSig0 = extractFloat128Frac0( b );
4864 bExp = extractFloat128Exp( b );
4865 expDiff = aExp - bExp;
4866 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4867 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4868 if ( 0 < expDiff ) goto aExpBigger;
4869 if ( expDiff < 0 ) goto bExpBigger;
4870 if ( aExp == 0x7FFF ) {
4871 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4872 return propagateFloat128NaN( a, b );
4873 }
4874 float_raise( float_flag_invalid );
4875 z.low = float128_default_nan_low;
4876 z.high = float128_default_nan_high;
4877 return z;
4878 }
4879 if ( aExp == 0 ) {
4880 aExp = 1;
4881 bExp = 1;
4882 }
4883 if ( bSig0 < aSig0 ) goto aBigger;
4884 if ( aSig0 < bSig0 ) goto bBigger;
4885 if ( bSig1 < aSig1 ) goto aBigger;
4886 if ( aSig1 < bSig1 ) goto bBigger;
4887 return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
4888 bExpBigger:
4889 if ( bExp == 0x7FFF ) {
4890 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4891 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4892 }
4893 if ( aExp == 0 ) {
4894 ++expDiff;
4895 }
4896 else {
4897 aSig0 |= LIT64( 0x4000000000000000 );
4898 }
4899 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4900 bSig0 |= LIT64( 0x4000000000000000 );
4901 bBigger:
4902 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4903 zExp = bExp;
4904 zSign ^= 1;
4905 goto normalizeRoundAndPack;
4906 aExpBigger:
4907 if ( aExp == 0x7FFF ) {
4908 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4909 return a;
4910 }
4911 if ( bExp == 0 ) {
4912 --expDiff;
4913 }
4914 else {
4915 bSig0 |= LIT64( 0x4000000000000000 );
4916 }
4917 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4918 aSig0 |= LIT64( 0x4000000000000000 );
4919 aBigger:
4920 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4921 zExp = aExp;
4922 normalizeRoundAndPack:
4923 --zExp;
4924 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4925
4926 }
4927
4928 /*
4929 -------------------------------------------------------------------------------
4930 Returns the result of adding the quadruple-precision floating-point values
4931 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4932 for Binary Floating-Point Arithmetic.
4933 -------------------------------------------------------------------------------
4934 */
4935 float128 float128_add( float128 a, float128 b )
4936 {
4937 flag aSign, bSign;
4938
4939 aSign = extractFloat128Sign( a );
4940 bSign = extractFloat128Sign( b );
4941 if ( aSign == bSign ) {
4942 return addFloat128Sigs( a, b, aSign );
4943 }
4944 else {
4945 return subFloat128Sigs( a, b, aSign );
4946 }
4947
4948 }
4949
4950 /*
4951 -------------------------------------------------------------------------------
4952 Returns the result of subtracting the quadruple-precision floating-point
4953 values `a' and `b'. The operation is performed according to the IEC/IEEE
4954 Standard for Binary Floating-Point Arithmetic.
4955 -------------------------------------------------------------------------------
4956 */
4957 float128 float128_sub( float128 a, float128 b )
4958 {
4959 flag aSign, bSign;
4960
4961 aSign = extractFloat128Sign( a );
4962 bSign = extractFloat128Sign( b );
4963 if ( aSign == bSign ) {
4964 return subFloat128Sigs( a, b, aSign );
4965 }
4966 else {
4967 return addFloat128Sigs( a, b, aSign );
4968 }
4969
4970 }
4971
4972 /*
4973 -------------------------------------------------------------------------------
4974 Returns the result of multiplying the quadruple-precision floating-point
4975 values `a' and `b'. The operation is performed according to the IEC/IEEE
4976 Standard for Binary Floating-Point Arithmetic.
4977 -------------------------------------------------------------------------------
4978 */
4979 float128 float128_mul( float128 a, float128 b )
4980 {
4981 flag aSign, bSign, zSign;
4982 int32 aExp, bExp, zExp;
4983 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4984 float128 z;
4985
4986 aSig1 = extractFloat128Frac1( a );
4987 aSig0 = extractFloat128Frac0( a );
4988 aExp = extractFloat128Exp( a );
4989 aSign = extractFloat128Sign( a );
4990 bSig1 = extractFloat128Frac1( b );
4991 bSig0 = extractFloat128Frac0( b );
4992 bExp = extractFloat128Exp( b );
4993 bSign = extractFloat128Sign( b );
4994 zSign = aSign ^ bSign;
4995 if ( aExp == 0x7FFF ) {
4996 if ( ( aSig0 | aSig1 )
4997 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4998 return propagateFloat128NaN( a, b );
4999 }
5000 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5001 return packFloat128( zSign, 0x7FFF, 0, 0 );
5002 }
5003 if ( bExp == 0x7FFF ) {
5004 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5005 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5006 invalid:
5007 float_raise( float_flag_invalid );
5008 z.low = float128_default_nan_low;
5009 z.high = float128_default_nan_high;
5010 return z;
5011 }
5012 return packFloat128( zSign, 0x7FFF, 0, 0 );
5013 }
5014 if ( aExp == 0 ) {
5015 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5016 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5017 }
5018 if ( bExp == 0 ) {
5019 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5020 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5021 }
5022 zExp = aExp + bExp - 0x4000;
5023 aSig0 |= LIT64( 0x0001000000000000 );
5024 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5025 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5026 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5027 zSig2 |= ( zSig3 != 0 );
5028 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5029 shift128ExtraRightJamming(
5030 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5031 ++zExp;
5032 }
5033 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5034
5035 }
5036
5037 /*
5038 -------------------------------------------------------------------------------
5039 Returns the result of dividing the quadruple-precision floating-point value
5040 `a' by the corresponding value `b'. The operation is performed according to
5041 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5042 -------------------------------------------------------------------------------
5043 */
5044 float128 float128_div( float128 a, float128 b )
5045 {
5046 flag aSign, bSign, zSign;
5047 int32 aExp, bExp, zExp;
5048 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5049 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5050 float128 z;
5051
5052 aSig1 = extractFloat128Frac1( a );
5053 aSig0 = extractFloat128Frac0( a );
5054 aExp = extractFloat128Exp( a );
5055 aSign = extractFloat128Sign( a );
5056 bSig1 = extractFloat128Frac1( b );
5057 bSig0 = extractFloat128Frac0( b );
5058 bExp = extractFloat128Exp( b );
5059 bSign = extractFloat128Sign( b );
5060 zSign = aSign ^ bSign;
5061 if ( aExp == 0x7FFF ) {
5062 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
5063 if ( bExp == 0x7FFF ) {
5064 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5065 goto invalid;
5066 }
5067 return packFloat128( zSign, 0x7FFF, 0, 0 );
5068 }
5069 if ( bExp == 0x7FFF ) {
5070 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5071 return packFloat128( zSign, 0, 0, 0 );
5072 }
5073 if ( bExp == 0 ) {
5074 if ( ( bSig0 | bSig1 ) == 0 ) {
5075 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5076 invalid:
5077 float_raise( float_flag_invalid );
5078 z.low = float128_default_nan_low;
5079 z.high = float128_default_nan_high;
5080 return z;
5081 }
5082 float_raise( float_flag_divbyzero );
5083 return packFloat128( zSign, 0x7FFF, 0, 0 );
5084 }
5085 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5086 }
5087 if ( aExp == 0 ) {
5088 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5089 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5090 }
5091 zExp = aExp - bExp + 0x3FFD;
5092 shortShift128Left(
5093 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5094 shortShift128Left(
5095 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5096 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5097 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5098 ++zExp;
5099 }
5100 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5101 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5102 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5103 while ( (sbits64) rem0 < 0 ) {
5104 --zSig0;
5105 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5106 }
5107 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5108 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5109 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5110 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5111 while ( (sbits64) rem1 < 0 ) {
5112 --zSig1;
5113 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5114 }
5115 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5116 }
5117 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5118 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5119
5120 }
5121
5122 /*
5123 -------------------------------------------------------------------------------
5124 Returns the remainder of the quadruple-precision floating-point value `a'
5125 with respect to the corresponding value `b'. The operation is performed
5126 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5127 -------------------------------------------------------------------------------
5128 */
5129 float128 float128_rem( float128 a, float128 b )
5130 {
5131 flag aSign, zSign;
5132 int32 aExp, bExp, expDiff;
5133 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5134 bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5135 sbits64 sigMean0;
5136 float128 z;
5137
5138 aSig1 = extractFloat128Frac1( a );
5139 aSig0 = extractFloat128Frac0( a );
5140 aExp = extractFloat128Exp( a );
5141 aSign = extractFloat128Sign( a );
5142 bSig1 = extractFloat128Frac1( b );
5143 bSig0 = extractFloat128Frac0( b );
5144 bExp = extractFloat128Exp( b );
5145 if ( aExp == 0x7FFF ) {
5146 if ( ( aSig0 | aSig1 )
5147 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5148 return propagateFloat128NaN( a, b );
5149 }
5150 goto invalid;
5151 }
5152 if ( bExp == 0x7FFF ) {
5153 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5154 return a;
5155 }
5156 if ( bExp == 0 ) {
5157 if ( ( bSig0 | bSig1 ) == 0 ) {
5158 invalid:
5159 float_raise( float_flag_invalid );
5160 z.low = float128_default_nan_low;
5161 z.high = float128_default_nan_high;
5162 return z;
5163 }
5164 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5165 }
5166 if ( aExp == 0 ) {
5167 if ( ( aSig0 | aSig1 ) == 0 ) return a;
5168 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5169 }
5170 expDiff = aExp - bExp;
5171 if ( expDiff < -1 ) return a;
5172 shortShift128Left(
5173 aSig0 | LIT64( 0x0001000000000000 ),
5174 aSig1,
5175 15 - ( expDiff < 0 ),
5176 &aSig0,
5177 &aSig1
5178 );
5179 shortShift128Left(
5180 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5181 q = le128( bSig0, bSig1, aSig0, aSig1 );
5182 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5183 expDiff -= 64;
5184 while ( 0 < expDiff ) {
5185 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5186 q = ( 4 < q ) ? q - 4 : 0;
5187 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5188 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5189 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5190 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5191 expDiff -= 61;
5192 }
5193 if ( -64 < expDiff ) {
5194 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5195 q = ( 4 < q ) ? q - 4 : 0;
5196 q >>= - expDiff;
5197 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5198 expDiff += 52;
5199 if ( expDiff < 0 ) {
5200 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5201 }
5202 else {
5203 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5204 }
5205 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5206 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5207 }
5208 else {
5209 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5210 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5211 }
5212 do {
5213 alternateASig0 = aSig0;
5214 alternateASig1 = aSig1;
5215 ++q;
5216 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5217 } while ( 0 <= (sbits64) aSig0 );
5218 add128(
5219 aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5220 if ( ( sigMean0 < 0 )
5221 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5222 aSig0 = alternateASig0;
5223 aSig1 = alternateASig1;
5224 }
5225 zSign = ( (sbits64) aSig0 < 0 );
5226 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5227 return
5228 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5229
5230 }
5231
5232 /*
5233 -------------------------------------------------------------------------------
5234 Returns the square root of the quadruple-precision floating-point value `a'.
5235 The operation is performed according to the IEC/IEEE Standard for Binary
5236 Floating-Point Arithmetic.
5237 -------------------------------------------------------------------------------
5238 */
5239 float128 float128_sqrt( float128 a )
5240 {
5241 flag aSign;
5242 int32 aExp, zExp;
5243 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5244 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5245 float128 z;
5246
5247 aSig1 = extractFloat128Frac1( a );
5248 aSig0 = extractFloat128Frac0( a );
5249 aExp = extractFloat128Exp( a );
5250 aSign = extractFloat128Sign( a );
5251 if ( aExp == 0x7FFF ) {
5252 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5253 if ( ! aSign ) return a;
5254 goto invalid;
5255 }
5256 if ( aSign ) {
5257 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5258 invalid:
5259 float_raise( float_flag_invalid );
5260 z.low = float128_default_nan_low;
5261 z.high = float128_default_nan_high;
5262 return z;
5263 }
5264 if ( aExp == 0 ) {
5265 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5266 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5267 }
5268 zExp = (int32) ( (bits32)(aExp - 0x3FFF) >> 1) + 0x3FFE;
5269 aSig0 |= LIT64( 0x0001000000000000 );
5270 zSig0 = estimateSqrt32((int16)aExp, (bits32)(aSig0>>17));
5271 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5272 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5273 doubleZSig0 = zSig0<<1;
5274 mul64To128( zSig0, zSig0, &term0, &term1 );
5275 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5276 while ( (sbits64) rem0 < 0 ) {
5277 --zSig0;
5278 doubleZSig0 -= 2;
5279 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5280 }
5281 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5282 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5283 if ( zSig1 == 0 ) zSig1 = 1;
5284 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5285 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5286 mul64To128( zSig1, zSig1, &term2, &term3 );
5287 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5288 while ( (sbits64) rem1 < 0 ) {
5289 --zSig1;
5290 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5291 term3 |= 1;
5292 term2 |= doubleZSig0;
5293 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5294 }
5295 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5296 }
5297 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5298 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5299
5300 }
5301
5302 /*
5303 -------------------------------------------------------------------------------
5304 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5305 the corresponding value `b', and 0 otherwise. The comparison is performed
5306 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5307 -------------------------------------------------------------------------------
5308 */
5309 flag float128_eq( float128 a, float128 b )
5310 {
5311
5312 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5313 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5314 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5315 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5316 ) {
5317 if ( float128_is_signaling_nan( a )
5318 || float128_is_signaling_nan( b ) ) {
5319 float_raise( float_flag_invalid );
5320 }
5321 return 0;
5322 }
5323 return
5324 ( a.low == b.low )
5325 && ( ( a.high == b.high )
5326 || ( ( a.low == 0 )
5327 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5328 );
5329
5330 }
5331
5332 /*
5333 -------------------------------------------------------------------------------
5334 Returns 1 if the quadruple-precision floating-point value `a' is less than
5335 or equal to the corresponding value `b', and 0 otherwise. The comparison
5336 is performed according to the IEC/IEEE Standard for Binary Floating-Point
5337 Arithmetic.
5338 -------------------------------------------------------------------------------
5339 */
5340 flag float128_le( float128 a, float128 b )
5341 {
5342 flag aSign, bSign;
5343
5344 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5345 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5346 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5347 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5348 ) {
5349 float_raise( float_flag_invalid );
5350 return 0;
5351 }
5352 aSign = extractFloat128Sign( a );
5353 bSign = extractFloat128Sign( b );
5354 if ( aSign != bSign ) {
5355 return
5356 aSign
5357 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5358 == 0 );
5359 }
5360 return
5361 aSign ? le128( b.high, b.low, a.high, a.low )
5362 : le128( a.high, a.low, b.high, b.low );
5363
5364 }
5365
5366 /*
5367 -------------------------------------------------------------------------------
5368 Returns 1 if the quadruple-precision floating-point value `a' is less than
5369 the corresponding value `b', and 0 otherwise. The comparison is performed
5370 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5371 -------------------------------------------------------------------------------
5372 */
5373 flag float128_lt( float128 a, float128 b )
5374 {
5375 flag aSign, bSign;
5376
5377 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5378 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5379 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5380 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5381 ) {
5382 float_raise( float_flag_invalid );
5383 return 0;
5384 }
5385 aSign = extractFloat128Sign( a );
5386 bSign = extractFloat128Sign( b );
5387 if ( aSign != bSign ) {
5388 return
5389 aSign
5390 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5391 != 0 );
5392 }
5393 return
5394 aSign ? lt128( b.high, b.low, a.high, a.low )
5395 : lt128( a.high, a.low, b.high, b.low );
5396
5397 }
5398
5399 /*
5400 -------------------------------------------------------------------------------
5401 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5402 the corresponding value `b', and 0 otherwise. The invalid exception is
5403 raised if either operand is a NaN. Otherwise, the comparison is performed
5404 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5405 -------------------------------------------------------------------------------
5406 */
5407 flag float128_eq_signaling( float128 a, float128 b )
5408 {
5409
5410 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5411 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5412 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5413 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5414 ) {
5415 float_raise( float_flag_invalid );
5416 return 0;
5417 }
5418 return
5419 ( a.low == b.low )
5420 && ( ( a.high == b.high )
5421 || ( ( a.low == 0 )
5422 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5423 );
5424
5425 }
5426
5427 /*
5428 -------------------------------------------------------------------------------
5429 Returns 1 if the quadruple-precision floating-point value `a' is less than
5430 or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5431 cause an exception. Otherwise, the comparison is performed according to the
5432 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5433 -------------------------------------------------------------------------------
5434 */
5435 flag float128_le_quiet( float128 a, float128 b )
5436 {
5437 flag aSign, bSign;
5438
5439 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5440 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5441 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5442 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5443 ) {
5444 if ( float128_is_signaling_nan( a )
5445 || float128_is_signaling_nan( b ) ) {
5446 float_raise( float_flag_invalid );
5447 }
5448 return 0;
5449 }
5450 aSign = extractFloat128Sign( a );
5451 bSign = extractFloat128Sign( b );
5452 if ( aSign != bSign ) {
5453 return
5454 aSign
5455 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5456 == 0 );
5457 }
5458 return
5459 aSign ? le128( b.high, b.low, a.high, a.low )
5460 : le128( a.high, a.low, b.high, b.low );
5461
5462 }
5463
5464 /*
5465 -------------------------------------------------------------------------------
5466 Returns 1 if the quadruple-precision floating-point value `a' is less than
5467 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5468 exception. Otherwise, the comparison is performed according to the IEC/IEEE
5469 Standard for Binary Floating-Point Arithmetic.
5470 -------------------------------------------------------------------------------
5471 */
5472 flag float128_lt_quiet( float128 a, float128 b )
5473 {
5474 flag aSign, bSign;
5475
5476 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5477 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5478 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5479 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5480 ) {
5481 if ( float128_is_signaling_nan( a )
5482 || float128_is_signaling_nan( b ) ) {
5483 float_raise( float_flag_invalid );
5484 }
5485 return 0;
5486 }
5487 aSign = extractFloat128Sign( a );
5488 bSign = extractFloat128Sign( b );
5489 if ( aSign != bSign ) {
5490 return
5491 aSign
5492 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5493 != 0 );
5494 }
5495 return
5496 aSign ? lt128( b.high, b.low, a.high, a.low )
5497 : lt128( a.high, a.low, b.high, b.low );
5498
5499 }
5500
5501 #endif
5502
5503
5504 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5505
5506 /*
5507 * These two routines are not part of the original softfloat distribution.
5508 *
5509 * They are based on the corresponding conversions to integer but return
5510 * unsigned numbers instead since these functions are required by GCC.
5511 *
5512 * Added by Mark Brinicombe <mark (at) NetBSD.org> 27/09/97
5513 *
5514 * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5515 */
5516
5517 /*
5518 -------------------------------------------------------------------------------
5519 Returns the result of converting the double-precision floating-point value
5520 `a' to the 32-bit unsigned integer format. The conversion is
5521 performed according to the IEC/IEEE Standard for Binary Floating-point
5522 Arithmetic, except that the conversion is always rounded toward zero. If
5523 `a' is a NaN, the largest positive integer is returned. If the conversion
5524 overflows, the largest integer positive is returned.
5525 -------------------------------------------------------------------------------
5526 */
5527 uint32 float64_to_uint32_round_to_zero( float64 a )
5528 {
5529 flag aSign;
5530 int16 aExp, shiftCount;
5531 bits64 aSig, savedASig;
5532 uint32 z;
5533
5534 aSig = extractFloat64Frac( a );
5535 aExp = extractFloat64Exp( a );
5536 aSign = extractFloat64Sign( a );
5537
5538 if (aSign) {
5539 float_raise( float_flag_invalid );
5540 return(0);
5541 }
5542
5543 if ( 0x41E < aExp ) {
5544 float_raise( float_flag_invalid );
5545 return 0xffffffff;
5546 }
5547 else if ( aExp < 0x3FF ) {
5548 if ( aExp || aSig ) set_float_exception_inexact_flag();
5549 return 0;
5550 }
5551 aSig |= LIT64( 0x0010000000000000 );
5552 shiftCount = 0x433 - aExp;
5553 savedASig = aSig;
5554 aSig >>= shiftCount;
5555 z = (uint32)aSig;
5556 if ( ( aSig<<shiftCount ) != savedASig ) {
5557 set_float_exception_inexact_flag();
5558 }
5559 return z;
5560
5561 }
5562
5563 /*
5564 -------------------------------------------------------------------------------
5565 Returns the result of converting the single-precision floating-point value
5566 `a' to the 32-bit unsigned integer format. The conversion is
5567 performed according to the IEC/IEEE Standard for Binary Floating-point
5568 Arithmetic, except that the conversion is always rounded toward zero. If
5569 `a' is a NaN, the largest positive integer is returned. If the conversion
5570 overflows, the largest positive integer is returned.
5571 -------------------------------------------------------------------------------
5572 */
5573 uint32 float32_to_uint32_round_to_zero( float32 a )
5574 {
5575 flag aSign;
5576 int16 aExp, shiftCount;
5577 bits32 aSig;
5578 uint32 z;
5579
5580 aSig = extractFloat32Frac( a );
5581 aExp = extractFloat32Exp( a );
5582 aSign = extractFloat32Sign( a );
5583 shiftCount = aExp - 0x9E;
5584
5585 if (aSign) {
5586 float_raise( float_flag_invalid );
5587 return(0);
5588 }
5589 if ( 0 < shiftCount ) {
5590 float_raise( float_flag_invalid );
5591 return 0xFFFFFFFF;
5592 }
5593 else if ( aExp <= 0x7E ) {
5594 if ( aExp | aSig ) set_float_exception_inexact_flag();
5595 return 0;
5596 }
5597 aSig = ( aSig | 0x800000 )<<8;
5598 z = aSig>>( - shiftCount );
5599 if ( aSig<<( shiftCount & 31 ) ) {
5600 set_float_exception_inexact_flag();
5601 }
5602 return z;
5603
5604 }
5605
5606 #endif
5607