softfloat.c revision 1.8.2.1 1 /* $NetBSD: softfloat.c,v 1.8.2.1 2012/04/17 00:05:23 yamt Exp $ */
2
3 /*
4 * This version hacked for use with gcc -msoft-float by bjh21.
5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides
6 * itself).
7 */
8
9 /*
10 * Things you may want to define:
11 *
12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them
14 * properly renamed.
15 */
16
17 /*
18 ===============================================================================
19
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43
44 ===============================================================================
45 */
46
47 #include <sys/cdefs.h>
48 #if defined(LIBC_SCCS) && !defined(lint)
49 __RCSID("$NetBSD: softfloat.c,v 1.8.2.1 2012/04/17 00:05:23 yamt Exp $");
50 #endif /* LIBC_SCCS and not lint */
51
52 #ifdef SOFTFLOAT_FOR_GCC
53 #include "softfloat-for-gcc.h"
54 #endif
55
56 #include "milieu.h"
57 #include "softfloat.h"
58
59 /*
60 * Conversions between floats as stored in memory and floats as
61 * SoftFloat uses them
62 */
63 #ifndef FLOAT64_DEMANGLE
64 #define FLOAT64_DEMANGLE(a) (a)
65 #endif
66 #ifndef FLOAT64_MANGLE
67 #define FLOAT64_MANGLE(a) (a)
68 #endif
69
70 /*
71 -------------------------------------------------------------------------------
72 Floating-point rounding mode, extended double-precision rounding precision,
73 and exception flags.
74 -------------------------------------------------------------------------------
75 */
76 fp_rnd float_rounding_mode = float_round_nearest_even;
77 fp_except float_exception_flags = 0;
78 #ifdef FLOATX80
79 int8 floatx80_rounding_precision = 80;
80 #endif
81
82 /*
83 -------------------------------------------------------------------------------
84 Primitive arithmetic functions, including multi-word arithmetic, and
85 division and square root approximations. (Can be specialized to target if
86 desired.)
87 -------------------------------------------------------------------------------
88 */
89 #include "softfloat-macros"
90
91 /*
92 -------------------------------------------------------------------------------
93 Functions and definitions to determine: (1) whether tininess for underflow
94 is detected before or after rounding by default, (2) what (if anything)
95 happens when exceptions are raised, (3) how signaling NaNs are distinguished
96 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
97 are propagated from function inputs to output. These details are target-
98 specific.
99 -------------------------------------------------------------------------------
100 */
101 #include "softfloat-specialize"
102
103 #if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128)
104 /*
105 -------------------------------------------------------------------------------
106 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
107 and 7, and returns the properly rounded 32-bit integer corresponding to the
108 input. If `zSign' is 1, the input is negated before being converted to an
109 integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
110 is simply rounded to an integer, with the inexact exception raised if the
111 input cannot be represented exactly as an integer. However, if the fixed-
112 point input is too large, the invalid exception is raised and the largest
113 positive or negative integer is returned.
114 -------------------------------------------------------------------------------
115 */
116 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
117 {
118 int8 roundingMode;
119 flag roundNearestEven;
120 int8 roundIncrement, roundBits;
121 int32 z;
122
123 roundingMode = float_rounding_mode;
124 roundNearestEven = ( roundingMode == float_round_nearest_even );
125 roundIncrement = 0x40;
126 if ( ! roundNearestEven ) {
127 if ( roundingMode == float_round_to_zero ) {
128 roundIncrement = 0;
129 }
130 else {
131 roundIncrement = 0x7F;
132 if ( zSign ) {
133 if ( roundingMode == float_round_up ) roundIncrement = 0;
134 }
135 else {
136 if ( roundingMode == float_round_down ) roundIncrement = 0;
137 }
138 }
139 }
140 roundBits = (int8)(absZ & 0x7F);
141 absZ = ( absZ + roundIncrement )>>7;
142 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
143 z = (int32)absZ;
144 if ( zSign ) z = - z;
145 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
146 float_raise( float_flag_invalid );
147 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
148 }
149 if ( roundBits ) float_exception_flags |= float_flag_inexact;
150 return z;
151
152 }
153
154 /*
155 -------------------------------------------------------------------------------
156 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
157 `absZ1', with binary point between bits 63 and 64 (between the input words),
158 and returns the properly rounded 64-bit integer corresponding to the input.
159 If `zSign' is 1, the input is negated before being converted to an integer.
160 Ordinarily, the fixed-point input is simply rounded to an integer, with
161 the inexact exception raised if the input cannot be represented exactly as
162 an integer. However, if the fixed-point input is too large, the invalid
163 exception is raised and the largest positive or negative integer is
164 returned.
165 -------------------------------------------------------------------------------
166 */
167 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
168 {
169 int8 roundingMode;
170 flag roundNearestEven, increment;
171 int64 z;
172
173 roundingMode = float_rounding_mode;
174 roundNearestEven = ( roundingMode == float_round_nearest_even );
175 increment = ( (sbits64) absZ1 < 0 );
176 if ( ! roundNearestEven ) {
177 if ( roundingMode == float_round_to_zero ) {
178 increment = 0;
179 }
180 else {
181 if ( zSign ) {
182 increment = ( roundingMode == float_round_down ) && absZ1;
183 }
184 else {
185 increment = ( roundingMode == float_round_up ) && absZ1;
186 }
187 }
188 }
189 if ( increment ) {
190 ++absZ0;
191 if ( absZ0 == 0 ) goto overflow;
192 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
193 }
194 z = absZ0;
195 if ( zSign ) z = - z;
196 if ( z && ( ( z < 0 ) ^ zSign ) ) {
197 overflow:
198 float_raise( float_flag_invalid );
199 return
200 zSign ? (sbits64) LIT64( 0x8000000000000000 )
201 : LIT64( 0x7FFFFFFFFFFFFFFF );
202 }
203 if ( absZ1 ) float_exception_flags |= float_flag_inexact;
204 return z;
205
206 }
207 #endif
208
209 /*
210 -------------------------------------------------------------------------------
211 Returns the fraction bits of the single-precision floating-point value `a'.
212 -------------------------------------------------------------------------------
213 */
214 INLINE bits32 extractFloat32Frac( float32 a )
215 {
216
217 return a & 0x007FFFFF;
218
219 }
220
221 /*
222 -------------------------------------------------------------------------------
223 Returns the exponent bits of the single-precision floating-point value `a'.
224 -------------------------------------------------------------------------------
225 */
226 INLINE int16 extractFloat32Exp( float32 a )
227 {
228
229 return ( a>>23 ) & 0xFF;
230
231 }
232
233 /*
234 -------------------------------------------------------------------------------
235 Returns the sign bit of the single-precision floating-point value `a'.
236 -------------------------------------------------------------------------------
237 */
238 INLINE flag extractFloat32Sign( float32 a )
239 {
240
241 return a>>31;
242
243 }
244
245 /*
246 -------------------------------------------------------------------------------
247 Normalizes the subnormal single-precision floating-point value represented
248 by the denormalized significand `aSig'. The normalized exponent and
249 significand are stored at the locations pointed to by `zExpPtr' and
250 `zSigPtr', respectively.
251 -------------------------------------------------------------------------------
252 */
253 static void
254 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
255 {
256 int8 shiftCount;
257
258 shiftCount = countLeadingZeros32( aSig ) - 8;
259 *zSigPtr = aSig<<shiftCount;
260 *zExpPtr = 1 - shiftCount;
261
262 }
263
264 /*
265 -------------------------------------------------------------------------------
266 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
267 single-precision floating-point value, returning the result. After being
268 shifted into the proper positions, the three fields are simply added
269 together to form the result. This means that any integer portion of `zSig'
270 will be added into the exponent. Since a properly normalized significand
271 will have an integer portion equal to 1, the `zExp' input should be 1 less
272 than the desired result exponent whenever `zSig' is a complete, normalized
273 significand.
274 -------------------------------------------------------------------------------
275 */
276 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
277 {
278
279 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
280
281 }
282
283 /*
284 -------------------------------------------------------------------------------
285 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
286 and significand `zSig', and returns the proper single-precision floating-
287 point value corresponding to the abstract input. Ordinarily, the abstract
288 value is simply rounded and packed into the single-precision format, with
289 the inexact exception raised if the abstract input cannot be represented
290 exactly. However, if the abstract value is too large, the overflow and
291 inexact exceptions are raised and an infinity or maximal finite value is
292 returned. If the abstract value is too small, the input value is rounded to
293 a subnormal number, and the underflow and inexact exceptions are raised if
294 the abstract input cannot be represented exactly as a subnormal single-
295 precision floating-point number.
296 The input significand `zSig' has its binary point between bits 30
297 and 29, which is 7 bits to the left of the usual location. This shifted
298 significand must be normalized or smaller. If `zSig' is not normalized,
299 `zExp' must be 0; in that case, the result returned is a subnormal number,
300 and it must not require rounding. In the usual case that `zSig' is
301 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
302 The handling of underflow and overflow follows the IEC/IEEE Standard for
303 Binary Floating-Point Arithmetic.
304 -------------------------------------------------------------------------------
305 */
306 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
307 {
308 int8 roundingMode;
309 flag roundNearestEven;
310 int8 roundIncrement, roundBits;
311 flag isTiny;
312
313 roundingMode = float_rounding_mode;
314 roundNearestEven = ( roundingMode == float_round_nearest_even );
315 roundIncrement = 0x40;
316 if ( ! roundNearestEven ) {
317 if ( roundingMode == float_round_to_zero ) {
318 roundIncrement = 0;
319 }
320 else {
321 roundIncrement = 0x7F;
322 if ( zSign ) {
323 if ( roundingMode == float_round_up ) roundIncrement = 0;
324 }
325 else {
326 if ( roundingMode == float_round_down ) roundIncrement = 0;
327 }
328 }
329 }
330 roundBits = zSig & 0x7F;
331 if ( 0xFD <= (bits16) zExp ) {
332 if ( ( 0xFD < zExp )
333 || ( ( zExp == 0xFD )
334 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
335 ) {
336 float_raise( float_flag_overflow | float_flag_inexact );
337 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
338 }
339 if ( zExp < 0 ) {
340 isTiny =
341 ( float_detect_tininess == float_tininess_before_rounding )
342 || ( zExp < -1 )
343 || ( zSig + roundIncrement < 0x80000000U );
344 shift32RightJamming( zSig, - zExp, &zSig );
345 zExp = 0;
346 roundBits = zSig & 0x7F;
347 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
348 }
349 }
350 if ( roundBits ) float_exception_flags |= float_flag_inexact;
351 zSig = ( zSig + roundIncrement )>>7;
352 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
353 if ( zSig == 0 ) zExp = 0;
354 return packFloat32( zSign, zExp, zSig );
355
356 }
357
358 /*
359 -------------------------------------------------------------------------------
360 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
361 and significand `zSig', and returns the proper single-precision floating-
362 point value corresponding to the abstract input. This routine is just like
363 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
364 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
365 floating-point exponent.
366 -------------------------------------------------------------------------------
367 */
368 static float32
369 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
370 {
371 int8 shiftCount;
372
373 shiftCount = countLeadingZeros32( zSig ) - 1;
374 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
375
376 }
377
378 /*
379 -------------------------------------------------------------------------------
380 Returns the fraction bits of the double-precision floating-point value `a'.
381 -------------------------------------------------------------------------------
382 */
383 INLINE bits64 extractFloat64Frac( float64 a )
384 {
385
386 return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
387
388 }
389
390 /*
391 -------------------------------------------------------------------------------
392 Returns the exponent bits of the double-precision floating-point value `a'.
393 -------------------------------------------------------------------------------
394 */
395 INLINE int16 extractFloat64Exp( float64 a )
396 {
397
398 return (int16)((FLOAT64_DEMANGLE(a) >> 52) & 0x7FF);
399
400 }
401
402 /*
403 -------------------------------------------------------------------------------
404 Returns the sign bit of the double-precision floating-point value `a'.
405 -------------------------------------------------------------------------------
406 */
407 INLINE flag extractFloat64Sign( float64 a )
408 {
409
410 return (flag)(FLOAT64_DEMANGLE(a) >> 63);
411
412 }
413
414 /*
415 -------------------------------------------------------------------------------
416 Normalizes the subnormal double-precision floating-point value represented
417 by the denormalized significand `aSig'. The normalized exponent and
418 significand are stored at the locations pointed to by `zExpPtr' and
419 `zSigPtr', respectively.
420 -------------------------------------------------------------------------------
421 */
422 static void
423 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
424 {
425 int8 shiftCount;
426
427 shiftCount = countLeadingZeros64( aSig ) - 11;
428 *zSigPtr = aSig<<shiftCount;
429 *zExpPtr = 1 - shiftCount;
430
431 }
432
433 /*
434 -------------------------------------------------------------------------------
435 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
436 double-precision floating-point value, returning the result. After being
437 shifted into the proper positions, the three fields are simply added
438 together to form the result. This means that any integer portion of `zSig'
439 will be added into the exponent. Since a properly normalized significand
440 will have an integer portion equal to 1, the `zExp' input should be 1 less
441 than the desired result exponent whenever `zSig' is a complete, normalized
442 significand.
443 -------------------------------------------------------------------------------
444 */
445 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
446 {
447
448 return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
449 ( ( (bits64) zExp )<<52 ) + zSig );
450
451 }
452
453 /*
454 -------------------------------------------------------------------------------
455 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
456 and significand `zSig', and returns the proper double-precision floating-
457 point value corresponding to the abstract input. Ordinarily, the abstract
458 value is simply rounded and packed into the double-precision format, with
459 the inexact exception raised if the abstract input cannot be represented
460 exactly. However, if the abstract value is too large, the overflow and
461 inexact exceptions are raised and an infinity or maximal finite value is
462 returned. If the abstract value is too small, the input value is rounded to
463 a subnormal number, and the underflow and inexact exceptions are raised if
464 the abstract input cannot be represented exactly as a subnormal double-
465 precision floating-point number.
466 The input significand `zSig' has its binary point between bits 62
467 and 61, which is 10 bits to the left of the usual location. This shifted
468 significand must be normalized or smaller. If `zSig' is not normalized,
469 `zExp' must be 0; in that case, the result returned is a subnormal number,
470 and it must not require rounding. In the usual case that `zSig' is
471 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
472 The handling of underflow and overflow follows the IEC/IEEE Standard for
473 Binary Floating-Point Arithmetic.
474 -------------------------------------------------------------------------------
475 */
476 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
477 {
478 int8 roundingMode;
479 flag roundNearestEven;
480 int16 roundIncrement, roundBits;
481 flag isTiny;
482
483 roundingMode = float_rounding_mode;
484 roundNearestEven = ( roundingMode == float_round_nearest_even );
485 roundIncrement = 0x200;
486 if ( ! roundNearestEven ) {
487 if ( roundingMode == float_round_to_zero ) {
488 roundIncrement = 0;
489 }
490 else {
491 roundIncrement = 0x3FF;
492 if ( zSign ) {
493 if ( roundingMode == float_round_up ) roundIncrement = 0;
494 }
495 else {
496 if ( roundingMode == float_round_down ) roundIncrement = 0;
497 }
498 }
499 }
500 roundBits = (int16)(zSig & 0x3FF);
501 if ( 0x7FD <= (bits16) zExp ) {
502 if ( ( 0x7FD < zExp )
503 || ( ( zExp == 0x7FD )
504 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
505 ) {
506 float_raise( float_flag_overflow | float_flag_inexact );
507 return FLOAT64_MANGLE(
508 FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
509 ( roundIncrement == 0 ));
510 }
511 if ( zExp < 0 ) {
512 isTiny =
513 ( float_detect_tininess == float_tininess_before_rounding )
514 || ( zExp < -1 )
515 || ( zSig + roundIncrement < (bits64)LIT64( 0x8000000000000000 ) );
516 shift64RightJamming( zSig, - zExp, &zSig );
517 zExp = 0;
518 roundBits = (int16)(zSig & 0x3FF);
519 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
520 }
521 }
522 if ( roundBits ) float_exception_flags |= float_flag_inexact;
523 zSig = ( zSig + roundIncrement )>>10;
524 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
525 if ( zSig == 0 ) zExp = 0;
526 return packFloat64( zSign, zExp, zSig );
527
528 }
529
530 /*
531 -------------------------------------------------------------------------------
532 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
533 and significand `zSig', and returns the proper double-precision floating-
534 point value corresponding to the abstract input. This routine is just like
535 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
536 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
537 floating-point exponent.
538 -------------------------------------------------------------------------------
539 */
540 static float64
541 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
542 {
543 int8 shiftCount;
544
545 shiftCount = countLeadingZeros64( zSig ) - 1;
546 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
547
548 }
549
550 #ifdef FLOATX80
551
552 /*
553 -------------------------------------------------------------------------------
554 Returns the fraction bits of the extended double-precision floating-point
555 value `a'.
556 -------------------------------------------------------------------------------
557 */
558 INLINE bits64 extractFloatx80Frac( floatx80 a )
559 {
560
561 return a.low;
562
563 }
564
565 /*
566 -------------------------------------------------------------------------------
567 Returns the exponent bits of the extended double-precision floating-point
568 value `a'.
569 -------------------------------------------------------------------------------
570 */
571 INLINE int32 extractFloatx80Exp( floatx80 a )
572 {
573
574 return a.high & 0x7FFF;
575
576 }
577
578 /*
579 -------------------------------------------------------------------------------
580 Returns the sign bit of the extended double-precision floating-point value
581 `a'.
582 -------------------------------------------------------------------------------
583 */
584 INLINE flag extractFloatx80Sign( floatx80 a )
585 {
586
587 return a.high>>15;
588
589 }
590
591 /*
592 -------------------------------------------------------------------------------
593 Normalizes the subnormal extended double-precision floating-point value
594 represented by the denormalized significand `aSig'. The normalized exponent
595 and significand are stored at the locations pointed to by `zExpPtr' and
596 `zSigPtr', respectively.
597 -------------------------------------------------------------------------------
598 */
599 static void
600 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
601 {
602 int8 shiftCount;
603
604 shiftCount = countLeadingZeros64( aSig );
605 *zSigPtr = aSig<<shiftCount;
606 *zExpPtr = 1 - shiftCount;
607
608 }
609
610 /*
611 -------------------------------------------------------------------------------
612 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
613 extended double-precision floating-point value, returning the result.
614 -------------------------------------------------------------------------------
615 */
616 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
617 {
618 floatx80 z;
619
620 z.low = zSig;
621 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
622 return z;
623
624 }
625
626 /*
627 -------------------------------------------------------------------------------
628 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
629 and extended significand formed by the concatenation of `zSig0' and `zSig1',
630 and returns the proper extended double-precision floating-point value
631 corresponding to the abstract input. Ordinarily, the abstract value is
632 rounded and packed into the extended double-precision format, with the
633 inexact exception raised if the abstract input cannot be represented
634 exactly. However, if the abstract value is too large, the overflow and
635 inexact exceptions are raised and an infinity or maximal finite value is
636 returned. If the abstract value is too small, the input value is rounded to
637 a subnormal number, and the underflow and inexact exceptions are raised if
638 the abstract input cannot be represented exactly as a subnormal extended
639 double-precision floating-point number.
640 If `roundingPrecision' is 32 or 64, the result is rounded to the same
641 number of bits as single or double precision, respectively. Otherwise, the
642 result is rounded to the full precision of the extended double-precision
643 format.
644 The input significand must be normalized or smaller. If the input
645 significand is not normalized, `zExp' must be 0; in that case, the result
646 returned is a subnormal number, and it must not require rounding. The
647 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
648 Floating-Point Arithmetic.
649 -------------------------------------------------------------------------------
650 */
651 static floatx80
652 roundAndPackFloatx80(
653 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
654 )
655 {
656 int8 roundingMode;
657 flag roundNearestEven, increment, isTiny;
658 int64 roundIncrement, roundMask, roundBits;
659
660 roundingMode = float_rounding_mode;
661 roundNearestEven = ( roundingMode == float_round_nearest_even );
662 if ( roundingPrecision == 80 ) goto precision80;
663 if ( roundingPrecision == 64 ) {
664 roundIncrement = LIT64( 0x0000000000000400 );
665 roundMask = LIT64( 0x00000000000007FF );
666 }
667 else if ( roundingPrecision == 32 ) {
668 roundIncrement = LIT64( 0x0000008000000000 );
669 roundMask = LIT64( 0x000000FFFFFFFFFF );
670 }
671 else {
672 goto precision80;
673 }
674 zSig0 |= ( zSig1 != 0 );
675 if ( ! roundNearestEven ) {
676 if ( roundingMode == float_round_to_zero ) {
677 roundIncrement = 0;
678 }
679 else {
680 roundIncrement = roundMask;
681 if ( zSign ) {
682 if ( roundingMode == float_round_up ) roundIncrement = 0;
683 }
684 else {
685 if ( roundingMode == float_round_down ) roundIncrement = 0;
686 }
687 }
688 }
689 roundBits = zSig0 & roundMask;
690 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
691 if ( ( 0x7FFE < zExp )
692 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
693 ) {
694 goto overflow;
695 }
696 if ( zExp <= 0 ) {
697 isTiny =
698 ( float_detect_tininess == float_tininess_before_rounding )
699 || ( zExp < 0 )
700 || ( zSig0 <= zSig0 + roundIncrement );
701 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
702 zExp = 0;
703 roundBits = zSig0 & roundMask;
704 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
705 if ( roundBits ) float_exception_flags |= float_flag_inexact;
706 zSig0 += roundIncrement;
707 if ( (sbits64) zSig0 < 0 ) zExp = 1;
708 roundIncrement = roundMask + 1;
709 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
710 roundMask |= roundIncrement;
711 }
712 zSig0 &= ~ roundMask;
713 return packFloatx80( zSign, zExp, zSig0 );
714 }
715 }
716 if ( roundBits ) float_exception_flags |= float_flag_inexact;
717 zSig0 += roundIncrement;
718 if ( zSig0 < roundIncrement ) {
719 ++zExp;
720 zSig0 = LIT64( 0x8000000000000000 );
721 }
722 roundIncrement = roundMask + 1;
723 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
724 roundMask |= roundIncrement;
725 }
726 zSig0 &= ~ roundMask;
727 if ( zSig0 == 0 ) zExp = 0;
728 return packFloatx80( zSign, zExp, zSig0 );
729 precision80:
730 increment = ( (sbits64) zSig1 < 0 );
731 if ( ! roundNearestEven ) {
732 if ( roundingMode == float_round_to_zero ) {
733 increment = 0;
734 }
735 else {
736 if ( zSign ) {
737 increment = ( roundingMode == float_round_down ) && zSig1;
738 }
739 else {
740 increment = ( roundingMode == float_round_up ) && zSig1;
741 }
742 }
743 }
744 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
745 if ( ( 0x7FFE < zExp )
746 || ( ( zExp == 0x7FFE )
747 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
748 && increment
749 )
750 ) {
751 roundMask = 0;
752 overflow:
753 float_raise( float_flag_overflow | float_flag_inexact );
754 if ( ( roundingMode == float_round_to_zero )
755 || ( zSign && ( roundingMode == float_round_up ) )
756 || ( ! zSign && ( roundingMode == float_round_down ) )
757 ) {
758 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
759 }
760 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
761 }
762 if ( zExp <= 0 ) {
763 isTiny =
764 ( float_detect_tininess == float_tininess_before_rounding )
765 || ( zExp < 0 )
766 || ! increment
767 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
768 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
769 zExp = 0;
770 if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
771 if ( zSig1 ) float_exception_flags |= float_flag_inexact;
772 if ( roundNearestEven ) {
773 increment = ( (sbits64) zSig1 < 0 );
774 }
775 else {
776 if ( zSign ) {
777 increment = ( roundingMode == float_round_down ) && zSig1;
778 }
779 else {
780 increment = ( roundingMode == float_round_up ) && zSig1;
781 }
782 }
783 if ( increment ) {
784 ++zSig0;
785 zSig0 &=
786 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
787 if ( (sbits64) zSig0 < 0 ) zExp = 1;
788 }
789 return packFloatx80( zSign, zExp, zSig0 );
790 }
791 }
792 if ( zSig1 ) float_exception_flags |= float_flag_inexact;
793 if ( increment ) {
794 ++zSig0;
795 if ( zSig0 == 0 ) {
796 ++zExp;
797 zSig0 = LIT64( 0x8000000000000000 );
798 }
799 else {
800 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
801 }
802 }
803 else {
804 if ( zSig0 == 0 ) zExp = 0;
805 }
806 return packFloatx80( zSign, zExp, zSig0 );
807
808 }
809
810 /*
811 -------------------------------------------------------------------------------
812 Takes an abstract floating-point value having sign `zSign', exponent
813 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
814 and returns the proper extended double-precision floating-point value
815 corresponding to the abstract input. This routine is just like
816 `roundAndPackFloatx80' except that the input significand does not have to be
817 normalized.
818 -------------------------------------------------------------------------------
819 */
820 static floatx80
821 normalizeRoundAndPackFloatx80(
822 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
823 )
824 {
825 int8 shiftCount;
826
827 if ( zSig0 == 0 ) {
828 zSig0 = zSig1;
829 zSig1 = 0;
830 zExp -= 64;
831 }
832 shiftCount = countLeadingZeros64( zSig0 );
833 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
834 zExp -= shiftCount;
835 return
836 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
837
838 }
839
840 #endif
841
842 #ifdef FLOAT128
843
844 /*
845 -------------------------------------------------------------------------------
846 Returns the least-significant 64 fraction bits of the quadruple-precision
847 floating-point value `a'.
848 -------------------------------------------------------------------------------
849 */
850 INLINE bits64 extractFloat128Frac1( float128 a )
851 {
852
853 return a.low;
854
855 }
856
857 /*
858 -------------------------------------------------------------------------------
859 Returns the most-significant 48 fraction bits of the quadruple-precision
860 floating-point value `a'.
861 -------------------------------------------------------------------------------
862 */
863 INLINE bits64 extractFloat128Frac0( float128 a )
864 {
865
866 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
867
868 }
869
870 /*
871 -------------------------------------------------------------------------------
872 Returns the exponent bits of the quadruple-precision floating-point value
873 `a'.
874 -------------------------------------------------------------------------------
875 */
876 INLINE int32 extractFloat128Exp( float128 a )
877 {
878
879 return (int32)((a.high >> 48) & 0x7FFF);
880
881 }
882
883 /*
884 -------------------------------------------------------------------------------
885 Returns the sign bit of the quadruple-precision floating-point value `a'.
886 -------------------------------------------------------------------------------
887 */
888 INLINE flag extractFloat128Sign( float128 a )
889 {
890
891 return (flag)(a.high >> 63);
892
893 }
894
895 /*
896 -------------------------------------------------------------------------------
897 Normalizes the subnormal quadruple-precision floating-point value
898 represented by the denormalized significand formed by the concatenation of
899 `aSig0' and `aSig1'. The normalized exponent is stored at the location
900 pointed to by `zExpPtr'. The most significant 49 bits of the normalized
901 significand are stored at the location pointed to by `zSig0Ptr', and the
902 least significant 64 bits of the normalized significand are stored at the
903 location pointed to by `zSig1Ptr'.
904 -------------------------------------------------------------------------------
905 */
906 static void
907 normalizeFloat128Subnormal(
908 bits64 aSig0,
909 bits64 aSig1,
910 int32 *zExpPtr,
911 bits64 *zSig0Ptr,
912 bits64 *zSig1Ptr
913 )
914 {
915 int8 shiftCount;
916
917 if ( aSig0 == 0 ) {
918 shiftCount = countLeadingZeros64( aSig1 ) - 15;
919 if ( shiftCount < 0 ) {
920 *zSig0Ptr = aSig1>>( - shiftCount );
921 *zSig1Ptr = aSig1<<( shiftCount & 63 );
922 }
923 else {
924 *zSig0Ptr = aSig1<<shiftCount;
925 *zSig1Ptr = 0;
926 }
927 *zExpPtr = - shiftCount - 63;
928 }
929 else {
930 shiftCount = countLeadingZeros64( aSig0 ) - 15;
931 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
932 *zExpPtr = 1 - shiftCount;
933 }
934
935 }
936
937 /*
938 -------------------------------------------------------------------------------
939 Packs the sign `zSign', the exponent `zExp', and the significand formed
940 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
941 floating-point value, returning the result. After being shifted into the
942 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
943 added together to form the most significant 32 bits of the result. This
944 means that any integer portion of `zSig0' will be added into the exponent.
945 Since a properly normalized significand will have an integer portion equal
946 to 1, the `zExp' input should be 1 less than the desired result exponent
947 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
948 significand.
949 -------------------------------------------------------------------------------
950 */
951 INLINE float128
952 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
953 {
954 float128 z;
955
956 z.low = zSig1;
957 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
958 return z;
959
960 }
961
962 /*
963 -------------------------------------------------------------------------------
964 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
965 and extended significand formed by the concatenation of `zSig0', `zSig1',
966 and `zSig2', and returns the proper quadruple-precision floating-point value
967 corresponding to the abstract input. Ordinarily, the abstract value is
968 simply rounded and packed into the quadruple-precision format, with the
969 inexact exception raised if the abstract input cannot be represented
970 exactly. However, if the abstract value is too large, the overflow and
971 inexact exceptions are raised and an infinity or maximal finite value is
972 returned. If the abstract value is too small, the input value is rounded to
973 a subnormal number, and the underflow and inexact exceptions are raised if
974 the abstract input cannot be represented exactly as a subnormal quadruple-
975 precision floating-point number.
976 The input significand must be normalized or smaller. If the input
977 significand is not normalized, `zExp' must be 0; in that case, the result
978 returned is a subnormal number, and it must not require rounding. In the
979 usual case that the input significand is normalized, `zExp' must be 1 less
980 than the ``true'' floating-point exponent. The handling of underflow and
981 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
982 -------------------------------------------------------------------------------
983 */
984 static float128
985 roundAndPackFloat128(
986 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
987 {
988 int8 roundingMode;
989 flag roundNearestEven, increment, isTiny;
990
991 roundingMode = float_rounding_mode;
992 roundNearestEven = ( roundingMode == float_round_nearest_even );
993 increment = ( (sbits64) zSig2 < 0 );
994 if ( ! roundNearestEven ) {
995 if ( roundingMode == float_round_to_zero ) {
996 increment = 0;
997 }
998 else {
999 if ( zSign ) {
1000 increment = ( roundingMode == float_round_down ) && zSig2;
1001 }
1002 else {
1003 increment = ( roundingMode == float_round_up ) && zSig2;
1004 }
1005 }
1006 }
1007 if ( 0x7FFD <= (bits32) zExp ) {
1008 if ( ( 0x7FFD < zExp )
1009 || ( ( zExp == 0x7FFD )
1010 && eq128(
1011 LIT64( 0x0001FFFFFFFFFFFF ),
1012 LIT64( 0xFFFFFFFFFFFFFFFF ),
1013 zSig0,
1014 zSig1
1015 )
1016 && increment
1017 )
1018 ) {
1019 float_raise( float_flag_overflow | float_flag_inexact );
1020 if ( ( roundingMode == float_round_to_zero )
1021 || ( zSign && ( roundingMode == float_round_up ) )
1022 || ( ! zSign && ( roundingMode == float_round_down ) )
1023 ) {
1024 return
1025 packFloat128(
1026 zSign,
1027 0x7FFE,
1028 LIT64( 0x0000FFFFFFFFFFFF ),
1029 LIT64( 0xFFFFFFFFFFFFFFFF )
1030 );
1031 }
1032 return packFloat128( zSign, 0x7FFF, 0, 0 );
1033 }
1034 if ( zExp < 0 ) {
1035 isTiny =
1036 ( float_detect_tininess == float_tininess_before_rounding )
1037 || ( zExp < -1 )
1038 || ! increment
1039 || lt128(
1040 zSig0,
1041 zSig1,
1042 LIT64( 0x0001FFFFFFFFFFFF ),
1043 LIT64( 0xFFFFFFFFFFFFFFFF )
1044 );
1045 shift128ExtraRightJamming(
1046 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1047 zExp = 0;
1048 if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1049 if ( roundNearestEven ) {
1050 increment = ( (sbits64) zSig2 < 0 );
1051 }
1052 else {
1053 if ( zSign ) {
1054 increment = ( roundingMode == float_round_down ) && zSig2;
1055 }
1056 else {
1057 increment = ( roundingMode == float_round_up ) && zSig2;
1058 }
1059 }
1060 }
1061 }
1062 if ( zSig2 ) float_exception_flags |= float_flag_inexact;
1063 if ( increment ) {
1064 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1065 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1066 }
1067 else {
1068 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1069 }
1070 return packFloat128( zSign, zExp, zSig0, zSig1 );
1071
1072 }
1073
1074 /*
1075 -------------------------------------------------------------------------------
1076 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1077 and significand formed by the concatenation of `zSig0' and `zSig1', and
1078 returns the proper quadruple-precision floating-point value corresponding
1079 to the abstract input. This routine is just like `roundAndPackFloat128'
1080 except that the input significand has fewer bits and does not have to be
1081 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1082 point exponent.
1083 -------------------------------------------------------------------------------
1084 */
1085 static float128
1086 normalizeRoundAndPackFloat128(
1087 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1088 {
1089 int8 shiftCount;
1090 bits64 zSig2;
1091
1092 if ( zSig0 == 0 ) {
1093 zSig0 = zSig1;
1094 zSig1 = 0;
1095 zExp -= 64;
1096 }
1097 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1098 if ( 0 <= shiftCount ) {
1099 zSig2 = 0;
1100 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1101 }
1102 else {
1103 shift128ExtraRightJamming(
1104 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1105 }
1106 zExp -= shiftCount;
1107 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1108
1109 }
1110
1111 #endif
1112
1113 /*
1114 -------------------------------------------------------------------------------
1115 Returns the result of converting the 32-bit two's complement integer `a'
1116 to the single-precision floating-point format. The conversion is performed
1117 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1118 -------------------------------------------------------------------------------
1119 */
1120 float32 int32_to_float32( int32 a )
1121 {
1122 flag zSign;
1123
1124 if ( a == 0 ) return 0;
1125 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1126 zSign = ( a < 0 );
1127 return normalizeRoundAndPackFloat32(zSign, 0x9C, (uint32)(zSign ? - a : a));
1128
1129 }
1130
1131 float32 uint32_to_float32( uint32 a )
1132 {
1133 if ( a == 0 ) return 0;
1134 if ( a & (bits32) 0x80000000 )
1135 return normalizeRoundAndPackFloat32( 0, 0x9D, a >> 1 );
1136 return normalizeRoundAndPackFloat32( 0, 0x9C, a );
1137 }
1138
1139
1140 /*
1141 -------------------------------------------------------------------------------
1142 Returns the result of converting the 32-bit two's complement integer `a'
1143 to the double-precision floating-point format. The conversion is performed
1144 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1145 -------------------------------------------------------------------------------
1146 */
1147 float64 int32_to_float64( int32 a )
1148 {
1149 flag zSign;
1150 uint32 absA;
1151 int8 shiftCount;
1152 bits64 zSig;
1153
1154 if ( a == 0 ) return 0;
1155 zSign = ( a < 0 );
1156 absA = zSign ? - a : a;
1157 shiftCount = countLeadingZeros32( absA ) + 21;
1158 zSig = absA;
1159 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1160
1161 }
1162
1163 float64 uint32_to_float64( uint32 a )
1164 {
1165 int8 shiftCount;
1166 bits64 zSig = a;
1167
1168 if ( a == 0 ) return 0;
1169 shiftCount = countLeadingZeros32( a ) + 21;
1170 return packFloat64( 0, 0x432 - shiftCount, zSig<<shiftCount );
1171
1172 }
1173
1174 #ifdef FLOATX80
1175
1176 /*
1177 -------------------------------------------------------------------------------
1178 Returns the result of converting the 32-bit two's complement integer `a'
1179 to the extended double-precision floating-point format. The conversion
1180 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1181 Arithmetic.
1182 -------------------------------------------------------------------------------
1183 */
1184 floatx80 int32_to_floatx80( int32 a )
1185 {
1186 flag zSign;
1187 uint32 absA;
1188 int8 shiftCount;
1189 bits64 zSig;
1190
1191 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1192 zSign = ( a < 0 );
1193 absA = zSign ? - a : a;
1194 shiftCount = countLeadingZeros32( absA ) + 32;
1195 zSig = absA;
1196 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1197
1198 }
1199
1200 floatx80 uint32_to_floatx80( uint32 a )
1201 {
1202 int8 shiftCount;
1203 bits64 zSig = a;
1204
1205 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1206 shiftCount = countLeadingZeros32( a ) + 32;
1207 return packFloatx80( 0, 0x403E - shiftCount, zSig<<shiftCount );
1208
1209 }
1210
1211 #endif
1212
1213 #ifdef FLOAT128
1214
1215 /*
1216 -------------------------------------------------------------------------------
1217 Returns the result of converting the 32-bit two's complement integer `a' to
1218 the quadruple-precision floating-point format. The conversion is performed
1219 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1220 -------------------------------------------------------------------------------
1221 */
1222 float128 int32_to_float128( int32 a )
1223 {
1224 flag zSign;
1225 uint32 absA;
1226 int8 shiftCount;
1227 bits64 zSig0;
1228
1229 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1230 zSign = ( a < 0 );
1231 absA = zSign ? - a : a;
1232 shiftCount = countLeadingZeros32( absA ) + 17;
1233 zSig0 = absA;
1234 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1235
1236 }
1237
1238 float128 uint32_to_float128( uint32 a )
1239 {
1240 int8 shiftCount;
1241 bits64 zSig0 = a;
1242
1243 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1244 shiftCount = countLeadingZeros32( a ) + 17;
1245 return packFloat128( 0, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1246
1247 }
1248
1249 #endif
1250
1251 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1252 /*
1253 -------------------------------------------------------------------------------
1254 Returns the result of converting the 64-bit two's complement integer `a'
1255 to the single-precision floating-point format. The conversion is performed
1256 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1257 -------------------------------------------------------------------------------
1258 */
1259 float32 int64_to_float32( int64 a )
1260 {
1261 flag zSign;
1262 uint64 absA;
1263 int8 shiftCount;
1264
1265 if ( a == 0 ) return 0;
1266 zSign = ( a < 0 );
1267 absA = zSign ? - a : a;
1268 shiftCount = countLeadingZeros64( absA ) - 40;
1269 if ( 0 <= shiftCount ) {
1270 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1271 }
1272 else {
1273 shiftCount += 7;
1274 if ( shiftCount < 0 ) {
1275 shift64RightJamming( absA, - shiftCount, &absA );
1276 }
1277 else {
1278 absA <<= shiftCount;
1279 }
1280 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1281 }
1282
1283 }
1284
1285 /*
1286 -------------------------------------------------------------------------------
1287 Returns the result of converting the 64-bit two's complement integer `a'
1288 to the double-precision floating-point format. The conversion is performed
1289 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1290 -------------------------------------------------------------------------------
1291 */
1292 float64 int64_to_float64( int64 a )
1293 {
1294 flag zSign;
1295
1296 if ( a == 0 ) return 0;
1297 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1298 return packFloat64( 1, 0x43E, 0 );
1299 }
1300 zSign = ( a < 0 );
1301 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1302
1303 }
1304
1305 #ifdef FLOATX80
1306
1307 /*
1308 -------------------------------------------------------------------------------
1309 Returns the result of converting the 64-bit two's complement integer `a'
1310 to the extended double-precision floating-point format. The conversion
1311 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1312 Arithmetic.
1313 -------------------------------------------------------------------------------
1314 */
1315 floatx80 int64_to_floatx80( int64 a )
1316 {
1317 flag zSign;
1318 uint64 absA;
1319 int8 shiftCount;
1320
1321 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1322 zSign = ( a < 0 );
1323 absA = zSign ? - a : a;
1324 shiftCount = countLeadingZeros64( absA );
1325 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1326
1327 }
1328
1329 #endif
1330
1331 #endif /* !SOFTFLOAT_FOR_GCC */
1332
1333 #ifdef FLOAT128
1334
1335 /*
1336 -------------------------------------------------------------------------------
1337 Returns the result of converting the 64-bit two's complement integer `a' to
1338 the quadruple-precision floating-point format. The conversion is performed
1339 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1340 -------------------------------------------------------------------------------
1341 */
1342 float128 int64_to_float128( int64 a )
1343 {
1344 flag zSign;
1345 uint64 absA;
1346 int8 shiftCount;
1347 int32 zExp;
1348 bits64 zSig0, zSig1;
1349
1350 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1351 zSign = ( a < 0 );
1352 absA = zSign ? - a : a;
1353 shiftCount = countLeadingZeros64( absA ) + 49;
1354 zExp = 0x406E - shiftCount;
1355 if ( 64 <= shiftCount ) {
1356 zSig1 = 0;
1357 zSig0 = absA;
1358 shiftCount -= 64;
1359 }
1360 else {
1361 zSig1 = absA;
1362 zSig0 = 0;
1363 }
1364 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1365 return packFloat128( zSign, zExp, zSig0, zSig1 );
1366
1367 }
1368
1369 #endif
1370
1371 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1372 /*
1373 -------------------------------------------------------------------------------
1374 Returns the result of converting the single-precision floating-point value
1375 `a' to the 32-bit two's complement integer format. The conversion is
1376 performed according to the IEC/IEEE Standard for Binary Floating-Point
1377 Arithmetic---which means in particular that the conversion is rounded
1378 according to the current rounding mode. If `a' is a NaN, the largest
1379 positive integer is returned. Otherwise, if the conversion overflows, the
1380 largest integer with the same sign as `a' is returned.
1381 -------------------------------------------------------------------------------
1382 */
1383 int32 float32_to_int32( float32 a )
1384 {
1385 flag aSign;
1386 int16 aExp, shiftCount;
1387 bits32 aSig;
1388 bits64 aSig64;
1389
1390 aSig = extractFloat32Frac( a );
1391 aExp = extractFloat32Exp( a );
1392 aSign = extractFloat32Sign( a );
1393 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1394 if ( aExp ) aSig |= 0x00800000;
1395 shiftCount = 0xAF - aExp;
1396 aSig64 = aSig;
1397 aSig64 <<= 32;
1398 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1399 return roundAndPackInt32( aSign, aSig64 );
1400
1401 }
1402 #endif /* !SOFTFLOAT_FOR_GCC */
1403
1404 /*
1405 -------------------------------------------------------------------------------
1406 Returns the result of converting the single-precision floating-point value
1407 `a' to the 32-bit two's complement integer format. The conversion is
1408 performed according to the IEC/IEEE Standard for Binary Floating-Point
1409 Arithmetic, except that the conversion is always rounded toward zero.
1410 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1411 the conversion overflows, the largest integer with the same sign as `a' is
1412 returned.
1413 -------------------------------------------------------------------------------
1414 */
1415 int32 float32_to_int32_round_to_zero( float32 a )
1416 {
1417 flag aSign;
1418 int16 aExp, shiftCount;
1419 bits32 aSig;
1420 int32 z;
1421
1422 aSig = extractFloat32Frac( a );
1423 aExp = extractFloat32Exp( a );
1424 aSign = extractFloat32Sign( a );
1425 shiftCount = aExp - 0x9E;
1426 if ( 0 <= shiftCount ) {
1427 if ( a != 0xCF000000 ) {
1428 float_raise( float_flag_invalid );
1429 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1430 }
1431 return (sbits32) 0x80000000;
1432 }
1433 else if ( aExp <= 0x7E ) {
1434 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1435 return 0;
1436 }
1437 aSig = ( aSig | 0x00800000 )<<8;
1438 z = aSig>>( - shiftCount );
1439 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1440 float_exception_flags |= float_flag_inexact;
1441 }
1442 if ( aSign ) z = - z;
1443 return z;
1444
1445 }
1446
1447 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1448 /*
1449 -------------------------------------------------------------------------------
1450 Returns the result of converting the single-precision floating-point value
1451 `a' to the 64-bit two's complement integer format. The conversion is
1452 performed according to the IEC/IEEE Standard for Binary Floating-Point
1453 Arithmetic---which means in particular that the conversion is rounded
1454 according to the current rounding mode. If `a' is a NaN, the largest
1455 positive integer is returned. Otherwise, if the conversion overflows, the
1456 largest integer with the same sign as `a' is returned.
1457 -------------------------------------------------------------------------------
1458 */
1459 int64 float32_to_int64( float32 a )
1460 {
1461 flag aSign;
1462 int16 aExp, shiftCount;
1463 bits32 aSig;
1464 bits64 aSig64, aSigExtra;
1465
1466 aSig = extractFloat32Frac( a );
1467 aExp = extractFloat32Exp( a );
1468 aSign = extractFloat32Sign( a );
1469 shiftCount = 0xBE - aExp;
1470 if ( shiftCount < 0 ) {
1471 float_raise( float_flag_invalid );
1472 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1473 return LIT64( 0x7FFFFFFFFFFFFFFF );
1474 }
1475 return (sbits64) LIT64( 0x8000000000000000 );
1476 }
1477 if ( aExp ) aSig |= 0x00800000;
1478 aSig64 = aSig;
1479 aSig64 <<= 40;
1480 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1481 return roundAndPackInt64( aSign, aSig64, aSigExtra );
1482
1483 }
1484
1485 /*
1486 -------------------------------------------------------------------------------
1487 Returns the result of converting the single-precision floating-point value
1488 `a' to the 64-bit two's complement integer format. The conversion is
1489 performed according to the IEC/IEEE Standard for Binary Floating-Point
1490 Arithmetic, except that the conversion is always rounded toward zero. If
1491 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1492 conversion overflows, the largest integer with the same sign as `a' is
1493 returned.
1494 -------------------------------------------------------------------------------
1495 */
1496 int64 float32_to_int64_round_to_zero( float32 a )
1497 {
1498 flag aSign;
1499 int16 aExp, shiftCount;
1500 bits32 aSig;
1501 bits64 aSig64;
1502 int64 z;
1503
1504 aSig = extractFloat32Frac( a );
1505 aExp = extractFloat32Exp( a );
1506 aSign = extractFloat32Sign( a );
1507 shiftCount = aExp - 0xBE;
1508 if ( 0 <= shiftCount ) {
1509 if ( a != 0xDF000000 ) {
1510 float_raise( float_flag_invalid );
1511 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1512 return LIT64( 0x7FFFFFFFFFFFFFFF );
1513 }
1514 }
1515 return (sbits64) LIT64( 0x8000000000000000 );
1516 }
1517 else if ( aExp <= 0x7E ) {
1518 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1519 return 0;
1520 }
1521 aSig64 = aSig | 0x00800000;
1522 aSig64 <<= 40;
1523 z = aSig64>>( - shiftCount );
1524 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1525 float_exception_flags |= float_flag_inexact;
1526 }
1527 if ( aSign ) z = - z;
1528 return z;
1529
1530 }
1531 #endif /* !SOFTFLOAT_FOR_GCC */
1532
1533 /*
1534 -------------------------------------------------------------------------------
1535 Returns the result of converting the single-precision floating-point value
1536 `a' to the double-precision floating-point format. The conversion is
1537 performed according to the IEC/IEEE Standard for Binary Floating-Point
1538 Arithmetic.
1539 -------------------------------------------------------------------------------
1540 */
1541 float64 float32_to_float64( float32 a )
1542 {
1543 flag aSign;
1544 int16 aExp;
1545 bits32 aSig;
1546
1547 aSig = extractFloat32Frac( a );
1548 aExp = extractFloat32Exp( a );
1549 aSign = extractFloat32Sign( a );
1550 if ( aExp == 0xFF ) {
1551 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1552 return packFloat64( aSign, 0x7FF, 0 );
1553 }
1554 if ( aExp == 0 ) {
1555 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1556 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1557 --aExp;
1558 }
1559 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1560
1561 }
1562
1563 #ifdef FLOATX80
1564
1565 /*
1566 -------------------------------------------------------------------------------
1567 Returns the result of converting the single-precision floating-point value
1568 `a' to the extended double-precision floating-point format. The conversion
1569 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1570 Arithmetic.
1571 -------------------------------------------------------------------------------
1572 */
1573 floatx80 float32_to_floatx80( float32 a )
1574 {
1575 flag aSign;
1576 int16 aExp;
1577 bits32 aSig;
1578
1579 aSig = extractFloat32Frac( a );
1580 aExp = extractFloat32Exp( a );
1581 aSign = extractFloat32Sign( a );
1582 if ( aExp == 0xFF ) {
1583 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1584 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1585 }
1586 if ( aExp == 0 ) {
1587 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1588 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1589 }
1590 aSig |= 0x00800000;
1591 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1592
1593 }
1594
1595 #endif
1596
1597 #ifdef FLOAT128
1598
1599 /*
1600 -------------------------------------------------------------------------------
1601 Returns the result of converting the single-precision floating-point value
1602 `a' to the double-precision floating-point format. The conversion is
1603 performed according to the IEC/IEEE Standard for Binary Floating-Point
1604 Arithmetic.
1605 -------------------------------------------------------------------------------
1606 */
1607 float128 float32_to_float128( float32 a )
1608 {
1609 flag aSign;
1610 int16 aExp;
1611 bits32 aSig;
1612
1613 aSig = extractFloat32Frac( a );
1614 aExp = extractFloat32Exp( a );
1615 aSign = extractFloat32Sign( a );
1616 if ( aExp == 0xFF ) {
1617 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1618 return packFloat128( aSign, 0x7FFF, 0, 0 );
1619 }
1620 if ( aExp == 0 ) {
1621 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1622 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1623 --aExp;
1624 }
1625 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1626
1627 }
1628
1629 #endif
1630
1631 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1632 /*
1633 -------------------------------------------------------------------------------
1634 Rounds the single-precision floating-point value `a' to an integer, and
1635 returns the result as a single-precision floating-point value. The
1636 operation is performed according to the IEC/IEEE Standard for Binary
1637 Floating-Point Arithmetic.
1638 -------------------------------------------------------------------------------
1639 */
1640 float32 float32_round_to_int( float32 a )
1641 {
1642 flag aSign;
1643 int16 aExp;
1644 bits32 lastBitMask, roundBitsMask;
1645 int8 roundingMode;
1646 float32 z;
1647
1648 aExp = extractFloat32Exp( a );
1649 if ( 0x96 <= aExp ) {
1650 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1651 return propagateFloat32NaN( a, a );
1652 }
1653 return a;
1654 }
1655 if ( aExp <= 0x7E ) {
1656 if ( (bits32) ( a<<1 ) == 0 ) return a;
1657 float_exception_flags |= float_flag_inexact;
1658 aSign = extractFloat32Sign( a );
1659 switch ( float_rounding_mode ) {
1660 case float_round_nearest_even:
1661 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1662 return packFloat32( aSign, 0x7F, 0 );
1663 }
1664 break;
1665 case float_round_to_zero:
1666 break;
1667 case float_round_down:
1668 return aSign ? 0xBF800000 : 0;
1669 case float_round_up:
1670 return aSign ? 0x80000000 : 0x3F800000;
1671 }
1672 return packFloat32( aSign, 0, 0 );
1673 }
1674 lastBitMask = 1;
1675 lastBitMask <<= 0x96 - aExp;
1676 roundBitsMask = lastBitMask - 1;
1677 z = a;
1678 roundingMode = float_rounding_mode;
1679 if ( roundingMode == float_round_nearest_even ) {
1680 z += lastBitMask>>1;
1681 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1682 }
1683 else if ( roundingMode != float_round_to_zero ) {
1684 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1685 z += roundBitsMask;
1686 }
1687 }
1688 z &= ~ roundBitsMask;
1689 if ( z != a ) float_exception_flags |= float_flag_inexact;
1690 return z;
1691
1692 }
1693 #endif /* !SOFTFLOAT_FOR_GCC */
1694
1695 /*
1696 -------------------------------------------------------------------------------
1697 Returns the result of adding the absolute values of the single-precision
1698 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1699 before being returned. `zSign' is ignored if the result is a NaN.
1700 The addition is performed according to the IEC/IEEE Standard for Binary
1701 Floating-Point Arithmetic.
1702 -------------------------------------------------------------------------------
1703 */
1704 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1705 {
1706 int16 aExp, bExp, zExp;
1707 bits32 aSig, bSig, zSig;
1708 int16 expDiff;
1709
1710 aSig = extractFloat32Frac( a );
1711 aExp = extractFloat32Exp( a );
1712 bSig = extractFloat32Frac( b );
1713 bExp = extractFloat32Exp( b );
1714 expDiff = aExp - bExp;
1715 aSig <<= 6;
1716 bSig <<= 6;
1717 if ( 0 < expDiff ) {
1718 if ( aExp == 0xFF ) {
1719 if ( aSig ) return propagateFloat32NaN( a, b );
1720 return a;
1721 }
1722 if ( bExp == 0 ) {
1723 --expDiff;
1724 }
1725 else {
1726 bSig |= 0x20000000;
1727 }
1728 shift32RightJamming( bSig, expDiff, &bSig );
1729 zExp = aExp;
1730 }
1731 else if ( expDiff < 0 ) {
1732 if ( bExp == 0xFF ) {
1733 if ( bSig ) return propagateFloat32NaN( a, b );
1734 return packFloat32( zSign, 0xFF, 0 );
1735 }
1736 if ( aExp == 0 ) {
1737 ++expDiff;
1738 }
1739 else {
1740 aSig |= 0x20000000;
1741 }
1742 shift32RightJamming( aSig, - expDiff, &aSig );
1743 zExp = bExp;
1744 }
1745 else {
1746 if ( aExp == 0xFF ) {
1747 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1748 return a;
1749 }
1750 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1751 zSig = 0x40000000 + aSig + bSig;
1752 zExp = aExp;
1753 goto roundAndPack;
1754 }
1755 aSig |= 0x20000000;
1756 zSig = ( aSig + bSig )<<1;
1757 --zExp;
1758 if ( (sbits32) zSig < 0 ) {
1759 zSig = aSig + bSig;
1760 ++zExp;
1761 }
1762 roundAndPack:
1763 return roundAndPackFloat32( zSign, zExp, zSig );
1764
1765 }
1766
1767 /*
1768 -------------------------------------------------------------------------------
1769 Returns the result of subtracting the absolute values of the single-
1770 precision floating-point values `a' and `b'. If `zSign' is 1, the
1771 difference is negated before being returned. `zSign' is ignored if the
1772 result is a NaN. The subtraction is performed according to the IEC/IEEE
1773 Standard for Binary Floating-Point Arithmetic.
1774 -------------------------------------------------------------------------------
1775 */
1776 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1777 {
1778 int16 aExp, bExp, zExp;
1779 bits32 aSig, bSig, zSig;
1780 int16 expDiff;
1781
1782 aSig = extractFloat32Frac( a );
1783 aExp = extractFloat32Exp( a );
1784 bSig = extractFloat32Frac( b );
1785 bExp = extractFloat32Exp( b );
1786 expDiff = aExp - bExp;
1787 aSig <<= 7;
1788 bSig <<= 7;
1789 if ( 0 < expDiff ) goto aExpBigger;
1790 if ( expDiff < 0 ) goto bExpBigger;
1791 if ( aExp == 0xFF ) {
1792 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1793 float_raise( float_flag_invalid );
1794 return float32_default_nan;
1795 }
1796 if ( aExp == 0 ) {
1797 aExp = 1;
1798 bExp = 1;
1799 }
1800 if ( bSig < aSig ) goto aBigger;
1801 if ( aSig < bSig ) goto bBigger;
1802 return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1803 bExpBigger:
1804 if ( bExp == 0xFF ) {
1805 if ( bSig ) return propagateFloat32NaN( a, b );
1806 return packFloat32( zSign ^ 1, 0xFF, 0 );
1807 }
1808 if ( aExp == 0 ) {
1809 ++expDiff;
1810 }
1811 else {
1812 aSig |= 0x40000000;
1813 }
1814 shift32RightJamming( aSig, - expDiff, &aSig );
1815 bSig |= 0x40000000;
1816 bBigger:
1817 zSig = bSig - aSig;
1818 zExp = bExp;
1819 zSign ^= 1;
1820 goto normalizeRoundAndPack;
1821 aExpBigger:
1822 if ( aExp == 0xFF ) {
1823 if ( aSig ) return propagateFloat32NaN( a, b );
1824 return a;
1825 }
1826 if ( bExp == 0 ) {
1827 --expDiff;
1828 }
1829 else {
1830 bSig |= 0x40000000;
1831 }
1832 shift32RightJamming( bSig, expDiff, &bSig );
1833 aSig |= 0x40000000;
1834 aBigger:
1835 zSig = aSig - bSig;
1836 zExp = aExp;
1837 normalizeRoundAndPack:
1838 --zExp;
1839 return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1840
1841 }
1842
1843 /*
1844 -------------------------------------------------------------------------------
1845 Returns the result of adding the single-precision floating-point values `a'
1846 and `b'. The operation is performed according to the IEC/IEEE Standard for
1847 Binary Floating-Point Arithmetic.
1848 -------------------------------------------------------------------------------
1849 */
1850 float32 float32_add( float32 a, float32 b )
1851 {
1852 flag aSign, bSign;
1853
1854 aSign = extractFloat32Sign( a );
1855 bSign = extractFloat32Sign( b );
1856 if ( aSign == bSign ) {
1857 return addFloat32Sigs( a, b, aSign );
1858 }
1859 else {
1860 return subFloat32Sigs( a, b, aSign );
1861 }
1862
1863 }
1864
1865 /*
1866 -------------------------------------------------------------------------------
1867 Returns the result of subtracting the single-precision floating-point values
1868 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1869 for Binary Floating-Point Arithmetic.
1870 -------------------------------------------------------------------------------
1871 */
1872 float32 float32_sub( float32 a, float32 b )
1873 {
1874 flag aSign, bSign;
1875
1876 aSign = extractFloat32Sign( a );
1877 bSign = extractFloat32Sign( b );
1878 if ( aSign == bSign ) {
1879 return subFloat32Sigs( a, b, aSign );
1880 }
1881 else {
1882 return addFloat32Sigs( a, b, aSign );
1883 }
1884
1885 }
1886
1887 /*
1888 -------------------------------------------------------------------------------
1889 Returns the result of multiplying the single-precision floating-point values
1890 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1891 for Binary Floating-Point Arithmetic.
1892 -------------------------------------------------------------------------------
1893 */
1894 float32 float32_mul( float32 a, float32 b )
1895 {
1896 flag aSign, bSign, zSign;
1897 int16 aExp, bExp, zExp;
1898 bits32 aSig, bSig;
1899 bits64 zSig64;
1900 bits32 zSig;
1901
1902 aSig = extractFloat32Frac( a );
1903 aExp = extractFloat32Exp( a );
1904 aSign = extractFloat32Sign( a );
1905 bSig = extractFloat32Frac( b );
1906 bExp = extractFloat32Exp( b );
1907 bSign = extractFloat32Sign( b );
1908 zSign = aSign ^ bSign;
1909 if ( aExp == 0xFF ) {
1910 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1911 return propagateFloat32NaN( a, b );
1912 }
1913 if ( ( bExp | bSig ) == 0 ) {
1914 float_raise( float_flag_invalid );
1915 return float32_default_nan;
1916 }
1917 return packFloat32( zSign, 0xFF, 0 );
1918 }
1919 if ( bExp == 0xFF ) {
1920 if ( bSig ) return propagateFloat32NaN( a, b );
1921 if ( ( aExp | aSig ) == 0 ) {
1922 float_raise( float_flag_invalid );
1923 return float32_default_nan;
1924 }
1925 return packFloat32( zSign, 0xFF, 0 );
1926 }
1927 if ( aExp == 0 ) {
1928 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1929 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1930 }
1931 if ( bExp == 0 ) {
1932 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1933 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1934 }
1935 zExp = aExp + bExp - 0x7F;
1936 aSig = ( aSig | 0x00800000 )<<7;
1937 bSig = ( bSig | 0x00800000 )<<8;
1938 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1939 zSig = (bits32)zSig64;
1940 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1941 zSig <<= 1;
1942 --zExp;
1943 }
1944 return roundAndPackFloat32( zSign, zExp, zSig );
1945
1946 }
1947
1948 /*
1949 -------------------------------------------------------------------------------
1950 Returns the result of dividing the single-precision floating-point value `a'
1951 by the corresponding value `b'. The operation is performed according to the
1952 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1953 -------------------------------------------------------------------------------
1954 */
1955 float32 float32_div( float32 a, float32 b )
1956 {
1957 flag aSign, bSign, zSign;
1958 int16 aExp, bExp, zExp;
1959 bits32 aSig, bSig, zSig;
1960
1961 aSig = extractFloat32Frac( a );
1962 aExp = extractFloat32Exp( a );
1963 aSign = extractFloat32Sign( a );
1964 bSig = extractFloat32Frac( b );
1965 bExp = extractFloat32Exp( b );
1966 bSign = extractFloat32Sign( b );
1967 zSign = aSign ^ bSign;
1968 if ( aExp == 0xFF ) {
1969 if ( aSig ) return propagateFloat32NaN( a, b );
1970 if ( bExp == 0xFF ) {
1971 if ( bSig ) return propagateFloat32NaN( a, b );
1972 float_raise( float_flag_invalid );
1973 return float32_default_nan;
1974 }
1975 return packFloat32( zSign, 0xFF, 0 );
1976 }
1977 if ( bExp == 0xFF ) {
1978 if ( bSig ) return propagateFloat32NaN( a, b );
1979 return packFloat32( zSign, 0, 0 );
1980 }
1981 if ( bExp == 0 ) {
1982 if ( bSig == 0 ) {
1983 if ( ( aExp | aSig ) == 0 ) {
1984 float_raise( float_flag_invalid );
1985 return float32_default_nan;
1986 }
1987 float_raise( float_flag_divbyzero );
1988 return packFloat32( zSign, 0xFF, 0 );
1989 }
1990 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1991 }
1992 if ( aExp == 0 ) {
1993 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1994 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1995 }
1996 zExp = aExp - bExp + 0x7D;
1997 aSig = ( aSig | 0x00800000 )<<7;
1998 bSig = ( bSig | 0x00800000 )<<8;
1999 if ( bSig <= ( aSig + aSig ) ) {
2000 aSig >>= 1;
2001 ++zExp;
2002 }
2003 zSig = (bits32)((((bits64) aSig) << 32) / bSig);
2004 if ( ( zSig & 0x3F ) == 0 ) {
2005 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
2006 }
2007 return roundAndPackFloat32( zSign, zExp, zSig );
2008
2009 }
2010
2011 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2012 /*
2013 -------------------------------------------------------------------------------
2014 Returns the remainder of the single-precision floating-point value `a'
2015 with respect to the corresponding value `b'. The operation is performed
2016 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2017 -------------------------------------------------------------------------------
2018 */
2019 float32 float32_rem( float32 a, float32 b )
2020 {
2021 flag aSign, bSign, zSign;
2022 int16 aExp, bExp, expDiff;
2023 bits32 aSig, bSig;
2024 bits32 q;
2025 bits64 aSig64, bSig64, q64;
2026 bits32 alternateASig;
2027 sbits32 sigMean;
2028
2029 aSig = extractFloat32Frac( a );
2030 aExp = extractFloat32Exp( a );
2031 aSign = extractFloat32Sign( a );
2032 bSig = extractFloat32Frac( b );
2033 bExp = extractFloat32Exp( b );
2034 bSign = extractFloat32Sign( b );
2035 if ( aExp == 0xFF ) {
2036 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2037 return propagateFloat32NaN( a, b );
2038 }
2039 float_raise( float_flag_invalid );
2040 return float32_default_nan;
2041 }
2042 if ( bExp == 0xFF ) {
2043 if ( bSig ) return propagateFloat32NaN( a, b );
2044 return a;
2045 }
2046 if ( bExp == 0 ) {
2047 if ( bSig == 0 ) {
2048 float_raise( float_flag_invalid );
2049 return float32_default_nan;
2050 }
2051 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2052 }
2053 if ( aExp == 0 ) {
2054 if ( aSig == 0 ) return a;
2055 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2056 }
2057 expDiff = aExp - bExp;
2058 aSig |= 0x00800000;
2059 bSig |= 0x00800000;
2060 if ( expDiff < 32 ) {
2061 aSig <<= 8;
2062 bSig <<= 8;
2063 if ( expDiff < 0 ) {
2064 if ( expDiff < -1 ) return a;
2065 aSig >>= 1;
2066 }
2067 q = ( bSig <= aSig );
2068 if ( q ) aSig -= bSig;
2069 if ( 0 < expDiff ) {
2070 q = ( ( (bits64) aSig )<<32 ) / bSig;
2071 q >>= 32 - expDiff;
2072 bSig >>= 2;
2073 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2074 }
2075 else {
2076 aSig >>= 2;
2077 bSig >>= 2;
2078 }
2079 }
2080 else {
2081 if ( bSig <= aSig ) aSig -= bSig;
2082 aSig64 = ( (bits64) aSig )<<40;
2083 bSig64 = ( (bits64) bSig )<<40;
2084 expDiff -= 64;
2085 while ( 0 < expDiff ) {
2086 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2087 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2088 aSig64 = - ( ( bSig * q64 )<<38 );
2089 expDiff -= 62;
2090 }
2091 expDiff += 64;
2092 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2093 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2094 q = q64>>( 64 - expDiff );
2095 bSig <<= 6;
2096 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2097 }
2098 do {
2099 alternateASig = aSig;
2100 ++q;
2101 aSig -= bSig;
2102 } while ( 0 <= (sbits32) aSig );
2103 sigMean = aSig + alternateASig;
2104 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2105 aSig = alternateASig;
2106 }
2107 zSign = ( (sbits32) aSig < 0 );
2108 if ( zSign ) aSig = - aSig;
2109 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2110
2111 }
2112 #endif /* !SOFTFLOAT_FOR_GCC */
2113
2114 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2115 /*
2116 -------------------------------------------------------------------------------
2117 Returns the square root of the single-precision floating-point value `a'.
2118 The operation is performed according to the IEC/IEEE Standard for Binary
2119 Floating-Point Arithmetic.
2120 -------------------------------------------------------------------------------
2121 */
2122 float32 float32_sqrt( float32 a )
2123 {
2124 flag aSign;
2125 int16 aExp, zExp;
2126 bits32 aSig, zSig;
2127 bits64 rem, term;
2128
2129 aSig = extractFloat32Frac( a );
2130 aExp = extractFloat32Exp( a );
2131 aSign = extractFloat32Sign( a );
2132 if ( aExp == 0xFF ) {
2133 if ( aSig ) return propagateFloat32NaN( a, 0 );
2134 if ( ! aSign ) return a;
2135 float_raise( float_flag_invalid );
2136 return float32_default_nan;
2137 }
2138 if ( aSign ) {
2139 if ( ( aExp | aSig ) == 0 ) return a;
2140 float_raise( float_flag_invalid );
2141 return float32_default_nan;
2142 }
2143 if ( aExp == 0 ) {
2144 if ( aSig == 0 ) return 0;
2145 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2146 }
2147 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2148 aSig = ( aSig | 0x00800000 )<<8;
2149 zSig = estimateSqrt32( aExp, aSig ) + 2;
2150 if ( ( zSig & 0x7F ) <= 5 ) {
2151 if ( zSig < 2 ) {
2152 zSig = 0x7FFFFFFF;
2153 goto roundAndPack;
2154 }
2155 aSig >>= aExp & 1;
2156 term = ( (bits64) zSig ) * zSig;
2157 rem = ( ( (bits64) aSig )<<32 ) - term;
2158 while ( (sbits64) rem < 0 ) {
2159 --zSig;
2160 rem += ( ( (bits64) zSig )<<1 ) | 1;
2161 }
2162 zSig |= ( rem != 0 );
2163 }
2164 shift32RightJamming( zSig, 1, &zSig );
2165 roundAndPack:
2166 return roundAndPackFloat32( 0, zExp, zSig );
2167
2168 }
2169 #endif /* !SOFTFLOAT_FOR_GCC */
2170
2171 /*
2172 -------------------------------------------------------------------------------
2173 Returns 1 if the single-precision floating-point value `a' is equal to
2174 the corresponding value `b', and 0 otherwise. The comparison is performed
2175 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2176 -------------------------------------------------------------------------------
2177 */
2178 flag float32_eq( float32 a, float32 b )
2179 {
2180
2181 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2182 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2183 ) {
2184 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2185 float_raise( float_flag_invalid );
2186 }
2187 return 0;
2188 }
2189 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2190
2191 }
2192
2193 /*
2194 -------------------------------------------------------------------------------
2195 Returns 1 if the single-precision floating-point value `a' is less than
2196 or equal to the corresponding value `b', and 0 otherwise. The comparison
2197 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2198 Arithmetic.
2199 -------------------------------------------------------------------------------
2200 */
2201 flag float32_le( float32 a, float32 b )
2202 {
2203 flag aSign, bSign;
2204
2205 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2206 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2207 ) {
2208 float_raise( float_flag_invalid );
2209 return 0;
2210 }
2211 aSign = extractFloat32Sign( a );
2212 bSign = extractFloat32Sign( b );
2213 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2214 return ( a == b ) || ( aSign ^ ( a < b ) );
2215
2216 }
2217
2218 /*
2219 -------------------------------------------------------------------------------
2220 Returns 1 if the single-precision floating-point value `a' is less than
2221 the corresponding value `b', and 0 otherwise. The comparison is performed
2222 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2223 -------------------------------------------------------------------------------
2224 */
2225 flag float32_lt( float32 a, float32 b )
2226 {
2227 flag aSign, bSign;
2228
2229 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2230 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2231 ) {
2232 float_raise( float_flag_invalid );
2233 return 0;
2234 }
2235 aSign = extractFloat32Sign( a );
2236 bSign = extractFloat32Sign( b );
2237 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2238 return ( a != b ) && ( aSign ^ ( a < b ) );
2239
2240 }
2241
2242 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2243 /*
2244 -------------------------------------------------------------------------------
2245 Returns 1 if the single-precision floating-point value `a' is equal to
2246 the corresponding value `b', and 0 otherwise. The invalid exception is
2247 raised if either operand is a NaN. Otherwise, the comparison is performed
2248 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2249 -------------------------------------------------------------------------------
2250 */
2251 flag float32_eq_signaling( float32 a, float32 b )
2252 {
2253
2254 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2255 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2256 ) {
2257 float_raise( float_flag_invalid );
2258 return 0;
2259 }
2260 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2261
2262 }
2263
2264 /*
2265 -------------------------------------------------------------------------------
2266 Returns 1 if the single-precision floating-point value `a' is less than or
2267 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2268 cause an exception. Otherwise, the comparison is performed according to the
2269 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2270 -------------------------------------------------------------------------------
2271 */
2272 flag float32_le_quiet( float32 a, float32 b )
2273 {
2274 flag aSign, bSign;
2275
2276 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2277 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2278 ) {
2279 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2280 float_raise( float_flag_invalid );
2281 }
2282 return 0;
2283 }
2284 aSign = extractFloat32Sign( a );
2285 bSign = extractFloat32Sign( b );
2286 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2287 return ( a == b ) || ( aSign ^ ( a < b ) );
2288
2289 }
2290
2291 /*
2292 -------------------------------------------------------------------------------
2293 Returns 1 if the single-precision floating-point value `a' is less than
2294 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2295 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2296 Standard for Binary Floating-Point Arithmetic.
2297 -------------------------------------------------------------------------------
2298 */
2299 flag float32_lt_quiet( float32 a, float32 b )
2300 {
2301 flag aSign, bSign;
2302
2303 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2304 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2305 ) {
2306 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2307 float_raise( float_flag_invalid );
2308 }
2309 return 0;
2310 }
2311 aSign = extractFloat32Sign( a );
2312 bSign = extractFloat32Sign( b );
2313 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2314 return ( a != b ) && ( aSign ^ ( a < b ) );
2315
2316 }
2317 #endif /* !SOFTFLOAT_FOR_GCC */
2318
2319 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2320 /*
2321 -------------------------------------------------------------------------------
2322 Returns the result of converting the double-precision floating-point value
2323 `a' to the 32-bit two's complement integer format. The conversion is
2324 performed according to the IEC/IEEE Standard for Binary Floating-Point
2325 Arithmetic---which means in particular that the conversion is rounded
2326 according to the current rounding mode. If `a' is a NaN, the largest
2327 positive integer is returned. Otherwise, if the conversion overflows, the
2328 largest integer with the same sign as `a' is returned.
2329 -------------------------------------------------------------------------------
2330 */
2331 int32 float64_to_int32( float64 a )
2332 {
2333 flag aSign;
2334 int16 aExp, shiftCount;
2335 bits64 aSig;
2336
2337 aSig = extractFloat64Frac( a );
2338 aExp = extractFloat64Exp( a );
2339 aSign = extractFloat64Sign( a );
2340 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2341 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2342 shiftCount = 0x42C - aExp;
2343 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2344 return roundAndPackInt32( aSign, aSig );
2345
2346 }
2347 #endif /* !SOFTFLOAT_FOR_GCC */
2348
2349 /*
2350 -------------------------------------------------------------------------------
2351 Returns the result of converting the double-precision floating-point value
2352 `a' to the 32-bit two's complement integer format. The conversion is
2353 performed according to the IEC/IEEE Standard for Binary Floating-Point
2354 Arithmetic, except that the conversion is always rounded toward zero.
2355 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2356 the conversion overflows, the largest integer with the same sign as `a' is
2357 returned.
2358 -------------------------------------------------------------------------------
2359 */
2360 int32 float64_to_int32_round_to_zero( float64 a )
2361 {
2362 flag aSign;
2363 int16 aExp, shiftCount;
2364 bits64 aSig, savedASig;
2365 int32 z;
2366
2367 aSig = extractFloat64Frac( a );
2368 aExp = extractFloat64Exp( a );
2369 aSign = extractFloat64Sign( a );
2370 if ( 0x41E < aExp ) {
2371 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2372 goto invalid;
2373 }
2374 else if ( aExp < 0x3FF ) {
2375 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2376 return 0;
2377 }
2378 aSig |= LIT64( 0x0010000000000000 );
2379 shiftCount = 0x433 - aExp;
2380 savedASig = aSig;
2381 aSig >>= shiftCount;
2382 z = (int32)aSig;
2383 if ( aSign ) z = - z;
2384 if ( ( z < 0 ) ^ aSign ) {
2385 invalid:
2386 float_raise( float_flag_invalid );
2387 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2388 }
2389 if ( ( aSig<<shiftCount ) != savedASig ) {
2390 float_exception_flags |= float_flag_inexact;
2391 }
2392 return z;
2393
2394 }
2395
2396 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2397 /*
2398 -------------------------------------------------------------------------------
2399 Returns the result of converting the double-precision floating-point value
2400 `a' to the 64-bit two's complement integer format. The conversion is
2401 performed according to the IEC/IEEE Standard for Binary Floating-Point
2402 Arithmetic---which means in particular that the conversion is rounded
2403 according to the current rounding mode. If `a' is a NaN, the largest
2404 positive integer is returned. Otherwise, if the conversion overflows, the
2405 largest integer with the same sign as `a' is returned.
2406 -------------------------------------------------------------------------------
2407 */
2408 int64 float64_to_int64( float64 a )
2409 {
2410 flag aSign;
2411 int16 aExp, shiftCount;
2412 bits64 aSig, aSigExtra;
2413
2414 aSig = extractFloat64Frac( a );
2415 aExp = extractFloat64Exp( a );
2416 aSign = extractFloat64Sign( a );
2417 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2418 shiftCount = 0x433 - aExp;
2419 if ( shiftCount <= 0 ) {
2420 if ( 0x43E < aExp ) {
2421 float_raise( float_flag_invalid );
2422 if ( ! aSign
2423 || ( ( aExp == 0x7FF )
2424 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2425 ) {
2426 return LIT64( 0x7FFFFFFFFFFFFFFF );
2427 }
2428 return (sbits64) LIT64( 0x8000000000000000 );
2429 }
2430 aSigExtra = 0;
2431 aSig <<= - shiftCount;
2432 }
2433 else {
2434 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2435 }
2436 return roundAndPackInt64( aSign, aSig, aSigExtra );
2437
2438 }
2439
2440 /*
2441 -------------------------------------------------------------------------------
2442 Returns the result of converting the double-precision floating-point value
2443 `a' to the 64-bit two's complement integer format. The conversion is
2444 performed according to the IEC/IEEE Standard for Binary Floating-Point
2445 Arithmetic, except that the conversion is always rounded toward zero.
2446 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2447 the conversion overflows, the largest integer with the same sign as `a' is
2448 returned.
2449 -------------------------------------------------------------------------------
2450 */
2451 int64 float64_to_int64_round_to_zero( float64 a )
2452 {
2453 flag aSign;
2454 int16 aExp, shiftCount;
2455 bits64 aSig;
2456 int64 z;
2457
2458 aSig = extractFloat64Frac( a );
2459 aExp = extractFloat64Exp( a );
2460 aSign = extractFloat64Sign( a );
2461 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2462 shiftCount = aExp - 0x433;
2463 if ( 0 <= shiftCount ) {
2464 if ( 0x43E <= aExp ) {
2465 if ( a != LIT64( 0xC3E0000000000000 ) ) {
2466 float_raise( float_flag_invalid );
2467 if ( ! aSign
2468 || ( ( aExp == 0x7FF )
2469 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2470 ) {
2471 return LIT64( 0x7FFFFFFFFFFFFFFF );
2472 }
2473 }
2474 return (sbits64) LIT64( 0x8000000000000000 );
2475 }
2476 z = aSig<<shiftCount;
2477 }
2478 else {
2479 if ( aExp < 0x3FE ) {
2480 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
2481 return 0;
2482 }
2483 z = aSig>>( - shiftCount );
2484 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2485 float_exception_flags |= float_flag_inexact;
2486 }
2487 }
2488 if ( aSign ) z = - z;
2489 return z;
2490
2491 }
2492 #endif /* !SOFTFLOAT_FOR_GCC */
2493
2494 /*
2495 -------------------------------------------------------------------------------
2496 Returns the result of converting the double-precision floating-point value
2497 `a' to the single-precision floating-point format. The conversion is
2498 performed according to the IEC/IEEE Standard for Binary Floating-Point
2499 Arithmetic.
2500 -------------------------------------------------------------------------------
2501 */
2502 float32 float64_to_float32( float64 a )
2503 {
2504 flag aSign;
2505 int16 aExp;
2506 bits64 aSig;
2507 bits32 zSig;
2508
2509 aSig = extractFloat64Frac( a );
2510 aExp = extractFloat64Exp( a );
2511 aSign = extractFloat64Sign( a );
2512 if ( aExp == 0x7FF ) {
2513 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2514 return packFloat32( aSign, 0xFF, 0 );
2515 }
2516 shift64RightJamming( aSig, 22, &aSig );
2517 zSig = (bits32)aSig;
2518 if ( aExp || zSig ) {
2519 zSig |= 0x40000000;
2520 aExp -= 0x381;
2521 }
2522 return roundAndPackFloat32( aSign, aExp, zSig );
2523
2524 }
2525
2526 #ifdef FLOATX80
2527
2528 /*
2529 -------------------------------------------------------------------------------
2530 Returns the result of converting the double-precision floating-point value
2531 `a' to the extended double-precision floating-point format. The conversion
2532 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2533 Arithmetic.
2534 -------------------------------------------------------------------------------
2535 */
2536 floatx80 float64_to_floatx80( float64 a )
2537 {
2538 flag aSign;
2539 int16 aExp;
2540 bits64 aSig;
2541
2542 aSig = extractFloat64Frac( a );
2543 aExp = extractFloat64Exp( a );
2544 aSign = extractFloat64Sign( a );
2545 if ( aExp == 0x7FF ) {
2546 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2547 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2548 }
2549 if ( aExp == 0 ) {
2550 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2551 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2552 }
2553 return
2554 packFloatx80(
2555 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2556
2557 }
2558
2559 #endif
2560
2561 #ifdef FLOAT128
2562
2563 /*
2564 -------------------------------------------------------------------------------
2565 Returns the result of converting the double-precision floating-point value
2566 `a' to the quadruple-precision floating-point format. The conversion is
2567 performed according to the IEC/IEEE Standard for Binary Floating-Point
2568 Arithmetic.
2569 -------------------------------------------------------------------------------
2570 */
2571 float128 float64_to_float128( float64 a )
2572 {
2573 flag aSign;
2574 int16 aExp;
2575 bits64 aSig, zSig0, zSig1;
2576
2577 aSig = extractFloat64Frac( a );
2578 aExp = extractFloat64Exp( a );
2579 aSign = extractFloat64Sign( a );
2580 if ( aExp == 0x7FF ) {
2581 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2582 return packFloat128( aSign, 0x7FFF, 0, 0 );
2583 }
2584 if ( aExp == 0 ) {
2585 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2586 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2587 --aExp;
2588 }
2589 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2590 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2591
2592 }
2593
2594 #endif
2595
2596 #ifndef SOFTFLOAT_FOR_GCC
2597 /*
2598 -------------------------------------------------------------------------------
2599 Rounds the double-precision floating-point value `a' to an integer, and
2600 returns the result as a double-precision floating-point value. The
2601 operation is performed according to the IEC/IEEE Standard for Binary
2602 Floating-Point Arithmetic.
2603 -------------------------------------------------------------------------------
2604 */
2605 float64 float64_round_to_int( float64 a )
2606 {
2607 flag aSign;
2608 int16 aExp;
2609 bits64 lastBitMask, roundBitsMask;
2610 int8 roundingMode;
2611 float64 z;
2612
2613 aExp = extractFloat64Exp( a );
2614 if ( 0x433 <= aExp ) {
2615 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2616 return propagateFloat64NaN( a, a );
2617 }
2618 return a;
2619 }
2620 if ( aExp < 0x3FF ) {
2621 if ( (bits64) ( a<<1 ) == 0 ) return a;
2622 float_exception_flags |= float_flag_inexact;
2623 aSign = extractFloat64Sign( a );
2624 switch ( float_rounding_mode ) {
2625 case float_round_nearest_even:
2626 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2627 return packFloat64( aSign, 0x3FF, 0 );
2628 }
2629 break;
2630 case float_round_to_zero:
2631 break;
2632 case float_round_down:
2633 return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2634 case float_round_up:
2635 return
2636 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2637 }
2638 return packFloat64( aSign, 0, 0 );
2639 }
2640 lastBitMask = 1;
2641 lastBitMask <<= 0x433 - aExp;
2642 roundBitsMask = lastBitMask - 1;
2643 z = a;
2644 roundingMode = float_rounding_mode;
2645 if ( roundingMode == float_round_nearest_even ) {
2646 z += lastBitMask>>1;
2647 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2648 }
2649 else if ( roundingMode != float_round_to_zero ) {
2650 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2651 z += roundBitsMask;
2652 }
2653 }
2654 z &= ~ roundBitsMask;
2655 if ( z != a ) float_exception_flags |= float_flag_inexact;
2656 return z;
2657
2658 }
2659 #endif
2660
2661 /*
2662 -------------------------------------------------------------------------------
2663 Returns the result of adding the absolute values of the double-precision
2664 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2665 before being returned. `zSign' is ignored if the result is a NaN.
2666 The addition is performed according to the IEC/IEEE Standard for Binary
2667 Floating-Point Arithmetic.
2668 -------------------------------------------------------------------------------
2669 */
2670 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2671 {
2672 int16 aExp, bExp, zExp;
2673 bits64 aSig, bSig, zSig;
2674 int16 expDiff;
2675
2676 aSig = extractFloat64Frac( a );
2677 aExp = extractFloat64Exp( a );
2678 bSig = extractFloat64Frac( b );
2679 bExp = extractFloat64Exp( b );
2680 expDiff = aExp - bExp;
2681 aSig <<= 9;
2682 bSig <<= 9;
2683 if ( 0 < expDiff ) {
2684 if ( aExp == 0x7FF ) {
2685 if ( aSig ) return propagateFloat64NaN( a, b );
2686 return a;
2687 }
2688 if ( bExp == 0 ) {
2689 --expDiff;
2690 }
2691 else {
2692 bSig |= LIT64( 0x2000000000000000 );
2693 }
2694 shift64RightJamming( bSig, expDiff, &bSig );
2695 zExp = aExp;
2696 }
2697 else if ( expDiff < 0 ) {
2698 if ( bExp == 0x7FF ) {
2699 if ( bSig ) return propagateFloat64NaN( a, b );
2700 return packFloat64( zSign, 0x7FF, 0 );
2701 }
2702 if ( aExp == 0 ) {
2703 ++expDiff;
2704 }
2705 else {
2706 aSig |= LIT64( 0x2000000000000000 );
2707 }
2708 shift64RightJamming( aSig, - expDiff, &aSig );
2709 zExp = bExp;
2710 }
2711 else {
2712 if ( aExp == 0x7FF ) {
2713 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2714 return a;
2715 }
2716 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2717 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2718 zExp = aExp;
2719 goto roundAndPack;
2720 }
2721 aSig |= LIT64( 0x2000000000000000 );
2722 zSig = ( aSig + bSig )<<1;
2723 --zExp;
2724 if ( (sbits64) zSig < 0 ) {
2725 zSig = aSig + bSig;
2726 ++zExp;
2727 }
2728 roundAndPack:
2729 return roundAndPackFloat64( zSign, zExp, zSig );
2730
2731 }
2732
2733 /*
2734 -------------------------------------------------------------------------------
2735 Returns the result of subtracting the absolute values of the double-
2736 precision floating-point values `a' and `b'. If `zSign' is 1, the
2737 difference is negated before being returned. `zSign' is ignored if the
2738 result is a NaN. The subtraction is performed according to the IEC/IEEE
2739 Standard for Binary Floating-Point Arithmetic.
2740 -------------------------------------------------------------------------------
2741 */
2742 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2743 {
2744 int16 aExp, bExp, zExp;
2745 bits64 aSig, bSig, zSig;
2746 int16 expDiff;
2747
2748 aSig = extractFloat64Frac( a );
2749 aExp = extractFloat64Exp( a );
2750 bSig = extractFloat64Frac( b );
2751 bExp = extractFloat64Exp( b );
2752 expDiff = aExp - bExp;
2753 aSig <<= 10;
2754 bSig <<= 10;
2755 if ( 0 < expDiff ) goto aExpBigger;
2756 if ( expDiff < 0 ) goto bExpBigger;
2757 if ( aExp == 0x7FF ) {
2758 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2759 float_raise( float_flag_invalid );
2760 return float64_default_nan;
2761 }
2762 if ( aExp == 0 ) {
2763 aExp = 1;
2764 bExp = 1;
2765 }
2766 if ( bSig < aSig ) goto aBigger;
2767 if ( aSig < bSig ) goto bBigger;
2768 return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2769 bExpBigger:
2770 if ( bExp == 0x7FF ) {
2771 if ( bSig ) return propagateFloat64NaN( a, b );
2772 return packFloat64( zSign ^ 1, 0x7FF, 0 );
2773 }
2774 if ( aExp == 0 ) {
2775 ++expDiff;
2776 }
2777 else {
2778 aSig |= LIT64( 0x4000000000000000 );
2779 }
2780 shift64RightJamming( aSig, - expDiff, &aSig );
2781 bSig |= LIT64( 0x4000000000000000 );
2782 bBigger:
2783 zSig = bSig - aSig;
2784 zExp = bExp;
2785 zSign ^= 1;
2786 goto normalizeRoundAndPack;
2787 aExpBigger:
2788 if ( aExp == 0x7FF ) {
2789 if ( aSig ) return propagateFloat64NaN( a, b );
2790 return a;
2791 }
2792 if ( bExp == 0 ) {
2793 --expDiff;
2794 }
2795 else {
2796 bSig |= LIT64( 0x4000000000000000 );
2797 }
2798 shift64RightJamming( bSig, expDiff, &bSig );
2799 aSig |= LIT64( 0x4000000000000000 );
2800 aBigger:
2801 zSig = aSig - bSig;
2802 zExp = aExp;
2803 normalizeRoundAndPack:
2804 --zExp;
2805 return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2806
2807 }
2808
2809 /*
2810 -------------------------------------------------------------------------------
2811 Returns the result of adding the double-precision floating-point values `a'
2812 and `b'. The operation is performed according to the IEC/IEEE Standard for
2813 Binary Floating-Point Arithmetic.
2814 -------------------------------------------------------------------------------
2815 */
2816 float64 float64_add( float64 a, float64 b )
2817 {
2818 flag aSign, bSign;
2819
2820 aSign = extractFloat64Sign( a );
2821 bSign = extractFloat64Sign( b );
2822 if ( aSign == bSign ) {
2823 return addFloat64Sigs( a, b, aSign );
2824 }
2825 else {
2826 return subFloat64Sigs( a, b, aSign );
2827 }
2828
2829 }
2830
2831 /*
2832 -------------------------------------------------------------------------------
2833 Returns the result of subtracting the double-precision floating-point values
2834 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2835 for Binary Floating-Point Arithmetic.
2836 -------------------------------------------------------------------------------
2837 */
2838 float64 float64_sub( float64 a, float64 b )
2839 {
2840 flag aSign, bSign;
2841
2842 aSign = extractFloat64Sign( a );
2843 bSign = extractFloat64Sign( b );
2844 if ( aSign == bSign ) {
2845 return subFloat64Sigs( a, b, aSign );
2846 }
2847 else {
2848 return addFloat64Sigs( a, b, aSign );
2849 }
2850
2851 }
2852
2853 /*
2854 -------------------------------------------------------------------------------
2855 Returns the result of multiplying the double-precision floating-point values
2856 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2857 for Binary Floating-Point Arithmetic.
2858 -------------------------------------------------------------------------------
2859 */
2860 float64 float64_mul( float64 a, float64 b )
2861 {
2862 flag aSign, bSign, zSign;
2863 int16 aExp, bExp, zExp;
2864 bits64 aSig, bSig, zSig0, zSig1;
2865
2866 aSig = extractFloat64Frac( a );
2867 aExp = extractFloat64Exp( a );
2868 aSign = extractFloat64Sign( a );
2869 bSig = extractFloat64Frac( b );
2870 bExp = extractFloat64Exp( b );
2871 bSign = extractFloat64Sign( b );
2872 zSign = aSign ^ bSign;
2873 if ( aExp == 0x7FF ) {
2874 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2875 return propagateFloat64NaN( a, b );
2876 }
2877 if ( ( bExp | bSig ) == 0 ) {
2878 float_raise( float_flag_invalid );
2879 return float64_default_nan;
2880 }
2881 return packFloat64( zSign, 0x7FF, 0 );
2882 }
2883 if ( bExp == 0x7FF ) {
2884 if ( bSig ) return propagateFloat64NaN( a, b );
2885 if ( ( aExp | aSig ) == 0 ) {
2886 float_raise( float_flag_invalid );
2887 return float64_default_nan;
2888 }
2889 return packFloat64( zSign, 0x7FF, 0 );
2890 }
2891 if ( aExp == 0 ) {
2892 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2893 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2894 }
2895 if ( bExp == 0 ) {
2896 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2897 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2898 }
2899 zExp = aExp + bExp - 0x3FF;
2900 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2901 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2902 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2903 zSig0 |= ( zSig1 != 0 );
2904 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2905 zSig0 <<= 1;
2906 --zExp;
2907 }
2908 return roundAndPackFloat64( zSign, zExp, zSig0 );
2909
2910 }
2911
2912 /*
2913 -------------------------------------------------------------------------------
2914 Returns the result of dividing the double-precision floating-point value `a'
2915 by the corresponding value `b'. The operation is performed according to
2916 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2917 -------------------------------------------------------------------------------
2918 */
2919 float64 float64_div( float64 a, float64 b )
2920 {
2921 flag aSign, bSign, zSign;
2922 int16 aExp, bExp, zExp;
2923 bits64 aSig, bSig, zSig;
2924 bits64 rem0, rem1;
2925 bits64 term0, term1;
2926
2927 aSig = extractFloat64Frac( a );
2928 aExp = extractFloat64Exp( a );
2929 aSign = extractFloat64Sign( a );
2930 bSig = extractFloat64Frac( b );
2931 bExp = extractFloat64Exp( b );
2932 bSign = extractFloat64Sign( b );
2933 zSign = aSign ^ bSign;
2934 if ( aExp == 0x7FF ) {
2935 if ( aSig ) return propagateFloat64NaN( a, b );
2936 if ( bExp == 0x7FF ) {
2937 if ( bSig ) return propagateFloat64NaN( a, b );
2938 float_raise( float_flag_invalid );
2939 return float64_default_nan;
2940 }
2941 return packFloat64( zSign, 0x7FF, 0 );
2942 }
2943 if ( bExp == 0x7FF ) {
2944 if ( bSig ) return propagateFloat64NaN( a, b );
2945 return packFloat64( zSign, 0, 0 );
2946 }
2947 if ( bExp == 0 ) {
2948 if ( bSig == 0 ) {
2949 if ( ( aExp | aSig ) == 0 ) {
2950 float_raise( float_flag_invalid );
2951 return float64_default_nan;
2952 }
2953 float_raise( float_flag_divbyzero );
2954 return packFloat64( zSign, 0x7FF, 0 );
2955 }
2956 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2957 }
2958 if ( aExp == 0 ) {
2959 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2960 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2961 }
2962 zExp = aExp - bExp + 0x3FD;
2963 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2964 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2965 if ( bSig <= ( aSig + aSig ) ) {
2966 aSig >>= 1;
2967 ++zExp;
2968 }
2969 zSig = estimateDiv128To64( aSig, 0, bSig );
2970 if ( ( zSig & 0x1FF ) <= 2 ) {
2971 mul64To128( bSig, zSig, &term0, &term1 );
2972 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2973 while ( (sbits64) rem0 < 0 ) {
2974 --zSig;
2975 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2976 }
2977 zSig |= ( rem1 != 0 );
2978 }
2979 return roundAndPackFloat64( zSign, zExp, zSig );
2980
2981 }
2982
2983 #ifndef SOFTFLOAT_FOR_GCC
2984 /*
2985 -------------------------------------------------------------------------------
2986 Returns the remainder of the double-precision floating-point value `a'
2987 with respect to the corresponding value `b'. The operation is performed
2988 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2989 -------------------------------------------------------------------------------
2990 */
2991 float64 float64_rem( float64 a, float64 b )
2992 {
2993 flag aSign, bSign, zSign;
2994 int16 aExp, bExp, expDiff;
2995 bits64 aSig, bSig;
2996 bits64 q, alternateASig;
2997 sbits64 sigMean;
2998
2999 aSig = extractFloat64Frac( a );
3000 aExp = extractFloat64Exp( a );
3001 aSign = extractFloat64Sign( a );
3002 bSig = extractFloat64Frac( b );
3003 bExp = extractFloat64Exp( b );
3004 bSign = extractFloat64Sign( b );
3005 if ( aExp == 0x7FF ) {
3006 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3007 return propagateFloat64NaN( a, b );
3008 }
3009 float_raise( float_flag_invalid );
3010 return float64_default_nan;
3011 }
3012 if ( bExp == 0x7FF ) {
3013 if ( bSig ) return propagateFloat64NaN( a, b );
3014 return a;
3015 }
3016 if ( bExp == 0 ) {
3017 if ( bSig == 0 ) {
3018 float_raise( float_flag_invalid );
3019 return float64_default_nan;
3020 }
3021 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3022 }
3023 if ( aExp == 0 ) {
3024 if ( aSig == 0 ) return a;
3025 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3026 }
3027 expDiff = aExp - bExp;
3028 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3029 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3030 if ( expDiff < 0 ) {
3031 if ( expDiff < -1 ) return a;
3032 aSig >>= 1;
3033 }
3034 q = ( bSig <= aSig );
3035 if ( q ) aSig -= bSig;
3036 expDiff -= 64;
3037 while ( 0 < expDiff ) {
3038 q = estimateDiv128To64( aSig, 0, bSig );
3039 q = ( 2 < q ) ? q - 2 : 0;
3040 aSig = - ( ( bSig>>2 ) * q );
3041 expDiff -= 62;
3042 }
3043 expDiff += 64;
3044 if ( 0 < expDiff ) {
3045 q = estimateDiv128To64( aSig, 0, bSig );
3046 q = ( 2 < q ) ? q - 2 : 0;
3047 q >>= 64 - expDiff;
3048 bSig >>= 2;
3049 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3050 }
3051 else {
3052 aSig >>= 2;
3053 bSig >>= 2;
3054 }
3055 do {
3056 alternateASig = aSig;
3057 ++q;
3058 aSig -= bSig;
3059 } while ( 0 <= (sbits64) aSig );
3060 sigMean = aSig + alternateASig;
3061 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3062 aSig = alternateASig;
3063 }
3064 zSign = ( (sbits64) aSig < 0 );
3065 if ( zSign ) aSig = - aSig;
3066 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3067
3068 }
3069
3070 /*
3071 -------------------------------------------------------------------------------
3072 Returns the square root of the double-precision floating-point value `a'.
3073 The operation is performed according to the IEC/IEEE Standard for Binary
3074 Floating-Point Arithmetic.
3075 -------------------------------------------------------------------------------
3076 */
3077 float64 float64_sqrt( float64 a )
3078 {
3079 flag aSign;
3080 int16 aExp, zExp;
3081 bits64 aSig, zSig, doubleZSig;
3082 bits64 rem0, rem1, term0, term1;
3083
3084 aSig = extractFloat64Frac( a );
3085 aExp = extractFloat64Exp( a );
3086 aSign = extractFloat64Sign( a );
3087 if ( aExp == 0x7FF ) {
3088 if ( aSig ) return propagateFloat64NaN( a, a );
3089 if ( ! aSign ) return a;
3090 float_raise( float_flag_invalid );
3091 return float64_default_nan;
3092 }
3093 if ( aSign ) {
3094 if ( ( aExp | aSig ) == 0 ) return a;
3095 float_raise( float_flag_invalid );
3096 return float64_default_nan;
3097 }
3098 if ( aExp == 0 ) {
3099 if ( aSig == 0 ) return 0;
3100 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3101 }
3102 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3103 aSig |= LIT64( 0x0010000000000000 );
3104 zSig = estimateSqrt32( aExp, aSig>>21 );
3105 aSig <<= 9 - ( aExp & 1 );
3106 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3107 if ( ( zSig & 0x1FF ) <= 5 ) {
3108 doubleZSig = zSig<<1;
3109 mul64To128( zSig, zSig, &term0, &term1 );
3110 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3111 while ( (sbits64) rem0 < 0 ) {
3112 --zSig;
3113 doubleZSig -= 2;
3114 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3115 }
3116 zSig |= ( ( rem0 | rem1 ) != 0 );
3117 }
3118 return roundAndPackFloat64( 0, zExp, zSig );
3119
3120 }
3121 #endif
3122
3123 /*
3124 -------------------------------------------------------------------------------
3125 Returns 1 if the double-precision floating-point value `a' is equal to the
3126 corresponding value `b', and 0 otherwise. The comparison is performed
3127 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3128 -------------------------------------------------------------------------------
3129 */
3130 flag float64_eq( float64 a, float64 b )
3131 {
3132
3133 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3134 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3135 ) {
3136 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3137 float_raise( float_flag_invalid );
3138 }
3139 return 0;
3140 }
3141 return ( a == b ) ||
3142 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3143
3144 }
3145
3146 /*
3147 -------------------------------------------------------------------------------
3148 Returns 1 if the double-precision floating-point value `a' is less than or
3149 equal to the corresponding value `b', and 0 otherwise. The comparison is
3150 performed according to the IEC/IEEE Standard for Binary Floating-Point
3151 Arithmetic.
3152 -------------------------------------------------------------------------------
3153 */
3154 flag float64_le( float64 a, float64 b )
3155 {
3156 flag aSign, bSign;
3157
3158 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3159 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3160 ) {
3161 float_raise( float_flag_invalid );
3162 return 0;
3163 }
3164 aSign = extractFloat64Sign( a );
3165 bSign = extractFloat64Sign( b );
3166 if ( aSign != bSign )
3167 return aSign ||
3168 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3169 0 );
3170 return ( a == b ) ||
3171 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3172
3173 }
3174
3175 /*
3176 -------------------------------------------------------------------------------
3177 Returns 1 if the double-precision floating-point value `a' is less than
3178 the corresponding value `b', and 0 otherwise. The comparison is performed
3179 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3180 -------------------------------------------------------------------------------
3181 */
3182 flag float64_lt( float64 a, float64 b )
3183 {
3184 flag aSign, bSign;
3185
3186 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3187 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3188 ) {
3189 float_raise( float_flag_invalid );
3190 return 0;
3191 }
3192 aSign = extractFloat64Sign( a );
3193 bSign = extractFloat64Sign( b );
3194 if ( aSign != bSign )
3195 return aSign &&
3196 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3197 0 );
3198 return ( a != b ) &&
3199 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3200
3201 }
3202
3203 #ifndef SOFTFLOAT_FOR_GCC
3204 /*
3205 -------------------------------------------------------------------------------
3206 Returns 1 if the double-precision floating-point value `a' is equal to the
3207 corresponding value `b', and 0 otherwise. The invalid exception is raised
3208 if either operand is a NaN. Otherwise, the comparison is performed
3209 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3210 -------------------------------------------------------------------------------
3211 */
3212 flag float64_eq_signaling( float64 a, float64 b )
3213 {
3214
3215 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3216 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3217 ) {
3218 float_raise( float_flag_invalid );
3219 return 0;
3220 }
3221 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3222
3223 }
3224
3225 /*
3226 -------------------------------------------------------------------------------
3227 Returns 1 if the double-precision floating-point value `a' is less than or
3228 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3229 cause an exception. Otherwise, the comparison is performed according to the
3230 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3231 -------------------------------------------------------------------------------
3232 */
3233 flag float64_le_quiet( float64 a, float64 b )
3234 {
3235 flag aSign, bSign;
3236
3237 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3238 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3239 ) {
3240 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3241 float_raise( float_flag_invalid );
3242 }
3243 return 0;
3244 }
3245 aSign = extractFloat64Sign( a );
3246 bSign = extractFloat64Sign( b );
3247 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3248 return ( a == b ) || ( aSign ^ ( a < b ) );
3249
3250 }
3251
3252 /*
3253 -------------------------------------------------------------------------------
3254 Returns 1 if the double-precision floating-point value `a' is less than
3255 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3256 exception. Otherwise, the comparison is performed according to the IEC/IEEE
3257 Standard for Binary Floating-Point Arithmetic.
3258 -------------------------------------------------------------------------------
3259 */
3260 flag float64_lt_quiet( float64 a, float64 b )
3261 {
3262 flag aSign, bSign;
3263
3264 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3265 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3266 ) {
3267 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3268 float_raise( float_flag_invalid );
3269 }
3270 return 0;
3271 }
3272 aSign = extractFloat64Sign( a );
3273 bSign = extractFloat64Sign( b );
3274 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3275 return ( a != b ) && ( aSign ^ ( a < b ) );
3276
3277 }
3278 #endif
3279
3280 #ifdef FLOATX80
3281
3282 /*
3283 -------------------------------------------------------------------------------
3284 Returns the result of converting the extended double-precision floating-
3285 point value `a' to the 32-bit two's complement integer format. The
3286 conversion is performed according to the IEC/IEEE Standard for Binary
3287 Floating-Point Arithmetic---which means in particular that the conversion
3288 is rounded according to the current rounding mode. If `a' is a NaN, the
3289 largest positive integer is returned. Otherwise, if the conversion
3290 overflows, the largest integer with the same sign as `a' is returned.
3291 -------------------------------------------------------------------------------
3292 */
3293 int32 floatx80_to_int32( floatx80 a )
3294 {
3295 flag aSign;
3296 int32 aExp, shiftCount;
3297 bits64 aSig;
3298
3299 aSig = extractFloatx80Frac( a );
3300 aExp = extractFloatx80Exp( a );
3301 aSign = extractFloatx80Sign( a );
3302 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3303 shiftCount = 0x4037 - aExp;
3304 if ( shiftCount <= 0 ) shiftCount = 1;
3305 shift64RightJamming( aSig, shiftCount, &aSig );
3306 return roundAndPackInt32( aSign, aSig );
3307
3308 }
3309
3310 /*
3311 -------------------------------------------------------------------------------
3312 Returns the result of converting the extended double-precision floating-
3313 point value `a' to the 32-bit two's complement integer format. The
3314 conversion is performed according to the IEC/IEEE Standard for Binary
3315 Floating-Point Arithmetic, except that the conversion is always rounded
3316 toward zero. If `a' is a NaN, the largest positive integer is returned.
3317 Otherwise, if the conversion overflows, the largest integer with the same
3318 sign as `a' is returned.
3319 -------------------------------------------------------------------------------
3320 */
3321 int32 floatx80_to_int32_round_to_zero( floatx80 a )
3322 {
3323 flag aSign;
3324 int32 aExp, shiftCount;
3325 bits64 aSig, savedASig;
3326 int32 z;
3327
3328 aSig = extractFloatx80Frac( a );
3329 aExp = extractFloatx80Exp( a );
3330 aSign = extractFloatx80Sign( a );
3331 if ( 0x401E < aExp ) {
3332 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3333 goto invalid;
3334 }
3335 else if ( aExp < 0x3FFF ) {
3336 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
3337 return 0;
3338 }
3339 shiftCount = 0x403E - aExp;
3340 savedASig = aSig;
3341 aSig >>= shiftCount;
3342 z = aSig;
3343 if ( aSign ) z = - z;
3344 if ( ( z < 0 ) ^ aSign ) {
3345 invalid:
3346 float_raise( float_flag_invalid );
3347 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3348 }
3349 if ( ( aSig<<shiftCount ) != savedASig ) {
3350 float_exception_flags |= float_flag_inexact;
3351 }
3352 return z;
3353
3354 }
3355
3356 /*
3357 -------------------------------------------------------------------------------
3358 Returns the result of converting the extended double-precision floating-
3359 point value `a' to the 64-bit two's complement integer format. The
3360 conversion is performed according to the IEC/IEEE Standard for Binary
3361 Floating-Point Arithmetic---which means in particular that the conversion
3362 is rounded according to the current rounding mode. If `a' is a NaN,
3363 the largest positive integer is returned. Otherwise, if the conversion
3364 overflows, the largest integer with the same sign as `a' is returned.
3365 -------------------------------------------------------------------------------
3366 */
3367 int64 floatx80_to_int64( floatx80 a )
3368 {
3369 flag aSign;
3370 int32 aExp, shiftCount;
3371 bits64 aSig, aSigExtra;
3372
3373 aSig = extractFloatx80Frac( a );
3374 aExp = extractFloatx80Exp( a );
3375 aSign = extractFloatx80Sign( a );
3376 shiftCount = 0x403E - aExp;
3377 if ( shiftCount <= 0 ) {
3378 if ( shiftCount ) {
3379 float_raise( float_flag_invalid );
3380 if ( ! aSign
3381 || ( ( aExp == 0x7FFF )
3382 && ( aSig != LIT64( 0x8000000000000000 ) ) )
3383 ) {
3384 return LIT64( 0x7FFFFFFFFFFFFFFF );
3385 }
3386 return (sbits64) LIT64( 0x8000000000000000 );
3387 }
3388 aSigExtra = 0;
3389 }
3390 else {
3391 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3392 }
3393 return roundAndPackInt64( aSign, aSig, aSigExtra );
3394
3395 }
3396
3397 /*
3398 -------------------------------------------------------------------------------
3399 Returns the result of converting the extended double-precision floating-
3400 point value `a' to the 64-bit two's complement integer format. The
3401 conversion is performed according to the IEC/IEEE Standard for Binary
3402 Floating-Point Arithmetic, except that the conversion is always rounded
3403 toward zero. If `a' is a NaN, the largest positive integer is returned.
3404 Otherwise, if the conversion overflows, the largest integer with the same
3405 sign as `a' is returned.
3406 -------------------------------------------------------------------------------
3407 */
3408 int64 floatx80_to_int64_round_to_zero( floatx80 a )
3409 {
3410 flag aSign;
3411 int32 aExp, shiftCount;
3412 bits64 aSig;
3413 int64 z;
3414
3415 aSig = extractFloatx80Frac( a );
3416 aExp = extractFloatx80Exp( a );
3417 aSign = extractFloatx80Sign( a );
3418 shiftCount = aExp - 0x403E;
3419 if ( 0 <= shiftCount ) {
3420 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3421 if ( ( a.high != 0xC03E ) || aSig ) {
3422 float_raise( float_flag_invalid );
3423 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3424 return LIT64( 0x7FFFFFFFFFFFFFFF );
3425 }
3426 }
3427 return (sbits64) LIT64( 0x8000000000000000 );
3428 }
3429 else if ( aExp < 0x3FFF ) {
3430 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
3431 return 0;
3432 }
3433 z = aSig>>( - shiftCount );
3434 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3435 float_exception_flags |= float_flag_inexact;
3436 }
3437 if ( aSign ) z = - z;
3438 return z;
3439
3440 }
3441
3442 /*
3443 -------------------------------------------------------------------------------
3444 Returns the result of converting the extended double-precision floating-
3445 point value `a' to the single-precision floating-point format. The
3446 conversion is performed according to the IEC/IEEE Standard for Binary
3447 Floating-Point Arithmetic.
3448 -------------------------------------------------------------------------------
3449 */
3450 float32 floatx80_to_float32( floatx80 a )
3451 {
3452 flag aSign;
3453 int32 aExp;
3454 bits64 aSig;
3455
3456 aSig = extractFloatx80Frac( a );
3457 aExp = extractFloatx80Exp( a );
3458 aSign = extractFloatx80Sign( a );
3459 if ( aExp == 0x7FFF ) {
3460 if ( (bits64) ( aSig<<1 ) ) {
3461 return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3462 }
3463 return packFloat32( aSign, 0xFF, 0 );
3464 }
3465 shift64RightJamming( aSig, 33, &aSig );
3466 if ( aExp || aSig ) aExp -= 0x3F81;
3467 return roundAndPackFloat32( aSign, aExp, aSig );
3468
3469 }
3470
3471 /*
3472 -------------------------------------------------------------------------------
3473 Returns the result of converting the extended double-precision floating-
3474 point value `a' to the double-precision floating-point format. The
3475 conversion is performed according to the IEC/IEEE Standard for Binary
3476 Floating-Point Arithmetic.
3477 -------------------------------------------------------------------------------
3478 */
3479 float64 floatx80_to_float64( floatx80 a )
3480 {
3481 flag aSign;
3482 int32 aExp;
3483 bits64 aSig, zSig;
3484
3485 aSig = extractFloatx80Frac( a );
3486 aExp = extractFloatx80Exp( a );
3487 aSign = extractFloatx80Sign( a );
3488 if ( aExp == 0x7FFF ) {
3489 if ( (bits64) ( aSig<<1 ) ) {
3490 return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3491 }
3492 return packFloat64( aSign, 0x7FF, 0 );
3493 }
3494 shift64RightJamming( aSig, 1, &zSig );
3495 if ( aExp || aSig ) aExp -= 0x3C01;
3496 return roundAndPackFloat64( aSign, aExp, zSig );
3497
3498 }
3499
3500 #ifdef FLOAT128
3501
3502 /*
3503 -------------------------------------------------------------------------------
3504 Returns the result of converting the extended double-precision floating-
3505 point value `a' to the quadruple-precision floating-point format. The
3506 conversion is performed according to the IEC/IEEE Standard for Binary
3507 Floating-Point Arithmetic.
3508 -------------------------------------------------------------------------------
3509 */
3510 float128 floatx80_to_float128( floatx80 a )
3511 {
3512 flag aSign;
3513 int16 aExp;
3514 bits64 aSig, zSig0, zSig1;
3515
3516 aSig = extractFloatx80Frac( a );
3517 aExp = extractFloatx80Exp( a );
3518 aSign = extractFloatx80Sign( a );
3519 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3520 return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3521 }
3522 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3523 return packFloat128( aSign, aExp, zSig0, zSig1 );
3524
3525 }
3526
3527 #endif
3528
3529 /*
3530 -------------------------------------------------------------------------------
3531 Rounds the extended double-precision floating-point value `a' to an integer,
3532 and returns the result as an extended quadruple-precision floating-point
3533 value. The operation is performed according to the IEC/IEEE Standard for
3534 Binary Floating-Point Arithmetic.
3535 -------------------------------------------------------------------------------
3536 */
3537 floatx80 floatx80_round_to_int( floatx80 a )
3538 {
3539 flag aSign;
3540 int32 aExp;
3541 bits64 lastBitMask, roundBitsMask;
3542 int8 roundingMode;
3543 floatx80 z;
3544
3545 aExp = extractFloatx80Exp( a );
3546 if ( 0x403E <= aExp ) {
3547 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3548 return propagateFloatx80NaN( a, a );
3549 }
3550 return a;
3551 }
3552 if ( aExp < 0x3FFF ) {
3553 if ( ( aExp == 0 )
3554 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3555 return a;
3556 }
3557 float_exception_flags |= float_flag_inexact;
3558 aSign = extractFloatx80Sign( a );
3559 switch ( float_rounding_mode ) {
3560 case float_round_nearest_even:
3561 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3562 ) {
3563 return
3564 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3565 }
3566 break;
3567 case float_round_to_zero:
3568 break;
3569 case float_round_down:
3570 return
3571 aSign ?
3572 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3573 : packFloatx80( 0, 0, 0 );
3574 case float_round_up:
3575 return
3576 aSign ? packFloatx80( 1, 0, 0 )
3577 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3578 }
3579 return packFloatx80( aSign, 0, 0 );
3580 }
3581 lastBitMask = 1;
3582 lastBitMask <<= 0x403E - aExp;
3583 roundBitsMask = lastBitMask - 1;
3584 z = a;
3585 roundingMode = float_rounding_mode;
3586 if ( roundingMode == float_round_nearest_even ) {
3587 z.low += lastBitMask>>1;
3588 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3589 }
3590 else if ( roundingMode != float_round_to_zero ) {
3591 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3592 z.low += roundBitsMask;
3593 }
3594 }
3595 z.low &= ~ roundBitsMask;
3596 if ( z.low == 0 ) {
3597 ++z.high;
3598 z.low = LIT64( 0x8000000000000000 );
3599 }
3600 if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
3601 return z;
3602
3603 }
3604
3605 /*
3606 -------------------------------------------------------------------------------
3607 Returns the result of adding the absolute values of the extended double-
3608 precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3609 negated before being returned. `zSign' is ignored if the result is a NaN.
3610 The addition is performed according to the IEC/IEEE Standard for Binary
3611 Floating-Point Arithmetic.
3612 -------------------------------------------------------------------------------
3613 */
3614 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3615 {
3616 int32 aExp, bExp, zExp;
3617 bits64 aSig, bSig, zSig0, zSig1;
3618 int32 expDiff;
3619
3620 aSig = extractFloatx80Frac( a );
3621 aExp = extractFloatx80Exp( a );
3622 bSig = extractFloatx80Frac( b );
3623 bExp = extractFloatx80Exp( b );
3624 expDiff = aExp - bExp;
3625 if ( 0 < expDiff ) {
3626 if ( aExp == 0x7FFF ) {
3627 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3628 return a;
3629 }
3630 if ( bExp == 0 ) --expDiff;
3631 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3632 zExp = aExp;
3633 }
3634 else if ( expDiff < 0 ) {
3635 if ( bExp == 0x7FFF ) {
3636 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3637 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3638 }
3639 if ( aExp == 0 ) ++expDiff;
3640 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3641 zExp = bExp;
3642 }
3643 else {
3644 if ( aExp == 0x7FFF ) {
3645 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3646 return propagateFloatx80NaN( a, b );
3647 }
3648 return a;
3649 }
3650 zSig1 = 0;
3651 zSig0 = aSig + bSig;
3652 if ( aExp == 0 ) {
3653 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3654 goto roundAndPack;
3655 }
3656 zExp = aExp;
3657 goto shiftRight1;
3658 }
3659 zSig0 = aSig + bSig;
3660 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3661 shiftRight1:
3662 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3663 zSig0 |= LIT64( 0x8000000000000000 );
3664 ++zExp;
3665 roundAndPack:
3666 return
3667 roundAndPackFloatx80(
3668 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3669
3670 }
3671
3672 /*
3673 -------------------------------------------------------------------------------
3674 Returns the result of subtracting the absolute values of the extended
3675 double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3676 difference is negated before being returned. `zSign' is ignored if the
3677 result is a NaN. The subtraction is performed according to the IEC/IEEE
3678 Standard for Binary Floating-Point Arithmetic.
3679 -------------------------------------------------------------------------------
3680 */
3681 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3682 {
3683 int32 aExp, bExp, zExp;
3684 bits64 aSig, bSig, zSig0, zSig1;
3685 int32 expDiff;
3686 floatx80 z;
3687
3688 aSig = extractFloatx80Frac( a );
3689 aExp = extractFloatx80Exp( a );
3690 bSig = extractFloatx80Frac( b );
3691 bExp = extractFloatx80Exp( b );
3692 expDiff = aExp - bExp;
3693 if ( 0 < expDiff ) goto aExpBigger;
3694 if ( expDiff < 0 ) goto bExpBigger;
3695 if ( aExp == 0x7FFF ) {
3696 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3697 return propagateFloatx80NaN( a, b );
3698 }
3699 float_raise( float_flag_invalid );
3700 z.low = floatx80_default_nan_low;
3701 z.high = floatx80_default_nan_high;
3702 return z;
3703 }
3704 if ( aExp == 0 ) {
3705 aExp = 1;
3706 bExp = 1;
3707 }
3708 zSig1 = 0;
3709 if ( bSig < aSig ) goto aBigger;
3710 if ( aSig < bSig ) goto bBigger;
3711 return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
3712 bExpBigger:
3713 if ( bExp == 0x7FFF ) {
3714 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3715 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3716 }
3717 if ( aExp == 0 ) ++expDiff;
3718 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3719 bBigger:
3720 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3721 zExp = bExp;
3722 zSign ^= 1;
3723 goto normalizeRoundAndPack;
3724 aExpBigger:
3725 if ( aExp == 0x7FFF ) {
3726 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3727 return a;
3728 }
3729 if ( bExp == 0 ) --expDiff;
3730 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3731 aBigger:
3732 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3733 zExp = aExp;
3734 normalizeRoundAndPack:
3735 return
3736 normalizeRoundAndPackFloatx80(
3737 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3738
3739 }
3740
3741 /*
3742 -------------------------------------------------------------------------------
3743 Returns the result of adding the extended double-precision floating-point
3744 values `a' and `b'. The operation is performed according to the IEC/IEEE
3745 Standard for Binary Floating-Point Arithmetic.
3746 -------------------------------------------------------------------------------
3747 */
3748 floatx80 floatx80_add( floatx80 a, floatx80 b )
3749 {
3750 flag aSign, bSign;
3751
3752 aSign = extractFloatx80Sign( a );
3753 bSign = extractFloatx80Sign( b );
3754 if ( aSign == bSign ) {
3755 return addFloatx80Sigs( a, b, aSign );
3756 }
3757 else {
3758 return subFloatx80Sigs( a, b, aSign );
3759 }
3760
3761 }
3762
3763 /*
3764 -------------------------------------------------------------------------------
3765 Returns the result of subtracting the extended double-precision floating-
3766 point values `a' and `b'. The operation is performed according to the
3767 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3768 -------------------------------------------------------------------------------
3769 */
3770 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3771 {
3772 flag aSign, bSign;
3773
3774 aSign = extractFloatx80Sign( a );
3775 bSign = extractFloatx80Sign( b );
3776 if ( aSign == bSign ) {
3777 return subFloatx80Sigs( a, b, aSign );
3778 }
3779 else {
3780 return addFloatx80Sigs( a, b, aSign );
3781 }
3782
3783 }
3784
3785 /*
3786 -------------------------------------------------------------------------------
3787 Returns the result of multiplying the extended double-precision floating-
3788 point values `a' and `b'. The operation is performed according to the
3789 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3790 -------------------------------------------------------------------------------
3791 */
3792 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3793 {
3794 flag aSign, bSign, zSign;
3795 int32 aExp, bExp, zExp;
3796 bits64 aSig, bSig, zSig0, zSig1;
3797 floatx80 z;
3798
3799 aSig = extractFloatx80Frac( a );
3800 aExp = extractFloatx80Exp( a );
3801 aSign = extractFloatx80Sign( a );
3802 bSig = extractFloatx80Frac( b );
3803 bExp = extractFloatx80Exp( b );
3804 bSign = extractFloatx80Sign( b );
3805 zSign = aSign ^ bSign;
3806 if ( aExp == 0x7FFF ) {
3807 if ( (bits64) ( aSig<<1 )
3808 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3809 return propagateFloatx80NaN( a, b );
3810 }
3811 if ( ( bExp | bSig ) == 0 ) goto invalid;
3812 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3813 }
3814 if ( bExp == 0x7FFF ) {
3815 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3816 if ( ( aExp | aSig ) == 0 ) {
3817 invalid:
3818 float_raise( float_flag_invalid );
3819 z.low = floatx80_default_nan_low;
3820 z.high = floatx80_default_nan_high;
3821 return z;
3822 }
3823 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3824 }
3825 if ( aExp == 0 ) {
3826 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3827 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3828 }
3829 if ( bExp == 0 ) {
3830 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3831 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3832 }
3833 zExp = aExp + bExp - 0x3FFE;
3834 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3835 if ( 0 < (sbits64) zSig0 ) {
3836 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3837 --zExp;
3838 }
3839 return
3840 roundAndPackFloatx80(
3841 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3842
3843 }
3844
3845 /*
3846 -------------------------------------------------------------------------------
3847 Returns the result of dividing the extended double-precision floating-point
3848 value `a' by the corresponding value `b'. The operation is performed
3849 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3850 -------------------------------------------------------------------------------
3851 */
3852 floatx80 floatx80_div( floatx80 a, floatx80 b )
3853 {
3854 flag aSign, bSign, zSign;
3855 int32 aExp, bExp, zExp;
3856 bits64 aSig, bSig, zSig0, zSig1;
3857 bits64 rem0, rem1, rem2, term0, term1, term2;
3858 floatx80 z;
3859
3860 aSig = extractFloatx80Frac( a );
3861 aExp = extractFloatx80Exp( a );
3862 aSign = extractFloatx80Sign( a );
3863 bSig = extractFloatx80Frac( b );
3864 bExp = extractFloatx80Exp( b );
3865 bSign = extractFloatx80Sign( b );
3866 zSign = aSign ^ bSign;
3867 if ( aExp == 0x7FFF ) {
3868 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3869 if ( bExp == 0x7FFF ) {
3870 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3871 goto invalid;
3872 }
3873 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3874 }
3875 if ( bExp == 0x7FFF ) {
3876 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3877 return packFloatx80( zSign, 0, 0 );
3878 }
3879 if ( bExp == 0 ) {
3880 if ( bSig == 0 ) {
3881 if ( ( aExp | aSig ) == 0 ) {
3882 invalid:
3883 float_raise( float_flag_invalid );
3884 z.low = floatx80_default_nan_low;
3885 z.high = floatx80_default_nan_high;
3886 return z;
3887 }
3888 float_raise( float_flag_divbyzero );
3889 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3890 }
3891 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3892 }
3893 if ( aExp == 0 ) {
3894 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3895 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3896 }
3897 zExp = aExp - bExp + 0x3FFE;
3898 rem1 = 0;
3899 if ( bSig <= aSig ) {
3900 shift128Right( aSig, 0, 1, &aSig, &rem1 );
3901 ++zExp;
3902 }
3903 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3904 mul64To128( bSig, zSig0, &term0, &term1 );
3905 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3906 while ( (sbits64) rem0 < 0 ) {
3907 --zSig0;
3908 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3909 }
3910 zSig1 = estimateDiv128To64( rem1, 0, bSig );
3911 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3912 mul64To128( bSig, zSig1, &term1, &term2 );
3913 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3914 while ( (sbits64) rem1 < 0 ) {
3915 --zSig1;
3916 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3917 }
3918 zSig1 |= ( ( rem1 | rem2 ) != 0 );
3919 }
3920 return
3921 roundAndPackFloatx80(
3922 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3923
3924 }
3925
3926 /*
3927 -------------------------------------------------------------------------------
3928 Returns the remainder of the extended double-precision floating-point value
3929 `a' with respect to the corresponding value `b'. The operation is performed
3930 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3931 -------------------------------------------------------------------------------
3932 */
3933 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3934 {
3935 flag aSign, bSign, zSign;
3936 int32 aExp, bExp, expDiff;
3937 bits64 aSig0, aSig1, bSig;
3938 bits64 q, term0, term1, alternateASig0, alternateASig1;
3939 floatx80 z;
3940
3941 aSig0 = extractFloatx80Frac( a );
3942 aExp = extractFloatx80Exp( a );
3943 aSign = extractFloatx80Sign( a );
3944 bSig = extractFloatx80Frac( b );
3945 bExp = extractFloatx80Exp( b );
3946 bSign = extractFloatx80Sign( b );
3947 if ( aExp == 0x7FFF ) {
3948 if ( (bits64) ( aSig0<<1 )
3949 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3950 return propagateFloatx80NaN( a, b );
3951 }
3952 goto invalid;
3953 }
3954 if ( bExp == 0x7FFF ) {
3955 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3956 return a;
3957 }
3958 if ( bExp == 0 ) {
3959 if ( bSig == 0 ) {
3960 invalid:
3961 float_raise( float_flag_invalid );
3962 z.low = floatx80_default_nan_low;
3963 z.high = floatx80_default_nan_high;
3964 return z;
3965 }
3966 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3967 }
3968 if ( aExp == 0 ) {
3969 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3970 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3971 }
3972 bSig |= LIT64( 0x8000000000000000 );
3973 zSign = aSign;
3974 expDiff = aExp - bExp;
3975 aSig1 = 0;
3976 if ( expDiff < 0 ) {
3977 if ( expDiff < -1 ) return a;
3978 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3979 expDiff = 0;
3980 }
3981 q = ( bSig <= aSig0 );
3982 if ( q ) aSig0 -= bSig;
3983 expDiff -= 64;
3984 while ( 0 < expDiff ) {
3985 q = estimateDiv128To64( aSig0, aSig1, bSig );
3986 q = ( 2 < q ) ? q - 2 : 0;
3987 mul64To128( bSig, q, &term0, &term1 );
3988 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3989 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3990 expDiff -= 62;
3991 }
3992 expDiff += 64;
3993 if ( 0 < expDiff ) {
3994 q = estimateDiv128To64( aSig0, aSig1, bSig );
3995 q = ( 2 < q ) ? q - 2 : 0;
3996 q >>= 64 - expDiff;
3997 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3998 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3999 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4000 while ( le128( term0, term1, aSig0, aSig1 ) ) {
4001 ++q;
4002 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4003 }
4004 }
4005 else {
4006 term1 = 0;
4007 term0 = bSig;
4008 }
4009 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4010 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4011 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4012 && ( q & 1 ) )
4013 ) {
4014 aSig0 = alternateASig0;
4015 aSig1 = alternateASig1;
4016 zSign = ! zSign;
4017 }
4018 return
4019 normalizeRoundAndPackFloatx80(
4020 80, zSign, bExp + expDiff, aSig0, aSig1 );
4021
4022 }
4023
4024 /*
4025 -------------------------------------------------------------------------------
4026 Returns the square root of the extended double-precision floating-point
4027 value `a'. The operation is performed according to the IEC/IEEE Standard
4028 for Binary Floating-Point Arithmetic.
4029 -------------------------------------------------------------------------------
4030 */
4031 floatx80 floatx80_sqrt( floatx80 a )
4032 {
4033 flag aSign;
4034 int32 aExp, zExp;
4035 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4036 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4037 floatx80 z;
4038
4039 aSig0 = extractFloatx80Frac( a );
4040 aExp = extractFloatx80Exp( a );
4041 aSign = extractFloatx80Sign( a );
4042 if ( aExp == 0x7FFF ) {
4043 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
4044 if ( ! aSign ) return a;
4045 goto invalid;
4046 }
4047 if ( aSign ) {
4048 if ( ( aExp | aSig0 ) == 0 ) return a;
4049 invalid:
4050 float_raise( float_flag_invalid );
4051 z.low = floatx80_default_nan_low;
4052 z.high = floatx80_default_nan_high;
4053 return z;
4054 }
4055 if ( aExp == 0 ) {
4056 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4057 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4058 }
4059 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4060 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4061 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4062 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4063 doubleZSig0 = zSig0<<1;
4064 mul64To128( zSig0, zSig0, &term0, &term1 );
4065 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4066 while ( (sbits64) rem0 < 0 ) {
4067 --zSig0;
4068 doubleZSig0 -= 2;
4069 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4070 }
4071 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4072 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4073 if ( zSig1 == 0 ) zSig1 = 1;
4074 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4075 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4076 mul64To128( zSig1, zSig1, &term2, &term3 );
4077 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4078 while ( (sbits64) rem1 < 0 ) {
4079 --zSig1;
4080 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4081 term3 |= 1;
4082 term2 |= doubleZSig0;
4083 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4084 }
4085 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4086 }
4087 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4088 zSig0 |= doubleZSig0;
4089 return
4090 roundAndPackFloatx80(
4091 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4092
4093 }
4094
4095 /*
4096 -------------------------------------------------------------------------------
4097 Returns 1 if the extended double-precision floating-point value `a' is
4098 equal to the corresponding value `b', and 0 otherwise. The comparison is
4099 performed according to the IEC/IEEE Standard for Binary Floating-Point
4100 Arithmetic.
4101 -------------------------------------------------------------------------------
4102 */
4103 flag floatx80_eq( floatx80 a, floatx80 b )
4104 {
4105
4106 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4107 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4108 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4109 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4110 ) {
4111 if ( floatx80_is_signaling_nan( a )
4112 || floatx80_is_signaling_nan( b ) ) {
4113 float_raise( float_flag_invalid );
4114 }
4115 return 0;
4116 }
4117 return
4118 ( a.low == b.low )
4119 && ( ( a.high == b.high )
4120 || ( ( a.low == 0 )
4121 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4122 );
4123
4124 }
4125
4126 /*
4127 -------------------------------------------------------------------------------
4128 Returns 1 if the extended double-precision floating-point value `a' is
4129 less than or equal to the corresponding value `b', and 0 otherwise. The
4130 comparison is performed according to the IEC/IEEE Standard for Binary
4131 Floating-Point Arithmetic.
4132 -------------------------------------------------------------------------------
4133 */
4134 flag floatx80_le( floatx80 a, floatx80 b )
4135 {
4136 flag aSign, bSign;
4137
4138 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4139 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4140 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4141 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4142 ) {
4143 float_raise( float_flag_invalid );
4144 return 0;
4145 }
4146 aSign = extractFloatx80Sign( a );
4147 bSign = extractFloatx80Sign( b );
4148 if ( aSign != bSign ) {
4149 return
4150 aSign
4151 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4152 == 0 );
4153 }
4154 return
4155 aSign ? le128( b.high, b.low, a.high, a.low )
4156 : le128( a.high, a.low, b.high, b.low );
4157
4158 }
4159
4160 /*
4161 -------------------------------------------------------------------------------
4162 Returns 1 if the extended double-precision floating-point value `a' is
4163 less than the corresponding value `b', and 0 otherwise. The comparison
4164 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4165 Arithmetic.
4166 -------------------------------------------------------------------------------
4167 */
4168 flag floatx80_lt( floatx80 a, floatx80 b )
4169 {
4170 flag aSign, bSign;
4171
4172 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4173 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4174 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4175 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4176 ) {
4177 float_raise( float_flag_invalid );
4178 return 0;
4179 }
4180 aSign = extractFloatx80Sign( a );
4181 bSign = extractFloatx80Sign( b );
4182 if ( aSign != bSign ) {
4183 return
4184 aSign
4185 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4186 != 0 );
4187 }
4188 return
4189 aSign ? lt128( b.high, b.low, a.high, a.low )
4190 : lt128( a.high, a.low, b.high, b.low );
4191
4192 }
4193
4194 /*
4195 -------------------------------------------------------------------------------
4196 Returns 1 if the extended double-precision floating-point value `a' is equal
4197 to the corresponding value `b', and 0 otherwise. The invalid exception is
4198 raised if either operand is a NaN. Otherwise, the comparison is performed
4199 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4200 -------------------------------------------------------------------------------
4201 */
4202 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4203 {
4204
4205 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4206 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4207 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4208 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4209 ) {
4210 float_raise( float_flag_invalid );
4211 return 0;
4212 }
4213 return
4214 ( a.low == b.low )
4215 && ( ( a.high == b.high )
4216 || ( ( a.low == 0 )
4217 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4218 );
4219
4220 }
4221
4222 /*
4223 -------------------------------------------------------------------------------
4224 Returns 1 if the extended double-precision floating-point value `a' is less
4225 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4226 do not cause an exception. Otherwise, the comparison is performed according
4227 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4228 -------------------------------------------------------------------------------
4229 */
4230 flag floatx80_le_quiet( floatx80 a, floatx80 b )
4231 {
4232 flag aSign, bSign;
4233
4234 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4235 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4236 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4237 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4238 ) {
4239 if ( floatx80_is_signaling_nan( a )
4240 || floatx80_is_signaling_nan( b ) ) {
4241 float_raise( float_flag_invalid );
4242 }
4243 return 0;
4244 }
4245 aSign = extractFloatx80Sign( a );
4246 bSign = extractFloatx80Sign( b );
4247 if ( aSign != bSign ) {
4248 return
4249 aSign
4250 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4251 == 0 );
4252 }
4253 return
4254 aSign ? le128( b.high, b.low, a.high, a.low )
4255 : le128( a.high, a.low, b.high, b.low );
4256
4257 }
4258
4259 /*
4260 -------------------------------------------------------------------------------
4261 Returns 1 if the extended double-precision floating-point value `a' is less
4262 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4263 an exception. Otherwise, the comparison is performed according to the
4264 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4265 -------------------------------------------------------------------------------
4266 */
4267 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4268 {
4269 flag aSign, bSign;
4270
4271 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4272 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4273 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4274 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4275 ) {
4276 if ( floatx80_is_signaling_nan( a )
4277 || floatx80_is_signaling_nan( b ) ) {
4278 float_raise( float_flag_invalid );
4279 }
4280 return 0;
4281 }
4282 aSign = extractFloatx80Sign( a );
4283 bSign = extractFloatx80Sign( b );
4284 if ( aSign != bSign ) {
4285 return
4286 aSign
4287 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4288 != 0 );
4289 }
4290 return
4291 aSign ? lt128( b.high, b.low, a.high, a.low )
4292 : lt128( a.high, a.low, b.high, b.low );
4293
4294 }
4295
4296 #endif
4297
4298 #ifdef FLOAT128
4299
4300 /*
4301 -------------------------------------------------------------------------------
4302 Returns the result of converting the quadruple-precision floating-point
4303 value `a' to the 32-bit two's complement integer format. The conversion
4304 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4305 Arithmetic---which means in particular that the conversion is rounded
4306 according to the current rounding mode. If `a' is a NaN, the largest
4307 positive integer is returned. Otherwise, if the conversion overflows, the
4308 largest integer with the same sign as `a' is returned.
4309 -------------------------------------------------------------------------------
4310 */
4311 int32 float128_to_int32( float128 a )
4312 {
4313 flag aSign;
4314 int32 aExp, shiftCount;
4315 bits64 aSig0, aSig1;
4316
4317 aSig1 = extractFloat128Frac1( a );
4318 aSig0 = extractFloat128Frac0( a );
4319 aExp = extractFloat128Exp( a );
4320 aSign = extractFloat128Sign( a );
4321 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4322 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4323 aSig0 |= ( aSig1 != 0 );
4324 shiftCount = 0x4028 - aExp;
4325 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4326 return roundAndPackInt32( aSign, aSig0 );
4327
4328 }
4329
4330 /*
4331 -------------------------------------------------------------------------------
4332 Returns the result of converting the quadruple-precision floating-point
4333 value `a' to the 32-bit two's complement integer format. The conversion
4334 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4335 Arithmetic, except that the conversion is always rounded toward zero. If
4336 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4337 conversion overflows, the largest integer with the same sign as `a' is
4338 returned.
4339 -------------------------------------------------------------------------------
4340 */
4341 int32 float128_to_int32_round_to_zero( float128 a )
4342 {
4343 flag aSign;
4344 int32 aExp, shiftCount;
4345 bits64 aSig0, aSig1, savedASig;
4346 int32 z;
4347
4348 aSig1 = extractFloat128Frac1( a );
4349 aSig0 = extractFloat128Frac0( a );
4350 aExp = extractFloat128Exp( a );
4351 aSign = extractFloat128Sign( a );
4352 aSig0 |= ( aSig1 != 0 );
4353 if ( 0x401E < aExp ) {
4354 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4355 goto invalid;
4356 }
4357 else if ( aExp < 0x3FFF ) {
4358 if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact;
4359 return 0;
4360 }
4361 aSig0 |= LIT64( 0x0001000000000000 );
4362 shiftCount = 0x402F - aExp;
4363 savedASig = aSig0;
4364 aSig0 >>= shiftCount;
4365 z = (int32)aSig0;
4366 if ( aSign ) z = - z;
4367 if ( ( z < 0 ) ^ aSign ) {
4368 invalid:
4369 float_raise( float_flag_invalid );
4370 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4371 }
4372 if ( ( aSig0<<shiftCount ) != savedASig ) {
4373 float_exception_flags |= float_flag_inexact;
4374 }
4375 return z;
4376
4377 }
4378
4379 /*
4380 -------------------------------------------------------------------------------
4381 Returns the result of converting the quadruple-precision floating-point
4382 value `a' to the 64-bit two's complement integer format. The conversion
4383 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4384 Arithmetic---which means in particular that the conversion is rounded
4385 according to the current rounding mode. If `a' is a NaN, the largest
4386 positive integer is returned. Otherwise, if the conversion overflows, the
4387 largest integer with the same sign as `a' is returned.
4388 -------------------------------------------------------------------------------
4389 */
4390 int64 float128_to_int64( float128 a )
4391 {
4392 flag aSign;
4393 int32 aExp, shiftCount;
4394 bits64 aSig0, aSig1;
4395
4396 aSig1 = extractFloat128Frac1( a );
4397 aSig0 = extractFloat128Frac0( a );
4398 aExp = extractFloat128Exp( a );
4399 aSign = extractFloat128Sign( a );
4400 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4401 shiftCount = 0x402F - aExp;
4402 if ( shiftCount <= 0 ) {
4403 if ( 0x403E < aExp ) {
4404 float_raise( float_flag_invalid );
4405 if ( ! aSign
4406 || ( ( aExp == 0x7FFF )
4407 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4408 )
4409 ) {
4410 return LIT64( 0x7FFFFFFFFFFFFFFF );
4411 }
4412 return (sbits64) LIT64( 0x8000000000000000 );
4413 }
4414 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4415 }
4416 else {
4417 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4418 }
4419 return roundAndPackInt64( aSign, aSig0, aSig1 );
4420
4421 }
4422
4423 /*
4424 -------------------------------------------------------------------------------
4425 Returns the result of converting the quadruple-precision floating-point
4426 value `a' to the 64-bit two's complement integer format. The conversion
4427 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4428 Arithmetic, except that the conversion is always rounded toward zero.
4429 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4430 the conversion overflows, the largest integer with the same sign as `a' is
4431 returned.
4432 -------------------------------------------------------------------------------
4433 */
4434 int64 float128_to_int64_round_to_zero( float128 a )
4435 {
4436 flag aSign;
4437 int32 aExp, shiftCount;
4438 bits64 aSig0, aSig1;
4439 int64 z;
4440
4441 aSig1 = extractFloat128Frac1( a );
4442 aSig0 = extractFloat128Frac0( a );
4443 aExp = extractFloat128Exp( a );
4444 aSign = extractFloat128Sign( a );
4445 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4446 shiftCount = aExp - 0x402F;
4447 if ( 0 < shiftCount ) {
4448 if ( 0x403E <= aExp ) {
4449 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4450 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4451 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4452 if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4453 }
4454 else {
4455 float_raise( float_flag_invalid );
4456 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4457 return LIT64( 0x7FFFFFFFFFFFFFFF );
4458 }
4459 }
4460 return (sbits64) LIT64( 0x8000000000000000 );
4461 }
4462 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4463 if ( (bits64) ( aSig1<<shiftCount ) ) {
4464 float_exception_flags |= float_flag_inexact;
4465 }
4466 }
4467 else {
4468 if ( aExp < 0x3FFF ) {
4469 if ( aExp | aSig0 | aSig1 ) {
4470 float_exception_flags |= float_flag_inexact;
4471 }
4472 return 0;
4473 }
4474 z = aSig0>>( - shiftCount );
4475 if ( aSig1
4476 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4477 float_exception_flags |= float_flag_inexact;
4478 }
4479 }
4480 if ( aSign ) z = - z;
4481 return z;
4482
4483 }
4484
4485 #if (defined(SOFTFLOATSPARC64_FOR_GCC) || defined(SOFTFLOAT_FOR_GCC)) \
4486 && defined(SOFTFLOAT_NEED_FIXUNS)
4487 /*
4488 * just like above - but do not care for overflow of signed results
4489 */
4490 uint64 float128_to_uint64_round_to_zero( float128 a )
4491 {
4492 flag aSign;
4493 int32 aExp, shiftCount;
4494 bits64 aSig0, aSig1;
4495 uint64 z;
4496
4497 aSig1 = extractFloat128Frac1( a );
4498 aSig0 = extractFloat128Frac0( a );
4499 aExp = extractFloat128Exp( a );
4500 aSign = extractFloat128Sign( a );
4501 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4502 shiftCount = aExp - 0x402F;
4503 if ( 0 < shiftCount ) {
4504 if ( 0x403F <= aExp ) {
4505 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4506 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4507 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4508 if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4509 }
4510 else {
4511 float_raise( float_flag_invalid );
4512 }
4513 return LIT64( 0xFFFFFFFFFFFFFFFF );
4514 }
4515 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4516 if ( (bits64) ( aSig1<<shiftCount ) ) {
4517 float_exception_flags |= float_flag_inexact;
4518 }
4519 }
4520 else {
4521 if ( aExp < 0x3FFF ) {
4522 if ( aExp | aSig0 | aSig1 ) {
4523 float_exception_flags |= float_flag_inexact;
4524 }
4525 return 0;
4526 }
4527 z = aSig0>>( - shiftCount );
4528 if (aSig1 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4529 float_exception_flags |= float_flag_inexact;
4530 }
4531 }
4532 if ( aSign ) z = - z;
4533 return z;
4534
4535 }
4536 #endif /* (SOFTFLOATSPARC64_FOR_GCC || SOFTFLOAT_FOR_GCC) && SOFTFLOAT_NEED_FIXUNS */
4537
4538 /*
4539 -------------------------------------------------------------------------------
4540 Returns the result of converting the quadruple-precision floating-point
4541 value `a' to the single-precision floating-point format. The conversion
4542 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4543 Arithmetic.
4544 -------------------------------------------------------------------------------
4545 */
4546 float32 float128_to_float32( float128 a )
4547 {
4548 flag aSign;
4549 int32 aExp;
4550 bits64 aSig0, aSig1;
4551 bits32 zSig;
4552
4553 aSig1 = extractFloat128Frac1( a );
4554 aSig0 = extractFloat128Frac0( a );
4555 aExp = extractFloat128Exp( a );
4556 aSign = extractFloat128Sign( a );
4557 if ( aExp == 0x7FFF ) {
4558 if ( aSig0 | aSig1 ) {
4559 return commonNaNToFloat32( float128ToCommonNaN( a ) );
4560 }
4561 return packFloat32( aSign, 0xFF, 0 );
4562 }
4563 aSig0 |= ( aSig1 != 0 );
4564 shift64RightJamming( aSig0, 18, &aSig0 );
4565 zSig = (bits32)aSig0;
4566 if ( aExp || zSig ) {
4567 zSig |= 0x40000000;
4568 aExp -= 0x3F81;
4569 }
4570 return roundAndPackFloat32( aSign, aExp, zSig );
4571
4572 }
4573
4574 /*
4575 -------------------------------------------------------------------------------
4576 Returns the result of converting the quadruple-precision floating-point
4577 value `a' to the double-precision floating-point format. The conversion
4578 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4579 Arithmetic.
4580 -------------------------------------------------------------------------------
4581 */
4582 float64 float128_to_float64( float128 a )
4583 {
4584 flag aSign;
4585 int32 aExp;
4586 bits64 aSig0, aSig1;
4587
4588 aSig1 = extractFloat128Frac1( a );
4589 aSig0 = extractFloat128Frac0( a );
4590 aExp = extractFloat128Exp( a );
4591 aSign = extractFloat128Sign( a );
4592 if ( aExp == 0x7FFF ) {
4593 if ( aSig0 | aSig1 ) {
4594 return commonNaNToFloat64( float128ToCommonNaN( a ) );
4595 }
4596 return packFloat64( aSign, 0x7FF, 0 );
4597 }
4598 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4599 aSig0 |= ( aSig1 != 0 );
4600 if ( aExp || aSig0 ) {
4601 aSig0 |= LIT64( 0x4000000000000000 );
4602 aExp -= 0x3C01;
4603 }
4604 return roundAndPackFloat64( aSign, aExp, aSig0 );
4605
4606 }
4607
4608 #ifdef FLOATX80
4609
4610 /*
4611 -------------------------------------------------------------------------------
4612 Returns the result of converting the quadruple-precision floating-point
4613 value `a' to the extended double-precision floating-point format. The
4614 conversion is performed according to the IEC/IEEE Standard for Binary
4615 Floating-Point Arithmetic.
4616 -------------------------------------------------------------------------------
4617 */
4618 floatx80 float128_to_floatx80( float128 a )
4619 {
4620 flag aSign;
4621 int32 aExp;
4622 bits64 aSig0, aSig1;
4623
4624 aSig1 = extractFloat128Frac1( a );
4625 aSig0 = extractFloat128Frac0( a );
4626 aExp = extractFloat128Exp( a );
4627 aSign = extractFloat128Sign( a );
4628 if ( aExp == 0x7FFF ) {
4629 if ( aSig0 | aSig1 ) {
4630 return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4631 }
4632 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4633 }
4634 if ( aExp == 0 ) {
4635 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4636 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4637 }
4638 else {
4639 aSig0 |= LIT64( 0x0001000000000000 );
4640 }
4641 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4642 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4643
4644 }
4645
4646 #endif
4647
4648 /*
4649 -------------------------------------------------------------------------------
4650 Rounds the quadruple-precision floating-point value `a' to an integer, and
4651 returns the result as a quadruple-precision floating-point value. The
4652 operation is performed according to the IEC/IEEE Standard for Binary
4653 Floating-Point Arithmetic.
4654 -------------------------------------------------------------------------------
4655 */
4656 float128 float128_round_to_int( float128 a )
4657 {
4658 flag aSign;
4659 int32 aExp;
4660 bits64 lastBitMask, roundBitsMask;
4661 int8 roundingMode;
4662 float128 z;
4663
4664 aExp = extractFloat128Exp( a );
4665 if ( 0x402F <= aExp ) {
4666 if ( 0x406F <= aExp ) {
4667 if ( ( aExp == 0x7FFF )
4668 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4669 ) {
4670 return propagateFloat128NaN( a, a );
4671 }
4672 return a;
4673 }
4674 lastBitMask = 1;
4675 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4676 roundBitsMask = lastBitMask - 1;
4677 z = a;
4678 roundingMode = float_rounding_mode;
4679 if ( roundingMode == float_round_nearest_even ) {
4680 if ( lastBitMask ) {
4681 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4682 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4683 }
4684 else {
4685 if ( (sbits64) z.low < 0 ) {
4686 ++z.high;
4687 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4688 }
4689 }
4690 }
4691 else if ( roundingMode != float_round_to_zero ) {
4692 if ( extractFloat128Sign( z )
4693 ^ ( roundingMode == float_round_up ) ) {
4694 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4695 }
4696 }
4697 z.low &= ~ roundBitsMask;
4698 }
4699 else {
4700 if ( aExp < 0x3FFF ) {
4701 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4702 float_exception_flags |= float_flag_inexact;
4703 aSign = extractFloat128Sign( a );
4704 switch ( float_rounding_mode ) {
4705 case float_round_nearest_even:
4706 if ( ( aExp == 0x3FFE )
4707 && ( extractFloat128Frac0( a )
4708 | extractFloat128Frac1( a ) )
4709 ) {
4710 return packFloat128( aSign, 0x3FFF, 0, 0 );
4711 }
4712 break;
4713 case float_round_to_zero:
4714 break;
4715 case float_round_down:
4716 return
4717 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4718 : packFloat128( 0, 0, 0, 0 );
4719 case float_round_up:
4720 return
4721 aSign ? packFloat128( 1, 0, 0, 0 )
4722 : packFloat128( 0, 0x3FFF, 0, 0 );
4723 }
4724 return packFloat128( aSign, 0, 0, 0 );
4725 }
4726 lastBitMask = 1;
4727 lastBitMask <<= 0x402F - aExp;
4728 roundBitsMask = lastBitMask - 1;
4729 z.low = 0;
4730 z.high = a.high;
4731 roundingMode = float_rounding_mode;
4732 if ( roundingMode == float_round_nearest_even ) {
4733 z.high += lastBitMask>>1;
4734 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4735 z.high &= ~ lastBitMask;
4736 }
4737 }
4738 else if ( roundingMode != float_round_to_zero ) {
4739 if ( extractFloat128Sign( z )
4740 ^ ( roundingMode == float_round_up ) ) {
4741 z.high |= ( a.low != 0 );
4742 z.high += roundBitsMask;
4743 }
4744 }
4745 z.high &= ~ roundBitsMask;
4746 }
4747 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4748 float_exception_flags |= float_flag_inexact;
4749 }
4750 return z;
4751
4752 }
4753
4754 /*
4755 -------------------------------------------------------------------------------
4756 Returns the result of adding the absolute values of the quadruple-precision
4757 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4758 before being returned. `zSign' is ignored if the result is a NaN.
4759 The addition is performed according to the IEC/IEEE Standard for Binary
4760 Floating-Point Arithmetic.
4761 -------------------------------------------------------------------------------
4762 */
4763 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4764 {
4765 int32 aExp, bExp, zExp;
4766 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4767 int32 expDiff;
4768
4769 aSig1 = extractFloat128Frac1( a );
4770 aSig0 = extractFloat128Frac0( a );
4771 aExp = extractFloat128Exp( a );
4772 bSig1 = extractFloat128Frac1( b );
4773 bSig0 = extractFloat128Frac0( b );
4774 bExp = extractFloat128Exp( b );
4775 expDiff = aExp - bExp;
4776 if ( 0 < expDiff ) {
4777 if ( aExp == 0x7FFF ) {
4778 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4779 return a;
4780 }
4781 if ( bExp == 0 ) {
4782 --expDiff;
4783 }
4784 else {
4785 bSig0 |= LIT64( 0x0001000000000000 );
4786 }
4787 shift128ExtraRightJamming(
4788 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4789 zExp = aExp;
4790 }
4791 else if ( expDiff < 0 ) {
4792 if ( bExp == 0x7FFF ) {
4793 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4794 return packFloat128( zSign, 0x7FFF, 0, 0 );
4795 }
4796 if ( aExp == 0 ) {
4797 ++expDiff;
4798 }
4799 else {
4800 aSig0 |= LIT64( 0x0001000000000000 );
4801 }
4802 shift128ExtraRightJamming(
4803 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4804 zExp = bExp;
4805 }
4806 else {
4807 if ( aExp == 0x7FFF ) {
4808 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4809 return propagateFloat128NaN( a, b );
4810 }
4811 return a;
4812 }
4813 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4814 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4815 zSig2 = 0;
4816 zSig0 |= LIT64( 0x0002000000000000 );
4817 zExp = aExp;
4818 goto shiftRight1;
4819 }
4820 aSig0 |= LIT64( 0x0001000000000000 );
4821 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4822 --zExp;
4823 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4824 ++zExp;
4825 shiftRight1:
4826 shift128ExtraRightJamming(
4827 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4828 roundAndPack:
4829 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4830
4831 }
4832
4833 /*
4834 -------------------------------------------------------------------------------
4835 Returns the result of subtracting the absolute values of the quadruple-
4836 precision floating-point values `a' and `b'. If `zSign' is 1, the
4837 difference is negated before being returned. `zSign' is ignored if the
4838 result is a NaN. The subtraction is performed according to the IEC/IEEE
4839 Standard for Binary Floating-Point Arithmetic.
4840 -------------------------------------------------------------------------------
4841 */
4842 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4843 {
4844 int32 aExp, bExp, zExp;
4845 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4846 int32 expDiff;
4847 float128 z;
4848
4849 aSig1 = extractFloat128Frac1( a );
4850 aSig0 = extractFloat128Frac0( a );
4851 aExp = extractFloat128Exp( a );
4852 bSig1 = extractFloat128Frac1( b );
4853 bSig0 = extractFloat128Frac0( b );
4854 bExp = extractFloat128Exp( b );
4855 expDiff = aExp - bExp;
4856 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4857 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4858 if ( 0 < expDiff ) goto aExpBigger;
4859 if ( expDiff < 0 ) goto bExpBigger;
4860 if ( aExp == 0x7FFF ) {
4861 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4862 return propagateFloat128NaN( a, b );
4863 }
4864 float_raise( float_flag_invalid );
4865 z.low = float128_default_nan_low;
4866 z.high = float128_default_nan_high;
4867 return z;
4868 }
4869 if ( aExp == 0 ) {
4870 aExp = 1;
4871 bExp = 1;
4872 }
4873 if ( bSig0 < aSig0 ) goto aBigger;
4874 if ( aSig0 < bSig0 ) goto bBigger;
4875 if ( bSig1 < aSig1 ) goto aBigger;
4876 if ( aSig1 < bSig1 ) goto bBigger;
4877 return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
4878 bExpBigger:
4879 if ( bExp == 0x7FFF ) {
4880 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4881 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4882 }
4883 if ( aExp == 0 ) {
4884 ++expDiff;
4885 }
4886 else {
4887 aSig0 |= LIT64( 0x4000000000000000 );
4888 }
4889 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4890 bSig0 |= LIT64( 0x4000000000000000 );
4891 bBigger:
4892 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4893 zExp = bExp;
4894 zSign ^= 1;
4895 goto normalizeRoundAndPack;
4896 aExpBigger:
4897 if ( aExp == 0x7FFF ) {
4898 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4899 return a;
4900 }
4901 if ( bExp == 0 ) {
4902 --expDiff;
4903 }
4904 else {
4905 bSig0 |= LIT64( 0x4000000000000000 );
4906 }
4907 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4908 aSig0 |= LIT64( 0x4000000000000000 );
4909 aBigger:
4910 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4911 zExp = aExp;
4912 normalizeRoundAndPack:
4913 --zExp;
4914 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4915
4916 }
4917
4918 /*
4919 -------------------------------------------------------------------------------
4920 Returns the result of adding the quadruple-precision floating-point values
4921 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4922 for Binary Floating-Point Arithmetic.
4923 -------------------------------------------------------------------------------
4924 */
4925 float128 float128_add( float128 a, float128 b )
4926 {
4927 flag aSign, bSign;
4928
4929 aSign = extractFloat128Sign( a );
4930 bSign = extractFloat128Sign( b );
4931 if ( aSign == bSign ) {
4932 return addFloat128Sigs( a, b, aSign );
4933 }
4934 else {
4935 return subFloat128Sigs( a, b, aSign );
4936 }
4937
4938 }
4939
4940 /*
4941 -------------------------------------------------------------------------------
4942 Returns the result of subtracting the quadruple-precision floating-point
4943 values `a' and `b'. The operation is performed according to the IEC/IEEE
4944 Standard for Binary Floating-Point Arithmetic.
4945 -------------------------------------------------------------------------------
4946 */
4947 float128 float128_sub( float128 a, float128 b )
4948 {
4949 flag aSign, bSign;
4950
4951 aSign = extractFloat128Sign( a );
4952 bSign = extractFloat128Sign( b );
4953 if ( aSign == bSign ) {
4954 return subFloat128Sigs( a, b, aSign );
4955 }
4956 else {
4957 return addFloat128Sigs( a, b, aSign );
4958 }
4959
4960 }
4961
4962 /*
4963 -------------------------------------------------------------------------------
4964 Returns the result of multiplying the quadruple-precision floating-point
4965 values `a' and `b'. The operation is performed according to the IEC/IEEE
4966 Standard for Binary Floating-Point Arithmetic.
4967 -------------------------------------------------------------------------------
4968 */
4969 float128 float128_mul( float128 a, float128 b )
4970 {
4971 flag aSign, bSign, zSign;
4972 int32 aExp, bExp, zExp;
4973 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4974 float128 z;
4975
4976 aSig1 = extractFloat128Frac1( a );
4977 aSig0 = extractFloat128Frac0( a );
4978 aExp = extractFloat128Exp( a );
4979 aSign = extractFloat128Sign( a );
4980 bSig1 = extractFloat128Frac1( b );
4981 bSig0 = extractFloat128Frac0( b );
4982 bExp = extractFloat128Exp( b );
4983 bSign = extractFloat128Sign( b );
4984 zSign = aSign ^ bSign;
4985 if ( aExp == 0x7FFF ) {
4986 if ( ( aSig0 | aSig1 )
4987 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4988 return propagateFloat128NaN( a, b );
4989 }
4990 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4991 return packFloat128( zSign, 0x7FFF, 0, 0 );
4992 }
4993 if ( bExp == 0x7FFF ) {
4994 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4995 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4996 invalid:
4997 float_raise( float_flag_invalid );
4998 z.low = float128_default_nan_low;
4999 z.high = float128_default_nan_high;
5000 return z;
5001 }
5002 return packFloat128( zSign, 0x7FFF, 0, 0 );
5003 }
5004 if ( aExp == 0 ) {
5005 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5006 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5007 }
5008 if ( bExp == 0 ) {
5009 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5010 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5011 }
5012 zExp = aExp + bExp - 0x4000;
5013 aSig0 |= LIT64( 0x0001000000000000 );
5014 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5015 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5016 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5017 zSig2 |= ( zSig3 != 0 );
5018 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5019 shift128ExtraRightJamming(
5020 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5021 ++zExp;
5022 }
5023 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5024
5025 }
5026
5027 /*
5028 -------------------------------------------------------------------------------
5029 Returns the result of dividing the quadruple-precision floating-point value
5030 `a' by the corresponding value `b'. The operation is performed according to
5031 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5032 -------------------------------------------------------------------------------
5033 */
5034 float128 float128_div( float128 a, float128 b )
5035 {
5036 flag aSign, bSign, zSign;
5037 int32 aExp, bExp, zExp;
5038 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5039 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5040 float128 z;
5041
5042 aSig1 = extractFloat128Frac1( a );
5043 aSig0 = extractFloat128Frac0( a );
5044 aExp = extractFloat128Exp( a );
5045 aSign = extractFloat128Sign( a );
5046 bSig1 = extractFloat128Frac1( b );
5047 bSig0 = extractFloat128Frac0( b );
5048 bExp = extractFloat128Exp( b );
5049 bSign = extractFloat128Sign( b );
5050 zSign = aSign ^ bSign;
5051 if ( aExp == 0x7FFF ) {
5052 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
5053 if ( bExp == 0x7FFF ) {
5054 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5055 goto invalid;
5056 }
5057 return packFloat128( zSign, 0x7FFF, 0, 0 );
5058 }
5059 if ( bExp == 0x7FFF ) {
5060 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5061 return packFloat128( zSign, 0, 0, 0 );
5062 }
5063 if ( bExp == 0 ) {
5064 if ( ( bSig0 | bSig1 ) == 0 ) {
5065 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5066 invalid:
5067 float_raise( float_flag_invalid );
5068 z.low = float128_default_nan_low;
5069 z.high = float128_default_nan_high;
5070 return z;
5071 }
5072 float_raise( float_flag_divbyzero );
5073 return packFloat128( zSign, 0x7FFF, 0, 0 );
5074 }
5075 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5076 }
5077 if ( aExp == 0 ) {
5078 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5079 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5080 }
5081 zExp = aExp - bExp + 0x3FFD;
5082 shortShift128Left(
5083 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5084 shortShift128Left(
5085 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5086 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5087 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5088 ++zExp;
5089 }
5090 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5091 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5092 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5093 while ( (sbits64) rem0 < 0 ) {
5094 --zSig0;
5095 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5096 }
5097 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5098 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5099 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5100 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5101 while ( (sbits64) rem1 < 0 ) {
5102 --zSig1;
5103 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5104 }
5105 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5106 }
5107 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5108 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5109
5110 }
5111
5112 /*
5113 -------------------------------------------------------------------------------
5114 Returns the remainder of the quadruple-precision floating-point value `a'
5115 with respect to the corresponding value `b'. The operation is performed
5116 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5117 -------------------------------------------------------------------------------
5118 */
5119 float128 float128_rem( float128 a, float128 b )
5120 {
5121 flag aSign, zSign;
5122 int32 aExp, bExp, expDiff;
5123 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5124 bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5125 sbits64 sigMean0;
5126 float128 z;
5127
5128 aSig1 = extractFloat128Frac1( a );
5129 aSig0 = extractFloat128Frac0( a );
5130 aExp = extractFloat128Exp( a );
5131 aSign = extractFloat128Sign( a );
5132 bSig1 = extractFloat128Frac1( b );
5133 bSig0 = extractFloat128Frac0( b );
5134 bExp = extractFloat128Exp( b );
5135 if ( aExp == 0x7FFF ) {
5136 if ( ( aSig0 | aSig1 )
5137 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5138 return propagateFloat128NaN( a, b );
5139 }
5140 goto invalid;
5141 }
5142 if ( bExp == 0x7FFF ) {
5143 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5144 return a;
5145 }
5146 if ( bExp == 0 ) {
5147 if ( ( bSig0 | bSig1 ) == 0 ) {
5148 invalid:
5149 float_raise( float_flag_invalid );
5150 z.low = float128_default_nan_low;
5151 z.high = float128_default_nan_high;
5152 return z;
5153 }
5154 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5155 }
5156 if ( aExp == 0 ) {
5157 if ( ( aSig0 | aSig1 ) == 0 ) return a;
5158 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5159 }
5160 expDiff = aExp - bExp;
5161 if ( expDiff < -1 ) return a;
5162 shortShift128Left(
5163 aSig0 | LIT64( 0x0001000000000000 ),
5164 aSig1,
5165 15 - ( expDiff < 0 ),
5166 &aSig0,
5167 &aSig1
5168 );
5169 shortShift128Left(
5170 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5171 q = le128( bSig0, bSig1, aSig0, aSig1 );
5172 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5173 expDiff -= 64;
5174 while ( 0 < expDiff ) {
5175 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5176 q = ( 4 < q ) ? q - 4 : 0;
5177 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5178 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5179 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5180 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5181 expDiff -= 61;
5182 }
5183 if ( -64 < expDiff ) {
5184 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5185 q = ( 4 < q ) ? q - 4 : 0;
5186 q >>= - expDiff;
5187 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5188 expDiff += 52;
5189 if ( expDiff < 0 ) {
5190 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5191 }
5192 else {
5193 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5194 }
5195 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5196 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5197 }
5198 else {
5199 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5200 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5201 }
5202 do {
5203 alternateASig0 = aSig0;
5204 alternateASig1 = aSig1;
5205 ++q;
5206 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5207 } while ( 0 <= (sbits64) aSig0 );
5208 add128(
5209 aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5210 if ( ( sigMean0 < 0 )
5211 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5212 aSig0 = alternateASig0;
5213 aSig1 = alternateASig1;
5214 }
5215 zSign = ( (sbits64) aSig0 < 0 );
5216 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5217 return
5218 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5219
5220 }
5221
5222 /*
5223 -------------------------------------------------------------------------------
5224 Returns the square root of the quadruple-precision floating-point value `a'.
5225 The operation is performed according to the IEC/IEEE Standard for Binary
5226 Floating-Point Arithmetic.
5227 -------------------------------------------------------------------------------
5228 */
5229 float128 float128_sqrt( float128 a )
5230 {
5231 flag aSign;
5232 int32 aExp, zExp;
5233 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5234 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5235 float128 z;
5236
5237 aSig1 = extractFloat128Frac1( a );
5238 aSig0 = extractFloat128Frac0( a );
5239 aExp = extractFloat128Exp( a );
5240 aSign = extractFloat128Sign( a );
5241 if ( aExp == 0x7FFF ) {
5242 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5243 if ( ! aSign ) return a;
5244 goto invalid;
5245 }
5246 if ( aSign ) {
5247 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5248 invalid:
5249 float_raise( float_flag_invalid );
5250 z.low = float128_default_nan_low;
5251 z.high = float128_default_nan_high;
5252 return z;
5253 }
5254 if ( aExp == 0 ) {
5255 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5256 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5257 }
5258 zExp = ( (unsigned int)(aExp - 0x3FFF) >> 1) + 0x3FFE;
5259 aSig0 |= LIT64( 0x0001000000000000 );
5260 zSig0 = estimateSqrt32((int16)aExp, (bits32)(aSig0>>17));
5261 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5262 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5263 doubleZSig0 = zSig0<<1;
5264 mul64To128( zSig0, zSig0, &term0, &term1 );
5265 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5266 while ( (sbits64) rem0 < 0 ) {
5267 --zSig0;
5268 doubleZSig0 -= 2;
5269 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5270 }
5271 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5272 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5273 if ( zSig1 == 0 ) zSig1 = 1;
5274 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5275 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5276 mul64To128( zSig1, zSig1, &term2, &term3 );
5277 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5278 while ( (sbits64) rem1 < 0 ) {
5279 --zSig1;
5280 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5281 term3 |= 1;
5282 term2 |= doubleZSig0;
5283 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5284 }
5285 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5286 }
5287 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5288 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5289
5290 }
5291
5292 /*
5293 -------------------------------------------------------------------------------
5294 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5295 the corresponding value `b', and 0 otherwise. The comparison is performed
5296 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5297 -------------------------------------------------------------------------------
5298 */
5299 flag float128_eq( float128 a, float128 b )
5300 {
5301
5302 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5303 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5304 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5305 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5306 ) {
5307 if ( float128_is_signaling_nan( a )
5308 || float128_is_signaling_nan( b ) ) {
5309 float_raise( float_flag_invalid );
5310 }
5311 return 0;
5312 }
5313 return
5314 ( a.low == b.low )
5315 && ( ( a.high == b.high )
5316 || ( ( a.low == 0 )
5317 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5318 );
5319
5320 }
5321
5322 /*
5323 -------------------------------------------------------------------------------
5324 Returns 1 if the quadruple-precision floating-point value `a' is less than
5325 or equal to the corresponding value `b', and 0 otherwise. The comparison
5326 is performed according to the IEC/IEEE Standard for Binary Floating-Point
5327 Arithmetic.
5328 -------------------------------------------------------------------------------
5329 */
5330 flag float128_le( float128 a, float128 b )
5331 {
5332 flag aSign, bSign;
5333
5334 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5335 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5336 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5337 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5338 ) {
5339 float_raise( float_flag_invalid );
5340 return 0;
5341 }
5342 aSign = extractFloat128Sign( a );
5343 bSign = extractFloat128Sign( b );
5344 if ( aSign != bSign ) {
5345 return
5346 aSign
5347 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5348 == 0 );
5349 }
5350 return
5351 aSign ? le128( b.high, b.low, a.high, a.low )
5352 : le128( a.high, a.low, b.high, b.low );
5353
5354 }
5355
5356 /*
5357 -------------------------------------------------------------------------------
5358 Returns 1 if the quadruple-precision floating-point value `a' is less than
5359 the corresponding value `b', and 0 otherwise. The comparison is performed
5360 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5361 -------------------------------------------------------------------------------
5362 */
5363 flag float128_lt( float128 a, float128 b )
5364 {
5365 flag aSign, bSign;
5366
5367 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5368 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5369 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5370 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5371 ) {
5372 float_raise( float_flag_invalid );
5373 return 0;
5374 }
5375 aSign = extractFloat128Sign( a );
5376 bSign = extractFloat128Sign( b );
5377 if ( aSign != bSign ) {
5378 return
5379 aSign
5380 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5381 != 0 );
5382 }
5383 return
5384 aSign ? lt128( b.high, b.low, a.high, a.low )
5385 : lt128( a.high, a.low, b.high, b.low );
5386
5387 }
5388
5389 /*
5390 -------------------------------------------------------------------------------
5391 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5392 the corresponding value `b', and 0 otherwise. The invalid exception is
5393 raised if either operand is a NaN. Otherwise, the comparison is performed
5394 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5395 -------------------------------------------------------------------------------
5396 */
5397 flag float128_eq_signaling( float128 a, float128 b )
5398 {
5399
5400 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5401 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5402 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5403 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5404 ) {
5405 float_raise( float_flag_invalid );
5406 return 0;
5407 }
5408 return
5409 ( a.low == b.low )
5410 && ( ( a.high == b.high )
5411 || ( ( a.low == 0 )
5412 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5413 );
5414
5415 }
5416
5417 /*
5418 -------------------------------------------------------------------------------
5419 Returns 1 if the quadruple-precision floating-point value `a' is less than
5420 or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5421 cause an exception. Otherwise, the comparison is performed according to the
5422 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5423 -------------------------------------------------------------------------------
5424 */
5425 flag float128_le_quiet( float128 a, float128 b )
5426 {
5427 flag aSign, bSign;
5428
5429 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5430 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5431 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5432 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5433 ) {
5434 if ( float128_is_signaling_nan( a )
5435 || float128_is_signaling_nan( b ) ) {
5436 float_raise( float_flag_invalid );
5437 }
5438 return 0;
5439 }
5440 aSign = extractFloat128Sign( a );
5441 bSign = extractFloat128Sign( b );
5442 if ( aSign != bSign ) {
5443 return
5444 aSign
5445 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5446 == 0 );
5447 }
5448 return
5449 aSign ? le128( b.high, b.low, a.high, a.low )
5450 : le128( a.high, a.low, b.high, b.low );
5451
5452 }
5453
5454 /*
5455 -------------------------------------------------------------------------------
5456 Returns 1 if the quadruple-precision floating-point value `a' is less than
5457 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5458 exception. Otherwise, the comparison is performed according to the IEC/IEEE
5459 Standard for Binary Floating-Point Arithmetic.
5460 -------------------------------------------------------------------------------
5461 */
5462 flag float128_lt_quiet( float128 a, float128 b )
5463 {
5464 flag aSign, bSign;
5465
5466 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5467 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5468 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5469 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5470 ) {
5471 if ( float128_is_signaling_nan( a )
5472 || float128_is_signaling_nan( b ) ) {
5473 float_raise( float_flag_invalid );
5474 }
5475 return 0;
5476 }
5477 aSign = extractFloat128Sign( a );
5478 bSign = extractFloat128Sign( b );
5479 if ( aSign != bSign ) {
5480 return
5481 aSign
5482 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5483 != 0 );
5484 }
5485 return
5486 aSign ? lt128( b.high, b.low, a.high, a.low )
5487 : lt128( a.high, a.low, b.high, b.low );
5488
5489 }
5490
5491 #endif
5492
5493
5494 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5495
5496 /*
5497 * These two routines are not part of the original softfloat distribution.
5498 *
5499 * They are based on the corresponding conversions to integer but return
5500 * unsigned numbers instead since these functions are required by GCC.
5501 *
5502 * Added by Mark Brinicombe <mark (at) NetBSD.org> 27/09/97
5503 *
5504 * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5505 */
5506
5507 /*
5508 -------------------------------------------------------------------------------
5509 Returns the result of converting the double-precision floating-point value
5510 `a' to the 32-bit unsigned integer format. The conversion is
5511 performed according to the IEC/IEEE Standard for Binary Floating-point
5512 Arithmetic, except that the conversion is always rounded toward zero. If
5513 `a' is a NaN, the largest positive integer is returned. If the conversion
5514 overflows, the largest integer positive is returned.
5515 -------------------------------------------------------------------------------
5516 */
5517 uint32 float64_to_uint32_round_to_zero( float64 a )
5518 {
5519 flag aSign;
5520 int16 aExp, shiftCount;
5521 bits64 aSig, savedASig;
5522 uint32 z;
5523
5524 aSig = extractFloat64Frac( a );
5525 aExp = extractFloat64Exp( a );
5526 aSign = extractFloat64Sign( a );
5527
5528 if (aSign) {
5529 float_raise( float_flag_invalid );
5530 return(0);
5531 }
5532
5533 if ( 0x41E < aExp ) {
5534 float_raise( float_flag_invalid );
5535 return 0xffffffff;
5536 }
5537 else if ( aExp < 0x3FF ) {
5538 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
5539 return 0;
5540 }
5541 aSig |= LIT64( 0x0010000000000000 );
5542 shiftCount = 0x433 - aExp;
5543 savedASig = aSig;
5544 aSig >>= shiftCount;
5545 z = (uint32)aSig;
5546 if ( ( aSig<<shiftCount ) != savedASig ) {
5547 float_exception_flags |= float_flag_inexact;
5548 }
5549 return z;
5550
5551 }
5552
5553 /*
5554 -------------------------------------------------------------------------------
5555 Returns the result of converting the single-precision floating-point value
5556 `a' to the 32-bit unsigned integer format. The conversion is
5557 performed according to the IEC/IEEE Standard for Binary Floating-point
5558 Arithmetic, except that the conversion is always rounded toward zero. If
5559 `a' is a NaN, the largest positive integer is returned. If the conversion
5560 overflows, the largest positive integer is returned.
5561 -------------------------------------------------------------------------------
5562 */
5563 uint32 float32_to_uint32_round_to_zero( float32 a )
5564 {
5565 flag aSign;
5566 int16 aExp, shiftCount;
5567 bits32 aSig;
5568 uint32 z;
5569
5570 aSig = extractFloat32Frac( a );
5571 aExp = extractFloat32Exp( a );
5572 aSign = extractFloat32Sign( a );
5573 shiftCount = aExp - 0x9E;
5574
5575 if (aSign) {
5576 float_raise( float_flag_invalid );
5577 return(0);
5578 }
5579 if ( 0 < shiftCount ) {
5580 float_raise( float_flag_invalid );
5581 return 0xFFFFFFFF;
5582 }
5583 else if ( aExp <= 0x7E ) {
5584 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
5585 return 0;
5586 }
5587 aSig = ( aSig | 0x800000 )<<8;
5588 z = aSig>>( - shiftCount );
5589 if ( aSig<<( shiftCount & 31 ) ) {
5590 float_exception_flags |= float_flag_inexact;
5591 }
5592 return z;
5593
5594 }
5595
5596 #endif
5597