softfloat.c revision 1.7 1 /* $NetBSD: softfloat.c,v 1.7 2020/09/02 03:45:54 thorpej Exp $ */
2
3 /*
4 * This version hacked for use with gcc -msoft-float by bjh21.
5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides
6 * itself).
7 */
8
9 /*
10 * Things you may want to define:
11 *
12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them
14 * properly renamed.
15 */
16
17 /*============================================================================
18
19 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
20 Package, Release 2b.
21
22 Written by John R. Hauser. This work was made possible in part by the
23 International Computer Science Institute, located at Suite 600, 1947 Center
24 Street, Berkeley, California 94704. Funding was partially provided by the
25 National Science Foundation under grant MIP-9311980. The original version
26 of this code was written as part of a project to build a fixed-point vector
27 processor in collaboration with the University of California at Berkeley,
28 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
29 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
30 arithmetic/SoftFloat.html'.
31
32 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
33 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
34 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
35 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
36 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
37 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
38 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
39 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
40
41 Derivative works are acceptable, even for commercial purposes, so long as
42 (1) the source code for the derivative work includes prominent notice that
43 the work is derivative, and (2) the source code includes prominent notice with
44 these four paragraphs for those parts of this code that are retained.
45
46 =============================================================================*/
47
48 /* If you need this in a boot program, you have bigger problems... */
49 #ifndef _STANDALONE
50
51 #include <sys/cdefs.h>
52 #if defined(LIBC_SCCS) && !defined(lint)
53 __RCSID("$NetBSD: softfloat.c,v 1.7 2020/09/02 03:45:54 thorpej Exp $");
54 #endif /* LIBC_SCCS and not lint */
55
56 #ifdef SOFTFLOAT_FOR_GCC
57 #include "softfloat-for-gcc.h"
58 #endif
59
60 #include "milieu.h"
61 #include "softfloat.h"
62
63 /*
64 * Conversions between floats as stored in memory and floats as
65 * SoftFloat uses them
66 */
67 #ifndef FLOAT64_DEMANGLE
68 #define FLOAT64_DEMANGLE(a) (a)
69 #endif
70 #ifndef FLOAT64_MANGLE
71 #define FLOAT64_MANGLE(a) (a)
72 #endif
73
74 /*----------------------------------------------------------------------------
75 | Floating-point rounding mode, extended double-precision rounding precision,
76 | and exception flags.
77 *----------------------------------------------------------------------------*/
78 /*
79 * XXX: This may cause options-MULTIPROCESSOR or thread problems someday.
80 * Right now, it does not. I've removed all other dynamic global
81 * variables. [ross]
82 */
83 #ifdef FLOATX80
84 int8 floatx80_rounding_precision = 80;
85 #endif
86
87 /*----------------------------------------------------------------------------
88 | Primitive arithmetic functions, including multi-word arithmetic, and
89 | division and square root approximations. (Can be specialized to target if
90 | desired.)
91 *----------------------------------------------------------------------------*/
92 #include "softfloat-macros.h"
93
94 /*----------------------------------------------------------------------------
95 | Functions and definitions to determine: (1) whether tininess for underflow
96 | is detected before or after rounding by default, (2) what (if anything)
97 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
98 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
99 | are propagated from function inputs to output. These details are target-
100 | specific.
101 *----------------------------------------------------------------------------*/
102 #include "softfloat-specialize.h"
103
104 #ifndef SOFTFLOAT_FOR_GCC /* Not used */
105 /*----------------------------------------------------------------------------
106 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
107 | and 7, and returns the properly rounded 32-bit integer corresponding to the
108 | input. If `zSign' is 1, the input is negated before being converted to an
109 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
110 | is simply rounded to an integer, with the inexact exception raised if the
111 | input cannot be represented exactly as an integer. However, if the fixed-
112 | point input is too large, the invalid exception is raised and the largest
113 | positive or negative integer is returned.
114 *----------------------------------------------------------------------------*/
115
116 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
117 {
118 int8 roundingMode;
119 flag roundNearestEven;
120 int8 roundIncrement, roundBits;
121 int32 z;
122
123 roundingMode = float_rounding_mode();
124 roundNearestEven = ( roundingMode == float_round_nearest_even );
125 roundIncrement = 0x40;
126 if ( ! roundNearestEven ) {
127 if ( roundingMode == float_round_to_zero ) {
128 roundIncrement = 0;
129 }
130 else {
131 roundIncrement = 0x7F;
132 if ( zSign ) {
133 if ( roundingMode == float_round_up ) roundIncrement = 0;
134 }
135 else {
136 if ( roundingMode == float_round_down ) roundIncrement = 0;
137 }
138 }
139 }
140 roundBits = absZ & 0x7F;
141 absZ = ( absZ + roundIncrement )>>7;
142 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
143 z = absZ;
144 if ( zSign ) z = - z;
145 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
146 float_raise( float_flag_invalid );
147 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
148 }
149 if ( roundBits ) float_set_inexact();
150 return z;
151
152 }
153
154 /*----------------------------------------------------------------------------
155 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
156 | `absZ1', with binary point between bits 63 and 64 (between the input words),
157 | and returns the properly rounded 64-bit integer corresponding to the input.
158 | If `zSign' is 1, the input is negated before being converted to an integer.
159 | Ordinarily, the fixed-point input is simply rounded to an integer, with
160 | the inexact exception raised if the input cannot be represented exactly as
161 | an integer. However, if the fixed-point input is too large, the invalid
162 | exception is raised and the largest positive or negative integer is
163 | returned.
164 *----------------------------------------------------------------------------*/
165
166 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
167 {
168 int8 roundingMode;
169 flag roundNearestEven, increment;
170 int64 z;
171
172 roundingMode = float_rounding_mode();
173 roundNearestEven = ( roundingMode == float_round_nearest_even );
174 increment = ( (sbits64) absZ1 < 0 );
175 if ( ! roundNearestEven ) {
176 if ( roundingMode == float_round_to_zero ) {
177 increment = 0;
178 }
179 else {
180 if ( zSign ) {
181 increment = ( roundingMode == float_round_down ) && absZ1;
182 }
183 else {
184 increment = ( roundingMode == float_round_up ) && absZ1;
185 }
186 }
187 }
188 if ( increment ) {
189 ++absZ0;
190 if ( absZ0 == 0 ) goto overflow;
191 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
192 }
193 z = absZ0;
194 if ( zSign ) z = - z;
195 if ( z && ( ( z < 0 ) ^ zSign ) ) {
196 overflow:
197 float_raise( float_flag_invalid );
198 return
199 zSign ? (sbits64) LIT64( 0x8000000000000000 )
200 : LIT64( 0x7FFFFFFFFFFFFFFF );
201 }
202 if ( absZ1 ) float_set_inexact();
203 return z;
204
205 }
206
207 /* same as above, but for unsigned values */
208 static uint64 roundAndPackUInt64( bits64 absZ0, bits64 absZ1 )
209 {
210 int8 roundingMode;
211 flag roundNearestEven, increment;
212 uint64 z;
213
214 roundingMode = float_rounding_mode();
215 roundNearestEven = ( roundingMode == float_round_nearest_even );
216 increment = ( (sbits64) absZ1 < 0 );
217 if ( ! roundNearestEven ) {
218 if ( roundingMode == float_round_to_zero ) {
219 increment = 0;
220 }
221 else {
222 increment = ( roundingMode == float_round_up ) && absZ1;
223 }
224 }
225 if ( increment ) {
226 ++absZ0;
227 if ( absZ0 == 0 ) {
228 float_raise( float_flag_invalid );
229 return LIT64( 0x7FFFFFFFFFFFFFFF );
230 }
231 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
232 }
233 z = absZ0;
234 if ( absZ1 ) float_set_inexact();
235 return z;
236
237 }
238 #endif /* SOFTFLOAT_FOR_GCC */
239
240 /*----------------------------------------------------------------------------
241 | Returns the fraction bits of the single-precision floating-point value `a'.
242 *----------------------------------------------------------------------------*/
243
244 INLINE bits32 extractFloat32Frac( float32 a )
245 {
246
247 return a & 0x007FFFFF;
248
249 }
250
251 /*----------------------------------------------------------------------------
252 | Returns the exponent bits of the single-precision floating-point value `a'.
253 *----------------------------------------------------------------------------*/
254
255 INLINE int16 extractFloat32Exp( float32 a )
256 {
257
258 return ( a>>23 ) & 0xFF;
259
260 }
261
262 /*----------------------------------------------------------------------------
263 | Returns the sign bit of the single-precision floating-point value `a'.
264 *----------------------------------------------------------------------------*/
265
266 INLINE flag extractFloat32Sign( float32 a )
267 {
268
269 return a>>31;
270
271 }
272
273 /*----------------------------------------------------------------------------
274 | Normalizes the subnormal single-precision floating-point value represented
275 | by the denormalized significand `aSig'. The normalized exponent and
276 | significand are stored at the locations pointed to by `zExpPtr' and
277 | `zSigPtr', respectively.
278 *----------------------------------------------------------------------------*/
279
280 static void
281 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
282 {
283 int8 shiftCount;
284
285 shiftCount = countLeadingZeros32( aSig ) - 8;
286 *zSigPtr = aSig<<shiftCount;
287 *zExpPtr = 1 - shiftCount;
288
289 }
290
291 /*----------------------------------------------------------------------------
292 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
293 | single-precision floating-point value, returning the result. After being
294 | shifted into the proper positions, the three fields are simply added
295 | together to form the result. This means that any integer portion of `zSig'
296 | will be added into the exponent. Since a properly normalized significand
297 | will have an integer portion equal to 1, the `zExp' input should be 1 less
298 | than the desired result exponent whenever `zSig' is a complete, normalized
299 | significand.
300 *----------------------------------------------------------------------------*/
301
302 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
303 {
304
305 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
306
307 }
308
309 /*----------------------------------------------------------------------------
310 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
311 | and significand `zSig', and returns the proper single-precision floating-
312 | point value corresponding to the abstract input. Ordinarily, the abstract
313 | value is simply rounded and packed into the single-precision format, with
314 | the inexact exception raised if the abstract input cannot be represented
315 | exactly. However, if the abstract value is too large, the overflow and
316 | inexact exceptions are raised and an infinity or maximal finite value is
317 | returned. If the abstract value is too small, the input value is rounded to
318 | a subnormal number, and the underflow and inexact exceptions are raised if
319 | the abstract input cannot be represented exactly as a subnormal single-
320 | precision floating-point number.
321 | The input significand `zSig' has its binary point between bits 30
322 | and 29, which is 7 bits to the left of the usual location. This shifted
323 | significand must be normalized or smaller. If `zSig' is not normalized,
324 | `zExp' must be 0; in that case, the result returned is a subnormal number,
325 | and it must not require rounding. In the usual case that `zSig' is
326 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
327 | The handling of underflow and overflow follows the IEC/IEEE Standard for
328 | Binary Floating-Point Arithmetic.
329 *----------------------------------------------------------------------------*/
330
331 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
332 {
333 int8 roundingMode;
334 flag roundNearestEven;
335 int8 roundIncrement, roundBits;
336 flag isTiny;
337
338 roundingMode = float_rounding_mode();
339 roundNearestEven = ( roundingMode == float_round_nearest_even );
340 roundIncrement = 0x40;
341 if ( ! roundNearestEven ) {
342 if ( roundingMode == float_round_to_zero ) {
343 roundIncrement = 0;
344 }
345 else {
346 roundIncrement = 0x7F;
347 if ( zSign ) {
348 if ( roundingMode == float_round_up ) roundIncrement = 0;
349 }
350 else {
351 if ( roundingMode == float_round_down ) roundIncrement = 0;
352 }
353 }
354 }
355 roundBits = zSig & 0x7F;
356 if ( 0xFD <= (bits16) zExp ) {
357 if ( ( 0xFD < zExp )
358 || ( ( zExp == 0xFD )
359 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
360 ) {
361 float_raise( float_flag_overflow | float_flag_inexact );
362 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
363 }
364 if ( zExp < 0 ) {
365 isTiny =
366 ( float_detect_tininess == float_tininess_before_rounding )
367 || ( zExp < -1 )
368 || ( zSig + roundIncrement < 0x80000000 );
369 shift32RightJamming( zSig, - zExp, &zSig );
370 zExp = 0;
371 roundBits = zSig & 0x7F;
372 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
373 }
374 }
375 if ( roundBits ) float_set_inexact();
376 zSig = ( zSig + roundIncrement )>>7;
377 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
378 if ( zSig == 0 ) zExp = 0;
379 return packFloat32( zSign, zExp, zSig );
380
381 }
382
383 /*----------------------------------------------------------------------------
384 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
385 | and significand `zSig', and returns the proper single-precision floating-
386 | point value corresponding to the abstract input. This routine is just like
387 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
388 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
389 | floating-point exponent.
390 *----------------------------------------------------------------------------*/
391
392 static float32
393 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
394 {
395 int8 shiftCount;
396
397 shiftCount = countLeadingZeros32( zSig ) - 1;
398 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
399
400 }
401
402 /*----------------------------------------------------------------------------
403 | Returns the fraction bits of the double-precision floating-point value `a'.
404 *----------------------------------------------------------------------------*/
405
406 INLINE bits64 extractFloat64Frac( float64 a )
407 {
408
409 return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
410
411 }
412
413 /*----------------------------------------------------------------------------
414 | Returns the exponent bits of the double-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/
416
417 INLINE int16 extractFloat64Exp( float64 a )
418 {
419
420 return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
421
422 }
423 /*----------------------------------------------------------------------------
424 | Returns the sign bit of the double-precision floating-point value `a'.
425 *----------------------------------------------------------------------------*/
426
427 INLINE flag extractFloat64Sign( float64 a )
428 {
429
430 return FLOAT64_DEMANGLE(a)>>63;
431
432 }
433
434 /*----------------------------------------------------------------------------
435 | Normalizes the subnormal double-precision floating-point value represented
436 | by the denormalized significand `aSig'. The normalized exponent and
437 | significand are stored at the locations pointed to by `zExpPtr' and
438 | `zSigPtr', respectively.
439 *----------------------------------------------------------------------------*/
440
441 static void
442 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
443 {
444 int8 shiftCount;
445
446 shiftCount = countLeadingZeros64( aSig ) - 11;
447 *zSigPtr = aSig<<shiftCount;
448 *zExpPtr = 1 - shiftCount;
449
450 }
451
452 /*----------------------------------------------------------------------------
453 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
454 | double-precision floating-point value, returning the result. After being
455 | shifted into the proper positions, the three fields are simply added
456 | together to form the result. This means that any integer portion of `zSig'
457 | will be added into the exponent. Since a properly normalized significand
458 | will have an integer portion equal to 1, the `zExp' input should be 1 less
459 | than the desired result exponent whenever `zSig' is a complete, normalized
460 | significand.
461 *----------------------------------------------------------------------------*/
462
463 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
464 {
465
466 return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
467 ( ( (bits64) zExp )<<52 ) + zSig );
468
469 }
470
471 /*----------------------------------------------------------------------------
472 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
473 | and significand `zSig', and returns the proper double-precision floating-
474 | point value corresponding to the abstract input. Ordinarily, the abstract
475 | value is simply rounded and packed into the double-precision format, with
476 | the inexact exception raised if the abstract input cannot be represented
477 | exactly. However, if the abstract value is too large, the overflow and
478 | inexact exceptions are raised and an infinity or maximal finite value is
479 | returned. If the abstract value is too small, the input value is rounded
480 | to a subnormal number, and the underflow and inexact exceptions are raised
481 | if the abstract input cannot be represented exactly as a subnormal double-
482 | precision floating-point number.
483 | The input significand `zSig' has its binary point between bits 62
484 | and 61, which is 10 bits to the left of the usual location. This shifted
485 | significand must be normalized or smaller. If `zSig' is not normalized,
486 | `zExp' must be 0; in that case, the result returned is a subnormal number,
487 | and it must not require rounding. In the usual case that `zSig' is
488 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
489 | The handling of underflow and overflow follows the IEC/IEEE Standard for
490 | Binary Floating-Point Arithmetic.
491 *----------------------------------------------------------------------------*/
492
493 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
494 {
495 int8 roundingMode;
496 flag roundNearestEven;
497 int16 roundIncrement, roundBits;
498 flag isTiny;
499
500 roundingMode = float_rounding_mode();
501 roundNearestEven = ( roundingMode == float_round_nearest_even );
502 roundIncrement = 0x200;
503 if ( ! roundNearestEven ) {
504 if ( roundingMode == float_round_to_zero ) {
505 roundIncrement = 0;
506 }
507 else {
508 roundIncrement = 0x3FF;
509 if ( zSign ) {
510 if ( roundingMode == float_round_up ) roundIncrement = 0;
511 }
512 else {
513 if ( roundingMode == float_round_down ) roundIncrement = 0;
514 }
515 }
516 }
517 roundBits = zSig & 0x3FF;
518 if ( 0x7FD <= (bits16) zExp ) {
519 if ( ( 0x7FD < zExp )
520 || ( ( zExp == 0x7FD )
521 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
522 ) {
523 float_raise( float_flag_overflow | float_flag_inexact );
524 return FLOAT64_MANGLE(
525 FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
526 ( roundIncrement == 0 ));
527 }
528 if ( zExp < 0 ) {
529 isTiny =
530 ( float_detect_tininess == float_tininess_before_rounding )
531 || ( zExp < -1 )
532 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
533 shift64RightJamming( zSig, - zExp, &zSig );
534 zExp = 0;
535 roundBits = zSig & 0x3FF;
536 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
537 }
538 }
539 if ( roundBits ) float_set_inexact();
540 zSig = ( zSig + roundIncrement )>>10;
541 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
542 if ( zSig == 0 ) zExp = 0;
543 return packFloat64( zSign, zExp, zSig );
544
545 }
546
547 /*----------------------------------------------------------------------------
548 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
549 | and significand `zSig', and returns the proper double-precision floating-
550 | point value corresponding to the abstract input. This routine is just like
551 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
552 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
553 | floating-point exponent.
554 *----------------------------------------------------------------------------*/
555
556 static float64
557 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
558 {
559 int8 shiftCount;
560
561 shiftCount = countLeadingZeros64( zSig ) - 1;
562 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
563
564 }
565
566 #ifdef FLOATX80
567
568 /*----------------------------------------------------------------------------
569 | Returns the fraction bits of the extended double-precision floating-point
570 | value `a'.
571 *----------------------------------------------------------------------------*/
572
573 INLINE bits64 extractFloatx80Frac( floatx80 a )
574 {
575
576 return a.low;
577
578 }
579
580 /*----------------------------------------------------------------------------
581 | Returns the exponent bits of the extended double-precision floating-point
582 | value `a'.
583 *----------------------------------------------------------------------------*/
584
585 INLINE int32 extractFloatx80Exp( floatx80 a )
586 {
587
588 return a.high & 0x7FFF;
589
590 }
591
592 /*----------------------------------------------------------------------------
593 | Returns the sign bit of the extended double-precision floating-point value
594 | `a'.
595 *----------------------------------------------------------------------------*/
596
597 INLINE flag extractFloatx80Sign( floatx80 a )
598 {
599
600 return a.high>>15;
601
602 }
603
604 /*----------------------------------------------------------------------------
605 | Normalizes the subnormal extended double-precision floating-point value
606 | represented by the denormalized significand `aSig'. The normalized exponent
607 | and significand are stored at the locations pointed to by `zExpPtr' and
608 | `zSigPtr', respectively.
609 *----------------------------------------------------------------------------*/
610
611 static void
612 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
613 {
614 int8 shiftCount;
615
616 shiftCount = countLeadingZeros64( aSig );
617 *zSigPtr = aSig<<shiftCount;
618 *zExpPtr = 1 - shiftCount;
619
620 }
621
622 /*----------------------------------------------------------------------------
623 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
624 | extended double-precision floating-point value, returning the result.
625 *----------------------------------------------------------------------------*/
626
627 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
628 {
629 floatx80 z;
630
631 z.low = zSig;
632 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
633 return z;
634
635 }
636
637 /*----------------------------------------------------------------------------
638 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
639 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
640 | and returns the proper extended double-precision floating-point value
641 | corresponding to the abstract input. Ordinarily, the abstract value is
642 | rounded and packed into the extended double-precision format, with the
643 | inexact exception raised if the abstract input cannot be represented
644 | exactly. However, if the abstract value is too large, the overflow and
645 | inexact exceptions are raised and an infinity or maximal finite value is
646 | returned. If the abstract value is too small, the input value is rounded to
647 | a subnormal number, and the underflow and inexact exceptions are raised if
648 | the abstract input cannot be represented exactly as a subnormal extended
649 | double-precision floating-point number.
650 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
651 | number of bits as single or double precision, respectively. Otherwise, the
652 | result is rounded to the full precision of the extended double-precision
653 | format.
654 | The input significand must be normalized or smaller. If the input
655 | significand is not normalized, `zExp' must be 0; in that case, the result
656 | returned is a subnormal number, and it must not require rounding. The
657 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
658 | Floating-Point Arithmetic.
659 *----------------------------------------------------------------------------*/
660
661 static floatx80
662 roundAndPackFloatx80(
663 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
664 )
665 {
666 int8 roundingMode;
667 flag roundNearestEven, increment, isTiny;
668 int64 roundIncrement, roundMask, roundBits;
669
670 roundingMode = float_rounding_mode();
671 roundNearestEven = ( roundingMode == float_round_nearest_even );
672 if ( roundingPrecision == 80 ) goto precision80;
673 if ( roundingPrecision == 64 ) {
674 roundIncrement = LIT64( 0x0000000000000400 );
675 roundMask = LIT64( 0x00000000000007FF );
676 }
677 else if ( roundingPrecision == 32 ) {
678 roundIncrement = LIT64( 0x0000008000000000 );
679 roundMask = LIT64( 0x000000FFFFFFFFFF );
680 }
681 else {
682 goto precision80;
683 }
684 zSig0 |= ( zSig1 != 0 );
685 if ( ! roundNearestEven ) {
686 if ( roundingMode == float_round_to_zero ) {
687 roundIncrement = 0;
688 }
689 else {
690 roundIncrement = roundMask;
691 if ( zSign ) {
692 if ( roundingMode == float_round_up ) roundIncrement = 0;
693 }
694 else {
695 if ( roundingMode == float_round_down ) roundIncrement = 0;
696 }
697 }
698 }
699 roundBits = zSig0 & roundMask;
700 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
701 if ( ( 0x7FFE < zExp )
702 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
703 ) {
704 goto overflow;
705 }
706 if ( zExp <= 0 ) {
707 isTiny =
708 ( float_detect_tininess == float_tininess_before_rounding )
709 || ( zExp < 0 )
710 || ( zSig0 <= zSig0 + roundIncrement );
711 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
712 zExp = 0;
713 roundBits = zSig0 & roundMask;
714 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
715 if ( roundBits ) float_set_inexact();
716 zSig0 += roundIncrement;
717 if ( (sbits64) zSig0 < 0 ) zExp = 1;
718 roundIncrement = roundMask + 1;
719 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
720 roundMask |= roundIncrement;
721 }
722 zSig0 &= ~ roundMask;
723 return packFloatx80( zSign, zExp, zSig0 );
724 }
725 }
726 if ( roundBits ) float_set_inexact();
727 zSig0 += roundIncrement;
728 if ( zSig0 < roundIncrement ) {
729 ++zExp;
730 zSig0 = LIT64( 0x8000000000000000 );
731 }
732 roundIncrement = roundMask + 1;
733 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
734 roundMask |= roundIncrement;
735 }
736 zSig0 &= ~ roundMask;
737 if ( zSig0 == 0 ) zExp = 0;
738 return packFloatx80( zSign, zExp, zSig0 );
739 precision80:
740 increment = ( (sbits64) zSig1 < 0 );
741 if ( ! roundNearestEven ) {
742 if ( roundingMode == float_round_to_zero ) {
743 increment = 0;
744 }
745 else {
746 if ( zSign ) {
747 increment = ( roundingMode == float_round_down ) && zSig1;
748 }
749 else {
750 increment = ( roundingMode == float_round_up ) && zSig1;
751 }
752 }
753 }
754 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
755 if ( ( 0x7FFE < zExp )
756 || ( ( zExp == 0x7FFE )
757 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
758 && increment
759 )
760 ) {
761 roundMask = 0;
762 overflow:
763 float_raise( float_flag_overflow | float_flag_inexact );
764 if ( ( roundingMode == float_round_to_zero )
765 || ( zSign && ( roundingMode == float_round_up ) )
766 || ( ! zSign && ( roundingMode == float_round_down ) )
767 ) {
768 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
769 }
770 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
771 }
772 if ( zExp <= 0 ) {
773 isTiny =
774 ( float_detect_tininess == float_tininess_before_rounding )
775 || ( zExp < 0 )
776 || ! increment
777 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
778 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
779 zExp = 0;
780 if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
781 if ( zSig1 ) float_set_inexact();
782 if ( roundNearestEven ) {
783 increment = ( (sbits64) zSig1 < 0 );
784 }
785 else {
786 if ( zSign ) {
787 increment = ( roundingMode == float_round_down ) && zSig1;
788 }
789 else {
790 increment = ( roundingMode == float_round_up ) && zSig1;
791 }
792 }
793 if ( increment ) {
794 ++zSig0;
795 zSig0 &=
796 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
797 if ( (sbits64) zSig0 < 0 ) zExp = 1;
798 }
799 return packFloatx80( zSign, zExp, zSig0 );
800 }
801 }
802 if ( zSig1 ) float_set_inexact();
803 if ( increment ) {
804 ++zSig0;
805 if ( zSig0 == 0 ) {
806 ++zExp;
807 zSig0 = LIT64( 0x8000000000000000 );
808 }
809 else {
810 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
811 }
812 }
813 else {
814 if ( zSig0 == 0 ) zExp = 0;
815 }
816 return packFloatx80( zSign, zExp, zSig0 );
817
818 }
819
820 /*----------------------------------------------------------------------------
821 | Takes an abstract floating-point value having sign `zSign', exponent
822 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
823 | and returns the proper extended double-precision floating-point value
824 | corresponding to the abstract input. This routine is just like
825 | `roundAndPackFloatx80' except that the input significand does not have to be
826 | normalized.
827 *----------------------------------------------------------------------------*/
828
829 static floatx80
830 normalizeRoundAndPackFloatx80(
831 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
832 )
833 {
834 int8 shiftCount;
835
836 if ( zSig0 == 0 ) {
837 zSig0 = zSig1;
838 zSig1 = 0;
839 zExp -= 64;
840 }
841 shiftCount = countLeadingZeros64( zSig0 );
842 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
843 zExp -= shiftCount;
844 return
845 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
846
847 }
848
849 #endif
850
851 #ifdef FLOAT128
852
853 /*----------------------------------------------------------------------------
854 | Returns the least-significant 64 fraction bits of the quadruple-precision
855 | floating-point value `a'.
856 *----------------------------------------------------------------------------*/
857
858 INLINE bits64 extractFloat128Frac1( float128 a )
859 {
860
861 return a.low;
862
863 }
864
865 /*----------------------------------------------------------------------------
866 | Returns the most-significant 48 fraction bits of the quadruple-precision
867 | floating-point value `a'.
868 *----------------------------------------------------------------------------*/
869
870 INLINE bits64 extractFloat128Frac0( float128 a )
871 {
872
873 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
874
875 }
876
877 /*----------------------------------------------------------------------------
878 | Returns the exponent bits of the quadruple-precision floating-point value
879 | `a'.
880 *----------------------------------------------------------------------------*/
881
882 INLINE int32 extractFloat128Exp( float128 a )
883 {
884
885 return ( a.high>>48 ) & 0x7FFF;
886
887 }
888
889
890 /*----------------------------------------------------------------------------
891 | Returns the sign bit of the quadruple-precision floating-point value `a'.
892 *----------------------------------------------------------------------------*/
893
894 INLINE flag extractFloat128Sign( float128 a )
895 {
896
897 return a.high>>63;
898
899 }
900
901 /*----------------------------------------------------------------------------
902 | Normalizes the subnormal quadruple-precision floating-point value
903 | represented by the denormalized significand formed by the concatenation of
904 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
905 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
906 | significand are stored at the location pointed to by `zSig0Ptr', and the
907 | least significant 64 bits of the normalized significand are stored at the
908 | location pointed to by `zSig1Ptr'.
909 *----------------------------------------------------------------------------*/
910
911 static void
912 normalizeFloat128Subnormal(
913 bits64 aSig0,
914 bits64 aSig1,
915 int32 *zExpPtr,
916 bits64 *zSig0Ptr,
917 bits64 *zSig1Ptr
918 )
919 {
920 int8 shiftCount;
921
922 if ( aSig0 == 0 ) {
923 shiftCount = countLeadingZeros64( aSig1 ) - 15;
924 if ( shiftCount < 0 ) {
925 *zSig0Ptr = aSig1>>( - shiftCount );
926 *zSig1Ptr = aSig1<<( shiftCount & 63 );
927 }
928 else {
929 *zSig0Ptr = aSig1<<shiftCount;
930 *zSig1Ptr = 0;
931 }
932 *zExpPtr = - shiftCount - 63;
933 }
934 else {
935 shiftCount = countLeadingZeros64( aSig0 ) - 15;
936 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
937 *zExpPtr = 1 - shiftCount;
938 }
939
940 }
941
942 /*----------------------------------------------------------------------------
943 | Packs the sign `zSign', the exponent `zExp', and the significand formed
944 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
945 | floating-point value, returning the result. After being shifted into the
946 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
947 | added together to form the most significant 32 bits of the result. This
948 | means that any integer portion of `zSig0' will be added into the exponent.
949 | Since a properly normalized significand will have an integer portion equal
950 | to 1, the `zExp' input should be 1 less than the desired result exponent
951 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
952 | significand.
953 *----------------------------------------------------------------------------*/
954
955 INLINE float128
956 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
957 {
958 float128 z;
959
960 z.low = zSig1;
961 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
962 return z;
963
964 }
965
966 /*----------------------------------------------------------------------------
967 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
968 | and extended significand formed by the concatenation of `zSig0', `zSig1',
969 | and `zSig2', and returns the proper quadruple-precision floating-point value
970 | corresponding to the abstract input. Ordinarily, the abstract value is
971 | simply rounded and packed into the quadruple-precision format, with the
972 | inexact exception raised if the abstract input cannot be represented
973 | exactly. However, if the abstract value is too large, the overflow and
974 | inexact exceptions are raised and an infinity or maximal finite value is
975 | returned. If the abstract value is too small, the input value is rounded to
976 | a subnormal number, and the underflow and inexact exceptions are raised if
977 | the abstract input cannot be represented exactly as a subnormal quadruple-
978 | precision floating-point number.
979 | The input significand must be normalized or smaller. If the input
980 | significand is not normalized, `zExp' must be 0; in that case, the result
981 | returned is a subnormal number, and it must not require rounding. In the
982 | usual case that the input significand is normalized, `zExp' must be 1 less
983 | than the ``true'' floating-point exponent. The handling of underflow and
984 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
985 *----------------------------------------------------------------------------*/
986
987 static float128
988 roundAndPackFloat128(
989 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
990 {
991 int8 roundingMode;
992 flag roundNearestEven, increment, isTiny;
993
994 roundingMode = float_rounding_mode();
995 roundNearestEven = ( roundingMode == float_round_nearest_even );
996 increment = ( (sbits64) zSig2 < 0 );
997 if ( ! roundNearestEven ) {
998 if ( roundingMode == float_round_to_zero ) {
999 increment = 0;
1000 }
1001 else {
1002 if ( zSign ) {
1003 increment = ( roundingMode == float_round_down ) && zSig2;
1004 }
1005 else {
1006 increment = ( roundingMode == float_round_up ) && zSig2;
1007 }
1008 }
1009 }
1010 if ( 0x7FFD <= (bits32) zExp ) {
1011 if ( ( 0x7FFD < zExp )
1012 || ( ( zExp == 0x7FFD )
1013 && eq128(
1014 LIT64( 0x0001FFFFFFFFFFFF ),
1015 LIT64( 0xFFFFFFFFFFFFFFFF ),
1016 zSig0,
1017 zSig1
1018 )
1019 && increment
1020 )
1021 ) {
1022 float_raise( float_flag_overflow | float_flag_inexact );
1023 if ( ( roundingMode == float_round_to_zero )
1024 || ( zSign && ( roundingMode == float_round_up ) )
1025 || ( ! zSign && ( roundingMode == float_round_down ) )
1026 ) {
1027 return
1028 packFloat128(
1029 zSign,
1030 0x7FFE,
1031 LIT64( 0x0000FFFFFFFFFFFF ),
1032 LIT64( 0xFFFFFFFFFFFFFFFF )
1033 );
1034 }
1035 return packFloat128( zSign, 0x7FFF, 0, 0 );
1036 }
1037 if ( zExp < 0 ) {
1038 isTiny =
1039 ( float_detect_tininess == float_tininess_before_rounding )
1040 || ( zExp < -1 )
1041 || ! increment
1042 || lt128(
1043 zSig0,
1044 zSig1,
1045 LIT64( 0x0001FFFFFFFFFFFF ),
1046 LIT64( 0xFFFFFFFFFFFFFFFF )
1047 );
1048 shift128ExtraRightJamming(
1049 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1050 zExp = 0;
1051 if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1052 if ( roundNearestEven ) {
1053 increment = ( (sbits64) zSig2 < 0 );
1054 }
1055 else {
1056 if ( zSign ) {
1057 increment = ( roundingMode == float_round_down ) && zSig2;
1058 }
1059 else {
1060 increment = ( roundingMode == float_round_up ) && zSig2;
1061 }
1062 }
1063 }
1064 }
1065 if ( zSig2 ) float_set_inexact();
1066 if ( increment ) {
1067 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1068 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1069 }
1070 else {
1071 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1072 }
1073 return packFloat128( zSign, zExp, zSig0, zSig1 );
1074
1075 }
1076
1077 /*----------------------------------------------------------------------------
1078 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1079 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1080 | returns the proper quadruple-precision floating-point value corresponding
1081 | to the abstract input. This routine is just like `roundAndPackFloat128'
1082 | except that the input significand has fewer bits and does not have to be
1083 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1084 | point exponent.
1085 *----------------------------------------------------------------------------*/
1086
1087 static float128
1088 normalizeRoundAndPackFloat128(
1089 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1090 {
1091 int8 shiftCount;
1092 bits64 zSig2;
1093
1094 if ( zSig0 == 0 ) {
1095 zSig0 = zSig1;
1096 zSig1 = 0;
1097 zExp -= 64;
1098 }
1099 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1100 if ( 0 <= shiftCount ) {
1101 zSig2 = 0;
1102 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1103 }
1104 else {
1105 shift128ExtraRightJamming(
1106 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1107 }
1108 zExp -= shiftCount;
1109 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1110
1111 }
1112
1113 #endif
1114
1115 /*----------------------------------------------------------------------------
1116 | Returns the result of converting the 32-bit two's complement integer `a'
1117 | to the single-precision floating-point format. The conversion is performed
1118 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1119 *----------------------------------------------------------------------------*/
1120
1121 float32 int32_to_float32( int32 a )
1122 {
1123 flag zSign;
1124
1125 if ( a == 0 ) return 0;
1126 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1127 zSign = ( a < 0 );
1128 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1129
1130 }
1131
1132 /*----------------------------------------------------------------------------
1133 | Returns the result of converting the 32-bit two's complement integer `a'
1134 | to the double-precision floating-point format. The conversion is performed
1135 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1136 *----------------------------------------------------------------------------*/
1137
1138 float64 int32_to_float64( int32 a )
1139 {
1140 flag zSign;
1141 uint32 absA;
1142 int8 shiftCount;
1143 bits64 zSig;
1144
1145 if ( a == 0 ) return 0;
1146 zSign = ( a < 0 );
1147 absA = zSign ? - a : a;
1148 shiftCount = countLeadingZeros32( absA ) + 21;
1149 zSig = absA;
1150 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1151
1152 }
1153
1154 #ifdef FLOATX80
1155
1156 /*----------------------------------------------------------------------------
1157 | Returns the result of converting the 32-bit two's complement integer `a'
1158 | to the extended double-precision floating-point format. The conversion
1159 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1160 | Arithmetic.
1161 *----------------------------------------------------------------------------*/
1162
1163 floatx80 int32_to_floatx80( int32 a )
1164 {
1165 flag zSign;
1166 uint32 absA;
1167 int8 shiftCount;
1168 bits64 zSig;
1169
1170 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1171 zSign = ( a < 0 );
1172 absA = zSign ? - a : a;
1173 shiftCount = countLeadingZeros32( absA ) + 32;
1174 zSig = absA;
1175 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1176
1177 }
1178
1179 #endif
1180
1181 #ifdef FLOAT128
1182
1183 /*----------------------------------------------------------------------------
1184 | Returns the result of converting the 32-bit two's complement integer `a' to
1185 | the quadruple-precision floating-point format. The conversion is performed
1186 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1187 *----------------------------------------------------------------------------*/
1188
1189 float128 int32_to_float128( int32 a )
1190 {
1191 flag zSign;
1192 uint32 absA;
1193 int8 shiftCount;
1194 bits64 zSig0;
1195
1196 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1197 zSign = ( a < 0 );
1198 absA = zSign ? - a : a;
1199 shiftCount = countLeadingZeros32( absA ) + 17;
1200 zSig0 = absA;
1201 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1202
1203 }
1204
1205 #endif
1206
1207 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1208 /*----------------------------------------------------------------------------
1209 | Returns the result of converting the 64-bit two's complement integer `a'
1210 | to the single-precision floating-point format. The conversion is performed
1211 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1212 *----------------------------------------------------------------------------*/
1213
1214 float32 int64_to_float32( int64 a )
1215 {
1216 flag zSign;
1217 uint64 absA;
1218 int8 shiftCount;
1219
1220 if ( a == 0 ) return 0;
1221 zSign = ( a < 0 );
1222 absA = zSign ? - a : a;
1223 shiftCount = countLeadingZeros64( absA ) - 40;
1224 if ( 0 <= shiftCount ) {
1225 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1226 }
1227 else {
1228 shiftCount += 7;
1229 if ( shiftCount < 0 ) {
1230 shift64RightJamming( absA, - shiftCount, &absA );
1231 }
1232 else {
1233 absA <<= shiftCount;
1234 }
1235 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1236 }
1237
1238 }
1239
1240 /*----------------------------------------------------------------------------
1241 | Returns the result of converting the 64-bit two's complement integer `a'
1242 | to the double-precision floating-point format. The conversion is performed
1243 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1244 *----------------------------------------------------------------------------*/
1245
1246 float64 int64_to_float64( int64 a )
1247 {
1248 flag zSign;
1249
1250 if ( a == 0 ) return 0;
1251 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1252 return packFloat64( 1, 0x43E, 0 );
1253 }
1254 zSign = ( a < 0 );
1255 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1256
1257 }
1258
1259 #ifdef FLOATX80
1260
1261 /*----------------------------------------------------------------------------
1262 | Returns the result of converting the 64-bit two's complement integer `a'
1263 | to the extended double-precision floating-point format. The conversion
1264 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1265 | Arithmetic.
1266 *----------------------------------------------------------------------------*/
1267
1268 floatx80 int64_to_floatx80( int64 a )
1269 {
1270 flag zSign;
1271 uint64 absA;
1272 int8 shiftCount;
1273
1274 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1275 zSign = ( a < 0 );
1276 absA = zSign ? - a : a;
1277 shiftCount = countLeadingZeros64( absA );
1278 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1279
1280 }
1281
1282 #endif
1283
1284 #ifdef FLOAT128
1285
1286 /*----------------------------------------------------------------------------
1287 | Returns the result of converting the 64-bit two's complement integer `a' to
1288 | the quadruple-precision floating-point format. The conversion is performed
1289 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1290 *----------------------------------------------------------------------------*/
1291
1292 float128 int64_to_float128( int64 a )
1293 {
1294 flag zSign;
1295 uint64 absA;
1296 int8 shiftCount;
1297 int32 zExp;
1298 bits64 zSig0, zSig1;
1299
1300 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1301 zSign = ( a < 0 );
1302 absA = zSign ? - a : a;
1303 shiftCount = countLeadingZeros64( absA ) + 49;
1304 zExp = 0x406E - shiftCount;
1305 if ( 64 <= shiftCount ) {
1306 zSig1 = 0;
1307 zSig0 = absA;
1308 shiftCount -= 64;
1309 }
1310 else {
1311 zSig1 = absA;
1312 zSig0 = 0;
1313 }
1314 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1315 return packFloat128( zSign, zExp, zSig0, zSig1 );
1316
1317 }
1318
1319 #endif
1320 #endif /* !SOFTFLOAT_FOR_GCC */
1321
1322 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1323 /*----------------------------------------------------------------------------
1324 | Returns the result of converting the single-precision floating-point value
1325 | `a' to the 32-bit two's complement integer format. The conversion is
1326 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1327 | Arithmetic---which means in particular that the conversion is rounded
1328 | according to the current rounding mode. If `a' is a NaN, the largest
1329 | positive integer is returned. Otherwise, if the conversion overflows, the
1330 | largest integer with the same sign as `a' is returned.
1331 *----------------------------------------------------------------------------*/
1332
1333 int32 float32_to_int32( float32 a )
1334 {
1335 flag aSign;
1336 int16 aExp, shiftCount;
1337 bits32 aSig;
1338 bits64 aSig64;
1339
1340 aSig = extractFloat32Frac( a );
1341 aExp = extractFloat32Exp( a );
1342 aSign = extractFloat32Sign( a );
1343 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1344 if ( aExp ) aSig |= 0x00800000;
1345 shiftCount = 0xAF - aExp;
1346 aSig64 = aSig;
1347 aSig64 <<= 32;
1348 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1349 return roundAndPackInt32( aSign, aSig64 );
1350
1351 }
1352 #endif /* !SOFTFLOAT_FOR_GCC */
1353
1354 /*----------------------------------------------------------------------------
1355 | Returns the result of converting the single-precision floating-point value
1356 | `a' to the 32-bit two's complement integer format. The conversion is
1357 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1358 | Arithmetic, except that the conversion is always rounded toward zero.
1359 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1360 | the conversion overflows, the largest integer with the same sign as `a' is
1361 | returned.
1362 *----------------------------------------------------------------------------*/
1363
1364 int32 float32_to_int32_round_to_zero( float32 a )
1365 {
1366 flag aSign;
1367 int16 aExp, shiftCount;
1368 bits32 aSig;
1369 int32 z;
1370
1371 aSig = extractFloat32Frac( a );
1372 aExp = extractFloat32Exp( a );
1373 aSign = extractFloat32Sign( a );
1374 shiftCount = aExp - 0x9E;
1375 if ( 0 <= shiftCount ) {
1376 if ( a != 0xCF000000 ) {
1377 float_raise( float_flag_invalid );
1378 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1379 }
1380 return (sbits32) 0x80000000;
1381 }
1382 else if ( aExp <= 0x7E ) {
1383 if ( aExp | aSig ) float_set_inexact();
1384 return 0;
1385 }
1386 aSig = ( aSig | 0x00800000 )<<8;
1387 z = aSig>>( - shiftCount );
1388 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1389 float_set_inexact();
1390 }
1391 if ( aSign ) z = - z;
1392 return z;
1393
1394 }
1395
1396 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1397 /*----------------------------------------------------------------------------
1398 | Returns the result of converting the single-precision floating-point value
1399 | `a' to the 64-bit two's complement integer format. The conversion is
1400 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1401 | Arithmetic---which means in particular that the conversion is rounded
1402 | according to the current rounding mode. If `a' is a NaN, the largest
1403 | positive integer is returned. Otherwise, if the conversion overflows, the
1404 | largest integer with the same sign as `a' is returned.
1405 *----------------------------------------------------------------------------*/
1406
1407 int64 float32_to_int64( float32 a )
1408 {
1409 flag aSign;
1410 int16 aExp, shiftCount;
1411 bits32 aSig;
1412 bits64 aSig64, aSigExtra;
1413
1414 aSig = extractFloat32Frac( a );
1415 aExp = extractFloat32Exp( a );
1416 aSign = extractFloat32Sign( a );
1417 shiftCount = 0xBE - aExp;
1418 if ( shiftCount < 0 ) {
1419 float_raise( float_flag_invalid );
1420 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1421 return LIT64( 0x7FFFFFFFFFFFFFFF );
1422 }
1423 return (sbits64) LIT64( 0x8000000000000000 );
1424 }
1425 if ( aExp ) aSig |= 0x00800000;
1426 aSig64 = aSig;
1427 aSig64 <<= 40;
1428 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1429 return roundAndPackInt64( aSign, aSig64, aSigExtra );
1430
1431 }
1432
1433 /*----------------------------------------------------------------------------
1434 | Returns the result of converting the single-precision floating-point value
1435 | `a' to the 64-bit two's complement integer format. The conversion is
1436 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1437 | Arithmetic, except that the conversion is always rounded toward zero. If
1438 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1439 | conversion overflows, the largest integer with the same sign as `a' is
1440 | returned.
1441 *----------------------------------------------------------------------------*/
1442
1443 int64 float32_to_int64_round_to_zero( float32 a )
1444 {
1445 flag aSign;
1446 int16 aExp, shiftCount;
1447 bits32 aSig;
1448 bits64 aSig64;
1449 int64 z;
1450
1451 aSig = extractFloat32Frac( a );
1452 aExp = extractFloat32Exp( a );
1453 aSign = extractFloat32Sign( a );
1454 shiftCount = aExp - 0xBE;
1455 if ( 0 <= shiftCount ) {
1456 if ( a != 0xDF000000 ) {
1457 float_raise( float_flag_invalid );
1458 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1459 return LIT64( 0x7FFFFFFFFFFFFFFF );
1460 }
1461 }
1462 return (sbits64) LIT64( 0x8000000000000000 );
1463 }
1464 else if ( aExp <= 0x7E ) {
1465 if ( aExp | aSig ) float_set_inexact();
1466 return 0;
1467 }
1468 aSig64 = aSig | 0x00800000;
1469 aSig64 <<= 40;
1470 z = aSig64>>( - shiftCount );
1471 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1472 float_set_inexact();
1473 }
1474 if ( aSign ) z = - z;
1475 return z;
1476
1477 }
1478 #endif /* !SOFTFLOAT_FOR_GCC */
1479
1480 /*----------------------------------------------------------------------------
1481 | Returns the result of converting the single-precision floating-point value
1482 | `a' to the double-precision floating-point format. The conversion is
1483 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1484 | Arithmetic.
1485 *----------------------------------------------------------------------------*/
1486
1487 float64 float32_to_float64( float32 a )
1488 {
1489 flag aSign;
1490 int16 aExp;
1491 bits32 aSig;
1492
1493 aSig = extractFloat32Frac( a );
1494 aExp = extractFloat32Exp( a );
1495 aSign = extractFloat32Sign( a );
1496 if ( aExp == 0xFF ) {
1497 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1498 return packFloat64( aSign, 0x7FF, 0 );
1499 }
1500 if ( aExp == 0 ) {
1501 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1502 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1503 --aExp;
1504 }
1505 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1506
1507 }
1508
1509 #ifdef FLOATX80
1510
1511 /*----------------------------------------------------------------------------
1512 | Returns the result of converting the single-precision floating-point value
1513 | `a' to the extended double-precision floating-point format. The conversion
1514 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1515 | Arithmetic.
1516 *----------------------------------------------------------------------------*/
1517
1518 floatx80 float32_to_floatx80( float32 a )
1519 {
1520 flag aSign;
1521 int16 aExp;
1522 bits32 aSig;
1523
1524 aSig = extractFloat32Frac( a );
1525 aExp = extractFloat32Exp( a );
1526 aSign = extractFloat32Sign( a );
1527 if ( aExp == 0xFF ) {
1528 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1529 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1530 }
1531 if ( aExp == 0 ) {
1532 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1533 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1534 }
1535 aSig |= 0x00800000;
1536 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1537
1538 }
1539
1540 #endif
1541
1542 #ifdef FLOAT128
1543
1544 /*----------------------------------------------------------------------------
1545 | Returns the result of converting the single-precision floating-point value
1546 | `a' to the double-precision floating-point format. The conversion is
1547 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1548 | Arithmetic.
1549 *----------------------------------------------------------------------------*/
1550
1551 float128 float32_to_float128( float32 a )
1552 {
1553 flag aSign;
1554 int16 aExp;
1555 bits32 aSig;
1556
1557 aSig = extractFloat32Frac( a );
1558 aExp = extractFloat32Exp( a );
1559 aSign = extractFloat32Sign( a );
1560 if ( aExp == 0xFF ) {
1561 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1562 return packFloat128( aSign, 0x7FFF, 0, 0 );
1563 }
1564 if ( aExp == 0 ) {
1565 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1566 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1567 --aExp;
1568 }
1569 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1570
1571 }
1572
1573 #endif
1574
1575 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1576 /*----------------------------------------------------------------------------
1577 | Rounds the single-precision floating-point value `a' to an integer, and
1578 | returns the result as a single-precision floating-point value. The
1579 | operation is performed according to the IEC/IEEE Standard for Binary
1580 | Floating-Point Arithmetic.
1581 *----------------------------------------------------------------------------*/
1582
1583 float32 float32_round_to_int( float32 a )
1584 {
1585 flag aSign;
1586 int16 aExp;
1587 bits32 lastBitMask, roundBitsMask;
1588 int8 roundingMode;
1589 float32 z;
1590
1591 aExp = extractFloat32Exp( a );
1592 if ( 0x96 <= aExp ) {
1593 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1594 return propagateFloat32NaN( a, a );
1595 }
1596 return a;
1597 }
1598 if ( aExp <= 0x7E ) {
1599 if ( (bits32) ( a<<1 ) == 0 ) return a;
1600 float_set_inexact();
1601 aSign = extractFloat32Sign( a );
1602 switch ( float_rounding_mode() ) {
1603 case float_round_nearest_even:
1604 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1605 return packFloat32( aSign, 0x7F, 0 );
1606 }
1607 break;
1608 case float_round_down:
1609 return aSign ? 0xBF800000 : 0;
1610 case float_round_up:
1611 return aSign ? 0x80000000 : 0x3F800000;
1612 }
1613 return packFloat32( aSign, 0, 0 );
1614 }
1615 lastBitMask = 1;
1616 lastBitMask <<= 0x96 - aExp;
1617 roundBitsMask = lastBitMask - 1;
1618 z = a;
1619 roundingMode = float_rounding_mode();
1620 if ( roundingMode == float_round_nearest_even ) {
1621 z += lastBitMask>>1;
1622 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1623 }
1624 else if ( roundingMode != float_round_to_zero ) {
1625 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1626 z += roundBitsMask;
1627 }
1628 }
1629 z &= ~ roundBitsMask;
1630 if ( z != a ) float_set_inexact();
1631 return z;
1632
1633 }
1634 #endif /* !SOFTFLOAT_FOR_GCC */
1635
1636 /*----------------------------------------------------------------------------
1637 | Returns the result of adding the absolute values of the single-precision
1638 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1639 | before being returned. `zSign' is ignored if the result is a NaN.
1640 | The addition is performed according to the IEC/IEEE Standard for Binary
1641 | Floating-Point Arithmetic.
1642 *----------------------------------------------------------------------------*/
1643
1644 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1645 {
1646 int16 aExp, bExp, zExp;
1647 bits32 aSig, bSig, zSig;
1648 int16 expDiff;
1649
1650 aSig = extractFloat32Frac( a );
1651 aExp = extractFloat32Exp( a );
1652 bSig = extractFloat32Frac( b );
1653 bExp = extractFloat32Exp( b );
1654 expDiff = aExp - bExp;
1655 aSig <<= 6;
1656 bSig <<= 6;
1657 if ( 0 < expDiff ) {
1658 if ( aExp == 0xFF ) {
1659 if ( aSig ) return propagateFloat32NaN( a, b );
1660 return a;
1661 }
1662 if ( bExp == 0 ) {
1663 --expDiff;
1664 }
1665 else {
1666 bSig |= 0x20000000;
1667 }
1668 shift32RightJamming( bSig, expDiff, &bSig );
1669 zExp = aExp;
1670 }
1671 else if ( expDiff < 0 ) {
1672 if ( bExp == 0xFF ) {
1673 if ( bSig ) return propagateFloat32NaN( a, b );
1674 return packFloat32( zSign, 0xFF, 0 );
1675 }
1676 if ( aExp == 0 ) {
1677 ++expDiff;
1678 }
1679 else {
1680 aSig |= 0x20000000;
1681 }
1682 shift32RightJamming( aSig, - expDiff, &aSig );
1683 zExp = bExp;
1684 }
1685 else {
1686 if ( aExp == 0xFF ) {
1687 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1688 return a;
1689 }
1690 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1691 zSig = 0x40000000 + aSig + bSig;
1692 zExp = aExp;
1693 goto roundAndPack;
1694 }
1695 aSig |= 0x20000000;
1696 zSig = ( aSig + bSig )<<1;
1697 --zExp;
1698 if ( (sbits32) zSig < 0 ) {
1699 zSig = aSig + bSig;
1700 ++zExp;
1701 }
1702 roundAndPack:
1703 return roundAndPackFloat32( zSign, zExp, zSig );
1704
1705 }
1706
1707 /*----------------------------------------------------------------------------
1708 | Returns the result of subtracting the absolute values of the single-
1709 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1710 | difference is negated before being returned. `zSign' is ignored if the
1711 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1712 | Standard for Binary Floating-Point Arithmetic.
1713 *----------------------------------------------------------------------------*/
1714
1715 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1716 {
1717 int16 aExp, bExp, zExp;
1718 bits32 aSig, bSig, zSig;
1719 int16 expDiff;
1720
1721 aSig = extractFloat32Frac( a );
1722 aExp = extractFloat32Exp( a );
1723 bSig = extractFloat32Frac( b );
1724 bExp = extractFloat32Exp( b );
1725 expDiff = aExp - bExp;
1726 aSig <<= 7;
1727 bSig <<= 7;
1728 if ( 0 < expDiff ) goto aExpBigger;
1729 if ( expDiff < 0 ) goto bExpBigger;
1730 if ( aExp == 0xFF ) {
1731 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1732 float_raise( float_flag_invalid );
1733 return float32_default_nan;
1734 }
1735 if ( aExp == 0 ) {
1736 aExp = 1;
1737 bExp = 1;
1738 }
1739 if ( bSig < aSig ) goto aBigger;
1740 if ( aSig < bSig ) goto bBigger;
1741 return packFloat32( float_rounding_mode() == float_round_down, 0, 0 );
1742 bExpBigger:
1743 if ( bExp == 0xFF ) {
1744 if ( bSig ) return propagateFloat32NaN( a, b );
1745 return packFloat32( zSign ^ 1, 0xFF, 0 );
1746 }
1747 if ( aExp == 0 ) {
1748 ++expDiff;
1749 }
1750 else {
1751 aSig |= 0x40000000;
1752 }
1753 shift32RightJamming( aSig, - expDiff, &aSig );
1754 bSig |= 0x40000000;
1755 bBigger:
1756 zSig = bSig - aSig;
1757 zExp = bExp;
1758 zSign ^= 1;
1759 goto normalizeRoundAndPack;
1760 aExpBigger:
1761 if ( aExp == 0xFF ) {
1762 if ( aSig ) return propagateFloat32NaN( a, b );
1763 return a;
1764 }
1765 if ( bExp == 0 ) {
1766 --expDiff;
1767 }
1768 else {
1769 bSig |= 0x40000000;
1770 }
1771 shift32RightJamming( bSig, expDiff, &bSig );
1772 aSig |= 0x40000000;
1773 aBigger:
1774 zSig = aSig - bSig;
1775 zExp = aExp;
1776 normalizeRoundAndPack:
1777 --zExp;
1778 return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1779
1780 }
1781
1782 /*----------------------------------------------------------------------------
1783 | Returns the result of adding the single-precision floating-point values `a'
1784 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1785 | Binary Floating-Point Arithmetic.
1786 *----------------------------------------------------------------------------*/
1787
1788 float32 float32_add( float32 a, float32 b )
1789 {
1790 flag aSign, bSign;
1791
1792 aSign = extractFloat32Sign( a );
1793 bSign = extractFloat32Sign( b );
1794 if ( aSign == bSign ) {
1795 return addFloat32Sigs( a, b, aSign );
1796 }
1797 else {
1798 return subFloat32Sigs( a, b, aSign );
1799 }
1800
1801 }
1802
1803 /*----------------------------------------------------------------------------
1804 | Returns the result of subtracting the single-precision floating-point values
1805 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1806 | for Binary Floating-Point Arithmetic.
1807 *----------------------------------------------------------------------------*/
1808
1809 float32 float32_sub( float32 a, float32 b )
1810 {
1811 flag aSign, bSign;
1812
1813 aSign = extractFloat32Sign( a );
1814 bSign = extractFloat32Sign( b );
1815 if ( aSign == bSign ) {
1816 return subFloat32Sigs( a, b, aSign );
1817 }
1818 else {
1819 return addFloat32Sigs( a, b, aSign );
1820 }
1821
1822 }
1823
1824 /*----------------------------------------------------------------------------
1825 | Returns the result of multiplying the single-precision floating-point values
1826 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1827 | for Binary Floating-Point Arithmetic.
1828 *----------------------------------------------------------------------------*/
1829
1830 float32 float32_mul( float32 a, float32 b )
1831 {
1832 flag aSign, bSign, zSign;
1833 int16 aExp, bExp, zExp;
1834 bits32 aSig, bSig;
1835 bits64 zSig64;
1836 bits32 zSig;
1837
1838 aSig = extractFloat32Frac( a );
1839 aExp = extractFloat32Exp( a );
1840 aSign = extractFloat32Sign( a );
1841 bSig = extractFloat32Frac( b );
1842 bExp = extractFloat32Exp( b );
1843 bSign = extractFloat32Sign( b );
1844 zSign = aSign ^ bSign;
1845 if ( aExp == 0xFF ) {
1846 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1847 return propagateFloat32NaN( a, b );
1848 }
1849 if ( ( bExp | bSig ) == 0 ) {
1850 float_raise( float_flag_invalid );
1851 return float32_default_nan;
1852 }
1853 return packFloat32( zSign, 0xFF, 0 );
1854 }
1855 if ( bExp == 0xFF ) {
1856 if ( bSig ) return propagateFloat32NaN( a, b );
1857 if ( ( aExp | aSig ) == 0 ) {
1858 float_raise( float_flag_invalid );
1859 return float32_default_nan;
1860 }
1861 return packFloat32( zSign, 0xFF, 0 );
1862 }
1863 if ( aExp == 0 ) {
1864 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1865 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1866 }
1867 if ( bExp == 0 ) {
1868 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1869 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1870 }
1871 zExp = aExp + bExp - 0x7F;
1872 aSig = ( aSig | 0x00800000 )<<7;
1873 bSig = ( bSig | 0x00800000 )<<8;
1874 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1875 zSig = zSig64;
1876 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1877 zSig <<= 1;
1878 --zExp;
1879 }
1880 return roundAndPackFloat32( zSign, zExp, zSig );
1881
1882 }
1883
1884 /*----------------------------------------------------------------------------
1885 | Returns the result of dividing the single-precision floating-point value `a'
1886 | by the corresponding value `b'. The operation is performed according to the
1887 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1888 *----------------------------------------------------------------------------*/
1889
1890 float32 float32_div( float32 a, float32 b )
1891 {
1892 flag aSign, bSign, zSign;
1893 int16 aExp, bExp, zExp;
1894 bits32 aSig, bSig, zSig;
1895
1896 aSig = extractFloat32Frac( a );
1897 aExp = extractFloat32Exp( a );
1898 aSign = extractFloat32Sign( a );
1899 bSig = extractFloat32Frac( b );
1900 bExp = extractFloat32Exp( b );
1901 bSign = extractFloat32Sign( b );
1902 zSign = aSign ^ bSign;
1903 if ( aExp == 0xFF ) {
1904 if ( aSig ) return propagateFloat32NaN( a, b );
1905 if ( bExp == 0xFF ) {
1906 if ( bSig ) return propagateFloat32NaN( a, b );
1907 float_raise( float_flag_invalid );
1908 return float32_default_nan;
1909 }
1910 return packFloat32( zSign, 0xFF, 0 );
1911 }
1912 if ( bExp == 0xFF ) {
1913 if ( bSig ) return propagateFloat32NaN( a, b );
1914 return packFloat32( zSign, 0, 0 );
1915 }
1916 if ( bExp == 0 ) {
1917 if ( bSig == 0 ) {
1918 if ( ( aExp | aSig ) == 0 ) {
1919 float_raise( float_flag_invalid );
1920 return float32_default_nan;
1921 }
1922 float_raise( float_flag_divbyzero );
1923 return packFloat32( zSign, 0xFF, 0 );
1924 }
1925 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1926 }
1927 if ( aExp == 0 ) {
1928 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1929 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1930 }
1931 zExp = aExp - bExp + 0x7D;
1932 aSig = ( aSig | 0x00800000 )<<7;
1933 bSig = ( bSig | 0x00800000 )<<8;
1934 if ( bSig <= ( aSig + aSig ) ) {
1935 aSig >>= 1;
1936 ++zExp;
1937 }
1938 zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1939 if ( ( zSig & 0x3F ) == 0 ) {
1940 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1941 }
1942 return roundAndPackFloat32( zSign, zExp, zSig );
1943
1944 }
1945
1946 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1947 /*----------------------------------------------------------------------------
1948 | Returns the remainder of the single-precision floating-point value `a'
1949 | with respect to the corresponding value `b'. The operation is performed
1950 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1951 *----------------------------------------------------------------------------*/
1952
1953 float32 float32_rem( float32 a, float32 b )
1954 {
1955 flag aSign, bSign __unused, zSign;
1956 int16 aExp, bExp, expDiff;
1957 bits32 aSig, bSig;
1958 bits32 q;
1959 bits64 aSig64, bSig64, q64;
1960 bits32 alternateASig;
1961 sbits32 sigMean;
1962
1963 aSig = extractFloat32Frac( a );
1964 aExp = extractFloat32Exp( a );
1965 aSign = extractFloat32Sign( a );
1966 bSig = extractFloat32Frac( b );
1967 bExp = extractFloat32Exp( b );
1968 bSign = extractFloat32Sign( b );
1969 if ( aExp == 0xFF ) {
1970 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1971 return propagateFloat32NaN( a, b );
1972 }
1973 float_raise( float_flag_invalid );
1974 return float32_default_nan;
1975 }
1976 if ( bExp == 0xFF ) {
1977 if ( bSig ) return propagateFloat32NaN( a, b );
1978 return a;
1979 }
1980 if ( bExp == 0 ) {
1981 if ( bSig == 0 ) {
1982 float_raise( float_flag_invalid );
1983 return float32_default_nan;
1984 }
1985 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1986 }
1987 if ( aExp == 0 ) {
1988 if ( aSig == 0 ) return a;
1989 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1990 }
1991 expDiff = aExp - bExp;
1992 aSig |= 0x00800000;
1993 bSig |= 0x00800000;
1994 if ( expDiff < 32 ) {
1995 aSig <<= 8;
1996 bSig <<= 8;
1997 if ( expDiff < 0 ) {
1998 if ( expDiff < -1 ) return a;
1999 aSig >>= 1;
2000 }
2001 q = ( bSig <= aSig );
2002 if ( q ) aSig -= bSig;
2003 if ( 0 < expDiff ) {
2004 q = ( ( (bits64) aSig )<<32 ) / bSig;
2005 q >>= 32 - expDiff;
2006 bSig >>= 2;
2007 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2008 }
2009 else {
2010 aSig >>= 2;
2011 bSig >>= 2;
2012 }
2013 }
2014 else {
2015 if ( bSig <= aSig ) aSig -= bSig;
2016 aSig64 = ( (bits64) aSig )<<40;
2017 bSig64 = ( (bits64) bSig )<<40;
2018 expDiff -= 64;
2019 while ( 0 < expDiff ) {
2020 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2021 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2022 aSig64 = - ( ( bSig * q64 )<<38 );
2023 expDiff -= 62;
2024 }
2025 expDiff += 64;
2026 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2027 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2028 q = q64>>( 64 - expDiff );
2029 bSig <<= 6;
2030 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2031 }
2032 do {
2033 alternateASig = aSig;
2034 ++q;
2035 aSig -= bSig;
2036 } while ( 0 <= (sbits32) aSig );
2037 sigMean = aSig + alternateASig;
2038 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2039 aSig = alternateASig;
2040 }
2041 zSign = ( (sbits32) aSig < 0 );
2042 if ( zSign ) aSig = - aSig;
2043 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2044
2045 }
2046 #endif /* !SOFTFLOAT_FOR_GCC */
2047
2048 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2049
2050 /*----------------------------------------------------------------------------
2051 | Returns the square root of the single-precision floating-point value `a'.
2052 | The operation is performed according to the IEC/IEEE Standard for Binary
2053 | Floating-Point Arithmetic.
2054 *----------------------------------------------------------------------------*/
2055
2056 float32 float32_sqrt( float32 a )
2057 {
2058 flag aSign;
2059 int16 aExp, zExp;
2060 bits32 aSig, zSig;
2061 bits64 rem, term;
2062
2063 aSig = extractFloat32Frac( a );
2064 aExp = extractFloat32Exp( a );
2065 aSign = extractFloat32Sign( a );
2066 if ( aExp == 0xFF ) {
2067 if ( aSig ) return propagateFloat32NaN( a, 0 );
2068 if ( ! aSign ) return a;
2069 float_raise( float_flag_invalid );
2070 return float32_default_nan;
2071 }
2072 if ( aSign ) {
2073 if ( ( aExp | aSig ) == 0 ) return a;
2074 float_raise( float_flag_invalid );
2075 return float32_default_nan;
2076 }
2077 if ( aExp == 0 ) {
2078 if ( aSig == 0 ) return 0;
2079 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2080 }
2081 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2082 aSig = ( aSig | 0x00800000 )<<8;
2083 zSig = estimateSqrt32( aExp, aSig ) + 2;
2084 if ( ( zSig & 0x7F ) <= 5 ) {
2085 if ( zSig < 2 ) {
2086 zSig = 0x7FFFFFFF;
2087 goto roundAndPack;
2088 }
2089 aSig >>= aExp & 1;
2090 term = ( (bits64) zSig ) * zSig;
2091 rem = ( ( (bits64) aSig )<<32 ) - term;
2092 while ( (sbits64) rem < 0 ) {
2093 --zSig;
2094 rem += ( ( (bits64) zSig )<<1 ) | 1;
2095 }
2096 zSig |= ( rem != 0 );
2097 }
2098 shift32RightJamming( zSig, 1, &zSig );
2099 roundAndPack:
2100 return roundAndPackFloat32( 0, zExp, zSig );
2101
2102 }
2103 #endif /* !SOFTFLOAT_FOR_GCC */
2104
2105 /*----------------------------------------------------------------------------
2106 | Returns 1 if the single-precision floating-point value `a' is equal to
2107 | the corresponding value `b', and 0 otherwise. The comparison is performed
2108 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2109 *----------------------------------------------------------------------------*/
2110
2111 flag float32_eq( float32 a, float32 b )
2112 {
2113
2114 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2115 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2116 ) {
2117 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2118 float_raise( float_flag_invalid );
2119 }
2120 return 0;
2121 }
2122 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2123
2124 }
2125
2126 /*----------------------------------------------------------------------------
2127 | Returns 1 if the single-precision floating-point value `a' is less than
2128 | or equal to the corresponding value `b', and 0 otherwise. The comparison
2129 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2130 | Arithmetic.
2131 *----------------------------------------------------------------------------*/
2132
2133 flag float32_le( float32 a, float32 b )
2134 {
2135 flag aSign, bSign;
2136
2137 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2138 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2139 ) {
2140 float_raise( float_flag_invalid );
2141 return 0;
2142 }
2143 aSign = extractFloat32Sign( a );
2144 bSign = extractFloat32Sign( b );
2145 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2146 return ( a == b ) || ( aSign ^ ( a < b ) );
2147
2148 }
2149
2150 /*----------------------------------------------------------------------------
2151 | Returns 1 if the single-precision floating-point value `a' is less than
2152 | the corresponding value `b', and 0 otherwise. The comparison is performed
2153 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2154 *----------------------------------------------------------------------------*/
2155
2156 flag float32_lt( float32 a, float32 b )
2157 {
2158 flag aSign, bSign;
2159
2160 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2161 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2162 ) {
2163 float_raise( float_flag_invalid );
2164 return 0;
2165 }
2166 aSign = extractFloat32Sign( a );
2167 bSign = extractFloat32Sign( b );
2168 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2169 return ( a != b ) && ( aSign ^ ( a < b ) );
2170
2171 }
2172
2173 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2174 /*----------------------------------------------------------------------------
2175 | Returns 1 if the single-precision floating-point value `a' is equal to
2176 | the corresponding value `b', and 0 otherwise. The invalid exception is
2177 | raised if either operand is a NaN. Otherwise, the comparison is performed
2178 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2179 *----------------------------------------------------------------------------*/
2180
2181 flag float32_eq_signaling( float32 a, float32 b )
2182 {
2183
2184 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2185 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2186 ) {
2187 float_raise( float_flag_invalid );
2188 return 0;
2189 }
2190 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2191
2192 }
2193
2194 /*----------------------------------------------------------------------------
2195 | Returns 1 if the single-precision floating-point value `a' is less than or
2196 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2197 | cause an exception. Otherwise, the comparison is performed according to the
2198 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2199 *----------------------------------------------------------------------------*/
2200
2201 flag float32_le_quiet( float32 a, float32 b )
2202 {
2203 flag aSign, bSign;
2204
2205 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2206 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2207 ) {
2208 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2209 float_raise( float_flag_invalid );
2210 }
2211 return 0;
2212 }
2213 aSign = extractFloat32Sign( a );
2214 bSign = extractFloat32Sign( b );
2215 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2216 return ( a == b ) || ( aSign ^ ( a < b ) );
2217
2218 }
2219
2220 /*----------------------------------------------------------------------------
2221 | Returns 1 if the single-precision floating-point value `a' is less than
2222 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2223 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2224 | Standard for Binary Floating-Point Arithmetic.
2225 *----------------------------------------------------------------------------*/
2226
2227 flag float32_lt_quiet( float32 a, float32 b )
2228 {
2229 flag aSign, bSign;
2230
2231 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2232 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2233 ) {
2234 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2235 float_raise( float_flag_invalid );
2236 }
2237 return 0;
2238 }
2239 aSign = extractFloat32Sign( a );
2240 bSign = extractFloat32Sign( b );
2241 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2242 return ( a != b ) && ( aSign ^ ( a < b ) );
2243
2244 }
2245 #endif /* !SOFTFLOAT_FOR_GCC */
2246
2247 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2248 /*----------------------------------------------------------------------------
2249 | Returns the result of converting the double-precision floating-point value
2250 | `a' to the 32-bit two's complement integer format. The conversion is
2251 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2252 | Arithmetic---which means in particular that the conversion is rounded
2253 | according to the current rounding mode. If `a' is a NaN, the largest
2254 | positive integer is returned. Otherwise, if the conversion overflows, the
2255 | largest integer with the same sign as `a' is returned.
2256 *----------------------------------------------------------------------------*/
2257
2258 int32 float64_to_int32( float64 a )
2259 {
2260 flag aSign;
2261 int16 aExp, shiftCount;
2262 bits64 aSig;
2263
2264 aSig = extractFloat64Frac( a );
2265 aExp = extractFloat64Exp( a );
2266 aSign = extractFloat64Sign( a );
2267 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2268 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2269 shiftCount = 0x42C - aExp;
2270 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2271 return roundAndPackInt32( aSign, aSig );
2272
2273 }
2274 #endif /* !SOFTFLOAT_FOR_GCC */
2275
2276 /*----------------------------------------------------------------------------
2277 | Returns the result of converting the double-precision floating-point value
2278 | `a' to the 32-bit two's complement integer format. The conversion is
2279 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2280 | Arithmetic, except that the conversion is always rounded toward zero.
2281 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2282 | the conversion overflows, the largest integer with the same sign as `a' is
2283 | returned.
2284 *----------------------------------------------------------------------------*/
2285
2286 int32 float64_to_int32_round_to_zero( float64 a )
2287 {
2288 flag aSign;
2289 int16 aExp, shiftCount;
2290 bits64 aSig, savedASig;
2291 int32 z;
2292
2293 aSig = extractFloat64Frac( a );
2294 aExp = extractFloat64Exp( a );
2295 aSign = extractFloat64Sign( a );
2296 if ( 0x41E < aExp ) {
2297 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2298 goto invalid;
2299 }
2300 else if ( aExp < 0x3FF ) {
2301 if ( aExp || aSig ) float_set_inexact();
2302 return 0;
2303 }
2304 aSig |= LIT64( 0x0010000000000000 );
2305 shiftCount = 0x433 - aExp;
2306 savedASig = aSig;
2307 aSig >>= shiftCount;
2308 z = aSig;
2309 if ( aSign ) z = - z;
2310 if ( ( z < 0 ) ^ aSign ) {
2311 invalid:
2312 float_raise( float_flag_invalid );
2313 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2314 }
2315 if ( ( aSig<<shiftCount ) != savedASig ) {
2316 float_set_inexact();
2317 }
2318 return z;
2319
2320 }
2321
2322 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2323 /*----------------------------------------------------------------------------
2324 | Returns the result of converting the double-precision floating-point value
2325 | `a' to the 64-bit two's complement integer format. The conversion is
2326 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2327 | Arithmetic---which means in particular that the conversion is rounded
2328 | according to the current rounding mode. If `a' is a NaN, the largest
2329 | positive integer is returned. Otherwise, if the conversion overflows, the
2330 | largest integer with the same sign as `a' is returned.
2331 *----------------------------------------------------------------------------*/
2332
2333 int64 float64_to_int64( float64 a )
2334 {
2335 flag aSign;
2336 int16 aExp, shiftCount;
2337 bits64 aSig, aSigExtra;
2338
2339 aSig = extractFloat64Frac( a );
2340 aExp = extractFloat64Exp( a );
2341 aSign = extractFloat64Sign( a );
2342 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2343 shiftCount = 0x433 - aExp;
2344 if ( shiftCount <= 0 ) {
2345 if ( 0x43E < aExp ) {
2346 float_raise( float_flag_invalid );
2347 if ( ! aSign
2348 || ( ( aExp == 0x7FF )
2349 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2350 ) {
2351 return LIT64( 0x7FFFFFFFFFFFFFFF );
2352 }
2353 return (sbits64) LIT64( 0x8000000000000000 );
2354 }
2355 aSigExtra = 0;
2356 aSig <<= - shiftCount;
2357 }
2358 else {
2359 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2360 }
2361 return roundAndPackInt64( aSign, aSig, aSigExtra );
2362
2363 }
2364
2365 /* like above, but result is unsigned */
2366 uint64 float64_to_uint64( float64 a )
2367 {
2368 flag aSign;
2369 int16 aExp, shiftCount;
2370 bits64 aSig, aSigExtra;
2371
2372 aSig = extractFloat64Frac( a );
2373 aExp = extractFloat64Exp( a );
2374 aSign = extractFloat64Sign( a );
2375
2376 if (aSign) {
2377 return float64_to_int64(a);
2378 }
2379
2380 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2381 shiftCount = 0x433 - aExp;
2382 if ( shiftCount <= 0 ) {
2383 if ( 0x43E < aExp ) {
2384 float_raise( float_flag_invalid );
2385 if ( ! aSign
2386 || ( ( aExp == 0x7FF )
2387 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2388 ) {
2389 return LIT64( 0x7FFFFFFFFFFFFFFF );
2390 }
2391 return (sbits64) LIT64( 0x8000000000000000 );
2392 }
2393 aSigExtra = 0;
2394 aSig <<= - shiftCount;
2395 }
2396 else {
2397 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2398 }
2399 return roundAndPackUInt64( aSig, aSigExtra );
2400
2401 }
2402
2403 /*----------------------------------------------------------------------------
2404 | Returns the result of converting the double-precision floating-point value
2405 | `a' to the 64-bit two's complement integer format. The conversion is
2406 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2407 | Arithmetic, except that the conversion is always rounded toward zero.
2408 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2409 | the conversion overflows, the largest integer with the same sign as `a' is
2410 | returned.
2411 *----------------------------------------------------------------------------*/
2412
2413 int64 float64_to_int64_round_to_zero( float64 a )
2414 {
2415 flag aSign;
2416 int16 aExp, shiftCount;
2417 bits64 aSig;
2418 int64 z;
2419
2420 aSig = extractFloat64Frac( a );
2421 aExp = extractFloat64Exp( a );
2422 aSign = extractFloat64Sign( a );
2423 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2424 shiftCount = aExp - 0x433;
2425 if ( 0 <= shiftCount ) {
2426 if ( 0x43E <= aExp ) {
2427 if ( a != LIT64( 0xC3E0000000000000 ) ) {
2428 float_raise( float_flag_invalid );
2429 if ( ! aSign
2430 || ( ( aExp == 0x7FF )
2431 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2432 ) {
2433 return LIT64( 0x7FFFFFFFFFFFFFFF );
2434 }
2435 }
2436 return (sbits64) LIT64( 0x8000000000000000 );
2437 }
2438 z = aSig<<shiftCount;
2439 }
2440 else {
2441 if ( aExp < 0x3FE ) {
2442 if ( aExp | aSig ) float_set_inexact();
2443 return 0;
2444 }
2445 z = aSig>>( - shiftCount );
2446 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2447 float_set_inexact();
2448 }
2449 }
2450 if ( aSign ) z = - z;
2451 return z;
2452
2453 }
2454 #endif /* !SOFTFLOAT_FOR_GCC */
2455
2456 /*----------------------------------------------------------------------------
2457 | Returns the result of converting the double-precision floating-point value
2458 | `a' to the single-precision floating-point format. The conversion is
2459 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2460 | Arithmetic.
2461 *----------------------------------------------------------------------------*/
2462
2463 float32 float64_to_float32( float64 a )
2464 {
2465 flag aSign;
2466 int16 aExp;
2467 bits64 aSig;
2468 bits32 zSig;
2469
2470 aSig = extractFloat64Frac( a );
2471 aExp = extractFloat64Exp( a );
2472 aSign = extractFloat64Sign( a );
2473 if ( aExp == 0x7FF ) {
2474 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2475 return packFloat32( aSign, 0xFF, 0 );
2476 }
2477 shift64RightJamming( aSig, 22, &aSig );
2478 zSig = aSig;
2479 if ( aExp || zSig ) {
2480 zSig |= 0x40000000;
2481 aExp -= 0x381;
2482 }
2483 return roundAndPackFloat32( aSign, aExp, zSig );
2484
2485 }
2486
2487 #ifdef FLOATX80
2488
2489 /*----------------------------------------------------------------------------
2490 | Returns the result of converting the double-precision floating-point value
2491 | `a' to the extended double-precision floating-point format. The conversion
2492 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2493 | Arithmetic.
2494 *----------------------------------------------------------------------------*/
2495
2496 floatx80 float64_to_floatx80( float64 a )
2497 {
2498 flag aSign;
2499 int16 aExp;
2500 bits64 aSig;
2501
2502 aSig = extractFloat64Frac( a );
2503 aExp = extractFloat64Exp( a );
2504 aSign = extractFloat64Sign( a );
2505 if ( aExp == 0x7FF ) {
2506 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2507 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2508 }
2509 if ( aExp == 0 ) {
2510 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2511 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2512 }
2513 return
2514 packFloatx80(
2515 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2516
2517 }
2518
2519 #endif
2520
2521 #ifdef FLOAT128
2522
2523 /*----------------------------------------------------------------------------
2524 | Returns the result of converting the double-precision floating-point value
2525 | `a' to the quadruple-precision floating-point format. The conversion is
2526 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2527 | Arithmetic.
2528 *----------------------------------------------------------------------------*/
2529
2530 float128 float64_to_float128( float64 a )
2531 {
2532 flag aSign;
2533 int16 aExp;
2534 bits64 aSig, zSig0, zSig1;
2535
2536 aSig = extractFloat64Frac( a );
2537 aExp = extractFloat64Exp( a );
2538 aSign = extractFloat64Sign( a );
2539 if ( aExp == 0x7FF ) {
2540 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2541 return packFloat128( aSign, 0x7FFF, 0, 0 );
2542 }
2543 if ( aExp == 0 ) {
2544 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2545 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2546 --aExp;
2547 }
2548 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2549 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2550
2551 }
2552
2553 #endif
2554
2555 #ifndef SOFTFLOAT_FOR_GCC
2556 /*----------------------------------------------------------------------------
2557 | Rounds the double-precision floating-point value `a' to an integer, and
2558 | returns the result as a double-precision floating-point value. The
2559 | operation is performed according to the IEC/IEEE Standard for Binary
2560 | Floating-Point Arithmetic.
2561 *----------------------------------------------------------------------------*/
2562
2563 float64 float64_round_to_int( float64 a )
2564 {
2565 flag aSign;
2566 int16 aExp;
2567 bits64 lastBitMask, roundBitsMask;
2568 int8 roundingMode;
2569 float64 z;
2570
2571 aExp = extractFloat64Exp( a );
2572 if ( 0x433 <= aExp ) {
2573 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2574 return propagateFloat64NaN( a, a );
2575 }
2576 return a;
2577 }
2578 if ( aExp < 0x3FF ) {
2579 if ( (bits64) ( a<<1 ) == 0 ) return a;
2580 float_set_inexact();
2581 aSign = extractFloat64Sign( a );
2582 switch ( float_rounding_mode() ) {
2583 case float_round_nearest_even:
2584 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2585 return packFloat64( aSign, 0x3FF, 0 );
2586 }
2587 break;
2588 case float_round_down:
2589 return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2590 case float_round_up:
2591 return
2592 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2593 }
2594 return packFloat64( aSign, 0, 0 );
2595 }
2596 lastBitMask = 1;
2597 lastBitMask <<= 0x433 - aExp;
2598 roundBitsMask = lastBitMask - 1;
2599 z = a;
2600 roundingMode = float_rounding_mode();
2601 if ( roundingMode == float_round_nearest_even ) {
2602 z += lastBitMask>>1;
2603 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2604 }
2605 else if ( roundingMode != float_round_to_zero ) {
2606 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2607 z += roundBitsMask;
2608 }
2609 }
2610 z &= ~ roundBitsMask;
2611 if ( z != a ) float_set_inexact();
2612 return z;
2613
2614 }
2615 #endif
2616
2617 /*----------------------------------------------------------------------------
2618 | Returns the result of adding the absolute values of the double-precision
2619 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2620 | before being returned. `zSign' is ignored if the result is a NaN.
2621 | The addition is performed according to the IEC/IEEE Standard for Binary
2622 | Floating-Point Arithmetic.
2623 *----------------------------------------------------------------------------*/
2624
2625 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2626 {
2627 int16 aExp, bExp, zExp;
2628 bits64 aSig, bSig, zSig;
2629 int16 expDiff;
2630
2631 aSig = extractFloat64Frac( a );
2632 aExp = extractFloat64Exp( a );
2633 bSig = extractFloat64Frac( b );
2634 bExp = extractFloat64Exp( b );
2635 expDiff = aExp - bExp;
2636 aSig <<= 9;
2637 bSig <<= 9;
2638 if ( 0 < expDiff ) {
2639 if ( aExp == 0x7FF ) {
2640 if ( aSig ) return propagateFloat64NaN( a, b );
2641 return a;
2642 }
2643 if ( bExp == 0 ) {
2644 --expDiff;
2645 }
2646 else {
2647 bSig |= LIT64( 0x2000000000000000 );
2648 }
2649 shift64RightJamming( bSig, expDiff, &bSig );
2650 zExp = aExp;
2651 }
2652 else if ( expDiff < 0 ) {
2653 if ( bExp == 0x7FF ) {
2654 if ( bSig ) return propagateFloat64NaN( a, b );
2655 return packFloat64( zSign, 0x7FF, 0 );
2656 }
2657 if ( aExp == 0 ) {
2658 ++expDiff;
2659 }
2660 else {
2661 aSig |= LIT64( 0x2000000000000000 );
2662 }
2663 shift64RightJamming( aSig, - expDiff, &aSig );
2664 zExp = bExp;
2665 }
2666 else {
2667 if ( aExp == 0x7FF ) {
2668 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2669 return a;
2670 }
2671 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2672 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2673 zExp = aExp;
2674 goto roundAndPack;
2675 }
2676 aSig |= LIT64( 0x2000000000000000 );
2677 zSig = ( aSig + bSig )<<1;
2678 --zExp;
2679 if ( (sbits64) zSig < 0 ) {
2680 zSig = aSig + bSig;
2681 ++zExp;
2682 }
2683 roundAndPack:
2684 return roundAndPackFloat64( zSign, zExp, zSig );
2685
2686 }
2687
2688 /*----------------------------------------------------------------------------
2689 | Returns the result of subtracting the absolute values of the double-
2690 | precision floating-point values `a' and `b'. If `zSign' is 1, the
2691 | difference is negated before being returned. `zSign' is ignored if the
2692 | result is a NaN. The subtraction is performed according to the IEC/IEEE
2693 | Standard for Binary Floating-Point Arithmetic.
2694 *----------------------------------------------------------------------------*/
2695
2696 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2697 {
2698 int16 aExp, bExp, zExp;
2699 bits64 aSig, bSig, zSig;
2700 int16 expDiff;
2701
2702 aSig = extractFloat64Frac( a );
2703 aExp = extractFloat64Exp( a );
2704 bSig = extractFloat64Frac( b );
2705 bExp = extractFloat64Exp( b );
2706 expDiff = aExp - bExp;
2707 aSig <<= 10;
2708 bSig <<= 10;
2709 if ( 0 < expDiff ) goto aExpBigger;
2710 if ( expDiff < 0 ) goto bExpBigger;
2711 if ( aExp == 0x7FF ) {
2712 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2713 float_raise( float_flag_invalid );
2714 return float64_default_nan;
2715 }
2716 if ( aExp == 0 ) {
2717 aExp = 1;
2718 bExp = 1;
2719 }
2720 if ( bSig < aSig ) goto aBigger;
2721 if ( aSig < bSig ) goto bBigger;
2722 return packFloat64( float_rounding_mode() == float_round_down, 0, 0 );
2723 bExpBigger:
2724 if ( bExp == 0x7FF ) {
2725 if ( bSig ) return propagateFloat64NaN( a, b );
2726 return packFloat64( zSign ^ 1, 0x7FF, 0 );
2727 }
2728 if ( aExp == 0 ) {
2729 ++expDiff;
2730 }
2731 else {
2732 aSig |= LIT64( 0x4000000000000000 );
2733 }
2734 shift64RightJamming( aSig, - expDiff, &aSig );
2735 bSig |= LIT64( 0x4000000000000000 );
2736 bBigger:
2737 zSig = bSig - aSig;
2738 zExp = bExp;
2739 zSign ^= 1;
2740 goto normalizeRoundAndPack;
2741 aExpBigger:
2742 if ( aExp == 0x7FF ) {
2743 if ( aSig ) return propagateFloat64NaN( a, b );
2744 return a;
2745 }
2746 if ( bExp == 0 ) {
2747 --expDiff;
2748 }
2749 else {
2750 bSig |= LIT64( 0x4000000000000000 );
2751 }
2752 shift64RightJamming( bSig, expDiff, &bSig );
2753 aSig |= LIT64( 0x4000000000000000 );
2754 aBigger:
2755 zSig = aSig - bSig;
2756 zExp = aExp;
2757 normalizeRoundAndPack:
2758 --zExp;
2759 return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2760
2761 }
2762
2763 /*----------------------------------------------------------------------------
2764 | Returns the result of adding the double-precision floating-point values `a'
2765 | and `b'. The operation is performed according to the IEC/IEEE Standard for
2766 | Binary Floating-Point Arithmetic.
2767 *----------------------------------------------------------------------------*/
2768
2769 float64 float64_add( float64 a, float64 b )
2770 {
2771 flag aSign, bSign;
2772
2773 aSign = extractFloat64Sign( a );
2774 bSign = extractFloat64Sign( b );
2775 if ( aSign == bSign ) {
2776 return addFloat64Sigs( a, b, aSign );
2777 }
2778 else {
2779 return subFloat64Sigs( a, b, aSign );
2780 }
2781
2782 }
2783
2784 /*----------------------------------------------------------------------------
2785 | Returns the result of subtracting the double-precision floating-point values
2786 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2787 | for Binary Floating-Point Arithmetic.
2788 *----------------------------------------------------------------------------*/
2789
2790 float64 float64_sub( float64 a, float64 b )
2791 {
2792 flag aSign, bSign;
2793
2794 aSign = extractFloat64Sign( a );
2795 bSign = extractFloat64Sign( b );
2796 if ( aSign == bSign ) {
2797 return subFloat64Sigs( a, b, aSign );
2798 }
2799 else {
2800 return addFloat64Sigs( a, b, aSign );
2801 }
2802
2803 }
2804
2805 /*----------------------------------------------------------------------------
2806 | Returns the result of multiplying the double-precision floating-point values
2807 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2808 | for Binary Floating-Point Arithmetic.
2809 *----------------------------------------------------------------------------*/
2810
2811 float64 float64_mul( float64 a, float64 b )
2812 {
2813 flag aSign, bSign, zSign;
2814 int16 aExp, bExp, zExp;
2815 bits64 aSig, bSig, zSig0, zSig1;
2816
2817 aSig = extractFloat64Frac( a );
2818 aExp = extractFloat64Exp( a );
2819 aSign = extractFloat64Sign( a );
2820 bSig = extractFloat64Frac( b );
2821 bExp = extractFloat64Exp( b );
2822 bSign = extractFloat64Sign( b );
2823 zSign = aSign ^ bSign;
2824 if ( aExp == 0x7FF ) {
2825 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2826 return propagateFloat64NaN( a, b );
2827 }
2828 if ( ( bExp | bSig ) == 0 ) {
2829 float_raise( float_flag_invalid );
2830 return float64_default_nan;
2831 }
2832 return packFloat64( zSign, 0x7FF, 0 );
2833 }
2834 if ( bExp == 0x7FF ) {
2835 if ( bSig ) return propagateFloat64NaN( a, b );
2836 if ( ( aExp | aSig ) == 0 ) {
2837 float_raise( float_flag_invalid );
2838 return float64_default_nan;
2839 }
2840 return packFloat64( zSign, 0x7FF, 0 );
2841 }
2842 if ( aExp == 0 ) {
2843 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2844 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2845 }
2846 if ( bExp == 0 ) {
2847 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2848 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2849 }
2850 zExp = aExp + bExp - 0x3FF;
2851 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2852 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2853 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2854 zSig0 |= ( zSig1 != 0 );
2855 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2856 zSig0 <<= 1;
2857 --zExp;
2858 }
2859 return roundAndPackFloat64( zSign, zExp, zSig0 );
2860
2861 }
2862
2863 /*----------------------------------------------------------------------------
2864 | Returns the result of dividing the double-precision floating-point value `a'
2865 | by the corresponding value `b'. The operation is performed according to
2866 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2867 *----------------------------------------------------------------------------*/
2868
2869 float64 float64_div( float64 a, float64 b )
2870 {
2871 flag aSign, bSign, zSign;
2872 int16 aExp, bExp, zExp;
2873 bits64 aSig, bSig, zSig;
2874 bits64 rem0, rem1;
2875 bits64 term0, term1;
2876
2877 aSig = extractFloat64Frac( a );
2878 aExp = extractFloat64Exp( a );
2879 aSign = extractFloat64Sign( a );
2880 bSig = extractFloat64Frac( b );
2881 bExp = extractFloat64Exp( b );
2882 bSign = extractFloat64Sign( b );
2883 zSign = aSign ^ bSign;
2884 if ( aExp == 0x7FF ) {
2885 if ( aSig ) return propagateFloat64NaN( a, b );
2886 if ( bExp == 0x7FF ) {
2887 if ( bSig ) return propagateFloat64NaN( a, b );
2888 float_raise( float_flag_invalid );
2889 return float64_default_nan;
2890 }
2891 return packFloat64( zSign, 0x7FF, 0 );
2892 }
2893 if ( bExp == 0x7FF ) {
2894 if ( bSig ) return propagateFloat64NaN( a, b );
2895 return packFloat64( zSign, 0, 0 );
2896 }
2897 if ( bExp == 0 ) {
2898 if ( bSig == 0 ) {
2899 if ( ( aExp | aSig ) == 0 ) {
2900 float_raise( float_flag_invalid );
2901 return float64_default_nan;
2902 }
2903 float_raise( float_flag_divbyzero );
2904 return packFloat64( zSign, 0x7FF, 0 );
2905 }
2906 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2907 }
2908 if ( aExp == 0 ) {
2909 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2910 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2911 }
2912 zExp = aExp - bExp + 0x3FD;
2913 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2914 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2915 if ( bSig <= ( aSig + aSig ) ) {
2916 aSig >>= 1;
2917 ++zExp;
2918 }
2919 zSig = estimateDiv128To64( aSig, 0, bSig );
2920 if ( ( zSig & 0x1FF ) <= 2 ) {
2921 mul64To128( bSig, zSig, &term0, &term1 );
2922 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2923 while ( (sbits64) rem0 < 0 ) {
2924 --zSig;
2925 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2926 }
2927 zSig |= ( rem1 != 0 );
2928 }
2929 return roundAndPackFloat64( zSign, zExp, zSig );
2930
2931 }
2932
2933 #ifndef SOFTFLOAT_FOR_GCC
2934 /*----------------------------------------------------------------------------
2935 | Returns the remainder of the double-precision floating-point value `a'
2936 | with respect to the corresponding value `b'. The operation is performed
2937 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2938 *----------------------------------------------------------------------------*/
2939
2940 float64 float64_rem( float64 a, float64 b )
2941 {
2942 flag aSign, bSign __unused, zSign;
2943 int16 aExp, bExp, expDiff;
2944 bits64 aSig, bSig;
2945 bits64 q, alternateASig;
2946 sbits64 sigMean;
2947
2948 aSig = extractFloat64Frac( a );
2949 aExp = extractFloat64Exp( a );
2950 aSign = extractFloat64Sign( a );
2951 bSig = extractFloat64Frac( b );
2952 bExp = extractFloat64Exp( b );
2953 bSign = extractFloat64Sign( b );
2954 if ( aExp == 0x7FF ) {
2955 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2956 return propagateFloat64NaN( a, b );
2957 }
2958 float_raise( float_flag_invalid );
2959 return float64_default_nan;
2960 }
2961 if ( bExp == 0x7FF ) {
2962 if ( bSig ) return propagateFloat64NaN( a, b );
2963 return a;
2964 }
2965 if ( bExp == 0 ) {
2966 if ( bSig == 0 ) {
2967 float_raise( float_flag_invalid );
2968 return float64_default_nan;
2969 }
2970 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2971 }
2972 if ( aExp == 0 ) {
2973 if ( aSig == 0 ) return a;
2974 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2975 }
2976 expDiff = aExp - bExp;
2977 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2978 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2979 if ( expDiff < 0 ) {
2980 if ( expDiff < -1 ) return a;
2981 aSig >>= 1;
2982 }
2983 q = ( bSig <= aSig );
2984 if ( q ) aSig -= bSig;
2985 expDiff -= 64;
2986 while ( 0 < expDiff ) {
2987 q = estimateDiv128To64( aSig, 0, bSig );
2988 q = ( 2 < q ) ? q - 2 : 0;
2989 aSig = - ( ( bSig>>2 ) * q );
2990 expDiff -= 62;
2991 }
2992 expDiff += 64;
2993 if ( 0 < expDiff ) {
2994 q = estimateDiv128To64( aSig, 0, bSig );
2995 q = ( 2 < q ) ? q - 2 : 0;
2996 q >>= 64 - expDiff;
2997 bSig >>= 2;
2998 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2999 }
3000 else {
3001 aSig >>= 2;
3002 bSig >>= 2;
3003 }
3004 do {
3005 alternateASig = aSig;
3006 ++q;
3007 aSig -= bSig;
3008 } while ( 0 <= (sbits64) aSig );
3009 sigMean = aSig + alternateASig;
3010 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3011 aSig = alternateASig;
3012 }
3013 zSign = ( (sbits64) aSig < 0 );
3014 if ( zSign ) aSig = - aSig;
3015 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3016
3017 }
3018
3019 /*----------------------------------------------------------------------------
3020 | Returns the square root of the double-precision floating-point value `a'.
3021 | The operation is performed according to the IEC/IEEE Standard for Binary
3022 | Floating-Point Arithmetic.
3023 *----------------------------------------------------------------------------*/
3024
3025 float64 float64_sqrt( float64 a )
3026 {
3027 flag aSign;
3028 int16 aExp, zExp;
3029 bits64 aSig, zSig, doubleZSig;
3030 bits64 rem0, rem1, term0, term1;
3031
3032 aSig = extractFloat64Frac( a );
3033 aExp = extractFloat64Exp( a );
3034 aSign = extractFloat64Sign( a );
3035 if ( aExp == 0x7FF ) {
3036 if ( aSig ) return propagateFloat64NaN( a, a );
3037 if ( ! aSign ) return a;
3038 float_raise( float_flag_invalid );
3039 return float64_default_nan;
3040 }
3041 if ( aSign ) {
3042 if ( ( aExp | aSig ) == 0 ) return a;
3043 float_raise( float_flag_invalid );
3044 return float64_default_nan;
3045 }
3046 if ( aExp == 0 ) {
3047 if ( aSig == 0 ) return 0;
3048 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3049 }
3050 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3051 aSig |= LIT64( 0x0010000000000000 );
3052 zSig = estimateSqrt32( aExp, aSig>>21 );
3053 aSig <<= 9 - ( aExp & 1 );
3054 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3055 if ( ( zSig & 0x1FF ) <= 5 ) {
3056 doubleZSig = zSig<<1;
3057 mul64To128( zSig, zSig, &term0, &term1 );
3058 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3059 while ( (sbits64) rem0 < 0 ) {
3060 --zSig;
3061 doubleZSig -= 2;
3062 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3063 }
3064 zSig |= ( ( rem0 | rem1 ) != 0 );
3065 }
3066 return roundAndPackFloat64( 0, zExp, zSig );
3067
3068 }
3069 #endif
3070
3071 /*----------------------------------------------------------------------------
3072 | Returns 1 if the double-precision floating-point value `a' is equal to the
3073 | corresponding value `b', and 0 otherwise. The comparison is performed
3074 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3075 *----------------------------------------------------------------------------*/
3076
3077 flag float64_eq( float64 a, float64 b )
3078 {
3079
3080 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3081 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3082 ) {
3083 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3084 float_raise( float_flag_invalid );
3085 }
3086 return 0;
3087 }
3088 return ( a == b ) ||
3089 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3090
3091 }
3092
3093 /*----------------------------------------------------------------------------
3094 | Returns 1 if the double-precision floating-point value `a' is less than or
3095 | equal to the corresponding value `b', and 0 otherwise. The comparison is
3096 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3097 | Arithmetic.
3098 *----------------------------------------------------------------------------*/
3099
3100 flag float64_le( float64 a, float64 b )
3101 {
3102 flag aSign, bSign;
3103
3104 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3105 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3106 ) {
3107 float_raise( float_flag_invalid );
3108 return 0;
3109 }
3110 aSign = extractFloat64Sign( a );
3111 bSign = extractFloat64Sign( b );
3112 if ( aSign != bSign )
3113 return aSign ||
3114 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3115 0 );
3116 return ( a == b ) ||
3117 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3118
3119 }
3120
3121 /*----------------------------------------------------------------------------
3122 | Returns 1 if the double-precision floating-point value `a' is less than
3123 | the corresponding value `b', and 0 otherwise. The comparison is performed
3124 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3125 *----------------------------------------------------------------------------*/
3126
3127 flag float64_lt( float64 a, float64 b )
3128 {
3129 flag aSign, bSign;
3130
3131 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3132 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3133 ) {
3134 float_raise( float_flag_invalid );
3135 return 0;
3136 }
3137 aSign = extractFloat64Sign( a );
3138 bSign = extractFloat64Sign( b );
3139 if ( aSign != bSign )
3140 return aSign &&
3141 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3142 0 );
3143 return ( a != b ) &&
3144 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3145
3146 }
3147
3148 #ifndef SOFTFLOAT_FOR_GCC
3149 /*----------------------------------------------------------------------------
3150 | Returns 1 if the double-precision floating-point value `a' is equal to the
3151 | corresponding value `b', and 0 otherwise. The invalid exception is raised
3152 | if either operand is a NaN. Otherwise, the comparison is performed
3153 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3154 *----------------------------------------------------------------------------*/
3155
3156 flag float64_eq_signaling( float64 a, float64 b )
3157 {
3158
3159 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3160 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3161 ) {
3162 float_raise( float_flag_invalid );
3163 return 0;
3164 }
3165 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3166
3167 }
3168
3169 /*----------------------------------------------------------------------------
3170 | Returns 1 if the double-precision floating-point value `a' is less than or
3171 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3172 | cause an exception. Otherwise, the comparison is performed according to the
3173 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3174 *----------------------------------------------------------------------------*/
3175
3176 flag float64_le_quiet( float64 a, float64 b )
3177 {
3178 flag aSign, bSign;
3179
3180 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3181 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3182 ) {
3183 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3184 float_raise( float_flag_invalid );
3185 }
3186 return 0;
3187 }
3188 aSign = extractFloat64Sign( a );
3189 bSign = extractFloat64Sign( b );
3190 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3191 return ( a == b ) || ( aSign ^ ( a < b ) );
3192
3193 }
3194
3195 /*----------------------------------------------------------------------------
3196 | Returns 1 if the double-precision floating-point value `a' is less than
3197 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3198 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3199 | Standard for Binary Floating-Point Arithmetic.
3200 *----------------------------------------------------------------------------*/
3201
3202 flag float64_lt_quiet( float64 a, float64 b )
3203 {
3204 flag aSign, bSign;
3205
3206 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3207 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3208 ) {
3209 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3210 float_raise( float_flag_invalid );
3211 }
3212 return 0;
3213 }
3214 aSign = extractFloat64Sign( a );
3215 bSign = extractFloat64Sign( b );
3216 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3217 return ( a != b ) && ( aSign ^ ( a < b ) );
3218
3219 }
3220 #endif
3221
3222 #ifdef FLOATX80
3223
3224 /*----------------------------------------------------------------------------
3225 | Returns the result of converting the extended double-precision floating-
3226 | point value `a' to the 32-bit two's complement integer format. The
3227 | conversion is performed according to the IEC/IEEE Standard for Binary
3228 | Floating-Point Arithmetic---which means in particular that the conversion
3229 | is rounded according to the current rounding mode. If `a' is a NaN, the
3230 | largest positive integer is returned. Otherwise, if the conversion
3231 | overflows, the largest integer with the same sign as `a' is returned.
3232 *----------------------------------------------------------------------------*/
3233
3234 int32 floatx80_to_int32( floatx80 a )
3235 {
3236 flag aSign;
3237 int32 aExp, shiftCount;
3238 bits64 aSig;
3239
3240 aSig = extractFloatx80Frac( a );
3241 aExp = extractFloatx80Exp( a );
3242 aSign = extractFloatx80Sign( a );
3243 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3244 shiftCount = 0x4037 - aExp;
3245 if ( shiftCount <= 0 ) shiftCount = 1;
3246 shift64RightJamming( aSig, shiftCount, &aSig );
3247 return roundAndPackInt32( aSign, aSig );
3248
3249 }
3250
3251 /*----------------------------------------------------------------------------
3252 | Returns the result of converting the extended double-precision floating-
3253 | point value `a' to the 32-bit two's complement integer format. The
3254 | conversion is performed according to the IEC/IEEE Standard for Binary
3255 | Floating-Point Arithmetic, except that the conversion is always rounded
3256 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3257 | Otherwise, if the conversion overflows, the largest integer with the same
3258 | sign as `a' is returned.
3259 *----------------------------------------------------------------------------*/
3260
3261 int32 floatx80_to_int32_round_to_zero( floatx80 a )
3262 {
3263 flag aSign;
3264 int32 aExp, shiftCount;
3265 bits64 aSig, savedASig;
3266 int32 z;
3267
3268 aSig = extractFloatx80Frac( a );
3269 aExp = extractFloatx80Exp( a );
3270 aSign = extractFloatx80Sign( a );
3271 if ( 0x401E < aExp ) {
3272 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3273 goto invalid;
3274 }
3275 else if ( aExp < 0x3FFF ) {
3276 if ( aExp || aSig ) float_set_inexact();
3277 return 0;
3278 }
3279 shiftCount = 0x403E - aExp;
3280 savedASig = aSig;
3281 aSig >>= shiftCount;
3282 z = aSig;
3283 if ( aSign ) z = - z;
3284 if ( ( z < 0 ) ^ aSign ) {
3285 invalid:
3286 float_raise( float_flag_invalid );
3287 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3288 }
3289 if ( ( aSig<<shiftCount ) != savedASig ) {
3290 float_set_inexact();
3291 }
3292 return z;
3293
3294 }
3295
3296 /*----------------------------------------------------------------------------
3297 | Returns the result of converting the extended double-precision floating-
3298 | point value `a' to the 64-bit two's complement integer format. The
3299 | conversion is performed according to the IEC/IEEE Standard for Binary
3300 | Floating-Point Arithmetic---which means in particular that the conversion
3301 | is rounded according to the current rounding mode. If `a' is a NaN,
3302 | the largest positive integer is returned. Otherwise, if the conversion
3303 | overflows, the largest integer with the same sign as `a' is returned.
3304 *----------------------------------------------------------------------------*/
3305
3306 int64 floatx80_to_int64( floatx80 a )
3307 {
3308 flag aSign;
3309 int32 aExp, shiftCount;
3310 bits64 aSig, aSigExtra;
3311
3312 aSig = extractFloatx80Frac( a );
3313 aExp = extractFloatx80Exp( a );
3314 aSign = extractFloatx80Sign( a );
3315 shiftCount = 0x403E - aExp;
3316 if ( shiftCount <= 0 ) {
3317 if ( shiftCount ) {
3318 float_raise( float_flag_invalid );
3319 if ( ! aSign
3320 || ( ( aExp == 0x7FFF )
3321 && ( aSig != LIT64( 0x8000000000000000 ) ) )
3322 ) {
3323 return LIT64( 0x7FFFFFFFFFFFFFFF );
3324 }
3325 return (sbits64) LIT64( 0x8000000000000000 );
3326 }
3327 aSigExtra = 0;
3328 }
3329 else {
3330 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3331 }
3332 return roundAndPackInt64( aSign, aSig, aSigExtra );
3333
3334 }
3335
3336 /*----------------------------------------------------------------------------
3337 | Returns the result of converting the extended double-precision floating-
3338 | point value `a' to the 64-bit two's complement integer format. The
3339 | conversion is performed according to the IEC/IEEE Standard for Binary
3340 | Floating-Point Arithmetic, except that the conversion is always rounded
3341 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3342 | Otherwise, if the conversion overflows, the largest integer with the same
3343 | sign as `a' is returned.
3344 *----------------------------------------------------------------------------*/
3345
3346 int64 floatx80_to_int64_round_to_zero( floatx80 a )
3347 {
3348 flag aSign;
3349 int32 aExp, shiftCount;
3350 bits64 aSig;
3351 int64 z;
3352
3353 aSig = extractFloatx80Frac( a );
3354 aExp = extractFloatx80Exp( a );
3355 aSign = extractFloatx80Sign( a );
3356 shiftCount = aExp - 0x403E;
3357 if ( 0 <= shiftCount ) {
3358 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3359 if ( ( a.high != 0xC03E ) || aSig ) {
3360 float_raise( float_flag_invalid );
3361 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3362 return LIT64( 0x7FFFFFFFFFFFFFFF );
3363 }
3364 }
3365 return (sbits64) LIT64( 0x8000000000000000 );
3366 }
3367 else if ( aExp < 0x3FFF ) {
3368 if ( aExp | aSig ) float_set_inexact();
3369 return 0;
3370 }
3371 z = aSig>>( - shiftCount );
3372 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3373 float_set_inexact();
3374 }
3375 if ( aSign ) z = - z;
3376 return z;
3377
3378 }
3379
3380 /*----------------------------------------------------------------------------
3381 | Returns the result of converting the extended double-precision floating-
3382 | point value `a' to the single-precision floating-point format. The
3383 | conversion is performed according to the IEC/IEEE Standard for Binary
3384 | Floating-Point Arithmetic.
3385 *----------------------------------------------------------------------------*/
3386
3387 float32 floatx80_to_float32( floatx80 a )
3388 {
3389 flag aSign;
3390 int32 aExp;
3391 bits64 aSig;
3392
3393 aSig = extractFloatx80Frac( a );
3394 aExp = extractFloatx80Exp( a );
3395 aSign = extractFloatx80Sign( a );
3396 if ( aExp == 0x7FFF ) {
3397 if ( (bits64) ( aSig<<1 ) ) {
3398 return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3399 }
3400 return packFloat32( aSign, 0xFF, 0 );
3401 }
3402 shift64RightJamming( aSig, 33, &aSig );
3403 if ( aExp || aSig ) aExp -= 0x3F81;
3404 return roundAndPackFloat32( aSign, aExp, aSig );
3405
3406 }
3407
3408 /*----------------------------------------------------------------------------
3409 | Returns the result of converting the extended double-precision floating-
3410 | point value `a' to the double-precision floating-point format. The
3411 | conversion is performed according to the IEC/IEEE Standard for Binary
3412 | Floating-Point Arithmetic.
3413 *----------------------------------------------------------------------------*/
3414
3415 float64 floatx80_to_float64( floatx80 a )
3416 {
3417 flag aSign;
3418 int32 aExp;
3419 bits64 aSig, zSig;
3420
3421 aSig = extractFloatx80Frac( a );
3422 aExp = extractFloatx80Exp( a );
3423 aSign = extractFloatx80Sign( a );
3424 if ( aExp == 0x7FFF ) {
3425 if ( (bits64) ( aSig<<1 ) ) {
3426 return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3427 }
3428 return packFloat64( aSign, 0x7FF, 0 );
3429 }
3430 shift64RightJamming( aSig, 1, &zSig );
3431 if ( aExp || aSig ) aExp -= 0x3C01;
3432 return roundAndPackFloat64( aSign, aExp, zSig );
3433
3434 }
3435
3436 #ifdef FLOAT128
3437
3438 /*----------------------------------------------------------------------------
3439 | Returns the result of converting the extended double-precision floating-
3440 | point value `a' to the quadruple-precision floating-point format. The
3441 | conversion is performed according to the IEC/IEEE Standard for Binary
3442 | Floating-Point Arithmetic.
3443 *----------------------------------------------------------------------------*/
3444
3445 float128 floatx80_to_float128( floatx80 a )
3446 {
3447 flag aSign;
3448 int16 aExp;
3449 bits64 aSig, zSig0, zSig1;
3450
3451 aSig = extractFloatx80Frac( a );
3452 aExp = extractFloatx80Exp( a );
3453 aSign = extractFloatx80Sign( a );
3454 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3455 return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3456 }
3457 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3458 return packFloat128( aSign, aExp, zSig0, zSig1 );
3459
3460 }
3461
3462 #endif
3463
3464 /*----------------------------------------------------------------------------
3465 | Rounds the extended double-precision floating-point value `a' to an integer,
3466 | and returns the result as an extended quadruple-precision floating-point
3467 | value. The operation is performed according to the IEC/IEEE Standard for
3468 | Binary Floating-Point Arithmetic.
3469 *----------------------------------------------------------------------------*/
3470
3471 floatx80 floatx80_round_to_int( floatx80 a )
3472 {
3473 flag aSign;
3474 int32 aExp;
3475 bits64 lastBitMask, roundBitsMask;
3476 int8 roundingMode;
3477 floatx80 z;
3478
3479 aExp = extractFloatx80Exp( a );
3480 if ( 0x403E <= aExp ) {
3481 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3482 return propagateFloatx80NaN( a, a );
3483 }
3484 return a;
3485 }
3486 if ( aExp < 0x3FFF ) {
3487 if ( ( aExp == 0 )
3488 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3489 return a;
3490 }
3491 float_set_inexact();
3492 aSign = extractFloatx80Sign( a );
3493 switch ( float_rounding_mode() ) {
3494 case float_round_nearest_even:
3495 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3496 ) {
3497 return
3498 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3499 }
3500 break;
3501 case float_round_down:
3502 return
3503 aSign ?
3504 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3505 : packFloatx80( 0, 0, 0 );
3506 case float_round_up:
3507 return
3508 aSign ? packFloatx80( 1, 0, 0 )
3509 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3510 }
3511 return packFloatx80( aSign, 0, 0 );
3512 }
3513 lastBitMask = 1;
3514 lastBitMask <<= 0x403E - aExp;
3515 roundBitsMask = lastBitMask - 1;
3516 z = a;
3517 roundingMode = float_rounding_mode();
3518 if ( roundingMode == float_round_nearest_even ) {
3519 z.low += lastBitMask>>1;
3520 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3521 }
3522 else if ( roundingMode != float_round_to_zero ) {
3523 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3524 z.low += roundBitsMask;
3525 }
3526 }
3527 z.low &= ~ roundBitsMask;
3528 if ( z.low == 0 ) {
3529 ++z.high;
3530 z.low = LIT64( 0x8000000000000000 );
3531 }
3532 if ( z.low != a.low ) float_set_inexact();
3533 return z;
3534
3535 }
3536
3537 /*----------------------------------------------------------------------------
3538 | Returns the result of adding the absolute values of the extended double-
3539 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3540 | negated before being returned. `zSign' is ignored if the result is a NaN.
3541 | The addition is performed according to the IEC/IEEE Standard for Binary
3542 | Floating-Point Arithmetic.
3543 *----------------------------------------------------------------------------*/
3544
3545 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3546 {
3547 int32 aExp, bExp, zExp;
3548 bits64 aSig, bSig, zSig0, zSig1;
3549 int32 expDiff;
3550
3551 aSig = extractFloatx80Frac( a );
3552 aExp = extractFloatx80Exp( a );
3553 bSig = extractFloatx80Frac( b );
3554 bExp = extractFloatx80Exp( b );
3555 expDiff = aExp - bExp;
3556 if ( 0 < expDiff ) {
3557 if ( aExp == 0x7FFF ) {
3558 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3559 return a;
3560 }
3561 if ( bExp == 0 ) --expDiff;
3562 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3563 zExp = aExp;
3564 }
3565 else if ( expDiff < 0 ) {
3566 if ( bExp == 0x7FFF ) {
3567 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3568 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3569 }
3570 if ( aExp == 0 ) ++expDiff;
3571 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3572 zExp = bExp;
3573 }
3574 else {
3575 if ( aExp == 0x7FFF ) {
3576 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3577 return propagateFloatx80NaN( a, b );
3578 }
3579 return a;
3580 }
3581 zSig1 = 0;
3582 zSig0 = aSig + bSig;
3583 if ( aExp == 0 ) {
3584 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3585 goto roundAndPack;
3586 }
3587 zExp = aExp;
3588 goto shiftRight1;
3589 }
3590 zSig0 = aSig + bSig;
3591 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3592 shiftRight1:
3593 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3594 zSig0 |= LIT64( 0x8000000000000000 );
3595 ++zExp;
3596 roundAndPack:
3597 return
3598 roundAndPackFloatx80(
3599 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3600
3601 }
3602
3603 /*----------------------------------------------------------------------------
3604 | Returns the result of subtracting the absolute values of the extended
3605 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3606 | difference is negated before being returned. `zSign' is ignored if the
3607 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3608 | Standard for Binary Floating-Point Arithmetic.
3609 *----------------------------------------------------------------------------*/
3610
3611 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3612 {
3613 int32 aExp, bExp, zExp;
3614 bits64 aSig, bSig, zSig0, zSig1;
3615 int32 expDiff;
3616 floatx80 z;
3617
3618 aSig = extractFloatx80Frac( a );
3619 aExp = extractFloatx80Exp( a );
3620 bSig = extractFloatx80Frac( b );
3621 bExp = extractFloatx80Exp( b );
3622 expDiff = aExp - bExp;
3623 if ( 0 < expDiff ) goto aExpBigger;
3624 if ( expDiff < 0 ) goto bExpBigger;
3625 if ( aExp == 0x7FFF ) {
3626 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3627 return propagateFloatx80NaN( a, b );
3628 }
3629 float_raise( float_flag_invalid );
3630 z.low = floatx80_default_nan_low;
3631 z.high = floatx80_default_nan_high;
3632 return z;
3633 }
3634 if ( aExp == 0 ) {
3635 aExp = 1;
3636 bExp = 1;
3637 }
3638 zSig1 = 0;
3639 if ( bSig < aSig ) goto aBigger;
3640 if ( aSig < bSig ) goto bBigger;
3641 return packFloatx80( float_rounding_mode() == float_round_down, 0, 0 );
3642 bExpBigger:
3643 if ( bExp == 0x7FFF ) {
3644 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3645 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3646 }
3647 if ( aExp == 0 ) ++expDiff;
3648 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3649 bBigger:
3650 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3651 zExp = bExp;
3652 zSign ^= 1;
3653 goto normalizeRoundAndPack;
3654 aExpBigger:
3655 if ( aExp == 0x7FFF ) {
3656 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3657 return a;
3658 }
3659 if ( bExp == 0 ) --expDiff;
3660 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3661 aBigger:
3662 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3663 zExp = aExp;
3664 normalizeRoundAndPack:
3665 return
3666 normalizeRoundAndPackFloatx80(
3667 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3668
3669 }
3670
3671 /*----------------------------------------------------------------------------
3672 | Returns the result of adding the extended double-precision floating-point
3673 | values `a' and `b'. The operation is performed according to the IEC/IEEE
3674 | Standard for Binary Floating-Point Arithmetic.
3675 *----------------------------------------------------------------------------*/
3676
3677 floatx80 floatx80_add( floatx80 a, floatx80 b )
3678 {
3679 flag aSign, bSign;
3680
3681 aSign = extractFloatx80Sign( a );
3682 bSign = extractFloatx80Sign( b );
3683 if ( aSign == bSign ) {
3684 return addFloatx80Sigs( a, b, aSign );
3685 }
3686 else {
3687 return subFloatx80Sigs( a, b, aSign );
3688 }
3689
3690 }
3691
3692 /*----------------------------------------------------------------------------
3693 | Returns the result of subtracting the extended double-precision floating-
3694 | point values `a' and `b'. The operation is performed according to the
3695 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3696 *----------------------------------------------------------------------------*/
3697
3698 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3699 {
3700 flag aSign, bSign;
3701
3702 aSign = extractFloatx80Sign( a );
3703 bSign = extractFloatx80Sign( b );
3704 if ( aSign == bSign ) {
3705 return subFloatx80Sigs( a, b, aSign );
3706 }
3707 else {
3708 return addFloatx80Sigs( a, b, aSign );
3709 }
3710
3711 }
3712
3713 /*----------------------------------------------------------------------------
3714 | Returns the result of multiplying the extended double-precision floating-
3715 | point values `a' and `b'. The operation is performed according to the
3716 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3717 *----------------------------------------------------------------------------*/
3718
3719 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3720 {
3721 flag aSign, bSign, zSign;
3722 int32 aExp, bExp, zExp;
3723 bits64 aSig, bSig, zSig0, zSig1;
3724 floatx80 z;
3725
3726 aSig = extractFloatx80Frac( a );
3727 aExp = extractFloatx80Exp( a );
3728 aSign = extractFloatx80Sign( a );
3729 bSig = extractFloatx80Frac( b );
3730 bExp = extractFloatx80Exp( b );
3731 bSign = extractFloatx80Sign( b );
3732 zSign = aSign ^ bSign;
3733 if ( aExp == 0x7FFF ) {
3734 if ( (bits64) ( aSig<<1 )
3735 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3736 return propagateFloatx80NaN( a, b );
3737 }
3738 if ( ( bExp | bSig ) == 0 ) goto invalid;
3739 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3740 }
3741 if ( bExp == 0x7FFF ) {
3742 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3743 if ( ( aExp | aSig ) == 0 ) {
3744 invalid:
3745 float_raise( float_flag_invalid );
3746 z.low = floatx80_default_nan_low;
3747 z.high = floatx80_default_nan_high;
3748 return z;
3749 }
3750 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3751 }
3752 if ( aExp == 0 ) {
3753 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3754 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3755 }
3756 if ( bExp == 0 ) {
3757 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3758 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3759 }
3760 zExp = aExp + bExp - 0x3FFE;
3761 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3762 if ( 0 < (sbits64) zSig0 ) {
3763 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3764 --zExp;
3765 }
3766 return
3767 roundAndPackFloatx80(
3768 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3769
3770 }
3771
3772 /*----------------------------------------------------------------------------
3773 | Returns the result of dividing the extended double-precision floating-point
3774 | value `a' by the corresponding value `b'. The operation is performed
3775 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3776 *----------------------------------------------------------------------------*/
3777
3778 floatx80 floatx80_div( floatx80 a, floatx80 b )
3779 {
3780 flag aSign, bSign, zSign;
3781 int32 aExp, bExp, zExp;
3782 bits64 aSig, bSig, zSig0, zSig1;
3783 bits64 rem0, rem1, rem2, term0, term1, term2;
3784 floatx80 z;
3785
3786 aSig = extractFloatx80Frac( a );
3787 aExp = extractFloatx80Exp( a );
3788 aSign = extractFloatx80Sign( a );
3789 bSig = extractFloatx80Frac( b );
3790 bExp = extractFloatx80Exp( b );
3791 bSign = extractFloatx80Sign( b );
3792 zSign = aSign ^ bSign;
3793 if ( aExp == 0x7FFF ) {
3794 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3795 if ( bExp == 0x7FFF ) {
3796 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3797 goto invalid;
3798 }
3799 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3800 }
3801 if ( bExp == 0x7FFF ) {
3802 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3803 return packFloatx80( zSign, 0, 0 );
3804 }
3805 if ( bExp == 0 ) {
3806 if ( bSig == 0 ) {
3807 if ( ( aExp | aSig ) == 0 ) {
3808 invalid:
3809 float_raise( float_flag_invalid );
3810 z.low = floatx80_default_nan_low;
3811 z.high = floatx80_default_nan_high;
3812 return z;
3813 }
3814 float_raise( float_flag_divbyzero );
3815 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3816 }
3817 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3818 }
3819 if ( aExp == 0 ) {
3820 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3821 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3822 }
3823 zExp = aExp - bExp + 0x3FFE;
3824 rem1 = 0;
3825 if ( bSig <= aSig ) {
3826 shift128Right( aSig, 0, 1, &aSig, &rem1 );
3827 ++zExp;
3828 }
3829 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3830 mul64To128( bSig, zSig0, &term0, &term1 );
3831 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3832 while ( (sbits64) rem0 < 0 ) {
3833 --zSig0;
3834 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3835 }
3836 zSig1 = estimateDiv128To64( rem1, 0, bSig );
3837 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3838 mul64To128( bSig, zSig1, &term1, &term2 );
3839 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3840 while ( (sbits64) rem1 < 0 ) {
3841 --zSig1;
3842 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3843 }
3844 zSig1 |= ( ( rem1 | rem2 ) != 0 );
3845 }
3846 return
3847 roundAndPackFloatx80(
3848 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3849
3850 }
3851
3852 /*----------------------------------------------------------------------------
3853 | Returns the remainder of the extended double-precision floating-point value
3854 | `a' with respect to the corresponding value `b'. The operation is performed
3855 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3856 *----------------------------------------------------------------------------*/
3857
3858 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3859 {
3860 flag aSign, bSign, zSign;
3861 int32 aExp, bExp, expDiff;
3862 bits64 aSig0, aSig1, bSig;
3863 bits64 q, term0, term1, alternateASig0, alternateASig1;
3864 floatx80 z;
3865
3866 aSig0 = extractFloatx80Frac( a );
3867 aExp = extractFloatx80Exp( a );
3868 aSign = extractFloatx80Sign( a );
3869 bSig = extractFloatx80Frac( b );
3870 bExp = extractFloatx80Exp( b );
3871 bSign = extractFloatx80Sign( b );
3872 if ( aExp == 0x7FFF ) {
3873 if ( (bits64) ( aSig0<<1 )
3874 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3875 return propagateFloatx80NaN( a, b );
3876 }
3877 goto invalid;
3878 }
3879 if ( bExp == 0x7FFF ) {
3880 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3881 return a;
3882 }
3883 if ( bExp == 0 ) {
3884 if ( bSig == 0 ) {
3885 invalid:
3886 float_raise( float_flag_invalid );
3887 z.low = floatx80_default_nan_low;
3888 z.high = floatx80_default_nan_high;
3889 return z;
3890 }
3891 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3892 }
3893 if ( aExp == 0 ) {
3894 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3895 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3896 }
3897 bSig |= LIT64( 0x8000000000000000 );
3898 zSign = aSign;
3899 expDiff = aExp - bExp;
3900 aSig1 = 0;
3901 if ( expDiff < 0 ) {
3902 if ( expDiff < -1 ) return a;
3903 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3904 expDiff = 0;
3905 }
3906 q = ( bSig <= aSig0 );
3907 if ( q ) aSig0 -= bSig;
3908 expDiff -= 64;
3909 while ( 0 < expDiff ) {
3910 q = estimateDiv128To64( aSig0, aSig1, bSig );
3911 q = ( 2 < q ) ? q - 2 : 0;
3912 mul64To128( bSig, q, &term0, &term1 );
3913 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3914 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3915 expDiff -= 62;
3916 }
3917 expDiff += 64;
3918 if ( 0 < expDiff ) {
3919 q = estimateDiv128To64( aSig0, aSig1, bSig );
3920 q = ( 2 < q ) ? q - 2 : 0;
3921 q >>= 64 - expDiff;
3922 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3923 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3924 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3925 while ( le128( term0, term1, aSig0, aSig1 ) ) {
3926 ++q;
3927 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3928 }
3929 }
3930 else {
3931 term1 = 0;
3932 term0 = bSig;
3933 }
3934 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3935 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3936 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3937 && ( q & 1 ) )
3938 ) {
3939 aSig0 = alternateASig0;
3940 aSig1 = alternateASig1;
3941 zSign = ! zSign;
3942 }
3943 return
3944 normalizeRoundAndPackFloatx80(
3945 80, zSign, bExp + expDiff, aSig0, aSig1 );
3946
3947 }
3948
3949 /*----------------------------------------------------------------------------
3950 | Returns the square root of the extended double-precision floating-point
3951 | value `a'. The operation is performed according to the IEC/IEEE Standard
3952 | for Binary Floating-Point Arithmetic.
3953 *----------------------------------------------------------------------------*/
3954
3955 floatx80 floatx80_sqrt( floatx80 a )
3956 {
3957 flag aSign;
3958 int32 aExp, zExp;
3959 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3960 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3961 floatx80 z;
3962
3963 aSig0 = extractFloatx80Frac( a );
3964 aExp = extractFloatx80Exp( a );
3965 aSign = extractFloatx80Sign( a );
3966 if ( aExp == 0x7FFF ) {
3967 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3968 if ( ! aSign ) return a;
3969 goto invalid;
3970 }
3971 if ( aSign ) {
3972 if ( ( aExp | aSig0 ) == 0 ) return a;
3973 invalid:
3974 float_raise( float_flag_invalid );
3975 z.low = floatx80_default_nan_low;
3976 z.high = floatx80_default_nan_high;
3977 return z;
3978 }
3979 if ( aExp == 0 ) {
3980 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3981 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3982 }
3983 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3984 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3985 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
3986 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
3987 doubleZSig0 = zSig0<<1;
3988 mul64To128( zSig0, zSig0, &term0, &term1 );
3989 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3990 while ( (sbits64) rem0 < 0 ) {
3991 --zSig0;
3992 doubleZSig0 -= 2;
3993 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
3994 }
3995 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
3996 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
3997 if ( zSig1 == 0 ) zSig1 = 1;
3998 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
3999 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4000 mul64To128( zSig1, zSig1, &term2, &term3 );
4001 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4002 while ( (sbits64) rem1 < 0 ) {
4003 --zSig1;
4004 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4005 term3 |= 1;
4006 term2 |= doubleZSig0;
4007 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4008 }
4009 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4010 }
4011 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4012 zSig0 |= doubleZSig0;
4013 return
4014 roundAndPackFloatx80(
4015 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4016
4017 }
4018
4019 /*----------------------------------------------------------------------------
4020 | Returns 1 if the extended double-precision floating-point value `a' is
4021 | equal to the corresponding value `b', and 0 otherwise. The comparison is
4022 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4023 | Arithmetic.
4024 *----------------------------------------------------------------------------*/
4025
4026 flag floatx80_eq( floatx80 a, floatx80 b )
4027 {
4028
4029 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4030 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4031 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4032 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4033 ) {
4034 if ( floatx80_is_signaling_nan( a )
4035 || floatx80_is_signaling_nan( b ) ) {
4036 float_raise( float_flag_invalid );
4037 }
4038 return 0;
4039 }
4040 return
4041 ( a.low == b.low )
4042 && ( ( a.high == b.high )
4043 || ( ( a.low == 0 )
4044 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4045 );
4046
4047 }
4048
4049 /*----------------------------------------------------------------------------
4050 | Returns 1 if the extended double-precision floating-point value `a' is
4051 | less than or equal to the corresponding value `b', and 0 otherwise. The
4052 | comparison is performed according to the IEC/IEEE Standard for Binary
4053 | Floating-Point Arithmetic.
4054 *----------------------------------------------------------------------------*/
4055
4056 flag floatx80_le( floatx80 a, floatx80 b )
4057 {
4058 flag aSign, bSign;
4059
4060 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4061 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4062 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4063 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4064 ) {
4065 float_raise( float_flag_invalid );
4066 return 0;
4067 }
4068 aSign = extractFloatx80Sign( a );
4069 bSign = extractFloatx80Sign( b );
4070 if ( aSign != bSign ) {
4071 return
4072 aSign
4073 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4074 == 0 );
4075 }
4076 return
4077 aSign ? le128( b.high, b.low, a.high, a.low )
4078 : le128( a.high, a.low, b.high, b.low );
4079
4080 }
4081
4082 /*----------------------------------------------------------------------------
4083 | Returns 1 if the extended double-precision floating-point value `a' is
4084 | less than the corresponding value `b', and 0 otherwise. The comparison
4085 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4086 | Arithmetic.
4087 *----------------------------------------------------------------------------*/
4088
4089 flag floatx80_lt( floatx80 a, floatx80 b )
4090 {
4091 flag aSign, bSign;
4092
4093 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4094 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4095 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4096 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4097 ) {
4098 float_raise( float_flag_invalid );
4099 return 0;
4100 }
4101 aSign = extractFloatx80Sign( a );
4102 bSign = extractFloatx80Sign( b );
4103 if ( aSign != bSign ) {
4104 return
4105 aSign
4106 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4107 != 0 );
4108 }
4109 return
4110 aSign ? lt128( b.high, b.low, a.high, a.low )
4111 : lt128( a.high, a.low, b.high, b.low );
4112
4113 }
4114
4115 /*----------------------------------------------------------------------------
4116 | Returns 1 if the extended double-precision floating-point value `a' is equal
4117 | to the corresponding value `b', and 0 otherwise. The invalid exception is
4118 | raised if either operand is a NaN. Otherwise, the comparison is performed
4119 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4120 *----------------------------------------------------------------------------*/
4121
4122 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4123 {
4124
4125 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4126 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4127 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4128 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4129 ) {
4130 float_raise( float_flag_invalid );
4131 return 0;
4132 }
4133 return
4134 ( a.low == b.low )
4135 && ( ( a.high == b.high )
4136 || ( ( a.low == 0 )
4137 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4138 );
4139
4140 }
4141
4142 /*----------------------------------------------------------------------------
4143 | Returns 1 if the extended double-precision floating-point value `a' is less
4144 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4145 | do not cause an exception. Otherwise, the comparison is performed according
4146 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4147 *----------------------------------------------------------------------------*/
4148
4149 flag floatx80_le_quiet( floatx80 a, floatx80 b )
4150 {
4151 flag aSign, bSign;
4152
4153 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4154 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4155 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4156 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4157 ) {
4158 if ( floatx80_is_signaling_nan( a )
4159 || floatx80_is_signaling_nan( b ) ) {
4160 float_raise( float_flag_invalid );
4161 }
4162 return 0;
4163 }
4164 aSign = extractFloatx80Sign( a );
4165 bSign = extractFloatx80Sign( b );
4166 if ( aSign != bSign ) {
4167 return
4168 aSign
4169 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4170 == 0 );
4171 }
4172 return
4173 aSign ? le128( b.high, b.low, a.high, a.low )
4174 : le128( a.high, a.low, b.high, b.low );
4175
4176 }
4177
4178 /*----------------------------------------------------------------------------
4179 | Returns 1 if the extended double-precision floating-point value `a' is less
4180 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4181 | an exception. Otherwise, the comparison is performed according to the
4182 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4183 *----------------------------------------------------------------------------*/
4184
4185 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4186 {
4187 flag aSign, bSign;
4188
4189 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4190 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4191 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4192 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4193 ) {
4194 if ( floatx80_is_signaling_nan( a )
4195 || floatx80_is_signaling_nan( b ) ) {
4196 float_raise( float_flag_invalid );
4197 }
4198 return 0;
4199 }
4200 aSign = extractFloatx80Sign( a );
4201 bSign = extractFloatx80Sign( b );
4202 if ( aSign != bSign ) {
4203 return
4204 aSign
4205 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4206 != 0 );
4207 }
4208 return
4209 aSign ? lt128( b.high, b.low, a.high, a.low )
4210 : lt128( a.high, a.low, b.high, b.low );
4211
4212 }
4213
4214 #endif
4215
4216 #ifdef FLOAT128
4217
4218 /*----------------------------------------------------------------------------
4219 | Returns the result of converting the quadruple-precision floating-point
4220 | value `a' to the 32-bit two's complement integer format. The conversion
4221 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4222 | Arithmetic---which means in particular that the conversion is rounded
4223 | according to the current rounding mode. If `a' is a NaN, the largest
4224 | positive integer is returned. Otherwise, if the conversion overflows, the
4225 | largest integer with the same sign as `a' is returned.
4226 *----------------------------------------------------------------------------*/
4227
4228 int32 float128_to_int32( float128 a )
4229 {
4230 flag aSign;
4231 int32 aExp, shiftCount;
4232 bits64 aSig0, aSig1;
4233
4234 aSig1 = extractFloat128Frac1( a );
4235 aSig0 = extractFloat128Frac0( a );
4236 aExp = extractFloat128Exp( a );
4237 aSign = extractFloat128Sign( a );
4238 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4239 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4240 aSig0 |= ( aSig1 != 0 );
4241 shiftCount = 0x4028 - aExp;
4242 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4243 return roundAndPackInt32( aSign, aSig0 );
4244
4245 }
4246
4247 /*----------------------------------------------------------------------------
4248 | Returns the result of converting the quadruple-precision floating-point
4249 | value `a' to the 32-bit two's complement integer format. The conversion
4250 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4251 | Arithmetic, except that the conversion is always rounded toward zero. If
4252 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4253 | conversion overflows, the largest integer with the same sign as `a' is
4254 | returned.
4255 *----------------------------------------------------------------------------*/
4256
4257 int32 float128_to_int32_round_to_zero( float128 a )
4258 {
4259 flag aSign;
4260 int32 aExp, shiftCount;
4261 bits64 aSig0, aSig1, savedASig;
4262 int32 z;
4263
4264 aSig1 = extractFloat128Frac1( a );
4265 aSig0 = extractFloat128Frac0( a );
4266 aExp = extractFloat128Exp( a );
4267 aSign = extractFloat128Sign( a );
4268 aSig0 |= ( aSig1 != 0 );
4269 if ( 0x401E < aExp ) {
4270 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4271 goto invalid;
4272 }
4273 else if ( aExp < 0x3FFF ) {
4274 if ( aExp || aSig0 ) float_set_inexact();
4275 return 0;
4276 }
4277 aSig0 |= LIT64( 0x0001000000000000 );
4278 shiftCount = 0x402F - aExp;
4279 savedASig = aSig0;
4280 aSig0 >>= shiftCount;
4281 z = aSig0;
4282 if ( aSign ) z = - z;
4283 if ( ( z < 0 ) ^ aSign ) {
4284 invalid:
4285 float_raise( float_flag_invalid );
4286 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4287 }
4288 if ( ( aSig0<<shiftCount ) != savedASig ) {
4289 float_set_inexact();
4290 }
4291 return z;
4292
4293 }
4294
4295 /*----------------------------------------------------------------------------
4296 | Returns the result of converting the quadruple-precision floating-point
4297 | value `a' to the 64-bit two's complement integer format. The conversion
4298 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4299 | Arithmetic---which means in particular that the conversion is rounded
4300 | according to the current rounding mode. If `a' is a NaN, the largest
4301 | positive integer is returned. Otherwise, if the conversion overflows, the
4302 | largest integer with the same sign as `a' is returned.
4303 *----------------------------------------------------------------------------*/
4304
4305 int64 float128_to_int64( float128 a )
4306 {
4307 flag aSign;
4308 int32 aExp, shiftCount;
4309 bits64 aSig0, aSig1;
4310
4311 aSig1 = extractFloat128Frac1( a );
4312 aSig0 = extractFloat128Frac0( a );
4313 aExp = extractFloat128Exp( a );
4314 aSign = extractFloat128Sign( a );
4315 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4316 shiftCount = 0x402F - aExp;
4317 if ( shiftCount <= 0 ) {
4318 if ( 0x403E < aExp ) {
4319 float_raise( float_flag_invalid );
4320 if ( ! aSign
4321 || ( ( aExp == 0x7FFF )
4322 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4323 )
4324 ) {
4325 return LIT64( 0x7FFFFFFFFFFFFFFF );
4326 }
4327 return (sbits64) LIT64( 0x8000000000000000 );
4328 }
4329 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4330 }
4331 else {
4332 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4333 }
4334 return roundAndPackInt64( aSign, aSig0, aSig1 );
4335
4336 }
4337
4338 /*----------------------------------------------------------------------------
4339 | Returns the result of converting the quadruple-precision floating-point
4340 | value `a' to the 64-bit two's complement integer format. The conversion
4341 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4342 | Arithmetic, except that the conversion is always rounded toward zero.
4343 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4344 | the conversion overflows, the largest integer with the same sign as `a' is
4345 | returned.
4346 *----------------------------------------------------------------------------*/
4347
4348 int64 float128_to_int64_round_to_zero( float128 a )
4349 {
4350 flag aSign;
4351 int32 aExp, shiftCount;
4352 bits64 aSig0, aSig1;
4353 int64 z;
4354
4355 aSig1 = extractFloat128Frac1( a );
4356 aSig0 = extractFloat128Frac0( a );
4357 aExp = extractFloat128Exp( a );
4358 aSign = extractFloat128Sign( a );
4359 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4360 shiftCount = aExp - 0x402F;
4361 if ( 0 < shiftCount ) {
4362 if ( 0x403E <= aExp ) {
4363 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4364 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4365 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4366 if ( aSig1 ) float_set_inexact();
4367 }
4368 else {
4369 float_raise( float_flag_invalid );
4370 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4371 return LIT64( 0x7FFFFFFFFFFFFFFF );
4372 }
4373 }
4374 return (sbits64) LIT64( 0x8000000000000000 );
4375 }
4376 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4377 if ( (bits64) ( aSig1<<shiftCount ) ) {
4378 float_set_inexact();
4379 }
4380 }
4381 else {
4382 if ( aExp < 0x3FFF ) {
4383 if ( aExp | aSig0 | aSig1 ) {
4384 float_set_inexact();
4385 }
4386 return 0;
4387 }
4388 z = aSig0>>( - shiftCount );
4389 if ( aSig1
4390 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4391 float_set_inexact();
4392 }
4393 }
4394 if ( aSign ) z = - z;
4395 return z;
4396
4397 }
4398
4399 /*----------------------------------------------------------------------------
4400 | Returns the result of converting the quadruple-precision floating-point
4401 | value `a' to the single-precision floating-point format. The conversion
4402 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4403 | Arithmetic.
4404 *----------------------------------------------------------------------------*/
4405
4406 float32 float128_to_float32( float128 a )
4407 {
4408 flag aSign;
4409 int32 aExp;
4410 bits64 aSig0, aSig1;
4411 bits32 zSig;
4412
4413 aSig1 = extractFloat128Frac1( a );
4414 aSig0 = extractFloat128Frac0( a );
4415 aExp = extractFloat128Exp( a );
4416 aSign = extractFloat128Sign( a );
4417 if ( aExp == 0x7FFF ) {
4418 if ( aSig0 | aSig1 ) {
4419 return commonNaNToFloat32( float128ToCommonNaN( a ) );
4420 }
4421 return packFloat32( aSign, 0xFF, 0 );
4422 }
4423 aSig0 |= ( aSig1 != 0 );
4424 shift64RightJamming( aSig0, 18, &aSig0 );
4425 zSig = aSig0;
4426 if ( aExp || zSig ) {
4427 zSig |= 0x40000000;
4428 aExp -= 0x3F81;
4429 }
4430 return roundAndPackFloat32( aSign, aExp, zSig );
4431
4432 }
4433
4434 /*----------------------------------------------------------------------------
4435 | Returns the result of converting the quadruple-precision floating-point
4436 | value `a' to the double-precision floating-point format. The conversion
4437 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4438 | Arithmetic.
4439 *----------------------------------------------------------------------------*/
4440
4441 float64 float128_to_float64( float128 a )
4442 {
4443 flag aSign;
4444 int32 aExp;
4445 bits64 aSig0, aSig1;
4446
4447 aSig1 = extractFloat128Frac1( a );
4448 aSig0 = extractFloat128Frac0( a );
4449 aExp = extractFloat128Exp( a );
4450 aSign = extractFloat128Sign( a );
4451 if ( aExp == 0x7FFF ) {
4452 if ( aSig0 | aSig1 ) {
4453 return commonNaNToFloat64( float128ToCommonNaN( a ) );
4454 }
4455 return packFloat64( aSign, 0x7FF, 0 );
4456 }
4457 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4458 aSig0 |= ( aSig1 != 0 );
4459 if ( aExp || aSig0 ) {
4460 aSig0 |= LIT64( 0x4000000000000000 );
4461 aExp -= 0x3C01;
4462 }
4463 return roundAndPackFloat64( aSign, aExp, aSig0 );
4464
4465 }
4466
4467 #ifdef FLOATX80
4468
4469 /*----------------------------------------------------------------------------
4470 | Returns the result of converting the quadruple-precision floating-point
4471 | value `a' to the extended double-precision floating-point format. The
4472 | conversion is performed according to the IEC/IEEE Standard for Binary
4473 | Floating-Point Arithmetic.
4474 *----------------------------------------------------------------------------*/
4475
4476 floatx80 float128_to_floatx80( float128 a )
4477 {
4478 flag aSign;
4479 int32 aExp;
4480 bits64 aSig0, aSig1;
4481
4482 aSig1 = extractFloat128Frac1( a );
4483 aSig0 = extractFloat128Frac0( a );
4484 aExp = extractFloat128Exp( a );
4485 aSign = extractFloat128Sign( a );
4486 if ( aExp == 0x7FFF ) {
4487 if ( aSig0 | aSig1 ) {
4488 return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4489 }
4490 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4491 }
4492 if ( aExp == 0 ) {
4493 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4494 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4495 }
4496 else {
4497 aSig0 |= LIT64( 0x0001000000000000 );
4498 }
4499 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4500 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4501
4502 }
4503
4504 #endif
4505
4506 /*----------------------------------------------------------------------------
4507 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4508 | returns the result as a quadruple-precision floating-point value. The
4509 | operation is performed according to the IEC/IEEE Standard for Binary
4510 | Floating-Point Arithmetic.
4511 *----------------------------------------------------------------------------*/
4512
4513 float128 float128_round_to_int( float128 a )
4514 {
4515 flag aSign;
4516 int32 aExp;
4517 bits64 lastBitMask, roundBitsMask;
4518 int8 roundingMode;
4519 float128 z;
4520
4521 aExp = extractFloat128Exp( a );
4522 if ( 0x402F <= aExp ) {
4523 if ( 0x406F <= aExp ) {
4524 if ( ( aExp == 0x7FFF )
4525 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4526 ) {
4527 return propagateFloat128NaN( a, a );
4528 }
4529 return a;
4530 }
4531 lastBitMask = 1;
4532 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4533 roundBitsMask = lastBitMask - 1;
4534 z = a;
4535 roundingMode = float_rounding_mode();
4536 if ( roundingMode == float_round_nearest_even ) {
4537 if ( lastBitMask ) {
4538 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4539 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4540 }
4541 else {
4542 if ( (sbits64) z.low < 0 ) {
4543 ++z.high;
4544 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4545 }
4546 }
4547 }
4548 else if ( roundingMode != float_round_to_zero ) {
4549 if ( extractFloat128Sign( z )
4550 ^ ( roundingMode == float_round_up ) ) {
4551 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4552 }
4553 }
4554 z.low &= ~ roundBitsMask;
4555 }
4556 else {
4557 if ( aExp < 0x3FFF ) {
4558 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4559 float_set_inexact();
4560 aSign = extractFloat128Sign( a );
4561 switch ( float_rounding_mode() ) {
4562 case float_round_nearest_even:
4563 if ( ( aExp == 0x3FFE )
4564 && ( extractFloat128Frac0( a )
4565 | extractFloat128Frac1( a ) )
4566 ) {
4567 return packFloat128( aSign, 0x3FFF, 0, 0 );
4568 }
4569 break;
4570 case float_round_down:
4571 return
4572 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4573 : packFloat128( 0, 0, 0, 0 );
4574 case float_round_up:
4575 return
4576 aSign ? packFloat128( 1, 0, 0, 0 )
4577 : packFloat128( 0, 0x3FFF, 0, 0 );
4578 }
4579 return packFloat128( aSign, 0, 0, 0 );
4580 }
4581 lastBitMask = 1;
4582 lastBitMask <<= 0x402F - aExp;
4583 roundBitsMask = lastBitMask - 1;
4584 z.low = 0;
4585 z.high = a.high;
4586 roundingMode = float_rounding_mode();
4587 if ( roundingMode == float_round_nearest_even ) {
4588 z.high += lastBitMask>>1;
4589 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4590 z.high &= ~ lastBitMask;
4591 }
4592 }
4593 else if ( roundingMode != float_round_to_zero ) {
4594 if ( extractFloat128Sign( z )
4595 ^ ( roundingMode == float_round_up ) ) {
4596 z.high |= ( a.low != 0 );
4597 z.high += roundBitsMask;
4598 }
4599 }
4600 z.high &= ~ roundBitsMask;
4601 }
4602 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4603 float_set_inexact();
4604 }
4605 return z;
4606
4607 }
4608
4609 /*----------------------------------------------------------------------------
4610 | Returns the result of adding the absolute values of the quadruple-precision
4611 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4612 | before being returned. `zSign' is ignored if the result is a NaN.
4613 | The addition is performed according to the IEC/IEEE Standard for Binary
4614 | Floating-Point Arithmetic.
4615 *----------------------------------------------------------------------------*/
4616
4617 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4618 {
4619 int32 aExp, bExp, zExp;
4620 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4621 int32 expDiff;
4622
4623 aSig1 = extractFloat128Frac1( a );
4624 aSig0 = extractFloat128Frac0( a );
4625 aExp = extractFloat128Exp( a );
4626 bSig1 = extractFloat128Frac1( b );
4627 bSig0 = extractFloat128Frac0( b );
4628 bExp = extractFloat128Exp( b );
4629 expDiff = aExp - bExp;
4630 if ( 0 < expDiff ) {
4631 if ( aExp == 0x7FFF ) {
4632 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4633 return a;
4634 }
4635 if ( bExp == 0 ) {
4636 --expDiff;
4637 }
4638 else {
4639 bSig0 |= LIT64( 0x0001000000000000 );
4640 }
4641 shift128ExtraRightJamming(
4642 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4643 zExp = aExp;
4644 }
4645 else if ( expDiff < 0 ) {
4646 if ( bExp == 0x7FFF ) {
4647 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4648 return packFloat128( zSign, 0x7FFF, 0, 0 );
4649 }
4650 if ( aExp == 0 ) {
4651 ++expDiff;
4652 }
4653 else {
4654 aSig0 |= LIT64( 0x0001000000000000 );
4655 }
4656 shift128ExtraRightJamming(
4657 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4658 zExp = bExp;
4659 }
4660 else {
4661 if ( aExp == 0x7FFF ) {
4662 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4663 return propagateFloat128NaN( a, b );
4664 }
4665 return a;
4666 }
4667 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4668 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4669 zSig2 = 0;
4670 zSig0 |= LIT64( 0x0002000000000000 );
4671 zExp = aExp;
4672 goto shiftRight1;
4673 }
4674 aSig0 |= LIT64( 0x0001000000000000 );
4675 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4676 --zExp;
4677 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4678 ++zExp;
4679 shiftRight1:
4680 shift128ExtraRightJamming(
4681 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4682 roundAndPack:
4683 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4684
4685 }
4686
4687 /*----------------------------------------------------------------------------
4688 | Returns the result of subtracting the absolute values of the quadruple-
4689 | precision floating-point values `a' and `b'. If `zSign' is 1, the
4690 | difference is negated before being returned. `zSign' is ignored if the
4691 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4692 | Standard for Binary Floating-Point Arithmetic.
4693 *----------------------------------------------------------------------------*/
4694
4695 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4696 {
4697 int32 aExp, bExp, zExp;
4698 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4699 int32 expDiff;
4700 float128 z;
4701
4702 aSig1 = extractFloat128Frac1( a );
4703 aSig0 = extractFloat128Frac0( a );
4704 aExp = extractFloat128Exp( a );
4705 bSig1 = extractFloat128Frac1( b );
4706 bSig0 = extractFloat128Frac0( b );
4707 bExp = extractFloat128Exp( b );
4708 expDiff = aExp - bExp;
4709 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4710 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4711 if ( 0 < expDiff ) goto aExpBigger;
4712 if ( expDiff < 0 ) goto bExpBigger;
4713 if ( aExp == 0x7FFF ) {
4714 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4715 return propagateFloat128NaN( a, b );
4716 }
4717 float_raise( float_flag_invalid );
4718 z.low = float128_default_nan_low;
4719 z.high = float128_default_nan_high;
4720 return z;
4721 }
4722 if ( aExp == 0 ) {
4723 aExp = 1;
4724 bExp = 1;
4725 }
4726 if ( bSig0 < aSig0 ) goto aBigger;
4727 if ( aSig0 < bSig0 ) goto bBigger;
4728 if ( bSig1 < aSig1 ) goto aBigger;
4729 if ( aSig1 < bSig1 ) goto bBigger;
4730 return packFloat128( float_rounding_mode() == float_round_down, 0, 0, 0 );
4731 bExpBigger:
4732 if ( bExp == 0x7FFF ) {
4733 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4734 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4735 }
4736 if ( aExp == 0 ) {
4737 ++expDiff;
4738 }
4739 else {
4740 aSig0 |= LIT64( 0x4000000000000000 );
4741 }
4742 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4743 bSig0 |= LIT64( 0x4000000000000000 );
4744 bBigger:
4745 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4746 zExp = bExp;
4747 zSign ^= 1;
4748 goto normalizeRoundAndPack;
4749 aExpBigger:
4750 if ( aExp == 0x7FFF ) {
4751 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4752 return a;
4753 }
4754 if ( bExp == 0 ) {
4755 --expDiff;
4756 }
4757 else {
4758 bSig0 |= LIT64( 0x4000000000000000 );
4759 }
4760 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4761 aSig0 |= LIT64( 0x4000000000000000 );
4762 aBigger:
4763 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4764 zExp = aExp;
4765 normalizeRoundAndPack:
4766 --zExp;
4767 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4768
4769 }
4770
4771 /*----------------------------------------------------------------------------
4772 | Returns the result of adding the quadruple-precision floating-point values
4773 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4774 | for Binary Floating-Point Arithmetic.
4775 *----------------------------------------------------------------------------*/
4776
4777 float128 float128_add( float128 a, float128 b )
4778 {
4779 flag aSign, bSign;
4780
4781 aSign = extractFloat128Sign( a );
4782 bSign = extractFloat128Sign( b );
4783 if ( aSign == bSign ) {
4784 return addFloat128Sigs( a, b, aSign );
4785 }
4786 else {
4787 return subFloat128Sigs( a, b, aSign );
4788 }
4789
4790 }
4791
4792 /*----------------------------------------------------------------------------
4793 | Returns the result of subtracting the quadruple-precision floating-point
4794 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4795 | Standard for Binary Floating-Point Arithmetic.
4796 *----------------------------------------------------------------------------*/
4797
4798 float128 float128_sub( float128 a, float128 b )
4799 {
4800 flag aSign, bSign;
4801
4802 aSign = extractFloat128Sign( a );
4803 bSign = extractFloat128Sign( b );
4804 if ( aSign == bSign ) {
4805 return subFloat128Sigs( a, b, aSign );
4806 }
4807 else {
4808 return addFloat128Sigs( a, b, aSign );
4809 }
4810
4811 }
4812
4813 /*----------------------------------------------------------------------------
4814 | Returns the result of multiplying the quadruple-precision floating-point
4815 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4816 | Standard for Binary Floating-Point Arithmetic.
4817 *----------------------------------------------------------------------------*/
4818
4819 float128 float128_mul( float128 a, float128 b )
4820 {
4821 flag aSign, bSign, zSign;
4822 int32 aExp, bExp, zExp;
4823 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4824 float128 z;
4825
4826 aSig1 = extractFloat128Frac1( a );
4827 aSig0 = extractFloat128Frac0( a );
4828 aExp = extractFloat128Exp( a );
4829 aSign = extractFloat128Sign( a );
4830 bSig1 = extractFloat128Frac1( b );
4831 bSig0 = extractFloat128Frac0( b );
4832 bExp = extractFloat128Exp( b );
4833 bSign = extractFloat128Sign( b );
4834 zSign = aSign ^ bSign;
4835 if ( aExp == 0x7FFF ) {
4836 if ( ( aSig0 | aSig1 )
4837 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4838 return propagateFloat128NaN( a, b );
4839 }
4840 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4841 return packFloat128( zSign, 0x7FFF, 0, 0 );
4842 }
4843 if ( bExp == 0x7FFF ) {
4844 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4845 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4846 invalid:
4847 float_raise( float_flag_invalid );
4848 z.low = float128_default_nan_low;
4849 z.high = float128_default_nan_high;
4850 return z;
4851 }
4852 return packFloat128( zSign, 0x7FFF, 0, 0 );
4853 }
4854 if ( aExp == 0 ) {
4855 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4856 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4857 }
4858 if ( bExp == 0 ) {
4859 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4860 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4861 }
4862 zExp = aExp + bExp - 0x4000;
4863 aSig0 |= LIT64( 0x0001000000000000 );
4864 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4865 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4866 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4867 zSig2 |= ( zSig3 != 0 );
4868 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4869 shift128ExtraRightJamming(
4870 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4871 ++zExp;
4872 }
4873 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4874
4875 }
4876
4877 /*----------------------------------------------------------------------------
4878 | Returns the result of dividing the quadruple-precision floating-point value
4879 | `a' by the corresponding value `b'. The operation is performed according to
4880 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4881 *----------------------------------------------------------------------------*/
4882
4883 float128 float128_div( float128 a, float128 b )
4884 {
4885 flag aSign, bSign, zSign;
4886 int32 aExp, bExp, zExp;
4887 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4888 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4889 float128 z;
4890
4891 aSig1 = extractFloat128Frac1( a );
4892 aSig0 = extractFloat128Frac0( a );
4893 aExp = extractFloat128Exp( a );
4894 aSign = extractFloat128Sign( a );
4895 bSig1 = extractFloat128Frac1( b );
4896 bSig0 = extractFloat128Frac0( b );
4897 bExp = extractFloat128Exp( b );
4898 bSign = extractFloat128Sign( b );
4899 zSign = aSign ^ bSign;
4900 if ( aExp == 0x7FFF ) {
4901 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4902 if ( bExp == 0x7FFF ) {
4903 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4904 goto invalid;
4905 }
4906 return packFloat128( zSign, 0x7FFF, 0, 0 );
4907 }
4908 if ( bExp == 0x7FFF ) {
4909 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4910 return packFloat128( zSign, 0, 0, 0 );
4911 }
4912 if ( bExp == 0 ) {
4913 if ( ( bSig0 | bSig1 ) == 0 ) {
4914 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4915 invalid:
4916 float_raise( float_flag_invalid );
4917 z.low = float128_default_nan_low;
4918 z.high = float128_default_nan_high;
4919 return z;
4920 }
4921 float_raise( float_flag_divbyzero );
4922 return packFloat128( zSign, 0x7FFF, 0, 0 );
4923 }
4924 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4925 }
4926 if ( aExp == 0 ) {
4927 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4928 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4929 }
4930 zExp = aExp - bExp + 0x3FFD;
4931 shortShift128Left(
4932 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4933 shortShift128Left(
4934 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4935 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4936 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4937 ++zExp;
4938 }
4939 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4940 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4941 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4942 while ( (sbits64) rem0 < 0 ) {
4943 --zSig0;
4944 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4945 }
4946 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4947 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4948 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4949 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4950 while ( (sbits64) rem1 < 0 ) {
4951 --zSig1;
4952 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4953 }
4954 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4955 }
4956 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4957 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4958
4959 }
4960
4961 /*----------------------------------------------------------------------------
4962 | Returns the remainder of the quadruple-precision floating-point value `a'
4963 | with respect to the corresponding value `b'. The operation is performed
4964 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4965 *----------------------------------------------------------------------------*/
4966
4967 float128 float128_rem( float128 a, float128 b )
4968 {
4969 flag aSign, bSign, zSign;
4970 int32 aExp, bExp, expDiff;
4971 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
4972 bits64 allZero, alternateASig0, alternateASig1, sigMean1;
4973 sbits64 sigMean0;
4974 float128 z;
4975
4976 aSig1 = extractFloat128Frac1( a );
4977 aSig0 = extractFloat128Frac0( a );
4978 aExp = extractFloat128Exp( a );
4979 aSign = extractFloat128Sign( a );
4980 bSig1 = extractFloat128Frac1( b );
4981 bSig0 = extractFloat128Frac0( b );
4982 bExp = extractFloat128Exp( b );
4983 bSign = extractFloat128Sign( b );
4984 if ( aExp == 0x7FFF ) {
4985 if ( ( aSig0 | aSig1 )
4986 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4987 return propagateFloat128NaN( a, b );
4988 }
4989 goto invalid;
4990 }
4991 if ( bExp == 0x7FFF ) {
4992 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4993 return a;
4994 }
4995 if ( bExp == 0 ) {
4996 if ( ( bSig0 | bSig1 ) == 0 ) {
4997 invalid:
4998 float_raise( float_flag_invalid );
4999 z.low = float128_default_nan_low;
5000 z.high = float128_default_nan_high;
5001 return z;
5002 }
5003 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5004 }
5005 if ( aExp == 0 ) {
5006 if ( ( aSig0 | aSig1 ) == 0 ) return a;
5007 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5008 }
5009 expDiff = aExp - bExp;
5010 if ( expDiff < -1 ) return a;
5011 shortShift128Left(
5012 aSig0 | LIT64( 0x0001000000000000 ),
5013 aSig1,
5014 15 - ( expDiff < 0 ),
5015 &aSig0,
5016 &aSig1
5017 );
5018 shortShift128Left(
5019 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5020 q = le128( bSig0, bSig1, aSig0, aSig1 );
5021 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5022 expDiff -= 64;
5023 while ( 0 < expDiff ) {
5024 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5025 q = ( 4 < q ) ? q - 4 : 0;
5026 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5027 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5028 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5029 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5030 expDiff -= 61;
5031 }
5032 if ( -64 < expDiff ) {
5033 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5034 q = ( 4 < q ) ? q - 4 : 0;
5035 q >>= - expDiff;
5036 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5037 expDiff += 52;
5038 if ( expDiff < 0 ) {
5039 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5040 }
5041 else {
5042 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5043 }
5044 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5045 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5046 }
5047 else {
5048 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5049 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5050 }
5051 do {
5052 alternateASig0 = aSig0;
5053 alternateASig1 = aSig1;
5054 ++q;
5055 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5056 } while ( 0 <= (sbits64) aSig0 );
5057 add128(
5058 aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
5059 if ( ( sigMean0 < 0 )
5060 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5061 aSig0 = alternateASig0;
5062 aSig1 = alternateASig1;
5063 }
5064 zSign = ( (sbits64) aSig0 < 0 );
5065 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5066 return
5067 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5068
5069 }
5070
5071 /*----------------------------------------------------------------------------
5072 | Returns the square root of the quadruple-precision floating-point value `a'.
5073 | The operation is performed according to the IEC/IEEE Standard for Binary
5074 | Floating-Point Arithmetic.
5075 *----------------------------------------------------------------------------*/
5076
5077 float128 float128_sqrt( float128 a )
5078 {
5079 flag aSign;
5080 int32 aExp, zExp;
5081 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5082 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5083 float128 z;
5084
5085 aSig1 = extractFloat128Frac1( a );
5086 aSig0 = extractFloat128Frac0( a );
5087 aExp = extractFloat128Exp( a );
5088 aSign = extractFloat128Sign( a );
5089 if ( aExp == 0x7FFF ) {
5090 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5091 if ( ! aSign ) return a;
5092 goto invalid;
5093 }
5094 if ( aSign ) {
5095 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5096 invalid:
5097 float_raise( float_flag_invalid );
5098 z.low = float128_default_nan_low;
5099 z.high = float128_default_nan_high;
5100 return z;
5101 }
5102 if ( aExp == 0 ) {
5103 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5104 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5105 }
5106 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5107 aSig0 |= LIT64( 0x0001000000000000 );
5108 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5109 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5110 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5111 doubleZSig0 = zSig0<<1;
5112 mul64To128( zSig0, zSig0, &term0, &term1 );
5113 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5114 while ( (sbits64) rem0 < 0 ) {
5115 --zSig0;
5116 doubleZSig0 -= 2;
5117 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5118 }
5119 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5120 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5121 if ( zSig1 == 0 ) zSig1 = 1;
5122 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5123 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5124 mul64To128( zSig1, zSig1, &term2, &term3 );
5125 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5126 while ( (sbits64) rem1 < 0 ) {
5127 --zSig1;
5128 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5129 term3 |= 1;
5130 term2 |= doubleZSig0;
5131 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5132 }
5133 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5134 }
5135 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5136 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5137
5138 }
5139
5140 /*----------------------------------------------------------------------------
5141 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5142 | the corresponding value `b', and 0 otherwise. The comparison is performed
5143 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5144 *----------------------------------------------------------------------------*/
5145
5146 flag float128_eq( float128 a, float128 b )
5147 {
5148
5149 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5150 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5151 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5152 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5153 ) {
5154 if ( float128_is_signaling_nan( a )
5155 || float128_is_signaling_nan( b ) ) {
5156 float_raise( float_flag_invalid );
5157 }
5158 return 0;
5159 }
5160 return
5161 ( a.low == b.low )
5162 && ( ( a.high == b.high )
5163 || ( ( a.low == 0 )
5164 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5165 );
5166
5167 }
5168
5169 /*----------------------------------------------------------------------------
5170 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5171 | or equal to the corresponding value `b', and 0 otherwise. The comparison
5172 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5173 | Arithmetic.
5174 *----------------------------------------------------------------------------*/
5175
5176 flag float128_le( float128 a, float128 b )
5177 {
5178 flag aSign, bSign;
5179
5180 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5181 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5182 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5183 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5184 ) {
5185 float_raise( float_flag_invalid );
5186 return 0;
5187 }
5188 aSign = extractFloat128Sign( a );
5189 bSign = extractFloat128Sign( b );
5190 if ( aSign != bSign ) {
5191 return
5192 aSign
5193 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5194 == 0 );
5195 }
5196 return
5197 aSign ? le128( b.high, b.low, a.high, a.low )
5198 : le128( a.high, a.low, b.high, b.low );
5199
5200 }
5201
5202 /*----------------------------------------------------------------------------
5203 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5204 | the corresponding value `b', and 0 otherwise. The comparison is performed
5205 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5206 *----------------------------------------------------------------------------*/
5207
5208 flag float128_lt( float128 a, float128 b )
5209 {
5210 flag aSign, bSign;
5211
5212 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5213 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5214 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5215 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5216 ) {
5217 float_raise( float_flag_invalid );
5218 return 0;
5219 }
5220 aSign = extractFloat128Sign( a );
5221 bSign = extractFloat128Sign( b );
5222 if ( aSign != bSign ) {
5223 return
5224 aSign
5225 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5226 != 0 );
5227 }
5228 return
5229 aSign ? lt128( b.high, b.low, a.high, a.low )
5230 : lt128( a.high, a.low, b.high, b.low );
5231
5232 }
5233
5234 /*----------------------------------------------------------------------------
5235 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5236 | the corresponding value `b', and 0 otherwise. The invalid exception is
5237 | raised if either operand is a NaN. Otherwise, the comparison is performed
5238 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5239 *----------------------------------------------------------------------------*/
5240
5241 flag float128_eq_signaling( float128 a, float128 b )
5242 {
5243
5244 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5245 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5246 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5247 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5248 ) {
5249 float_raise( float_flag_invalid );
5250 return 0;
5251 }
5252 return
5253 ( a.low == b.low )
5254 && ( ( a.high == b.high )
5255 || ( ( a.low == 0 )
5256 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5257 );
5258
5259 }
5260
5261 /*----------------------------------------------------------------------------
5262 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5263 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5264 | cause an exception. Otherwise, the comparison is performed according to the
5265 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5266 *----------------------------------------------------------------------------*/
5267
5268 flag float128_le_quiet( float128 a, float128 b )
5269 {
5270 flag aSign, bSign;
5271
5272 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5273 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5274 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5275 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5276 ) {
5277 if ( float128_is_signaling_nan( a )
5278 || float128_is_signaling_nan( b ) ) {
5279 float_raise( float_flag_invalid );
5280 }
5281 return 0;
5282 }
5283 aSign = extractFloat128Sign( a );
5284 bSign = extractFloat128Sign( b );
5285 if ( aSign != bSign ) {
5286 return
5287 aSign
5288 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5289 == 0 );
5290 }
5291 return
5292 aSign ? le128( b.high, b.low, a.high, a.low )
5293 : le128( a.high, a.low, b.high, b.low );
5294
5295 }
5296
5297 /*----------------------------------------------------------------------------
5298 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5299 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5300 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5301 | Standard for Binary Floating-Point Arithmetic.
5302 *----------------------------------------------------------------------------*/
5303
5304 flag float128_lt_quiet( float128 a, float128 b )
5305 {
5306 flag aSign, bSign;
5307
5308 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5309 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5310 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5311 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5312 ) {
5313 if ( float128_is_signaling_nan( a )
5314 || float128_is_signaling_nan( b ) ) {
5315 float_raise( float_flag_invalid );
5316 }
5317 return 0;
5318 }
5319 aSign = extractFloat128Sign( a );
5320 bSign = extractFloat128Sign( b );
5321 if ( aSign != bSign ) {
5322 return
5323 aSign
5324 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5325 != 0 );
5326 }
5327 return
5328 aSign ? lt128( b.high, b.low, a.high, a.low )
5329 : lt128( a.high, a.low, b.high, b.low );
5330
5331 }
5332
5333 #endif
5334
5335
5336 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5337
5338 /*
5339 * These two routines are not part of the original softfloat distribution.
5340 *
5341 * They are based on the corresponding conversions to integer but return
5342 * unsigned numbers instead since these functions are required by GCC.
5343 *
5344 * Added by Mark Brinicombe <mark (at) NetBSD.org> 27/09/97
5345 *
5346 * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5347 */
5348
5349 /*----------------------------------------------------------------------------
5350 | Returns the result of converting the double-precision floating-point value
5351 | `a' to the 32-bit unsigned integer format. The conversion is
5352 | performed according to the IEC/IEEE Standard for Binary Floating-point
5353 | Arithmetic, except that the conversion is always rounded toward zero. If
5354 | `a' is a NaN, the largest positive integer is returned. If the conversion
5355 | overflows, the largest integer positive is returned.
5356 *----------------------------------------------------------------------------*/
5357
5358 uint32 float64_to_uint32_round_to_zero( float64 a )
5359 {
5360 flag aSign;
5361 int16 aExp, shiftCount;
5362 bits64 aSig, savedASig;
5363 uint32 z;
5364
5365 aSig = extractFloat64Frac( a );
5366 aExp = extractFloat64Exp( a );
5367 aSign = extractFloat64Sign( a );
5368
5369 if (aSign) {
5370 float_raise( float_flag_invalid );
5371 return(0);
5372 }
5373
5374 if ( 0x41E < aExp ) {
5375 float_raise( float_flag_invalid );
5376 return 0xffffffff;
5377 }
5378 else if ( aExp < 0x3FF ) {
5379 if ( aExp || aSig ) float_set_inexact();
5380 return 0;
5381 }
5382 aSig |= LIT64( 0x0010000000000000 );
5383 shiftCount = 0x433 - aExp;
5384 savedASig = aSig;
5385 aSig >>= shiftCount;
5386 z = aSig;
5387 if ( ( aSig<<shiftCount ) != savedASig ) {
5388 float_set_inexact();
5389 }
5390 return z;
5391
5392 }
5393
5394 /*----------------------------------------------------------------------------
5395 | Returns the result of converting the single-precision floating-point value
5396 | `a' to the 32-bit unsigned integer format. The conversion is
5397 | performed according to the IEC/IEEE Standard for Binary Floating-point
5398 | Arithmetic, except that the conversion is always rounded toward zero. If
5399 | `a' is a NaN, the largest positive integer is returned. If the conversion
5400 | overflows, the largest positive integer is returned.
5401 *----------------------------------------------------------------------------*/
5402
5403 uint32 float32_to_uint32_round_to_zero( float32 a )
5404 {
5405 flag aSign;
5406 int16 aExp, shiftCount;
5407 bits32 aSig;
5408 uint32 z;
5409
5410 aSig = extractFloat32Frac( a );
5411 aExp = extractFloat32Exp( a );
5412 aSign = extractFloat32Sign( a );
5413 shiftCount = aExp - 0x9E;
5414
5415 if (aSign) {
5416 float_raise( float_flag_invalid );
5417 return(0);
5418 }
5419 if ( 0 < shiftCount ) {
5420 float_raise( float_flag_invalid );
5421 return 0xFFFFFFFF;
5422 }
5423 else if ( aExp <= 0x7E ) {
5424 if ( aExp | aSig ) float_set_inexact();
5425 return 0;
5426 }
5427 aSig = ( aSig | 0x800000 )<<8;
5428 z = aSig>>( - shiftCount );
5429 if ( aSig<<( shiftCount & 31 ) ) {
5430 float_set_inexact();
5431 }
5432 return z;
5433
5434 }
5435
5436 #endif
5437
5438 #endif /* _STANDALONE */
5439