softfloat.c revision 1.1 1 1.1 ross /* $NetBSD: softfloat.c,v 1.1 2001/04/26 03:10:47 ross Exp $ */
2 1.1 ross
3 1.1 ross /*
4 1.1 ross * This version hacked for use with gcc -msoft-float by bjh21.
5 1.1 ross * (Mostly a case of #ifdefing out things GCC doesn't need or provides
6 1.1 ross * itself).
7 1.1 ross */
8 1.1 ross
9 1.1 ross /*
10 1.1 ross * Things you may want to define:
11 1.1 ross *
12 1.1 ross * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13 1.1 ross * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them
14 1.1 ross * properly renamed.
15 1.1 ross */
16 1.1 ross
17 1.1 ross /*
18 1.1 ross ===============================================================================
19 1.1 ross
20 1.1 ross This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 1.1 ross Arithmetic Package, Release 2a.
22 1.1 ross
23 1.1 ross Written by John R. Hauser. This work was made possible in part by the
24 1.1 ross International Computer Science Institute, located at Suite 600, 1947 Center
25 1.1 ross Street, Berkeley, California 94704. Funding was partially provided by the
26 1.1 ross National Science Foundation under grant MIP-9311980. The original version
27 1.1 ross of this code was written as part of a project to build a fixed-point vector
28 1.1 ross processor in collaboration with the University of California at Berkeley,
29 1.1 ross overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 1.1 ross is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 1.1 ross arithmetic/SoftFloat.html'.
32 1.1 ross
33 1.1 ross THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 1.1 ross has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 1.1 ross TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 1.1 ross PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 1.1 ross AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 1.1 ross
39 1.1 ross Derivative works are acceptable, even for commercial purposes, so long as
40 1.1 ross (1) they include prominent notice that the work is derivative, and (2) they
41 1.1 ross include prominent notice akin to these four paragraphs for those parts of
42 1.1 ross this code that are retained.
43 1.1 ross
44 1.1 ross ===============================================================================
45 1.1 ross */
46 1.1 ross
47 1.1 ross #include <sys/cdefs.h>
48 1.1 ross #if defined(LIBC_SCCS) && !defined(lint)
49 1.1 ross __RCSID("$NetBSD: softfloat.c,v 1.1 2001/04/26 03:10:47 ross Exp $");
50 1.1 ross #endif /* LIBC_SCCS and not lint */
51 1.1 ross
52 1.1 ross #ifdef SOFTFLOAT_FOR_GCC
53 1.1 ross #include "softfloat-for-gcc.h"
54 1.1 ross #endif
55 1.1 ross
56 1.1 ross #include "milieu.h"
57 1.1 ross #include "softfloat.h"
58 1.1 ross
/*
 * Conversions between floats as stored in memory and floats as
 * SoftFloat uses them.  Either macro may be defined before this point;
 * when one is not, it defaults here to the identity.
 */
#if !defined(FLOAT64_DEMANGLE)
#define	FLOAT64_DEMANGLE(a)	(a)
#endif
#if !defined(FLOAT64_MANGLE)
#define	FLOAT64_MANGLE(a)	(a)
#endif
69 1.1 ross
70 1.1 ross /*
71 1.1 ross -------------------------------------------------------------------------------
72 1.1 ross Floating-point rounding mode, extended double-precision rounding precision,
73 1.1 ross and exception flags.
74 1.1 ross -------------------------------------------------------------------------------
75 1.1 ross */
76 1.1 ross
/*
 * XXX: This may cause options-MULTIPROCESSOR or thread problems someday.
 * Right now, it does not.  I've removed all other dynamic global
 * variables. [ross]
 *
 * Only defined when FLOATX80 is enabled; initialised to 80.
 */
#if defined(FLOATX80)
int8 floatx80_rounding_precision = 80;
#endif
85 1.1 ross
86 1.1 ross /*
87 1.1 ross -------------------------------------------------------------------------------
88 1.1 ross Primitive arithmetic functions, including multi-word arithmetic, and
89 1.1 ross division and square root approximations. (Can be specialized to target if
90 1.1 ross desired.)
91 1.1 ross -------------------------------------------------------------------------------
92 1.1 ross */
93 1.1 ross #include "softfloat-macros.h"
94 1.1 ross
95 1.1 ross /*
96 1.1 ross -------------------------------------------------------------------------------
97 1.1 ross Functions and definitions to determine: (1) whether tininess for underflow
98 1.1 ross is detected before or after rounding by default, (2) what (if anything)
99 1.1 ross happens when exceptions are raised, (3) how signaling NaNs are distinguished
100 1.1 ross from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
101 1.1 ross are propagated from function inputs to output. These details are target-
102 1.1 ross specific.
103 1.1 ross -------------------------------------------------------------------------------
104 1.1 ross */
105 1.1 ross #include "softfloat-specialize.h"
106 1.1 ross
107 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not used */
108 1.1 ross /*
109 1.1 ross -------------------------------------------------------------------------------
110 1.1 ross Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
111 1.1 ross and 7, and returns the properly rounded 32-bit integer corresponding to the
112 1.1 ross input. If `zSign' is 1, the input is negated before being converted to an
113 1.1 ross integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
114 1.1 ross is simply rounded to an integer, with the inexact exception raised if the
115 1.1 ross input cannot be represented exactly as an integer. However, if the fixed-
116 1.1 ross point input is too large, the invalid exception is raised and the largest
117 1.1 ross positive or negative integer is returned.
118 1.1 ross -------------------------------------------------------------------------------
119 1.1 ross */
120 1.1 ross static int32 roundAndPackInt32( flag zSign, bits64 absZ )
121 1.1 ross {
122 1.1 ross int8 roundingMode;
123 1.1 ross flag roundNearestEven;
124 1.1 ross int8 roundIncrement, roundBits;
125 1.1 ross int32 z;
126 1.1 ross
127 1.1 ross roundingMode = float_rounding_mode();
128 1.1 ross roundNearestEven = ( roundingMode == float_round_nearest_even );
129 1.1 ross roundIncrement = 0x40;
130 1.1 ross if ( ! roundNearestEven ) {
131 1.1 ross if ( roundingMode == float_round_to_zero ) {
132 1.1 ross roundIncrement = 0;
133 1.1 ross }
134 1.1 ross else {
135 1.1 ross roundIncrement = 0x7F;
136 1.1 ross if ( zSign ) {
137 1.1 ross if ( roundingMode == float_round_up ) roundIncrement = 0;
138 1.1 ross }
139 1.1 ross else {
140 1.1 ross if ( roundingMode == float_round_down ) roundIncrement = 0;
141 1.1 ross }
142 1.1 ross }
143 1.1 ross }
144 1.1 ross roundBits = absZ & 0x7F;
145 1.1 ross absZ = ( absZ + roundIncrement )>>7;
146 1.1 ross absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
147 1.1 ross z = absZ;
148 1.1 ross if ( zSign ) z = - z;
149 1.1 ross if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
150 1.1 ross float_raise( float_flag_invalid );
151 1.1 ross return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
152 1.1 ross }
153 1.1 ross if ( roundBits ) float_set_inexact();
154 1.1 ross return z;
155 1.1 ross
156 1.1 ross }
157 1.1 ross
158 1.1 ross /*
159 1.1 ross -------------------------------------------------------------------------------
160 1.1 ross Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
161 1.1 ross `absZ1', with binary point between bits 63 and 64 (between the input words),
162 1.1 ross and returns the properly rounded 64-bit integer corresponding to the input.
163 1.1 ross If `zSign' is 1, the input is negated before being converted to an integer.
164 1.1 ross Ordinarily, the fixed-point input is simply rounded to an integer, with
165 1.1 ross the inexact exception raised if the input cannot be represented exactly as
166 1.1 ross an integer. However, if the fixed-point input is too large, the invalid
167 1.1 ross exception is raised and the largest positive or negative integer is
168 1.1 ross returned.
169 1.1 ross -------------------------------------------------------------------------------
170 1.1 ross */
171 1.1 ross static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
172 1.1 ross {
173 1.1 ross int8 roundingMode;
174 1.1 ross flag roundNearestEven, increment;
175 1.1 ross int64 z;
176 1.1 ross
177 1.1 ross roundingMode = float_rounding_mode();
178 1.1 ross roundNearestEven = ( roundingMode == float_round_nearest_even );
179 1.1 ross increment = ( (sbits64) absZ1 < 0 );
180 1.1 ross if ( ! roundNearestEven ) {
181 1.1 ross if ( roundingMode == float_round_to_zero ) {
182 1.1 ross increment = 0;
183 1.1 ross }
184 1.1 ross else {
185 1.1 ross if ( zSign ) {
186 1.1 ross increment = ( roundingMode == float_round_down ) && absZ1;
187 1.1 ross }
188 1.1 ross else {
189 1.1 ross increment = ( roundingMode == float_round_up ) && absZ1;
190 1.1 ross }
191 1.1 ross }
192 1.1 ross }
193 1.1 ross if ( increment ) {
194 1.1 ross ++absZ0;
195 1.1 ross if ( absZ0 == 0 ) goto overflow;
196 1.1 ross absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
197 1.1 ross }
198 1.1 ross z = absZ0;
199 1.1 ross if ( zSign ) z = - z;
200 1.1 ross if ( z && ( ( z < 0 ) ^ zSign ) ) {
201 1.1 ross overflow:
202 1.1 ross float_raise( float_flag_invalid );
203 1.1 ross return
204 1.1 ross zSign ? (sbits64) LIT64( 0x8000000000000000 )
205 1.1 ross : LIT64( 0x7FFFFFFFFFFFFFFF );
206 1.1 ross }
207 1.1 ross if ( absZ1 ) float_set_inexact();
208 1.1 ross return z;
209 1.1 ross
210 1.1 ross }
211 1.1 ross #endif
212 1.1 ross
213 1.1 ross /*
214 1.1 ross -------------------------------------------------------------------------------
215 1.1 ross Returns the fraction bits of the single-precision floating-point value `a'.
216 1.1 ross -------------------------------------------------------------------------------
217 1.1 ross */
218 1.1 ross INLINE bits32 extractFloat32Frac( float32 a )
219 1.1 ross {
220 1.1 ross
221 1.1 ross return a & 0x007FFFFF;
222 1.1 ross
223 1.1 ross }
224 1.1 ross
225 1.1 ross /*
226 1.1 ross -------------------------------------------------------------------------------
227 1.1 ross Returns the exponent bits of the single-precision floating-point value `a'.
228 1.1 ross -------------------------------------------------------------------------------
229 1.1 ross */
230 1.1 ross INLINE int16 extractFloat32Exp( float32 a )
231 1.1 ross {
232 1.1 ross
233 1.1 ross return ( a>>23 ) & 0xFF;
234 1.1 ross
235 1.1 ross }
236 1.1 ross
237 1.1 ross /*
238 1.1 ross -------------------------------------------------------------------------------
239 1.1 ross Returns the sign bit of the single-precision floating-point value `a'.
240 1.1 ross -------------------------------------------------------------------------------
241 1.1 ross */
242 1.1 ross INLINE flag extractFloat32Sign( float32 a )
243 1.1 ross {
244 1.1 ross
245 1.1 ross return a>>31;
246 1.1 ross
247 1.1 ross }
248 1.1 ross
249 1.1 ross /*
250 1.1 ross -------------------------------------------------------------------------------
251 1.1 ross Normalizes the subnormal single-precision floating-point value represented
252 1.1 ross by the denormalized significand `aSig'. The normalized exponent and
253 1.1 ross significand are stored at the locations pointed to by `zExpPtr' and
254 1.1 ross `zSigPtr', respectively.
255 1.1 ross -------------------------------------------------------------------------------
256 1.1 ross */
257 1.1 ross static void
258 1.1 ross normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
259 1.1 ross {
260 1.1 ross int8 shiftCount;
261 1.1 ross
262 1.1 ross shiftCount = countLeadingZeros32( aSig ) - 8;
263 1.1 ross *zSigPtr = aSig<<shiftCount;
264 1.1 ross *zExpPtr = 1 - shiftCount;
265 1.1 ross
266 1.1 ross }
267 1.1 ross
268 1.1 ross /*
269 1.1 ross -------------------------------------------------------------------------------
270 1.1 ross Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
271 1.1 ross single-precision floating-point value, returning the result. After being
272 1.1 ross shifted into the proper positions, the three fields are simply added
273 1.1 ross together to form the result. This means that any integer portion of `zSig'
274 1.1 ross will be added into the exponent. Since a properly normalized significand
275 1.1 ross will have an integer portion equal to 1, the `zExp' input should be 1 less
276 1.1 ross than the desired result exponent whenever `zSig' is a complete, normalized
277 1.1 ross significand.
278 1.1 ross -------------------------------------------------------------------------------
279 1.1 ross */
280 1.1 ross INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
281 1.1 ross {
282 1.1 ross
283 1.1 ross return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
284 1.1 ross
285 1.1 ross }
286 1.1 ross
287 1.1 ross /*
288 1.1 ross -------------------------------------------------------------------------------
289 1.1 ross Takes an abstract floating-point value having sign `zSign', exponent `zExp',
290 1.1 ross and significand `zSig', and returns the proper single-precision floating-
291 1.1 ross point value corresponding to the abstract input. Ordinarily, the abstract
292 1.1 ross value is simply rounded and packed into the single-precision format, with
293 1.1 ross the inexact exception raised if the abstract input cannot be represented
294 1.1 ross exactly. However, if the abstract value is too large, the overflow and
295 1.1 ross inexact exceptions are raised and an infinity or maximal finite value is
296 1.1 ross returned. If the abstract value is too small, the input value is rounded to
297 1.1 ross a subnormal number, and the underflow and inexact exceptions are raised if
298 1.1 ross the abstract input cannot be represented exactly as a subnormal single-
299 1.1 ross precision floating-point number.
300 1.1 ross The input significand `zSig' has its binary point between bits 30
301 1.1 ross and 29, which is 7 bits to the left of the usual location. This shifted
302 1.1 ross significand must be normalized or smaller. If `zSig' is not normalized,
303 1.1 ross `zExp' must be 0; in that case, the result returned is a subnormal number,
304 1.1 ross and it must not require rounding. In the usual case that `zSig' is
305 1.1 ross normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
306 1.1 ross The handling of underflow and overflow follows the IEC/IEEE Standard for
307 1.1 ross Binary Floating-Point Arithmetic.
308 1.1 ross -------------------------------------------------------------------------------
309 1.1 ross */
310 1.1 ross static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
311 1.1 ross {
312 1.1 ross int8 roundingMode;
313 1.1 ross flag roundNearestEven;
314 1.1 ross int8 roundIncrement, roundBits;
315 1.1 ross flag isTiny;
316 1.1 ross
317 1.1 ross roundingMode = float_rounding_mode();
318 1.1 ross roundNearestEven = ( roundingMode == float_round_nearest_even );
319 1.1 ross roundIncrement = 0x40;
320 1.1 ross if ( ! roundNearestEven ) {
321 1.1 ross if ( roundingMode == float_round_to_zero ) {
322 1.1 ross roundIncrement = 0;
323 1.1 ross }
324 1.1 ross else {
325 1.1 ross roundIncrement = 0x7F;
326 1.1 ross if ( zSign ) {
327 1.1 ross if ( roundingMode == float_round_up ) roundIncrement = 0;
328 1.1 ross }
329 1.1 ross else {
330 1.1 ross if ( roundingMode == float_round_down ) roundIncrement = 0;
331 1.1 ross }
332 1.1 ross }
333 1.1 ross }
334 1.1 ross roundBits = zSig & 0x7F;
335 1.1 ross if ( 0xFD <= (bits16) zExp ) {
336 1.1 ross if ( ( 0xFD < zExp )
337 1.1 ross || ( ( zExp == 0xFD )
338 1.1 ross && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
339 1.1 ross ) {
340 1.1 ross float_raise( float_flag_overflow | float_flag_inexact );
341 1.1 ross return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
342 1.1 ross }
343 1.1 ross if ( zExp < 0 ) {
344 1.1 ross isTiny =
345 1.1 ross ( float_detect_tininess == float_tininess_before_rounding )
346 1.1 ross || ( zExp < -1 )
347 1.1 ross || ( zSig + roundIncrement < 0x80000000 );
348 1.1 ross shift32RightJamming( zSig, - zExp, &zSig );
349 1.1 ross zExp = 0;
350 1.1 ross roundBits = zSig & 0x7F;
351 1.1 ross if ( isTiny && roundBits ) float_raise( float_flag_underflow );
352 1.1 ross }
353 1.1 ross }
354 1.1 ross if ( roundBits ) float_set_inexact();
355 1.1 ross zSig = ( zSig + roundIncrement )>>7;
356 1.1 ross zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
357 1.1 ross if ( zSig == 0 ) zExp = 0;
358 1.1 ross return packFloat32( zSign, zExp, zSig );
359 1.1 ross
360 1.1 ross }
361 1.1 ross
362 1.1 ross /*
363 1.1 ross -------------------------------------------------------------------------------
364 1.1 ross Takes an abstract floating-point value having sign `zSign', exponent `zExp',
365 1.1 ross and significand `zSig', and returns the proper single-precision floating-
366 1.1 ross point value corresponding to the abstract input. This routine is just like
367 1.1 ross `roundAndPackFloat32' except that `zSig' does not have to be normalized.
368 1.1 ross Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
369 1.1 ross floating-point exponent.
370 1.1 ross -------------------------------------------------------------------------------
371 1.1 ross */
372 1.1 ross static float32
373 1.1 ross normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
374 1.1 ross {
375 1.1 ross int8 shiftCount;
376 1.1 ross
377 1.1 ross shiftCount = countLeadingZeros32( zSig ) - 1;
378 1.1 ross return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
379 1.1 ross
380 1.1 ross }
381 1.1 ross
382 1.1 ross /*
383 1.1 ross -------------------------------------------------------------------------------
384 1.1 ross Returns the fraction bits of the double-precision floating-point value `a'.
385 1.1 ross -------------------------------------------------------------------------------
386 1.1 ross */
387 1.1 ross INLINE bits64 extractFloat64Frac( float64 a )
388 1.1 ross {
389 1.1 ross
390 1.1 ross return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
391 1.1 ross
392 1.1 ross }
393 1.1 ross
394 1.1 ross /*
395 1.1 ross -------------------------------------------------------------------------------
396 1.1 ross Returns the exponent bits of the double-precision floating-point value `a'.
397 1.1 ross -------------------------------------------------------------------------------
398 1.1 ross */
399 1.1 ross INLINE int16 extractFloat64Exp( float64 a )
400 1.1 ross {
401 1.1 ross
402 1.1 ross return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
403 1.1 ross
404 1.1 ross }
405 1.1 ross
406 1.1 ross /*
407 1.1 ross -------------------------------------------------------------------------------
408 1.1 ross Returns the sign bit of the double-precision floating-point value `a'.
409 1.1 ross -------------------------------------------------------------------------------
410 1.1 ross */
411 1.1 ross INLINE flag extractFloat64Sign( float64 a )
412 1.1 ross {
413 1.1 ross
414 1.1 ross return FLOAT64_DEMANGLE(a)>>63;
415 1.1 ross
416 1.1 ross }
417 1.1 ross
418 1.1 ross /*
419 1.1 ross -------------------------------------------------------------------------------
420 1.1 ross Normalizes the subnormal double-precision floating-point value represented
421 1.1 ross by the denormalized significand `aSig'. The normalized exponent and
422 1.1 ross significand are stored at the locations pointed to by `zExpPtr' and
423 1.1 ross `zSigPtr', respectively.
424 1.1 ross -------------------------------------------------------------------------------
425 1.1 ross */
426 1.1 ross static void
427 1.1 ross normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
428 1.1 ross {
429 1.1 ross int8 shiftCount;
430 1.1 ross
431 1.1 ross shiftCount = countLeadingZeros64( aSig ) - 11;
432 1.1 ross *zSigPtr = aSig<<shiftCount;
433 1.1 ross *zExpPtr = 1 - shiftCount;
434 1.1 ross
435 1.1 ross }
436 1.1 ross
437 1.1 ross /*
438 1.1 ross -------------------------------------------------------------------------------
439 1.1 ross Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
440 1.1 ross double-precision floating-point value, returning the result. After being
441 1.1 ross shifted into the proper positions, the three fields are simply added
442 1.1 ross together to form the result. This means that any integer portion of `zSig'
443 1.1 ross will be added into the exponent. Since a properly normalized significand
444 1.1 ross will have an integer portion equal to 1, the `zExp' input should be 1 less
445 1.1 ross than the desired result exponent whenever `zSig' is a complete, normalized
446 1.1 ross significand.
447 1.1 ross -------------------------------------------------------------------------------
448 1.1 ross */
449 1.1 ross INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
450 1.1 ross {
451 1.1 ross
452 1.1 ross return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
453 1.1 ross ( ( (bits64) zExp )<<52 ) + zSig );
454 1.1 ross
455 1.1 ross }
456 1.1 ross
457 1.1 ross /*
458 1.1 ross -------------------------------------------------------------------------------
459 1.1 ross Takes an abstract floating-point value having sign `zSign', exponent `zExp',
460 1.1 ross and significand `zSig', and returns the proper double-precision floating-
461 1.1 ross point value corresponding to the abstract input. Ordinarily, the abstract
462 1.1 ross value is simply rounded and packed into the double-precision format, with
463 1.1 ross the inexact exception raised if the abstract input cannot be represented
464 1.1 ross exactly. However, if the abstract value is too large, the overflow and
465 1.1 ross inexact exceptions are raised and an infinity or maximal finite value is
466 1.1 ross returned. If the abstract value is too small, the input value is rounded to
467 1.1 ross a subnormal number, and the underflow and inexact exceptions are raised if
468 1.1 ross the abstract input cannot be represented exactly as a subnormal double-
469 1.1 ross precision floating-point number.
470 1.1 ross The input significand `zSig' has its binary point between bits 62
471 1.1 ross and 61, which is 10 bits to the left of the usual location. This shifted
472 1.1 ross significand must be normalized or smaller. If `zSig' is not normalized,
473 1.1 ross `zExp' must be 0; in that case, the result returned is a subnormal number,
474 1.1 ross and it must not require rounding. In the usual case that `zSig' is
475 1.1 ross normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
476 1.1 ross The handling of underflow and overflow follows the IEC/IEEE Standard for
477 1.1 ross Binary Floating-Point Arithmetic.
478 1.1 ross -------------------------------------------------------------------------------
479 1.1 ross */
480 1.1 ross static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
481 1.1 ross {
482 1.1 ross int8 roundingMode;
483 1.1 ross flag roundNearestEven;
484 1.1 ross int16 roundIncrement, roundBits;
485 1.1 ross flag isTiny;
486 1.1 ross
487 1.1 ross roundingMode = float_rounding_mode();
488 1.1 ross roundNearestEven = ( roundingMode == float_round_nearest_even );
489 1.1 ross roundIncrement = 0x200;
490 1.1 ross if ( ! roundNearestEven ) {
491 1.1 ross if ( roundingMode == float_round_to_zero ) {
492 1.1 ross roundIncrement = 0;
493 1.1 ross }
494 1.1 ross else {
495 1.1 ross roundIncrement = 0x3FF;
496 1.1 ross if ( zSign ) {
497 1.1 ross if ( roundingMode == float_round_up ) roundIncrement = 0;
498 1.1 ross }
499 1.1 ross else {
500 1.1 ross if ( roundingMode == float_round_down ) roundIncrement = 0;
501 1.1 ross }
502 1.1 ross }
503 1.1 ross }
504 1.1 ross roundBits = zSig & 0x3FF;
505 1.1 ross if ( 0x7FD <= (bits16) zExp ) {
506 1.1 ross if ( ( 0x7FD < zExp )
507 1.1 ross || ( ( zExp == 0x7FD )
508 1.1 ross && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
509 1.1 ross ) {
510 1.1 ross float_raise( float_flag_overflow | float_flag_inexact );
511 1.1 ross return FLOAT64_MANGLE(
512 1.1 ross FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
513 1.1 ross ( roundIncrement == 0 ));
514 1.1 ross }
515 1.1 ross if ( zExp < 0 ) {
516 1.1 ross isTiny =
517 1.1 ross ( float_detect_tininess == float_tininess_before_rounding )
518 1.1 ross || ( zExp < -1 )
519 1.1 ross || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
520 1.1 ross shift64RightJamming( zSig, - zExp, &zSig );
521 1.1 ross zExp = 0;
522 1.1 ross roundBits = zSig & 0x3FF;
523 1.1 ross if ( isTiny && roundBits ) float_raise( float_flag_underflow );
524 1.1 ross }
525 1.1 ross }
526 1.1 ross if ( roundBits ) float_set_inexact();
527 1.1 ross zSig = ( zSig + roundIncrement )>>10;
528 1.1 ross zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
529 1.1 ross if ( zSig == 0 ) zExp = 0;
530 1.1 ross return packFloat64( zSign, zExp, zSig );
531 1.1 ross
532 1.1 ross }
533 1.1 ross
534 1.1 ross /*
535 1.1 ross -------------------------------------------------------------------------------
536 1.1 ross Takes an abstract floating-point value having sign `zSign', exponent `zExp',
537 1.1 ross and significand `zSig', and returns the proper double-precision floating-
538 1.1 ross point value corresponding to the abstract input. This routine is just like
539 1.1 ross `roundAndPackFloat64' except that `zSig' does not have to be normalized.
540 1.1 ross Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
541 1.1 ross floating-point exponent.
542 1.1 ross -------------------------------------------------------------------------------
543 1.1 ross */
544 1.1 ross static float64
545 1.1 ross normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
546 1.1 ross {
547 1.1 ross int8 shiftCount;
548 1.1 ross
549 1.1 ross shiftCount = countLeadingZeros64( zSig ) - 1;
550 1.1 ross return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
551 1.1 ross
552 1.1 ross }
553 1.1 ross
554 1.1 ross #ifdef FLOATX80
555 1.1 ross
556 1.1 ross /*
557 1.1 ross -------------------------------------------------------------------------------
558 1.1 ross Returns the fraction bits of the extended double-precision floating-point
559 1.1 ross value `a'.
560 1.1 ross -------------------------------------------------------------------------------
561 1.1 ross */
562 1.1 ross INLINE bits64 extractFloatx80Frac( floatx80 a )
563 1.1 ross {
564 1.1 ross
565 1.1 ross return a.low;
566 1.1 ross
567 1.1 ross }
568 1.1 ross
569 1.1 ross /*
570 1.1 ross -------------------------------------------------------------------------------
571 1.1 ross Returns the exponent bits of the extended double-precision floating-point
572 1.1 ross value `a'.
573 1.1 ross -------------------------------------------------------------------------------
574 1.1 ross */
575 1.1 ross INLINE int32 extractFloatx80Exp( floatx80 a )
576 1.1 ross {
577 1.1 ross
578 1.1 ross return a.high & 0x7FFF;
579 1.1 ross
580 1.1 ross }
581 1.1 ross
582 1.1 ross /*
583 1.1 ross -------------------------------------------------------------------------------
584 1.1 ross Returns the sign bit of the extended double-precision floating-point value
585 1.1 ross `a'.
586 1.1 ross -------------------------------------------------------------------------------
587 1.1 ross */
588 1.1 ross INLINE flag extractFloatx80Sign( floatx80 a )
589 1.1 ross {
590 1.1 ross
591 1.1 ross return a.high>>15;
592 1.1 ross
593 1.1 ross }
594 1.1 ross
595 1.1 ross /*
596 1.1 ross -------------------------------------------------------------------------------
597 1.1 ross Normalizes the subnormal extended double-precision floating-point value
598 1.1 ross represented by the denormalized significand `aSig'. The normalized exponent
599 1.1 ross and significand are stored at the locations pointed to by `zExpPtr' and
600 1.1 ross `zSigPtr', respectively.
601 1.1 ross -------------------------------------------------------------------------------
602 1.1 ross */
603 1.1 ross static void
604 1.1 ross normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
605 1.1 ross {
606 1.1 ross int8 shiftCount;
607 1.1 ross
608 1.1 ross shiftCount = countLeadingZeros64( aSig );
609 1.1 ross *zSigPtr = aSig<<shiftCount;
610 1.1 ross *zExpPtr = 1 - shiftCount;
611 1.1 ross
612 1.1 ross }
613 1.1 ross
614 1.1 ross /*
615 1.1 ross -------------------------------------------------------------------------------
616 1.1 ross Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
617 1.1 ross extended double-precision floating-point value, returning the result.
618 1.1 ross -------------------------------------------------------------------------------
619 1.1 ross */
620 1.1 ross INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
621 1.1 ross {
622 1.1 ross floatx80 z;
623 1.1 ross
624 1.1 ross z.low = zSig;
625 1.1 ross z.high = ( ( (bits16) zSign )<<15 ) + zExp;
626 1.1 ross return z;
627 1.1 ross
628 1.1 ross }
629 1.1 ross
630 1.1 ross /*
631 1.1 ross -------------------------------------------------------------------------------
632 1.1 ross Takes an abstract floating-point value having sign `zSign', exponent `zExp',
633 1.1 ross and extended significand formed by the concatenation of `zSig0' and `zSig1',
634 1.1 ross and returns the proper extended double-precision floating-point value
635 1.1 ross corresponding to the abstract input. Ordinarily, the abstract value is
636 1.1 ross rounded and packed into the extended double-precision format, with the
637 1.1 ross inexact exception raised if the abstract input cannot be represented
638 1.1 ross exactly. However, if the abstract value is too large, the overflow and
639 1.1 ross inexact exceptions are raised and an infinity or maximal finite value is
640 1.1 ross returned. If the abstract value is too small, the input value is rounded to
641 1.1 ross a subnormal number, and the underflow and inexact exceptions are raised if
642 1.1 ross the abstract input cannot be represented exactly as a subnormal extended
643 1.1 ross double-precision floating-point number.
644 1.1 ross If `roundingPrecision' is 32 or 64, the result is rounded to the same
645 1.1 ross number of bits as single or double precision, respectively. Otherwise, the
646 1.1 ross result is rounded to the full precision of the extended double-precision
647 1.1 ross format.
648 1.1 ross The input significand must be normalized or smaller. If the input
649 1.1 ross significand is not normalized, `zExp' must be 0; in that case, the result
650 1.1 ross returned is a subnormal number, and it must not require rounding. The
651 1.1 ross handling of underflow and overflow follows the IEC/IEEE Standard for Binary
652 1.1 ross Floating-Point Arithmetic.
653 1.1 ross -------------------------------------------------------------------------------
654 1.1 ross */
/*
 * Rounds the significand zSig0:zSig1 to `roundingPrecision' bits of
 * precision (32 or 64 select a reduced precision; any other value selects
 * the full 64-bit significand) and packs it with `zSign' and `zExp' through
 * packFloatx80.  Overflow, underflow and inexact conditions are reported
 * through float_raise() and float_set_inexact().
 */
655 1.1 ross static floatx80
656 1.1 ross roundAndPackFloatx80(
657 1.1 ross int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
658 1.1 ross )
659 1.1 ross {
660 1.1 ross int8 roundingMode;
661 1.1 ross flag roundNearestEven, increment, isTiny;
662 1.1 ross int64 roundIncrement, roundMask, roundBits;
663 1.1 ross
664 1.1 ross roundingMode = float_rounding_mode();
665 1.1 ross roundNearestEven = ( roundingMode == float_round_nearest_even );
666 1.1 ross if ( roundingPrecision == 80 ) goto precision80;
/* Reduced precision: roundMask covers the low zSig0 bits that will be
   discarded (11 bits for 64, 40 bits for 32) and roundIncrement is the
   topmost discarded bit, i.e. half a unit in the last kept place. */
667 1.1 ross if ( roundingPrecision == 64 ) {
668 1.1 ross roundIncrement = LIT64( 0x0000000000000400 );
669 1.1 ross roundMask = LIT64( 0x00000000000007FF );
670 1.1 ross }
671 1.1 ross else if ( roundingPrecision == 32 ) {
672 1.1 ross roundIncrement = LIT64( 0x0000008000000000 );
673 1.1 ross roundMask = LIT64( 0x000000FFFFFFFFFF );
674 1.1 ross }
675 1.1 ross else {
676 1.1 ross goto precision80;
677 1.1 ross }
/* Collapse any nonzero bit of zSig1 into bit 0 of zSig0 so that it still
   counts as a discarded (sticky) bit below. */
678 1.1 ross zSig0 |= ( zSig1 != 0 );
/* Directed rounding modes: add nothing when the mode truncates for this
   sign, otherwise add the whole mask so that any discarded bit carries
   into the kept part. */
679 1.1 ross if ( ! roundNearestEven ) {
680 1.1 ross if ( roundingMode == float_round_to_zero ) {
681 1.1 ross roundIncrement = 0;
682 1.1 ross }
683 1.1 ross else {
684 1.1 ross roundIncrement = roundMask;
685 1.1 ross if ( zSign ) {
686 1.1 ross if ( roundingMode == float_round_up ) roundIncrement = 0;
687 1.1 ross }
688 1.1 ross else {
689 1.1 ross if ( roundingMode == float_round_down ) roundIncrement = 0;
690 1.1 ross }
691 1.1 ross }
692 1.1 ross }
693 1.1 ross roundBits = zSig0 & roundMask;
/* Assuming bits32 is an unsigned 32-bit type, this single comparison is
   true exactly when zExp <= 0 or zExp >= 0x7FFE, i.e. for both the
   underflow and the overflow candidates. */
694 1.1 ross if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
/* At the largest exponent, a wrapped sum means rounding would carry out
   of the 64-bit significand. */
695 1.1 ross if ( ( 0x7FFE < zExp )
696 1.1 ross || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
697 1.1 ross ) {
698 1.1 ross goto overflow;
699 1.1 ross }
700 1.1 ross if ( zExp <= 0 ) {
701 1.1 ross isTiny =
702 1.1 ross ( float_detect_tininess == float_tininess_before_rounding )
703 1.1 ross || ( zExp < 0 )
704 1.1 ross || ( zSig0 <= zSig0 + roundIncrement );
/* Denormalize: shift right by 1 - zExp (jamming lost bits into bit 0),
   then round with an exponent field of 0. */
705 1.1 ross shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
706 1.1 ross zExp = 0;
707 1.1 ross roundBits = zSig0 & roundMask;
708 1.1 ross if ( isTiny && roundBits ) float_raise( float_flag_underflow );
709 1.1 ross if ( roundBits ) float_set_inexact();
710 1.1 ross zSig0 += roundIncrement;
/* Rounding carried into bit 63, so the exponent field becomes 1. */
711 1.1 ross if ( (sbits64) zSig0 < 0 ) zExp = 1;
/* On an exact tie in nearest-even mode, widening the mask by one bit
   clears the lowest kept bit, which makes the result even. */
712 1.1 ross roundIncrement = roundMask + 1;
713 1.1 ross if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
714 1.1 ross roundMask |= roundIncrement;
715 1.1 ross }
716 1.1 ross zSig0 &= ~ roundMask;
717 1.1 ross return packFloatx80( zSign, zExp, zSig0 );
718 1.1 ross }
719 1.1 ross }
720 1.1 ross if ( roundBits ) float_set_inexact();
721 1.1 ross zSig0 += roundIncrement;
/* A wrapped sum means the significand carried out of 64 bits: restart it
   at 0x8000000000000000 and bump the exponent. */
722 1.1 ross if ( zSig0 < roundIncrement ) {
723 1.1 ross ++zExp;
724 1.1 ross zSig0 = LIT64( 0x8000000000000000 );
725 1.1 ross }
/* Same tie-to-even mask widening as in the subnormal case above. */
726 1.1 ross roundIncrement = roundMask + 1;
727 1.1 ross if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
728 1.1 ross roundMask |= roundIncrement;
729 1.1 ross }
730 1.1 ross zSig0 &= ~ roundMask;
/* A zero significand is packed with a zero exponent field. */
731 1.1 ross if ( zSig0 == 0 ) zExp = 0;
732 1.1 ross return packFloatx80( zSign, zExp, zSig0 );
/* Full precision: zSig1 holds the discarded bits.  In nearest-even mode
   the increment is the top bit of zSig1; in the directed modes it is set
   only when the mode rounds away from zero for this sign and zSig1 is
   nonzero. */
733 1.1 ross precision80:
734 1.1 ross increment = ( (sbits64) zSig1 < 0 );
735 1.1 ross if ( ! roundNearestEven ) {
736 1.1 ross if ( roundingMode == float_round_to_zero ) {
737 1.1 ross increment = 0;
738 1.1 ross }
739 1.1 ross else {
740 1.1 ross if ( zSign ) {
741 1.1 ross increment = ( roundingMode == float_round_down ) && zSig1;
742 1.1 ross }
743 1.1 ross else {
744 1.1 ross increment = ( roundingMode == float_round_up ) && zSig1;
745 1.1 ross }
746 1.1 ross }
747 1.1 ross }
748 1.1 ross if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
749 1.1 ross if ( ( 0x7FFE < zExp )
750 1.1 ross || ( ( zExp == 0x7FFE )
751 1.1 ross && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
752 1.1 ross && increment
753 1.1 ross )
754 1.1 ross ) {
/* With roundMask cleared, ~roundMask below is an all-ones significand.
   The reduced-precision path jumps straight to `overflow' and keeps its
   own mask. */
755 1.1 ross roundMask = 0;
756 1.1 ross overflow:
757 1.1 ross float_raise( float_flag_overflow | float_flag_inexact );
/* Modes that round toward zero for this sign return a finite value with
   exponent field 0x7FFE; the others return exponent field 0x7FFF with
   significand 0x8000000000000000. */
758 1.1 ross if ( ( roundingMode == float_round_to_zero )
759 1.1 ross || ( zSign && ( roundingMode == float_round_up ) )
760 1.1 ross || ( ! zSign && ( roundingMode == float_round_down ) )
761 1.1 ross ) {
762 1.1 ross return packFloatx80( zSign, 0x7FFE, ~ roundMask );
763 1.1 ross }
764 1.1 ross return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
765 1.1 ross }
766 1.1 ross if ( zExp <= 0 ) {
767 1.1 ross isTiny =
768 1.1 ross ( float_detect_tininess == float_tininess_before_rounding )
769 1.1 ross || ( zExp < 0 )
770 1.1 ross || ! increment
771 1.1 ross || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
/* Denormalize the 128-bit significand, then recompute the increment from
   the shifted zSig1. */
772 1.1 ross shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
773 1.1 ross zExp = 0;
774 1.1 ross if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
775 1.1 ross if ( zSig1 ) float_set_inexact();
776 1.1 ross if ( roundNearestEven ) {
777 1.1 ross increment = ( (sbits64) zSig1 < 0 );
778 1.1 ross }
779 1.1 ross else {
780 1.1 ross if ( zSign ) {
781 1.1 ross increment = ( roundingMode == float_round_down ) && zSig1;
782 1.1 ross }
783 1.1 ross else {
784 1.1 ross increment = ( roundingMode == float_round_up ) && zSig1;
785 1.1 ross }
786 1.1 ross }
787 1.1 ross if ( increment ) {
788 1.1 ross ++zSig0;
/* Tie in nearest-even mode (zSig1 has only its top bit set): clear the
   low bit of zSig0. */
789 1.1 ross zSig0 &=
790 1.1 ross ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
791 1.1 ross if ( (sbits64) zSig0 < 0 ) zExp = 1;
792 1.1 ross }
793 1.1 ross return packFloatx80( zSign, zExp, zSig0 );
794 1.1 ross }
795 1.1 ross }
796 1.1 ross if ( zSig1 ) float_set_inexact();
797 1.1 ross if ( increment ) {
798 1.1 ross ++zSig0;
/* Wrapping to zero means the significand carried out of 64 bits. */
799 1.1 ross if ( zSig0 == 0 ) {
800 1.1 ross ++zExp;
801 1.1 ross zSig0 = LIT64( 0x8000000000000000 );
802 1.1 ross }
803 1.1 ross else {
804 1.1 ross zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
805 1.1 ross }
806 1.1 ross }
807 1.1 ross else {
808 1.1 ross if ( zSig0 == 0 ) zExp = 0;
809 1.1 ross }
810 1.1 ross return packFloatx80( zSign, zExp, zSig0 );
811 1.1 ross
812 1.1 ross }
813 1.1 ross
814 1.1 ross /*
815 1.1 ross -------------------------------------------------------------------------------
816 1.1 ross Takes an abstract floating-point value having sign `zSign', exponent
817 1.1 ross `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
818 1.1 ross and returns the proper extended double-precision floating-point value
819 1.1 ross corresponding to the abstract input. This routine is just like
820 1.1 ross `roundAndPackFloatx80' except that the input significand does not have to be
821 1.1 ross normalized.
822 1.1 ross -------------------------------------------------------------------------------
823 1.1 ross */
824 1.1 ross static floatx80
825 1.1 ross normalizeRoundAndPackFloatx80(
826 1.1 ross int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
827 1.1 ross )
828 1.1 ross {
829 1.1 ross int8 shiftCount;
830 1.1 ross
831 1.1 ross if ( zSig0 == 0 ) {
832 1.1 ross zSig0 = zSig1;
833 1.1 ross zSig1 = 0;
834 1.1 ross zExp -= 64;
835 1.1 ross }
836 1.1 ross shiftCount = countLeadingZeros64( zSig0 );
837 1.1 ross shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
838 1.1 ross zExp -= shiftCount;
839 1.1 ross return
840 1.1 ross roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
841 1.1 ross
842 1.1 ross }
843 1.1 ross
844 1.1 ross #endif
845 1.1 ross
846 1.1 ross #ifdef FLOAT128
847 1.1 ross
848 1.1 ross /*
849 1.1 ross -------------------------------------------------------------------------------
850 1.1 ross Returns the least-significant 64 fraction bits of the quadruple-precision
851 1.1 ross floating-point value `a'.
852 1.1 ross -------------------------------------------------------------------------------
853 1.1 ross */
854 1.1 ross INLINE bits64 extractFloat128Frac1( float128 a )
855 1.1 ross {
856 1.1 ross
857 1.1 ross return a.low;
858 1.1 ross
859 1.1 ross }
860 1.1 ross
861 1.1 ross /*
862 1.1 ross -------------------------------------------------------------------------------
863 1.1 ross Returns the most-significant 48 fraction bits of the quadruple-precision
864 1.1 ross floating-point value `a'.
865 1.1 ross -------------------------------------------------------------------------------
866 1.1 ross */
867 1.1 ross INLINE bits64 extractFloat128Frac0( float128 a )
868 1.1 ross {
869 1.1 ross
870 1.1 ross return a.high & LIT64( 0x0000FFFFFFFFFFFF );
871 1.1 ross
872 1.1 ross }
873 1.1 ross
874 1.1 ross /*
875 1.1 ross -------------------------------------------------------------------------------
876 1.1 ross Returns the exponent bits of the quadruple-precision floating-point value
877 1.1 ross `a'.
878 1.1 ross -------------------------------------------------------------------------------
879 1.1 ross */
880 1.1 ross INLINE int32 extractFloat128Exp( float128 a )
881 1.1 ross {
882 1.1 ross
883 1.1 ross return ( a.high>>48 ) & 0x7FFF;
884 1.1 ross
885 1.1 ross }
886 1.1 ross
887 1.1 ross /*
888 1.1 ross -------------------------------------------------------------------------------
889 1.1 ross Returns the sign bit of the quadruple-precision floating-point value `a'.
890 1.1 ross -------------------------------------------------------------------------------
891 1.1 ross */
892 1.1 ross INLINE flag extractFloat128Sign( float128 a )
893 1.1 ross {
894 1.1 ross
895 1.1 ross return a.high>>63;
896 1.1 ross
897 1.1 ross }
898 1.1 ross
899 1.1 ross /*
900 1.1 ross -------------------------------------------------------------------------------
901 1.1 ross Normalizes the subnormal quadruple-precision floating-point value
902 1.1 ross represented by the denormalized significand formed by the concatenation of
903 1.1 ross `aSig0' and `aSig1'. The normalized exponent is stored at the location
904 1.1 ross pointed to by `zExpPtr'. The most significant 49 bits of the normalized
905 1.1 ross significand are stored at the location pointed to by `zSig0Ptr', and the
906 1.1 ross least significant 64 bits of the normalized significand are stored at the
907 1.1 ross location pointed to by `zSig1Ptr'.
908 1.1 ross -------------------------------------------------------------------------------
909 1.1 ross */
910 1.1 ross static void
911 1.1 ross normalizeFloat128Subnormal(
912 1.1 ross bits64 aSig0,
913 1.1 ross bits64 aSig1,
914 1.1 ross int32 *zExpPtr,
915 1.1 ross bits64 *zSig0Ptr,
916 1.1 ross bits64 *zSig1Ptr
917 1.1 ross )
918 1.1 ross {
919 1.1 ross int8 shiftCount;
920 1.1 ross
921 1.1 ross if ( aSig0 == 0 ) {
922 1.1 ross shiftCount = countLeadingZeros64( aSig1 ) - 15;
923 1.1 ross if ( shiftCount < 0 ) {
924 1.1 ross *zSig0Ptr = aSig1>>( - shiftCount );
925 1.1 ross *zSig1Ptr = aSig1<<( shiftCount & 63 );
926 1.1 ross }
927 1.1 ross else {
928 1.1 ross *zSig0Ptr = aSig1<<shiftCount;
929 1.1 ross *zSig1Ptr = 0;
930 1.1 ross }
931 1.1 ross *zExpPtr = - shiftCount - 63;
932 1.1 ross }
933 1.1 ross else {
934 1.1 ross shiftCount = countLeadingZeros64( aSig0 ) - 15;
935 1.1 ross shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
936 1.1 ross *zExpPtr = 1 - shiftCount;
937 1.1 ross }
938 1.1 ross
939 1.1 ross }
940 1.1 ross
941 1.1 ross /*
942 1.1 ross -------------------------------------------------------------------------------
943 1.1 ross Packs the sign `zSign', the exponent `zExp', and the significand formed
944 1.1 ross by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
945 1.1 ross floating-point value, returning the result. After being shifted into the
946 1.1 ross proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
947 1.1 ross added together to form the most significant 32 bits of the result. This
948 1.1 ross means that any integer portion of `zSig0' will be added into the exponent.
949 1.1 ross Since a properly normalized significand will have an integer portion equal
950 1.1 ross to 1, the `zExp' input should be 1 less than the desired result exponent
951 1.1 ross whenever `zSig0' and `zSig1' concatenated form a complete, normalized
952 1.1 ross significand.
953 1.1 ross -------------------------------------------------------------------------------
954 1.1 ross */
955 1.1 ross INLINE float128
956 1.1 ross packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
957 1.1 ross {
958 1.1 ross float128 z;
959 1.1 ross
960 1.1 ross z.low = zSig1;
961 1.1 ross z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
962 1.1 ross return z;
963 1.1 ross
964 1.1 ross }
965 1.1 ross
966 1.1 ross /*
967 1.1 ross -------------------------------------------------------------------------------
968 1.1 ross Takes an abstract floating-point value having sign `zSign', exponent `zExp',
969 1.1 ross and extended significand formed by the concatenation of `zSig0', `zSig1',
970 1.1 ross and `zSig2', and returns the proper quadruple-precision floating-point value
971 1.1 ross corresponding to the abstract input. Ordinarily, the abstract value is
972 1.1 ross simply rounded and packed into the quadruple-precision format, with the
973 1.1 ross inexact exception raised if the abstract input cannot be represented
974 1.1 ross exactly. However, if the abstract value is too large, the overflow and
975 1.1 ross inexact exceptions are raised and an infinity or maximal finite value is
976 1.1 ross returned. If the abstract value is too small, the input value is rounded to
977 1.1 ross a subnormal number, and the underflow and inexact exceptions are raised if
978 1.1 ross the abstract input cannot be represented exactly as a subnormal quadruple-
979 1.1 ross precision floating-point number.
980 1.1 ross The input significand must be normalized or smaller. If the input
981 1.1 ross significand is not normalized, `zExp' must be 0; in that case, the result
982 1.1 ross returned is a subnormal number, and it must not require rounding. In the
983 1.1 ross usual case that the input significand is normalized, `zExp' must be 1 less
984 1.1 ross than the ``true'' floating-point exponent. The handling of underflow and
985 1.1 ross overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
986 1.1 ross -------------------------------------------------------------------------------
987 1.1 ross */
/*
 * Rounds the significand zSig0:zSig1, using zSig2 as the discarded extra
 * bits, according to the current rounding mode and packs the result with
 * `zSign' and `zExp' through packFloat128.  Overflow, underflow and inexact
 * conditions are reported through float_raise() and float_set_inexact().
 */
988 1.1 ross static float128
989 1.1 ross roundAndPackFloat128(
990 1.1 ross flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
991 1.1 ross {
992 1.1 ross int8 roundingMode;
993 1.1 ross flag roundNearestEven, increment, isTiny;
994 1.1 ross
995 1.1 ross roundingMode = float_rounding_mode();
996 1.1 ross roundNearestEven = ( roundingMode == float_round_nearest_even );
/* In nearest-even mode the increment is the top bit of zSig2; in the
   directed modes it is set only when the mode rounds away from zero for
   this sign and zSig2 is nonzero. */
997 1.1 ross increment = ( (sbits64) zSig2 < 0 );
998 1.1 ross if ( ! roundNearestEven ) {
999 1.1 ross if ( roundingMode == float_round_to_zero ) {
1000 1.1 ross increment = 0;
1001 1.1 ross }
1002 1.1 ross else {
1003 1.1 ross if ( zSign ) {
1004 1.1 ross increment = ( roundingMode == float_round_down ) && zSig2;
1005 1.1 ross }
1006 1.1 ross else {
1007 1.1 ross increment = ( roundingMode == float_round_up ) && zSig2;
1008 1.1 ross }
1009 1.1 ross }
1010 1.1 ross }
/* Assuming bits32 is an unsigned 32-bit type, this comparison is true
   both for zExp >= 0x7FFD and for negative zExp, so it screens out the
   overflow and underflow candidates together. */
1011 1.1 ross if ( 0x7FFD <= (bits32) zExp ) {
/* Overflow: the exponent is too large, or it is at the limit and rounding
   up an all-ones significand would carry out. */
1012 1.1 ross if ( ( 0x7FFD < zExp )
1013 1.1 ross || ( ( zExp == 0x7FFD )
1014 1.1 ross && eq128(
1015 1.1 ross LIT64( 0x0001FFFFFFFFFFFF ),
1016 1.1 ross LIT64( 0xFFFFFFFFFFFFFFFF ),
1017 1.1 ross zSig0,
1018 1.1 ross zSig1
1019 1.1 ross )
1020 1.1 ross && increment
1021 1.1 ross )
1022 1.1 ross ) {
1023 1.1 ross float_raise( float_flag_overflow | float_flag_inexact );
/* Modes that round toward zero for this sign return exponent field
   0x7FFE with an all-ones fraction; the others return exponent field
   0x7FFF with a zero fraction. */
1024 1.1 ross if ( ( roundingMode == float_round_to_zero )
1025 1.1 ross || ( zSign && ( roundingMode == float_round_up ) )
1026 1.1 ross || ( ! zSign && ( roundingMode == float_round_down ) )
1027 1.1 ross ) {
1028 1.1 ross return
1029 1.1 ross packFloat128(
1030 1.1 ross zSign,
1031 1.1 ross 0x7FFE,
1032 1.1 ross LIT64( 0x0000FFFFFFFFFFFF ),
1033 1.1 ross LIT64( 0xFFFFFFFFFFFFFFFF )
1034 1.1 ross );
1035 1.1 ross }
1036 1.1 ross return packFloat128( zSign, 0x7FFF, 0, 0 );
1037 1.1 ross }
1038 1.1 ross if ( zExp < 0 ) {
1039 1.1 ross isTiny =
1040 1.1 ross ( float_detect_tininess == float_tininess_before_rounding )
1041 1.1 ross || ( zExp < -1 )
1042 1.1 ross || ! increment
1043 1.1 ross || lt128(
1044 1.1 ross zSig0,
1045 1.1 ross zSig1,
1046 1.1 ross LIT64( 0x0001FFFFFFFFFFFF ),
1047 1.1 ross LIT64( 0xFFFFFFFFFFFFFFFF )
1048 1.1 ross );
/* Denormalize: shift the 192-bit significand right by -zExp (jamming lost
   bits into zSig2), then recompute the increment from the new zSig2. */
1049 1.1 ross shift128ExtraRightJamming(
1050 1.1 ross zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1051 1.1 ross zExp = 0;
1052 1.1 ross if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1053 1.1 ross if ( roundNearestEven ) {
1054 1.1 ross increment = ( (sbits64) zSig2 < 0 );
1055 1.1 ross }
1056 1.1 ross else {
1057 1.1 ross if ( zSign ) {
1058 1.1 ross increment = ( roundingMode == float_round_down ) && zSig2;
1059 1.1 ross }
1060 1.1 ross else {
1061 1.1 ross increment = ( roundingMode == float_round_up ) && zSig2;
1062 1.1 ross }
1063 1.1 ross }
1064 1.1 ross }
1065 1.1 ross }
/* Any nonzero discarded bit makes the result inexact. */
1066 1.1 ross if ( zSig2 ) float_set_inexact();
1067 1.1 ross if ( increment ) {
1068 1.1 ross add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
/* Tie in nearest-even mode (zSig2 has only its top bit set): clear the
   low bit of zSig1. */
1069 1.1 ross zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1070 1.1 ross }
1071 1.1 ross else {
/* A zero significand is packed with a zero exponent field. */
1072 1.1 ross if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1073 1.1 ross }
1074 1.1 ross return packFloat128( zSign, zExp, zSig0, zSig1 );
1075 1.1 ross
1076 1.1 ross }
1077 1.1 ross
1078 1.1 ross /*
1079 1.1 ross -------------------------------------------------------------------------------
1080 1.1 ross Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1081 1.1 ross and significand formed by the concatenation of `zSig0' and `zSig1', and
1082 1.1 ross returns the proper quadruple-precision floating-point value corresponding
1083 1.1 ross to the abstract input. This routine is just like `roundAndPackFloat128'
1084 1.1 ross except that the input significand has fewer bits and does not have to be
1085 1.1 ross normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1086 1.1 ross point exponent.
1087 1.1 ross -------------------------------------------------------------------------------
1088 1.1 ross */
1089 1.1 ross static float128
1090 1.1 ross normalizeRoundAndPackFloat128(
1091 1.1 ross flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1092 1.1 ross {
1093 1.1 ross int8 shiftCount;
1094 1.1 ross bits64 zSig2;
1095 1.1 ross
1096 1.1 ross if ( zSig0 == 0 ) {
1097 1.1 ross zSig0 = zSig1;
1098 1.1 ross zSig1 = 0;
1099 1.1 ross zExp -= 64;
1100 1.1 ross }
1101 1.1 ross shiftCount = countLeadingZeros64( zSig0 ) - 15;
1102 1.1 ross if ( 0 <= shiftCount ) {
1103 1.1 ross zSig2 = 0;
1104 1.1 ross shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1105 1.1 ross }
1106 1.1 ross else {
1107 1.1 ross shift128ExtraRightJamming(
1108 1.1 ross zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1109 1.1 ross }
1110 1.1 ross zExp -= shiftCount;
1111 1.1 ross return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1112 1.1 ross
1113 1.1 ross }
1114 1.1 ross
1115 1.1 ross #endif
1116 1.1 ross
1117 1.1 ross /*
1118 1.1 ross -------------------------------------------------------------------------------
1119 1.1 ross Returns the result of converting the 32-bit two's complement integer `a'
1120 1.1 ross to the single-precision floating-point format. The conversion is performed
1121 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1122 1.1 ross -------------------------------------------------------------------------------
1123 1.1 ross */
1124 1.1 ross float32 int32_to_float32( int32 a )
1125 1.1 ross {
1126 1.1 ross flag zSign;
1127 1.1 ross
1128 1.1 ross if ( a == 0 ) return 0;
1129 1.1 ross if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1130 1.1 ross zSign = ( a < 0 );
1131 1.1 ross return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1132 1.1 ross
1133 1.1 ross }
1134 1.1 ross
1135 1.1 ross /*
1136 1.1 ross -------------------------------------------------------------------------------
1137 1.1 ross Returns the result of converting the 32-bit two's complement integer `a'
1138 1.1 ross to the double-precision floating-point format. The conversion is performed
1139 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1140 1.1 ross -------------------------------------------------------------------------------
1141 1.1 ross */
1142 1.1 ross float64 int32_to_float64( int32 a )
1143 1.1 ross {
1144 1.1 ross flag zSign;
1145 1.1 ross uint32 absA;
1146 1.1 ross int8 shiftCount;
1147 1.1 ross bits64 zSig;
1148 1.1 ross
1149 1.1 ross if ( a == 0 ) return 0;
1150 1.1 ross zSign = ( a < 0 );
1151 1.1 ross absA = zSign ? - a : a;
1152 1.1 ross shiftCount = countLeadingZeros32( absA ) + 21;
1153 1.1 ross zSig = absA;
1154 1.1 ross return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1155 1.1 ross
1156 1.1 ross }
1157 1.1 ross
1158 1.1 ross #ifdef FLOATX80
1159 1.1 ross
1160 1.1 ross /*
1161 1.1 ross -------------------------------------------------------------------------------
1162 1.1 ross Returns the result of converting the 32-bit two's complement integer `a'
1163 1.1 ross to the extended double-precision floating-point format. The conversion
1164 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
1165 1.1 ross Arithmetic.
1166 1.1 ross -------------------------------------------------------------------------------
1167 1.1 ross */
1168 1.1 ross floatx80 int32_to_floatx80( int32 a )
1169 1.1 ross {
1170 1.1 ross flag zSign;
1171 1.1 ross uint32 absA;
1172 1.1 ross int8 shiftCount;
1173 1.1 ross bits64 zSig;
1174 1.1 ross
1175 1.1 ross if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1176 1.1 ross zSign = ( a < 0 );
1177 1.1 ross absA = zSign ? - a : a;
1178 1.1 ross shiftCount = countLeadingZeros32( absA ) + 32;
1179 1.1 ross zSig = absA;
1180 1.1 ross return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1181 1.1 ross
1182 1.1 ross }
1183 1.1 ross
1184 1.1 ross #endif
1185 1.1 ross
1186 1.1 ross #ifdef FLOAT128
1187 1.1 ross
1188 1.1 ross /*
1189 1.1 ross -------------------------------------------------------------------------------
1190 1.1 ross Returns the result of converting the 32-bit two's complement integer `a' to
1191 1.1 ross the quadruple-precision floating-point format. The conversion is performed
1192 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1193 1.1 ross -------------------------------------------------------------------------------
1194 1.1 ross */
1195 1.1 ross float128 int32_to_float128( int32 a )
1196 1.1 ross {
1197 1.1 ross flag zSign;
1198 1.1 ross uint32 absA;
1199 1.1 ross int8 shiftCount;
1200 1.1 ross bits64 zSig0;
1201 1.1 ross
1202 1.1 ross if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1203 1.1 ross zSign = ( a < 0 );
1204 1.1 ross absA = zSign ? - a : a;
1205 1.1 ross shiftCount = countLeadingZeros32( absA ) + 17;
1206 1.1 ross zSig0 = absA;
1207 1.1 ross return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1208 1.1 ross
1209 1.1 ross }
1210 1.1 ross
1211 1.1 ross #endif
1212 1.1 ross
1213 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1214 1.1 ross /*
1215 1.1 ross -------------------------------------------------------------------------------
1216 1.1 ross Returns the result of converting the 64-bit two's complement integer `a'
1217 1.1 ross to the single-precision floating-point format. The conversion is performed
1218 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1219 1.1 ross -------------------------------------------------------------------------------
1220 1.1 ross */
1221 1.1 ross float32 int64_to_float32( int64 a )
1222 1.1 ross {
1223 1.1 ross flag zSign;
1224 1.1 ross uint64 absA;
1225 1.1 ross int8 shiftCount;
1226 1.1 ross
1227 1.1 ross if ( a == 0 ) return 0;
1228 1.1 ross zSign = ( a < 0 );
1229 1.1 ross absA = zSign ? - a : a;
1230 1.1 ross shiftCount = countLeadingZeros64( absA ) - 40;
1231 1.1 ross if ( 0 <= shiftCount ) {
1232 1.1 ross return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1233 1.1 ross }
1234 1.1 ross else {
1235 1.1 ross shiftCount += 7;
1236 1.1 ross if ( shiftCount < 0 ) {
1237 1.1 ross shift64RightJamming( absA, - shiftCount, &absA );
1238 1.1 ross }
1239 1.1 ross else {
1240 1.1 ross absA <<= shiftCount;
1241 1.1 ross }
1242 1.1 ross return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1243 1.1 ross }
1244 1.1 ross
1245 1.1 ross }
1246 1.1 ross
1247 1.1 ross /*
1248 1.1 ross -------------------------------------------------------------------------------
1249 1.1 ross Returns the result of converting the 64-bit two's complement integer `a'
1250 1.1 ross to the double-precision floating-point format. The conversion is performed
1251 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1252 1.1 ross -------------------------------------------------------------------------------
1253 1.1 ross */
1254 1.1 ross float64 int64_to_float64( int64 a )
1255 1.1 ross {
1256 1.1 ross flag zSign;
1257 1.1 ross
1258 1.1 ross if ( a == 0 ) return 0;
1259 1.1 ross if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1260 1.1 ross return packFloat64( 1, 0x43E, 0 );
1261 1.1 ross }
1262 1.1 ross zSign = ( a < 0 );
1263 1.1 ross return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1264 1.1 ross
1265 1.1 ross }
1266 1.1 ross
1267 1.1 ross #ifdef FLOATX80
1268 1.1 ross
1269 1.1 ross /*
1270 1.1 ross -------------------------------------------------------------------------------
1271 1.1 ross Returns the result of converting the 64-bit two's complement integer `a'
1272 1.1 ross to the extended double-precision floating-point format. The conversion
1273 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
1274 1.1 ross Arithmetic.
1275 1.1 ross -------------------------------------------------------------------------------
1276 1.1 ross */
1277 1.1 ross floatx80 int64_to_floatx80( int64 a )
1278 1.1 ross {
1279 1.1 ross flag zSign;
1280 1.1 ross uint64 absA;
1281 1.1 ross int8 shiftCount;
1282 1.1 ross
1283 1.1 ross if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1284 1.1 ross zSign = ( a < 0 );
1285 1.1 ross absA = zSign ? - a : a;
1286 1.1 ross shiftCount = countLeadingZeros64( absA );
1287 1.1 ross return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1288 1.1 ross
1289 1.1 ross }
1290 1.1 ross
1291 1.1 ross #endif
1292 1.1 ross
1293 1.1 ross #ifdef FLOAT128
1294 1.1 ross
1295 1.1 ross /*
1296 1.1 ross -------------------------------------------------------------------------------
1297 1.1 ross Returns the result of converting the 64-bit two's complement integer `a' to
1298 1.1 ross the quadruple-precision floating-point format. The conversion is performed
1299 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1300 1.1 ross -------------------------------------------------------------------------------
1301 1.1 ross */
1302 1.1 ross float128 int64_to_float128( int64 a )
1303 1.1 ross {
1304 1.1 ross flag zSign;
1305 1.1 ross uint64 absA;
1306 1.1 ross int8 shiftCount;
1307 1.1 ross int32 zExp;
1308 1.1 ross bits64 zSig0, zSig1;
1309 1.1 ross
1310 1.1 ross if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1311 1.1 ross zSign = ( a < 0 );
1312 1.1 ross absA = zSign ? - a : a;
1313 1.1 ross shiftCount = countLeadingZeros64( absA ) + 49;
1314 1.1 ross zExp = 0x406E - shiftCount;
1315 1.1 ross if ( 64 <= shiftCount ) {
1316 1.1 ross zSig1 = 0;
1317 1.1 ross zSig0 = absA;
1318 1.1 ross shiftCount -= 64;
1319 1.1 ross }
1320 1.1 ross else {
1321 1.1 ross zSig1 = absA;
1322 1.1 ross zSig0 = 0;
1323 1.1 ross }
1324 1.1 ross shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1325 1.1 ross return packFloat128( zSign, zExp, zSig0, zSig1 );
1326 1.1 ross
1327 1.1 ross }
1328 1.1 ross
1329 1.1 ross #endif
1330 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
1331 1.1 ross
1332 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1333 1.1 ross /*
1334 1.1 ross -------------------------------------------------------------------------------
1335 1.1 ross Returns the result of converting the single-precision floating-point value
1336 1.1 ross `a' to the 32-bit two's complement integer format. The conversion is
1337 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
1338 1.1 ross Arithmetic---which means in particular that the conversion is rounded
1339 1.1 ross according to the current rounding mode. If `a' is a NaN, the largest
1340 1.1 ross positive integer is returned. Otherwise, if the conversion overflows, the
1341 1.1 ross largest integer with the same sign as `a' is returned.
1342 1.1 ross -------------------------------------------------------------------------------
1343 1.1 ross */
1344 1.1 ross int32 float32_to_int32( float32 a )
1345 1.1 ross {
1346 1.1 ross flag aSign;
1347 1.1 ross int16 aExp, shiftCount;
1348 1.1 ross bits32 aSig;
1349 1.1 ross bits64 aSig64;
1350 1.1 ross
1351 1.1 ross aSig = extractFloat32Frac( a );
1352 1.1 ross aExp = extractFloat32Exp( a );
1353 1.1 ross aSign = extractFloat32Sign( a );
1354 1.1 ross if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1355 1.1 ross if ( aExp ) aSig |= 0x00800000;
1356 1.1 ross shiftCount = 0xAF - aExp;
1357 1.1 ross aSig64 = aSig;
1358 1.1 ross aSig64 <<= 32;
1359 1.1 ross if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1360 1.1 ross return roundAndPackInt32( aSign, aSig64 );
1361 1.1 ross
1362 1.1 ross }
1363 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
1364 1.1 ross
1365 1.1 ross /*
1366 1.1 ross -------------------------------------------------------------------------------
1367 1.1 ross Returns the result of converting the single-precision floating-point value
1368 1.1 ross `a' to the 32-bit two's complement integer format. The conversion is
1369 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
1370 1.1 ross Arithmetic, except that the conversion is always rounded toward zero.
1371 1.1 ross If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1372 1.1 ross the conversion overflows, the largest integer with the same sign as `a' is
1373 1.1 ross returned.
1374 1.1 ross -------------------------------------------------------------------------------
1375 1.1 ross */
1376 1.1 ross int32 float32_to_int32_round_to_zero( float32 a )
1377 1.1 ross {
1378 1.1 ross flag aSign;
1379 1.1 ross int16 aExp, shiftCount;
1380 1.1 ross bits32 aSig;
1381 1.1 ross int32 z;
1382 1.1 ross
1383 1.1 ross aSig = extractFloat32Frac( a );
1384 1.1 ross aExp = extractFloat32Exp( a );
1385 1.1 ross aSign = extractFloat32Sign( a );
1386 1.1 ross shiftCount = aExp - 0x9E;
1387 1.1 ross if ( 0 <= shiftCount ) {
1388 1.1 ross if ( a != 0xCF000000 ) {
1389 1.1 ross float_raise( float_flag_invalid );
1390 1.1 ross if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1391 1.1 ross }
1392 1.1 ross return (sbits32) 0x80000000;
1393 1.1 ross }
1394 1.1 ross else if ( aExp <= 0x7E ) {
1395 1.1 ross if ( aExp | aSig ) float_set_inexact();
1396 1.1 ross return 0;
1397 1.1 ross }
1398 1.1 ross aSig = ( aSig | 0x00800000 )<<8;
1399 1.1 ross z = aSig>>( - shiftCount );
1400 1.1 ross if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1401 1.1 ross float_set_inexact();
1402 1.1 ross }
1403 1.1 ross if ( aSign ) z = - z;
1404 1.1 ross return z;
1405 1.1 ross
1406 1.1 ross }
1407 1.1 ross
1408 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1409 1.1 ross /*
1410 1.1 ross -------------------------------------------------------------------------------
1411 1.1 ross Returns the result of converting the single-precision floating-point value
1412 1.1 ross `a' to the 64-bit two's complement integer format. The conversion is
1413 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
1414 1.1 ross Arithmetic---which means in particular that the conversion is rounded
1415 1.1 ross according to the current rounding mode. If `a' is a NaN, the largest
1416 1.1 ross positive integer is returned. Otherwise, if the conversion overflows, the
1417 1.1 ross largest integer with the same sign as `a' is returned.
1418 1.1 ross -------------------------------------------------------------------------------
1419 1.1 ross */
1420 1.1 ross int64 float32_to_int64( float32 a )
1421 1.1 ross {
1422 1.1 ross flag aSign;
1423 1.1 ross int16 aExp, shiftCount;
1424 1.1 ross bits32 aSig;
1425 1.1 ross bits64 aSig64, aSigExtra;
1426 1.1 ross
1427 1.1 ross aSig = extractFloat32Frac( a );
1428 1.1 ross aExp = extractFloat32Exp( a );
1429 1.1 ross aSign = extractFloat32Sign( a );
1430 1.1 ross shiftCount = 0xBE - aExp;
1431 1.1 ross if ( shiftCount < 0 ) {
1432 1.1 ross float_raise( float_flag_invalid );
1433 1.1 ross if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1434 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
1435 1.1 ross }
1436 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
1437 1.1 ross }
1438 1.1 ross if ( aExp ) aSig |= 0x00800000;
1439 1.1 ross aSig64 = aSig;
1440 1.1 ross aSig64 <<= 40;
1441 1.1 ross shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1442 1.1 ross return roundAndPackInt64( aSign, aSig64, aSigExtra );
1443 1.1 ross
1444 1.1 ross }
1445 1.1 ross
1446 1.1 ross /*
1447 1.1 ross -------------------------------------------------------------------------------
1448 1.1 ross Returns the result of converting the single-precision floating-point value
1449 1.1 ross `a' to the 64-bit two's complement integer format. The conversion is
1450 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
1451 1.1 ross Arithmetic, except that the conversion is always rounded toward zero. If
1452 1.1 ross `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1453 1.1 ross conversion overflows, the largest integer with the same sign as `a' is
1454 1.1 ross returned.
1455 1.1 ross -------------------------------------------------------------------------------
1456 1.1 ross */
1457 1.1 ross int64 float32_to_int64_round_to_zero( float32 a )
1458 1.1 ross {
1459 1.1 ross flag aSign;
1460 1.1 ross int16 aExp, shiftCount;
1461 1.1 ross bits32 aSig;
1462 1.1 ross bits64 aSig64;
1463 1.1 ross int64 z;
1464 1.1 ross
1465 1.1 ross aSig = extractFloat32Frac( a );
1466 1.1 ross aExp = extractFloat32Exp( a );
1467 1.1 ross aSign = extractFloat32Sign( a );
1468 1.1 ross shiftCount = aExp - 0xBE;
1469 1.1 ross if ( 0 <= shiftCount ) {
1470 1.1 ross if ( a != 0xDF000000 ) {
1471 1.1 ross float_raise( float_flag_invalid );
1472 1.1 ross if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1473 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
1474 1.1 ross }
1475 1.1 ross }
1476 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
1477 1.1 ross }
1478 1.1 ross else if ( aExp <= 0x7E ) {
1479 1.1 ross if ( aExp | aSig ) float_set_inexact();
1480 1.1 ross return 0;
1481 1.1 ross }
1482 1.1 ross aSig64 = aSig | 0x00800000;
1483 1.1 ross aSig64 <<= 40;
1484 1.1 ross z = aSig64>>( - shiftCount );
1485 1.1 ross if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1486 1.1 ross float_set_inexact();
1487 1.1 ross }
1488 1.1 ross if ( aSign ) z = - z;
1489 1.1 ross return z;
1490 1.1 ross
1491 1.1 ross }
1492 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
1493 1.1 ross
1494 1.1 ross /*
1495 1.1 ross -------------------------------------------------------------------------------
1496 1.1 ross Returns the result of converting the single-precision floating-point value
1497 1.1 ross `a' to the double-precision floating-point format. The conversion is
1498 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
1499 1.1 ross Arithmetic.
1500 1.1 ross -------------------------------------------------------------------------------
1501 1.1 ross */
1502 1.1 ross float64 float32_to_float64( float32 a )
1503 1.1 ross {
1504 1.1 ross flag aSign;
1505 1.1 ross int16 aExp;
1506 1.1 ross bits32 aSig;
1507 1.1 ross
1508 1.1 ross aSig = extractFloat32Frac( a );
1509 1.1 ross aExp = extractFloat32Exp( a );
1510 1.1 ross aSign = extractFloat32Sign( a );
1511 1.1 ross if ( aExp == 0xFF ) {
1512 1.1 ross if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1513 1.1 ross return packFloat64( aSign, 0x7FF, 0 );
1514 1.1 ross }
1515 1.1 ross if ( aExp == 0 ) {
1516 1.1 ross if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1517 1.1 ross normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1518 1.1 ross --aExp;
1519 1.1 ross }
1520 1.1 ross return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1521 1.1 ross
1522 1.1 ross }
1523 1.1 ross
1524 1.1 ross #ifdef FLOATX80
1525 1.1 ross
1526 1.1 ross /*
1527 1.1 ross -------------------------------------------------------------------------------
1528 1.1 ross Returns the result of converting the single-precision floating-point value
1529 1.1 ross `a' to the extended double-precision floating-point format. The conversion
1530 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
1531 1.1 ross Arithmetic.
1532 1.1 ross -------------------------------------------------------------------------------
1533 1.1 ross */
1534 1.1 ross floatx80 float32_to_floatx80( float32 a )
1535 1.1 ross {
1536 1.1 ross flag aSign;
1537 1.1 ross int16 aExp;
1538 1.1 ross bits32 aSig;
1539 1.1 ross
1540 1.1 ross aSig = extractFloat32Frac( a );
1541 1.1 ross aExp = extractFloat32Exp( a );
1542 1.1 ross aSign = extractFloat32Sign( a );
1543 1.1 ross if ( aExp == 0xFF ) {
1544 1.1 ross if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1545 1.1 ross return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1546 1.1 ross }
1547 1.1 ross if ( aExp == 0 ) {
1548 1.1 ross if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1549 1.1 ross normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1550 1.1 ross }
1551 1.1 ross aSig |= 0x00800000;
1552 1.1 ross return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1553 1.1 ross
1554 1.1 ross }
1555 1.1 ross
1556 1.1 ross #endif
1557 1.1 ross
1558 1.1 ross #ifdef FLOAT128
1559 1.1 ross
1560 1.1 ross /*
1561 1.1 ross -------------------------------------------------------------------------------
1562 1.1 ross Returns the result of converting the single-precision floating-point value
1563 1.1 ross `a' to the double-precision floating-point format. The conversion is
1564 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
1565 1.1 ross Arithmetic.
1566 1.1 ross -------------------------------------------------------------------------------
1567 1.1 ross */
1568 1.1 ross float128 float32_to_float128( float32 a )
1569 1.1 ross {
1570 1.1 ross flag aSign;
1571 1.1 ross int16 aExp;
1572 1.1 ross bits32 aSig;
1573 1.1 ross
1574 1.1 ross aSig = extractFloat32Frac( a );
1575 1.1 ross aExp = extractFloat32Exp( a );
1576 1.1 ross aSign = extractFloat32Sign( a );
1577 1.1 ross if ( aExp == 0xFF ) {
1578 1.1 ross if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1579 1.1 ross return packFloat128( aSign, 0x7FFF, 0, 0 );
1580 1.1 ross }
1581 1.1 ross if ( aExp == 0 ) {
1582 1.1 ross if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1583 1.1 ross normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1584 1.1 ross --aExp;
1585 1.1 ross }
1586 1.1 ross return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1587 1.1 ross
1588 1.1 ross }
1589 1.1 ross
1590 1.1 ross #endif
1591 1.1 ross
1592 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1593 1.1 ross /*
1594 1.1 ross -------------------------------------------------------------------------------
1595 1.1 ross Rounds the single-precision floating-point value `a' to an integer, and
1596 1.1 ross returns the result as a single-precision floating-point value. The
1597 1.1 ross operation is performed according to the IEC/IEEE Standard for Binary
1598 1.1 ross Floating-Point Arithmetic.
1599 1.1 ross -------------------------------------------------------------------------------
1600 1.1 ross */
1601 1.1 ross float32 float32_round_to_int( float32 a )
1602 1.1 ross {
1603 1.1 ross flag aSign;
1604 1.1 ross int16 aExp;
1605 1.1 ross bits32 lastBitMask, roundBitsMask;
1606 1.1 ross int8 roundingMode;
1607 1.1 ross float32 z;
1608 1.1 ross
1609 1.1 ross aExp = extractFloat32Exp( a );
1610 1.1 ross if ( 0x96 <= aExp ) {
1611 1.1 ross if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1612 1.1 ross return propagateFloat32NaN( a, a );
1613 1.1 ross }
1614 1.1 ross return a;
1615 1.1 ross }
1616 1.1 ross if ( aExp <= 0x7E ) {
1617 1.1 ross if ( (bits32) ( a<<1 ) == 0 ) return a;
1618 1.1 ross float_set_inexact();
1619 1.1 ross aSign = extractFloat32Sign( a );
1620 1.1 ross switch ( float_rounding_mode() ) {
1621 1.1 ross case float_round_nearest_even:
1622 1.1 ross if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1623 1.1 ross return packFloat32( aSign, 0x7F, 0 );
1624 1.1 ross }
1625 1.1 ross break;
1626 1.1 ross case float_round_down:
1627 1.1 ross return aSign ? 0xBF800000 : 0;
1628 1.1 ross case float_round_up:
1629 1.1 ross return aSign ? 0x80000000 : 0x3F800000;
1630 1.1 ross }
1631 1.1 ross return packFloat32( aSign, 0, 0 );
1632 1.1 ross }
1633 1.1 ross lastBitMask = 1;
1634 1.1 ross lastBitMask <<= 0x96 - aExp;
1635 1.1 ross roundBitsMask = lastBitMask - 1;
1636 1.1 ross z = a;
1637 1.1 ross roundingMode = float_rounding_mode();
1638 1.1 ross if ( roundingMode == float_round_nearest_even ) {
1639 1.1 ross z += lastBitMask>>1;
1640 1.1 ross if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1641 1.1 ross }
1642 1.1 ross else if ( roundingMode != float_round_to_zero ) {
1643 1.1 ross if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1644 1.1 ross z += roundBitsMask;
1645 1.1 ross }
1646 1.1 ross }
1647 1.1 ross z &= ~ roundBitsMask;
1648 1.1 ross if ( z != a ) float_set_inexact();
1649 1.1 ross return z;
1650 1.1 ross
1651 1.1 ross }
1652 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
1653 1.1 ross
1654 1.1 ross /*
1655 1.1 ross -------------------------------------------------------------------------------
1656 1.1 ross Returns the result of adding the absolute values of the single-precision
1657 1.1 ross floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1658 1.1 ross before being returned. `zSign' is ignored if the result is a NaN.
1659 1.1 ross The addition is performed according to the IEC/IEEE Standard for Binary
1660 1.1 ross Floating-Point Arithmetic.
1661 1.1 ross -------------------------------------------------------------------------------
1662 1.1 ross */
1663 1.1 ross static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1664 1.1 ross {
1665 1.1 ross int16 aExp, bExp, zExp;
1666 1.1 ross bits32 aSig, bSig, zSig;
1667 1.1 ross int16 expDiff;
1668 1.1 ross
1669 1.1 ross aSig = extractFloat32Frac( a );
1670 1.1 ross aExp = extractFloat32Exp( a );
1671 1.1 ross bSig = extractFloat32Frac( b );
1672 1.1 ross bExp = extractFloat32Exp( b );
1673 1.1 ross expDiff = aExp - bExp;
1674 1.1 ross aSig <<= 6;
1675 1.1 ross bSig <<= 6;
1676 1.1 ross if ( 0 < expDiff ) {
1677 1.1 ross if ( aExp == 0xFF ) {
1678 1.1 ross if ( aSig ) return propagateFloat32NaN( a, b );
1679 1.1 ross return a;
1680 1.1 ross }
1681 1.1 ross if ( bExp == 0 ) {
1682 1.1 ross --expDiff;
1683 1.1 ross }
1684 1.1 ross else {
1685 1.1 ross bSig |= 0x20000000;
1686 1.1 ross }
1687 1.1 ross shift32RightJamming( bSig, expDiff, &bSig );
1688 1.1 ross zExp = aExp;
1689 1.1 ross }
1690 1.1 ross else if ( expDiff < 0 ) {
1691 1.1 ross if ( bExp == 0xFF ) {
1692 1.1 ross if ( bSig ) return propagateFloat32NaN( a, b );
1693 1.1 ross return packFloat32( zSign, 0xFF, 0 );
1694 1.1 ross }
1695 1.1 ross if ( aExp == 0 ) {
1696 1.1 ross ++expDiff;
1697 1.1 ross }
1698 1.1 ross else {
1699 1.1 ross aSig |= 0x20000000;
1700 1.1 ross }
1701 1.1 ross shift32RightJamming( aSig, - expDiff, &aSig );
1702 1.1 ross zExp = bExp;
1703 1.1 ross }
1704 1.1 ross else {
1705 1.1 ross if ( aExp == 0xFF ) {
1706 1.1 ross if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1707 1.1 ross return a;
1708 1.1 ross }
1709 1.1 ross if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1710 1.1 ross zSig = 0x40000000 + aSig + bSig;
1711 1.1 ross zExp = aExp;
1712 1.1 ross goto roundAndPack;
1713 1.1 ross }
1714 1.1 ross aSig |= 0x20000000;
1715 1.1 ross zSig = ( aSig + bSig )<<1;
1716 1.1 ross --zExp;
1717 1.1 ross if ( (sbits32) zSig < 0 ) {
1718 1.1 ross zSig = aSig + bSig;
1719 1.1 ross ++zExp;
1720 1.1 ross }
1721 1.1 ross roundAndPack:
1722 1.1 ross return roundAndPackFloat32( zSign, zExp, zSig );
1723 1.1 ross
1724 1.1 ross }
1725 1.1 ross
1726 1.1 ross /*
1727 1.1 ross -------------------------------------------------------------------------------
1728 1.1 ross Returns the result of subtracting the absolute values of the single-
1729 1.1 ross precision floating-point values `a' and `b'. If `zSign' is 1, the
1730 1.1 ross difference is negated before being returned. `zSign' is ignored if the
1731 1.1 ross result is a NaN. The subtraction is performed according to the IEC/IEEE
1732 1.1 ross Standard for Binary Floating-Point Arithmetic.
1733 1.1 ross -------------------------------------------------------------------------------
1734 1.1 ross */
1735 1.1 ross static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1736 1.1 ross {
1737 1.1 ross int16 aExp, bExp, zExp;
1738 1.1 ross bits32 aSig, bSig, zSig;
1739 1.1 ross int16 expDiff;
1740 1.1 ross
1741 1.1 ross aSig = extractFloat32Frac( a );
1742 1.1 ross aExp = extractFloat32Exp( a );
1743 1.1 ross bSig = extractFloat32Frac( b );
1744 1.1 ross bExp = extractFloat32Exp( b );
1745 1.1 ross expDiff = aExp - bExp;
1746 1.1 ross aSig <<= 7;
1747 1.1 ross bSig <<= 7;
1748 1.1 ross if ( 0 < expDiff ) goto aExpBigger;
1749 1.1 ross if ( expDiff < 0 ) goto bExpBigger;
1750 1.1 ross if ( aExp == 0xFF ) {
1751 1.1 ross if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1752 1.1 ross float_raise( float_flag_invalid );
1753 1.1 ross return float32_default_nan;
1754 1.1 ross }
1755 1.1 ross if ( aExp == 0 ) {
1756 1.1 ross aExp = 1;
1757 1.1 ross bExp = 1;
1758 1.1 ross }
1759 1.1 ross if ( bSig < aSig ) goto aBigger;
1760 1.1 ross if ( aSig < bSig ) goto bBigger;
1761 1.1 ross return packFloat32( float_rounding_mode() == float_round_down, 0, 0 );
1762 1.1 ross bExpBigger:
1763 1.1 ross if ( bExp == 0xFF ) {
1764 1.1 ross if ( bSig ) return propagateFloat32NaN( a, b );
1765 1.1 ross return packFloat32( zSign ^ 1, 0xFF, 0 );
1766 1.1 ross }
1767 1.1 ross if ( aExp == 0 ) {
1768 1.1 ross ++expDiff;
1769 1.1 ross }
1770 1.1 ross else {
1771 1.1 ross aSig |= 0x40000000;
1772 1.1 ross }
1773 1.1 ross shift32RightJamming( aSig, - expDiff, &aSig );
1774 1.1 ross bSig |= 0x40000000;
1775 1.1 ross bBigger:
1776 1.1 ross zSig = bSig - aSig;
1777 1.1 ross zExp = bExp;
1778 1.1 ross zSign ^= 1;
1779 1.1 ross goto normalizeRoundAndPack;
1780 1.1 ross aExpBigger:
1781 1.1 ross if ( aExp == 0xFF ) {
1782 1.1 ross if ( aSig ) return propagateFloat32NaN( a, b );
1783 1.1 ross return a;
1784 1.1 ross }
1785 1.1 ross if ( bExp == 0 ) {
1786 1.1 ross --expDiff;
1787 1.1 ross }
1788 1.1 ross else {
1789 1.1 ross bSig |= 0x40000000;
1790 1.1 ross }
1791 1.1 ross shift32RightJamming( bSig, expDiff, &bSig );
1792 1.1 ross aSig |= 0x40000000;
1793 1.1 ross aBigger:
1794 1.1 ross zSig = aSig - bSig;
1795 1.1 ross zExp = aExp;
1796 1.1 ross normalizeRoundAndPack:
1797 1.1 ross --zExp;
1798 1.1 ross return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1799 1.1 ross
1800 1.1 ross }
1801 1.1 ross
1802 1.1 ross /*
1803 1.1 ross -------------------------------------------------------------------------------
1804 1.1 ross Returns the result of adding the single-precision floating-point values `a'
1805 1.1 ross and `b'. The operation is performed according to the IEC/IEEE Standard for
1806 1.1 ross Binary Floating-Point Arithmetic.
1807 1.1 ross -------------------------------------------------------------------------------
1808 1.1 ross */
1809 1.1 ross float32 float32_add( float32 a, float32 b )
1810 1.1 ross {
1811 1.1 ross flag aSign, bSign;
1812 1.1 ross
1813 1.1 ross aSign = extractFloat32Sign( a );
1814 1.1 ross bSign = extractFloat32Sign( b );
1815 1.1 ross if ( aSign == bSign ) {
1816 1.1 ross return addFloat32Sigs( a, b, aSign );
1817 1.1 ross }
1818 1.1 ross else {
1819 1.1 ross return subFloat32Sigs( a, b, aSign );
1820 1.1 ross }
1821 1.1 ross
1822 1.1 ross }
1823 1.1 ross
1824 1.1 ross /*
1825 1.1 ross -------------------------------------------------------------------------------
1826 1.1 ross Returns the result of subtracting the single-precision floating-point values
1827 1.1 ross `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1828 1.1 ross for Binary Floating-Point Arithmetic.
1829 1.1 ross -------------------------------------------------------------------------------
1830 1.1 ross */
1831 1.1 ross float32 float32_sub( float32 a, float32 b )
1832 1.1 ross {
1833 1.1 ross flag aSign, bSign;
1834 1.1 ross
1835 1.1 ross aSign = extractFloat32Sign( a );
1836 1.1 ross bSign = extractFloat32Sign( b );
1837 1.1 ross if ( aSign == bSign ) {
1838 1.1 ross return subFloat32Sigs( a, b, aSign );
1839 1.1 ross }
1840 1.1 ross else {
1841 1.1 ross return addFloat32Sigs( a, b, aSign );
1842 1.1 ross }
1843 1.1 ross
1844 1.1 ross }
1845 1.1 ross
1846 1.1 ross /*
1847 1.1 ross -------------------------------------------------------------------------------
1848 1.1 ross Returns the result of multiplying the single-precision floating-point values
1849 1.1 ross `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1850 1.1 ross for Binary Floating-Point Arithmetic.
1851 1.1 ross -------------------------------------------------------------------------------
1852 1.1 ross */
1853 1.1 ross float32 float32_mul( float32 a, float32 b )
1854 1.1 ross {
1855 1.1 ross flag aSign, bSign, zSign;
1856 1.1 ross int16 aExp, bExp, zExp;
1857 1.1 ross bits32 aSig, bSig;
1858 1.1 ross bits64 zSig64;
1859 1.1 ross bits32 zSig;
1860 1.1 ross
1861 1.1 ross aSig = extractFloat32Frac( a );
1862 1.1 ross aExp = extractFloat32Exp( a );
1863 1.1 ross aSign = extractFloat32Sign( a );
1864 1.1 ross bSig = extractFloat32Frac( b );
1865 1.1 ross bExp = extractFloat32Exp( b );
1866 1.1 ross bSign = extractFloat32Sign( b );
1867 1.1 ross zSign = aSign ^ bSign;
1868 1.1 ross if ( aExp == 0xFF ) {
1869 1.1 ross if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1870 1.1 ross return propagateFloat32NaN( a, b );
1871 1.1 ross }
1872 1.1 ross if ( ( bExp | bSig ) == 0 ) {
1873 1.1 ross float_raise( float_flag_invalid );
1874 1.1 ross return float32_default_nan;
1875 1.1 ross }
1876 1.1 ross return packFloat32( zSign, 0xFF, 0 );
1877 1.1 ross }
1878 1.1 ross if ( bExp == 0xFF ) {
1879 1.1 ross if ( bSig ) return propagateFloat32NaN( a, b );
1880 1.1 ross if ( ( aExp | aSig ) == 0 ) {
1881 1.1 ross float_raise( float_flag_invalid );
1882 1.1 ross return float32_default_nan;
1883 1.1 ross }
1884 1.1 ross return packFloat32( zSign, 0xFF, 0 );
1885 1.1 ross }
1886 1.1 ross if ( aExp == 0 ) {
1887 1.1 ross if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1888 1.1 ross normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1889 1.1 ross }
1890 1.1 ross if ( bExp == 0 ) {
1891 1.1 ross if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1892 1.1 ross normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1893 1.1 ross }
1894 1.1 ross zExp = aExp + bExp - 0x7F;
1895 1.1 ross aSig = ( aSig | 0x00800000 )<<7;
1896 1.1 ross bSig = ( bSig | 0x00800000 )<<8;
1897 1.1 ross shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1898 1.1 ross zSig = zSig64;
1899 1.1 ross if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1900 1.1 ross zSig <<= 1;
1901 1.1 ross --zExp;
1902 1.1 ross }
1903 1.1 ross return roundAndPackFloat32( zSign, zExp, zSig );
1904 1.1 ross
1905 1.1 ross }
1906 1.1 ross
1907 1.1 ross /*
1908 1.1 ross -------------------------------------------------------------------------------
1909 1.1 ross Returns the result of dividing the single-precision floating-point value `a'
1910 1.1 ross by the corresponding value `b'. The operation is performed according to the
1911 1.1 ross IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1912 1.1 ross -------------------------------------------------------------------------------
1913 1.1 ross */
1914 1.1 ross float32 float32_div( float32 a, float32 b )
1915 1.1 ross {
1916 1.1 ross flag aSign, bSign, zSign;
1917 1.1 ross int16 aExp, bExp, zExp;
1918 1.1 ross bits32 aSig, bSig, zSig;
1919 1.1 ross
1920 1.1 ross aSig = extractFloat32Frac( a );
1921 1.1 ross aExp = extractFloat32Exp( a );
1922 1.1 ross aSign = extractFloat32Sign( a );
1923 1.1 ross bSig = extractFloat32Frac( b );
1924 1.1 ross bExp = extractFloat32Exp( b );
1925 1.1 ross bSign = extractFloat32Sign( b );
1926 1.1 ross zSign = aSign ^ bSign;
1927 1.1 ross if ( aExp == 0xFF ) {
1928 1.1 ross if ( aSig ) return propagateFloat32NaN( a, b );
1929 1.1 ross if ( bExp == 0xFF ) {
1930 1.1 ross if ( bSig ) return propagateFloat32NaN( a, b );
1931 1.1 ross float_raise( float_flag_invalid );
1932 1.1 ross return float32_default_nan;
1933 1.1 ross }
1934 1.1 ross return packFloat32( zSign, 0xFF, 0 );
1935 1.1 ross }
1936 1.1 ross if ( bExp == 0xFF ) {
1937 1.1 ross if ( bSig ) return propagateFloat32NaN( a, b );
1938 1.1 ross return packFloat32( zSign, 0, 0 );
1939 1.1 ross }
1940 1.1 ross if ( bExp == 0 ) {
1941 1.1 ross if ( bSig == 0 ) {
1942 1.1 ross if ( ( aExp | aSig ) == 0 ) {
1943 1.1 ross float_raise( float_flag_invalid );
1944 1.1 ross return float32_default_nan;
1945 1.1 ross }
1946 1.1 ross float_raise( float_flag_divbyzero );
1947 1.1 ross return packFloat32( zSign, 0xFF, 0 );
1948 1.1 ross }
1949 1.1 ross normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1950 1.1 ross }
1951 1.1 ross if ( aExp == 0 ) {
1952 1.1 ross if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1953 1.1 ross normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1954 1.1 ross }
1955 1.1 ross zExp = aExp - bExp + 0x7D;
1956 1.1 ross aSig = ( aSig | 0x00800000 )<<7;
1957 1.1 ross bSig = ( bSig | 0x00800000 )<<8;
1958 1.1 ross if ( bSig <= ( aSig + aSig ) ) {
1959 1.1 ross aSig >>= 1;
1960 1.1 ross ++zExp;
1961 1.1 ross }
1962 1.1 ross zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1963 1.1 ross if ( ( zSig & 0x3F ) == 0 ) {
1964 1.1 ross zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1965 1.1 ross }
1966 1.1 ross return roundAndPackFloat32( zSign, zExp, zSig );
1967 1.1 ross
1968 1.1 ross }
1969 1.1 ross
1970 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1971 1.1 ross /*
1972 1.1 ross -------------------------------------------------------------------------------
1973 1.1 ross Returns the remainder of the single-precision floating-point value `a'
1974 1.1 ross with respect to the corresponding value `b'. The operation is performed
1975 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1976 1.1 ross -------------------------------------------------------------------------------
1977 1.1 ross */
1978 1.1 ross float32 float32_rem( float32 a, float32 b )
1979 1.1 ross {
1980 1.1 ross flag aSign, bSign, zSign;
1981 1.1 ross int16 aExp, bExp, expDiff;
1982 1.1 ross bits32 aSig, bSig;
1983 1.1 ross bits32 q;
1984 1.1 ross bits64 aSig64, bSig64, q64;
1985 1.1 ross bits32 alternateASig;
1986 1.1 ross sbits32 sigMean;
1987 1.1 ross
1988 1.1 ross aSig = extractFloat32Frac( a );
1989 1.1 ross aExp = extractFloat32Exp( a );
1990 1.1 ross aSign = extractFloat32Sign( a );
1991 1.1 ross bSig = extractFloat32Frac( b );
1992 1.1 ross bExp = extractFloat32Exp( b );
1993 1.1 ross bSign = extractFloat32Sign( b );
1994 1.1 ross if ( aExp == 0xFF ) {
1995 1.1 ross if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1996 1.1 ross return propagateFloat32NaN( a, b );
1997 1.1 ross }
1998 1.1 ross float_raise( float_flag_invalid );
1999 1.1 ross return float32_default_nan;
2000 1.1 ross }
2001 1.1 ross if ( bExp == 0xFF ) {
2002 1.1 ross if ( bSig ) return propagateFloat32NaN( a, b );
2003 1.1 ross return a;
2004 1.1 ross }
2005 1.1 ross if ( bExp == 0 ) {
2006 1.1 ross if ( bSig == 0 ) {
2007 1.1 ross float_raise( float_flag_invalid );
2008 1.1 ross return float32_default_nan;
2009 1.1 ross }
2010 1.1 ross normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2011 1.1 ross }
2012 1.1 ross if ( aExp == 0 ) {
2013 1.1 ross if ( aSig == 0 ) return a;
2014 1.1 ross normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2015 1.1 ross }
2016 1.1 ross expDiff = aExp - bExp;
2017 1.1 ross aSig |= 0x00800000;
2018 1.1 ross bSig |= 0x00800000;
2019 1.1 ross if ( expDiff < 32 ) {
2020 1.1 ross aSig <<= 8;
2021 1.1 ross bSig <<= 8;
2022 1.1 ross if ( expDiff < 0 ) {
2023 1.1 ross if ( expDiff < -1 ) return a;
2024 1.1 ross aSig >>= 1;
2025 1.1 ross }
2026 1.1 ross q = ( bSig <= aSig );
2027 1.1 ross if ( q ) aSig -= bSig;
2028 1.1 ross if ( 0 < expDiff ) {
2029 1.1 ross q = ( ( (bits64) aSig )<<32 ) / bSig;
2030 1.1 ross q >>= 32 - expDiff;
2031 1.1 ross bSig >>= 2;
2032 1.1 ross aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2033 1.1 ross }
2034 1.1 ross else {
2035 1.1 ross aSig >>= 2;
2036 1.1 ross bSig >>= 2;
2037 1.1 ross }
2038 1.1 ross }
2039 1.1 ross else {
2040 1.1 ross if ( bSig <= aSig ) aSig -= bSig;
2041 1.1 ross aSig64 = ( (bits64) aSig )<<40;
2042 1.1 ross bSig64 = ( (bits64) bSig )<<40;
2043 1.1 ross expDiff -= 64;
2044 1.1 ross while ( 0 < expDiff ) {
2045 1.1 ross q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2046 1.1 ross q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2047 1.1 ross aSig64 = - ( ( bSig * q64 )<<38 );
2048 1.1 ross expDiff -= 62;
2049 1.1 ross }
2050 1.1 ross expDiff += 64;
2051 1.1 ross q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2052 1.1 ross q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2053 1.1 ross q = q64>>( 64 - expDiff );
2054 1.1 ross bSig <<= 6;
2055 1.1 ross aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2056 1.1 ross }
2057 1.1 ross do {
2058 1.1 ross alternateASig = aSig;
2059 1.1 ross ++q;
2060 1.1 ross aSig -= bSig;
2061 1.1 ross } while ( 0 <= (sbits32) aSig );
2062 1.1 ross sigMean = aSig + alternateASig;
2063 1.1 ross if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2064 1.1 ross aSig = alternateASig;
2065 1.1 ross }
2066 1.1 ross zSign = ( (sbits32) aSig < 0 );
2067 1.1 ross if ( zSign ) aSig = - aSig;
2068 1.1 ross return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2069 1.1 ross
2070 1.1 ross }
2071 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
2072 1.1 ross
2073 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2074 1.1 ross /*
2075 1.1 ross -------------------------------------------------------------------------------
2076 1.1 ross Returns the square root of the single-precision floating-point value `a'.
2077 1.1 ross The operation is performed according to the IEC/IEEE Standard for Binary
2078 1.1 ross Floating-Point Arithmetic.
2079 1.1 ross -------------------------------------------------------------------------------
2080 1.1 ross */
2081 1.1 ross float32 float32_sqrt( float32 a )
2082 1.1 ross {
2083 1.1 ross flag aSign;
2084 1.1 ross int16 aExp, zExp;
2085 1.1 ross bits32 aSig, zSig;
2086 1.1 ross bits64 rem, term;
2087 1.1 ross
2088 1.1 ross aSig = extractFloat32Frac( a );
2089 1.1 ross aExp = extractFloat32Exp( a );
2090 1.1 ross aSign = extractFloat32Sign( a );
2091 1.1 ross if ( aExp == 0xFF ) {
2092 1.1 ross if ( aSig ) return propagateFloat32NaN( a, 0 );
2093 1.1 ross if ( ! aSign ) return a;
2094 1.1 ross float_raise( float_flag_invalid );
2095 1.1 ross return float32_default_nan;
2096 1.1 ross }
2097 1.1 ross if ( aSign ) {
2098 1.1 ross if ( ( aExp | aSig ) == 0 ) return a;
2099 1.1 ross float_raise( float_flag_invalid );
2100 1.1 ross return float32_default_nan;
2101 1.1 ross }
2102 1.1 ross if ( aExp == 0 ) {
2103 1.1 ross if ( aSig == 0 ) return 0;
2104 1.1 ross normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2105 1.1 ross }
2106 1.1 ross zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2107 1.1 ross aSig = ( aSig | 0x00800000 )<<8;
2108 1.1 ross zSig = estimateSqrt32( aExp, aSig ) + 2;
2109 1.1 ross if ( ( zSig & 0x7F ) <= 5 ) {
2110 1.1 ross if ( zSig < 2 ) {
2111 1.1 ross zSig = 0x7FFFFFFF;
2112 1.1 ross goto roundAndPack;
2113 1.1 ross }
2114 1.1 ross aSig >>= aExp & 1;
2115 1.1 ross term = ( (bits64) zSig ) * zSig;
2116 1.1 ross rem = ( ( (bits64) aSig )<<32 ) - term;
2117 1.1 ross while ( (sbits64) rem < 0 ) {
2118 1.1 ross --zSig;
2119 1.1 ross rem += ( ( (bits64) zSig )<<1 ) | 1;
2120 1.1 ross }
2121 1.1 ross zSig |= ( rem != 0 );
2122 1.1 ross }
2123 1.1 ross shift32RightJamming( zSig, 1, &zSig );
2124 1.1 ross roundAndPack:
2125 1.1 ross return roundAndPackFloat32( 0, zExp, zSig );
2126 1.1 ross
2127 1.1 ross }
2128 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
2129 1.1 ross
2130 1.1 ross /*
2131 1.1 ross -------------------------------------------------------------------------------
2132 1.1 ross Returns 1 if the single-precision floating-point value `a' is equal to
2133 1.1 ross the corresponding value `b', and 0 otherwise. The comparison is performed
2134 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2135 1.1 ross -------------------------------------------------------------------------------
2136 1.1 ross */
2137 1.1 ross flag float32_eq( float32 a, float32 b )
2138 1.1 ross {
2139 1.1 ross
2140 1.1 ross if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2141 1.1 ross || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2142 1.1 ross ) {
2143 1.1 ross if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2144 1.1 ross float_raise( float_flag_invalid );
2145 1.1 ross }
2146 1.1 ross return 0;
2147 1.1 ross }
2148 1.1 ross return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2149 1.1 ross
2150 1.1 ross }
2151 1.1 ross
2152 1.1 ross /*
2153 1.1 ross -------------------------------------------------------------------------------
2154 1.1 ross Returns 1 if the single-precision floating-point value `a' is less than
2155 1.1 ross or equal to the corresponding value `b', and 0 otherwise. The comparison
2156 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
2157 1.1 ross Arithmetic.
2158 1.1 ross -------------------------------------------------------------------------------
2159 1.1 ross */
2160 1.1 ross flag float32_le( float32 a, float32 b )
2161 1.1 ross {
2162 1.1 ross flag aSign, bSign;
2163 1.1 ross
2164 1.1 ross if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2165 1.1 ross || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2166 1.1 ross ) {
2167 1.1 ross float_raise( float_flag_invalid );
2168 1.1 ross return 0;
2169 1.1 ross }
2170 1.1 ross aSign = extractFloat32Sign( a );
2171 1.1 ross bSign = extractFloat32Sign( b );
2172 1.1 ross if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2173 1.1 ross return ( a == b ) || ( aSign ^ ( a < b ) );
2174 1.1 ross
2175 1.1 ross }
2176 1.1 ross
2177 1.1 ross /*
2178 1.1 ross -------------------------------------------------------------------------------
2179 1.1 ross Returns 1 if the single-precision floating-point value `a' is less than
2180 1.1 ross the corresponding value `b', and 0 otherwise. The comparison is performed
2181 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2182 1.1 ross -------------------------------------------------------------------------------
2183 1.1 ross */
2184 1.1 ross flag float32_lt( float32 a, float32 b )
2185 1.1 ross {
2186 1.1 ross flag aSign, bSign;
2187 1.1 ross
2188 1.1 ross if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2189 1.1 ross || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2190 1.1 ross ) {
2191 1.1 ross float_raise( float_flag_invalid );
2192 1.1 ross return 0;
2193 1.1 ross }
2194 1.1 ross aSign = extractFloat32Sign( a );
2195 1.1 ross bSign = extractFloat32Sign( b );
2196 1.1 ross if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2197 1.1 ross return ( a != b ) && ( aSign ^ ( a < b ) );
2198 1.1 ross
2199 1.1 ross }
2200 1.1 ross
2201 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2202 1.1 ross /*
2203 1.1 ross -------------------------------------------------------------------------------
2204 1.1 ross Returns 1 if the single-precision floating-point value `a' is equal to
2205 1.1 ross the corresponding value `b', and 0 otherwise. The invalid exception is
2206 1.1 ross raised if either operand is a NaN. Otherwise, the comparison is performed
2207 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2208 1.1 ross -------------------------------------------------------------------------------
2209 1.1 ross */
2210 1.1 ross flag float32_eq_signaling( float32 a, float32 b )
2211 1.1 ross {
2212 1.1 ross
2213 1.1 ross if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2214 1.1 ross || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2215 1.1 ross ) {
2216 1.1 ross float_raise( float_flag_invalid );
2217 1.1 ross return 0;
2218 1.1 ross }
2219 1.1 ross return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2220 1.1 ross
2221 1.1 ross }
2222 1.1 ross
2223 1.1 ross /*
2224 1.1 ross -------------------------------------------------------------------------------
2225 1.1 ross Returns 1 if the single-precision floating-point value `a' is less than or
2226 1.1 ross equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2227 1.1 ross cause an exception. Otherwise, the comparison is performed according to the
2228 1.1 ross IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2229 1.1 ross -------------------------------------------------------------------------------
2230 1.1 ross */
2231 1.1 ross flag float32_le_quiet( float32 a, float32 b )
2232 1.1 ross {
2233 1.1 ross flag aSign, bSign;
2234 1.1 ross
2235 1.1 ross if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2236 1.1 ross || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2237 1.1 ross ) {
2238 1.1 ross if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2239 1.1 ross float_raise( float_flag_invalid );
2240 1.1 ross }
2241 1.1 ross return 0;
2242 1.1 ross }
2243 1.1 ross aSign = extractFloat32Sign( a );
2244 1.1 ross bSign = extractFloat32Sign( b );
2245 1.1 ross if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2246 1.1 ross return ( a == b ) || ( aSign ^ ( a < b ) );
2247 1.1 ross
2248 1.1 ross }
2249 1.1 ross
2250 1.1 ross /*
2251 1.1 ross -------------------------------------------------------------------------------
2252 1.1 ross Returns 1 if the single-precision floating-point value `a' is less than
2253 1.1 ross the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2254 1.1 ross exception. Otherwise, the comparison is performed according to the IEC/IEEE
2255 1.1 ross Standard for Binary Floating-Point Arithmetic.
2256 1.1 ross -------------------------------------------------------------------------------
2257 1.1 ross */
2258 1.1 ross flag float32_lt_quiet( float32 a, float32 b )
2259 1.1 ross {
2260 1.1 ross flag aSign, bSign;
2261 1.1 ross
2262 1.1 ross if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2263 1.1 ross || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2264 1.1 ross ) {
2265 1.1 ross if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2266 1.1 ross float_raise( float_flag_invalid );
2267 1.1 ross }
2268 1.1 ross return 0;
2269 1.1 ross }
2270 1.1 ross aSign = extractFloat32Sign( a );
2271 1.1 ross bSign = extractFloat32Sign( b );
2272 1.1 ross if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2273 1.1 ross return ( a != b ) && ( aSign ^ ( a < b ) );
2274 1.1 ross
2275 1.1 ross }
2276 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
2277 1.1 ross
2278 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2279 1.1 ross /*
2280 1.1 ross -------------------------------------------------------------------------------
2281 1.1 ross Returns the result of converting the double-precision floating-point value
2282 1.1 ross `a' to the 32-bit two's complement integer format. The conversion is
2283 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
2284 1.1 ross Arithmetic---which means in particular that the conversion is rounded
2285 1.1 ross according to the current rounding mode. If `a' is a NaN, the largest
2286 1.1 ross positive integer is returned. Otherwise, if the conversion overflows, the
2287 1.1 ross largest integer with the same sign as `a' is returned.
2288 1.1 ross -------------------------------------------------------------------------------
2289 1.1 ross */
2290 1.1 ross int32 float64_to_int32( float64 a )
2291 1.1 ross {
2292 1.1 ross flag aSign;
2293 1.1 ross int16 aExp, shiftCount;
2294 1.1 ross bits64 aSig;
2295 1.1 ross
2296 1.1 ross aSig = extractFloat64Frac( a );
2297 1.1 ross aExp = extractFloat64Exp( a );
2298 1.1 ross aSign = extractFloat64Sign( a );
2299 1.1 ross if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2300 1.1 ross if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2301 1.1 ross shiftCount = 0x42C - aExp;
2302 1.1 ross if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2303 1.1 ross return roundAndPackInt32( aSign, aSig );
2304 1.1 ross
2305 1.1 ross }
2306 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
2307 1.1 ross
2308 1.1 ross /*
2309 1.1 ross -------------------------------------------------------------------------------
2310 1.1 ross Returns the result of converting the double-precision floating-point value
2311 1.1 ross `a' to the 32-bit two's complement integer format. The conversion is
2312 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
2313 1.1 ross Arithmetic, except that the conversion is always rounded toward zero.
2314 1.1 ross If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2315 1.1 ross the conversion overflows, the largest integer with the same sign as `a' is
2316 1.1 ross returned.
2317 1.1 ross -------------------------------------------------------------------------------
2318 1.1 ross */
2319 1.1 ross int32 float64_to_int32_round_to_zero( float64 a )
2320 1.1 ross {
2321 1.1 ross flag aSign;
2322 1.1 ross int16 aExp, shiftCount;
2323 1.1 ross bits64 aSig, savedASig;
2324 1.1 ross int32 z;
2325 1.1 ross
2326 1.1 ross aSig = extractFloat64Frac( a );
2327 1.1 ross aExp = extractFloat64Exp( a );
2328 1.1 ross aSign = extractFloat64Sign( a );
2329 1.1 ross if ( 0x41E < aExp ) {
2330 1.1 ross if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2331 1.1 ross goto invalid;
2332 1.1 ross }
2333 1.1 ross else if ( aExp < 0x3FF ) {
2334 1.1 ross if ( aExp || aSig ) float_set_inexact();
2335 1.1 ross return 0;
2336 1.1 ross }
2337 1.1 ross aSig |= LIT64( 0x0010000000000000 );
2338 1.1 ross shiftCount = 0x433 - aExp;
2339 1.1 ross savedASig = aSig;
2340 1.1 ross aSig >>= shiftCount;
2341 1.1 ross z = aSig;
2342 1.1 ross if ( aSign ) z = - z;
2343 1.1 ross if ( ( z < 0 ) ^ aSign ) {
2344 1.1 ross invalid:
2345 1.1 ross float_raise( float_flag_invalid );
2346 1.1 ross return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2347 1.1 ross }
2348 1.1 ross if ( ( aSig<<shiftCount ) != savedASig ) {
2349 1.1 ross float_set_inexact();
2350 1.1 ross }
2351 1.1 ross return z;
2352 1.1 ross
2353 1.1 ross }
2354 1.1 ross
2355 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2356 1.1 ross /*
2357 1.1 ross -------------------------------------------------------------------------------
2358 1.1 ross Returns the result of converting the double-precision floating-point value
2359 1.1 ross `a' to the 64-bit two's complement integer format. The conversion is
2360 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
2361 1.1 ross Arithmetic---which means in particular that the conversion is rounded
2362 1.1 ross according to the current rounding mode. If `a' is a NaN, the largest
2363 1.1 ross positive integer is returned. Otherwise, if the conversion overflows, the
2364 1.1 ross largest integer with the same sign as `a' is returned.
2365 1.1 ross -------------------------------------------------------------------------------
2366 1.1 ross */
2367 1.1 ross int64 float64_to_int64( float64 a )
2368 1.1 ross {
2369 1.1 ross flag aSign;
2370 1.1 ross int16 aExp, shiftCount;
2371 1.1 ross bits64 aSig, aSigExtra;
2372 1.1 ross
2373 1.1 ross aSig = extractFloat64Frac( a );
2374 1.1 ross aExp = extractFloat64Exp( a );
2375 1.1 ross aSign = extractFloat64Sign( a );
2376 1.1 ross if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2377 1.1 ross shiftCount = 0x433 - aExp;
2378 1.1 ross if ( shiftCount <= 0 ) {
2379 1.1 ross if ( 0x43E < aExp ) {
2380 1.1 ross float_raise( float_flag_invalid );
2381 1.1 ross if ( ! aSign
2382 1.1 ross || ( ( aExp == 0x7FF )
2383 1.1 ross && ( aSig != LIT64( 0x0010000000000000 ) ) )
2384 1.1 ross ) {
2385 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
2386 1.1 ross }
2387 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
2388 1.1 ross }
2389 1.1 ross aSigExtra = 0;
2390 1.1 ross aSig <<= - shiftCount;
2391 1.1 ross }
2392 1.1 ross else {
2393 1.1 ross shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2394 1.1 ross }
2395 1.1 ross return roundAndPackInt64( aSign, aSig, aSigExtra );
2396 1.1 ross
2397 1.1 ross }
2398 1.1 ross
2399 1.1 ross /*
2400 1.1 ross -------------------------------------------------------------------------------
2401 1.1 ross Returns the result of converting the double-precision floating-point value
2402 1.1 ross `a' to the 64-bit two's complement integer format. The conversion is
2403 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
2404 1.1 ross Arithmetic, except that the conversion is always rounded toward zero.
2405 1.1 ross If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2406 1.1 ross the conversion overflows, the largest integer with the same sign as `a' is
2407 1.1 ross returned.
2408 1.1 ross -------------------------------------------------------------------------------
2409 1.1 ross */
2410 1.1 ross int64 float64_to_int64_round_to_zero( float64 a )
2411 1.1 ross {
2412 1.1 ross flag aSign;
2413 1.1 ross int16 aExp, shiftCount;
2414 1.1 ross bits64 aSig;
2415 1.1 ross int64 z;
2416 1.1 ross
2417 1.1 ross aSig = extractFloat64Frac( a );
2418 1.1 ross aExp = extractFloat64Exp( a );
2419 1.1 ross aSign = extractFloat64Sign( a );
2420 1.1 ross if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2421 1.1 ross shiftCount = aExp - 0x433;
2422 1.1 ross if ( 0 <= shiftCount ) {
2423 1.1 ross if ( 0x43E <= aExp ) {
2424 1.1 ross if ( a != LIT64( 0xC3E0000000000000 ) ) {
2425 1.1 ross float_raise( float_flag_invalid );
2426 1.1 ross if ( ! aSign
2427 1.1 ross || ( ( aExp == 0x7FF )
2428 1.1 ross && ( aSig != LIT64( 0x0010000000000000 ) ) )
2429 1.1 ross ) {
2430 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
2431 1.1 ross }
2432 1.1 ross }
2433 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
2434 1.1 ross }
2435 1.1 ross z = aSig<<shiftCount;
2436 1.1 ross }
2437 1.1 ross else {
2438 1.1 ross if ( aExp < 0x3FE ) {
2439 1.1 ross if ( aExp | aSig ) float_set_inexact();
2440 1.1 ross return 0;
2441 1.1 ross }
2442 1.1 ross z = aSig>>( - shiftCount );
2443 1.1 ross if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2444 1.1 ross float_set_inexact();
2445 1.1 ross }
2446 1.1 ross }
2447 1.1 ross if ( aSign ) z = - z;
2448 1.1 ross return z;
2449 1.1 ross
2450 1.1 ross }
2451 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
2452 1.1 ross
2453 1.1 ross /*
2454 1.1 ross -------------------------------------------------------------------------------
2455 1.1 ross Returns the result of converting the double-precision floating-point value
2456 1.1 ross `a' to the single-precision floating-point format. The conversion is
2457 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
2458 1.1 ross Arithmetic.
2459 1.1 ross -------------------------------------------------------------------------------
2460 1.1 ross */
2461 1.1 ross float32 float64_to_float32( float64 a )
2462 1.1 ross {
2463 1.1 ross flag aSign;
2464 1.1 ross int16 aExp;
2465 1.1 ross bits64 aSig;
2466 1.1 ross bits32 zSig;
2467 1.1 ross
2468 1.1 ross aSig = extractFloat64Frac( a );
2469 1.1 ross aExp = extractFloat64Exp( a );
2470 1.1 ross aSign = extractFloat64Sign( a );
2471 1.1 ross if ( aExp == 0x7FF ) {
2472 1.1 ross if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2473 1.1 ross return packFloat32( aSign, 0xFF, 0 );
2474 1.1 ross }
2475 1.1 ross shift64RightJamming( aSig, 22, &aSig );
2476 1.1 ross zSig = aSig;
2477 1.1 ross if ( aExp || zSig ) {
2478 1.1 ross zSig |= 0x40000000;
2479 1.1 ross aExp -= 0x381;
2480 1.1 ross }
2481 1.1 ross return roundAndPackFloat32( aSign, aExp, zSig );
2482 1.1 ross
2483 1.1 ross }
2484 1.1 ross
2485 1.1 ross #ifdef FLOATX80
2486 1.1 ross
2487 1.1 ross /*
2488 1.1 ross -------------------------------------------------------------------------------
2489 1.1 ross Returns the result of converting the double-precision floating-point value
2490 1.1 ross `a' to the extended double-precision floating-point format. The conversion
2491 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
2492 1.1 ross Arithmetic.
2493 1.1 ross -------------------------------------------------------------------------------
2494 1.1 ross */
2495 1.1 ross floatx80 float64_to_floatx80( float64 a )
2496 1.1 ross {
2497 1.1 ross flag aSign;
2498 1.1 ross int16 aExp;
2499 1.1 ross bits64 aSig;
2500 1.1 ross
2501 1.1 ross aSig = extractFloat64Frac( a );
2502 1.1 ross aExp = extractFloat64Exp( a );
2503 1.1 ross aSign = extractFloat64Sign( a );
2504 1.1 ross if ( aExp == 0x7FF ) {
2505 1.1 ross if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2506 1.1 ross return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2507 1.1 ross }
2508 1.1 ross if ( aExp == 0 ) {
2509 1.1 ross if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2510 1.1 ross normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2511 1.1 ross }
2512 1.1 ross return
2513 1.1 ross packFloatx80(
2514 1.1 ross aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2515 1.1 ross
2516 1.1 ross }
2517 1.1 ross
2518 1.1 ross #endif
2519 1.1 ross
2520 1.1 ross #ifdef FLOAT128
2521 1.1 ross
2522 1.1 ross /*
2523 1.1 ross -------------------------------------------------------------------------------
2524 1.1 ross Returns the result of converting the double-precision floating-point value
2525 1.1 ross `a' to the quadruple-precision floating-point format. The conversion is
2526 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
2527 1.1 ross Arithmetic.
2528 1.1 ross -------------------------------------------------------------------------------
2529 1.1 ross */
2530 1.1 ross float128 float64_to_float128( float64 a )
2531 1.1 ross {
2532 1.1 ross flag aSign;
2533 1.1 ross int16 aExp;
2534 1.1 ross bits64 aSig, zSig0, zSig1;
2535 1.1 ross
2536 1.1 ross aSig = extractFloat64Frac( a );
2537 1.1 ross aExp = extractFloat64Exp( a );
2538 1.1 ross aSign = extractFloat64Sign( a );
2539 1.1 ross if ( aExp == 0x7FF ) {
2540 1.1 ross if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2541 1.1 ross return packFloat128( aSign, 0x7FFF, 0, 0 );
2542 1.1 ross }
2543 1.1 ross if ( aExp == 0 ) {
2544 1.1 ross if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2545 1.1 ross normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2546 1.1 ross --aExp;
2547 1.1 ross }
2548 1.1 ross shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2549 1.1 ross return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2550 1.1 ross
2551 1.1 ross }
2552 1.1 ross
2553 1.1 ross #endif
2554 1.1 ross
2555 1.1 ross #ifndef SOFTFLOAT_FOR_GCC
2556 1.1 ross /*
2557 1.1 ross -------------------------------------------------------------------------------
2558 1.1 ross Rounds the double-precision floating-point value `a' to an integer, and
2559 1.1 ross returns the result as a double-precision floating-point value. The
2560 1.1 ross operation is performed according to the IEC/IEEE Standard for Binary
2561 1.1 ross Floating-Point Arithmetic.
2562 1.1 ross -------------------------------------------------------------------------------
2563 1.1 ross */
2564 1.1 ross float64 float64_round_to_int( float64 a )
2565 1.1 ross {
2566 1.1 ross flag aSign;
2567 1.1 ross int16 aExp;
2568 1.1 ross bits64 lastBitMask, roundBitsMask;
2569 1.1 ross int8 roundingMode;
2570 1.1 ross float64 z;
2571 1.1 ross
2572 1.1 ross aExp = extractFloat64Exp( a );
2573 1.1 ross if ( 0x433 <= aExp ) {
2574 1.1 ross if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2575 1.1 ross return propagateFloat64NaN( a, a );
2576 1.1 ross }
2577 1.1 ross return a;
2578 1.1 ross }
2579 1.1 ross if ( aExp < 0x3FF ) {
2580 1.1 ross if ( (bits64) ( a<<1 ) == 0 ) return a;
2581 1.1 ross float_set_inexact();
2582 1.1 ross aSign = extractFloat64Sign( a );
2583 1.1 ross switch ( float_rounding_mode() ) {
2584 1.1 ross case float_round_nearest_even:
2585 1.1 ross if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2586 1.1 ross return packFloat64( aSign, 0x3FF, 0 );
2587 1.1 ross }
2588 1.1 ross break;
2589 1.1 ross case float_round_down:
2590 1.1 ross return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2591 1.1 ross case float_round_up:
2592 1.1 ross return
2593 1.1 ross aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2594 1.1 ross }
2595 1.1 ross return packFloat64( aSign, 0, 0 );
2596 1.1 ross }
2597 1.1 ross lastBitMask = 1;
2598 1.1 ross lastBitMask <<= 0x433 - aExp;
2599 1.1 ross roundBitsMask = lastBitMask - 1;
2600 1.1 ross z = a;
2601 1.1 ross roundingMode = float_rounding_mode();
2602 1.1 ross if ( roundingMode == float_round_nearest_even ) {
2603 1.1 ross z += lastBitMask>>1;
2604 1.1 ross if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2605 1.1 ross }
2606 1.1 ross else if ( roundingMode != float_round_to_zero ) {
2607 1.1 ross if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2608 1.1 ross z += roundBitsMask;
2609 1.1 ross }
2610 1.1 ross }
2611 1.1 ross z &= ~ roundBitsMask;
2612 1.1 ross if ( z != a ) float_set_inexact();
2613 1.1 ross return z;
2614 1.1 ross
2615 1.1 ross }
2616 1.1 ross #endif
2617 1.1 ross
2618 1.1 ross /*
2619 1.1 ross -------------------------------------------------------------------------------
2620 1.1 ross Returns the result of adding the absolute values of the double-precision
2621 1.1 ross floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2622 1.1 ross before being returned. `zSign' is ignored if the result is a NaN.
2623 1.1 ross The addition is performed according to the IEC/IEEE Standard for Binary
2624 1.1 ross Floating-Point Arithmetic.
2625 1.1 ross -------------------------------------------------------------------------------
2626 1.1 ross */
2627 1.1 ross static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2628 1.1 ross {
2629 1.1 ross int16 aExp, bExp, zExp;
2630 1.1 ross bits64 aSig, bSig, zSig;
2631 1.1 ross int16 expDiff;
2632 1.1 ross
2633 1.1 ross aSig = extractFloat64Frac( a );
2634 1.1 ross aExp = extractFloat64Exp( a );
2635 1.1 ross bSig = extractFloat64Frac( b );
2636 1.1 ross bExp = extractFloat64Exp( b );
2637 1.1 ross expDiff = aExp - bExp;
2638 1.1 ross aSig <<= 9;
2639 1.1 ross bSig <<= 9;
2640 1.1 ross if ( 0 < expDiff ) {
2641 1.1 ross if ( aExp == 0x7FF ) {
2642 1.1 ross if ( aSig ) return propagateFloat64NaN( a, b );
2643 1.1 ross return a;
2644 1.1 ross }
2645 1.1 ross if ( bExp == 0 ) {
2646 1.1 ross --expDiff;
2647 1.1 ross }
2648 1.1 ross else {
2649 1.1 ross bSig |= LIT64( 0x2000000000000000 );
2650 1.1 ross }
2651 1.1 ross shift64RightJamming( bSig, expDiff, &bSig );
2652 1.1 ross zExp = aExp;
2653 1.1 ross }
2654 1.1 ross else if ( expDiff < 0 ) {
2655 1.1 ross if ( bExp == 0x7FF ) {
2656 1.1 ross if ( bSig ) return propagateFloat64NaN( a, b );
2657 1.1 ross return packFloat64( zSign, 0x7FF, 0 );
2658 1.1 ross }
2659 1.1 ross if ( aExp == 0 ) {
2660 1.1 ross ++expDiff;
2661 1.1 ross }
2662 1.1 ross else {
2663 1.1 ross aSig |= LIT64( 0x2000000000000000 );
2664 1.1 ross }
2665 1.1 ross shift64RightJamming( aSig, - expDiff, &aSig );
2666 1.1 ross zExp = bExp;
2667 1.1 ross }
2668 1.1 ross else {
2669 1.1 ross if ( aExp == 0x7FF ) {
2670 1.1 ross if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2671 1.1 ross return a;
2672 1.1 ross }
2673 1.1 ross if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2674 1.1 ross zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2675 1.1 ross zExp = aExp;
2676 1.1 ross goto roundAndPack;
2677 1.1 ross }
2678 1.1 ross aSig |= LIT64( 0x2000000000000000 );
2679 1.1 ross zSig = ( aSig + bSig )<<1;
2680 1.1 ross --zExp;
2681 1.1 ross if ( (sbits64) zSig < 0 ) {
2682 1.1 ross zSig = aSig + bSig;
2683 1.1 ross ++zExp;
2684 1.1 ross }
2685 1.1 ross roundAndPack:
2686 1.1 ross return roundAndPackFloat64( zSign, zExp, zSig );
2687 1.1 ross
2688 1.1 ross }
2689 1.1 ross
2690 1.1 ross /*
2691 1.1 ross -------------------------------------------------------------------------------
2692 1.1 ross Returns the result of subtracting the absolute values of the double-
2693 1.1 ross precision floating-point values `a' and `b'. If `zSign' is 1, the
2694 1.1 ross difference is negated before being returned. `zSign' is ignored if the
2695 1.1 ross result is a NaN. The subtraction is performed according to the IEC/IEEE
2696 1.1 ross Standard for Binary Floating-Point Arithmetic.
2697 1.1 ross -------------------------------------------------------------------------------
2698 1.1 ross */
2699 1.1 ross static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2700 1.1 ross {
2701 1.1 ross int16 aExp, bExp, zExp;
2702 1.1 ross bits64 aSig, bSig, zSig;
2703 1.1 ross int16 expDiff;
2704 1.1 ross
2705 1.1 ross aSig = extractFloat64Frac( a );
2706 1.1 ross aExp = extractFloat64Exp( a );
2707 1.1 ross bSig = extractFloat64Frac( b );
2708 1.1 ross bExp = extractFloat64Exp( b );
2709 1.1 ross expDiff = aExp - bExp;
2710 1.1 ross aSig <<= 10;
2711 1.1 ross bSig <<= 10;
2712 1.1 ross if ( 0 < expDiff ) goto aExpBigger;
2713 1.1 ross if ( expDiff < 0 ) goto bExpBigger;
2714 1.1 ross if ( aExp == 0x7FF ) {
2715 1.1 ross if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2716 1.1 ross float_raise( float_flag_invalid );
2717 1.1 ross return float64_default_nan;
2718 1.1 ross }
2719 1.1 ross if ( aExp == 0 ) {
2720 1.1 ross aExp = 1;
2721 1.1 ross bExp = 1;
2722 1.1 ross }
2723 1.1 ross if ( bSig < aSig ) goto aBigger;
2724 1.1 ross if ( aSig < bSig ) goto bBigger;
2725 1.1 ross return packFloat64( float_rounding_mode() == float_round_down, 0, 0 );
2726 1.1 ross bExpBigger:
2727 1.1 ross if ( bExp == 0x7FF ) {
2728 1.1 ross if ( bSig ) return propagateFloat64NaN( a, b );
2729 1.1 ross return packFloat64( zSign ^ 1, 0x7FF, 0 );
2730 1.1 ross }
2731 1.1 ross if ( aExp == 0 ) {
2732 1.1 ross ++expDiff;
2733 1.1 ross }
2734 1.1 ross else {
2735 1.1 ross aSig |= LIT64( 0x4000000000000000 );
2736 1.1 ross }
2737 1.1 ross shift64RightJamming( aSig, - expDiff, &aSig );
2738 1.1 ross bSig |= LIT64( 0x4000000000000000 );
2739 1.1 ross bBigger:
2740 1.1 ross zSig = bSig - aSig;
2741 1.1 ross zExp = bExp;
2742 1.1 ross zSign ^= 1;
2743 1.1 ross goto normalizeRoundAndPack;
2744 1.1 ross aExpBigger:
2745 1.1 ross if ( aExp == 0x7FF ) {
2746 1.1 ross if ( aSig ) return propagateFloat64NaN( a, b );
2747 1.1 ross return a;
2748 1.1 ross }
2749 1.1 ross if ( bExp == 0 ) {
2750 1.1 ross --expDiff;
2751 1.1 ross }
2752 1.1 ross else {
2753 1.1 ross bSig |= LIT64( 0x4000000000000000 );
2754 1.1 ross }
2755 1.1 ross shift64RightJamming( bSig, expDiff, &bSig );
2756 1.1 ross aSig |= LIT64( 0x4000000000000000 );
2757 1.1 ross aBigger:
2758 1.1 ross zSig = aSig - bSig;
2759 1.1 ross zExp = aExp;
2760 1.1 ross normalizeRoundAndPack:
2761 1.1 ross --zExp;
2762 1.1 ross return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2763 1.1 ross
2764 1.1 ross }
2765 1.1 ross
2766 1.1 ross /*
2767 1.1 ross -------------------------------------------------------------------------------
2768 1.1 ross Returns the result of adding the double-precision floating-point values `a'
2769 1.1 ross and `b'. The operation is performed according to the IEC/IEEE Standard for
2770 1.1 ross Binary Floating-Point Arithmetic.
2771 1.1 ross -------------------------------------------------------------------------------
2772 1.1 ross */
2773 1.1 ross float64 float64_add( float64 a, float64 b )
2774 1.1 ross {
2775 1.1 ross flag aSign, bSign;
2776 1.1 ross
2777 1.1 ross aSign = extractFloat64Sign( a );
2778 1.1 ross bSign = extractFloat64Sign( b );
2779 1.1 ross if ( aSign == bSign ) {
2780 1.1 ross return addFloat64Sigs( a, b, aSign );
2781 1.1 ross }
2782 1.1 ross else {
2783 1.1 ross return subFloat64Sigs( a, b, aSign );
2784 1.1 ross }
2785 1.1 ross
2786 1.1 ross }
2787 1.1 ross
2788 1.1 ross /*
2789 1.1 ross -------------------------------------------------------------------------------
2790 1.1 ross Returns the result of subtracting the double-precision floating-point values
2791 1.1 ross `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2792 1.1 ross for Binary Floating-Point Arithmetic.
2793 1.1 ross -------------------------------------------------------------------------------
2794 1.1 ross */
2795 1.1 ross float64 float64_sub( float64 a, float64 b )
2796 1.1 ross {
2797 1.1 ross flag aSign, bSign;
2798 1.1 ross
2799 1.1 ross aSign = extractFloat64Sign( a );
2800 1.1 ross bSign = extractFloat64Sign( b );
2801 1.1 ross if ( aSign == bSign ) {
2802 1.1 ross return subFloat64Sigs( a, b, aSign );
2803 1.1 ross }
2804 1.1 ross else {
2805 1.1 ross return addFloat64Sigs( a, b, aSign );
2806 1.1 ross }
2807 1.1 ross
2808 1.1 ross }
2809 1.1 ross
2810 1.1 ross /*
2811 1.1 ross -------------------------------------------------------------------------------
2812 1.1 ross Returns the result of multiplying the double-precision floating-point values
2813 1.1 ross `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2814 1.1 ross for Binary Floating-Point Arithmetic.
2815 1.1 ross -------------------------------------------------------------------------------
2816 1.1 ross */
2817 1.1 ross float64 float64_mul( float64 a, float64 b )
2818 1.1 ross {
2819 1.1 ross flag aSign, bSign, zSign;
2820 1.1 ross int16 aExp, bExp, zExp;
2821 1.1 ross bits64 aSig, bSig, zSig0, zSig1;
2822 1.1 ross
2823 1.1 ross aSig = extractFloat64Frac( a );
2824 1.1 ross aExp = extractFloat64Exp( a );
2825 1.1 ross aSign = extractFloat64Sign( a );
2826 1.1 ross bSig = extractFloat64Frac( b );
2827 1.1 ross bExp = extractFloat64Exp( b );
2828 1.1 ross bSign = extractFloat64Sign( b );
2829 1.1 ross zSign = aSign ^ bSign;
2830 1.1 ross if ( aExp == 0x7FF ) {
2831 1.1 ross if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2832 1.1 ross return propagateFloat64NaN( a, b );
2833 1.1 ross }
2834 1.1 ross if ( ( bExp | bSig ) == 0 ) {
2835 1.1 ross float_raise( float_flag_invalid );
2836 1.1 ross return float64_default_nan;
2837 1.1 ross }
2838 1.1 ross return packFloat64( zSign, 0x7FF, 0 );
2839 1.1 ross }
2840 1.1 ross if ( bExp == 0x7FF ) {
2841 1.1 ross if ( bSig ) return propagateFloat64NaN( a, b );
2842 1.1 ross if ( ( aExp | aSig ) == 0 ) {
2843 1.1 ross float_raise( float_flag_invalid );
2844 1.1 ross return float64_default_nan;
2845 1.1 ross }
2846 1.1 ross return packFloat64( zSign, 0x7FF, 0 );
2847 1.1 ross }
2848 1.1 ross if ( aExp == 0 ) {
2849 1.1 ross if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2850 1.1 ross normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2851 1.1 ross }
2852 1.1 ross if ( bExp == 0 ) {
2853 1.1 ross if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2854 1.1 ross normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2855 1.1 ross }
2856 1.1 ross zExp = aExp + bExp - 0x3FF;
2857 1.1 ross aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2858 1.1 ross bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2859 1.1 ross mul64To128( aSig, bSig, &zSig0, &zSig1 );
2860 1.1 ross zSig0 |= ( zSig1 != 0 );
2861 1.1 ross if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2862 1.1 ross zSig0 <<= 1;
2863 1.1 ross --zExp;
2864 1.1 ross }
2865 1.1 ross return roundAndPackFloat64( zSign, zExp, zSig0 );
2866 1.1 ross
2867 1.1 ross }
2868 1.1 ross
2869 1.1 ross /*
2870 1.1 ross -------------------------------------------------------------------------------
2871 1.1 ross Returns the result of dividing the double-precision floating-point value `a'
2872 1.1 ross by the corresponding value `b'. The operation is performed according to
2873 1.1 ross the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2874 1.1 ross -------------------------------------------------------------------------------
2875 1.1 ross */
2876 1.1 ross float64 float64_div( float64 a, float64 b )
2877 1.1 ross {
2878 1.1 ross flag aSign, bSign, zSign;
2879 1.1 ross int16 aExp, bExp, zExp;
2880 1.1 ross bits64 aSig, bSig, zSig;
2881 1.1 ross bits64 rem0, rem1;
2882 1.1 ross bits64 term0, term1;
2883 1.1 ross
2884 1.1 ross aSig = extractFloat64Frac( a );
2885 1.1 ross aExp = extractFloat64Exp( a );
2886 1.1 ross aSign = extractFloat64Sign( a );
2887 1.1 ross bSig = extractFloat64Frac( b );
2888 1.1 ross bExp = extractFloat64Exp( b );
2889 1.1 ross bSign = extractFloat64Sign( b );
2890 1.1 ross zSign = aSign ^ bSign;
2891 1.1 ross if ( aExp == 0x7FF ) {
2892 1.1 ross if ( aSig ) return propagateFloat64NaN( a, b );
2893 1.1 ross if ( bExp == 0x7FF ) {
2894 1.1 ross if ( bSig ) return propagateFloat64NaN( a, b );
2895 1.1 ross float_raise( float_flag_invalid );
2896 1.1 ross return float64_default_nan;
2897 1.1 ross }
2898 1.1 ross return packFloat64( zSign, 0x7FF, 0 );
2899 1.1 ross }
2900 1.1 ross if ( bExp == 0x7FF ) {
2901 1.1 ross if ( bSig ) return propagateFloat64NaN( a, b );
2902 1.1 ross return packFloat64( zSign, 0, 0 );
2903 1.1 ross }
2904 1.1 ross if ( bExp == 0 ) {
2905 1.1 ross if ( bSig == 0 ) {
2906 1.1 ross if ( ( aExp | aSig ) == 0 ) {
2907 1.1 ross float_raise( float_flag_invalid );
2908 1.1 ross return float64_default_nan;
2909 1.1 ross }
2910 1.1 ross float_raise( float_flag_divbyzero );
2911 1.1 ross return packFloat64( zSign, 0x7FF, 0 );
2912 1.1 ross }
2913 1.1 ross normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2914 1.1 ross }
2915 1.1 ross if ( aExp == 0 ) {
2916 1.1 ross if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2917 1.1 ross normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2918 1.1 ross }
2919 1.1 ross zExp = aExp - bExp + 0x3FD;
2920 1.1 ross aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2921 1.1 ross bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2922 1.1 ross if ( bSig <= ( aSig + aSig ) ) {
2923 1.1 ross aSig >>= 1;
2924 1.1 ross ++zExp;
2925 1.1 ross }
2926 1.1 ross zSig = estimateDiv128To64( aSig, 0, bSig );
2927 1.1 ross if ( ( zSig & 0x1FF ) <= 2 ) {
2928 1.1 ross mul64To128( bSig, zSig, &term0, &term1 );
2929 1.1 ross sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2930 1.1 ross while ( (sbits64) rem0 < 0 ) {
2931 1.1 ross --zSig;
2932 1.1 ross add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2933 1.1 ross }
2934 1.1 ross zSig |= ( rem1 != 0 );
2935 1.1 ross }
2936 1.1 ross return roundAndPackFloat64( zSign, zExp, zSig );
2937 1.1 ross
2938 1.1 ross }
2939 1.1 ross
2940 1.1 ross #ifndef SOFTFLOAT_FOR_GCC
2941 1.1 ross /*
2942 1.1 ross -------------------------------------------------------------------------------
2943 1.1 ross Returns the remainder of the double-precision floating-point value `a'
2944 1.1 ross with respect to the corresponding value `b'. The operation is performed
2945 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2946 1.1 ross -------------------------------------------------------------------------------
2947 1.1 ross */
2948 1.1 ross float64 float64_rem( float64 a, float64 b )
2949 1.1 ross {
2950 1.1 ross flag aSign, bSign, zSign;
2951 1.1 ross int16 aExp, bExp, expDiff;
2952 1.1 ross bits64 aSig, bSig;
2953 1.1 ross bits64 q, alternateASig;
2954 1.1 ross sbits64 sigMean;
2955 1.1 ross
/* Unpack both operands.  Note that bSign is extracted but never read
   again in this function; the result sign depends only on aSign. */
2956 1.1 ross aSig = extractFloat64Frac( a );
2957 1.1 ross aExp = extractFloat64Exp( a );
2958 1.1 ross aSign = extractFloat64Sign( a );
2959 1.1 ross bSig = extractFloat64Frac( b );
2960 1.1 ross bExp = extractFloat64Exp( b );
2961 1.1 ross bSign = extractFloat64Sign( b );
2962 1.1 ross if ( aExp == 0x7FF ) { /* `a' is a NaN or an infinity */
2963 1.1 ross if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2964 1.1 ross return propagateFloat64NaN( a, b ); /* at least one operand is a NaN */
2965 1.1 ross }
/* `a' is infinite and `b' is not a NaN: invalid operation. */
2966 1.1 ross float_raise( float_flag_invalid );
2967 1.1 ross return float64_default_nan;
2968 1.1 ross }
2969 1.1 ross if ( bExp == 0x7FF ) { /* `b' is a NaN or an infinity, `a' is finite */
2970 1.1 ross if ( bSig ) return propagateFloat64NaN( a, b );
2971 1.1 ross return a; /* finite `a' with infinite `b': `a' is returned unchanged */
2972 1.1 ross }
2973 1.1 ross if ( bExp == 0 ) { /* `b' is zero or subnormal */
2974 1.1 ross if ( bSig == 0 ) {
/* Divisor is zero: invalid operation. */
2975 1.1 ross float_raise( float_flag_invalid );
2976 1.1 ross return float64_default_nan;
2977 1.1 ross }
2978 1.1 ross normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2979 1.1 ross }
2980 1.1 ross if ( aExp == 0 ) { /* `a' is zero or subnormal */
2981 1.1 ross if ( aSig == 0 ) return a; /* a zero dividend is returned as is */
2982 1.1 ross normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2983 1.1 ross }
2984 1.1 ross expDiff = aExp - bExp;
/* Set bit 52 (the implicit leading one) and move it up to bit 63. */
2985 1.1 ross aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2986 1.1 ross bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2987 1.1 ross if ( expDiff < 0 ) {
/* With the exponent of `a' at least two below that of `b', |a| is less
   than half of |b|, so `a' is already the remainder. */
2988 1.1 ross if ( expDiff < -1 ) return a;
2989 1.1 ross aSig >>= 1; /* expDiff == -1: shift `a' down one bit relative to `b' */
2990 1.1 ross }
/* First quotient bit: subtract bSig once if it fits. */
2991 1.1 ross q = ( bSig <= aSig );
2992 1.1 ross if ( q ) aSig -= bSig;
/* Main reduction loop; it runs only while more than 64 exponent steps
   remain and lowers expDiff by 62 on each pass. */
2993 1.1 ross expDiff -= 64;
2994 1.1 ross while ( 0 < expDiff ) {
2995 1.1 ross q = estimateDiv128To64( aSig, 0, bSig );
/* The estimate is lowered by 2 and clamped at 0 -- presumably so that it
   never exceeds the true quotient (estimateDiv128To64 is defined
   elsewhere; confirm its error bound there). */
2996 1.1 ross q = ( 2 < q ) ? q - 2 : 0;
2997 1.1 ross aSig = - ( ( bSig>>2 ) * q ); /* unsigned arithmetic, wraps modulo 2^64 */
2998 1.1 ross expDiff -= 62;
2999 1.1 ross }
3000 1.1 ross expDiff += 64; /* undo the bias applied before the loop */
3001 1.1 ross if ( 0 < expDiff ) {
/* Final partial step: keep only the top expDiff bits of the estimate. */
3002 1.1 ross q = estimateDiv128To64( aSig, 0, bSig );
3003 1.1 ross q = ( 2 < q ) ? q - 2 : 0;
3004 1.1 ross q >>= 64 - expDiff;
3005 1.1 ross bSig >>= 2;
3006 1.1 ross aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3007 1.1 ross }
3008 1.1 ross else {
/* No quotient bits left to generate; both significands are shifted
   right by two before the correction loop below. */
3009 1.1 ross aSig >>= 2;
3010 1.1 ross bSig >>= 2;
3011 1.1 ross }
/* Keep subtracting bSig (counting in q) until aSig, viewed as signed,
   goes negative.  alternateASig holds the last value before that final
   subtraction, so on exit the two candidates are alternateASig >= 0 and
   aSig < 0. */
3012 1.1 ross do {
3013 1.1 ross alternateASig = aSig;
3014 1.1 ross ++q;
3015 1.1 ross aSig -= bSig;
3016 1.1 ross } while ( 0 <= (sbits64) aSig );
/* Choose the candidate of smaller magnitude.  The sign of the sum tells
   which one that is; an exact tie (sum == 0) is decided by the low bit
   of q. */
3017 1.1 ross sigMean = aSig + alternateASig;
3018 1.1 ross if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3019 1.1 ross aSig = alternateASig;
3020 1.1 ross }
/* A negative remainder is returned as its magnitude with the sign of
   `a' inverted. */
3021 1.1 ross zSign = ( (sbits64) aSig < 0 );
3022 1.1 ross if ( zSign ) aSig = - aSig;
3023 1.1 ross return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3024 1.1 ross
3025 1.1 ross }
3026 1.1 ross
3027 1.1 ross /*
3028 1.1 ross -------------------------------------------------------------------------------
3029 1.1 ross Returns the square root of the double-precision floating-point value `a'.
3030 1.1 ross The operation is performed according to the IEC/IEEE Standard for Binary
3031 1.1 ross Floating-Point Arithmetic.
3032 1.1 ross -------------------------------------------------------------------------------
3033 1.1 ross */
3034 1.1 ross float64 float64_sqrt( float64 a )
3035 1.1 ross {
3036 1.1 ross flag aSign;
3037 1.1 ross int16 aExp, zExp;
3038 1.1 ross bits64 aSig, zSig, doubleZSig;
3039 1.1 ross bits64 rem0, rem1, term0, term1;
3040 1.1 ross
3041 1.1 ross aSig = extractFloat64Frac( a );
3042 1.1 ross aExp = extractFloat64Exp( a );
3043 1.1 ross aSign = extractFloat64Sign( a );
3044 1.1 ross if ( aExp == 0x7FF ) { /* NaN or infinity */
3045 1.1 ross if ( aSig ) return propagateFloat64NaN( a, a );
3046 1.1 ross if ( ! aSign ) return a; /* +infinity is returned unchanged */
/* -infinity: invalid operation. */
3047 1.1 ross float_raise( float_flag_invalid );
3048 1.1 ross return float64_default_nan;
3049 1.1 ross }
3050 1.1 ross if ( aSign ) {
3051 1.1 ross if ( ( aExp | aSig ) == 0 ) return a; /* negative zero is returned as is */
/* Any other negative value: invalid operation. */
3052 1.1 ross float_raise( float_flag_invalid );
3053 1.1 ross return float64_default_nan;
3054 1.1 ross }
3055 1.1 ross if ( aExp == 0 ) { /* positive zero or subnormal */
3056 1.1 ross if ( aSig == 0 ) return 0;
3057 1.1 ross normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3058 1.1 ross }
/* Halve the unbiased exponent, then re-bias it for the rounding routine. */
3059 1.1 ross zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3060 1.1 ross aSig |= LIT64( 0x0010000000000000 ); /* set bit 52, the implicit leading one */
/* Initial estimate from the top bits of the significand, then refined
   with one estimated 128-by-64 division.  The low bit of aExp selects
   the left shift of aSig (8 when odd, 9 when even). */
3061 1.1 ross zSig = estimateSqrt32( aExp, aSig>>21 );
3062 1.1 ross aSig <<= 9 - ( aExp & 1 );
3063 1.1 ross zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
/* The exact remainder is computed only when the low nine bits of the
   estimate are at most 5 -- presumably the only case in which the
   estimate's error could change the rounded result (the error bound comes
   from helpers not visible here). */
3064 1.1 ross if ( ( zSig & 0x1FF ) <= 5 ) {
3065 1.1 ross doubleZSig = zSig<<1;
3066 1.1 ross mul64To128( zSig, zSig, &term0, &term1 );
3067 1.1 ross sub128( aSig, 0, term0, term1, &rem0, &rem1 ); /* rem = aSig:0 - zSig^2 */
/* While the remainder is negative the estimate is too large: step zSig
   down by one and add 2*zSig + 1 (using the new zSig) back into the
   remainder. */
3068 1.1 ross while ( (sbits64) rem0 < 0 ) {
3069 1.1 ross --zSig;
3070 1.1 ross doubleZSig -= 2;
3071 1.1 ross add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3072 1.1 ross }
3073 1.1 ross zSig |= ( ( rem0 | rem1 ) != 0 ); /* sticky bit: set when the root is inexact */
3074 1.1 ross }
3075 1.1 ross return roundAndPackFloat64( 0, zExp, zSig );
3076 1.1 ross
3077 1.1 ross }
3078 1.1 ross #endif
3079 1.1 ross
3080 1.1 ross /*
3081 1.1 ross -------------------------------------------------------------------------------
3082 1.1 ross Returns 1 if the double-precision floating-point value `a' is equal to the
3083 1.1 ross corresponding value `b', and 0 otherwise. The comparison is performed
3084 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3085 1.1 ross -------------------------------------------------------------------------------
3086 1.1 ross */
3087 1.1 ross flag float64_eq( float64 a, float64 b )
3088 1.1 ross {
3089 1.1 ross
3090 1.1 ross if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3091 1.1 ross || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3092 1.1 ross ) {
3093 1.1 ross if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3094 1.1 ross float_raise( float_flag_invalid );
3095 1.1 ross }
3096 1.1 ross return 0;
3097 1.1 ross }
3098 1.1 ross return ( a == b ) ||
3099 1.1 ross ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3100 1.1 ross
3101 1.1 ross }
3102 1.1 ross
3103 1.1 ross /*
3104 1.1 ross -------------------------------------------------------------------------------
3105 1.1 ross Returns 1 if the double-precision floating-point value `a' is less than or
3106 1.1 ross equal to the corresponding value `b', and 0 otherwise. The comparison is
3107 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
3108 1.1 ross Arithmetic.
3109 1.1 ross -------------------------------------------------------------------------------
3110 1.1 ross */
3111 1.1 ross flag float64_le( float64 a, float64 b )
3112 1.1 ross {
3113 1.1 ross flag aSign, bSign;
3114 1.1 ross
3115 1.1 ross if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3116 1.1 ross || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3117 1.1 ross ) {
3118 1.1 ross float_raise( float_flag_invalid );
3119 1.1 ross return 0;
3120 1.1 ross }
3121 1.1 ross aSign = extractFloat64Sign( a );
3122 1.1 ross bSign = extractFloat64Sign( b );
3123 1.1 ross if ( aSign != bSign )
3124 1.1 ross return aSign ||
3125 1.1 ross ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3126 1.1 ross 0 );
3127 1.1 ross return ( a == b ) ||
3128 1.1 ross ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3129 1.1 ross
3130 1.1 ross }
3131 1.1 ross
3132 1.1 ross /*
3133 1.1 ross -------------------------------------------------------------------------------
3134 1.1 ross Returns 1 if the double-precision floating-point value `a' is less than
3135 1.1 ross the corresponding value `b', and 0 otherwise. The comparison is performed
3136 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3137 1.1 ross -------------------------------------------------------------------------------
3138 1.1 ross */
3139 1.1 ross flag float64_lt( float64 a, float64 b )
3140 1.1 ross {
3141 1.1 ross flag aSign, bSign;
3142 1.1 ross
3143 1.1 ross if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3144 1.1 ross || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3145 1.1 ross ) {
3146 1.1 ross float_raise( float_flag_invalid );
3147 1.1 ross return 0;
3148 1.1 ross }
3149 1.1 ross aSign = extractFloat64Sign( a );
3150 1.1 ross bSign = extractFloat64Sign( b );
3151 1.1 ross if ( aSign != bSign )
3152 1.1 ross return aSign &&
3153 1.1 ross ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3154 1.1 ross 0 );
3155 1.1 ross return ( a != b ) &&
3156 1.1 ross ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3157 1.1 ross
3158 1.1 ross }
3159 1.1 ross
3160 1.1 ross #ifndef SOFTFLOAT_FOR_GCC
3161 1.1 ross /*
3162 1.1 ross -------------------------------------------------------------------------------
3163 1.1 ross Returns 1 if the double-precision floating-point value `a' is equal to the
3164 1.1 ross corresponding value `b', and 0 otherwise. The invalid exception is raised
3165 1.1 ross if either operand is a NaN. Otherwise, the comparison is performed
3166 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3167 1.1 ross -------------------------------------------------------------------------------
3168 1.1 ross */
3169 1.1 ross flag float64_eq_signaling( float64 a, float64 b )
3170 1.1 ross {
3171 1.1 ross
3172 1.1 ross if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3173 1.1 ross || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3174 1.1 ross ) {
3175 1.1 ross float_raise( float_flag_invalid );
3176 1.1 ross return 0;
3177 1.1 ross }
3178 1.1 ross return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3179 1.1 ross
3180 1.1 ross }
3181 1.1 ross
3182 1.1 ross /*
3183 1.1 ross -------------------------------------------------------------------------------
3184 1.1 ross Returns 1 if the double-precision floating-point value `a' is less than or
3185 1.1 ross equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3186 1.1 ross cause an exception. Otherwise, the comparison is performed according to the
3187 1.1 ross IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3188 1.1 ross -------------------------------------------------------------------------------
3189 1.1 ross */
3190 1.1 ross flag float64_le_quiet( float64 a, float64 b )
3191 1.1 ross {
3192 1.1 ross flag aSign, bSign;
3193 1.1 ross
3194 1.1 ross if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3195 1.1 ross || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3196 1.1 ross ) {
3197 1.1 ross if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3198 1.1 ross float_raise( float_flag_invalid );
3199 1.1 ross }
3200 1.1 ross return 0;
3201 1.1 ross }
3202 1.1 ross aSign = extractFloat64Sign( a );
3203 1.1 ross bSign = extractFloat64Sign( b );
3204 1.1 ross if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3205 1.1 ross return ( a == b ) || ( aSign ^ ( a < b ) );
3206 1.1 ross
3207 1.1 ross }
3208 1.1 ross
3209 1.1 ross /*
3210 1.1 ross -------------------------------------------------------------------------------
3211 1.1 ross Returns 1 if the double-precision floating-point value `a' is less than
3212 1.1 ross the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3213 1.1 ross exception. Otherwise, the comparison is performed according to the IEC/IEEE
3214 1.1 ross Standard for Binary Floating-Point Arithmetic.
3215 1.1 ross -------------------------------------------------------------------------------
3216 1.1 ross */
3217 1.1 ross flag float64_lt_quiet( float64 a, float64 b )
3218 1.1 ross {
3219 1.1 ross flag aSign, bSign;
3220 1.1 ross
3221 1.1 ross if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3222 1.1 ross || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3223 1.1 ross ) {
3224 1.1 ross if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3225 1.1 ross float_raise( float_flag_invalid );
3226 1.1 ross }
3227 1.1 ross return 0;
3228 1.1 ross }
3229 1.1 ross aSign = extractFloat64Sign( a );
3230 1.1 ross bSign = extractFloat64Sign( b );
3231 1.1 ross if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3232 1.1 ross return ( a != b ) && ( aSign ^ ( a < b ) );
3233 1.1 ross
3234 1.1 ross }
3235 1.1 ross #endif
3236 1.1 ross
3237 1.1 ross #ifdef FLOATX80
3238 1.1 ross
3239 1.1 ross /*
3240 1.1 ross -------------------------------------------------------------------------------
3241 1.1 ross Returns the result of converting the extended double-precision floating-
3242 1.1 ross point value `a' to the 32-bit two's complement integer format. The
3243 1.1 ross conversion is performed according to the IEC/IEEE Standard for Binary
3244 1.1 ross Floating-Point Arithmetic---which means in particular that the conversion
3245 1.1 ross is rounded according to the current rounding mode. If `a' is a NaN, the
3246 1.1 ross largest positive integer is returned. Otherwise, if the conversion
3247 1.1 ross overflows, the largest integer with the same sign as `a' is returned.
3248 1.1 ross -------------------------------------------------------------------------------
3249 1.1 ross */
3250 1.1 ross int32 floatx80_to_int32( floatx80 a )
3251 1.1 ross {
3252 1.1 ross flag aSign;
3253 1.1 ross int32 aExp, shiftCount;
3254 1.1 ross bits64 aSig;
3255 1.1 ross
3256 1.1 ross aSig = extractFloatx80Frac( a );
3257 1.1 ross aExp = extractFloatx80Exp( a );
3258 1.1 ross aSign = extractFloatx80Sign( a );
3259 1.1 ross if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3260 1.1 ross shiftCount = 0x4037 - aExp;
3261 1.1 ross if ( shiftCount <= 0 ) shiftCount = 1;
3262 1.1 ross shift64RightJamming( aSig, shiftCount, &aSig );
3263 1.1 ross return roundAndPackInt32( aSign, aSig );
3264 1.1 ross
3265 1.1 ross }
3266 1.1 ross
3267 1.1 ross /*
3268 1.1 ross -------------------------------------------------------------------------------
3269 1.1 ross Returns the result of converting the extended double-precision floating-
3270 1.1 ross point value `a' to the 32-bit two's complement integer format. The
3271 1.1 ross conversion is performed according to the IEC/IEEE Standard for Binary
3272 1.1 ross Floating-Point Arithmetic, except that the conversion is always rounded
3273 1.1 ross toward zero. If `a' is a NaN, the largest positive integer is returned.
3274 1.1 ross Otherwise, if the conversion overflows, the largest integer with the same
3275 1.1 ross sign as `a' is returned.
3276 1.1 ross -------------------------------------------------------------------------------
3277 1.1 ross */
3278 1.1 ross int32 floatx80_to_int32_round_to_zero( floatx80 a )
3279 1.1 ross {
3280 1.1 ross flag aSign;
3281 1.1 ross int32 aExp, shiftCount;
3282 1.1 ross bits64 aSig, savedASig;
3283 1.1 ross int32 z;
3284 1.1 ross
3285 1.1 ross aSig = extractFloatx80Frac( a );
3286 1.1 ross aExp = extractFloatx80Exp( a );
3287 1.1 ross aSign = extractFloatx80Sign( a );
3288 1.1 ross if ( 0x401E < aExp ) {
3289 1.1 ross if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3290 1.1 ross goto invalid;
3291 1.1 ross }
3292 1.1 ross else if ( aExp < 0x3FFF ) {
3293 1.1 ross if ( aExp || aSig ) float_set_inexact();
3294 1.1 ross return 0;
3295 1.1 ross }
3296 1.1 ross shiftCount = 0x403E - aExp;
3297 1.1 ross savedASig = aSig;
3298 1.1 ross aSig >>= shiftCount;
3299 1.1 ross z = aSig;
3300 1.1 ross if ( aSign ) z = - z;
3301 1.1 ross if ( ( z < 0 ) ^ aSign ) {
3302 1.1 ross invalid:
3303 1.1 ross float_raise( float_flag_invalid );
3304 1.1 ross return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3305 1.1 ross }
3306 1.1 ross if ( ( aSig<<shiftCount ) != savedASig ) {
3307 1.1 ross float_set_inexact();
3308 1.1 ross }
3309 1.1 ross return z;
3310 1.1 ross
3311 1.1 ross }
3312 1.1 ross
3313 1.1 ross /*
3314 1.1 ross -------------------------------------------------------------------------------
3315 1.1 ross Returns the result of converting the extended double-precision floating-
3316 1.1 ross point value `a' to the 64-bit two's complement integer format. The
3317 1.1 ross conversion is performed according to the IEC/IEEE Standard for Binary
3318 1.1 ross Floating-Point Arithmetic---which means in particular that the conversion
3319 1.1 ross is rounded according to the current rounding mode. If `a' is a NaN,
3320 1.1 ross the largest positive integer is returned. Otherwise, if the conversion
3321 1.1 ross overflows, the largest integer with the same sign as `a' is returned.
3322 1.1 ross -------------------------------------------------------------------------------
3323 1.1 ross */
3324 1.1 ross int64 floatx80_to_int64( floatx80 a )
3325 1.1 ross {
3326 1.1 ross flag aSign;
3327 1.1 ross int32 aExp, shiftCount;
3328 1.1 ross bits64 aSig, aSigExtra;
3329 1.1 ross
3330 1.1 ross aSig = extractFloatx80Frac( a );
3331 1.1 ross aExp = extractFloatx80Exp( a );
3332 1.1 ross aSign = extractFloatx80Sign( a );
3333 1.1 ross shiftCount = 0x403E - aExp;
3334 1.1 ross if ( shiftCount <= 0 ) {
3335 1.1 ross if ( shiftCount ) {
3336 1.1 ross float_raise( float_flag_invalid );
3337 1.1 ross if ( ! aSign
3338 1.1 ross || ( ( aExp == 0x7FFF )
3339 1.1 ross && ( aSig != LIT64( 0x8000000000000000 ) ) )
3340 1.1 ross ) {
3341 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
3342 1.1 ross }
3343 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
3344 1.1 ross }
3345 1.1 ross aSigExtra = 0;
3346 1.1 ross }
3347 1.1 ross else {
3348 1.1 ross shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3349 1.1 ross }
3350 1.1 ross return roundAndPackInt64( aSign, aSig, aSigExtra );
3351 1.1 ross
3352 1.1 ross }
3353 1.1 ross
3354 1.1 ross /*
3355 1.1 ross -------------------------------------------------------------------------------
3356 1.1 ross Returns the result of converting the extended double-precision floating-
3357 1.1 ross point value `a' to the 64-bit two's complement integer format. The
3358 1.1 ross conversion is performed according to the IEC/IEEE Standard for Binary
3359 1.1 ross Floating-Point Arithmetic, except that the conversion is always rounded
3360 1.1 ross toward zero. If `a' is a NaN, the largest positive integer is returned.
3361 1.1 ross Otherwise, if the conversion overflows, the largest integer with the same
3362 1.1 ross sign as `a' is returned.
3363 1.1 ross -------------------------------------------------------------------------------
3364 1.1 ross */
3365 1.1 ross int64 floatx80_to_int64_round_to_zero( floatx80 a )
3366 1.1 ross {
3367 1.1 ross flag aSign;
3368 1.1 ross int32 aExp, shiftCount;
3369 1.1 ross bits64 aSig;
3370 1.1 ross int64 z;
3371 1.1 ross
3372 1.1 ross aSig = extractFloatx80Frac( a );
3373 1.1 ross aExp = extractFloatx80Exp( a );
3374 1.1 ross aSign = extractFloatx80Sign( a );
3375 1.1 ross shiftCount = aExp - 0x403E;
3376 1.1 ross if ( 0 <= shiftCount ) {
3377 1.1 ross aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3378 1.1 ross if ( ( a.high != 0xC03E ) || aSig ) {
3379 1.1 ross float_raise( float_flag_invalid );
3380 1.1 ross if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3381 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
3382 1.1 ross }
3383 1.1 ross }
3384 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
3385 1.1 ross }
3386 1.1 ross else if ( aExp < 0x3FFF ) {
3387 1.1 ross if ( aExp | aSig ) float_set_inexact();
3388 1.1 ross return 0;
3389 1.1 ross }
3390 1.1 ross z = aSig>>( - shiftCount );
3391 1.1 ross if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3392 1.1 ross float_set_inexact();
3393 1.1 ross }
3394 1.1 ross if ( aSign ) z = - z;
3395 1.1 ross return z;
3396 1.1 ross
3397 1.1 ross }
3398 1.1 ross
3399 1.1 ross /*
3400 1.1 ross -------------------------------------------------------------------------------
3401 1.1 ross Returns the result of converting the extended double-precision floating-
3402 1.1 ross point value `a' to the single-precision floating-point format. The
3403 1.1 ross conversion is performed according to the IEC/IEEE Standard for Binary
3404 1.1 ross Floating-Point Arithmetic.
3405 1.1 ross -------------------------------------------------------------------------------
3406 1.1 ross */
3407 1.1 ross float32 floatx80_to_float32( floatx80 a )
3408 1.1 ross {
3409 1.1 ross flag aSign;
3410 1.1 ross int32 aExp;
3411 1.1 ross bits64 aSig;
3412 1.1 ross
3413 1.1 ross aSig = extractFloatx80Frac( a );
3414 1.1 ross aExp = extractFloatx80Exp( a );
3415 1.1 ross aSign = extractFloatx80Sign( a );
3416 1.1 ross if ( aExp == 0x7FFF ) {
3417 1.1 ross if ( (bits64) ( aSig<<1 ) ) {
3418 1.1 ross return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3419 1.1 ross }
3420 1.1 ross return packFloat32( aSign, 0xFF, 0 );
3421 1.1 ross }
3422 1.1 ross shift64RightJamming( aSig, 33, &aSig );
3423 1.1 ross if ( aExp || aSig ) aExp -= 0x3F81;
3424 1.1 ross return roundAndPackFloat32( aSign, aExp, aSig );
3425 1.1 ross
3426 1.1 ross }
3427 1.1 ross
3428 1.1 ross /*
3429 1.1 ross -------------------------------------------------------------------------------
3430 1.1 ross Returns the result of converting the extended double-precision floating-
3431 1.1 ross point value `a' to the double-precision floating-point format. The
3432 1.1 ross conversion is performed according to the IEC/IEEE Standard for Binary
3433 1.1 ross Floating-Point Arithmetic.
3434 1.1 ross -------------------------------------------------------------------------------
3435 1.1 ross */
3436 1.1 ross float64 floatx80_to_float64( floatx80 a )
3437 1.1 ross {
3438 1.1 ross flag aSign;
3439 1.1 ross int32 aExp;
3440 1.1 ross bits64 aSig, zSig;
3441 1.1 ross
3442 1.1 ross aSig = extractFloatx80Frac( a );
3443 1.1 ross aExp = extractFloatx80Exp( a );
3444 1.1 ross aSign = extractFloatx80Sign( a );
3445 1.1 ross if ( aExp == 0x7FFF ) {
3446 1.1 ross if ( (bits64) ( aSig<<1 ) ) {
3447 1.1 ross return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3448 1.1 ross }
3449 1.1 ross return packFloat64( aSign, 0x7FF, 0 );
3450 1.1 ross }
3451 1.1 ross shift64RightJamming( aSig, 1, &zSig );
3452 1.1 ross if ( aExp || aSig ) aExp -= 0x3C01;
3453 1.1 ross return roundAndPackFloat64( aSign, aExp, zSig );
3454 1.1 ross
3455 1.1 ross }
3456 1.1 ross
3457 1.1 ross #ifdef FLOAT128
3458 1.1 ross
3459 1.1 ross /*
3460 1.1 ross -------------------------------------------------------------------------------
3461 1.1 ross Returns the result of converting the extended double-precision floating-
3462 1.1 ross point value `a' to the quadruple-precision floating-point format. The
3463 1.1 ross conversion is performed according to the IEC/IEEE Standard for Binary
3464 1.1 ross Floating-Point Arithmetic.
3465 1.1 ross -------------------------------------------------------------------------------
3466 1.1 ross */
3467 1.1 ross float128 floatx80_to_float128( floatx80 a )
3468 1.1 ross {
3469 1.1 ross flag aSign;
3470 1.1 ross int16 aExp;
3471 1.1 ross bits64 aSig, zSig0, zSig1;
3472 1.1 ross
3473 1.1 ross aSig = extractFloatx80Frac( a );
3474 1.1 ross aExp = extractFloatx80Exp( a );
3475 1.1 ross aSign = extractFloatx80Sign( a );
3476 1.1 ross if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3477 1.1 ross return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3478 1.1 ross }
3479 1.1 ross shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3480 1.1 ross return packFloat128( aSign, aExp, zSig0, zSig1 );
3481 1.1 ross
3482 1.1 ross }
3483 1.1 ross
3484 1.1 ross #endif
3485 1.1 ross
3486 1.1 ross /*
3487 1.1 ross -------------------------------------------------------------------------------
3488 1.1 ross Rounds the extended double-precision floating-point value `a' to an integer,
3489 1.1 ross and returns the result as an extended double-precision floating-point
3490 1.1 ross value. The operation is performed according to the IEC/IEEE Standard for
3491 1.1 ross Binary Floating-Point Arithmetic.
3492 1.1 ross -------------------------------------------------------------------------------
3493 1.1 ross */
3494 1.1 ross floatx80 floatx80_round_to_int( floatx80 a )
3495 1.1 ross {
3496 1.1 ross flag aSign;
3497 1.1 ross int32 aExp;
3498 1.1 ross bits64 lastBitMask, roundBitsMask;
3499 1.1 ross int8 roundingMode;
3500 1.1 ross floatx80 z;
3501 1.1 ross
3502 1.1 ross aExp = extractFloatx80Exp( a );
/* Case 1: exponent of 0x403E or more.  The operand is returned unchanged
   (such values have no fractional significand bits), except that a NaN
   goes through the usual NaN propagation. */
3503 1.1 ross if ( 0x403E <= aExp ) {
3504 1.1 ross if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3505 1.1 ross return propagateFloatx80NaN( a, a );
3506 1.1 ross }
3507 1.1 ross return a;
3508 1.1 ross }
/* Case 2: exponent below the bias (0x3FFF), i.e. magnitude below one.
   The result is a signed zero or a signed one, chosen by rounding mode. */
3509 1.1 ross if ( aExp < 0x3FFF ) {
3510 1.1 ross if ( ( aExp == 0 )
3511 1.1 ross && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3512 1.1 ross return a; /* zero exponent and zero fraction: returned as is, no flag */
3513 1.1 ross }
3514 1.1 ross float_set_inexact();
3515 1.1 ross aSign = extractFloatx80Sign( a );
3516 1.1 ross switch ( float_rounding_mode() ) {
3517 1.1 ross case float_round_nearest_even:
/* Exponent 0x3FFE with a nonzero fraction means the magnitude is above
   one half, so it rounds to one; otherwise fall out to the zero result. */
3518 1.1 ross if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3519 1.1 ross ) {
3520 1.1 ross return
3521 1.1 ross packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3522 1.1 ross }
3523 1.1 ross break;
3524 1.1 ross case float_round_down:
/* Negative operands round to -1, positive ones to +0. */
3525 1.1 ross return
3526 1.1 ross aSign ?
3527 1.1 ross packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3528 1.1 ross : packFloatx80( 0, 0, 0 );
3529 1.1 ross case float_round_up:
/* Negative operands round to -0, positive ones to +1. */
3530 1.1 ross return
3531 1.1 ross aSign ? packFloatx80( 1, 0, 0 )
3532 1.1 ross : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3533 1.1 ross }
/* Any other rounding mode, and the nearest-even cases that did not
   return above, give a zero carrying the operand's sign. */
3534 1.1 ross return packFloatx80( aSign, 0, 0 );
3535 1.1 ross }
/* Case 3: exponent in 0x3FFF..0x403D.  lastBitMask selects the
   significand bit of weight one; roundBitsMask covers every bit below
   it (the fractional bits). */
3536 1.1 ross lastBitMask = 1;
3537 1.1 ross lastBitMask <<= 0x403E - aExp;
3538 1.1 ross roundBitsMask = lastBitMask - 1;
3539 1.1 ross z = a;
3540 1.1 ross roundingMode = float_rounding_mode();
3541 1.1 ross if ( roundingMode == float_round_nearest_even ) {
/* Add one half; if the fractional bits are then all zero the operand
   was exactly halfway, so clear the units bit to make the result even. */
3542 1.1 ross z.low += lastBitMask>>1;
3543 1.1 ross if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3544 1.1 ross }
3545 1.1 ross else if ( roundingMode != float_round_to_zero ) {
/* Directed rounding: bump the magnitude when the mode rounds away from
   zero for this sign (round-up for positive, otherwise for negative). */
3546 1.1 ross if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3547 1.1 ross z.low += roundBitsMask;
3548 1.1 ross }
3549 1.1 ross }
3550 1.1 ross z.low &= ~ roundBitsMask; /* discard the fractional bits */
/* A zero significand here means the addition above carried out of the
   top bit: move up one exponent and restore the integer bit. */
3551 1.1 ross if ( z.low == 0 ) {
3552 1.1 ross ++z.high;
3553 1.1 ross z.low = LIT64( 0x8000000000000000 );
3554 1.1 ross }
3555 1.1 ross if ( z.low != a.low ) float_set_inexact(); /* significand changed: inexact */
3556 1.1 ross return z;
3557 1.1 ross
3558 1.1 ross }
3559 1.1 ross
3560 1.1 ross /*
3561 1.1 ross -------------------------------------------------------------------------------
3562 1.1 ross Returns the result of adding the absolute values of the extended double-
3563 1.1 ross precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3564 1.1 ross negated before being returned. `zSign' is ignored if the result is a NaN.
3565 1.1 ross The addition is performed according to the IEC/IEEE Standard for Binary
3566 1.1 ross Floating-Point Arithmetic.
3567 1.1 ross -------------------------------------------------------------------------------
3568 1.1 ross */
3569 1.1 ross static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3570 1.1 ross {
3571 1.1 ross int32 aExp, bExp, zExp;
3572 1.1 ross bits64 aSig, bSig, zSig0, zSig1;
3573 1.1 ross int32 expDiff;
3574 1.1 ross
3575 1.1 ross aSig = extractFloatx80Frac( a );
3576 1.1 ross aExp = extractFloatx80Exp( a );
3577 1.1 ross bSig = extractFloatx80Frac( b );
3578 1.1 ross bExp = extractFloatx80Exp( b );
3579 1.1 ross expDiff = aExp - bExp;
3580 1.1 ross if ( 0 < expDiff ) {
3581 1.1 ross if ( aExp == 0x7FFF ) {
3582 1.1 ross if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3583 1.1 ross return a;
3584 1.1 ross }
3585 1.1 ross if ( bExp == 0 ) --expDiff;
3586 1.1 ross shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3587 1.1 ross zExp = aExp;
3588 1.1 ross }
3589 1.1 ross else if ( expDiff < 0 ) {
3590 1.1 ross if ( bExp == 0x7FFF ) {
3591 1.1 ross if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3592 1.1 ross return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3593 1.1 ross }
3594 1.1 ross if ( aExp == 0 ) ++expDiff;
3595 1.1 ross shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3596 1.1 ross zExp = bExp;
3597 1.1 ross }
3598 1.1 ross else {
3599 1.1 ross if ( aExp == 0x7FFF ) {
3600 1.1 ross if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3601 1.1 ross return propagateFloatx80NaN( a, b );
3602 1.1 ross }
3603 1.1 ross return a;
3604 1.1 ross }
3605 1.1 ross zSig1 = 0;
3606 1.1 ross zSig0 = aSig + bSig;
3607 1.1 ross if ( aExp == 0 ) {
3608 1.1 ross normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3609 1.1 ross goto roundAndPack;
3610 1.1 ross }
3611 1.1 ross zExp = aExp;
3612 1.1 ross goto shiftRight1;
3613 1.1 ross }
3614 1.1 ross zSig0 = aSig + bSig;
3615 1.1 ross if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3616 1.1 ross shiftRight1:
3617 1.1 ross shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3618 1.1 ross zSig0 |= LIT64( 0x8000000000000000 );
3619 1.1 ross ++zExp;
3620 1.1 ross roundAndPack:
3621 1.1 ross return
3622 1.1 ross roundAndPackFloatx80(
3623 1.1 ross floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3624 1.1 ross
3625 1.1 ross }
3626 1.1 ross
3627 1.1 ross /*
3628 1.1 ross -------------------------------------------------------------------------------
3629 1.1 ross Returns the result of subtracting the absolute values of the extended
3630 1.1 ross double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3631 1.1 ross difference is negated before being returned. `zSign' is ignored if the
3632 1.1 ross result is a NaN. The subtraction is performed according to the IEC/IEEE
3633 1.1 ross Standard for Binary Floating-Point Arithmetic.
3634 1.1 ross -------------------------------------------------------------------------------
3635 1.1 ross */
3636 1.1 ross static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3637 1.1 ross {
3638 1.1 ross int32 aExp, bExp, zExp;
3639 1.1 ross bits64 aSig, bSig, zSig0, zSig1;
3640 1.1 ross int32 expDiff;
3641 1.1 ross floatx80 z;
3642 1.1 ross
3643 1.1 ross aSig = extractFloatx80Frac( a );
3644 1.1 ross aExp = extractFloatx80Exp( a );
3645 1.1 ross bSig = extractFloatx80Frac( b );
3646 1.1 ross bExp = extractFloatx80Exp( b );
3647 1.1 ross expDiff = aExp - bExp;
/* Dispatch on which operand has the larger exponent; the code that
   follows directly handles equal exponents. */
3648 1.1 ross if ( 0 < expDiff ) goto aExpBigger;
3649 1.1 ross if ( expDiff < 0 ) goto bExpBigger;
3650 1.1 ross if ( aExp == 0x7FFF ) { /* both operands are NaNs or infinities */
3651 1.1 ross if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3652 1.1 ross return propagateFloatx80NaN( a, b );
3653 1.1 ross }
/* Infinity minus infinity: invalid operation, default NaN. */
3654 1.1 ross float_raise( float_flag_invalid );
3655 1.1 ross z.low = floatx80_default_nan_low;
3656 1.1 ross z.high = floatx80_default_nan_high;
3657 1.1 ross return z;
3658 1.1 ross }
/* Both exponent fields are zero: use exponent 1 for both, so whichever
   of them becomes zExp below is 1. */
3659 1.1 ross if ( aExp == 0 ) {
3660 1.1 ross aExp = 1;
3661 1.1 ross bExp = 1;
3662 1.1 ross }
3663 1.1 ross zSig1 = 0;
3664 1.1 ross if ( bSig < aSig ) goto aBigger;
3665 1.1 ross if ( aSig < bSig ) goto bBigger;
/* Equal magnitudes: the difference is zero, negative only in round-down mode. */
3666 1.1 ross return packFloatx80( float_rounding_mode() == float_round_down, 0, 0 );
3667 1.1 ross bExpBigger:
3668 1.1 ross if ( bExp == 0x7FFF ) {
3669 1.1 ross if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
/* `b' is infinite: the result is an infinity with the inverse of zSign. */
3670 1.1 ross return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3671 1.1 ross }
/* An operand with a zero exponent field is aligned one place less. */
3672 1.1 ross if ( aExp == 0 ) ++expDiff;
3673 1.1 ross shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3674 1.1 ross bBigger:
/* |b| > |a|: compute b - a and flip the result sign. */
3675 1.1 ross sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3676 1.1 ross zExp = bExp;
3677 1.1 ross zSign ^= 1;
3678 1.1 ross goto normalizeRoundAndPack;
3679 1.1 ross aExpBigger:
3680 1.1 ross if ( aExp == 0x7FFF ) {
3681 1.1 ross if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3682 1.1 ross return a; /* `a' is infinite and `b' is finite: `a' is returned unchanged */
3683 1.1 ross }
3684 1.1 ross if ( bExp == 0 ) --expDiff;
3685 1.1 ross shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3686 1.1 ross aBigger:
/* |a| > |b|: compute a - b; the result keeps zSign. */
3687 1.1 ross sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3688 1.1 ross zExp = aExp;
3689 1.1 ross normalizeRoundAndPack:
3690 1.1 ross return
3691 1.1 ross normalizeRoundAndPackFloatx80(
3692 1.1 ross floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3693 1.1 ross
3694 1.1 ross }
3695 1.1 ross
3696 1.1 ross /*
3697 1.1 ross -------------------------------------------------------------------------------
3698 1.1 ross Returns the result of adding the extended double-precision floating-point
3699 1.1 ross values `a' and `b'. The operation is performed according to the IEC/IEEE
3700 1.1 ross Standard for Binary Floating-Point Arithmetic.
3701 1.1 ross -------------------------------------------------------------------------------
3702 1.1 ross */
3703 1.1 ross floatx80 floatx80_add( floatx80 a, floatx80 b )
3704 1.1 ross {
3705 1.1 ross flag aSign, bSign;
3706 1.1 ross
3707 1.1 ross aSign = extractFloatx80Sign( a );
3708 1.1 ross bSign = extractFloatx80Sign( b );
3709 1.1 ross if ( aSign == bSign ) {
3710 1.1 ross return addFloatx80Sigs( a, b, aSign );
3711 1.1 ross }
3712 1.1 ross else {
3713 1.1 ross return subFloatx80Sigs( a, b, aSign );
3714 1.1 ross }
3715 1.1 ross
3716 1.1 ross }
3717 1.1 ross
3718 1.1 ross /*
3719 1.1 ross -------------------------------------------------------------------------------
3720 1.1 ross Returns the result of subtracting the extended double-precision floating-
3721 1.1 ross point values `a' and `b'. The operation is performed according to the
3722 1.1 ross IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3723 1.1 ross -------------------------------------------------------------------------------
3724 1.1 ross */
3725 1.1 ross floatx80 floatx80_sub( floatx80 a, floatx80 b )
3726 1.1 ross {
3727 1.1 ross flag aSign, bSign;
3728 1.1 ross
3729 1.1 ross aSign = extractFloatx80Sign( a );
3730 1.1 ross bSign = extractFloatx80Sign( b );
3731 1.1 ross if ( aSign == bSign ) {
3732 1.1 ross return subFloatx80Sigs( a, b, aSign );
3733 1.1 ross }
3734 1.1 ross else {
3735 1.1 ross return addFloatx80Sigs( a, b, aSign );
3736 1.1 ross }
3737 1.1 ross
3738 1.1 ross }
3739 1.1 ross
3740 1.1 ross /*
3741 1.1 ross -------------------------------------------------------------------------------
3742 1.1 ross Returns the result of multiplying the extended double-precision floating-
3743 1.1 ross point values `a' and `b'. The operation is performed according to the
3744 1.1 ross IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3745 1.1 ross -------------------------------------------------------------------------------
3746 1.1 ross */
3747 1.1 ross floatx80 floatx80_mul( floatx80 a, floatx80 b )
3748 1.1 ross {
3749 1.1 ross flag aSign, bSign, zSign;
3750 1.1 ross int32 aExp, bExp, zExp;
3751 1.1 ross bits64 aSig, bSig, zSig0, zSig1;
3752 1.1 ross floatx80 z;
3753 1.1 ross
3754 1.1 ross aSig = extractFloatx80Frac( a );
3755 1.1 ross aExp = extractFloatx80Exp( a );
3756 1.1 ross aSign = extractFloatx80Sign( a );
3757 1.1 ross bSig = extractFloatx80Frac( b );
3758 1.1 ross bExp = extractFloatx80Exp( b );
3759 1.1 ross bSign = extractFloatx80Sign( b );
3760 1.1 ross zSign = aSign ^ bSign;
3761 1.1 ross if ( aExp == 0x7FFF ) {
3762 1.1 ross if ( (bits64) ( aSig<<1 )
3763 1.1 ross || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3764 1.1 ross return propagateFloatx80NaN( a, b );
3765 1.1 ross }
3766 1.1 ross if ( ( bExp | bSig ) == 0 ) goto invalid;
3767 1.1 ross return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3768 1.1 ross }
3769 1.1 ross if ( bExp == 0x7FFF ) {
3770 1.1 ross if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3771 1.1 ross if ( ( aExp | aSig ) == 0 ) {
3772 1.1 ross invalid:
3773 1.1 ross float_raise( float_flag_invalid );
3774 1.1 ross z.low = floatx80_default_nan_low;
3775 1.1 ross z.high = floatx80_default_nan_high;
3776 1.1 ross return z;
3777 1.1 ross }
3778 1.1 ross return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3779 1.1 ross }
3780 1.1 ross if ( aExp == 0 ) {
3781 1.1 ross if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3782 1.1 ross normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3783 1.1 ross }
3784 1.1 ross if ( bExp == 0 ) {
3785 1.1 ross if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3786 1.1 ross normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3787 1.1 ross }
3788 1.1 ross zExp = aExp + bExp - 0x3FFE;
3789 1.1 ross mul64To128( aSig, bSig, &zSig0, &zSig1 );
3790 1.1 ross if ( 0 < (sbits64) zSig0 ) {
3791 1.1 ross shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3792 1.1 ross --zExp;
3793 1.1 ross }
3794 1.1 ross return
3795 1.1 ross roundAndPackFloatx80(
3796 1.1 ross floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3797 1.1 ross
3798 1.1 ross }
3799 1.1 ross
3800 1.1 ross /*
3801 1.1 ross -------------------------------------------------------------------------------
3802 1.1 ross Returns the result of dividing the extended double-precision floating-point
3803 1.1 ross value `a' by the corresponding value `b'. The operation is performed
3804 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3805 1.1 ross -------------------------------------------------------------------------------
3806 1.1 ross */
3807 1.1 ross floatx80 floatx80_div( floatx80 a, floatx80 b )
3808 1.1 ross {
3809 1.1 ross flag aSign, bSign, zSign;
3810 1.1 ross int32 aExp, bExp, zExp;
3811 1.1 ross bits64 aSig, bSig, zSig0, zSig1;
3812 1.1 ross bits64 rem0, rem1, rem2, term0, term1, term2;
3813 1.1 ross floatx80 z;
3814 1.1 ross
3815 1.1 ross aSig = extractFloatx80Frac( a );
3816 1.1 ross aExp = extractFloatx80Exp( a );
3817 1.1 ross aSign = extractFloatx80Sign( a );
3818 1.1 ross bSig = extractFloatx80Frac( b );
3819 1.1 ross bExp = extractFloatx80Exp( b );
3820 1.1 ross bSign = extractFloatx80Sign( b );
3821 1.1 ross zSign = aSign ^ bSign;
3822 1.1 ross if ( aExp == 0x7FFF ) {
3823 1.1 ross if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3824 1.1 ross if ( bExp == 0x7FFF ) {
3825 1.1 ross if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3826 1.1 ross goto invalid;
3827 1.1 ross }
3828 1.1 ross return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3829 1.1 ross }
3830 1.1 ross if ( bExp == 0x7FFF ) {
3831 1.1 ross if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3832 1.1 ross return packFloatx80( zSign, 0, 0 );
3833 1.1 ross }
3834 1.1 ross if ( bExp == 0 ) {
3835 1.1 ross if ( bSig == 0 ) {
3836 1.1 ross if ( ( aExp | aSig ) == 0 ) {
3837 1.1 ross invalid:
3838 1.1 ross float_raise( float_flag_invalid );
3839 1.1 ross z.low = floatx80_default_nan_low;
3840 1.1 ross z.high = floatx80_default_nan_high;
3841 1.1 ross return z;
3842 1.1 ross }
3843 1.1 ross float_raise( float_flag_divbyzero );
3844 1.1 ross return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3845 1.1 ross }
3846 1.1 ross normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3847 1.1 ross }
3848 1.1 ross if ( aExp == 0 ) {
3849 1.1 ross if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3850 1.1 ross normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3851 1.1 ross }
3852 1.1 ross zExp = aExp - bExp + 0x3FFE;
3853 1.1 ross rem1 = 0;
3854 1.1 ross if ( bSig <= aSig ) {
3855 1.1 ross shift128Right( aSig, 0, 1, &aSig, &rem1 );
3856 1.1 ross ++zExp;
3857 1.1 ross }
3858 1.1 ross zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3859 1.1 ross mul64To128( bSig, zSig0, &term0, &term1 );
3860 1.1 ross sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3861 1.1 ross while ( (sbits64) rem0 < 0 ) {
3862 1.1 ross --zSig0;
3863 1.1 ross add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3864 1.1 ross }
3865 1.1 ross zSig1 = estimateDiv128To64( rem1, 0, bSig );
3866 1.1 ross if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3867 1.1 ross mul64To128( bSig, zSig1, &term1, &term2 );
3868 1.1 ross sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3869 1.1 ross while ( (sbits64) rem1 < 0 ) {
3870 1.1 ross --zSig1;
3871 1.1 ross add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3872 1.1 ross }
3873 1.1 ross zSig1 |= ( ( rem1 | rem2 ) != 0 );
3874 1.1 ross }
3875 1.1 ross return
3876 1.1 ross roundAndPackFloatx80(
3877 1.1 ross floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3878 1.1 ross
3879 1.1 ross }
3880 1.1 ross
3881 1.1 ross /*
3882 1.1 ross -------------------------------------------------------------------------------
3883 1.1 ross Returns the remainder of the extended double-precision floating-point value
3884 1.1 ross `a' with respect to the corresponding value `b'. The operation is performed
3885 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3886 1.1 ross -------------------------------------------------------------------------------
3887 1.1 ross */
3888 1.1 ross floatx80 floatx80_rem( floatx80 a, floatx80 b )
3889 1.1 ross {
3890 1.1 ross flag aSign, bSign, zSign;
3891 1.1 ross int32 aExp, bExp, expDiff;
3892 1.1 ross bits64 aSig0, aSig1, bSig;
3893 1.1 ross bits64 q, term0, term1, alternateASig0, alternateASig1;
3894 1.1 ross floatx80 z;
3895 1.1 ross
3896 1.1 ross aSig0 = extractFloatx80Frac( a );
3897 1.1 ross aExp = extractFloatx80Exp( a );
3898 1.1 ross aSign = extractFloatx80Sign( a );
3899 1.1 ross bSig = extractFloatx80Frac( b );
3900 1.1 ross bExp = extractFloatx80Exp( b );
3901 1.1 ross bSign = extractFloatx80Sign( b );
3902 1.1 ross if ( aExp == 0x7FFF ) {
3903 1.1 ross if ( (bits64) ( aSig0<<1 )
3904 1.1 ross || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3905 1.1 ross return propagateFloatx80NaN( a, b );
3906 1.1 ross }
3907 1.1 ross goto invalid;
3908 1.1 ross }
3909 1.1 ross if ( bExp == 0x7FFF ) {
3910 1.1 ross if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3911 1.1 ross return a;
3912 1.1 ross }
3913 1.1 ross if ( bExp == 0 ) {
3914 1.1 ross if ( bSig == 0 ) {
3915 1.1 ross invalid:
3916 1.1 ross float_raise( float_flag_invalid );
3917 1.1 ross z.low = floatx80_default_nan_low;
3918 1.1 ross z.high = floatx80_default_nan_high;
3919 1.1 ross return z;
3920 1.1 ross }
3921 1.1 ross normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3922 1.1 ross }
3923 1.1 ross if ( aExp == 0 ) {
3924 1.1 ross if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3925 1.1 ross normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3926 1.1 ross }
3927 1.1 ross bSig |= LIT64( 0x8000000000000000 );
3928 1.1 ross zSign = aSign;
3929 1.1 ross expDiff = aExp - bExp;
3930 1.1 ross aSig1 = 0;
3931 1.1 ross if ( expDiff < 0 ) {
3932 1.1 ross if ( expDiff < -1 ) return a;
3933 1.1 ross shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3934 1.1 ross expDiff = 0;
3935 1.1 ross }
3936 1.1 ross q = ( bSig <= aSig0 );
3937 1.1 ross if ( q ) aSig0 -= bSig;
3938 1.1 ross expDiff -= 64;
3939 1.1 ross while ( 0 < expDiff ) {
3940 1.1 ross q = estimateDiv128To64( aSig0, aSig1, bSig );
3941 1.1 ross q = ( 2 < q ) ? q - 2 : 0;
3942 1.1 ross mul64To128( bSig, q, &term0, &term1 );
3943 1.1 ross sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3944 1.1 ross shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3945 1.1 ross expDiff -= 62;
3946 1.1 ross }
3947 1.1 ross expDiff += 64;
3948 1.1 ross if ( 0 < expDiff ) {
3949 1.1 ross q = estimateDiv128To64( aSig0, aSig1, bSig );
3950 1.1 ross q = ( 2 < q ) ? q - 2 : 0;
3951 1.1 ross q >>= 64 - expDiff;
3952 1.1 ross mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3953 1.1 ross sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3954 1.1 ross shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3955 1.1 ross while ( le128( term0, term1, aSig0, aSig1 ) ) {
3956 1.1 ross ++q;
3957 1.1 ross sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3958 1.1 ross }
3959 1.1 ross }
3960 1.1 ross else {
3961 1.1 ross term1 = 0;
3962 1.1 ross term0 = bSig;
3963 1.1 ross }
3964 1.1 ross sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3965 1.1 ross if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3966 1.1 ross || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3967 1.1 ross && ( q & 1 ) )
3968 1.1 ross ) {
3969 1.1 ross aSig0 = alternateASig0;
3970 1.1 ross aSig1 = alternateASig1;
3971 1.1 ross zSign = ! zSign;
3972 1.1 ross }
3973 1.1 ross return
3974 1.1 ross normalizeRoundAndPackFloatx80(
3975 1.1 ross 80, zSign, bExp + expDiff, aSig0, aSig1 );
3976 1.1 ross
3977 1.1 ross }
3978 1.1 ross
3979 1.1 ross /*
3980 1.1 ross -------------------------------------------------------------------------------
3981 1.1 ross Returns the square root of the extended double-precision floating-point
3982 1.1 ross value `a'. The operation is performed according to the IEC/IEEE Standard
3983 1.1 ross for Binary Floating-Point Arithmetic.
3984 1.1 ross -------------------------------------------------------------------------------
3985 1.1 ross */
3986 1.1 ross floatx80 floatx80_sqrt( floatx80 a )
3987 1.1 ross {
3988 1.1 ross flag aSign;
3989 1.1 ross int32 aExp, zExp;
3990 1.1 ross bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3991 1.1 ross bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3992 1.1 ross floatx80 z;
3993 1.1 ross
3994 1.1 ross aSig0 = extractFloatx80Frac( a );
3995 1.1 ross aExp = extractFloatx80Exp( a );
3996 1.1 ross aSign = extractFloatx80Sign( a );
3997 1.1 ross if ( aExp == 0x7FFF ) {
3998 1.1 ross if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3999 1.1 ross if ( ! aSign ) return a;
4000 1.1 ross goto invalid;
4001 1.1 ross }
4002 1.1 ross if ( aSign ) {
4003 1.1 ross if ( ( aExp | aSig0 ) == 0 ) return a;
4004 1.1 ross invalid:
4005 1.1 ross float_raise( float_flag_invalid );
4006 1.1 ross z.low = floatx80_default_nan_low;
4007 1.1 ross z.high = floatx80_default_nan_high;
4008 1.1 ross return z;
4009 1.1 ross }
4010 1.1 ross if ( aExp == 0 ) {
4011 1.1 ross if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4012 1.1 ross normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4013 1.1 ross }
4014 1.1 ross zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4015 1.1 ross zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4016 1.1 ross shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4017 1.1 ross zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4018 1.1 ross doubleZSig0 = zSig0<<1;
4019 1.1 ross mul64To128( zSig0, zSig0, &term0, &term1 );
4020 1.1 ross sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4021 1.1 ross while ( (sbits64) rem0 < 0 ) {
4022 1.1 ross --zSig0;
4023 1.1 ross doubleZSig0 -= 2;
4024 1.1 ross add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4025 1.1 ross }
4026 1.1 ross zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4027 1.1 ross if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4028 1.1 ross if ( zSig1 == 0 ) zSig1 = 1;
4029 1.1 ross mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4030 1.1 ross sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4031 1.1 ross mul64To128( zSig1, zSig1, &term2, &term3 );
4032 1.1 ross sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4033 1.1 ross while ( (sbits64) rem1 < 0 ) {
4034 1.1 ross --zSig1;
4035 1.1 ross shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4036 1.1 ross term3 |= 1;
4037 1.1 ross term2 |= doubleZSig0;
4038 1.1 ross add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4039 1.1 ross }
4040 1.1 ross zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4041 1.1 ross }
4042 1.1 ross shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4043 1.1 ross zSig0 |= doubleZSig0;
4044 1.1 ross return
4045 1.1 ross roundAndPackFloatx80(
4046 1.1 ross floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4047 1.1 ross
4048 1.1 ross }
4049 1.1 ross
4050 1.1 ross /*
4051 1.1 ross -------------------------------------------------------------------------------
4052 1.1 ross Returns 1 if the extended double-precision floating-point value `a' is
4053 1.1 ross equal to the corresponding value `b', and 0 otherwise. The comparison is
4054 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
4055 1.1 ross Arithmetic.
4056 1.1 ross -------------------------------------------------------------------------------
4057 1.1 ross */
4058 1.1 ross flag floatx80_eq( floatx80 a, floatx80 b )
4059 1.1 ross {
4060 1.1 ross
4061 1.1 ross if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4062 1.1 ross && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4063 1.1 ross || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4064 1.1 ross && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4065 1.1 ross ) {
4066 1.1 ross if ( floatx80_is_signaling_nan( a )
4067 1.1 ross || floatx80_is_signaling_nan( b ) ) {
4068 1.1 ross float_raise( float_flag_invalid );
4069 1.1 ross }
4070 1.1 ross return 0;
4071 1.1 ross }
4072 1.1 ross return
4073 1.1 ross ( a.low == b.low )
4074 1.1 ross && ( ( a.high == b.high )
4075 1.1 ross || ( ( a.low == 0 )
4076 1.1 ross && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4077 1.1 ross );
4078 1.1 ross
4079 1.1 ross }
4080 1.1 ross
4081 1.1 ross /*
4082 1.1 ross -------------------------------------------------------------------------------
4083 1.1 ross Returns 1 if the extended double-precision floating-point value `a' is
4084 1.1 ross less than or equal to the corresponding value `b', and 0 otherwise. The
4085 1.1 ross comparison is performed according to the IEC/IEEE Standard for Binary
4086 1.1 ross Floating-Point Arithmetic.
4087 1.1 ross -------------------------------------------------------------------------------
4088 1.1 ross */
4089 1.1 ross flag floatx80_le( floatx80 a, floatx80 b )
4090 1.1 ross {
4091 1.1 ross flag aSign, bSign;
4092 1.1 ross
4093 1.1 ross if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4094 1.1 ross && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4095 1.1 ross || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4096 1.1 ross && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4097 1.1 ross ) {
4098 1.1 ross float_raise( float_flag_invalid );
4099 1.1 ross return 0;
4100 1.1 ross }
4101 1.1 ross aSign = extractFloatx80Sign( a );
4102 1.1 ross bSign = extractFloatx80Sign( b );
4103 1.1 ross if ( aSign != bSign ) {
4104 1.1 ross return
4105 1.1 ross aSign
4106 1.1 ross || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4107 1.1 ross == 0 );
4108 1.1 ross }
4109 1.1 ross return
4110 1.1 ross aSign ? le128( b.high, b.low, a.high, a.low )
4111 1.1 ross : le128( a.high, a.low, b.high, b.low );
4112 1.1 ross
4113 1.1 ross }
4114 1.1 ross
4115 1.1 ross /*
4116 1.1 ross -------------------------------------------------------------------------------
4117 1.1 ross Returns 1 if the extended double-precision floating-point value `a' is
4118 1.1 ross less than the corresponding value `b', and 0 otherwise. The comparison
4119 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
4120 1.1 ross Arithmetic.
4121 1.1 ross -------------------------------------------------------------------------------
4122 1.1 ross */
4123 1.1 ross flag floatx80_lt( floatx80 a, floatx80 b )
4124 1.1 ross {
4125 1.1 ross flag aSign, bSign;
4126 1.1 ross
4127 1.1 ross if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4128 1.1 ross && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4129 1.1 ross || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4130 1.1 ross && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4131 1.1 ross ) {
4132 1.1 ross float_raise( float_flag_invalid );
4133 1.1 ross return 0;
4134 1.1 ross }
4135 1.1 ross aSign = extractFloatx80Sign( a );
4136 1.1 ross bSign = extractFloatx80Sign( b );
4137 1.1 ross if ( aSign != bSign ) {
4138 1.1 ross return
4139 1.1 ross aSign
4140 1.1 ross && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4141 1.1 ross != 0 );
4142 1.1 ross }
4143 1.1 ross return
4144 1.1 ross aSign ? lt128( b.high, b.low, a.high, a.low )
4145 1.1 ross : lt128( a.high, a.low, b.high, b.low );
4146 1.1 ross
4147 1.1 ross }
4148 1.1 ross
4149 1.1 ross /*
4150 1.1 ross -------------------------------------------------------------------------------
4151 1.1 ross Returns 1 if the extended double-precision floating-point value `a' is equal
4152 1.1 ross to the corresponding value `b', and 0 otherwise. The invalid exception is
4153 1.1 ross raised if either operand is a NaN. Otherwise, the comparison is performed
4154 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4155 1.1 ross -------------------------------------------------------------------------------
4156 1.1 ross */
4157 1.1 ross flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4158 1.1 ross {
4159 1.1 ross
4160 1.1 ross if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4161 1.1 ross && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4162 1.1 ross || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4163 1.1 ross && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4164 1.1 ross ) {
4165 1.1 ross float_raise( float_flag_invalid );
4166 1.1 ross return 0;
4167 1.1 ross }
4168 1.1 ross return
4169 1.1 ross ( a.low == b.low )
4170 1.1 ross && ( ( a.high == b.high )
4171 1.1 ross || ( ( a.low == 0 )
4172 1.1 ross && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4173 1.1 ross );
4174 1.1 ross
4175 1.1 ross }
4176 1.1 ross
4177 1.1 ross /*
4178 1.1 ross -------------------------------------------------------------------------------
4179 1.1 ross Returns 1 if the extended double-precision floating-point value `a' is less
4180 1.1 ross than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4181 1.1 ross do not cause an exception. Otherwise, the comparison is performed according
4182 1.1 ross to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4183 1.1 ross -------------------------------------------------------------------------------
4184 1.1 ross */
4185 1.1 ross flag floatx80_le_quiet( floatx80 a, floatx80 b )
4186 1.1 ross {
4187 1.1 ross flag aSign, bSign;
4188 1.1 ross
4189 1.1 ross if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4190 1.1 ross && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4191 1.1 ross || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4192 1.1 ross && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4193 1.1 ross ) {
4194 1.1 ross if ( floatx80_is_signaling_nan( a )
4195 1.1 ross || floatx80_is_signaling_nan( b ) ) {
4196 1.1 ross float_raise( float_flag_invalid );
4197 1.1 ross }
4198 1.1 ross return 0;
4199 1.1 ross }
4200 1.1 ross aSign = extractFloatx80Sign( a );
4201 1.1 ross bSign = extractFloatx80Sign( b );
4202 1.1 ross if ( aSign != bSign ) {
4203 1.1 ross return
4204 1.1 ross aSign
4205 1.1 ross || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4206 1.1 ross == 0 );
4207 1.1 ross }
4208 1.1 ross return
4209 1.1 ross aSign ? le128( b.high, b.low, a.high, a.low )
4210 1.1 ross : le128( a.high, a.low, b.high, b.low );
4211 1.1 ross
4212 1.1 ross }
4213 1.1 ross
4214 1.1 ross /*
4215 1.1 ross -------------------------------------------------------------------------------
4216 1.1 ross Returns 1 if the extended double-precision floating-point value `a' is less
4217 1.1 ross than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4218 1.1 ross an exception. Otherwise, the comparison is performed according to the
4219 1.1 ross IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4220 1.1 ross -------------------------------------------------------------------------------
4221 1.1 ross */
4222 1.1 ross flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4223 1.1 ross {
4224 1.1 ross flag aSign, bSign;
4225 1.1 ross
4226 1.1 ross if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4227 1.1 ross && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4228 1.1 ross || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4229 1.1 ross && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4230 1.1 ross ) {
4231 1.1 ross if ( floatx80_is_signaling_nan( a )
4232 1.1 ross || floatx80_is_signaling_nan( b ) ) {
4233 1.1 ross float_raise( float_flag_invalid );
4234 1.1 ross }
4235 1.1 ross return 0;
4236 1.1 ross }
4237 1.1 ross aSign = extractFloatx80Sign( a );
4238 1.1 ross bSign = extractFloatx80Sign( b );
4239 1.1 ross if ( aSign != bSign ) {
4240 1.1 ross return
4241 1.1 ross aSign
4242 1.1 ross && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4243 1.1 ross != 0 );
4244 1.1 ross }
4245 1.1 ross return
4246 1.1 ross aSign ? lt128( b.high, b.low, a.high, a.low )
4247 1.1 ross : lt128( a.high, a.low, b.high, b.low );
4248 1.1 ross
4249 1.1 ross }
4250 1.1 ross
4251 1.1 ross #endif
4252 1.1 ross
4253 1.1 ross #ifdef FLOAT128
4254 1.1 ross
4255 1.1 ross /*
4256 1.1 ross -------------------------------------------------------------------------------
4257 1.1 ross Returns the result of converting the quadruple-precision floating-point
4258 1.1 ross value `a' to the 32-bit two's complement integer format. The conversion
4259 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
4260 1.1 ross Arithmetic---which means in particular that the conversion is rounded
4261 1.1 ross according to the current rounding mode. If `a' is a NaN, the largest
4262 1.1 ross positive integer is returned. Otherwise, if the conversion overflows, the
4263 1.1 ross largest integer with the same sign as `a' is returned.
4264 1.1 ross -------------------------------------------------------------------------------
4265 1.1 ross */
4266 1.1 ross int32 float128_to_int32( float128 a )
4267 1.1 ross {
4268 1.1 ross flag aSign;
4269 1.1 ross int32 aExp, shiftCount;
4270 1.1 ross bits64 aSig0, aSig1;
4271 1.1 ross
4272 1.1 ross aSig1 = extractFloat128Frac1( a );
4273 1.1 ross aSig0 = extractFloat128Frac0( a );
4274 1.1 ross aExp = extractFloat128Exp( a );
4275 1.1 ross aSign = extractFloat128Sign( a );
4276 1.1 ross if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4277 1.1 ross if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4278 1.1 ross aSig0 |= ( aSig1 != 0 );
4279 1.1 ross shiftCount = 0x4028 - aExp;
4280 1.1 ross if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4281 1.1 ross return roundAndPackInt32( aSign, aSig0 );
4282 1.1 ross
4283 1.1 ross }
4284 1.1 ross
4285 1.1 ross /*
4286 1.1 ross -------------------------------------------------------------------------------
4287 1.1 ross Returns the result of converting the quadruple-precision floating-point
4288 1.1 ross value `a' to the 32-bit two's complement integer format. The conversion
4289 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
4290 1.1 ross Arithmetic, except that the conversion is always rounded toward zero. If
4291 1.1 ross `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4292 1.1 ross conversion overflows, the largest integer with the same sign as `a' is
4293 1.1 ross returned.
4294 1.1 ross -------------------------------------------------------------------------------
4295 1.1 ross */
4296 1.1 ross int32 float128_to_int32_round_to_zero( float128 a )
4297 1.1 ross {
4298 1.1 ross flag aSign;
4299 1.1 ross int32 aExp, shiftCount;
4300 1.1 ross bits64 aSig0, aSig1, savedASig;
4301 1.1 ross int32 z;
4302 1.1 ross
4303 1.1 ross aSig1 = extractFloat128Frac1( a );
4304 1.1 ross aSig0 = extractFloat128Frac0( a );
4305 1.1 ross aExp = extractFloat128Exp( a );
4306 1.1 ross aSign = extractFloat128Sign( a );
4307 1.1 ross aSig0 |= ( aSig1 != 0 );
4308 1.1 ross if ( 0x401E < aExp ) {
4309 1.1 ross if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4310 1.1 ross goto invalid;
4311 1.1 ross }
4312 1.1 ross else if ( aExp < 0x3FFF ) {
4313 1.1 ross if ( aExp || aSig0 ) float_set_inexact();
4314 1.1 ross return 0;
4315 1.1 ross }
4316 1.1 ross aSig0 |= LIT64( 0x0001000000000000 );
4317 1.1 ross shiftCount = 0x402F - aExp;
4318 1.1 ross savedASig = aSig0;
4319 1.1 ross aSig0 >>= shiftCount;
4320 1.1 ross z = aSig0;
4321 1.1 ross if ( aSign ) z = - z;
4322 1.1 ross if ( ( z < 0 ) ^ aSign ) {
4323 1.1 ross invalid:
4324 1.1 ross float_raise( float_flag_invalid );
4325 1.1 ross return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4326 1.1 ross }
4327 1.1 ross if ( ( aSig0<<shiftCount ) != savedASig ) {
4328 1.1 ross float_set_inexact();
4329 1.1 ross }
4330 1.1 ross return z;
4331 1.1 ross
4332 1.1 ross }
4333 1.1 ross
4334 1.1 ross /*
4335 1.1 ross -------------------------------------------------------------------------------
4336 1.1 ross Returns the result of converting the quadruple-precision floating-point
4337 1.1 ross value `a' to the 64-bit two's complement integer format. The conversion
4338 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
4339 1.1 ross Arithmetic---which means in particular that the conversion is rounded
4340 1.1 ross according to the current rounding mode. If `a' is a NaN, the largest
4341 1.1 ross positive integer is returned. Otherwise, if the conversion overflows, the
4342 1.1 ross largest integer with the same sign as `a' is returned.
4343 1.1 ross -------------------------------------------------------------------------------
4344 1.1 ross */
4345 1.1 ross int64 float128_to_int64( float128 a )
4346 1.1 ross {
4347 1.1 ross flag aSign;
4348 1.1 ross int32 aExp, shiftCount;
4349 1.1 ross bits64 aSig0, aSig1;
4350 1.1 ross
4351 1.1 ross aSig1 = extractFloat128Frac1( a );
4352 1.1 ross aSig0 = extractFloat128Frac0( a );
4353 1.1 ross aExp = extractFloat128Exp( a );
4354 1.1 ross aSign = extractFloat128Sign( a );
4355 1.1 ross if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4356 1.1 ross shiftCount = 0x402F - aExp;
4357 1.1 ross if ( shiftCount <= 0 ) {
4358 1.1 ross if ( 0x403E < aExp ) {
4359 1.1 ross float_raise( float_flag_invalid );
4360 1.1 ross if ( ! aSign
4361 1.1 ross || ( ( aExp == 0x7FFF )
4362 1.1 ross && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4363 1.1 ross )
4364 1.1 ross ) {
4365 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
4366 1.1 ross }
4367 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
4368 1.1 ross }
4369 1.1 ross shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4370 1.1 ross }
4371 1.1 ross else {
4372 1.1 ross shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4373 1.1 ross }
4374 1.1 ross return roundAndPackInt64( aSign, aSig0, aSig1 );
4375 1.1 ross
4376 1.1 ross }
4377 1.1 ross
4378 1.1 ross /*
4379 1.1 ross -------------------------------------------------------------------------------
4380 1.1 ross Returns the result of converting the quadruple-precision floating-point
4381 1.1 ross value `a' to the 64-bit two's complement integer format. The conversion
4382 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
4383 1.1 ross Arithmetic, except that the conversion is always rounded toward zero.
4384 1.1 ross If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4385 1.1 ross the conversion overflows, the largest integer with the same sign as `a' is
4386 1.1 ross returned.
4387 1.1 ross -------------------------------------------------------------------------------
4388 1.1 ross */
4389 1.1 ross int64 float128_to_int64_round_to_zero( float128 a )
4390 1.1 ross {
4391 1.1 ross flag aSign;
4392 1.1 ross int32 aExp, shiftCount;
4393 1.1 ross bits64 aSig0, aSig1;
4394 1.1 ross int64 z;
4395 1.1 ross
4396 1.1 ross aSig1 = extractFloat128Frac1( a );
4397 1.1 ross aSig0 = extractFloat128Frac0( a );
4398 1.1 ross aExp = extractFloat128Exp( a );
4399 1.1 ross aSign = extractFloat128Sign( a );
4400 1.1 ross if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4401 1.1 ross shiftCount = aExp - 0x402F;
4402 1.1 ross if ( 0 < shiftCount ) {
4403 1.1 ross if ( 0x403E <= aExp ) {
4404 1.1 ross aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4405 1.1 ross if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4406 1.1 ross && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4407 1.1 ross if ( aSig1 ) float_set_inexact();
4408 1.1 ross }
4409 1.1 ross else {
4410 1.1 ross float_raise( float_flag_invalid );
4411 1.1 ross if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4412 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
4413 1.1 ross }
4414 1.1 ross }
4415 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
4416 1.1 ross }
4417 1.1 ross z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4418 1.1 ross if ( (bits64) ( aSig1<<shiftCount ) ) {
4419 1.1 ross float_set_inexact();
4420 1.1 ross }
4421 1.1 ross }
4422 1.1 ross else {
4423 1.1 ross if ( aExp < 0x3FFF ) {
4424 1.1 ross if ( aExp | aSig0 | aSig1 ) {
4425 1.1 ross float_set_inexact();
4426 1.1 ross }
4427 1.1 ross return 0;
4428 1.1 ross }
4429 1.1 ross z = aSig0>>( - shiftCount );
4430 1.1 ross if ( aSig1
4431 1.1 ross || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4432 1.1 ross float_set_inexact();
4433 1.1 ross }
4434 1.1 ross }
4435 1.1 ross if ( aSign ) z = - z;
4436 1.1 ross return z;
4437 1.1 ross
4438 1.1 ross }
4439 1.1 ross
4440 1.1 ross /*
4441 1.1 ross -------------------------------------------------------------------------------
4442 1.1 ross Returns the result of converting the quadruple-precision floating-point
4443 1.1 ross value `a' to the single-precision floating-point format. The conversion
4444 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
4445 1.1 ross Arithmetic.
4446 1.1 ross -------------------------------------------------------------------------------
4447 1.1 ross */
4448 1.1 ross float32 float128_to_float32( float128 a )
4449 1.1 ross {
4450 1.1 ross flag aSign;
4451 1.1 ross int32 aExp;
4452 1.1 ross bits64 aSig0, aSig1;
4453 1.1 ross bits32 zSig;
4454 1.1 ross
4455 1.1 ross aSig1 = extractFloat128Frac1( a );
4456 1.1 ross aSig0 = extractFloat128Frac0( a );
4457 1.1 ross aExp = extractFloat128Exp( a );
4458 1.1 ross aSign = extractFloat128Sign( a );
4459 1.1 ross if ( aExp == 0x7FFF ) {
4460 1.1 ross if ( aSig0 | aSig1 ) {
4461 1.1 ross return commonNaNToFloat32( float128ToCommonNaN( a ) );
4462 1.1 ross }
4463 1.1 ross return packFloat32( aSign, 0xFF, 0 );
4464 1.1 ross }
4465 1.1 ross aSig0 |= ( aSig1 != 0 );
4466 1.1 ross shift64RightJamming( aSig0, 18, &aSig0 );
4467 1.1 ross zSig = aSig0;
4468 1.1 ross if ( aExp || zSig ) {
4469 1.1 ross zSig |= 0x40000000;
4470 1.1 ross aExp -= 0x3F81;
4471 1.1 ross }
4472 1.1 ross return roundAndPackFloat32( aSign, aExp, zSig );
4473 1.1 ross
4474 1.1 ross }
4475 1.1 ross
4476 1.1 ross /*
4477 1.1 ross -------------------------------------------------------------------------------
4478 1.1 ross Returns the result of converting the quadruple-precision floating-point
4479 1.1 ross value `a' to the double-precision floating-point format. The conversion
4480 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
4481 1.1 ross Arithmetic.
4482 1.1 ross -------------------------------------------------------------------------------
4483 1.1 ross */
4484 1.1 ross float64 float128_to_float64( float128 a )
4485 1.1 ross {
4486 1.1 ross flag aSign;
4487 1.1 ross int32 aExp;
4488 1.1 ross bits64 aSig0, aSig1;
4489 1.1 ross
4490 1.1 ross aSig1 = extractFloat128Frac1( a );
4491 1.1 ross aSig0 = extractFloat128Frac0( a );
4492 1.1 ross aExp = extractFloat128Exp( a );
4493 1.1 ross aSign = extractFloat128Sign( a );
4494 1.1 ross if ( aExp == 0x7FFF ) {
4495 1.1 ross if ( aSig0 | aSig1 ) {
4496 1.1 ross return commonNaNToFloat64( float128ToCommonNaN( a ) );
4497 1.1 ross }
4498 1.1 ross return packFloat64( aSign, 0x7FF, 0 );
4499 1.1 ross }
4500 1.1 ross shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4501 1.1 ross aSig0 |= ( aSig1 != 0 );
4502 1.1 ross if ( aExp || aSig0 ) {
4503 1.1 ross aSig0 |= LIT64( 0x4000000000000000 );
4504 1.1 ross aExp -= 0x3C01;
4505 1.1 ross }
4506 1.1 ross return roundAndPackFloat64( aSign, aExp, aSig0 );
4507 1.1 ross
4508 1.1 ross }
4509 1.1 ross
4510 1.1 ross #ifdef FLOATX80
4511 1.1 ross
4512 1.1 ross /*
4513 1.1 ross -------------------------------------------------------------------------------
4514 1.1 ross Returns the result of converting the quadruple-precision floating-point
4515 1.1 ross value `a' to the extended double-precision floating-point format. The
4516 1.1 ross conversion is performed according to the IEC/IEEE Standard for Binary
4517 1.1 ross Floating-Point Arithmetic.
4518 1.1 ross -------------------------------------------------------------------------------
4519 1.1 ross */
4520 1.1 ross floatx80 float128_to_floatx80( float128 a )
4521 1.1 ross {
4522 1.1 ross flag aSign;
4523 1.1 ross int32 aExp;
4524 1.1 ross bits64 aSig0, aSig1;
4525 1.1 ross
4526 1.1 ross aSig1 = extractFloat128Frac1( a );
4527 1.1 ross aSig0 = extractFloat128Frac0( a );
4528 1.1 ross aExp = extractFloat128Exp( a );
4529 1.1 ross aSign = extractFloat128Sign( a );
4530 1.1 ross if ( aExp == 0x7FFF ) {
4531 1.1 ross if ( aSig0 | aSig1 ) {
4532 1.1 ross return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4533 1.1 ross }
4534 1.1 ross return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4535 1.1 ross }
4536 1.1 ross if ( aExp == 0 ) {
4537 1.1 ross if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4538 1.1 ross normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4539 1.1 ross }
4540 1.1 ross else {
4541 1.1 ross aSig0 |= LIT64( 0x0001000000000000 );
4542 1.1 ross }
4543 1.1 ross shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4544 1.1 ross return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4545 1.1 ross
4546 1.1 ross }
4547 1.1 ross
4548 1.1 ross #endif
4549 1.1 ross
4550 1.1 ross /*
4551 1.1 ross -------------------------------------------------------------------------------
4552 1.1 ross Rounds the quadruple-precision floating-point value `a' to an integer, and
4553 1.1 ross returns the result as a quadruple-precision floating-point value. The
4554 1.1 ross operation is performed according to the IEC/IEEE Standard for Binary
4555 1.1 ross Floating-Point Arithmetic.
4556 1.1 ross -------------------------------------------------------------------------------
4557 1.1 ross */
4558 1.1 ross float128 float128_round_to_int( float128 a )
4559 1.1 ross {
4560 1.1 ross flag aSign;
4561 1.1 ross int32 aExp;
4562 1.1 ross bits64 lastBitMask, roundBitsMask;
4563 1.1 ross int8 roundingMode;
4564 1.1 ross float128 z;
4565 1.1 ross
4566 1.1 ross aExp = extractFloat128Exp( a );
4567 1.1 ross if ( 0x402F <= aExp ) {
4568 1.1 ross if ( 0x406F <= aExp ) {
4569 1.1 ross if ( ( aExp == 0x7FFF )
4570 1.1 ross && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4571 1.1 ross ) {
4572 1.1 ross return propagateFloat128NaN( a, a );
4573 1.1 ross }
4574 1.1 ross return a;
4575 1.1 ross }
4576 1.1 ross lastBitMask = 1;
4577 1.1 ross lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4578 1.1 ross roundBitsMask = lastBitMask - 1;
4579 1.1 ross z = a;
4580 1.1 ross roundingMode = float_rounding_mode();
4581 1.1 ross if ( roundingMode == float_round_nearest_even ) {
4582 1.1 ross if ( lastBitMask ) {
4583 1.1 ross add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4584 1.1 ross if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4585 1.1 ross }
4586 1.1 ross else {
4587 1.1 ross if ( (sbits64) z.low < 0 ) {
4588 1.1 ross ++z.high;
4589 1.1 ross if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4590 1.1 ross }
4591 1.1 ross }
4592 1.1 ross }
4593 1.1 ross else if ( roundingMode != float_round_to_zero ) {
4594 1.1 ross if ( extractFloat128Sign( z )
4595 1.1 ross ^ ( roundingMode == float_round_up ) ) {
4596 1.1 ross add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4597 1.1 ross }
4598 1.1 ross }
4599 1.1 ross z.low &= ~ roundBitsMask;
4600 1.1 ross }
4601 1.1 ross else {
4602 1.1 ross if ( aExp < 0x3FFF ) {
4603 1.1 ross if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4604 1.1 ross float_set_inexact();
4605 1.1 ross aSign = extractFloat128Sign( a );
4606 1.1 ross switch ( float_rounding_mode() ) {
4607 1.1 ross case float_round_nearest_even:
4608 1.1 ross if ( ( aExp == 0x3FFE )
4609 1.1 ross && ( extractFloat128Frac0( a )
4610 1.1 ross | extractFloat128Frac1( a ) )
4611 1.1 ross ) {
4612 1.1 ross return packFloat128( aSign, 0x3FFF, 0, 0 );
4613 1.1 ross }
4614 1.1 ross break;
4615 1.1 ross case float_round_down:
4616 1.1 ross return
4617 1.1 ross aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4618 1.1 ross : packFloat128( 0, 0, 0, 0 );
4619 1.1 ross case float_round_up:
4620 1.1 ross return
4621 1.1 ross aSign ? packFloat128( 1, 0, 0, 0 )
4622 1.1 ross : packFloat128( 0, 0x3FFF, 0, 0 );
4623 1.1 ross }
4624 1.1 ross return packFloat128( aSign, 0, 0, 0 );
4625 1.1 ross }
4626 1.1 ross lastBitMask = 1;
4627 1.1 ross lastBitMask <<= 0x402F - aExp;
4628 1.1 ross roundBitsMask = lastBitMask - 1;
4629 1.1 ross z.low = 0;
4630 1.1 ross z.high = a.high;
4631 1.1 ross roundingMode = float_rounding_mode();
4632 1.1 ross if ( roundingMode == float_round_nearest_even ) {
4633 1.1 ross z.high += lastBitMask>>1;
4634 1.1 ross if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4635 1.1 ross z.high &= ~ lastBitMask;
4636 1.1 ross }
4637 1.1 ross }
4638 1.1 ross else if ( roundingMode != float_round_to_zero ) {
4639 1.1 ross if ( extractFloat128Sign( z )
4640 1.1 ross ^ ( roundingMode == float_round_up ) ) {
4641 1.1 ross z.high |= ( a.low != 0 );
4642 1.1 ross z.high += roundBitsMask;
4643 1.1 ross }
4644 1.1 ross }
4645 1.1 ross z.high &= ~ roundBitsMask;
4646 1.1 ross }
4647 1.1 ross if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4648 1.1 ross float_set_inexact();
4649 1.1 ross }
4650 1.1 ross return z;
4651 1.1 ross
4652 1.1 ross }
4653 1.1 ross
4654 1.1 ross /*
4655 1.1 ross -------------------------------------------------------------------------------
4656 1.1 ross Returns the result of adding the absolute values of the quadruple-precision
4657 1.1 ross floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4658 1.1 ross before being returned. `zSign' is ignored if the result is a NaN.
4659 1.1 ross The addition is performed according to the IEC/IEEE Standard for Binary
4660 1.1 ross Floating-Point Arithmetic.
4661 1.1 ross -------------------------------------------------------------------------------
4662 1.1 ross */
4663 1.1 ross static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4664 1.1 ross {
4665 1.1 ross int32 aExp, bExp, zExp;
4666 1.1 ross bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4667 1.1 ross int32 expDiff;
4668 1.1 ross
4669 1.1 ross aSig1 = extractFloat128Frac1( a );
4670 1.1 ross aSig0 = extractFloat128Frac0( a );
4671 1.1 ross aExp = extractFloat128Exp( a );
4672 1.1 ross bSig1 = extractFloat128Frac1( b );
4673 1.1 ross bSig0 = extractFloat128Frac0( b );
4674 1.1 ross bExp = extractFloat128Exp( b );
4675 1.1 ross expDiff = aExp - bExp;
4676 1.1 ross if ( 0 < expDiff ) {
4677 1.1 ross if ( aExp == 0x7FFF ) {
4678 1.1 ross if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4679 1.1 ross return a;
4680 1.1 ross }
4681 1.1 ross if ( bExp == 0 ) {
4682 1.1 ross --expDiff;
4683 1.1 ross }
4684 1.1 ross else {
4685 1.1 ross bSig0 |= LIT64( 0x0001000000000000 );
4686 1.1 ross }
4687 1.1 ross shift128ExtraRightJamming(
4688 1.1 ross bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4689 1.1 ross zExp = aExp;
4690 1.1 ross }
4691 1.1 ross else if ( expDiff < 0 ) {
4692 1.1 ross if ( bExp == 0x7FFF ) {
4693 1.1 ross if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4694 1.1 ross return packFloat128( zSign, 0x7FFF, 0, 0 );
4695 1.1 ross }
4696 1.1 ross if ( aExp == 0 ) {
4697 1.1 ross ++expDiff;
4698 1.1 ross }
4699 1.1 ross else {
4700 1.1 ross aSig0 |= LIT64( 0x0001000000000000 );
4701 1.1 ross }
4702 1.1 ross shift128ExtraRightJamming(
4703 1.1 ross aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4704 1.1 ross zExp = bExp;
4705 1.1 ross }
4706 1.1 ross else {
4707 1.1 ross if ( aExp == 0x7FFF ) {
4708 1.1 ross if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4709 1.1 ross return propagateFloat128NaN( a, b );
4710 1.1 ross }
4711 1.1 ross return a;
4712 1.1 ross }
4713 1.1 ross add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4714 1.1 ross if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4715 1.1 ross zSig2 = 0;
4716 1.1 ross zSig0 |= LIT64( 0x0002000000000000 );
4717 1.1 ross zExp = aExp;
4718 1.1 ross goto shiftRight1;
4719 1.1 ross }
4720 1.1 ross aSig0 |= LIT64( 0x0001000000000000 );
4721 1.1 ross add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4722 1.1 ross --zExp;
4723 1.1 ross if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4724 1.1 ross ++zExp;
4725 1.1 ross shiftRight1:
4726 1.1 ross shift128ExtraRightJamming(
4727 1.1 ross zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4728 1.1 ross roundAndPack:
4729 1.1 ross return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4730 1.1 ross
4731 1.1 ross }
4732 1.1 ross
4733 1.1 ross /*
4734 1.1 ross -------------------------------------------------------------------------------
4735 1.1 ross Returns the result of subtracting the absolute values of the quadruple-
4736 1.1 ross precision floating-point values `a' and `b'. If `zSign' is 1, the
4737 1.1 ross difference is negated before being returned. `zSign' is ignored if the
4738 1.1 ross result is a NaN. The subtraction is performed according to the IEC/IEEE
4739 1.1 ross Standard for Binary Floating-Point Arithmetic.
4740 1.1 ross -------------------------------------------------------------------------------
4741 1.1 ross */
4742 1.1 ross static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4743 1.1 ross {
4744 1.1 ross int32 aExp, bExp, zExp;
4745 1.1 ross bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4746 1.1 ross int32 expDiff;
4747 1.1 ross float128 z;
4748 1.1 ross
4749 1.1 ross aSig1 = extractFloat128Frac1( a );
4750 1.1 ross aSig0 = extractFloat128Frac0( a );
4751 1.1 ross aExp = extractFloat128Exp( a );
4752 1.1 ross bSig1 = extractFloat128Frac1( b );
4753 1.1 ross bSig0 = extractFloat128Frac0( b );
4754 1.1 ross bExp = extractFloat128Exp( b );
4755 1.1 ross expDiff = aExp - bExp;
4756 1.1 ross shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4757 1.1 ross shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4758 1.1 ross if ( 0 < expDiff ) goto aExpBigger;
4759 1.1 ross if ( expDiff < 0 ) goto bExpBigger;
4760 1.1 ross if ( aExp == 0x7FFF ) {
4761 1.1 ross if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4762 1.1 ross return propagateFloat128NaN( a, b );
4763 1.1 ross }
4764 1.1 ross float_raise( float_flag_invalid );
4765 1.1 ross z.low = float128_default_nan_low;
4766 1.1 ross z.high = float128_default_nan_high;
4767 1.1 ross return z;
4768 1.1 ross }
4769 1.1 ross if ( aExp == 0 ) {
4770 1.1 ross aExp = 1;
4771 1.1 ross bExp = 1;
4772 1.1 ross }
4773 1.1 ross if ( bSig0 < aSig0 ) goto aBigger;
4774 1.1 ross if ( aSig0 < bSig0 ) goto bBigger;
4775 1.1 ross if ( bSig1 < aSig1 ) goto aBigger;
4776 1.1 ross if ( aSig1 < bSig1 ) goto bBigger;
4777 1.1 ross return packFloat128( float_rounding_mode() == float_round_down, 0, 0, 0 );
4778 1.1 ross bExpBigger:
4779 1.1 ross if ( bExp == 0x7FFF ) {
4780 1.1 ross if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4781 1.1 ross return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4782 1.1 ross }
4783 1.1 ross if ( aExp == 0 ) {
4784 1.1 ross ++expDiff;
4785 1.1 ross }
4786 1.1 ross else {
4787 1.1 ross aSig0 |= LIT64( 0x4000000000000000 );
4788 1.1 ross }
4789 1.1 ross shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4790 1.1 ross bSig0 |= LIT64( 0x4000000000000000 );
4791 1.1 ross bBigger:
4792 1.1 ross sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4793 1.1 ross zExp = bExp;
4794 1.1 ross zSign ^= 1;
4795 1.1 ross goto normalizeRoundAndPack;
4796 1.1 ross aExpBigger:
4797 1.1 ross if ( aExp == 0x7FFF ) {
4798 1.1 ross if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4799 1.1 ross return a;
4800 1.1 ross }
4801 1.1 ross if ( bExp == 0 ) {
4802 1.1 ross --expDiff;
4803 1.1 ross }
4804 1.1 ross else {
4805 1.1 ross bSig0 |= LIT64( 0x4000000000000000 );
4806 1.1 ross }
4807 1.1 ross shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4808 1.1 ross aSig0 |= LIT64( 0x4000000000000000 );
4809 1.1 ross aBigger:
4810 1.1 ross sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4811 1.1 ross zExp = aExp;
4812 1.1 ross normalizeRoundAndPack:
4813 1.1 ross --zExp;
4814 1.1 ross return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4815 1.1 ross
4816 1.1 ross }
4817 1.1 ross
4818 1.1 ross /*
4819 1.1 ross -------------------------------------------------------------------------------
4820 1.1 ross Returns the result of adding the quadruple-precision floating-point values
4821 1.1 ross `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4822 1.1 ross for Binary Floating-Point Arithmetic.
4823 1.1 ross -------------------------------------------------------------------------------
4824 1.1 ross */
4825 1.1 ross float128 float128_add( float128 a, float128 b )
4826 1.1 ross {
4827 1.1 ross flag aSign, bSign;
4828 1.1 ross
4829 1.1 ross aSign = extractFloat128Sign( a );
4830 1.1 ross bSign = extractFloat128Sign( b );
4831 1.1 ross if ( aSign == bSign ) {
4832 1.1 ross return addFloat128Sigs( a, b, aSign );
4833 1.1 ross }
4834 1.1 ross else {
4835 1.1 ross return subFloat128Sigs( a, b, aSign );
4836 1.1 ross }
4837 1.1 ross
4838 1.1 ross }
4839 1.1 ross
4840 1.1 ross /*
4841 1.1 ross -------------------------------------------------------------------------------
4842 1.1 ross Returns the result of subtracting the quadruple-precision floating-point
4843 1.1 ross values `a' and `b'. The operation is performed according to the IEC/IEEE
4844 1.1 ross Standard for Binary Floating-Point Arithmetic.
4845 1.1 ross -------------------------------------------------------------------------------
4846 1.1 ross */
4847 1.1 ross float128 float128_sub( float128 a, float128 b )
4848 1.1 ross {
4849 1.1 ross flag aSign, bSign;
4850 1.1 ross
4851 1.1 ross aSign = extractFloat128Sign( a );
4852 1.1 ross bSign = extractFloat128Sign( b );
4853 1.1 ross if ( aSign == bSign ) {
4854 1.1 ross return subFloat128Sigs( a, b, aSign );
4855 1.1 ross }
4856 1.1 ross else {
4857 1.1 ross return addFloat128Sigs( a, b, aSign );
4858 1.1 ross }
4859 1.1 ross
4860 1.1 ross }
4861 1.1 ross
4862 1.1 ross /*
4863 1.1 ross -------------------------------------------------------------------------------
4864 1.1 ross Returns the result of multiplying the quadruple-precision floating-point
4865 1.1 ross values `a' and `b'. The operation is performed according to the IEC/IEEE
4866 1.1 ross Standard for Binary Floating-Point Arithmetic.
4867 1.1 ross -------------------------------------------------------------------------------
4868 1.1 ross */
4869 1.1 ross float128 float128_mul( float128 a, float128 b )
4870 1.1 ross {
4871 1.1 ross flag aSign, bSign, zSign;
4872 1.1 ross int32 aExp, bExp, zExp;
4873 1.1 ross bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4874 1.1 ross float128 z;
4875 1.1 ross
4876 1.1 ross aSig1 = extractFloat128Frac1( a );
4877 1.1 ross aSig0 = extractFloat128Frac0( a );
4878 1.1 ross aExp = extractFloat128Exp( a );
4879 1.1 ross aSign = extractFloat128Sign( a );
4880 1.1 ross bSig1 = extractFloat128Frac1( b );
4881 1.1 ross bSig0 = extractFloat128Frac0( b );
4882 1.1 ross bExp = extractFloat128Exp( b );
4883 1.1 ross bSign = extractFloat128Sign( b );
4884 1.1 ross zSign = aSign ^ bSign;
4885 1.1 ross if ( aExp == 0x7FFF ) {
4886 1.1 ross if ( ( aSig0 | aSig1 )
4887 1.1 ross || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4888 1.1 ross return propagateFloat128NaN( a, b );
4889 1.1 ross }
4890 1.1 ross if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4891 1.1 ross return packFloat128( zSign, 0x7FFF, 0, 0 );
4892 1.1 ross }
4893 1.1 ross if ( bExp == 0x7FFF ) {
4894 1.1 ross if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4895 1.1 ross if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4896 1.1 ross invalid:
4897 1.1 ross float_raise( float_flag_invalid );
4898 1.1 ross z.low = float128_default_nan_low;
4899 1.1 ross z.high = float128_default_nan_high;
4900 1.1 ross return z;
4901 1.1 ross }
4902 1.1 ross return packFloat128( zSign, 0x7FFF, 0, 0 );
4903 1.1 ross }
4904 1.1 ross if ( aExp == 0 ) {
4905 1.1 ross if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4906 1.1 ross normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4907 1.1 ross }
4908 1.1 ross if ( bExp == 0 ) {
4909 1.1 ross if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4910 1.1 ross normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4911 1.1 ross }
4912 1.1 ross zExp = aExp + bExp - 0x4000;
4913 1.1 ross aSig0 |= LIT64( 0x0001000000000000 );
4914 1.1 ross shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4915 1.1 ross mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4916 1.1 ross add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4917 1.1 ross zSig2 |= ( zSig3 != 0 );
4918 1.1 ross if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4919 1.1 ross shift128ExtraRightJamming(
4920 1.1 ross zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4921 1.1 ross ++zExp;
4922 1.1 ross }
4923 1.1 ross return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4924 1.1 ross
4925 1.1 ross }
4926 1.1 ross
4927 1.1 ross /*
4928 1.1 ross -------------------------------------------------------------------------------
4929 1.1 ross Returns the result of dividing the quadruple-precision floating-point value
4930 1.1 ross `a' by the corresponding value `b'. The operation is performed according to
4931 1.1 ross the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4932 1.1 ross -------------------------------------------------------------------------------
4933 1.1 ross */
4934 1.1 ross float128 float128_div( float128 a, float128 b )
4935 1.1 ross {
4936 1.1 ross flag aSign, bSign, zSign;
4937 1.1 ross int32 aExp, bExp, zExp;
4938 1.1 ross bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4939 1.1 ross bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4940 1.1 ross float128 z;
4941 1.1 ross
4942 1.1 ross aSig1 = extractFloat128Frac1( a );
4943 1.1 ross aSig0 = extractFloat128Frac0( a );
4944 1.1 ross aExp = extractFloat128Exp( a );
4945 1.1 ross aSign = extractFloat128Sign( a );
4946 1.1 ross bSig1 = extractFloat128Frac1( b );
4947 1.1 ross bSig0 = extractFloat128Frac0( b );
4948 1.1 ross bExp = extractFloat128Exp( b );
4949 1.1 ross bSign = extractFloat128Sign( b );
4950 1.1 ross zSign = aSign ^ bSign;
4951 1.1 ross if ( aExp == 0x7FFF ) {
4952 1.1 ross if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4953 1.1 ross if ( bExp == 0x7FFF ) {
4954 1.1 ross if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4955 1.1 ross goto invalid;
4956 1.1 ross }
4957 1.1 ross return packFloat128( zSign, 0x7FFF, 0, 0 );
4958 1.1 ross }
4959 1.1 ross if ( bExp == 0x7FFF ) {
4960 1.1 ross if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4961 1.1 ross return packFloat128( zSign, 0, 0, 0 );
4962 1.1 ross }
4963 1.1 ross if ( bExp == 0 ) {
4964 1.1 ross if ( ( bSig0 | bSig1 ) == 0 ) {
4965 1.1 ross if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4966 1.1 ross invalid:
4967 1.1 ross float_raise( float_flag_invalid );
4968 1.1 ross z.low = float128_default_nan_low;
4969 1.1 ross z.high = float128_default_nan_high;
4970 1.1 ross return z;
4971 1.1 ross }
4972 1.1 ross float_raise( float_flag_divbyzero );
4973 1.1 ross return packFloat128( zSign, 0x7FFF, 0, 0 );
4974 1.1 ross }
4975 1.1 ross normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4976 1.1 ross }
4977 1.1 ross if ( aExp == 0 ) {
4978 1.1 ross if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4979 1.1 ross normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4980 1.1 ross }
4981 1.1 ross zExp = aExp - bExp + 0x3FFD;
4982 1.1 ross shortShift128Left(
4983 1.1 ross aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4984 1.1 ross shortShift128Left(
4985 1.1 ross bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4986 1.1 ross if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4987 1.1 ross shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4988 1.1 ross ++zExp;
4989 1.1 ross }
4990 1.1 ross zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4991 1.1 ross mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4992 1.1 ross sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4993 1.1 ross while ( (sbits64) rem0 < 0 ) {
4994 1.1 ross --zSig0;
4995 1.1 ross add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4996 1.1 ross }
4997 1.1 ross zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4998 1.1 ross if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4999 1.1 ross mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5000 1.1 ross sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5001 1.1 ross while ( (sbits64) rem1 < 0 ) {
5002 1.1 ross --zSig1;
5003 1.1 ross add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5004 1.1 ross }
5005 1.1 ross zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5006 1.1 ross }
5007 1.1 ross shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5008 1.1 ross return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5009 1.1 ross
5010 1.1 ross }
5011 1.1 ross
5012 1.1 ross /*
5013 1.1 ross -------------------------------------------------------------------------------
5014 1.1 ross Returns the remainder of the quadruple-precision floating-point value `a'
5015 1.1 ross with respect to the corresponding value `b'. The operation is performed
5016 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5017 1.1 ross -------------------------------------------------------------------------------
5018 1.1 ross */
5019 1.1 ross float128 float128_rem( float128 a, float128 b )
5020 1.1 ross {
5021 1.1 ross flag aSign, bSign, zSign;
5022 1.1 ross int32 aExp, bExp, expDiff;
5023 1.1 ross bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5024 1.1 ross bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5025 1.1 ross sbits64 sigMean0;
5026 1.1 ross float128 z;
5027 1.1 ross
5028 1.1 ross aSig1 = extractFloat128Frac1( a );
5029 1.1 ross aSig0 = extractFloat128Frac0( a );
5030 1.1 ross aExp = extractFloat128Exp( a );
5031 1.1 ross aSign = extractFloat128Sign( a );
5032 1.1 ross bSig1 = extractFloat128Frac1( b );
5033 1.1 ross bSig0 = extractFloat128Frac0( b );
5034 1.1 ross bExp = extractFloat128Exp( b );
5035 1.1 ross bSign = extractFloat128Sign( b );
5036 1.1 ross if ( aExp == 0x7FFF ) {
5037 1.1 ross if ( ( aSig0 | aSig1 )
5038 1.1 ross || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5039 1.1 ross return propagateFloat128NaN( a, b );
5040 1.1 ross }
5041 1.1 ross goto invalid;
5042 1.1 ross }
5043 1.1 ross if ( bExp == 0x7FFF ) {
5044 1.1 ross if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5045 1.1 ross return a;
5046 1.1 ross }
5047 1.1 ross if ( bExp == 0 ) {
5048 1.1 ross if ( ( bSig0 | bSig1 ) == 0 ) {
5049 1.1 ross invalid:
5050 1.1 ross float_raise( float_flag_invalid );
5051 1.1 ross z.low = float128_default_nan_low;
5052 1.1 ross z.high = float128_default_nan_high;
5053 1.1 ross return z;
5054 1.1 ross }
5055 1.1 ross normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5056 1.1 ross }
5057 1.1 ross if ( aExp == 0 ) {
5058 1.1 ross if ( ( aSig0 | aSig1 ) == 0 ) return a;
5059 1.1 ross normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5060 1.1 ross }
5061 1.1 ross expDiff = aExp - bExp;
5062 1.1 ross if ( expDiff < -1 ) return a;
5063 1.1 ross shortShift128Left(
5064 1.1 ross aSig0 | LIT64( 0x0001000000000000 ),
5065 1.1 ross aSig1,
5066 1.1 ross 15 - ( expDiff < 0 ),
5067 1.1 ross &aSig0,
5068 1.1 ross &aSig1
5069 1.1 ross );
5070 1.1 ross shortShift128Left(
5071 1.1 ross bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5072 1.1 ross q = le128( bSig0, bSig1, aSig0, aSig1 );
5073 1.1 ross if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5074 1.1 ross expDiff -= 64;
5075 1.1 ross while ( 0 < expDiff ) {
5076 1.1 ross q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5077 1.1 ross q = ( 4 < q ) ? q - 4 : 0;
5078 1.1 ross mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5079 1.1 ross shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5080 1.1 ross shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5081 1.1 ross sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5082 1.1 ross expDiff -= 61;
5083 1.1 ross }
5084 1.1 ross if ( -64 < expDiff ) {
5085 1.1 ross q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5086 1.1 ross q = ( 4 < q ) ? q - 4 : 0;
5087 1.1 ross q >>= - expDiff;
5088 1.1 ross shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5089 1.1 ross expDiff += 52;
5090 1.1 ross if ( expDiff < 0 ) {
5091 1.1 ross shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5092 1.1 ross }
5093 1.1 ross else {
5094 1.1 ross shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5095 1.1 ross }
5096 1.1 ross mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5097 1.1 ross sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5098 1.1 ross }
5099 1.1 ross else {
5100 1.1 ross shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5101 1.1 ross shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5102 1.1 ross }
5103 1.1 ross do {
5104 1.1 ross alternateASig0 = aSig0;
5105 1.1 ross alternateASig1 = aSig1;
5106 1.1 ross ++q;
5107 1.1 ross sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5108 1.1 ross } while ( 0 <= (sbits64) aSig0 );
5109 1.1 ross add128(
5110 1.1 ross aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
5111 1.1 ross if ( ( sigMean0 < 0 )
5112 1.1 ross || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5113 1.1 ross aSig0 = alternateASig0;
5114 1.1 ross aSig1 = alternateASig1;
5115 1.1 ross }
5116 1.1 ross zSign = ( (sbits64) aSig0 < 0 );
5117 1.1 ross if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5118 1.1 ross return
5119 1.1 ross normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5120 1.1 ross
5121 1.1 ross }
5122 1.1 ross
5123 1.1 ross /*
5124 1.1 ross -------------------------------------------------------------------------------
5125 1.1 ross Returns the square root of the quadruple-precision floating-point value `a'.
5126 1.1 ross The operation is performed according to the IEC/IEEE Standard for Binary
5127 1.1 ross Floating-Point Arithmetic.
5128 1.1 ross -------------------------------------------------------------------------------
5129 1.1 ross */
5130 1.1 ross float128 float128_sqrt( float128 a )
5131 1.1 ross {
5132 1.1 ross flag aSign;
5133 1.1 ross int32 aExp, zExp;
5134 1.1 ross bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5135 1.1 ross bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5136 1.1 ross float128 z;
5137 1.1 ross
5138 1.1 ross aSig1 = extractFloat128Frac1( a );
5139 1.1 ross aSig0 = extractFloat128Frac0( a );
5140 1.1 ross aExp = extractFloat128Exp( a );
5141 1.1 ross aSign = extractFloat128Sign( a );
5142 1.1 ross if ( aExp == 0x7FFF ) {
5143 1.1 ross if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5144 1.1 ross if ( ! aSign ) return a;
5145 1.1 ross goto invalid;
5146 1.1 ross }
5147 1.1 ross if ( aSign ) {
5148 1.1 ross if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5149 1.1 ross invalid:
5150 1.1 ross float_raise( float_flag_invalid );
5151 1.1 ross z.low = float128_default_nan_low;
5152 1.1 ross z.high = float128_default_nan_high;
5153 1.1 ross return z;
5154 1.1 ross }
5155 1.1 ross if ( aExp == 0 ) {
5156 1.1 ross if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5157 1.1 ross normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5158 1.1 ross }
5159 1.1 ross zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5160 1.1 ross aSig0 |= LIT64( 0x0001000000000000 );
5161 1.1 ross zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5162 1.1 ross shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5163 1.1 ross zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5164 1.1 ross doubleZSig0 = zSig0<<1;
5165 1.1 ross mul64To128( zSig0, zSig0, &term0, &term1 );
5166 1.1 ross sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5167 1.1 ross while ( (sbits64) rem0 < 0 ) {
5168 1.1 ross --zSig0;
5169 1.1 ross doubleZSig0 -= 2;
5170 1.1 ross add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5171 1.1 ross }
5172 1.1 ross zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5173 1.1 ross if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5174 1.1 ross if ( zSig1 == 0 ) zSig1 = 1;
5175 1.1 ross mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5176 1.1 ross sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5177 1.1 ross mul64To128( zSig1, zSig1, &term2, &term3 );
5178 1.1 ross sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5179 1.1 ross while ( (sbits64) rem1 < 0 ) {
5180 1.1 ross --zSig1;
5181 1.1 ross shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5182 1.1 ross term3 |= 1;
5183 1.1 ross term2 |= doubleZSig0;
5184 1.1 ross add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5185 1.1 ross }
5186 1.1 ross zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5187 1.1 ross }
5188 1.1 ross shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5189 1.1 ross return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5190 1.1 ross
5191 1.1 ross }
5192 1.1 ross
5193 1.1 ross /*
5194 1.1 ross -------------------------------------------------------------------------------
5195 1.1 ross Returns 1 if the quadruple-precision floating-point value `a' is equal to
5196 1.1 ross the corresponding value `b', and 0 otherwise. The comparison is performed
5197 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5198 1.1 ross -------------------------------------------------------------------------------
5199 1.1 ross */
5200 1.1 ross flag float128_eq( float128 a, float128 b )
5201 1.1 ross {
5202 1.1 ross
5203 1.1 ross if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5204 1.1 ross && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5205 1.1 ross || ( ( extractFloat128Exp( b ) == 0x7FFF )
5206 1.1 ross && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5207 1.1 ross ) {
5208 1.1 ross if ( float128_is_signaling_nan( a )
5209 1.1 ross || float128_is_signaling_nan( b ) ) {
5210 1.1 ross float_raise( float_flag_invalid );
5211 1.1 ross }
5212 1.1 ross return 0;
5213 1.1 ross }
5214 1.1 ross return
5215 1.1 ross ( a.low == b.low )
5216 1.1 ross && ( ( a.high == b.high )
5217 1.1 ross || ( ( a.low == 0 )
5218 1.1 ross && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5219 1.1 ross );
5220 1.1 ross
5221 1.1 ross }
5222 1.1 ross
5223 1.1 ross /*
5224 1.1 ross -------------------------------------------------------------------------------
5225 1.1 ross Returns 1 if the quadruple-precision floating-point value `a' is less than
5226 1.1 ross or equal to the corresponding value `b', and 0 otherwise. The comparison
5227 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
5228 1.1 ross Arithmetic.
5229 1.1 ross -------------------------------------------------------------------------------
5230 1.1 ross */
5231 1.1 ross flag float128_le( float128 a, float128 b )
5232 1.1 ross {
5233 1.1 ross flag aSign, bSign;
5234 1.1 ross
5235 1.1 ross if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5236 1.1 ross && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5237 1.1 ross || ( ( extractFloat128Exp( b ) == 0x7FFF )
5238 1.1 ross && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5239 1.1 ross ) {
5240 1.1 ross float_raise( float_flag_invalid );
5241 1.1 ross return 0;
5242 1.1 ross }
5243 1.1 ross aSign = extractFloat128Sign( a );
5244 1.1 ross bSign = extractFloat128Sign( b );
5245 1.1 ross if ( aSign != bSign ) {
5246 1.1 ross return
5247 1.1 ross aSign
5248 1.1 ross || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5249 1.1 ross == 0 );
5250 1.1 ross }
5251 1.1 ross return
5252 1.1 ross aSign ? le128( b.high, b.low, a.high, a.low )
5253 1.1 ross : le128( a.high, a.low, b.high, b.low );
5254 1.1 ross
5255 1.1 ross }
5256 1.1 ross
5257 1.1 ross /*
5258 1.1 ross -------------------------------------------------------------------------------
5259 1.1 ross Returns 1 if the quadruple-precision floating-point value `a' is less than
5260 1.1 ross the corresponding value `b', and 0 otherwise. The comparison is performed
5261 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5262 1.1 ross -------------------------------------------------------------------------------
5263 1.1 ross */
5264 1.1 ross flag float128_lt( float128 a, float128 b )
5265 1.1 ross {
5266 1.1 ross flag aSign, bSign;
5267 1.1 ross
5268 1.1 ross if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5269 1.1 ross && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5270 1.1 ross || ( ( extractFloat128Exp( b ) == 0x7FFF )
5271 1.1 ross && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5272 1.1 ross ) {
5273 1.1 ross float_raise( float_flag_invalid );
5274 1.1 ross return 0;
5275 1.1 ross }
5276 1.1 ross aSign = extractFloat128Sign( a );
5277 1.1 ross bSign = extractFloat128Sign( b );
5278 1.1 ross if ( aSign != bSign ) {
5279 1.1 ross return
5280 1.1 ross aSign
5281 1.1 ross && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5282 1.1 ross != 0 );
5283 1.1 ross }
5284 1.1 ross return
5285 1.1 ross aSign ? lt128( b.high, b.low, a.high, a.low )
5286 1.1 ross : lt128( a.high, a.low, b.high, b.low );
5287 1.1 ross
5288 1.1 ross }
5289 1.1 ross
5290 1.1 ross /*
5291 1.1 ross -------------------------------------------------------------------------------
5292 1.1 ross Returns 1 if the quadruple-precision floating-point value `a' is equal to
5293 1.1 ross the corresponding value `b', and 0 otherwise. The invalid exception is
5294 1.1 ross raised if either operand is a NaN. Otherwise, the comparison is performed
5295 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5296 1.1 ross -------------------------------------------------------------------------------
5297 1.1 ross */
5298 1.1 ross flag float128_eq_signaling( float128 a, float128 b )
5299 1.1 ross {
5300 1.1 ross
5301 1.1 ross if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5302 1.1 ross && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5303 1.1 ross || ( ( extractFloat128Exp( b ) == 0x7FFF )
5304 1.1 ross && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5305 1.1 ross ) {
5306 1.1 ross float_raise( float_flag_invalid );
5307 1.1 ross return 0;
5308 1.1 ross }
5309 1.1 ross return
5310 1.1 ross ( a.low == b.low )
5311 1.1 ross && ( ( a.high == b.high )
5312 1.1 ross || ( ( a.low == 0 )
5313 1.1 ross && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5314 1.1 ross );
5315 1.1 ross
5316 1.1 ross }
5317 1.1 ross
5318 1.1 ross /*
5319 1.1 ross -------------------------------------------------------------------------------
5320 1.1 ross Returns 1 if the quadruple-precision floating-point value `a' is less than
5321 1.1 ross or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5322 1.1 ross cause an exception. Otherwise, the comparison is performed according to the
5323 1.1 ross IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5324 1.1 ross -------------------------------------------------------------------------------
5325 1.1 ross */
5326 1.1 ross flag float128_le_quiet( float128 a, float128 b )
5327 1.1 ross {
5328 1.1 ross flag aSign, bSign;
5329 1.1 ross
5330 1.1 ross if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5331 1.1 ross && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5332 1.1 ross || ( ( extractFloat128Exp( b ) == 0x7FFF )
5333 1.1 ross && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5334 1.1 ross ) {
5335 1.1 ross if ( float128_is_signaling_nan( a )
5336 1.1 ross || float128_is_signaling_nan( b ) ) {
5337 1.1 ross float_raise( float_flag_invalid );
5338 1.1 ross }
5339 1.1 ross return 0;
5340 1.1 ross }
5341 1.1 ross aSign = extractFloat128Sign( a );
5342 1.1 ross bSign = extractFloat128Sign( b );
5343 1.1 ross if ( aSign != bSign ) {
5344 1.1 ross return
5345 1.1 ross aSign
5346 1.1 ross || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5347 1.1 ross == 0 );
5348 1.1 ross }
5349 1.1 ross return
5350 1.1 ross aSign ? le128( b.high, b.low, a.high, a.low )
5351 1.1 ross : le128( a.high, a.low, b.high, b.low );
5352 1.1 ross
5353 1.1 ross }
5354 1.1 ross
5355 1.1 ross /*
5356 1.1 ross -------------------------------------------------------------------------------
5357 1.1 ross Returns 1 if the quadruple-precision floating-point value `a' is less than
5358 1.1 ross the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5359 1.1 ross exception. Otherwise, the comparison is performed according to the IEC/IEEE
5360 1.1 ross Standard for Binary Floating-Point Arithmetic.
5361 1.1 ross -------------------------------------------------------------------------------
5362 1.1 ross */
5363 1.1 ross flag float128_lt_quiet( float128 a, float128 b )
5364 1.1 ross {
5365 1.1 ross flag aSign, bSign;
5366 1.1 ross
5367 1.1 ross if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5368 1.1 ross && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5369 1.1 ross || ( ( extractFloat128Exp( b ) == 0x7FFF )
5370 1.1 ross && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5371 1.1 ross ) {
5372 1.1 ross if ( float128_is_signaling_nan( a )
5373 1.1 ross || float128_is_signaling_nan( b ) ) {
5374 1.1 ross float_raise( float_flag_invalid );
5375 1.1 ross }
5376 1.1 ross return 0;
5377 1.1 ross }
5378 1.1 ross aSign = extractFloat128Sign( a );
5379 1.1 ross bSign = extractFloat128Sign( b );
5380 1.1 ross if ( aSign != bSign ) {
5381 1.1 ross return
5382 1.1 ross aSign
5383 1.1 ross && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5384 1.1 ross != 0 );
5385 1.1 ross }
5386 1.1 ross return
5387 1.1 ross aSign ? lt128( b.high, b.low, a.high, a.low )
5388 1.1 ross : lt128( a.high, a.low, b.high, b.low );
5389 1.1 ross
5390 1.1 ross }
5391 1.1 ross
5392 1.1 ross #endif
5393 1.1 ross
5394 1.1 ross
5395 1.1 ross #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5396 1.1 ross
5397 1.1 ross /*
5398 1.1 ross * These two routines are not part of the original softfloat distribution.
5399 1.1 ross *
5400 1.1 ross * They are based on the corresponding conversions to integer but return
5401 1.1 ross * unsigned numbers instead since these functions are required by GCC.
5402 1.1 ross *
5403 1.1 ross * Added by Mark Brinicombe <mark (at) netbsd.org> 27/09/97
5404 1.1 ross *
5405 1.1 ross * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5406 1.1 ross */
5407 1.1 ross
5408 1.1 ross /*
5409 1.1 ross -------------------------------------------------------------------------------
5410 1.1 ross Returns the result of converting the double-precision floating-point value
5411 1.1 ross `a' to the 32-bit unsigned integer format. The conversion is
5412 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-point
5413 1.1 ross Arithmetic, except that the conversion is always rounded toward zero. If
5414 1.1 ross `a' is a NaN, the largest positive integer is returned. If the conversion
5415 1.1 ross overflows, the largest integer positive is returned.
5416 1.1 ross -------------------------------------------------------------------------------
5417 1.1 ross */
5418 1.1 ross uint32 float64_to_uint32_round_to_zero( float64 a )
5419 1.1 ross {
5420 1.1 ross flag aSign;
5421 1.1 ross int16 aExp, shiftCount;
5422 1.1 ross bits64 aSig, savedASig;
5423 1.1 ross uint32 z;
5424 1.1 ross
5425 1.1 ross aSig = extractFloat64Frac( a );
5426 1.1 ross aExp = extractFloat64Exp( a );
5427 1.1 ross aSign = extractFloat64Sign( a );
5428 1.1 ross
5429 1.1 ross if (aSign) {
5430 1.1 ross float_raise( float_flag_invalid );
5431 1.1 ross return(0);
5432 1.1 ross }
5433 1.1 ross
5434 1.1 ross if ( 0x41E < aExp ) {
5435 1.1 ross float_raise( float_flag_invalid );
5436 1.1 ross return 0xffffffff;
5437 1.1 ross }
5438 1.1 ross else if ( aExp < 0x3FF ) {
5439 1.1 ross if ( aExp || aSig ) float_set_inexact();
5440 1.1 ross return 0;
5441 1.1 ross }
5442 1.1 ross aSig |= LIT64( 0x0010000000000000 );
5443 1.1 ross shiftCount = 0x433 - aExp;
5444 1.1 ross savedASig = aSig;
5445 1.1 ross aSig >>= shiftCount;
5446 1.1 ross z = aSig;
5447 1.1 ross if ( ( aSig<<shiftCount ) != savedASig ) {
5448 1.1 ross float_set_inexact();
5449 1.1 ross }
5450 1.1 ross return z;
5451 1.1 ross
5452 1.1 ross }
5453 1.1 ross
5454 1.1 ross /*
5455 1.1 ross -------------------------------------------------------------------------------
5456 1.1 ross Returns the result of converting the single-precision floating-point value
5457 1.1 ross `a' to the 32-bit unsigned integer format. The conversion is
5458 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-point
5459 1.1 ross Arithmetic, except that the conversion is always rounded toward zero. If
5460 1.1 ross `a' is a NaN, the largest positive integer is returned. If the conversion
5461 1.1 ross overflows, the largest positive integer is returned.
5462 1.1 ross -------------------------------------------------------------------------------
5463 1.1 ross */
5464 1.1 ross uint32 float32_to_uint32_round_to_zero( float32 a )
5465 1.1 ross {
5466 1.1 ross flag aSign;
5467 1.1 ross int16 aExp, shiftCount;
5468 1.1 ross bits32 aSig;
5469 1.1 ross uint32 z;
5470 1.1 ross
5471 1.1 ross aSig = extractFloat32Frac( a );
5472 1.1 ross aExp = extractFloat32Exp( a );
5473 1.1 ross aSign = extractFloat32Sign( a );
5474 1.1 ross shiftCount = aExp - 0x9E;
5475 1.1 ross
5476 1.1 ross if (aSign) {
5477 1.1 ross float_raise( float_flag_invalid );
5478 1.1 ross return(0);
5479 1.1 ross }
5480 1.1 ross if ( 0 < shiftCount ) {
5481 1.1 ross float_raise( float_flag_invalid );
5482 1.1 ross return 0xFFFFFFFF;
5483 1.1 ross }
5484 1.1 ross else if ( aExp <= 0x7E ) {
5485 1.1 ross if ( aExp | aSig ) float_set_inexact();
5486 1.1 ross return 0;
5487 1.1 ross }
5488 1.1 ross aSig = ( aSig | 0x800000 )<<8;
5489 1.1 ross z = aSig>>( - shiftCount );
5490 1.1 ross if ( aSig<<( shiftCount & 31 ) ) {
5491 1.1 ross float_set_inexact();
5492 1.1 ross }
5493 1.1 ross return z;
5494 1.1 ross
5495 1.1 ross }
5496 1.1 ross
5497 1.1 ross #endif
5498