softfloat.c revision 1.3 1 1.3 keihan /* $NetBSD: softfloat.c,v 1.3 2003/12/04 13:57:31 keihan Exp $ */
2 1.1 ross
3 1.1 ross /*
4 1.1 ross * This version hacked for use with gcc -msoft-float by bjh21.
5 1.1 ross * (Mostly a case of #ifdefing out things GCC doesn't need or provides
6 1.1 ross * itself).
7 1.1 ross */
8 1.1 ross
9 1.1 ross /*
10 1.1 ross * Things you may want to define:
11 1.1 ross *
12 1.1 ross * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13 1.1 ross * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them
14 1.1 ross * properly renamed.
15 1.1 ross */
16 1.1 ross
17 1.1 ross /*
18 1.1 ross ===============================================================================
19 1.1 ross
20 1.1 ross This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 1.1 ross Arithmetic Package, Release 2a.
22 1.1 ross
23 1.1 ross Written by John R. Hauser. This work was made possible in part by the
24 1.1 ross International Computer Science Institute, located at Suite 600, 1947 Center
25 1.1 ross Street, Berkeley, California 94704. Funding was partially provided by the
26 1.1 ross National Science Foundation under grant MIP-9311980. The original version
27 1.1 ross of this code was written as part of a project to build a fixed-point vector
28 1.1 ross processor in collaboration with the University of California at Berkeley,
29 1.1 ross overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 1.1 ross is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 1.1 ross arithmetic/SoftFloat.html'.
32 1.1 ross
33 1.1 ross THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 1.1 ross has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 1.1 ross TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 1.1 ross PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 1.1 ross AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 1.1 ross
39 1.1 ross Derivative works are acceptable, even for commercial purposes, so long as
40 1.1 ross (1) they include prominent notice that the work is derivative, and (2) they
41 1.1 ross include prominent notice akin to these four paragraphs for those parts of
42 1.1 ross this code that are retained.
43 1.1 ross
44 1.1 ross ===============================================================================
45 1.1 ross */
46 1.1 ross
47 1.2 thorpej /* If you need this in a boot program, you have bigger problems... */
48 1.2 thorpej #ifndef _STANDALONE
49 1.2 thorpej
50 1.1 ross #include <sys/cdefs.h>
51 1.1 ross #if defined(LIBC_SCCS) && !defined(lint)
52 1.3 keihan __RCSID("$NetBSD: softfloat.c,v 1.3 2003/12/04 13:57:31 keihan Exp $");
53 1.1 ross #endif /* LIBC_SCCS and not lint */
54 1.1 ross
55 1.1 ross #ifdef SOFTFLOAT_FOR_GCC
56 1.1 ross #include "softfloat-for-gcc.h"
57 1.1 ross #endif
58 1.1 ross
59 1.1 ross #include "milieu.h"
60 1.1 ross #include "softfloat.h"
61 1.1 ross
/*
 * Hooks for translating a float64 between the form in which it is stored
 * in memory and the form SoftFloat operates on.  Each hook falls back to
 * the identity when nothing has defined it before this point.
 */
#if !defined(FLOAT64_DEMANGLE)
#define FLOAT64_DEMANGLE(a)	(a)
#endif
#if !defined(FLOAT64_MANGLE)
#define FLOAT64_MANGLE(a)	(a)
#endif
72 1.1 ross
73 1.1 ross /*
74 1.1 ross -------------------------------------------------------------------------------
75 1.1 ross Floating-point rounding mode, extended double-precision rounding precision,
76 1.1 ross and exception flags.
77 1.1 ross -------------------------------------------------------------------------------
78 1.1 ross */
79 1.1 ross
/*
 * Rounding precision used for extended double-precision results; starts
 * out at the full 80 bits.  Only present when FLOATX80 is defined.
 *
 * XXX: This may cause options-MULTIPROCESSOR or thread problems someday.
 * Right now, it does not.  All other dynamic global variables have been
 * removed.  [ross]
 */
#if defined(FLOATX80)
int8 floatx80_rounding_precision = 80;
#endif
88 1.1 ross
89 1.1 ross /*
90 1.1 ross -------------------------------------------------------------------------------
91 1.1 ross Primitive arithmetic functions, including multi-word arithmetic, and
92 1.1 ross division and square root approximations. (Can be specialized to target if
93 1.1 ross desired.)
94 1.1 ross -------------------------------------------------------------------------------
95 1.1 ross */
96 1.1 ross #include "softfloat-macros.h"
97 1.1 ross
98 1.1 ross /*
99 1.1 ross -------------------------------------------------------------------------------
100 1.1 ross Functions and definitions to determine: (1) whether tininess for underflow
101 1.1 ross is detected before or after rounding by default, (2) what (if anything)
102 1.1 ross happens when exceptions are raised, (3) how signaling NaNs are distinguished
103 1.1 ross from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
104 1.1 ross are propagated from function inputs to output. These details are target-
105 1.1 ross specific.
106 1.1 ross -------------------------------------------------------------------------------
107 1.1 ross */
108 1.1 ross #include "softfloat-specialize.h"
109 1.1 ross
110 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not used */
111 1.1 ross /*
112 1.1 ross -------------------------------------------------------------------------------
113 1.1 ross Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
114 1.1 ross and 7, and returns the properly rounded 32-bit integer corresponding to the
115 1.1 ross input. If `zSign' is 1, the input is negated before being converted to an
116 1.1 ross integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
117 1.1 ross is simply rounded to an integer, with the inexact exception raised if the
118 1.1 ross input cannot be represented exactly as an integer. However, if the fixed-
119 1.1 ross point input is too large, the invalid exception is raised and the largest
120 1.1 ross positive or negative integer is returned.
121 1.1 ross -------------------------------------------------------------------------------
122 1.1 ross */
123 1.1 ross static int32 roundAndPackInt32( flag zSign, bits64 absZ )
124 1.1 ross {
125 1.1 ross int8 roundingMode;
126 1.1 ross flag roundNearestEven;
127 1.1 ross int8 roundIncrement, roundBits;
128 1.1 ross int32 z;
129 1.1 ross
130 1.1 ross roundingMode = float_rounding_mode();
131 1.1 ross roundNearestEven = ( roundingMode == float_round_nearest_even );
132 1.1 ross roundIncrement = 0x40;
133 1.1 ross if ( ! roundNearestEven ) {
134 1.1 ross if ( roundingMode == float_round_to_zero ) {
135 1.1 ross roundIncrement = 0;
136 1.1 ross }
137 1.1 ross else {
138 1.1 ross roundIncrement = 0x7F;
139 1.1 ross if ( zSign ) {
140 1.1 ross if ( roundingMode == float_round_up ) roundIncrement = 0;
141 1.1 ross }
142 1.1 ross else {
143 1.1 ross if ( roundingMode == float_round_down ) roundIncrement = 0;
144 1.1 ross }
145 1.1 ross }
146 1.1 ross }
147 1.1 ross roundBits = absZ & 0x7F;
148 1.1 ross absZ = ( absZ + roundIncrement )>>7;
149 1.1 ross absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
150 1.1 ross z = absZ;
151 1.1 ross if ( zSign ) z = - z;
152 1.1 ross if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
153 1.1 ross float_raise( float_flag_invalid );
154 1.1 ross return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
155 1.1 ross }
156 1.1 ross if ( roundBits ) float_set_inexact();
157 1.1 ross return z;
158 1.1 ross
159 1.1 ross }
160 1.1 ross
161 1.1 ross /*
162 1.1 ross -------------------------------------------------------------------------------
163 1.1 ross Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
164 1.1 ross `absZ1', with binary point between bits 63 and 64 (between the input words),
165 1.1 ross and returns the properly rounded 64-bit integer corresponding to the input.
166 1.1 ross If `zSign' is 1, the input is negated before being converted to an integer.
167 1.1 ross Ordinarily, the fixed-point input is simply rounded to an integer, with
168 1.1 ross the inexact exception raised if the input cannot be represented exactly as
169 1.1 ross an integer. However, if the fixed-point input is too large, the invalid
170 1.1 ross exception is raised and the largest positive or negative integer is
171 1.1 ross returned.
172 1.1 ross -------------------------------------------------------------------------------
173 1.1 ross */
174 1.1 ross static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
175 1.1 ross {
176 1.1 ross int8 roundingMode;
177 1.1 ross flag roundNearestEven, increment;
178 1.1 ross int64 z;
179 1.1 ross
180 1.1 ross roundingMode = float_rounding_mode();
181 1.1 ross roundNearestEven = ( roundingMode == float_round_nearest_even );
182 1.1 ross increment = ( (sbits64) absZ1 < 0 );
183 1.1 ross if ( ! roundNearestEven ) {
184 1.1 ross if ( roundingMode == float_round_to_zero ) {
185 1.1 ross increment = 0;
186 1.1 ross }
187 1.1 ross else {
188 1.1 ross if ( zSign ) {
189 1.1 ross increment = ( roundingMode == float_round_down ) && absZ1;
190 1.1 ross }
191 1.1 ross else {
192 1.1 ross increment = ( roundingMode == float_round_up ) && absZ1;
193 1.1 ross }
194 1.1 ross }
195 1.1 ross }
196 1.1 ross if ( increment ) {
197 1.1 ross ++absZ0;
198 1.1 ross if ( absZ0 == 0 ) goto overflow;
199 1.1 ross absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
200 1.1 ross }
201 1.1 ross z = absZ0;
202 1.1 ross if ( zSign ) z = - z;
203 1.1 ross if ( z && ( ( z < 0 ) ^ zSign ) ) {
204 1.1 ross overflow:
205 1.1 ross float_raise( float_flag_invalid );
206 1.1 ross return
207 1.1 ross zSign ? (sbits64) LIT64( 0x8000000000000000 )
208 1.1 ross : LIT64( 0x7FFFFFFFFFFFFFFF );
209 1.1 ross }
210 1.1 ross if ( absZ1 ) float_set_inexact();
211 1.1 ross return z;
212 1.1 ross
213 1.1 ross }
214 1.1 ross #endif
215 1.1 ross
216 1.1 ross /*
217 1.1 ross -------------------------------------------------------------------------------
218 1.1 ross Returns the fraction bits of the single-precision floating-point value `a'.
219 1.1 ross -------------------------------------------------------------------------------
220 1.1 ross */
221 1.1 ross INLINE bits32 extractFloat32Frac( float32 a )
222 1.1 ross {
223 1.1 ross
224 1.1 ross return a & 0x007FFFFF;
225 1.1 ross
226 1.1 ross }
227 1.1 ross
228 1.1 ross /*
229 1.1 ross -------------------------------------------------------------------------------
230 1.1 ross Returns the exponent bits of the single-precision floating-point value `a'.
231 1.1 ross -------------------------------------------------------------------------------
232 1.1 ross */
233 1.1 ross INLINE int16 extractFloat32Exp( float32 a )
234 1.1 ross {
235 1.1 ross
236 1.1 ross return ( a>>23 ) & 0xFF;
237 1.1 ross
238 1.1 ross }
239 1.1 ross
240 1.1 ross /*
241 1.1 ross -------------------------------------------------------------------------------
242 1.1 ross Returns the sign bit of the single-precision floating-point value `a'.
243 1.1 ross -------------------------------------------------------------------------------
244 1.1 ross */
245 1.1 ross INLINE flag extractFloat32Sign( float32 a )
246 1.1 ross {
247 1.1 ross
248 1.1 ross return a>>31;
249 1.1 ross
250 1.1 ross }
251 1.1 ross
252 1.1 ross /*
253 1.1 ross -------------------------------------------------------------------------------
254 1.1 ross Normalizes the subnormal single-precision floating-point value represented
255 1.1 ross by the denormalized significand `aSig'. The normalized exponent and
256 1.1 ross significand are stored at the locations pointed to by `zExpPtr' and
257 1.1 ross `zSigPtr', respectively.
258 1.1 ross -------------------------------------------------------------------------------
259 1.1 ross */
260 1.1 ross static void
261 1.1 ross normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
262 1.1 ross {
263 1.1 ross int8 shiftCount;
264 1.1 ross
265 1.1 ross shiftCount = countLeadingZeros32( aSig ) - 8;
266 1.1 ross *zSigPtr = aSig<<shiftCount;
267 1.1 ross *zExpPtr = 1 - shiftCount;
268 1.1 ross
269 1.1 ross }
270 1.1 ross
271 1.1 ross /*
272 1.1 ross -------------------------------------------------------------------------------
273 1.1 ross Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
274 1.1 ross single-precision floating-point value, returning the result. After being
275 1.1 ross shifted into the proper positions, the three fields are simply added
276 1.1 ross together to form the result. This means that any integer portion of `zSig'
277 1.1 ross will be added into the exponent. Since a properly normalized significand
278 1.1 ross will have an integer portion equal to 1, the `zExp' input should be 1 less
279 1.1 ross than the desired result exponent whenever `zSig' is a complete, normalized
280 1.1 ross significand.
281 1.1 ross -------------------------------------------------------------------------------
282 1.1 ross */
283 1.1 ross INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
284 1.1 ross {
285 1.1 ross
286 1.1 ross return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
287 1.1 ross
288 1.1 ross }
289 1.1 ross
290 1.1 ross /*
291 1.1 ross -------------------------------------------------------------------------------
292 1.1 ross Takes an abstract floating-point value having sign `zSign', exponent `zExp',
293 1.1 ross and significand `zSig', and returns the proper single-precision floating-
294 1.1 ross point value corresponding to the abstract input. Ordinarily, the abstract
295 1.1 ross value is simply rounded and packed into the single-precision format, with
296 1.1 ross the inexact exception raised if the abstract input cannot be represented
297 1.1 ross exactly. However, if the abstract value is too large, the overflow and
298 1.1 ross inexact exceptions are raised and an infinity or maximal finite value is
299 1.1 ross returned. If the abstract value is too small, the input value is rounded to
300 1.1 ross a subnormal number, and the underflow and inexact exceptions are raised if
301 1.1 ross the abstract input cannot be represented exactly as a subnormal single-
302 1.1 ross precision floating-point number.
303 1.1 ross The input significand `zSig' has its binary point between bits 30
304 1.1 ross and 29, which is 7 bits to the left of the usual location. This shifted
305 1.1 ross significand must be normalized or smaller. If `zSig' is not normalized,
306 1.1 ross `zExp' must be 0; in that case, the result returned is a subnormal number,
307 1.1 ross and it must not require rounding. In the usual case that `zSig' is
308 1.1 ross normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
309 1.1 ross The handling of underflow and overflow follows the IEC/IEEE Standard for
310 1.1 ross Binary Floating-Point Arithmetic.
311 1.1 ross -------------------------------------------------------------------------------
312 1.1 ross */
313 1.1 ross static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
314 1.1 ross {
315 1.1 ross int8 roundingMode;
316 1.1 ross flag roundNearestEven;
317 1.1 ross int8 roundIncrement, roundBits;
318 1.1 ross flag isTiny;
319 1.1 ross
320 1.1 ross roundingMode = float_rounding_mode();
321 1.1 ross roundNearestEven = ( roundingMode == float_round_nearest_even );
322 1.1 ross roundIncrement = 0x40;
323 1.1 ross if ( ! roundNearestEven ) {
324 1.1 ross if ( roundingMode == float_round_to_zero ) {
325 1.1 ross roundIncrement = 0;
326 1.1 ross }
327 1.1 ross else {
328 1.1 ross roundIncrement = 0x7F;
329 1.1 ross if ( zSign ) {
330 1.1 ross if ( roundingMode == float_round_up ) roundIncrement = 0;
331 1.1 ross }
332 1.1 ross else {
333 1.1 ross if ( roundingMode == float_round_down ) roundIncrement = 0;
334 1.1 ross }
335 1.1 ross }
336 1.1 ross }
337 1.1 ross roundBits = zSig & 0x7F;
338 1.1 ross if ( 0xFD <= (bits16) zExp ) {
339 1.1 ross if ( ( 0xFD < zExp )
340 1.1 ross || ( ( zExp == 0xFD )
341 1.1 ross && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
342 1.1 ross ) {
343 1.1 ross float_raise( float_flag_overflow | float_flag_inexact );
344 1.1 ross return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
345 1.1 ross }
346 1.1 ross if ( zExp < 0 ) {
347 1.1 ross isTiny =
348 1.1 ross ( float_detect_tininess == float_tininess_before_rounding )
349 1.1 ross || ( zExp < -1 )
350 1.1 ross || ( zSig + roundIncrement < 0x80000000 );
351 1.1 ross shift32RightJamming( zSig, - zExp, &zSig );
352 1.1 ross zExp = 0;
353 1.1 ross roundBits = zSig & 0x7F;
354 1.1 ross if ( isTiny && roundBits ) float_raise( float_flag_underflow );
355 1.1 ross }
356 1.1 ross }
357 1.1 ross if ( roundBits ) float_set_inexact();
358 1.1 ross zSig = ( zSig + roundIncrement )>>7;
359 1.1 ross zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
360 1.1 ross if ( zSig == 0 ) zExp = 0;
361 1.1 ross return packFloat32( zSign, zExp, zSig );
362 1.1 ross
363 1.1 ross }
364 1.1 ross
365 1.1 ross /*
366 1.1 ross -------------------------------------------------------------------------------
367 1.1 ross Takes an abstract floating-point value having sign `zSign', exponent `zExp',
368 1.1 ross and significand `zSig', and returns the proper single-precision floating-
369 1.1 ross point value corresponding to the abstract input. This routine is just like
370 1.1 ross `roundAndPackFloat32' except that `zSig' does not have to be normalized.
371 1.1 ross Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
372 1.1 ross floating-point exponent.
373 1.1 ross -------------------------------------------------------------------------------
374 1.1 ross */
375 1.1 ross static float32
376 1.1 ross normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
377 1.1 ross {
378 1.1 ross int8 shiftCount;
379 1.1 ross
380 1.1 ross shiftCount = countLeadingZeros32( zSig ) - 1;
381 1.1 ross return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
382 1.1 ross
383 1.1 ross }
384 1.1 ross
385 1.1 ross /*
386 1.1 ross -------------------------------------------------------------------------------
387 1.1 ross Returns the fraction bits of the double-precision floating-point value `a'.
388 1.1 ross -------------------------------------------------------------------------------
389 1.1 ross */
390 1.1 ross INLINE bits64 extractFloat64Frac( float64 a )
391 1.1 ross {
392 1.1 ross
393 1.1 ross return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
394 1.1 ross
395 1.1 ross }
396 1.1 ross
397 1.1 ross /*
398 1.1 ross -------------------------------------------------------------------------------
399 1.1 ross Returns the exponent bits of the double-precision floating-point value `a'.
400 1.1 ross -------------------------------------------------------------------------------
401 1.1 ross */
402 1.1 ross INLINE int16 extractFloat64Exp( float64 a )
403 1.1 ross {
404 1.1 ross
405 1.1 ross return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
406 1.1 ross
407 1.1 ross }
408 1.1 ross
409 1.1 ross /*
410 1.1 ross -------------------------------------------------------------------------------
411 1.1 ross Returns the sign bit of the double-precision floating-point value `a'.
412 1.1 ross -------------------------------------------------------------------------------
413 1.1 ross */
414 1.1 ross INLINE flag extractFloat64Sign( float64 a )
415 1.1 ross {
416 1.1 ross
417 1.1 ross return FLOAT64_DEMANGLE(a)>>63;
418 1.1 ross
419 1.1 ross }
420 1.1 ross
421 1.1 ross /*
422 1.1 ross -------------------------------------------------------------------------------
423 1.1 ross Normalizes the subnormal double-precision floating-point value represented
424 1.1 ross by the denormalized significand `aSig'. The normalized exponent and
425 1.1 ross significand are stored at the locations pointed to by `zExpPtr' and
426 1.1 ross `zSigPtr', respectively.
427 1.1 ross -------------------------------------------------------------------------------
428 1.1 ross */
429 1.1 ross static void
430 1.1 ross normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
431 1.1 ross {
432 1.1 ross int8 shiftCount;
433 1.1 ross
434 1.1 ross shiftCount = countLeadingZeros64( aSig ) - 11;
435 1.1 ross *zSigPtr = aSig<<shiftCount;
436 1.1 ross *zExpPtr = 1 - shiftCount;
437 1.1 ross
438 1.1 ross }
439 1.1 ross
440 1.1 ross /*
441 1.1 ross -------------------------------------------------------------------------------
442 1.1 ross Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
443 1.1 ross double-precision floating-point value, returning the result. After being
444 1.1 ross shifted into the proper positions, the three fields are simply added
445 1.1 ross together to form the result. This means that any integer portion of `zSig'
446 1.1 ross will be added into the exponent. Since a properly normalized significand
447 1.1 ross will have an integer portion equal to 1, the `zExp' input should be 1 less
448 1.1 ross than the desired result exponent whenever `zSig' is a complete, normalized
449 1.1 ross significand.
450 1.1 ross -------------------------------------------------------------------------------
451 1.1 ross */
452 1.1 ross INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
453 1.1 ross {
454 1.1 ross
455 1.1 ross return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
456 1.1 ross ( ( (bits64) zExp )<<52 ) + zSig );
457 1.1 ross
458 1.1 ross }
459 1.1 ross
460 1.1 ross /*
461 1.1 ross -------------------------------------------------------------------------------
462 1.1 ross Takes an abstract floating-point value having sign `zSign', exponent `zExp',
463 1.1 ross and significand `zSig', and returns the proper double-precision floating-
464 1.1 ross point value corresponding to the abstract input. Ordinarily, the abstract
465 1.1 ross value is simply rounded and packed into the double-precision format, with
466 1.1 ross the inexact exception raised if the abstract input cannot be represented
467 1.1 ross exactly. However, if the abstract value is too large, the overflow and
468 1.1 ross inexact exceptions are raised and an infinity or maximal finite value is
469 1.1 ross returned. If the abstract value is too small, the input value is rounded to
470 1.1 ross a subnormal number, and the underflow and inexact exceptions are raised if
471 1.1 ross the abstract input cannot be represented exactly as a subnormal double-
472 1.1 ross precision floating-point number.
473 1.1 ross The input significand `zSig' has its binary point between bits 62
474 1.1 ross and 61, which is 10 bits to the left of the usual location. This shifted
475 1.1 ross significand must be normalized or smaller. If `zSig' is not normalized,
476 1.1 ross `zExp' must be 0; in that case, the result returned is a subnormal number,
477 1.1 ross and it must not require rounding. In the usual case that `zSig' is
478 1.1 ross normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
479 1.1 ross The handling of underflow and overflow follows the IEC/IEEE Standard for
480 1.1 ross Binary Floating-Point Arithmetic.
481 1.1 ross -------------------------------------------------------------------------------
482 1.1 ross */
483 1.1 ross static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
484 1.1 ross {
485 1.1 ross int8 roundingMode;
486 1.1 ross flag roundNearestEven;
487 1.1 ross int16 roundIncrement, roundBits;
488 1.1 ross flag isTiny;
489 1.1 ross
490 1.1 ross roundingMode = float_rounding_mode();
491 1.1 ross roundNearestEven = ( roundingMode == float_round_nearest_even );
492 1.1 ross roundIncrement = 0x200;
493 1.1 ross if ( ! roundNearestEven ) {
494 1.1 ross if ( roundingMode == float_round_to_zero ) {
495 1.1 ross roundIncrement = 0;
496 1.1 ross }
497 1.1 ross else {
498 1.1 ross roundIncrement = 0x3FF;
499 1.1 ross if ( zSign ) {
500 1.1 ross if ( roundingMode == float_round_up ) roundIncrement = 0;
501 1.1 ross }
502 1.1 ross else {
503 1.1 ross if ( roundingMode == float_round_down ) roundIncrement = 0;
504 1.1 ross }
505 1.1 ross }
506 1.1 ross }
507 1.1 ross roundBits = zSig & 0x3FF;
508 1.1 ross if ( 0x7FD <= (bits16) zExp ) {
509 1.1 ross if ( ( 0x7FD < zExp )
510 1.1 ross || ( ( zExp == 0x7FD )
511 1.1 ross && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
512 1.1 ross ) {
513 1.1 ross float_raise( float_flag_overflow | float_flag_inexact );
514 1.1 ross return FLOAT64_MANGLE(
515 1.1 ross FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
516 1.1 ross ( roundIncrement == 0 ));
517 1.1 ross }
518 1.1 ross if ( zExp < 0 ) {
519 1.1 ross isTiny =
520 1.1 ross ( float_detect_tininess == float_tininess_before_rounding )
521 1.1 ross || ( zExp < -1 )
522 1.1 ross || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
523 1.1 ross shift64RightJamming( zSig, - zExp, &zSig );
524 1.1 ross zExp = 0;
525 1.1 ross roundBits = zSig & 0x3FF;
526 1.1 ross if ( isTiny && roundBits ) float_raise( float_flag_underflow );
527 1.1 ross }
528 1.1 ross }
529 1.1 ross if ( roundBits ) float_set_inexact();
530 1.1 ross zSig = ( zSig + roundIncrement )>>10;
531 1.1 ross zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
532 1.1 ross if ( zSig == 0 ) zExp = 0;
533 1.1 ross return packFloat64( zSign, zExp, zSig );
534 1.1 ross
535 1.1 ross }
536 1.1 ross
537 1.1 ross /*
538 1.1 ross -------------------------------------------------------------------------------
539 1.1 ross Takes an abstract floating-point value having sign `zSign', exponent `zExp',
540 1.1 ross and significand `zSig', and returns the proper double-precision floating-
541 1.1 ross point value corresponding to the abstract input. This routine is just like
542 1.1 ross `roundAndPackFloat64' except that `zSig' does not have to be normalized.
543 1.1 ross Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
544 1.1 ross floating-point exponent.
545 1.1 ross -------------------------------------------------------------------------------
546 1.1 ross */
547 1.1 ross static float64
548 1.1 ross normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
549 1.1 ross {
550 1.1 ross int8 shiftCount;
551 1.1 ross
552 1.1 ross shiftCount = countLeadingZeros64( zSig ) - 1;
553 1.1 ross return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
554 1.1 ross
555 1.1 ross }
556 1.1 ross
557 1.1 ross #ifdef FLOATX80
558 1.1 ross
559 1.1 ross /*
560 1.1 ross -------------------------------------------------------------------------------
561 1.1 ross Returns the fraction bits of the extended double-precision floating-point
562 1.1 ross value `a'.
563 1.1 ross -------------------------------------------------------------------------------
564 1.1 ross */
565 1.1 ross INLINE bits64 extractFloatx80Frac( floatx80 a )
566 1.1 ross {
567 1.1 ross
568 1.1 ross return a.low;
569 1.1 ross
570 1.1 ross }
571 1.1 ross
572 1.1 ross /*
573 1.1 ross -------------------------------------------------------------------------------
574 1.1 ross Returns the exponent bits of the extended double-precision floating-point
575 1.1 ross value `a'.
576 1.1 ross -------------------------------------------------------------------------------
577 1.1 ross */
578 1.1 ross INLINE int32 extractFloatx80Exp( floatx80 a )
579 1.1 ross {
580 1.1 ross
581 1.1 ross return a.high & 0x7FFF;
582 1.1 ross
583 1.1 ross }
584 1.1 ross
585 1.1 ross /*
586 1.1 ross -------------------------------------------------------------------------------
587 1.1 ross Returns the sign bit of the extended double-precision floating-point value
588 1.1 ross `a'.
589 1.1 ross -------------------------------------------------------------------------------
590 1.1 ross */
591 1.1 ross INLINE flag extractFloatx80Sign( floatx80 a )
592 1.1 ross {
593 1.1 ross
594 1.1 ross return a.high>>15;
595 1.1 ross
596 1.1 ross }
597 1.1 ross
598 1.1 ross /*
599 1.1 ross -------------------------------------------------------------------------------
600 1.1 ross Normalizes the subnormal extended double-precision floating-point value
601 1.1 ross represented by the denormalized significand `aSig'. The normalized exponent
602 1.1 ross and significand are stored at the locations pointed to by `zExpPtr' and
603 1.1 ross `zSigPtr', respectively.
604 1.1 ross -------------------------------------------------------------------------------
605 1.1 ross */
606 1.1 ross static void
607 1.1 ross normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
608 1.1 ross {
609 1.1 ross int8 shiftCount;
610 1.1 ross
611 1.1 ross shiftCount = countLeadingZeros64( aSig );
612 1.1 ross *zSigPtr = aSig<<shiftCount;
613 1.1 ross *zExpPtr = 1 - shiftCount;
614 1.1 ross
615 1.1 ross }
616 1.1 ross
617 1.1 ross /*
618 1.1 ross -------------------------------------------------------------------------------
619 1.1 ross Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
620 1.1 ross extended double-precision floating-point value, returning the result.
621 1.1 ross -------------------------------------------------------------------------------
622 1.1 ross */
623 1.1 ross INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
624 1.1 ross {
625 1.1 ross floatx80 z;
626 1.1 ross
627 1.1 ross z.low = zSig;
628 1.1 ross z.high = ( ( (bits16) zSign )<<15 ) + zExp;
629 1.1 ross return z;
630 1.1 ross
631 1.1 ross }
632 1.1 ross
633 1.1 ross /*
634 1.1 ross -------------------------------------------------------------------------------
635 1.1 ross Takes an abstract floating-point value having sign `zSign', exponent `zExp',
636 1.1 ross and extended significand formed by the concatenation of `zSig0' and `zSig1',
637 1.1 ross and returns the proper extended double-precision floating-point value
638 1.1 ross corresponding to the abstract input. Ordinarily, the abstract value is
639 1.1 ross rounded and packed into the extended double-precision format, with the
640 1.1 ross inexact exception raised if the abstract input cannot be represented
641 1.1 ross exactly. However, if the abstract value is too large, the overflow and
642 1.1 ross inexact exceptions are raised and an infinity or maximal finite value is
643 1.1 ross returned. If the abstract value is too small, the input value is rounded to
644 1.1 ross a subnormal number, and the underflow and inexact exceptions are raised if
645 1.1 ross the abstract input cannot be represented exactly as a subnormal extended
646 1.1 ross double-precision floating-point number.
647 1.1 ross If `roundingPrecision' is 32 or 64, the result is rounded to the same
648 1.1 ross number of bits as single or double precision, respectively. Otherwise, the
649 1.1 ross result is rounded to the full precision of the extended double-precision
650 1.1 ross format.
651 1.1 ross The input significand must be normalized or smaller. If the input
652 1.1 ross significand is not normalized, `zExp' must be 0; in that case, the result
653 1.1 ross returned is a subnormal number, and it must not require rounding. The
654 1.1 ross handling of underflow and overflow follows the IEC/IEEE Standard for Binary
655 1.1 ross Floating-Point Arithmetic.
656 1.1 ross -------------------------------------------------------------------------------
657 1.1 ross */
658 1.1 ross static floatx80
659 1.1 ross roundAndPackFloatx80(
660 1.1 ross int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
661 1.1 ross )
662 1.1 ross {
663 1.1 ross int8 roundingMode;
664 1.1 ross flag roundNearestEven, increment, isTiny;
665 1.1 ross int64 roundIncrement, roundMask, roundBits;
666 1.1 ross
667 1.1 ross roundingMode = float_rounding_mode();
668 1.1 ross roundNearestEven = ( roundingMode == float_round_nearest_even );
669 1.1 ross if ( roundingPrecision == 80 ) goto precision80;
670 1.1 ross if ( roundingPrecision == 64 ) {
671 1.1 ross roundIncrement = LIT64( 0x0000000000000400 );
672 1.1 ross roundMask = LIT64( 0x00000000000007FF );
673 1.1 ross }
674 1.1 ross else if ( roundingPrecision == 32 ) {
675 1.1 ross roundIncrement = LIT64( 0x0000008000000000 );
676 1.1 ross roundMask = LIT64( 0x000000FFFFFFFFFF );
677 1.1 ross }
678 1.1 ross else {
679 1.1 ross goto precision80;
680 1.1 ross }
681 1.1 ross zSig0 |= ( zSig1 != 0 );
682 1.1 ross if ( ! roundNearestEven ) {
683 1.1 ross if ( roundingMode == float_round_to_zero ) {
684 1.1 ross roundIncrement = 0;
685 1.1 ross }
686 1.1 ross else {
687 1.1 ross roundIncrement = roundMask;
688 1.1 ross if ( zSign ) {
689 1.1 ross if ( roundingMode == float_round_up ) roundIncrement = 0;
690 1.1 ross }
691 1.1 ross else {
692 1.1 ross if ( roundingMode == float_round_down ) roundIncrement = 0;
693 1.1 ross }
694 1.1 ross }
695 1.1 ross }
696 1.1 ross roundBits = zSig0 & roundMask;
697 1.1 ross if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
698 1.1 ross if ( ( 0x7FFE < zExp )
699 1.1 ross || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
700 1.1 ross ) {
701 1.1 ross goto overflow;
702 1.1 ross }
703 1.1 ross if ( zExp <= 0 ) {
704 1.1 ross isTiny =
705 1.1 ross ( float_detect_tininess == float_tininess_before_rounding )
706 1.1 ross || ( zExp < 0 )
707 1.1 ross || ( zSig0 <= zSig0 + roundIncrement );
708 1.1 ross shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
709 1.1 ross zExp = 0;
710 1.1 ross roundBits = zSig0 & roundMask;
711 1.1 ross if ( isTiny && roundBits ) float_raise( float_flag_underflow );
712 1.1 ross if ( roundBits ) float_set_inexact();
713 1.1 ross zSig0 += roundIncrement;
714 1.1 ross if ( (sbits64) zSig0 < 0 ) zExp = 1;
715 1.1 ross roundIncrement = roundMask + 1;
716 1.1 ross if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
717 1.1 ross roundMask |= roundIncrement;
718 1.1 ross }
719 1.1 ross zSig0 &= ~ roundMask;
720 1.1 ross return packFloatx80( zSign, zExp, zSig0 );
721 1.1 ross }
722 1.1 ross }
723 1.1 ross if ( roundBits ) float_set_inexact();
724 1.1 ross zSig0 += roundIncrement;
725 1.1 ross if ( zSig0 < roundIncrement ) {
726 1.1 ross ++zExp;
727 1.1 ross zSig0 = LIT64( 0x8000000000000000 );
728 1.1 ross }
729 1.1 ross roundIncrement = roundMask + 1;
730 1.1 ross if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
731 1.1 ross roundMask |= roundIncrement;
732 1.1 ross }
733 1.1 ross zSig0 &= ~ roundMask;
734 1.1 ross if ( zSig0 == 0 ) zExp = 0;
735 1.1 ross return packFloatx80( zSign, zExp, zSig0 );
736 1.1 ross precision80:
737 1.1 ross increment = ( (sbits64) zSig1 < 0 );
738 1.1 ross if ( ! roundNearestEven ) {
739 1.1 ross if ( roundingMode == float_round_to_zero ) {
740 1.1 ross increment = 0;
741 1.1 ross }
742 1.1 ross else {
743 1.1 ross if ( zSign ) {
744 1.1 ross increment = ( roundingMode == float_round_down ) && zSig1;
745 1.1 ross }
746 1.1 ross else {
747 1.1 ross increment = ( roundingMode == float_round_up ) && zSig1;
748 1.1 ross }
749 1.1 ross }
750 1.1 ross }
751 1.1 ross if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
752 1.1 ross if ( ( 0x7FFE < zExp )
753 1.1 ross || ( ( zExp == 0x7FFE )
754 1.1 ross && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
755 1.1 ross && increment
756 1.1 ross )
757 1.1 ross ) {
758 1.1 ross roundMask = 0;
759 1.1 ross overflow:
760 1.1 ross float_raise( float_flag_overflow | float_flag_inexact );
761 1.1 ross if ( ( roundingMode == float_round_to_zero )
762 1.1 ross || ( zSign && ( roundingMode == float_round_up ) )
763 1.1 ross || ( ! zSign && ( roundingMode == float_round_down ) )
764 1.1 ross ) {
765 1.1 ross return packFloatx80( zSign, 0x7FFE, ~ roundMask );
766 1.1 ross }
767 1.1 ross return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
768 1.1 ross }
769 1.1 ross if ( zExp <= 0 ) {
770 1.1 ross isTiny =
771 1.1 ross ( float_detect_tininess == float_tininess_before_rounding )
772 1.1 ross || ( zExp < 0 )
773 1.1 ross || ! increment
774 1.1 ross || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
775 1.1 ross shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
776 1.1 ross zExp = 0;
777 1.1 ross if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
778 1.1 ross if ( zSig1 ) float_set_inexact();
779 1.1 ross if ( roundNearestEven ) {
780 1.1 ross increment = ( (sbits64) zSig1 < 0 );
781 1.1 ross }
782 1.1 ross else {
783 1.1 ross if ( zSign ) {
784 1.1 ross increment = ( roundingMode == float_round_down ) && zSig1;
785 1.1 ross }
786 1.1 ross else {
787 1.1 ross increment = ( roundingMode == float_round_up ) && zSig1;
788 1.1 ross }
789 1.1 ross }
790 1.1 ross if ( increment ) {
791 1.1 ross ++zSig0;
792 1.1 ross zSig0 &=
793 1.1 ross ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
794 1.1 ross if ( (sbits64) zSig0 < 0 ) zExp = 1;
795 1.1 ross }
796 1.1 ross return packFloatx80( zSign, zExp, zSig0 );
797 1.1 ross }
798 1.1 ross }
799 1.1 ross if ( zSig1 ) float_set_inexact();
800 1.1 ross if ( increment ) {
801 1.1 ross ++zSig0;
802 1.1 ross if ( zSig0 == 0 ) {
803 1.1 ross ++zExp;
804 1.1 ross zSig0 = LIT64( 0x8000000000000000 );
805 1.1 ross }
806 1.1 ross else {
807 1.1 ross zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
808 1.1 ross }
809 1.1 ross }
810 1.1 ross else {
811 1.1 ross if ( zSig0 == 0 ) zExp = 0;
812 1.1 ross }
813 1.1 ross return packFloatx80( zSign, zExp, zSig0 );
814 1.1 ross
815 1.1 ross }
816 1.1 ross
817 1.1 ross /*
818 1.1 ross -------------------------------------------------------------------------------
819 1.1 ross Takes an abstract floating-point value having sign `zSign', exponent
820 1.1 ross `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
821 1.1 ross and returns the proper extended double-precision floating-point value
822 1.1 ross corresponding to the abstract input. This routine is just like
823 1.1 ross `roundAndPackFloatx80' except that the input significand does not have to be
824 1.1 ross normalized.
825 1.1 ross -------------------------------------------------------------------------------
826 1.1 ross */
827 1.1 ross static floatx80
828 1.1 ross normalizeRoundAndPackFloatx80(
829 1.1 ross int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
830 1.1 ross )
831 1.1 ross {
832 1.1 ross int8 shiftCount;
833 1.1 ross
834 1.1 ross if ( zSig0 == 0 ) {
835 1.1 ross zSig0 = zSig1;
836 1.1 ross zSig1 = 0;
837 1.1 ross zExp -= 64;
838 1.1 ross }
839 1.1 ross shiftCount = countLeadingZeros64( zSig0 );
840 1.1 ross shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
841 1.1 ross zExp -= shiftCount;
842 1.1 ross return
843 1.1 ross roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
844 1.1 ross
845 1.1 ross }
846 1.1 ross
847 1.1 ross #endif
848 1.1 ross
849 1.1 ross #ifdef FLOAT128
850 1.1 ross
851 1.1 ross /*
852 1.1 ross -------------------------------------------------------------------------------
853 1.1 ross Returns the least-significant 64 fraction bits of the quadruple-precision
854 1.1 ross floating-point value `a'.
855 1.1 ross -------------------------------------------------------------------------------
856 1.1 ross */
857 1.1 ross INLINE bits64 extractFloat128Frac1( float128 a )
858 1.1 ross {
859 1.1 ross
860 1.1 ross return a.low;
861 1.1 ross
862 1.1 ross }
863 1.1 ross
864 1.1 ross /*
865 1.1 ross -------------------------------------------------------------------------------
866 1.1 ross Returns the most-significant 48 fraction bits of the quadruple-precision
867 1.1 ross floating-point value `a'.
868 1.1 ross -------------------------------------------------------------------------------
869 1.1 ross */
870 1.1 ross INLINE bits64 extractFloat128Frac0( float128 a )
871 1.1 ross {
872 1.1 ross
873 1.1 ross return a.high & LIT64( 0x0000FFFFFFFFFFFF );
874 1.1 ross
875 1.1 ross }
876 1.1 ross
877 1.1 ross /*
878 1.1 ross -------------------------------------------------------------------------------
879 1.1 ross Returns the exponent bits of the quadruple-precision floating-point value
880 1.1 ross `a'.
881 1.1 ross -------------------------------------------------------------------------------
882 1.1 ross */
883 1.1 ross INLINE int32 extractFloat128Exp( float128 a )
884 1.1 ross {
885 1.1 ross
886 1.1 ross return ( a.high>>48 ) & 0x7FFF;
887 1.1 ross
888 1.1 ross }
889 1.1 ross
890 1.1 ross /*
891 1.1 ross -------------------------------------------------------------------------------
892 1.1 ross Returns the sign bit of the quadruple-precision floating-point value `a'.
893 1.1 ross -------------------------------------------------------------------------------
894 1.1 ross */
895 1.1 ross INLINE flag extractFloat128Sign( float128 a )
896 1.1 ross {
897 1.1 ross
898 1.1 ross return a.high>>63;
899 1.1 ross
900 1.1 ross }
901 1.1 ross
902 1.1 ross /*
903 1.1 ross -------------------------------------------------------------------------------
904 1.1 ross Normalizes the subnormal quadruple-precision floating-point value
905 1.1 ross represented by the denormalized significand formed by the concatenation of
906 1.1 ross `aSig0' and `aSig1'. The normalized exponent is stored at the location
907 1.1 ross pointed to by `zExpPtr'. The most significant 49 bits of the normalized
908 1.1 ross significand are stored at the location pointed to by `zSig0Ptr', and the
909 1.1 ross least significant 64 bits of the normalized significand are stored at the
910 1.1 ross location pointed to by `zSig1Ptr'.
911 1.1 ross -------------------------------------------------------------------------------
912 1.1 ross */
913 1.1 ross static void
914 1.1 ross normalizeFloat128Subnormal(
915 1.1 ross bits64 aSig0,
916 1.1 ross bits64 aSig1,
917 1.1 ross int32 *zExpPtr,
918 1.1 ross bits64 *zSig0Ptr,
919 1.1 ross bits64 *zSig1Ptr
920 1.1 ross )
921 1.1 ross {
922 1.1 ross int8 shiftCount;
923 1.1 ross
924 1.1 ross if ( aSig0 == 0 ) {
925 1.1 ross shiftCount = countLeadingZeros64( aSig1 ) - 15;
926 1.1 ross if ( shiftCount < 0 ) {
927 1.1 ross *zSig0Ptr = aSig1>>( - shiftCount );
928 1.1 ross *zSig1Ptr = aSig1<<( shiftCount & 63 );
929 1.1 ross }
930 1.1 ross else {
931 1.1 ross *zSig0Ptr = aSig1<<shiftCount;
932 1.1 ross *zSig1Ptr = 0;
933 1.1 ross }
934 1.1 ross *zExpPtr = - shiftCount - 63;
935 1.1 ross }
936 1.1 ross else {
937 1.1 ross shiftCount = countLeadingZeros64( aSig0 ) - 15;
938 1.1 ross shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
939 1.1 ross *zExpPtr = 1 - shiftCount;
940 1.1 ross }
941 1.1 ross
942 1.1 ross }
943 1.1 ross
944 1.1 ross /*
945 1.1 ross -------------------------------------------------------------------------------
946 1.1 ross Packs the sign `zSign', the exponent `zExp', and the significand formed
947 1.1 ross by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
948 1.1 ross floating-point value, returning the result. After being shifted into the
949 1.1 ross proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
950 1.1 ross added together to form the most significant 64 bits of the result. This
951 1.1 ross means that any integer portion of `zSig0' will be added into the exponent.
952 1.1 ross Since a properly normalized significand will have an integer portion equal
953 1.1 ross to 1, the `zExp' input should be 1 less than the desired result exponent
954 1.1 ross whenever `zSig0' and `zSig1' concatenated form a complete, normalized
955 1.1 ross significand.
956 1.1 ross -------------------------------------------------------------------------------
957 1.1 ross */
958 1.1 ross INLINE float128
959 1.1 ross packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
960 1.1 ross {
961 1.1 ross float128 z;
962 1.1 ross
963 1.1 ross z.low = zSig1;
964 1.1 ross z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
965 1.1 ross return z;
966 1.1 ross
967 1.1 ross }
968 1.1 ross
969 1.1 ross /*
970 1.1 ross -------------------------------------------------------------------------------
971 1.1 ross Takes an abstract floating-point value having sign `zSign', exponent `zExp',
972 1.1 ross and extended significand formed by the concatenation of `zSig0', `zSig1',
973 1.1 ross and `zSig2', and returns the proper quadruple-precision floating-point value
974 1.1 ross corresponding to the abstract input. Ordinarily, the abstract value is
975 1.1 ross simply rounded and packed into the quadruple-precision format, with the
976 1.1 ross inexact exception raised if the abstract input cannot be represented
977 1.1 ross exactly. However, if the abstract value is too large, the overflow and
978 1.1 ross inexact exceptions are raised and an infinity or maximal finite value is
979 1.1 ross returned. If the abstract value is too small, the input value is rounded to
980 1.1 ross a subnormal number, and the underflow and inexact exceptions are raised if
981 1.1 ross the abstract input cannot be represented exactly as a subnormal quadruple-
982 1.1 ross precision floating-point number.
983 1.1 ross The input significand must be normalized or smaller. If the input
984 1.1 ross significand is not normalized, `zExp' must be 0; in that case, the result
985 1.1 ross returned is a subnormal number, and it must not require rounding. In the
986 1.1 ross usual case that the input significand is normalized, `zExp' must be 1 less
987 1.1 ross than the ``true'' floating-point exponent. The handling of underflow and
988 1.1 ross overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
989 1.1 ross -------------------------------------------------------------------------------
990 1.1 ross */
991 1.1 ross static float128
992 1.1 ross roundAndPackFloat128(
993 1.1 ross flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
994 1.1 ross {
995 1.1 ross int8 roundingMode;
996 1.1 ross flag roundNearestEven, increment, isTiny;
997 1.1 ross
998 1.1 ross roundingMode = float_rounding_mode();
999 1.1 ross roundNearestEven = ( roundingMode == float_round_nearest_even );
1000 1.1 ross increment = ( (sbits64) zSig2 < 0 );
1001 1.1 ross if ( ! roundNearestEven ) {
1002 1.1 ross if ( roundingMode == float_round_to_zero ) {
1003 1.1 ross increment = 0;
1004 1.1 ross }
1005 1.1 ross else {
1006 1.1 ross if ( zSign ) {
1007 1.1 ross increment = ( roundingMode == float_round_down ) && zSig2;
1008 1.1 ross }
1009 1.1 ross else {
1010 1.1 ross increment = ( roundingMode == float_round_up ) && zSig2;
1011 1.1 ross }
1012 1.1 ross }
1013 1.1 ross }
1014 1.1 ross if ( 0x7FFD <= (bits32) zExp ) {
1015 1.1 ross if ( ( 0x7FFD < zExp )
1016 1.1 ross || ( ( zExp == 0x7FFD )
1017 1.1 ross && eq128(
1018 1.1 ross LIT64( 0x0001FFFFFFFFFFFF ),
1019 1.1 ross LIT64( 0xFFFFFFFFFFFFFFFF ),
1020 1.1 ross zSig0,
1021 1.1 ross zSig1
1022 1.1 ross )
1023 1.1 ross && increment
1024 1.1 ross )
1025 1.1 ross ) {
1026 1.1 ross float_raise( float_flag_overflow | float_flag_inexact );
1027 1.1 ross if ( ( roundingMode == float_round_to_zero )
1028 1.1 ross || ( zSign && ( roundingMode == float_round_up ) )
1029 1.1 ross || ( ! zSign && ( roundingMode == float_round_down ) )
1030 1.1 ross ) {
1031 1.1 ross return
1032 1.1 ross packFloat128(
1033 1.1 ross zSign,
1034 1.1 ross 0x7FFE,
1035 1.1 ross LIT64( 0x0000FFFFFFFFFFFF ),
1036 1.1 ross LIT64( 0xFFFFFFFFFFFFFFFF )
1037 1.1 ross );
1038 1.1 ross }
1039 1.1 ross return packFloat128( zSign, 0x7FFF, 0, 0 );
1040 1.1 ross }
1041 1.1 ross if ( zExp < 0 ) {
1042 1.1 ross isTiny =
1043 1.1 ross ( float_detect_tininess == float_tininess_before_rounding )
1044 1.1 ross || ( zExp < -1 )
1045 1.1 ross || ! increment
1046 1.1 ross || lt128(
1047 1.1 ross zSig0,
1048 1.1 ross zSig1,
1049 1.1 ross LIT64( 0x0001FFFFFFFFFFFF ),
1050 1.1 ross LIT64( 0xFFFFFFFFFFFFFFFF )
1051 1.1 ross );
1052 1.1 ross shift128ExtraRightJamming(
1053 1.1 ross zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1054 1.1 ross zExp = 0;
1055 1.1 ross if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1056 1.1 ross if ( roundNearestEven ) {
1057 1.1 ross increment = ( (sbits64) zSig2 < 0 );
1058 1.1 ross }
1059 1.1 ross else {
1060 1.1 ross if ( zSign ) {
1061 1.1 ross increment = ( roundingMode == float_round_down ) && zSig2;
1062 1.1 ross }
1063 1.1 ross else {
1064 1.1 ross increment = ( roundingMode == float_round_up ) && zSig2;
1065 1.1 ross }
1066 1.1 ross }
1067 1.1 ross }
1068 1.1 ross }
1069 1.1 ross if ( zSig2 ) float_set_inexact();
1070 1.1 ross if ( increment ) {
1071 1.1 ross add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1072 1.1 ross zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1073 1.1 ross }
1074 1.1 ross else {
1075 1.1 ross if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1076 1.1 ross }
1077 1.1 ross return packFloat128( zSign, zExp, zSig0, zSig1 );
1078 1.1 ross
1079 1.1 ross }
1080 1.1 ross
1081 1.1 ross /*
1082 1.1 ross -------------------------------------------------------------------------------
1083 1.1 ross Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1084 1.1 ross and significand formed by the concatenation of `zSig0' and `zSig1', and
1085 1.1 ross returns the proper quadruple-precision floating-point value corresponding
1086 1.1 ross to the abstract input. This routine is just like `roundAndPackFloat128'
1087 1.1 ross except that the input significand has fewer bits and does not have to be
1088 1.1 ross normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1089 1.1 ross point exponent.
1090 1.1 ross -------------------------------------------------------------------------------
1091 1.1 ross */
1092 1.1 ross static float128
1093 1.1 ross normalizeRoundAndPackFloat128(
1094 1.1 ross flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1095 1.1 ross {
1096 1.1 ross int8 shiftCount;
1097 1.1 ross bits64 zSig2;
1098 1.1 ross
1099 1.1 ross if ( zSig0 == 0 ) {
1100 1.1 ross zSig0 = zSig1;
1101 1.1 ross zSig1 = 0;
1102 1.1 ross zExp -= 64;
1103 1.1 ross }
1104 1.1 ross shiftCount = countLeadingZeros64( zSig0 ) - 15;
1105 1.1 ross if ( 0 <= shiftCount ) {
1106 1.1 ross zSig2 = 0;
1107 1.1 ross shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1108 1.1 ross }
1109 1.1 ross else {
1110 1.1 ross shift128ExtraRightJamming(
1111 1.1 ross zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1112 1.1 ross }
1113 1.1 ross zExp -= shiftCount;
1114 1.1 ross return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1115 1.1 ross
1116 1.1 ross }
1117 1.1 ross
1118 1.1 ross #endif
1119 1.1 ross
1120 1.1 ross /*
1121 1.1 ross -------------------------------------------------------------------------------
1122 1.1 ross Returns the result of converting the 32-bit two's complement integer `a'
1123 1.1 ross to the single-precision floating-point format. The conversion is performed
1124 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1125 1.1 ross -------------------------------------------------------------------------------
1126 1.1 ross */
1127 1.1 ross float32 int32_to_float32( int32 a )
1128 1.1 ross {
1129 1.1 ross flag zSign;
1130 1.1 ross
1131 1.1 ross if ( a == 0 ) return 0;
1132 1.1 ross if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1133 1.1 ross zSign = ( a < 0 );
1134 1.1 ross return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1135 1.1 ross
1136 1.1 ross }
1137 1.1 ross
1138 1.1 ross /*
1139 1.1 ross -------------------------------------------------------------------------------
1140 1.1 ross Returns the result of converting the 32-bit two's complement integer `a'
1141 1.1 ross to the double-precision floating-point format. The conversion is performed
1142 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1143 1.1 ross -------------------------------------------------------------------------------
1144 1.1 ross */
1145 1.1 ross float64 int32_to_float64( int32 a )
1146 1.1 ross {
1147 1.1 ross flag zSign;
1148 1.1 ross uint32 absA;
1149 1.1 ross int8 shiftCount;
1150 1.1 ross bits64 zSig;
1151 1.1 ross
1152 1.1 ross if ( a == 0 ) return 0;
1153 1.1 ross zSign = ( a < 0 );
1154 1.1 ross absA = zSign ? - a : a;
1155 1.1 ross shiftCount = countLeadingZeros32( absA ) + 21;
1156 1.1 ross zSig = absA;
1157 1.1 ross return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1158 1.1 ross
1159 1.1 ross }
1160 1.1 ross
1161 1.1 ross #ifdef FLOATX80
1162 1.1 ross
1163 1.1 ross /*
1164 1.1 ross -------------------------------------------------------------------------------
1165 1.1 ross Returns the result of converting the 32-bit two's complement integer `a'
1166 1.1 ross to the extended double-precision floating-point format. The conversion
1167 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
1168 1.1 ross Arithmetic.
1169 1.1 ross -------------------------------------------------------------------------------
1170 1.1 ross */
1171 1.1 ross floatx80 int32_to_floatx80( int32 a )
1172 1.1 ross {
1173 1.1 ross flag zSign;
1174 1.1 ross uint32 absA;
1175 1.1 ross int8 shiftCount;
1176 1.1 ross bits64 zSig;
1177 1.1 ross
1178 1.1 ross if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1179 1.1 ross zSign = ( a < 0 );
1180 1.1 ross absA = zSign ? - a : a;
1181 1.1 ross shiftCount = countLeadingZeros32( absA ) + 32;
1182 1.1 ross zSig = absA;
1183 1.1 ross return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1184 1.1 ross
1185 1.1 ross }
1186 1.1 ross
1187 1.1 ross #endif
1188 1.1 ross
1189 1.1 ross #ifdef FLOAT128
1190 1.1 ross
1191 1.1 ross /*
1192 1.1 ross -------------------------------------------------------------------------------
1193 1.1 ross Returns the result of converting the 32-bit two's complement integer `a' to
1194 1.1 ross the quadruple-precision floating-point format. The conversion is performed
1195 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1196 1.1 ross -------------------------------------------------------------------------------
1197 1.1 ross */
1198 1.1 ross float128 int32_to_float128( int32 a )
1199 1.1 ross {
1200 1.1 ross flag zSign;
1201 1.1 ross uint32 absA;
1202 1.1 ross int8 shiftCount;
1203 1.1 ross bits64 zSig0;
1204 1.1 ross
1205 1.1 ross if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1206 1.1 ross zSign = ( a < 0 );
1207 1.1 ross absA = zSign ? - a : a;
1208 1.1 ross shiftCount = countLeadingZeros32( absA ) + 17;
1209 1.1 ross zSig0 = absA;
1210 1.1 ross return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1211 1.1 ross
1212 1.1 ross }
1213 1.1 ross
1214 1.1 ross #endif
1215 1.1 ross
1216 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1217 1.1 ross /*
1218 1.1 ross -------------------------------------------------------------------------------
1219 1.1 ross Returns the result of converting the 64-bit two's complement integer `a'
1220 1.1 ross to the single-precision floating-point format. The conversion is performed
1221 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1222 1.1 ross -------------------------------------------------------------------------------
1223 1.1 ross */
1224 1.1 ross float32 int64_to_float32( int64 a )
1225 1.1 ross {
1226 1.1 ross flag zSign;
1227 1.1 ross uint64 absA;
1228 1.1 ross int8 shiftCount;
1229 1.1 ross
1230 1.1 ross if ( a == 0 ) return 0;
1231 1.1 ross zSign = ( a < 0 );
1232 1.1 ross absA = zSign ? - a : a;
1233 1.1 ross shiftCount = countLeadingZeros64( absA ) - 40;
1234 1.1 ross if ( 0 <= shiftCount ) {
1235 1.1 ross return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1236 1.1 ross }
1237 1.1 ross else {
1238 1.1 ross shiftCount += 7;
1239 1.1 ross if ( shiftCount < 0 ) {
1240 1.1 ross shift64RightJamming( absA, - shiftCount, &absA );
1241 1.1 ross }
1242 1.1 ross else {
1243 1.1 ross absA <<= shiftCount;
1244 1.1 ross }
1245 1.1 ross return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1246 1.1 ross }
1247 1.1 ross
1248 1.1 ross }
1249 1.1 ross
1250 1.1 ross /*
1251 1.1 ross -------------------------------------------------------------------------------
1252 1.1 ross Returns the result of converting the 64-bit two's complement integer `a'
1253 1.1 ross to the double-precision floating-point format. The conversion is performed
1254 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1255 1.1 ross -------------------------------------------------------------------------------
1256 1.1 ross */
1257 1.1 ross float64 int64_to_float64( int64 a )
1258 1.1 ross {
1259 1.1 ross flag zSign;
1260 1.1 ross
1261 1.1 ross if ( a == 0 ) return 0;
1262 1.1 ross if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1263 1.1 ross return packFloat64( 1, 0x43E, 0 );
1264 1.1 ross }
1265 1.1 ross zSign = ( a < 0 );
1266 1.1 ross return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1267 1.1 ross
1268 1.1 ross }
1269 1.1 ross
1270 1.1 ross #ifdef FLOATX80
1271 1.1 ross
1272 1.1 ross /*
1273 1.1 ross -------------------------------------------------------------------------------
1274 1.1 ross Returns the result of converting the 64-bit two's complement integer `a'
1275 1.1 ross to the extended double-precision floating-point format. The conversion
1276 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
1277 1.1 ross Arithmetic.
1278 1.1 ross -------------------------------------------------------------------------------
1279 1.1 ross */
1280 1.1 ross floatx80 int64_to_floatx80( int64 a )
1281 1.1 ross {
1282 1.1 ross flag zSign;
1283 1.1 ross uint64 absA;
1284 1.1 ross int8 shiftCount;
1285 1.1 ross
1286 1.1 ross if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1287 1.1 ross zSign = ( a < 0 );
1288 1.1 ross absA = zSign ? - a : a;
1289 1.1 ross shiftCount = countLeadingZeros64( absA );
1290 1.1 ross return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1291 1.1 ross
1292 1.1 ross }
1293 1.1 ross
1294 1.1 ross #endif
1295 1.1 ross
1296 1.1 ross #ifdef FLOAT128
1297 1.1 ross
1298 1.1 ross /*
1299 1.1 ross -------------------------------------------------------------------------------
1300 1.1 ross Returns the result of converting the 64-bit two's complement integer `a' to
1301 1.1 ross the quadruple-precision floating-point format. The conversion is performed
1302 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1303 1.1 ross -------------------------------------------------------------------------------
1304 1.1 ross */
1305 1.1 ross float128 int64_to_float128( int64 a )
1306 1.1 ross {
1307 1.1 ross flag zSign;
1308 1.1 ross uint64 absA;
1309 1.1 ross int8 shiftCount;
1310 1.1 ross int32 zExp;
1311 1.1 ross bits64 zSig0, zSig1;
1312 1.1 ross
1313 1.1 ross if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1314 1.1 ross zSign = ( a < 0 );
1315 1.1 ross absA = zSign ? - a : a;
1316 1.1 ross shiftCount = countLeadingZeros64( absA ) + 49;
1317 1.1 ross zExp = 0x406E - shiftCount;
1318 1.1 ross if ( 64 <= shiftCount ) {
1319 1.1 ross zSig1 = 0;
1320 1.1 ross zSig0 = absA;
1321 1.1 ross shiftCount -= 64;
1322 1.1 ross }
1323 1.1 ross else {
1324 1.1 ross zSig1 = absA;
1325 1.1 ross zSig0 = 0;
1326 1.1 ross }
1327 1.1 ross shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1328 1.1 ross return packFloat128( zSign, zExp, zSig0, zSig1 );
1329 1.1 ross
1330 1.1 ross }
1331 1.1 ross
1332 1.1 ross #endif
1333 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
1334 1.1 ross
1335 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1336 1.1 ross /*
1337 1.1 ross -------------------------------------------------------------------------------
1338 1.1 ross Returns the result of converting the single-precision floating-point value
1339 1.1 ross `a' to the 32-bit two's complement integer format. The conversion is
1340 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
1341 1.1 ross Arithmetic---which means in particular that the conversion is rounded
1342 1.1 ross according to the current rounding mode. If `a' is a NaN, the largest
1343 1.1 ross positive integer is returned. Otherwise, if the conversion overflows, the
1344 1.1 ross largest integer with the same sign as `a' is returned.
1345 1.1 ross -------------------------------------------------------------------------------
1346 1.1 ross */
1347 1.1 ross int32 float32_to_int32( float32 a )
1348 1.1 ross {
1349 1.1 ross flag aSign;
1350 1.1 ross int16 aExp, shiftCount;
1351 1.1 ross bits32 aSig;
1352 1.1 ross bits64 aSig64;
1353 1.1 ross
1354 1.1 ross aSig = extractFloat32Frac( a );
1355 1.1 ross aExp = extractFloat32Exp( a );
1356 1.1 ross aSign = extractFloat32Sign( a );
1357 1.1 ross if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1358 1.1 ross if ( aExp ) aSig |= 0x00800000;
1359 1.1 ross shiftCount = 0xAF - aExp;
1360 1.1 ross aSig64 = aSig;
1361 1.1 ross aSig64 <<= 32;
1362 1.1 ross if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1363 1.1 ross return roundAndPackInt32( aSign, aSig64 );
1364 1.1 ross
1365 1.1 ross }
1366 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
1367 1.1 ross
1368 1.1 ross /*
1369 1.1 ross -------------------------------------------------------------------------------
1370 1.1 ross Returns the result of converting the single-precision floating-point value
1371 1.1 ross `a' to the 32-bit two's complement integer format. The conversion is
1372 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
1373 1.1 ross Arithmetic, except that the conversion is always rounded toward zero.
1374 1.1 ross If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1375 1.1 ross the conversion overflows, the largest integer with the same sign as `a' is
1376 1.1 ross returned.
1377 1.1 ross -------------------------------------------------------------------------------
1378 1.1 ross */
1379 1.1 ross int32 float32_to_int32_round_to_zero( float32 a )
1380 1.1 ross {
1381 1.1 ross flag aSign;
1382 1.1 ross int16 aExp, shiftCount;
1383 1.1 ross bits32 aSig;
1384 1.1 ross int32 z;
1385 1.1 ross
1386 1.1 ross aSig = extractFloat32Frac( a );
1387 1.1 ross aExp = extractFloat32Exp( a );
1388 1.1 ross aSign = extractFloat32Sign( a );
1389 1.1 ross shiftCount = aExp - 0x9E;
1390 1.1 ross if ( 0 <= shiftCount ) {
1391 1.1 ross if ( a != 0xCF000000 ) {
1392 1.1 ross float_raise( float_flag_invalid );
1393 1.1 ross if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1394 1.1 ross }
1395 1.1 ross return (sbits32) 0x80000000;
1396 1.1 ross }
1397 1.1 ross else if ( aExp <= 0x7E ) {
1398 1.1 ross if ( aExp | aSig ) float_set_inexact();
1399 1.1 ross return 0;
1400 1.1 ross }
1401 1.1 ross aSig = ( aSig | 0x00800000 )<<8;
1402 1.1 ross z = aSig>>( - shiftCount );
1403 1.1 ross if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1404 1.1 ross float_set_inexact();
1405 1.1 ross }
1406 1.1 ross if ( aSign ) z = - z;
1407 1.1 ross return z;
1408 1.1 ross
1409 1.1 ross }
1410 1.1 ross
1411 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1412 1.1 ross /*
1413 1.1 ross -------------------------------------------------------------------------------
1414 1.1 ross Returns the result of converting the single-precision floating-point value
1415 1.1 ross `a' to the 64-bit two's complement integer format. The conversion is
1416 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
1417 1.1 ross Arithmetic---which means in particular that the conversion is rounded
1418 1.1 ross according to the current rounding mode. If `a' is a NaN, the largest
1419 1.1 ross positive integer is returned. Otherwise, if the conversion overflows, the
1420 1.1 ross largest integer with the same sign as `a' is returned.
1421 1.1 ross -------------------------------------------------------------------------------
1422 1.1 ross */
1423 1.1 ross int64 float32_to_int64( float32 a )
1424 1.1 ross {
1425 1.1 ross flag aSign;
1426 1.1 ross int16 aExp, shiftCount;
1427 1.1 ross bits32 aSig;
1428 1.1 ross bits64 aSig64, aSigExtra;
1429 1.1 ross
1430 1.1 ross aSig = extractFloat32Frac( a );
1431 1.1 ross aExp = extractFloat32Exp( a );
1432 1.1 ross aSign = extractFloat32Sign( a );
1433 1.1 ross shiftCount = 0xBE - aExp;
1434 1.1 ross if ( shiftCount < 0 ) {
1435 1.1 ross float_raise( float_flag_invalid );
1436 1.1 ross if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1437 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
1438 1.1 ross }
1439 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
1440 1.1 ross }
1441 1.1 ross if ( aExp ) aSig |= 0x00800000;
1442 1.1 ross aSig64 = aSig;
1443 1.1 ross aSig64 <<= 40;
1444 1.1 ross shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1445 1.1 ross return roundAndPackInt64( aSign, aSig64, aSigExtra );
1446 1.1 ross
1447 1.1 ross }
1448 1.1 ross
1449 1.1 ross /*
1450 1.1 ross -------------------------------------------------------------------------------
1451 1.1 ross Returns the result of converting the single-precision floating-point value
1452 1.1 ross `a' to the 64-bit two's complement integer format. The conversion is
1453 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
1454 1.1 ross Arithmetic, except that the conversion is always rounded toward zero. If
1455 1.1 ross `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1456 1.1 ross conversion overflows, the largest integer with the same sign as `a' is
1457 1.1 ross returned.
1458 1.1 ross -------------------------------------------------------------------------------
1459 1.1 ross */
1460 1.1 ross int64 float32_to_int64_round_to_zero( float32 a )
1461 1.1 ross {
1462 1.1 ross flag aSign;
1463 1.1 ross int16 aExp, shiftCount;
1464 1.1 ross bits32 aSig;
1465 1.1 ross bits64 aSig64;
1466 1.1 ross int64 z;
1467 1.1 ross
1468 1.1 ross aSig = extractFloat32Frac( a );
1469 1.1 ross aExp = extractFloat32Exp( a );
1470 1.1 ross aSign = extractFloat32Sign( a );
1471 1.1 ross shiftCount = aExp - 0xBE;
1472 1.1 ross if ( 0 <= shiftCount ) {
1473 1.1 ross if ( a != 0xDF000000 ) {
1474 1.1 ross float_raise( float_flag_invalid );
1475 1.1 ross if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1476 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
1477 1.1 ross }
1478 1.1 ross }
1479 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
1480 1.1 ross }
1481 1.1 ross else if ( aExp <= 0x7E ) {
1482 1.1 ross if ( aExp | aSig ) float_set_inexact();
1483 1.1 ross return 0;
1484 1.1 ross }
1485 1.1 ross aSig64 = aSig | 0x00800000;
1486 1.1 ross aSig64 <<= 40;
1487 1.1 ross z = aSig64>>( - shiftCount );
1488 1.1 ross if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1489 1.1 ross float_set_inexact();
1490 1.1 ross }
1491 1.1 ross if ( aSign ) z = - z;
1492 1.1 ross return z;
1493 1.1 ross
1494 1.1 ross }
1495 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
1496 1.1 ross
1497 1.1 ross /*
1498 1.1 ross -------------------------------------------------------------------------------
1499 1.1 ross Returns the result of converting the single-precision floating-point value
1500 1.1 ross `a' to the double-precision floating-point format. The conversion is
1501 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
1502 1.1 ross Arithmetic.
1503 1.1 ross -------------------------------------------------------------------------------
1504 1.1 ross */
1505 1.1 ross float64 float32_to_float64( float32 a )
1506 1.1 ross {
1507 1.1 ross flag aSign;
1508 1.1 ross int16 aExp;
1509 1.1 ross bits32 aSig;
1510 1.1 ross
1511 1.1 ross aSig = extractFloat32Frac( a );
1512 1.1 ross aExp = extractFloat32Exp( a );
1513 1.1 ross aSign = extractFloat32Sign( a );
1514 1.1 ross if ( aExp == 0xFF ) {
1515 1.1 ross if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1516 1.1 ross return packFloat64( aSign, 0x7FF, 0 );
1517 1.1 ross }
1518 1.1 ross if ( aExp == 0 ) {
1519 1.1 ross if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1520 1.1 ross normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1521 1.1 ross --aExp;
1522 1.1 ross }
1523 1.1 ross return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1524 1.1 ross
1525 1.1 ross }
1526 1.1 ross
1527 1.1 ross #ifdef FLOATX80
1528 1.1 ross
1529 1.1 ross /*
1530 1.1 ross -------------------------------------------------------------------------------
1531 1.1 ross Returns the result of converting the single-precision floating-point value
1532 1.1 ross `a' to the extended double-precision floating-point format. The conversion
1533 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
1534 1.1 ross Arithmetic.
1535 1.1 ross -------------------------------------------------------------------------------
1536 1.1 ross */
1537 1.1 ross floatx80 float32_to_floatx80( float32 a )
1538 1.1 ross {
1539 1.1 ross flag aSign;
1540 1.1 ross int16 aExp;
1541 1.1 ross bits32 aSig;
1542 1.1 ross
1543 1.1 ross aSig = extractFloat32Frac( a );
1544 1.1 ross aExp = extractFloat32Exp( a );
1545 1.1 ross aSign = extractFloat32Sign( a );
1546 1.1 ross if ( aExp == 0xFF ) {
1547 1.1 ross if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1548 1.1 ross return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1549 1.1 ross }
1550 1.1 ross if ( aExp == 0 ) {
1551 1.1 ross if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1552 1.1 ross normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1553 1.1 ross }
1554 1.1 ross aSig |= 0x00800000;
1555 1.1 ross return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1556 1.1 ross
1557 1.1 ross }
1558 1.1 ross
1559 1.1 ross #endif
1560 1.1 ross
1561 1.1 ross #ifdef FLOAT128
1562 1.1 ross
1563 1.1 ross /*
1564 1.1 ross -------------------------------------------------------------------------------
1565 1.1 ross Returns the result of converting the single-precision floating-point value
1566 1.1 ross `a' to the quadruple-precision floating-point format. The conversion is
1567 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
1568 1.1 ross Arithmetic.
1569 1.1 ross -------------------------------------------------------------------------------
1570 1.1 ross */
1571 1.1 ross float128 float32_to_float128( float32 a )
1572 1.1 ross {
1573 1.1 ross flag aSign;
1574 1.1 ross int16 aExp;
1575 1.1 ross bits32 aSig;
1576 1.1 ross
1577 1.1 ross aSig = extractFloat32Frac( a );
1578 1.1 ross aExp = extractFloat32Exp( a );
1579 1.1 ross aSign = extractFloat32Sign( a );
1580 1.1 ross if ( aExp == 0xFF ) {
1581 1.1 ross if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1582 1.1 ross return packFloat128( aSign, 0x7FFF, 0, 0 );
1583 1.1 ross }
1584 1.1 ross if ( aExp == 0 ) {
1585 1.1 ross if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1586 1.1 ross normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1587 1.1 ross --aExp;
1588 1.1 ross }
1589 1.1 ross return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1590 1.1 ross
1591 1.1 ross }
1592 1.1 ross
1593 1.1 ross #endif
1594 1.1 ross
1595 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1596 1.1 ross /*
1597 1.1 ross -------------------------------------------------------------------------------
1598 1.1 ross Rounds the single-precision floating-point value `a' to an integer, and
1599 1.1 ross returns the result as a single-precision floating-point value. The
1600 1.1 ross operation is performed according to the IEC/IEEE Standard for Binary
1601 1.1 ross Floating-Point Arithmetic.
1602 1.1 ross -------------------------------------------------------------------------------
1603 1.1 ross */
1604 1.1 ross float32 float32_round_to_int( float32 a )
1605 1.1 ross {
1606 1.1 ross flag aSign;
1607 1.1 ross int16 aExp;
1608 1.1 ross bits32 lastBitMask, roundBitsMask;
1609 1.1 ross int8 roundingMode;
1610 1.1 ross float32 z;
1611 1.1 ross
1612 1.1 ross aExp = extractFloat32Exp( a );
1613 1.1 ross if ( 0x96 <= aExp ) {
1614 1.1 ross if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1615 1.1 ross return propagateFloat32NaN( a, a );
1616 1.1 ross }
1617 1.1 ross return a;
1618 1.1 ross }
1619 1.1 ross if ( aExp <= 0x7E ) {
1620 1.1 ross if ( (bits32) ( a<<1 ) == 0 ) return a;
1621 1.1 ross float_set_inexact();
1622 1.1 ross aSign = extractFloat32Sign( a );
1623 1.1 ross switch ( float_rounding_mode() ) {
1624 1.1 ross case float_round_nearest_even:
1625 1.1 ross if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1626 1.1 ross return packFloat32( aSign, 0x7F, 0 );
1627 1.1 ross }
1628 1.1 ross break;
1629 1.1 ross case float_round_down:
1630 1.1 ross return aSign ? 0xBF800000 : 0;
1631 1.1 ross case float_round_up:
1632 1.1 ross return aSign ? 0x80000000 : 0x3F800000;
1633 1.1 ross }
1634 1.1 ross return packFloat32( aSign, 0, 0 );
1635 1.1 ross }
1636 1.1 ross lastBitMask = 1;
1637 1.1 ross lastBitMask <<= 0x96 - aExp;
1638 1.1 ross roundBitsMask = lastBitMask - 1;
1639 1.1 ross z = a;
1640 1.1 ross roundingMode = float_rounding_mode();
1641 1.1 ross if ( roundingMode == float_round_nearest_even ) {
1642 1.1 ross z += lastBitMask>>1;
1643 1.1 ross if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1644 1.1 ross }
1645 1.1 ross else if ( roundingMode != float_round_to_zero ) {
1646 1.1 ross if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1647 1.1 ross z += roundBitsMask;
1648 1.1 ross }
1649 1.1 ross }
1650 1.1 ross z &= ~ roundBitsMask;
1651 1.1 ross if ( z != a ) float_set_inexact();
1652 1.1 ross return z;
1653 1.1 ross
1654 1.1 ross }
1655 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
1656 1.1 ross
1657 1.1 ross /*
1658 1.1 ross -------------------------------------------------------------------------------
1659 1.1 ross Returns the result of adding the absolute values of the single-precision
1660 1.1 ross floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1661 1.1 ross before being returned. `zSign' is ignored if the result is a NaN.
1662 1.1 ross The addition is performed according to the IEC/IEEE Standard for Binary
1663 1.1 ross Floating-Point Arithmetic.
1664 1.1 ross -------------------------------------------------------------------------------
1665 1.1 ross */
1666 1.1 ross static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1667 1.1 ross {
1668 1.1 ross int16 aExp, bExp, zExp;
1669 1.1 ross bits32 aSig, bSig, zSig;
1670 1.1 ross int16 expDiff;
1671 1.1 ross
1672 1.1 ross aSig = extractFloat32Frac( a );
1673 1.1 ross aExp = extractFloat32Exp( a );
1674 1.1 ross bSig = extractFloat32Frac( b );
1675 1.1 ross bExp = extractFloat32Exp( b );
1676 1.1 ross expDiff = aExp - bExp;
1677 1.1 ross aSig <<= 6;
1678 1.1 ross bSig <<= 6;
1679 1.1 ross if ( 0 < expDiff ) {
1680 1.1 ross if ( aExp == 0xFF ) {
1681 1.1 ross if ( aSig ) return propagateFloat32NaN( a, b );
1682 1.1 ross return a;
1683 1.1 ross }
1684 1.1 ross if ( bExp == 0 ) {
1685 1.1 ross --expDiff;
1686 1.1 ross }
1687 1.1 ross else {
1688 1.1 ross bSig |= 0x20000000;
1689 1.1 ross }
1690 1.1 ross shift32RightJamming( bSig, expDiff, &bSig );
1691 1.1 ross zExp = aExp;
1692 1.1 ross }
1693 1.1 ross else if ( expDiff < 0 ) {
1694 1.1 ross if ( bExp == 0xFF ) {
1695 1.1 ross if ( bSig ) return propagateFloat32NaN( a, b );
1696 1.1 ross return packFloat32( zSign, 0xFF, 0 );
1697 1.1 ross }
1698 1.1 ross if ( aExp == 0 ) {
1699 1.1 ross ++expDiff;
1700 1.1 ross }
1701 1.1 ross else {
1702 1.1 ross aSig |= 0x20000000;
1703 1.1 ross }
1704 1.1 ross shift32RightJamming( aSig, - expDiff, &aSig );
1705 1.1 ross zExp = bExp;
1706 1.1 ross }
1707 1.1 ross else {
1708 1.1 ross if ( aExp == 0xFF ) {
1709 1.1 ross if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1710 1.1 ross return a;
1711 1.1 ross }
1712 1.1 ross if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1713 1.1 ross zSig = 0x40000000 + aSig + bSig;
1714 1.1 ross zExp = aExp;
1715 1.1 ross goto roundAndPack;
1716 1.1 ross }
1717 1.1 ross aSig |= 0x20000000;
1718 1.1 ross zSig = ( aSig + bSig )<<1;
1719 1.1 ross --zExp;
1720 1.1 ross if ( (sbits32) zSig < 0 ) {
1721 1.1 ross zSig = aSig + bSig;
1722 1.1 ross ++zExp;
1723 1.1 ross }
1724 1.1 ross roundAndPack:
1725 1.1 ross return roundAndPackFloat32( zSign, zExp, zSig );
1726 1.1 ross
1727 1.1 ross }
1728 1.1 ross
1729 1.1 ross /*
1730 1.1 ross -------------------------------------------------------------------------------
1731 1.1 ross Returns the result of subtracting the absolute values of the single-
1732 1.1 ross precision floating-point values `a' and `b'. If `zSign' is 1, the
1733 1.1 ross difference is negated before being returned. `zSign' is ignored if the
1734 1.1 ross result is a NaN. The subtraction is performed according to the IEC/IEEE
1735 1.1 ross Standard for Binary Floating-Point Arithmetic.
1736 1.1 ross -------------------------------------------------------------------------------
1737 1.1 ross */
1738 1.1 ross static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1739 1.1 ross {
1740 1.1 ross int16 aExp, bExp, zExp;
1741 1.1 ross bits32 aSig, bSig, zSig;
1742 1.1 ross int16 expDiff;
1743 1.1 ross
1744 1.1 ross aSig = extractFloat32Frac( a );
1745 1.1 ross aExp = extractFloat32Exp( a );
1746 1.1 ross bSig = extractFloat32Frac( b );
1747 1.1 ross bExp = extractFloat32Exp( b );
1748 1.1 ross expDiff = aExp - bExp;
1749 1.1 ross aSig <<= 7;
1750 1.1 ross bSig <<= 7;
1751 1.1 ross if ( 0 < expDiff ) goto aExpBigger;
1752 1.1 ross if ( expDiff < 0 ) goto bExpBigger;
1753 1.1 ross if ( aExp == 0xFF ) {
1754 1.1 ross if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1755 1.1 ross float_raise( float_flag_invalid );
1756 1.1 ross return float32_default_nan;
1757 1.1 ross }
1758 1.1 ross if ( aExp == 0 ) {
1759 1.1 ross aExp = 1;
1760 1.1 ross bExp = 1;
1761 1.1 ross }
1762 1.1 ross if ( bSig < aSig ) goto aBigger;
1763 1.1 ross if ( aSig < bSig ) goto bBigger;
1764 1.1 ross return packFloat32( float_rounding_mode() == float_round_down, 0, 0 );
1765 1.1 ross bExpBigger:
1766 1.1 ross if ( bExp == 0xFF ) {
1767 1.1 ross if ( bSig ) return propagateFloat32NaN( a, b );
1768 1.1 ross return packFloat32( zSign ^ 1, 0xFF, 0 );
1769 1.1 ross }
1770 1.1 ross if ( aExp == 0 ) {
1771 1.1 ross ++expDiff;
1772 1.1 ross }
1773 1.1 ross else {
1774 1.1 ross aSig |= 0x40000000;
1775 1.1 ross }
1776 1.1 ross shift32RightJamming( aSig, - expDiff, &aSig );
1777 1.1 ross bSig |= 0x40000000;
1778 1.1 ross bBigger:
1779 1.1 ross zSig = bSig - aSig;
1780 1.1 ross zExp = bExp;
1781 1.1 ross zSign ^= 1;
1782 1.1 ross goto normalizeRoundAndPack;
1783 1.1 ross aExpBigger:
1784 1.1 ross if ( aExp == 0xFF ) {
1785 1.1 ross if ( aSig ) return propagateFloat32NaN( a, b );
1786 1.1 ross return a;
1787 1.1 ross }
1788 1.1 ross if ( bExp == 0 ) {
1789 1.1 ross --expDiff;
1790 1.1 ross }
1791 1.1 ross else {
1792 1.1 ross bSig |= 0x40000000;
1793 1.1 ross }
1794 1.1 ross shift32RightJamming( bSig, expDiff, &bSig );
1795 1.1 ross aSig |= 0x40000000;
1796 1.1 ross aBigger:
1797 1.1 ross zSig = aSig - bSig;
1798 1.1 ross zExp = aExp;
1799 1.1 ross normalizeRoundAndPack:
1800 1.1 ross --zExp;
1801 1.1 ross return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1802 1.1 ross
1803 1.1 ross }
1804 1.1 ross
1805 1.1 ross /*
1806 1.1 ross -------------------------------------------------------------------------------
1807 1.1 ross Returns the result of adding the single-precision floating-point values `a'
1808 1.1 ross and `b'. The operation is performed according to the IEC/IEEE Standard for
1809 1.1 ross Binary Floating-Point Arithmetic.
1810 1.1 ross -------------------------------------------------------------------------------
1811 1.1 ross */
1812 1.1 ross float32 float32_add( float32 a, float32 b )
1813 1.1 ross {
1814 1.1 ross flag aSign, bSign;
1815 1.1 ross
1816 1.1 ross aSign = extractFloat32Sign( a );
1817 1.1 ross bSign = extractFloat32Sign( b );
1818 1.1 ross if ( aSign == bSign ) {
1819 1.1 ross return addFloat32Sigs( a, b, aSign );
1820 1.1 ross }
1821 1.1 ross else {
1822 1.1 ross return subFloat32Sigs( a, b, aSign );
1823 1.1 ross }
1824 1.1 ross
1825 1.1 ross }
1826 1.1 ross
1827 1.1 ross /*
1828 1.1 ross -------------------------------------------------------------------------------
1829 1.1 ross Returns the result of subtracting the single-precision floating-point values
1830 1.1 ross `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1831 1.1 ross for Binary Floating-Point Arithmetic.
1832 1.1 ross -------------------------------------------------------------------------------
1833 1.1 ross */
1834 1.1 ross float32 float32_sub( float32 a, float32 b )
1835 1.1 ross {
1836 1.1 ross flag aSign, bSign;
1837 1.1 ross
1838 1.1 ross aSign = extractFloat32Sign( a );
1839 1.1 ross bSign = extractFloat32Sign( b );
1840 1.1 ross if ( aSign == bSign ) {
1841 1.1 ross return subFloat32Sigs( a, b, aSign );
1842 1.1 ross }
1843 1.1 ross else {
1844 1.1 ross return addFloat32Sigs( a, b, aSign );
1845 1.1 ross }
1846 1.1 ross
1847 1.1 ross }
1848 1.1 ross
1849 1.1 ross /*
1850 1.1 ross -------------------------------------------------------------------------------
1851 1.1 ross Returns the result of multiplying the single-precision floating-point values
1852 1.1 ross `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1853 1.1 ross for Binary Floating-Point Arithmetic.
1854 1.1 ross -------------------------------------------------------------------------------
1855 1.1 ross */
1856 1.1 ross float32 float32_mul( float32 a, float32 b )
1857 1.1 ross {
1858 1.1 ross flag aSign, bSign, zSign;
1859 1.1 ross int16 aExp, bExp, zExp;
1860 1.1 ross bits32 aSig, bSig;
1861 1.1 ross bits64 zSig64;
1862 1.1 ross bits32 zSig;
1863 1.1 ross
1864 1.1 ross aSig = extractFloat32Frac( a );
1865 1.1 ross aExp = extractFloat32Exp( a );
1866 1.1 ross aSign = extractFloat32Sign( a );
1867 1.1 ross bSig = extractFloat32Frac( b );
1868 1.1 ross bExp = extractFloat32Exp( b );
1869 1.1 ross bSign = extractFloat32Sign( b );
1870 1.1 ross zSign = aSign ^ bSign;
1871 1.1 ross if ( aExp == 0xFF ) {
1872 1.1 ross if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1873 1.1 ross return propagateFloat32NaN( a, b );
1874 1.1 ross }
1875 1.1 ross if ( ( bExp | bSig ) == 0 ) {
1876 1.1 ross float_raise( float_flag_invalid );
1877 1.1 ross return float32_default_nan;
1878 1.1 ross }
1879 1.1 ross return packFloat32( zSign, 0xFF, 0 );
1880 1.1 ross }
1881 1.1 ross if ( bExp == 0xFF ) {
1882 1.1 ross if ( bSig ) return propagateFloat32NaN( a, b );
1883 1.1 ross if ( ( aExp | aSig ) == 0 ) {
1884 1.1 ross float_raise( float_flag_invalid );
1885 1.1 ross return float32_default_nan;
1886 1.1 ross }
1887 1.1 ross return packFloat32( zSign, 0xFF, 0 );
1888 1.1 ross }
1889 1.1 ross if ( aExp == 0 ) {
1890 1.1 ross if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1891 1.1 ross normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1892 1.1 ross }
1893 1.1 ross if ( bExp == 0 ) {
1894 1.1 ross if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1895 1.1 ross normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1896 1.1 ross }
1897 1.1 ross zExp = aExp + bExp - 0x7F;
1898 1.1 ross aSig = ( aSig | 0x00800000 )<<7;
1899 1.1 ross bSig = ( bSig | 0x00800000 )<<8;
1900 1.1 ross shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1901 1.1 ross zSig = zSig64;
1902 1.1 ross if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1903 1.1 ross zSig <<= 1;
1904 1.1 ross --zExp;
1905 1.1 ross }
1906 1.1 ross return roundAndPackFloat32( zSign, zExp, zSig );
1907 1.1 ross
1908 1.1 ross }
1909 1.1 ross
1910 1.1 ross /*
1911 1.1 ross -------------------------------------------------------------------------------
1912 1.1 ross Returns the result of dividing the single-precision floating-point value `a'
1913 1.1 ross by the corresponding value `b'. The operation is performed according to the
1914 1.1 ross IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1915 1.1 ross -------------------------------------------------------------------------------
1916 1.1 ross */
1917 1.1 ross float32 float32_div( float32 a, float32 b )
1918 1.1 ross {
1919 1.1 ross flag aSign, bSign, zSign;
1920 1.1 ross int16 aExp, bExp, zExp;
1921 1.1 ross bits32 aSig, bSig, zSig;
1922 1.1 ross
1923 1.1 ross aSig = extractFloat32Frac( a );
1924 1.1 ross aExp = extractFloat32Exp( a );
1925 1.1 ross aSign = extractFloat32Sign( a );
1926 1.1 ross bSig = extractFloat32Frac( b );
1927 1.1 ross bExp = extractFloat32Exp( b );
1928 1.1 ross bSign = extractFloat32Sign( b );
1929 1.1 ross zSign = aSign ^ bSign;
1930 1.1 ross if ( aExp == 0xFF ) {
1931 1.1 ross if ( aSig ) return propagateFloat32NaN( a, b );
1932 1.1 ross if ( bExp == 0xFF ) {
1933 1.1 ross if ( bSig ) return propagateFloat32NaN( a, b );
1934 1.1 ross float_raise( float_flag_invalid );
1935 1.1 ross return float32_default_nan;
1936 1.1 ross }
1937 1.1 ross return packFloat32( zSign, 0xFF, 0 );
1938 1.1 ross }
1939 1.1 ross if ( bExp == 0xFF ) {
1940 1.1 ross if ( bSig ) return propagateFloat32NaN( a, b );
1941 1.1 ross return packFloat32( zSign, 0, 0 );
1942 1.1 ross }
1943 1.1 ross if ( bExp == 0 ) {
1944 1.1 ross if ( bSig == 0 ) {
1945 1.1 ross if ( ( aExp | aSig ) == 0 ) {
1946 1.1 ross float_raise( float_flag_invalid );
1947 1.1 ross return float32_default_nan;
1948 1.1 ross }
1949 1.1 ross float_raise( float_flag_divbyzero );
1950 1.1 ross return packFloat32( zSign, 0xFF, 0 );
1951 1.1 ross }
1952 1.1 ross normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1953 1.1 ross }
1954 1.1 ross if ( aExp == 0 ) {
1955 1.1 ross if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1956 1.1 ross normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1957 1.1 ross }
1958 1.1 ross zExp = aExp - bExp + 0x7D;
1959 1.1 ross aSig = ( aSig | 0x00800000 )<<7;
1960 1.1 ross bSig = ( bSig | 0x00800000 )<<8;
1961 1.1 ross if ( bSig <= ( aSig + aSig ) ) {
1962 1.1 ross aSig >>= 1;
1963 1.1 ross ++zExp;
1964 1.1 ross }
1965 1.1 ross zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1966 1.1 ross if ( ( zSig & 0x3F ) == 0 ) {
1967 1.1 ross zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1968 1.1 ross }
1969 1.1 ross return roundAndPackFloat32( zSign, zExp, zSig );
1970 1.1 ross
1971 1.1 ross }
1972 1.1 ross
1973 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1974 1.1 ross /*
1975 1.1 ross -------------------------------------------------------------------------------
1976 1.1 ross Returns the remainder of the single-precision floating-point value `a'
1977 1.1 ross with respect to the corresponding value `b'. The operation is performed
1978 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1979 1.1 ross -------------------------------------------------------------------------------
1980 1.1 ross */
1981 1.1 ross float32 float32_rem( float32 a, float32 b )
1982 1.1 ross {
1983 1.1 ross flag aSign, bSign, zSign;
1984 1.1 ross int16 aExp, bExp, expDiff;
1985 1.1 ross bits32 aSig, bSig;
1986 1.1 ross bits32 q;
1987 1.1 ross bits64 aSig64, bSig64, q64;
1988 1.1 ross bits32 alternateASig;
1989 1.1 ross sbits32 sigMean;
1990 1.1 ross
1991 1.1 ross aSig = extractFloat32Frac( a );
1992 1.1 ross aExp = extractFloat32Exp( a );
1993 1.1 ross aSign = extractFloat32Sign( a );
1994 1.1 ross bSig = extractFloat32Frac( b );
1995 1.1 ross bExp = extractFloat32Exp( b );
1996 1.1 ross bSign = extractFloat32Sign( b );
1997 1.1 ross if ( aExp == 0xFF ) {
1998 1.1 ross if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1999 1.1 ross return propagateFloat32NaN( a, b );
2000 1.1 ross }
2001 1.1 ross float_raise( float_flag_invalid );
2002 1.1 ross return float32_default_nan;
2003 1.1 ross }
2004 1.1 ross if ( bExp == 0xFF ) {
2005 1.1 ross if ( bSig ) return propagateFloat32NaN( a, b );
2006 1.1 ross return a;
2007 1.1 ross }
2008 1.1 ross if ( bExp == 0 ) {
2009 1.1 ross if ( bSig == 0 ) {
2010 1.1 ross float_raise( float_flag_invalid );
2011 1.1 ross return float32_default_nan;
2012 1.1 ross }
2013 1.1 ross normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2014 1.1 ross }
2015 1.1 ross if ( aExp == 0 ) {
2016 1.1 ross if ( aSig == 0 ) return a;
2017 1.1 ross normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2018 1.1 ross }
2019 1.1 ross expDiff = aExp - bExp;
2020 1.1 ross aSig |= 0x00800000;
2021 1.1 ross bSig |= 0x00800000;
2022 1.1 ross if ( expDiff < 32 ) {
2023 1.1 ross aSig <<= 8;
2024 1.1 ross bSig <<= 8;
2025 1.1 ross if ( expDiff < 0 ) {
2026 1.1 ross if ( expDiff < -1 ) return a;
2027 1.1 ross aSig >>= 1;
2028 1.1 ross }
2029 1.1 ross q = ( bSig <= aSig );
2030 1.1 ross if ( q ) aSig -= bSig;
2031 1.1 ross if ( 0 < expDiff ) {
2032 1.1 ross q = ( ( (bits64) aSig )<<32 ) / bSig;
2033 1.1 ross q >>= 32 - expDiff;
2034 1.1 ross bSig >>= 2;
2035 1.1 ross aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2036 1.1 ross }
2037 1.1 ross else {
2038 1.1 ross aSig >>= 2;
2039 1.1 ross bSig >>= 2;
2040 1.1 ross }
2041 1.1 ross }
2042 1.1 ross else {
2043 1.1 ross if ( bSig <= aSig ) aSig -= bSig;
2044 1.1 ross aSig64 = ( (bits64) aSig )<<40;
2045 1.1 ross bSig64 = ( (bits64) bSig )<<40;
2046 1.1 ross expDiff -= 64;
2047 1.1 ross while ( 0 < expDiff ) {
2048 1.1 ross q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2049 1.1 ross q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2050 1.1 ross aSig64 = - ( ( bSig * q64 )<<38 );
2051 1.1 ross expDiff -= 62;
2052 1.1 ross }
2053 1.1 ross expDiff += 64;
2054 1.1 ross q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2055 1.1 ross q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2056 1.1 ross q = q64>>( 64 - expDiff );
2057 1.1 ross bSig <<= 6;
2058 1.1 ross aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2059 1.1 ross }
2060 1.1 ross do {
2061 1.1 ross alternateASig = aSig;
2062 1.1 ross ++q;
2063 1.1 ross aSig -= bSig;
2064 1.1 ross } while ( 0 <= (sbits32) aSig );
2065 1.1 ross sigMean = aSig + alternateASig;
2066 1.1 ross if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2067 1.1 ross aSig = alternateASig;
2068 1.1 ross }
2069 1.1 ross zSign = ( (sbits32) aSig < 0 );
2070 1.1 ross if ( zSign ) aSig = - aSig;
2071 1.1 ross return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2072 1.1 ross
2073 1.1 ross }
2074 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
2075 1.1 ross
2076 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2077 1.1 ross /*
2078 1.1 ross -------------------------------------------------------------------------------
2079 1.1 ross Returns the square root of the single-precision floating-point value `a'.
2080 1.1 ross The operation is performed according to the IEC/IEEE Standard for Binary
2081 1.1 ross Floating-Point Arithmetic.
2082 1.1 ross -------------------------------------------------------------------------------
2083 1.1 ross */
2084 1.1 ross float32 float32_sqrt( float32 a )
2085 1.1 ross {
2086 1.1 ross flag aSign;
2087 1.1 ross int16 aExp, zExp;
2088 1.1 ross bits32 aSig, zSig;
2089 1.1 ross bits64 rem, term;
2090 1.1 ross
2091 1.1 ross aSig = extractFloat32Frac( a );
2092 1.1 ross aExp = extractFloat32Exp( a );
2093 1.1 ross aSign = extractFloat32Sign( a );
2094 1.1 ross if ( aExp == 0xFF ) {
2095 1.1 ross if ( aSig ) return propagateFloat32NaN( a, 0 );
2096 1.1 ross if ( ! aSign ) return a;
2097 1.1 ross float_raise( float_flag_invalid );
2098 1.1 ross return float32_default_nan;
2099 1.1 ross }
2100 1.1 ross if ( aSign ) {
2101 1.1 ross if ( ( aExp | aSig ) == 0 ) return a;
2102 1.1 ross float_raise( float_flag_invalid );
2103 1.1 ross return float32_default_nan;
2104 1.1 ross }
2105 1.1 ross if ( aExp == 0 ) {
2106 1.1 ross if ( aSig == 0 ) return 0;
2107 1.1 ross normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2108 1.1 ross }
2109 1.1 ross zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2110 1.1 ross aSig = ( aSig | 0x00800000 )<<8;
2111 1.1 ross zSig = estimateSqrt32( aExp, aSig ) + 2;
2112 1.1 ross if ( ( zSig & 0x7F ) <= 5 ) {
2113 1.1 ross if ( zSig < 2 ) {
2114 1.1 ross zSig = 0x7FFFFFFF;
2115 1.1 ross goto roundAndPack;
2116 1.1 ross }
2117 1.1 ross aSig >>= aExp & 1;
2118 1.1 ross term = ( (bits64) zSig ) * zSig;
2119 1.1 ross rem = ( ( (bits64) aSig )<<32 ) - term;
2120 1.1 ross while ( (sbits64) rem < 0 ) {
2121 1.1 ross --zSig;
2122 1.1 ross rem += ( ( (bits64) zSig )<<1 ) | 1;
2123 1.1 ross }
2124 1.1 ross zSig |= ( rem != 0 );
2125 1.1 ross }
2126 1.1 ross shift32RightJamming( zSig, 1, &zSig );
2127 1.1 ross roundAndPack:
2128 1.1 ross return roundAndPackFloat32( 0, zExp, zSig );
2129 1.1 ross
2130 1.1 ross }
2131 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
2132 1.1 ross
2133 1.1 ross /*
2134 1.1 ross -------------------------------------------------------------------------------
2135 1.1 ross Returns 1 if the single-precision floating-point value `a' is equal to
2136 1.1 ross the corresponding value `b', and 0 otherwise. The comparison is performed
2137 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2138 1.1 ross -------------------------------------------------------------------------------
2139 1.1 ross */
2140 1.1 ross flag float32_eq( float32 a, float32 b )
2141 1.1 ross {
2142 1.1 ross
2143 1.1 ross if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2144 1.1 ross || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2145 1.1 ross ) {
2146 1.1 ross if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2147 1.1 ross float_raise( float_flag_invalid );
2148 1.1 ross }
2149 1.1 ross return 0;
2150 1.1 ross }
2151 1.1 ross return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2152 1.1 ross
2153 1.1 ross }
2154 1.1 ross
2155 1.1 ross /*
2156 1.1 ross -------------------------------------------------------------------------------
2157 1.1 ross Returns 1 if the single-precision floating-point value `a' is less than
2158 1.1 ross or equal to the corresponding value `b', and 0 otherwise. The comparison
2159 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
2160 1.1 ross Arithmetic.
2161 1.1 ross -------------------------------------------------------------------------------
2162 1.1 ross */
2163 1.1 ross flag float32_le( float32 a, float32 b )
2164 1.1 ross {
2165 1.1 ross flag aSign, bSign;
2166 1.1 ross
2167 1.1 ross if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2168 1.1 ross || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2169 1.1 ross ) {
2170 1.1 ross float_raise( float_flag_invalid );
2171 1.1 ross return 0;
2172 1.1 ross }
2173 1.1 ross aSign = extractFloat32Sign( a );
2174 1.1 ross bSign = extractFloat32Sign( b );
2175 1.1 ross if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2176 1.1 ross return ( a == b ) || ( aSign ^ ( a < b ) );
2177 1.1 ross
2178 1.1 ross }
2179 1.1 ross
2180 1.1 ross /*
2181 1.1 ross -------------------------------------------------------------------------------
2182 1.1 ross Returns 1 if the single-precision floating-point value `a' is less than
2183 1.1 ross the corresponding value `b', and 0 otherwise. The comparison is performed
2184 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2185 1.1 ross -------------------------------------------------------------------------------
2186 1.1 ross */
2187 1.1 ross flag float32_lt( float32 a, float32 b )
2188 1.1 ross {
2189 1.1 ross flag aSign, bSign;
2190 1.1 ross
2191 1.1 ross if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2192 1.1 ross || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2193 1.1 ross ) {
2194 1.1 ross float_raise( float_flag_invalid );
2195 1.1 ross return 0;
2196 1.1 ross }
2197 1.1 ross aSign = extractFloat32Sign( a );
2198 1.1 ross bSign = extractFloat32Sign( b );
2199 1.1 ross if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2200 1.1 ross return ( a != b ) && ( aSign ^ ( a < b ) );
2201 1.1 ross
2202 1.1 ross }
2203 1.1 ross
2204 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2205 1.1 ross /*
2206 1.1 ross -------------------------------------------------------------------------------
2207 1.1 ross Returns 1 if the single-precision floating-point value `a' is equal to
2208 1.1 ross the corresponding value `b', and 0 otherwise. The invalid exception is
2209 1.1 ross raised if either operand is a NaN. Otherwise, the comparison is performed
2210 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2211 1.1 ross -------------------------------------------------------------------------------
2212 1.1 ross */
2213 1.1 ross flag float32_eq_signaling( float32 a, float32 b )
2214 1.1 ross {
2215 1.1 ross
2216 1.1 ross if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2217 1.1 ross || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2218 1.1 ross ) {
2219 1.1 ross float_raise( float_flag_invalid );
2220 1.1 ross return 0;
2221 1.1 ross }
2222 1.1 ross return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2223 1.1 ross
2224 1.1 ross }
2225 1.1 ross
2226 1.1 ross /*
2227 1.1 ross -------------------------------------------------------------------------------
2228 1.1 ross Returns 1 if the single-precision floating-point value `a' is less than or
2229 1.1 ross equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2230 1.1 ross cause an exception. Otherwise, the comparison is performed according to the
2231 1.1 ross IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2232 1.1 ross -------------------------------------------------------------------------------
2233 1.1 ross */
2234 1.1 ross flag float32_le_quiet( float32 a, float32 b )
2235 1.1 ross {
2236 1.1 ross flag aSign, bSign;
2237 1.1 ross
2238 1.1 ross if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2239 1.1 ross || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2240 1.1 ross ) {
2241 1.1 ross if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2242 1.1 ross float_raise( float_flag_invalid );
2243 1.1 ross }
2244 1.1 ross return 0;
2245 1.1 ross }
2246 1.1 ross aSign = extractFloat32Sign( a );
2247 1.1 ross bSign = extractFloat32Sign( b );
2248 1.1 ross if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2249 1.1 ross return ( a == b ) || ( aSign ^ ( a < b ) );
2250 1.1 ross
2251 1.1 ross }
2252 1.1 ross
2253 1.1 ross /*
2254 1.1 ross -------------------------------------------------------------------------------
2255 1.1 ross Returns 1 if the single-precision floating-point value `a' is less than
2256 1.1 ross the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2257 1.1 ross exception. Otherwise, the comparison is performed according to the IEC/IEEE
2258 1.1 ross Standard for Binary Floating-Point Arithmetic.
2259 1.1 ross -------------------------------------------------------------------------------
2260 1.1 ross */
2261 1.1 ross flag float32_lt_quiet( float32 a, float32 b )
2262 1.1 ross {
2263 1.1 ross flag aSign, bSign;
2264 1.1 ross
2265 1.1 ross if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2266 1.1 ross || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2267 1.1 ross ) {
2268 1.1 ross if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2269 1.1 ross float_raise( float_flag_invalid );
2270 1.1 ross }
2271 1.1 ross return 0;
2272 1.1 ross }
2273 1.1 ross aSign = extractFloat32Sign( a );
2274 1.1 ross bSign = extractFloat32Sign( b );
2275 1.1 ross if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2276 1.1 ross return ( a != b ) && ( aSign ^ ( a < b ) );
2277 1.1 ross
2278 1.1 ross }
2279 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
2280 1.1 ross
2281 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2282 1.1 ross /*
2283 1.1 ross -------------------------------------------------------------------------------
2284 1.1 ross Returns the result of converting the double-precision floating-point value
2285 1.1 ross `a' to the 32-bit two's complement integer format. The conversion is
2286 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
2287 1.1 ross Arithmetic---which means in particular that the conversion is rounded
2288 1.1 ross according to the current rounding mode. If `a' is a NaN, the largest
2289 1.1 ross positive integer is returned. Otherwise, if the conversion overflows, the
2290 1.1 ross largest integer with the same sign as `a' is returned.
2291 1.1 ross -------------------------------------------------------------------------------
2292 1.1 ross */
2293 1.1 ross int32 float64_to_int32( float64 a )
2294 1.1 ross {
2295 1.1 ross flag aSign;
2296 1.1 ross int16 aExp, shiftCount;
2297 1.1 ross bits64 aSig;
2298 1.1 ross
2299 1.1 ross aSig = extractFloat64Frac( a );
2300 1.1 ross aExp = extractFloat64Exp( a );
2301 1.1 ross aSign = extractFloat64Sign( a );
2302 1.1 ross if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2303 1.1 ross if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2304 1.1 ross shiftCount = 0x42C - aExp;
2305 1.1 ross if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2306 1.1 ross return roundAndPackInt32( aSign, aSig );
2307 1.1 ross
2308 1.1 ross }
2309 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
2310 1.1 ross
2311 1.1 ross /*
2312 1.1 ross -------------------------------------------------------------------------------
2313 1.1 ross Returns the result of converting the double-precision floating-point value
2314 1.1 ross `a' to the 32-bit two's complement integer format. The conversion is
2315 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
2316 1.1 ross Arithmetic, except that the conversion is always rounded toward zero.
2317 1.1 ross If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2318 1.1 ross the conversion overflows, the largest integer with the same sign as `a' is
2319 1.1 ross returned.
2320 1.1 ross -------------------------------------------------------------------------------
2321 1.1 ross */
2322 1.1 ross int32 float64_to_int32_round_to_zero( float64 a )
2323 1.1 ross {
2324 1.1 ross flag aSign;
2325 1.1 ross int16 aExp, shiftCount;
2326 1.1 ross bits64 aSig, savedASig;
2327 1.1 ross int32 z;
2328 1.1 ross
2329 1.1 ross aSig = extractFloat64Frac( a );
2330 1.1 ross aExp = extractFloat64Exp( a );
2331 1.1 ross aSign = extractFloat64Sign( a );
2332 1.1 ross if ( 0x41E < aExp ) {
2333 1.1 ross if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2334 1.1 ross goto invalid;
2335 1.1 ross }
2336 1.1 ross else if ( aExp < 0x3FF ) {
2337 1.1 ross if ( aExp || aSig ) float_set_inexact();
2338 1.1 ross return 0;
2339 1.1 ross }
2340 1.1 ross aSig |= LIT64( 0x0010000000000000 );
2341 1.1 ross shiftCount = 0x433 - aExp;
2342 1.1 ross savedASig = aSig;
2343 1.1 ross aSig >>= shiftCount;
2344 1.1 ross z = aSig;
2345 1.1 ross if ( aSign ) z = - z;
2346 1.1 ross if ( ( z < 0 ) ^ aSign ) {
2347 1.1 ross invalid:
2348 1.1 ross float_raise( float_flag_invalid );
2349 1.1 ross return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2350 1.1 ross }
2351 1.1 ross if ( ( aSig<<shiftCount ) != savedASig ) {
2352 1.1 ross float_set_inexact();
2353 1.1 ross }
2354 1.1 ross return z;
2355 1.1 ross
2356 1.1 ross }
2357 1.1 ross
2358 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2359 1.1 ross /*
2360 1.1 ross -------------------------------------------------------------------------------
2361 1.1 ross Returns the result of converting the double-precision floating-point value
2362 1.1 ross `a' to the 64-bit two's complement integer format. The conversion is
2363 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
2364 1.1 ross Arithmetic---which means in particular that the conversion is rounded
2365 1.1 ross according to the current rounding mode. If `a' is a NaN, the largest
2366 1.1 ross positive integer is returned. Otherwise, if the conversion overflows, the
2367 1.1 ross largest integer with the same sign as `a' is returned.
2368 1.1 ross -------------------------------------------------------------------------------
2369 1.1 ross */
2370 1.1 ross int64 float64_to_int64( float64 a )
2371 1.1 ross {
2372 1.1 ross flag aSign;
2373 1.1 ross int16 aExp, shiftCount;
2374 1.1 ross bits64 aSig, aSigExtra;
2375 1.1 ross
2376 1.1 ross aSig = extractFloat64Frac( a );
2377 1.1 ross aExp = extractFloat64Exp( a );
2378 1.1 ross aSign = extractFloat64Sign( a );
2379 1.1 ross if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2380 1.1 ross shiftCount = 0x433 - aExp;
2381 1.1 ross if ( shiftCount <= 0 ) {
2382 1.1 ross if ( 0x43E < aExp ) {
2383 1.1 ross float_raise( float_flag_invalid );
2384 1.1 ross if ( ! aSign
2385 1.1 ross || ( ( aExp == 0x7FF )
2386 1.1 ross && ( aSig != LIT64( 0x0010000000000000 ) ) )
2387 1.1 ross ) {
2388 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
2389 1.1 ross }
2390 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
2391 1.1 ross }
2392 1.1 ross aSigExtra = 0;
2393 1.1 ross aSig <<= - shiftCount;
2394 1.1 ross }
2395 1.1 ross else {
2396 1.1 ross shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2397 1.1 ross }
2398 1.1 ross return roundAndPackInt64( aSign, aSig, aSigExtra );
2399 1.1 ross
2400 1.1 ross }
2401 1.1 ross
2402 1.1 ross /*
2403 1.1 ross -------------------------------------------------------------------------------
2404 1.1 ross Returns the result of converting the double-precision floating-point value
2405 1.1 ross `a' to the 64-bit two's complement integer format. The conversion is
2406 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
2407 1.1 ross Arithmetic, except that the conversion is always rounded toward zero.
2408 1.1 ross If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2409 1.1 ross the conversion overflows, the largest integer with the same sign as `a' is
2410 1.1 ross returned.
2411 1.1 ross -------------------------------------------------------------------------------
2412 1.1 ross */
2413 1.1 ross int64 float64_to_int64_round_to_zero( float64 a )
2414 1.1 ross {
2415 1.1 ross flag aSign;
2416 1.1 ross int16 aExp, shiftCount;
2417 1.1 ross bits64 aSig;
2418 1.1 ross int64 z;
2419 1.1 ross
2420 1.1 ross aSig = extractFloat64Frac( a );
2421 1.1 ross aExp = extractFloat64Exp( a );
2422 1.1 ross aSign = extractFloat64Sign( a );
2423 1.1 ross if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2424 1.1 ross shiftCount = aExp - 0x433;
2425 1.1 ross if ( 0 <= shiftCount ) {
2426 1.1 ross if ( 0x43E <= aExp ) {
2427 1.1 ross if ( a != LIT64( 0xC3E0000000000000 ) ) {
2428 1.1 ross float_raise( float_flag_invalid );
2429 1.1 ross if ( ! aSign
2430 1.1 ross || ( ( aExp == 0x7FF )
2431 1.1 ross && ( aSig != LIT64( 0x0010000000000000 ) ) )
2432 1.1 ross ) {
2433 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
2434 1.1 ross }
2435 1.1 ross }
2436 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
2437 1.1 ross }
2438 1.1 ross z = aSig<<shiftCount;
2439 1.1 ross }
2440 1.1 ross else {
2441 1.1 ross if ( aExp < 0x3FE ) {
2442 1.1 ross if ( aExp | aSig ) float_set_inexact();
2443 1.1 ross return 0;
2444 1.1 ross }
2445 1.1 ross z = aSig>>( - shiftCount );
2446 1.1 ross if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2447 1.1 ross float_set_inexact();
2448 1.1 ross }
2449 1.1 ross }
2450 1.1 ross if ( aSign ) z = - z;
2451 1.1 ross return z;
2452 1.1 ross
2453 1.1 ross }
2454 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
2455 1.1 ross
2456 1.1 ross /*
2457 1.1 ross -------------------------------------------------------------------------------
2458 1.1 ross Returns the result of converting the double-precision floating-point value
2459 1.1 ross `a' to the single-precision floating-point format. The conversion is
2460 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
2461 1.1 ross Arithmetic.
2462 1.1 ross -------------------------------------------------------------------------------
2463 1.1 ross */
2464 1.1 ross float32 float64_to_float32( float64 a )
2465 1.1 ross {
2466 1.1 ross flag aSign;
2467 1.1 ross int16 aExp;
2468 1.1 ross bits64 aSig;
2469 1.1 ross bits32 zSig;
2470 1.1 ross
2471 1.1 ross aSig = extractFloat64Frac( a );
2472 1.1 ross aExp = extractFloat64Exp( a );
2473 1.1 ross aSign = extractFloat64Sign( a );
2474 1.1 ross if ( aExp == 0x7FF ) {
2475 1.1 ross if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2476 1.1 ross return packFloat32( aSign, 0xFF, 0 );
2477 1.1 ross }
2478 1.1 ross shift64RightJamming( aSig, 22, &aSig );
2479 1.1 ross zSig = aSig;
2480 1.1 ross if ( aExp || zSig ) {
2481 1.1 ross zSig |= 0x40000000;
2482 1.1 ross aExp -= 0x381;
2483 1.1 ross }
2484 1.1 ross return roundAndPackFloat32( aSign, aExp, zSig );
2485 1.1 ross
2486 1.1 ross }
2487 1.1 ross
2488 1.1 ross #ifdef FLOATX80
2489 1.1 ross
2490 1.1 ross /*
2491 1.1 ross -------------------------------------------------------------------------------
2492 1.1 ross Returns the result of converting the double-precision floating-point value
2493 1.1 ross `a' to the extended double-precision floating-point format. The conversion
2494 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
2495 1.1 ross Arithmetic.
2496 1.1 ross -------------------------------------------------------------------------------
2497 1.1 ross */
2498 1.1 ross floatx80 float64_to_floatx80( float64 a )
2499 1.1 ross {
2500 1.1 ross flag aSign;
2501 1.1 ross int16 aExp;
2502 1.1 ross bits64 aSig;
2503 1.1 ross
2504 1.1 ross aSig = extractFloat64Frac( a );
2505 1.1 ross aExp = extractFloat64Exp( a );
2506 1.1 ross aSign = extractFloat64Sign( a );
2507 1.1 ross if ( aExp == 0x7FF ) {
2508 1.1 ross if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2509 1.1 ross return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2510 1.1 ross }
2511 1.1 ross if ( aExp == 0 ) {
2512 1.1 ross if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2513 1.1 ross normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2514 1.1 ross }
2515 1.1 ross return
2516 1.1 ross packFloatx80(
2517 1.1 ross aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2518 1.1 ross
2519 1.1 ross }
2520 1.1 ross
2521 1.1 ross #endif
2522 1.1 ross
2523 1.1 ross #ifdef FLOAT128
2524 1.1 ross
2525 1.1 ross /*
2526 1.1 ross -------------------------------------------------------------------------------
2527 1.1 ross Returns the result of converting the double-precision floating-point value
2528 1.1 ross `a' to the quadruple-precision floating-point format. The conversion is
2529 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
2530 1.1 ross Arithmetic.
2531 1.1 ross -------------------------------------------------------------------------------
2532 1.1 ross */
2533 1.1 ross float128 float64_to_float128( float64 a )
2534 1.1 ross {
2535 1.1 ross flag aSign;
2536 1.1 ross int16 aExp;
2537 1.1 ross bits64 aSig, zSig0, zSig1;
2538 1.1 ross
2539 1.1 ross aSig = extractFloat64Frac( a );
2540 1.1 ross aExp = extractFloat64Exp( a );
2541 1.1 ross aSign = extractFloat64Sign( a );
2542 1.1 ross if ( aExp == 0x7FF ) {
2543 1.1 ross if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2544 1.1 ross return packFloat128( aSign, 0x7FFF, 0, 0 );
2545 1.1 ross }
2546 1.1 ross if ( aExp == 0 ) {
2547 1.1 ross if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2548 1.1 ross normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2549 1.1 ross --aExp;
2550 1.1 ross }
2551 1.1 ross shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2552 1.1 ross return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2553 1.1 ross
2554 1.1 ross }
2555 1.1 ross
2556 1.1 ross #endif
2557 1.1 ross
2558 1.1 ross #ifndef SOFTFLOAT_FOR_GCC
2559 1.1 ross /*
2560 1.1 ross -------------------------------------------------------------------------------
2561 1.1 ross Rounds the double-precision floating-point value `a' to an integer, and
2562 1.1 ross returns the result as a double-precision floating-point value. The
2563 1.1 ross operation is performed according to the IEC/IEEE Standard for Binary
2564 1.1 ross Floating-Point Arithmetic.
2565 1.1 ross -------------------------------------------------------------------------------
2566 1.1 ross */
2567 1.1 ross float64 float64_round_to_int( float64 a )
2568 1.1 ross {
2569 1.1 ross flag aSign;
2570 1.1 ross int16 aExp;
2571 1.1 ross bits64 lastBitMask, roundBitsMask;
2572 1.1 ross int8 roundingMode;
2573 1.1 ross float64 z;
2574 1.1 ross
2575 1.1 ross aExp = extractFloat64Exp( a );
2576 1.1 ross if ( 0x433 <= aExp ) {
2577 1.1 ross if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2578 1.1 ross return propagateFloat64NaN( a, a );
2579 1.1 ross }
2580 1.1 ross return a;
2581 1.1 ross }
2582 1.1 ross if ( aExp < 0x3FF ) {
2583 1.1 ross if ( (bits64) ( a<<1 ) == 0 ) return a;
2584 1.1 ross float_set_inexact();
2585 1.1 ross aSign = extractFloat64Sign( a );
2586 1.1 ross switch ( float_rounding_mode() ) {
2587 1.1 ross case float_round_nearest_even:
2588 1.1 ross if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2589 1.1 ross return packFloat64( aSign, 0x3FF, 0 );
2590 1.1 ross }
2591 1.1 ross break;
2592 1.1 ross case float_round_down:
2593 1.1 ross return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2594 1.1 ross case float_round_up:
2595 1.1 ross return
2596 1.1 ross aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2597 1.1 ross }
2598 1.1 ross return packFloat64( aSign, 0, 0 );
2599 1.1 ross }
2600 1.1 ross lastBitMask = 1;
2601 1.1 ross lastBitMask <<= 0x433 - aExp;
2602 1.1 ross roundBitsMask = lastBitMask - 1;
2603 1.1 ross z = a;
2604 1.1 ross roundingMode = float_rounding_mode();
2605 1.1 ross if ( roundingMode == float_round_nearest_even ) {
2606 1.1 ross z += lastBitMask>>1;
2607 1.1 ross if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2608 1.1 ross }
2609 1.1 ross else if ( roundingMode != float_round_to_zero ) {
2610 1.1 ross if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2611 1.1 ross z += roundBitsMask;
2612 1.1 ross }
2613 1.1 ross }
2614 1.1 ross z &= ~ roundBitsMask;
2615 1.1 ross if ( z != a ) float_set_inexact();
2616 1.1 ross return z;
2617 1.1 ross
2618 1.1 ross }
2619 1.1 ross #endif
2620 1.1 ross
2621 1.1 ross /*
2622 1.1 ross -------------------------------------------------------------------------------
2623 1.1 ross Returns the result of adding the absolute values of the double-precision
2624 1.1 ross floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2625 1.1 ross before being returned. `zSign' is ignored if the result is a NaN.
2626 1.1 ross The addition is performed according to the IEC/IEEE Standard for Binary
2627 1.1 ross Floating-Point Arithmetic.
2628 1.1 ross -------------------------------------------------------------------------------
2629 1.1 ross */
2630 1.1 ross static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2631 1.1 ross {
2632 1.1 ross int16 aExp, bExp, zExp;
2633 1.1 ross bits64 aSig, bSig, zSig;
2634 1.1 ross int16 expDiff;
2635 1.1 ross
2636 1.1 ross aSig = extractFloat64Frac( a );
2637 1.1 ross aExp = extractFloat64Exp( a );
2638 1.1 ross bSig = extractFloat64Frac( b );
2639 1.1 ross bExp = extractFloat64Exp( b );
2640 1.1 ross expDiff = aExp - bExp;
2641 1.1 ross aSig <<= 9;
2642 1.1 ross bSig <<= 9;
2643 1.1 ross if ( 0 < expDiff ) {
2644 1.1 ross if ( aExp == 0x7FF ) {
2645 1.1 ross if ( aSig ) return propagateFloat64NaN( a, b );
2646 1.1 ross return a;
2647 1.1 ross }
2648 1.1 ross if ( bExp == 0 ) {
2649 1.1 ross --expDiff;
2650 1.1 ross }
2651 1.1 ross else {
2652 1.1 ross bSig |= LIT64( 0x2000000000000000 );
2653 1.1 ross }
2654 1.1 ross shift64RightJamming( bSig, expDiff, &bSig );
2655 1.1 ross zExp = aExp;
2656 1.1 ross }
2657 1.1 ross else if ( expDiff < 0 ) {
2658 1.1 ross if ( bExp == 0x7FF ) {
2659 1.1 ross if ( bSig ) return propagateFloat64NaN( a, b );
2660 1.1 ross return packFloat64( zSign, 0x7FF, 0 );
2661 1.1 ross }
2662 1.1 ross if ( aExp == 0 ) {
2663 1.1 ross ++expDiff;
2664 1.1 ross }
2665 1.1 ross else {
2666 1.1 ross aSig |= LIT64( 0x2000000000000000 );
2667 1.1 ross }
2668 1.1 ross shift64RightJamming( aSig, - expDiff, &aSig );
2669 1.1 ross zExp = bExp;
2670 1.1 ross }
2671 1.1 ross else {
2672 1.1 ross if ( aExp == 0x7FF ) {
2673 1.1 ross if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2674 1.1 ross return a;
2675 1.1 ross }
2676 1.1 ross if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2677 1.1 ross zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2678 1.1 ross zExp = aExp;
2679 1.1 ross goto roundAndPack;
2680 1.1 ross }
2681 1.1 ross aSig |= LIT64( 0x2000000000000000 );
2682 1.1 ross zSig = ( aSig + bSig )<<1;
2683 1.1 ross --zExp;
2684 1.1 ross if ( (sbits64) zSig < 0 ) {
2685 1.1 ross zSig = aSig + bSig;
2686 1.1 ross ++zExp;
2687 1.1 ross }
2688 1.1 ross roundAndPack:
2689 1.1 ross return roundAndPackFloat64( zSign, zExp, zSig );
2690 1.1 ross
2691 1.1 ross }
2692 1.1 ross
2693 1.1 ross /*
2694 1.1 ross -------------------------------------------------------------------------------
2695 1.1 ross Returns the result of subtracting the absolute values of the double-
2696 1.1 ross precision floating-point values `a' and `b'. If `zSign' is 1, the
2697 1.1 ross difference is negated before being returned. `zSign' is ignored if the
2698 1.1 ross result is a NaN. The subtraction is performed according to the IEC/IEEE
2699 1.1 ross Standard for Binary Floating-Point Arithmetic.
2700 1.1 ross -------------------------------------------------------------------------------
2701 1.1 ross */
2702 1.1 ross static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2703 1.1 ross {
2704 1.1 ross int16 aExp, bExp, zExp;
2705 1.1 ross bits64 aSig, bSig, zSig;
2706 1.1 ross int16 expDiff;
2707 1.1 ross
2708 1.1 ross aSig = extractFloat64Frac( a );
2709 1.1 ross aExp = extractFloat64Exp( a );
2710 1.1 ross bSig = extractFloat64Frac( b );
2711 1.1 ross bExp = extractFloat64Exp( b );
2712 1.1 ross expDiff = aExp - bExp;
2713 1.1 ross aSig <<= 10;
2714 1.1 ross bSig <<= 10;
2715 1.1 ross if ( 0 < expDiff ) goto aExpBigger;
2716 1.1 ross if ( expDiff < 0 ) goto bExpBigger;
2717 1.1 ross if ( aExp == 0x7FF ) {
2718 1.1 ross if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2719 1.1 ross float_raise( float_flag_invalid );
2720 1.1 ross return float64_default_nan;
2721 1.1 ross }
2722 1.1 ross if ( aExp == 0 ) {
2723 1.1 ross aExp = 1;
2724 1.1 ross bExp = 1;
2725 1.1 ross }
2726 1.1 ross if ( bSig < aSig ) goto aBigger;
2727 1.1 ross if ( aSig < bSig ) goto bBigger;
2728 1.1 ross return packFloat64( float_rounding_mode() == float_round_down, 0, 0 );
2729 1.1 ross bExpBigger:
2730 1.1 ross if ( bExp == 0x7FF ) {
2731 1.1 ross if ( bSig ) return propagateFloat64NaN( a, b );
2732 1.1 ross return packFloat64( zSign ^ 1, 0x7FF, 0 );
2733 1.1 ross }
2734 1.1 ross if ( aExp == 0 ) {
2735 1.1 ross ++expDiff;
2736 1.1 ross }
2737 1.1 ross else {
2738 1.1 ross aSig |= LIT64( 0x4000000000000000 );
2739 1.1 ross }
2740 1.1 ross shift64RightJamming( aSig, - expDiff, &aSig );
2741 1.1 ross bSig |= LIT64( 0x4000000000000000 );
2742 1.1 ross bBigger:
2743 1.1 ross zSig = bSig - aSig;
2744 1.1 ross zExp = bExp;
2745 1.1 ross zSign ^= 1;
2746 1.1 ross goto normalizeRoundAndPack;
2747 1.1 ross aExpBigger:
2748 1.1 ross if ( aExp == 0x7FF ) {
2749 1.1 ross if ( aSig ) return propagateFloat64NaN( a, b );
2750 1.1 ross return a;
2751 1.1 ross }
2752 1.1 ross if ( bExp == 0 ) {
2753 1.1 ross --expDiff;
2754 1.1 ross }
2755 1.1 ross else {
2756 1.1 ross bSig |= LIT64( 0x4000000000000000 );
2757 1.1 ross }
2758 1.1 ross shift64RightJamming( bSig, expDiff, &bSig );
2759 1.1 ross aSig |= LIT64( 0x4000000000000000 );
2760 1.1 ross aBigger:
2761 1.1 ross zSig = aSig - bSig;
2762 1.1 ross zExp = aExp;
2763 1.1 ross normalizeRoundAndPack:
2764 1.1 ross --zExp;
2765 1.1 ross return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2766 1.1 ross
2767 1.1 ross }
2768 1.1 ross
2769 1.1 ross /*
2770 1.1 ross -------------------------------------------------------------------------------
2771 1.1 ross Returns the result of adding the double-precision floating-point values `a'
2772 1.1 ross and `b'. The operation is performed according to the IEC/IEEE Standard for
2773 1.1 ross Binary Floating-Point Arithmetic.
2774 1.1 ross -------------------------------------------------------------------------------
2775 1.1 ross */
2776 1.1 ross float64 float64_add( float64 a, float64 b )
2777 1.1 ross {
2778 1.1 ross flag aSign, bSign;
2779 1.1 ross
2780 1.1 ross aSign = extractFloat64Sign( a );
2781 1.1 ross bSign = extractFloat64Sign( b );
2782 1.1 ross if ( aSign == bSign ) {
2783 1.1 ross return addFloat64Sigs( a, b, aSign );
2784 1.1 ross }
2785 1.1 ross else {
2786 1.1 ross return subFloat64Sigs( a, b, aSign );
2787 1.1 ross }
2788 1.1 ross
2789 1.1 ross }
2790 1.1 ross
2791 1.1 ross /*
2792 1.1 ross -------------------------------------------------------------------------------
2793 1.1 ross Returns the result of subtracting the double-precision floating-point values
2794 1.1 ross `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2795 1.1 ross for Binary Floating-Point Arithmetic.
2796 1.1 ross -------------------------------------------------------------------------------
2797 1.1 ross */
2798 1.1 ross float64 float64_sub( float64 a, float64 b )
2799 1.1 ross {
2800 1.1 ross flag aSign, bSign;
2801 1.1 ross
2802 1.1 ross aSign = extractFloat64Sign( a );
2803 1.1 ross bSign = extractFloat64Sign( b );
2804 1.1 ross if ( aSign == bSign ) {
2805 1.1 ross return subFloat64Sigs( a, b, aSign );
2806 1.1 ross }
2807 1.1 ross else {
2808 1.1 ross return addFloat64Sigs( a, b, aSign );
2809 1.1 ross }
2810 1.1 ross
2811 1.1 ross }
2812 1.1 ross
2813 1.1 ross /*
2814 1.1 ross -------------------------------------------------------------------------------
2815 1.1 ross Returns the result of multiplying the double-precision floating-point values
2816 1.1 ross `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2817 1.1 ross for Binary Floating-Point Arithmetic.
2818 1.1 ross -------------------------------------------------------------------------------
2819 1.1 ross */
2820 1.1 ross float64 float64_mul( float64 a, float64 b )
2821 1.1 ross {
2822 1.1 ross flag aSign, bSign, zSign;
2823 1.1 ross int16 aExp, bExp, zExp;
2824 1.1 ross bits64 aSig, bSig, zSig0, zSig1;
2825 1.1 ross
2826 1.1 ross aSig = extractFloat64Frac( a );
2827 1.1 ross aExp = extractFloat64Exp( a );
2828 1.1 ross aSign = extractFloat64Sign( a );
2829 1.1 ross bSig = extractFloat64Frac( b );
2830 1.1 ross bExp = extractFloat64Exp( b );
2831 1.1 ross bSign = extractFloat64Sign( b );
2832 1.1 ross zSign = aSign ^ bSign;
2833 1.1 ross if ( aExp == 0x7FF ) {
2834 1.1 ross if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2835 1.1 ross return propagateFloat64NaN( a, b );
2836 1.1 ross }
2837 1.1 ross if ( ( bExp | bSig ) == 0 ) {
2838 1.1 ross float_raise( float_flag_invalid );
2839 1.1 ross return float64_default_nan;
2840 1.1 ross }
2841 1.1 ross return packFloat64( zSign, 0x7FF, 0 );
2842 1.1 ross }
2843 1.1 ross if ( bExp == 0x7FF ) {
2844 1.1 ross if ( bSig ) return propagateFloat64NaN( a, b );
2845 1.1 ross if ( ( aExp | aSig ) == 0 ) {
2846 1.1 ross float_raise( float_flag_invalid );
2847 1.1 ross return float64_default_nan;
2848 1.1 ross }
2849 1.1 ross return packFloat64( zSign, 0x7FF, 0 );
2850 1.1 ross }
2851 1.1 ross if ( aExp == 0 ) {
2852 1.1 ross if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2853 1.1 ross normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2854 1.1 ross }
2855 1.1 ross if ( bExp == 0 ) {
2856 1.1 ross if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2857 1.1 ross normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2858 1.1 ross }
2859 1.1 ross zExp = aExp + bExp - 0x3FF;
2860 1.1 ross aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2861 1.1 ross bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2862 1.1 ross mul64To128( aSig, bSig, &zSig0, &zSig1 );
2863 1.1 ross zSig0 |= ( zSig1 != 0 );
2864 1.1 ross if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2865 1.1 ross zSig0 <<= 1;
2866 1.1 ross --zExp;
2867 1.1 ross }
2868 1.1 ross return roundAndPackFloat64( zSign, zExp, zSig0 );
2869 1.1 ross
2870 1.1 ross }
2871 1.1 ross
2872 1.1 ross /*
2873 1.1 ross -------------------------------------------------------------------------------
2874 1.1 ross Returns the result of dividing the double-precision floating-point value `a'
2875 1.1 ross by the corresponding value `b'. The operation is performed according to
2876 1.1 ross the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2877 1.1 ross -------------------------------------------------------------------------------
2878 1.1 ross */
2879 1.1 ross float64 float64_div( float64 a, float64 b )
2880 1.1 ross {
2881 1.1 ross flag aSign, bSign, zSign;
2882 1.1 ross int16 aExp, bExp, zExp;
2883 1.1 ross bits64 aSig, bSig, zSig;
2884 1.1 ross bits64 rem0, rem1;
2885 1.1 ross bits64 term0, term1;
2886 1.1 ross
2887 1.1 ross aSig = extractFloat64Frac( a );
2888 1.1 ross aExp = extractFloat64Exp( a );
2889 1.1 ross aSign = extractFloat64Sign( a );
2890 1.1 ross bSig = extractFloat64Frac( b );
2891 1.1 ross bExp = extractFloat64Exp( b );
2892 1.1 ross bSign = extractFloat64Sign( b );
2893 1.1 ross zSign = aSign ^ bSign;
2894 1.1 ross if ( aExp == 0x7FF ) {
2895 1.1 ross if ( aSig ) return propagateFloat64NaN( a, b );
2896 1.1 ross if ( bExp == 0x7FF ) {
2897 1.1 ross if ( bSig ) return propagateFloat64NaN( a, b );
2898 1.1 ross float_raise( float_flag_invalid );
2899 1.1 ross return float64_default_nan;
2900 1.1 ross }
2901 1.1 ross return packFloat64( zSign, 0x7FF, 0 );
2902 1.1 ross }
2903 1.1 ross if ( bExp == 0x7FF ) {
2904 1.1 ross if ( bSig ) return propagateFloat64NaN( a, b );
2905 1.1 ross return packFloat64( zSign, 0, 0 );
2906 1.1 ross }
2907 1.1 ross if ( bExp == 0 ) {
2908 1.1 ross if ( bSig == 0 ) {
2909 1.1 ross if ( ( aExp | aSig ) == 0 ) {
2910 1.1 ross float_raise( float_flag_invalid );
2911 1.1 ross return float64_default_nan;
2912 1.1 ross }
2913 1.1 ross float_raise( float_flag_divbyzero );
2914 1.1 ross return packFloat64( zSign, 0x7FF, 0 );
2915 1.1 ross }
2916 1.1 ross normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2917 1.1 ross }
2918 1.1 ross if ( aExp == 0 ) {
2919 1.1 ross if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2920 1.1 ross normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2921 1.1 ross }
2922 1.1 ross zExp = aExp - bExp + 0x3FD;
2923 1.1 ross aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2924 1.1 ross bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2925 1.1 ross if ( bSig <= ( aSig + aSig ) ) {
2926 1.1 ross aSig >>= 1;
2927 1.1 ross ++zExp;
2928 1.1 ross }
2929 1.1 ross zSig = estimateDiv128To64( aSig, 0, bSig );
2930 1.1 ross if ( ( zSig & 0x1FF ) <= 2 ) {
2931 1.1 ross mul64To128( bSig, zSig, &term0, &term1 );
2932 1.1 ross sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2933 1.1 ross while ( (sbits64) rem0 < 0 ) {
2934 1.1 ross --zSig;
2935 1.1 ross add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2936 1.1 ross }
2937 1.1 ross zSig |= ( rem1 != 0 );
2938 1.1 ross }
2939 1.1 ross return roundAndPackFloat64( zSign, zExp, zSig );
2940 1.1 ross
2941 1.1 ross }
2942 1.1 ross
2943 1.1 ross #ifndef SOFTFLOAT_FOR_GCC
2944 1.1 ross /*
2945 1.1 ross -------------------------------------------------------------------------------
2946 1.1 ross Returns the remainder of the double-precision floating-point value `a'
2947 1.1 ross with respect to the corresponding value `b'. The operation is performed
2948 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2949 1.1 ross -------------------------------------------------------------------------------
2950 1.1 ross */
2951 1.1 ross float64 float64_rem( float64 a, float64 b )
2952 1.1 ross {
2953 1.1 ross flag aSign, bSign, zSign;
2954 1.1 ross int16 aExp, bExp, expDiff;
2955 1.1 ross bits64 aSig, bSig;
2956 1.1 ross bits64 q, alternateASig;
2957 1.1 ross sbits64 sigMean;
2958 1.1 ross
2959 1.1 ross aSig = extractFloat64Frac( a );
2960 1.1 ross aExp = extractFloat64Exp( a );
2961 1.1 ross aSign = extractFloat64Sign( a );
2962 1.1 ross bSig = extractFloat64Frac( b );
2963 1.1 ross bExp = extractFloat64Exp( b );
2964 1.1 ross bSign = extractFloat64Sign( b );
2965 1.1 ross if ( aExp == 0x7FF ) {
2966 1.1 ross if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2967 1.1 ross return propagateFloat64NaN( a, b );
2968 1.1 ross }
2969 1.1 ross float_raise( float_flag_invalid );
2970 1.1 ross return float64_default_nan;
2971 1.1 ross }
2972 1.1 ross if ( bExp == 0x7FF ) {
2973 1.1 ross if ( bSig ) return propagateFloat64NaN( a, b );
2974 1.1 ross return a;
2975 1.1 ross }
2976 1.1 ross if ( bExp == 0 ) {
2977 1.1 ross if ( bSig == 0 ) {
2978 1.1 ross float_raise( float_flag_invalid );
2979 1.1 ross return float64_default_nan;
2980 1.1 ross }
2981 1.1 ross normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2982 1.1 ross }
2983 1.1 ross if ( aExp == 0 ) {
2984 1.1 ross if ( aSig == 0 ) return a;
2985 1.1 ross normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2986 1.1 ross }
2987 1.1 ross expDiff = aExp - bExp;
2988 1.1 ross aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2989 1.1 ross bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2990 1.1 ross if ( expDiff < 0 ) {
2991 1.1 ross if ( expDiff < -1 ) return a;
2992 1.1 ross aSig >>= 1;
2993 1.1 ross }
2994 1.1 ross q = ( bSig <= aSig );
2995 1.1 ross if ( q ) aSig -= bSig;
2996 1.1 ross expDiff -= 64;
2997 1.1 ross while ( 0 < expDiff ) {
2998 1.1 ross q = estimateDiv128To64( aSig, 0, bSig );
2999 1.1 ross q = ( 2 < q ) ? q - 2 : 0;
3000 1.1 ross aSig = - ( ( bSig>>2 ) * q );
3001 1.1 ross expDiff -= 62;
3002 1.1 ross }
3003 1.1 ross expDiff += 64;
3004 1.1 ross if ( 0 < expDiff ) {
3005 1.1 ross q = estimateDiv128To64( aSig, 0, bSig );
3006 1.1 ross q = ( 2 < q ) ? q - 2 : 0;
3007 1.1 ross q >>= 64 - expDiff;
3008 1.1 ross bSig >>= 2;
3009 1.1 ross aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3010 1.1 ross }
3011 1.1 ross else {
3012 1.1 ross aSig >>= 2;
3013 1.1 ross bSig >>= 2;
3014 1.1 ross }
3015 1.1 ross do {
3016 1.1 ross alternateASig = aSig;
3017 1.1 ross ++q;
3018 1.1 ross aSig -= bSig;
3019 1.1 ross } while ( 0 <= (sbits64) aSig );
3020 1.1 ross sigMean = aSig + alternateASig;
3021 1.1 ross if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3022 1.1 ross aSig = alternateASig;
3023 1.1 ross }
3024 1.1 ross zSign = ( (sbits64) aSig < 0 );
3025 1.1 ross if ( zSign ) aSig = - aSig;
3026 1.1 ross return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3027 1.1 ross
3028 1.1 ross }
3029 1.1 ross
3030 1.1 ross /*
3031 1.1 ross -------------------------------------------------------------------------------
3032 1.1 ross Returns the square root of the double-precision floating-point value `a'.
3033 1.1 ross The operation is performed according to the IEC/IEEE Standard for Binary
3034 1.1 ross Floating-Point Arithmetic.
3035 1.1 ross -------------------------------------------------------------------------------
3036 1.1 ross */
3037 1.1 ross float64 float64_sqrt( float64 a )
3038 1.1 ross {
3039 1.1 ross flag aSign;
3040 1.1 ross int16 aExp, zExp;
3041 1.1 ross bits64 aSig, zSig, doubleZSig;
3042 1.1 ross bits64 rem0, rem1, term0, term1;
3043 1.1 ross
3044 1.1 ross aSig = extractFloat64Frac( a );
3045 1.1 ross aExp = extractFloat64Exp( a );
3046 1.1 ross aSign = extractFloat64Sign( a );
3047 1.1 ross if ( aExp == 0x7FF ) {
3048 1.1 ross if ( aSig ) return propagateFloat64NaN( a, a );
3049 1.1 ross if ( ! aSign ) return a;
3050 1.1 ross float_raise( float_flag_invalid );
3051 1.1 ross return float64_default_nan;
3052 1.1 ross }
3053 1.1 ross if ( aSign ) {
3054 1.1 ross if ( ( aExp | aSig ) == 0 ) return a;
3055 1.1 ross float_raise( float_flag_invalid );
3056 1.1 ross return float64_default_nan;
3057 1.1 ross }
3058 1.1 ross if ( aExp == 0 ) {
3059 1.1 ross if ( aSig == 0 ) return 0;
3060 1.1 ross normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3061 1.1 ross }
3062 1.1 ross zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3063 1.1 ross aSig |= LIT64( 0x0010000000000000 );
3064 1.1 ross zSig = estimateSqrt32( aExp, aSig>>21 );
3065 1.1 ross aSig <<= 9 - ( aExp & 1 );
3066 1.1 ross zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3067 1.1 ross if ( ( zSig & 0x1FF ) <= 5 ) {
3068 1.1 ross doubleZSig = zSig<<1;
3069 1.1 ross mul64To128( zSig, zSig, &term0, &term1 );
3070 1.1 ross sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3071 1.1 ross while ( (sbits64) rem0 < 0 ) {
3072 1.1 ross --zSig;
3073 1.1 ross doubleZSig -= 2;
3074 1.1 ross add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3075 1.1 ross }
3076 1.1 ross zSig |= ( ( rem0 | rem1 ) != 0 );
3077 1.1 ross }
3078 1.1 ross return roundAndPackFloat64( 0, zExp, zSig );
3079 1.1 ross
3080 1.1 ross }
3081 1.1 ross #endif
3082 1.1 ross
3083 1.1 ross /*
3084 1.1 ross -------------------------------------------------------------------------------
3085 1.1 ross Returns 1 if the double-precision floating-point value `a' is equal to the
3086 1.1 ross corresponding value `b', and 0 otherwise. The comparison is performed
3087 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3088 1.1 ross -------------------------------------------------------------------------------
3089 1.1 ross */
3090 1.1 ross flag float64_eq( float64 a, float64 b )
3091 1.1 ross {
3092 1.1 ross
3093 1.1 ross if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3094 1.1 ross || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3095 1.1 ross ) {
3096 1.1 ross if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3097 1.1 ross float_raise( float_flag_invalid );
3098 1.1 ross }
3099 1.1 ross return 0;
3100 1.1 ross }
3101 1.1 ross return ( a == b ) ||
3102 1.1 ross ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3103 1.1 ross
3104 1.1 ross }
3105 1.1 ross
3106 1.1 ross /*
3107 1.1 ross -------------------------------------------------------------------------------
3108 1.1 ross Returns 1 if the double-precision floating-point value `a' is less than or
3109 1.1 ross equal to the corresponding value `b', and 0 otherwise. The comparison is
3110 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
3111 1.1 ross Arithmetic.
3112 1.1 ross -------------------------------------------------------------------------------
3113 1.1 ross */
3114 1.1 ross flag float64_le( float64 a, float64 b )
3115 1.1 ross {
3116 1.1 ross flag aSign, bSign;
3117 1.1 ross
3118 1.1 ross if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3119 1.1 ross || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3120 1.1 ross ) {
3121 1.1 ross float_raise( float_flag_invalid );
3122 1.1 ross return 0;
3123 1.1 ross }
3124 1.1 ross aSign = extractFloat64Sign( a );
3125 1.1 ross bSign = extractFloat64Sign( b );
3126 1.1 ross if ( aSign != bSign )
3127 1.1 ross return aSign ||
3128 1.1 ross ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3129 1.1 ross 0 );
3130 1.1 ross return ( a == b ) ||
3131 1.1 ross ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3132 1.1 ross
3133 1.1 ross }
3134 1.1 ross
3135 1.1 ross /*
3136 1.1 ross -------------------------------------------------------------------------------
3137 1.1 ross Returns 1 if the double-precision floating-point value `a' is less than
3138 1.1 ross the corresponding value `b', and 0 otherwise. The comparison is performed
3139 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3140 1.1 ross -------------------------------------------------------------------------------
3141 1.1 ross */
3142 1.1 ross flag float64_lt( float64 a, float64 b )
3143 1.1 ross {
3144 1.1 ross flag aSign, bSign;
3145 1.1 ross
3146 1.1 ross if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3147 1.1 ross || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3148 1.1 ross ) {
3149 1.1 ross float_raise( float_flag_invalid );
3150 1.1 ross return 0;
3151 1.1 ross }
3152 1.1 ross aSign = extractFloat64Sign( a );
3153 1.1 ross bSign = extractFloat64Sign( b );
3154 1.1 ross if ( aSign != bSign )
3155 1.1 ross return aSign &&
3156 1.1 ross ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3157 1.1 ross 0 );
3158 1.1 ross return ( a != b ) &&
3159 1.1 ross ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3160 1.1 ross
3161 1.1 ross }
3162 1.1 ross
3163 1.1 ross #ifndef SOFTFLOAT_FOR_GCC
3164 1.1 ross /*
3165 1.1 ross -------------------------------------------------------------------------------
3166 1.1 ross Returns 1 if the double-precision floating-point value `a' is equal to the
3167 1.1 ross corresponding value `b', and 0 otherwise. The invalid exception is raised
3168 1.1 ross if either operand is a NaN. Otherwise, the comparison is performed
3169 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3170 1.1 ross -------------------------------------------------------------------------------
3171 1.1 ross */
3172 1.1 ross flag float64_eq_signaling( float64 a, float64 b )
3173 1.1 ross {
3174 1.1 ross
3175 1.1 ross if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3176 1.1 ross || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3177 1.1 ross ) {
3178 1.1 ross float_raise( float_flag_invalid );
3179 1.1 ross return 0;
3180 1.1 ross }
3181 1.1 ross return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3182 1.1 ross
3183 1.1 ross }
3184 1.1 ross
3185 1.1 ross /*
3186 1.1 ross -------------------------------------------------------------------------------
3187 1.1 ross Returns 1 if the double-precision floating-point value `a' is less than or
3188 1.1 ross equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3189 1.1 ross cause an exception. Otherwise, the comparison is performed according to the
3190 1.1 ross IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3191 1.1 ross -------------------------------------------------------------------------------
3192 1.1 ross */
3193 1.1 ross flag float64_le_quiet( float64 a, float64 b )
3194 1.1 ross {
3195 1.1 ross flag aSign, bSign;
3196 1.1 ross
3197 1.1 ross if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3198 1.1 ross || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3199 1.1 ross ) {
3200 1.1 ross if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3201 1.1 ross float_raise( float_flag_invalid );
3202 1.1 ross }
3203 1.1 ross return 0;
3204 1.1 ross }
3205 1.1 ross aSign = extractFloat64Sign( a );
3206 1.1 ross bSign = extractFloat64Sign( b );
3207 1.1 ross if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3208 1.1 ross return ( a == b ) || ( aSign ^ ( a < b ) );
3209 1.1 ross
3210 1.1 ross }
3211 1.1 ross
3212 1.1 ross /*
3213 1.1 ross -------------------------------------------------------------------------------
3214 1.1 ross Returns 1 if the double-precision floating-point value `a' is less than
3215 1.1 ross the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3216 1.1 ross exception. Otherwise, the comparison is performed according to the IEC/IEEE
3217 1.1 ross Standard for Binary Floating-Point Arithmetic.
3218 1.1 ross -------------------------------------------------------------------------------
3219 1.1 ross */
3220 1.1 ross flag float64_lt_quiet( float64 a, float64 b )
3221 1.1 ross {
3222 1.1 ross flag aSign, bSign;
3223 1.1 ross
3224 1.1 ross if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3225 1.1 ross || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3226 1.1 ross ) {
3227 1.1 ross if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3228 1.1 ross float_raise( float_flag_invalid );
3229 1.1 ross }
3230 1.1 ross return 0;
3231 1.1 ross }
3232 1.1 ross aSign = extractFloat64Sign( a );
3233 1.1 ross bSign = extractFloat64Sign( b );
3234 1.1 ross if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3235 1.1 ross return ( a != b ) && ( aSign ^ ( a < b ) );
3236 1.1 ross
3237 1.1 ross }
3238 1.1 ross #endif
3239 1.1 ross
3240 1.1 ross #ifdef FLOATX80
3241 1.1 ross
3242 1.1 ross /*
3243 1.1 ross -------------------------------------------------------------------------------
3244 1.1 ross Returns the result of converting the extended double-precision floating-
3245 1.1 ross point value `a' to the 32-bit two's complement integer format. The
3246 1.1 ross conversion is performed according to the IEC/IEEE Standard for Binary
3247 1.1 ross Floating-Point Arithmetic---which means in particular that the conversion
3248 1.1 ross is rounded according to the current rounding mode. If `a' is a NaN, the
3249 1.1 ross largest positive integer is returned. Otherwise, if the conversion
3250 1.1 ross overflows, the largest integer with the same sign as `a' is returned.
3251 1.1 ross -------------------------------------------------------------------------------
3252 1.1 ross */
3253 1.1 ross int32 floatx80_to_int32( floatx80 a )
3254 1.1 ross {
3255 1.1 ross flag aSign;
3256 1.1 ross int32 aExp, shiftCount;
3257 1.1 ross bits64 aSig;
3258 1.1 ross
3259 1.1 ross aSig = extractFloatx80Frac( a );
3260 1.1 ross aExp = extractFloatx80Exp( a );
3261 1.1 ross aSign = extractFloatx80Sign( a );
3262 1.1 ross if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3263 1.1 ross shiftCount = 0x4037 - aExp;
3264 1.1 ross if ( shiftCount <= 0 ) shiftCount = 1;
3265 1.1 ross shift64RightJamming( aSig, shiftCount, &aSig );
3266 1.1 ross return roundAndPackInt32( aSign, aSig );
3267 1.1 ross
3268 1.1 ross }
3269 1.1 ross
3270 1.1 ross /*
3271 1.1 ross -------------------------------------------------------------------------------
3272 1.1 ross Returns the result of converting the extended double-precision floating-
3273 1.1 ross point value `a' to the 32-bit two's complement integer format. The
3274 1.1 ross conversion is performed according to the IEC/IEEE Standard for Binary
3275 1.1 ross Floating-Point Arithmetic, except that the conversion is always rounded
3276 1.1 ross toward zero. If `a' is a NaN, the largest positive integer is returned.
3277 1.1 ross Otherwise, if the conversion overflows, the largest integer with the same
3278 1.1 ross sign as `a' is returned.
3279 1.1 ross -------------------------------------------------------------------------------
3280 1.1 ross */
3281 1.1 ross int32 floatx80_to_int32_round_to_zero( floatx80 a )
3282 1.1 ross {
3283 1.1 ross flag aSign;
3284 1.1 ross int32 aExp, shiftCount;
3285 1.1 ross bits64 aSig, savedASig;
3286 1.1 ross int32 z;
3287 1.1 ross
3288 1.1 ross aSig = extractFloatx80Frac( a );
3289 1.1 ross aExp = extractFloatx80Exp( a );
3290 1.1 ross aSign = extractFloatx80Sign( a );
3291 1.1 ross if ( 0x401E < aExp ) {
3292 1.1 ross if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3293 1.1 ross goto invalid;
3294 1.1 ross }
3295 1.1 ross else if ( aExp < 0x3FFF ) {
3296 1.1 ross if ( aExp || aSig ) float_set_inexact();
3297 1.1 ross return 0;
3298 1.1 ross }
3299 1.1 ross shiftCount = 0x403E - aExp;
3300 1.1 ross savedASig = aSig;
3301 1.1 ross aSig >>= shiftCount;
3302 1.1 ross z = aSig;
3303 1.1 ross if ( aSign ) z = - z;
3304 1.1 ross if ( ( z < 0 ) ^ aSign ) {
3305 1.1 ross invalid:
3306 1.1 ross float_raise( float_flag_invalid );
3307 1.1 ross return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3308 1.1 ross }
3309 1.1 ross if ( ( aSig<<shiftCount ) != savedASig ) {
3310 1.1 ross float_set_inexact();
3311 1.1 ross }
3312 1.1 ross return z;
3313 1.1 ross
3314 1.1 ross }
3315 1.1 ross
3316 1.1 ross /*
3317 1.1 ross -------------------------------------------------------------------------------
3318 1.1 ross Returns the result of converting the extended double-precision floating-
3319 1.1 ross point value `a' to the 64-bit two's complement integer format. The
3320 1.1 ross conversion is performed according to the IEC/IEEE Standard for Binary
3321 1.1 ross Floating-Point Arithmetic---which means in particular that the conversion
3322 1.1 ross is rounded according to the current rounding mode. If `a' is a NaN,
3323 1.1 ross the largest positive integer is returned. Otherwise, if the conversion
3324 1.1 ross overflows, the largest integer with the same sign as `a' is returned.
3325 1.1 ross -------------------------------------------------------------------------------
3326 1.1 ross */
3327 1.1 ross int64 floatx80_to_int64( floatx80 a )
3328 1.1 ross {
3329 1.1 ross flag aSign;
3330 1.1 ross int32 aExp, shiftCount;
3331 1.1 ross bits64 aSig, aSigExtra;
3332 1.1 ross
3333 1.1 ross aSig = extractFloatx80Frac( a );
3334 1.1 ross aExp = extractFloatx80Exp( a );
3335 1.1 ross aSign = extractFloatx80Sign( a );
3336 1.1 ross shiftCount = 0x403E - aExp;
3337 1.1 ross if ( shiftCount <= 0 ) {
3338 1.1 ross if ( shiftCount ) {
3339 1.1 ross float_raise( float_flag_invalid );
3340 1.1 ross if ( ! aSign
3341 1.1 ross || ( ( aExp == 0x7FFF )
3342 1.1 ross && ( aSig != LIT64( 0x8000000000000000 ) ) )
3343 1.1 ross ) {
3344 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
3345 1.1 ross }
3346 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
3347 1.1 ross }
3348 1.1 ross aSigExtra = 0;
3349 1.1 ross }
3350 1.1 ross else {
3351 1.1 ross shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3352 1.1 ross }
3353 1.1 ross return roundAndPackInt64( aSign, aSig, aSigExtra );
3354 1.1 ross
3355 1.1 ross }
3356 1.1 ross
3357 1.1 ross /*
3358 1.1 ross -------------------------------------------------------------------------------
3359 1.1 ross Returns the result of converting the extended double-precision floating-
3360 1.1 ross point value `a' to the 64-bit two's complement integer format. The
3361 1.1 ross conversion is performed according to the IEC/IEEE Standard for Binary
3362 1.1 ross Floating-Point Arithmetic, except that the conversion is always rounded
3363 1.1 ross toward zero. If `a' is a NaN, the largest positive integer is returned.
3364 1.1 ross Otherwise, if the conversion overflows, the largest integer with the same
3365 1.1 ross sign as `a' is returned.
3366 1.1 ross -------------------------------------------------------------------------------
3367 1.1 ross */
3368 1.1 ross int64 floatx80_to_int64_round_to_zero( floatx80 a )
3369 1.1 ross {
3370 1.1 ross flag aSign;
3371 1.1 ross int32 aExp, shiftCount;
3372 1.1 ross bits64 aSig;
3373 1.1 ross int64 z;
3374 1.1 ross
3375 1.1 ross aSig = extractFloatx80Frac( a );
3376 1.1 ross aExp = extractFloatx80Exp( a );
3377 1.1 ross aSign = extractFloatx80Sign( a );
3378 1.1 ross shiftCount = aExp - 0x403E;
3379 1.1 ross if ( 0 <= shiftCount ) {
3380 1.1 ross aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3381 1.1 ross if ( ( a.high != 0xC03E ) || aSig ) {
3382 1.1 ross float_raise( float_flag_invalid );
3383 1.1 ross if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3384 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
3385 1.1 ross }
3386 1.1 ross }
3387 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
3388 1.1 ross }
3389 1.1 ross else if ( aExp < 0x3FFF ) {
3390 1.1 ross if ( aExp | aSig ) float_set_inexact();
3391 1.1 ross return 0;
3392 1.1 ross }
3393 1.1 ross z = aSig>>( - shiftCount );
3394 1.1 ross if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3395 1.1 ross float_set_inexact();
3396 1.1 ross }
3397 1.1 ross if ( aSign ) z = - z;
3398 1.1 ross return z;
3399 1.1 ross
3400 1.1 ross }
3401 1.1 ross
3402 1.1 ross /*
3403 1.1 ross -------------------------------------------------------------------------------
3404 1.1 ross Returns the result of converting the extended double-precision floating-
3405 1.1 ross point value `a' to the single-precision floating-point format. The
3406 1.1 ross conversion is performed according to the IEC/IEEE Standard for Binary
3407 1.1 ross Floating-Point Arithmetic.
3408 1.1 ross -------------------------------------------------------------------------------
3409 1.1 ross */
3410 1.1 ross float32 floatx80_to_float32( floatx80 a )
3411 1.1 ross {
3412 1.1 ross flag aSign;
3413 1.1 ross int32 aExp;
3414 1.1 ross bits64 aSig;
3415 1.1 ross
3416 1.1 ross aSig = extractFloatx80Frac( a );
3417 1.1 ross aExp = extractFloatx80Exp( a );
3418 1.1 ross aSign = extractFloatx80Sign( a );
3419 1.1 ross if ( aExp == 0x7FFF ) {
3420 1.1 ross if ( (bits64) ( aSig<<1 ) ) {
3421 1.1 ross return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3422 1.1 ross }
3423 1.1 ross return packFloat32( aSign, 0xFF, 0 );
3424 1.1 ross }
3425 1.1 ross shift64RightJamming( aSig, 33, &aSig );
3426 1.1 ross if ( aExp || aSig ) aExp -= 0x3F81;
3427 1.1 ross return roundAndPackFloat32( aSign, aExp, aSig );
3428 1.1 ross
3429 1.1 ross }
3430 1.1 ross
3431 1.1 ross /*
3432 1.1 ross -------------------------------------------------------------------------------
3433 1.1 ross Returns the result of converting the extended double-precision floating-
3434 1.1 ross point value `a' to the double-precision floating-point format. The
3435 1.1 ross conversion is performed according to the IEC/IEEE Standard for Binary
3436 1.1 ross Floating-Point Arithmetic.
3437 1.1 ross -------------------------------------------------------------------------------
3438 1.1 ross */
3439 1.1 ross float64 floatx80_to_float64( floatx80 a )
3440 1.1 ross {
3441 1.1 ross flag aSign;
3442 1.1 ross int32 aExp;
3443 1.1 ross bits64 aSig, zSig;
3444 1.1 ross
3445 1.1 ross aSig = extractFloatx80Frac( a );
3446 1.1 ross aExp = extractFloatx80Exp( a );
3447 1.1 ross aSign = extractFloatx80Sign( a );
3448 1.1 ross if ( aExp == 0x7FFF ) {
3449 1.1 ross if ( (bits64) ( aSig<<1 ) ) {
3450 1.1 ross return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3451 1.1 ross }
3452 1.1 ross return packFloat64( aSign, 0x7FF, 0 );
3453 1.1 ross }
3454 1.1 ross shift64RightJamming( aSig, 1, &zSig );
3455 1.1 ross if ( aExp || aSig ) aExp -= 0x3C01;
3456 1.1 ross return roundAndPackFloat64( aSign, aExp, zSig );
3457 1.1 ross
3458 1.1 ross }
3459 1.1 ross
3460 1.1 ross #ifdef FLOAT128
3461 1.1 ross
3462 1.1 ross /*
3463 1.1 ross -------------------------------------------------------------------------------
3464 1.1 ross Returns the result of converting the extended double-precision floating-
3465 1.1 ross point value `a' to the quadruple-precision floating-point format. The
3466 1.1 ross conversion is performed according to the IEC/IEEE Standard for Binary
3467 1.1 ross Floating-Point Arithmetic.
3468 1.1 ross -------------------------------------------------------------------------------
3469 1.1 ross */
3470 1.1 ross float128 floatx80_to_float128( floatx80 a )
3471 1.1 ross {
3472 1.1 ross flag aSign;
3473 1.1 ross int16 aExp;
3474 1.1 ross bits64 aSig, zSig0, zSig1;
3475 1.1 ross
3476 1.1 ross aSig = extractFloatx80Frac( a );
3477 1.1 ross aExp = extractFloatx80Exp( a );
3478 1.1 ross aSign = extractFloatx80Sign( a );
3479 1.1 ross if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3480 1.1 ross return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3481 1.1 ross }
3482 1.1 ross shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3483 1.1 ross return packFloat128( aSign, aExp, zSig0, zSig1 );
3484 1.1 ross
3485 1.1 ross }
3486 1.1 ross
3487 1.1 ross #endif
3488 1.1 ross
3489 1.1 ross /*
3490 1.1 ross -------------------------------------------------------------------------------
3491 1.1 ross Rounds the extended double-precision floating-point value `a' to an integer,
3492 1.1 ross and returns the result as an extended double-precision floating-point
3493 1.1 ross value. The operation is performed according to the IEC/IEEE Standard for
3494 1.1 ross Binary Floating-Point Arithmetic.
3495 1.1 ross -------------------------------------------------------------------------------
3496 1.1 ross */
3497 1.1 ross floatx80 floatx80_round_to_int( floatx80 a )
3498 1.1 ross {
3499 1.1 ross flag aSign;
3500 1.1 ross int32 aExp;
3501 1.1 ross bits64 lastBitMask, roundBitsMask;
3502 1.1 ross int8 roundingMode;
3503 1.1 ross floatx80 z;
3504 1.1 ross
3505 1.1 ross aExp = extractFloatx80Exp( a );
3506 1.1 ross if ( 0x403E <= aExp ) {
3507 1.1 ross if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3508 1.1 ross return propagateFloatx80NaN( a, a );
3509 1.1 ross }
3510 1.1 ross return a;
3511 1.1 ross }
3512 1.1 ross if ( aExp < 0x3FFF ) {
3513 1.1 ross if ( ( aExp == 0 )
3514 1.1 ross && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3515 1.1 ross return a;
3516 1.1 ross }
3517 1.1 ross float_set_inexact();
3518 1.1 ross aSign = extractFloatx80Sign( a );
3519 1.1 ross switch ( float_rounding_mode() ) {
3520 1.1 ross case float_round_nearest_even:
3521 1.1 ross if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3522 1.1 ross ) {
3523 1.1 ross return
3524 1.1 ross packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3525 1.1 ross }
3526 1.1 ross break;
3527 1.1 ross case float_round_down:
3528 1.1 ross return
3529 1.1 ross aSign ?
3530 1.1 ross packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3531 1.1 ross : packFloatx80( 0, 0, 0 );
3532 1.1 ross case float_round_up:
3533 1.1 ross return
3534 1.1 ross aSign ? packFloatx80( 1, 0, 0 )
3535 1.1 ross : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3536 1.1 ross }
3537 1.1 ross return packFloatx80( aSign, 0, 0 );
3538 1.1 ross }
3539 1.1 ross lastBitMask = 1;
3540 1.1 ross lastBitMask <<= 0x403E - aExp;
3541 1.1 ross roundBitsMask = lastBitMask - 1;
3542 1.1 ross z = a;
3543 1.1 ross roundingMode = float_rounding_mode();
3544 1.1 ross if ( roundingMode == float_round_nearest_even ) {
3545 1.1 ross z.low += lastBitMask>>1;
3546 1.1 ross if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3547 1.1 ross }
3548 1.1 ross else if ( roundingMode != float_round_to_zero ) {
3549 1.1 ross if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3550 1.1 ross z.low += roundBitsMask;
3551 1.1 ross }
3552 1.1 ross }
3553 1.1 ross z.low &= ~ roundBitsMask;
3554 1.1 ross if ( z.low == 0 ) {
3555 1.1 ross ++z.high;
3556 1.1 ross z.low = LIT64( 0x8000000000000000 );
3557 1.1 ross }
3558 1.1 ross if ( z.low != a.low ) float_set_inexact();
3559 1.1 ross return z;
3560 1.1 ross
3561 1.1 ross }
3562 1.1 ross
3563 1.1 ross /*
3564 1.1 ross -------------------------------------------------------------------------------
3565 1.1 ross Returns the result of adding the absolute values of the extended double-
3566 1.1 ross precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3567 1.1 ross negated before being returned. `zSign' is ignored if the result is a NaN.
3568 1.1 ross The addition is performed according to the IEC/IEEE Standard for Binary
3569 1.1 ross Floating-Point Arithmetic.
3570 1.1 ross -------------------------------------------------------------------------------
3571 1.1 ross */
3572 1.1 ross static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3573 1.1 ross {
3574 1.1 ross int32 aExp, bExp, zExp;
3575 1.1 ross bits64 aSig, bSig, zSig0, zSig1;
3576 1.1 ross int32 expDiff;
3577 1.1 ross
3578 1.1 ross aSig = extractFloatx80Frac( a );
3579 1.1 ross aExp = extractFloatx80Exp( a );
3580 1.1 ross bSig = extractFloatx80Frac( b );
3581 1.1 ross bExp = extractFloatx80Exp( b );
3582 1.1 ross expDiff = aExp - bExp;
3583 1.1 ross if ( 0 < expDiff ) {
3584 1.1 ross if ( aExp == 0x7FFF ) {
3585 1.1 ross if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3586 1.1 ross return a;
3587 1.1 ross }
3588 1.1 ross if ( bExp == 0 ) --expDiff;
3589 1.1 ross shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3590 1.1 ross zExp = aExp;
3591 1.1 ross }
3592 1.1 ross else if ( expDiff < 0 ) {
3593 1.1 ross if ( bExp == 0x7FFF ) {
3594 1.1 ross if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3595 1.1 ross return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3596 1.1 ross }
3597 1.1 ross if ( aExp == 0 ) ++expDiff;
3598 1.1 ross shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3599 1.1 ross zExp = bExp;
3600 1.1 ross }
3601 1.1 ross else {
3602 1.1 ross if ( aExp == 0x7FFF ) {
3603 1.1 ross if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3604 1.1 ross return propagateFloatx80NaN( a, b );
3605 1.1 ross }
3606 1.1 ross return a;
3607 1.1 ross }
3608 1.1 ross zSig1 = 0;
3609 1.1 ross zSig0 = aSig + bSig;
3610 1.1 ross if ( aExp == 0 ) {
3611 1.1 ross normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3612 1.1 ross goto roundAndPack;
3613 1.1 ross }
3614 1.1 ross zExp = aExp;
3615 1.1 ross goto shiftRight1;
3616 1.1 ross }
3617 1.1 ross zSig0 = aSig + bSig;
3618 1.1 ross if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3619 1.1 ross shiftRight1:
3620 1.1 ross shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3621 1.1 ross zSig0 |= LIT64( 0x8000000000000000 );
3622 1.1 ross ++zExp;
3623 1.1 ross roundAndPack:
3624 1.1 ross return
3625 1.1 ross roundAndPackFloatx80(
3626 1.1 ross floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3627 1.1 ross
3628 1.1 ross }
3629 1.1 ross
3630 1.1 ross /*
3631 1.1 ross -------------------------------------------------------------------------------
3632 1.1 ross Returns the result of subtracting the absolute values of the extended
3633 1.1 ross double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3634 1.1 ross difference is negated before being returned. `zSign' is ignored if the
3635 1.1 ross result is a NaN. The subtraction is performed according to the IEC/IEEE
3636 1.1 ross Standard for Binary Floating-Point Arithmetic.
3637 1.1 ross -------------------------------------------------------------------------------
3638 1.1 ross */
3639 1.1 ross static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3640 1.1 ross {
3641 1.1 ross int32 aExp, bExp, zExp;
3642 1.1 ross bits64 aSig, bSig, zSig0, zSig1;
3643 1.1 ross int32 expDiff;
3644 1.1 ross floatx80 z;
3645 1.1 ross
3646 1.1 ross aSig = extractFloatx80Frac( a );
3647 1.1 ross aExp = extractFloatx80Exp( a );
3648 1.1 ross bSig = extractFloatx80Frac( b );
3649 1.1 ross bExp = extractFloatx80Exp( b );
3650 1.1 ross expDiff = aExp - bExp;
3651 1.1 ross if ( 0 < expDiff ) goto aExpBigger;
3652 1.1 ross if ( expDiff < 0 ) goto bExpBigger;
3653 1.1 ross if ( aExp == 0x7FFF ) {
3654 1.1 ross if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3655 1.1 ross return propagateFloatx80NaN( a, b );
3656 1.1 ross }
3657 1.1 ross float_raise( float_flag_invalid );
3658 1.1 ross z.low = floatx80_default_nan_low;
3659 1.1 ross z.high = floatx80_default_nan_high;
3660 1.1 ross return z;
3661 1.1 ross }
3662 1.1 ross if ( aExp == 0 ) {
3663 1.1 ross aExp = 1;
3664 1.1 ross bExp = 1;
3665 1.1 ross }
3666 1.1 ross zSig1 = 0;
3667 1.1 ross if ( bSig < aSig ) goto aBigger;
3668 1.1 ross if ( aSig < bSig ) goto bBigger;
3669 1.1 ross return packFloatx80( float_rounding_mode() == float_round_down, 0, 0 );
3670 1.1 ross bExpBigger:
3671 1.1 ross if ( bExp == 0x7FFF ) {
3672 1.1 ross if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3673 1.1 ross return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3674 1.1 ross }
3675 1.1 ross if ( aExp == 0 ) ++expDiff;
3676 1.1 ross shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3677 1.1 ross bBigger:
3678 1.1 ross sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3679 1.1 ross zExp = bExp;
3680 1.1 ross zSign ^= 1;
3681 1.1 ross goto normalizeRoundAndPack;
3682 1.1 ross aExpBigger:
3683 1.1 ross if ( aExp == 0x7FFF ) {
3684 1.1 ross if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3685 1.1 ross return a;
3686 1.1 ross }
3687 1.1 ross if ( bExp == 0 ) --expDiff;
3688 1.1 ross shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3689 1.1 ross aBigger:
3690 1.1 ross sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3691 1.1 ross zExp = aExp;
3692 1.1 ross normalizeRoundAndPack:
3693 1.1 ross return
3694 1.1 ross normalizeRoundAndPackFloatx80(
3695 1.1 ross floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3696 1.1 ross
3697 1.1 ross }
3698 1.1 ross
3699 1.1 ross /*
3700 1.1 ross -------------------------------------------------------------------------------
3701 1.1 ross Returns the result of adding the extended double-precision floating-point
3702 1.1 ross values `a' and `b'. The operation is performed according to the IEC/IEEE
3703 1.1 ross Standard for Binary Floating-Point Arithmetic.
3704 1.1 ross -------------------------------------------------------------------------------
3705 1.1 ross */
3706 1.1 ross floatx80 floatx80_add( floatx80 a, floatx80 b )
3707 1.1 ross {
3708 1.1 ross flag aSign, bSign;
3709 1.1 ross
3710 1.1 ross aSign = extractFloatx80Sign( a );
3711 1.1 ross bSign = extractFloatx80Sign( b );
3712 1.1 ross if ( aSign == bSign ) {
3713 1.1 ross return addFloatx80Sigs( a, b, aSign );
3714 1.1 ross }
3715 1.1 ross else {
3716 1.1 ross return subFloatx80Sigs( a, b, aSign );
3717 1.1 ross }
3718 1.1 ross
3719 1.1 ross }
3720 1.1 ross
3721 1.1 ross /*
3722 1.1 ross -------------------------------------------------------------------------------
3723 1.1 ross Returns the result of subtracting the extended double-precision floating-
3724 1.1 ross point values `a' and `b'. The operation is performed according to the
3725 1.1 ross IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3726 1.1 ross -------------------------------------------------------------------------------
3727 1.1 ross */
3728 1.1 ross floatx80 floatx80_sub( floatx80 a, floatx80 b )
3729 1.1 ross {
3730 1.1 ross flag aSign, bSign;
3731 1.1 ross
3732 1.1 ross aSign = extractFloatx80Sign( a );
3733 1.1 ross bSign = extractFloatx80Sign( b );
3734 1.1 ross if ( aSign == bSign ) {
3735 1.1 ross return subFloatx80Sigs( a, b, aSign );
3736 1.1 ross }
3737 1.1 ross else {
3738 1.1 ross return addFloatx80Sigs( a, b, aSign );
3739 1.1 ross }
3740 1.1 ross
3741 1.1 ross }
3742 1.1 ross
3743 1.1 ross /*
3744 1.1 ross -------------------------------------------------------------------------------
3745 1.1 ross Returns the result of multiplying the extended double-precision floating-
3746 1.1 ross point values `a' and `b'. The operation is performed according to the
3747 1.1 ross IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3748 1.1 ross -------------------------------------------------------------------------------
3749 1.1 ross */
3750 1.1 ross floatx80 floatx80_mul( floatx80 a, floatx80 b )
3751 1.1 ross {
3752 1.1 ross flag aSign, bSign, zSign;
3753 1.1 ross int32 aExp, bExp, zExp;
3754 1.1 ross bits64 aSig, bSig, zSig0, zSig1;
3755 1.1 ross floatx80 z;
3756 1.1 ross
3757 1.1 ross aSig = extractFloatx80Frac( a );
3758 1.1 ross aExp = extractFloatx80Exp( a );
3759 1.1 ross aSign = extractFloatx80Sign( a );
3760 1.1 ross bSig = extractFloatx80Frac( b );
3761 1.1 ross bExp = extractFloatx80Exp( b );
3762 1.1 ross bSign = extractFloatx80Sign( b );
3763 1.1 ross zSign = aSign ^ bSign;
3764 1.1 ross if ( aExp == 0x7FFF ) {
3765 1.1 ross if ( (bits64) ( aSig<<1 )
3766 1.1 ross || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3767 1.1 ross return propagateFloatx80NaN( a, b );
3768 1.1 ross }
3769 1.1 ross if ( ( bExp | bSig ) == 0 ) goto invalid;
3770 1.1 ross return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3771 1.1 ross }
3772 1.1 ross if ( bExp == 0x7FFF ) {
3773 1.1 ross if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3774 1.1 ross if ( ( aExp | aSig ) == 0 ) {
3775 1.1 ross invalid:
3776 1.1 ross float_raise( float_flag_invalid );
3777 1.1 ross z.low = floatx80_default_nan_low;
3778 1.1 ross z.high = floatx80_default_nan_high;
3779 1.1 ross return z;
3780 1.1 ross }
3781 1.1 ross return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3782 1.1 ross }
3783 1.1 ross if ( aExp == 0 ) {
3784 1.1 ross if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3785 1.1 ross normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3786 1.1 ross }
3787 1.1 ross if ( bExp == 0 ) {
3788 1.1 ross if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3789 1.1 ross normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3790 1.1 ross }
3791 1.1 ross zExp = aExp + bExp - 0x3FFE;
3792 1.1 ross mul64To128( aSig, bSig, &zSig0, &zSig1 );
3793 1.1 ross if ( 0 < (sbits64) zSig0 ) {
3794 1.1 ross shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3795 1.1 ross --zExp;
3796 1.1 ross }
3797 1.1 ross return
3798 1.1 ross roundAndPackFloatx80(
3799 1.1 ross floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3800 1.1 ross
3801 1.1 ross }
3802 1.1 ross
3803 1.1 ross /*
3804 1.1 ross -------------------------------------------------------------------------------
3805 1.1 ross Returns the result of dividing the extended double-precision floating-point
3806 1.1 ross value `a' by the corresponding value `b'. The operation is performed
3807 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3808 1.1 ross -------------------------------------------------------------------------------
3809 1.1 ross */
3810 1.1 ross floatx80 floatx80_div( floatx80 a, floatx80 b )
3811 1.1 ross {
3812 1.1 ross flag aSign, bSign, zSign;
3813 1.1 ross int32 aExp, bExp, zExp;
3814 1.1 ross bits64 aSig, bSig, zSig0, zSig1;
3815 1.1 ross bits64 rem0, rem1, rem2, term0, term1, term2;
3816 1.1 ross floatx80 z;
3817 1.1 ross
/* Split both operands into significand, exponent and sign.  The sign of
   the quotient is the XOR of the operand signs. */
3818 1.1 ross aSig = extractFloatx80Frac( a );
3819 1.1 ross aExp = extractFloatx80Exp( a );
3820 1.1 ross aSign = extractFloatx80Sign( a );
3821 1.1 ross bSig = extractFloatx80Frac( b );
3822 1.1 ross bExp = extractFloatx80Exp( b );
3823 1.1 ross bSign = extractFloatx80Sign( b );
3824 1.1 ross zSign = aSign ^ bSign;
/* `a' has the all-ones exponent.  Any bit set below the top significand
   bit sends the operands to the NaN propagation helper.  Otherwise, if
   `b' also has the all-ones exponent, it is either propagated as a NaN or
   the operation is invalid.  In the remaining case the result is the
   0x7FFF / 0x8000000000000000 encoding with the quotient sign. */
3825 1.1 ross if ( aExp == 0x7FFF ) {
3826 1.1 ross if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3827 1.1 ross if ( bExp == 0x7FFF ) {
3828 1.1 ross if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3829 1.1 ross goto invalid;
3830 1.1 ross }
3831 1.1 ross return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3832 1.1 ross }
/* Only `b' has the all-ones exponent: propagate it if it is a NaN,
   otherwise the quotient is a zero carrying the quotient sign. */
3833 1.1 ross if ( bExp == 0x7FFF ) {
3834 1.1 ross if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3835 1.1 ross return packFloatx80( zSign, 0, 0 );
3836 1.1 ross }
/* `b' has a zero exponent field.  A zero significand means the divisor
   is zero: if `a' is all-zero as well the operation is invalid and the
   default NaN is returned; otherwise the divide-by-zero flag is raised
   and the 0x7FFF / 0x8000000000000000 encoding is returned.  A nonzero
   significand is normalized before dividing. */
3837 1.1 ross if ( bExp == 0 ) {
3838 1.1 ross if ( bSig == 0 ) {
3839 1.1 ross if ( ( aExp | aSig ) == 0 ) {
/* Shared exit for invalid operations; also reached by the goto above. */
3840 1.1 ross invalid:
3841 1.1 ross float_raise( float_flag_invalid );
3842 1.1 ross z.low = floatx80_default_nan_low;
3843 1.1 ross z.high = floatx80_default_nan_high;
3844 1.1 ross return z;
3845 1.1 ross }
3846 1.1 ross float_raise( float_flag_divbyzero );
3847 1.1 ross return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3848 1.1 ross }
3849 1.1 ross normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3850 1.1 ross }
/* `a' has a zero exponent field: a zero significand yields a signed
   zero quotient, anything else is normalized. */
3851 1.1 ross if ( aExp == 0 ) {
3852 1.1 ross if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3853 1.1 ross normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3854 1.1 ross }
/* Starting exponent of the quotient: exponent difference plus 0x3FFE. */
3855 1.1 ross zExp = aExp - bExp + 0x3FFE;
3856 1.1 ross rem1 = 0;
/* When aSig is not smaller than bSig, the 128-bit dividend aSig:rem1 is
   shifted right by one bit and the exponent is incremented to
   compensate.  Presumably this keeps the dividend's high word below the
   divisor, as the quotient estimator requires -- TODO confirm against
   estimateDiv128To64. */
3857 1.1 ross if ( bSig <= aSig ) {
3858 1.1 ross shift128Right( aSig, 0, 1, &aSig, &rem1 );
3859 1.1 ross ++zExp;
3860 1.1 ross }
/* High 64 quotient bits: take an estimate, compute
   remainder = dividend - bSig * zSig0, and step the estimate down
   (adding bSig back) while the remainder is negative. */
3861 1.1 ross zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3862 1.1 ross mul64To128( bSig, zSig0, &term0, &term1 );
3863 1.1 ross sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3864 1.1 ross while ( (sbits64) rem0 < 0 ) {
3865 1.1 ross --zSig0;
3866 1.1 ross add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3867 1.1 ross }
/* Low 64 quotient bits, estimated from the remaining remainder.  The
   exact correction is only carried out when the estimate, ignoring its
   top bit, is very small (zSig1<<1 <= 8).  In that case the remainder is
   recomputed, the estimate stepped down while it is negative, and bit 0
   of zSig1 is set when any remainder is left over. */
3868 1.1 ross zSig1 = estimateDiv128To64( rem1, 0, bSig );
3869 1.1 ross if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3870 1.1 ross mul64To128( bSig, zSig1, &term1, &term2 );
3871 1.1 ross sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3872 1.1 ross while ( (sbits64) rem1 < 0 ) {
3873 1.1 ross --zSig1;
3874 1.1 ross add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3875 1.1 ross }
3876 1.1 ross zSig1 |= ( ( rem1 | rem2 ) != 0 );
3877 1.1 ross }
/* Round the 128-bit quotient zSig0:zSig1 at the configured rounding
   precision and pack the result. */
3878 1.1 ross return
3879 1.1 ross roundAndPackFloatx80(
3880 1.1 ross floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3881 1.1 ross
3882 1.1 ross }
3883 1.1 ross
3884 1.1 ross /*
3885 1.1 ross -------------------------------------------------------------------------------
3886 1.1 ross Returns the remainder of the extended double-precision floating-point value
3887 1.1 ross `a' with respect to the corresponding value `b'. The operation is performed
3888 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3889 1.1 ross -------------------------------------------------------------------------------
3890 1.1 ross */
3891 1.1 ross floatx80 floatx80_rem( floatx80 a, floatx80 b )
3892 1.1 ross {
3893 1.1 ross flag aSign, bSign, zSign;
3894 1.1 ross int32 aExp, bExp, expDiff;
3895 1.1 ross bits64 aSig0, aSig1, bSig;
3896 1.1 ross bits64 q, term0, term1, alternateASig0, alternateASig1;
3897 1.1 ross floatx80 z;
3898 1.1 ross
/* Split both operands.  bSign is extracted but not used further in this
   function. */
3899 1.1 ross aSig0 = extractFloatx80Frac( a );
3900 1.1 ross aExp = extractFloatx80Exp( a );
3901 1.1 ross aSign = extractFloatx80Sign( a );
3902 1.1 ross bSig = extractFloatx80Frac( b );
3903 1.1 ross bExp = extractFloatx80Exp( b );
3904 1.1 ross bSign = extractFloatx80Sign( b );
/* `a' has the all-ones exponent: if either operand is a NaN it is
   propagated, otherwise the operation is invalid. */
3905 1.1 ross if ( aExp == 0x7FFF ) {
3906 1.1 ross if ( (bits64) ( aSig0<<1 )
3907 1.1 ross || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3908 1.1 ross return propagateFloatx80NaN( a, b );
3909 1.1 ross }
3910 1.1 ross goto invalid;
3911 1.1 ross }
/* Only `b' has the all-ones exponent: propagate a NaN, otherwise `a' is
   returned unchanged. */
3912 1.1 ross if ( bExp == 0x7FFF ) {
3913 1.1 ross if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3914 1.1 ross return a;
3915 1.1 ross }
/* `b' has a zero exponent field: a zero significand makes the operation
   invalid (default NaN returned), anything else is normalized. */
3916 1.1 ross if ( bExp == 0 ) {
3917 1.1 ross if ( bSig == 0 ) {
/* Shared exit for invalid operations; also reached by the goto above. */
3918 1.1 ross invalid:
3919 1.1 ross float_raise( float_flag_invalid );
3920 1.1 ross z.low = floatx80_default_nan_low;
3921 1.1 ross z.high = floatx80_default_nan_high;
3922 1.1 ross return z;
3923 1.1 ross }
3924 1.1 ross normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3925 1.1 ross }
/* `a' has a zero exponent field: if no bit below the top significand bit
   is set, `a' is returned as is; otherwise it is normalized. */
3926 1.1 ross if ( aExp == 0 ) {
3927 1.1 ross if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3928 1.1 ross normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3929 1.1 ross }
/* Force the top significand bit of the divisor on.  The result starts
   out with the sign of `a'. */
3930 1.1 ross bSig |= LIT64( 0x8000000000000000 );
3931 1.1 ross zSign = aSign;
3932 1.1 ross expDiff = aExp - bExp;
3933 1.1 ross aSig1 = 0;
/* `a' has the smaller exponent.  With a gap of two or more `a' is
   returned unchanged (presumably because it is then already below half
   of `b' in magnitude).  With a gap of exactly one, the 128-bit value
   aSig0:aSig1 is shifted right one bit and the gap treated as zero. */
3934 1.1 ross if ( expDiff < 0 ) {
3935 1.1 ross if ( expDiff < -1 ) return a;
3936 1.1 ross shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3937 1.1 ross expDiff = 0;
3938 1.1 ross }
/* First quotient bit: 1 if the divisor fits into the high word, in which
   case it is subtracted. */
3939 1.1 ross q = ( bSig <= aSig0 );
3940 1.1 ross if ( q ) aSig0 -= bSig;
/* Main reduction loop.  Each pass estimates a quotient chunk, lowers the
   estimate by 2 (clamped at 0), subtracts q * bSig from the 128-bit
   remainder, then shifts the remainder left by 62 bits and consumes 62
   from the outstanding exponent difference. */
3941 1.1 ross expDiff -= 64;
3942 1.1 ross while ( 0 < expDiff ) {
3943 1.1 ross q = estimateDiv128To64( aSig0, aSig1, bSig );
3944 1.1 ross q = ( 2 < q ) ? q - 2 : 0;
3945 1.1 ross mul64To128( bSig, q, &term0, &term1 );
3946 1.1 ross sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3947 1.1 ross shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3948 1.1 ross expDiff -= 62;
3949 1.1 ross }
/* Undo the bias applied before the loop.  If quotient bits remain, handle
   the final partial chunk: only the top expDiff bits of the lowered
   estimate are kept, the matching multiple of bSig is subtracted, and
   term0:term1 is set to bSig shifted left by ( 64 - expDiff ) as a
   128-bit value.  That term is then subtracted repeatedly, counting up q,
   for as long as it still fits into the remainder. */
3950 1.1 ross expDiff += 64;
3951 1.1 ross if ( 0 < expDiff ) {
3952 1.1 ross q = estimateDiv128To64( aSig0, aSig1, bSig );
3953 1.1 ross q = ( 2 < q ) ? q - 2 : 0;
3954 1.1 ross q >>= 64 - expDiff;
3955 1.1 ross mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3956 1.1 ross sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3957 1.1 ross shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3958 1.1 ross while ( le128( term0, term1, aSig0, aSig1 ) ) {
3959 1.1 ross ++q;
3960 1.1 ross sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3961 1.1 ross }
3962 1.1 ross }
/* No bits remain: the comparison term is simply bSig in the high word. */
3963 1.1 ross else {
3964 1.1 ross term1 = 0;
3965 1.1 ross term0 = bSig;
3966 1.1 ross }
/* Choose between the remainder and ( term - remainder ).  The alternate
   is taken when it is strictly smaller, or when both are equal and the
   last quotient value is odd; taking it flips the result sign. */
3967 1.1 ross sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3968 1.1 ross if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3969 1.1 ross || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3970 1.1 ross && ( q & 1 ) )
3971 1.1 ross ) {
3972 1.1 ross aSig0 = alternateASig0;
3973 1.1 ross aSig1 = alternateASig1;
3974 1.1 ross zSign = ! zSign;
3975 1.1 ross }
/* Normalize, round and pack with a fixed rounding-precision argument of
   80 (not floatx80_rounding_precision) and exponent bExp + expDiff. */
3976 1.1 ross return
3977 1.1 ross normalizeRoundAndPackFloatx80(
3978 1.1 ross 80, zSign, bExp + expDiff, aSig0, aSig1 );
3979 1.1 ross
3980 1.1 ross }
3981 1.1 ross
3982 1.1 ross /*
3983 1.1 ross -------------------------------------------------------------------------------
3984 1.1 ross Returns the square root of the extended double-precision floating-point
3985 1.1 ross value `a'. The operation is performed according to the IEC/IEEE Standard
3986 1.1 ross for Binary Floating-Point Arithmetic.
3987 1.1 ross -------------------------------------------------------------------------------
3988 1.1 ross */
3989 1.1 ross floatx80 floatx80_sqrt( floatx80 a )
3990 1.1 ross {
3991 1.1 ross flag aSign;
3992 1.1 ross int32 aExp, zExp;
3993 1.1 ross bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3994 1.1 ross bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3995 1.1 ross floatx80 z;
3996 1.1 ross
3997 1.1 ross aSig0 = extractFloatx80Frac( a );
3998 1.1 ross aExp = extractFloatx80Exp( a );
3999 1.1 ross aSign = extractFloatx80Sign( a );
/* All-ones exponent: a NaN is propagated (the operand is passed for both
   helper arguments); a positive non-NaN is returned unchanged; a negative
   one is an invalid operation. */
4000 1.1 ross if ( aExp == 0x7FFF ) {
4001 1.1 ross if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
4002 1.1 ross if ( ! aSign ) return a;
4003 1.1 ross goto invalid;
4004 1.1 ross }
/* Negative operand: an all-zero exponent and significand (negative zero)
   is returned unchanged, any other negative value is invalid and yields
   the default NaN. */
4005 1.1 ross if ( aSign ) {
4006 1.1 ross if ( ( aExp | aSig0 ) == 0 ) return a;
4007 1.1 ross invalid:
4008 1.1 ross float_raise( float_flag_invalid );
4009 1.1 ross z.low = floatx80_default_nan_low;
4010 1.1 ross z.high = floatx80_default_nan_high;
4011 1.1 ross return z;
4012 1.1 ross }
/* Zero exponent field: a zero significand gives positive zero, anything
   else is normalized. */
4013 1.1 ross if ( aExp == 0 ) {
4014 1.1 ross if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4015 1.1 ross normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4016 1.1 ross }
/* Result exponent: remove the 0x3FFF offset, halve, add the offset back. */
4017 1.1 ross zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
/* Seed the root from the top 32 significand bits.  The significand is
   then shifted right by 2 or 3 bits depending on the low exponent bit,
   and the seed refined with a 128-by-64 division estimate. */
4018 1.1 ross zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4019 1.1 ross shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4020 1.1 ross zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4021 1.1 ross doubleZSig0 = zSig0<<1;
/* remainder = operand - zSig0^2.  While it is negative, step zSig0 down
   by one and add the matching 2*zSig0 + 1 back, keeping doubleZSig0 in
   sync. */
4022 1.1 ross mul64To128( zSig0, zSig0, &term0, &term1 );
4023 1.1 ross sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4024 1.1 ross while ( (sbits64) rem0 < 0 ) {
4025 1.1 ross --zSig0;
4026 1.1 ross doubleZSig0 -= 2;
4027 1.1 ross add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4028 1.1 ross }
/* Low root bits, estimated by dividing the remainder by 2*zSig0.  The
   exact 192-bit correction is only carried out when the low 62 bits of
   the estimate are at most 5; an estimate of 0 is first bumped to 1.
   Bit 0 of zSig1 is set afterwards if any remainder is left. */
4029 1.1 ross zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4030 1.1 ross if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4031 1.1 ross if ( zSig1 == 0 ) zSig1 = 1;
4032 1.1 ross mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4033 1.1 ross sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4034 1.1 ross mul64To128( zSig1, zSig1, &term2, &term3 );
4035 1.1 ross sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4036 1.1 ross while ( (sbits64) rem1 < 0 ) {
4037 1.1 ross --zSig1;
4038 1.1 ross shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4039 1.1 ross term3 |= 1;
4040 1.1 ross term2 |= doubleZSig0;
4041 1.1 ross add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4042 1.1 ross }
4043 1.1 ross zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4044 1.1 ross }
/* Assemble the 128-bit root: zSig1 is shifted left one bit across the
   word pair and doubleZSig0 is ORed into the high word.  The result is
   rounded and packed with a sign argument of 0. */
4045 1.1 ross shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4046 1.1 ross zSig0 |= doubleZSig0;
4047 1.1 ross return
4048 1.1 ross roundAndPackFloatx80(
4049 1.1 ross floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4050 1.1 ross
4051 1.1 ross }
4052 1.1 ross
4053 1.1 ross /*
4054 1.1 ross -------------------------------------------------------------------------------
4055 1.1 ross Returns 1 if the extended double-precision floating-point value `a' is
4056 1.1 ross equal to the corresponding value `b', and 0 otherwise. The comparison is
4057 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
4058 1.1 ross Arithmetic.
4059 1.1 ross -------------------------------------------------------------------------------
4060 1.1 ross */
4061 1.1 ross flag floatx80_eq( floatx80 a, floatx80 b )
4062 1.1 ross {
4063 1.1 ross
4064 1.1 ross if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4065 1.1 ross && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4066 1.1 ross || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4067 1.1 ross && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4068 1.1 ross ) {
4069 1.1 ross if ( floatx80_is_signaling_nan( a )
4070 1.1 ross || floatx80_is_signaling_nan( b ) ) {
4071 1.1 ross float_raise( float_flag_invalid );
4072 1.1 ross }
4073 1.1 ross return 0;
4074 1.1 ross }
4075 1.1 ross return
4076 1.1 ross ( a.low == b.low )
4077 1.1 ross && ( ( a.high == b.high )
4078 1.1 ross || ( ( a.low == 0 )
4079 1.1 ross && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4080 1.1 ross );
4081 1.1 ross
4082 1.1 ross }
4083 1.1 ross
4084 1.1 ross /*
4085 1.1 ross -------------------------------------------------------------------------------
4086 1.1 ross Returns 1 if the extended double-precision floating-point value `a' is
4087 1.1 ross less than or equal to the corresponding value `b', and 0 otherwise. The
4088 1.1 ross comparison is performed according to the IEC/IEEE Standard for Binary
4089 1.1 ross Floating-Point Arithmetic.
4090 1.1 ross -------------------------------------------------------------------------------
4091 1.1 ross */
4092 1.1 ross flag floatx80_le( floatx80 a, floatx80 b )
4093 1.1 ross {
4094 1.1 ross flag aSign, bSign;
4095 1.1 ross
4096 1.1 ross if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4097 1.1 ross && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4098 1.1 ross || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4099 1.1 ross && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4100 1.1 ross ) {
4101 1.1 ross float_raise( float_flag_invalid );
4102 1.1 ross return 0;
4103 1.1 ross }
4104 1.1 ross aSign = extractFloatx80Sign( a );
4105 1.1 ross bSign = extractFloatx80Sign( b );
4106 1.1 ross if ( aSign != bSign ) {
4107 1.1 ross return
4108 1.1 ross aSign
4109 1.1 ross || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4110 1.1 ross == 0 );
4111 1.1 ross }
4112 1.1 ross return
4113 1.1 ross aSign ? le128( b.high, b.low, a.high, a.low )
4114 1.1 ross : le128( a.high, a.low, b.high, b.low );
4115 1.1 ross
4116 1.1 ross }
4117 1.1 ross
4118 1.1 ross /*
4119 1.1 ross -------------------------------------------------------------------------------
4120 1.1 ross Returns 1 if the extended double-precision floating-point value `a' is
4121 1.1 ross less than the corresponding value `b', and 0 otherwise. The comparison
4122 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
4123 1.1 ross Arithmetic.
4124 1.1 ross -------------------------------------------------------------------------------
4125 1.1 ross */
4126 1.1 ross flag floatx80_lt( floatx80 a, floatx80 b )
4127 1.1 ross {
4128 1.1 ross flag aSign, bSign;
4129 1.1 ross
4130 1.1 ross if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4131 1.1 ross && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4132 1.1 ross || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4133 1.1 ross && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4134 1.1 ross ) {
4135 1.1 ross float_raise( float_flag_invalid );
4136 1.1 ross return 0;
4137 1.1 ross }
4138 1.1 ross aSign = extractFloatx80Sign( a );
4139 1.1 ross bSign = extractFloatx80Sign( b );
4140 1.1 ross if ( aSign != bSign ) {
4141 1.1 ross return
4142 1.1 ross aSign
4143 1.1 ross && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4144 1.1 ross != 0 );
4145 1.1 ross }
4146 1.1 ross return
4147 1.1 ross aSign ? lt128( b.high, b.low, a.high, a.low )
4148 1.1 ross : lt128( a.high, a.low, b.high, b.low );
4149 1.1 ross
4150 1.1 ross }
4151 1.1 ross
4152 1.1 ross /*
4153 1.1 ross -------------------------------------------------------------------------------
4154 1.1 ross Returns 1 if the extended double-precision floating-point value `a' is equal
4155 1.1 ross to the corresponding value `b', and 0 otherwise. The invalid exception is
4156 1.1 ross raised if either operand is a NaN. Otherwise, the comparison is performed
4157 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4158 1.1 ross -------------------------------------------------------------------------------
4159 1.1 ross */
4160 1.1 ross flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4161 1.1 ross {
4162 1.1 ross
4163 1.1 ross if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4164 1.1 ross && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4165 1.1 ross || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4166 1.1 ross && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4167 1.1 ross ) {
4168 1.1 ross float_raise( float_flag_invalid );
4169 1.1 ross return 0;
4170 1.1 ross }
4171 1.1 ross return
4172 1.1 ross ( a.low == b.low )
4173 1.1 ross && ( ( a.high == b.high )
4174 1.1 ross || ( ( a.low == 0 )
4175 1.1 ross && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4176 1.1 ross );
4177 1.1 ross
4178 1.1 ross }
4179 1.1 ross
4180 1.1 ross /*
4181 1.1 ross -------------------------------------------------------------------------------
4182 1.1 ross Returns 1 if the extended double-precision floating-point value `a' is less
4183 1.1 ross than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4184 1.1 ross do not cause an exception. Otherwise, the comparison is performed according
4185 1.1 ross to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4186 1.1 ross -------------------------------------------------------------------------------
4187 1.1 ross */
4188 1.1 ross flag floatx80_le_quiet( floatx80 a, floatx80 b )
4189 1.1 ross {
4190 1.1 ross flag aSign, bSign;
4191 1.1 ross
4192 1.1 ross if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4193 1.1 ross && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4194 1.1 ross || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4195 1.1 ross && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4196 1.1 ross ) {
4197 1.1 ross if ( floatx80_is_signaling_nan( a )
4198 1.1 ross || floatx80_is_signaling_nan( b ) ) {
4199 1.1 ross float_raise( float_flag_invalid );
4200 1.1 ross }
4201 1.1 ross return 0;
4202 1.1 ross }
4203 1.1 ross aSign = extractFloatx80Sign( a );
4204 1.1 ross bSign = extractFloatx80Sign( b );
4205 1.1 ross if ( aSign != bSign ) {
4206 1.1 ross return
4207 1.1 ross aSign
4208 1.1 ross || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4209 1.1 ross == 0 );
4210 1.1 ross }
4211 1.1 ross return
4212 1.1 ross aSign ? le128( b.high, b.low, a.high, a.low )
4213 1.1 ross : le128( a.high, a.low, b.high, b.low );
4214 1.1 ross
4215 1.1 ross }
4216 1.1 ross
4217 1.1 ross /*
4218 1.1 ross -------------------------------------------------------------------------------
4219 1.1 ross Returns 1 if the extended double-precision floating-point value `a' is less
4220 1.1 ross than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4221 1.1 ross an exception. Otherwise, the comparison is performed according to the
4222 1.1 ross IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4223 1.1 ross -------------------------------------------------------------------------------
4224 1.1 ross */
4225 1.1 ross flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4226 1.1 ross {
4227 1.1 ross flag aSign, bSign;
4228 1.1 ross
4229 1.1 ross if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4230 1.1 ross && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4231 1.1 ross || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4232 1.1 ross && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4233 1.1 ross ) {
4234 1.1 ross if ( floatx80_is_signaling_nan( a )
4235 1.1 ross || floatx80_is_signaling_nan( b ) ) {
4236 1.1 ross float_raise( float_flag_invalid );
4237 1.1 ross }
4238 1.1 ross return 0;
4239 1.1 ross }
4240 1.1 ross aSign = extractFloatx80Sign( a );
4241 1.1 ross bSign = extractFloatx80Sign( b );
4242 1.1 ross if ( aSign != bSign ) {
4243 1.1 ross return
4244 1.1 ross aSign
4245 1.1 ross && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4246 1.1 ross != 0 );
4247 1.1 ross }
4248 1.1 ross return
4249 1.1 ross aSign ? lt128( b.high, b.low, a.high, a.low )
4250 1.1 ross : lt128( a.high, a.low, b.high, b.low );
4251 1.1 ross
4252 1.1 ross }
4253 1.1 ross
4254 1.1 ross #endif
4255 1.1 ross
4256 1.1 ross #ifdef FLOAT128
4257 1.1 ross
4258 1.1 ross /*
4259 1.1 ross -------------------------------------------------------------------------------
4260 1.1 ross Returns the result of converting the quadruple-precision floating-point
4261 1.1 ross value `a' to the 32-bit two's complement integer format. The conversion
4262 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
4263 1.1 ross Arithmetic---which means in particular that the conversion is rounded
4264 1.1 ross according to the current rounding mode. If `a' is a NaN, the largest
4265 1.1 ross positive integer is returned. Otherwise, if the conversion overflows, the
4266 1.1 ross largest integer with the same sign as `a' is returned.
4267 1.1 ross -------------------------------------------------------------------------------
4268 1.1 ross */
4269 1.1 ross int32 float128_to_int32( float128 a )
4270 1.1 ross {
4271 1.1 ross flag aSign;
4272 1.1 ross int32 aExp, shiftCount;
4273 1.1 ross bits64 aSig0, aSig1;
4274 1.1 ross
4275 1.1 ross aSig1 = extractFloat128Frac1( a );
4276 1.1 ross aSig0 = extractFloat128Frac0( a );
4277 1.1 ross aExp = extractFloat128Exp( a );
4278 1.1 ross aSign = extractFloat128Sign( a );
4279 1.1 ross if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4280 1.1 ross if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4281 1.1 ross aSig0 |= ( aSig1 != 0 );
4282 1.1 ross shiftCount = 0x4028 - aExp;
4283 1.1 ross if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4284 1.1 ross return roundAndPackInt32( aSign, aSig0 );
4285 1.1 ross
4286 1.1 ross }
4287 1.1 ross
4288 1.1 ross /*
4289 1.1 ross -------------------------------------------------------------------------------
4290 1.1 ross Returns the result of converting the quadruple-precision floating-point
4291 1.1 ross value `a' to the 32-bit two's complement integer format. The conversion
4292 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
4293 1.1 ross Arithmetic, except that the conversion is always rounded toward zero. If
4294 1.1 ross `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4295 1.1 ross conversion overflows, the largest integer with the same sign as `a' is
4296 1.1 ross returned.
4297 1.1 ross -------------------------------------------------------------------------------
4298 1.1 ross */
4299 1.1 ross int32 float128_to_int32_round_to_zero( float128 a )
4300 1.1 ross {
4301 1.1 ross flag aSign;
4302 1.1 ross int32 aExp, shiftCount;
4303 1.1 ross bits64 aSig0, aSig1, savedASig;
4304 1.1 ross int32 z;
4305 1.1 ross
4306 1.1 ross aSig1 = extractFloat128Frac1( a );
4307 1.1 ross aSig0 = extractFloat128Frac0( a );
4308 1.1 ross aExp = extractFloat128Exp( a );
4309 1.1 ross aSign = extractFloat128Sign( a );
/* Collapse the low fraction word into bit 0 of the high word, so a
   nonzero low word still shows up in the NaN and inexact checks below. */
4310 1.1 ross aSig0 |= ( aSig1 != 0 );
/* Exponent above 0x401E: always reported as invalid.  A NaN is first
   given a positive sign so that the invalid path returns 0x7FFFFFFF. */
4311 1.1 ross if ( 0x401E < aExp ) {
4312 1.1 ross if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4313 1.1 ross goto invalid;
4314 1.1 ross }
/* Exponent below 0x3FFF: the result is 0, flagged inexact unless the
   operand is exactly zero. */
4315 1.1 ross else if ( aExp < 0x3FFF ) {
4316 1.1 ross if ( aExp || aSig0 ) float_set_inexact();
4317 1.1 ross return 0;
4318 1.1 ross }
/* Set the bit just above the 48 high fraction bits, then shift right so
   that only the integer part remains.  The unshifted value is kept for
   the inexact check at the end. */
4319 1.1 ross aSig0 |= LIT64( 0x0001000000000000 );
4320 1.1 ross shiftCount = 0x402F - aExp;
4321 1.1 ross savedASig = aSig0;
4322 1.1 ross aSig0 >>= shiftCount;
/* NOTE(review): for aExp == 0x401E the shifted value has bit 31 set, so
   if int32 is exactly 32 bits wide the assignment narrows an
   out-of-range value and the negation may be applied to the most
   negative int32.  The sign-mismatch test below relies on how the
   platform handles both -- confirm for each target. */
4323 1.1 ross z = aSig0;
4324 1.1 ross if ( aSign ) z = - z;
/* A result whose sign disagrees with the operand's sign means the value
   did not fit. */
4325 1.1 ross if ( ( z < 0 ) ^ aSign ) {
/* Shared invalid exit: raise the flag and return the extreme integer
   matching aSign. */
4326 1.1 ross invalid:
4327 1.1 ross float_raise( float_flag_invalid );
4328 1.1 ross return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4329 1.1 ross }
/* Any bits lost by the right shift make the conversion inexact. */
4330 1.1 ross if ( ( aSig0<<shiftCount ) != savedASig ) {
4331 1.1 ross float_set_inexact();
4332 1.1 ross }
4333 1.1 ross return z;
4334 1.1 ross
4335 1.1 ross }
4336 1.1 ross
4337 1.1 ross /*
4338 1.1 ross -------------------------------------------------------------------------------
4339 1.1 ross Returns the result of converting the quadruple-precision floating-point
4340 1.1 ross value `a' to the 64-bit two's complement integer format. The conversion
4341 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
4342 1.1 ross Arithmetic---which means in particular that the conversion is rounded
4343 1.1 ross according to the current rounding mode. If `a' is a NaN, the largest
4344 1.1 ross positive integer is returned. Otherwise, if the conversion overflows, the
4345 1.1 ross largest integer with the same sign as `a' is returned.
4346 1.1 ross -------------------------------------------------------------------------------
4347 1.1 ross */
4348 1.1 ross int64 float128_to_int64( float128 a )
4349 1.1 ross {
4350 1.1 ross flag aSign;
4351 1.1 ross int32 aExp, shiftCount;
4352 1.1 ross bits64 aSig0, aSig1;
4353 1.1 ross
4354 1.1 ross aSig1 = extractFloat128Frac1( a );
4355 1.1 ross aSig0 = extractFloat128Frac0( a );
4356 1.1 ross aExp = extractFloat128Exp( a );
4357 1.1 ross aSign = extractFloat128Sign( a );
4358 1.1 ross if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4359 1.1 ross shiftCount = 0x402F - aExp;
4360 1.1 ross if ( shiftCount <= 0 ) {
4361 1.1 ross if ( 0x403E < aExp ) {
4362 1.1 ross float_raise( float_flag_invalid );
4363 1.1 ross if ( ! aSign
4364 1.1 ross || ( ( aExp == 0x7FFF )
4365 1.1 ross && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4366 1.1 ross )
4367 1.1 ross ) {
4368 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
4369 1.1 ross }
4370 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
4371 1.1 ross }
4372 1.1 ross shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4373 1.1 ross }
4374 1.1 ross else {
4375 1.1 ross shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4376 1.1 ross }
4377 1.1 ross return roundAndPackInt64( aSign, aSig0, aSig1 );
4378 1.1 ross
4379 1.1 ross }
4380 1.1 ross
4381 1.1 ross /*
4382 1.1 ross -------------------------------------------------------------------------------
4383 1.1 ross Returns the result of converting the quadruple-precision floating-point
4384 1.1 ross value `a' to the 64-bit two's complement integer format. The conversion
4385 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
4386 1.1 ross Arithmetic, except that the conversion is always rounded toward zero.
4387 1.1 ross If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4388 1.1 ross the conversion overflows, the largest integer with the same sign as `a' is
4389 1.1 ross returned.
4390 1.1 ross -------------------------------------------------------------------------------
4391 1.1 ross */
4392 1.1 ross int64 float128_to_int64_round_to_zero( float128 a )
4393 1.1 ross {
4394 1.1 ross flag aSign;
4395 1.1 ross int32 aExp, shiftCount;
4396 1.1 ross bits64 aSig0, aSig1;
4397 1.1 ross int64 z;
4398 1.1 ross
4399 1.1 ross aSig1 = extractFloat128Frac1( a );
4400 1.1 ross aSig0 = extractFloat128Frac0( a );
4401 1.1 ross aExp = extractFloat128Exp( a );
4402 1.1 ross aSign = extractFloat128Sign( a );
/* Nonzero exponent field: add the bit just above the 48 high fraction
   bits.  shiftCount is the left shift needed to line up the integer part
   (negative means a right shift). */
4403 1.1 ross if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4404 1.1 ross shiftCount = aExp - 0x402F;
4405 1.1 ross if ( 0 < shiftCount ) {
/* Exponent 0x403E or above: the magnitude is too large for a positive
   int64.  The one accepted case is a high word of exactly
   0xC03E000000000000 (sign set, exponent 0x403E, high fraction zero)
   with a low word below 0x0002000000000000: it returns the most negative
   integer and is flagged inexact if the low word is nonzero.  Everything
   else is invalid: positive operands and NaNs return the largest
   positive integer, the rest fall through to the most negative one. */
4406 1.1 ross if ( 0x403E <= aExp ) {
4407 1.1 ross aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4408 1.1 ross if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4409 1.1 ross && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4410 1.1 ross if ( aSig1 ) float_set_inexact();
4411 1.1 ross }
4412 1.1 ross else {
4413 1.1 ross float_raise( float_flag_invalid );
4414 1.1 ross if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4415 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
4416 1.1 ross }
4417 1.1 ross }
4418 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
4419 1.1 ross }
/* Left shift: combine the shifted high word with the top bits of the low
   word.  Low-word bits left over after the shift make the result
   inexact. */
4420 1.1 ross z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4421 1.1 ross if ( (bits64) ( aSig1<<shiftCount ) ) {
4422 1.1 ross float_set_inexact();
4423 1.1 ross }
4424 1.1 ross }
4425 1.1 ross else {
/* Exponent below 0x3FFF: the result is 0, flagged inexact unless the
   operand is exactly zero. */
4426 1.1 ross if ( aExp < 0x3FFF ) {
4427 1.1 ross if ( aExp | aSig0 | aSig1 ) {
4428 1.1 ross float_set_inexact();
4429 1.1 ross }
4430 1.1 ross return 0;
4431 1.1 ross }
/* Right shift (or none): the integer part comes from the high word only.
   The result is inexact if the low word is nonzero or if the shift
   dropped any high-word bits. */
4432 1.1 ross z = aSig0>>( - shiftCount );
4433 1.1 ross if ( aSig1
4434 1.1 ross || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4435 1.1 ross float_set_inexact();
4436 1.1 ross }
4437 1.1 ross }
/* Apply the operand's sign to the magnitude. */
4438 1.1 ross if ( aSign ) z = - z;
4439 1.1 ross return z;
4440 1.1 ross
4441 1.1 ross }
4442 1.1 ross
4443 1.1 ross /*
4444 1.1 ross -------------------------------------------------------------------------------
4445 1.1 ross Returns the result of converting the quadruple-precision floating-point
4446 1.1 ross value `a' to the single-precision floating-point format. The conversion
4447 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
4448 1.1 ross Arithmetic.
4449 1.1 ross -------------------------------------------------------------------------------
4450 1.1 ross */
4451 1.1 ross float32 float128_to_float32( float128 a )
4452 1.1 ross {
4453 1.1 ross flag aSign;
4454 1.1 ross int32 aExp;
4455 1.1 ross bits64 aSig0, aSig1;
4456 1.1 ross bits32 zSig;
4457 1.1 ross
4458 1.1 ross aSig1 = extractFloat128Frac1( a );
4459 1.1 ross aSig0 = extractFloat128Frac0( a );
4460 1.1 ross aExp = extractFloat128Exp( a );
4461 1.1 ross aSign = extractFloat128Sign( a );
4462 1.1 ross if ( aExp == 0x7FFF ) {
4463 1.1 ross if ( aSig0 | aSig1 ) {
4464 1.1 ross return commonNaNToFloat32( float128ToCommonNaN( a ) );
4465 1.1 ross }
4466 1.1 ross return packFloat32( aSign, 0xFF, 0 );
4467 1.1 ross }
4468 1.1 ross aSig0 |= ( aSig1 != 0 );
4469 1.1 ross shift64RightJamming( aSig0, 18, &aSig0 );
4470 1.1 ross zSig = aSig0;
4471 1.1 ross if ( aExp || zSig ) {
4472 1.1 ross zSig |= 0x40000000;
4473 1.1 ross aExp -= 0x3F81;
4474 1.1 ross }
4475 1.1 ross return roundAndPackFloat32( aSign, aExp, zSig );
4476 1.1 ross
4477 1.1 ross }
4478 1.1 ross
4479 1.1 ross /*
4480 1.1 ross -------------------------------------------------------------------------------
4481 1.1 ross Returns the result of converting the quadruple-precision floating-point
4482 1.1 ross value `a' to the double-precision floating-point format. The conversion
4483 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
4484 1.1 ross Arithmetic.
4485 1.1 ross -------------------------------------------------------------------------------
4486 1.1 ross */
4487 1.1 ross float64 float128_to_float64( float128 a )
4488 1.1 ross {
4489 1.1 ross flag aSign;
4490 1.1 ross int32 aExp;
4491 1.1 ross bits64 aSig0, aSig1;
4492 1.1 ross
4493 1.1 ross aSig1 = extractFloat128Frac1( a );
4494 1.1 ross aSig0 = extractFloat128Frac0( a );
4495 1.1 ross aExp = extractFloat128Exp( a );
4496 1.1 ross aSign = extractFloat128Sign( a );
4497 1.1 ross if ( aExp == 0x7FFF ) {
4498 1.1 ross if ( aSig0 | aSig1 ) {
4499 1.1 ross return commonNaNToFloat64( float128ToCommonNaN( a ) );
4500 1.1 ross }
4501 1.1 ross return packFloat64( aSign, 0x7FF, 0 );
4502 1.1 ross }
4503 1.1 ross shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4504 1.1 ross aSig0 |= ( aSig1 != 0 );
4505 1.1 ross if ( aExp || aSig0 ) {
4506 1.1 ross aSig0 |= LIT64( 0x4000000000000000 );
4507 1.1 ross aExp -= 0x3C01;
4508 1.1 ross }
4509 1.1 ross return roundAndPackFloat64( aSign, aExp, aSig0 );
4510 1.1 ross
4511 1.1 ross }
4512 1.1 ross
4513 1.1 ross #ifdef FLOATX80
4514 1.1 ross
4515 1.1 ross /*
4516 1.1 ross -------------------------------------------------------------------------------
4517 1.1 ross Returns the result of converting the quadruple-precision floating-point
4518 1.1 ross value `a' to the extended double-precision floating-point format. The
4519 1.1 ross conversion is performed according to the IEC/IEEE Standard for Binary
4520 1.1 ross Floating-Point Arithmetic.
4521 1.1 ross -------------------------------------------------------------------------------
4522 1.1 ross */
4523 1.1 ross floatx80 float128_to_floatx80( float128 a )
4524 1.1 ross {
4525 1.1 ross flag aSign;
4526 1.1 ross int32 aExp;
4527 1.1 ross bits64 aSig0, aSig1;
4528 1.1 ross
4529 1.1 ross aSig1 = extractFloat128Frac1( a );
4530 1.1 ross aSig0 = extractFloat128Frac0( a );
4531 1.1 ross aExp = extractFloat128Exp( a );
4532 1.1 ross aSign = extractFloat128Sign( a );
4533 1.1 ross if ( aExp == 0x7FFF ) {
4534 1.1 ross if ( aSig0 | aSig1 ) {
4535 1.1 ross return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4536 1.1 ross }
4537 1.1 ross return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4538 1.1 ross }
4539 1.1 ross if ( aExp == 0 ) {
4540 1.1 ross if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4541 1.1 ross normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4542 1.1 ross }
4543 1.1 ross else {
4544 1.1 ross aSig0 |= LIT64( 0x0001000000000000 );
4545 1.1 ross }
4546 1.1 ross shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4547 1.1 ross return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4548 1.1 ross
4549 1.1 ross }
4550 1.1 ross
4551 1.1 ross #endif
4552 1.1 ross
4553 1.1 ross /*
4554 1.1 ross -------------------------------------------------------------------------------
4555 1.1 ross Rounds the quadruple-precision floating-point value `a' to an integer, and
4556 1.1 ross returns the result as a quadruple-precision floating-point value. The
4557 1.1 ross operation is performed according to the IEC/IEEE Standard for Binary
4558 1.1 ross Floating-Point Arithmetic.
4559 1.1 ross -------------------------------------------------------------------------------
4560 1.1 ross */
4561 1.1 ross float128 float128_round_to_int( float128 a )
4562 1.1 ross {
4563 1.1 ross flag aSign;
4564 1.1 ross int32 aExp;
4565 1.1 ross bits64 lastBitMask, roundBitsMask;
4566 1.1 ross int8 roundingMode;
4567 1.1 ross float128 z;
4568 1.1 ross
4569 1.1 ross aExp = extractFloat128Exp( a );
4570 1.1 ross if ( 0x402F <= aExp ) {
4571 1.1 ross if ( 0x406F <= aExp ) {
4572 1.1 ross if ( ( aExp == 0x7FFF )
4573 1.1 ross && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4574 1.1 ross ) {
4575 1.1 ross return propagateFloat128NaN( a, a );
4576 1.1 ross }
4577 1.1 ross return a;
4578 1.1 ross }
4579 1.1 ross lastBitMask = 1;
4580 1.1 ross lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4581 1.1 ross roundBitsMask = lastBitMask - 1;
4582 1.1 ross z = a;
4583 1.1 ross roundingMode = float_rounding_mode();
4584 1.1 ross if ( roundingMode == float_round_nearest_even ) {
4585 1.1 ross if ( lastBitMask ) {
4586 1.1 ross add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4587 1.1 ross if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4588 1.1 ross }
4589 1.1 ross else {
4590 1.1 ross if ( (sbits64) z.low < 0 ) {
4591 1.1 ross ++z.high;
4592 1.1 ross if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4593 1.1 ross }
4594 1.1 ross }
4595 1.1 ross }
4596 1.1 ross else if ( roundingMode != float_round_to_zero ) {
4597 1.1 ross if ( extractFloat128Sign( z )
4598 1.1 ross ^ ( roundingMode == float_round_up ) ) {
4599 1.1 ross add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4600 1.1 ross }
4601 1.1 ross }
4602 1.1 ross z.low &= ~ roundBitsMask;
4603 1.1 ross }
4604 1.1 ross else {
4605 1.1 ross if ( aExp < 0x3FFF ) {
4606 1.1 ross if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4607 1.1 ross float_set_inexact();
4608 1.1 ross aSign = extractFloat128Sign( a );
4609 1.1 ross switch ( float_rounding_mode() ) {
4610 1.1 ross case float_round_nearest_even:
4611 1.1 ross if ( ( aExp == 0x3FFE )
4612 1.1 ross && ( extractFloat128Frac0( a )
4613 1.1 ross | extractFloat128Frac1( a ) )
4614 1.1 ross ) {
4615 1.1 ross return packFloat128( aSign, 0x3FFF, 0, 0 );
4616 1.1 ross }
4617 1.1 ross break;
4618 1.1 ross case float_round_down:
4619 1.1 ross return
4620 1.1 ross aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4621 1.1 ross : packFloat128( 0, 0, 0, 0 );
4622 1.1 ross case float_round_up:
4623 1.1 ross return
4624 1.1 ross aSign ? packFloat128( 1, 0, 0, 0 )
4625 1.1 ross : packFloat128( 0, 0x3FFF, 0, 0 );
4626 1.1 ross }
4627 1.1 ross return packFloat128( aSign, 0, 0, 0 );
4628 1.1 ross }
4629 1.1 ross lastBitMask = 1;
4630 1.1 ross lastBitMask <<= 0x402F - aExp;
4631 1.1 ross roundBitsMask = lastBitMask - 1;
4632 1.1 ross z.low = 0;
4633 1.1 ross z.high = a.high;
4634 1.1 ross roundingMode = float_rounding_mode();
4635 1.1 ross if ( roundingMode == float_round_nearest_even ) {
4636 1.1 ross z.high += lastBitMask>>1;
4637 1.1 ross if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4638 1.1 ross z.high &= ~ lastBitMask;
4639 1.1 ross }
4640 1.1 ross }
4641 1.1 ross else if ( roundingMode != float_round_to_zero ) {
4642 1.1 ross if ( extractFloat128Sign( z )
4643 1.1 ross ^ ( roundingMode == float_round_up ) ) {
4644 1.1 ross z.high |= ( a.low != 0 );
4645 1.1 ross z.high += roundBitsMask;
4646 1.1 ross }
4647 1.1 ross }
4648 1.1 ross z.high &= ~ roundBitsMask;
4649 1.1 ross }
4650 1.1 ross if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4651 1.1 ross float_set_inexact();
4652 1.1 ross }
4653 1.1 ross return z;
4654 1.1 ross
4655 1.1 ross }
4656 1.1 ross
4657 1.1 ross /*
4658 1.1 ross -------------------------------------------------------------------------------
4659 1.1 ross Returns the result of adding the absolute values of the quadruple-precision
4660 1.1 ross floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4661 1.1 ross before being returned. `zSign' is ignored if the result is a NaN.
4662 1.1 ross The addition is performed according to the IEC/IEEE Standard for Binary
4663 1.1 ross Floating-Point Arithmetic.
4664 1.1 ross -------------------------------------------------------------------------------
4665 1.1 ross */
4666 1.1 ross static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4667 1.1 ross {
4668 1.1 ross int32 aExp, bExp, zExp;
4669 1.1 ross bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4670 1.1 ross int32 expDiff;
4671 1.1 ross
4672 1.1 ross aSig1 = extractFloat128Frac1( a );
4673 1.1 ross aSig0 = extractFloat128Frac0( a );
4674 1.1 ross aExp = extractFloat128Exp( a );
4675 1.1 ross bSig1 = extractFloat128Frac1( b );
4676 1.1 ross bSig0 = extractFloat128Frac0( b );
4677 1.1 ross bExp = extractFloat128Exp( b );
4678 1.1 ross expDiff = aExp - bExp;
4679 1.1 ross if ( 0 < expDiff ) {
4680 1.1 ross if ( aExp == 0x7FFF ) {
4681 1.1 ross if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4682 1.1 ross return a;
4683 1.1 ross }
4684 1.1 ross if ( bExp == 0 ) {
4685 1.1 ross --expDiff;
4686 1.1 ross }
4687 1.1 ross else {
4688 1.1 ross bSig0 |= LIT64( 0x0001000000000000 );
4689 1.1 ross }
4690 1.1 ross shift128ExtraRightJamming(
4691 1.1 ross bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4692 1.1 ross zExp = aExp;
4693 1.1 ross }
4694 1.1 ross else if ( expDiff < 0 ) {
4695 1.1 ross if ( bExp == 0x7FFF ) {
4696 1.1 ross if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4697 1.1 ross return packFloat128( zSign, 0x7FFF, 0, 0 );
4698 1.1 ross }
4699 1.1 ross if ( aExp == 0 ) {
4700 1.1 ross ++expDiff;
4701 1.1 ross }
4702 1.1 ross else {
4703 1.1 ross aSig0 |= LIT64( 0x0001000000000000 );
4704 1.1 ross }
4705 1.1 ross shift128ExtraRightJamming(
4706 1.1 ross aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4707 1.1 ross zExp = bExp;
4708 1.1 ross }
4709 1.1 ross else {
4710 1.1 ross if ( aExp == 0x7FFF ) {
4711 1.1 ross if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4712 1.1 ross return propagateFloat128NaN( a, b );
4713 1.1 ross }
4714 1.1 ross return a;
4715 1.1 ross }
4716 1.1 ross add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4717 1.1 ross if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4718 1.1 ross zSig2 = 0;
4719 1.1 ross zSig0 |= LIT64( 0x0002000000000000 );
4720 1.1 ross zExp = aExp;
4721 1.1 ross goto shiftRight1;
4722 1.1 ross }
4723 1.1 ross aSig0 |= LIT64( 0x0001000000000000 );
4724 1.1 ross add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4725 1.1 ross --zExp;
4726 1.1 ross if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4727 1.1 ross ++zExp;
4728 1.1 ross shiftRight1:
4729 1.1 ross shift128ExtraRightJamming(
4730 1.1 ross zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4731 1.1 ross roundAndPack:
4732 1.1 ross return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4733 1.1 ross
4734 1.1 ross }
4735 1.1 ross
4736 1.1 ross /*
4737 1.1 ross -------------------------------------------------------------------------------
4738 1.1 ross Returns the result of subtracting the absolute values of the quadruple-
4739 1.1 ross precision floating-point values `a' and `b'. If `zSign' is 1, the
4740 1.1 ross difference is negated before being returned. `zSign' is ignored if the
4741 1.1 ross result is a NaN. The subtraction is performed according to the IEC/IEEE
4742 1.1 ross Standard for Binary Floating-Point Arithmetic.
4743 1.1 ross -------------------------------------------------------------------------------
4744 1.1 ross */
4745 1.1 ross static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4746 1.1 ross {
4747 1.1 ross int32 aExp, bExp, zExp;
4748 1.1 ross bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4749 1.1 ross int32 expDiff;
4750 1.1 ross float128 z;
4751 1.1 ross
4752 1.1 ross aSig1 = extractFloat128Frac1( a );
4753 1.1 ross aSig0 = extractFloat128Frac0( a );
4754 1.1 ross aExp = extractFloat128Exp( a );
4755 1.1 ross bSig1 = extractFloat128Frac1( b );
4756 1.1 ross bSig0 = extractFloat128Frac0( b );
4757 1.1 ross bExp = extractFloat128Exp( b );
4758 1.1 ross expDiff = aExp - bExp;
4759 1.1 ross shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4760 1.1 ross shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4761 1.1 ross if ( 0 < expDiff ) goto aExpBigger;
4762 1.1 ross if ( expDiff < 0 ) goto bExpBigger;
4763 1.1 ross if ( aExp == 0x7FFF ) {
4764 1.1 ross if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4765 1.1 ross return propagateFloat128NaN( a, b );
4766 1.1 ross }
4767 1.1 ross float_raise( float_flag_invalid );
4768 1.1 ross z.low = float128_default_nan_low;
4769 1.1 ross z.high = float128_default_nan_high;
4770 1.1 ross return z;
4771 1.1 ross }
4772 1.1 ross if ( aExp == 0 ) {
4773 1.1 ross aExp = 1;
4774 1.1 ross bExp = 1;
4775 1.1 ross }
4776 1.1 ross if ( bSig0 < aSig0 ) goto aBigger;
4777 1.1 ross if ( aSig0 < bSig0 ) goto bBigger;
4778 1.1 ross if ( bSig1 < aSig1 ) goto aBigger;
4779 1.1 ross if ( aSig1 < bSig1 ) goto bBigger;
4780 1.1 ross return packFloat128( float_rounding_mode() == float_round_down, 0, 0, 0 );
4781 1.1 ross bExpBigger:
4782 1.1 ross if ( bExp == 0x7FFF ) {
4783 1.1 ross if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4784 1.1 ross return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4785 1.1 ross }
4786 1.1 ross if ( aExp == 0 ) {
4787 1.1 ross ++expDiff;
4788 1.1 ross }
4789 1.1 ross else {
4790 1.1 ross aSig0 |= LIT64( 0x4000000000000000 );
4791 1.1 ross }
4792 1.1 ross shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4793 1.1 ross bSig0 |= LIT64( 0x4000000000000000 );
4794 1.1 ross bBigger:
4795 1.1 ross sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4796 1.1 ross zExp = bExp;
4797 1.1 ross zSign ^= 1;
4798 1.1 ross goto normalizeRoundAndPack;
4799 1.1 ross aExpBigger:
4800 1.1 ross if ( aExp == 0x7FFF ) {
4801 1.1 ross if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4802 1.1 ross return a;
4803 1.1 ross }
4804 1.1 ross if ( bExp == 0 ) {
4805 1.1 ross --expDiff;
4806 1.1 ross }
4807 1.1 ross else {
4808 1.1 ross bSig0 |= LIT64( 0x4000000000000000 );
4809 1.1 ross }
4810 1.1 ross shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4811 1.1 ross aSig0 |= LIT64( 0x4000000000000000 );
4812 1.1 ross aBigger:
4813 1.1 ross sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4814 1.1 ross zExp = aExp;
4815 1.1 ross normalizeRoundAndPack:
4816 1.1 ross --zExp;
4817 1.1 ross return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4818 1.1 ross
4819 1.1 ross }
4820 1.1 ross
4821 1.1 ross /*
4822 1.1 ross -------------------------------------------------------------------------------
4823 1.1 ross Returns the result of adding the quadruple-precision floating-point values
4824 1.1 ross `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4825 1.1 ross for Binary Floating-Point Arithmetic.
4826 1.1 ross -------------------------------------------------------------------------------
4827 1.1 ross */
4828 1.1 ross float128 float128_add( float128 a, float128 b )
4829 1.1 ross {
4830 1.1 ross flag aSign, bSign;
4831 1.1 ross
4832 1.1 ross aSign = extractFloat128Sign( a );
4833 1.1 ross bSign = extractFloat128Sign( b );
4834 1.1 ross if ( aSign == bSign ) {
4835 1.1 ross return addFloat128Sigs( a, b, aSign );
4836 1.1 ross }
4837 1.1 ross else {
4838 1.1 ross return subFloat128Sigs( a, b, aSign );
4839 1.1 ross }
4840 1.1 ross
4841 1.1 ross }
4842 1.1 ross
4843 1.1 ross /*
4844 1.1 ross -------------------------------------------------------------------------------
4845 1.1 ross Returns the result of subtracting the quadruple-precision floating-point
4846 1.1 ross values `a' and `b'. The operation is performed according to the IEC/IEEE
4847 1.1 ross Standard for Binary Floating-Point Arithmetic.
4848 1.1 ross -------------------------------------------------------------------------------
4849 1.1 ross */
4850 1.1 ross float128 float128_sub( float128 a, float128 b )
4851 1.1 ross {
4852 1.1 ross flag aSign, bSign;
4853 1.1 ross
4854 1.1 ross aSign = extractFloat128Sign( a );
4855 1.1 ross bSign = extractFloat128Sign( b );
4856 1.1 ross if ( aSign == bSign ) {
4857 1.1 ross return subFloat128Sigs( a, b, aSign );
4858 1.1 ross }
4859 1.1 ross else {
4860 1.1 ross return addFloat128Sigs( a, b, aSign );
4861 1.1 ross }
4862 1.1 ross
4863 1.1 ross }
4864 1.1 ross
4865 1.1 ross /*
4866 1.1 ross -------------------------------------------------------------------------------
4867 1.1 ross Returns the result of multiplying the quadruple-precision floating-point
4868 1.1 ross values `a' and `b'. The operation is performed according to the IEC/IEEE
4869 1.1 ross Standard for Binary Floating-Point Arithmetic.
4870 1.1 ross -------------------------------------------------------------------------------
4871 1.1 ross */
4872 1.1 ross float128 float128_mul( float128 a, float128 b )
4873 1.1 ross {
4874 1.1 ross flag aSign, bSign, zSign;
4875 1.1 ross int32 aExp, bExp, zExp;
4876 1.1 ross bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4877 1.1 ross float128 z;
4878 1.1 ross
4879 1.1 ross aSig1 = extractFloat128Frac1( a );
4880 1.1 ross aSig0 = extractFloat128Frac0( a );
4881 1.1 ross aExp = extractFloat128Exp( a );
4882 1.1 ross aSign = extractFloat128Sign( a );
4883 1.1 ross bSig1 = extractFloat128Frac1( b );
4884 1.1 ross bSig0 = extractFloat128Frac0( b );
4885 1.1 ross bExp = extractFloat128Exp( b );
4886 1.1 ross bSign = extractFloat128Sign( b );
4887 1.1 ross zSign = aSign ^ bSign;
4888 1.1 ross if ( aExp == 0x7FFF ) {
4889 1.1 ross if ( ( aSig0 | aSig1 )
4890 1.1 ross || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4891 1.1 ross return propagateFloat128NaN( a, b );
4892 1.1 ross }
4893 1.1 ross if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4894 1.1 ross return packFloat128( zSign, 0x7FFF, 0, 0 );
4895 1.1 ross }
4896 1.1 ross if ( bExp == 0x7FFF ) {
4897 1.1 ross if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4898 1.1 ross if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4899 1.1 ross invalid:
4900 1.1 ross float_raise( float_flag_invalid );
4901 1.1 ross z.low = float128_default_nan_low;
4902 1.1 ross z.high = float128_default_nan_high;
4903 1.1 ross return z;
4904 1.1 ross }
4905 1.1 ross return packFloat128( zSign, 0x7FFF, 0, 0 );
4906 1.1 ross }
4907 1.1 ross if ( aExp == 0 ) {
4908 1.1 ross if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4909 1.1 ross normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4910 1.1 ross }
4911 1.1 ross if ( bExp == 0 ) {
4912 1.1 ross if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4913 1.1 ross normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4914 1.1 ross }
4915 1.1 ross zExp = aExp + bExp - 0x4000;
4916 1.1 ross aSig0 |= LIT64( 0x0001000000000000 );
4917 1.1 ross shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4918 1.1 ross mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4919 1.1 ross add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4920 1.1 ross zSig2 |= ( zSig3 != 0 );
4921 1.1 ross if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4922 1.1 ross shift128ExtraRightJamming(
4923 1.1 ross zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4924 1.1 ross ++zExp;
4925 1.1 ross }
4926 1.1 ross return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4927 1.1 ross
4928 1.1 ross }
4929 1.1 ross
4930 1.1 ross /*
4931 1.1 ross -------------------------------------------------------------------------------
4932 1.1 ross Returns the result of dividing the quadruple-precision floating-point value
4933 1.1 ross `a' by the corresponding value `b'. The operation is performed according to
4934 1.1 ross the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4935 1.1 ross -------------------------------------------------------------------------------
4936 1.1 ross */
4937 1.1 ross float128 float128_div( float128 a, float128 b )
4938 1.1 ross {
4939 1.1 ross flag aSign, bSign, zSign;
4940 1.1 ross int32 aExp, bExp, zExp;
4941 1.1 ross bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4942 1.1 ross bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4943 1.1 ross float128 z;
4944 1.1 ross
4945 1.1 ross aSig1 = extractFloat128Frac1( a );
4946 1.1 ross aSig0 = extractFloat128Frac0( a );
4947 1.1 ross aExp = extractFloat128Exp( a );
4948 1.1 ross aSign = extractFloat128Sign( a );
4949 1.1 ross bSig1 = extractFloat128Frac1( b );
4950 1.1 ross bSig0 = extractFloat128Frac0( b );
4951 1.1 ross bExp = extractFloat128Exp( b );
4952 1.1 ross bSign = extractFloat128Sign( b );
4953 1.1 ross zSign = aSign ^ bSign;
4954 1.1 ross if ( aExp == 0x7FFF ) {
4955 1.1 ross if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4956 1.1 ross if ( bExp == 0x7FFF ) {
4957 1.1 ross if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4958 1.1 ross goto invalid;
4959 1.1 ross }
4960 1.1 ross return packFloat128( zSign, 0x7FFF, 0, 0 );
4961 1.1 ross }
4962 1.1 ross if ( bExp == 0x7FFF ) {
4963 1.1 ross if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4964 1.1 ross return packFloat128( zSign, 0, 0, 0 );
4965 1.1 ross }
4966 1.1 ross if ( bExp == 0 ) {
4967 1.1 ross if ( ( bSig0 | bSig1 ) == 0 ) {
4968 1.1 ross if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4969 1.1 ross invalid:
4970 1.1 ross float_raise( float_flag_invalid );
4971 1.1 ross z.low = float128_default_nan_low;
4972 1.1 ross z.high = float128_default_nan_high;
4973 1.1 ross return z;
4974 1.1 ross }
4975 1.1 ross float_raise( float_flag_divbyzero );
4976 1.1 ross return packFloat128( zSign, 0x7FFF, 0, 0 );
4977 1.1 ross }
4978 1.1 ross normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4979 1.1 ross }
4980 1.1 ross if ( aExp == 0 ) {
4981 1.1 ross if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4982 1.1 ross normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4983 1.1 ross }
4984 1.1 ross zExp = aExp - bExp + 0x3FFD;
4985 1.1 ross shortShift128Left(
4986 1.1 ross aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4987 1.1 ross shortShift128Left(
4988 1.1 ross bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4989 1.1 ross if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4990 1.1 ross shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4991 1.1 ross ++zExp;
4992 1.1 ross }
4993 1.1 ross zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4994 1.1 ross mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4995 1.1 ross sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4996 1.1 ross while ( (sbits64) rem0 < 0 ) {
4997 1.1 ross --zSig0;
4998 1.1 ross add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4999 1.1 ross }
5000 1.1 ross zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5001 1.1 ross if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5002 1.1 ross mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5003 1.1 ross sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5004 1.1 ross while ( (sbits64) rem1 < 0 ) {
5005 1.1 ross --zSig1;
5006 1.1 ross add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5007 1.1 ross }
5008 1.1 ross zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5009 1.1 ross }
5010 1.1 ross shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5011 1.1 ross return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5012 1.1 ross
5013 1.1 ross }
5014 1.1 ross
5015 1.1 ross /*
5016 1.1 ross -------------------------------------------------------------------------------
5017 1.1 ross Returns the remainder of the quadruple-precision floating-point value `a'
5018 1.1 ross with respect to the corresponding value `b'. The operation is performed
5019 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5020 1.1 ross -------------------------------------------------------------------------------
5021 1.1 ross */
5022 1.1 ross float128 float128_rem( float128 a, float128 b )
5023 1.1 ross {
5024 1.1 ross flag aSign, bSign, zSign;
5025 1.1 ross int32 aExp, bExp, expDiff;
5026 1.1 ross bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5027 1.1 ross bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5028 1.1 ross sbits64 sigMean0;
5029 1.1 ross float128 z;
5030 1.1 ross
5031 1.1 ross aSig1 = extractFloat128Frac1( a );
5032 1.1 ross aSig0 = extractFloat128Frac0( a );
5033 1.1 ross aExp = extractFloat128Exp( a );
5034 1.1 ross aSign = extractFloat128Sign( a );
5035 1.1 ross bSig1 = extractFloat128Frac1( b );
5036 1.1 ross bSig0 = extractFloat128Frac0( b );
5037 1.1 ross bExp = extractFloat128Exp( b );
5038 1.1 ross bSign = extractFloat128Sign( b );
5039 1.1 ross if ( aExp == 0x7FFF ) {
5040 1.1 ross if ( ( aSig0 | aSig1 )
5041 1.1 ross || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5042 1.1 ross return propagateFloat128NaN( a, b );
5043 1.1 ross }
5044 1.1 ross goto invalid;
5045 1.1 ross }
5046 1.1 ross if ( bExp == 0x7FFF ) {
5047 1.1 ross if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5048 1.1 ross return a;
5049 1.1 ross }
5050 1.1 ross if ( bExp == 0 ) {
5051 1.1 ross if ( ( bSig0 | bSig1 ) == 0 ) {
5052 1.1 ross invalid:
5053 1.1 ross float_raise( float_flag_invalid );
5054 1.1 ross z.low = float128_default_nan_low;
5055 1.1 ross z.high = float128_default_nan_high;
5056 1.1 ross return z;
5057 1.1 ross }
5058 1.1 ross normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5059 1.1 ross }
5060 1.1 ross if ( aExp == 0 ) {
5061 1.1 ross if ( ( aSig0 | aSig1 ) == 0 ) return a;
5062 1.1 ross normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5063 1.1 ross }
5064 1.1 ross expDiff = aExp - bExp;
5065 1.1 ross if ( expDiff < -1 ) return a;
5066 1.1 ross shortShift128Left(
5067 1.1 ross aSig0 | LIT64( 0x0001000000000000 ),
5068 1.1 ross aSig1,
5069 1.1 ross 15 - ( expDiff < 0 ),
5070 1.1 ross &aSig0,
5071 1.1 ross &aSig1
5072 1.1 ross );
5073 1.1 ross shortShift128Left(
5074 1.1 ross bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5075 1.1 ross q = le128( bSig0, bSig1, aSig0, aSig1 );
5076 1.1 ross if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5077 1.1 ross expDiff -= 64;
5078 1.1 ross while ( 0 < expDiff ) {
5079 1.1 ross q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5080 1.1 ross q = ( 4 < q ) ? q - 4 : 0;
5081 1.1 ross mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5082 1.1 ross shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5083 1.1 ross shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5084 1.1 ross sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5085 1.1 ross expDiff -= 61;
5086 1.1 ross }
5087 1.1 ross if ( -64 < expDiff ) {
5088 1.1 ross q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5089 1.1 ross q = ( 4 < q ) ? q - 4 : 0;
5090 1.1 ross q >>= - expDiff;
5091 1.1 ross shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5092 1.1 ross expDiff += 52;
5093 1.1 ross if ( expDiff < 0 ) {
5094 1.1 ross shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5095 1.1 ross }
5096 1.1 ross else {
5097 1.1 ross shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5098 1.1 ross }
5099 1.1 ross mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5100 1.1 ross sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5101 1.1 ross }
5102 1.1 ross else {
5103 1.1 ross shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5104 1.1 ross shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5105 1.1 ross }
5106 1.1 ross do {
5107 1.1 ross alternateASig0 = aSig0;
5108 1.1 ross alternateASig1 = aSig1;
5109 1.1 ross ++q;
5110 1.1 ross sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5111 1.1 ross } while ( 0 <= (sbits64) aSig0 );
5112 1.1 ross add128(
5113 1.1 ross aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
5114 1.1 ross if ( ( sigMean0 < 0 )
5115 1.1 ross || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5116 1.1 ross aSig0 = alternateASig0;
5117 1.1 ross aSig1 = alternateASig1;
5118 1.1 ross }
5119 1.1 ross zSign = ( (sbits64) aSig0 < 0 );
5120 1.1 ross if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5121 1.1 ross return
5122 1.1 ross normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5123 1.1 ross
5124 1.1 ross }
5125 1.1 ross
5126 1.1 ross /*
5127 1.1 ross -------------------------------------------------------------------------------
5128 1.1 ross Returns the square root of the quadruple-precision floating-point value `a'.
5129 1.1 ross The operation is performed according to the IEC/IEEE Standard for Binary
5130 1.1 ross Floating-Point Arithmetic.
5131 1.1 ross -------------------------------------------------------------------------------
5132 1.1 ross */
5133 1.1 ross float128 float128_sqrt( float128 a )
5134 1.1 ross {
5135 1.1 ross flag aSign;
5136 1.1 ross int32 aExp, zExp;
5137 1.1 ross bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5138 1.1 ross bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5139 1.1 ross float128 z;
5140 1.1 ross
5141 1.1 ross aSig1 = extractFloat128Frac1( a );
5142 1.1 ross aSig0 = extractFloat128Frac0( a );
5143 1.1 ross aExp = extractFloat128Exp( a );
5144 1.1 ross aSign = extractFloat128Sign( a );
5145 1.1 ross if ( aExp == 0x7FFF ) {
5146 1.1 ross if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5147 1.1 ross if ( ! aSign ) return a;
5148 1.1 ross goto invalid;
5149 1.1 ross }
5150 1.1 ross if ( aSign ) {
5151 1.1 ross if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5152 1.1 ross invalid:
5153 1.1 ross float_raise( float_flag_invalid );
5154 1.1 ross z.low = float128_default_nan_low;
5155 1.1 ross z.high = float128_default_nan_high;
5156 1.1 ross return z;
5157 1.1 ross }
5158 1.1 ross if ( aExp == 0 ) {
5159 1.1 ross if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5160 1.1 ross normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5161 1.1 ross }
5162 1.1 ross zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5163 1.1 ross aSig0 |= LIT64( 0x0001000000000000 );
5164 1.1 ross zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5165 1.1 ross shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5166 1.1 ross zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5167 1.1 ross doubleZSig0 = zSig0<<1;
5168 1.1 ross mul64To128( zSig0, zSig0, &term0, &term1 );
5169 1.1 ross sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5170 1.1 ross while ( (sbits64) rem0 < 0 ) {
5171 1.1 ross --zSig0;
5172 1.1 ross doubleZSig0 -= 2;
5173 1.1 ross add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5174 1.1 ross }
5175 1.1 ross zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5176 1.1 ross if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5177 1.1 ross if ( zSig1 == 0 ) zSig1 = 1;
5178 1.1 ross mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5179 1.1 ross sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5180 1.1 ross mul64To128( zSig1, zSig1, &term2, &term3 );
5181 1.1 ross sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5182 1.1 ross while ( (sbits64) rem1 < 0 ) {
5183 1.1 ross --zSig1;
5184 1.1 ross shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5185 1.1 ross term3 |= 1;
5186 1.1 ross term2 |= doubleZSig0;
5187 1.1 ross add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5188 1.1 ross }
5189 1.1 ross zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5190 1.1 ross }
5191 1.1 ross shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5192 1.1 ross return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5193 1.1 ross
5194 1.1 ross }
5195 1.1 ross
5196 1.1 ross /*
5197 1.1 ross -------------------------------------------------------------------------------
5198 1.1 ross Returns 1 if the quadruple-precision floating-point value `a' is equal to
5199 1.1 ross the corresponding value `b', and 0 otherwise. The comparison is performed
5200 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5201 1.1 ross -------------------------------------------------------------------------------
5202 1.1 ross */
5203 1.1 ross flag float128_eq( float128 a, float128 b )
5204 1.1 ross {
5205 1.1 ross
5206 1.1 ross if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5207 1.1 ross && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5208 1.1 ross || ( ( extractFloat128Exp( b ) == 0x7FFF )
5209 1.1 ross && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5210 1.1 ross ) {
5211 1.1 ross if ( float128_is_signaling_nan( a )
5212 1.1 ross || float128_is_signaling_nan( b ) ) {
5213 1.1 ross float_raise( float_flag_invalid );
5214 1.1 ross }
5215 1.1 ross return 0;
5216 1.1 ross }
5217 1.1 ross return
5218 1.1 ross ( a.low == b.low )
5219 1.1 ross && ( ( a.high == b.high )
5220 1.1 ross || ( ( a.low == 0 )
5221 1.1 ross && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5222 1.1 ross );
5223 1.1 ross
5224 1.1 ross }
5225 1.1 ross
5226 1.1 ross /*
5227 1.1 ross -------------------------------------------------------------------------------
5228 1.1 ross Returns 1 if the quadruple-precision floating-point value `a' is less than
5229 1.1 ross or equal to the corresponding value `b', and 0 otherwise. The comparison
5230 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
5231 1.1 ross Arithmetic.
5232 1.1 ross -------------------------------------------------------------------------------
5233 1.1 ross */
5234 1.1 ross flag float128_le( float128 a, float128 b )
5235 1.1 ross {
5236 1.1 ross flag aSign, bSign;
5237 1.1 ross
5238 1.1 ross if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5239 1.1 ross && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5240 1.1 ross || ( ( extractFloat128Exp( b ) == 0x7FFF )
5241 1.1 ross && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5242 1.1 ross ) {
5243 1.1 ross float_raise( float_flag_invalid );
5244 1.1 ross return 0;
5245 1.1 ross }
5246 1.1 ross aSign = extractFloat128Sign( a );
5247 1.1 ross bSign = extractFloat128Sign( b );
5248 1.1 ross if ( aSign != bSign ) {
5249 1.1 ross return
5250 1.1 ross aSign
5251 1.1 ross || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5252 1.1 ross == 0 );
5253 1.1 ross }
5254 1.1 ross return
5255 1.1 ross aSign ? le128( b.high, b.low, a.high, a.low )
5256 1.1 ross : le128( a.high, a.low, b.high, b.low );
5257 1.1 ross
5258 1.1 ross }
5259 1.1 ross
5260 1.1 ross /*
5261 1.1 ross -------------------------------------------------------------------------------
5262 1.1 ross Returns 1 if the quadruple-precision floating-point value `a' is less than
5263 1.1 ross the corresponding value `b', and 0 otherwise. The comparison is performed
5264 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5265 1.1 ross -------------------------------------------------------------------------------
5266 1.1 ross */
5267 1.1 ross flag float128_lt( float128 a, float128 b )
5268 1.1 ross {
5269 1.1 ross flag aSign, bSign;
5270 1.1 ross
5271 1.1 ross if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5272 1.1 ross && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5273 1.1 ross || ( ( extractFloat128Exp( b ) == 0x7FFF )
5274 1.1 ross && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5275 1.1 ross ) {
5276 1.1 ross float_raise( float_flag_invalid );
5277 1.1 ross return 0;
5278 1.1 ross }
5279 1.1 ross aSign = extractFloat128Sign( a );
5280 1.1 ross bSign = extractFloat128Sign( b );
5281 1.1 ross if ( aSign != bSign ) {
5282 1.1 ross return
5283 1.1 ross aSign
5284 1.1 ross && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5285 1.1 ross != 0 );
5286 1.1 ross }
5287 1.1 ross return
5288 1.1 ross aSign ? lt128( b.high, b.low, a.high, a.low )
5289 1.1 ross : lt128( a.high, a.low, b.high, b.low );
5290 1.1 ross
5291 1.1 ross }
5292 1.1 ross
5293 1.1 ross /*
5294 1.1 ross -------------------------------------------------------------------------------
5295 1.1 ross Returns 1 if the quadruple-precision floating-point value `a' is equal to
5296 1.1 ross the corresponding value `b', and 0 otherwise. The invalid exception is
5297 1.1 ross raised if either operand is a NaN. Otherwise, the comparison is performed
5298 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5299 1.1 ross -------------------------------------------------------------------------------
5300 1.1 ross */
5301 1.1 ross flag float128_eq_signaling( float128 a, float128 b )
5302 1.1 ross {
5303 1.1 ross
5304 1.1 ross if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5305 1.1 ross && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5306 1.1 ross || ( ( extractFloat128Exp( b ) == 0x7FFF )
5307 1.1 ross && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5308 1.1 ross ) {
5309 1.1 ross float_raise( float_flag_invalid );
5310 1.1 ross return 0;
5311 1.1 ross }
5312 1.1 ross return
5313 1.1 ross ( a.low == b.low )
5314 1.1 ross && ( ( a.high == b.high )
5315 1.1 ross || ( ( a.low == 0 )
5316 1.1 ross && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5317 1.1 ross );
5318 1.1 ross
5319 1.1 ross }
5320 1.1 ross
5321 1.1 ross /*
5322 1.1 ross -------------------------------------------------------------------------------
5323 1.1 ross Returns 1 if the quadruple-precision floating-point value `a' is less than
5324 1.1 ross or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5325 1.1 ross cause an exception. Otherwise, the comparison is performed according to the
5326 1.1 ross IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5327 1.1 ross -------------------------------------------------------------------------------
5328 1.1 ross */
5329 1.1 ross flag float128_le_quiet( float128 a, float128 b )
5330 1.1 ross {
5331 1.1 ross flag aSign, bSign;
5332 1.1 ross
5333 1.1 ross if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5334 1.1 ross && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5335 1.1 ross || ( ( extractFloat128Exp( b ) == 0x7FFF )
5336 1.1 ross && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5337 1.1 ross ) {
5338 1.1 ross if ( float128_is_signaling_nan( a )
5339 1.1 ross || float128_is_signaling_nan( b ) ) {
5340 1.1 ross float_raise( float_flag_invalid );
5341 1.1 ross }
5342 1.1 ross return 0;
5343 1.1 ross }
5344 1.1 ross aSign = extractFloat128Sign( a );
5345 1.1 ross bSign = extractFloat128Sign( b );
5346 1.1 ross if ( aSign != bSign ) {
5347 1.1 ross return
5348 1.1 ross aSign
5349 1.1 ross || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5350 1.1 ross == 0 );
5351 1.1 ross }
5352 1.1 ross return
5353 1.1 ross aSign ? le128( b.high, b.low, a.high, a.low )
5354 1.1 ross : le128( a.high, a.low, b.high, b.low );
5355 1.1 ross
5356 1.1 ross }
5357 1.1 ross
5358 1.1 ross /*
5359 1.1 ross -------------------------------------------------------------------------------
5360 1.1 ross Returns 1 if the quadruple-precision floating-point value `a' is less than
5361 1.1 ross the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5362 1.1 ross exception. Otherwise, the comparison is performed according to the IEC/IEEE
5363 1.1 ross Standard for Binary Floating-Point Arithmetic.
5364 1.1 ross -------------------------------------------------------------------------------
5365 1.1 ross */
5366 1.1 ross flag float128_lt_quiet( float128 a, float128 b )
5367 1.1 ross {
5368 1.1 ross flag aSign, bSign;
5369 1.1 ross
5370 1.1 ross if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5371 1.1 ross && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5372 1.1 ross || ( ( extractFloat128Exp( b ) == 0x7FFF )
5373 1.1 ross && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5374 1.1 ross ) {
5375 1.1 ross if ( float128_is_signaling_nan( a )
5376 1.1 ross || float128_is_signaling_nan( b ) ) {
5377 1.1 ross float_raise( float_flag_invalid );
5378 1.1 ross }
5379 1.1 ross return 0;
5380 1.1 ross }
5381 1.1 ross aSign = extractFloat128Sign( a );
5382 1.1 ross bSign = extractFloat128Sign( b );
5383 1.1 ross if ( aSign != bSign ) {
5384 1.1 ross return
5385 1.1 ross aSign
5386 1.1 ross && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5387 1.1 ross != 0 );
5388 1.1 ross }
5389 1.1 ross return
5390 1.1 ross aSign ? lt128( b.high, b.low, a.high, a.low )
5391 1.1 ross : lt128( a.high, a.low, b.high, b.low );
5392 1.1 ross
5393 1.1 ross }
5394 1.1 ross
5395 1.1 ross #endif
5396 1.1 ross
5397 1.1 ross
5398 1.1 ross #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5399 1.1 ross
5400 1.1 ross /*
5401 1.1 ross * These two routines are not part of the original softfloat distribution.
5402 1.1 ross *
5403 1.1 ross * They are based on the corresponding conversions to integer but return
5404 1.1 ross * unsigned numbers instead since these functions are required by GCC.
5405 1.1 ross *
5406 1.3 keihan * Added by Mark Brinicombe <mark (at) NetBSD.org> 27/09/97
5407 1.1 ross *
5408 1.1 ross * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5409 1.1 ross */
5410 1.1 ross
5411 1.1 ross /*
5412 1.1 ross -------------------------------------------------------------------------------
5413 1.1 ross Returns the result of converting the double-precision floating-point value
5414 1.1 ross `a' to the 32-bit unsigned integer format. The conversion is
5415 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-point
5416 1.1 ross Arithmetic, except that the conversion is always rounded toward zero. If
5417 1.1 ross `a' is a NaN, the largest positive integer is returned. If the conversion
5418 1.1 ross overflows, the largest integer positive is returned.
5419 1.1 ross -------------------------------------------------------------------------------
5420 1.1 ross */
5421 1.1 ross uint32 float64_to_uint32_round_to_zero( float64 a )
5422 1.1 ross {
5423 1.1 ross flag aSign;
5424 1.1 ross int16 aExp, shiftCount;
5425 1.1 ross bits64 aSig, savedASig;
5426 1.1 ross uint32 z;
5427 1.1 ross
5428 1.1 ross aSig = extractFloat64Frac( a );
5429 1.1 ross aExp = extractFloat64Exp( a );
5430 1.1 ross aSign = extractFloat64Sign( a );
5431 1.1 ross
5432 1.1 ross if (aSign) {
5433 1.1 ross float_raise( float_flag_invalid );
5434 1.1 ross return(0);
5435 1.1 ross }
5436 1.1 ross
5437 1.1 ross if ( 0x41E < aExp ) {
5438 1.1 ross float_raise( float_flag_invalid );
5439 1.1 ross return 0xffffffff;
5440 1.1 ross }
5441 1.1 ross else if ( aExp < 0x3FF ) {
5442 1.1 ross if ( aExp || aSig ) float_set_inexact();
5443 1.1 ross return 0;
5444 1.1 ross }
5445 1.1 ross aSig |= LIT64( 0x0010000000000000 );
5446 1.1 ross shiftCount = 0x433 - aExp;
5447 1.1 ross savedASig = aSig;
5448 1.1 ross aSig >>= shiftCount;
5449 1.1 ross z = aSig;
5450 1.1 ross if ( ( aSig<<shiftCount ) != savedASig ) {
5451 1.1 ross float_set_inexact();
5452 1.1 ross }
5453 1.1 ross return z;
5454 1.1 ross
5455 1.1 ross }
5456 1.1 ross
5457 1.1 ross /*
5458 1.1 ross -------------------------------------------------------------------------------
5459 1.1 ross Returns the result of converting the single-precision floating-point value
5460 1.1 ross `a' to the 32-bit unsigned integer format. The conversion is
5461 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-point
5462 1.1 ross Arithmetic, except that the conversion is always rounded toward zero. If
5463 1.1 ross `a' is a NaN, the largest positive integer is returned. If the conversion
5464 1.1 ross overflows, the largest positive integer is returned.
5465 1.1 ross -------------------------------------------------------------------------------
5466 1.1 ross */
5467 1.1 ross uint32 float32_to_uint32_round_to_zero( float32 a )
5468 1.1 ross {
5469 1.1 ross flag aSign;
5470 1.1 ross int16 aExp, shiftCount;
5471 1.1 ross bits32 aSig;
5472 1.1 ross uint32 z;
5473 1.1 ross
5474 1.1 ross aSig = extractFloat32Frac( a );
5475 1.1 ross aExp = extractFloat32Exp( a );
5476 1.1 ross aSign = extractFloat32Sign( a );
5477 1.1 ross shiftCount = aExp - 0x9E;
5478 1.1 ross
5479 1.1 ross if (aSign) {
5480 1.1 ross float_raise( float_flag_invalid );
5481 1.1 ross return(0);
5482 1.1 ross }
5483 1.1 ross if ( 0 < shiftCount ) {
5484 1.1 ross float_raise( float_flag_invalid );
5485 1.1 ross return 0xFFFFFFFF;
5486 1.1 ross }
5487 1.1 ross else if ( aExp <= 0x7E ) {
5488 1.1 ross if ( aExp | aSig ) float_set_inexact();
5489 1.1 ross return 0;
5490 1.1 ross }
5491 1.1 ross aSig = ( aSig | 0x800000 )<<8;
5492 1.1 ross z = aSig>>( - shiftCount );
5493 1.1 ross if ( aSig<<( shiftCount & 31 ) ) {
5494 1.1 ross float_set_inexact();
5495 1.1 ross }
5496 1.1 ross return z;
5497 1.1 ross
5498 1.1 ross }
5499 1.1 ross
5500 1.1 ross #endif
5501 1.2 thorpej
5502 1.2 thorpej #endif /* _STANDALONE */
5503