softfloat.c revision 1.6 1 1.6 martin /* $NetBSD: softfloat.c,v 1.6 2017/12/31 11:43:42 martin Exp $ */
2 1.1 ross
3 1.1 ross /*
4 1.1 ross * This version hacked for use with gcc -msoft-float by bjh21.
5 1.1 ross * (Mostly a case of #ifdefing out things GCC doesn't need or provides
6 1.1 ross * itself).
7 1.1 ross */
8 1.1 ross
9 1.1 ross /*
10 1.1 ross * Things you may want to define:
11 1.1 ross *
12 1.1 ross * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13 1.1 ross * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them
14 1.1 ross * properly renamed.
15 1.1 ross */
16 1.1 ross
17 1.1 ross /*
18 1.1 ross ===============================================================================
19 1.1 ross
20 1.1 ross This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 1.1 ross Arithmetic Package, Release 2a.
22 1.1 ross
23 1.1 ross Written by John R. Hauser. This work was made possible in part by the
24 1.1 ross International Computer Science Institute, located at Suite 600, 1947 Center
25 1.1 ross Street, Berkeley, California 94704. Funding was partially provided by the
26 1.1 ross National Science Foundation under grant MIP-9311980. The original version
27 1.1 ross of this code was written as part of a project to build a fixed-point vector
28 1.1 ross processor in collaboration with the University of California at Berkeley,
29 1.1 ross overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 1.1 ross is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 1.1 ross arithmetic/SoftFloat.html'.
32 1.1 ross
33 1.1 ross THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 1.1 ross has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 1.1 ross TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 1.1 ross PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 1.1 ross AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 1.1 ross
39 1.1 ross Derivative works are acceptable, even for commercial purposes, so long as
40 1.1 ross (1) they include prominent notice that the work is derivative, and (2) they
41 1.1 ross include prominent notice akin to these four paragraphs for those parts of
42 1.1 ross this code that are retained.
43 1.1 ross
44 1.1 ross ===============================================================================
45 1.1 ross */
46 1.1 ross
47 1.2 thorpej /* If you need this in a boot program, you have bigger problems... */
48 1.2 thorpej #ifndef _STANDALONE
49 1.2 thorpej
50 1.1 ross #include <sys/cdefs.h>
51 1.1 ross #if defined(LIBC_SCCS) && !defined(lint)
52 1.6 martin __RCSID("$NetBSD: softfloat.c,v 1.6 2017/12/31 11:43:42 martin Exp $");
53 1.1 ross #endif /* LIBC_SCCS and not lint */
54 1.1 ross
55 1.1 ross #ifdef SOFTFLOAT_FOR_GCC
56 1.1 ross #include "softfloat-for-gcc.h"
57 1.1 ross #endif
58 1.1 ross
59 1.1 ross #include "milieu.h"
60 1.1 ross #include "softfloat.h"
61 1.1 ross
/*
 * Hooks for translating a float64 between the form in which it is stored
 * in memory and the form SoftFloat operates on.  Each hook defaults to the
 * identity unless it has already been defined before this point.
 */
#if !defined(FLOAT64_DEMANGLE)
#define FLOAT64_DEMANGLE(a)	(a)
#endif
#if !defined(FLOAT64_MANGLE)
#define FLOAT64_MANGLE(a)	(a)
#endif
72 1.1 ross
73 1.1 ross /*
74 1.1 ross -------------------------------------------------------------------------------
75 1.1 ross Floating-point rounding mode, extended double-precision rounding precision,
76 1.1 ross and exception flags.
77 1.1 ross -------------------------------------------------------------------------------
78 1.1 ross */
79 1.1 ross
/*
 * Rounding precision for extended double-precision results; it only exists
 * when FLOATX80 is defined and starts out as 80.
 *
 * XXX: As a mutable global this may cause options-MULTIPROCESSOR or thread
 * problems someday.  Right now, it does not.  All other dynamic global
 * variables have been removed.  [ross]
 */
#if defined(FLOATX80)
int8 floatx80_rounding_precision = 80;
#endif
88 1.1 ross
89 1.1 ross /*
90 1.1 ross -------------------------------------------------------------------------------
91 1.1 ross Primitive arithmetic functions, including multi-word arithmetic, and
92 1.1 ross division and square root approximations. (Can be specialized to target if
93 1.1 ross desired.)
94 1.1 ross -------------------------------------------------------------------------------
95 1.1 ross */
96 1.1 ross #include "softfloat-macros.h"
97 1.1 ross
98 1.1 ross /*
99 1.1 ross -------------------------------------------------------------------------------
100 1.1 ross Functions and definitions to determine: (1) whether tininess for underflow
101 1.1 ross is detected before or after rounding by default, (2) what (if anything)
102 1.1 ross happens when exceptions are raised, (3) how signaling NaNs are distinguished
103 1.1 ross from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
104 1.1 ross are propagated from function inputs to output. These details are target-
105 1.1 ross specific.
106 1.1 ross -------------------------------------------------------------------------------
107 1.1 ross */
108 1.1 ross #include "softfloat-specialize.h"
109 1.1 ross
110 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not used */
111 1.1 ross /*
112 1.1 ross -------------------------------------------------------------------------------
113 1.1 ross Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
114 1.1 ross and 7, and returns the properly rounded 32-bit integer corresponding to the
115 1.1 ross input. If `zSign' is 1, the input is negated before being converted to an
116 1.1 ross integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
117 1.1 ross is simply rounded to an integer, with the inexact exception raised if the
118 1.1 ross input cannot be represented exactly as an integer. However, if the fixed-
119 1.1 ross point input is too large, the invalid exception is raised and the largest
120 1.1 ross positive or negative integer is returned.
121 1.1 ross -------------------------------------------------------------------------------
122 1.1 ross */
123 1.1 ross static int32 roundAndPackInt32( flag zSign, bits64 absZ )
124 1.1 ross {
125 1.1 ross int8 roundingMode;
126 1.1 ross flag roundNearestEven;
127 1.1 ross int8 roundIncrement, roundBits;
128 1.1 ross int32 z;
129 1.1 ross
130 1.1 ross roundingMode = float_rounding_mode();
131 1.1 ross roundNearestEven = ( roundingMode == float_round_nearest_even );
132 1.1 ross roundIncrement = 0x40;
133 1.1 ross if ( ! roundNearestEven ) {
134 1.1 ross if ( roundingMode == float_round_to_zero ) {
135 1.1 ross roundIncrement = 0;
136 1.1 ross }
137 1.1 ross else {
138 1.1 ross roundIncrement = 0x7F;
139 1.1 ross if ( zSign ) {
140 1.1 ross if ( roundingMode == float_round_up ) roundIncrement = 0;
141 1.1 ross }
142 1.1 ross else {
143 1.1 ross if ( roundingMode == float_round_down ) roundIncrement = 0;
144 1.1 ross }
145 1.1 ross }
146 1.1 ross }
147 1.1 ross roundBits = absZ & 0x7F;
148 1.1 ross absZ = ( absZ + roundIncrement )>>7;
149 1.1 ross absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
150 1.1 ross z = absZ;
151 1.1 ross if ( zSign ) z = - z;
152 1.1 ross if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
153 1.1 ross float_raise( float_flag_invalid );
154 1.1 ross return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
155 1.1 ross }
156 1.1 ross if ( roundBits ) float_set_inexact();
157 1.1 ross return z;
158 1.1 ross
159 1.1 ross }
160 1.1 ross
161 1.1 ross /*
162 1.1 ross -------------------------------------------------------------------------------
163 1.1 ross Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
164 1.1 ross `absZ1', with binary point between bits 63 and 64 (between the input words),
165 1.1 ross and returns the properly rounded 64-bit integer corresponding to the input.
166 1.1 ross If `zSign' is 1, the input is negated before being converted to an integer.
167 1.1 ross Ordinarily, the fixed-point input is simply rounded to an integer, with
168 1.1 ross the inexact exception raised if the input cannot be represented exactly as
169 1.1 ross an integer. However, if the fixed-point input is too large, the invalid
170 1.1 ross exception is raised and the largest positive or negative integer is
171 1.1 ross returned.
172 1.1 ross -------------------------------------------------------------------------------
173 1.1 ross */
174 1.1 ross static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
175 1.1 ross {
176 1.1 ross int8 roundingMode;
177 1.1 ross flag roundNearestEven, increment;
178 1.1 ross int64 z;
179 1.1 ross
180 1.1 ross roundingMode = float_rounding_mode();
181 1.1 ross roundNearestEven = ( roundingMode == float_round_nearest_even );
182 1.1 ross increment = ( (sbits64) absZ1 < 0 );
183 1.1 ross if ( ! roundNearestEven ) {
184 1.1 ross if ( roundingMode == float_round_to_zero ) {
185 1.1 ross increment = 0;
186 1.1 ross }
187 1.1 ross else {
188 1.1 ross if ( zSign ) {
189 1.1 ross increment = ( roundingMode == float_round_down ) && absZ1;
190 1.1 ross }
191 1.1 ross else {
192 1.1 ross increment = ( roundingMode == float_round_up ) && absZ1;
193 1.1 ross }
194 1.1 ross }
195 1.1 ross }
196 1.1 ross if ( increment ) {
197 1.1 ross ++absZ0;
198 1.1 ross if ( absZ0 == 0 ) goto overflow;
199 1.1 ross absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
200 1.1 ross }
201 1.1 ross z = absZ0;
202 1.1 ross if ( zSign ) z = - z;
203 1.1 ross if ( z && ( ( z < 0 ) ^ zSign ) ) {
204 1.1 ross overflow:
205 1.1 ross float_raise( float_flag_invalid );
206 1.1 ross return
207 1.1 ross zSign ? (sbits64) LIT64( 0x8000000000000000 )
208 1.1 ross : LIT64( 0x7FFFFFFFFFFFFFFF );
209 1.1 ross }
210 1.1 ross if ( absZ1 ) float_set_inexact();
211 1.1 ross return z;
212 1.1 ross
213 1.1 ross }
214 1.6 martin
215 1.6 martin /* same as above, but for unsigned values */
216 1.6 martin static uint64 roundAndPackUInt64( bits64 absZ0, bits64 absZ1 )
217 1.6 martin {
218 1.6 martin int8 roundingMode;
219 1.6 martin flag roundNearestEven, increment;
220 1.6 martin uint64 z;
221 1.6 martin
222 1.6 martin roundingMode = float_rounding_mode();
223 1.6 martin roundNearestEven = ( roundingMode == float_round_nearest_even );
224 1.6 martin increment = ( (sbits64) absZ1 < 0 );
225 1.6 martin if ( ! roundNearestEven ) {
226 1.6 martin if ( roundingMode == float_round_to_zero ) {
227 1.6 martin increment = 0;
228 1.6 martin }
229 1.6 martin else {
230 1.6 martin increment = ( roundingMode == float_round_up ) && absZ1;
231 1.6 martin }
232 1.6 martin }
233 1.6 martin if ( increment ) {
234 1.6 martin ++absZ0;
235 1.6 martin if ( absZ0 == 0 ) {
236 1.6 martin float_raise( float_flag_invalid );
237 1.6 martin return LIT64( 0x7FFFFFFFFFFFFFFF );
238 1.6 martin }
239 1.6 martin absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
240 1.6 martin }
241 1.6 martin z = absZ0;
242 1.6 martin if ( absZ1 ) float_set_inexact();
243 1.6 martin return z;
244 1.6 martin
245 1.6 martin }
246 1.1 ross #endif
247 1.1 ross
248 1.1 ross /*
249 1.1 ross -------------------------------------------------------------------------------
250 1.1 ross Returns the fraction bits of the single-precision floating-point value `a'.
251 1.1 ross -------------------------------------------------------------------------------
252 1.1 ross */
253 1.1 ross INLINE bits32 extractFloat32Frac( float32 a )
254 1.1 ross {
255 1.1 ross
256 1.1 ross return a & 0x007FFFFF;
257 1.1 ross
258 1.1 ross }
259 1.1 ross
260 1.1 ross /*
261 1.1 ross -------------------------------------------------------------------------------
262 1.1 ross Returns the exponent bits of the single-precision floating-point value `a'.
263 1.1 ross -------------------------------------------------------------------------------
264 1.1 ross */
265 1.1 ross INLINE int16 extractFloat32Exp( float32 a )
266 1.1 ross {
267 1.1 ross
268 1.1 ross return ( a>>23 ) & 0xFF;
269 1.1 ross
270 1.1 ross }
271 1.1 ross
272 1.1 ross /*
273 1.1 ross -------------------------------------------------------------------------------
274 1.1 ross Returns the sign bit of the single-precision floating-point value `a'.
275 1.1 ross -------------------------------------------------------------------------------
276 1.1 ross */
277 1.1 ross INLINE flag extractFloat32Sign( float32 a )
278 1.1 ross {
279 1.1 ross
280 1.1 ross return a>>31;
281 1.1 ross
282 1.1 ross }
283 1.1 ross
284 1.1 ross /*
285 1.1 ross -------------------------------------------------------------------------------
286 1.1 ross Normalizes the subnormal single-precision floating-point value represented
287 1.1 ross by the denormalized significand `aSig'. The normalized exponent and
288 1.1 ross significand are stored at the locations pointed to by `zExpPtr' and
289 1.1 ross `zSigPtr', respectively.
290 1.1 ross -------------------------------------------------------------------------------
291 1.1 ross */
292 1.1 ross static void
293 1.1 ross normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
294 1.1 ross {
295 1.1 ross int8 shiftCount;
296 1.1 ross
297 1.1 ross shiftCount = countLeadingZeros32( aSig ) - 8;
298 1.1 ross *zSigPtr = aSig<<shiftCount;
299 1.1 ross *zExpPtr = 1 - shiftCount;
300 1.1 ross
301 1.1 ross }
302 1.1 ross
303 1.1 ross /*
304 1.1 ross -------------------------------------------------------------------------------
305 1.1 ross Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
306 1.1 ross single-precision floating-point value, returning the result. After being
307 1.1 ross shifted into the proper positions, the three fields are simply added
308 1.1 ross together to form the result. This means that any integer portion of `zSig'
309 1.1 ross will be added into the exponent. Since a properly normalized significand
310 1.1 ross will have an integer portion equal to 1, the `zExp' input should be 1 less
311 1.1 ross than the desired result exponent whenever `zSig' is a complete, normalized
312 1.1 ross significand.
313 1.1 ross -------------------------------------------------------------------------------
314 1.1 ross */
315 1.1 ross INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
316 1.1 ross {
317 1.1 ross
318 1.1 ross return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
319 1.1 ross
320 1.1 ross }
321 1.1 ross
322 1.1 ross /*
323 1.1 ross -------------------------------------------------------------------------------
324 1.1 ross Takes an abstract floating-point value having sign `zSign', exponent `zExp',
325 1.1 ross and significand `zSig', and returns the proper single-precision floating-
326 1.1 ross point value corresponding to the abstract input. Ordinarily, the abstract
327 1.1 ross value is simply rounded and packed into the single-precision format, with
328 1.1 ross the inexact exception raised if the abstract input cannot be represented
329 1.1 ross exactly. However, if the abstract value is too large, the overflow and
330 1.1 ross inexact exceptions are raised and an infinity or maximal finite value is
331 1.1 ross returned. If the abstract value is too small, the input value is rounded to
332 1.1 ross a subnormal number, and the underflow and inexact exceptions are raised if
333 1.1 ross the abstract input cannot be represented exactly as a subnormal single-
334 1.1 ross precision floating-point number.
335 1.1 ross The input significand `zSig' has its binary point between bits 30
336 1.1 ross and 29, which is 7 bits to the left of the usual location. This shifted
337 1.1 ross significand must be normalized or smaller. If `zSig' is not normalized,
338 1.1 ross `zExp' must be 0; in that case, the result returned is a subnormal number,
339 1.1 ross and it must not require rounding. In the usual case that `zSig' is
340 1.1 ross normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
341 1.1 ross The handling of underflow and overflow follows the IEC/IEEE Standard for
342 1.1 ross Binary Floating-Point Arithmetic.
343 1.1 ross -------------------------------------------------------------------------------
344 1.1 ross */
345 1.1 ross static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
346 1.1 ross {
347 1.1 ross int8 roundingMode;
348 1.1 ross flag roundNearestEven;
349 1.1 ross int8 roundIncrement, roundBits;
350 1.1 ross flag isTiny;
351 1.1 ross
352 1.1 ross roundingMode = float_rounding_mode();
353 1.1 ross roundNearestEven = ( roundingMode == float_round_nearest_even );
354 1.1 ross roundIncrement = 0x40;
355 1.1 ross if ( ! roundNearestEven ) {
356 1.1 ross if ( roundingMode == float_round_to_zero ) {
357 1.1 ross roundIncrement = 0;
358 1.1 ross }
359 1.1 ross else {
360 1.1 ross roundIncrement = 0x7F;
361 1.1 ross if ( zSign ) {
362 1.1 ross if ( roundingMode == float_round_up ) roundIncrement = 0;
363 1.1 ross }
364 1.1 ross else {
365 1.1 ross if ( roundingMode == float_round_down ) roundIncrement = 0;
366 1.1 ross }
367 1.1 ross }
368 1.1 ross }
369 1.1 ross roundBits = zSig & 0x7F;
370 1.1 ross if ( 0xFD <= (bits16) zExp ) {
371 1.1 ross if ( ( 0xFD < zExp )
372 1.1 ross || ( ( zExp == 0xFD )
373 1.1 ross && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
374 1.1 ross ) {
375 1.1 ross float_raise( float_flag_overflow | float_flag_inexact );
376 1.1 ross return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
377 1.1 ross }
378 1.1 ross if ( zExp < 0 ) {
379 1.1 ross isTiny =
380 1.1 ross ( float_detect_tininess == float_tininess_before_rounding )
381 1.1 ross || ( zExp < -1 )
382 1.1 ross || ( zSig + roundIncrement < 0x80000000 );
383 1.1 ross shift32RightJamming( zSig, - zExp, &zSig );
384 1.1 ross zExp = 0;
385 1.1 ross roundBits = zSig & 0x7F;
386 1.1 ross if ( isTiny && roundBits ) float_raise( float_flag_underflow );
387 1.1 ross }
388 1.1 ross }
389 1.1 ross if ( roundBits ) float_set_inexact();
390 1.1 ross zSig = ( zSig + roundIncrement )>>7;
391 1.1 ross zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
392 1.1 ross if ( zSig == 0 ) zExp = 0;
393 1.1 ross return packFloat32( zSign, zExp, zSig );
394 1.1 ross
395 1.1 ross }
396 1.1 ross
397 1.1 ross /*
398 1.1 ross -------------------------------------------------------------------------------
399 1.1 ross Takes an abstract floating-point value having sign `zSign', exponent `zExp',
400 1.1 ross and significand `zSig', and returns the proper single-precision floating-
401 1.1 ross point value corresponding to the abstract input. This routine is just like
402 1.1 ross `roundAndPackFloat32' except that `zSig' does not have to be normalized.
403 1.1 ross Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
404 1.1 ross floating-point exponent.
405 1.1 ross -------------------------------------------------------------------------------
406 1.1 ross */
407 1.1 ross static float32
408 1.1 ross normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
409 1.1 ross {
410 1.1 ross int8 shiftCount;
411 1.1 ross
412 1.1 ross shiftCount = countLeadingZeros32( zSig ) - 1;
413 1.1 ross return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
414 1.1 ross
415 1.1 ross }
416 1.1 ross
417 1.1 ross /*
418 1.1 ross -------------------------------------------------------------------------------
419 1.1 ross Returns the fraction bits of the double-precision floating-point value `a'.
420 1.1 ross -------------------------------------------------------------------------------
421 1.1 ross */
422 1.1 ross INLINE bits64 extractFloat64Frac( float64 a )
423 1.1 ross {
424 1.1 ross
425 1.1 ross return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
426 1.1 ross
427 1.1 ross }
428 1.1 ross
429 1.1 ross /*
430 1.1 ross -------------------------------------------------------------------------------
431 1.1 ross Returns the exponent bits of the double-precision floating-point value `a'.
432 1.1 ross -------------------------------------------------------------------------------
433 1.1 ross */
434 1.1 ross INLINE int16 extractFloat64Exp( float64 a )
435 1.1 ross {
436 1.1 ross
437 1.1 ross return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
438 1.1 ross
439 1.1 ross }
440 1.1 ross
441 1.1 ross /*
442 1.1 ross -------------------------------------------------------------------------------
443 1.1 ross Returns the sign bit of the double-precision floating-point value `a'.
444 1.1 ross -------------------------------------------------------------------------------
445 1.1 ross */
446 1.1 ross INLINE flag extractFloat64Sign( float64 a )
447 1.1 ross {
448 1.1 ross
449 1.1 ross return FLOAT64_DEMANGLE(a)>>63;
450 1.1 ross
451 1.1 ross }
452 1.1 ross
453 1.1 ross /*
454 1.1 ross -------------------------------------------------------------------------------
455 1.1 ross Normalizes the subnormal double-precision floating-point value represented
456 1.1 ross by the denormalized significand `aSig'. The normalized exponent and
457 1.1 ross significand are stored at the locations pointed to by `zExpPtr' and
458 1.1 ross `zSigPtr', respectively.
459 1.1 ross -------------------------------------------------------------------------------
460 1.1 ross */
461 1.1 ross static void
462 1.1 ross normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
463 1.1 ross {
464 1.1 ross int8 shiftCount;
465 1.1 ross
466 1.1 ross shiftCount = countLeadingZeros64( aSig ) - 11;
467 1.1 ross *zSigPtr = aSig<<shiftCount;
468 1.1 ross *zExpPtr = 1 - shiftCount;
469 1.1 ross
470 1.1 ross }
471 1.1 ross
472 1.1 ross /*
473 1.1 ross -------------------------------------------------------------------------------
474 1.1 ross Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
475 1.1 ross double-precision floating-point value, returning the result. After being
476 1.1 ross shifted into the proper positions, the three fields are simply added
477 1.1 ross together to form the result. This means that any integer portion of `zSig'
478 1.1 ross will be added into the exponent. Since a properly normalized significand
479 1.1 ross will have an integer portion equal to 1, the `zExp' input should be 1 less
480 1.1 ross than the desired result exponent whenever `zSig' is a complete, normalized
481 1.1 ross significand.
482 1.1 ross -------------------------------------------------------------------------------
483 1.1 ross */
484 1.1 ross INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
485 1.1 ross {
486 1.1 ross
487 1.1 ross return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
488 1.1 ross ( ( (bits64) zExp )<<52 ) + zSig );
489 1.1 ross
490 1.1 ross }
491 1.1 ross
492 1.1 ross /*
493 1.1 ross -------------------------------------------------------------------------------
494 1.1 ross Takes an abstract floating-point value having sign `zSign', exponent `zExp',
495 1.1 ross and significand `zSig', and returns the proper double-precision floating-
496 1.1 ross point value corresponding to the abstract input. Ordinarily, the abstract
497 1.1 ross value is simply rounded and packed into the double-precision format, with
498 1.1 ross the inexact exception raised if the abstract input cannot be represented
499 1.1 ross exactly. However, if the abstract value is too large, the overflow and
500 1.1 ross inexact exceptions are raised and an infinity or maximal finite value is
501 1.1 ross returned. If the abstract value is too small, the input value is rounded to
502 1.1 ross a subnormal number, and the underflow and inexact exceptions are raised if
503 1.1 ross the abstract input cannot be represented exactly as a subnormal double-
504 1.1 ross precision floating-point number.
505 1.1 ross The input significand `zSig' has its binary point between bits 62
506 1.1 ross and 61, which is 10 bits to the left of the usual location. This shifted
507 1.1 ross significand must be normalized or smaller. If `zSig' is not normalized,
508 1.1 ross `zExp' must be 0; in that case, the result returned is a subnormal number,
509 1.1 ross and it must not require rounding. In the usual case that `zSig' is
510 1.1 ross normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
511 1.1 ross The handling of underflow and overflow follows the IEC/IEEE Standard for
512 1.1 ross Binary Floating-Point Arithmetic.
513 1.1 ross -------------------------------------------------------------------------------
514 1.1 ross */
515 1.1 ross static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
516 1.1 ross {
517 1.1 ross int8 roundingMode;
518 1.1 ross flag roundNearestEven;
519 1.1 ross int16 roundIncrement, roundBits;
520 1.1 ross flag isTiny;
521 1.1 ross
522 1.1 ross roundingMode = float_rounding_mode();
523 1.1 ross roundNearestEven = ( roundingMode == float_round_nearest_even );
524 1.1 ross roundIncrement = 0x200;
525 1.1 ross if ( ! roundNearestEven ) {
526 1.1 ross if ( roundingMode == float_round_to_zero ) {
527 1.1 ross roundIncrement = 0;
528 1.1 ross }
529 1.1 ross else {
530 1.1 ross roundIncrement = 0x3FF;
531 1.1 ross if ( zSign ) {
532 1.1 ross if ( roundingMode == float_round_up ) roundIncrement = 0;
533 1.1 ross }
534 1.1 ross else {
535 1.1 ross if ( roundingMode == float_round_down ) roundIncrement = 0;
536 1.1 ross }
537 1.1 ross }
538 1.1 ross }
539 1.1 ross roundBits = zSig & 0x3FF;
540 1.1 ross if ( 0x7FD <= (bits16) zExp ) {
541 1.1 ross if ( ( 0x7FD < zExp )
542 1.1 ross || ( ( zExp == 0x7FD )
543 1.1 ross && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
544 1.1 ross ) {
545 1.1 ross float_raise( float_flag_overflow | float_flag_inexact );
546 1.1 ross return FLOAT64_MANGLE(
547 1.1 ross FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
548 1.1 ross ( roundIncrement == 0 ));
549 1.1 ross }
550 1.1 ross if ( zExp < 0 ) {
551 1.1 ross isTiny =
552 1.1 ross ( float_detect_tininess == float_tininess_before_rounding )
553 1.1 ross || ( zExp < -1 )
554 1.1 ross || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
555 1.1 ross shift64RightJamming( zSig, - zExp, &zSig );
556 1.1 ross zExp = 0;
557 1.1 ross roundBits = zSig & 0x3FF;
558 1.1 ross if ( isTiny && roundBits ) float_raise( float_flag_underflow );
559 1.1 ross }
560 1.1 ross }
561 1.1 ross if ( roundBits ) float_set_inexact();
562 1.1 ross zSig = ( zSig + roundIncrement )>>10;
563 1.1 ross zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
564 1.1 ross if ( zSig == 0 ) zExp = 0;
565 1.1 ross return packFloat64( zSign, zExp, zSig );
566 1.1 ross
567 1.1 ross }
568 1.1 ross
569 1.1 ross /*
570 1.1 ross -------------------------------------------------------------------------------
571 1.1 ross Takes an abstract floating-point value having sign `zSign', exponent `zExp',
572 1.1 ross and significand `zSig', and returns the proper double-precision floating-
573 1.1 ross point value corresponding to the abstract input. This routine is just like
574 1.1 ross `roundAndPackFloat64' except that `zSig' does not have to be normalized.
575 1.1 ross Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
576 1.1 ross floating-point exponent.
577 1.1 ross -------------------------------------------------------------------------------
578 1.1 ross */
579 1.1 ross static float64
580 1.1 ross normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
581 1.1 ross {
582 1.1 ross int8 shiftCount;
583 1.1 ross
584 1.1 ross shiftCount = countLeadingZeros64( zSig ) - 1;
585 1.1 ross return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
586 1.1 ross
587 1.1 ross }
588 1.1 ross
589 1.1 ross #ifdef FLOATX80
590 1.1 ross
591 1.1 ross /*
592 1.1 ross -------------------------------------------------------------------------------
593 1.1 ross Returns the fraction bits of the extended double-precision floating-point
594 1.1 ross value `a'.
595 1.1 ross -------------------------------------------------------------------------------
596 1.1 ross */
597 1.1 ross INLINE bits64 extractFloatx80Frac( floatx80 a )
598 1.1 ross {
599 1.1 ross
600 1.1 ross return a.low;
601 1.1 ross
602 1.1 ross }
603 1.1 ross
604 1.1 ross /*
605 1.1 ross -------------------------------------------------------------------------------
606 1.1 ross Returns the exponent bits of the extended double-precision floating-point
607 1.1 ross value `a'.
608 1.1 ross -------------------------------------------------------------------------------
609 1.1 ross */
610 1.1 ross INLINE int32 extractFloatx80Exp( floatx80 a )
611 1.1 ross {
612 1.1 ross
613 1.1 ross return a.high & 0x7FFF;
614 1.1 ross
615 1.1 ross }
616 1.1 ross
617 1.1 ross /*
618 1.1 ross -------------------------------------------------------------------------------
619 1.1 ross Returns the sign bit of the extended double-precision floating-point value
620 1.1 ross `a'.
621 1.1 ross -------------------------------------------------------------------------------
622 1.1 ross */
623 1.1 ross INLINE flag extractFloatx80Sign( floatx80 a )
624 1.1 ross {
625 1.1 ross
626 1.1 ross return a.high>>15;
627 1.1 ross
628 1.1 ross }
629 1.1 ross
630 1.1 ross /*
631 1.1 ross -------------------------------------------------------------------------------
632 1.1 ross Normalizes the subnormal extended double-precision floating-point value
633 1.1 ross represented by the denormalized significand `aSig'. The normalized exponent
634 1.1 ross and significand are stored at the locations pointed to by `zExpPtr' and
635 1.1 ross `zSigPtr', respectively.
636 1.1 ross -------------------------------------------------------------------------------
637 1.1 ross */
638 1.1 ross static void
639 1.1 ross normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
640 1.1 ross {
641 1.1 ross int8 shiftCount;
642 1.1 ross
643 1.1 ross shiftCount = countLeadingZeros64( aSig );
644 1.1 ross *zSigPtr = aSig<<shiftCount;
645 1.1 ross *zExpPtr = 1 - shiftCount;
646 1.1 ross
647 1.1 ross }
648 1.1 ross
649 1.1 ross /*
650 1.1 ross -------------------------------------------------------------------------------
651 1.1 ross Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
652 1.1 ross extended double-precision floating-point value, returning the result.
653 1.1 ross -------------------------------------------------------------------------------
654 1.1 ross */
655 1.1 ross INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
656 1.1 ross {
657 1.1 ross floatx80 z;
658 1.1 ross
659 1.1 ross z.low = zSig;
660 1.1 ross z.high = ( ( (bits16) zSign )<<15 ) + zExp;
661 1.1 ross return z;
662 1.1 ross
663 1.1 ross }
664 1.1 ross
665 1.1 ross /*
666 1.1 ross -------------------------------------------------------------------------------
667 1.1 ross Takes an abstract floating-point value having sign `zSign', exponent `zExp',
668 1.1 ross and extended significand formed by the concatenation of `zSig0' and `zSig1',
669 1.1 ross and returns the proper extended double-precision floating-point value
670 1.1 ross corresponding to the abstract input. Ordinarily, the abstract value is
671 1.1 ross rounded and packed into the extended double-precision format, with the
672 1.1 ross inexact exception raised if the abstract input cannot be represented
673 1.1 ross exactly. However, if the abstract value is too large, the overflow and
674 1.1 ross inexact exceptions are raised and an infinity or maximal finite value is
675 1.1 ross returned. If the abstract value is too small, the input value is rounded to
676 1.1 ross a subnormal number, and the underflow and inexact exceptions are raised if
677 1.1 ross the abstract input cannot be represented exactly as a subnormal extended
678 1.1 ross double-precision floating-point number.
679 1.1 ross If `roundingPrecision' is 32 or 64, the result is rounded to the same
680 1.1 ross number of bits as single or double precision, respectively. Otherwise, the
681 1.1 ross result is rounded to the full precision of the extended double-precision
682 1.1 ross format.
683 1.1 ross The input significand must be normalized or smaller. If the input
684 1.1 ross significand is not normalized, `zExp' must be 0; in that case, the result
685 1.1 ross returned is a subnormal number, and it must not require rounding. The
686 1.1 ross handling of underflow and overflow follows the IEC/IEEE Standard for Binary
687 1.1 ross Floating-Point Arithmetic.
688 1.1 ross -------------------------------------------------------------------------------
689 1.1 ross */
690 1.1 ross static floatx80
691 1.1 ross roundAndPackFloatx80(
692 1.1 ross int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
693 1.1 ross )
694 1.1 ross {
695 1.1 ross int8 roundingMode;
696 1.1 ross flag roundNearestEven, increment, isTiny;
697 1.1 ross int64 roundIncrement, roundMask, roundBits;
698 1.1 ross
699 1.1 ross roundingMode = float_rounding_mode();
700 1.1 ross roundNearestEven = ( roundingMode == float_round_nearest_even );
701 1.1 ross if ( roundingPrecision == 80 ) goto precision80;
702 1.1 ross if ( roundingPrecision == 64 ) {
703 1.1 ross roundIncrement = LIT64( 0x0000000000000400 );
704 1.1 ross roundMask = LIT64( 0x00000000000007FF );
705 1.1 ross }
706 1.1 ross else if ( roundingPrecision == 32 ) {
707 1.1 ross roundIncrement = LIT64( 0x0000008000000000 );
708 1.1 ross roundMask = LIT64( 0x000000FFFFFFFFFF );
709 1.1 ross }
710 1.1 ross else {
711 1.1 ross goto precision80;
712 1.1 ross }
713 1.1 ross zSig0 |= ( zSig1 != 0 );
714 1.1 ross if ( ! roundNearestEven ) {
715 1.1 ross if ( roundingMode == float_round_to_zero ) {
716 1.1 ross roundIncrement = 0;
717 1.1 ross }
718 1.1 ross else {
719 1.1 ross roundIncrement = roundMask;
720 1.1 ross if ( zSign ) {
721 1.1 ross if ( roundingMode == float_round_up ) roundIncrement = 0;
722 1.1 ross }
723 1.1 ross else {
724 1.1 ross if ( roundingMode == float_round_down ) roundIncrement = 0;
725 1.1 ross }
726 1.1 ross }
727 1.1 ross }
728 1.1 ross roundBits = zSig0 & roundMask;
729 1.1 ross if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
730 1.1 ross if ( ( 0x7FFE < zExp )
731 1.1 ross || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
732 1.1 ross ) {
733 1.1 ross goto overflow;
734 1.1 ross }
735 1.1 ross if ( zExp <= 0 ) {
736 1.1 ross isTiny =
737 1.1 ross ( float_detect_tininess == float_tininess_before_rounding )
738 1.1 ross || ( zExp < 0 )
739 1.1 ross || ( zSig0 <= zSig0 + roundIncrement );
740 1.1 ross shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
741 1.1 ross zExp = 0;
742 1.1 ross roundBits = zSig0 & roundMask;
743 1.1 ross if ( isTiny && roundBits ) float_raise( float_flag_underflow );
744 1.1 ross if ( roundBits ) float_set_inexact();
745 1.1 ross zSig0 += roundIncrement;
746 1.1 ross if ( (sbits64) zSig0 < 0 ) zExp = 1;
747 1.1 ross roundIncrement = roundMask + 1;
748 1.1 ross if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
749 1.1 ross roundMask |= roundIncrement;
750 1.1 ross }
751 1.1 ross zSig0 &= ~ roundMask;
752 1.1 ross return packFloatx80( zSign, zExp, zSig0 );
753 1.1 ross }
754 1.1 ross }
755 1.1 ross if ( roundBits ) float_set_inexact();
756 1.1 ross zSig0 += roundIncrement;
757 1.1 ross if ( zSig0 < roundIncrement ) {
758 1.1 ross ++zExp;
759 1.1 ross zSig0 = LIT64( 0x8000000000000000 );
760 1.1 ross }
761 1.1 ross roundIncrement = roundMask + 1;
762 1.1 ross if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
763 1.1 ross roundMask |= roundIncrement;
764 1.1 ross }
765 1.1 ross zSig0 &= ~ roundMask;
766 1.1 ross if ( zSig0 == 0 ) zExp = 0;
767 1.1 ross return packFloatx80( zSign, zExp, zSig0 );
768 1.1 ross precision80:
769 1.1 ross increment = ( (sbits64) zSig1 < 0 );
770 1.1 ross if ( ! roundNearestEven ) {
771 1.1 ross if ( roundingMode == float_round_to_zero ) {
772 1.1 ross increment = 0;
773 1.1 ross }
774 1.1 ross else {
775 1.1 ross if ( zSign ) {
776 1.1 ross increment = ( roundingMode == float_round_down ) && zSig1;
777 1.1 ross }
778 1.1 ross else {
779 1.1 ross increment = ( roundingMode == float_round_up ) && zSig1;
780 1.1 ross }
781 1.1 ross }
782 1.1 ross }
783 1.1 ross if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
784 1.1 ross if ( ( 0x7FFE < zExp )
785 1.1 ross || ( ( zExp == 0x7FFE )
786 1.1 ross && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
787 1.1 ross && increment
788 1.1 ross )
789 1.1 ross ) {
790 1.1 ross roundMask = 0;
791 1.1 ross overflow:
792 1.1 ross float_raise( float_flag_overflow | float_flag_inexact );
793 1.1 ross if ( ( roundingMode == float_round_to_zero )
794 1.1 ross || ( zSign && ( roundingMode == float_round_up ) )
795 1.1 ross || ( ! zSign && ( roundingMode == float_round_down ) )
796 1.1 ross ) {
797 1.1 ross return packFloatx80( zSign, 0x7FFE, ~ roundMask );
798 1.1 ross }
799 1.1 ross return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
800 1.1 ross }
801 1.1 ross if ( zExp <= 0 ) {
802 1.1 ross isTiny =
803 1.1 ross ( float_detect_tininess == float_tininess_before_rounding )
804 1.1 ross || ( zExp < 0 )
805 1.1 ross || ! increment
806 1.1 ross || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
807 1.1 ross shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
808 1.1 ross zExp = 0;
809 1.1 ross if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
810 1.1 ross if ( zSig1 ) float_set_inexact();
811 1.1 ross if ( roundNearestEven ) {
812 1.1 ross increment = ( (sbits64) zSig1 < 0 );
813 1.1 ross }
814 1.1 ross else {
815 1.1 ross if ( zSign ) {
816 1.1 ross increment = ( roundingMode == float_round_down ) && zSig1;
817 1.1 ross }
818 1.1 ross else {
819 1.1 ross increment = ( roundingMode == float_round_up ) && zSig1;
820 1.1 ross }
821 1.1 ross }
822 1.1 ross if ( increment ) {
823 1.1 ross ++zSig0;
824 1.1 ross zSig0 &=
825 1.1 ross ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
826 1.1 ross if ( (sbits64) zSig0 < 0 ) zExp = 1;
827 1.1 ross }
828 1.1 ross return packFloatx80( zSign, zExp, zSig0 );
829 1.1 ross }
830 1.1 ross }
831 1.1 ross if ( zSig1 ) float_set_inexact();
832 1.1 ross if ( increment ) {
833 1.1 ross ++zSig0;
834 1.1 ross if ( zSig0 == 0 ) {
835 1.1 ross ++zExp;
836 1.1 ross zSig0 = LIT64( 0x8000000000000000 );
837 1.1 ross }
838 1.1 ross else {
839 1.1 ross zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
840 1.1 ross }
841 1.1 ross }
842 1.1 ross else {
843 1.1 ross if ( zSig0 == 0 ) zExp = 0;
844 1.1 ross }
845 1.1 ross return packFloatx80( zSign, zExp, zSig0 );
846 1.1 ross
847 1.1 ross }
848 1.1 ross
849 1.1 ross /*
850 1.1 ross -------------------------------------------------------------------------------
851 1.1 ross Takes an abstract floating-point value having sign `zSign', exponent
852 1.1 ross `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
853 1.1 ross and returns the proper extended double-precision floating-point value
854 1.1 ross corresponding to the abstract input. This routine is just like
855 1.1 ross `roundAndPackFloatx80' except that the input significand does not have to be
856 1.1 ross normalized.
857 1.1 ross -------------------------------------------------------------------------------
858 1.1 ross */
859 1.1 ross static floatx80
860 1.1 ross normalizeRoundAndPackFloatx80(
861 1.1 ross int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
862 1.1 ross )
863 1.1 ross {
864 1.1 ross int8 shiftCount;
865 1.1 ross
866 1.1 ross if ( zSig0 == 0 ) {
867 1.1 ross zSig0 = zSig1;
868 1.1 ross zSig1 = 0;
869 1.1 ross zExp -= 64;
870 1.1 ross }
871 1.1 ross shiftCount = countLeadingZeros64( zSig0 );
872 1.1 ross shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
873 1.1 ross zExp -= shiftCount;
874 1.1 ross return
875 1.1 ross roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
876 1.1 ross
877 1.1 ross }
878 1.1 ross
879 1.1 ross #endif
880 1.1 ross
881 1.1 ross #ifdef FLOAT128
882 1.1 ross
883 1.1 ross /*
884 1.1 ross -------------------------------------------------------------------------------
885 1.1 ross Returns the least-significant 64 fraction bits of the quadruple-precision
886 1.1 ross floating-point value `a'.
887 1.1 ross -------------------------------------------------------------------------------
888 1.1 ross */
889 1.1 ross INLINE bits64 extractFloat128Frac1( float128 a )
890 1.1 ross {
891 1.1 ross
892 1.1 ross return a.low;
893 1.1 ross
894 1.1 ross }
895 1.1 ross
896 1.1 ross /*
897 1.1 ross -------------------------------------------------------------------------------
898 1.1 ross Returns the most-significant 48 fraction bits of the quadruple-precision
899 1.1 ross floating-point value `a'.
900 1.1 ross -------------------------------------------------------------------------------
901 1.1 ross */
902 1.1 ross INLINE bits64 extractFloat128Frac0( float128 a )
903 1.1 ross {
904 1.1 ross
905 1.1 ross return a.high & LIT64( 0x0000FFFFFFFFFFFF );
906 1.1 ross
907 1.1 ross }
908 1.1 ross
909 1.1 ross /*
910 1.1 ross -------------------------------------------------------------------------------
911 1.1 ross Returns the exponent bits of the quadruple-precision floating-point value
912 1.1 ross `a'.
913 1.1 ross -------------------------------------------------------------------------------
914 1.1 ross */
915 1.1 ross INLINE int32 extractFloat128Exp( float128 a )
916 1.1 ross {
917 1.1 ross
918 1.1 ross return ( a.high>>48 ) & 0x7FFF;
919 1.1 ross
920 1.1 ross }
921 1.1 ross
922 1.1 ross /*
923 1.1 ross -------------------------------------------------------------------------------
924 1.1 ross Returns the sign bit of the quadruple-precision floating-point value `a'.
925 1.1 ross -------------------------------------------------------------------------------
926 1.1 ross */
927 1.1 ross INLINE flag extractFloat128Sign( float128 a )
928 1.1 ross {
929 1.1 ross
930 1.1 ross return a.high>>63;
931 1.1 ross
932 1.1 ross }
933 1.1 ross
934 1.1 ross /*
935 1.1 ross -------------------------------------------------------------------------------
936 1.1 ross Normalizes the subnormal quadruple-precision floating-point value
937 1.1 ross represented by the denormalized significand formed by the concatenation of
938 1.1 ross `aSig0' and `aSig1'. The normalized exponent is stored at the location
939 1.1 ross pointed to by `zExpPtr'. The most significant 49 bits of the normalized
940 1.1 ross significand are stored at the location pointed to by `zSig0Ptr', and the
941 1.1 ross least significant 64 bits of the normalized significand are stored at the
942 1.1 ross location pointed to by `zSig1Ptr'.
943 1.1 ross -------------------------------------------------------------------------------
944 1.1 ross */
945 1.1 ross static void
946 1.1 ross normalizeFloat128Subnormal(
947 1.1 ross bits64 aSig0,
948 1.1 ross bits64 aSig1,
949 1.1 ross int32 *zExpPtr,
950 1.1 ross bits64 *zSig0Ptr,
951 1.1 ross bits64 *zSig1Ptr
952 1.1 ross )
953 1.1 ross {
954 1.1 ross int8 shiftCount;
955 1.1 ross
956 1.1 ross if ( aSig0 == 0 ) {
957 1.1 ross shiftCount = countLeadingZeros64( aSig1 ) - 15;
958 1.1 ross if ( shiftCount < 0 ) {
959 1.1 ross *zSig0Ptr = aSig1>>( - shiftCount );
960 1.1 ross *zSig1Ptr = aSig1<<( shiftCount & 63 );
961 1.1 ross }
962 1.1 ross else {
963 1.1 ross *zSig0Ptr = aSig1<<shiftCount;
964 1.1 ross *zSig1Ptr = 0;
965 1.1 ross }
966 1.1 ross *zExpPtr = - shiftCount - 63;
967 1.1 ross }
968 1.1 ross else {
969 1.1 ross shiftCount = countLeadingZeros64( aSig0 ) - 15;
970 1.1 ross shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
971 1.1 ross *zExpPtr = 1 - shiftCount;
972 1.1 ross }
973 1.1 ross
974 1.1 ross }
975 1.1 ross
976 1.1 ross /*
977 1.1 ross -------------------------------------------------------------------------------
978 1.1 ross Packs the sign `zSign', the exponent `zExp', and the significand formed
979 1.1 ross by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
980 1.1 ross floating-point value, returning the result. After being shifted into the
981 1.1 ross proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
982 1.1 ross added together to form the most significant 32 bits of the result. This
983 1.1 ross means that any integer portion of `zSig0' will be added into the exponent.
984 1.1 ross Since a properly normalized significand will have an integer portion equal
985 1.1 ross to 1, the `zExp' input should be 1 less than the desired result exponent
986 1.1 ross whenever `zSig0' and `zSig1' concatenated form a complete, normalized
987 1.1 ross significand.
988 1.1 ross -------------------------------------------------------------------------------
989 1.1 ross */
990 1.1 ross INLINE float128
991 1.1 ross packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
992 1.1 ross {
993 1.1 ross float128 z;
994 1.1 ross
995 1.1 ross z.low = zSig1;
996 1.1 ross z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
997 1.1 ross return z;
998 1.1 ross
999 1.1 ross }
1000 1.1 ross
1001 1.1 ross /*
1002 1.1 ross -------------------------------------------------------------------------------
1003 1.1 ross Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1004 1.1 ross and extended significand formed by the concatenation of `zSig0', `zSig1',
1005 1.1 ross and `zSig2', and returns the proper quadruple-precision floating-point value
1006 1.1 ross corresponding to the abstract input. Ordinarily, the abstract value is
1007 1.1 ross simply rounded and packed into the quadruple-precision format, with the
1008 1.1 ross inexact exception raised if the abstract input cannot be represented
1009 1.1 ross exactly. However, if the abstract value is too large, the overflow and
1010 1.1 ross inexact exceptions are raised and an infinity or maximal finite value is
1011 1.1 ross returned. If the abstract value is too small, the input value is rounded to
1012 1.1 ross a subnormal number, and the underflow and inexact exceptions are raised if
1013 1.1 ross the abstract input cannot be represented exactly as a subnormal quadruple-
1014 1.1 ross precision floating-point number.
1015 1.1 ross The input significand must be normalized or smaller. If the input
1016 1.1 ross significand is not normalized, `zExp' must be 0; in that case, the result
1017 1.1 ross returned is a subnormal number, and it must not require rounding. In the
1018 1.1 ross usual case that the input significand is normalized, `zExp' must be 1 less
1019 1.1 ross than the ``true'' floating-point exponent. The handling of underflow and
1020 1.1 ross overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1021 1.1 ross -------------------------------------------------------------------------------
1022 1.1 ross */
1023 1.1 ross static float128
1024 1.1 ross roundAndPackFloat128(
1025 1.1 ross flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
1026 1.1 ross {
1027 1.1 ross int8 roundingMode;
1028 1.1 ross flag roundNearestEven, increment, isTiny;
1029 1.1 ross
1030 1.1 ross roundingMode = float_rounding_mode();
1031 1.1 ross roundNearestEven = ( roundingMode == float_round_nearest_even );
1032 1.1 ross increment = ( (sbits64) zSig2 < 0 );
1033 1.1 ross if ( ! roundNearestEven ) {
1034 1.1 ross if ( roundingMode == float_round_to_zero ) {
1035 1.1 ross increment = 0;
1036 1.1 ross }
1037 1.1 ross else {
1038 1.1 ross if ( zSign ) {
1039 1.1 ross increment = ( roundingMode == float_round_down ) && zSig2;
1040 1.1 ross }
1041 1.1 ross else {
1042 1.1 ross increment = ( roundingMode == float_round_up ) && zSig2;
1043 1.1 ross }
1044 1.1 ross }
1045 1.1 ross }
1046 1.1 ross if ( 0x7FFD <= (bits32) zExp ) {
1047 1.1 ross if ( ( 0x7FFD < zExp )
1048 1.1 ross || ( ( zExp == 0x7FFD )
1049 1.1 ross && eq128(
1050 1.1 ross LIT64( 0x0001FFFFFFFFFFFF ),
1051 1.1 ross LIT64( 0xFFFFFFFFFFFFFFFF ),
1052 1.1 ross zSig0,
1053 1.1 ross zSig1
1054 1.1 ross )
1055 1.1 ross && increment
1056 1.1 ross )
1057 1.1 ross ) {
1058 1.1 ross float_raise( float_flag_overflow | float_flag_inexact );
1059 1.1 ross if ( ( roundingMode == float_round_to_zero )
1060 1.1 ross || ( zSign && ( roundingMode == float_round_up ) )
1061 1.1 ross || ( ! zSign && ( roundingMode == float_round_down ) )
1062 1.1 ross ) {
1063 1.1 ross return
1064 1.1 ross packFloat128(
1065 1.1 ross zSign,
1066 1.1 ross 0x7FFE,
1067 1.1 ross LIT64( 0x0000FFFFFFFFFFFF ),
1068 1.1 ross LIT64( 0xFFFFFFFFFFFFFFFF )
1069 1.1 ross );
1070 1.1 ross }
1071 1.1 ross return packFloat128( zSign, 0x7FFF, 0, 0 );
1072 1.1 ross }
1073 1.1 ross if ( zExp < 0 ) {
1074 1.1 ross isTiny =
1075 1.1 ross ( float_detect_tininess == float_tininess_before_rounding )
1076 1.1 ross || ( zExp < -1 )
1077 1.1 ross || ! increment
1078 1.1 ross || lt128(
1079 1.1 ross zSig0,
1080 1.1 ross zSig1,
1081 1.1 ross LIT64( 0x0001FFFFFFFFFFFF ),
1082 1.1 ross LIT64( 0xFFFFFFFFFFFFFFFF )
1083 1.1 ross );
1084 1.1 ross shift128ExtraRightJamming(
1085 1.1 ross zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1086 1.1 ross zExp = 0;
1087 1.1 ross if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1088 1.1 ross if ( roundNearestEven ) {
1089 1.1 ross increment = ( (sbits64) zSig2 < 0 );
1090 1.1 ross }
1091 1.1 ross else {
1092 1.1 ross if ( zSign ) {
1093 1.1 ross increment = ( roundingMode == float_round_down ) && zSig2;
1094 1.1 ross }
1095 1.1 ross else {
1096 1.1 ross increment = ( roundingMode == float_round_up ) && zSig2;
1097 1.1 ross }
1098 1.1 ross }
1099 1.1 ross }
1100 1.1 ross }
1101 1.1 ross if ( zSig2 ) float_set_inexact();
1102 1.1 ross if ( increment ) {
1103 1.1 ross add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1104 1.1 ross zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1105 1.1 ross }
1106 1.1 ross else {
1107 1.1 ross if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1108 1.1 ross }
1109 1.1 ross return packFloat128( zSign, zExp, zSig0, zSig1 );
1110 1.1 ross
1111 1.1 ross }
1112 1.1 ross
1113 1.1 ross /*
1114 1.1 ross -------------------------------------------------------------------------------
1115 1.1 ross Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1116 1.1 ross and significand formed by the concatenation of `zSig0' and `zSig1', and
1117 1.1 ross returns the proper quadruple-precision floating-point value corresponding
1118 1.1 ross to the abstract input. This routine is just like `roundAndPackFloat128'
1119 1.1 ross except that the input significand has fewer bits and does not have to be
1120 1.1 ross normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1121 1.1 ross point exponent.
1122 1.1 ross -------------------------------------------------------------------------------
1123 1.1 ross */
1124 1.1 ross static float128
1125 1.1 ross normalizeRoundAndPackFloat128(
1126 1.1 ross flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1127 1.1 ross {
1128 1.1 ross int8 shiftCount;
1129 1.1 ross bits64 zSig2;
1130 1.1 ross
1131 1.1 ross if ( zSig0 == 0 ) {
1132 1.1 ross zSig0 = zSig1;
1133 1.1 ross zSig1 = 0;
1134 1.1 ross zExp -= 64;
1135 1.1 ross }
1136 1.1 ross shiftCount = countLeadingZeros64( zSig0 ) - 15;
1137 1.1 ross if ( 0 <= shiftCount ) {
1138 1.1 ross zSig2 = 0;
1139 1.1 ross shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1140 1.1 ross }
1141 1.1 ross else {
1142 1.1 ross shift128ExtraRightJamming(
1143 1.1 ross zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1144 1.1 ross }
1145 1.1 ross zExp -= shiftCount;
1146 1.1 ross return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1147 1.1 ross
1148 1.1 ross }
1149 1.1 ross
1150 1.1 ross #endif
1151 1.1 ross
1152 1.1 ross /*
1153 1.1 ross -------------------------------------------------------------------------------
1154 1.1 ross Returns the result of converting the 32-bit two's complement integer `a'
1155 1.1 ross to the single-precision floating-point format. The conversion is performed
1156 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1157 1.1 ross -------------------------------------------------------------------------------
1158 1.1 ross */
1159 1.1 ross float32 int32_to_float32( int32 a )
1160 1.1 ross {
1161 1.1 ross flag zSign;
1162 1.1 ross
1163 1.1 ross if ( a == 0 ) return 0;
1164 1.1 ross if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1165 1.1 ross zSign = ( a < 0 );
1166 1.1 ross return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1167 1.1 ross
1168 1.1 ross }
1169 1.1 ross
1170 1.1 ross /*
1171 1.1 ross -------------------------------------------------------------------------------
1172 1.1 ross Returns the result of converting the 32-bit two's complement integer `a'
1173 1.1 ross to the double-precision floating-point format. The conversion is performed
1174 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1175 1.1 ross -------------------------------------------------------------------------------
1176 1.1 ross */
1177 1.1 ross float64 int32_to_float64( int32 a )
1178 1.1 ross {
1179 1.1 ross flag zSign;
1180 1.1 ross uint32 absA;
1181 1.1 ross int8 shiftCount;
1182 1.1 ross bits64 zSig;
1183 1.1 ross
1184 1.1 ross if ( a == 0 ) return 0;
1185 1.1 ross zSign = ( a < 0 );
1186 1.1 ross absA = zSign ? - a : a;
1187 1.1 ross shiftCount = countLeadingZeros32( absA ) + 21;
1188 1.1 ross zSig = absA;
1189 1.1 ross return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1190 1.1 ross
1191 1.1 ross }
1192 1.1 ross
1193 1.1 ross #ifdef FLOATX80
1194 1.1 ross
1195 1.1 ross /*
1196 1.1 ross -------------------------------------------------------------------------------
1197 1.1 ross Returns the result of converting the 32-bit two's complement integer `a'
1198 1.1 ross to the extended double-precision floating-point format. The conversion
1199 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
1200 1.1 ross Arithmetic.
1201 1.1 ross -------------------------------------------------------------------------------
1202 1.1 ross */
1203 1.1 ross floatx80 int32_to_floatx80( int32 a )
1204 1.1 ross {
1205 1.1 ross flag zSign;
1206 1.1 ross uint32 absA;
1207 1.1 ross int8 shiftCount;
1208 1.1 ross bits64 zSig;
1209 1.1 ross
1210 1.1 ross if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1211 1.1 ross zSign = ( a < 0 );
1212 1.1 ross absA = zSign ? - a : a;
1213 1.1 ross shiftCount = countLeadingZeros32( absA ) + 32;
1214 1.1 ross zSig = absA;
1215 1.1 ross return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1216 1.1 ross
1217 1.1 ross }
1218 1.1 ross
1219 1.1 ross #endif
1220 1.1 ross
1221 1.1 ross #ifdef FLOAT128
1222 1.1 ross
1223 1.1 ross /*
1224 1.1 ross -------------------------------------------------------------------------------
1225 1.1 ross Returns the result of converting the 32-bit two's complement integer `a' to
1226 1.1 ross the quadruple-precision floating-point format. The conversion is performed
1227 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1228 1.1 ross -------------------------------------------------------------------------------
1229 1.1 ross */
1230 1.1 ross float128 int32_to_float128( int32 a )
1231 1.1 ross {
1232 1.1 ross flag zSign;
1233 1.1 ross uint32 absA;
1234 1.1 ross int8 shiftCount;
1235 1.1 ross bits64 zSig0;
1236 1.1 ross
1237 1.1 ross if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1238 1.1 ross zSign = ( a < 0 );
1239 1.1 ross absA = zSign ? - a : a;
1240 1.1 ross shiftCount = countLeadingZeros32( absA ) + 17;
1241 1.1 ross zSig0 = absA;
1242 1.1 ross return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1243 1.1 ross
1244 1.1 ross }
1245 1.1 ross
1246 1.1 ross #endif
1247 1.1 ross
1248 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1249 1.1 ross /*
1250 1.1 ross -------------------------------------------------------------------------------
1251 1.1 ross Returns the result of converting the 64-bit two's complement integer `a'
1252 1.1 ross to the single-precision floating-point format. The conversion is performed
1253 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1254 1.1 ross -------------------------------------------------------------------------------
1255 1.1 ross */
1256 1.1 ross float32 int64_to_float32( int64 a )
1257 1.1 ross {
1258 1.1 ross flag zSign;
1259 1.1 ross uint64 absA;
1260 1.1 ross int8 shiftCount;
1261 1.1 ross
1262 1.1 ross if ( a == 0 ) return 0;
1263 1.1 ross zSign = ( a < 0 );
1264 1.1 ross absA = zSign ? - a : a;
1265 1.1 ross shiftCount = countLeadingZeros64( absA ) - 40;
1266 1.1 ross if ( 0 <= shiftCount ) {
1267 1.1 ross return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1268 1.1 ross }
1269 1.1 ross else {
1270 1.1 ross shiftCount += 7;
1271 1.1 ross if ( shiftCount < 0 ) {
1272 1.1 ross shift64RightJamming( absA, - shiftCount, &absA );
1273 1.1 ross }
1274 1.1 ross else {
1275 1.1 ross absA <<= shiftCount;
1276 1.1 ross }
1277 1.1 ross return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1278 1.1 ross }
1279 1.1 ross
1280 1.1 ross }
1281 1.1 ross
1282 1.1 ross /*
1283 1.1 ross -------------------------------------------------------------------------------
1284 1.1 ross Returns the result of converting the 64-bit two's complement integer `a'
1285 1.1 ross to the double-precision floating-point format. The conversion is performed
1286 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1287 1.1 ross -------------------------------------------------------------------------------
1288 1.1 ross */
1289 1.1 ross float64 int64_to_float64( int64 a )
1290 1.1 ross {
1291 1.1 ross flag zSign;
1292 1.1 ross
1293 1.1 ross if ( a == 0 ) return 0;
1294 1.1 ross if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1295 1.1 ross return packFloat64( 1, 0x43E, 0 );
1296 1.1 ross }
1297 1.1 ross zSign = ( a < 0 );
1298 1.1 ross return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1299 1.1 ross
1300 1.1 ross }
1301 1.1 ross
1302 1.1 ross #ifdef FLOATX80
1303 1.1 ross
1304 1.1 ross /*
1305 1.1 ross -------------------------------------------------------------------------------
1306 1.1 ross Returns the result of converting the 64-bit two's complement integer `a'
1307 1.1 ross to the extended double-precision floating-point format. The conversion
1308 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
1309 1.1 ross Arithmetic.
1310 1.1 ross -------------------------------------------------------------------------------
1311 1.1 ross */
1312 1.1 ross floatx80 int64_to_floatx80( int64 a )
1313 1.1 ross {
1314 1.1 ross flag zSign;
1315 1.1 ross uint64 absA;
1316 1.1 ross int8 shiftCount;
1317 1.1 ross
1318 1.1 ross if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1319 1.1 ross zSign = ( a < 0 );
1320 1.1 ross absA = zSign ? - a : a;
1321 1.1 ross shiftCount = countLeadingZeros64( absA );
1322 1.1 ross return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1323 1.1 ross
1324 1.1 ross }
1325 1.1 ross
1326 1.1 ross #endif
1327 1.1 ross
1328 1.1 ross #ifdef FLOAT128
1329 1.1 ross
1330 1.1 ross /*
1331 1.1 ross -------------------------------------------------------------------------------
1332 1.1 ross Returns the result of converting the 64-bit two's complement integer `a' to
1333 1.1 ross the quadruple-precision floating-point format. The conversion is performed
1334 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1335 1.1 ross -------------------------------------------------------------------------------
1336 1.1 ross */
1337 1.1 ross float128 int64_to_float128( int64 a )
1338 1.1 ross {
1339 1.1 ross flag zSign;
1340 1.1 ross uint64 absA;
1341 1.1 ross int8 shiftCount;
1342 1.1 ross int32 zExp;
1343 1.1 ross bits64 zSig0, zSig1;
1344 1.1 ross
1345 1.1 ross if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1346 1.1 ross zSign = ( a < 0 );
1347 1.1 ross absA = zSign ? - a : a;
1348 1.1 ross shiftCount = countLeadingZeros64( absA ) + 49;
1349 1.1 ross zExp = 0x406E - shiftCount;
1350 1.1 ross if ( 64 <= shiftCount ) {
1351 1.1 ross zSig1 = 0;
1352 1.1 ross zSig0 = absA;
1353 1.1 ross shiftCount -= 64;
1354 1.1 ross }
1355 1.1 ross else {
1356 1.1 ross zSig1 = absA;
1357 1.1 ross zSig0 = 0;
1358 1.1 ross }
1359 1.1 ross shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1360 1.1 ross return packFloat128( zSign, zExp, zSig0, zSig1 );
1361 1.1 ross
1362 1.1 ross }
1363 1.1 ross
1364 1.1 ross #endif
1365 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
1366 1.1 ross
1367 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1368 1.1 ross /*
1369 1.1 ross -------------------------------------------------------------------------------
1370 1.1 ross Returns the result of converting the single-precision floating-point value
1371 1.1 ross `a' to the 32-bit two's complement integer format. The conversion is
1372 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
1373 1.1 ross Arithmetic---which means in particular that the conversion is rounded
1374 1.1 ross according to the current rounding mode. If `a' is a NaN, the largest
1375 1.1 ross positive integer is returned. Otherwise, if the conversion overflows, the
1376 1.1 ross largest integer with the same sign as `a' is returned.
1377 1.1 ross -------------------------------------------------------------------------------
1378 1.1 ross */
1379 1.1 ross int32 float32_to_int32( float32 a )
1380 1.1 ross {
1381 1.1 ross flag aSign;
1382 1.1 ross int16 aExp, shiftCount;
1383 1.1 ross bits32 aSig;
1384 1.1 ross bits64 aSig64;
1385 1.1 ross
1386 1.1 ross aSig = extractFloat32Frac( a );
1387 1.1 ross aExp = extractFloat32Exp( a );
1388 1.1 ross aSign = extractFloat32Sign( a );
1389 1.1 ross if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1390 1.1 ross if ( aExp ) aSig |= 0x00800000;
1391 1.1 ross shiftCount = 0xAF - aExp;
1392 1.1 ross aSig64 = aSig;
1393 1.1 ross aSig64 <<= 32;
1394 1.1 ross if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1395 1.1 ross return roundAndPackInt32( aSign, aSig64 );
1396 1.1 ross
1397 1.1 ross }
1398 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
1399 1.1 ross
1400 1.1 ross /*
1401 1.1 ross -------------------------------------------------------------------------------
1402 1.1 ross Returns the result of converting the single-precision floating-point value
1403 1.1 ross `a' to the 32-bit two's complement integer format. The conversion is
1404 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
1405 1.1 ross Arithmetic, except that the conversion is always rounded toward zero.
1406 1.1 ross If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1407 1.1 ross the conversion overflows, the largest integer with the same sign as `a' is
1408 1.1 ross returned.
1409 1.1 ross -------------------------------------------------------------------------------
1410 1.1 ross */
1411 1.1 ross int32 float32_to_int32_round_to_zero( float32 a )
1412 1.1 ross {
1413 1.1 ross flag aSign;
1414 1.1 ross int16 aExp, shiftCount;
1415 1.1 ross bits32 aSig;
1416 1.1 ross int32 z;
1417 1.1 ross
1418 1.1 ross aSig = extractFloat32Frac( a );
1419 1.1 ross aExp = extractFloat32Exp( a );
1420 1.1 ross aSign = extractFloat32Sign( a );
1421 1.1 ross shiftCount = aExp - 0x9E;
1422 1.1 ross if ( 0 <= shiftCount ) {
1423 1.1 ross if ( a != 0xCF000000 ) {
1424 1.1 ross float_raise( float_flag_invalid );
1425 1.1 ross if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1426 1.1 ross }
1427 1.1 ross return (sbits32) 0x80000000;
1428 1.1 ross }
1429 1.1 ross else if ( aExp <= 0x7E ) {
1430 1.1 ross if ( aExp | aSig ) float_set_inexact();
1431 1.1 ross return 0;
1432 1.1 ross }
1433 1.1 ross aSig = ( aSig | 0x00800000 )<<8;
1434 1.1 ross z = aSig>>( - shiftCount );
1435 1.1 ross if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1436 1.1 ross float_set_inexact();
1437 1.1 ross }
1438 1.1 ross if ( aSign ) z = - z;
1439 1.1 ross return z;
1440 1.1 ross
1441 1.1 ross }
1442 1.1 ross
1443 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1444 1.1 ross /*
1445 1.1 ross -------------------------------------------------------------------------------
1446 1.1 ross Returns the result of converting the single-precision floating-point value
1447 1.1 ross `a' to the 64-bit two's complement integer format. The conversion is
1448 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
1449 1.1 ross Arithmetic---which means in particular that the conversion is rounded
1450 1.1 ross according to the current rounding mode. If `a' is a NaN, the largest
1451 1.1 ross positive integer is returned. Otherwise, if the conversion overflows, the
1452 1.1 ross largest integer with the same sign as `a' is returned.
1453 1.1 ross -------------------------------------------------------------------------------
1454 1.1 ross */
1455 1.1 ross int64 float32_to_int64( float32 a )
1456 1.1 ross {
1457 1.1 ross flag aSign;
1458 1.1 ross int16 aExp, shiftCount;
1459 1.1 ross bits32 aSig;
1460 1.1 ross bits64 aSig64, aSigExtra;
1461 1.1 ross
1462 1.1 ross aSig = extractFloat32Frac( a );
1463 1.1 ross aExp = extractFloat32Exp( a );
1464 1.1 ross aSign = extractFloat32Sign( a );
1465 1.1 ross shiftCount = 0xBE - aExp;
1466 1.1 ross if ( shiftCount < 0 ) {
1467 1.1 ross float_raise( float_flag_invalid );
1468 1.1 ross if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1469 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
1470 1.1 ross }
1471 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
1472 1.1 ross }
1473 1.1 ross if ( aExp ) aSig |= 0x00800000;
1474 1.1 ross aSig64 = aSig;
1475 1.1 ross aSig64 <<= 40;
1476 1.1 ross shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1477 1.1 ross return roundAndPackInt64( aSign, aSig64, aSigExtra );
1478 1.1 ross
1479 1.1 ross }
1480 1.1 ross
1481 1.1 ross /*
1482 1.1 ross -------------------------------------------------------------------------------
1483 1.1 ross Returns the result of converting the single-precision floating-point value
1484 1.1 ross `a' to the 64-bit two's complement integer format. The conversion is
1485 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
1486 1.1 ross Arithmetic, except that the conversion is always rounded toward zero. If
1487 1.1 ross `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1488 1.1 ross conversion overflows, the largest integer with the same sign as `a' is
1489 1.1 ross returned.
1490 1.1 ross -------------------------------------------------------------------------------
1491 1.1 ross */
1492 1.1 ross int64 float32_to_int64_round_to_zero( float32 a )
1493 1.1 ross {
1494 1.1 ross flag aSign;
1495 1.1 ross int16 aExp, shiftCount;
1496 1.1 ross bits32 aSig;
1497 1.1 ross bits64 aSig64;
1498 1.1 ross int64 z;
1499 1.1 ross
1500 1.1 ross aSig = extractFloat32Frac( a );
1501 1.1 ross aExp = extractFloat32Exp( a );
1502 1.1 ross aSign = extractFloat32Sign( a );
1503 1.1 ross shiftCount = aExp - 0xBE;
1504 1.1 ross if ( 0 <= shiftCount ) {
1505 1.1 ross if ( a != 0xDF000000 ) {
1506 1.1 ross float_raise( float_flag_invalid );
1507 1.1 ross if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1508 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
1509 1.1 ross }
1510 1.1 ross }
1511 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
1512 1.1 ross }
1513 1.1 ross else if ( aExp <= 0x7E ) {
1514 1.1 ross if ( aExp | aSig ) float_set_inexact();
1515 1.1 ross return 0;
1516 1.1 ross }
1517 1.1 ross aSig64 = aSig | 0x00800000;
1518 1.1 ross aSig64 <<= 40;
1519 1.1 ross z = aSig64>>( - shiftCount );
1520 1.1 ross if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1521 1.1 ross float_set_inexact();
1522 1.1 ross }
1523 1.1 ross if ( aSign ) z = - z;
1524 1.1 ross return z;
1525 1.1 ross
1526 1.1 ross }
1527 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
1528 1.1 ross
1529 1.1 ross /*
1530 1.1 ross -------------------------------------------------------------------------------
1531 1.1 ross Returns the result of converting the single-precision floating-point value
1532 1.1 ross `a' to the double-precision floating-point format. The conversion is
1533 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
1534 1.1 ross Arithmetic.
1535 1.1 ross -------------------------------------------------------------------------------
1536 1.1 ross */
1537 1.1 ross float64 float32_to_float64( float32 a )
1538 1.1 ross {
1539 1.1 ross flag aSign;
1540 1.1 ross int16 aExp;
1541 1.1 ross bits32 aSig;
1542 1.1 ross
1543 1.1 ross aSig = extractFloat32Frac( a );
1544 1.1 ross aExp = extractFloat32Exp( a );
1545 1.1 ross aSign = extractFloat32Sign( a );
1546 1.1 ross if ( aExp == 0xFF ) {
1547 1.1 ross if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1548 1.1 ross return packFloat64( aSign, 0x7FF, 0 );
1549 1.1 ross }
1550 1.1 ross if ( aExp == 0 ) {
1551 1.1 ross if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1552 1.1 ross normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1553 1.1 ross --aExp;
1554 1.1 ross }
1555 1.1 ross return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1556 1.1 ross
1557 1.1 ross }
1558 1.1 ross
1559 1.1 ross #ifdef FLOATX80
1560 1.1 ross
1561 1.1 ross /*
1562 1.1 ross -------------------------------------------------------------------------------
1563 1.1 ross Returns the result of converting the single-precision floating-point value
1564 1.1 ross `a' to the extended double-precision floating-point format. The conversion
1565 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
1566 1.1 ross Arithmetic.
1567 1.1 ross -------------------------------------------------------------------------------
1568 1.1 ross */
1569 1.1 ross floatx80 float32_to_floatx80( float32 a )
1570 1.1 ross {
1571 1.1 ross flag aSign;
1572 1.1 ross int16 aExp;
1573 1.1 ross bits32 aSig;
1574 1.1 ross
1575 1.1 ross aSig = extractFloat32Frac( a );
1576 1.1 ross aExp = extractFloat32Exp( a );
1577 1.1 ross aSign = extractFloat32Sign( a );
1578 1.1 ross if ( aExp == 0xFF ) {
1579 1.1 ross if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1580 1.1 ross return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1581 1.1 ross }
1582 1.1 ross if ( aExp == 0 ) {
1583 1.1 ross if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1584 1.1 ross normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1585 1.1 ross }
1586 1.1 ross aSig |= 0x00800000;
1587 1.1 ross return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1588 1.1 ross
1589 1.1 ross }
1590 1.1 ross
1591 1.1 ross #endif
1592 1.1 ross
1593 1.1 ross #ifdef FLOAT128
1594 1.1 ross
1595 1.1 ross /*
1596 1.1 ross -------------------------------------------------------------------------------
1597 1.1 ross Returns the result of converting the single-precision floating-point value
1598 1.1 ross `a' to the double-precision floating-point format. The conversion is
1599 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
1600 1.1 ross Arithmetic.
1601 1.1 ross -------------------------------------------------------------------------------
1602 1.1 ross */
1603 1.1 ross float128 float32_to_float128( float32 a )
1604 1.1 ross {
1605 1.1 ross flag aSign;
1606 1.1 ross int16 aExp;
1607 1.1 ross bits32 aSig;
1608 1.1 ross
1609 1.1 ross aSig = extractFloat32Frac( a );
1610 1.1 ross aExp = extractFloat32Exp( a );
1611 1.1 ross aSign = extractFloat32Sign( a );
1612 1.1 ross if ( aExp == 0xFF ) {
1613 1.1 ross if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1614 1.1 ross return packFloat128( aSign, 0x7FFF, 0, 0 );
1615 1.1 ross }
1616 1.1 ross if ( aExp == 0 ) {
1617 1.1 ross if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1618 1.1 ross normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1619 1.1 ross --aExp;
1620 1.1 ross }
1621 1.1 ross return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1622 1.1 ross
1623 1.1 ross }
1624 1.1 ross
1625 1.1 ross #endif
1626 1.1 ross
1627 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1628 1.1 ross /*
1629 1.1 ross -------------------------------------------------------------------------------
1630 1.1 ross Rounds the single-precision floating-point value `a' to an integer, and
1631 1.1 ross returns the result as a single-precision floating-point value. The
1632 1.1 ross operation is performed according to the IEC/IEEE Standard for Binary
1633 1.1 ross Floating-Point Arithmetic.
1634 1.1 ross -------------------------------------------------------------------------------
1635 1.1 ross */
1636 1.1 ross float32 float32_round_to_int( float32 a )
1637 1.1 ross {
1638 1.1 ross flag aSign;
1639 1.1 ross int16 aExp;
1640 1.1 ross bits32 lastBitMask, roundBitsMask;
1641 1.1 ross int8 roundingMode;
1642 1.1 ross float32 z;
1643 1.1 ross
1644 1.1 ross aExp = extractFloat32Exp( a );
1645 1.1 ross if ( 0x96 <= aExp ) {
1646 1.1 ross if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1647 1.1 ross return propagateFloat32NaN( a, a );
1648 1.1 ross }
1649 1.1 ross return a;
1650 1.1 ross }
1651 1.1 ross if ( aExp <= 0x7E ) {
1652 1.1 ross if ( (bits32) ( a<<1 ) == 0 ) return a;
1653 1.1 ross float_set_inexact();
1654 1.1 ross aSign = extractFloat32Sign( a );
1655 1.1 ross switch ( float_rounding_mode() ) {
1656 1.1 ross case float_round_nearest_even:
1657 1.1 ross if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1658 1.1 ross return packFloat32( aSign, 0x7F, 0 );
1659 1.1 ross }
1660 1.1 ross break;
1661 1.1 ross case float_round_down:
1662 1.1 ross return aSign ? 0xBF800000 : 0;
1663 1.1 ross case float_round_up:
1664 1.1 ross return aSign ? 0x80000000 : 0x3F800000;
1665 1.1 ross }
1666 1.1 ross return packFloat32( aSign, 0, 0 );
1667 1.1 ross }
1668 1.1 ross lastBitMask = 1;
1669 1.1 ross lastBitMask <<= 0x96 - aExp;
1670 1.1 ross roundBitsMask = lastBitMask - 1;
1671 1.1 ross z = a;
1672 1.1 ross roundingMode = float_rounding_mode();
1673 1.1 ross if ( roundingMode == float_round_nearest_even ) {
1674 1.1 ross z += lastBitMask>>1;
1675 1.1 ross if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1676 1.1 ross }
1677 1.1 ross else if ( roundingMode != float_round_to_zero ) {
1678 1.1 ross if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1679 1.1 ross z += roundBitsMask;
1680 1.1 ross }
1681 1.1 ross }
1682 1.1 ross z &= ~ roundBitsMask;
1683 1.1 ross if ( z != a ) float_set_inexact();
1684 1.1 ross return z;
1685 1.1 ross
1686 1.1 ross }
1687 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
1688 1.1 ross
1689 1.1 ross /*
1690 1.1 ross -------------------------------------------------------------------------------
1691 1.1 ross Returns the result of adding the absolute values of the single-precision
1692 1.1 ross floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1693 1.1 ross before being returned. `zSign' is ignored if the result is a NaN.
1694 1.1 ross The addition is performed according to the IEC/IEEE Standard for Binary
1695 1.1 ross Floating-Point Arithmetic.
1696 1.1 ross -------------------------------------------------------------------------------
1697 1.1 ross */
1698 1.1 ross static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1699 1.1 ross {
1700 1.1 ross int16 aExp, bExp, zExp;
1701 1.1 ross bits32 aSig, bSig, zSig;
1702 1.1 ross int16 expDiff;
1703 1.1 ross
1704 1.1 ross aSig = extractFloat32Frac( a );
1705 1.1 ross aExp = extractFloat32Exp( a );
1706 1.1 ross bSig = extractFloat32Frac( b );
1707 1.1 ross bExp = extractFloat32Exp( b );
1708 1.1 ross expDiff = aExp - bExp;
1709 1.1 ross aSig <<= 6;
1710 1.1 ross bSig <<= 6;
1711 1.1 ross if ( 0 < expDiff ) {
1712 1.1 ross if ( aExp == 0xFF ) {
1713 1.1 ross if ( aSig ) return propagateFloat32NaN( a, b );
1714 1.1 ross return a;
1715 1.1 ross }
1716 1.1 ross if ( bExp == 0 ) {
1717 1.1 ross --expDiff;
1718 1.1 ross }
1719 1.1 ross else {
1720 1.1 ross bSig |= 0x20000000;
1721 1.1 ross }
1722 1.1 ross shift32RightJamming( bSig, expDiff, &bSig );
1723 1.1 ross zExp = aExp;
1724 1.1 ross }
1725 1.1 ross else if ( expDiff < 0 ) {
1726 1.1 ross if ( bExp == 0xFF ) {
1727 1.1 ross if ( bSig ) return propagateFloat32NaN( a, b );
1728 1.1 ross return packFloat32( zSign, 0xFF, 0 );
1729 1.1 ross }
1730 1.1 ross if ( aExp == 0 ) {
1731 1.1 ross ++expDiff;
1732 1.1 ross }
1733 1.1 ross else {
1734 1.1 ross aSig |= 0x20000000;
1735 1.1 ross }
1736 1.1 ross shift32RightJamming( aSig, - expDiff, &aSig );
1737 1.1 ross zExp = bExp;
1738 1.1 ross }
1739 1.1 ross else {
1740 1.1 ross if ( aExp == 0xFF ) {
1741 1.1 ross if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1742 1.1 ross return a;
1743 1.1 ross }
1744 1.1 ross if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1745 1.1 ross zSig = 0x40000000 + aSig + bSig;
1746 1.1 ross zExp = aExp;
1747 1.1 ross goto roundAndPack;
1748 1.1 ross }
1749 1.1 ross aSig |= 0x20000000;
1750 1.1 ross zSig = ( aSig + bSig )<<1;
1751 1.1 ross --zExp;
1752 1.1 ross if ( (sbits32) zSig < 0 ) {
1753 1.1 ross zSig = aSig + bSig;
1754 1.1 ross ++zExp;
1755 1.1 ross }
1756 1.1 ross roundAndPack:
1757 1.1 ross return roundAndPackFloat32( zSign, zExp, zSig );
1758 1.1 ross
1759 1.1 ross }
1760 1.1 ross
1761 1.1 ross /*
1762 1.1 ross -------------------------------------------------------------------------------
1763 1.1 ross Returns the result of subtracting the absolute values of the single-
1764 1.1 ross precision floating-point values `a' and `b'. If `zSign' is 1, the
1765 1.1 ross difference is negated before being returned. `zSign' is ignored if the
1766 1.1 ross result is a NaN. The subtraction is performed according to the IEC/IEEE
1767 1.1 ross Standard for Binary Floating-Point Arithmetic.
1768 1.1 ross -------------------------------------------------------------------------------
1769 1.1 ross */
1770 1.1 ross static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1771 1.1 ross {
1772 1.1 ross int16 aExp, bExp, zExp;
1773 1.1 ross bits32 aSig, bSig, zSig;
1774 1.1 ross int16 expDiff;
1775 1.1 ross
1776 1.1 ross aSig = extractFloat32Frac( a );
1777 1.1 ross aExp = extractFloat32Exp( a );
1778 1.1 ross bSig = extractFloat32Frac( b );
1779 1.1 ross bExp = extractFloat32Exp( b );
1780 1.1 ross expDiff = aExp - bExp;
1781 1.1 ross aSig <<= 7;
1782 1.1 ross bSig <<= 7;
1783 1.1 ross if ( 0 < expDiff ) goto aExpBigger;
1784 1.1 ross if ( expDiff < 0 ) goto bExpBigger;
1785 1.1 ross if ( aExp == 0xFF ) {
1786 1.1 ross if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1787 1.1 ross float_raise( float_flag_invalid );
1788 1.1 ross return float32_default_nan;
1789 1.1 ross }
1790 1.1 ross if ( aExp == 0 ) {
1791 1.1 ross aExp = 1;
1792 1.1 ross bExp = 1;
1793 1.1 ross }
1794 1.1 ross if ( bSig < aSig ) goto aBigger;
1795 1.1 ross if ( aSig < bSig ) goto bBigger;
1796 1.1 ross return packFloat32( float_rounding_mode() == float_round_down, 0, 0 );
1797 1.1 ross bExpBigger:
1798 1.1 ross if ( bExp == 0xFF ) {
1799 1.1 ross if ( bSig ) return propagateFloat32NaN( a, b );
1800 1.1 ross return packFloat32( zSign ^ 1, 0xFF, 0 );
1801 1.1 ross }
1802 1.1 ross if ( aExp == 0 ) {
1803 1.1 ross ++expDiff;
1804 1.1 ross }
1805 1.1 ross else {
1806 1.1 ross aSig |= 0x40000000;
1807 1.1 ross }
1808 1.1 ross shift32RightJamming( aSig, - expDiff, &aSig );
1809 1.1 ross bSig |= 0x40000000;
1810 1.1 ross bBigger:
1811 1.1 ross zSig = bSig - aSig;
1812 1.1 ross zExp = bExp;
1813 1.1 ross zSign ^= 1;
1814 1.1 ross goto normalizeRoundAndPack;
1815 1.1 ross aExpBigger:
1816 1.1 ross if ( aExp == 0xFF ) {
1817 1.1 ross if ( aSig ) return propagateFloat32NaN( a, b );
1818 1.1 ross return a;
1819 1.1 ross }
1820 1.1 ross if ( bExp == 0 ) {
1821 1.1 ross --expDiff;
1822 1.1 ross }
1823 1.1 ross else {
1824 1.1 ross bSig |= 0x40000000;
1825 1.1 ross }
1826 1.1 ross shift32RightJamming( bSig, expDiff, &bSig );
1827 1.1 ross aSig |= 0x40000000;
1828 1.1 ross aBigger:
1829 1.1 ross zSig = aSig - bSig;
1830 1.1 ross zExp = aExp;
1831 1.1 ross normalizeRoundAndPack:
1832 1.1 ross --zExp;
1833 1.1 ross return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1834 1.1 ross
1835 1.1 ross }
1836 1.1 ross
1837 1.1 ross /*
1838 1.1 ross -------------------------------------------------------------------------------
1839 1.1 ross Returns the result of adding the single-precision floating-point values `a'
1840 1.1 ross and `b'. The operation is performed according to the IEC/IEEE Standard for
1841 1.1 ross Binary Floating-Point Arithmetic.
1842 1.1 ross -------------------------------------------------------------------------------
1843 1.1 ross */
1844 1.1 ross float32 float32_add( float32 a, float32 b )
1845 1.1 ross {
1846 1.1 ross flag aSign, bSign;
1847 1.1 ross
1848 1.1 ross aSign = extractFloat32Sign( a );
1849 1.1 ross bSign = extractFloat32Sign( b );
1850 1.1 ross if ( aSign == bSign ) {
1851 1.1 ross return addFloat32Sigs( a, b, aSign );
1852 1.1 ross }
1853 1.1 ross else {
1854 1.1 ross return subFloat32Sigs( a, b, aSign );
1855 1.1 ross }
1856 1.1 ross
1857 1.1 ross }
1858 1.1 ross
1859 1.1 ross /*
1860 1.1 ross -------------------------------------------------------------------------------
1861 1.1 ross Returns the result of subtracting the single-precision floating-point values
1862 1.1 ross `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1863 1.1 ross for Binary Floating-Point Arithmetic.
1864 1.1 ross -------------------------------------------------------------------------------
1865 1.1 ross */
1866 1.1 ross float32 float32_sub( float32 a, float32 b )
1867 1.1 ross {
1868 1.1 ross flag aSign, bSign;
1869 1.1 ross
1870 1.1 ross aSign = extractFloat32Sign( a );
1871 1.1 ross bSign = extractFloat32Sign( b );
1872 1.1 ross if ( aSign == bSign ) {
1873 1.1 ross return subFloat32Sigs( a, b, aSign );
1874 1.1 ross }
1875 1.1 ross else {
1876 1.1 ross return addFloat32Sigs( a, b, aSign );
1877 1.1 ross }
1878 1.1 ross
1879 1.1 ross }
1880 1.1 ross
1881 1.1 ross /*
1882 1.1 ross -------------------------------------------------------------------------------
1883 1.1 ross Returns the result of multiplying the single-precision floating-point values
1884 1.1 ross `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1885 1.1 ross for Binary Floating-Point Arithmetic.
1886 1.1 ross -------------------------------------------------------------------------------
1887 1.1 ross */
1888 1.1 ross float32 float32_mul( float32 a, float32 b )
1889 1.1 ross {
1890 1.1 ross flag aSign, bSign, zSign;
1891 1.1 ross int16 aExp, bExp, zExp;
1892 1.1 ross bits32 aSig, bSig;
1893 1.1 ross bits64 zSig64;
1894 1.1 ross bits32 zSig;
1895 1.1 ross
1896 1.1 ross aSig = extractFloat32Frac( a );
1897 1.1 ross aExp = extractFloat32Exp( a );
1898 1.1 ross aSign = extractFloat32Sign( a );
1899 1.1 ross bSig = extractFloat32Frac( b );
1900 1.1 ross bExp = extractFloat32Exp( b );
1901 1.1 ross bSign = extractFloat32Sign( b );
1902 1.1 ross zSign = aSign ^ bSign;
1903 1.1 ross if ( aExp == 0xFF ) {
1904 1.1 ross if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1905 1.1 ross return propagateFloat32NaN( a, b );
1906 1.1 ross }
1907 1.1 ross if ( ( bExp | bSig ) == 0 ) {
1908 1.1 ross float_raise( float_flag_invalid );
1909 1.1 ross return float32_default_nan;
1910 1.1 ross }
1911 1.1 ross return packFloat32( zSign, 0xFF, 0 );
1912 1.1 ross }
1913 1.1 ross if ( bExp == 0xFF ) {
1914 1.1 ross if ( bSig ) return propagateFloat32NaN( a, b );
1915 1.1 ross if ( ( aExp | aSig ) == 0 ) {
1916 1.1 ross float_raise( float_flag_invalid );
1917 1.1 ross return float32_default_nan;
1918 1.1 ross }
1919 1.1 ross return packFloat32( zSign, 0xFF, 0 );
1920 1.1 ross }
1921 1.1 ross if ( aExp == 0 ) {
1922 1.1 ross if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1923 1.1 ross normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1924 1.1 ross }
1925 1.1 ross if ( bExp == 0 ) {
1926 1.1 ross if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1927 1.1 ross normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1928 1.1 ross }
1929 1.1 ross zExp = aExp + bExp - 0x7F;
1930 1.1 ross aSig = ( aSig | 0x00800000 )<<7;
1931 1.1 ross bSig = ( bSig | 0x00800000 )<<8;
1932 1.1 ross shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1933 1.1 ross zSig = zSig64;
1934 1.1 ross if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1935 1.1 ross zSig <<= 1;
1936 1.1 ross --zExp;
1937 1.1 ross }
1938 1.1 ross return roundAndPackFloat32( zSign, zExp, zSig );
1939 1.1 ross
1940 1.1 ross }
1941 1.1 ross
1942 1.1 ross /*
1943 1.1 ross -------------------------------------------------------------------------------
1944 1.1 ross Returns the result of dividing the single-precision floating-point value `a'
1945 1.1 ross by the corresponding value `b'. The operation is performed according to the
1946 1.1 ross IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1947 1.1 ross -------------------------------------------------------------------------------
1948 1.1 ross */
1949 1.1 ross float32 float32_div( float32 a, float32 b )
1950 1.1 ross {
1951 1.1 ross flag aSign, bSign, zSign;
1952 1.1 ross int16 aExp, bExp, zExp;
1953 1.1 ross bits32 aSig, bSig, zSig;
1954 1.1 ross
1955 1.1 ross aSig = extractFloat32Frac( a );
1956 1.1 ross aExp = extractFloat32Exp( a );
1957 1.1 ross aSign = extractFloat32Sign( a );
1958 1.1 ross bSig = extractFloat32Frac( b );
1959 1.1 ross bExp = extractFloat32Exp( b );
1960 1.1 ross bSign = extractFloat32Sign( b );
1961 1.1 ross zSign = aSign ^ bSign;
1962 1.1 ross if ( aExp == 0xFF ) {
1963 1.1 ross if ( aSig ) return propagateFloat32NaN( a, b );
1964 1.1 ross if ( bExp == 0xFF ) {
1965 1.1 ross if ( bSig ) return propagateFloat32NaN( a, b );
1966 1.1 ross float_raise( float_flag_invalid );
1967 1.1 ross return float32_default_nan;
1968 1.1 ross }
1969 1.1 ross return packFloat32( zSign, 0xFF, 0 );
1970 1.1 ross }
1971 1.1 ross if ( bExp == 0xFF ) {
1972 1.1 ross if ( bSig ) return propagateFloat32NaN( a, b );
1973 1.1 ross return packFloat32( zSign, 0, 0 );
1974 1.1 ross }
1975 1.1 ross if ( bExp == 0 ) {
1976 1.1 ross if ( bSig == 0 ) {
1977 1.1 ross if ( ( aExp | aSig ) == 0 ) {
1978 1.1 ross float_raise( float_flag_invalid );
1979 1.1 ross return float32_default_nan;
1980 1.1 ross }
1981 1.1 ross float_raise( float_flag_divbyzero );
1982 1.1 ross return packFloat32( zSign, 0xFF, 0 );
1983 1.1 ross }
1984 1.1 ross normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1985 1.1 ross }
1986 1.1 ross if ( aExp == 0 ) {
1987 1.1 ross if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1988 1.1 ross normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1989 1.1 ross }
1990 1.1 ross zExp = aExp - bExp + 0x7D;
1991 1.1 ross aSig = ( aSig | 0x00800000 )<<7;
1992 1.1 ross bSig = ( bSig | 0x00800000 )<<8;
1993 1.1 ross if ( bSig <= ( aSig + aSig ) ) {
1994 1.1 ross aSig >>= 1;
1995 1.1 ross ++zExp;
1996 1.1 ross }
1997 1.1 ross zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1998 1.1 ross if ( ( zSig & 0x3F ) == 0 ) {
1999 1.1 ross zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
2000 1.1 ross }
2001 1.1 ross return roundAndPackFloat32( zSign, zExp, zSig );
2002 1.1 ross
2003 1.1 ross }
2004 1.1 ross
2005 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2006 1.1 ross /*
2007 1.1 ross -------------------------------------------------------------------------------
2008 1.1 ross Returns the remainder of the single-precision floating-point value `a'
2009 1.1 ross with respect to the corresponding value `b'. The operation is performed
2010 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2011 1.1 ross -------------------------------------------------------------------------------
2012 1.1 ross */
2013 1.1 ross float32 float32_rem( float32 a, float32 b )
2014 1.1 ross {
2015 1.5 christos flag aSign, bSign __unused, zSign;
2016 1.1 ross int16 aExp, bExp, expDiff;
2017 1.1 ross bits32 aSig, bSig;
2018 1.1 ross bits32 q;
2019 1.1 ross bits64 aSig64, bSig64, q64;
2020 1.1 ross bits32 alternateASig;
2021 1.1 ross sbits32 sigMean;
2022 1.1 ross
2023 1.1 ross aSig = extractFloat32Frac( a );
2024 1.1 ross aExp = extractFloat32Exp( a );
2025 1.1 ross aSign = extractFloat32Sign( a );
2026 1.1 ross bSig = extractFloat32Frac( b );
2027 1.1 ross bExp = extractFloat32Exp( b );
2028 1.1 ross bSign = extractFloat32Sign( b );
2029 1.1 ross if ( aExp == 0xFF ) {
2030 1.1 ross if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2031 1.1 ross return propagateFloat32NaN( a, b );
2032 1.1 ross }
2033 1.1 ross float_raise( float_flag_invalid );
2034 1.1 ross return float32_default_nan;
2035 1.1 ross }
2036 1.1 ross if ( bExp == 0xFF ) {
2037 1.1 ross if ( bSig ) return propagateFloat32NaN( a, b );
2038 1.1 ross return a;
2039 1.1 ross }
2040 1.1 ross if ( bExp == 0 ) {
2041 1.1 ross if ( bSig == 0 ) {
2042 1.1 ross float_raise( float_flag_invalid );
2043 1.1 ross return float32_default_nan;
2044 1.1 ross }
2045 1.1 ross normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2046 1.1 ross }
2047 1.1 ross if ( aExp == 0 ) {
2048 1.1 ross if ( aSig == 0 ) return a;
2049 1.1 ross normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2050 1.1 ross }
2051 1.1 ross expDiff = aExp - bExp;
2052 1.1 ross aSig |= 0x00800000;
2053 1.1 ross bSig |= 0x00800000;
2054 1.1 ross if ( expDiff < 32 ) {
2055 1.1 ross aSig <<= 8;
2056 1.1 ross bSig <<= 8;
2057 1.1 ross if ( expDiff < 0 ) {
2058 1.1 ross if ( expDiff < -1 ) return a;
2059 1.1 ross aSig >>= 1;
2060 1.1 ross }
2061 1.1 ross q = ( bSig <= aSig );
2062 1.1 ross if ( q ) aSig -= bSig;
2063 1.1 ross if ( 0 < expDiff ) {
2064 1.1 ross q = ( ( (bits64) aSig )<<32 ) / bSig;
2065 1.1 ross q >>= 32 - expDiff;
2066 1.1 ross bSig >>= 2;
2067 1.1 ross aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2068 1.1 ross }
2069 1.1 ross else {
2070 1.1 ross aSig >>= 2;
2071 1.1 ross bSig >>= 2;
2072 1.1 ross }
2073 1.1 ross }
2074 1.1 ross else {
2075 1.1 ross if ( bSig <= aSig ) aSig -= bSig;
2076 1.1 ross aSig64 = ( (bits64) aSig )<<40;
2077 1.1 ross bSig64 = ( (bits64) bSig )<<40;
2078 1.1 ross expDiff -= 64;
2079 1.1 ross while ( 0 < expDiff ) {
2080 1.1 ross q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2081 1.1 ross q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2082 1.1 ross aSig64 = - ( ( bSig * q64 )<<38 );
2083 1.1 ross expDiff -= 62;
2084 1.1 ross }
2085 1.1 ross expDiff += 64;
2086 1.1 ross q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2087 1.1 ross q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2088 1.1 ross q = q64>>( 64 - expDiff );
2089 1.1 ross bSig <<= 6;
2090 1.1 ross aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2091 1.1 ross }
2092 1.1 ross do {
2093 1.1 ross alternateASig = aSig;
2094 1.1 ross ++q;
2095 1.1 ross aSig -= bSig;
2096 1.1 ross } while ( 0 <= (sbits32) aSig );
2097 1.1 ross sigMean = aSig + alternateASig;
2098 1.1 ross if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2099 1.1 ross aSig = alternateASig;
2100 1.1 ross }
2101 1.1 ross zSign = ( (sbits32) aSig < 0 );
2102 1.1 ross if ( zSign ) aSig = - aSig;
2103 1.1 ross return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2104 1.1 ross
2105 1.1 ross }
2106 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
2107 1.1 ross
2108 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2109 1.1 ross /*
2110 1.1 ross -------------------------------------------------------------------------------
2111 1.1 ross Returns the square root of the single-precision floating-point value `a'.
2112 1.1 ross The operation is performed according to the IEC/IEEE Standard for Binary
2113 1.1 ross Floating-Point Arithmetic.
2114 1.1 ross -------------------------------------------------------------------------------
2115 1.1 ross */
2116 1.1 ross float32 float32_sqrt( float32 a )
2117 1.1 ross {
2118 1.1 ross flag aSign;
2119 1.1 ross int16 aExp, zExp;
2120 1.1 ross bits32 aSig, zSig;
2121 1.1 ross bits64 rem, term;
2122 1.1 ross
2123 1.1 ross aSig = extractFloat32Frac( a );
2124 1.1 ross aExp = extractFloat32Exp( a );
2125 1.1 ross aSign = extractFloat32Sign( a );
2126 1.1 ross if ( aExp == 0xFF ) {
2127 1.1 ross if ( aSig ) return propagateFloat32NaN( a, 0 );
2128 1.1 ross if ( ! aSign ) return a;
2129 1.1 ross float_raise( float_flag_invalid );
2130 1.1 ross return float32_default_nan;
2131 1.1 ross }
2132 1.1 ross if ( aSign ) {
2133 1.1 ross if ( ( aExp | aSig ) == 0 ) return a;
2134 1.1 ross float_raise( float_flag_invalid );
2135 1.1 ross return float32_default_nan;
2136 1.1 ross }
2137 1.1 ross if ( aExp == 0 ) {
2138 1.1 ross if ( aSig == 0 ) return 0;
2139 1.1 ross normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2140 1.1 ross }
2141 1.1 ross zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2142 1.1 ross aSig = ( aSig | 0x00800000 )<<8;
2143 1.1 ross zSig = estimateSqrt32( aExp, aSig ) + 2;
2144 1.1 ross if ( ( zSig & 0x7F ) <= 5 ) {
2145 1.1 ross if ( zSig < 2 ) {
2146 1.1 ross zSig = 0x7FFFFFFF;
2147 1.1 ross goto roundAndPack;
2148 1.1 ross }
2149 1.1 ross aSig >>= aExp & 1;
2150 1.1 ross term = ( (bits64) zSig ) * zSig;
2151 1.1 ross rem = ( ( (bits64) aSig )<<32 ) - term;
2152 1.1 ross while ( (sbits64) rem < 0 ) {
2153 1.1 ross --zSig;
2154 1.1 ross rem += ( ( (bits64) zSig )<<1 ) | 1;
2155 1.1 ross }
2156 1.1 ross zSig |= ( rem != 0 );
2157 1.1 ross }
2158 1.1 ross shift32RightJamming( zSig, 1, &zSig );
2159 1.1 ross roundAndPack:
2160 1.1 ross return roundAndPackFloat32( 0, zExp, zSig );
2161 1.1 ross
2162 1.1 ross }
2163 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
2164 1.1 ross
2165 1.1 ross /*
2166 1.1 ross -------------------------------------------------------------------------------
2167 1.1 ross Returns 1 if the single-precision floating-point value `a' is equal to
2168 1.1 ross the corresponding value `b', and 0 otherwise. The comparison is performed
2169 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2170 1.1 ross -------------------------------------------------------------------------------
2171 1.1 ross */
2172 1.1 ross flag float32_eq( float32 a, float32 b )
2173 1.1 ross {
2174 1.1 ross
2175 1.1 ross if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2176 1.1 ross || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2177 1.1 ross ) {
2178 1.1 ross if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2179 1.1 ross float_raise( float_flag_invalid );
2180 1.1 ross }
2181 1.1 ross return 0;
2182 1.1 ross }
2183 1.1 ross return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2184 1.1 ross
2185 1.1 ross }
2186 1.1 ross
2187 1.1 ross /*
2188 1.1 ross -------------------------------------------------------------------------------
2189 1.1 ross Returns 1 if the single-precision floating-point value `a' is less than
2190 1.1 ross or equal to the corresponding value `b', and 0 otherwise. The comparison
2191 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
2192 1.1 ross Arithmetic.
2193 1.1 ross -------------------------------------------------------------------------------
2194 1.1 ross */
2195 1.1 ross flag float32_le( float32 a, float32 b )
2196 1.1 ross {
2197 1.1 ross flag aSign, bSign;
2198 1.1 ross
2199 1.1 ross if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2200 1.1 ross || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2201 1.1 ross ) {
2202 1.1 ross float_raise( float_flag_invalid );
2203 1.1 ross return 0;
2204 1.1 ross }
2205 1.1 ross aSign = extractFloat32Sign( a );
2206 1.1 ross bSign = extractFloat32Sign( b );
2207 1.1 ross if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2208 1.1 ross return ( a == b ) || ( aSign ^ ( a < b ) );
2209 1.1 ross
2210 1.1 ross }
2211 1.1 ross
2212 1.1 ross /*
2213 1.1 ross -------------------------------------------------------------------------------
2214 1.1 ross Returns 1 if the single-precision floating-point value `a' is less than
2215 1.1 ross the corresponding value `b', and 0 otherwise. The comparison is performed
2216 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2217 1.1 ross -------------------------------------------------------------------------------
2218 1.1 ross */
2219 1.1 ross flag float32_lt( float32 a, float32 b )
2220 1.1 ross {
2221 1.1 ross flag aSign, bSign;
2222 1.1 ross
2223 1.1 ross if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2224 1.1 ross || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2225 1.1 ross ) {
2226 1.1 ross float_raise( float_flag_invalid );
2227 1.1 ross return 0;
2228 1.1 ross }
2229 1.1 ross aSign = extractFloat32Sign( a );
2230 1.1 ross bSign = extractFloat32Sign( b );
2231 1.1 ross if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2232 1.1 ross return ( a != b ) && ( aSign ^ ( a < b ) );
2233 1.1 ross
2234 1.1 ross }
2235 1.1 ross
2236 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2237 1.1 ross /*
2238 1.1 ross -------------------------------------------------------------------------------
2239 1.1 ross Returns 1 if the single-precision floating-point value `a' is equal to
2240 1.1 ross the corresponding value `b', and 0 otherwise. The invalid exception is
2241 1.1 ross raised if either operand is a NaN. Otherwise, the comparison is performed
2242 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2243 1.1 ross -------------------------------------------------------------------------------
2244 1.1 ross */
2245 1.1 ross flag float32_eq_signaling( float32 a, float32 b )
2246 1.1 ross {
2247 1.1 ross
2248 1.1 ross if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2249 1.1 ross || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2250 1.1 ross ) {
2251 1.1 ross float_raise( float_flag_invalid );
2252 1.1 ross return 0;
2253 1.1 ross }
2254 1.1 ross return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2255 1.1 ross
2256 1.1 ross }
2257 1.1 ross
2258 1.1 ross /*
2259 1.1 ross -------------------------------------------------------------------------------
2260 1.1 ross Returns 1 if the single-precision floating-point value `a' is less than or
2261 1.1 ross equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2262 1.1 ross cause an exception. Otherwise, the comparison is performed according to the
2263 1.1 ross IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2264 1.1 ross -------------------------------------------------------------------------------
2265 1.1 ross */
2266 1.1 ross flag float32_le_quiet( float32 a, float32 b )
2267 1.1 ross {
2268 1.1 ross flag aSign, bSign;
2269 1.1 ross
2270 1.1 ross if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2271 1.1 ross || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2272 1.1 ross ) {
2273 1.1 ross if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2274 1.1 ross float_raise( float_flag_invalid );
2275 1.1 ross }
2276 1.1 ross return 0;
2277 1.1 ross }
2278 1.1 ross aSign = extractFloat32Sign( a );
2279 1.1 ross bSign = extractFloat32Sign( b );
2280 1.1 ross if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2281 1.1 ross return ( a == b ) || ( aSign ^ ( a < b ) );
2282 1.1 ross
2283 1.1 ross }
2284 1.1 ross
2285 1.1 ross /*
2286 1.1 ross -------------------------------------------------------------------------------
2287 1.1 ross Returns 1 if the single-precision floating-point value `a' is less than
2288 1.1 ross the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2289 1.1 ross exception. Otherwise, the comparison is performed according to the IEC/IEEE
2290 1.1 ross Standard for Binary Floating-Point Arithmetic.
2291 1.1 ross -------------------------------------------------------------------------------
2292 1.1 ross */
2293 1.1 ross flag float32_lt_quiet( float32 a, float32 b )
2294 1.1 ross {
2295 1.1 ross flag aSign, bSign;
2296 1.1 ross
2297 1.1 ross if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2298 1.1 ross || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2299 1.1 ross ) {
2300 1.1 ross if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2301 1.1 ross float_raise( float_flag_invalid );
2302 1.1 ross }
2303 1.1 ross return 0;
2304 1.1 ross }
2305 1.1 ross aSign = extractFloat32Sign( a );
2306 1.1 ross bSign = extractFloat32Sign( b );
2307 1.1 ross if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2308 1.1 ross return ( a != b ) && ( aSign ^ ( a < b ) );
2309 1.1 ross
2310 1.1 ross }
2311 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
2312 1.1 ross
2313 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2314 1.1 ross /*
2315 1.1 ross -------------------------------------------------------------------------------
2316 1.1 ross Returns the result of converting the double-precision floating-point value
2317 1.1 ross `a' to the 32-bit two's complement integer format. The conversion is
2318 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
2319 1.1 ross Arithmetic---which means in particular that the conversion is rounded
2320 1.1 ross according to the current rounding mode. If `a' is a NaN, the largest
2321 1.1 ross positive integer is returned. Otherwise, if the conversion overflows, the
2322 1.1 ross largest integer with the same sign as `a' is returned.
2323 1.1 ross -------------------------------------------------------------------------------
2324 1.1 ross */
2325 1.1 ross int32 float64_to_int32( float64 a )
2326 1.1 ross {
2327 1.1 ross flag aSign;
2328 1.1 ross int16 aExp, shiftCount;
2329 1.1 ross bits64 aSig;
2330 1.1 ross
2331 1.1 ross aSig = extractFloat64Frac( a );
2332 1.1 ross aExp = extractFloat64Exp( a );
2333 1.1 ross aSign = extractFloat64Sign( a );
2334 1.1 ross if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2335 1.1 ross if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2336 1.1 ross shiftCount = 0x42C - aExp;
2337 1.1 ross if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2338 1.1 ross return roundAndPackInt32( aSign, aSig );
2339 1.1 ross
2340 1.1 ross }
2341 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
2342 1.1 ross
2343 1.1 ross /*
2344 1.1 ross -------------------------------------------------------------------------------
2345 1.1 ross Returns the result of converting the double-precision floating-point value
2346 1.1 ross `a' to the 32-bit two's complement integer format. The conversion is
2347 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
2348 1.1 ross Arithmetic, except that the conversion is always rounded toward zero.
2349 1.1 ross If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2350 1.1 ross the conversion overflows, the largest integer with the same sign as `a' is
2351 1.1 ross returned.
2352 1.1 ross -------------------------------------------------------------------------------
2353 1.1 ross */
2354 1.1 ross int32 float64_to_int32_round_to_zero( float64 a )
2355 1.1 ross {
2356 1.1 ross flag aSign;
2357 1.1 ross int16 aExp, shiftCount;
2358 1.1 ross bits64 aSig, savedASig;
2359 1.1 ross int32 z;
2360 1.1 ross
2361 1.1 ross aSig = extractFloat64Frac( a );
2362 1.1 ross aExp = extractFloat64Exp( a );
2363 1.1 ross aSign = extractFloat64Sign( a );
2364 1.1 ross if ( 0x41E < aExp ) {
2365 1.1 ross if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2366 1.1 ross goto invalid;
2367 1.1 ross }
2368 1.1 ross else if ( aExp < 0x3FF ) {
2369 1.1 ross if ( aExp || aSig ) float_set_inexact();
2370 1.1 ross return 0;
2371 1.1 ross }
2372 1.1 ross aSig |= LIT64( 0x0010000000000000 );
2373 1.1 ross shiftCount = 0x433 - aExp;
2374 1.1 ross savedASig = aSig;
2375 1.1 ross aSig >>= shiftCount;
2376 1.1 ross z = aSig;
2377 1.1 ross if ( aSign ) z = - z;
2378 1.1 ross if ( ( z < 0 ) ^ aSign ) {
2379 1.1 ross invalid:
2380 1.1 ross float_raise( float_flag_invalid );
2381 1.1 ross return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2382 1.1 ross }
2383 1.1 ross if ( ( aSig<<shiftCount ) != savedASig ) {
2384 1.1 ross float_set_inexact();
2385 1.1 ross }
2386 1.1 ross return z;
2387 1.1 ross
2388 1.1 ross }
2389 1.1 ross
2390 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2391 1.1 ross /*
2392 1.1 ross -------------------------------------------------------------------------------
2393 1.1 ross Returns the result of converting the double-precision floating-point value
2394 1.1 ross `a' to the 64-bit two's complement integer format. The conversion is
2395 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
2396 1.1 ross Arithmetic---which means in particular that the conversion is rounded
2397 1.1 ross according to the current rounding mode. If `a' is a NaN, the largest
2398 1.1 ross positive integer is returned. Otherwise, if the conversion overflows, the
2399 1.1 ross largest integer with the same sign as `a' is returned.
2400 1.1 ross -------------------------------------------------------------------------------
2401 1.1 ross */
2402 1.1 ross int64 float64_to_int64( float64 a )
2403 1.1 ross {
2404 1.1 ross flag aSign;
2405 1.1 ross int16 aExp, shiftCount;
2406 1.1 ross bits64 aSig, aSigExtra;
2407 1.1 ross
2408 1.1 ross aSig = extractFloat64Frac( a );
2409 1.1 ross aExp = extractFloat64Exp( a );
2410 1.1 ross aSign = extractFloat64Sign( a );
2411 1.1 ross if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2412 1.1 ross shiftCount = 0x433 - aExp;
2413 1.1 ross if ( shiftCount <= 0 ) {
2414 1.1 ross if ( 0x43E < aExp ) {
2415 1.1 ross float_raise( float_flag_invalid );
2416 1.1 ross if ( ! aSign
2417 1.1 ross || ( ( aExp == 0x7FF )
2418 1.1 ross && ( aSig != LIT64( 0x0010000000000000 ) ) )
2419 1.1 ross ) {
2420 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
2421 1.1 ross }
2422 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
2423 1.1 ross }
2424 1.1 ross aSigExtra = 0;
2425 1.1 ross aSig <<= - shiftCount;
2426 1.1 ross }
2427 1.1 ross else {
2428 1.1 ross shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2429 1.1 ross }
2430 1.1 ross return roundAndPackInt64( aSign, aSig, aSigExtra );
2431 1.1 ross
2432 1.1 ross }
2433 1.1 ross
2434 1.6 martin /* like above, but result is unsigned */
2435 1.6 martin uint64 float64_to_uint64( float64 a )
2436 1.6 martin {
2437 1.6 martin flag aSign;
2438 1.6 martin int16 aExp, shiftCount;
2439 1.6 martin bits64 aSig, aSigExtra;
2440 1.6 martin
2441 1.6 martin aSig = extractFloat64Frac( a );
2442 1.6 martin aExp = extractFloat64Exp( a );
2443 1.6 martin aSign = extractFloat64Sign( a );
2444 1.6 martin
2445 1.6 martin if (aSign) {
2446 1.6 martin return float64_to_int64(a);
2447 1.6 martin }
2448 1.6 martin
2449 1.6 martin if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2450 1.6 martin shiftCount = 0x433 - aExp;
2451 1.6 martin if ( shiftCount <= 0 ) {
2452 1.6 martin if ( 0x43E < aExp ) {
2453 1.6 martin float_raise( float_flag_invalid );
2454 1.6 martin if ( ! aSign
2455 1.6 martin || ( ( aExp == 0x7FF )
2456 1.6 martin && ( aSig != LIT64( 0x0010000000000000 ) ) )
2457 1.6 martin ) {
2458 1.6 martin return LIT64( 0x7FFFFFFFFFFFFFFF );
2459 1.6 martin }
2460 1.6 martin return (sbits64) LIT64( 0x8000000000000000 );
2461 1.6 martin }
2462 1.6 martin aSigExtra = 0;
2463 1.6 martin aSig <<= - shiftCount;
2464 1.6 martin }
2465 1.6 martin else {
2466 1.6 martin shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2467 1.6 martin }
2468 1.6 martin return roundAndPackUInt64( aSig, aSigExtra );
2469 1.6 martin
2470 1.6 martin }
2471 1.6 martin
2472 1.1 ross /*
2473 1.1 ross -------------------------------------------------------------------------------
2474 1.1 ross Returns the result of converting the double-precision floating-point value
2475 1.1 ross `a' to the 64-bit two's complement integer format. The conversion is
2476 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
2477 1.1 ross Arithmetic, except that the conversion is always rounded toward zero.
2478 1.1 ross If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2479 1.1 ross the conversion overflows, the largest integer with the same sign as `a' is
2480 1.1 ross returned.
2481 1.1 ross -------------------------------------------------------------------------------
2482 1.1 ross */
2483 1.1 ross int64 float64_to_int64_round_to_zero( float64 a )
2484 1.1 ross {
2485 1.1 ross flag aSign;
2486 1.1 ross int16 aExp, shiftCount;
2487 1.1 ross bits64 aSig;
2488 1.1 ross int64 z;
2489 1.1 ross
2490 1.1 ross aSig = extractFloat64Frac( a );
2491 1.1 ross aExp = extractFloat64Exp( a );
2492 1.1 ross aSign = extractFloat64Sign( a );
2493 1.1 ross if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2494 1.1 ross shiftCount = aExp - 0x433;
2495 1.1 ross if ( 0 <= shiftCount ) {
2496 1.1 ross if ( 0x43E <= aExp ) {
2497 1.1 ross if ( a != LIT64( 0xC3E0000000000000 ) ) {
2498 1.1 ross float_raise( float_flag_invalid );
2499 1.1 ross if ( ! aSign
2500 1.1 ross || ( ( aExp == 0x7FF )
2501 1.1 ross && ( aSig != LIT64( 0x0010000000000000 ) ) )
2502 1.1 ross ) {
2503 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
2504 1.1 ross }
2505 1.1 ross }
2506 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
2507 1.1 ross }
2508 1.1 ross z = aSig<<shiftCount;
2509 1.1 ross }
2510 1.1 ross else {
2511 1.1 ross if ( aExp < 0x3FE ) {
2512 1.1 ross if ( aExp | aSig ) float_set_inexact();
2513 1.1 ross return 0;
2514 1.1 ross }
2515 1.1 ross z = aSig>>( - shiftCount );
2516 1.1 ross if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2517 1.1 ross float_set_inexact();
2518 1.1 ross }
2519 1.1 ross }
2520 1.1 ross if ( aSign ) z = - z;
2521 1.1 ross return z;
2522 1.1 ross
2523 1.1 ross }
2524 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
2525 1.1 ross
2526 1.1 ross /*
2527 1.1 ross -------------------------------------------------------------------------------
2528 1.1 ross Returns the result of converting the double-precision floating-point value
2529 1.1 ross `a' to the single-precision floating-point format. The conversion is
2530 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
2531 1.1 ross Arithmetic.
2532 1.1 ross -------------------------------------------------------------------------------
2533 1.1 ross */
2534 1.1 ross float32 float64_to_float32( float64 a )
2535 1.1 ross {
2536 1.1 ross flag aSign;
2537 1.1 ross int16 aExp;
2538 1.1 ross bits64 aSig;
2539 1.1 ross bits32 zSig;
2540 1.1 ross
2541 1.1 ross aSig = extractFloat64Frac( a );
2542 1.1 ross aExp = extractFloat64Exp( a );
2543 1.1 ross aSign = extractFloat64Sign( a );
2544 1.1 ross if ( aExp == 0x7FF ) {
2545 1.1 ross if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2546 1.1 ross return packFloat32( aSign, 0xFF, 0 );
2547 1.1 ross }
2548 1.1 ross shift64RightJamming( aSig, 22, &aSig );
2549 1.1 ross zSig = aSig;
2550 1.1 ross if ( aExp || zSig ) {
2551 1.1 ross zSig |= 0x40000000;
2552 1.1 ross aExp -= 0x381;
2553 1.1 ross }
2554 1.1 ross return roundAndPackFloat32( aSign, aExp, zSig );
2555 1.1 ross
2556 1.1 ross }
2557 1.1 ross
2558 1.1 ross #ifdef FLOATX80
2559 1.1 ross
2560 1.1 ross /*
2561 1.1 ross -------------------------------------------------------------------------------
2562 1.1 ross Returns the result of converting the double-precision floating-point value
2563 1.1 ross `a' to the extended double-precision floating-point format. The conversion
2564 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
2565 1.1 ross Arithmetic.
2566 1.1 ross -------------------------------------------------------------------------------
2567 1.1 ross */
2568 1.1 ross floatx80 float64_to_floatx80( float64 a )
2569 1.1 ross {
2570 1.1 ross flag aSign;
2571 1.1 ross int16 aExp;
2572 1.1 ross bits64 aSig;
2573 1.1 ross
2574 1.1 ross aSig = extractFloat64Frac( a );
2575 1.1 ross aExp = extractFloat64Exp( a );
2576 1.1 ross aSign = extractFloat64Sign( a );
2577 1.1 ross if ( aExp == 0x7FF ) {
2578 1.1 ross if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2579 1.1 ross return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2580 1.1 ross }
2581 1.1 ross if ( aExp == 0 ) {
2582 1.1 ross if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2583 1.1 ross normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2584 1.1 ross }
2585 1.1 ross return
2586 1.1 ross packFloatx80(
2587 1.1 ross aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2588 1.1 ross
2589 1.1 ross }
2590 1.1 ross
2591 1.1 ross #endif
2592 1.1 ross
2593 1.1 ross #ifdef FLOAT128
2594 1.1 ross
2595 1.1 ross /*
2596 1.1 ross -------------------------------------------------------------------------------
2597 1.1 ross Returns the result of converting the double-precision floating-point value
2598 1.1 ross `a' to the quadruple-precision floating-point format. The conversion is
2599 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
2600 1.1 ross Arithmetic.
2601 1.1 ross -------------------------------------------------------------------------------
2602 1.1 ross */
2603 1.1 ross float128 float64_to_float128( float64 a )
2604 1.1 ross {
2605 1.1 ross flag aSign;
2606 1.1 ross int16 aExp;
2607 1.1 ross bits64 aSig, zSig0, zSig1;
2608 1.1 ross
2609 1.1 ross aSig = extractFloat64Frac( a );
2610 1.1 ross aExp = extractFloat64Exp( a );
2611 1.1 ross aSign = extractFloat64Sign( a );
2612 1.1 ross if ( aExp == 0x7FF ) {
2613 1.1 ross if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2614 1.1 ross return packFloat128( aSign, 0x7FFF, 0, 0 );
2615 1.1 ross }
2616 1.1 ross if ( aExp == 0 ) {
2617 1.1 ross if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2618 1.1 ross normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2619 1.1 ross --aExp;
2620 1.1 ross }
2621 1.1 ross shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2622 1.1 ross return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2623 1.1 ross
2624 1.1 ross }
2625 1.1 ross
2626 1.1 ross #endif
2627 1.1 ross
2628 1.1 ross #ifndef SOFTFLOAT_FOR_GCC
2629 1.1 ross /*
2630 1.1 ross -------------------------------------------------------------------------------
2631 1.1 ross Rounds the double-precision floating-point value `a' to an integer, and
2632 1.1 ross returns the result as a double-precision floating-point value. The
2633 1.1 ross operation is performed according to the IEC/IEEE Standard for Binary
2634 1.1 ross Floating-Point Arithmetic.
2635 1.1 ross -------------------------------------------------------------------------------
2636 1.1 ross */
2637 1.1 ross float64 float64_round_to_int( float64 a )
2638 1.1 ross {
2639 1.1 ross flag aSign;
2640 1.1 ross int16 aExp;
2641 1.1 ross bits64 lastBitMask, roundBitsMask;
2642 1.1 ross int8 roundingMode;
2643 1.1 ross float64 z;
2644 1.1 ross
2645 1.1 ross aExp = extractFloat64Exp( a );
2646 1.1 ross if ( 0x433 <= aExp ) {
2647 1.1 ross if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2648 1.1 ross return propagateFloat64NaN( a, a );
2649 1.1 ross }
2650 1.1 ross return a;
2651 1.1 ross }
2652 1.1 ross if ( aExp < 0x3FF ) {
2653 1.1 ross if ( (bits64) ( a<<1 ) == 0 ) return a;
2654 1.1 ross float_set_inexact();
2655 1.1 ross aSign = extractFloat64Sign( a );
2656 1.1 ross switch ( float_rounding_mode() ) {
2657 1.1 ross case float_round_nearest_even:
2658 1.1 ross if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2659 1.1 ross return packFloat64( aSign, 0x3FF, 0 );
2660 1.1 ross }
2661 1.1 ross break;
2662 1.1 ross case float_round_down:
2663 1.1 ross return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2664 1.1 ross case float_round_up:
2665 1.1 ross return
2666 1.1 ross aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2667 1.1 ross }
2668 1.1 ross return packFloat64( aSign, 0, 0 );
2669 1.1 ross }
2670 1.1 ross lastBitMask = 1;
2671 1.1 ross lastBitMask <<= 0x433 - aExp;
2672 1.1 ross roundBitsMask = lastBitMask - 1;
2673 1.1 ross z = a;
2674 1.1 ross roundingMode = float_rounding_mode();
2675 1.1 ross if ( roundingMode == float_round_nearest_even ) {
2676 1.1 ross z += lastBitMask>>1;
2677 1.1 ross if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2678 1.1 ross }
2679 1.1 ross else if ( roundingMode != float_round_to_zero ) {
2680 1.1 ross if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2681 1.1 ross z += roundBitsMask;
2682 1.1 ross }
2683 1.1 ross }
2684 1.1 ross z &= ~ roundBitsMask;
2685 1.1 ross if ( z != a ) float_set_inexact();
2686 1.1 ross return z;
2687 1.1 ross
2688 1.1 ross }
2689 1.1 ross #endif
2690 1.1 ross
2691 1.1 ross /*
2692 1.1 ross -------------------------------------------------------------------------------
2693 1.1 ross Returns the result of adding the absolute values of the double-precision
2694 1.1 ross floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2695 1.1 ross before being returned. `zSign' is ignored if the result is a NaN.
2696 1.1 ross The addition is performed according to the IEC/IEEE Standard for Binary
2697 1.1 ross Floating-Point Arithmetic.
2698 1.1 ross -------------------------------------------------------------------------------
2699 1.1 ross */
2700 1.1 ross static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2701 1.1 ross {
2702 1.1 ross int16 aExp, bExp, zExp;
2703 1.1 ross bits64 aSig, bSig, zSig;
2704 1.1 ross int16 expDiff;
2705 1.1 ross
2706 1.1 ross aSig = extractFloat64Frac( a );
2707 1.1 ross aExp = extractFloat64Exp( a );
2708 1.1 ross bSig = extractFloat64Frac( b );
2709 1.1 ross bExp = extractFloat64Exp( b );
2710 1.1 ross expDiff = aExp - bExp;
2711 1.1 ross aSig <<= 9;
2712 1.1 ross bSig <<= 9;
2713 1.1 ross if ( 0 < expDiff ) {
2714 1.1 ross if ( aExp == 0x7FF ) {
2715 1.1 ross if ( aSig ) return propagateFloat64NaN( a, b );
2716 1.1 ross return a;
2717 1.1 ross }
2718 1.1 ross if ( bExp == 0 ) {
2719 1.1 ross --expDiff;
2720 1.1 ross }
2721 1.1 ross else {
2722 1.1 ross bSig |= LIT64( 0x2000000000000000 );
2723 1.1 ross }
2724 1.1 ross shift64RightJamming( bSig, expDiff, &bSig );
2725 1.1 ross zExp = aExp;
2726 1.1 ross }
2727 1.1 ross else if ( expDiff < 0 ) {
2728 1.1 ross if ( bExp == 0x7FF ) {
2729 1.1 ross if ( bSig ) return propagateFloat64NaN( a, b );
2730 1.1 ross return packFloat64( zSign, 0x7FF, 0 );
2731 1.1 ross }
2732 1.1 ross if ( aExp == 0 ) {
2733 1.1 ross ++expDiff;
2734 1.1 ross }
2735 1.1 ross else {
2736 1.1 ross aSig |= LIT64( 0x2000000000000000 );
2737 1.1 ross }
2738 1.1 ross shift64RightJamming( aSig, - expDiff, &aSig );
2739 1.1 ross zExp = bExp;
2740 1.1 ross }
2741 1.1 ross else {
2742 1.1 ross if ( aExp == 0x7FF ) {
2743 1.1 ross if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2744 1.1 ross return a;
2745 1.1 ross }
2746 1.1 ross if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2747 1.1 ross zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2748 1.1 ross zExp = aExp;
2749 1.1 ross goto roundAndPack;
2750 1.1 ross }
2751 1.1 ross aSig |= LIT64( 0x2000000000000000 );
2752 1.1 ross zSig = ( aSig + bSig )<<1;
2753 1.1 ross --zExp;
2754 1.1 ross if ( (sbits64) zSig < 0 ) {
2755 1.1 ross zSig = aSig + bSig;
2756 1.1 ross ++zExp;
2757 1.1 ross }
2758 1.1 ross roundAndPack:
2759 1.1 ross return roundAndPackFloat64( zSign, zExp, zSig );
2760 1.1 ross
2761 1.1 ross }
2762 1.1 ross
2763 1.1 ross /*
2764 1.1 ross -------------------------------------------------------------------------------
2765 1.1 ross Returns the result of subtracting the absolute values of the double-
2766 1.1 ross precision floating-point values `a' and `b'. If `zSign' is 1, the
2767 1.1 ross difference is negated before being returned. `zSign' is ignored if the
2768 1.1 ross result is a NaN. The subtraction is performed according to the IEC/IEEE
2769 1.1 ross Standard for Binary Floating-Point Arithmetic.
2770 1.1 ross -------------------------------------------------------------------------------
2771 1.1 ross */
2772 1.1 ross static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2773 1.1 ross {
2774 1.1 ross int16 aExp, bExp, zExp;
2775 1.1 ross bits64 aSig, bSig, zSig;
2776 1.1 ross int16 expDiff;
2777 1.1 ross
2778 1.1 ross aSig = extractFloat64Frac( a );
2779 1.1 ross aExp = extractFloat64Exp( a );
2780 1.1 ross bSig = extractFloat64Frac( b );
2781 1.1 ross bExp = extractFloat64Exp( b );
2782 1.1 ross expDiff = aExp - bExp;
2783 1.1 ross aSig <<= 10;
2784 1.1 ross bSig <<= 10;
2785 1.1 ross if ( 0 < expDiff ) goto aExpBigger;
2786 1.1 ross if ( expDiff < 0 ) goto bExpBigger;
2787 1.1 ross if ( aExp == 0x7FF ) {
2788 1.1 ross if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2789 1.1 ross float_raise( float_flag_invalid );
2790 1.1 ross return float64_default_nan;
2791 1.1 ross }
2792 1.1 ross if ( aExp == 0 ) {
2793 1.1 ross aExp = 1;
2794 1.1 ross bExp = 1;
2795 1.1 ross }
2796 1.1 ross if ( bSig < aSig ) goto aBigger;
2797 1.1 ross if ( aSig < bSig ) goto bBigger;
2798 1.1 ross return packFloat64( float_rounding_mode() == float_round_down, 0, 0 );
2799 1.1 ross bExpBigger:
2800 1.1 ross if ( bExp == 0x7FF ) {
2801 1.1 ross if ( bSig ) return propagateFloat64NaN( a, b );
2802 1.1 ross return packFloat64( zSign ^ 1, 0x7FF, 0 );
2803 1.1 ross }
2804 1.1 ross if ( aExp == 0 ) {
2805 1.1 ross ++expDiff;
2806 1.1 ross }
2807 1.1 ross else {
2808 1.1 ross aSig |= LIT64( 0x4000000000000000 );
2809 1.1 ross }
2810 1.1 ross shift64RightJamming( aSig, - expDiff, &aSig );
2811 1.1 ross bSig |= LIT64( 0x4000000000000000 );
2812 1.1 ross bBigger:
2813 1.1 ross zSig = bSig - aSig;
2814 1.1 ross zExp = bExp;
2815 1.1 ross zSign ^= 1;
2816 1.1 ross goto normalizeRoundAndPack;
2817 1.1 ross aExpBigger:
2818 1.1 ross if ( aExp == 0x7FF ) {
2819 1.1 ross if ( aSig ) return propagateFloat64NaN( a, b );
2820 1.1 ross return a;
2821 1.1 ross }
2822 1.1 ross if ( bExp == 0 ) {
2823 1.1 ross --expDiff;
2824 1.1 ross }
2825 1.1 ross else {
2826 1.1 ross bSig |= LIT64( 0x4000000000000000 );
2827 1.1 ross }
2828 1.1 ross shift64RightJamming( bSig, expDiff, &bSig );
2829 1.1 ross aSig |= LIT64( 0x4000000000000000 );
2830 1.1 ross aBigger:
2831 1.1 ross zSig = aSig - bSig;
2832 1.1 ross zExp = aExp;
2833 1.1 ross normalizeRoundAndPack:
2834 1.1 ross --zExp;
2835 1.1 ross return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2836 1.1 ross
2837 1.1 ross }
2838 1.1 ross
2839 1.1 ross /*
2840 1.1 ross -------------------------------------------------------------------------------
2841 1.1 ross Returns the result of adding the double-precision floating-point values `a'
2842 1.1 ross and `b'. The operation is performed according to the IEC/IEEE Standard for
2843 1.1 ross Binary Floating-Point Arithmetic.
2844 1.1 ross -------------------------------------------------------------------------------
2845 1.1 ross */
2846 1.1 ross float64 float64_add( float64 a, float64 b )
2847 1.1 ross {
2848 1.1 ross flag aSign, bSign;
2849 1.1 ross
2850 1.1 ross aSign = extractFloat64Sign( a );
2851 1.1 ross bSign = extractFloat64Sign( b );
2852 1.1 ross if ( aSign == bSign ) {
2853 1.1 ross return addFloat64Sigs( a, b, aSign );
2854 1.1 ross }
2855 1.1 ross else {
2856 1.1 ross return subFloat64Sigs( a, b, aSign );
2857 1.1 ross }
2858 1.1 ross
2859 1.1 ross }
2860 1.1 ross
2861 1.1 ross /*
2862 1.1 ross -------------------------------------------------------------------------------
2863 1.1 ross Returns the result of subtracting the double-precision floating-point values
2864 1.1 ross `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2865 1.1 ross for Binary Floating-Point Arithmetic.
2866 1.1 ross -------------------------------------------------------------------------------
2867 1.1 ross */
2868 1.1 ross float64 float64_sub( float64 a, float64 b )
2869 1.1 ross {
2870 1.1 ross flag aSign, bSign;
2871 1.1 ross
2872 1.1 ross aSign = extractFloat64Sign( a );
2873 1.1 ross bSign = extractFloat64Sign( b );
2874 1.1 ross if ( aSign == bSign ) {
2875 1.1 ross return subFloat64Sigs( a, b, aSign );
2876 1.1 ross }
2877 1.1 ross else {
2878 1.1 ross return addFloat64Sigs( a, b, aSign );
2879 1.1 ross }
2880 1.1 ross
2881 1.1 ross }
2882 1.1 ross
2883 1.1 ross /*
2884 1.1 ross -------------------------------------------------------------------------------
2885 1.1 ross Returns the result of multiplying the double-precision floating-point values
2886 1.1 ross `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2887 1.1 ross for Binary Floating-Point Arithmetic.
2888 1.1 ross -------------------------------------------------------------------------------
2889 1.1 ross */
2890 1.1 ross float64 float64_mul( float64 a, float64 b )
2891 1.1 ross {
2892 1.1 ross flag aSign, bSign, zSign;
2893 1.1 ross int16 aExp, bExp, zExp;
2894 1.1 ross bits64 aSig, bSig, zSig0, zSig1;
2895 1.1 ross
2896 1.1 ross aSig = extractFloat64Frac( a );
2897 1.1 ross aExp = extractFloat64Exp( a );
2898 1.1 ross aSign = extractFloat64Sign( a );
2899 1.1 ross bSig = extractFloat64Frac( b );
2900 1.1 ross bExp = extractFloat64Exp( b );
2901 1.1 ross bSign = extractFloat64Sign( b );
2902 1.1 ross zSign = aSign ^ bSign;
2903 1.1 ross if ( aExp == 0x7FF ) {
2904 1.1 ross if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2905 1.1 ross return propagateFloat64NaN( a, b );
2906 1.1 ross }
2907 1.1 ross if ( ( bExp | bSig ) == 0 ) {
2908 1.1 ross float_raise( float_flag_invalid );
2909 1.1 ross return float64_default_nan;
2910 1.1 ross }
2911 1.1 ross return packFloat64( zSign, 0x7FF, 0 );
2912 1.1 ross }
2913 1.1 ross if ( bExp == 0x7FF ) {
2914 1.1 ross if ( bSig ) return propagateFloat64NaN( a, b );
2915 1.1 ross if ( ( aExp | aSig ) == 0 ) {
2916 1.1 ross float_raise( float_flag_invalid );
2917 1.1 ross return float64_default_nan;
2918 1.1 ross }
2919 1.1 ross return packFloat64( zSign, 0x7FF, 0 );
2920 1.1 ross }
2921 1.1 ross if ( aExp == 0 ) {
2922 1.1 ross if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2923 1.1 ross normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2924 1.1 ross }
2925 1.1 ross if ( bExp == 0 ) {
2926 1.1 ross if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2927 1.1 ross normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2928 1.1 ross }
2929 1.1 ross zExp = aExp + bExp - 0x3FF;
2930 1.1 ross aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2931 1.1 ross bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2932 1.1 ross mul64To128( aSig, bSig, &zSig0, &zSig1 );
2933 1.1 ross zSig0 |= ( zSig1 != 0 );
2934 1.1 ross if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2935 1.1 ross zSig0 <<= 1;
2936 1.1 ross --zExp;
2937 1.1 ross }
2938 1.1 ross return roundAndPackFloat64( zSign, zExp, zSig0 );
2939 1.1 ross
2940 1.1 ross }
2941 1.1 ross
2942 1.1 ross /*
2943 1.1 ross -------------------------------------------------------------------------------
2944 1.1 ross Returns the result of dividing the double-precision floating-point value `a'
2945 1.1 ross by the corresponding value `b'. The operation is performed according to
2946 1.1 ross the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2947 1.1 ross -------------------------------------------------------------------------------
2948 1.1 ross */
2949 1.1 ross float64 float64_div( float64 a, float64 b )
2950 1.1 ross {
2951 1.1 ross flag aSign, bSign, zSign;
2952 1.1 ross int16 aExp, bExp, zExp;
2953 1.1 ross bits64 aSig, bSig, zSig;
2954 1.1 ross bits64 rem0, rem1;
2955 1.1 ross bits64 term0, term1;
2956 1.1 ross
2957 1.1 ross aSig = extractFloat64Frac( a );
2958 1.1 ross aExp = extractFloat64Exp( a );
2959 1.1 ross aSign = extractFloat64Sign( a );
2960 1.1 ross bSig = extractFloat64Frac( b );
2961 1.1 ross bExp = extractFloat64Exp( b );
2962 1.1 ross bSign = extractFloat64Sign( b );
2963 1.1 ross zSign = aSign ^ bSign;
2964 1.1 ross if ( aExp == 0x7FF ) {
2965 1.1 ross if ( aSig ) return propagateFloat64NaN( a, b );
2966 1.1 ross if ( bExp == 0x7FF ) {
2967 1.1 ross if ( bSig ) return propagateFloat64NaN( a, b );
2968 1.1 ross float_raise( float_flag_invalid );
2969 1.1 ross return float64_default_nan;
2970 1.1 ross }
2971 1.1 ross return packFloat64( zSign, 0x7FF, 0 );
2972 1.1 ross }
2973 1.1 ross if ( bExp == 0x7FF ) {
2974 1.1 ross if ( bSig ) return propagateFloat64NaN( a, b );
2975 1.1 ross return packFloat64( zSign, 0, 0 );
2976 1.1 ross }
2977 1.1 ross if ( bExp == 0 ) {
2978 1.1 ross if ( bSig == 0 ) {
2979 1.1 ross if ( ( aExp | aSig ) == 0 ) {
2980 1.1 ross float_raise( float_flag_invalid );
2981 1.1 ross return float64_default_nan;
2982 1.1 ross }
2983 1.1 ross float_raise( float_flag_divbyzero );
2984 1.1 ross return packFloat64( zSign, 0x7FF, 0 );
2985 1.1 ross }
2986 1.1 ross normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2987 1.1 ross }
2988 1.1 ross if ( aExp == 0 ) {
2989 1.1 ross if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2990 1.1 ross normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2991 1.1 ross }
2992 1.1 ross zExp = aExp - bExp + 0x3FD;
2993 1.1 ross aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2994 1.1 ross bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2995 1.1 ross if ( bSig <= ( aSig + aSig ) ) {
2996 1.1 ross aSig >>= 1;
2997 1.1 ross ++zExp;
2998 1.1 ross }
2999 1.1 ross zSig = estimateDiv128To64( aSig, 0, bSig );
3000 1.1 ross if ( ( zSig & 0x1FF ) <= 2 ) {
3001 1.1 ross mul64To128( bSig, zSig, &term0, &term1 );
3002 1.1 ross sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3003 1.1 ross while ( (sbits64) rem0 < 0 ) {
3004 1.1 ross --zSig;
3005 1.1 ross add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3006 1.1 ross }
3007 1.1 ross zSig |= ( rem1 != 0 );
3008 1.1 ross }
3009 1.1 ross return roundAndPackFloat64( zSign, zExp, zSig );
3010 1.1 ross
3011 1.1 ross }
3012 1.1 ross
3013 1.1 ross #ifndef SOFTFLOAT_FOR_GCC
3014 1.1 ross /*
3015 1.1 ross -------------------------------------------------------------------------------
3016 1.1 ross Returns the remainder of the double-precision floating-point value `a'
3017 1.1 ross with respect to the corresponding value `b'. The operation is performed
3018 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3019 1.1 ross -------------------------------------------------------------------------------
3020 1.1 ross */
3021 1.1 ross float64 float64_rem( float64 a, float64 b )
3022 1.1 ross {
3023 1.5 christos flag aSign, bSign __unused, zSign;
3024 1.1 ross int16 aExp, bExp, expDiff;
3025 1.1 ross bits64 aSig, bSig;
3026 1.1 ross bits64 q, alternateASig;
3027 1.1 ross sbits64 sigMean;
3028 1.1 ross
3029 1.1 ross aSig = extractFloat64Frac( a );
3030 1.1 ross aExp = extractFloat64Exp( a );
3031 1.1 ross aSign = extractFloat64Sign( a );
3032 1.1 ross bSig = extractFloat64Frac( b );
3033 1.1 ross bExp = extractFloat64Exp( b );
3034 1.1 ross bSign = extractFloat64Sign( b );
3035 1.1 ross if ( aExp == 0x7FF ) {
3036 1.1 ross if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3037 1.1 ross return propagateFloat64NaN( a, b );
3038 1.1 ross }
3039 1.1 ross float_raise( float_flag_invalid );
3040 1.1 ross return float64_default_nan;
3041 1.1 ross }
3042 1.1 ross if ( bExp == 0x7FF ) {
3043 1.1 ross if ( bSig ) return propagateFloat64NaN( a, b );
3044 1.1 ross return a;
3045 1.1 ross }
3046 1.1 ross if ( bExp == 0 ) {
3047 1.1 ross if ( bSig == 0 ) {
3048 1.1 ross float_raise( float_flag_invalid );
3049 1.1 ross return float64_default_nan;
3050 1.1 ross }
3051 1.1 ross normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3052 1.1 ross }
3053 1.1 ross if ( aExp == 0 ) {
3054 1.1 ross if ( aSig == 0 ) return a;
3055 1.1 ross normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3056 1.1 ross }
3057 1.1 ross expDiff = aExp - bExp;
3058 1.1 ross aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3059 1.1 ross bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3060 1.1 ross if ( expDiff < 0 ) {
3061 1.1 ross if ( expDiff < -1 ) return a;
3062 1.1 ross aSig >>= 1;
3063 1.1 ross }
3064 1.1 ross q = ( bSig <= aSig );
3065 1.1 ross if ( q ) aSig -= bSig;
3066 1.1 ross expDiff -= 64;
3067 1.1 ross while ( 0 < expDiff ) {
3068 1.1 ross q = estimateDiv128To64( aSig, 0, bSig );
3069 1.1 ross q = ( 2 < q ) ? q - 2 : 0;
3070 1.1 ross aSig = - ( ( bSig>>2 ) * q );
3071 1.1 ross expDiff -= 62;
3072 1.1 ross }
3073 1.1 ross expDiff += 64;
3074 1.1 ross if ( 0 < expDiff ) {
3075 1.1 ross q = estimateDiv128To64( aSig, 0, bSig );
3076 1.1 ross q = ( 2 < q ) ? q - 2 : 0;
3077 1.1 ross q >>= 64 - expDiff;
3078 1.1 ross bSig >>= 2;
3079 1.1 ross aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3080 1.1 ross }
3081 1.1 ross else {
3082 1.1 ross aSig >>= 2;
3083 1.1 ross bSig >>= 2;
3084 1.1 ross }
3085 1.1 ross do {
3086 1.1 ross alternateASig = aSig;
3087 1.1 ross ++q;
3088 1.1 ross aSig -= bSig;
3089 1.1 ross } while ( 0 <= (sbits64) aSig );
3090 1.1 ross sigMean = aSig + alternateASig;
3091 1.1 ross if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3092 1.1 ross aSig = alternateASig;
3093 1.1 ross }
3094 1.1 ross zSign = ( (sbits64) aSig < 0 );
3095 1.1 ross if ( zSign ) aSig = - aSig;
3096 1.1 ross return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3097 1.1 ross
3098 1.1 ross }
3099 1.1 ross
3100 1.1 ross /*
3101 1.1 ross -------------------------------------------------------------------------------
3102 1.1 ross Returns the square root of the double-precision floating-point value `a'.
3103 1.1 ross The operation is performed according to the IEC/IEEE Standard for Binary
3104 1.1 ross Floating-Point Arithmetic.
3105 1.1 ross -------------------------------------------------------------------------------
3106 1.1 ross */
3107 1.1 ross float64 float64_sqrt( float64 a )
3108 1.1 ross {
3109 1.1 ross flag aSign;
3110 1.1 ross int16 aExp, zExp;
3111 1.1 ross bits64 aSig, zSig, doubleZSig;
3112 1.1 ross bits64 rem0, rem1, term0, term1;
3113 1.1 ross
3114 1.1 ross aSig = extractFloat64Frac( a );
3115 1.1 ross aExp = extractFloat64Exp( a );
3116 1.1 ross aSign = extractFloat64Sign( a );
3117 1.1 ross if ( aExp == 0x7FF ) {
3118 1.1 ross if ( aSig ) return propagateFloat64NaN( a, a );
3119 1.1 ross if ( ! aSign ) return a;
3120 1.1 ross float_raise( float_flag_invalid );
3121 1.1 ross return float64_default_nan;
3122 1.1 ross }
3123 1.1 ross if ( aSign ) {
3124 1.1 ross if ( ( aExp | aSig ) == 0 ) return a;
3125 1.1 ross float_raise( float_flag_invalid );
3126 1.1 ross return float64_default_nan;
3127 1.1 ross }
3128 1.1 ross if ( aExp == 0 ) {
3129 1.1 ross if ( aSig == 0 ) return 0;
3130 1.1 ross normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3131 1.1 ross }
3132 1.1 ross zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3133 1.1 ross aSig |= LIT64( 0x0010000000000000 );
3134 1.1 ross zSig = estimateSqrt32( aExp, aSig>>21 );
3135 1.1 ross aSig <<= 9 - ( aExp & 1 );
3136 1.1 ross zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3137 1.1 ross if ( ( zSig & 0x1FF ) <= 5 ) {
3138 1.1 ross doubleZSig = zSig<<1;
3139 1.1 ross mul64To128( zSig, zSig, &term0, &term1 );
3140 1.1 ross sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3141 1.1 ross while ( (sbits64) rem0 < 0 ) {
3142 1.1 ross --zSig;
3143 1.1 ross doubleZSig -= 2;
3144 1.1 ross add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3145 1.1 ross }
3146 1.1 ross zSig |= ( ( rem0 | rem1 ) != 0 );
3147 1.1 ross }
3148 1.1 ross return roundAndPackFloat64( 0, zExp, zSig );
3149 1.1 ross
3150 1.1 ross }
3151 1.1 ross #endif
3152 1.1 ross
3153 1.1 ross /*
3154 1.1 ross -------------------------------------------------------------------------------
3155 1.1 ross Returns 1 if the double-precision floating-point value `a' is equal to the
3156 1.1 ross corresponding value `b', and 0 otherwise. The comparison is performed
3157 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3158 1.1 ross -------------------------------------------------------------------------------
3159 1.1 ross */
3160 1.1 ross flag float64_eq( float64 a, float64 b )
3161 1.1 ross {
3162 1.1 ross
3163 1.1 ross if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3164 1.1 ross || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3165 1.1 ross ) {
3166 1.1 ross if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3167 1.1 ross float_raise( float_flag_invalid );
3168 1.1 ross }
3169 1.1 ross return 0;
3170 1.1 ross }
3171 1.1 ross return ( a == b ) ||
3172 1.1 ross ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3173 1.1 ross
3174 1.1 ross }
3175 1.1 ross
3176 1.1 ross /*
3177 1.1 ross -------------------------------------------------------------------------------
3178 1.1 ross Returns 1 if the double-precision floating-point value `a' is less than or
3179 1.1 ross equal to the corresponding value `b', and 0 otherwise. The comparison is
3180 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
3181 1.1 ross Arithmetic.
3182 1.1 ross -------------------------------------------------------------------------------
3183 1.1 ross */
3184 1.1 ross flag float64_le( float64 a, float64 b )
3185 1.1 ross {
3186 1.1 ross flag aSign, bSign;
3187 1.1 ross
3188 1.1 ross if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3189 1.1 ross || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3190 1.1 ross ) {
3191 1.1 ross float_raise( float_flag_invalid );
3192 1.1 ross return 0;
3193 1.1 ross }
3194 1.1 ross aSign = extractFloat64Sign( a );
3195 1.1 ross bSign = extractFloat64Sign( b );
3196 1.1 ross if ( aSign != bSign )
3197 1.1 ross return aSign ||
3198 1.1 ross ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3199 1.1 ross 0 );
3200 1.1 ross return ( a == b ) ||
3201 1.1 ross ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3202 1.1 ross
3203 1.1 ross }
3204 1.1 ross
3205 1.1 ross /*
3206 1.1 ross -------------------------------------------------------------------------------
3207 1.1 ross Returns 1 if the double-precision floating-point value `a' is less than
3208 1.1 ross the corresponding value `b', and 0 otherwise. The comparison is performed
3209 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3210 1.1 ross -------------------------------------------------------------------------------
3211 1.1 ross */
3212 1.1 ross flag float64_lt( float64 a, float64 b )
3213 1.1 ross {
3214 1.1 ross flag aSign, bSign;
3215 1.1 ross
3216 1.1 ross if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3217 1.1 ross || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3218 1.1 ross ) {
3219 1.1 ross float_raise( float_flag_invalid );
3220 1.1 ross return 0;
3221 1.1 ross }
3222 1.1 ross aSign = extractFloat64Sign( a );
3223 1.1 ross bSign = extractFloat64Sign( b );
3224 1.1 ross if ( aSign != bSign )
3225 1.1 ross return aSign &&
3226 1.1 ross ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3227 1.1 ross 0 );
3228 1.1 ross return ( a != b ) &&
3229 1.1 ross ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3230 1.1 ross
3231 1.1 ross }
3232 1.1 ross
3233 1.1 ross #ifndef SOFTFLOAT_FOR_GCC
3234 1.1 ross /*
3235 1.1 ross -------------------------------------------------------------------------------
3236 1.1 ross Returns 1 if the double-precision floating-point value `a' is equal to the
3237 1.1 ross corresponding value `b', and 0 otherwise. The invalid exception is raised
3238 1.1 ross if either operand is a NaN. Otherwise, the comparison is performed
3239 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3240 1.1 ross -------------------------------------------------------------------------------
3241 1.1 ross */
3242 1.1 ross flag float64_eq_signaling( float64 a, float64 b )
3243 1.1 ross {
3244 1.1 ross
3245 1.1 ross if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3246 1.1 ross || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3247 1.1 ross ) {
3248 1.1 ross float_raise( float_flag_invalid );
3249 1.1 ross return 0;
3250 1.1 ross }
3251 1.1 ross return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3252 1.1 ross
3253 1.1 ross }
3254 1.1 ross
3255 1.1 ross /*
3256 1.1 ross -------------------------------------------------------------------------------
3257 1.1 ross Returns 1 if the double-precision floating-point value `a' is less than or
3258 1.1 ross equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3259 1.1 ross cause an exception. Otherwise, the comparison is performed according to the
3260 1.1 ross IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3261 1.1 ross -------------------------------------------------------------------------------
3262 1.1 ross */
3263 1.1 ross flag float64_le_quiet( float64 a, float64 b )
3264 1.1 ross {
3265 1.1 ross flag aSign, bSign;
3266 1.1 ross
3267 1.1 ross if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3268 1.1 ross || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3269 1.1 ross ) {
3270 1.1 ross if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3271 1.1 ross float_raise( float_flag_invalid );
3272 1.1 ross }
3273 1.1 ross return 0;
3274 1.1 ross }
3275 1.1 ross aSign = extractFloat64Sign( a );
3276 1.1 ross bSign = extractFloat64Sign( b );
3277 1.1 ross if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3278 1.1 ross return ( a == b ) || ( aSign ^ ( a < b ) );
3279 1.1 ross
3280 1.1 ross }
3281 1.1 ross
3282 1.1 ross /*
3283 1.1 ross -------------------------------------------------------------------------------
3284 1.1 ross Returns 1 if the double-precision floating-point value `a' is less than
3285 1.1 ross the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3286 1.1 ross exception. Otherwise, the comparison is performed according to the IEC/IEEE
3287 1.1 ross Standard for Binary Floating-Point Arithmetic.
3288 1.1 ross -------------------------------------------------------------------------------
3289 1.1 ross */
3290 1.1 ross flag float64_lt_quiet( float64 a, float64 b )
3291 1.1 ross {
3292 1.1 ross flag aSign, bSign;
3293 1.1 ross
3294 1.1 ross if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3295 1.1 ross || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3296 1.1 ross ) {
3297 1.1 ross if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3298 1.1 ross float_raise( float_flag_invalid );
3299 1.1 ross }
3300 1.1 ross return 0;
3301 1.1 ross }
3302 1.1 ross aSign = extractFloat64Sign( a );
3303 1.1 ross bSign = extractFloat64Sign( b );
3304 1.1 ross if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3305 1.1 ross return ( a != b ) && ( aSign ^ ( a < b ) );
3306 1.1 ross
3307 1.1 ross }
3308 1.1 ross #endif
3309 1.1 ross
3310 1.1 ross #ifdef FLOATX80
3311 1.1 ross
3312 1.1 ross /*
3313 1.1 ross -------------------------------------------------------------------------------
3314 1.1 ross Returns the result of converting the extended double-precision floating-
3315 1.1 ross point value `a' to the 32-bit two's complement integer format. The
3316 1.1 ross conversion is performed according to the IEC/IEEE Standard for Binary
3317 1.1 ross Floating-Point Arithmetic---which means in particular that the conversion
3318 1.1 ross is rounded according to the current rounding mode. If `a' is a NaN, the
3319 1.1 ross largest positive integer is returned. Otherwise, if the conversion
3320 1.1 ross overflows, the largest integer with the same sign as `a' is returned.
3321 1.1 ross -------------------------------------------------------------------------------
3322 1.1 ross */
3323 1.1 ross int32 floatx80_to_int32( floatx80 a )
3324 1.1 ross {
3325 1.1 ross flag aSign;
3326 1.1 ross int32 aExp, shiftCount;
3327 1.1 ross bits64 aSig;
3328 1.1 ross
3329 1.1 ross aSig = extractFloatx80Frac( a );
3330 1.1 ross aExp = extractFloatx80Exp( a );
3331 1.1 ross aSign = extractFloatx80Sign( a );
3332 1.1 ross if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3333 1.1 ross shiftCount = 0x4037 - aExp;
3334 1.1 ross if ( shiftCount <= 0 ) shiftCount = 1;
3335 1.1 ross shift64RightJamming( aSig, shiftCount, &aSig );
3336 1.1 ross return roundAndPackInt32( aSign, aSig );
3337 1.1 ross
3338 1.1 ross }
3339 1.1 ross
3340 1.1 ross /*
3341 1.1 ross -------------------------------------------------------------------------------
3342 1.1 ross Returns the result of converting the extended double-precision floating-
3343 1.1 ross point value `a' to the 32-bit two's complement integer format. The
3344 1.1 ross conversion is performed according to the IEC/IEEE Standard for Binary
3345 1.1 ross Floating-Point Arithmetic, except that the conversion is always rounded
3346 1.1 ross toward zero. If `a' is a NaN, the largest positive integer is returned.
3347 1.1 ross Otherwise, if the conversion overflows, the largest integer with the same
3348 1.1 ross sign as `a' is returned.
3349 1.1 ross -------------------------------------------------------------------------------
3350 1.1 ross */
3351 1.1 ross int32 floatx80_to_int32_round_to_zero( floatx80 a )
3352 1.1 ross {
3353 1.1 ross flag aSign;
3354 1.1 ross int32 aExp, shiftCount;
3355 1.1 ross bits64 aSig, savedASig;
3356 1.1 ross int32 z;
3357 1.1 ross
3358 1.1 ross aSig = extractFloatx80Frac( a );
3359 1.1 ross aExp = extractFloatx80Exp( a );
3360 1.1 ross aSign = extractFloatx80Sign( a );
3361 1.1 ross if ( 0x401E < aExp ) {
3362 1.1 ross if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3363 1.1 ross goto invalid;
3364 1.1 ross }
3365 1.1 ross else if ( aExp < 0x3FFF ) {
3366 1.1 ross if ( aExp || aSig ) float_set_inexact();
3367 1.1 ross return 0;
3368 1.1 ross }
3369 1.1 ross shiftCount = 0x403E - aExp;
3370 1.1 ross savedASig = aSig;
3371 1.1 ross aSig >>= shiftCount;
3372 1.1 ross z = aSig;
3373 1.1 ross if ( aSign ) z = - z;
3374 1.1 ross if ( ( z < 0 ) ^ aSign ) {
3375 1.1 ross invalid:
3376 1.1 ross float_raise( float_flag_invalid );
3377 1.1 ross return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3378 1.1 ross }
3379 1.1 ross if ( ( aSig<<shiftCount ) != savedASig ) {
3380 1.1 ross float_set_inexact();
3381 1.1 ross }
3382 1.1 ross return z;
3383 1.1 ross
3384 1.1 ross }
3385 1.1 ross
3386 1.1 ross /*
3387 1.1 ross -------------------------------------------------------------------------------
3388 1.1 ross Returns the result of converting the extended double-precision floating-
3389 1.1 ross point value `a' to the 64-bit two's complement integer format. The
3390 1.1 ross conversion is performed according to the IEC/IEEE Standard for Binary
3391 1.1 ross Floating-Point Arithmetic---which means in particular that the conversion
3392 1.1 ross is rounded according to the current rounding mode. If `a' is a NaN,
3393 1.1 ross the largest positive integer is returned. Otherwise, if the conversion
3394 1.1 ross overflows, the largest integer with the same sign as `a' is returned.
3395 1.1 ross -------------------------------------------------------------------------------
3396 1.1 ross */
3397 1.1 ross int64 floatx80_to_int64( floatx80 a )
3398 1.1 ross {
3399 1.1 ross flag aSign;
3400 1.1 ross int32 aExp, shiftCount;
3401 1.1 ross bits64 aSig, aSigExtra;
3402 1.1 ross
3403 1.1 ross aSig = extractFloatx80Frac( a );
3404 1.1 ross aExp = extractFloatx80Exp( a );
3405 1.1 ross aSign = extractFloatx80Sign( a );
3406 1.1 ross shiftCount = 0x403E - aExp;
3407 1.1 ross if ( shiftCount <= 0 ) {
3408 1.1 ross if ( shiftCount ) {
3409 1.1 ross float_raise( float_flag_invalid );
3410 1.1 ross if ( ! aSign
3411 1.1 ross || ( ( aExp == 0x7FFF )
3412 1.1 ross && ( aSig != LIT64( 0x8000000000000000 ) ) )
3413 1.1 ross ) {
3414 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
3415 1.1 ross }
3416 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
3417 1.1 ross }
3418 1.1 ross aSigExtra = 0;
3419 1.1 ross }
3420 1.1 ross else {
3421 1.1 ross shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3422 1.1 ross }
3423 1.1 ross return roundAndPackInt64( aSign, aSig, aSigExtra );
3424 1.1 ross
3425 1.1 ross }
3426 1.1 ross
3427 1.1 ross /*
3428 1.1 ross -------------------------------------------------------------------------------
3429 1.1 ross Returns the result of converting the extended double-precision floating-
3430 1.1 ross point value `a' to the 64-bit two's complement integer format. The
3431 1.1 ross conversion is performed according to the IEC/IEEE Standard for Binary
3432 1.1 ross Floating-Point Arithmetic, except that the conversion is always rounded
3433 1.1 ross toward zero. If `a' is a NaN, the largest positive integer is returned.
3434 1.1 ross Otherwise, if the conversion overflows, the largest integer with the same
3435 1.1 ross sign as `a' is returned.
3436 1.1 ross -------------------------------------------------------------------------------
3437 1.1 ross */
3438 1.1 ross int64 floatx80_to_int64_round_to_zero( floatx80 a )
3439 1.1 ross {
3440 1.1 ross flag aSign;
3441 1.1 ross int32 aExp, shiftCount;
3442 1.1 ross bits64 aSig;
3443 1.1 ross int64 z;
3444 1.1 ross
3445 1.1 ross aSig = extractFloatx80Frac( a );
3446 1.1 ross aExp = extractFloatx80Exp( a );
3447 1.1 ross aSign = extractFloatx80Sign( a );
3448 1.1 ross shiftCount = aExp - 0x403E;
3449 1.1 ross if ( 0 <= shiftCount ) {
3450 1.1 ross aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3451 1.1 ross if ( ( a.high != 0xC03E ) || aSig ) {
3452 1.1 ross float_raise( float_flag_invalid );
3453 1.1 ross if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3454 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
3455 1.1 ross }
3456 1.1 ross }
3457 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
3458 1.1 ross }
3459 1.1 ross else if ( aExp < 0x3FFF ) {
3460 1.1 ross if ( aExp | aSig ) float_set_inexact();
3461 1.1 ross return 0;
3462 1.1 ross }
3463 1.1 ross z = aSig>>( - shiftCount );
3464 1.1 ross if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3465 1.1 ross float_set_inexact();
3466 1.1 ross }
3467 1.1 ross if ( aSign ) z = - z;
3468 1.1 ross return z;
3469 1.1 ross
3470 1.1 ross }
3471 1.1 ross
3472 1.1 ross /*
3473 1.1 ross -------------------------------------------------------------------------------
3474 1.1 ross Returns the result of converting the extended double-precision floating-
3475 1.1 ross point value `a' to the single-precision floating-point format. The
3476 1.1 ross conversion is performed according to the IEC/IEEE Standard for Binary
3477 1.1 ross Floating-Point Arithmetic.
3478 1.1 ross -------------------------------------------------------------------------------
3479 1.1 ross */
3480 1.1 ross float32 floatx80_to_float32( floatx80 a )
3481 1.1 ross {
3482 1.1 ross flag aSign;
3483 1.1 ross int32 aExp;
3484 1.1 ross bits64 aSig;
3485 1.1 ross
3486 1.1 ross aSig = extractFloatx80Frac( a );
3487 1.1 ross aExp = extractFloatx80Exp( a );
3488 1.1 ross aSign = extractFloatx80Sign( a );
3489 1.1 ross if ( aExp == 0x7FFF ) {
3490 1.1 ross if ( (bits64) ( aSig<<1 ) ) {
3491 1.1 ross return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3492 1.1 ross }
3493 1.1 ross return packFloat32( aSign, 0xFF, 0 );
3494 1.1 ross }
3495 1.1 ross shift64RightJamming( aSig, 33, &aSig );
3496 1.1 ross if ( aExp || aSig ) aExp -= 0x3F81;
3497 1.1 ross return roundAndPackFloat32( aSign, aExp, aSig );
3498 1.1 ross
3499 1.1 ross }
3500 1.1 ross
3501 1.1 ross /*
3502 1.1 ross -------------------------------------------------------------------------------
3503 1.1 ross Returns the result of converting the extended double-precision floating-
3504 1.1 ross point value `a' to the double-precision floating-point format. The
3505 1.1 ross conversion is performed according to the IEC/IEEE Standard for Binary
3506 1.1 ross Floating-Point Arithmetic.
3507 1.1 ross -------------------------------------------------------------------------------
3508 1.1 ross */
3509 1.1 ross float64 floatx80_to_float64( floatx80 a )
3510 1.1 ross {
3511 1.1 ross flag aSign;
3512 1.1 ross int32 aExp;
3513 1.1 ross bits64 aSig, zSig;
3514 1.1 ross
3515 1.1 ross aSig = extractFloatx80Frac( a );
3516 1.1 ross aExp = extractFloatx80Exp( a );
3517 1.1 ross aSign = extractFloatx80Sign( a );
3518 1.1 ross if ( aExp == 0x7FFF ) {
3519 1.1 ross if ( (bits64) ( aSig<<1 ) ) {
3520 1.1 ross return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3521 1.1 ross }
3522 1.1 ross return packFloat64( aSign, 0x7FF, 0 );
3523 1.1 ross }
3524 1.1 ross shift64RightJamming( aSig, 1, &zSig );
3525 1.1 ross if ( aExp || aSig ) aExp -= 0x3C01;
3526 1.1 ross return roundAndPackFloat64( aSign, aExp, zSig );
3527 1.1 ross
3528 1.1 ross }
3529 1.1 ross
3530 1.1 ross #ifdef FLOAT128
3531 1.1 ross
3532 1.1 ross /*
3533 1.1 ross -------------------------------------------------------------------------------
3534 1.1 ross Returns the result of converting the extended double-precision floating-
3535 1.1 ross point value `a' to the quadruple-precision floating-point format. The
3536 1.1 ross conversion is performed according to the IEC/IEEE Standard for Binary
3537 1.1 ross Floating-Point Arithmetic.
3538 1.1 ross -------------------------------------------------------------------------------
3539 1.1 ross */
3540 1.1 ross float128 floatx80_to_float128( floatx80 a )
3541 1.1 ross {
3542 1.1 ross flag aSign;
3543 1.1 ross int16 aExp;
3544 1.1 ross bits64 aSig, zSig0, zSig1;
3545 1.1 ross
3546 1.1 ross aSig = extractFloatx80Frac( a );
3547 1.1 ross aExp = extractFloatx80Exp( a );
3548 1.1 ross aSign = extractFloatx80Sign( a );
3549 1.1 ross if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3550 1.1 ross return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3551 1.1 ross }
3552 1.1 ross shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3553 1.1 ross return packFloat128( aSign, aExp, zSig0, zSig1 );
3554 1.1 ross
3555 1.1 ross }
3556 1.1 ross
3557 1.1 ross #endif
3558 1.1 ross
3559 1.1 ross /*
3560 1.1 ross -------------------------------------------------------------------------------
3561 1.1 ross Rounds the extended double-precision floating-point value `a' to an integer,
3562 1.1 ross and returns the result as an extended quadruple-precision floating-point
3563 1.1 ross value. The operation is performed according to the IEC/IEEE Standard for
3564 1.1 ross Binary Floating-Point Arithmetic.
3565 1.1 ross -------------------------------------------------------------------------------
3566 1.1 ross */
3567 1.1 ross floatx80 floatx80_round_to_int( floatx80 a )
3568 1.1 ross {
3569 1.1 ross flag aSign;
3570 1.1 ross int32 aExp;
3571 1.1 ross bits64 lastBitMask, roundBitsMask;
3572 1.1 ross int8 roundingMode;
3573 1.1 ross floatx80 z;
3574 1.1 ross
3575 1.1 ross aExp = extractFloatx80Exp( a );
3576 1.1 ross if ( 0x403E <= aExp ) {
3577 1.1 ross if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3578 1.1 ross return propagateFloatx80NaN( a, a );
3579 1.1 ross }
3580 1.1 ross return a;
3581 1.1 ross }
3582 1.1 ross if ( aExp < 0x3FFF ) {
3583 1.1 ross if ( ( aExp == 0 )
3584 1.1 ross && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3585 1.1 ross return a;
3586 1.1 ross }
3587 1.1 ross float_set_inexact();
3588 1.1 ross aSign = extractFloatx80Sign( a );
3589 1.1 ross switch ( float_rounding_mode() ) {
3590 1.1 ross case float_round_nearest_even:
3591 1.1 ross if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3592 1.1 ross ) {
3593 1.1 ross return
3594 1.1 ross packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3595 1.1 ross }
3596 1.1 ross break;
3597 1.1 ross case float_round_down:
3598 1.1 ross return
3599 1.1 ross aSign ?
3600 1.1 ross packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3601 1.1 ross : packFloatx80( 0, 0, 0 );
3602 1.1 ross case float_round_up:
3603 1.1 ross return
3604 1.1 ross aSign ? packFloatx80( 1, 0, 0 )
3605 1.1 ross : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3606 1.1 ross }
3607 1.1 ross return packFloatx80( aSign, 0, 0 );
3608 1.1 ross }
3609 1.1 ross lastBitMask = 1;
3610 1.1 ross lastBitMask <<= 0x403E - aExp;
3611 1.1 ross roundBitsMask = lastBitMask - 1;
3612 1.1 ross z = a;
3613 1.1 ross roundingMode = float_rounding_mode();
3614 1.1 ross if ( roundingMode == float_round_nearest_even ) {
3615 1.1 ross z.low += lastBitMask>>1;
3616 1.1 ross if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3617 1.1 ross }
3618 1.1 ross else if ( roundingMode != float_round_to_zero ) {
3619 1.1 ross if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3620 1.1 ross z.low += roundBitsMask;
3621 1.1 ross }
3622 1.1 ross }
3623 1.1 ross z.low &= ~ roundBitsMask;
3624 1.1 ross if ( z.low == 0 ) {
3625 1.1 ross ++z.high;
3626 1.1 ross z.low = LIT64( 0x8000000000000000 );
3627 1.1 ross }
3628 1.1 ross if ( z.low != a.low ) float_set_inexact();
3629 1.1 ross return z;
3630 1.1 ross
3631 1.1 ross }
3632 1.1 ross
3633 1.1 ross /*
3634 1.1 ross -------------------------------------------------------------------------------
3635 1.1 ross Returns the result of adding the absolute values of the extended double-
3636 1.1 ross precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3637 1.1 ross negated before being returned. `zSign' is ignored if the result is a NaN.
3638 1.1 ross The addition is performed according to the IEC/IEEE Standard for Binary
3639 1.1 ross Floating-Point Arithmetic.
3640 1.1 ross -------------------------------------------------------------------------------
3641 1.1 ross */
3642 1.1 ross static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3643 1.1 ross {
3644 1.1 ross int32 aExp, bExp, zExp;
3645 1.1 ross bits64 aSig, bSig, zSig0, zSig1;
3646 1.1 ross int32 expDiff;
3647 1.1 ross
3648 1.1 ross aSig = extractFloatx80Frac( a );
3649 1.1 ross aExp = extractFloatx80Exp( a );
3650 1.1 ross bSig = extractFloatx80Frac( b );
3651 1.1 ross bExp = extractFloatx80Exp( b );
3652 1.1 ross expDiff = aExp - bExp;
3653 1.1 ross if ( 0 < expDiff ) {
3654 1.1 ross if ( aExp == 0x7FFF ) {
3655 1.1 ross if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3656 1.1 ross return a;
3657 1.1 ross }
3658 1.1 ross if ( bExp == 0 ) --expDiff;
3659 1.1 ross shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3660 1.1 ross zExp = aExp;
3661 1.1 ross }
3662 1.1 ross else if ( expDiff < 0 ) {
3663 1.1 ross if ( bExp == 0x7FFF ) {
3664 1.1 ross if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3665 1.1 ross return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3666 1.1 ross }
3667 1.1 ross if ( aExp == 0 ) ++expDiff;
3668 1.1 ross shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3669 1.1 ross zExp = bExp;
3670 1.1 ross }
3671 1.1 ross else {
3672 1.1 ross if ( aExp == 0x7FFF ) {
3673 1.1 ross if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3674 1.1 ross return propagateFloatx80NaN( a, b );
3675 1.1 ross }
3676 1.1 ross return a;
3677 1.1 ross }
3678 1.1 ross zSig1 = 0;
3679 1.1 ross zSig0 = aSig + bSig;
3680 1.1 ross if ( aExp == 0 ) {
3681 1.1 ross normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3682 1.1 ross goto roundAndPack;
3683 1.1 ross }
3684 1.1 ross zExp = aExp;
3685 1.1 ross goto shiftRight1;
3686 1.1 ross }
3687 1.1 ross zSig0 = aSig + bSig;
3688 1.1 ross if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3689 1.1 ross shiftRight1:
3690 1.1 ross shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3691 1.1 ross zSig0 |= LIT64( 0x8000000000000000 );
3692 1.1 ross ++zExp;
3693 1.1 ross roundAndPack:
3694 1.1 ross return
3695 1.1 ross roundAndPackFloatx80(
3696 1.1 ross floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3697 1.1 ross
3698 1.1 ross }
3699 1.1 ross
3700 1.1 ross /*
3701 1.1 ross -------------------------------------------------------------------------------
3702 1.1 ross Returns the result of subtracting the absolute values of the extended
3703 1.1 ross double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3704 1.1 ross difference is negated before being returned. `zSign' is ignored if the
3705 1.1 ross result is a NaN. The subtraction is performed according to the IEC/IEEE
3706 1.1 ross Standard for Binary Floating-Point Arithmetic.
3707 1.1 ross -------------------------------------------------------------------------------
3708 1.1 ross */
3709 1.1 ross static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3710 1.1 ross {
3711 1.1 ross int32 aExp, bExp, zExp;
3712 1.1 ross bits64 aSig, bSig, zSig0, zSig1;
3713 1.1 ross int32 expDiff;
3714 1.1 ross floatx80 z;
3715 1.1 ross
3716 1.1 ross aSig = extractFloatx80Frac( a );
3717 1.1 ross aExp = extractFloatx80Exp( a );
3718 1.1 ross bSig = extractFloatx80Frac( b );
3719 1.1 ross bExp = extractFloatx80Exp( b );
3720 1.1 ross expDiff = aExp - bExp;
3721 1.1 ross if ( 0 < expDiff ) goto aExpBigger;
3722 1.1 ross if ( expDiff < 0 ) goto bExpBigger;
3723 1.1 ross if ( aExp == 0x7FFF ) {
3724 1.1 ross if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3725 1.1 ross return propagateFloatx80NaN( a, b );
3726 1.1 ross }
3727 1.1 ross float_raise( float_flag_invalid );
3728 1.1 ross z.low = floatx80_default_nan_low;
3729 1.1 ross z.high = floatx80_default_nan_high;
3730 1.1 ross return z;
3731 1.1 ross }
3732 1.1 ross if ( aExp == 0 ) {
3733 1.1 ross aExp = 1;
3734 1.1 ross bExp = 1;
3735 1.1 ross }
3736 1.1 ross zSig1 = 0;
3737 1.1 ross if ( bSig < aSig ) goto aBigger;
3738 1.1 ross if ( aSig < bSig ) goto bBigger;
3739 1.1 ross return packFloatx80( float_rounding_mode() == float_round_down, 0, 0 );
3740 1.1 ross bExpBigger:
3741 1.1 ross if ( bExp == 0x7FFF ) {
3742 1.1 ross if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3743 1.1 ross return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3744 1.1 ross }
3745 1.1 ross if ( aExp == 0 ) ++expDiff;
3746 1.1 ross shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3747 1.1 ross bBigger:
3748 1.1 ross sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3749 1.1 ross zExp = bExp;
3750 1.1 ross zSign ^= 1;
3751 1.1 ross goto normalizeRoundAndPack;
3752 1.1 ross aExpBigger:
3753 1.1 ross if ( aExp == 0x7FFF ) {
3754 1.1 ross if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3755 1.1 ross return a;
3756 1.1 ross }
3757 1.1 ross if ( bExp == 0 ) --expDiff;
3758 1.1 ross shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3759 1.1 ross aBigger:
3760 1.1 ross sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3761 1.1 ross zExp = aExp;
3762 1.1 ross normalizeRoundAndPack:
3763 1.1 ross return
3764 1.1 ross normalizeRoundAndPackFloatx80(
3765 1.1 ross floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3766 1.1 ross
3767 1.1 ross }
3768 1.1 ross
3769 1.1 ross /*
3770 1.1 ross -------------------------------------------------------------------------------
3771 1.1 ross Returns the result of adding the extended double-precision floating-point
3772 1.1 ross values `a' and `b'. The operation is performed according to the IEC/IEEE
3773 1.1 ross Standard for Binary Floating-Point Arithmetic.
3774 1.1 ross -------------------------------------------------------------------------------
3775 1.1 ross */
3776 1.1 ross floatx80 floatx80_add( floatx80 a, floatx80 b )
3777 1.1 ross {
3778 1.1 ross flag aSign, bSign;
3779 1.1 ross
3780 1.1 ross aSign = extractFloatx80Sign( a );
3781 1.1 ross bSign = extractFloatx80Sign( b );
3782 1.1 ross if ( aSign == bSign ) {
3783 1.1 ross return addFloatx80Sigs( a, b, aSign );
3784 1.1 ross }
3785 1.1 ross else {
3786 1.1 ross return subFloatx80Sigs( a, b, aSign );
3787 1.1 ross }
3788 1.1 ross
3789 1.1 ross }
3790 1.1 ross
3791 1.1 ross /*
3792 1.1 ross -------------------------------------------------------------------------------
3793 1.1 ross Returns the result of subtracting the extended double-precision floating-
3794 1.1 ross point values `a' and `b'. The operation is performed according to the
3795 1.1 ross IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3796 1.1 ross -------------------------------------------------------------------------------
3797 1.1 ross */
3798 1.1 ross floatx80 floatx80_sub( floatx80 a, floatx80 b )
3799 1.1 ross {
3800 1.1 ross flag aSign, bSign;
3801 1.1 ross
3802 1.1 ross aSign = extractFloatx80Sign( a );
3803 1.1 ross bSign = extractFloatx80Sign( b );
3804 1.1 ross if ( aSign == bSign ) {
3805 1.1 ross return subFloatx80Sigs( a, b, aSign );
3806 1.1 ross }
3807 1.1 ross else {
3808 1.1 ross return addFloatx80Sigs( a, b, aSign );
3809 1.1 ross }
3810 1.1 ross
3811 1.1 ross }
3812 1.1 ross
3813 1.1 ross /*
3814 1.1 ross -------------------------------------------------------------------------------
3815 1.1 ross Returns the result of multiplying the extended double-precision floating-
3816 1.1 ross point values `a' and `b'. The operation is performed according to the
3817 1.1 ross IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3818 1.1 ross -------------------------------------------------------------------------------
3819 1.1 ross */
3820 1.1 ross floatx80 floatx80_mul( floatx80 a, floatx80 b )
3821 1.1 ross {
3822 1.1 ross flag aSign, bSign, zSign;
3823 1.1 ross int32 aExp, bExp, zExp;
3824 1.1 ross bits64 aSig, bSig, zSig0, zSig1;
3825 1.1 ross floatx80 z;
3826 1.1 ross
3827 1.1 ross aSig = extractFloatx80Frac( a );
3828 1.1 ross aExp = extractFloatx80Exp( a );
3829 1.1 ross aSign = extractFloatx80Sign( a );
3830 1.1 ross bSig = extractFloatx80Frac( b );
3831 1.1 ross bExp = extractFloatx80Exp( b );
3832 1.1 ross bSign = extractFloatx80Sign( b );
3833 1.1 ross zSign = aSign ^ bSign;
3834 1.1 ross if ( aExp == 0x7FFF ) {
3835 1.1 ross if ( (bits64) ( aSig<<1 )
3836 1.1 ross || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3837 1.1 ross return propagateFloatx80NaN( a, b );
3838 1.1 ross }
3839 1.1 ross if ( ( bExp | bSig ) == 0 ) goto invalid;
3840 1.1 ross return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3841 1.1 ross }
3842 1.1 ross if ( bExp == 0x7FFF ) {
3843 1.1 ross if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3844 1.1 ross if ( ( aExp | aSig ) == 0 ) {
3845 1.1 ross invalid:
3846 1.1 ross float_raise( float_flag_invalid );
3847 1.1 ross z.low = floatx80_default_nan_low;
3848 1.1 ross z.high = floatx80_default_nan_high;
3849 1.1 ross return z;
3850 1.1 ross }
3851 1.1 ross return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3852 1.1 ross }
3853 1.1 ross if ( aExp == 0 ) {
3854 1.1 ross if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3855 1.1 ross normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3856 1.1 ross }
3857 1.1 ross if ( bExp == 0 ) {
3858 1.1 ross if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3859 1.1 ross normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3860 1.1 ross }
3861 1.1 ross zExp = aExp + bExp - 0x3FFE;
3862 1.1 ross mul64To128( aSig, bSig, &zSig0, &zSig1 );
3863 1.1 ross if ( 0 < (sbits64) zSig0 ) {
3864 1.1 ross shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3865 1.1 ross --zExp;
3866 1.1 ross }
3867 1.1 ross return
3868 1.1 ross roundAndPackFloatx80(
3869 1.1 ross floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3870 1.1 ross
3871 1.1 ross }
3872 1.1 ross
3873 1.1 ross /*
3874 1.1 ross -------------------------------------------------------------------------------
3875 1.1 ross Returns the result of dividing the extended double-precision floating-point
3876 1.1 ross value `a' by the corresponding value `b'. The operation is performed
3877 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3878 1.1 ross -------------------------------------------------------------------------------
3879 1.1 ross */
3880 1.1 ross floatx80 floatx80_div( floatx80 a, floatx80 b )
3881 1.1 ross {
3882 1.1 ross flag aSign, bSign, zSign;
3883 1.1 ross int32 aExp, bExp, zExp;
3884 1.1 ross bits64 aSig, bSig, zSig0, zSig1;
3885 1.1 ross bits64 rem0, rem1, rem2, term0, term1, term2;
3886 1.1 ross floatx80 z;
3887 1.1 ross
3888 1.1 ross aSig = extractFloatx80Frac( a );
3889 1.1 ross aExp = extractFloatx80Exp( a );
3890 1.1 ross aSign = extractFloatx80Sign( a );
3891 1.1 ross bSig = extractFloatx80Frac( b );
3892 1.1 ross bExp = extractFloatx80Exp( b );
3893 1.1 ross bSign = extractFloatx80Sign( b );
3894 1.1 ross zSign = aSign ^ bSign;
3895 1.1 ross if ( aExp == 0x7FFF ) {
3896 1.1 ross if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3897 1.1 ross if ( bExp == 0x7FFF ) {
3898 1.1 ross if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3899 1.1 ross goto invalid;
3900 1.1 ross }
3901 1.1 ross return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3902 1.1 ross }
3903 1.1 ross if ( bExp == 0x7FFF ) {
3904 1.1 ross if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3905 1.1 ross return packFloatx80( zSign, 0, 0 );
3906 1.1 ross }
3907 1.1 ross if ( bExp == 0 ) {
3908 1.1 ross if ( bSig == 0 ) {
3909 1.1 ross if ( ( aExp | aSig ) == 0 ) {
3910 1.1 ross invalid:
3911 1.1 ross float_raise( float_flag_invalid );
3912 1.1 ross z.low = floatx80_default_nan_low;
3913 1.1 ross z.high = floatx80_default_nan_high;
3914 1.1 ross return z;
3915 1.1 ross }
3916 1.1 ross float_raise( float_flag_divbyzero );
3917 1.1 ross return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3918 1.1 ross }
3919 1.1 ross normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3920 1.1 ross }
3921 1.1 ross if ( aExp == 0 ) {
3922 1.1 ross if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3923 1.1 ross normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3924 1.1 ross }
3925 1.1 ross zExp = aExp - bExp + 0x3FFE;
3926 1.1 ross rem1 = 0;
3927 1.1 ross if ( bSig <= aSig ) {
3928 1.1 ross shift128Right( aSig, 0, 1, &aSig, &rem1 );
3929 1.1 ross ++zExp;
3930 1.1 ross }
3931 1.1 ross zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3932 1.1 ross mul64To128( bSig, zSig0, &term0, &term1 );
3933 1.1 ross sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3934 1.1 ross while ( (sbits64) rem0 < 0 ) {
3935 1.1 ross --zSig0;
3936 1.1 ross add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3937 1.1 ross }
3938 1.1 ross zSig1 = estimateDiv128To64( rem1, 0, bSig );
3939 1.1 ross if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3940 1.1 ross mul64To128( bSig, zSig1, &term1, &term2 );
3941 1.1 ross sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3942 1.1 ross while ( (sbits64) rem1 < 0 ) {
3943 1.1 ross --zSig1;
3944 1.1 ross add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3945 1.1 ross }
3946 1.1 ross zSig1 |= ( ( rem1 | rem2 ) != 0 );
3947 1.1 ross }
3948 1.1 ross return
3949 1.1 ross roundAndPackFloatx80(
3950 1.1 ross floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3951 1.1 ross
3952 1.1 ross }
3953 1.1 ross
3954 1.1 ross /*
3955 1.1 ross -------------------------------------------------------------------------------
3956 1.1 ross Returns the remainder of the extended double-precision floating-point value
3957 1.1 ross `a' with respect to the corresponding value `b'. The operation is performed
3958 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3959 1.1 ross -------------------------------------------------------------------------------
3960 1.1 ross */
3961 1.1 ross floatx80 floatx80_rem( floatx80 a, floatx80 b )
3962 1.1 ross {
3963 1.1 ross flag aSign, bSign, zSign;
3964 1.1 ross int32 aExp, bExp, expDiff;
3965 1.1 ross bits64 aSig0, aSig1, bSig;
3966 1.1 ross bits64 q, term0, term1, alternateASig0, alternateASig1;
3967 1.1 ross floatx80 z;
3968 1.1 ross
3969 1.1 ross aSig0 = extractFloatx80Frac( a );
3970 1.1 ross aExp = extractFloatx80Exp( a );
3971 1.1 ross aSign = extractFloatx80Sign( a );
3972 1.1 ross bSig = extractFloatx80Frac( b );
3973 1.1 ross bExp = extractFloatx80Exp( b );
3974 1.1 ross bSign = extractFloatx80Sign( b );
3975 1.1 ross if ( aExp == 0x7FFF ) {
3976 1.1 ross if ( (bits64) ( aSig0<<1 )
3977 1.1 ross || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3978 1.1 ross return propagateFloatx80NaN( a, b );
3979 1.1 ross }
3980 1.1 ross goto invalid;
3981 1.1 ross }
3982 1.1 ross if ( bExp == 0x7FFF ) {
3983 1.1 ross if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3984 1.1 ross return a;
3985 1.1 ross }
3986 1.1 ross if ( bExp == 0 ) {
3987 1.1 ross if ( bSig == 0 ) {
3988 1.1 ross invalid:
3989 1.1 ross float_raise( float_flag_invalid );
3990 1.1 ross z.low = floatx80_default_nan_low;
3991 1.1 ross z.high = floatx80_default_nan_high;
3992 1.1 ross return z;
3993 1.1 ross }
3994 1.1 ross normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3995 1.1 ross }
3996 1.1 ross if ( aExp == 0 ) {
3997 1.1 ross if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3998 1.1 ross normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3999 1.1 ross }
4000 1.1 ross bSig |= LIT64( 0x8000000000000000 );
4001 1.1 ross zSign = aSign;
4002 1.1 ross expDiff = aExp - bExp;
4003 1.1 ross aSig1 = 0;
4004 1.1 ross if ( expDiff < 0 ) {
4005 1.1 ross if ( expDiff < -1 ) return a;
4006 1.1 ross shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4007 1.1 ross expDiff = 0;
4008 1.1 ross }
4009 1.1 ross q = ( bSig <= aSig0 );
4010 1.1 ross if ( q ) aSig0 -= bSig;
4011 1.1 ross expDiff -= 64;
4012 1.1 ross while ( 0 < expDiff ) {
4013 1.1 ross q = estimateDiv128To64( aSig0, aSig1, bSig );
4014 1.1 ross q = ( 2 < q ) ? q - 2 : 0;
4015 1.1 ross mul64To128( bSig, q, &term0, &term1 );
4016 1.1 ross sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4017 1.1 ross shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4018 1.1 ross expDiff -= 62;
4019 1.1 ross }
4020 1.1 ross expDiff += 64;
4021 1.1 ross if ( 0 < expDiff ) {
4022 1.1 ross q = estimateDiv128To64( aSig0, aSig1, bSig );
4023 1.1 ross q = ( 2 < q ) ? q - 2 : 0;
4024 1.1 ross q >>= 64 - expDiff;
4025 1.1 ross mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4026 1.1 ross sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4027 1.1 ross shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4028 1.1 ross while ( le128( term0, term1, aSig0, aSig1 ) ) {
4029 1.1 ross ++q;
4030 1.1 ross sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4031 1.1 ross }
4032 1.1 ross }
4033 1.1 ross else {
4034 1.1 ross term1 = 0;
4035 1.1 ross term0 = bSig;
4036 1.1 ross }
4037 1.1 ross sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4038 1.1 ross if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4039 1.1 ross || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4040 1.1 ross && ( q & 1 ) )
4041 1.1 ross ) {
4042 1.1 ross aSig0 = alternateASig0;
4043 1.1 ross aSig1 = alternateASig1;
4044 1.1 ross zSign = ! zSign;
4045 1.1 ross }
4046 1.1 ross return
4047 1.1 ross normalizeRoundAndPackFloatx80(
4048 1.1 ross 80, zSign, bExp + expDiff, aSig0, aSig1 );
4049 1.1 ross
4050 1.1 ross }
4051 1.1 ross
4052 1.1 ross /*
4053 1.1 ross -------------------------------------------------------------------------------
4054 1.1 ross Returns the square root of the extended double-precision floating-point
4055 1.1 ross value `a'. The operation is performed according to the IEC/IEEE Standard
4056 1.1 ross for Binary Floating-Point Arithmetic.
4057 1.1 ross -------------------------------------------------------------------------------
4058 1.1 ross */
4059 1.1 ross floatx80 floatx80_sqrt( floatx80 a )
4060 1.1 ross {
4061 1.1 ross flag aSign;
4062 1.1 ross int32 aExp, zExp;
4063 1.1 ross bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4064 1.1 ross bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4065 1.1 ross floatx80 z;
4066 1.1 ross
4067 1.1 ross aSig0 = extractFloatx80Frac( a );
4068 1.1 ross aExp = extractFloatx80Exp( a );
4069 1.1 ross aSign = extractFloatx80Sign( a );
4070 1.1 ross if ( aExp == 0x7FFF ) {
4071 1.1 ross if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
4072 1.1 ross if ( ! aSign ) return a;
4073 1.1 ross goto invalid;
4074 1.1 ross }
4075 1.1 ross if ( aSign ) {
4076 1.1 ross if ( ( aExp | aSig0 ) == 0 ) return a;
4077 1.1 ross invalid:
4078 1.1 ross float_raise( float_flag_invalid );
4079 1.1 ross z.low = floatx80_default_nan_low;
4080 1.1 ross z.high = floatx80_default_nan_high;
4081 1.1 ross return z;
4082 1.1 ross }
4083 1.1 ross if ( aExp == 0 ) {
4084 1.1 ross if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4085 1.1 ross normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4086 1.1 ross }
4087 1.1 ross zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4088 1.1 ross zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4089 1.1 ross shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4090 1.1 ross zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4091 1.1 ross doubleZSig0 = zSig0<<1;
4092 1.1 ross mul64To128( zSig0, zSig0, &term0, &term1 );
4093 1.1 ross sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4094 1.1 ross while ( (sbits64) rem0 < 0 ) {
4095 1.1 ross --zSig0;
4096 1.1 ross doubleZSig0 -= 2;
4097 1.1 ross add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4098 1.1 ross }
4099 1.1 ross zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4100 1.1 ross if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4101 1.1 ross if ( zSig1 == 0 ) zSig1 = 1;
4102 1.1 ross mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4103 1.1 ross sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4104 1.1 ross mul64To128( zSig1, zSig1, &term2, &term3 );
4105 1.1 ross sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4106 1.1 ross while ( (sbits64) rem1 < 0 ) {
4107 1.1 ross --zSig1;
4108 1.1 ross shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4109 1.1 ross term3 |= 1;
4110 1.1 ross term2 |= doubleZSig0;
4111 1.1 ross add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4112 1.1 ross }
4113 1.1 ross zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4114 1.1 ross }
4115 1.1 ross shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4116 1.1 ross zSig0 |= doubleZSig0;
4117 1.1 ross return
4118 1.1 ross roundAndPackFloatx80(
4119 1.1 ross floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4120 1.1 ross
4121 1.1 ross }
4122 1.1 ross
4123 1.1 ross /*
4124 1.1 ross -------------------------------------------------------------------------------
4125 1.1 ross Returns 1 if the extended double-precision floating-point value `a' is
4126 1.1 ross equal to the corresponding value `b', and 0 otherwise. The comparison is
4127 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
4128 1.1 ross Arithmetic.
4129 1.1 ross -------------------------------------------------------------------------------
4130 1.1 ross */
4131 1.1 ross flag floatx80_eq( floatx80 a, floatx80 b )
4132 1.1 ross {
4133 1.1 ross
4134 1.1 ross if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4135 1.1 ross && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4136 1.1 ross || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4137 1.1 ross && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4138 1.1 ross ) {
4139 1.1 ross if ( floatx80_is_signaling_nan( a )
4140 1.1 ross || floatx80_is_signaling_nan( b ) ) {
4141 1.1 ross float_raise( float_flag_invalid );
4142 1.1 ross }
4143 1.1 ross return 0;
4144 1.1 ross }
4145 1.1 ross return
4146 1.1 ross ( a.low == b.low )
4147 1.1 ross && ( ( a.high == b.high )
4148 1.1 ross || ( ( a.low == 0 )
4149 1.1 ross && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4150 1.1 ross );
4151 1.1 ross
4152 1.1 ross }
4153 1.1 ross
4154 1.1 ross /*
4155 1.1 ross -------------------------------------------------------------------------------
4156 1.1 ross Returns 1 if the extended double-precision floating-point value `a' is
4157 1.1 ross less than or equal to the corresponding value `b', and 0 otherwise. The
4158 1.1 ross comparison is performed according to the IEC/IEEE Standard for Binary
4159 1.1 ross Floating-Point Arithmetic.
4160 1.1 ross -------------------------------------------------------------------------------
4161 1.1 ross */
4162 1.1 ross flag floatx80_le( floatx80 a, floatx80 b )
4163 1.1 ross {
4164 1.1 ross flag aSign, bSign;
4165 1.1 ross
4166 1.1 ross if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4167 1.1 ross && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4168 1.1 ross || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4169 1.1 ross && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4170 1.1 ross ) {
4171 1.1 ross float_raise( float_flag_invalid );
4172 1.1 ross return 0;
4173 1.1 ross }
4174 1.1 ross aSign = extractFloatx80Sign( a );
4175 1.1 ross bSign = extractFloatx80Sign( b );
4176 1.1 ross if ( aSign != bSign ) {
4177 1.1 ross return
4178 1.1 ross aSign
4179 1.1 ross || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4180 1.1 ross == 0 );
4181 1.1 ross }
4182 1.1 ross return
4183 1.1 ross aSign ? le128( b.high, b.low, a.high, a.low )
4184 1.1 ross : le128( a.high, a.low, b.high, b.low );
4185 1.1 ross
4186 1.1 ross }
4187 1.1 ross
4188 1.1 ross /*
4189 1.1 ross -------------------------------------------------------------------------------
4190 1.1 ross Returns 1 if the extended double-precision floating-point value `a' is
4191 1.1 ross less than the corresponding value `b', and 0 otherwise. The comparison
4192 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
4193 1.1 ross Arithmetic.
4194 1.1 ross -------------------------------------------------------------------------------
4195 1.1 ross */
4196 1.1 ross flag floatx80_lt( floatx80 a, floatx80 b )
4197 1.1 ross {
4198 1.1 ross flag aSign, bSign;
4199 1.1 ross
4200 1.1 ross if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4201 1.1 ross && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4202 1.1 ross || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4203 1.1 ross && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4204 1.1 ross ) {
4205 1.1 ross float_raise( float_flag_invalid );
4206 1.1 ross return 0;
4207 1.1 ross }
4208 1.1 ross aSign = extractFloatx80Sign( a );
4209 1.1 ross bSign = extractFloatx80Sign( b );
4210 1.1 ross if ( aSign != bSign ) {
4211 1.1 ross return
4212 1.1 ross aSign
4213 1.1 ross && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4214 1.1 ross != 0 );
4215 1.1 ross }
4216 1.1 ross return
4217 1.1 ross aSign ? lt128( b.high, b.low, a.high, a.low )
4218 1.1 ross : lt128( a.high, a.low, b.high, b.low );
4219 1.1 ross
4220 1.1 ross }
4221 1.1 ross
4222 1.1 ross /*
4223 1.1 ross -------------------------------------------------------------------------------
4224 1.1 ross Returns 1 if the extended double-precision floating-point value `a' is equal
4225 1.1 ross to the corresponding value `b', and 0 otherwise. The invalid exception is
4226 1.1 ross raised if either operand is a NaN. Otherwise, the comparison is performed
4227 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4228 1.1 ross -------------------------------------------------------------------------------
4229 1.1 ross */
4230 1.1 ross flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4231 1.1 ross {
4232 1.1 ross
4233 1.1 ross if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4234 1.1 ross && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4235 1.1 ross || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4236 1.1 ross && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4237 1.1 ross ) {
4238 1.1 ross float_raise( float_flag_invalid );
4239 1.1 ross return 0;
4240 1.1 ross }
4241 1.1 ross return
4242 1.1 ross ( a.low == b.low )
4243 1.1 ross && ( ( a.high == b.high )
4244 1.1 ross || ( ( a.low == 0 )
4245 1.1 ross && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4246 1.1 ross );
4247 1.1 ross
4248 1.1 ross }
4249 1.1 ross
4250 1.1 ross /*
4251 1.1 ross -------------------------------------------------------------------------------
4252 1.1 ross Returns 1 if the extended double-precision floating-point value `a' is less
4253 1.1 ross than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4254 1.1 ross do not cause an exception. Otherwise, the comparison is performed according
4255 1.1 ross to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4256 1.1 ross -------------------------------------------------------------------------------
4257 1.1 ross */
4258 1.1 ross flag floatx80_le_quiet( floatx80 a, floatx80 b )
4259 1.1 ross {
4260 1.1 ross flag aSign, bSign;
4261 1.1 ross
4262 1.1 ross if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4263 1.1 ross && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4264 1.1 ross || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4265 1.1 ross && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4266 1.1 ross ) {
4267 1.1 ross if ( floatx80_is_signaling_nan( a )
4268 1.1 ross || floatx80_is_signaling_nan( b ) ) {
4269 1.1 ross float_raise( float_flag_invalid );
4270 1.1 ross }
4271 1.1 ross return 0;
4272 1.1 ross }
4273 1.1 ross aSign = extractFloatx80Sign( a );
4274 1.1 ross bSign = extractFloatx80Sign( b );
4275 1.1 ross if ( aSign != bSign ) {
4276 1.1 ross return
4277 1.1 ross aSign
4278 1.1 ross || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4279 1.1 ross == 0 );
4280 1.1 ross }
4281 1.1 ross return
4282 1.1 ross aSign ? le128( b.high, b.low, a.high, a.low )
4283 1.1 ross : le128( a.high, a.low, b.high, b.low );
4284 1.1 ross
4285 1.1 ross }
4286 1.1 ross
4287 1.1 ross /*
4288 1.1 ross -------------------------------------------------------------------------------
4289 1.1 ross Returns 1 if the extended double-precision floating-point value `a' is less
4290 1.1 ross than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4291 1.1 ross an exception. Otherwise, the comparison is performed according to the
4292 1.1 ross IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4293 1.1 ross -------------------------------------------------------------------------------
4294 1.1 ross */
4295 1.1 ross flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4296 1.1 ross {
4297 1.1 ross flag aSign, bSign;
4298 1.1 ross
4299 1.1 ross if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4300 1.1 ross && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4301 1.1 ross || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4302 1.1 ross && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4303 1.1 ross ) {
4304 1.1 ross if ( floatx80_is_signaling_nan( a )
4305 1.1 ross || floatx80_is_signaling_nan( b ) ) {
4306 1.1 ross float_raise( float_flag_invalid );
4307 1.1 ross }
4308 1.1 ross return 0;
4309 1.1 ross }
4310 1.1 ross aSign = extractFloatx80Sign( a );
4311 1.1 ross bSign = extractFloatx80Sign( b );
4312 1.1 ross if ( aSign != bSign ) {
4313 1.1 ross return
4314 1.1 ross aSign
4315 1.1 ross && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4316 1.1 ross != 0 );
4317 1.1 ross }
4318 1.1 ross return
4319 1.1 ross aSign ? lt128( b.high, b.low, a.high, a.low )
4320 1.1 ross : lt128( a.high, a.low, b.high, b.low );
4321 1.1 ross
4322 1.1 ross }
4323 1.1 ross
4324 1.1 ross #endif
4325 1.1 ross
4326 1.1 ross #ifdef FLOAT128
4327 1.1 ross
4328 1.1 ross /*
4329 1.1 ross -------------------------------------------------------------------------------
4330 1.1 ross Returns the result of converting the quadruple-precision floating-point
4331 1.1 ross value `a' to the 32-bit two's complement integer format. The conversion
4332 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
4333 1.1 ross Arithmetic---which means in particular that the conversion is rounded
4334 1.1 ross according to the current rounding mode. If `a' is a NaN, the largest
4335 1.1 ross positive integer is returned. Otherwise, if the conversion overflows, the
4336 1.1 ross largest integer with the same sign as `a' is returned.
4337 1.1 ross -------------------------------------------------------------------------------
4338 1.1 ross */
4339 1.1 ross int32 float128_to_int32( float128 a )
4340 1.1 ross {
4341 1.1 ross flag aSign;
4342 1.1 ross int32 aExp, shiftCount;
4343 1.1 ross bits64 aSig0, aSig1;
4344 1.1 ross
4345 1.1 ross aSig1 = extractFloat128Frac1( a );
4346 1.1 ross aSig0 = extractFloat128Frac0( a );
4347 1.1 ross aExp = extractFloat128Exp( a );
4348 1.1 ross aSign = extractFloat128Sign( a );
4349 1.1 ross if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4350 1.1 ross if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4351 1.1 ross aSig0 |= ( aSig1 != 0 );
4352 1.1 ross shiftCount = 0x4028 - aExp;
4353 1.1 ross if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4354 1.1 ross return roundAndPackInt32( aSign, aSig0 );
4355 1.1 ross
4356 1.1 ross }
4357 1.1 ross
4358 1.1 ross /*
4359 1.1 ross -------------------------------------------------------------------------------
4360 1.1 ross Returns the result of converting the quadruple-precision floating-point
4361 1.1 ross value `a' to the 32-bit two's complement integer format. The conversion
4362 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
4363 1.1 ross Arithmetic, except that the conversion is always rounded toward zero. If
4364 1.1 ross `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4365 1.1 ross conversion overflows, the largest integer with the same sign as `a' is
4366 1.1 ross returned.
4367 1.1 ross -------------------------------------------------------------------------------
4368 1.1 ross */
4369 1.1 ross int32 float128_to_int32_round_to_zero( float128 a )
4370 1.1 ross {
4371 1.1 ross flag aSign;
4372 1.1 ross int32 aExp, shiftCount;
4373 1.1 ross bits64 aSig0, aSig1, savedASig;
4374 1.1 ross int32 z;
4375 1.1 ross
4376 1.1 ross aSig1 = extractFloat128Frac1( a );
4377 1.1 ross aSig0 = extractFloat128Frac0( a );
4378 1.1 ross aExp = extractFloat128Exp( a );
4379 1.1 ross aSign = extractFloat128Sign( a );
4380 1.1 ross aSig0 |= ( aSig1 != 0 );
4381 1.1 ross if ( 0x401E < aExp ) {
4382 1.1 ross if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4383 1.1 ross goto invalid;
4384 1.1 ross }
4385 1.1 ross else if ( aExp < 0x3FFF ) {
4386 1.1 ross if ( aExp || aSig0 ) float_set_inexact();
4387 1.1 ross return 0;
4388 1.1 ross }
4389 1.1 ross aSig0 |= LIT64( 0x0001000000000000 );
4390 1.1 ross shiftCount = 0x402F - aExp;
4391 1.1 ross savedASig = aSig0;
4392 1.1 ross aSig0 >>= shiftCount;
4393 1.1 ross z = aSig0;
4394 1.1 ross if ( aSign ) z = - z;
4395 1.1 ross if ( ( z < 0 ) ^ aSign ) {
4396 1.1 ross invalid:
4397 1.1 ross float_raise( float_flag_invalid );
4398 1.1 ross return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4399 1.1 ross }
4400 1.1 ross if ( ( aSig0<<shiftCount ) != savedASig ) {
4401 1.1 ross float_set_inexact();
4402 1.1 ross }
4403 1.1 ross return z;
4404 1.1 ross
4405 1.1 ross }
4406 1.1 ross
4407 1.1 ross /*
4408 1.1 ross -------------------------------------------------------------------------------
4409 1.1 ross Returns the result of converting the quadruple-precision floating-point
4410 1.1 ross value `a' to the 64-bit two's complement integer format. The conversion
4411 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
4412 1.1 ross Arithmetic---which means in particular that the conversion is rounded
4413 1.1 ross according to the current rounding mode. If `a' is a NaN, the largest
4414 1.1 ross positive integer is returned. Otherwise, if the conversion overflows, the
4415 1.1 ross largest integer with the same sign as `a' is returned.
4416 1.1 ross -------------------------------------------------------------------------------
4417 1.1 ross */
4418 1.1 ross int64 float128_to_int64( float128 a )
4419 1.1 ross {
4420 1.1 ross flag aSign;
4421 1.1 ross int32 aExp, shiftCount;
4422 1.1 ross bits64 aSig0, aSig1;
4423 1.1 ross
4424 1.1 ross aSig1 = extractFloat128Frac1( a );
4425 1.1 ross aSig0 = extractFloat128Frac0( a );
4426 1.1 ross aExp = extractFloat128Exp( a );
4427 1.1 ross aSign = extractFloat128Sign( a );
4428 1.1 ross if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4429 1.1 ross shiftCount = 0x402F - aExp;
4430 1.1 ross if ( shiftCount <= 0 ) {
4431 1.1 ross if ( 0x403E < aExp ) {
4432 1.1 ross float_raise( float_flag_invalid );
4433 1.1 ross if ( ! aSign
4434 1.1 ross || ( ( aExp == 0x7FFF )
4435 1.1 ross && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4436 1.1 ross )
4437 1.1 ross ) {
4438 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
4439 1.1 ross }
4440 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
4441 1.1 ross }
4442 1.1 ross shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4443 1.1 ross }
4444 1.1 ross else {
4445 1.1 ross shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4446 1.1 ross }
4447 1.1 ross return roundAndPackInt64( aSign, aSig0, aSig1 );
4448 1.1 ross
4449 1.1 ross }
4450 1.1 ross
4451 1.1 ross /*
4452 1.1 ross -------------------------------------------------------------------------------
4453 1.1 ross Returns the result of converting the quadruple-precision floating-point
4454 1.1 ross value `a' to the 64-bit two's complement integer format. The conversion
4455 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
4456 1.1 ross Arithmetic, except that the conversion is always rounded toward zero.
4457 1.1 ross If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4458 1.1 ross the conversion overflows, the largest integer with the same sign as `a' is
4459 1.1 ross returned.
4460 1.1 ross -------------------------------------------------------------------------------
4461 1.1 ross */
4462 1.1 ross int64 float128_to_int64_round_to_zero( float128 a )
4463 1.1 ross {
4464 1.1 ross flag aSign;
4465 1.1 ross int32 aExp, shiftCount;
4466 1.1 ross bits64 aSig0, aSig1;
4467 1.1 ross int64 z;
4468 1.1 ross
4469 1.1 ross aSig1 = extractFloat128Frac1( a );
4470 1.1 ross aSig0 = extractFloat128Frac0( a );
4471 1.1 ross aExp = extractFloat128Exp( a );
4472 1.1 ross aSign = extractFloat128Sign( a );
4473 1.1 ross if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4474 1.1 ross shiftCount = aExp - 0x402F;
4475 1.1 ross if ( 0 < shiftCount ) {
4476 1.1 ross if ( 0x403E <= aExp ) {
4477 1.1 ross aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4478 1.1 ross if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4479 1.1 ross && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4480 1.1 ross if ( aSig1 ) float_set_inexact();
4481 1.1 ross }
4482 1.1 ross else {
4483 1.1 ross float_raise( float_flag_invalid );
4484 1.1 ross if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4485 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
4486 1.1 ross }
4487 1.1 ross }
4488 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
4489 1.1 ross }
4490 1.1 ross z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4491 1.1 ross if ( (bits64) ( aSig1<<shiftCount ) ) {
4492 1.1 ross float_set_inexact();
4493 1.1 ross }
4494 1.1 ross }
4495 1.1 ross else {
4496 1.1 ross if ( aExp < 0x3FFF ) {
4497 1.1 ross if ( aExp | aSig0 | aSig1 ) {
4498 1.1 ross float_set_inexact();
4499 1.1 ross }
4500 1.1 ross return 0;
4501 1.1 ross }
4502 1.1 ross z = aSig0>>( - shiftCount );
4503 1.1 ross if ( aSig1
4504 1.1 ross || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4505 1.1 ross float_set_inexact();
4506 1.1 ross }
4507 1.1 ross }
4508 1.1 ross if ( aSign ) z = - z;
4509 1.1 ross return z;
4510 1.1 ross
4511 1.1 ross }
4512 1.1 ross
4513 1.1 ross /*
4514 1.1 ross -------------------------------------------------------------------------------
4515 1.1 ross Returns the result of converting the quadruple-precision floating-point
4516 1.1 ross value `a' to the single-precision floating-point format. The conversion
4517 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
4518 1.1 ross Arithmetic.
4519 1.1 ross -------------------------------------------------------------------------------
4520 1.1 ross */
4521 1.1 ross float32 float128_to_float32( float128 a )
4522 1.1 ross {
4523 1.1 ross flag aSign;
4524 1.1 ross int32 aExp;
4525 1.1 ross bits64 aSig0, aSig1;
4526 1.1 ross bits32 zSig;
4527 1.1 ross
4528 1.1 ross aSig1 = extractFloat128Frac1( a );
4529 1.1 ross aSig0 = extractFloat128Frac0( a );
4530 1.1 ross aExp = extractFloat128Exp( a );
4531 1.1 ross aSign = extractFloat128Sign( a );
4532 1.1 ross if ( aExp == 0x7FFF ) {
4533 1.1 ross if ( aSig0 | aSig1 ) {
4534 1.1 ross return commonNaNToFloat32( float128ToCommonNaN( a ) );
4535 1.1 ross }
4536 1.1 ross return packFloat32( aSign, 0xFF, 0 );
4537 1.1 ross }
4538 1.1 ross aSig0 |= ( aSig1 != 0 );
4539 1.1 ross shift64RightJamming( aSig0, 18, &aSig0 );
4540 1.1 ross zSig = aSig0;
4541 1.1 ross if ( aExp || zSig ) {
4542 1.1 ross zSig |= 0x40000000;
4543 1.1 ross aExp -= 0x3F81;
4544 1.1 ross }
4545 1.1 ross return roundAndPackFloat32( aSign, aExp, zSig );
4546 1.1 ross
4547 1.1 ross }
4548 1.1 ross
4549 1.1 ross /*
4550 1.1 ross -------------------------------------------------------------------------------
4551 1.1 ross Returns the result of converting the quadruple-precision floating-point
4552 1.1 ross value `a' to the double-precision floating-point format. The conversion
4553 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
4554 1.1 ross Arithmetic.
4555 1.1 ross -------------------------------------------------------------------------------
4556 1.1 ross */
4557 1.1 ross float64 float128_to_float64( float128 a )
4558 1.1 ross {
4559 1.1 ross flag aSign;
4560 1.1 ross int32 aExp;
4561 1.1 ross bits64 aSig0, aSig1;
4562 1.1 ross
4563 1.1 ross aSig1 = extractFloat128Frac1( a );
4564 1.1 ross aSig0 = extractFloat128Frac0( a );
4565 1.1 ross aExp = extractFloat128Exp( a );
4566 1.1 ross aSign = extractFloat128Sign( a );
4567 1.1 ross if ( aExp == 0x7FFF ) {
4568 1.1 ross if ( aSig0 | aSig1 ) {
4569 1.1 ross return commonNaNToFloat64( float128ToCommonNaN( a ) );
4570 1.1 ross }
4571 1.1 ross return packFloat64( aSign, 0x7FF, 0 );
4572 1.1 ross }
4573 1.1 ross shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4574 1.1 ross aSig0 |= ( aSig1 != 0 );
4575 1.1 ross if ( aExp || aSig0 ) {
4576 1.1 ross aSig0 |= LIT64( 0x4000000000000000 );
4577 1.1 ross aExp -= 0x3C01;
4578 1.1 ross }
4579 1.1 ross return roundAndPackFloat64( aSign, aExp, aSig0 );
4580 1.1 ross
4581 1.1 ross }
4582 1.1 ross
4583 1.1 ross #ifdef FLOATX80
4584 1.1 ross
4585 1.1 ross /*
4586 1.1 ross -------------------------------------------------------------------------------
4587 1.1 ross Returns the result of converting the quadruple-precision floating-point
4588 1.1 ross value `a' to the extended double-precision floating-point format. The
4589 1.1 ross conversion is performed according to the IEC/IEEE Standard for Binary
4590 1.1 ross Floating-Point Arithmetic.
4591 1.1 ross -------------------------------------------------------------------------------
4592 1.1 ross */
4593 1.1 ross floatx80 float128_to_floatx80( float128 a )
4594 1.1 ross {
4595 1.1 ross flag aSign;
4596 1.1 ross int32 aExp;
4597 1.1 ross bits64 aSig0, aSig1;
4598 1.1 ross
4599 1.1 ross aSig1 = extractFloat128Frac1( a );
4600 1.1 ross aSig0 = extractFloat128Frac0( a );
4601 1.1 ross aExp = extractFloat128Exp( a );
4602 1.1 ross aSign = extractFloat128Sign( a );
4603 1.1 ross if ( aExp == 0x7FFF ) {
4604 1.1 ross if ( aSig0 | aSig1 ) {
4605 1.1 ross return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4606 1.1 ross }
4607 1.1 ross return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4608 1.1 ross }
4609 1.1 ross if ( aExp == 0 ) {
4610 1.1 ross if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4611 1.1 ross normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4612 1.1 ross }
4613 1.1 ross else {
4614 1.1 ross aSig0 |= LIT64( 0x0001000000000000 );
4615 1.1 ross }
4616 1.1 ross shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4617 1.1 ross return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4618 1.1 ross
4619 1.1 ross }
4620 1.1 ross
4621 1.1 ross #endif
4622 1.1 ross
4623 1.1 ross /*
4624 1.1 ross -------------------------------------------------------------------------------
4625 1.1 ross Rounds the quadruple-precision floating-point value `a' to an integer, and
4626 1.1 ross returns the result as a quadruple-precision floating-point value. The
4627 1.1 ross operation is performed according to the IEC/IEEE Standard for Binary
4628 1.1 ross Floating-Point Arithmetic.
4629 1.1 ross -------------------------------------------------------------------------------
4630 1.1 ross */
/*
 * float128_round_to_int: round `a' to an integral value in float128 format.
 *
 * The work is split by the exponent field of `a':
 *   - aExp >= 0x406F: `a' is returned unchanged (NaNs go through
 *     propagateFloat128NaN).
 *   - 0x402F <= aExp < 0x406F: the rounding position is in the low word.
 *   - aExp < 0x3FFF: the result is a zero or a value packed with exponent
 *     0x3FFF and zero fraction, chosen by rounding mode.
 *   - 0x3FFF <= aExp < 0x402F: the rounding position is in the high word and
 *     the whole low word is discarded.
 * The inexact flag is set whenever the result differs from `a'.
 */
4631 1.1 ross float128 float128_round_to_int( float128 a )
4632 1.1 ross {
4633 1.1 ross flag aSign;
4634 1.1 ross int32 aExp;
4635 1.1 ross bits64 lastBitMask, roundBitsMask;
4636 1.1 ross int8 roundingMode;
4637 1.1 ross float128 z;
4638 1.1 ross
4639 1.1 ross aExp = extractFloat128Exp( a );
4640 1.1 ross if ( 0x402F <= aExp ) {
4641 1.1 ross if ( 0x406F <= aExp ) {
/* Nothing to round at this magnitude; only NaNs need special handling. */
4642 1.1 ross if ( ( aExp == 0x7FFF )
4643 1.1 ross && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4644 1.1 ross ) {
4645 1.1 ross return propagateFloat128NaN( a, a );
4646 1.1 ross }
4647 1.1 ross return a;
4648 1.1 ross }
/* lastBitMask selects the lowest bit kept in z.low.  The shift is done in
   two steps so that aExp == 0x402F gives 63 followed by 1 (yielding 0)
   rather than a single shift by 64; lastBitMask == 0 then means the lowest
   kept bit is bit 0 of z.high and roundBitsMask covers all of z.low. */
4649 1.1 ross lastBitMask = 1;
4650 1.1 ross lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4651 1.1 ross roundBitsMask = lastBitMask - 1;
4652 1.1 ross z = a;
4653 1.1 ross roundingMode = float_rounding_mode();
4654 1.1 ross if ( roundingMode == float_round_nearest_even ) {
4655 1.1 ross if ( lastBitMask ) {
/* Add half of the last kept bit; if the discarded bits are then all zero
   it was an exact tie, so clear the last kept bit to land on even. */
4656 1.1 ross add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4657 1.1 ross if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4658 1.1 ross }
4659 1.1 ross else {
/* All of z.low is discarded: its top bit says "at least one half", and
   the remaining bits being zero says "exactly one half". */
4660 1.1 ross if ( (sbits64) z.low < 0 ) {
4661 1.1 ross ++z.high;
4662 1.1 ross if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4663 1.1 ross }
4664 1.1 ross }
4665 1.1 ross }
4666 1.1 ross else if ( roundingMode != float_round_to_zero ) {
/* Remaining modes: bump the magnitude when the sign bit and
   "mode is round_up" disagree; the truncation below finishes the job. */
4667 1.1 ross if ( extractFloat128Sign( z )
4668 1.1 ross ^ ( roundingMode == float_round_up ) ) {
4669 1.1 ross add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4670 1.1 ross }
4671 1.1 ross }
4672 1.1 ross z.low &= ~ roundBitsMask;
4673 1.1 ross }
4674 1.1 ross else {
4675 1.1 ross if ( aExp < 0x3FFF ) {
/* a.high<<1 drops the sign bit, so both +0 and -0 are returned as is. */
4676 1.1 ross if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4677 1.1 ross float_set_inexact();
4678 1.1 ross aSign = extractFloat128Sign( a );
4679 1.1 ross switch ( float_rounding_mode() ) {
4680 1.1 ross case float_round_nearest_even:
/* Exponent 0x3FFE with a non-zero fraction rounds to the value with
   exponent 0x3FFF; everything else here falls through to a signed zero. */
4681 1.1 ross if ( ( aExp == 0x3FFE )
4682 1.1 ross && ( extractFloat128Frac0( a )
4683 1.1 ross | extractFloat128Frac1( a ) )
4684 1.1 ross ) {
4685 1.1 ross return packFloat128( aSign, 0x3FFF, 0, 0 );
4686 1.1 ross }
4687 1.1 ross break;
4688 1.1 ross case float_round_down:
4689 1.1 ross return
4690 1.1 ross aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4691 1.1 ross : packFloat128( 0, 0, 0, 0 );
4692 1.1 ross case float_round_up:
4693 1.1 ross return
4694 1.1 ross aSign ? packFloat128( 1, 0, 0, 0 )
4695 1.1 ross : packFloat128( 0, 0x3FFF, 0, 0 );
4696 1.1 ross }
/* Reached for round-to-zero and for nearest-even cases not handled above. */
4697 1.1 ross return packFloat128( aSign, 0, 0, 0 );
4698 1.1 ross }
/* Rounding position is inside a.high; the low word is dropped entirely and
   only contributes as a "non-zero" indicator below. */
4699 1.1 ross lastBitMask = 1;
4700 1.1 ross lastBitMask <<= 0x402F - aExp;
4701 1.1 ross roundBitsMask = lastBitMask - 1;
4702 1.1 ross z.low = 0;
4703 1.1 ross z.high = a.high;
4704 1.1 ross roundingMode = float_rounding_mode();
4705 1.1 ross if ( roundingMode == float_round_nearest_even ) {
4706 1.1 ross z.high += lastBitMask>>1;
/* Exact tie only if the discarded high bits and all of a.low are zero. */
4707 1.1 ross if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4708 1.1 ross z.high &= ~ lastBitMask;
4709 1.1 ross }
4710 1.1 ross }
4711 1.1 ross else if ( roundingMode != float_round_to_zero ) {
4712 1.1 ross if ( extractFloat128Sign( z )
4713 1.1 ross ^ ( roundingMode == float_round_up ) ) {
/* Make a non-zero a.low visible as a low bit so the add below carries. */
4714 1.1 ross z.high |= ( a.low != 0 );
4715 1.1 ross z.high += roundBitsMask;
4716 1.1 ross }
4717 1.1 ross }
4718 1.1 ross z.high &= ~ roundBitsMask;
4719 1.1 ross }
/* Any change to either word means bits were discarded or rounded. */
4720 1.1 ross if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4721 1.1 ross float_set_inexact();
4722 1.1 ross }
4723 1.1 ross return z;
4724 1.1 ross
4725 1.1 ross }
4726 1.1 ross
4727 1.1 ross /*
4728 1.1 ross -------------------------------------------------------------------------------
4729 1.1 ross Returns the result of adding the absolute values of the quadruple-precision
4730 1.1 ross floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4731 1.1 ross before being returned. `zSign' is ignored if the result is a NaN.
4732 1.1 ross The addition is performed according to the IEC/IEEE Standard for Binary
4733 1.1 ross Floating-Point Arithmetic.
4734 1.1 ross -------------------------------------------------------------------------------
4735 1.1 ross */
/*
 * addFloat128Sigs: add the magnitudes of `a' and `b' and give the result the
 * sign `zSign'.
 *
 * The operand with the smaller exponent is shifted right (with jamming into
 * the extra word zSig2) until the exponents match, the significands are
 * added, and the sum is handed to roundAndPackFloat128.  Infinities and NaNs
 * are resolved before any arithmetic; two operands with a zero exponent
 * field are added directly and packed without rounding.
 */
4736 1.1 ross static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4737 1.1 ross {
4738 1.1 ross int32 aExp, bExp, zExp;
4739 1.1 ross bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4740 1.1 ross int32 expDiff;
4741 1.1 ross
4742 1.1 ross aSig1 = extractFloat128Frac1( a );
4743 1.1 ross aSig0 = extractFloat128Frac0( a );
4744 1.1 ross aExp = extractFloat128Exp( a );
4745 1.1 ross bSig1 = extractFloat128Frac1( b );
4746 1.1 ross bSig0 = extractFloat128Frac0( b );
4747 1.1 ross bExp = extractFloat128Exp( b );
4748 1.1 ross expDiff = aExp - bExp;
/* Case: a has the larger exponent, so b is the operand that gets aligned. */
4749 1.1 ross if ( 0 < expDiff ) {
4750 1.1 ross if ( aExp == 0x7FFF ) {
4751 1.1 ross if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4752 1.1 ross return a;
4753 1.1 ross }
/* bExp == 0: b has no hidden bit and is shifted one place less;
   otherwise b's hidden bit is made explicit before shifting. */
4754 1.1 ross if ( bExp == 0 ) {
4755 1.1 ross --expDiff;
4756 1.1 ross }
4757 1.1 ross else {
4758 1.1 ross bSig0 |= LIT64( 0x0001000000000000 );
4759 1.1 ross }
4760 1.1 ross shift128ExtraRightJamming(
4761 1.1 ross bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4762 1.1 ross zExp = aExp;
4763 1.1 ross }
/* Case: b has the larger exponent; mirror image of the branch above. */
4764 1.1 ross else if ( expDiff < 0 ) {
4765 1.1 ross if ( bExp == 0x7FFF ) {
4766 1.1 ross if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4767 1.1 ross return packFloat128( zSign, 0x7FFF, 0, 0 );
4768 1.1 ross }
4769 1.1 ross if ( aExp == 0 ) {
4770 1.1 ross ++expDiff;
4771 1.1 ross }
4772 1.1 ross else {
4773 1.1 ross aSig0 |= LIT64( 0x0001000000000000 );
4774 1.1 ross }
4775 1.1 ross shift128ExtraRightJamming(
4776 1.1 ross aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4777 1.1 ross zExp = bExp;
4778 1.1 ross }
/* Case: equal exponents, no alignment needed. */
4779 1.1 ross else {
4780 1.1 ross if ( aExp == 0x7FFF ) {
4781 1.1 ross if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4782 1.1 ross return propagateFloat128NaN( a, b );
4783 1.1 ross }
4784 1.1 ross return a;
4785 1.1 ross }
4786 1.1 ross add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
/* Both exponent fields zero: the raw fraction sum is packed as is. */
4787 1.1 ross if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
/* Both operands have a hidden bit; together they contribute this bit,
   one position above the usual hidden-bit place, so a 1-bit right shift
   is always needed. */
4788 1.1 ross zSig2 = 0;
4789 1.1 ross zSig0 |= LIT64( 0x0002000000000000 );
4790 1.1 ross zExp = aExp;
4791 1.1 ross goto shiftRight1;
4792 1.1 ross }
/* Reached from both unequal-exponent branches: this supplies the hidden bit
   of the operand with the larger exponent (via aSig0 in either case, since
   only the sum matters). */
4793 1.1 ross aSig0 |= LIT64( 0x0001000000000000 );
4794 1.1 ross add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
/* No carry out of the hidden-bit position: round with zExp - 1.
   Otherwise restore zExp and shift the sum right by one bit first. */
4795 1.1 ross --zExp;
4796 1.1 ross if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4797 1.1 ross ++zExp;
4798 1.1 ross shiftRight1:
4799 1.1 ross shift128ExtraRightJamming(
4800 1.1 ross zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4801 1.1 ross roundAndPack:
4802 1.1 ross return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4803 1.1 ross
4804 1.1 ross }
4805 1.1 ross
4806 1.1 ross /*
4807 1.1 ross -------------------------------------------------------------------------------
4808 1.1 ross Returns the result of subtracting the absolute values of the quadruple-
4809 1.1 ross precision floating-point values `a' and `b'. If `zSign' is 1, the
4810 1.1 ross difference is negated before being returned. `zSign' is ignored if the
4811 1.1 ross result is a NaN. The subtraction is performed according to the IEC/IEEE
4812 1.1 ross Standard for Binary Floating-Point Arithmetic.
4813 1.1 ross -------------------------------------------------------------------------------
4814 1.1 ross */
/*
 * subFloat128Sigs: subtract the magnitude of `b' from the magnitude of `a'.
 * The result carries sign `zSign' when |a| is the larger magnitude and the
 * opposite sign when |b| is larger.
 *
 * Both significands are pre-shifted left by 14 bits, the one with the
 * smaller exponent is aligned with shift128RightJamming, the smaller
 * magnitude is subtracted from the larger, and the difference goes to
 * normalizeRoundAndPackFloat128.  Infinity minus infinity raises the invalid
 * exception and returns the default NaN; an exact zero difference is
 * negative only when the rounding mode is float_round_down.
 */
4815 1.1 ross static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4816 1.1 ross {
4817 1.1 ross int32 aExp, bExp, zExp;
4818 1.1 ross bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4819 1.1 ross int32 expDiff;
4820 1.1 ross float128 z;
4821 1.1 ross
4822 1.1 ross aSig1 = extractFloat128Frac1( a );
4823 1.1 ross aSig0 = extractFloat128Frac0( a );
4824 1.1 ross aExp = extractFloat128Exp( a );
4825 1.1 ross bSig1 = extractFloat128Frac1( b );
4826 1.1 ross bSig0 = extractFloat128Frac0( b );
4827 1.1 ross bExp = extractFloat128Exp( b );
4828 1.1 ross expDiff = aExp - bExp;
/* After this shift the hidden-bit position is 0x4000000000000000
   (0x0001000000000000 << 14), which is the constant used below. */
4829 1.1 ross shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4830 1.1 ross shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4831 1.1 ross if ( 0 < expDiff ) goto aExpBigger;
4832 1.1 ross if ( expDiff < 0 ) goto bExpBigger;
/* Equal exponents from here down to the bExpBigger label. */
4833 1.1 ross if ( aExp == 0x7FFF ) {
4834 1.1 ross if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4835 1.1 ross return propagateFloat128NaN( a, b );
4836 1.1 ross }
/* Both operands are infinities: the difference is invalid. */
4837 1.1 ross float_raise( float_flag_invalid );
4838 1.1 ross z.low = float128_default_nan_low;
4839 1.1 ross z.high = float128_default_nan_high;
4840 1.1 ross return z;
4841 1.1 ross }
/* Both exponent fields are zero: use exponent 1 for the result, since no
   hidden bit is added on this path. */
4842 1.1 ross if ( aExp == 0 ) {
4843 1.1 ross aExp = 1;
4844 1.1 ross bExp = 1;
4845 1.1 ross }
/* Compare the 128-bit significands, high word first, to pick the minuend. */
4846 1.1 ross if ( bSig0 < aSig0 ) goto aBigger;
4847 1.1 ross if ( aSig0 < bSig0 ) goto bBigger;
4848 1.1 ross if ( bSig1 < aSig1 ) goto aBigger;
4849 1.1 ross if ( aSig1 < bSig1 ) goto bBigger;
/* Magnitudes are identical: exact zero, signed by the rounding mode. */
4850 1.1 ross return packFloat128( float_rounding_mode() == float_round_down, 0, 0, 0 );
4851 1.1 ross bExpBigger:
4852 1.1 ross if ( bExp == 0x7FFF ) {
4853 1.1 ross if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
/* Finite minus infinity: infinity with the flipped sign. */
4854 1.1 ross return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4855 1.1 ross }
/* aExp == 0: a has no hidden bit and is shifted one place less. */
4856 1.1 ross if ( aExp == 0 ) {
4857 1.1 ross ++expDiff;
4858 1.1 ross }
4859 1.1 ross else {
4860 1.1 ross aSig0 |= LIT64( 0x4000000000000000 );
4861 1.1 ross }
4862 1.1 ross shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4863 1.1 ross bSig0 |= LIT64( 0x4000000000000000 );
4864 1.1 ross bBigger:
/* |b| is larger: compute b - a and flip the result sign. */
4865 1.1 ross sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4866 1.1 ross zExp = bExp;
4867 1.1 ross zSign ^= 1;
4868 1.1 ross goto normalizeRoundAndPack;
4869 1.1 ross aExpBigger:
4870 1.1 ross if ( aExp == 0x7FFF ) {
4871 1.1 ross if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4872 1.1 ross return a;
4873 1.1 ross }
/* bExp == 0: b has no hidden bit and is shifted one place less. */
4874 1.1 ross if ( bExp == 0 ) {
4875 1.1 ross --expDiff;
4876 1.1 ross }
4877 1.1 ross else {
4878 1.1 ross bSig0 |= LIT64( 0x4000000000000000 );
4879 1.1 ross }
4880 1.1 ross shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4881 1.1 ross aSig0 |= LIT64( 0x4000000000000000 );
4882 1.1 ross aBigger:
/* |a| is larger: compute a - b; the result sign stays zSign. */
4883 1.1 ross sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4884 1.1 ross zExp = aExp;
4885 1.1 ross normalizeRoundAndPack:
/* The exponent passed on is zExp - 1 - 14; the 14 matches the left shift
   applied to both significands at the top of the function. */
4886 1.1 ross --zExp;
4887 1.1 ross return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4888 1.1 ross
4889 1.1 ross }
4890 1.1 ross
4891 1.1 ross /*
4892 1.1 ross -------------------------------------------------------------------------------
4893 1.1 ross Returns the result of adding the quadruple-precision floating-point values
4894 1.1 ross `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4895 1.1 ross for Binary Floating-Point Arithmetic.
4896 1.1 ross -------------------------------------------------------------------------------
4897 1.1 ross */
4898 1.1 ross float128 float128_add( float128 a, float128 b )
4899 1.1 ross {
4900 1.1 ross flag aSign, bSign;
4901 1.1 ross
4902 1.1 ross aSign = extractFloat128Sign( a );
4903 1.1 ross bSign = extractFloat128Sign( b );
4904 1.1 ross if ( aSign == bSign ) {
4905 1.1 ross return addFloat128Sigs( a, b, aSign );
4906 1.1 ross }
4907 1.1 ross else {
4908 1.1 ross return subFloat128Sigs( a, b, aSign );
4909 1.1 ross }
4910 1.1 ross
4911 1.1 ross }
4912 1.1 ross
4913 1.1 ross /*
4914 1.1 ross -------------------------------------------------------------------------------
4915 1.1 ross Returns the result of subtracting the quadruple-precision floating-point
4916 1.1 ross values `a' and `b'. The operation is performed according to the IEC/IEEE
4917 1.1 ross Standard for Binary Floating-Point Arithmetic.
4918 1.1 ross -------------------------------------------------------------------------------
4919 1.1 ross */
4920 1.1 ross float128 float128_sub( float128 a, float128 b )
4921 1.1 ross {
4922 1.1 ross flag aSign, bSign;
4923 1.1 ross
4924 1.1 ross aSign = extractFloat128Sign( a );
4925 1.1 ross bSign = extractFloat128Sign( b );
4926 1.1 ross if ( aSign == bSign ) {
4927 1.1 ross return subFloat128Sigs( a, b, aSign );
4928 1.1 ross }
4929 1.1 ross else {
4930 1.1 ross return addFloat128Sigs( a, b, aSign );
4931 1.1 ross }
4932 1.1 ross
4933 1.1 ross }
4934 1.1 ross
4935 1.1 ross /*
4936 1.1 ross -------------------------------------------------------------------------------
4937 1.1 ross Returns the result of multiplying the quadruple-precision floating-point
4938 1.1 ross values `a' and `b'. The operation is performed according to the IEC/IEEE
4939 1.1 ross Standard for Binary Floating-Point Arithmetic.
4940 1.1 ross -------------------------------------------------------------------------------
4941 1.1 ross */
4942 1.1 ross float128 float128_mul( float128 a, float128 b )
4943 1.1 ross {
4944 1.1 ross flag aSign, bSign, zSign;
4945 1.1 ross int32 aExp, bExp, zExp;
4946 1.1 ross bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4947 1.1 ross float128 z;
4948 1.1 ross
4949 1.1 ross aSig1 = extractFloat128Frac1( a );
4950 1.1 ross aSig0 = extractFloat128Frac0( a );
4951 1.1 ross aExp = extractFloat128Exp( a );
4952 1.1 ross aSign = extractFloat128Sign( a );
4953 1.1 ross bSig1 = extractFloat128Frac1( b );
4954 1.1 ross bSig0 = extractFloat128Frac0( b );
4955 1.1 ross bExp = extractFloat128Exp( b );
4956 1.1 ross bSign = extractFloat128Sign( b );
4957 1.1 ross zSign = aSign ^ bSign;
4958 1.1 ross if ( aExp == 0x7FFF ) {
4959 1.1 ross if ( ( aSig0 | aSig1 )
4960 1.1 ross || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4961 1.1 ross return propagateFloat128NaN( a, b );
4962 1.1 ross }
4963 1.1 ross if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4964 1.1 ross return packFloat128( zSign, 0x7FFF, 0, 0 );
4965 1.1 ross }
4966 1.1 ross if ( bExp == 0x7FFF ) {
4967 1.1 ross if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4968 1.1 ross if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4969 1.1 ross invalid:
4970 1.1 ross float_raise( float_flag_invalid );
4971 1.1 ross z.low = float128_default_nan_low;
4972 1.1 ross z.high = float128_default_nan_high;
4973 1.1 ross return z;
4974 1.1 ross }
4975 1.1 ross return packFloat128( zSign, 0x7FFF, 0, 0 );
4976 1.1 ross }
4977 1.1 ross if ( aExp == 0 ) {
4978 1.1 ross if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4979 1.1 ross normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4980 1.1 ross }
4981 1.1 ross if ( bExp == 0 ) {
4982 1.1 ross if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4983 1.1 ross normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4984 1.1 ross }
4985 1.1 ross zExp = aExp + bExp - 0x4000;
4986 1.1 ross aSig0 |= LIT64( 0x0001000000000000 );
4987 1.1 ross shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4988 1.1 ross mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4989 1.1 ross add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4990 1.1 ross zSig2 |= ( zSig3 != 0 );
4991 1.1 ross if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4992 1.1 ross shift128ExtraRightJamming(
4993 1.1 ross zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4994 1.1 ross ++zExp;
4995 1.1 ross }
4996 1.1 ross return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4997 1.1 ross
4998 1.1 ross }
4999 1.1 ross
5000 1.1 ross /*
5001 1.1 ross -------------------------------------------------------------------------------
5002 1.1 ross Returns the result of dividing the quadruple-precision floating-point value
5003 1.1 ross `a' by the corresponding value `b'. The operation is performed according to
5004 1.1 ross the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5005 1.1 ross -------------------------------------------------------------------------------
5006 1.1 ross */
/*
 * float128_div: compute a / b in float128 format.
 *
 * Special cases handled before the division proper:
 *   - any NaN operand            -> propagateFloat128NaN
 *   - infinity / infinity, 0 / 0 -> invalid exception, default NaN
 *   - infinity / finite          -> signed infinity
 *   - finite / infinity          -> signed zero
 *   - non-zero finite / 0        -> divide-by-zero exception, signed infinity
 *   - 0 / non-zero finite        -> signed zero
 * Otherwise the 128-bit quotient is built from two 64-bit estimates, each
 * corrected against the exact remainder, and rounded by
 * roundAndPackFloat128.  The result sign is aSign ^ bSign.
 */
5007 1.1 ross float128 float128_div( float128 a, float128 b )
5008 1.1 ross {
5009 1.1 ross flag aSign, bSign, zSign;
5010 1.1 ross int32 aExp, bExp, zExp;
5011 1.1 ross bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5012 1.1 ross bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5013 1.1 ross float128 z;
5014 1.1 ross
5015 1.1 ross aSig1 = extractFloat128Frac1( a );
5016 1.1 ross aSig0 = extractFloat128Frac0( a );
5017 1.1 ross aExp = extractFloat128Exp( a );
5018 1.1 ross aSign = extractFloat128Sign( a );
5019 1.1 ross bSig1 = extractFloat128Frac1( b );
5020 1.1 ross bSig0 = extractFloat128Frac0( b );
5021 1.1 ross bExp = extractFloat128Exp( b );
5022 1.1 ross bSign = extractFloat128Sign( b );
5023 1.1 ross zSign = aSign ^ bSign;
5024 1.1 ross if ( aExp == 0x7FFF ) {
5025 1.1 ross if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
5026 1.1 ross if ( bExp == 0x7FFF ) {
5027 1.1 ross if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
/* infinity / infinity */
5028 1.1 ross goto invalid;
5029 1.1 ross }
5030 1.1 ross return packFloat128( zSign, 0x7FFF, 0, 0 );
5031 1.1 ross }
5032 1.1 ross if ( bExp == 0x7FFF ) {
5033 1.1 ross if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5034 1.1 ross return packFloat128( zSign, 0, 0, 0 );
5035 1.1 ross }
5036 1.1 ross if ( bExp == 0 ) {
5037 1.1 ross if ( ( bSig0 | bSig1 ) == 0 ) {
/* Divisor is zero: 0 / 0 is invalid, anything else is divide-by-zero. */
5038 1.1 ross if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5039 1.1 ross invalid:
5040 1.1 ross float_raise( float_flag_invalid );
5041 1.1 ross z.low = float128_default_nan_low;
5042 1.1 ross z.high = float128_default_nan_high;
5043 1.1 ross return z;
5044 1.1 ross }
5045 1.1 ross float_raise( float_flag_divbyzero );
5046 1.1 ross return packFloat128( zSign, 0x7FFF, 0, 0 );
5047 1.1 ross }
5048 1.1 ross normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5049 1.1 ross }
5050 1.1 ross if ( aExp == 0 ) {
5051 1.1 ross if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5052 1.1 ross normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5053 1.1 ross }
5054 1.1 ross zExp = aExp - bExp + 0x3FFD;
/* Make both hidden bits explicit and left-justify the significands (the
   hidden bit moves from bit 48 to bit 63 of the high word). */
5055 1.1 ross shortShift128Left(
5056 1.1 ross aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5057 1.1 ross shortShift128Left(
5058 1.1 ross bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
/* If the dividend significand is not smaller than the divisor's, halve it
   and bump the exponent so that the dividend ends up below the divisor. */
5059 1.1 ross if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5060 1.1 ross shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5061 1.1 ross ++zExp;
5062 1.1 ross }
/* Upper 64 quotient bits: estimate, form the exact remainder, and step the
   estimate down while that remainder is negative. */
5063 1.1 ross zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5064 1.1 ross mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5065 1.1 ross sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5066 1.1 ross while ( (sbits64) rem0 < 0 ) {
5067 1.1 ross --zSig0;
5068 1.1 ross add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5069 1.1 ross }
/* Lower 64 quotient bits.  The exact correction (and the sticky bit from a
   non-zero remainder) is only computed when the low 14 bits of the estimate
   are <= 4 -- presumably the only case where the estimate's error could
   change the rounded result, since 15 bits are shifted out below. */
5070 1.1 ross zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5071 1.1 ross if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5072 1.1 ross mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5073 1.1 ross sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5074 1.1 ross while ( (sbits64) rem1 < 0 ) {
5075 1.1 ross --zSig1;
5076 1.1 ross add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5077 1.1 ross }
5078 1.1 ross zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5079 1.1 ross }
/* Move the quotient down by 15 bits; shifted-out bits are jammed into zSig2. */
5080 1.1 ross shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5081 1.1 ross return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5082 1.1 ross
5083 1.1 ross }
5084 1.1 ross
5085 1.1 ross /*
5086 1.1 ross -------------------------------------------------------------------------------
5087 1.1 ross Returns the remainder of the quadruple-precision floating-point value `a'
5088 1.1 ross with respect to the corresponding value `b'. The operation is performed
5089 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5090 1.1 ross -------------------------------------------------------------------------------
5091 1.1 ross */
5092 1.1 ross float128 float128_rem( float128 a, float128 b )
5093 1.1 ross {
5094 1.1 ross flag aSign, bSign, zSign;
5095 1.1 ross int32 aExp, bExp, expDiff;
5096 1.1 ross bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5097 1.1 ross bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5098 1.1 ross sbits64 sigMean0;
5099 1.1 ross float128 z;
5100 1.1 ross
5101 1.1 ross aSig1 = extractFloat128Frac1( a );
5102 1.1 ross aSig0 = extractFloat128Frac0( a );
5103 1.1 ross aExp = extractFloat128Exp( a );
5104 1.1 ross aSign = extractFloat128Sign( a );
5105 1.1 ross bSig1 = extractFloat128Frac1( b );
5106 1.1 ross bSig0 = extractFloat128Frac0( b );
5107 1.1 ross bExp = extractFloat128Exp( b );
5108 1.1 ross bSign = extractFloat128Sign( b );
5109 1.1 ross if ( aExp == 0x7FFF ) {
5110 1.1 ross if ( ( aSig0 | aSig1 )
5111 1.1 ross || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5112 1.1 ross return propagateFloat128NaN( a, b );
5113 1.1 ross }
5114 1.1 ross goto invalid;
5115 1.1 ross }
5116 1.1 ross if ( bExp == 0x7FFF ) {
5117 1.1 ross if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5118 1.1 ross return a;
5119 1.1 ross }
5120 1.1 ross if ( bExp == 0 ) {
5121 1.1 ross if ( ( bSig0 | bSig1 ) == 0 ) {
5122 1.1 ross invalid:
5123 1.1 ross float_raise( float_flag_invalid );
5124 1.1 ross z.low = float128_default_nan_low;
5125 1.1 ross z.high = float128_default_nan_high;
5126 1.1 ross return z;
5127 1.1 ross }
5128 1.1 ross normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5129 1.1 ross }
5130 1.1 ross if ( aExp == 0 ) {
5131 1.1 ross if ( ( aSig0 | aSig1 ) == 0 ) return a;
5132 1.1 ross normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5133 1.1 ross }
5134 1.1 ross expDiff = aExp - bExp;
5135 1.1 ross if ( expDiff < -1 ) return a;
5136 1.1 ross shortShift128Left(
5137 1.1 ross aSig0 | LIT64( 0x0001000000000000 ),
5138 1.1 ross aSig1,
5139 1.1 ross 15 - ( expDiff < 0 ),
5140 1.1 ross &aSig0,
5141 1.1 ross &aSig1
5142 1.1 ross );
5143 1.1 ross shortShift128Left(
5144 1.1 ross bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5145 1.1 ross q = le128( bSig0, bSig1, aSig0, aSig1 );
5146 1.1 ross if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5147 1.1 ross expDiff -= 64;
5148 1.1 ross while ( 0 < expDiff ) {
5149 1.1 ross q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5150 1.1 ross q = ( 4 < q ) ? q - 4 : 0;
5151 1.1 ross mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5152 1.1 ross shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5153 1.1 ross shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5154 1.1 ross sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5155 1.1 ross expDiff -= 61;
5156 1.1 ross }
5157 1.1 ross if ( -64 < expDiff ) {
5158 1.1 ross q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5159 1.1 ross q = ( 4 < q ) ? q - 4 : 0;
5160 1.1 ross q >>= - expDiff;
5161 1.1 ross shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5162 1.1 ross expDiff += 52;
5163 1.1 ross if ( expDiff < 0 ) {
5164 1.1 ross shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5165 1.1 ross }
5166 1.1 ross else {
5167 1.1 ross shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5168 1.1 ross }
5169 1.1 ross mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5170 1.1 ross sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5171 1.1 ross }
5172 1.1 ross else {
5173 1.1 ross shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5174 1.1 ross shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5175 1.1 ross }
5176 1.1 ross do {
5177 1.1 ross alternateASig0 = aSig0;
5178 1.1 ross alternateASig1 = aSig1;
5179 1.1 ross ++q;
5180 1.1 ross sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5181 1.1 ross } while ( 0 <= (sbits64) aSig0 );
5182 1.1 ross add128(
5183 1.1 ross aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
5184 1.1 ross if ( ( sigMean0 < 0 )
5185 1.1 ross || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5186 1.1 ross aSig0 = alternateASig0;
5187 1.1 ross aSig1 = alternateASig1;
5188 1.1 ross }
5189 1.1 ross zSign = ( (sbits64) aSig0 < 0 );
5190 1.1 ross if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5191 1.1 ross return
5192 1.1 ross normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5193 1.1 ross
5194 1.1 ross }
5195 1.1 ross
5196 1.1 ross /*
5197 1.1 ross -------------------------------------------------------------------------------
5198 1.1 ross Returns the square root of the quadruple-precision floating-point value `a'.
5199 1.1 ross The operation is performed according to the IEC/IEEE Standard for Binary
5200 1.1 ross Floating-Point Arithmetic.
5201 1.1 ross -------------------------------------------------------------------------------
5202 1.1 ross */
5203 1.1 ross float128 float128_sqrt( float128 a )
5204 1.1 ross {
5205 1.1 ross flag aSign;
5206 1.1 ross int32 aExp, zExp;
5207 1.1 ross bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5208 1.1 ross bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5209 1.1 ross float128 z;
5210 1.1 ross
5211 1.1 ross aSig1 = extractFloat128Frac1( a );
5212 1.1 ross aSig0 = extractFloat128Frac0( a );
5213 1.1 ross aExp = extractFloat128Exp( a );
5214 1.1 ross aSign = extractFloat128Sign( a );
5215 1.1 ross if ( aExp == 0x7FFF ) {
5216 1.1 ross if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5217 1.1 ross if ( ! aSign ) return a;
5218 1.1 ross goto invalid;
5219 1.1 ross }
5220 1.1 ross if ( aSign ) {
5221 1.1 ross if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5222 1.1 ross invalid:
5223 1.1 ross float_raise( float_flag_invalid );
5224 1.1 ross z.low = float128_default_nan_low;
5225 1.1 ross z.high = float128_default_nan_high;
5226 1.1 ross return z;
5227 1.1 ross }
5228 1.1 ross if ( aExp == 0 ) {
5229 1.1 ross if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5230 1.1 ross normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5231 1.1 ross }
5232 1.1 ross zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5233 1.1 ross aSig0 |= LIT64( 0x0001000000000000 );
5234 1.1 ross zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5235 1.1 ross shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5236 1.1 ross zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5237 1.1 ross doubleZSig0 = zSig0<<1;
5238 1.1 ross mul64To128( zSig0, zSig0, &term0, &term1 );
5239 1.1 ross sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5240 1.1 ross while ( (sbits64) rem0 < 0 ) {
5241 1.1 ross --zSig0;
5242 1.1 ross doubleZSig0 -= 2;
5243 1.1 ross add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5244 1.1 ross }
5245 1.1 ross zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5246 1.1 ross if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5247 1.1 ross if ( zSig1 == 0 ) zSig1 = 1;
5248 1.1 ross mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5249 1.1 ross sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5250 1.1 ross mul64To128( zSig1, zSig1, &term2, &term3 );
5251 1.1 ross sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5252 1.1 ross while ( (sbits64) rem1 < 0 ) {
5253 1.1 ross --zSig1;
5254 1.1 ross shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5255 1.1 ross term3 |= 1;
5256 1.1 ross term2 |= doubleZSig0;
5257 1.1 ross add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5258 1.1 ross }
5259 1.1 ross zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5260 1.1 ross }
5261 1.1 ross shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5262 1.1 ross return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5263 1.1 ross
5264 1.1 ross }
5265 1.1 ross
5266 1.1 ross /*
5267 1.1 ross -------------------------------------------------------------------------------
5268 1.1 ross Returns 1 if the quadruple-precision floating-point value `a' is equal to
5269 1.1 ross the corresponding value `b', and 0 otherwise. The comparison is performed
5270 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5271 1.1 ross -------------------------------------------------------------------------------
5272 1.1 ross */
5273 1.1 ross flag float128_eq( float128 a, float128 b )
5274 1.1 ross {
5275 1.1 ross
5276 1.1 ross if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5277 1.1 ross && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5278 1.1 ross || ( ( extractFloat128Exp( b ) == 0x7FFF )
5279 1.1 ross && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5280 1.1 ross ) {
5281 1.1 ross if ( float128_is_signaling_nan( a )
5282 1.1 ross || float128_is_signaling_nan( b ) ) {
5283 1.1 ross float_raise( float_flag_invalid );
5284 1.1 ross }
5285 1.1 ross return 0;
5286 1.1 ross }
5287 1.1 ross return
5288 1.1 ross ( a.low == b.low )
5289 1.1 ross && ( ( a.high == b.high )
5290 1.1 ross || ( ( a.low == 0 )
5291 1.1 ross && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5292 1.1 ross );
5293 1.1 ross
5294 1.1 ross }
5295 1.1 ross
5296 1.1 ross /*
5297 1.1 ross -------------------------------------------------------------------------------
5298 1.1 ross Returns 1 if the quadruple-precision floating-point value `a' is less than
5299 1.1 ross or equal to the corresponding value `b', and 0 otherwise. The comparison
5300 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
5301 1.1 ross Arithmetic.
5302 1.1 ross -------------------------------------------------------------------------------
5303 1.1 ross */
5304 1.1 ross flag float128_le( float128 a, float128 b )
5305 1.1 ross {
5306 1.1 ross flag aSign, bSign;
5307 1.1 ross
5308 1.1 ross if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5309 1.1 ross && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5310 1.1 ross || ( ( extractFloat128Exp( b ) == 0x7FFF )
5311 1.1 ross && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5312 1.1 ross ) {
5313 1.1 ross float_raise( float_flag_invalid );
5314 1.1 ross return 0;
5315 1.1 ross }
5316 1.1 ross aSign = extractFloat128Sign( a );
5317 1.1 ross bSign = extractFloat128Sign( b );
5318 1.1 ross if ( aSign != bSign ) {
5319 1.1 ross return
5320 1.1 ross aSign
5321 1.1 ross || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5322 1.1 ross == 0 );
5323 1.1 ross }
5324 1.1 ross return
5325 1.1 ross aSign ? le128( b.high, b.low, a.high, a.low )
5326 1.1 ross : le128( a.high, a.low, b.high, b.low );
5327 1.1 ross
5328 1.1 ross }
5329 1.1 ross
5330 1.1 ross /*
5331 1.1 ross -------------------------------------------------------------------------------
5332 1.1 ross Returns 1 if the quadruple-precision floating-point value `a' is less than
5333 1.1 ross the corresponding value `b', and 0 otherwise. The comparison is performed
5334 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5335 1.1 ross -------------------------------------------------------------------------------
5336 1.1 ross */
5337 1.1 ross flag float128_lt( float128 a, float128 b )
5338 1.1 ross {
5339 1.1 ross flag aSign, bSign;
5340 1.1 ross
5341 1.1 ross if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5342 1.1 ross && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5343 1.1 ross || ( ( extractFloat128Exp( b ) == 0x7FFF )
5344 1.1 ross && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5345 1.1 ross ) {
5346 1.1 ross float_raise( float_flag_invalid );
5347 1.1 ross return 0;
5348 1.1 ross }
5349 1.1 ross aSign = extractFloat128Sign( a );
5350 1.1 ross bSign = extractFloat128Sign( b );
5351 1.1 ross if ( aSign != bSign ) {
5352 1.1 ross return
5353 1.1 ross aSign
5354 1.1 ross && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5355 1.1 ross != 0 );
5356 1.1 ross }
5357 1.1 ross return
5358 1.1 ross aSign ? lt128( b.high, b.low, a.high, a.low )
5359 1.1 ross : lt128( a.high, a.low, b.high, b.low );
5360 1.1 ross
5361 1.1 ross }
5362 1.1 ross
5363 1.1 ross /*
5364 1.1 ross -------------------------------------------------------------------------------
5365 1.1 ross Returns 1 if the quadruple-precision floating-point value `a' is equal to
5366 1.1 ross the corresponding value `b', and 0 otherwise. The invalid exception is
5367 1.1 ross raised if either operand is a NaN. Otherwise, the comparison is performed
5368 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5369 1.1 ross -------------------------------------------------------------------------------
5370 1.1 ross */
5371 1.1 ross flag float128_eq_signaling( float128 a, float128 b )
5372 1.1 ross {
5373 1.1 ross
5374 1.1 ross if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5375 1.1 ross && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5376 1.1 ross || ( ( extractFloat128Exp( b ) == 0x7FFF )
5377 1.1 ross && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5378 1.1 ross ) {
5379 1.1 ross float_raise( float_flag_invalid );
5380 1.1 ross return 0;
5381 1.1 ross }
5382 1.1 ross return
5383 1.1 ross ( a.low == b.low )
5384 1.1 ross && ( ( a.high == b.high )
5385 1.1 ross || ( ( a.low == 0 )
5386 1.1 ross && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5387 1.1 ross );
5388 1.1 ross
5389 1.1 ross }
5390 1.1 ross
5391 1.1 ross /*
5392 1.1 ross -------------------------------------------------------------------------------
5393 1.1 ross Returns 1 if the quadruple-precision floating-point value `a' is less than
5394 1.1 ross or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5395 1.1 ross cause an exception. Otherwise, the comparison is performed according to the
5396 1.1 ross IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5397 1.1 ross -------------------------------------------------------------------------------
5398 1.1 ross */
5399 1.1 ross flag float128_le_quiet( float128 a, float128 b )
5400 1.1 ross {
5401 1.1 ross flag aSign, bSign;
5402 1.1 ross
5403 1.1 ross if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5404 1.1 ross && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5405 1.1 ross || ( ( extractFloat128Exp( b ) == 0x7FFF )
5406 1.1 ross && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5407 1.1 ross ) {
5408 1.1 ross if ( float128_is_signaling_nan( a )
5409 1.1 ross || float128_is_signaling_nan( b ) ) {
5410 1.1 ross float_raise( float_flag_invalid );
5411 1.1 ross }
5412 1.1 ross return 0;
5413 1.1 ross }
5414 1.1 ross aSign = extractFloat128Sign( a );
5415 1.1 ross bSign = extractFloat128Sign( b );
5416 1.1 ross if ( aSign != bSign ) {
5417 1.1 ross return
5418 1.1 ross aSign
5419 1.1 ross || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5420 1.1 ross == 0 );
5421 1.1 ross }
5422 1.1 ross return
5423 1.1 ross aSign ? le128( b.high, b.low, a.high, a.low )
5424 1.1 ross : le128( a.high, a.low, b.high, b.low );
5425 1.1 ross
5426 1.1 ross }
5427 1.1 ross
5428 1.1 ross /*
5429 1.1 ross -------------------------------------------------------------------------------
5430 1.1 ross Returns 1 if the quadruple-precision floating-point value `a' is less than
5431 1.1 ross the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5432 1.1 ross exception. Otherwise, the comparison is performed according to the IEC/IEEE
5433 1.1 ross Standard for Binary Floating-Point Arithmetic.
5434 1.1 ross -------------------------------------------------------------------------------
5435 1.1 ross */
5436 1.1 ross flag float128_lt_quiet( float128 a, float128 b )
5437 1.1 ross {
5438 1.1 ross flag aSign, bSign;
5439 1.1 ross
5440 1.1 ross if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5441 1.1 ross && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5442 1.1 ross || ( ( extractFloat128Exp( b ) == 0x7FFF )
5443 1.1 ross && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5444 1.1 ross ) {
5445 1.1 ross if ( float128_is_signaling_nan( a )
5446 1.1 ross || float128_is_signaling_nan( b ) ) {
5447 1.1 ross float_raise( float_flag_invalid );
5448 1.1 ross }
5449 1.1 ross return 0;
5450 1.1 ross }
5451 1.1 ross aSign = extractFloat128Sign( a );
5452 1.1 ross bSign = extractFloat128Sign( b );
5453 1.1 ross if ( aSign != bSign ) {
5454 1.1 ross return
5455 1.1 ross aSign
5456 1.1 ross && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5457 1.1 ross != 0 );
5458 1.1 ross }
5459 1.1 ross return
5460 1.1 ross aSign ? lt128( b.high, b.low, a.high, a.low )
5461 1.1 ross : lt128( a.high, a.low, b.high, b.low );
5462 1.1 ross
5463 1.1 ross }
5464 1.1 ross
5465 1.1 ross #endif
5466 1.1 ross
5467 1.1 ross
5468 1.1 ross #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5469 1.1 ross
5470 1.1 ross /*
5471 1.1 ross * These two routines are not part of the original softfloat distribution.
5472 1.1 ross *
5473 1.1 ross * They are based on the corresponding conversions to integer but return
5474 1.1 ross * unsigned numbers instead since these functions are required by GCC.
5475 1.1 ross *
5476 1.3 keihan * Added by Mark Brinicombe <mark (at) NetBSD.org> 27/09/97
5477 1.1 ross *
5478 1.1 ross * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5479 1.1 ross */
5480 1.1 ross
5481 1.1 ross /*
5482 1.1 ross -------------------------------------------------------------------------------
5483 1.1 ross Returns the result of converting the double-precision floating-point value
5484 1.1 ross `a' to the 32-bit unsigned integer format. The conversion is
5485 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-point
5486 1.1 ross Arithmetic, except that the conversion is always rounded toward zero. If
5487 1.1 ross `a' is a NaN, the largest positive integer is returned. If the conversion
5488 1.1 ross overflows, the largest integer positive is returned.
5489 1.1 ross -------------------------------------------------------------------------------
5490 1.1 ross */
5491 1.1 ross uint32 float64_to_uint32_round_to_zero( float64 a )
5492 1.1 ross {
5493 1.1 ross flag aSign;
5494 1.1 ross int16 aExp, shiftCount;
5495 1.1 ross bits64 aSig, savedASig;
5496 1.1 ross uint32 z;
5497 1.1 ross
5498 1.1 ross aSig = extractFloat64Frac( a );
5499 1.1 ross aExp = extractFloat64Exp( a );
5500 1.1 ross aSign = extractFloat64Sign( a );
5501 1.1 ross
5502 1.1 ross if (aSign) {
5503 1.1 ross float_raise( float_flag_invalid );
5504 1.1 ross return(0);
5505 1.1 ross }
5506 1.1 ross
5507 1.1 ross if ( 0x41E < aExp ) {
5508 1.1 ross float_raise( float_flag_invalid );
5509 1.1 ross return 0xffffffff;
5510 1.1 ross }
5511 1.1 ross else if ( aExp < 0x3FF ) {
5512 1.1 ross if ( aExp || aSig ) float_set_inexact();
5513 1.1 ross return 0;
5514 1.1 ross }
5515 1.1 ross aSig |= LIT64( 0x0010000000000000 );
5516 1.1 ross shiftCount = 0x433 - aExp;
5517 1.1 ross savedASig = aSig;
5518 1.1 ross aSig >>= shiftCount;
5519 1.1 ross z = aSig;
5520 1.1 ross if ( ( aSig<<shiftCount ) != savedASig ) {
5521 1.1 ross float_set_inexact();
5522 1.1 ross }
5523 1.1 ross return z;
5524 1.1 ross
5525 1.1 ross }
5526 1.1 ross
5527 1.1 ross /*
5528 1.1 ross -------------------------------------------------------------------------------
5529 1.1 ross Returns the result of converting the single-precision floating-point value
5530 1.1 ross `a' to the 32-bit unsigned integer format. The conversion is
5531 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-point
5532 1.1 ross Arithmetic, except that the conversion is always rounded toward zero. If
5533 1.1 ross `a' is a NaN, the largest positive integer is returned. If the conversion
5534 1.1 ross overflows, the largest positive integer is returned.
5535 1.1 ross -------------------------------------------------------------------------------
5536 1.1 ross */
5537 1.1 ross uint32 float32_to_uint32_round_to_zero( float32 a )
5538 1.1 ross {
5539 1.1 ross flag aSign;
5540 1.1 ross int16 aExp, shiftCount;
5541 1.1 ross bits32 aSig;
5542 1.1 ross uint32 z;
5543 1.1 ross
5544 1.1 ross aSig = extractFloat32Frac( a );
5545 1.1 ross aExp = extractFloat32Exp( a );
5546 1.1 ross aSign = extractFloat32Sign( a );
5547 1.1 ross shiftCount = aExp - 0x9E;
5548 1.1 ross
5549 1.1 ross if (aSign) {
5550 1.1 ross float_raise( float_flag_invalid );
5551 1.1 ross return(0);
5552 1.1 ross }
5553 1.1 ross if ( 0 < shiftCount ) {
5554 1.1 ross float_raise( float_flag_invalid );
5555 1.1 ross return 0xFFFFFFFF;
5556 1.1 ross }
5557 1.1 ross else if ( aExp <= 0x7E ) {
5558 1.1 ross if ( aExp | aSig ) float_set_inexact();
5559 1.1 ross return 0;
5560 1.1 ross }
5561 1.1 ross aSig = ( aSig | 0x800000 )<<8;
5562 1.1 ross z = aSig>>( - shiftCount );
5563 1.1 ross if ( aSig<<( shiftCount & 31 ) ) {
5564 1.1 ross float_set_inexact();
5565 1.1 ross }
5566 1.1 ross return z;
5567 1.1 ross
5568 1.1 ross }
5569 1.1 ross
5570 1.1 ross #endif
5571 1.2 thorpej
5572 1.2 thorpej #endif /* _STANDALONE */
5573