softfloat.c revision 1.4.94.1 1 1.4.94.1 cliff /* $NetBSD: softfloat.c,v 1.4.94.1 2011/02/05 06:00:14 cliff Exp $ */
2 1.1 ross
3 1.1 ross /*
4 1.1 ross * This version hacked for use with gcc -msoft-float by bjh21.
5 1.1 ross * (Mostly a case of #ifdefing out things GCC doesn't need or provides
6 1.1 ross * itself).
7 1.1 ross */
8 1.1 ross
9 1.1 ross /*
10 1.1 ross * Things you may want to define:
11 1.1 ross *
12 1.1 ross * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13 1.1 ross * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them
14 1.1 ross * properly renamed.
15 1.1 ross */
16 1.1 ross
17 1.1 ross /*
18 1.1 ross ===============================================================================
19 1.1 ross
20 1.1 ross This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 1.1 ross Arithmetic Package, Release 2a.
22 1.1 ross
23 1.1 ross Written by John R. Hauser. This work was made possible in part by the
24 1.1 ross International Computer Science Institute, located at Suite 600, 1947 Center
25 1.1 ross Street, Berkeley, California 94704. Funding was partially provided by the
26 1.1 ross National Science Foundation under grant MIP-9311980. The original version
27 1.1 ross of this code was written as part of a project to build a fixed-point vector
28 1.1 ross processor in collaboration with the University of California at Berkeley,
29 1.1 ross overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 1.1 ross is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 1.1 ross arithmetic/SoftFloat.html'.
32 1.1 ross
33 1.1 ross THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 1.1 ross has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 1.1 ross TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 1.1 ross PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 1.1 ross AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 1.1 ross
39 1.1 ross Derivative works are acceptable, even for commercial purposes, so long as
40 1.1 ross (1) they include prominent notice that the work is derivative, and (2) they
41 1.1 ross include prominent notice akin to these four paragraphs for those parts of
42 1.1 ross this code that are retained.
43 1.1 ross
44 1.1 ross ===============================================================================
45 1.1 ross */
46 1.1 ross
47 1.2 thorpej /* If you need this in a boot program, you have bigger problems... */
48 1.2 thorpej #ifndef _STANDALONE
49 1.2 thorpej
50 1.4.94.1 cliff #include "opt_multiprocessor.h"
51 1.4.94.1 cliff
52 1.1 ross #include <sys/cdefs.h>
53 1.1 ross #if defined(LIBC_SCCS) && !defined(lint)
54 1.4.94.1 cliff __RCSID("$NetBSD: softfloat.c,v 1.4.94.1 2011/02/05 06:00:14 cliff Exp $");
55 1.1 ross #endif /* LIBC_SCCS and not lint */
56 1.1 ross
57 1.1 ross #ifdef SOFTFLOAT_FOR_GCC
58 1.1 ross #include "softfloat-for-gcc.h"
59 1.1 ross #endif
60 1.1 ross
61 1.1 ross #include "milieu.h"
62 1.1 ross #include "softfloat.h"
63 1.1 ross
64 1.1 ross /*
65 1.1 ross * Conversions between floats as stored in memory and floats as
66 1.1 ross * SoftFloat uses them
67 1.1 ross */
68 1.1 ross #ifndef FLOAT64_DEMANGLE
69 1.1 ross #define FLOAT64_DEMANGLE(a) (a)
70 1.1 ross #endif
71 1.1 ross #ifndef FLOAT64_MANGLE
72 1.1 ross #define FLOAT64_MANGLE(a) (a)
73 1.1 ross #endif
74 1.1 ross
75 1.1 ross /*
76 1.1 ross -------------------------------------------------------------------------------
77 1.1 ross Floating-point rounding mode, extended double-precision rounding precision,
78 1.1 ross and exception flags.
79 1.1 ross -------------------------------------------------------------------------------
80 1.1 ross */
81 1.1 ross
82 1.1 ross /*
83 1.1 ross * XXX: This may cause options-MULTIPROCESSOR or thread problems someday.
84 1.1 ross * Right now, it does not. I've removed all other dynamic global
85 1.1 ross * variables. [ross]
86 1.1 ross */
87 1.1 ross #ifdef FLOATX80
88 1.1 ross int8 floatx80_rounding_precision = 80;
89 1.1 ross #endif
90 1.1 ross
91 1.1 ross /*
92 1.1 ross -------------------------------------------------------------------------------
93 1.1 ross Primitive arithmetic functions, including multi-word arithmetic, and
94 1.1 ross division and square root approximations. (Can be specialized to target if
95 1.1 ross desired.)
96 1.1 ross -------------------------------------------------------------------------------
97 1.1 ross */
98 1.1 ross #include "softfloat-macros.h"
99 1.1 ross
100 1.1 ross /*
101 1.1 ross -------------------------------------------------------------------------------
102 1.1 ross Functions and definitions to determine: (1) whether tininess for underflow
103 1.1 ross is detected before or after rounding by default, (2) what (if anything)
104 1.1 ross happens when exceptions are raised, (3) how signaling NaNs are distinguished
105 1.1 ross from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
106 1.1 ross are propagated from function inputs to output. These details are target-
107 1.1 ross specific.
108 1.1 ross -------------------------------------------------------------------------------
109 1.1 ross */
110 1.1 ross #include "softfloat-specialize.h"
111 1.1 ross
112 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not used */
113 1.1 ross /*
114 1.1 ross -------------------------------------------------------------------------------
115 1.1 ross Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
116 1.1 ross and 7, and returns the properly rounded 32-bit integer corresponding to the
117 1.1 ross input. If `zSign' is 1, the input is negated before being converted to an
118 1.1 ross integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
119 1.1 ross is simply rounded to an integer, with the inexact exception raised if the
120 1.1 ross input cannot be represented exactly as an integer. However, if the fixed-
121 1.1 ross point input is too large, the invalid exception is raised and the largest
122 1.1 ross positive or negative integer is returned.
123 1.1 ross -------------------------------------------------------------------------------
124 1.1 ross */
125 1.1 ross static int32 roundAndPackInt32( flag zSign, bits64 absZ )
126 1.1 ross {
127 1.1 ross int8 roundingMode;
128 1.1 ross flag roundNearestEven;
129 1.1 ross int8 roundIncrement, roundBits;
130 1.1 ross int32 z;
131 1.1 ross
132 1.1 ross roundingMode = float_rounding_mode();
133 1.1 ross roundNearestEven = ( roundingMode == float_round_nearest_even );
134 1.1 ross roundIncrement = 0x40;
135 1.1 ross if ( ! roundNearestEven ) {
136 1.1 ross if ( roundingMode == float_round_to_zero ) {
137 1.1 ross roundIncrement = 0;
138 1.1 ross }
139 1.1 ross else {
140 1.1 ross roundIncrement = 0x7F;
141 1.1 ross if ( zSign ) {
142 1.1 ross if ( roundingMode == float_round_up ) roundIncrement = 0;
143 1.1 ross }
144 1.1 ross else {
145 1.1 ross if ( roundingMode == float_round_down ) roundIncrement = 0;
146 1.1 ross }
147 1.1 ross }
148 1.1 ross }
149 1.1 ross roundBits = absZ & 0x7F;
150 1.1 ross absZ = ( absZ + roundIncrement )>>7;
151 1.1 ross absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
152 1.1 ross z = absZ;
153 1.1 ross if ( zSign ) z = - z;
154 1.1 ross if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
155 1.1 ross float_raise( float_flag_invalid );
156 1.1 ross return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
157 1.1 ross }
158 1.1 ross if ( roundBits ) float_set_inexact();
159 1.1 ross return z;
160 1.1 ross
161 1.1 ross }
162 1.1 ross
163 1.1 ross /*
164 1.1 ross -------------------------------------------------------------------------------
165 1.1 ross Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
166 1.1 ross `absZ1', with binary point between bits 63 and 64 (between the input words),
167 1.1 ross and returns the properly rounded 64-bit integer corresponding to the input.
168 1.1 ross If `zSign' is 1, the input is negated before being converted to an integer.
169 1.1 ross Ordinarily, the fixed-point input is simply rounded to an integer, with
170 1.1 ross the inexact exception raised if the input cannot be represented exactly as
171 1.1 ross an integer. However, if the fixed-point input is too large, the invalid
172 1.1 ross exception is raised and the largest positive or negative integer is
173 1.1 ross returned.
174 1.1 ross -------------------------------------------------------------------------------
175 1.1 ross */
176 1.1 ross static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
177 1.1 ross {
178 1.1 ross int8 roundingMode;
179 1.1 ross flag roundNearestEven, increment;
180 1.1 ross int64 z;
181 1.1 ross
182 1.1 ross roundingMode = float_rounding_mode();
183 1.1 ross roundNearestEven = ( roundingMode == float_round_nearest_even );
184 1.1 ross increment = ( (sbits64) absZ1 < 0 );
185 1.1 ross if ( ! roundNearestEven ) {
186 1.1 ross if ( roundingMode == float_round_to_zero ) {
187 1.1 ross increment = 0;
188 1.1 ross }
189 1.1 ross else {
190 1.1 ross if ( zSign ) {
191 1.1 ross increment = ( roundingMode == float_round_down ) && absZ1;
192 1.1 ross }
193 1.1 ross else {
194 1.1 ross increment = ( roundingMode == float_round_up ) && absZ1;
195 1.1 ross }
196 1.1 ross }
197 1.1 ross }
198 1.1 ross if ( increment ) {
199 1.1 ross ++absZ0;
200 1.1 ross if ( absZ0 == 0 ) goto overflow;
201 1.1 ross absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
202 1.1 ross }
203 1.1 ross z = absZ0;
204 1.1 ross if ( zSign ) z = - z;
205 1.1 ross if ( z && ( ( z < 0 ) ^ zSign ) ) {
206 1.1 ross overflow:
207 1.1 ross float_raise( float_flag_invalid );
208 1.1 ross return
209 1.1 ross zSign ? (sbits64) LIT64( 0x8000000000000000 )
210 1.1 ross : LIT64( 0x7FFFFFFFFFFFFFFF );
211 1.1 ross }
212 1.1 ross if ( absZ1 ) float_set_inexact();
213 1.1 ross return z;
214 1.1 ross
215 1.1 ross }
216 1.1 ross #endif
217 1.1 ross
218 1.1 ross /*
219 1.1 ross -------------------------------------------------------------------------------
220 1.1 ross Returns the fraction bits of the single-precision floating-point value `a'.
221 1.1 ross -------------------------------------------------------------------------------
222 1.1 ross */
223 1.1 ross INLINE bits32 extractFloat32Frac( float32 a )
224 1.1 ross {
225 1.1 ross
226 1.1 ross return a & 0x007FFFFF;
227 1.1 ross
228 1.1 ross }
229 1.1 ross
230 1.1 ross /*
231 1.1 ross -------------------------------------------------------------------------------
232 1.1 ross Returns the exponent bits of the single-precision floating-point value `a'.
233 1.1 ross -------------------------------------------------------------------------------
234 1.1 ross */
235 1.1 ross INLINE int16 extractFloat32Exp( float32 a )
236 1.1 ross {
237 1.1 ross
238 1.1 ross return ( a>>23 ) & 0xFF;
239 1.1 ross
240 1.1 ross }
241 1.1 ross
242 1.1 ross /*
243 1.1 ross -------------------------------------------------------------------------------
244 1.1 ross Returns the sign bit of the single-precision floating-point value `a'.
245 1.1 ross -------------------------------------------------------------------------------
246 1.1 ross */
247 1.1 ross INLINE flag extractFloat32Sign( float32 a )
248 1.1 ross {
249 1.1 ross
250 1.1 ross return a>>31;
251 1.1 ross
252 1.1 ross }
253 1.1 ross
254 1.1 ross /*
255 1.1 ross -------------------------------------------------------------------------------
256 1.1 ross Normalizes the subnormal single-precision floating-point value represented
257 1.1 ross by the denormalized significand `aSig'. The normalized exponent and
258 1.1 ross significand are stored at the locations pointed to by `zExpPtr' and
259 1.1 ross `zSigPtr', respectively.
260 1.1 ross -------------------------------------------------------------------------------
261 1.1 ross */
262 1.1 ross static void
263 1.1 ross normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
264 1.1 ross {
265 1.1 ross int8 shiftCount;
266 1.1 ross
267 1.1 ross shiftCount = countLeadingZeros32( aSig ) - 8;
268 1.1 ross *zSigPtr = aSig<<shiftCount;
269 1.1 ross *zExpPtr = 1 - shiftCount;
270 1.1 ross
271 1.1 ross }
272 1.1 ross
273 1.1 ross /*
274 1.1 ross -------------------------------------------------------------------------------
275 1.1 ross Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
276 1.1 ross single-precision floating-point value, returning the result. After being
277 1.1 ross shifted into the proper positions, the three fields are simply added
278 1.1 ross together to form the result. This means that any integer portion of `zSig'
279 1.1 ross will be added into the exponent. Since a properly normalized significand
280 1.1 ross will have an integer portion equal to 1, the `zExp' input should be 1 less
281 1.1 ross than the desired result exponent whenever `zSig' is a complete, normalized
282 1.1 ross significand.
283 1.1 ross -------------------------------------------------------------------------------
284 1.1 ross */
285 1.1 ross INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
286 1.1 ross {
287 1.1 ross
288 1.1 ross return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
289 1.1 ross
290 1.1 ross }
291 1.1 ross
292 1.1 ross /*
293 1.1 ross -------------------------------------------------------------------------------
294 1.1 ross Takes an abstract floating-point value having sign `zSign', exponent `zExp',
295 1.1 ross and significand `zSig', and returns the proper single-precision floating-
296 1.1 ross point value corresponding to the abstract input. Ordinarily, the abstract
297 1.1 ross value is simply rounded and packed into the single-precision format, with
298 1.1 ross the inexact exception raised if the abstract input cannot be represented
299 1.1 ross exactly. However, if the abstract value is too large, the overflow and
300 1.1 ross inexact exceptions are raised and an infinity or maximal finite value is
301 1.1 ross returned. If the abstract value is too small, the input value is rounded to
302 1.1 ross a subnormal number, and the underflow and inexact exceptions are raised if
303 1.1 ross the abstract input cannot be represented exactly as a subnormal single-
304 1.1 ross precision floating-point number.
305 1.1 ross The input significand `zSig' has its binary point between bits 30
306 1.1 ross and 29, which is 7 bits to the left of the usual location. This shifted
307 1.1 ross significand must be normalized or smaller. If `zSig' is not normalized,
308 1.1 ross `zExp' must be 0; in that case, the result returned is a subnormal number,
309 1.1 ross and it must not require rounding. In the usual case that `zSig' is
310 1.1 ross normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
311 1.1 ross The handling of underflow and overflow follows the IEC/IEEE Standard for
312 1.1 ross Binary Floating-Point Arithmetic.
313 1.1 ross -------------------------------------------------------------------------------
314 1.1 ross */
315 1.1 ross static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
316 1.1 ross {
317 1.1 ross int8 roundingMode;
318 1.1 ross flag roundNearestEven;
319 1.1 ross int8 roundIncrement, roundBits;
320 1.1 ross flag isTiny;
321 1.1 ross
322 1.1 ross roundingMode = float_rounding_mode();
323 1.1 ross roundNearestEven = ( roundingMode == float_round_nearest_even );
324 1.1 ross roundIncrement = 0x40;
325 1.1 ross if ( ! roundNearestEven ) {
326 1.1 ross if ( roundingMode == float_round_to_zero ) {
327 1.1 ross roundIncrement = 0;
328 1.1 ross }
329 1.1 ross else {
330 1.1 ross roundIncrement = 0x7F;
331 1.1 ross if ( zSign ) {
332 1.1 ross if ( roundingMode == float_round_up ) roundIncrement = 0;
333 1.1 ross }
334 1.1 ross else {
335 1.1 ross if ( roundingMode == float_round_down ) roundIncrement = 0;
336 1.1 ross }
337 1.1 ross }
338 1.1 ross }
339 1.1 ross roundBits = zSig & 0x7F;
340 1.1 ross if ( 0xFD <= (bits16) zExp ) {
341 1.1 ross if ( ( 0xFD < zExp )
342 1.1 ross || ( ( zExp == 0xFD )
343 1.1 ross && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
344 1.1 ross ) {
345 1.1 ross float_raise( float_flag_overflow | float_flag_inexact );
346 1.1 ross return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
347 1.1 ross }
348 1.1 ross if ( zExp < 0 ) {
349 1.1 ross isTiny =
350 1.1 ross ( float_detect_tininess == float_tininess_before_rounding )
351 1.1 ross || ( zExp < -1 )
352 1.1 ross || ( zSig + roundIncrement < 0x80000000 );
353 1.1 ross shift32RightJamming( zSig, - zExp, &zSig );
354 1.1 ross zExp = 0;
355 1.1 ross roundBits = zSig & 0x7F;
356 1.1 ross if ( isTiny && roundBits ) float_raise( float_flag_underflow );
357 1.1 ross }
358 1.1 ross }
359 1.1 ross if ( roundBits ) float_set_inexact();
360 1.1 ross zSig = ( zSig + roundIncrement )>>7;
361 1.1 ross zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
362 1.1 ross if ( zSig == 0 ) zExp = 0;
363 1.1 ross return packFloat32( zSign, zExp, zSig );
364 1.1 ross
365 1.1 ross }
366 1.1 ross
367 1.1 ross /*
368 1.1 ross -------------------------------------------------------------------------------
369 1.1 ross Takes an abstract floating-point value having sign `zSign', exponent `zExp',
370 1.1 ross and significand `zSig', and returns the proper single-precision floating-
371 1.1 ross point value corresponding to the abstract input. This routine is just like
372 1.1 ross `roundAndPackFloat32' except that `zSig' does not have to be normalized.
373 1.1 ross Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
374 1.1 ross floating-point exponent.
375 1.1 ross -------------------------------------------------------------------------------
376 1.1 ross */
377 1.1 ross static float32
378 1.1 ross normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
379 1.1 ross {
380 1.1 ross int8 shiftCount;
381 1.1 ross
382 1.1 ross shiftCount = countLeadingZeros32( zSig ) - 1;
383 1.1 ross return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
384 1.1 ross
385 1.1 ross }
386 1.1 ross
387 1.1 ross /*
388 1.1 ross -------------------------------------------------------------------------------
389 1.1 ross Returns the fraction bits of the double-precision floating-point value `a'.
390 1.1 ross -------------------------------------------------------------------------------
391 1.1 ross */
392 1.1 ross INLINE bits64 extractFloat64Frac( float64 a )
393 1.1 ross {
394 1.1 ross
395 1.1 ross return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
396 1.1 ross
397 1.1 ross }
398 1.1 ross
399 1.1 ross /*
400 1.1 ross -------------------------------------------------------------------------------
401 1.1 ross Returns the exponent bits of the double-precision floating-point value `a'.
402 1.1 ross -------------------------------------------------------------------------------
403 1.1 ross */
404 1.1 ross INLINE int16 extractFloat64Exp( float64 a )
405 1.1 ross {
406 1.1 ross
407 1.1 ross return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
408 1.1 ross
409 1.1 ross }
410 1.1 ross
411 1.1 ross /*
412 1.1 ross -------------------------------------------------------------------------------
413 1.1 ross Returns the sign bit of the double-precision floating-point value `a'.
414 1.1 ross -------------------------------------------------------------------------------
415 1.1 ross */
416 1.1 ross INLINE flag extractFloat64Sign( float64 a )
417 1.1 ross {
418 1.1 ross
419 1.1 ross return FLOAT64_DEMANGLE(a)>>63;
420 1.1 ross
421 1.1 ross }
422 1.1 ross
423 1.1 ross /*
424 1.1 ross -------------------------------------------------------------------------------
425 1.1 ross Normalizes the subnormal double-precision floating-point value represented
426 1.1 ross by the denormalized significand `aSig'. The normalized exponent and
427 1.1 ross significand are stored at the locations pointed to by `zExpPtr' and
428 1.1 ross `zSigPtr', respectively.
429 1.1 ross -------------------------------------------------------------------------------
430 1.1 ross */
431 1.1 ross static void
432 1.1 ross normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
433 1.1 ross {
434 1.1 ross int8 shiftCount;
435 1.1 ross
436 1.1 ross shiftCount = countLeadingZeros64( aSig ) - 11;
437 1.1 ross *zSigPtr = aSig<<shiftCount;
438 1.1 ross *zExpPtr = 1 - shiftCount;
439 1.1 ross
440 1.1 ross }
441 1.1 ross
442 1.1 ross /*
443 1.1 ross -------------------------------------------------------------------------------
444 1.1 ross Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
445 1.1 ross double-precision floating-point value, returning the result. After being
446 1.1 ross shifted into the proper positions, the three fields are simply added
447 1.1 ross together to form the result. This means that any integer portion of `zSig'
448 1.1 ross will be added into the exponent. Since a properly normalized significand
449 1.1 ross will have an integer portion equal to 1, the `zExp' input should be 1 less
450 1.1 ross than the desired result exponent whenever `zSig' is a complete, normalized
451 1.1 ross significand.
452 1.1 ross -------------------------------------------------------------------------------
453 1.1 ross */
454 1.1 ross INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
455 1.1 ross {
456 1.1 ross
457 1.1 ross return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
458 1.1 ross ( ( (bits64) zExp )<<52 ) + zSig );
459 1.1 ross
460 1.1 ross }
461 1.1 ross
462 1.1 ross /*
463 1.1 ross -------------------------------------------------------------------------------
464 1.1 ross Takes an abstract floating-point value having sign `zSign', exponent `zExp',
465 1.1 ross and significand `zSig', and returns the proper double-precision floating-
466 1.1 ross point value corresponding to the abstract input. Ordinarily, the abstract
467 1.1 ross value is simply rounded and packed into the double-precision format, with
468 1.1 ross the inexact exception raised if the abstract input cannot be represented
469 1.1 ross exactly. However, if the abstract value is too large, the overflow and
470 1.1 ross inexact exceptions are raised and an infinity or maximal finite value is
471 1.1 ross returned. If the abstract value is too small, the input value is rounded to
472 1.1 ross a subnormal number, and the underflow and inexact exceptions are raised if
473 1.1 ross the abstract input cannot be represented exactly as a subnormal double-
474 1.1 ross precision floating-point number.
475 1.1 ross The input significand `zSig' has its binary point between bits 62
476 1.1 ross and 61, which is 10 bits to the left of the usual location. This shifted
477 1.1 ross significand must be normalized or smaller. If `zSig' is not normalized,
478 1.1 ross `zExp' must be 0; in that case, the result returned is a subnormal number,
479 1.1 ross and it must not require rounding. In the usual case that `zSig' is
480 1.1 ross normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
481 1.1 ross The handling of underflow and overflow follows the IEC/IEEE Standard for
482 1.1 ross Binary Floating-Point Arithmetic.
483 1.1 ross -------------------------------------------------------------------------------
484 1.1 ross */
485 1.1 ross static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
486 1.1 ross {
487 1.1 ross int8 roundingMode;
488 1.1 ross flag roundNearestEven;
489 1.1 ross int16 roundIncrement, roundBits;
490 1.1 ross flag isTiny;
491 1.1 ross
492 1.1 ross roundingMode = float_rounding_mode();
493 1.1 ross roundNearestEven = ( roundingMode == float_round_nearest_even );
494 1.1 ross roundIncrement = 0x200;
495 1.1 ross if ( ! roundNearestEven ) {
496 1.1 ross if ( roundingMode == float_round_to_zero ) {
497 1.1 ross roundIncrement = 0;
498 1.1 ross }
499 1.1 ross else {
500 1.1 ross roundIncrement = 0x3FF;
501 1.1 ross if ( zSign ) {
502 1.1 ross if ( roundingMode == float_round_up ) roundIncrement = 0;
503 1.1 ross }
504 1.1 ross else {
505 1.1 ross if ( roundingMode == float_round_down ) roundIncrement = 0;
506 1.1 ross }
507 1.1 ross }
508 1.1 ross }
509 1.1 ross roundBits = zSig & 0x3FF;
510 1.1 ross if ( 0x7FD <= (bits16) zExp ) {
511 1.1 ross if ( ( 0x7FD < zExp )
512 1.1 ross || ( ( zExp == 0x7FD )
513 1.1 ross && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
514 1.1 ross ) {
515 1.1 ross float_raise( float_flag_overflow | float_flag_inexact );
516 1.1 ross return FLOAT64_MANGLE(
517 1.1 ross FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
518 1.1 ross ( roundIncrement == 0 ));
519 1.1 ross }
520 1.1 ross if ( zExp < 0 ) {
521 1.1 ross isTiny =
522 1.1 ross ( float_detect_tininess == float_tininess_before_rounding )
523 1.1 ross || ( zExp < -1 )
524 1.1 ross || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
525 1.1 ross shift64RightJamming( zSig, - zExp, &zSig );
526 1.1 ross zExp = 0;
527 1.1 ross roundBits = zSig & 0x3FF;
528 1.1 ross if ( isTiny && roundBits ) float_raise( float_flag_underflow );
529 1.1 ross }
530 1.1 ross }
531 1.1 ross if ( roundBits ) float_set_inexact();
532 1.1 ross zSig = ( zSig + roundIncrement )>>10;
533 1.1 ross zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
534 1.1 ross if ( zSig == 0 ) zExp = 0;
535 1.1 ross return packFloat64( zSign, zExp, zSig );
536 1.1 ross
537 1.1 ross }
538 1.1 ross
539 1.1 ross /*
540 1.1 ross -------------------------------------------------------------------------------
541 1.1 ross Takes an abstract floating-point value having sign `zSign', exponent `zExp',
542 1.1 ross and significand `zSig', and returns the proper double-precision floating-
543 1.1 ross point value corresponding to the abstract input. This routine is just like
544 1.1 ross `roundAndPackFloat64' except that `zSig' does not have to be normalized.
545 1.1 ross Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
546 1.1 ross floating-point exponent.
547 1.1 ross -------------------------------------------------------------------------------
548 1.1 ross */
549 1.1 ross static float64
550 1.1 ross normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
551 1.1 ross {
552 1.1 ross int8 shiftCount;
553 1.1 ross
554 1.1 ross shiftCount = countLeadingZeros64( zSig ) - 1;
555 1.1 ross return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
556 1.1 ross
557 1.1 ross }
558 1.1 ross
559 1.1 ross #ifdef FLOATX80
560 1.1 ross
561 1.1 ross /*
562 1.1 ross -------------------------------------------------------------------------------
563 1.1 ross Returns the fraction bits of the extended double-precision floating-point
564 1.1 ross value `a'.
565 1.1 ross -------------------------------------------------------------------------------
566 1.1 ross */
567 1.1 ross INLINE bits64 extractFloatx80Frac( floatx80 a )
568 1.1 ross {
569 1.1 ross
570 1.1 ross return a.low;
571 1.1 ross
572 1.1 ross }
573 1.1 ross
574 1.1 ross /*
575 1.1 ross -------------------------------------------------------------------------------
576 1.1 ross Returns the exponent bits of the extended double-precision floating-point
577 1.1 ross value `a'.
578 1.1 ross -------------------------------------------------------------------------------
579 1.1 ross */
580 1.1 ross INLINE int32 extractFloatx80Exp( floatx80 a )
581 1.1 ross {
582 1.1 ross
583 1.1 ross return a.high & 0x7FFF;
584 1.1 ross
585 1.1 ross }
586 1.1 ross
587 1.1 ross /*
588 1.1 ross -------------------------------------------------------------------------------
589 1.1 ross Returns the sign bit of the extended double-precision floating-point value
590 1.1 ross `a'.
591 1.1 ross -------------------------------------------------------------------------------
592 1.1 ross */
593 1.1 ross INLINE flag extractFloatx80Sign( floatx80 a )
594 1.1 ross {
595 1.1 ross
596 1.1 ross return a.high>>15;
597 1.1 ross
598 1.1 ross }
599 1.1 ross
600 1.1 ross /*
601 1.1 ross -------------------------------------------------------------------------------
602 1.1 ross Normalizes the subnormal extended double-precision floating-point value
603 1.1 ross represented by the denormalized significand `aSig'. The normalized exponent
604 1.1 ross and significand are stored at the locations pointed to by `zExpPtr' and
605 1.1 ross `zSigPtr', respectively.
606 1.1 ross -------------------------------------------------------------------------------
607 1.1 ross */
608 1.1 ross static void
609 1.1 ross normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
610 1.1 ross {
611 1.1 ross int8 shiftCount;
612 1.1 ross
613 1.1 ross shiftCount = countLeadingZeros64( aSig );
614 1.1 ross *zSigPtr = aSig<<shiftCount;
615 1.1 ross *zExpPtr = 1 - shiftCount;
616 1.1 ross
617 1.1 ross }
618 1.1 ross
619 1.1 ross /*
620 1.1 ross -------------------------------------------------------------------------------
621 1.1 ross Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
622 1.1 ross extended double-precision floating-point value, returning the result.
623 1.1 ross -------------------------------------------------------------------------------
624 1.1 ross */
625 1.1 ross INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
626 1.1 ross {
627 1.1 ross floatx80 z;
628 1.1 ross
629 1.1 ross z.low = zSig;
630 1.1 ross z.high = ( ( (bits16) zSign )<<15 ) + zExp;
631 1.1 ross return z;
632 1.1 ross
633 1.1 ross }
634 1.1 ross
635 1.1 ross /*
636 1.1 ross -------------------------------------------------------------------------------
637 1.1 ross Takes an abstract floating-point value having sign `zSign', exponent `zExp',
638 1.1 ross and extended significand formed by the concatenation of `zSig0' and `zSig1',
639 1.1 ross and returns the proper extended double-precision floating-point value
640 1.1 ross corresponding to the abstract input. Ordinarily, the abstract value is
641 1.1 ross rounded and packed into the extended double-precision format, with the
642 1.1 ross inexact exception raised if the abstract input cannot be represented
643 1.1 ross exactly. However, if the abstract value is too large, the overflow and
644 1.1 ross inexact exceptions are raised and an infinity or maximal finite value is
645 1.1 ross returned. If the abstract value is too small, the input value is rounded to
646 1.1 ross a subnormal number, and the underflow and inexact exceptions are raised if
647 1.1 ross the abstract input cannot be represented exactly as a subnormal extended
648 1.1 ross double-precision floating-point number.
649 1.1 ross If `roundingPrecision' is 32 or 64, the result is rounded to the same
650 1.1 ross number of bits as single or double precision, respectively. Otherwise, the
651 1.1 ross result is rounded to the full precision of the extended double-precision
652 1.1 ross format.
653 1.1 ross The input significand must be normalized or smaller. If the input
654 1.1 ross significand is not normalized, `zExp' must be 0; in that case, the result
655 1.1 ross returned is a subnormal number, and it must not require rounding. The
656 1.1 ross handling of underflow and overflow follows the IEC/IEEE Standard for Binary
657 1.1 ross Floating-Point Arithmetic.
658 1.1 ross -------------------------------------------------------------------------------
659 1.1 ross */
660 1.1 ross static floatx80
661 1.1 ross roundAndPackFloatx80(
662 1.1 ross int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
663 1.1 ross )
664 1.1 ross {
665 1.1 ross int8 roundingMode;
666 1.1 ross flag roundNearestEven, increment, isTiny;
667 1.1 ross int64 roundIncrement, roundMask, roundBits;
668 1.1 ross
669 1.1 ross roundingMode = float_rounding_mode();
670 1.1 ross roundNearestEven = ( roundingMode == float_round_nearest_even );
671 1.1 ross if ( roundingPrecision == 80 ) goto precision80;
672 1.1 ross if ( roundingPrecision == 64 ) {
673 1.1 ross roundIncrement = LIT64( 0x0000000000000400 );
674 1.1 ross roundMask = LIT64( 0x00000000000007FF );
675 1.1 ross }
676 1.1 ross else if ( roundingPrecision == 32 ) {
677 1.1 ross roundIncrement = LIT64( 0x0000008000000000 );
678 1.1 ross roundMask = LIT64( 0x000000FFFFFFFFFF );
679 1.1 ross }
680 1.1 ross else {
681 1.1 ross goto precision80;
682 1.1 ross }
683 1.1 ross zSig0 |= ( zSig1 != 0 );
684 1.1 ross if ( ! roundNearestEven ) {
685 1.1 ross if ( roundingMode == float_round_to_zero ) {
686 1.1 ross roundIncrement = 0;
687 1.1 ross }
688 1.1 ross else {
689 1.1 ross roundIncrement = roundMask;
690 1.1 ross if ( zSign ) {
691 1.1 ross if ( roundingMode == float_round_up ) roundIncrement = 0;
692 1.1 ross }
693 1.1 ross else {
694 1.1 ross if ( roundingMode == float_round_down ) roundIncrement = 0;
695 1.1 ross }
696 1.1 ross }
697 1.1 ross }
698 1.1 ross roundBits = zSig0 & roundMask;
699 1.1 ross if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
700 1.1 ross if ( ( 0x7FFE < zExp )
701 1.1 ross || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
702 1.1 ross ) {
703 1.1 ross goto overflow;
704 1.1 ross }
705 1.1 ross if ( zExp <= 0 ) {
706 1.1 ross isTiny =
707 1.1 ross ( float_detect_tininess == float_tininess_before_rounding )
708 1.1 ross || ( zExp < 0 )
709 1.1 ross || ( zSig0 <= zSig0 + roundIncrement );
710 1.1 ross shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
711 1.1 ross zExp = 0;
712 1.1 ross roundBits = zSig0 & roundMask;
713 1.1 ross if ( isTiny && roundBits ) float_raise( float_flag_underflow );
714 1.1 ross if ( roundBits ) float_set_inexact();
715 1.1 ross zSig0 += roundIncrement;
716 1.1 ross if ( (sbits64) zSig0 < 0 ) zExp = 1;
717 1.1 ross roundIncrement = roundMask + 1;
718 1.1 ross if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
719 1.1 ross roundMask |= roundIncrement;
720 1.1 ross }
721 1.1 ross zSig0 &= ~ roundMask;
722 1.1 ross return packFloatx80( zSign, zExp, zSig0 );
723 1.1 ross }
724 1.1 ross }
725 1.1 ross if ( roundBits ) float_set_inexact();
726 1.1 ross zSig0 += roundIncrement;
727 1.1 ross if ( zSig0 < roundIncrement ) {
728 1.1 ross ++zExp;
729 1.1 ross zSig0 = LIT64( 0x8000000000000000 );
730 1.1 ross }
731 1.1 ross roundIncrement = roundMask + 1;
732 1.1 ross if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
733 1.1 ross roundMask |= roundIncrement;
734 1.1 ross }
735 1.1 ross zSig0 &= ~ roundMask;
736 1.1 ross if ( zSig0 == 0 ) zExp = 0;
737 1.1 ross return packFloatx80( zSign, zExp, zSig0 );
738 1.1 ross precision80:
739 1.1 ross increment = ( (sbits64) zSig1 < 0 );
740 1.1 ross if ( ! roundNearestEven ) {
741 1.1 ross if ( roundingMode == float_round_to_zero ) {
742 1.1 ross increment = 0;
743 1.1 ross }
744 1.1 ross else {
745 1.1 ross if ( zSign ) {
746 1.1 ross increment = ( roundingMode == float_round_down ) && zSig1;
747 1.1 ross }
748 1.1 ross else {
749 1.1 ross increment = ( roundingMode == float_round_up ) && zSig1;
750 1.1 ross }
751 1.1 ross }
752 1.1 ross }
753 1.1 ross if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
754 1.1 ross if ( ( 0x7FFE < zExp )
755 1.1 ross || ( ( zExp == 0x7FFE )
756 1.1 ross && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
757 1.1 ross && increment
758 1.1 ross )
759 1.1 ross ) {
760 1.1 ross roundMask = 0;
761 1.1 ross overflow:
762 1.1 ross float_raise( float_flag_overflow | float_flag_inexact );
763 1.1 ross if ( ( roundingMode == float_round_to_zero )
764 1.1 ross || ( zSign && ( roundingMode == float_round_up ) )
765 1.1 ross || ( ! zSign && ( roundingMode == float_round_down ) )
766 1.1 ross ) {
767 1.1 ross return packFloatx80( zSign, 0x7FFE, ~ roundMask );
768 1.1 ross }
769 1.1 ross return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
770 1.1 ross }
771 1.1 ross if ( zExp <= 0 ) {
772 1.1 ross isTiny =
773 1.1 ross ( float_detect_tininess == float_tininess_before_rounding )
774 1.1 ross || ( zExp < 0 )
775 1.1 ross || ! increment
776 1.1 ross || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
777 1.1 ross shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
778 1.1 ross zExp = 0;
779 1.1 ross if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
780 1.1 ross if ( zSig1 ) float_set_inexact();
781 1.1 ross if ( roundNearestEven ) {
782 1.1 ross increment = ( (sbits64) zSig1 < 0 );
783 1.1 ross }
784 1.1 ross else {
785 1.1 ross if ( zSign ) {
786 1.1 ross increment = ( roundingMode == float_round_down ) && zSig1;
787 1.1 ross }
788 1.1 ross else {
789 1.1 ross increment = ( roundingMode == float_round_up ) && zSig1;
790 1.1 ross }
791 1.1 ross }
792 1.1 ross if ( increment ) {
793 1.1 ross ++zSig0;
794 1.1 ross zSig0 &=
795 1.1 ross ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
796 1.1 ross if ( (sbits64) zSig0 < 0 ) zExp = 1;
797 1.1 ross }
798 1.1 ross return packFloatx80( zSign, zExp, zSig0 );
799 1.1 ross }
800 1.1 ross }
801 1.1 ross if ( zSig1 ) float_set_inexact();
802 1.1 ross if ( increment ) {
803 1.1 ross ++zSig0;
804 1.1 ross if ( zSig0 == 0 ) {
805 1.1 ross ++zExp;
806 1.1 ross zSig0 = LIT64( 0x8000000000000000 );
807 1.1 ross }
808 1.1 ross else {
809 1.1 ross zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
810 1.1 ross }
811 1.1 ross }
812 1.1 ross else {
813 1.1 ross if ( zSig0 == 0 ) zExp = 0;
814 1.1 ross }
815 1.1 ross return packFloatx80( zSign, zExp, zSig0 );
816 1.1 ross
817 1.1 ross }
818 1.1 ross
819 1.1 ross /*
820 1.1 ross -------------------------------------------------------------------------------
821 1.1 ross Takes an abstract floating-point value having sign `zSign', exponent
822 1.1 ross `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
823 1.1 ross and returns the proper extended double-precision floating-point value
824 1.1 ross corresponding to the abstract input. This routine is just like
825 1.1 ross `roundAndPackFloatx80' except that the input significand does not have to be
826 1.1 ross normalized.
827 1.1 ross -------------------------------------------------------------------------------
828 1.1 ross */
829 1.1 ross static floatx80
830 1.1 ross normalizeRoundAndPackFloatx80(
831 1.1 ross int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
832 1.1 ross )
833 1.1 ross {
834 1.1 ross int8 shiftCount;
835 1.1 ross
836 1.1 ross if ( zSig0 == 0 ) {
837 1.1 ross zSig0 = zSig1;
838 1.1 ross zSig1 = 0;
839 1.1 ross zExp -= 64;
840 1.1 ross }
841 1.1 ross shiftCount = countLeadingZeros64( zSig0 );
842 1.1 ross shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
843 1.1 ross zExp -= shiftCount;
844 1.1 ross return
845 1.1 ross roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
846 1.1 ross
847 1.1 ross }
848 1.1 ross
849 1.1 ross #endif
850 1.1 ross
851 1.1 ross #ifdef FLOAT128
852 1.1 ross
853 1.1 ross /*
854 1.1 ross -------------------------------------------------------------------------------
855 1.1 ross Returns the least-significant 64 fraction bits of the quadruple-precision
856 1.1 ross floating-point value `a'.
857 1.1 ross -------------------------------------------------------------------------------
858 1.1 ross */
859 1.1 ross INLINE bits64 extractFloat128Frac1( float128 a )
860 1.1 ross {
861 1.1 ross
862 1.1 ross return a.low;
863 1.1 ross
864 1.1 ross }
865 1.1 ross
866 1.1 ross /*
867 1.1 ross -------------------------------------------------------------------------------
868 1.1 ross Returns the most-significant 48 fraction bits of the quadruple-precision
869 1.1 ross floating-point value `a'.
870 1.1 ross -------------------------------------------------------------------------------
871 1.1 ross */
872 1.1 ross INLINE bits64 extractFloat128Frac0( float128 a )
873 1.1 ross {
874 1.1 ross
875 1.1 ross return a.high & LIT64( 0x0000FFFFFFFFFFFF );
876 1.1 ross
877 1.1 ross }
878 1.1 ross
879 1.1 ross /*
880 1.1 ross -------------------------------------------------------------------------------
881 1.1 ross Returns the exponent bits of the quadruple-precision floating-point value
882 1.1 ross `a'.
883 1.1 ross -------------------------------------------------------------------------------
884 1.1 ross */
885 1.1 ross INLINE int32 extractFloat128Exp( float128 a )
886 1.1 ross {
887 1.1 ross
888 1.1 ross return ( a.high>>48 ) & 0x7FFF;
889 1.1 ross
890 1.1 ross }
891 1.1 ross
892 1.1 ross /*
893 1.1 ross -------------------------------------------------------------------------------
894 1.1 ross Returns the sign bit of the quadruple-precision floating-point value `a'.
895 1.1 ross -------------------------------------------------------------------------------
896 1.1 ross */
897 1.1 ross INLINE flag extractFloat128Sign( float128 a )
898 1.1 ross {
899 1.1 ross
900 1.1 ross return a.high>>63;
901 1.1 ross
902 1.1 ross }
903 1.1 ross
904 1.1 ross /*
905 1.1 ross -------------------------------------------------------------------------------
906 1.1 ross Normalizes the subnormal quadruple-precision floating-point value
907 1.1 ross represented by the denormalized significand formed by the concatenation of
908 1.1 ross `aSig0' and `aSig1'. The normalized exponent is stored at the location
909 1.1 ross pointed to by `zExpPtr'. The most significant 49 bits of the normalized
910 1.1 ross significand are stored at the location pointed to by `zSig0Ptr', and the
911 1.1 ross least significant 64 bits of the normalized significand are stored at the
912 1.1 ross location pointed to by `zSig1Ptr'.
913 1.1 ross -------------------------------------------------------------------------------
914 1.1 ross */
915 1.1 ross static void
916 1.1 ross normalizeFloat128Subnormal(
917 1.1 ross bits64 aSig0,
918 1.1 ross bits64 aSig1,
919 1.1 ross int32 *zExpPtr,
920 1.1 ross bits64 *zSig0Ptr,
921 1.1 ross bits64 *zSig1Ptr
922 1.1 ross )
923 1.1 ross {
924 1.1 ross int8 shiftCount;
925 1.1 ross
926 1.1 ross if ( aSig0 == 0 ) {
927 1.1 ross shiftCount = countLeadingZeros64( aSig1 ) - 15;
928 1.1 ross if ( shiftCount < 0 ) {
929 1.1 ross *zSig0Ptr = aSig1>>( - shiftCount );
930 1.1 ross *zSig1Ptr = aSig1<<( shiftCount & 63 );
931 1.1 ross }
932 1.1 ross else {
933 1.1 ross *zSig0Ptr = aSig1<<shiftCount;
934 1.1 ross *zSig1Ptr = 0;
935 1.1 ross }
936 1.1 ross *zExpPtr = - shiftCount - 63;
937 1.1 ross }
938 1.1 ross else {
939 1.1 ross shiftCount = countLeadingZeros64( aSig0 ) - 15;
940 1.1 ross shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
941 1.1 ross *zExpPtr = 1 - shiftCount;
942 1.1 ross }
943 1.1 ross
944 1.1 ross }
945 1.1 ross
946 1.1 ross /*
947 1.1 ross -------------------------------------------------------------------------------
948 1.1 ross Packs the sign `zSign', the exponent `zExp', and the significand formed
949 1.1 ross by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
950 1.1 ross floating-point value, returning the result. After being shifted into the
951 1.1 ross proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
952 1.1 ross added together to form the most significant 32 bits of the result. This
953 1.1 ross means that any integer portion of `zSig0' will be added into the exponent.
954 1.1 ross Since a properly normalized significand will have an integer portion equal
955 1.1 ross to 1, the `zExp' input should be 1 less than the desired result exponent
956 1.1 ross whenever `zSig0' and `zSig1' concatenated form a complete, normalized
957 1.1 ross significand.
958 1.1 ross -------------------------------------------------------------------------------
959 1.1 ross */
960 1.1 ross INLINE float128
961 1.1 ross packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
962 1.1 ross {
963 1.1 ross float128 z;
964 1.1 ross
965 1.1 ross z.low = zSig1;
966 1.1 ross z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
967 1.1 ross return z;
968 1.1 ross
969 1.1 ross }
970 1.1 ross
971 1.1 ross /*
972 1.1 ross -------------------------------------------------------------------------------
973 1.1 ross Takes an abstract floating-point value having sign `zSign', exponent `zExp',
974 1.1 ross and extended significand formed by the concatenation of `zSig0', `zSig1',
975 1.1 ross and `zSig2', and returns the proper quadruple-precision floating-point value
976 1.1 ross corresponding to the abstract input. Ordinarily, the abstract value is
977 1.1 ross simply rounded and packed into the quadruple-precision format, with the
978 1.1 ross inexact exception raised if the abstract input cannot be represented
979 1.1 ross exactly. However, if the abstract value is too large, the overflow and
980 1.1 ross inexact exceptions are raised and an infinity or maximal finite value is
981 1.1 ross returned. If the abstract value is too small, the input value is rounded to
982 1.1 ross a subnormal number, and the underflow and inexact exceptions are raised if
983 1.1 ross the abstract input cannot be represented exactly as a subnormal quadruple-
984 1.1 ross precision floating-point number.
985 1.1 ross The input significand must be normalized or smaller. If the input
986 1.1 ross significand is not normalized, `zExp' must be 0; in that case, the result
987 1.1 ross returned is a subnormal number, and it must not require rounding. In the
988 1.1 ross usual case that the input significand is normalized, `zExp' must be 1 less
989 1.1 ross than the ``true'' floating-point exponent. The handling of underflow and
990 1.1 ross overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
991 1.1 ross -------------------------------------------------------------------------------
992 1.1 ross */
993 1.1 ross static float128
994 1.1 ross roundAndPackFloat128(
995 1.1 ross flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
996 1.1 ross {
997 1.1 ross int8 roundingMode;
998 1.1 ross flag roundNearestEven, increment, isTiny;
999 1.1 ross
1000 1.1 ross roundingMode = float_rounding_mode();
1001 1.1 ross roundNearestEven = ( roundingMode == float_round_nearest_even );
1002 1.1 ross increment = ( (sbits64) zSig2 < 0 );
1003 1.1 ross if ( ! roundNearestEven ) {
1004 1.1 ross if ( roundingMode == float_round_to_zero ) {
1005 1.1 ross increment = 0;
1006 1.1 ross }
1007 1.1 ross else {
1008 1.1 ross if ( zSign ) {
1009 1.1 ross increment = ( roundingMode == float_round_down ) && zSig2;
1010 1.1 ross }
1011 1.1 ross else {
1012 1.1 ross increment = ( roundingMode == float_round_up ) && zSig2;
1013 1.1 ross }
1014 1.1 ross }
1015 1.1 ross }
1016 1.1 ross if ( 0x7FFD <= (bits32) zExp ) {
1017 1.1 ross if ( ( 0x7FFD < zExp )
1018 1.1 ross || ( ( zExp == 0x7FFD )
1019 1.1 ross && eq128(
1020 1.1 ross LIT64( 0x0001FFFFFFFFFFFF ),
1021 1.1 ross LIT64( 0xFFFFFFFFFFFFFFFF ),
1022 1.1 ross zSig0,
1023 1.1 ross zSig1
1024 1.1 ross )
1025 1.1 ross && increment
1026 1.1 ross )
1027 1.1 ross ) {
1028 1.1 ross float_raise( float_flag_overflow | float_flag_inexact );
1029 1.1 ross if ( ( roundingMode == float_round_to_zero )
1030 1.1 ross || ( zSign && ( roundingMode == float_round_up ) )
1031 1.1 ross || ( ! zSign && ( roundingMode == float_round_down ) )
1032 1.1 ross ) {
1033 1.1 ross return
1034 1.1 ross packFloat128(
1035 1.1 ross zSign,
1036 1.1 ross 0x7FFE,
1037 1.1 ross LIT64( 0x0000FFFFFFFFFFFF ),
1038 1.1 ross LIT64( 0xFFFFFFFFFFFFFFFF )
1039 1.1 ross );
1040 1.1 ross }
1041 1.1 ross return packFloat128( zSign, 0x7FFF, 0, 0 );
1042 1.1 ross }
1043 1.1 ross if ( zExp < 0 ) {
1044 1.1 ross isTiny =
1045 1.1 ross ( float_detect_tininess == float_tininess_before_rounding )
1046 1.1 ross || ( zExp < -1 )
1047 1.1 ross || ! increment
1048 1.1 ross || lt128(
1049 1.1 ross zSig0,
1050 1.1 ross zSig1,
1051 1.1 ross LIT64( 0x0001FFFFFFFFFFFF ),
1052 1.1 ross LIT64( 0xFFFFFFFFFFFFFFFF )
1053 1.1 ross );
1054 1.1 ross shift128ExtraRightJamming(
1055 1.1 ross zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1056 1.1 ross zExp = 0;
1057 1.1 ross if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1058 1.1 ross if ( roundNearestEven ) {
1059 1.1 ross increment = ( (sbits64) zSig2 < 0 );
1060 1.1 ross }
1061 1.1 ross else {
1062 1.1 ross if ( zSign ) {
1063 1.1 ross increment = ( roundingMode == float_round_down ) && zSig2;
1064 1.1 ross }
1065 1.1 ross else {
1066 1.1 ross increment = ( roundingMode == float_round_up ) && zSig2;
1067 1.1 ross }
1068 1.1 ross }
1069 1.1 ross }
1070 1.1 ross }
1071 1.1 ross if ( zSig2 ) float_set_inexact();
1072 1.1 ross if ( increment ) {
1073 1.1 ross add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1074 1.1 ross zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1075 1.1 ross }
1076 1.1 ross else {
1077 1.1 ross if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1078 1.1 ross }
1079 1.1 ross return packFloat128( zSign, zExp, zSig0, zSig1 );
1080 1.1 ross
1081 1.1 ross }
1082 1.1 ross
1083 1.1 ross /*
1084 1.1 ross -------------------------------------------------------------------------------
1085 1.1 ross Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1086 1.1 ross and significand formed by the concatenation of `zSig0' and `zSig1', and
1087 1.1 ross returns the proper quadruple-precision floating-point value corresponding
1088 1.1 ross to the abstract input. This routine is just like `roundAndPackFloat128'
1089 1.1 ross except that the input significand has fewer bits and does not have to be
1090 1.1 ross normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1091 1.1 ross point exponent.
1092 1.1 ross -------------------------------------------------------------------------------
1093 1.1 ross */
1094 1.1 ross static float128
1095 1.1 ross normalizeRoundAndPackFloat128(
1096 1.1 ross flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1097 1.1 ross {
1098 1.1 ross int8 shiftCount;
1099 1.1 ross bits64 zSig2;
1100 1.1 ross
1101 1.1 ross if ( zSig0 == 0 ) {
1102 1.1 ross zSig0 = zSig1;
1103 1.1 ross zSig1 = 0;
1104 1.1 ross zExp -= 64;
1105 1.1 ross }
1106 1.1 ross shiftCount = countLeadingZeros64( zSig0 ) - 15;
1107 1.1 ross if ( 0 <= shiftCount ) {
1108 1.1 ross zSig2 = 0;
1109 1.1 ross shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1110 1.1 ross }
1111 1.1 ross else {
1112 1.1 ross shift128ExtraRightJamming(
1113 1.1 ross zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1114 1.1 ross }
1115 1.1 ross zExp -= shiftCount;
1116 1.1 ross return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1117 1.1 ross
1118 1.1 ross }
1119 1.1 ross
1120 1.1 ross #endif
1121 1.1 ross
1122 1.1 ross /*
1123 1.1 ross -------------------------------------------------------------------------------
1124 1.1 ross Returns the result of converting the 32-bit two's complement integer `a'
1125 1.1 ross to the single-precision floating-point format. The conversion is performed
1126 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1127 1.1 ross -------------------------------------------------------------------------------
1128 1.1 ross */
1129 1.1 ross float32 int32_to_float32( int32 a )
1130 1.1 ross {
1131 1.1 ross flag zSign;
1132 1.1 ross
1133 1.1 ross if ( a == 0 ) return 0;
1134 1.1 ross if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1135 1.1 ross zSign = ( a < 0 );
1136 1.1 ross return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1137 1.1 ross
1138 1.1 ross }
1139 1.1 ross
1140 1.1 ross /*
1141 1.1 ross -------------------------------------------------------------------------------
1142 1.1 ross Returns the result of converting the 32-bit two's complement integer `a'
1143 1.1 ross to the double-precision floating-point format. The conversion is performed
1144 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1145 1.1 ross -------------------------------------------------------------------------------
1146 1.1 ross */
1147 1.1 ross float64 int32_to_float64( int32 a )
1148 1.1 ross {
1149 1.1 ross flag zSign;
1150 1.1 ross uint32 absA;
1151 1.1 ross int8 shiftCount;
1152 1.1 ross bits64 zSig;
1153 1.1 ross
1154 1.1 ross if ( a == 0 ) return 0;
1155 1.1 ross zSign = ( a < 0 );
1156 1.1 ross absA = zSign ? - a : a;
1157 1.1 ross shiftCount = countLeadingZeros32( absA ) + 21;
1158 1.1 ross zSig = absA;
1159 1.1 ross return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1160 1.1 ross
1161 1.1 ross }
1162 1.1 ross
1163 1.1 ross #ifdef FLOATX80
1164 1.1 ross
1165 1.1 ross /*
1166 1.1 ross -------------------------------------------------------------------------------
1167 1.1 ross Returns the result of converting the 32-bit two's complement integer `a'
1168 1.1 ross to the extended double-precision floating-point format. The conversion
1169 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
1170 1.1 ross Arithmetic.
1171 1.1 ross -------------------------------------------------------------------------------
1172 1.1 ross */
1173 1.1 ross floatx80 int32_to_floatx80( int32 a )
1174 1.1 ross {
1175 1.1 ross flag zSign;
1176 1.1 ross uint32 absA;
1177 1.1 ross int8 shiftCount;
1178 1.1 ross bits64 zSig;
1179 1.1 ross
1180 1.1 ross if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1181 1.1 ross zSign = ( a < 0 );
1182 1.1 ross absA = zSign ? - a : a;
1183 1.1 ross shiftCount = countLeadingZeros32( absA ) + 32;
1184 1.1 ross zSig = absA;
1185 1.1 ross return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1186 1.1 ross
1187 1.1 ross }
1188 1.1 ross
1189 1.1 ross #endif
1190 1.1 ross
1191 1.1 ross #ifdef FLOAT128
1192 1.1 ross
1193 1.1 ross /*
1194 1.1 ross -------------------------------------------------------------------------------
1195 1.1 ross Returns the result of converting the 32-bit two's complement integer `a' to
1196 1.1 ross the quadruple-precision floating-point format. The conversion is performed
1197 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1198 1.1 ross -------------------------------------------------------------------------------
1199 1.1 ross */
1200 1.1 ross float128 int32_to_float128( int32 a )
1201 1.1 ross {
1202 1.1 ross flag zSign;
1203 1.1 ross uint32 absA;
1204 1.1 ross int8 shiftCount;
1205 1.1 ross bits64 zSig0;
1206 1.1 ross
1207 1.1 ross if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1208 1.1 ross zSign = ( a < 0 );
1209 1.1 ross absA = zSign ? - a : a;
1210 1.1 ross shiftCount = countLeadingZeros32( absA ) + 17;
1211 1.1 ross zSig0 = absA;
1212 1.1 ross return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1213 1.1 ross
1214 1.1 ross }
1215 1.1 ross
1216 1.1 ross #endif
1217 1.1 ross
1218 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1219 1.1 ross /*
1220 1.1 ross -------------------------------------------------------------------------------
1221 1.1 ross Returns the result of converting the 64-bit two's complement integer `a'
1222 1.1 ross to the single-precision floating-point format. The conversion is performed
1223 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1224 1.1 ross -------------------------------------------------------------------------------
1225 1.1 ross */
1226 1.1 ross float32 int64_to_float32( int64 a )
1227 1.1 ross {
1228 1.1 ross flag zSign;
1229 1.1 ross uint64 absA;
1230 1.1 ross int8 shiftCount;
1231 1.1 ross
1232 1.1 ross if ( a == 0 ) return 0;
1233 1.1 ross zSign = ( a < 0 );
1234 1.1 ross absA = zSign ? - a : a;
1235 1.1 ross shiftCount = countLeadingZeros64( absA ) - 40;
1236 1.1 ross if ( 0 <= shiftCount ) {
1237 1.1 ross return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1238 1.1 ross }
1239 1.1 ross else {
1240 1.1 ross shiftCount += 7;
1241 1.1 ross if ( shiftCount < 0 ) {
1242 1.1 ross shift64RightJamming( absA, - shiftCount, &absA );
1243 1.1 ross }
1244 1.1 ross else {
1245 1.1 ross absA <<= shiftCount;
1246 1.1 ross }
1247 1.1 ross return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1248 1.1 ross }
1249 1.1 ross
1250 1.1 ross }
1251 1.1 ross
1252 1.1 ross /*
1253 1.1 ross -------------------------------------------------------------------------------
1254 1.1 ross Returns the result of converting the 64-bit two's complement integer `a'
1255 1.1 ross to the double-precision floating-point format. The conversion is performed
1256 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1257 1.1 ross -------------------------------------------------------------------------------
1258 1.1 ross */
1259 1.1 ross float64 int64_to_float64( int64 a )
1260 1.1 ross {
1261 1.1 ross flag zSign;
1262 1.1 ross
1263 1.1 ross if ( a == 0 ) return 0;
1264 1.1 ross if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1265 1.1 ross return packFloat64( 1, 0x43E, 0 );
1266 1.1 ross }
1267 1.1 ross zSign = ( a < 0 );
1268 1.1 ross return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1269 1.1 ross
1270 1.1 ross }
1271 1.1 ross
1272 1.1 ross #ifdef FLOATX80
1273 1.1 ross
1274 1.1 ross /*
1275 1.1 ross -------------------------------------------------------------------------------
1276 1.1 ross Returns the result of converting the 64-bit two's complement integer `a'
1277 1.1 ross to the extended double-precision floating-point format. The conversion
1278 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
1279 1.1 ross Arithmetic.
1280 1.1 ross -------------------------------------------------------------------------------
1281 1.1 ross */
1282 1.1 ross floatx80 int64_to_floatx80( int64 a )
1283 1.1 ross {
1284 1.1 ross flag zSign;
1285 1.1 ross uint64 absA;
1286 1.1 ross int8 shiftCount;
1287 1.1 ross
1288 1.1 ross if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1289 1.1 ross zSign = ( a < 0 );
1290 1.1 ross absA = zSign ? - a : a;
1291 1.1 ross shiftCount = countLeadingZeros64( absA );
1292 1.1 ross return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1293 1.1 ross
1294 1.1 ross }
1295 1.1 ross
1296 1.1 ross #endif
1297 1.1 ross
1298 1.1 ross #ifdef FLOAT128
1299 1.1 ross
1300 1.1 ross /*
1301 1.1 ross -------------------------------------------------------------------------------
1302 1.1 ross Returns the result of converting the 64-bit two's complement integer `a' to
1303 1.1 ross the quadruple-precision floating-point format. The conversion is performed
1304 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1305 1.1 ross -------------------------------------------------------------------------------
1306 1.1 ross */
1307 1.1 ross float128 int64_to_float128( int64 a )
1308 1.1 ross {
1309 1.1 ross flag zSign;
1310 1.1 ross uint64 absA;
1311 1.1 ross int8 shiftCount;
1312 1.1 ross int32 zExp;
1313 1.1 ross bits64 zSig0, zSig1;
1314 1.1 ross
1315 1.1 ross if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1316 1.1 ross zSign = ( a < 0 );
1317 1.1 ross absA = zSign ? - a : a;
1318 1.1 ross shiftCount = countLeadingZeros64( absA ) + 49;
1319 1.1 ross zExp = 0x406E - shiftCount;
1320 1.1 ross if ( 64 <= shiftCount ) {
1321 1.1 ross zSig1 = 0;
1322 1.1 ross zSig0 = absA;
1323 1.1 ross shiftCount -= 64;
1324 1.1 ross }
1325 1.1 ross else {
1326 1.1 ross zSig1 = absA;
1327 1.1 ross zSig0 = 0;
1328 1.1 ross }
1329 1.1 ross shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1330 1.1 ross return packFloat128( zSign, zExp, zSig0, zSig1 );
1331 1.1 ross
1332 1.1 ross }
1333 1.1 ross
1334 1.1 ross #endif
1335 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
1336 1.1 ross
1337 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1338 1.1 ross /*
1339 1.1 ross -------------------------------------------------------------------------------
1340 1.1 ross Returns the result of converting the single-precision floating-point value
1341 1.1 ross `a' to the 32-bit two's complement integer format. The conversion is
1342 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
1343 1.1 ross Arithmetic---which means in particular that the conversion is rounded
1344 1.1 ross according to the current rounding mode. If `a' is a NaN, the largest
1345 1.1 ross positive integer is returned. Otherwise, if the conversion overflows, the
1346 1.1 ross largest integer with the same sign as `a' is returned.
1347 1.1 ross -------------------------------------------------------------------------------
1348 1.1 ross */
1349 1.1 ross int32 float32_to_int32( float32 a )
1350 1.1 ross {
1351 1.1 ross flag aSign;
1352 1.1 ross int16 aExp, shiftCount;
1353 1.1 ross bits32 aSig;
1354 1.1 ross bits64 aSig64;
1355 1.1 ross
1356 1.1 ross aSig = extractFloat32Frac( a );
1357 1.1 ross aExp = extractFloat32Exp( a );
1358 1.1 ross aSign = extractFloat32Sign( a );
1359 1.1 ross if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1360 1.1 ross if ( aExp ) aSig |= 0x00800000;
1361 1.1 ross shiftCount = 0xAF - aExp;
1362 1.1 ross aSig64 = aSig;
1363 1.1 ross aSig64 <<= 32;
1364 1.1 ross if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1365 1.1 ross return roundAndPackInt32( aSign, aSig64 );
1366 1.1 ross
1367 1.1 ross }
1368 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
1369 1.1 ross
1370 1.1 ross /*
1371 1.1 ross -------------------------------------------------------------------------------
1372 1.1 ross Returns the result of converting the single-precision floating-point value
1373 1.1 ross `a' to the 32-bit two's complement integer format. The conversion is
1374 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
1375 1.1 ross Arithmetic, except that the conversion is always rounded toward zero.
1376 1.1 ross If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1377 1.1 ross the conversion overflows, the largest integer with the same sign as `a' is
1378 1.1 ross returned.
1379 1.1 ross -------------------------------------------------------------------------------
1380 1.1 ross */
1381 1.1 ross int32 float32_to_int32_round_to_zero( float32 a )
1382 1.1 ross {
1383 1.1 ross flag aSign;
1384 1.1 ross int16 aExp, shiftCount;
1385 1.1 ross bits32 aSig;
1386 1.1 ross int32 z;
1387 1.1 ross
1388 1.1 ross aSig = extractFloat32Frac( a );
1389 1.1 ross aExp = extractFloat32Exp( a );
1390 1.1 ross aSign = extractFloat32Sign( a );
1391 1.1 ross shiftCount = aExp - 0x9E;
1392 1.1 ross if ( 0 <= shiftCount ) {
1393 1.1 ross if ( a != 0xCF000000 ) {
1394 1.1 ross float_raise( float_flag_invalid );
1395 1.1 ross if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1396 1.1 ross }
1397 1.1 ross return (sbits32) 0x80000000;
1398 1.1 ross }
1399 1.1 ross else if ( aExp <= 0x7E ) {
1400 1.1 ross if ( aExp | aSig ) float_set_inexact();
1401 1.1 ross return 0;
1402 1.1 ross }
1403 1.1 ross aSig = ( aSig | 0x00800000 )<<8;
1404 1.1 ross z = aSig>>( - shiftCount );
1405 1.1 ross if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1406 1.1 ross float_set_inexact();
1407 1.1 ross }
1408 1.1 ross if ( aSign ) z = - z;
1409 1.1 ross return z;
1410 1.1 ross
1411 1.1 ross }
1412 1.1 ross
1413 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1414 1.1 ross /*
1415 1.1 ross -------------------------------------------------------------------------------
1416 1.1 ross Returns the result of converting the single-precision floating-point value
1417 1.1 ross `a' to the 64-bit two's complement integer format. The conversion is
1418 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
1419 1.1 ross Arithmetic---which means in particular that the conversion is rounded
1420 1.1 ross according to the current rounding mode. If `a' is a NaN, the largest
1421 1.1 ross positive integer is returned. Otherwise, if the conversion overflows, the
1422 1.1 ross largest integer with the same sign as `a' is returned.
1423 1.1 ross -------------------------------------------------------------------------------
1424 1.1 ross */
1425 1.1 ross int64 float32_to_int64( float32 a )
1426 1.1 ross {
1427 1.1 ross flag aSign;
1428 1.1 ross int16 aExp, shiftCount;
1429 1.1 ross bits32 aSig;
1430 1.1 ross bits64 aSig64, aSigExtra;
1431 1.1 ross
1432 1.1 ross aSig = extractFloat32Frac( a );
1433 1.1 ross aExp = extractFloat32Exp( a );
1434 1.1 ross aSign = extractFloat32Sign( a );
1435 1.1 ross shiftCount = 0xBE - aExp;
1436 1.1 ross if ( shiftCount < 0 ) {
1437 1.1 ross float_raise( float_flag_invalid );
1438 1.1 ross if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1439 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
1440 1.1 ross }
1441 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
1442 1.1 ross }
1443 1.1 ross if ( aExp ) aSig |= 0x00800000;
1444 1.1 ross aSig64 = aSig;
1445 1.1 ross aSig64 <<= 40;
1446 1.1 ross shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1447 1.1 ross return roundAndPackInt64( aSign, aSig64, aSigExtra );
1448 1.1 ross
1449 1.1 ross }
1450 1.1 ross
1451 1.1 ross /*
1452 1.1 ross -------------------------------------------------------------------------------
1453 1.1 ross Returns the result of converting the single-precision floating-point value
1454 1.1 ross `a' to the 64-bit two's complement integer format. The conversion is
1455 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
1456 1.1 ross Arithmetic, except that the conversion is always rounded toward zero. If
1457 1.1 ross `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1458 1.1 ross conversion overflows, the largest integer with the same sign as `a' is
1459 1.1 ross returned.
1460 1.1 ross -------------------------------------------------------------------------------
1461 1.1 ross */
1462 1.1 ross int64 float32_to_int64_round_to_zero( float32 a )
1463 1.1 ross {
1464 1.1 ross flag aSign;
1465 1.1 ross int16 aExp, shiftCount;
1466 1.1 ross bits32 aSig;
1467 1.1 ross bits64 aSig64;
1468 1.1 ross int64 z;
1469 1.1 ross
1470 1.1 ross aSig = extractFloat32Frac( a );
1471 1.1 ross aExp = extractFloat32Exp( a );
1472 1.1 ross aSign = extractFloat32Sign( a );
1473 1.1 ross shiftCount = aExp - 0xBE;
1474 1.1 ross if ( 0 <= shiftCount ) {
1475 1.1 ross if ( a != 0xDF000000 ) {
1476 1.1 ross float_raise( float_flag_invalid );
1477 1.1 ross if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1478 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
1479 1.1 ross }
1480 1.1 ross }
1481 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
1482 1.1 ross }
1483 1.1 ross else if ( aExp <= 0x7E ) {
1484 1.1 ross if ( aExp | aSig ) float_set_inexact();
1485 1.1 ross return 0;
1486 1.1 ross }
1487 1.1 ross aSig64 = aSig | 0x00800000;
1488 1.1 ross aSig64 <<= 40;
1489 1.1 ross z = aSig64>>( - shiftCount );
1490 1.1 ross if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1491 1.1 ross float_set_inexact();
1492 1.1 ross }
1493 1.1 ross if ( aSign ) z = - z;
1494 1.1 ross return z;
1495 1.1 ross
1496 1.1 ross }
1497 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
1498 1.1 ross
1499 1.1 ross /*
1500 1.1 ross -------------------------------------------------------------------------------
1501 1.1 ross Returns the result of converting the single-precision floating-point value
1502 1.1 ross `a' to the double-precision floating-point format. The conversion is
1503 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
1504 1.1 ross Arithmetic.
1505 1.1 ross -------------------------------------------------------------------------------
1506 1.1 ross */
1507 1.1 ross float64 float32_to_float64( float32 a )
1508 1.1 ross {
1509 1.1 ross flag aSign;
1510 1.1 ross int16 aExp;
1511 1.1 ross bits32 aSig;
1512 1.1 ross
1513 1.1 ross aSig = extractFloat32Frac( a );
1514 1.1 ross aExp = extractFloat32Exp( a );
1515 1.1 ross aSign = extractFloat32Sign( a );
1516 1.1 ross if ( aExp == 0xFF ) {
1517 1.1 ross if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1518 1.1 ross return packFloat64( aSign, 0x7FF, 0 );
1519 1.1 ross }
1520 1.1 ross if ( aExp == 0 ) {
1521 1.1 ross if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1522 1.1 ross normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1523 1.1 ross --aExp;
1524 1.1 ross }
1525 1.1 ross return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1526 1.1 ross
1527 1.1 ross }
1528 1.1 ross
1529 1.1 ross #ifdef FLOATX80
1530 1.1 ross
1531 1.1 ross /*
1532 1.1 ross -------------------------------------------------------------------------------
1533 1.1 ross Returns the result of converting the single-precision floating-point value
1534 1.1 ross `a' to the extended double-precision floating-point format. The conversion
1535 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
1536 1.1 ross Arithmetic.
1537 1.1 ross -------------------------------------------------------------------------------
1538 1.1 ross */
1539 1.1 ross floatx80 float32_to_floatx80( float32 a )
1540 1.1 ross {
1541 1.1 ross flag aSign;
1542 1.1 ross int16 aExp;
1543 1.1 ross bits32 aSig;
1544 1.1 ross
1545 1.1 ross aSig = extractFloat32Frac( a );
1546 1.1 ross aExp = extractFloat32Exp( a );
1547 1.1 ross aSign = extractFloat32Sign( a );
1548 1.1 ross if ( aExp == 0xFF ) {
1549 1.1 ross if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1550 1.1 ross return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1551 1.1 ross }
1552 1.1 ross if ( aExp == 0 ) {
1553 1.1 ross if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1554 1.1 ross normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1555 1.1 ross }
1556 1.1 ross aSig |= 0x00800000;
1557 1.1 ross return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1558 1.1 ross
1559 1.1 ross }
1560 1.1 ross
1561 1.1 ross #endif
1562 1.1 ross
1563 1.1 ross #ifdef FLOAT128
1564 1.1 ross
1565 1.1 ross /*
1566 1.1 ross -------------------------------------------------------------------------------
1567 1.1 ross Returns the result of converting the single-precision floating-point value
1568 1.1 ross `a' to the double-precision floating-point format. The conversion is
1569 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
1570 1.1 ross Arithmetic.
1571 1.1 ross -------------------------------------------------------------------------------
1572 1.1 ross */
1573 1.1 ross float128 float32_to_float128( float32 a )
1574 1.1 ross {
1575 1.1 ross flag aSign;
1576 1.1 ross int16 aExp;
1577 1.1 ross bits32 aSig;
1578 1.1 ross
1579 1.1 ross aSig = extractFloat32Frac( a );
1580 1.1 ross aExp = extractFloat32Exp( a );
1581 1.1 ross aSign = extractFloat32Sign( a );
1582 1.1 ross if ( aExp == 0xFF ) {
1583 1.1 ross if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1584 1.1 ross return packFloat128( aSign, 0x7FFF, 0, 0 );
1585 1.1 ross }
1586 1.1 ross if ( aExp == 0 ) {
1587 1.1 ross if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1588 1.1 ross normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1589 1.1 ross --aExp;
1590 1.1 ross }
1591 1.1 ross return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1592 1.1 ross
1593 1.1 ross }
1594 1.1 ross
1595 1.1 ross #endif
1596 1.1 ross
1597 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1598 1.1 ross /*
1599 1.1 ross -------------------------------------------------------------------------------
1600 1.1 ross Rounds the single-precision floating-point value `a' to an integer, and
1601 1.1 ross returns the result as a single-precision floating-point value. The
1602 1.1 ross operation is performed according to the IEC/IEEE Standard for Binary
1603 1.1 ross Floating-Point Arithmetic.
1604 1.1 ross -------------------------------------------------------------------------------
1605 1.1 ross */
1606 1.1 ross float32 float32_round_to_int( float32 a )
1607 1.1 ross {
1608 1.1 ross flag aSign;
1609 1.1 ross int16 aExp;
1610 1.1 ross bits32 lastBitMask, roundBitsMask;
1611 1.1 ross int8 roundingMode;
1612 1.1 ross float32 z;
1613 1.1 ross
1614 1.1 ross aExp = extractFloat32Exp( a );
1615 1.1 ross if ( 0x96 <= aExp ) {
1616 1.1 ross if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1617 1.1 ross return propagateFloat32NaN( a, a );
1618 1.1 ross }
1619 1.1 ross return a;
1620 1.1 ross }
1621 1.1 ross if ( aExp <= 0x7E ) {
1622 1.1 ross if ( (bits32) ( a<<1 ) == 0 ) return a;
1623 1.1 ross float_set_inexact();
1624 1.1 ross aSign = extractFloat32Sign( a );
1625 1.1 ross switch ( float_rounding_mode() ) {
1626 1.1 ross case float_round_nearest_even:
1627 1.1 ross if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1628 1.1 ross return packFloat32( aSign, 0x7F, 0 );
1629 1.1 ross }
1630 1.1 ross break;
1631 1.1 ross case float_round_down:
1632 1.1 ross return aSign ? 0xBF800000 : 0;
1633 1.1 ross case float_round_up:
1634 1.1 ross return aSign ? 0x80000000 : 0x3F800000;
1635 1.1 ross }
1636 1.1 ross return packFloat32( aSign, 0, 0 );
1637 1.1 ross }
1638 1.1 ross lastBitMask = 1;
1639 1.1 ross lastBitMask <<= 0x96 - aExp;
1640 1.1 ross roundBitsMask = lastBitMask - 1;
1641 1.1 ross z = a;
1642 1.1 ross roundingMode = float_rounding_mode();
1643 1.1 ross if ( roundingMode == float_round_nearest_even ) {
1644 1.1 ross z += lastBitMask>>1;
1645 1.1 ross if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1646 1.1 ross }
1647 1.1 ross else if ( roundingMode != float_round_to_zero ) {
1648 1.1 ross if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1649 1.1 ross z += roundBitsMask;
1650 1.1 ross }
1651 1.1 ross }
1652 1.1 ross z &= ~ roundBitsMask;
1653 1.1 ross if ( z != a ) float_set_inexact();
1654 1.1 ross return z;
1655 1.1 ross
1656 1.1 ross }
1657 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
1658 1.1 ross
1659 1.1 ross /*
1660 1.1 ross -------------------------------------------------------------------------------
1661 1.1 ross Returns the result of adding the absolute values of the single-precision
1662 1.1 ross floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1663 1.1 ross before being returned. `zSign' is ignored if the result is a NaN.
1664 1.1 ross The addition is performed according to the IEC/IEEE Standard for Binary
1665 1.1 ross Floating-Point Arithmetic.
1666 1.1 ross -------------------------------------------------------------------------------
1667 1.1 ross */
1668 1.1 ross static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1669 1.1 ross {
1670 1.1 ross int16 aExp, bExp, zExp;
1671 1.1 ross bits32 aSig, bSig, zSig;
1672 1.1 ross int16 expDiff;
1673 1.1 ross
1674 1.1 ross aSig = extractFloat32Frac( a );
1675 1.1 ross aExp = extractFloat32Exp( a );
1676 1.1 ross bSig = extractFloat32Frac( b );
1677 1.1 ross bExp = extractFloat32Exp( b );
1678 1.1 ross expDiff = aExp - bExp;
1679 1.1 ross aSig <<= 6;
1680 1.1 ross bSig <<= 6;
1681 1.1 ross if ( 0 < expDiff ) {
1682 1.1 ross if ( aExp == 0xFF ) {
1683 1.1 ross if ( aSig ) return propagateFloat32NaN( a, b );
1684 1.1 ross return a;
1685 1.1 ross }
1686 1.1 ross if ( bExp == 0 ) {
1687 1.1 ross --expDiff;
1688 1.1 ross }
1689 1.1 ross else {
1690 1.1 ross bSig |= 0x20000000;
1691 1.1 ross }
1692 1.1 ross shift32RightJamming( bSig, expDiff, &bSig );
1693 1.1 ross zExp = aExp;
1694 1.1 ross }
1695 1.1 ross else if ( expDiff < 0 ) {
1696 1.1 ross if ( bExp == 0xFF ) {
1697 1.1 ross if ( bSig ) return propagateFloat32NaN( a, b );
1698 1.1 ross return packFloat32( zSign, 0xFF, 0 );
1699 1.1 ross }
1700 1.1 ross if ( aExp == 0 ) {
1701 1.1 ross ++expDiff;
1702 1.1 ross }
1703 1.1 ross else {
1704 1.1 ross aSig |= 0x20000000;
1705 1.1 ross }
1706 1.1 ross shift32RightJamming( aSig, - expDiff, &aSig );
1707 1.1 ross zExp = bExp;
1708 1.1 ross }
1709 1.1 ross else {
1710 1.1 ross if ( aExp == 0xFF ) {
1711 1.1 ross if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1712 1.1 ross return a;
1713 1.1 ross }
1714 1.1 ross if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1715 1.1 ross zSig = 0x40000000 + aSig + bSig;
1716 1.1 ross zExp = aExp;
1717 1.1 ross goto roundAndPack;
1718 1.1 ross }
1719 1.1 ross aSig |= 0x20000000;
1720 1.1 ross zSig = ( aSig + bSig )<<1;
1721 1.1 ross --zExp;
1722 1.1 ross if ( (sbits32) zSig < 0 ) {
1723 1.1 ross zSig = aSig + bSig;
1724 1.1 ross ++zExp;
1725 1.1 ross }
1726 1.1 ross roundAndPack:
1727 1.1 ross return roundAndPackFloat32( zSign, zExp, zSig );
1728 1.1 ross
1729 1.1 ross }
1730 1.1 ross
1731 1.1 ross /*
1732 1.1 ross -------------------------------------------------------------------------------
1733 1.1 ross Returns the result of subtracting the absolute values of the single-
1734 1.1 ross precision floating-point values `a' and `b'. If `zSign' is 1, the
1735 1.1 ross difference is negated before being returned. `zSign' is ignored if the
1736 1.1 ross result is a NaN. The subtraction is performed according to the IEC/IEEE
1737 1.1 ross Standard for Binary Floating-Point Arithmetic.
1738 1.1 ross -------------------------------------------------------------------------------
1739 1.1 ross */
1740 1.1 ross static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1741 1.1 ross {
1742 1.1 ross int16 aExp, bExp, zExp;
1743 1.1 ross bits32 aSig, bSig, zSig;
1744 1.1 ross int16 expDiff;
1745 1.1 ross
1746 1.1 ross aSig = extractFloat32Frac( a );
1747 1.1 ross aExp = extractFloat32Exp( a );
1748 1.1 ross bSig = extractFloat32Frac( b );
1749 1.1 ross bExp = extractFloat32Exp( b );
1750 1.1 ross expDiff = aExp - bExp;
1751 1.1 ross aSig <<= 7;
1752 1.1 ross bSig <<= 7;
1753 1.1 ross if ( 0 < expDiff ) goto aExpBigger;
1754 1.1 ross if ( expDiff < 0 ) goto bExpBigger;
1755 1.1 ross if ( aExp == 0xFF ) {
1756 1.1 ross if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1757 1.1 ross float_raise( float_flag_invalid );
1758 1.1 ross return float32_default_nan;
1759 1.1 ross }
1760 1.1 ross if ( aExp == 0 ) {
1761 1.1 ross aExp = 1;
1762 1.1 ross bExp = 1;
1763 1.1 ross }
1764 1.1 ross if ( bSig < aSig ) goto aBigger;
1765 1.1 ross if ( aSig < bSig ) goto bBigger;
1766 1.1 ross return packFloat32( float_rounding_mode() == float_round_down, 0, 0 );
1767 1.1 ross bExpBigger:
1768 1.1 ross if ( bExp == 0xFF ) {
1769 1.1 ross if ( bSig ) return propagateFloat32NaN( a, b );
1770 1.1 ross return packFloat32( zSign ^ 1, 0xFF, 0 );
1771 1.1 ross }
1772 1.1 ross if ( aExp == 0 ) {
1773 1.1 ross ++expDiff;
1774 1.1 ross }
1775 1.1 ross else {
1776 1.1 ross aSig |= 0x40000000;
1777 1.1 ross }
1778 1.1 ross shift32RightJamming( aSig, - expDiff, &aSig );
1779 1.1 ross bSig |= 0x40000000;
1780 1.1 ross bBigger:
1781 1.1 ross zSig = bSig - aSig;
1782 1.1 ross zExp = bExp;
1783 1.1 ross zSign ^= 1;
1784 1.1 ross goto normalizeRoundAndPack;
1785 1.1 ross aExpBigger:
1786 1.1 ross if ( aExp == 0xFF ) {
1787 1.1 ross if ( aSig ) return propagateFloat32NaN( a, b );
1788 1.1 ross return a;
1789 1.1 ross }
1790 1.1 ross if ( bExp == 0 ) {
1791 1.1 ross --expDiff;
1792 1.1 ross }
1793 1.1 ross else {
1794 1.1 ross bSig |= 0x40000000;
1795 1.1 ross }
1796 1.1 ross shift32RightJamming( bSig, expDiff, &bSig );
1797 1.1 ross aSig |= 0x40000000;
1798 1.1 ross aBigger:
1799 1.1 ross zSig = aSig - bSig;
1800 1.1 ross zExp = aExp;
1801 1.1 ross normalizeRoundAndPack:
1802 1.1 ross --zExp;
1803 1.1 ross return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1804 1.1 ross
1805 1.1 ross }
1806 1.1 ross
1807 1.1 ross /*
1808 1.1 ross -------------------------------------------------------------------------------
1809 1.1 ross Returns the result of adding the single-precision floating-point values `a'
1810 1.1 ross and `b'. The operation is performed according to the IEC/IEEE Standard for
1811 1.1 ross Binary Floating-Point Arithmetic.
1812 1.1 ross -------------------------------------------------------------------------------
1813 1.1 ross */
1814 1.1 ross float32 float32_add( float32 a, float32 b )
1815 1.1 ross {
1816 1.1 ross flag aSign, bSign;
1817 1.1 ross
1818 1.1 ross aSign = extractFloat32Sign( a );
1819 1.1 ross bSign = extractFloat32Sign( b );
1820 1.1 ross if ( aSign == bSign ) {
1821 1.1 ross return addFloat32Sigs( a, b, aSign );
1822 1.1 ross }
1823 1.1 ross else {
1824 1.1 ross return subFloat32Sigs( a, b, aSign );
1825 1.1 ross }
1826 1.1 ross
1827 1.1 ross }
1828 1.1 ross
1829 1.1 ross /*
1830 1.1 ross -------------------------------------------------------------------------------
1831 1.1 ross Returns the result of subtracting the single-precision floating-point values
1832 1.1 ross `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1833 1.1 ross for Binary Floating-Point Arithmetic.
1834 1.1 ross -------------------------------------------------------------------------------
1835 1.1 ross */
1836 1.1 ross float32 float32_sub( float32 a, float32 b )
1837 1.1 ross {
1838 1.1 ross flag aSign, bSign;
1839 1.1 ross
1840 1.1 ross aSign = extractFloat32Sign( a );
1841 1.1 ross bSign = extractFloat32Sign( b );
1842 1.1 ross if ( aSign == bSign ) {
1843 1.1 ross return subFloat32Sigs( a, b, aSign );
1844 1.1 ross }
1845 1.1 ross else {
1846 1.1 ross return addFloat32Sigs( a, b, aSign );
1847 1.1 ross }
1848 1.1 ross
1849 1.1 ross }
1850 1.1 ross
1851 1.1 ross /*
1852 1.1 ross -------------------------------------------------------------------------------
1853 1.1 ross Returns the result of multiplying the single-precision floating-point values
1854 1.1 ross `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1855 1.1 ross for Binary Floating-Point Arithmetic.
1856 1.1 ross -------------------------------------------------------------------------------
1857 1.1 ross */
1858 1.1 ross float32 float32_mul( float32 a, float32 b )
1859 1.1 ross {
1860 1.1 ross flag aSign, bSign, zSign;
1861 1.1 ross int16 aExp, bExp, zExp;
1862 1.1 ross bits32 aSig, bSig;
1863 1.1 ross bits64 zSig64;
1864 1.1 ross bits32 zSig;
1865 1.1 ross
1866 1.1 ross aSig = extractFloat32Frac( a );
1867 1.1 ross aExp = extractFloat32Exp( a );
1868 1.1 ross aSign = extractFloat32Sign( a );
1869 1.1 ross bSig = extractFloat32Frac( b );
1870 1.1 ross bExp = extractFloat32Exp( b );
1871 1.1 ross bSign = extractFloat32Sign( b );
1872 1.1 ross zSign = aSign ^ bSign;
1873 1.1 ross if ( aExp == 0xFF ) {
1874 1.1 ross if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1875 1.1 ross return propagateFloat32NaN( a, b );
1876 1.1 ross }
1877 1.1 ross if ( ( bExp | bSig ) == 0 ) {
1878 1.1 ross float_raise( float_flag_invalid );
1879 1.1 ross return float32_default_nan;
1880 1.1 ross }
1881 1.1 ross return packFloat32( zSign, 0xFF, 0 );
1882 1.1 ross }
1883 1.1 ross if ( bExp == 0xFF ) {
1884 1.1 ross if ( bSig ) return propagateFloat32NaN( a, b );
1885 1.1 ross if ( ( aExp | aSig ) == 0 ) {
1886 1.1 ross float_raise( float_flag_invalid );
1887 1.1 ross return float32_default_nan;
1888 1.1 ross }
1889 1.1 ross return packFloat32( zSign, 0xFF, 0 );
1890 1.1 ross }
1891 1.1 ross if ( aExp == 0 ) {
1892 1.1 ross if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1893 1.1 ross normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1894 1.1 ross }
1895 1.1 ross if ( bExp == 0 ) {
1896 1.1 ross if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1897 1.1 ross normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1898 1.1 ross }
1899 1.1 ross zExp = aExp + bExp - 0x7F;
1900 1.1 ross aSig = ( aSig | 0x00800000 )<<7;
1901 1.1 ross bSig = ( bSig | 0x00800000 )<<8;
1902 1.1 ross shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1903 1.1 ross zSig = zSig64;
1904 1.1 ross if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1905 1.1 ross zSig <<= 1;
1906 1.1 ross --zExp;
1907 1.1 ross }
1908 1.1 ross return roundAndPackFloat32( zSign, zExp, zSig );
1909 1.1 ross
1910 1.1 ross }
1911 1.1 ross
1912 1.1 ross /*
1913 1.1 ross -------------------------------------------------------------------------------
1914 1.1 ross Returns the result of dividing the single-precision floating-point value `a'
1915 1.1 ross by the corresponding value `b'. The operation is performed according to the
1916 1.1 ross IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1917 1.1 ross -------------------------------------------------------------------------------
1918 1.1 ross */
1919 1.1 ross float32 float32_div( float32 a, float32 b )
1920 1.1 ross {
1921 1.1 ross flag aSign, bSign, zSign;
1922 1.1 ross int16 aExp, bExp, zExp;
1923 1.1 ross bits32 aSig, bSig, zSig;
1924 1.1 ross
1925 1.1 ross aSig = extractFloat32Frac( a );
1926 1.1 ross aExp = extractFloat32Exp( a );
1927 1.1 ross aSign = extractFloat32Sign( a );
1928 1.1 ross bSig = extractFloat32Frac( b );
1929 1.1 ross bExp = extractFloat32Exp( b );
1930 1.1 ross bSign = extractFloat32Sign( b );
1931 1.1 ross zSign = aSign ^ bSign;
1932 1.1 ross if ( aExp == 0xFF ) {
1933 1.1 ross if ( aSig ) return propagateFloat32NaN( a, b );
1934 1.1 ross if ( bExp == 0xFF ) {
1935 1.1 ross if ( bSig ) return propagateFloat32NaN( a, b );
1936 1.1 ross float_raise( float_flag_invalid );
1937 1.1 ross return float32_default_nan;
1938 1.1 ross }
1939 1.1 ross return packFloat32( zSign, 0xFF, 0 );
1940 1.1 ross }
1941 1.1 ross if ( bExp == 0xFF ) {
1942 1.1 ross if ( bSig ) return propagateFloat32NaN( a, b );
1943 1.1 ross return packFloat32( zSign, 0, 0 );
1944 1.1 ross }
1945 1.1 ross if ( bExp == 0 ) {
1946 1.1 ross if ( bSig == 0 ) {
1947 1.1 ross if ( ( aExp | aSig ) == 0 ) {
1948 1.1 ross float_raise( float_flag_invalid );
1949 1.1 ross return float32_default_nan;
1950 1.1 ross }
1951 1.1 ross float_raise( float_flag_divbyzero );
1952 1.1 ross return packFloat32( zSign, 0xFF, 0 );
1953 1.1 ross }
1954 1.1 ross normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1955 1.1 ross }
1956 1.1 ross if ( aExp == 0 ) {
1957 1.1 ross if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1958 1.1 ross normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1959 1.1 ross }
1960 1.1 ross zExp = aExp - bExp + 0x7D;
1961 1.1 ross aSig = ( aSig | 0x00800000 )<<7;
1962 1.1 ross bSig = ( bSig | 0x00800000 )<<8;
1963 1.1 ross if ( bSig <= ( aSig + aSig ) ) {
1964 1.1 ross aSig >>= 1;
1965 1.1 ross ++zExp;
1966 1.1 ross }
1967 1.1 ross zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1968 1.1 ross if ( ( zSig & 0x3F ) == 0 ) {
1969 1.1 ross zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1970 1.1 ross }
1971 1.1 ross return roundAndPackFloat32( zSign, zExp, zSig );
1972 1.1 ross
1973 1.1 ross }
1974 1.1 ross
1975 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1976 1.1 ross /*
1977 1.1 ross -------------------------------------------------------------------------------
1978 1.1 ross Returns the remainder of the single-precision floating-point value `a'
1979 1.1 ross with respect to the corresponding value `b'. The operation is performed
1980 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1981 1.1 ross -------------------------------------------------------------------------------
1982 1.1 ross */
1983 1.1 ross float32 float32_rem( float32 a, float32 b )
1984 1.1 ross {
1985 1.1 ross flag aSign, bSign, zSign;
1986 1.1 ross int16 aExp, bExp, expDiff;
1987 1.1 ross bits32 aSig, bSig;
1988 1.1 ross bits32 q;
1989 1.1 ross bits64 aSig64, bSig64, q64;
1990 1.1 ross bits32 alternateASig;
1991 1.1 ross sbits32 sigMean;
1992 1.1 ross
1993 1.1 ross aSig = extractFloat32Frac( a );
1994 1.1 ross aExp = extractFloat32Exp( a );
1995 1.1 ross aSign = extractFloat32Sign( a );
1996 1.1 ross bSig = extractFloat32Frac( b );
1997 1.1 ross bExp = extractFloat32Exp( b );
1998 1.1 ross bSign = extractFloat32Sign( b );
1999 1.1 ross if ( aExp == 0xFF ) {
2000 1.1 ross if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2001 1.1 ross return propagateFloat32NaN( a, b );
2002 1.1 ross }
2003 1.1 ross float_raise( float_flag_invalid );
2004 1.1 ross return float32_default_nan;
2005 1.1 ross }
2006 1.1 ross if ( bExp == 0xFF ) {
2007 1.1 ross if ( bSig ) return propagateFloat32NaN( a, b );
2008 1.1 ross return a;
2009 1.1 ross }
2010 1.1 ross if ( bExp == 0 ) {
2011 1.1 ross if ( bSig == 0 ) {
2012 1.1 ross float_raise( float_flag_invalid );
2013 1.1 ross return float32_default_nan;
2014 1.1 ross }
2015 1.1 ross normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2016 1.1 ross }
2017 1.1 ross if ( aExp == 0 ) {
2018 1.1 ross if ( aSig == 0 ) return a;
2019 1.1 ross normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2020 1.1 ross }
2021 1.1 ross expDiff = aExp - bExp;
2022 1.1 ross aSig |= 0x00800000;
2023 1.1 ross bSig |= 0x00800000;
2024 1.1 ross if ( expDiff < 32 ) {
2025 1.1 ross aSig <<= 8;
2026 1.1 ross bSig <<= 8;
2027 1.1 ross if ( expDiff < 0 ) {
2028 1.1 ross if ( expDiff < -1 ) return a;
2029 1.1 ross aSig >>= 1;
2030 1.1 ross }
2031 1.1 ross q = ( bSig <= aSig );
2032 1.1 ross if ( q ) aSig -= bSig;
2033 1.1 ross if ( 0 < expDiff ) {
2034 1.1 ross q = ( ( (bits64) aSig )<<32 ) / bSig;
2035 1.1 ross q >>= 32 - expDiff;
2036 1.1 ross bSig >>= 2;
2037 1.1 ross aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2038 1.1 ross }
2039 1.1 ross else {
2040 1.1 ross aSig >>= 2;
2041 1.1 ross bSig >>= 2;
2042 1.1 ross }
2043 1.1 ross }
2044 1.1 ross else {
2045 1.1 ross if ( bSig <= aSig ) aSig -= bSig;
2046 1.1 ross aSig64 = ( (bits64) aSig )<<40;
2047 1.1 ross bSig64 = ( (bits64) bSig )<<40;
2048 1.1 ross expDiff -= 64;
2049 1.1 ross while ( 0 < expDiff ) {
2050 1.1 ross q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2051 1.1 ross q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2052 1.1 ross aSig64 = - ( ( bSig * q64 )<<38 );
2053 1.1 ross expDiff -= 62;
2054 1.1 ross }
2055 1.1 ross expDiff += 64;
2056 1.1 ross q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2057 1.1 ross q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2058 1.1 ross q = q64>>( 64 - expDiff );
2059 1.1 ross bSig <<= 6;
2060 1.1 ross aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2061 1.1 ross }
2062 1.1 ross do {
2063 1.1 ross alternateASig = aSig;
2064 1.1 ross ++q;
2065 1.1 ross aSig -= bSig;
2066 1.1 ross } while ( 0 <= (sbits32) aSig );
2067 1.1 ross sigMean = aSig + alternateASig;
2068 1.1 ross if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2069 1.1 ross aSig = alternateASig;
2070 1.1 ross }
2071 1.1 ross zSign = ( (sbits32) aSig < 0 );
2072 1.1 ross if ( zSign ) aSig = - aSig;
2073 1.1 ross return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2074 1.1 ross
2075 1.1 ross }
2076 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
2077 1.1 ross
2078 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2079 1.1 ross /*
2080 1.1 ross -------------------------------------------------------------------------------
2081 1.1 ross Returns the square root of the single-precision floating-point value `a'.
2082 1.1 ross The operation is performed according to the IEC/IEEE Standard for Binary
2083 1.1 ross Floating-Point Arithmetic.
2084 1.1 ross -------------------------------------------------------------------------------
2085 1.1 ross */
2086 1.1 ross float32 float32_sqrt( float32 a )
2087 1.1 ross {
2088 1.1 ross flag aSign;
2089 1.1 ross int16 aExp, zExp;
2090 1.1 ross bits32 aSig, zSig;
2091 1.1 ross bits64 rem, term;
2092 1.1 ross
2093 1.1 ross aSig = extractFloat32Frac( a );
2094 1.1 ross aExp = extractFloat32Exp( a );
2095 1.1 ross aSign = extractFloat32Sign( a );
2096 1.1 ross if ( aExp == 0xFF ) {
2097 1.1 ross if ( aSig ) return propagateFloat32NaN( a, 0 );
2098 1.1 ross if ( ! aSign ) return a;
2099 1.1 ross float_raise( float_flag_invalid );
2100 1.1 ross return float32_default_nan;
2101 1.1 ross }
2102 1.1 ross if ( aSign ) {
2103 1.1 ross if ( ( aExp | aSig ) == 0 ) return a;
2104 1.1 ross float_raise( float_flag_invalid );
2105 1.1 ross return float32_default_nan;
2106 1.1 ross }
2107 1.1 ross if ( aExp == 0 ) {
2108 1.1 ross if ( aSig == 0 ) return 0;
2109 1.1 ross normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2110 1.1 ross }
2111 1.1 ross zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2112 1.1 ross aSig = ( aSig | 0x00800000 )<<8;
2113 1.1 ross zSig = estimateSqrt32( aExp, aSig ) + 2;
2114 1.1 ross if ( ( zSig & 0x7F ) <= 5 ) {
2115 1.1 ross if ( zSig < 2 ) {
2116 1.1 ross zSig = 0x7FFFFFFF;
2117 1.1 ross goto roundAndPack;
2118 1.1 ross }
2119 1.1 ross aSig >>= aExp & 1;
2120 1.1 ross term = ( (bits64) zSig ) * zSig;
2121 1.1 ross rem = ( ( (bits64) aSig )<<32 ) - term;
2122 1.1 ross while ( (sbits64) rem < 0 ) {
2123 1.1 ross --zSig;
2124 1.1 ross rem += ( ( (bits64) zSig )<<1 ) | 1;
2125 1.1 ross }
2126 1.1 ross zSig |= ( rem != 0 );
2127 1.1 ross }
2128 1.1 ross shift32RightJamming( zSig, 1, &zSig );
2129 1.1 ross roundAndPack:
2130 1.1 ross return roundAndPackFloat32( 0, zExp, zSig );
2131 1.1 ross
2132 1.1 ross }
2133 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
2134 1.1 ross
2135 1.1 ross /*
2136 1.1 ross -------------------------------------------------------------------------------
2137 1.1 ross Returns 1 if the single-precision floating-point value `a' is equal to
2138 1.1 ross the corresponding value `b', and 0 otherwise. The comparison is performed
2139 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2140 1.1 ross -------------------------------------------------------------------------------
2141 1.1 ross */
2142 1.1 ross flag float32_eq( float32 a, float32 b )
2143 1.1 ross {
2144 1.1 ross
2145 1.1 ross if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2146 1.1 ross || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2147 1.1 ross ) {
2148 1.1 ross if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2149 1.1 ross float_raise( float_flag_invalid );
2150 1.1 ross }
2151 1.1 ross return 0;
2152 1.1 ross }
2153 1.1 ross return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2154 1.1 ross
2155 1.1 ross }
2156 1.1 ross
2157 1.1 ross /*
2158 1.1 ross -------------------------------------------------------------------------------
2159 1.1 ross Returns 1 if the single-precision floating-point value `a' is less than
2160 1.1 ross or equal to the corresponding value `b', and 0 otherwise. The comparison
2161 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
2162 1.1 ross Arithmetic.
2163 1.1 ross -------------------------------------------------------------------------------
2164 1.1 ross */
2165 1.1 ross flag float32_le( float32 a, float32 b )
2166 1.1 ross {
2167 1.1 ross flag aSign, bSign;
2168 1.1 ross
2169 1.1 ross if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2170 1.1 ross || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2171 1.1 ross ) {
2172 1.1 ross float_raise( float_flag_invalid );
2173 1.1 ross return 0;
2174 1.1 ross }
2175 1.1 ross aSign = extractFloat32Sign( a );
2176 1.1 ross bSign = extractFloat32Sign( b );
2177 1.1 ross if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2178 1.1 ross return ( a == b ) || ( aSign ^ ( a < b ) );
2179 1.1 ross
2180 1.1 ross }
2181 1.1 ross
2182 1.1 ross /*
2183 1.1 ross -------------------------------------------------------------------------------
2184 1.1 ross Returns 1 if the single-precision floating-point value `a' is less than
2185 1.1 ross the corresponding value `b', and 0 otherwise. The comparison is performed
2186 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2187 1.1 ross -------------------------------------------------------------------------------
2188 1.1 ross */
2189 1.1 ross flag float32_lt( float32 a, float32 b )
2190 1.1 ross {
2191 1.1 ross flag aSign, bSign;
2192 1.1 ross
2193 1.1 ross if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2194 1.1 ross || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2195 1.1 ross ) {
2196 1.1 ross float_raise( float_flag_invalid );
2197 1.1 ross return 0;
2198 1.1 ross }
2199 1.1 ross aSign = extractFloat32Sign( a );
2200 1.1 ross bSign = extractFloat32Sign( b );
2201 1.1 ross if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2202 1.1 ross return ( a != b ) && ( aSign ^ ( a < b ) );
2203 1.1 ross
2204 1.1 ross }
2205 1.1 ross
2206 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2207 1.1 ross /*
2208 1.1 ross -------------------------------------------------------------------------------
2209 1.1 ross Returns 1 if the single-precision floating-point value `a' is equal to
2210 1.1 ross the corresponding value `b', and 0 otherwise. The invalid exception is
2211 1.1 ross raised if either operand is a NaN. Otherwise, the comparison is performed
2212 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2213 1.1 ross -------------------------------------------------------------------------------
2214 1.1 ross */
2215 1.1 ross flag float32_eq_signaling( float32 a, float32 b )
2216 1.1 ross {
2217 1.1 ross
2218 1.1 ross if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2219 1.1 ross || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2220 1.1 ross ) {
2221 1.1 ross float_raise( float_flag_invalid );
2222 1.1 ross return 0;
2223 1.1 ross }
2224 1.1 ross return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2225 1.1 ross
2226 1.1 ross }
2227 1.1 ross
2228 1.1 ross /*
2229 1.1 ross -------------------------------------------------------------------------------
2230 1.1 ross Returns 1 if the single-precision floating-point value `a' is less than or
2231 1.1 ross equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2232 1.1 ross cause an exception. Otherwise, the comparison is performed according to the
2233 1.1 ross IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2234 1.1 ross -------------------------------------------------------------------------------
2235 1.1 ross */
2236 1.1 ross flag float32_le_quiet( float32 a, float32 b )
2237 1.1 ross {
2238 1.1 ross flag aSign, bSign;
2239 1.1 ross
2240 1.1 ross if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2241 1.1 ross || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2242 1.1 ross ) {
2243 1.1 ross if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2244 1.1 ross float_raise( float_flag_invalid );
2245 1.1 ross }
2246 1.1 ross return 0;
2247 1.1 ross }
2248 1.1 ross aSign = extractFloat32Sign( a );
2249 1.1 ross bSign = extractFloat32Sign( b );
2250 1.1 ross if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2251 1.1 ross return ( a == b ) || ( aSign ^ ( a < b ) );
2252 1.1 ross
2253 1.1 ross }
2254 1.1 ross
2255 1.1 ross /*
2256 1.1 ross -------------------------------------------------------------------------------
2257 1.1 ross Returns 1 if the single-precision floating-point value `a' is less than
2258 1.1 ross the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2259 1.1 ross exception. Otherwise, the comparison is performed according to the IEC/IEEE
2260 1.1 ross Standard for Binary Floating-Point Arithmetic.
2261 1.1 ross -------------------------------------------------------------------------------
2262 1.1 ross */
2263 1.1 ross flag float32_lt_quiet( float32 a, float32 b )
2264 1.1 ross {
2265 1.1 ross flag aSign, bSign;
2266 1.1 ross
2267 1.1 ross if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2268 1.1 ross || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2269 1.1 ross ) {
2270 1.1 ross if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2271 1.1 ross float_raise( float_flag_invalid );
2272 1.1 ross }
2273 1.1 ross return 0;
2274 1.1 ross }
2275 1.1 ross aSign = extractFloat32Sign( a );
2276 1.1 ross bSign = extractFloat32Sign( b );
2277 1.1 ross if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2278 1.1 ross return ( a != b ) && ( aSign ^ ( a < b ) );
2279 1.1 ross
2280 1.1 ross }
2281 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
2282 1.1 ross
2283 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2284 1.1 ross /*
2285 1.1 ross -------------------------------------------------------------------------------
2286 1.1 ross Returns the result of converting the double-precision floating-point value
2287 1.1 ross `a' to the 32-bit two's complement integer format. The conversion is
2288 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
2289 1.1 ross Arithmetic---which means in particular that the conversion is rounded
2290 1.1 ross according to the current rounding mode. If `a' is a NaN, the largest
2291 1.1 ross positive integer is returned. Otherwise, if the conversion overflows, the
2292 1.1 ross largest integer with the same sign as `a' is returned.
2293 1.1 ross -------------------------------------------------------------------------------
2294 1.1 ross */
2295 1.1 ross int32 float64_to_int32( float64 a )
2296 1.1 ross {
2297 1.1 ross flag aSign;
2298 1.1 ross int16 aExp, shiftCount;
2299 1.1 ross bits64 aSig;
2300 1.1 ross
2301 1.1 ross aSig = extractFloat64Frac( a );
2302 1.1 ross aExp = extractFloat64Exp( a );
2303 1.1 ross aSign = extractFloat64Sign( a );
2304 1.1 ross if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2305 1.1 ross if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2306 1.1 ross shiftCount = 0x42C - aExp;
2307 1.1 ross if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2308 1.1 ross return roundAndPackInt32( aSign, aSig );
2309 1.1 ross
2310 1.1 ross }
2311 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
2312 1.1 ross
2313 1.1 ross /*
2314 1.1 ross -------------------------------------------------------------------------------
2315 1.1 ross Returns the result of converting the double-precision floating-point value
2316 1.1 ross `a' to the 32-bit two's complement integer format. The conversion is
2317 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
2318 1.1 ross Arithmetic, except that the conversion is always rounded toward zero.
2319 1.1 ross If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2320 1.1 ross the conversion overflows, the largest integer with the same sign as `a' is
2321 1.1 ross returned.
2322 1.1 ross -------------------------------------------------------------------------------
2323 1.1 ross */
2324 1.1 ross int32 float64_to_int32_round_to_zero( float64 a )
2325 1.1 ross {
2326 1.1 ross flag aSign;
2327 1.1 ross int16 aExp, shiftCount;
2328 1.1 ross bits64 aSig, savedASig;
2329 1.1 ross int32 z;
2330 1.1 ross
2331 1.1 ross aSig = extractFloat64Frac( a );
2332 1.1 ross aExp = extractFloat64Exp( a );
2333 1.1 ross aSign = extractFloat64Sign( a );
2334 1.1 ross if ( 0x41E < aExp ) {
2335 1.1 ross if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2336 1.1 ross goto invalid;
2337 1.1 ross }
2338 1.1 ross else if ( aExp < 0x3FF ) {
2339 1.1 ross if ( aExp || aSig ) float_set_inexact();
2340 1.1 ross return 0;
2341 1.1 ross }
2342 1.1 ross aSig |= LIT64( 0x0010000000000000 );
2343 1.1 ross shiftCount = 0x433 - aExp;
2344 1.1 ross savedASig = aSig;
2345 1.1 ross aSig >>= shiftCount;
2346 1.1 ross z = aSig;
2347 1.1 ross if ( aSign ) z = - z;
2348 1.1 ross if ( ( z < 0 ) ^ aSign ) {
2349 1.1 ross invalid:
2350 1.1 ross float_raise( float_flag_invalid );
2351 1.1 ross return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2352 1.1 ross }
2353 1.1 ross if ( ( aSig<<shiftCount ) != savedASig ) {
2354 1.1 ross float_set_inexact();
2355 1.1 ross }
2356 1.1 ross return z;
2357 1.1 ross
2358 1.1 ross }
2359 1.1 ross
2360 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2361 1.1 ross /*
2362 1.1 ross -------------------------------------------------------------------------------
2363 1.1 ross Returns the result of converting the double-precision floating-point value
2364 1.1 ross `a' to the 64-bit two's complement integer format. The conversion is
2365 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
2366 1.1 ross Arithmetic---which means in particular that the conversion is rounded
2367 1.1 ross according to the current rounding mode. If `a' is a NaN, the largest
2368 1.1 ross positive integer is returned. Otherwise, if the conversion overflows, the
2369 1.1 ross largest integer with the same sign as `a' is returned.
2370 1.1 ross -------------------------------------------------------------------------------
2371 1.1 ross */
2372 1.1 ross int64 float64_to_int64( float64 a )
2373 1.1 ross {
2374 1.1 ross flag aSign;
2375 1.1 ross int16 aExp, shiftCount;
2376 1.1 ross bits64 aSig, aSigExtra;
2377 1.1 ross
2378 1.1 ross aSig = extractFloat64Frac( a );
2379 1.1 ross aExp = extractFloat64Exp( a );
2380 1.1 ross aSign = extractFloat64Sign( a );
2381 1.1 ross if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2382 1.1 ross shiftCount = 0x433 - aExp;
2383 1.1 ross if ( shiftCount <= 0 ) {
2384 1.1 ross if ( 0x43E < aExp ) {
2385 1.1 ross float_raise( float_flag_invalid );
2386 1.1 ross if ( ! aSign
2387 1.1 ross || ( ( aExp == 0x7FF )
2388 1.1 ross && ( aSig != LIT64( 0x0010000000000000 ) ) )
2389 1.1 ross ) {
2390 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
2391 1.1 ross }
2392 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
2393 1.1 ross }
2394 1.1 ross aSigExtra = 0;
2395 1.1 ross aSig <<= - shiftCount;
2396 1.1 ross }
2397 1.1 ross else {
2398 1.1 ross shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2399 1.1 ross }
2400 1.1 ross return roundAndPackInt64( aSign, aSig, aSigExtra );
2401 1.1 ross
2402 1.1 ross }
2403 1.1 ross
2404 1.1 ross /*
2405 1.1 ross -------------------------------------------------------------------------------
2406 1.1 ross Returns the result of converting the double-precision floating-point value
2407 1.1 ross `a' to the 64-bit two's complement integer format. The conversion is
2408 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
2409 1.1 ross Arithmetic, except that the conversion is always rounded toward zero.
2410 1.1 ross If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2411 1.1 ross the conversion overflows, the largest integer with the same sign as `a' is
2412 1.1 ross returned.
2413 1.1 ross -------------------------------------------------------------------------------
2414 1.1 ross */
2415 1.1 ross int64 float64_to_int64_round_to_zero( float64 a )
2416 1.1 ross {
2417 1.1 ross flag aSign;
2418 1.1 ross int16 aExp, shiftCount;
2419 1.1 ross bits64 aSig;
2420 1.1 ross int64 z;
2421 1.1 ross
2422 1.1 ross aSig = extractFloat64Frac( a );
2423 1.1 ross aExp = extractFloat64Exp( a );
2424 1.1 ross aSign = extractFloat64Sign( a );
2425 1.1 ross if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2426 1.1 ross shiftCount = aExp - 0x433;
2427 1.1 ross if ( 0 <= shiftCount ) {
2428 1.1 ross if ( 0x43E <= aExp ) {
2429 1.1 ross if ( a != LIT64( 0xC3E0000000000000 ) ) {
2430 1.1 ross float_raise( float_flag_invalid );
2431 1.1 ross if ( ! aSign
2432 1.1 ross || ( ( aExp == 0x7FF )
2433 1.1 ross && ( aSig != LIT64( 0x0010000000000000 ) ) )
2434 1.1 ross ) {
2435 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
2436 1.1 ross }
2437 1.1 ross }
2438 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
2439 1.1 ross }
2440 1.1 ross z = aSig<<shiftCount;
2441 1.1 ross }
2442 1.1 ross else {
2443 1.1 ross if ( aExp < 0x3FE ) {
2444 1.1 ross if ( aExp | aSig ) float_set_inexact();
2445 1.1 ross return 0;
2446 1.1 ross }
2447 1.1 ross z = aSig>>( - shiftCount );
2448 1.1 ross if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2449 1.1 ross float_set_inexact();
2450 1.1 ross }
2451 1.1 ross }
2452 1.1 ross if ( aSign ) z = - z;
2453 1.1 ross return z;
2454 1.1 ross
2455 1.1 ross }
2456 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
2457 1.1 ross
2458 1.1 ross /*
2459 1.1 ross -------------------------------------------------------------------------------
2460 1.1 ross Returns the result of converting the double-precision floating-point value
2461 1.1 ross `a' to the single-precision floating-point format. The conversion is
2462 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
2463 1.1 ross Arithmetic.
2464 1.1 ross -------------------------------------------------------------------------------
2465 1.1 ross */
2466 1.1 ross float32 float64_to_float32( float64 a )
2467 1.1 ross {
2468 1.1 ross flag aSign;
2469 1.1 ross int16 aExp;
2470 1.1 ross bits64 aSig;
2471 1.1 ross bits32 zSig;
2472 1.1 ross
2473 1.1 ross aSig = extractFloat64Frac( a );
2474 1.1 ross aExp = extractFloat64Exp( a );
2475 1.1 ross aSign = extractFloat64Sign( a );
2476 1.1 ross if ( aExp == 0x7FF ) {
2477 1.1 ross if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2478 1.1 ross return packFloat32( aSign, 0xFF, 0 );
2479 1.1 ross }
2480 1.1 ross shift64RightJamming( aSig, 22, &aSig );
2481 1.1 ross zSig = aSig;
2482 1.1 ross if ( aExp || zSig ) {
2483 1.1 ross zSig |= 0x40000000;
2484 1.1 ross aExp -= 0x381;
2485 1.1 ross }
2486 1.1 ross return roundAndPackFloat32( aSign, aExp, zSig );
2487 1.1 ross
2488 1.1 ross }
2489 1.1 ross
2490 1.1 ross #ifdef FLOATX80
2491 1.1 ross
2492 1.1 ross /*
2493 1.1 ross -------------------------------------------------------------------------------
2494 1.1 ross Returns the result of converting the double-precision floating-point value
2495 1.1 ross `a' to the extended double-precision floating-point format. The conversion
2496 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
2497 1.1 ross Arithmetic.
2498 1.1 ross -------------------------------------------------------------------------------
2499 1.1 ross */
2500 1.1 ross floatx80 float64_to_floatx80( float64 a )
2501 1.1 ross {
2502 1.1 ross flag aSign;
2503 1.1 ross int16 aExp;
2504 1.1 ross bits64 aSig;
2505 1.1 ross
2506 1.1 ross aSig = extractFloat64Frac( a );
2507 1.1 ross aExp = extractFloat64Exp( a );
2508 1.1 ross aSign = extractFloat64Sign( a );
2509 1.1 ross if ( aExp == 0x7FF ) {
2510 1.1 ross if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2511 1.1 ross return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2512 1.1 ross }
2513 1.1 ross if ( aExp == 0 ) {
2514 1.1 ross if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2515 1.1 ross normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2516 1.1 ross }
2517 1.1 ross return
2518 1.1 ross packFloatx80(
2519 1.1 ross aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2520 1.1 ross
2521 1.1 ross }
2522 1.1 ross
2523 1.1 ross #endif
2524 1.1 ross
2525 1.1 ross #ifdef FLOAT128
2526 1.1 ross
2527 1.1 ross /*
2528 1.1 ross -------------------------------------------------------------------------------
2529 1.1 ross Returns the result of converting the double-precision floating-point value
2530 1.1 ross `a' to the quadruple-precision floating-point format. The conversion is
2531 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
2532 1.1 ross Arithmetic.
2533 1.1 ross -------------------------------------------------------------------------------
2534 1.1 ross */
2535 1.1 ross float128 float64_to_float128( float64 a )
2536 1.1 ross {
2537 1.1 ross flag aSign;
2538 1.1 ross int16 aExp;
2539 1.1 ross bits64 aSig, zSig0, zSig1;
2540 1.1 ross
2541 1.1 ross aSig = extractFloat64Frac( a );
2542 1.1 ross aExp = extractFloat64Exp( a );
2543 1.1 ross aSign = extractFloat64Sign( a );
2544 1.1 ross if ( aExp == 0x7FF ) {
2545 1.1 ross if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2546 1.1 ross return packFloat128( aSign, 0x7FFF, 0, 0 );
2547 1.1 ross }
2548 1.1 ross if ( aExp == 0 ) {
2549 1.1 ross if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2550 1.1 ross normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2551 1.1 ross --aExp;
2552 1.1 ross }
2553 1.1 ross shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2554 1.1 ross return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2555 1.1 ross
2556 1.1 ross }
2557 1.1 ross
2558 1.1 ross #endif
2559 1.1 ross
2560 1.1 ross #ifndef SOFTFLOAT_FOR_GCC
2561 1.1 ross /*
2562 1.1 ross -------------------------------------------------------------------------------
2563 1.1 ross Rounds the double-precision floating-point value `a' to an integer, and
2564 1.1 ross returns the result as a double-precision floating-point value. The
2565 1.1 ross operation is performed according to the IEC/IEEE Standard for Binary
2566 1.1 ross Floating-Point Arithmetic.
2567 1.1 ross -------------------------------------------------------------------------------
2568 1.1 ross */
2569 1.1 ross float64 float64_round_to_int( float64 a )
2570 1.1 ross {
2571 1.1 ross flag aSign;
2572 1.1 ross int16 aExp;
2573 1.1 ross bits64 lastBitMask, roundBitsMask;
2574 1.1 ross int8 roundingMode;
2575 1.1 ross float64 z;
2576 1.1 ross
2577 1.1 ross aExp = extractFloat64Exp( a );
2578 1.1 ross if ( 0x433 <= aExp ) {
2579 1.1 ross if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2580 1.1 ross return propagateFloat64NaN( a, a );
2581 1.1 ross }
2582 1.1 ross return a;
2583 1.1 ross }
2584 1.1 ross if ( aExp < 0x3FF ) {
2585 1.1 ross if ( (bits64) ( a<<1 ) == 0 ) return a;
2586 1.1 ross float_set_inexact();
2587 1.1 ross aSign = extractFloat64Sign( a );
2588 1.1 ross switch ( float_rounding_mode() ) {
2589 1.1 ross case float_round_nearest_even:
2590 1.1 ross if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2591 1.1 ross return packFloat64( aSign, 0x3FF, 0 );
2592 1.1 ross }
2593 1.1 ross break;
2594 1.1 ross case float_round_down:
2595 1.1 ross return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2596 1.1 ross case float_round_up:
2597 1.1 ross return
2598 1.1 ross aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2599 1.1 ross }
2600 1.1 ross return packFloat64( aSign, 0, 0 );
2601 1.1 ross }
2602 1.1 ross lastBitMask = 1;
2603 1.1 ross lastBitMask <<= 0x433 - aExp;
2604 1.1 ross roundBitsMask = lastBitMask - 1;
2605 1.1 ross z = a;
2606 1.1 ross roundingMode = float_rounding_mode();
2607 1.1 ross if ( roundingMode == float_round_nearest_even ) {
2608 1.1 ross z += lastBitMask>>1;
2609 1.1 ross if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2610 1.1 ross }
2611 1.1 ross else if ( roundingMode != float_round_to_zero ) {
2612 1.1 ross if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2613 1.1 ross z += roundBitsMask;
2614 1.1 ross }
2615 1.1 ross }
2616 1.1 ross z &= ~ roundBitsMask;
2617 1.1 ross if ( z != a ) float_set_inexact();
2618 1.1 ross return z;
2619 1.1 ross
2620 1.1 ross }
2621 1.1 ross #endif
2622 1.1 ross
2623 1.1 ross /*
2624 1.1 ross -------------------------------------------------------------------------------
2625 1.1 ross Returns the result of adding the absolute values of the double-precision
2626 1.1 ross floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2627 1.1 ross before being returned. `zSign' is ignored if the result is a NaN.
2628 1.1 ross The addition is performed according to the IEC/IEEE Standard for Binary
2629 1.1 ross Floating-Point Arithmetic.
2630 1.1 ross -------------------------------------------------------------------------------
2631 1.1 ross */
2632 1.1 ross static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2633 1.1 ross {
2634 1.1 ross int16 aExp, bExp, zExp;
2635 1.1 ross bits64 aSig, bSig, zSig;
2636 1.1 ross int16 expDiff;
2637 1.1 ross
2638 1.1 ross aSig = extractFloat64Frac( a );
2639 1.1 ross aExp = extractFloat64Exp( a );
2640 1.1 ross bSig = extractFloat64Frac( b );
2641 1.1 ross bExp = extractFloat64Exp( b );
2642 1.1 ross expDiff = aExp - bExp;
2643 1.1 ross aSig <<= 9;
2644 1.1 ross bSig <<= 9;
2645 1.1 ross if ( 0 < expDiff ) {
2646 1.1 ross if ( aExp == 0x7FF ) {
2647 1.1 ross if ( aSig ) return propagateFloat64NaN( a, b );
2648 1.1 ross return a;
2649 1.1 ross }
2650 1.1 ross if ( bExp == 0 ) {
2651 1.1 ross --expDiff;
2652 1.1 ross }
2653 1.1 ross else {
2654 1.1 ross bSig |= LIT64( 0x2000000000000000 );
2655 1.1 ross }
2656 1.1 ross shift64RightJamming( bSig, expDiff, &bSig );
2657 1.1 ross zExp = aExp;
2658 1.1 ross }
2659 1.1 ross else if ( expDiff < 0 ) {
2660 1.1 ross if ( bExp == 0x7FF ) {
2661 1.1 ross if ( bSig ) return propagateFloat64NaN( a, b );
2662 1.1 ross return packFloat64( zSign, 0x7FF, 0 );
2663 1.1 ross }
2664 1.1 ross if ( aExp == 0 ) {
2665 1.1 ross ++expDiff;
2666 1.1 ross }
2667 1.1 ross else {
2668 1.1 ross aSig |= LIT64( 0x2000000000000000 );
2669 1.1 ross }
2670 1.1 ross shift64RightJamming( aSig, - expDiff, &aSig );
2671 1.1 ross zExp = bExp;
2672 1.1 ross }
2673 1.1 ross else {
2674 1.1 ross if ( aExp == 0x7FF ) {
2675 1.1 ross if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2676 1.1 ross return a;
2677 1.1 ross }
2678 1.1 ross if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2679 1.1 ross zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2680 1.1 ross zExp = aExp;
2681 1.1 ross goto roundAndPack;
2682 1.1 ross }
2683 1.1 ross aSig |= LIT64( 0x2000000000000000 );
2684 1.1 ross zSig = ( aSig + bSig )<<1;
2685 1.1 ross --zExp;
2686 1.1 ross if ( (sbits64) zSig < 0 ) {
2687 1.1 ross zSig = aSig + bSig;
2688 1.1 ross ++zExp;
2689 1.1 ross }
2690 1.1 ross roundAndPack:
2691 1.1 ross return roundAndPackFloat64( zSign, zExp, zSig );
2692 1.1 ross
2693 1.1 ross }
2694 1.1 ross
2695 1.1 ross /*
2696 1.1 ross -------------------------------------------------------------------------------
2697 1.1 ross Returns the result of subtracting the absolute values of the double-
2698 1.1 ross precision floating-point values `a' and `b'. If `zSign' is 1, the
2699 1.1 ross difference is negated before being returned. `zSign' is ignored if the
2700 1.1 ross result is a NaN. The subtraction is performed according to the IEC/IEEE
2701 1.1 ross Standard for Binary Floating-Point Arithmetic.
2702 1.1 ross -------------------------------------------------------------------------------
2703 1.1 ross */
2704 1.1 ross static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2705 1.1 ross {
2706 1.1 ross int16 aExp, bExp, zExp;
2707 1.1 ross bits64 aSig, bSig, zSig;
2708 1.1 ross int16 expDiff;
2709 1.1 ross
2710 1.1 ross aSig = extractFloat64Frac( a );
2711 1.1 ross aExp = extractFloat64Exp( a );
2712 1.1 ross bSig = extractFloat64Frac( b );
2713 1.1 ross bExp = extractFloat64Exp( b );
2714 1.1 ross expDiff = aExp - bExp;
2715 1.1 ross aSig <<= 10;
2716 1.1 ross bSig <<= 10;
2717 1.1 ross if ( 0 < expDiff ) goto aExpBigger;
2718 1.1 ross if ( expDiff < 0 ) goto bExpBigger;
2719 1.1 ross if ( aExp == 0x7FF ) {
2720 1.1 ross if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2721 1.1 ross float_raise( float_flag_invalid );
2722 1.1 ross return float64_default_nan;
2723 1.1 ross }
2724 1.1 ross if ( aExp == 0 ) {
2725 1.1 ross aExp = 1;
2726 1.1 ross bExp = 1;
2727 1.1 ross }
2728 1.1 ross if ( bSig < aSig ) goto aBigger;
2729 1.1 ross if ( aSig < bSig ) goto bBigger;
2730 1.1 ross return packFloat64( float_rounding_mode() == float_round_down, 0, 0 );
2731 1.1 ross bExpBigger:
2732 1.1 ross if ( bExp == 0x7FF ) {
2733 1.1 ross if ( bSig ) return propagateFloat64NaN( a, b );
2734 1.1 ross return packFloat64( zSign ^ 1, 0x7FF, 0 );
2735 1.1 ross }
2736 1.1 ross if ( aExp == 0 ) {
2737 1.1 ross ++expDiff;
2738 1.1 ross }
2739 1.1 ross else {
2740 1.1 ross aSig |= LIT64( 0x4000000000000000 );
2741 1.1 ross }
2742 1.1 ross shift64RightJamming( aSig, - expDiff, &aSig );
2743 1.1 ross bSig |= LIT64( 0x4000000000000000 );
2744 1.1 ross bBigger:
2745 1.1 ross zSig = bSig - aSig;
2746 1.1 ross zExp = bExp;
2747 1.1 ross zSign ^= 1;
2748 1.1 ross goto normalizeRoundAndPack;
2749 1.1 ross aExpBigger:
2750 1.1 ross if ( aExp == 0x7FF ) {
2751 1.1 ross if ( aSig ) return propagateFloat64NaN( a, b );
2752 1.1 ross return a;
2753 1.1 ross }
2754 1.1 ross if ( bExp == 0 ) {
2755 1.1 ross --expDiff;
2756 1.1 ross }
2757 1.1 ross else {
2758 1.1 ross bSig |= LIT64( 0x4000000000000000 );
2759 1.1 ross }
2760 1.1 ross shift64RightJamming( bSig, expDiff, &bSig );
2761 1.1 ross aSig |= LIT64( 0x4000000000000000 );
2762 1.1 ross aBigger:
2763 1.1 ross zSig = aSig - bSig;
2764 1.1 ross zExp = aExp;
2765 1.1 ross normalizeRoundAndPack:
2766 1.1 ross --zExp;
2767 1.1 ross return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2768 1.1 ross
2769 1.1 ross }
2770 1.1 ross
2771 1.1 ross /*
2772 1.1 ross -------------------------------------------------------------------------------
2773 1.1 ross Returns the result of adding the double-precision floating-point values `a'
2774 1.1 ross and `b'. The operation is performed according to the IEC/IEEE Standard for
2775 1.1 ross Binary Floating-Point Arithmetic.
2776 1.1 ross -------------------------------------------------------------------------------
2777 1.1 ross */
2778 1.1 ross float64 float64_add( float64 a, float64 b )
2779 1.1 ross {
2780 1.1 ross flag aSign, bSign;
2781 1.1 ross
2782 1.1 ross aSign = extractFloat64Sign( a );
2783 1.1 ross bSign = extractFloat64Sign( b );
2784 1.1 ross if ( aSign == bSign ) {
2785 1.1 ross return addFloat64Sigs( a, b, aSign );
2786 1.1 ross }
2787 1.1 ross else {
2788 1.1 ross return subFloat64Sigs( a, b, aSign );
2789 1.1 ross }
2790 1.1 ross
2791 1.1 ross }
2792 1.1 ross
2793 1.1 ross /*
2794 1.1 ross -------------------------------------------------------------------------------
2795 1.1 ross Returns the result of subtracting the double-precision floating-point values
2796 1.1 ross `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2797 1.1 ross for Binary Floating-Point Arithmetic.
2798 1.1 ross -------------------------------------------------------------------------------
2799 1.1 ross */
2800 1.1 ross float64 float64_sub( float64 a, float64 b )
2801 1.1 ross {
2802 1.1 ross flag aSign, bSign;
2803 1.1 ross
2804 1.1 ross aSign = extractFloat64Sign( a );
2805 1.1 ross bSign = extractFloat64Sign( b );
2806 1.1 ross if ( aSign == bSign ) {
2807 1.1 ross return subFloat64Sigs( a, b, aSign );
2808 1.1 ross }
2809 1.1 ross else {
2810 1.1 ross return addFloat64Sigs( a, b, aSign );
2811 1.1 ross }
2812 1.1 ross
2813 1.1 ross }
2814 1.1 ross
2815 1.1 ross /*
2816 1.1 ross -------------------------------------------------------------------------------
2817 1.1 ross Returns the result of multiplying the double-precision floating-point values
2818 1.1 ross `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2819 1.1 ross for Binary Floating-Point Arithmetic.
2820 1.1 ross -------------------------------------------------------------------------------
2821 1.1 ross */
2822 1.1 ross float64 float64_mul( float64 a, float64 b )
2823 1.1 ross {
2824 1.1 ross flag aSign, bSign, zSign;
2825 1.1 ross int16 aExp, bExp, zExp;
2826 1.1 ross bits64 aSig, bSig, zSig0, zSig1;
2827 1.1 ross
2828 1.1 ross aSig = extractFloat64Frac( a );
2829 1.1 ross aExp = extractFloat64Exp( a );
2830 1.1 ross aSign = extractFloat64Sign( a );
2831 1.1 ross bSig = extractFloat64Frac( b );
2832 1.1 ross bExp = extractFloat64Exp( b );
2833 1.1 ross bSign = extractFloat64Sign( b );
2834 1.1 ross zSign = aSign ^ bSign;
2835 1.1 ross if ( aExp == 0x7FF ) {
2836 1.1 ross if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2837 1.1 ross return propagateFloat64NaN( a, b );
2838 1.1 ross }
2839 1.1 ross if ( ( bExp | bSig ) == 0 ) {
2840 1.1 ross float_raise( float_flag_invalid );
2841 1.1 ross return float64_default_nan;
2842 1.1 ross }
2843 1.1 ross return packFloat64( zSign, 0x7FF, 0 );
2844 1.1 ross }
2845 1.1 ross if ( bExp == 0x7FF ) {
2846 1.1 ross if ( bSig ) return propagateFloat64NaN( a, b );
2847 1.1 ross if ( ( aExp | aSig ) == 0 ) {
2848 1.1 ross float_raise( float_flag_invalid );
2849 1.1 ross return float64_default_nan;
2850 1.1 ross }
2851 1.1 ross return packFloat64( zSign, 0x7FF, 0 );
2852 1.1 ross }
2853 1.1 ross if ( aExp == 0 ) {
2854 1.1 ross if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2855 1.1 ross normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2856 1.1 ross }
2857 1.1 ross if ( bExp == 0 ) {
2858 1.1 ross if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2859 1.1 ross normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2860 1.1 ross }
2861 1.1 ross zExp = aExp + bExp - 0x3FF;
2862 1.1 ross aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2863 1.1 ross bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2864 1.1 ross mul64To128( aSig, bSig, &zSig0, &zSig1 );
2865 1.1 ross zSig0 |= ( zSig1 != 0 );
2866 1.1 ross if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2867 1.1 ross zSig0 <<= 1;
2868 1.1 ross --zExp;
2869 1.1 ross }
2870 1.1 ross return roundAndPackFloat64( zSign, zExp, zSig0 );
2871 1.1 ross
2872 1.1 ross }
2873 1.1 ross
2874 1.1 ross /*
2875 1.1 ross -------------------------------------------------------------------------------
2876 1.1 ross Returns the result of dividing the double-precision floating-point value `a'
2877 1.1 ross by the corresponding value `b'. The operation is performed according to
2878 1.1 ross the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2879 1.1 ross -------------------------------------------------------------------------------
2880 1.1 ross */
2881 1.1 ross float64 float64_div( float64 a, float64 b )
2882 1.1 ross {
2883 1.1 ross flag aSign, bSign, zSign;
2884 1.1 ross int16 aExp, bExp, zExp;
2885 1.1 ross bits64 aSig, bSig, zSig;
2886 1.1 ross bits64 rem0, rem1;
2887 1.1 ross bits64 term0, term1;
2888 1.1 ross
2889 1.1 ross aSig = extractFloat64Frac( a );
2890 1.1 ross aExp = extractFloat64Exp( a );
2891 1.1 ross aSign = extractFloat64Sign( a );
2892 1.1 ross bSig = extractFloat64Frac( b );
2893 1.1 ross bExp = extractFloat64Exp( b );
2894 1.1 ross bSign = extractFloat64Sign( b );
2895 1.1 ross zSign = aSign ^ bSign;
2896 1.1 ross if ( aExp == 0x7FF ) {
2897 1.1 ross if ( aSig ) return propagateFloat64NaN( a, b );
2898 1.1 ross if ( bExp == 0x7FF ) {
2899 1.1 ross if ( bSig ) return propagateFloat64NaN( a, b );
2900 1.1 ross float_raise( float_flag_invalid );
2901 1.1 ross return float64_default_nan;
2902 1.1 ross }
2903 1.1 ross return packFloat64( zSign, 0x7FF, 0 );
2904 1.1 ross }
2905 1.1 ross if ( bExp == 0x7FF ) {
2906 1.1 ross if ( bSig ) return propagateFloat64NaN( a, b );
2907 1.1 ross return packFloat64( zSign, 0, 0 );
2908 1.1 ross }
2909 1.1 ross if ( bExp == 0 ) {
2910 1.1 ross if ( bSig == 0 ) {
2911 1.1 ross if ( ( aExp | aSig ) == 0 ) {
2912 1.1 ross float_raise( float_flag_invalid );
2913 1.1 ross return float64_default_nan;
2914 1.1 ross }
2915 1.1 ross float_raise( float_flag_divbyzero );
2916 1.1 ross return packFloat64( zSign, 0x7FF, 0 );
2917 1.1 ross }
2918 1.1 ross normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2919 1.1 ross }
2920 1.1 ross if ( aExp == 0 ) {
2921 1.1 ross if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2922 1.1 ross normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2923 1.1 ross }
2924 1.1 ross zExp = aExp - bExp + 0x3FD;
2925 1.1 ross aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2926 1.1 ross bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2927 1.1 ross if ( bSig <= ( aSig + aSig ) ) {
2928 1.1 ross aSig >>= 1;
2929 1.1 ross ++zExp;
2930 1.1 ross }
2931 1.1 ross zSig = estimateDiv128To64( aSig, 0, bSig );
2932 1.1 ross if ( ( zSig & 0x1FF ) <= 2 ) {
2933 1.1 ross mul64To128( bSig, zSig, &term0, &term1 );
2934 1.1 ross sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2935 1.1 ross while ( (sbits64) rem0 < 0 ) {
2936 1.1 ross --zSig;
2937 1.1 ross add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2938 1.1 ross }
2939 1.1 ross zSig |= ( rem1 != 0 );
2940 1.1 ross }
2941 1.1 ross return roundAndPackFloat64( zSign, zExp, zSig );
2942 1.1 ross
2943 1.1 ross }
2944 1.1 ross
2945 1.1 ross #ifndef SOFTFLOAT_FOR_GCC
2946 1.1 ross /*
2947 1.1 ross -------------------------------------------------------------------------------
2948 1.1 ross Returns the remainder of the double-precision floating-point value `a'
2949 1.1 ross with respect to the corresponding value `b'. The operation is performed
2950 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2951 1.1 ross -------------------------------------------------------------------------------
2952 1.1 ross */
2953 1.1 ross float64 float64_rem( float64 a, float64 b )
2954 1.1 ross {
2955 1.1 ross flag aSign, bSign, zSign;
2956 1.1 ross int16 aExp, bExp, expDiff;
2957 1.1 ross bits64 aSig, bSig;
2958 1.1 ross bits64 q, alternateASig;
2959 1.1 ross sbits64 sigMean;
2960 1.1 ross
2961 1.1 ross aSig = extractFloat64Frac( a );
2962 1.1 ross aExp = extractFloat64Exp( a );
2963 1.1 ross aSign = extractFloat64Sign( a );
2964 1.1 ross bSig = extractFloat64Frac( b );
2965 1.1 ross bExp = extractFloat64Exp( b );
2966 1.1 ross bSign = extractFloat64Sign( b );
2967 1.1 ross if ( aExp == 0x7FF ) {
2968 1.1 ross if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2969 1.1 ross return propagateFloat64NaN( a, b );
2970 1.1 ross }
2971 1.1 ross float_raise( float_flag_invalid );
2972 1.1 ross return float64_default_nan;
2973 1.1 ross }
2974 1.1 ross if ( bExp == 0x7FF ) {
2975 1.1 ross if ( bSig ) return propagateFloat64NaN( a, b );
2976 1.1 ross return a;
2977 1.1 ross }
2978 1.1 ross if ( bExp == 0 ) {
2979 1.1 ross if ( bSig == 0 ) {
2980 1.1 ross float_raise( float_flag_invalid );
2981 1.1 ross return float64_default_nan;
2982 1.1 ross }
2983 1.1 ross normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2984 1.1 ross }
2985 1.1 ross if ( aExp == 0 ) {
2986 1.1 ross if ( aSig == 0 ) return a;
2987 1.1 ross normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2988 1.1 ross }
2989 1.1 ross expDiff = aExp - bExp;
2990 1.1 ross aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2991 1.1 ross bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2992 1.1 ross if ( expDiff < 0 ) {
2993 1.1 ross if ( expDiff < -1 ) return a;
2994 1.1 ross aSig >>= 1;
2995 1.1 ross }
2996 1.1 ross q = ( bSig <= aSig );
2997 1.1 ross if ( q ) aSig -= bSig;
2998 1.1 ross expDiff -= 64;
2999 1.1 ross while ( 0 < expDiff ) {
3000 1.1 ross q = estimateDiv128To64( aSig, 0, bSig );
3001 1.1 ross q = ( 2 < q ) ? q - 2 : 0;
3002 1.1 ross aSig = - ( ( bSig>>2 ) * q );
3003 1.1 ross expDiff -= 62;
3004 1.1 ross }
3005 1.1 ross expDiff += 64;
3006 1.1 ross if ( 0 < expDiff ) {
3007 1.1 ross q = estimateDiv128To64( aSig, 0, bSig );
3008 1.1 ross q = ( 2 < q ) ? q - 2 : 0;
3009 1.1 ross q >>= 64 - expDiff;
3010 1.1 ross bSig >>= 2;
3011 1.1 ross aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3012 1.1 ross }
3013 1.1 ross else {
3014 1.1 ross aSig >>= 2;
3015 1.1 ross bSig >>= 2;
3016 1.1 ross }
3017 1.1 ross do {
3018 1.1 ross alternateASig = aSig;
3019 1.1 ross ++q;
3020 1.1 ross aSig -= bSig;
3021 1.1 ross } while ( 0 <= (sbits64) aSig );
3022 1.1 ross sigMean = aSig + alternateASig;
3023 1.1 ross if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3024 1.1 ross aSig = alternateASig;
3025 1.1 ross }
3026 1.1 ross zSign = ( (sbits64) aSig < 0 );
3027 1.1 ross if ( zSign ) aSig = - aSig;
3028 1.1 ross return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3029 1.1 ross
3030 1.1 ross }
3031 1.1 ross
3032 1.1 ross /*
3033 1.1 ross -------------------------------------------------------------------------------
3034 1.1 ross Returns the square root of the double-precision floating-point value `a'.
3035 1.1 ross The operation is performed according to the IEC/IEEE Standard for Binary
3036 1.1 ross Floating-Point Arithmetic.
3037 1.1 ross -------------------------------------------------------------------------------
3038 1.1 ross */
3039 1.1 ross float64 float64_sqrt( float64 a )
3040 1.1 ross {
3041 1.1 ross flag aSign;
3042 1.1 ross int16 aExp, zExp;
3043 1.1 ross bits64 aSig, zSig, doubleZSig;
3044 1.1 ross bits64 rem0, rem1, term0, term1;
3045 1.1 ross
3046 1.1 ross aSig = extractFloat64Frac( a );
3047 1.1 ross aExp = extractFloat64Exp( a );
3048 1.1 ross aSign = extractFloat64Sign( a );
3049 1.1 ross if ( aExp == 0x7FF ) {
3050 1.1 ross if ( aSig ) return propagateFloat64NaN( a, a );
3051 1.1 ross if ( ! aSign ) return a;
3052 1.1 ross float_raise( float_flag_invalid );
3053 1.1 ross return float64_default_nan;
3054 1.1 ross }
3055 1.1 ross if ( aSign ) {
3056 1.1 ross if ( ( aExp | aSig ) == 0 ) return a;
3057 1.1 ross float_raise( float_flag_invalid );
3058 1.1 ross return float64_default_nan;
3059 1.1 ross }
3060 1.1 ross if ( aExp == 0 ) {
3061 1.1 ross if ( aSig == 0 ) return 0;
3062 1.1 ross normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3063 1.1 ross }
3064 1.1 ross zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3065 1.1 ross aSig |= LIT64( 0x0010000000000000 );
3066 1.1 ross zSig = estimateSqrt32( aExp, aSig>>21 );
3067 1.1 ross aSig <<= 9 - ( aExp & 1 );
3068 1.1 ross zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3069 1.1 ross if ( ( zSig & 0x1FF ) <= 5 ) {
3070 1.1 ross doubleZSig = zSig<<1;
3071 1.1 ross mul64To128( zSig, zSig, &term0, &term1 );
3072 1.1 ross sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3073 1.1 ross while ( (sbits64) rem0 < 0 ) {
3074 1.1 ross --zSig;
3075 1.1 ross doubleZSig -= 2;
3076 1.1 ross add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3077 1.1 ross }
3078 1.1 ross zSig |= ( ( rem0 | rem1 ) != 0 );
3079 1.1 ross }
3080 1.1 ross return roundAndPackFloat64( 0, zExp, zSig );
3081 1.1 ross
3082 1.1 ross }
3083 1.1 ross #endif
3084 1.1 ross
3085 1.1 ross /*
3086 1.1 ross -------------------------------------------------------------------------------
3087 1.1 ross Returns 1 if the double-precision floating-point value `a' is equal to the
3088 1.1 ross corresponding value `b', and 0 otherwise. The comparison is performed
3089 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3090 1.1 ross -------------------------------------------------------------------------------
3091 1.1 ross */
3092 1.1 ross flag float64_eq( float64 a, float64 b )
3093 1.1 ross {
3094 1.1 ross
3095 1.1 ross if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3096 1.1 ross || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3097 1.1 ross ) {
3098 1.1 ross if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3099 1.1 ross float_raise( float_flag_invalid );
3100 1.1 ross }
3101 1.1 ross return 0;
3102 1.1 ross }
3103 1.1 ross return ( a == b ) ||
3104 1.1 ross ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3105 1.1 ross
3106 1.1 ross }
3107 1.1 ross
3108 1.1 ross /*
3109 1.1 ross -------------------------------------------------------------------------------
3110 1.1 ross Returns 1 if the double-precision floating-point value `a' is less than or
3111 1.1 ross equal to the corresponding value `b', and 0 otherwise. The comparison is
3112 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
3113 1.1 ross Arithmetic.
3114 1.1 ross -------------------------------------------------------------------------------
3115 1.1 ross */
3116 1.1 ross flag float64_le( float64 a, float64 b )
3117 1.1 ross {
3118 1.1 ross flag aSign, bSign;
3119 1.1 ross
3120 1.1 ross if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3121 1.1 ross || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3122 1.1 ross ) {
3123 1.1 ross float_raise( float_flag_invalid );
3124 1.1 ross return 0;
3125 1.1 ross }
3126 1.1 ross aSign = extractFloat64Sign( a );
3127 1.1 ross bSign = extractFloat64Sign( b );
3128 1.1 ross if ( aSign != bSign )
3129 1.1 ross return aSign ||
3130 1.1 ross ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3131 1.1 ross 0 );
3132 1.1 ross return ( a == b ) ||
3133 1.1 ross ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3134 1.1 ross
3135 1.1 ross }
3136 1.1 ross
3137 1.1 ross /*
3138 1.1 ross -------------------------------------------------------------------------------
3139 1.1 ross Returns 1 if the double-precision floating-point value `a' is less than
3140 1.1 ross the corresponding value `b', and 0 otherwise. The comparison is performed
3141 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3142 1.1 ross -------------------------------------------------------------------------------
3143 1.1 ross */
3144 1.1 ross flag float64_lt( float64 a, float64 b )
3145 1.1 ross {
3146 1.1 ross flag aSign, bSign;
3147 1.1 ross
3148 1.1 ross if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3149 1.1 ross || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3150 1.1 ross ) {
3151 1.1 ross float_raise( float_flag_invalid );
3152 1.1 ross return 0;
3153 1.1 ross }
3154 1.1 ross aSign = extractFloat64Sign( a );
3155 1.1 ross bSign = extractFloat64Sign( b );
3156 1.1 ross if ( aSign != bSign )
3157 1.1 ross return aSign &&
3158 1.1 ross ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3159 1.1 ross 0 );
3160 1.1 ross return ( a != b ) &&
3161 1.1 ross ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3162 1.1 ross
3163 1.1 ross }
3164 1.1 ross
3165 1.1 ross #ifndef SOFTFLOAT_FOR_GCC
3166 1.1 ross /*
3167 1.1 ross -------------------------------------------------------------------------------
3168 1.1 ross Returns 1 if the double-precision floating-point value `a' is equal to the
3169 1.1 ross corresponding value `b', and 0 otherwise. The invalid exception is raised
3170 1.1 ross if either operand is a NaN. Otherwise, the comparison is performed
3171 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3172 1.1 ross -------------------------------------------------------------------------------
3173 1.1 ross */
3174 1.1 ross flag float64_eq_signaling( float64 a, float64 b )
3175 1.1 ross {
3176 1.1 ross
3177 1.1 ross if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3178 1.1 ross || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3179 1.1 ross ) {
3180 1.1 ross float_raise( float_flag_invalid );
3181 1.1 ross return 0;
3182 1.1 ross }
3183 1.1 ross return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3184 1.1 ross
3185 1.1 ross }
3186 1.1 ross
3187 1.1 ross /*
3188 1.1 ross -------------------------------------------------------------------------------
3189 1.1 ross Returns 1 if the double-precision floating-point value `a' is less than or
3190 1.1 ross equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3191 1.1 ross cause an exception. Otherwise, the comparison is performed according to the
3192 1.1 ross IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3193 1.1 ross -------------------------------------------------------------------------------
3194 1.1 ross */
3195 1.1 ross flag float64_le_quiet( float64 a, float64 b )
3196 1.1 ross {
3197 1.1 ross flag aSign, bSign;
3198 1.1 ross
3199 1.1 ross if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3200 1.1 ross || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3201 1.1 ross ) {
3202 1.1 ross if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3203 1.1 ross float_raise( float_flag_invalid );
3204 1.1 ross }
3205 1.1 ross return 0;
3206 1.1 ross }
3207 1.1 ross aSign = extractFloat64Sign( a );
3208 1.1 ross bSign = extractFloat64Sign( b );
3209 1.1 ross if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3210 1.1 ross return ( a == b ) || ( aSign ^ ( a < b ) );
3211 1.1 ross
3212 1.1 ross }
3213 1.1 ross
3214 1.1 ross /*
3215 1.1 ross -------------------------------------------------------------------------------
3216 1.1 ross Returns 1 if the double-precision floating-point value `a' is less than
3217 1.1 ross the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3218 1.1 ross exception. Otherwise, the comparison is performed according to the IEC/IEEE
3219 1.1 ross Standard for Binary Floating-Point Arithmetic.
3220 1.1 ross -------------------------------------------------------------------------------
3221 1.1 ross */
3222 1.1 ross flag float64_lt_quiet( float64 a, float64 b )
3223 1.1 ross {
3224 1.1 ross flag aSign, bSign;
3225 1.1 ross
3226 1.1 ross if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3227 1.1 ross || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3228 1.1 ross ) {
3229 1.1 ross if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3230 1.1 ross float_raise( float_flag_invalid );
3231 1.1 ross }
3232 1.1 ross return 0;
3233 1.1 ross }
3234 1.1 ross aSign = extractFloat64Sign( a );
3235 1.1 ross bSign = extractFloat64Sign( b );
3236 1.1 ross if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3237 1.1 ross return ( a != b ) && ( aSign ^ ( a < b ) );
3238 1.1 ross
3239 1.1 ross }
3240 1.1 ross #endif
3241 1.1 ross
3242 1.1 ross #ifdef FLOATX80
3243 1.1 ross
3244 1.1 ross /*
3245 1.1 ross -------------------------------------------------------------------------------
3246 1.1 ross Returns the result of converting the extended double-precision floating-
3247 1.1 ross point value `a' to the 32-bit two's complement integer format. The
3248 1.1 ross conversion is performed according to the IEC/IEEE Standard for Binary
3249 1.1 ross Floating-Point Arithmetic---which means in particular that the conversion
3250 1.1 ross is rounded according to the current rounding mode. If `a' is a NaN, the
3251 1.1 ross largest positive integer is returned. Otherwise, if the conversion
3252 1.1 ross overflows, the largest integer with the same sign as `a' is returned.
3253 1.1 ross -------------------------------------------------------------------------------
3254 1.1 ross */
3255 1.1 ross int32 floatx80_to_int32( floatx80 a )
3256 1.1 ross {
3257 1.1 ross flag aSign;
3258 1.1 ross int32 aExp, shiftCount;
3259 1.1 ross bits64 aSig;
3260 1.1 ross
3261 1.1 ross aSig = extractFloatx80Frac( a );
3262 1.1 ross aExp = extractFloatx80Exp( a );
3263 1.1 ross aSign = extractFloatx80Sign( a );
3264 1.1 ross if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3265 1.1 ross shiftCount = 0x4037 - aExp;
3266 1.1 ross if ( shiftCount <= 0 ) shiftCount = 1;
3267 1.1 ross shift64RightJamming( aSig, shiftCount, &aSig );
3268 1.1 ross return roundAndPackInt32( aSign, aSig );
3269 1.1 ross
3270 1.1 ross }
3271 1.1 ross
3272 1.1 ross /*
3273 1.1 ross -------------------------------------------------------------------------------
3274 1.1 ross Returns the result of converting the extended double-precision floating-
3275 1.1 ross point value `a' to the 32-bit two's complement integer format. The
3276 1.1 ross conversion is performed according to the IEC/IEEE Standard for Binary
3277 1.1 ross Floating-Point Arithmetic, except that the conversion is always rounded
3278 1.1 ross toward zero. If `a' is a NaN, the largest positive integer is returned.
3279 1.1 ross Otherwise, if the conversion overflows, the largest integer with the same
3280 1.1 ross sign as `a' is returned.
3281 1.1 ross -------------------------------------------------------------------------------
3282 1.1 ross */
3283 1.1 ross int32 floatx80_to_int32_round_to_zero( floatx80 a )
3284 1.1 ross {
3285 1.1 ross flag aSign;
3286 1.1 ross int32 aExp, shiftCount;
3287 1.1 ross bits64 aSig, savedASig;
3288 1.1 ross int32 z;
3289 1.1 ross
3290 1.1 ross aSig = extractFloatx80Frac( a );
3291 1.1 ross aExp = extractFloatx80Exp( a );
3292 1.1 ross aSign = extractFloatx80Sign( a );
3293 1.1 ross if ( 0x401E < aExp ) {
3294 1.1 ross if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3295 1.1 ross goto invalid;
3296 1.1 ross }
3297 1.1 ross else if ( aExp < 0x3FFF ) {
3298 1.1 ross if ( aExp || aSig ) float_set_inexact();
3299 1.1 ross return 0;
3300 1.1 ross }
3301 1.1 ross shiftCount = 0x403E - aExp;
3302 1.1 ross savedASig = aSig;
3303 1.1 ross aSig >>= shiftCount;
3304 1.1 ross z = aSig;
3305 1.1 ross if ( aSign ) z = - z;
3306 1.1 ross if ( ( z < 0 ) ^ aSign ) {
3307 1.1 ross invalid:
3308 1.1 ross float_raise( float_flag_invalid );
3309 1.1 ross return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3310 1.1 ross }
3311 1.1 ross if ( ( aSig<<shiftCount ) != savedASig ) {
3312 1.1 ross float_set_inexact();
3313 1.1 ross }
3314 1.1 ross return z;
3315 1.1 ross
3316 1.1 ross }
3317 1.1 ross
3318 1.1 ross /*
3319 1.1 ross -------------------------------------------------------------------------------
3320 1.1 ross Returns the result of converting the extended double-precision floating-
3321 1.1 ross point value `a' to the 64-bit two's complement integer format. The
3322 1.1 ross conversion is performed according to the IEC/IEEE Standard for Binary
3323 1.1 ross Floating-Point Arithmetic---which means in particular that the conversion
3324 1.1 ross is rounded according to the current rounding mode. If `a' is a NaN,
3325 1.1 ross the largest positive integer is returned. Otherwise, if the conversion
3326 1.1 ross overflows, the largest integer with the same sign as `a' is returned.
3327 1.1 ross -------------------------------------------------------------------------------
3328 1.1 ross */
3329 1.1 ross int64 floatx80_to_int64( floatx80 a )
3330 1.1 ross {
3331 1.1 ross flag aSign;
3332 1.1 ross int32 aExp, shiftCount;
3333 1.1 ross bits64 aSig, aSigExtra;
3334 1.1 ross
3335 1.1 ross aSig = extractFloatx80Frac( a );
3336 1.1 ross aExp = extractFloatx80Exp( a );
3337 1.1 ross aSign = extractFloatx80Sign( a );
3338 1.1 ross shiftCount = 0x403E - aExp;
3339 1.1 ross if ( shiftCount <= 0 ) {
3340 1.1 ross if ( shiftCount ) {
3341 1.1 ross float_raise( float_flag_invalid );
3342 1.1 ross if ( ! aSign
3343 1.1 ross || ( ( aExp == 0x7FFF )
3344 1.1 ross && ( aSig != LIT64( 0x8000000000000000 ) ) )
3345 1.1 ross ) {
3346 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
3347 1.1 ross }
3348 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
3349 1.1 ross }
3350 1.1 ross aSigExtra = 0;
3351 1.1 ross }
3352 1.1 ross else {
3353 1.1 ross shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3354 1.1 ross }
3355 1.1 ross return roundAndPackInt64( aSign, aSig, aSigExtra );
3356 1.1 ross
3357 1.1 ross }
3358 1.1 ross
3359 1.1 ross /*
3360 1.1 ross -------------------------------------------------------------------------------
3361 1.1 ross Returns the result of converting the extended double-precision floating-
3362 1.1 ross point value `a' to the 64-bit two's complement integer format. The
3363 1.1 ross conversion is performed according to the IEC/IEEE Standard for Binary
3364 1.1 ross Floating-Point Arithmetic, except that the conversion is always rounded
3365 1.1 ross toward zero. If `a' is a NaN, the largest positive integer is returned.
3366 1.1 ross Otherwise, if the conversion overflows, the largest integer with the same
3367 1.1 ross sign as `a' is returned.
3368 1.1 ross -------------------------------------------------------------------------------
3369 1.1 ross */
3370 1.1 ross int64 floatx80_to_int64_round_to_zero( floatx80 a )
3371 1.1 ross {
3372 1.1 ross flag aSign;
3373 1.1 ross int32 aExp, shiftCount;
3374 1.1 ross bits64 aSig;
3375 1.1 ross int64 z;
3376 1.1 ross
3377 1.1 ross aSig = extractFloatx80Frac( a );
3378 1.1 ross aExp = extractFloatx80Exp( a );
3379 1.1 ross aSign = extractFloatx80Sign( a );
3380 1.1 ross shiftCount = aExp - 0x403E;
3381 1.1 ross if ( 0 <= shiftCount ) {
3382 1.1 ross aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3383 1.1 ross if ( ( a.high != 0xC03E ) || aSig ) {
3384 1.1 ross float_raise( float_flag_invalid );
3385 1.1 ross if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3386 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
3387 1.1 ross }
3388 1.1 ross }
3389 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
3390 1.1 ross }
3391 1.1 ross else if ( aExp < 0x3FFF ) {
3392 1.1 ross if ( aExp | aSig ) float_set_inexact();
3393 1.1 ross return 0;
3394 1.1 ross }
3395 1.1 ross z = aSig>>( - shiftCount );
3396 1.1 ross if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3397 1.1 ross float_set_inexact();
3398 1.1 ross }
3399 1.1 ross if ( aSign ) z = - z;
3400 1.1 ross return z;
3401 1.1 ross
3402 1.1 ross }
3403 1.1 ross
3404 1.1 ross /*
3405 1.1 ross -------------------------------------------------------------------------------
3406 1.1 ross Returns the result of converting the extended double-precision floating-
3407 1.1 ross point value `a' to the single-precision floating-point format. The
3408 1.1 ross conversion is performed according to the IEC/IEEE Standard for Binary
3409 1.1 ross Floating-Point Arithmetic.
3410 1.1 ross -------------------------------------------------------------------------------
3411 1.1 ross */
3412 1.1 ross float32 floatx80_to_float32( floatx80 a )
3413 1.1 ross {
3414 1.1 ross flag aSign;
3415 1.1 ross int32 aExp;
3416 1.1 ross bits64 aSig;
3417 1.1 ross
3418 1.1 ross aSig = extractFloatx80Frac( a );
3419 1.1 ross aExp = extractFloatx80Exp( a );
3420 1.1 ross aSign = extractFloatx80Sign( a );
3421 1.1 ross if ( aExp == 0x7FFF ) {
3422 1.1 ross if ( (bits64) ( aSig<<1 ) ) {
3423 1.1 ross return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3424 1.1 ross }
3425 1.1 ross return packFloat32( aSign, 0xFF, 0 );
3426 1.1 ross }
3427 1.1 ross shift64RightJamming( aSig, 33, &aSig );
3428 1.1 ross if ( aExp || aSig ) aExp -= 0x3F81;
3429 1.1 ross return roundAndPackFloat32( aSign, aExp, aSig );
3430 1.1 ross
3431 1.1 ross }
3432 1.1 ross
3433 1.1 ross /*
3434 1.1 ross -------------------------------------------------------------------------------
3435 1.1 ross Returns the result of converting the extended double-precision floating-
3436 1.1 ross point value `a' to the double-precision floating-point format. The
3437 1.1 ross conversion is performed according to the IEC/IEEE Standard for Binary
3438 1.1 ross Floating-Point Arithmetic.
3439 1.1 ross -------------------------------------------------------------------------------
3440 1.1 ross */
3441 1.1 ross float64 floatx80_to_float64( floatx80 a )
3442 1.1 ross {
3443 1.1 ross flag aSign;
3444 1.1 ross int32 aExp;
3445 1.1 ross bits64 aSig, zSig;
3446 1.1 ross
3447 1.1 ross aSig = extractFloatx80Frac( a );
3448 1.1 ross aExp = extractFloatx80Exp( a );
3449 1.1 ross aSign = extractFloatx80Sign( a );
3450 1.1 ross if ( aExp == 0x7FFF ) {
3451 1.1 ross if ( (bits64) ( aSig<<1 ) ) {
3452 1.1 ross return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3453 1.1 ross }
3454 1.1 ross return packFloat64( aSign, 0x7FF, 0 );
3455 1.1 ross }
3456 1.1 ross shift64RightJamming( aSig, 1, &zSig );
3457 1.1 ross if ( aExp || aSig ) aExp -= 0x3C01;
3458 1.1 ross return roundAndPackFloat64( aSign, aExp, zSig );
3459 1.1 ross
3460 1.1 ross }
3461 1.1 ross
3462 1.1 ross #ifdef FLOAT128
3463 1.1 ross
3464 1.1 ross /*
3465 1.1 ross -------------------------------------------------------------------------------
3466 1.1 ross Returns the result of converting the extended double-precision floating-
3467 1.1 ross point value `a' to the quadruple-precision floating-point format. The
3468 1.1 ross conversion is performed according to the IEC/IEEE Standard for Binary
3469 1.1 ross Floating-Point Arithmetic.
3470 1.1 ross -------------------------------------------------------------------------------
3471 1.1 ross */
3472 1.1 ross float128 floatx80_to_float128( floatx80 a )
3473 1.1 ross {
3474 1.1 ross flag aSign;
3475 1.1 ross int16 aExp;
3476 1.1 ross bits64 aSig, zSig0, zSig1;
3477 1.1 ross
3478 1.1 ross aSig = extractFloatx80Frac( a );
3479 1.1 ross aExp = extractFloatx80Exp( a );
3480 1.1 ross aSign = extractFloatx80Sign( a );
3481 1.1 ross if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3482 1.1 ross return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3483 1.1 ross }
3484 1.1 ross shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3485 1.1 ross return packFloat128( aSign, aExp, zSig0, zSig1 );
3486 1.1 ross
3487 1.1 ross }
3488 1.1 ross
3489 1.1 ross #endif
3490 1.1 ross
3491 1.1 ross /*
3492 1.1 ross -------------------------------------------------------------------------------
3493 1.1 ross Rounds the extended double-precision floating-point value `a' to an integer,
3494 1.1 ross and returns the result as an extended quadruple-precision floating-point
3495 1.1 ross value. The operation is performed according to the IEC/IEEE Standard for
3496 1.1 ross Binary Floating-Point Arithmetic.
3497 1.1 ross -------------------------------------------------------------------------------
3498 1.1 ross */
3499 1.1 ross floatx80 floatx80_round_to_int( floatx80 a )
3500 1.1 ross {
3501 1.1 ross flag aSign;
3502 1.1 ross int32 aExp;
3503 1.1 ross bits64 lastBitMask, roundBitsMask;
3504 1.1 ross int8 roundingMode;
3505 1.1 ross floatx80 z;
3506 1.1 ross
3507 1.1 ross aExp = extractFloatx80Exp( a );
3508 1.1 ross if ( 0x403E <= aExp ) {
3509 1.1 ross if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3510 1.1 ross return propagateFloatx80NaN( a, a );
3511 1.1 ross }
3512 1.1 ross return a;
3513 1.1 ross }
3514 1.1 ross if ( aExp < 0x3FFF ) {
3515 1.1 ross if ( ( aExp == 0 )
3516 1.1 ross && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3517 1.1 ross return a;
3518 1.1 ross }
3519 1.1 ross float_set_inexact();
3520 1.1 ross aSign = extractFloatx80Sign( a );
3521 1.1 ross switch ( float_rounding_mode() ) {
3522 1.1 ross case float_round_nearest_even:
3523 1.1 ross if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3524 1.1 ross ) {
3525 1.1 ross return
3526 1.1 ross packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3527 1.1 ross }
3528 1.1 ross break;
3529 1.1 ross case float_round_down:
3530 1.1 ross return
3531 1.1 ross aSign ?
3532 1.1 ross packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3533 1.1 ross : packFloatx80( 0, 0, 0 );
3534 1.1 ross case float_round_up:
3535 1.1 ross return
3536 1.1 ross aSign ? packFloatx80( 1, 0, 0 )
3537 1.1 ross : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3538 1.1 ross }
3539 1.1 ross return packFloatx80( aSign, 0, 0 );
3540 1.1 ross }
3541 1.1 ross lastBitMask = 1;
3542 1.1 ross lastBitMask <<= 0x403E - aExp;
3543 1.1 ross roundBitsMask = lastBitMask - 1;
3544 1.1 ross z = a;
3545 1.1 ross roundingMode = float_rounding_mode();
3546 1.1 ross if ( roundingMode == float_round_nearest_even ) {
3547 1.1 ross z.low += lastBitMask>>1;
3548 1.1 ross if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3549 1.1 ross }
3550 1.1 ross else if ( roundingMode != float_round_to_zero ) {
3551 1.1 ross if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3552 1.1 ross z.low += roundBitsMask;
3553 1.1 ross }
3554 1.1 ross }
3555 1.1 ross z.low &= ~ roundBitsMask;
3556 1.1 ross if ( z.low == 0 ) {
3557 1.1 ross ++z.high;
3558 1.1 ross z.low = LIT64( 0x8000000000000000 );
3559 1.1 ross }
3560 1.1 ross if ( z.low != a.low ) float_set_inexact();
3561 1.1 ross return z;
3562 1.1 ross
3563 1.1 ross }
3564 1.1 ross
3565 1.1 ross /*
3566 1.1 ross -------------------------------------------------------------------------------
3567 1.1 ross Returns the result of adding the absolute values of the extended double-
3568 1.1 ross precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3569 1.1 ross negated before being returned. `zSign' is ignored if the result is a NaN.
3570 1.1 ross The addition is performed according to the IEC/IEEE Standard for Binary
3571 1.1 ross Floating-Point Arithmetic.
3572 1.1 ross -------------------------------------------------------------------------------
3573 1.1 ross */
3574 1.1 ross static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3575 1.1 ross {
3576 1.1 ross int32 aExp, bExp, zExp;
3577 1.1 ross bits64 aSig, bSig, zSig0, zSig1;
3578 1.1 ross int32 expDiff;
3579 1.1 ross
3580 1.1 ross aSig = extractFloatx80Frac( a );
3581 1.1 ross aExp = extractFloatx80Exp( a );
3582 1.1 ross bSig = extractFloatx80Frac( b );
3583 1.1 ross bExp = extractFloatx80Exp( b );
3584 1.1 ross expDiff = aExp - bExp;
3585 1.1 ross if ( 0 < expDiff ) {
3586 1.1 ross if ( aExp == 0x7FFF ) {
3587 1.1 ross if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3588 1.1 ross return a;
3589 1.1 ross }
3590 1.1 ross if ( bExp == 0 ) --expDiff;
3591 1.1 ross shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3592 1.1 ross zExp = aExp;
3593 1.1 ross }
3594 1.1 ross else if ( expDiff < 0 ) {
3595 1.1 ross if ( bExp == 0x7FFF ) {
3596 1.1 ross if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3597 1.1 ross return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3598 1.1 ross }
3599 1.1 ross if ( aExp == 0 ) ++expDiff;
3600 1.1 ross shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3601 1.1 ross zExp = bExp;
3602 1.1 ross }
3603 1.1 ross else {
3604 1.1 ross if ( aExp == 0x7FFF ) {
3605 1.1 ross if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3606 1.1 ross return propagateFloatx80NaN( a, b );
3607 1.1 ross }
3608 1.1 ross return a;
3609 1.1 ross }
3610 1.1 ross zSig1 = 0;
3611 1.1 ross zSig0 = aSig + bSig;
3612 1.1 ross if ( aExp == 0 ) {
3613 1.1 ross normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3614 1.1 ross goto roundAndPack;
3615 1.1 ross }
3616 1.1 ross zExp = aExp;
3617 1.1 ross goto shiftRight1;
3618 1.1 ross }
3619 1.1 ross zSig0 = aSig + bSig;
3620 1.1 ross if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3621 1.1 ross shiftRight1:
3622 1.1 ross shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3623 1.1 ross zSig0 |= LIT64( 0x8000000000000000 );
3624 1.1 ross ++zExp;
3625 1.1 ross roundAndPack:
3626 1.1 ross return
3627 1.1 ross roundAndPackFloatx80(
3628 1.1 ross floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3629 1.1 ross
3630 1.1 ross }
3631 1.1 ross
3632 1.1 ross /*
3633 1.1 ross -------------------------------------------------------------------------------
3634 1.1 ross Returns the result of subtracting the absolute values of the extended
3635 1.1 ross double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3636 1.1 ross difference is negated before being returned. `zSign' is ignored if the
3637 1.1 ross result is a NaN. The subtraction is performed according to the IEC/IEEE
3638 1.1 ross Standard for Binary Floating-Point Arithmetic.
3639 1.1 ross -------------------------------------------------------------------------------
3640 1.1 ross */
3641 1.1 ross static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3642 1.1 ross {
3643 1.1 ross int32 aExp, bExp, zExp;
3644 1.1 ross bits64 aSig, bSig, zSig0, zSig1;
3645 1.1 ross int32 expDiff;
3646 1.1 ross floatx80 z;
3647 1.1 ross
3648 1.1 ross aSig = extractFloatx80Frac( a );
3649 1.1 ross aExp = extractFloatx80Exp( a );
3650 1.1 ross bSig = extractFloatx80Frac( b );
3651 1.1 ross bExp = extractFloatx80Exp( b );
3652 1.1 ross expDiff = aExp - bExp;
3653 1.1 ross if ( 0 < expDiff ) goto aExpBigger;
3654 1.1 ross if ( expDiff < 0 ) goto bExpBigger;
3655 1.1 ross if ( aExp == 0x7FFF ) {
3656 1.1 ross if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3657 1.1 ross return propagateFloatx80NaN( a, b );
3658 1.1 ross }
3659 1.1 ross float_raise( float_flag_invalid );
3660 1.1 ross z.low = floatx80_default_nan_low;
3661 1.1 ross z.high = floatx80_default_nan_high;
3662 1.1 ross return z;
3663 1.1 ross }
3664 1.1 ross if ( aExp == 0 ) {
3665 1.1 ross aExp = 1;
3666 1.1 ross bExp = 1;
3667 1.1 ross }
3668 1.1 ross zSig1 = 0;
3669 1.1 ross if ( bSig < aSig ) goto aBigger;
3670 1.1 ross if ( aSig < bSig ) goto bBigger;
3671 1.1 ross return packFloatx80( float_rounding_mode() == float_round_down, 0, 0 );
3672 1.1 ross bExpBigger:
3673 1.1 ross if ( bExp == 0x7FFF ) {
3674 1.1 ross if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3675 1.1 ross return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3676 1.1 ross }
3677 1.1 ross if ( aExp == 0 ) ++expDiff;
3678 1.1 ross shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3679 1.1 ross bBigger:
3680 1.1 ross sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3681 1.1 ross zExp = bExp;
3682 1.1 ross zSign ^= 1;
3683 1.1 ross goto normalizeRoundAndPack;
3684 1.1 ross aExpBigger:
3685 1.1 ross if ( aExp == 0x7FFF ) {
3686 1.1 ross if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3687 1.1 ross return a;
3688 1.1 ross }
3689 1.1 ross if ( bExp == 0 ) --expDiff;
3690 1.1 ross shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3691 1.1 ross aBigger:
3692 1.1 ross sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3693 1.1 ross zExp = aExp;
3694 1.1 ross normalizeRoundAndPack:
3695 1.1 ross return
3696 1.1 ross normalizeRoundAndPackFloatx80(
3697 1.1 ross floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3698 1.1 ross
3699 1.1 ross }
3700 1.1 ross
3701 1.1 ross /*
3702 1.1 ross -------------------------------------------------------------------------------
3703 1.1 ross Returns the result of adding the extended double-precision floating-point
3704 1.1 ross values `a' and `b'. The operation is performed according to the IEC/IEEE
3705 1.1 ross Standard for Binary Floating-Point Arithmetic.
3706 1.1 ross -------------------------------------------------------------------------------
3707 1.1 ross */
3708 1.1 ross floatx80 floatx80_add( floatx80 a, floatx80 b )
3709 1.1 ross {
3710 1.1 ross flag aSign, bSign;
3711 1.1 ross
3712 1.1 ross aSign = extractFloatx80Sign( a );
3713 1.1 ross bSign = extractFloatx80Sign( b );
3714 1.1 ross if ( aSign == bSign ) {
3715 1.1 ross return addFloatx80Sigs( a, b, aSign );
3716 1.1 ross }
3717 1.1 ross else {
3718 1.1 ross return subFloatx80Sigs( a, b, aSign );
3719 1.1 ross }
3720 1.1 ross
3721 1.1 ross }
3722 1.1 ross
3723 1.1 ross /*
3724 1.1 ross -------------------------------------------------------------------------------
3725 1.1 ross Returns the result of subtracting the extended double-precision floating-
3726 1.1 ross point values `a' and `b'. The operation is performed according to the
3727 1.1 ross IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3728 1.1 ross -------------------------------------------------------------------------------
3729 1.1 ross */
3730 1.1 ross floatx80 floatx80_sub( floatx80 a, floatx80 b )
3731 1.1 ross {
3732 1.1 ross flag aSign, bSign;
3733 1.1 ross
3734 1.1 ross aSign = extractFloatx80Sign( a );
3735 1.1 ross bSign = extractFloatx80Sign( b );
3736 1.1 ross if ( aSign == bSign ) {
3737 1.1 ross return subFloatx80Sigs( a, b, aSign );
3738 1.1 ross }
3739 1.1 ross else {
3740 1.1 ross return addFloatx80Sigs( a, b, aSign );
3741 1.1 ross }
3742 1.1 ross
3743 1.1 ross }
3744 1.1 ross
3745 1.1 ross /*
3746 1.1 ross -------------------------------------------------------------------------------
3747 1.1 ross Returns the result of multiplying the extended double-precision floating-
3748 1.1 ross point values `a' and `b'. The operation is performed according to the
3749 1.1 ross IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3750 1.1 ross -------------------------------------------------------------------------------
3751 1.1 ross */
3752 1.1 ross floatx80 floatx80_mul( floatx80 a, floatx80 b )
3753 1.1 ross {
3754 1.1 ross flag aSign, bSign, zSign;
3755 1.1 ross int32 aExp, bExp, zExp;
3756 1.1 ross bits64 aSig, bSig, zSig0, zSig1;
3757 1.1 ross floatx80 z;
3758 1.1 ross
3759 1.1 ross aSig = extractFloatx80Frac( a );
3760 1.1 ross aExp = extractFloatx80Exp( a );
3761 1.1 ross aSign = extractFloatx80Sign( a );
3762 1.1 ross bSig = extractFloatx80Frac( b );
3763 1.1 ross bExp = extractFloatx80Exp( b );
3764 1.1 ross bSign = extractFloatx80Sign( b );
3765 1.1 ross zSign = aSign ^ bSign;
3766 1.1 ross if ( aExp == 0x7FFF ) {
3767 1.1 ross if ( (bits64) ( aSig<<1 )
3768 1.1 ross || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3769 1.1 ross return propagateFloatx80NaN( a, b );
3770 1.1 ross }
3771 1.1 ross if ( ( bExp | bSig ) == 0 ) goto invalid;
3772 1.1 ross return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3773 1.1 ross }
3774 1.1 ross if ( bExp == 0x7FFF ) {
3775 1.1 ross if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3776 1.1 ross if ( ( aExp | aSig ) == 0 ) {
3777 1.1 ross invalid:
3778 1.1 ross float_raise( float_flag_invalid );
3779 1.1 ross z.low = floatx80_default_nan_low;
3780 1.1 ross z.high = floatx80_default_nan_high;
3781 1.1 ross return z;
3782 1.1 ross }
3783 1.1 ross return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3784 1.1 ross }
3785 1.1 ross if ( aExp == 0 ) {
3786 1.1 ross if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3787 1.1 ross normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3788 1.1 ross }
3789 1.1 ross if ( bExp == 0 ) {
3790 1.1 ross if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3791 1.1 ross normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3792 1.1 ross }
3793 1.1 ross zExp = aExp + bExp - 0x3FFE;
3794 1.1 ross mul64To128( aSig, bSig, &zSig0, &zSig1 );
3795 1.1 ross if ( 0 < (sbits64) zSig0 ) {
3796 1.1 ross shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3797 1.1 ross --zExp;
3798 1.1 ross }
3799 1.1 ross return
3800 1.1 ross roundAndPackFloatx80(
3801 1.1 ross floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3802 1.1 ross
3803 1.1 ross }
3804 1.1 ross
3805 1.1 ross /*
3806 1.1 ross -------------------------------------------------------------------------------
3807 1.1 ross Returns the result of dividing the extended double-precision floating-point
3808 1.1 ross value `a' by the corresponding value `b'. The operation is performed
3809 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3810 1.1 ross -------------------------------------------------------------------------------
3811 1.1 ross */
3812 1.1 ross floatx80 floatx80_div( floatx80 a, floatx80 b )
3813 1.1 ross {
3814 1.1 ross flag aSign, bSign, zSign;
3815 1.1 ross int32 aExp, bExp, zExp;
3816 1.1 ross bits64 aSig, bSig, zSig0, zSig1;
3817 1.1 ross bits64 rem0, rem1, rem2, term0, term1, term2;
3818 1.1 ross floatx80 z;
3819 1.1 ross
3820 1.1 ross aSig = extractFloatx80Frac( a );
3821 1.1 ross aExp = extractFloatx80Exp( a );
3822 1.1 ross aSign = extractFloatx80Sign( a );
3823 1.1 ross bSig = extractFloatx80Frac( b );
3824 1.1 ross bExp = extractFloatx80Exp( b );
3825 1.1 ross bSign = extractFloatx80Sign( b );
3826 1.1 ross zSign = aSign ^ bSign;
3827 1.1 ross if ( aExp == 0x7FFF ) {
3828 1.1 ross if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3829 1.1 ross if ( bExp == 0x7FFF ) {
3830 1.1 ross if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3831 1.1 ross goto invalid;
3832 1.1 ross }
3833 1.1 ross return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3834 1.1 ross }
3835 1.1 ross if ( bExp == 0x7FFF ) {
3836 1.1 ross if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3837 1.1 ross return packFloatx80( zSign, 0, 0 );
3838 1.1 ross }
3839 1.1 ross if ( bExp == 0 ) {
3840 1.1 ross if ( bSig == 0 ) {
3841 1.1 ross if ( ( aExp | aSig ) == 0 ) {
3842 1.1 ross invalid:
3843 1.1 ross float_raise( float_flag_invalid );
3844 1.1 ross z.low = floatx80_default_nan_low;
3845 1.1 ross z.high = floatx80_default_nan_high;
3846 1.1 ross return z;
3847 1.1 ross }
3848 1.1 ross float_raise( float_flag_divbyzero );
3849 1.1 ross return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3850 1.1 ross }
3851 1.1 ross normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3852 1.1 ross }
3853 1.1 ross if ( aExp == 0 ) {
3854 1.1 ross if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3855 1.1 ross normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3856 1.1 ross }
3857 1.1 ross zExp = aExp - bExp + 0x3FFE;
3858 1.1 ross rem1 = 0;
3859 1.1 ross if ( bSig <= aSig ) {
3860 1.1 ross shift128Right( aSig, 0, 1, &aSig, &rem1 );
3861 1.1 ross ++zExp;
3862 1.1 ross }
3863 1.1 ross zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3864 1.1 ross mul64To128( bSig, zSig0, &term0, &term1 );
3865 1.1 ross sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3866 1.1 ross while ( (sbits64) rem0 < 0 ) {
3867 1.1 ross --zSig0;
3868 1.1 ross add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3869 1.1 ross }
3870 1.1 ross zSig1 = estimateDiv128To64( rem1, 0, bSig );
3871 1.1 ross if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3872 1.1 ross mul64To128( bSig, zSig1, &term1, &term2 );
3873 1.1 ross sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3874 1.1 ross while ( (sbits64) rem1 < 0 ) {
3875 1.1 ross --zSig1;
3876 1.1 ross add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3877 1.1 ross }
3878 1.1 ross zSig1 |= ( ( rem1 | rem2 ) != 0 );
3879 1.1 ross }
3880 1.1 ross return
3881 1.1 ross roundAndPackFloatx80(
3882 1.1 ross floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3883 1.1 ross
3884 1.1 ross }
3885 1.1 ross
3886 1.1 ross /*
3887 1.1 ross -------------------------------------------------------------------------------
3888 1.1 ross Returns the remainder of the extended double-precision floating-point value
3889 1.1 ross `a' with respect to the corresponding value `b'. The operation is performed
3890 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3891 1.1 ross -------------------------------------------------------------------------------
3892 1.1 ross */
3893 1.1 ross floatx80 floatx80_rem( floatx80 a, floatx80 b )
3894 1.1 ross {
3895 1.1 ross flag aSign, bSign, zSign;
3896 1.1 ross int32 aExp, bExp, expDiff;
3897 1.1 ross bits64 aSig0, aSig1, bSig;
3898 1.1 ross bits64 q, term0, term1, alternateASig0, alternateASig1;
3899 1.1 ross floatx80 z;
3900 1.1 ross
3901 1.1 ross aSig0 = extractFloatx80Frac( a );
3902 1.1 ross aExp = extractFloatx80Exp( a );
3903 1.1 ross aSign = extractFloatx80Sign( a );
3904 1.1 ross bSig = extractFloatx80Frac( b );
3905 1.1 ross bExp = extractFloatx80Exp( b );
3906 1.1 ross bSign = extractFloatx80Sign( b );
3907 1.1 ross if ( aExp == 0x7FFF ) {
3908 1.1 ross if ( (bits64) ( aSig0<<1 )
3909 1.1 ross || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3910 1.1 ross return propagateFloatx80NaN( a, b );
3911 1.1 ross }
3912 1.1 ross goto invalid;
3913 1.1 ross }
3914 1.1 ross if ( bExp == 0x7FFF ) {
3915 1.1 ross if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3916 1.1 ross return a;
3917 1.1 ross }
3918 1.1 ross if ( bExp == 0 ) {
3919 1.1 ross if ( bSig == 0 ) {
3920 1.1 ross invalid:
3921 1.1 ross float_raise( float_flag_invalid );
3922 1.1 ross z.low = floatx80_default_nan_low;
3923 1.1 ross z.high = floatx80_default_nan_high;
3924 1.1 ross return z;
3925 1.1 ross }
3926 1.1 ross normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3927 1.1 ross }
3928 1.1 ross if ( aExp == 0 ) {
3929 1.1 ross if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3930 1.1 ross normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3931 1.1 ross }
3932 1.1 ross bSig |= LIT64( 0x8000000000000000 );
3933 1.1 ross zSign = aSign;
3934 1.1 ross expDiff = aExp - bExp;
3935 1.1 ross aSig1 = 0;
3936 1.1 ross if ( expDiff < 0 ) {
3937 1.1 ross if ( expDiff < -1 ) return a;
3938 1.1 ross shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3939 1.1 ross expDiff = 0;
3940 1.1 ross }
3941 1.1 ross q = ( bSig <= aSig0 );
3942 1.1 ross if ( q ) aSig0 -= bSig;
3943 1.1 ross expDiff -= 64;
3944 1.1 ross while ( 0 < expDiff ) {
3945 1.1 ross q = estimateDiv128To64( aSig0, aSig1, bSig );
3946 1.1 ross q = ( 2 < q ) ? q - 2 : 0;
3947 1.1 ross mul64To128( bSig, q, &term0, &term1 );
3948 1.1 ross sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3949 1.1 ross shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3950 1.1 ross expDiff -= 62;
3951 1.1 ross }
3952 1.1 ross expDiff += 64;
3953 1.1 ross if ( 0 < expDiff ) {
3954 1.1 ross q = estimateDiv128To64( aSig0, aSig1, bSig );
3955 1.1 ross q = ( 2 < q ) ? q - 2 : 0;
3956 1.1 ross q >>= 64 - expDiff;
3957 1.1 ross mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3958 1.1 ross sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3959 1.1 ross shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3960 1.1 ross while ( le128( term0, term1, aSig0, aSig1 ) ) {
3961 1.1 ross ++q;
3962 1.1 ross sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3963 1.1 ross }
3964 1.1 ross }
3965 1.1 ross else {
3966 1.1 ross term1 = 0;
3967 1.1 ross term0 = bSig;
3968 1.1 ross }
3969 1.1 ross sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3970 1.1 ross if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3971 1.1 ross || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3972 1.1 ross && ( q & 1 ) )
3973 1.1 ross ) {
3974 1.1 ross aSig0 = alternateASig0;
3975 1.1 ross aSig1 = alternateASig1;
3976 1.1 ross zSign = ! zSign;
3977 1.1 ross }
3978 1.1 ross return
3979 1.1 ross normalizeRoundAndPackFloatx80(
3980 1.1 ross 80, zSign, bExp + expDiff, aSig0, aSig1 );
3981 1.1 ross
3982 1.1 ross }
3983 1.1 ross
3984 1.1 ross /*
3985 1.1 ross -------------------------------------------------------------------------------
3986 1.1 ross Returns the square root of the extended double-precision floating-point
3987 1.1 ross value `a'. The operation is performed according to the IEC/IEEE Standard
3988 1.1 ross for Binary Floating-Point Arithmetic.
3989 1.1 ross -------------------------------------------------------------------------------
3990 1.1 ross */
3991 1.1 ross floatx80 floatx80_sqrt( floatx80 a )
3992 1.1 ross {
3993 1.1 ross flag aSign;
3994 1.1 ross int32 aExp, zExp;
3995 1.1 ross bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3996 1.1 ross bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3997 1.1 ross floatx80 z;
3998 1.1 ross
3999 1.1 ross aSig0 = extractFloatx80Frac( a );
4000 1.1 ross aExp = extractFloatx80Exp( a );
4001 1.1 ross aSign = extractFloatx80Sign( a );
4002 1.1 ross if ( aExp == 0x7FFF ) {
4003 1.1 ross if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
4004 1.1 ross if ( ! aSign ) return a;
4005 1.1 ross goto invalid;
4006 1.1 ross }
4007 1.1 ross if ( aSign ) {
4008 1.1 ross if ( ( aExp | aSig0 ) == 0 ) return a;
4009 1.1 ross invalid:
4010 1.1 ross float_raise( float_flag_invalid );
4011 1.1 ross z.low = floatx80_default_nan_low;
4012 1.1 ross z.high = floatx80_default_nan_high;
4013 1.1 ross return z;
4014 1.1 ross }
4015 1.1 ross if ( aExp == 0 ) {
4016 1.1 ross if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4017 1.1 ross normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4018 1.1 ross }
4019 1.1 ross zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4020 1.1 ross zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4021 1.1 ross shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4022 1.1 ross zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4023 1.1 ross doubleZSig0 = zSig0<<1;
4024 1.1 ross mul64To128( zSig0, zSig0, &term0, &term1 );
4025 1.1 ross sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4026 1.1 ross while ( (sbits64) rem0 < 0 ) {
4027 1.1 ross --zSig0;
4028 1.1 ross doubleZSig0 -= 2;
4029 1.1 ross add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4030 1.1 ross }
4031 1.1 ross zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4032 1.1 ross if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4033 1.1 ross if ( zSig1 == 0 ) zSig1 = 1;
4034 1.1 ross mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4035 1.1 ross sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4036 1.1 ross mul64To128( zSig1, zSig1, &term2, &term3 );
4037 1.1 ross sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4038 1.1 ross while ( (sbits64) rem1 < 0 ) {
4039 1.1 ross --zSig1;
4040 1.1 ross shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4041 1.1 ross term3 |= 1;
4042 1.1 ross term2 |= doubleZSig0;
4043 1.1 ross add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4044 1.1 ross }
4045 1.1 ross zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4046 1.1 ross }
4047 1.1 ross shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4048 1.1 ross zSig0 |= doubleZSig0;
4049 1.1 ross return
4050 1.1 ross roundAndPackFloatx80(
4051 1.1 ross floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4052 1.1 ross
4053 1.1 ross }
4054 1.1 ross
4055 1.1 ross /*
4056 1.1 ross -------------------------------------------------------------------------------
4057 1.1 ross Returns 1 if the extended double-precision floating-point value `a' is
4058 1.1 ross equal to the corresponding value `b', and 0 otherwise. The comparison is
4059 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-Point
4060 1.1 ross Arithmetic.
4061 1.1 ross -------------------------------------------------------------------------------
4062 1.1 ross */
4063 1.1 ross flag floatx80_eq( floatx80 a, floatx80 b )
4064 1.1 ross {
4065 1.1 ross
4066 1.1 ross if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4067 1.1 ross && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4068 1.1 ross || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4069 1.1 ross && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4070 1.1 ross ) {
4071 1.1 ross if ( floatx80_is_signaling_nan( a )
4072 1.1 ross || floatx80_is_signaling_nan( b ) ) {
4073 1.1 ross float_raise( float_flag_invalid );
4074 1.1 ross }
4075 1.1 ross return 0;
4076 1.1 ross }
4077 1.1 ross return
4078 1.1 ross ( a.low == b.low )
4079 1.1 ross && ( ( a.high == b.high )
4080 1.1 ross || ( ( a.low == 0 )
4081 1.1 ross && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4082 1.1 ross );
4083 1.1 ross
4084 1.1 ross }
4085 1.1 ross
4086 1.1 ross /*
4087 1.1 ross -------------------------------------------------------------------------------
4088 1.1 ross Returns 1 if the extended double-precision floating-point value `a' is
4089 1.1 ross less than or equal to the corresponding value `b', and 0 otherwise. The
4090 1.1 ross comparison is performed according to the IEC/IEEE Standard for Binary
4091 1.1 ross Floating-Point Arithmetic.
4092 1.1 ross -------------------------------------------------------------------------------
4093 1.1 ross */
4094 1.1 ross flag floatx80_le( floatx80 a, floatx80 b )
4095 1.1 ross {
4096 1.1 ross flag aSign, bSign;
4097 1.1 ross
4098 1.1 ross if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4099 1.1 ross && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4100 1.1 ross || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4101 1.1 ross && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4102 1.1 ross ) {
4103 1.1 ross float_raise( float_flag_invalid );
4104 1.1 ross return 0;
4105 1.1 ross }
4106 1.1 ross aSign = extractFloatx80Sign( a );
4107 1.1 ross bSign = extractFloatx80Sign( b );
4108 1.1 ross if ( aSign != bSign ) {
4109 1.1 ross return
4110 1.1 ross aSign
4111 1.1 ross || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4112 1.1 ross == 0 );
4113 1.1 ross }
4114 1.1 ross return
4115 1.1 ross aSign ? le128( b.high, b.low, a.high, a.low )
4116 1.1 ross : le128( a.high, a.low, b.high, b.low );
4117 1.1 ross
4118 1.1 ross }
4119 1.1 ross
4120 1.1 ross /*
4121 1.1 ross -------------------------------------------------------------------------------
4122 1.1 ross Returns 1 if the extended double-precision floating-point value `a' is
4123 1.1 ross less than the corresponding value `b', and 0 otherwise. The comparison
4124 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
4125 1.1 ross Arithmetic.
4126 1.1 ross -------------------------------------------------------------------------------
4127 1.1 ross */
4128 1.1 ross flag floatx80_lt( floatx80 a, floatx80 b )
4129 1.1 ross {
4130 1.1 ross flag aSign, bSign;
4131 1.1 ross
4132 1.1 ross if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4133 1.1 ross && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4134 1.1 ross || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4135 1.1 ross && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4136 1.1 ross ) {
4137 1.1 ross float_raise( float_flag_invalid );
4138 1.1 ross return 0;
4139 1.1 ross }
4140 1.1 ross aSign = extractFloatx80Sign( a );
4141 1.1 ross bSign = extractFloatx80Sign( b );
4142 1.1 ross if ( aSign != bSign ) {
4143 1.1 ross return
4144 1.1 ross aSign
4145 1.1 ross && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4146 1.1 ross != 0 );
4147 1.1 ross }
4148 1.1 ross return
4149 1.1 ross aSign ? lt128( b.high, b.low, a.high, a.low )
4150 1.1 ross : lt128( a.high, a.low, b.high, b.low );
4151 1.1 ross
4152 1.1 ross }
4153 1.1 ross
4154 1.1 ross /*
4155 1.1 ross -------------------------------------------------------------------------------
4156 1.1 ross Returns 1 if the extended double-precision floating-point value `a' is equal
4157 1.1 ross to the corresponding value `b', and 0 otherwise. The invalid exception is
4158 1.1 ross raised if either operand is a NaN. Otherwise, the comparison is performed
4159 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4160 1.1 ross -------------------------------------------------------------------------------
4161 1.1 ross */
4162 1.1 ross flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4163 1.1 ross {
4164 1.1 ross
4165 1.1 ross if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4166 1.1 ross && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4167 1.1 ross || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4168 1.1 ross && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4169 1.1 ross ) {
4170 1.1 ross float_raise( float_flag_invalid );
4171 1.1 ross return 0;
4172 1.1 ross }
4173 1.1 ross return
4174 1.1 ross ( a.low == b.low )
4175 1.1 ross && ( ( a.high == b.high )
4176 1.1 ross || ( ( a.low == 0 )
4177 1.1 ross && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4178 1.1 ross );
4179 1.1 ross
4180 1.1 ross }
4181 1.1 ross
4182 1.1 ross /*
4183 1.1 ross -------------------------------------------------------------------------------
4184 1.1 ross Returns 1 if the extended double-precision floating-point value `a' is less
4185 1.1 ross than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4186 1.1 ross do not cause an exception. Otherwise, the comparison is performed according
4187 1.1 ross to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4188 1.1 ross -------------------------------------------------------------------------------
4189 1.1 ross */
4190 1.1 ross flag floatx80_le_quiet( floatx80 a, floatx80 b )
4191 1.1 ross {
4192 1.1 ross flag aSign, bSign;
4193 1.1 ross
4194 1.1 ross if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4195 1.1 ross && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4196 1.1 ross || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4197 1.1 ross && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4198 1.1 ross ) {
4199 1.1 ross if ( floatx80_is_signaling_nan( a )
4200 1.1 ross || floatx80_is_signaling_nan( b ) ) {
4201 1.1 ross float_raise( float_flag_invalid );
4202 1.1 ross }
4203 1.1 ross return 0;
4204 1.1 ross }
4205 1.1 ross aSign = extractFloatx80Sign( a );
4206 1.1 ross bSign = extractFloatx80Sign( b );
4207 1.1 ross if ( aSign != bSign ) {
4208 1.1 ross return
4209 1.1 ross aSign
4210 1.1 ross || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4211 1.1 ross == 0 );
4212 1.1 ross }
4213 1.1 ross return
4214 1.1 ross aSign ? le128( b.high, b.low, a.high, a.low )
4215 1.1 ross : le128( a.high, a.low, b.high, b.low );
4216 1.1 ross
4217 1.1 ross }
4218 1.1 ross
4219 1.1 ross /*
4220 1.1 ross -------------------------------------------------------------------------------
4221 1.1 ross Returns 1 if the extended double-precision floating-point value `a' is less
4222 1.1 ross than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4223 1.1 ross an exception. Otherwise, the comparison is performed according to the
4224 1.1 ross IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4225 1.1 ross -------------------------------------------------------------------------------
4226 1.1 ross */
4227 1.1 ross flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4228 1.1 ross {
4229 1.1 ross flag aSign, bSign;
4230 1.1 ross
4231 1.1 ross if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4232 1.1 ross && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4233 1.1 ross || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4234 1.1 ross && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4235 1.1 ross ) {
4236 1.1 ross if ( floatx80_is_signaling_nan( a )
4237 1.1 ross || floatx80_is_signaling_nan( b ) ) {
4238 1.1 ross float_raise( float_flag_invalid );
4239 1.1 ross }
4240 1.1 ross return 0;
4241 1.1 ross }
4242 1.1 ross aSign = extractFloatx80Sign( a );
4243 1.1 ross bSign = extractFloatx80Sign( b );
4244 1.1 ross if ( aSign != bSign ) {
4245 1.1 ross return
4246 1.1 ross aSign
4247 1.1 ross && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4248 1.1 ross != 0 );
4249 1.1 ross }
4250 1.1 ross return
4251 1.1 ross aSign ? lt128( b.high, b.low, a.high, a.low )
4252 1.1 ross : lt128( a.high, a.low, b.high, b.low );
4253 1.1 ross
4254 1.1 ross }
4255 1.1 ross
4256 1.1 ross #endif
4257 1.1 ross
4258 1.1 ross #ifdef FLOAT128
4259 1.1 ross
4260 1.1 ross /*
4261 1.1 ross -------------------------------------------------------------------------------
4262 1.1 ross Returns the result of converting the quadruple-precision floating-point
4263 1.1 ross value `a' to the 32-bit two's complement integer format. The conversion
4264 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
4265 1.1 ross Arithmetic---which means in particular that the conversion is rounded
4266 1.1 ross according to the current rounding mode. If `a' is a NaN, the largest
4267 1.1 ross positive integer is returned. Otherwise, if the conversion overflows, the
4268 1.1 ross largest integer with the same sign as `a' is returned.
4269 1.1 ross -------------------------------------------------------------------------------
4270 1.1 ross */
4271 1.1 ross int32 float128_to_int32( float128 a )
4272 1.1 ross {
4273 1.1 ross flag aSign;
4274 1.1 ross int32 aExp, shiftCount;
4275 1.1 ross bits64 aSig0, aSig1;
4276 1.1 ross
4277 1.1 ross aSig1 = extractFloat128Frac1( a );
4278 1.1 ross aSig0 = extractFloat128Frac0( a );
4279 1.1 ross aExp = extractFloat128Exp( a );
4280 1.1 ross aSign = extractFloat128Sign( a );
4281 1.1 ross if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4282 1.1 ross if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4283 1.1 ross aSig0 |= ( aSig1 != 0 );
4284 1.1 ross shiftCount = 0x4028 - aExp;
4285 1.1 ross if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4286 1.1 ross return roundAndPackInt32( aSign, aSig0 );
4287 1.1 ross
4288 1.1 ross }
4289 1.1 ross
4290 1.1 ross /*
4291 1.1 ross -------------------------------------------------------------------------------
4292 1.1 ross Returns the result of converting the quadruple-precision floating-point
4293 1.1 ross value `a' to the 32-bit two's complement integer format. The conversion
4294 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
4295 1.1 ross Arithmetic, except that the conversion is always rounded toward zero. If
4296 1.1 ross `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4297 1.1 ross conversion overflows, the largest integer with the same sign as `a' is
4298 1.1 ross returned.
4299 1.1 ross -------------------------------------------------------------------------------
4300 1.1 ross */
4301 1.1 ross int32 float128_to_int32_round_to_zero( float128 a )
4302 1.1 ross {
4303 1.1 ross flag aSign;
4304 1.1 ross int32 aExp, shiftCount;
4305 1.1 ross bits64 aSig0, aSig1, savedASig;
4306 1.1 ross int32 z;
4307 1.1 ross
4308 1.1 ross aSig1 = extractFloat128Frac1( a );
4309 1.1 ross aSig0 = extractFloat128Frac0( a );
4310 1.1 ross aExp = extractFloat128Exp( a );
4311 1.1 ross aSign = extractFloat128Sign( a );
4312 1.1 ross aSig0 |= ( aSig1 != 0 );
4313 1.1 ross if ( 0x401E < aExp ) {
4314 1.1 ross if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4315 1.1 ross goto invalid;
4316 1.1 ross }
4317 1.1 ross else if ( aExp < 0x3FFF ) {
4318 1.1 ross if ( aExp || aSig0 ) float_set_inexact();
4319 1.1 ross return 0;
4320 1.1 ross }
4321 1.1 ross aSig0 |= LIT64( 0x0001000000000000 );
4322 1.1 ross shiftCount = 0x402F - aExp;
4323 1.1 ross savedASig = aSig0;
4324 1.1 ross aSig0 >>= shiftCount;
4325 1.1 ross z = aSig0;
4326 1.1 ross if ( aSign ) z = - z;
4327 1.1 ross if ( ( z < 0 ) ^ aSign ) {
4328 1.1 ross invalid:
4329 1.1 ross float_raise( float_flag_invalid );
4330 1.1 ross return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4331 1.1 ross }
4332 1.1 ross if ( ( aSig0<<shiftCount ) != savedASig ) {
4333 1.1 ross float_set_inexact();
4334 1.1 ross }
4335 1.1 ross return z;
4336 1.1 ross
4337 1.1 ross }
4338 1.1 ross
4339 1.1 ross /*
4340 1.1 ross -------------------------------------------------------------------------------
4341 1.1 ross Returns the result of converting the quadruple-precision floating-point
4342 1.1 ross value `a' to the 64-bit two's complement integer format. The conversion
4343 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
4344 1.1 ross Arithmetic---which means in particular that the conversion is rounded
4345 1.1 ross according to the current rounding mode. If `a' is a NaN, the largest
4346 1.1 ross positive integer is returned. Otherwise, if the conversion overflows, the
4347 1.1 ross largest integer with the same sign as `a' is returned.
4348 1.1 ross -------------------------------------------------------------------------------
4349 1.1 ross */
4350 1.1 ross int64 float128_to_int64( float128 a )
4351 1.1 ross {
4352 1.1 ross flag aSign;
4353 1.1 ross int32 aExp, shiftCount;
4354 1.1 ross bits64 aSig0, aSig1;
4355 1.1 ross
4356 1.1 ross aSig1 = extractFloat128Frac1( a );
4357 1.1 ross aSig0 = extractFloat128Frac0( a );
4358 1.1 ross aExp = extractFloat128Exp( a );
4359 1.1 ross aSign = extractFloat128Sign( a );
4360 1.1 ross if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4361 1.1 ross shiftCount = 0x402F - aExp;
4362 1.1 ross if ( shiftCount <= 0 ) {
4363 1.1 ross if ( 0x403E < aExp ) {
4364 1.1 ross float_raise( float_flag_invalid );
4365 1.1 ross if ( ! aSign
4366 1.1 ross || ( ( aExp == 0x7FFF )
4367 1.1 ross && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4368 1.1 ross )
4369 1.1 ross ) {
4370 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
4371 1.1 ross }
4372 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
4373 1.1 ross }
4374 1.1 ross shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4375 1.1 ross }
4376 1.1 ross else {
4377 1.1 ross shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4378 1.1 ross }
4379 1.1 ross return roundAndPackInt64( aSign, aSig0, aSig1 );
4380 1.1 ross
4381 1.1 ross }
4382 1.1 ross
4383 1.1 ross /*
4384 1.1 ross -------------------------------------------------------------------------------
4385 1.1 ross Returns the result of converting the quadruple-precision floating-point
4386 1.1 ross value `a' to the 64-bit two's complement integer format. The conversion
4387 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
4388 1.1 ross Arithmetic, except that the conversion is always rounded toward zero.
4389 1.1 ross If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4390 1.1 ross the conversion overflows, the largest integer with the same sign as `a' is
4391 1.1 ross returned.
4392 1.1 ross -------------------------------------------------------------------------------
4393 1.1 ross */
4394 1.1 ross int64 float128_to_int64_round_to_zero( float128 a )
4395 1.1 ross {
4396 1.1 ross flag aSign;
4397 1.1 ross int32 aExp, shiftCount;
4398 1.1 ross bits64 aSig0, aSig1;
4399 1.1 ross int64 z;
4400 1.1 ross
4401 1.1 ross aSig1 = extractFloat128Frac1( a );
4402 1.1 ross aSig0 = extractFloat128Frac0( a );
4403 1.1 ross aExp = extractFloat128Exp( a );
4404 1.1 ross aSign = extractFloat128Sign( a );
4405 1.1 ross if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4406 1.1 ross shiftCount = aExp - 0x402F;
4407 1.1 ross if ( 0 < shiftCount ) {
4408 1.1 ross if ( 0x403E <= aExp ) {
4409 1.1 ross aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4410 1.1 ross if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4411 1.1 ross && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4412 1.1 ross if ( aSig1 ) float_set_inexact();
4413 1.1 ross }
4414 1.1 ross else {
4415 1.1 ross float_raise( float_flag_invalid );
4416 1.1 ross if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4417 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
4418 1.1 ross }
4419 1.1 ross }
4420 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
4421 1.1 ross }
4422 1.1 ross z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4423 1.1 ross if ( (bits64) ( aSig1<<shiftCount ) ) {
4424 1.1 ross float_set_inexact();
4425 1.1 ross }
4426 1.1 ross }
4427 1.1 ross else {
4428 1.1 ross if ( aExp < 0x3FFF ) {
4429 1.1 ross if ( aExp | aSig0 | aSig1 ) {
4430 1.1 ross float_set_inexact();
4431 1.1 ross }
4432 1.1 ross return 0;
4433 1.1 ross }
4434 1.1 ross z = aSig0>>( - shiftCount );
4435 1.1 ross if ( aSig1
4436 1.1 ross || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4437 1.1 ross float_set_inexact();
4438 1.1 ross }
4439 1.1 ross }
4440 1.1 ross if ( aSign ) z = - z;
4441 1.1 ross return z;
4442 1.1 ross
4443 1.1 ross }
4444 1.1 ross
4445 1.1 ross /*
4446 1.1 ross -------------------------------------------------------------------------------
4447 1.1 ross Returns the result of converting the quadruple-precision floating-point
4448 1.1 ross value `a' to the single-precision floating-point format. The conversion
4449 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
4450 1.1 ross Arithmetic.
4451 1.1 ross -------------------------------------------------------------------------------
4452 1.1 ross */
4453 1.1 ross float32 float128_to_float32( float128 a )
4454 1.1 ross {
4455 1.1 ross flag aSign;
4456 1.1 ross int32 aExp;
4457 1.1 ross bits64 aSig0, aSig1;
4458 1.1 ross bits32 zSig;
4459 1.1 ross
4460 1.1 ross aSig1 = extractFloat128Frac1( a );
4461 1.1 ross aSig0 = extractFloat128Frac0( a );
4462 1.1 ross aExp = extractFloat128Exp( a );
4463 1.1 ross aSign = extractFloat128Sign( a );
4464 1.1 ross if ( aExp == 0x7FFF ) {
4465 1.1 ross if ( aSig0 | aSig1 ) {
4466 1.1 ross return commonNaNToFloat32( float128ToCommonNaN( a ) );
4467 1.1 ross }
4468 1.1 ross return packFloat32( aSign, 0xFF, 0 );
4469 1.1 ross }
4470 1.1 ross aSig0 |= ( aSig1 != 0 );
4471 1.1 ross shift64RightJamming( aSig0, 18, &aSig0 );
4472 1.1 ross zSig = aSig0;
4473 1.1 ross if ( aExp || zSig ) {
4474 1.1 ross zSig |= 0x40000000;
4475 1.1 ross aExp -= 0x3F81;
4476 1.1 ross }
4477 1.1 ross return roundAndPackFloat32( aSign, aExp, zSig );
4478 1.1 ross
4479 1.1 ross }
4480 1.1 ross
4481 1.1 ross /*
4482 1.1 ross -------------------------------------------------------------------------------
4483 1.1 ross Returns the result of converting the quadruple-precision floating-point
4484 1.1 ross value `a' to the double-precision floating-point format. The conversion
4485 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
4486 1.1 ross Arithmetic.
4487 1.1 ross -------------------------------------------------------------------------------
4488 1.1 ross */
4489 1.1 ross float64 float128_to_float64( float128 a )
4490 1.1 ross {
4491 1.1 ross flag aSign;
4492 1.1 ross int32 aExp;
4493 1.1 ross bits64 aSig0, aSig1;
4494 1.1 ross
4495 1.1 ross aSig1 = extractFloat128Frac1( a );
4496 1.1 ross aSig0 = extractFloat128Frac0( a );
4497 1.1 ross aExp = extractFloat128Exp( a );
4498 1.1 ross aSign = extractFloat128Sign( a );
4499 1.1 ross if ( aExp == 0x7FFF ) {
4500 1.1 ross if ( aSig0 | aSig1 ) {
4501 1.1 ross return commonNaNToFloat64( float128ToCommonNaN( a ) );
4502 1.1 ross }
4503 1.1 ross return packFloat64( aSign, 0x7FF, 0 );
4504 1.1 ross }
4505 1.1 ross shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4506 1.1 ross aSig0 |= ( aSig1 != 0 );
4507 1.1 ross if ( aExp || aSig0 ) {
4508 1.1 ross aSig0 |= LIT64( 0x4000000000000000 );
4509 1.1 ross aExp -= 0x3C01;
4510 1.1 ross }
4511 1.1 ross return roundAndPackFloat64( aSign, aExp, aSig0 );
4512 1.1 ross
4513 1.1 ross }
4514 1.1 ross
4515 1.1 ross #ifdef FLOATX80
4516 1.1 ross
4517 1.1 ross /*
4518 1.1 ross -------------------------------------------------------------------------------
4519 1.1 ross Returns the result of converting the quadruple-precision floating-point
4520 1.1 ross value `a' to the extended double-precision floating-point format. The
4521 1.1 ross conversion is performed according to the IEC/IEEE Standard for Binary
4522 1.1 ross Floating-Point Arithmetic.
4523 1.1 ross -------------------------------------------------------------------------------
4524 1.1 ross */
4525 1.1 ross floatx80 float128_to_floatx80( float128 a )
4526 1.1 ross {
4527 1.1 ross flag aSign;
4528 1.1 ross int32 aExp;
4529 1.1 ross bits64 aSig0, aSig1;
4530 1.1 ross
4531 1.1 ross aSig1 = extractFloat128Frac1( a );
4532 1.1 ross aSig0 = extractFloat128Frac0( a );
4533 1.1 ross aExp = extractFloat128Exp( a );
4534 1.1 ross aSign = extractFloat128Sign( a );
4535 1.1 ross if ( aExp == 0x7FFF ) {
4536 1.1 ross if ( aSig0 | aSig1 ) {
4537 1.1 ross return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4538 1.1 ross }
4539 1.1 ross return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4540 1.1 ross }
4541 1.1 ross if ( aExp == 0 ) {
4542 1.1 ross if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4543 1.1 ross normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4544 1.1 ross }
4545 1.1 ross else {
4546 1.1 ross aSig0 |= LIT64( 0x0001000000000000 );
4547 1.1 ross }
4548 1.1 ross shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4549 1.1 ross return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4550 1.1 ross
4551 1.1 ross }
4552 1.1 ross
4553 1.1 ross #endif
4554 1.1 ross
4555 1.1 ross /*
4556 1.1 ross -------------------------------------------------------------------------------
4557 1.1 ross Rounds the quadruple-precision floating-point value `a' to an integer, and
4558 1.1 ross returns the result as a quadruple-precision floating-point value. The
4559 1.1 ross operation is performed according to the IEC/IEEE Standard for Binary
4560 1.1 ross Floating-Point Arithmetic.
4561 1.1 ross -------------------------------------------------------------------------------
4562 1.1 ross */
4563 1.1 ross float128 float128_round_to_int( float128 a )
4564 1.1 ross {
4565 1.1 ross flag aSign;
4566 1.1 ross int32 aExp;
4567 1.1 ross bits64 lastBitMask, roundBitsMask;
4568 1.1 ross int8 roundingMode;
4569 1.1 ross float128 z;
4570 1.1 ross
4571 1.1 ross aExp = extractFloat128Exp( a );
4572 1.1 ross if ( 0x402F <= aExp ) {
4573 1.1 ross if ( 0x406F <= aExp ) {
4574 1.1 ross if ( ( aExp == 0x7FFF )
4575 1.1 ross && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4576 1.1 ross ) {
4577 1.1 ross return propagateFloat128NaN( a, a );
4578 1.1 ross }
4579 1.1 ross return a;
4580 1.1 ross }
4581 1.1 ross lastBitMask = 1;
4582 1.1 ross lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4583 1.1 ross roundBitsMask = lastBitMask - 1;
4584 1.1 ross z = a;
4585 1.1 ross roundingMode = float_rounding_mode();
4586 1.1 ross if ( roundingMode == float_round_nearest_even ) {
4587 1.1 ross if ( lastBitMask ) {
4588 1.1 ross add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4589 1.1 ross if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4590 1.1 ross }
4591 1.1 ross else {
4592 1.1 ross if ( (sbits64) z.low < 0 ) {
4593 1.1 ross ++z.high;
4594 1.1 ross if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4595 1.1 ross }
4596 1.1 ross }
4597 1.1 ross }
4598 1.1 ross else if ( roundingMode != float_round_to_zero ) {
4599 1.1 ross if ( extractFloat128Sign( z )
4600 1.1 ross ^ ( roundingMode == float_round_up ) ) {
4601 1.1 ross add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4602 1.1 ross }
4603 1.1 ross }
4604 1.1 ross z.low &= ~ roundBitsMask;
4605 1.1 ross }
4606 1.1 ross else {
4607 1.1 ross if ( aExp < 0x3FFF ) {
4608 1.1 ross if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4609 1.1 ross float_set_inexact();
4610 1.1 ross aSign = extractFloat128Sign( a );
4611 1.1 ross switch ( float_rounding_mode() ) {
4612 1.1 ross case float_round_nearest_even:
4613 1.1 ross if ( ( aExp == 0x3FFE )
4614 1.1 ross && ( extractFloat128Frac0( a )
4615 1.1 ross | extractFloat128Frac1( a ) )
4616 1.1 ross ) {
4617 1.1 ross return packFloat128( aSign, 0x3FFF, 0, 0 );
4618 1.1 ross }
4619 1.1 ross break;
4620 1.1 ross case float_round_down:
4621 1.1 ross return
4622 1.1 ross aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4623 1.1 ross : packFloat128( 0, 0, 0, 0 );
4624 1.1 ross case float_round_up:
4625 1.1 ross return
4626 1.1 ross aSign ? packFloat128( 1, 0, 0, 0 )
4627 1.1 ross : packFloat128( 0, 0x3FFF, 0, 0 );
4628 1.1 ross }
4629 1.1 ross return packFloat128( aSign, 0, 0, 0 );
4630 1.1 ross }
4631 1.1 ross lastBitMask = 1;
4632 1.1 ross lastBitMask <<= 0x402F - aExp;
4633 1.1 ross roundBitsMask = lastBitMask - 1;
4634 1.1 ross z.low = 0;
4635 1.1 ross z.high = a.high;
4636 1.1 ross roundingMode = float_rounding_mode();
4637 1.1 ross if ( roundingMode == float_round_nearest_even ) {
4638 1.1 ross z.high += lastBitMask>>1;
4639 1.1 ross if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4640 1.1 ross z.high &= ~ lastBitMask;
4641 1.1 ross }
4642 1.1 ross }
4643 1.1 ross else if ( roundingMode != float_round_to_zero ) {
4644 1.1 ross if ( extractFloat128Sign( z )
4645 1.1 ross ^ ( roundingMode == float_round_up ) ) {
4646 1.1 ross z.high |= ( a.low != 0 );
4647 1.1 ross z.high += roundBitsMask;
4648 1.1 ross }
4649 1.1 ross }
4650 1.1 ross z.high &= ~ roundBitsMask;
4651 1.1 ross }
4652 1.1 ross if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4653 1.1 ross float_set_inexact();
4654 1.1 ross }
4655 1.1 ross return z;
4656 1.1 ross
4657 1.1 ross }
4658 1.1 ross
4659 1.1 ross /*
4660 1.1 ross -------------------------------------------------------------------------------
4661 1.1 ross Returns the result of adding the absolute values of the quadruple-precision
4662 1.1 ross floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4663 1.1 ross before being returned. `zSign' is ignored if the result is a NaN.
4664 1.1 ross The addition is performed according to the IEC/IEEE Standard for Binary
4665 1.1 ross Floating-Point Arithmetic.
4666 1.1 ross -------------------------------------------------------------------------------
4667 1.1 ross */
4668 1.1 ross static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4669 1.1 ross {
4670 1.1 ross int32 aExp, bExp, zExp;
4671 1.1 ross bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4672 1.1 ross int32 expDiff;
4673 1.1 ross
4674 1.1 ross aSig1 = extractFloat128Frac1( a );
4675 1.1 ross aSig0 = extractFloat128Frac0( a );
4676 1.1 ross aExp = extractFloat128Exp( a );
4677 1.1 ross bSig1 = extractFloat128Frac1( b );
4678 1.1 ross bSig0 = extractFloat128Frac0( b );
4679 1.1 ross bExp = extractFloat128Exp( b );
4680 1.1 ross expDiff = aExp - bExp;
4681 1.1 ross if ( 0 < expDiff ) {
4682 1.1 ross if ( aExp == 0x7FFF ) {
4683 1.1 ross if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4684 1.1 ross return a;
4685 1.1 ross }
4686 1.1 ross if ( bExp == 0 ) {
4687 1.1 ross --expDiff;
4688 1.1 ross }
4689 1.1 ross else {
4690 1.1 ross bSig0 |= LIT64( 0x0001000000000000 );
4691 1.1 ross }
4692 1.1 ross shift128ExtraRightJamming(
4693 1.1 ross bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4694 1.1 ross zExp = aExp;
4695 1.1 ross }
4696 1.1 ross else if ( expDiff < 0 ) {
4697 1.1 ross if ( bExp == 0x7FFF ) {
4698 1.1 ross if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4699 1.1 ross return packFloat128( zSign, 0x7FFF, 0, 0 );
4700 1.1 ross }
4701 1.1 ross if ( aExp == 0 ) {
4702 1.1 ross ++expDiff;
4703 1.1 ross }
4704 1.1 ross else {
4705 1.1 ross aSig0 |= LIT64( 0x0001000000000000 );
4706 1.1 ross }
4707 1.1 ross shift128ExtraRightJamming(
4708 1.1 ross aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4709 1.1 ross zExp = bExp;
4710 1.1 ross }
4711 1.1 ross else {
4712 1.1 ross if ( aExp == 0x7FFF ) {
4713 1.1 ross if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4714 1.1 ross return propagateFloat128NaN( a, b );
4715 1.1 ross }
4716 1.1 ross return a;
4717 1.1 ross }
4718 1.1 ross add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4719 1.1 ross if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4720 1.1 ross zSig2 = 0;
4721 1.1 ross zSig0 |= LIT64( 0x0002000000000000 );
4722 1.1 ross zExp = aExp;
4723 1.1 ross goto shiftRight1;
4724 1.1 ross }
4725 1.1 ross aSig0 |= LIT64( 0x0001000000000000 );
4726 1.1 ross add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4727 1.1 ross --zExp;
4728 1.1 ross if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4729 1.1 ross ++zExp;
4730 1.1 ross shiftRight1:
4731 1.1 ross shift128ExtraRightJamming(
4732 1.1 ross zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4733 1.1 ross roundAndPack:
4734 1.1 ross return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4735 1.1 ross
4736 1.1 ross }
4737 1.1 ross
4738 1.1 ross /*
4739 1.1 ross -------------------------------------------------------------------------------
4740 1.1 ross Returns the result of subtracting the absolute values of the quadruple-
4741 1.1 ross precision floating-point values `a' and `b'. If `zSign' is 1, the
4742 1.1 ross difference is negated before being returned. `zSign' is ignored if the
4743 1.1 ross result is a NaN. The subtraction is performed according to the IEC/IEEE
4744 1.1 ross Standard for Binary Floating-Point Arithmetic.
4745 1.1 ross -------------------------------------------------------------------------------
4746 1.1 ross */
4747 1.1 ross static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4748 1.1 ross {
4749 1.1 ross int32 aExp, bExp, zExp;
4750 1.1 ross bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4751 1.1 ross int32 expDiff;
4752 1.1 ross float128 z;
4753 1.1 ross
4754 1.1 ross aSig1 = extractFloat128Frac1( a );
4755 1.1 ross aSig0 = extractFloat128Frac0( a );
4756 1.1 ross aExp = extractFloat128Exp( a );
4757 1.1 ross bSig1 = extractFloat128Frac1( b );
4758 1.1 ross bSig0 = extractFloat128Frac0( b );
4759 1.1 ross bExp = extractFloat128Exp( b );
4760 1.1 ross expDiff = aExp - bExp;
4761 1.1 ross shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4762 1.1 ross shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4763 1.1 ross if ( 0 < expDiff ) goto aExpBigger;
4764 1.1 ross if ( expDiff < 0 ) goto bExpBigger;
4765 1.1 ross if ( aExp == 0x7FFF ) {
4766 1.1 ross if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4767 1.1 ross return propagateFloat128NaN( a, b );
4768 1.1 ross }
4769 1.1 ross float_raise( float_flag_invalid );
4770 1.1 ross z.low = float128_default_nan_low;
4771 1.1 ross z.high = float128_default_nan_high;
4772 1.1 ross return z;
4773 1.1 ross }
4774 1.1 ross if ( aExp == 0 ) {
4775 1.1 ross aExp = 1;
4776 1.1 ross bExp = 1;
4777 1.1 ross }
4778 1.1 ross if ( bSig0 < aSig0 ) goto aBigger;
4779 1.1 ross if ( aSig0 < bSig0 ) goto bBigger;
4780 1.1 ross if ( bSig1 < aSig1 ) goto aBigger;
4781 1.1 ross if ( aSig1 < bSig1 ) goto bBigger;
4782 1.1 ross return packFloat128( float_rounding_mode() == float_round_down, 0, 0, 0 );
4783 1.1 ross bExpBigger:
4784 1.1 ross if ( bExp == 0x7FFF ) {
4785 1.1 ross if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4786 1.1 ross return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4787 1.1 ross }
4788 1.1 ross if ( aExp == 0 ) {
4789 1.1 ross ++expDiff;
4790 1.1 ross }
4791 1.1 ross else {
4792 1.1 ross aSig0 |= LIT64( 0x4000000000000000 );
4793 1.1 ross }
4794 1.1 ross shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4795 1.1 ross bSig0 |= LIT64( 0x4000000000000000 );
4796 1.1 ross bBigger:
4797 1.1 ross sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4798 1.1 ross zExp = bExp;
4799 1.1 ross zSign ^= 1;
4800 1.1 ross goto normalizeRoundAndPack;
4801 1.1 ross aExpBigger:
4802 1.1 ross if ( aExp == 0x7FFF ) {
4803 1.1 ross if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4804 1.1 ross return a;
4805 1.1 ross }
4806 1.1 ross if ( bExp == 0 ) {
4807 1.1 ross --expDiff;
4808 1.1 ross }
4809 1.1 ross else {
4810 1.1 ross bSig0 |= LIT64( 0x4000000000000000 );
4811 1.1 ross }
4812 1.1 ross shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4813 1.1 ross aSig0 |= LIT64( 0x4000000000000000 );
4814 1.1 ross aBigger:
4815 1.1 ross sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4816 1.1 ross zExp = aExp;
4817 1.1 ross normalizeRoundAndPack:
4818 1.1 ross --zExp;
4819 1.1 ross return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4820 1.1 ross
4821 1.1 ross }
4822 1.1 ross
4823 1.1 ross /*
4824 1.1 ross -------------------------------------------------------------------------------
4825 1.1 ross Returns the result of adding the quadruple-precision floating-point values
4826 1.1 ross `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4827 1.1 ross for Binary Floating-Point Arithmetic.
4828 1.1 ross -------------------------------------------------------------------------------
4829 1.1 ross */
4830 1.1 ross float128 float128_add( float128 a, float128 b )
4831 1.1 ross {
4832 1.1 ross flag aSign, bSign;
4833 1.1 ross
4834 1.1 ross aSign = extractFloat128Sign( a );
4835 1.1 ross bSign = extractFloat128Sign( b );
4836 1.1 ross if ( aSign == bSign ) {
4837 1.1 ross return addFloat128Sigs( a, b, aSign );
4838 1.1 ross }
4839 1.1 ross else {
4840 1.1 ross return subFloat128Sigs( a, b, aSign );
4841 1.1 ross }
4842 1.1 ross
4843 1.1 ross }
4844 1.1 ross
4845 1.1 ross /*
4846 1.1 ross -------------------------------------------------------------------------------
4847 1.1 ross Returns the result of subtracting the quadruple-precision floating-point
4848 1.1 ross values `a' and `b'. The operation is performed according to the IEC/IEEE
4849 1.1 ross Standard for Binary Floating-Point Arithmetic.
4850 1.1 ross -------------------------------------------------------------------------------
4851 1.1 ross */
4852 1.1 ross float128 float128_sub( float128 a, float128 b )
4853 1.1 ross {
4854 1.1 ross flag aSign, bSign;
4855 1.1 ross
4856 1.1 ross aSign = extractFloat128Sign( a );
4857 1.1 ross bSign = extractFloat128Sign( b );
4858 1.1 ross if ( aSign == bSign ) {
4859 1.1 ross return subFloat128Sigs( a, b, aSign );
4860 1.1 ross }
4861 1.1 ross else {
4862 1.1 ross return addFloat128Sigs( a, b, aSign );
4863 1.1 ross }
4864 1.1 ross
4865 1.1 ross }
4866 1.1 ross
4867 1.1 ross /*
4868 1.1 ross -------------------------------------------------------------------------------
4869 1.1 ross Returns the result of multiplying the quadruple-precision floating-point
4870 1.1 ross values `a' and `b'. The operation is performed according to the IEC/IEEE
4871 1.1 ross Standard for Binary Floating-Point Arithmetic.
4872 1.1 ross -------------------------------------------------------------------------------
4873 1.1 ross */
4874 1.1 ross float128 float128_mul( float128 a, float128 b )
4875 1.1 ross {
4876 1.1 ross flag aSign, bSign, zSign;
4877 1.1 ross int32 aExp, bExp, zExp;
4878 1.1 ross bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4879 1.1 ross float128 z;
4880 1.1 ross
4881 1.1 ross aSig1 = extractFloat128Frac1( a );
4882 1.1 ross aSig0 = extractFloat128Frac0( a );
4883 1.1 ross aExp = extractFloat128Exp( a );
4884 1.1 ross aSign = extractFloat128Sign( a );
4885 1.1 ross bSig1 = extractFloat128Frac1( b );
4886 1.1 ross bSig0 = extractFloat128Frac0( b );
4887 1.1 ross bExp = extractFloat128Exp( b );
4888 1.1 ross bSign = extractFloat128Sign( b );
4889 1.1 ross zSign = aSign ^ bSign;
4890 1.1 ross if ( aExp == 0x7FFF ) {
4891 1.1 ross if ( ( aSig0 | aSig1 )
4892 1.1 ross || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4893 1.1 ross return propagateFloat128NaN( a, b );
4894 1.1 ross }
4895 1.1 ross if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4896 1.1 ross return packFloat128( zSign, 0x7FFF, 0, 0 );
4897 1.1 ross }
4898 1.1 ross if ( bExp == 0x7FFF ) {
4899 1.1 ross if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4900 1.1 ross if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4901 1.1 ross invalid:
4902 1.1 ross float_raise( float_flag_invalid );
4903 1.1 ross z.low = float128_default_nan_low;
4904 1.1 ross z.high = float128_default_nan_high;
4905 1.1 ross return z;
4906 1.1 ross }
4907 1.1 ross return packFloat128( zSign, 0x7FFF, 0, 0 );
4908 1.1 ross }
4909 1.1 ross if ( aExp == 0 ) {
4910 1.1 ross if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4911 1.1 ross normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4912 1.1 ross }
4913 1.1 ross if ( bExp == 0 ) {
4914 1.1 ross if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4915 1.1 ross normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4916 1.1 ross }
4917 1.1 ross zExp = aExp + bExp - 0x4000;
4918 1.1 ross aSig0 |= LIT64( 0x0001000000000000 );
4919 1.1 ross shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4920 1.1 ross mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4921 1.1 ross add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4922 1.1 ross zSig2 |= ( zSig3 != 0 );
4923 1.1 ross if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4924 1.1 ross shift128ExtraRightJamming(
4925 1.1 ross zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4926 1.1 ross ++zExp;
4927 1.1 ross }
4928 1.1 ross return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4929 1.1 ross
4930 1.1 ross }
4931 1.1 ross
4932 1.1 ross /*
4933 1.1 ross -------------------------------------------------------------------------------
4934 1.1 ross Returns the result of dividing the quadruple-precision floating-point value
4935 1.1 ross `a' by the corresponding value `b'. The operation is performed according to
4936 1.1 ross the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4937 1.1 ross -------------------------------------------------------------------------------
4938 1.1 ross */
4939 1.1 ross float128 float128_div( float128 a, float128 b )
4940 1.1 ross {
4941 1.1 ross flag aSign, bSign, zSign;
4942 1.1 ross int32 aExp, bExp, zExp;
4943 1.1 ross bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4944 1.1 ross bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4945 1.1 ross float128 z;
4946 1.1 ross
4947 1.1 ross aSig1 = extractFloat128Frac1( a );
4948 1.1 ross aSig0 = extractFloat128Frac0( a );
4949 1.1 ross aExp = extractFloat128Exp( a );
4950 1.1 ross aSign = extractFloat128Sign( a );
4951 1.1 ross bSig1 = extractFloat128Frac1( b );
4952 1.1 ross bSig0 = extractFloat128Frac0( b );
4953 1.1 ross bExp = extractFloat128Exp( b );
4954 1.1 ross bSign = extractFloat128Sign( b );
4955 1.1 ross zSign = aSign ^ bSign;
4956 1.1 ross if ( aExp == 0x7FFF ) {
4957 1.1 ross if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4958 1.1 ross if ( bExp == 0x7FFF ) {
4959 1.1 ross if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4960 1.1 ross goto invalid;
4961 1.1 ross }
4962 1.1 ross return packFloat128( zSign, 0x7FFF, 0, 0 );
4963 1.1 ross }
4964 1.1 ross if ( bExp == 0x7FFF ) {
4965 1.1 ross if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4966 1.1 ross return packFloat128( zSign, 0, 0, 0 );
4967 1.1 ross }
4968 1.1 ross if ( bExp == 0 ) {
4969 1.1 ross if ( ( bSig0 | bSig1 ) == 0 ) {
4970 1.1 ross if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4971 1.1 ross invalid:
4972 1.1 ross float_raise( float_flag_invalid );
4973 1.1 ross z.low = float128_default_nan_low;
4974 1.1 ross z.high = float128_default_nan_high;
4975 1.1 ross return z;
4976 1.1 ross }
4977 1.1 ross float_raise( float_flag_divbyzero );
4978 1.1 ross return packFloat128( zSign, 0x7FFF, 0, 0 );
4979 1.1 ross }
4980 1.1 ross normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4981 1.1 ross }
4982 1.1 ross if ( aExp == 0 ) {
4983 1.1 ross if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4984 1.1 ross normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4985 1.1 ross }
4986 1.1 ross zExp = aExp - bExp + 0x3FFD;
4987 1.1 ross shortShift128Left(
4988 1.1 ross aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4989 1.1 ross shortShift128Left(
4990 1.1 ross bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4991 1.1 ross if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4992 1.1 ross shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4993 1.1 ross ++zExp;
4994 1.1 ross }
4995 1.1 ross zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4996 1.1 ross mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4997 1.1 ross sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4998 1.1 ross while ( (sbits64) rem0 < 0 ) {
4999 1.1 ross --zSig0;
5000 1.1 ross add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5001 1.1 ross }
5002 1.1 ross zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5003 1.1 ross if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5004 1.1 ross mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5005 1.1 ross sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5006 1.1 ross while ( (sbits64) rem1 < 0 ) {
5007 1.1 ross --zSig1;
5008 1.1 ross add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5009 1.1 ross }
5010 1.1 ross zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5011 1.1 ross }
5012 1.1 ross shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5013 1.1 ross return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5014 1.1 ross
5015 1.1 ross }
5016 1.1 ross
5017 1.1 ross /*
5018 1.1 ross -------------------------------------------------------------------------------
5019 1.1 ross Returns the remainder of the quadruple-precision floating-point value `a'
5020 1.1 ross with respect to the corresponding value `b'. The operation is performed
5021 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5022 1.1 ross -------------------------------------------------------------------------------
5023 1.1 ross */
5024 1.1 ross float128 float128_rem( float128 a, float128 b )
5025 1.1 ross {
5026 1.1 ross flag aSign, bSign, zSign;
5027 1.1 ross int32 aExp, bExp, expDiff;
5028 1.1 ross bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5029 1.1 ross bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5030 1.1 ross sbits64 sigMean0;
5031 1.1 ross float128 z;
5032 1.1 ross
5033 1.1 ross aSig1 = extractFloat128Frac1( a );
5034 1.1 ross aSig0 = extractFloat128Frac0( a );
5035 1.1 ross aExp = extractFloat128Exp( a );
5036 1.1 ross aSign = extractFloat128Sign( a );
5037 1.1 ross bSig1 = extractFloat128Frac1( b );
5038 1.1 ross bSig0 = extractFloat128Frac0( b );
5039 1.1 ross bExp = extractFloat128Exp( b );
5040 1.1 ross bSign = extractFloat128Sign( b );
5041 1.1 ross if ( aExp == 0x7FFF ) {
5042 1.1 ross if ( ( aSig0 | aSig1 )
5043 1.1 ross || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5044 1.1 ross return propagateFloat128NaN( a, b );
5045 1.1 ross }
5046 1.1 ross goto invalid;
5047 1.1 ross }
5048 1.1 ross if ( bExp == 0x7FFF ) {
5049 1.1 ross if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5050 1.1 ross return a;
5051 1.1 ross }
5052 1.1 ross if ( bExp == 0 ) {
5053 1.1 ross if ( ( bSig0 | bSig1 ) == 0 ) {
5054 1.1 ross invalid:
5055 1.1 ross float_raise( float_flag_invalid );
5056 1.1 ross z.low = float128_default_nan_low;
5057 1.1 ross z.high = float128_default_nan_high;
5058 1.1 ross return z;
5059 1.1 ross }
5060 1.1 ross normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5061 1.1 ross }
5062 1.1 ross if ( aExp == 0 ) {
5063 1.1 ross if ( ( aSig0 | aSig1 ) == 0 ) return a;
5064 1.1 ross normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5065 1.1 ross }
5066 1.1 ross expDiff = aExp - bExp;
5067 1.1 ross if ( expDiff < -1 ) return a;
5068 1.1 ross shortShift128Left(
5069 1.1 ross aSig0 | LIT64( 0x0001000000000000 ),
5070 1.1 ross aSig1,
5071 1.1 ross 15 - ( expDiff < 0 ),
5072 1.1 ross &aSig0,
5073 1.1 ross &aSig1
5074 1.1 ross );
5075 1.1 ross shortShift128Left(
5076 1.1 ross bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5077 1.1 ross q = le128( bSig0, bSig1, aSig0, aSig1 );
5078 1.1 ross if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5079 1.1 ross expDiff -= 64;
5080 1.1 ross while ( 0 < expDiff ) {
5081 1.1 ross q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5082 1.1 ross q = ( 4 < q ) ? q - 4 : 0;
5083 1.1 ross mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5084 1.1 ross shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5085 1.1 ross shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5086 1.1 ross sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5087 1.1 ross expDiff -= 61;
5088 1.1 ross }
5089 1.1 ross if ( -64 < expDiff ) {
5090 1.1 ross q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5091 1.1 ross q = ( 4 < q ) ? q - 4 : 0;
5092 1.1 ross q >>= - expDiff;
5093 1.1 ross shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5094 1.1 ross expDiff += 52;
5095 1.1 ross if ( expDiff < 0 ) {
5096 1.1 ross shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5097 1.1 ross }
5098 1.1 ross else {
5099 1.1 ross shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5100 1.1 ross }
5101 1.1 ross mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5102 1.1 ross sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5103 1.1 ross }
5104 1.1 ross else {
5105 1.1 ross shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5106 1.1 ross shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5107 1.1 ross }
5108 1.1 ross do {
5109 1.1 ross alternateASig0 = aSig0;
5110 1.1 ross alternateASig1 = aSig1;
5111 1.1 ross ++q;
5112 1.1 ross sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5113 1.1 ross } while ( 0 <= (sbits64) aSig0 );
5114 1.1 ross add128(
5115 1.1 ross aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
5116 1.1 ross if ( ( sigMean0 < 0 )
5117 1.1 ross || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5118 1.1 ross aSig0 = alternateASig0;
5119 1.1 ross aSig1 = alternateASig1;
5120 1.1 ross }
5121 1.1 ross zSign = ( (sbits64) aSig0 < 0 );
5122 1.1 ross if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5123 1.1 ross return
5124 1.1 ross normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5125 1.1 ross
5126 1.1 ross }
5127 1.1 ross
5128 1.1 ross /*
5129 1.1 ross -------------------------------------------------------------------------------
5130 1.1 ross Returns the square root of the quadruple-precision floating-point value `a'.
5131 1.1 ross The operation is performed according to the IEC/IEEE Standard for Binary
5132 1.1 ross Floating-Point Arithmetic.
5133 1.1 ross -------------------------------------------------------------------------------
5134 1.1 ross */
5135 1.1 ross float128 float128_sqrt( float128 a )
5136 1.1 ross {
5137 1.1 ross flag aSign;
5138 1.1 ross int32 aExp, zExp;
5139 1.1 ross bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5140 1.1 ross bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5141 1.1 ross float128 z;
5142 1.1 ross
5143 1.1 ross aSig1 = extractFloat128Frac1( a );
5144 1.1 ross aSig0 = extractFloat128Frac0( a );
5145 1.1 ross aExp = extractFloat128Exp( a );
5146 1.1 ross aSign = extractFloat128Sign( a );
5147 1.1 ross if ( aExp == 0x7FFF ) {
5148 1.1 ross if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5149 1.1 ross if ( ! aSign ) return a;
5150 1.1 ross goto invalid;
5151 1.1 ross }
5152 1.1 ross if ( aSign ) {
5153 1.1 ross if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5154 1.1 ross invalid:
5155 1.1 ross float_raise( float_flag_invalid );
5156 1.1 ross z.low = float128_default_nan_low;
5157 1.1 ross z.high = float128_default_nan_high;
5158 1.1 ross return z;
5159 1.1 ross }
5160 1.1 ross if ( aExp == 0 ) {
5161 1.1 ross if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5162 1.1 ross normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5163 1.1 ross }
5164 1.1 ross zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5165 1.1 ross aSig0 |= LIT64( 0x0001000000000000 );
5166 1.1 ross zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5167 1.1 ross shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5168 1.1 ross zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5169 1.1 ross doubleZSig0 = zSig0<<1;
5170 1.1 ross mul64To128( zSig0, zSig0, &term0, &term1 );
5171 1.1 ross sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5172 1.1 ross while ( (sbits64) rem0 < 0 ) {
5173 1.1 ross --zSig0;
5174 1.1 ross doubleZSig0 -= 2;
5175 1.1 ross add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5176 1.1 ross }
5177 1.1 ross zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5178 1.1 ross if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5179 1.1 ross if ( zSig1 == 0 ) zSig1 = 1;
5180 1.1 ross mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5181 1.1 ross sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5182 1.1 ross mul64To128( zSig1, zSig1, &term2, &term3 );
5183 1.1 ross sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5184 1.1 ross while ( (sbits64) rem1 < 0 ) {
5185 1.1 ross --zSig1;
5186 1.1 ross shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5187 1.1 ross term3 |= 1;
5188 1.1 ross term2 |= doubleZSig0;
5189 1.1 ross add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5190 1.1 ross }
5191 1.1 ross zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5192 1.1 ross }
5193 1.1 ross shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5194 1.1 ross return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5195 1.1 ross
5196 1.1 ross }
5197 1.1 ross
5198 1.1 ross /*
5199 1.1 ross -------------------------------------------------------------------------------
5200 1.1 ross Returns 1 if the quadruple-precision floating-point value `a' is equal to
5201 1.1 ross the corresponding value `b', and 0 otherwise. The comparison is performed
5202 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5203 1.1 ross -------------------------------------------------------------------------------
5204 1.1 ross */
5205 1.1 ross flag float128_eq( float128 a, float128 b )
5206 1.1 ross {
5207 1.1 ross
5208 1.1 ross if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5209 1.1 ross && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5210 1.1 ross || ( ( extractFloat128Exp( b ) == 0x7FFF )
5211 1.1 ross && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5212 1.1 ross ) {
5213 1.1 ross if ( float128_is_signaling_nan( a )
5214 1.1 ross || float128_is_signaling_nan( b ) ) {
5215 1.1 ross float_raise( float_flag_invalid );
5216 1.1 ross }
5217 1.1 ross return 0;
5218 1.1 ross }
5219 1.1 ross return
5220 1.1 ross ( a.low == b.low )
5221 1.1 ross && ( ( a.high == b.high )
5222 1.1 ross || ( ( a.low == 0 )
5223 1.1 ross && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5224 1.1 ross );
5225 1.1 ross
5226 1.1 ross }
5227 1.1 ross
5228 1.1 ross /*
5229 1.1 ross -------------------------------------------------------------------------------
5230 1.1 ross Returns 1 if the quadruple-precision floating-point value `a' is less than
5231 1.1 ross or equal to the corresponding value `b', and 0 otherwise. The comparison
5232 1.1 ross is performed according to the IEC/IEEE Standard for Binary Floating-Point
5233 1.1 ross Arithmetic.
5234 1.1 ross -------------------------------------------------------------------------------
5235 1.1 ross */
5236 1.1 ross flag float128_le( float128 a, float128 b )
5237 1.1 ross {
5238 1.1 ross flag aSign, bSign;
5239 1.1 ross
5240 1.1 ross if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5241 1.1 ross && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5242 1.1 ross || ( ( extractFloat128Exp( b ) == 0x7FFF )
5243 1.1 ross && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5244 1.1 ross ) {
5245 1.1 ross float_raise( float_flag_invalid );
5246 1.1 ross return 0;
5247 1.1 ross }
5248 1.1 ross aSign = extractFloat128Sign( a );
5249 1.1 ross bSign = extractFloat128Sign( b );
5250 1.1 ross if ( aSign != bSign ) {
5251 1.1 ross return
5252 1.1 ross aSign
5253 1.1 ross || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5254 1.1 ross == 0 );
5255 1.1 ross }
5256 1.1 ross return
5257 1.1 ross aSign ? le128( b.high, b.low, a.high, a.low )
5258 1.1 ross : le128( a.high, a.low, b.high, b.low );
5259 1.1 ross
5260 1.1 ross }
5261 1.1 ross
5262 1.1 ross /*
5263 1.1 ross -------------------------------------------------------------------------------
5264 1.1 ross Returns 1 if the quadruple-precision floating-point value `a' is less than
5265 1.1 ross the corresponding value `b', and 0 otherwise. The comparison is performed
5266 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5267 1.1 ross -------------------------------------------------------------------------------
5268 1.1 ross */
5269 1.1 ross flag float128_lt( float128 a, float128 b )
5270 1.1 ross {
5271 1.1 ross flag aSign, bSign;
5272 1.1 ross
5273 1.1 ross if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5274 1.1 ross && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5275 1.1 ross || ( ( extractFloat128Exp( b ) == 0x7FFF )
5276 1.1 ross && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5277 1.1 ross ) {
5278 1.1 ross float_raise( float_flag_invalid );
5279 1.1 ross return 0;
5280 1.1 ross }
5281 1.1 ross aSign = extractFloat128Sign( a );
5282 1.1 ross bSign = extractFloat128Sign( b );
5283 1.1 ross if ( aSign != bSign ) {
5284 1.1 ross return
5285 1.1 ross aSign
5286 1.1 ross && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5287 1.1 ross != 0 );
5288 1.1 ross }
5289 1.1 ross return
5290 1.1 ross aSign ? lt128( b.high, b.low, a.high, a.low )
5291 1.1 ross : lt128( a.high, a.low, b.high, b.low );
5292 1.1 ross
5293 1.1 ross }
5294 1.1 ross
5295 1.1 ross /*
5296 1.1 ross -------------------------------------------------------------------------------
5297 1.1 ross Returns 1 if the quadruple-precision floating-point value `a' is equal to
5298 1.1 ross the corresponding value `b', and 0 otherwise. The invalid exception is
5299 1.1 ross raised if either operand is a NaN. Otherwise, the comparison is performed
5300 1.1 ross according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5301 1.1 ross -------------------------------------------------------------------------------
5302 1.1 ross */
5303 1.1 ross flag float128_eq_signaling( float128 a, float128 b )
5304 1.1 ross {
5305 1.1 ross
5306 1.1 ross if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5307 1.1 ross && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5308 1.1 ross || ( ( extractFloat128Exp( b ) == 0x7FFF )
5309 1.1 ross && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5310 1.1 ross ) {
5311 1.1 ross float_raise( float_flag_invalid );
5312 1.1 ross return 0;
5313 1.1 ross }
5314 1.1 ross return
5315 1.1 ross ( a.low == b.low )
5316 1.1 ross && ( ( a.high == b.high )
5317 1.1 ross || ( ( a.low == 0 )
5318 1.1 ross && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5319 1.1 ross );
5320 1.1 ross
5321 1.1 ross }
5322 1.1 ross
5323 1.1 ross /*
5324 1.1 ross -------------------------------------------------------------------------------
5325 1.1 ross Returns 1 if the quadruple-precision floating-point value `a' is less than
5326 1.1 ross or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5327 1.1 ross cause an exception. Otherwise, the comparison is performed according to the
5328 1.1 ross IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5329 1.1 ross -------------------------------------------------------------------------------
5330 1.1 ross */
5331 1.1 ross flag float128_le_quiet( float128 a, float128 b )
5332 1.1 ross {
5333 1.1 ross flag aSign, bSign;
5334 1.1 ross
5335 1.1 ross if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5336 1.1 ross && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5337 1.1 ross || ( ( extractFloat128Exp( b ) == 0x7FFF )
5338 1.1 ross && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5339 1.1 ross ) {
5340 1.1 ross if ( float128_is_signaling_nan( a )
5341 1.1 ross || float128_is_signaling_nan( b ) ) {
5342 1.1 ross float_raise( float_flag_invalid );
5343 1.1 ross }
5344 1.1 ross return 0;
5345 1.1 ross }
5346 1.1 ross aSign = extractFloat128Sign( a );
5347 1.1 ross bSign = extractFloat128Sign( b );
5348 1.1 ross if ( aSign != bSign ) {
5349 1.1 ross return
5350 1.1 ross aSign
5351 1.1 ross || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5352 1.1 ross == 0 );
5353 1.1 ross }
5354 1.1 ross return
5355 1.1 ross aSign ? le128( b.high, b.low, a.high, a.low )
5356 1.1 ross : le128( a.high, a.low, b.high, b.low );
5357 1.1 ross
5358 1.1 ross }
5359 1.1 ross
5360 1.1 ross /*
5361 1.1 ross -------------------------------------------------------------------------------
5362 1.1 ross Returns 1 if the quadruple-precision floating-point value `a' is less than
5363 1.1 ross the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5364 1.1 ross exception. Otherwise, the comparison is performed according to the IEC/IEEE
5365 1.1 ross Standard for Binary Floating-Point Arithmetic.
5366 1.1 ross -------------------------------------------------------------------------------
5367 1.1 ross */
5368 1.1 ross flag float128_lt_quiet( float128 a, float128 b )
5369 1.1 ross {
5370 1.1 ross flag aSign, bSign;
5371 1.1 ross
5372 1.1 ross if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5373 1.1 ross && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5374 1.1 ross || ( ( extractFloat128Exp( b ) == 0x7FFF )
5375 1.1 ross && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5376 1.1 ross ) {
5377 1.1 ross if ( float128_is_signaling_nan( a )
5378 1.1 ross || float128_is_signaling_nan( b ) ) {
5379 1.1 ross float_raise( float_flag_invalid );
5380 1.1 ross }
5381 1.1 ross return 0;
5382 1.1 ross }
5383 1.1 ross aSign = extractFloat128Sign( a );
5384 1.1 ross bSign = extractFloat128Sign( b );
5385 1.1 ross if ( aSign != bSign ) {
5386 1.1 ross return
5387 1.1 ross aSign
5388 1.1 ross && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5389 1.1 ross != 0 );
5390 1.1 ross }
5391 1.1 ross return
5392 1.1 ross aSign ? lt128( b.high, b.low, a.high, a.low )
5393 1.1 ross : lt128( a.high, a.low, b.high, b.low );
5394 1.1 ross
5395 1.1 ross }
5396 1.1 ross
5397 1.1 ross #endif
5398 1.1 ross
5399 1.1 ross
5400 1.1 ross #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5401 1.1 ross
5402 1.1 ross /*
5403 1.1 ross * These two routines are not part of the original softfloat distribution.
5404 1.1 ross *
5405 1.1 ross * They are based on the corresponding conversions to integer but return
5406 1.1 ross * unsigned numbers instead since these functions are required by GCC.
5407 1.1 ross *
5408 1.3 keihan * Added by Mark Brinicombe <mark (at) NetBSD.org> 27/09/97
5409 1.1 ross *
5410 1.1 ross * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5411 1.1 ross */
5412 1.1 ross
5413 1.1 ross /*
5414 1.1 ross -------------------------------------------------------------------------------
5415 1.1 ross Returns the result of converting the double-precision floating-point value
5416 1.1 ross `a' to the 32-bit unsigned integer format. The conversion is
5417 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-point
5418 1.1 ross Arithmetic, except that the conversion is always rounded toward zero. If
5419 1.1 ross `a' is a NaN, the largest positive integer is returned. If the conversion
5420 1.1 ross overflows, the largest integer positive is returned.
5421 1.1 ross -------------------------------------------------------------------------------
5422 1.1 ross */
5423 1.1 ross uint32 float64_to_uint32_round_to_zero( float64 a )
5424 1.1 ross {
5425 1.1 ross flag aSign;
5426 1.1 ross int16 aExp, shiftCount;
5427 1.1 ross bits64 aSig, savedASig;
5428 1.1 ross uint32 z;
5429 1.1 ross
5430 1.1 ross aSig = extractFloat64Frac( a );
5431 1.1 ross aExp = extractFloat64Exp( a );
5432 1.1 ross aSign = extractFloat64Sign( a );
5433 1.1 ross
5434 1.1 ross if (aSign) {
5435 1.1 ross float_raise( float_flag_invalid );
5436 1.1 ross return(0);
5437 1.1 ross }
5438 1.1 ross
5439 1.1 ross if ( 0x41E < aExp ) {
5440 1.1 ross float_raise( float_flag_invalid );
5441 1.1 ross return 0xffffffff;
5442 1.1 ross }
5443 1.1 ross else if ( aExp < 0x3FF ) {
5444 1.1 ross if ( aExp || aSig ) float_set_inexact();
5445 1.1 ross return 0;
5446 1.1 ross }
5447 1.1 ross aSig |= LIT64( 0x0010000000000000 );
5448 1.1 ross shiftCount = 0x433 - aExp;
5449 1.1 ross savedASig = aSig;
5450 1.1 ross aSig >>= shiftCount;
5451 1.1 ross z = aSig;
5452 1.1 ross if ( ( aSig<<shiftCount ) != savedASig ) {
5453 1.1 ross float_set_inexact();
5454 1.1 ross }
5455 1.1 ross return z;
5456 1.1 ross
5457 1.1 ross }
5458 1.1 ross
5459 1.1 ross /*
5460 1.1 ross -------------------------------------------------------------------------------
5461 1.1 ross Returns the result of converting the single-precision floating-point value
5462 1.1 ross `a' to the 32-bit unsigned integer format. The conversion is
5463 1.1 ross performed according to the IEC/IEEE Standard for Binary Floating-point
5464 1.1 ross Arithmetic, except that the conversion is always rounded toward zero. If
5465 1.1 ross `a' is a NaN, the largest positive integer is returned. If the conversion
5466 1.1 ross overflows, the largest positive integer is returned.
5467 1.1 ross -------------------------------------------------------------------------------
5468 1.1 ross */
5469 1.1 ross uint32 float32_to_uint32_round_to_zero( float32 a )
5470 1.1 ross {
5471 1.1 ross flag aSign;
5472 1.1 ross int16 aExp, shiftCount;
5473 1.1 ross bits32 aSig;
5474 1.1 ross uint32 z;
5475 1.1 ross
5476 1.1 ross aSig = extractFloat32Frac( a );
5477 1.1 ross aExp = extractFloat32Exp( a );
5478 1.1 ross aSign = extractFloat32Sign( a );
5479 1.1 ross shiftCount = aExp - 0x9E;
5480 1.1 ross
5481 1.1 ross if (aSign) {
5482 1.1 ross float_raise( float_flag_invalid );
5483 1.1 ross return(0);
5484 1.1 ross }
5485 1.1 ross if ( 0 < shiftCount ) {
5486 1.1 ross float_raise( float_flag_invalid );
5487 1.1 ross return 0xFFFFFFFF;
5488 1.1 ross }
5489 1.1 ross else if ( aExp <= 0x7E ) {
5490 1.1 ross if ( aExp | aSig ) float_set_inexact();
5491 1.1 ross return 0;
5492 1.1 ross }
5493 1.1 ross aSig = ( aSig | 0x800000 )<<8;
5494 1.1 ross z = aSig>>( - shiftCount );
5495 1.1 ross if ( aSig<<( shiftCount & 31 ) ) {
5496 1.1 ross float_set_inexact();
5497 1.1 ross }
5498 1.1 ross return z;
5499 1.1 ross
5500 1.1 ross }
5501 1.1 ross
5502 1.1 ross #endif
5503 1.2 thorpej
5504 1.2 thorpej #endif /* _STANDALONE */
5505