softfloat.c revision 1.7 1 1.7 thorpej /* $NetBSD: softfloat.c,v 1.7 2020/09/02 03:45:54 thorpej Exp $ */
2 1.1 ross
3 1.1 ross /*
4 1.1 ross * This version hacked for use with gcc -msoft-float by bjh21.
5 1.1 ross * (Mostly a case of #ifdefing out things GCC doesn't need or provides
6 1.1 ross * itself).
7 1.1 ross */
8 1.1 ross
9 1.1 ross /*
10 1.1 ross * Things you may want to define:
11 1.1 ross *
12 1.1 ross * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13 1.1 ross * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them
14 1.1 ross * properly renamed.
15 1.1 ross */
16 1.1 ross
17 1.7 thorpej /*============================================================================
18 1.1 ross
19 1.7 thorpej This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
20 1.7 thorpej Package, Release 2b.
21 1.1 ross
22 1.1 ross Written by John R. Hauser. This work was made possible in part by the
23 1.1 ross International Computer Science Institute, located at Suite 600, 1947 Center
24 1.1 ross Street, Berkeley, California 94704. Funding was partially provided by the
25 1.1 ross National Science Foundation under grant MIP-9311980. The original version
26 1.1 ross of this code was written as part of a project to build a fixed-point vector
27 1.1 ross processor in collaboration with the University of California at Berkeley,
28 1.1 ross overseen by Profs. Nelson Morgan and John Wawrzynek. More information
29 1.7 thorpej is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
30 1.1 ross arithmetic/SoftFloat.html'.
31 1.1 ross
32 1.7 thorpej THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
33 1.7 thorpej been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
34 1.7 thorpej RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
35 1.7 thorpej AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
36 1.7 thorpej COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
37 1.7 thorpej EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
38 1.7 thorpej INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
39 1.7 thorpej OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
40 1.1 ross
41 1.1 ross Derivative works are acceptable, even for commercial purposes, so long as
42 1.7 thorpej (1) the source code for the derivative work includes prominent notice that
43 1.7 thorpej the work is derivative, and (2) the source code includes prominent notice with
44 1.7 thorpej these four paragraphs for those parts of this code that are retained.
45 1.1 ross
46 1.7 thorpej =============================================================================*/
47 1.1 ross
48 1.2 thorpej /* If you need this in a boot program, you have bigger problems... */
49 1.2 thorpej #ifndef _STANDALONE
50 1.2 thorpej
51 1.1 ross #include <sys/cdefs.h>
52 1.1 ross #if defined(LIBC_SCCS) && !defined(lint)
53 1.7 thorpej __RCSID("$NetBSD: softfloat.c,v 1.7 2020/09/02 03:45:54 thorpej Exp $");
54 1.1 ross #endif /* LIBC_SCCS and not lint */
55 1.1 ross
56 1.1 ross #ifdef SOFTFLOAT_FOR_GCC
57 1.1 ross #include "softfloat-for-gcc.h"
58 1.1 ross #endif
59 1.1 ross
60 1.1 ross #include "milieu.h"
61 1.1 ross #include "softfloat.h"
62 1.1 ross
/*
 * Hooks for converting a float64 between the form in which it is stored in
 * memory and the form SoftFloat works on.  Either macro may already have
 * been defined before this point; when it has not, the conversion defaults
 * to the identity.
 */
#if !defined(FLOAT64_DEMANGLE)
#define FLOAT64_DEMANGLE(a)	(a)
#endif
#if !defined(FLOAT64_MANGLE)
#define FLOAT64_MANGLE(a)	(a)
#endif
73 1.1 ross
/*----------------------------------------------------------------------------
| Extended double-precision rounding precision.  The variable exists only
| when FLOATX80 is defined and starts out at 80.  (The rounding mode and the
| exception flags are not stored in this section; the code in this file
| reaches them through float_rounding_mode(), float_raise() and
| float_set_inexact().)
*----------------------------------------------------------------------------*/
/*
 * XXX: This may cause options-MULTIPROCESSOR or thread problems someday.
 * Right now, it does not.  All other dynamic global variables have been
 * removed. [ross]
 */
#if defined(FLOATX80)
int8 floatx80_rounding_precision = 80;
#endif
86 1.1 ross
87 1.7 thorpej /*----------------------------------------------------------------------------
88 1.7 thorpej | Primitive arithmetic functions, including multi-word arithmetic, and
89 1.7 thorpej | division and square root approximations. (Can be specialized to target if
90 1.7 thorpej | desired.)
91 1.7 thorpej *----------------------------------------------------------------------------*/
92 1.1 ross #include "softfloat-macros.h"
93 1.1 ross
94 1.7 thorpej /*----------------------------------------------------------------------------
95 1.7 thorpej | Functions and definitions to determine: (1) whether tininess for underflow
96 1.7 thorpej | is detected before or after rounding by default, (2) what (if anything)
97 1.7 thorpej | happens when exceptions are raised, (3) how signaling NaNs are distinguished
98 1.7 thorpej | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
99 1.7 thorpej | are propagated from function inputs to output. These details are target-
100 1.7 thorpej | specific.
101 1.7 thorpej *----------------------------------------------------------------------------*/
102 1.1 ross #include "softfloat-specialize.h"
103 1.1 ross
104 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not used */
105 1.7 thorpej /*----------------------------------------------------------------------------
106 1.7 thorpej | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
107 1.7 thorpej | and 7, and returns the properly rounded 32-bit integer corresponding to the
108 1.7 thorpej | input. If `zSign' is 1, the input is negated before being converted to an
109 1.7 thorpej | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
110 1.7 thorpej | is simply rounded to an integer, with the inexact exception raised if the
111 1.7 thorpej | input cannot be represented exactly as an integer. However, if the fixed-
112 1.7 thorpej | point input is too large, the invalid exception is raised and the largest
113 1.7 thorpej | positive or negative integer is returned.
114 1.7 thorpej *----------------------------------------------------------------------------*/
115 1.7 thorpej
116 1.1 ross static int32 roundAndPackInt32( flag zSign, bits64 absZ )
117 1.1 ross {
118 1.1 ross int8 roundingMode;
119 1.1 ross flag roundNearestEven;
120 1.1 ross int8 roundIncrement, roundBits;
121 1.1 ross int32 z;
122 1.1 ross
123 1.1 ross roundingMode = float_rounding_mode();
124 1.1 ross roundNearestEven = ( roundingMode == float_round_nearest_even );
125 1.1 ross roundIncrement = 0x40;
126 1.1 ross if ( ! roundNearestEven ) {
127 1.1 ross if ( roundingMode == float_round_to_zero ) {
128 1.1 ross roundIncrement = 0;
129 1.1 ross }
130 1.1 ross else {
131 1.1 ross roundIncrement = 0x7F;
132 1.1 ross if ( zSign ) {
133 1.1 ross if ( roundingMode == float_round_up ) roundIncrement = 0;
134 1.1 ross }
135 1.1 ross else {
136 1.1 ross if ( roundingMode == float_round_down ) roundIncrement = 0;
137 1.1 ross }
138 1.1 ross }
139 1.1 ross }
140 1.1 ross roundBits = absZ & 0x7F;
141 1.1 ross absZ = ( absZ + roundIncrement )>>7;
142 1.1 ross absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
143 1.1 ross z = absZ;
144 1.1 ross if ( zSign ) z = - z;
145 1.1 ross if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
146 1.1 ross float_raise( float_flag_invalid );
147 1.1 ross return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
148 1.1 ross }
149 1.1 ross if ( roundBits ) float_set_inexact();
150 1.1 ross return z;
151 1.1 ross
152 1.1 ross }
153 1.1 ross
154 1.7 thorpej /*----------------------------------------------------------------------------
155 1.7 thorpej | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
156 1.7 thorpej | `absZ1', with binary point between bits 63 and 64 (between the input words),
157 1.7 thorpej | and returns the properly rounded 64-bit integer corresponding to the input.
158 1.7 thorpej | If `zSign' is 1, the input is negated before being converted to an integer.
159 1.7 thorpej | Ordinarily, the fixed-point input is simply rounded to an integer, with
160 1.7 thorpej | the inexact exception raised if the input cannot be represented exactly as
161 1.7 thorpej | an integer. However, if the fixed-point input is too large, the invalid
162 1.7 thorpej | exception is raised and the largest positive or negative integer is
163 1.7 thorpej | returned.
164 1.7 thorpej *----------------------------------------------------------------------------*/
165 1.7 thorpej
166 1.1 ross static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
167 1.1 ross {
168 1.1 ross int8 roundingMode;
169 1.1 ross flag roundNearestEven, increment;
170 1.1 ross int64 z;
171 1.1 ross
172 1.1 ross roundingMode = float_rounding_mode();
173 1.1 ross roundNearestEven = ( roundingMode == float_round_nearest_even );
174 1.1 ross increment = ( (sbits64) absZ1 < 0 );
175 1.1 ross if ( ! roundNearestEven ) {
176 1.1 ross if ( roundingMode == float_round_to_zero ) {
177 1.1 ross increment = 0;
178 1.1 ross }
179 1.1 ross else {
180 1.1 ross if ( zSign ) {
181 1.1 ross increment = ( roundingMode == float_round_down ) && absZ1;
182 1.1 ross }
183 1.1 ross else {
184 1.1 ross increment = ( roundingMode == float_round_up ) && absZ1;
185 1.1 ross }
186 1.1 ross }
187 1.1 ross }
188 1.1 ross if ( increment ) {
189 1.1 ross ++absZ0;
190 1.1 ross if ( absZ0 == 0 ) goto overflow;
191 1.1 ross absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
192 1.1 ross }
193 1.1 ross z = absZ0;
194 1.1 ross if ( zSign ) z = - z;
195 1.1 ross if ( z && ( ( z < 0 ) ^ zSign ) ) {
196 1.1 ross overflow:
197 1.1 ross float_raise( float_flag_invalid );
198 1.1 ross return
199 1.1 ross zSign ? (sbits64) LIT64( 0x8000000000000000 )
200 1.1 ross : LIT64( 0x7FFFFFFFFFFFFFFF );
201 1.1 ross }
202 1.1 ross if ( absZ1 ) float_set_inexact();
203 1.1 ross return z;
204 1.1 ross
205 1.1 ross }
206 1.6 martin
207 1.6 martin /* same as above, but for unsigned values */
208 1.6 martin static uint64 roundAndPackUInt64( bits64 absZ0, bits64 absZ1 )
209 1.6 martin {
210 1.6 martin int8 roundingMode;
211 1.6 martin flag roundNearestEven, increment;
212 1.6 martin uint64 z;
213 1.6 martin
214 1.6 martin roundingMode = float_rounding_mode();
215 1.6 martin roundNearestEven = ( roundingMode == float_round_nearest_even );
216 1.6 martin increment = ( (sbits64) absZ1 < 0 );
217 1.6 martin if ( ! roundNearestEven ) {
218 1.6 martin if ( roundingMode == float_round_to_zero ) {
219 1.6 martin increment = 0;
220 1.6 martin }
221 1.6 martin else {
222 1.6 martin increment = ( roundingMode == float_round_up ) && absZ1;
223 1.6 martin }
224 1.6 martin }
225 1.6 martin if ( increment ) {
226 1.6 martin ++absZ0;
227 1.6 martin if ( absZ0 == 0 ) {
228 1.6 martin float_raise( float_flag_invalid );
229 1.6 martin return LIT64( 0x7FFFFFFFFFFFFFFF );
230 1.6 martin }
231 1.6 martin absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
232 1.6 martin }
233 1.6 martin z = absZ0;
234 1.6 martin if ( absZ1 ) float_set_inexact();
235 1.6 martin return z;
236 1.6 martin
237 1.6 martin }
238 1.7 thorpej #endif /* SOFTFLOAT_FOR_GCC */
239 1.7 thorpej
240 1.7 thorpej /*----------------------------------------------------------------------------
241 1.7 thorpej | Returns the fraction bits of the single-precision floating-point value `a'.
242 1.7 thorpej *----------------------------------------------------------------------------*/
243 1.1 ross
244 1.1 ross INLINE bits32 extractFloat32Frac( float32 a )
245 1.1 ross {
246 1.1 ross
247 1.1 ross return a & 0x007FFFFF;
248 1.1 ross
249 1.1 ross }
250 1.1 ross
251 1.7 thorpej /*----------------------------------------------------------------------------
252 1.7 thorpej | Returns the exponent bits of the single-precision floating-point value `a'.
253 1.7 thorpej *----------------------------------------------------------------------------*/
254 1.7 thorpej
255 1.1 ross INLINE int16 extractFloat32Exp( float32 a )
256 1.1 ross {
257 1.1 ross
258 1.1 ross return ( a>>23 ) & 0xFF;
259 1.1 ross
260 1.1 ross }
261 1.1 ross
262 1.7 thorpej /*----------------------------------------------------------------------------
263 1.7 thorpej | Returns the sign bit of the single-precision floating-point value `a'.
264 1.7 thorpej *----------------------------------------------------------------------------*/
265 1.7 thorpej
266 1.1 ross INLINE flag extractFloat32Sign( float32 a )
267 1.1 ross {
268 1.1 ross
269 1.1 ross return a>>31;
270 1.1 ross
271 1.1 ross }
272 1.1 ross
273 1.7 thorpej /*----------------------------------------------------------------------------
274 1.7 thorpej | Normalizes the subnormal single-precision floating-point value represented
275 1.7 thorpej | by the denormalized significand `aSig'. The normalized exponent and
276 1.7 thorpej | significand are stored at the locations pointed to by `zExpPtr' and
277 1.7 thorpej | `zSigPtr', respectively.
278 1.7 thorpej *----------------------------------------------------------------------------*/
279 1.7 thorpej
280 1.1 ross static void
281 1.1 ross normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
282 1.1 ross {
283 1.1 ross int8 shiftCount;
284 1.1 ross
285 1.1 ross shiftCount = countLeadingZeros32( aSig ) - 8;
286 1.1 ross *zSigPtr = aSig<<shiftCount;
287 1.1 ross *zExpPtr = 1 - shiftCount;
288 1.1 ross
289 1.1 ross }
290 1.1 ross
291 1.7 thorpej /*----------------------------------------------------------------------------
292 1.7 thorpej | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
293 1.7 thorpej | single-precision floating-point value, returning the result. After being
294 1.7 thorpej | shifted into the proper positions, the three fields are simply added
295 1.7 thorpej | together to form the result. This means that any integer portion of `zSig'
296 1.7 thorpej | will be added into the exponent. Since a properly normalized significand
297 1.7 thorpej | will have an integer portion equal to 1, the `zExp' input should be 1 less
298 1.7 thorpej | than the desired result exponent whenever `zSig' is a complete, normalized
299 1.7 thorpej | significand.
300 1.7 thorpej *----------------------------------------------------------------------------*/
301 1.7 thorpej
302 1.1 ross INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
303 1.1 ross {
304 1.1 ross
305 1.1 ross return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
306 1.1 ross
307 1.1 ross }
308 1.1 ross
309 1.7 thorpej /*----------------------------------------------------------------------------
310 1.7 thorpej | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
311 1.7 thorpej | and significand `zSig', and returns the proper single-precision floating-
312 1.7 thorpej | point value corresponding to the abstract input. Ordinarily, the abstract
313 1.7 thorpej | value is simply rounded and packed into the single-precision format, with
314 1.7 thorpej | the inexact exception raised if the abstract input cannot be represented
315 1.7 thorpej | exactly. However, if the abstract value is too large, the overflow and
316 1.7 thorpej | inexact exceptions are raised and an infinity or maximal finite value is
317 1.7 thorpej | returned. If the abstract value is too small, the input value is rounded to
318 1.7 thorpej | a subnormal number, and the underflow and inexact exceptions are raised if
319 1.7 thorpej | the abstract input cannot be represented exactly as a subnormal single-
320 1.7 thorpej | precision floating-point number.
321 1.7 thorpej | The input significand `zSig' has its binary point between bits 30
322 1.7 thorpej | and 29, which is 7 bits to the left of the usual location. This shifted
323 1.7 thorpej | significand must be normalized or smaller. If `zSig' is not normalized,
324 1.7 thorpej | `zExp' must be 0; in that case, the result returned is a subnormal number,
325 1.7 thorpej | and it must not require rounding. In the usual case that `zSig' is
326 1.7 thorpej | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
327 1.7 thorpej | The handling of underflow and overflow follows the IEC/IEEE Standard for
328 1.7 thorpej | Binary Floating-Point Arithmetic.
329 1.7 thorpej *----------------------------------------------------------------------------*/
330 1.7 thorpej
331 1.1 ross static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
332 1.1 ross {
333 1.1 ross int8 roundingMode;
334 1.1 ross flag roundNearestEven;
335 1.1 ross int8 roundIncrement, roundBits;
336 1.1 ross flag isTiny;
337 1.1 ross
338 1.1 ross roundingMode = float_rounding_mode();
339 1.1 ross roundNearestEven = ( roundingMode == float_round_nearest_even );
340 1.1 ross roundIncrement = 0x40;
341 1.1 ross if ( ! roundNearestEven ) {
342 1.1 ross if ( roundingMode == float_round_to_zero ) {
343 1.1 ross roundIncrement = 0;
344 1.1 ross }
345 1.1 ross else {
346 1.1 ross roundIncrement = 0x7F;
347 1.1 ross if ( zSign ) {
348 1.1 ross if ( roundingMode == float_round_up ) roundIncrement = 0;
349 1.1 ross }
350 1.1 ross else {
351 1.1 ross if ( roundingMode == float_round_down ) roundIncrement = 0;
352 1.1 ross }
353 1.1 ross }
354 1.1 ross }
355 1.1 ross roundBits = zSig & 0x7F;
356 1.1 ross if ( 0xFD <= (bits16) zExp ) {
357 1.1 ross if ( ( 0xFD < zExp )
358 1.1 ross || ( ( zExp == 0xFD )
359 1.1 ross && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
360 1.1 ross ) {
361 1.1 ross float_raise( float_flag_overflow | float_flag_inexact );
362 1.1 ross return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
363 1.1 ross }
364 1.1 ross if ( zExp < 0 ) {
365 1.1 ross isTiny =
366 1.1 ross ( float_detect_tininess == float_tininess_before_rounding )
367 1.1 ross || ( zExp < -1 )
368 1.1 ross || ( zSig + roundIncrement < 0x80000000 );
369 1.1 ross shift32RightJamming( zSig, - zExp, &zSig );
370 1.1 ross zExp = 0;
371 1.1 ross roundBits = zSig & 0x7F;
372 1.1 ross if ( isTiny && roundBits ) float_raise( float_flag_underflow );
373 1.1 ross }
374 1.1 ross }
375 1.1 ross if ( roundBits ) float_set_inexact();
376 1.1 ross zSig = ( zSig + roundIncrement )>>7;
377 1.1 ross zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
378 1.1 ross if ( zSig == 0 ) zExp = 0;
379 1.1 ross return packFloat32( zSign, zExp, zSig );
380 1.1 ross
381 1.1 ross }
382 1.1 ross
383 1.7 thorpej /*----------------------------------------------------------------------------
384 1.7 thorpej | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
385 1.7 thorpej | and significand `zSig', and returns the proper single-precision floating-
386 1.7 thorpej | point value corresponding to the abstract input. This routine is just like
387 1.7 thorpej | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
388 1.7 thorpej | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
389 1.7 thorpej | floating-point exponent.
390 1.7 thorpej *----------------------------------------------------------------------------*/
391 1.7 thorpej
392 1.1 ross static float32
393 1.1 ross normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
394 1.1 ross {
395 1.1 ross int8 shiftCount;
396 1.1 ross
397 1.1 ross shiftCount = countLeadingZeros32( zSig ) - 1;
398 1.1 ross return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
399 1.1 ross
400 1.1 ross }
401 1.1 ross
402 1.7 thorpej /*----------------------------------------------------------------------------
403 1.7 thorpej | Returns the fraction bits of the double-precision floating-point value `a'.
404 1.7 thorpej *----------------------------------------------------------------------------*/
405 1.7 thorpej
406 1.1 ross INLINE bits64 extractFloat64Frac( float64 a )
407 1.1 ross {
408 1.1 ross
409 1.1 ross return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
410 1.1 ross
411 1.1 ross }
412 1.1 ross
413 1.7 thorpej /*----------------------------------------------------------------------------
414 1.7 thorpej | Returns the exponent bits of the double-precision floating-point value `a'.
415 1.7 thorpej *----------------------------------------------------------------------------*/
416 1.7 thorpej
417 1.1 ross INLINE int16 extractFloat64Exp( float64 a )
418 1.1 ross {
419 1.1 ross
420 1.1 ross return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
421 1.1 ross
422 1.1 ross }
423 1.7 thorpej /*----------------------------------------------------------------------------
424 1.7 thorpej | Returns the sign bit of the double-precision floating-point value `a'.
425 1.7 thorpej *----------------------------------------------------------------------------*/
426 1.1 ross
427 1.1 ross INLINE flag extractFloat64Sign( float64 a )
428 1.1 ross {
429 1.1 ross
430 1.1 ross return FLOAT64_DEMANGLE(a)>>63;
431 1.1 ross
432 1.1 ross }
433 1.1 ross
434 1.7 thorpej /*----------------------------------------------------------------------------
435 1.7 thorpej | Normalizes the subnormal double-precision floating-point value represented
436 1.7 thorpej | by the denormalized significand `aSig'. The normalized exponent and
437 1.7 thorpej | significand are stored at the locations pointed to by `zExpPtr' and
438 1.7 thorpej | `zSigPtr', respectively.
439 1.7 thorpej *----------------------------------------------------------------------------*/
440 1.7 thorpej
441 1.1 ross static void
442 1.1 ross normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
443 1.1 ross {
444 1.1 ross int8 shiftCount;
445 1.1 ross
446 1.1 ross shiftCount = countLeadingZeros64( aSig ) - 11;
447 1.1 ross *zSigPtr = aSig<<shiftCount;
448 1.1 ross *zExpPtr = 1 - shiftCount;
449 1.1 ross
450 1.1 ross }
451 1.1 ross
452 1.7 thorpej /*----------------------------------------------------------------------------
453 1.7 thorpej | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
454 1.7 thorpej | double-precision floating-point value, returning the result. After being
455 1.7 thorpej | shifted into the proper positions, the three fields are simply added
456 1.7 thorpej | together to form the result. This means that any integer portion of `zSig'
457 1.7 thorpej | will be added into the exponent. Since a properly normalized significand
458 1.7 thorpej | will have an integer portion equal to 1, the `zExp' input should be 1 less
459 1.7 thorpej | than the desired result exponent whenever `zSig' is a complete, normalized
460 1.7 thorpej | significand.
461 1.7 thorpej *----------------------------------------------------------------------------*/
462 1.7 thorpej
463 1.1 ross INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
464 1.1 ross {
465 1.1 ross
466 1.1 ross return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
467 1.1 ross ( ( (bits64) zExp )<<52 ) + zSig );
468 1.1 ross
469 1.1 ross }
470 1.1 ross
471 1.7 thorpej /*----------------------------------------------------------------------------
472 1.7 thorpej | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
473 1.7 thorpej | and significand `zSig', and returns the proper double-precision floating-
474 1.7 thorpej | point value corresponding to the abstract input. Ordinarily, the abstract
475 1.7 thorpej | value is simply rounded and packed into the double-precision format, with
476 1.7 thorpej | the inexact exception raised if the abstract input cannot be represented
477 1.7 thorpej | exactly. However, if the abstract value is too large, the overflow and
478 1.7 thorpej | inexact exceptions are raised and an infinity or maximal finite value is
479 1.7 thorpej | returned. If the abstract value is too small, the input value is rounded
480 1.7 thorpej | to a subnormal number, and the underflow and inexact exceptions are raised
481 1.7 thorpej | if the abstract input cannot be represented exactly as a subnormal double-
482 1.7 thorpej | precision floating-point number.
483 1.7 thorpej | The input significand `zSig' has its binary point between bits 62
484 1.7 thorpej | and 61, which is 10 bits to the left of the usual location. This shifted
485 1.7 thorpej | significand must be normalized or smaller. If `zSig' is not normalized,
486 1.7 thorpej | `zExp' must be 0; in that case, the result returned is a subnormal number,
487 1.7 thorpej | and it must not require rounding. In the usual case that `zSig' is
488 1.7 thorpej | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
489 1.7 thorpej | The handling of underflow and overflow follows the IEC/IEEE Standard for
490 1.7 thorpej | Binary Floating-Point Arithmetic.
491 1.7 thorpej *----------------------------------------------------------------------------*/
492 1.7 thorpej
493 1.1 ross static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
494 1.1 ross {
495 1.1 ross int8 roundingMode;
496 1.1 ross flag roundNearestEven;
497 1.1 ross int16 roundIncrement, roundBits;
498 1.1 ross flag isTiny;
499 1.1 ross
500 1.1 ross roundingMode = float_rounding_mode();
501 1.1 ross roundNearestEven = ( roundingMode == float_round_nearest_even );
502 1.1 ross roundIncrement = 0x200;
503 1.1 ross if ( ! roundNearestEven ) {
504 1.1 ross if ( roundingMode == float_round_to_zero ) {
505 1.1 ross roundIncrement = 0;
506 1.1 ross }
507 1.1 ross else {
508 1.1 ross roundIncrement = 0x3FF;
509 1.1 ross if ( zSign ) {
510 1.1 ross if ( roundingMode == float_round_up ) roundIncrement = 0;
511 1.1 ross }
512 1.1 ross else {
513 1.1 ross if ( roundingMode == float_round_down ) roundIncrement = 0;
514 1.1 ross }
515 1.1 ross }
516 1.1 ross }
517 1.1 ross roundBits = zSig & 0x3FF;
518 1.1 ross if ( 0x7FD <= (bits16) zExp ) {
519 1.1 ross if ( ( 0x7FD < zExp )
520 1.1 ross || ( ( zExp == 0x7FD )
521 1.1 ross && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
522 1.1 ross ) {
523 1.1 ross float_raise( float_flag_overflow | float_flag_inexact );
524 1.1 ross return FLOAT64_MANGLE(
525 1.1 ross FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
526 1.1 ross ( roundIncrement == 0 ));
527 1.1 ross }
528 1.1 ross if ( zExp < 0 ) {
529 1.1 ross isTiny =
530 1.1 ross ( float_detect_tininess == float_tininess_before_rounding )
531 1.1 ross || ( zExp < -1 )
532 1.1 ross || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
533 1.1 ross shift64RightJamming( zSig, - zExp, &zSig );
534 1.1 ross zExp = 0;
535 1.1 ross roundBits = zSig & 0x3FF;
536 1.1 ross if ( isTiny && roundBits ) float_raise( float_flag_underflow );
537 1.1 ross }
538 1.1 ross }
539 1.1 ross if ( roundBits ) float_set_inexact();
540 1.1 ross zSig = ( zSig + roundIncrement )>>10;
541 1.1 ross zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
542 1.1 ross if ( zSig == 0 ) zExp = 0;
543 1.1 ross return packFloat64( zSign, zExp, zSig );
544 1.1 ross
545 1.1 ross }
546 1.1 ross
547 1.7 thorpej /*----------------------------------------------------------------------------
548 1.7 thorpej | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
549 1.7 thorpej | and significand `zSig', and returns the proper double-precision floating-
550 1.7 thorpej | point value corresponding to the abstract input. This routine is just like
551 1.7 thorpej | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
552 1.7 thorpej | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
553 1.7 thorpej | floating-point exponent.
554 1.7 thorpej *----------------------------------------------------------------------------*/
555 1.7 thorpej
556 1.1 ross static float64
557 1.1 ross normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
558 1.1 ross {
559 1.1 ross int8 shiftCount;
560 1.1 ross
561 1.1 ross shiftCount = countLeadingZeros64( zSig ) - 1;
562 1.1 ross return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
563 1.1 ross
564 1.1 ross }
565 1.1 ross
566 1.1 ross #ifdef FLOATX80
567 1.1 ross
568 1.7 thorpej /*----------------------------------------------------------------------------
569 1.7 thorpej | Returns the fraction bits of the extended double-precision floating-point
570 1.7 thorpej | value `a'.
571 1.7 thorpej *----------------------------------------------------------------------------*/
572 1.7 thorpej
573 1.1 ross INLINE bits64 extractFloatx80Frac( floatx80 a )
574 1.1 ross {
575 1.1 ross
576 1.1 ross return a.low;
577 1.1 ross
578 1.1 ross }
579 1.1 ross
580 1.7 thorpej /*----------------------------------------------------------------------------
581 1.7 thorpej | Returns the exponent bits of the extended double-precision floating-point
582 1.7 thorpej | value `a'.
583 1.7 thorpej *----------------------------------------------------------------------------*/
584 1.7 thorpej
585 1.1 ross INLINE int32 extractFloatx80Exp( floatx80 a )
586 1.1 ross {
587 1.1 ross
588 1.1 ross return a.high & 0x7FFF;
589 1.1 ross
590 1.1 ross }
591 1.1 ross
592 1.7 thorpej /*----------------------------------------------------------------------------
593 1.7 thorpej | Returns the sign bit of the extended double-precision floating-point value
594 1.7 thorpej | `a'.
595 1.7 thorpej *----------------------------------------------------------------------------*/
596 1.7 thorpej
597 1.1 ross INLINE flag extractFloatx80Sign( floatx80 a )
598 1.1 ross {
599 1.1 ross
600 1.1 ross return a.high>>15;
601 1.1 ross
602 1.1 ross }
603 1.1 ross
604 1.7 thorpej /*----------------------------------------------------------------------------
605 1.7 thorpej | Normalizes the subnormal extended double-precision floating-point value
606 1.7 thorpej | represented by the denormalized significand `aSig'. The normalized exponent
607 1.7 thorpej | and significand are stored at the locations pointed to by `zExpPtr' and
608 1.7 thorpej | `zSigPtr', respectively.
609 1.7 thorpej *----------------------------------------------------------------------------*/
610 1.7 thorpej
611 1.1 ross static void
612 1.1 ross normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
613 1.1 ross {
614 1.1 ross int8 shiftCount;
615 1.1 ross
616 1.1 ross shiftCount = countLeadingZeros64( aSig );
617 1.1 ross *zSigPtr = aSig<<shiftCount;
618 1.1 ross *zExpPtr = 1 - shiftCount;
619 1.1 ross
620 1.1 ross }
621 1.1 ross
622 1.7 thorpej /*----------------------------------------------------------------------------
623 1.7 thorpej | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
624 1.7 thorpej | extended double-precision floating-point value, returning the result.
625 1.7 thorpej *----------------------------------------------------------------------------*/
626 1.7 thorpej
627 1.1 ross INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
628 1.1 ross {
629 1.1 ross floatx80 z;
630 1.1 ross
631 1.1 ross z.low = zSig;
632 1.1 ross z.high = ( ( (bits16) zSign )<<15 ) + zExp;
633 1.1 ross return z;
634 1.1 ross
635 1.1 ross }
636 1.1 ross
637 1.7 thorpej /*----------------------------------------------------------------------------
638 1.7 thorpej | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
639 1.7 thorpej | and extended significand formed by the concatenation of `zSig0' and `zSig1',
640 1.7 thorpej | and returns the proper extended double-precision floating-point value
641 1.7 thorpej | corresponding to the abstract input. Ordinarily, the abstract value is
642 1.7 thorpej | rounded and packed into the extended double-precision format, with the
643 1.7 thorpej | inexact exception raised if the abstract input cannot be represented
644 1.7 thorpej | exactly. However, if the abstract value is too large, the overflow and
645 1.7 thorpej | inexact exceptions are raised and an infinity or maximal finite value is
646 1.7 thorpej | returned. If the abstract value is too small, the input value is rounded to
647 1.7 thorpej | a subnormal number, and the underflow and inexact exceptions are raised if
648 1.7 thorpej | the abstract input cannot be represented exactly as a subnormal extended
649 1.7 thorpej | double-precision floating-point number.
650 1.7 thorpej | If `roundingPrecision' is 32 or 64, the result is rounded to the same
651 1.7 thorpej | number of bits as single or double precision, respectively. Otherwise, the
652 1.7 thorpej | result is rounded to the full precision of the extended double-precision
653 1.7 thorpej | format.
654 1.7 thorpej | The input significand must be normalized or smaller. If the input
655 1.7 thorpej | significand is not normalized, `zExp' must be 0; in that case, the result
656 1.7 thorpej | returned is a subnormal number, and it must not require rounding. The
657 1.7 thorpej | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
658 1.7 thorpej | Floating-Point Arithmetic.
659 1.7 thorpej *----------------------------------------------------------------------------*/
660 1.7 thorpej
661 1.1 ross static floatx80
662 1.1 ross roundAndPackFloatx80(
663 1.1 ross int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
664 1.1 ross )
665 1.1 ross {
/*
 * Overview: a roundingPrecision of 64 or 32 selects the reduced-precision
 * path directly below, which rounds inside zSig0 using roundMask and
 * roundIncrement.  Every other value (80 included) jumps to the
 * `precision80' label, where zSig1 holds the bits to be rounded away.
 * The `overflow' label inside the precision80 section is shared by both
 * paths.
 */
666 1.1 ross int8 roundingMode;
667 1.1 ross flag roundNearestEven, increment, isTiny;
668 1.1 ross int64 roundIncrement, roundMask, roundBits;
669 1.1 ross
670 1.1 ross roundingMode = float_rounding_mode();
671 1.1 ross roundNearestEven = ( roundingMode == float_round_nearest_even );
672 1.1 ross if ( roundingPrecision == 80 ) goto precision80;
673 1.1 ross if ( roundingPrecision == 64 ) {
/* Precision 64: the low 11 bits of zSig0 are rounded off; the increment is
   half of one unit at the lowest kept bit. */
674 1.1 ross roundIncrement = LIT64( 0x0000000000000400 );
675 1.1 ross roundMask = LIT64( 0x00000000000007FF );
676 1.1 ross }
677 1.1 ross else if ( roundingPrecision == 32 ) {
/* Precision 32: the low 40 bits of zSig0 are rounded off. */
678 1.1 ross roundIncrement = LIT64( 0x0000008000000000 );
679 1.1 ross roundMask = LIT64( 0x000000FFFFFFFFFF );
680 1.1 ross }
681 1.1 ross else {
682 1.1 ross goto precision80;
683 1.1 ross }
/* Fold any nonzero zSig1 into bit 0 of zSig0 so it still shows up in
   roundBits. */
684 1.1 ross zSig0 |= ( zSig1 != 0 );
/* Directed modes: the increment is 0 when the mode truncates for this sign
   (round-to-zero, round-up of a negative value, round-down of a positive
   value); otherwise it is the whole mask, so any discarded bit carries into
   the kept part. */
685 1.1 ross if ( ! roundNearestEven ) {
686 1.1 ross if ( roundingMode == float_round_to_zero ) {
687 1.1 ross roundIncrement = 0;
688 1.1 ross }
689 1.1 ross else {
690 1.1 ross roundIncrement = roundMask;
691 1.1 ross if ( zSign ) {
692 1.1 ross if ( roundingMode == float_round_up ) roundIncrement = 0;
693 1.1 ross }
694 1.1 ross else {
695 1.1 ross if ( roundingMode == float_round_down ) roundIncrement = 0;
696 1.1 ross }
697 1.1 ross }
698 1.1 ross }
699 1.1 ross roundBits = zSig0 & roundMask;
/* One comparison covers both out-of-range cases: it is true when
   zExp >= 0x7FFE, and also when zExp <= 0 provided bits32 is an unsigned
   32-bit type so that zExp - 1 wraps to a large value (bits32 is declared
   outside this section). */
700 1.1 ross if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
/* Overflow: exponent above 0x7FFE, or equal to it with the rounding
   addition wrapping past 64 bits.  The handler is the `overflow' label in
   the precision80 section; roundMask keeps its reduced-precision value
   there. */
701 1.1 ross if ( ( 0x7FFE < zExp )
702 1.1 ross || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
703 1.1 ross ) {
704 1.1 ross goto overflow;
705 1.1 ross }
706 1.1 ross if ( zExp <= 0 ) {
/* Subnormal result.  isTiny holds when tininess is detected before
   rounding, or zExp is negative, or the rounding addition does not wrap
   past 64 bits. */
707 1.1 ross isTiny =
708 1.1 ross ( float_detect_tininess == float_tininess_before_rounding )
709 1.1 ross || ( zExp < 0 )
710 1.1 ross || ( zSig0 <= zSig0 + roundIncrement );
/* Shift right by 1 - zExp; judging by the helper's name the shifted-out
   bits are jammed into the low bit (its definition is outside this
   section). */
711 1.1 ross shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
712 1.1 ross zExp = 0;
713 1.1 ross roundBits = zSig0 & roundMask;
/* Underflow is raised only when the result is both tiny and inexact. */
714 1.1 ross if ( isTiny && roundBits ) float_raise( float_flag_underflow );
715 1.1 ross if ( roundBits ) float_set_inexact();
716 1.1 ross zSig0 += roundIncrement;
/* If rounding set bit 63 of the significand, the exponent field becomes 1. */
717 1.1 ross if ( (sbits64) zSig0 < 0 ) zExp = 1;
/* roundMask + 1 is one unit at the lowest kept bit.  On an exact tie under
   round-to-nearest-even (roundBits<<1 equals that unit) the mask is widened
   by that bit so the lowest kept bit is cleared as well. */
718 1.1 ross roundIncrement = roundMask + 1;
719 1.1 ross if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
720 1.1 ross roundMask |= roundIncrement;
721 1.1 ross }
722 1.1 ross zSig0 &= ~ roundMask;
723 1.1 ross return packFloatx80( zSign, zExp, zSig0 );
724 1.1 ross }
725 1.1 ross }
/* In-range exponent at reduced precision. */
726 1.1 ross if ( roundBits ) float_set_inexact();
727 1.1 ross zSig0 += roundIncrement;
/* Unsigned wraparound means the addition carried out of bit 63: bump the
   exponent and set the significand to 0x8000000000000000. */
728 1.1 ross if ( zSig0 < roundIncrement ) {
729 1.1 ross ++zExp;
730 1.1 ross zSig0 = LIT64( 0x8000000000000000 );
731 1.1 ross }
/* Same tie handling as in the subnormal branch above. */
732 1.1 ross roundIncrement = roundMask + 1;
733 1.1 ross if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
734 1.1 ross roundMask |= roundIncrement;
735 1.1 ross }
736 1.1 ross zSig0 &= ~ roundMask;
/* A zero significand is packed with a zero exponent. */
737 1.1 ross if ( zSig0 == 0 ) zExp = 0;
738 1.1 ross return packFloatx80( zSign, zExp, zSig0 );
739 1.1 ross precision80:
/* Full precision: zSig0 is kept whole and zSig1 decides the rounding.
   Default (nearest-even): increment when the top bit of zSig1 is set. */
740 1.1 ross increment = ( (sbits64) zSig1 < 0 );
/* Directed modes: never increment for round-to-zero; otherwise increment
   only when zSig1 is nonzero and the mode is round-down for a negative
   value or round-up for a positive one. */
741 1.1 ross if ( ! roundNearestEven ) {
742 1.1 ross if ( roundingMode == float_round_to_zero ) {
743 1.1 ross increment = 0;
744 1.1 ross }
745 1.1 ross else {
746 1.1 ross if ( zSign ) {
747 1.1 ross increment = ( roundingMode == float_round_down ) && zSig1;
748 1.1 ross }
749 1.1 ross else {
750 1.1 ross increment = ( roundingMode == float_round_up ) && zSig1;
751 1.1 ross }
752 1.1 ross }
753 1.1 ross }
/* Same combined range test as in the reduced-precision path. */
754 1.1 ross if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
/* Overflow: exponent above 0x7FFE, or equal to it with an all-ones
   significand that is about to be incremented. */
755 1.1 ross if ( ( 0x7FFE < zExp )
756 1.1 ross || ( ( zExp == 0x7FFE )
757 1.1 ross && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
758 1.1 ross && increment
759 1.1 ross )
760 1.1 ross ) {
/* With roundMask cleared, ~roundMask below is an all-ones significand. */
761 1.1 ross roundMask = 0;
762 1.1 ross overflow:
/* Also reached by `goto overflow' from the reduced-precision path.  Modes
   that truncate for this sign return exponent 0x7FFE with significand
   ~roundMask; all other modes return exponent 0x7FFF with significand
   0x8000000000000000. */
763 1.1 ross float_raise( float_flag_overflow | float_flag_inexact );
764 1.1 ross if ( ( roundingMode == float_round_to_zero )
765 1.1 ross || ( zSign && ( roundingMode == float_round_up ) )
766 1.1 ross || ( ! zSign && ( roundingMode == float_round_down ) )
767 1.1 ross ) {
768 1.1 ross return packFloatx80( zSign, 0x7FFE, ~ roundMask );
769 1.1 ross }
770 1.1 ross return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
771 1.1 ross }
772 1.1 ross if ( zExp <= 0 ) {
/* Subnormal result.  isTiny holds when tininess is detected before
   rounding, or zExp is negative, or no increment is pending, or zSig0 is
   not all ones. */
773 1.1 ross isTiny =
774 1.1 ross ( float_detect_tininess == float_tininess_before_rounding )
775 1.1 ross || ( zExp < 0 )
776 1.1 ross || ! increment
777 1.1 ross || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
/* Shift the zSig0:zSig1 pair right by 1 - zExp, then recompute the rounding
   decision from the shifted zSig1. */
778 1.1 ross shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
779 1.1 ross zExp = 0;
780 1.1 ross if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
781 1.1 ross if ( zSig1 ) float_set_inexact();
782 1.1 ross if ( roundNearestEven ) {
783 1.1 ross increment = ( (sbits64) zSig1 < 0 );
784 1.1 ross }
785 1.1 ross else {
786 1.1 ross if ( zSign ) {
787 1.1 ross increment = ( roundingMode == float_round_down ) && zSig1;
788 1.1 ross }
789 1.1 ross else {
790 1.1 ross increment = ( roundingMode == float_round_up ) && zSig1;
791 1.1 ross }
792 1.1 ross }
793 1.1 ross if ( increment ) {
794 1.1 ross ++zSig0;
/* Tie under nearest-even: zSig1<<1 == 0 means only its top bit was set, so
   bit 0 of zSig0 is cleared.  In other modes the mask is all ones. */
795 1.1 ross zSig0 &=
796 1.1 ross ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
/* If the increment set bit 63, the exponent field becomes 1. */
797 1.1 ross if ( (sbits64) zSig0 < 0 ) zExp = 1;
798 1.1 ross }
799 1.1 ross return packFloatx80( zSign, zExp, zSig0 );
800 1.1 ross }
801 1.1 ross }
/* In-range exponent at full precision. */
802 1.1 ross if ( zSig1 ) float_set_inexact();
803 1.1 ross if ( increment ) {
804 1.1 ross ++zSig0;
/* Wrapping to zero means the increment carried out of bit 63. */
805 1.1 ross if ( zSig0 == 0 ) {
806 1.1 ross ++zExp;
807 1.1 ross zSig0 = LIT64( 0x8000000000000000 );
808 1.1 ross }
809 1.1 ross else {
/* Same nearest-even tie adjustment as in the subnormal branch. */
810 1.1 ross zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
811 1.1 ross }
812 1.1 ross }
813 1.1 ross else {
/* A zero significand is packed with a zero exponent. */
814 1.1 ross if ( zSig0 == 0 ) zExp = 0;
815 1.1 ross }
816 1.1 ross return packFloatx80( zSign, zExp, zSig0 );
817 1.1 ross
818 1.1 ross }
819 1.1 ross
820 1.7 thorpej /*----------------------------------------------------------------------------
821 1.7 thorpej | Takes an abstract floating-point value having sign `zSign', exponent
822 1.7 thorpej | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
823 1.7 thorpej | and returns the proper extended double-precision floating-point value
824 1.7 thorpej | corresponding to the abstract input. This routine is just like
825 1.7 thorpej | `roundAndPackFloatx80' except that the input significand does not have to be
826 1.7 thorpej | normalized.
827 1.7 thorpej *----------------------------------------------------------------------------*/
828 1.7 thorpej
829 1.1 ross static floatx80
830 1.1 ross normalizeRoundAndPackFloatx80(
831 1.1 ross int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
832 1.1 ross )
833 1.1 ross {
834 1.1 ross int8 shiftCount;
835 1.1 ross
836 1.1 ross if ( zSig0 == 0 ) {
837 1.1 ross zSig0 = zSig1;
838 1.1 ross zSig1 = 0;
839 1.1 ross zExp -= 64;
840 1.1 ross }
841 1.1 ross shiftCount = countLeadingZeros64( zSig0 );
842 1.1 ross shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
843 1.1 ross zExp -= shiftCount;
844 1.1 ross return
845 1.1 ross roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
846 1.1 ross
847 1.1 ross }
848 1.1 ross
849 1.1 ross #endif
850 1.1 ross
851 1.1 ross #ifdef FLOAT128
852 1.1 ross
853 1.7 thorpej /*----------------------------------------------------------------------------
854 1.7 thorpej | Returns the least-significant 64 fraction bits of the quadruple-precision
855 1.7 thorpej | floating-point value `a'.
856 1.7 thorpej *----------------------------------------------------------------------------*/
857 1.7 thorpej
858 1.1 ross INLINE bits64 extractFloat128Frac1( float128 a )
859 1.1 ross {
860 1.1 ross
861 1.1 ross return a.low;
862 1.1 ross
863 1.1 ross }
864 1.1 ross
865 1.7 thorpej /*----------------------------------------------------------------------------
866 1.7 thorpej | Returns the most-significant 48 fraction bits of the quadruple-precision
867 1.7 thorpej | floating-point value `a'.
868 1.7 thorpej *----------------------------------------------------------------------------*/
869 1.7 thorpej
870 1.1 ross INLINE bits64 extractFloat128Frac0( float128 a )
871 1.1 ross {
872 1.1 ross
873 1.1 ross return a.high & LIT64( 0x0000FFFFFFFFFFFF );
874 1.1 ross
875 1.1 ross }
876 1.1 ross
877 1.7 thorpej /*----------------------------------------------------------------------------
878 1.7 thorpej | Returns the exponent bits of the quadruple-precision floating-point value
879 1.7 thorpej | `a'.
880 1.7 thorpej *----------------------------------------------------------------------------*/
881 1.7 thorpej
882 1.1 ross INLINE int32 extractFloat128Exp( float128 a )
883 1.1 ross {
884 1.1 ross
885 1.1 ross return ( a.high>>48 ) & 0x7FFF;
886 1.1 ross
887 1.1 ross }
888 1.1 ross
889 1.7 thorpej
890 1.7 thorpej /*----------------------------------------------------------------------------
891 1.7 thorpej | Returns the sign bit of the quadruple-precision floating-point value `a'.
892 1.7 thorpej *----------------------------------------------------------------------------*/
893 1.7 thorpej
894 1.1 ross INLINE flag extractFloat128Sign( float128 a )
895 1.1 ross {
896 1.1 ross
897 1.1 ross return a.high>>63;
898 1.1 ross
899 1.1 ross }
900 1.1 ross
901 1.7 thorpej /*----------------------------------------------------------------------------
902 1.7 thorpej | Normalizes the subnormal quadruple-precision floating-point value
903 1.7 thorpej | represented by the denormalized significand formed by the concatenation of
904 1.7 thorpej | `aSig0' and `aSig1'. The normalized exponent is stored at the location
905 1.7 thorpej | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
906 1.7 thorpej | significand are stored at the location pointed to by `zSig0Ptr', and the
907 1.7 thorpej | least significant 64 bits of the normalized significand are stored at the
908 1.7 thorpej | location pointed to by `zSig1Ptr'.
909 1.7 thorpej *----------------------------------------------------------------------------*/
910 1.7 thorpej
911 1.1 ross static void
912 1.1 ross normalizeFloat128Subnormal(
913 1.1 ross bits64 aSig0,
914 1.1 ross bits64 aSig1,
915 1.1 ross int32 *zExpPtr,
916 1.1 ross bits64 *zSig0Ptr,
917 1.1 ross bits64 *zSig1Ptr
918 1.1 ross )
919 1.1 ross {
920 1.1 ross int8 shiftCount;
921 1.1 ross
922 1.1 ross if ( aSig0 == 0 ) {
923 1.1 ross shiftCount = countLeadingZeros64( aSig1 ) - 15;
924 1.1 ross if ( shiftCount < 0 ) {
925 1.1 ross *zSig0Ptr = aSig1>>( - shiftCount );
926 1.1 ross *zSig1Ptr = aSig1<<( shiftCount & 63 );
927 1.1 ross }
928 1.1 ross else {
929 1.1 ross *zSig0Ptr = aSig1<<shiftCount;
930 1.1 ross *zSig1Ptr = 0;
931 1.1 ross }
932 1.1 ross *zExpPtr = - shiftCount - 63;
933 1.1 ross }
934 1.1 ross else {
935 1.1 ross shiftCount = countLeadingZeros64( aSig0 ) - 15;
936 1.1 ross shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
937 1.1 ross *zExpPtr = 1 - shiftCount;
938 1.1 ross }
939 1.1 ross
940 1.1 ross }
941 1.1 ross
942 1.7 thorpej /*----------------------------------------------------------------------------
943 1.7 thorpej | Packs the sign `zSign', the exponent `zExp', and the significand formed
944 1.7 thorpej | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
945 1.7 thorpej | floating-point value, returning the result. After being shifted into the
946 1.7 thorpej | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
947 1.7 thorpej | added together to form the most significant 32 bits of the result. This
948 1.7 thorpej | means that any integer portion of `zSig0' will be added into the exponent.
949 1.7 thorpej | Since a properly normalized significand will have an integer portion equal
950 1.7 thorpej | to 1, the `zExp' input should be 1 less than the desired result exponent
951 1.7 thorpej | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
952 1.7 thorpej | significand.
953 1.7 thorpej *----------------------------------------------------------------------------*/
954 1.7 thorpej
955 1.1 ross INLINE float128
956 1.1 ross packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
957 1.1 ross {
958 1.1 ross float128 z;
959 1.1 ross
960 1.1 ross z.low = zSig1;
961 1.1 ross z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
962 1.1 ross return z;
963 1.1 ross
964 1.1 ross }
965 1.1 ross
966 1.7 thorpej /*----------------------------------------------------------------------------
967 1.7 thorpej | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
968 1.7 thorpej | and extended significand formed by the concatenation of `zSig0', `zSig1',
969 1.7 thorpej | and `zSig2', and returns the proper quadruple-precision floating-point value
970 1.7 thorpej | corresponding to the abstract input. Ordinarily, the abstract value is
971 1.7 thorpej | simply rounded and packed into the quadruple-precision format, with the
972 1.7 thorpej | inexact exception raised if the abstract input cannot be represented
973 1.7 thorpej | exactly. However, if the abstract value is too large, the overflow and
974 1.7 thorpej | inexact exceptions are raised and an infinity or maximal finite value is
975 1.7 thorpej | returned. If the abstract value is too small, the input value is rounded to
976 1.7 thorpej | a subnormal number, and the underflow and inexact exceptions are raised if
977 1.7 thorpej | the abstract input cannot be represented exactly as a subnormal quadruple-
978 1.7 thorpej | precision floating-point number.
979 1.7 thorpej | The input significand must be normalized or smaller. If the input
980 1.7 thorpej | significand is not normalized, `zExp' must be 0; in that case, the result
981 1.7 thorpej | returned is a subnormal number, and it must not require rounding. In the
982 1.7 thorpej | usual case that the input significand is normalized, `zExp' must be 1 less
983 1.7 thorpej | than the ``true'' floating-point exponent. The handling of underflow and
984 1.7 thorpej | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
985 1.7 thorpej *----------------------------------------------------------------------------*/
986 1.7 thorpej
987 1.1 ross static float128
988 1.1 ross roundAndPackFloat128(
989 1.1 ross flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
990 1.1 ross {
/*
 * zSig0:zSig1 is the significand that is kept; zSig2 holds the bits to be
 * rounded away.  `increment' records whether the kept significand must be
 * bumped by one unit in its last place.
 */
991 1.1 ross int8 roundingMode;
992 1.1 ross flag roundNearestEven, increment, isTiny;
993 1.1 ross
994 1.1 ross roundingMode = float_rounding_mode();
995 1.1 ross roundNearestEven = ( roundingMode == float_round_nearest_even );
/* Default (nearest-even): increment when the top bit of zSig2 is set. */
996 1.1 ross increment = ( (sbits64) zSig2 < 0 );
/* Directed modes: never increment for round-to-zero; otherwise increment
   only when zSig2 is nonzero and the mode is round-down for a negative
   value or round-up for a positive one. */
997 1.1 ross if ( ! roundNearestEven ) {
998 1.1 ross if ( roundingMode == float_round_to_zero ) {
999 1.1 ross increment = 0;
1000 1.1 ross }
1001 1.1 ross else {
1002 1.1 ross if ( zSign ) {
1003 1.1 ross increment = ( roundingMode == float_round_down ) && zSig2;
1004 1.1 ross }
1005 1.1 ross else {
1006 1.1 ross increment = ( roundingMode == float_round_up ) && zSig2;
1007 1.1 ross }
1008 1.1 ross }
1009 1.1 ross }
/* One comparison covers both out-of-range cases: it is true when
   zExp >= 0x7FFD, and also when zExp is negative provided bits32 is an
   unsigned 32-bit type (bits32 is declared outside this section). */
1010 1.1 ross if ( 0x7FFD <= (bits32) zExp ) {
/* Overflow: zExp above 0x7FFD, or equal to it with zSig0:zSig1 equal to
   0x0001FFFFFFFFFFFF:0xFFFFFFFFFFFFFFFF and an increment pending. */
1011 1.1 ross if ( ( 0x7FFD < zExp )
1012 1.1 ross || ( ( zExp == 0x7FFD )
1013 1.1 ross && eq128(
1014 1.1 ross LIT64( 0x0001FFFFFFFFFFFF ),
1015 1.1 ross LIT64( 0xFFFFFFFFFFFFFFFF ),
1016 1.1 ross zSig0,
1017 1.1 ross zSig1
1018 1.1 ross )
1019 1.1 ross && increment
1020 1.1 ross )
1021 1.1 ross ) {
/* Modes that truncate for this sign return exponent 0x7FFE with an
   all-ones fraction; all other modes return exponent 0x7FFF with a zero
   fraction. */
1022 1.1 ross float_raise( float_flag_overflow | float_flag_inexact );
1023 1.1 ross if ( ( roundingMode == float_round_to_zero )
1024 1.1 ross || ( zSign && ( roundingMode == float_round_up ) )
1025 1.1 ross || ( ! zSign && ( roundingMode == float_round_down ) )
1026 1.1 ross ) {
1027 1.1 ross return
1028 1.1 ross packFloat128(
1029 1.1 ross zSign,
1030 1.1 ross 0x7FFE,
1031 1.1 ross LIT64( 0x0000FFFFFFFFFFFF ),
1032 1.1 ross LIT64( 0xFFFFFFFFFFFFFFFF )
1033 1.1 ross );
1034 1.1 ross }
1035 1.1 ross return packFloat128( zSign, 0x7FFF, 0, 0 );
1036 1.1 ross }
1037 1.1 ross if ( zExp < 0 ) {
/* Subnormal result.  isTiny holds when tininess is detected before
   rounding, or zExp is below -1, or no increment is pending, or
   zSig0:zSig1 is below 0x0001FFFFFFFFFFFF:0xFFFFFFFFFFFFFFFF. */
1038 1.1 ross isTiny =
1039 1.1 ross ( float_detect_tininess == float_tininess_before_rounding )
1040 1.1 ross || ( zExp < -1 )
1041 1.1 ross || ! increment
1042 1.1 ross || lt128(
1043 1.1 ross zSig0,
1044 1.1 ross zSig1,
1045 1.1 ross LIT64( 0x0001FFFFFFFFFFFF ),
1046 1.1 ross LIT64( 0xFFFFFFFFFFFFFFFF )
1047 1.1 ross );
/* Shift all three words right by -zExp, then recompute the rounding
   decision from the shifted zSig2. */
1048 1.1 ross shift128ExtraRightJamming(
1049 1.1 ross zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1050 1.1 ross zExp = 0;
/* Underflow is raised only when the result is both tiny and inexact; the
   inexact flag itself is set by the common code after this block. */
1051 1.1 ross if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1052 1.1 ross if ( roundNearestEven ) {
1053 1.1 ross increment = ( (sbits64) zSig2 < 0 );
1054 1.1 ross }
1055 1.1 ross else {
1056 1.1 ross if ( zSign ) {
1057 1.1 ross increment = ( roundingMode == float_round_down ) && zSig2;
1058 1.1 ross }
1059 1.1 ross else {
1060 1.1 ross increment = ( roundingMode == float_round_up ) && zSig2;
1061 1.1 ross }
1062 1.1 ross }
1063 1.1 ross }
1064 1.1 ross }
/* Common tail for in-range and subnormal results. */
1065 1.1 ross if ( zSig2 ) float_set_inexact();
1066 1.1 ross if ( increment ) {
/* Add 1 to the 128-bit significand.  Tie under nearest-even:
   zSig2 + zSig2 == 0 means only its top bit was set, so bit 0 of zSig1 is
   cleared.  In other modes the mask is all ones. */
1067 1.1 ross add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1068 1.1 ross zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1069 1.1 ross }
1070 1.1 ross else {
/* A zero significand is packed with a zero exponent. */
1071 1.1 ross if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1072 1.1 ross }
1073 1.1 ross return packFloat128( zSign, zExp, zSig0, zSig1 );
1074 1.1 ross
1075 1.1 ross }
1076 1.1 ross
1077 1.7 thorpej /*----------------------------------------------------------------------------
1078 1.7 thorpej | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1079 1.7 thorpej | and significand formed by the concatenation of `zSig0' and `zSig1', and
1080 1.7 thorpej | returns the proper quadruple-precision floating-point value corresponding
1081 1.7 thorpej | to the abstract input. This routine is just like `roundAndPackFloat128'
1082 1.7 thorpej | except that the input significand has fewer bits and does not have to be
1083 1.7 thorpej | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1084 1.7 thorpej | point exponent.
1085 1.7 thorpej *----------------------------------------------------------------------------*/
1086 1.7 thorpej
1087 1.1 ross static float128
1088 1.1 ross normalizeRoundAndPackFloat128(
1089 1.1 ross flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1090 1.1 ross {
1091 1.1 ross int8 shiftCount;
1092 1.1 ross bits64 zSig2;
1093 1.1 ross
1094 1.1 ross if ( zSig0 == 0 ) {
1095 1.1 ross zSig0 = zSig1;
1096 1.1 ross zSig1 = 0;
1097 1.1 ross zExp -= 64;
1098 1.1 ross }
1099 1.1 ross shiftCount = countLeadingZeros64( zSig0 ) - 15;
1100 1.1 ross if ( 0 <= shiftCount ) {
1101 1.1 ross zSig2 = 0;
1102 1.1 ross shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1103 1.1 ross }
1104 1.1 ross else {
1105 1.1 ross shift128ExtraRightJamming(
1106 1.1 ross zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1107 1.1 ross }
1108 1.1 ross zExp -= shiftCount;
1109 1.1 ross return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1110 1.1 ross
1111 1.1 ross }
1112 1.1 ross
1113 1.1 ross #endif
1114 1.1 ross
1115 1.7 thorpej /*----------------------------------------------------------------------------
1116 1.7 thorpej | Returns the result of converting the 32-bit two's complement integer `a'
1117 1.7 thorpej | to the single-precision floating-point format. The conversion is performed
1118 1.7 thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1119 1.7 thorpej *----------------------------------------------------------------------------*/
1120 1.7 thorpej
1121 1.1 ross float32 int32_to_float32( int32 a )
1122 1.1 ross {
1123 1.1 ross flag zSign;
1124 1.1 ross
1125 1.1 ross if ( a == 0 ) return 0;
1126 1.1 ross if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1127 1.1 ross zSign = ( a < 0 );
1128 1.1 ross return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1129 1.1 ross
1130 1.1 ross }
1131 1.1 ross
1132 1.7 thorpej /*----------------------------------------------------------------------------
1133 1.7 thorpej | Returns the result of converting the 32-bit two's complement integer `a'
1134 1.7 thorpej | to the double-precision floating-point format. The conversion is performed
1135 1.7 thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1136 1.7 thorpej *----------------------------------------------------------------------------*/
1137 1.7 thorpej
1138 1.1 ross float64 int32_to_float64( int32 a )
1139 1.1 ross {
1140 1.1 ross flag zSign;
1141 1.1 ross uint32 absA;
1142 1.1 ross int8 shiftCount;
1143 1.1 ross bits64 zSig;
1144 1.1 ross
1145 1.1 ross if ( a == 0 ) return 0;
1146 1.1 ross zSign = ( a < 0 );
1147 1.1 ross absA = zSign ? - a : a;
1148 1.1 ross shiftCount = countLeadingZeros32( absA ) + 21;
1149 1.1 ross zSig = absA;
1150 1.1 ross return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1151 1.1 ross
1152 1.1 ross }
1153 1.1 ross
1154 1.1 ross #ifdef FLOATX80
1155 1.1 ross
1156 1.7 thorpej /*----------------------------------------------------------------------------
1157 1.7 thorpej | Returns the result of converting the 32-bit two's complement integer `a'
1158 1.7 thorpej | to the extended double-precision floating-point format. The conversion
1159 1.7 thorpej | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1160 1.7 thorpej | Arithmetic.
1161 1.7 thorpej *----------------------------------------------------------------------------*/
1162 1.7 thorpej
1163 1.1 ross floatx80 int32_to_floatx80( int32 a )
1164 1.1 ross {
1165 1.1 ross flag zSign;
1166 1.1 ross uint32 absA;
1167 1.1 ross int8 shiftCount;
1168 1.1 ross bits64 zSig;
1169 1.1 ross
1170 1.1 ross if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1171 1.1 ross zSign = ( a < 0 );
1172 1.1 ross absA = zSign ? - a : a;
1173 1.1 ross shiftCount = countLeadingZeros32( absA ) + 32;
1174 1.1 ross zSig = absA;
1175 1.1 ross return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1176 1.1 ross
1177 1.1 ross }
1178 1.1 ross
1179 1.1 ross #endif
1180 1.1 ross
1181 1.1 ross #ifdef FLOAT128
1182 1.1 ross
1183 1.7 thorpej /*----------------------------------------------------------------------------
1184 1.7 thorpej | Returns the result of converting the 32-bit two's complement integer `a' to
1185 1.7 thorpej | the quadruple-precision floating-point format. The conversion is performed
1186 1.7 thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1187 1.7 thorpej *----------------------------------------------------------------------------*/
1188 1.7 thorpej
1189 1.1 ross float128 int32_to_float128( int32 a )
1190 1.1 ross {
1191 1.1 ross flag zSign;
1192 1.1 ross uint32 absA;
1193 1.1 ross int8 shiftCount;
1194 1.1 ross bits64 zSig0;
1195 1.1 ross
1196 1.1 ross if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1197 1.1 ross zSign = ( a < 0 );
1198 1.1 ross absA = zSign ? - a : a;
1199 1.1 ross shiftCount = countLeadingZeros32( absA ) + 17;
1200 1.1 ross zSig0 = absA;
1201 1.1 ross return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1202 1.1 ross
1203 1.1 ross }
1204 1.1 ross
1205 1.1 ross #endif
1206 1.1 ross
1207 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1208 1.7 thorpej /*----------------------------------------------------------------------------
1209 1.7 thorpej | Returns the result of converting the 64-bit two's complement integer `a'
1210 1.7 thorpej | to the single-precision floating-point format. The conversion is performed
1211 1.7 thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1212 1.7 thorpej *----------------------------------------------------------------------------*/
1213 1.7 thorpej
1214 1.1 ross float32 int64_to_float32( int64 a )
1215 1.1 ross {
1216 1.1 ross flag zSign;
1217 1.1 ross uint64 absA;
1218 1.1 ross int8 shiftCount;
1219 1.1 ross
1220 1.1 ross if ( a == 0 ) return 0;
1221 1.1 ross zSign = ( a < 0 );
1222 1.1 ross absA = zSign ? - a : a;
1223 1.1 ross shiftCount = countLeadingZeros64( absA ) - 40;
1224 1.1 ross if ( 0 <= shiftCount ) {
1225 1.1 ross return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1226 1.1 ross }
1227 1.1 ross else {
1228 1.1 ross shiftCount += 7;
1229 1.1 ross if ( shiftCount < 0 ) {
1230 1.1 ross shift64RightJamming( absA, - shiftCount, &absA );
1231 1.1 ross }
1232 1.1 ross else {
1233 1.1 ross absA <<= shiftCount;
1234 1.1 ross }
1235 1.1 ross return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1236 1.1 ross }
1237 1.1 ross
1238 1.1 ross }
1239 1.1 ross
1240 1.7 thorpej /*----------------------------------------------------------------------------
1241 1.7 thorpej | Returns the result of converting the 64-bit two's complement integer `a'
1242 1.7 thorpej | to the double-precision floating-point format. The conversion is performed
1243 1.7 thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1244 1.7 thorpej *----------------------------------------------------------------------------*/
1245 1.7 thorpej
1246 1.1 ross float64 int64_to_float64( int64 a )
1247 1.1 ross {
1248 1.1 ross flag zSign;
1249 1.1 ross
1250 1.1 ross if ( a == 0 ) return 0;
1251 1.1 ross if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1252 1.1 ross return packFloat64( 1, 0x43E, 0 );
1253 1.1 ross }
1254 1.1 ross zSign = ( a < 0 );
1255 1.1 ross return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1256 1.1 ross
1257 1.1 ross }
1258 1.1 ross
1259 1.1 ross #ifdef FLOATX80
1260 1.1 ross
1261 1.7 thorpej /*----------------------------------------------------------------------------
1262 1.7 thorpej | Returns the result of converting the 64-bit two's complement integer `a'
1263 1.7 thorpej | to the extended double-precision floating-point format. The conversion
1264 1.7 thorpej | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1265 1.7 thorpej | Arithmetic.
1266 1.7 thorpej *----------------------------------------------------------------------------*/
1267 1.7 thorpej
1268 1.1 ross floatx80 int64_to_floatx80( int64 a )
1269 1.1 ross {
1270 1.1 ross flag zSign;
1271 1.1 ross uint64 absA;
1272 1.1 ross int8 shiftCount;
1273 1.1 ross
1274 1.1 ross if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1275 1.1 ross zSign = ( a < 0 );
1276 1.1 ross absA = zSign ? - a : a;
1277 1.1 ross shiftCount = countLeadingZeros64( absA );
1278 1.1 ross return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1279 1.1 ross
1280 1.1 ross }
1281 1.1 ross
1282 1.1 ross #endif
1283 1.1 ross
1284 1.1 ross #ifdef FLOAT128
1285 1.1 ross
1286 1.7 thorpej /*----------------------------------------------------------------------------
1287 1.7 thorpej | Returns the result of converting the 64-bit two's complement integer `a' to
1288 1.7 thorpej | the quadruple-precision floating-point format. The conversion is performed
1289 1.7 thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1290 1.7 thorpej *----------------------------------------------------------------------------*/
1291 1.7 thorpej
1292 1.1 ross float128 int64_to_float128( int64 a )
1293 1.1 ross {
1294 1.1 ross flag zSign;
1295 1.1 ross uint64 absA;
1296 1.1 ross int8 shiftCount;
1297 1.1 ross int32 zExp;
1298 1.1 ross bits64 zSig0, zSig1;
1299 1.1 ross
1300 1.1 ross if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1301 1.1 ross zSign = ( a < 0 );
1302 1.1 ross absA = zSign ? - a : a;
1303 1.1 ross shiftCount = countLeadingZeros64( absA ) + 49;
1304 1.1 ross zExp = 0x406E - shiftCount;
1305 1.1 ross if ( 64 <= shiftCount ) {
1306 1.1 ross zSig1 = 0;
1307 1.1 ross zSig0 = absA;
1308 1.1 ross shiftCount -= 64;
1309 1.1 ross }
1310 1.1 ross else {
1311 1.1 ross zSig1 = absA;
1312 1.1 ross zSig0 = 0;
1313 1.1 ross }
1314 1.1 ross shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1315 1.1 ross return packFloat128( zSign, zExp, zSig0, zSig1 );
1316 1.1 ross
1317 1.1 ross }
1318 1.1 ross
1319 1.1 ross #endif
1320 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
1321 1.1 ross
1322 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1323 1.7 thorpej /*----------------------------------------------------------------------------
1324 1.7 thorpej | Returns the result of converting the single-precision floating-point value
1325 1.7 thorpej | `a' to the 32-bit two's complement integer format. The conversion is
1326 1.7 thorpej | performed according to the IEC/IEEE Standard for Binary Floating-Point
1327 1.7 thorpej | Arithmetic---which means in particular that the conversion is rounded
1328 1.7 thorpej | according to the current rounding mode. If `a' is a NaN, the largest
1329 1.7 thorpej | positive integer is returned. Otherwise, if the conversion overflows, the
1330 1.7 thorpej | largest integer with the same sign as `a' is returned.
1331 1.7 thorpej *----------------------------------------------------------------------------*/
1332 1.7 thorpej
1333 1.1 ross int32 float32_to_int32( float32 a )
1334 1.1 ross {
1335 1.1 ross flag aSign;
1336 1.1 ross int16 aExp, shiftCount;
1337 1.1 ross bits32 aSig;
1338 1.1 ross bits64 aSig64;
1339 1.1 ross
1340 1.1 ross aSig = extractFloat32Frac( a );
1341 1.1 ross aExp = extractFloat32Exp( a );
1342 1.1 ross aSign = extractFloat32Sign( a );
1343 1.1 ross if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1344 1.1 ross if ( aExp ) aSig |= 0x00800000;
1345 1.1 ross shiftCount = 0xAF - aExp;
1346 1.1 ross aSig64 = aSig;
1347 1.1 ross aSig64 <<= 32;
1348 1.1 ross if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1349 1.1 ross return roundAndPackInt32( aSign, aSig64 );
1350 1.1 ross
1351 1.1 ross }
1352 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
1353 1.1 ross
1354 1.7 thorpej /*----------------------------------------------------------------------------
1355 1.7 thorpej | Returns the result of converting the single-precision floating-point value
1356 1.7 thorpej | `a' to the 32-bit two's complement integer format. The conversion is
1357 1.7 thorpej | performed according to the IEC/IEEE Standard for Binary Floating-Point
1358 1.7 thorpej | Arithmetic, except that the conversion is always rounded toward zero.
1359 1.7 thorpej | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1360 1.7 thorpej | the conversion overflows, the largest integer with the same sign as `a' is
1361 1.7 thorpej | returned.
1362 1.7 thorpej *----------------------------------------------------------------------------*/
1363 1.7 thorpej
1364 1.1 ross int32 float32_to_int32_round_to_zero( float32 a )
1365 1.1 ross {
1366 1.1 ross flag aSign;
1367 1.1 ross int16 aExp, shiftCount;
1368 1.1 ross bits32 aSig;
1369 1.1 ross int32 z;
1370 1.1 ross
1371 1.1 ross aSig = extractFloat32Frac( a );
1372 1.1 ross aExp = extractFloat32Exp( a );
1373 1.1 ross aSign = extractFloat32Sign( a );
1374 1.1 ross shiftCount = aExp - 0x9E;
1375 1.1 ross if ( 0 <= shiftCount ) {
1376 1.1 ross if ( a != 0xCF000000 ) {
1377 1.1 ross float_raise( float_flag_invalid );
1378 1.1 ross if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1379 1.1 ross }
1380 1.1 ross return (sbits32) 0x80000000;
1381 1.1 ross }
1382 1.1 ross else if ( aExp <= 0x7E ) {
1383 1.1 ross if ( aExp | aSig ) float_set_inexact();
1384 1.1 ross return 0;
1385 1.1 ross }
1386 1.1 ross aSig = ( aSig | 0x00800000 )<<8;
1387 1.1 ross z = aSig>>( - shiftCount );
1388 1.1 ross if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1389 1.1 ross float_set_inexact();
1390 1.1 ross }
1391 1.1 ross if ( aSign ) z = - z;
1392 1.1 ross return z;
1393 1.1 ross
1394 1.1 ross }
1395 1.1 ross
1396 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1397 1.7 thorpej /*----------------------------------------------------------------------------
1398 1.7 thorpej | Returns the result of converting the single-precision floating-point value
1399 1.7 thorpej | `a' to the 64-bit two's complement integer format. The conversion is
1400 1.7 thorpej | performed according to the IEC/IEEE Standard for Binary Floating-Point
1401 1.7 thorpej | Arithmetic---which means in particular that the conversion is rounded
1402 1.7 thorpej | according to the current rounding mode. If `a' is a NaN, the largest
1403 1.7 thorpej | positive integer is returned. Otherwise, if the conversion overflows, the
1404 1.7 thorpej | largest integer with the same sign as `a' is returned.
1405 1.7 thorpej *----------------------------------------------------------------------------*/
1406 1.7 thorpej
1407 1.1 ross int64 float32_to_int64( float32 a )
1408 1.1 ross {
1409 1.1 ross flag aSign;
1410 1.1 ross int16 aExp, shiftCount;
1411 1.1 ross bits32 aSig;
1412 1.1 ross bits64 aSig64, aSigExtra;
1413 1.1 ross
1414 1.1 ross aSig = extractFloat32Frac( a );
1415 1.1 ross aExp = extractFloat32Exp( a );
1416 1.1 ross aSign = extractFloat32Sign( a );
1417 1.1 ross shiftCount = 0xBE - aExp;
1418 1.1 ross if ( shiftCount < 0 ) {
1419 1.1 ross float_raise( float_flag_invalid );
1420 1.1 ross if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1421 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
1422 1.1 ross }
1423 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
1424 1.1 ross }
1425 1.1 ross if ( aExp ) aSig |= 0x00800000;
1426 1.1 ross aSig64 = aSig;
1427 1.1 ross aSig64 <<= 40;
1428 1.1 ross shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1429 1.1 ross return roundAndPackInt64( aSign, aSig64, aSigExtra );
1430 1.1 ross
1431 1.1 ross }
1432 1.1 ross
1433 1.7 thorpej /*----------------------------------------------------------------------------
1434 1.7 thorpej | Returns the result of converting the single-precision floating-point value
1435 1.7 thorpej | `a' to the 64-bit two's complement integer format. The conversion is
1436 1.7 thorpej | performed according to the IEC/IEEE Standard for Binary Floating-Point
1437 1.7 thorpej | Arithmetic, except that the conversion is always rounded toward zero. If
1438 1.7 thorpej | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1439 1.7 thorpej | conversion overflows, the largest integer with the same sign as `a' is
1440 1.7 thorpej | returned.
1441 1.7 thorpej *----------------------------------------------------------------------------*/
1442 1.7 thorpej
1443 1.1 ross int64 float32_to_int64_round_to_zero( float32 a )
1444 1.1 ross {
1445 1.1 ross flag aSign;
1446 1.1 ross int16 aExp, shiftCount;
1447 1.1 ross bits32 aSig;
1448 1.1 ross bits64 aSig64;
1449 1.1 ross int64 z;
1450 1.1 ross
1451 1.1 ross aSig = extractFloat32Frac( a );
1452 1.1 ross aExp = extractFloat32Exp( a );
1453 1.1 ross aSign = extractFloat32Sign( a );
1454 1.1 ross shiftCount = aExp - 0xBE;
1455 1.1 ross if ( 0 <= shiftCount ) {
1456 1.1 ross if ( a != 0xDF000000 ) {
1457 1.1 ross float_raise( float_flag_invalid );
1458 1.1 ross if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1459 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
1460 1.1 ross }
1461 1.1 ross }
1462 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
1463 1.1 ross }
1464 1.1 ross else if ( aExp <= 0x7E ) {
1465 1.1 ross if ( aExp | aSig ) float_set_inexact();
1466 1.1 ross return 0;
1467 1.1 ross }
1468 1.1 ross aSig64 = aSig | 0x00800000;
1469 1.1 ross aSig64 <<= 40;
1470 1.1 ross z = aSig64>>( - shiftCount );
1471 1.1 ross if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1472 1.1 ross float_set_inexact();
1473 1.1 ross }
1474 1.1 ross if ( aSign ) z = - z;
1475 1.1 ross return z;
1476 1.1 ross
1477 1.1 ross }
1478 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
1479 1.1 ross
1480 1.7 thorpej /*----------------------------------------------------------------------------
1481 1.7 thorpej | Returns the result of converting the single-precision floating-point value
1482 1.7 thorpej | `a' to the double-precision floating-point format. The conversion is
1483 1.7 thorpej | performed according to the IEC/IEEE Standard for Binary Floating-Point
1484 1.7 thorpej | Arithmetic.
1485 1.7 thorpej *----------------------------------------------------------------------------*/
1486 1.7 thorpej
1487 1.1 ross float64 float32_to_float64( float32 a )
1488 1.1 ross {
1489 1.1 ross flag aSign;
1490 1.1 ross int16 aExp;
1491 1.1 ross bits32 aSig;
1492 1.1 ross
1493 1.1 ross aSig = extractFloat32Frac( a );
1494 1.1 ross aExp = extractFloat32Exp( a );
1495 1.1 ross aSign = extractFloat32Sign( a );
1496 1.1 ross if ( aExp == 0xFF ) {
1497 1.1 ross if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1498 1.1 ross return packFloat64( aSign, 0x7FF, 0 );
1499 1.1 ross }
1500 1.1 ross if ( aExp == 0 ) {
1501 1.1 ross if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1502 1.1 ross normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1503 1.1 ross --aExp;
1504 1.1 ross }
1505 1.1 ross return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1506 1.1 ross
1507 1.1 ross }
1508 1.1 ross
1509 1.1 ross #ifdef FLOATX80
1510 1.1 ross
1511 1.7 thorpej /*----------------------------------------------------------------------------
1512 1.7 thorpej | Returns the result of converting the single-precision floating-point value
1513 1.7 thorpej | `a' to the extended double-precision floating-point format. The conversion
1514 1.7 thorpej | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1515 1.7 thorpej | Arithmetic.
1516 1.7 thorpej *----------------------------------------------------------------------------*/
1517 1.7 thorpej
1518 1.1 ross floatx80 float32_to_floatx80( float32 a )
1519 1.1 ross {
1520 1.1 ross flag aSign;
1521 1.1 ross int16 aExp;
1522 1.1 ross bits32 aSig;
1523 1.1 ross
1524 1.1 ross aSig = extractFloat32Frac( a );
1525 1.1 ross aExp = extractFloat32Exp( a );
1526 1.1 ross aSign = extractFloat32Sign( a );
1527 1.1 ross if ( aExp == 0xFF ) {
1528 1.1 ross if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1529 1.1 ross return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1530 1.1 ross }
1531 1.1 ross if ( aExp == 0 ) {
1532 1.1 ross if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1533 1.1 ross normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1534 1.1 ross }
1535 1.1 ross aSig |= 0x00800000;
1536 1.1 ross return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1537 1.1 ross
1538 1.1 ross }
1539 1.1 ross
1540 1.1 ross #endif
1541 1.1 ross
1542 1.1 ross #ifdef FLOAT128
1543 1.1 ross
1544 1.7 thorpej /*----------------------------------------------------------------------------
1545 1.7 thorpej | Returns the result of converting the single-precision floating-point value
1546 1.7 thorpej | `a' to the double-precision floating-point format. The conversion is
1547 1.7 thorpej | performed according to the IEC/IEEE Standard for Binary Floating-Point
1548 1.7 thorpej | Arithmetic.
1549 1.7 thorpej *----------------------------------------------------------------------------*/
1550 1.7 thorpej
1551 1.1 ross float128 float32_to_float128( float32 a )
1552 1.1 ross {
1553 1.1 ross flag aSign;
1554 1.1 ross int16 aExp;
1555 1.1 ross bits32 aSig;
1556 1.1 ross
1557 1.1 ross aSig = extractFloat32Frac( a );
1558 1.1 ross aExp = extractFloat32Exp( a );
1559 1.1 ross aSign = extractFloat32Sign( a );
1560 1.1 ross if ( aExp == 0xFF ) {
1561 1.1 ross if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1562 1.1 ross return packFloat128( aSign, 0x7FFF, 0, 0 );
1563 1.1 ross }
1564 1.1 ross if ( aExp == 0 ) {
1565 1.1 ross if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1566 1.1 ross normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1567 1.1 ross --aExp;
1568 1.1 ross }
1569 1.1 ross return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1570 1.1 ross
1571 1.1 ross }
1572 1.1 ross
1573 1.1 ross #endif
1574 1.1 ross
1575 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1576 1.7 thorpej /*----------------------------------------------------------------------------
1577 1.7 thorpej | Rounds the single-precision floating-point value `a' to an integer, and
1578 1.7 thorpej | returns the result as a single-precision floating-point value. The
1579 1.7 thorpej | operation is performed according to the IEC/IEEE Standard for Binary
1580 1.7 thorpej | Floating-Point Arithmetic.
1581 1.7 thorpej *----------------------------------------------------------------------------*/
1582 1.7 thorpej
1583 1.1 ross float32 float32_round_to_int( float32 a )
1584 1.1 ross {
1585 1.1 ross flag aSign;
1586 1.1 ross int16 aExp;
1587 1.1 ross bits32 lastBitMask, roundBitsMask;
1588 1.1 ross int8 roundingMode;
1589 1.1 ross float32 z;
1590 1.1 ross
1591 1.1 ross aExp = extractFloat32Exp( a );
1592 1.1 ross if ( 0x96 <= aExp ) {
1593 1.1 ross if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1594 1.1 ross return propagateFloat32NaN( a, a );
1595 1.1 ross }
1596 1.1 ross return a;
1597 1.1 ross }
1598 1.1 ross if ( aExp <= 0x7E ) {
1599 1.1 ross if ( (bits32) ( a<<1 ) == 0 ) return a;
1600 1.1 ross float_set_inexact();
1601 1.1 ross aSign = extractFloat32Sign( a );
1602 1.1 ross switch ( float_rounding_mode() ) {
1603 1.1 ross case float_round_nearest_even:
1604 1.1 ross if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1605 1.1 ross return packFloat32( aSign, 0x7F, 0 );
1606 1.1 ross }
1607 1.1 ross break;
1608 1.1 ross case float_round_down:
1609 1.1 ross return aSign ? 0xBF800000 : 0;
1610 1.1 ross case float_round_up:
1611 1.1 ross return aSign ? 0x80000000 : 0x3F800000;
1612 1.1 ross }
1613 1.1 ross return packFloat32( aSign, 0, 0 );
1614 1.1 ross }
1615 1.1 ross lastBitMask = 1;
1616 1.1 ross lastBitMask <<= 0x96 - aExp;
1617 1.1 ross roundBitsMask = lastBitMask - 1;
1618 1.1 ross z = a;
1619 1.1 ross roundingMode = float_rounding_mode();
1620 1.1 ross if ( roundingMode == float_round_nearest_even ) {
1621 1.1 ross z += lastBitMask>>1;
1622 1.1 ross if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1623 1.1 ross }
1624 1.1 ross else if ( roundingMode != float_round_to_zero ) {
1625 1.1 ross if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1626 1.1 ross z += roundBitsMask;
1627 1.1 ross }
1628 1.1 ross }
1629 1.1 ross z &= ~ roundBitsMask;
1630 1.1 ross if ( z != a ) float_set_inexact();
1631 1.1 ross return z;
1632 1.1 ross
1633 1.1 ross }
1634 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
1635 1.1 ross
1636 1.7 thorpej /*----------------------------------------------------------------------------
1637 1.7 thorpej | Returns the result of adding the absolute values of the single-precision
1638 1.7 thorpej | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1639 1.7 thorpej | before being returned. `zSign' is ignored if the result is a NaN.
1640 1.7 thorpej | The addition is performed according to the IEC/IEEE Standard for Binary
1641 1.7 thorpej | Floating-Point Arithmetic.
1642 1.7 thorpej *----------------------------------------------------------------------------*/
1643 1.7 thorpej
1644 1.1 ross static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1645 1.1 ross {
1646 1.1 ross int16 aExp, bExp, zExp;
1647 1.1 ross bits32 aSig, bSig, zSig;
1648 1.1 ross int16 expDiff;
1649 1.1 ross
1650 1.1 ross aSig = extractFloat32Frac( a );
1651 1.1 ross aExp = extractFloat32Exp( a );
1652 1.1 ross bSig = extractFloat32Frac( b );
1653 1.1 ross bExp = extractFloat32Exp( b );
1654 1.1 ross expDiff = aExp - bExp;
1655 1.1 ross aSig <<= 6;
1656 1.1 ross bSig <<= 6;
1657 1.1 ross if ( 0 < expDiff ) {
1658 1.1 ross if ( aExp == 0xFF ) {
1659 1.1 ross if ( aSig ) return propagateFloat32NaN( a, b );
1660 1.1 ross return a;
1661 1.1 ross }
1662 1.1 ross if ( bExp == 0 ) {
1663 1.1 ross --expDiff;
1664 1.1 ross }
1665 1.1 ross else {
1666 1.1 ross bSig |= 0x20000000;
1667 1.1 ross }
1668 1.1 ross shift32RightJamming( bSig, expDiff, &bSig );
1669 1.1 ross zExp = aExp;
1670 1.1 ross }
1671 1.1 ross else if ( expDiff < 0 ) {
1672 1.1 ross if ( bExp == 0xFF ) {
1673 1.1 ross if ( bSig ) return propagateFloat32NaN( a, b );
1674 1.1 ross return packFloat32( zSign, 0xFF, 0 );
1675 1.1 ross }
1676 1.1 ross if ( aExp == 0 ) {
1677 1.1 ross ++expDiff;
1678 1.1 ross }
1679 1.1 ross else {
1680 1.1 ross aSig |= 0x20000000;
1681 1.1 ross }
1682 1.1 ross shift32RightJamming( aSig, - expDiff, &aSig );
1683 1.1 ross zExp = bExp;
1684 1.1 ross }
1685 1.1 ross else {
1686 1.1 ross if ( aExp == 0xFF ) {
1687 1.1 ross if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1688 1.1 ross return a;
1689 1.1 ross }
1690 1.1 ross if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1691 1.1 ross zSig = 0x40000000 + aSig + bSig;
1692 1.1 ross zExp = aExp;
1693 1.1 ross goto roundAndPack;
1694 1.1 ross }
1695 1.1 ross aSig |= 0x20000000;
1696 1.1 ross zSig = ( aSig + bSig )<<1;
1697 1.1 ross --zExp;
1698 1.1 ross if ( (sbits32) zSig < 0 ) {
1699 1.1 ross zSig = aSig + bSig;
1700 1.1 ross ++zExp;
1701 1.1 ross }
1702 1.1 ross roundAndPack:
1703 1.1 ross return roundAndPackFloat32( zSign, zExp, zSig );
1704 1.1 ross
1705 1.1 ross }
1706 1.1 ross
1707 1.7 thorpej /*----------------------------------------------------------------------------
1708 1.7 thorpej | Returns the result of subtracting the absolute values of the single-
1709 1.7 thorpej | precision floating-point values `a' and `b'. If `zSign' is 1, the
1710 1.7 thorpej | difference is negated before being returned. `zSign' is ignored if the
1711 1.7 thorpej | result is a NaN. The subtraction is performed according to the IEC/IEEE
1712 1.7 thorpej | Standard for Binary Floating-Point Arithmetic.
1713 1.7 thorpej *----------------------------------------------------------------------------*/
1714 1.7 thorpej
1715 1.1 ross static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1716 1.1 ross {
1717 1.1 ross int16 aExp, bExp, zExp;
1718 1.1 ross bits32 aSig, bSig, zSig;
1719 1.1 ross int16 expDiff;
1720 1.1 ross
1721 1.1 ross aSig = extractFloat32Frac( a );
1722 1.1 ross aExp = extractFloat32Exp( a );
1723 1.1 ross bSig = extractFloat32Frac( b );
1724 1.1 ross bExp = extractFloat32Exp( b );
1725 1.1 ross expDiff = aExp - bExp;
1726 1.1 ross aSig <<= 7;
1727 1.1 ross bSig <<= 7;
1728 1.1 ross if ( 0 < expDiff ) goto aExpBigger;
1729 1.1 ross if ( expDiff < 0 ) goto bExpBigger;
1730 1.1 ross if ( aExp == 0xFF ) {
1731 1.1 ross if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1732 1.1 ross float_raise( float_flag_invalid );
1733 1.1 ross return float32_default_nan;
1734 1.1 ross }
1735 1.1 ross if ( aExp == 0 ) {
1736 1.1 ross aExp = 1;
1737 1.1 ross bExp = 1;
1738 1.1 ross }
1739 1.1 ross if ( bSig < aSig ) goto aBigger;
1740 1.1 ross if ( aSig < bSig ) goto bBigger;
1741 1.1 ross return packFloat32( float_rounding_mode() == float_round_down, 0, 0 );
1742 1.1 ross bExpBigger:
1743 1.1 ross if ( bExp == 0xFF ) {
1744 1.1 ross if ( bSig ) return propagateFloat32NaN( a, b );
1745 1.1 ross return packFloat32( zSign ^ 1, 0xFF, 0 );
1746 1.1 ross }
1747 1.1 ross if ( aExp == 0 ) {
1748 1.1 ross ++expDiff;
1749 1.1 ross }
1750 1.1 ross else {
1751 1.1 ross aSig |= 0x40000000;
1752 1.1 ross }
1753 1.1 ross shift32RightJamming( aSig, - expDiff, &aSig );
1754 1.1 ross bSig |= 0x40000000;
1755 1.1 ross bBigger:
1756 1.1 ross zSig = bSig - aSig;
1757 1.1 ross zExp = bExp;
1758 1.1 ross zSign ^= 1;
1759 1.1 ross goto normalizeRoundAndPack;
1760 1.1 ross aExpBigger:
1761 1.1 ross if ( aExp == 0xFF ) {
1762 1.1 ross if ( aSig ) return propagateFloat32NaN( a, b );
1763 1.1 ross return a;
1764 1.1 ross }
1765 1.1 ross if ( bExp == 0 ) {
1766 1.1 ross --expDiff;
1767 1.1 ross }
1768 1.1 ross else {
1769 1.1 ross bSig |= 0x40000000;
1770 1.1 ross }
1771 1.1 ross shift32RightJamming( bSig, expDiff, &bSig );
1772 1.1 ross aSig |= 0x40000000;
1773 1.1 ross aBigger:
1774 1.1 ross zSig = aSig - bSig;
1775 1.1 ross zExp = aExp;
1776 1.1 ross normalizeRoundAndPack:
1777 1.1 ross --zExp;
1778 1.1 ross return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1779 1.1 ross
1780 1.1 ross }
1781 1.1 ross
1782 1.7 thorpej /*----------------------------------------------------------------------------
1783 1.7 thorpej | Returns the result of adding the single-precision floating-point values `a'
1784 1.7 thorpej | and `b'. The operation is performed according to the IEC/IEEE Standard for
1785 1.7 thorpej | Binary Floating-Point Arithmetic.
1786 1.7 thorpej *----------------------------------------------------------------------------*/
1787 1.7 thorpej
1788 1.1 ross float32 float32_add( float32 a, float32 b )
1789 1.1 ross {
1790 1.1 ross flag aSign, bSign;
1791 1.1 ross
1792 1.1 ross aSign = extractFloat32Sign( a );
1793 1.1 ross bSign = extractFloat32Sign( b );
1794 1.1 ross if ( aSign == bSign ) {
1795 1.1 ross return addFloat32Sigs( a, b, aSign );
1796 1.1 ross }
1797 1.1 ross else {
1798 1.1 ross return subFloat32Sigs( a, b, aSign );
1799 1.1 ross }
1800 1.1 ross
1801 1.1 ross }
1802 1.1 ross
1803 1.7 thorpej /*----------------------------------------------------------------------------
1804 1.7 thorpej | Returns the result of subtracting the single-precision floating-point values
1805 1.7 thorpej | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1806 1.7 thorpej | for Binary Floating-Point Arithmetic.
1807 1.7 thorpej *----------------------------------------------------------------------------*/
1808 1.7 thorpej
1809 1.1 ross float32 float32_sub( float32 a, float32 b )
1810 1.1 ross {
1811 1.1 ross flag aSign, bSign;
1812 1.1 ross
1813 1.1 ross aSign = extractFloat32Sign( a );
1814 1.1 ross bSign = extractFloat32Sign( b );
1815 1.1 ross if ( aSign == bSign ) {
1816 1.1 ross return subFloat32Sigs( a, b, aSign );
1817 1.1 ross }
1818 1.1 ross else {
1819 1.1 ross return addFloat32Sigs( a, b, aSign );
1820 1.1 ross }
1821 1.1 ross
1822 1.1 ross }
1823 1.1 ross
1824 1.7 thorpej /*----------------------------------------------------------------------------
1825 1.7 thorpej | Returns the result of multiplying the single-precision floating-point values
1826 1.7 thorpej | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1827 1.7 thorpej | for Binary Floating-Point Arithmetic.
1828 1.7 thorpej *----------------------------------------------------------------------------*/
1829 1.7 thorpej
1830 1.1 ross float32 float32_mul( float32 a, float32 b )
1831 1.1 ross {
1832 1.1 ross flag aSign, bSign, zSign;
1833 1.1 ross int16 aExp, bExp, zExp;
1834 1.1 ross bits32 aSig, bSig;
1835 1.1 ross bits64 zSig64;
1836 1.1 ross bits32 zSig;
1837 1.1 ross
1838 1.1 ross aSig = extractFloat32Frac( a );
1839 1.1 ross aExp = extractFloat32Exp( a );
1840 1.1 ross aSign = extractFloat32Sign( a );
1841 1.1 ross bSig = extractFloat32Frac( b );
1842 1.1 ross bExp = extractFloat32Exp( b );
1843 1.1 ross bSign = extractFloat32Sign( b );
1844 1.1 ross zSign = aSign ^ bSign;
1845 1.1 ross if ( aExp == 0xFF ) {
1846 1.1 ross if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1847 1.1 ross return propagateFloat32NaN( a, b );
1848 1.1 ross }
1849 1.1 ross if ( ( bExp | bSig ) == 0 ) {
1850 1.1 ross float_raise( float_flag_invalid );
1851 1.1 ross return float32_default_nan;
1852 1.1 ross }
1853 1.1 ross return packFloat32( zSign, 0xFF, 0 );
1854 1.1 ross }
1855 1.1 ross if ( bExp == 0xFF ) {
1856 1.1 ross if ( bSig ) return propagateFloat32NaN( a, b );
1857 1.1 ross if ( ( aExp | aSig ) == 0 ) {
1858 1.1 ross float_raise( float_flag_invalid );
1859 1.1 ross return float32_default_nan;
1860 1.1 ross }
1861 1.1 ross return packFloat32( zSign, 0xFF, 0 );
1862 1.1 ross }
1863 1.1 ross if ( aExp == 0 ) {
1864 1.1 ross if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1865 1.1 ross normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1866 1.1 ross }
1867 1.1 ross if ( bExp == 0 ) {
1868 1.1 ross if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1869 1.1 ross normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1870 1.1 ross }
1871 1.1 ross zExp = aExp + bExp - 0x7F;
1872 1.1 ross aSig = ( aSig | 0x00800000 )<<7;
1873 1.1 ross bSig = ( bSig | 0x00800000 )<<8;
1874 1.1 ross shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1875 1.1 ross zSig = zSig64;
1876 1.1 ross if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1877 1.1 ross zSig <<= 1;
1878 1.1 ross --zExp;
1879 1.1 ross }
1880 1.1 ross return roundAndPackFloat32( zSign, zExp, zSig );
1881 1.1 ross
1882 1.1 ross }
1883 1.1 ross
1884 1.7 thorpej /*----------------------------------------------------------------------------
1885 1.7 thorpej | Returns the result of dividing the single-precision floating-point value `a'
1886 1.7 thorpej | by the corresponding value `b'. The operation is performed according to the
1887 1.7 thorpej | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1888 1.7 thorpej *----------------------------------------------------------------------------*/
1889 1.7 thorpej
1890 1.1 ross float32 float32_div( float32 a, float32 b )
1891 1.1 ross {
1892 1.1 ross flag aSign, bSign, zSign;
1893 1.1 ross int16 aExp, bExp, zExp;
1894 1.1 ross bits32 aSig, bSig, zSig;
1895 1.1 ross
1896 1.1 ross aSig = extractFloat32Frac( a );
1897 1.1 ross aExp = extractFloat32Exp( a );
1898 1.1 ross aSign = extractFloat32Sign( a );
1899 1.1 ross bSig = extractFloat32Frac( b );
1900 1.1 ross bExp = extractFloat32Exp( b );
1901 1.1 ross bSign = extractFloat32Sign( b );
1902 1.1 ross zSign = aSign ^ bSign;
1903 1.1 ross if ( aExp == 0xFF ) {
1904 1.1 ross if ( aSig ) return propagateFloat32NaN( a, b );
1905 1.1 ross if ( bExp == 0xFF ) {
1906 1.1 ross if ( bSig ) return propagateFloat32NaN( a, b );
1907 1.1 ross float_raise( float_flag_invalid );
1908 1.1 ross return float32_default_nan;
1909 1.1 ross }
1910 1.1 ross return packFloat32( zSign, 0xFF, 0 );
1911 1.1 ross }
1912 1.1 ross if ( bExp == 0xFF ) {
1913 1.1 ross if ( bSig ) return propagateFloat32NaN( a, b );
1914 1.1 ross return packFloat32( zSign, 0, 0 );
1915 1.1 ross }
1916 1.1 ross if ( bExp == 0 ) {
1917 1.1 ross if ( bSig == 0 ) {
1918 1.1 ross if ( ( aExp | aSig ) == 0 ) {
1919 1.1 ross float_raise( float_flag_invalid );
1920 1.1 ross return float32_default_nan;
1921 1.1 ross }
1922 1.1 ross float_raise( float_flag_divbyzero );
1923 1.1 ross return packFloat32( zSign, 0xFF, 0 );
1924 1.1 ross }
1925 1.1 ross normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1926 1.1 ross }
1927 1.1 ross if ( aExp == 0 ) {
1928 1.1 ross if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1929 1.1 ross normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1930 1.1 ross }
1931 1.1 ross zExp = aExp - bExp + 0x7D;
1932 1.1 ross aSig = ( aSig | 0x00800000 )<<7;
1933 1.1 ross bSig = ( bSig | 0x00800000 )<<8;
1934 1.1 ross if ( bSig <= ( aSig + aSig ) ) {
1935 1.1 ross aSig >>= 1;
1936 1.1 ross ++zExp;
1937 1.1 ross }
1938 1.1 ross zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1939 1.1 ross if ( ( zSig & 0x3F ) == 0 ) {
1940 1.1 ross zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1941 1.1 ross }
1942 1.1 ross return roundAndPackFloat32( zSign, zExp, zSig );
1943 1.1 ross
1944 1.1 ross }
1945 1.1 ross
1946 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1947 1.7 thorpej /*----------------------------------------------------------------------------
1948 1.7 thorpej | Returns the remainder of the single-precision floating-point value `a'
1949 1.7 thorpej | with respect to the corresponding value `b'. The operation is performed
1950 1.7 thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1951 1.7 thorpej *----------------------------------------------------------------------------*/
1952 1.7 thorpej
1953 1.1 ross float32 float32_rem( float32 a, float32 b )
1954 1.1 ross {
1955 1.5 christos flag aSign, bSign __unused, zSign;
1956 1.1 ross int16 aExp, bExp, expDiff;
1957 1.1 ross bits32 aSig, bSig;
1958 1.1 ross bits32 q;
1959 1.1 ross bits64 aSig64, bSig64, q64;
1960 1.1 ross bits32 alternateASig;
1961 1.1 ross sbits32 sigMean;
1962 1.1 ross
1963 1.1 ross aSig = extractFloat32Frac( a );
1964 1.1 ross aExp = extractFloat32Exp( a );
1965 1.1 ross aSign = extractFloat32Sign( a );
1966 1.1 ross bSig = extractFloat32Frac( b );
1967 1.1 ross bExp = extractFloat32Exp( b );
1968 1.1 ross bSign = extractFloat32Sign( b );
1969 1.1 ross if ( aExp == 0xFF ) {
1970 1.1 ross if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1971 1.1 ross return propagateFloat32NaN( a, b );
1972 1.1 ross }
1973 1.1 ross float_raise( float_flag_invalid );
1974 1.1 ross return float32_default_nan;
1975 1.1 ross }
1976 1.1 ross if ( bExp == 0xFF ) {
1977 1.1 ross if ( bSig ) return propagateFloat32NaN( a, b );
1978 1.1 ross return a;
1979 1.1 ross }
1980 1.1 ross if ( bExp == 0 ) {
1981 1.1 ross if ( bSig == 0 ) {
1982 1.1 ross float_raise( float_flag_invalid );
1983 1.1 ross return float32_default_nan;
1984 1.1 ross }
1985 1.1 ross normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1986 1.1 ross }
1987 1.1 ross if ( aExp == 0 ) {
1988 1.1 ross if ( aSig == 0 ) return a;
1989 1.1 ross normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1990 1.1 ross }
1991 1.1 ross expDiff = aExp - bExp;
1992 1.1 ross aSig |= 0x00800000;
1993 1.1 ross bSig |= 0x00800000;
1994 1.1 ross if ( expDiff < 32 ) {
1995 1.1 ross aSig <<= 8;
1996 1.1 ross bSig <<= 8;
1997 1.1 ross if ( expDiff < 0 ) {
1998 1.1 ross if ( expDiff < -1 ) return a;
1999 1.1 ross aSig >>= 1;
2000 1.1 ross }
2001 1.1 ross q = ( bSig <= aSig );
2002 1.1 ross if ( q ) aSig -= bSig;
2003 1.1 ross if ( 0 < expDiff ) {
2004 1.1 ross q = ( ( (bits64) aSig )<<32 ) / bSig;
2005 1.1 ross q >>= 32 - expDiff;
2006 1.1 ross bSig >>= 2;
2007 1.1 ross aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2008 1.1 ross }
2009 1.1 ross else {
2010 1.1 ross aSig >>= 2;
2011 1.1 ross bSig >>= 2;
2012 1.1 ross }
2013 1.1 ross }
2014 1.1 ross else {
2015 1.1 ross if ( bSig <= aSig ) aSig -= bSig;
2016 1.1 ross aSig64 = ( (bits64) aSig )<<40;
2017 1.1 ross bSig64 = ( (bits64) bSig )<<40;
2018 1.1 ross expDiff -= 64;
2019 1.1 ross while ( 0 < expDiff ) {
2020 1.1 ross q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2021 1.1 ross q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2022 1.1 ross aSig64 = - ( ( bSig * q64 )<<38 );
2023 1.1 ross expDiff -= 62;
2024 1.1 ross }
2025 1.1 ross expDiff += 64;
2026 1.1 ross q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2027 1.1 ross q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2028 1.1 ross q = q64>>( 64 - expDiff );
2029 1.1 ross bSig <<= 6;
2030 1.1 ross aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2031 1.1 ross }
2032 1.1 ross do {
2033 1.1 ross alternateASig = aSig;
2034 1.1 ross ++q;
2035 1.1 ross aSig -= bSig;
2036 1.1 ross } while ( 0 <= (sbits32) aSig );
2037 1.1 ross sigMean = aSig + alternateASig;
2038 1.1 ross if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2039 1.1 ross aSig = alternateASig;
2040 1.1 ross }
2041 1.1 ross zSign = ( (sbits32) aSig < 0 );
2042 1.1 ross if ( zSign ) aSig = - aSig;
2043 1.1 ross return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2044 1.1 ross
2045 1.1 ross }
2046 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
2047 1.1 ross
2048 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2049 1.7 thorpej
2050 1.7 thorpej /*----------------------------------------------------------------------------
2051 1.7 thorpej | Returns the square root of the single-precision floating-point value `a'.
2052 1.7 thorpej | The operation is performed according to the IEC/IEEE Standard for Binary
2053 1.7 thorpej | Floating-Point Arithmetic.
2054 1.7 thorpej *----------------------------------------------------------------------------*/
2055 1.7 thorpej
2056 1.1 ross float32 float32_sqrt( float32 a )
2057 1.1 ross {
2058 1.1 ross flag aSign;
2059 1.1 ross int16 aExp, zExp;
2060 1.1 ross bits32 aSig, zSig;
2061 1.1 ross bits64 rem, term;
2062 1.1 ross
2063 1.1 ross aSig = extractFloat32Frac( a );
2064 1.1 ross aExp = extractFloat32Exp( a );
2065 1.1 ross aSign = extractFloat32Sign( a );
2066 1.1 ross if ( aExp == 0xFF ) {
2067 1.1 ross if ( aSig ) return propagateFloat32NaN( a, 0 );
2068 1.1 ross if ( ! aSign ) return a;
2069 1.1 ross float_raise( float_flag_invalid );
2070 1.1 ross return float32_default_nan;
2071 1.1 ross }
2072 1.1 ross if ( aSign ) {
2073 1.1 ross if ( ( aExp | aSig ) == 0 ) return a;
2074 1.1 ross float_raise( float_flag_invalid );
2075 1.1 ross return float32_default_nan;
2076 1.1 ross }
2077 1.1 ross if ( aExp == 0 ) {
2078 1.1 ross if ( aSig == 0 ) return 0;
2079 1.1 ross normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2080 1.1 ross }
2081 1.1 ross zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2082 1.1 ross aSig = ( aSig | 0x00800000 )<<8;
2083 1.1 ross zSig = estimateSqrt32( aExp, aSig ) + 2;
2084 1.1 ross if ( ( zSig & 0x7F ) <= 5 ) {
2085 1.1 ross if ( zSig < 2 ) {
2086 1.1 ross zSig = 0x7FFFFFFF;
2087 1.1 ross goto roundAndPack;
2088 1.1 ross }
2089 1.1 ross aSig >>= aExp & 1;
2090 1.1 ross term = ( (bits64) zSig ) * zSig;
2091 1.1 ross rem = ( ( (bits64) aSig )<<32 ) - term;
2092 1.1 ross while ( (sbits64) rem < 0 ) {
2093 1.1 ross --zSig;
2094 1.1 ross rem += ( ( (bits64) zSig )<<1 ) | 1;
2095 1.1 ross }
2096 1.1 ross zSig |= ( rem != 0 );
2097 1.1 ross }
2098 1.1 ross shift32RightJamming( zSig, 1, &zSig );
2099 1.1 ross roundAndPack:
2100 1.1 ross return roundAndPackFloat32( 0, zExp, zSig );
2101 1.1 ross
2102 1.1 ross }
2103 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
2104 1.1 ross
2105 1.7 thorpej /*----------------------------------------------------------------------------
2106 1.7 thorpej | Returns 1 if the single-precision floating-point value `a' is equal to
2107 1.7 thorpej | the corresponding value `b', and 0 otherwise. The comparison is performed
2108 1.7 thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2109 1.7 thorpej *----------------------------------------------------------------------------*/
2110 1.7 thorpej
2111 1.1 ross flag float32_eq( float32 a, float32 b )
2112 1.1 ross {
2113 1.1 ross
2114 1.1 ross if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2115 1.1 ross || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2116 1.1 ross ) {
2117 1.1 ross if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2118 1.1 ross float_raise( float_flag_invalid );
2119 1.1 ross }
2120 1.1 ross return 0;
2121 1.1 ross }
2122 1.1 ross return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2123 1.1 ross
2124 1.1 ross }
2125 1.1 ross
2126 1.7 thorpej /*----------------------------------------------------------------------------
2127 1.7 thorpej | Returns 1 if the single-precision floating-point value `a' is less than
2128 1.7 thorpej | or equal to the corresponding value `b', and 0 otherwise. The comparison
2129 1.7 thorpej | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2130 1.7 thorpej | Arithmetic.
2131 1.7 thorpej *----------------------------------------------------------------------------*/
2132 1.7 thorpej
2133 1.1 ross flag float32_le( float32 a, float32 b )
2134 1.1 ross {
2135 1.1 ross flag aSign, bSign;
2136 1.1 ross
2137 1.1 ross if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2138 1.1 ross || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2139 1.1 ross ) {
2140 1.1 ross float_raise( float_flag_invalid );
2141 1.1 ross return 0;
2142 1.1 ross }
2143 1.1 ross aSign = extractFloat32Sign( a );
2144 1.1 ross bSign = extractFloat32Sign( b );
2145 1.1 ross if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2146 1.1 ross return ( a == b ) || ( aSign ^ ( a < b ) );
2147 1.1 ross
2148 1.1 ross }
2149 1.1 ross
2150 1.7 thorpej /*----------------------------------------------------------------------------
2151 1.7 thorpej | Returns 1 if the single-precision floating-point value `a' is less than
2152 1.7 thorpej | the corresponding value `b', and 0 otherwise. The comparison is performed
2153 1.7 thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2154 1.7 thorpej *----------------------------------------------------------------------------*/
2155 1.7 thorpej
2156 1.1 ross flag float32_lt( float32 a, float32 b )
2157 1.1 ross {
2158 1.1 ross flag aSign, bSign;
2159 1.1 ross
2160 1.1 ross if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2161 1.1 ross || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2162 1.1 ross ) {
2163 1.1 ross float_raise( float_flag_invalid );
2164 1.1 ross return 0;
2165 1.1 ross }
2166 1.1 ross aSign = extractFloat32Sign( a );
2167 1.1 ross bSign = extractFloat32Sign( b );
2168 1.1 ross if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2169 1.1 ross return ( a != b ) && ( aSign ^ ( a < b ) );
2170 1.1 ross
2171 1.1 ross }
2172 1.1 ross
2173 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2174 1.7 thorpej /*----------------------------------------------------------------------------
2175 1.7 thorpej | Returns 1 if the single-precision floating-point value `a' is equal to
2176 1.7 thorpej | the corresponding value `b', and 0 otherwise. The invalid exception is
2177 1.7 thorpej | raised if either operand is a NaN. Otherwise, the comparison is performed
2178 1.7 thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2179 1.7 thorpej *----------------------------------------------------------------------------*/
2180 1.7 thorpej
2181 1.1 ross flag float32_eq_signaling( float32 a, float32 b )
2182 1.1 ross {
2183 1.1 ross
2184 1.1 ross if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2185 1.1 ross || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2186 1.1 ross ) {
2187 1.1 ross float_raise( float_flag_invalid );
2188 1.1 ross return 0;
2189 1.1 ross }
2190 1.1 ross return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2191 1.1 ross
2192 1.1 ross }
2193 1.1 ross
2194 1.7 thorpej /*----------------------------------------------------------------------------
2195 1.7 thorpej | Returns 1 if the single-precision floating-point value `a' is less than or
2196 1.7 thorpej | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2197 1.7 thorpej | cause an exception. Otherwise, the comparison is performed according to the
2198 1.7 thorpej | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2199 1.7 thorpej *----------------------------------------------------------------------------*/
2200 1.7 thorpej
2201 1.1 ross flag float32_le_quiet( float32 a, float32 b )
2202 1.1 ross {
2203 1.1 ross flag aSign, bSign;
2204 1.1 ross
2205 1.1 ross if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2206 1.1 ross || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2207 1.1 ross ) {
2208 1.1 ross if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2209 1.1 ross float_raise( float_flag_invalid );
2210 1.1 ross }
2211 1.1 ross return 0;
2212 1.1 ross }
2213 1.1 ross aSign = extractFloat32Sign( a );
2214 1.1 ross bSign = extractFloat32Sign( b );
2215 1.1 ross if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2216 1.1 ross return ( a == b ) || ( aSign ^ ( a < b ) );
2217 1.1 ross
2218 1.1 ross }
2219 1.1 ross
2220 1.7 thorpej /*----------------------------------------------------------------------------
2221 1.7 thorpej | Returns 1 if the single-precision floating-point value `a' is less than
2222 1.7 thorpej | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2223 1.7 thorpej | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2224 1.7 thorpej | Standard for Binary Floating-Point Arithmetic.
2225 1.7 thorpej *----------------------------------------------------------------------------*/
2226 1.7 thorpej
2227 1.1 ross flag float32_lt_quiet( float32 a, float32 b )
2228 1.1 ross {
2229 1.1 ross flag aSign, bSign;
2230 1.1 ross
2231 1.1 ross if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2232 1.1 ross || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2233 1.1 ross ) {
2234 1.1 ross if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2235 1.1 ross float_raise( float_flag_invalid );
2236 1.1 ross }
2237 1.1 ross return 0;
2238 1.1 ross }
2239 1.1 ross aSign = extractFloat32Sign( a );
2240 1.1 ross bSign = extractFloat32Sign( b );
2241 1.1 ross if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2242 1.1 ross return ( a != b ) && ( aSign ^ ( a < b ) );
2243 1.1 ross
2244 1.1 ross }
2245 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
2246 1.1 ross
2247 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2248 1.7 thorpej /*----------------------------------------------------------------------------
2249 1.7 thorpej | Returns the result of converting the double-precision floating-point value
2250 1.7 thorpej | `a' to the 32-bit two's complement integer format. The conversion is
2251 1.7 thorpej | performed according to the IEC/IEEE Standard for Binary Floating-Point
2252 1.7 thorpej | Arithmetic---which means in particular that the conversion is rounded
2253 1.7 thorpej | according to the current rounding mode. If `a' is a NaN, the largest
2254 1.7 thorpej | positive integer is returned. Otherwise, if the conversion overflows, the
2255 1.7 thorpej | largest integer with the same sign as `a' is returned.
2256 1.7 thorpej *----------------------------------------------------------------------------*/
2257 1.7 thorpej
2258 1.1 ross int32 float64_to_int32( float64 a )
2259 1.1 ross {
2260 1.1 ross flag aSign;
2261 1.1 ross int16 aExp, shiftCount;
2262 1.1 ross bits64 aSig;
2263 1.1 ross
2264 1.1 ross aSig = extractFloat64Frac( a );
2265 1.1 ross aExp = extractFloat64Exp( a );
2266 1.1 ross aSign = extractFloat64Sign( a );
2267 1.1 ross if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2268 1.1 ross if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2269 1.1 ross shiftCount = 0x42C - aExp;
2270 1.1 ross if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2271 1.1 ross return roundAndPackInt32( aSign, aSig );
2272 1.1 ross
2273 1.1 ross }
2274 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
2275 1.1 ross
2276 1.7 thorpej /*----------------------------------------------------------------------------
2277 1.7 thorpej | Returns the result of converting the double-precision floating-point value
2278 1.7 thorpej | `a' to the 32-bit two's complement integer format. The conversion is
2279 1.7 thorpej | performed according to the IEC/IEEE Standard for Binary Floating-Point
2280 1.7 thorpej | Arithmetic, except that the conversion is always rounded toward zero.
2281 1.7 thorpej | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2282 1.7 thorpej | the conversion overflows, the largest integer with the same sign as `a' is
2283 1.7 thorpej | returned.
2284 1.7 thorpej *----------------------------------------------------------------------------*/
2285 1.7 thorpej
2286 1.1 ross int32 float64_to_int32_round_to_zero( float64 a )
2287 1.1 ross {
2288 1.1 ross flag aSign;
2289 1.1 ross int16 aExp, shiftCount;
2290 1.1 ross bits64 aSig, savedASig;
2291 1.1 ross int32 z;
2292 1.1 ross
2293 1.1 ross aSig = extractFloat64Frac( a );
2294 1.1 ross aExp = extractFloat64Exp( a );
2295 1.1 ross aSign = extractFloat64Sign( a );
2296 1.1 ross if ( 0x41E < aExp ) {
2297 1.1 ross if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2298 1.1 ross goto invalid;
2299 1.1 ross }
2300 1.1 ross else if ( aExp < 0x3FF ) {
2301 1.1 ross if ( aExp || aSig ) float_set_inexact();
2302 1.1 ross return 0;
2303 1.1 ross }
2304 1.1 ross aSig |= LIT64( 0x0010000000000000 );
2305 1.1 ross shiftCount = 0x433 - aExp;
2306 1.1 ross savedASig = aSig;
2307 1.1 ross aSig >>= shiftCount;
2308 1.1 ross z = aSig;
2309 1.1 ross if ( aSign ) z = - z;
2310 1.1 ross if ( ( z < 0 ) ^ aSign ) {
2311 1.1 ross invalid:
2312 1.1 ross float_raise( float_flag_invalid );
2313 1.1 ross return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2314 1.1 ross }
2315 1.1 ross if ( ( aSig<<shiftCount ) != savedASig ) {
2316 1.1 ross float_set_inexact();
2317 1.1 ross }
2318 1.1 ross return z;
2319 1.1 ross
2320 1.1 ross }
2321 1.1 ross
2322 1.1 ross #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2323 1.7 thorpej /*----------------------------------------------------------------------------
2324 1.7 thorpej | Returns the result of converting the double-precision floating-point value
2325 1.7 thorpej | `a' to the 64-bit two's complement integer format. The conversion is
2326 1.7 thorpej | performed according to the IEC/IEEE Standard for Binary Floating-Point
2327 1.7 thorpej | Arithmetic---which means in particular that the conversion is rounded
2328 1.7 thorpej | according to the current rounding mode. If `a' is a NaN, the largest
2329 1.7 thorpej | positive integer is returned. Otherwise, if the conversion overflows, the
2330 1.7 thorpej | largest integer with the same sign as `a' is returned.
2331 1.7 thorpej *----------------------------------------------------------------------------*/
2332 1.7 thorpej
2333 1.1 ross int64 float64_to_int64( float64 a )
2334 1.1 ross {
2335 1.1 ross flag aSign;
2336 1.1 ross int16 aExp, shiftCount;
2337 1.1 ross bits64 aSig, aSigExtra;
2338 1.1 ross
2339 1.1 ross aSig = extractFloat64Frac( a );
2340 1.1 ross aExp = extractFloat64Exp( a );
2341 1.1 ross aSign = extractFloat64Sign( a );
2342 1.1 ross if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2343 1.1 ross shiftCount = 0x433 - aExp;
2344 1.1 ross if ( shiftCount <= 0 ) {
2345 1.1 ross if ( 0x43E < aExp ) {
2346 1.1 ross float_raise( float_flag_invalid );
2347 1.1 ross if ( ! aSign
2348 1.1 ross || ( ( aExp == 0x7FF )
2349 1.1 ross && ( aSig != LIT64( 0x0010000000000000 ) ) )
2350 1.1 ross ) {
2351 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
2352 1.1 ross }
2353 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
2354 1.1 ross }
2355 1.1 ross aSigExtra = 0;
2356 1.1 ross aSig <<= - shiftCount;
2357 1.1 ross }
2358 1.1 ross else {
2359 1.1 ross shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2360 1.1 ross }
2361 1.1 ross return roundAndPackInt64( aSign, aSig, aSigExtra );
2362 1.1 ross
2363 1.1 ross }
2364 1.1 ross
2365 1.6 martin /* like above, but result is unsigned */
2366 1.6 martin uint64 float64_to_uint64( float64 a )
2367 1.6 martin {
2368 1.6 martin flag aSign;
2369 1.6 martin int16 aExp, shiftCount;
2370 1.6 martin bits64 aSig, aSigExtra;
2371 1.6 martin
2372 1.6 martin aSig = extractFloat64Frac( a );
2373 1.6 martin aExp = extractFloat64Exp( a );
2374 1.6 martin aSign = extractFloat64Sign( a );
2375 1.6 martin
2376 1.6 martin if (aSign) {
2377 1.6 martin return float64_to_int64(a);
2378 1.6 martin }
2379 1.6 martin
2380 1.6 martin if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2381 1.6 martin shiftCount = 0x433 - aExp;
2382 1.6 martin if ( shiftCount <= 0 ) {
2383 1.6 martin if ( 0x43E < aExp ) {
2384 1.6 martin float_raise( float_flag_invalid );
2385 1.6 martin if ( ! aSign
2386 1.6 martin || ( ( aExp == 0x7FF )
2387 1.6 martin && ( aSig != LIT64( 0x0010000000000000 ) ) )
2388 1.6 martin ) {
2389 1.6 martin return LIT64( 0x7FFFFFFFFFFFFFFF );
2390 1.6 martin }
2391 1.6 martin return (sbits64) LIT64( 0x8000000000000000 );
2392 1.6 martin }
2393 1.6 martin aSigExtra = 0;
2394 1.6 martin aSig <<= - shiftCount;
2395 1.6 martin }
2396 1.6 martin else {
2397 1.6 martin shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2398 1.6 martin }
2399 1.6 martin return roundAndPackUInt64( aSig, aSigExtra );
2400 1.6 martin
2401 1.6 martin }
2402 1.6 martin
2403 1.7 thorpej /*----------------------------------------------------------------------------
2404 1.7 thorpej | Returns the result of converting the double-precision floating-point value
2405 1.7 thorpej | `a' to the 64-bit two's complement integer format. The conversion is
2406 1.7 thorpej | performed according to the IEC/IEEE Standard for Binary Floating-Point
2407 1.7 thorpej | Arithmetic, except that the conversion is always rounded toward zero.
2408 1.7 thorpej | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2409 1.7 thorpej | the conversion overflows, the largest integer with the same sign as `a' is
2410 1.7 thorpej | returned.
2411 1.7 thorpej *----------------------------------------------------------------------------*/
2412 1.7 thorpej
2413 1.1 ross int64 float64_to_int64_round_to_zero( float64 a )
2414 1.1 ross {
2415 1.1 ross flag aSign;
2416 1.1 ross int16 aExp, shiftCount;
2417 1.1 ross bits64 aSig;
2418 1.1 ross int64 z;
2419 1.1 ross
2420 1.1 ross aSig = extractFloat64Frac( a );
2421 1.1 ross aExp = extractFloat64Exp( a );
2422 1.1 ross aSign = extractFloat64Sign( a );
2423 1.1 ross if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2424 1.1 ross shiftCount = aExp - 0x433;
2425 1.1 ross if ( 0 <= shiftCount ) {
2426 1.1 ross if ( 0x43E <= aExp ) {
2427 1.1 ross if ( a != LIT64( 0xC3E0000000000000 ) ) {
2428 1.1 ross float_raise( float_flag_invalid );
2429 1.1 ross if ( ! aSign
2430 1.1 ross || ( ( aExp == 0x7FF )
2431 1.1 ross && ( aSig != LIT64( 0x0010000000000000 ) ) )
2432 1.1 ross ) {
2433 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
2434 1.1 ross }
2435 1.1 ross }
2436 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
2437 1.1 ross }
2438 1.1 ross z = aSig<<shiftCount;
2439 1.1 ross }
2440 1.1 ross else {
2441 1.1 ross if ( aExp < 0x3FE ) {
2442 1.1 ross if ( aExp | aSig ) float_set_inexact();
2443 1.1 ross return 0;
2444 1.1 ross }
2445 1.1 ross z = aSig>>( - shiftCount );
2446 1.1 ross if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2447 1.1 ross float_set_inexact();
2448 1.1 ross }
2449 1.1 ross }
2450 1.1 ross if ( aSign ) z = - z;
2451 1.1 ross return z;
2452 1.1 ross
2453 1.1 ross }
2454 1.1 ross #endif /* !SOFTFLOAT_FOR_GCC */
2455 1.1 ross
2456 1.7 thorpej /*----------------------------------------------------------------------------
2457 1.7 thorpej | Returns the result of converting the double-precision floating-point value
2458 1.7 thorpej | `a' to the single-precision floating-point format. The conversion is
2459 1.7 thorpej | performed according to the IEC/IEEE Standard for Binary Floating-Point
2460 1.7 thorpej | Arithmetic.
2461 1.7 thorpej *----------------------------------------------------------------------------*/
2462 1.7 thorpej
2463 1.1 ross float32 float64_to_float32( float64 a )
2464 1.1 ross {
2465 1.1 ross flag aSign;
2466 1.1 ross int16 aExp;
2467 1.1 ross bits64 aSig;
2468 1.1 ross bits32 zSig;
2469 1.1 ross
2470 1.1 ross aSig = extractFloat64Frac( a );
2471 1.1 ross aExp = extractFloat64Exp( a );
2472 1.1 ross aSign = extractFloat64Sign( a );
2473 1.1 ross if ( aExp == 0x7FF ) {
2474 1.1 ross if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2475 1.1 ross return packFloat32( aSign, 0xFF, 0 );
2476 1.1 ross }
2477 1.1 ross shift64RightJamming( aSig, 22, &aSig );
2478 1.1 ross zSig = aSig;
2479 1.1 ross if ( aExp || zSig ) {
2480 1.1 ross zSig |= 0x40000000;
2481 1.1 ross aExp -= 0x381;
2482 1.1 ross }
2483 1.1 ross return roundAndPackFloat32( aSign, aExp, zSig );
2484 1.1 ross
2485 1.1 ross }
2486 1.1 ross
2487 1.1 ross #ifdef FLOATX80
2488 1.1 ross
2489 1.7 thorpej /*----------------------------------------------------------------------------
2490 1.7 thorpej | Returns the result of converting the double-precision floating-point value
2491 1.7 thorpej | `a' to the extended double-precision floating-point format. The conversion
2492 1.7 thorpej | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2493 1.7 thorpej | Arithmetic.
2494 1.7 thorpej *----------------------------------------------------------------------------*/
2495 1.7 thorpej
2496 1.1 ross floatx80 float64_to_floatx80( float64 a )
2497 1.1 ross {
2498 1.1 ross flag aSign;
2499 1.1 ross int16 aExp;
2500 1.1 ross bits64 aSig;
2501 1.1 ross
2502 1.1 ross aSig = extractFloat64Frac( a );
2503 1.1 ross aExp = extractFloat64Exp( a );
2504 1.1 ross aSign = extractFloat64Sign( a );
2505 1.1 ross if ( aExp == 0x7FF ) {
2506 1.1 ross if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2507 1.1 ross return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2508 1.1 ross }
2509 1.1 ross if ( aExp == 0 ) {
2510 1.1 ross if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2511 1.1 ross normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2512 1.1 ross }
2513 1.1 ross return
2514 1.1 ross packFloatx80(
2515 1.1 ross aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2516 1.1 ross
2517 1.1 ross }
2518 1.1 ross
2519 1.1 ross #endif
2520 1.1 ross
2521 1.1 ross #ifdef FLOAT128
2522 1.1 ross
2523 1.7 thorpej /*----------------------------------------------------------------------------
2524 1.7 thorpej | Returns the result of converting the double-precision floating-point value
2525 1.7 thorpej | `a' to the quadruple-precision floating-point format. The conversion is
2526 1.7 thorpej | performed according to the IEC/IEEE Standard for Binary Floating-Point
2527 1.7 thorpej | Arithmetic.
2528 1.7 thorpej *----------------------------------------------------------------------------*/
2529 1.7 thorpej
2530 1.1 ross float128 float64_to_float128( float64 a )
2531 1.1 ross {
2532 1.1 ross flag aSign;
2533 1.1 ross int16 aExp;
2534 1.1 ross bits64 aSig, zSig0, zSig1;
2535 1.1 ross
2536 1.1 ross aSig = extractFloat64Frac( a );
2537 1.1 ross aExp = extractFloat64Exp( a );
2538 1.1 ross aSign = extractFloat64Sign( a );
2539 1.1 ross if ( aExp == 0x7FF ) {
2540 1.1 ross if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2541 1.1 ross return packFloat128( aSign, 0x7FFF, 0, 0 );
2542 1.1 ross }
2543 1.1 ross if ( aExp == 0 ) {
2544 1.1 ross if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2545 1.1 ross normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2546 1.1 ross --aExp;
2547 1.1 ross }
2548 1.1 ross shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2549 1.1 ross return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2550 1.1 ross
2551 1.1 ross }
2552 1.1 ross
2553 1.1 ross #endif
2554 1.1 ross
2555 1.1 ross #ifndef SOFTFLOAT_FOR_GCC
2556 1.7 thorpej /*----------------------------------------------------------------------------
2557 1.7 thorpej | Rounds the double-precision floating-point value `a' to an integer, and
2558 1.7 thorpej | returns the result as a double-precision floating-point value. The
2559 1.7 thorpej | operation is performed according to the IEC/IEEE Standard for Binary
2560 1.7 thorpej | Floating-Point Arithmetic.
2561 1.7 thorpej *----------------------------------------------------------------------------*/
2562 1.7 thorpej
2563 1.1 ross float64 float64_round_to_int( float64 a )
2564 1.1 ross {
2565 1.1 ross flag aSign;
2566 1.1 ross int16 aExp;
2567 1.1 ross bits64 lastBitMask, roundBitsMask;
2568 1.1 ross int8 roundingMode;
2569 1.1 ross float64 z;
2570 1.1 ross
2571 1.1 ross aExp = extractFloat64Exp( a );
2572 1.1 ross if ( 0x433 <= aExp ) {
2573 1.1 ross if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2574 1.1 ross return propagateFloat64NaN( a, a );
2575 1.1 ross }
2576 1.1 ross return a;
2577 1.1 ross }
2578 1.1 ross if ( aExp < 0x3FF ) {
2579 1.1 ross if ( (bits64) ( a<<1 ) == 0 ) return a;
2580 1.1 ross float_set_inexact();
2581 1.1 ross aSign = extractFloat64Sign( a );
2582 1.1 ross switch ( float_rounding_mode() ) {
2583 1.1 ross case float_round_nearest_even:
2584 1.1 ross if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2585 1.1 ross return packFloat64( aSign, 0x3FF, 0 );
2586 1.1 ross }
2587 1.1 ross break;
2588 1.1 ross case float_round_down:
2589 1.1 ross return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2590 1.1 ross case float_round_up:
2591 1.1 ross return
2592 1.1 ross aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2593 1.1 ross }
2594 1.1 ross return packFloat64( aSign, 0, 0 );
2595 1.1 ross }
2596 1.1 ross lastBitMask = 1;
2597 1.1 ross lastBitMask <<= 0x433 - aExp;
2598 1.1 ross roundBitsMask = lastBitMask - 1;
2599 1.1 ross z = a;
2600 1.1 ross roundingMode = float_rounding_mode();
2601 1.1 ross if ( roundingMode == float_round_nearest_even ) {
2602 1.1 ross z += lastBitMask>>1;
2603 1.1 ross if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2604 1.1 ross }
2605 1.1 ross else if ( roundingMode != float_round_to_zero ) {
2606 1.1 ross if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2607 1.1 ross z += roundBitsMask;
2608 1.1 ross }
2609 1.1 ross }
2610 1.1 ross z &= ~ roundBitsMask;
2611 1.1 ross if ( z != a ) float_set_inexact();
2612 1.1 ross return z;
2613 1.1 ross
2614 1.1 ross }
2615 1.1 ross #endif
2616 1.1 ross
2617 1.7 thorpej /*----------------------------------------------------------------------------
2618 1.7 thorpej | Returns the result of adding the absolute values of the double-precision
2619 1.7 thorpej | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2620 1.7 thorpej | before being returned. `zSign' is ignored if the result is a NaN.
2621 1.7 thorpej | The addition is performed according to the IEC/IEEE Standard for Binary
2622 1.7 thorpej | Floating-Point Arithmetic.
2623 1.7 thorpej *----------------------------------------------------------------------------*/
2624 1.7 thorpej
2625 1.1 ross static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2626 1.1 ross {
2627 1.1 ross int16 aExp, bExp, zExp;
2628 1.1 ross bits64 aSig, bSig, zSig;
2629 1.1 ross int16 expDiff;
2630 1.1 ross
2631 1.1 ross aSig = extractFloat64Frac( a );
2632 1.1 ross aExp = extractFloat64Exp( a );
2633 1.1 ross bSig = extractFloat64Frac( b );
2634 1.1 ross bExp = extractFloat64Exp( b );
2635 1.1 ross expDiff = aExp - bExp;
2636 1.1 ross aSig <<= 9;
2637 1.1 ross bSig <<= 9;
2638 1.1 ross if ( 0 < expDiff ) {
2639 1.1 ross if ( aExp == 0x7FF ) {
2640 1.1 ross if ( aSig ) return propagateFloat64NaN( a, b );
2641 1.1 ross return a;
2642 1.1 ross }
2643 1.1 ross if ( bExp == 0 ) {
2644 1.1 ross --expDiff;
2645 1.1 ross }
2646 1.1 ross else {
2647 1.1 ross bSig |= LIT64( 0x2000000000000000 );
2648 1.1 ross }
2649 1.1 ross shift64RightJamming( bSig, expDiff, &bSig );
2650 1.1 ross zExp = aExp;
2651 1.1 ross }
2652 1.1 ross else if ( expDiff < 0 ) {
2653 1.1 ross if ( bExp == 0x7FF ) {
2654 1.1 ross if ( bSig ) return propagateFloat64NaN( a, b );
2655 1.1 ross return packFloat64( zSign, 0x7FF, 0 );
2656 1.1 ross }
2657 1.1 ross if ( aExp == 0 ) {
2658 1.1 ross ++expDiff;
2659 1.1 ross }
2660 1.1 ross else {
2661 1.1 ross aSig |= LIT64( 0x2000000000000000 );
2662 1.1 ross }
2663 1.1 ross shift64RightJamming( aSig, - expDiff, &aSig );
2664 1.1 ross zExp = bExp;
2665 1.1 ross }
2666 1.1 ross else {
2667 1.1 ross if ( aExp == 0x7FF ) {
2668 1.1 ross if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2669 1.1 ross return a;
2670 1.1 ross }
2671 1.1 ross if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2672 1.1 ross zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2673 1.1 ross zExp = aExp;
2674 1.1 ross goto roundAndPack;
2675 1.1 ross }
2676 1.1 ross aSig |= LIT64( 0x2000000000000000 );
2677 1.1 ross zSig = ( aSig + bSig )<<1;
2678 1.1 ross --zExp;
2679 1.1 ross if ( (sbits64) zSig < 0 ) {
2680 1.1 ross zSig = aSig + bSig;
2681 1.1 ross ++zExp;
2682 1.1 ross }
2683 1.1 ross roundAndPack:
2684 1.1 ross return roundAndPackFloat64( zSign, zExp, zSig );
2685 1.1 ross
2686 1.1 ross }
2687 1.1 ross
2688 1.7 thorpej /*----------------------------------------------------------------------------
2689 1.7 thorpej | Returns the result of subtracting the absolute values of the double-
2690 1.7 thorpej | precision floating-point values `a' and `b'. If `zSign' is 1, the
2691 1.7 thorpej | difference is negated before being returned. `zSign' is ignored if the
2692 1.7 thorpej | result is a NaN. The subtraction is performed according to the IEC/IEEE
2693 1.7 thorpej | Standard for Binary Floating-Point Arithmetic.
2694 1.7 thorpej *----------------------------------------------------------------------------*/
2695 1.7 thorpej
2696 1.1 ross static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2697 1.1 ross {
2698 1.1 ross int16 aExp, bExp, zExp;
2699 1.1 ross bits64 aSig, bSig, zSig;
2700 1.1 ross int16 expDiff;
2701 1.1 ross
2702 1.1 ross aSig = extractFloat64Frac( a );
2703 1.1 ross aExp = extractFloat64Exp( a );
2704 1.1 ross bSig = extractFloat64Frac( b );
2705 1.1 ross bExp = extractFloat64Exp( b );
2706 1.1 ross expDiff = aExp - bExp;
2707 1.1 ross aSig <<= 10;
2708 1.1 ross bSig <<= 10;
2709 1.1 ross if ( 0 < expDiff ) goto aExpBigger;
2710 1.1 ross if ( expDiff < 0 ) goto bExpBigger;
2711 1.1 ross if ( aExp == 0x7FF ) {
2712 1.1 ross if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2713 1.1 ross float_raise( float_flag_invalid );
2714 1.1 ross return float64_default_nan;
2715 1.1 ross }
2716 1.1 ross if ( aExp == 0 ) {
2717 1.1 ross aExp = 1;
2718 1.1 ross bExp = 1;
2719 1.1 ross }
2720 1.1 ross if ( bSig < aSig ) goto aBigger;
2721 1.1 ross if ( aSig < bSig ) goto bBigger;
2722 1.1 ross return packFloat64( float_rounding_mode() == float_round_down, 0, 0 );
2723 1.1 ross bExpBigger:
2724 1.1 ross if ( bExp == 0x7FF ) {
2725 1.1 ross if ( bSig ) return propagateFloat64NaN( a, b );
2726 1.1 ross return packFloat64( zSign ^ 1, 0x7FF, 0 );
2727 1.1 ross }
2728 1.1 ross if ( aExp == 0 ) {
2729 1.1 ross ++expDiff;
2730 1.1 ross }
2731 1.1 ross else {
2732 1.1 ross aSig |= LIT64( 0x4000000000000000 );
2733 1.1 ross }
2734 1.1 ross shift64RightJamming( aSig, - expDiff, &aSig );
2735 1.1 ross bSig |= LIT64( 0x4000000000000000 );
2736 1.1 ross bBigger:
2737 1.1 ross zSig = bSig - aSig;
2738 1.1 ross zExp = bExp;
2739 1.1 ross zSign ^= 1;
2740 1.1 ross goto normalizeRoundAndPack;
2741 1.1 ross aExpBigger:
2742 1.1 ross if ( aExp == 0x7FF ) {
2743 1.1 ross if ( aSig ) return propagateFloat64NaN( a, b );
2744 1.1 ross return a;
2745 1.1 ross }
2746 1.1 ross if ( bExp == 0 ) {
2747 1.1 ross --expDiff;
2748 1.1 ross }
2749 1.1 ross else {
2750 1.1 ross bSig |= LIT64( 0x4000000000000000 );
2751 1.1 ross }
2752 1.1 ross shift64RightJamming( bSig, expDiff, &bSig );
2753 1.1 ross aSig |= LIT64( 0x4000000000000000 );
2754 1.1 ross aBigger:
2755 1.1 ross zSig = aSig - bSig;
2756 1.1 ross zExp = aExp;
2757 1.1 ross normalizeRoundAndPack:
2758 1.1 ross --zExp;
2759 1.1 ross return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2760 1.1 ross
2761 1.1 ross }
2762 1.1 ross
2763 1.7 thorpej /*----------------------------------------------------------------------------
2764 1.7 thorpej | Returns the result of adding the double-precision floating-point values `a'
2765 1.7 thorpej | and `b'. The operation is performed according to the IEC/IEEE Standard for
2766 1.7 thorpej | Binary Floating-Point Arithmetic.
2767 1.7 thorpej *----------------------------------------------------------------------------*/
2768 1.7 thorpej
2769 1.1 ross float64 float64_add( float64 a, float64 b )
2770 1.1 ross {
2771 1.1 ross flag aSign, bSign;
2772 1.1 ross
2773 1.1 ross aSign = extractFloat64Sign( a );
2774 1.1 ross bSign = extractFloat64Sign( b );
2775 1.1 ross if ( aSign == bSign ) {
2776 1.1 ross return addFloat64Sigs( a, b, aSign );
2777 1.1 ross }
2778 1.1 ross else {
2779 1.1 ross return subFloat64Sigs( a, b, aSign );
2780 1.1 ross }
2781 1.1 ross
2782 1.1 ross }
2783 1.1 ross
2784 1.7 thorpej /*----------------------------------------------------------------------------
2785 1.7 thorpej | Returns the result of subtracting the double-precision floating-point values
2786 1.7 thorpej | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2787 1.7 thorpej | for Binary Floating-Point Arithmetic.
2788 1.7 thorpej *----------------------------------------------------------------------------*/
2789 1.7 thorpej
2790 1.1 ross float64 float64_sub( float64 a, float64 b )
2791 1.1 ross {
2792 1.1 ross flag aSign, bSign;
2793 1.1 ross
2794 1.1 ross aSign = extractFloat64Sign( a );
2795 1.1 ross bSign = extractFloat64Sign( b );
2796 1.1 ross if ( aSign == bSign ) {
2797 1.1 ross return subFloat64Sigs( a, b, aSign );
2798 1.1 ross }
2799 1.1 ross else {
2800 1.1 ross return addFloat64Sigs( a, b, aSign );
2801 1.1 ross }
2802 1.1 ross
2803 1.1 ross }
2804 1.1 ross
2805 1.7 thorpej /*----------------------------------------------------------------------------
2806 1.7 thorpej | Returns the result of multiplying the double-precision floating-point values
2807 1.7 thorpej | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2808 1.7 thorpej | for Binary Floating-Point Arithmetic.
2809 1.7 thorpej *----------------------------------------------------------------------------*/
2810 1.7 thorpej
2811 1.1 ross float64 float64_mul( float64 a, float64 b )
2812 1.1 ross {
2813 1.1 ross flag aSign, bSign, zSign;
2814 1.1 ross int16 aExp, bExp, zExp;
2815 1.1 ross bits64 aSig, bSig, zSig0, zSig1;
2816 1.1 ross
2817 1.1 ross aSig = extractFloat64Frac( a );
2818 1.1 ross aExp = extractFloat64Exp( a );
2819 1.1 ross aSign = extractFloat64Sign( a );
2820 1.1 ross bSig = extractFloat64Frac( b );
2821 1.1 ross bExp = extractFloat64Exp( b );
2822 1.1 ross bSign = extractFloat64Sign( b );
2823 1.1 ross zSign = aSign ^ bSign;
2824 1.1 ross if ( aExp == 0x7FF ) {
2825 1.1 ross if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2826 1.1 ross return propagateFloat64NaN( a, b );
2827 1.1 ross }
2828 1.1 ross if ( ( bExp | bSig ) == 0 ) {
2829 1.1 ross float_raise( float_flag_invalid );
2830 1.1 ross return float64_default_nan;
2831 1.1 ross }
2832 1.1 ross return packFloat64( zSign, 0x7FF, 0 );
2833 1.1 ross }
2834 1.1 ross if ( bExp == 0x7FF ) {
2835 1.1 ross if ( bSig ) return propagateFloat64NaN( a, b );
2836 1.1 ross if ( ( aExp | aSig ) == 0 ) {
2837 1.1 ross float_raise( float_flag_invalid );
2838 1.1 ross return float64_default_nan;
2839 1.1 ross }
2840 1.1 ross return packFloat64( zSign, 0x7FF, 0 );
2841 1.1 ross }
2842 1.1 ross if ( aExp == 0 ) {
2843 1.1 ross if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2844 1.1 ross normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2845 1.1 ross }
2846 1.1 ross if ( bExp == 0 ) {
2847 1.1 ross if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2848 1.1 ross normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2849 1.1 ross }
2850 1.1 ross zExp = aExp + bExp - 0x3FF;
2851 1.1 ross aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2852 1.1 ross bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2853 1.1 ross mul64To128( aSig, bSig, &zSig0, &zSig1 );
2854 1.1 ross zSig0 |= ( zSig1 != 0 );
2855 1.1 ross if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2856 1.1 ross zSig0 <<= 1;
2857 1.1 ross --zExp;
2858 1.1 ross }
2859 1.1 ross return roundAndPackFloat64( zSign, zExp, zSig0 );
2860 1.1 ross
2861 1.1 ross }
2862 1.1 ross
2863 1.7 thorpej /*----------------------------------------------------------------------------
2864 1.7 thorpej | Returns the result of dividing the double-precision floating-point value `a'
2865 1.7 thorpej | by the corresponding value `b'. The operation is performed according to
2866 1.7 thorpej | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2867 1.7 thorpej *----------------------------------------------------------------------------*/
2868 1.7 thorpej
2869 1.1 ross float64 float64_div( float64 a, float64 b )
2870 1.1 ross {
2871 1.1 ross flag aSign, bSign, zSign;
2872 1.1 ross int16 aExp, bExp, zExp;
2873 1.1 ross bits64 aSig, bSig, zSig;
2874 1.1 ross bits64 rem0, rem1;
2875 1.1 ross bits64 term0, term1;
2876 1.1 ross
2877 1.1 ross aSig = extractFloat64Frac( a );
2878 1.1 ross aExp = extractFloat64Exp( a );
2879 1.1 ross aSign = extractFloat64Sign( a );
2880 1.1 ross bSig = extractFloat64Frac( b );
2881 1.1 ross bExp = extractFloat64Exp( b );
2882 1.1 ross bSign = extractFloat64Sign( b );
2883 1.1 ross zSign = aSign ^ bSign;
2884 1.1 ross if ( aExp == 0x7FF ) {
2885 1.1 ross if ( aSig ) return propagateFloat64NaN( a, b );
2886 1.1 ross if ( bExp == 0x7FF ) {
2887 1.1 ross if ( bSig ) return propagateFloat64NaN( a, b );
2888 1.1 ross float_raise( float_flag_invalid );
2889 1.1 ross return float64_default_nan;
2890 1.1 ross }
2891 1.1 ross return packFloat64( zSign, 0x7FF, 0 );
2892 1.1 ross }
2893 1.1 ross if ( bExp == 0x7FF ) {
2894 1.1 ross if ( bSig ) return propagateFloat64NaN( a, b );
2895 1.1 ross return packFloat64( zSign, 0, 0 );
2896 1.1 ross }
2897 1.1 ross if ( bExp == 0 ) {
2898 1.1 ross if ( bSig == 0 ) {
2899 1.1 ross if ( ( aExp | aSig ) == 0 ) {
2900 1.1 ross float_raise( float_flag_invalid );
2901 1.1 ross return float64_default_nan;
2902 1.1 ross }
2903 1.1 ross float_raise( float_flag_divbyzero );
2904 1.1 ross return packFloat64( zSign, 0x7FF, 0 );
2905 1.1 ross }
2906 1.1 ross normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2907 1.1 ross }
2908 1.1 ross if ( aExp == 0 ) {
2909 1.1 ross if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2910 1.1 ross normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2911 1.1 ross }
2912 1.1 ross zExp = aExp - bExp + 0x3FD;
2913 1.1 ross aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2914 1.1 ross bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2915 1.1 ross if ( bSig <= ( aSig + aSig ) ) {
2916 1.1 ross aSig >>= 1;
2917 1.1 ross ++zExp;
2918 1.1 ross }
2919 1.1 ross zSig = estimateDiv128To64( aSig, 0, bSig );
2920 1.1 ross if ( ( zSig & 0x1FF ) <= 2 ) {
2921 1.1 ross mul64To128( bSig, zSig, &term0, &term1 );
2922 1.1 ross sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2923 1.1 ross while ( (sbits64) rem0 < 0 ) {
2924 1.1 ross --zSig;
2925 1.1 ross add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2926 1.1 ross }
2927 1.1 ross zSig |= ( rem1 != 0 );
2928 1.1 ross }
2929 1.1 ross return roundAndPackFloat64( zSign, zExp, zSig );
2930 1.1 ross
2931 1.1 ross }
2932 1.1 ross
2933 1.1 ross #ifndef SOFTFLOAT_FOR_GCC
2934 1.7 thorpej /*----------------------------------------------------------------------------
2935 1.7 thorpej | Returns the remainder of the double-precision floating-point value `a'
2936 1.7 thorpej | with respect to the corresponding value `b'. The operation is performed
2937 1.7 thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2938 1.7 thorpej *----------------------------------------------------------------------------*/
2939 1.7 thorpej
2940 1.1 ross float64 float64_rem( float64 a, float64 b )
2941 1.1 ross {
2942 1.5 christos flag aSign, bSign __unused, zSign;
2943 1.1 ross int16 aExp, bExp, expDiff;
2944 1.1 ross bits64 aSig, bSig;
2945 1.1 ross bits64 q, alternateASig;
2946 1.1 ross sbits64 sigMean;
2947 1.1 ross
2948 1.1 ross aSig = extractFloat64Frac( a );
2949 1.1 ross aExp = extractFloat64Exp( a );
2950 1.1 ross aSign = extractFloat64Sign( a );
2951 1.1 ross bSig = extractFloat64Frac( b );
2952 1.1 ross bExp = extractFloat64Exp( b );
2953 1.1 ross bSign = extractFloat64Sign( b );
2954 1.1 ross if ( aExp == 0x7FF ) {
2955 1.1 ross if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2956 1.1 ross return propagateFloat64NaN( a, b );
2957 1.1 ross }
2958 1.1 ross float_raise( float_flag_invalid );
2959 1.1 ross return float64_default_nan;
2960 1.1 ross }
2961 1.1 ross if ( bExp == 0x7FF ) {
2962 1.1 ross if ( bSig ) return propagateFloat64NaN( a, b );
2963 1.1 ross return a;
2964 1.1 ross }
2965 1.1 ross if ( bExp == 0 ) {
2966 1.1 ross if ( bSig == 0 ) {
2967 1.1 ross float_raise( float_flag_invalid );
2968 1.1 ross return float64_default_nan;
2969 1.1 ross }
2970 1.1 ross normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2971 1.1 ross }
2972 1.1 ross if ( aExp == 0 ) {
2973 1.1 ross if ( aSig == 0 ) return a;
2974 1.1 ross normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2975 1.1 ross }
2976 1.1 ross expDiff = aExp - bExp;
2977 1.1 ross aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2978 1.1 ross bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2979 1.1 ross if ( expDiff < 0 ) {
2980 1.1 ross if ( expDiff < -1 ) return a;
2981 1.1 ross aSig >>= 1;
2982 1.1 ross }
2983 1.1 ross q = ( bSig <= aSig );
2984 1.1 ross if ( q ) aSig -= bSig;
2985 1.1 ross expDiff -= 64;
2986 1.1 ross while ( 0 < expDiff ) {
2987 1.1 ross q = estimateDiv128To64( aSig, 0, bSig );
2988 1.1 ross q = ( 2 < q ) ? q - 2 : 0;
2989 1.1 ross aSig = - ( ( bSig>>2 ) * q );
2990 1.1 ross expDiff -= 62;
2991 1.1 ross }
2992 1.1 ross expDiff += 64;
2993 1.1 ross if ( 0 < expDiff ) {
2994 1.1 ross q = estimateDiv128To64( aSig, 0, bSig );
2995 1.1 ross q = ( 2 < q ) ? q - 2 : 0;
2996 1.1 ross q >>= 64 - expDiff;
2997 1.1 ross bSig >>= 2;
2998 1.1 ross aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2999 1.1 ross }
3000 1.1 ross else {
3001 1.1 ross aSig >>= 2;
3002 1.1 ross bSig >>= 2;
3003 1.1 ross }
3004 1.1 ross do {
3005 1.1 ross alternateASig = aSig;
3006 1.1 ross ++q;
3007 1.1 ross aSig -= bSig;
3008 1.1 ross } while ( 0 <= (sbits64) aSig );
3009 1.1 ross sigMean = aSig + alternateASig;
3010 1.1 ross if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3011 1.1 ross aSig = alternateASig;
3012 1.1 ross }
3013 1.1 ross zSign = ( (sbits64) aSig < 0 );
3014 1.1 ross if ( zSign ) aSig = - aSig;
3015 1.1 ross return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3016 1.1 ross
3017 1.1 ross }
3018 1.1 ross
3019 1.7 thorpej /*----------------------------------------------------------------------------
3020 1.7 thorpej | Returns the square root of the double-precision floating-point value `a'.
3021 1.7 thorpej | The operation is performed according to the IEC/IEEE Standard for Binary
3022 1.7 thorpej | Floating-Point Arithmetic.
3023 1.7 thorpej *----------------------------------------------------------------------------*/
3024 1.7 thorpej
3025 1.1 ross float64 float64_sqrt( float64 a )
3026 1.1 ross {
3027 1.1 ross flag aSign;
3028 1.1 ross int16 aExp, zExp;
3029 1.1 ross bits64 aSig, zSig, doubleZSig;
3030 1.1 ross bits64 rem0, rem1, term0, term1;
3031 1.1 ross
3032 1.1 ross aSig = extractFloat64Frac( a );
3033 1.1 ross aExp = extractFloat64Exp( a );
3034 1.1 ross aSign = extractFloat64Sign( a );
3035 1.1 ross if ( aExp == 0x7FF ) {
3036 1.1 ross if ( aSig ) return propagateFloat64NaN( a, a );
3037 1.1 ross if ( ! aSign ) return a;
3038 1.1 ross float_raise( float_flag_invalid );
3039 1.1 ross return float64_default_nan;
3040 1.1 ross }
3041 1.1 ross if ( aSign ) {
3042 1.1 ross if ( ( aExp | aSig ) == 0 ) return a;
3043 1.1 ross float_raise( float_flag_invalid );
3044 1.1 ross return float64_default_nan;
3045 1.1 ross }
3046 1.1 ross if ( aExp == 0 ) {
3047 1.1 ross if ( aSig == 0 ) return 0;
3048 1.1 ross normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3049 1.1 ross }
3050 1.1 ross zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3051 1.1 ross aSig |= LIT64( 0x0010000000000000 );
3052 1.1 ross zSig = estimateSqrt32( aExp, aSig>>21 );
3053 1.1 ross aSig <<= 9 - ( aExp & 1 );
3054 1.1 ross zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3055 1.1 ross if ( ( zSig & 0x1FF ) <= 5 ) {
3056 1.1 ross doubleZSig = zSig<<1;
3057 1.1 ross mul64To128( zSig, zSig, &term0, &term1 );
3058 1.1 ross sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3059 1.1 ross while ( (sbits64) rem0 < 0 ) {
3060 1.1 ross --zSig;
3061 1.1 ross doubleZSig -= 2;
3062 1.1 ross add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3063 1.1 ross }
3064 1.1 ross zSig |= ( ( rem0 | rem1 ) != 0 );
3065 1.1 ross }
3066 1.1 ross return roundAndPackFloat64( 0, zExp, zSig );
3067 1.1 ross
3068 1.1 ross }
3069 1.1 ross #endif
3070 1.1 ross
3071 1.7 thorpej /*----------------------------------------------------------------------------
3072 1.7 thorpej | Returns 1 if the double-precision floating-point value `a' is equal to the
3073 1.7 thorpej | corresponding value `b', and 0 otherwise. The comparison is performed
3074 1.7 thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3075 1.7 thorpej *----------------------------------------------------------------------------*/
3076 1.7 thorpej
3077 1.1 ross flag float64_eq( float64 a, float64 b )
3078 1.1 ross {
3079 1.1 ross
3080 1.1 ross if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3081 1.1 ross || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3082 1.1 ross ) {
3083 1.1 ross if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3084 1.1 ross float_raise( float_flag_invalid );
3085 1.1 ross }
3086 1.1 ross return 0;
3087 1.1 ross }
3088 1.1 ross return ( a == b ) ||
3089 1.1 ross ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3090 1.1 ross
3091 1.1 ross }
3092 1.1 ross
3093 1.7 thorpej /*----------------------------------------------------------------------------
3094 1.7 thorpej | Returns 1 if the double-precision floating-point value `a' is less than or
3095 1.7 thorpej | equal to the corresponding value `b', and 0 otherwise. The comparison is
3096 1.7 thorpej | performed according to the IEC/IEEE Standard for Binary Floating-Point
3097 1.7 thorpej | Arithmetic.
3098 1.7 thorpej *----------------------------------------------------------------------------*/
3099 1.7 thorpej
3100 1.1 ross flag float64_le( float64 a, float64 b )
3101 1.1 ross {
3102 1.1 ross flag aSign, bSign;
3103 1.1 ross
3104 1.1 ross if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3105 1.1 ross || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3106 1.1 ross ) {
3107 1.1 ross float_raise( float_flag_invalid );
3108 1.1 ross return 0;
3109 1.1 ross }
3110 1.1 ross aSign = extractFloat64Sign( a );
3111 1.1 ross bSign = extractFloat64Sign( b );
3112 1.1 ross if ( aSign != bSign )
3113 1.1 ross return aSign ||
3114 1.1 ross ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3115 1.1 ross 0 );
3116 1.1 ross return ( a == b ) ||
3117 1.1 ross ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3118 1.1 ross
3119 1.1 ross }
3120 1.1 ross
3121 1.7 thorpej /*----------------------------------------------------------------------------
3122 1.7 thorpej | Returns 1 if the double-precision floating-point value `a' is less than
3123 1.7 thorpej | the corresponding value `b', and 0 otherwise. The comparison is performed
3124 1.7 thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3125 1.7 thorpej *----------------------------------------------------------------------------*/
3126 1.7 thorpej
3127 1.1 ross flag float64_lt( float64 a, float64 b )
3128 1.1 ross {
3129 1.1 ross flag aSign, bSign;
3130 1.1 ross
3131 1.1 ross if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3132 1.1 ross || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3133 1.1 ross ) {
3134 1.1 ross float_raise( float_flag_invalid );
3135 1.1 ross return 0;
3136 1.1 ross }
3137 1.1 ross aSign = extractFloat64Sign( a );
3138 1.1 ross bSign = extractFloat64Sign( b );
3139 1.1 ross if ( aSign != bSign )
3140 1.1 ross return aSign &&
3141 1.1 ross ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3142 1.1 ross 0 );
3143 1.1 ross return ( a != b ) &&
3144 1.1 ross ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3145 1.1 ross
3146 1.1 ross }
3147 1.1 ross
3148 1.1 ross #ifndef SOFTFLOAT_FOR_GCC
3149 1.7 thorpej /*----------------------------------------------------------------------------
3150 1.7 thorpej | Returns 1 if the double-precision floating-point value `a' is equal to the
3151 1.7 thorpej | corresponding value `b', and 0 otherwise. The invalid exception is raised
3152 1.7 thorpej | if either operand is a NaN. Otherwise, the comparison is performed
3153 1.7 thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3154 1.7 thorpej *----------------------------------------------------------------------------*/
3155 1.7 thorpej
3156 1.1 ross flag float64_eq_signaling( float64 a, float64 b )
3157 1.1 ross {
3158 1.1 ross
3159 1.1 ross if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3160 1.1 ross || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3161 1.1 ross ) {
3162 1.1 ross float_raise( float_flag_invalid );
3163 1.1 ross return 0;
3164 1.1 ross }
3165 1.1 ross return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3166 1.1 ross
3167 1.1 ross }
3168 1.1 ross
3169 1.7 thorpej /*----------------------------------------------------------------------------
3170 1.7 thorpej | Returns 1 if the double-precision floating-point value `a' is less than or
3171 1.7 thorpej | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3172 1.7 thorpej | cause an exception. Otherwise, the comparison is performed according to the
3173 1.7 thorpej | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3174 1.7 thorpej *----------------------------------------------------------------------------*/
3175 1.7 thorpej
3176 1.1 ross flag float64_le_quiet( float64 a, float64 b )
3177 1.1 ross {
3178 1.1 ross flag aSign, bSign;
3179 1.1 ross
3180 1.1 ross if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3181 1.1 ross || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3182 1.1 ross ) {
3183 1.1 ross if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3184 1.1 ross float_raise( float_flag_invalid );
3185 1.1 ross }
3186 1.1 ross return 0;
3187 1.1 ross }
3188 1.1 ross aSign = extractFloat64Sign( a );
3189 1.1 ross bSign = extractFloat64Sign( b );
3190 1.1 ross if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3191 1.1 ross return ( a == b ) || ( aSign ^ ( a < b ) );
3192 1.1 ross
3193 1.1 ross }
3194 1.1 ross
3195 1.7 thorpej /*----------------------------------------------------------------------------
3196 1.7 thorpej | Returns 1 if the double-precision floating-point value `a' is less than
3197 1.7 thorpej | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3198 1.7 thorpej | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3199 1.7 thorpej | Standard for Binary Floating-Point Arithmetic.
3200 1.7 thorpej *----------------------------------------------------------------------------*/
3201 1.7 thorpej
3202 1.1 ross flag float64_lt_quiet( float64 a, float64 b )
3203 1.1 ross {
3204 1.1 ross flag aSign, bSign;
3205 1.1 ross
3206 1.1 ross if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3207 1.1 ross || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3208 1.1 ross ) {
3209 1.1 ross if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3210 1.1 ross float_raise( float_flag_invalid );
3211 1.1 ross }
3212 1.1 ross return 0;
3213 1.1 ross }
3214 1.1 ross aSign = extractFloat64Sign( a );
3215 1.1 ross bSign = extractFloat64Sign( b );
3216 1.1 ross if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3217 1.1 ross return ( a != b ) && ( aSign ^ ( a < b ) );
3218 1.1 ross
3219 1.1 ross }
3220 1.1 ross #endif
3221 1.1 ross
3222 1.1 ross #ifdef FLOATX80
3223 1.1 ross
3224 1.7 thorpej /*----------------------------------------------------------------------------
3225 1.7 thorpej | Returns the result of converting the extended double-precision floating-
3226 1.7 thorpej | point value `a' to the 32-bit two's complement integer format. The
3227 1.7 thorpej | conversion is performed according to the IEC/IEEE Standard for Binary
3228 1.7 thorpej | Floating-Point Arithmetic---which means in particular that the conversion
3229 1.7 thorpej | is rounded according to the current rounding mode. If `a' is a NaN, the
3230 1.7 thorpej | largest positive integer is returned. Otherwise, if the conversion
3231 1.7 thorpej | overflows, the largest integer with the same sign as `a' is returned.
3232 1.7 thorpej *----------------------------------------------------------------------------*/
3233 1.7 thorpej
3234 1.1 ross int32 floatx80_to_int32( floatx80 a )
3235 1.1 ross {
3236 1.1 ross flag aSign;
3237 1.1 ross int32 aExp, shiftCount;
3238 1.1 ross bits64 aSig;
3239 1.1 ross
3240 1.1 ross aSig = extractFloatx80Frac( a );
3241 1.1 ross aExp = extractFloatx80Exp( a );
3242 1.1 ross aSign = extractFloatx80Sign( a );
3243 1.1 ross if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3244 1.1 ross shiftCount = 0x4037 - aExp;
3245 1.1 ross if ( shiftCount <= 0 ) shiftCount = 1;
3246 1.1 ross shift64RightJamming( aSig, shiftCount, &aSig );
3247 1.1 ross return roundAndPackInt32( aSign, aSig );
3248 1.1 ross
3249 1.1 ross }
3250 1.1 ross
3251 1.7 thorpej /*----------------------------------------------------------------------------
3252 1.7 thorpej | Returns the result of converting the extended double-precision floating-
3253 1.7 thorpej | point value `a' to the 32-bit two's complement integer format. The
3254 1.7 thorpej | conversion is performed according to the IEC/IEEE Standard for Binary
3255 1.7 thorpej | Floating-Point Arithmetic, except that the conversion is always rounded
3256 1.7 thorpej | toward zero. If `a' is a NaN, the largest positive integer is returned.
3257 1.7 thorpej | Otherwise, if the conversion overflows, the largest integer with the same
3258 1.7 thorpej | sign as `a' is returned.
3259 1.7 thorpej *----------------------------------------------------------------------------*/
3260 1.7 thorpej
3261 1.1 ross int32 floatx80_to_int32_round_to_zero( floatx80 a )
3262 1.1 ross {
3263 1.1 ross flag aSign;
3264 1.1 ross int32 aExp, shiftCount;
3265 1.1 ross bits64 aSig, savedASig;
3266 1.1 ross int32 z;
3267 1.1 ross
3268 1.1 ross aSig = extractFloatx80Frac( a );
3269 1.1 ross aExp = extractFloatx80Exp( a );
3270 1.1 ross aSign = extractFloatx80Sign( a );
3271 1.1 ross if ( 0x401E < aExp ) {
3272 1.1 ross if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3273 1.1 ross goto invalid;
3274 1.1 ross }
3275 1.1 ross else if ( aExp < 0x3FFF ) {
3276 1.1 ross if ( aExp || aSig ) float_set_inexact();
3277 1.1 ross return 0;
3278 1.1 ross }
3279 1.1 ross shiftCount = 0x403E - aExp;
3280 1.1 ross savedASig = aSig;
3281 1.1 ross aSig >>= shiftCount;
3282 1.1 ross z = aSig;
3283 1.1 ross if ( aSign ) z = - z;
3284 1.1 ross if ( ( z < 0 ) ^ aSign ) {
3285 1.1 ross invalid:
3286 1.1 ross float_raise( float_flag_invalid );
3287 1.1 ross return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3288 1.1 ross }
3289 1.1 ross if ( ( aSig<<shiftCount ) != savedASig ) {
3290 1.1 ross float_set_inexact();
3291 1.1 ross }
3292 1.1 ross return z;
3293 1.1 ross
3294 1.1 ross }
3295 1.1 ross
3296 1.7 thorpej /*----------------------------------------------------------------------------
3297 1.7 thorpej | Returns the result of converting the extended double-precision floating-
3298 1.7 thorpej | point value `a' to the 64-bit two's complement integer format. The
3299 1.7 thorpej | conversion is performed according to the IEC/IEEE Standard for Binary
3300 1.7 thorpej | Floating-Point Arithmetic---which means in particular that the conversion
3301 1.7 thorpej | is rounded according to the current rounding mode. If `a' is a NaN,
3302 1.7 thorpej | the largest positive integer is returned. Otherwise, if the conversion
3303 1.7 thorpej | overflows, the largest integer with the same sign as `a' is returned.
3304 1.7 thorpej *----------------------------------------------------------------------------*/
3305 1.7 thorpej
3306 1.1 ross int64 floatx80_to_int64( floatx80 a )
3307 1.1 ross {
3308 1.1 ross flag aSign;
3309 1.1 ross int32 aExp, shiftCount;
3310 1.1 ross bits64 aSig, aSigExtra;
3311 1.1 ross
3312 1.1 ross aSig = extractFloatx80Frac( a );
3313 1.1 ross aExp = extractFloatx80Exp( a );
3314 1.1 ross aSign = extractFloatx80Sign( a );
3315 1.1 ross shiftCount = 0x403E - aExp;
3316 1.1 ross if ( shiftCount <= 0 ) {
3317 1.1 ross if ( shiftCount ) {
3318 1.1 ross float_raise( float_flag_invalid );
3319 1.1 ross if ( ! aSign
3320 1.1 ross || ( ( aExp == 0x7FFF )
3321 1.1 ross && ( aSig != LIT64( 0x8000000000000000 ) ) )
3322 1.1 ross ) {
3323 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
3324 1.1 ross }
3325 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
3326 1.1 ross }
3327 1.1 ross aSigExtra = 0;
3328 1.1 ross }
3329 1.1 ross else {
3330 1.1 ross shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3331 1.1 ross }
3332 1.1 ross return roundAndPackInt64( aSign, aSig, aSigExtra );
3333 1.1 ross
3334 1.1 ross }
3335 1.1 ross
3336 1.7 thorpej /*----------------------------------------------------------------------------
3337 1.7 thorpej | Returns the result of converting the extended double-precision floating-
3338 1.7 thorpej | point value `a' to the 64-bit two's complement integer format. The
3339 1.7 thorpej | conversion is performed according to the IEC/IEEE Standard for Binary
3340 1.7 thorpej | Floating-Point Arithmetic, except that the conversion is always rounded
3341 1.7 thorpej | toward zero. If `a' is a NaN, the largest positive integer is returned.
3342 1.7 thorpej | Otherwise, if the conversion overflows, the largest integer with the same
3343 1.7 thorpej | sign as `a' is returned.
3344 1.7 thorpej *----------------------------------------------------------------------------*/
3345 1.7 thorpej
3346 1.1 ross int64 floatx80_to_int64_round_to_zero( floatx80 a )
3347 1.1 ross {
3348 1.1 ross flag aSign;
3349 1.1 ross int32 aExp, shiftCount;
3350 1.1 ross bits64 aSig;
3351 1.1 ross int64 z;
3352 1.1 ross
3353 1.1 ross aSig = extractFloatx80Frac( a );
3354 1.1 ross aExp = extractFloatx80Exp( a );
3355 1.1 ross aSign = extractFloatx80Sign( a );
3356 1.1 ross shiftCount = aExp - 0x403E;
3357 1.1 ross if ( 0 <= shiftCount ) {
3358 1.1 ross aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3359 1.1 ross if ( ( a.high != 0xC03E ) || aSig ) {
3360 1.1 ross float_raise( float_flag_invalid );
3361 1.1 ross if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3362 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
3363 1.1 ross }
3364 1.1 ross }
3365 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
3366 1.1 ross }
3367 1.1 ross else if ( aExp < 0x3FFF ) {
3368 1.1 ross if ( aExp | aSig ) float_set_inexact();
3369 1.1 ross return 0;
3370 1.1 ross }
3371 1.1 ross z = aSig>>( - shiftCount );
3372 1.1 ross if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3373 1.1 ross float_set_inexact();
3374 1.1 ross }
3375 1.1 ross if ( aSign ) z = - z;
3376 1.1 ross return z;
3377 1.1 ross
3378 1.1 ross }
3379 1.1 ross
3380 1.7 thorpej /*----------------------------------------------------------------------------
3381 1.7 thorpej | Returns the result of converting the extended double-precision floating-
3382 1.7 thorpej | point value `a' to the single-precision floating-point format. The
3383 1.7 thorpej | conversion is performed according to the IEC/IEEE Standard for Binary
3384 1.7 thorpej | Floating-Point Arithmetic.
3385 1.7 thorpej *----------------------------------------------------------------------------*/
3386 1.7 thorpej
3387 1.1 ross float32 floatx80_to_float32( floatx80 a )
3388 1.1 ross {
3389 1.1 ross flag aSign;
3390 1.1 ross int32 aExp;
3391 1.1 ross bits64 aSig;
3392 1.1 ross
3393 1.1 ross aSig = extractFloatx80Frac( a );
3394 1.1 ross aExp = extractFloatx80Exp( a );
3395 1.1 ross aSign = extractFloatx80Sign( a );
3396 1.1 ross if ( aExp == 0x7FFF ) {
3397 1.1 ross if ( (bits64) ( aSig<<1 ) ) {
3398 1.1 ross return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3399 1.1 ross }
3400 1.1 ross return packFloat32( aSign, 0xFF, 0 );
3401 1.1 ross }
3402 1.1 ross shift64RightJamming( aSig, 33, &aSig );
3403 1.1 ross if ( aExp || aSig ) aExp -= 0x3F81;
3404 1.1 ross return roundAndPackFloat32( aSign, aExp, aSig );
3405 1.1 ross
3406 1.1 ross }
3407 1.1 ross
3408 1.7 thorpej /*----------------------------------------------------------------------------
3409 1.7 thorpej | Returns the result of converting the extended double-precision floating-
3410 1.7 thorpej | point value `a' to the double-precision floating-point format. The
3411 1.7 thorpej | conversion is performed according to the IEC/IEEE Standard for Binary
3412 1.7 thorpej | Floating-Point Arithmetic.
3413 1.7 thorpej *----------------------------------------------------------------------------*/
3414 1.7 thorpej
3415 1.1 ross float64 floatx80_to_float64( floatx80 a )
3416 1.1 ross {
3417 1.1 ross flag aSign;
3418 1.1 ross int32 aExp;
3419 1.1 ross bits64 aSig, zSig;
3420 1.1 ross
3421 1.1 ross aSig = extractFloatx80Frac( a );
3422 1.1 ross aExp = extractFloatx80Exp( a );
3423 1.1 ross aSign = extractFloatx80Sign( a );
3424 1.1 ross if ( aExp == 0x7FFF ) {
3425 1.1 ross if ( (bits64) ( aSig<<1 ) ) {
3426 1.1 ross return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3427 1.1 ross }
3428 1.1 ross return packFloat64( aSign, 0x7FF, 0 );
3429 1.1 ross }
3430 1.1 ross shift64RightJamming( aSig, 1, &zSig );
3431 1.1 ross if ( aExp || aSig ) aExp -= 0x3C01;
3432 1.1 ross return roundAndPackFloat64( aSign, aExp, zSig );
3433 1.1 ross
3434 1.1 ross }
3435 1.1 ross
#ifdef FLOAT128

/*----------------------------------------------------------------------------
| Converts the extended double-precision floating-point value `a' to the
| quadruple-precision floating-point format, following the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 floatx80_to_float128(floatx80 a)
{
    flag signA;
    int16 expA;
    bits64 sigA, sigHi, sigLo;

    sigA = extractFloatx80Frac(a);
    expA = extractFloatx80Exp(a);
    signA = extractFloatx80Sign(a);

    if ((expA == 0x7FFF) && (bits64)(sigA << 1)) {
        return commonNaNToFloat128(floatx80ToCommonNaN(a));
    }

    /* Discard the top significand bit, then shift right by 16 bits. */
    shift128Right(sigA << 1, 0, 16, &sigHi, &sigLo);
    return packFloat128(signA, expA, sigHi, sigLo);
}

#endif
3463 1.1 ross
3464 1.7 thorpej /*----------------------------------------------------------------------------
3465 1.7 thorpej | Rounds the extended double-precision floating-point value `a' to an integer,
3466 1.7 thorpej | and returns the result as an extended quadruple-precision floating-point
3467 1.7 thorpej | value. The operation is performed according to the IEC/IEEE Standard for
3468 1.7 thorpej | Binary Floating-Point Arithmetic.
3469 1.7 thorpej *----------------------------------------------------------------------------*/
3470 1.7 thorpej
3471 1.1 ross floatx80 floatx80_round_to_int( floatx80 a )
3472 1.1 ross {
3473 1.1 ross flag aSign;
3474 1.1 ross int32 aExp;
3475 1.1 ross bits64 lastBitMask, roundBitsMask;
3476 1.1 ross int8 roundingMode;
3477 1.1 ross floatx80 z;
3478 1.1 ross
3479 1.1 ross aExp = extractFloatx80Exp( a );
3480 1.1 ross if ( 0x403E <= aExp ) {
3481 1.1 ross if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3482 1.1 ross return propagateFloatx80NaN( a, a );
3483 1.1 ross }
3484 1.1 ross return a;
3485 1.1 ross }
3486 1.1 ross if ( aExp < 0x3FFF ) {
3487 1.1 ross if ( ( aExp == 0 )
3488 1.1 ross && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3489 1.1 ross return a;
3490 1.1 ross }
3491 1.1 ross float_set_inexact();
3492 1.1 ross aSign = extractFloatx80Sign( a );
3493 1.1 ross switch ( float_rounding_mode() ) {
3494 1.1 ross case float_round_nearest_even:
3495 1.1 ross if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3496 1.1 ross ) {
3497 1.1 ross return
3498 1.1 ross packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3499 1.1 ross }
3500 1.1 ross break;
3501 1.1 ross case float_round_down:
3502 1.1 ross return
3503 1.1 ross aSign ?
3504 1.1 ross packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3505 1.1 ross : packFloatx80( 0, 0, 0 );
3506 1.1 ross case float_round_up:
3507 1.1 ross return
3508 1.1 ross aSign ? packFloatx80( 1, 0, 0 )
3509 1.1 ross : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3510 1.1 ross }
3511 1.1 ross return packFloatx80( aSign, 0, 0 );
3512 1.1 ross }
3513 1.1 ross lastBitMask = 1;
3514 1.1 ross lastBitMask <<= 0x403E - aExp;
3515 1.1 ross roundBitsMask = lastBitMask - 1;
3516 1.1 ross z = a;
3517 1.1 ross roundingMode = float_rounding_mode();
3518 1.1 ross if ( roundingMode == float_round_nearest_even ) {
3519 1.1 ross z.low += lastBitMask>>1;
3520 1.1 ross if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3521 1.1 ross }
3522 1.1 ross else if ( roundingMode != float_round_to_zero ) {
3523 1.1 ross if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3524 1.1 ross z.low += roundBitsMask;
3525 1.1 ross }
3526 1.1 ross }
3527 1.1 ross z.low &= ~ roundBitsMask;
3528 1.1 ross if ( z.low == 0 ) {
3529 1.1 ross ++z.high;
3530 1.1 ross z.low = LIT64( 0x8000000000000000 );
3531 1.1 ross }
3532 1.1 ross if ( z.low != a.low ) float_set_inexact();
3533 1.1 ross return z;
3534 1.1 ross
3535 1.1 ross }
3536 1.1 ross
3537 1.7 thorpej /*----------------------------------------------------------------------------
3538 1.7 thorpej | Returns the result of adding the absolute values of the extended double-
3539 1.7 thorpej | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3540 1.7 thorpej | negated before being returned. `zSign' is ignored if the result is a NaN.
3541 1.7 thorpej | The addition is performed according to the IEC/IEEE Standard for Binary
3542 1.7 thorpej | Floating-Point Arithmetic.
3543 1.7 thorpej *----------------------------------------------------------------------------*/
3544 1.7 thorpej
3545 1.1 ross static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3546 1.1 ross {
3547 1.1 ross int32 aExp, bExp, zExp;
3548 1.1 ross bits64 aSig, bSig, zSig0, zSig1;
3549 1.1 ross int32 expDiff;
3550 1.1 ross
3551 1.1 ross aSig = extractFloatx80Frac( a );
3552 1.1 ross aExp = extractFloatx80Exp( a );
3553 1.1 ross bSig = extractFloatx80Frac( b );
3554 1.1 ross bExp = extractFloatx80Exp( b );
3555 1.1 ross expDiff = aExp - bExp;
3556 1.1 ross if ( 0 < expDiff ) {
3557 1.1 ross if ( aExp == 0x7FFF ) {
3558 1.1 ross if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3559 1.1 ross return a;
3560 1.1 ross }
3561 1.1 ross if ( bExp == 0 ) --expDiff;
3562 1.1 ross shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3563 1.1 ross zExp = aExp;
3564 1.1 ross }
3565 1.1 ross else if ( expDiff < 0 ) {
3566 1.1 ross if ( bExp == 0x7FFF ) {
3567 1.1 ross if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3568 1.1 ross return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3569 1.1 ross }
3570 1.1 ross if ( aExp == 0 ) ++expDiff;
3571 1.1 ross shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3572 1.1 ross zExp = bExp;
3573 1.1 ross }
3574 1.1 ross else {
3575 1.1 ross if ( aExp == 0x7FFF ) {
3576 1.1 ross if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3577 1.1 ross return propagateFloatx80NaN( a, b );
3578 1.1 ross }
3579 1.1 ross return a;
3580 1.1 ross }
3581 1.1 ross zSig1 = 0;
3582 1.1 ross zSig0 = aSig + bSig;
3583 1.1 ross if ( aExp == 0 ) {
3584 1.1 ross normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3585 1.1 ross goto roundAndPack;
3586 1.1 ross }
3587 1.1 ross zExp = aExp;
3588 1.1 ross goto shiftRight1;
3589 1.1 ross }
3590 1.1 ross zSig0 = aSig + bSig;
3591 1.1 ross if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3592 1.1 ross shiftRight1:
3593 1.1 ross shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3594 1.1 ross zSig0 |= LIT64( 0x8000000000000000 );
3595 1.1 ross ++zExp;
3596 1.1 ross roundAndPack:
3597 1.1 ross return
3598 1.1 ross roundAndPackFloatx80(
3599 1.1 ross floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3600 1.1 ross
3601 1.1 ross }
3602 1.1 ross
3603 1.7 thorpej /*----------------------------------------------------------------------------
3604 1.7 thorpej | Returns the result of subtracting the absolute values of the extended
3605 1.7 thorpej | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3606 1.7 thorpej | difference is negated before being returned. `zSign' is ignored if the
3607 1.7 thorpej | result is a NaN. The subtraction is performed according to the IEC/IEEE
3608 1.7 thorpej | Standard for Binary Floating-Point Arithmetic.
3609 1.7 thorpej *----------------------------------------------------------------------------*/
3610 1.7 thorpej
3611 1.1 ross static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3612 1.1 ross {
3613 1.1 ross int32 aExp, bExp, zExp;
3614 1.1 ross bits64 aSig, bSig, zSig0, zSig1;
3615 1.1 ross int32 expDiff;
3616 1.1 ross floatx80 z;
3617 1.1 ross
3618 1.1 ross aSig = extractFloatx80Frac( a );
3619 1.1 ross aExp = extractFloatx80Exp( a );
3620 1.1 ross bSig = extractFloatx80Frac( b );
3621 1.1 ross bExp = extractFloatx80Exp( b );
3622 1.1 ross expDiff = aExp - bExp;
3623 1.1 ross if ( 0 < expDiff ) goto aExpBigger;
3624 1.1 ross if ( expDiff < 0 ) goto bExpBigger;
3625 1.1 ross if ( aExp == 0x7FFF ) {
3626 1.1 ross if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3627 1.1 ross return propagateFloatx80NaN( a, b );
3628 1.1 ross }
3629 1.1 ross float_raise( float_flag_invalid );
3630 1.1 ross z.low = floatx80_default_nan_low;
3631 1.1 ross z.high = floatx80_default_nan_high;
3632 1.1 ross return z;
3633 1.1 ross }
3634 1.1 ross if ( aExp == 0 ) {
3635 1.1 ross aExp = 1;
3636 1.1 ross bExp = 1;
3637 1.1 ross }
3638 1.1 ross zSig1 = 0;
3639 1.1 ross if ( bSig < aSig ) goto aBigger;
3640 1.1 ross if ( aSig < bSig ) goto bBigger;
3641 1.1 ross return packFloatx80( float_rounding_mode() == float_round_down, 0, 0 );
3642 1.1 ross bExpBigger:
3643 1.1 ross if ( bExp == 0x7FFF ) {
3644 1.1 ross if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3645 1.1 ross return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3646 1.1 ross }
3647 1.1 ross if ( aExp == 0 ) ++expDiff;
3648 1.1 ross shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3649 1.1 ross bBigger:
3650 1.1 ross sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3651 1.1 ross zExp = bExp;
3652 1.1 ross zSign ^= 1;
3653 1.1 ross goto normalizeRoundAndPack;
3654 1.1 ross aExpBigger:
3655 1.1 ross if ( aExp == 0x7FFF ) {
3656 1.1 ross if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3657 1.1 ross return a;
3658 1.1 ross }
3659 1.1 ross if ( bExp == 0 ) --expDiff;
3660 1.1 ross shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3661 1.1 ross aBigger:
3662 1.1 ross sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3663 1.1 ross zExp = aExp;
3664 1.1 ross normalizeRoundAndPack:
3665 1.1 ross return
3666 1.1 ross normalizeRoundAndPackFloatx80(
3667 1.1 ross floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3668 1.1 ross
3669 1.1 ross }
3670 1.1 ross
3671 1.7 thorpej /*----------------------------------------------------------------------------
3672 1.7 thorpej | Returns the result of adding the extended double-precision floating-point
3673 1.7 thorpej | values `a' and `b'. The operation is performed according to the IEC/IEEE
3674 1.7 thorpej | Standard for Binary Floating-Point Arithmetic.
3675 1.7 thorpej *----------------------------------------------------------------------------*/
3676 1.7 thorpej
3677 1.1 ross floatx80 floatx80_add( floatx80 a, floatx80 b )
3678 1.1 ross {
3679 1.1 ross flag aSign, bSign;
3680 1.1 ross
3681 1.1 ross aSign = extractFloatx80Sign( a );
3682 1.1 ross bSign = extractFloatx80Sign( b );
3683 1.1 ross if ( aSign == bSign ) {
3684 1.1 ross return addFloatx80Sigs( a, b, aSign );
3685 1.1 ross }
3686 1.1 ross else {
3687 1.1 ross return subFloatx80Sigs( a, b, aSign );
3688 1.1 ross }
3689 1.1 ross
3690 1.1 ross }
3691 1.1 ross
3692 1.7 thorpej /*----------------------------------------------------------------------------
3693 1.7 thorpej | Returns the result of subtracting the extended double-precision floating-
3694 1.7 thorpej | point values `a' and `b'. The operation is performed according to the
3695 1.7 thorpej | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3696 1.7 thorpej *----------------------------------------------------------------------------*/
3697 1.7 thorpej
3698 1.1 ross floatx80 floatx80_sub( floatx80 a, floatx80 b )
3699 1.1 ross {
3700 1.1 ross flag aSign, bSign;
3701 1.1 ross
3702 1.1 ross aSign = extractFloatx80Sign( a );
3703 1.1 ross bSign = extractFloatx80Sign( b );
3704 1.1 ross if ( aSign == bSign ) {
3705 1.1 ross return subFloatx80Sigs( a, b, aSign );
3706 1.1 ross }
3707 1.1 ross else {
3708 1.1 ross return addFloatx80Sigs( a, b, aSign );
3709 1.1 ross }
3710 1.1 ross
3711 1.1 ross }
3712 1.1 ross
3713 1.7 thorpej /*----------------------------------------------------------------------------
3714 1.7 thorpej | Returns the result of multiplying the extended double-precision floating-
3715 1.7 thorpej | point values `a' and `b'. The operation is performed according to the
3716 1.7 thorpej | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3717 1.7 thorpej *----------------------------------------------------------------------------*/
3718 1.7 thorpej
3719 1.1 ross floatx80 floatx80_mul( floatx80 a, floatx80 b )
3720 1.1 ross {
3721 1.1 ross flag aSign, bSign, zSign;
3722 1.1 ross int32 aExp, bExp, zExp;
3723 1.1 ross bits64 aSig, bSig, zSig0, zSig1;
3724 1.1 ross floatx80 z;
3725 1.1 ross
3726 1.1 ross aSig = extractFloatx80Frac( a );
3727 1.1 ross aExp = extractFloatx80Exp( a );
3728 1.1 ross aSign = extractFloatx80Sign( a );
3729 1.1 ross bSig = extractFloatx80Frac( b );
3730 1.1 ross bExp = extractFloatx80Exp( b );
3731 1.1 ross bSign = extractFloatx80Sign( b );
3732 1.1 ross zSign = aSign ^ bSign;
3733 1.1 ross if ( aExp == 0x7FFF ) {
3734 1.1 ross if ( (bits64) ( aSig<<1 )
3735 1.1 ross || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3736 1.1 ross return propagateFloatx80NaN( a, b );
3737 1.1 ross }
3738 1.1 ross if ( ( bExp | bSig ) == 0 ) goto invalid;
3739 1.1 ross return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3740 1.1 ross }
3741 1.1 ross if ( bExp == 0x7FFF ) {
3742 1.1 ross if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3743 1.1 ross if ( ( aExp | aSig ) == 0 ) {
3744 1.1 ross invalid:
3745 1.1 ross float_raise( float_flag_invalid );
3746 1.1 ross z.low = floatx80_default_nan_low;
3747 1.1 ross z.high = floatx80_default_nan_high;
3748 1.1 ross return z;
3749 1.1 ross }
3750 1.1 ross return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3751 1.1 ross }
3752 1.1 ross if ( aExp == 0 ) {
3753 1.1 ross if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3754 1.1 ross normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3755 1.1 ross }
3756 1.1 ross if ( bExp == 0 ) {
3757 1.1 ross if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3758 1.1 ross normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3759 1.1 ross }
3760 1.1 ross zExp = aExp + bExp - 0x3FFE;
3761 1.1 ross mul64To128( aSig, bSig, &zSig0, &zSig1 );
3762 1.1 ross if ( 0 < (sbits64) zSig0 ) {
3763 1.1 ross shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3764 1.1 ross --zExp;
3765 1.1 ross }
3766 1.1 ross return
3767 1.1 ross roundAndPackFloatx80(
3768 1.1 ross floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3769 1.1 ross
3770 1.1 ross }
3771 1.1 ross
3772 1.7 thorpej /*----------------------------------------------------------------------------
3773 1.7 thorpej | Returns the result of dividing the extended double-precision floating-point
3774 1.7 thorpej | value `a' by the corresponding value `b'. The operation is performed
3775 1.7 thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3776 1.7 thorpej *----------------------------------------------------------------------------*/
3777 1.7 thorpej
3778 1.1 ross floatx80 floatx80_div( floatx80 a, floatx80 b )
3779 1.1 ross {
3780 1.1 ross flag aSign, bSign, zSign;
3781 1.1 ross int32 aExp, bExp, zExp;
3782 1.1 ross bits64 aSig, bSig, zSig0, zSig1;
3783 1.1 ross bits64 rem0, rem1, rem2, term0, term1, term2;
3784 1.1 ross floatx80 z;
3785 1.1 ross
3786 1.1 ross aSig = extractFloatx80Frac( a );
3787 1.1 ross aExp = extractFloatx80Exp( a );
3788 1.1 ross aSign = extractFloatx80Sign( a );
3789 1.1 ross bSig = extractFloatx80Frac( b );
3790 1.1 ross bExp = extractFloatx80Exp( b );
3791 1.1 ross bSign = extractFloatx80Sign( b );
3792 1.1 ross zSign = aSign ^ bSign;
3793 1.1 ross if ( aExp == 0x7FFF ) {
3794 1.1 ross if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3795 1.1 ross if ( bExp == 0x7FFF ) {
3796 1.1 ross if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3797 1.1 ross goto invalid;
3798 1.1 ross }
3799 1.1 ross return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3800 1.1 ross }
3801 1.1 ross if ( bExp == 0x7FFF ) {
3802 1.1 ross if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3803 1.1 ross return packFloatx80( zSign, 0, 0 );
3804 1.1 ross }
3805 1.1 ross if ( bExp == 0 ) {
3806 1.1 ross if ( bSig == 0 ) {
3807 1.1 ross if ( ( aExp | aSig ) == 0 ) {
3808 1.1 ross invalid:
3809 1.1 ross float_raise( float_flag_invalid );
3810 1.1 ross z.low = floatx80_default_nan_low;
3811 1.1 ross z.high = floatx80_default_nan_high;
3812 1.1 ross return z;
3813 1.1 ross }
3814 1.1 ross float_raise( float_flag_divbyzero );
3815 1.1 ross return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3816 1.1 ross }
3817 1.1 ross normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3818 1.1 ross }
3819 1.1 ross if ( aExp == 0 ) {
3820 1.1 ross if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3821 1.1 ross normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3822 1.1 ross }
3823 1.1 ross zExp = aExp - bExp + 0x3FFE;
3824 1.1 ross rem1 = 0;
3825 1.1 ross if ( bSig <= aSig ) {
3826 1.1 ross shift128Right( aSig, 0, 1, &aSig, &rem1 );
3827 1.1 ross ++zExp;
3828 1.1 ross }
3829 1.1 ross zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3830 1.1 ross mul64To128( bSig, zSig0, &term0, &term1 );
3831 1.1 ross sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3832 1.1 ross while ( (sbits64) rem0 < 0 ) {
3833 1.1 ross --zSig0;
3834 1.1 ross add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3835 1.1 ross }
3836 1.1 ross zSig1 = estimateDiv128To64( rem1, 0, bSig );
3837 1.1 ross if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3838 1.1 ross mul64To128( bSig, zSig1, &term1, &term2 );
3839 1.1 ross sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3840 1.1 ross while ( (sbits64) rem1 < 0 ) {
3841 1.1 ross --zSig1;
3842 1.1 ross add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3843 1.1 ross }
3844 1.1 ross zSig1 |= ( ( rem1 | rem2 ) != 0 );
3845 1.1 ross }
3846 1.1 ross return
3847 1.1 ross roundAndPackFloatx80(
3848 1.1 ross floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3849 1.1 ross
3850 1.1 ross }
3851 1.1 ross
3852 1.7 thorpej /*----------------------------------------------------------------------------
3853 1.7 thorpej | Returns the remainder of the extended double-precision floating-point value
3854 1.7 thorpej | `a' with respect to the corresponding value `b'. The operation is performed
3855 1.7 thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3856 1.7 thorpej *----------------------------------------------------------------------------*/
3857 1.7 thorpej
3858 1.1 ross floatx80 floatx80_rem( floatx80 a, floatx80 b )
3859 1.1 ross {
3860 1.1 ross flag aSign, bSign, zSign;
3861 1.1 ross int32 aExp, bExp, expDiff;
3862 1.1 ross bits64 aSig0, aSig1, bSig;
3863 1.1 ross bits64 q, term0, term1, alternateASig0, alternateASig1;
3864 1.1 ross floatx80 z;
3865 1.1 ross
3866 1.1 ross aSig0 = extractFloatx80Frac( a );
3867 1.1 ross aExp = extractFloatx80Exp( a );
3868 1.1 ross aSign = extractFloatx80Sign( a );
3869 1.1 ross bSig = extractFloatx80Frac( b );
3870 1.1 ross bExp = extractFloatx80Exp( b );
3871 1.1 ross bSign = extractFloatx80Sign( b );
3872 1.1 ross if ( aExp == 0x7FFF ) {
3873 1.1 ross if ( (bits64) ( aSig0<<1 )
3874 1.1 ross || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3875 1.1 ross return propagateFloatx80NaN( a, b );
3876 1.1 ross }
3877 1.1 ross goto invalid;
3878 1.1 ross }
3879 1.1 ross if ( bExp == 0x7FFF ) {
3880 1.1 ross if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3881 1.1 ross return a;
3882 1.1 ross }
3883 1.1 ross if ( bExp == 0 ) {
3884 1.1 ross if ( bSig == 0 ) {
3885 1.1 ross invalid:
3886 1.1 ross float_raise( float_flag_invalid );
3887 1.1 ross z.low = floatx80_default_nan_low;
3888 1.1 ross z.high = floatx80_default_nan_high;
3889 1.1 ross return z;
3890 1.1 ross }
3891 1.1 ross normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3892 1.1 ross }
3893 1.1 ross if ( aExp == 0 ) {
3894 1.1 ross if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3895 1.1 ross normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3896 1.1 ross }
3897 1.1 ross bSig |= LIT64( 0x8000000000000000 );
3898 1.1 ross zSign = aSign;
3899 1.1 ross expDiff = aExp - bExp;
3900 1.1 ross aSig1 = 0;
3901 1.1 ross if ( expDiff < 0 ) {
3902 1.1 ross if ( expDiff < -1 ) return a;
3903 1.1 ross shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3904 1.1 ross expDiff = 0;
3905 1.1 ross }
3906 1.1 ross q = ( bSig <= aSig0 );
3907 1.1 ross if ( q ) aSig0 -= bSig;
3908 1.1 ross expDiff -= 64;
3909 1.1 ross while ( 0 < expDiff ) {
3910 1.1 ross q = estimateDiv128To64( aSig0, aSig1, bSig );
3911 1.1 ross q = ( 2 < q ) ? q - 2 : 0;
3912 1.1 ross mul64To128( bSig, q, &term0, &term1 );
3913 1.1 ross sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3914 1.1 ross shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3915 1.1 ross expDiff -= 62;
3916 1.1 ross }
3917 1.1 ross expDiff += 64;
3918 1.1 ross if ( 0 < expDiff ) {
3919 1.1 ross q = estimateDiv128To64( aSig0, aSig1, bSig );
3920 1.1 ross q = ( 2 < q ) ? q - 2 : 0;
3921 1.1 ross q >>= 64 - expDiff;
3922 1.1 ross mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3923 1.1 ross sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3924 1.1 ross shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3925 1.1 ross while ( le128( term0, term1, aSig0, aSig1 ) ) {
3926 1.1 ross ++q;
3927 1.1 ross sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3928 1.1 ross }
3929 1.1 ross }
3930 1.1 ross else {
3931 1.1 ross term1 = 0;
3932 1.1 ross term0 = bSig;
3933 1.1 ross }
3934 1.1 ross sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3935 1.1 ross if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3936 1.1 ross || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3937 1.1 ross && ( q & 1 ) )
3938 1.1 ross ) {
3939 1.1 ross aSig0 = alternateASig0;
3940 1.1 ross aSig1 = alternateASig1;
3941 1.1 ross zSign = ! zSign;
3942 1.1 ross }
3943 1.1 ross return
3944 1.1 ross normalizeRoundAndPackFloatx80(
3945 1.1 ross 80, zSign, bExp + expDiff, aSig0, aSig1 );
3946 1.1 ross
3947 1.1 ross }
3948 1.1 ross
3949 1.7 thorpej /*----------------------------------------------------------------------------
3950 1.7 thorpej | Returns the square root of the extended double-precision floating-point
3951 1.7 thorpej | value `a'. The operation is performed according to the IEC/IEEE Standard
3952 1.7 thorpej | for Binary Floating-Point Arithmetic.
3953 1.7 thorpej *----------------------------------------------------------------------------*/
3954 1.7 thorpej
3955 1.1 ross floatx80 floatx80_sqrt( floatx80 a )
3956 1.1 ross {
3957 1.1 ross flag aSign;
3958 1.1 ross int32 aExp, zExp;
3959 1.1 ross bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3960 1.1 ross bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3961 1.1 ross floatx80 z;
3962 1.1 ross
3963 1.1 ross aSig0 = extractFloatx80Frac( a );
3964 1.1 ross aExp = extractFloatx80Exp( a );
3965 1.1 ross aSign = extractFloatx80Sign( a );
3966 1.1 ross if ( aExp == 0x7FFF ) {
3967 1.1 ross if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3968 1.1 ross if ( ! aSign ) return a;
3969 1.1 ross goto invalid;
3970 1.1 ross }
3971 1.1 ross if ( aSign ) {
3972 1.1 ross if ( ( aExp | aSig0 ) == 0 ) return a;
3973 1.1 ross invalid:
3974 1.1 ross float_raise( float_flag_invalid );
3975 1.1 ross z.low = floatx80_default_nan_low;
3976 1.1 ross z.high = floatx80_default_nan_high;
3977 1.1 ross return z;
3978 1.1 ross }
3979 1.1 ross if ( aExp == 0 ) {
3980 1.1 ross if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3981 1.1 ross normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3982 1.1 ross }
3983 1.1 ross zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3984 1.1 ross zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3985 1.1 ross shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
3986 1.1 ross zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
3987 1.1 ross doubleZSig0 = zSig0<<1;
3988 1.1 ross mul64To128( zSig0, zSig0, &term0, &term1 );
3989 1.1 ross sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3990 1.1 ross while ( (sbits64) rem0 < 0 ) {
3991 1.1 ross --zSig0;
3992 1.1 ross doubleZSig0 -= 2;
3993 1.1 ross add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
3994 1.1 ross }
3995 1.1 ross zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
3996 1.1 ross if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
3997 1.1 ross if ( zSig1 == 0 ) zSig1 = 1;
3998 1.1 ross mul64To128( doubleZSig0, zSig1, &term1, &term2 );
3999 1.1 ross sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4000 1.1 ross mul64To128( zSig1, zSig1, &term2, &term3 );
4001 1.1 ross sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4002 1.1 ross while ( (sbits64) rem1 < 0 ) {
4003 1.1 ross --zSig1;
4004 1.1 ross shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4005 1.1 ross term3 |= 1;
4006 1.1 ross term2 |= doubleZSig0;
4007 1.1 ross add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4008 1.1 ross }
4009 1.1 ross zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4010 1.1 ross }
4011 1.1 ross shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4012 1.1 ross zSig0 |= doubleZSig0;
4013 1.1 ross return
4014 1.1 ross roundAndPackFloatx80(
4015 1.1 ross floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4016 1.1 ross
4017 1.1 ross }
4018 1.1 ross
4019 1.7 thorpej /*----------------------------------------------------------------------------
4020 1.7 thorpej | Returns 1 if the extended double-precision floating-point value `a' is
4021 1.7 thorpej | equal to the corresponding value `b', and 0 otherwise. The comparison is
4022 1.7 thorpej | performed according to the IEC/IEEE Standard for Binary Floating-Point
4023 1.7 thorpej | Arithmetic.
4024 1.7 thorpej *----------------------------------------------------------------------------*/
4025 1.7 thorpej
4026 1.1 ross flag floatx80_eq( floatx80 a, floatx80 b )
4027 1.1 ross {
4028 1.1 ross
4029 1.1 ross if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4030 1.1 ross && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4031 1.1 ross || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4032 1.1 ross && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4033 1.1 ross ) {
4034 1.1 ross if ( floatx80_is_signaling_nan( a )
4035 1.1 ross || floatx80_is_signaling_nan( b ) ) {
4036 1.1 ross float_raise( float_flag_invalid );
4037 1.1 ross }
4038 1.1 ross return 0;
4039 1.1 ross }
4040 1.1 ross return
4041 1.1 ross ( a.low == b.low )
4042 1.1 ross && ( ( a.high == b.high )
4043 1.1 ross || ( ( a.low == 0 )
4044 1.1 ross && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4045 1.1 ross );
4046 1.1 ross
4047 1.1 ross }
4048 1.1 ross
4049 1.7 thorpej /*----------------------------------------------------------------------------
4050 1.7 thorpej | Returns 1 if the extended double-precision floating-point value `a' is
4051 1.7 thorpej | less than or equal to the corresponding value `b', and 0 otherwise. The
4052 1.7 thorpej | comparison is performed according to the IEC/IEEE Standard for Binary
4053 1.7 thorpej | Floating-Point Arithmetic.
4054 1.7 thorpej *----------------------------------------------------------------------------*/
4055 1.7 thorpej
4056 1.1 ross flag floatx80_le( floatx80 a, floatx80 b )
4057 1.1 ross {
4058 1.1 ross flag aSign, bSign;
4059 1.1 ross
4060 1.1 ross if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4061 1.1 ross && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4062 1.1 ross || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4063 1.1 ross && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4064 1.1 ross ) {
4065 1.1 ross float_raise( float_flag_invalid );
4066 1.1 ross return 0;
4067 1.1 ross }
4068 1.1 ross aSign = extractFloatx80Sign( a );
4069 1.1 ross bSign = extractFloatx80Sign( b );
4070 1.1 ross if ( aSign != bSign ) {
4071 1.1 ross return
4072 1.1 ross aSign
4073 1.1 ross || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4074 1.1 ross == 0 );
4075 1.1 ross }
4076 1.1 ross return
4077 1.1 ross aSign ? le128( b.high, b.low, a.high, a.low )
4078 1.1 ross : le128( a.high, a.low, b.high, b.low );
4079 1.1 ross
4080 1.1 ross }
4081 1.1 ross
4082 1.7 thorpej /*----------------------------------------------------------------------------
4083 1.7 thorpej | Returns 1 if the extended double-precision floating-point value `a' is
4084 1.7 thorpej | less than the corresponding value `b', and 0 otherwise. The comparison
4085 1.7 thorpej | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4086 1.7 thorpej | Arithmetic.
4087 1.7 thorpej *----------------------------------------------------------------------------*/
4088 1.7 thorpej
4089 1.1 ross flag floatx80_lt( floatx80 a, floatx80 b )
4090 1.1 ross {
4091 1.1 ross flag aSign, bSign;
4092 1.1 ross
4093 1.1 ross if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4094 1.1 ross && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4095 1.1 ross || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4096 1.1 ross && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4097 1.1 ross ) {
4098 1.1 ross float_raise( float_flag_invalid );
4099 1.1 ross return 0;
4100 1.1 ross }
4101 1.1 ross aSign = extractFloatx80Sign( a );
4102 1.1 ross bSign = extractFloatx80Sign( b );
4103 1.1 ross if ( aSign != bSign ) {
4104 1.1 ross return
4105 1.1 ross aSign
4106 1.1 ross && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4107 1.1 ross != 0 );
4108 1.1 ross }
4109 1.1 ross return
4110 1.1 ross aSign ? lt128( b.high, b.low, a.high, a.low )
4111 1.1 ross : lt128( a.high, a.low, b.high, b.low );
4112 1.1 ross
4113 1.1 ross }
4114 1.1 ross
4115 1.7 thorpej /*----------------------------------------------------------------------------
4116 1.7 thorpej | Returns 1 if the extended double-precision floating-point value `a' is equal
4117 1.7 thorpej | to the corresponding value `b', and 0 otherwise. The invalid exception is
4118 1.7 thorpej | raised if either operand is a NaN. Otherwise, the comparison is performed
4119 1.7 thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4120 1.7 thorpej *----------------------------------------------------------------------------*/
4121 1.7 thorpej
4122 1.1 ross flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4123 1.1 ross {
4124 1.1 ross
4125 1.1 ross if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4126 1.1 ross && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4127 1.1 ross || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4128 1.1 ross && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4129 1.1 ross ) {
4130 1.1 ross float_raise( float_flag_invalid );
4131 1.1 ross return 0;
4132 1.1 ross }
4133 1.1 ross return
4134 1.1 ross ( a.low == b.low )
4135 1.1 ross && ( ( a.high == b.high )
4136 1.1 ross || ( ( a.low == 0 )
4137 1.1 ross && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4138 1.1 ross );
4139 1.1 ross
4140 1.1 ross }
4141 1.1 ross
4142 1.7 thorpej /*----------------------------------------------------------------------------
4143 1.7 thorpej | Returns 1 if the extended double-precision floating-point value `a' is less
4144 1.7 thorpej | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4145 1.7 thorpej | do not cause an exception. Otherwise, the comparison is performed according
4146 1.7 thorpej | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4147 1.7 thorpej *----------------------------------------------------------------------------*/
4148 1.7 thorpej
4149 1.1 ross flag floatx80_le_quiet( floatx80 a, floatx80 b )
4150 1.1 ross {
4151 1.1 ross flag aSign, bSign;
4152 1.1 ross
4153 1.1 ross if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4154 1.1 ross && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4155 1.1 ross || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4156 1.1 ross && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4157 1.1 ross ) {
4158 1.1 ross if ( floatx80_is_signaling_nan( a )
4159 1.1 ross || floatx80_is_signaling_nan( b ) ) {
4160 1.1 ross float_raise( float_flag_invalid );
4161 1.1 ross }
4162 1.1 ross return 0;
4163 1.1 ross }
4164 1.1 ross aSign = extractFloatx80Sign( a );
4165 1.1 ross bSign = extractFloatx80Sign( b );
4166 1.1 ross if ( aSign != bSign ) {
4167 1.1 ross return
4168 1.1 ross aSign
4169 1.1 ross || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4170 1.1 ross == 0 );
4171 1.1 ross }
4172 1.1 ross return
4173 1.1 ross aSign ? le128( b.high, b.low, a.high, a.low )
4174 1.1 ross : le128( a.high, a.low, b.high, b.low );
4175 1.1 ross
4176 1.1 ross }
4177 1.1 ross
4178 1.7 thorpej /*----------------------------------------------------------------------------
4179 1.7 thorpej | Returns 1 if the extended double-precision floating-point value `a' is less
4180 1.7 thorpej | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4181 1.7 thorpej | an exception. Otherwise, the comparison is performed according to the
4182 1.7 thorpej | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4183 1.7 thorpej *----------------------------------------------------------------------------*/
4184 1.7 thorpej
4185 1.1 ross flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4186 1.1 ross {
4187 1.1 ross flag aSign, bSign;
4188 1.1 ross
4189 1.1 ross if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4190 1.1 ross && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4191 1.1 ross || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4192 1.1 ross && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4193 1.1 ross ) {
4194 1.1 ross if ( floatx80_is_signaling_nan( a )
4195 1.1 ross || floatx80_is_signaling_nan( b ) ) {
4196 1.1 ross float_raise( float_flag_invalid );
4197 1.1 ross }
4198 1.1 ross return 0;
4199 1.1 ross }
4200 1.1 ross aSign = extractFloatx80Sign( a );
4201 1.1 ross bSign = extractFloatx80Sign( b );
4202 1.1 ross if ( aSign != bSign ) {
4203 1.1 ross return
4204 1.1 ross aSign
4205 1.1 ross && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4206 1.1 ross != 0 );
4207 1.1 ross }
4208 1.1 ross return
4209 1.1 ross aSign ? lt128( b.high, b.low, a.high, a.low )
4210 1.1 ross : lt128( a.high, a.low, b.high, b.low );
4211 1.1 ross
4212 1.1 ross }
4213 1.1 ross
4214 1.1 ross #endif
4215 1.1 ross
4216 1.1 ross #ifdef FLOAT128
4217 1.1 ross
4218 1.7 thorpej /*----------------------------------------------------------------------------
4219 1.7 thorpej | Returns the result of converting the quadruple-precision floating-point
4220 1.7 thorpej | value `a' to the 32-bit two's complement integer format. The conversion
4221 1.7 thorpej | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4222 1.7 thorpej | Arithmetic---which means in particular that the conversion is rounded
4223 1.7 thorpej | according to the current rounding mode. If `a' is a NaN, the largest
4224 1.7 thorpej | positive integer is returned. Otherwise, if the conversion overflows, the
4225 1.7 thorpej | largest integer with the same sign as `a' is returned.
4226 1.7 thorpej *----------------------------------------------------------------------------*/
4227 1.7 thorpej
4228 1.1 ross int32 float128_to_int32( float128 a )
4229 1.1 ross {
4230 1.1 ross flag aSign;
4231 1.1 ross int32 aExp, shiftCount;
4232 1.1 ross bits64 aSig0, aSig1;
4233 1.1 ross
4234 1.1 ross aSig1 = extractFloat128Frac1( a );
4235 1.1 ross aSig0 = extractFloat128Frac0( a );
4236 1.1 ross aExp = extractFloat128Exp( a );
4237 1.1 ross aSign = extractFloat128Sign( a );
4238 1.1 ross if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4239 1.1 ross if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4240 1.1 ross aSig0 |= ( aSig1 != 0 );
4241 1.1 ross shiftCount = 0x4028 - aExp;
4242 1.1 ross if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4243 1.1 ross return roundAndPackInt32( aSign, aSig0 );
4244 1.1 ross
4245 1.1 ross }
4246 1.1 ross
4247 1.7 thorpej /*----------------------------------------------------------------------------
4248 1.7 thorpej | Returns the result of converting the quadruple-precision floating-point
4249 1.7 thorpej | value `a' to the 32-bit two's complement integer format. The conversion
4250 1.7 thorpej | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4251 1.7 thorpej | Arithmetic, except that the conversion is always rounded toward zero. If
4252 1.7 thorpej | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4253 1.7 thorpej | conversion overflows, the largest integer with the same sign as `a' is
4254 1.7 thorpej | returned.
4255 1.7 thorpej *----------------------------------------------------------------------------*/
4256 1.7 thorpej
4257 1.1 ross int32 float128_to_int32_round_to_zero( float128 a )
4258 1.1 ross {
4259 1.1 ross flag aSign;
4260 1.1 ross int32 aExp, shiftCount;
4261 1.1 ross bits64 aSig0, aSig1, savedASig;
4262 1.1 ross int32 z;
4263 1.1 ross
4264 1.1 ross aSig1 = extractFloat128Frac1( a );
4265 1.1 ross aSig0 = extractFloat128Frac0( a );
4266 1.1 ross aExp = extractFloat128Exp( a );
4267 1.1 ross aSign = extractFloat128Sign( a );
4268 1.1 ross aSig0 |= ( aSig1 != 0 );
4269 1.1 ross if ( 0x401E < aExp ) {
4270 1.1 ross if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4271 1.1 ross goto invalid;
4272 1.1 ross }
4273 1.1 ross else if ( aExp < 0x3FFF ) {
4274 1.1 ross if ( aExp || aSig0 ) float_set_inexact();
4275 1.1 ross return 0;
4276 1.1 ross }
4277 1.1 ross aSig0 |= LIT64( 0x0001000000000000 );
4278 1.1 ross shiftCount = 0x402F - aExp;
4279 1.1 ross savedASig = aSig0;
4280 1.1 ross aSig0 >>= shiftCount;
4281 1.1 ross z = aSig0;
4282 1.1 ross if ( aSign ) z = - z;
4283 1.1 ross if ( ( z < 0 ) ^ aSign ) {
4284 1.1 ross invalid:
4285 1.1 ross float_raise( float_flag_invalid );
4286 1.1 ross return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4287 1.1 ross }
4288 1.1 ross if ( ( aSig0<<shiftCount ) != savedASig ) {
4289 1.1 ross float_set_inexact();
4290 1.1 ross }
4291 1.1 ross return z;
4292 1.1 ross
4293 1.1 ross }
4294 1.1 ross
4295 1.7 thorpej /*----------------------------------------------------------------------------
4296 1.7 thorpej | Returns the result of converting the quadruple-precision floating-point
4297 1.7 thorpej | value `a' to the 64-bit two's complement integer format. The conversion
4298 1.7 thorpej | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4299 1.7 thorpej | Arithmetic---which means in particular that the conversion is rounded
4300 1.7 thorpej | according to the current rounding mode. If `a' is a NaN, the largest
4301 1.7 thorpej | positive integer is returned. Otherwise, if the conversion overflows, the
4302 1.7 thorpej | largest integer with the same sign as `a' is returned.
4303 1.7 thorpej *----------------------------------------------------------------------------*/
4304 1.7 thorpej
4305 1.1 ross int64 float128_to_int64( float128 a )
4306 1.1 ross {
4307 1.1 ross flag aSign;
4308 1.1 ross int32 aExp, shiftCount;
4309 1.1 ross bits64 aSig0, aSig1;
4310 1.1 ross
4311 1.1 ross aSig1 = extractFloat128Frac1( a );
4312 1.1 ross aSig0 = extractFloat128Frac0( a );
4313 1.1 ross aExp = extractFloat128Exp( a );
4314 1.1 ross aSign = extractFloat128Sign( a );
4315 1.1 ross if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4316 1.1 ross shiftCount = 0x402F - aExp;
4317 1.1 ross if ( shiftCount <= 0 ) {
4318 1.1 ross if ( 0x403E < aExp ) {
4319 1.1 ross float_raise( float_flag_invalid );
4320 1.1 ross if ( ! aSign
4321 1.1 ross || ( ( aExp == 0x7FFF )
4322 1.1 ross && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4323 1.1 ross )
4324 1.1 ross ) {
4325 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
4326 1.1 ross }
4327 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
4328 1.1 ross }
4329 1.1 ross shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4330 1.1 ross }
4331 1.1 ross else {
4332 1.1 ross shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4333 1.1 ross }
4334 1.1 ross return roundAndPackInt64( aSign, aSig0, aSig1 );
4335 1.1 ross
4336 1.1 ross }
4337 1.1 ross
4338 1.7 thorpej /*----------------------------------------------------------------------------
4339 1.7 thorpej | Returns the result of converting the quadruple-precision floating-point
4340 1.7 thorpej | value `a' to the 64-bit two's complement integer format. The conversion
4341 1.7 thorpej | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4342 1.7 thorpej | Arithmetic, except that the conversion is always rounded toward zero.
4343 1.7 thorpej | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4344 1.7 thorpej | the conversion overflows, the largest integer with the same sign as `a' is
4345 1.7 thorpej | returned.
4346 1.7 thorpej *----------------------------------------------------------------------------*/
4347 1.7 thorpej
/*
 * Truncating conversion: float128 -> signed 64-bit integer.  Out-of-range and
 * NaN inputs raise float_flag_invalid and return a saturated constant; any
 * discarded nonzero bits are reported through float_set_inexact().
 */
4348 1.1 ross int64 float128_to_int64_round_to_zero( float128 a )
4349 1.1 ross {
4350 1.1 ross flag aSign;
4351 1.1 ross int32 aExp, shiftCount;
4352 1.1 ross bits64 aSig0, aSig1;
4353 1.1 ross int64 z;
4354 1.1 ross
4355 1.1 ross aSig1 = extractFloat128Frac1( a );
4356 1.1 ross aSig0 = extractFloat128Frac0( a );
4357 1.1 ross aExp = extractFloat128Exp( a );
4358 1.1 ross aSign = extractFloat128Sign( a );
/* Nonzero exponent field: OR in bit 48 (presumably the implicit leading 1). */
4359 1.1 ross if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
/* shiftCount > 0: significand is shifted left; <= 0: shifted right. */
4360 1.1 ross shiftCount = aExp - 0x402F;
4361 1.1 ross if ( 0 < shiftCount ) {
/* aExp >= 0x403E: every path inside this branch returns a saturated constant. */
4362 1.1 ross if ( 0x403E <= aExp ) {
/* Strip bit 48 and above so aSig0 holds fraction bits only for the test at the
 * invalid path below. */
4363 1.1 ross aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
/* Special case: sign set, exponent 0x403E, zero high fraction and a low
 * fraction below 2^49.  Presumably these are the inputs that truncate to
 * exactly -2^63; no invalid flag is raised, only inexact when aSig1 != 0, and
 * the most negative integer is returned by the common return below. */
4364 1.1 ross if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4365 1.1 ross && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4366 1.1 ross if ( aSig1 ) float_set_inexact();
4367 1.1 ross }
4368 1.1 ross else {
4369 1.1 ross float_raise( float_flag_invalid );
/* Positive input, or exponent 0x7FFF with a nonzero fraction: largest positive
 * integer.  Everything else falls through to the most negative integer. */
4370 1.1 ross if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4371 1.1 ross return LIT64( 0x7FFFFFFFFFFFFFFF );
4372 1.1 ross }
4373 1.1 ross }
4374 1.1 ross return (sbits64) LIT64( 0x8000000000000000 );
4375 1.1 ross }
/* Left shift across the two words: the high word moves up by shiftCount and
 * the top shiftCount bits of the low word are brought in.  For shiftCount in
 * 1..63, ( - shiftCount ) & 63 equals 64 - shiftCount. */
4376 1.1 ross z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
/* Low-word bits that did not make it into z are discarded fraction bits. */
4377 1.1 ross if ( (bits64) ( aSig1<<shiftCount ) ) {
4378 1.1 ross float_set_inexact();
4379 1.1 ross }
4380 1.1 ross }
4381 1.1 ross else {
/* Exponent field below 0x3FFF: result is 0; inexact unless the exponent and
 * both fraction words are all zero. */
4382 1.1 ross if ( aExp < 0x3FFF ) {
4383 1.1 ross if ( aExp | aSig0 | aSig1 ) {
4384 1.1 ross float_set_inexact();
4385 1.1 ross }
4386 1.1 ross return 0;
4387 1.1 ross }
/* Here 0x3FFF <= aExp <= 0x402F, so - shiftCount is in 0..48. */
4388 1.1 ross z = aSig0>>( - shiftCount );
/* Inexact if the low word is nonzero or if bits were shifted out of aSig0.
 * The `shiftCount &&' guard skips the second test when nothing was shifted;
 * for negative shiftCount, shiftCount & 63 equals 64 + shiftCount. */
4389 1.1 ross if ( aSig1
4390 1.1 ross || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4391 1.1 ross float_set_inexact();
4392 1.1 ross }
4393 1.1 ross }
4394 1.1 ross if ( aSign ) z = - z;
4395 1.1 ross return z;
4396 1.1 ross
4397 1.1 ross }
4398 1.1 ross
4399 1.7 thorpej /*----------------------------------------------------------------------------
4400 1.7 thorpej | Returns the result of converting the quadruple-precision floating-point
4401 1.7 thorpej | value `a' to the single-precision floating-point format. The conversion
4402 1.7 thorpej | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4403 1.7 thorpej | Arithmetic.
4404 1.7 thorpej *----------------------------------------------------------------------------*/
4405 1.7 thorpej
4406 1.1 ross float32 float128_to_float32( float128 a )
4407 1.1 ross {
4408 1.1 ross flag aSign;
4409 1.1 ross int32 aExp;
4410 1.1 ross bits64 aSig0, aSig1;
4411 1.1 ross bits32 zSig;
4412 1.1 ross
4413 1.1 ross aSig1 = extractFloat128Frac1( a );
4414 1.1 ross aSig0 = extractFloat128Frac0( a );
4415 1.1 ross aExp = extractFloat128Exp( a );
4416 1.1 ross aSign = extractFloat128Sign( a );
4417 1.1 ross if ( aExp == 0x7FFF ) {
4418 1.1 ross if ( aSig0 | aSig1 ) {
4419 1.1 ross return commonNaNToFloat32( float128ToCommonNaN( a ) );
4420 1.1 ross }
4421 1.1 ross return packFloat32( aSign, 0xFF, 0 );
4422 1.1 ross }
4423 1.1 ross aSig0 |= ( aSig1 != 0 );
4424 1.1 ross shift64RightJamming( aSig0, 18, &aSig0 );
4425 1.1 ross zSig = aSig0;
4426 1.1 ross if ( aExp || zSig ) {
4427 1.1 ross zSig |= 0x40000000;
4428 1.1 ross aExp -= 0x3F81;
4429 1.1 ross }
4430 1.1 ross return roundAndPackFloat32( aSign, aExp, zSig );
4431 1.1 ross
4432 1.1 ross }
4433 1.1 ross
4434 1.7 thorpej /*----------------------------------------------------------------------------
4435 1.7 thorpej | Returns the result of converting the quadruple-precision floating-point
4436 1.7 thorpej | value `a' to the double-precision floating-point format. The conversion
4437 1.7 thorpej | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4438 1.7 thorpej | Arithmetic.
4439 1.7 thorpej *----------------------------------------------------------------------------*/
4440 1.7 thorpej
4441 1.1 ross float64 float128_to_float64( float128 a )
4442 1.1 ross {
4443 1.1 ross flag aSign;
4444 1.1 ross int32 aExp;
4445 1.1 ross bits64 aSig0, aSig1;
4446 1.1 ross
4447 1.1 ross aSig1 = extractFloat128Frac1( a );
4448 1.1 ross aSig0 = extractFloat128Frac0( a );
4449 1.1 ross aExp = extractFloat128Exp( a );
4450 1.1 ross aSign = extractFloat128Sign( a );
4451 1.1 ross if ( aExp == 0x7FFF ) {
4452 1.1 ross if ( aSig0 | aSig1 ) {
4453 1.1 ross return commonNaNToFloat64( float128ToCommonNaN( a ) );
4454 1.1 ross }
4455 1.1 ross return packFloat64( aSign, 0x7FF, 0 );
4456 1.1 ross }
4457 1.1 ross shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4458 1.1 ross aSig0 |= ( aSig1 != 0 );
4459 1.1 ross if ( aExp || aSig0 ) {
4460 1.1 ross aSig0 |= LIT64( 0x4000000000000000 );
4461 1.1 ross aExp -= 0x3C01;
4462 1.1 ross }
4463 1.1 ross return roundAndPackFloat64( aSign, aExp, aSig0 );
4464 1.1 ross
4465 1.1 ross }
4466 1.1 ross
4467 1.1 ross #ifdef FLOATX80
4468 1.1 ross
4469 1.7 thorpej /*----------------------------------------------------------------------------
4470 1.7 thorpej | Returns the result of converting the quadruple-precision floating-point
4471 1.7 thorpej | value `a' to the extended double-precision floating-point format. The
4472 1.7 thorpej | conversion is performed according to the IEC/IEEE Standard for Binary
4473 1.7 thorpej | Floating-Point Arithmetic.
4474 1.7 thorpej *----------------------------------------------------------------------------*/
4475 1.7 thorpej
4476 1.1 ross floatx80 float128_to_floatx80( float128 a )
4477 1.1 ross {
4478 1.1 ross flag aSign;
4479 1.1 ross int32 aExp;
4480 1.1 ross bits64 aSig0, aSig1;
4481 1.1 ross
4482 1.1 ross aSig1 = extractFloat128Frac1( a );
4483 1.1 ross aSig0 = extractFloat128Frac0( a );
4484 1.1 ross aExp = extractFloat128Exp( a );
4485 1.1 ross aSign = extractFloat128Sign( a );
4486 1.1 ross if ( aExp == 0x7FFF ) {
4487 1.1 ross if ( aSig0 | aSig1 ) {
4488 1.1 ross return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4489 1.1 ross }
4490 1.1 ross return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4491 1.1 ross }
4492 1.1 ross if ( aExp == 0 ) {
4493 1.1 ross if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4494 1.1 ross normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4495 1.1 ross }
4496 1.1 ross else {
4497 1.1 ross aSig0 |= LIT64( 0x0001000000000000 );
4498 1.1 ross }
4499 1.1 ross shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4500 1.1 ross return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4501 1.1 ross
4502 1.1 ross }
4503 1.1 ross
4504 1.1 ross #endif
4505 1.1 ross
4506 1.7 thorpej /*----------------------------------------------------------------------------
4507 1.7 thorpej | Rounds the quadruple-precision floating-point value `a' to an integer, and
4508 1.7 thorpej | returns the result as a quadruple-precision floating-point value. The
4509 1.7 thorpej | operation is performed according to the IEC/IEEE Standard for Binary
4510 1.7 thorpej | Floating-Point Arithmetic.
4511 1.7 thorpej *----------------------------------------------------------------------------*/
4512 1.7 thorpej
/*
 * Rounds `a' to an integral value, still in float128 format, using the mode
 * returned by float_rounding_mode().  The code splits on the exponent field:
 *   - aExp >= 0x402F: only bits of a.low can be rounded away;
 *   - aExp <  0x3FFF: the result is built directly with packFloat128() using
 *     exponent 0 or 0x3FFF and a zero significand;
 *   - otherwise: a.low is dropped entirely and the rounding position lies in
 *     a.high.
 * float_set_inexact() is called whenever the result differs from `a'.
 */
4513 1.1 ross float128 float128_round_to_int( float128 a )
4514 1.1 ross {
4515 1.1 ross flag aSign;
4516 1.1 ross int32 aExp;
4517 1.1 ross bits64 lastBitMask, roundBitsMask;
4518 1.1 ross int8 roundingMode;
4519 1.1 ross float128 z;
4520 1.1 ross
4521 1.1 ross aExp = extractFloat128Exp( a );
4522 1.1 ross if ( 0x402F <= aExp ) {
/* aExp >= 0x406F: returned unchanged, except that exponent 0x7FFF with a
 * nonzero fraction is routed through propagateFloat128NaN(). */
4523 1.1 ross if ( 0x406F <= aExp ) {
4524 1.1 ross if ( ( aExp == 0x7FFF )
4525 1.1 ross && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4526 1.1 ross ) {
4527 1.1 ross return propagateFloat128NaN( a, a );
4528 1.1 ross }
4529 1.1 ross return a;
4530 1.1 ross }
/* lastBitMask = 2^( 0x406F - aExp ), built with two shifts so that the
 * aExp == 0x402F case never shifts by 64 in one step; in that case the mask
 * wraps to 0 and roundBitsMask becomes all ones. */
4531 1.1 ross lastBitMask = 1;
4532 1.1 ross lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4533 1.1 ross roundBitsMask = lastBitMask - 1;
4534 1.1 ross z = a;
4535 1.1 ross roundingMode = float_rounding_mode();
4536 1.1 ross if ( roundingMode == float_round_nearest_even ) {
4537 1.1 ross if ( lastBitMask ) {
/* Add half of the last kept bit; if the rounding bits are then all zero the
 * input was an exact tie, so clear the last bit (round to even). */
4538 1.1 ross add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4539 1.1 ross if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4540 1.1 ross }
4541 1.1 ross else {
/* lastBitMask == 0: the last kept bit is bit 0 of z.high.  Round up when the
 * top bit of z.low is set; if the remaining z.low bits are zero (a tie),
 * clear bit 0 of z.high afterwards. */
4542 1.1 ross if ( (sbits64) z.low < 0 ) {
4543 1.1 ross ++z.high;
4544 1.1 ross if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4545 1.1 ross }
4546 1.1 ross }
4547 1.1 ross }
4548 1.1 ross else if ( roundingMode != float_round_to_zero ) {
/* Directed rounding: add roundBitsMask when (sign XOR mode-is-round-up) is
 * true, i.e. positive with float_round_up, or negative with any other
 * remaining mode. */
4549 1.1 ross if ( extractFloat128Sign( z )
4550 1.1 ross ^ ( roundingMode == float_round_up ) ) {
4551 1.1 ross add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4552 1.1 ross }
4553 1.1 ross }
/* Drop the bits below the rounding position. */
4554 1.1 ross z.low &= ~ roundBitsMask;
4555 1.1 ross }
4556 1.1 ross else {
4557 1.1 ross if ( aExp < 0x3FFF ) {
/* All bits other than the top bit of a.high are zero: return `a' unchanged. */
4558 1.1 ross if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4559 1.1 ross float_set_inexact();
4560 1.1 ross aSign = extractFloat128Sign( a );
4561 1.1 ross switch ( float_rounding_mode() ) {
4562 1.1 ross case float_round_nearest_even:
/* Exponent 0x3FFE with a nonzero fraction packs exponent 0x3FFF (presumably
 * +/-1); any other input breaks out to the signed zero below. */
4563 1.1 ross if ( ( aExp == 0x3FFE )
4564 1.1 ross && ( extractFloat128Frac0( a )
4565 1.1 ross | extractFloat128Frac1( a ) )
4566 1.1 ross ) {
4567 1.1 ross return packFloat128( aSign, 0x3FFF, 0, 0 );
4568 1.1 ross }
4569 1.1 ross break;
4570 1.1 ross case float_round_down:
4571 1.1 ross return
4572 1.1 ross aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4573 1.1 ross : packFloat128( 0, 0, 0, 0 );
4574 1.1 ross case float_round_up:
4575 1.1 ross return
4576 1.1 ross aSign ? packFloat128( 1, 0, 0, 0 )
4577 1.1 ross : packFloat128( 0, 0x3FFF, 0, 0 );
4578 1.1 ross }
/* Reached for nearest-even without the exponent-0x3FFE case and for any mode
 * without its own case label: zero exponent and significand, sign of `a'. */
4579 1.1 ross return packFloat128( aSign, 0, 0, 0 );
4580 1.1 ross }
/* Here 0x3FFF <= aExp <= 0x402E, so the shift count is in 1..48. */
4581 1.1 ross lastBitMask = 1;
4582 1.1 ross lastBitMask <<= 0x402F - aExp;
4583 1.1 ross roundBitsMask = lastBitMask - 1;
/* The low word is discarded entirely in this regime. */
4584 1.1 ross z.low = 0;
4585 1.1 ross z.high = a.high;
4586 1.1 ross roundingMode = float_rounding_mode();
4587 1.1 ross if ( roundingMode == float_round_nearest_even ) {
/* Add half of the last kept bit; it is an exact tie only if the rounding bits
 * of z.high are zero and a.low is zero as well. */
4588 1.1 ross z.high += lastBitMask>>1;
4589 1.1 ross if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4590 1.1 ross z.high &= ~ lastBitMask;
4591 1.1 ross }
4592 1.1 ross }
4593 1.1 ross else if ( roundingMode != float_round_to_zero ) {
4594 1.1 ross if ( extractFloat128Sign( z )
4595 1.1 ross ^ ( roundingMode == float_round_up ) ) {
/* Fold a nonzero a.low into bit 0 so that adding roundBitsMask carries into
 * the last kept bit whenever any discarded bit is set. */
4596 1.1 ross z.high |= ( a.low != 0 );
4597 1.1 ross z.high += roundBitsMask;
4598 1.1 ross }
4599 1.1 ross }
4600 1.1 ross z.high &= ~ roundBitsMask;
4601 1.1 ross }
/* Any difference from the input means bits were rounded away. */
4602 1.1 ross if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4603 1.1 ross float_set_inexact();
4604 1.1 ross }
4605 1.1 ross return z;
4606 1.1 ross
4607 1.1 ross }
4608 1.1 ross
4609 1.7 thorpej /*----------------------------------------------------------------------------
4610 1.7 thorpej | Returns the result of adding the absolute values of the quadruple-precision
4611 1.7 thorpej | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4612 1.7 thorpej | before being returned. `zSign' is ignored if the result is a NaN.
4613 1.7 thorpej | The addition is performed according to the IEC/IEEE Standard for Binary
4614 1.7 thorpej | Floating-Point Arithmetic.
4615 1.7 thorpej *----------------------------------------------------------------------------*/
4616 1.7 thorpej
/*
 * File-local helper: adds the magnitudes of `a' and `b'.  `zSign' is handed
 * to packFloat128() / roundAndPackFloat128() as the sign of the result.  The
 * operand with the smaller exponent field is shifted right to line up with
 * the other before the 128-bit addition.
 */
4617 1.1 ross static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4618 1.1 ross {
4619 1.1 ross int32 aExp, bExp, zExp;
4620 1.1 ross bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4621 1.1 ross int32 expDiff;
4622 1.1 ross
4623 1.1 ross aSig1 = extractFloat128Frac1( a );
4624 1.1 ross aSig0 = extractFloat128Frac0( a );
4625 1.1 ross aExp = extractFloat128Exp( a );
4626 1.1 ross bSig1 = extractFloat128Frac1( b );
4627 1.1 ross bSig0 = extractFloat128Frac0( b );
4628 1.1 ross bExp = extractFloat128Exp( b );
4629 1.1 ross expDiff = aExp - bExp;
/* Case 1: `a' has the larger exponent field; `b' is aligned to it. */
4630 1.1 ross if ( 0 < expDiff ) {
/* Exponent 0x7FFF: nonzero fraction propagates a NaN, otherwise `a' itself
 * (presumably an infinity) is the result. */
4631 1.1 ross if ( aExp == 0x7FFF ) {
4632 1.1 ross if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4633 1.1 ross return a;
4634 1.1 ross }
/* bExp == 0: no bit 48 is ORed in and the alignment shift is one less
 * (presumably because a subnormal has no implicit bit and an effective
 * exponent of 1).  Otherwise bit 48 is ORed into b's significand. */
4635 1.1 ross if ( bExp == 0 ) {
4636 1.1 ross --expDiff;
4637 1.1 ross }
4638 1.1 ross else {
4639 1.1 ross bSig0 |= LIT64( 0x0001000000000000 );
4640 1.1 ross }
/* zSig2 receives the third output word of the shift (presumably the bits
 * shifted out, kept for rounding). */
4641 1.1 ross shift128ExtraRightJamming(
4642 1.1 ross bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4643 1.1 ross zExp = aExp;
4644 1.1 ross }
/* Case 2: mirror image, `b' has the larger exponent field. */
4645 1.1 ross else if ( expDiff < 0 ) {
4646 1.1 ross if ( bExp == 0x7FFF ) {
4647 1.1 ross if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4648 1.1 ross return packFloat128( zSign, 0x7FFF, 0, 0 );
4649 1.1 ross }
4650 1.1 ross if ( aExp == 0 ) {
4651 1.1 ross ++expDiff;
4652 1.1 ross }
4653 1.1 ross else {
4654 1.1 ross aSig0 |= LIT64( 0x0001000000000000 );
4655 1.1 ross }
4656 1.1 ross shift128ExtraRightJamming(
4657 1.1 ross aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4658 1.1 ross zExp = bExp;
4659 1.1 ross }
/* Case 3: equal exponent fields. */
4660 1.1 ross else {
4661 1.1 ross if ( aExp == 0x7FFF ) {
4662 1.1 ross if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4663 1.1 ross return propagateFloat128NaN( a, b );
4664 1.1 ross }
4665 1.1 ross return a;
4666 1.1 ross }
4667 1.1 ross add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
/* Both exponent fields zero: the raw sum is packed with exponent 0, without
 * going through rounding. */
4668 1.1 ross if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
/* Bit 49 is ORed into the sum (presumably standing for the two implicit
 * bit-48 ones that were never added), then the shared shift-right-by-one
 * tail is entered. */
4669 1.1 ross zSig2 = 0;
4670 1.1 ross zSig0 |= LIT64( 0x0002000000000000 );
4671 1.1 ross zExp = aExp;
4672 1.1 ross goto shiftRight1;
4673 1.1 ross }
/* Reached only from cases 1 and 2.  In case 1 this ORs bit 48 into a's
 * significand.  NOTE(review): in case 2 bSig0 never gets bit 48 ORed in; this
 * OR on the already right-shifted aSig0 appears to stand in for it, since it
 * contributes the same 2^48 to the sum -- confirm before touching. */
4674 1.1 ross aSig0 |= LIT64( 0x0001000000000000 );
4675 1.1 ross add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
/* Tentatively lower the exponent: a sum below 2^49 (bit 49 clear) is rounded
 * as is; otherwise the exponent is restored and the sum shifted right once. */
4676 1.1 ross --zExp;
4677 1.1 ross if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4678 1.1 ross ++zExp;
4679 1.1 ross shiftRight1:
4680 1.1 ross shift128ExtraRightJamming(
4681 1.1 ross zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4682 1.1 ross roundAndPack:
4683 1.1 ross return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4684 1.1 ross
4685 1.1 ross }
4686 1.1 ross
4687 1.7 thorpej /*----------------------------------------------------------------------------
4688 1.7 thorpej | Returns the result of subtracting the absolute values of the quadruple-
4689 1.7 thorpej | precision floating-point values `a' and `b'. If `zSign' is 1, the
4690 1.7 thorpej | difference is negated before being returned. `zSign' is ignored if the
4691 1.7 thorpej | result is a NaN. The subtraction is performed according to the IEC/IEEE
4692 1.7 thorpej | Standard for Binary Floating-Point Arithmetic.
4693 1.7 thorpej *----------------------------------------------------------------------------*/
4694 1.7 thorpej
/*
 * File-local helper: subtracts the magnitude of `b' from that of `a'.
 * `zSign' is the sign used when a's magnitude is the larger one; it is
 * inverted (zSign ^ 1) on the paths where b's magnitude is larger.  Control
 * flow is label driven: aExpBigger / bExpBigger align the smaller operand,
 * aBigger / bBigger do the subtraction, normalizeRoundAndPack finishes.
 */
4695 1.1 ross static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4696 1.1 ross {
4697 1.1 ross int32 aExp, bExp, zExp;
4698 1.1 ross bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4699 1.1 ross int32 expDiff;
4700 1.1 ross float128 z;
4701 1.1 ross
4702 1.1 ross aSig1 = extractFloat128Frac1( a );
4703 1.1 ross aSig0 = extractFloat128Frac0( a );
4704 1.1 ross aExp = extractFloat128Exp( a );
4705 1.1 ross bSig1 = extractFloat128Frac1( b );
4706 1.1 ross bSig0 = extractFloat128Frac0( b );
4707 1.1 ross bExp = extractFloat128Exp( b );
4708 1.1 ross expDiff = aExp - bExp;
/* Both significands are moved up by 14 bits; the constants below therefore
 * use bit 62 (0x4000000000000000) where unshifted code would use bit 48. */
4709 1.1 ross shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4710 1.1 ross shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4711 1.1 ross if ( 0 < expDiff ) goto aExpBigger;
4712 1.1 ross if ( expDiff < 0 ) goto bExpBigger;
/* Equal exponent fields from here to the bExpBigger label. */
/* Both exponents 0x7FFF: any nonzero fraction propagates a NaN; otherwise
 * (presumably infinity minus infinity) invalid is raised and the default NaN
 * is returned. */
4713 1.1 ross if ( aExp == 0x7FFF ) {
4714 1.1 ross if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4715 1.1 ross return propagateFloat128NaN( a, b );
4716 1.1 ross }
4717 1.1 ross float_raise( float_flag_invalid );
4718 1.1 ross z.low = float128_default_nan_low;
4719 1.1 ross z.high = float128_default_nan_high;
4720 1.1 ross return z;
4721 1.1 ross }
/* Both exponent fields zero: continue with exponent 1 for both operands. */
4722 1.1 ross if ( aExp == 0 ) {
4723 1.1 ross aExp = 1;
4724 1.1 ross bExp = 1;
4725 1.1 ross }
/* Compare the significands, high word first, to pick the larger magnitude. */
4726 1.1 ross if ( bSig0 < aSig0 ) goto aBigger;
4727 1.1 ross if ( aSig0 < bSig0 ) goto bBigger;
4728 1.1 ross if ( bSig1 < aSig1 ) goto aBigger;
4729 1.1 ross if ( aSig1 < bSig1 ) goto bBigger;
/* Equal magnitudes: a zero whose sign bit is set only in round-down mode. */
4730 1.1 ross return packFloat128( float_rounding_mode() == float_round_down, 0, 0, 0 );
4731 1.1 ross bExpBigger:
/* b has exponent 0x7FFF: NaN propagation, or exponent 0x7FFF with a zero
 * significand and the inverted sign. */
4732 1.1 ross if ( bExp == 0x7FFF ) {
4733 1.1 ross if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4734 1.1 ross return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4735 1.1 ross }
/* aExp == 0: shift one place less and OR in no leading bit; otherwise OR in
 * bit 62 before aligning `a' to `b'. */
4736 1.1 ross if ( aExp == 0 ) {
4737 1.1 ross ++expDiff;
4738 1.1 ross }
4739 1.1 ross else {
4740 1.1 ross aSig0 |= LIT64( 0x4000000000000000 );
4741 1.1 ross }
4742 1.1 ross shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4743 1.1 ross bSig0 |= LIT64( 0x4000000000000000 );
4744 1.1 ross bBigger:
/* b - a, result takes b's exponent and the inverted sign. */
4745 1.1 ross sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4746 1.1 ross zExp = bExp;
4747 1.1 ross zSign ^= 1;
4748 1.1 ross goto normalizeRoundAndPack;
4749 1.1 ross aExpBigger:
/* a has exponent 0x7FFF: NaN propagation, or `a' itself is returned. */
4750 1.1 ross if ( aExp == 0x7FFF ) {
4751 1.1 ross if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4752 1.1 ross return a;
4753 1.1 ross }
4754 1.1 ross if ( bExp == 0 ) {
4755 1.1 ross --expDiff;
4756 1.1 ross }
4757 1.1 ross else {
4758 1.1 ross bSig0 |= LIT64( 0x4000000000000000 );
4759 1.1 ross }
4760 1.1 ross shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4761 1.1 ross aSig0 |= LIT64( 0x4000000000000000 );
4762 1.1 ross aBigger:
/* a - b, result takes a's exponent and keeps zSign. */
4763 1.1 ross sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4764 1.1 ross zExp = aExp;
4765 1.1 ross normalizeRoundAndPack:
/* The exponent handed on is zExp - 1 - 14; the 14 matches the left shift
 * applied to both significands at the top of the function. */
4766 1.1 ross --zExp;
4767 1.1 ross return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4768 1.1 ross
4769 1.1 ross }
4770 1.1 ross
4771 1.7 thorpej /*----------------------------------------------------------------------------
4772 1.7 thorpej | Returns the result of adding the quadruple-precision floating-point values
4773 1.7 thorpej | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4774 1.7 thorpej | for Binary Floating-Point Arithmetic.
4775 1.7 thorpej *----------------------------------------------------------------------------*/
4776 1.7 thorpej
4777 1.1 ross float128 float128_add( float128 a, float128 b )
4778 1.1 ross {
4779 1.1 ross flag aSign, bSign;
4780 1.1 ross
4781 1.1 ross aSign = extractFloat128Sign( a );
4782 1.1 ross bSign = extractFloat128Sign( b );
4783 1.1 ross if ( aSign == bSign ) {
4784 1.1 ross return addFloat128Sigs( a, b, aSign );
4785 1.1 ross }
4786 1.1 ross else {
4787 1.1 ross return subFloat128Sigs( a, b, aSign );
4788 1.1 ross }
4789 1.1 ross
4790 1.1 ross }
4791 1.1 ross
4792 1.7 thorpej /*----------------------------------------------------------------------------
4793 1.7 thorpej | Returns the result of subtracting the quadruple-precision floating-point
4794 1.7 thorpej | values `a' and `b'. The operation is performed according to the IEC/IEEE
4795 1.7 thorpej | Standard for Binary Floating-Point Arithmetic.
4796 1.7 thorpej *----------------------------------------------------------------------------*/
4797 1.7 thorpej
4798 1.1 ross float128 float128_sub( float128 a, float128 b )
4799 1.1 ross {
4800 1.1 ross flag aSign, bSign;
4801 1.1 ross
4802 1.1 ross aSign = extractFloat128Sign( a );
4803 1.1 ross bSign = extractFloat128Sign( b );
4804 1.1 ross if ( aSign == bSign ) {
4805 1.1 ross return subFloat128Sigs( a, b, aSign );
4806 1.1 ross }
4807 1.1 ross else {
4808 1.1 ross return addFloat128Sigs( a, b, aSign );
4809 1.1 ross }
4810 1.1 ross
4811 1.1 ross }
4812 1.1 ross
4813 1.7 thorpej /*----------------------------------------------------------------------------
4814 1.7 thorpej | Returns the result of multiplying the quadruple-precision floating-point
4815 1.7 thorpej | values `a' and `b'. The operation is performed according to the IEC/IEEE
4816 1.7 thorpej | Standard for Binary Floating-Point Arithmetic.
4817 1.7 thorpej *----------------------------------------------------------------------------*/
4818 1.7 thorpej
/*
 * Multiplies `a' by `b'.  The sign of the result is the XOR of the operand
 * signs.  Exponent 0x7FFF operands and zeros are handled up front; the
 * remaining operands are multiplied as 128-bit significands into a 256-bit
 * product that is handed to roundAndPackFloat128().
 */
4819 1.1 ross float128 float128_mul( float128 a, float128 b )
4820 1.1 ross {
4821 1.1 ross flag aSign, bSign, zSign;
4822 1.1 ross int32 aExp, bExp, zExp;
4823 1.1 ross bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4824 1.1 ross float128 z;
4825 1.1 ross
4826 1.1 ross aSig1 = extractFloat128Frac1( a );
4827 1.1 ross aSig0 = extractFloat128Frac0( a );
4828 1.1 ross aExp = extractFloat128Exp( a );
4829 1.1 ross aSign = extractFloat128Sign( a );
4830 1.1 ross bSig1 = extractFloat128Frac1( b );
4831 1.1 ross bSig0 = extractFloat128Frac0( b );
4832 1.1 ross bExp = extractFloat128Exp( b );
4833 1.1 ross bSign = extractFloat128Sign( b );
4834 1.1 ross zSign = aSign ^ bSign;
/* `a' has exponent 0x7FFF: propagate if either operand has exponent 0x7FFF
 * with a nonzero fraction; an all-zero `b' jumps to the shared `invalid'
 * label further down; otherwise exponent 0x7FFF with a zero significand is
 * packed with the product sign. */
4835 1.1 ross if ( aExp == 0x7FFF ) {
4836 1.1 ross if ( ( aSig0 | aSig1 )
4837 1.1 ross || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4838 1.1 ross return propagateFloat128NaN( a, b );
4839 1.1 ross }
4840 1.1 ross if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4841 1.1 ross return packFloat128( zSign, 0x7FFF, 0, 0 );
4842 1.1 ross }
/* Same treatment with the roles swapped.  The `invalid' label lives inside
 * this if-body and is also the target of the goto above. */
4843 1.1 ross if ( bExp == 0x7FFF ) {
4844 1.1 ross if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4845 1.1 ross if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4846 1.1 ross invalid:
4847 1.1 ross float_raise( float_flag_invalid );
4848 1.1 ross z.low = float128_default_nan_low;
4849 1.1 ross z.high = float128_default_nan_high;
4850 1.1 ross return z;
4851 1.1 ross }
4852 1.1 ross return packFloat128( zSign, 0x7FFF, 0, 0 );
4853 1.1 ross }
/* Zero exponent field: an all-zero fraction gives a signed zero product;
 * anything else is normalized first. */
4854 1.1 ross if ( aExp == 0 ) {
4855 1.1 ross if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4856 1.1 ross normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4857 1.1 ross }
4858 1.1 ross if ( bExp == 0 ) {
4859 1.1 ross if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4860 1.1 ross normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4861 1.1 ross }
4862 1.1 ross zExp = aExp + bExp - 0x4000;
/* Only `a' gets bit 48 ORed in; b's fraction is shifted up by 16 without it. */
4863 1.1 ross aSig0 |= LIT64( 0x0001000000000000 );
4864 1.1 ross shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4865 1.1 ross mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
/* NOTE(review): adding a's significand to the two upper product words appears
 * to supply the contribution of b's leading bit, which was left out of the
 * multiplication above -- confirm before changing. */
4866 1.1 ross add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
/* Fold the lowest product word into a sticky bit of zSig2. */
4867 1.1 ross zSig2 |= ( zSig3 != 0 );
/* Product reached bit 49 or above: shift right once and bump the exponent. */
4868 1.1 ross if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4869 1.1 ross shift128ExtraRightJamming(
4870 1.1 ross zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4871 1.1 ross ++zExp;
4872 1.1 ross }
4873 1.1 ross return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4874 1.1 ross
4875 1.1 ross }
4876 1.1 ross
4877 1.7 thorpej /*----------------------------------------------------------------------------
4878 1.7 thorpej | Returns the result of dividing the quadruple-precision floating-point value
4879 1.7 thorpej | `a' by the corresponding value `b'. The operation is performed according to
4880 1.7 thorpej | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4881 1.7 thorpej *----------------------------------------------------------------------------*/
4882 1.7 thorpej
/*
 * Divides `a' by `b'.  The sign of the result is the XOR of the operand
 * signs.  After the special cases, the quotient is produced as two 64-bit
 * words, each obtained from estimateDiv128To64() and then corrected downward
 * until the corresponding remainder is non-negative.
 */
4883 1.1 ross float128 float128_div( float128 a, float128 b )
4884 1.1 ross {
4885 1.1 ross flag aSign, bSign, zSign;
4886 1.1 ross int32 aExp, bExp, zExp;
4887 1.1 ross bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4888 1.1 ross bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4889 1.1 ross float128 z;
4890 1.1 ross
4891 1.1 ross aSig1 = extractFloat128Frac1( a );
4892 1.1 ross aSig0 = extractFloat128Frac0( a );
4893 1.1 ross aExp = extractFloat128Exp( a );
4894 1.1 ross aSign = extractFloat128Sign( a );
4895 1.1 ross bSig1 = extractFloat128Frac1( b );
4896 1.1 ross bSig0 = extractFloat128Frac0( b );
4897 1.1 ross bExp = extractFloat128Exp( b );
4898 1.1 ross bSign = extractFloat128Sign( b );
4899 1.1 ross zSign = aSign ^ bSign;
/* `a' has exponent 0x7FFF: NaN propagation for nonzero fractions; if `b' also
 * has exponent 0x7FFF with a zero fraction, jump to the shared `invalid'
 * label; otherwise exponent 0x7FFF with a zero significand is packed. */
4900 1.1 ross if ( aExp == 0x7FFF ) {
4901 1.1 ross if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4902 1.1 ross if ( bExp == 0x7FFF ) {
4903 1.1 ross if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4904 1.1 ross goto invalid;
4905 1.1 ross }
4906 1.1 ross return packFloat128( zSign, 0x7FFF, 0, 0 );
4907 1.1 ross }
/* Only `b' has exponent 0x7FFF: NaN propagation, or a signed zero quotient. */
4908 1.1 ross if ( bExp == 0x7FFF ) {
4909 1.1 ross if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4910 1.1 ross return packFloat128( zSign, 0, 0, 0 );
4911 1.1 ross }
4912 1.1 ross if ( bExp == 0 ) {
/* All-zero divisor: an all-zero dividend raises invalid and returns the
 * default NaN; any other dividend raises divbyzero and packs exponent 0x7FFF
 * with a zero significand. */
4913 1.1 ross if ( ( bSig0 | bSig1 ) == 0 ) {
4914 1.1 ross if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4915 1.1 ross invalid:
4916 1.1 ross float_raise( float_flag_invalid );
4917 1.1 ross z.low = float128_default_nan_low;
4918 1.1 ross z.high = float128_default_nan_high;
4919 1.1 ross return z;
4920 1.1 ross }
4921 1.1 ross float_raise( float_flag_divbyzero );
4922 1.1 ross return packFloat128( zSign, 0x7FFF, 0, 0 );
4923 1.1 ross }
4924 1.1 ross normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4925 1.1 ross }
4926 1.1 ross if ( aExp == 0 ) {
4927 1.1 ross if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4928 1.1 ross normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4929 1.1 ross }
4930 1.1 ross zExp = aExp - bExp + 0x3FFD;
/* Both significands get bit 48 ORed in and are moved up by 15 bits. */
4931 1.1 ross shortShift128Left(
4932 1.1 ross aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4933 1.1 ross shortShift128Left(
4934 1.1 ross bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
/* If b's significand is <= a's, halve `a' and raise the exponent by one
 * (presumably so that the dividend is strictly below the divisor). */
4935 1.1 ross if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4936 1.1 ross shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4937 1.1 ross ++zExp;
4938 1.1 ross }
/* High quotient word: estimate, form the remainder a - b * zSig0, and step
 * the estimate down (adding `b' back) while that remainder is negative. */
4939 1.1 ross zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4940 1.1 ross mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4941 1.1 ross sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4942 1.1 ross while ( (sbits64) rem0 < 0 ) {
4943 1.1 ross --zSig0;
4944 1.1 ross add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4945 1.1 ross }
/* Low quotient word from the remaining remainder. */
4946 1.1 ross zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
/* The exact remainder is computed only when the low 14 bits of the estimate
 * are <= 4 (presumably the only case where the estimate's error could change
 * the rounding); a nonzero final remainder sets bit 0 as a sticky bit. */
4947 1.1 ross if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4948 1.1 ross mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4949 1.1 ross sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4950 1.1 ross while ( (sbits64) rem1 < 0 ) {
4951 1.1 ross --zSig1;
4952 1.1 ross add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4953 1.1 ross }
4954 1.1 ross zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4955 1.1 ross }
/* Shift the 128-bit quotient right by 15; zSig2 receives the third output
 * word of the shift for rounding. */
4956 1.1 ross shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4957 1.1 ross return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4958 1.1 ross
4959 1.1 ross }
4960 1.1 ross
4961 1.7 thorpej /*----------------------------------------------------------------------------
4962 1.7 thorpej | Returns the remainder of the quadruple-precision floating-point value `a'
4963 1.7 thorpej | with respect to the corresponding value `b'. The operation is performed
4964 1.7 thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4965 1.7 thorpej *----------------------------------------------------------------------------*/
4966 1.7 thorpej
4967 1.1 ross float128 float128_rem( float128 a, float128 b )
4968 1.1 ross {
4969 1.1 ross flag aSign, bSign, zSign;
4970 1.1 ross int32 aExp, bExp, expDiff;
4971 1.1 ross bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
4972 1.1 ross bits64 allZero, alternateASig0, alternateASig1, sigMean1;
4973 1.1 ross sbits64 sigMean0;
4974 1.1 ross float128 z;
4975 1.1 ross
4976 1.1 ross aSig1 = extractFloat128Frac1( a );
4977 1.1 ross aSig0 = extractFloat128Frac0( a );
4978 1.1 ross aExp = extractFloat128Exp( a );
4979 1.1 ross aSign = extractFloat128Sign( a );
4980 1.1 ross bSig1 = extractFloat128Frac1( b );
4981 1.1 ross bSig0 = extractFloat128Frac0( b );
4982 1.1 ross bExp = extractFloat128Exp( b );
4983 1.1 ross bSign = extractFloat128Sign( b );
4984 1.1 ross if ( aExp == 0x7FFF ) {
4985 1.1 ross if ( ( aSig0 | aSig1 )
4986 1.1 ross || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4987 1.1 ross return propagateFloat128NaN( a, b );
4988 1.1 ross }
4989 1.1 ross goto invalid;
4990 1.1 ross }
4991 1.1 ross if ( bExp == 0x7FFF ) {
4992 1.1 ross if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4993 1.1 ross return a;
4994 1.1 ross }
4995 1.1 ross if ( bExp == 0 ) {
4996 1.1 ross if ( ( bSig0 | bSig1 ) == 0 ) {
4997 1.1 ross invalid:
4998 1.1 ross float_raise( float_flag_invalid );
4999 1.1 ross z.low = float128_default_nan_low;
5000 1.1 ross z.high = float128_default_nan_high;
5001 1.1 ross return z;
5002 1.1 ross }
5003 1.1 ross normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5004 1.1 ross }
5005 1.1 ross if ( aExp == 0 ) {
5006 1.1 ross if ( ( aSig0 | aSig1 ) == 0 ) return a;
5007 1.1 ross normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5008 1.1 ross }
5009 1.1 ross expDiff = aExp - bExp;
5010 1.1 ross if ( expDiff < -1 ) return a;
5011 1.1 ross shortShift128Left(
5012 1.1 ross aSig0 | LIT64( 0x0001000000000000 ),
5013 1.1 ross aSig1,
5014 1.1 ross 15 - ( expDiff < 0 ),
5015 1.1 ross &aSig0,
5016 1.1 ross &aSig1
5017 1.1 ross );
5018 1.1 ross shortShift128Left(
5019 1.1 ross bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5020 1.1 ross q = le128( bSig0, bSig1, aSig0, aSig1 );
5021 1.1 ross if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5022 1.1 ross expDiff -= 64;
5023 1.1 ross while ( 0 < expDiff ) {
5024 1.1 ross q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5025 1.1 ross q = ( 4 < q ) ? q - 4 : 0;
5026 1.1 ross mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5027 1.1 ross shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5028 1.1 ross shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5029 1.1 ross sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5030 1.1 ross expDiff -= 61;
5031 1.1 ross }
5032 1.1 ross if ( -64 < expDiff ) {
5033 1.1 ross q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5034 1.1 ross q = ( 4 < q ) ? q - 4 : 0;
5035 1.1 ross q >>= - expDiff;
5036 1.1 ross shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5037 1.1 ross expDiff += 52;
5038 1.1 ross if ( expDiff < 0 ) {
5039 1.1 ross shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5040 1.1 ross }
5041 1.1 ross else {
5042 1.1 ross shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5043 1.1 ross }
5044 1.1 ross mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5045 1.1 ross sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5046 1.1 ross }
5047 1.1 ross else {
5048 1.1 ross shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5049 1.1 ross shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5050 1.1 ross }
5051 1.1 ross do {
5052 1.1 ross alternateASig0 = aSig0;
5053 1.1 ross alternateASig1 = aSig1;
5054 1.1 ross ++q;
5055 1.1 ross sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5056 1.1 ross } while ( 0 <= (sbits64) aSig0 );
5057 1.1 ross add128(
5058 1.1 ross aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
5059 1.1 ross if ( ( sigMean0 < 0 )
5060 1.1 ross || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5061 1.1 ross aSig0 = alternateASig0;
5062 1.1 ross aSig1 = alternateASig1;
5063 1.1 ross }
5064 1.1 ross zSign = ( (sbits64) aSig0 < 0 );
5065 1.1 ross if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5066 1.1 ross return
5067 1.1 ross normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5068 1.1 ross
5069 1.1 ross }
5070 1.1 ross
5071 1.7 thorpej /*----------------------------------------------------------------------------
5072 1.7 thorpej | Returns the square root of the quadruple-precision floating-point value `a'.
5073 1.7 thorpej | The operation is performed according to the IEC/IEEE Standard for Binary
5074 1.7 thorpej | Floating-Point Arithmetic.
5075 1.7 thorpej *----------------------------------------------------------------------------*/
5076 1.7 thorpej
5077 1.1 ross float128 float128_sqrt( float128 a )
5078 1.1 ross {
5079 1.1 ross flag aSign;
5080 1.1 ross int32 aExp, zExp;
5081 1.1 ross bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5082 1.1 ross bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5083 1.1 ross float128 z;
5084 1.1 ross
5085 1.1 ross aSig1 = extractFloat128Frac1( a );
5086 1.1 ross aSig0 = extractFloat128Frac0( a );
5087 1.1 ross aExp = extractFloat128Exp( a );
5088 1.1 ross aSign = extractFloat128Sign( a );
5089 1.1 ross if ( aExp == 0x7FFF ) {
5090 1.1 ross if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5091 1.1 ross if ( ! aSign ) return a;
5092 1.1 ross goto invalid;
5093 1.1 ross }
5094 1.1 ross if ( aSign ) {
5095 1.1 ross if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5096 1.1 ross invalid:
5097 1.1 ross float_raise( float_flag_invalid );
5098 1.1 ross z.low = float128_default_nan_low;
5099 1.1 ross z.high = float128_default_nan_high;
5100 1.1 ross return z;
5101 1.1 ross }
5102 1.1 ross if ( aExp == 0 ) {
5103 1.1 ross if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5104 1.1 ross normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5105 1.1 ross }
5106 1.1 ross zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5107 1.1 ross aSig0 |= LIT64( 0x0001000000000000 );
5108 1.1 ross zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5109 1.1 ross shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5110 1.1 ross zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5111 1.1 ross doubleZSig0 = zSig0<<1;
5112 1.1 ross mul64To128( zSig0, zSig0, &term0, &term1 );
5113 1.1 ross sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5114 1.1 ross while ( (sbits64) rem0 < 0 ) {
5115 1.1 ross --zSig0;
5116 1.1 ross doubleZSig0 -= 2;
5117 1.1 ross add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5118 1.1 ross }
5119 1.1 ross zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5120 1.1 ross if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5121 1.1 ross if ( zSig1 == 0 ) zSig1 = 1;
5122 1.1 ross mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5123 1.1 ross sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5124 1.1 ross mul64To128( zSig1, zSig1, &term2, &term3 );
5125 1.1 ross sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5126 1.1 ross while ( (sbits64) rem1 < 0 ) {
5127 1.1 ross --zSig1;
5128 1.1 ross shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5129 1.1 ross term3 |= 1;
5130 1.1 ross term2 |= doubleZSig0;
5131 1.1 ross add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5132 1.1 ross }
5133 1.1 ross zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5134 1.1 ross }
5135 1.1 ross shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5136 1.1 ross return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5137 1.1 ross
5138 1.1 ross }
5139 1.1 ross
5140 1.7 thorpej /*----------------------------------------------------------------------------
5141 1.7 thorpej | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5142 1.7 thorpej | the corresponding value `b', and 0 otherwise. The comparison is performed
5143 1.7 thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5144 1.7 thorpej *----------------------------------------------------------------------------*/
5145 1.7 thorpej
5146 1.1 ross flag float128_eq( float128 a, float128 b )
5147 1.1 ross {
5148 1.1 ross
5149 1.1 ross if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5150 1.1 ross && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5151 1.1 ross || ( ( extractFloat128Exp( b ) == 0x7FFF )
5152 1.1 ross && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5153 1.1 ross ) {
5154 1.1 ross if ( float128_is_signaling_nan( a )
5155 1.1 ross || float128_is_signaling_nan( b ) ) {
5156 1.1 ross float_raise( float_flag_invalid );
5157 1.1 ross }
5158 1.1 ross return 0;
5159 1.1 ross }
5160 1.1 ross return
5161 1.1 ross ( a.low == b.low )
5162 1.1 ross && ( ( a.high == b.high )
5163 1.1 ross || ( ( a.low == 0 )
5164 1.1 ross && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5165 1.1 ross );
5166 1.1 ross
5167 1.1 ross }
5168 1.1 ross
5169 1.7 thorpej /*----------------------------------------------------------------------------
5170 1.7 thorpej | Returns 1 if the quadruple-precision floating-point value `a' is less than
5171 1.7 thorpej | or equal to the corresponding value `b', and 0 otherwise. The comparison
5172 1.7 thorpej | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5173 1.7 thorpej | Arithmetic.
5174 1.7 thorpej *----------------------------------------------------------------------------*/
5175 1.7 thorpej
5176 1.1 ross flag float128_le( float128 a, float128 b )
5177 1.1 ross {
5178 1.1 ross flag aSign, bSign;
5179 1.1 ross
5180 1.1 ross if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5181 1.1 ross && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5182 1.1 ross || ( ( extractFloat128Exp( b ) == 0x7FFF )
5183 1.1 ross && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5184 1.1 ross ) {
5185 1.1 ross float_raise( float_flag_invalid );
5186 1.1 ross return 0;
5187 1.1 ross }
5188 1.1 ross aSign = extractFloat128Sign( a );
5189 1.1 ross bSign = extractFloat128Sign( b );
5190 1.1 ross if ( aSign != bSign ) {
5191 1.1 ross return
5192 1.1 ross aSign
5193 1.1 ross || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5194 1.1 ross == 0 );
5195 1.1 ross }
5196 1.1 ross return
5197 1.1 ross aSign ? le128( b.high, b.low, a.high, a.low )
5198 1.1 ross : le128( a.high, a.low, b.high, b.low );
5199 1.1 ross
5200 1.1 ross }
5201 1.1 ross
5202 1.7 thorpej /*----------------------------------------------------------------------------
5203 1.7 thorpej | Returns 1 if the quadruple-precision floating-point value `a' is less than
5204 1.7 thorpej | the corresponding value `b', and 0 otherwise. The comparison is performed
5205 1.7 thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5206 1.7 thorpej *----------------------------------------------------------------------------*/
5207 1.7 thorpej
5208 1.1 ross flag float128_lt( float128 a, float128 b )
5209 1.1 ross {
5210 1.1 ross flag aSign, bSign;
5211 1.1 ross
5212 1.1 ross if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5213 1.1 ross && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5214 1.1 ross || ( ( extractFloat128Exp( b ) == 0x7FFF )
5215 1.1 ross && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5216 1.1 ross ) {
5217 1.1 ross float_raise( float_flag_invalid );
5218 1.1 ross return 0;
5219 1.1 ross }
5220 1.1 ross aSign = extractFloat128Sign( a );
5221 1.1 ross bSign = extractFloat128Sign( b );
5222 1.1 ross if ( aSign != bSign ) {
5223 1.1 ross return
5224 1.1 ross aSign
5225 1.1 ross && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5226 1.1 ross != 0 );
5227 1.1 ross }
5228 1.1 ross return
5229 1.1 ross aSign ? lt128( b.high, b.low, a.high, a.low )
5230 1.1 ross : lt128( a.high, a.low, b.high, b.low );
5231 1.1 ross
5232 1.1 ross }
5233 1.1 ross
5234 1.7 thorpej /*----------------------------------------------------------------------------
5235 1.7 thorpej | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5236 1.7 thorpej | the corresponding value `b', and 0 otherwise. The invalid exception is
5237 1.7 thorpej | raised if either operand is a NaN. Otherwise, the comparison is performed
5238 1.7 thorpej | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5239 1.7 thorpej *----------------------------------------------------------------------------*/
5240 1.7 thorpej
5241 1.1 ross flag float128_eq_signaling( float128 a, float128 b )
5242 1.1 ross {
5243 1.1 ross
5244 1.1 ross if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5245 1.1 ross && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5246 1.1 ross || ( ( extractFloat128Exp( b ) == 0x7FFF )
5247 1.1 ross && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5248 1.1 ross ) {
5249 1.1 ross float_raise( float_flag_invalid );
5250 1.1 ross return 0;
5251 1.1 ross }
5252 1.1 ross return
5253 1.1 ross ( a.low == b.low )
5254 1.1 ross && ( ( a.high == b.high )
5255 1.1 ross || ( ( a.low == 0 )
5256 1.1 ross && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5257 1.1 ross );
5258 1.1 ross
5259 1.1 ross }
5260 1.1 ross
5261 1.7 thorpej /*----------------------------------------------------------------------------
5262 1.7 thorpej | Returns 1 if the quadruple-precision floating-point value `a' is less than
5263 1.7 thorpej | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5264 1.7 thorpej | cause an exception. Otherwise, the comparison is performed according to the
5265 1.7 thorpej | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5266 1.7 thorpej *----------------------------------------------------------------------------*/
5267 1.7 thorpej
5268 1.1 ross flag float128_le_quiet( float128 a, float128 b )
5269 1.1 ross {
5270 1.1 ross flag aSign, bSign;
5271 1.1 ross
5272 1.1 ross if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5273 1.1 ross && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5274 1.1 ross || ( ( extractFloat128Exp( b ) == 0x7FFF )
5275 1.1 ross && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5276 1.1 ross ) {
5277 1.1 ross if ( float128_is_signaling_nan( a )
5278 1.1 ross || float128_is_signaling_nan( b ) ) {
5279 1.1 ross float_raise( float_flag_invalid );
5280 1.1 ross }
5281 1.1 ross return 0;
5282 1.1 ross }
5283 1.1 ross aSign = extractFloat128Sign( a );
5284 1.1 ross bSign = extractFloat128Sign( b );
5285 1.1 ross if ( aSign != bSign ) {
5286 1.1 ross return
5287 1.1 ross aSign
5288 1.1 ross || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5289 1.1 ross == 0 );
5290 1.1 ross }
5291 1.1 ross return
5292 1.1 ross aSign ? le128( b.high, b.low, a.high, a.low )
5293 1.1 ross : le128( a.high, a.low, b.high, b.low );
5294 1.1 ross
5295 1.1 ross }
5296 1.1 ross
5297 1.7 thorpej /*----------------------------------------------------------------------------
5298 1.7 thorpej | Returns 1 if the quadruple-precision floating-point value `a' is less than
5299 1.7 thorpej | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5300 1.7 thorpej | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5301 1.7 thorpej | Standard for Binary Floating-Point Arithmetic.
5302 1.7 thorpej *----------------------------------------------------------------------------*/
5303 1.7 thorpej
5304 1.1 ross flag float128_lt_quiet( float128 a, float128 b )
5305 1.1 ross {
5306 1.1 ross flag aSign, bSign;
5307 1.1 ross
5308 1.1 ross if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5309 1.1 ross && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5310 1.1 ross || ( ( extractFloat128Exp( b ) == 0x7FFF )
5311 1.1 ross && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5312 1.1 ross ) {
5313 1.1 ross if ( float128_is_signaling_nan( a )
5314 1.1 ross || float128_is_signaling_nan( b ) ) {
5315 1.1 ross float_raise( float_flag_invalid );
5316 1.1 ross }
5317 1.1 ross return 0;
5318 1.1 ross }
5319 1.1 ross aSign = extractFloat128Sign( a );
5320 1.1 ross bSign = extractFloat128Sign( b );
5321 1.1 ross if ( aSign != bSign ) {
5322 1.1 ross return
5323 1.1 ross aSign
5324 1.1 ross && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5325 1.1 ross != 0 );
5326 1.1 ross }
5327 1.1 ross return
5328 1.1 ross aSign ? lt128( b.high, b.low, a.high, a.low )
5329 1.1 ross : lt128( a.high, a.low, b.high, b.low );
5330 1.1 ross
5331 1.1 ross }
5332 1.1 ross
5333 1.1 ross #endif
5334 1.1 ross
5335 1.1 ross
5336 1.1 ross #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5337 1.1 ross
5338 1.1 ross /*
5339 1.1 ross * These two routines are not part of the original softfloat distribution.
5340 1.1 ross *
5341 1.1 ross * They are based on the corresponding conversions to integer but return
5342 1.1 ross * unsigned numbers instead since these functions are required by GCC.
5343 1.1 ross *
5344 1.3 keihan * Added by Mark Brinicombe <mark (at) NetBSD.org> 27/09/97
5345 1.1 ross *
5346 1.1 ross * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5347 1.1 ross */
5348 1.1 ross
5349 1.7 thorpej /*----------------------------------------------------------------------------
5350 1.7 thorpej | Returns the result of converting the double-precision floating-point value
5351 1.7 thorpej | `a' to the 32-bit unsigned integer format. The conversion is
5352 1.7 thorpej | performed according to the IEC/IEEE Standard for Binary Floating-point
5353 1.7 thorpej | Arithmetic, except that the conversion is always rounded toward zero. If
5354 1.7 thorpej | `a' is a NaN, the largest positive integer is returned. If the conversion
5355 1.7 thorpej | overflows, the largest integer positive is returned.
5356 1.7 thorpej *----------------------------------------------------------------------------*/
5357 1.7 thorpej
5358 1.1 ross uint32 float64_to_uint32_round_to_zero( float64 a )
5359 1.1 ross {
5360 1.1 ross flag aSign;
5361 1.1 ross int16 aExp, shiftCount;
5362 1.1 ross bits64 aSig, savedASig;
5363 1.1 ross uint32 z;
5364 1.1 ross
5365 1.1 ross aSig = extractFloat64Frac( a );
5366 1.1 ross aExp = extractFloat64Exp( a );
5367 1.1 ross aSign = extractFloat64Sign( a );
5368 1.1 ross
5369 1.1 ross if (aSign) {
5370 1.1 ross float_raise( float_flag_invalid );
5371 1.1 ross return(0);
5372 1.1 ross }
5373 1.1 ross
5374 1.1 ross if ( 0x41E < aExp ) {
5375 1.1 ross float_raise( float_flag_invalid );
5376 1.1 ross return 0xffffffff;
5377 1.1 ross }
5378 1.1 ross else if ( aExp < 0x3FF ) {
5379 1.1 ross if ( aExp || aSig ) float_set_inexact();
5380 1.1 ross return 0;
5381 1.1 ross }
5382 1.1 ross aSig |= LIT64( 0x0010000000000000 );
5383 1.1 ross shiftCount = 0x433 - aExp;
5384 1.1 ross savedASig = aSig;
5385 1.1 ross aSig >>= shiftCount;
5386 1.1 ross z = aSig;
5387 1.1 ross if ( ( aSig<<shiftCount ) != savedASig ) {
5388 1.1 ross float_set_inexact();
5389 1.1 ross }
5390 1.1 ross return z;
5391 1.1 ross
5392 1.1 ross }
5393 1.1 ross
5394 1.7 thorpej /*----------------------------------------------------------------------------
5395 1.7 thorpej | Returns the result of converting the single-precision floating-point value
5396 1.7 thorpej | `a' to the 32-bit unsigned integer format. The conversion is
5397 1.7 thorpej | performed according to the IEC/IEEE Standard for Binary Floating-point
5398 1.7 thorpej | Arithmetic, except that the conversion is always rounded toward zero. If
5399 1.7 thorpej | `a' is a NaN, the largest positive integer is returned. If the conversion
5400 1.7 thorpej | overflows, the largest positive integer is returned.
5401 1.7 thorpej *----------------------------------------------------------------------------*/
5402 1.7 thorpej
5403 1.1 ross uint32 float32_to_uint32_round_to_zero( float32 a )
5404 1.1 ross {
5405 1.1 ross flag aSign;
5406 1.1 ross int16 aExp, shiftCount;
5407 1.1 ross bits32 aSig;
5408 1.1 ross uint32 z;
5409 1.1 ross
5410 1.1 ross aSig = extractFloat32Frac( a );
5411 1.1 ross aExp = extractFloat32Exp( a );
5412 1.1 ross aSign = extractFloat32Sign( a );
5413 1.1 ross shiftCount = aExp - 0x9E;
5414 1.1 ross
5415 1.1 ross if (aSign) {
5416 1.1 ross float_raise( float_flag_invalid );
5417 1.1 ross return(0);
5418 1.1 ross }
5419 1.1 ross if ( 0 < shiftCount ) {
5420 1.1 ross float_raise( float_flag_invalid );
5421 1.1 ross return 0xFFFFFFFF;
5422 1.1 ross }
5423 1.1 ross else if ( aExp <= 0x7E ) {
5424 1.1 ross if ( aExp | aSig ) float_set_inexact();
5425 1.1 ross return 0;
5426 1.1 ross }
5427 1.1 ross aSig = ( aSig | 0x800000 )<<8;
5428 1.1 ross z = aSig>>( - shiftCount );
5429 1.1 ross if ( aSig<<( shiftCount & 31 ) ) {
5430 1.1 ross float_set_inexact();
5431 1.1 ross }
5432 1.1 ross return z;
5433 1.1 ross
5434 1.1 ross }
5435 1.1 ross
5436 1.1 ross #endif
5437 1.2 thorpej
5438 1.2 thorpej #endif /* _STANDALONE */
5439