softfloat.c revision 1.11.2.2 1 1.11.2.2 tls /* $NetBSD: softfloat.c,v 1.11.2.2 2014/08/20 00:02:15 tls Exp $ */
2 1.1 bjh21
3 1.1 bjh21 /*
4 1.1 bjh21 * This version hacked for use with gcc -msoft-float by bjh21.
5 1.1 bjh21 * (Mostly a case of #ifdefing out things GCC doesn't need or provides
6 1.1 bjh21 * itself).
7 1.1 bjh21 */
8 1.1 bjh21
9 1.1 bjh21 /*
10 1.1 bjh21 * Things you may want to define:
11 1.1 bjh21 *
12 1.1 bjh21 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13 1.1 bjh21 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them
14 1.1 bjh21 * properly renamed.
15 1.1 bjh21 */
16 1.1 bjh21
17 1.1 bjh21 /*
18 1.1 bjh21 ===============================================================================
19 1.1 bjh21
20 1.1 bjh21 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 1.1 bjh21 Arithmetic Package, Release 2a.
22 1.1 bjh21
23 1.1 bjh21 Written by John R. Hauser. This work was made possible in part by the
24 1.1 bjh21 International Computer Science Institute, located at Suite 600, 1947 Center
25 1.1 bjh21 Street, Berkeley, California 94704. Funding was partially provided by the
26 1.1 bjh21 National Science Foundation under grant MIP-9311980. The original version
27 1.1 bjh21 of this code was written as part of a project to build a fixed-point vector
28 1.1 bjh21 processor in collaboration with the University of California at Berkeley,
29 1.1 bjh21 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 1.1 bjh21 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 1.1 bjh21 arithmetic/SoftFloat.html'.
32 1.1 bjh21
33 1.1 bjh21 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 1.1 bjh21 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 1.1 bjh21 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 1.1 bjh21 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 1.1 bjh21 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 1.1 bjh21
39 1.1 bjh21 Derivative works are acceptable, even for commercial purposes, so long as
40 1.1 bjh21 (1) they include prominent notice that the work is derivative, and (2) they
41 1.1 bjh21 include prominent notice akin to these four paragraphs for those parts of
42 1.1 bjh21 this code that are retained.
43 1.1 bjh21
44 1.1 bjh21 ===============================================================================
45 1.1 bjh21 */
46 1.1 bjh21
47 1.1 bjh21 #include <sys/cdefs.h>
48 1.1 bjh21 #if defined(LIBC_SCCS) && !defined(lint)
49 1.11.2.2 tls __RCSID("$NetBSD: softfloat.c,v 1.11.2.2 2014/08/20 00:02:15 tls Exp $");
50 1.1 bjh21 #endif /* LIBC_SCCS and not lint */
51 1.1 bjh21
52 1.1 bjh21 #ifdef SOFTFLOAT_FOR_GCC
53 1.1 bjh21 #include "softfloat-for-gcc.h"
54 1.1 bjh21 #endif
55 1.1 bjh21
56 1.1 bjh21 #include "milieu.h"
57 1.1 bjh21 #include "softfloat.h"
58 1.1 bjh21
/*
 * Hooks for translating a float64 between the form in which it is stored
 * in memory and the form SoftFloat operates on.  Either macro may already
 * have been defined before this point; when one has not, the fallback
 * supplied here passes its argument through unchanged.
 */
#if !defined(FLOAT64_DEMANGLE)
#define FLOAT64_DEMANGLE(a)	(a)
#endif
#if !defined(FLOAT64_MANGLE)
#define FLOAT64_MANGLE(a)	(a)
#endif
69 1.1 bjh21
70 1.1 bjh21 /*
71 1.1 bjh21 -------------------------------------------------------------------------------
72 1.1 bjh21 Floating-point rounding mode, extended double-precision rounding precision,
73 1.1 bjh21 and exception flags.
74 1.1 bjh21 -------------------------------------------------------------------------------
75 1.1 bjh21 */
76 1.11.2.1 tls #ifndef set_float_rounding_mode
77 1.1 bjh21 fp_rnd float_rounding_mode = float_round_nearest_even;
78 1.1 bjh21 fp_except float_exception_flags = 0;
79 1.11.2.1 tls #endif
80 1.11.2.1 tls #ifndef set_float_exception_inexact_flag
81 1.11.2.1 tls #define set_float_exception_inexact_flag() \
82 1.11.2.1 tls ((void)(float_exception_flags |= float_flag_inexact))
83 1.11.2.1 tls #endif
84 1.1 bjh21 #ifdef FLOATX80
85 1.1 bjh21 int8 floatx80_rounding_precision = 80;
86 1.1 bjh21 #endif
87 1.1 bjh21
88 1.1 bjh21 /*
89 1.1 bjh21 -------------------------------------------------------------------------------
90 1.1 bjh21 Primitive arithmetic functions, including multi-word arithmetic, and
91 1.1 bjh21 division and square root approximations. (Can be specialized to target if
92 1.1 bjh21 desired.)
93 1.1 bjh21 -------------------------------------------------------------------------------
94 1.1 bjh21 */
95 1.1 bjh21 #include "softfloat-macros"
96 1.1 bjh21
97 1.1 bjh21 /*
98 1.1 bjh21 -------------------------------------------------------------------------------
99 1.1 bjh21 Functions and definitions to determine: (1) whether tininess for underflow
100 1.1 bjh21 is detected before or after rounding by default, (2) what (if anything)
101 1.1 bjh21 happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 1.1 bjh21 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 1.1 bjh21 are propagated from function inputs to output. These details are target-
104 1.1 bjh21 specific.
105 1.1 bjh21 -------------------------------------------------------------------------------
106 1.1 bjh21 */
107 1.1 bjh21 #include "softfloat-specialize"
108 1.1 bjh21
109 1.1 bjh21 #if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128)
110 1.1 bjh21 /*
111 1.1 bjh21 -------------------------------------------------------------------------------
112 1.1 bjh21 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
113 1.1 bjh21 and 7, and returns the properly rounded 32-bit integer corresponding to the
114 1.1 bjh21 input. If `zSign' is 1, the input is negated before being converted to an
115 1.1 bjh21 integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
116 1.1 bjh21 is simply rounded to an integer, with the inexact exception raised if the
117 1.1 bjh21 input cannot be represented exactly as an integer. However, if the fixed-
118 1.1 bjh21 point input is too large, the invalid exception is raised and the largest
119 1.1 bjh21 positive or negative integer is returned.
120 1.1 bjh21 -------------------------------------------------------------------------------
121 1.1 bjh21 */
122 1.1 bjh21 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
123 1.1 bjh21 {
124 1.1 bjh21 int8 roundingMode;
125 1.1 bjh21 flag roundNearestEven;
126 1.1 bjh21 int8 roundIncrement, roundBits;
127 1.1 bjh21 int32 z;
128 1.1 bjh21
129 1.1 bjh21 roundingMode = float_rounding_mode;
130 1.1 bjh21 roundNearestEven = ( roundingMode == float_round_nearest_even );
131 1.1 bjh21 roundIncrement = 0x40;
132 1.1 bjh21 if ( ! roundNearestEven ) {
133 1.1 bjh21 if ( roundingMode == float_round_to_zero ) {
134 1.1 bjh21 roundIncrement = 0;
135 1.1 bjh21 }
136 1.1 bjh21 else {
137 1.1 bjh21 roundIncrement = 0x7F;
138 1.1 bjh21 if ( zSign ) {
139 1.1 bjh21 if ( roundingMode == float_round_up ) roundIncrement = 0;
140 1.1 bjh21 }
141 1.1 bjh21 else {
142 1.1 bjh21 if ( roundingMode == float_round_down ) roundIncrement = 0;
143 1.1 bjh21 }
144 1.1 bjh21 }
145 1.1 bjh21 }
146 1.10 christos roundBits = (int8)(absZ & 0x7F);
147 1.1 bjh21 absZ = ( absZ + roundIncrement )>>7;
148 1.1 bjh21 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
149 1.10 christos z = (int32)absZ;
150 1.1 bjh21 if ( zSign ) z = - z;
151 1.1 bjh21 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
152 1.1 bjh21 float_raise( float_flag_invalid );
153 1.1 bjh21 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
154 1.1 bjh21 }
155 1.11.2.1 tls if ( roundBits ) set_float_exception_inexact_flag();
156 1.1 bjh21 return z;
157 1.1 bjh21
158 1.1 bjh21 }
159 1.1 bjh21
160 1.1 bjh21 /*
161 1.1 bjh21 -------------------------------------------------------------------------------
162 1.1 bjh21 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
163 1.1 bjh21 `absZ1', with binary point between bits 63 and 64 (between the input words),
164 1.1 bjh21 and returns the properly rounded 64-bit integer corresponding to the input.
165 1.1 bjh21 If `zSign' is 1, the input is negated before being converted to an integer.
166 1.1 bjh21 Ordinarily, the fixed-point input is simply rounded to an integer, with
167 1.1 bjh21 the inexact exception raised if the input cannot be represented exactly as
168 1.1 bjh21 an integer. However, if the fixed-point input is too large, the invalid
169 1.1 bjh21 exception is raised and the largest positive or negative integer is
170 1.1 bjh21 returned.
171 1.1 bjh21 -------------------------------------------------------------------------------
172 1.1 bjh21 */
173 1.1 bjh21 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
174 1.1 bjh21 {
175 1.1 bjh21 int8 roundingMode;
176 1.1 bjh21 flag roundNearestEven, increment;
177 1.1 bjh21 int64 z;
178 1.1 bjh21
179 1.1 bjh21 roundingMode = float_rounding_mode;
180 1.1 bjh21 roundNearestEven = ( roundingMode == float_round_nearest_even );
181 1.1 bjh21 increment = ( (sbits64) absZ1 < 0 );
182 1.1 bjh21 if ( ! roundNearestEven ) {
183 1.1 bjh21 if ( roundingMode == float_round_to_zero ) {
184 1.1 bjh21 increment = 0;
185 1.1 bjh21 }
186 1.1 bjh21 else {
187 1.1 bjh21 if ( zSign ) {
188 1.1 bjh21 increment = ( roundingMode == float_round_down ) && absZ1;
189 1.1 bjh21 }
190 1.1 bjh21 else {
191 1.1 bjh21 increment = ( roundingMode == float_round_up ) && absZ1;
192 1.1 bjh21 }
193 1.1 bjh21 }
194 1.1 bjh21 }
195 1.1 bjh21 if ( increment ) {
196 1.1 bjh21 ++absZ0;
197 1.1 bjh21 if ( absZ0 == 0 ) goto overflow;
198 1.1 bjh21 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
199 1.1 bjh21 }
200 1.1 bjh21 z = absZ0;
201 1.1 bjh21 if ( zSign ) z = - z;
202 1.1 bjh21 if ( z && ( ( z < 0 ) ^ zSign ) ) {
203 1.1 bjh21 overflow:
204 1.1 bjh21 float_raise( float_flag_invalid );
205 1.1 bjh21 return
206 1.1 bjh21 zSign ? (sbits64) LIT64( 0x8000000000000000 )
207 1.1 bjh21 : LIT64( 0x7FFFFFFFFFFFFFFF );
208 1.1 bjh21 }
209 1.11.2.1 tls if ( absZ1 ) set_float_exception_inexact_flag();
210 1.1 bjh21 return z;
211 1.1 bjh21
212 1.1 bjh21 }
213 1.1 bjh21 #endif
214 1.1 bjh21
215 1.1 bjh21 /*
216 1.1 bjh21 -------------------------------------------------------------------------------
217 1.1 bjh21 Returns the fraction bits of the single-precision floating-point value `a'.
218 1.1 bjh21 -------------------------------------------------------------------------------
219 1.1 bjh21 */
220 1.1 bjh21 INLINE bits32 extractFloat32Frac( float32 a )
221 1.1 bjh21 {
222 1.1 bjh21
223 1.1 bjh21 return a & 0x007FFFFF;
224 1.1 bjh21
225 1.1 bjh21 }
226 1.1 bjh21
227 1.1 bjh21 /*
228 1.1 bjh21 -------------------------------------------------------------------------------
229 1.1 bjh21 Returns the exponent bits of the single-precision floating-point value `a'.
230 1.1 bjh21 -------------------------------------------------------------------------------
231 1.1 bjh21 */
232 1.1 bjh21 INLINE int16 extractFloat32Exp( float32 a )
233 1.1 bjh21 {
234 1.1 bjh21
235 1.1 bjh21 return ( a>>23 ) & 0xFF;
236 1.1 bjh21
237 1.1 bjh21 }
238 1.1 bjh21
239 1.1 bjh21 /*
240 1.1 bjh21 -------------------------------------------------------------------------------
241 1.1 bjh21 Returns the sign bit of the single-precision floating-point value `a'.
242 1.1 bjh21 -------------------------------------------------------------------------------
243 1.1 bjh21 */
244 1.1 bjh21 INLINE flag extractFloat32Sign( float32 a )
245 1.1 bjh21 {
246 1.1 bjh21
247 1.1 bjh21 return a>>31;
248 1.1 bjh21
249 1.1 bjh21 }
250 1.1 bjh21
251 1.1 bjh21 /*
252 1.1 bjh21 -------------------------------------------------------------------------------
253 1.1 bjh21 Normalizes the subnormal single-precision floating-point value represented
254 1.1 bjh21 by the denormalized significand `aSig'. The normalized exponent and
255 1.1 bjh21 significand are stored at the locations pointed to by `zExpPtr' and
256 1.1 bjh21 `zSigPtr', respectively.
257 1.1 bjh21 -------------------------------------------------------------------------------
258 1.1 bjh21 */
259 1.1 bjh21 static void
260 1.1 bjh21 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
261 1.1 bjh21 {
262 1.1 bjh21 int8 shiftCount;
263 1.1 bjh21
264 1.1 bjh21 shiftCount = countLeadingZeros32( aSig ) - 8;
265 1.1 bjh21 *zSigPtr = aSig<<shiftCount;
266 1.1 bjh21 *zExpPtr = 1 - shiftCount;
267 1.1 bjh21
268 1.1 bjh21 }
269 1.1 bjh21
270 1.1 bjh21 /*
271 1.1 bjh21 -------------------------------------------------------------------------------
272 1.1 bjh21 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
273 1.1 bjh21 single-precision floating-point value, returning the result. After being
274 1.1 bjh21 shifted into the proper positions, the three fields are simply added
275 1.1 bjh21 together to form the result. This means that any integer portion of `zSig'
276 1.1 bjh21 will be added into the exponent. Since a properly normalized significand
277 1.1 bjh21 will have an integer portion equal to 1, the `zExp' input should be 1 less
278 1.1 bjh21 than the desired result exponent whenever `zSig' is a complete, normalized
279 1.1 bjh21 significand.
280 1.1 bjh21 -------------------------------------------------------------------------------
281 1.1 bjh21 */
282 1.1 bjh21 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
283 1.1 bjh21 {
284 1.1 bjh21
285 1.1 bjh21 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
286 1.1 bjh21
287 1.1 bjh21 }
288 1.1 bjh21
289 1.1 bjh21 /*
290 1.1 bjh21 -------------------------------------------------------------------------------
291 1.1 bjh21 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
292 1.1 bjh21 and significand `zSig', and returns the proper single-precision floating-
293 1.1 bjh21 point value corresponding to the abstract input. Ordinarily, the abstract
294 1.1 bjh21 value is simply rounded and packed into the single-precision format, with
295 1.1 bjh21 the inexact exception raised if the abstract input cannot be represented
296 1.1 bjh21 exactly. However, if the abstract value is too large, the overflow and
297 1.1 bjh21 inexact exceptions are raised and an infinity or maximal finite value is
298 1.1 bjh21 returned. If the abstract value is too small, the input value is rounded to
299 1.1 bjh21 a subnormal number, and the underflow and inexact exceptions are raised if
300 1.1 bjh21 the abstract input cannot be represented exactly as a subnormal single-
301 1.1 bjh21 precision floating-point number.
302 1.1 bjh21 The input significand `zSig' has its binary point between bits 30
303 1.1 bjh21 and 29, which is 7 bits to the left of the usual location. This shifted
304 1.1 bjh21 significand must be normalized or smaller. If `zSig' is not normalized,
305 1.1 bjh21 `zExp' must be 0; in that case, the result returned is a subnormal number,
306 1.1 bjh21 and it must not require rounding. In the usual case that `zSig' is
307 1.1 bjh21 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
308 1.1 bjh21 The handling of underflow and overflow follows the IEC/IEEE Standard for
309 1.1 bjh21 Binary Floating-Point Arithmetic.
310 1.1 bjh21 -------------------------------------------------------------------------------
311 1.1 bjh21 */
312 1.1 bjh21 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
313 1.1 bjh21 {
314 1.1 bjh21 int8 roundingMode;
315 1.1 bjh21 flag roundNearestEven;
316 1.1 bjh21 int8 roundIncrement, roundBits;
317 1.1 bjh21 flag isTiny;
318 1.1 bjh21
319 1.1 bjh21 roundingMode = float_rounding_mode;
320 1.1 bjh21 roundNearestEven = ( roundingMode == float_round_nearest_even );
321 1.1 bjh21 roundIncrement = 0x40;
322 1.1 bjh21 if ( ! roundNearestEven ) {
323 1.1 bjh21 if ( roundingMode == float_round_to_zero ) {
324 1.1 bjh21 roundIncrement = 0;
325 1.1 bjh21 }
326 1.1 bjh21 else {
327 1.1 bjh21 roundIncrement = 0x7F;
328 1.1 bjh21 if ( zSign ) {
329 1.1 bjh21 if ( roundingMode == float_round_up ) roundIncrement = 0;
330 1.1 bjh21 }
331 1.1 bjh21 else {
332 1.1 bjh21 if ( roundingMode == float_round_down ) roundIncrement = 0;
333 1.1 bjh21 }
334 1.1 bjh21 }
335 1.1 bjh21 }
336 1.1 bjh21 roundBits = zSig & 0x7F;
337 1.1 bjh21 if ( 0xFD <= (bits16) zExp ) {
338 1.1 bjh21 if ( ( 0xFD < zExp )
339 1.1 bjh21 || ( ( zExp == 0xFD )
340 1.1 bjh21 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
341 1.1 bjh21 ) {
342 1.1 bjh21 float_raise( float_flag_overflow | float_flag_inexact );
343 1.1 bjh21 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
344 1.1 bjh21 }
345 1.1 bjh21 if ( zExp < 0 ) {
346 1.1 bjh21 isTiny =
347 1.1 bjh21 ( float_detect_tininess == float_tininess_before_rounding )
348 1.1 bjh21 || ( zExp < -1 )
349 1.10 christos || ( zSig + roundIncrement < 0x80000000U );
350 1.1 bjh21 shift32RightJamming( zSig, - zExp, &zSig );
351 1.1 bjh21 zExp = 0;
352 1.1 bjh21 roundBits = zSig & 0x7F;
353 1.1 bjh21 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
354 1.1 bjh21 }
355 1.1 bjh21 }
356 1.11.2.1 tls if ( roundBits ) set_float_exception_inexact_flag();
357 1.1 bjh21 zSig = ( zSig + roundIncrement )>>7;
358 1.1 bjh21 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
359 1.1 bjh21 if ( zSig == 0 ) zExp = 0;
360 1.1 bjh21 return packFloat32( zSign, zExp, zSig );
361 1.1 bjh21
362 1.1 bjh21 }
363 1.1 bjh21
364 1.1 bjh21 /*
365 1.1 bjh21 -------------------------------------------------------------------------------
366 1.1 bjh21 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
367 1.1 bjh21 and significand `zSig', and returns the proper single-precision floating-
368 1.1 bjh21 point value corresponding to the abstract input. This routine is just like
369 1.1 bjh21 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
370 1.1 bjh21 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
371 1.1 bjh21 floating-point exponent.
372 1.1 bjh21 -------------------------------------------------------------------------------
373 1.1 bjh21 */
374 1.1 bjh21 static float32
375 1.1 bjh21 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
376 1.1 bjh21 {
377 1.1 bjh21 int8 shiftCount;
378 1.1 bjh21
379 1.1 bjh21 shiftCount = countLeadingZeros32( zSig ) - 1;
380 1.1 bjh21 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
381 1.1 bjh21
382 1.1 bjh21 }
383 1.1 bjh21
384 1.1 bjh21 /*
385 1.1 bjh21 -------------------------------------------------------------------------------
386 1.1 bjh21 Returns the fraction bits of the double-precision floating-point value `a'.
387 1.1 bjh21 -------------------------------------------------------------------------------
388 1.1 bjh21 */
389 1.1 bjh21 INLINE bits64 extractFloat64Frac( float64 a )
390 1.1 bjh21 {
391 1.1 bjh21
392 1.1 bjh21 return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
393 1.1 bjh21
394 1.1 bjh21 }
395 1.1 bjh21
396 1.1 bjh21 /*
397 1.1 bjh21 -------------------------------------------------------------------------------
398 1.1 bjh21 Returns the exponent bits of the double-precision floating-point value `a'.
399 1.1 bjh21 -------------------------------------------------------------------------------
400 1.1 bjh21 */
401 1.1 bjh21 INLINE int16 extractFloat64Exp( float64 a )
402 1.1 bjh21 {
403 1.1 bjh21
404 1.10 christos return (int16)((FLOAT64_DEMANGLE(a) >> 52) & 0x7FF);
405 1.1 bjh21
406 1.1 bjh21 }
407 1.1 bjh21
408 1.1 bjh21 /*
409 1.1 bjh21 -------------------------------------------------------------------------------
410 1.1 bjh21 Returns the sign bit of the double-precision floating-point value `a'.
411 1.1 bjh21 -------------------------------------------------------------------------------
412 1.1 bjh21 */
413 1.1 bjh21 INLINE flag extractFloat64Sign( float64 a )
414 1.1 bjh21 {
415 1.1 bjh21
416 1.10 christos return (flag)(FLOAT64_DEMANGLE(a) >> 63);
417 1.1 bjh21
418 1.1 bjh21 }
419 1.1 bjh21
420 1.1 bjh21 /*
421 1.1 bjh21 -------------------------------------------------------------------------------
422 1.1 bjh21 Normalizes the subnormal double-precision floating-point value represented
423 1.1 bjh21 by the denormalized significand `aSig'. The normalized exponent and
424 1.1 bjh21 significand are stored at the locations pointed to by `zExpPtr' and
425 1.1 bjh21 `zSigPtr', respectively.
426 1.1 bjh21 -------------------------------------------------------------------------------
427 1.1 bjh21 */
428 1.1 bjh21 static void
429 1.1 bjh21 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
430 1.1 bjh21 {
431 1.1 bjh21 int8 shiftCount;
432 1.1 bjh21
433 1.1 bjh21 shiftCount = countLeadingZeros64( aSig ) - 11;
434 1.1 bjh21 *zSigPtr = aSig<<shiftCount;
435 1.1 bjh21 *zExpPtr = 1 - shiftCount;
436 1.1 bjh21
437 1.1 bjh21 }
438 1.1 bjh21
439 1.1 bjh21 /*
440 1.1 bjh21 -------------------------------------------------------------------------------
441 1.1 bjh21 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
442 1.1 bjh21 double-precision floating-point value, returning the result. After being
443 1.1 bjh21 shifted into the proper positions, the three fields are simply added
444 1.1 bjh21 together to form the result. This means that any integer portion of `zSig'
445 1.1 bjh21 will be added into the exponent. Since a properly normalized significand
446 1.1 bjh21 will have an integer portion equal to 1, the `zExp' input should be 1 less
447 1.1 bjh21 than the desired result exponent whenever `zSig' is a complete, normalized
448 1.1 bjh21 significand.
449 1.1 bjh21 -------------------------------------------------------------------------------
450 1.1 bjh21 */
451 1.1 bjh21 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
452 1.1 bjh21 {
453 1.1 bjh21
454 1.1 bjh21 return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
455 1.1 bjh21 ( ( (bits64) zExp )<<52 ) + zSig );
456 1.1 bjh21
457 1.1 bjh21 }
458 1.1 bjh21
459 1.1 bjh21 /*
460 1.1 bjh21 -------------------------------------------------------------------------------
461 1.1 bjh21 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
462 1.1 bjh21 and significand `zSig', and returns the proper double-precision floating-
463 1.1 bjh21 point value corresponding to the abstract input. Ordinarily, the abstract
464 1.1 bjh21 value is simply rounded and packed into the double-precision format, with
465 1.1 bjh21 the inexact exception raised if the abstract input cannot be represented
466 1.1 bjh21 exactly. However, if the abstract value is too large, the overflow and
467 1.1 bjh21 inexact exceptions are raised and an infinity or maximal finite value is
468 1.1 bjh21 returned. If the abstract value is too small, the input value is rounded to
469 1.1 bjh21 a subnormal number, and the underflow and inexact exceptions are raised if
470 1.1 bjh21 the abstract input cannot be represented exactly as a subnormal double-
471 1.1 bjh21 precision floating-point number.
472 1.1 bjh21 The input significand `zSig' has its binary point between bits 62
473 1.1 bjh21 and 61, which is 10 bits to the left of the usual location. This shifted
474 1.1 bjh21 significand must be normalized or smaller. If `zSig' is not normalized,
475 1.1 bjh21 `zExp' must be 0; in that case, the result returned is a subnormal number,
476 1.1 bjh21 and it must not require rounding. In the usual case that `zSig' is
477 1.1 bjh21 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
478 1.1 bjh21 The handling of underflow and overflow follows the IEC/IEEE Standard for
479 1.1 bjh21 Binary Floating-Point Arithmetic.
480 1.1 bjh21 -------------------------------------------------------------------------------
481 1.1 bjh21 */
482 1.1 bjh21 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
483 1.1 bjh21 {
484 1.1 bjh21 int8 roundingMode;
485 1.1 bjh21 flag roundNearestEven;
486 1.1 bjh21 int16 roundIncrement, roundBits;
487 1.1 bjh21 flag isTiny;
488 1.1 bjh21
489 1.1 bjh21 roundingMode = float_rounding_mode;
490 1.1 bjh21 roundNearestEven = ( roundingMode == float_round_nearest_even );
491 1.1 bjh21 roundIncrement = 0x200;
492 1.1 bjh21 if ( ! roundNearestEven ) {
493 1.1 bjh21 if ( roundingMode == float_round_to_zero ) {
494 1.1 bjh21 roundIncrement = 0;
495 1.1 bjh21 }
496 1.1 bjh21 else {
497 1.1 bjh21 roundIncrement = 0x3FF;
498 1.1 bjh21 if ( zSign ) {
499 1.1 bjh21 if ( roundingMode == float_round_up ) roundIncrement = 0;
500 1.1 bjh21 }
501 1.1 bjh21 else {
502 1.1 bjh21 if ( roundingMode == float_round_down ) roundIncrement = 0;
503 1.1 bjh21 }
504 1.1 bjh21 }
505 1.1 bjh21 }
506 1.10 christos roundBits = (int16)(zSig & 0x3FF);
507 1.1 bjh21 if ( 0x7FD <= (bits16) zExp ) {
508 1.1 bjh21 if ( ( 0x7FD < zExp )
509 1.1 bjh21 || ( ( zExp == 0x7FD )
510 1.1 bjh21 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
511 1.1 bjh21 ) {
512 1.1 bjh21 float_raise( float_flag_overflow | float_flag_inexact );
513 1.1 bjh21 return FLOAT64_MANGLE(
514 1.1 bjh21 FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
515 1.1 bjh21 ( roundIncrement == 0 ));
516 1.1 bjh21 }
517 1.1 bjh21 if ( zExp < 0 ) {
518 1.1 bjh21 isTiny =
519 1.1 bjh21 ( float_detect_tininess == float_tininess_before_rounding )
520 1.1 bjh21 || ( zExp < -1 )
521 1.10 christos || ( zSig + roundIncrement < (bits64)LIT64( 0x8000000000000000 ) );
522 1.1 bjh21 shift64RightJamming( zSig, - zExp, &zSig );
523 1.1 bjh21 zExp = 0;
524 1.10 christos roundBits = (int16)(zSig & 0x3FF);
525 1.1 bjh21 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
526 1.1 bjh21 }
527 1.1 bjh21 }
528 1.11.2.1 tls if ( roundBits ) set_float_exception_inexact_flag();
529 1.1 bjh21 zSig = ( zSig + roundIncrement )>>10;
530 1.1 bjh21 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
531 1.1 bjh21 if ( zSig == 0 ) zExp = 0;
532 1.1 bjh21 return packFloat64( zSign, zExp, zSig );
533 1.1 bjh21
534 1.1 bjh21 }
535 1.1 bjh21
536 1.1 bjh21 /*
537 1.1 bjh21 -------------------------------------------------------------------------------
538 1.1 bjh21 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
539 1.1 bjh21 and significand `zSig', and returns the proper double-precision floating-
540 1.1 bjh21 point value corresponding to the abstract input. This routine is just like
541 1.1 bjh21 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
542 1.1 bjh21 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
543 1.1 bjh21 floating-point exponent.
544 1.1 bjh21 -------------------------------------------------------------------------------
545 1.1 bjh21 */
546 1.1 bjh21 static float64
547 1.1 bjh21 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
548 1.1 bjh21 {
549 1.1 bjh21 int8 shiftCount;
550 1.1 bjh21
551 1.1 bjh21 shiftCount = countLeadingZeros64( zSig ) - 1;
552 1.1 bjh21 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
553 1.1 bjh21
554 1.1 bjh21 }
555 1.1 bjh21
556 1.1 bjh21 #ifdef FLOATX80
557 1.1 bjh21
558 1.1 bjh21 /*
559 1.1 bjh21 -------------------------------------------------------------------------------
560 1.1 bjh21 Returns the fraction bits of the extended double-precision floating-point
561 1.1 bjh21 value `a'.
562 1.1 bjh21 -------------------------------------------------------------------------------
563 1.1 bjh21 */
564 1.1 bjh21 INLINE bits64 extractFloatx80Frac( floatx80 a )
565 1.1 bjh21 {
566 1.1 bjh21
567 1.1 bjh21 return a.low;
568 1.1 bjh21
569 1.1 bjh21 }
570 1.1 bjh21
571 1.1 bjh21 /*
572 1.1 bjh21 -------------------------------------------------------------------------------
573 1.1 bjh21 Returns the exponent bits of the extended double-precision floating-point
574 1.1 bjh21 value `a'.
575 1.1 bjh21 -------------------------------------------------------------------------------
576 1.1 bjh21 */
577 1.1 bjh21 INLINE int32 extractFloatx80Exp( floatx80 a )
578 1.1 bjh21 {
579 1.1 bjh21
580 1.1 bjh21 return a.high & 0x7FFF;
581 1.1 bjh21
582 1.1 bjh21 }
583 1.1 bjh21
584 1.1 bjh21 /*
585 1.1 bjh21 -------------------------------------------------------------------------------
586 1.1 bjh21 Returns the sign bit of the extended double-precision floating-point value
587 1.1 bjh21 `a'.
588 1.1 bjh21 -------------------------------------------------------------------------------
589 1.1 bjh21 */
590 1.1 bjh21 INLINE flag extractFloatx80Sign( floatx80 a )
591 1.1 bjh21 {
592 1.1 bjh21
593 1.1 bjh21 return a.high>>15;
594 1.1 bjh21
595 1.1 bjh21 }
596 1.1 bjh21
597 1.1 bjh21 /*
598 1.1 bjh21 -------------------------------------------------------------------------------
599 1.1 bjh21 Normalizes the subnormal extended double-precision floating-point value
600 1.1 bjh21 represented by the denormalized significand `aSig'. The normalized exponent
601 1.1 bjh21 and significand are stored at the locations pointed to by `zExpPtr' and
602 1.1 bjh21 `zSigPtr', respectively.
603 1.1 bjh21 -------------------------------------------------------------------------------
604 1.1 bjh21 */
605 1.1 bjh21 static void
606 1.1 bjh21 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
607 1.1 bjh21 {
608 1.1 bjh21 int8 shiftCount;
609 1.1 bjh21
610 1.1 bjh21 shiftCount = countLeadingZeros64( aSig );
611 1.1 bjh21 *zSigPtr = aSig<<shiftCount;
612 1.1 bjh21 *zExpPtr = 1 - shiftCount;
613 1.1 bjh21
614 1.1 bjh21 }
615 1.1 bjh21
616 1.1 bjh21 /*
617 1.1 bjh21 -------------------------------------------------------------------------------
618 1.1 bjh21 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
619 1.1 bjh21 extended double-precision floating-point value, returning the result.
620 1.1 bjh21 -------------------------------------------------------------------------------
621 1.1 bjh21 */
622 1.1 bjh21 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
623 1.1 bjh21 {
624 1.1 bjh21 floatx80 z;
625 1.1 bjh21
626 1.1 bjh21 z.low = zSig;
627 1.1 bjh21 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
628 1.1 bjh21 return z;
629 1.1 bjh21
630 1.1 bjh21 }
631 1.1 bjh21
632 1.1 bjh21 /*
633 1.1 bjh21 -------------------------------------------------------------------------------
634 1.1 bjh21 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
635 1.1 bjh21 and extended significand formed by the concatenation of `zSig0' and `zSig1',
636 1.1 bjh21 and returns the proper extended double-precision floating-point value
637 1.1 bjh21 corresponding to the abstract input. Ordinarily, the abstract value is
638 1.1 bjh21 rounded and packed into the extended double-precision format, with the
639 1.1 bjh21 inexact exception raised if the abstract input cannot be represented
640 1.1 bjh21 exactly. However, if the abstract value is too large, the overflow and
641 1.1 bjh21 inexact exceptions are raised and an infinity or maximal finite value is
642 1.1 bjh21 returned. If the abstract value is too small, the input value is rounded to
643 1.1 bjh21 a subnormal number, and the underflow and inexact exceptions are raised if
644 1.1 bjh21 the abstract input cannot be represented exactly as a subnormal extended
645 1.1 bjh21 double-precision floating-point number.
646 1.1 bjh21 If `roundingPrecision' is 32 or 64, the result is rounded to the same
647 1.1 bjh21 number of bits as single or double precision, respectively. Otherwise, the
648 1.1 bjh21 result is rounded to the full precision of the extended double-precision
649 1.1 bjh21 format.
650 1.1 bjh21 The input significand must be normalized or smaller. If the input
651 1.1 bjh21 significand is not normalized, `zExp' must be 0; in that case, the result
652 1.1 bjh21 returned is a subnormal number, and it must not require rounding. The
653 1.1 bjh21 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
654 1.1 bjh21 Floating-Point Arithmetic.
655 1.1 bjh21 -------------------------------------------------------------------------------
656 1.1 bjh21 */
657 1.1 bjh21 static floatx80
658 1.1 bjh21 roundAndPackFloatx80(
659 1.1 bjh21 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
660 1.1 bjh21 )
661 1.1 bjh21 {
662 1.1 bjh21 int8 roundingMode;
663 1.1 bjh21 flag roundNearestEven, increment, isTiny;
664 1.1 bjh21 int64 roundIncrement, roundMask, roundBits;
665 1.1 bjh21
666 1.1 bjh21 roundingMode = float_rounding_mode;
667 1.1 bjh21 roundNearestEven = ( roundingMode == float_round_nearest_even );
/*
 * Select the rounding path.  A roundingPrecision of 64 or 32 rounds within
 * zSig0 using a mask of its low 11 or 40 bits respectively, with the
 * increment set to half of the lowest kept bit.  Any other value (80
 * included) takes the full-precision path at `precision80'.
 */
668 1.1 bjh21 if ( roundingPrecision == 80 ) goto precision80;
669 1.1 bjh21 if ( roundingPrecision == 64 ) {
670 1.1 bjh21 roundIncrement = LIT64( 0x0000000000000400 );
671 1.1 bjh21 roundMask = LIT64( 0x00000000000007FF );
672 1.1 bjh21 }
673 1.1 bjh21 else if ( roundingPrecision == 32 ) {
674 1.1 bjh21 roundIncrement = LIT64( 0x0000008000000000 );
675 1.1 bjh21 roundMask = LIT64( 0x000000FFFFFFFFFF );
676 1.1 bjh21 }
677 1.1 bjh21 else {
678 1.1 bjh21 goto precision80;
679 1.1 bjh21 }
/* Fold any nonzero bits of zSig1 into bit 0 of zSig0 as a sticky bit. */
680 1.1 bjh21 zSig0 |= ( zSig1 != 0 );
/*
 * Outside round-to-nearest-even the increment becomes either 0 (truncate)
 * or the whole mask (any discarded bit carries into the kept bits):
 * 0 for round-to-zero, for round-up with a negative sign, and for
 * round-down with a positive sign; roundMask otherwise.
 */
681 1.1 bjh21 if ( ! roundNearestEven ) {
682 1.1 bjh21 if ( roundingMode == float_round_to_zero ) {
683 1.1 bjh21 roundIncrement = 0;
684 1.1 bjh21 }
685 1.1 bjh21 else {
686 1.1 bjh21 roundIncrement = roundMask;
687 1.1 bjh21 if ( zSign ) {
688 1.1 bjh21 if ( roundingMode == float_round_up ) roundIncrement = 0;
689 1.1 bjh21 }
690 1.1 bjh21 else {
691 1.1 bjh21 if ( roundingMode == float_round_down ) roundIncrement = 0;
692 1.1 bjh21 }
693 1.1 bjh21 }
694 1.1 bjh21 }
695 1.1 bjh21 roundBits = zSig0 & roundMask;
/*
 * With the unsigned cast, this single comparison is true exactly when
 * zExp <= 0 or zExp >= 0x7FFE, i.e. for the exponents that may underflow
 * or overflow.
 */
696 1.1 bjh21 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
/* At zExp == 0x7FFE, overflow only if the increment carries out of zSig0. */
697 1.1 bjh21 if ( ( 0x7FFE < zExp )
698 1.1 bjh21 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
699 1.1 bjh21 ) {
700 1.1 bjh21 goto overflow;
701 1.1 bjh21 }
/*
 * Subnormal result.  isTiny is set when tininess is detected before
 * rounding, when zExp < 0, or when adding the increment does not wrap
 * zSig0.  The significand is then shifted right by 1 - zExp with jamming
 * and the rounding bits are recomputed from the shifted value.
 */
702 1.1 bjh21 if ( zExp <= 0 ) {
703 1.1 bjh21 isTiny =
704 1.1 bjh21 ( float_detect_tininess == float_tininess_before_rounding )
705 1.1 bjh21 || ( zExp < 0 )
706 1.1 bjh21 || ( zSig0 <= zSig0 + roundIncrement );
707 1.1 bjh21 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
708 1.1 bjh21 zExp = 0;
709 1.1 bjh21 roundBits = zSig0 & roundMask;
710 1.1 bjh21 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
711 1.11.2.1 tls if ( roundBits ) set_float_exception_inexact_flag();
712 1.1 bjh21 zSig0 += roundIncrement;
/* Rounding carried into bit 63 of zSig0: the exponent field becomes 1. */
713 1.1 bjh21 if ( (sbits64) zSig0 < 0 ) zExp = 1;
/*
 * roundMask + 1 is the lowest kept bit.  On an exact tie under
 * round-to-nearest-even that bit is added to the mask so it is cleared
 * too, leaving an even result.
 */
714 1.1 bjh21 roundIncrement = roundMask + 1;
715 1.1 bjh21 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
716 1.1 bjh21 roundMask |= roundIncrement;
717 1.1 bjh21 }
718 1.1 bjh21 zSig0 &= ~ roundMask;
719 1.1 bjh21 return packFloatx80( zSign, zExp, zSig0 );
720 1.1 bjh21 }
721 1.1 bjh21 }
/* Reduced-precision path, exponent in range (or zExp == 0x7FFE without carry). */
722 1.11.2.1 tls if ( roundBits ) set_float_exception_inexact_flag();
723 1.1 bjh21 zSig0 += roundIncrement;
/* A sum smaller than the increment means zSig0 wrapped: bump the exponent. */
724 1.1 bjh21 if ( zSig0 < roundIncrement ) {
725 1.1 bjh21 ++zExp;
726 1.1 bjh21 zSig0 = LIT64( 0x8000000000000000 );
727 1.1 bjh21 }
/* Same tie-to-even handling as in the subnormal branch above. */
728 1.1 bjh21 roundIncrement = roundMask + 1;
729 1.1 bjh21 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
730 1.1 bjh21 roundMask |= roundIncrement;
731 1.1 bjh21 }
732 1.1 bjh21 zSig0 &= ~ roundMask;
733 1.1 bjh21 if ( zSig0 == 0 ) zExp = 0;
734 1.1 bjh21 return packFloatx80( zSign, zExp, zSig0 );
/*
 * Full-precision path: zSig1 holds the bits below the 64-bit significand.
 * `increment' starts as the top bit of zSig1 (round-to-nearest-even).
 * Round-to-zero clears it; the directed modes set it only when zSig1 is
 * nonzero and the mode is round-down for a negative sign or round-up for a
 * positive sign.
 */
735 1.1 bjh21 precision80:
736 1.1 bjh21 increment = ( (sbits64) zSig1 < 0 );
737 1.1 bjh21 if ( ! roundNearestEven ) {
738 1.1 bjh21 if ( roundingMode == float_round_to_zero ) {
739 1.1 bjh21 increment = 0;
740 1.1 bjh21 }
741 1.1 bjh21 else {
742 1.1 bjh21 if ( zSign ) {
743 1.1 bjh21 increment = ( roundingMode == float_round_down ) && zSig1;
744 1.1 bjh21 }
745 1.1 bjh21 else {
746 1.1 bjh21 increment = ( roundingMode == float_round_up ) && zSig1;
747 1.1 bjh21 }
748 1.1 bjh21 }
749 1.1 bjh21 }
/* Same range test as in the reduced-precision path: zExp <= 0 or zExp >= 0x7FFE. */
750 1.1 bjh21 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
751 1.1 bjh21 if ( ( 0x7FFE < zExp )
752 1.1 bjh21 || ( ( zExp == 0x7FFE )
753 1.1 bjh21 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
754 1.1 bjh21 && increment
755 1.1 bjh21 )
756 1.1 bjh21 ) {
/*
 * Shared overflow handling.  The reduced-precision path jumps to `overflow'
 * with its own roundMask; this path clears the mask first.  Overflow and
 * inexact are raised.  Round-to-zero, round-up with a negative sign and
 * round-down with a positive sign return exponent 0x7FFE with every
 * significand bit outside roundMask set; all other cases return exponent
 * 0x7FFF with significand 0x8000000000000000.
 */
757 1.1 bjh21 roundMask = 0;
758 1.1 bjh21 overflow:
759 1.1 bjh21 float_raise( float_flag_overflow | float_flag_inexact );
760 1.1 bjh21 if ( ( roundingMode == float_round_to_zero )
761 1.1 bjh21 || ( zSign && ( roundingMode == float_round_up ) )
762 1.1 bjh21 || ( ! zSign && ( roundingMode == float_round_down ) )
763 1.1 bjh21 ) {
764 1.1 bjh21 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
765 1.1 bjh21 }
766 1.1 bjh21 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
767 1.1 bjh21 }
/*
 * Subnormal result at full precision.  isTiny is set when tininess is
 * detected before rounding, when zExp < 0, when no increment is pending,
 * or when zSig0 is not all ones.  The 128-bit value is shifted right by
 * 1 - zExp with jamming, and `increment' is recomputed from the shifted
 * zSig1.
 */
768 1.1 bjh21 if ( zExp <= 0 ) {
769 1.1 bjh21 isTiny =
770 1.1 bjh21 ( float_detect_tininess == float_tininess_before_rounding )
771 1.1 bjh21 || ( zExp < 0 )
772 1.1 bjh21 || ! increment
773 1.1 bjh21 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
774 1.1 bjh21 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
775 1.1 bjh21 zExp = 0;
776 1.1 bjh21 if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
777 1.11.2.1 tls if ( zSig1 ) set_float_exception_inexact_flag();
778 1.1 bjh21 if ( roundNearestEven ) {
779 1.1 bjh21 increment = ( (sbits64) zSig1 < 0 );
780 1.1 bjh21 }
781 1.1 bjh21 else {
782 1.1 bjh21 if ( zSign ) {
783 1.1 bjh21 increment = ( roundingMode == float_round_down ) && zSig1;
784 1.1 bjh21 }
785 1.1 bjh21 else {
786 1.1 bjh21 increment = ( roundingMode == float_round_up ) && zSig1;
787 1.1 bjh21 }
788 1.1 bjh21 }
/*
 * After incrementing, bit 0 is cleared when zSig1<<1 is zero and the mode
 * is round-to-nearest-even (the exact-tie case).  A carry into bit 63
 * sets the exponent field to 1.
 */
789 1.1 bjh21 if ( increment ) {
790 1.1 bjh21 ++zSig0;
791 1.1 bjh21 zSig0 &=
792 1.1 bjh21 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
793 1.1 bjh21 if ( (sbits64) zSig0 < 0 ) zExp = 1;
794 1.1 bjh21 }
795 1.1 bjh21 return packFloatx80( zSign, zExp, zSig0 );
796 1.1 bjh21 }
797 1.1 bjh21 }
/*
 * Full-precision path, exponent in range.  A wrap of zSig0 to zero on
 * increment bumps the exponent and restores the top significand bit;
 * otherwise the tie-to-even bit clearing is applied.  Without an increment,
 * a zero significand forces the exponent to 0.
 */
798 1.11.2.1 tls if ( zSig1 ) set_float_exception_inexact_flag();
799 1.1 bjh21 if ( increment ) {
800 1.1 bjh21 ++zSig0;
801 1.1 bjh21 if ( zSig0 == 0 ) {
802 1.1 bjh21 ++zExp;
803 1.1 bjh21 zSig0 = LIT64( 0x8000000000000000 );
804 1.1 bjh21 }
805 1.1 bjh21 else {
806 1.1 bjh21 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
807 1.1 bjh21 }
808 1.1 bjh21 }
809 1.1 bjh21 else {
810 1.1 bjh21 if ( zSig0 == 0 ) zExp = 0;
811 1.1 bjh21 }
812 1.1 bjh21 return packFloatx80( zSign, zExp, zSig0 );
813 1.1 bjh21
814 1.1 bjh21 }
815 1.1 bjh21
816 1.1 bjh21 /*
817 1.1 bjh21 -------------------------------------------------------------------------------
818 1.1 bjh21 Takes an abstract floating-point value having sign `zSign', exponent
819 1.1 bjh21 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
820 1.1 bjh21 and returns the proper extended double-precision floating-point value
821 1.1 bjh21 corresponding to the abstract input. This routine is just like
822 1.1 bjh21 `roundAndPackFloatx80' except that the input significand does not have to be
823 1.1 bjh21 normalized.
824 1.1 bjh21 -------------------------------------------------------------------------------
825 1.1 bjh21 */
826 1.1 bjh21 static floatx80
827 1.1 bjh21 normalizeRoundAndPackFloatx80(
828 1.1 bjh21 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
829 1.1 bjh21 )
830 1.1 bjh21 {
831 1.1 bjh21 int8 shiftCount;
832 1.1 bjh21
833 1.1 bjh21 if ( zSig0 == 0 ) {
834 1.1 bjh21 zSig0 = zSig1;
835 1.1 bjh21 zSig1 = 0;
836 1.1 bjh21 zExp -= 64;
837 1.1 bjh21 }
838 1.1 bjh21 shiftCount = countLeadingZeros64( zSig0 );
839 1.1 bjh21 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
840 1.1 bjh21 zExp -= shiftCount;
841 1.1 bjh21 return
842 1.1 bjh21 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
843 1.1 bjh21
844 1.1 bjh21 }
845 1.1 bjh21
846 1.1 bjh21 #endif
847 1.1 bjh21
848 1.1 bjh21 #ifdef FLOAT128
849 1.1 bjh21
850 1.1 bjh21 /*
851 1.1 bjh21 -------------------------------------------------------------------------------
852 1.1 bjh21 Returns the least-significant 64 fraction bits of the quadruple-precision
853 1.1 bjh21 floating-point value `a'.
854 1.1 bjh21 -------------------------------------------------------------------------------
855 1.1 bjh21 */
856 1.1 bjh21 INLINE bits64 extractFloat128Frac1( float128 a )
857 1.1 bjh21 {
858 1.1 bjh21
859 1.1 bjh21 return a.low;
860 1.1 bjh21
861 1.1 bjh21 }
862 1.1 bjh21
863 1.1 bjh21 /*
864 1.1 bjh21 -------------------------------------------------------------------------------
865 1.1 bjh21 Returns the most-significant 48 fraction bits of the quadruple-precision
866 1.1 bjh21 floating-point value `a'.
867 1.1 bjh21 -------------------------------------------------------------------------------
868 1.1 bjh21 */
869 1.1 bjh21 INLINE bits64 extractFloat128Frac0( float128 a )
870 1.1 bjh21 {
871 1.1 bjh21
872 1.1 bjh21 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
873 1.1 bjh21
874 1.1 bjh21 }
875 1.1 bjh21
876 1.1 bjh21 /*
877 1.1 bjh21 -------------------------------------------------------------------------------
878 1.1 bjh21 Returns the exponent bits of the quadruple-precision floating-point value
879 1.1 bjh21 `a'.
880 1.1 bjh21 -------------------------------------------------------------------------------
881 1.1 bjh21 */
882 1.1 bjh21 INLINE int32 extractFloat128Exp( float128 a )
883 1.1 bjh21 {
884 1.1 bjh21
885 1.10 christos return (int32)((a.high >> 48) & 0x7FFF);
886 1.1 bjh21
887 1.1 bjh21 }
888 1.1 bjh21
889 1.1 bjh21 /*
890 1.1 bjh21 -------------------------------------------------------------------------------
891 1.1 bjh21 Returns the sign bit of the quadruple-precision floating-point value `a'.
892 1.1 bjh21 -------------------------------------------------------------------------------
893 1.1 bjh21 */
894 1.1 bjh21 INLINE flag extractFloat128Sign( float128 a )
895 1.1 bjh21 {
896 1.1 bjh21
897 1.10 christos return (flag)(a.high >> 63);
898 1.1 bjh21
899 1.1 bjh21 }
900 1.1 bjh21
901 1.1 bjh21 /*
902 1.1 bjh21 -------------------------------------------------------------------------------
903 1.1 bjh21 Normalizes the subnormal quadruple-precision floating-point value
904 1.1 bjh21 represented by the denormalized significand formed by the concatenation of
905 1.1 bjh21 `aSig0' and `aSig1'. The normalized exponent is stored at the location
906 1.1 bjh21 pointed to by `zExpPtr'. The most significant 49 bits of the normalized
907 1.1 bjh21 significand are stored at the location pointed to by `zSig0Ptr', and the
908 1.1 bjh21 least significant 64 bits of the normalized significand are stored at the
909 1.1 bjh21 location pointed to by `zSig1Ptr'.
910 1.1 bjh21 -------------------------------------------------------------------------------
911 1.1 bjh21 */
912 1.1 bjh21 static void
913 1.1 bjh21 normalizeFloat128Subnormal(
914 1.1 bjh21 bits64 aSig0,
915 1.1 bjh21 bits64 aSig1,
916 1.1 bjh21 int32 *zExpPtr,
917 1.1 bjh21 bits64 *zSig0Ptr,
918 1.1 bjh21 bits64 *zSig1Ptr
919 1.1 bjh21 )
920 1.1 bjh21 {
921 1.1 bjh21 int8 shiftCount;
922 1.1 bjh21
923 1.1 bjh21 if ( aSig0 == 0 ) {
924 1.1 bjh21 shiftCount = countLeadingZeros64( aSig1 ) - 15;
925 1.1 bjh21 if ( shiftCount < 0 ) {
926 1.1 bjh21 *zSig0Ptr = aSig1>>( - shiftCount );
927 1.1 bjh21 *zSig1Ptr = aSig1<<( shiftCount & 63 );
928 1.1 bjh21 }
929 1.1 bjh21 else {
930 1.1 bjh21 *zSig0Ptr = aSig1<<shiftCount;
931 1.1 bjh21 *zSig1Ptr = 0;
932 1.1 bjh21 }
933 1.1 bjh21 *zExpPtr = - shiftCount - 63;
934 1.1 bjh21 }
935 1.1 bjh21 else {
936 1.1 bjh21 shiftCount = countLeadingZeros64( aSig0 ) - 15;
937 1.1 bjh21 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
938 1.1 bjh21 *zExpPtr = 1 - shiftCount;
939 1.1 bjh21 }
940 1.1 bjh21
941 1.1 bjh21 }
942 1.1 bjh21
943 1.1 bjh21 /*
944 1.1 bjh21 -------------------------------------------------------------------------------
945 1.1 bjh21 Packs the sign `zSign', the exponent `zExp', and the significand formed
946 1.1 bjh21 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
947 1.1 bjh21 floating-point value, returning the result. After being shifted into the
948 1.1 bjh21 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
949 1.1 bjh21 added together to form the most significant 64 bits of the result. This
950 1.1 bjh21 means that any integer portion of `zSig0' will be added into the exponent.
951 1.1 bjh21 Since a properly normalized significand will have an integer portion equal
952 1.1 bjh21 to 1, the `zExp' input should be 1 less than the desired result exponent
953 1.1 bjh21 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
954 1.1 bjh21 significand.
955 1.1 bjh21 -------------------------------------------------------------------------------
956 1.1 bjh21 */
957 1.1 bjh21 INLINE float128
958 1.1 bjh21 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
959 1.1 bjh21 {
960 1.1 bjh21 float128 z;
961 1.1 bjh21
962 1.1 bjh21 z.low = zSig1;
963 1.1 bjh21 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
964 1.1 bjh21 return z;
965 1.1 bjh21
966 1.1 bjh21 }
967 1.1 bjh21
968 1.1 bjh21 /*
969 1.1 bjh21 -------------------------------------------------------------------------------
970 1.1 bjh21 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
971 1.1 bjh21 and extended significand formed by the concatenation of `zSig0', `zSig1',
972 1.1 bjh21 and `zSig2', and returns the proper quadruple-precision floating-point value
973 1.1 bjh21 corresponding to the abstract input. Ordinarily, the abstract value is
974 1.1 bjh21 simply rounded and packed into the quadruple-precision format, with the
975 1.1 bjh21 inexact exception raised if the abstract input cannot be represented
976 1.1 bjh21 exactly. However, if the abstract value is too large, the overflow and
977 1.1 bjh21 inexact exceptions are raised and an infinity or maximal finite value is
978 1.1 bjh21 returned. If the abstract value is too small, the input value is rounded to
979 1.1 bjh21 a subnormal number, and the underflow and inexact exceptions are raised if
980 1.1 bjh21 the abstract input cannot be represented exactly as a subnormal quadruple-
981 1.1 bjh21 precision floating-point number.
982 1.1 bjh21 The input significand must be normalized or smaller. If the input
983 1.1 bjh21 significand is not normalized, `zExp' must be 0; in that case, the result
984 1.1 bjh21 returned is a subnormal number, and it must not require rounding. In the
985 1.1 bjh21 usual case that the input significand is normalized, `zExp' must be 1 less
986 1.1 bjh21 than the ``true'' floating-point exponent. The handling of underflow and
987 1.1 bjh21 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
988 1.1 bjh21 -------------------------------------------------------------------------------
989 1.1 bjh21 */
990 1.1 bjh21 static float128
991 1.1 bjh21 roundAndPackFloat128(
992 1.1 bjh21 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
993 1.1 bjh21 {
994 1.1 bjh21 int8 roundingMode;
995 1.1 bjh21 flag roundNearestEven, increment, isTiny;
996 1.1 bjh21
997 1.1 bjh21 roundingMode = float_rounding_mode;
998 1.1 bjh21 roundNearestEven = ( roundingMode == float_round_nearest_even );
999 1.1 bjh21 increment = ( (sbits64) zSig2 < 0 );
1000 1.1 bjh21 if ( ! roundNearestEven ) {
1001 1.1 bjh21 if ( roundingMode == float_round_to_zero ) {
1002 1.1 bjh21 increment = 0;
1003 1.1 bjh21 }
1004 1.1 bjh21 else {
1005 1.1 bjh21 if ( zSign ) {
1006 1.1 bjh21 increment = ( roundingMode == float_round_down ) && zSig2;
1007 1.1 bjh21 }
1008 1.1 bjh21 else {
1009 1.1 bjh21 increment = ( roundingMode == float_round_up ) && zSig2;
1010 1.1 bjh21 }
1011 1.1 bjh21 }
1012 1.1 bjh21 }
1013 1.1 bjh21 if ( 0x7FFD <= (bits32) zExp ) {
1014 1.1 bjh21 if ( ( 0x7FFD < zExp )
1015 1.1 bjh21 || ( ( zExp == 0x7FFD )
1016 1.1 bjh21 && eq128(
1017 1.1 bjh21 LIT64( 0x0001FFFFFFFFFFFF ),
1018 1.1 bjh21 LIT64( 0xFFFFFFFFFFFFFFFF ),
1019 1.1 bjh21 zSig0,
1020 1.1 bjh21 zSig1
1021 1.1 bjh21 )
1022 1.1 bjh21 && increment
1023 1.1 bjh21 )
1024 1.1 bjh21 ) {
1025 1.1 bjh21 float_raise( float_flag_overflow | float_flag_inexact );
1026 1.1 bjh21 if ( ( roundingMode == float_round_to_zero )
1027 1.1 bjh21 || ( zSign && ( roundingMode == float_round_up ) )
1028 1.1 bjh21 || ( ! zSign && ( roundingMode == float_round_down ) )
1029 1.1 bjh21 ) {
1030 1.1 bjh21 return
1031 1.1 bjh21 packFloat128(
1032 1.1 bjh21 zSign,
1033 1.1 bjh21 0x7FFE,
1034 1.1 bjh21 LIT64( 0x0000FFFFFFFFFFFF ),
1035 1.1 bjh21 LIT64( 0xFFFFFFFFFFFFFFFF )
1036 1.1 bjh21 );
1037 1.1 bjh21 }
1038 1.1 bjh21 return packFloat128( zSign, 0x7FFF, 0, 0 );
1039 1.1 bjh21 }
1040 1.1 bjh21 if ( zExp < 0 ) {
1041 1.1 bjh21 isTiny =
1042 1.1 bjh21 ( float_detect_tininess == float_tininess_before_rounding )
1043 1.1 bjh21 || ( zExp < -1 )
1044 1.1 bjh21 || ! increment
1045 1.1 bjh21 || lt128(
1046 1.1 bjh21 zSig0,
1047 1.1 bjh21 zSig1,
1048 1.1 bjh21 LIT64( 0x0001FFFFFFFFFFFF ),
1049 1.1 bjh21 LIT64( 0xFFFFFFFFFFFFFFFF )
1050 1.1 bjh21 );
1051 1.1 bjh21 shift128ExtraRightJamming(
1052 1.1 bjh21 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1053 1.1 bjh21 zExp = 0;
1054 1.1 bjh21 if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1055 1.1 bjh21 if ( roundNearestEven ) {
1056 1.1 bjh21 increment = ( (sbits64) zSig2 < 0 );
1057 1.1 bjh21 }
1058 1.1 bjh21 else {
1059 1.1 bjh21 if ( zSign ) {
1060 1.1 bjh21 increment = ( roundingMode == float_round_down ) && zSig2;
1061 1.1 bjh21 }
1062 1.1 bjh21 else {
1063 1.1 bjh21 increment = ( roundingMode == float_round_up ) && zSig2;
1064 1.1 bjh21 }
1065 1.1 bjh21 }
1066 1.1 bjh21 }
1067 1.1 bjh21 }
1068 1.11.2.1 tls if ( zSig2 ) set_float_exception_inexact_flag();
1069 1.1 bjh21 if ( increment ) {
1070 1.1 bjh21 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1071 1.1 bjh21 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1072 1.1 bjh21 }
1073 1.1 bjh21 else {
1074 1.1 bjh21 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1075 1.1 bjh21 }
1076 1.1 bjh21 return packFloat128( zSign, zExp, zSig0, zSig1 );
1077 1.1 bjh21
1078 1.1 bjh21 }
1079 1.1 bjh21
1080 1.1 bjh21 /*
1081 1.1 bjh21 -------------------------------------------------------------------------------
1082 1.1 bjh21 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1083 1.1 bjh21 and significand formed by the concatenation of `zSig0' and `zSig1', and
1084 1.1 bjh21 returns the proper quadruple-precision floating-point value corresponding
1085 1.1 bjh21 to the abstract input. This routine is just like `roundAndPackFloat128'
1086 1.1 bjh21 except that the input significand has fewer bits and does not have to be
1087 1.1 bjh21 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1088 1.1 bjh21 point exponent.
1089 1.1 bjh21 -------------------------------------------------------------------------------
1090 1.1 bjh21 */
1091 1.1 bjh21 static float128
1092 1.1 bjh21 normalizeRoundAndPackFloat128(
1093 1.1 bjh21 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1094 1.1 bjh21 {
1095 1.1 bjh21 int8 shiftCount;
1096 1.1 bjh21 bits64 zSig2;
1097 1.1 bjh21
1098 1.1 bjh21 if ( zSig0 == 0 ) {
1099 1.1 bjh21 zSig0 = zSig1;
1100 1.1 bjh21 zSig1 = 0;
1101 1.1 bjh21 zExp -= 64;
1102 1.1 bjh21 }
1103 1.1 bjh21 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1104 1.1 bjh21 if ( 0 <= shiftCount ) {
1105 1.1 bjh21 zSig2 = 0;
1106 1.1 bjh21 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1107 1.1 bjh21 }
1108 1.1 bjh21 else {
1109 1.1 bjh21 shift128ExtraRightJamming(
1110 1.1 bjh21 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1111 1.1 bjh21 }
1112 1.1 bjh21 zExp -= shiftCount;
1113 1.1 bjh21 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1114 1.1 bjh21
1115 1.1 bjh21 }
1116 1.1 bjh21
1117 1.1 bjh21 #endif
1118 1.1 bjh21
1119 1.1 bjh21 /*
1120 1.1 bjh21 -------------------------------------------------------------------------------
1121 1.1 bjh21 Returns the result of converting the 32-bit two's complement integer `a'
1122 1.1 bjh21 to the single-precision floating-point format. The conversion is performed
1123 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1124 1.1 bjh21 -------------------------------------------------------------------------------
1125 1.1 bjh21 */
1126 1.1 bjh21 float32 int32_to_float32( int32 a )
1127 1.1 bjh21 {
1128 1.1 bjh21 flag zSign;
1129 1.1 bjh21
1130 1.1 bjh21 if ( a == 0 ) return 0;
1131 1.1 bjh21 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1132 1.1 bjh21 zSign = ( a < 0 );
1133 1.10 christos return normalizeRoundAndPackFloat32(zSign, 0x9C, (uint32)(zSign ? - a : a));
1134 1.1 bjh21
1135 1.1 bjh21 }
1136 1.1 bjh21
1137 1.6 matt float32 uint32_to_float32( uint32 a )
1138 1.6 matt {
1139 1.6 matt if ( a == 0 ) return 0;
1140 1.6 matt if ( a & (bits32) 0x80000000 )
1141 1.6 matt return normalizeRoundAndPackFloat32( 0, 0x9D, a >> 1 );
1142 1.6 matt return normalizeRoundAndPackFloat32( 0, 0x9C, a );
1143 1.6 matt }
1144 1.6 matt
1145 1.6 matt
1146 1.1 bjh21 /*
1147 1.1 bjh21 -------------------------------------------------------------------------------
1148 1.1 bjh21 Returns the result of converting the 32-bit two's complement integer `a'
1149 1.1 bjh21 to the double-precision floating-point format. The conversion is performed
1150 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1151 1.1 bjh21 -------------------------------------------------------------------------------
1152 1.1 bjh21 */
1153 1.1 bjh21 float64 int32_to_float64( int32 a )
1154 1.1 bjh21 {
1155 1.1 bjh21 flag zSign;
1156 1.1 bjh21 uint32 absA;
1157 1.1 bjh21 int8 shiftCount;
1158 1.1 bjh21 bits64 zSig;
1159 1.1 bjh21
1160 1.1 bjh21 if ( a == 0 ) return 0;
1161 1.1 bjh21 zSign = ( a < 0 );
1162 1.1 bjh21 absA = zSign ? - a : a;
1163 1.1 bjh21 shiftCount = countLeadingZeros32( absA ) + 21;
1164 1.1 bjh21 zSig = absA;
1165 1.1 bjh21 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1166 1.1 bjh21
1167 1.1 bjh21 }
1168 1.1 bjh21
1169 1.6 matt float64 uint32_to_float64( uint32 a )
1170 1.6 matt {
1171 1.6 matt int8 shiftCount;
1172 1.6 matt bits64 zSig = a;
1173 1.6 matt
1174 1.6 matt if ( a == 0 ) return 0;
1175 1.6 matt shiftCount = countLeadingZeros32( a ) + 21;
1176 1.6 matt return packFloat64( 0, 0x432 - shiftCount, zSig<<shiftCount );
1177 1.6 matt
1178 1.6 matt }
1179 1.6 matt
1180 1.1 bjh21 #ifdef FLOATX80
1181 1.1 bjh21
1182 1.1 bjh21 /*
1183 1.1 bjh21 -------------------------------------------------------------------------------
1184 1.1 bjh21 Returns the result of converting the 32-bit two's complement integer `a'
1185 1.1 bjh21 to the extended double-precision floating-point format. The conversion
1186 1.1 bjh21 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1187 1.1 bjh21 Arithmetic.
1188 1.1 bjh21 -------------------------------------------------------------------------------
1189 1.1 bjh21 */
1190 1.1 bjh21 floatx80 int32_to_floatx80( int32 a )
1191 1.1 bjh21 {
1192 1.1 bjh21 flag zSign;
1193 1.1 bjh21 uint32 absA;
1194 1.1 bjh21 int8 shiftCount;
1195 1.1 bjh21 bits64 zSig;
1196 1.1 bjh21
1197 1.1 bjh21 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1198 1.1 bjh21 zSign = ( a < 0 );
1199 1.1 bjh21 absA = zSign ? - a : a;
1200 1.1 bjh21 shiftCount = countLeadingZeros32( absA ) + 32;
1201 1.1 bjh21 zSig = absA;
1202 1.1 bjh21 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1203 1.1 bjh21
1204 1.1 bjh21 }
1205 1.1 bjh21
1206 1.6 matt floatx80 uint32_to_floatx80( uint32 a )
1207 1.6 matt {
1208 1.6 matt int8 shiftCount;
1209 1.6 matt bits64 zSig = a;
1210 1.6 matt
1211 1.6 matt if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1212 1.6 matt shiftCount = countLeadingZeros32( a ) + 32;
1213 1.6 matt return packFloatx80( 0, 0x403E - shiftCount, zSig<<shiftCount );
1214 1.6 matt
1215 1.6 matt }
1216 1.6 matt
1217 1.1 bjh21 #endif
1218 1.1 bjh21
1219 1.1 bjh21 #ifdef FLOAT128
1220 1.1 bjh21
1221 1.1 bjh21 /*
1222 1.1 bjh21 -------------------------------------------------------------------------------
1223 1.1 bjh21 Returns the result of converting the 32-bit two's complement integer `a' to
1224 1.1 bjh21 the quadruple-precision floating-point format. The conversion is performed
1225 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1226 1.1 bjh21 -------------------------------------------------------------------------------
1227 1.1 bjh21 */
1228 1.1 bjh21 float128 int32_to_float128( int32 a )
1229 1.1 bjh21 {
1230 1.1 bjh21 flag zSign;
1231 1.1 bjh21 uint32 absA;
1232 1.1 bjh21 int8 shiftCount;
1233 1.1 bjh21 bits64 zSig0;
1234 1.1 bjh21
1235 1.1 bjh21 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1236 1.1 bjh21 zSign = ( a < 0 );
1237 1.1 bjh21 absA = zSign ? - a : a;
1238 1.1 bjh21 shiftCount = countLeadingZeros32( absA ) + 17;
1239 1.1 bjh21 zSig0 = absA;
1240 1.1 bjh21 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1241 1.1 bjh21
1242 1.1 bjh21 }
1243 1.1 bjh21
1244 1.6 matt float128 uint32_to_float128( uint32 a )
1245 1.6 matt {
1246 1.6 matt int8 shiftCount;
1247 1.6 matt bits64 zSig0 = a;
1248 1.6 matt
1249 1.6 matt if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1250 1.6 matt shiftCount = countLeadingZeros32( a ) + 17;
1251 1.6 matt return packFloat128( 0, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1252 1.6 matt
1253 1.6 matt }
1254 1.6 matt
1255 1.1 bjh21 #endif
1256 1.1 bjh21
1257 1.1 bjh21 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1258 1.1 bjh21 /*
1259 1.1 bjh21 -------------------------------------------------------------------------------
1260 1.1 bjh21 Returns the result of converting the 64-bit two's complement integer `a'
1261 1.1 bjh21 to the single-precision floating-point format. The conversion is performed
1262 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1263 1.1 bjh21 -------------------------------------------------------------------------------
1264 1.1 bjh21 */
1265 1.1 bjh21 float32 int64_to_float32( int64 a )
1266 1.1 bjh21 {
1267 1.1 bjh21 flag zSign;
1268 1.1 bjh21 uint64 absA;
1269 1.1 bjh21 int8 shiftCount;
1270 1.1 bjh21
1271 1.1 bjh21 if ( a == 0 ) return 0;
1272 1.1 bjh21 zSign = ( a < 0 );
1273 1.1 bjh21 absA = zSign ? - a : a;
1274 1.1 bjh21 shiftCount = countLeadingZeros64( absA ) - 40;
1275 1.1 bjh21 if ( 0 <= shiftCount ) {
1276 1.1 bjh21 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1277 1.1 bjh21 }
1278 1.1 bjh21 else {
1279 1.1 bjh21 shiftCount += 7;
1280 1.1 bjh21 if ( shiftCount < 0 ) {
1281 1.1 bjh21 shift64RightJamming( absA, - shiftCount, &absA );
1282 1.1 bjh21 }
1283 1.1 bjh21 else {
1284 1.1 bjh21 absA <<= shiftCount;
1285 1.1 bjh21 }
1286 1.1 bjh21 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1287 1.1 bjh21 }
1288 1.1 bjh21
1289 1.1 bjh21 }
1290 1.1 bjh21
1291 1.1 bjh21 /*
1292 1.1 bjh21 -------------------------------------------------------------------------------
1293 1.1 bjh21 Returns the result of converting the 64-bit two's complement integer `a'
1294 1.1 bjh21 to the double-precision floating-point format. The conversion is performed
1295 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1296 1.1 bjh21 -------------------------------------------------------------------------------
1297 1.1 bjh21 */
1298 1.1 bjh21 float64 int64_to_float64( int64 a )
1299 1.1 bjh21 {
1300 1.1 bjh21 flag zSign;
1301 1.1 bjh21
1302 1.1 bjh21 if ( a == 0 ) return 0;
1303 1.1 bjh21 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1304 1.1 bjh21 return packFloat64( 1, 0x43E, 0 );
1305 1.1 bjh21 }
1306 1.1 bjh21 zSign = ( a < 0 );
1307 1.1 bjh21 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1308 1.1 bjh21
1309 1.1 bjh21 }
1310 1.1 bjh21
1311 1.1 bjh21 #ifdef FLOATX80
1312 1.1 bjh21
1313 1.1 bjh21 /*
1314 1.1 bjh21 -------------------------------------------------------------------------------
1315 1.1 bjh21 Returns the result of converting the 64-bit two's complement integer `a'
1316 1.1 bjh21 to the extended double-precision floating-point format. The conversion
1317 1.1 bjh21 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1318 1.1 bjh21 Arithmetic.
1319 1.1 bjh21 -------------------------------------------------------------------------------
1320 1.1 bjh21 */
1321 1.1 bjh21 floatx80 int64_to_floatx80( int64 a )
1322 1.1 bjh21 {
1323 1.1 bjh21 flag zSign;
1324 1.1 bjh21 uint64 absA;
1325 1.1 bjh21 int8 shiftCount;
1326 1.1 bjh21
1327 1.1 bjh21 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1328 1.1 bjh21 zSign = ( a < 0 );
1329 1.1 bjh21 absA = zSign ? - a : a;
1330 1.1 bjh21 shiftCount = countLeadingZeros64( absA );
1331 1.1 bjh21 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1332 1.1 bjh21
1333 1.1 bjh21 }
1334 1.1 bjh21
1335 1.1 bjh21 #endif
1336 1.1 bjh21
1337 1.1 bjh21 #endif /* !SOFTFLOAT_FOR_GCC */
1338 1.1 bjh21
1339 1.1 bjh21 #ifdef FLOAT128
1340 1.1 bjh21
1341 1.1 bjh21 /*
1342 1.1 bjh21 -------------------------------------------------------------------------------
1343 1.1 bjh21 Returns the result of converting the 64-bit two's complement integer `a' to
1344 1.1 bjh21 the quadruple-precision floating-point format. The conversion is performed
1345 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1346 1.1 bjh21 -------------------------------------------------------------------------------
1347 1.1 bjh21 */
1348 1.1 bjh21 float128 int64_to_float128( int64 a )
1349 1.1 bjh21 {
1350 1.1 bjh21 flag zSign;
1351 1.1 bjh21 uint64 absA;
1352 1.1 bjh21 int8 shiftCount;
1353 1.1 bjh21 int32 zExp;
1354 1.1 bjh21 bits64 zSig0, zSig1;
1355 1.1 bjh21
1356 1.1 bjh21 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1357 1.1 bjh21 zSign = ( a < 0 );
1358 1.1 bjh21 absA = zSign ? - a : a;
1359 1.1 bjh21 shiftCount = countLeadingZeros64( absA ) + 49;
1360 1.1 bjh21 zExp = 0x406E - shiftCount;
1361 1.1 bjh21 if ( 64 <= shiftCount ) {
1362 1.1 bjh21 zSig1 = 0;
1363 1.1 bjh21 zSig0 = absA;
1364 1.1 bjh21 shiftCount -= 64;
1365 1.1 bjh21 }
1366 1.1 bjh21 else {
1367 1.1 bjh21 zSig1 = absA;
1368 1.1 bjh21 zSig0 = 0;
1369 1.1 bjh21 }
1370 1.1 bjh21 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1371 1.1 bjh21 return packFloat128( zSign, zExp, zSig0, zSig1 );
1372 1.1 bjh21
1373 1.1 bjh21 }
1374 1.1 bjh21
1375 1.1 bjh21 #endif
1376 1.1 bjh21
1377 1.1 bjh21 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1378 1.1 bjh21 /*
1379 1.1 bjh21 -------------------------------------------------------------------------------
1380 1.1 bjh21 Returns the result of converting the single-precision floating-point value
1381 1.1 bjh21 `a' to the 32-bit two's complement integer format. The conversion is
1382 1.1 bjh21 performed according to the IEC/IEEE Standard for Binary Floating-Point
1383 1.1 bjh21 Arithmetic---which means in particular that the conversion is rounded
1384 1.1 bjh21 according to the current rounding mode. If `a' is a NaN, the largest
1385 1.1 bjh21 positive integer is returned. Otherwise, if the conversion overflows, the
1386 1.1 bjh21 largest integer with the same sign as `a' is returned.
1387 1.1 bjh21 -------------------------------------------------------------------------------
1388 1.1 bjh21 */
1389 1.1 bjh21 int32 float32_to_int32( float32 a )
1390 1.1 bjh21 {
1391 1.1 bjh21 flag aSign;
1392 1.1 bjh21 int16 aExp, shiftCount;
1393 1.1 bjh21 bits32 aSig;
1394 1.1 bjh21 bits64 aSig64;
1395 1.1 bjh21
1396 1.1 bjh21 aSig = extractFloat32Frac( a );
1397 1.1 bjh21 aExp = extractFloat32Exp( a );
1398 1.1 bjh21 aSign = extractFloat32Sign( a );
1399 1.1 bjh21 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1400 1.1 bjh21 if ( aExp ) aSig |= 0x00800000;
1401 1.1 bjh21 shiftCount = 0xAF - aExp;
1402 1.1 bjh21 aSig64 = aSig;
1403 1.1 bjh21 aSig64 <<= 32;
1404 1.1 bjh21 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1405 1.1 bjh21 return roundAndPackInt32( aSign, aSig64 );
1406 1.1 bjh21
1407 1.1 bjh21 }
1408 1.1 bjh21 #endif /* !SOFTFLOAT_FOR_GCC */
1409 1.1 bjh21
1410 1.1 bjh21 /*
1411 1.1 bjh21 -------------------------------------------------------------------------------
1412 1.1 bjh21 Returns the result of converting the single-precision floating-point value
1413 1.1 bjh21 `a' to the 32-bit two's complement integer format. The conversion is
1414 1.1 bjh21 performed according to the IEC/IEEE Standard for Binary Floating-Point
1415 1.1 bjh21 Arithmetic, except that the conversion is always rounded toward zero.
1416 1.1 bjh21 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1417 1.1 bjh21 the conversion overflows, the largest integer with the same sign as `a' is
1418 1.1 bjh21 returned.
1419 1.1 bjh21 -------------------------------------------------------------------------------
1420 1.1 bjh21 */
1421 1.1 bjh21 int32 float32_to_int32_round_to_zero( float32 a )
1422 1.1 bjh21 {
1423 1.1 bjh21 flag aSign;
1424 1.1 bjh21 int16 aExp, shiftCount;
1425 1.1 bjh21 bits32 aSig;
1426 1.1 bjh21 int32 z;
1427 1.1 bjh21
1428 1.1 bjh21 aSig = extractFloat32Frac( a );
1429 1.1 bjh21 aExp = extractFloat32Exp( a );
1430 1.1 bjh21 aSign = extractFloat32Sign( a );
1431 1.1 bjh21 shiftCount = aExp - 0x9E;
1432 1.1 bjh21 if ( 0 <= shiftCount ) {
1433 1.1 bjh21 if ( a != 0xCF000000 ) {
1434 1.1 bjh21 float_raise( float_flag_invalid );
1435 1.1 bjh21 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1436 1.1 bjh21 }
1437 1.1 bjh21 return (sbits32) 0x80000000;
1438 1.1 bjh21 }
1439 1.1 bjh21 else if ( aExp <= 0x7E ) {
1440 1.11.2.1 tls if ( aExp | aSig ) set_float_exception_inexact_flag();
1441 1.1 bjh21 return 0;
1442 1.1 bjh21 }
1443 1.1 bjh21 aSig = ( aSig | 0x00800000 )<<8;
1444 1.1 bjh21 z = aSig>>( - shiftCount );
1445 1.1 bjh21 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1446 1.11.2.1 tls set_float_exception_inexact_flag();
1447 1.1 bjh21 }
1448 1.1 bjh21 if ( aSign ) z = - z;
1449 1.1 bjh21 return z;
1450 1.1 bjh21
1451 1.1 bjh21 }
1452 1.1 bjh21
1453 1.1 bjh21 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1454 1.1 bjh21 /*
1455 1.1 bjh21 -------------------------------------------------------------------------------
1456 1.1 bjh21 Returns the result of converting the single-precision floating-point value
1457 1.1 bjh21 `a' to the 64-bit two's complement integer format. The conversion is
1458 1.1 bjh21 performed according to the IEC/IEEE Standard for Binary Floating-Point
1459 1.1 bjh21 Arithmetic---which means in particular that the conversion is rounded
1460 1.1 bjh21 according to the current rounding mode. If `a' is a NaN, the largest
1461 1.1 bjh21 positive integer is returned. Otherwise, if the conversion overflows, the
1462 1.1 bjh21 largest integer with the same sign as `a' is returned.
1463 1.1 bjh21 -------------------------------------------------------------------------------
1464 1.1 bjh21 */
1465 1.1 bjh21 int64 float32_to_int64( float32 a )
1466 1.1 bjh21 {
1467 1.1 bjh21 flag aSign;
1468 1.1 bjh21 int16 aExp, shiftCount;
1469 1.1 bjh21 bits32 aSig;
1470 1.1 bjh21 bits64 aSig64, aSigExtra;
1471 1.1 bjh21
1472 1.1 bjh21 aSig = extractFloat32Frac( a );
1473 1.1 bjh21 aExp = extractFloat32Exp( a );
1474 1.1 bjh21 aSign = extractFloat32Sign( a );
1475 1.1 bjh21 shiftCount = 0xBE - aExp;
1476 1.1 bjh21 if ( shiftCount < 0 ) {
1477 1.1 bjh21 float_raise( float_flag_invalid );
1478 1.1 bjh21 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1479 1.1 bjh21 return LIT64( 0x7FFFFFFFFFFFFFFF );
1480 1.1 bjh21 }
1481 1.1 bjh21 return (sbits64) LIT64( 0x8000000000000000 );
1482 1.1 bjh21 }
1483 1.1 bjh21 if ( aExp ) aSig |= 0x00800000;
1484 1.1 bjh21 aSig64 = aSig;
1485 1.1 bjh21 aSig64 <<= 40;
1486 1.1 bjh21 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1487 1.1 bjh21 return roundAndPackInt64( aSign, aSig64, aSigExtra );
1488 1.1 bjh21
1489 1.1 bjh21 }
1490 1.1 bjh21
1491 1.1 bjh21 /*
1492 1.1 bjh21 -------------------------------------------------------------------------------
1493 1.1 bjh21 Returns the result of converting the single-precision floating-point value
1494 1.1 bjh21 `a' to the 64-bit two's complement integer format. The conversion is
1495 1.1 bjh21 performed according to the IEC/IEEE Standard for Binary Floating-Point
1496 1.1 bjh21 Arithmetic, except that the conversion is always rounded toward zero. If
1497 1.1 bjh21 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1498 1.1 bjh21 conversion overflows, the largest integer with the same sign as `a' is
1499 1.1 bjh21 returned.
1500 1.1 bjh21 -------------------------------------------------------------------------------
1501 1.1 bjh21 */
1502 1.1 bjh21 int64 float32_to_int64_round_to_zero( float32 a )
1503 1.1 bjh21 {
1504 1.1 bjh21 flag aSign;
1505 1.1 bjh21 int16 aExp, shiftCount;
1506 1.1 bjh21 bits32 aSig;
1507 1.1 bjh21 bits64 aSig64;
1508 1.1 bjh21 int64 z;
1509 1.1 bjh21
1510 1.1 bjh21 aSig = extractFloat32Frac( a );
1511 1.1 bjh21 aExp = extractFloat32Exp( a );
1512 1.1 bjh21 aSign = extractFloat32Sign( a );
1513 1.1 bjh21 shiftCount = aExp - 0xBE;
1514 1.1 bjh21 if ( 0 <= shiftCount ) {
1515 1.1 bjh21 if ( a != 0xDF000000 ) {
1516 1.1 bjh21 float_raise( float_flag_invalid );
1517 1.1 bjh21 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1518 1.1 bjh21 return LIT64( 0x7FFFFFFFFFFFFFFF );
1519 1.1 bjh21 }
1520 1.1 bjh21 }
1521 1.1 bjh21 return (sbits64) LIT64( 0x8000000000000000 );
1522 1.1 bjh21 }
1523 1.1 bjh21 else if ( aExp <= 0x7E ) {
1524 1.11.2.1 tls if ( aExp | aSig ) set_float_exception_inexact_flag();
1525 1.1 bjh21 return 0;
1526 1.1 bjh21 }
1527 1.1 bjh21 aSig64 = aSig | 0x00800000;
1528 1.1 bjh21 aSig64 <<= 40;
1529 1.1 bjh21 z = aSig64>>( - shiftCount );
1530 1.1 bjh21 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1531 1.11.2.1 tls set_float_exception_inexact_flag();
1532 1.1 bjh21 }
1533 1.1 bjh21 if ( aSign ) z = - z;
1534 1.1 bjh21 return z;
1535 1.1 bjh21
1536 1.1 bjh21 }
1537 1.1 bjh21 #endif /* !SOFTFLOAT_FOR_GCC */
1538 1.1 bjh21
1539 1.1 bjh21 /*
1540 1.1 bjh21 -------------------------------------------------------------------------------
1541 1.1 bjh21 Returns the result of converting the single-precision floating-point value
1542 1.1 bjh21 `a' to the double-precision floating-point format. The conversion is
1543 1.1 bjh21 performed according to the IEC/IEEE Standard for Binary Floating-Point
1544 1.1 bjh21 Arithmetic.
1545 1.1 bjh21 -------------------------------------------------------------------------------
1546 1.1 bjh21 */
1547 1.1 bjh21 float64 float32_to_float64( float32 a )
1548 1.1 bjh21 {
1549 1.1 bjh21 flag aSign;
1550 1.1 bjh21 int16 aExp;
1551 1.1 bjh21 bits32 aSig;
1552 1.1 bjh21
1553 1.1 bjh21 aSig = extractFloat32Frac( a );
1554 1.1 bjh21 aExp = extractFloat32Exp( a );
1555 1.1 bjh21 aSign = extractFloat32Sign( a );
1556 1.1 bjh21 if ( aExp == 0xFF ) {
1557 1.1 bjh21 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1558 1.1 bjh21 return packFloat64( aSign, 0x7FF, 0 );
1559 1.1 bjh21 }
1560 1.1 bjh21 if ( aExp == 0 ) {
1561 1.1 bjh21 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1562 1.1 bjh21 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1563 1.1 bjh21 --aExp;
1564 1.1 bjh21 }
1565 1.1 bjh21 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1566 1.1 bjh21
1567 1.1 bjh21 }
1568 1.1 bjh21
1569 1.1 bjh21 #ifdef FLOATX80
1570 1.1 bjh21
1571 1.1 bjh21 /*
1572 1.1 bjh21 -------------------------------------------------------------------------------
1573 1.1 bjh21 Returns the result of converting the single-precision floating-point value
1574 1.1 bjh21 `a' to the extended double-precision floating-point format. The conversion
1575 1.1 bjh21 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1576 1.1 bjh21 Arithmetic.
1577 1.1 bjh21 -------------------------------------------------------------------------------
1578 1.1 bjh21 */
1579 1.1 bjh21 floatx80 float32_to_floatx80( float32 a )
1580 1.1 bjh21 {
1581 1.1 bjh21 flag aSign;
1582 1.1 bjh21 int16 aExp;
1583 1.1 bjh21 bits32 aSig;
1584 1.1 bjh21
1585 1.1 bjh21 aSig = extractFloat32Frac( a );
1586 1.1 bjh21 aExp = extractFloat32Exp( a );
1587 1.1 bjh21 aSign = extractFloat32Sign( a );
1588 1.1 bjh21 if ( aExp == 0xFF ) {
1589 1.1 bjh21 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1590 1.1 bjh21 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1591 1.1 bjh21 }
1592 1.1 bjh21 if ( aExp == 0 ) {
1593 1.1 bjh21 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1594 1.1 bjh21 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1595 1.1 bjh21 }
1596 1.1 bjh21 aSig |= 0x00800000;
1597 1.1 bjh21 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1598 1.1 bjh21
1599 1.1 bjh21 }
1600 1.1 bjh21
1601 1.1 bjh21 #endif
1602 1.1 bjh21
1603 1.1 bjh21 #ifdef FLOAT128
1604 1.1 bjh21
1605 1.1 bjh21 /*
1606 1.1 bjh21 -------------------------------------------------------------------------------
1607 1.1 bjh21 Returns the result of converting the single-precision floating-point value
1608 1.1 bjh21 `a' to the double-precision floating-point format. The conversion is
1609 1.1 bjh21 performed according to the IEC/IEEE Standard for Binary Floating-Point
1610 1.1 bjh21 Arithmetic.
1611 1.1 bjh21 -------------------------------------------------------------------------------
1612 1.1 bjh21 */
1613 1.1 bjh21 float128 float32_to_float128( float32 a )
1614 1.1 bjh21 {
1615 1.1 bjh21 flag aSign;
1616 1.1 bjh21 int16 aExp;
1617 1.1 bjh21 bits32 aSig;
1618 1.1 bjh21
1619 1.1 bjh21 aSig = extractFloat32Frac( a );
1620 1.1 bjh21 aExp = extractFloat32Exp( a );
1621 1.1 bjh21 aSign = extractFloat32Sign( a );
1622 1.1 bjh21 if ( aExp == 0xFF ) {
1623 1.1 bjh21 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1624 1.1 bjh21 return packFloat128( aSign, 0x7FFF, 0, 0 );
1625 1.1 bjh21 }
1626 1.1 bjh21 if ( aExp == 0 ) {
1627 1.1 bjh21 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1628 1.1 bjh21 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1629 1.1 bjh21 --aExp;
1630 1.1 bjh21 }
1631 1.1 bjh21 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1632 1.1 bjh21
1633 1.1 bjh21 }
1634 1.1 bjh21
1635 1.1 bjh21 #endif
1636 1.1 bjh21
1637 1.1 bjh21 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1638 1.1 bjh21 /*
1639 1.1 bjh21 -------------------------------------------------------------------------------
1640 1.1 bjh21 Rounds the single-precision floating-point value `a' to an integer, and
1641 1.1 bjh21 returns the result as a single-precision floating-point value. The
1642 1.1 bjh21 operation is performed according to the IEC/IEEE Standard for Binary
1643 1.1 bjh21 Floating-Point Arithmetic.
1644 1.1 bjh21 -------------------------------------------------------------------------------
1645 1.1 bjh21 */
1646 1.1 bjh21 float32 float32_round_to_int( float32 a )
1647 1.1 bjh21 {
1648 1.1 bjh21 flag aSign;
1649 1.1 bjh21 int16 aExp;
1650 1.1 bjh21 bits32 lastBitMask, roundBitsMask;
1651 1.1 bjh21 int8 roundingMode;
1652 1.1 bjh21 float32 z;
1653 1.1 bjh21
1654 1.1 bjh21 aExp = extractFloat32Exp( a );
1655 1.1 bjh21 if ( 0x96 <= aExp ) {
1656 1.1 bjh21 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1657 1.1 bjh21 return propagateFloat32NaN( a, a );
1658 1.1 bjh21 }
1659 1.1 bjh21 return a;
1660 1.1 bjh21 }
1661 1.1 bjh21 if ( aExp <= 0x7E ) {
1662 1.1 bjh21 if ( (bits32) ( a<<1 ) == 0 ) return a;
1663 1.11.2.1 tls set_float_exception_inexact_flag();
1664 1.1 bjh21 aSign = extractFloat32Sign( a );
1665 1.1 bjh21 switch ( float_rounding_mode ) {
1666 1.1 bjh21 case float_round_nearest_even:
1667 1.1 bjh21 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1668 1.1 bjh21 return packFloat32( aSign, 0x7F, 0 );
1669 1.1 bjh21 }
1670 1.1 bjh21 break;
1671 1.1 bjh21 case float_round_to_zero:
1672 1.1 bjh21 break;
1673 1.1 bjh21 case float_round_down:
1674 1.1 bjh21 return aSign ? 0xBF800000 : 0;
1675 1.1 bjh21 case float_round_up:
1676 1.1 bjh21 return aSign ? 0x80000000 : 0x3F800000;
1677 1.1 bjh21 }
1678 1.1 bjh21 return packFloat32( aSign, 0, 0 );
1679 1.1 bjh21 }
1680 1.1 bjh21 lastBitMask = 1;
1681 1.1 bjh21 lastBitMask <<= 0x96 - aExp;
1682 1.1 bjh21 roundBitsMask = lastBitMask - 1;
1683 1.1 bjh21 z = a;
1684 1.1 bjh21 roundingMode = float_rounding_mode;
1685 1.1 bjh21 if ( roundingMode == float_round_nearest_even ) {
1686 1.1 bjh21 z += lastBitMask>>1;
1687 1.1 bjh21 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1688 1.1 bjh21 }
1689 1.1 bjh21 else if ( roundingMode != float_round_to_zero ) {
1690 1.1 bjh21 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1691 1.1 bjh21 z += roundBitsMask;
1692 1.1 bjh21 }
1693 1.1 bjh21 }
1694 1.1 bjh21 z &= ~ roundBitsMask;
1695 1.11.2.1 tls if ( z != a ) set_float_exception_inexact_flag();
1696 1.1 bjh21 return z;
1697 1.1 bjh21
1698 1.1 bjh21 }
1699 1.1 bjh21 #endif /* !SOFTFLOAT_FOR_GCC */
1700 1.1 bjh21
1701 1.1 bjh21 /*
1702 1.1 bjh21 -------------------------------------------------------------------------------
1703 1.1 bjh21 Returns the result of adding the absolute values of the single-precision
1704 1.1 bjh21 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1705 1.1 bjh21 before being returned. `zSign' is ignored if the result is a NaN.
1706 1.1 bjh21 The addition is performed according to the IEC/IEEE Standard for Binary
1707 1.1 bjh21 Floating-Point Arithmetic.
1708 1.1 bjh21 -------------------------------------------------------------------------------
1709 1.1 bjh21 */
1710 1.1 bjh21 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1711 1.1 bjh21 {
1712 1.1 bjh21 int16 aExp, bExp, zExp;
1713 1.1 bjh21 bits32 aSig, bSig, zSig;
1714 1.1 bjh21 int16 expDiff;
1715 1.1 bjh21
1716 1.1 bjh21 aSig = extractFloat32Frac( a );
1717 1.1 bjh21 aExp = extractFloat32Exp( a );
1718 1.1 bjh21 bSig = extractFloat32Frac( b );
1719 1.1 bjh21 bExp = extractFloat32Exp( b );
1720 1.1 bjh21 expDiff = aExp - bExp;
1721 1.1 bjh21 aSig <<= 6;
1722 1.1 bjh21 bSig <<= 6;
1723 1.1 bjh21 if ( 0 < expDiff ) {
1724 1.1 bjh21 if ( aExp == 0xFF ) {
1725 1.1 bjh21 if ( aSig ) return propagateFloat32NaN( a, b );
1726 1.1 bjh21 return a;
1727 1.1 bjh21 }
1728 1.1 bjh21 if ( bExp == 0 ) {
1729 1.1 bjh21 --expDiff;
1730 1.1 bjh21 }
1731 1.1 bjh21 else {
1732 1.1 bjh21 bSig |= 0x20000000;
1733 1.1 bjh21 }
1734 1.1 bjh21 shift32RightJamming( bSig, expDiff, &bSig );
1735 1.1 bjh21 zExp = aExp;
1736 1.1 bjh21 }
1737 1.1 bjh21 else if ( expDiff < 0 ) {
1738 1.1 bjh21 if ( bExp == 0xFF ) {
1739 1.1 bjh21 if ( bSig ) return propagateFloat32NaN( a, b );
1740 1.1 bjh21 return packFloat32( zSign, 0xFF, 0 );
1741 1.1 bjh21 }
1742 1.1 bjh21 if ( aExp == 0 ) {
1743 1.1 bjh21 ++expDiff;
1744 1.1 bjh21 }
1745 1.1 bjh21 else {
1746 1.1 bjh21 aSig |= 0x20000000;
1747 1.1 bjh21 }
1748 1.1 bjh21 shift32RightJamming( aSig, - expDiff, &aSig );
1749 1.1 bjh21 zExp = bExp;
1750 1.1 bjh21 }
1751 1.1 bjh21 else {
1752 1.1 bjh21 if ( aExp == 0xFF ) {
1753 1.1 bjh21 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1754 1.1 bjh21 return a;
1755 1.1 bjh21 }
1756 1.1 bjh21 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1757 1.1 bjh21 zSig = 0x40000000 + aSig + bSig;
1758 1.1 bjh21 zExp = aExp;
1759 1.1 bjh21 goto roundAndPack;
1760 1.1 bjh21 }
1761 1.1 bjh21 aSig |= 0x20000000;
1762 1.1 bjh21 zSig = ( aSig + bSig )<<1;
1763 1.1 bjh21 --zExp;
1764 1.1 bjh21 if ( (sbits32) zSig < 0 ) {
1765 1.1 bjh21 zSig = aSig + bSig;
1766 1.1 bjh21 ++zExp;
1767 1.1 bjh21 }
1768 1.1 bjh21 roundAndPack:
1769 1.1 bjh21 return roundAndPackFloat32( zSign, zExp, zSig );
1770 1.1 bjh21
1771 1.1 bjh21 }
1772 1.1 bjh21
1773 1.1 bjh21 /*
1774 1.1 bjh21 -------------------------------------------------------------------------------
1775 1.1 bjh21 Returns the result of subtracting the absolute values of the single-
1776 1.1 bjh21 precision floating-point values `a' and `b'. If `zSign' is 1, the
1777 1.1 bjh21 difference is negated before being returned. `zSign' is ignored if the
1778 1.1 bjh21 result is a NaN. The subtraction is performed according to the IEC/IEEE
1779 1.1 bjh21 Standard for Binary Floating-Point Arithmetic.
1780 1.1 bjh21 -------------------------------------------------------------------------------
1781 1.1 bjh21 */
1782 1.1 bjh21 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1783 1.1 bjh21 {
1784 1.1 bjh21 int16 aExp, bExp, zExp;
1785 1.1 bjh21 bits32 aSig, bSig, zSig;
1786 1.1 bjh21 int16 expDiff;
1787 1.1 bjh21
1788 1.1 bjh21 aSig = extractFloat32Frac( a );
1789 1.1 bjh21 aExp = extractFloat32Exp( a );
1790 1.1 bjh21 bSig = extractFloat32Frac( b );
1791 1.1 bjh21 bExp = extractFloat32Exp( b );
1792 1.1 bjh21 expDiff = aExp - bExp;
1793 1.1 bjh21 aSig <<= 7;
1794 1.1 bjh21 bSig <<= 7;
1795 1.1 bjh21 if ( 0 < expDiff ) goto aExpBigger;
1796 1.1 bjh21 if ( expDiff < 0 ) goto bExpBigger;
1797 1.1 bjh21 if ( aExp == 0xFF ) {
1798 1.1 bjh21 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1799 1.1 bjh21 float_raise( float_flag_invalid );
1800 1.1 bjh21 return float32_default_nan;
1801 1.1 bjh21 }
1802 1.1 bjh21 if ( aExp == 0 ) {
1803 1.1 bjh21 aExp = 1;
1804 1.1 bjh21 bExp = 1;
1805 1.1 bjh21 }
1806 1.1 bjh21 if ( bSig < aSig ) goto aBigger;
1807 1.1 bjh21 if ( aSig < bSig ) goto bBigger;
1808 1.1 bjh21 return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1809 1.1 bjh21 bExpBigger:
1810 1.1 bjh21 if ( bExp == 0xFF ) {
1811 1.1 bjh21 if ( bSig ) return propagateFloat32NaN( a, b );
1812 1.1 bjh21 return packFloat32( zSign ^ 1, 0xFF, 0 );
1813 1.1 bjh21 }
1814 1.1 bjh21 if ( aExp == 0 ) {
1815 1.1 bjh21 ++expDiff;
1816 1.1 bjh21 }
1817 1.1 bjh21 else {
1818 1.1 bjh21 aSig |= 0x40000000;
1819 1.1 bjh21 }
1820 1.1 bjh21 shift32RightJamming( aSig, - expDiff, &aSig );
1821 1.1 bjh21 bSig |= 0x40000000;
1822 1.1 bjh21 bBigger:
1823 1.1 bjh21 zSig = bSig - aSig;
1824 1.1 bjh21 zExp = bExp;
1825 1.1 bjh21 zSign ^= 1;
1826 1.1 bjh21 goto normalizeRoundAndPack;
1827 1.1 bjh21 aExpBigger:
1828 1.1 bjh21 if ( aExp == 0xFF ) {
1829 1.1 bjh21 if ( aSig ) return propagateFloat32NaN( a, b );
1830 1.1 bjh21 return a;
1831 1.1 bjh21 }
1832 1.1 bjh21 if ( bExp == 0 ) {
1833 1.1 bjh21 --expDiff;
1834 1.1 bjh21 }
1835 1.1 bjh21 else {
1836 1.1 bjh21 bSig |= 0x40000000;
1837 1.1 bjh21 }
1838 1.1 bjh21 shift32RightJamming( bSig, expDiff, &bSig );
1839 1.1 bjh21 aSig |= 0x40000000;
1840 1.1 bjh21 aBigger:
1841 1.1 bjh21 zSig = aSig - bSig;
1842 1.1 bjh21 zExp = aExp;
1843 1.1 bjh21 normalizeRoundAndPack:
1844 1.1 bjh21 --zExp;
1845 1.1 bjh21 return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1846 1.1 bjh21
1847 1.1 bjh21 }
1848 1.1 bjh21
1849 1.1 bjh21 /*
1850 1.1 bjh21 -------------------------------------------------------------------------------
1851 1.1 bjh21 Returns the result of adding the single-precision floating-point values `a'
1852 1.1 bjh21 and `b'. The operation is performed according to the IEC/IEEE Standard for
1853 1.1 bjh21 Binary Floating-Point Arithmetic.
1854 1.1 bjh21 -------------------------------------------------------------------------------
1855 1.1 bjh21 */
1856 1.1 bjh21 float32 float32_add( float32 a, float32 b )
1857 1.1 bjh21 {
1858 1.1 bjh21 flag aSign, bSign;
1859 1.1 bjh21
1860 1.1 bjh21 aSign = extractFloat32Sign( a );
1861 1.1 bjh21 bSign = extractFloat32Sign( b );
1862 1.1 bjh21 if ( aSign == bSign ) {
1863 1.1 bjh21 return addFloat32Sigs( a, b, aSign );
1864 1.1 bjh21 }
1865 1.1 bjh21 else {
1866 1.1 bjh21 return subFloat32Sigs( a, b, aSign );
1867 1.1 bjh21 }
1868 1.1 bjh21
1869 1.1 bjh21 }
1870 1.1 bjh21
1871 1.1 bjh21 /*
1872 1.1 bjh21 -------------------------------------------------------------------------------
1873 1.1 bjh21 Returns the result of subtracting the single-precision floating-point values
1874 1.1 bjh21 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1875 1.1 bjh21 for Binary Floating-Point Arithmetic.
1876 1.1 bjh21 -------------------------------------------------------------------------------
1877 1.1 bjh21 */
1878 1.1 bjh21 float32 float32_sub( float32 a, float32 b )
1879 1.1 bjh21 {
1880 1.1 bjh21 flag aSign, bSign;
1881 1.1 bjh21
1882 1.1 bjh21 aSign = extractFloat32Sign( a );
1883 1.1 bjh21 bSign = extractFloat32Sign( b );
1884 1.1 bjh21 if ( aSign == bSign ) {
1885 1.1 bjh21 return subFloat32Sigs( a, b, aSign );
1886 1.1 bjh21 }
1887 1.1 bjh21 else {
1888 1.1 bjh21 return addFloat32Sigs( a, b, aSign );
1889 1.1 bjh21 }
1890 1.1 bjh21
1891 1.1 bjh21 }
1892 1.1 bjh21
1893 1.1 bjh21 /*
1894 1.1 bjh21 -------------------------------------------------------------------------------
1895 1.1 bjh21 Returns the result of multiplying the single-precision floating-point values
1896 1.1 bjh21 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1897 1.1 bjh21 for Binary Floating-Point Arithmetic.
1898 1.1 bjh21 -------------------------------------------------------------------------------
1899 1.1 bjh21 */
1900 1.1 bjh21 float32 float32_mul( float32 a, float32 b )
1901 1.1 bjh21 {
1902 1.1 bjh21 flag aSign, bSign, zSign;
1903 1.1 bjh21 int16 aExp, bExp, zExp;
1904 1.1 bjh21 bits32 aSig, bSig;
1905 1.1 bjh21 bits64 zSig64;
1906 1.1 bjh21 bits32 zSig;
1907 1.1 bjh21
1908 1.1 bjh21 aSig = extractFloat32Frac( a );
1909 1.1 bjh21 aExp = extractFloat32Exp( a );
1910 1.1 bjh21 aSign = extractFloat32Sign( a );
1911 1.1 bjh21 bSig = extractFloat32Frac( b );
1912 1.1 bjh21 bExp = extractFloat32Exp( b );
1913 1.1 bjh21 bSign = extractFloat32Sign( b );
1914 1.1 bjh21 zSign = aSign ^ bSign;
1915 1.1 bjh21 if ( aExp == 0xFF ) {
1916 1.1 bjh21 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1917 1.1 bjh21 return propagateFloat32NaN( a, b );
1918 1.1 bjh21 }
1919 1.1 bjh21 if ( ( bExp | bSig ) == 0 ) {
1920 1.1 bjh21 float_raise( float_flag_invalid );
1921 1.1 bjh21 return float32_default_nan;
1922 1.1 bjh21 }
1923 1.1 bjh21 return packFloat32( zSign, 0xFF, 0 );
1924 1.1 bjh21 }
1925 1.1 bjh21 if ( bExp == 0xFF ) {
1926 1.1 bjh21 if ( bSig ) return propagateFloat32NaN( a, b );
1927 1.1 bjh21 if ( ( aExp | aSig ) == 0 ) {
1928 1.1 bjh21 float_raise( float_flag_invalid );
1929 1.1 bjh21 return float32_default_nan;
1930 1.1 bjh21 }
1931 1.1 bjh21 return packFloat32( zSign, 0xFF, 0 );
1932 1.1 bjh21 }
1933 1.1 bjh21 if ( aExp == 0 ) {
1934 1.1 bjh21 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1935 1.1 bjh21 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1936 1.1 bjh21 }
1937 1.1 bjh21 if ( bExp == 0 ) {
1938 1.1 bjh21 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1939 1.1 bjh21 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1940 1.1 bjh21 }
1941 1.1 bjh21 zExp = aExp + bExp - 0x7F;
1942 1.1 bjh21 aSig = ( aSig | 0x00800000 )<<7;
1943 1.1 bjh21 bSig = ( bSig | 0x00800000 )<<8;
1944 1.1 bjh21 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1945 1.10 christos zSig = (bits32)zSig64;
1946 1.1 bjh21 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1947 1.1 bjh21 zSig <<= 1;
1948 1.1 bjh21 --zExp;
1949 1.1 bjh21 }
1950 1.1 bjh21 return roundAndPackFloat32( zSign, zExp, zSig );
1951 1.1 bjh21
1952 1.1 bjh21 }
1953 1.1 bjh21
1954 1.1 bjh21 /*
1955 1.1 bjh21 -------------------------------------------------------------------------------
1956 1.1 bjh21 Returns the result of dividing the single-precision floating-point value `a'
1957 1.1 bjh21 by the corresponding value `b'. The operation is performed according to the
1958 1.1 bjh21 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1959 1.1 bjh21 -------------------------------------------------------------------------------
1960 1.1 bjh21 */
1961 1.1 bjh21 float32 float32_div( float32 a, float32 b )
1962 1.1 bjh21 {
1963 1.1 bjh21 flag aSign, bSign, zSign;
1964 1.1 bjh21 int16 aExp, bExp, zExp;
1965 1.1 bjh21 bits32 aSig, bSig, zSig;
1966 1.1 bjh21
1967 1.1 bjh21 aSig = extractFloat32Frac( a );
1968 1.1 bjh21 aExp = extractFloat32Exp( a );
1969 1.1 bjh21 aSign = extractFloat32Sign( a );
1970 1.1 bjh21 bSig = extractFloat32Frac( b );
1971 1.1 bjh21 bExp = extractFloat32Exp( b );
1972 1.1 bjh21 bSign = extractFloat32Sign( b );
1973 1.1 bjh21 zSign = aSign ^ bSign;
1974 1.1 bjh21 if ( aExp == 0xFF ) {
1975 1.1 bjh21 if ( aSig ) return propagateFloat32NaN( a, b );
1976 1.1 bjh21 if ( bExp == 0xFF ) {
1977 1.1 bjh21 if ( bSig ) return propagateFloat32NaN( a, b );
1978 1.1 bjh21 float_raise( float_flag_invalid );
1979 1.1 bjh21 return float32_default_nan;
1980 1.1 bjh21 }
1981 1.1 bjh21 return packFloat32( zSign, 0xFF, 0 );
1982 1.1 bjh21 }
1983 1.1 bjh21 if ( bExp == 0xFF ) {
1984 1.1 bjh21 if ( bSig ) return propagateFloat32NaN( a, b );
1985 1.1 bjh21 return packFloat32( zSign, 0, 0 );
1986 1.1 bjh21 }
1987 1.1 bjh21 if ( bExp == 0 ) {
1988 1.1 bjh21 if ( bSig == 0 ) {
1989 1.1 bjh21 if ( ( aExp | aSig ) == 0 ) {
1990 1.1 bjh21 float_raise( float_flag_invalid );
1991 1.1 bjh21 return float32_default_nan;
1992 1.1 bjh21 }
1993 1.1 bjh21 float_raise( float_flag_divbyzero );
1994 1.1 bjh21 return packFloat32( zSign, 0xFF, 0 );
1995 1.1 bjh21 }
1996 1.1 bjh21 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1997 1.1 bjh21 }
1998 1.1 bjh21 if ( aExp == 0 ) {
1999 1.1 bjh21 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2000 1.1 bjh21 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2001 1.1 bjh21 }
2002 1.1 bjh21 zExp = aExp - bExp + 0x7D;
2003 1.1 bjh21 aSig = ( aSig | 0x00800000 )<<7;
2004 1.1 bjh21 bSig = ( bSig | 0x00800000 )<<8;
2005 1.1 bjh21 if ( bSig <= ( aSig + aSig ) ) {
2006 1.1 bjh21 aSig >>= 1;
2007 1.1 bjh21 ++zExp;
2008 1.1 bjh21 }
2009 1.11 matt zSig = (bits32)((((bits64) aSig) << 32) / bSig);
2010 1.1 bjh21 if ( ( zSig & 0x3F ) == 0 ) {
2011 1.1 bjh21 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
2012 1.1 bjh21 }
2013 1.1 bjh21 return roundAndPackFloat32( zSign, zExp, zSig );
2014 1.1 bjh21
2015 1.1 bjh21 }
2016 1.1 bjh21
2017 1.1 bjh21 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2018 1.1 bjh21 /*
2019 1.1 bjh21 -------------------------------------------------------------------------------
2020 1.1 bjh21 Returns the remainder of the single-precision floating-point value `a'
2021 1.1 bjh21 with respect to the corresponding value `b'. The operation is performed
2022 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2023 1.1 bjh21 -------------------------------------------------------------------------------
2024 1.1 bjh21 */
2025 1.1 bjh21 float32 float32_rem( float32 a, float32 b )
2026 1.1 bjh21 {
2027 1.1 bjh21 flag aSign, bSign, zSign;
2028 1.1 bjh21 int16 aExp, bExp, expDiff;
2029 1.1 bjh21 bits32 aSig, bSig;
2030 1.1 bjh21 bits32 q;
2031 1.1 bjh21 bits64 aSig64, bSig64, q64;
2032 1.1 bjh21 bits32 alternateASig;
2033 1.1 bjh21 sbits32 sigMean;
2034 1.1 bjh21
2035 1.1 bjh21 aSig = extractFloat32Frac( a );
2036 1.1 bjh21 aExp = extractFloat32Exp( a );
2037 1.1 bjh21 aSign = extractFloat32Sign( a );
2038 1.1 bjh21 bSig = extractFloat32Frac( b );
2039 1.1 bjh21 bExp = extractFloat32Exp( b );
2040 1.1 bjh21 bSign = extractFloat32Sign( b );
2041 1.1 bjh21 if ( aExp == 0xFF ) {
2042 1.1 bjh21 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2043 1.1 bjh21 return propagateFloat32NaN( a, b );
2044 1.1 bjh21 }
2045 1.1 bjh21 float_raise( float_flag_invalid );
2046 1.1 bjh21 return float32_default_nan;
2047 1.1 bjh21 }
2048 1.1 bjh21 if ( bExp == 0xFF ) {
2049 1.1 bjh21 if ( bSig ) return propagateFloat32NaN( a, b );
2050 1.1 bjh21 return a;
2051 1.1 bjh21 }
2052 1.1 bjh21 if ( bExp == 0 ) {
2053 1.1 bjh21 if ( bSig == 0 ) {
2054 1.1 bjh21 float_raise( float_flag_invalid );
2055 1.1 bjh21 return float32_default_nan;
2056 1.1 bjh21 }
2057 1.1 bjh21 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2058 1.1 bjh21 }
2059 1.1 bjh21 if ( aExp == 0 ) {
2060 1.1 bjh21 if ( aSig == 0 ) return a;
2061 1.1 bjh21 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2062 1.1 bjh21 }
2063 1.1 bjh21 expDiff = aExp - bExp;
2064 1.1 bjh21 aSig |= 0x00800000;
2065 1.1 bjh21 bSig |= 0x00800000;
2066 1.1 bjh21 if ( expDiff < 32 ) {
2067 1.1 bjh21 aSig <<= 8;
2068 1.1 bjh21 bSig <<= 8;
2069 1.1 bjh21 if ( expDiff < 0 ) {
2070 1.1 bjh21 if ( expDiff < -1 ) return a;
2071 1.1 bjh21 aSig >>= 1;
2072 1.1 bjh21 }
2073 1.1 bjh21 q = ( bSig <= aSig );
2074 1.1 bjh21 if ( q ) aSig -= bSig;
2075 1.1 bjh21 if ( 0 < expDiff ) {
2076 1.1 bjh21 q = ( ( (bits64) aSig )<<32 ) / bSig;
2077 1.1 bjh21 q >>= 32 - expDiff;
2078 1.1 bjh21 bSig >>= 2;
2079 1.1 bjh21 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2080 1.1 bjh21 }
2081 1.1 bjh21 else {
2082 1.1 bjh21 aSig >>= 2;
2083 1.1 bjh21 bSig >>= 2;
2084 1.1 bjh21 }
2085 1.1 bjh21 }
2086 1.1 bjh21 else {
2087 1.1 bjh21 if ( bSig <= aSig ) aSig -= bSig;
2088 1.1 bjh21 aSig64 = ( (bits64) aSig )<<40;
2089 1.1 bjh21 bSig64 = ( (bits64) bSig )<<40;
2090 1.1 bjh21 expDiff -= 64;
2091 1.1 bjh21 while ( 0 < expDiff ) {
2092 1.1 bjh21 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2093 1.1 bjh21 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2094 1.1 bjh21 aSig64 = - ( ( bSig * q64 )<<38 );
2095 1.1 bjh21 expDiff -= 62;
2096 1.1 bjh21 }
2097 1.1 bjh21 expDiff += 64;
2098 1.1 bjh21 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2099 1.1 bjh21 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2100 1.1 bjh21 q = q64>>( 64 - expDiff );
2101 1.1 bjh21 bSig <<= 6;
2102 1.1 bjh21 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2103 1.1 bjh21 }
2104 1.1 bjh21 do {
2105 1.1 bjh21 alternateASig = aSig;
2106 1.1 bjh21 ++q;
2107 1.1 bjh21 aSig -= bSig;
2108 1.1 bjh21 } while ( 0 <= (sbits32) aSig );
2109 1.1 bjh21 sigMean = aSig + alternateASig;
2110 1.1 bjh21 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2111 1.1 bjh21 aSig = alternateASig;
2112 1.1 bjh21 }
2113 1.1 bjh21 zSign = ( (sbits32) aSig < 0 );
2114 1.1 bjh21 if ( zSign ) aSig = - aSig;
2115 1.1 bjh21 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2116 1.1 bjh21
2117 1.1 bjh21 }
2118 1.1 bjh21 #endif /* !SOFTFLOAT_FOR_GCC */
2119 1.1 bjh21
2120 1.1 bjh21 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2121 1.1 bjh21 /*
2122 1.1 bjh21 -------------------------------------------------------------------------------
2123 1.1 bjh21 Returns the square root of the single-precision floating-point value `a'.
2124 1.1 bjh21 The operation is performed according to the IEC/IEEE Standard for Binary
2125 1.1 bjh21 Floating-Point Arithmetic.

Special cases handled before the general path:
  - a NaN operand is handed to propagateFloat32NaN();
  - +infinity and -0 are returned unchanged, +0 returns 0;
  - -infinity and every other negative operand raise the invalid exception
    and return float32_default_nan.
2126 1.1 bjh21 -------------------------------------------------------------------------------
2127 1.1 bjh21 */
2128 1.1 bjh21 float32 float32_sqrt( float32 a )
2129 1.1 bjh21 {
2130 1.1 bjh21 flag aSign;
2131 1.1 bjh21 int16 aExp, zExp;
2132 1.1 bjh21 bits32 aSig, zSig;
2133 1.1 bjh21 bits64 rem, term;
2134 1.1 bjh21
2135 1.1 bjh21 aSig = extractFloat32Frac( a );
2136 1.1 bjh21 aExp = extractFloat32Exp( a );
2137 1.1 bjh21 aSign = extractFloat32Sign( a );
    /* Exponent field all ones: NaN or infinity. */
2138 1.1 bjh21 if ( aExp == 0xFF ) {
2139 1.1 bjh21 if ( aSig ) return propagateFloat32NaN( a, 0 );
2140 1.1 bjh21 if ( ! aSign ) return a;
    /* -infinity has no real square root. */
2141 1.1 bjh21 float_raise( float_flag_invalid );
2142 1.1 bjh21 return float32_default_nan;
2143 1.1 bjh21 }
2144 1.1 bjh21 if ( aSign ) {
    /* -0 is returned as is; any other negative value is invalid. */
2145 1.1 bjh21 if ( ( aExp | aSig ) == 0 ) return a;
2146 1.1 bjh21 float_raise( float_flag_invalid );
2147 1.1 bjh21 return float32_default_nan;
2148 1.1 bjh21 }
2149 1.1 bjh21 if ( aExp == 0 ) {
    /* +0 returns 0; a subnormal is normalized before the general path. */
2150 1.1 bjh21 if ( aSig == 0 ) return 0;
2151 1.1 bjh21 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2152 1.1 bjh21 }
    /* Halve the unbiased exponent and re-bias it with 0x7E. */
2153 1.1 bjh21 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
    /* Set the implicit bit (bit 23) and move it up to bit 31. */
2154 1.1 bjh21 aSig = ( aSig | 0x00800000 )<<8;
    /* estimateSqrt32() is defined elsewhere in this file; its result is
       biased upward by 2 here and corrected downward below when needed. */
2155 1.1 bjh21 zSig = estimateSqrt32( aExp, aSig ) + 2;
    /* The exact remainder is computed only when the low seven bits of the
       estimate are 5 or less.
       NOTE(review): presumably other estimates cannot be close enough to a
       rounding boundary to matter -- confirm against estimateSqrt32's error
       bound. */
2156 1.1 bjh21 if ( ( zSig & 0x7F ) <= 5 ) {
    /* zSig < 2 can only happen if the `+ 2' above wrapped around the top of
       bits32; the result is then pinned to 0x7FFFFFFF, skipping the final
       right shift. */
2157 1.1 bjh21 if ( zSig < 2 ) {
2158 1.1 bjh21 zSig = 0x7FFFFFFF;
2159 1.1 bjh21 goto roundAndPack;
2160 1.1 bjh21 }
    /* For an odd exponent field the significand is shifted right by one. */
2161 1.1 bjh21 aSig >>= aExp & 1;
2162 1.1 bjh21 term = ( (bits64) zSig ) * zSig;
2163 1.1 bjh21 rem = ( ( (bits64) aSig )<<32 ) - term;
    /* While zSig squared exceeds the radicand, step zSig down by one.  After
       the decrement, 2*zSig + 1 is exactly the amount by which the square
       shrank, so `rem' stays equal to (aSig<<32) - zSig*zSig. */
2164 1.1 bjh21 while ( (sbits64) rem < 0 ) {
2165 1.1 bjh21 --zSig;
2166 1.1 bjh21 rem += ( ( (bits64) zSig )<<1 ) | 1;
2167 1.1 bjh21 }
    /* A non-zero remainder is recorded in the lowest bit (sticky bit). */
2168 1.1 bjh21 zSig |= ( rem != 0 );
2169 1.1 bjh21 }
2170 1.1 bjh21 shift32RightJamming( zSig, 1, &zSig );
2171 1.1 bjh21 roundAndPack:
    /* The result of this path is always non-negative, hence sign 0. */
2172 1.1 bjh21 return roundAndPackFloat32( 0, zExp, zSig );
2173 1.1 bjh21
2174 1.1 bjh21 }
2175 1.1 bjh21 #endif /* !SOFTFLOAT_FOR_GCC */
2176 1.1 bjh21
2177 1.1 bjh21 /*
2178 1.1 bjh21 -------------------------------------------------------------------------------
2179 1.1 bjh21 Returns 1 if the single-precision floating-point value `a' is equal to
2180 1.1 bjh21 the corresponding value `b', and 0 otherwise. The comparison is performed
2181 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2182 1.1 bjh21 -------------------------------------------------------------------------------
2183 1.1 bjh21 */
2184 1.1 bjh21 flag float32_eq( float32 a, float32 b )
2185 1.1 bjh21 {
2186 1.1 bjh21
2187 1.1 bjh21 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2188 1.1 bjh21 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2189 1.1 bjh21 ) {
2190 1.1 bjh21 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2191 1.1 bjh21 float_raise( float_flag_invalid );
2192 1.1 bjh21 }
2193 1.1 bjh21 return 0;
2194 1.1 bjh21 }
2195 1.1 bjh21 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2196 1.1 bjh21
2197 1.1 bjh21 }
2198 1.1 bjh21
2199 1.1 bjh21 /*
2200 1.1 bjh21 -------------------------------------------------------------------------------
2201 1.1 bjh21 Returns 1 if the single-precision floating-point value `a' is less than
2202 1.1 bjh21 or equal to the corresponding value `b', and 0 otherwise. The comparison
2203 1.1 bjh21 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2204 1.1 bjh21 Arithmetic.
2205 1.1 bjh21 -------------------------------------------------------------------------------
2206 1.1 bjh21 */
2207 1.1 bjh21 flag float32_le( float32 a, float32 b )
2208 1.1 bjh21 {
2209 1.1 bjh21 flag aSign, bSign;
2210 1.1 bjh21
2211 1.1 bjh21 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2212 1.1 bjh21 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2213 1.1 bjh21 ) {
2214 1.1 bjh21 float_raise( float_flag_invalid );
2215 1.1 bjh21 return 0;
2216 1.1 bjh21 }
2217 1.1 bjh21 aSign = extractFloat32Sign( a );
2218 1.1 bjh21 bSign = extractFloat32Sign( b );
2219 1.1 bjh21 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2220 1.1 bjh21 return ( a == b ) || ( aSign ^ ( a < b ) );
2221 1.1 bjh21
2222 1.1 bjh21 }
2223 1.1 bjh21
2224 1.1 bjh21 /*
2225 1.1 bjh21 -------------------------------------------------------------------------------
2226 1.1 bjh21 Returns 1 if the single-precision floating-point value `a' is less than
2227 1.1 bjh21 the corresponding value `b', and 0 otherwise. The comparison is performed
2228 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2229 1.1 bjh21 -------------------------------------------------------------------------------
2230 1.1 bjh21 */
2231 1.1 bjh21 flag float32_lt( float32 a, float32 b )
2232 1.1 bjh21 {
2233 1.1 bjh21 flag aSign, bSign;
2234 1.1 bjh21
2235 1.1 bjh21 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2236 1.1 bjh21 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2237 1.1 bjh21 ) {
2238 1.1 bjh21 float_raise( float_flag_invalid );
2239 1.1 bjh21 return 0;
2240 1.1 bjh21 }
2241 1.1 bjh21 aSign = extractFloat32Sign( a );
2242 1.1 bjh21 bSign = extractFloat32Sign( b );
2243 1.1 bjh21 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2244 1.1 bjh21 return ( a != b ) && ( aSign ^ ( a < b ) );
2245 1.1 bjh21
2246 1.1 bjh21 }
2247 1.1 bjh21
2248 1.1 bjh21 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2249 1.1 bjh21 /*
2250 1.1 bjh21 -------------------------------------------------------------------------------
2251 1.1 bjh21 Returns 1 if the single-precision floating-point value `a' is equal to
2252 1.1 bjh21 the corresponding value `b', and 0 otherwise. The invalid exception is
2253 1.1 bjh21 raised if either operand is a NaN. Otherwise, the comparison is performed
2254 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2255 1.1 bjh21 -------------------------------------------------------------------------------
2256 1.1 bjh21 */
2257 1.1 bjh21 flag float32_eq_signaling( float32 a, float32 b )
2258 1.1 bjh21 {
2259 1.1 bjh21
2260 1.1 bjh21 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2261 1.1 bjh21 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2262 1.1 bjh21 ) {
2263 1.1 bjh21 float_raise( float_flag_invalid );
2264 1.1 bjh21 return 0;
2265 1.1 bjh21 }
2266 1.1 bjh21 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2267 1.1 bjh21
2268 1.1 bjh21 }
2269 1.1 bjh21
2270 1.1 bjh21 /*
2271 1.1 bjh21 -------------------------------------------------------------------------------
2272 1.1 bjh21 Returns 1 if the single-precision floating-point value `a' is less than or
2273 1.1 bjh21 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2274 1.1 bjh21 cause an exception. Otherwise, the comparison is performed according to the
2275 1.1 bjh21 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2276 1.1 bjh21 -------------------------------------------------------------------------------
2277 1.1 bjh21 */
2278 1.1 bjh21 flag float32_le_quiet( float32 a, float32 b )
2279 1.1 bjh21 {
2280 1.1 bjh21 flag aSign, bSign;
2281 1.1 bjh21
2282 1.1 bjh21 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2283 1.1 bjh21 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2284 1.1 bjh21 ) {
2285 1.1 bjh21 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2286 1.1 bjh21 float_raise( float_flag_invalid );
2287 1.1 bjh21 }
2288 1.1 bjh21 return 0;
2289 1.1 bjh21 }
2290 1.1 bjh21 aSign = extractFloat32Sign( a );
2291 1.1 bjh21 bSign = extractFloat32Sign( b );
2292 1.1 bjh21 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2293 1.1 bjh21 return ( a == b ) || ( aSign ^ ( a < b ) );
2294 1.1 bjh21
2295 1.1 bjh21 }
2296 1.1 bjh21
2297 1.1 bjh21 /*
2298 1.1 bjh21 -------------------------------------------------------------------------------
2299 1.1 bjh21 Returns 1 if the single-precision floating-point value `a' is less than
2300 1.1 bjh21 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2301 1.1 bjh21 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2302 1.1 bjh21 Standard for Binary Floating-Point Arithmetic.
2303 1.1 bjh21 -------------------------------------------------------------------------------
2304 1.1 bjh21 */
2305 1.1 bjh21 flag float32_lt_quiet( float32 a, float32 b )
2306 1.1 bjh21 {
2307 1.1 bjh21 flag aSign, bSign;
2308 1.1 bjh21
2309 1.1 bjh21 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2310 1.1 bjh21 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2311 1.1 bjh21 ) {
2312 1.1 bjh21 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2313 1.1 bjh21 float_raise( float_flag_invalid );
2314 1.1 bjh21 }
2315 1.1 bjh21 return 0;
2316 1.1 bjh21 }
2317 1.1 bjh21 aSign = extractFloat32Sign( a );
2318 1.1 bjh21 bSign = extractFloat32Sign( b );
2319 1.1 bjh21 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2320 1.1 bjh21 return ( a != b ) && ( aSign ^ ( a < b ) );
2321 1.1 bjh21
2322 1.1 bjh21 }
2323 1.1 bjh21 #endif /* !SOFTFLOAT_FOR_GCC */
2324 1.1 bjh21
2325 1.1 bjh21 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2326 1.1 bjh21 /*
2327 1.1 bjh21 -------------------------------------------------------------------------------
2328 1.1 bjh21 Returns the result of converting the double-precision floating-point value
2329 1.1 bjh21 `a' to the 32-bit two's complement integer format. The conversion is
2330 1.1 bjh21 performed according to the IEC/IEEE Standard for Binary Floating-Point
2331 1.1 bjh21 Arithmetic---which means in particular that the conversion is rounded
2332 1.1 bjh21 according to the current rounding mode. If `a' is a NaN, the largest
2333 1.1 bjh21 positive integer is returned. Otherwise, if the conversion overflows, the
2334 1.1 bjh21 largest integer with the same sign as `a' is returned.
2335 1.1 bjh21 -------------------------------------------------------------------------------
2336 1.1 bjh21 */
2337 1.1 bjh21 int32 float64_to_int32( float64 a )
2338 1.1 bjh21 {
2339 1.1 bjh21 flag aSign;
2340 1.1 bjh21 int16 aExp, shiftCount;
2341 1.1 bjh21 bits64 aSig;
2342 1.1 bjh21
2343 1.1 bjh21 aSig = extractFloat64Frac( a );
2344 1.1 bjh21 aExp = extractFloat64Exp( a );
2345 1.1 bjh21 aSign = extractFloat64Sign( a );
2346 1.1 bjh21 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2347 1.1 bjh21 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2348 1.1 bjh21 shiftCount = 0x42C - aExp;
2349 1.1 bjh21 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2350 1.1 bjh21 return roundAndPackInt32( aSign, aSig );
2351 1.1 bjh21
2352 1.1 bjh21 }
2353 1.1 bjh21 #endif /* !SOFTFLOAT_FOR_GCC */
2354 1.1 bjh21
2355 1.1 bjh21 /*
2356 1.1 bjh21 -------------------------------------------------------------------------------
2357 1.1 bjh21 Returns the result of converting the double-precision floating-point value
2358 1.1 bjh21 `a' to the 32-bit two's complement integer format. The conversion is
2359 1.1 bjh21 performed according to the IEC/IEEE Standard for Binary Floating-Point
2360 1.1 bjh21 Arithmetic, except that the conversion is always rounded toward zero.
2361 1.1 bjh21 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2362 1.1 bjh21 the conversion overflows, the largest integer with the same sign as `a' is
2363 1.1 bjh21 returned.
2364 1.1 bjh21 -------------------------------------------------------------------------------
2365 1.1 bjh21 */
2366 1.1 bjh21 int32 float64_to_int32_round_to_zero( float64 a )
2367 1.1 bjh21 {
2368 1.1 bjh21 flag aSign;
2369 1.1 bjh21 int16 aExp, shiftCount;
2370 1.1 bjh21 bits64 aSig, savedASig;
2371 1.1 bjh21 int32 z;
2372 1.1 bjh21
2373 1.1 bjh21 aSig = extractFloat64Frac( a );
2374 1.1 bjh21 aExp = extractFloat64Exp( a );
2375 1.1 bjh21 aSign = extractFloat64Sign( a );
2376 1.1 bjh21 if ( 0x41E < aExp ) {
2377 1.1 bjh21 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2378 1.1 bjh21 goto invalid;
2379 1.1 bjh21 }
2380 1.1 bjh21 else if ( aExp < 0x3FF ) {
2381 1.11.2.1 tls if ( aExp || aSig ) set_float_exception_inexact_flag();
2382 1.1 bjh21 return 0;
2383 1.1 bjh21 }
2384 1.1 bjh21 aSig |= LIT64( 0x0010000000000000 );
2385 1.1 bjh21 shiftCount = 0x433 - aExp;
2386 1.1 bjh21 savedASig = aSig;
2387 1.1 bjh21 aSig >>= shiftCount;
2388 1.10 christos z = (int32)aSig;
2389 1.1 bjh21 if ( aSign ) z = - z;
2390 1.1 bjh21 if ( ( z < 0 ) ^ aSign ) {
2391 1.1 bjh21 invalid:
2392 1.1 bjh21 float_raise( float_flag_invalid );
2393 1.1 bjh21 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2394 1.1 bjh21 }
2395 1.1 bjh21 if ( ( aSig<<shiftCount ) != savedASig ) {
2396 1.11.2.1 tls set_float_exception_inexact_flag();
2397 1.1 bjh21 }
2398 1.1 bjh21 return z;
2399 1.1 bjh21
2400 1.1 bjh21 }
2401 1.1 bjh21
2402 1.1 bjh21 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2403 1.1 bjh21 /*
2404 1.1 bjh21 -------------------------------------------------------------------------------
2405 1.1 bjh21 Returns the result of converting the double-precision floating-point value
2406 1.1 bjh21 `a' to the 64-bit two's complement integer format. The conversion is
2407 1.1 bjh21 performed according to the IEC/IEEE Standard for Binary Floating-Point
2408 1.1 bjh21 Arithmetic---which means in particular that the conversion is rounded
2409 1.1 bjh21 according to the current rounding mode. If `a' is a NaN, the largest
2410 1.1 bjh21 positive integer is returned. Otherwise, if the conversion overflows, the
2411 1.1 bjh21 largest integer with the same sign as `a' is returned.
2412 1.1 bjh21 -------------------------------------------------------------------------------
2413 1.1 bjh21 */
2414 1.1 bjh21 int64 float64_to_int64( float64 a )
2415 1.1 bjh21 {
2416 1.1 bjh21 flag aSign;
2417 1.1 bjh21 int16 aExp, shiftCount;
2418 1.1 bjh21 bits64 aSig, aSigExtra;
2419 1.1 bjh21
2420 1.1 bjh21 aSig = extractFloat64Frac( a );
2421 1.1 bjh21 aExp = extractFloat64Exp( a );
2422 1.1 bjh21 aSign = extractFloat64Sign( a );
2423 1.1 bjh21 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2424 1.1 bjh21 shiftCount = 0x433 - aExp;
2425 1.1 bjh21 if ( shiftCount <= 0 ) {
2426 1.1 bjh21 if ( 0x43E < aExp ) {
2427 1.1 bjh21 float_raise( float_flag_invalid );
2428 1.1 bjh21 if ( ! aSign
2429 1.1 bjh21 || ( ( aExp == 0x7FF )
2430 1.1 bjh21 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2431 1.1 bjh21 ) {
2432 1.1 bjh21 return LIT64( 0x7FFFFFFFFFFFFFFF );
2433 1.1 bjh21 }
2434 1.1 bjh21 return (sbits64) LIT64( 0x8000000000000000 );
2435 1.1 bjh21 }
2436 1.1 bjh21 aSigExtra = 0;
2437 1.1 bjh21 aSig <<= - shiftCount;
2438 1.1 bjh21 }
2439 1.1 bjh21 else {
2440 1.1 bjh21 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2441 1.1 bjh21 }
2442 1.1 bjh21 return roundAndPackInt64( aSign, aSig, aSigExtra );
2443 1.1 bjh21
2444 1.1 bjh21 }
2445 1.1 bjh21
2446 1.1 bjh21 /*
2447 1.1 bjh21 -------------------------------------------------------------------------------
2448 1.1 bjh21 Returns the result of converting the double-precision floating-point value
2449 1.1 bjh21 `a' to the 64-bit two's complement integer format. The conversion is
2450 1.1 bjh21 performed according to the IEC/IEEE Standard for Binary Floating-Point
2451 1.1 bjh21 Arithmetic, except that the conversion is always rounded toward zero.
2452 1.1 bjh21 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2453 1.1 bjh21 the conversion overflows, the largest integer with the same sign as `a' is
2454 1.1 bjh21 returned.
2455 1.1 bjh21 -------------------------------------------------------------------------------
2456 1.1 bjh21 */
2457 1.1 bjh21 int64 float64_to_int64_round_to_zero( float64 a )
2458 1.1 bjh21 {
2459 1.1 bjh21 flag aSign;
2460 1.1 bjh21 int16 aExp, shiftCount;
2461 1.1 bjh21 bits64 aSig;
2462 1.1 bjh21 int64 z;
2463 1.1 bjh21
2464 1.1 bjh21 aSig = extractFloat64Frac( a );
2465 1.1 bjh21 aExp = extractFloat64Exp( a );
2466 1.1 bjh21 aSign = extractFloat64Sign( a );
2467 1.1 bjh21 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2468 1.1 bjh21 shiftCount = aExp - 0x433;
2469 1.1 bjh21 if ( 0 <= shiftCount ) {
2470 1.1 bjh21 if ( 0x43E <= aExp ) {
2471 1.1 bjh21 if ( a != LIT64( 0xC3E0000000000000 ) ) {
2472 1.1 bjh21 float_raise( float_flag_invalid );
2473 1.1 bjh21 if ( ! aSign
2474 1.1 bjh21 || ( ( aExp == 0x7FF )
2475 1.1 bjh21 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2476 1.1 bjh21 ) {
2477 1.1 bjh21 return LIT64( 0x7FFFFFFFFFFFFFFF );
2478 1.1 bjh21 }
2479 1.1 bjh21 }
2480 1.1 bjh21 return (sbits64) LIT64( 0x8000000000000000 );
2481 1.1 bjh21 }
2482 1.1 bjh21 z = aSig<<shiftCount;
2483 1.1 bjh21 }
2484 1.1 bjh21 else {
2485 1.1 bjh21 if ( aExp < 0x3FE ) {
2486 1.11.2.1 tls if ( aExp | aSig ) set_float_exception_inexact_flag();
2487 1.1 bjh21 return 0;
2488 1.1 bjh21 }
2489 1.1 bjh21 z = aSig>>( - shiftCount );
2490 1.1 bjh21 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2491 1.11.2.1 tls set_float_exception_inexact_flag();
2492 1.1 bjh21 }
2493 1.1 bjh21 }
2494 1.1 bjh21 if ( aSign ) z = - z;
2495 1.1 bjh21 return z;
2496 1.1 bjh21
2497 1.1 bjh21 }
2498 1.1 bjh21 #endif /* !SOFTFLOAT_FOR_GCC */
2499 1.1 bjh21
2500 1.1 bjh21 /*
2501 1.1 bjh21 -------------------------------------------------------------------------------
2502 1.1 bjh21 Returns the result of converting the double-precision floating-point value
2503 1.1 bjh21 `a' to the single-precision floating-point format. The conversion is
2504 1.1 bjh21 performed according to the IEC/IEEE Standard for Binary Floating-Point
2505 1.1 bjh21 Arithmetic.
2506 1.1 bjh21 -------------------------------------------------------------------------------
2507 1.1 bjh21 */
2508 1.1 bjh21 float32 float64_to_float32( float64 a )
2509 1.1 bjh21 {
2510 1.1 bjh21 flag aSign;
2511 1.1 bjh21 int16 aExp;
2512 1.1 bjh21 bits64 aSig;
2513 1.1 bjh21 bits32 zSig;
2514 1.1 bjh21
2515 1.1 bjh21 aSig = extractFloat64Frac( a );
2516 1.1 bjh21 aExp = extractFloat64Exp( a );
2517 1.1 bjh21 aSign = extractFloat64Sign( a );
2518 1.1 bjh21 if ( aExp == 0x7FF ) {
2519 1.1 bjh21 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2520 1.1 bjh21 return packFloat32( aSign, 0xFF, 0 );
2521 1.1 bjh21 }
2522 1.1 bjh21 shift64RightJamming( aSig, 22, &aSig );
2523 1.10 christos zSig = (bits32)aSig;
2524 1.1 bjh21 if ( aExp || zSig ) {
2525 1.1 bjh21 zSig |= 0x40000000;
2526 1.1 bjh21 aExp -= 0x381;
2527 1.1 bjh21 }
2528 1.1 bjh21 return roundAndPackFloat32( aSign, aExp, zSig );
2529 1.1 bjh21
2530 1.1 bjh21 }
2531 1.1 bjh21
2532 1.1 bjh21 #ifdef FLOATX80
2533 1.1 bjh21
2534 1.1 bjh21 /*
2535 1.1 bjh21 -------------------------------------------------------------------------------
2536 1.1 bjh21 Returns the result of converting the double-precision floating-point value
2537 1.1 bjh21 `a' to the extended double-precision floating-point format. The conversion
2538 1.1 bjh21 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2539 1.1 bjh21 Arithmetic.
2540 1.1 bjh21 -------------------------------------------------------------------------------
2541 1.1 bjh21 */
2542 1.1 bjh21 floatx80 float64_to_floatx80( float64 a )
2543 1.1 bjh21 {
2544 1.1 bjh21 flag aSign;
2545 1.1 bjh21 int16 aExp;
2546 1.1 bjh21 bits64 aSig;
2547 1.1 bjh21
2548 1.1 bjh21 aSig = extractFloat64Frac( a );
2549 1.1 bjh21 aExp = extractFloat64Exp( a );
2550 1.1 bjh21 aSign = extractFloat64Sign( a );
2551 1.1 bjh21 if ( aExp == 0x7FF ) {
2552 1.1 bjh21 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2553 1.1 bjh21 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2554 1.1 bjh21 }
2555 1.1 bjh21 if ( aExp == 0 ) {
2556 1.1 bjh21 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2557 1.1 bjh21 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2558 1.1 bjh21 }
2559 1.1 bjh21 return
2560 1.1 bjh21 packFloatx80(
2561 1.1 bjh21 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2562 1.1 bjh21
2563 1.1 bjh21 }
2564 1.1 bjh21
2565 1.1 bjh21 #endif
2566 1.1 bjh21
2567 1.1 bjh21 #ifdef FLOAT128
2568 1.1 bjh21
2569 1.1 bjh21 /*
2570 1.1 bjh21 -------------------------------------------------------------------------------
2571 1.1 bjh21 Returns the result of converting the double-precision floating-point value
2572 1.1 bjh21 `a' to the quadruple-precision floating-point format. The conversion is
2573 1.1 bjh21 performed according to the IEC/IEEE Standard for Binary Floating-Point
2574 1.1 bjh21 Arithmetic.
2575 1.1 bjh21 -------------------------------------------------------------------------------
2576 1.1 bjh21 */
2577 1.1 bjh21 float128 float64_to_float128( float64 a )
2578 1.1 bjh21 {
2579 1.1 bjh21 flag aSign;
2580 1.1 bjh21 int16 aExp;
2581 1.1 bjh21 bits64 aSig, zSig0, zSig1;
2582 1.1 bjh21
2583 1.1 bjh21 aSig = extractFloat64Frac( a );
2584 1.1 bjh21 aExp = extractFloat64Exp( a );
2585 1.1 bjh21 aSign = extractFloat64Sign( a );
2586 1.1 bjh21 if ( aExp == 0x7FF ) {
2587 1.1 bjh21 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2588 1.1 bjh21 return packFloat128( aSign, 0x7FFF, 0, 0 );
2589 1.1 bjh21 }
2590 1.1 bjh21 if ( aExp == 0 ) {
2591 1.1 bjh21 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2592 1.1 bjh21 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2593 1.1 bjh21 --aExp;
2594 1.1 bjh21 }
2595 1.1 bjh21 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2596 1.1 bjh21 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2597 1.1 bjh21
2598 1.1 bjh21 }
2599 1.1 bjh21
2600 1.1 bjh21 #endif
2601 1.1 bjh21
2602 1.1 bjh21 #ifndef SOFTFLOAT_FOR_GCC
2603 1.1 bjh21 /*
2604 1.1 bjh21 -------------------------------------------------------------------------------
2605 1.1 bjh21 Rounds the double-precision floating-point value `a' to an integer, and
2606 1.1 bjh21 returns the result as a double-precision floating-point value. The
2607 1.1 bjh21 operation is performed according to the IEC/IEEE Standard for Binary
2608 1.1 bjh21 Floating-Point Arithmetic.
2609 1.1 bjh21 -------------------------------------------------------------------------------
2610 1.1 bjh21 */
2611 1.1 bjh21 float64 float64_round_to_int( float64 a )
2612 1.1 bjh21 {
2613 1.1 bjh21 flag aSign;
2614 1.1 bjh21 int16 aExp;
2615 1.1 bjh21 bits64 lastBitMask, roundBitsMask;
2616 1.1 bjh21 int8 roundingMode;
2617 1.1 bjh21 float64 z;
2618 1.1 bjh21
2619 1.1 bjh21 aExp = extractFloat64Exp( a );
2620 1.1 bjh21 if ( 0x433 <= aExp ) {
2621 1.1 bjh21 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2622 1.1 bjh21 return propagateFloat64NaN( a, a );
2623 1.1 bjh21 }
2624 1.1 bjh21 return a;
2625 1.1 bjh21 }
2626 1.1 bjh21 if ( aExp < 0x3FF ) {
2627 1.1 bjh21 if ( (bits64) ( a<<1 ) == 0 ) return a;
2628 1.11.2.1 tls set_float_exception_inexact_flag();
2629 1.1 bjh21 aSign = extractFloat64Sign( a );
2630 1.1 bjh21 switch ( float_rounding_mode ) {
2631 1.1 bjh21 case float_round_nearest_even:
2632 1.1 bjh21 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2633 1.1 bjh21 return packFloat64( aSign, 0x3FF, 0 );
2634 1.1 bjh21 }
2635 1.1 bjh21 break;
2636 1.1 bjh21 case float_round_to_zero:
2637 1.1 bjh21 break;
2638 1.1 bjh21 case float_round_down:
2639 1.1 bjh21 return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2640 1.1 bjh21 case float_round_up:
2641 1.1 bjh21 return
2642 1.1 bjh21 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2643 1.1 bjh21 }
2644 1.1 bjh21 return packFloat64( aSign, 0, 0 );
2645 1.1 bjh21 }
2646 1.1 bjh21 lastBitMask = 1;
2647 1.1 bjh21 lastBitMask <<= 0x433 - aExp;
2648 1.1 bjh21 roundBitsMask = lastBitMask - 1;
2649 1.1 bjh21 z = a;
2650 1.1 bjh21 roundingMode = float_rounding_mode;
2651 1.1 bjh21 if ( roundingMode == float_round_nearest_even ) {
2652 1.1 bjh21 z += lastBitMask>>1;
2653 1.1 bjh21 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2654 1.1 bjh21 }
2655 1.1 bjh21 else if ( roundingMode != float_round_to_zero ) {
2656 1.1 bjh21 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2657 1.1 bjh21 z += roundBitsMask;
2658 1.1 bjh21 }
2659 1.1 bjh21 }
2660 1.1 bjh21 z &= ~ roundBitsMask;
2661 1.11.2.1 tls if ( z != a ) set_float_exception_inexact_flag();
2662 1.1 bjh21 return z;
2663 1.1 bjh21
2664 1.1 bjh21 }
2665 1.1 bjh21 #endif
2666 1.1 bjh21
2667 1.1 bjh21 /*
2668 1.1 bjh21 -------------------------------------------------------------------------------
2669 1.1 bjh21 Returns the result of adding the absolute values of the double-precision
2670 1.1 bjh21 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2671 1.1 bjh21 before being returned. `zSign' is ignored if the result is a NaN.
2672 1.1 bjh21 The addition is performed according to the IEC/IEEE Standard for Binary
2673 1.1 bjh21 Floating-Point Arithmetic.

Both fractions are first shifted left by 9 bits, which places the implicit
leading bit at 0x2000000000000000.  The operand with the smaller exponent is
then shifted right (with jamming) to align it with the larger one before the
significands are added.
2674 1.1 bjh21 -------------------------------------------------------------------------------
2675 1.1 bjh21 */
2676 1.1 bjh21 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2677 1.1 bjh21 {
2678 1.1 bjh21 int16 aExp, bExp, zExp;
2679 1.1 bjh21 bits64 aSig, bSig, zSig;
2680 1.1 bjh21 int16 expDiff;
2681 1.1 bjh21
2682 1.1 bjh21 aSig = extractFloat64Frac( a );
2683 1.1 bjh21 aExp = extractFloat64Exp( a );
2684 1.1 bjh21 bSig = extractFloat64Frac( b );
2685 1.1 bjh21 bExp = extractFloat64Exp( b );
2686 1.1 bjh21 expDiff = aExp - bExp;
2687 1.1 bjh21 aSig <<= 9;
2688 1.1 bjh21 bSig <<= 9;
    /* Case 1: `a' has the larger exponent, so `b' is aligned to it. */
2689 1.1 bjh21 if ( 0 < expDiff ) {
2690 1.1 bjh21 if ( aExp == 0x7FF ) {
    /* `a' is a NaN or an infinity; an infinity is returned as is. */
2691 1.1 bjh21 if ( aSig ) return propagateFloat64NaN( a, b );
2692 1.1 bjh21 return a;
2693 1.1 bjh21 }
2694 1.1 bjh21 if ( bExp == 0 ) {
    /* `b' has exponent field 0 (zero or subnormal): no implicit bit is
       added and the alignment shift is one less. */
2695 1.1 bjh21 --expDiff;
2696 1.1 bjh21 }
2697 1.1 bjh21 else {
2698 1.1 bjh21 bSig |= LIT64( 0x2000000000000000 );
2699 1.1 bjh21 }
2700 1.1 bjh21 shift64RightJamming( bSig, expDiff, &bSig );
2701 1.1 bjh21 zExp = aExp;
2702 1.1 bjh21 }
    /* Case 2: `b' has the larger exponent; mirror image of case 1. */
2703 1.1 bjh21 else if ( expDiff < 0 ) {
2704 1.1 bjh21 if ( bExp == 0x7FF ) {
    /* `b' is a NaN or an infinity; the infinity takes the sign zSign. */
2705 1.1 bjh21 if ( bSig ) return propagateFloat64NaN( a, b );
2706 1.1 bjh21 return packFloat64( zSign, 0x7FF, 0 );
2707 1.1 bjh21 }
2708 1.1 bjh21 if ( aExp == 0 ) {
2709 1.1 bjh21 ++expDiff;
2710 1.1 bjh21 }
2711 1.1 bjh21 else {
2712 1.1 bjh21 aSig |= LIT64( 0x2000000000000000 );
2713 1.1 bjh21 }
2714 1.1 bjh21 shift64RightJamming( aSig, - expDiff, &aSig );
2715 1.1 bjh21 zExp = bExp;
2716 1.1 bjh21 }
    /* Case 3: equal exponents, no alignment needed. */
2717 1.1 bjh21 else {
2718 1.1 bjh21 if ( aExp == 0x7FF ) {
2719 1.1 bjh21 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2720 1.1 bjh21 return a;
2721 1.1 bjh21 }
    /* Both exponent fields are 0: the fractions are added and packed
       directly, without rounding. */
2722 1.1 bjh21 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
    /* 0x4000000000000000 is the sum of the two implicit leading bits. */
2723 1.1 bjh21 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2724 1.1 bjh21 zExp = aExp;
2725 1.1 bjh21 goto roundAndPack;
2726 1.1 bjh21 }
    /* Cases 1 and 2 join here.  OR-ing the implicit bit into aSig covers
       both: in case 1 `a' is the larger operand and still lacks it, in
       case 2 the bit is already set in bSig and aSig has been shifted below
       it. */
2727 1.1 bjh21 aSig |= LIT64( 0x2000000000000000 );
    /* Try the sum shifted left by one with the exponent lowered; if that
       sets the top bit, fall back to the unshifted sum and restore the
       exponent. */
2728 1.1 bjh21 zSig = ( aSig + bSig )<<1;
2729 1.1 bjh21 --zExp;
2730 1.1 bjh21 if ( (sbits64) zSig < 0 ) {
2731 1.1 bjh21 zSig = aSig + bSig;
2732 1.1 bjh21 ++zExp;
2733 1.1 bjh21 }
2734 1.1 bjh21 roundAndPack:
2735 1.1 bjh21 return roundAndPackFloat64( zSign, zExp, zSig );
2736 1.1 bjh21
2737 1.1 bjh21 }
2738 1.1 bjh21
2739 1.1 bjh21 /*
2740 1.1 bjh21 -------------------------------------------------------------------------------
2741 1.1 bjh21 Returns the result of subtracting the absolute values of the double-
2742 1.1 bjh21 precision floating-point values `a' and `b'. If `zSign' is 1, the
2743 1.1 bjh21 difference is negated before being returned. `zSign' is ignored if the
2744 1.1 bjh21 result is a NaN. The subtraction is performed according to the IEC/IEEE
2745 1.1 bjh21 Standard for Binary Floating-Point Arithmetic.

Both fractions are first shifted left by 10 bits, which places the implicit
leading bit at 0x4000000000000000.  The smaller operand is always subtracted
from the larger; when `b' is the larger one the result sign is flipped.
2746 1.1 bjh21 -------------------------------------------------------------------------------
2747 1.1 bjh21 */
2748 1.1 bjh21 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2749 1.1 bjh21 {
2750 1.1 bjh21 int16 aExp, bExp, zExp;
2751 1.1 bjh21 bits64 aSig, bSig, zSig;
2752 1.1 bjh21 int16 expDiff;
2753 1.1 bjh21
2754 1.1 bjh21 aSig = extractFloat64Frac( a );
2755 1.1 bjh21 aExp = extractFloat64Exp( a );
2756 1.1 bjh21 bSig = extractFloat64Frac( b );
2757 1.1 bjh21 bExp = extractFloat64Exp( b );
2758 1.1 bjh21 expDiff = aExp - bExp;
2759 1.1 bjh21 aSig <<= 10;
2760 1.1 bjh21 bSig <<= 10;
2761 1.1 bjh21 if ( 0 < expDiff ) goto aExpBigger;
2762 1.1 bjh21 if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents from here down to the `bExpBigger' label. */
2763 1.1 bjh21 if ( aExp == 0x7FF ) {
    /* NaN operands propagate; otherwise both are infinities and their
       difference is invalid. */
2764 1.1 bjh21 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2765 1.1 bjh21 float_raise( float_flag_invalid );
2766 1.1 bjh21 return float64_default_nan;
2767 1.1 bjh21 }
2768 1.1 bjh21 if ( aExp == 0 ) {
    /* Both exponent fields are 0: use exponent 1 for the result, with no
       implicit bit added to either significand. */
2769 1.1 bjh21 aExp = 1;
2770 1.1 bjh21 bExp = 1;
2771 1.1 bjh21 }
2772 1.1 bjh21 if ( bSig < aSig ) goto aBigger;
2773 1.1 bjh21 if ( aSig < bSig ) goto bBigger;
    /* Equal magnitudes: the result is a zero, negative only when rounding
       down. */
2774 1.1 bjh21 return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2775 1.1 bjh21 bExpBigger:
2776 1.1 bjh21 if ( bExp == 0x7FF ) {
    /* `b' is a NaN or an infinity; subtracting an infinity gives an
       infinity of the opposite sign. */
2777 1.1 bjh21 if ( bSig ) return propagateFloat64NaN( a, b );
2778 1.1 bjh21 return packFloat64( zSign ^ 1, 0x7FF, 0 );
2779 1.1 bjh21 }
2780 1.1 bjh21 if ( aExp == 0 ) {
    /* `a' has exponent field 0: no implicit bit, and the alignment shift
       is one less. */
2781 1.1 bjh21 ++expDiff;
2782 1.1 bjh21 }
2783 1.1 bjh21 else {
2784 1.1 bjh21 aSig |= LIT64( 0x4000000000000000 );
2785 1.1 bjh21 }
2786 1.1 bjh21 shift64RightJamming( aSig, - expDiff, &aSig );
2787 1.1 bjh21 bSig |= LIT64( 0x4000000000000000 );
2788 1.1 bjh21 bBigger:
    /* |b| > |a|: compute b - a and flip the result sign. */
2789 1.1 bjh21 zSig = bSig - aSig;
2790 1.1 bjh21 zExp = bExp;
2791 1.1 bjh21 zSign ^= 1;
2792 1.1 bjh21 goto normalizeRoundAndPack;
2793 1.1 bjh21 aExpBigger:
2794 1.1 bjh21 if ( aExp == 0x7FF ) {
    /* `a' is a NaN or an infinity; an infinity is returned as is. */
2795 1.1 bjh21 if ( aSig ) return propagateFloat64NaN( a, b );
2796 1.1 bjh21 return a;
2797 1.1 bjh21 }
2798 1.1 bjh21 if ( bExp == 0 ) {
    /* `b' has exponent field 0: no implicit bit, and the alignment shift
       is one less. */
2799 1.1 bjh21 --expDiff;
2800 1.1 bjh21 }
2801 1.1 bjh21 else {
2802 1.1 bjh21 bSig |= LIT64( 0x4000000000000000 );
2803 1.1 bjh21 }
2804 1.1 bjh21 shift64RightJamming( bSig, expDiff, &bSig );
2805 1.1 bjh21 aSig |= LIT64( 0x4000000000000000 );
2806 1.1 bjh21 aBigger:
    /* |a| > |b|: compute a - b; the result sign stays zSign. */
2807 1.1 bjh21 zSig = aSig - bSig;
2808 1.1 bjh21 zExp = aExp;
2809 1.1 bjh21 normalizeRoundAndPack:
    /* NOTE(review): the decrement presumably matches the significand
       position that normalizeRoundAndPackFloat64() expects -- confirm
       against that helper. */
2810 1.1 bjh21 --zExp;
2811 1.1 bjh21 return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2812 1.1 bjh21
2813 1.1 bjh21 }
2814 1.1 bjh21
2815 1.1 bjh21 /*
2816 1.1 bjh21 -------------------------------------------------------------------------------
2817 1.1 bjh21 Returns the result of adding the double-precision floating-point values `a'
2818 1.1 bjh21 and `b'. The operation is performed according to the IEC/IEEE Standard for
2819 1.1 bjh21 Binary Floating-Point Arithmetic.
2820 1.1 bjh21 -------------------------------------------------------------------------------
2821 1.1 bjh21 */
2822 1.1 bjh21 float64 float64_add( float64 a, float64 b )
2823 1.1 bjh21 {
2824 1.1 bjh21 flag aSign, bSign;
2825 1.1 bjh21
2826 1.1 bjh21 aSign = extractFloat64Sign( a );
2827 1.1 bjh21 bSign = extractFloat64Sign( b );
2828 1.1 bjh21 if ( aSign == bSign ) {
2829 1.1 bjh21 return addFloat64Sigs( a, b, aSign );
2830 1.1 bjh21 }
2831 1.1 bjh21 else {
2832 1.1 bjh21 return subFloat64Sigs( a, b, aSign );
2833 1.1 bjh21 }
2834 1.1 bjh21
2835 1.1 bjh21 }
2836 1.1 bjh21
2837 1.1 bjh21 /*
2838 1.1 bjh21 -------------------------------------------------------------------------------
2839 1.1 bjh21 Returns the result of subtracting the double-precision floating-point values
2840 1.1 bjh21 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2841 1.1 bjh21 for Binary Floating-Point Arithmetic.
2842 1.1 bjh21 -------------------------------------------------------------------------------
2843 1.1 bjh21 */
2844 1.1 bjh21 float64 float64_sub( float64 a, float64 b )
2845 1.1 bjh21 {
2846 1.1 bjh21 flag aSign, bSign;
2847 1.1 bjh21
2848 1.1 bjh21 aSign = extractFloat64Sign( a );
2849 1.1 bjh21 bSign = extractFloat64Sign( b );
2850 1.1 bjh21 if ( aSign == bSign ) {
2851 1.1 bjh21 return subFloat64Sigs( a, b, aSign );
2852 1.1 bjh21 }
2853 1.1 bjh21 else {
2854 1.1 bjh21 return addFloat64Sigs( a, b, aSign );
2855 1.1 bjh21 }
2856 1.1 bjh21
2857 1.1 bjh21 }
2858 1.1 bjh21
2859 1.1 bjh21 /*
2860 1.1 bjh21 -------------------------------------------------------------------------------
2861 1.1 bjh21 Returns the result of multiplying the double-precision floating-point values
2862 1.1 bjh21 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2863 1.1 bjh21 for Binary Floating-Point Arithmetic.
2864 1.1 bjh21 -------------------------------------------------------------------------------
2865 1.1 bjh21 */
2866 1.1 bjh21 float64 float64_mul( float64 a, float64 b )
2867 1.1 bjh21 {
2868 1.1 bjh21 flag aSign, bSign, zSign;
2869 1.1 bjh21 int16 aExp, bExp, zExp;
2870 1.1 bjh21 bits64 aSig, bSig, zSig0, zSig1;
2871 1.1 bjh21
2872 1.1 bjh21 aSig = extractFloat64Frac( a );
2873 1.1 bjh21 aExp = extractFloat64Exp( a );
2874 1.1 bjh21 aSign = extractFloat64Sign( a );
2875 1.1 bjh21 bSig = extractFloat64Frac( b );
2876 1.1 bjh21 bExp = extractFloat64Exp( b );
2877 1.1 bjh21 bSign = extractFloat64Sign( b );
2878 1.1 bjh21 zSign = aSign ^ bSign;
2879 1.1 bjh21 if ( aExp == 0x7FF ) {
2880 1.1 bjh21 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2881 1.1 bjh21 return propagateFloat64NaN( a, b );
2882 1.1 bjh21 }
2883 1.1 bjh21 if ( ( bExp | bSig ) == 0 ) {
2884 1.1 bjh21 float_raise( float_flag_invalid );
2885 1.1 bjh21 return float64_default_nan;
2886 1.1 bjh21 }
2887 1.1 bjh21 return packFloat64( zSign, 0x7FF, 0 );
2888 1.1 bjh21 }
2889 1.1 bjh21 if ( bExp == 0x7FF ) {
2890 1.1 bjh21 if ( bSig ) return propagateFloat64NaN( a, b );
2891 1.1 bjh21 if ( ( aExp | aSig ) == 0 ) {
2892 1.1 bjh21 float_raise( float_flag_invalid );
2893 1.1 bjh21 return float64_default_nan;
2894 1.1 bjh21 }
2895 1.1 bjh21 return packFloat64( zSign, 0x7FF, 0 );
2896 1.1 bjh21 }
2897 1.1 bjh21 if ( aExp == 0 ) {
2898 1.1 bjh21 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2899 1.1 bjh21 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2900 1.1 bjh21 }
2901 1.1 bjh21 if ( bExp == 0 ) {
2902 1.1 bjh21 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2903 1.1 bjh21 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2904 1.1 bjh21 }
2905 1.1 bjh21 zExp = aExp + bExp - 0x3FF;
2906 1.1 bjh21 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2907 1.1 bjh21 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2908 1.1 bjh21 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2909 1.1 bjh21 zSig0 |= ( zSig1 != 0 );
2910 1.1 bjh21 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2911 1.1 bjh21 zSig0 <<= 1;
2912 1.1 bjh21 --zExp;
2913 1.1 bjh21 }
2914 1.1 bjh21 return roundAndPackFloat64( zSign, zExp, zSig0 );
2915 1.1 bjh21
2916 1.1 bjh21 }
2917 1.1 bjh21
2918 1.1 bjh21 /*
2919 1.1 bjh21 -------------------------------------------------------------------------------
2920 1.1 bjh21 Returns the result of dividing the double-precision floating-point value `a'
2921 1.1 bjh21 by the corresponding value `b'. The operation is performed according to
2922 1.1 bjh21 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2923 1.1 bjh21 -------------------------------------------------------------------------------
2924 1.1 bjh21 */
2925 1.1 bjh21 float64 float64_div( float64 a, float64 b )
2926 1.1 bjh21 {
2927 1.1 bjh21 flag aSign, bSign, zSign;
2928 1.1 bjh21 int16 aExp, bExp, zExp;
2929 1.1 bjh21 bits64 aSig, bSig, zSig;
2930 1.1 bjh21 bits64 rem0, rem1;
2931 1.1 bjh21 bits64 term0, term1;
2932 1.1 bjh21
2933 1.1 bjh21 aSig = extractFloat64Frac( a );
2934 1.1 bjh21 aExp = extractFloat64Exp( a );
2935 1.1 bjh21 aSign = extractFloat64Sign( a );
2936 1.1 bjh21 bSig = extractFloat64Frac( b );
2937 1.1 bjh21 bExp = extractFloat64Exp( b );
2938 1.1 bjh21 bSign = extractFloat64Sign( b );
2939 1.1 bjh21 zSign = aSign ^ bSign;
2940 1.1 bjh21 if ( aExp == 0x7FF ) {
2941 1.1 bjh21 if ( aSig ) return propagateFloat64NaN( a, b );
2942 1.1 bjh21 if ( bExp == 0x7FF ) {
2943 1.1 bjh21 if ( bSig ) return propagateFloat64NaN( a, b );
2944 1.1 bjh21 float_raise( float_flag_invalid );
2945 1.1 bjh21 return float64_default_nan;
2946 1.1 bjh21 }
2947 1.1 bjh21 return packFloat64( zSign, 0x7FF, 0 );
2948 1.1 bjh21 }
2949 1.1 bjh21 if ( bExp == 0x7FF ) {
2950 1.1 bjh21 if ( bSig ) return propagateFloat64NaN( a, b );
2951 1.1 bjh21 return packFloat64( zSign, 0, 0 );
2952 1.1 bjh21 }
2953 1.1 bjh21 if ( bExp == 0 ) {
2954 1.1 bjh21 if ( bSig == 0 ) {
2955 1.1 bjh21 if ( ( aExp | aSig ) == 0 ) {
2956 1.1 bjh21 float_raise( float_flag_invalid );
2957 1.1 bjh21 return float64_default_nan;
2958 1.1 bjh21 }
2959 1.1 bjh21 float_raise( float_flag_divbyzero );
2960 1.1 bjh21 return packFloat64( zSign, 0x7FF, 0 );
2961 1.1 bjh21 }
2962 1.1 bjh21 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2963 1.1 bjh21 }
2964 1.1 bjh21 if ( aExp == 0 ) {
2965 1.1 bjh21 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2966 1.1 bjh21 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2967 1.1 bjh21 }
2968 1.1 bjh21 zExp = aExp - bExp + 0x3FD;
2969 1.1 bjh21 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2970 1.1 bjh21 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2971 1.1 bjh21 if ( bSig <= ( aSig + aSig ) ) {
2972 1.1 bjh21 aSig >>= 1;
2973 1.1 bjh21 ++zExp;
2974 1.1 bjh21 }
2975 1.1 bjh21 zSig = estimateDiv128To64( aSig, 0, bSig );
2976 1.1 bjh21 if ( ( zSig & 0x1FF ) <= 2 ) {
2977 1.1 bjh21 mul64To128( bSig, zSig, &term0, &term1 );
2978 1.1 bjh21 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2979 1.1 bjh21 while ( (sbits64) rem0 < 0 ) {
2980 1.1 bjh21 --zSig;
2981 1.1 bjh21 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2982 1.1 bjh21 }
2983 1.1 bjh21 zSig |= ( rem1 != 0 );
2984 1.1 bjh21 }
2985 1.1 bjh21 return roundAndPackFloat64( zSign, zExp, zSig );
2986 1.1 bjh21
2987 1.1 bjh21 }
2988 1.1 bjh21
2989 1.1 bjh21 #ifndef SOFTFLOAT_FOR_GCC
2990 1.1 bjh21 /*
2991 1.1 bjh21 -------------------------------------------------------------------------------
2992 1.1 bjh21 Returns the remainder of the double-precision floating-point value `a'
2993 1.1 bjh21 with respect to the corresponding value `b'. The operation is performed
2994 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2995 1.1 bjh21 -------------------------------------------------------------------------------
2996 1.1 bjh21 */
2997 1.1 bjh21 float64 float64_rem( float64 a, float64 b )
2998 1.1 bjh21 {
2999 1.1 bjh21 flag aSign, bSign, zSign;
3000 1.1 bjh21 int16 aExp, bExp, expDiff;
3001 1.1 bjh21 bits64 aSig, bSig;
3002 1.1 bjh21 bits64 q, alternateASig;
3003 1.1 bjh21 sbits64 sigMean;
3004 1.1 bjh21
3005 1.1 bjh21 aSig = extractFloat64Frac( a );
3006 1.1 bjh21 aExp = extractFloat64Exp( a );
3007 1.1 bjh21 aSign = extractFloat64Sign( a );
3008 1.1 bjh21 bSig = extractFloat64Frac( b );
3009 1.1 bjh21 bExp = extractFloat64Exp( b );
3010 1.1 bjh21 bSign = extractFloat64Sign( b );
3011 1.1 bjh21 if ( aExp == 0x7FF ) {
3012 1.1 bjh21 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3013 1.1 bjh21 return propagateFloat64NaN( a, b );
3014 1.1 bjh21 }
3015 1.1 bjh21 float_raise( float_flag_invalid );
3016 1.1 bjh21 return float64_default_nan;
3017 1.1 bjh21 }
3018 1.1 bjh21 if ( bExp == 0x7FF ) {
3019 1.1 bjh21 if ( bSig ) return propagateFloat64NaN( a, b );
3020 1.1 bjh21 return a;
3021 1.1 bjh21 }
3022 1.1 bjh21 if ( bExp == 0 ) {
3023 1.1 bjh21 if ( bSig == 0 ) {
3024 1.1 bjh21 float_raise( float_flag_invalid );
3025 1.1 bjh21 return float64_default_nan;
3026 1.1 bjh21 }
3027 1.1 bjh21 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3028 1.1 bjh21 }
3029 1.1 bjh21 if ( aExp == 0 ) {
3030 1.1 bjh21 if ( aSig == 0 ) return a;
3031 1.1 bjh21 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3032 1.1 bjh21 }
3033 1.1 bjh21 expDiff = aExp - bExp;
3034 1.1 bjh21 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3035 1.1 bjh21 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3036 1.1 bjh21 if ( expDiff < 0 ) {
3037 1.1 bjh21 if ( expDiff < -1 ) return a;
3038 1.1 bjh21 aSig >>= 1;
3039 1.1 bjh21 }
3040 1.1 bjh21 q = ( bSig <= aSig );
3041 1.1 bjh21 if ( q ) aSig -= bSig;
3042 1.1 bjh21 expDiff -= 64;
3043 1.1 bjh21 while ( 0 < expDiff ) {
3044 1.1 bjh21 q = estimateDiv128To64( aSig, 0, bSig );
3045 1.1 bjh21 q = ( 2 < q ) ? q - 2 : 0;
3046 1.1 bjh21 aSig = - ( ( bSig>>2 ) * q );
3047 1.1 bjh21 expDiff -= 62;
3048 1.1 bjh21 }
3049 1.1 bjh21 expDiff += 64;
3050 1.1 bjh21 if ( 0 < expDiff ) {
3051 1.1 bjh21 q = estimateDiv128To64( aSig, 0, bSig );
3052 1.1 bjh21 q = ( 2 < q ) ? q - 2 : 0;
3053 1.1 bjh21 q >>= 64 - expDiff;
3054 1.1 bjh21 bSig >>= 2;
3055 1.1 bjh21 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3056 1.1 bjh21 }
3057 1.1 bjh21 else {
3058 1.1 bjh21 aSig >>= 2;
3059 1.1 bjh21 bSig >>= 2;
3060 1.1 bjh21 }
3061 1.1 bjh21 do {
3062 1.1 bjh21 alternateASig = aSig;
3063 1.1 bjh21 ++q;
3064 1.1 bjh21 aSig -= bSig;
3065 1.1 bjh21 } while ( 0 <= (sbits64) aSig );
3066 1.1 bjh21 sigMean = aSig + alternateASig;
3067 1.1 bjh21 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3068 1.1 bjh21 aSig = alternateASig;
3069 1.1 bjh21 }
3070 1.1 bjh21 zSign = ( (sbits64) aSig < 0 );
3071 1.1 bjh21 if ( zSign ) aSig = - aSig;
3072 1.1 bjh21 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3073 1.1 bjh21
3074 1.1 bjh21 }
3075 1.1 bjh21
3076 1.1 bjh21 /*
3077 1.1 bjh21 -------------------------------------------------------------------------------
3078 1.1 bjh21 Returns the square root of the double-precision floating-point value `a'.
3079 1.1 bjh21 The operation is performed according to the IEC/IEEE Standard for Binary
3080 1.1 bjh21 Floating-Point Arithmetic.
3081 1.1 bjh21 -------------------------------------------------------------------------------
3082 1.1 bjh21 */
3083 1.1 bjh21 float64 float64_sqrt( float64 a )
3084 1.1 bjh21 {
3085 1.1 bjh21 flag aSign;
3086 1.1 bjh21 int16 aExp, zExp;
3087 1.1 bjh21 bits64 aSig, zSig, doubleZSig;
3088 1.1 bjh21 bits64 rem0, rem1, term0, term1;
3089 1.1 bjh21
3090 1.1 bjh21 aSig = extractFloat64Frac( a );
3091 1.1 bjh21 aExp = extractFloat64Exp( a );
3092 1.1 bjh21 aSign = extractFloat64Sign( a );
3093 1.1 bjh21 if ( aExp == 0x7FF ) {
3094 1.1 bjh21 if ( aSig ) return propagateFloat64NaN( a, a );
3095 1.1 bjh21 if ( ! aSign ) return a;
3096 1.1 bjh21 float_raise( float_flag_invalid );
3097 1.1 bjh21 return float64_default_nan;
3098 1.1 bjh21 }
3099 1.1 bjh21 if ( aSign ) {
3100 1.1 bjh21 if ( ( aExp | aSig ) == 0 ) return a;
3101 1.1 bjh21 float_raise( float_flag_invalid );
3102 1.1 bjh21 return float64_default_nan;
3103 1.1 bjh21 }
3104 1.1 bjh21 if ( aExp == 0 ) {
3105 1.1 bjh21 if ( aSig == 0 ) return 0;
3106 1.1 bjh21 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3107 1.1 bjh21 }
3108 1.1 bjh21 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3109 1.1 bjh21 aSig |= LIT64( 0x0010000000000000 );
3110 1.1 bjh21 zSig = estimateSqrt32( aExp, aSig>>21 );
3111 1.1 bjh21 aSig <<= 9 - ( aExp & 1 );
3112 1.1 bjh21 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3113 1.1 bjh21 if ( ( zSig & 0x1FF ) <= 5 ) {
3114 1.1 bjh21 doubleZSig = zSig<<1;
3115 1.1 bjh21 mul64To128( zSig, zSig, &term0, &term1 );
3116 1.1 bjh21 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3117 1.1 bjh21 while ( (sbits64) rem0 < 0 ) {
3118 1.1 bjh21 --zSig;
3119 1.1 bjh21 doubleZSig -= 2;
3120 1.1 bjh21 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3121 1.1 bjh21 }
3122 1.1 bjh21 zSig |= ( ( rem0 | rem1 ) != 0 );
3123 1.1 bjh21 }
3124 1.1 bjh21 return roundAndPackFloat64( 0, zExp, zSig );
3125 1.1 bjh21
3126 1.1 bjh21 }
3127 1.1 bjh21 #endif
3128 1.1 bjh21
3129 1.1 bjh21 /*
3130 1.1 bjh21 -------------------------------------------------------------------------------
3131 1.1 bjh21 Returns 1 if the double-precision floating-point value `a' is equal to the
3132 1.1 bjh21 corresponding value `b', and 0 otherwise. The comparison is performed
3133 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3134 1.1 bjh21 -------------------------------------------------------------------------------
3135 1.1 bjh21 */
3136 1.1 bjh21 flag float64_eq( float64 a, float64 b )
3137 1.1 bjh21 {
3138 1.1 bjh21
3139 1.1 bjh21 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3140 1.1 bjh21 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3141 1.1 bjh21 ) {
3142 1.1 bjh21 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3143 1.1 bjh21 float_raise( float_flag_invalid );
3144 1.1 bjh21 }
3145 1.1 bjh21 return 0;
3146 1.1 bjh21 }
3147 1.1 bjh21 return ( a == b ) ||
3148 1.1 bjh21 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3149 1.1 bjh21
3150 1.1 bjh21 }
3151 1.1 bjh21
3152 1.1 bjh21 /*
3153 1.1 bjh21 -------------------------------------------------------------------------------
3154 1.1 bjh21 Returns 1 if the double-precision floating-point value `a' is less than or
3155 1.1 bjh21 equal to the corresponding value `b', and 0 otherwise. The comparison is
3156 1.1 bjh21 performed according to the IEC/IEEE Standard for Binary Floating-Point
3157 1.1 bjh21 Arithmetic.
3158 1.1 bjh21 -------------------------------------------------------------------------------
3159 1.1 bjh21 */
3160 1.1 bjh21 flag float64_le( float64 a, float64 b )
3161 1.1 bjh21 {
3162 1.1 bjh21 flag aSign, bSign;
3163 1.1 bjh21
3164 1.1 bjh21 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3165 1.1 bjh21 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3166 1.1 bjh21 ) {
3167 1.1 bjh21 float_raise( float_flag_invalid );
3168 1.1 bjh21 return 0;
3169 1.1 bjh21 }
3170 1.1 bjh21 aSign = extractFloat64Sign( a );
3171 1.1 bjh21 bSign = extractFloat64Sign( b );
3172 1.1 bjh21 if ( aSign != bSign )
3173 1.1 bjh21 return aSign ||
3174 1.1 bjh21 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3175 1.1 bjh21 0 );
3176 1.1 bjh21 return ( a == b ) ||
3177 1.1 bjh21 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3178 1.1 bjh21
3179 1.1 bjh21 }
3180 1.1 bjh21
3181 1.1 bjh21 /*
3182 1.1 bjh21 -------------------------------------------------------------------------------
3183 1.1 bjh21 Returns 1 if the double-precision floating-point value `a' is less than
3184 1.1 bjh21 the corresponding value `b', and 0 otherwise. The comparison is performed
3185 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3186 1.1 bjh21 -------------------------------------------------------------------------------
3187 1.1 bjh21 */
3188 1.1 bjh21 flag float64_lt( float64 a, float64 b )
3189 1.1 bjh21 {
3190 1.1 bjh21 flag aSign, bSign;
3191 1.1 bjh21
3192 1.1 bjh21 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3193 1.1 bjh21 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3194 1.1 bjh21 ) {
3195 1.1 bjh21 float_raise( float_flag_invalid );
3196 1.1 bjh21 return 0;
3197 1.1 bjh21 }
3198 1.1 bjh21 aSign = extractFloat64Sign( a );
3199 1.1 bjh21 bSign = extractFloat64Sign( b );
3200 1.1 bjh21 if ( aSign != bSign )
3201 1.1 bjh21 return aSign &&
3202 1.1 bjh21 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3203 1.1 bjh21 0 );
3204 1.1 bjh21 return ( a != b ) &&
3205 1.1 bjh21 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3206 1.1 bjh21
3207 1.1 bjh21 }
3208 1.1 bjh21
3209 1.1 bjh21 #ifndef SOFTFLOAT_FOR_GCC
3210 1.1 bjh21 /*
3211 1.1 bjh21 -------------------------------------------------------------------------------
3212 1.1 bjh21 Returns 1 if the double-precision floating-point value `a' is equal to the
3213 1.1 bjh21 corresponding value `b', and 0 otherwise. The invalid exception is raised
3214 1.1 bjh21 if either operand is a NaN. Otherwise, the comparison is performed
3215 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3216 1.1 bjh21 -------------------------------------------------------------------------------
3217 1.1 bjh21 */
3218 1.1 bjh21 flag float64_eq_signaling( float64 a, float64 b )
3219 1.1 bjh21 {
3220 1.1 bjh21
3221 1.1 bjh21 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3222 1.1 bjh21 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3223 1.1 bjh21 ) {
3224 1.1 bjh21 float_raise( float_flag_invalid );
3225 1.1 bjh21 return 0;
3226 1.1 bjh21 }
3227 1.1 bjh21 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3228 1.1 bjh21
3229 1.1 bjh21 }
3230 1.1 bjh21
3231 1.1 bjh21 /*
3232 1.1 bjh21 -------------------------------------------------------------------------------
3233 1.1 bjh21 Returns 1 if the double-precision floating-point value `a' is less than or
3234 1.1 bjh21 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3235 1.1 bjh21 cause an exception. Otherwise, the comparison is performed according to the
3236 1.1 bjh21 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3237 1.1 bjh21 -------------------------------------------------------------------------------
3238 1.1 bjh21 */
3239 1.1 bjh21 flag float64_le_quiet( float64 a, float64 b )
3240 1.1 bjh21 {
3241 1.1 bjh21 flag aSign, bSign;
3242 1.1 bjh21
3243 1.1 bjh21 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3244 1.1 bjh21 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3245 1.1 bjh21 ) {
3246 1.1 bjh21 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3247 1.1 bjh21 float_raise( float_flag_invalid );
3248 1.1 bjh21 }
3249 1.1 bjh21 return 0;
3250 1.1 bjh21 }
3251 1.1 bjh21 aSign = extractFloat64Sign( a );
3252 1.1 bjh21 bSign = extractFloat64Sign( b );
3253 1.1 bjh21 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3254 1.1 bjh21 return ( a == b ) || ( aSign ^ ( a < b ) );
3255 1.1 bjh21
3256 1.1 bjh21 }
3257 1.1 bjh21
3258 1.1 bjh21 /*
3259 1.1 bjh21 -------------------------------------------------------------------------------
3260 1.1 bjh21 Returns 1 if the double-precision floating-point value `a' is less than
3261 1.1 bjh21 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3262 1.1 bjh21 exception. Otherwise, the comparison is performed according to the IEC/IEEE
3263 1.1 bjh21 Standard for Binary Floating-Point Arithmetic.
3264 1.1 bjh21 -------------------------------------------------------------------------------
3265 1.1 bjh21 */
3266 1.1 bjh21 flag float64_lt_quiet( float64 a, float64 b )
3267 1.1 bjh21 {
3268 1.1 bjh21 flag aSign, bSign;
3269 1.1 bjh21
3270 1.1 bjh21 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3271 1.1 bjh21 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3272 1.1 bjh21 ) {
3273 1.1 bjh21 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3274 1.1 bjh21 float_raise( float_flag_invalid );
3275 1.1 bjh21 }
3276 1.1 bjh21 return 0;
3277 1.1 bjh21 }
3278 1.1 bjh21 aSign = extractFloat64Sign( a );
3279 1.1 bjh21 bSign = extractFloat64Sign( b );
3280 1.1 bjh21 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3281 1.1 bjh21 return ( a != b ) && ( aSign ^ ( a < b ) );
3282 1.1 bjh21
3283 1.1 bjh21 }
3284 1.1 bjh21 #endif
3285 1.1 bjh21
3286 1.1 bjh21 #ifdef FLOATX80
3287 1.1 bjh21
3288 1.1 bjh21 /*
3289 1.1 bjh21 -------------------------------------------------------------------------------
3290 1.1 bjh21 Returns the result of converting the extended double-precision floating-
3291 1.1 bjh21 point value `a' to the 32-bit two's complement integer format. The
3292 1.1 bjh21 conversion is performed according to the IEC/IEEE Standard for Binary
3293 1.1 bjh21 Floating-Point Arithmetic---which means in particular that the conversion
3294 1.1 bjh21 is rounded according to the current rounding mode. If `a' is a NaN, the
3295 1.1 bjh21 largest positive integer is returned. Otherwise, if the conversion
3296 1.1 bjh21 overflows, the largest integer with the same sign as `a' is returned.
3297 1.1 bjh21 -------------------------------------------------------------------------------
3298 1.1 bjh21 */
3299 1.1 bjh21 int32 floatx80_to_int32( floatx80 a )
3300 1.1 bjh21 {
3301 1.1 bjh21 flag aSign;
3302 1.1 bjh21 int32 aExp, shiftCount;
3303 1.1 bjh21 bits64 aSig;
3304 1.1 bjh21
3305 1.1 bjh21 aSig = extractFloatx80Frac( a );
3306 1.1 bjh21 aExp = extractFloatx80Exp( a );
3307 1.1 bjh21 aSign = extractFloatx80Sign( a );
3308 1.1 bjh21 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3309 1.1 bjh21 shiftCount = 0x4037 - aExp;
3310 1.1 bjh21 if ( shiftCount <= 0 ) shiftCount = 1;
3311 1.1 bjh21 shift64RightJamming( aSig, shiftCount, &aSig );
3312 1.1 bjh21 return roundAndPackInt32( aSign, aSig );
3313 1.1 bjh21
3314 1.1 bjh21 }
3315 1.1 bjh21
3316 1.1 bjh21 /*
3317 1.1 bjh21 -------------------------------------------------------------------------------
3318 1.1 bjh21 Returns the result of converting the extended double-precision floating-
3319 1.1 bjh21 point value `a' to the 32-bit two's complement integer format. The
3320 1.1 bjh21 conversion is performed according to the IEC/IEEE Standard for Binary
3321 1.1 bjh21 Floating-Point Arithmetic, except that the conversion is always rounded
3322 1.1 bjh21 toward zero. If `a' is a NaN, the largest positive integer is returned.
3323 1.1 bjh21 Otherwise, if the conversion overflows, the largest integer with the same
3324 1.1 bjh21 sign as `a' is returned.
3325 1.1 bjh21 -------------------------------------------------------------------------------
3326 1.1 bjh21 */
3327 1.1 bjh21 int32 floatx80_to_int32_round_to_zero( floatx80 a )
3328 1.1 bjh21 {
3329 1.1 bjh21 flag aSign;
3330 1.1 bjh21 int32 aExp, shiftCount;
3331 1.1 bjh21 bits64 aSig, savedASig;
3332 1.1 bjh21 int32 z;
3333 1.1 bjh21
3334 1.1 bjh21 aSig = extractFloatx80Frac( a );
3335 1.1 bjh21 aExp = extractFloatx80Exp( a );
3336 1.1 bjh21 aSign = extractFloatx80Sign( a );
3337 1.1 bjh21 if ( 0x401E < aExp ) {
3338 1.1 bjh21 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3339 1.1 bjh21 goto invalid;
3340 1.1 bjh21 }
3341 1.1 bjh21 else if ( aExp < 0x3FFF ) {
3342 1.11.2.1 tls if ( aExp || aSig ) set_float_exception_inexact_flag();
3343 1.1 bjh21 return 0;
3344 1.1 bjh21 }
3345 1.1 bjh21 shiftCount = 0x403E - aExp;
3346 1.1 bjh21 savedASig = aSig;
3347 1.1 bjh21 aSig >>= shiftCount;
3348 1.1 bjh21 z = aSig;
3349 1.1 bjh21 if ( aSign ) z = - z;
3350 1.1 bjh21 if ( ( z < 0 ) ^ aSign ) {
3351 1.1 bjh21 invalid:
3352 1.1 bjh21 float_raise( float_flag_invalid );
3353 1.1 bjh21 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3354 1.1 bjh21 }
3355 1.1 bjh21 if ( ( aSig<<shiftCount ) != savedASig ) {
3356 1.11.2.1 tls set_float_exception_inexact_flag();
3357 1.1 bjh21 }
3358 1.1 bjh21 return z;
3359 1.1 bjh21
3360 1.1 bjh21 }
3361 1.1 bjh21
3362 1.1 bjh21 /*
3363 1.1 bjh21 -------------------------------------------------------------------------------
3364 1.1 bjh21 Returns the result of converting the extended double-precision floating-
3365 1.1 bjh21 point value `a' to the 64-bit two's complement integer format. The
3366 1.1 bjh21 conversion is performed according to the IEC/IEEE Standard for Binary
3367 1.1 bjh21 Floating-Point Arithmetic---which means in particular that the conversion
3368 1.1 bjh21 is rounded according to the current rounding mode. If `a' is a NaN,
3369 1.1 bjh21 the largest positive integer is returned. Otherwise, if the conversion
3370 1.1 bjh21 overflows, the largest integer with the same sign as `a' is returned.
3371 1.1 bjh21 -------------------------------------------------------------------------------
3372 1.1 bjh21 */
3373 1.1 bjh21 int64 floatx80_to_int64( floatx80 a )
3374 1.1 bjh21 {
3375 1.1 bjh21 flag aSign;
3376 1.1 bjh21 int32 aExp, shiftCount;
3377 1.1 bjh21 bits64 aSig, aSigExtra;
3378 1.1 bjh21
3379 1.1 bjh21 aSig = extractFloatx80Frac( a );
3380 1.1 bjh21 aExp = extractFloatx80Exp( a );
3381 1.1 bjh21 aSign = extractFloatx80Sign( a );
3382 1.1 bjh21 shiftCount = 0x403E - aExp;
3383 1.1 bjh21 if ( shiftCount <= 0 ) {
3384 1.1 bjh21 if ( shiftCount ) {
3385 1.1 bjh21 float_raise( float_flag_invalid );
3386 1.1 bjh21 if ( ! aSign
3387 1.1 bjh21 || ( ( aExp == 0x7FFF )
3388 1.1 bjh21 && ( aSig != LIT64( 0x8000000000000000 ) ) )
3389 1.1 bjh21 ) {
3390 1.1 bjh21 return LIT64( 0x7FFFFFFFFFFFFFFF );
3391 1.1 bjh21 }
3392 1.1 bjh21 return (sbits64) LIT64( 0x8000000000000000 );
3393 1.1 bjh21 }
3394 1.1 bjh21 aSigExtra = 0;
3395 1.1 bjh21 }
3396 1.1 bjh21 else {
3397 1.1 bjh21 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3398 1.1 bjh21 }
3399 1.1 bjh21 return roundAndPackInt64( aSign, aSig, aSigExtra );
3400 1.1 bjh21
3401 1.1 bjh21 }
3402 1.1 bjh21
3403 1.1 bjh21 /*
3404 1.1 bjh21 -------------------------------------------------------------------------------
3405 1.1 bjh21 Returns the result of converting the extended double-precision floating-
3406 1.1 bjh21 point value `a' to the 64-bit two's complement integer format. The
3407 1.1 bjh21 conversion is performed according to the IEC/IEEE Standard for Binary
3408 1.1 bjh21 Floating-Point Arithmetic, except that the conversion is always rounded
3409 1.1 bjh21 toward zero. If `a' is a NaN, the largest positive integer is returned.
3410 1.1 bjh21 Otherwise, if the conversion overflows, the largest integer with the same
3411 1.1 bjh21 sign as `a' is returned.
3412 1.1 bjh21 -------------------------------------------------------------------------------
3413 1.1 bjh21 */
3414 1.1 bjh21 int64 floatx80_to_int64_round_to_zero( floatx80 a )
3415 1.1 bjh21 {
3416 1.1 bjh21 flag aSign;
3417 1.1 bjh21 int32 aExp, shiftCount;
3418 1.1 bjh21 bits64 aSig;
3419 1.1 bjh21 int64 z;
3420 1.1 bjh21
3421 1.1 bjh21 aSig = extractFloatx80Frac( a );
3422 1.1 bjh21 aExp = extractFloatx80Exp( a );
3423 1.1 bjh21 aSign = extractFloatx80Sign( a );
3424 1.1 bjh21 shiftCount = aExp - 0x403E;
3425 1.1 bjh21 if ( 0 <= shiftCount ) {
3426 1.1 bjh21 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3427 1.1 bjh21 if ( ( a.high != 0xC03E ) || aSig ) {
3428 1.1 bjh21 float_raise( float_flag_invalid );
3429 1.1 bjh21 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3430 1.1 bjh21 return LIT64( 0x7FFFFFFFFFFFFFFF );
3431 1.1 bjh21 }
3432 1.1 bjh21 }
3433 1.1 bjh21 return (sbits64) LIT64( 0x8000000000000000 );
3434 1.1 bjh21 }
3435 1.1 bjh21 else if ( aExp < 0x3FFF ) {
3436 1.11.2.1 tls if ( aExp | aSig ) set_float_exception_inexact_flag();
3437 1.1 bjh21 return 0;
3438 1.1 bjh21 }
3439 1.1 bjh21 z = aSig>>( - shiftCount );
3440 1.1 bjh21 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3441 1.11.2.1 tls set_float_exception_inexact_flag();
3442 1.1 bjh21 }
3443 1.1 bjh21 if ( aSign ) z = - z;
3444 1.1 bjh21 return z;
3445 1.1 bjh21
3446 1.1 bjh21 }
3447 1.1 bjh21
3448 1.1 bjh21 /*
3449 1.1 bjh21 -------------------------------------------------------------------------------
3450 1.1 bjh21 Returns the result of converting the extended double-precision floating-
3451 1.1 bjh21 point value `a' to the single-precision floating-point format. The
3452 1.1 bjh21 conversion is performed according to the IEC/IEEE Standard for Binary
3453 1.1 bjh21 Floating-Point Arithmetic.
3454 1.1 bjh21 -------------------------------------------------------------------------------
3455 1.1 bjh21 */
3456 1.1 bjh21 float32 floatx80_to_float32( floatx80 a )
3457 1.1 bjh21 {
3458 1.1 bjh21 flag aSign;
3459 1.1 bjh21 int32 aExp;
3460 1.1 bjh21 bits64 aSig;
3461 1.1 bjh21
3462 1.1 bjh21 aSig = extractFloatx80Frac( a );
3463 1.1 bjh21 aExp = extractFloatx80Exp( a );
3464 1.1 bjh21 aSign = extractFloatx80Sign( a );
3465 1.1 bjh21 if ( aExp == 0x7FFF ) {
3466 1.1 bjh21 if ( (bits64) ( aSig<<1 ) ) {
3467 1.1 bjh21 return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3468 1.1 bjh21 }
3469 1.1 bjh21 return packFloat32( aSign, 0xFF, 0 );
3470 1.1 bjh21 }
3471 1.1 bjh21 shift64RightJamming( aSig, 33, &aSig );
3472 1.1 bjh21 if ( aExp || aSig ) aExp -= 0x3F81;
3473 1.1 bjh21 return roundAndPackFloat32( aSign, aExp, aSig );
3474 1.1 bjh21
3475 1.1 bjh21 }
3476 1.1 bjh21
3477 1.1 bjh21 /*
3478 1.1 bjh21 -------------------------------------------------------------------------------
3479 1.1 bjh21 Returns the result of converting the extended double-precision floating-
3480 1.1 bjh21 point value `a' to the double-precision floating-point format. The
3481 1.1 bjh21 conversion is performed according to the IEC/IEEE Standard for Binary
3482 1.1 bjh21 Floating-Point Arithmetic.
3483 1.1 bjh21 -------------------------------------------------------------------------------
3484 1.1 bjh21 */
3485 1.1 bjh21 float64 floatx80_to_float64( floatx80 a )
3486 1.1 bjh21 {
3487 1.1 bjh21 flag aSign;
3488 1.1 bjh21 int32 aExp;
3489 1.1 bjh21 bits64 aSig, zSig;
3490 1.1 bjh21
3491 1.1 bjh21 aSig = extractFloatx80Frac( a );
3492 1.1 bjh21 aExp = extractFloatx80Exp( a );
3493 1.1 bjh21 aSign = extractFloatx80Sign( a );
3494 1.1 bjh21 if ( aExp == 0x7FFF ) {
3495 1.1 bjh21 if ( (bits64) ( aSig<<1 ) ) {
3496 1.1 bjh21 return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3497 1.1 bjh21 }
3498 1.1 bjh21 return packFloat64( aSign, 0x7FF, 0 );
3499 1.1 bjh21 }
3500 1.1 bjh21 shift64RightJamming( aSig, 1, &zSig );
3501 1.1 bjh21 if ( aExp || aSig ) aExp -= 0x3C01;
3502 1.1 bjh21 return roundAndPackFloat64( aSign, aExp, zSig );
3503 1.1 bjh21
3504 1.1 bjh21 }
3505 1.1 bjh21
3506 1.1 bjh21 #ifdef FLOAT128
3507 1.1 bjh21
3508 1.1 bjh21 /*
3509 1.1 bjh21 -------------------------------------------------------------------------------
3510 1.1 bjh21 Returns the result of converting the extended double-precision floating-
3511 1.1 bjh21 point value `a' to the quadruple-precision floating-point format. The
3512 1.1 bjh21 conversion is performed according to the IEC/IEEE Standard for Binary
3513 1.1 bjh21 Floating-Point Arithmetic.
3514 1.1 bjh21 -------------------------------------------------------------------------------
3515 1.1 bjh21 */
3516 1.1 bjh21 float128 floatx80_to_float128( floatx80 a )
3517 1.1 bjh21 {
3518 1.1 bjh21 flag aSign;
3519 1.1 bjh21 int16 aExp;
3520 1.1 bjh21 bits64 aSig, zSig0, zSig1;
3521 1.1 bjh21
3522 1.1 bjh21 aSig = extractFloatx80Frac( a );
3523 1.1 bjh21 aExp = extractFloatx80Exp( a );
3524 1.1 bjh21 aSign = extractFloatx80Sign( a );
3525 1.1 bjh21 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3526 1.1 bjh21 return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3527 1.1 bjh21 }
3528 1.1 bjh21 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3529 1.1 bjh21 return packFloat128( aSign, aExp, zSig0, zSig1 );
3530 1.1 bjh21
3531 1.1 bjh21 }
3532 1.1 bjh21
3533 1.1 bjh21 #endif
3534 1.1 bjh21
3535 1.1 bjh21 /*
3536 1.1 bjh21 -------------------------------------------------------------------------------
3537 1.1 bjh21 Rounds the extended double-precision floating-point value `a' to an integer,
3538 1.1 bjh21 and returns the result as an extended double-precision floating-point
3539 1.1 bjh21 value. The operation is performed according to the IEC/IEEE Standard for
3540 1.1 bjh21 Binary Floating-Point Arithmetic.
3541 1.1 bjh21 -------------------------------------------------------------------------------
3542 1.1 bjh21 */
3543 1.1 bjh21 floatx80 floatx80_round_to_int( floatx80 a )
3544 1.1 bjh21 {
3545 1.1 bjh21 flag aSign;
3546 1.1 bjh21 int32 aExp;
3547 1.1 bjh21 bits64 lastBitMask, roundBitsMask;
3548 1.1 bjh21 int8 roundingMode;
3549 1.1 bjh21 floatx80 z;
3550 1.1 bjh21
3551 1.1 bjh21 aExp = extractFloatx80Exp( a );
3552 1.1 bjh21 if ( 0x403E <= aExp ) {
3553 1.1 bjh21 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3554 1.1 bjh21 return propagateFloatx80NaN( a, a );
3555 1.1 bjh21 }
3556 1.1 bjh21 return a;
3557 1.1 bjh21 }
3558 1.1 bjh21 if ( aExp < 0x3FFF ) {
3559 1.1 bjh21 if ( ( aExp == 0 )
3560 1.1 bjh21 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3561 1.1 bjh21 return a;
3562 1.1 bjh21 }
3563 1.11.2.1 tls set_float_exception_inexact_flag();
3564 1.1 bjh21 aSign = extractFloatx80Sign( a );
3565 1.1 bjh21 switch ( float_rounding_mode ) {
3566 1.1 bjh21 case float_round_nearest_even:
3567 1.1 bjh21 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3568 1.1 bjh21 ) {
3569 1.1 bjh21 return
3570 1.1 bjh21 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3571 1.1 bjh21 }
3572 1.1 bjh21 break;
3573 1.1 bjh21 case float_round_to_zero:
3574 1.1 bjh21 break;
3575 1.1 bjh21 case float_round_down:
3576 1.1 bjh21 return
3577 1.1 bjh21 aSign ?
3578 1.1 bjh21 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3579 1.1 bjh21 : packFloatx80( 0, 0, 0 );
3580 1.1 bjh21 case float_round_up:
3581 1.1 bjh21 return
3582 1.1 bjh21 aSign ? packFloatx80( 1, 0, 0 )
3583 1.1 bjh21 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3584 1.1 bjh21 }
3585 1.1 bjh21 return packFloatx80( aSign, 0, 0 );
3586 1.1 bjh21 }
3587 1.1 bjh21 lastBitMask = 1;
3588 1.1 bjh21 lastBitMask <<= 0x403E - aExp;
3589 1.1 bjh21 roundBitsMask = lastBitMask - 1;
3590 1.1 bjh21 z = a;
3591 1.1 bjh21 roundingMode = float_rounding_mode;
3592 1.1 bjh21 if ( roundingMode == float_round_nearest_even ) {
3593 1.1 bjh21 z.low += lastBitMask>>1;
3594 1.1 bjh21 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3595 1.1 bjh21 }
3596 1.1 bjh21 else if ( roundingMode != float_round_to_zero ) {
3597 1.1 bjh21 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3598 1.1 bjh21 z.low += roundBitsMask;
3599 1.1 bjh21 }
3600 1.1 bjh21 }
3601 1.1 bjh21 z.low &= ~ roundBitsMask;
3602 1.1 bjh21 if ( z.low == 0 ) {
3603 1.1 bjh21 ++z.high;
3604 1.1 bjh21 z.low = LIT64( 0x8000000000000000 );
3605 1.1 bjh21 }
3606 1.11.2.1 tls if ( z.low != a.low ) set_float_exception_inexact_flag();
3607 1.1 bjh21 return z;
3608 1.1 bjh21
3609 1.1 bjh21 }
3610 1.1 bjh21
3611 1.1 bjh21 /*
3612 1.1 bjh21 -------------------------------------------------------------------------------
3613 1.1 bjh21 Returns the result of adding the absolute values of the extended double-
3614 1.1 bjh21 precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3615 1.1 bjh21 negated before being returned. `zSign' is ignored if the result is a NaN.
3616 1.1 bjh21 The addition is performed according to the IEC/IEEE Standard for Binary
3617 1.1 bjh21 Floating-Point Arithmetic.
3618 1.1 bjh21 -------------------------------------------------------------------------------
3619 1.1 bjh21 */
3620 1.1 bjh21 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3621 1.1 bjh21 {
3622 1.1 bjh21 int32 aExp, bExp, zExp;
3623 1.1 bjh21 bits64 aSig, bSig, zSig0, zSig1;
3624 1.1 bjh21 int32 expDiff;
3625 1.1 bjh21
3626 1.1 bjh21 aSig = extractFloatx80Frac( a );
3627 1.1 bjh21 aExp = extractFloatx80Exp( a );
3628 1.1 bjh21 bSig = extractFloatx80Frac( b );
3629 1.1 bjh21 bExp = extractFloatx80Exp( b );
3630 1.1 bjh21 expDiff = aExp - bExp;
3631 1.1 bjh21 if ( 0 < expDiff ) {
3632 1.1 bjh21 if ( aExp == 0x7FFF ) {
3633 1.1 bjh21 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3634 1.1 bjh21 return a;
3635 1.1 bjh21 }
3636 1.1 bjh21 if ( bExp == 0 ) --expDiff;
3637 1.1 bjh21 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3638 1.1 bjh21 zExp = aExp;
3639 1.1 bjh21 }
3640 1.1 bjh21 else if ( expDiff < 0 ) {
3641 1.1 bjh21 if ( bExp == 0x7FFF ) {
3642 1.1 bjh21 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3643 1.1 bjh21 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3644 1.1 bjh21 }
3645 1.1 bjh21 if ( aExp == 0 ) ++expDiff;
3646 1.1 bjh21 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3647 1.1 bjh21 zExp = bExp;
3648 1.1 bjh21 }
3649 1.1 bjh21 else {
3650 1.1 bjh21 if ( aExp == 0x7FFF ) {
3651 1.1 bjh21 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3652 1.1 bjh21 return propagateFloatx80NaN( a, b );
3653 1.1 bjh21 }
3654 1.1 bjh21 return a;
3655 1.1 bjh21 }
3656 1.1 bjh21 zSig1 = 0;
3657 1.1 bjh21 zSig0 = aSig + bSig;
3658 1.1 bjh21 if ( aExp == 0 ) {
3659 1.1 bjh21 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3660 1.1 bjh21 goto roundAndPack;
3661 1.1 bjh21 }
3662 1.1 bjh21 zExp = aExp;
3663 1.1 bjh21 goto shiftRight1;
3664 1.1 bjh21 }
3665 1.1 bjh21 zSig0 = aSig + bSig;
3666 1.1 bjh21 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3667 1.1 bjh21 shiftRight1:
3668 1.1 bjh21 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3669 1.1 bjh21 zSig0 |= LIT64( 0x8000000000000000 );
3670 1.1 bjh21 ++zExp;
3671 1.1 bjh21 roundAndPack:
3672 1.1 bjh21 return
3673 1.1 bjh21 roundAndPackFloatx80(
3674 1.1 bjh21 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3675 1.1 bjh21
3676 1.1 bjh21 }
3677 1.1 bjh21
3678 1.1 bjh21 /*
3679 1.1 bjh21 -------------------------------------------------------------------------------
3680 1.1 bjh21 Returns the result of subtracting the absolute values of the extended
3681 1.1 bjh21 double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3682 1.1 bjh21 difference is negated before being returned. `zSign' is ignored if the
3683 1.1 bjh21 result is a NaN. The subtraction is performed according to the IEC/IEEE
3684 1.1 bjh21 Standard for Binary Floating-Point Arithmetic.
3685 1.1 bjh21 -------------------------------------------------------------------------------
3686 1.1 bjh21 */
3687 1.1 bjh21 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3688 1.1 bjh21 {
3689 1.1 bjh21 int32 aExp, bExp, zExp;
3690 1.1 bjh21 bits64 aSig, bSig, zSig0, zSig1;
3691 1.1 bjh21 int32 expDiff;
3692 1.1 bjh21 floatx80 z;
3693 1.1 bjh21
3694 1.1 bjh21 aSig = extractFloatx80Frac( a );
3695 1.1 bjh21 aExp = extractFloatx80Exp( a );
3696 1.1 bjh21 bSig = extractFloatx80Frac( b );
3697 1.1 bjh21 bExp = extractFloatx80Exp( b );
3698 1.1 bjh21 expDiff = aExp - bExp;
3699 1.1 bjh21 if ( 0 < expDiff ) goto aExpBigger;
3700 1.1 bjh21 if ( expDiff < 0 ) goto bExpBigger;
3701 1.1 bjh21 if ( aExp == 0x7FFF ) {
3702 1.1 bjh21 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3703 1.1 bjh21 return propagateFloatx80NaN( a, b );
3704 1.1 bjh21 }
3705 1.1 bjh21 float_raise( float_flag_invalid );
3706 1.1 bjh21 z.low = floatx80_default_nan_low;
3707 1.1 bjh21 z.high = floatx80_default_nan_high;
3708 1.1 bjh21 return z;
3709 1.1 bjh21 }
3710 1.1 bjh21 if ( aExp == 0 ) {
3711 1.1 bjh21 aExp = 1;
3712 1.1 bjh21 bExp = 1;
3713 1.1 bjh21 }
3714 1.1 bjh21 zSig1 = 0;
3715 1.1 bjh21 if ( bSig < aSig ) goto aBigger;
3716 1.1 bjh21 if ( aSig < bSig ) goto bBigger;
3717 1.1 bjh21 return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
3718 1.1 bjh21 bExpBigger:
3719 1.1 bjh21 if ( bExp == 0x7FFF ) {
3720 1.1 bjh21 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3721 1.1 bjh21 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3722 1.1 bjh21 }
3723 1.1 bjh21 if ( aExp == 0 ) ++expDiff;
3724 1.1 bjh21 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3725 1.1 bjh21 bBigger:
3726 1.1 bjh21 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3727 1.1 bjh21 zExp = bExp;
3728 1.1 bjh21 zSign ^= 1;
3729 1.1 bjh21 goto normalizeRoundAndPack;
3730 1.1 bjh21 aExpBigger:
3731 1.1 bjh21 if ( aExp == 0x7FFF ) {
3732 1.1 bjh21 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3733 1.1 bjh21 return a;
3734 1.1 bjh21 }
3735 1.1 bjh21 if ( bExp == 0 ) --expDiff;
3736 1.1 bjh21 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3737 1.1 bjh21 aBigger:
3738 1.1 bjh21 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3739 1.1 bjh21 zExp = aExp;
3740 1.1 bjh21 normalizeRoundAndPack:
3741 1.1 bjh21 return
3742 1.1 bjh21 normalizeRoundAndPackFloatx80(
3743 1.1 bjh21 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3744 1.1 bjh21
3745 1.1 bjh21 }
3746 1.1 bjh21
3747 1.1 bjh21 /*
3748 1.1 bjh21 -------------------------------------------------------------------------------
3749 1.1 bjh21 Returns the result of adding the extended double-precision floating-point
3750 1.1 bjh21 values `a' and `b'. The operation is performed according to the IEC/IEEE
3751 1.1 bjh21 Standard for Binary Floating-Point Arithmetic.
3752 1.1 bjh21 -------------------------------------------------------------------------------
3753 1.1 bjh21 */
3754 1.1 bjh21 floatx80 floatx80_add( floatx80 a, floatx80 b )
3755 1.1 bjh21 {
3756 1.1 bjh21 flag aSign, bSign;
3757 1.1 bjh21
3758 1.1 bjh21 aSign = extractFloatx80Sign( a );
3759 1.1 bjh21 bSign = extractFloatx80Sign( b );
3760 1.1 bjh21 if ( aSign == bSign ) {
3761 1.1 bjh21 return addFloatx80Sigs( a, b, aSign );
3762 1.1 bjh21 }
3763 1.1 bjh21 else {
3764 1.1 bjh21 return subFloatx80Sigs( a, b, aSign );
3765 1.1 bjh21 }
3766 1.1 bjh21
3767 1.1 bjh21 }
3768 1.1 bjh21
3769 1.1 bjh21 /*
3770 1.1 bjh21 -------------------------------------------------------------------------------
3771 1.1 bjh21 Returns the result of subtracting the extended double-precision floating-
3772 1.1 bjh21 point values `a' and `b'. The operation is performed according to the
3773 1.1 bjh21 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3774 1.1 bjh21 -------------------------------------------------------------------------------
3775 1.1 bjh21 */
3776 1.1 bjh21 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3777 1.1 bjh21 {
3778 1.1 bjh21 flag aSign, bSign;
3779 1.1 bjh21
3780 1.1 bjh21 aSign = extractFloatx80Sign( a );
3781 1.1 bjh21 bSign = extractFloatx80Sign( b );
3782 1.1 bjh21 if ( aSign == bSign ) {
3783 1.1 bjh21 return subFloatx80Sigs( a, b, aSign );
3784 1.1 bjh21 }
3785 1.1 bjh21 else {
3786 1.1 bjh21 return addFloatx80Sigs( a, b, aSign );
3787 1.1 bjh21 }
3788 1.1 bjh21
3789 1.1 bjh21 }
3790 1.1 bjh21
3791 1.1 bjh21 /*
3792 1.1 bjh21 -------------------------------------------------------------------------------
3793 1.1 bjh21 Returns the result of multiplying the extended double-precision floating-
3794 1.1 bjh21 point values `a' and `b'. The operation is performed according to the
3795 1.1 bjh21 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3796 1.1 bjh21 -------------------------------------------------------------------------------
3797 1.1 bjh21 */
3798 1.1 bjh21 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3799 1.1 bjh21 {
3800 1.1 bjh21 flag aSign, bSign, zSign;
3801 1.1 bjh21 int32 aExp, bExp, zExp;
3802 1.1 bjh21 bits64 aSig, bSig, zSig0, zSig1;
3803 1.1 bjh21 floatx80 z;
3804 1.1 bjh21
3805 1.1 bjh21 aSig = extractFloatx80Frac( a );
3806 1.1 bjh21 aExp = extractFloatx80Exp( a );
3807 1.1 bjh21 aSign = extractFloatx80Sign( a );
3808 1.1 bjh21 bSig = extractFloatx80Frac( b );
3809 1.1 bjh21 bExp = extractFloatx80Exp( b );
3810 1.1 bjh21 bSign = extractFloatx80Sign( b );
3811 1.1 bjh21 zSign = aSign ^ bSign;
3812 1.1 bjh21 if ( aExp == 0x7FFF ) {
3813 1.1 bjh21 if ( (bits64) ( aSig<<1 )
3814 1.1 bjh21 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3815 1.1 bjh21 return propagateFloatx80NaN( a, b );
3816 1.1 bjh21 }
3817 1.1 bjh21 if ( ( bExp | bSig ) == 0 ) goto invalid;
3818 1.1 bjh21 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3819 1.1 bjh21 }
3820 1.1 bjh21 if ( bExp == 0x7FFF ) {
3821 1.1 bjh21 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3822 1.1 bjh21 if ( ( aExp | aSig ) == 0 ) {
3823 1.1 bjh21 invalid:
3824 1.1 bjh21 float_raise( float_flag_invalid );
3825 1.1 bjh21 z.low = floatx80_default_nan_low;
3826 1.1 bjh21 z.high = floatx80_default_nan_high;
3827 1.1 bjh21 return z;
3828 1.1 bjh21 }
3829 1.1 bjh21 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3830 1.1 bjh21 }
3831 1.1 bjh21 if ( aExp == 0 ) {
3832 1.1 bjh21 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3833 1.1 bjh21 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3834 1.1 bjh21 }
3835 1.1 bjh21 if ( bExp == 0 ) {
3836 1.1 bjh21 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3837 1.1 bjh21 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3838 1.1 bjh21 }
3839 1.1 bjh21 zExp = aExp + bExp - 0x3FFE;
3840 1.1 bjh21 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3841 1.1 bjh21 if ( 0 < (sbits64) zSig0 ) {
3842 1.1 bjh21 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3843 1.1 bjh21 --zExp;
3844 1.1 bjh21 }
3845 1.1 bjh21 return
3846 1.1 bjh21 roundAndPackFloatx80(
3847 1.1 bjh21 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3848 1.1 bjh21
3849 1.1 bjh21 }
3850 1.1 bjh21
3851 1.1 bjh21 /*
3852 1.1 bjh21 -------------------------------------------------------------------------------
3853 1.1 bjh21 Returns the result of dividing the extended double-precision floating-point
3854 1.1 bjh21 value `a' by the corresponding value `b'. The operation is performed
3855 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3856 1.1 bjh21 -------------------------------------------------------------------------------
3857 1.1 bjh21 */
3858 1.1 bjh21 floatx80 floatx80_div( floatx80 a, floatx80 b )
3859 1.1 bjh21 {
/*
 * Computes a / b.  NaN, infinity and zero operands are resolved first.
 * Otherwise a 128-bit quotient zSig0:zSig1 is assembled from two 64-bit
 * estimates and passed to roundAndPackFloatx80() together with the
 * exclusive-or of the operand signs.
 */
3860 1.1 bjh21 flag aSign, bSign, zSign;
3861 1.1 bjh21 int32 aExp, bExp, zExp;
3862 1.1 bjh21 bits64 aSig, bSig, zSig0, zSig1;
3863 1.1 bjh21 bits64 rem0, rem1, rem2, term0, term1, term2;
3864 1.1 bjh21 floatx80 z;
3865 1.1 bjh21
3866 1.1 bjh21 aSig = extractFloatx80Frac( a );
3867 1.1 bjh21 aExp = extractFloatx80Exp( a );
3868 1.1 bjh21 aSign = extractFloatx80Sign( a );
3869 1.1 bjh21 bSig = extractFloatx80Frac( b );
3870 1.1 bjh21 bExp = extractFloatx80Exp( b );
3871 1.1 bjh21 bSign = extractFloatx80Sign( b );
3872 1.1 bjh21 zSign = aSign ^ bSign;
/* Dividend is an infinity or a NaN. */
3873 1.1 bjh21 if ( aExp == 0x7FFF ) {
3874 1.1 bjh21 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3875 1.1 bjh21 if ( bExp == 0x7FFF ) {
3876 1.1 bjh21 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
/* Infinity divided by infinity. */
3877 1.1 bjh21 goto invalid;
3878 1.1 bjh21 }
3879 1.1 bjh21 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3880 1.1 bjh21 }
/* Divisor is an infinity or a NaN; a non-NaN divisor here gives a signed zero. */
3881 1.1 bjh21 if ( bExp == 0x7FFF ) {
3882 1.1 bjh21 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3883 1.1 bjh21 return packFloatx80( zSign, 0, 0 );
3884 1.1 bjh21 }
/* Divisor is zero or subnormal. */
3885 1.1 bjh21 if ( bExp == 0 ) {
3886 1.1 bjh21 if ( bSig == 0 ) {
/* Zero divided by zero is invalid; this label is also the target for inf/inf. */
3887 1.1 bjh21 if ( ( aExp | aSig ) == 0 ) {
3888 1.1 bjh21 invalid:
3889 1.1 bjh21 float_raise( float_flag_invalid );
3890 1.1 bjh21 z.low = floatx80_default_nan_low;
3891 1.1 bjh21 z.high = floatx80_default_nan_high;
3892 1.1 bjh21 return z;
3893 1.1 bjh21 }
/* Nonzero divided by zero: raise divide-by-zero and return a signed infinity. */
3894 1.1 bjh21 float_raise( float_flag_divbyzero );
3895 1.1 bjh21 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3896 1.1 bjh21 }
3897 1.1 bjh21 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3898 1.1 bjh21 }
/* Dividend is zero (signed zero result) or subnormal (normalized first). */
3899 1.1 bjh21 if ( aExp == 0 ) {
3900 1.1 bjh21 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3901 1.1 bjh21 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3902 1.1 bjh21 }
3903 1.1 bjh21 zExp = aExp - bExp + 0x3FFE;
3904 1.1 bjh21 rem1 = 0;
/*
 * When the dividend significand is not below the divisor's, halve it (the
 * bit shifted out lands in rem1) and compensate in the exponent.
 * NOTE(review): presumably estimateDiv128To64() needs the high word of the
 * dividend to be smaller than the divisor -- confirm against its definition.
 */
3905 1.1 bjh21 if ( bSig <= aSig ) {
3906 1.1 bjh21 shift128Right( aSig, 0, 1, &aSig, &rem1 );
3907 1.1 bjh21 ++zExp;
3908 1.1 bjh21 }
/* High quotient word: estimate, then form remainder = dividend - bSig * estimate. */
3909 1.1 bjh21 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3910 1.1 bjh21 mul64To128( bSig, zSig0, &term0, &term1 );
3911 1.1 bjh21 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
/* A negative remainder means the estimate was too large: step it down. */
3912 1.1 bjh21 while ( (sbits64) rem0 < 0 ) {
3913 1.1 bjh21 --zSig0;
3914 1.1 bjh21 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3915 1.1 bjh21 }
/* Low quotient word, estimated from the remaining remainder. */
3916 1.1 bjh21 zSig1 = estimateDiv128To64( rem1, 0, bSig );
/*
 * The low word is corrected exactly only when its low 63 bits are at most 4.
 * NOTE(review): presumably because only then can the estimate's error
 * influence rounding -- confirm.
 */
3917 1.1 bjh21 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3918 1.1 bjh21 mul64To128( bSig, zSig1, &term1, &term2 );
3919 1.1 bjh21 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3920 1.1 bjh21 while ( (sbits64) rem1 < 0 ) {
3921 1.1 bjh21 --zSig1;
3922 1.1 bjh21 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3923 1.1 bjh21 }
/* Record any nonzero final remainder in the lowest bit. */
3924 1.1 bjh21 zSig1 |= ( ( rem1 | rem2 ) != 0 );
3925 1.1 bjh21 }
3926 1.1 bjh21 return
3927 1.1 bjh21 roundAndPackFloatx80(
3928 1.1 bjh21 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3929 1.1 bjh21
3930 1.1 bjh21 }
3931 1.1 bjh21
3932 1.1 bjh21 /*
3933 1.1 bjh21 -------------------------------------------------------------------------------
3934 1.1 bjh21 Returns the remainder of the extended double-precision floating-point value
3935 1.1 bjh21 `a' with respect to the corresponding value `b'. The operation is performed
3936 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3937 1.1 bjh21 -------------------------------------------------------------------------------
3938 1.1 bjh21 */
3939 1.1 bjh21 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3940 1.1 bjh21 {
3941 1.1 bjh21 flag aSign, bSign, zSign;
3942 1.1 bjh21 int32 aExp, bExp, expDiff;
3943 1.1 bjh21 bits64 aSig0, aSig1, bSig;
3944 1.1 bjh21 bits64 q, term0, term1, alternateASig0, alternateASig1;
3945 1.1 bjh21 floatx80 z;
3946 1.1 bjh21
3947 1.1 bjh21 aSig0 = extractFloatx80Frac( a );
3948 1.1 bjh21 aExp = extractFloatx80Exp( a );
3949 1.1 bjh21 aSign = extractFloatx80Sign( a );
3950 1.1 bjh21 bSig = extractFloatx80Frac( b );
3951 1.1 bjh21 bExp = extractFloatx80Exp( b );
3952 1.1 bjh21 bSign = extractFloatx80Sign( b );
3953 1.1 bjh21 if ( aExp == 0x7FFF ) {
3954 1.1 bjh21 if ( (bits64) ( aSig0<<1 )
3955 1.1 bjh21 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3956 1.1 bjh21 return propagateFloatx80NaN( a, b );
3957 1.1 bjh21 }
3958 1.1 bjh21 goto invalid;
3959 1.1 bjh21 }
3960 1.1 bjh21 if ( bExp == 0x7FFF ) {
3961 1.1 bjh21 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3962 1.1 bjh21 return a;
3963 1.1 bjh21 }
3964 1.1 bjh21 if ( bExp == 0 ) {
3965 1.1 bjh21 if ( bSig == 0 ) {
3966 1.1 bjh21 invalid:
3967 1.1 bjh21 float_raise( float_flag_invalid );
3968 1.1 bjh21 z.low = floatx80_default_nan_low;
3969 1.1 bjh21 z.high = floatx80_default_nan_high;
3970 1.1 bjh21 return z;
3971 1.1 bjh21 }
3972 1.1 bjh21 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3973 1.1 bjh21 }
3974 1.1 bjh21 if ( aExp == 0 ) {
3975 1.1 bjh21 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3976 1.1 bjh21 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3977 1.1 bjh21 }
3978 1.1 bjh21 bSig |= LIT64( 0x8000000000000000 );
3979 1.1 bjh21 zSign = aSign;
3980 1.1 bjh21 expDiff = aExp - bExp;
3981 1.1 bjh21 aSig1 = 0;
3982 1.1 bjh21 if ( expDiff < 0 ) {
3983 1.1 bjh21 if ( expDiff < -1 ) return a;
3984 1.1 bjh21 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3985 1.1 bjh21 expDiff = 0;
3986 1.1 bjh21 }
3987 1.1 bjh21 q = ( bSig <= aSig0 );
3988 1.1 bjh21 if ( q ) aSig0 -= bSig;
3989 1.1 bjh21 expDiff -= 64;
3990 1.1 bjh21 while ( 0 < expDiff ) {
3991 1.1 bjh21 q = estimateDiv128To64( aSig0, aSig1, bSig );
3992 1.1 bjh21 q = ( 2 < q ) ? q - 2 : 0;
3993 1.1 bjh21 mul64To128( bSig, q, &term0, &term1 );
3994 1.1 bjh21 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3995 1.1 bjh21 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3996 1.1 bjh21 expDiff -= 62;
3997 1.1 bjh21 }
3998 1.1 bjh21 expDiff += 64;
3999 1.1 bjh21 if ( 0 < expDiff ) {
4000 1.1 bjh21 q = estimateDiv128To64( aSig0, aSig1, bSig );
4001 1.1 bjh21 q = ( 2 < q ) ? q - 2 : 0;
4002 1.1 bjh21 q >>= 64 - expDiff;
4003 1.1 bjh21 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4004 1.1 bjh21 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4005 1.1 bjh21 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4006 1.1 bjh21 while ( le128( term0, term1, aSig0, aSig1 ) ) {
4007 1.1 bjh21 ++q;
4008 1.1 bjh21 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4009 1.1 bjh21 }
4010 1.1 bjh21 }
4011 1.1 bjh21 else {
4012 1.1 bjh21 term1 = 0;
4013 1.1 bjh21 term0 = bSig;
4014 1.1 bjh21 }
4015 1.1 bjh21 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4016 1.1 bjh21 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4017 1.1 bjh21 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4018 1.1 bjh21 && ( q & 1 ) )
4019 1.1 bjh21 ) {
4020 1.1 bjh21 aSig0 = alternateASig0;
4021 1.1 bjh21 aSig1 = alternateASig1;
4022 1.1 bjh21 zSign = ! zSign;
4023 1.1 bjh21 }
4024 1.1 bjh21 return
4025 1.1 bjh21 normalizeRoundAndPackFloatx80(
4026 1.1 bjh21 80, zSign, bExp + expDiff, aSig0, aSig1 );
4027 1.1 bjh21
4028 1.1 bjh21 }
4029 1.1 bjh21
4030 1.1 bjh21 /*
4031 1.1 bjh21 -------------------------------------------------------------------------------
4032 1.1 bjh21 Returns the square root of the extended double-precision floating-point
4033 1.1 bjh21 value `a'. The operation is performed according to the IEC/IEEE Standard
4034 1.1 bjh21 for Binary Floating-Point Arithmetic.
4035 1.1 bjh21 -------------------------------------------------------------------------------
4036 1.1 bjh21 */
4037 1.1 bjh21 floatx80 floatx80_sqrt( floatx80 a )
4038 1.1 bjh21 {
/*
 * Computes the square root of a.  Special operands are resolved first; the
 * root is then built as a 128-bit value from a corrected high-word estimate
 * (zSig0) and a low-word estimate (zSig1), and passed to
 * roundAndPackFloatx80() with a positive sign.
 */
4039 1.1 bjh21 flag aSign;
4040 1.1 bjh21 int32 aExp, zExp;
4041 1.1 bjh21 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4042 1.1 bjh21 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4043 1.1 bjh21 floatx80 z;
4044 1.1 bjh21
4045 1.1 bjh21 aSig0 = extractFloatx80Frac( a );
4046 1.1 bjh21 aExp = extractFloatx80Exp( a );
4047 1.1 bjh21 aSign = extractFloatx80Sign( a );
/* NaN is propagated, +infinity is returned unchanged, -infinity is invalid. */
4048 1.1 bjh21 if ( aExp == 0x7FFF ) {
4049 1.1 bjh21 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
4050 1.1 bjh21 if ( ! aSign ) return a;
4051 1.1 bjh21 goto invalid;
4052 1.1 bjh21 }
/* Negative operand: negative zero is returned as is, anything else is invalid. */
4053 1.1 bjh21 if ( aSign ) {
4054 1.1 bjh21 if ( ( aExp | aSig0 ) == 0 ) return a;
4055 1.1 bjh21 invalid:
4056 1.1 bjh21 float_raise( float_flag_invalid );
4057 1.1 bjh21 z.low = floatx80_default_nan_low;
4058 1.1 bjh21 z.high = floatx80_default_nan_high;
4059 1.1 bjh21 return z;
4060 1.1 bjh21 }
/* Positive zero returns +0; a subnormal is normalized before proceeding. */
4061 1.1 bjh21 if ( aExp == 0 ) {
4062 1.1 bjh21 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4063 1.1 bjh21 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4064 1.1 bjh21 }
/* Result exponent: half of the unbiased exponent, with the bias added back. */
4065 1.1 bjh21 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
/* First estimate, taken from the exponent and the top 32 significand bits. */
4066 1.1 bjh21 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
/* Shift the significand right by 2, or by 3 when the exponent field is odd. */
4067 1.1 bjh21 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
/* Widen the estimate to 64 bits using a division by the first estimate. */
4068 1.1 bjh21 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4069 1.1 bjh21 doubleZSig0 = zSig0<<1;
/* Remainder = shifted significand - zSig0 squared. */
4070 1.1 bjh21 mul64To128( zSig0, zSig0, &term0, &term1 );
4071 1.1 bjh21 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
/* A negative remainder means zSig0 is too large: lower it, keeping doubleZSig0 in step. */
4072 1.1 bjh21 while ( (sbits64) rem0 < 0 ) {
4073 1.1 bjh21 --zSig0;
4074 1.1 bjh21 doubleZSig0 -= 2;
4075 1.1 bjh21 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4076 1.1 bjh21 }
/* Low word of the root, estimated as remainder / (2 * zSig0). */
4077 1.1 bjh21 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
/*
 * The low word is corrected exactly only when its low 62 bits are at most 5.
 * NOTE(review): presumably because only then can the estimate's error
 * influence rounding -- confirm.
 */
4078 1.1 bjh21 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4079 1.1 bjh21 if ( zSig1 == 0 ) zSig1 = 1;
4080 1.1 bjh21 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4081 1.1 bjh21 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4082 1.1 bjh21 mul64To128( zSig1, zSig1, &term2, &term3 );
4083 1.1 bjh21 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
/* Negative remainder: lower zSig1 and add the matching correction term back. */
4084 1.1 bjh21 while ( (sbits64) rem1 < 0 ) {
4085 1.1 bjh21 --zSig1;
4086 1.1 bjh21 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4087 1.1 bjh21 term3 |= 1;
4088 1.1 bjh21 term2 |= doubleZSig0;
4089 1.1 bjh21 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4090 1.1 bjh21 }
/* Record any nonzero final remainder in the lowest bit. */
4091 1.1 bjh21 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4092 1.1 bjh21 }
/* Shift zSig1 left one bit across two words and merge doubleZSig0 into the high word. */
4093 1.1 bjh21 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4094 1.1 bjh21 zSig0 |= doubleZSig0;
4095 1.1 bjh21 return
4096 1.1 bjh21 roundAndPackFloatx80(
4097 1.1 bjh21 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4098 1.1 bjh21
4099 1.1 bjh21 }
4100 1.1 bjh21
4101 1.1 bjh21 /*
4102 1.1 bjh21 -------------------------------------------------------------------------------
4103 1.1 bjh21 Returns 1 if the extended double-precision floating-point value `a' is
4104 1.1 bjh21 equal to the corresponding value `b', and 0 otherwise. The comparison is
4105 1.1 bjh21 performed according to the IEC/IEEE Standard for Binary Floating-Point
4106 1.1 bjh21 Arithmetic.
4107 1.1 bjh21 -------------------------------------------------------------------------------
4108 1.1 bjh21 */
4109 1.1 bjh21 flag floatx80_eq( floatx80 a, floatx80 b )
4110 1.1 bjh21 {
4111 1.1 bjh21
4112 1.1 bjh21 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4113 1.1 bjh21 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4114 1.1 bjh21 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4115 1.1 bjh21 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4116 1.1 bjh21 ) {
4117 1.1 bjh21 if ( floatx80_is_signaling_nan( a )
4118 1.1 bjh21 || floatx80_is_signaling_nan( b ) ) {
4119 1.1 bjh21 float_raise( float_flag_invalid );
4120 1.1 bjh21 }
4121 1.1 bjh21 return 0;
4122 1.1 bjh21 }
4123 1.1 bjh21 return
4124 1.1 bjh21 ( a.low == b.low )
4125 1.1 bjh21 && ( ( a.high == b.high )
4126 1.1 bjh21 || ( ( a.low == 0 )
4127 1.1 bjh21 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4128 1.1 bjh21 );
4129 1.1 bjh21
4130 1.1 bjh21 }
4131 1.1 bjh21
4132 1.1 bjh21 /*
4133 1.1 bjh21 -------------------------------------------------------------------------------
4134 1.1 bjh21 Returns 1 if the extended double-precision floating-point value `a' is
4135 1.1 bjh21 less than or equal to the corresponding value `b', and 0 otherwise. The
4136 1.1 bjh21 comparison is performed according to the IEC/IEEE Standard for Binary
4137 1.1 bjh21 Floating-Point Arithmetic.
4138 1.1 bjh21 -------------------------------------------------------------------------------
4139 1.1 bjh21 */
4140 1.1 bjh21 flag floatx80_le( floatx80 a, floatx80 b )
4141 1.1 bjh21 {
4142 1.1 bjh21 flag aSign, bSign;
4143 1.1 bjh21
4144 1.1 bjh21 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4145 1.1 bjh21 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4146 1.1 bjh21 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4147 1.1 bjh21 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4148 1.1 bjh21 ) {
4149 1.1 bjh21 float_raise( float_flag_invalid );
4150 1.1 bjh21 return 0;
4151 1.1 bjh21 }
4152 1.1 bjh21 aSign = extractFloatx80Sign( a );
4153 1.1 bjh21 bSign = extractFloatx80Sign( b );
4154 1.1 bjh21 if ( aSign != bSign ) {
4155 1.1 bjh21 return
4156 1.1 bjh21 aSign
4157 1.1 bjh21 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4158 1.1 bjh21 == 0 );
4159 1.1 bjh21 }
4160 1.1 bjh21 return
4161 1.1 bjh21 aSign ? le128( b.high, b.low, a.high, a.low )
4162 1.1 bjh21 : le128( a.high, a.low, b.high, b.low );
4163 1.1 bjh21
4164 1.1 bjh21 }
4165 1.1 bjh21
4166 1.1 bjh21 /*
4167 1.1 bjh21 -------------------------------------------------------------------------------
4168 1.1 bjh21 Returns 1 if the extended double-precision floating-point value `a' is
4169 1.1 bjh21 less than the corresponding value `b', and 0 otherwise. The comparison
4170 1.1 bjh21 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4171 1.1 bjh21 Arithmetic.
4172 1.1 bjh21 -------------------------------------------------------------------------------
4173 1.1 bjh21 */
4174 1.1 bjh21 flag floatx80_lt( floatx80 a, floatx80 b )
4175 1.1 bjh21 {
4176 1.1 bjh21 flag aSign, bSign;
4177 1.1 bjh21
4178 1.1 bjh21 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4179 1.1 bjh21 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4180 1.1 bjh21 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4181 1.1 bjh21 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4182 1.1 bjh21 ) {
4183 1.1 bjh21 float_raise( float_flag_invalid );
4184 1.1 bjh21 return 0;
4185 1.1 bjh21 }
4186 1.1 bjh21 aSign = extractFloatx80Sign( a );
4187 1.1 bjh21 bSign = extractFloatx80Sign( b );
4188 1.1 bjh21 if ( aSign != bSign ) {
4189 1.1 bjh21 return
4190 1.1 bjh21 aSign
4191 1.1 bjh21 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4192 1.1 bjh21 != 0 );
4193 1.1 bjh21 }
4194 1.1 bjh21 return
4195 1.1 bjh21 aSign ? lt128( b.high, b.low, a.high, a.low )
4196 1.1 bjh21 : lt128( a.high, a.low, b.high, b.low );
4197 1.1 bjh21
4198 1.1 bjh21 }
4199 1.1 bjh21
4200 1.1 bjh21 /*
4201 1.1 bjh21 -------------------------------------------------------------------------------
4202 1.1 bjh21 Returns 1 if the extended double-precision floating-point value `a' is equal
4203 1.1 bjh21 to the corresponding value `b', and 0 otherwise. The invalid exception is
4204 1.1 bjh21 raised if either operand is a NaN. Otherwise, the comparison is performed
4205 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4206 1.1 bjh21 -------------------------------------------------------------------------------
4207 1.1 bjh21 */
4208 1.1 bjh21 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4209 1.1 bjh21 {
4210 1.1 bjh21
4211 1.1 bjh21 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4212 1.1 bjh21 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4213 1.1 bjh21 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4214 1.1 bjh21 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4215 1.1 bjh21 ) {
4216 1.1 bjh21 float_raise( float_flag_invalid );
4217 1.1 bjh21 return 0;
4218 1.1 bjh21 }
4219 1.1 bjh21 return
4220 1.1 bjh21 ( a.low == b.low )
4221 1.1 bjh21 && ( ( a.high == b.high )
4222 1.1 bjh21 || ( ( a.low == 0 )
4223 1.1 bjh21 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4224 1.1 bjh21 );
4225 1.1 bjh21
4226 1.1 bjh21 }
4227 1.1 bjh21
4228 1.1 bjh21 /*
4229 1.1 bjh21 -------------------------------------------------------------------------------
4230 1.1 bjh21 Returns 1 if the extended double-precision floating-point value `a' is less
4231 1.1 bjh21 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4232 1.1 bjh21 do not cause an exception. Otherwise, the comparison is performed according
4233 1.1 bjh21 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4234 1.1 bjh21 -------------------------------------------------------------------------------
4235 1.1 bjh21 */
4236 1.1 bjh21 flag floatx80_le_quiet( floatx80 a, floatx80 b )
4237 1.1 bjh21 {
4238 1.1 bjh21 flag aSign, bSign;
4239 1.1 bjh21
4240 1.1 bjh21 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4241 1.1 bjh21 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4242 1.1 bjh21 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4243 1.1 bjh21 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4244 1.1 bjh21 ) {
4245 1.1 bjh21 if ( floatx80_is_signaling_nan( a )
4246 1.1 bjh21 || floatx80_is_signaling_nan( b ) ) {
4247 1.1 bjh21 float_raise( float_flag_invalid );
4248 1.1 bjh21 }
4249 1.1 bjh21 return 0;
4250 1.1 bjh21 }
4251 1.1 bjh21 aSign = extractFloatx80Sign( a );
4252 1.1 bjh21 bSign = extractFloatx80Sign( b );
4253 1.1 bjh21 if ( aSign != bSign ) {
4254 1.1 bjh21 return
4255 1.1 bjh21 aSign
4256 1.1 bjh21 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4257 1.1 bjh21 == 0 );
4258 1.1 bjh21 }
4259 1.1 bjh21 return
4260 1.1 bjh21 aSign ? le128( b.high, b.low, a.high, a.low )
4261 1.1 bjh21 : le128( a.high, a.low, b.high, b.low );
4262 1.1 bjh21
4263 1.1 bjh21 }
4264 1.1 bjh21
4265 1.1 bjh21 /*
4266 1.1 bjh21 -------------------------------------------------------------------------------
4267 1.1 bjh21 Returns 1 if the extended double-precision floating-point value `a' is less
4268 1.1 bjh21 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4269 1.1 bjh21 an exception. Otherwise, the comparison is performed according to the
4270 1.1 bjh21 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4271 1.1 bjh21 -------------------------------------------------------------------------------
4272 1.1 bjh21 */
4273 1.1 bjh21 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4274 1.1 bjh21 {
4275 1.1 bjh21 flag aSign, bSign;
4276 1.1 bjh21
4277 1.1 bjh21 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4278 1.1 bjh21 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4279 1.1 bjh21 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4280 1.1 bjh21 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4281 1.1 bjh21 ) {
4282 1.1 bjh21 if ( floatx80_is_signaling_nan( a )
4283 1.1 bjh21 || floatx80_is_signaling_nan( b ) ) {
4284 1.1 bjh21 float_raise( float_flag_invalid );
4285 1.1 bjh21 }
4286 1.1 bjh21 return 0;
4287 1.1 bjh21 }
4288 1.1 bjh21 aSign = extractFloatx80Sign( a );
4289 1.1 bjh21 bSign = extractFloatx80Sign( b );
4290 1.1 bjh21 if ( aSign != bSign ) {
4291 1.1 bjh21 return
4292 1.1 bjh21 aSign
4293 1.1 bjh21 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4294 1.1 bjh21 != 0 );
4295 1.1 bjh21 }
4296 1.1 bjh21 return
4297 1.1 bjh21 aSign ? lt128( b.high, b.low, a.high, a.low )
4298 1.1 bjh21 : lt128( a.high, a.low, b.high, b.low );
4299 1.1 bjh21
4300 1.1 bjh21 }
4301 1.1 bjh21
4302 1.1 bjh21 #endif
4303 1.1 bjh21
4304 1.1 bjh21 #ifdef FLOAT128
4305 1.1 bjh21
4306 1.1 bjh21 /*
4307 1.1 bjh21 -------------------------------------------------------------------------------
4308 1.1 bjh21 Returns the result of converting the quadruple-precision floating-point
4309 1.1 bjh21 value `a' to the 32-bit two's complement integer format. The conversion
4310 1.1 bjh21 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4311 1.1 bjh21 Arithmetic---which means in particular that the conversion is rounded
4312 1.1 bjh21 according to the current rounding mode. If `a' is a NaN, the largest
4313 1.1 bjh21 positive integer is returned. Otherwise, if the conversion overflows, the
4314 1.1 bjh21 largest integer with the same sign as `a' is returned.
4315 1.1 bjh21 -------------------------------------------------------------------------------
4316 1.1 bjh21 */
4317 1.1 bjh21 int32 float128_to_int32( float128 a )
4318 1.1 bjh21 {
4319 1.1 bjh21 flag aSign;
4320 1.1 bjh21 int32 aExp, shiftCount;
4321 1.1 bjh21 bits64 aSig0, aSig1;
4322 1.1 bjh21
4323 1.1 bjh21 aSig1 = extractFloat128Frac1( a );
4324 1.1 bjh21 aSig0 = extractFloat128Frac0( a );
4325 1.1 bjh21 aExp = extractFloat128Exp( a );
4326 1.1 bjh21 aSign = extractFloat128Sign( a );
4327 1.1 bjh21 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4328 1.1 bjh21 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4329 1.1 bjh21 aSig0 |= ( aSig1 != 0 );
4330 1.1 bjh21 shiftCount = 0x4028 - aExp;
4331 1.1 bjh21 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4332 1.1 bjh21 return roundAndPackInt32( aSign, aSig0 );
4333 1.1 bjh21
4334 1.1 bjh21 }
4335 1.1 bjh21
4336 1.1 bjh21 /*
4337 1.1 bjh21 -------------------------------------------------------------------------------
4338 1.1 bjh21 Returns the result of converting the quadruple-precision floating-point
4339 1.1 bjh21 value `a' to the 32-bit two's complement integer format. The conversion
4340 1.1 bjh21 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4341 1.1 bjh21 Arithmetic, except that the conversion is always rounded toward zero. If
4342 1.1 bjh21 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4343 1.1 bjh21 conversion overflows, the largest integer with the same sign as `a' is
4344 1.1 bjh21 returned.
4345 1.1 bjh21 -------------------------------------------------------------------------------
4346 1.1 bjh21 */
4347 1.1 bjh21 int32 float128_to_int32_round_to_zero( float128 a )
4348 1.1 bjh21 {
4349 1.1 bjh21 flag aSign;
4350 1.1 bjh21 int32 aExp, shiftCount;
4351 1.1 bjh21 bits64 aSig0, aSig1, savedASig;
4352 1.1 bjh21 int32 z;
4353 1.1 bjh21
4354 1.1 bjh21 aSig1 = extractFloat128Frac1( a );
4355 1.1 bjh21 aSig0 = extractFloat128Frac0( a );
4356 1.1 bjh21 aExp = extractFloat128Exp( a );
4357 1.1 bjh21 aSign = extractFloat128Sign( a );
4358 1.1 bjh21 aSig0 |= ( aSig1 != 0 );
4359 1.1 bjh21 if ( 0x401E < aExp ) {
4360 1.1 bjh21 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4361 1.1 bjh21 goto invalid;
4362 1.1 bjh21 }
4363 1.1 bjh21 else if ( aExp < 0x3FFF ) {
4364 1.11.2.1 tls if ( aExp || aSig0 ) set_float_exception_inexact_flag();
4365 1.1 bjh21 return 0;
4366 1.1 bjh21 }
4367 1.1 bjh21 aSig0 |= LIT64( 0x0001000000000000 );
4368 1.1 bjh21 shiftCount = 0x402F - aExp;
4369 1.1 bjh21 savedASig = aSig0;
4370 1.1 bjh21 aSig0 >>= shiftCount;
4371 1.10 christos z = (int32)aSig0;
4372 1.1 bjh21 if ( aSign ) z = - z;
4373 1.1 bjh21 if ( ( z < 0 ) ^ aSign ) {
4374 1.1 bjh21 invalid:
4375 1.1 bjh21 float_raise( float_flag_invalid );
4376 1.1 bjh21 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4377 1.1 bjh21 }
4378 1.1 bjh21 if ( ( aSig0<<shiftCount ) != savedASig ) {
4379 1.11.2.1 tls set_float_exception_inexact_flag();
4380 1.1 bjh21 }
4381 1.1 bjh21 return z;
4382 1.1 bjh21
4383 1.1 bjh21 }
4384 1.1 bjh21
4385 1.1 bjh21 /*
4386 1.1 bjh21 -------------------------------------------------------------------------------
4387 1.1 bjh21 Returns the result of converting the quadruple-precision floating-point
4388 1.1 bjh21 value `a' to the 64-bit two's complement integer format. The conversion
4389 1.1 bjh21 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4390 1.1 bjh21 Arithmetic---which means in particular that the conversion is rounded
4391 1.1 bjh21 according to the current rounding mode. If `a' is a NaN, the largest
4392 1.1 bjh21 positive integer is returned. Otherwise, if the conversion overflows, the
4393 1.1 bjh21 largest integer with the same sign as `a' is returned.
4394 1.1 bjh21 -------------------------------------------------------------------------------
4395 1.1 bjh21 */
4396 1.1 bjh21 int64 float128_to_int64( float128 a )
4397 1.1 bjh21 {
4398 1.1 bjh21 flag aSign;
4399 1.1 bjh21 int32 aExp, shiftCount;
4400 1.1 bjh21 bits64 aSig0, aSig1;
4401 1.1 bjh21
4402 1.1 bjh21 aSig1 = extractFloat128Frac1( a );
4403 1.1 bjh21 aSig0 = extractFloat128Frac0( a );
4404 1.1 bjh21 aExp = extractFloat128Exp( a );
4405 1.1 bjh21 aSign = extractFloat128Sign( a );
4406 1.1 bjh21 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4407 1.1 bjh21 shiftCount = 0x402F - aExp;
4408 1.1 bjh21 if ( shiftCount <= 0 ) {
4409 1.1 bjh21 if ( 0x403E < aExp ) {
4410 1.1 bjh21 float_raise( float_flag_invalid );
4411 1.1 bjh21 if ( ! aSign
4412 1.1 bjh21 || ( ( aExp == 0x7FFF )
4413 1.1 bjh21 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4414 1.1 bjh21 )
4415 1.1 bjh21 ) {
4416 1.1 bjh21 return LIT64( 0x7FFFFFFFFFFFFFFF );
4417 1.1 bjh21 }
4418 1.1 bjh21 return (sbits64) LIT64( 0x8000000000000000 );
4419 1.1 bjh21 }
4420 1.1 bjh21 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4421 1.1 bjh21 }
4422 1.1 bjh21 else {
4423 1.1 bjh21 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4424 1.1 bjh21 }
4425 1.1 bjh21 return roundAndPackInt64( aSign, aSig0, aSig1 );
4426 1.1 bjh21
4427 1.1 bjh21 }
4428 1.1 bjh21
4429 1.1 bjh21 /*
4430 1.1 bjh21 -------------------------------------------------------------------------------
4431 1.1 bjh21 Returns the result of converting the quadruple-precision floating-point
4432 1.1 bjh21 value `a' to the 64-bit two's complement integer format. The conversion
4433 1.1 bjh21 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4434 1.1 bjh21 Arithmetic, except that the conversion is always rounded toward zero.
4435 1.1 bjh21 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4436 1.1 bjh21 the conversion overflows, the largest integer with the same sign as `a' is
4437 1.1 bjh21 returned.
4438 1.1 bjh21 -------------------------------------------------------------------------------
4439 1.1 bjh21 */
4440 1.1 bjh21 int64 float128_to_int64_round_to_zero( float128 a )
4441 1.1 bjh21 {
4442 1.1 bjh21 flag aSign;
4443 1.1 bjh21 int32 aExp, shiftCount;
4444 1.1 bjh21 bits64 aSig0, aSig1;
4445 1.1 bjh21 int64 z;
4446 1.1 bjh21
4447 1.1 bjh21 aSig1 = extractFloat128Frac1( a );
4448 1.1 bjh21 aSig0 = extractFloat128Frac0( a );
4449 1.1 bjh21 aExp = extractFloat128Exp( a );
4450 1.1 bjh21 aSign = extractFloat128Sign( a );
4451 1.1 bjh21 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4452 1.1 bjh21 shiftCount = aExp - 0x402F;
4453 1.1 bjh21 if ( 0 < shiftCount ) {
4454 1.1 bjh21 if ( 0x403E <= aExp ) {
4455 1.1 bjh21 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4456 1.1 bjh21 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4457 1.1 bjh21 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4458 1.11.2.1 tls if ( aSig1 ) set_float_exception_inexact_flag();
4459 1.1 bjh21 }
4460 1.1 bjh21 else {
4461 1.1 bjh21 float_raise( float_flag_invalid );
4462 1.1 bjh21 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4463 1.1 bjh21 return LIT64( 0x7FFFFFFFFFFFFFFF );
4464 1.1 bjh21 }
4465 1.1 bjh21 }
4466 1.1 bjh21 return (sbits64) LIT64( 0x8000000000000000 );
4467 1.1 bjh21 }
4468 1.1 bjh21 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4469 1.1 bjh21 if ( (bits64) ( aSig1<<shiftCount ) ) {
4470 1.11.2.1 tls set_float_exception_inexact_flag();
4471 1.1 bjh21 }
4472 1.1 bjh21 }
4473 1.1 bjh21 else {
4474 1.1 bjh21 if ( aExp < 0x3FFF ) {
4475 1.1 bjh21 if ( aExp | aSig0 | aSig1 ) {
4476 1.11.2.1 tls set_float_exception_inexact_flag();
4477 1.1 bjh21 }
4478 1.1 bjh21 return 0;
4479 1.1 bjh21 }
4480 1.1 bjh21 z = aSig0>>( - shiftCount );
4481 1.1 bjh21 if ( aSig1
4482 1.1 bjh21 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4483 1.11.2.1 tls set_float_exception_inexact_flag();
4484 1.1 bjh21 }
4485 1.1 bjh21 }
4486 1.1 bjh21 if ( aSign ) z = - z;
4487 1.1 bjh21 return z;
4488 1.1 bjh21
4489 1.1 bjh21 }
4490 1.1 bjh21
4491 1.8 matt #if (defined(SOFTFLOATSPARC64_FOR_GCC) || defined(SOFTFLOAT_FOR_GCC)) \
4492 1.8 matt && defined(SOFTFLOAT_NEED_FIXUNS)
4493 1.1 bjh21 /*
4494 1.4 martin * just like above - but do not care for overflow of signed results
4495 1.4 martin */
4496 1.4 martin uint64 float128_to_uint64_round_to_zero( float128 a )
4497 1.4 martin {
4498 1.4 martin flag aSign;
4499 1.4 martin int32 aExp, shiftCount;
4500 1.4 martin bits64 aSig0, aSig1;
4501 1.4 martin uint64 z;
4502 1.4 martin
4503 1.4 martin aSig1 = extractFloat128Frac1( a );
4504 1.4 martin aSig0 = extractFloat128Frac0( a );
4505 1.4 martin aExp = extractFloat128Exp( a );
4506 1.4 martin aSign = extractFloat128Sign( a );
4507 1.4 martin if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4508 1.4 martin shiftCount = aExp - 0x402F;
4509 1.4 martin if ( 0 < shiftCount ) {
4510 1.4 martin if ( 0x403F <= aExp ) {
4511 1.4 martin aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4512 1.4 martin if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4513 1.4 martin && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4514 1.11.2.1 tls if ( aSig1 ) set_float_exception_inexact_flag();
4515 1.4 martin }
4516 1.4 martin else {
4517 1.4 martin float_raise( float_flag_invalid );
4518 1.4 martin }
4519 1.5 martin return LIT64( 0xFFFFFFFFFFFFFFFF );
4520 1.4 martin }
4521 1.4 martin z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4522 1.4 martin if ( (bits64) ( aSig1<<shiftCount ) ) {
4523 1.11.2.1 tls set_float_exception_inexact_flag();
4524 1.4 martin }
4525 1.4 martin }
4526 1.4 martin else {
4527 1.4 martin if ( aExp < 0x3FFF ) {
4528 1.4 martin if ( aExp | aSig0 | aSig1 ) {
4529 1.11.2.1 tls set_float_exception_inexact_flag();
4530 1.4 martin }
4531 1.4 martin return 0;
4532 1.4 martin }
4533 1.4 martin z = aSig0>>( - shiftCount );
4534 1.4 martin if (aSig1 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4535 1.11.2.1 tls set_float_exception_inexact_flag();
4536 1.4 martin }
4537 1.4 martin }
4538 1.4 martin if ( aSign ) z = - z;
4539 1.4 martin return z;
4540 1.4 martin
4541 1.4 martin }
4542 1.8 matt #endif /* (SOFTFLOATSPARC64_FOR_GCC || SOFTFLOAT_FOR_GCC) && SOFTFLOAT_NEED_FIXUNS */
4543 1.4 martin
4544 1.4 martin /*
4545 1.1 bjh21 -------------------------------------------------------------------------------
4546 1.1 bjh21 Returns the result of converting the quadruple-precision floating-point
4547 1.1 bjh21 value `a' to the single-precision floating-point format. The conversion
4548 1.1 bjh21 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4549 1.1 bjh21 Arithmetic.
4550 1.1 bjh21 -------------------------------------------------------------------------------
4551 1.1 bjh21 */
4552 1.1 bjh21 float32 float128_to_float32( float128 a )
4553 1.1 bjh21 {
4554 1.1 bjh21 flag aSign;
4555 1.1 bjh21 int32 aExp;
4556 1.1 bjh21 bits64 aSig0, aSig1;
4557 1.1 bjh21 bits32 zSig;
4558 1.1 bjh21
4559 1.1 bjh21 aSig1 = extractFloat128Frac1( a );
4560 1.1 bjh21 aSig0 = extractFloat128Frac0( a );
4561 1.1 bjh21 aExp = extractFloat128Exp( a );
4562 1.1 bjh21 aSign = extractFloat128Sign( a );
4563 1.1 bjh21 if ( aExp == 0x7FFF ) {
4564 1.1 bjh21 if ( aSig0 | aSig1 ) {
4565 1.1 bjh21 return commonNaNToFloat32( float128ToCommonNaN( a ) );
4566 1.1 bjh21 }
4567 1.1 bjh21 return packFloat32( aSign, 0xFF, 0 );
4568 1.1 bjh21 }
4569 1.1 bjh21 aSig0 |= ( aSig1 != 0 );
4570 1.1 bjh21 shift64RightJamming( aSig0, 18, &aSig0 );
4571 1.10 christos zSig = (bits32)aSig0;
4572 1.1 bjh21 if ( aExp || zSig ) {
4573 1.1 bjh21 zSig |= 0x40000000;
4574 1.1 bjh21 aExp -= 0x3F81;
4575 1.1 bjh21 }
4576 1.1 bjh21 return roundAndPackFloat32( aSign, aExp, zSig );
4577 1.1 bjh21
4578 1.1 bjh21 }
4579 1.1 bjh21
4580 1.1 bjh21 /*
4581 1.1 bjh21 -------------------------------------------------------------------------------
4582 1.1 bjh21 Returns the result of converting the quadruple-precision floating-point
4583 1.1 bjh21 value `a' to the double-precision floating-point format. The conversion
4584 1.1 bjh21 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4585 1.1 bjh21 Arithmetic.
4586 1.1 bjh21 -------------------------------------------------------------------------------
4587 1.1 bjh21 */
4588 1.1 bjh21 float64 float128_to_float64( float128 a )
4589 1.1 bjh21 {
4590 1.1 bjh21 flag aSign;
4591 1.1 bjh21 int32 aExp;
4592 1.1 bjh21 bits64 aSig0, aSig1;
4593 1.1 bjh21
4594 1.1 bjh21 aSig1 = extractFloat128Frac1( a );
4595 1.1 bjh21 aSig0 = extractFloat128Frac0( a );
4596 1.1 bjh21 aExp = extractFloat128Exp( a );
4597 1.1 bjh21 aSign = extractFloat128Sign( a );
4598 1.1 bjh21 if ( aExp == 0x7FFF ) {
4599 1.1 bjh21 if ( aSig0 | aSig1 ) {
4600 1.1 bjh21 return commonNaNToFloat64( float128ToCommonNaN( a ) );
4601 1.1 bjh21 }
4602 1.1 bjh21 return packFloat64( aSign, 0x7FF, 0 );
4603 1.1 bjh21 }
4604 1.1 bjh21 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4605 1.1 bjh21 aSig0 |= ( aSig1 != 0 );
4606 1.1 bjh21 if ( aExp || aSig0 ) {
4607 1.1 bjh21 aSig0 |= LIT64( 0x4000000000000000 );
4608 1.1 bjh21 aExp -= 0x3C01;
4609 1.1 bjh21 }
4610 1.1 bjh21 return roundAndPackFloat64( aSign, aExp, aSig0 );
4611 1.1 bjh21
4612 1.1 bjh21 }
4613 1.1 bjh21
4614 1.1 bjh21 #ifdef FLOATX80
4615 1.1 bjh21
4616 1.1 bjh21 /*
4617 1.1 bjh21 -------------------------------------------------------------------------------
4618 1.1 bjh21 Returns the result of converting the quadruple-precision floating-point
4619 1.1 bjh21 value `a' to the extended double-precision floating-point format. The
4620 1.1 bjh21 conversion is performed according to the IEC/IEEE Standard for Binary
4621 1.1 bjh21 Floating-Point Arithmetic.
4622 1.1 bjh21 -------------------------------------------------------------------------------
4623 1.1 bjh21 */
4624 1.1 bjh21 floatx80 float128_to_floatx80( float128 a )
4625 1.1 bjh21 {
4626 1.1 bjh21 flag aSign;
4627 1.1 bjh21 int32 aExp;
4628 1.1 bjh21 bits64 aSig0, aSig1;
4629 1.1 bjh21
4630 1.1 bjh21 aSig1 = extractFloat128Frac1( a );
4631 1.1 bjh21 aSig0 = extractFloat128Frac0( a );
4632 1.1 bjh21 aExp = extractFloat128Exp( a );
4633 1.1 bjh21 aSign = extractFloat128Sign( a );
4634 1.1 bjh21 if ( aExp == 0x7FFF ) {
4635 1.1 bjh21 if ( aSig0 | aSig1 ) {
4636 1.1 bjh21 return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4637 1.1 bjh21 }
4638 1.1 bjh21 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4639 1.1 bjh21 }
4640 1.1 bjh21 if ( aExp == 0 ) {
4641 1.1 bjh21 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4642 1.1 bjh21 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4643 1.1 bjh21 }
4644 1.1 bjh21 else {
4645 1.1 bjh21 aSig0 |= LIT64( 0x0001000000000000 );
4646 1.1 bjh21 }
4647 1.1 bjh21 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4648 1.1 bjh21 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4649 1.1 bjh21
4650 1.1 bjh21 }
4651 1.1 bjh21
4652 1.1 bjh21 #endif
4653 1.1 bjh21
4654 1.1 bjh21 /*
4655 1.1 bjh21 -------------------------------------------------------------------------------
4656 1.1 bjh21 Rounds the quadruple-precision floating-point value `a' to an integer, and
4657 1.1 bjh21 returns the result as a quadruple-precision floating-point value. The
4658 1.1 bjh21 operation is performed according to the IEC/IEEE Standard for Binary
4659 1.1 bjh21 Floating-Point Arithmetic.
4660 1.1 bjh21 -------------------------------------------------------------------------------
4661 1.1 bjh21 */
4662 1.1 bjh21 float128 float128_round_to_int( float128 a )
4663 1.1 bjh21 {
4664 1.1 bjh21 flag aSign;
4665 1.1 bjh21 int32 aExp;
4666 1.1 bjh21 bits64 lastBitMask, roundBitsMask;
4667 1.1 bjh21 int8 roundingMode;
4668 1.1 bjh21 float128 z;
4669 1.1 bjh21
4670 1.1 bjh21 aExp = extractFloat128Exp( a );
4671 1.1 bjh21 if ( 0x402F <= aExp ) {
4672 1.1 bjh21 if ( 0x406F <= aExp ) {
4673 1.1 bjh21 if ( ( aExp == 0x7FFF )
4674 1.1 bjh21 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4675 1.1 bjh21 ) {
4676 1.1 bjh21 return propagateFloat128NaN( a, a );
4677 1.1 bjh21 }
4678 1.1 bjh21 return a;
4679 1.1 bjh21 }
4680 1.1 bjh21 lastBitMask = 1;
4681 1.1 bjh21 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4682 1.1 bjh21 roundBitsMask = lastBitMask - 1;
4683 1.1 bjh21 z = a;
4684 1.1 bjh21 roundingMode = float_rounding_mode;
4685 1.1 bjh21 if ( roundingMode == float_round_nearest_even ) {
4686 1.1 bjh21 if ( lastBitMask ) {
4687 1.1 bjh21 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4688 1.1 bjh21 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4689 1.1 bjh21 }
4690 1.1 bjh21 else {
4691 1.1 bjh21 if ( (sbits64) z.low < 0 ) {
4692 1.1 bjh21 ++z.high;
4693 1.1 bjh21 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4694 1.1 bjh21 }
4695 1.1 bjh21 }
4696 1.1 bjh21 }
4697 1.1 bjh21 else if ( roundingMode != float_round_to_zero ) {
4698 1.1 bjh21 if ( extractFloat128Sign( z )
4699 1.1 bjh21 ^ ( roundingMode == float_round_up ) ) {
4700 1.1 bjh21 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4701 1.1 bjh21 }
4702 1.1 bjh21 }
4703 1.1 bjh21 z.low &= ~ roundBitsMask;
4704 1.1 bjh21 }
4705 1.1 bjh21 else {
4706 1.1 bjh21 if ( aExp < 0x3FFF ) {
4707 1.1 bjh21 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4708 1.11.2.1 tls set_float_exception_inexact_flag();
4709 1.1 bjh21 aSign = extractFloat128Sign( a );
4710 1.1 bjh21 switch ( float_rounding_mode ) {
4711 1.1 bjh21 case float_round_nearest_even:
4712 1.1 bjh21 if ( ( aExp == 0x3FFE )
4713 1.1 bjh21 && ( extractFloat128Frac0( a )
4714 1.1 bjh21 | extractFloat128Frac1( a ) )
4715 1.1 bjh21 ) {
4716 1.1 bjh21 return packFloat128( aSign, 0x3FFF, 0, 0 );
4717 1.1 bjh21 }
4718 1.1 bjh21 break;
4719 1.1 bjh21 case float_round_to_zero:
4720 1.1 bjh21 break;
4721 1.1 bjh21 case float_round_down:
4722 1.1 bjh21 return
4723 1.1 bjh21 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4724 1.1 bjh21 : packFloat128( 0, 0, 0, 0 );
4725 1.1 bjh21 case float_round_up:
4726 1.1 bjh21 return
4727 1.1 bjh21 aSign ? packFloat128( 1, 0, 0, 0 )
4728 1.1 bjh21 : packFloat128( 0, 0x3FFF, 0, 0 );
4729 1.1 bjh21 }
4730 1.1 bjh21 return packFloat128( aSign, 0, 0, 0 );
4731 1.1 bjh21 }
4732 1.1 bjh21 lastBitMask = 1;
4733 1.1 bjh21 lastBitMask <<= 0x402F - aExp;
4734 1.1 bjh21 roundBitsMask = lastBitMask - 1;
4735 1.1 bjh21 z.low = 0;
4736 1.1 bjh21 z.high = a.high;
4737 1.1 bjh21 roundingMode = float_rounding_mode;
4738 1.1 bjh21 if ( roundingMode == float_round_nearest_even ) {
4739 1.1 bjh21 z.high += lastBitMask>>1;
4740 1.1 bjh21 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4741 1.1 bjh21 z.high &= ~ lastBitMask;
4742 1.1 bjh21 }
4743 1.1 bjh21 }
4744 1.1 bjh21 else if ( roundingMode != float_round_to_zero ) {
4745 1.1 bjh21 if ( extractFloat128Sign( z )
4746 1.1 bjh21 ^ ( roundingMode == float_round_up ) ) {
4747 1.1 bjh21 z.high |= ( a.low != 0 );
4748 1.1 bjh21 z.high += roundBitsMask;
4749 1.1 bjh21 }
4750 1.1 bjh21 }
4751 1.1 bjh21 z.high &= ~ roundBitsMask;
4752 1.1 bjh21 }
4753 1.1 bjh21 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4754 1.11.2.1 tls set_float_exception_inexact_flag();
4755 1.1 bjh21 }
4756 1.1 bjh21 return z;
4757 1.1 bjh21
4758 1.1 bjh21 }
4759 1.1 bjh21
4760 1.1 bjh21 /*
4761 1.1 bjh21 -------------------------------------------------------------------------------
4762 1.1 bjh21 Returns the result of adding the absolute values of the quadruple-precision
4763 1.1 bjh21 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4764 1.1 bjh21 before being returned. `zSign' is ignored if the result is a NaN.
4765 1.1 bjh21 The addition is performed according to the IEC/IEEE Standard for Binary
4766 1.1 bjh21 Floating-Point Arithmetic.
4767 1.1 bjh21 -------------------------------------------------------------------------------
4768 1.1 bjh21 */
4769 1.1 bjh21 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4770 1.1 bjh21 {
4771 1.1 bjh21 int32 aExp, bExp, zExp;
4772 1.1 bjh21 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4773 1.1 bjh21 int32 expDiff;
4774 1.1 bjh21
4775 1.1 bjh21 aSig1 = extractFloat128Frac1( a );
4776 1.1 bjh21 aSig0 = extractFloat128Frac0( a );
4777 1.1 bjh21 aExp = extractFloat128Exp( a );
4778 1.1 bjh21 bSig1 = extractFloat128Frac1( b );
4779 1.1 bjh21 bSig0 = extractFloat128Frac0( b );
4780 1.1 bjh21 bExp = extractFloat128Exp( b );
4781 1.1 bjh21 expDiff = aExp - bExp;
4782 1.1 bjh21 if ( 0 < expDiff ) {
4783 1.1 bjh21 if ( aExp == 0x7FFF ) {
4784 1.1 bjh21 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4785 1.1 bjh21 return a;
4786 1.1 bjh21 }
4787 1.1 bjh21 if ( bExp == 0 ) {
4788 1.1 bjh21 --expDiff;
4789 1.1 bjh21 }
4790 1.1 bjh21 else {
4791 1.1 bjh21 bSig0 |= LIT64( 0x0001000000000000 );
4792 1.1 bjh21 }
4793 1.1 bjh21 shift128ExtraRightJamming(
4794 1.1 bjh21 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4795 1.1 bjh21 zExp = aExp;
4796 1.1 bjh21 }
4797 1.1 bjh21 else if ( expDiff < 0 ) {
4798 1.1 bjh21 if ( bExp == 0x7FFF ) {
4799 1.1 bjh21 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4800 1.1 bjh21 return packFloat128( zSign, 0x7FFF, 0, 0 );
4801 1.1 bjh21 }
4802 1.1 bjh21 if ( aExp == 0 ) {
4803 1.1 bjh21 ++expDiff;
4804 1.1 bjh21 }
4805 1.1 bjh21 else {
4806 1.1 bjh21 aSig0 |= LIT64( 0x0001000000000000 );
4807 1.1 bjh21 }
4808 1.1 bjh21 shift128ExtraRightJamming(
4809 1.1 bjh21 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4810 1.1 bjh21 zExp = bExp;
4811 1.1 bjh21 }
4812 1.1 bjh21 else {
4813 1.1 bjh21 if ( aExp == 0x7FFF ) {
4814 1.1 bjh21 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4815 1.1 bjh21 return propagateFloat128NaN( a, b );
4816 1.1 bjh21 }
4817 1.1 bjh21 return a;
4818 1.1 bjh21 }
4819 1.1 bjh21 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4820 1.1 bjh21 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4821 1.1 bjh21 zSig2 = 0;
4822 1.1 bjh21 zSig0 |= LIT64( 0x0002000000000000 );
4823 1.1 bjh21 zExp = aExp;
4824 1.1 bjh21 goto shiftRight1;
4825 1.1 bjh21 }
4826 1.1 bjh21 aSig0 |= LIT64( 0x0001000000000000 );
4827 1.1 bjh21 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4828 1.1 bjh21 --zExp;
4829 1.1 bjh21 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4830 1.1 bjh21 ++zExp;
4831 1.1 bjh21 shiftRight1:
4832 1.1 bjh21 shift128ExtraRightJamming(
4833 1.1 bjh21 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4834 1.1 bjh21 roundAndPack:
4835 1.1 bjh21 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4836 1.1 bjh21
4837 1.1 bjh21 }
4838 1.1 bjh21
4839 1.1 bjh21 /*
4840 1.1 bjh21 -------------------------------------------------------------------------------
4841 1.1 bjh21 Returns the result of subtracting the absolute values of the quadruple-
4842 1.1 bjh21 precision floating-point values `a' and `b'. If `zSign' is 1, the
4843 1.1 bjh21 difference is negated before being returned. `zSign' is ignored if the
4844 1.1 bjh21 result is a NaN. The subtraction is performed according to the IEC/IEEE
4845 1.1 bjh21 Standard for Binary Floating-Point Arithmetic.
4846 1.1 bjh21 -------------------------------------------------------------------------------
4847 1.1 bjh21 */
4848 1.1 bjh21 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4849 1.1 bjh21 {
4850 1.1 bjh21 int32 aExp, bExp, zExp;
4851 1.1 bjh21 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4852 1.1 bjh21 int32 expDiff;
4853 1.1 bjh21 float128 z;
4854 1.1 bjh21
4855 1.1 bjh21 aSig1 = extractFloat128Frac1( a );
4856 1.1 bjh21 aSig0 = extractFloat128Frac0( a );
4857 1.1 bjh21 aExp = extractFloat128Exp( a );
4858 1.1 bjh21 bSig1 = extractFloat128Frac1( b );
4859 1.1 bjh21 bSig0 = extractFloat128Frac0( b );
4860 1.1 bjh21 bExp = extractFloat128Exp( b );
4861 1.1 bjh21 expDiff = aExp - bExp;
4862 1.1 bjh21 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4863 1.1 bjh21 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4864 1.1 bjh21 if ( 0 < expDiff ) goto aExpBigger;
4865 1.1 bjh21 if ( expDiff < 0 ) goto bExpBigger;
4866 1.1 bjh21 if ( aExp == 0x7FFF ) {
4867 1.1 bjh21 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4868 1.1 bjh21 return propagateFloat128NaN( a, b );
4869 1.1 bjh21 }
4870 1.1 bjh21 float_raise( float_flag_invalid );
4871 1.1 bjh21 z.low = float128_default_nan_low;
4872 1.1 bjh21 z.high = float128_default_nan_high;
4873 1.1 bjh21 return z;
4874 1.1 bjh21 }
4875 1.1 bjh21 if ( aExp == 0 ) {
4876 1.1 bjh21 aExp = 1;
4877 1.1 bjh21 bExp = 1;
4878 1.1 bjh21 }
4879 1.1 bjh21 if ( bSig0 < aSig0 ) goto aBigger;
4880 1.1 bjh21 if ( aSig0 < bSig0 ) goto bBigger;
4881 1.1 bjh21 if ( bSig1 < aSig1 ) goto aBigger;
4882 1.1 bjh21 if ( aSig1 < bSig1 ) goto bBigger;
4883 1.1 bjh21 return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
4884 1.1 bjh21 bExpBigger:
4885 1.1 bjh21 if ( bExp == 0x7FFF ) {
4886 1.1 bjh21 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4887 1.1 bjh21 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4888 1.1 bjh21 }
4889 1.1 bjh21 if ( aExp == 0 ) {
4890 1.1 bjh21 ++expDiff;
4891 1.1 bjh21 }
4892 1.1 bjh21 else {
4893 1.1 bjh21 aSig0 |= LIT64( 0x4000000000000000 );
4894 1.1 bjh21 }
4895 1.1 bjh21 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4896 1.1 bjh21 bSig0 |= LIT64( 0x4000000000000000 );
4897 1.1 bjh21 bBigger:
4898 1.1 bjh21 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4899 1.1 bjh21 zExp = bExp;
4900 1.1 bjh21 zSign ^= 1;
4901 1.1 bjh21 goto normalizeRoundAndPack;
4902 1.1 bjh21 aExpBigger:
4903 1.1 bjh21 if ( aExp == 0x7FFF ) {
4904 1.1 bjh21 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4905 1.1 bjh21 return a;
4906 1.1 bjh21 }
4907 1.1 bjh21 if ( bExp == 0 ) {
4908 1.1 bjh21 --expDiff;
4909 1.1 bjh21 }
4910 1.1 bjh21 else {
4911 1.1 bjh21 bSig0 |= LIT64( 0x4000000000000000 );
4912 1.1 bjh21 }
4913 1.1 bjh21 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4914 1.1 bjh21 aSig0 |= LIT64( 0x4000000000000000 );
4915 1.1 bjh21 aBigger:
4916 1.1 bjh21 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4917 1.1 bjh21 zExp = aExp;
4918 1.1 bjh21 normalizeRoundAndPack:
4919 1.1 bjh21 --zExp;
4920 1.1 bjh21 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4921 1.1 bjh21
4922 1.1 bjh21 }
4923 1.1 bjh21
4924 1.1 bjh21 /*
4925 1.1 bjh21 -------------------------------------------------------------------------------
4926 1.1 bjh21 Returns the result of adding the quadruple-precision floating-point values
4927 1.1 bjh21 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4928 1.1 bjh21 for Binary Floating-Point Arithmetic.
4929 1.1 bjh21 -------------------------------------------------------------------------------
4930 1.1 bjh21 */
4931 1.1 bjh21 float128 float128_add( float128 a, float128 b )
4932 1.1 bjh21 {
4933 1.1 bjh21 flag aSign, bSign;
4934 1.1 bjh21
4935 1.1 bjh21 aSign = extractFloat128Sign( a );
4936 1.1 bjh21 bSign = extractFloat128Sign( b );
4937 1.1 bjh21 if ( aSign == bSign ) {
4938 1.1 bjh21 return addFloat128Sigs( a, b, aSign );
4939 1.1 bjh21 }
4940 1.1 bjh21 else {
4941 1.1 bjh21 return subFloat128Sigs( a, b, aSign );
4942 1.1 bjh21 }
4943 1.1 bjh21
4944 1.1 bjh21 }
4945 1.1 bjh21
4946 1.1 bjh21 /*
4947 1.1 bjh21 -------------------------------------------------------------------------------
4948 1.1 bjh21 Returns the result of subtracting the quadruple-precision floating-point
4949 1.1 bjh21 values `a' and `b'. The operation is performed according to the IEC/IEEE
4950 1.1 bjh21 Standard for Binary Floating-Point Arithmetic.
4951 1.1 bjh21 -------------------------------------------------------------------------------
4952 1.1 bjh21 */
4953 1.1 bjh21 float128 float128_sub( float128 a, float128 b )
4954 1.1 bjh21 {
4955 1.1 bjh21 flag aSign, bSign;
4956 1.1 bjh21
4957 1.1 bjh21 aSign = extractFloat128Sign( a );
4958 1.1 bjh21 bSign = extractFloat128Sign( b );
4959 1.1 bjh21 if ( aSign == bSign ) {
4960 1.1 bjh21 return subFloat128Sigs( a, b, aSign );
4961 1.1 bjh21 }
4962 1.1 bjh21 else {
4963 1.1 bjh21 return addFloat128Sigs( a, b, aSign );
4964 1.1 bjh21 }
4965 1.1 bjh21
4966 1.1 bjh21 }
4967 1.1 bjh21
4968 1.1 bjh21 /*
4969 1.1 bjh21 -------------------------------------------------------------------------------
4970 1.1 bjh21 Returns the result of multiplying the quadruple-precision floating-point
4971 1.1 bjh21 values `a' and `b'. The operation is performed according to the IEC/IEEE
4972 1.1 bjh21 Standard for Binary Floating-Point Arithmetic.
4973 1.1 bjh21 -------------------------------------------------------------------------------
4974 1.1 bjh21 */
4975 1.1 bjh21 float128 float128_mul( float128 a, float128 b )
4976 1.1 bjh21 {
4977 1.1 bjh21 flag aSign, bSign, zSign;
4978 1.1 bjh21 int32 aExp, bExp, zExp;
4979 1.1 bjh21 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4980 1.1 bjh21 float128 z;
4981 1.1 bjh21
4982 1.1 bjh21 aSig1 = extractFloat128Frac1( a );
4983 1.1 bjh21 aSig0 = extractFloat128Frac0( a );
4984 1.1 bjh21 aExp = extractFloat128Exp( a );
4985 1.1 bjh21 aSign = extractFloat128Sign( a );
4986 1.1 bjh21 bSig1 = extractFloat128Frac1( b );
4987 1.1 bjh21 bSig0 = extractFloat128Frac0( b );
4988 1.1 bjh21 bExp = extractFloat128Exp( b );
4989 1.1 bjh21 bSign = extractFloat128Sign( b );
4990 1.1 bjh21 zSign = aSign ^ bSign;
4991 1.1 bjh21 if ( aExp == 0x7FFF ) {
4992 1.1 bjh21 if ( ( aSig0 | aSig1 )
4993 1.1 bjh21 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4994 1.1 bjh21 return propagateFloat128NaN( a, b );
4995 1.1 bjh21 }
4996 1.1 bjh21 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4997 1.1 bjh21 return packFloat128( zSign, 0x7FFF, 0, 0 );
4998 1.1 bjh21 }
4999 1.1 bjh21 if ( bExp == 0x7FFF ) {
5000 1.1 bjh21 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5001 1.1 bjh21 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5002 1.1 bjh21 invalid:
5003 1.1 bjh21 float_raise( float_flag_invalid );
5004 1.1 bjh21 z.low = float128_default_nan_low;
5005 1.1 bjh21 z.high = float128_default_nan_high;
5006 1.1 bjh21 return z;
5007 1.1 bjh21 }
5008 1.1 bjh21 return packFloat128( zSign, 0x7FFF, 0, 0 );
5009 1.1 bjh21 }
5010 1.1 bjh21 if ( aExp == 0 ) {
5011 1.1 bjh21 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5012 1.1 bjh21 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5013 1.1 bjh21 }
5014 1.1 bjh21 if ( bExp == 0 ) {
5015 1.1 bjh21 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5016 1.1 bjh21 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5017 1.1 bjh21 }
5018 1.1 bjh21 zExp = aExp + bExp - 0x4000;
5019 1.1 bjh21 aSig0 |= LIT64( 0x0001000000000000 );
5020 1.1 bjh21 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5021 1.1 bjh21 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5022 1.1 bjh21 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5023 1.1 bjh21 zSig2 |= ( zSig3 != 0 );
5024 1.1 bjh21 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5025 1.1 bjh21 shift128ExtraRightJamming(
5026 1.1 bjh21 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5027 1.1 bjh21 ++zExp;
5028 1.1 bjh21 }
5029 1.1 bjh21 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5030 1.1 bjh21
5031 1.1 bjh21 }
5032 1.1 bjh21
5033 1.1 bjh21 /*
5034 1.1 bjh21 -------------------------------------------------------------------------------
5035 1.1 bjh21 Returns the result of dividing the quadruple-precision floating-point value
5036 1.1 bjh21 `a' by the corresponding value `b'. The operation is performed according to
5037 1.1 bjh21 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5038 1.1 bjh21 -------------------------------------------------------------------------------
5039 1.1 bjh21 */
5040 1.1 bjh21 float128 float128_div( float128 a, float128 b )
5041 1.1 bjh21 {
/*
 * Outline of the body: split both operands into sign, exponent and a
 * two-word fraction; return early for exponent 0x7FFF (the NaN/infinity
 * encodings) and for zero operands; otherwise build the quotient in two
 * 64-bit words, zSig0 then zSig1, and let roundAndPackFloat128 produce the
 * result.  The result sign is always aSign ^ bSign.
 */
5042 1.1 bjh21 flag aSign, bSign, zSign;
5043 1.1 bjh21 int32 aExp, bExp, zExp;
5044 1.1 bjh21 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5045 1.1 bjh21 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5046 1.1 bjh21 float128 z;
5047 1.1 bjh21
5048 1.1 bjh21 aSig1 = extractFloat128Frac1( a );
5049 1.1 bjh21 aSig0 = extractFloat128Frac0( a );
5050 1.1 bjh21 aExp = extractFloat128Exp( a );
5051 1.1 bjh21 aSign = extractFloat128Sign( a );
5052 1.1 bjh21 bSig1 = extractFloat128Frac1( b );
5053 1.1 bjh21 bSig0 = extractFloat128Frac0( b );
5054 1.1 bjh21 bExp = extractFloat128Exp( b );
5055 1.1 bjh21 bSign = extractFloat128Sign( b );
5056 1.1 bjh21 zSign = aSign ^ bSign;
/*
 * a has the all-ones exponent.  A nonzero fraction is handed to
 * propagateFloat128NaN.  With a zero fraction, a b that has the same
 * exponent is either propagated (nonzero fraction) or sent to `invalid';
 * any other b yields exponent 0x7FFF with a zero fraction and sign zSign.
 */
5057 1.1 bjh21 if ( aExp == 0x7FFF ) {
5058 1.1 bjh21 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
5059 1.1 bjh21 if ( bExp == 0x7FFF ) {
5060 1.1 bjh21 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5061 1.1 bjh21 goto invalid;
5062 1.1 bjh21 }
5063 1.1 bjh21 return packFloat128( zSign, 0x7FFF, 0, 0 );
5064 1.1 bjh21 }
/*
 * Only b has the all-ones exponent: propagate when its fraction is nonzero,
 * otherwise return a zero carrying zSign.
 */
5065 1.1 bjh21 if ( bExp == 0x7FFF ) {
5066 1.1 bjh21 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5067 1.1 bjh21 return packFloat128( zSign, 0, 0, 0 );
5068 1.1 bjh21 }
/*
 * b has exponent 0.  A zero fraction means b is zero: if a is zero as well
 * (exponent and both fraction words zero) the invalid path is taken,
 * otherwise divbyzero is raised and exponent 0x7FFF with a zero fraction is
 * returned.  A nonzero fraction is passed through
 * normalizeFloat128Subnormal, which rewrites bExp and the fraction words.
 */
5069 1.1 bjh21 if ( bExp == 0 ) {
5070 1.1 bjh21 if ( ( bSig0 | bSig1 ) == 0 ) {
5071 1.1 bjh21 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
/* Also the target of the goto above (both operands have exponent 0x7FFF and
   zero fractions).  Raises invalid and returns the default NaN. */
5072 1.1 bjh21 invalid:
5073 1.1 bjh21 float_raise( float_flag_invalid );
5074 1.1 bjh21 z.low = float128_default_nan_low;
5075 1.1 bjh21 z.high = float128_default_nan_high;
5076 1.1 bjh21 return z;
5077 1.1 bjh21 }
5078 1.1 bjh21 float_raise( float_flag_divbyzero );
5079 1.1 bjh21 return packFloat128( zSign, 0x7FFF, 0, 0 );
5080 1.1 bjh21 }
5081 1.1 bjh21 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5082 1.1 bjh21 }
/* a has exponent 0: a zero fraction returns a zero with sign zSign, a
   nonzero one is normalized the same way as b. */
5083 1.1 bjh21 if ( aExp == 0 ) {
5084 1.1 bjh21 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5085 1.1 bjh21 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5086 1.1 bjh21 }
5087 1.1 bjh21 zExp = aExp - bExp + 0x3FFD;
/* Both significands get bit 48 of the high word set
   (0x0001000000000000) and are then shifted left by 15, which puts that
   bit at the top of the high word. */
5088 1.1 bjh21 shortShift128Left(
5089 1.1 bjh21 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5090 1.1 bjh21 shortShift128Left(
5091 1.1 bjh21 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
/* If b's significand is <= a's, halve a's and bump zExp; from here on a's
   significand is strictly smaller than b's. */
5092 1.1 bjh21 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5093 1.1 bjh21 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5094 1.1 bjh21 ++zExp;
5095 1.1 bjh21 }
/* High quotient word: start from the value returned by estimateDiv128To64,
   form the 192-bit remainder a - b*zSig0, and step zSig0 down (adding b
   back) while that remainder is negative. */
5096 1.1 bjh21 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5097 1.1 bjh21 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5098 1.1 bjh21 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5099 1.1 bjh21 while ( (sbits64) rem0 < 0 ) {
5100 1.1 bjh21 --zSig0;
5101 1.1 bjh21 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5102 1.1 bjh21 }
/* Low quotient word, estimated from the remainder.  The multiply-back
   correction runs only when the low 14 bits of the estimate are <= 4; in
   that case a nonzero final remainder is also recorded in bit 0 of zSig1.
   NOTE(review): presumably the estimate's error bound makes the correction
   unnecessary for other values - confirm against estimateDiv128To64. */
5103 1.1 bjh21 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5104 1.1 bjh21 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5105 1.1 bjh21 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5106 1.1 bjh21 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5107 1.1 bjh21 while ( (sbits64) rem1 < 0 ) {
5108 1.1 bjh21 --zSig1;
5109 1.1 bjh21 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5110 1.1 bjh21 }
5111 1.1 bjh21 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5112 1.1 bjh21 }
/* Shift the 128-bit quotient right by 15; zSig2 receives the third output
   word of shift128ExtraRightJamming and is passed on to the rounding
   routine together with zSig0 and zSig1. */
5113 1.1 bjh21 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5114 1.1 bjh21 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5115 1.1 bjh21
5116 1.1 bjh21 }
5117 1.1 bjh21
5118 1.1 bjh21 /*
5119 1.1 bjh21 -------------------------------------------------------------------------------
5120 1.1 bjh21 Returns the remainder of the quadruple-precision floating-point value `a'
5121 1.1 bjh21 with respect to the corresponding value `b'. The operation is performed
5122 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5123 1.1 bjh21 -------------------------------------------------------------------------------
5124 1.1 bjh21 */
5125 1.1 bjh21 float128 float128_rem( float128 a, float128 b )
5126 1.1 bjh21 {
/*
 * Outline of the body: unpack both operands, return early for the special
 * encodings, then reduce a's significand against b's in chunks while
 * counting the quotient in q.  At the end the code holds two candidate
 * remainders (one non-negative, one negative) and keeps the one of smaller
 * magnitude, using the parity of q to break a tie.
 */
5127 1.9 martin flag aSign, zSign;
5128 1.1 bjh21 int32 aExp, bExp, expDiff;
5129 1.1 bjh21 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5130 1.1 bjh21 bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5131 1.1 bjh21 sbits64 sigMean0;
5132 1.1 bjh21 float128 z;
5133 1.1 bjh21
5134 1.1 bjh21 aSig1 = extractFloat128Frac1( a );
5135 1.1 bjh21 aSig0 = extractFloat128Frac0( a );
5136 1.1 bjh21 aExp = extractFloat128Exp( a );
5137 1.1 bjh21 aSign = extractFloat128Sign( a );
5138 1.1 bjh21 bSig1 = extractFloat128Frac1( b );
5139 1.1 bjh21 bSig0 = extractFloat128Frac0( b );
5140 1.1 bjh21 bExp = extractFloat128Exp( b );
/* a has the all-ones exponent: if a or b is a NaN (nonzero fraction) the
   NaN is propagated; otherwise the invalid path below is taken. */
5141 1.1 bjh21 if ( aExp == 0x7FFF ) {
5142 1.1 bjh21 if ( ( aSig0 | aSig1 )
5143 1.1 bjh21 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5144 1.1 bjh21 return propagateFloat128NaN( a, b );
5145 1.1 bjh21 }
5146 1.1 bjh21 goto invalid;
5147 1.1 bjh21 }
/* Only b has the all-ones exponent: a NaN is propagated, otherwise a is
   returned unchanged. */
5148 1.1 bjh21 if ( bExp == 0x7FFF ) {
5149 1.1 bjh21 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5150 1.1 bjh21 return a;
5151 1.1 bjh21 }
/* b has exponent 0: a zero fraction (b is zero) raises invalid and returns
   the default NaN; a nonzero fraction is normalized. */
5152 1.1 bjh21 if ( bExp == 0 ) {
5153 1.1 bjh21 if ( ( bSig0 | bSig1 ) == 0 ) {
/* Also the target of the goto above (a has exponent 0x7FFF and no NaN is
   involved). */
5154 1.1 bjh21 invalid:
5155 1.1 bjh21 float_raise( float_flag_invalid );
5156 1.1 bjh21 z.low = float128_default_nan_low;
5157 1.1 bjh21 z.high = float128_default_nan_high;
5158 1.1 bjh21 return z;
5159 1.1 bjh21 }
5160 1.1 bjh21 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5161 1.1 bjh21 }
/* a has exponent 0: zero is returned as is, anything else is normalized. */
5162 1.1 bjh21 if ( aExp == 0 ) {
5163 1.1 bjh21 if ( ( aSig0 | aSig1 ) == 0 ) return a;
5164 1.1 bjh21 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5165 1.1 bjh21 }
5166 1.1 bjh21 expDiff = aExp - bExp;
/* When a's exponent is more than one below b's, a itself is returned. */
5167 1.1 bjh21 if ( expDiff < -1 ) return a;
/* Set bit 48 of each high word and shift left: b by 15, a by 15 or, when
   expDiff is negative (i.e. -1 here), by 14. */
5168 1.1 bjh21 shortShift128Left(
5169 1.1 bjh21 aSig0 | LIT64( 0x0001000000000000 ),
5170 1.1 bjh21 aSig1,
5171 1.1 bjh21 15 - ( expDiff < 0 ),
5172 1.1 bjh21 &aSig0,
5173 1.1 bjh21 &aSig1
5174 1.1 bjh21 );
5175 1.1 bjh21 shortShift128Left(
5176 1.1 bjh21 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
/* First quotient bit: 1 if b's significand fits into a's, in which case it
   is subtracted. */
5177 1.1 bjh21 q = le128( bSig0, bSig1, aSig0, aSig1 );
5178 1.1 bjh21 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5179 1.1 bjh21 expDiff -= 64;
/* Main reduction loop; each pass lowers expDiff by 61.  The quotient
   estimate is reduced by 4 (clamped at 0) before use.
   NOTE(review): presumably this keeps the estimate from exceeding the true
   quotient - confirm against estimateDiv128To64. */
5180 1.1 bjh21 while ( 0 < expDiff ) {
5181 1.1 bjh21 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5182 1.1 bjh21 q = ( 4 < q ) ? q - 4 : 0;
5183 1.1 bjh21 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5184 1.1 bjh21 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5185 1.1 bjh21 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5186 1.1 bjh21 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5187 1.1 bjh21 expDiff -= 61;
5188 1.1 bjh21 }
/* Final partial step.  If fewer than 64 bits remain, one more estimate is
   taken and shifted right by -expDiff; b is shifted right by 12, a is
   realigned by expDiff + 52 (right if negative, left otherwise) and b*q is
   subtracted.  Otherwise a and b are simply both shifted right by 12 and q
   keeps its value from the first-bit test above. */
5189 1.1 bjh21 if ( -64 < expDiff ) {
5190 1.1 bjh21 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5191 1.1 bjh21 q = ( 4 < q ) ? q - 4 : 0;
5192 1.1 bjh21 q >>= - expDiff;
5193 1.1 bjh21 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5194 1.1 bjh21 expDiff += 52;
5195 1.1 bjh21 if ( expDiff < 0 ) {
5196 1.1 bjh21 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5197 1.1 bjh21 }
5198 1.1 bjh21 else {
5199 1.1 bjh21 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5200 1.1 bjh21 }
5201 1.1 bjh21 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5202 1.1 bjh21 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5203 1.1 bjh21 }
5204 1.1 bjh21 else {
5205 1.1 bjh21 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5206 1.1 bjh21 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5207 1.1 bjh21 }
/* Keep subtracting b, counting in q, until the remainder turns negative.
   alternateASig0/1 hold the value from just before the last subtraction,
   i.e. the last non-negative remainder. */
5208 1.1 bjh21 do {
5209 1.1 bjh21 alternateASig0 = aSig0;
5210 1.1 bjh21 alternateASig1 = aSig1;
5211 1.1 bjh21 ++q;
5212 1.1 bjh21 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5213 1.1 bjh21 } while ( 0 <= (sbits64) aSig0 );
/* sigMean is the sum of the negative and the non-negative candidate.  A
   negative sum means the non-negative candidate has the smaller magnitude,
   so it is selected; on an exact tie it is selected only when q is odd. */
5214 1.1 bjh21 add128(
5215 1.3 mrg aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5216 1.1 bjh21 if ( ( sigMean0 < 0 )
5217 1.1 bjh21 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5218 1.1 bjh21 aSig0 = alternateASig0;
5219 1.1 bjh21 aSig1 = alternateASig1;
5220 1.1 bjh21 }
/* Take the magnitude of the chosen remainder; zSign records whether it was
   negative and is folded into the result sign together with aSign. */
5221 1.1 bjh21 zSign = ( (sbits64) aSig0 < 0 );
5222 1.1 bjh21 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5223 1.1 bjh21 return
5224 1.1 bjh21 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5225 1.1 bjh21
5226 1.1 bjh21 }
5227 1.1 bjh21
5228 1.1 bjh21 /*
5229 1.1 bjh21 -------------------------------------------------------------------------------
5230 1.1 bjh21 Returns the square root of the quadruple-precision floating-point value `a'.
5231 1.1 bjh21 The operation is performed according to the IEC/IEEE Standard for Binary
5232 1.1 bjh21 Floating-Point Arithmetic.
5233 1.1 bjh21 -------------------------------------------------------------------------------
5234 1.1 bjh21 */
5235 1.1 bjh21 float128 float128_sqrt( float128 a )
5236 1.1 bjh21 {
/*
 * Outline of the body: unpack the operand, return early for the special
 * encodings and for negative input, then build the root in two 64-bit
 * words (zSig0, zSig1) from successive estimates that are corrected against
 * an explicitly computed remainder.  The result is always packed with
 * sign 0.
 */
5237 1.1 bjh21 flag aSign;
5238 1.1 bjh21 int32 aExp, zExp;
5239 1.1 bjh21 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5240 1.1 bjh21 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5241 1.1 bjh21 float128 z;
5242 1.1 bjh21
5243 1.1 bjh21 aSig1 = extractFloat128Frac1( a );
5244 1.1 bjh21 aSig0 = extractFloat128Frac0( a );
5245 1.1 bjh21 aExp = extractFloat128Exp( a );
5246 1.1 bjh21 aSign = extractFloat128Sign( a );
/* All-ones exponent: a NaN is propagated, a value with the sign bit clear
   is returned as is, and one with the sign bit set goes to `invalid'. */
5247 1.1 bjh21 if ( aExp == 0x7FFF ) {
5248 1.1 bjh21 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5249 1.1 bjh21 if ( ! aSign ) return a;
5250 1.1 bjh21 goto invalid;
5251 1.1 bjh21 }
/* Sign bit set: an operand whose exponent and fraction are all zero (-0) is
   returned unchanged; every other negative operand raises invalid and
   returns the default NaN. */
5252 1.1 bjh21 if ( aSign ) {
5253 1.1 bjh21 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5254 1.1 bjh21 invalid:
5255 1.1 bjh21 float_raise( float_flag_invalid );
5256 1.1 bjh21 z.low = float128_default_nan_low;
5257 1.1 bjh21 z.high = float128_default_nan_high;
5258 1.1 bjh21 return z;
5259 1.1 bjh21 }
/* Exponent 0: a zero fraction returns +0, a nonzero one is normalized. */
5260 1.1 bjh21 if ( aExp == 0 ) {
5261 1.1 bjh21 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5262 1.1 bjh21 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5263 1.1 bjh21 }
/* Result exponent: half of (aExp - 0x3FFF), offset by 0x3FFE.
   NOTE(review): aExp - 0x3FFF can be negative, and >> on a negative signed
   value is implementation-defined in C; the code relies on it rounding
   toward minus infinity (arithmetic shift) - confirm for the target. */
5264 1.11.2.2 tls zExp = (int32) ( (aExp - 0x3FFF) >> 1) + 0x3FFE;
5265 1.1 bjh21 aSig0 |= LIT64( 0x0001000000000000 );
/* Initial estimate from estimateSqrt32, fed the exponent and the high word
   of the significand shifted right by 17 (truncated to 32 bits). */
5266 1.10 christos zSig0 = estimateSqrt32((int16)aExp, (bits32)(aSig0>>17));
/* Left-align the significand: by 12 when aExp is odd, by 13 otherwise. */
5267 1.1 bjh21 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
/* Widen the estimate to a 64-bit high word. */
5268 1.1 bjh21 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5269 1.1 bjh21 doubleZSig0 = zSig0<<1;
/* rem = a - zSig0*zSig0.  While it is negative, decrement zSig0 and add
   back 2*zSig0 + 1 (computed for the decremented zSig0). */
5270 1.1 bjh21 mul64To128( zSig0, zSig0, &term0, &term1 );
5271 1.1 bjh21 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5272 1.1 bjh21 while ( (sbits64) rem0 < 0 ) {
5273 1.1 bjh21 --zSig0;
5274 1.1 bjh21 doubleZSig0 -= 2;
5275 1.1 bjh21 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5276 1.1 bjh21 }
/* Low word of the root, estimated from the remainder.  The exact
   correction runs only when the low 13 bits of the estimate are <= 5 (an
   estimate of 0 is first replaced by 1); in that case a nonzero final
   remainder is also recorded in bit 0 of zSig1.
   NOTE(review): presumably the estimate's error bound makes the correction
   unnecessary for other values - confirm against estimateDiv128To64. */
5277 1.1 bjh21 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5278 1.1 bjh21 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5279 1.1 bjh21 if ( zSig1 == 0 ) zSig1 = 1;
5280 1.1 bjh21 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5281 1.1 bjh21 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5282 1.1 bjh21 mul64To128( zSig1, zSig1, &term2, &term3 );
5283 1.1 bjh21 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5284 1.1 bjh21 while ( (sbits64) rem1 < 0 ) {
5285 1.1 bjh21 --zSig1;
5286 1.1 bjh21 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5287 1.1 bjh21 term3 |= 1;
5288 1.1 bjh21 term2 |= doubleZSig0;
5289 1.1 bjh21 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5290 1.1 bjh21 }
5291 1.1 bjh21 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5292 1.1 bjh21 }
/* Shift the 128-bit root right by 14; zSig2 receives the third output word
   of shift128ExtraRightJamming and is passed on to the rounding routine. */
5293 1.1 bjh21 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5294 1.1 bjh21 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5295 1.1 bjh21
5296 1.1 bjh21 }
5297 1.1 bjh21
5298 1.1 bjh21 /*
5299 1.1 bjh21 -------------------------------------------------------------------------------
5300 1.1 bjh21 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5301 1.1 bjh21 the corresponding value `b', and 0 otherwise. The comparison is performed
5302 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5303 1.1 bjh21 -------------------------------------------------------------------------------
5304 1.1 bjh21 */
5305 1.1 bjh21 flag float128_eq( float128 a, float128 b )
5306 1.1 bjh21 {
5307 1.1 bjh21
5308 1.1 bjh21 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5309 1.1 bjh21 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5310 1.1 bjh21 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5311 1.1 bjh21 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5312 1.1 bjh21 ) {
5313 1.1 bjh21 if ( float128_is_signaling_nan( a )
5314 1.1 bjh21 || float128_is_signaling_nan( b ) ) {
5315 1.1 bjh21 float_raise( float_flag_invalid );
5316 1.1 bjh21 }
5317 1.1 bjh21 return 0;
5318 1.1 bjh21 }
5319 1.1 bjh21 return
5320 1.1 bjh21 ( a.low == b.low )
5321 1.1 bjh21 && ( ( a.high == b.high )
5322 1.1 bjh21 || ( ( a.low == 0 )
5323 1.1 bjh21 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5324 1.1 bjh21 );
5325 1.1 bjh21
5326 1.1 bjh21 }
5327 1.1 bjh21
5328 1.1 bjh21 /*
5329 1.1 bjh21 -------------------------------------------------------------------------------
5330 1.1 bjh21 Returns 1 if the quadruple-precision floating-point value `a' is less than
5331 1.1 bjh21 or equal to the corresponding value `b', and 0 otherwise. The comparison
5332 1.1 bjh21 is performed according to the IEC/IEEE Standard for Binary Floating-Point
5333 1.1 bjh21 Arithmetic.
5334 1.1 bjh21 -------------------------------------------------------------------------------
5335 1.1 bjh21 */
5336 1.1 bjh21 flag float128_le( float128 a, float128 b )
5337 1.1 bjh21 {
5338 1.1 bjh21 flag aSign, bSign;
5339 1.1 bjh21
5340 1.1 bjh21 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5341 1.1 bjh21 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5342 1.1 bjh21 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5343 1.1 bjh21 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5344 1.1 bjh21 ) {
5345 1.1 bjh21 float_raise( float_flag_invalid );
5346 1.1 bjh21 return 0;
5347 1.1 bjh21 }
5348 1.1 bjh21 aSign = extractFloat128Sign( a );
5349 1.1 bjh21 bSign = extractFloat128Sign( b );
5350 1.1 bjh21 if ( aSign != bSign ) {
5351 1.1 bjh21 return
5352 1.1 bjh21 aSign
5353 1.1 bjh21 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5354 1.1 bjh21 == 0 );
5355 1.1 bjh21 }
5356 1.1 bjh21 return
5357 1.1 bjh21 aSign ? le128( b.high, b.low, a.high, a.low )
5358 1.1 bjh21 : le128( a.high, a.low, b.high, b.low );
5359 1.1 bjh21
5360 1.1 bjh21 }
5361 1.1 bjh21
5362 1.1 bjh21 /*
5363 1.1 bjh21 -------------------------------------------------------------------------------
5364 1.1 bjh21 Returns 1 if the quadruple-precision floating-point value `a' is less than
5365 1.1 bjh21 the corresponding value `b', and 0 otherwise. The comparison is performed
5366 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5367 1.1 bjh21 -------------------------------------------------------------------------------
5368 1.1 bjh21 */
5369 1.1 bjh21 flag float128_lt( float128 a, float128 b )
5370 1.1 bjh21 {
5371 1.1 bjh21 flag aSign, bSign;
5372 1.1 bjh21
5373 1.1 bjh21 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5374 1.1 bjh21 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5375 1.1 bjh21 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5376 1.1 bjh21 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5377 1.1 bjh21 ) {
5378 1.1 bjh21 float_raise( float_flag_invalid );
5379 1.1 bjh21 return 0;
5380 1.1 bjh21 }
5381 1.1 bjh21 aSign = extractFloat128Sign( a );
5382 1.1 bjh21 bSign = extractFloat128Sign( b );
5383 1.1 bjh21 if ( aSign != bSign ) {
5384 1.1 bjh21 return
5385 1.1 bjh21 aSign
5386 1.1 bjh21 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5387 1.1 bjh21 != 0 );
5388 1.1 bjh21 }
5389 1.1 bjh21 return
5390 1.1 bjh21 aSign ? lt128( b.high, b.low, a.high, a.low )
5391 1.1 bjh21 : lt128( a.high, a.low, b.high, b.low );
5392 1.1 bjh21
5393 1.1 bjh21 }
5394 1.1 bjh21
5395 1.1 bjh21 /*
5396 1.1 bjh21 -------------------------------------------------------------------------------
5397 1.1 bjh21 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5398 1.1 bjh21 the corresponding value `b', and 0 otherwise. The invalid exception is
5399 1.1 bjh21 raised if either operand is a NaN. Otherwise, the comparison is performed
5400 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5401 1.1 bjh21 -------------------------------------------------------------------------------
5402 1.1 bjh21 */
5403 1.1 bjh21 flag float128_eq_signaling( float128 a, float128 b )
5404 1.1 bjh21 {
5405 1.1 bjh21
5406 1.1 bjh21 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5407 1.1 bjh21 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5408 1.1 bjh21 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5409 1.1 bjh21 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5410 1.1 bjh21 ) {
5411 1.1 bjh21 float_raise( float_flag_invalid );
5412 1.1 bjh21 return 0;
5413 1.1 bjh21 }
5414 1.1 bjh21 return
5415 1.1 bjh21 ( a.low == b.low )
5416 1.1 bjh21 && ( ( a.high == b.high )
5417 1.1 bjh21 || ( ( a.low == 0 )
5418 1.1 bjh21 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5419 1.1 bjh21 );
5420 1.1 bjh21
5421 1.1 bjh21 }
5422 1.1 bjh21
5423 1.1 bjh21 /*
5424 1.1 bjh21 -------------------------------------------------------------------------------
5425 1.1 bjh21 Returns 1 if the quadruple-precision floating-point value `a' is less than
5426 1.1 bjh21 or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5427 1.1 bjh21 cause an exception. Otherwise, the comparison is performed according to the
5428 1.1 bjh21 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5429 1.1 bjh21 -------------------------------------------------------------------------------
5430 1.1 bjh21 */
5431 1.1 bjh21 flag float128_le_quiet( float128 a, float128 b )
5432 1.1 bjh21 {
5433 1.1 bjh21 flag aSign, bSign;
5434 1.1 bjh21
5435 1.1 bjh21 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5436 1.1 bjh21 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5437 1.1 bjh21 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5438 1.1 bjh21 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5439 1.1 bjh21 ) {
5440 1.1 bjh21 if ( float128_is_signaling_nan( a )
5441 1.1 bjh21 || float128_is_signaling_nan( b ) ) {
5442 1.1 bjh21 float_raise( float_flag_invalid );
5443 1.1 bjh21 }
5444 1.1 bjh21 return 0;
5445 1.1 bjh21 }
5446 1.1 bjh21 aSign = extractFloat128Sign( a );
5447 1.1 bjh21 bSign = extractFloat128Sign( b );
5448 1.1 bjh21 if ( aSign != bSign ) {
5449 1.1 bjh21 return
5450 1.1 bjh21 aSign
5451 1.1 bjh21 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5452 1.1 bjh21 == 0 );
5453 1.1 bjh21 }
5454 1.1 bjh21 return
5455 1.1 bjh21 aSign ? le128( b.high, b.low, a.high, a.low )
5456 1.1 bjh21 : le128( a.high, a.low, b.high, b.low );
5457 1.1 bjh21
5458 1.1 bjh21 }
5459 1.1 bjh21
5460 1.1 bjh21 /*
5461 1.1 bjh21 -------------------------------------------------------------------------------
5462 1.1 bjh21 Returns 1 if the quadruple-precision floating-point value `a' is less than
5463 1.1 bjh21 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5464 1.1 bjh21 exception. Otherwise, the comparison is performed according to the IEC/IEEE
5465 1.1 bjh21 Standard for Binary Floating-Point Arithmetic.
5466 1.1 bjh21 -------------------------------------------------------------------------------
5467 1.1 bjh21 */
5468 1.1 bjh21 flag float128_lt_quiet( float128 a, float128 b )
5469 1.1 bjh21 {
5470 1.1 bjh21 flag aSign, bSign;
5471 1.1 bjh21
5472 1.1 bjh21 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5473 1.1 bjh21 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5474 1.1 bjh21 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5475 1.1 bjh21 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5476 1.1 bjh21 ) {
5477 1.1 bjh21 if ( float128_is_signaling_nan( a )
5478 1.1 bjh21 || float128_is_signaling_nan( b ) ) {
5479 1.1 bjh21 float_raise( float_flag_invalid );
5480 1.1 bjh21 }
5481 1.1 bjh21 return 0;
5482 1.1 bjh21 }
5483 1.1 bjh21 aSign = extractFloat128Sign( a );
5484 1.1 bjh21 bSign = extractFloat128Sign( b );
5485 1.1 bjh21 if ( aSign != bSign ) {
5486 1.1 bjh21 return
5487 1.1 bjh21 aSign
5488 1.1 bjh21 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5489 1.1 bjh21 != 0 );
5490 1.1 bjh21 }
5491 1.1 bjh21 return
5492 1.1 bjh21 aSign ? lt128( b.high, b.low, a.high, a.low )
5493 1.1 bjh21 : lt128( a.high, a.low, b.high, b.low );
5494 1.1 bjh21
5495 1.1 bjh21 }
5496 1.1 bjh21
5497 1.1 bjh21 #endif
5498 1.1 bjh21
5499 1.1 bjh21
5500 1.1 bjh21 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5501 1.1 bjh21
5502 1.1 bjh21 /*
5503 1.1 bjh21 * These two routines are not part of the original softfloat distribution.
5504 1.1 bjh21 *
5505 1.1 bjh21 * They are based on the corresponding conversions to integer but return
5506 1.1 bjh21 * unsigned numbers instead since these functions are required by GCC.
5507 1.1 bjh21 *
5508 1.2 salo * Added by Mark Brinicombe <mark (at) NetBSD.org> 27/09/97
5509 1.1 bjh21 *
5510 1.1 bjh21 * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5511 1.1 bjh21 */
5512 1.1 bjh21
5513 1.1 bjh21 /*
5514 1.1 bjh21 -------------------------------------------------------------------------------
5515 1.1 bjh21 Returns the result of converting the double-precision floating-point value
5516 1.1 bjh21 `a' to the 32-bit unsigned integer format. The conversion is
5517 1.1 bjh21 performed according to the IEC/IEEE Standard for Binary Floating-point
5518 1.1 bjh21 Arithmetic, except that the conversion is always rounded toward zero. If
5519 1.1 bjh21 `a' is a NaN, the largest positive integer is returned. If the conversion
5520 1.1 bjh21 overflows, the largest integer positive is returned.
5521 1.1 bjh21 -------------------------------------------------------------------------------
5522 1.1 bjh21 */
5523 1.1 bjh21 uint32 float64_to_uint32_round_to_zero( float64 a )
5524 1.1 bjh21 {
5525 1.1 bjh21 flag aSign;
5526 1.1 bjh21 int16 aExp, shiftCount;
5527 1.1 bjh21 bits64 aSig, savedASig;
5528 1.1 bjh21 uint32 z;
5529 1.1 bjh21
5530 1.1 bjh21 aSig = extractFloat64Frac( a );
5531 1.1 bjh21 aExp = extractFloat64Exp( a );
5532 1.1 bjh21 aSign = extractFloat64Sign( a );
5533 1.1 bjh21
5534 1.1 bjh21 if (aSign) {
5535 1.1 bjh21 float_raise( float_flag_invalid );
5536 1.1 bjh21 return(0);
5537 1.1 bjh21 }
5538 1.1 bjh21
5539 1.1 bjh21 if ( 0x41E < aExp ) {
5540 1.1 bjh21 float_raise( float_flag_invalid );
5541 1.1 bjh21 return 0xffffffff;
5542 1.1 bjh21 }
5543 1.1 bjh21 else if ( aExp < 0x3FF ) {
5544 1.11.2.1 tls if ( aExp || aSig ) set_float_exception_inexact_flag();
5545 1.1 bjh21 return 0;
5546 1.1 bjh21 }
5547 1.1 bjh21 aSig |= LIT64( 0x0010000000000000 );
5548 1.1 bjh21 shiftCount = 0x433 - aExp;
5549 1.1 bjh21 savedASig = aSig;
5550 1.1 bjh21 aSig >>= shiftCount;
5551 1.10 christos z = (uint32)aSig;
5552 1.1 bjh21 if ( ( aSig<<shiftCount ) != savedASig ) {
5553 1.11.2.1 tls set_float_exception_inexact_flag();
5554 1.1 bjh21 }
5555 1.1 bjh21 return z;
5556 1.1 bjh21
5557 1.1 bjh21 }
5558 1.1 bjh21
5559 1.1 bjh21 /*
5560 1.1 bjh21 -------------------------------------------------------------------------------
5561 1.1 bjh21 Returns the result of converting the single-precision floating-point value
5562 1.1 bjh21 `a' to the 32-bit unsigned integer format. The conversion is
5563 1.1 bjh21 performed according to the IEC/IEEE Standard for Binary Floating-point
5564 1.1 bjh21 Arithmetic, except that the conversion is always rounded toward zero. If
5565 1.1 bjh21 `a' is a NaN, the largest positive integer is returned. If the conversion
5566 1.1 bjh21 overflows, the largest positive integer is returned.
5567 1.1 bjh21 -------------------------------------------------------------------------------
5568 1.1 bjh21 */
5569 1.1 bjh21 uint32 float32_to_uint32_round_to_zero( float32 a )
5570 1.1 bjh21 {
5571 1.1 bjh21 flag aSign;
5572 1.1 bjh21 int16 aExp, shiftCount;
5573 1.1 bjh21 bits32 aSig;
5574 1.1 bjh21 uint32 z;
5575 1.1 bjh21
5576 1.1 bjh21 aSig = extractFloat32Frac( a );
5577 1.1 bjh21 aExp = extractFloat32Exp( a );
5578 1.1 bjh21 aSign = extractFloat32Sign( a );
5579 1.1 bjh21 shiftCount = aExp - 0x9E;
5580 1.1 bjh21
5581 1.1 bjh21 if (aSign) {
5582 1.1 bjh21 float_raise( float_flag_invalid );
5583 1.1 bjh21 return(0);
5584 1.1 bjh21 }
5585 1.1 bjh21 if ( 0 < shiftCount ) {
5586 1.1 bjh21 float_raise( float_flag_invalid );
5587 1.1 bjh21 return 0xFFFFFFFF;
5588 1.1 bjh21 }
5589 1.1 bjh21 else if ( aExp <= 0x7E ) {
5590 1.11.2.1 tls if ( aExp | aSig ) set_float_exception_inexact_flag();
5591 1.1 bjh21 return 0;
5592 1.1 bjh21 }
5593 1.1 bjh21 aSig = ( aSig | 0x800000 )<<8;
5594 1.1 bjh21 z = aSig>>( - shiftCount );
5595 1.1 bjh21 if ( aSig<<( shiftCount & 31 ) ) {
5596 1.11.2.1 tls set_float_exception_inexact_flag();
5597 1.1 bjh21 }
5598 1.1 bjh21 return z;
5599 1.1 bjh21
5600 1.1 bjh21 }
5601 1.1 bjh21
5602 1.1 bjh21 #endif
5603